Commit 53f01f3e by yangchen
parents 7eca1950 137e06b2
...@@ -20,6 +20,7 @@ import java.util.HashMap; ...@@ -20,6 +20,7 @@ import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
...@@ -29,6 +30,7 @@ import com.zhiwei.crawler.core.HttpBoot; ...@@ -29,6 +30,7 @@ import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils; import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.tools.timeparse.TimeParse; import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.ZhiWeiTools;
import com.zhiwei.toutiao.bean.Signature; import com.zhiwei.toutiao.bean.Signature;
import com.zhiwei.toutiao.bean.TouTiaoArticle; import com.zhiwei.toutiao.bean.TouTiaoArticle;
import com.zhiwei.toutiao.util.Tools; import com.zhiwei.toutiao.util.Tools;
...@@ -376,7 +378,7 @@ public class TouTiaoArticleParse { ...@@ -376,7 +378,7 @@ public class TouTiaoArticleParse {
Long max_behot_time) throws IOException { Long max_behot_time) throws IOException {
String url = "https://www.toutiao.com/api/pc/feed/?category=pc_profile_ugc&utm_source=toutiao&visit_user_id=" + user_id; String url = "https://www.toutiao.com/api/pc/feed/?category=pc_profile_ugc&utm_source=toutiao&visit_user_id=" + user_id;
if (max_behot_time != null) { if (max_behot_time != null) {
url = url + "?max_behot_time=" + max_behot_time; url = url + "&max_behot_time=" + max_behot_time;
} }
logger.info("微头条采集链接:::{}", url); logger.info("微头条采集链接:::{}", url);
Map<String, String> headerMap = Tools.getTouTiaoHeader(); Map<String, String> headerMap = Tools.getTouTiaoHeader();
...@@ -574,6 +576,36 @@ public class TouTiaoArticleParse { ...@@ -574,6 +576,36 @@ public class TouTiaoArticleParse {
return map; return map;
} }
/**
 * Downloads the page at {@code url} and extracts the article body embedded in it.
 *
 * <p>The body is taken as the text between the first {@code " content: '"} marker
 * and the next {@code "',"} terminator in the downloaded HTML; the result is then
 * passed through {@code ZhiWeiTools.delHTMLTag} (presumably strips HTML tags —
 * confirm against that helper).
 *
 * @param url   address of the article page to fetch
 * @param proxy proxy handed to {@code downloadHtml}; the example caller passes {@code null}
 * @return the extracted text, or {@code null} when the page is blank, the start
 *         marker is absent, or any exception occurs while fetching/parsing
 */
public static String getContent(String url, Proxy proxy) {
    // One marker is used for both detection and extraction, so a page that merely
    // contains "content:" in some other form no longer triggers an out-of-bounds error.
    final String startMarker = " content: '";
    final String endMarker = "',";
    try {
        String htmlBody = downloadHtml(url, proxy, null);
        if (StringUtils.isBlank(htmlBody)) {
            return null;
        }
        int start = htmlBody.indexOf(startMarker);
        if (start < 0) {
            // Page does not embed the article in the expected form; not an error.
            return null;
        }
        start += startMarker.length();
        int end = htmlBody.indexOf(endMarker, start);
        // With no terminator, keep everything after the marker (same as the old split-based result).
        String content = end < 0 ? htmlBody.substring(start) : htmlBody.substring(start, end);
        return ZhiWeiTools.delHTMLTag(content);
    } catch (Exception e) {
        // Best-effort extraction: log and report "no content" instead of propagating.
        logger.error("跟据链接采集全文出现错误", e);
        return null;
    }
}
private static String downloadHtml(String url, Proxy proxy, Map<String,String> headMap) { private static String downloadHtml(String url, Proxy proxy, Map<String,String> headMap) {
// 下载数据页面 // 下载数据页面
......
...@@ -37,38 +37,41 @@ public class TouTiaoExample { ...@@ -37,38 +37,41 @@ public class TouTiaoExample {
public static void main(String[] args) throws Exception { public static void main(String[] args) throws Exception {
ProxyFactory.init(registry, group, GroupType.PROVIDER); ProxyFactory.init(registry, group, GroupType.PROVIDER);
String url = "https://www.toutiao.com/a6659244827009352196/";
String content = TouTiaoArticleParse.getContent(url, null);
System.out.println(content);
List<String> urlList = new ArrayList<String>(); // List<String> urlList = new ArrayList<String>();
urlList.add("1920576965"); // urlList.add("1920576965");
Date endTime = TimeParse.stringFormartDate("2018-10-01"); // Date endTime = TimeParse.stringFormartDate("2018-10-01");
//
for (String url : urlList) { // for (String url : urlList) {
long a = System.currentTimeMillis(); // long a = System.currentTimeMillis();
String mid = url; // String mid = url;
Long max_behot_time = 0L; // Long max_behot_time = 0L;
List<TouTiaoArticle> list = new ArrayList<>(); // List<TouTiaoArticle> list = new ArrayList<>();
boolean f = true; // boolean f = true;
while (f) { // while (f) {
Map<String, Object> dataMap = null; // Map<String, Object> dataMap = null;
dataMap = TouTiaoArticleParse.getTouTiaoHistory(mid, max_behot_time+"", endTime, ProxyHolder.NAT_PROXY); // dataMap = TouTiaoArticleParse.getTouTiaoHistory(mid, max_behot_time+"", endTime, ProxyHolder.NAT_PROXY);
if (dataMap != null && !dataMap.isEmpty()) { // if (dataMap != null && !dataMap.isEmpty()) {
List<TouTiaoArticle> ttlist = (List<TouTiaoArticle>) dataMap.get("data"); // List<TouTiaoArticle> ttlist = (List<TouTiaoArticle>) dataMap.get("data");
max_behot_time = (Long)dataMap.get("max_behot_time"); // max_behot_time = (Long)dataMap.get("max_behot_time");
System.out.println(max_behot_time + "=======" + ttlist.size()); // System.out.println(max_behot_time + "=======" + ttlist.size());
if (null == max_behot_time || ttlist.isEmpty()) { // if (null == max_behot_time || ttlist.isEmpty()) {
f = false; // f = false;
} else { // } else {
if (ttlist.size() > 0) { // if (ttlist.size() > 0) {
list.addAll(ttlist); // list.addAll(ttlist);
} // }
} // }
}else{ // }else{
f = false; // f = false;
} // }
} // }
long b = System.currentTimeMillis(); // long b = System.currentTimeMillis();
System.out.println("一轮的采集时间为:" + (b - a) / 1000+" 数据量为" + list.size()); // System.out.println("一轮的采集时间为:" + (b - a) / 1000+" 数据量为" + list.size());
} // }
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment