Commit 53f01f3e by yangchen
parents 7eca1950 137e06b2
......@@ -20,6 +20,7 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
......@@ -29,6 +30,7 @@ import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.ZhiWeiTools;
import com.zhiwei.toutiao.bean.Signature;
import com.zhiwei.toutiao.bean.TouTiaoArticle;
import com.zhiwei.toutiao.util.Tools;
......@@ -376,7 +378,7 @@ public class TouTiaoArticleParse {
Long max_behot_time) throws IOException {
String url = "https://www.toutiao.com/api/pc/feed/?category=pc_profile_ugc&utm_source=toutiao&visit_user_id=" + user_id;
if (max_behot_time != null) {
url = url + "?max_behot_time=" + max_behot_time;
url = url + "&max_behot_time=" + max_behot_time;
}
logger.info("微头条采集链接:::{}", url);
Map<String, String> headerMap = Tools.getTouTiaoHeader();
......@@ -574,6 +576,36 @@ public class TouTiaoArticleParse {
return map;
}
/**
 * Downloads the page behind a link and extracts the article's full text.
 *
 * <p>The page body is fetched with {@code downloadHtml}. The text is taken from
 * right after the first {@code " content: '"} marker up to the next {@code "',"}
 * (or to the end of the page when that terminator is missing) and is passed
 * through {@code ZhiWeiTools.delHTMLTag} before being returned.
 *
 * @param url   link of the page to download
 * @param proxy proxy forwarded to {@code downloadHtml}
 * @return the extracted text, or {@code null} when the page is blank, the
 *         content marker is not present, or an exception occurs
 */
public static String getContent(String url, Proxy proxy) {
    final String startMarker = " content: '";
    final String endMarker = "',";
    try {
        String htmlBody = downloadHtml(url, proxy, null);
        if (StringUtils.isBlank(htmlBody)) {
            return null;
        }
        // Look for the exact marker that is parsed below. Checking only for
        // "content:" lets pages through that do not contain the quoted form,
        // which previously ended in an ArrayIndexOutOfBoundsException.
        int markerAt = htmlBody.indexOf(startMarker);
        if (markerAt < 0) {
            return null;
        }
        int from = markerAt + startMarker.length();
        int to = htmlBody.indexOf(endMarker, from);
        String content = (to < 0) ? htmlBody.substring(from) : htmlBody.substring(from, to);
        return ZhiWeiTools.delHTMLTag(content);
    } catch (Exception e) {
        // Best-effort helper: any failure is logged and reported as "no content".
        logger.error("跟据链接采集全文出现错误", e);
        return null;
    }
}
private static String downloadHtml(String url, Proxy proxy, Map<String,String> headMap) {
// 下载数据页面
......
......@@ -37,38 +37,41 @@ public class TouTiaoExample {
/**
 * Example entry point: initialises the proxy factory, fetches the full text of a
 * single article link through TouTiaoArticleParse.getContent and prints it.
 *
 * NOTE(review): this text looks like a diff view in which removed and added lines
 * are shown together. The live loop below has an identical commented-out copy
 * further down - confirm which of the two is the intended final state.
 */
public static void main(String[] args) throws Exception {
ProxyFactory.init(registry, group, GroupType.PROVIDER);
// Single-article check: download one page and print the extracted text.
String url = "https://www.toutiao.com/a6659244827009352196/";
String content = TouTiaoArticleParse.getContent(url, null);
System.out.println(content);
// NOTE(review): the loop variable `url` below has the same name as the local
// `url` declared above; only one of the two sections can be active at a time.
List<String> urlList = new ArrayList<String>();
urlList.add("1920576965");
Date endTime = TimeParse.stringFormartDate("2018-10-01");
for (String url : urlList) {
// Start of the timing window for this entry.
long a = System.currentTimeMillis();
String mid = url;
// Paging cursor; it is replaced by the value returned with each page.
Long max_behot_time = 0L;
List<TouTiaoArticle> list = new ArrayList<>();
boolean f = true;
while (f) {
Map<String, Object> dataMap = null;
dataMap = TouTiaoArticleParse.getTouTiaoHistory(mid, max_behot_time+"", endTime, ProxyHolder.NAT_PROXY);
if (dataMap != null && !dataMap.isEmpty()) {
List<TouTiaoArticle> ttlist = (List<TouTiaoArticle>) dataMap.get("data");
max_behot_time = (Long)dataMap.get("max_behot_time");
System.out.println(max_behot_time + "=======" + ttlist.size());
// Stop when no next cursor was returned or the page carried no articles.
if (null == max_behot_time || ttlist.isEmpty()) {
f = false;
} else {
if (ttlist.size() > 0) {
list.addAll(ttlist);
}
}
}else{
// A null or empty result map also ends the paging loop.
f = false;
}
}
long b = System.currentTimeMillis();
// Prints the elapsed time in whole seconds and the number of collected articles.
System.out.println("一轮的采集时间为:" + (b - a) / 1000+" 数据量为" + list.size());
}
// Commented-out copy of the history-paging loop above.
// List<String> urlList = new ArrayList<String>();
// urlList.add("1920576965");
// Date endTime = TimeParse.stringFormartDate("2018-10-01");
//
// for (String url : urlList) {
// long a = System.currentTimeMillis();
// String mid = url;
// Long max_behot_time = 0L;
// List<TouTiaoArticle> list = new ArrayList<>();
// boolean f = true;
// while (f) {
// Map<String, Object> dataMap = null;
// dataMap = TouTiaoArticleParse.getTouTiaoHistory(mid, max_behot_time+"", endTime, ProxyHolder.NAT_PROXY);
// if (dataMap != null && !dataMap.isEmpty()) {
// List<TouTiaoArticle> ttlist = (List<TouTiaoArticle>) dataMap.get("data");
// max_behot_time = (Long)dataMap.get("max_behot_time");
// System.out.println(max_behot_time + "=======" + ttlist.size());
// if (null == max_behot_time || ttlist.isEmpty()) {
// f = false;
// } else {
// if (ttlist.size() > 0) {
// list.addAll(ttlist);
// }
// }
// }else{
// f = false;
// }
// }
// long b = System.currentTimeMillis();
// System.out.println("一轮的采集时间为:" + (b - a) / 1000+" 数据量为" + list.size());
// }
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment