Commit c2e5c825 by [zhangzhiwei]

微头条及头条文章采集

parent 26dc222c
......@@ -26,6 +26,7 @@ import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.toutiao.bean.TouTiaoArticle;
import com.zhiwei.toutiao.util.Tools;
......@@ -78,6 +79,36 @@ public class TouTiaoArticleParse {
return null;
}
/**
 * Fetches one page of a Toutiao media account's article list and parses it.
 *
 * <p>Builds the {@code /pgc/ma/} list URL from {@code media_id} and the signature
 * returned by {@code Tools.getAS()}, optionally appends {@code max_behot_time},
 * performs a GET through {@code httpBoot} with the given proxy, and hands the
 * response body to {@code parseHtmlByAccount}.
 *
 * @param media_id       media account id placed in the {@code media_id} query parameter
 * @param max_behot_time value for the {@code max_behot_time} query parameter; omitted when {@code null}
 * @param endData        passed through unchanged to {@code parseHtmlByAccount}
 * @param proxy          proxy passed to {@code httpBoot.syncCall}
 * @return the non-empty map produced by {@code parseHtmlByAccount}; {@code null} when the
 *         body is {@code null}, does not contain {@code "behot_time"}, or the parsed map
 *         is {@code null} or empty
 * @throws Exception any exception raised while requesting or parsing; it is logged and rethrown
 */
public static Map<String, Object> getTouTiaoList(String media_id, Long max_behot_time,Date endData, ProxyHolder proxy ) throws Exception{
    // Generate the signature once so that "as" and "cp" come from the same
    // Tools.getAS() result; calling it twice could pair values from two
    // different generations (presumably it is time-based -- TODO confirm).
    String[] signature = Tools.getAS().split("_");
    String as = signature[0];
    String cp = signature[1];

    String url = "https://www.toutiao.com/pgc/ma/?page_type=1&media_id="+media_id+"&count=20&as="+as+"&cp="+cp;
    if (max_behot_time != null) {
        url = url + "&max_behot_time="+max_behot_time;
    }
    System.out.println("url=========="+url);

    Map<String,String> headerMap = Tools.getTouTiaoHeader();
    headerMap.put("Referer", url);

    try {
        String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string();
        if (htmlBody != null && htmlBody.contains("behot_time")) {
            Map<String, Object> ttList = parseHtmlByAccount(htmlBody, endData);
            if (ttList != null && ttList.size() > 0) {
                return ttList;
            }
        } else {
            logger.info("数据为null");
        }
    } catch (Exception e) {
        // Log the exception as-is: fillInStackTrace() would replace the original
        // stack trace with this catch site and hide where the failure occurred.
        logger.error("获取今日头条帐号数据连接超时", e);
        throw e;
    }
    return null;
}
/***
* 根据帐号解析历史文章地址
*
......@@ -178,7 +209,6 @@ public class TouTiaoArticleParse {
System.out.println(url);
try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), proxy).body().string();
System.out.println(htmlBody);
if (htmlBody != null) {
Map<String, Object> dataMap = parseHtmlByMicroAccount(htmlBody, endDate);
if(dataMap!=null && dataMap.size()>0){
......@@ -196,6 +226,31 @@ public class TouTiaoArticleParse {
/**
 * Fetches one page of a user's micro-toutiao (UGC) content list and parses it.
 *
 * <p>Requests {@code https://www.toutiao.com/c/ugc/content/list/{user_id}/}, appending
 * {@code ?max_time=} when {@code max_behot_time} is given, with the user's profile page
 * as the Referer, and hands the response body to {@code parseHtmlByMicroAccount}.
 *
 * <p>This method is best-effort: any exception raised while requesting or parsing is
 * logged and converted into a {@code null} return rather than propagated.
 *
 * @param user_id        user id placed in the request path and the Referer header
 * @param endDate        passed through unchanged to {@code parseHtmlByMicroAccount}
 * @param proxy          proxy passed to {@code httpBoot.syncCall}
 * @param max_behot_time value for the {@code max_time} query parameter; omitted when {@code null}
 * @return the non-empty map produced by {@code parseHtmlByMicroAccount}; {@code null} when the
 *         body is {@code null}, the parsed map is {@code null} or empty, or an exception occurred
 * @throws IOException declared by the signature; the body itself catches every exception
 */
public static Map<String, Object> getMicroTouTiaoCrawler(String user_id, Date endDate, ProxyHolder proxy, Long max_behot_time) throws IOException {
    String url = "https://www.toutiao.com/c/ugc/content/list/" + user_id+"/";
    if (max_behot_time != null) {
        url = url + "?max_time=" + max_behot_time;
    }

    Map<String, String> headerMap = Tools.getTouTiaoHeader();
    headerMap.put("Referer", "https://www.toutiao.com/c/user/" + user_id + "/");
    System.out.println(url);

    try {
        String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), proxy).body().string();
        if (htmlBody != null) {
            Map<String, Object> dataMap = parseHtmlByMicroAccount(htmlBody, endDate);
            if (dataMap != null && dataMap.size() > 0) {
                return dataMap;
            }
        } else {
            logger.info("数据为null");
        }
    } catch (Exception e) {
        // Pass the exception as the dedicated Throwable argument and log at ERROR:
        // using it to fill a "{}" placeholder at INFO either leaves the placeholder
        // unfilled or drops the stack trace, depending on the logging backend.
        logger.error("获取数据出错,数据为null", e);
        return null;
    }
    return null;
}
/**
* @Title: parseHtmlByMicroAccount
* @author hero
......
......@@ -31,7 +31,7 @@ public class TouTiaoExample {
public static void main(String[] args) throws Exception {
long a = System.currentTimeMillis();
List<String> urlList = new ArrayList<String>();
urlList.add("23782107381");
urlList.add("6075371636");
System.out.println(urlList.size());
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment