Commit 9d384b56 by zhiwei

添加更新今日头条阅读数功能

parent 34d3c078
/** /**
* @Title: TouTiaoParse.java * @Title: TouTiaoParse.java
* @Package com.zhiwei.toutiao.parse * @Package com.zhiwei.toutiao.parse
* @Description: * @Description:
* @author hero * @author hero
* @date 2016年9月2日 上午11:17:44 * @date 2016年9月2日 上午11:17:44
* @version V1.0 * @version V1.0
*/ */
/** /**
* *
*/ */
package com.zhiwei.toutiao.parse; package com.zhiwei.toutiao.parse;
import java.io.IOException; import java.io.IOException;
import java.net.Proxy; import java.net.Proxy;
import java.util.ArrayList; import java.util.*;
import java.util.Collections; import java.util.regex.Matcher;
import java.util.Date; import java.util.regex.Pattern;
import java.util.HashMap;
import java.util.List; import javax.script.ScriptEngine;
import java.util.Map; import javax.script.ScriptEngineManager;
import java.util.regex.Matcher;
import java.util.regex.Pattern; import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import javax.script.ScriptEngine; import org.apache.logging.log4j.Logger;
import javax.script.ScriptEngineManager; import org.jsoup.Jsoup;
import org.apache.commons.lang3.StringUtils; import com.alibaba.fastjson.JSONArray;
import org.apache.logging.log4j.LogManager; import com.alibaba.fastjson.JSONObject;
import org.apache.logging.log4j.Logger; import com.zhiwei.crawler.core.HttpBoot;
import org.jsoup.Jsoup; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.alibaba.fastjson.JSONArray; import com.zhiwei.tools.timeparse.TimeParse;
import com.alibaba.fastjson.JSONObject; import com.zhiwei.toutiao.bean.Signature;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.toutiao.bean.TouTiaoArticle;
import com.zhiwei.crawler.proxy.ProxyHolder; import com.zhiwei.toutiao.util.Tools;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.tools.timeparse.TimeParse; import okhttp3.Response;
import com.zhiwei.toutiao.bean.Signature;
import com.zhiwei.toutiao.bean.TouTiaoArticle; /**
import com.zhiwei.toutiao.util.Tools; * @Description:头条帐号采集
* @author hero
import okhttp3.Response; * @date 2016年9月2日 上午11:17:44
*/
/** public class TouTiaoArticleParse {
* @Description:头条帐号采集
* @author hero private static ScriptEngine scriptEngine = new ScriptEngineManager().getEngineByName("javascript");
* @date 2016年9月2日 上午11:17:44 private static Logger logger = LogManager.getLogger(TouTiaoArticleParse.class);
*/ private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
public class TouTiaoArticleParse {
private static ScriptEngine scriptEngine = new ScriptEngineManager().getEngineByName("javascript"); /***
private static Logger logger = LogManager.getLogger(TouTiaoArticleParse.class); * 获取头条数据
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build(); *
* @Description:
* @param @param
/*** * url
* 获取头条数据 * @param @return
* * @return List<TouTiao> 返回类型
* @Description: * @throws Exception
* @param @param */
* url public static Map<String, Object> getTouTiaoList(String mediaId, String maxBehotTime, Date endData, Proxy proxy)
* @param @return throws Exception {
* @return List<TouTiao> 返回类型 Signature signature = new Signature();
* @throws Exception String url = "https://www.toutiao.com/pgc/ma/?page_type=1&media_id=" + mediaId + "&count=20&as="
*/ + signature.getAs() + "&cp=" + signature.getCp();
public static Map<String, Object> getTouTiaoList(String mediaId, String maxBehotTime, Date endData, Proxy proxy) if (maxBehotTime != null) {
throws Exception { url = url + "&max_behot_time=" + maxBehotTime;
Signature signature = new Signature(); }
String url = "https://www.toutiao.com/pgc/ma/?page_type=1&media_id=" + mediaId + "&count=20&as=" Map<String, String> headerMap = Tools.getTouTiaoHeader();
+ signature.getAs() + "&cp=" + signature.getCp(); headerMap.put("Referer", url);
if (maxBehotTime != null) { try {
url = url + "&max_behot_time=" + maxBehotTime; String htmlBody = downloadHtml(url, proxy, headerMap);
} if (htmlBody != null && htmlBody.contains("behot_time")) {
Map<String, String> headerMap = Tools.getTouTiaoHeader(); Map<String, Object> ttList = parseHtmlByAccount(htmlBody, endData);
headerMap.put("Referer", url); if (ttList != null && ttList.size() > 0) {
try { return ttList;
String htmlBody = downloadHtml(url, proxy, headerMap); }
if (htmlBody != null && htmlBody.contains("behot_time")) { } else {
Map<String, Object> ttList = parseHtmlByAccount(htmlBody, endData); logger.info("数据为null");
if (ttList != null && ttList.size() > 0) { }
return ttList; } catch (Exception e) {
} logger.error("获取今日头条帐号数据连接超时", e.fillInStackTrace());
} else { throw e;
logger.info("数据为null"); }
} return Collections.emptyMap();
} catch (Exception e) { }
logger.error("获取今日头条帐号数据连接超时", e.fillInStackTrace());
throw e;
} public static Map<String, Object> getTouTiaoList(String mediaId, String maxBehotTime, Date endData, ProxyHolder proxy)
return Collections.emptyMap(); throws Exception {
} Signature signature = new Signature();
String url = "https://www.toutiao.com/pgc/ma/?page_type=1&media_id=" + mediaId + "&count=20&as="
+ signature.getAs() + "&cp=" + signature.getCp();
public static Map<String, Object> getTouTiaoList(String mediaId, String maxBehotTime, Date endData, ProxyHolder proxy) if (maxBehotTime != null) {
throws Exception { url = url + "&max_behot_time=" + maxBehotTime;
Signature signature = new Signature(); }
String url = "https://www.toutiao.com/pgc/ma/?page_type=1&media_id=" + mediaId + "&count=20&as=" Map<String, String> headerMap = Tools.getTouTiaoHeader();
+ signature.getAs() + "&cp=" + signature.getCp(); headerMap.put("Referer", url);
if (maxBehotTime != null) { try {
url = url + "&max_behot_time=" + maxBehotTime; String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string();
} if (htmlBody != null && htmlBody.contains("behot_time")) {
Map<String, String> headerMap = Tools.getTouTiaoHeader(); Map<String, Object> ttList = parseHtmlByAccount(htmlBody, endData);
headerMap.put("Referer", url); if (ttList != null && ttList.size() > 0) {
try { return ttList;
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string(); }
if (htmlBody != null && htmlBody.contains("behot_time")) { } else {
Map<String, Object> ttList = parseHtmlByAccount(htmlBody, endData); logger.info("数据为null");
if (ttList != null && ttList.size() > 0) { }
return ttList; } catch (Exception e) {
} logger.error("获取今日头条帐号数据连接超时", e.fillInStackTrace());
} else { throw e;
logger.info("数据为null"); }
} return Collections.emptyMap();
} catch (Exception e) { }
logger.error("获取今日头条帐号数据连接超时", e.fillInStackTrace());
throw e; /**
} * 获取今日头条历史文章接口新
return Collections.emptyMap(); *
} * @param user_id
* @param max_behot_time
/** * @param endData
* 获取今日头条历史文章接口新 * @param proxy
* * @return
* @param user_id * @throws Exception
* @param max_behot_time */
* @param endData public static Map<String, Object> getTouTiaoHistory(String userId, String maxBehotTime, Date endData,
* @param proxy Proxy proxy) throws Exception {
* @return for (int i = 0; i < 3; i++) {
* @throws Exception Signature signature = new Signature(userId, maxBehotTime);
*/ String as = signature.getAs();
public static Map<String, Object> getTouTiaoHistory(String userId, String maxBehotTime, Date endData, String cp = signature.getCp();
Proxy proxy) throws Exception { String signatureStr = signature.getSignature();
for (int i = 0; i < 3; i++) { String url = "https://www.toutiao.com/c/user/article/?page_type=1&user_id=" + userId + "&max_behot_time="
Signature signature = new Signature(userId, maxBehotTime); + maxBehotTime + "&count=20&as=" + as + "&cp=" + cp + "&_signature=" + signatureStr;
String as = signature.getAs(); Map<String, String> headerMap = new HashMap<String, String>();
String cp = signature.getCp(); headerMap.put("user-agent",
String signatureStr = signature.getSignature(); "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36");
String url = "https://www.toutiao.com/c/user/article/?page_type=1&user_id=" + userId + "&max_behot_time=" headerMap.put("referer", "https://www.toutiao.com/c/user/" + userId + "/");
+ maxBehotTime + "&count=20&as=" + as + "&cp=" + cp + "&_signature=" + signatureStr; try {
Map<String, String> headerMap = new HashMap<String, String>(); String htmlBody = downloadHtml(url, proxy, headerMap);
headerMap.put("user-agent", if (htmlBody != null && htmlBody.contains("behot_time")) {
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"); Map<String, Object> ttList = parseHtmlByAccount(userId, htmlBody, endData);
headerMap.put("referer", "https://www.toutiao.com/c/user/" + userId + "/"); if (ttList != null && ttList.size() > 0) {
try { return ttList;
String htmlBody = downloadHtml(url, proxy, headerMap); }
if (htmlBody != null && htmlBody.contains("behot_time")) { } else {
Map<String, Object> ttList = parseHtmlByAccount(userId, htmlBody, endData); logger.info("数据为null");
if (ttList != null && ttList.size() > 0) { continue;
return ttList; }
} } catch (Exception e) {
} else { logger.error("获取今日头条帐号数据连接超时", e.fillInStackTrace());
logger.info("数据为null"); throw e;
continue; }
} }
} catch (Exception e) { return Collections.emptyMap();
logger.error("获取今日头条帐号数据连接超时", e.fillInStackTrace()); }
throw e;
} public static Map<String, Object> getTouTiaoHistory(String userId, String maxBehotTime, Date endData,
} ProxyHolder proxy) throws Exception {
return Collections.emptyMap(); for (int i = 0; i < 3; i++) {
} Signature signature = new Signature(userId, maxBehotTime);
String as = signature.getAs();
public static Map<String, Object> getTouTiaoHistory(String userId, String maxBehotTime, Date endData, String cp = signature.getCp();
ProxyHolder proxy) throws Exception { String signatureStr = signature.getSignature();
for (int i = 0; i < 3; i++) { String url = "https://www.toutiao.com/c/user/article/?page_type=1&user_id=" + userId + "&max_behot_time="
Signature signature = new Signature(userId, maxBehotTime); + maxBehotTime + "&count=20&as=" + as + "&cp=" + cp + "&_signature=" + signatureStr;
String as = signature.getAs(); logger.info("当前采集的历史文章链接:::{}", url);
String cp = signature.getCp(); Map<String, String> headerMap = new HashMap<>();
String signatureStr = signature.getSignature(); headerMap.put("user-agent",
String url = "https://www.toutiao.com/c/user/article/?page_type=1&user_id=" + userId + "&max_behot_time=" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36");
+ maxBehotTime + "&count=20&as=" + as + "&cp=" + cp + "&_signature=" + signatureStr; headerMap.put("referer", "https://www.toutiao.com/c/user/" + userId + "/");
logger.info("当前采集的历史文章链接:::{}", url); String htmlBody = null;
Map<String, String> headerMap = new HashMap<>(); try {
headerMap.put("user-agent", htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string();
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"); if (htmlBody != null && htmlBody.contains("behot_time")) {
headerMap.put("referer", "https://www.toutiao.com/c/user/" + userId + "/"); Map<String, Object> ttList = parseHtmlByAccount(userId, htmlBody, endData);
String htmlBody = null; if (ttList != null && ttList.size() > 0) {
try { return ttList;
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string(); } else {
if (htmlBody != null && htmlBody.contains("behot_time")) { break;
Map<String, Object> ttList = parseHtmlByAccount(userId, htmlBody, endData); }
if (ttList != null && ttList.size() > 0) { } else {
return ttList; logger.info("数据为null,获取到的文本为:::{}", htmlBody);
} else { continue;
break; }
} } catch (Exception e) {
} else { logger.error("获取今日头条帐号数据连接超时", e);
logger.info("数据为null,获取到的文本为:::{}", htmlBody); throw e;
continue; }
} }
} catch (Exception e) { return Collections.emptyMap();
logger.error("获取今日头条帐号数据连接超时", e); }
throw e;
} /***
} * 根据帐号解析历史文章地址
return Collections.emptyMap(); *
} * @Description:根据帐号解析历史文章地址
* @param @param
/*** * htmlBody
* 根据帐号解析历史文章地址 * @param @return
* * @return List<String> 返回类型
* @Description:根据帐号解析历史文章地址 */
* @param @param private static Map<String, Object> parseHtmlByAccount(String htmlBody, Date endDate) {
* htmlBody Map<String, Object> map = new HashMap<>();
* @param @return Long maxBehotTime = null;
* @return List<String> 返回类型 List<TouTiaoArticle> dataList = new ArrayList<>();
*/ try {
private static Map<String, Object> parseHtmlByAccount(String htmlBody, Date endDate) { JSONObject json = JSONObject.parseObject(htmlBody);
Map<String, Object> map = new HashMap<>(); JSONArray jsonArray = json.getJSONArray("data");
Long maxBehotTime = null; maxBehotTime = Long.valueOf(json.getJSONObject("next").getString("max_behot_time"));
List<TouTiaoArticle> dataList = new ArrayList<>(); String title = null;
try { String content = null;
JSONObject json = JSONObject.parseObject(htmlBody); String time = null;
JSONArray jsonArray = json.getJSONArray("data"); Date date = null;
maxBehotTime = Long.valueOf(json.getJSONObject("next").getString("max_behot_time")); String readNum = null;
String title = null; String commentNum = null;
String content = null; String playNum = null;
String time = null; String shareNum = null;
Date date = null; String source = null;
String readNum = null; String userId = null;
String commentNum = null; String articleType = null;
String playNum = null; List<String> labelList = null;
String shareNum = null; String likeNum = null;
String source = null; for (int i = 0; i < jsonArray.size(); i++) {
String userId = null; try {
String articleType = null; JSONObject data = jsonArray.getJSONObject(i);
List<String> labelList = null; String href = "https://www.toutiao.com/";
String likeNum = null; if (data.containsKey("group_id")) {
for (int i = 0; i < jsonArray.size(); i++) { href = href + "a" + data.getLongValue("group_id");
try { title = data.getString("title");
JSONObject data = jsonArray.getJSONObject(i); content = data.getString("abstract");
String href = "https://www.toutiao.com/"; time = data.getLongValue("behot_time") * 1000 + "";
if (data.containsKey("group_id")) { date = TimeParse.stringFormartDate(time);
href = href + "a" + data.getLongValue("group_id"); readNum = data.getString("go_detail_count");
title = data.getString("title"); commentNum = data.getString("comments_count");
content = data.getString("abstract"); playNum = data.getString("detail_play_effective_count");
time = data.getLongValue("behot_time") * 1000 + ""; shareNum = data.getString("share_count");
date = TimeParse.stringFormartDate(time); source = data.getString("source");
readNum = data.getString("go_detail_count"); userId = data.getLong("creator_uid") + "";
commentNum = data.getString("comments_count"); articleType = data.getString("chinese_tag");
playNum = data.getString("detail_play_effective_count"); TouTiaoArticle tt = new TouTiaoArticle(href, title, userId, source, date, content, commentNum,
shareNum = data.getString("share_count"); playNum, readNum, shareNum, "今日头条", articleType,likeNum);
source = data.getString("source"); if (data.containsKey("label")) {
userId = data.getLong("creator_uid") + ""; labelList = data.getJSONArray("label").toJavaList(String.class);
articleType = data.getString("chinese_tag"); tt.setLabelList(labelList);
TouTiaoArticle tt = new TouTiaoArticle(href, title, userId, source, date, content, commentNum, }
playNum, readNum, shareNum, "今日头条", articleType,likeNum); dataList.add(tt);
if (data.containsKey("label")) { }
labelList = data.getJSONArray("label").toJavaList(String.class); } catch (Exception e) {
tt.setLabelList(labelList); logger.error("数据解析出现问题,{}", e.getMessage());
} continue;
dataList.add(tt); }
} }
} catch (Exception e) { } catch (Exception e) {
logger.error("数据解析出现问题,{}", e.getMessage()); logger.error("数据解析出现问题,{}", e.getMessage());
continue; return null;
} }
}
} catch (Exception e) { if (endDate != null) {
logger.error("数据解析出现问题,{}", e.getMessage()); if (maxBehotTime != null && !"0".equals(maxBehotTime)) {
return null; Date nextDate = new Date(Long.valueOf(maxBehotTime + "000"));
} if (endDate.after(nextDate)) {
maxBehotTime = null;
if (endDate != null) { }
if (maxBehotTime != null && !"0".equals(maxBehotTime)) { }
Date nextDate = new Date(Long.valueOf(maxBehotTime + "000")); }
if (endDate.after(nextDate)) { map.put("max_behot_time", maxBehotTime);
maxBehotTime = null; map.put("data", dataList);
} return map;
} }
}
map.put("max_behot_time", maxBehotTime); private static Map<String, Object> parseHtmlByAccount(String userId, String htmlBody, Date endDate) {
map.put("data", dataList); Map<String, Object> map = new HashMap<>();
return map; Long maxBehotTime = null;
} List<TouTiaoArticle> dataList = new ArrayList<>();
try {
private static Map<String, Object> parseHtmlByAccount(String userId, String htmlBody, Date endDate) { JSONObject json = JSONObject.parseObject(htmlBody);
Map<String, Object> map = new HashMap<>(); JSONArray jsonArray = json.getJSONArray("data");
Long maxBehotTime = null; maxBehotTime = Long.valueOf(json.getJSONObject("next").getString("max_behot_time"));
List<TouTiaoArticle> dataList = new ArrayList<>(); String title = null;
try { String content = null;
JSONObject json = JSONObject.parseObject(htmlBody); String time = null;
JSONArray jsonArray = json.getJSONArray("data"); Date date = null;
maxBehotTime = Long.valueOf(json.getJSONObject("next").getString("max_behot_time")); String readNum = null;
String title = null; String commentNum = null;
String content = null; String playNum = null;
String time = null; String shareNum = null;
Date date = null; String source = null;
String readNum = null; String articleType = null;
String commentNum = null; List<String> labelList = null;
String playNum = null; String likeNum = null;
String shareNum = null; for (int i = 0; i < jsonArray.size(); i++) {
String source = null; try {
String articleType = null; JSONObject data = jsonArray.getJSONObject(i);
List<String> labelList = null; String href = "https://www.toutiao.com/";
String likeNum = null; if (data.containsKey("group_id")) {
for (int i = 0; i < jsonArray.size(); i++) { href = href + "a" + data.getLongValue("group_id");
try { title = data.getString("title");
JSONObject data = jsonArray.getJSONObject(i); content = data.getString("abstract");
String href = "https://www.toutiao.com/"; time = data.getLongValue("behot_time") * 1000 + "";
if (data.containsKey("group_id")) { date = TimeParse.stringFormartDate(time);
href = href + "a" + data.getLongValue("group_id"); readNum = data.getString("go_detail_count");
title = data.getString("title"); commentNum = data.getString("comments_count");
content = data.getString("abstract"); playNum = data.getString("detail_play_effective_count");
time = data.getLongValue("behot_time") * 1000 + ""; shareNum = data.getString("share_count");
date = TimeParse.stringFormartDate(time); source = data.getString("source");
readNum = data.getString("go_detail_count"); articleType = data.getString("chinese_tag");
commentNum = data.getString("comments_count"); TouTiaoArticle tt = new TouTiaoArticle(href, title, userId, source, date, content, commentNum,
playNum = data.getString("detail_play_effective_count"); playNum, readNum, shareNum, "今日头条", articleType,likeNum);
shareNum = data.getString("share_count"); if (data.containsKey("label")) {
source = data.getString("source"); labelList = data.getJSONArray("label").toJavaList(String.class);
articleType = data.getString("chinese_tag"); tt.setLabelList(labelList);
TouTiaoArticle tt = new TouTiaoArticle(href, title, userId, source, date, content, commentNum, }
playNum, readNum, shareNum, "今日头条", articleType,likeNum); dataList.add(tt);
if (data.containsKey("label")) { }
labelList = data.getJSONArray("label").toJavaList(String.class); } catch (Exception e) {
tt.setLabelList(labelList); logger.error("数据解析出现问题,{}", e.getMessage());
} continue;
dataList.add(tt); }
} }
} catch (Exception e) { } catch (Exception e) {
logger.error("数据解析出现问题,{}", e.getMessage()); logger.error("数据解析出现问题,{}", e.getMessage());
continue; return null;
} }
} if (endDate != null) {
} catch (Exception e) { if (maxBehotTime != null && !"0".equals(maxBehotTime)) {
logger.error("数据解析出现问题,{}", e.getMessage()); Date nextDate = new Date(Long.valueOf(maxBehotTime + "000"));
return null; if (endDate.after(nextDate)) {
} maxBehotTime = null;
if (endDate != null) { }
if (maxBehotTime != null && !"0".equals(maxBehotTime)) { }
Date nextDate = new Date(Long.valueOf(maxBehotTime + "000")); }
if (endDate.after(nextDate)) { map.put("max_behot_time", maxBehotTime);
maxBehotTime = null; map.put("data", dataList);
} return map;
} }
}
map.put("max_behot_time", maxBehotTime); /**
map.put("data", dataList); * @Title: getMicroTouTiaoCrawler
return map; * @author hero
} * @Description: 根据用户user_id查询用户微头条数据
* @param @param
/** * user_id
* @Title: getMicroTouTiaoCrawler * @param @param
* @author hero * endDate
* @Description: 根据用户user_id查询用户微头条数据 * @param @param
* @param @param * proxy
* user_id * @param @return
* @param @param * @param @throws
* endDate * IOException 设定文件
* @param @param * @return List<Map<String,Object>> 返回类型
* proxy */
* @param @return public static Map<String, Object> getMicroTouTiaoCrawler(String userId, Date endDate, Proxy proxy,
* @param @throws String maxBehotTime) throws IOException {
* IOException 设定文件 String url = "https://www.toutiao.com/api/pc/feed/?category=pc_profile_ugc&utm_source=toutiao&visit_user_id=" + userId;
* @return List<Map<String,Object>> 返回类型 if (maxBehotTime != null) {
*/ url = url + "&max_behot_time=" + maxBehotTime;
public static Map<String, Object> getMicroTouTiaoCrawler(String userId, Date endDate, Proxy proxy, }
String maxBehotTime) throws IOException { System.out.println(url);
String url = "https://www.toutiao.com/api/pc/feed/?category=pc_profile_ugc&utm_source=toutiao&visit_user_id=" + userId; Map<String, String> headerMap = Tools.getTouTiaoHeader();
if (maxBehotTime != null) { headerMap.put("Referer", "https://www.toutiao.com/c/user/" + userId + "/");
url = url + "&max_behot_time=" + maxBehotTime; try {
} String htmlBody = downloadHtml(url, proxy, headerMap);
System.out.println(url); if (htmlBody != null) {
Map<String, String> headerMap = Tools.getTouTiaoHeader(); Map<String, Object> dataMap = parseHtmlByMicroAccount(htmlBody, endDate);
headerMap.put("Referer", "https://www.toutiao.com/c/user/" + userId + "/"); if (dataMap != null && dataMap.size() > 0) {
try { return dataMap;
String htmlBody = downloadHtml(url, proxy, headerMap); }
if (htmlBody != null) { } else {
Map<String, Object> dataMap = parseHtmlByMicroAccount(htmlBody, endDate); logger.info("数据为null");
if (dataMap != null && dataMap.size() > 0) { }
return dataMap; } catch (Exception e) {
} logger.info("获取数据出错::{},数据为null", e);
} else { return null;
logger.info("数据为null"); }
} return null;
} catch (Exception e) { }
logger.info("获取数据出错::{},数据为null", e);
return null; public static Map<String, Object> getMicroTouTiaoCrawler(String userId, Date endDate, ProxyHolder proxy,
} Long maxBehotTime) throws IOException {
return null; String url = "https://www.toutiao.com/api/pc/feed/?category=pc_profile_ugc&utm_source=toutiao&visit_user_id=" + userId;
} if (maxBehotTime != null) {
url = url + "&max_behot_time=" + maxBehotTime;
public static Map<String, Object> getMicroTouTiaoCrawler(String userId, Date endDate, ProxyHolder proxy, }
Long maxBehotTime) throws IOException { logger.info("微头条采集链接:::{}", url);
String url = "https://www.toutiao.com/api/pc/feed/?category=pc_profile_ugc&utm_source=toutiao&visit_user_id=" + userId; Map<String, String> headerMap = Tools.getTouTiaoHeader();
if (maxBehotTime != null) { headerMap.put("Referer", "https://www.toutiao.com/c/user/" + userId + "/");
url = url + "&max_behot_time=" + maxBehotTime; try {
} String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string();
logger.info("微头条采集链接:::{}", url); if (htmlBody != null && htmlBody.contains("create_time")) {
Map<String, String> headerMap = Tools.getTouTiaoHeader(); Map<String, Object> dataMap = parseHtmlByMicroAccount(htmlBody, endDate);
headerMap.put("Referer", "https://www.toutiao.com/c/user/" + userId + "/"); if (dataMap != null && dataMap.size() > 0) {
try { return dataMap;
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string(); }
if (htmlBody != null && htmlBody.contains("create_time")) { } else {
Map<String, Object> dataMap = parseHtmlByMicroAccount(htmlBody, endDate); logger.info("数据为null");
if (dataMap != null && dataMap.size() > 0) { }
return dataMap; } catch (Exception e) {
} logger.info("获取数据出错::{},数据为null", e);
} else { return null;
logger.info("数据为null"); }
} return null;
} catch (Exception e) { }
logger.info("获取数据出错::{},数据为null", e);
return null; /**
} *
return null; * @Description 微头条客户端解析
} * @param userId
* @param endDate
/** * @param proxy
* * @param max_behot_time
* @Description 微头条客户端解析 * @return
* @param userId */
* @param endDate public static List<Map<String,Object>> getClientMicroToutiaoCrawler(String userId, ProxyHolder proxy,
* @param proxy Long maxBehotTime) {
* @param max_behot_time List<Map<String,Object>> dataList = new ArrayList<>();
* @return String ma = "";
*/ while(true) {
public static List<Map<String,Object>> getClientMicroToutiaoCrawler(String userId, ProxyHolder proxy, String url = "https://i.snssdk.com/api/feed/profile/v1/?visited_uid="+userId+"&offset="+ maxBehotTime;
Long maxBehotTime) { ma = String.valueOf(maxBehotTime);
List<Map<String,Object>> dataList = new ArrayList<>(); try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url),proxy)){
String ma = ""; String result = response.body().string();
while(true) { JSONObject json = JSONObject.parseObject(result);
String url = "https://i.snssdk.com/api/feed/profile/v1/?visited_uid="+userId+"&offset="+ maxBehotTime; maxBehotTime = json.getLongValue("offset");
ma = String.valueOf(maxBehotTime); JSONArray jsonArray = json.getJSONArray("data");
try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url),proxy)){ for (int i = 0; i < jsonArray.size(); i++) {
String result = response.body().string(); JSONObject data = jsonArray.getJSONObject(i);
JSONObject json = JSONObject.parseObject(result); try {
maxBehotTime = json.getLongValue("offset"); JSONObject dataJSON = data.getJSONObject("content").getJSONObject("raw_data");
JSONArray jsonArray = json.getJSONArray("data"); Map<String,Object> map = new HashMap<>();
for (int i = 0; i < jsonArray.size(); i++) { if(dataJSON.containsKey("comment_base") && dataJSON.getJSONObject("comment_base")!=null) {
JSONObject data = jsonArray.getJSONObject(i); JSONObject commentBase = dataJSON.getJSONObject("comment_base");
try { Date date = new Date(commentBase.getLongValue("create_time") * 1000);
JSONObject dataJSON = data.getJSONObject("content").getJSONObject("raw_data"); String href = "http://weitoutiao.zjurl.cn/ugc/share/wap/comment/" + dataJSON.getLongValue("id");
Map<String,Object> map = new HashMap<>(); String source = commentBase.getJSONObject("user").getJSONObject("info").getString("name");
if(dataJSON.containsKey("comment_base") && dataJSON.getJSONObject("comment_base")!=null) { String content = commentBase.getString("content");
JSONObject commentBase = dataJSON.getJSONObject("comment_base"); String readNum = commentBase.getJSONObject("action").getInteger("read_count") + "";
Date date = new Date(commentBase.getLongValue("create_time") * 1000); String commentNum = commentBase.getJSONObject("action").getInteger("comment_count") + "";
String href = "http://weitoutiao.zjurl.cn/ugc/share/wap/comment/" + dataJSON.getLongValue("id"); userId = commentBase.getJSONObject("user").getJSONObject("info").getString("user_id");
String source = commentBase.getJSONObject("user").getJSONObject("info").getString("name"); if(dataJSON.containsKey("origin_group")) {
String content = commentBase.getString("content"); String replayUrl = dataJSON.getJSONObject("origin_group").getString("article_url");
String readNum = commentBase.getJSONObject("action").getInteger("read_count") + ""; String title = dataJSON.getJSONObject("origin_group").getString("title");
String commentNum = commentBase.getJSONObject("action").getInteger("comment_count") + ""; map.put("title", title);
userId = commentBase.getJSONObject("user").getJSONObject("info").getString("user_id"); map.put("replayUrl", replayUrl);
if(dataJSON.containsKey("origin_group")) { }
String replayUrl = dataJSON.getJSONObject("origin_group").getString("article_url"); map.put("time", date);
String title = dataJSON.getJSONObject("origin_group").getString("title"); map.put("href", href);
map.put("title", title); map.put("source", source);
map.put("replayUrl", replayUrl); map.put("content", content);
} map.put("readNum", readNum);
map.put("time", date); map.put("commentNum", commentNum);
map.put("href", href); map.put("user_id", userId);
map.put("source", source); dataList.add(map);
map.put("content", content); }
map.put("readNum", readNum); } catch (Exception e) {
map.put("commentNum", commentNum); // System.out.println(data.toString());
map.put("user_id", userId); e.printStackTrace();
dataList.add(map); }
} }
} catch (Exception e) {
// System.out.println(data.toString()); System.out.println(" 采集到 条 == "+dataList.size() + " -- " +ma + " -- " + maxBehotTime);
e.printStackTrace(); if(ma.equals(String.valueOf(maxBehotTime))) {
} break;
} }
} catch (Exception e) {
System.out.println(" 采集到 条 == "+dataList.size() + " -- " +ma + " -- " + maxBehotTime); logger.info("客户端微头条采集错误 {}",e);
if(ma.equals(String.valueOf(maxBehotTime))) { }
break; }
} return dataList;
} catch (Exception e) { }
logger.info("客户端微头条采集错误 {}",e);
} /**
} * @Title: parseHtmlByMicroAccount
return dataList; * @author hero
} * @Description: 解析微头条数据
* @param @param
/** * htmlBody
* @Title: parseHtmlByMicroAccount * @param @param
* @author hero * endDate
* @Description: 解析微头条数据 * @param @return
* @param @param * 设定文件
* htmlBody * @return Map<String,Object> 返回类型
* @param @param */
* endDate private static Map<String, Object> parseHtmlByMicroAccount(String htmlBody, Date endDate) {
* @param @return Map<String, Object> map = new HashMap<>();
* 设定文件 Long maxBehotTime = null;
* @return Map<String,Object> 返回类型 List<TouTiaoArticle> dataList = new ArrayList<>();
*/ try {
private static Map<String, Object> parseHtmlByMicroAccount(String htmlBody, Date endDate) { JSONObject json = JSONObject.parseObject(htmlBody);
Map<String, Object> map = new HashMap<>(); boolean more = false;
Long maxBehotTime = null; if(json.containsKey("has_more")) {
List<TouTiaoArticle> dataList = new ArrayList<>(); more = json.getBoolean("has_more");
try { }
JSONObject json = JSONObject.parseObject(htmlBody); if(json.containsKey("next")) {
boolean more = false; maxBehotTime = json.getJSONObject("next").getLongValue("max_behot_time");
if(json.containsKey("has_more")) { }
more = json.getBoolean("has_more");
} Date date = null;
if(json.containsKey("next")) { if(json.containsKey("data")) {
maxBehotTime = json.getJSONObject("next").getLongValue("max_behot_time"); JSONArray jsonArray = json.getJSONArray("data");
} String href = null;
String source = null;
Date date = null; String title = null;
if(json.containsKey("data")) { String content = null;
JSONArray jsonArray = json.getJSONArray("data"); String readNum = null;
String href = null; String commentNum = null;
String source = null; String playNum = null;
String title = null; String userId = null;
String content = null; String likeNum = null;
String readNum = null; String articleType = null;
String commentNum = null; int count = 16;
String playNum = null; for (int i = 0; i < jsonArray.size(); i++) {
String userId = null; try {
String likeNum = null; JSONObject data = jsonArray.getJSONObject(i);
String articleType = null; String text = null;
int count = 16; if(data.containsKey("stream_cell") && data.getJSONObject("stream_cell")!=null) {
for (int i = 0; i < jsonArray.size(); i++) { text = data.getJSONObject("stream_cell").getString("raw_data");
try { }else if(data.containsKey("concern_talk_cell")) {
JSONObject data = jsonArray.getJSONObject(i); text = data.getJSONObject("concern_talk_cell").getString("packed_json_str");
String text = null; }
if(data.containsKey("stream_cell") && data.getJSONObject("stream_cell")!=null) {
text = data.getJSONObject("stream_cell").getString("raw_data"); JSONObject dataJSON = JSONObject.parseObject(text);
}else if(data.containsKey("concern_talk_cell")) { if(dataJSON.containsKey("comment_base") && dataJSON.getJSONObject("comment_base")!=null) {
text = data.getJSONObject("concern_talk_cell").getString("packed_json_str"); JSONObject commentBase = dataJSON.getJSONObject("comment_base");
} date = new Date(commentBase.getLongValue("create_time") * 1000);
href = "https://www.toutiao.com/a" + dataJSON.getLongValue("id");
JSONObject dataJSON = JSONObject.parseObject(text); source = commentBase.getJSONObject("user").getJSONObject("info").getString("name");
if(dataJSON.containsKey("comment_base") && dataJSON.getJSONObject("comment_base")!=null) { content = dataJSON.getString("content");
JSONObject commentBase = dataJSON.getJSONObject("comment_base"); readNum = dataJSON.getJSONObject("action").getInteger("read_count") + "";
date = new Date(commentBase.getLongValue("create_time") * 1000); likeNum = dataJSON.getJSONObject("action").getInteger("digg_count")+"";
href = "https://www.toutiao.com/a" + dataJSON.getLongValue("id"); commentNum = dataJSON.getJSONObject("action").getInteger("comment_count") + "";
source = commentBase.getJSONObject("user").getJSONObject("info").getString("name"); userId = commentBase.getJSONObject("user").getJSONObject("info").getString("user_id");
content = dataJSON.getString("content"); if (content != null && !"".equals(content)) {
readNum = dataJSON.getJSONObject("action").getInteger("read_count") + ""; if (content.length() < 16) {
likeNum = dataJSON.getJSONObject("action").getInteger("digg_count")+""; count = content.length();
commentNum = dataJSON.getJSONObject("action").getInteger("comment_count") + ""; }
userId = commentBase.getJSONObject("user").getJSONObject("info").getString("user_id"); title = content.substring(0, count);
if (content != null && !"".equals(content)) { }
if (content.length() < 16) {
count = content.length(); }else {
} date = new Date(dataJSON.getLongValue("create_time") * 1000);
title = content.substring(0, count); href = "https://www.toutiao.com/a" + dataJSON.getString("thread_id");
} source = dataJSON.getJSONObject("user").getString("name");
content = dataJSON.getString("content");
}else { readNum = dataJSON.getInteger("read_count") + "";
date = new Date(dataJSON.getLongValue("create_time") * 1000); commentNum = dataJSON.getInteger("comment_count") + "";
href = "https://www.toutiao.com/a" + dataJSON.getString("thread_id"); likeNum = dataJSON.getInteger("digg_count")+"";
source = dataJSON.getJSONObject("user").getString("name"); userId = dataJSON.getJSONObject("user").getString("user_id");
content = dataJSON.getString("content"); if (content != null && !"".equals(content)) {
readNum = dataJSON.getInteger("read_count") + ""; if (content.length() < 16) {
commentNum = dataJSON.getInteger("comment_count") + ""; count = content.length();
likeNum = dataJSON.getInteger("digg_count")+""; }
userId = dataJSON.getJSONObject("user").getString("user_id"); title = content.substring(0, count);
if (content != null && !"".equals(content)) { }
if (content.length() < 16) { }
count = content.length(); TouTiaoArticle tt = new TouTiaoArticle(href, title, userId, source, date, content, commentNum,
} playNum, readNum, "0", "微头条", articleType,likeNum);
title = content.substring(0, count); dataList.add(tt);
} } catch (Exception e) {
} continue;
TouTiaoArticle tt = new TouTiaoArticle(href, title, userId, source, date, content, commentNum, }
playNum, readNum, "0", "微头条", articleType,likeNum); }
dataList.add(tt); }else {
} catch (Exception e) { System.out.println(json);
continue; }
}
}
}else { /** 验证是否有下一页数据 **/
System.out.println(json); if (more) {
} if (maxBehotTime != null && maxBehotTime != 0) {
if (endDate.after(date)) {
maxBehotTime = null;
/** 验证是否有下一页数据 **/ }
if (more) { }
if (maxBehotTime != null && maxBehotTime != 0) { } else {
if (endDate.after(date)) { maxBehotTime = null;
maxBehotTime = null; }
} } catch (Exception e) {
} e.printStackTrace();
} else { }
maxBehotTime = null;
} map.put("max_behot_time", maxBehotTime);
} catch (Exception e) { map.put("data", dataList);
e.printStackTrace();
} return map;
}
map.put("max_behot_time", maxBehotTime);
map.put("data", dataList); /**
* 根据链接获取全文
return map; * @param url
} * @param proxy
* @return
/** */
* 根据链接获取全文 public static String getContent(String url,Proxy proxy) {
* @param url try {
* @param proxy String htmlBody = downloadHtml(url, proxy, null);
* @return String regex = "<script>var BASE_DATA[\\s\\S]+?</script>";
*/ if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("articleInfo")) {
public static String getContent(String url,Proxy proxy) { //通过正则截取需要的js代码
try { Matcher matcher = Pattern.compile(regex).matcher(htmlBody);
String htmlBody = downloadHtml(url, proxy, null); if(matcher.find()) {
String regex = "<script>var BASE_DATA[\\s\\S]+?</script>"; String content = matcher.group().replaceAll("<script>var BASE_DATA = |;</script>", "");
if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("articleInfo")) { //通过js引擎执行js代码
//通过正则截取需要的js代码 String jsContent = "eval(("+ content +")).articleInfo.content.toString();";
Matcher matcher = Pattern.compile(regex).matcher(htmlBody); String contentHtml = scriptEngine.eval(jsContent).toString();
if(matcher.find()) { //解析最后的数据
String content = matcher.group().replaceAll("<script>var BASE_DATA = |;</script>", ""); return Jsoup.parse(contentHtml).text();
//通过js引擎执行js代码 }
String jsContent = "eval(("+ content +")).articleInfo.content.toString();"; }
String contentHtml = scriptEngine.eval(jsContent).toString(); return null;
//解析最后的数据 } catch (Exception e) {
return Jsoup.parse(contentHtml).text(); logger.error("跟据链接采集全文出现错误", e);
} return null;
} }
return null;
} catch (Exception e) { }
logger.error("跟据链接采集全文出现错误", e);
return null;
} /**
* 根据文章url获取itemId
} * @param url
* @param proxy
/** * @return
* 下载数据 * @throws Exception
* @param url */
* @param proxy private static String getItemIdByUrl(String url,Proxy proxy) throws Exception
* @param headMap {
* @return String itemId = null;
*/ Map<String,String> headerMap = Tools.getTouTiaoHeader();
private static String downloadHtml(String url, Proxy proxy, Map<String,String> headMap) { String htmlBody = downloadHtml(url, proxy, headerMap);
for (int i = 1; i <= 3; i++) { if(htmlBody != null)
try { {
Response response = null; if(htmlBody.contains("itemId"))
if(proxy != null) { {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headMap),proxy); itemId = htmlBody.split("itemId: '")[1]
}else { .split("',")[0].trim();
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headMap), ProxyHolder.NAT_HEAVY_PROXY); }
} }else
return response.body().string(); {
} catch (Exception e) { logger.info("获取itemId失败,链接地址为:{}",url);
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace()); }
if(i==3){ return itemId;
break; }
}else{
continue; /**
} * 根据文章url获取文章信息
} * @param url
} * @param proxy
return null; * @return
} * @throws Exception
} */
/**
 * Fetches an article's metadata from the mobile "info" endpoint, given the article page URL.
 *
 * <p>The item id is resolved through {@code getItemIdByUrl}; the endpoint
 * {@code https://m.toutiao.com/i<itemId>/info/} is then requested up to three times until a
 * response containing a {@code data} object is parsed. Fields that are absent from that
 * object are left {@code null} on the returned bean instead of discarding the whole article.
 * NOTE(review): confirm that callers tolerate {@code null} counters (other code paths in this
 * class already pass a {@code null} play count to {@code TouTiaoArticle}).
 *
 * @param url   article page URL; it is also stored as the URL of the returned article
 * @param proxy proxy handed to the download helper (may be {@code null})
 * @return the populated article, or {@code null} when the item id cannot be resolved or no
 *         attempt yields a parsable response
 * @throws Exception if resolving the item id fails
 */
public static TouTiaoArticle getToutiaoArticleInfoByUrl(String url, Proxy proxy) throws Exception
{
    String itemId = getItemIdByUrl(url, proxy);
    if (itemId == null) {
        return null;
    }
    String infoUrl = "https://m.toutiao.com/i" + itemId + "/info/?_signature=&i=" + itemId;
    for (int attempt = 0; attempt < 3; attempt++) {
        try {
            // The endpoint is requested as a mobile browser, with the article page as referer.
            Map<String, String> headerMap = Tools.getTouTiaoHeader();
            headerMap.put("Referer", "https://m.toutiao.com/i" + itemId + "/");
            headerMap.put("User-Agent", "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Mobile Safari/537.36");

            String htmlBody = downloadHtml(infoUrl, proxy, headerMap);
            if (htmlBody == null) {
                continue;
            }

            JSONObject root = JSONObject.parseObject(htmlBody);
            JSONObject data = root == null ? null : root.getJSONObject("data");
            if (data == null) {
                // Nothing usable in this response; try again.
                logger.info("文章信息接口未返回data,链接地址为:{}", url);
                continue;
            }

            JSONObject mediaUser = data.getJSONObject("media_user");
            Long mediaUserId = mediaUser == null ? null : mediaUser.getLong("id");
            Long publishSeconds = data.getLong("publish_time");

            // The body arrives as HTML; keep only its text.
            String content = data.getString("content");
            if (StringUtils.isNotBlank(content)) {
                content = Jsoup.parse(content).text();
            }

            TouTiaoArticle touTiaoArticle = new TouTiaoArticle();
            touTiaoArticle.setUrl(url);
            touTiaoArticle.setTitle(data.getString("title"));
            touTiaoArticle.setUser_id(mediaUserId == null ? null : mediaUserId.toString());
            touTiaoArticle.setSource(data.getString("source"));
            // publish_time is in seconds; Date expects milliseconds.
            touTiaoArticle.setTime(publishSeconds == null ? null : new Date(publishSeconds * 1000));
            touTiaoArticle.setContent(content);
            touTiaoArticle.setCommentCount(intFieldAsString(data, "comment_count"));
            touTiaoArticle.setReadNum(intFieldAsString(data, "impression_count"));
            touTiaoArticle.setPlayCount(intFieldAsString(data, "video_play_count"));
            return touTiaoArticle;
        } catch (Exception e) {
            // Trailing throwable (no placeholder for it) so the stack trace is recorded.
            logger.info("解析文章信息时出现问题,链接地址为:{}", url, e);
        }
    }
    return null;
}

/**
 * Reads an integer field and renders it as a decimal string.
 *
 * @param json object to read from
 * @param key  field name
 * @return the value as text, or {@code null} when the field is absent or JSON null
 */
private static String intFieldAsString(JSONObject json, String key)
{
    Integer value = json.getInteger(key);
    return value == null ? null : value.toString();
}
/**
 * Downloads the body of {@code url} with a GET request, trying up to three times.
 *
 * @param url     address to request
 * @param proxy   proxy to use; when {@code null}, {@code ProxyHolder.NAT_HEAVY_PROXY} is used
 * @param headMap request headers handed to {@code RequestUtils.wrapGet}
 * @return the response body as text, or {@code null} if every attempt threw
 */
private static String downloadHtml(String url, Proxy proxy, Map<String,String> headMap) {
    for (int attempt = 1; attempt <= 3; attempt++) {
        try {
            Response response;
            if (proxy != null) {
                response = httpBoot.syncCall(RequestUtils.wrapGet(url, headMap), proxy);
            } else {
                response = httpBoot.syncCall(RequestUtils.wrapGet(url, headMap), ProxyHolder.NAT_HEAVY_PROXY);
            }
            return response.body().string();
        } catch (Exception e) {
            // Log the exception text in the message and pass the throwable itself last so its
            // original stack trace is kept (fillInStackTrace() would overwrite it with this
            // call site). The loop then moves on to the next attempt, if any remain.
            logger.error("获取数据时出现问题,问题为:{}", e.toString(), e);
        }
    }
    return null;
}
}
package com.zhiwei.toutiao.parse; package com.zhiwei.toutiao.parse;
import java.io.IOException; import java.io.IOException;
import java.net.Proxy; import java.net.Proxy;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Date; import java.util.Date;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Objects; import java.util.Objects;
import org.apache.logging.log4j.LogManager; import com.zhiwei.toutiao.bean.TouTiaoArticle;
import org.apache.logging.log4j.Logger; import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import com.alibaba.fastjson.JSONArray; import org.apache.logging.log4j.Logger;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot; import com.alibaba.fastjson.JSONArray;
import com.zhiwei.crawler.proxy.ProxyHolder; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.utils.RequestUtils; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.toutiao.bean.TouTiaoComment; import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.toutiao.util.Tools; import com.zhiwei.tools.tools.ZhiWeiTools;
import com.zhiwei.toutiao.bean.TouTiaoComment;
import okhttp3.Response; import com.zhiwei.toutiao.util.Tools;
/** import okhttp3.Response;
* @ClassName: TouTiaoComment import org.jsoup.Jsoup;
* @Description: 今日头条评论数据
* @author hero /**
* @date 2016年12月9日 下午7:50:28 * @ClassName: TouTiaoComment
*/ * @Description: 今日头条评论数据
public class TouTiaoCommentParse { * @author hero
* @date 2016年12月9日 下午7:50:28
private static Logger logger = LogManager.getLogger(TouTiaoCommentParse.class); */
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build(); public class TouTiaoCommentParse {
private static Logger logger = LogManager.getLogger(TouTiaoCommentParse.class);
/** private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
*
* @Title: getTouTiaoComment
* @author hero /**
* @Description: 获取评论列表,可指定限制返回条数 *
* @param @param url * @Title: getTouTiaoComment
* @param @param count * @author hero
* @param @param proxy * @Description: 获取评论列表,可指定限制返回条数
* @param @return * @param @param url
* @param @throws Exception 设定文件 * @param @param count
* @return List<TouTiaoComment> 返回类型 * @param @param proxy
*/ * @param @return
public static List<TouTiaoComment> getTouTiaoComment(String url,int returnCount,Proxy proxy) throws Exception * @param @throws Exception 设定文件
{ * @return List<TouTiaoComment> 返回类型
List<TouTiaoComment> ttList = new ArrayList<TouTiaoComment>(); */
String group_id = getGroupId(url, proxy); public static List<TouTiaoComment> getTouTiaoComment(String url,int returnCount,Proxy proxy) throws Exception
//查询评论总页数 {
if(group_id != null){ List<TouTiaoComment> ttList = new ArrayList<TouTiaoComment>();
int page = getPage(group_id,proxy); String group_id = getGroupId(url, proxy);
if(returnCount>0){ //查询评论总页数
int pageMax = (int)Math.ceil((double)returnCount/20.0); if(group_id != null){
if(page>=pageMax){ int page = getPage(group_id,proxy);
page = pageMax; if(returnCount>0){
} int pageMax = (int)Math.ceil((double)returnCount/20.0);
} if(page>=pageMax){
for(int i=0;i<page;i++) page = pageMax;
{ }
String urlNew = "http://is.snssdk.com/article/v2/tab_comments/?app_name=news_article&offset=" }
+i*20+"&group_id="+group_id+"&aggr_type=1&count=20&fold=1&item_id="+group_id+"&ts="+System.currentTimeMillis(); for(int i=0;i<page;i++)
//设置头信息 {
Map<String,String> headerMap = Tools.getTouTiaoHeader(); String urlNew = "http://is.snssdk.com/article/v2/tab_comments/?app_name=news_article&offset="
headerMap.put("User-Agent", "News 6.6.5 rv:6.6.5.03 (iPhone; iOS 11.3; zh_CN) Cronet"); +i*20+"&group_id="+group_id+"&aggr_type=1&count=20&fold=1&item_id="+group_id+"&ts="+System.currentTimeMillis();
headerMap.put("Host", "is.snssdk.com"); //设置头信息
for(int j=1; j<=3; j++){ Map<String,String> headerMap = Tools.getTouTiaoHeader();
try { headerMap.put("User-Agent", "News 6.6.5 rv:6.6.5.03 (iPhone; iOS 11.3; zh_CN) Cronet");
String htmlBody = downloadHtml(urlNew, proxy, headerMap); headerMap.put("Host", "is.snssdk.com");
if(htmlBody!=null) for(int j=1; j<=3; j++){
{ try {
List<TouTiaoComment> commentes = analySisComment(htmlBody, url); String htmlBody = downloadHtml(urlNew, proxy, headerMap);
ttList.addAll(commentes); if(htmlBody!=null)
logger.info(" url {} 采集到第 {} 页 采集到 {} 条数据 ",url,page,ttList.size()); {
}else List<TouTiaoComment> commentes = analySisComment(htmlBody, url);
{ ttList.addAll(commentes);
logger.info("采集出现问题,地址为:{}", url); logger.info(" url {} 采集到第 {} 页 采集到 {} 条数据 ",url,page,ttList.size());
} }else
if(Objects.nonNull(proxy)) { {
ZhiWeiTools.sleep(100); logger.info("采集出现问题,地址为:{}", url);
}else { }
ZhiWeiTools.sleep(4000); if(Objects.nonNull(proxy)) {
} ZhiWeiTools.sleep(100);
break; }else {
} catch (Exception e) { ZhiWeiTools.sleep(4000);
continue; }
} break;
} } catch (Exception e) {
} continue;
} }
return ttList; }
} }
}
return ttList;
/** }
* @Title: analySisComment
* @Description: TODO(解析评论列表)
* @param @param htmlBody /**
* @param @return 设定文件 * @Title: analySisComment
* @return List<DBObject> 返回类型 * @Description: TODO(解析评论列表)
*/ * @param @param htmlBody
private static List<TouTiaoComment> analySisComment(String htmlBody,String url) * @param @return 设定文件
{ * @return List<DBObject> 返回类型
List<TouTiaoComment> list = new ArrayList<>(); */
try { private static List<TouTiaoComment> analySisComment(String htmlBody,String url)
JSONObject json = JSONObject.parseObject(htmlBody); {
JSONArray commentes = json.getJSONArray("data"); List<TouTiaoComment> list = new ArrayList<>();
for(int a = 0;a<commentes.size();a++) try {
{ JSONObject json = JSONObject.parseObject(htmlBody);
JSONObject comment = commentes.getJSONObject(a).getJSONObject("comment"); JSONArray commentes = json.getJSONArray("data");
String id = comment.getString("id"); for(int a = 0;a<commentes.size();a++)
String text = comment.getString("text"); {
String name = comment.getString("user_name"); JSONObject comment = commentes.getJSONObject(a).getJSONObject("comment");
int reply_count = comment.getIntValue("reply_count"); String id = comment.getString("id");
int digg_count = comment.getIntValue("digg_count"); String text = comment.getString("text");
long timeLong = comment.getLongValue("create_time")*1000; String name = comment.getString("user_name");
Date date = new Date(timeLong); int reply_count = comment.getIntValue("reply_count");
int digg_count = comment.getIntValue("digg_count");
TouTiaoComment ttComment = new TouTiaoComment(id, long timeLong = comment.getLongValue("create_time")*1000;
text, name, reply_count, digg_count, Date date = new Date(timeLong);
date, url);
list.add(ttComment); TouTiaoComment ttComment = new TouTiaoComment(id,
} text, name, reply_count, digg_count,
} catch (Exception e) { date, url);
logger.debug("解析今日头条评论列表出现为题,{}",e); list.add(ttComment);
} }
return list; } catch (Exception e) {
} logger.debug("解析今日头条评论列表出现为题,{}",e);
}
/** return list;
* @Title: getPage }
* @Description: TODO(获取总页数)
* @param @param url /**
* @param @return 设定文件 * @Title: getPage
* @return int 返回类型 * @Description: TODO(获取总页数)
* @throws Exception * @param @param url
*/ * @param @return 设定文件
private static int getPage(String groupId,Proxy proxy) throws Exception * @return int 返回类型
{ * @throws Exception
String urlNew = "http://www.toutiao.com/api/comment/list/?group_id="+ groupId +"&item_id=0&count=20&offset=0"; */
//设置头信息 private static int getPage(String groupId,Proxy proxy) throws Exception
Map<String,String> headerMap = Tools.getTouTiaoHeader(); {
String htmlBody = downloadHtml(urlNew, proxy, headerMap); String urlNew = "http://www.toutiao.com/api/comment/list/?group_id="+ groupId +"&item_id=0&count=20&offset=0";
if(htmlBody!=null) //设置头信息
{ Map<String,String> headerMap = Tools.getTouTiaoHeader();
try { String htmlBody = downloadHtml(urlNew, proxy, headerMap);
JSONObject json = JSONObject.parseObject(htmlBody); if(htmlBody!=null)
JSONObject data = json.getJSONObject("data"); {
int count = data.getIntValue("total"); try {
JSONObject json = JSONObject.parseObject(htmlBody);
return (int)Math.ceil((double)count/20.0); JSONObject data = json.getJSONObject("data");
} catch (Exception e) { int count = data.getIntValue("total");
logger.info("获取评论总页数时出现问题:{}",e);
} return (int)Math.ceil((double)count/20.0);
} } catch (Exception e) {
return -1; logger.info("获取评论总页数时出现问题:{}",e);
} }
}
return -1;
}
/**
* @Title: findCommentCount
* @author hero
* @Description: 根据id获取头条评论数 /**
* @param @param url * @Title: findCommentCount
* @param @param proxy * @author hero
* @param @return 设定文件 * @Description: 根据id获取头条评论数
* @return int 返回类型 * @param @param url
*/ * @param @param proxy
public static int findCommentCount(String url,Proxy proxy) * @param @return 设定文件
{ * @return int 返回类型
for(int i=0; i<3; i++){ */
try { public static int findCommentCount(String url,Proxy proxy)
//设置头信息 {
Map<String,String> headerMap = Tools.getTouTiaoHeader(); for(int i=0; i<3; i++){
String htmlBody = downloadHtml(url, proxy, headerMap); try {
if(htmlBody!=null && htmlBody.contains("commentInfo")) //设置头信息
{ Map<String,String> headerMap = Tools.getTouTiaoHeader();
try { String htmlBody = downloadHtml(url, proxy, headerMap);
return Integer.valueOf(htmlBody.split("comments_count: ")[1].split(",")[0]); if(htmlBody!=null && htmlBody.contains("commentInfo"))
} catch (Exception e) { {
logger.error("解析头条评论数错误:::{}", e.fillInStackTrace()); try {
return 0; return Integer.valueOf(htmlBody.split("comments_count: ")[1].split(",")[0]);
} } catch (Exception e) {
} logger.error("解析头条评论数错误:::{}", e.fillInStackTrace());
} catch (Exception e) { return 0;
continue; }
} }
} } catch (Exception e) {
return 0; continue;
} }
}
/** return 0;
* @Title: findCommentCount }
* @author hero
* @Description: 根据id获取头条评论数 /**
* @param @param url * @Title: findCommentCount
* @param @param proxy * @author hero
* @param @return 设定文件 * @Description: 根据id获取头条评论数
* @return int 返回类型 * @param @param url
*/ * @param @param proxy
public static int findNewCommentCountByProxy(String url,Proxy proxy) * @param @return 设定文件
{ * @return int 返回类型
try { */
//设置头信息 public static int findNewCommentCountByProxy(String url,Proxy proxy)
Map<String,String> headerMap = Tools.getTouTiaoHeader(); {
String htmlBody = downloadHtml(url, proxy, headerMap); try {
if(htmlBody!=null && htmlBody.contains("commentInfo")) //设置头信息
{ Map<String,String> headerMap = Tools.getTouTiaoHeader();
try { String htmlBody = downloadHtml(url, proxy, headerMap);
return Integer.valueOf(htmlBody.split("comments_count: ")[1].split(",")[0]); if(htmlBody!=null && htmlBody.contains("commentInfo"))
} catch (Exception e) { {
logger.error("解析头条评论数错误:::{}", e.fillInStackTrace()); try {
return -1; return Integer.valueOf(htmlBody.split("comments_count: ")[1].split(",")[0]);
} } catch (Exception e) {
} logger.error("解析头条评论数错误:::{}", e.fillInStackTrace());
} catch (Exception e) { return -1;
logger.error("解析头条评论数错误:::{}", e.fillInStackTrace()); }
} }
return -1; } catch (Exception e) {
} logger.error("解析头条评论数错误:::{}", e.fillInStackTrace());
}
/** return -1;
* @Title: getCommentCount }
* @Description: TODO(根据id查看评论数)
* @param @param url /**
* @param @return 设定文件 * @Title: getCommentCount
* @return int 返回类型 * @Description: TODO(根据id查看评论数)
* @throws IOException * @param @param url
*/ * @param @return 设定文件
public static int getCommentCount(String url,Proxy proxy) * @return int 返回类型
{ * @throws IOException
String group_id = getGroupId(url, proxy); */
for(int i=0; i<3; i++){ public static int getCommentCount(String url,Proxy proxy) throws Exception
try { {
String urlNew = "http://www.toutiao.com/api/comment/list/?group_id="+group_id+"&item_id=0&count=20&offset=0"; String group_id = getGroupId(url, proxy);
//设置头信息 for(int i=0; i<3; i++){
Map<String,String> headerMap = Tools.getTouTiaoHeader(); try {
String htmlBody = downloadHtml(urlNew, proxy, headerMap); String urlNew = "http://www.toutiao.com/api/comment/list/?group_id="+group_id+"&item_id=0&count=20&offset=0";
if(htmlBody!=null) //设置头信息
{ Map<String,String> headerMap = Tools.getTouTiaoHeader();
try { String htmlBody = downloadHtml(urlNew, proxy, headerMap);
JSONObject json = JSONObject.parseObject(htmlBody); if(htmlBody!=null)
JSONObject data = json.getJSONObject("data"); {
return data.getIntValue("total"); try {
JSONObject json = JSONObject.parseObject(htmlBody);
} catch (Exception e) { JSONObject data = json.getJSONObject("data");
logger.info("获取评论总页数时出现问题:{}",e); return data.getIntValue("total");
}
} } catch (Exception e) {
} catch (Exception e) { logger.info("获取评论总页数时出现问题:{}",e);
continue; }
} }
} } catch (Exception e) {
return -1; continue;
} }
}
return -1;
}
/**
* @Title: getGroupId
* @Description: TODO(获取groupId用于更新评论列表) /**
* @param @param url * @Title: getGroupId
* @param @return 设定文件 * @Description: TODO(获取groupId用于更新评论列表)
* @return String 返回类型 * @param @param url
*/ * @param @return 设定文件
private static String getGroupId(String url,Proxy proxy) * @return String 返回类型
{ */
String groupId = null; private static String getGroupId(String url,Proxy proxy) throws Exception
if(url.contains("/a")||url.contains("/group/")) {
{ String groupId = null;
if(url.contains("/a")) if(url.contains("/a")||url.contains("/group/"))
{ {
groupId = url.split("/a")[1].replace("/", ""); if(url.contains("/a"))
}else {
{ groupId = url.split("/a")[1].replace("/", "");
groupId = url.split("/group/")[1].replace("/", ""); }else
} {
}else if(url.contains("/i")||url.contains("/item/")) groupId = url.split("/group/")[1].replace("/", "");
{ }
groupId = gettGroupIdByUrl(url, proxy); }else if(url.contains("/i")||url.contains("/item/"))
} {
return groupId; groupId = getGroupIdByUrl(url, proxy);
}
} return groupId;
/** }
* @Title: gettGroupIdByUrl
* @Description: TODO(解析并获取groupId) /**
* @param @param url * @Title: gettGroupIdByUrl
* @param @return 设定文件 * @Description: TODO(解析并获取groupId)
* @return String 返回类型 * @param @param url
*/ * @param @return 设定文件
private static String gettGroupIdByUrl(String url,Proxy proxy) * @return String 返回类型
{ */
String groupId = null; private static String getGroupIdByUrl(String url,Proxy proxy) throws Exception
Map<String,String> headerMap = Tools.getTouTiaoHeader(); {
try { String groupId = null;
String htmlBody = downloadHtml(url, proxy, headerMap); Map<String,String> headerMap = Tools.getTouTiaoHeader();
if(htmlBody != null) String htmlBody = downloadHtml(url, proxy, headerMap);
{ if(htmlBody != null)
if(htmlBody.contains("groupId")) {
{ if(htmlBody.contains("groupId"))
groupId = htmlBody.split("groupId: '")[1] {
.split("',")[0].trim(); groupId = htmlBody.split("groupId: '")[1]
} .split("',")[0].trim();
}else }
{ }else
logger.info("获取groupId失败,链接地址为:{}",url); {
} logger.info("获取groupId失败,链接地址为:{}",url);
} catch (Exception e) { }
e.printStackTrace(); return groupId;
logger.error("获取groupId失败,链接地址为:{}",url,e); }
}
return groupId; /**
} * 下载数据
* @param url
private static String downloadHtml(String url, Proxy proxy, Map<String,String> headerMap) { * @param proxy
// 下载数据页面 * @param headerMap
for (int i = 1; i <= 3; i++) { * @return
try { */
Response response = null; private static String downloadHtml(String url, Proxy proxy, Map<String,String> headerMap) {
if(proxy != null) { // 下载数据页面
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy); for (int i = 1; i <= 3; i++) {
}else { try {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), ProxyHolder.NAT_HEAVY_PROXY); Response response = null;
} if(proxy != null) {
return response.body().string(); response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy);
} catch (Exception e) { }else {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace()); response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), ProxyHolder.NAT_HEAVY_PROXY);
if(i==3){ }
break; return response.body().string();
}else{ } catch (Exception e) {
continue; logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace());
} if(i==3){
} break;
} }else{
return null; continue;
} }
}
} }
return null;
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment