Commit 9d384b56 by zhiwei

添加更新今日头条阅读数功能

parent 34d3c078
/**
* @Title: TouTiaoParse.java
* @Package com.zhiwei.toutiao.parse
* @Description:
* @author hero
* @date 2016年9月2日 上午11:17:44
* @version V1.0
*/
/**
*
*/
package com.zhiwei.toutiao.parse;
import java.io.IOException;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.toutiao.bean.Signature;
import com.zhiwei.toutiao.bean.TouTiaoArticle;
import com.zhiwei.toutiao.util.Tools;
import okhttp3.Response;
/**
* @Description:头条帐号采集
* @author hero
* @date 2016年9月2日 上午11:17:44
*/
public class TouTiaoArticleParse {
// Script engine looked up by the name "javascript"; getContent uses it to evaluate the
// BASE_DATA script cut out of an article page.
// NOTE(review): getEngineByName returns null when no such engine is installed - confirm the target JDK ships one.
private static ScriptEngine scriptEngine = new ScriptEngineManager().getEngineByName("javascript");
// Logger for this class.
private static Logger logger = LogManager.getLogger(TouTiaoArticleParse.class);
// Shared HTTP client, built with retryTimes(3).
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/**
 * Requests one page of an account's article list from the {@code /pgc/ma/} endpoint
 * and parses the response.
 *
 * @param mediaId      value sent as the {@code media_id} query parameter
 * @param maxBehotTime paging cursor sent as {@code max_behot_time}; omitted when {@code null}
 * @param endData      cut-off date handed to the parser; may be {@code null}
 * @param proxy        proxy passed to {@code downloadHtml}
 * @return the parsed result map, or an empty map when the body is missing, does not
 *         contain {@code behot_time}, or parsing produced nothing
 * @throws Exception any exception raised inside the download/parse step, rethrown after logging
 */
public static Map<String, Object> getTouTiaoList(String mediaId, String maxBehotTime, Date endData, Proxy proxy)
        throws Exception {
    Signature signature = new Signature();
    String url = "https://www.toutiao.com/pgc/ma/?page_type=1&media_id=" + mediaId + "&count=20&as="
            + signature.getAs() + "&cp=" + signature.getCp();
    if (maxBehotTime != null) {
        url = url + "&max_behot_time=" + maxBehotTime;
    }
    Map<String, String> headerMap = Tools.getTouTiaoHeader();
    headerMap.put("Referer", url);
    try {
        String htmlBody = downloadHtml(url, proxy, headerMap);
        if (htmlBody != null && htmlBody.contains("behot_time")) {
            Map<String, Object> ttList = parseHtmlByAccount(htmlBody, endData);
            if (ttList != null && !ttList.isEmpty()) {
                return ttList;
            }
        } else {
            logger.info("数据为null");
        }
    } catch (Exception e) {
        // Log the exception untouched: fillInStackTrace() would replace the original
        // trace with this catch site and hide where the failure really happened.
        logger.error("获取今日头条帐号数据连接超时", e);
        throw e;
    }
    return Collections.emptyMap();
}
/**
 * Requests one page of an account's article list from the {@code /pgc/ma/} endpoint
 * through the given {@link ProxyHolder} and parses the response.
 *
 * @param mediaId      value sent as the {@code media_id} query parameter
 * @param maxBehotTime paging cursor sent as {@code max_behot_time}; omitted when {@code null}
 * @param endData      cut-off date handed to the parser; may be {@code null}
 * @param proxy        proxy holder passed to the HTTP client
 * @return the parsed result map, or an empty map when the body does not contain
 *         {@code behot_time} or parsing produced nothing
 * @throws Exception any exception raised by the request or the parse step, rethrown after logging
 */
public static Map<String, Object> getTouTiaoList(String mediaId, String maxBehotTime, Date endData, ProxyHolder proxy)
        throws Exception {
    Signature signature = new Signature();
    String url = "https://www.toutiao.com/pgc/ma/?page_type=1&media_id=" + mediaId + "&count=20&as="
            + signature.getAs() + "&cp=" + signature.getCp();
    if (maxBehotTime != null) {
        url = url + "&max_behot_time=" + maxBehotTime;
    }
    Map<String, String> headerMap = Tools.getTouTiaoHeader();
    headerMap.put("Referer", url);
    // try-with-resources guarantees the response is released even if reading the body fails.
    try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy)) {
        String htmlBody = response.body().string();
        if (htmlBody != null && htmlBody.contains("behot_time")) {
            Map<String, Object> ttList = parseHtmlByAccount(htmlBody, endData);
            if (ttList != null && !ttList.isEmpty()) {
                return ttList;
            }
        } else {
            logger.info("数据为null");
        }
    } catch (Exception e) {
        // Log the exception untouched: fillInStackTrace() would replace the original
        // trace with this catch site and hide where the failure really happened.
        logger.error("获取今日头条帐号数据连接超时", e);
        throw e;
    }
    return Collections.emptyMap();
}
/**
 * Requests one page of a user's article history from the {@code /c/user/article/}
 * endpoint, making up to three attempts.
 *
 * <p>Another attempt is made when the body is missing or lacks {@code behot_time},
 * and also when parsing yields nothing.
 *
 * @param userId       value sent as {@code user_id} and used in the referer header
 * @param maxBehotTime paging cursor sent as {@code max_behot_time}
 * @param endData      cut-off date handed to the parser; may be {@code null}
 * @param proxy        proxy passed to {@code downloadHtml}
 * @return the parsed result map, or an empty map when all three attempts produced nothing
 * @throws Exception any exception raised inside the download/parse step, rethrown after logging
 */
public static Map<String, Object> getTouTiaoHistory(String userId, String maxBehotTime, Date endData,
        Proxy proxy) throws Exception {
    for (int i = 0; i < 3; i++) {
        // A fresh signature is computed for every attempt.
        Signature signature = new Signature(userId, maxBehotTime);
        String as = signature.getAs();
        String cp = signature.getCp();
        String signatureStr = signature.getSignature();
        String url = "https://www.toutiao.com/c/user/article/?page_type=1&user_id=" + userId + "&max_behot_time="
                + maxBehotTime + "&count=20&as=" + as + "&cp=" + cp + "&_signature=" + signatureStr;
        Map<String, String> headerMap = new HashMap<>();
        headerMap.put("user-agent",
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36");
        headerMap.put("referer", "https://www.toutiao.com/c/user/" + userId + "/");
        try {
            String htmlBody = downloadHtml(url, proxy, headerMap);
            if (htmlBody != null && htmlBody.contains("behot_time")) {
                Map<String, Object> ttList = parseHtmlByAccount(userId, htmlBody, endData);
                if (ttList != null && !ttList.isEmpty()) {
                    return ttList;
                }
                // Nothing parsed: fall through to the next attempt.
            } else {
                logger.info("数据为null");
            }
        } catch (Exception e) {
            // Log the exception untouched: fillInStackTrace() would replace the original
            // trace with this catch site and hide where the failure really happened.
            logger.error("获取今日头条帐号数据连接超时", e);
            throw e;
        }
    }
    return Collections.emptyMap();
}
/**
 * Requests one page of a user's article history from the {@code /c/user/article/}
 * endpoint through the given {@link ProxyHolder}.
 *
 * <p>Up to three attempts are made while the body is missing or lacks
 * {@code behot_time}. Once a usable body arrives, the parse result is returned if it
 * is non-empty; otherwise the method stops retrying and returns an empty map.
 *
 * @param userId       value sent as {@code user_id} and used in the referer header
 * @param maxBehotTime paging cursor sent as {@code max_behot_time}
 * @param endData      cut-off date handed to the parser; may be {@code null}
 * @param proxy        proxy holder passed to the HTTP client
 * @return the parsed result map, or an empty map
 * @throws Exception any exception raised by the request or the parse step, rethrown after logging
 */
public static Map<String, Object> getTouTiaoHistory(String userId, String maxBehotTime, Date endData,
        ProxyHolder proxy) throws Exception {
    int attempt = 0;
    while (attempt < 3) {
        attempt++;
        Signature sign = new Signature(userId, maxBehotTime);
        String requestUrl = "https://www.toutiao.com/c/user/article/?page_type=1&user_id=" + userId
                + "&max_behot_time=" + maxBehotTime + "&count=20&as=" + sign.getAs() + "&cp=" + sign.getCp()
                + "&_signature=" + sign.getSignature();
        logger.info("当前采集的历史文章链接:::{}", requestUrl);

        Map<String, String> headers = new HashMap<>();
        headers.put("user-agent",
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36");
        headers.put("referer", "https://www.toutiao.com/c/user/" + userId + "/");

        try {
            String body = httpBoot.syncCall(RequestUtils.wrapGet(requestUrl, headers), proxy).body().string();
            boolean usable = body != null && body.contains("behot_time");
            if (!usable) {
                logger.info("数据为null,获取到的文本为:::{}", body);
                continue;
            }
            Map<String, Object> parsed = parseHtmlByAccount(userId, body, endData);
            if (parsed == null || parsed.size() == 0) {
                // A usable body that parses to nothing ends the retry loop.
                break;
            }
            return parsed;
        } catch (Exception e) {
            logger.error("获取今日头条帐号数据连接超时", e);
            throw e;
        }
    }
    return Collections.emptyMap();
}
/**
 * Parses an article-list JSON response into {@link TouTiaoArticle} objects.
 *
 * <p>Only entries of the {@code data} array that carry a {@code group_id} are
 * converted; an entry that fails to convert is logged and skipped. The author id is
 * taken from each entry's {@code creator_uid}.
 *
 * @param htmlBody JSON text of the response
 * @param endDate  cut-off date; when non-null and later than the next-page cursor
 *                 (read as epoch seconds), the returned cursor is set to {@code null}
 * @return a map with {@code "max_behot_time"} (a {@code Long} or {@code null}) and
 *         {@code "data"} (the article list), or {@code null} when the response as a
 *         whole cannot be parsed
 */
private static Map<String, Object> parseHtmlByAccount(String htmlBody, Date endDate) {
    Map<String, Object> map = new HashMap<>();
    Long maxBehotTime = null;
    List<TouTiaoArticle> dataList = new ArrayList<>();
    try {
        JSONObject json = JSONObject.parseObject(htmlBody);
        JSONArray jsonArray = json.getJSONArray("data");
        maxBehotTime = Long.valueOf(json.getJSONObject("next").getString("max_behot_time"));
        // This response is never read for a like count; the field is passed as null.
        String likeNum = null;
        for (int i = 0; i < jsonArray.size(); i++) {
            try {
                JSONObject data = jsonArray.getJSONObject(i);
                if (!data.containsKey("group_id")) {
                    continue;
                }
                String href = "https://www.toutiao.com/" + "a" + data.getLongValue("group_id");
                String title = data.getString("title");
                String content = data.getString("abstract");
                // behot_time is multiplied by 1000 before being handed to the date parser.
                String time = data.getLongValue("behot_time") * 1000 + "";
                Date date = TimeParse.stringFormartDate(time);
                String readNum = data.getString("go_detail_count");
                String commentNum = data.getString("comments_count");
                String playNum = data.getString("detail_play_effective_count");
                String shareNum = data.getString("share_count");
                String source = data.getString("source");
                String userId = data.getLong("creator_uid") + "";
                String articleType = data.getString("chinese_tag");
                TouTiaoArticle tt = new TouTiaoArticle(href, title, userId, source, date, content, commentNum,
                        playNum, readNum, shareNum, "今日头条", articleType, likeNum);
                if (data.containsKey("label")) {
                    tt.setLabelList(data.getJSONArray("label").toJavaList(String.class));
                }
                dataList.add(tt);
            } catch (Exception e) {
                // Skip the broken entry but keep the stack trace in the log.
                logger.error("数据解析出现问题,{}", e.getMessage(), e);
            }
        }
    } catch (Exception e) {
        logger.error("数据解析出现问题,{}", e.getMessage(), e);
        return null;
    }
    // The former `!"0".equals(maxBehotTime)` test compared a String with a Long and was
    // always true, so it is dropped; a cursor of 0 still maps to the epoch and is
    // therefore cleared whenever an end date is supplied, exactly as before.
    if (endDate != null && maxBehotTime != null) {
        Date nextDate = new Date(maxBehotTime * 1000L);
        if (endDate.after(nextDate)) {
            maxBehotTime = null;
        }
    }
    map.put("max_behot_time", maxBehotTime);
    map.put("data", dataList);
    return map;
}
/**
 * Parses an article-list JSON response into {@link TouTiaoArticle} objects, stamping
 * every article with the supplied user id.
 *
 * <p>Only entries of the {@code data} array that carry a {@code group_id} are
 * converted; an entry that fails to convert is logged and skipped.
 *
 * @param userId   user id stored on every produced article
 * @param htmlBody JSON text of the response
 * @param endDate  cut-off date; when non-null and later than the next-page cursor
 *                 (read as epoch seconds), the returned cursor is set to {@code null}
 * @return a map with {@code "max_behot_time"} (a {@code Long} or {@code null}) and
 *         {@code "data"} (the article list), or {@code null} when the response as a
 *         whole cannot be parsed
 */
private static Map<String, Object> parseHtmlByAccount(String userId, String htmlBody, Date endDate) {
    Map<String, Object> map = new HashMap<>();
    Long maxBehotTime = null;
    List<TouTiaoArticle> dataList = new ArrayList<>();
    try {
        JSONObject json = JSONObject.parseObject(htmlBody);
        JSONArray jsonArray = json.getJSONArray("data");
        maxBehotTime = Long.valueOf(json.getJSONObject("next").getString("max_behot_time"));
        // This response is never read for a like count; the field is passed as null.
        String likeNum = null;
        for (int i = 0; i < jsonArray.size(); i++) {
            try {
                JSONObject data = jsonArray.getJSONObject(i);
                if (!data.containsKey("group_id")) {
                    continue;
                }
                String href = "https://www.toutiao.com/" + "a" + data.getLongValue("group_id");
                String title = data.getString("title");
                String content = data.getString("abstract");
                // behot_time is multiplied by 1000 before being handed to the date parser.
                String time = data.getLongValue("behot_time") * 1000 + "";
                Date date = TimeParse.stringFormartDate(time);
                String readNum = data.getString("go_detail_count");
                String commentNum = data.getString("comments_count");
                String playNum = data.getString("detail_play_effective_count");
                String shareNum = data.getString("share_count");
                String source = data.getString("source");
                String articleType = data.getString("chinese_tag");
                TouTiaoArticle tt = new TouTiaoArticle(href, title, userId, source, date, content, commentNum,
                        playNum, readNum, shareNum, "今日头条", articleType, likeNum);
                if (data.containsKey("label")) {
                    tt.setLabelList(data.getJSONArray("label").toJavaList(String.class));
                }
                dataList.add(tt);
            } catch (Exception e) {
                // Skip the broken entry but keep the stack trace in the log.
                logger.error("数据解析出现问题,{}", e.getMessage(), e);
            }
        }
    } catch (Exception e) {
        logger.error("数据解析出现问题,{}", e.getMessage(), e);
        return null;
    }
    // The former `!"0".equals(maxBehotTime)` test compared a String with a Long and was
    // always true, so it is dropped; a cursor of 0 still maps to the epoch and is
    // therefore cleared whenever an end date is supplied, exactly as before.
    if (endDate != null && maxBehotTime != null) {
        Date nextDate = new Date(maxBehotTime * 1000L);
        if (endDate.after(nextDate)) {
            maxBehotTime = null;
        }
    }
    map.put("max_behot_time", maxBehotTime);
    map.put("data", dataList);
    return map;
}
/**
 * Requests one page of a user's micro-toutiao feed from the {@code /api/pc/feed/}
 * endpoint and parses it.
 *
 * @param userId       value sent as {@code visit_user_id} and used in the referer header
 * @param endDate      cut-off date handed to the parser
 * @param proxy        proxy passed to {@code downloadHtml}
 * @param maxBehotTime paging cursor sent as {@code max_behot_time}; omitted when {@code null}
 * @return the parsed result map, or {@code null} when nothing was downloaded, nothing
 *         was parsed, or an exception occurred
 * @throws IOException declared for callers; exceptions raised in the body are caught and logged
 */
public static Map<String, Object> getMicroTouTiaoCrawler(String userId, Date endDate, Proxy proxy,
        String maxBehotTime) throws IOException {
    String url = "https://www.toutiao.com/api/pc/feed/?category=pc_profile_ugc&utm_source=toutiao&visit_user_id=" + userId;
    if (maxBehotTime != null) {
        url = url + "&max_behot_time=" + maxBehotTime;
    }
    // Report the request URL through the logger rather than stdout.
    logger.info("微头条采集链接:::{}", url);
    Map<String, String> headerMap = Tools.getTouTiaoHeader();
    headerMap.put("Referer", "https://www.toutiao.com/c/user/" + userId + "/");
    try {
        String htmlBody = downloadHtml(url, proxy, headerMap);
        if (htmlBody != null) {
            Map<String, Object> dataMap = parseHtmlByMicroAccount(htmlBody, endDate);
            if (dataMap != null && !dataMap.isEmpty()) {
                return dataMap;
            }
        } else {
            logger.info("数据为null");
        }
    } catch (Exception e) {
        logger.info("获取数据出错::{},数据为null", e);
        return null;
    }
    return null;
}
/**
 * Requests one page of a user's micro-toutiao feed from the {@code /api/pc/feed/}
 * endpoint through the given {@link ProxyHolder} and parses it.
 *
 * @param userId       value sent as {@code visit_user_id} and used in the referer header
 * @param endDate      cut-off date handed to the parser
 * @param proxy        proxy holder passed to the HTTP client
 * @param maxBehotTime paging cursor sent as {@code max_behot_time}; omitted when {@code null}
 * @return the parsed result map, or {@code null} when the body lacks {@code create_time},
 *         nothing was parsed, or an exception occurred
 * @throws IOException declared for callers; exceptions raised in the body are caught and logged
 */
public static Map<String, Object> getMicroTouTiaoCrawler(String userId, Date endDate, ProxyHolder proxy,
        Long maxBehotTime) throws IOException {
    StringBuilder link = new StringBuilder(
            "https://www.toutiao.com/api/pc/feed/?category=pc_profile_ugc&utm_source=toutiao&visit_user_id=");
    link.append(userId);
    if (maxBehotTime != null) {
        link.append("&max_behot_time=").append(maxBehotTime);
    }
    String requestUrl = link.toString();
    logger.info("微头条采集链接:::{}", requestUrl);

    Map<String, String> headers = Tools.getTouTiaoHeader();
    headers.put("Referer", "https://www.toutiao.com/c/user/" + userId + "/");

    try {
        String body = httpBoot.syncCall(RequestUtils.wrapGet(requestUrl, headers), proxy).body().string();
        if (body == null || !body.contains("create_time")) {
            logger.info("数据为null");
            return null;
        }
        Map<String, Object> parsed = parseHtmlByMicroAccount(body, endDate);
        boolean hasEntries = parsed != null && parsed.size() > 0;
        return hasEntries ? parsed : null;
    } catch (Exception e) {
        logger.info("获取数据出错::{},数据为null", e);
        return null;
    }
}
/**
 * Pages through a user's micro-toutiao feed on the client endpoint
 * ({@code i.snssdk.com/api/feed/profile/v1/}) and collects every entry that has a
 * {@code comment_base} object.
 *
 * <p>Paging stops when the {@code offset} returned by the server equals the offset
 * that was just requested. It also stops after three consecutive failed pages, so a
 * persistent network or parse error can no longer spin forever; whatever was
 * collected up to that point is returned.
 *
 * @param userId       value sent as {@code visited_uid} on every request
 * @param proxy        proxy holder passed to the HTTP client
 * @param maxBehotTime starting offset sent as {@code offset}
 * @return one map per collected entry with keys {@code time}, {@code href},
 *         {@code source}, {@code content}, {@code readNum}, {@code commentNum},
 *         {@code user_id} and, when an {@code origin_group} is present,
 *         {@code title} and {@code replayUrl}
 */
public static List<Map<String, Object>> getClientMicroToutiaoCrawler(String userId, ProxyHolder proxy,
        Long maxBehotTime) {
    final int maxConsecutiveFailures = 3;
    int consecutiveFailures = 0;
    List<Map<String, Object>> dataList = new ArrayList<>();
    while (true) {
        String url = "https://i.snssdk.com/api/feed/profile/v1/?visited_uid=" + userId + "&offset=" + maxBehotTime;
        String requestedOffset = String.valueOf(maxBehotTime);
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)) {
            String result = response.body().string();
            JSONObject json = JSONObject.parseObject(result);
            maxBehotTime = json.getLongValue("offset");
            JSONArray jsonArray = json.getJSONArray("data");
            for (int i = 0; i < jsonArray.size(); i++) {
                JSONObject data = jsonArray.getJSONObject(i);
                try {
                    JSONObject dataJSON = data.getJSONObject("content").getJSONObject("raw_data");
                    if (!dataJSON.containsKey("comment_base") || dataJSON.getJSONObject("comment_base") == null) {
                        continue;
                    }
                    JSONObject commentBase = dataJSON.getJSONObject("comment_base");
                    Map<String, Object> map = new HashMap<>();
                    Date date = new Date(commentBase.getLongValue("create_time") * 1000);
                    String href = "http://weitoutiao.zjurl.cn/ugc/share/wap/comment/" + dataJSON.getLongValue("id");
                    String source = commentBase.getJSONObject("user").getJSONObject("info").getString("name");
                    String content = commentBase.getString("content");
                    String readNum = commentBase.getJSONObject("action").getInteger("read_count") + "";
                    String commentNum = commentBase.getJSONObject("action").getInteger("comment_count") + "";
                    // Kept in a local: overwriting the userId parameter would change the
                    // visited_uid of every following page request.
                    String authorId = commentBase.getJSONObject("user").getJSONObject("info").getString("user_id");
                    if (dataJSON.containsKey("origin_group")) {
                        String replayUrl = dataJSON.getJSONObject("origin_group").getString("article_url");
                        String title = dataJSON.getJSONObject("origin_group").getString("title");
                        map.put("title", title);
                        map.put("replayUrl", replayUrl);
                    }
                    map.put("time", date);
                    map.put("href", href);
                    map.put("source", source);
                    map.put("content", content);
                    map.put("readNum", readNum);
                    map.put("commentNum", commentNum);
                    map.put("user_id", authorId);
                    dataList.add(map);
                } catch (Exception e) {
                    // A single malformed entry is skipped; the rest of the page is still used.
                    logger.error("数据解析出现问题,{}", e.getMessage(), e);
                }
            }
            consecutiveFailures = 0;
            logger.info(" 采集到 条 == {} -- {} -- {}", dataList.size(), requestedOffset, maxBehotTime);
            if (requestedOffset.equals(String.valueOf(maxBehotTime))) {
                break;
            }
        } catch (Exception e) {
            logger.error("客户端微头条采集错误 {}", e.toString(), e);
            consecutiveFailures++;
            if (consecutiveFailures >= maxConsecutiveFailures) {
                // Give up instead of retrying the same page forever.
                break;
            }
        }
    }
    return dataList;
}
/**
 * Parses a micro-toutiao feed JSON response into {@link TouTiaoArticle} objects.
 *
 * <p>Each entry's payload is read from {@code stream_cell.raw_data} or
 * {@code concern_talk_cell.packed_json_str}. The title is the first 16 characters of
 * the content (or the whole content when shorter), computed separately for every
 * entry. Entries that cannot be parsed are logged and skipped.
 *
 * @param htmlBody JSON text of the response
 * @param endDate  cut-off date; when there are more pages and this date is later than
 *                 the date of the last entry read, the returned cursor is set to {@code null}
 * @return a map with {@code "max_behot_time"} (a {@code Long} or {@code null}) and
 *         {@code "data"} (the article list); never {@code null}
 */
private static Map<String, Object> parseHtmlByMicroAccount(String htmlBody, Date endDate) {
    final int titleLength = 16;
    Map<String, Object> map = new HashMap<>();
    Long maxBehotTime = null;
    List<TouTiaoArticle> dataList = new ArrayList<>();
    try {
        JSONObject json = JSONObject.parseObject(htmlBody);
        boolean more = false;
        if (json.containsKey("has_more")) {
            more = json.getBoolean("has_more");
        }
        if (json.containsKey("next")) {
            maxBehotTime = json.getJSONObject("next").getLongValue("max_behot_time");
        }
        // Date of the most recent entry read; consulted after the loop for the cut-off check.
        Date date = null;
        if (json.containsKey("data")) {
            JSONArray jsonArray = json.getJSONArray("data");
            // These two fields are not read from this response and are passed as null.
            String playNum = null;
            String articleType = null;
            for (int i = 0; i < jsonArray.size(); i++) {
                try {
                    JSONObject data = jsonArray.getJSONObject(i);
                    String text = null;
                    if (data.containsKey("stream_cell") && data.getJSONObject("stream_cell") != null) {
                        text = data.getJSONObject("stream_cell").getString("raw_data");
                    } else if (data.containsKey("concern_talk_cell")) {
                        text = data.getJSONObject("concern_talk_cell").getString("packed_json_str");
                    }
                    JSONObject dataJSON = JSONObject.parseObject(text);
                    if (dataJSON == null) {
                        // Neither known payload field was present; nothing to convert.
                        continue;
                    }
                    String href;
                    String source;
                    String content;
                    String readNum;
                    String commentNum;
                    String likeNum;
                    String userId;
                    if (dataJSON.containsKey("comment_base") && dataJSON.getJSONObject("comment_base") != null) {
                        JSONObject commentBase = dataJSON.getJSONObject("comment_base");
                        date = new Date(commentBase.getLongValue("create_time") * 1000);
                        href = "https://www.toutiao.com/a" + dataJSON.getLongValue("id");
                        source = commentBase.getJSONObject("user").getJSONObject("info").getString("name");
                        content = dataJSON.getString("content");
                        readNum = dataJSON.getJSONObject("action").getInteger("read_count") + "";
                        likeNum = dataJSON.getJSONObject("action").getInteger("digg_count") + "";
                        commentNum = dataJSON.getJSONObject("action").getInteger("comment_count") + "";
                        userId = commentBase.getJSONObject("user").getJSONObject("info").getString("user_id");
                    } else {
                        date = new Date(dataJSON.getLongValue("create_time") * 1000);
                        href = "https://www.toutiao.com/a" + dataJSON.getString("thread_id");
                        source = dataJSON.getJSONObject("user").getString("name");
                        content = dataJSON.getString("content");
                        readNum = dataJSON.getInteger("read_count") + "";
                        commentNum = dataJSON.getInteger("comment_count") + "";
                        likeNum = dataJSON.getInteger("digg_count") + "";
                        userId = dataJSON.getJSONObject("user").getString("user_id");
                    }
                    // The title is derived per entry: the length limit no longer shrinks
                    // after a short post, and an empty post no longer inherits the
                    // previous entry's title.
                    String title = null;
                    if (content != null && !content.isEmpty()) {
                        title = content.substring(0, Math.min(titleLength, content.length()));
                    }
                    TouTiaoArticle tt = new TouTiaoArticle(href, title, userId, source, date, content, commentNum,
                            playNum, readNum, "0", "微头条", articleType, likeNum);
                    dataList.add(tt);
                } catch (Exception e) {
                    // Skip the broken entry, but leave a trace instead of failing silently.
                    logger.warn("数据解析出现问题,{}", e.getMessage(), e);
                }
            }
        } else {
            logger.info("数据为null,获取到的文本为:::{}", json);
        }
        // Decide whether a next page should be requested.
        if (more) {
            // Without an end date or any parsed entry there is nothing to compare, so the
            // cursor is kept (previously this path raised a NullPointerException that the
            // outer catch swallowed, with the same end result).
            if (maxBehotTime != null && maxBehotTime != 0 && endDate != null && date != null
                    && endDate.after(date)) {
                maxBehotTime = null;
            }
        } else {
            maxBehotTime = null;
        }
    } catch (Exception e) {
        logger.error("数据解析出现问题,{}", e.getMessage(), e);
    }
    map.put("max_behot_time", maxBehotTime);
    map.put("data", dataList);
    return map;
}
/**
 * Downloads an article page and extracts the plain text of its body.
 *
 * <p>The page's {@code BASE_DATA} script is cut out with a regular expression,
 * evaluated with the shared script engine to read {@code articleInfo.content}, and
 * the resulting HTML is reduced to text with Jsoup.
 *
 * @param url   article page URL
 * @param proxy proxy passed to {@code downloadHtml}
 * @return the article text, or {@code null} when the page is blank, has no
 *         {@code articleInfo}, has no matching script, no script engine is available,
 *         or any step fails
 */
public static String getContent(String url, Proxy proxy) {
    try {
        String htmlBody = downloadHtml(url, proxy, null);
        if (StringUtils.isBlank(htmlBody) || !htmlBody.contains("articleInfo")) {
            return null;
        }
        // Cut the relevant script element out of the page.
        String regex = "<script>var BASE_DATA[\\s\\S]+?</script>";
        Matcher matcher = Pattern.compile(regex).matcher(htmlBody);
        if (!matcher.find()) {
            return null;
        }
        if (scriptEngine == null) {
            // getEngineByName returns null when no JavaScript engine is installed; report
            // that explicitly rather than as an anonymous NullPointerException.
            logger.error("跟据链接采集全文出现错误",
                    new IllegalStateException("no ScriptEngine named \"javascript\" is available"));
            return null;
        }
        String content = matcher.group().replaceAll("<script>var BASE_DATA = |;</script>", "");
        // SECURITY NOTE(review): this evaluates script text taken from a remote page.
        // The input is not under our control; consider a JSON-only extraction or a
        // sandboxed engine before relying on this in a sensitive environment.
        String jsContent = "eval((" + content + ")).articleInfo.content.toString();";
        String contentHtml = scriptEngine.eval(jsContent).toString();
        // Strip the markup and keep only the text.
        return Jsoup.parse(contentHtml).text();
    } catch (Exception e) {
        logger.error("跟据链接采集全文出现错误", e);
        return null;
    }
}
/**
 * Downloads the body of a URL as text, making up to three attempts.
 *
 * @param url     address to request
 * @param proxy   proxy to use; when {@code null}, {@code ProxyHolder.NAT_HEAVY_PROXY} is used
 * @param headMap request headers handed to {@code RequestUtils.wrapGet}
 * @return the response body, or {@code null} when all three attempts fail
 */
private static String downloadHtml(String url, Proxy proxy, Map<String, String> headMap) {
    for (int attempt = 1; attempt <= 3; attempt++) {
        // try-with-resources releases the response on every path, including a failed body read.
        try (Response response = proxy != null
                ? httpBoot.syncCall(RequestUtils.wrapGet(url, headMap), proxy)
                : httpBoot.syncCall(RequestUtils.wrapGet(url, headMap), ProxyHolder.NAT_HEAVY_PROXY)) {
            return response.body().string();
        } catch (Exception e) {
            // Pass the exception itself so the original stack trace is logged;
            // fillInStackTrace() would have overwritten it with this catch site.
            logger.error("获取数据时出现问题,问题为:{}", e.toString(), e);
        }
    }
    return null;
}
}
/**
* @Title: TouTiaoParse.java
* @Package com.zhiwei.toutiao.parse
* @Description:
* @author hero
* @date 2016年9月2日 上午11:17:44
* @version V1.0
*/
/**
*
*/
package com.zhiwei.toutiao.parse;
import java.io.IOException;
import java.net.Proxy;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.toutiao.bean.Signature;
import com.zhiwei.toutiao.bean.TouTiaoArticle;
import com.zhiwei.toutiao.util.Tools;
import okhttp3.Response;
/**
* @Description:头条帐号采集
* @author hero
* @date 2016年9月2日 上午11:17:44
*/
public class TouTiaoArticleParse {
// Script engine looked up by the name "javascript".
// NOTE(review): getEngineByName returns null when no such engine is installed - confirm the target JDK ships one.
private static ScriptEngine scriptEngine = new ScriptEngineManager().getEngineByName("javascript");
// Logger for this class.
private static Logger logger = LogManager.getLogger(TouTiaoArticleParse.class);
// Shared HTTP client, built with retryTimes(3).
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/**
 * Requests one page of an account's article list from the {@code /pgc/ma/} endpoint
 * and parses the response.
 *
 * @param mediaId      value sent as the {@code media_id} query parameter
 * @param maxBehotTime paging cursor sent as {@code max_behot_time}; omitted when {@code null}
 * @param endData      cut-off date handed to the parser; may be {@code null}
 * @param proxy        proxy passed to {@code downloadHtml}
 * @return the parsed result map, or an empty map when the body is missing, does not
 *         contain {@code behot_time}, or parsing produced nothing
 * @throws Exception any exception raised inside the download/parse step, rethrown after logging
 */
public static Map<String, Object> getTouTiaoList(String mediaId, String maxBehotTime, Date endData, Proxy proxy)
        throws Exception {
    Signature signature = new Signature();
    String url = "https://www.toutiao.com/pgc/ma/?page_type=1&media_id=" + mediaId + "&count=20&as="
            + signature.getAs() + "&cp=" + signature.getCp();
    if (maxBehotTime != null) {
        url = url + "&max_behot_time=" + maxBehotTime;
    }
    Map<String, String> headerMap = Tools.getTouTiaoHeader();
    headerMap.put("Referer", url);
    try {
        String htmlBody = downloadHtml(url, proxy, headerMap);
        if (htmlBody != null && htmlBody.contains("behot_time")) {
            Map<String, Object> ttList = parseHtmlByAccount(htmlBody, endData);
            if (ttList != null && !ttList.isEmpty()) {
                return ttList;
            }
        } else {
            logger.info("数据为null");
        }
    } catch (Exception e) {
        // Log the exception untouched: fillInStackTrace() would replace the original
        // trace with this catch site and hide where the failure really happened.
        logger.error("获取今日头条帐号数据连接超时", e);
        throw e;
    }
    return Collections.emptyMap();
}
/**
 * Requests one page of an account's article list from the {@code /pgc/ma/} endpoint
 * through the given {@link ProxyHolder} and parses the response.
 *
 * @param mediaId      value sent as the {@code media_id} query parameter
 * @param maxBehotTime paging cursor sent as {@code max_behot_time}; omitted when {@code null}
 * @param endData      cut-off date handed to the parser; may be {@code null}
 * @param proxy        proxy holder passed to the HTTP client
 * @return the parsed result map, or an empty map when the body does not contain
 *         {@code behot_time} or parsing produced nothing
 * @throws Exception any exception raised by the request or the parse step, rethrown after logging
 */
public static Map<String, Object> getTouTiaoList(String mediaId, String maxBehotTime, Date endData, ProxyHolder proxy)
        throws Exception {
    Signature signature = new Signature();
    String url = "https://www.toutiao.com/pgc/ma/?page_type=1&media_id=" + mediaId + "&count=20&as="
            + signature.getAs() + "&cp=" + signature.getCp();
    if (maxBehotTime != null) {
        url = url + "&max_behot_time=" + maxBehotTime;
    }
    Map<String, String> headerMap = Tools.getTouTiaoHeader();
    headerMap.put("Referer", url);
    // try-with-resources guarantees the response is released even if reading the body fails.
    try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy)) {
        String htmlBody = response.body().string();
        if (htmlBody != null && htmlBody.contains("behot_time")) {
            Map<String, Object> ttList = parseHtmlByAccount(htmlBody, endData);
            if (ttList != null && !ttList.isEmpty()) {
                return ttList;
            }
        } else {
            logger.info("数据为null");
        }
    } catch (Exception e) {
        // Log the exception untouched: fillInStackTrace() would replace the original
        // trace with this catch site and hide where the failure really happened.
        logger.error("获取今日头条帐号数据连接超时", e);
        throw e;
    }
    return Collections.emptyMap();
}
/**
 * Requests one page of a user's article history from the {@code /c/user/article/}
 * endpoint, making up to three attempts.
 *
 * <p>Another attempt is made when the body is missing or lacks {@code behot_time},
 * and also when parsing yields nothing.
 *
 * @param userId       value sent as {@code user_id} and used in the referer header
 * @param maxBehotTime paging cursor sent as {@code max_behot_time}
 * @param endData      cut-off date handed to the parser; may be {@code null}
 * @param proxy        proxy passed to {@code downloadHtml}
 * @return the parsed result map, or an empty map when all three attempts produced nothing
 * @throws Exception any exception raised inside the download/parse step, rethrown after logging
 */
public static Map<String, Object> getTouTiaoHistory(String userId, String maxBehotTime, Date endData,
        Proxy proxy) throws Exception {
    for (int i = 0; i < 3; i++) {
        // A fresh signature is computed for every attempt.
        Signature signature = new Signature(userId, maxBehotTime);
        String as = signature.getAs();
        String cp = signature.getCp();
        String signatureStr = signature.getSignature();
        String url = "https://www.toutiao.com/c/user/article/?page_type=1&user_id=" + userId + "&max_behot_time="
                + maxBehotTime + "&count=20&as=" + as + "&cp=" + cp + "&_signature=" + signatureStr;
        Map<String, String> headerMap = new HashMap<>();
        headerMap.put("user-agent",
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36");
        headerMap.put("referer", "https://www.toutiao.com/c/user/" + userId + "/");
        try {
            String htmlBody = downloadHtml(url, proxy, headerMap);
            if (htmlBody != null && htmlBody.contains("behot_time")) {
                Map<String, Object> ttList = parseHtmlByAccount(userId, htmlBody, endData);
                if (ttList != null && !ttList.isEmpty()) {
                    return ttList;
                }
                // Nothing parsed: fall through to the next attempt.
            } else {
                logger.info("数据为null");
            }
        } catch (Exception e) {
            // Log the exception untouched: fillInStackTrace() would replace the original
            // trace with this catch site and hide where the failure really happened.
            logger.error("获取今日头条帐号数据连接超时", e);
            throw e;
        }
    }
    return Collections.emptyMap();
}
/**
 * Requests one page of a user's article history from the {@code /c/user/article/}
 * endpoint through the given {@link ProxyHolder}.
 *
 * <p>Up to three attempts are made while the body is missing or lacks
 * {@code behot_time}. Once a usable body arrives, the parse result is returned if it
 * is non-empty; otherwise the method stops retrying and returns an empty map.
 *
 * @param userId       value sent as {@code user_id} and used in the referer header
 * @param maxBehotTime paging cursor sent as {@code max_behot_time}
 * @param endData      cut-off date handed to the parser; may be {@code null}
 * @param proxy        proxy holder passed to the HTTP client
 * @return the parsed result map, or an empty map
 * @throws Exception any exception raised by the request or the parse step, rethrown after logging
 */
public static Map<String, Object> getTouTiaoHistory(String userId, String maxBehotTime, Date endData,
        ProxyHolder proxy) throws Exception {
    int attempt = 0;
    while (attempt < 3) {
        attempt++;
        Signature sign = new Signature(userId, maxBehotTime);
        String requestUrl = "https://www.toutiao.com/c/user/article/?page_type=1&user_id=" + userId
                + "&max_behot_time=" + maxBehotTime + "&count=20&as=" + sign.getAs() + "&cp=" + sign.getCp()
                + "&_signature=" + sign.getSignature();
        logger.info("当前采集的历史文章链接:::{}", requestUrl);

        Map<String, String> headers = new HashMap<>();
        headers.put("user-agent",
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36");
        headers.put("referer", "https://www.toutiao.com/c/user/" + userId + "/");

        try {
            String body = httpBoot.syncCall(RequestUtils.wrapGet(requestUrl, headers), proxy).body().string();
            boolean usable = body != null && body.contains("behot_time");
            if (!usable) {
                logger.info("数据为null,获取到的文本为:::{}", body);
                continue;
            }
            Map<String, Object> parsed = parseHtmlByAccount(userId, body, endData);
            if (parsed == null || parsed.size() == 0) {
                // A usable body that parses to nothing ends the retry loop.
                break;
            }
            return parsed;
        } catch (Exception e) {
            logger.error("获取今日头条帐号数据连接超时", e);
            throw e;
        }
    }
    return Collections.emptyMap();
}
/**
 * Parses an article-list JSON response into {@link TouTiaoArticle} objects.
 *
 * <p>Only entries of the {@code data} array that carry a {@code group_id} are
 * converted; an entry that fails to convert is logged and skipped. The author id is
 * taken from each entry's {@code creator_uid}.
 *
 * @param htmlBody JSON text of the response
 * @param endDate  cut-off date; when non-null and later than the next-page cursor
 *                 (read as epoch seconds), the returned cursor is set to {@code null}
 * @return a map with {@code "max_behot_time"} (a {@code Long} or {@code null}) and
 *         {@code "data"} (the article list), or {@code null} when the response as a
 *         whole cannot be parsed
 */
private static Map<String, Object> parseHtmlByAccount(String htmlBody, Date endDate) {
    Map<String, Object> map = new HashMap<>();
    Long maxBehotTime = null;
    List<TouTiaoArticle> dataList = new ArrayList<>();
    try {
        JSONObject json = JSONObject.parseObject(htmlBody);
        JSONArray jsonArray = json.getJSONArray("data");
        maxBehotTime = Long.valueOf(json.getJSONObject("next").getString("max_behot_time"));
        // This response is never read for a like count; the field is passed as null.
        String likeNum = null;
        for (int i = 0; i < jsonArray.size(); i++) {
            try {
                JSONObject data = jsonArray.getJSONObject(i);
                if (!data.containsKey("group_id")) {
                    continue;
                }
                String href = "https://www.toutiao.com/" + "a" + data.getLongValue("group_id");
                String title = data.getString("title");
                String content = data.getString("abstract");
                // behot_time is multiplied by 1000 before being handed to the date parser.
                String time = data.getLongValue("behot_time") * 1000 + "";
                Date date = TimeParse.stringFormartDate(time);
                String readNum = data.getString("go_detail_count");
                String commentNum = data.getString("comments_count");
                String playNum = data.getString("detail_play_effective_count");
                String shareNum = data.getString("share_count");
                String source = data.getString("source");
                String userId = data.getLong("creator_uid") + "";
                String articleType = data.getString("chinese_tag");
                TouTiaoArticle tt = new TouTiaoArticle(href, title, userId, source, date, content, commentNum,
                        playNum, readNum, shareNum, "今日头条", articleType, likeNum);
                if (data.containsKey("label")) {
                    tt.setLabelList(data.getJSONArray("label").toJavaList(String.class));
                }
                dataList.add(tt);
            } catch (Exception e) {
                // Skip the broken entry but keep the stack trace in the log.
                logger.error("数据解析出现问题,{}", e.getMessage(), e);
            }
        }
    } catch (Exception e) {
        logger.error("数据解析出现问题,{}", e.getMessage(), e);
        return null;
    }
    // The former `!"0".equals(maxBehotTime)` test compared a String with a Long and was
    // always true, so it is dropped; a cursor of 0 still maps to the epoch and is
    // therefore cleared whenever an end date is supplied, exactly as before.
    if (endDate != null && maxBehotTime != null) {
        Date nextDate = new Date(maxBehotTime * 1000L);
        if (endDate.after(nextDate)) {
            maxBehotTime = null;
        }
    }
    map.put("max_behot_time", maxBehotTime);
    map.put("data", dataList);
    return map;
}
/**
 * Parses an article-list JSON response into {@link TouTiaoArticle} objects, stamping
 * every article with the supplied user id.
 *
 * <p>Only entries of the {@code data} array that carry a {@code group_id} are
 * converted; an entry that fails to convert is logged and skipped.
 *
 * @param userId   user id stored on every produced article
 * @param htmlBody JSON text of the response
 * @param endDate  cut-off date; when non-null and later than the next-page cursor
 *                 (read as epoch seconds), the returned cursor is set to {@code null}
 * @return a map with {@code "max_behot_time"} (a {@code Long} or {@code null}) and
 *         {@code "data"} (the article list), or {@code null} when the response as a
 *         whole cannot be parsed
 */
private static Map<String, Object> parseHtmlByAccount(String userId, String htmlBody, Date endDate) {
    Map<String, Object> map = new HashMap<>();
    Long maxBehotTime = null;
    List<TouTiaoArticle> dataList = new ArrayList<>();
    try {
        JSONObject json = JSONObject.parseObject(htmlBody);
        JSONArray jsonArray = json.getJSONArray("data");
        maxBehotTime = Long.valueOf(json.getJSONObject("next").getString("max_behot_time"));
        // This response is never read for a like count; the field is passed as null.
        String likeNum = null;
        for (int i = 0; i < jsonArray.size(); i++) {
            try {
                JSONObject data = jsonArray.getJSONObject(i);
                if (!data.containsKey("group_id")) {
                    continue;
                }
                String href = "https://www.toutiao.com/" + "a" + data.getLongValue("group_id");
                String title = data.getString("title");
                String content = data.getString("abstract");
                // behot_time is multiplied by 1000 before being handed to the date parser.
                String time = data.getLongValue("behot_time") * 1000 + "";
                Date date = TimeParse.stringFormartDate(time);
                String readNum = data.getString("go_detail_count");
                String commentNum = data.getString("comments_count");
                String playNum = data.getString("detail_play_effective_count");
                String shareNum = data.getString("share_count");
                String source = data.getString("source");
                String articleType = data.getString("chinese_tag");
                TouTiaoArticle tt = new TouTiaoArticle(href, title, userId, source, date, content, commentNum,
                        playNum, readNum, shareNum, "今日头条", articleType, likeNum);
                if (data.containsKey("label")) {
                    tt.setLabelList(data.getJSONArray("label").toJavaList(String.class));
                }
                dataList.add(tt);
            } catch (Exception e) {
                // Skip the broken entry but keep the stack trace in the log.
                logger.error("数据解析出现问题,{}", e.getMessage(), e);
            }
        }
    } catch (Exception e) {
        logger.error("数据解析出现问题,{}", e.getMessage(), e);
        return null;
    }
    // The former `!"0".equals(maxBehotTime)` test compared a String with a Long and was
    // always true, so it is dropped; a cursor of 0 still maps to the epoch and is
    // therefore cleared whenever an end date is supplied, exactly as before.
    if (endDate != null && maxBehotTime != null) {
        Date nextDate = new Date(maxBehotTime * 1000L);
        if (endDate.after(nextDate)) {
            maxBehotTime = null;
        }
    }
    map.put("max_behot_time", maxBehotTime);
    map.put("data", dataList);
    return map;
}
/**
 * Requests one page of a user's micro-toutiao feed from the {@code /api/pc/feed/}
 * endpoint and parses it.
 *
 * @param userId       value sent as {@code visit_user_id} and used in the referer header
 * @param endDate      cut-off date handed to the parser
 * @param proxy        proxy passed to {@code downloadHtml}
 * @param maxBehotTime paging cursor sent as {@code max_behot_time}; omitted when {@code null}
 * @return the parsed result map, or {@code null} when nothing was downloaded, nothing
 *         was parsed, or an exception occurred
 * @throws IOException declared for callers; exceptions raised in the body are caught and logged
 */
public static Map<String, Object> getMicroTouTiaoCrawler(String userId, Date endDate, Proxy proxy,
        String maxBehotTime) throws IOException {
    String url = "https://www.toutiao.com/api/pc/feed/?category=pc_profile_ugc&utm_source=toutiao&visit_user_id=" + userId;
    if (maxBehotTime != null) {
        url = url + "&max_behot_time=" + maxBehotTime;
    }
    // Report the request URL through the logger rather than stdout.
    logger.info("微头条采集链接:::{}", url);
    Map<String, String> headerMap = Tools.getTouTiaoHeader();
    headerMap.put("Referer", "https://www.toutiao.com/c/user/" + userId + "/");
    try {
        String htmlBody = downloadHtml(url, proxy, headerMap);
        if (htmlBody != null) {
            Map<String, Object> dataMap = parseHtmlByMicroAccount(htmlBody, endDate);
            if (dataMap != null && !dataMap.isEmpty()) {
                return dataMap;
            }
        } else {
            logger.info("数据为null");
        }
    } catch (Exception e) {
        logger.info("获取数据出错::{},数据为null", e);
        return null;
    }
    return null;
}
/**
 * Fetches one page of a user's micro-headline feed from the PC feed API
 * through a {@code ProxyHolder} and parses it when the body contains
 * {@code create_time}.
 *
 * @param userId       id of the user whose feed is requested
 * @param endDate      forwarded to {@code parseHtmlByMicroAccount}
 * @param proxy        proxy holder passed to {@code httpBoot.syncCall}
 * @param maxBehotTime paging cursor appended as {@code max_behot_time};
 *                     left out of the URL when null
 * @return the non-empty map produced by {@code parseHtmlByMicroAccount}, or
 *         null when the body is unusable, the map is null/empty, or an
 *         error occurred
 * @throws IOException declared for callers; errors raised inside the method
 *                     are caught and logged
 */
public static Map<String, Object> getMicroTouTiaoCrawler(String userId, Date endDate, ProxyHolder proxy,
        Long maxBehotTime) throws IOException {
    String url = "https://www.toutiao.com/api/pc/feed/?category=pc_profile_ugc&utm_source=toutiao&visit_user_id=" + userId;
    if (maxBehotTime != null) {
        url = url + "&max_behot_time=" + maxBehotTime;
    }
    logger.info("微头条采集链接:::{}", url);
    Map<String, String> headerMap = Tools.getTouTiaoHeader();
    headerMap.put("Referer", "https://www.toutiao.com/c/user/" + userId + "/");
    // try-with-resources releases the response even when reading the body fails.
    try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy)) {
        String htmlBody = response.body().string();
        if (htmlBody != null && htmlBody.contains("create_time")) {
            Map<String, Object> dataMap = parseHtmlByMicroAccount(htmlBody, endDate);
            if (dataMap != null && dataMap.size() > 0) {
                return dataMap;
            }
        } else {
            logger.info("数据为null");
        }
    } catch (Exception e) {
        // Trailing throwable keeps the stack trace in the log output.
        logger.info("获取数据出错::{},数据为null", e.getMessage(), e);
    }
    return null;
}
/**
 * Collects a user's micro-headline posts from the client profile feed API,
 * requesting page after page until the {@code offset} cursor returned by the
 * server stops changing or a request fails.
 *
 * @param userId       id of the visited user ({@code visited_uid})
 * @param proxy        proxy holder used for every request
 * @param maxBehotTime starting value of the {@code offset} cursor
 * @return one map per post that carries a {@code comment_base}, with keys
 *         time, href, source, content, readNum, commentNum and user_id, plus
 *         title and replayUrl when an {@code origin_group} is present;
 *         never null
 */
public static List<Map<String,Object>> getClientMicroToutiaoCrawler(String userId, ProxyHolder proxy,
        Long maxBehotTime) {
    List<Map<String,Object>> dataList = new ArrayList<>();
    while (true) {
        String url = "https://i.snssdk.com/api/feed/profile/v1/?visited_uid="+userId+"&offset="+ maxBehotTime;
        String previousOffset = String.valueOf(maxBehotTime);
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)) {
            String result = response.body().string();
            JSONObject json = JSONObject.parseObject(result);
            maxBehotTime = json.getLongValue("offset");
            JSONArray jsonArray = json.getJSONArray("data");
            for (int i = 0; i < jsonArray.size(); i++) {
                JSONObject data = jsonArray.getJSONObject(i);
                try {
                    JSONObject dataJSON = data.getJSONObject("content").getJSONObject("raw_data");
                    JSONObject commentBase = dataJSON.getJSONObject("comment_base");
                    if (commentBase == null) {
                        continue;
                    }
                    Map<String,Object> map = new HashMap<>();
                    JSONObject userInfo = commentBase.getJSONObject("user").getJSONObject("info");
                    JSONObject action = commentBase.getJSONObject("action");
                    // Kept in a local: overwriting the userId parameter would
                    // change the visited_uid of every following request.
                    String postUserId = userInfo.getString("user_id");
                    JSONObject originGroup = dataJSON.getJSONObject("origin_group");
                    if (originGroup != null) {
                        map.put("title", originGroup.getString("title"));
                        map.put("replayUrl", originGroup.getString("article_url"));
                    }
                    map.put("time", new Date(commentBase.getLongValue("create_time") * 1000));
                    map.put("href", "http://weitoutiao.zjurl.cn/ugc/share/wap/comment/" + dataJSON.getLongValue("id"));
                    map.put("source", userInfo.getString("name"));
                    map.put("content", commentBase.getString("content"));
                    map.put("readNum", action.getInteger("read_count") + "");
                    map.put("commentNum", action.getInteger("comment_count") + "");
                    map.put("user_id", postUserId);
                    dataList.add(map);
                } catch (Exception e) {
                    // A malformed entry is skipped; the rest of the page is still used.
                    logger.error("数据解析出现问题,{}", e.getMessage(), e);
                }
            }
            logger.info(" 采集到 条 == {} -- {} -- {}", dataList.size(), previousOffset, maxBehotTime);
            if (previousOffset.equals(String.valueOf(maxBehotTime))) {
                break;
            }
        } catch (Exception e) {
            logger.info("客户端微头条采集错误 {}", e.getMessage(), e);
            // The cursor is usually unchanged here, so looping again would
            // repeat the same failing request forever; return what we have.
            break;
        }
    }
    return dataList;
}
/**
 * Parses a micro-headline feed response into articles and decides whether a
 * further page should be requested.
 *
 * @author hero
 * @param htmlBody JSON text returned by the feed API
 * @param endDate  once the last parsed post is older than this date the
 *                 cursor is cleared; ignored when null
 * @return a map with "data" (list of {@code TouTiaoArticle}, possibly empty)
 *         and "max_behot_time" (next cursor, or null when paging should stop)
 */
private static Map<String, Object> parseHtmlByMicroAccount(String htmlBody, Date endDate) {
    // Titles are the leading characters of the post content.
    final int maxTitleLength = 16;
    Map<String, Object> map = new HashMap<>();
    Long maxBehotTime = null;
    List<TouTiaoArticle> dataList = new ArrayList<>();
    try {
        JSONObject json = JSONObject.parseObject(htmlBody);
        // getBooleanValue yields false for a missing or null flag instead of
        // failing on unboxing.
        boolean more = json.getBooleanValue("has_more");
        if (json.containsKey("next")) {
            maxBehotTime = json.getJSONObject("next").getLongValue("max_behot_time");
        }
        // Creation time of the most recently read post; drives the paging check.
        Date date = null;
        if (json.containsKey("data")) {
            JSONArray jsonArray = json.getJSONArray("data");
            // Never populated by this feed; kept as typed nulls for the constructor.
            String playNum = null;
            String articleType = null;
            for (int i = 0; i < jsonArray.size(); i++) {
                try {
                    JSONObject data = jsonArray.getJSONObject(i);
                    String text = null;
                    if (data.containsKey("stream_cell") && data.getJSONObject("stream_cell") != null) {
                        text = data.getJSONObject("stream_cell").getString("raw_data");
                    } else if (data.containsKey("concern_talk_cell")) {
                        text = data.getJSONObject("concern_talk_cell").getString("packed_json_str");
                    }
                    JSONObject dataJSON = JSONObject.parseObject(text);
                    if (dataJSON == null) {
                        // Neither known cell type carried a payload.
                        continue;
                    }
                    String href;
                    String source;
                    String content;
                    String readNum;
                    String commentNum;
                    String likeNum;
                    String userId;
                    if (dataJSON.containsKey("comment_base") && dataJSON.getJSONObject("comment_base") != null) {
                        JSONObject commentBase = dataJSON.getJSONObject("comment_base");
                        JSONObject userInfo = commentBase.getJSONObject("user").getJSONObject("info");
                        JSONObject action = dataJSON.getJSONObject("action");
                        date = new Date(commentBase.getLongValue("create_time") * 1000);
                        href = "https://www.toutiao.com/a" + dataJSON.getLongValue("id");
                        source = userInfo.getString("name");
                        content = dataJSON.getString("content");
                        readNum = action.getInteger("read_count") + "";
                        likeNum = action.getInteger("digg_count") + "";
                        commentNum = action.getInteger("comment_count") + "";
                        userId = userInfo.getString("user_id");
                    } else {
                        JSONObject user = dataJSON.getJSONObject("user");
                        date = new Date(dataJSON.getLongValue("create_time") * 1000);
                        href = "https://www.toutiao.com/a" + dataJSON.getString("thread_id");
                        source = user.getString("name");
                        content = dataJSON.getString("content");
                        readNum = dataJSON.getInteger("read_count") + "";
                        commentNum = dataJSON.getInteger("comment_count") + "";
                        likeNum = dataJSON.getInteger("digg_count") + "";
                        userId = user.getString("user_id");
                    }
                    // Computed per post so that a short or empty post cannot
                    // affect the title of any later post.
                    String title = null;
                    if (content != null && !"".equals(content)) {
                        title = content.substring(0, Math.min(maxTitleLength, content.length()));
                    }
                    TouTiaoArticle tt = new TouTiaoArticle(href, title, userId, source, date, content, commentNum,
                            playNum, readNum, "0", "微头条", articleType, likeNum);
                    dataList.add(tt);
                } catch (Exception e) {
                    // Skip the malformed entry but leave a trace of it.
                    logger.error("数据解析出现问题,{}", e.getMessage(), e);
                }
            }
        } else {
            logger.info("{}", json);
        }
        // Decide whether a next page exists.
        if (more) {
            if (maxBehotTime != null && maxBehotTime != 0
                    && endDate != null && date != null && endDate.after(date)) {
                maxBehotTime = null;
            }
        } else {
            maxBehotTime = null;
        }
    } catch (Exception e) {
        logger.error("数据解析出现问题,{}", e.getMessage(), e);
    }
    map.put("max_behot_time", maxBehotTime);
    map.put("data", dataList);
    return map;
}
/**
 * Downloads an article page and returns the plain text of its body.
 *
 * The page's {@code BASE_DATA} script is cut out with a regular expression,
 * evaluated with the shared script engine to read
 * {@code articleInfo.content}, and the resulting HTML is reduced to text.
 *
 * NOTE(review): the evaluated script comes straight from the downloaded
 * page, i.e. remote content is executed by the script engine.
 *
 * @param url   article URL
 * @param proxy proxy forwarded to {@code downloadHtml}
 * @return the article text, or null when the page is blank, lacks
 *         {@code articleInfo}, has no matching script, or an error occurs
 */
public static String getContent(String url,Proxy proxy) {
    try {
        String page = downloadHtml(url, proxy, null);
        if (!StringUtils.isNotBlank(page) || !page.contains("articleInfo")) {
            return null;
        }
        // Cut the script element holding BASE_DATA out of the page.
        String scriptPattern = "<script>var BASE_DATA[\\s\\S]+?</script>";
        Matcher scriptMatcher = Pattern.compile(scriptPattern).matcher(page);
        if (!scriptMatcher.find()) {
            return null;
        }
        String baseData = scriptMatcher.group().replaceAll("<script>var BASE_DATA = |;</script>", "");
        // Let the script engine resolve the object literal and read the content field.
        String expression = "eval(("+ baseData +")).articleInfo.content.toString();";
        Object evaluated = scriptEngine.eval(expression);
        String articleHtml = evaluated.toString();
        // Strip the markup from the extracted HTML.
        return Jsoup.parse(articleHtml).text();
    } catch (Exception e) {
        logger.error("跟据链接采集全文出现错误", e);
        return null;
    }
}
/**
 * Downloads an article page and extracts the value that follows
 * {@code itemId: '} in it.
 *
 * @param url   article URL
 * @param proxy proxy forwarded to {@code downloadHtml}
 * @return the trimmed item id, or null when the page could not be downloaded
 *         or does not mention {@code itemId}
 * @throws Exception when the page mentions {@code itemId} but not in the
 *                   expected {@code itemId: '...'} form
 */
private static String getItemIdByUrl(String url,Proxy proxy) throws Exception
{
    String page = downloadHtml(url, proxy, Tools.getTouTiaoHeader());
    if (page == null) {
        logger.info("获取itemId失败,链接地址为:{}",url);
        return null;
    }
    if (!page.contains("itemId")) {
        return null;
    }
    // Text after the marker, cut at the closing quote-comma pair.
    String afterMarker = page.split("itemId: '")[1];
    return afterMarker.split("',")[0].trim();
}
/**
 * Resolves an article URL to its item id and loads the article's details
 * from the mobile info endpoint, trying up to three times.
 *
 * @param url   article URL; stored unchanged as the URL of the result
 * @param proxy proxy forwarded to {@code downloadHtml}
 * @return the populated article (title, user id, source, time, text content,
 *         comment, read and play counts), or null when the item id cannot
 *         be found or all attempts fail
 * @throws Exception propagated from {@code getItemIdByUrl}
 */
public static TouTiaoArticle getToutiaoArticleInfoByUrl(String url, Proxy proxy) throws Exception
{
    String itemId = getItemIdByUrl(url, proxy);
    // Plain null check: avoids depending on java.util.Objects in this class.
    if (itemId == null) {
        return null;
    }
    String urlNew = "https://m.toutiao.com/i" + itemId + "/info/?_signature=&i="+ itemId;
    for (int i = 0; i < 3; i++) {
        try {
            // Mobile endpoint: send a mobile user agent and matching referer.
            Map<String,String> headerMap = Tools.getTouTiaoHeader();
            headerMap.put("Referer","https://m.toutiao.com/i" + itemId + "/");
            headerMap.put("User-Agent", "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Mobile Safari/537.36");
            String htmlBody = downloadHtml(urlNew, proxy, headerMap);
            if (htmlBody == null) {
                continue;
            }
            JSONObject data = JSONObject.parseObject(htmlBody).getJSONObject("data");
            String commentNum = data.getInteger("comment_count").toString();
            String readNum = data.getInteger("impression_count").toString();
            String playCount = data.getInteger("video_play_count").toString();
            String userId = data.getJSONObject("media_user").getLong("id").toString();
            String source = data.getString("source");
            String title = data.getString("title");
            String content = data.getString("content");
            if (StringUtils.isNotBlank(content)) {
                // The endpoint returns HTML; keep only the text.
                content = Jsoup.parse(content).text();
            }
            Date time = new Date(data.getLong("publish_time")*1000);
            TouTiaoArticle touTiaoArticle = new TouTiaoArticle();
            touTiaoArticle.setUrl(url);
            touTiaoArticle.setTitle(title);
            touTiaoArticle.setUser_id(userId);
            touTiaoArticle.setSource(source);
            touTiaoArticle.setTime(time);
            touTiaoArticle.setContent(content);
            touTiaoArticle.setCommentCount(commentNum);
            touTiaoArticle.setReadNum(readNum);
            touTiaoArticle.setPlayCount(playCount);
            return touTiaoArticle;
        } catch (Exception e) {
            // A missing field or malformed body ends this attempt; try again.
            logger.info("解析文章信息时出现问题,链接地址为:{}", url, e);
        }
    }
    return null;
}
/**
 * Downloads a URL and returns the response body as text, making up to three
 * attempts.
 *
 * @param url     address to request
 * @param proxy   proxy to use; when null {@code ProxyHolder.NAT_HEAVY_PROXY}
 *                is used instead
 * @param headMap request headers handed to {@code RequestUtils.wrapGet}
 * @return the body text, or null when every attempt failed
 */
private static String downloadHtml(String url, Proxy proxy, Map<String,String> headMap) {
    for (int attempt = 1; attempt <= 3; attempt++) {
        // try-with-resources releases the connection on every path.
        try (Response response = proxy != null
                ? httpBoot.syncCall(RequestUtils.wrapGet(url, headMap), proxy)
                : httpBoot.syncCall(RequestUtils.wrapGet(url, headMap), ProxyHolder.NAT_HEAVY_PROXY)) {
            return response.body().string();
        } catch (Exception e) {
            // Pass the exception itself so its original stack trace is logged.
            logger.error("获取数据时出现问题,问题为:{}", e.getMessage(), e);
        }
    }
    return null;
}
}
package com.zhiwei.toutiao.parse;
import java.io.IOException;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.tools.tools.ZhiWeiTools;
import com.zhiwei.toutiao.bean.TouTiaoComment;
import com.zhiwei.toutiao.util.Tools;
import okhttp3.Response;
/**
 * @ClassName: TouTiaoCommentParse
 * @Description: Collects and parses Toutiao (今日头条) comment data
 * @author hero
 * @date 2016-12-09 19:50:28
 */
public class TouTiaoCommentParse {
private static Logger logger = LogManager.getLogger(TouTiaoCommentParse.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/**
 * Collects the comments of an article page by page (20 per page).
 *
 * @author hero
 * @param url         article URL; also stored on every returned comment
 * @param returnCount when positive, limits the number of pages requested to
 *                    {@code ceil(returnCount / 20)}
 * @param proxy       proxy forwarded to the download helper; also selects the
 *                    pause between pages (100 ms with a proxy, 4000 ms without)
 * @return the collected comments; empty when the group id cannot be resolved
 *         or no page could be read
 * @throws Exception propagated from resolving the group id or page count
 */
public static List<TouTiaoComment> getTouTiaoComment(String url,int returnCount,Proxy proxy) throws Exception
{
    List<TouTiaoComment> ttList = new ArrayList<TouTiaoComment>();
    String group_id = getGroupId(url, proxy);
    if (group_id == null) {
        return ttList;
    }
    // Total number of comment pages, capped by the requested amount.
    int page = getPage(group_id, proxy);
    if (returnCount > 0) {
        int pageMax = (int) Math.ceil((double) returnCount / 20.0);
        if (page >= pageMax) {
            page = pageMax;
        }
    }
    for (int i = 0; i < page; i++) {
        String urlNew = "http://is.snssdk.com/article/v2/tab_comments/?app_name=news_article&offset="
                +i*20+"&group_id="+group_id+"&aggr_type=1&count=20&fold=1&item_id="+group_id+"&ts="+System.currentTimeMillis();
        // Request headers imitating the mobile client.
        Map<String,String> headerMap = Tools.getTouTiaoHeader();
        headerMap.put("User-Agent", "News 6.6.5 rv:6.6.5.03 (iPhone; iOS 11.3; zh_CN) Cronet");
        headerMap.put("Host", "is.snssdk.com");
        for (int j = 1; j <= 3; j++) {
            try {
                String htmlBody = downloadHtml(urlNew, proxy, headerMap);
                if (htmlBody != null) {
                    List<TouTiaoComment> commentes = analySisComment(htmlBody, url);
                    ttList.addAll(commentes);
                    // Report the page just read (1-based), not the page total.
                    logger.info(" url {} 采集到第 {} 页 采集到 {} 条数据 ",url,i + 1,ttList.size());
                } else {
                    logger.info("采集出现问题,地址为:{}", url);
                }
                if (Objects.nonNull(proxy)) {
                    ZhiWeiTools.sleep(100);
                } else {
                    ZhiWeiTools.sleep(4000);
                }
                break;
            } catch (Exception e) {
                // Log before retrying instead of dropping the failure silently.
                logger.error("采集出现问题,地址为:{}", url, e);
            }
        }
    }
    return ttList;
}
/**
 * Converts a comment-list response into {@code TouTiaoComment} objects.
 *
 * Reads the {@code data} array and, for each entry, the nested
 * {@code comment} object. Parsing stops at the first entry that fails; the
 * comments read up to that point are still returned.
 *
 * @param htmlBody JSON text of the comment-list response
 * @param url      article URL stored on every comment
 * @return the parsed comments, possibly empty; never null
 */
private static List<TouTiaoComment> analySisComment(String htmlBody,String url)
{
    List<TouTiaoComment> parsed = new ArrayList<>();
    try {
        JSONArray entries = JSONObject.parseObject(htmlBody).getJSONArray("data");
        int index = 0;
        while (index < entries.size()) {
            JSONObject item = entries.getJSONObject(index).getJSONObject("comment");
            String commentId = item.getString("id");
            String body = item.getString("text");
            String author = item.getString("user_name");
            int replies = item.getIntValue("reply_count");
            int diggs = item.getIntValue("digg_count");
            // create_time is in seconds.
            Date createdAt = new Date(item.getLongValue("create_time") * 1000);
            parsed.add(new TouTiaoComment(commentId, body, author, replies, diggs, createdAt, url));
            index++;
        }
    } catch (Exception e) {
        logger.debug("解析今日头条评论列表出现为题,{}",e);
    }
    return parsed;
}
/**
 * Determines how many 20-comment pages an article has.
 *
 * @param groupId group id of the article
 * @param proxy   proxy forwarded to the download helper
 * @return {@code ceil(total / 20)}, or -1 when the response is missing or
 *         cannot be parsed
 * @throws Exception declared for callers; parse errors are caught and logged
 */
private static int getPage(String groupId,Proxy proxy) throws Exception
{
    String listUrl = "http://www.toutiao.com/api/comment/list/?group_id="+ groupId +"&item_id=0&count=20&offset=0";
    String body = downloadHtml(listUrl, proxy, Tools.getTouTiaoHeader());
    if (body == null) {
        return -1;
    }
    try {
        int total = JSONObject.parseObject(body).getJSONObject("data").getIntValue("total");
        return (int) Math.ceil(total / 20.0);
    } catch (Exception e) {
        logger.info("获取评论总页数时出现问题:{}",e);
    }
    return -1;
}
/**
 * Reads the comment count embedded in an article page
 * ({@code comments_count: N,}), downloading the page up to three times until
 * it contains {@code commentInfo}.
 *
 * @author hero
 * @param url   article URL
 * @param proxy proxy forwarded to the download helper
 * @return the comment count, or 0 when the page never contains the marker or
 *         the number cannot be parsed
 */
public static int findCommentCount(String url,Proxy proxy)
{
    for (int i = 0; i < 3; i++) {
        try {
            Map<String,String> headerMap = Tools.getTouTiaoHeader();
            String htmlBody = downloadHtml(url, proxy, headerMap);
            if (htmlBody != null && htmlBody.contains("commentInfo")) {
                try {
                    // Trim so stray whitespace around the number does not break parsing.
                    return Integer.parseInt(htmlBody.split("comments_count: ")[1].split(",")[0].trim());
                } catch (Exception e) {
                    // Pass the exception itself so its original stack trace is logged.
                    logger.error("解析头条评论数错误:::{}", e.getMessage(), e);
                    return 0;
                }
            }
        } catch (Exception e) {
            logger.error("解析头条评论数错误:::{}", e.getMessage(), e);
        }
    }
    return 0;
}
/**
 * Reads the comment count embedded in an article page
 * ({@code comments_count: N,}) with a single download.
 *
 * @author hero
 * @param url   article URL
 * @param proxy proxy forwarded to the download helper
 * @return the comment count, or -1 when the page lacks {@code commentInfo},
 *         the number cannot be parsed, or an error occurs
 */
public static int findNewCommentCountByProxy(String url,Proxy proxy)
{
    try {
        Map<String,String> headerMap = Tools.getTouTiaoHeader();
        String htmlBody = downloadHtml(url, proxy, headerMap);
        if (htmlBody != null && htmlBody.contains("commentInfo")) {
            // Trim so stray whitespace around the number does not break parsing.
            return Integer.parseInt(htmlBody.split("comments_count: ")[1].split(",")[0].trim());
        }
    } catch (Exception e) {
        // Pass the exception itself so its original stack trace is logged.
        logger.error("解析头条评论数错误:::{}", e.getMessage(), e);
    }
    return -1;
}
/**
 * Looks up the total number of comments of an article through the comment
 * list API, trying up to three times.
 *
 * @param url   article URL from which the group id is derived
 * @param proxy proxy forwarded to the download helper
 * @return the {@code total} reported by the API, or -1 when the group id
 *         cannot be resolved or no attempt yields a parsable response
 */
public static int getCommentCount(String url,Proxy proxy)
{
    String group_id = getGroupId(url, proxy);
    if (group_id == null) {
        // Without a group id the request could only ask for "group_id=null".
        return -1;
    }
    String urlNew = "http://www.toutiao.com/api/comment/list/?group_id="+group_id+"&item_id=0&count=20&offset=0";
    for (int i = 0; i < 3; i++) {
        try {
            Map<String,String> headerMap = Tools.getTouTiaoHeader();
            String htmlBody = downloadHtml(urlNew, proxy, headerMap);
            if (htmlBody != null) {
                JSONObject json = JSONObject.parseObject(htmlBody);
                JSONObject data = json.getJSONObject("data");
                return data.getIntValue("total");
            }
        } catch (Exception e) {
            logger.info("获取评论总页数时出现问题:{}", e.getMessage(), e);
        }
    }
    return -1;
}
/**
 * Derives the group id used by the comment APIs from an article URL.
 *
 * URLs containing {@code /a} or {@code /group/} carry the id in the path;
 * URLs containing {@code /i} or {@code /item/} are resolved by downloading
 * the page.
 *
 * @param url   article URL
 * @param proxy proxy used when the page has to be downloaded
 * @return the group id, or null when the URL matches none of the known forms
 */
private static String getGroupId(String url,Proxy proxy)
{
    if (url.contains("/a")) {
        return url.split("/a")[1].replace("/", "");
    }
    if (url.contains("/group/")) {
        return url.split("/group/")[1].replace("/", "");
    }
    if (url.contains("/i") || url.contains("/item/")) {
        return gettGroupIdByUrl(url, proxy);
    }
    return null;
}
/**
 * Downloads an article page and extracts the value that follows
 * {@code groupId: '} in it.
 *
 * @param url   article URL
 * @param proxy proxy forwarded to the download helper
 * @return the trimmed group id, or null when the page could not be
 *         downloaded, does not mention {@code groupId}, or parsing failed
 */
private static String gettGroupIdByUrl(String url,Proxy proxy)
{
    String groupId = null;
    Map<String,String> headerMap = Tools.getTouTiaoHeader();
    try {
        String htmlBody = downloadHtml(url, proxy, headerMap);
        if (htmlBody == null) {
            logger.info("获取groupId失败,链接地址为:{}",url);
        } else if (htmlBody.contains("groupId")) {
            groupId = htmlBody.split("groupId: '")[1]
                    .split("',")[0].trim();
        }
    } catch (Exception e) {
        // The logger already records the stack trace; no extra stderr dump.
        logger.error("获取groupId失败,链接地址为:{}",url,e);
    }
    return groupId;
}
/**
 * Downloads a URL and returns the response body as text, making up to three
 * attempts.
 *
 * @param url       address to request
 * @param proxy     proxy to use; when null
 *                  {@code ProxyHolder.NAT_HEAVY_PROXY} is used instead
 * @param headerMap request headers handed to {@code RequestUtils.wrapGet}
 * @return the body text, or null when every attempt failed
 */
private static String downloadHtml(String url, Proxy proxy, Map<String,String> headerMap) {
    for (int attempt = 1; attempt <= 3; attempt++) {
        // try-with-resources releases the connection on every path.
        try (Response response = proxy != null
                ? httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy)
                : httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), ProxyHolder.NAT_HEAVY_PROXY)) {
            return response.body().string();
        } catch (Exception e) {
            // Pass the exception itself so its original stack trace is logged.
            logger.error("获取数据时出现问题,问题为:{}", e.getMessage(), e);
        }
    }
    return null;
}
}
package com.zhiwei.toutiao.parse;
import java.io.IOException;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import com.zhiwei.toutiao.bean.TouTiaoArticle;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.tools.tools.ZhiWeiTools;
import com.zhiwei.toutiao.bean.TouTiaoComment;
import com.zhiwei.toutiao.util.Tools;
import okhttp3.Response;
import org.jsoup.Jsoup;
/**
 * @ClassName: TouTiaoCommentParse
 * @Description: Collects and parses Toutiao (今日头条) comment data
 * @author hero
 * @date 2016-12-09 19:50:28
 */
public class TouTiaoCommentParse {
private static Logger logger = LogManager.getLogger(TouTiaoCommentParse.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/**
 * Collects the comments of an article page by page (20 per page).
 *
 * @author hero
 * @param url         article URL; also stored on every returned comment
 * @param returnCount when positive, limits the number of pages requested to
 *                    {@code ceil(returnCount / 20)}
 * @param proxy       proxy forwarded to the download helper; also selects the
 *                    pause between pages (100 ms with a proxy, 4000 ms without)
 * @return the collected comments; empty when the group id cannot be resolved
 *         or no page could be read
 * @throws Exception propagated from resolving the group id or page count
 */
public static List<TouTiaoComment> getTouTiaoComment(String url,int returnCount,Proxy proxy) throws Exception
{
    List<TouTiaoComment> ttList = new ArrayList<TouTiaoComment>();
    String group_id = getGroupId(url, proxy);
    if (group_id == null) {
        return ttList;
    }
    // Total number of comment pages, capped by the requested amount.
    int page = getPage(group_id, proxy);
    if (returnCount > 0) {
        int pageMax = (int) Math.ceil((double) returnCount / 20.0);
        if (page >= pageMax) {
            page = pageMax;
        }
    }
    for (int i = 0; i < page; i++) {
        String urlNew = "http://is.snssdk.com/article/v2/tab_comments/?app_name=news_article&offset="
                +i*20+"&group_id="+group_id+"&aggr_type=1&count=20&fold=1&item_id="+group_id+"&ts="+System.currentTimeMillis();
        // Request headers imitating the mobile client.
        Map<String,String> headerMap = Tools.getTouTiaoHeader();
        headerMap.put("User-Agent", "News 6.6.5 rv:6.6.5.03 (iPhone; iOS 11.3; zh_CN) Cronet");
        headerMap.put("Host", "is.snssdk.com");
        for (int j = 1; j <= 3; j++) {
            try {
                String htmlBody = downloadHtml(urlNew, proxy, headerMap);
                if (htmlBody != null) {
                    List<TouTiaoComment> commentes = analySisComment(htmlBody, url);
                    ttList.addAll(commentes);
                    // Report the page just read (1-based), not the page total.
                    logger.info(" url {} 采集到第 {} 页 采集到 {} 条数据 ",url,i + 1,ttList.size());
                } else {
                    logger.info("采集出现问题,地址为:{}", url);
                }
                if (Objects.nonNull(proxy)) {
                    ZhiWeiTools.sleep(100);
                } else {
                    ZhiWeiTools.sleep(4000);
                }
                break;
            } catch (Exception e) {
                // Log before retrying instead of dropping the failure silently.
                logger.error("采集出现问题,地址为:{}", url, e);
            }
        }
    }
    return ttList;
}
/**
 * Converts a comment-list response into {@code TouTiaoComment} objects.
 *
 * Reads the {@code data} array and, for each entry, the nested
 * {@code comment} object. Parsing stops at the first entry that fails; the
 * comments read up to that point are still returned.
 *
 * @param htmlBody JSON text of the comment-list response
 * @param url      article URL stored on every comment
 * @return the parsed comments, possibly empty; never null
 */
private static List<TouTiaoComment> analySisComment(String htmlBody,String url)
{
    List<TouTiaoComment> parsed = new ArrayList<>();
    try {
        JSONArray entries = JSONObject.parseObject(htmlBody).getJSONArray("data");
        int index = 0;
        while (index < entries.size()) {
            JSONObject item = entries.getJSONObject(index).getJSONObject("comment");
            String commentId = item.getString("id");
            String body = item.getString("text");
            String author = item.getString("user_name");
            int replies = item.getIntValue("reply_count");
            int diggs = item.getIntValue("digg_count");
            // create_time is in seconds.
            Date createdAt = new Date(item.getLongValue("create_time") * 1000);
            parsed.add(new TouTiaoComment(commentId, body, author, replies, diggs, createdAt, url));
            index++;
        }
    } catch (Exception e) {
        logger.debug("解析今日头条评论列表出现为题,{}",e);
    }
    return parsed;
}
/**
 * Determines how many 20-comment pages an article has.
 *
 * @param groupId group id of the article
 * @param proxy   proxy forwarded to the download helper
 * @return {@code ceil(total / 20)}, or -1 when the response is missing or
 *         cannot be parsed
 * @throws Exception declared for callers; parse errors are caught and logged
 */
private static int getPage(String groupId,Proxy proxy) throws Exception
{
    String listUrl = "http://www.toutiao.com/api/comment/list/?group_id="+ groupId +"&item_id=0&count=20&offset=0";
    String body = downloadHtml(listUrl, proxy, Tools.getTouTiaoHeader());
    if (body == null) {
        return -1;
    }
    try {
        int total = JSONObject.parseObject(body).getJSONObject("data").getIntValue("total");
        return (int) Math.ceil(total / 20.0);
    } catch (Exception e) {
        logger.info("获取评论总页数时出现问题:{}",e);
    }
    return -1;
}
/**
 * Reads the comment count embedded in an article page
 * ({@code comments_count: N,}), downloading the page up to three times until
 * it contains {@code commentInfo}.
 *
 * @author hero
 * @param url   article URL
 * @param proxy proxy forwarded to the download helper
 * @return the comment count, or 0 when the page never contains the marker or
 *         the number cannot be parsed
 */
public static int findCommentCount(String url,Proxy proxy)
{
    for (int i = 0; i < 3; i++) {
        try {
            Map<String,String> headerMap = Tools.getTouTiaoHeader();
            String htmlBody = downloadHtml(url, proxy, headerMap);
            if (htmlBody != null && htmlBody.contains("commentInfo")) {
                try {
                    // Trim so stray whitespace around the number does not break parsing.
                    return Integer.parseInt(htmlBody.split("comments_count: ")[1].split(",")[0].trim());
                } catch (Exception e) {
                    // Pass the exception itself so its original stack trace is logged.
                    logger.error("解析头条评论数错误:::{}", e.getMessage(), e);
                    return 0;
                }
            }
        } catch (Exception e) {
            logger.error("解析头条评论数错误:::{}", e.getMessage(), e);
        }
    }
    return 0;
}
/**
 * Reads the comment count embedded in an article page
 * ({@code comments_count: N,}) with a single download.
 *
 * @author hero
 * @param url   article URL
 * @param proxy proxy forwarded to the download helper
 * @return the comment count, or -1 when the page lacks {@code commentInfo},
 *         the number cannot be parsed, or an error occurs
 */
public static int findNewCommentCountByProxy(String url,Proxy proxy)
{
    try {
        Map<String,String> headerMap = Tools.getTouTiaoHeader();
        String htmlBody = downloadHtml(url, proxy, headerMap);
        if (htmlBody != null && htmlBody.contains("commentInfo")) {
            // Trim so stray whitespace around the number does not break parsing.
            return Integer.parseInt(htmlBody.split("comments_count: ")[1].split(",")[0].trim());
        }
    } catch (Exception e) {
        // Pass the exception itself so its original stack trace is logged.
        logger.error("解析头条评论数错误:::{}", e.getMessage(), e);
    }
    return -1;
}
/**
 * Looks up the total number of comments of an article through the comment
 * list API, trying up to three times.
 *
 * @param url   article URL from which the group id is derived
 * @param proxy proxy forwarded to the download helper
 * @return the {@code total} reported by the API, or -1 when the group id
 *         cannot be resolved or no attempt yields a parsable response
 * @throws Exception propagated from resolving the group id
 */
public static int getCommentCount(String url,Proxy proxy) throws Exception
{
    String group_id = getGroupId(url, proxy);
    if (group_id == null) {
        // Without a group id the request could only ask for "group_id=null".
        return -1;
    }
    String urlNew = "http://www.toutiao.com/api/comment/list/?group_id="+group_id+"&item_id=0&count=20&offset=0";
    for (int i = 0; i < 3; i++) {
        try {
            Map<String,String> headerMap = Tools.getTouTiaoHeader();
            String htmlBody = downloadHtml(urlNew, proxy, headerMap);
            if (htmlBody != null) {
                JSONObject json = JSONObject.parseObject(htmlBody);
                JSONObject data = json.getJSONObject("data");
                return data.getIntValue("total");
            }
        } catch (Exception e) {
            logger.info("获取评论总页数时出现问题:{}", e.getMessage(), e);
        }
    }
    return -1;
}
/**
 * Derives the group id used by the comment APIs from an article URL.
 *
 * URLs containing {@code /a} or {@code /group/} carry the id in the path;
 * URLs containing {@code /i} or {@code /item/} are resolved by downloading
 * the page.
 *
 * @param url   article URL
 * @param proxy proxy used when the page has to be downloaded
 * @return the group id, or null when the URL matches none of the known forms
 * @throws Exception propagated from {@code getGroupIdByUrl}
 */
private static String getGroupId(String url,Proxy proxy) throws Exception
{
    if (url.contains("/a")) {
        return url.split("/a")[1].replace("/", "");
    }
    if (url.contains("/group/")) {
        return url.split("/group/")[1].replace("/", "");
    }
    if (url.contains("/i") || url.contains("/item/")) {
        return getGroupIdByUrl(url, proxy);
    }
    return null;
}
/**
 * Downloads an article page and extracts the value that follows
 * {@code groupId: '} in it.
 *
 * @param url   article URL
 * @param proxy proxy forwarded to the download helper
 * @return the trimmed group id, or null when the page could not be
 *         downloaded or does not mention {@code groupId}
 * @throws Exception when the page mentions {@code groupId} but not in the
 *                   expected {@code groupId: '...'} form
 */
private static String getGroupIdByUrl(String url,Proxy proxy) throws Exception
{
    String page = downloadHtml(url, proxy, Tools.getTouTiaoHeader());
    if (page == null) {
        logger.info("获取groupId失败,链接地址为:{}",url);
        return null;
    }
    if (!page.contains("groupId")) {
        return null;
    }
    // Text after the marker, cut at the closing quote-comma pair.
    String afterMarker = page.split("groupId: '")[1];
    return afterMarker.split("',")[0].trim();
}
/**
 * Downloads a URL and returns the response body as text, making up to three
 * attempts.
 *
 * @param url       address to request
 * @param proxy     proxy to use; when null
 *                  {@code ProxyHolder.NAT_HEAVY_PROXY} is used instead
 * @param headerMap request headers handed to {@code RequestUtils.wrapGet}
 * @return the body text, or null when every attempt failed
 */
private static String downloadHtml(String url, Proxy proxy, Map<String,String> headerMap) {
    for (int attempt = 1; attempt <= 3; attempt++) {
        // try-with-resources releases the connection on every path.
        try (Response response = proxy != null
                ? httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy)
                : httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), ProxyHolder.NAT_HEAVY_PROXY)) {
            return response.body().string();
        } catch (Exception e) {
            // Pass the exception itself so its original stack trace is logged.
            logger.error("获取数据时出现问题,问题为:{}", e.getMessage(), e);
        }
    }
    return null;
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment