Commit ae21017e by zhiwei

添加解析今日头条单篇文章内容方法及代码规范部分修改

parent a5f5a270
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId> <groupId>com.zhiwei</groupId>
<artifactId>toutiao</artifactId> <artifactId>toutiao</artifactId>
<version>0.3.7-SNAPSHOT</version> <version>0.3.8-SNAPSHOT</version>
<dependencies> <dependencies>
<dependency> <dependency>
......
...@@ -19,10 +19,16 @@ import java.util.Date; ...@@ -19,10 +19,16 @@ import java.util.Date;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
...@@ -30,7 +36,6 @@ import com.zhiwei.crawler.core.HttpBoot; ...@@ -30,7 +36,6 @@ import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils; import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.tools.timeparse.TimeParse; import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.ZhiWeiTools;
import com.zhiwei.toutiao.bean.Signature; import com.zhiwei.toutiao.bean.Signature;
import com.zhiwei.toutiao.bean.TouTiaoArticle; import com.zhiwei.toutiao.bean.TouTiaoArticle;
import com.zhiwei.toutiao.util.Tools; import com.zhiwei.toutiao.util.Tools;
...@@ -43,9 +48,8 @@ import okhttp3.Response; ...@@ -43,9 +48,8 @@ import okhttp3.Response;
* @date 2016年9月2日 上午11:17:44 * @date 2016年9月2日 上午11:17:44
*/ */
public class TouTiaoArticleParse { public class TouTiaoArticleParse {
private TouTiaoArticleParse() {
} private static ScriptEngine scriptEngine = new ScriptEngineManager().getEngineByName("javascript");
private static Logger logger = LogManager.getLogger(TouTiaoArticleParse.class); private static Logger logger = LogManager.getLogger(TouTiaoArticleParse.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
...@@ -60,13 +64,13 @@ public class TouTiaoArticleParse { ...@@ -60,13 +64,13 @@ public class TouTiaoArticleParse {
* @return List<TouTiao> 返回类型 * @return List<TouTiao> 返回类型
* @throws Exception * @throws Exception
*/ */
public static Map<String, Object> getTouTiaoList(String media_id, String max_behot_time, Date endData, Proxy proxy) public static Map<String, Object> getTouTiaoList(String mediaId, String maxBehotTime, Date endData, Proxy proxy)
throws Exception { throws Exception {
Signature signature = new Signature(); Signature signature = new Signature();
String url = "https://www.toutiao.com/pgc/ma/?page_type=1&media_id=" + media_id + "&count=20&as=" String url = "https://www.toutiao.com/pgc/ma/?page_type=1&media_id=" + mediaId + "&count=20&as="
+ signature.getAs() + "&cp=" + signature.getCp(); + signature.getAs() + "&cp=" + signature.getCp();
if (max_behot_time != null) { if (maxBehotTime != null) {
url = url + "&max_behot_time=" + max_behot_time; url = url + "&max_behot_time=" + maxBehotTime;
} }
Map<String, String> headerMap = Tools.getTouTiaoHeader(); Map<String, String> headerMap = Tools.getTouTiaoHeader();
headerMap.put("Referer", url); headerMap.put("Referer", url);
...@@ -88,13 +92,13 @@ public class TouTiaoArticleParse { ...@@ -88,13 +92,13 @@ public class TouTiaoArticleParse {
} }
public static Map<String, Object> getTouTiaoList(String media_id, String max_behot_time, Date endData, ProxyHolder proxy) public static Map<String, Object> getTouTiaoList(String mediaId, String maxBehotTime, Date endData, ProxyHolder proxy)
throws Exception { throws Exception {
Signature signature = new Signature(); Signature signature = new Signature();
String url = "https://www.toutiao.com/pgc/ma/?page_type=1&media_id=" + media_id + "&count=20&as=" String url = "https://www.toutiao.com/pgc/ma/?page_type=1&media_id=" + mediaId + "&count=20&as="
+ signature.getAs() + "&cp=" + signature.getCp(); + signature.getAs() + "&cp=" + signature.getCp();
if (max_behot_time != null) { if (maxBehotTime != null) {
url = url + "&max_behot_time=" + max_behot_time; url = url + "&max_behot_time=" + maxBehotTime;
} }
Map<String, String> headerMap = Tools.getTouTiaoHeader(); Map<String, String> headerMap = Tools.getTouTiaoHeader();
headerMap.put("Referer", url); headerMap.put("Referer", url);
...@@ -125,23 +129,23 @@ public class TouTiaoArticleParse { ...@@ -125,23 +129,23 @@ public class TouTiaoArticleParse {
* @return * @return
* @throws Exception * @throws Exception
*/ */
public static Map<String, Object> getTouTiaoHistory(String user_id, String max_behot_time, Date endData, public static Map<String, Object> getTouTiaoHistory(String userId, String maxBehotTime, Date endData,
Proxy proxy) throws Exception { Proxy proxy) throws Exception {
for (int i = 0; i < 3; i++) { for (int i = 0; i < 3; i++) {
Signature signature = new Signature(user_id, max_behot_time); Signature signature = new Signature(userId, maxBehotTime);
String as = signature.getAs(); String as = signature.getAs();
String cp = signature.getCp(); String cp = signature.getCp();
String _signature = signature.getSignature(); String signatureStr = signature.getSignature();
String url = "https://www.toutiao.com/c/user/article/?page_type=1&user_id=" + user_id + "&max_behot_time=" String url = "https://www.toutiao.com/c/user/article/?page_type=1&user_id=" + userId + "&max_behot_time="
+ max_behot_time + "&count=20&as=" + as + "&cp=" + cp + "&_signature=" + _signature; + maxBehotTime + "&count=20&as=" + as + "&cp=" + cp + "&_signature=" + signatureStr;
Map<String, String> headerMap = new HashMap<String, String>(); Map<String, String> headerMap = new HashMap<String, String>();
headerMap.put("user-agent", headerMap.put("user-agent",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"); "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36");
headerMap.put("referer", "https://www.toutiao.com/c/user/" + user_id + "/"); headerMap.put("referer", "https://www.toutiao.com/c/user/" + userId + "/");
try { try {
String htmlBody = downloadHtml(url, proxy, headerMap); String htmlBody = downloadHtml(url, proxy, headerMap);
if (htmlBody != null && htmlBody.contains("behot_time")) { if (htmlBody != null && htmlBody.contains("behot_time")) {
Map<String, Object> ttList = parseHtmlByAccount(user_id, htmlBody, endData); Map<String, Object> ttList = parseHtmlByAccount(userId, htmlBody, endData);
if (ttList != null && ttList.size() > 0) { if (ttList != null && ttList.size() > 0) {
return ttList; return ttList;
} }
...@@ -157,25 +161,25 @@ public class TouTiaoArticleParse { ...@@ -157,25 +161,25 @@ public class TouTiaoArticleParse {
return Collections.emptyMap(); return Collections.emptyMap();
} }
public static Map<String, Object> getTouTiaoHistory(String user_id, String max_behot_time, Date endData, public static Map<String, Object> getTouTiaoHistory(String userId, String maxBehotTime, Date endData,
ProxyHolder proxy) throws Exception { ProxyHolder proxy) throws Exception {
for (int i = 0; i < 3; i++) { for (int i = 0; i < 3; i++) {
Signature signature = new Signature(user_id, max_behot_time); Signature signature = new Signature(userId, maxBehotTime);
String as = signature.getAs(); String as = signature.getAs();
String cp = signature.getCp(); String cp = signature.getCp();
String _signature = signature.getSignature(); String signatureStr = signature.getSignature();
String url = "https://www.toutiao.com/c/user/article/?page_type=1&user_id=" + user_id + "&max_behot_time=" String url = "https://www.toutiao.com/c/user/article/?page_type=1&user_id=" + userId + "&max_behot_time="
+ max_behot_time + "&count=20&as=" + as + "&cp=" + cp + "&_signature=" + _signature; + maxBehotTime + "&count=20&as=" + as + "&cp=" + cp + "&_signature=" + signatureStr;
logger.info("当前采集的历史文章链接:::{}", url); logger.info("当前采集的历史文章链接:::{}", url);
Map<String, String> headerMap = new HashMap<>(); Map<String, String> headerMap = new HashMap<>();
headerMap.put("user-agent", headerMap.put("user-agent",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"); "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36");
headerMap.put("referer", "https://www.toutiao.com/c/user/" + user_id + "/"); headerMap.put("referer", "https://www.toutiao.com/c/user/" + userId + "/");
String htmlBody = null; String htmlBody = null;
try { try {
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string(); htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string();
if (htmlBody != null && htmlBody.contains("behot_time")) { if (htmlBody != null && htmlBody.contains("behot_time")) {
Map<String, Object> ttList = parseHtmlByAccount(user_id, htmlBody, endData); Map<String, Object> ttList = parseHtmlByAccount(userId, htmlBody, endData);
if (ttList != null && ttList.size() > 0) { if (ttList != null && ttList.size() > 0) {
return ttList; return ttList;
} else { } else {
...@@ -203,13 +207,13 @@ public class TouTiaoArticleParse { ...@@ -203,13 +207,13 @@ public class TouTiaoArticleParse {
* @return List<String> 返回类型 * @return List<String> 返回类型
*/ */
private static Map<String, Object> parseHtmlByAccount(String htmlBody, Date endDate) { private static Map<String, Object> parseHtmlByAccount(String htmlBody, Date endDate) {
Map<String, Object> map = new HashMap<String, Object>(); Map<String, Object> map = new HashMap<>();
Long max_behot_time = null; Long maxBehotTime = null;
List<TouTiaoArticle> dataList = new ArrayList<TouTiaoArticle>(); List<TouTiaoArticle> dataList = new ArrayList<>();
try { try {
JSONObject json = JSONObject.parseObject(htmlBody); JSONObject json = JSONObject.parseObject(htmlBody);
JSONArray jsonArray = json.getJSONArray("data"); JSONArray jsonArray = json.getJSONArray("data");
max_behot_time = Long.valueOf(json.getJSONObject("next").getString("max_behot_time")); maxBehotTime = Long.valueOf(json.getJSONObject("next").getString("max_behot_time"));
String title = null; String title = null;
String content = null; String content = null;
String time = null; String time = null;
...@@ -219,7 +223,7 @@ public class TouTiaoArticleParse { ...@@ -219,7 +223,7 @@ public class TouTiaoArticleParse {
String playNum = null; String playNum = null;
String shareNum = null; String shareNum = null;
String source = null; String source = null;
String user_id = null; String userId = null;
String articleType = null; String articleType = null;
List<String> labelList = null; List<String> labelList = null;
String likeNum = null; String likeNum = null;
...@@ -238,9 +242,9 @@ public class TouTiaoArticleParse { ...@@ -238,9 +242,9 @@ public class TouTiaoArticleParse {
playNum = data.getString("detail_play_effective_count"); playNum = data.getString("detail_play_effective_count");
shareNum = data.getString("share_count"); shareNum = data.getString("share_count");
source = data.getString("source"); source = data.getString("source");
user_id = data.getLong("creator_uid") + ""; userId = data.getLong("creator_uid") + "";
articleType = data.getString("chinese_tag"); articleType = data.getString("chinese_tag");
TouTiaoArticle tt = new TouTiaoArticle(href, title, user_id, source, date, content, commentNum, TouTiaoArticle tt = new TouTiaoArticle(href, title, userId, source, date, content, commentNum,
playNum, readNum, shareNum, "今日头条", articleType,likeNum); playNum, readNum, shareNum, "今日头条", articleType,likeNum);
if (data.containsKey("label")) { if (data.containsKey("label")) {
labelList = data.getJSONArray("label").toJavaList(String.class); labelList = data.getJSONArray("label").toJavaList(String.class);
...@@ -259,26 +263,26 @@ public class TouTiaoArticleParse { ...@@ -259,26 +263,26 @@ public class TouTiaoArticleParse {
} }
if (endDate != null) { if (endDate != null) {
if (max_behot_time != null && !"0".equals(max_behot_time)) { if (maxBehotTime != null && !"0".equals(maxBehotTime)) {
Date nextDate = new Date(Long.valueOf(max_behot_time + "000")); Date nextDate = new Date(Long.valueOf(maxBehotTime + "000"));
if (endDate.after(nextDate)) { if (endDate.after(nextDate)) {
max_behot_time = null; maxBehotTime = null;
} }
} }
} }
map.put("max_behot_time", max_behot_time); map.put("max_behot_time", maxBehotTime);
map.put("data", dataList); map.put("data", dataList);
return map; return map;
} }
private static Map<String, Object> parseHtmlByAccount(String user_id, String htmlBody, Date endDate) { private static Map<String, Object> parseHtmlByAccount(String userId, String htmlBody, Date endDate) {
Map<String, Object> map = new HashMap<String, Object>(); Map<String, Object> map = new HashMap<>();
Long max_behot_time = null; Long maxBehotTime = null;
List<TouTiaoArticle> dataList = new ArrayList<TouTiaoArticle>(); List<TouTiaoArticle> dataList = new ArrayList<>();
try { try {
JSONObject json = JSONObject.parseObject(htmlBody); JSONObject json = JSONObject.parseObject(htmlBody);
JSONArray jsonArray = json.getJSONArray("data"); JSONArray jsonArray = json.getJSONArray("data");
max_behot_time = Long.valueOf(json.getJSONObject("next").getString("max_behot_time")); maxBehotTime = Long.valueOf(json.getJSONObject("next").getString("max_behot_time"));
String title = null; String title = null;
String content = null; String content = null;
String time = null; String time = null;
...@@ -307,7 +311,7 @@ public class TouTiaoArticleParse { ...@@ -307,7 +311,7 @@ public class TouTiaoArticleParse {
shareNum = data.getString("share_count"); shareNum = data.getString("share_count");
source = data.getString("source"); source = data.getString("source");
articleType = data.getString("chinese_tag"); articleType = data.getString("chinese_tag");
TouTiaoArticle tt = new TouTiaoArticle(href, title, user_id, source, date, content, commentNum, TouTiaoArticle tt = new TouTiaoArticle(href, title, userId, source, date, content, commentNum,
playNum, readNum, shareNum, "今日头条", articleType,likeNum); playNum, readNum, shareNum, "今日头条", articleType,likeNum);
if (data.containsKey("label")) { if (data.containsKey("label")) {
labelList = data.getJSONArray("label").toJavaList(String.class); labelList = data.getJSONArray("label").toJavaList(String.class);
...@@ -325,14 +329,14 @@ public class TouTiaoArticleParse { ...@@ -325,14 +329,14 @@ public class TouTiaoArticleParse {
return null; return null;
} }
if (endDate != null) { if (endDate != null) {
if (max_behot_time != null && !"0".equals(max_behot_time)) { if (maxBehotTime != null && !"0".equals(maxBehotTime)) {
Date nextDate = new Date(Long.valueOf(max_behot_time + "000")); Date nextDate = new Date(Long.valueOf(maxBehotTime + "000"));
if (endDate.after(nextDate)) { if (endDate.after(nextDate)) {
max_behot_time = null; maxBehotTime = null;
} }
} }
} }
map.put("max_behot_time", max_behot_time); map.put("max_behot_time", maxBehotTime);
map.put("data", dataList); map.put("data", dataList);
return map; return map;
} }
...@@ -352,14 +356,14 @@ public class TouTiaoArticleParse { ...@@ -352,14 +356,14 @@ public class TouTiaoArticleParse {
* IOException 设定文件 * IOException 设定文件
* @return List<Map<String,Object>> 返回类型 * @return List<Map<String,Object>> 返回类型
*/ */
public static Map<String, Object> getMicroTouTiaoCrawler(String user_id, Date endDate, Proxy proxy, public static Map<String, Object> getMicroTouTiaoCrawler(String userId, Date endDate, Proxy proxy,
String max_behot_time) throws IOException { String maxBehotTime) throws IOException {
String url = "https://www.toutiao.com/api/pc/feed/?category=pc_profile_ugc&utm_source=toutiao&visit_user_id=" + user_id; String url = "https://www.toutiao.com/api/pc/feed/?category=pc_profile_ugc&utm_source=toutiao&visit_user_id=" + userId;
if (max_behot_time != null) { if (maxBehotTime != null) {
url = url + "?max_behot_time=" + max_behot_time; url = url + "?max_behot_time=" + maxBehotTime;
} }
Map<String, String> headerMap = Tools.getTouTiaoHeader(); Map<String, String> headerMap = Tools.getTouTiaoHeader();
headerMap.put("Referer", "https://www.toutiao.com/c/user/" + user_id + "/?tab=weitoutiao"); headerMap.put("Referer", "https://www.toutiao.com/c/user/" + userId + "/?tab=weitoutiao");
try { try {
String htmlBody = downloadHtml(url, proxy, headerMap); String htmlBody = downloadHtml(url, proxy, headerMap);
if (htmlBody != null) { if (htmlBody != null) {
...@@ -377,15 +381,15 @@ public class TouTiaoArticleParse { ...@@ -377,15 +381,15 @@ public class TouTiaoArticleParse {
return null; return null;
} }
public static Map<String, Object> getMicroTouTiaoCrawler(String user_id, Date endDate, ProxyHolder proxy, public static Map<String, Object> getMicroTouTiaoCrawler(String userId, Date endDate, ProxyHolder proxy,
Long max_behot_time) throws IOException { Long maxBehotTime) throws IOException {
String url = "https://www.toutiao.com/api/pc/feed/?category=pc_profile_ugc&utm_source=toutiao&visit_user_id=" + user_id; String url = "https://www.toutiao.com/api/pc/feed/?category=pc_profile_ugc&utm_source=toutiao&visit_user_id=" + userId;
if (max_behot_time != null) { if (maxBehotTime != null) {
url = url + "&max_behot_time=" + max_behot_time; url = url + "&max_behot_time=" + maxBehotTime;
} }
logger.info("微头条采集链接:::{}", url); logger.info("微头条采集链接:::{}", url);
Map<String, String> headerMap = Tools.getTouTiaoHeader(); Map<String, String> headerMap = Tools.getTouTiaoHeader();
headerMap.put("Referer", "https://www.toutiao.com/c/user/" + user_id + "/?tab=weitoutiao"); headerMap.put("Referer", "https://www.toutiao.com/c/user/" + userId + "/?tab=weitoutiao");
try { try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string(); String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string();
if (htmlBody != null && htmlBody.contains("create_time")) { if (htmlBody != null && htmlBody.contains("create_time")) {
...@@ -413,24 +417,21 @@ public class TouTiaoArticleParse { ...@@ -413,24 +417,21 @@ public class TouTiaoArticleParse {
* @return * @return
*/ */
public static List<Map<String,Object>> getClientMicroToutiaoCrawler(String userId, ProxyHolder proxy, public static List<Map<String,Object>> getClientMicroToutiaoCrawler(String userId, ProxyHolder proxy,
Long max_behot_time) { Long maxBehotTime) {
List<Map<String,Object>> dataList = new ArrayList<>(); List<Map<String,Object>> dataList = new ArrayList<>();
String ma = ""; String ma = "";
while(true) { while(true) {
String url = "https://i.snssdk.com/api/feed/profile/v1/?visited_uid="+userId+"&offset="+max_behot_time; String url = "https://i.snssdk.com/api/feed/profile/v1/?visited_uid="+userId+"&offset="+ maxBehotTime;
System.out.println(url); ma = String.valueOf(maxBehotTime);
ma = String.valueOf(max_behot_time);
try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url),proxy)){ try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url),proxy)){
String result = response.body().string(); String result = response.body().string();
JSONObject json = JSONObject.parseObject(result); JSONObject json = JSONObject.parseObject(result);
max_behot_time = json.getLongValue("offset"); maxBehotTime = json.getLongValue("offset");
JSONArray jsonArray = json.getJSONArray("data"); JSONArray jsonArray = json.getJSONArray("data");
System.out.println(json.toString());
for (int i = 0; i < jsonArray.size(); i++) { for (int i = 0; i < jsonArray.size(); i++) {
JSONObject data = jsonArray.getJSONObject(i); JSONObject data = jsonArray.getJSONObject(i);
try { try {
JSONObject dataJSON = data.getJSONObject("content").getJSONObject("raw_data"); JSONObject dataJSON = data.getJSONObject("content").getJSONObject("raw_data");
System.out.println(dataJSON.toString());
Map<String,Object> map = new HashMap<>(); Map<String,Object> map = new HashMap<>();
if(dataJSON.containsKey("comment_base") && dataJSON.getJSONObject("comment_base")!=null) { if(dataJSON.containsKey("comment_base") && dataJSON.getJSONObject("comment_base")!=null) {
JSONObject commentBase = dataJSON.getJSONObject("comment_base"); JSONObject commentBase = dataJSON.getJSONObject("comment_base");
...@@ -454,7 +455,6 @@ public class TouTiaoArticleParse { ...@@ -454,7 +455,6 @@ public class TouTiaoArticleParse {
map.put("readNum", readNum); map.put("readNum", readNum);
map.put("commentNum", commentNum); map.put("commentNum", commentNum);
map.put("user_id", user_id); map.put("user_id", user_id);
// System.out.println(map.toString());
dataList.add(map); dataList.add(map);
} }
} catch (Exception e) { } catch (Exception e) {
...@@ -463,8 +463,8 @@ public class TouTiaoArticleParse { ...@@ -463,8 +463,8 @@ public class TouTiaoArticleParse {
} }
} }
System.out.println(" 采集到 条 == "+dataList.size() + " -- " +ma + " -- " + max_behot_time); System.out.println(" 采集到 条 == "+dataList.size() + " -- " +ma + " -- " + maxBehotTime);
if(ma.equals(String.valueOf(max_behot_time))) { if(ma.equals(String.valueOf(maxBehotTime))) {
break; break;
} }
} catch (Exception e) { } catch (Exception e) {
...@@ -487,16 +487,16 @@ public class TouTiaoArticleParse { ...@@ -487,16 +487,16 @@ public class TouTiaoArticleParse {
* @return Map<String,Object> 返回类型 * @return Map<String,Object> 返回类型
*/ */
private static Map<String, Object> parseHtmlByMicroAccount(String htmlBody, Date endDate) { private static Map<String, Object> parseHtmlByMicroAccount(String htmlBody, Date endDate) {
Map<String, Object> map = new HashMap<String, Object>(); Map<String, Object> map = new HashMap<>();
Long max_behot_time = null; Long maxBehotTime = null;
List<TouTiaoArticle> dataList = new ArrayList<TouTiaoArticle>(); List<TouTiaoArticle> dataList = new ArrayList<>();
try { try {
JSONObject json = JSONObject.parseObject(htmlBody); JSONObject json = JSONObject.parseObject(htmlBody);
boolean more = false; boolean more = false;
if(json.containsKey("has_more")) { if(json.containsKey("has_more")) {
more = json.getBoolean("has_more"); more = json.getBoolean("has_more");
} }
max_behot_time = json.getJSONObject("next").getLongValue("max_behot_time"); maxBehotTime = json.getJSONObject("next").getLongValue("max_behot_time");
JSONArray jsonArray = json.getJSONArray("data"); JSONArray jsonArray = json.getJSONArray("data");
Date date = null; Date date = null;
String href = null; String href = null;
...@@ -564,19 +564,19 @@ public class TouTiaoArticleParse { ...@@ -564,19 +564,19 @@ public class TouTiaoArticleParse {
/** 验证是否有下一页数据 **/ /** 验证是否有下一页数据 **/
if (more) { if (more) {
if (max_behot_time != null && max_behot_time != 0) { if (maxBehotTime != null && maxBehotTime != 0) {
if (endDate.after(date)) { if (endDate.after(date)) {
max_behot_time = null; maxBehotTime = null;
} }
} }
} else { } else {
max_behot_time = null; maxBehotTime = null;
} }
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.printStackTrace();
} }
map.put("max_behot_time", max_behot_time); map.put("max_behot_time", maxBehotTime);
map.put("data", dataList); map.put("data", dataList);
return map; return map;
...@@ -591,10 +591,17 @@ public class TouTiaoArticleParse { ...@@ -591,10 +591,17 @@ public class TouTiaoArticleParse {
public static String getContent(String url,Proxy proxy) { public static String getContent(String url,Proxy proxy) {
try { try {
String htmlBody = downloadHtml(url, proxy, null); String htmlBody = downloadHtml(url, proxy, null);
if(!StringUtils.isBlank(htmlBody)) { String regex = "<script>var BASE_DATA[\\s\\S]+?</script>";
if(htmlBody.contains("content:")) { if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("articleInfo")) {
String content = htmlBody.split(" content: '")[1].split("',")[0]; //通过正则截取需要的js代码
return ZhiWeiTools.delHTMLTag(content); Matcher matcher = Pattern.compile(regex).matcher(htmlBody);
if(matcher.find()) {
String content = matcher.group().replace("<script>var BASE_DATA = |;</script>", "");
//通过js引擎执行js代码
String jsContent = "eval(("+ content +")).articleInfo.content.toString();";
String contentHtml = scriptEngine.eval(jsContent).toString();
//解析最后的数据
return Jsoup.parse(contentHtml).text();
} }
} }
return null; return null;
...@@ -605,16 +612,14 @@ public class TouTiaoArticleParse { ...@@ -605,16 +612,14 @@ public class TouTiaoArticleParse {
} }
/**
* 下载数据
* @param url
* @param proxy
* @param headMap
* @return
*/
private static String downloadHtml(String url, Proxy proxy, Map<String,String> headMap) { private static String downloadHtml(String url, Proxy proxy, Map<String,String> headMap) {
// 下载数据页面
for (int i = 1; i <= 3; i++) { for (int i = 1; i <= 3; i++) {
try { try {
Response response = null; Response response = null;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment