Commit 474fd84b by zhiwei

修复微头条采集

parent f4ed3aa0
...@@ -349,12 +349,12 @@ public class TouTiaoArticleParse { ...@@ -349,12 +349,12 @@ public class TouTiaoArticleParse {
*/ */
public static Map<String, Object> getMicroTouTiaoCrawler(String user_id, Date endDate, Proxy proxy, public static Map<String, Object> getMicroTouTiaoCrawler(String user_id, Date endDate, Proxy proxy,
String max_behot_time) throws IOException { String max_behot_time) throws IOException {
String url = "https://www.toutiao.com/c/ugc/content/list/" + user_id + "/"; String url = "https://www.toutiao.com/api/pc/feed/?category=pc_profile_ugc&utm_source=toutiao&visit_user_id=" + user_id;
if (max_behot_time != null) { if (max_behot_time != null) {
url = url + "?max_time=" + max_behot_time; url = url + "?max_behot_time=" + max_behot_time;
} }
Map<String, String> headerMap = Tools.getTouTiaoHeader(); Map<String, String> headerMap = Tools.getTouTiaoHeader();
headerMap.put("Referer", "https://www.toutiao.com/c/user/" + user_id + "/"); headerMap.put("Referer", "https://www.toutiao.com/c/user/" + user_id + "/?tab=weitoutiao");
try { try {
String htmlBody = downloadHtml(url, proxy, headerMap); String htmlBody = downloadHtml(url, proxy, headerMap);
if (htmlBody != null) { if (htmlBody != null) {
...@@ -374,12 +374,12 @@ public class TouTiaoArticleParse { ...@@ -374,12 +374,12 @@ public class TouTiaoArticleParse {
public static Map<String, Object> getMicroTouTiaoCrawler(String user_id, Date endDate, ProxyHolder proxy, public static Map<String, Object> getMicroTouTiaoCrawler(String user_id, Date endDate, ProxyHolder proxy,
Long max_behot_time) throws IOException { Long max_behot_time) throws IOException {
String url = "https://www.toutiao.com/c/ugc/content/list/" + user_id + "/"; String url = "https://www.toutiao.com/api/pc/feed/?category=pc_profile_ugc&utm_source=toutiao&visit_user_id=" + user_id;
if (max_behot_time != null) { if (max_behot_time != null) {
url = url + "?max_time=" + max_behot_time; url = url + "?max_behot_time=" + max_behot_time;
} }
Map<String, String> headerMap = Tools.getTouTiaoHeader(); Map<String, String> headerMap = Tools.getTouTiaoHeader();
headerMap.put("Referer", "https://www.toutiao.com/c/user/" + user_id + "/"); headerMap.put("Referer", "https://www.toutiao.com/c/user/" + user_id + "/?tab=weitoutiao");
try { try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string(); String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string();
if (htmlBody != null) { if (htmlBody != null) {
...@@ -414,9 +414,10 @@ public class TouTiaoArticleParse { ...@@ -414,9 +414,10 @@ public class TouTiaoArticleParse {
Long max_behot_time = null; Long max_behot_time = null;
List<TouTiaoArticle> dataList = new ArrayList<TouTiaoArticle>(); List<TouTiaoArticle> dataList = new ArrayList<TouTiaoArticle>();
try { try {
JSONObject json = JSONObject.parseObject(htmlBody).getJSONObject("data"); JSONObject json = JSONObject.parseObject(htmlBody);
boolean more = json.getBoolean("has_more"); boolean more = json.getBoolean("has_more");
JSONArray jsonArray = json.getJSONArray("list"); max_behot_time = json.getJSONObject("next").getLongValue("max_behot_time");
JSONArray jsonArray = json.getJSONArray("data");
Date date = null; Date date = null;
String href = null; String href = null;
String source = null; String source = null;
...@@ -431,19 +432,44 @@ public class TouTiaoArticleParse { ...@@ -431,19 +432,44 @@ public class TouTiaoArticleParse {
for (int i = 0; i < jsonArray.size(); i++) { for (int i = 0; i < jsonArray.size(); i++) {
try { try {
JSONObject data = jsonArray.getJSONObject(i); JSONObject data = jsonArray.getJSONObject(i);
max_behot_time = data.getLongValue("create_time"); String text = null;
date = new Date(max_behot_time * 1000); if(data.containsKey("stream_cell") && data.getJSONObject("stream_cell")!=null) {
href = "https://www.toutiao.com/a" + data.getString("thread_id"); text = data.getJSONObject("stream_cell").getString("raw_data");
source = data.getJSONObject("ugc_user").getString("name"); }else if(data.containsKey("concern_talk_cell")) {
content = data.getString("content"); text = data.getJSONObject("concern_talk_cell").getString("packed_json_str");
readNum = data.getInteger("read_count") + ""; }
commentNum = data.getInteger("comment_count") + "";
user_id = data.getJSONObject("ugc_user").getString("user_id"); JSONObject dataJSON = JSONObject.parseObject(text);
if (content != null && !"".equals(content)) { if(dataJSON.containsKey("comment_base") && dataJSON.getJSONObject("comment_base")!=null) {
if (content.length() < 16) { JSONObject comment_base = dataJSON.getJSONObject("comment_base");
count = content.length(); date = new Date(comment_base.getLongValue("create_time") * 1000);
href = "https://www.toutiao.com/a" + dataJSON.getLongValue("id");
source = comment_base.getJSONObject("user").getJSONObject("info").getString("name");
content = dataJSON.getString("content");
readNum = dataJSON.getJSONObject("action").getInteger("read_count") + "";
commentNum = dataJSON.getJSONObject("action").getInteger("comment_count") + "";
user_id = comment_base.getJSONObject("user").getJSONObject("info").getString("user_id");
if (content != null && !"".equals(content)) {
if (content.length() < 16) {
count = content.length();
}
title = content.substring(0, count);
}
}else {
date = new Date(dataJSON.getLongValue("create_time") * 1000);
href = "https://www.toutiao.com/a" + dataJSON.getString("thread_id");
source = dataJSON.getJSONObject("user").getString("name");
content = dataJSON.getString("content");
readNum = dataJSON.getInteger("read_count") + "";
commentNum = dataJSON.getInteger("comment_count") + "";
user_id = dataJSON.getJSONObject("user").getString("user_id");
if (content != null && !"".equals(content)) {
if (content.length() < 16) {
count = content.length();
}
title = content.substring(0, count);
} }
title = content.substring(0, count);
} }
TouTiaoArticle tt = new TouTiaoArticle(href, title, user_id, source, date, content, commentNum, TouTiaoArticle tt = new TouTiaoArticle(href, title, user_id, source, date, content, commentNum,
playNum, readNum, "0", "微头条", articleType); playNum, readNum, "0", "微头条", articleType);
...@@ -452,6 +478,7 @@ public class TouTiaoArticleParse { ...@@ -452,6 +478,7 @@ public class TouTiaoArticleParse {
continue; continue;
} }
} }
/** 验证是否有下一页数据 **/ /** 验证是否有下一页数据 **/
if (more) { if (more) {
if (max_behot_time != null && max_behot_time != 0) { if (max_behot_time != null && max_behot_time != 0) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment