Commit 5578cebf by zhiwei

修复微头条采集问题

parent ae21017e
...@@ -12,7 +12,11 @@ package com.zhiwei.toutiao.bean; ...@@ -12,7 +12,11 @@ package com.zhiwei.toutiao.bean;
import java.io.Serializable; import java.io.Serializable;
import java.util.Date; import java.util.Date;
import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map;
import com.alibaba.fastjson.JSONObject;
/** /**
* @Description: * @Description:
...@@ -165,5 +169,4 @@ public class TouTiaoArticle implements Serializable{ ...@@ -165,5 +169,4 @@ public class TouTiaoArticle implements Serializable{
+ "]"; + "]";
} }
} }
...@@ -360,10 +360,11 @@ public class TouTiaoArticleParse { ...@@ -360,10 +360,11 @@ public class TouTiaoArticleParse {
String maxBehotTime) throws IOException { String maxBehotTime) throws IOException {
String url = "https://www.toutiao.com/api/pc/feed/?category=pc_profile_ugc&utm_source=toutiao&visit_user_id=" + userId; String url = "https://www.toutiao.com/api/pc/feed/?category=pc_profile_ugc&utm_source=toutiao&visit_user_id=" + userId;
if (maxBehotTime != null) { if (maxBehotTime != null) {
url = url + "?max_behot_time=" + maxBehotTime; url = url + "&max_behot_time=" + maxBehotTime;
} }
System.out.println(url);
Map<String, String> headerMap = Tools.getTouTiaoHeader(); Map<String, String> headerMap = Tools.getTouTiaoHeader();
headerMap.put("Referer", "https://www.toutiao.com/c/user/" + userId + "/?tab=weitoutiao"); headerMap.put("Referer", "https://www.toutiao.com/c/user/" + userId + "/");
try { try {
String htmlBody = downloadHtml(url, proxy, headerMap); String htmlBody = downloadHtml(url, proxy, headerMap);
if (htmlBody != null) { if (htmlBody != null) {
...@@ -389,7 +390,7 @@ public class TouTiaoArticleParse { ...@@ -389,7 +390,7 @@ public class TouTiaoArticleParse {
} }
logger.info("微头条采集链接:::{}", url); logger.info("微头条采集链接:::{}", url);
Map<String, String> headerMap = Tools.getTouTiaoHeader(); Map<String, String> headerMap = Tools.getTouTiaoHeader();
headerMap.put("Referer", "https://www.toutiao.com/c/user/" + userId + "/?tab=weitoutiao"); headerMap.put("Referer", "https://www.toutiao.com/c/user/" + userId + "/");
try { try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string(); String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string();
if (htmlBody != null && htmlBody.contains("create_time")) { if (htmlBody != null && htmlBody.contains("create_time")) {
...@@ -441,7 +442,7 @@ public class TouTiaoArticleParse { ...@@ -441,7 +442,7 @@ public class TouTiaoArticleParse {
String content = commentBase.getString("content"); String content = commentBase.getString("content");
String readNum = commentBase.getJSONObject("action").getInteger("read_count") + ""; String readNum = commentBase.getJSONObject("action").getInteger("read_count") + "";
String commentNum = commentBase.getJSONObject("action").getInteger("comment_count") + ""; String commentNum = commentBase.getJSONObject("action").getInteger("comment_count") + "";
String user_id = commentBase.getJSONObject("user").getJSONObject("info").getString("user_id"); userId = commentBase.getJSONObject("user").getJSONObject("info").getString("user_id");
if(dataJSON.containsKey("origin_group")) { if(dataJSON.containsKey("origin_group")) {
String replayUrl = dataJSON.getJSONObject("origin_group").getString("article_url"); String replayUrl = dataJSON.getJSONObject("origin_group").getString("article_url");
String title = dataJSON.getJSONObject("origin_group").getString("title"); String title = dataJSON.getJSONObject("origin_group").getString("title");
...@@ -454,7 +455,7 @@ public class TouTiaoArticleParse { ...@@ -454,7 +455,7 @@ public class TouTiaoArticleParse {
map.put("content", content); map.put("content", content);
map.put("readNum", readNum); map.put("readNum", readNum);
map.put("commentNum", commentNum); map.put("commentNum", commentNum);
map.put("user_id", user_id); map.put("user_id", userId);
dataList.add(map); dataList.add(map);
} }
} catch (Exception e) { } catch (Exception e) {
...@@ -496,72 +497,80 @@ public class TouTiaoArticleParse { ...@@ -496,72 +497,80 @@ public class TouTiaoArticleParse {
if(json.containsKey("has_more")) { if(json.containsKey("has_more")) {
more = json.getBoolean("has_more"); more = json.getBoolean("has_more");
} }
maxBehotTime = json.getJSONObject("next").getLongValue("max_behot_time"); if(json.containsKey("next")) {
JSONArray jsonArray = json.getJSONArray("data"); maxBehotTime = json.getJSONObject("next").getLongValue("max_behot_time");
}
Date date = null; Date date = null;
String href = null; if(json.containsKey("data")) {
String source = null; JSONArray jsonArray = json.getJSONArray("data");
String title = null; String href = null;
String content = null; String source = null;
String readNum = null; String title = null;
String commentNum = null; String content = null;
String playNum = null; String readNum = null;
String user_id = null; String commentNum = null;
String likeNum = null; String playNum = null;
String articleType = null; String userId = null;
int count = 16; String likeNum = null;
for (int i = 0; i < jsonArray.size(); i++) { String articleType = null;
try { int count = 16;
JSONObject data = jsonArray.getJSONObject(i); for (int i = 0; i < jsonArray.size(); i++) {
String text = null; try {
if(data.containsKey("stream_cell") && data.getJSONObject("stream_cell")!=null) { JSONObject data = jsonArray.getJSONObject(i);
text = data.getJSONObject("stream_cell").getString("raw_data"); String text = null;
}else if(data.containsKey("concern_talk_cell")) { if(data.containsKey("stream_cell") && data.getJSONObject("stream_cell")!=null) {
text = data.getJSONObject("concern_talk_cell").getString("packed_json_str"); text = data.getJSONObject("stream_cell").getString("raw_data");
} }else if(data.containsKey("concern_talk_cell")) {
text = data.getJSONObject("concern_talk_cell").getString("packed_json_str");
JSONObject dataJSON = JSONObject.parseObject(text);
if(dataJSON.containsKey("comment_base") && dataJSON.getJSONObject("comment_base")!=null) {
JSONObject comment_base = dataJSON.getJSONObject("comment_base");
date = new Date(comment_base.getLongValue("create_time") * 1000);
href = "https://www.toutiao.com/a" + dataJSON.getLongValue("id");
source = comment_base.getJSONObject("user").getJSONObject("info").getString("name");
content = dataJSON.getString("content");
readNum = dataJSON.getJSONObject("action").getInteger("read_count") + "";
likeNum = dataJSON.getJSONObject("action").getInteger("digg_count")+"";
commentNum = dataJSON.getJSONObject("action").getInteger("comment_count") + "";
user_id = comment_base.getJSONObject("user").getJSONObject("info").getString("user_id");
if (content != null && !"".equals(content)) {
if (content.length() < 16) {
count = content.length();
}
title = content.substring(0, count);
} }
}else { JSONObject dataJSON = JSONObject.parseObject(text);
date = new Date(dataJSON.getLongValue("create_time") * 1000); if(dataJSON.containsKey("comment_base") && dataJSON.getJSONObject("comment_base")!=null) {
href = "https://www.toutiao.com/a" + dataJSON.getString("thread_id"); JSONObject commentBase = dataJSON.getJSONObject("comment_base");
source = dataJSON.getJSONObject("user").getString("name"); date = new Date(commentBase.getLongValue("create_time") * 1000);
content = dataJSON.getString("content"); href = "https://www.toutiao.com/a" + dataJSON.getLongValue("id");
readNum = dataJSON.getInteger("read_count") + ""; source = commentBase.getJSONObject("user").getJSONObject("info").getString("name");
commentNum = dataJSON.getInteger("comment_count") + ""; content = dataJSON.getString("content");
likeNum = dataJSON.getInteger("digg_count")+""; readNum = dataJSON.getJSONObject("action").getInteger("read_count") + "";
user_id = dataJSON.getJSONObject("user").getString("user_id"); likeNum = dataJSON.getJSONObject("action").getInteger("digg_count")+"";
if (content != null && !"".equals(content)) { commentNum = dataJSON.getJSONObject("action").getInteger("comment_count") + "";
if (content.length() < 16) { userId = commentBase.getJSONObject("user").getJSONObject("info").getString("user_id");
count = content.length(); if (content != null && !"".equals(content)) {
if (content.length() < 16) {
count = content.length();
}
title = content.substring(0, count);
}
}else {
date = new Date(dataJSON.getLongValue("create_time") * 1000);
href = "https://www.toutiao.com/a" + dataJSON.getString("thread_id");
source = dataJSON.getJSONObject("user").getString("name");
content = dataJSON.getString("content");
readNum = dataJSON.getInteger("read_count") + "";
commentNum = dataJSON.getInteger("comment_count") + "";
likeNum = dataJSON.getInteger("digg_count")+"";
userId = dataJSON.getJSONObject("user").getString("user_id");
if (content != null && !"".equals(content)) {
if (content.length() < 16) {
count = content.length();
}
title = content.substring(0, count);
} }
title = content.substring(0, count);
} }
TouTiaoArticle tt = new TouTiaoArticle(href, title, userId, source, date, content, commentNum,
playNum, readNum, "0", "微头条", articleType,likeNum);
dataList.add(tt);
} catch (Exception e) {
continue;
} }
TouTiaoArticle tt = new TouTiaoArticle(href, title, user_id, source, date, content, commentNum,
playNum, readNum, "0", "微头条", articleType,likeNum);
dataList.add(tt);
} catch (Exception e) {
continue;
} }
}else {
System.out.println(json);
} }
/** 验证是否有下一页数据 **/ /** 验证是否有下一页数据 **/
if (more) { if (more) {
if (maxBehotTime != null && maxBehotTime != 0) { if (maxBehotTime != null && maxBehotTime != 0) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment