Commit 5f4ff8d3 by zhiwei

360网页采集

parent 491f1e25
...@@ -215,7 +215,7 @@ public class SoCrawlerParse extends HttpClientTemplateOK { ...@@ -215,7 +215,7 @@ public class SoCrawlerParse extends HttpClientTemplateOK {
String time = dataJson.getJSONObject("data").getJSONArray("question").getJSONObject(0).getString("create_time"); String time = dataJson.getJSONObject("data").getJSONArray("question").getJSONObject(0).getString("create_time");
String source = dataJson.getJSONObject("data").getJSONArray("question").getJSONObject(0).getJSONObject("user").getString("uname"); String source = dataJson.getJSONObject("data").getJSONArray("question").getJSONObject(0).getJSONObject("user").getString("uname");
String user_id = dataJson.getJSONObject("data").getJSONArray("question").getJSONObject(0).getJSONObject("user").getString("user_id"); String user_id = dataJson.getJSONObject("data").getJSONArray("question").getJSONObject(0).getJSONObject("user").getString("user_id");
String link = "https://www.toutiao.com/a"+dataJson.getJSONObject("data").getJSONArray("question").getJSONObject(0).getString("qid")+"/"; String link = "https://www.toutiao.com/a"+dataJson.getJSONObject("data").getJSONArray("question").getJSONObject(0).getString("qid");
return new NewsData(link, title, source, time, content, "头条问答", word, user_id); return new NewsData(link, title, source, time, content, "头条问答", word, user_id);
}else if(htmlBody.contains("var BASE_DATA = ")){ }else if(htmlBody.contains("var BASE_DATA = ")){
...@@ -226,7 +226,7 @@ public class SoCrawlerParse extends HttpClientTemplateOK { ...@@ -226,7 +226,7 @@ public class SoCrawlerParse extends HttpClientTemplateOK {
String time = dataJson.getJSONObject("articleInfo").getJSONObject("subInfo").getString("time"); String time = dataJson.getJSONObject("articleInfo").getJSONObject("subInfo").getString("time");
String source = dataJson.getJSONObject("mediaInfo").getString("name"); String source = dataJson.getJSONObject("mediaInfo").getString("name");
String user_id = dataJson.getJSONObject("mediaInfo").getString("uid"); String user_id = dataJson.getJSONObject("mediaInfo").getString("uid");
String link = "https://www.toutiao.com/a"+dataJson.getJSONObject("articleInfo").getString("groupId")+"/"; String link = "https://www.toutiao.com/a"+dataJson.getJSONObject("articleInfo").getString("groupId");
return new NewsData(link, title, source, time, content, "今日头条", word, user_id); return new NewsData(link, title, source, time, content, "今日头条", word, user_id);
}else if(htmlBody.contains("404错误页")){ }else if(htmlBody.contains("404错误页")){
logger.info("{}:::数据有问题,该文章已被删除}", url); logger.info("{}:::数据有问题,该文章已被删除}", url);
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment