Commit 9a849364 by zhiwei
parents 06c9a6ab 53f01f3e
......@@ -103,8 +103,9 @@ public class TouTiaoAccountParse {
boolean f = true;
int page = 0;
while(f){
String url = "https://www.toutiao.com/search_content/?offset="+page*20+"&format=json&keyword="+URLCodeUtil.getURLEncode(word, "utf-8")+"&autoload=true&count=20&cur_tab=4&from=media";
String url = "https://www.toutiao.com/api/search/content/?aid=24&app_name=web_search&offset="+page*20+"&format=json&keyword="+URLCodeUtil.getURLEncode(word, "utf-8")+"&autoload=true&count=20&en_qc=1&cur_tab=4&from=media&pd=user";
headerMap = Tools.getTouTiaoHeader();
System.out.println(url);
try {
String htmlBody = downloadHtml(url, proxy, headerMap);
if(htmlBody != null){
......@@ -358,19 +359,23 @@ public class TouTiaoAccountParse {
JSONObject data = jsonArray.getJSONObject(i);
user_id = data.getLong("id");
name = data.getString("name");
media_id = data.getLong("media_id");
if(data.containsKey("media_id")) {
media_id = data.getLong("media_id");
}
description = data.getString("description");
user_verified = data.getInteger("user_verified");
verify_content = data.getString("verify_content");
follow_count = data.getInteger("follow_count");
img_url = "https:"+data.getString("avatar_url");
create_time = new Date(Integer.valueOf(data.getString("create_time")+"000"));
create_time = new Date(Long.parseLong((data.getString("create_time")+"000")));
gender = data.getString("gender");
user_type = data.getString("user_type");
tta = new TouTiaoAccount(user_id, name, media_id, description, user_verified,
verify_content, follow_count,img_url,create_time, gender, user_type);
ZhiWeiTools.sleep(1000);
if(Objects.nonNull(proxy)) {
ZhiWeiTools.sleep(1000);
}
TouTiaoAccount ttaUpdate = getTouTiaoAccountInfoByUserId(user_id+"", proxy);
if(ttaUpdate != null){
tta.setFriend_count(ttaUpdate.getFriend_count());
......
......@@ -399,7 +399,78 @@ public class TouTiaoArticleParse {
}
return null;
}
/**
 * Crawls a user's micro-headline (微头条) posts through the client feed API.
 *
 * <p>Requests {@code https://i.snssdk.com/api/feed/profile/v1/} repeatedly, starting at
 * {@code max_behot_time} and using the {@code offset} field of each response as the
 * offset of the next request. Paging stops when
 * <ul>
 *   <li>the response echoes the offset that was just requested,</li>
 *   <li>the response carries no {@code data} array, or</li>
 *   <li>several requests in a row fail (so a persistent network or parse error
 *       cannot keep the loop spinning forever).</li>
 * </ul>
 *
 * @param userId id of the account whose feed is read (sent as {@code visited_uid})
 * @param proxy proxy handed to the HTTP call
 * @param max_behot_time offset of the first page to request
 * @return one map per parsed post with the keys {@code time}, {@code href}, {@code source},
 *         {@code content}, {@code readNum}, {@code commentNum}, {@code user_id} and, when the
 *         post has an origin group, {@code title} and {@code replayUrl}; never {@code null}
 */
public static List<Map<String,Object>> getClientMicroToutiaoCrawler(String userId, ProxyHolder proxy,
        Long max_behot_time) {
    // Number of back-to-back failed requests tolerated before giving up on the crawl.
    final int maxConsecutiveFailures = 3;
    List<Map<String,Object>> dataList = new ArrayList<>();
    int consecutiveFailures = 0;
    while (true) {
        String url = "https://i.snssdk.com/api/feed/profile/v1/?visited_uid=" + userId + "&offset=" + max_behot_time;
        // Remember the offset we asked for so we can detect that the server stopped advancing.
        String requestedOffset = String.valueOf(max_behot_time);
        logger.info("客户端微头条采集地址 {}", url);
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)) {
            String result = response.body().string();
            JSONObject json = JSONObject.parseObject(result);
            if (json == null) {
                // Treated as a failed attempt: handled (and counted) by the catch below.
                throw new IllegalStateException("empty response body");
            }
            JSONArray jsonArray = json.getJSONArray("data");
            if (jsonArray == null) {
                // Without a data array there is nothing to parse and no reliable next offset.
                logger.info("客户端微头条采集结束,响应中没有 data {}", url);
                break;
            }
            // Only advance the offset once the page is known to be usable, so that a retry
            // after a failure re-requests the same page.
            max_behot_time = json.getLongValue("offset");
            consecutiveFailures = 0;
            for (int i = 0; i < jsonArray.size(); i++) {
                JSONObject data = jsonArray.getJSONObject(i);
                try {
                    JSONObject dataJSON = data.getJSONObject("content").getJSONObject("raw_data");
                    Map<String,Object> map = parseClientMicroToutiaoItem(dataJSON);
                    if (map != null) {
                        dataList.add(map);
                    }
                } catch (Exception e) {
                    // A malformed entry must not abort the rest of the page.
                    logger.warn("客户端微头条单条数据解析错误 {}", data, e);
                }
            }
            logger.info(" 采集到 条 == {} -- {} -- {}", dataList.size(), requestedOffset, max_behot_time);
            if (requestedOffset.equals(String.valueOf(max_behot_time))) {
                break;
            }
        } catch (Exception e) {
            consecutiveFailures++;
            // Throwable passed as the trailing argument so the stack trace is logged.
            logger.error("客户端微头条采集错误 {}", url, e);
            if (consecutiveFailures >= maxConsecutiveFailures) {
                break;
            }
        }
    }
    return dataList;
}

/**
 * Converts the {@code raw_data} object of one feed entry into a result map.
 *
 * @param dataJSON the {@code raw_data} object of a feed entry
 * @return the populated map, or {@code null} when the entry has no {@code comment_base}
 */
private static Map<String,Object> parseClientMicroToutiaoItem(JSONObject dataJSON) {
    if (dataJSON == null) {
        return null;
    }
    JSONObject commentBase = dataJSON.getJSONObject("comment_base");
    if (commentBase == null) {
        return null;
    }
    JSONObject userInfo = commentBase.getJSONObject("user").getJSONObject("info");
    JSONObject action = commentBase.getJSONObject("action");

    Map<String,Object> map = new HashMap<>();
    // create_time is multiplied by 1000 before being handed to Date (which takes milliseconds).
    map.put("time", new Date(commentBase.getLongValue("create_time") * 1000));
    map.put("href", "http://weitoutiao.zjurl.cn/ugc/share/wap/comment/" + dataJSON.getLongValue("id"));
    map.put("source", userInfo.getString("name"));
    map.put("content", commentBase.getString("content"));
    // Counts are stored as strings; a missing count becomes the text "null".
    map.put("readNum", action.getInteger("read_count") + "");
    map.put("commentNum", action.getInteger("comment_count") + "");
    map.put("user_id", userInfo.getString("user_id"));

    // origin_group may be absent or explicitly null; in both cases the post is kept
    // without title/replayUrl.
    JSONObject originGroup = dataJSON.getJSONObject("origin_group");
    if (originGroup != null) {
        map.put("title", originGroup.getString("title"));
        map.put("replayUrl", originGroup.getString("article_url"));
    }
    return map;
}
/**
* @Title: parseHtmlByMicroAccount
* @author hero
......
......@@ -6,6 +6,7 @@ import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
......@@ -72,11 +73,16 @@ public class TouTiaoCommentParse {
{
List<TouTiaoComment> commentes = analySisComment(htmlBody, url);
ttList.addAll(commentes);
logger.info(" url {} 采集到第 {} 页 采集到 {} 条数据 ",url,page,ttList.size());
}else
{
logger.info("采集出现问题,地址为:{}", url);
}
ZhiWeiTools.sleep(4000);
if(Objects.nonNull(proxy)) {
ZhiWeiTools.sleep(100);
}else {
ZhiWeiTools.sleep(4000);
}
break;
} catch (Exception e) {
continue;
......@@ -97,7 +103,7 @@ public class TouTiaoCommentParse {
*/
private static List<TouTiaoComment> analySisComment(String htmlBody,String url)
{
List<TouTiaoComment> list = new ArrayList<TouTiaoComment>();
List<TouTiaoComment> list = new ArrayList<>();
try {
JSONObject json = JSONObject.parseObject(htmlBody);
JSONArray commentes = json.getJSONArray("data");
......@@ -118,9 +124,7 @@ public class TouTiaoCommentParse {
list.add(ttComment);
}
} catch (Exception e) {
e.printStackTrace();
logger.debug("解析今日头条评论列表出现为题,{}",e.getMessage());
return null;
logger.debug("解析今日头条评论列表出现为题,{}",e);
}
return list;
}
......@@ -148,12 +152,10 @@ public class TouTiaoCommentParse {
return (int)Math.ceil((double)count/20.0);
} catch (Exception e) {
e.printStackTrace();
logger.info("获取评论总页数时出现问题:{}",e.getMessage());
return 0;
logger.info("获取评论总页数时出现问题:{}",e);
}
}
return 0;
return -1;
}
......@@ -218,7 +220,7 @@ public class TouTiaoCommentParse {
} catch (Exception e) {
logger.error("解析头条评论数错误:::{}", e.fillInStackTrace());
}
return 0;
return -1;
}
/**
......@@ -243,19 +245,17 @@ public class TouTiaoCommentParse {
try {
JSONObject json = JSONObject.parseObject(htmlBody);
JSONObject data = json.getJSONObject("data");
int count = data.getIntValue("total");
return data.getIntValue("total");
return count;
} catch (Exception e) {
e.printStackTrace();
logger.info("获取评论总页数时出现问题:{}",e.getMessage());
logger.info("获取评论总页数时出现问题:{}",e);
}
}
} catch (Exception e) {
continue;
}
}
return 0;
return -1;
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment