Commit 314e5609 by zhiwei

今日头条采集程序初次提交

parents
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>toutiao</artifactId>
<version>0.2.2-SNAPSHOT</version>
<dependencies>
<dependency>
<groupId>com.zhiwei</groupId>
<artifactId>zhiweiTools</artifactId>
<version>0.0.6-SNAPSHOT</version>
</dependency>
</dependencies>
<!-- Build / packaging configuration -->
<build>
<plugins>
<!-- Publish sources: runs the source plugin's jar goal and attaches the result to the build -->
<plugin>
<artifactId>maven-source-plugin</artifactId>
<version>2.4</version>
<configuration>
<attach>true</attach>
</configuration>
<executions>
<execution>
<phase>compile</phase>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>2.10.4</version>
</plugin>
<!-- Works around garbled Chinese console output when running "mvn test": forks the test JVM with UTF-8 as file.encoding -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.7.2</version>
<configuration>
<forkMode>once</forkMode>
<argLine>-Dfile.encoding=UTF-8</argLine>
</configuration>
</plugin>
</plugins>
</build>
<!-- Distribution management: repositories that snapshot and release artifacts are deployed to -->
<distributionManagement>
<snapshotRepository>
<!-- NOTE(review): this id is identical to the release repository id below; confirm it matches the intended server entry in settings.xml -->
<id>nexus-releases</id>
<name>User Porject Snapshot</name>
<url>http://192.168.0.30:8081/nexus/content/repositories/snapshots/</url >
<uniqueVersion>true</uniqueVersion>
</snapshotRepository>
<repository>
<id>nexus-releases</id>
<name>User Porject Release</name>
<url>http://192.168.0.30:8081/nexus/content/repositories/releases/</url
>
</repository>
</distributionManagement>
</project>
\ No newline at end of file
package com.zhiwei.toutiao.bean;
import java.io.Serializable;
import java.util.Date;
/**
* @ClassName: TouTiaoAccount
* @Description: TODO(头条帐号信息)
* @author hero
* @date 2017年10月17日 下午2:50:46
*/
public class TouTiaoAccount implements Serializable{
private static final long serialVersionUID = -7447778477165461146L;
public String id; //主键 帐号id
public Long user_id; //帐号id
public String name; //帐号昵称
public Long media_id; //未知
public String description; //描述
public Integer user_verified; //是否认证 (0,不是;1 是)
public String verify_content; //认证原因
public Integer follow_count; //粉丝数
public Integer friend_count; //关注数
public String img_url; //头像地址
public Date create_time; //账号注册时间
public String gender; //性别
public String user_type; //用户类型
@Override
public String toString(){
return "new TouTiaoAccount["
+ "id = " + id
+ ", user_id = " + user_id
+ ", name = " + name
+ ", media_id = " + media_id
+ ", description = " + description
+ ", user_verified = " + user_verified
+ ", verify_content = " + verify_content
+ ", follow_count = " + follow_count
+ ", friend_count = " + friend_count
+ ", img_url = " + img_url
+ ", create_time = " + create_time
+ ", gender = " + gender
+ ", user_type = " + user_type
+ "]";
}
public TouTiaoAccount(){}
public TouTiaoAccount(Long user_id,String name, Long media_id, String description,Integer user_verified
,String verify_content, Integer follow_count, String img_url, Date create_time, String gender
,String user_type){
this.id = user_id+"";
this.user_id = user_id;
this.name = name;
this.media_id = media_id;
this.description = description;
this.user_verified = user_verified;
this.verify_content = verify_content;
this.follow_count = follow_count;
this.img_url = img_url;
this.create_time = create_time;
this. gender = gender;
this.user_type = user_type;
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public Long getUser_id() {
return user_id;
}
public void setUser_id(Long user_id) {
this.user_id = user_id;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public Long getMedia_id() {
return media_id;
}
public void setMedia_id(Long media_id) {
this.media_id = media_id;
}
public String getDescription() {
return description;
}
public void setDescription(String description) {
this.description = description;
}
public Integer getUser_verified() {
return user_verified;
}
public void setUser_verified(Integer user_verified) {
this.user_verified = user_verified;
}
public Integer getFollow_count() {
return follow_count;
}
public void setFollow_count(Integer follow_count) {
this.follow_count = follow_count;
}
public String getVerify_content() {
return verify_content;
}
public void setVerify_content(String verify_content) {
this.verify_content = verify_content;
}
public String getImg_url() {
return img_url;
}
public void setImg_url(String img_url) {
this.img_url = img_url;
}
public Date getCreate_time() {
return create_time;
}
public void setCreate_time(Date create_time) {
this.create_time = create_time;
}
public String getGender() {
return gender;
}
public int getFriend_count() {
return friend_count;
}
public void setFriend_count(int friend_count) {
this.friend_count = friend_count;
}
public void setGender(String gender) {
this.gender = gender;
}
public String getUser_type() {
return user_type;
}
public void setUser_type(String user_type) {
this.user_type = user_type;
}
}
/**
* @Title: TouTiaoArticle.java
* @Package com.zhiwei.toutiao.bean
* @Description: Bean holding one crawled Toutiao article.
* @author hero
* @date 2016-09-02 08:47:13
* @version V1.0
*/
package com.zhiwei.toutiao.bean;
import java.io.Serializable;
import java.util.Date;
/**
* @Description:
* @author hero
* @date 2016年9月2日 上午8:47:13
*/
public class TouTiaoArticle implements Serializable{
private static final long serialVersionUID = 7745861002592578553L;
private String url;
private String title;
private String type;
private String source;
private String user_id;
private Date time;
private String content;
private String commentCount;
private String playCount;
private String readNum;
public String getCommentCount() {
return commentCount;
}
public void setCommentCount(String commentCount) {
this.commentCount = commentCount;
}
public String getPlayCount() {
return playCount;
}
public void setPlayCount(String playCount) {
this.playCount = playCount;
}
public String getReadNum() {
return readNum;
}
public void setReadNum(String readNum) {
this.readNum = readNum;
}
public String getUrl() {
return url;
}
public void setUrl(String url) {
this.url = url;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public String getSource() {
return source;
}
public void setSource(String source) {
this.source = source;
}
public Date getTime() {
return time;
}
public void setTime(Date time) {
this.time = time;
}
public String getContent() {
return content;
}
public void setContent(String content) {
this.content = content;
}
public TouTiaoArticle(){}
public String getUser_id() {
return user_id;
}
public void setUser_id(String user_id) {
this.user_id = user_id;
}
public String getType() {
return type;
}
public void setType(String type) {
this.type = type;
}
public TouTiaoArticle(String url,String title,String user_id,
String source,Date time,String content,String commentCount,
String playCount,String readNum,String type)
{
this.url = url ;
this.title = title;
this.type = type;
this.source = source;
this.user_id = user_id;
this.time = time;
this.content = content;
this.readNum = readNum;
this.playCount = playCount;
this.commentCount = commentCount;
}
public String toString()
{
return "new TouTiaoArticle["
+ "url = " + url
+ ", title = " + title
+ ", type = " + type
+ ", source = " + source
+ ", user_id = " + user_id
+ ", time = " + time
+ ", content = " + content
+ ", commentCount = " + commentCount
+ ", playCount = " + playCount
+ ", readNum = " + readNum
+ "]";
}
}
package com.zhiwei.toutiao.bean;
import java.io.Serializable;
import java.util.Date;
/**
 * One comment attached to a Toutiao page.
 *
 * <p>A no-argument constructor is provided, as on the other beans of this package, so the bean
 * can be created empty and filled through its setters.
 */
public class TouTiaoComment implements Serializable {
    private static final long serialVersionUID = -1536817427402243392L;

    private String id;
    private String text;
    private String userName;
    private Integer reply_count;
    private Integer digg_count;
    private Date time;
    private String source_url;

    /** Creates a comment with every field unset. */
    public TouTiaoComment() {}

    /** Creates a fully populated comment. */
    public TouTiaoComment(String id, String text, String userName,
            Integer reply_count, Integer digg_count, Date time, String source_url) {
        this.id = id;
        this.text = text;
        this.userName = userName;
        this.reply_count = reply_count;
        this.digg_count = digg_count;
        this.time = time;
        this.source_url = source_url;
    }

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    public String getText() {
        return text;
    }

    public void setText(String text) {
        this.text = text;
    }

    public String getUserName() {
        return userName;
    }

    public void setUserName(String userName) {
        this.userName = userName;
    }

    public Integer getReply_count() {
        return reply_count;
    }

    public void setReply_count(Integer reply_count) {
        this.reply_count = reply_count;
    }

    public Integer getDigg_count() {
        return digg_count;
    }

    public void setDigg_count(Integer digg_count) {
        this.digg_count = digg_count;
    }

    public Date getTime() {
        return time;
    }

    public void setTime(Date time) {
        this.time = time;
    }

    public String getSource_url() {
        return source_url;
    }

    public void setSource_url(String source_url) {
        this.source_url = source_url;
    }

    @Override
    public String toString() {
        return "new TouTiaoComment["
                + "id = " + id
                + ", text = " + text
                + ", userName = " + userName
                + ", reply_count = " + reply_count
                + ", digg_count = " + digg_count
                + ", time = " + time
                + ", source_url = " + source_url
                + "]";
    }
}
package com.zhiwei.toutiao.bean;
import java.io.Serializable;
import java.util.Date;
/**
 * One question crawled from Toutiao Q&amp;A (also known as Wukong Q&amp;A).
 *
 * <p>A no-argument constructor is provided, as on the other beans of this package, so the bean
 * can be created empty and filled through its setters.
 *
 * @author hero
 * @date 2017-07-20 11:23:24
 */
public class TouTiaoQuestion implements Serializable {
    private static final long serialVersionUID = 7743044965507540483L;

    private String url;                // question link
    private String title;              // title
    private String content;            // question body
    private String source;             // publisher
    private Date time;                 // publish time
    private Integer follow_count;      // number of followers of the question
    private Integer nice_ans_count;    // number of featured answers
    private Integer normal_ans_count;  // number of regular answers
    private Integer ans_count;         // total number of answers

    /** Creates a question with every field unset. */
    public TouTiaoQuestion() {}

    /**
     * Creates a fully populated question.
     *
     * <p>Note the parameter order: {@code source} precedes {@code content}.
     */
    public TouTiaoQuestion(String url, String title, String source, String content, Date time,
            int follow_count, int nice_ans_count, int normal_ans_count, int ans_count) {
        this.url = url;
        this.title = title;
        this.content = content;
        this.source = source;
        this.time = time;
        this.follow_count = follow_count;
        this.nice_ans_count = nice_ans_count;
        this.normal_ans_count = normal_ans_count;
        this.ans_count = ans_count;
    }

    @Override
    public String toString() {
        return "new TouTiaoQuestion["
                + "url = " + url
                + ", title = " + title
                + ", content = " + content
                + ", source = " + source
                + ", time = " + time
                + ", follow_count = " + follow_count
                + ", nice_ans_count = " + nice_ans_count
                + ", normal_ans_count = " + normal_ans_count
                + ", ans_count = " + ans_count
                + "]";
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public String getSource() {
        return source;
    }

    public void setSource(String source) {
        this.source = source;
    }

    public Date getTime() {
        return time;
    }

    public void setTime(Date time) {
        this.time = time;
    }

    public Integer getFollow_count() {
        return follow_count;
    }

    public void setFollow_count(Integer follow_count) {
        this.follow_count = follow_count;
    }

    public Integer getNice_ans_count() {
        return nice_ans_count;
    }

    public void setNice_ans_count(Integer nice_ans_count) {
        this.nice_ans_count = nice_ans_count;
    }

    public Integer getNormal_ans_count() {
        return normal_ans_count;
    }

    public void setNormal_ans_count(Integer normal_ans_count) {
        this.normal_ans_count = normal_ans_count;
    }

    public Integer getAns_count() {
        return ans_count;
    }

    public void setAns_count(Integer ans_count) {
        this.ans_count = ans_count;
    }
}
package com.zhiwei.toutiao.bean;
import java.io.Serializable;
import java.util.Date;
/**
* @ClassName: TouTiaoQuestionAnswer
* @Description: TODO(头条问答的回答)
* @author hero
* @date 2017年7月28日 下午6:13:10
*/
public class TouTiaoQuestionAnswer implements Serializable{
private static final long serialVersionUID = -252511004299401886L;
private String id; //回答id,主键
private String questionId; //问题id
private String content; //回答内容
private String user_id; //用户昵称
private String username; //用户昵称
private Date time; //回答时间
private int comment_count; //评论数
private int digg_count; //点赞数
public TouTiaoQuestionAnswer(){}
public TouTiaoQuestionAnswer(String id,String questionId,String content, String user_id,String username,Date time
,int comment_count,int digg_count){
this.id = id;
this.questionId = questionId;
this.content = content;
this.user_id = user_id;
this.username = username;
this.time = time;
this.comment_count = comment_count;
this.digg_count = digg_count;
}
@Override
public String toString(){
return "new TouTiaoQuestionAnswer["
+ "id = " + id
+ ", questionId = " + questionId
+ ", content = " + content
+ ", user_id = " + user_id
+ ", username = " + username
+ ", time = " + time
+ ", comment_count = " + comment_count
+ ", digg_count = " + digg_count
+ "]";
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public String getQuestionId() {
return questionId;
}
public void setQuestionId(String questionId) {
this.questionId = questionId;
}
public String getContent() {
return content;
}
public void setContent(String content) {
this.content = content;
}
public String getUsername() {
return username;
}
public void setUsername(String username) {
this.username = username;
}
public Date getTime() {
return time;
}
public void setTime(Date time) {
this.time = time;
}
public int getComment_count() {
return comment_count;
}
public void setComment_count(int comment_count) {
this.comment_count = comment_count;
}
public int getDigg_count() {
return digg_count;
}
public void setDigg_count(int digg_count) {
this.digg_count = digg_count;
}
public String getUser_id() {
return user_id;
}
public void setUser_id(String user_id) {
this.user_id = user_id;
}
}
package com.zhiwei.toutiao.parse;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.toutiao.bean.TouTiaoAccount;
import com.zhiwei.toutiao.util.Tools;
import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK;
import com.zhiwei.zhiweiTools.tools.URLCodeUtil;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
/**
* @ClassName: TouTiaoAccountParse
* @Description: Crawls and parses Toutiao account information.
* @author hero
* @date 2017-10-17 15:36:54
*/
public class TouTiaoAccountParse {
private static Map<String, String> headerMap;
private static Logger logger = LoggerFactory.getLogger(TouTiaoAccountParse.class);
/**
 * Looks up the Toutiao account whose name equals {@code name}.
 *
 * <p>The search endpoint is queried with {@code cur_tab=1} first. When that yields no matching
 * account (no usable response, or no entry with that exact name), the same search is repeated
 * with {@code cur_tab=4&from=media} and a {@code Referer} header.
 *
 * @param name  account name to search for
 * @param proxy proxy handed to the HTTP client
 * @return the matching account, or {@code null} when none was found or a request failed
 */
public static TouTiaoAccount getTouTiaoAccountInfoByName(String name, Proxy proxy){
    String keyword = URLCodeUtil.getURLEncode(name, "utf-8");
    String searchUrl = "https://www.toutiao.com/search_content/?offset=0&format=json&keyword=" + keyword
            + "&autoload=true&count=20";
    // Request-local headers: the helper methods called below issue their own requests, so a
    // shared static map could be replaced between the first request and the fallback.
    Map<String, String> headers = Tools.getTouTiaoHeader();
    try {
        TouTiaoAccount tta = null;
        String htmlBody = HttpClientTemplateOK.get(searchUrl + "&cur_tab=1", proxy, headers);
        if (htmlBody != null && htmlBody.contains("media_id")) {
            tta = parseHtmlByAccount(htmlBody, name, proxy);
        }
        if (tta == null) {
            headers.put("Referer", "https://www.toutiao.com/search/?keyword=" + keyword);
            htmlBody = HttpClientTemplateOK.get(searchUrl + "&cur_tab=4&from=media", proxy, headers);
            if (htmlBody != null && htmlBody.contains("media_id")) {
                tta = parseHtmlByAccount(htmlBody, name, proxy);
            }
        }
        return tta;
    } catch (Exception e) {
        // Log the exception itself: fillInStackTrace() would overwrite the original trace with
        // this catch site and hide where the failure really happened.
        logger.error("获取今日头条帐号数据连接超时", e);
        return null;
    }
}
/**
 * Fetches the profile page of a user and extracts the account data embedded in it.
 *
 * @param user_id user id, inserted into the profile URL as-is
 * @param proxy   proxy handed to the HTTP client
 * @return the parsed account, or {@code null} when the page carries no {@code mediaId}
 *         marker, cannot be parsed, or the request failed
 */
public static TouTiaoAccount getTouTiaoAccountInfoByUserId(String user_id, Proxy proxy){
    String url = "https://www.toutiao.com/c/user/" + user_id + "/";
    // Request-local headers instead of the shared static map.
    Map<String, String> headers = Tools.getTouTiaoHeader();
    try {
        String htmlBody = HttpClientTemplateOK.get(url, proxy, headers);
        if (htmlBody == null || !htmlBody.contains("mediaId")) {
            return null;
        }
        return parseAccountByUserId(htmlBody, user_id);
    } catch (Exception e) {
        // Log the exception itself: fillInStackTrace() would replace the original trace with
        // this catch site.
        logger.error("获取今日头条帐号数据连接超时", e);
        return null;
    }
}
/**
 * Searches Toutiao accounts by keyword and collects every result page.
 *
 * <p>Pages of 20 entries are requested until the response reports {@code has_more == 0},
 * a response is missing or carries no {@code media_id} entry, a page cannot be parsed, or a
 * request fails.
 *
 * @param word  search keyword
 * @param proxy proxy handed to the HTTP client
 * @return the accounts collected so far; never {@code null}, possibly empty
 */
public static List<TouTiaoAccount> getTouTiaoAccountInfoByWord(String word, Proxy proxy){
    List<TouTiaoAccount> list = new ArrayList<TouTiaoAccount>();
    boolean hasMore = true;
    for (int page = 0; hasMore; page++) {
        String url = "https://www.toutiao.com/search_content/?offset=" + page * 20 + "&format=json&keyword="
                + URLCodeUtil.getURLEncode(word, "utf-8") + "&autoload=true&count=20&cur_tab=4&from=media";
        // Request-local headers instead of the shared static map.
        Map<String, String> headers = Tools.getTouTiaoHeader();
        try {
            String htmlBody = HttpClientTemplateOK.get(url, proxy, headers);
            if (htmlBody == null || !htmlBody.contains("media_id")) {
                // No usable result page: stop. Without this the loop would keep requesting
                // the next offset forever, because nothing else clears the flag.
                break;
            }
            JSONObject json = JSONObject.parseObject(htmlBody);
            List<TouTiaoAccount> parsed = parseHtmlByWord(json, proxy);
            if (parsed == null) {
                // The page could not be parsed at all; give up instead of failing on addAll(null).
                break;
            }
            list.addAll(parsed);
            hasMore = json.getIntValue("has_more") != 0;
        } catch (Exception e) {
            // Log the exception itself: fillInStackTrace() would replace the original trace.
            logger.error("获取今日头条帐号数据连接超时", e);
            hasMore = false;
        }
    }
    return list;
}
/**
 * Collects the accounts that a user follows, 50 per request.
 *
 * <p>Paging stops when the response reports {@code has_more == false}, when a page yields no
 * accounts, or when the response does not contain {@code "name"}.
 *
 * @param userid id of the user whose following list is read
 * @param proxy  proxy handed to the HTTP client
 * @param sleep  pause passed to {@code ZhiWeiTools.sleep} after every page
 *               (presumably milliseconds; confirm against that helper)
 * @return the collected accounts (possibly empty), or {@code null} when a request or the
 *         parsing of a response failed; accounts collected before the failure are discarded
 */
public static List<TouTiaoAccount> getFriendsList(String userid,Proxy proxy,long sleep){
    List<TouTiaoAccount> ttaList = new ArrayList<TouTiaoAccount>();
    boolean more = true;
    for (int page = 0; more; page++) {
        String url = "http://is.snssdk.com/user/following/?offset=" + page * 50
                + "&device_id=35330393347&count=50&user_id=" + userid
                + "&ts=" + System.currentTimeMillis() / 1000;
        // Request-local headers instead of the shared static map.
        Map<String, String> headers = Tools.getTouTiaoHeader();
        headers.put("Host", "is.snssdk.com");
        try {
            String htmlBody = HttpClientTemplateOK.get(url, proxy, headers);
            more = false;
            if (htmlBody != null && htmlBody.contains("name")) {
                JSONObject json = JSONObject.parseObject(htmlBody);
                boolean serverHasMore = json.getJSONObject("data").getBooleanValue("has_more");
                List<TouTiaoAccount> dataList = parseHtmlByFans(json);
                if (dataList != null && !dataList.isEmpty()) {
                    ttaList.addAll(dataList);
                    // Only continue when this page delivered data and the server has more.
                    more = serverHasMore;
                }
            }
            ZhiWeiTools.sleep(sleep);
        } catch (Exception e) {
            // Log the exception itself: fillInStackTrace() would replace the original trace.
            logger.error("获取今日头条帐号数据连接超时", e);
            return null;
        }
    }
    return ttaList;
}
/**
 * Finds the account named exactly {@code word} in a search response.
 *
 * <p>Only entries carrying a {@code media_id} key are considered. The first entry whose
 * {@code name} equals {@code word} is converted; after a one-second pause its following count
 * and user type are refreshed from the user's profile page.
 *
 * @param htmlBody JSON text of the search response
 * @param word     account name that must match exactly
 * @param proxy    proxy used for the profile-page request
 * @return the matching account, or {@code null} when there is none or the response cannot be
 *         parsed
 */
private static TouTiaoAccount parseHtmlByAccount(String htmlBody, String word, Proxy proxy) {
    try {
        JSONArray jsonArray = JSONObject.parseObject(htmlBody).getJSONArray("data");
        for (int i = 0; i < jsonArray.size(); i++) {
            try {
                JSONObject data = jsonArray.getJSONObject(i);
                if (!data.containsKey("media_id")) {
                    continue;
                }
                // Filter on the name before touching the other fields, so entries that are
                // not wanted anyway cannot raise (and log) parse errors. A missing name is a
                // non-match rather than a NullPointerException.
                String name = data.getString("name");
                if (name == null || !name.equals(word)) {
                    continue;
                }
                long user_id = data.getLong("id");
                long media_id = data.getLong("media_id");
                String description = data.getString("description");
                int user_verified = data.getInteger("user_verified");
                String verify_content = data.getString("verify_content");
                int follow_count = data.getInteger("follow_count");
                String img_url = "https:" + data.getString("avatar_url");
                // create_time is multiplied by 1000 to obtain the millisecond value Date expects.
                Date create_time = new Date(Long.parseLong(data.getString("create_time")) * 1000);
                String gender = data.getString("gender");
                String user_type = data.getString("user_type");
                TouTiaoAccount tta = new TouTiaoAccount(user_id, name, media_id, description, user_verified,
                        verify_content, follow_count, img_url, create_time, gender, user_type);
                ZhiWeiTools.sleep(1000);
                TouTiaoAccount ttaUpdate = getTouTiaoAccountInfoByUserId(user_id + "", proxy);
                if (ttaUpdate != null) {
                    tta.setFriend_count(ttaUpdate.getFriend_count());
                    tta.setUser_type(ttaUpdate.getUser_type());
                }
                return tta;
            } catch (Exception e) {
                // The throwable overload is used, so no "{}" placeholder belongs in the message.
                logger.error("数据解析出现问题", e);
            }
        }
    } catch (Exception e) {
        logger.error("数据解析出现问题", e);
    }
    return null;
}
/**
 * Extracts account data from the script section of a user's profile page.
 *
 * <p>The page must contain {@code var header={}; name, avatar and type are cut out of the
 * page text with string splits. The following / follower counts are read only when the page
 * contains {@code guanzhu}.
 *
 * @param htmlBody profile page text
 * @param user_id  user id, stored as the account's {@code id}
 * @return an account carrying id, name, avatar URL, user type and (when present) the two
 *         counts; {@code null} when the marker is missing or any extraction step fails
 */
private static TouTiaoAccount parseAccountByUserId(String htmlBody,String user_id) {
    try {
        if (!htmlBody.contains("var header={")) {
            return null;
        }
        TouTiaoAccount account = new TouTiaoAccount();
        String headerSection = htmlBody.split("var header")[1];
        String nickname = headerSection.split("name:'")[1].split("',")[0];
        String avatarUrl = "https:" + htmlBody.split("avtar_img:'")[1].split("',")[0];
        String userType = htmlBody.split("type: '")[1].split("'")[0];
        if (htmlBody.contains("guanzhu")) {
            // guanzhu = accounts followed, fensi = followers
            int following = Integer.parseInt(htmlBody.split("guanzhu:'")[1].split("',")[0]);
            int followers = Integer.parseInt(htmlBody.split("fensi:'")[1].split("',")[0]);
            account.setFriend_count(following);
            account.setFollow_count(followers);
        }
        account.setId(user_id);
        account.setName(nickname);
        account.setImg_url(avatarUrl);
        account.setUser_type(userType);
        return account;
    } catch (Exception e) {
        logger.error("数据解析出现问题,{}", e);
        return null;
    }
}
/**
 * Converts every account entry of a keyword-search response.
 *
 * <p>Only entries carrying a {@code media_id} key are converted. For each one, after a
 * one-second pause, the following count and user type are refreshed from the user's profile
 * page. Entries that cannot be converted are logged and skipped.
 *
 * @param json  parsed search response
 * @param proxy proxy used for the profile-page requests
 * @return the converted accounts (possibly empty), or {@code null} when the response as a
 *         whole cannot be read
 */
private static List<TouTiaoAccount> parseHtmlByWord(JSONObject json, Proxy proxy) {
    List<TouTiaoAccount> ttaList = new ArrayList<TouTiaoAccount>();
    try {
        JSONArray jsonArray = json.getJSONArray("data");
        for (int i = 0; i < jsonArray.size(); i++) {
            try {
                JSONObject data = jsonArray.getJSONObject(i);
                if (!data.containsKey("media_id")) {
                    continue;
                }
                long user_id = data.getLong("id");
                String name = data.getString("name");
                long media_id = data.getLong("media_id");
                String description = data.getString("description");
                int user_verified = data.getInteger("user_verified");
                String verify_content = data.getString("verify_content");
                int follow_count = data.getInteger("follow_count");
                String img_url = "https:" + data.getString("avatar_url");
                // Parse as long and scale to milliseconds. Appending "000" and parsing the
                // result as an Integer overflows int for any present-day timestamp, which made
                // every entry fail and the result list stay empty.
                Date create_time = new Date(Long.parseLong(data.getString("create_time")) * 1000L);
                String gender = data.getString("gender");
                String user_type = data.getString("user_type");
                TouTiaoAccount tta = new TouTiaoAccount(user_id, name, media_id, description, user_verified,
                        verify_content, follow_count, img_url, create_time, gender, user_type);
                ZhiWeiTools.sleep(1000);
                TouTiaoAccount ttaUpdate = getTouTiaoAccountInfoByUserId(user_id + "", proxy);
                if (ttaUpdate != null) {
                    tta.setFriend_count(ttaUpdate.getFriend_count());
                    tta.setUser_type(ttaUpdate.getUser_type());
                }
                ttaList.add(tta);
            } catch (Exception e) {
                // The throwable overload is used, so no "{}" placeholder belongs in the message.
                logger.error("数据解析出现问题", e);
            }
        }
    } catch (Exception e) {
        logger.error("数据解析出现问题", e);
        return null;
    }
    return ttaList;
}
/**
 * Converts the {@code data.users} array of a user-list response into accounts.
 *
 * <p>Only id, name, description, verification text, avatar URL and the verified flag are
 * available in this response; the follower count is passed as {@code 0} and the remaining
 * constructor arguments as {@code null}. Entries that cannot be converted are logged and
 * skipped.
 *
 * @param json parsed response
 * @return the converted accounts (possibly empty), or {@code null} when the response as a
 *         whole cannot be read
 */
private static List<TouTiaoAccount> parseHtmlByFans(JSONObject json) {
    List<TouTiaoAccount> ttaList = new ArrayList<TouTiaoAccount>();
    try {
        JSONArray jsonArray = json.getJSONObject("data").getJSONArray("users");
        for (int i = 0; i < jsonArray.size(); i++) {
            try {
                JSONObject data = jsonArray.getJSONObject(i);
                Long user_id = data.getLong("user_id");
                String name = data.getString("name");
                String description = data.getString("description");
                String verify_content = data.getString("verified_content");
                String img_url = data.getString("avatar_url");
                // The account bean defines 1 = verified, 0 = not verified; the previous
                // mapping was inverted. A missing flag counts as "not verified" instead of
                // raising a NullPointerException on unboxing.
                Integer user_verified = Boolean.TRUE.equals(data.getBoolean("user_verified")) ? 1 : 0;
                ttaList.add(new TouTiaoAccount(user_id, name, null, description, user_verified,
                        verify_content, 0, img_url, null, null, null));
            } catch (Exception e) {
                // The throwable overload is used, so no "{}" placeholder belongs in the message.
                logger.error("数据解析出现问题", e);
            }
        }
    } catch (Exception e) {
        logger.error("数据解析出现问题", e);
        return null;
    }
    return ttaList;
}
}
/**
* @Title: TouTiaoArticleParse.java
* @Package com.zhiwei.toutiao.parse
* @Description: Crawling and parsing of Toutiao articles.
* @author hero
* @date 2016-09-02 11:17:44
* @version V1.0
*/
package com.zhiwei.toutiao.parse;
import java.io.IOException;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.toutiao.bean.TouTiaoArticle;
import com.zhiwei.toutiao.util.Tools;
import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
/**
* @Description: Crawls the article list of a Toutiao media account and the micro-headline (微头条) posts of a user.
* @author hero
* @date 2016-09-02 11:17:44
*/
public class TouTiaoArticleParse {
private static Map<String, String> headerMap;
private static Logger logger = LoggerFactory.getLogger(TouTiaoArticleParse.class);
/**
 * Fetches one page (20 entries) of a media account's article list.
 *
 * @param media_id       media id of the account
 * @param max_behot_time paging cursor returned by the previous page, or {@code null} for the
 *                       first page
 * @param endData        cut-off date handed to the parser
 * @param proxy          proxy handed to the HTTP client
 * @return a map with the keys {@code "data"} (the parsed articles) and
 *         {@code "max_behot_time"} (cursor of the next page, {@code null} when paging should
 *         stop); {@code null} when the response carries no {@code behot_time} or cannot be
 *         parsed
 * @throws Exception when the request fails; the failure is logged and rethrown
 */
public static Map<String, Object> getTouTiaoList(String media_id, String max_behot_time,Date endData, Proxy proxy ) throws Exception{
    // Take "as" and "cp" from ONE getAS() result. Two separate calls could pair the "as" of
    // one signature with the "cp" of another if the helper's output varies between calls
    // (NOTE(review): confirm against Tools.getAS()).
    String[] signature = Tools.getAS().split("_");
    String as = signature[0];
    String cp = signature[1];
    String url = "https://www.toutiao.com/pgc/ma/?page_type=1&media_id=" + media_id + "&count=20&as=" + as + "&cp=" + cp;
    if (max_behot_time != null) {
        url = url + "&max_behot_time=" + max_behot_time;
    }
    // Request-local headers instead of the shared static map.
    Map<String, String> headers = Tools.getTouTiaoHeader();
    headers.put("Referer", url);
    try {
        String htmlBody = HttpClientTemplateOK.get(url, proxy, headers);
        if (htmlBody != null && htmlBody.contains("behot_time")) {
            Map<String, Object> ttList = parseHtmlByAccount(htmlBody, endData);
            if (ttList != null && !ttList.isEmpty()) {
                return ttList;
            }
        } else {
            logger.info("数据为null");
        }
    } catch (Exception e) {
        // Log the exception itself: fillInStackTrace() would replace the original trace with
        // this catch site, also for the caller that receives the rethrown exception.
        logger.error("获取今日头条帐号数据连接超时", e);
        throw e;
    }
    return null;
}
/**
 * Parses one page of a media account's article list.
 *
 * <p>Only entries carrying a {@code group_id} are converted; entries that cannot be converted
 * are logged and skipped.
 *
 * @param htmlBody JSON text of the list response
 * @param endDate  cut-off date: when the next page's cursor lies before it, the cursor is
 *                 cleared so the caller stops paging; {@code null} means no cut-off
 * @return a map with the keys {@code "data"} ({@code List<TouTiaoArticle>}) and
 *         {@code "max_behot_time"} (cursor string or {@code null}); {@code null} when the
 *         response as a whole cannot be read
 */
private static Map<String, Object> parseHtmlByAccount(String htmlBody, Date endDate) {
    Map<String, Object> map = new HashMap<String, Object>();
    String max_behot_time = null;
    List<TouTiaoArticle> dataList = new ArrayList<TouTiaoArticle>();
    try {
        JSONObject json = JSONObject.parseObject(htmlBody);
        JSONArray jsonArray = json.getJSONArray("data");
        max_behot_time = json.getJSONObject("next").getString("max_behot_time");
        for (int i = 0; i < jsonArray.size(); i++) {
            try {
                JSONObject data = jsonArray.getJSONObject(i);
                if (!data.containsKey("group_id")) {
                    continue;
                }
                String href = "https://www.toutiao.com/a" + data.getLongValue("group_id");
                String title = data.getString("title");
                String content = data.getString("abstract");
                // behot_time is scaled by 1000 and handed to the project's time parser as text.
                String time = data.getLongValue("behot_time") * 1000 + "";
                Date date = TimeParse.stringFormartDate(time);
                String readNum = data.getString("go_detail_count");
                String commentNum = data.getString("comments_count");
                String playNum = data.getString("play_effective_count");
                String source = data.getString("source");
                String user_id = data.getLong("creator_uid").toString();
                dataList.add(new TouTiaoArticle(href, title, user_id, source, date, content,
                        commentNum, playNum, readNum, "今日头条"));
            } catch (Exception e) {
                // Pass the exception as well: its message alone is null for a
                // NullPointerException (the typical "field missing" failure) and carries no trace.
                logger.error("数据解析出现问题,{}", e.getMessage(), e);
            }
        }
    } catch (Exception e) {
        logger.error("数据解析出现问题,{}", e.getMessage(), e);
        return null;
    }
    if (max_behot_time != null && !"0".equals(max_behot_time)) {
        Date nextDate = new Date(Long.valueOf(max_behot_time + "000"));
        // A null cut-off means "no limit" instead of a NullPointerException.
        if (endDate != null && endDate.after(nextDate)) {
            max_behot_time = null;
        }
    }
    map.put("max_behot_time", max_behot_time);
    map.put("data", dataList);
    return map;
}
/**
 * Fetches one page of a user's micro-headline (微头条) posts.
 *
 * @param user_id        id of the user whose posts are listed
 * @param endDate        cut-off date handed to the parser
 * @param proxy          proxy handed to the HTTP client
 * @param max_behot_time paging cursor sent as {@code max_time}, or {@code null} for the first
 *                       page
 * @return the parser's result map, or {@code null} when there is no response body or the
 *         parser returns nothing
 * @throws IOException declared for failures of the HTTP request
 */
public static Map<String, Object> getMicroTouTiaoCrawler(String user_id, Date endDate, Proxy proxy, String max_behot_time) throws IOException {
    StringBuilder address = new StringBuilder("https://www.toutiao.com/c/ugc/content/list/");
    address.append(user_id).append("/");
    if (max_behot_time != null) {
        address.append("?max_time=").append(max_behot_time);
    }
    Map<String, String> requestHeaders = Tools.getTouTiaoHeader();
    requestHeaders.put("Referer", "https://www.toutiao.com/c/user/" + user_id + "/");
    String htmlBody = HttpClientTemplateOK.get(address.toString(), proxy, requestHeaders);
    if (htmlBody == null) {
        logger.info("数据为null");
        return null;
    }
    Map<String, Object> parsed = parseHtmlByMicroAccount(htmlBody, endDate);
    boolean usable = parsed != null && parsed.size() > 0;
    return usable ? parsed : null;
}
/**
 * Parses one page of micro-headline (微头条) posts.
 *
 * <p>A post has no title of its own, so the first 16 characters of its content (or the whole
 * content when shorter) are used as the title. Posts that cannot be converted are logged and
 * skipped.
 *
 * @param htmlBody JSON text of the list response
 * @param endDate  cut-off date: when the last post read is older, the cursor is cleared so
 *                 the caller stops paging; {@code null} means no cut-off
 * @return a map with the keys {@code "data"} ({@code List<TouTiaoArticle>}) and
 *         {@code "max_behot_time"} ({@code create_time} of the last post read as the cursor
 *         for the next page, or {@code null} when paging should stop); never {@code null}
 */
private static Map<String, Object> parseHtmlByMicroAccount(String htmlBody, Date endDate) {
    final int maxTitleLength = 16;
    Map<String, Object> map = new HashMap<String, Object>();
    Long max_behot_time = null;
    List<TouTiaoArticle> dataList = new ArrayList<TouTiaoArticle>();
    try {
        JSONObject json = JSONObject.parseObject(htmlBody).getJSONObject("data");
        boolean more = json.getBoolean("has_more");
        JSONArray jsonArray = json.getJSONArray("list");
        Date date = null;
        for (int i = 0; i < jsonArray.size(); i++) {
            try {
                JSONObject data = jsonArray.getJSONObject(i);
                max_behot_time = data.getLongValue("create_time");
                date = new Date(max_behot_time * 1000);
                String href = "https://www.toutiao.com/a" + data.getString("thread_id");
                String source = data.getJSONObject("ugc_user").getString("name");
                String content = data.getString("rich_content");
                String readNum = data.getInteger("read_count") + "";
                String commentNum = data.getInteger("comment_count") + "";
                String user_id = data.getJSONObject("ugc_user").getString("user_id");
                // Title and its length are computed per post. Previously both lived outside
                // the loop, so one short post shortened every later title, and a post without
                // content inherited the title of the post before it.
                String title = null;
                if (content != null && !content.isEmpty()) {
                    title = content.substring(0, Math.min(maxTitleLength, content.length()));
                }
                // No play count is read for micro-headline posts; it is passed as null.
                dataList.add(new TouTiaoArticle(href, title, user_id, source, date, content,
                        commentNum, null, readNum, "微头条"));
            } catch (Exception e) {
                logger.error("数据解析出现问题", e);
            }
        }
        // Decide whether there is a next page: none when the server says so, or when the last
        // post read is already older than the cut-off. (The former comparison of the numeric
        // cursor with the string "0" could never match and has been dropped.)
        if (!more || (max_behot_time != null && endDate != null && endDate.after(date))) {
            max_behot_time = null;
        }
    } catch (Exception e) {
        logger.error("数据解析出现问题", e);
    }
    map.put("max_behot_time", max_behot_time);
    map.put("data", dataList);
    return map;
}
}
package com.zhiwei.toutiao.parse;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONException;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.toutiao.bean.TouTiaoArticle;
import com.zhiwei.toutiao.util.Tools;
import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
/**
 * Collects Toutiao (今日头条) articles from a channel feed.
 *
 * @author hero
 * @date 2017-07-24 16:57:22
 */
public class TouTiaoChannelParse {
    private static Logger logger = LoggerFactory.getLogger(TouTiaoChannelParse.class);

    /**
     * Downloads one channel feed page and parses it.
     *
     * @author hero
     * @param url   feed URL; also sent as the {@code referer} header
     * @param proxy proxy handed to the HTTP client (may be {@code null})
     * @return the map produced by {@link #parseHtmlByChannel(String)}, or {@code null}
     *         when the HTTP client returned no body
     * @throws Exception when the request fails (logged, then rethrown) or the body
     *                   is not valid JSON
     */
    public static Map<String,Object> touTiaoChannel(String url,Proxy proxy) throws Exception{
        // Headers are built per call; a shared static map would be overwritten by
        // concurrent callers using different referers.
        Map<String, String> headerMap = Tools.getTouTiaoChannelHeader();
        headerMap.put("referer", url);
        String htmlBody;
        try {
            htmlBody = HttpClientTemplateOK.get(url, proxy, headerMap);
        } catch (Exception e) {
            // Log the exception itself so the original stack trace is preserved.
            logger.error("获取数据连接出现问题:", e);
            throw e;
        }
        if (htmlBody == null) {
            return null;
        }
        return parseHtmlByChannel(htmlBody);
    }

    /**
     * Parses a channel feed response.
     *
     * <p>Reads the {@code data} array and {@code next.max_behot_time}. Each entry contributes
     * {@code behot_time} (seconds), {@code title}, {@code abstract}, {@code comments_count},
     * {@code source} and {@code group_id}. Entries without a {@code group_id} are skipped
     * because no article URL can be built for them.</p>
     *
     * @author hero
     * @param htmlBody raw JSON response body
     * @return a map with {@code "data"} (a {@code List<TouTiaoArticle>}, never null) and
     *         {@code "next"} (a {@code Long} cursor, or {@code null} when unavailable)
     */
    private static Map<String,Object> parseHtmlByChannel(String htmlBody){
        Map<String,Object> dataMap = new HashMap<String,Object>();
        List<TouTiaoArticle> ttList = new ArrayList<TouTiaoArticle>();
        JSONObject jsonObject = JSONObject.parseObject(htmlBody);
        JSONArray dataList = null;
        Long next = null;
        if (jsonObject != null) {
            dataList = jsonObject.getJSONArray("data");
            try {
                JSONObject nextNode = jsonObject.getJSONObject("next");
                if (nextNode != null) {
                    next = nextNode.getLong("max_behot_time");
                }
            } catch (Exception e) {
                next = null;
            }
        }
        int total = (dataList == null) ? 0 : dataList.size();
        for (int i = 0; i < total; i++) {
            try {
                JSONObject jso = dataList.getJSONObject(i);
                if (jso == null) {
                    continue;
                }
                String groupId = jso.getString("group_id");
                if (groupId == null) {
                    logger.debug("channel entry {} has no group_id, skipped", i);
                    continue;
                }
                String time = String.valueOf(jso.getLongValue("behot_time") * 1000);
                String title = jso.getString("title");
                String content = jso.getString("abstract");
                String comment_count = jso.getIntValue("comments_count") + "";
                String source = jso.getString("source");
                String url = getUrl("http://www.toutiao.com/a" + groupId + "/");
                Date date = TimeParse.stringFormartDate(time);
                // Play and read counts are not part of this feed, hence "-1".
                TouTiaoArticle tt = new TouTiaoArticle(url, title, null, source, date, content,
                        comment_count, "-1", "-1", "今日头条");
                ttList.add(tt);
            } catch (JSONException e) {
                continue;
            }
        }
        dataMap.put("data", ttList);
        dataMap.put("next", next);
        return dataMap;
    }

    /**
     * Normalizes an article URL: {@code group/} becomes {@code a}, {@code /item/} becomes
     * {@code /i}, every {@code m.} is removed, {@code www.} is added in front of
     * {@code toutiao.com} when the URL has no {@code www}, and a trailing slash is ensured.
     *
     * @author hero
     * @param url URL to normalize
     * @return the normalized URL; {@code null} or an empty string is returned unchanged
     */
    private static String getUrl(String url){
        if (url == null || url.length() == 0) {
            return url;
        }
        String normalized = url;
        if (normalized.contains("group/")) {
            normalized = normalized.replace("group/", "a");
        }
        if (normalized.contains("item")) {
            normalized = normalized.replace("/item/", "/i");
        }
        if (normalized.contains("m.")) {
            normalized = normalized.replace("m.", "");
        }
        if (!normalized.contains("www")) {
            normalized = normalized.replace("toutiao.com", "www.toutiao.com");
        }
        if (!normalized.endsWith("/")) {
            normalized = normalized + "/";
        }
        return normalized;
    }
}
package com.zhiwei.toutiao.parse;
import java.io.IOException;
import java.net.Proxy;
import java.net.SocketTimeoutException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.toutiao.bean.TouTiaoComment;
import com.zhiwei.toutiao.util.Tools;
import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
/**
 * Fetches and parses Toutiao (今日头条) comment data.
 *
 * @author hero
 * @date 2016-12-09 19:50:28
 */
public class TouTiaoCommentParse {
    private static Logger logger = LoggerFactory.getLogger(TouTiaoCommentParse.class);

    /**
     * Fetches the comment list of an article, page by page (20 comments per page).
     *
     * <p>The group id is resolved from {@code url} first; when it cannot be resolved an
     * empty list is returned. Each page request is attempted up to three times on
     * {@link SocketTimeoutException}. The {@code proxy} is only used while resolving the
     * group id; the comment API requests themselves are sent without a proxy.</p>
     *
     * @author hero
     * @param url         article URL
     * @param returnCount when positive, limits the number of pages fetched to
     *                    {@code ceil(returnCount / 20)}; the returned list may therefore
     *                    hold up to a full page more than {@code returnCount}
     * @param proxy       proxy used to resolve the group id (may be {@code null})
     * @return the comments collected, never {@code null}
     * @throws Exception when a request fails with anything other than a socket timeout
     */
    public static List<TouTiaoComment> getTouTiaoComment(String url,int returnCount,Proxy proxy) throws Exception
    {
        List<TouTiaoComment> ttList = new ArrayList<TouTiaoComment>();
        String group_id = getGroupId(url, proxy);
        if (group_id == null) {
            return ttList;
        }
        // Total number of comment pages, optionally capped by returnCount.
        int page = getPage(group_id);
        if (returnCount > 0) {
            int pageMax = (int) Math.ceil((double) returnCount / 20.0);
            if (page >= pageMax) {
                page = pageMax;
            }
        }
        for (int i = 0; i < page; i++) {
            String urlNew = "http://is.snssdk.com/article/v2/tab_comments/?app_name=news_article&offset="
                    + i * 20 + "&group_id=" + group_id + "&aggr_type=1&count=20&fold=1&item_id=" + group_id
                    + "&ts=" + System.currentTimeMillis();
            // Request headers for the app API host.
            Map<String,String> headerMap = Tools.getTouTiaoHeader();
            headerMap.put("User-Agent", "News 6.6.5 rv:6.6.5.03 (iPhone; iOS 11.3; zh_CN) Cronet");
            headerMap.put("Host", "is.snssdk.com");
            for (int j = 1; j <= 3; j++) {
                try {
                    String htmlBody = HttpClientTemplateOK.get(urlNew, null, headerMap);
                    if (htmlBody != null) {
                        ttList.addAll(analySisComment(htmlBody, url));
                    } else {
                        logger.info("采集出现问题,地址为:{}", url);
                    }
                    ZhiWeiTools.sleep(4000);
                    break;
                } catch (SocketTimeoutException e) {
                    // Retry the same page.
                    continue;
                }
            }
        }
        return ttList;
    }

    /**
     * Parses one comment page.
     *
     * <p>Reads the {@code data} array; each element's {@code comment} object supplies
     * {@code id}, {@code text}, {@code user_name}, {@code reply_count}, {@code digg_count}
     * and {@code create_time} (seconds).</p>
     *
     * @param htmlBody raw JSON response body
     * @param url      article URL stored on every comment
     * @return the comments parsed; never {@code null}. When parsing fails part-way the
     *         comments parsed up to that point are returned, so the caller keeps what it
     *         has instead of failing on a {@code null} result.
     */
    private static List<TouTiaoComment> analySisComment(String htmlBody,String url)
    {
        List<TouTiaoComment> list = new ArrayList<TouTiaoComment>();
        try {
            JSONObject json = JSONObject.parseObject(htmlBody);
            JSONArray commentes = (json == null) ? null : json.getJSONArray("data");
            if (commentes == null) {
                return list;
            }
            for (int a = 0; a < commentes.size(); a++) {
                JSONObject comment = commentes.getJSONObject(a).getJSONObject("comment");
                String id = comment.getString("id");
                String text = comment.getString("text");
                String name = comment.getString("user_name");
                int reply_count = comment.getIntValue("reply_count");
                int digg_count = comment.getIntValue("digg_count");
                Date date = new Date(comment.getLongValue("create_time") * 1000);
                list.add(new TouTiaoComment(id, text, name, reply_count, digg_count, date, url));
            }
        } catch (Exception e) {
            logger.debug("解析今日头条评论列表出现为题,{}", e.getMessage(), e);
        }
        return list;
    }

    /**
     * Looks up the total number of comment pages (20 comments per page) for a group id.
     *
     * @param group_id article group id
     * @return {@code ceil(total / 20)}, or 0 when there is no body or it cannot be parsed
     * @throws Exception when the HTTP request fails
     */
    private static int getPage(String group_id) throws Exception
    {
        String urlNew = "http://www.toutiao.com/api/comment/list/?group_id=" + group_id + "&item_id=0&count=20&offset=0";
        Map<String,String> headerMap = Tools.getTouTiaoHeader();
        String htmlBody = HttpClientTemplateOK.get(urlNew, null, headerMap);
        if (htmlBody == null) {
            return 0;
        }
        try {
            JSONObject data = JSONObject.parseObject(htmlBody).getJSONObject("data");
            int count = data.getIntValue("total");
            return (int) Math.ceil((double) count / 20.0);
        } catch (Exception e) {
            logger.info("获取评论总页数时出现问题:{}", e.getMessage(), e);
            return 0;
        }
    }

    /**
     * Reads the comment count embedded in an article page ({@code comments_count: N,}).
     *
     * <p>Up to three attempts are made; a failed request waits five seconds before the
     * next attempt. The request is sent without a proxy: the {@code proxy} argument is
     * currently not used.</p>
     *
     * @author hero
     * @param url   article page URL
     * @param proxy unused
     * @return the comment count, or 0 when it cannot be determined
     */
    public static int findCommentCount(String url,Proxy proxy)
    {
        for (int i = 0; i < 3; i++) {
            try {
                Map<String,String> headerMap = Tools.getTouTiaoHeader();
                String htmlBody = HttpClientTemplateOK.get(url, null, headerMap);
                if (htmlBody != null && htmlBody.contains("commentInfo")) {
                    try {
                        String raw = htmlBody.split("comments_count: ")[1].split(",")[0];
                        return Integer.valueOf(raw.trim());
                    } catch (Exception e) {
                        logger.error("解析头条评论数错误:::{}", url, e);
                        return 0;
                    }
                }
            } catch (Exception e) {
                ZhiWeiTools.sleep(5000);
                continue;
            }
        }
        return 0;
    }

    /**
     * Reads the total comment count of an article from the comment list API.
     *
     * <p>Up to three attempts are made; a failed request waits five seconds before the
     * next attempt. The {@code proxy} is only used while resolving the group id.</p>
     *
     * @param url   article URL
     * @param proxy proxy used to resolve the group id (may be {@code null})
     * @return the {@code data.total} value, or 0 when the group id cannot be resolved or
     *         no attempt yields a parsable response
     */
    public static int getCommentCount(String url,Proxy proxy)
    {
        String group_id = getGroupId(url, proxy);
        if (group_id == null) {
            // Without a group id the API would be queried with the literal "null".
            return 0;
        }
        String urlNew = "http://www.toutiao.com/api/comment/list/?group_id=" + group_id + "&item_id=0&count=20&offset=0";
        for (int i = 0; i < 3; i++) {
            try {
                Map<String,String> headerMap = Tools.getTouTiaoHeader();
                String htmlBody = HttpClientTemplateOK.get(urlNew, null, headerMap);
                if (htmlBody != null) {
                    try {
                        JSONObject data = JSONObject.parseObject(htmlBody).getJSONObject("data");
                        return data.getIntValue("total");
                    } catch (Exception e) {
                        logger.info("获取评论总页数时出现问题:{}", e.getMessage(), e);
                    }
                }
            } catch (Exception e) {
                ZhiWeiTools.sleep(5000);
                continue;
            }
        }
        return 0;
    }

    /**
     * Resolves the group id of an article URL.
     *
     * <p>URLs containing {@code /a} or {@code /group/} carry the id directly after that
     * marker. URLs containing {@code /i} or {@code /item/} require the page to be
     * downloaded and are delegated to {@link #gettGroupIdByUrl(String, Proxy)}.</p>
     *
     * @param url   article URL
     * @param proxy proxy for the page download (may be {@code null})
     * @return the group id, or {@code null} when it cannot be determined
     */
    private static String getGroupId(String url,Proxy proxy)
    {
        if (url == null) {
            return null;
        }
        if (url.contains("/a")) {
            return url.split("/a")[1].replace("/", "");
        }
        if (url.contains("/group/")) {
            return url.split("/group/")[1].replace("/", "");
        }
        if (url.contains("/i") || url.contains("/item/")) {
            return gettGroupIdByUrl(url, proxy);
        }
        return null;
    }

    /**
     * Downloads an article page and extracts the value of its {@code groupId: '...'} entry.
     *
     * @param url   article page URL
     * @param proxy proxy for the download (may be {@code null})
     * @return the group id, or {@code null} when the page could not be fetched or does not
     *         contain one
     */
    private static String gettGroupIdByUrl(String url,Proxy proxy)
    {
        String groupId = null;
        Map<String,String> headerMap = Tools.getTouTiaoHeader();
        try {
            String htmlBody = HttpClientTemplateOK.get(url, proxy, headerMap);
            if (htmlBody == null) {
                logger.info("获取groupId失败,链接地址为:{}", url);
            } else if (htmlBody.contains("groupId")) {
                groupId = htmlBody.split("groupId: '")[1].split("',")[0].trim();
            }
        } catch (Exception e) {
            logger.error("获取groupId失败,链接地址为:{}", url, e);
        }
        return groupId;
    }
}
/**
* @Title: TouTiaoParse.java
* @Package com.zhiwei.toutiao.parse
* @Description:
* @author hero
* @date 2016年9月2日 上午11:17:44
* @version V1.0
*/
/**
*
*/
package com.zhiwei.toutiao.parse;
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.net.Proxy.Type;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.toutiao.bean.TouTiaoArticle;
import com.zhiwei.toutiao.util.Tools;
import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
/**
 * Parses the article list of a Toutiao (今日头条) account.
 *
 * @author hero
 * @date 2016-09-02 11:17:44
 */
public class TouTiaoParse {
    // Bound to this class so log lines are attributed to the right source.
    private Logger logger = LoggerFactory.getLogger(TouTiaoParse.class);

    /**
     * Downloads one page of an account's article list and parses it.
     *
     * @param url      list API URL
     * @param endData  lower time bound; only articles published after it are returned
     * @param source   source name stored on every article
     * @param hostname proxy host, or {@code null} to connect directly
     * @param host     proxy port (only used when {@code hostname} is not {@code null})
     * @return the map produced by {@link #parseHtmlByAccount(String, Date, String)}, or
     *         {@code null} when the body is missing or does not contain {@code abstract}
     * @throws Exception when the HTTP request fails
     */
    @Deprecated
    public Map<String, Object> getTouTiaoList(String url,
            Date endData, String source,String hostname,int host) throws Exception {
        // Local on purpose: an instance field would be shared by concurrent calls.
        Map<String, String> headerMap = Tools.getTouTiaoHeader();
        Proxy proxy = null;
        if (hostname != null) {
            proxy = new Proxy(Type.HTTP, new InetSocketAddress(hostname, host));
        }
        String htmlBody = HttpClientTemplateOK.get(url, proxy, headerMap);
        if (htmlBody != null && htmlBody.contains("abstract")) {
            return parseHtmlByAccount(htmlBody, endData, source);
        }
        return null;
    }

    /**
     * Extracts the article text from an article page.
     *
     * <p>The text of {@code [class=article-content]} is used when such an element exists,
     * otherwise the text of {@code [class=content]}.</p>
     *
     * @param url      article URL (not used by the extraction)
     * @param htmlBody article page HTML
     * @return the extracted text, or {@code null} when parsing fails
     */
    @SuppressWarnings("unused")
    private String parseHtmlByArticle(String url, String htmlBody) {
        try {
            Document doc = Jsoup.parse(htmlBody);
            // select() never returns null, so emptiness is what decides the fallback.
            if (!doc.select("[class=article-content]").isEmpty()) {
                return doc.select("[class=article-content]").text();
            }
            return doc.select("[class=content]").text();
        } catch (Exception e) {
            return null;
        }
    }

    /**
     * Parses one page of an account's article list.
     *
     * <p>Reads the {@code data} array and {@code next.max_behot_time}. Each entry supplies
     * {@code source_url}, {@code title}, {@code abstract}, {@code datetime},
     * {@code go_detail_count}, {@code comments_count} and {@code play_effective_count}.
     * Entries published after {@code endData} are collected; any entry at or before it
     * clears the paging cursor.</p>
     *
     * @param htmlBody raw JSON response body
     * @param endData  lower time bound
     * @param source   source name stored on every article
     * @return a map with {@code "data"} (a {@code List<TouTiaoArticle>}) and
     *         {@code "max_behot_time"} (a {@code String} cursor, or {@code null} when paging
     *         should stop); {@code null} when the response as a whole cannot be parsed
     */
    private Map<String, Object> parseHtmlByAccount(String htmlBody, Date endData, String source) {
        Map<String, Object> map = new HashMap<String, Object>();
        String max_behot_time = null;
        List<TouTiaoArticle> dataList = new ArrayList<TouTiaoArticle>();
        try {
            JSONObject json = JSONObject.parseObject(htmlBody);
            JSONArray jsonArray = json.getJSONArray("data");
            max_behot_time = json.getJSONObject("next").getString("max_behot_time");
            for (int i = 0; i < jsonArray.size(); i++) {
                try {
                    JSONObject data = jsonArray.getJSONObject(i);
                    String href = normalizeArticleUrl(data.getString("source_url"));
                    String title = data.getString("title");
                    String content = data.getString("abstract");
                    String time = data.getString("datetime");
                    Date date = TimeParse.stringFormartDate(time);
                    String readNum = data.getString("go_detail_count");
                    String commentNum = data.getString("comments_count");
                    String playNum = data.getString("play_effective_count");
                    if (endData.before(date)) {
                        dataList.add(new TouTiaoArticle(href, title, null, source, date, content,
                                commentNum, playNum, readNum, "今日头条"));
                    } else {
                        // Reached the lower bound: no further page is needed.
                        max_behot_time = null;
                        logger.info("数据不再时间段内,{}", time);
                    }
                } catch (Exception e) {
                    // Skip the malformed entry but keep the rest of the page.
                    logger.error("数据解析出现问题", e);
                    continue;
                }
            }
        } catch (Exception e) {
            logger.error("数据解析出现问题", e);
            return null;
        }
        map.put("max_behot_time", max_behot_time);
        map.put("data", dataList);
        return map;
    }

    /**
     * Normalizes an article URL: {@code /item/} becomes {@code /i}, {@code /group/}
     * becomes {@code /a}, every {@code m.} is removed, {@code www.} is added in front of
     * {@code toutiao.com} when the URL has no {@code www}, and a trailing slash is ensured.
     *
     * @param href URL to normalize; must not be {@code null} or empty
     * @return the normalized URL
     */
    private static String normalizeArticleUrl(String href) {
        String normalized = href;
        if (normalized.contains("item")) {
            normalized = normalized.replace("/item/", "/i");
        }
        if (normalized.contains("group")) {
            normalized = normalized.replace("/group/", "/a");
        }
        if (normalized.contains("m.")) {
            normalized = normalized.replace("m.", "");
        }
        if (!normalized.contains("www")) {
            normalized = normalized.replace("toutiao.com", "www.toutiao.com");
        }
        String lastChar = normalized.substring(normalized.length() - 1, normalized.length());
        if (!lastChar.equals("/")) {
            normalized = normalized + "/";
        }
        return normalized;
    }
}
package com.zhiwei.toutiao.parse;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.toutiao.bean.TouTiaoQuestionAnswer;
import com.zhiwei.toutiao.util.Tools;
import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
/**
 * Fetches answers of a Toutiao Q&amp;A (wukong.com) question.
 *
 * @author hero
 * @date 2017-07-28 18:12:31
 */
public class TouTiaoQuestionAnswerParse {
    private static Logger logger = LoggerFactory.getLogger(TouTiaoQuestionAnswerParse.class);

    /**
     * Loads one page (20 answers) of a question's answer list.
     *
     * <p>Reads {@code data.ans_list}; each answer supplies {@code ansid}, {@code content},
     * {@code user.uname}, {@code user.user_id}, {@code create_time} (seconds),
     * {@code comment_count} and {@code digg_count}.</p>
     *
     * @param questionId question id
     * @param page       zero-based page index; the request offset is {@code page * 20}
     * @param req_type   value passed through as the {@code req_type} query parameter
     * @param proxy      proxy handed to the HTTP client (may be {@code null})
     * @return a map with {@code "page"} (the requested page index plus one, regardless of
     *         {@code has_more}) and {@code "ansList"} (a {@code List<TouTiaoQuestionAnswer>});
     *         {@code null} when there is no body, no {@code data} object, or any error occurs
     */
    public static Map<String,Object> getAnserList(String questionId,int page,int req_type,Proxy proxy){
        String url = "https://www.wukong.com/wenda/web/question/loadmorev1/?qid=" + questionId
                + "&count=20&req_type=" + req_type + "&offset=" + page * 20;
        // Built per call so concurrent requests do not overwrite each other's referer.
        Map<String, String> headerMap = Tools.getTouTiaoQuestionAnswerHeader();
        headerMap.put("referer", "https://www.wukong.com/question/" + questionId + "/");
        List<TouTiaoQuestionAnswer> anserList = new ArrayList<TouTiaoQuestionAnswer>();
        try {
            String htmlBody = HttpClientTemplateOK.get(url, proxy, headerMap);
            if (htmlBody == null) {
                return null;
            }
            JSONObject jsonObject = JSONObject.parseObject(htmlBody);
            JSONObject data = (jsonObject == null) ? null : jsonObject.getJSONObject("data");
            if (data == null) {
                return null;
            }
            logger.debug("has_more={}", data.getIntValue("has_more"));
            page++;
            JSONArray ans_list = data.getJSONArray("ans_list");
            for (int i = 0; i < ans_list.size(); i++) {
                JSONObject ans = ans_list.getJSONObject(i);
                JSONObject user = ans.getJSONObject("user");
                String ansid = ans.getString("ansid");
                String content = ans.getString("content");
                String username = user.getString("uname");
                String user_id = user.getString("user_id");
                Date time = TimeParse.stringFormartDate(ans.getLongValue("create_time") * 1000 + "");
                int comment_count = ans.getIntValue("comment_count");
                int digg_count = ans.getIntValue("digg_count");
                anserList.add(new TouTiaoQuestionAnswer(ansid, questionId, content, user_id, username,
                        time, comment_count, digg_count));
            }
        } catch (Exception e) {
            logger.error("头条问答问题获取出现问题", e);
            return null;
        }
        Map<String,Object> result = new HashMap<String,Object>();
        result.put("page", page);
        result.put("ansList", anserList);
        return result;
    }

    /**
     * Reads the answer count shown on a question page.
     *
     * @author hero
     * @param questionId question id
     * @param proxy      proxy handed to the HTTP client (may be {@code null})
     * @return the text in front of "个回答" in the answer-count heading; {@code "已删除"} when
     *         the page reports that the question does not exist; {@code "-1"} when the request
     *         or parsing fails; {@code null} when the HTTP client returned no body
     */
    public String getAnswerCount(String questionId,Proxy proxy){
        String result = null;
        String url = "https://www.wukong.com/question/" + questionId + "/";
        logger.debug("question url: {}", url);
        Map<String, String> headerMap = Tools.getTouTiaoQuestionAnswerHeader();
        headerMap.put("referer", url);
        try {
            String htmlBody = HttpClientTemplateOK.get(url, proxy, headerMap);
            if (htmlBody != null) {
                Document document = Jsoup.parse(htmlBody);
                String text = document.select("[class=question question-single]").text();
                if (text.contains("该问题不存在")) {
                    result = "已删除";
                } else {
                    String heading = document.select("div.question-item").select("h3.answer-count-h").text();
                    result = heading.split("个回答")[0];
                }
            }
        } catch (Exception e) {
            logger.error("failed to read answer count, url: {}", url, e);
            return "-1";
        }
        return result;
    }
}
package com.zhiwei.toutiao.parse;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.toutiao.bean.TouTiaoQuestion;
import com.zhiwei.toutiao.util.Tools;
import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
/**
 * Parses Toutiao Q&amp;A (悟空问答) question search results.
 *
 * @author hero
 * @date 2017-07-20 14:14:48
 */
public class TouTiaoQuestionParse {
    private static Logger logger = LoggerFactory.getLogger(TouTiaoQuestionParse.class);

    /**
     * Runs a question search request and parses the result.
     *
     * @author hero
     * @param url   search URL; also sent as the {@code referer} header
     * @param proxy proxy handed to the HTTP client (may be {@code null})
     * @return the questions found; an empty list when there is no body or nothing matched,
     *         never {@code null}
     * @throws Exception when the HTTP request fails or the body is not valid JSON
     */
    public static List<TouTiaoQuestion> getSearchTouTiaoQuestion(String url,Proxy proxy) throws Exception {
        // Built per call so concurrent requests do not overwrite each other's referer.
        Map<String, String> headerMap = Tools.getTouTiaoQuestionHeader();
        headerMap.put("referer", url);
        String htmlBody = HttpClientTemplateOK.get(url, proxy, headerMap);
        if (htmlBody == null) {
            return new ArrayList<TouTiaoQuestion>();
        }
        return parseHtmlByQuestion(htmlBody);
    }

    /**
     * Parses a question search response.
     *
     * <p>Only responses whose {@code err_tips} equals {@code "success"} are processed.
     * Questions are read from {@code data.feed_question[i].question}, which supplies
     * {@code content.text}, {@code title}, {@code qid}, {@code create_time} (seconds),
     * {@code user.uname}, {@code follow_count}, {@code nice_ans_count} and
     * {@code normal_ans_count}. Entries that fail to parse are logged and skipped.</p>
     *
     * @author hero
     * @param htmlBody raw JSON response body
     * @return the parsed questions, never {@code null}
     */
    private static List<TouTiaoQuestion> parseHtmlByQuestion(String htmlBody) {
        List<TouTiaoQuestion> questtionList = new ArrayList<TouTiaoQuestion>();
        JSONObject jsonObject = JSONObject.parseObject(htmlBody);
        // Constant-first equals: a response without err_tips counts as "not success".
        if (jsonObject == null || !"success".equals(jsonObject.getString("err_tips"))) {
            return questtionList;
        }
        JSONObject json = jsonObject.getJSONObject("data");
        JSONArray jsonArray = (json == null) ? null : json.getJSONArray("feed_question");
        if (jsonArray == null) {
            return questtionList;
        }
        for (int i = 0; i < jsonArray.size(); i++) {
            try {
                JSONObject question = jsonArray.getJSONObject(i).getJSONObject("question");
                String content = question.getJSONObject("content").getString("text");
                String title = question.getString("title");
                String url = "http://www.toutiao.com/a" + question.getString("qid") + "/";
                Date time = TimeParse.stringFormartDate(question.getLong("create_time") * 1000L + "");
                String source = question.getJSONObject("user").getString("uname");
                int follow_count = question.getIntValue("follow_count");
                int nice_ans_count = question.getIntValue("nice_ans_count");
                int normal_ans_count = question.getIntValue("normal_ans_count");
                int ans_count = nice_ans_count + normal_ans_count;
                questtionList.add(new TouTiaoQuestion(url, title, source, content, time,
                        follow_count, nice_ans_count, normal_ans_count, ans_count));
            } catch (Exception e) {
                logger.info("头条问答解析数据出现问题", e);
                continue;
            }
        }
        return questtionList;
    }
}
package com.zhiwei.toutiao.parse;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONException;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.toutiao.bean.TouTiaoArticle;
import com.zhiwei.toutiao.util.Tools;
import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
/**
 * Parses Toutiao (今日头条) keyword search results.
 *
 * @author hero
 * @date 2017-07-24 15:58:27
 */
public class TouTiaoSearchParse {
    private static Logger logger = LoggerFactory.getLogger(TouTiaoSearchParse.class);

    /**
     * Runs a keyword search request and parses the result.
     *
     * @author hero
     * @param url   search URL; also sent as the {@code referer} header
     * @param proxy proxy handed to the HTTP client (may be {@code null})
     * @return the map produced by {@link #parseHtmlBySearch(String)}; {@code null} when
     *         there is no body or the response holds no data entries
     * @throws Exception when the HTTP request fails or the body is not valid JSON
     */
    public static Map<String,Object> touTiaoSearchByWord(String url,Proxy proxy ) throws Exception{
        // Built per call so concurrent requests do not overwrite each other's referer.
        Map<String, String> headerMap = Tools.getTouTiaoSearchHeader();
        headerMap.put("referer", url);
        String htmlBody = HttpClientTemplateOK.get(url, proxy, headerMap);
        if (htmlBody == null) {
            return null;
        }
        return parseHtmlBySearch(htmlBody);
    }

    /**
     * Parses a search response.
     *
     * <p>Reads the {@code data} array and the integer {@code has_more}. Each entry supplies
     * {@code create_time} (seconds), {@code title}, {@code abstract}, {@code comment_count},
     * {@code group_id}, {@code source} and {@code user_id}. The article URL is built from
     * {@code group_id} and stays {@code null} when that field is missing.</p>
     *
     * @author hero
     * @param htmlBody raw JSON response body
     * @return a map with {@code "data"} (a {@code List<TouTiaoArticle>}) and
     *         {@code "has_more"} (an {@code Integer}); {@code null} when the response has
     *         no data entries
     */
    private static Map<String,Object> parseHtmlBySearch(String htmlBody){
        JSONObject jsonObject = JSONObject.parseObject(htmlBody);
        if (jsonObject == null) {
            return null;
        }
        JSONArray dataList = jsonObject.getJSONArray("data");
        int has_more = jsonObject.getIntValue("has_more");
        if (dataList == null || dataList.size() == 0) {
            return null;
        }
        List<TouTiaoArticle> ttList = new ArrayList<TouTiaoArticle>();
        for (int i = 0; i < dataList.size(); i++) {
            JSONObject jso = dataList.getJSONObject(i);
            try {
                String time = String.valueOf(jso.getLongValue("create_time") * 1000);
                String title = jso.getString("title");
                String content = jso.getString("abstract");
                String comment_count = jso.getIntValue("comment_count") + "";
                String groupId = jso.getString("group_id");
                String url = (groupId == null) ? null : "http://www.toutiao.com/a" + groupId + "/";
                String source = jso.getString("source");
                String user_id = jso.getString("user_id");
                Date date = TimeParse.stringFormartDate(time);
                // Play and read counts are not part of the search response, hence "-1".
                ttList.add(new TouTiaoArticle(url, title, user_id, source, date, content,
                        comment_count, "-1", "-1", "今日头条"));
            } catch (JSONException e) {
                logger.debug("解析数据出现问题", e);
                continue;
            }
        }
        Map<String,Object> result = new HashMap<String,Object>();
        result.put("data", ttList);
        result.put("has_more", has_more);
        return result;
    }

    /**
     * Normalizes an article URL: {@code group/} becomes {@code a}, {@code /item/} becomes
     * {@code /i}, every {@code m.} is removed, {@code www.} is added in front of
     * {@code toutiao.com} when the URL has no {@code www}, and a trailing slash is ensured.
     *
     * @author hero
     * @param url URL to normalize
     * @return the normalized URL; {@code null} or an empty string is returned unchanged
     */
    @SuppressWarnings("unused")
    private static String getUrl(String url){
        if (url == null || url.length() == 0) {
            return url;
        }
        String normalized = url;
        if (normalized.contains("group/")) {
            normalized = normalized.replace("group/", "a");
        }
        if (normalized.contains("item")) {
            normalized = normalized.replace("/item/", "/i");
        }
        if (normalized.contains("m.")) {
            normalized = normalized.replace("m.", "");
        }
        if (!normalized.contains("www")) {
            normalized = normalized.replace("toutiao.com", "www.toutiao.com");
        }
        if (!normalized.endsWith("/")) {
            normalized = normalized + "/";
        }
        return normalized;
    }
}
package com.zhiwei.toutiao.util;
import java.io.InputStream;
import java.util.Properties;
/**
 * Static configuration read once from the classpath resource {@code proxyip.properties}.
 *
 * <p>The properties {@code registry}, {@code group}, {@code minCount} and {@code maxCount}
 * are read in that order. When the resource is missing or a value cannot be read, the
 * problem is reported on standard error and the fields not yet assigned keep their
 * defaults ({@code null} / {@code 0}).</p>
 */
public class Config {
    public static String registry;
    public static String group;
    public static int minCount;
    public static int maxCount;

    static {
        InputStream is = null;
        try {
            is = Thread.currentThread().getContextClassLoader()
                    .getResourceAsStream("proxyip.properties");
            if (is == null) {
                System.err.println("proxyip.properties not found on the classpath; Config keeps its defaults");
            } else {
                Properties conf = new Properties();
                conf.load(is);
                registry = conf.getProperty("registry");
                group = conf.getProperty("group");
                minCount = Integer.valueOf(conf.getProperty("minCount"));
                maxCount = Integer.valueOf(conf.getProperty("maxCount"));
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Close on every path, including a failed load or a malformed number.
            if (is != null) {
                try {
                    is.close();
                } catch (Exception ignored) {
                    // Nothing useful can be done when closing a read-only stream fails.
                }
            }
        }
    }
}
package com.zhiwei.toutiao.util;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
/**
 * Shared helpers: classpath text loading, the Toutiao {@code as}/{@code cp} request
 * signature, MD5 hex digests and the HTTP header sets used by the collectors.
 */
public class Tools {
    /** Resource read when no other name is given or the named resource is missing. */
    private static final String DEFAULT_TEXT_RESOURCE = "tac_sign.txt";
    private static final String UA_CHROME_54 = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36";
    private static final String UA_CHROME_59 = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36";

    /**
     * Reads a classpath text resource into a single string; line terminators are dropped.
     *
     * <p>The resource named by {@code textFileName} is used when it exists. For backward
     * compatibility {@code tac_sign.txt} is read when the name is {@code null}, empty, or
     * does not resolve to a resource. The content is decoded as UTF-8.</p>
     *
     * @param textFileName classpath resource name
     * @return the resource content without line breaks, or {@code null} when no resource
     *         can be found or reading fails
     */
    public static String getText(String textFileName) {
        ClassLoader loader = Thread.currentThread().getContextClassLoader();
        InputStream is = null;
        if (textFileName != null && textFileName.length() > 0) {
            is = loader.getResourceAsStream(textFileName);
        }
        if (is == null) {
            is = loader.getResourceAsStream(DEFAULT_TEXT_RESOURCE);
        }
        if (is == null) {
            return null;
        }
        BufferedReader br = null;
        try {
            br = new BufferedReader(new InputStreamReader(is, "UTF-8"));
            StringBuilder sb = new StringBuilder();
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line);
            }
            return sb.toString();
        } catch (IOException e) {
            return null;
        } finally {
            try {
                if (br != null) {
                    br.close();
                } else {
                    is.close();
                }
            } catch (IOException ignored) {
                // Closing a fully read (or failed) stream; nothing left to recover.
            }
        }
    }

    /**
     * Computes the Toutiao {@code as}/{@code cp} signature pair for the current time.
     *
     * @return {@code as + "_" + cp}
     */
    public static String getAS()
    {
        return getAS(new Date().getTime() / 1000L);
    }

    /**
     * Computes the Toutiao {@code as}/{@code cp} signature pair for a given time.
     *
     * <p>With {@code t} the upper-case hex form of the timestamp and {@code e} the upper-case
     * MD5 hex digest of its decimal form: {@code as} is {@code "A1"}, then the first five
     * characters of {@code e} interleaved with the first five of {@code t}, then the last
     * three characters of {@code t}; {@code cp} is the first three characters of {@code t},
     * then characters 3..7 of {@code t} interleaved with the last five of {@code e}, then
     * {@code "E1"}.</p>
     *
     * @param epochSeconds timestamp in seconds; its hex form must have at least 8 digits
     * @return {@code as + "_" + cp}
     * @throws IllegalArgumentException when the hex form of the timestamp is shorter than
     *                                  8 digits
     */
    public static String getAS(long epochSeconds)
    {
        String t = Long.toHexString(epochSeconds).toUpperCase();
        if (t.length() < 8) {
            throw new IllegalArgumentException("timestamp too small for signature: " + epochSeconds);
        }
        String e = parseStrToMd5L32(String.valueOf(epochSeconds)).toUpperCase();
        String head = e.substring(0, 5);
        String tail = e.substring(e.length() - 5, e.length());
        StringBuilder o = new StringBuilder();
        StringBuilder c = new StringBuilder();
        for (int n = 0; n < 5; n++) {
            o.append(head.charAt(n)).append(t.charAt(n));
            c.append(t.charAt(n + 3)).append(tail.charAt(n));
        }
        String as = "A1" + o + t.substring(t.length() - 3, t.length());
        String cp = t.substring(0, 3) + c + "E1";
        return as + "_" + cp;
    }

    /**
     * Computes the MD5 digest of a string's UTF-8 bytes.
     *
     * @param str text to hash
     * @return 32 lower-case hex characters, or {@code null} when {@code str} is {@code null}
     *         or hashing fails
     */
    public static String md5(String str) {
        String result = null;
        try {
            MessageDigest md = MessageDigest.getInstance("MD5");
            md.update(str.getBytes("utf-8"));
            result = bytesToHexString(md.digest());
        } catch (Exception ignored) {
            // Deliberate best effort: callers receive null instead of an exception.
        }
        return result;
    }

    /**
     * Converts bytes to a lower-case hex string, two characters per byte.
     *
     * @param bytes bytes to convert
     * @return the hex string, or {@code null} when {@code bytes} is {@code null} or empty
     */
    private static String bytesToHexString(byte bytes[]) {
        if (bytes == null || bytes.length == 0) {
            return null;
        }
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < bytes.length; i++) {
            String hex = Integer.toHexString(bytes[i] & 0xFF);
            if (hex.length() == 1) {
                sb.append('0');
            }
            sb.append(hex);
        }
        return sb.toString().toLowerCase();
    }

    /**
     * Computes the 32-character lower-case MD5 hex digest of a string.
     *
     * <p>The text is hashed as UTF-8 so the digest does not depend on the platform's
     * default charset.</p>
     *
     * @param str text to hash; must not be {@code null}
     * @return 32 lower-case hex characters, or {@code null} when MD5 or UTF-8 is unavailable
     */
    public static String parseStrToMd5L32(String str){
        String reStr = null;
        try {
            MessageDigest md5 = MessageDigest.getInstance("MD5");
            byte[] bytes = md5.digest(str.getBytes("UTF-8"));
            StringBuilder hex = new StringBuilder();
            for (byte b : bytes){
                int bt = b & 0xff;
                if (bt < 16){
                    hex.append(0);
                }
                hex.append(Integer.toHexString(bt));
            }
            reStr = hex.toString();
        } catch (NoSuchAlgorithmException e) {
            e.printStackTrace();
        } catch (java.io.UnsupportedEncodingException e) {
            e.printStackTrace();
        }
        return reStr;
    }

    /** Builds the browser-style header set shared by the page requests, for the given host. */
    private static Map<String,String> browserHeader(String host) {
        Map<String,String> headerMap = new HashMap<String,String>();
        headerMap.put("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
        headerMap.put("Accept-Language","zh-CN,zh;q=0.8");
        headerMap.put("AlexaToolbar-ALX_NS_PH","AlexaToolbar/alx-4.0");
        headerMap.put("Cache-Control","no-cache");
        headerMap.put("Host", host);
        headerMap.put("Pragma", "no-cache");
        headerMap.put("Proxy-Connection", "keep-alive");
        headerMap.put("Upgrade-Insecure-Requests", "1");
        headerMap.put("User-Agent", UA_CHROME_54);
        return headerMap;
    }

    /** Builds the XMLHttpRequest-style header set shared by the JSON API requests. */
    private static Map<String,String> ajaxHeader() {
        Map<String,String> headerMap = new HashMap<String,String>();
        headerMap.put("Accept","application/json, text/javascript, */*; q=0.01");
        headerMap.put("Accept-Language","zh-CN,zh;q=0.8");
        headerMap.put("x-requested-with", "XMLHttpRequest");
        headerMap.put("user-agent", UA_CHROME_59);
        return headerMap;
    }

    /**
     * Headers for Toutiao account/page requests (host {@code www.toutiao.com}).
     *
     * @author hero
     * @return a new, mutable header map
     */
    public static Map<String,String> getTouTiaoHeader()
    {
        Map<String,String> headerMap = browserHeader("www.toutiao.com");
        headerMap.put("Accept-Encoding", "deflate, br");
        return headerMap;
    }

    /**
     * Headers for NetEase (网易) requests (host {@code c.m.163.com}).
     *
     * @author hero
     * @return a new, mutable header map
     */
    public static Map<String,String> getWangYiHeader()
    {
        return browserHeader("c.m.163.com");
    }

    /**
     * Headers for Toutiao Q&amp;A question requests.
     *
     * @author hero
     * @return a new, mutable header map
     */
    public static Map<String,String> getTouTiaoQuestionHeader(){
        Map<String,String> headerMap = ajaxHeader();
        headerMap.put("wendacsrftoken", "undefined");
        return headerMap;
    }

    /**
     * Headers for Toutiao search requests (host {@code www.toutiao.com}).
     *
     * @author hero
     * @return a new, mutable header map
     */
    public static Map<String,String> getTouTiaoSearchHeader(){
        Map<String,String> headerMap = ajaxHeader();
        headerMap.put("Host", "www.toutiao.com");
        return headerMap;
    }

    /**
     * Headers for Toutiao channel feed requests (host {@code www.toutiao.com}).
     *
     * @author hero
     * @return a new, mutable header map
     */
    public static Map<String,String> getTouTiaoChannelHeader(){
        Map<String,String> headerMap = ajaxHeader();
        headerMap.put("Host", "www.toutiao.com");
        return headerMap;
    }

    /**
     * Headers for Toutiao Q&amp;A answer-list requests.
     *
     * @author hero
     * @return a new, mutable header map
     */
    public static Map<String,String> getTouTiaoQuestionAnswerHeader(){
        return ajaxHeader();
    }
}
package com.zhiwei.wangyi.bean;
import java.io.Serializable;
import java.util.Date;
/**
 * A NetEase (WangYi) news article.
 *
 * <p>Plain serializable bean: each field has a getter and a setter, and
 * {@link #toString()} renders all fields on one line.
 *
 * @author hero
 * @date 2017-01-03 09:22:42
 */
public class WangYiNews implements Serializable{
    private static final long serialVersionUID = 2222466947676512589L;

    /** Primary key; the article URL. */
    private String id;
    /** Article title. */
    private String title;
    /** Article source. */
    private String source;
    /** Publication time. */
    private Date time;
    /** Short summary of the article. */
    private String content;
    /** Number of follow-up posts (replies). */
    private int reply_count;

    /** Creates an instance with every field left at its Java default. */
    public WangYiNews(){
    }

    /**
     * Creates a fully populated instance.
     *
     * @param id          primary key; the article URL
     * @param title       article title
     * @param source      article source
     * @param time        publication time
     * @param content     short summary
     * @param reply_count number of follow-up posts
     */
    public WangYiNews(String id,String title,String source
            ,Date time,String content,int reply_count)
    {
        this.id = id;
        this.title = title;
        this.source = source;
        this.time = time;
        this.content = content;
        this.reply_count = reply_count;
    }

    /** @return the primary key (article URL) */
    public String getId() {
        return this.id;
    }

    /** @param id the primary key (article URL) */
    public void setId(String id) {
        this.id = id;
    }

    /** @return the article title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the article title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the article source */
    public String getSource() {
        return this.source;
    }

    /** @param source the article source */
    public void setSource(String source) {
        this.source = source;
    }

    /** @return the publication time (the stored instance, not a copy) */
    public Date getTime() {
        return this.time;
    }

    /** @param time the publication time (stored as given, not copied) */
    public void setTime(Date time) {
        this.time = time;
    }

    /** @return the short summary */
    public String getContent() {
        return this.content;
    }

    /** @param content the short summary */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the number of follow-up posts */
    public int getReply_count() {
        return this.reply_count;
    }

    /** @param reply_count the number of follow-up posts */
    public void setReply_count(int reply_count) {
        this.reply_count = reply_count;
    }

    /**
     * Renders all fields as
     * {@code new WangYiNews[id = ..., title = ..., ..., reply_count = ...]}.
     */
    @Override
    public String toString()
    {
        StringBuilder text = new StringBuilder("new WangYiNews[");
        text.append("id = ").append(id);
        text.append(", title = ").append(title);
        text.append(", source = ").append(source);
        text.append(", time = ").append(time);
        text.append(", content = ").append(content);
        text.append(", reply_count = ").append(reply_count);
        text.append("]");
        return text.toString();
    }
}
package com.zhiwei.wangyi.parse;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.toutiao.util.Tools;
import com.zhiwei.wangyi.bean.WangYiNews;
import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
/**
 * Fetches and parses the article history of a NetEase (WangYi) subscription
 * account, page by page.
 *
 * <p>The class keeps no mutable static state: the "keep paging" decision is
 * carried per call, so one finished crawl does not affect the next one.
 */
public class WangyiNewParse {
    private static final Logger logger = LoggerFactory.getLogger(WangyiNewParse.class);

    /** Number of articles requested per page (also the offset step in the URL). */
    private static final int PAGE_SIZE = 20;

    /** Pause between two page requests, in milliseconds. */
    private static final int PAGE_INTERVAL_MILLIS = 10000;

    /**
     * How many null HTTP responses in a row are tolerated before the crawl
     * gives up. Without a bound, a permanently failing endpoint would keep
     * the loop running forever.
     */
    private static final int MAX_CONSECUTIVE_EMPTY_RESPONSES = 3;

    /** Outcome of parsing one page: the accepted articles and whether paging should go on. */
    private static final class PageResult {
        private final List<WangYiNews> items;
        private final boolean hasMore;

        private PageResult(List<WangYiNews> items, boolean hasMore) {
            this.items = items;
            this.hasMore = hasMore;
        }
    }

    /**
     * Collects the articles of one subscription account that were published
     * after {@code endTime}.
     *
     * <p>Pages are requested in order, with a pause between requests. Paging
     * stops when a page is empty, when a page cannot be parsed, when a page
     * contains an article that is not after {@code endTime}, or after
     * {@value #MAX_CONSECUTIVE_EMPTY_RESPONSES} consecutive null responses.
     * A single null response only skips that page.
     *
     * @param tid     identifier placed in the request path
     *                (presumably the subscription id; confirm against callers)
     * @param endTime lower time bound; only articles strictly after it are returned
     * @return the collected articles, never {@code null}
     * @throws Exception if the header lookup or the HTTP request throws
     */
    public static List<WangYiNews> getWYHistory(String tid,Date endTime) throws Exception
    {
        List<WangYiNews> list = new ArrayList<WangYiNews>();
        Map<String,String> headerMap = Tools.getWangYiHeader();
        int consecutiveEmpty = 0;
        for(int page = 0; ; page++)
        {
            String url = "http://c.m.163.com/nc/subscribe/list/"+tid+"/all/"+page*PAGE_SIZE+"-"+PAGE_SIZE+".html";
            logger.info("request url: {}", url);
            String htmlBody = HttpClientTemplateOK.get(url, null,headerMap);

            boolean hasMore;
            if(htmlBody == null)
            {
                // Skip this page, but do not retry forever on a dead endpoint.
                consecutiveEmpty++;
                hasMore = consecutiveEmpty < MAX_CONSECUTIVE_EMPTY_RESPONSES;
            }else
            {
                consecutiveEmpty = 0;
                PageResult result = analysis(htmlBody,endTime);
                list.addAll(result.items);
                hasMore = result.hasMore;
            }

            if(!hasMore)
            {
                // Leave before sleeping: nothing is requested after the last page.
                break;
            }
            ZhiWeiTools.sleep(PAGE_INTERVAL_MILLIS);
        }
        return list;
    }

    /**
     * Parses one page of the NetEase subscription article list.
     *
     * <p>The account name is read from {@code subscribe_info.tname} and the
     * articles from {@code tab_list}. An entry whose fields cannot be parsed
     * is logged and skipped. If the overall structure is not as expected, the
     * error is logged and paging is stopped; entries parsed before the failure
     * are still returned.
     *
     * @param htmlBody response body, expected to be a JSON object
     * @param endTime  lower time bound; entries not after it are dropped and end paging
     * @return the accepted articles of this page and whether another page should be fetched
     */
    private static PageResult analysis(String htmlBody,Date endTime)
    {
        List<WangYiNews> dataList = new ArrayList<WangYiNews>();
        boolean hasMore = true;
        try {
            JSONObject dataJosn = JSONObject.parseObject(htmlBody);
            // Account (source) name shared by every article on the page.
            JSONObject subscribe_info = dataJosn.getJSONObject("subscribe_info");
            String source = subscribe_info.getString("tname");
            // Article list.
            JSONArray tab_list = dataJosn.getJSONArray("tab_list");
            if(tab_list.size() == 0)
            {
                // An empty page means the history is exhausted.
                return new PageResult(dataList,false);
            }
            for(int i = 0; i<tab_list.size(); i++)
            {
                JSONObject data = tab_list.getJSONObject(i);
                try {
                    String url = "https://c.m.163.com/news/a/"+ data.getString("docid")+".html?spss=newsapp&spsw=1";
                    String title = data.getString("title");
                    String content = data.getString("aheadBody");
                    Date time = TimeParse.stringFormartDate(data.getString("ptime"));
                    int reply_count = data.getIntValue("replyCount");
                    if(time.after(endTime))
                    {
                        dataList.add(new WangYiNews(url,title,source,time,content,reply_count));
                    }else {
                        // Reached the time bound: finish this page, then stop paging.
                        hasMore = false;
                    }
                } catch (Exception e) {
                    // Pass the throwable itself so the message and stack trace are logged.
                    logger.error("网易号历史文章解析,单个字段解析出现问题",e);
                }
            }
        } catch (Exception e) {
            logger.error("网易号历史文章解析,需要解析的文本结构有问题",e);
            return new PageResult(dataList,false);
        }
        return new PageResult(dataList,hasMore);
    }
}
//package com.zhiwei.toutiao.test;
//
//import java.util.List;
//
//import org.junit.Test;
//
//import com.zhiwei.toutiao.bean.TouTiaoAccount;
//import com.zhiwei.toutiao.parse.TouTiaoAccountParse;
//
///**
// * @ClassName: TouTiaoAccountExample
// * @Description: TODO(今日头条帐号采集)
// * @author hero
// * @date 2017年10月17日 下午4:03:44
// */
//public class TouTiaoAccountExample {
//
// public void touTiaoAccountTest(){
// String word = "华尔街瞭望";
// System.out.println("===================="+TouTiaoAccountParse.getTouTiaoAccountInfoByName(word, null));
// }
//
//
//
// @Test
// public void touTiaoAccountFriendTest(){
// String userid = "3350881978";
// List<TouTiaoAccount> userList = TouTiaoAccountParse.getFriendsList(userid, null,1000);
// for(TouTiaoAccount tta : userList){
// System.out.println(tta);
// }
// }
//}
//package com.zhiwei.toutiao.test;
//
//import java.util.List;
//import java.util.Map;
//
//import com.zhiwei.toutiao.bean.TouTiaoArticle;
//import com.zhiwei.toutiao.parse.TouTiaoChannelParse;
//import com.zhiwei.toutiao.util.Tools;
//
///**
// * @ClassName: TouTiaoChannelExample
// * @Description: TODO(头条频道解析测试)
// * @author hero
// * @date 2017年7月24日 下午5:10:52
// */
//public class TouTiaoChannelExample {
//
// public static void main(String[] args) {
//
// long max_behot_time = 0;
// for(int i= 0;i<3; i++){
// System.out.println("i=============="+i);
// if( i==0 ){
// max_behot_time = 0;
// }
// String as = Tools.getAS().split("_")[0];
// String cp = Tools.getAS().split("_")[1];
// String url = "http://www.toutiao.com/api/pc/feed/?category=news_tech&utm_source=toutiao"
// + "&widen=1&max_behot_time="+max_behot_time+"&max_behot_time_tmp="+max_behot_time
// +"&tadrequire=true&as=" +as +"&cp=" + cp;
// System.out.println("url:" + url);
//
// Map<String,Object> result = TouTiaoChannelParse.touTiaoChannel(url, null);
// if(result!=null){
// Long next = (Long)result.get("next");
// List<TouTiaoArticle> ttList = (List<TouTiaoArticle>)result.get("data");
// System.out.println("ttlist size is " + ttList.size());
// for(TouTiaoArticle tt : ttList){
// System.out.println(tt);
// }
// if(next != null){
// max_behot_time = next;
// }else{
// break;
// }
// }
// }
// }
//
//}
//package com.zhiwei.toutiao.test;
//
//import java.net.InetSocketAddress;
//import java.net.Proxy;
//import java.net.Proxy.Type;
//import java.util.ArrayList;
//import java.util.List;
//
//import com.zhiwei.toutiao.bean.TouTiaoComment;
//import com.zhiwei.toutiao.parse.TouTiaoCommentParse;
//
///**
// * @ClassName: TouTiaoCommentExample
// * @Description: TODO(今日头条评论测试)
// * @author hero
// * @date 2016年12月9日 下午8:08:02
// */
//public class TouTiaoCommentExample {
// private static String hostname = "192.168.9.37";
// private static int host = 31128;
// private static Proxy proxy = new Proxy(Type.HTTP, new InetSocketAddress(hostname, host));
//
// public static void main(String[] args) throws Exception {
//
// TouTiaoCommentParse touTiaoComment = new TouTiaoCommentParse();
//
// List<String> mids = new ArrayList<String>();
// mids.add("https://www.toutiao.com/a6549289895376978436/");
//
// for(String mid : mids)
// {
// List<TouTiaoComment> list = touTiaoComment.getTouTiaoComment(mid, null);
// System.out.println(mid+"============="+list.size());
// for(TouTiaoComment ttc : list)
// {
// System.out.println(ttc);
//// DBObject doc = new BasicDBObject();
//// doc.put("_id", ttc.getId());
//// doc.put("text", ttc.getText());
//// doc.put("time", ttc.getTime());
//// doc.put("username", ttc.getUserName());
//// doc.put("reply_count", ttc.getReply_count());
//// doc.put("digg_count", ttc.getDigg_count());
//// doc.put("source_url", ttc.getId());
//// touTiaoCommentDAO.addTouTiaoComment(doc);
// }
// }
//
// }
//
//
//}
///**
// * @Title: TouTiaoExample.java
// * @Package com.zhiwei.toutiao.test
// * @Description:
// * @author hero
// * @date 2016年9月2日 上午11:48:51
// * @version V1.0
// */
///**
//*
//*/
//package com.zhiwei.toutiao.test;
//
//import java.util.ArrayList;
//import java.util.Date;
//import java.util.List;
//import java.util.Map;
//
//import com.zhiwei.toutiao.bean.TouTiaoArticle;
//import com.zhiwei.toutiao.parse.TouTiaoArticleParse;
//import com.zhiwei.toutiao.util.Tools;
//import com.zhiwei.zhiweiTools.timeParse.TimeParse;
//
///**
// * @Description:
// * @author hero
// * @date 2016年9月2日 上午11:48:51
// */
//public class TouTiaoExample {
//
// @SuppressWarnings("unchecked")
// public static void main(String[] args) throws Exception {
// long a = System.currentTimeMillis();
// List<String> urlList = new ArrayList<String>();
// urlList.add("6859134443");
//
// System.out.println(urlList.size());
//
// Date endTime = TimeParse.stringFormartDate("2018-04-01");
//
// for (String url : urlList) {
// String mid = url;
// String max_behot_time = "0";
// while (true) {
// Map<String, Object> dataMap = null;
// dataMap = TouTiaoArticleParse.getTouTiaoList(mid, max_behot_time, endTime,null);
// if (dataMap != null) {
// List<TouTiaoArticle> ttlist = (List<TouTiaoArticle>) dataMap.get("data");
// max_behot_time = (String) dataMap.get("max_behot_time");
// System.out.println(max_behot_time + "=======" + ttlist.size());
// if (max_behot_time == null || ttlist.isEmpty()) {
// break;
// } else {
// if (ttlist.size() > 0) {
// for (TouTiaoArticle tt : ttlist) {
// System.out.println(tt);
// }
// }
// }
// }
// }
// }
// long b = System.currentTimeMillis();
// System.out.println("一轮的采集时间为:" + (b - a) / 1000);
// }
//
//}
///**
// * @Title: TouTiaoExample.java
// * @Package com.zhiwei.toutiao.test
// * @Description:
// * @author hero
// * @date 2016年9月2日 上午11:48:51
// * @version V1.0
// */
//package com.zhiwei.toutiao.test;
//import java.util.Date;
//import java.util.List;
//import java.util.Map;
//import com.zhiwei.toutiao.bean.TouTiaoArticle;
//import com.zhiwei.toutiao.parse.TouTiaoArticleParse;
//import com.zhiwei.zhiweiTools.timeParse.TimeParse;
//import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
//
///**
// * @Description:
// * @author hero
// * @date 2016年9月2日 上午11:48:51
// */
//public class TouTiaoMicroExample {
//
// public static void main(String[] args) throws Exception {
// long a = System.currentTimeMillis();
// String user_id = "55301399445";
// Date date = new Date((new Date().getTime()-24*60*60*1000));
// parseMicroTouTiao(user_id, date);
// long b = System.currentTimeMillis();
// System.out.println("一轮的采集时间为:" + (b - a) / 1000);
//
// }
//
//
// @SuppressWarnings("unchecked")
// public static void parseMicroTouTiao(String user_id, Date endDate){
// int count = 1;
// boolean f = true;
// String max_behot_time = null;
// while(f)
// {
// if(count==3){
// f = false;
// }
// for(int i=0; i<3; i++){
// try {
// Map<String, Object> dataMap = TouTiaoArticleParse.getMicroTouTiaoCrawler(user_id, endDate, null, max_behot_time);
// List<TouTiaoArticle> ttlist = null;
// if(dataMap!=null && !dataMap.isEmpty())
// {
// ttlist = (List<TouTiaoArticle>) dataMap.get("data");
// max_behot_time = dataMap.get("max_behot_time")!=null?dataMap.get("max_behot_time").toString():null;
// if (ttlist!=null && ttlist.size() > 0)
// {
// for(TouTiaoArticle touTiaoArticle : ttlist){
// System.out.println(TimeParse.dateFormartString(touTiaoArticle.getTime(), "yyyy-MM-dd HH:mm:ss"));
// }
// }
// count++;
// break;
// }else{
// continue;
// }
// } catch (Exception e) {
// e.printStackTrace();
// continue;
// }
// }
// ZhiWeiTools.sleep(7000);
// }
// }
//
//
//
//}
//package com.zhiwei.toutiao.test;
//
//import java.util.ArrayList;
//import java.util.HashMap;
//import java.util.List;
//import java.util.Map;
//
//import com.zhiwei.proxyip.util.Tools;
//import com.zhiwei.toutiao.bean.TouTiaoQuestionAnswer;
//import com.zhiwei.toutiao.parse.TouTiaoQuestionAnswerParse;
//import com.zhiwei.zhiweiTools.excel.PoiExcelUtil;
//import com.zhiwei.zhiweiTools.timeParse.TimeParse;
//
///**
// * @ClassName: TouTiaoQuestionAnswerExample
// * @Description: TODO(头条问答回答测试)
// * @author hero
// * @date 2017年7月28日 下午8:38:54
// */
//public class TouTiaoQuestionAnswerExample {
//
//
//
//
// public static void main(String[] args) {
//
// String path = "E://头条问答采集需求.xlsx";
// String write_path = "E://头条问答回答列表0801.xlsx";
//
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// Map<String,Object> map = poi.importExcel(path, 0);
// List<Map<String,Object>> dataMap = (List<Map<String, Object>>) map.get("body");
//
// List<String> headerList = new ArrayList<String>();
// headerList.add("问题链接");
// headerList.add("问题标题");
// headerList.add("回答用户uid");
// headerList.add("回答用户昵称");
// headerList.add("回答时间");
// headerList.add("回答内容");
// headerList.add("回答评论数");
// headerList.add("回答点赞数");
//
// List<Map<String,Object>> answerList = new ArrayList<Map<String,Object>>();
// for(Map<String,Object> data : dataMap){
// String title = data.get("标题").toString();
// String link = data.get("链接").toString();
// String[] questionIdes = link.split("/");
// System.out.println(questionIdes.length);
// String questionId = questionIdes[questionIdes.length-1];
// questionId = questionId.substring(1, questionId.length());
// System.out.println(link+"========"+questionId);
//
// int page = 0;
// int nextPage = 1;
// int req_type = 1;
// while(page != nextPage && req_type != 3){
// Map<String,Object> result = TouTiaoQuestionAnswerParse.getAnserList(questionId, page, req_type);
// System.out.println(result);
// page = (int) result.get("page");
// nextPage++;
// List<TouTiaoQuestionAnswer> ansList = (List<TouTiaoQuestionAnswer>) result.get("ansList");
// if(ansList.size()>0){
// for(TouTiaoQuestionAnswer answer : ansList){
// Map<String,Object> answerMap = new HashMap<String,Object>();
// answerMap.put("问题链接", link);
// answerMap.put("问题标题", title);
// answerMap.put("回答用户uid", answer.getUsername());
// answerMap.put("回答用户昵称", answer.getUser_id());
// answerMap.put("回答时间", TimeParse.dateFormartString(answer.getTime(), "yyyy-MM-dd HH:mm:ss"));
// answerMap.put("回答内容", answer.getContent());
// answerMap.put("回答评论数", answer.getComment_count());
// answerMap.put("回答点赞数", answer.getDigg_count());
// answerList.add(answerMap);
// }
// }else{
// req_type++;
// page = 0;
// nextPage = 1;
// }
// System.out.println(page+"=========="+nextPage+"============"+req_type);
// Tools.sleep(8000);
// }
// }
//
// poi.exportExcel(write_path, "0", headerList, answerList);
//
// }
//
//}
//package com.zhiwei.toutiao.test;
//
//import java.util.List;
//
//import org.junit.Test;
//
//import com.zhiwei.toutiao.bean.TouTiaoQuestion;
//import com.zhiwei.toutiao.parse.TouTiaoQuestionParse;
//import com.zhiwei.zhiweiTools.tools.URLCodeUtil;
//
///**
// * @ClassName: TouTiaoQuestionExample
// * @Description: TODO(头条问答采集测试类)
// * @author hero
// * @date 2017年7月20日 下午3:06:51
// */
//public class TouTiaoQuestionExample {
//
//
//
// @Test
// public void touTiaoQuestionTest(){
// String word = "京东";
//
// String url = "https://www.wukong.com/wenda/web/search/question/brow/?search_text="+
// URLCodeUtil.getURLEncode(word, "UTF-8")+"&count=15";
//
// List<TouTiaoQuestion> list = TouTiaoQuestionParse.getSearchTouTiaoQuestion(url);
// System.out.println(list.size());
// for(TouTiaoQuestion question : list){
// System.out.println(question);
// }
// }
//
//}
//package com.zhiwei.toutiao.test;
//
//import java.util.List;
//import java.util.Map;
//
//import com.zhiwei.toutiao.bean.TouTiaoArticle;
//import com.zhiwei.toutiao.parse.TouTiaoSearchParse;
//import com.zhiwei.zhiweiTools.tools.URLCodeUtil;
//
///**
// * @ClassName: TouTiaoSearchExample
// * @Description: TODO(头条搜索测试)
// * @author hero
// * @date 2017年7月24日 下午5:11:15
// */
//public class TouTiaoSearchExample {
//
// public static void main(String[] args) {
// String word = "京东";
// for (int i = 0; i < 3; i++) {
// String url = "http://www.toutiao.com/search_content/?offset=" + i * 20 + "&format=json&keyword="
// + URLCodeUtil.getURLDecode(word, "utf--8") + "&autoload=true&count=20&cur_tab=1";
// System.out.println(url);
// Map<String, Object> ttList;
// try {
// ttList = TouTiaoSearchParse.touTiaoSearchByWord(url,null);
// System.out.println("ttsize is : " + ttList.size());
// } catch (Exception e) {
// e.printStackTrace();
// }
//
// }
// }
//}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment