Commit 4c650e8d by yangchen

一点资讯,凤凰,搜狐等自媒体采集

parents
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>articlenewscrawler</artifactId>
<version>0.0.1-SNAPSHOT</version>
<name>articlenewscrawler</name>
<description>采集凤凰,一点资讯,搜狐历史文章和文章评论</description>
<dependencies>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.8.3</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.29</version>
</dependency>
<dependency>
<groupId>com.zhiwei</groupId>
<artifactId>zhiweiTools</artifactId>
<version>0.0.6-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.11</version>
</dependency>
<dependency>
<groupId>com.zhiwei</groupId>
<artifactId>excelpoi</artifactId>
<version>0.0.1-SNAPSHOT</version>
</dependency>
</dependencies>
<!-- 打包管理 -->
<build>
<plugins>
<!-- 发布源码 -->
<plugin>
<artifactId>maven-source-plugin</artifactId>
<version>2.4</version>
<configuration>
<attach>true</attach>
</configuration>
<executions>
<execution>
<phase>compile</phase>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>2.10.4</version>
</plugin>
<!-- Fix garbled Chinese characters in the console output when running "mvn test" -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.19.1</version>
<configuration>
<forkMode>once</forkMode>
<argLine>-Dfile.encoding=UTF-8</argLine>
</configuration>
</plugin>
</plugins>
</build>
<!-- 分发管理:管理distribution和supporting files -->
<distributionManagement>
<snapshotRepository>
<id>nexus-releases</id>
<name>User Project Snapshot</name>
<url>http://192.168.0.30:8081/nexus/content/repositories/snapshots/</url>
<uniqueVersion>true</uniqueVersion>
</snapshotRepository>
<repository>
<id>nexus-releases</id>
<name>User Project Release</name>
<url>http://192.168.0.30:8081/nexus/content/repositories/releases/</url>
</repository>
</distributionManagement>
</project>
\ No newline at end of file
package com.zhiwei.httpclient;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
/**
 * Factory for the fixed HTTP request headers (and a few POST parameter maps)
 * used when crawling each supported platform.
 *
 * <p>Every {@code get...HeaderMap} method returns a new mutable map holding
 * {@code User-Agent}, {@code Accept}, {@code Accept-Language},
 * {@code Connection} and {@code Host}; a {@code Cookie} entry is added only
 * when the supplied cookie is not {@code null}.
 */
public class HeadGet {

    /** Desktop Chrome 61 User-Agent string. */
    private static final String UA_CHROME_61 =
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36";

    /** Desktop Chrome 62 User-Agent string. */
    private static final String UA_CHROME_62 =
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36";

    /** Accept value used for HTML page requests. */
    private static final String ACCEPT_HTML =
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8";

    /** Accept value used for API requests that take any content type. */
    private static final String ACCEPT_ANY = "*/*";

    /** Accept-Language value shared by most desktop-browser style requests. */
    private static final String LANG_ZH_CN = "zh-CN,zh;q=0.9";

    /**
     * Builds one header map; shared by all public header factories below.
     *
     * @param host           value of the {@code Host} header
     * @param userAgent      value of the {@code User-Agent} header
     * @param accept         value of the {@code Accept} header
     * @param acceptLanguage value of the {@code Accept-Language} header
     * @param cookie         value of the {@code Cookie} header, or {@code null} to omit it
     * @return a new mutable header map
     */
    private static Map<String, String> buildHeaderMap(String host, String userAgent, String accept,
            String acceptLanguage, String cookie) {
        Map<String, String> headerMap = new HashMap<String, String>();
        headerMap.put("User-Agent", userAgent);
        headerMap.put("Accept", accept);
        headerMap.put("Accept-Language", acceptLanguage);
        headerMap.put("Connection", "keep-alive");
        headerMap.put("Host", host);
        if (cookie != null) {
            headerMap.put("Cookie", cookie);
        }
        return headerMap;
    }

    /**
     * Headers for fetching the historical articles of a Sohu account.
     *
     * @param cookie cookie to send, or {@code null} for none
     * @return a new header map
     */
    public static Map<String, String> getSouhuAccountHeaderMap(String cookie) {
        return buildHeaderMap("mp.sohu.com", UA_CHROME_61, ACCEPT_HTML, "zh-CN,zh;q=0.8", cookie);
    }

    /**
     * Headers for fetching Sohu article comments.
     *
     * @param cookie cookie to send, or {@code null} for none
     * @return a new header map
     */
    public static Map<String, String> getSouhuCommentHeaderMap(String cookie) {
        return buildHeaderMap("apiv2.sohu.com", UA_CHROME_62, ACCEPT_ANY, LANG_ZH_CN, cookie);
    }

    /**
     * Headers for fetching the historical articles of a Yidianzixun channel.
     *
     * @param cookie cookie to send, or {@code null} for none
     * @return a new header map
     */
    public static Map<String, String> getYidianzixunAccountHeaderMap(String cookie) {
        return buildHeaderMap("www.yidianzixun.com", UA_CHROME_62, ACCEPT_HTML, LANG_ZH_CN, cookie);
    }

    /**
     * Headers for fetching Yidianzixun article comments.
     *
     * @param cookie cookie to send, or {@code null} for none
     * @return a new header map
     */
    public static Map<String, String> getYidianzixunCommentHeaderMap(String cookie) {
        return buildHeaderMap("www.yidianzixun.com", UA_CHROME_62, ACCEPT_HTML, LANG_ZH_CN, cookie);
    }

    /**
     * Headers for fetching the historical articles of an ifeng (Fenghuang) account
     * through the mobile API host.
     *
     * @param cookie cookie to send, or {@code null} for none
     * @return a new header map
     */
    public static Map<String, String> getFenghuangAccountHeaderMap(String cookie) {
        return buildHeaderMap("api.3g.ifeng.com",
                "%E5%87%A4%E5%87%B0%E6%96%B0%E9%97%BB/5.7.4.0 CFNetwork/811.5.4 Darwin/16.7.0",
                ACCEPT_ANY, "zh-cn", cookie);
    }

    /**
     * Headers for fetching ifeng (Fenghuang) article comments.
     *
     * @param cookie cookie to send, or {@code null} for none
     * @return a new header map
     */
    public static Map<String, String> getFenghuangCommentHeaderMap(String cookie) {
        return buildHeaderMap("user.iclient.ifeng.com",
                "IfengNews/5.7.4 (iPhone; iOS 10.3.3; Scale/2.00)",
                ACCEPT_ANY, "zh-Hans-CN;q=1", cookie);
    }

    /**
     * Headers for the ifeng (Fenghuang) keyword search.
     *
     * @param cookie cookie to send, or {@code null} for none
     * @return a new header map
     */
    public static Map<String, String> getFenghuangWordHeaderMap(String cookie) {
        return buildHeaderMap("search.ifeng.com", UA_CHROME_62, ACCEPT_HTML, LANG_ZH_CN, cookie);
    }

    /**
     * Headers for the Yidianzixun keyword search.
     *
     * @param cookie cookie to send, or {@code null} for none
     * @return a new header map
     */
    public static Map<String, String> getYidianzixunWordHeaderMap(String cookie) {
        return buildHeaderMap("www.yidianzixun.com", UA_CHROME_62, ACCEPT_HTML, LANG_ZH_CN, cookie);
    }

    /**
     * Headers for fetching the historical articles of a Baijiahao account.
     *
     * @param cookie cookie to send, or {@code null} for none
     * @return a new header map
     */
    public static Map<String, String> getBaijiaAccountHeaderMap(String cookie) {
        return buildHeaderMap("baijia.baidu.com", UA_CHROME_62,
                "application/json, text/javascript, */*; q=0.01", LANG_ZH_CN, cookie);
    }

    /**
     * Headers for fetching the historical articles of a Dayu account.
     *
     * @param cookie cookie to send, or {@code null} for none
     * @return a new header map
     */
    public static Map<String, String> getDayuAccountHeaderMap(String cookie) {
        return buildHeaderMap("ff.dayu.com", UA_CHROME_62, ACCEPT_HTML, LANG_ZH_CN, cookie);
    }

    /**
     * Headers for fetching Dayu article comments.
     *
     * @param cookie cookie to send, or {@code null} for none
     * @return a new header map
     */
    public static Map<String, String> getDayuCommentHeaderMap(String cookie) {
        return buildHeaderMap("m.uczzd.cn", UA_CHROME_62, ACCEPT_HTML, LANG_ZH_CN, cookie);
    }

    /**
     * Headers for fetching the historical articles of a Penguin (QQ) account.
     *
     * @param cookie cookie to send, or {@code null} for none
     * @return a new header map
     */
    public static Map<String, String> getQQAccountHeaderMap(String cookie) {
        return buildHeaderMap("r.cnews.qq.com",
                "天天快报 4.6.0 qnreading (iPhone8,1; iOS 10.3.3; zh_CN; 4.6.0.81)",
                ACCEPT_ANY, "zh-Hans-CN;q=1", cookie);
    }

    /**
     * POST parameters carrying a single {@code chlid} entry.
     *
     * @param chlid value stored under the {@code chlid} key
     * @return a new parameter map
     */
    public static Map<String, Object> getQQAccountOneParamMap(String chlid) {
        Map<String, Object> paramMap = new HashMap<String, Object>();
        paramMap.put("chlid", chlid);
        return paramMap;
    }

    /**
     * POST parameters carrying a single {@code ids} entry.
     *
     * @param ids value stored under the {@code ids} key
     * @return a new parameter map
     */
    public static Map<String, Object> getQQAccountOtherParamMap(String ids) {
        Map<String, Object> paramMap = new HashMap<String, Object>();
        paramMap.put("ids", ids);
        return paramMap;
    }

    /**
     * Headers for fetching a Miaopai video page by URL.
     *
     * @param cookie cookie to send, or {@code null} for none
     * @return a new header map
     */
    public static Map<String, String> getMiaoPaiByURlHeaderMap(String cookie) {
        return buildHeaderMap("www.miaopai.com", UA_CHROME_62, ACCEPT_HTML, LANG_ZH_CN, cookie);
    }

    /**
     * Headers for the Meipai keyword search (the host is www.meipai.com).
     *
     * @param cookie cookie to send, or {@code null} for none
     * @return a new header map
     */
    public static Map<String, String> getMeipaiBywordHeaderMap(String cookie) {
        return buildHeaderMap("www.meipai.com", UA_CHROME_62, ACCEPT_HTML, LANG_ZH_CN, cookie);
    }

    /**
     * Manual smoke check: downloads one Miaopai video page and prints the
     * text of the author link.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String url = "http://www.miaopai.com/show/H99oVYnsv47ejBqK8TMZXA__.htm";
        Map<String, String> headerMap = HeadGet.getMiaoPaiByURlHeaderMap(null);
        String result = HttpClient.executeHttpRequestGet(url, headerMap);
        // executeHttpRequestGet returns null when the request fails; do not hand that to Jsoup.
        if (result == null) {
            System.err.println("request failed: " + url);
            return;
        }
        Document doc = Jsoup.parse(result);
        String s = doc.select("body > div.box885 > div.contentLeft.contentLeft_detail > div.videoList.video_detail > div.videoIntr > div > div.personalAbout > div.personalData > p.personalDataN > a").text();
        System.out.println(s);
    }
}
package com.zhiwei.httpclient;
import java.io.IOException;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK;
/**
 * Thin wrapper around {@code HttpClientTemplateOK} that turns any failure
 * into a logged error and a {@code null} result.
 */
public class HttpClient {
    private static final Logger logger = LoggerFactory.getLogger(HttpClient.class);

    /**
     * Executes an HTTP GET request.
     *
     * @param url       address to request
     * @param headerMap request headers to send
     * @return the value returned by {@code HttpClientTemplateOK.get}, or
     *         {@code null} when that call throws
     */
    public static String executeHttpRequestGet(String url, Map<String, String> headerMap) {
        try {
            // NOTE(review): the second argument is always null here; its meaning is defined by HttpClientTemplateOK.
            return HttpClientTemplateOK.get(url, null, headerMap);
        } catch (Exception e) {
            // Pass the exception itself as the last argument so the stack trace is kept.
            logger.error("httpClient 获取数据出现问题:{}", e.getMessage(), e);
            return null;
        }
    }

    /**
     * Executes an HTTP POST request.
     *
     * @param url       address to request
     * @param headerMap request headers to send
     * @param paramMap  request parameters to send
     * @return the value returned by {@code HttpClientTemplateOK.post}, or
     *         {@code null} when that call throws
     */
    public static String executeHttpRequestPost(String url, Map<String, String> headerMap, Map<String, Object> paramMap) {
        try {
            return HttpClientTemplateOK.post(url, null, headerMap, paramMap);
        } catch (Exception e) {
            logger.error("httpClient 获取数据出现问题:{}", e.getMessage(), e);
            return null;
        }
    }
}
package com.zhiwei.httpclient;
/**
 * Scratch program that demonstrates stripping tag-like spans from an HTML fragment.
 */
public class Test {
    /**
     * Removes every {@code <...>} span (matched non-greedily) from a sample
     * fragment and prints what is left on one line.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        final String html = "<p>原标题:变态!浙江某大学惊现偷窥狂魔,拍下700张女生隐私部位照片!</p><p>来源:舟山公安</p><p>版权归原作者所有,如有侵权请联系我们</p><p>";
        final String plainText = java.util.regex.Pattern.compile("<.*?>").matcher(html).replaceAll("");
        System.out.println(plainText);
    }
}
package com.zhiwei.parse;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.parse.analysis.BaijiaAccountAnalysis;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
/**
 * Crawler entry point for Baijiahao account articles.
 */
public class Baijia {
    private static final Logger logger = LoggerFactory.getLogger(Baijia.class);
    private static BaijiaAccountAnalysis baijiaAccountAnalysis = new BaijiaAccountAnalysis();

    /** Number of articles requested per page; also the step of the {@code _skip} offset. */
    private static final int PAGE_SIZE = 20;

    /** How many times in a row the same page may fail before the crawl gives up. */
    private static final int MAX_CONSECUTIVE_FAILURES = 3;

    /**
     * Pages through the article list of one Baijiahao account and collects
     * the parsed rows.
     *
     * <p>Paging stops when the parser returns {@code null} or an empty list.
     * A page that throws is retried after a 5 second pause, but only up to
     * {@link #MAX_CONSECUTIVE_FAILURES} times in a row, so a persistent error
     * can no longer retry the same page forever.
     *
     * @param app_id    account identifier placed in the {@code app_id} query parameter
     * @param startTime passed through unchanged to the page parser
     * @return the rows collected so far; never {@code null}
     */
    public static List<Map<String,Object>> getBaijiaAccountData(String app_id,String startTime) {
        List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
        String baseUrl = "https://baijia.baidu.com/writerlistarticle?ajax=json&app_id="+app_id+"&_limit=20&_skip=";
        int skip = 0;
        int failures = 0;
        try {
            Map<String,String> headerMap = HeadGet.getBaijiaAccountHeaderMap(null);
            while (true) {
                String pageUrl = baseUrl + skip;
                try {
                    System.out.println(pageUrl);
                    String result = HttpClient.executeHttpRequestGet(pageUrl, headerMap);
                    List<Map<String,Object>> list = baijiaAccountAnalysis.getBaijiaAccountData(result, startTime);
                    if (list == null || list.isEmpty()) {
                        break;
                    }
                    dataList.addAll(list);
                    skip += PAGE_SIZE;
                    failures = 0;
                    ZhiWeiTools.sleep(6000);
                } catch (Exception e) {
                    failures++;
                    logger.error("此页解析出错", e);
                    if (failures >= MAX_CONSECUTIVE_FAILURES) {
                        // Same page keeps failing: stop instead of looping forever.
                        logger.error("获取百家号历史文章失败: {}", pageUrl);
                        break;
                    }
                    ZhiWeiTools.sleep(5000);
                }
            }
        } catch (Exception e) {
            logger.error("获取百家号历史文章失败", e);
        }
        return dataList;
    }
}
package com.zhiwei.parse;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.parse.analysis.DayuAccountAnalysis;
import com.zhiwei.parse.analysis.DayuCommentAnalysis;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
/**
 * Crawler entry points for Dayu account articles and article comments.
 */
public class Dayu {
    private static final Logger logger = LoggerFactory.getLogger(Dayu.class);
    private static DayuAccountAnalysis dayuAccountAnalysis = new DayuAccountAnalysis();
    private static DayuCommentAnalysis dayuCommentAnalysis = new DayuCommentAnalysis();

    /**
     * Pages through the published articles of one Dayu author, starting at
     * page 0, until the parser returns {@code null} or an empty list.
     *
     * @param mid       author identifier placed in the request path
     * @param name      passed through unchanged to the page parser
     * @param startTime passed through unchanged to the page parser
     * @return the rows collected before paging ended or an error occurred; never {@code null}
     */
    public static List<Map<String,Object>> getDayuAccountData(String mid,String name,String startTime) {
        Map<String,String> headerMap = HeadGet.getDayuAccountHeaderMap(null);
        List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
        try {
            for (int page = 0; ; page++) {
                String url = "http://ff.dayu.com/contents/author/"+mid+"?biz_id=1002&_size=50&_page="+page+"&_order_type=published_at&status=1&_fetch=1";
                String result = HttpClient.executeHttpRequestGet(url, headerMap);
                System.out.println(url);
                List<Map<String,Object>> lists = dayuAccountAnalysis.getDayuAccountData(result,name,startTime);
                if (lists == null || lists.isEmpty()) {
                    break;
                }
                dataList.addAll(lists);
                System.out.println("================解析第"+page+"页====此时有数据=="+dataList.size());
                ZhiWeiTools.sleep(8000);
            }
        } catch (Exception e) {
            logger.error("获取大鱼号历史文章出错", e);
        }
        return dataList;
    }

    /**
     * Collects the comments of one Dayu article.
     *
     * <p>The first request carries no {@code hotValue}; follow-up requests
     * append {@code &hotValue=} starting at 9991 and decreasing by 10 until
     * the parser returns {@code null} or an empty list.
     * NOTE(review): the meaning of {@code hotValue} is defined by the remote
     * API and cannot be confirmed from this file.
     *
     * @param articleId article identifier placed in the request path
     * @return the comments collected before paging ended or an error occurred; never {@code null}
     */
    public static List<Map<String,Object>> getDayuCommentData(String articleId) {
        Map<String,String> headerMap = HeadGet.getDayuCommentHeaderMap(null);
        String url = "http://m.uczzd.cn/iflow/api/v2/cmt/article/"+articleId+"/comments/byhot?sn=0&count=10&ts="+new Date().getTime();
        List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
        try {
            System.out.println(url);
            String result = HttpClient.executeHttpRequestGet(url, headerMap);
            List<Map<String,Object>> lists = dayuCommentAnalysis.getDayuCommentData(result,articleId);
            if (lists == null) {
                // No parsable first page: nothing to page through.
                return dataList;
            }
            dataList.addAll(lists);
            for (int hotValue = 9991; ; hotValue -= 10) {
                ZhiWeiTools.sleep(5000);
                String pageUrl = url+"&hotValue="+hotValue;
                System.out.println(pageUrl);
                result = HttpClient.executeHttpRequestGet(pageUrl, headerMap);
                lists = dayuCommentAnalysis.getDayuCommentData(result,articleId);
                if (lists == null || lists.isEmpty()) {
                    break;
                }
                dataList.addAll(lists);
            }
        } catch (Exception e) {
            logger.error("获取评论失败", e);
        }
        return dataList;
    }

    /**
     * Reads the {@code data.comment_cnt} field of the comment API response
     * for one article.
     *
     * @param articleId article identifier placed in the request path
     * @return the comment count, or 0 when the request fails or the field is absent
     */
    public static int getDayuCommentCount(String articleId) {
        String url = "http://m.uczzd.cn/iflow/api/v2/cmt/article/"+articleId+"/comments/byhot";
        try {
            Map<String,String> headerMap = HeadGet.getDayuCommentHeaderMap(null);
            String result = HttpClient.executeHttpRequestGet(url, headerMap);
            JSONObject json = JSONObject.parseObject(result);
            JSONObject data = json == null ? null : json.getJSONObject("data");
            Integer count = data == null ? null : data.getInteger("comment_cnt");
            if (count == null) {
                logger.error("获取大鱼号评论数失败: {}", url);
                return 0;
            }
            return count.intValue();
        } catch (Exception e) {
            logger.error("获取大鱼号评论数失败", e);
            return 0;
        }
    }
}
package com.zhiwei.parse;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.parse.analysis.FenghuangAccountAnalysis;
import com.zhiwei.parse.analysis.FenghuangByWordAnalysis;
import com.zhiwei.parse.analysis.FenghuangCommentAnalysis;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
/**
 * Crawler entry points for ifeng (Fenghuang) account articles, article
 * comments and keyword search.
 */
public class Fenghuang {
    private static final Logger logger = LoggerFactory.getLogger(Fenghuang.class);
    private static FenghuangAccountAnalysis fenghuangAccountAnalysis = new FenghuangAccountAnalysis();
    private static FenghuangCommentAnalysis fenghuangCommentAnalysis = new FenghuangCommentAnalysis();
    private static FenghuangByWordAnalysis fenghuangByWordAnalysis = new FenghuangByWordAnalysis();

    /** Last result page requested by {@link #getFenghuangByWord(String)}. */
    private static final int MAX_SEARCH_PAGES = 75;

    /**
     * Pages through the articles of one account, starting at page 1, until
     * the parser returns {@code null} or an empty list.
     *
     * @param id        account id; sent as {@code followid=weMedia_<id>}
     * @param startTime optional lower time bound (format {@code 2017-12-09 17:53:02}),
     *                  passed through unchanged to the page parser
     * @return the rows collected before paging ended or an error occurred; never {@code null}
     */
    public static List<Map<String,Object>> getFenghuangAccountData(String id,String startTime) {
        List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
        int page = 1;
        while (true) {
            try {
                String url = "http://api.3g.ifeng.com/api_wemedia_index?followid=weMedia_"+id+"&page="+page+"&pagesize=20&tag=article";
                System.out.println("====================采集第"+page+"页");
                List<Map<String,Object>> list = fenghuangAccountAnalysis.getArticleData(url, startTime);
                if (list == null || list.isEmpty()) {
                    break;
                }
                dataList.addAll(list);
                ZhiWeiTools.sleep(2000);
                page++;
            } catch (Exception e) {
                logger.error("程序出错", e);
                return dataList;
            }
        }
        return dataList;
    }

    /**
     * Collects all comments of one article by paging from 1 until the parser
     * returns {@code null} or an empty list.
     *
     * @param url article URL; converted to the comment request URL by
     *            {@code FenghuangCommentAnalysis.getdocUrl}
     * @return the collected comments, or {@code null} when the comment URL
     *         cannot be derived
     */
    public static List<Map<String,Object>> getFenghuangCommentData(String url) {
        url = fenghuangCommentAnalysis.getdocUrl(url);
        if (url == null) {
            return null;
        }
        List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
        for (int page = 1; ; page++) {
            String pageUrl = url + page;
            System.out.println(pageUrl);
            ZhiWeiTools.sleep(2000);
            List<Map<String,Object>> list = fenghuangCommentAnalysis.getData(pageUrl);
            if (list == null || list.isEmpty()) {
                break;
            }
            dataList.addAll(list);
        }
        return dataList;
    }

    /**
     * Fetches the comment count information of one article.
     *
     * @param url article URL; converted to the comment request URL by
     *            {@code FenghuangCommentAnalysis.getdocUrl}
     * @return whatever {@code FenghuangCommentAnalysis.getFenghuangCommentCount}
     *         returns, or {@code null} when the comment URL cannot be derived
     */
    public static Map<String,Object> getFenghuangCommentCount(String url) {
        url = fenghuangCommentAnalysis.getdocUrl(url);
        if (url == null) {
            return null;
        }
        return fenghuangCommentAnalysis.getFenghuangCommentCount(url);
    }

    /**
     * Runs a keyword search and collects the parsed results of pages
     * 1 through {@link #MAX_SEARCH_PAGES}, stopping early when a page yields
     * {@code null} or no rows.
     *
     * @param word search keyword; URL-encoded as UTF-8 before being sent
     * @return the rows collected before paging ended or an error occurred; never {@code null}
     */
    public static List<Map<String,Object>> getFenghuangByWord(String word) {
        List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
        try {
            // The encoded keyword and the headers do not change between pages.
            String baseUrl = "http://search.ifeng.com/sofeng/search.action?q="+URLEncoder.encode(word, "UTF-8")+"&c=1&p=";
            Map<String,String> headerMap = HeadGet.getFenghuangWordHeaderMap(null);
            for (int page = 1; page <= MAX_SEARCH_PAGES; page++) {
                String result = HttpClient.executeHttpRequestGet(baseUrl + page, headerMap);
                List<Map<String,Object>> lists = fenghuangByWordAnalysis.getFenghuangByWord(result);
                if (lists == null || lists.isEmpty()) {
                    break;
                }
                dataList.addAll(lists);
                System.out.println(word+"===================以获取的数据==:" + dataList.size());
                if (page < MAX_SEARCH_PAGES) {
                    // No pause is needed after the final page.
                    ZhiWeiTools.sleep(4000);
                }
            }
        } catch (UnsupportedEncodingException e) {
            logger.error("依据关键词获取凤凰文章出错", e);
        } catch (IOException e) {
            logger.error("链接获取凤凰信息出错", e);
        }
        return dataList;
    }
}
package com.zhiwei.parse;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.parse.analysis.MeipaiByWordAnalysis;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
/**
 * Crawler entry point for the Meipai keyword search.
 */
public class Meipai {
    private static final Logger logger = LoggerFactory.getLogger(Meipai.class);
    private static MeipaiByWordAnalysis meipaiByWordAnalysis = new MeipaiByWordAnalysis();

    /**
     * Searches Meipai for a keyword and collects the data of every video
     * linked from the result pages.
     *
     * <p>Result pages are requested from page 1 upward until a page yields no
     * video links. Each link is then resolved through
     * {@code MeipaiByWordAnalysis.getMeipaiData}; {@code null} results are skipped.
     *
     * @param word search keyword; URL-encoded as UTF-8 before being sent
     * @return the rows collected before paging ended or an error occurred; never {@code null}
     */
    public static List<Map<String,Object>> getMeipaiByWordData(String word) {
        List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
        try {
            String url = "http://www.meipai.com/search/mv?q="+URLEncoder.encode(word, "UTF-8");
            Map<String,String> headerMap = HeadGet.getMeipaiBywordHeaderMap(null);
            for (int page = 1; ; page++) {
                ZhiWeiTools.sleep(5000);
                String pageUrl = url+"&page="+page;
                String result = HttpClient.executeHttpRequestGet(pageUrl, headerMap);
                List<String> urlList = meipaiByWordAnalysis.getURl(result);
                // Stop explicitly on a missing list instead of relying on a NullPointerException.
                if (urlList == null || urlList.isEmpty()) {
                    break;
                }
                for (String videoUrl : urlList) {
                    Map<String,Object> map = meipaiByWordAnalysis.getMeipaiData(headerMap,videoUrl);
                    if (map != null) {
                        dataList.add(map);
                    }
                    ZhiWeiTools.sleep(4000);
                }
                System.out.println(pageUrl+"==========="+dataList.size()+"条");
            }
        } catch (Exception e) {
            logger.error("依据关键词获取美拍数据出错", e);
        }
        return dataList;
    }
}
package com.zhiwei.parse;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
/**
 * Crawler entry point for a single Miaopai video page.
 */
public class Miaopai {
    private static final Logger logger = LoggerFactory.getLogger(Miaopai.class);

    /**
     * Downloads one Miaopai video page and extracts a few fields from it
     * with CSS selectors.
     *
     * <p>The returned map holds the keys {@code time}, {@code source},
     * {@code video_count} (the text before the first "观看"), {@code title}
     * and {@code url}. When the page cannot be downloaded an empty map is
     * returned and an error is logged.
     *
     * @param url address of the video page
     * @return the extracted fields, or an empty map when the download fails; never {@code null}
     */
    public static Map<String,Object> getMiaopaiDataByURL(String url) {
        Map<String,Object> dataMap = new HashMap<String,Object>();
        Map<String,String> headerMap = HeadGet.getMiaoPaiByURlHeaderMap(null);
        String result = HttpClient.executeHttpRequestGet(url, headerMap);
        // executeHttpRequestGet returns null on failure; Jsoup.parse must not receive null.
        if (result == null) {
            logger.error("获取秒拍页面失败: {}", url);
            return dataMap;
        }
        Document doc = Jsoup.parse(result);
        String time = doc.select("div.personalData > p.personalDataT > span:nth-child(1)").text();
        String source = doc.select("div.videoIntr > div > div.personalAbout > div.personalData > p.personalDataN > a").text();
        String viewText = doc.select("div.personalData > p.personalDataT > span.red").text();
        // split() yields an empty array when the text is exactly the delimiter, so guard the index.
        String[] viewParts = viewText.split("观看");
        String viewCount = viewParts.length > 0 ? viewParts[0] : "";
        String title = doc.select("div.videoList.video_detail > div.videoIntr > div > div.viedoAbout > p:nth-child(1)").text();
        dataMap.put("time", time);
        dataMap.put("source", source);
        dataMap.put("video_count", viewCount);
        dataMap.put("title", title);
        dataMap.put("url", url);
        return dataMap;
    }
}
package com.zhiwei.parse;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.parse.analysis.QQAccountAnalysis;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
/**
 * Crawler entry point for Penguin (QQ) account articles.
 */
public class QQ {
    private static final Logger logger = LoggerFactory.getLogger(QQ.class);
    private static QQAccountAnalysis qqAccountAnalysis = new QQAccountAnalysis();

    /** Number of article ids sent per detail request. */
    private static final int BATCH_SIZE = 20;

    /**
     * Fetches the article id index of one account, then requests the article
     * details in batches of {@link #BATCH_SIZE} ids (comma-separated in the
     * {@code ids} parameter), pausing 8 seconds before each batch.
     *
     * <p>A batch that fails is logged and skipped; the remaining ids are still
     * requested in full-size batches.
     *
     * @param child  account identifier sent as the {@code chlid} parameter
     * @param cookie cookie to send, or {@code null} for none
     * @return the rows collected from all successful batches; never {@code null}
     */
    public static List<Map<String,Object>> getQQAccountData(String child,String cookie) {
        Map<String,String> headerMap = HeadGet.getQQAccountHeaderMap(cookie);
        List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
        try {
            String indexResult = HttpClient.executeHttpRequestPost("http://r.cnews.qq.com/getSubNewsIndex",
                    headerMap, HeadGet.getQQAccountOneParamMap(child));
            List<String> idsList = qqAccountAnalysis.getQQAllIds(indexResult);
            if (idsList == null) {
                logger.error("获取企鹅号历史文章未完全成功: 未获取到文章id列表");
                return dataList;
            }
            System.out.println(idsList.size());
            String itemsUrl = "http://r.cnews.qq.com/getSubNewsListItems";
            for (int from = 0; from < idsList.size(); from += BATCH_SIZE) {
                int to = Math.min(from + BATCH_SIZE, idsList.size());
                String ids = joinIds(idsList.subList(from, to));
                try {
                    System.out.println(ids);
                    ZhiWeiTools.sleep(8000);
                    String result = HttpClient.executeHttpRequestPost(itemsUrl, headerMap,
                            HeadGet.getQQAccountOtherParamMap(ids));
                    List<Map<String,Object>> list = qqAccountAnalysis.analysisQQAccountData(result);
                    if (list != null) {
                        dataList.addAll(list);
                    }
                } catch (Exception e) {
                    // Skip only this batch; the batch window advances regardless of the failure.
                    logger.error("获取企鹅号历史文章未完全成功", e);
                }
            }
        } catch (Exception e) {
            logger.error("获取企鹅号历史文章未完全成功", e);
        }
        return dataList;
    }

    /** Joins the given ids with commas, without a trailing comma. */
    private static String joinIds(List<String> ids) {
        StringBuilder joined = new StringBuilder();
        for (int i = 0; i < ids.size(); i++) {
            if (i > 0) {
                joined.append(',');
            }
            joined.append(ids.get(i));
        }
        return joined.toString();
    }
}
package com.zhiwei.parse;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.parse.analysis.SouhuAccountAnalysis;
import com.zhiwei.parse.analysis.SouhuCommentAnalysis;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
/**
 * Crawler entry points for Sohu account articles and article comments.
 */
public class Souhu {
    private static final Logger logger = LoggerFactory.getLogger(Souhu.class);
    private static SouhuAccountAnalysis souhuAccountAnalysis = new SouhuAccountAnalysis();
    private static SouhuCommentAnalysis souhuCommentAnalysis = new SouhuCommentAnalysis();

    /** How many times in a row a page may fail before the article crawl gives up. */
    private static final int MAX_CONSECUTIVE_FAILURES = 3;

    /**
     * Fetches the number of comments of one Sohu article.
     *
     * @param url article URL; converted by {@code SouhuCommentAnalysis.getSouhuURL}
     * @return the comment count, or 0 when anything fails
     */
    public static int getSouhuCommentCount(String url) {
        try {
            String newurl = souhuCommentAnalysis.getSouhuURL(url);
            return souhuCommentAnalysis.getSouhuCommentCount(newurl);
        } catch (Exception e) {
            logger.error("搜狐获取评论数出错了", e);
            return 0;
        }
    }

    /**
     * Collects the historical articles of one Sohu account, page by page.
     *
     * <p>When {@code startTime} is given, each article's {@code time} value
     * is formatted as {@code yyyy-MM-dd HH:mm:ss} and compared as a string;
     * the first article older than {@code startTime} ends the crawl and is
     * not included. Every kept article is added exactly once.
     *
     * <p>A page that cannot be fetched or parsed is retried after the normal
     * 3 second pause, at most {@link #MAX_CONSECUTIVE_FAILURES} times in a row.
     *
     * @param xpt       account xpt identifier
     * @param startTime lower time bound in {@code yyyy-MM-dd HH:mm:ss} form, or {@code null} for no bound
     * @param isCulling when {@code true}, {@code &categoryId=-1} is appended to
     *                  every request (selected articles only, per the original note)
     * @return the collected articles; never {@code null}
     */
    public static List<Map<String,Object>> getSouHuAccountData(String xpt,String startTime,boolean isCulling) {
        List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
        Map<String,String> headerMap = HeadGet.getSouhuAccountHeaderMap(null);
        int page = 1;
        int failures = 0;
        boolean keepGoing = true;
        while (keepGoing && failures < MAX_CONSECUTIVE_FAILURES) {
            String url = "http://mp.sohu.com/apiV2/profile/newsListAjax?xpt="+xpt+"&pageNumber="+page+"&pageSize=10";
            if (isCulling) {
                url = url + "&categoryId=-1";
            }
            try {
                JSONArray jsonArray = parseNewsList(HttpClient.executeHttpRequestGet(url, headerMap));
                if (jsonArray == null) {
                    failures++;
                    logger.error("搜狐号历史文章接口无有效响应: {}", url);
                } else if (jsonArray.size() < 1) {
                    break;
                } else {
                    failures = 0;
                    List<Map<String,Object>> pageItems = souhuAccountAnalysis.analysisData(jsonArray);
                    keepGoing = appendUntilStartTime(dataList, pageItems, startTime);
                    page++;
                }
            } catch (Exception e) {
                failures++;
                logger.error("出错了", e);
            }
            ZhiWeiTools.sleep(3000);
        }
        return dataList;
    }

    /**
     * Decodes the article-list response: strips all backslashes and the first
     * and last character, parses the remainder as JSON and returns its
     * {@code data} array. Returns {@code null} when there is nothing usable.
     */
    private static JSONArray parseNewsList(String result) {
        if (result == null) {
            return null;
        }
        String unescaped = result.replaceAll("\\\\", "");
        if (unescaped.length() < 2) {
            return null;
        }
        JSONObject json = JSONObject.parseObject(unescaped.substring(1, unescaped.length() - 1));
        return json == null ? null : json.getJSONArray("data");
    }

    /**
     * Appends the page items to {@code target}, stopping at the first item older
     * than {@code startTime}. Returns {@code false} when such an item was found.
     */
    private static boolean appendUntilStartTime(List<Map<String,Object>> target,
            List<Map<String,Object>> pageItems, String startTime) {
        if (pageItems == null) {
            return true;
        }
        if (startTime == null) {
            target.addAll(pageItems);
            return true;
        }
        for (Map<String,Object> item : pageItems) {
            String time = TimeParse.dateFormartString((Date)item.get("time"),"yyyy-MM-dd HH:mm:ss");
            if (time.compareTo(startTime) < 0) {
                return false;
            }
            target.add(item);
        }
        return true;
    }

    /**
     * Collects all comments of one Sohu article by requesting
     * {@code page_no=1, 2, ...} until a page has no comments.
     *
     * @param url article URL; converted once by {@code SouhuCommentAnalysis.getSouhuURL}
     * @return the comments collected before paging ended or an error occurred; never {@code null}
     */
    public static List<Map<String,Object>> getSouhuCommentData(String url) {
        Map<String,String> headerMap = HeadGet.getSouhuCommentHeaderMap(null);
        List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
        try {
            // NOTE(review): assumes getSouhuURL yields the same value for the same article URL,
            // so it is resolved once instead of once per page.
            String baseUrl = souhuCommentAnalysis.getSouhuURL(url);
            for (int page = 1; ; page++) {
                String newurl = baseUrl + "&page_no=" + page;
                String result = HttpClient.executeHttpRequestGet(newurl,headerMap);
                System.out.println(newurl);
                JSONObject json = JSONObject.parseObject(result);
                JSONObject body = json == null ? null : json.getJSONObject("jsonObject");
                JSONArray comments = body == null ? null : body.getJSONArray("comments");
                if (comments == null || comments.size() < 1) {
                    break;
                }
                for (int i = 0; i < comments.size(); i++) {
                    dataList.add(souhuCommentAnalysis.getData(comments.getJSONObject(i)));
                }
                ZhiWeiTools.sleep(3000);
            }
        } catch (Exception e) {
            logger.error("获取搜狐文章评论出错", e);
        }
        return dataList;
    }
}
package com.zhiwei.parse;
import java.io.IOException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.parse.analysis.YidianzixunAccountAnalysis;
import com.zhiwei.parse.analysis.YidianzixunByWordAnalysis;
import com.zhiwei.parse.analysis.YidianzixunCommentAnalysis;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
public class Yidianzixun {
private static Logger logger = LoggerFactory.getLogger(Yidianzixun.class);
private static YidianzixunAccountAnalysis yidianzixunAccountAnalysis = new YidianzixunAccountAnalysis();
private static YidianzixunCommentAnalysis yidianzixunCommentAnalysis = new YidianzixunCommentAnalysis();
private static YidianzixunByWordAnalysis yidianzixunByWordAnalysis = new YidianzixunByWordAnalysis();
/**
 * Collects the historical articles of one Yidianzixun channel.
 *
 * <p>Articles are requested in windows of 10 ({@code cstart}/{@code cend})
 * until the {@code result} array is missing or empty. When {@code startTime}
 * is given, the first article whose {@code time} value compares lower than
 * {@code startTime} (plain string comparison) ends the crawl and is not
 * included.
 * NOTE(review): the string comparison presumes {@code time} uses the same
 * format as {@code startTime} - confirm against the parser.
 *
 * @param channelid channel identifier sent as {@code channel_id}
 * @param startTime lower time bound, or {@code null} for no bound
 * @return the rows collected before paging ended or an error occurred; never {@code null}
 */
public static List<Map<String,Object>> getYidianzixunAccountData(String channelid,String startTime) {
    Map<String,String> headerMap = HeadGet.getYidianzixunAccountHeaderMap(null);
    List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
    int offset = 0;
    boolean keepGoing = true;
    try {
        while (keepGoing) {
            String url = "http://www.yidianzixun.com/home/q/news_list_for_channel?channel_id="+channelid+"&cstart="+offset+"&cend="+(offset+10);
            String result = HttpClient.executeHttpRequestGet(url, headerMap);
            JSONObject json = JSONObject.parseObject(result);
            JSONArray jsonArry = json == null ? null : json.getJSONArray("result");
            // A missing response or missing "result" array ends the crawl, same as an empty page.
            if (jsonArry == null || jsonArry.size() == 0) {
                break;
            }
            for (int i = 0; i < jsonArry.size(); i++) {
                Map<String,Object> map = yidianzixunAccountAnalysis.parseJsonByAccount(jsonArry.getJSONObject(i));
                if (startTime != null) {
                    String time = map.get("time")+"";
                    if (startTime.compareTo(time) > 0) {
                        keepGoing = false;
                        break;
                    }
                }
                dataList.add(map);
            }
            System.out.println("================================" + dataList.size());
            ZhiWeiTools.sleep(3000);
            offset += 10;
        }
    } catch (Exception e) {
        // Pass the exception itself so the stack trace reaches the log.
        logger.error("数据获取出错", e);
    }
    return dataList;
}
/**
*
* @Description (传入文章链接获取文章评论)
* @param url
* @param cookie
* @return
*/
public static List<Map<String,Object>> getYidianzixunCommentData(String url) {
url = yidianzixunCommentAnalysis.analysisURL(url);
Map<String, String> headerMap = HeadGet.getYidianzixunCommentHeaderMap(null);
List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
try {
String urlb = url;
while(true) {
String result = HttpClient.executeHttpRequestGet(url,headerMap);
JSONObject json = JSONObject.parseObject(result);
JSONArray jsonArry = json.getJSONArray("comments");
String comment_id = "";
for (int i = 0; i < jsonArry.size(); i++) {
JSONObject data = jsonArry.getJSONObject(i);
Map<String, Object> map = yidianzixunCommentAnalysis.parseJsonByAccount(data);
if(map != null) {
dataList.add(map);
}
if (data.toString().contains("replies")) {
ZhiWeiTools.sleep(2000);
List<Map<String,Object>> replyList = yidianzixunCommentAnalysis.getrepliesData(map, null, headerMap);
if(replyList != null && replyList.size() > 0) {
dataList.addAll(replyList);
}
}
comment_id = map.get("comment_id")+"";
}
url = urlb + comment_id;
if(jsonArry.size() < 1){
break;
}
}
return dataList;
} catch (Exception e) {
e.printStackTrace();
return null;
}
}
/**
*
* @Description 依据关键词获取文章
* @param word
* @return
*/
public static List<Map<String,Object>> getYidianzixunDataByWord(String word) {
List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
try {
int i = 0;
while(true) {
String url = "http://www.yidianzixun.com/home/q/news_list_for_keyword?display="+URLEncoder.encode(word, "UTF-8")+"&cstart="+i+"&cend="+(i+10)+"&word_type=token";
Map<String,String> headerMap = HeadGet.getYidianzixunWordHeaderMap(null);
String result = HttpClient.executeHttpRequestGet(url, headerMap);
List<Map<String,Object>> list = yidianzixunByWordAnalysis.getOnePageData(result);
if(list == null || list.size() < 1) {
break;
}
dataList.addAll(list);
i += 10;
System.out.println(word+"=============已获取到数据 "+dataList.size());
ZhiWeiTools.sleep(3000);
}
return dataList;
} catch (Exception e) {
logger.error("获取一点资讯数据失败",e.getMessage());
e.printStackTrace();
return dataList;
}
}
}
package com.zhiwei.parse.analysis;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
public class BaijiaAccountAnalysis {
    private static Logger logger = LoggerFactory.getLogger(BaijiaAccountAnalysis.class);

    /**
     * Parses one page of a Baijia account's article history.
     * <p>
     * For every kept item the article page is downloaded through
     * {@link #getBaijiaContent(String)} to obtain the body text. If parsing fails part-way,
     * the error is logged and the items collected so far are returned.
     *
     * @param result    JSON response body of the history request
     * @param startTime when longer than one character, only items whose {@code created_at}
     *                  sorts strictly after it (string comparison) are kept; otherwise no filter
     * @return the parsed article maps, never {@code null}
     */
    public List<Map<String,Object>> getBaijiaAccountData(String result,String startTime) {
        List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
        boolean filterByTime = startTime != null && startTime.length() > 1;
        try {
            JSONObject json = JSONObject.parseObject(result);
            JSONArray jsonArry = json.getJSONObject("data").getJSONObject("WriterArticleList").getJSONArray("items");
            for(int i = 0; i < jsonArry.size();i++ ) {
                JSONObject data = jsonArry.getJSONObject(i);
                String time = data.getString("created_at");
                // An item without a timestamp cannot be shown to be newer than startTime, so it
                // is skipped; previously it raised an NPE that aborted the rest of the page.
                if(filterByTime && (time == null || time.compareTo(startTime) < 1)) {
                    continue;
                }
                Map<String,Object> map = new HashMap<String, Object>();
                String id = data.getString("id");
                map.put("id", id);
                map.put("title", data.getString("title"));
                String url = data.getString("url");
                if(url == null) {
                    // Fall back to the id-based link when the item carries no url.
                    url = "https://baijia.baidu.com/s?old_id=" + id;
                }
                map.put("content", getBaijiaContent(url));
                map.put("read_amount", data.getString("read_amount")==null?0:data.getString("read_amount"));
                map.put("app_id", data.getString("app_id"));
                map.put("time", time);
                map.put("url", url);
                map.put("source", data.getString("writer_name"));
                System.out.println(map.toString());
                dataList.add(map);
            }
            return dataList;
        } catch (Exception e) {
            // Pass the exception itself; a bare string argument is dropped when the
            // message has no "{}" placeholder.
            logger.error("解析百家号历史文章出错",e);
            return dataList;
        }
    }

    /**
     * Downloads an article page and extracts the text of {@code section.news-content}.
     * Calls {@code ZhiWeiTools.sleep(2000)} before every request.
     *
     * @param url article link
     * @return the extracted text, or {@code null} when the request or parsing fails
     */
    public String getBaijiaContent(String url) {
        ZhiWeiTools.sleep(2000);
        Map<String,String> headerMap = HeadGet.getBaijiaAccountHeaderMap(null);
        try {
            String result = HttpClient.executeHttpRequestGet(url, headerMap);
            Document document = Jsoup.parse(result);
            return document.select("section.news-content").text();
        } catch (Exception e) {
            logger.error("Failed to fetch Baijia article content", e);
            return null;
        }
    }
}
package com.zhiwei.parse.analysis;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
public class DayuAccountAnalysis {
    private static Logger logger = LoggerFactory.getLogger(DayuAccountAnalysis.class);

    /**
     * Parses one response of a Dayu account's article history.
     *
     * @param result    JSON response body; its {@code data} array holds the articles
     * @param name      account name stored as the {@code source} of every article
     * @param startTime when longer than one character, articles published before it are dropped
     * @return the parsed article maps, or {@code null} when the response cannot be parsed
     */
    public List<Map<String,Object>> getDayuAccountData(String result,String name,String startTime) {
        List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
        try {
            // Parsing sits inside the try so that a null or malformed body takes the
            // same "log and return null" path as every other failure of this method.
            JSONObject json = JSONObject.parseObject(result);
            JSONArray jsonArray = json.getJSONArray("data");
            for(int i = 0;i < jsonArray.size();i++) {
                JSONObject data = jsonArray.getJSONObject(i);
                Map<String,Object> map = getOneData(data,name,startTime);
                if(map != null) {
                    dataList.add(map);
                    System.out.println(map.toString());
                }
            }
            return dataList;
        } catch (Exception e) {
            // Pass the exception itself; a bare string argument is dropped when the
            // message has no "{}" placeholder.
            logger.error("解析大鱼号历史文章出错",e);
            return null;
        }
    }

    /**
     * Parses a single article and applies the start-time filter.
     *
     * @param data      one element of the {@code data} array
     * @param name      account name stored as {@code source}
     * @param startTime lower bound for the publish time (string comparison); ignored when
     *                  {@code null} or at most one character long
     * @return the article map, or {@code null} when the article is older than
     *         {@code startTime} or cannot be parsed
     */
    private Map<String,Object> getOneData(JSONObject data,String name,String startTime) {
        Map<String,Object> map = new HashMap<String, Object>();
        try {
            // Turn "2017-12-05T22:08:01.000+0800" into "2017-12-05 22:08:01".
            String time = data.getString("published_at").replace("T", " ").replace(".000+0800", "");
            if(startTime != null && startTime.length() > 1) {
                if(time.compareTo(startTime) < 0) {
                    return null;
                }
            }
            map.put("title", data.getString("title"));
            map.put("time", time);
            map.put("url", "http://a.mp.uc.cn/article.html?uc_param_str=frdnsnpfvecpntnwprdssskt&wm_cid=" + data.getString("content_id"));
            map.put("content_id", data.getString("content_id"));
            String content = data.getJSONObject("body").getString("text");
            if(content != null) {
                // Strip HTML tags, brace-delimited blocks and non-breaking-space entities.
                content = content.replaceAll("<.*?>", "")
                        .replace(".isc11", "").replaceAll("\\{.*\\}", "").replaceAll("&nbsp;/", "").replace("&nbsp;", "");
            }
            map.put("content", content);
            map.put("source", name);
            map.put("origin_id", data.getString("origin_id"));
            map.put("xss_item_id", data.getJSONObject("_extra").getString("xss_item_id"));
            return map;
        } catch (Exception e) {
            logger.error("解析此条历史文章出错",e);
            return null;
        }
    }
}
package com.zhiwei.parse.analysis;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
public class DayuCommentAnalysis {
    private static Logger logger = LoggerFactory.getLogger(DayuCommentAnalysis.class);

    /**
     * Parses the top-level comments of a Dayu article and fetches the replies of every
     * comment whose {@code reply_cnt} is positive.
     * <p>
     * Replies are appended to the result before the comment they belong to. If parsing
     * fails part-way, the error is logged and the entries collected so far are returned.
     *
     * @param result    JSON response body; comments are read from {@code data.comments_map}
     * @param articleId article id used when requesting replies
     * @return the collected comment and reply maps, never {@code null}
     */
    public List<Map<String,Object>> getDayuCommentData(String result,String articleId) {
        List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
        try {
            JSONObject json = JSONObject.parseObject(result).getJSONObject("data").getJSONObject("comments_map");
            // JSONObject already is a Map<String,Object>; no unchecked cast is needed.
            for(Map.Entry<String, Object> entry : json.entrySet() ) {
                Map<String,Object> dataMap = new HashMap<String, Object>();
                JSONObject data = JSONObject.parseObject(entry.getValue().toString());
                dataMap.put("content", data.getString("content"));
                dataMap.put("nickname", data.getJSONObject("user").getString("nickname"));
                dataMap.put("like", data.getString("up_cnt"));
                String id = data.getString("id");
                dataMap.put("id", id);
                dataMap.put("url", data.getString("shareUrl"));
                long time = data.getLong("timeShow");
                dataMap.put("time", TimeParse.dateFormartString(new Date(time), "yyyy-MM-dd HH:mm:ss"));
                int i = data.getInteger("reply_cnt");
                dataMap.put("replay_count", i);
                if(i > 0) {
                    dataList.addAll(getReplayData(id,articleId));
                }
                dataList.add(dataMap);
            }
            return dataList;
        } catch (Exception e) {
            // Pass the exception itself; a bare string argument is dropped when the
            // message has no "{}" placeholder.
            logger.error("解析出错",e);
            return dataList;
        }
    }

    /**
     * Fetches all replies of one comment, page by page.
     * <p>
     * The {@code timeShow} of the last reply on a page is sent as the {@code ts} cursor of
     * the next request ({@code -1} for the first). Paging ends when a page yields no
     * cursor (0) or a cursor that was already seen.
     *
     * @param id        comment id
     * @param articleId article id
     * @return the reply maps collected from all pages
     */
    private List<Map<String,Object>> getReplayData(String id,String articleId) {
        Map<String,String> headerMap = HeadGet.getDayuCommentHeaderMap(null);
        String url = "http://m.uczzd.cn/iflow/api/v2/cmt/detail/"+id+"/comments?articleId="+articleId+"&count=10&ts=";
        String result = HttpClient.executeHttpRequestGet(url+"-1", headerMap);
        List<Map<String,Object>> data = new ArrayList<Map<String,Object>>();
        List<Long> seenCursors = new ArrayList<Long>();
        while(true) {
            ZhiWeiTools.sleep(3000);
            long time = analysisReplayData(result,data);
            if(time == 0 || seenCursors.contains(Long.valueOf(time))) {
                break;
            }
            seenCursors.add(Long.valueOf(time));
            result = HttpClient.executeHttpRequestGet(url+time, headerMap);
        }
        System.out.println("====================="+data.size());
        return data;
    }

    /**
     * Parses one page of replies and appends them to {@code dataList}.
     *
     * @param result   JSON response body; replies are read from {@code data.replies}
     * @param dataList list the parsed replies are appended to
     * @return the {@code timeShow} of the last reply on the page, or 0 when the page is
     *         empty or cannot be parsed
     */
    private long analysisReplayData(String result,List<Map<String,Object>> dataList) {
        long time = 0;
        try {
            JSONObject json = JSONObject.parseObject(result);
            JSONArray jsonArry = json.getJSONObject("data").getJSONArray("replies");
            for(int i = 0; i < jsonArry.size();i++) {
                Map<String,Object> map = new HashMap<String, Object>();
                JSONObject data = jsonArry.getJSONObject(i);
                map.put("content", data.getString("content"));
                map.put("nickname", data.getString("nickname"));
                map.put("like", data.getString("up_cnt"));
                map.put("id", data.getString("commentId"));
                map.put("url", data.getString("shareUrl"));
                time = data.getLong("timeShow");
                map.put("time", TimeParse.dateFormartString(new Date(time), "yyyy-MM-dd HH:mm:ss"));
                map.put("replay_count", data.getInteger("replyCnt"));
                dataList.add(map);
            }
            return time;
        } catch (Exception e) {
            logger.error("获取大鱼号评论出错--回复的",e);
            return 0;
        }
    }
}
package com.zhiwei.parse.analysis;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
public class FenghuangAccountAnalysis {
    private static Logger logger = LoggerFactory.getLogger(FenghuangAccountAnalysis.class);

    /**
     * Downloads one page of a Fenghuang account's feed and then every article listed in it.
     * <p>
     * Each feed entry's {@code id} field is used as the article request URL. An article
     * that cannot be fetched or parsed is logged and skipped.
     *
     * @param url       feed page URL
     * @param startTime only articles whose time is not before this value (string comparison)
     *                  are kept; {@code null} or empty disables the filter
     * @return the article maps, or {@code null} when the feed page cannot be fetched or parsed
     */
    public List<Map<String,Object>> getArticleData(String url,String startTime) {
        List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
        // A null startTime used to throw an NPE on every article, so nothing was ever
        // returned; treat it as "no lower bound", as the sibling parsers do.
        boolean filterByTime = startTime != null && startTime.length() > 0;
        try {
            Map<String,String> headerMap = HeadGet.getFenghuangAccountHeaderMap(null);
            String result = HttpClient.executeHttpRequestGet(url, headerMap);
            JSONObject json = JSONObject.parseObject(result);
            JSONArray jsonArry = json.getJSONObject("data").getJSONObject("feeds").getJSONArray("list");
            for(int i = 0;i < jsonArry.size();i++) {
                try {
                    JSONObject data = jsonArry.getJSONObject(i);
                    String articleurl = data.getString("id");
                    String articleResult = HttpClient.executeHttpRequestGet(articleurl, headerMap);
                    Map<String,Object> dataMap = getArticle(articleResult);
                    ZhiWeiTools.sleep(1000);
                    if(dataMap == null) {
                        continue;
                    }
                    String time = (String)dataMap.get("time");
                    if(!filterByTime || (time != null && time.compareTo(startTime) >= 0)) {
                        dataList.add(dataMap);
                    }
                } catch (Exception e) {
                    logger.error("Failed to fetch a Fenghuang article", e);
                }
            }
        } catch (Exception e1) {
            logger.error("Failed to fetch the Fenghuang article list", e1);
            return null;
        }
        return dataList;
    }

    /**
     * Parses a single article response.
     *
     * @param articleResult JSON response body; fields are read from its {@code body} object
     * @return the article map, or {@code null} when the response cannot be parsed
     */
    private static Map<String,Object> getArticle(String articleResult) {
        Map<String,Object> map = new HashMap<String, Object>();
        try {
            JSONObject json = JSONObject.parseObject(articleResult).getJSONObject("body");
            map.put("title", json.getString("title"));
            // "2017/12/05 ..." becomes "2017-12-05 ...".
            String time = json.getString("cTime").replaceAll("/", "-");
            map.put("time", time);
            map.put("text", json.getString("text").replaceAll("<.*?>", ""));
            map.put("source", json.getString("source"));
            map.put("url", json.getString("shareurl"));
            map.put("id", json.getString("aid"));
        } catch (Exception e) {
            // Pass the exception itself; a bare string argument is dropped when the
            // message has no "{}" placeholder.
            logger.error("解析具体文章的时候出错",e);
            return null;
        }
        return map;
    }
}
package com.zhiwei.parse.analysis;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class FenghuangByWordAnalysis {
    private static Logger logger = LoggerFactory.getLogger(FenghuangByWordAnalysis.class);

    /**
     * Parses one Fenghuang keyword-search result page.
     * <p>
     * Every {@code div.searchResults} element under {@code div.mainM} is one hit. Its third
     * paragraph's {@code font} text is expected to hold three space-separated tokens:
     * source, date and time of day. Hits without a title, or with fewer than three such
     * tokens, are skipped.
     *
     * @param result HTML of the search result page
     * @return the parsed hits, never {@code null}
     */
    public List<Map<String,Object>> getFenghuangByWord(String result) {
        List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
        try {
            Document document = Jsoup.parse(result);
            Elements elements = document.select("div.mainM").select("div.searchResults");
            for(Element element : elements) {
                Map<String,Object> map = new HashMap<String, Object>();
                String title = element.select("p.line24").select("a").text();
                // text() yields "" rather than null when nothing matches, so the former
                // null-only check could never skip anything.
                if(title == null || title.length() == 0) {
                    continue;
                }
                String sourceAndTime = element.select("p:nth-child(3) > font").text();
                String[] str = sourceAndTime.split(" ");
                if(str.length < 3) {
                    // One malformed hit used to abort the rest of the page.
                    logger.warn("Skipping search hit with unexpected source/time text: {}", sourceAndTime);
                    continue;
                }
                map.put("title", title);
                map.put("url", element.select("p.line24").select("a").attr("href"));
                map.put("content", element.select("p:nth-child(2)").text());
                map.put("time", str[1] + " " + str[2]);
                map.put("source", str[0]);
                dataList.add(map);
            }
            return dataList;
        } catch (Exception e) {
            // Pass the exception itself; a bare string argument is dropped when the
            // message has no "{}" placeholder.
            logger.error("解析凤凰关键词获取文章页面出错",e);
            return dataList;
        }
    }
}
package com.zhiwei.parse.analysis;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
public class FenghuangCommentAnalysis {
    private static Logger logger = LoggerFactory.getLogger(FenghuangCommentAnalysis.class);

    /**
     * Requests the comment counters of an article.
     *
     * @param url comment-count request URL
     * @return a map with {@code real_count} and {@code comment_num}, or {@code null} when
     *         the request or parsing fails
     */
    public Map<String,Object> getFenghuangCommentCount(String url) {
        Map<String, String> headerMap = HeadGet.getFenghuangCommentHeaderMap(null);
        Map<String,Object> map = new HashMap<String, Object>();
        try {
            String result = HttpClient.executeHttpRequestGet(url, headerMap);
            JSONObject json = JSONObject.parseObject(result);
            map.put("real_count", json.getInteger("real_num"));
            map.put("comment_num", json.getInteger("comment_num"));
            return map;
        } catch (Exception e) {
            // Pass the exception itself; a bare string argument is dropped when the
            // message has no "{}" placeholder.
            logger.error("获取凤凰评论数出错",e);
            return null;
        }
    }

    /**
     * Converts an article link into the comment-list request URL (without the page number,
     * which the caller appends).
     * <p>
     * Links containing {@code /a} (for example
     * {@code http://news.ifeng.com/a/20161229/50492484_0.shtml}) are passed as
     * {@code comments_url} with every ':' replaced by {@code %3A}. For any other link the
     * last eight characters are used, prefixed with {@code sub_}.
     *
     * @param url article link
     * @return the comment-list URL ending in {@code &page=}, or {@code null} on failure
     */
    public String getdocUrl(String url) {
        try {
            if(url.contains("/a")) {
                url = url.replace(":", "%3A");
                url = "https://user.iclient.ifeng.com/Social_Api_Comment/getCommentList?comments_url="+url+"&hasChild=1&limit=30&page=";
            }else {
                String docUrl = url.substring(url.length()-8,url.length());
                url = "https://user.iclient.ifeng.com/Social_Api_Comment/getCommentList?comments_url=sub_"+docUrl+"&hasChild=1&limit=30&page=";
            }
            return url;
        } catch (Exception e) {
            logger.error("解析连接出错",e);
            return null;
        }
    }

    /**
     * Downloads and parses one page of comments.
     *
     * @param url comment-list URL including the page number
     * @return the parsed comments (entries that fail to parse are left out), or
     *         {@code null} when the request fails or the response has no usable
     *         {@code data} array
     */
    public List<Map<String,Object>> getData(String url) {
        Map<String,String> headerMap = HeadGet.getFenghuangCommentHeaderMap(null);
        List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
        String result;
        try {
            result = HttpClient.executeHttpRequestGet(url, headerMap);
        } catch (Exception e) {
            logger.error("链接获取信息失败",e);
            return null;
        }
        try {
            // Parsing sits inside the try so that a null or malformed body yields null
            // instead of an exception escaping to the caller.
            JSONObject json = JSONObject.parseObject(result);
            JSONArray jsonArry = json.getJSONArray("data");
            for(int i = 0;i < jsonArry.size(); i ++) {
                Map<String,Object> map = getcommentData(jsonArry.getJSONObject(i));
                if(map != null) {
                    // getcommentData returns null on failure; never store null entries.
                    dataList.add(map);
                }
            }
        } catch (Exception e) {
            logger.error("获取信息出错",e);
            return null;
        }
        return dataList;
    }

    /**
     * Parses a single comment entry.
     *
     * @param json one element of the {@code data} array
     * @return the comment map, or {@code null} when the entry cannot be parsed
     */
    private Map<String,Object> getcommentData(JSONObject json) {
        Map<String,Object> map = new HashMap<String, Object>();
        try {
            JSONObject data = json.getJSONObject("data");
            map.put("nickname", json.getString("nickname"));
            map.put("content", data.getString("comment_contents").replaceAll("&quot;", ""));
            map.put("id", data.getString("comment_id"));
            // When this comment is a reply, this is the id of the comment it replies to.
            map.put("root_id", data.getString("comment_root_id"));
            map.put("like", data.getString("like"));
            map.put("unlike", data.getString("unlike"));
            map.put("total_num", data.getString("total_num"));
            map.put("from", data.getString("ip_from"));
            map.put("source", data.getString("device_type"));
            // add_time is multiplied by 1000 before being handed to Date (seconds to ms).
            long time = data.getLong("add_time") * 1000;
            map.put("time", TimeParse.dateFormartString(new Date(time), "yyyy-MM-dd HH:mm:ss"));
        } catch (Exception e) {
            logger.error("具体解析一条数据出错",e);
            return null;
        }
        return map;
    }
}
package com.zhiwei.parse.analysis;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.util.TimeUtil;
public class MeipaiByWordAnalysis {
    private static Logger logger = LoggerFactory.getLogger(MeipaiByWordAnalysis.class);

    /**
     * Downloads a Meipai video page and extracts its details.
     *
     * @param headerMap request headers
     * @param url       video page URL
     * @return the detail map, or {@code null} when the page cannot be fetched or parsed, or
     *         when {@code TimeUtil.timeUtil} rejects the page's time text
     */
    public Map<String,Object> getMeipaiData(Map<String,String> headerMap,String url) {
        try {
            Map<String,Object> dataMap = new HashMap<String,Object>();
            String result = HttpClient.executeHttpRequestGet(url, headerMap);
            Document doc = Jsoup.parse(result);
            // Keep only the text in front of the "播放" label.
            String video_count = doc.select("div.detail-location").text().split("播放")[0];
            String time = doc.select("div.detail-time.pa > strong").text();
            String content = doc.select("div.detail-info.pr > h1").text();
            String like = doc.select("span.detail-like.dbl.pr.cp > span").text();
            String comment_count = doc.select("#commentCount").text();
            String source = doc.select("#mediaUser > div > a").select("img").attr("alt");
            String source_url = doc.select("#mediaUser > div > a").attr("href");
            if("评论".equals(comment_count)){
                // The counter shows only its label when there are no comments.
                comment_count = "0";
            }
            time = TimeUtil.timeUtil(time);
            if(time == null) {
                return null;
            }
            dataMap.put("time", time);
            dataMap.put("video_count", video_count);
            dataMap.put("content", content);
            dataMap.put("url", url);
            dataMap.put("like", like);
            dataMap.put("comment_count", comment_count);
            dataMap.put("source", source);
            dataMap.put("source_url", "http://www.meipai.com"+source_url);
            System.out.println(dataMap.toString());
            return dataMap;
        } catch (Exception e) {
            // Pass the exception itself; a bare string argument is dropped when the
            // message has no "{}" placeholder.
            logger.error("解析出错",e);
            return null;
        }
    }

    /**
     * Extracts the video links from a result page.
     *
     * @param result HTML of the page; links are read from the {@code li} elements
     *               under {@code #mediasList}
     * @return the absolute video URLs, or {@code null} when parsing fails
     */
    public List<String> getURl(String result) {
        List<String> urlList = new ArrayList<String>();
        try {
            Document doc = Jsoup.parse(result);
            Elements elements = doc.select("#mediasList").select("li");
            for(Element element : elements) {
                String url = element.select("div.cp").select("a").attr("href");
                urlList.add("http://www.meipai.com"+url);
            }
            return urlList;
        } catch (Exception e) {
            logger.error("Failed to extract Meipai video links", e);
            return null;
        }
    }
}
package com.zhiwei.parse.analysis;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
public class QQAccountAnalysis {
    private static Logger logger = LoggerFactory.getLogger(QQAccountAnalysis.class);

    /**
     * Parses the article list of a QQ (Penguin) account.
     *
     * @param result JSON response body; articles are read from its {@code newslist} array
     * @return the article maps, or {@code null} when the response cannot be parsed
     */
    public List<Map<String,Object>> analysisQQAccountData(String result) {
        List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
        try {
            // Parsing sits inside the try so that a null or malformed body takes the
            // same "log and return null" path as every other failure of this method.
            JSONObject json = JSONObject.parseObject(result);
            JSONArray jsonArry = json.getJSONArray("newslist");
            for(int i = 0;i < jsonArry.size();i++) {
                JSONObject data = jsonArry.getJSONObject(i);
                Map<String,Object> map = new HashMap<String,Object>();
                map.put("url", data.getString("url_comment"));
                map.put("time", data.getString("time"));
                map.put("title", data.getString("title"));
                map.put("content", data.getString("abstract"));
                map.put("source", data.getString("source"));
                map.put("id", data.getString("id"));
                map.put("commentid", data.getString("commentid"));
                dataList.add(map);
            }
        } catch (Exception e) {
            // Pass the exception itself; a bare string argument is dropped when the
            // message has no "{}" placeholder.
            logger.error("解析出错",e);
            return null;
        }
        return dataList;
    }

    /**
     * Collects the ids of all articles of a QQ (Penguin) account.
     *
     * @param result JSON response body; ids are read from its {@code ids} array
     * @return the article ids, or {@code null} when the response cannot be parsed
     */
    public List<String> getQQAllIds(String result) {
        List<String> list = new ArrayList<String>();
        try {
            JSONObject json = JSONObject.parseObject(result);
            JSONArray jsonArry = json.getJSONArray("ids");
            for(int i = 0;i < jsonArry.size();i++) {
                JSONObject data = jsonArry.getJSONObject(i);
                list.add(data.getString("id"));
            }
            return list;
        } catch (Exception e) {
            logger.error("获取企鹅号所有id出错",e);
            return null;
        }
    }
}
package com.zhiwei.parse.analysis;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
public class SouhuAccountAnalysis {
    private static Logger logger = LoggerFactory.getLogger(SouhuAccountAnalysis.class);

    /**
     * Parses every element of a Sohu article-history array.
     *
     * @param jsonArray array of article objects; {@code null} is treated as empty
     * @return the parsed articles; elements that fail to parse are left out
     */
    public List<Map<String,Object>> analysisData(JSONArray jsonArray) {
        List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
        if(jsonArray == null) {
            return dataList;
        }
        for(int i = 0;i < jsonArray.size();i++) {
            JSONObject data = jsonArray.getJSONObject(i);
            Map<String,Object> map = parseHtmlByAccount(data);
            if(map != null) {
                dataList.add(map);
            }
        }
        return dataList;
    }

    /**
     * Parses a single Sohu history article.
     *
     * @param data one article object
     * @return the article map, or {@code null} when the article cannot be parsed
     */
    private static Map<String,Object> parseHtmlByAccount(JSONObject data) {
        Map<String,Object> map = new HashMap<String, Object>();
        try {
            map.put("title", decodeQuietly(data.getString("title")));
            map.put("content", decodeQuietly(data.getString("brief")));
            map.put("newsPv", data.getString("newsPv"));
            map.put("url", data.getString("url"));
            long timelong = Long.parseLong(data.getString("postTime"));
            map.put("time", new Date(timelong));
            map.put("comment", data.getString("commentsCnt"));
            map.put("tags", joinTagNames(data.getJSONArray("tags")));
            map.put("newsid", data.getString("newsid"));
        } catch (Exception e) {
            // Pass the exception itself; a bare string argument is dropped when the
            // message has no "{}" placeholder.
            logger.error("搜狐历史文章解析出错了",e);
            // String.valueOf tolerates a null element; data.toString() would throw here.
            System.out.println(String.valueOf(data));
            return null;
        }
        return map;
    }

    /**
     * URL-decodes a value as UTF-8, returning it unchanged when it is {@code null} or not
     * validly percent-encoded, so a stray '%' in a title does not discard the article.
     */
    private static String decodeQuietly(String value) {
        if(value == null) {
            return null;
        }
        try {
            return URLDecoder.decode(value, "UTF-8");
        } catch (IllegalArgumentException e) {
            return value;
        } catch (java.io.UnsupportedEncodingException e) {
            return value;
        }
    }

    /** Joins the {@code name} of every tag with commas; a missing array yields "". */
    private static String joinTagNames(JSONArray tagArray) {
        StringBuilder tags = new StringBuilder();
        if(tagArray == null) {
            return tags.toString();
        }
        for(int i = 0;i < tagArray.size();i++) {
            if(i > 0) {
                tags.append(',');
            }
            tags.append(tagArray.getJSONObject(i).getString("name"));
        }
        return tags.toString();
    }
}
package com.zhiwei.parse.analysis;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
public class SouhuCommentAnalysis {
    private static Logger logger = LoggerFactory.getLogger(SouhuCommentAnalysis.class);

    /**
     * Builds the comment-list API URL for a Sohu article link.
     * <p>
     * The query string is dropped, then the text after {@code a/} is split at '_': the
     * first part becomes {@code source_id} (prefixed with {@code mp_}) and the second part
     * {@code topic_id}. When the link does not have that shape the error is logged and
     * the URL is built with empty ids.
     *
     * @param url article link
     * @return the comment-list URL without a page number
     */
    public String getSouhuURL(String url) {
        String topic_id = "";
        String source_id = "";
        try {
            if(url.contains("?")){
                url = url.split("\\?")[0];
            }
            String[] ids = url.split("a/")[1].split("_");
            topic_id = ids[1];
            source_id = ids[0];
        } catch (Exception e) {
            // Pass the exception itself; a bare string argument is dropped when the
            // message has no "{}" placeholder.
            logger.error("链接解析错误",e);
        }
        String newurl = "http://apiv2.sohu.com/api/comment/list?page_size=10&topic_id="+topic_id+"&source_id=mp_"+source_id;
        return newurl;
    }

    /**
     * Requests the comment count of an article.
     *
     * @param url comment-list API URL
     * @return the value of {@code jsonObject.participation_sum}, or 0 when the request or
     *         parsing fails
     */
    public int getSouhuCommentCount(String url) {
        Map<String,String> headerMap = HeadGet.getSouhuCommentHeaderMap(null);
        try {
            String result = HttpClient.executeHttpRequestGet(url, headerMap);
            JSONObject json = JSONObject.parseObject(result);
            return json.getJSONObject("jsonObject").getInteger("participation_sum");
        } catch (Exception e) {
            logger.error("获取搜狐评论数信息出错",e);
            return 0;
        }
    }

    /**
     * Parses a single comment object.
     *
     * @param data one element of the {@code comments} array
     * @return the comment map; on a parse error the fields read so far are returned
     */
    public Map<String,Object> getData(JSONObject data) {
        Map<String,Object> map = new HashMap<String, Object>();
        try {
            long timelong = data.getLongValue("create_time");
            map.put("time", new Date(timelong));
            map.put("content", data.getString("content"));
            map.put("location", data.getString("ip_location"));
            map.put("user_id", data.getString("user_id"));
            map.put("support_count", data.getInteger("support_count"));
            // The ids are stored before the optional passport is touched, so a comment
            // without passport data no longer loses its comment_id and reply_id.
            map.put("comment_id", data.getString("comment_id"));
            map.put("reply_id", data.getString("reply_id"));
            JSONObject json = JSONObject.parseObject(data.getString("passport"));
            map.put("nickname", json == null ? null : json.getString("nickname"));
        } catch (Exception e) {
            // String.valueOf tolerates a null element; data.toString() would throw here.
            System.out.println(String.valueOf(data));
            System.out.println(map.toString());
            logger.error("解析出错",e);
        }
        return map;
    }
}
package com.zhiwei.parse.analysis;
import java.util.HashMap;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject;
public class YidianzixunAccountAnalysis {
    private static Logger logger = LoggerFactory.getLogger(YidianzixunAccountAnalysis.class);

    /**
     * Parses a single Yidianzixun article object.
     *
     * @param data one element of the {@code result} array
     * @return the article map; on a parse error the fields read so far are returned
     */
    public Map<String,Object> parseJsonByAccount(JSONObject data) {
        Map<String,Object> map = new HashMap<String, Object>();
        try {
            map.put("title", data.getString("title"));
            map.put("time", data.getString("date"));
            // A missing comment count is stored as the number 0.
            map.put("comment_count", data.getString("comment_count")==null?0:data.getString("comment_count"));
            map.put("ctype", data.getString("ctype"));
            map.put("source", data.getString("source"));
            map.put("url", data.getString("url"));
            map.put("summary", data.getString("summary"));
        } catch (Exception e) {
            // String.valueOf tolerates a null element; data.toString() would throw a second
            // exception out of this handler.
            System.out.println(String.valueOf(data));
            System.out.println(map.toString());
            // Pass the exception itself; a bare string argument is dropped when the
            // message has no "{}" placeholder.
            logger.error("解析此条出错",e);
        }
        return map;
    }
}
package com.zhiwei.parse.analysis;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
public class YidianzixunByWordAnalysis {
    private static Logger logger = LoggerFactory.getLogger(YidianzixunByWordAnalysis.class);

    /**
     * Parses one page of keyword-search results.
     *
     * @param result JSON response body; articles are read from its {@code result} array
     * @return the parsed articles; on a parse error the ones collected so far
     */
    public List<Map<String,Object>> getOnePageData(String result) {
        List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
        try {
            JSONObject json = JSONObject.parseObject(result);
            JSONArray jsonArry = json.getJSONArray("result");
            for(int j = 0;j < jsonArry.size();j++) {
                JSONObject data = jsonArry.getJSONObject(j);
                Map<String,Object> map = getData(data);
                if(map != null) {
                    dataList.add(map);
                }
            }
            return dataList;
        } catch (Exception e) {
            // Pass the exception itself; a bare string argument is dropped when the
            // message has no "{}" placeholder.
            logger.error("解析出错",e);
            return dataList;
        }
    }

    /**
     * Parses a single search result.
     *
     * @param data one element of the {@code result} array
     * @return the article map, or {@code null} when the element has no {@code docid} or
     *         cannot be parsed
     */
    private static Map<String,Object> getData(JSONObject data) {
        Map<String,Object> map = new HashMap<String, Object>();
        try {
            String docid = data.getString("docid");
            if(docid == null || docid.length() < 1) {
                return null;
            }
            map.put("docid", docid);
            map.put("title", data.getString("title"));
            map.put("comment_count", data.getString("comment_count")==null?0:data.getString("comment_count"));
            map.put("summary", data.getString("summary"));
            map.put("source", data.getString("source"));
            map.put("wm_copyright", data.getString("wm_copyright")==null?"":data.getString("wm_copyright"));
            map.put("time", data.getString("date"));
            String url = data.getString("url");
            // A missing url used to raise an NPE and drop the article; it now falls back to
            // the docid-based link like any url that does not contain "html".
            if(url != null && url.contains("html")) {
                map.put("url", url);
            }else {
                map.put("url", "http://www.yidianzixun.com/article/"+docid);
            }
            return map;
        } catch (Exception e) {
            logger.error("解析出错",e);
            return null;
        }
    }
}
package com.zhiwei.parse.analysis;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.httpclient.HttpClient;
public class YidianzixunCommentAnalysis {
    private static Logger logger = LoggerFactory.getLogger(YidianzixunCommentAnalysis.class);

    /**
     * Converts an article link into the base comment-list URL.
     * <p>
     * The last path segment of the link is used as the document id; a query string or
     * fragment is removed first so that it does not end up inside the {@code docid}
     * parameter.
     *
     * @param url article link
     * @return the comment-list URL ending in {@code &last_comment_id=}, or {@code null}
     *         when the link cannot be analysed
     */
    public String analysisURL(String url) {
        try {
            String path = url;
            int cut = path.indexOf('?');
            if(cut >= 0) {
                path = path.substring(0, cut);
            }
            cut = path.indexOf('#');
            if(cut >= 0) {
                path = path.substring(0, cut);
            }
            String[] docids = path.split("/");
            String docid = docids[docids.length-1];
            return "http://www.yidianzixun.com/home/q/getcomments?docid="+docid+"&s=&count=30&last_comment_id=";
        } catch (Exception e) {
            return null;
        }
    }

    /**
     * Fetches the replies of one comment.
     *
     * @param map       parsed comment; its {@code comment_id} selects the replies
     * @param cookie    not used by this method
     * @param headerMap request headers
     * @return the parsed replies (entries that fail to parse are left out), or
     *         {@code null} when the request or parsing fails
     */
    public List<Map<String, Object>> getrepliesData(Map<String, Object> map, String cookie,
            Map<String, String> headerMap) {
        List<Map<String,Object>> replylists = new ArrayList<Map<String,Object>>();
        try {
            String replyurl = "http://www.yidianzixun.com/home/q/getmorereplies?comment_id="
                    + map.get("comment_id");
            String replyresult = HttpClient.executeHttpRequestGet(replyurl, headerMap);
            JSONObject replyjson = JSONObject.parseObject(replyresult);
            JSONArray replyjsonArry = replyjson.getJSONObject("comment").getJSONArray("replies");
            for (int j = 0; j < replyjsonArry.size(); j++) {
                Map<String, Object> reply = parseJsonByAccount(replyjsonArry.getJSONObject(j));
                if (reply != null) {
                    // parseJsonByAccount returns null on failure; never store null entries.
                    replylists.add(reply);
                }
            }
        } catch (Exception e) {
            logger.error("Failed to fetch Yidianzixun comment replies", e);
            return null;
        }
        return replylists;
    }

    /**
     * Parses a single comment or reply object.
     *
     * @param data comment or reply object
     * @return the comment map, or {@code null} when the object cannot be parsed
     */
    public Map<String, Object> parseJsonByAccount(JSONObject data) {
        Map<String, Object> map = new HashMap<String, Object>();
        try {
            map.put("nickname", data.getString("nickname"));
            map.put("comment", data.getString("comment"));
            // Objects without a comment_id fall back to their reply_id.
            map.put("comment_id",
                    data.getString("comment_id") == null
                            ? data.getString("reply_id")
                            : data.getString("comment_id"));
            map.put("time", data.getString("createAt"));
            map.put("like", data.getString("like") == null ? 0
                    : data.getString("like"));
            map.put("utk", data.getString("utk"));
            map.put("replay_to", data.getString("reply_to"));
        } catch (Exception e) {
            // Pass the exception itself; a bare string argument is dropped when the
            // message has no "{}" placeholder.
            logger.error("出错了", e);
            return null;
        }
        return map;
    }
}
package com.zhiwei.util;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class TimeUtil {
    /** Year used by {@link #timeUtil(String)}, which predates the year parameter. */
    private static final int DEFAULT_YEAR = 2017;

    /**
     * Expands a year-less timestamp such as {@code "12-12 15:01"} into
     * {@code "2017-12-12 15:01:00"}.
     *
     * @param time timestamp containing exactly one '-' separated pair, may be {@code null}
     * @return the expanded timestamp, or {@code null} when {@code time} is {@code null} or
     *         does not split into exactly two parts on '-'
     */
    public static String timeUtil(String time) {
        return timeUtil(time, DEFAULT_YEAR);
    }

    /**
     * Expands a year-less timestamp by prefixing the given year and appending
     * {@code ":00"} for the seconds.
     *
     * @param time timestamp containing exactly one '-' separated pair, may be {@code null}
     * @param year year to put in front of the timestamp
     * @return the expanded timestamp, or {@code null} when {@code time} is {@code null} or
     *         does not split into exactly two parts on '-'
     */
    public static String timeUtil(String time, int year) {
        if(time == null || time.split("-").length != 2) {
            return null;
        }
        return year + "-" + time + ":00";
    }

    /** Manual check: the sample already carries a year, so {@code null} is printed. */
    public static void main(String[] args) {
        String time = "17-12-12 15:01";
        System.out.println(timeUtil(time));
    }
}
package com.zhiwei.util;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Reads keyword lists from text files.
 */
public class WordReadFile {
    private static Logger logger = LoggerFactory.getLogger(WordReadFile.class);

    /**
     * Reads keywords from a text file, one list entry per line.
     * The file is decoded as GBK.
     *
     * @param wordFileName full path of the keyword file
     * @return the lines in file order, or null when the file cannot be opened or read
     */
    public static List<String> getWords(String wordFileName) {
        try {
            BufferedReader br = new BufferedReader(
                    new InputStreamReader(new FileInputStream(wordFileName), "GBK"));
            try {
                List<String> list = new ArrayList<String>();
                String line;
                while ((line = br.readLine()) != null) {
                    list.add(line);
                }
                return list;
            } finally {
                // Release the file handle even when readLine() fails part-way through.
                br.close();
            }
        } catch (IOException e) {
            // Callers receive null, so make the cause visible at error level with the stack trace.
            logger.error("读取关键词文件失败.{}", e.getMessage(), e);
            return null;
        }
    }
}
package com.zhiwei.crawler;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.parse.Baijia;
/**
 * Manual example: calls Baijia.getBaijiaAccountData for one fixed app id
 * and prints how many rows came back.
 */
public class BaijiaAccountExample {
    @Test
    public void baijiaAccountTest() {
        // Empty start time is passed here; the original author noted the sample
        // value 2017-11-30 17:48:17 next to it.
        final String noStartTime = "";
        final String appId = "1536772853921543";
        List<Map<String, Object>> articles = Baijia.getBaijiaAccountData(appId, noStartTime);
        int total = articles.size();
        System.out.println(total);
    }
}
package com.zhiwei.crawler;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.parse.Dayu;
/**
 * Manual example: calls Dayu.getDayuAccountData for one fixed account id
 * and prints how many rows came back.
 */
public class DayuAccountExample {
    @Test
    public void dayuAccountTest() {
        // URL noted by the original author (its role is not shown in this file):
        // https://api.m.sm.cn/rest?method=Subscribe.list&format=html&from=wh10331&uc_biz_str=S:custom%7CC:search%7CN:true
        final String accountName = "";
        final String accountMid = "b38221a2bb594482aba7593bab942162";
        // Sample start time; it is not passed on - the call below supplies null as third argument.
        String startTime = "2017-12-05 22:08:01";
        List<Map<String, Object>> articles = Dayu.getDayuAccountData(accountMid, accountName, null);
        int total = articles.size();
        System.out.println(total);
    }
}
package com.zhiwei.crawler;
import org.junit.Test;
import com.zhiwei.parse.Dayu;
/**
 * Manual example: prints the value returned by Dayu.getDayuCommentCount
 * for one fixed article id.
 */
public class DayuCommentCountExample {
    @Test
    public void dayuCommentCountTest() {
        int commentCount = Dayu.getDayuCommentCount("6987993456991247474");
        System.out.println(commentCount);
    }
}
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Dayu;
/**
 * Manual example: reads article URLs from an Excel sheet, fetches the comments of
 * each article through Dayu.getDayuCommentData and writes all comments back to Excel.
 */
public class DayuCommentExample {
    @Test
    public void getDayuCommentTest() {
        // Author's note: if the historical articles were already collected, this field is
        // available there; for other articles see
        // http://m.uczzd.cn/iflow/api/v2/cmt/article/14180961224021425316/comments/byhot
        // where 14180961224021425316 is the parameter in question.
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        Map<String,Object> map = poi.importExcel("D://crawlerdata/UC评论采集.xlsx", 0);
        List<Map<String,Object>> list = (List<Map<String,Object>>)map.get("body");
        List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
        // URLs that produced no comments; printed before the export.
        List<String> urlList = new ArrayList<String>();
        for(Map<String,Object> map1 : list) {
            String url = "";
            try {
                url = map1.get("url")+"";
                // The article id is the value of the "aid" query parameter.
                String articleId = url.split("aid=")[1].split("&")[0];
                List<Map<String,Object>> dataList = Dayu.getDayuCommentData(articleId);
                // Test for null BEFORE touching the list, so a null result is recorded as
                // "no comments" instead of raising a NullPointerException.
                if(dataList == null || dataList.isEmpty()) {
                    urlList.add(url);
                } else {
                    bodyList.addAll(dataList);
                }
            } catch (Exception e) {
                // Best effort: report the failing URL and carry on with the next row.
                System.out.println(url);
                e.printStackTrace();
                continue;
            }
        }
        List<String> headList = new ArrayList<String>();
        headList.add("nickname");
        headList.add("content");
        headList.add("id");
        headList.add("url");
        headList.add("like");
        headList.add("time");
        headList.add("replay_count");
        for(String s : urlList) {
            System.out.println(s);
        }
        poi.exportExcel("D://crawlerdata/UC评论采集.xlsx", "评论", headList, bodyList);
    }
}
package com.zhiwei.crawler;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.parse.Fenghuang;
/**
 * Manual example: calls Fenghuang.getFenghuangAccountData for one fixed account id,
 * prints every returned row and then the row count.
 */
public class FenghuangAccountExample {
    @Test
    public void fenghuangAccountTest() {
        // Author's note: this takes a long time, roughly one second per article.
        final String accountId = "733691";
        // Author's note: the start time may be empty.
        final String startTime = "2017-11-15 00:00:00";
        List<Map<String, Object>> articles = Fenghuang.getFenghuangAccountData(accountId, startTime);
        for (int index = 0; index < articles.size(); index++) {
            System.out.println(articles.get(index).toString());
        }
        System.out.println(articles.size());
    }
}
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Fenghuang;
import com.zhiwei.parse.Yidianzixun;
import com.zhiwei.util.WordReadFile;
/**
 * Manual example: reads keywords from a text file, searches each one through
 * Fenghuang.getFenghuangByWord and exports all hits to Excel.
 */
public class FenghuangByWordExample {
    @Test
    public void fenghuangByWordTest() {
        List<String> wordList = WordReadFile.getWords("D://crawlerdata/关键词.txt");
        List<Map<String,Object>> listAll = new ArrayList<Map<String,Object>>();
        for(String word : wordList) {
            try {
                List<Map<String,Object>> dataList = Fenghuang.getFenghuangByWord(word);
                // Count defensively: a null result means "nothing fetched", not an exception.
                int fetched = 0;
                if(dataList != null && dataList.size() > 0) {
                    fetched = dataList.size();
                    listAll.addAll(dataList);
                }
                System.out.println(fetched+"==========="+listAll.size());
            } catch (Exception e) {
                // Report the failing keyword instead of dropping the error silently,
                // then carry on with the next keyword.
                System.out.println(word);
                e.printStackTrace();
                continue;
            }
        }
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        List<String> headList = new ArrayList<String>();
        headList.add("title");
        headList.add("content");
        headList.add("source");
        headList.add("time");
        headList.add("url");
        System.out.println(listAll.size());
        poi.exportExcel("D://crawlerdata/凤凰-美林.xlsx", "asd", headList, listAll);
    }
}
package com.zhiwei.crawler;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.parse.Fenghuang;
/**
 * Manual example: prints the map returned by Fenghuang.getFenghuangCommentCount
 * for one fixed article URL.
 */
public class FenghuangCommentCountExample {
    @Test
    public void fenghuangCommentCountTest() {
        // Other URLs noted by the original author:
        //   http://news.ifeng.com/a/20161229/50492484_0.shtml
        //   http://wemedia.ifeng.com/4096977/wemedia.shtml
        final String articleUrl = "http://wemedia.ifeng.com/40906977/wemedia.shtml";
        Map<String, Object> counts = Fenghuang.getFenghuangCommentCount(articleUrl);
        String rendered = counts.toString();
        System.out.println(rendered);
    }
}
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Fenghuang;
/**
 * Manual example: reads article URLs from an Excel sheet, fetches the comments of
 * each article through Fenghuang.getFenghuangCommentData and writes them back to Excel.
 */
public class FenghuangCommentExample {
    @Test
    public void fenghuangCommentTest() {
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        Map<String,Object> map = poi.importExcel("D://crawlerdata/凤凰评论采集.xlsx", 0);
        List<Map<String,Object>> list = (List<Map<String,Object>>)map.get("body");
        List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
        // URLs that produced no comments; printed before the export.
        List<String> urlList = new ArrayList<String>();
        for(Map<String,Object> map1 : list) {
            String url = "";
            try {
                url = map1.get("url")+"";
                List<Map<String,Object>> dataList = Fenghuang.getFenghuangCommentData(url);
                // Test for null BEFORE touching the list, so a null result is recorded as
                // "no comments" instead of raising a NullPointerException.
                if(dataList == null || dataList.isEmpty()) {
                    urlList.add(url);
                } else {
                    bodyList.addAll(dataList);
                }
            } catch (Exception e) {
                // Best effort: report the failing URL and carry on with the next row.
                System.out.println(url);
                e.printStackTrace();
                continue;
            }
        }
        List<String> headList = new ArrayList<String>();
        headList.add("nickname");
        headList.add("content");
        headList.add("id");
        // The "rootid" column is intentionally left out of the export:
        // headList.add("rootid");
        headList.add("like");
        headList.add("unlike");
        headList.add("total_num");
        headList.add("from");
        headList.add("source");
        headList.add("time");
        for(String s : urlList) {
            System.out.println(s);
        }
        poi.exportExcel("D://crawlerdata/凤凰评论采集.xlsx", "asd", headList, bodyList);
    }
}
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Meipai;
/**
 * Manual example: searches a fixed set of keywords through Meipai.getMeipaiByWordData
 * and exports every returned row to Excel.
 */
public class MeipaiByWordExample {
    @Test
    public void meipaiByWordTest() {
        // Comma-separated keyword list, split into individual search terms.
        String[] keywords = "美食,吃,菜".split(",");
        List<Map<String, Object>> rows = new ArrayList<Map<String, Object>>();
        for (int index = 0; index < keywords.length; index++) {
            List<Map<String, Object>> found = Meipai.getMeipaiByWordData(keywords[index]);
            if (found == null) {
                continue;
            }
            rows.addAll(found);
        }
        // Export columns, in output order.
        List<String> columns = new ArrayList<String>();
        for (String column : new String[] {"time", "video_count", "content", "url",
                "like", "comment_count", "source", "source_url"}) {
            columns.add(column);
        }
        PoiExcelUtil excel = PoiExcelUtil.getInstance();
        excel.exportExcel("D://crawlerdata/美拍关键词采集.xlsx", "美拍数据", columns, rows);
    }
}
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Fenghuang;
import com.zhiwei.parse.Miaopai;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
/**
 * Manual example: reads video URLs from an Excel sheet, fetches each distinct URL
 * through Miaopai.getMiaopaiDataByURL and writes the results back to Excel.
 */
public class MiaopaiByUrlExample {
    @Test
    public void miaopaiByUrlTest() {
        PoiExcelUtil excel = PoiExcelUtil.getInstance();
        Map<String, Object> sheet = excel.importExcel("D://crawlerdata/秒拍美食.xlsx", 0);
        List<Map<String, Object>> rows = (List<Map<String, Object>>) sheet.get("body");
        List<Map<String, Object>> collected = new ArrayList<Map<String, Object>>();
        // URLs already handled; a repeated URL is skipped without another request.
        List<String> seenUrls = new ArrayList<String>();
        for (Map<String, Object> row : rows) {
            String url = "";
            try {
                url = row.get("url") + "";
                if (!seenUrls.contains(url)) {
                    seenUrls.add(url);
                    // Pause before each request via ZhiWeiTools.sleep(5000).
                    ZhiWeiTools.sleep(5000);
                    System.out.println(url);
                    Map<String, Object> video = Miaopai.getMiaopaiDataByURL(url);
                    if (video != null) {
                        collected.add(video);
                    }
                }
            } catch (Exception e) {
                // Best effort: report the failing URL and move on to the next row.
                System.out.println(url);
                e.printStackTrace();
            }
        }
        // Export columns, in output order.
        List<String> columns = new ArrayList<String>();
        for (String column : new String[] {"time", "source", "title", "url", "video_count"}) {
            columns.add(column);
        }
        excel.exportExcel("D://crawlerdata/秒拍美食.xlsx", "asd", columns, collected);
    }
}
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.QQ;
/**
 * Manual example: calls QQ.getQQAccountData for one fixed account id with a fixed
 * cookie, prints the row count and exports the rows to Excel.
 */
public class QQAccountExample {
    @Test
    public void qqAccountTest() {
        final String childId = "5002744";
        // NOTE(review): this literal contains what look like session credentials
        // (skey, access_token, refresh_token) checked into source - consider loading
        // the cookie from configuration instead.
        final String cookie = "phone_id=;%20phone_token=;%20luin=o0497332654;%20lskey=00030000db3c2ec2393ea968f523f50144db7ab5aec60e79d2509c271bdacdf784e88ac1f58b7493c23ceb15;%20uin=o0497332654;%20skey=M67MOgvFQJ;%20sigA2=D3046D543D9BA50CFE749D63B1F05AF28A281C29B4F1353374AB7A19D9527497A67E507C6829AE44F67C1EA032C2A3728301D2ABC864DA32BCA7D4C7A61609F9F3BC9AE0A7243003;%20openid=oijc7uLZNVp85r-MpFBYxwxzdEkg;%20appid=wxe90c9765ad00e2cd;%20access_token=5_IWbsKfygpJ0lnbUnnFnfwmUT_jxJCnY5yVwhmL3e2K5FOTRth6jz8SKVHGseA3v9s8UIZxw00LpF1uC9l7W5WL2trdb69LlCvE1s7twReOw;%20refresh_token=5_IWbsKfygpJ0lnbUnnFnfwgDT4pA9HEbY-wuqnqIHWf9AzdmIueZFuzHYfnZPuSNEc0ZjDuHXrtSrRBMMD-7kgj06iF0NdAOi-KRj6-mrmlA;%20unionid=onCs1uNNpjMXeYIHAhacGypamEBk;%20logintype=0";
        List<Map<String, Object>> articles = QQ.getQQAccountData(childId, cookie);
        System.out.println(articles.size());
        PoiExcelUtil excel = PoiExcelUtil.getInstance();
        // Export columns, in output order.
        List<String> columns = new ArrayList<String>();
        for (String column : new String[] {"title", "time", "content", "url", "commentid"}) {
            columns.add(column);
        }
        excel.exportExcel("D://crawlerdata/qq-5002744.xlsx", "asd", columns, articles);
    }
}
package com.zhiwei.crawler;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.parse.Souhu;
/**
 * Manual example: calls Souhu.getSouHuAccountData for one fixed account key,
 * prints the row count and then every row followed by its zero-based index.
 */
public class SouhuAccountExample {
    @Test
    public void souhuAccountTest() {
        List<Map<String, Object>> articles =
                Souhu.getSouHuAccountData("MjQ4MDQ5Nzg2MEBzaW5hLnNvaHUuY29t", null, true);
        System.out.println(articles.size());
        for (int index = 0; index < articles.size(); index++) {
            System.out.println(articles.get(index).toString());
            System.out.println(index);
        }
    }
}
package com.zhiwei.crawler;
import org.junit.Test;
import com.zhiwei.parse.Souhu;
/**
 * Manual example: prints the value returned by Souhu.getSouhuCommentCount
 * for one fixed article URL.
 */
public class SouhuCommentCountExample {
    @Test
    public void souhuCommentCountTest() {
        int commentCount =
                Souhu.getSouhuCommentCount("https://www.sohu.com/a/210588884_267106?_f=index_news_7");
        System.out.println(commentCount);
    }
}
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Fenghuang;
import com.zhiwei.parse.Souhu;
/**
 * Manual example: reads article URLs from an Excel sheet, fetches the comments of
 * each article through Souhu.getSouhuCommentData and writes them back to Excel.
 */
public class SouhuCommentExample {
    @Test
    public void souhuCommentTest() {
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        Map<String,Object> map = poi.importExcel("D://crawlerdata/搜狐评论采集.xlsx", 0);
        List<Map<String,Object>> list = (List<Map<String,Object>>)map.get("body");
        List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
        // URLs that produced no comments; printed before the export.
        List<String> urlList = new ArrayList<String>();
        for(Map<String,Object> map1 : list) {
            String url = "";
            try {
                url = map1.get("url")+"";
                List<Map<String,Object>> dataList = Souhu.getSouhuCommentData(url);
                // Test for null BEFORE touching the list, so a null result is recorded as
                // "no comments" instead of raising a NullPointerException.
                if(dataList == null || dataList.isEmpty()) {
                    urlList.add(url);
                } else {
                    bodyList.addAll(dataList);
                }
            } catch (Exception e) {
                // Best effort: report the failing URL and carry on with the next row.
                System.out.println(url);
                e.printStackTrace();
                continue;
            }
        }
        List<String> headList = new ArrayList<String>();
        headList.add("nickname");
        headList.add("content");
        headList.add("user_id");
        // NOTE(review): "loaction" is kept as spelled; presumably it must match the key
        // produced by the Souhu parser - confirm before renaming.
        headList.add("loaction");
        headList.add("support_count");
        headList.add("comment_id");
        headList.add("reply_id");
        headList.add("time");
        for(String s : urlList) {
            System.out.println(s);
        }
        poi.exportExcel("D://crawlerdata/搜狐评论采集.xlsx", "搜狐评论", headList, bodyList);
    }
}
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Yidianzixun;
import com.zhiwei.util.WordReadFile;
/**
 * Manual example: reads keywords from a text file, searches each one through
 * Yidianzixun.getYidianzixunDataByWord and exports all hits to Excel.
 */
public class YidainzixunByWordExample {
    @Test
    public void yidianzixunByWordTest() {
        List<String> wordList = WordReadFile.getWords("D://crawlerdata/关键词.txt");
        List<Map<String,Object>> listAll = new ArrayList<Map<String,Object>>();
        for(String word : wordList) {
            List<Map<String,Object>> dataList = Yidianzixun.getYidianzixunDataByWord(word);
            // Treat a null result as "no hits" so one failed keyword does not abort the
            // whole run before anything is exported.
            int fetched = 0;
            if(dataList != null) {
                fetched = dataList.size();
                listAll.addAll(dataList);
            }
            System.out.println(fetched);
            System.out.println(listAll.size());
        }
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        List<String> headList = new ArrayList<String>();
        headList.add("docid");
        headList.add("title");
        headList.add("comment_count");
        headList.add("summary");
        headList.add("source");
        headList.add("wm_copyright");
        headList.add("time");
        headList.add("url");
        System.out.println(listAll.size());
        poi.exportExcel("D://crawlerdata/一点资讯-美林.xlsx", "asd", headList, listAll);
    }
}
package com.zhiwei.crawler;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.parse.Yidianzixun;
/**
 * Manual example: calls Yidianzixun.getYidianzixunAccountData for one fixed channel id
 * and start time, and prints every returned row.
 */
public class YidianzixunAccountExample {
    @Test
    public void yidianzixunAccountTest() {
        final String startTime = "2017-09-10 09:42:05";
        final String channelId = "m133695";
        List<Map<String, Object>> articles = Yidianzixun.getYidianzixunAccountData(channelId, startTime);
        for (int index = 0; index < articles.size(); index++) {
            System.out.println(articles.get(index).toString());
        }
    }
}
package com.zhiwei.crawler;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.parse.Yidianzixun;
/**
 * Manual example: calls Yidianzixun.getYidianzixunCommentData for one fixed article URL,
 * prints the row count and then every row.
 */
public class YidianzixunCommentExample {
    @Test
    public void yidianzixunCommentTest() {
        final String articleUrl = "http://www.yidianzixun.com/article/0HjrjVFY";
        List<Map<String, Object>> comments = Yidianzixun.getYidianzixunCommentData(articleUrl);
        System.out.println(comments.size());
        for (int index = 0; index < comments.size(); index++) {
            System.out.println(comments.get(index).toString());
        }
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment