Commit 9234d24c by yangchen

更新

parent cb5516a0
...@@ -3,42 +3,27 @@ ...@@ -3,42 +3,27 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId> <groupId>com.zhiwei</groupId>
<artifactId>articlenewscrawler</artifactId> <artifactId>articlenewscrawler</artifactId>
<version>0.1.3-SNAPSHOT</version> <version>0.1.6-SNAPSHOT</version>
<name>articlenewscrawler</name> <name>articlenewscrawler</name>
<description>采集凤凰,一点资讯,搜狐历时文章和文章评论</description> <description>采集凤凰,一点资讯,搜狐历时文章和文章评论</description>
<dependencies> <dependencies>
<dependency> <dependency>
<groupId>org.testng</groupId>
<artifactId>testng</artifactId>
<version>6.14.3</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.29</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.11</version>
</dependency>
<dependency>
<groupId>com.zhiwei</groupId> <groupId>com.zhiwei</groupId>
<artifactId>excelpoi</artifactId> <artifactId>excelpoi</artifactId>
<version>0.0.1-SNAPSHOT</version> <version>0.0.3-SNAPSHOT</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.zhiwei.tools</groupId> <groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId> <artifactId>zhiwei-tools</artifactId>
<version>0.1.2-SNAPSHOT</version> <version>0.1.3-SNAPSHOT</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.zhiwei.crawler</groupId> <groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId> <artifactId>crawler-core</artifactId>
<version>0.3.0-RELEASE</version> <version>0.3.6-RELEASE</version>
<scope>provided</scope> <scope>provided</scope>
</dependency> </dependency>
</dependencies> </dependencies>
<!-- 打包管理 --> <!-- 打包管理 -->
......
...@@ -113,14 +113,14 @@ public class HeadGet { ...@@ -113,14 +113,14 @@ public class HeadGet {
* @throws IOException * @throws IOException
*/ */
public static Map<String,String> getFenghuangAccountHeaderMap(String cookie) { public static Map<String,String> getFenghuangAccountHeaderMap(String cookie) {
Map<String,String> headerMap = new HashMap<String, String>(); Map<String,String> headerMap = new HashMap<>();
headerMap.put("User-Agent", headerMap.put("User-Agent",
"IfengNews/6.1.8 (iPhone; iOS 11.2.1; Scale/2.00)"); "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36");
headerMap.put("Accept", headerMap.put("Accept",
"*/*"); "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3");
headerMap.put("Accept-Language", "zh-cn"); headerMap.put("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8");
headerMap.put("Connection", "keep-alive"); headerMap.put("Connection", "keep-alive");
headerMap.put("Host", "api.3g.ifeng.com"); headerMap.put("Host", "shankapi.ifeng.com");
if(cookie != null) { if(cookie != null) {
headerMap.put("Cookie", cookie); headerMap.put("Cookie", cookie);
} }
......
...@@ -16,7 +16,7 @@ import okhttp3.Response; ...@@ -16,7 +16,7 @@ import okhttp3.Response;
public class HttpClient { public class HttpClient {
private static Logger logger = LoggerFactory.getLogger(HttpClient.class); private static Logger logger = LoggerFactory.getLogger(HttpClient.class);
private static HttpBoot httpBoot = new HttpBoot(false,2); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
/** /**
* *
...@@ -44,15 +44,27 @@ public class HttpClient { ...@@ -44,15 +44,27 @@ public class HttpClient {
* @throws IOException * @throws IOException
*/ */
public static String executeHttpRequestGet(String url,ProxyHolder proxy,Map<String, String> headerMap) { public static String executeHttpRequestGet(String url,ProxyHolder proxy,Map<String, String> headerMap) {
try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy)){ for(int i = 0;i < 3;i++) {
try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy)){
return response.body().string();
} catch (Exception e) {
logger.error("httpClient 获取数据出现问题:{}", e);
}
}
return null;
}
public static String executeHttpRequestPost(String url,Proxy proxy,Map<String, String> headerMap,Map<String, Object> paramMap) {
try (Response response = httpBoot.syncCall(RequestUtils.wrapPost(url, headerMap, paramMap), proxy)){
return response.body().string(); return response.body().string();
} catch (Exception e) { } catch (Exception e) {
logger.error("httpClient 获取数据出现问题:{}", e); logger.error("httpClient 获取数据出现问题:{}", e);
return null; return null;
} }
} }
public static String executeHttpRequestPost(String url,Proxy proxy,Map<String, String> headerMap,Map<String, Object> paramMap) { public static String executeHttpRequestPost(String url,ProxyHolder proxy,Map<String, String> headerMap,Map<String, Object> paramMap) {
try (Response response = httpBoot.syncCall(RequestUtils.wrapPost(url, headerMap, paramMap), proxy)){ try (Response response = httpBoot.syncCall(RequestUtils.wrapPost(url, headerMap, paramMap), proxy)){
return response.body().string(); return response.body().string();
} catch (Exception e) { } catch (Exception e) {
......
...@@ -20,7 +20,7 @@ public class Aika { ...@@ -20,7 +20,7 @@ public class Aika {
private static Logger logger = LoggerFactory.getLogger(Aika.class); private static Logger logger = LoggerFactory.getLogger(Aika.class);
private static AikaCommentAnalysis aikaCommentAnalysis = new AikaCommentAnalysis(); private static AikaCommentAnalysis aikaCommentAnalysis = new AikaCommentAnalysis();
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
public static List<Map<String,Object>> getAikaComment(String url,ProxyHolder proxy) { public static List<Map<String,Object>> getAikaComment(String url,ProxyHolder proxy) {
...@@ -46,6 +46,7 @@ public class Aika { ...@@ -46,6 +46,7 @@ public class Aika {
page++; page++;
} catch (Exception e) { } catch (Exception e) {
logger.error("爱卡汽车 评论采集出错 {}", e); logger.error("爱卡汽车 评论采集出错 {}", e);
break;
} }
} }
......
...@@ -23,7 +23,7 @@ import okhttp3.Response; ...@@ -23,7 +23,7 @@ import okhttp3.Response;
public class Aiqiyi { public class Aiqiyi {
private static Logger logger = LoggerFactory.getLogger(Aiqiyi.class); private static Logger logger = LoggerFactory.getLogger(Aiqiyi.class);
private static AiqiyiByWordAnalysis aiqiyiByWordAnalysis = new AiqiyiByWordAnalysis(); private static AiqiyiByWordAnalysis aiqiyiByWordAnalysis = new AiqiyiByWordAnalysis();
private static HttpBoot httpBoot = new HttpBoot(false, 2); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
/** /**
* *
......
...@@ -2,8 +2,11 @@ package com.zhiwei.parse; ...@@ -2,8 +2,11 @@ package com.zhiwei.parse;
import java.net.Proxy; import java.net.Proxy;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Objects;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
...@@ -23,7 +26,7 @@ import okhttp3.Request; ...@@ -23,7 +26,7 @@ import okhttp3.Request;
public class Baijia { public class Baijia {
private static Logger logger = LoggerFactory.getLogger(Baijia.class); private static Logger logger = LoggerFactory.getLogger(Baijia.class);
private static BaijiaAccountAnalysis baijiaAccountAnalysis = new BaijiaAccountAnalysis(); private static BaijiaAccountAnalysis baijiaAccountAnalysis = new BaijiaAccountAnalysis();
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
/** /**
* *
...@@ -77,27 +80,29 @@ public class Baijia { ...@@ -77,27 +80,29 @@ public class Baijia {
Map<String,String> headerMap = HeaderTool.getCommonHead(); Map<String,String> headerMap = HeaderTool.getCommonHead();
List<Map<String,Object>> dataList = new ArrayList<>(); List<Map<String,Object>> dataList = new ArrayList<>();
headerMap.put("cookie",cookie); headerMap.put("cookie",cookie);
String uk = getUkData(app_id,proxy,cookie);
if(Objects.isNull(uk)) {
return Collections.emptyList();
}
boolean f = true; boolean f = true;
int n = 0; String ctime = "";
while(f) { while(f) {
for(int i = 1;i < 3;i++) { for(int i = 1;i < 3;i++) {
try { try {
String url = "https://author.baidu.com/list?type=article&context={%22offset%22:%22-1_"+n+"%22,%22app_id%22:%22"+app_id+"%22,%22pageSize%22:20}"; String url = "https://author.baidu.com/list?type=article&tab=2&uk="+uk+"&ctime="+ctime+"&num=50";
System.out.println(url);
Request request = RequestUtils.wrapGet(url, headerMap); Request request = RequestUtils.wrapGet(url, headerMap);
String result = httpBoot.syncCall(request, proxy, false).body().string(); String result = httpBoot.syncCall(request, proxy).body().string();
Map<String,Object> dMap = baijiaAccountAnalysis.getBaijiaAccountData3(result,name, startTime); Map<String,Object> dMap = baijiaAccountAnalysis.getBaijiaAccountData3(result,name, startTime);
List<Map<String,Object>> dList = (List<Map<String, Object>>) dMap.get("data"); List<Map<String,Object>> dList = (List<Map<String, Object>>) dMap.get("data");
dataList.addAll(dList); dataList.addAll(dList);
logger.info("{} 数据采集结果 {}",name, dataList.size()); logger.info("{} 数据采集结果 {}",app_id, dataList.size());
if(!(boolean) dMap.get("more")) { if(!(boolean) dMap.get("more")) {
f = false; f = false;
} }
ctime = String.valueOf(dMap.get("ctime"));
ZhiWeiTools.sleep(3000); ZhiWeiTools.sleep(3000);
n += 20;
break; break;
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace();
ZhiWeiTools.sleep(3000); ZhiWeiTools.sleep(3000);
} }
} }
...@@ -106,6 +111,22 @@ public class Baijia { ...@@ -106,6 +111,22 @@ public class Baijia {
return dataList; return dataList;
} }
/**
 * Fetches the "uk" identifier of a Baijiahao author from the author profile endpoint.
 *
 * <p>Requests the profile pagelet for {@code app_id} and returns the text that follows the
 * escaped {@code uk\":\"} marker in the response, up to the next {@code \",}. The request is
 * attempted up to three times; every failed attempt is logged with its cause.
 *
 * @param app_id account id inserted into the {@code context} parameter of the profile URL
 * @param proxy proxy handed to {@code httpBoot.syncCall}
 * @param cookie value sent as the {@code cookie} request header
 * @return the extracted uk value, or {@code null} when all three attempts fail
 */
private static String getUkData(String app_id,Proxy proxy,String cookie) {
    String url = "https://author.baidu.com/profile?context={\"from\":0,\"app_id\":\""
            +app_id+"\"}&cmdType=&pagelets[]=root&reqID=0&ispeed=1#";
    Map<String,Object> headers = new HashMap<>();
    headers.put("Host", "author.baidu.com");
    headers.put("cookie", cookie);
    for (int attempt = 1; attempt <= 3; attempt++) {
        try {
            String result = httpBoot.syncCall(RequestUtils.wrapGet(url,headers), proxy).body().string();
            // The payload embeds escaped JSON, so the marker to look for is the literal text uk\":\"
            String[] afterMarker = result.split("uk\\\\\":\\\\\"");
            if (afterMarker.length < 2) {
                // Marker absent: report it explicitly instead of relying on an index exception.
                logger.error("百家号uk 获取失败, app_id={}, attempt={}: uk marker not found in response", app_id, attempt);
                continue;
            }
            // The value ends at the first escaped quote followed by a comma: \",
            String[] beforeTerminator = afterMarker[1].split("\\\\\",");
            if (beforeTerminator.length == 0) {
                logger.error("百家号uk 获取失败, app_id={}, attempt={}: uk value is empty", app_id, attempt);
                continue;
            }
            return beforeTerminator[0];
        } catch (Exception e) {
            // Pass the throwable as the last argument so the cause and stack trace are logged.
            logger.error("百家号uk 获取失败, app_id={}, attempt={}", app_id, attempt, e);
        }
    }
    return null;
}
/** /**
* *
* @Description 百家号历史文章采集 * @Description 百家号历史文章采集
...@@ -114,7 +135,7 @@ public class Baijia { ...@@ -114,7 +135,7 @@ public class Baijia {
* @return * @return
*/ */
public static List<Map<String,Object>> getBaijiaAccountData(String app_id,String startTime,Proxy proxy) { public static List<Map<String,Object>> getBaijiaAccountData(String app_id,String startTime,Proxy proxy) {
List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>(); List<Map<String,Object>> dataList = new ArrayList<>();
int i = 0; int i = 0;
Map<String,String> headerMap = HeadGet.getBaijiaAccountHeaderMap(null); Map<String,String> headerMap = HeadGet.getBaijiaAccountHeaderMap(null);
try { try {
......
...@@ -12,28 +12,28 @@ import org.slf4j.Logger; ...@@ -12,28 +12,28 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.HttpRequestBuilder; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.parse.analysis.BilibilikeyWordAnalysis; import com.zhiwei.parse.analysis.BilibilikeyWordAnalysis;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Headers; import okhttp3.Headers;
import okhttp3.Request;
public class BiliBili { public class BiliBili {
private static final Logger logger = LoggerFactory.getLogger(BiliBili.class); private static final Logger logger = LoggerFactory.getLogger(BiliBili.class);
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).useCookieJar(true).build();
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
public static List<Map<String,Object>> getData(String word,Proxy proxy,String cookie) { public static List<Map<String,Object>> getData(String word,Proxy proxy,String endTime,String cookie) {
List<Map<String,Object>> bodyList = new ArrayList<>(); List<Map<String,Object>> bodyList = new ArrayList<>();
try { try {
//
String url = "https://search.bilibili.com/all?keyword="+URLEncoder.encode(word, "utf-8")+"&order=pubdate&duration=0&tids_1=0"; String url = "https://search.bilibili.com/all?keyword="+URLEncoder.encode(word, "utf-8")+"&order=pubdate&duration=0&tids_1=0";
Headers header = Headers.of("cookie",cookie,"Referer","https://www.bilibili.com/","Host","search.bilibili.com"); Headers header = Headers.of("cookie",cookie,"Referer","https://www.bilibili.com/","Host","search.bilibili.com");
Request request = HttpRequestBuilder.newGetRequest(url, header); String result = httpBoot.syncCall(RequestUtils.wrapGet(url, header), ProxyHolder.NAT_HEAVY_PROXY).body().string();
String result = httpBoot.syncCall(request, proxy).body().string(); ZhiWeiTools.sleep(100);
ZhiWeiTools.sleep(3000); Map<String,Object> map = BilibilikeyWordAnalysis.getData(result,word,endTime);
Map<String,Object> map = BilibilikeyWordAnalysis.getData(result,word);
boolean more = (boolean) map.get("more"); boolean more = (boolean) map.get("more");
List<Map<String,Object>> dataList = (List<Map<String, Object>>) map.get("data"); List<Map<String,Object>> dataList = (List<Map<String, Object>>) map.get("data");
if(dataList != null) { if(dataList != null) {
...@@ -43,27 +43,23 @@ public class BiliBili { ...@@ -43,27 +43,23 @@ public class BiliBili {
while(more) { while(more) {
map.clear(); map.clear();
String ur = url + "&page=" + n; String ur = url + "&page=" + n;
System.out.println(ur); String result2 = httpBoot.syncCall(RequestUtils.wrapGet(ur, header), ProxyHolder.NAT_HEAVY_PROXY).body().string();
request = HttpRequestBuilder.newGetRequest(ur, header); map = BilibilikeyWordAnalysis.getData(result2,word,endTime);
String result2 = httpBoot.syncCall(request, proxy).body().string();
map = BilibilikeyWordAnalysis.getData(result2,word);
List<Map<String,Object>> dataList2 = (List<Map<String, Object>>) map.get("data"); List<Map<String,Object>> dataList2 = (List<Map<String, Object>>) map.get("data");
if(dataList2 != null) { if(dataList2 != null) {
bodyList.addAll(dataList2); bodyList.addAll(dataList2);
} }
System.out.println(n + "页,数据总量为 -- " + bodyList.size() ); logger.info("word {} , {} 页,数据总量为 -- {}",word,n, bodyList.size());
more = (boolean) map.get("more"); more = (boolean) map.get("more");
n++; n++;
ZhiWeiTools.sleep(3000); ZhiWeiTools.sleep(100);
} }
return bodyList; return bodyList;
} catch (UnsupportedEncodingException e) { } catch (UnsupportedEncodingException e) {
logger.error("e ",e); logger.error("e {}",e);
} catch (Exception e) { } catch (Exception e) {
logger.error("e ",e); logger.error("e {}",e);
} }
return Collections.emptyList(); return Collections.emptyList();
} }
......
...@@ -25,7 +25,7 @@ import okhttp3.Response; ...@@ -25,7 +25,7 @@ import okhttp3.Response;
public class Chejia { public class Chejia {
private static final Logger logger = LoggerFactory.getLogger(Chejia.class); private static final Logger logger = LoggerFactory.getLogger(Chejia.class);
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
/** /**
* *
......
...@@ -11,6 +11,7 @@ import org.slf4j.Logger; ...@@ -11,6 +11,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.httpclient.HeadGet; import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient; import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.parse.analysis.DayuAccountAnalysis; import com.zhiwei.parse.analysis.DayuAccountAnalysis;
...@@ -30,26 +31,23 @@ public class Dayu { ...@@ -30,26 +31,23 @@ public class Dayu {
* @param mid * @param mid
* @return * @return
*/ */
public static List<Map<String,Object>> getDayuAccountData(String mid,String name,String startTime,Proxy proxy) { public static List<Map<String,Object>> getDayuAccountData(String mid,String name,String startTime,ProxyHolder proxy) {
int i = 1; int i = 1;
Map<String,String> headerMap = HeadGet.getDayuAccountHeaderMap(null); Map<String,String> headerMap = HeadGet.getDayuAccountHeaderMap(null);
List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>(); List<Map<String,Object>> dataList = new ArrayList<>();
try { try {
while(true) { while(true) {
String url = "http://ff.dayu.com/contents/author/"+mid+"?biz_id=1002&_size=50&_page="+i+"&_order_type=published_at&status=1&_fetch=1"; String url = "http://ff.dayu.com/contents/author/"+mid+"?biz_id=1002&_size=50&_page="+i+"&_order_type=published_at&status=1&_fetch=1";
String result = HttpClient.executeHttpRequestGet(url,proxy, headerMap); String result = HttpClient.executeHttpRequestGet(url,proxy, headerMap);
System.out.println(url); System.out.println(url);
List<Map<String,Object>> lists = dayuAccountAnalysis.getDayuAccountData(result,name,startTime); List<Map<String,Object>> lists = dayuAccountAnalysis.getDayuAccountData(result,name,startTime);
if(lists == null) { if(lists == null || lists.isEmpty()) {
break;
}
if(lists.size() < 1) {
break; break;
} }
dataList.addAll(lists); dataList.addAll(lists);
System.out.println("================解析第"+i+"页====此时有数据=="+dataList.size()); System.out.println("================解析第"+i+"页====此时有数据=="+dataList.size());
i++; i++;
ZhiWeiTools.sleep(7000); ZhiWeiTools.sleep(100);
} }
return dataList; return dataList;
} catch (Exception e) { } catch (Exception e) {
......
...@@ -25,7 +25,7 @@ public class Douban { ...@@ -25,7 +25,7 @@ public class Douban {
private static final Logger logger = LoggerFactory.getLogger(Double.class); private static final Logger logger = LoggerFactory.getLogger(Double.class);
private static DoubanCommentAnalysis doubanCommentAnalysis = new DoubanCommentAnalysis(); private static DoubanCommentAnalysis doubanCommentAnalysis = new DoubanCommentAnalysis();
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
/** /**
* *
......
...@@ -19,6 +19,7 @@ import com.zhiwei.parse.analysis.FenghuangCommentAnalysis; ...@@ -19,6 +19,7 @@ import com.zhiwei.parse.analysis.FenghuangCommentAnalysis;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
public class Fenghuang { public class Fenghuang {
private static Logger logger = LoggerFactory.getLogger(Fenghuang.class); private static Logger logger = LoggerFactory.getLogger(Fenghuang.class);
private static FenghuangAccountAnalysis fenghuangAccountAnalysis = new FenghuangAccountAnalysis(); private static FenghuangAccountAnalysis fenghuangAccountAnalysis = new FenghuangAccountAnalysis();
private static FenghuangCommentAnalysis fenghuangCommentAnalysis = new FenghuangCommentAnalysis(); private static FenghuangCommentAnalysis fenghuangCommentAnalysis = new FenghuangCommentAnalysis();
...@@ -31,7 +32,7 @@ public class Fenghuang { ...@@ -31,7 +32,7 @@ public class Fenghuang {
* @param startTime 可不传 格式(2017-12-09 17:53:02) * @param startTime 可不传 格式(2017-12-09 17:53:02)
* @return * @return
*/ */
public static List<Map<String,Object>> getFenghuangAccountData(String id,String startTime,Proxy proxy) { public static List<Map<String,Object>> getFenghuangAccountData(String id,String startTime,ProxyHolder proxy) {
List<Map<String,Object>> dataList = new ArrayList<>(); List<Map<String,Object>> dataList = new ArrayList<>();
int i = 1; int i = 1;
boolean f = true; boolean f = true;
...@@ -39,17 +40,17 @@ public class Fenghuang { ...@@ -39,17 +40,17 @@ public class Fenghuang {
try { try {
for(int j = 0;j< 3;j++){ for(int j = 0;j< 3;j++){
f = true; f = true;
String url = "http://api.3g.ifeng.com/api_wemedia_index?followid=weMedia_"+id+"&page="+i+"&pagesize=20&tag=article&uid=fe659b7e510444c28a31f88dee7a2747"; String url = "https://shankapi.ifeng.com/winter/feng/author/getFengAuthorListData/"+id+"/doc/"+i+"/getFengAuthorListData";
List<Map<String,Object>> list = fenghuangAccountAnalysis.getArticleData(url, startTime,proxy); List<Map<String,Object>> list = fenghuangAccountAnalysis.getArticleData(url, startTime,proxy);
if(list != null && !list.isEmpty()) { if(list != null && !list.isEmpty()) {
dataList.addAll(list); dataList.addAll(list);
logger.info("====================采集第 {} 页===共获取数据== {}",i,dataList.size()); logger.info("采集第 {} 页,.共获取数据{}",i,dataList.size());
i++; i++;
ZhiWeiTools.sleep(2000); ZhiWeiTools.sleep(100);
break; break;
} }
f = false; f = false;
ZhiWeiTools.sleep(2000); ZhiWeiTools.sleep(100);
} }
} catch (Exception e) { } catch (Exception e) {
logger.error("程序出错 {}",e); logger.error("程序出错 {}",e);
......
...@@ -18,7 +18,7 @@ public class Gftai { ...@@ -18,7 +18,7 @@ public class Gftai {
private static final Logger logger = LoggerFactory.getLogger(Gftai.class); private static final Logger logger = LoggerFactory.getLogger(Gftai.class);
private static GftaiAnalysis gftaiAnalysis = new GftaiAnalysis(); private static GftaiAnalysis gftaiAnalysis = new GftaiAnalysis();
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
public static List<Map<String,Object>> getData(String word,Proxy proxy) { public static List<Map<String,Object>> getData(String word,Proxy proxy) {
List<Map<String,Object>> dataList = new ArrayList<>(); List<Map<String,Object>> dataList = new ArrayList<>();
......
...@@ -19,7 +19,7 @@ public class KuaiTousu { ...@@ -19,7 +19,7 @@ public class KuaiTousu {
private static Logger logger = LoggerFactory.getLogger(KuaiTousu.class); private static Logger logger = LoggerFactory.getLogger(KuaiTousu.class);
private static KuaiTousuAnalysis kuaiTousuAnalysis = new KuaiTousuAnalysis(); private static KuaiTousuAnalysis kuaiTousuAnalysis = new KuaiTousuAnalysis();
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
public static List<Map<String,Object>> getData(String word,Proxy proxy) { public static List<Map<String,Object>> getData(String word,Proxy proxy) {
int page = 1; int page = 1;
......
...@@ -30,7 +30,7 @@ import okhttp3.Response; ...@@ -30,7 +30,7 @@ import okhttp3.Response;
public class Maimai { public class Maimai {
private static Logger logger = LoggerFactory.getLogger(Maimai.class); private static Logger logger = LoggerFactory.getLogger(Maimai.class);
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
private static MaimaiBywordAnalysis maimaiBywordAnalysis = new MaimaiBywordAnalysis(); private static MaimaiBywordAnalysis maimaiBywordAnalysis = new MaimaiBywordAnalysis();
......
...@@ -22,7 +22,7 @@ public class Pcauto { ...@@ -22,7 +22,7 @@ public class Pcauto {
private static Logger logger = LoggerFactory.getLogger(Pcauto.class); private static Logger logger = LoggerFactory.getLogger(Pcauto.class);
private static PcautoCommentAnalysis pcautoCommentAnalysis = new PcautoCommentAnalysis(); private static PcautoCommentAnalysis pcautoCommentAnalysis = new PcautoCommentAnalysis();
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
public static List<Map<String, Object>> getPcAutoComment(String url,ProxyHolder proxy) { public static List<Map<String, Object>> getPcAutoComment(String url,ProxyHolder proxy) {
......
...@@ -13,6 +13,7 @@ import com.alibaba.fastjson.JSONArray; ...@@ -13,6 +13,7 @@ import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.bean.QQkbUser; import com.zhiwei.bean.QQkbUser;
import com.zhiwei.crawler.proxy.ProxyFactory; import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.httpclient.HeadGet; import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient; import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.parse.analysis.QQKBAccountAnalysis; import com.zhiwei.parse.analysis.QQKBAccountAnalysis;
...@@ -120,7 +121,7 @@ public class QQKB { ...@@ -120,7 +121,7 @@ public class QQKB {
while(true) { while(true) {
try { try {
String result = HttpClient.executeHttpRequestPost("http://r.cnews.qq.com/getQQNewsComment",ProxyFactory.getNatProxy(), headerMap, paramMap); String result = HttpClient.executeHttpRequestPost("http://r.cnews.qq.com/getQQNewsComment",ProxyHolder.NAT_HEAVY_PROXY, headerMap, paramMap);
paramMap.clear(); paramMap.clear();
List<Map<String,Object>> lists = qqkbCommentAnalysis.getCommentData(result,null,comment_id, article_id,proxy); List<Map<String,Object>> lists = qqkbCommentAnalysis.getCommentData(result,null,comment_id, article_id,proxy);
if(lists == null || lists.size() < 1) { if(lists == null || lists.size() < 1) {
...@@ -148,7 +149,7 @@ public class QQKB { ...@@ -148,7 +149,7 @@ public class QQKB {
String cookie = "luin=o0497332654;%20lskey=00030000d63ffaf7eba88c86106eac5f2910d45515222334b91c75a66b449c990c2be43cd202ba39b35bef60;%20uin=o0497332654;%20skey=MH3wukytS4;%20sigA2=7AB4D8DEDF73E313801FD348FD77EC3B05C06DBC4D9DA669B20CA04A8D6B80F300A69567FBD11A7B799E419BB796F22D47D3AE5FA95E708A0ABC66161061131B0B21A0031AA0807C;%20openid=oijc7uLZNVp85r-MpFBYxwxzdEkg;%20appid=wxe90c9765ad00e2cd;%20access_token=5_IWbsKfygpJ0lnbUnnFnfwmSQ3EqlyzseC9-qGEFY7Tkr0Ypk5vsnSwOaMC-IGxsBeY2K7knHrYstj_5dZpisJd5nihvLNQvCdsFhFwZQcT8;%20refresh_token=5_IWbsKfygpJ0lnbUnnFnfwgDT4pA9HEbY-wuqnqIHWf9AzdmIueZFuzHYfnZPuSNEc0ZjDuHXrtSrRBMMD-7kgj06iF0NdAOi-KRj6-mrmlA;%20unionid=onCs1uNNpjMXeYIHAhacGypamEBk;%20logintype=0"; String cookie = "luin=o0497332654;%20lskey=00030000d63ffaf7eba88c86106eac5f2910d45515222334b91c75a66b449c990c2be43cd202ba39b35bef60;%20uin=o0497332654;%20skey=MH3wukytS4;%20sigA2=7AB4D8DEDF73E313801FD348FD77EC3B05C06DBC4D9DA669B20CA04A8D6B80F300A69567FBD11A7B799E419BB796F22D47D3AE5FA95E708A0ABC66161061131B0B21A0031AA0807C;%20openid=oijc7uLZNVp85r-MpFBYxwxzdEkg;%20appid=wxe90c9765ad00e2cd;%20access_token=5_IWbsKfygpJ0lnbUnnFnfwmSQ3EqlyzseC9-qGEFY7Tkr0Ypk5vsnSwOaMC-IGxsBeY2K7knHrYstj_5dZpisJd5nihvLNQvCdsFhFwZQcT8;%20refresh_token=5_IWbsKfygpJ0lnbUnnFnfwgDT4pA9HEbY-wuqnqIHWf9AzdmIueZFuzHYfnZPuSNEc0ZjDuHXrtSrRBMMD-7kgj06iF0NdAOi-KRj6-mrmlA;%20unionid=onCs1uNNpjMXeYIHAhacGypamEBk;%20logintype=0";
Map<String,String> headerMap = HeadGet.getQQkbUserHeaderMap(cookie); Map<String,String> headerMap = HeadGet.getQQkbUserHeaderMap(cookie);
Map<String,Object> paramMap = HeadGet.getQQkbUserParamMap(name); Map<String,Object> paramMap = HeadGet.getQQkbUserParamMap(name);
String result = HttpClient.executeHttpRequestPost(url, null, headerMap, paramMap); String result = HttpClient.executeHttpRequestPost(url, ProxyHolder.NAT_HEAVY_PROXY, headerMap, paramMap);
JSONObject json = JSONObject.parseObject(result); JSONObject json = JSONObject.parseObject(result);
JSONObject json1 = json.getJSONObject("new_list"); JSONObject json1 = json.getJSONObject("new_list");
JSONObject json2 = json1.getJSONArray("data").getJSONObject(0); JSONObject json2 = json1.getJSONArray("data").getJSONObject(0);
......
...@@ -24,7 +24,7 @@ public class QQNews { ...@@ -24,7 +24,7 @@ public class QQNews {
private static final Logger logger = LoggerFactory.getLogger(QQNews.class); private static final Logger logger = LoggerFactory.getLogger(QQNews.class);
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
/** /**
* . * .
......
...@@ -17,7 +17,7 @@ import com.zhiwei.tools.tools.ZhiWeiTools; ...@@ -17,7 +17,7 @@ import com.zhiwei.tools.tools.ZhiWeiTools;
public class QicheHome { public class QicheHome {
private static Logger logger = LoggerFactory.getLogger(QicheHome.class); private static Logger logger = LoggerFactory.getLogger(QicheHome.class);
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
private static QicheHomeKwyWordAnalysis qicheHomeKwyWordAnalysis = new QicheHomeKwyWordAnalysis(); private static QicheHomeKwyWordAnalysis qicheHomeKwyWordAnalysis = new QicheHomeKwyWordAnalysis();
......
...@@ -24,7 +24,7 @@ public class SinaKeji { ...@@ -24,7 +24,7 @@ public class SinaKeji {
private static Logger logger = LoggerFactory.getLogger(SinaKeji.class); private static Logger logger = LoggerFactory.getLogger(SinaKeji.class);
private static SinaKejiCommentAnalysis sinaKejiCommentAnalysis = new SinaKejiCommentAnalysis(); private static SinaKejiCommentAnalysis sinaKejiCommentAnalysis = new SinaKejiCommentAnalysis();
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
/** /**
* https://tech.sina.com.cn/it/2018-07-17/doc-ihfkffam0632649.shtml * https://tech.sina.com.cn/it/2018-07-17/doc-ihfkffam0632649.shtml
......
...@@ -21,7 +21,7 @@ public class SinaTousu { ...@@ -21,7 +21,7 @@ public class SinaTousu {
private static final Logger logger = LoggerFactory.getLogger(SinaTousu.class); private static final Logger logger = LoggerFactory.getLogger(SinaTousu.class);
private static SinaTousuAnalysis sinaTousuAnalysis = new SinaTousuAnalysis(); private static SinaTousuAnalysis sinaTousuAnalysis = new SinaTousuAnalysis();
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
public static List<Map<String,Object>> getSinaTousuData(String word,ProxyHolder proxy,String time) { public static List<Map<String,Object>> getSinaTousuData(String word,ProxyHolder proxy,String time) {
List<Map<String,Object>> bodyList = new ArrayList<>(); List<Map<String,Object>> bodyList = new ArrayList<>();
......
...@@ -2,14 +2,11 @@ package com.zhiwei.parse; ...@@ -2,14 +2,11 @@ package com.zhiwei.parse;
import static java.util.Objects.nonNull; import static java.util.Objects.nonNull;
import java.net.Proxy;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Date; import java.util.Date;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
...@@ -73,36 +70,28 @@ public class Souhu { ...@@ -73,36 +70,28 @@ public class Souhu {
* @param isCulling 是否采集精选 * @param isCulling 是否采集精选
* @return * @return
*/ */
public static List<Map<String,Object>> getSouHuAccountData(String xpt,String startTime,boolean isCulling,Proxy proxy) { public static List<Map<String,Object>> getSouHuAccountData(String id,String name,String startTime,boolean isCulling,ProxyHolder proxy) {
int i = 1; int i = 1;
String name = getName(xpt,proxy); ZhiWeiTools.sleep(200);
ZhiWeiTools.sleep(2000); List<Map<String,Object>> dataList = new ArrayList<>();
List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
Map<String,String> headerMap = HeadGet.getSouhuAccountHeaderMap(null);
boolean f = true; boolean f = true;
int j = 0; int j = 0;
while(f) { while(f) {
try { try {
String url = "http://mp.sohu.com/apiV2/profile/newsListAjax?xpt="+xpt+"&pageNumber="+i+"&pageSize=10"; String url = "http://v2.sohu.com/author-page-api/author-articles/pc/"+id+"?pNo="+i;
String result = null;
if(isCulling) { if(isCulling) {
url = url + "&categoryId=-1"; url = url + "&columnId=-1";
}
try {
result = HttpClient.executeHttpRequestGet(url,proxy,headerMap);
} catch (Exception e) {
e.printStackTrace();
} }
result = result.replaceAll("\\\\", ""); String result = HttpClient.executeHttpRequestGet(url,proxy,null);
result = result.substring(1, result.length()-1);
JSONObject json = JSONObject.parseObject(result); JSONObject json = JSONObject.parseObject(result);
JSONArray jsonArray = json.getJSONArray("data"); JSONArray jsonArray = json.getJSONObject("data").getJSONArray("pcArticleVOS");
List<Map<String,Object>> dataList1 = souhuAccountAnalysis.analysisData(jsonArray,name); List<Map<String,Object>> dataList1 = souhuAccountAnalysis.analysisData(jsonArray,name);
if(jsonArray.size() < 1) { if(jsonArray.isEmpty()) {
break; break;
} }
if(startTime == null) { if(startTime == null) {
j = 0;
dataList.addAll(dataList1); dataList.addAll(dataList1);
} }
//判断时间 //判断时间
...@@ -113,40 +102,26 @@ public class Souhu { ...@@ -113,40 +102,26 @@ public class Souhu {
f = false; f = false;
break; break;
} }
j = 0;
dataList.add(map); dataList.add(map);
} }
} }
logger.info("=============获取到的数据数目{}",dataList.size()); logger.info("=============获取到的数据数目{}",dataList.size());
i++; i++;
ZhiWeiTools.sleep(3000); ZhiWeiTools.sleep(300);
} catch (Exception e) { } catch (Exception e) {
ZhiWeiTools.sleep(3000); ZhiWeiTools.sleep(300);
logger.error("出错了",e.getMessage()); logger.error("出错了 {}",e);
j++; j++;
if(j > 5) { if(j > 5) {
f = false; f = false;
} }
continue;
} }
} }
return dataList; return dataList;
} }
private static String getName(String xpt,Proxy proxy) {
Map<String,String> headerMap = HeadGet.getSouhuAccountHeaderMap(null);
try {
String result = HttpClient.executeHttpRequestGet("http://mp.sohu.com/profile?xpt="+xpt, proxy, headerMap);
Document doc = Jsoup.parse(result);
String name = doc.select("p#ff").text();
System.out.println(name);
return name;
} catch (Exception e) {
return null;
}
}
/** /**
* *
* @Description 传入搜狐文章链接和cookie 可获取此文章所有评论 * @Description 传入搜狐文章链接和cookie 可获取此文章所有评论
...@@ -161,7 +136,7 @@ public class Souhu { ...@@ -161,7 +136,7 @@ public class Souhu {
try { try {
while(true) { while(true) {
String newurl = souhuCommentAnalysis.getSouhuURL(url,proxy) + "&page_no=" + j; String newurl = souhuCommentAnalysis.getSouhuURL(url,proxy) + "&page_no=" + j;
String result = HttpClient.executeHttpRequestGet(newurl,ProxyFactory.getNatProxy(),headerMap); String result = HttpClient.executeHttpRequestGet(newurl,ProxyHolder.NAT_HEAVY_PROXY,headerMap);
System.out.println(newurl); System.out.println(newurl);
JSONObject json = JSONObject.parseObject(result); JSONObject json = JSONObject.parseObject(result);
JSONArray jsonArry = json.getJSONObject("jsonObject").getJSONArray("comments"); JSONArray jsonArry = json.getJSONObject("jsonObject").getJSONArray("comments");
......
...@@ -19,6 +19,7 @@ import com.zhiwei.crawler.utils.RequestUtils; ...@@ -19,6 +19,7 @@ import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.httpclient.HeadGet; import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient; import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.parse.analysis.TXNewsByWordAnalysis; import com.zhiwei.parse.analysis.TXNewsByWordAnalysis;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response; import okhttp3.Response;
...@@ -28,7 +29,7 @@ public class TXNews { ...@@ -28,7 +29,7 @@ public class TXNews {
private static Logger logger = LoggerFactory.getLogger(TXNews.class); private static Logger logger = LoggerFactory.getLogger(TXNews.class);
private static TXNewsByWordAnalysis txNewsByWordAnalysis = new TXNewsByWordAnalysis(); private static TXNewsByWordAnalysis txNewsByWordAnalysis = new TXNewsByWordAnalysis();
public static boolean txNewshasMoreData = true; public static boolean txNewshasMoreData = true;
public static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
public static List<Map<String,Object>> getData(String word,String devid,Proxy proxy) { public static List<Map<String,Object>> getData(String word,String devid,Proxy proxy) {
List<Map<String,Object>> dataList = new ArrayList<>(); List<Map<String,Object>> dataList = new ArrayList<>();
...@@ -120,5 +121,47 @@ public class TXNews { ...@@ -120,5 +121,47 @@ public class TXNews {
return -1; return -1;
} }
/**
 * Collects the article history of one Tencent News media account, page by page.
 *
 * <p>Pages are requested starting at page 0. Each article's {@code timestamp} is
 * multiplied by 1000 and formatted as {@code yyyy-MM-dd HH:mm:ss}; when a time bound
 * is supplied, collection stops at the first article whose formatted time is
 * lexicographically less than or equal to {@code endTime}.
 *
 * <p>A failed request or parse is retried on the same page; after more than three
 * failures in total the method gives up and returns what it has collected so far.
 *
 * @param mid     media account id placed in the {@code mid} query parameter
 * @param endTime lower time bound in {@code yyyy-MM-dd HH:mm:ss} form; {@code null} or a
 *                string shorter than two characters disables time filtering
 * @param proxy   proxy handed to the HTTP call
 * @return collected articles, each a map with the keys {@code title}, {@code content},
 *         {@code time}, {@code source} and {@code url}; never {@code null}
 */
public static List<Map<String,Object>> getTxNewsHistory(String mid,String endTime,ProxyHolder proxy) {
    List<Map<String,Object>> dataList = new ArrayList<>();
    boolean filterByTime = endTime != null && endTime.length() > 1;
    int page = 0;
    int errorNum = 0;
    while(true) {
        String url = "https://pacaio.match.qq.com/om/mediaArticles?mid="+mid+"&num=30&page="+page;
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)){
            String result = response.body().string();
            JSONObject json = JSONObject.parseObject(result);
            JSONArray jsonArray = json.getJSONArray("data");
            // Buffer the rows of this page and publish them only once the whole page has
            // been parsed: a failure half-way through makes the loop retry the same page,
            // and appending directly to dataList would duplicate the rows parsed so far.
            List<Map<String,Object>> pageList = new ArrayList<>();
            boolean reachedEndTime = false;
            for(int i = 0,j = jsonArray.size();i < j;i++) {
                JSONObject data = jsonArray.getJSONObject(i);
                String time = TimeParse.dateFormartString(new Date(data.getLong("timestamp")*1000L), "yyyy-MM-dd HH:mm:ss");
                if(filterByTime && time.compareTo(endTime) <= 0) {
                    logger.info("超时时间采集范围 跳出采集");
                    reachedEndTime = true;
                    break;
                }
                Map<String,Object> map = new HashMap<>();
                map.put("title", data.getString("title"));
                map.put("content", data.getString("abstract"));
                map.put("time", time);
                map.put("source", data.getString("source"));
                map.put("url", data.getString("vurl"));
                pageList.add(map);
            }
            dataList.addAll(pageList);
            if(reachedEndTime) {
                return dataList;
            }
            logger.info("mid = {} , cralwer count = {}",mid,dataList.size() );
            page++;
            // NOTE(review): the request asks for 30 items but paging stops only when fewer
            // than 10 come back - confirm this threshold is intentional.
            if(jsonArray.size() < 10) {
                break;
            }
        } catch (Exception e) {
            // Pass the exception as the trailing argument without a placeholder so that
            // SLF4J prints the stack trace, and report it at error level.
            logger.error("采集数据出错", e);
            errorNum++;
            if(errorNum > 3) {
                break;
            }
        }
    }
    return dataList;
}
} }
...@@ -21,7 +21,7 @@ public class TechTx { ...@@ -21,7 +21,7 @@ public class TechTx {
private static Logger logger = LoggerFactory.getLogger(TechTx.class); private static Logger logger = LoggerFactory.getLogger(TechTx.class);
private static TechTxCommentAnalysis techTxCommentAnalysis = new TechTxCommentAnalysis(); private static TechTxCommentAnalysis techTxCommentAnalysis = new TechTxCommentAnalysis();
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
public static List<Map<String,Object>> getTechTxComment(String url,ProxyHolder proxy) { public static List<Map<String,Object>> getTechTxComment(String url,ProxyHolder proxy) {
......
...@@ -2,6 +2,7 @@ package com.zhiwei.parse; ...@@ -2,6 +2,7 @@ package com.zhiwei.parse;
import java.net.Proxy; import java.net.Proxy;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
...@@ -9,18 +10,24 @@ import org.jsoup.Jsoup; ...@@ -9,18 +10,24 @@ import org.jsoup.Jsoup;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.httpclient.HeadGet; import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient; import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.parse.analysis.WangyiCommentAnalysis; import com.zhiwei.parse.analysis.WangyiCommentAnalysis;
import com.zhiwei.parse.analysis.WangyiHistoryAnalysis; import com.zhiwei.parse.analysis.WangyiHistoryAnalysis;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response;
public class Wangyi { public class Wangyi {
private static Logger logger = LoggerFactory.getLogger(Wangyi.class); private static Logger logger = LoggerFactory.getLogger(Wangyi.class);
private static WangyiCommentAnalysis wangyiCommentAnalysis = new WangyiCommentAnalysis(); private static WangyiCommentAnalysis wangyiCommentAnalysis = new WangyiCommentAnalysis();
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
private static WangyiHistoryAnalysis wangyiHistoryAnalysis = new WangyiHistoryAnalysis(); private static WangyiHistoryAnalysis wangyiHistoryAnalysis = new WangyiHistoryAnalysis();
/** /**
...@@ -74,24 +81,31 @@ public class Wangyi { ...@@ -74,24 +81,31 @@ public class Wangyi {
} }
} }
/**
*
* @Description 网易网页版数据
* @param url
* @param proxy
* @param endTime
* @return
*/
public static List<Map<String,Object>> getHistoryData(String url,Proxy proxy,String endTime) { public static List<Map<String,Object>> getHistoryData(String url,Proxy proxy,String endTime) {
Map<String,String> headerMap = HeadGet.getWangyiHistoryHeaderMap(null); Map<String,String> headerMap = HeadGet.getWangyiHistoryHeaderMap(null);
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>(); List<Map<String,Object>> bodyList = new ArrayList<>();
String result = HttpClient.executeHttpRequestGet(url,proxy, headerMap); String result = HttpClient.executeHttpRequestGet(url,proxy, headerMap);
String wemediaid = result.split("data-wemediaid=\"")[1].split("\"")[0]; String wemediaid = result.split("data-wemediaid=\"")[1].split("\"")[0];
String source = Jsoup.parse(result).select("body > div.colum_wrap.fl > div > div.colum_des > div.normal > div.colum_info > h4").text(); String source = Jsoup.parse(result).select("body > div.colum_wrap.fl > div > div.colum_des > div.normal > div.colum_info > h4").text();
boolean f = true; boolean f = true;
url = "http://dy.163.com/v2/article/list.do?wemediaId="+wemediaid+"&size=20&pageNo="; url = "http://dy.163.com/v2/article/list.do?wemediaId="+wemediaid+"&size=10&pageNo=";
int i = 1; int i = 1;
ZhiWeiTools.sleep(1000); ZhiWeiTools.sleep(1000);
int j = 0; int j = 0;
while(f) { while(f) {
try { try {
result = "";
result = HttpClient.executeHttpRequestGet(url+i,proxy, headerMap); result = HttpClient.executeHttpRequestGet(url+i,proxy, headerMap);
JSONObject json = JSONObject.parseObject(result); JSONObject json = JSONObject.parseObject(result);
List<Map<String,Object>> dataList = wangyiHistoryAnalysis.getData(result,proxy, endTime,source); List<Map<String,Object>> dataList = wangyiHistoryAnalysis.getData(result,proxy, endTime,source);
if(dataList == null || dataList.size() < 1) { if(dataList == null || dataList.isEmpty()) {
break; break;
} }
bodyList.addAll(dataList); bodyList.addAll(dataList);
...@@ -109,10 +123,58 @@ public class Wangyi { ...@@ -109,10 +123,58 @@ public class Wangyi {
if(j > 5) { if(j > 5) {
f = false; f = false;
} }
continue;
} }
} }
return bodyList; return bodyList;
} }
/**
 * Collects the article history of one NetEase (Wangyi) client subscription account.
 *
 * <p>The list endpoint is paged by offset: the URL carries {@code <offset>-20} and the
 * offset grows by 20 after every successfully parsed page. When a time bound is
 * supplied, collection stops at the first entry whose {@code ptime} is
 * lexicographically less than or equal to {@code endTime}.
 *
 * <p>A failed request or parse is retried on the same offset; after more than three
 * failures in total the method gives up and returns what it has collected so far.
 *
 * @param id      subscription account id inserted into the request path
 * @param proxy   proxy handed to the HTTP call
 * @param endTime lower time bound, compared as a string against {@code ptime};
 *                {@code null} or a string shorter than two characters disables filtering
 * @return collected articles, each a map with the keys {@code title}, {@code content},
 *         {@code time}, {@code source} and {@code url}; never {@code null}
 */
public static List<Map<String,Object>> getWangyiClientHistory(String id,ProxyHolder proxy,String endTime) {
    List<Map<String,Object>> dataList = new ArrayList<>();
    boolean filterByTime = endTime != null && endTime.length() > 1;
    int page = 0;
    int errorNum = 0;
    while(true) {
        String url = "https://c.m.163.com/nc/subscribe/list/"+id+"/all/"+page+"-20.html";
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)){
            String result = response.body().string();
            JSONObject json = JSONObject.parseObject(result);
            JSONArray jsonArray = json.getJSONArray("tab_list");
            // Buffer the rows of this page and publish them only once the whole page has
            // been parsed: a failure half-way through makes the loop retry the same offset,
            // and appending directly to dataList would duplicate the rows parsed so far.
            List<Map<String,Object>> pageList = new ArrayList<>();
            boolean reachedEndTime = false;
            for(int i = 0,j = jsonArray.size();i < j;i++) {
                JSONObject data = jsonArray.getJSONObject(i);
                String time = data.getString("ptime");
                if(filterByTime && time.compareTo(endTime) <= 0) {
                    logger.info("超时时间采集范围 跳出采集");
                    reachedEndTime = true;
                    break;
                }
                Map<String,Object> map = new HashMap<>();
                map.put("title", data.getString("title"));
                map.put("content", data.getString("aheadBody"));
                map.put("time", time);
                map.put("source", data.getString("source"));
                // Video entries are addressed by skipID, everything else by postid.
                if("video".equals(data.getString("skipType"))) {
                    map.put("url", "https://c.m.163.com/news/v/" + data.getString("skipID") + ".html");
                }else {
                    map.put("url", "https://c.m.163.com/news/a/" + data.getString("postid") + ".html");
                }
                pageList.add(map);
            }
            dataList.addAll(pageList);
            if(reachedEndTime) {
                return dataList;
            }
            logger.info("id = {} , cralwer count = {}",id,dataList.size() );
            page += 20;
            // NOTE(review): pages are requested 20 at a time but paging stops only when
            // fewer than 10 come back - confirm this threshold is intentional.
            if(jsonArray.size() < 10) {
                break;
            }
        } catch (Exception e) {
            // Pass the exception as the trailing argument without a placeholder so that
            // SLF4J prints the stack trace, and report it at error level.
            logger.error("采集数据出错", e);
            errorNum++;
            if(errorNum > 3) {
                break;
            }
        }
    }
    return dataList;
}
} }
...@@ -26,12 +26,12 @@ import okhttp3.Response; ...@@ -26,12 +26,12 @@ import okhttp3.Response;
public class Xueqiu { public class Xueqiu {
private static Logger logger = LoggerFactory.getLogger(Xueqiu.class); private static Logger logger = LoggerFactory.getLogger(Xueqiu.class);
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
private static XueqiuKeyWordAnalysis xueqiuKeyWordAnalysis = new XueqiuKeyWordAnalysis(); private static XueqiuKeyWordAnalysis xueqiuKeyWordAnalysis = new XueqiuKeyWordAnalysis();
/** /**
* *
* @Description 关键词采集历史文章 * @Description 关键词采集文章
* @param word * @param word
* @param endTime * @param endTime
* @param proxy * @param proxy
...@@ -53,13 +53,16 @@ public class Xueqiu { ...@@ -53,13 +53,16 @@ public class Xueqiu {
Request request = RequestUtils.wrapGet(url, headers); Request request = RequestUtils.wrapGet(url, headers);
String result = httpBoot.syncCall(request, proxy).body().string(); String result = httpBoot.syncCall(request, proxy).body().string();
List<Map<String,Object>> list = xueqiuKeyWordAnalysis.getData(result, endTime); List<Map<String,Object>> list = xueqiuKeyWordAnalysis.getData(result, endTime);
ZhiWeiTools.sleep(3000); if(list.isEmpty()) {
if(list.size() < 1) {
i++; i++;
}else { }else {
int count = JSONObject.parseObject(result).getIntValue("maxPage");
bodyList.addAll(list); bodyList.addAll(list);
logger.info("采集到第{} 页 , 一共采集到 {} 数据",page,bodyList.size()); logger.info("采集到第{} 页 , 一共采集到 {} 数据",page,bodyList.size());
page++; page++;
if(count < page) {
break;
}
} }
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.printStackTrace();
...@@ -98,16 +101,17 @@ public class Xueqiu { ...@@ -98,16 +101,17 @@ public class Xueqiu {
/** /**
* *
* @Description (TODO这里用一句话描述这个方法的作用) * @Description 雪球历史文章采集
* @return * @return
*/ */
public List<Map<String,Object>> getXueqiuAccountData(String userId,String cookie,Proxy proxy) { public static List<Map<String,Object>> getXueqiuAccountData(String userId,String cookie,Proxy proxy) {
Map<String,Object> headers = new HashMap<>(); Map<String,Object> headers = new HashMap<>();
headers.put("cookie", cookie); headers.put("cookie", cookie);
List<Map<String,Object>> bodyList = new ArrayList<>(); List<Map<String,Object>> bodyList = new ArrayList<>();
int page = 1;
int errorCount = 1;
while(true) { while(true) {
int page = 1; String url = "https://xueqiu.com/v4/statuses/user_timeline.json?page=" + page + "&user_id="+userId+"&type=0";
String url = "https://xueqiu.com/v4/statuses/user_timeline.json?page=" + page + "&user_id=6687544095&type=0";
try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, headers), proxy)){ try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, headers), proxy)){
String result = response.body().string(); String result = response.body().string();
JSONObject json = JSONObject.parseObject(result); JSONObject json = JSONObject.parseObject(result);
...@@ -121,26 +125,30 @@ public class Xueqiu { ...@@ -121,26 +125,30 @@ public class Xueqiu {
Date date = TimeParse.stringFormartDate(timeBefore); Date date = TimeParse.stringFormartDate(timeBefore);
Map<String, Object> map = new HashMap<>(); Map<String, Object> map = new HashMap<>();
map.put("name", ob.getJSONObject("user").getString("screen_name"));//statuses user screen_name map.put("source", ob.getJSONObject("user").getString("screen_name"));//statuses user screen_name
map.put("time", date);//statuses timeBefore map.put("time", date);//statuses timeBefore
map.put("source", ob.getString("source"));//statuses source
map.put("content", ob.getString("description").replaceAll("<.*?>", ""));//statuses description map.put("content", ob.getString("description").replaceAll("<.*?>", ""));//statuses description
map.put("title", ob.getString("rawTitle"));
map.put("repostCount", ob.getString("retweet_count"));//statuses retweet_count map.put("repostCount", ob.getString("retweet_count"));//statuses retweet_count
map.put("commentCount", ob.getString("reply_count"));//statuses reply_count map.put("commentCount", ob.getString("reply_count"));//statuses reply_count
map.put("likeCount", ob.getString("like_count"));//statuses like_count map.put("likeCount", ob.getString("like_count"));//statuses like_count
map.put("url", "https://xueqiu.coms" + ob.getString("target")); map.put("url", "https://xueqiu.com" + ob.getString("target"));
bodyList.add(map); bodyList.add(map);
} }
int maxPage = json.getInteger("maxPage"); int maxPage = json.getInteger("maxPage");
page++; page++;
logger.info("userId = {} , crawler count = {} ,page = {} , maxPage = {}",userId,bodyList.size(),page,maxPage);
if(page > maxPage) { if(page > maxPage) {
break; break;
} }
} catch (Exception e) { } catch (Exception e) {
logger.error("采集解析出错 {}",e); logger.error("采集解析出错 {}",e);
break; errorCount++;
if(errorCount > 3) {
break;
}
} }
ZhiWeiTools.sleep(2000);
} }
return bodyList; return bodyList;
} }
......
package com.zhiwei.parse;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
/**
 * Crawler for CCTV (央视网) video listings.
 *
 * <p>Only the response parser is implemented so far; {@link #getData()} is a
 * placeholder that returns an empty list.
 *
 * @author byte-zbs
 * @version 1.0.0 (created 2019-07-04)
 */
public class Yangshi {
    private static final Logger logger = LoggerFactory.getLogger(Yangshi.class);
    private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();

    /**
     * Placeholder entry point; no request is issued yet.
     *
     * @return an empty, immutable list
     */
    public static List<Map<String,Object>> getData() {
        return Collections.emptyList();
    }

    /**
     * Converts a JSON response into one map per entry of its {@code list} array.
     *
     * <p>Each map holds the video title, link, channel, upload time and duration
     * (with {@code " s"} appended) under Chinese column names. A payload that cannot
     * be parsed, or that has no {@code list} array, yields the rows gathered so far
     * (possibly none) instead of throwing.
     *
     * @param result raw JSON text of the response
     * @return parsed rows; never {@code null}
     */
    private static List<Map<String,Object>> analysisData(String result) {
        List<Map<String,Object>> bodyList = new ArrayList<>();
        try {
            // Parsing happens inside the try block so that a malformed payload is
            // reported through the logger like any other per-row failure.
            JSONObject root = JSONObject.parseObject(result);
            JSONArray jsonArray = root == null ? null : root.getJSONArray("list");
            if (jsonArray == null) {
                logger.warn("Yangshi response contains no \"list\" array");
                return bodyList;
            }
            for (int i = 0; i < jsonArray.size(); i++) {
                JSONObject ob = jsonArray.getJSONObject(i);
                Map<String, Object> map = new HashMap<>();
                map.put("视频标题", ob.getString("all_title"));      // video title
                map.put("链接", ob.getString("urllink"));            // link
                map.put("频道来源", ob.getString("channel"));        // source channel
                map.put("时间", ob.getString("uploadtime"));         // upload time
                map.put("时长", ob.getString("durations")+" s");     // duration
                logger.debug("{}", map);
                bodyList.add(map);
            }
        } catch (RuntimeException e) {
            logger.error("Failed to parse Yangshi response", e);
        }
        return bodyList;
    }
}
...@@ -23,7 +23,7 @@ import okhttp3.Response; ...@@ -23,7 +23,7 @@ import okhttp3.Response;
public class Yiche { public class Yiche {
private static final Logger logger = LoggerFactory.getLogger(Yiche.class); private static final Logger logger = LoggerFactory.getLogger(Yiche.class);
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
/** /**
* *
......
...@@ -33,7 +33,7 @@ public class Yidianzixun { ...@@ -33,7 +33,7 @@ public class Yidianzixun {
private static YidianzixunCommentAnalysis yidianzixunCommentAnalysis = new YidianzixunCommentAnalysis(); private static YidianzixunCommentAnalysis yidianzixunCommentAnalysis = new YidianzixunCommentAnalysis();
private static YidianzixunByWordAnalysis yidianzixunByWordAnalysis = new YidianzixunByWordAnalysis(); private static YidianzixunByWordAnalysis yidianzixunByWordAnalysis = new YidianzixunByWordAnalysis();
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
/** /**
* *
...@@ -42,19 +42,19 @@ public class Yidianzixun { ...@@ -42,19 +42,19 @@ public class Yidianzixun {
* @param startTime * @param startTime
* @return * @return
*/ */
public static List<Map<String,Object>> getYidianzixunAccountData(String channelid,String startTime,Proxy proxy,String cookie) { public static List<Map<String,Object>> getYidianzixunAccountData(String channelid,String startTime,ProxyHolder proxy,String cookie) {
Map<String,String> headerMap = HeadGet.getYidianzixunAccountHeaderMap(cookie,"http://www.yidianzixun.com/channel/"+channelid); Map<String,String> headerMap = HeadGet.getYidianzixunAccountHeaderMap(cookie,"http://www.yidianzixun.com/channel/"+channelid);
List<Map<String,Object>> dataList = new ArrayList<>(); List<Map<String,Object>> dataList = new ArrayList<>();
int j = 0; int j = 0;
boolean f = true; boolean f = true;
try { try {
while(f) { while(f) {
String url = "http://www.yidianzixun.com/"+getSpt(channelid, j, j+10); String url = "http://www.yidianzixun.com"+getSpt(channelid, j, j+10);
System.out.println(url);
String result = HttpClient.executeHttpRequestGet(url,proxy, headerMap); String result = HttpClient.executeHttpRequestGet(url,proxy, headerMap);
System.out.println(result);
JSONObject json = JSONObject.parseObject(result); JSONObject json = JSONObject.parseObject(result);
JSONArray jsonArry = json.getJSONArray("result"); JSONArray jsonArry = json.getJSONArray("result");
if(jsonArry.size() == 0) { if(jsonArry.isEmpty()) {
break; break;
} }
for(int i = 0;i < jsonArry.size();i++) { for(int i = 0;i < jsonArry.size();i++) {
...@@ -70,13 +70,12 @@ public class Yidianzixun { ...@@ -70,13 +70,12 @@ public class Yidianzixun {
dataList.add(map); dataList.add(map);
} }
} }
System.out.println("================================" + dataList.size()); logger.info("channelid = {} , crawler size = {}",channelid,dataList.size());
ZhiWeiTools.sleep(3000); ZhiWeiTools.sleep(100);
j = dataList.size(); j = dataList.size();
} }
} catch (Exception e) { } catch (Exception e) {
logger.error("数据获取出错",e.getMessage()); logger.error("数据获取出错 {}",e);
e.printStackTrace();
} }
return dataList; return dataList;
} }
......
...@@ -24,7 +24,7 @@ import okhttp3.Response; ...@@ -24,7 +24,7 @@ import okhttp3.Response;
public class Youku { public class Youku {
private static final Logger logger = LoggerFactory.getLogger(Youku.class); private static final Logger logger = LoggerFactory.getLogger(Youku.class);
private static HttpBoot httpBoot = new HttpBoot(false,2); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
public static List<Map<String,Object>> getDataList(String word) { public static List<Map<String,Object>> getDataList(String word) {
String aaid = "9cae49f0e031664b00d8f9c108e586ab"; String aaid = "9cae49f0e031664b00d8f9c108e586ab";
...@@ -33,7 +33,7 @@ public class Youku { ...@@ -33,7 +33,7 @@ public class Youku {
String url = "https://so.youku.com/search_video/q_"+URLCodeUtil.getURLEncode(word, "UTF-8")+"?spm=a2h0k.11417342.filter.dnew&orderfield=createtime&aaid="+aaid+"&pg="+i; String url = "https://so.youku.com/search_video/q_"+URLCodeUtil.getURLEncode(word, "UTF-8")+"?spm=a2h0k.11417342.filter.dnew&orderfield=createtime&aaid="+aaid+"&pg="+i;
System.out.println(url); System.out.println(url);
try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyFactory.getNatProxy())){ try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyHolder.NAT_HEAVY_PROXY)){
String result = response.body().string(); String result = response.body().string();
String jsondata = result.split("bigview.view\\(")[1].split("\\)\\</script\\>")[0]; String jsondata = result.split("bigview.view\\(")[1].split("\\)\\</script\\>")[0];
JSONObject json = JSONObject.parseObject(jsondata); JSONObject json = JSONObject.parseObject(jsondata);
...@@ -45,7 +45,7 @@ public class Youku { ...@@ -45,7 +45,7 @@ public class Youku {
String title = element.select("div.mod-main > div.mod-header > h2 > a").text(); String title = element.select("div.mod-main > div.mod-header > h2 > a").text();
String surl = element.select("div.mod-main > div.mod-header > h2 > a").attr("href"); String surl = element.select("div.mod-main > div.mod-header > h2 > a").attr("href");
String time = element.select("div.mod-main > div.mod-info > p").text(); String time = element.select("div.mod-main > div.mod-info > p").text();
if(time.contains("上传时间:")) { if(time.contains("上传时间:") && surl.contains("v.youku.com")) {
map.put("title", title); map.put("title", title);
map.put("url", "https:"+surl); map.put("url", "https:"+surl);
map.put("time", time.replaceAll("上传时间:", "").split(" ")[0]); map.put("time", time.replaceAll("上传时间:", "").split(" ")[0]);
......
...@@ -24,7 +24,7 @@ import okhttp3.Response; ...@@ -24,7 +24,7 @@ import okhttp3.Response;
public class BaijiaAccountAnalysis { public class BaijiaAccountAnalysis {
private static Logger logger = LoggerFactory.getLogger(BaijiaAccountAnalysis.class); private static Logger logger = LoggerFactory.getLogger(BaijiaAccountAnalysis.class);
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
public Map<String,Object> getBaijiaAccount2Data(JSONObject data) { public Map<String,Object> getBaijiaAccount2Data(JSONObject data) {
Map<String,Object> map = new HashMap<String,Object>(); Map<String,Object> map = new HashMap<String,Object>();
...@@ -57,12 +57,13 @@ public class BaijiaAccountAnalysis { ...@@ -57,12 +57,13 @@ public class BaijiaAccountAnalysis {
boolean more = false; boolean more = false;
try { try {
JSONObject json = JSONObject.parseObject(result); JSONObject json = JSONObject.parseObject(result);
JSONArray jsonArry = json.getJSONObject("data").getJSONArray("items"); JSONArray jsonArry = json.getJSONObject("data").getJSONArray("list");
if(json.getJSONObject("data") != null && json.getJSONObject("data").getBoolean("has_more") != null) { if(json.getJSONObject("data").getBoolean("has_more") != null &&
if(json.getJSONObject("data").getBoolean("has_more")) { json.getJSONObject("data").getBoolean("has_more") ) {
more = true; more = true;
} rmap.put("ctime", json.getJSONObject("data").getString("ctime"));
} }
// String name = json.getJSONObject("data").getJSONObject("author").getString("display_name");
for(int i = 0;i < jsonArry.size();i++) { for(int i = 0;i < jsonArry.size();i++) {
Map<String,Object> map = new HashMap<>(); Map<String,Object> map = new HashMap<>();
JSONObject data = jsonArry.getJSONObject(i); JSONObject data = jsonArry.getJSONObject(i);
...@@ -77,10 +78,7 @@ public class BaijiaAccountAnalysis { ...@@ -77,10 +78,7 @@ public class BaijiaAccountAnalysis {
} }
} }
map.put("title", data.getString("title")); map.put("title", data.getString("title"));
String url = data.getString("url"); String url = "http://baijiahao.baidu.com/s?id=" + id;
if(url == null) {
url = "https://baijia.baidu.com/s?old_id=" + id;
}
map.put("content", ZhiWeiTools.delHTMLTag(getContent3(data))); map.put("content", ZhiWeiTools.delHTMLTag(getContent3(data)));
map.put("read_amount", data.getString("read_amount")==null?0:data.getString("read_amount")); map.put("read_amount", data.getString("read_amount")==null?0:data.getString("read_amount"));
map.put("app_id", data.getString("app_id")); map.put("app_id", data.getString("app_id"));
......
...@@ -5,6 +5,7 @@ import java.util.Collections; ...@@ -5,6 +5,7 @@ import java.util.Collections;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Objects;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
...@@ -13,7 +14,7 @@ import org.jsoup.select.Elements; ...@@ -13,7 +14,7 @@ import org.jsoup.select.Elements;
public class BilibilikeyWordAnalysis { public class BilibilikeyWordAnalysis {
public static Map<String,Object> getData(String result,String word) { public static Map<String,Object> getData(String result,String word,String endTime) {
try { try {
Document doc = Jsoup.parse(result); Document doc = Jsoup.parse(result);
boolean more = false; boolean more = false;
...@@ -28,10 +29,9 @@ public class BilibilikeyWordAnalysis { ...@@ -28,10 +29,9 @@ public class BilibilikeyWordAnalysis {
String source = null; String source = null;
String submitcount = null; String submitcount = null;
Elements elements = doc.select("ul.video-contain.clearfix").select("li"); Elements elements = doc.select("ul.video-contain.clearfix").select("li");
System.out.println(elements.size() + " --- " + more); List<Map<String,Object>> dataList = new ArrayList<>();
List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
for(Element element : elements) { for(Element element : elements) {
Map<String,Object> map = new HashMap<String,Object>(); Map<String,Object> map = new HashMap<>();
title = element.select("a").attr("title"); title = element.select("a").attr("title");
url = element.select("a").attr("href"); url = element.select("a").attr("href");
playcount = element.select("div.tags").select("span.watch-num").text(); playcount = element.select("div.tags").select("span.watch-num").text();
...@@ -45,6 +45,9 @@ public class BilibilikeyWordAnalysis { ...@@ -45,6 +45,9 @@ public class BilibilikeyWordAnalysis {
map.put("source", source); map.put("source", source);
map.put("submitcount", submitcount); map.put("submitcount", submitcount);
map.put("word", word); map.put("word", word);
if(Objects.nonNull(endTime) && endTime.compareTo(time) > -1) {
more = false;
}
dataList.add(map); dataList.add(map);
} }
Map<String,Object> rmap = new HashMap<>(); Map<String,Object> rmap = new HashMap<>();
......
...@@ -49,7 +49,7 @@ public class DayuAccountAnalysis { ...@@ -49,7 +49,7 @@ public class DayuAccountAnalysis {
* @return * @return
*/ */
private Map<String,Object> getOneData(JSONObject data,String name,String startTime) { private Map<String,Object> getOneData(JSONObject data,String name,String startTime) {
Map<String,Object> map = new HashMap<String, Object>(); Map<String,Object> map = new HashMap<>();
try { try {
String time = data.getString("published_at").replace("T", " ").split("\\.")[0]; String time = data.getString("published_at").replace("T", " ").split("\\.")[0];
if(startTime != null && startTime.length() > 1) { if(startTime != null && startTime.length() > 1) {
......
...@@ -24,7 +24,7 @@ import okhttp3.Response; ...@@ -24,7 +24,7 @@ import okhttp3.Response;
public class DayuByWordAnalysis { public class DayuByWordAnalysis {
private static Logger logger = LoggerFactory.getLogger(DayuByWordAnalysis.class); private static Logger logger = LoggerFactory.getLogger(DayuByWordAnalysis.class);
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
public List<Map<String,Object>> getDayuByWordData(String result,Proxy proxy) { public List<Map<String,Object>> getDayuByWordData(String result,Proxy proxy) {
List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>(); List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
......
package com.zhiwei.parse.analysis; package com.zhiwei.parse.analysis;
import java.net.Proxy;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
...@@ -12,16 +11,15 @@ import org.slf4j.LoggerFactory; ...@@ -12,16 +11,15 @@ import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils; import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.httpclient.HeadGet; import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response; import okhttp3.Response;
public class FenghuangAccountAnalysis { public class FenghuangAccountAnalysis {
private static Logger logger = LoggerFactory.getLogger(FenghuangAccountAnalysis.class); private static Logger logger = LoggerFactory.getLogger(FenghuangAccountAnalysis.class);
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
/** /**
* *
...@@ -29,70 +27,61 @@ public class FenghuangAccountAnalysis { ...@@ -29,70 +27,61 @@ public class FenghuangAccountAnalysis {
* @param result * @param result
* @return * @return
*/ */
public List<Map<String,Object>> getArticleData(String url,String startTime,Proxy proxy) { public List<Map<String,Object>> getArticleData(String url,String startTime,ProxyHolder proxy) {
List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>(); List<Map<String,Object>> dataList = new ArrayList<>();
try { try {
Map<String,String> headerMap = HeadGet.getFenghuangAccountHeaderMap(null); Map<String,String> headerMap = HeadGet.getFenghuangAccountHeaderMap(null);
JSONArray jsonArry = null; for(int i = 0;i < 3;i++) {
for(int i = 0;i < 3;i++) { try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy)){
try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy)){ String result = response.body().string();
String result = response.body().string(); System.out.println(result);
JSONObject json = JSONObject.parseObject(result); JSONObject json = JSONObject.parseObject(result.replace("getFengAuthorListData(", "").replace("]})", "]}"));
jsonArry = json.getJSONObject("data").getJSONObject("feeds").getJSONArray("list"); JSONArray jsonArry = json.getJSONArray("data");
if(jsonArry == null || jsonArry.size() < 1) { for(int j = 0;j < jsonArry.size();j++) {
try {
JSONObject data = jsonArry.getJSONObject(j);
Map<String,Object> map = new HashMap<>();
map.put("title", data.getString("title"));
String time = data.getString("newsTime");
map.put("time", data.getString("newsTime"));
map.put("url", "https:" + data.getString("url"));
map.put("id", data.getString("commentUrl"));
if(time.compareTo(startTime) >= 0) {
dataList.add(map);
}
} catch (Exception e) {
logger.error(" exception {}",e);
}
}
break;
} catch (Exception e) {
e.printStackTrace();
continue; continue;
} }
} catch (Exception e) {
continue;
} }
}
if(jsonArry == null || jsonArry.size() < 1) {
return dataList; return dataList;
}
for(int i = 0;i < jsonArry.size();i++) {
try {
JSONObject data = jsonArry.getJSONObject(i);
String articleurl = data.getString("id");
String articleResult = HttpClient.executeHttpRequestGet(articleurl,proxy, headerMap);
Map<String,Object> dataMap = getArticle(articleResult);
ZhiWeiTools.sleep(1000);
if(dataMap != null) {
String time = (String)dataMap.get("time");
if(time.compareTo(startTime) >= 0) {
dataList.add(dataMap);
continue;
}
}
} catch (Exception e) {
e.printStackTrace();
continue;
}
}
return dataList;
} catch (Exception e1) { } catch (Exception e1) {
e1.printStackTrace();
return dataList; return dataList;
} }
} }
private static Map<String,Object> getArticle(String articleResult) { // private static Map<String,Object> getArticle(String articleResult) {
JSONObject json = JSONObject.parseObject(articleResult).getJSONObject("body"); // try {
Map<String,Object> map = new HashMap<String, Object>(); // Map<String,Object> map = new HashMap<>();
try { // JSONObject json = JSONObject.parseObject(articleResult).getJSONObject("body");
map.put("title", json.getString("title")); // map.put("title", json.getString("title"));
String time = json.getString("cTime").replaceAll("/", "-"); // String time = json.getString("cTime").replaceAll("/", "-");
map.put("time", time); // map.put("time", time);
map.put("text", json.getString("text").replaceAll("<.*?>", "")); // map.put("text", json.getString("text").replaceAll("<.*?>", ""));
map.put("source", json.getString("source")); // map.put("source", json.getString("source"));
map.put("url", json.getString("shareurl")); // map.put("url", "https://share.iclient.ifeng.com/news/shareNews?aid=sub_" + json.getString("aid"));
map.put("id", json.getString("aid")); // map.put("id", json.getString("aid"));
} catch (Exception e) { // return map;
logger.error("解析具体文章的时候出错 {}",e); // } catch (Exception e) {
return null; // logger.error("解析具体文章的时候出错 {}",e);
} // return null;
return map; // }
} // }
......
...@@ -23,7 +23,7 @@ import okhttp3.Response; ...@@ -23,7 +23,7 @@ import okhttp3.Response;
public class FenghuangCommentAnalysis { public class FenghuangCommentAnalysis {
private static Logger logger = LoggerFactory.getLogger(FenghuangCommentAnalysis.class); private static Logger logger = LoggerFactory.getLogger(FenghuangCommentAnalysis.class);
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
public Map<String,Object> getFenghuangCommentCount(String url,ProxyHolder proxy) { public Map<String,Object> getFenghuangCommentCount(String url,ProxyHolder proxy) {
Map<String,Object> map = new HashMap<>(); Map<String,Object> map = new HashMap<>();
......
package com.zhiwei.parse.analysis; package com.zhiwei.parse.analysis;
import java.net.URLDecoder;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Date; import java.util.Date;
import java.util.HashMap; import java.util.HashMap;
...@@ -26,7 +25,7 @@ public class SouhuAccountAnalysis { ...@@ -26,7 +25,7 @@ public class SouhuAccountAnalysis {
* @return * @return
*/ */
public List<Map<String,Object>> analysisData(JSONArray jsonArray,String name) { public List<Map<String,Object>> analysisData(JSONArray jsonArray,String name) {
List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>(); List<Map<String,Object>> dataList = new ArrayList<>();
for(int i = 0;i < jsonArray.size();i++) { for(int i = 0;i < jsonArray.size();i++) {
JSONObject data = jsonArray.getJSONObject(i); JSONObject data = jsonArray.getJSONObject(i);
Map<String,Object> map = parseHtmlByAccount(data,name); Map<String,Object> map = parseHtmlByAccount(data,name);
...@@ -46,19 +45,15 @@ public class SouhuAccountAnalysis { ...@@ -46,19 +45,15 @@ public class SouhuAccountAnalysis {
* @return * @return
*/ */
private static Map<String,Object> parseHtmlByAccount(JSONObject data,String name) { private static Map<String,Object> parseHtmlByAccount(JSONObject data,String name) {
Map<String,Object> map = new HashMap<String, Object>(); Map<String,Object> map = new HashMap<>();
try { try {
String title = data.getString("title"); map.put("title", data.getString("title"));
map.put("title", URLDecoder.decode(title, "UTF-8"));
map.put("source", name); map.put("source", name);
String content = data.getString("brief"); map.put("content", data.getString("brief"));
map.put("content", URLDecoder.decode(content,"UTF-8"));
map.put("newsPv", data.getString("newsPv")); map.put("newsPv", data.getString("newsPv"));
map.put("url", data.getString("url")); map.put("url", data.getString("link"));
long timelong = Long.valueOf(data.getString("postTime")); map.put("time", new Date(data.getLong("publicTime")));
map.put("time", new Date(timelong)); JSONArray jsonArry = data.getJSONArray("tagDetails");
map.put("comment", data.getString("commentsCnt"));
JSONArray jsonArry = data.getJSONArray("tags");
String tags = ""; String tags = "";
for(int i = 0;i < jsonArry.size();i++) { for(int i = 0;i < jsonArry.size();i++) {
JSONObject ob = jsonArry.getJSONObject(i); JSONObject ob = jsonArry.getJSONObject(i);
...@@ -68,10 +63,9 @@ public class SouhuAccountAnalysis { ...@@ -68,10 +63,9 @@ public class SouhuAccountAnalysis {
tags = tags.substring(0,tags.length()-1); tags = tags.substring(0,tags.length()-1);
} }
map.put("tags", tags); map.put("tags", tags);
map.put("newsid", data.getString("newsid")); map.put("newsid", data.getString("id"));
} catch (Exception e) { } catch (Exception e) {
logger.error("搜狐历史文章解析出错了",e.getMessage()); logger.error("搜狐历史文章解析出错了 {}",e.getMessage());
System.out.println(data.toString());
return null; return null;
} }
......
...@@ -20,7 +20,7 @@ import okhttp3.Response; ...@@ -20,7 +20,7 @@ import okhttp3.Response;
public class SouhuCommentAnalysis { public class SouhuCommentAnalysis {
private static Logger logger = LoggerFactory.getLogger(SouhuCommentAnalysis.class); private static Logger logger = LoggerFactory.getLogger(SouhuCommentAnalysis.class);
private HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
/** /**
* *
......
package com.zhiwei.parse.shipin; package com.zhiwei.parse.shipin;
import java.net.Proxy;
import java.net.URLEncoder; import java.net.URLEncoder;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
...@@ -18,7 +17,6 @@ import org.slf4j.LoggerFactory; ...@@ -18,7 +17,6 @@ import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils; import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
...@@ -36,7 +34,7 @@ import okhttp3.Response; ...@@ -36,7 +34,7 @@ import okhttp3.Response;
public class QQTV { public class QQTV {
private static final Logger logger = LoggerFactory.getLogger(QQTV.class); private static final Logger logger = LoggerFactory.getLogger(QQTV.class);
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
public static List<Map<String,Object>> getData(String word,String time,ProxyHolder proxy) { public static List<Map<String,Object>> getData(String word,String time,ProxyHolder proxy) {
List<Map<String,Object>> dataList = new ArrayList<>(); List<Map<String,Object>> dataList = new ArrayList<>();
...@@ -52,8 +50,8 @@ public class QQTV { ...@@ -52,8 +50,8 @@ public class QQTV {
logger.info(" 关键词 {} 量 {} 页 数 {} 此页量 {} ",word,dataList.size(),page,elements.size()); logger.info(" 关键词 {} 量 {} 页 数 {} 此页量 {} ",word,dataList.size(),page,elements.size());
for(Element element : elements) { for(Element element : elements) {
String nurl = element.select("h2.result_title").select("a").attr("href"); String nurl = element.select("h2.result_title").select("a").attr("href");
Map<String,Object> map = getUrlData(nurl, ProxyFactory.getNatProxy()); Map<String,Object> map = getUrlData(nurl, ProxyHolder.NAT_HEAVY_PROXY);
if(Objects.nonNull(map) && time.compareTo(String.valueOf(map.get("time"))) < 1) { if(Objects.nonNull(map) && !map.isEmpty() && time.compareTo(String.valueOf(map.get("time"))) < 1) {
map.put("word", word); map.put("word", word);
dataList.add(map); dataList.add(map);
} }
...@@ -61,6 +59,9 @@ public class QQTV { ...@@ -61,6 +59,9 @@ public class QQTV {
} }
page++; page++;
if(count != dataList.size()) { if(count != dataList.size()) {
if(page > 20) {
break;
}
continue; continue;
} }
...@@ -76,24 +77,26 @@ public class QQTV { ...@@ -76,24 +77,26 @@ public class QQTV {
return dataList; return dataList;
} }
private static Map<String,Object> getUrlData(String url,Proxy proxy) { private static Map<String,Object> getUrlData(String url,ProxyHolder proxy) {
for(int i = 1;i < 3;i++) { if(!url.contains("v.qq.com")) {
try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)){ return null;
String result = response.body().string(); }
String source = result.split("\\<span class=\"user_name\"\\>")[1].split("\\</span\\>")[0]; System.out.println(url);
result = result.split("var VIDEO_INFO =")[1].split("\\</script\\>")[0]; try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)){
JSONObject json = JSONObject.parseObject(result); String result = response.body().string();
Map<String,Object> map = new HashMap<>(); String source = result.split("\\<span class=\"user_name\"\\>")[1].split("\\</span\\>")[0];
result = result.split("var VIDEO_INFO =")[1].split("\\</script\\>")[0];
map.put("playCount", json.getInteger("view_all_count")); JSONObject json = JSONObject.parseObject(result);
map.put("title", json.getString("title")); Map<String,Object> map = new HashMap<>();
map.put("time", json.getString("video_checkup_time"));
map.put("source", source); map.put("playCount", json.getInteger("view_all_count"));
map.put("url", url); map.put("title", json.getString("title"));
return map; map.put("time", json.getString("video_checkup_time"));
} catch (Exception e) { map.put("source", source);
e.printStackTrace(); map.put("url", url);
} return map;
} catch (Exception e) {
e.printStackTrace();
} }
return Collections.emptyMap(); return Collections.emptyMap();
} }
......
...@@ -24,7 +24,7 @@ import okhttp3.Response; ...@@ -24,7 +24,7 @@ import okhttp3.Response;
public class SohuTV { public class SohuTV {
private static final Logger logger = LoggerFactory.getLogger(SohuTV.class); private static final Logger logger = LoggerFactory.getLogger(SohuTV.class);
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
public static List<Map<String,Object>> sohuTVData(String word,String cookie,Proxy proxy) { public static List<Map<String,Object>> sohuTVData(String word,String cookie,Proxy proxy) {
List<Map<String,Object>> dataList = new ArrayList<>(); List<Map<String,Object>> dataList = new ArrayList<>();
......
//package com.zhiwei.Comment; //package com.zhiwei.Comment;
// //
//import org.testng.annotations.Test; //import org.junit.Test;
// //
//import com.zhiwei.parse.Aika; //import com.zhiwei.parse.Aika;
//import com.zhiwei.tools.timeparse.TimeExtraction;
//import com.zhiwei.tools.timeparse.TimeParse;
// //
//public class AikaComment { //public class AikaComment {
// @Test // @Test
// public void f() { // public void f() {
// String url = "http://newcar.xcar.com.cn/201809/news_2021765_1.html"; // String url = "http://info.xcar.com.cn/201906/news_2039730_1.html";
// //
// Aika.getAikaComment(url, null); // Aika.getAikaComment(url, null);
// //
......
package com.zhiwei.Comment; //package com.zhiwei.Comment;
//
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
//
import org.testng.annotations.Test; //import org.junit.Test;
//
import com.zhiwei.common.config.GroupType; //import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory; //import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder; //import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.excelpoi.excel.PoiExcelUtil; //import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Aiqiyi; //import com.zhiwei.parse.Aiqiyi;
//
public class AiqiyiHotCountTest { //public class AiqiyiHotCountTest {
@Test // @Test
public void f() { // public void f() {
PoiExcelUtil poi = PoiExcelUtil.getInstance(); // PoiExcelUtil poi = PoiExcelUtil.getInstance();
String path = "C:\\Users\\byte-zbs\\Documents\\WXWork\\1688854025129101\\Cache\\File\\2019-03\\爱奇艺.xlsx"; // String path = "C:\\Users\\byte-zbs\\Documents\\WXWork\\1688854025129101\\Cache\\File\\2019-03\\爱奇艺.xlsx";
Map<String,Object> map = poi.importExcel(path, 0); // Map<String,Object> map = poi.importExcel(path, 0);
List<Map<String,Object>> dataList = (List<Map<String, Object>>) map.get("body"); // List<Map<String,Object>> dataList = (List<Map<String, Object>>) map.get("body");
ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER); // ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
List<String> headList = (List<String>) map.get("head"); // List<String> headList = (List<String>) map.get("head");
headList.add("count"); // headList.add("count");
dataList.forEach(m -> { // dataList.forEach(m -> {
String url = String.valueOf(m.get("链接")); // String url = String.valueOf(m.get("链接"));
//
int i = Aiqiyi.aiqiyiHotCount(url, ProxyHolder.NAT_PROXY); // int i = Aiqiyi.aiqiyiHotCount(url, ProxyHolder.NAT_PROXY);
System.out.println(url + " -- " + i); // System.out.println(url + " -- " + i);
m.put("count", i); // m.put("count", i);
}); // });
poi.exportExcel(path, "data", headList, dataList); // poi.exportExcel(path, "data", headList, dataList);
} // }
} //}
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
//import java.util.List; //import java.util.List;
//import java.util.Map; //import java.util.Map;
// //
//import org.testng.annotations.Test; //import org.junit.Test;
// //
//import com.zhiwei.common.config.GroupType; //import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory; //import com.zhiwei.crawler.proxy.ProxyFactory;
...@@ -18,27 +18,28 @@ ...@@ -18,27 +18,28 @@
// public void f() { // public void f() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", // ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",
// GroupType.PROVIDER); // GroupType.PROVIDER);
// PoiExcelUtil poi = PoiExcelUtil.getInstance(); //// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// //
// Map<String, Object> map = poi //// Map<String, Object> map = poi
// .importExcel("C:\\Users\\byte-zbs\\Desktop\\tx\\脉脉公司圈.xlsx", 0); //// .importExcel("C:\\Users\\byte-zbs\\Desktop\\tx\\脉脉公司圈.xlsx", 0);
// List<Map<String, Object>> list = (List<Map<String, Object>>) map.get("body"); //// List<Map<String, Object>> list = (List<Map<String, Object>>) map.get("body");
// String cookie = "_buuid=ba30f54f-57ed-4dd4-af5f-31cb08d2eacf; sessionid=lejfy3gdu5tf9x9zowxfhtq5o73dubc5; guid=GxsfBBgZGwQYGx4EGBkeVgcYGxkdHhMeHhkcVhwZBB0ZHwVDWEtMS3kKEhMEEh0fGQQaBBsdBU9HRVhCaQoDRUFJT20KT0FDRgoGZmd+YmECChwZBB0ZHwVeQ2FIT31PRlpaawoDHhx9ZX0KERkEHAp+ZApZXUVOREN9AgoaBB8FS0ZGQ1BFZw==; seid=s1550814253444; token=\"rhItcea5qkO6WCSnVcczW/NRVLLCTsq3kQbpUCGAwQ0ceLunVJRjT5rgoFVYrIBA8CKuzcDfAvoCmBm7+jVysA==\"; uid=\"aa0Zx+VbwC41ceG8bxvIefAirs3A3wL6ApgZu/o1crA=\"; session=eyJ1IjoiMTczMzMzNTM2Iiwic2VjcmV0IjoiVjJuNHdCVDBncVNacTRxVllGM29jRUVwIiwic3RhdHVzIjp0cnVlLCJfZXhwaXJlIjoxNTUwOTAyMTY3MDQ5LCJfbWF4QWdlIjo4NjQwMDAwMH0=; session.sig=zbs4cHtzTcHWvjtkpjAZmoqLXsQ"; //// List<Map<String, Object>> bodyList = new ArrayList<Map<String, Object>>();
// List<Map<String, Object>> bodyList = new ArrayList<Map<String, Object>>(); //// List<String> headList = (List<String>) map.get("head");
// List<String> headList = (List<String>) map.get("head"); //// for (Map<String, Object> map1 : list) {
// for (Map<String, Object> map1 : list) { //// String url = map1.get("地址") + "";
// String url = map1.get("地址") + ""; // String cookie = "_buuid=0668b664-13b3-4bd0-aa37-99d747432e85; guid=HBoEGxgEGBscBBsZGlYHGBsZHRsYExwZHFYcGQQdGR8FQ1hLTEt5ChITBBIdHxkEGgQbHQVPR0VYQmkKA0VBSU9tCk9BQ0YKBmZnfmJhAgocGQQdGR8FXkNhSE99T0ZaWmsKAx4cfWV9ChEZBBwKfmQKWV1FTkRDfQIKGgQfBUtGRkNQRWc=; token=\"ou+mv1hjxjm0uOOTss1vgck9+h6OCS/lYQUeFnJgSK70FHprmw6GmjBGwk2qPQH88CKuzcDfAvoCmBm7+jVysA==\"; uid=\"A8ELjewCDRgHnZ5bX0Vy0/Airs3A3wL6ApgZu/o1crA=\"; session=eyJ1IjoiMjI3NjU0NTI0Iiwic2VjcmV0IjoiV0wyZmEtZDZxbkx2TEkzZHF2dTN4UG5SIiwiX2V4cGlyZSI6MTU2MDU5Mzg4Mjc5NCwiX21heEFnZSI6ODY0MDAwMDB9; session.sig=ujhqvC3wPAn-WlCPXfB6C5ZJIgY";
// Map<String,Object> map3 = Maimai.getMaiaiCount(url,null, ProxyHolder.NAT_PROXY); // String url = "https://maimai.cn/web/gossip_detail?src=app&webid=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJlZ2lkIjoiMjU5OTg4YmE4YzI3MTFlOTllMjEyNDZlOTZiNDgwODgiLCJ1IjoxNzY2NDQ3NjUsImlkIjoyMjIwMDM4NH0.gBdp3i99L0xUKLglaVIJf07OrrPk6yoG9HAo-3Yftqk";
// Map<String,Object> map3 = Maimai.getMaiaiCount(url,cookie, ProxyHolder.NAT_HEAVY_PROXY);
// System.out.println(map3.toString()); // System.out.println(map3.toString());
// System.out.println(url); // System.out.println(url);
// map1.putAll(map3); //// map1.putAll(map3);
// ZhiWeiTools.sleep(500); //// ZhiWeiTools.sleep(500);
// System.out.println("--------------------------"); //// System.out.println("--------------------------");
// } //// }
// headList.add("like"); //// headList.add("like");
// headList.add("spreads"); //// headList.add("spreads");
// headList.add("cmts"); //// headList.add("cmts");
// poi.exportExcel("C:\\Users\\byte-zbs\\Desktop\\tx\\脉脉公司圈.xlsx", "评论采集", headList, //// poi.exportExcel("C:\\Users\\byte-zbs\\Desktop\\tx\\脉脉公司圈.xlsx", "评论采集", headList,
// list); //// list);
// } // }
//} //}
package com.zhiwei.Comment; //package com.zhiwei.Comment;
//
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
//
import org.testng.annotations.Test; //import org.junit.Test;
//
import com.zhiwei.common.config.GroupType; //import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory; //import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder; //import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.excelpoi.excel.PoiExcelUtil; //import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Aiqiyi; //import com.zhiwei.parse.Youku;
import com.zhiwei.parse.Youku; //
//public class YoukuHotCountTest {
public class YoukuHotCountTest { // @Test
@Test // public void f() {
public void f() { //
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
PoiExcelUtil poi = PoiExcelUtil.getInstance(); // String path = "C:\\Users\\byte-zbs\\Documents\\WXWork\\1688854025129101\\Cache\\File\\2019-03\\视频奶粉.xlsx";
String path = "C:\\Users\\byte-zbs\\Documents\\WXWork\\1688854025129101\\Cache\\File\\2019-03\\优酷.xlsx"; // Map<String,Object> map = poi.importExcel(path, 0);
Map<String,Object> map = poi.importExcel(path, 0); // List<Map<String,Object>> dataList = (List<Map<String, Object>>) map.get("body");
List<Map<String,Object>> dataList = (List<Map<String, Object>>) map.get("body"); // ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER); // List<String> headList = (List<String>) map.get("head");
List<String> headList = (List<String>) map.get("head"); // headList.add("count");
headList.add("count"); // dataList.forEach(m -> {
dataList.forEach(m -> { // String url = String.valueOf(m.get("url"));
String url = String.valueOf(m.get("链接")); //
// int i = Youku.getYoukuHotCount(url, ProxyHolder.NAT_PROXY);
int i = Youku.getYoukuHotCount(url, ProxyHolder.NAT_PROXY); // System.out.println(url + " -- " + i);
System.out.println(url + " -- " + i); // m.put("count", i);
m.put("count", i); // });
}); // poi.exportExcel(path, "data", headList, dataList);
poi.exportExcel(path, "data", headList, dataList); //
//
// }
} //}
}
//package com.zhiwei; package com.zhiwei;
//
//import java.io.IOException; import com.zhiwei.crawler.core.HttpBoot;
//import java.util.HashMap; import com.zhiwei.crawler.utils.RequestUtils;
//import java.util.Map;
// import okhttp3.Response;
//import java.util.HashMap;
// public class TestHttpBoot {
//import org.testng.annotations.Test;
// public static void main(String[] args) {
//import com.zhiwei.crawler.core.HttpBoot; HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).followSslRedirects(false).build();
//import com.zhiwei.crawler.core.RequestUtils; String url = "http://v.youku.com/v_show/id_XMzg1ODAwOTcwOA==.html";
// try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url))){
//public class TestHttpBoot { url = response.body().string();
// @Test System.out.println(url);
// public void f() { } catch (Exception e) {
// HttpBoot httpBoot = new HttpBoot(); e.printStackTrace();
// String url = "https://www.toutiao.com/c/user/following/?user_id=1034006366&count=20&_signature=wp5wPBAVmXlosTC8Fobui8KecC"; }
// Map<String,Object> headers = new HashMap<>(); }
// headers.put("referer", "https://www.qctt.cn/news/349056");
// headers.put("cookie", "PHPSESSID=3rd6bvonb4g15t1fp777mjums0; Hm_lvt_70af9ea91e7adc8195f6d49511b9a2f1=1542253722; open_ad=1; Hm_lpvt_70af9ea91e7adc8195f6d49511b9a2f1=1542271394; vcode=sqmm; XSRF-TOKEN=eyJpdiI6IlFTNzkyYWNcLzB2SUwyN2dcL1hhUlpsZz09IiwidmFsdWUiOiJRSUpycjZJNGx3d1hUWkpOQUl1R2psSStuVU0yYW8xT1YxXC9QOFY1NjdyRXNrMWpFVE1kSm9IQ1o5Nm5keXlMTEFnZXdCOHVvWDg0U2picTE1cjZzMkE9PSIsIm1hYyI6IjZlYzk5NDI3ODEzMzA3ZTJjNDc3M2ZjMjBlNDJhZjc2YjU2ODFmYmY3YWRlMzdlMzM1NTBlNWMxNDk3MjFiZDEifQ%3D%3D; laravel_session=eyJpdiI6InJQMnByeFlIbXVhaUVVVVBLK1wvaXlRPT0iLCJ2YWx1ZSI6IlhUOUtIS2ZQZ0ZKNFh1RDVQYjBjSVZkVkpQZTdYRDNpa1wvV0o5QlJPbk8xZE0rQ3dZdnFMdjcya011ejVkdWEwUk1Qa29Zb2Y3OU0yUGkrWDF4Wk5adz09IiwibWFjIjoiZGJiYjlkNWZhNmJhMDFiMjkxYTAyMmUwZTEyMWVmZTQ0NmJiZDQ2ZGU3ZjNjNmUzNTIwZGI0NTc4NDJlZjNiMCJ9"); }
// headers.put("origin", "https://www.qctt.cn");
// Map<String,Object> params = new HashMap<>();
// params.put("id", "349056");
// params.put("page", "3");
// params.put("_token", "EJ58V0qilRw7P77czp0U6iO9QW2IOS1ZGiBk4wH1");
// try {
// String result = httpBoot.syncCall(RequestUtils.wrapGet(url)).body().string();
// System.out.println(result);
//
// } catch (IOException e) {
// // TODO Auto-generated catch block
// e.printStackTrace();
// }
//
//
// }
//}
//package com.zhiwei.crawler; package com.zhiwei.crawler;
//
//import java.util.ArrayList; import java.util.ArrayList;
//import java.util.List; import java.util.List;
//import java.util.Map; import java.util.Map;
//
//import org.junit.Test; import org.junit.Test;
//
//import com.zhiwei.excelpoi.excel.PoiExcelUtil; import com.zhiwei.common.config.GroupType;
//import com.zhiwei.parse.Dayu; import com.zhiwei.crawler.proxy.ProxyFactory;
// import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//public class DayuAccountExample { import com.zhiwei.parse.Dayu;
//
// public class DayuAccountExample {
// @Test
// public void dayuAccountTest() {
// //https://api.m.sm.cn/rest?method=Subscribe.list&format=html&from=wh10331&uc_biz_str=S:custom%7CC:search%7CN:true @Test
// public void dayuAccountTest() {
// //https://api.m.sm.cn/rest?method=Subscribe.list&format=html&from=wh10331&uc_biz_str=S:custom%7CC:search%7CN:true
//// String mid = "d7300311c1504d24a229c3da345785c6"; ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
//// String name = "大鱼海棠雨";
// String startTime = "2017-01-01 00:00:00"; // String mid = "0a8b2360fd8b4ded971cd324a56d32f0";
// String path = "D:\\crawlerdata\\自媒体\\大鱼历史文章.xlsx"; // String name = "大鱼海棠雨";
// PoiExcelUtil poi = PoiExcelUtil.getInstance(); String startTime = "2017-01-01 00:00:00";
// Map<String,Object> map = poi.importExcel(path, 0); String path = "D:\\crawlerdata\\自媒体\\大鱼历史文章.xlsx";
// List<Map<String,Object>> lists = (List<Map<String, Object>>) map.get("body"); PoiExcelUtil poi = PoiExcelUtil.getInstance();
// List<String> headList = new ArrayList<String>(); Map<String,Object> map = poi.importExcel(path, 0);
// headList.add("title"); List<Map<String,Object>> lists = (List<Map<String, Object>>) map.get("body");
// headList.add("time"); List<String> headList = new ArrayList<String>();
// headList.add("content"); headList.add("title");
// headList.add("source"); headList.add("time");
// headList.add("url"); headList.add("content");
//// headList.add("content_id"); headList.add("source");
//// headList.add("origin_id"); headList.add("url");
//// headList.add("xss_item_id"); // headList.add("content_id");
// for(Map<String,Object> data : lists) { // headList.add("origin_id");
// String mid = data.get("mid")+""; // headList.add("xss_item_id");
// String name = data.get("name")+""; for(Map<String,Object> data : lists) {
// if(mid.length() < 1 && name.length() < 1) { String mid = data.get("mid")+"";
// continue; String name = data.get("name")+"";
// } mid = "7b345070c4124574b9cbcab8c4a1aeb8";
// List<Map<String,Object>> dataList = Dayu.getDayuAccountData(mid,name,null,null); name = "国魂";
// poi.exportExcel(path, name, headList, dataList); if(mid.length() < 1 && name.length() < 1) {
// } continue;
// }
// List<Map<String,Object>> dataList = Dayu.getDayuAccountData(mid,name,null,null);
// } poi.exportExcel(path, name, headList, dataList);
// }
//
//}
}
}
package com.zhiwei.crawler; //package com.zhiwei.crawler;
//
import org.junit.Test; //import org.junit.Test;
//
import com.zhiwei.parse.Dayu; //import com.zhiwei.parse.Dayu;
//
public class DayuCommentCountExample { //public class DayuCommentCountExample {
//
@Test // @Test
public void dayuCommentCountTest() { // public void dayuCommentCountTest() {
String articleId = "6987993456991247474"; // String articleId = "6987993456991247474";
//
int i = Dayu.getDayuCommentCount(articleId,null); // int i = Dayu.getDayuCommentCount(articleId,null);
System.out.println(i); // System.out.println(i);
} // }
//
//
//
} //}
package com.zhiwei.crawler; //package com.zhiwei.crawler;
//
import java.util.ArrayList; //import java.util.ArrayList;
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
//
import org.junit.Test; //import org.junit.Test;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil; //import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Dayu; //import com.zhiwei.parse.Dayu;
//
public class DayuCommentExample { //public class DayuCommentExample {
//
@Test // @Test
public void getDayuCommentTest() { // public void getDayuCommentTest() {
//若已获取历史文章 哪里有这个字段 其他文章的 // //若已获取历史文章 哪里有这个字段 其他文章的
//http://m.uczzd.cn/iflow/api/v2/cmt/article/14180961224021425316/comments/byhot // //http://m.uczzd.cn/iflow/api/v2/cmt/article/14180961224021425316/comments/byhot
//14180961224021425316 这个为此参数 // //14180961224021425316 这个为此参数
PoiExcelUtil poi = PoiExcelUtil.getInstance(); // PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
Map<String,Object> map = poi.importExcel("D://crawlerdata//自媒体//UC评论采集-1.xlsx", 0); // Map<String,Object> map = poi.importExcel("D://crawlerdata//自媒体//UC评论采集-1.xlsx", 0);
List<Map<String,Object>> list = (List<Map<String,Object>>)map.get("body"); // List<Map<String,Object>> list = (List<Map<String,Object>>)map.get("body");
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>(); // List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
List<String> urlList = new ArrayList<String>(); // List<String> urlList = new ArrayList<String>();
for(Map<String,Object> map1 : list) { // for(Map<String,Object> map1 : list) {
String url = ""; // String url = "";
try { // try {
url = map1.get("url")+""; // url = map1.get("url")+"";
String articleId = ""; // String articleId = "";
url = "16848608935470442496"; // url = "16848608935470442496";
if(url.contains("aid")) { // if(url.contains("aid")) {
articleId = url.split("aid=")[1].split("&")[0]; // articleId = url.split("aid=")[1].split("&")[0];
}else { // }else {
articleId = url; // articleId = url;
} // }
List<Map<String,Object>> dataList = Dayu.getDayuCommentData(articleId,null); // List<Map<String,Object>> dataList = Dayu.getDayuCommentData(articleId,null);
if(dataList.size() <= 0) { // if(dataList.size() <= 0) {
urlList.add(url); // urlList.add(url);
} // }
if(dataList != null) { // if(dataList != null) {
bodyList.addAll(dataList); // bodyList.addAll(dataList);
} // }
} catch (Exception e) { // } catch (Exception e) {
System.out.println(url); // System.out.println(url);
e.printStackTrace(); // e.printStackTrace();
continue; // continue;
} // }
} // }
List<String> headList = new ArrayList<String>(); // List<String> headList = new ArrayList<String>();
headList.add("nickname"); // headList.add("nickname");
headList.add("content"); // headList.add("content");
headList.add("id"); // headList.add("id");
headList.add("url"); // headList.add("url");
headList.add("like"); // headList.add("like");
headList.add("time"); // headList.add("time");
headList.add("replay_count"); // headList.add("replay_count");
for(String s : urlList) { // for(String s : urlList) {
System.out.println(s); // System.out.println(s);
} // }
poi.exportExcel("D://crawlerdata/UC评论采集.xlsx", "评论", headList, bodyList); // poi.exportExcel("D://crawlerdata/UC评论采集.xlsx", "评论", headList, bodyList);
//
} // }
//
//
} //}
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Fenghuang;
/**
 * Manual example that fetches the articles of Fenghuang (ifeng) accounts via
 * {@code Fenghuang.getFenghuangAccountData} and hands them to
 * {@code PoiExcelUtil.exportExcel}.
 */
public class FenghuangAccountExample {

    /**
     * Processes every comma-separated account id in turn. A failure for one id is
     * reported on stdout and the remaining ids are still processed.
     */
    @Test
    public void fenghuangAccountTest() {
        // Slow: takes roughly one second per article.
        // https://api.3g.ifeng.com/client_search_subscribe?k=号外财经
        String id = "6452";
        String startTime = "2010-05-01 00:00:00"; // may be empty

        for (String accountId : id.split(",")) {
            try {
                List<Map<String,Object>> dataList = Fenghuang.getFenghuangAccountData(accountId, startTime, null);
                PoiExcelUtil poi = PoiExcelUtil.getInstance();

                List<String> headList = new ArrayList<String>();
                headList.add("title");
                headList.add("time");
                headList.add("text");
                headList.add("source");
                headList.add("url");
                headList.add("id");

                // One workbook per account id, so that several ids do not all target the
                // same path. For the id "6452" this is the path that was previously hard-coded.
                poi.exportExcel("D://crawlerdata/凤凰-" + accountId + ".xlsx", accountId, headList, dataList);
            } catch (Exception e) {
                // Report the failing id instead of dropping the error, then move on.
                System.out.println(accountId);
                e.printStackTrace();
            }
        }
    }
}
package com.zhiwei.crawler; //package com.zhiwei.crawler;
//
import java.util.ArrayList; //import java.util.ArrayList;
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
//
import org.junit.Test; //import org.junit.Test;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil; //import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Fenghuang; //import com.zhiwei.parse.Fenghuang;
import com.zhiwei.parse.Yidianzixun; //import com.zhiwei.parse.Yidianzixun;
import com.zhiwei.util.WordReadFile; //import com.zhiwei.util.WordReadFile;
//
/**
 * Manual example that runs {@code Fenghuang.getFenghuangByWord} for every keyword
 * returned by {@code WordReadFile.getWords} and exports the combined rows.
 */
public class FenghuangByWordExample {

    /**
     * Collects the rows of all keywords into one list and passes it to
     * {@code PoiExcelUtil.exportExcel}. A keyword that fails is reported on stdout
     * and skipped.
     */
    @Test
    public void fenghuangByWordTest() {
        List<String> wordList = WordReadFile.getWords("D://crawlerdata/关键词.txt");
        List<Map<String,Object>> listAll = new ArrayList<Map<String,Object>>();
        for (String word : wordList) {
            try {
                List<Map<String,Object>> dataList = Fenghuang.getFenghuangByWord(word, null);
                // A null result counts as zero rows; previously the progress line below
                // dereferenced it and the resulting NPE was silently swallowed.
                int fetched = 0;
                if (dataList != null && dataList.size() > 0) {
                    listAll.addAll(dataList);
                    fetched = dataList.size();
                }
                System.out.println(fetched + "===========" + listAll.size());
            } catch (Exception e) {
                // Make the failing keyword visible, then continue with the next one.
                System.out.println(word);
                e.printStackTrace();
            }
        }

        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        List<String> headList = new ArrayList<String>();
        headList.add("title");
        headList.add("content");
        headList.add("source");
        headList.add("time");
        headList.add("url");
        System.out.println(listAll.size());
        poi.exportExcel("D://crawlerdata/凤凰-美林.xlsx", "asd", headList, listAll);
    }
}
package com.zhiwei.crawler; //package com.zhiwei.crawler;
//
import java.util.Map; //import java.util.Map;
//
import org.junit.Test; //import org.junit.Test;
//
import com.zhiwei.parse.Fenghuang; //import com.zhiwei.parse.Fenghuang;
//
//
/**
 * Manual example that prints the result of
 * {@code Fenghuang.getFenghuangCommentCount} for a single article URL.
 */
public class FenghuangCommentCountExample {

    /** Fetches the comment-count map for one fixed URL and prints it to stdout. */
    @Test
    public void fenghuangCommentCountTest() {
        // Alternative sample URLs left in the source:
        //   http://news.ifeng.com/a/20161229/50492484_0.shtml
        //   http://wemedia.ifeng.com/4096977/wemedia.shtml
        final String articleUrl = "http://tech.ifeng.com/a/20181113/45222352_0.shtml";

        final Map<String,Object> commentCount = Fenghuang.getFenghuangCommentCount(articleUrl, null);
        final String rendered = commentCount.toString();
        System.out.println(rendered);
    }
}
package com.zhiwei.crawler; //package com.zhiwei.crawler;
//
import java.util.ArrayList; //import java.util.ArrayList;
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
//
import org.junit.Test; //import org.junit.Test;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil; //import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Fenghuang; //import com.zhiwei.parse.Fenghuang;
import com.zhiwei.tools.tools.ZhiWeiTools; //import com.zhiwei.tools.tools.ZhiWeiTools;
//
/**
 * Manual example that reads article URLs from a workbook, fetches their comments via
 * {@code Fenghuang.getFenghuangCommentData2} and exports the collected comments.
 */
public class FenghuangCommentExample {

    /**
     * For every input row: fetch the comments of its "url" value, tag each comment
     * with "from_url" and collect it. URLs that yield no comments are printed before
     * the export. The one-second pause is skipped for a row that throws.
     */
    @Test
    public void fenghuangCommentTest() {
        PoiExcelUtil excel = PoiExcelUtil.getInstance();

        // NOTE(review): the export below uses this same path — confirm that overwriting the input is intended.
        Map<String,Object> sheet = excel.importExcel("D://crawlerdata//自媒体/凤凰评论采集.xlsx", 0);
        List<Map<String,Object>> inputRows = (List<Map<String,Object>>) sheet.get("body");

        List<Map<String,Object>> comments = new ArrayList<Map<String,Object>>();
        List<String> urlsWithoutComments = new ArrayList<String>();

        for (Map<String,Object> row : inputRows) {
            String url = "";
            try {
                url = "" + row.get("url");
                System.out.println(url);

                List<Map<String,Object>> fetched = Fenghuang.getFenghuangCommentData2(url, null);
                if (fetched == null) {
                    urlsWithoutComments.add(url);
                } else {
                    if (fetched.isEmpty()) {
                        urlsWithoutComments.add(url);
                    }
                    for (Map<String,Object> comment : fetched) {
                        comment.put("from_url", url);
                        comments.add(comment);
                    }
                }
            } catch (Exception e) {
                System.out.println(url);
                e.printStackTrace();
                continue; // a failed row does not wait before the next request
            }
            ZhiWeiTools.sleep(1000);
        }

        String[] columns = {"nickname", "content", "id", "like", "from", "time", "from_url"};
        List<String> headList = new ArrayList<String>();
        for (String column : columns) {
            headList.add(column);
        }

        for (String emptyUrl : urlsWithoutComments) {
            System.out.println(emptyUrl);
        }
        excel.exportExcel("D://crawlerdata//自媒体/凤凰评论采集.xlsx", "评论采集", headList, comments);
    }
}
//package com.zhiwei.crawler; package com.zhiwei.crawler;
//
//import java.util.ArrayList; import java.util.ArrayList;
//import java.util.Arrays; import java.util.Arrays;
//import java.util.List; import java.util.List;
//import java.util.Map; import java.util.Map;
//
//import com.zhiwei.excelpoi.excel.PoiExcelUtil; import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Maimai; import com.zhiwei.parse.Maimai;
//
//public class MaimaiBywordExample { public class MaimaiBywordExample {
//
// public static void main(String[] args) { public static void main(String[] args) {
// String word = "美团|某团|MT|大众点评|新美大|美团点评"; String word = "美团|某团|MT|大众点评|新美大|美团点评";
// String cookie = "_buuid=ba30f54f-57ed-4dd4-af5f-31cb08d2eacf; sessionid=lejfy3gdu5tf9x9zowxfhtq5o73dubc5; guid=GxsfBBgZGwQYGx4EGBkeVgcYGxkdHhMeHhkcVhwZBB0ZHwVDWEtMS3kKEhMEEh0fGQQaBBsdBU9HRVhCaQoDRUFJT20KT0FDRgoGZmd+YmECChwZBB0ZHwVeQ2FIT31PRlpaawoDHhx9ZX0KERkEHAp+ZApZXUVOREN9AgoaBB8FS0ZGQ1BFZw==; seid=s1550814253444; token=\"G8eNNNylPoi3oIPLUr/d/RDaMgtnpZCskxT7wu1pRRrkiy3J8G7StHgTx9DQBq4O8CKuzcDfAvoCmBm7+jVysA==\"; uid=\"aa0Zx+VbwC41ceG8bxvIefAirs3A3wL6ApgZu/o1crA=\"; session=eyJ1IjoiMTczMzMzNTM2Iiwic2VjcmV0IjoiVjJuNHdCVDBncVNacTRxVllGM29jRUVwIiwic3RhdHVzIjp0cnVlLCJfZXhwaXJlIjoxNTUwOTAwNjY1Njg4LCJfbWF4QWdlIjo4NjQwMDAwMH0=; session.sig=b_tga85tZskxsgKX8YIM_JKByi0"; String cookie = "_buuid=ba30f54f-57ed-4dd4-af5f-31cb08d2eacf; sessionid=8d1sx8i4gj0ocmtyc86x2yj0467ymayv; token=\"wl8U6GizDpoS6uzZ1ug93sJjfBucfB7IOoDxDVWOy+g7egJdXL/riMlMlHuQj+gM8CKuzcDfAvoCmBm7+jVysA==\"; uid=\"aa0Zx+VbwC41ceG8bxvIefAirs3A3wL6ApgZu/o1crA=\"; session=eyJ1IjoiMTczMzMzNTM2Iiwic2VjcmV0IjoiLVctRlpDLXg3N1h4ZEhkeEs0Qi1NR0VDIiwibWlkNDU2ODc2MCI6ZmFsc2UsInN0YXR1cyI6dHJ1ZSwiX2V4cGlyZSI6MTU1NzEyNDAxMzA0NSwiX21heEFnZSI6ODY0MDAwMDB9; session.sig=NZ2D9ZQU_Wlx6JGAFap4Znviz6k";
// String time = "2019-02-15 00:00:00"; String time = "2019-02-15 00:00:00";
// String[] words = word.split("\\|"); String[] words = word.split("\\|");
// List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>(); List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
// for(String w : words) { for(String w : words) {
// //实名动态 //实名动态
//// List<Map<String,Object>> c = Maimai.getData(w, cookie, time, null); // List<Map<String,Object>> c = Maimai.getData(w, cookie, time, null);
// //职言交流 //职言交流
// List<Map<String,Object>> c2 = Maimai.getDataByNoName(w, cookie, time, null); List<Map<String,Object>> c2 = Maimai.getDataByNoName(w, cookie, time, null);
//// bodyList.addAll(c); // bodyList.addAll(c);
// bodyList.addAll(c2); bodyList.addAll(c2);
// } }
// List<String> headList = Arrays.asList("time","url","text","name","like","comment_count","spreads","word"); List<String> headList = Arrays.asList("time","url","text","name","like","comment_count","spreads","word");
// PoiExcelUtil poi = PoiExcelUtil.getInstance(); PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("D:\\crawlerdata\\自媒体\\脉脉关键词采集-美团-0222.xlsx", "脉脉关键词", headList, bodyList); poi.exportExcel("D:\\crawlerdata\\自媒体\\脉脉关键词采集-美团-0222.xlsx", "脉脉关键词", headList, bodyList);
// } }
//
//} }
package com.zhiwei.crawler; //package com.zhiwei.crawler;
//
import java.util.ArrayList; //import java.util.ArrayList;
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
//
import org.junit.Test; //import org.junit.Test;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil; //import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Meipai; //import com.zhiwei.parse.Meipai;
//
/**
 * Manual example that runs {@code Meipai.getMeipaiByWordData} for a fixed list of
 * keywords and exports the combined rows.
 */
public class MeipaiByWordExample {

    /** Collects the non-null results of every keyword and passes them to {@code exportExcel}. */
    @Test
    public void meipaiByWordTest() {
        String keywords = "美食,吃,菜";

        List<Map<String,Object>> rows = new ArrayList<Map<String,Object>>();
        for (String keyword : keywords.split(",")) {
            List<Map<String,Object>> found = Meipai.getMeipaiByWordData(keyword, null);
            if (found == null) {
                continue;
            }
            rows.addAll(found);
        }

        String[] columns = {
            "time", "video_count", "content", "url",
            "like", "comment_count", "source", "source_url"
        };
        List<String> headList = new ArrayList<String>();
        for (String column : columns) {
            headList.add(column);
        }

        PoiExcelUtil excel = PoiExcelUtil.getInstance();
        excel.exportExcel("D://crawlerdata/美拍关键词采集.xlsx", "美拍数据", headList, rows);
    }
}
package com.zhiwei.crawler; //package com.zhiwei.crawler;
//
import java.util.ArrayList; //import java.util.ArrayList;
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
//
import org.junit.Test; //import org.junit.Test;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil; //import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Miaopai; //import com.zhiwei.parse.Miaopai;
import com.zhiwei.tools.tools.ZhiWeiTools; //import com.zhiwei.tools.tools.ZhiWeiTools;
//
/**
 * Manual example that reads video URLs from a workbook, fetches each distinct URL
 * once via {@code Miaopai.getMiaopaiDataByURL} and exports the results.
 */
public class MiaopaiByUrlExample {

    /**
     * Skips URLs that were already seen, waits five seconds before each fetch, and
     * keeps every non-null result. A row that throws is reported and skipped.
     */
    @Test
    public void miaopaiByUrlTest() {
        PoiExcelUtil excel = PoiExcelUtil.getInstance();

        // NOTE(review): the export below uses this same path — confirm that overwriting the input is intended.
        Map<String,Object> sheet = excel.importExcel("D://crawlerdata/秒拍美食.xlsx", 0);
        List<Map<String,Object>> inputRows = (List<Map<String,Object>>) sheet.get("body");

        List<Map<String,Object>> videos = new ArrayList<Map<String,Object>>();
        List<String> seenUrls = new ArrayList<String>();

        for (Map<String,Object> row : inputRows) {
            String url = "";
            try {
                url = "" + row.get("url");
                if (!seenUrls.contains(url)) {
                    seenUrls.add(url);
                    ZhiWeiTools.sleep(5000);
                    System.out.println(url);

                    Map<String,Object> video = Miaopai.getMiaopaiDataByURL(url, null);
                    if (video != null) {
                        videos.add(video);
                    }
                }
            } catch (Exception e) {
                System.out.println(url);
                e.printStackTrace();
            }
        }

        String[] columns = {"time", "source", "title", "url", "video_count"};
        List<String> headList = new ArrayList<String>();
        for (String column : columns) {
            headList.add(column);
        }
        excel.exportExcel("D://crawlerdata/秒拍美食.xlsx", "asd", headList, videos);
    }
}
package com.zhiwei.crawler; //package com.zhiwei.crawler;
//
import java.util.ArrayList; //import java.util.ArrayList;
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
//
import org.junit.Test; //import org.junit.Test;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil; //import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.PearVideo; //import com.zhiwei.parse.PearVideo;
//
/**
 * Manual example that runs {@code PearVideo.getPearVideoData} for one search phrase
 * and exports whatever it returns.
 */
public class PearVideoByWordExample {

    /** Fetches the rows for the fixed phrase and passes them to {@code exportExcel}. */
    @Test
    public void pearVideoByWordTest() {
        String keyword = "大宝 甲醛";
        List<Map<String,Object>> videos = PearVideo.getPearVideoData(keyword, null);

        String[] columns = {"time", "title", "content", "url", "like", "source"};
        List<String> headList = new ArrayList<String>();
        for (String column : columns) {
            headList.add(column);
        }

        PoiExcelUtil excel = PoiExcelUtil.getInstance();
        excel.exportExcel("D://crawlerdata/梨视频关键词采集.xlsx", "梨视频采集结果", headList, videos);
    }
}
package com.zhiwei.crawler; //package com.zhiwei.crawler;
//
import java.util.ArrayList; //import java.util.ArrayList;
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
//
import org.junit.Test; //import org.junit.Test;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil; //import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.QQKB; //import com.zhiwei.parse.QQKB;
import com.zhiwei.tools.tools.ZhiWeiTools; //import com.zhiwei.tools.tools.ZhiWeiTools;
//
/**
 * Manual example that reads account rows from a workbook, calls
 * {@code QQKB.getQQAccountData} once per row and exports the collected articles.
 */
public class QQAccountExample {

    /**
     * For every input row: print the part of the "帐号链接" value after the first "=",
     * fetch articles, copy the row's "呢称" and "帐号链接" onto each article, then wait
     * five seconds before the next row.
     */
    @Test
    public void qqAccountTest() {
        PoiExcelUtil excel = PoiExcelUtil.getInstance();
        Map<String,Object> sheet = excel.importExcel("D://crawlerdata//自媒体/天天快报历史文章采集.xlsx", 0);
        List<Map<String,Object>> accounts = (List<Map<String, Object>>) sheet.get("body");

        // NOTE(review): session cookie committed in source — consider loading it from configuration.
        String cookie = "phone_id=;%20phone_token=;%20luin=o0497332654;%20lskey=000300006218e8444698ebbad28deed8243ef28e87f247b133bf5d19f160bce91d4e3e06ec2003cd4f92aff0;%20uin=o0497332654;%20skey=MQL8ScQBxu;%20sigA2=2EBEFFCC5C22FD27B32E6F21C76CA4A2AD8BF5D626B629A38C50923810CCD19C574CD093CFAD29C845084C8CD3E04B57DD24A69E4418E060C899696A728FB3B8B14C577F1FF93340;%20openid=oijc7uLZNVp85r-MpFBYxwxzdEkg;%20appid=wxe90c9765ad00e2cd;%20access_token=5_IWbsKfygpJ0lnbUnnFnfwmSQ3EqlyzseC9-qGEFY7Tkr0Ypk5vsnSwOaMC-IGxsBeY2K7knHrYstj_5dZpisJd5nihvLNQvCdsFhFwZQcT8;%20refresh_token=5_IWbsKfygpJ0lnbUnnFnfwgDT4pA9HEbY-wuqnqIHWf9AzdmIueZFuzHYfnZPuSNEc0ZjDuHXrtSrRBMMD-7kgj06iF0NdAOi-KRj6-mrmlA;%20unionid=onCs1uNNpjMXeYIHAhacGypamEBk;%20logintype=0";

        List<Map<String,Object>> articles = new ArrayList<Map<String,Object>>();
        for (Map<String,Object> account : accounts) {
            String accountLink = "" + account.get("帐号链接");
            // A variant splitting on "chlid=" was left commented out in the source.
            System.out.println(accountLink.split("=")[1]);

            // NOTE(review): the id passed here is a fixed literal, not the value parsed
            // from the row above, so every row requests the same account — confirm intent.
            List<Map<String,Object>> fetched = QQKB.getQQAccountData("5001789", cookie, null);
            if (fetched != null) {
                for (Map<String,Object> article : fetched) {
                    article.put("name", account.get("呢称"));
                    article.put("主页地址", account.get("帐号链接"));
                    articles.add(article);
                }
            }
            System.out.println("采集到的历史文章数总和=============" + articles.size());
            ZhiWeiTools.sleep(5000);
        }
        System.out.println(accounts.size());

        String[] columns = {"name", "主页地址", "title", "time", "content", "url", "commentid"};
        List<String> headList = new ArrayList<String>();
        for (String column : columns) {
            headList.add(column);
        }
        excel.exportExcel("D://crawlerdata//自媒体/天天快报采集-科技编年史.xlsx", "asd", headList, articles);
    }
}
package com.zhiwei.crawler; //package com.zhiwei.crawler;
//
import org.junit.Test; //import org.junit.Test;
//
import com.zhiwei.parse.QQKB; //import com.zhiwei.parse.QQKB;
//
/**
 * Placeholder example for a keyword search through {@code QQKB}. The actual call is
 * disabled, so the test currently only declares its inputs.
 */
public class QQKBByWordExample {

    /** Declares the search keyword and cookie; performs no request. */
    @Test
    public void qqkbByWordTest() {
        // NOTE(review): session cookie committed in source — consider loading it from configuration.
        final String cookie = "phone_id=;%20phone_token=;%20luin=o0497332654;%20lskey=00030000a7aa9ad0c4636295c0f484bf54820005db056be0651296f399e092bc56e4910bbedaf413ced3fb8a;%20uin=o0497332654;%20skey=MSF4MCe62n;%20sigA2=71400747BF93A51F5F103AC6180C723E940637B87127C104AA325F5709FD378BEB014D649D6B653031F6B2E0962C0F8D9C06807A1CE509983C1C70B641606FEA84B90777926863E4;%20openid=oijc7uLZNVp85r-MpFBYxwxzdEkg;%20appid=wxe90c9765ad00e2cd;%20access_token=5_IWbsKfygpJ0lnbUnnFnfwsY93PCkKQ3oQ4nAAU9-VNf0eKb4SlfVLb-YW7bRaEL2jS3XDtJGO8m9DoDrFZ4bFyv96Pb5ZfvVIyehq-jQ65o;%20refresh_token=5_IWbsKfygpJ0lnbUnnFnfwgDT4pA9HEbY-wuqnqIHWf9AzdmIueZFuzHYfnZPuSNEc0ZjDuHXrtSrRBMMD-7kgj06iF0NdAOi-KRj6-mrmlA;%20unionid=onCs1uNNpjMXeYIHAhacGypamEBk;%20logintype=0";
        final String word = "麦当劳";

        // Disabled: the next page cannot be found.
        // QQKB.getQQKBByWordData(word, cookie);
    }
}
package com.zhiwei.crawler; //package com.zhiwei.crawler;
//
import org.junit.Test; //import org.junit.Test;
//
import com.zhiwei.parse.QQKB; //import com.zhiwei.parse.QQKB;
//
/**
 * Manual example that prints the value returned by {@code QQKB.getCommentCount}
 * for a single article URL.
 */
public class QQKBCommentCountExample {

    /** Fetches the comment count for one fixed URL and prints it to stdout. */
    @Test
    public void qqkbCommentCountTest() {
        // The unused empty "cookie" local was removed; the call never received it.
        String url = "https://kuaibao.qq.com/s/20190305A16P6L00";

        int commentCount = QQKB.getCommentCount(url, null);
        System.out.println(commentCount);
    }
}
package com.zhiwei.crawler; //package com.zhiwei.crawler;
//
import java.util.ArrayList; //import java.util.ArrayList;
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
//
import org.junit.Test; //import org.junit.Test;
//
import com.zhiwei.common.config.GroupType; //import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory; //import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.excelpoi.excel.PoiExcelUtil; //import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.QQKB; //import com.zhiwei.parse.QQKB;
import com.zhiwei.tools.tools.ZhiWeiTools; //import com.zhiwei.tools.tools.ZhiWeiTools;
//
/**
 * Manual example that reads article URLs from a workbook, fetches their comments via
 * {@code QQKB.getQQKBCommentData} and exports the collected comments.
 */
public class QQKBCommentExample {

    /**
     * Works for both Tiantian Kuaibao and Tencent News; no cookie is required.
     *
     * Initialises the proxy factory, then for every input row reads its "地址" value,
     * waits two seconds, fetches the comments and keeps every non-null result.
     */
    @Test
    public void qqkbCommentTest() {
        // Sample article URLs left in the source (the unused "url" local was removed):
        //   https://kuaibao.qq.com/s/20181122A11WQB00
        //   https://kuaibao.qq.com/s/20180423A1PI7400?refer=kb_news
        //   https://kuaibao.qq.com/s/20180423A0L60800?refer=kb_news
        ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",
                GroupType.PROVIDER);

        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        // NOTE(review): the export below appears to point at this same workbook — confirm that overwriting the input is intended.
        Map<String,Object> map = poi.importExcel("D://crawlerdata//自媒体/快报评论采集.xlsx", 0);
        List<Map<String,Object>> list = (List<Map<String,Object>>) map.get("body");

        List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
        for (Map<String,Object> m : list) {
            String u = m.get("地址").toString();
            System.out.println(u);
            ZhiWeiTools.sleep(2000);
            List<Map<String,Object>> dataList = QQKB.getQQKBCommentData(u, null);
            if (dataList != null) {
                bodyList.addAll(dataList);
            }
        }

        List<String> headList = new ArrayList<String>();
        headList.add("reply_id");  // id
        headList.add("like");      // like count
        headList.add("name");      // nickname
        headList.add("reply_num"); // reply count
        headList.add("time");      // time
        headList.add("content");   // content
        System.out.println(bodyList.size());
        poi.exportExcel("D:\\crawlerdata\\自媒体\\快报评论采集.xlsx", "sada", headList, bodyList);
    }
}
package com.zhiwei.crawler; //package com.zhiwei.crawler;
//
import java.io.IOException; //import java.io.IOException;
import java.util.ArrayList; //import java.util.ArrayList;
import java.util.HashMap; //import java.util.HashMap;
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
//
import com.alibaba.fastjson.JSONArray; //import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; //import com.alibaba.fastjson.JSONObject;
import com.zhiwei.tools.httpclient.HeaderTool; //import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.httpclient.HttpClientTemplateOK; //import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
//
/** ///**
* @ClassName: SinaCommentListTest // * @ClassName: SinaCommentListTest
* @Description: TODO(新浪新闻评论抓取) // * @Description: TODO(新浪新闻评论抓取)
* @author hero // * @author hero
* @date 2017年8月10日 下午6:08:41 // * @date 2017年8月10日 下午6:08:41
*/ // */
public class SinaCommentListTest { //public class SinaCommentListTest {
//
//
public static void sinaCommentListTest(String url) { // public static void sinaCommentListTest(String url) {
Map<String,String> headerMap = HeaderTool.getCommonHead(); // Map<String,String> headerMap = HeaderTool.getCommonHead();
String newsId = getCommentId(url).split("=====")[1]; // String newsId = getCommentId(url).split("=====")[1];
String channel = getCommentId(url).split("=====")[0]; // String channel = getCommentId(url).split("=====")[0];
int page = 1; // int page = 1;
try { // try {
String comment_url = "http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel="+channel+"&newsid="+newsId+"&group=0&compress=0&ie=gbk&oe=gbk&page="+page+"&page_size=20&jsvar=loader_1525576000752_30189682"; // String comment_url = "http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel="+channel+"&newsid="+newsId+"&group=0&compress=0&ie=gbk&oe=gbk&page="+page+"&page_size=20&jsvar=loader_1525576000752_30189682";
System.out.println("commenturl========"+comment_url); // System.out.println("commenturl========"+comment_url);
String html = HttpClientTemplateOK.get(comment_url, null, headerMap); // String html = HttpClientTemplateOK.get(comment_url, null, headerMap);
if(html!=null){ // if(html!=null){
html = html.substring(html.indexOf("=",0)+1,html.length()); // html = html.substring(html.indexOf("=",0)+1,html.length());
System.out.println(html); // System.out.println(html);
JSONObject data = JSONObject.parseObject(html).getJSONObject("result"); // JSONObject data = JSONObject.parseObject(html).getJSONObject("result");
JSONArray jsonArray = data.getJSONArray("cmntlist"); // JSONArray jsonArray = data.getJSONArray("cmntlist");
for(int a = 0;a<jsonArray.size();a++){ // for(int a = 0;a<jsonArray.size();a++){
Map<String,Object> doc = new HashMap<String, Object>(); // Map<String,Object> doc = new HashMap<String, Object>();
JSONObject json = jsonArray.getJSONObject(a); // JSONObject json = jsonArray.getJSONObject(a);
doc.put("_id", json.getString("mid")); // doc.put("_id", json.getString("mid"));
doc.put("content", json.getString("content")); // doc.put("content", json.getString("content"));
doc.put("area", json.getString("area")); // doc.put("area", json.getString("area"));
doc.put("nick", json.getString("nick")); // doc.put("nick", json.getString("nick"));
doc.put("time", json.getString("time")); // doc.put("time", json.getString("time"));
doc.put("agree", json.getInteger("agree")); // doc.put("agree", json.getInteger("agree"));
doc.put("against", json.getInteger("against")); // doc.put("against", json.getInteger("against"));
doc.put("vote", json.getInteger("vote")); // doc.put("vote", json.getInteger("vote"));
doc.put("fromUrl", url); // doc.put("fromUrl", url);
System.out.println("doc==========="+doc); // System.out.println("doc==========="+doc);
//
} // }
}else{ // }else{
System.out.println("--------------"); // System.out.println("--------------");
} // }
//
} catch (Exception e) { // } catch (Exception e) {
e.printStackTrace(); // e.printStackTrace();
} // }
} // }
//
//
//
/**
 * Downloads the page at {@code url} and extracts the {@code newsid} and
 * {@code channel} values that follow the literal markers {@code newsid: '} and
 * {@code channel: '} in its text.
 *
 * @param url address of the page to download
 * @return {@code channel + "=====" + newsid} when both values are parsed;
 *         {@code null} when the response is null, lacks the text "newsid", or the
 *         request raises an {@link IOException}; after any other exception, whatever
 *         had been parsed into {@code newsid} so far (possibly {@code null})
 */
public static String getCommentId(String url){
    String newsid = null;
    Map<String,String> requestHeaders = HeaderTool.getCommonHead();
    System.out.println(url);
    try {
        String page = HttpClientTemplateOK.get(url, null, requestHeaders);
        boolean hasMarker = page != null && page.contains("newsid");
        if(hasMarker){
            // The value is the text between the marker and the next "',".
            String[] afterNewsid = page.split("newsid: '");
            newsid = afterNewsid[1].split("',")[0];
            String[] afterChannel = page.split("channel: '");
            String channel = afterChannel[1].split("',")[0];
            System.out.println(channel+"============"+newsid);
            return channel+"====="+newsid;
        }
    } catch (IOException e) {
        return null;
    } catch (Exception e) {
        e.printStackTrace();
    }
    return newsid;
}
//
} //}
package com.zhiwei.crawler; //package com.zhiwei.crawler;
//
import java.util.ArrayList; //import java.util.ArrayList;
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
//
import org.junit.Test; //import org.junit.Test;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil; //import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Soku; //import com.zhiwei.parse.Soku;
//
/**
 * Example that calls {@code Soku.getSoKuByWordData} for a set of keywords and type
 * ids and exports the merged rows to an Excel workbook.
 */
public class SoKuByWordExample {

    /**
     * Queries every keyword/type combination, collects all non-empty results and
     * writes them out through {@code PoiExcelUtil}.
     */
    @Test
    public void sokuByWordTest() {
        String[] keywords = "美食,味道,吃,试吃,美味,好吃".split(",");
        String[] typeIds = "174,103,176".split(",");
        List<Map<String,Object>> rows = new ArrayList<Map<String,Object>>();
        for (String keyword : keywords) {
            for (String typeId : typeIds) {
                List<Map<String,Object>> found = Soku.getSoKuByWordData(keyword, typeId, null);
                if (found == null || found.size() <= 0) {
                    continue;
                }
                rows.addAll(found);
            }
        }
        PoiExcelUtil excel = PoiExcelUtil.getInstance();
        List<String> columns = new ArrayList<String>();
        for (String column : new String[] {"title", "time", "play_count", "url", "source"}) {
            columns.add(column);
        }
        excel.exportExcel("D://crawlerdata/优酷采集.xlsx", "优酷数据", columns, rows);
    }

}
package com.zhiwei.crawler; //package com.zhiwei.crawler;
//
import java.util.ArrayList; //import java.util.ArrayList;
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
//
import org.junit.Test; //import org.junit.Test;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil; //import com.zhiwei.common.config.GroupType;
import com.zhiwei.parse.Souhu; //import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Souhu;
/**
 * Example that fetches rows from {@code Souhu.getSouHuAccountData} for one account
 * and exports them to an Excel workbook.
 */
public class SouhuAccountExample {

    // Sample search URL:
    // http://search.sohu.com/?keyword=%E8%99%8E%E5%97%85&source=article&queryType=edit&ie=utf8

    /**
     * Requests the account data, prints how many rows came back and writes them to
     * the workbook with a fixed column list.
     */
    @Test
    public void souhuAccountTest() {
        List<Map<String,Object>> articles = Souhu.getSouHuAccountData(
                "c29odXptdHNmbjZ0cnRAc29odS5jb20=", "2018-05-01 00:00:00", false, null);
        System.out.println(articles.size());

        List<String> columns = new ArrayList<String>();
        String[] columnNames = {
            "title", "time", "content", "url", "comment", "tags", "newsid", "source", "newsPv"
        };
        for (String columnName : columnNames) {
            columns.add(columnName);
        }

        PoiExcelUtil excel = PoiExcelUtil.getInstance();
        excel.exportExcel("D:\\crawlerdata\\搜狐号历史文章-乔.xlsx", "乔", columns, articles);
    }

}
// }
//
//}
package com.zhiwei.crawler; //package com.zhiwei.crawler;
//
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
//
import org.junit.Test; //import org.junit.Test;
//
import com.zhiwei.common.config.GroupType; //import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory; //import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder; //import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.excelpoi.excel.PoiExcelUtil; //import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Souhu; //import com.zhiwei.parse.Souhu;
//
/**
 * Example that reads article URLs from an Excel sheet, looks up the comment count
 * and read number for each one, and writes the enriched rows back to the workbook.
 */
public class SouhuCommentCountExample {

    /**
     * For every imported row, queries {@code Souhu.getSouhuCommentCount} and
     * {@code Souhu.getSohuReadNum} with the row's own "url" value and stores the
     * results under "count" and "redNum". Rows whose lookup throws are logged and
     * left unchanged.
     */
    @SuppressWarnings("unchecked")
    @Test
    public void souhuCommentCountTest() {
        ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",
                GroupType.PROVIDER);
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        Map<String,Object> map = poi.importExcel("D://crawlerdata//自媒体//搜狐评论采集.xlsx", 0);
        List<Map<String,Object>> list = (List<Map<String,Object>>)map.get("body");
        List<String> headList = (List<String>) map.get("head");
        for(Map<String,Object> row : list) {
            String url = "";
            try {
                // Use the URL from the sheet; the previous hard-coded debug URL made
                // every row report the same article's numbers.
                url = row.get("url")+"";
                System.out.println(url);
                int commentCount = Souhu.getSouhuCommentCount(url,ProxyHolder.NAT_PROXY);
                int readNum = Souhu.getSohuReadNum(url, ProxyHolder.NAT_PROXY);
                row.put("count", commentCount);
                row.put("redNum", readNum);
                System.out.println(row.toString());
            } catch (Exception e) {
                // Keep going with the remaining rows; report which URL failed.
                System.out.println(url);
                e.printStackTrace();
                continue;
            }
        }
        headList.add("count");
        // NOTE(review): presumably exportExcel only writes the listed columns, so the
        // "redNum" value stored above needs its own header to be exported - confirm.
        headList.add("redNum");
        poi.exportExcel("D://crawlerdata//自媒体//搜狐评论采集.xlsx", "sheet2", headList, list);
    }


}
package com.zhiwei.crawler; //package com.zhiwei.crawler;
//
import java.util.ArrayList; //import java.util.ArrayList;
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
//
import org.junit.Test; //import org.junit.Test;
//
import com.zhiwei.common.config.GroupType; //import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory; //import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.excelpoi.excel.PoiExcelUtil; //import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Fenghuang; //import com.zhiwei.parse.Fenghuang;
import com.zhiwei.parse.Souhu; //import com.zhiwei.parse.Souhu;
import com.zhiwei.tools.tools.ZhiWeiTools; //import com.zhiwei.tools.tools.ZhiWeiTools;
//
/**
 * Example that reads article URLs from an Excel sheet, collects the comment rows
 * returned by {@code Souhu.getSouhuCommentData} for each one, and exports them.
 */
public class SouhuCommentExample {

    /**
     * Gathers comments for every URL in the sheet. URLs that yield no comments
     * (null or empty result) are remembered and printed before the export so they
     * can be re-checked by hand.
     */
    @Test
    public void souhuCommentTest() {
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",
                GroupType.PROVIDER);
        Map<String,Object> map = poi.importExcel("D://crawlerdata//自媒体//搜狐评论采集.xlsx", 0);
        List<Map<String,Object>> list = (List<Map<String,Object>>)map.get("body");
        List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
        List<String> urlList = new ArrayList<String>();
        for(Map<String,Object> map1 : list) {
            String url = "";
            try {
                url = map1.get("url")+"";
                System.out.println(url);
                List<Map<String,Object>> dataList = Souhu.getSouhuCommentData(url,null);
                // Check for null before touching the list: previously size() was
                // called first, so a null result raised an NPE and the URL was never
                // recorded as empty.
                if(dataList == null || dataList.isEmpty()) {
                    urlList.add(url);
                } else {
                    bodyList.addAll(dataList);
                }
                ZhiWeiTools.sleep(100);
            } catch (Exception e) {
                // Keep going with the remaining URLs; report which one failed.
                System.out.println(url);
                e.printStackTrace();
                continue;
            }
        }
        List<String> headList = new ArrayList<String>();
        headList.add("nickname");
        headList.add("content");
        headList.add("user_id");
        headList.add("loaction");
        headList.add("support_count");
        headList.add("comment_id");
        headList.add("reply_id");
        headList.add("time");
        // List the URLs that produced no comments.
        for(String s : urlList) {
            System.out.println(s);
        }
        poi.exportExcel("D://crawlerdata//自媒体//搜狐评论采集.xlsx", "搜狐评论", headList, bodyList);
    }


}
package com.zhiwei.crawler; //package com.zhiwei.crawler;
//
import java.util.ArrayList; //import java.util.ArrayList;
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil; //import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.TXNews; //import com.zhiwei.parse.TXNews;
//
/**
 * Example that passes one keyword and a device id to {@code TXNews.getData} and
 * exports the returned rows to an Excel workbook.
 */
public class TXNewsByWordExample {

    /**
     * Runs the query and writes the result with a fixed column list.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        final String keyword = "唐嫣";
        final String deviceId = "6D33F35F-880D-42A6-A23F-881BEC6960EC";
        List<Map<String,Object>> rows = TXNews.getData(keyword, deviceId, null);

        PoiExcelUtil excel = PoiExcelUtil.getInstance();
        List<String> columns = new ArrayList<String>();
        for (String column : new String[] {"title", "time", "content", "url", "id", "source"}) {
            columns.add(column);
        }
        excel.exportExcel("D://crawlerdata/腾讯新闻-唐嫣-1.xlsx", "腾讯新闻数据", columns, rows);
    }

}
package com.zhiwei.crawler; //package com.zhiwei.crawler;
//
import java.io.UnsupportedEncodingException; //import java.io.UnsupportedEncodingException;
import java.net.URLEncoder; //import java.net.URLEncoder;
import java.util.regex.Matcher; //import java.util.regex.Matcher;
import java.util.regex.Pattern; //import java.util.regex.Pattern;
//
import org.junit.Test; //import org.junit.Test;
//
/**
 * Scratch class that prints the fifth "/"-separated segment of a fixed URL.
 */
public class Test1 {

    /**
     * Splits the sample URL on "/" and prints the element at index 4.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String[] segments = "https://view.inews.qq.com/a/NEW2018021000440002".split("/");
        System.out.println(segments[4]);
    }

}
package com.zhiwei.crawler; //package com.zhiwei.crawler;
//
import java.util.ArrayList; //import java.util.ArrayList;
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
//
import org.junit.Test; //import org.junit.Test;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil; //import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Wangyi; //import com.zhiwei.parse.Wangyi;
import com.zhiwei.tools.tools.ZhiWeiTools; //import com.zhiwei.tools.tools.ZhiWeiTools;
//
/**
 * Example that reads article links from an Excel sheet and prints the comment
 * count returned by {@code Wangyi.getWangyiCommentCount} for each one.
 */
public class WangyiCommentCountExample {

    /**
     * Derives the article id from each link (last path segment, with everything
     * from ".ht" onwards removed), queries its comment count and prints it.
     */
    @Test
    public void wangyiCommentCountTest() {
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        String path = "D:\\crawlerdata\\自媒体\\网易评论采集-1.xlsx";
        Map<String,Object> map = poi.importExcel(path, 0);

        List<Map<String,Object>> list = (List<Map<String, Object>>) map.get("body");
        List<String> urlList = new ArrayList<String>();
        for(Map<String,Object> u : list) {
            String url = u.get("链接")+"";
            urlList.add(url);
        }

        // NOTE(review): nothing is ever added to bodyList, so the export below
        // receives an empty body - confirm whether the counts should be stored.
        List<Map<String,Object>> bodyList = new ArrayList<>();
        for(String url : urlList) {
            // Use the link from the sheet; the previous hard-coded debug URL made
            // every iteration query the same article.
            String[] segments = url.split("/");
            // Escape the dot: split() takes a regex, and an unescaped ".ht" also
            // matches any character followed by "ht" inside the id itself.
            String id = segments[segments.length-1].split("\\.ht")[0];
            System.out.println(id);
            int lists = Wangyi.getWangyiCommentCount(id, null);
            System.out.println(lists);
            ZhiWeiTools.sleep(3000);
        }
        List<String> headList = new ArrayList<String>();
        headList.add("content");
        headList.add("id");
        headList.add("time");
        headList.add("name");
        headList.add("like");
        headList.add("unlike");
        headList.add("from_url");

        poi.exportExcel(path, "评论数据", headList, bodyList);

    }


}
package com.zhiwei.crawler; //package com.zhiwei.crawler;
//
import java.util.ArrayList; //import java.util.ArrayList;
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
//
import org.junit.Test; //import org.junit.Test;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil; //import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Wangyi; //import com.zhiwei.parse.Wangyi;
import com.zhiwei.tools.tools.ZhiWeiTools; //import com.zhiwei.tools.tools.ZhiWeiTools;
//
/**
 * Example that reads article links from an Excel sheet, collects the comment rows
 * returned by {@code Wangyi.getWangyiCommentData} for each one, and exports them.
 */
public class WangyiCommentExample {

    /**
     * Derives the article id from each link (last path segment, with everything
     * from ".ht" onwards removed), fetches its comments, tags every comment with
     * the originating link under "from_url" and exports the combined rows.
     *
     * If something goes wrong, the data may contain duplicates; treat the id as
     * authoritative.
     */
    @Test
    public void wangyiCommentTest() {
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        String path = "D:\\crawlerdata\\自媒体\\网易评论采集-1.xlsx";
        Map<String,Object> map = poi.importExcel(path, 0);

        List<Map<String,Object>> list = (List<Map<String, Object>>) map.get("body");
        List<String> urlList = new ArrayList<String>();
        for(Map<String,Object> u : list) {
            String url = u.get("链接")+"";
            urlList.add(url);
        }

        List<Map<String,Object>> bodyList = new ArrayList<>();
        for(String url : urlList) {
            String[] segments = url.split("/");
            // Escape the dot: split() takes a regex, and an unescaped ".ht" also
            // matches any character followed by "ht" inside the id itself.
            String id = segments[segments.length-1].split("\\.ht")[0];
            System.out.println(id);
            List<Map<String,Object>> lists = Wangyi.getWangyiCommentData(id,null);
            // Guard the size lookup: previously size() was called before the null
            // check, so a null result aborted the whole run with an NPE.
            int found = lists == null ? 0 : lists.size();
            System.out.println(url+"====="+found);
            if(lists != null) {
                for(Map<String,Object> m : lists) {
                    m.put("from_url", url);
                    bodyList.add(m);
                }
            }
            ZhiWeiTools.sleep(3000);
        }
        List<String> headList = new ArrayList<String>();
        headList.add("content");
        headList.add("id");
        headList.add("time");
        headList.add("name");
        headList.add("like");
        headList.add("unlike");
        headList.add("from_url");

        poi.exportExcel(path, "评论数据", headList, bodyList);

    }

}
package com.zhiwei.crawler; //package com.zhiwei.crawler;
//
import java.util.ArrayList; //import java.util.ArrayList;
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil; //import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Wangyi; //import com.zhiwei.parse.Wangyi;
//
/**
 * Example that passes one article URL and a start time to
 * {@code Wangyi.getHistoryData} and exports the returned rows to an Excel workbook.
 */
public class WangyiHistoryExample {

    /**
     * Runs the query and writes the result with a fixed column list.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        final String articleUrl = "http://dy.163.com/v2/article/detail/DPLAOP1605198CJN.html";
        List<Map<String,Object>> rows = Wangyi.getHistoryData(articleUrl, null, "2018-05-01 00:00:00");

        List<String> columns = new ArrayList<String>();
        for (String column : new String[] {"title", "time", "content", "source", "url"}) {
            columns.add(column);
        }

        PoiExcelUtil excel = PoiExcelUtil.getInstance();
        excel.exportExcel("D://crawlerdata//自媒体/网易-财联社.xlsx", "财联社", columns, rows);
    }

}
package com.zhiwei.crawler; //package com.zhiwei.crawler;
//
import java.util.ArrayList; //import java.util.ArrayList;
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil; //import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Xiaomi; //import com.zhiwei.parse.Xiaomi;
//
/**
 * Example that calls {@code Xiaomi.getXiaomiByWordData} for several keyword phrases
 * and exports the merged rows to an Excel workbook.
 */
public class XiaomiShequByWordExample {

    /**
     * Queries each comma-separated phrase, collects all non-empty results and
     * writes them out with a fixed column list.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String[] phrases =
                "小米 电饭煲 锅体开裂,小米 电饭煲 开裂,小米 电饭煲 裂缝,小米 电饭煲 烫伤,小米 电饭煲 变形".split(",");
        List<Map<String,Object>> rows = new ArrayList<Map<String,Object>>();
        for (String phrase : phrases) {
            List<Map<String,Object>> found = Xiaomi.getXiaomiByWordData(phrase, null);
            if (found == null || found.size() <= 0) {
                continue;
            }
            rows.addAll(found);
        }

        List<String> columns = new ArrayList<String>();
        for (String column : new String[] {"title", "time", "source", "url", "content"}) {
            columns.add(column);
        }

        PoiExcelUtil excel = PoiExcelUtil.getInstance();
        excel.exportExcel("D:\\crawlerdata\\小米-关键词电饭煲相关.xlsx", "小米社区采集", columns, rows);
    }

}
package com.zhiwei.crawler; //package com.zhiwei.crawler;
//
import java.util.ArrayList; //import java.util.ArrayList;
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
//
import org.junit.Test; //import org.junit.Test;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil; //import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.XiGua; //import com.zhiwei.parse.XiGua;
//
/**
 * Example that reads account home-page links from an Excel sheet, collects the
 * rows returned by {@code XiGua.getXiguaAccountData} for each one, and exports them.
 */
public class XiguaAccountExample {

    /**
     * For every sheet row whose "主页" value is longer than five characters, fetches
     * the account data from the given start time onwards and appends any non-empty
     * result to the export.
     */
    @Test
    public void xiguaAccountTest() {
        String path = "D:\\crawlerdata\\西瓜视频采集12.28.xlsx";
        // Start time passed to the account query.
        String startTime = "2017-01-01 00:00:00";
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
        Map<String,Object> map = poi.importExcel(path, 0);
        List<Map<String,Object>> lists = (List<Map<String, Object>>) map.get("body");
        for(Map<String,Object> map1 : lists ) {
            String url = map1.get("主页")+"";
            // A missing cell becomes the 4-character string "null", which the
            // length check filters out.
            if(url != null && url.length() > 5) {
                List<Map<String,Object>> lists1 = XiGua.getXiguaAccountData(url,startTime,null);
                // Test the fetched list itself; the guard previously checked the
                // size of the sheet rows (lists) by mistake.
                if(lists1 != null && lists1.size() > 0) {
                    bodyList.addAll(lists1);
                }
            }
        }
        List<String> headList = new ArrayList<String>();
        headList.add("title");
        headList.add("comments_count");
        headList.add("time");
        headList.add("content");
        headList.add("url");
        headList.add("video_watch_count");
        headList.add("source");
        poi.exportExcel(path, "数据采集结果", headList, bodyList);
    }


}
package com.zhiwei.crawler; //package com.zhiwei.crawler;
//
import java.util.ArrayList; //import java.util.ArrayList;
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
//
import org.junit.Test; //import org.junit.Test;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil; //import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.XiGua; //import com.zhiwei.parse.XiGua;
import com.zhiwei.tools.tools.ZhiWeiTools; //import com.zhiwei.tools.tools.ZhiWeiTools;
//
/**
 * Example that calls {@code XiGua.getXiguaVideoByWordData} for several keywords and
 * exports the merged rows to an Excel workbook.
 */
public class XiguaByWordExample {

    /**
     * Queries each keyword in turn, pausing via {@code ZhiWeiTools.sleep(5000)}
     * after every query and printing the running total, then writes all collected
     * rows with a fixed column list.
     */
    @Test
    public void XiguaByWordTest() {
        String[] keywords = "美食,味道,吃,试吃,美味,好吃".split(",");
        List<Map<String,Object>> rows = new ArrayList<Map<String,Object>>();
        for (String keyword : keywords) {
            List<Map<String,Object>> found = XiGua.getXiguaVideoByWordData(keyword, null);
            boolean hasRows = found != null && found.size() > 0;
            if (hasRows) {
                rows.addAll(found);
            }
            ZhiWeiTools.sleep(5000);
            System.out.println("============总数" + rows.size());
        }

        PoiExcelUtil excel = PoiExcelUtil.getInstance();
        List<String> columns = new ArrayList<String>();
        String[] columnNames = {
            "title", "time", "content", "like", "unlike",
            "play_count", "source", "comment_count", "url"
        };
        for (String columnName : columnNames) {
            columns.add(columnName);
        }

        excel.exportExcel("D://crawlerdata/西瓜美食-1.xlsx", "西瓜好吃不", columns, rows);
    }

}
...@@ -35,7 +35,7 @@ public class YidainzixunByWordExample { ...@@ -35,7 +35,7 @@ public class YidainzixunByWordExample {
headList.add("time"); headList.add("time");
headList.add("url"); headList.add("url");
System.out.println(listAll.size()); System.out.println(listAll.size());
poi.exportExcel("D://crawlerdata/一点资讯-美食.xlsx", "asd", headList, listAll); poi.exportExcel("D://crawlerdata/一点资讯-软博会.xlsx", "asd", headList, listAll);
} }
......
package com.zhiwei.crawler; //package com.zhiwei.crawler;
//
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
//
import org.junit.Test; //import org.junit.Test;
//
import com.zhiwei.parse.Yidianzixun; //import com.zhiwei.parse.Yidianzixun;
//
/**
 * Example that prints the rows returned by
 * {@code Yidianzixun.getYidianzixunCommentData} for one article URL.
 */
public class YidianzixunCommentExample {

    /**
     * Fetches the comment rows, prints how many there are, then prints each row.
     */
    @Test
    public void yidianzixunCommentTest() {
        final String articleUrl = "http://www.yidianzixun.com/article/0ILHigvv";
        List<Map<String,Object>> comments = Yidianzixun.getYidianzixunCommentData(articleUrl, null);
        System.out.println(comments.size());
        for (int i = 0; i < comments.size(); i++) {
            System.out.println(comments.get(i).toString());
        }
    }

}
//package com.zhiwei.hsitory;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Baijia;
//import com.zhiwei.tools.tools.ZhiWeiTools;
//
//public class BaijiaAccountExample {
//
// @Test
// public void test3() {
// String path = "D://crawlerdata//自媒体/百家号采集.xlsx";
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// String startTime = "2018-05-01 00:00:00";
// Map<String,Object> map = poi.importExcel(path, 0);
// List<Map<String,Object>> list = (List<Map<String, Object>>) map.get("body");
// List<Map<String,Object>> bodyList = new ArrayList<>();
// for(Map<String,Object> m : list) {
// try {
// String app_id = m.get("id").toString();
// app_id = "1594158489045754";
// String cookie = "__cfduid=d847baca85b97d1967b3da02ebb345b831535524251; BAIDUID=C0F0F81EF770C5219AB9C178654135EC:FG=1; PSTM=1536376257; BIDUPSID=250CCE0442BEBCB3568D8EC515953434; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; locale=zh; delPer=0; H_PS_PSSID=1447_21117_20930; PSINO=5";
// List<Map<String,Object>> lists = Baijia.getBaijiaAccountByBaiduData(app_id, startTime,cookie, null);
// if(lists != null) {
// bodyList.addAll(lists);
// }
// break;
// } catch (Exception e) {
// }
// }
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("time");
// headList.add("source");
// headList.add("url");
// headList.add("content");
// headList.add("read_amount");
// poi.exportExcel("D://crawlerdata//历史文章采集/百家号-lxj-2.xlsx", "娱乐资本论", headList, bodyList);
// }
//
//}
//package com.zhiwei.hsitory;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.crawler.proxy.ProxyHolder;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Fenghuang;
//
//public class FenghuangAccountExample {
//
// @Test
// public void fenghuangAccountTest() {
// //所用时间长 1s1篇文章吧
// //https://api.3g.ifeng.com/client_search_subscribe?k=号外财经
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
// String id = "1165210";
// String[] ids = id.split(",");
// String startTime = "2010-05-01 00:00:00"; //可为空
// for(int i = 0;i < ids.length;i++) {
// try {
// List<Map<String,Object>> dataList = Fenghuang.getFenghuangAccountData(ids[i], startTime,ProxyHolder.NAT_HEAVY_PROXY);
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("time");
// headList.add("content");
// headList.add("source");
// headList.add("url");
// headList.add("id");
// poi.exportExcel("D://crawlerdata//历史文章采集/凤凰-三言财经.xlsx", ids[i], headList, dataList);
// } catch (Exception e) {
// continue;
// }
// }
// }
//
//}
package com.zhiwei.hsitory;
//package com.zhiwei.crawler;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Souhu;
//
//
//public class SouhuAccountExample {
//
// //http://search.sohu.com/?keyword=%E8%99%8E%E5%97%85&source=article&queryType=edit&ie=utf8
//
// @Test
// public void souhuAccountTest() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",GroupType.PROVIDER);
// List<Map<String,Object>> lists = Souhu.getSouHuAccountData("99938933","浅黑科技","2018-05-01 00:00:00",false,null);
// System.out.println(lists.size());
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("time");
// headList.add("content");
// headList.add("url");
// headList.add("comment");
// headList.add("tags");
// headList.add("newsid");
// headList.add("source");
// headList.add("newsPv");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("D:\\crawlerdata\\搜狐号历史文章-乔.xlsx", "乔", headList, lists);
// }
//
//}
//package com.zhiwei.hsitory;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.crawler.proxy.ProxyHolder;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.TXNews;
//
//public class TxNewsHostoryExample {
//
// public static void main(String[] args) {
//
//
// String url = "6839743";
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
// List<Map<String,Object>> list = TXNews.getTxNewsHistory(url, null,ProxyHolder.NAT_PROXY);
//
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("time");
// headList.add("content");
// headList.add("source");
// headList.add("url");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("D://crawlerdata//历史文章采集/腾讯网-三言财经-right.xlsx", "财联社", headList, list);
//
//
// }
//
//}
//package com.zhiwei.hsitory;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.crawler.proxy.ProxyHolder;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Wangyi;
//
//public class WangyiHistoryExample {
//
// public static void main(String[] args) {
//
// String url = "T1520579168852";
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
// List<Map<String,Object>> list = Wangyi.getWangyiClientHistory(url, ProxyHolder.NAT_PROXY, "2019-01-01 00:00:00");
//
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("time");
// headList.add("content");
// headList.add("source");
// headList.add("url");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("D://crawlerdata//历史文章采集/网易-三言财经.xlsx", "财联社", headList, list);
//
// }
//
//
//}
//package com.zhiwei.hsitory;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Xueqiu;
//
//public class XueqiuHostoryExample {
//
// public static void main(String[] args) {
//
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",GroupType.PROVIDER);
// String cookie = "_ga=GA1.2.2045733994.1547169202; device_id=5a986a59915983c3e2ef8074f80112ec; s=e618lxk3qw; __utmz=1.1547185990.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utma=1.2045733994.1547169202.1548122251.1553047746.3; aliyungf_tc=AQAAAJHA7Vrq7AYAgtgMPALb3ZCQP9o+; _gid=GA1.2.334283760.1554779038; Hm_lvt_1db88642e346389874251b5a1eded6e3=1553046552,1553046993,1553150890,1554779038; _gat=1; remember=1; remember.sig=K4F3faYzmVuqC0iXIERCQf55g2Y; xq_a_token=fed387c342aedea5c7883d1062ae6faf167975d8; xq_a_token.sig=j47ktDdYWr1FOgeL74U6yMCPhOY; xqat=fed387c342aedea5c7883d1062ae6faf167975d8; xqat.sig=oZPD4-6V_GPw-KsnR04L7vxf5oM; xq_r_token=6ffffd472dc300e2f89195a77b8e7064da45d78d; xq_r_token.sig=TPd7Y11kYPcQeOgzXVDApbRQauQ; xq_is_login=1; xq_is_login.sig=J3LxgPVPUzbBg3Kee_PquUfih7Q; u=5878436335; u.sig=j_g6RZ9GzzrgOfIsGHi9O9M1wvc; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1554791719";
// String userId = "7441422641";
//
// List<Map<String,Object>> dataList = Xueqiu.getXueqiuAccountData(userId, cookie, null);
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("time");
// headList.add("source");
// headList.add("content");
// headList.add("repostCount");
// headList.add("commentCount");
// headList.add("likeCount");
// headList.add("url");
// poi.exportExcel("D://crawlerdata//历史文章采集/雪球-三言财经.xlsx", "三言财经", headList, dataList);
//
// }
//
//}
package com.zhiwei.crawler; package com.zhiwei.hsitory;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
...@@ -6,6 +6,9 @@ import java.util.Map; ...@@ -6,6 +6,9 @@ import java.util.Map;
import org.junit.Test; import org.junit.Test;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.excelpoi.excel.PoiExcelUtil; import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Yidianzixun; import com.zhiwei.parse.Yidianzixun;
...@@ -14,10 +17,10 @@ public class YidianzixunAccountExample { ...@@ -14,10 +17,10 @@ public class YidianzixunAccountExample {
@Test @Test
public void yidianzixunAccountTest() { public void yidianzixunAccountTest() {
String channelid = "m23315"; ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",GroupType.PROVIDER);
String channelid = "m190159";
String startTime = "2007-01-01 00:00:00"; String startTime = "2007-01-01 00:00:00";
String cookie = "wuid=90742539356820; wuid_createAt=2019-01-10 11:45:41; UM_distinctid=16835dd9ba11cb-0ef8d17063d93f-671b197c-1fa400-16835dd9ba2243; JSESSIONID=174b8df350cb5400283abedf2c26076357b0b7af0581024f2e39e90532b4edc9; weather_auth=2; DID=node82eee6d174caf2d4; Hm_lvt_15fafbae2b9b11d280c79eff3b840e45=1551686450,1551686458; CNZZDATA1255169715=931563543-1547087800-%7C1551761063; captcha=s%3A6e56492ffceaf88d9f131fa79435464a.TLAhZ1cfwj0vBTjKTO9Qf5qc6QLuipitrEMZjiqm8BM; Hm_lpvt_15fafbae2b9b11d280c79eff3b840e45=1551764582; cn_1255169715_dplus=%7B%22distinct_id%22%3A%20%2216835dd9ba11cb-0ef8d17063d93f-671b197c-1fa400-16835dd9ba2243%22%2C%22sp%22%3A%20%7B%22%24_sessionid%22%3A%200%2C%22%24_sessionTime%22%3A%201547544080%2C%22%24dp%22%3A%200%2C%22%24_sessionPVTime%22%3A%201547544080%7D%2C%22%24_sessionid%22%3A%200%2C%22%24_sessionTime%22%3A%201551765057%2C%22%24dp%22%3A%200%2C%22%24_sessionPVTime%22%3A%201551765057%7D"; List<Map<String,Object>> dataList = Yidianzixun.getYidianzixunAccountData(channelid, startTime,ProxyHolder.NAT_HEAVY_PROXY,null);
List<Map<String,Object>> dataList = Yidianzixun.getYidianzixunAccountData(channelid, startTime,null,cookie);
PoiExcelUtil poi = PoiExcelUtil.getInstance(); PoiExcelUtil poi = PoiExcelUtil.getInstance();
List<String> headList = new ArrayList<String>(); List<String> headList = new ArrayList<String>();
headList.add("title"); headList.add("title");
...@@ -27,7 +30,7 @@ public class YidianzixunAccountExample { ...@@ -27,7 +30,7 @@ public class YidianzixunAccountExample {
headList.add("source"); headList.add("source");
headList.add("url"); headList.add("url");
headList.add("summary"); headList.add("summary");
poi.exportExcel("D://crawlerdata/一点资讯-m23315.xlsx", "虎嗅", headList, dataList); poi.exportExcel("D://crawlerdata//历史文章采集/一点资讯-新华社中国新三板.xlsx", "新华社中国新三板", headList, dataList);
} }
......
//package com.zhiwei.keyword; package com.zhiwei.keyword;
//
//import java.util.ArrayList; import java.util.ArrayList;
//import java.util.List; import java.util.List;
//import java.util.Map; import java.util.Map;
//
//import org.testng.annotations.Test; import org.junit.Test;
//
//import com.zhiwei.common.config.GroupType; import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory; import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil; import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Xueqiu; import com.zhiwei.parse.Xueqiu;
//
//public class XueqiuKeyWord { public class XueqiuKeyWord {
// @Test
// public void f() { @Test
//// ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER); public void f() {
// String word = "腾讯 股价|腾讯 市值|腾讯 估值|腾讯 投资|腾讯 财报"; ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
// String endTime = "2018-01-01 00:00:00"; String word = "软博会|软件博览会";
// String cookie = "_ga=GA1.2.590296572.1538275545; device_id=812aac42a8874c32ca97893a7b270684; s=ff12lgxbt1; __utma=1.590296572.1538275545.1538280279.1538280279.1; __utmz=1.1538280279.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); aliyungf_tc=AQAAAPtocWy01ggA6tbncwYt4OIDsaiG; xq_a_token=088c6ad5e275496d7c91b8b5b2ecb929bee15772; xq_a_token.sig=NcpAm7FRsAVoMGRMOLrLRveBT7U; xq_r_token=08131eb9a4f33b43b2340fd782f3776c9823d74a; xq_r_token.sig=AZ7au6ICJtTgQJVPybkOJs_Fr54; Hm_lvt_1db88642e346389874251b5a1eded6e3=1539599526,1539913171,1539913178,1541157360; _gid=GA1.2.1817290343.1541157360; u=721541157360383; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1541212289"; String endTime = "2018-01-01 00:00:00";
// String cookie = "aliyungf_tc=AQAAADi3G3I7GAEAgtgMPF/W3FeFauXk; device_id=3221abfd50f9f8be2abca9182c338c9e; Hm_lvt_1db88642e346389874251b5a1eded6e3=1561962809; s=d911zeymzo; remember=1; remember.sig=K4F3faYzmVuqC0iXIERCQf55g2Y; xq_a_token=b2a8d68c56272cca3b71e1bcaf04b744b16e9497; xq_a_token.sig=hj7OrhYid1GuAaF-AmLYrPuyFDk; xqat=b2a8d68c56272cca3b71e1bcaf04b744b16e9497; xqat.sig=gker8dcZJ2Ez4s58eIGPa5zCd0w; xq_r_token=b0ad7b059411acac8c9e2f0a40336804bb60d047; xq_r_token.sig=DwNSLIigE8xtwQKLygagcM28Dd0; xq_is_login=1; xq_is_login.sig=J3LxgPVPUzbBg3Kee_PquUfih7Q; u=1273068356; u.sig=UyqQW-br8hcsOyT6anFmS_SFpCs; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1561963129";
//
//
// String[] words = word.split("\\|");
// String[] words = word.split("\\|");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>(); PoiExcelUtil poi = PoiExcelUtil.getInstance();
// for(String w : words) { List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
// System.out.println(w); for(String w : words) {
// System.out.println(w);
// List<Map<String, Object>> dataList = Xueqiu.getData(w, endTime, null, cookie);
// System.out.println(w + " ---- " + dataList.size()); List<Map<String, Object>> dataList = Xueqiu.getData(w, endTime, null, cookie);
// bodyList.addAll(dataList); System.out.println(w + " ---- " + dataList.size());
// } bodyList.addAll(dataList);
// List<String> headList = new ArrayList<String>(); }
// headList.add("title"); List<String> headList = new ArrayList<String>();
// headList.add("time"); headList.add("title");
// headList.add("content"); headList.add("time");
// headList.add("uper"); headList.add("content");
// headList.add("url"); headList.add("uper");
// headList.add("likeCount"); headList.add("url");
// headList.add("replyCount"); headList.add("likeCount");
// poi.exportExcel("D:\\crawlerdata\\自媒体\\雪球网页采集-腾讯-relevance.xlsx", "马化腾", headList, bodyList); headList.add("replyCount");
// poi.exportExcel("D:\\crawlerdata\\自媒体\\雪球网页采集-软博会-relevance.xlsx", "马化腾", headList, bodyList);
// }
//} }
}
...@@ -4,7 +4,7 @@ import java.util.ArrayList; ...@@ -4,7 +4,7 @@ import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import org.testng.annotations.Test; import org.junit.Test;
import com.zhiwei.common.config.GroupType; import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory; import com.zhiwei.crawler.proxy.ProxyFactory;
...@@ -21,7 +21,7 @@ public class AiqiyiTest { ...@@ -21,7 +21,7 @@ public class AiqiyiTest {
List<String> wordList = WordReadFile.getWords("D://crawlerdata//关键词.txt"); List<String> wordList = WordReadFile.getWords("D://crawlerdata//关键词.txt");
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>(); List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
for(String w : wordList) { for(String w : wordList) {
List<Map<String,Object>> dataList = Aiqiyi.getAiqiyiByWordData(w,ProxyHolder.NAT_PROXY); List<Map<String,Object>> dataList = Aiqiyi.getAiqiyiByWordData(w,ProxyHolder.NAT_HEAVY_PROXY);
if(dataList != null && dataList.size() >= 1) { if(dataList != null && dataList.size() >= 1) {
bodyList.addAll(dataList); bodyList.addAll(dataList);
} }
...@@ -34,7 +34,7 @@ public class AiqiyiTest { ...@@ -34,7 +34,7 @@ public class AiqiyiTest {
headList.add("title"); headList.add("title");
headList.add("word"); headList.add("word");
PoiExcelUtil poi = PoiExcelUtil.getInstance(); PoiExcelUtil poi = PoiExcelUtil.getInstance();
poi.exportExcel("D://crawlerdata/爱奇艺关键词采集-txh-0320.xlsx", "数据", headList, bodyList); poi.exportExcel("D://crawlerdata//视频/爱奇艺关键词采集-毓婷-0716.xlsx", "数据", headList, bodyList);
......
...@@ -4,8 +4,10 @@ import java.util.ArrayList; ...@@ -4,8 +4,10 @@ import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import org.testng.annotations.Test; import org.junit.Test;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.excelpoi.excel.PoiExcelUtil; import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.BiliBili; import com.zhiwei.parse.BiliBili;
import com.zhiwei.util.WordReadFile; import com.zhiwei.util.WordReadFile;
...@@ -13,11 +15,12 @@ import com.zhiwei.util.WordReadFile; ...@@ -13,11 +15,12 @@ import com.zhiwei.util.WordReadFile;
public class BilibiliTest { public class BilibiliTest {
@Test @Test
public void f() { public void f() {
List<String> wordList = WordReadFile.getWords("D://crawlerdata//关键词.txt"); ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
List<String> wordList = WordReadFile.getWords("D://crawlerdata//关键词-1.txt");
List<Map<String, Object>> bodyList = new ArrayList<>(); List<Map<String, Object>> bodyList = new ArrayList<>();
String cookie = "LIVE_BUVID=AUTO8715300758995538; sid=kp5rluge; fts=1530161621; im_notify_type_35324319=0; buvid3=08ABE6AE-5061-4CE5-B34F-1A8AAB64DB3320712infoc; rpdid=olppsmkxmpdoskwoxiwww; finger=edc6ecda; stardustvideo=1; UM_distinctid=164fe68fb31996-01f161c3523abe-6114167a-1fa400-164fe68fb32274"; String cookie = "LIVE_BUVID=AUTO8715300758995538; sid=kp5rluge; fts=1530161621; im_notify_type_35324319=0; buvid3=08ABE6AE-5061-4CE5-B34F-1A8AAB64DB3320712infoc; rpdid=olppsmkxmpdoskwoxiwww; finger=edc6ecda; stardustvideo=1; UM_distinctid=164fe68fb31996-01f161c3523abe-6114167a-1fa400-164fe68fb32274";
for (String word : wordList) { for (String word : wordList) {
List<Map<String, Object>> dataList = BiliBili.getData(word, null, List<Map<String, Object>> dataList = BiliBili.getData(word, null, "2019-07-18 00:00:00",
cookie); cookie);
if (dataList != null) { if (dataList != null) {
System.out.println(word + " ----- " + dataList.size()); System.out.println(word + " ----- " + dataList.size());
...@@ -33,7 +36,7 @@ public class BilibiliTest { ...@@ -33,7 +36,7 @@ public class BilibiliTest {
headlist.add("url"); headlist.add("url");
headlist.add("word"); headlist.add("word");
PoiExcelUtil poi = PoiExcelUtil.getInstance(); PoiExcelUtil poi = PoiExcelUtil.getInstance();
poi.exportExcel("D://crawlerdata//bilibili关键词采集数据-txh-0320.xlsx", "B站数据", headlist, bodyList); poi.exportExcel("D://crawlerdata//视频//bilibili关键词采集数据-吃鸡否-0722.xlsx", "B站数据", headlist, bodyList);
} }
} }
...@@ -4,7 +4,7 @@ import java.util.ArrayList; ...@@ -4,7 +4,7 @@ import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import org.testng.annotations.Test; import org.junit.Test;
import com.zhiwei.common.config.GroupType; import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory; import com.zhiwei.crawler.proxy.ProxyFactory;
...@@ -18,11 +18,11 @@ public class QQTVTest { ...@@ -18,11 +18,11 @@ public class QQTVTest {
@Test @Test
public void f() { public void f() {
ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",GroupType.PROVIDER); ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",GroupType.PROVIDER);
String time = "2018-01-01 00:00:00"; String time = "2019-04-11 00:00:00";
List<String> wordList = WordReadFile.getWords("D://crawlerdata//关键词.txt"); List<String> wordList = WordReadFile.getWords("D://crawlerdata//关键词.txt");
List<Map<String, Object>> bodyList = new ArrayList<>(); List<Map<String, Object>> bodyList = new ArrayList<>();
for (String word : wordList) { for (String word : wordList) {
List<Map<String, Object>> dataList = QQTV.getData(word,time, ProxyHolder.NAT_PROXY); List<Map<String, Object>> dataList = QQTV.getData(word,time, ProxyHolder.NAT_HEAVY_PROXY);
if (dataList != null) { if (dataList != null) {
System.out.println(word + " ----- " + dataList.size()); System.out.println(word + " ----- " + dataList.size());
bodyList.addAll(dataList); bodyList.addAll(dataList);
...@@ -37,7 +37,7 @@ public class QQTVTest { ...@@ -37,7 +37,7 @@ public class QQTVTest {
headlist.add("url"); headlist.add("url");
headlist.add("word"); headlist.add("word");
PoiExcelUtil poi = PoiExcelUtil.getInstance(); PoiExcelUtil poi = PoiExcelUtil.getInstance();
poi.exportExcel("D://crawlerdata//腾讯视频关键词采集数据-txh-0320.xlsx", "腾讯视频数据", headlist, bodyList); poi.exportExcel("D://crawlerdata//视频//腾讯视频关键词采集数据-毓婷-0716.xlsx", "腾讯视频数据", headlist, bodyList);
......
...@@ -4,7 +4,7 @@ import java.util.ArrayList; ...@@ -4,7 +4,7 @@ import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import org.testng.annotations.Test; import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil; import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.shipin.SohuTV; import com.zhiwei.parse.shipin.SohuTV;
...@@ -33,7 +33,7 @@ public class SohuTVTest { ...@@ -33,7 +33,7 @@ public class SohuTVTest {
headlist.add("url"); headlist.add("url");
headlist.add("word"); headlist.add("word");
PoiExcelUtil poi = PoiExcelUtil.getInstance(); PoiExcelUtil poi = PoiExcelUtil.getInstance();
poi.exportExcel("D://crawlerdata//搜狐视频关键词采集数据-txh-0320.xlsx", "搜狐数据", headlist, bodyList); poi.exportExcel("D://crawlerdata//视频//搜狐视频关键词采集数据-毓婷-0716.xlsx", "搜狐数据", headlist, bodyList);
} }
} }
...@@ -4,7 +4,7 @@ import java.util.ArrayList; ...@@ -4,7 +4,7 @@ import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import org.testng.annotations.Test; import org.junit.Test;
import com.zhiwei.common.config.GroupType; import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory; import com.zhiwei.crawler.proxy.ProxyFactory;
...@@ -30,7 +30,7 @@ public class YoukuKeyWordTest { ...@@ -30,7 +30,7 @@ public class YoukuKeyWordTest {
headList.add("uper"); headList.add("uper");
headList.add("word"); headList.add("word");
PoiExcelUtil poi = PoiExcelUtil.getInstance(); PoiExcelUtil poi = PoiExcelUtil.getInstance();
poi.exportExcel("D://crawlerdata//优酷数据-txh-0320.xlsx", "数据", headList, bodyList); poi.exportExcel("D://crawlerdata//视频//优酷数据-毓婷-0716.xlsx", "数据", headList, bodyList);
} }
} }
//package com.zhiwei.user; package com.zhiwei.user;
//
//import java.util.ArrayList; import java.util.ArrayList;
//import java.util.Arrays; import java.util.Arrays;
//import java.util.List; import java.util.List;
//import java.util.Map; import java.util.Map;
//
//import org.testng.annotations.Test; import com.zhiwei.excelpoi.excel.PoiExcelUtil;
// import com.zhiwei.parse.Maimai;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Maimai; public class MaimaiTest {
//
//public class MaimaiTest { public static void main(String[] args) {
// @Test
// public void maimaiUserCrawler() { String path = "D:\\crawlerdata\\用户采集\\脉脉用户.xlsx";
// String path = "D:\\crawlerdata\\脉脉用户.xlsx"; String word = "巨量引擎|巨量 引擎|巨 量 引 擎|巨 量 引擎|巨量引 擎";
// String word = "美团|美团网|大众点评|美团点评|摩拜|猫眼|榛果|三快科技|三快在线"; String cookie = "_buuid=ba30f54f-57ed-4dd4-af5f-31cb08d2eacf; guid=HBoEGxgEGBscBBsZGlYHGBseHxoYGhIZHFYcGQQdGR8FQ1hLTEt5ChITBBIdHxkEGgQbHQVPR0VYQmkKA0VBSU9tCk9BQ0YKBmZnfmJhAgocGQQdGR8FXkNhSE99T0ZaWmsKAx4cfWV9ChEZBBwKfmQKWV1FTkRDfQIKGgQfBUtGRkNQRWc=; seid=s1553309971270; token=\"iUifMkpE9YKuFpz0yEj+jiWpUqM6IXvEvwWKzdd/jK8YgrWsT1/Ku7k9bkIRRYvG8CKuzcDfAvoCmBm7+jVysA==\"; uid=\"aa0Zx+VbwC41ceG8bxvIefAirs3A3wL6ApgZu/o1crA=\"; session=eyJ1IjoiMTczMzMzNTM2Iiwic2VjcmV0IjoidzdPUkhMelktVS1iN1Nsb3VxLXZQV2JvIiwic3RhdHVzIjp0cnVlLCJfZXhwaXJlIjoxNTUzMzk2Mzk0MzczLCJfbWF4QWdlIjo4NjQwMDAwMH0=; session.sig=zGIN7VMizkYf1v48nLqTGAG1k8U";
// String cookie = "_buuid=ba30f54f-57ed-4dd4-af5f-31cb08d2eacf; sessionid=lejfy3gdu5tf9x9zowxfhtq5o73dubc5; guid=GxsfBBgZGwQYGx4EGBkeVgcYGxkdHhMeHhkcVhwZBB0ZHwVDWEtMS3kKEhMEEh0fGQQaBBsdBU9HRVhCaQoDRUFJT20KT0FDRgoGZmd+YmECChwZBB0ZHwVeQ2FIT31PRlpaawoDHhx9ZX0KERkEHAp+ZApZXUVOREN9AgoaBB8FS0ZGQ1BFZw==; seid=s1550629286782; token=\"OCY36EFdeYzGytlQFyKRdM0DcXNdViYI02kT4QbUMpaSk/CqMXrqBOx8EFo5/fQU8CKuzcDfAvoCmBm7+jVysA==\"; uid=\"q1bNxxk8WW3MzjbCfKr/hfAirs3A3wL6ApgZu/o1crA=\"; session=eyJ1IjoiMTc2NjQ0NzY1Iiwic2VjcmV0IjoiLXFsV2c2Ym9feEJqOWxQbWdWTjcwWWg3Iiwic3RhdHVzIjp0cnVlLCJtaWQ0NTY4NzYwIjpmYWxzZSwiX2V4cGlyZSI6MTU1MDcxNTc2NzgwMSwiX21heEFnZSI6ODY0MDAwMDB9; session.sig=lVCTA7DLvo1K_r_bTjbQOH13Alc"; String[] words = word.split("\\|");
// String[] words = word.split("\\|"); List<Map<String,Object>> bodyList = new ArrayList<>();
// List<Map<String,Object>> bodyList = new ArrayList<>(); for(String w : words) {
// for(String w : words) { bodyList.addAll(Maimai.getUserList(w, cookie, null));
// bodyList.addAll(Maimai.getUserList(w, cookie, null)); }
// } List<String> headList = Arrays.asList("id","name","gender","url","rank","compos","city");
// List<String> headList = Arrays.asList("id","name","gender","url","rank","compos","city"); PoiExcelUtil poi = PoiExcelUtil.getInstance();
// PoiExcelUtil poi = PoiExcelUtil.getInstance(); poi.exportExcel(path, "result", headList, bodyList);
// poi.exportExcel(path, "result", headList, bodyList);
// } }
//}
}
package com.zhiwei.user; //package com.zhiwei.user;
//
import java.util.ArrayList; //import java.util.ArrayList;
import java.util.HashMap; //import java.util.HashMap;
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
//
import org.testng.annotations.Test; //import org.junit.Test;
//
import com.zhiwei.bean.QQKandianUser; //import com.zhiwei.bean.QQKandianUser;
import com.zhiwei.excelpoi.excel.PoiExcelUtil; //import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.QQKandian; //import com.zhiwei.parse.QQKandian;
import com.zhiwei.tools.tools.ZhiWeiTools; //import com.zhiwei.tools.tools.ZhiWeiTools;
//
/**
 * Example/test driver that looks up QQ Kandian users for a list of names read
 * from an Excel workbook and writes the collected user rows back out.
 *
 * The workbook location is a hard-coded Windows path, and the same path is
 * used for both the import and the export.
 */
public class QQkandianExample { //public class QQkandianExample {
//
/**
 * Reads names from the workbook, calls {@code QQKandian.getUser} once per
 * name with a pause between calls, flattens every returned
 * {@code QQKandianUser} into a row map, and exports the rows.
 */
@Test // @Test
public void f() { // public void f() {
QQKandian qqKandian = new QQKandian(); // QQKandian qqKandian = new QQKandian();
String path = "D:\\crawlerdata\\用户采集\\qq看点用户.xlsx"; // String path = "D:\\crawlerdata\\用户采集\\qq看点用户.xlsx";
PoiExcelUtil poi = PoiExcelUtil.getInstance(); // PoiExcelUtil poi = PoiExcelUtil.getInstance();
// NOTE(review): the second argument (0) is presumably the sheet index - confirm against PoiExcelUtil.
Map<String,Object> map = poi.importExcel(path, 0); // Map<String,Object> map = poi.importExcel(path, 0);
//
// Unchecked cast: assumes the "body" entry holds one map per spreadsheet row - TODO confirm.
List<Map<String,Object>> dataList = (List<Map<String, Object>>) map.get("body"); // List<Map<String,Object>> dataList = (List<Map<String, Object>>) map.get("body");
List<QQKandianUser> allList = new ArrayList<QQKandianUser>(); // List<QQKandianUser> allList = new ArrayList<QQKandianUser>();
for(Map<String,Object> m : dataList) { // for(Map<String,Object> m : dataList) {
// Concatenating with "" means a missing cell yields the literal string "null" rather than a null reference.
String name = m.get("渠道")+""; // String name = m.get("渠道")+"";
System.out.println(name); // System.out.println(name);
// NOTE(review): the meaning of the null second argument is not visible here - confirm against QQKandian.getUser.
List<QQKandianUser> qqKandianUsers = qqKandian.getUser(name, null); // List<QQKandianUser> qqKandianUsers = qqKandian.getUser(name, null);
if(qqKandianUsers != null) { // if(qqKandianUsers != null) {
System.out.println(qqKandianUsers.size()); // System.out.println(qqKandianUsers.size());
allList.addAll(qqKandianUsers); // allList.addAll(qqKandianUsers);
}else { // }else {
System.out.println( name + "--- null"); // System.out.println( name + "--- null");
} // }
// Pause between lookups; presumably 3000 ms to throttle requests - confirm the unit in ZhiWeiTools.sleep.
ZhiWeiTools.sleep(3000); // ZhiWeiTools.sleep(3000);
} // }
// Column headers for the export; they match the keys put into each row map below.
List<String> headList = new ArrayList<String>(); // List<String> headList = new ArrayList<String>();
headList.add("name"); // headList.add("name");
headList.add("url"); // headList.add("url");
headList.add("verity"); // headList.add("verity");
headList.add("desc"); // headList.add("desc");
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>(); // List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
for(QQKandianUser qqKandianUser : allList) { // for(QQKandianUser qqKandianUser : allList) {
Map<String,Object> m = new HashMap<String,Object>(); // Map<String,Object> m = new HashMap<String,Object>();
m.put("name", qqKandianUser.getName()); // m.put("name", qqKandianUser.getName());
m.put("url", qqKandianUser.getUrl()); // m.put("url", qqKandianUser.getUrl());
// The key is spelled "verity" while the getter is isVerify(); the key must stay in sync with headList.
m.put("verity", qqKandianUser.isVerify()); // m.put("verity", qqKandianUser.isVerify());
m.put("desc", qqKandianUser.getDesc()); // m.put("desc", qqKandianUser.getDesc());
bodyList.add(m); // bodyList.add(m);
} // }
// Exports to the same path that was imported above.
// NOTE(review): whether this replaces the input workbook or adds a sheet is not visible here - confirm before running on data you need to keep.
poi.exportExcel(path, "数据完成后", headList, bodyList); // poi.exportExcel(path, "数据完成后", headList, bodyList);
} // }
//
} //}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment