Commit 41208eb5 by yangchen

增加 部分网站评论采集

parent a731c54c
package com.zhiwei.parse;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import static java.util.Objects.nonNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.parse.analysis.AikaCommentAnalysis;
import com.zhiwei.tools.tools.ZhiWeiTools;
/**
 * Collects reader comments for news articles on xcar.com.cn through the
 * paged JSON endpoint at comment.xcar.com.cn.
 */
public class Aika {

    private static final Logger logger = LoggerFactory.getLogger(Aika.class);

    /** Consecutive failed page requests tolerated before the crawl gives up. */
    private static final int MAX_FAILURES = 3;

    private static AikaCommentAnalysis aikaCommentAnalysis = new AikaCommentAnalysis();

    private static HttpBoot httpBoot = new HttpBoot();

    /**
     * Downloads every comment page of one article.
     *
     * <p>Pages are requested one after another (25 comments per request, 3 s
     * pause between requests) until the parser reports that no further page
     * exists. A page that fails to download or parse is retried; after
     * {@value #MAX_FAILURES} consecutive failures the crawl stops and whatever
     * was collected so far is returned.
     *
     * @param url   article URL; the comment id is the text between
     *              {@code "news_"} and the following {@code "_"}
     * @param proxy proxy handed to the HTTP client
     * @return the collected comments, or an empty list when no comment id can
     *         be extracted from {@code url}; never {@code null}
     */
    @SuppressWarnings("unchecked")
    public static List<Map<String,Object>> getAikaComment(String url,Proxy proxy) {
        String commentId = getCommentId(url);
        if (commentId == null) {
            return Collections.emptyList();
        }
        List<Map<String,Object>> bodyList = new ArrayList<>();
        int page = 1;
        int failures = 0;
        while (failures < MAX_FAILURES) {
            try {
                String newUrl = "http://comment.xcar.com.cn/interface/index.php?iact=CommentLevel&cid="+ commentId
                        + "&action=getNewsComment&sort=ups&ctype=0&limit=25&page=" + page;
                String result = httpBoot.syncCall(RequestUtils.wrapGet(newUrl), proxy).body().string();
                Map<String, Object> map = aikaCommentAnalysis.getAikaComment(result);
                Object next = map == null ? null : map.get("next");
                if (next == null) {
                    // The parser yields no "next" flag when the response could not be
                    // parsed; count it as a failed attempt instead of crashing on the cast.
                    failures++;
                    logger.warn("爱卡汽车 评论采集 第 {} 页 解析失败 , 已连续失败 {} 次", page, failures);
                    ZhiWeiTools.sleep(3000);
                    continue;
                }
                failures = 0;
                Object data = map.get("data");
                if (data != null) {
                    bodyList.addAll((List<Map<String, Object>>) data);
                }
                logger.info("爱卡汽车 评论采集 第 {} 页 , 一共采集到 {} 条",page,bodyList.size());
                if (!Boolean.TRUE.equals(next)) {
                    break;
                }
                ZhiWeiTools.sleep(3000);
                page++;
            } catch (Exception e) {
                failures++;
                logger.error("爱卡汽车 评论采集出错", e);
                // Back off before retrying so a dead endpoint is not hammered.
                ZhiWeiTools.sleep(3000);
            }
        }
        return bodyList;
    }

    /**
     * Extracts the comment id from an article URL.
     *
     * @param url article URL, may be {@code null}
     * @return the text between {@code "news_"} and the next {@code "_"}, or
     *         {@code null} when the URL does not contain that pattern
     */
    private static String getCommentId(String url) {
        if (url == null) {
            return null;
        }
        String[] parts = url.split("news_");
        if (parts.length < 2) {
            return null;
        }
        String[] idParts = parts[1].split("_");
        return idParts.length == 0 ? null : idParts[0];
    }
}
...@@ -33,7 +33,7 @@ public class Baijia { ...@@ -33,7 +33,7 @@ public class Baijia {
* @return * @return
*/ */
public static List<Map<String,Object>> getBaijiaAccount2Data(String app_id,String startTime,Proxy proxy) { public static List<Map<String,Object>> getBaijiaAccount2Data(String app_id,String startTime,Proxy proxy) {
List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>(); List<Map<String,Object>> dataList = new ArrayList<>();
Map<String,String> headerMap = HeadGet.getBaijiaAccount2HeaderMap(null); Map<String,String> headerMap = HeadGet.getBaijiaAccount2HeaderMap(null);
String url = "https://news.baidu.com/sn/api/homesubcribe?forum_id="+app_id; String url = "https://news.baidu.com/sn/api/homesubcribe?forum_id="+app_id;
boolean f = true; boolean f = true;
...@@ -72,6 +72,7 @@ public class Baijia { ...@@ -72,6 +72,7 @@ public class Baijia {
* @param proxy * @param proxy
* @return * @return
*/ */
@SuppressWarnings("unchecked")
public static List<Map<String,Object>> getBaijiaAccountByBaiduData(String app_id,String name,String startTime,String cookie,Proxy proxy) { public static List<Map<String,Object>> getBaijiaAccountByBaiduData(String app_id,String name,String startTime,String cookie,Proxy proxy) {
Map<String,String> headerMap = HeaderTool.getCommonHead(); Map<String,String> headerMap = HeaderTool.getCommonHead();
List<Map<String,Object>> dataList = new ArrayList<>(); List<Map<String,Object>> dataList = new ArrayList<>();
...@@ -82,6 +83,7 @@ public class Baijia { ...@@ -82,6 +83,7 @@ public class Baijia {
for(int i = 1;i < 3;i++) { for(int i = 1;i < 3;i++) {
try { try {
String url = "https://author.baidu.com/list?type=article&context={%22offset%22:%22-1_"+n+"%22,%22app_id%22:%22"+app_id+"%22,%22pageSize%22:20}"; String url = "https://author.baidu.com/list?type=article&context={%22offset%22:%22-1_"+n+"%22,%22app_id%22:%22"+app_id+"%22,%22pageSize%22:20}";
System.out.println(url);
Request request = RequestUtils.wrapGet(url, headerMap); Request request = RequestUtils.wrapGet(url, headerMap);
String result = httpBoot.syncCall(request, proxy, false).body().string(); String result = httpBoot.syncCall(request, proxy, false).body().string();
Map<String,Object> dMap = baijiaAccountAnalysis.getBaijiaAccountData3(result,name, startTime); Map<String,Object> dMap = baijiaAccountAnalysis.getBaijiaAccountData3(result,name, startTime);
...@@ -97,7 +99,6 @@ public class Baijia { ...@@ -97,7 +99,6 @@ public class Baijia {
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.printStackTrace();
ZhiWeiTools.sleep(3000); ZhiWeiTools.sleep(3000);
continue;
} }
} }
} }
......
...@@ -12,7 +12,6 @@ import org.slf4j.Logger; ...@@ -12,7 +12,6 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.HttpClientBuilder;
import com.zhiwei.crawler.core.HttpRequestBuilder; import com.zhiwei.crawler.core.HttpRequestBuilder;
import com.zhiwei.excelpoi.excel.PoiExcelUtil; import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.analysis.BilibilikeyWordAnalysis; import com.zhiwei.parse.analysis.BilibilikeyWordAnalysis;
...@@ -20,7 +19,6 @@ import com.zhiwei.tools.tools.ZhiWeiTools; ...@@ -20,7 +19,6 @@ import com.zhiwei.tools.tools.ZhiWeiTools;
import com.zhiwei.util.WordReadFile; import com.zhiwei.util.WordReadFile;
import okhttp3.Headers; import okhttp3.Headers;
import okhttp3.OkHttpClient;
import okhttp3.Request; import okhttp3.Request;
public class BiliBili { public class BiliBili {
...@@ -28,8 +26,9 @@ public class BiliBili { ...@@ -28,8 +26,9 @@ public class BiliBili {
private static Logger logger = LoggerFactory.getLogger(BiliBili.class); private static Logger logger = LoggerFactory.getLogger(BiliBili.class);
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot();
@SuppressWarnings("unchecked")
public static List<Map<String,Object>> getData(String word,Proxy proxy,String cookie) { public static List<Map<String,Object>> getData(String word,Proxy proxy,String cookie) {
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>(); List<Map<String,Object>> bodyList = new ArrayList<>();
try { try {
String url = "https://search.bilibili.com/all?keyword="+URLEncoder.encode(word, "utf-8")+"&order=pubdate&duration=0&tids_1=0"; String url = "https://search.bilibili.com/all?keyword="+URLEncoder.encode(word, "utf-8")+"&order=pubdate&duration=0&tids_1=0";
Headers header = Headers.of("cookie",cookie,"Referer","https://www.bilibili.com/","Host","search.bilibili.com"); Headers header = Headers.of("cookie",cookie,"Referer","https://www.bilibili.com/","Host","search.bilibili.com");
......
...@@ -3,6 +3,7 @@ package com.zhiwei.parse; ...@@ -3,6 +3,7 @@ package com.zhiwei.parse;
import java.net.Proxy; import java.net.Proxy;
import java.net.URLEncoder; import java.net.URLEncoder;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
...@@ -30,7 +31,7 @@ public class Fenghuang { ...@@ -30,7 +31,7 @@ public class Fenghuang {
* @return * @return
*/ */
public static List<Map<String,Object>> getFenghuangAccountData(String id,String startTime,Proxy proxy) { public static List<Map<String,Object>> getFenghuangAccountData(String id,String startTime,Proxy proxy) {
List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>(); List<Map<String,Object>> dataList = new ArrayList<>();
int i = 1; int i = 1;
boolean f = true; boolean f = true;
while(f){ while(f){
...@@ -38,11 +39,10 @@ public class Fenghuang { ...@@ -38,11 +39,10 @@ public class Fenghuang {
for(int j = 0;j< 3;j++){ for(int j = 0;j< 3;j++){
f = true; f = true;
String url = "http://api.3g.ifeng.com/api_wemedia_index?followid=weMedia_"+id+"&page="+i+"&pagesize=20&tag=article&uid=fe659b7e510444c28a31f88dee7a2747"; String url = "http://api.3g.ifeng.com/api_wemedia_index?followid=weMedia_"+id+"&page="+i+"&pagesize=20&tag=article&uid=fe659b7e510444c28a31f88dee7a2747";
System.out.println(url);
List<Map<String,Object>> list = fenghuangAccountAnalysis.getArticleData(url, startTime,proxy); List<Map<String,Object>> list = fenghuangAccountAnalysis.getArticleData(url, startTime,proxy);
if(list != null && list.size() > 0) { if(list != null && !list.isEmpty()) {
dataList.addAll(list); dataList.addAll(list);
System.out.println("====================采集第"+i+"页===共获取数据=="+dataList.size()); logger.info("====================采集第 {} 页===共获取数据== {}",i,dataList.size());
i++; i++;
ZhiWeiTools.sleep(2000); ZhiWeiTools.sleep(2000);
break; break;
...@@ -51,7 +51,7 @@ public class Fenghuang { ...@@ -51,7 +51,7 @@ public class Fenghuang {
ZhiWeiTools.sleep(2000); ZhiWeiTools.sleep(2000);
} }
} catch (Exception e) { } catch (Exception e) {
logger.error("程序出错",e.getMessage()); logger.error("程序出错 {}",e);
return dataList; return dataList;
} }
} }
...@@ -67,15 +67,15 @@ public class Fenghuang { ...@@ -67,15 +67,15 @@ public class Fenghuang {
public static List<Map<String,Object>> getFenghuangCommentData(String url,Proxy proxy) { public static List<Map<String,Object>> getFenghuangCommentData(String url,Proxy proxy) {
url = fenghuangCommentAnalysis.getdocUrl(url,proxy); url = fenghuangCommentAnalysis.getdocUrl(url,proxy);
if(url == null) { if(url == null) {
return null; return Collections.emptyList();
} }
int i = 1; int i = 1;
List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>(); List<Map<String,Object>> dataList = new ArrayList<>();
while(true) { while(true) {
System.out.println(url+i); System.out.println(url+i);
ZhiWeiTools.sleep(2000); ZhiWeiTools.sleep(2000);
List<Map<String,Object>> list = fenghuangCommentAnalysis.getData(url+i,proxy); List<Map<String,Object>> list = fenghuangCommentAnalysis.getData(url+i,proxy);
if(list == null || list.size() < 1) { if(list == null || list.isEmpty()) {
break; break;
} }
i++; i++;
...@@ -84,22 +84,30 @@ public class Fenghuang { ...@@ -84,22 +84,30 @@ public class Fenghuang {
return dataList; return dataList;
} }
/**
* http://tech.ifeng.com/a/20181113/45222352_0.shtml
* https://comment.ifeng.com/get.php?callback=newCommentListCallBack&orderby=&docUrl=sub_87465244&job=1&p=2&pageSize=20
* @Description Collects every comment of an ifeng.com article, page by page, until an empty page is returned
* @param url
* @param proxy
* @return
*/
public static List<Map<String,Object>> getFenghuangCommentData2(String url,Proxy proxy) { public static List<Map<String,Object>> getFenghuangCommentData2(String url,Proxy proxy) {
url = fenghuangCommentAnalysis.getdocUrl(url,proxy); url = fenghuangCommentAnalysis.getdocUrl(url,proxy);
if(url == null) { if(url == null) {
return null; return Collections.emptyList();
} }
int i = 1; int i = 1;
List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>(); List<Map<String,Object>> dataList = new ArrayList<>();
while(true) { while(true) {
System.out.println(url+i);
ZhiWeiTools.sleep(2000); ZhiWeiTools.sleep(2000);
List<Map<String,Object>> list = fenghuangCommentAnalysis.getData2(url+i,proxy); List<Map<String,Object>> list = fenghuangCommentAnalysis.getData2(url+i,proxy);
if(list == null || list.size() < 1) { if(list == null || list.isEmpty()) {
break; break;
} }
i++;
dataList.addAll(list); dataList.addAll(list);
logger.info("采集到第 {} 页 ,总共采集到的数据量 {}",i,dataList.size());
i++;
} }
return dataList; return dataList;
} }
...@@ -112,11 +120,11 @@ public class Fenghuang { ...@@ -112,11 +120,11 @@ public class Fenghuang {
*/ */
public static Map<String,Object> getFenghuangCommentCount(String url,Proxy proxy) { public static Map<String,Object> getFenghuangCommentCount(String url,Proxy proxy) {
url = fenghuangCommentAnalysis.getdocUrl(url,proxy); url = fenghuangCommentAnalysis.getdocUrl(url,proxy);
System.out.println(url);
if(url == null) { if(url == null) {
return null; return null;
} }
Map<String,Object> map = fenghuangCommentAnalysis.getFenghuangCommentCount(url,proxy); return fenghuangCommentAnalysis.getFenghuangCommentCount(url,proxy);
return map;
} }
/** /**
...@@ -135,26 +143,24 @@ public class Fenghuang { ...@@ -135,26 +143,24 @@ public class Fenghuang {
Map<String,String> headerMap = HeadGet.getFenghuangWordHeaderMap(null); Map<String,String> headerMap = HeadGet.getFenghuangWordHeaderMap(null);
String result = HttpClient.executeHttpRequestGet(url+i,proxy, headerMap); String result = HttpClient.executeHttpRequestGet(url+i,proxy, headerMap);
List<Map<String,Object>> lists = fenghuangByWordAnalysis.getFenghuangByWord(result); List<Map<String,Object>> lists = fenghuangByWordAnalysis.getFenghuangByWord(result);
if(lists == null || lists.size() < 1) { if(lists == null || lists.isEmpty()) {
break; break;
} }else {
if(lists != null && lists.size() > 0) {
dataList.addAll(lists); dataList.addAll(lists);
} }
System.out.println(word+"===================以获取的数据==:" + dataList.size()); logger.info(" {}===================以获取的数据==: {}",word,dataList.size());
i++; i++;
if(i == 76) { if(i == 76) {
break; break;
} }
ZhiWeiTools.sleep(4000); ZhiWeiTools.sleep(4000);
} catch (Exception e) { } catch (Exception e) {
continue; logger.info("凤凰 关键词采集出错 {}",e);
} }
} }
return dataList; return dataList;
} catch (Exception e) { } catch (Exception e) {
logger.error("依据关键词获取凤凰文章出错",e.getMessage()); logger.error("依据关键词获取凤凰文章出错 {}",e);
e.printStackTrace();
return dataList; return dataList;
} }
} }
......
package com.zhiwei.parse;
import static java.util.Objects.nonNull;
import java.io.IOException;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.parse.analysis.PcautoCommentAnalysis;
import com.zhiwei.tools.tools.ZhiWeiTools;
/**
 * Collects reader comments for pcauto.com.cn articles through the
 * cmt.pcauto.com.cn comment service.
 */
public class Pcauto {

    private static final Logger logger = LoggerFactory.getLogger(Pcauto.class);

    /** Consecutive failed page requests tolerated before the crawl gives up. */
    private static final int MAX_FAILURES = 3;

    private static PcautoCommentAnalysis pcautoCommentAnalysis = new PcautoCommentAnalysis();

    private static HttpBoot httpBoot = new HttpBoot();

    /**
     * Downloads every comment page of one article.
     *
     * <p>The comment service returns the URL of the first comment page, which
     * carries a {@code /p1/} path segment. Each further page is derived from
     * that first-page URL by substituting the page number, so the substitution
     * keeps working after page 2. A page that fails to download or parse is
     * retried; after {@value #MAX_FAILURES} consecutive failures the crawl
     * stops and the comments collected so far are returned.
     *
     * @param url   article URL
     * @param proxy proxy handed to the HTTP client
     * @return the collected comments, or an empty list when the comment URL
     *         cannot be resolved; never {@code null}
     */
    @SuppressWarnings("unchecked")
    public static List<Map<String, Object>> getPcAutoComment(String url,Proxy proxy) {
        String firstPageUrl = getCommentUrl(url, proxy);
        if (firstPageUrl == null) {
            return Collections.emptyList();
        }
        List<Map<String,Object>> dataList = new ArrayList<>();
        int page = 1;
        int failures = 0;
        while (failures < MAX_FAILURES) {
            // Always rewrite the pristine first-page URL; rewriting the previous
            // page's URL would stop matching "/p1/" after the first substitution.
            String pageUrl = firstPageUrl.replace("/p1/", "/p"+page+"/");
            if (page > 1 && pageUrl.equals(firstPageUrl)) {
                // No page marker in the URL: requesting it again would only
                // return the first page once more.
                logger.warn("评论链接中没有分页标记 , 停止翻页 {}", firstPageUrl);
                break;
            }
            ZhiWeiTools.sleep(3000);
            try {
                String result = httpBoot.syncCall(RequestUtils.wrapGet(pageUrl), proxy).body().string();
                Map<String,Object> map = pcautoCommentAnalysis.getPcautoComment(result);
                Object next = map == null ? null : map.get("next");
                if (next == null) {
                    // Without a "next" flag the page cannot be interpreted; retry it.
                    failures++;
                    logger.warn("第 {} 页 解析失败 , 已连续失败 {} 次", page, failures);
                    continue;
                }
                failures = 0;
                List<Map<String,Object>> datLi = (List<Map<String, Object>>) map.get("data");
                if (datLi != null) {
                    dataList.addAll(datLi);
                }
                logger.info("采集第 {} 页 , 采集的总数为 {}",page,dataList.size());
                if (!Boolean.TRUE.equals(next)) {
                    break;
                }
                page++;
            } catch (Exception e) {
                failures++;
                logger.error("获取 评论出错", e);
            }
        }
        return dataList;
    }

    /**
     * Asks the comment service for the first comment-page URL of an article.
     * The lookup is attempted up to three times.
     *
     * @param url   article URL
     * @param proxy proxy handed to the HTTP client
     * @return the value of the {@code url} field of the JSON response, or
     *         {@code null} when all attempts fail or the field is absent
     */
    private static String getCommentUrl(String url,Proxy proxy) {
        for (int attempt = 0; attempt < 3; attempt++) {
            try {
                String result = httpBoot.syncCall(RequestUtils.wrapGet("https://cmt.pcauto.com.cn/action/topic/get_data.jsp?url="+url), proxy).body().string();
                JSONObject json = JSONObject.parseObject(result);
                // parseObject may yield null for an empty body.
                String commentUrl = json == null ? null : json.getString("url");
                if (commentUrl != null) {
                    return commentUrl;
                }
            } catch (IOException | RuntimeException e) {
                // RuntimeException covers a malformed JSON body, which is as
                // retryable as a network error.
                logger.error("太平洋汽车网 获取 评论链接出错", e);
            }
        }
        return null;
    }
}
package com.zhiwei.parse;
import java.io.IOException;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.parse.analysis.QicheHomeKwyWordAnalysis;
import com.zhiwei.tools.tools.ZhiWeiTools;
/**
 * Collects reader comments for autohome.com.cn articles through the
 * reply.autohome.com.cn JSON API.
 */
public class QicheHome {

    private static final Logger logger = LoggerFactory.getLogger(QicheHome.class);

    private static HttpBoot httpBoot = new HttpBoot();

    private static QicheHomeKwyWordAnalysis qicheHomeKwyWordAnalysis = new QicheHomeKwyWordAnalysis();

    /**
     * Downloads every comment page of one article (50 comments per request).
     *
     * <p>The total page count is read from the first response. The crawl ends
     * after the last page, or after three failed requests in total.
     *
     * @param articleid article id used in the API query
     * @param proxy     proxy handed to the HTTP client
     * @return the comments collected so far; never {@code null}
     */
    public static List<Map<String,Object>> getQiCheComment(String articleid,Proxy proxy) {
        List<Map<String,Object>> bodyList = new ArrayList<>();
        int page = 1;
        int count = 2;
        int n = 1;
        while (true) {
            try {
                String url = "https://reply.autohome.com.cn/api/comments/show.json?count=50&page="+page+"&id="+articleid+"&appid=1&datatype=json&order=0&replyid=0";
                String result = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy).body().string();
                if (page == 1) {
                    count = qicheHomeKwyWordAnalysis.getCount(result);
                }
                List<Map<String,Object>> pageData = qicheHomeKwyWordAnalysis.getData(result);
                if (pageData != null) {
                    bodyList.addAll(pageData);
                }
                // Log before advancing so the reported page is the one just fetched.
                logger.info("采集 articleid {} 总页数 {} 第 {} 页 , 采集总数 {}",articleid,count,page,bodyList.size());
                page++;
                if (page > count) {
                    break;
                }
                ZhiWeiTools.sleep(3000);
            } catch (IOException e) {
                logger.error("采集 articleid {} 第 {} 页 出错", articleid, page, e);
                n++;
                if (n > 3) {
                    break;
                }
                // Pause before retrying the same page.
                ZhiWeiTools.sleep(3000);
            }
        }
        // Return what was gathered; the previous version returned an empty
        // list here and discarded every collected comment.
        return bodyList;
    }
}
package com.zhiwei.parse;
import static java.util.Objects.nonNull;
import java.io.IOException;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.parse.analysis.SinaKejiCommentAnalysis;
import com.zhiwei.tools.tools.ZhiWeiTools;
/**
 * Collects reader comments for tech.sina.com.cn articles through the
 * comment.sina.com.cn JSON API.
 */
public class SinaKeji {

    private static final Logger logger = LoggerFactory.getLogger(SinaKeji.class);

    /** Consecutive failed page requests tolerated before the crawl gives up. */
    private static final int MAX_FAILURES = 3;

    /** Number of empty pages after which the comment list is considered exhausted. */
    private static final int MAX_EMPTY_PAGES = 3;

    /** Text that precedes the news id inside the article page. */
    private static final String NEWS_ID_MARKER = "newsid: '";

    private static SinaKejiCommentAnalysis sinaKejiCommentAnalysis = new SinaKejiCommentAnalysis();

    private static HttpBoot httpBoot = new HttpBoot();

    /**
     * Downloads every comment page of one Sina Tech article.
     * Example article: https://tech.sina.com.cn/it/2018-07-17/doc-ihfkffam0632649.shtml
     *
     * <p>Pages of 20 comments are requested with a 3 s pause before each
     * request. The crawl stops once {@value #MAX_EMPTY_PAGES} empty pages have
     * been seen, or after {@value #MAX_FAILURES} consecutive request failures.
     *
     * @param url   article URL
     * @param proxy proxy handed to the HTTP client
     * @return the collected comments, or an empty list when the news id
     *         cannot be extracted from the article page; never {@code null}
     */
    public static List<Map<String, Object>> getSinaKejiComment(String url,Proxy proxy) {
        String commentId = getCommentId(url, proxy);
        if (commentId == null) {
            return Collections.emptyList();
        }
        List<Map<String,Object>> dataList = new ArrayList<>();
        int page = 1;
        int emptyPages = 0;
        int failures = 0;
        while (failures < MAX_FAILURES) {
            ZhiWeiTools.sleep(3000);
            try {
                String newUrl = "http://comment.sina.com.cn/page/info?version=1&format=json&channel=kj&newsid="+commentId+"&compress=0&ie=utf-8&oe=utf-8&page_size=20&t_size=3&h_size=3&thread=1&page="+page;
                String result = httpBoot.syncCall(RequestUtils.wrapGet(newUrl), proxy).body().string();
                List<Map<String,Object>> list = sinaKejiCommentAnalysis.getSinaCommet(result);
                failures = 0;
                boolean empty = list == null || list.isEmpty();
                if (!empty) {
                    dataList.addAll(list);
                }
                logger.info("采集到 第 {} 页 ,一共采集到 {} 条",page,dataList.size());
                if (empty) {
                    emptyPages++;
                    if (emptyPages >= MAX_EMPTY_PAGES) {
                        break;
                    }
                }
                page++;
            } catch (IOException e) {
                failures++;
                logger.error("新浪科技评论采集出错", e);
            }
        }
        // Return what was gathered; the previous version fell through to an
        // empty list and discarded every collected comment.
        return dataList;
    }

    /**
     * Downloads the article page and extracts the news id that follows
     * {@code newsid: '}. The download is attempted up to three times.
     *
     * @param url   article URL
     * @param proxy proxy handed to the HTTP client
     * @return the news id, or {@code null} when it cannot be found
     */
    private static String getCommentId(String url,Proxy proxy) {
        for (int attempt = 0; attempt < 3; attempt++) {
            try {
                String result = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy).body().string();
                // Search for the exact marker that is cut on, so a page that
                // merely mentions "newsid:" cannot cause an index error.
                int start = result.indexOf(NEWS_ID_MARKER);
                if (start < 0) {
                    continue;
                }
                start += NEWS_ID_MARKER.length();
                int end = result.indexOf('\'', start);
                String commentId = end < 0 ? result.substring(start) : result.substring(start, end);
                if (!commentId.isEmpty()) {
                    return commentId;
                }
            } catch (IOException e) {
                logger.error("获取 文章评论 id 失败", e);
            }
        }
        return null;
    }
}
...@@ -36,7 +36,7 @@ public class TXNews { ...@@ -36,7 +36,7 @@ public class TXNews {
// //
Map<String,Object> param2Map = HeadGet.getTxNewspagemoreParamMap(word, page); Map<String,Object> param2Map = HeadGet.getTxNewspagemoreParamMap(word, page);
//6D33F35F-880D-42A6-A23F-881BEC6960EC //6D33F35F-880D-42A6-A23F-881BEC6960EC
String result2 = HttpClient.executeHttpRequestPost("http://r.inews.qq.com/searchMore?appver=11.2.1_qqnews_5.5.60&devid=496d3626-9684-45ef-8d22-7a71fbfd22da",proxy, header2Map, param2Map); String result2 = HttpClient.executeHttpRequestPost("http://r.inews.qq.com/searchMore?appver=11.2.1_qqnews_5.5.60&devid="+devid,proxy, header2Map, param2Map);
page++; page++;
List<Map<String,Object>> dList2 = txNewsByWordAnalysis.getData(result2); List<Map<String,Object>> dList2 = txNewsByWordAnalysis.getData(result2);
dataList.addAll(dList2); dataList.addAll(dList2);
......
package com.zhiwei.parse;
import static java.util.Objects.nonNull;
import java.io.IOException;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.parse.analysis.TechTxCommentAnalysis;
import com.zhiwei.tools.tools.ZhiWeiTools;
/**
 * Collects reader comments for Tencent Tech articles through the
 * coral.qq.com comment API.
 */
public class TechTx {

    private static final Logger logger = LoggerFactory.getLogger(TechTx.class);

    /** Consecutive failed page requests tolerated before the crawl gives up. */
    private static final int MAX_FAILURES = 3;

    /** Text that precedes the comment id inside the article page. */
    private static final String CMT_ID_MARKER = "cmt_id = ";

    private static TechTxCommentAnalysis techTxCommentAnalysis = new TechTxCommentAnalysis();

    private static HttpBoot httpBoot = new HttpBoot();

    /**
     * Downloads every comment page of one article.
     *
     * <p>The API is cursor based: each parsed response carries the cursor of
     * the next page under {@code "next"}. The crawl ends when no cursor is
     * returned, when the cursor stops changing, or after
     * {@value #MAX_FAILURES} consecutive failed requests.
     *
     * @param url   article URL
     * @param proxy proxy handed to the HTTP client
     * @return the collected comments, or an empty list when the comment id
     *         cannot be extracted from the article page; never {@code null}
     */
    @SuppressWarnings("unchecked")
    public static List<Map<String,Object>> getTechTxComment(String url,Proxy proxy) {
        String commentID = getCommentId(url, proxy);
        if (commentID == null) {
            return Collections.emptyList();
        }
        List<Map<String, Object>> bodyList = new ArrayList<>();
        String next = "";
        int failures = 0;
        while (failures < MAX_FAILURES) {
            ZhiWeiTools.sleep(3000);
            try {
                String newUrl = "http://coral.qq.com/article/"+commentID+"/comment/v2?oriorder=o&pageflag=1&cursor="+next;
                String result = httpBoot.syncCall(RequestUtils.wrapGet(newUrl), proxy).body().string();
                Map<String,Object> map = techTxCommentAnalysis.getTechTxComment(result);
                if (map == null) {
                    failures++;
                    logger.warn("腾讯科技 评论解析失败 , 已连续失败 {} 次", failures);
                    continue;
                }
                failures = 0;
                List<Map<String,Object>> dataList = (List<Map<String, Object>>) map.get("data");
                if (dataList != null) {
                    bodyList.addAll(dataList);
                }
                Object cursor = map.get("next");
                if (cursor == null) {
                    break;
                }
                logger.info("现在采集总数为 {}",bodyList.size());
                String nextCursor = cursor.toString();
                if (nextCursor.equals(next)) {
                    // Same cursor again means the next request would be
                    // identical to this one; stop instead of looping forever.
                    break;
                }
                next = nextCursor;
            } catch (IOException e) {
                failures++;
                logger.error("无法获取腾讯科技 评论", e);
            }
        }
        return bodyList;
    }

    /**
     * Downloads the article page and extracts the comment id that sits
     * between {@code "cmt_id = "} and the following {@code ";"}. The download
     * is attempted up to three times.
     *
     * @param url   article URL
     * @param proxy proxy handed to the HTTP client
     * @return the comment id, or {@code null} when it cannot be found
     */
    private static String getCommentId(String url,Proxy proxy) {
        for (int attempt = 0; attempt < 3; attempt++) {
            try {
                String result = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy).body().string();
                int start = result.indexOf(CMT_ID_MARKER);
                if (start < 0) {
                    // Marker missing (e.g. an error page); try again.
                    continue;
                }
                start += CMT_ID_MARKER.length();
                int end = result.indexOf(';', start);
                String commentID = end < 0 ? result.substring(start) : result.substring(start, end);
                if (!commentID.isEmpty()) {
                    return commentID;
                }
            } catch (IOException e) {
                logger.error("获取腾讯科技 评论 id 失败", e);
            }
        }
        return null;
    }
}
...@@ -22,7 +22,7 @@ public class Toutiao { ...@@ -22,7 +22,7 @@ public class Toutiao {
private static ToutiaoKeyWordAnalysis toutiaoKeyWordAnalysis = new ToutiaoKeyWordAnalysis(); private static ToutiaoKeyWordAnalysis toutiaoKeyWordAnalysis = new ToutiaoKeyWordAnalysis();
public static List<Map<String,Object>> getKeyWordData(String word,Proxy proxy,String devoid) { public static List<Map<String,Object>> getKeyWordData(String word,Proxy proxy,String devoid) {
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>(); List<Map<String,Object>> bodyList = new ArrayList<>();
Map<String,String> headerMap = HeadGet.getToutiaoHeaderMap(null); Map<String,String> headerMap = HeadGet.getToutiaoHeaderMap(null);
String url = null; String url = null;
try { try {
......
...@@ -30,27 +30,26 @@ public class Wangyi { ...@@ -30,27 +30,26 @@ public class Wangyi {
*/ */
public static List<Map<String,Object>> getWangyiCommentData(String id,Proxy proxy) { public static List<Map<String,Object>> getWangyiCommentData(String id,Proxy proxy) {
Map<String,String> headerMap = HeadGet.getWangyiCommentHeaderMap(null); Map<String,String> headerMap = HeadGet.getWangyiCommentHeaderMap(null);
List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>(); List<Map<String,Object>> dataList = new ArrayList<>();
int i = 0; int i = 0;
List<String> idList = new ArrayList<String>(); List<String> idList = new ArrayList<>();
try { try {
while(true) { while(true) {
String url = "http://comment.dy.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/"+id+"/comments/newList?offset="+i+"&limit=30"; String url = "http://comment.dy.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/"+id+"/comments/newList?offset="+i+"&limit=30";
String result = HttpClient.executeHttpRequestGet(url,proxy, headerMap); String result = HttpClient.executeHttpRequestGet(url,proxy, headerMap);
System.out.println(url);
List<Map<String,Object>> lists = wangyiCommentAnalysis.getWangyiCommentData(result,idList); List<Map<String,Object>> lists = wangyiCommentAnalysis.getWangyiCommentData(result,idList);
if(lists == null || lists.size() < 1) { if(lists == null || lists.isEmpty()) {
break; break;
} }
dataList.addAll(lists); dataList.addAll(lists);
i += 30; i += 30;
ZhiWeiTools.sleep(4000); ZhiWeiTools.sleep(4000);
System.out.println("==================已采集到的数据=" + dataList.size()); logger.info("==================已采集到的数据= {}",dataList.size());
} }
//去重 //去重
return dataList; return dataList;
} catch (Exception e) { } catch (Exception e) {
logger.error("获取网易评论出错",e.getMessage()); logger.error("获取网易评论出错 {}",e);
return dataList; return dataList;
} }
......
package com.zhiwei.parse;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.Proxy;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.parse.analysis.XueqiuKeyWordAnalysis;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Request;
/**
 * Keyword search crawler for xueqiu.com status posts.
 */
public class Xueqiu {

    private static final Logger logger = LoggerFactory.getLogger(Xueqiu.class);

    private static HttpBoot httpBoot = new HttpBoot();

    private static XueqiuKeyWordAnalysis xueqiuKeyWordAnalysis = new XueqiuKeyWordAnalysis();

    /**
     * Searches xueqiu.com for a keyword and collects the parsed results page
     * by page (20 per page, sorted by relevance).
     *
     * <p>An empty page or a failed request counts as a miss and the same page
     * is requested again; the crawl ends once more than three misses have
     * accumulated.
     *
     * @param word    search keyword
     * @param endTime passed through to the result parser
     * @param proxy   proxy handed to the HTTP client
     * @param cookie  value sent as the {@code cookie} request header
     * @return the collected results; never {@code null}
     */
    public static List<Map<String,Object>> getData(String word,String endTime,Proxy proxy,String cookie) {
        List<Map<String,Object>> bodyList = new ArrayList<>();
        Map<String,Object> headers = new HashMap<>();
        headers.put("cookie", cookie);
        headers.put("Host", "xueqiu.com");

        // The keyword does not change between pages, so encode it once.
        String encodedWord;
        try {
            encodedWord = URLEncoder.encode(word, "utf-8");
        } catch (UnsupportedEncodingException e) {
            logger.error("雪球 关键词编码失败 {}", word, e);
            return bodyList;
        }

        int misses = 0;
        int page = 1;
        while (misses <= 3) {
            try {
                // Available sort modes: relevance, time, reply
                String url = "https://xueqiu.com/statuses/search.json?sort=relevance&source=all&q="+encodedWord+"&count=20&page="+page;
                logger.debug("雪球 请求 {}", url);
                Request request = RequestUtils.wrapGet(url, headers);
                String result = httpBoot.syncCall(request, proxy).body().string();
                List<Map<String,Object>> list = xueqiuKeyWordAnalysis.getData(result, endTime);
                ZhiWeiTools.sleep(3000);
                if (list == null || list.isEmpty()) {
                    misses++;
                } else {
                    bodyList.addAll(list);
                    logger.info("采集到第{} 页 , 一共采集到 {} 数据",page,bodyList.size());
                    page++;
                }
            } catch (IOException e) {
                logger.error("雪球 第 {} 页 采集出错", page, e);
                misses++;
            }
        }
        return bodyList;
    }
}
package com.zhiwei.parse.analysis;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.tools.timeparse.TimeExtraction;
/**
 * Parses the JSON returned by the xcar.com.cn comment endpoint.
 */
public class AikaCommentAnalysis {

    private static final Logger logger = LoggerFactory.getLogger(AikaCommentAnalysis.class);

    /**
     * Converts one comment-page response into a result map.
     *
     * <p>The returned map always has two entries:
     * <ul>
     *   <li>{@code "data"} - a {@code List<Map<String,Object>>} with one map
     *       per comment (keys {@code id}, {@code content}, {@code source},
     *       {@code like}, {@code time});</li>
     *   <li>{@code "next"} - a {@code Boolean}, {@code true} only when the
     *       response's {@code config.hasMore} equals 1.</li>
     * </ul>
     * A comment that cannot be converted is logged and skipped rather than
     * discarding the whole page. If the response as a whole cannot be parsed,
     * the map holds the comments converted so far and {@code next = false},
     * so callers can safely read both keys.
     *
     * @param result raw JSON response body
     * @return the result map described above; never {@code null}
     */
    public Map<String,Object> getAikaComment(String result) {
        List<Map<String,Object>> dataList = new ArrayList<>();
        boolean next = false;
        try {
            JSONObject json = JSONObject.parseObject(result);
            if (json == null) {
                logger.error("爱卡汽车 评论解析错误 , 返回内容为空");
            } else {
                JSONArray jsonArray = json.getJSONArray("list");
                int size = jsonArray == null ? 0 : jsonArray.size();
                for (int i = 0; i < size; i++) {
                    try {
                        dataList.add(toComment(jsonArray.getJSONObject(i)));
                    } catch (RuntimeException e) {
                        logger.warn("爱卡汽车 单条评论解析失败 , 已跳过", e);
                    }
                }
                JSONObject config = json.getJSONObject("config");
                Integer hasMore = config == null ? null : config.getInteger("hasMore");
                // Explicit null check: comparing a null Integer with == 1 would
                // unbox and throw.
                next = hasMore != null && hasMore == 1;
            }
        } catch (Exception e) {
            logger.error("爱卡汽车 评论解析错误", e);
        }
        Map<String,Object> rMap = new HashMap<>();
        rMap.put("next", next);
        rMap.put("data", dataList);
        return rMap;
    }

    /**
     * Maps one element of the response's {@code list} array to a comment map.
     * HTML tags are stripped from the comment text; a missing text becomes an
     * empty string.
     */
    private static Map<String,Object> toComment(JSONObject data) {
        Map<String,Object> map = new HashMap<>();
        map.put("id", data.getString("id"));
        String conts = data.getString("conts");
        map.put("content", conts == null ? "" : conts.replaceAll("<.*?>", ""));
        map.put("source", data.getString("user_name"));
        map.put("like", data.getString("ups"));
        String time = data.getString("pub_time");
        map.put("time", new Date(TimeExtraction.parseFormatTime(time, "MM dd")));
        return map;
    }
}
...@@ -84,6 +84,7 @@ public class BaijiaAccountAnalysis { ...@@ -84,6 +84,7 @@ public class BaijiaAccountAnalysis {
map.put("url", url); map.put("url", url);
map.put("source", name); map.put("source", name);
dataList.add(map); dataList.add(map);
// System.out.println(map.toString());
} }
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.printStackTrace();
......
...@@ -19,10 +19,10 @@ public class FenghuangByWordAnalysis { ...@@ -19,10 +19,10 @@ public class FenghuangByWordAnalysis {
public List<Map<String,Object>> getFenghuangByWord(String result) { public List<Map<String,Object>> getFenghuangByWord(String result) {
Document document = Jsoup.parse(result); Document document = Jsoup.parse(result);
Elements elements = document.select("div.mainM").select("div.searchResults"); Elements elements = document.select("div.mainM").select("div.searchResults");
List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>(); List<Map<String,Object>> dataList = new ArrayList<>();
try { try {
for(Element element : elements) { for(Element element : elements) {
Map<String,Object> map = new HashMap<String, Object>(); Map<String,Object> map = new HashMap<>();
String title = element.select("p.line24").select("a").text(); String title = element.select("p.line24").select("a").text();
if(title == null ) { if(title == null ) {
continue; continue;
...@@ -40,7 +40,7 @@ public class FenghuangByWordAnalysis { ...@@ -40,7 +40,7 @@ public class FenghuangByWordAnalysis {
} }
return dataList; return dataList;
} catch (Exception e) { } catch (Exception e) {
logger.error("解析凤凰关键词获取文章页面出错",e.getMessage()); logger.error("解析凤凰关键词获取文章页面出错 {}",e);
return dataList; return dataList;
} }
} }
......
...@@ -2,6 +2,7 @@ package com.zhiwei.parse.analysis; ...@@ -2,6 +2,7 @@ package com.zhiwei.parse.analysis;
import java.net.Proxy; import java.net.Proxy;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections;
import java.util.Date; import java.util.Date;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
...@@ -20,17 +21,16 @@ public class FenghuangCommentAnalysis { ...@@ -20,17 +21,16 @@ public class FenghuangCommentAnalysis {
private static Logger logger = LoggerFactory.getLogger(FenghuangCommentAnalysis.class); private static Logger logger = LoggerFactory.getLogger(FenghuangCommentAnalysis.class);
public Map<String,Object> getFenghuangCommentCount(String url,Proxy proxy) { public Map<String,Object> getFenghuangCommentCount(String url,Proxy proxy) {
Map<String, String> headerMap = HeadGet.getFenghuangCommentHeaderMap(null); Map<String,Object> map = new HashMap<>();
Map<String,Object> map = new HashMap<String, Object>();
try { try {
String result = HttpClient.executeHttpRequestGet(url,proxy, headerMap); String result = HttpClient.executeHttpRequestGet(url,proxy, null);
JSONObject json = JSONObject.parseObject(result); JSONObject json = JSONObject.parseObject(result);
map.put("real_count", json.getInteger("real_num")); map.put("real_count", json.getInteger("count"));
map.put("comment_num", json.getInteger("comment_num")); map.put("comment_num", json.getInteger("join_count"));
return map; return map;
} catch (Exception e) { } catch (Exception e) {
logger.error("获取凤凰评论数出错",e.getMessage()); logger.error("获取凤凰评论数出错 {}",e);
return null; return Collections.emptyMap();
} }
} }
...@@ -42,27 +42,23 @@ public class FenghuangCommentAnalysis { ...@@ -42,27 +42,23 @@ public class FenghuangCommentAnalysis {
* @return * @return
*/ */
public String getdocUrl(String url,Proxy proxy) { public String getdocUrl(String url,Proxy proxy) {
String docUrl = null;
for(int i = 0;i < 3;i++) {
try { try {
String result = HttpClient.executeHttpRequestGet(url,proxy, null); String result = HttpClient.executeHttpRequestGet(url,proxy, null);
if(result.contains("commentUrl\":\"")) { if(result.contains("commentUrl")) {
result = result.split("commentUrl\":\"")[1].split("\",")[0]; docUrl = result.split("commentUrl\": \"")[1].split("\",")[0];
}else { break;
return "http://comment.ifeng.com/get.php?docUrl="+url.replaceAll(":", "%3A").replaceAll("/", "%2F")+"&format=js&job=1&pageSize=20&p="; }
}
System.out.println(result);
url = "http://comment.ifeng.com/get.php?docUrl="+result+"&format=js&job=1&pageSize=20&p=";
// if(url.contains("/a")) {
// url = url.replace(":", "%3A");
// url = "https://user.iclient.ifeng.com/Social_Api_Comment/getCommentList?comments_url="+url+"&hasChild=1&limit=30&page=";
// }else {
// String docUrl = "";
// docUrl = url.substring(url.length()-8,url.length());
// url = "https://user.iclient.ifeng.com/Social_Api_Comment/getCommentList?comments_url=sub_"+docUrl+"&hasChild=1&limit=30&page=";
// }
return url;
} catch (Exception e) { } catch (Exception e) {
logger.error("解析连接出错",e.getMessage()); logger.error("解析连接出错 {}",e);
}
}
if(docUrl != null) {
logger.info(docUrl);
url = "https://comment.ifeng.com/get.php?docUrl="+docUrl+"&callback=newCommentListCallBack&orderby=&job=1&pageSize=20&p=";
return url;
}else {
return null; return null;
} }
...@@ -99,18 +95,24 @@ public class FenghuangCommentAnalysis { ...@@ -99,18 +95,24 @@ public class FenghuangCommentAnalysis {
} }
/**
*
* http://tech.ifeng.com/a/20181113/45222352_0.shtml
* @Description docUrl
* @param url
* @param proxy
* @return
*/
public List<Map<String,Object>> getData2(String url,Proxy proxy) { public List<Map<String,Object>> getData2(String url,Proxy proxy) {
Map<String,String> headerMap = HeadGet.getFenghuangCommentHeaderMap(null); List<Map<String,Object>> dataList = new ArrayList<>();
List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
String result; String result;
try { try {
result = HttpClient.executeHttpRequestGet(url,proxy, headerMap); result = HttpClient.executeHttpRequestGet(url,proxy, null);
} catch (Exception e) { } catch (Exception e) {
logger.error("链接获取信息失败",e.getMessage()); logger.error("链接获取信息失败 {}",e);
return null; return Collections.emptyList();
} }
result = result.split("commentJsonVarStr___=")[1]; JSONObject json = JSONObject.parseObject(result);
JSONObject json = JSONObject.parseObject(result.substring(0, result.length()-1));
JSONArray jsonArry = json.getJSONArray("comments"); JSONArray jsonArry = json.getJSONArray("comments");
try { try {
for(int i = 0;i < jsonArry.size(); i ++) { for(int i = 0;i < jsonArry.size(); i ++) {
...@@ -118,25 +120,25 @@ public class FenghuangCommentAnalysis { ...@@ -118,25 +120,25 @@ public class FenghuangCommentAnalysis {
dataList.add(map); dataList.add(map);
} }
} catch (Exception e) { } catch (Exception e) {
logger.error("获取信息出错",e.getMessage()); logger.error("获取信息出错 {}",e);
return null; return Collections.emptyList();
} }
return dataList; return dataList;
} }
private Map<String,Object> getcommentData2(JSONObject json) { private Map<String,Object> getcommentData2(JSONObject json) {
Map<String,Object> map = new HashMap<String, Object>(); Map<String,Object> map = new HashMap<>();
try { try {
map.put("nickname", json.getString("uname")); map.put("nickname", json.getString("uname"));
map.put("content", json.getString("comment_contents").replaceAll("&quot;", "")); map.put("content", json.getString("comment_contents").replaceAll("&quot;", ""));
map.put("id", json.getString("articel_id")); map.put("id", json.getString("article_id"));
map.put("like", json.getString("uptimes")); map.put("like", json.getString("uptimes"));
map.put("from", json.getString("ip_from")); map.put("from", json.getString("ip_from"));
map.put("time", json.getString("comment_date")); map.put("time", TimeParse.stringFormartDate(json.getString("comment_date")));
} catch (Exception e) { } catch (Exception e) {
logger.error("具体解析一条数据出错",e.getMessage()); logger.error("具体解析一条数据出错 {}",e);
return null; return Collections.emptyMap();
} }
return map; return map;
} }
......
package com.zhiwei.parse.analysis;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.tools.timeparse.TimeParse;
/**
 * Turns the HTML of a Pcauto (太平洋汽车网) comment page into plain maps.
 */
public class PcautoCommentAnalysis {

    private static Logger logger = LoggerFactory.getLogger(PcautoCommentAnalysis.class);

    /**
     * Extracts every comment of one comment page together with a "has next page" flag.
     *
     * @param result HTML of the comment page
     * @return a map holding {@code "data"} (list of comment maps with the keys
     *         content/source/time/like) and {@code "next"} (Boolean, true when the
     *         "next page" link carries more than one character of text); an empty map
     *         when anything throws while parsing
     */
    public Map<String,Object> getPcautoComment(String result) {
        Map<String,Object> page = new HashMap<>();
        try {
            Document html = Jsoup.parse(result);
            List<Map<String,Object>> comments = new ArrayList<>();
            for (Element item : html.select("#commentTable > li")) {
                comments.add(toComment(item));
            }
            String nextText = html.select("div.pagecmt.clearfix > div.pcauto_page > a.next").text();
            page.put("data", comments);
            page.put("next", nextText != null && nextText.length() > 1);
            return page;
        } catch (Exception e) {
            logger.error("太平洋汽车网 评论页面解析出错 {}",e);
            return Collections.emptyMap();
        }
    }

    /**
     * Reads the fields of a single comment list item.
     * NOTE(review): "source" is the text inside the comment title link, presumably the
     * commenter's name - confirm against a live page.
     */
    private Map<String,Object> toComment(Element item) {
        Map<String,Object> comment = new HashMap<>();
        comment.put("content", item.select("div.cmtMain > div > div.tbTB.clearfix > p").text());
        comment.put("source", item.select("div.cmtMain > div > div.thTB > span.cmtTitle > a > em").text());
        String postedAt = item.select("div.cmtMain > div > div.thTB > span.cmtTime").text();
        comment.put("time", TimeParse.stringFormartDate(postedAt));
        comment.put("like", item.select("span.cmtSupportNum").text());
        return comment;
    }
}
package com.zhiwei.parse.analysis;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.tools.timeparse.TimeParse;
/**
 * Parses the JSON returned by the QicheHome comment interface.
 */
public class QicheHomeKwyWordAnalysis {

    private static final Logger logger = LoggerFactory.getLogger(QicheHomeKwyWordAnalysis.class);

    /**
     * Converts the {@code commentlist} array of a response into plain maps.
     *
     * @param result raw JSON response body
     * @return one map per comment with the keys name/nameId/content/time/like; an empty
     *         list when the response has no {@code commentlist} or cannot be parsed
     */
    public List<Map<String,Object>> getData(String result) {
        try {
            JSONObject json = JSONObject.parseObject(result);
            JSONArray jsonArray = json.getJSONArray("commentlist");
            if (jsonArray == null) {
                // No comment array in the payload: nothing to parse, and not an error.
                return Collections.emptyList();
            }
            List<Map<String,Object>> bodyList = new ArrayList<>();
            for (int i = 0; i < jsonArray.size(); i++) {
                JSONObject data = jsonArray.getJSONObject(i);
                Map<String,Object> map = new HashMap<>();
                map.put("name", data.getString("RMemberName"));
                map.put("nameId", data.getString("RMemberId"));
                map.put("content", data.getString("RContent"));
                // RReplyDate is expected to contain "Date(<value>+<offset>"; only <value> is kept.
                String time = data.getString("RReplyDate");
                String rawTime = time.split("Date\\(")[1].split("\\+")[0];
                map.put("time", TimeParse.dateFormartString(TimeParse.stringFormartDate(rawTime), "yyyy-MM-dd HH:mm:ss"));
                map.put("like", data.getInteger("RUp"));
                bodyList.add(map);
            }
            return bodyList;
        } catch (Exception e) {
            // Previously swallowed silently; keep the empty-list contract but leave a trace.
            logger.error("汽车之家 评论解析出错", e);
            return Collections.emptyList();
        }
    }

    /**
     * Derives a page count from the {@code commentcountall} field.
     *
     * @param result raw JSON response body
     * @return {@code commentcountall / 50 + 1}, or 2 when the field is missing or the
     *         response cannot be parsed (presumably 50 is the interface's page size -
     *         verify against the caller)
     */
    public int getCount(String result) {
        try {
            JSONObject json = JSONObject.parseObject(result);
            return json.getInteger("commentcountall") / 50 + 1;
        } catch (Exception e) {
            logger.error("汽车之家 评论总数解析出错", e);
            return 2;
        }
    }
}
package com.zhiwei.parse.analysis;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
/**
 * Parses the JSON returned by the Sina Tech (新浪科技) comment interface.
 */
public class SinaKejiCommentAnalysis {

    private static final Logger logger = LoggerFactory.getLogger(SinaKejiCommentAnalysis.class);

    /**
     * Converts {@code result.cmntlist} of a response into plain maps.
     *
     * @param result raw JSON response body
     * @return one map per comment with the keys content/time/source/like/location
     *         (read from content/time/nick/agree/area); an empty list when the payload
     *         has no comment list or cannot be parsed
     */
    public List<Map<String,Object>> getSinaCommet(String result) {
        try {
            JSONObject root = JSONObject.parseObject(result);
            JSONObject json = root == null ? null : root.getJSONObject("result");
            JSONArray jsonArray = json == null ? null : json.getJSONArray("cmntlist");
            if (jsonArray == null) {
                // A payload without a comment list is an empty page, not a parse failure.
                return Collections.emptyList();
            }
            List<Map<String, Object>> dataList = new ArrayList<>();
            for (int i = 0; i < jsonArray.size(); i++) {
                JSONObject data = jsonArray.getJSONObject(i);
                Map<String,Object> map = new HashMap<>();
                map.put("content", data.getString("content"));
                map.put("time", data.getString("time"));
                map.put("source", data.getString("nick"));
                map.put("like", data.getString("agree"));
                map.put("location", data.getString("area"));
                dataList.add(map);
            }
            return dataList;
        } catch (Exception e) {
            // The message used to name 腾讯科技 (copied from the Tencent parser) and carried a
            // "{}" placeholder that SLF4J never fills when the only argument is the Throwable.
            logger.error("新浪科技 评论解析出错", e);
        }
        return Collections.emptyList();
    }
}
...@@ -40,7 +40,7 @@ public class TXNewsByWordAnalysis { ...@@ -40,7 +40,7 @@ public class TXNewsByWordAnalysis {
map.put("id", js2.getString("id")); map.put("id", js2.getString("id"));
map.put("url", js2.getString("url")); map.put("url", js2.getString("url"));
dataList.add(map); dataList.add(map);
System.out.println(map.toString()); // System.out.println(map.toString());
} catch (Exception e) { } catch (Exception e) {
logger.error("采集出错:{}",e.getMessage()); logger.error("采集出错:{}",e.getMessage());
System.out.println(js2.toString()); System.out.println(js2.toString());
......
package com.zhiwei.parse.analysis;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.tools.timeparse.TimeParse;
/**
 * Parses the JSON returned by the Tencent Tech (腾讯科技) comment interface.
 */
public class TechTxCommentAnalysis {

    private static final Logger logger = LoggerFactory.getLogger(TechTxCommentAnalysis.class);

    /**
     * Converts {@code data.oriCommList} of a response into plain maps and works out the
     * cursor for the following page.
     *
     * @param result raw JSON response body
     * @return a map holding {@code "data"} (list of comment maps with the keys
     *         content/source/time/like/userId/id) and {@code "next"} (id of the last
     *         comment when {@code data.hasnext} is true, otherwise null); an empty map
     *         when the payload cannot be parsed
     */
    public Map<String, Object> getTechTxComment(String result) {
        try {
            // Parse once and share the "data" object with the nickname lookup.
            JSONObject json = JSONObject.parseObject(result).getJSONObject("data");
            Map<String,Object> userMap = getUserName(json);
            JSONArray jsonArray = json.getJSONArray("oriCommList");
            // Evaluated once, null-safe: a missing "hasnext" means "no further page"
            // instead of an unboxing NPE that throws the whole page away.
            boolean hasNext = Boolean.TRUE.equals(json.getBoolean("hasnext"));

            List<Map<String,Object>> dataList = new ArrayList<>();
            String lastId = null;
            for (int i = 0; i < jsonArray.size(); i++) {
                JSONObject data = jsonArray.getJSONObject(i);
                Map<String,Object> map = new HashMap<>();
                map.put("content", data.getString("content"));
                map.put("source", userMap.get(data.getString("userid")));
                // "000" is appended before parsing - presumably seconds to milliseconds.
                map.put("time", TimeParse.stringFormartDate(data.getString("time")+"000"));
                map.put("like", data.getString("up"));
                map.put("userId", data.getString("userid"));
                map.put("id", data.getString("id"));
                dataList.add(map);
                lastId = data.getString("id");
            }

            Map<String,Object> rMap = new HashMap<>();
            rMap.put("data", dataList);
            rMap.put("next", hasNext ? lastId : null);
            return rMap;
        } catch (Exception e) {
            logger.error("腾讯科技 输出错误", e);
            return Collections.emptyMap();
        }
    }

    /**
     * Builds a userid -> nickname lookup from the {@code userList} object.
     *
     * @param data the {@code data} object of the response
     * @return nickname (the {@code nick} value) keyed by user id; empty when the
     *         response carries no {@code userList}
     */
    private Map<String,Object> getUserName(JSONObject data) {
        Map<String,Object> nicknames = new HashMap<>();
        JSONObject users = data.getJSONObject("userList");
        if (users == null) {
            return nicknames;
        }
        for (String userId : users.keySet()) {
            nicknames.put(userId, users.getJSONObject(userId).get("nick"));
        }
        return nicknames;
    }
}
package com.zhiwei.parse.analysis;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.tools.timeparse.TimeParse;
/**
 * Parses the JSON returned by the Xueqiu (雪球) keyword search interface.
 */
public class XueqiuKeyWordAnalysis {

    private static final Logger logger = LoggerFactory.getLogger(XueqiuKeyWordAnalysis.class);

    /**
     * Converts the {@code list} array of a search response into plain maps.
     * HTML tags are stripped from title and text. A record that fails to parse is
     * logged and skipped; the remaining records are still returned.
     *
     * @param result  raw JSON response body
     * @param endTime currently unused - the time filter it fed is disabled
     * @return one map per post with the keys title/time/content/uper/url/likeCount/
     *         replyCount; an empty list when the payload has no {@code list} or cannot
     *         be parsed
     */
    public List<Map<String,Object>> getData(String result,String endTime) {
        List<Map<String,Object>> bodyList = new ArrayList<>();
        JSONArray jsonArray;
        try {
            jsonArray = JSONObject.parseObject(result).getJSONArray("list");
        } catch (Exception e) {
            // Previously swallowed silently; keep the empty-list contract but leave a trace.
            logger.error("雪球 搜索结果解析出错", e);
            return bodyList;
        }
        if (jsonArray == null) {
            return bodyList;
        }
        for (int i = 0; i < jsonArray.size(); i++) {
            try {
                JSONObject data = jsonArray.getJSONObject(i);
                Map<String,Object> map = new HashMap<>();
                Long time = data.getLong("created_at");
                map.put("title", data.getString("title").replaceAll("<.*?>", ""));
                map.put("time", TimeParse.dateFormartString(new Date(time), "yyyy-MM-dd HH:mm:ss"));
                map.put("content", data.getString("text").replaceAll("<.*?>", ""));
                map.put("uper", data.getJSONObject("user").getString("screen_name"));
                map.put("url", "https://xueqiu.com"+data.getString("target"));
                map.put("likeCount", data.getInteger("like_count"));
                map.put("replyCount", data.getInteger("reply_count"));
                bodyList.add(map);
            } catch (Exception e) {
                logger.error("解析出错", e);
            }
        }
        return bodyList;
    }
}
package com.zhiwei.Comment;
import org.testng.annotations.Test;
import com.zhiwei.parse.Aika;
import com.zhiwei.tools.timeparse.TimeExtraction;
import com.zhiwei.tools.timeparse.TimeParse;
/**
 * Manual smoke test for the Aika (xcar.com.cn) comment crawler.
 */
public class AikaComment {

    /** Crawls the comments of one sample article with a null proxy; asserts nothing. */
    @Test
    public void f() {
        Aika.getAikaComment("http://newcar.xcar.com.cn/201809/news_2021765_1.html", null);
    }
}
package com.zhiwei.Comment;
import java.util.List;
import java.util.Map;
import org.testng.annotations.Test;
import com.zhiwei.parse.Pcauto;
/**
 * Manual smoke test for the Pcauto comment crawler.
 */
public class PcautoComment {

    /** Crawls one sample article with a null proxy and prints how many comments came back. */
    @Test
    public void f() {
        List<Map<String,Object>> comments =
                Pcauto.getPcAutoComment("https://www.pcauto.com.cn/nation/1352/13523485.html", null);
        int total = comments.size();
        System.out.println(total);
    }
}
package com.zhiwei.Comment;
import org.testng.annotations.Test;
import com.zhiwei.parse.SinaKeji;
/**
 * Manual smoke test for the Sina Tech comment crawler.
 */
public class SinaKejiComment {

    /** Crawls the comments of one sample article with a null proxy; asserts nothing. */
    @Test
    public void f() {
        SinaKeji.getSinaKejiComment("https://tech.sina.com.cn/it/2018-07-17/doc-ihfkffam0632649.shtml", null);
    }
}
package com.zhiwei.Comment;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.testng.annotations.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.TechTx;
/**
 * Manual run of the Tencent Tech comment crawler that dumps the result to an Excel file.
 */
public class TechTxComment {

    /**
     * Crawls one sample article with a null proxy, exports the comments to a hard-coded
     * local path and prints the number of comments.
     */
    @Test
    public void f() {
        List<Map<String,Object>> comments = TechTx.getTechTxComment("http://tech.qq.com/a/20170629/005621.htm", null);
        PoiExcelUtil excel = PoiExcelUtil.getInstance();

        // Column order of the exported sheet.
        List<String> columns = new ArrayList<>();
        for (String column : new String[] {"time", "content", "source", "like", "userId", "id"}) {
            columns.add(column);
        }

        excel.exportExcel("D://crawlerdata//自媒体/腾讯科技评论采集.xlsx", "ces", columns, comments);
        System.out.println(comments.size());
    }
}
...@@ -67,7 +67,7 @@ public class BaijiaAccountExample { ...@@ -67,7 +67,7 @@ public class BaijiaAccountExample {
for(Map<String,Object> m : list) { for(Map<String,Object> m : list) {
try { try {
String app_id = m.get("id").toString(); String app_id = m.get("id").toString();
app_id = "1563725611969509"; app_id = "1594158489045754";
String name = m.get("name").toString(); String name = m.get("name").toString();
String cookie = "__cfduid=d847baca85b97d1967b3da02ebb345b831535524251; BAIDUID=C0F0F81EF770C5219AB9C178654135EC:FG=1; PSTM=1536376257; BIDUPSID=250CCE0442BEBCB3568D8EC515953434; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; locale=zh; delPer=0; H_PS_PSSID=1447_21117_20930; PSINO=5"; String cookie = "__cfduid=d847baca85b97d1967b3da02ebb345b831535524251; BAIDUID=C0F0F81EF770C5219AB9C178654135EC:FG=1; PSTM=1536376257; BIDUPSID=250CCE0442BEBCB3568D8EC515953434; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; locale=zh; delPer=0; H_PS_PSSID=1447_21117_20930; PSINO=5";
List<Map<String,Object>> lists = Baijia.getBaijiaAccountByBaiduData(app_id,name, startTime,cookie, null); List<Map<String,Object>> lists = Baijia.getBaijiaAccountByBaiduData(app_id,name, startTime,cookie, null);
......
...@@ -11,7 +11,7 @@ public class FenghuangCommentCountExample { ...@@ -11,7 +11,7 @@ public class FenghuangCommentCountExample {
@Test @Test
public void fenghuangCommentCountTest() { public void fenghuangCommentCountTest() {
String url = "http://wemedia.ifeng.com/40906977/wemedia.shtml"; String url = "http://tech.ifeng.com/a/20181113/45222352_0.shtml";
//http://news.ifeng.com/a/20161229/50492484_0.shtml //http://news.ifeng.com/a/20161229/50492484_0.shtml
//http://wemedia.ifeng.com/4096977/wemedia.shtml //http://wemedia.ifeng.com/4096977/wemedia.shtml
Map<String,Object> map = Fenghuang.getFenghuangCommentCount(url,null); Map<String,Object> map = Fenghuang.getFenghuangCommentCount(url,null);
......
package com.zhiwei.keyword;
import org.testng.annotations.Test;
import com.zhiwei.parse.QicheHome;
/**
 * Manual smoke test for the QicheHome comment crawler.
 */
public class QicheKeyWord {

    /** Crawls the comments of one sample article id with a null second argument; asserts nothing. */
    @Test
    public void f() {
        QicheHome.getQiCheComment("922761", null);
    }
}
package com.zhiwei.keyword;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.testng.annotations.Test;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Xueqiu;
/**
 * Manual run of the Xueqiu keyword crawler that dumps all hits to an Excel file.
 */
public class XueqiuKeyWord {

    /**
     * Crawls every '|'-separated keyword, merges the hits and exports them to a
     * hard-coded local path. Proxy factory initialisation is not performed here.
     * NOTE(review): the cookie below is a captured browser session - it will expire and
     * should not live in source control.
     */
    @Test
    public void f() {
        String keywords = "腾讯 股价|腾讯 市值|腾讯 估值|腾讯 投资|腾讯 财报";
        String endTime = "2018-01-01 00:00:00";
        String cookie = "_ga=GA1.2.590296572.1538275545; device_id=812aac42a8874c32ca97893a7b270684; s=ff12lgxbt1; __utma=1.590296572.1538275545.1538280279.1538280279.1; __utmz=1.1538280279.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); aliyungf_tc=AQAAAPtocWy01ggA6tbncwYt4OIDsaiG; xq_a_token=088c6ad5e275496d7c91b8b5b2ecb929bee15772; xq_a_token.sig=NcpAm7FRsAVoMGRMOLrLRveBT7U; xq_r_token=08131eb9a4f33b43b2340fd782f3776c9823d74a; xq_r_token.sig=AZ7au6ICJtTgQJVPybkOJs_Fr54; Hm_lvt_1db88642e346389874251b5a1eded6e3=1539599526,1539913171,1539913178,1541157360; _gid=GA1.2.1817290343.1541157360; u=721541157360383; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1541212289";
        PoiExcelUtil excel = PoiExcelUtil.getInstance();

        List<Map<String,Object>> allHits = new ArrayList<Map<String,Object>>();
        for (String keyword : keywords.split("\\|")) {
            System.out.println(keyword);
            List<Map<String, Object>> hits = Xueqiu.getData(keyword, endTime, null, cookie);
            System.out.println(keyword + " ---- " + hits.size());
            allHits.addAll(hits);
        }

        // Column order of the exported sheet.
        List<String> columns = new ArrayList<String>();
        for (String column : new String[] {"title", "time", "content", "uper", "url", "likeCount", "replyCount"}) {
            columns.add(column);
        }

        excel.exportExcel("D:\\crawlerdata\\自媒体\\雪球网页采集-腾讯-relevance.xlsx", "马化腾", columns, allHits);
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment