Commit 1116d3c5 by yangchen

易车网 评论采集添加

parent 7fb5554a
......@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>articlenewscrawler</artifactId>
<version>0.0.3-SNAPSHOT</version>
<version>0.0.4-SNAPSHOT</version>
<name>articlenewscrawler</name>
<description>采集凤凰,一点资讯,搜狐历时文章和文章评论</description>
......@@ -14,11 +14,6 @@
<version>6.14.3</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.8.3</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.29</version>
......@@ -36,7 +31,13 @@
<dependency>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.0.9-SNAPSHOT</version>
<version>0.1.1-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId>
<version>0.1.1-RELEASE</version>
<scope>provided</scope>
</dependency>
</dependencies>
......
package com.zhiwei.parse;
import java.io.IOException;
import static java.util.Objects.nonNull;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.poi.xwpf.usermodel.BodyElementType;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
......@@ -16,7 +15,6 @@ import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static java.util.Objects.nonNull;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.parse.analysis.DoubanCommentAnalysis;
......@@ -66,7 +64,7 @@ public class Douban {
if(nonNull(elements)) {
for (Element element : elements) {
link = element.select("td.td-subject").select("a").attr("href");
title = element.select("td.td-subject").select("a").text();
title = element.select("td.td-subject").select("a").attr("title");
time = element.select("td.td-time").attr("title");
replyCount = Integer.valueOf(element.select("td.td-reply").select("span").text().split("回应")[0].trim());
group = element.select("td").get(3).text();
......@@ -79,9 +77,10 @@ public class Douban {
map.put("time", time);
map.put("reply_count", replyCount);
bodyList.add(map);
// System.out.println(map.toString());
}
}
if(cou == bodyList.size()){
if(bodyList.size() - cou < 30){
more = false;
}
logger.info("采集到 第 {} 页 ,一共采集到 {} 条 是否有下一页 {}",page,bodyList.size(),more);
......@@ -95,7 +94,7 @@ public class Douban {
logger.error("豆瓣 topic 采集出错 {}",e);
}
}
return Collections.emptyList();
return bodyList;
}
/**
......
......@@ -19,6 +19,7 @@ import com.zhiwei.parse.analysis.SouhuAccountAnalysis;
import com.zhiwei.parse.analysis.SouhuCommentAnalysis;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.ZhiWeiTools;
import static java.util.Objects.nonNull;
public class Souhu {
private static Logger logger = LoggerFactory.getLogger(Souhu.class);
......@@ -33,15 +34,17 @@ public class Souhu {
* @return
*/
public static int getSouhuCommentCount(String url,Proxy proxy) {
String newurl = souhuCommentAnalysis.getSouhuURL(url);
int i;
try {
String newurl = souhuCommentAnalysis.getSouhuURL(url,proxy);
if(nonNull(newurl)) {
int i;
i = souhuCommentAnalysis.getSouhuCommentCount(newurl,proxy);
return i;
}
} catch (Exception e) {
logger.error("搜狐获取评论数出错了",e.getMessage());
return 0;
logger.error("搜狐获取评论数出错了 {}",e);
}
return -1;
}
......@@ -137,11 +140,11 @@ public class Souhu {
*/
public static List<Map<String,Object>> getSouhuCommentData(String url,Proxy proxy) {
Map<String,String> headerMap = HeadGet.getSouhuCommentHeaderMap(null);
List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
List<Map<String,Object>> dataList = new ArrayList<>();
int j = 1;
try {
while(true) {
String newurl = souhuCommentAnalysis.getSouhuURL(url) + "&page_no=" + j;
String newurl = souhuCommentAnalysis.getSouhuURL(url,proxy) + "&page_no=" + j;
String result = HttpClient.executeHttpRequestGet(newurl,proxy,headerMap);
System.out.println(newurl);
JSONObject json = JSONObject.parseObject(result);
......@@ -159,8 +162,7 @@ public class Souhu {
}
} catch (Exception e) {
e.printStackTrace();
logger.error("获取搜狐文章评论出错",e.getMessage());
logger.error("获取搜狐文章评论出错 {}",e);
}
return dataList;
......
......@@ -62,11 +62,15 @@ public class Wangyi {
* @return
*/
public static int getWangyiCommentCount(String id,Proxy proxy) {
    // Thread endpoint of the NetEase comment API for the given article id.
    final String threadUrl = "http://comment.dy.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/" + id;
    try {
        Map<String,String> requestHeaders = HeadGet.getWangyiCommentHeaderMap(null);
        String responseBody = HttpClient.executeHttpRequestGet(threadUrl, proxy, requestHeaders);
        Integer commentTotal = JSONObject.parseObject(responseBody).getInteger("tcount");
        // A missing "tcount" (null) fails on unboxing here and is reported as -1 below,
        // exactly like any request or parse failure.
        return commentTotal.intValue();
    } catch (Exception e) {
        // Any failure is reported to the caller as -1; nothing is logged here.
        return -1;
    }
}
public static List<Map<String,Object>> getHistoryData(String url,Proxy proxy,String endTime) {
......
package com.zhiwei.parse;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.tools.tools.ZhiWeiTools;
import static java.util.Objects.nonNull;
import okhttp3.Response;
/**
 * Crawler for the comments of Yiche (bitauto.com) news articles.
 *
 * <p>The article page is downloaded first to extract the {@code productId} and
 * {@code newsId} embedded in it; those two values are then used to query the
 * comment API for either the total comment count or the paged comment list.
 */
public class Yiche {
    private static final Logger logger = LoggerFactory.getLogger(Yiche.class);
    private static final HttpBoot httpBoot = new HttpBoot();

    /** Base URL of the comment API. */
    private static final String COMMENT_API = "http://newsapi.bitauto.com/comment/comment/getdata";
    /** Comments requested per page; also used to detect the last page. */
    private static final int PAGE_SIZE = 50;
    /** Pause between two consecutive page requests, in milliseconds. */
    private static final int PAGE_INTERVAL_MILLIS = 2000;

    /**
     * Fetches the total number of comments of a Yiche article.
     *
     * @param url   article page URL
     * @param proxy proxy handed to the HTTP client for every request
     * @return the {@code total} reported by the comment API, or {@code -1} when the
     *         article page or the API response cannot be fetched or parsed
     */
    public static int getYicheCount(String url,Proxy proxy) {
        String nurl = getnewsId(url, proxy);
        if (nurl == null) {
            return -1;
        }
        try {
            JSONObject result = fetchResult(nurl, proxy);
            Integer total = result == null ? null : result.getInteger("total");
            if (total != null) {
                return total;
            }
            logger.error("Yiche comment total missing in response of {}", nurl);
        } catch (Exception e) {
            logger.error("Failed to fetch Yiche comment count from {}", nurl, e);
        }
        return -1;
    }

    /**
     * Collects all comments of a Yiche article, page by page.
     *
     * <p>Each comment is a map with the keys {@code source}, {@code time},
     * {@code content}, {@code like} and {@code id}. Paging stops when the API
     * reports that everything has been delivered, when a page comes back empty,
     * or when a request fails; in the failure case the comments collected so far
     * are still returned.
     *
     * @param url   article page URL
     * @param proxy proxy handed to the HTTP client for every request
     * @return the collected comments; an empty list when the article page could
     *         not be resolved to a comment API URL
     */
    public static List<Map<String,Object>> getYicheComment(String url,Proxy proxy) {
        String nUrl = getnewsId(url, proxy);
        if (nUrl == null) {
            return Collections.emptyList();
        }
        List<Map<String, Object>> bodyList = new ArrayList<>();
        for (int page = 1; ; page++) {
            String surl = nUrl + "&pageSize=" + PAGE_SIZE + "&isHot=false&pageIndex=" + page;
            try {
                JSONObject result = fetchResult(surl, proxy);
                if (result == null) {
                    logger.error("Yiche comment response has no result object: {}", surl);
                    break;
                }
                JSONArray jsonArray = result.getJSONArray("list");
                int fetched = jsonArray == null ? 0 : jsonArray.size();
                for (int i = 0; i < fetched; i++) {
                    JSONObject data = jsonArray.getJSONObject(i);
                    Map<String,Object> map = new HashMap<>();
                    map.put("source", data.getString("showName"));
                    map.put("time", data.getString("createTime"));
                    map.put("content", data.getString("content"));
                    map.put("like", data.get("likeCount"));
                    map.put("id", data.getString("id"));
                    bodyList.add(map);
                }
                Integer total = result.getInteger("total");
                logger.info(" 一共采集 了 {} 条 采集到 {} 页 一共有 {} 条",bodyList.size(),page,total);
                // ">=" (not ">") so that a total that is an exact multiple of the page
                // size does not trigger one more, empty, request.
                if (fetched == 0 || total == null || page * PAGE_SIZE >= total) {
                    break;
                }
            } catch (Exception e) {
                logger.error("Failed to fetch Yiche comments from {}", surl, e);
                break;
            }
            // Only pause when another page is actually going to be requested.
            ZhiWeiTools.sleep(PAGE_INTERVAL_MILLIS);
        }
        return bodyList;
    }

    /**
     * Downloads the article page and derives the comment API URL from it.
     *
     * @return the comment API URL, or {@code null} when the page cannot be loaded
     *         or does not contain the expected id markers
     */
    private static String getnewsId(String url,Proxy proxy) {
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)){
            String apiUrl = buildCommentApiUrl(response.body().string());
            if (apiUrl == null) {
                logger.error("productId/newsId not found in Yiche page {}", url);
            }
            return apiUrl;
        } catch (Exception e) {
            logger.error("Failed to load Yiche page {}", url, e);
        }
        return null;
    }

    /** Requests the given comment API URL and returns the {@code result} object of the JSON reply, or {@code null} if absent. */
    private static JSONObject fetchResult(String apiUrl, Proxy proxy) throws Exception {
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(apiUrl), proxy)) {
            JSONObject json = JSONObject.parseObject(response.body().string());
            return json == null ? null : json.getJSONObject("result");
        }
    }

    /** Builds the comment API URL from the ids embedded in the page source, or returns {@code null} if either id is missing. */
    private static String buildCommentApiUrl(String html) {
        String productId = between(html, "productId: ", ",");
        String objectId = between(html, "newsId = '", "',");
        if (productId == null || objectId == null) {
            return null;
        }
        return COMMENT_API + "?productId=" + productId + "&objectId=" + objectId;
    }

    /** Returns the non-empty text between the first {@code startMarker} and the following {@code endMarker}, or {@code null}. */
    private static String between(String text, String startMarker, String endMarker) {
        int start = text.indexOf(startMarker);
        if (start < 0) {
            return null;
        }
        start += startMarker.length();
        int end = text.indexOf(endMarker, start);
        if (end <= start) {
            // Terminator missing, or the value between the markers is empty.
            return null;
        }
        return text.substring(start, end);
    }
}
......@@ -42,7 +42,7 @@ public class DoubanCommentAnalysis {
map.put("content", content);
map.put("id", id);
map.put("like", getLikeNum(result, "c"+id));
System.out.println(map.toString());
// System.out.println(map.toString());
bodyList.add(map);
}
}
......
......@@ -9,12 +9,17 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.httpclient.HeadGet;
import com.zhiwei.httpclient.HttpClient;
import okhttp3.Response;
public class SouhuCommentAnalysis {
private static Logger logger = LoggerFactory.getLogger(SouhuCommentAnalysis.class);
private HttpBoot httpBoot = new HttpBoot();
/**
*
......@@ -22,35 +27,30 @@ public class SouhuCommentAnalysis {
* @param url
* @return
*/
public String getSouhuURL(String url) {
String topic_id = "";
String source_id = "";
try {
if(url.contains("?")){
url = url.split("\\?")[0];
}
String s = url.split("a/")[1];
topic_id = s.split("_")[1];
source_id = s.split("_")[0];
public String getSouhuURL(String url,Proxy proxy) {
try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)){
String result = response.body().string();
String source_id = result.split("news_id: \"")[1].split("\",")[0];
String topic_id = result.split("media_id: \"")[1].split("\",")[0];
return "http://apiv2.sohu.com/api/comment/list?page_size=10&topic_id="+topic_id+"&source_id=mp_"+source_id;
} catch (Exception e) {
logger.error("链接解析错误",e.getMessage());
return null;
logger.error("Exception {} ",e);
}
String newurl = "http://apiv2.sohu.com/api/comment/list?page_size=10&topic_id="+topic_id+"&source_id=mp_"+source_id;
return newurl;
return null;
}
public int getSouhuCommentCount(String url,Proxy proxy) {
Map<String,String> headerMap = HeadGet.getSouhuCommentHeaderMap(null);
int i;
try {
System.out.println(url);
String result = HttpClient.executeHttpRequestGet(url,proxy, headerMap);
JSONObject json = JSONObject.parseObject(result);
i = json.getJSONObject("jsonObject").getInteger("participation_sum");
i = json.getJSONObject("jsonObject").getInteger("cmt_sum");
return i;
} catch (Exception e) {
logger.error("获取搜狐评论数信息出错",e.getMessage());
return 0;
logger.error("获取搜狐评论数信息出错 {}",e);
return -1;
}
}
......
package com.zhiwei.Comment;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.testng.annotations.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Douban;
/**
 * Manual integration test: crawls the comments of one Douban group topic and
 * exports them to an Excel file.
 *
 * <p>The Douban session cookie is deliberately NOT kept in source control. Supply
 * it with {@code -Ddouban.cookie=...} or the {@code DOUBAN_COOKIE} environment
 * variable. The output file can be overridden with {@code -Ddouban.output=...}.
 */
public class DoubanCommentTest {
    @Test
    public void f() {
        String url = "https://www.douban.com/group/topic/128726395/";
        // A logged-in session cookie is a credential; read it from the environment
        // instead of hard-coding it.
        String cookie = System.getProperty("douban.cookie", System.getenv("DOUBAN_COOKIE"));
        if (cookie == null || cookie.trim().isEmpty()) {
            throw new IllegalStateException(
                    "Douban cookie not configured: set -Ddouban.cookie=... or the DOUBAN_COOKIE environment variable");
        }
        String outputPath = System.getProperty("douban.output", "D://crawlerdata//自媒体/douban评论采集-2.xlsx");
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        List<Map<String,Object>> bodyList = Douban.getDoubanComment(url, null, cookie);
        // Column keys passed as the header list of the export.
        List<String> headList = new ArrayList<>();
        headList.add("source");
        headList.add("time");
        headList.add("like");
        headList.add("content");
        headList.add("id");
        // NOTE(review): the second argument is presumably the sheet name — confirm against PoiExcelUtil.
        poi.exportExcel(outputPath, "asd", headList, bodyList);
    }
}
//package com.zhiwei.Comment;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.testng.annotations.Test;
//
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Douban;
//
//public class DoubanCommentTest {
// @Test
// public void f() {
// String url = "https://www.douban.com/group/topic/128726395/";
// String cookie = "bid=rymxzs5aojg; ps=y; ll=\"118173\"; __utmc=30149280; dbcl2=\"188038058:9IHyVcSobVc\"; ck=_RvF; push_noty_num=0; push_doumail_num=0; __utmv=30149280.18803; douban-fav-remind=1; __yadk_uid=qLflXyj3R14ro9e0cLoZOQlJoMGVN32j; douban-profile-remind=1; _vwo_uuid_v2=D85F60C118B0AF465035D9CC7BBFDA7A6|4bf255e1e3a2e9aeede3708192f5f1bc; __utmz=30149280.1543564973.3.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; ap_v=0,6.0; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1543823236%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DPk19bxnhsVWwfnrcnwT0PquON7D1JpLsbfSu9rRowalyi4pOeM3rMHKFaJo9jJF7%26wd%3D%26eqid%3De7f262650001ef98000000045c00e64f%22%5D; _pk_id.100001.8cb4=6828fef49f6bcf34.1543559455.5.1543823236.1543820463.; _pk_ses.100001.8cb4=*; __utma=30149280.824403997.1543559458.1543818802.1543823236.6; __utmt=1; __utmb=30149280.5.7.1543823236";
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
// List<Map<String,Object>> bodyList = Douban.getDoubanComment(url, null, cookie);
// List<String> headList = new ArrayList<>();
// headList.add("source");
// headList.add("time");
// headList.add("like");
// headList.add("content");
// headList.add("id");
//
// poi.exportExcel("D://crawlerdata//自媒体/douban评论采集-2.xlsx", "asd", headList, bodyList);
// }
//}
//package com.zhiwei.Comment;
//
//import java.util.ArrayList;
//import java.util.HashMap;
//import java.util.List;
//import java.util.Map;
//
//import org.testng.annotations.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Fenghuang;
//import com.zhiwei.parse.Yiche;
//import com.zhiwei.tools.tools.ZhiWeiTools;
//
//public class YicheCommentCountTest {
// @SuppressWarnings("unchecked")
// @Test
// public void f() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",
// GroupType.PROVIDER);
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
// Map<String, Object> map = poi
// .importExcel("D://crawlerdata//自媒体/易车链接.xlsx", 0);
// List<Map<String, Object>> list = (List<Map<String, Object>>) map.get("body");
// List<Map<String, Object>> bodyList = new ArrayList<Map<String, Object>>();
// List<String> headList = (List<String>) map.get("head");
// for (Map<String, Object> map1 : list) {
// String url = map1.get("易车链接") + "";
// url = "http://news.bitauto.com/xinchexiaoxi/20181212/2309130374.html#comment";
// System.out.println(url);
//// int i = Yiche.getYicheCount(url, ProxyFactory.getNatProxy());
//// System.out.println(i);
//// map1.put("count", i);
// Yiche.getYicheComment(url, ProxyFactory.getNatProxy());
// ZhiWeiTools.sleep(500);
// }
// headList.add("count");
// poi.exportExcel("D://crawlerdata//自媒体/易车链接.xlsx", "评论采集", headList,
// list);
//
// }
//}
......@@ -20,17 +20,6 @@ import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
public class SinaCommentListTest {
public static void main(String[] args) {
List<String> urlList = new ArrayList<String>();
for(String url : urlList){
sinaCommentListTest(url);
}
}
public static void sinaCommentListTest(String url) {
Map<String,String> headerMap = HeaderTool.getCommonHead();
String newsId = getCommentId(url).split("=====")[1];
......
......@@ -9,7 +9,7 @@ public class SouhuCommentCountExample {
@Test
public void souhuCommentCountTest() {
String url = "https://www.sohu.com/a/210588884_267106?_f=index_news_7";
String url = "http://www.sohu.com/a/281414426_133392";
int i = Souhu.getSouhuCommentCount(url,null);
System.out.println(i);
......
package com.zhiwei.crawler;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Wangyi;
import com.zhiwei.tools.tools.ZhiWeiTools;
public class WangyiCommentCountExample {
@Test
public void wangyiCommentCountTest() {
String id = "D77CENT50001875P";
PoiExcelUtil poi = PoiExcelUtil.getInstance();
String path = "D:\\crawlerdata\\自媒体\\网易评论采集-1.xlsx";
Map<String,Object> map = poi.importExcel(path, 0);
List<Map<String,Object>> list = (List<Map<String, Object>>) map.get("body");
List<String> urlList = new ArrayList<String>();
for(Map<String,Object> u : list) {
String url = u.get("链接")+"";
urlList.add(url);
}
int i = Wangyi.getWangyiCommentCount(id,null);
System.out.println(i);
List<Map<String,Object>> bodyList = new ArrayList<>();
for(String url : urlList) {
String id = url.split("/")[url.split("/").length-1].split(".ht")[0];
System.out.println(id);
int lists = Wangyi.getWangyiCommentCount(id, null);
ZhiWeiTools.sleep(3000);
}
List<String> headList = new ArrayList<String>();
headList.add("content");
headList.add("id");
headList.add("time");
headList.add("name");
headList.add("like");
headList.add("unlike");
headList.add("from_url");
poi.exportExcel(path, "评论数据", headList, bodyList);
}
......
......@@ -7,9 +7,9 @@
//public class DoubanTopicTest {
// @Test
// public void f() {
// String word = "唐嫣";
// String cookie = "bid=rymxzs5aojg; ps=y; ll=\"118173\"; __utmc=30149280; __utmz=30149280.1543559458.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); dbcl2=\"188038058:9IHyVcSobVc\"; ck=_RvF; ap_v=0,6.0; push_noty_num=0; push_doumail_num=0; __utmv=30149280.18803; douban-fav-remind=1; __yadk_uid=qLflXyj3R14ro9e0cLoZOQlJoMGVN32j; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1543562805%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DPk19bxnhsVWwfnrcnwT0PquON7D1JpLsbfSu9rRowalyi4pOeM3rMHKFaJo9jJF7%26wd%3D%26eqid%3De7f262650001ef98000000045c00e64f%22%5D; _pk_ses.100001.8cb4=*; douban-profile-remind=1; __utma=30149280.824403997.1543559458.1543559458.1543562809.2; __utmt=1; _pk_id.100001.8cb4=6828fef49f6bcf34.1543559455.2.1543564606.1543559542.; __utmb=30149280.227.9.1543564257221";
// String time = "2018-11-27 15:47:41";
// String word = "胡歌";
// String cookie = "bid=rymxzs5aojg; ps=y; ll=\"118173\"; __utmc=30149280; dbcl2=\"188038058:9IHyVcSobVc\"; ck=_RvF; push_noty_num=0; push_doumail_num=0; __utmv=30149280.18803; douban-fav-remind=1; __yadk_uid=qLflXyj3R14ro9e0cLoZOQlJoMGVN32j; douban-profile-remind=1; _vwo_uuid_v2=D85F60C118B0AF465035D9CC7BBFDA7A6|4bf255e1e3a2e9aeede3708192f5f1bc; __utmz=30149280.1543564973.3.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1543908324%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DPk19bxnhsVWwfnrcnwT0PquON7D1JpLsbfSu9rRowalyi4pOeM3rMHKFaJo9jJF7%26wd%3D%26eqid%3De7f262650001ef98000000045c00e64f%22%5D; _pk_ses.100001.8cb4=*; ap_v=0,6.0; __utma=30149280.824403997.1543559458.1543885946.1543908324.10; __utmt=1; _pk_id.100001.8cb4=6828fef49f6bcf34.1543559455.9.1543908331.1543885945.; __utmb=30149280.9.7.1543908324";
// String time = "2018-11-16 00:00:00";
//
// Douban.doubanTopicGetByWord(word, null, cookie,time);
//
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment