Commit 5bb9510d by win 10

解决冲突

parent a56fa9e1
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId> <groupId>com.zhiwei</groupId>
<artifactId>media_data_crawler</artifactId> <artifactId>media_data_crawler</artifactId>
<version>0.1.2-SNAPSHOT</version> <version>0.1.3-SNAPSHOT</version>
<name>media_data_crawler</name> <name>media_data_crawler</name>
<description>网媒数据抓取,包含百度新闻、搜狗新闻、360新闻、知乎回答列表等</description> <description>网媒数据抓取,包含百度新闻、搜狗新闻、360新闻、知乎回答列表等</description>
...@@ -16,9 +16,23 @@ ...@@ -16,9 +16,23 @@
<dependency> <dependency>
<groupId>com.zhiwei.crawler</groupId> <groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId> <artifactId>crawler-core</artifactId>
<version>0.5.2-RELEASE</version> <version>0.6.1.0-SNAPSHOT</version>
<scope>provided</scope> <scope>provided</scope>
</dependency> </dependency>
<!-- excel导出 -->
<dependency>
<groupId>com.zhiwei</groupId>
<artifactId>excelpoi</artifactId>
<version>0.0.3-SNAPSHOT</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>easyexcel</artifactId>
<version>2.0.0-beta3</version>
<scope>provided</scope>
</dependency>
</dependencies> </dependencies>
<!-- 打包管理 --> <!-- 打包管理 -->
......
...@@ -16,8 +16,11 @@ import org.slf4j.LoggerFactory; ...@@ -16,8 +16,11 @@ import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils; import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.ZhihuAnswer; import com.zhiwei.media_data_crawler.entity.ZhihuAnswer;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response; import okhttp3.Response;
...@@ -49,6 +52,50 @@ public class ZhihuAnwserCrawlerParse { ...@@ -49,6 +52,50 @@ public class ZhihuAnwserCrawlerParse {
return -1; return -1;
} }
/**
 * Collects every answer of a Zhihu question, page by page.
 *
 * <p>If {@code url} points at a single answer (contains {@code "/answer"}), it is
 * first trimmed back to the question URL. Pages are fetched through
 * {@link DataCrawler#getAnswerList(String, int, Date, ProxyHolder)} until the
 * response reports that there are no more pages, or until the same page has
 * failed (exception or empty result) ten times in a row.
 *
 * <p>This method never throws: failures are logged and whatever was collected
 * so far is returned.
 *
 * @param url question URL or answer URL; {@code null} yields an empty list
 * @return the answers collected so far, never {@code null}
 */
public static List<ZhihuAnswer> getPictureCount(String url) {
    // Give up on a page after this many consecutive failed/empty attempts.
    final int maxConsecutiveFailures = 10;

    List<ZhihuAnswer> answerList = new ArrayList<>();
    logger.info("知乎回答采集开始:{}",url);
    if (url == null) {
        // Previously this case surfaced as a silently swallowed NullPointerException.
        logger.info("知乎回答采集结束:{}",url);
        return answerList;
    }

    String questionUrl = url.contains("/answer") ? url.split("/answer")[0] : url;
    try {
        int page = 0;
        int consecutiveFailures = 0;
        while (consecutiveFailures < maxConsecutiveFailures) {
            try {
                // The per-page parser keeps only answers created after the given date,
                // so a date far in the past effectively disables that filter.
                Map<String, Object> dataMap = DataCrawler.getAnswerList(
                        questionUrl, page, TimeParse.stringFormartDate("2000-01-01"), ProxyHolder.NAT_HEAVY_PROXY);

                // The "data" entry is populated with a List<ZhihuAnswer> by the page parser.
                @SuppressWarnings("unchecked")
                List<ZhihuAnswer> list = (List<ZhihuAnswer>) dataMap.get("data");

                if (list != null && !list.isEmpty()) {
                    logger.info("知乎回答采集链接:{} 页数 {} ,此页总数 {}",questionUrl,page,list.size());
                    answerList.addAll(list);
                    consecutiveFailures = 0;
                    page++;
                } else {
                    // Empty page: retry the same page number on the next iteration.
                    consecutiveFailures++;
                }

                // Null-safe check: a missing "more" flag ends the crawl instead of throwing
                // after the page was already appended (which caused duplicate answers on retry).
                if (!Boolean.TRUE.equals(dataMap.get("more"))) {
                    break;
                }
            } catch (Exception e) {
                // Pass the exception as the trailing argument so SLF4J logs the stack trace.
                logger.error("知乎回答采集异常:{} 页数 {}", questionUrl, page, e);
                consecutiveFailures++;
            }
        }
    } catch (Exception e) {
        // Best-effort contract: report the failure instead of discarding it, then
        // fall through and return what has been collected.
        logger.error("知乎回答采集失败:{}", questionUrl, e);
    }
    logger.info("知乎回答采集结束:{}",questionUrl);
    return answerList;
}
/** /**
* 知乎回答采集 * 知乎回答采集
* @param url * @param url
...@@ -57,7 +104,7 @@ public class ZhihuAnwserCrawlerParse { ...@@ -57,7 +104,7 @@ public class ZhihuAnwserCrawlerParse {
* @return * @return
* @throws Exception * @throws Exception
*/ */
public static List<ZhihuAnswer> getAnswerList(String url, Date endDate, Proxy proxy) throws Exception{ public static List<ZhihuAnswer> getAnswerList(String url, Date endDate, ProxyHolder proxy) throws Exception{
try{ try{
List<ZhihuAnswer> answerList = new ArrayList<>(); List<ZhihuAnswer> answerList = new ArrayList<>();
String questionId = getQuestionId(url); String questionId = getQuestionId(url);
...@@ -80,7 +127,7 @@ public class ZhihuAnwserCrawlerParse { ...@@ -80,7 +127,7 @@ public class ZhihuAnwserCrawlerParse {
more = false; more = false;
} }
//单线程采集避免被封休眠8s //单线程采集避免被封休眠8s
ZhiWeiTools.sleep(8000); // ZhiWeiTools.sleep(3000);
page++; page++;
}catch (Exception e){ }catch (Exception e){
more = false; more = false;
...@@ -92,7 +139,6 @@ public class ZhihuAnwserCrawlerParse { ...@@ -92,7 +139,6 @@ public class ZhihuAnwserCrawlerParse {
} }
} }
/** /**
* 获取问题的关注者和浏览量 * 获取问题的关注者和浏览量
* @param url * @param url
...@@ -100,7 +146,7 @@ public class ZhihuAnwserCrawlerParse { ...@@ -100,7 +146,7 @@ public class ZhihuAnwserCrawlerParse {
* @return * @return
* @throws Exception * @throws Exception
*/ */
private static String getNumberBoard(String url, Proxy proxy) throws Exception{ private static String getNumberBoard(String url, ProxyHolder proxy) throws Exception{
try{ try{
String body = download(url, proxy); String body = download(url, proxy);
Document document = Jsoup.parse(body); Document document = Jsoup.parse(body);
...@@ -117,10 +163,6 @@ public class ZhihuAnwserCrawlerParse { ...@@ -117,10 +163,6 @@ public class ZhihuAnwserCrawlerParse {
} }
} }
/** /**
* 获取单页数据 * 获取单页数据
* @param url * @param url
...@@ -130,17 +172,16 @@ public class ZhihuAnwserCrawlerParse { ...@@ -130,17 +172,16 @@ public class ZhihuAnwserCrawlerParse {
* @return * @return
* @throws Exception * @throws Exception
*/ */
public static Map<String,Object> getAnswerList(String url, int page, Date endDate, Proxy proxy) throws Exception{ public static Map<String,Object> getAnswerList(String url, int page, Date endDate, ProxyHolder proxy) throws Exception{
try{ try{
String questionId = getQuestionId(url); String questionId = getQuestionId(url);
String bord = getNumberBoard(url, proxy); String bord = getNumberBoard(url, proxy);
return analsis(questionId,endDate,page,bord ,proxy); return analsis(questionId,endDate,page,bord ,proxy);
}catch (Exception e){ }catch (Exception e){
throw e; throw e;
} }
} }
/** /**
* 解析数据 * 解析数据
* @param questionId * @param questionId
...@@ -150,7 +191,7 @@ public class ZhihuAnwserCrawlerParse { ...@@ -150,7 +191,7 @@ public class ZhihuAnwserCrawlerParse {
* @return * @return
* @throws Exception * @throws Exception
*/ */
private static Map<String,Object> analsis(String questionId, Date endDate, int page, String bord, Proxy proxy) throws Exception{ private static Map<String,Object> analsis(String questionId, Date endDate, int page, String bord, ProxyHolder proxy) throws Exception{
try{ try{
boolean more = true; boolean more = true;
List<ZhihuAnswer> answerList = new ArrayList<>(); List<ZhihuAnswer> answerList = new ArrayList<>();
...@@ -160,23 +201,29 @@ public class ZhihuAnwserCrawlerParse { ...@@ -160,23 +201,29 @@ public class ZhihuAnwserCrawlerParse {
Integer count = dataJson.getJSONObject("paging").getInteger("totals"); Integer count = dataJson.getJSONObject("paging").getInteger("totals");
JSONArray jsonArray = dataJson.getJSONArray("data"); JSONArray jsonArray = dataJson.getJSONArray("data");
String from_url = "https://www.zhihu.com/question/" + questionId; String fromUrl = "https://www.zhihu.com/question/" + questionId;
Integer sort = page*20 + 1;
for(int i=0; i<jsonArray.size(); i++){ for(int i=0; i<jsonArray.size(); i++){
JSONObject answerJson = jsonArray.getJSONObject(i); JSONObject answerJson = jsonArray.getJSONObject(i);
Date time = new Date(answerJson.getLong("created_time")*1000); Date time = new Date(answerJson.getLong("created_time")*1000);
if(time.after(endDate)){ if(time.after(endDate)){
String answerId = answerJson.getString("id"); String answerId = answerJson.getString("id");
String link = from_url+"/answer/" + answerId; String link = fromUrl+"/answer/" + answerId;
System.out.println("正在处理 === " + link);
String author = answerJson.getJSONObject("author").getString("name"); String author = answerJson.getJSONObject("author").getString("name");
String authorUrl = "https://www.zhihu.com/people/"+answerJson.getJSONObject("author").getString("url_token"); String authorUrl = "https://www.zhihu.com/people/"+answerJson.getJSONObject("author").getString("url_token");
String content = ZhiWeiTools.delHTMLTag(answerJson.getString("content")); String content = ZhiWeiTools.delHTMLTag(answerJson.getString("content"));
String[] imgContent = answerJson.getString("content").split("<img");
Integer imgCount = (imgContent.length-1)/2;
String title = answerJson.getJSONObject("question").getString("title"); String title = answerJson.getJSONObject("question").getString("title");
Integer voteup_count = answerJson.getInteger("voteup_count"); Integer voteupCount = answerJson.getInteger("voteup_count");
Integer comment_count = answerJson.getInteger("comment_count"); Integer commentCount = answerJson.getInteger("comment_count");
Integer guanzhu_count = Integer.valueOf(bord.split(",")[0]); Integer guanzhuCount = Integer.valueOf(bord.split(",")[0]);
Integer bord_count = Integer.valueOf(bord.split(",")[1]); Integer bordCount = Integer.valueOf(bord.split(",")[1]);
ZhihuAnswer zhihuAnswer = new ZhihuAnswer(link, from_url, title, time, author, authorUrl, content,voteup_count ,comment_count, guanzhu_count, bord_count); ZhihuAnswer zhihuAnswer = new ZhihuAnswer(link, fromUrl, title, time, author, authorUrl, content,voteupCount ,commentCount, guanzhuCount, bordCount, imgCount, sort);
answerList.add(zhihuAnswer); answerList.add(zhihuAnswer);
System.out.println(imgCount + " ---- " + sort);
sort ++;
} }
} }
if(count<page*20){ if(count<page*20){
...@@ -191,7 +238,6 @@ public class ZhihuAnwserCrawlerParse { ...@@ -191,7 +238,6 @@ public class ZhihuAnwserCrawlerParse {
} }
} }
/** /**
* 根据链接获取数据 * 根据链接获取数据
* @param url * @param url
...@@ -199,7 +245,7 @@ public class ZhihuAnwserCrawlerParse { ...@@ -199,7 +245,7 @@ public class ZhihuAnwserCrawlerParse {
* @return * @return
* @throws Exception * @throws Exception
*/ */
private static String download(String url, Proxy proxy) throws Exception{ private static String download(String url, ProxyHolder proxy) throws Exception{
try(Response response = httpBoot.syncCall(RequestUtils.wrapGet(url),proxy)){ try(Response response = httpBoot.syncCall(RequestUtils.wrapGet(url),proxy)){
return response.body().string(); return response.body().string();
}catch (Exception e){ }catch (Exception e){
...@@ -207,7 +253,6 @@ public class ZhihuAnwserCrawlerParse { ...@@ -207,7 +253,6 @@ public class ZhihuAnwserCrawlerParse {
} }
} }
/** /**
* 根据链接获取问题id * 根据链接获取问题id
* @param url * @param url
...@@ -237,11 +282,9 @@ public class ZhihuAnwserCrawlerParse { ...@@ -237,11 +282,9 @@ public class ZhihuAnwserCrawlerParse {
"Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit" + "Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit" +
"%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2" + "%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2" +
"Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp" + "Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp" +
"%3Bdata%5B*%5D.mark_infos%5B*%5D.url%3Bdata%5B*%5D.author.follower_count%2Cbadge%5B*%5D.topics&offset="+page*20+"&limit=20&sort_by=created"; "%3Bdata%5B*%5D.mark_infos%5B*%5D.url%3Bdata%5B*%5D.author.follower_count%2Cbadge%5B*%5D.topics&offset="+page*20+"&limit=20";
} }
public static void main(String[] args){ public static void main(String[] args){
// String url = "https://www.zhihu.com/question/288128510"; // String url = "https://www.zhihu.com/question/288128510";
// Date endDate = TimeParse.stringFormartDate("2018-09-20 08:00:00"); // Date endDate = TimeParse.stringFormartDate("2018-09-20 08:00:00");
...@@ -253,7 +296,4 @@ public class ZhihuAnwserCrawlerParse { ...@@ -253,7 +296,4 @@ public class ZhihuAnwserCrawlerParse {
getAnswerCount("https://www.zhihu.com/question/41539825", null); getAnswerCount("https://www.zhihu.com/question/41539825", null);
} }
} }
...@@ -35,7 +35,33 @@ public class DataCrawler { ...@@ -35,7 +35,33 @@ public class DataCrawler {
try { try {
return BaiduInforCrawlerParse.getBaiduInforData(word,endTime); return BaiduInforCrawlerParse.getBaiduInforData(word,endTime);
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.toString();
return Collections.emptyList();
}
}
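/**
 * Full-text matches Baidu news (infor) data by keyword and time.
 *
 * <p>Delegates to {@code BaiduInforCrawlerParse.getBaiduInforDataManyWord};
 * if that call throws, an empty list is returned instead of propagating
 * the exception.
 *
 * @author hero
 * @param word search keyword(s) passed through to the parser
 * @param endTime end time passed through to the parser
 * @param saveWord passed through to the parser unchanged
 *        (NOTE(review): presumably the keyword label stored with each result - confirm)
 * @return the matched news data, or an empty list on failure
 */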
public static List<NewsData> getBaiduInforDataManyWord(String word,String endTime, String saveWord) {
try {
return BaiduInforCrawlerParse.getBaiduInforDataManyWord(word,endTime,saveWord);
} catch (Exception e) {
e.toString();
return Collections.emptyList(); return Collections.emptyList();
} }
} }
...@@ -62,8 +88,8 @@ public class DataCrawler { ...@@ -62,8 +88,8 @@ public class DataCrawler {
try { try {
return BaiduNewsCrawlerParse.getBaiduNewsData(word, startTime, endTime, proxy); return BaiduNewsCrawlerParse.getBaiduNewsData(word, startTime, endTime, proxy);
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.toString();
return null; return Collections.emptyList();
} }
} }
...@@ -106,7 +132,7 @@ public class DataCrawler { ...@@ -106,7 +132,7 @@ public class DataCrawler {
try { try {
return BaiduNewsCrawlerParse.getBaiduNewsCount(word, startTime, endTime, proxy,cookie); return BaiduNewsCrawlerParse.getBaiduNewsCount(word, startTime, endTime, proxy,cookie);
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.toString();
return -1; return -1;
} }
} }
...@@ -132,8 +158,8 @@ public class DataCrawler { ...@@ -132,8 +158,8 @@ public class DataCrawler {
try { try {
return BaiduNewsCrawlerParse.getBaiduNewsDataByTitle(word, startTime, endTime, proxy); return BaiduNewsCrawlerParse.getBaiduNewsDataByTitle(word, startTime, endTime, proxy);
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.toString();
return null; return Collections.emptyList();
} }
} }
...@@ -154,8 +180,8 @@ public class DataCrawler { ...@@ -154,8 +180,8 @@ public class DataCrawler {
try { try {
return SoNewsCrawlerParse.getSoNewsData(word, proxy); return SoNewsCrawlerParse.getSoNewsData(word, proxy);
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.toString();
return null; return Collections.emptyList();
} }
} }
...@@ -176,8 +202,8 @@ public class DataCrawler { ...@@ -176,8 +202,8 @@ public class DataCrawler {
try { try {
return SoNewsCrawlerParse.getSoNewsDataByTitle(word, proxy); return SoNewsCrawlerParse.getSoNewsDataByTitle(word, proxy);
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.toString();
return null; return Collections.emptyList();
} }
} }
...@@ -199,8 +225,8 @@ public class DataCrawler { ...@@ -199,8 +225,8 @@ public class DataCrawler {
System.out.println("开始采集sogou"); System.out.println("开始采集sogou");
return SougouNewsCrawlerParse.getSougouNewsData(word, proxy); return SougouNewsCrawlerParse.getSougouNewsData(word, proxy);
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.toString();
return null; return Collections.emptyList();
} }
} }
...@@ -221,8 +247,8 @@ public class DataCrawler { ...@@ -221,8 +247,8 @@ public class DataCrawler {
try { try {
return SougouNewsCrawlerParse.getSougouNewsDataByTitle(word, proxy); return SougouNewsCrawlerParse.getSougouNewsDataByTitle(word, proxy);
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.toString();
return null; return Collections.emptyList();
} }
} }
...@@ -242,8 +268,8 @@ public class DataCrawler { ...@@ -242,8 +268,8 @@ public class DataCrawler {
try { try {
return SougouZhihuCrawlerParse.getSougouZhihuData(word, proxy); return SougouZhihuCrawlerParse.getSougouZhihuData(word, proxy);
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.toString();
return null; return Collections.emptyList();
} }
} }
...@@ -259,14 +285,14 @@ public class DataCrawler { ...@@ -259,14 +285,14 @@ public class DataCrawler {
* 设定文件 * 设定文件
* @return List<TiebaData> 返回类型 * @return List<TiebaData> 返回类型
*/ */
public static List<TiebaData> getBaiduTiebaData(String word, Proxy proxy) { public static List<TiebaData> getBaiduTiebaDataSortByTime(String word, Proxy proxy, String startTime) {
try { try {
return BaiduTiebaCrawlerParse.getBaiduTiebaData(word, proxy, null); return BaiduTiebaCrawlerParse.getBaiduTiebaData(word, proxy, null, startTime);
} catch (Exception e) { } catch (Exception e) {
return Collections.emptyList(); return Collections.emptyList();
} }
} }
/** /**
* @Title: getBaiduTiebaData * @Title: getBaiduTiebaData
* @author hero * @author hero
...@@ -291,8 +317,8 @@ public class DataCrawler { ...@@ -291,8 +317,8 @@ public class DataCrawler {
try { try {
return BaiduTiebaCrawlerParse.getBaiduTiebaAnswerDataByUrl(url, proxy); return BaiduTiebaCrawlerParse.getBaiduTiebaAnswerDataByUrl(url, proxy);
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.toString();
return null; return Collections.emptyList();
} }
} }
...@@ -308,10 +334,10 @@ public class DataCrawler { ...@@ -308,10 +334,10 @@ public class DataCrawler {
*/ */
public static List<TiebaData> getBaiduTiebaData(String word, Proxy proxy, String tiebaName) { public static List<TiebaData> getBaiduTiebaData(String word, Proxy proxy, String tiebaName) {
try { try {
return BaiduTiebaCrawlerParse.getBaiduTiebaData(word, proxy, tiebaName); return BaiduTiebaCrawlerParse.getBaiduTiebaData(word, proxy, tiebaName, null);
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.toString();
return null; return Collections.emptyList();
} }
} }
...@@ -353,8 +379,8 @@ public class DataCrawler { ...@@ -353,8 +379,8 @@ public class DataCrawler {
try { try {
return DoubanCrawlerParse.getDoubanData(word, type, proxy); return DoubanCrawlerParse.getDoubanData(word, type, proxy);
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.toString();
return null; return Collections.emptyList();
} }
} }
...@@ -374,8 +400,8 @@ public class DataCrawler { ...@@ -374,8 +400,8 @@ public class DataCrawler {
try { try {
return SoCrawlerParse.getSoData(word, site, time, proxy); return SoCrawlerParse.getSoData(word, site, time, proxy);
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.toString();
return null; return Collections.emptyList();
} }
} }
...@@ -387,10 +413,11 @@ public class DataCrawler { ...@@ -387,10 +413,11 @@ public class DataCrawler {
* @return * @return
* @throws Exception * @throws Exception
*/ */
public static List<ZhihuAnswer> getAnswerList(String url, Date endDate, Proxy proxy) throws Exception{ public static List<ZhihuAnswer> getAnswerList(String url, Date endDate, ProxyHolder proxy) throws Exception{
try{ try{
return ZhihuAnwserCrawlerParse.getAnswerList(url,endDate, proxy); return ZhihuAnwserCrawlerParse.getAnswerList(url,endDate, proxy);
}catch (Exception e){ }catch (Exception e){
e.toString();
throw e; throw e;
} }
} }
...@@ -404,10 +431,11 @@ public class DataCrawler { ...@@ -404,10 +431,11 @@ public class DataCrawler {
* @return * @return
* @throws Exception * @throws Exception
*/ */
public static Map<String,Object> getAnswerList(String url, int page, Date endDate, Proxy proxy) throws Exception{ public static Map<String,Object> getAnswerList(String url, int page, Date endDate, ProxyHolder proxy) throws Exception{
try{ try{
return ZhihuAnwserCrawlerParse.getAnswerList(url,page,endDate, proxy); return ZhihuAnwserCrawlerParse.getAnswerList(url,page,endDate, proxy);
}catch (Exception e){ }catch (Exception e){
e.toString();
throw e; throw e;
} }
} }
...@@ -428,6 +456,7 @@ public class DataCrawler { ...@@ -428,6 +456,7 @@ public class DataCrawler {
try{ try{
return ZhihuCrawlerParse.getZhihuData(word, timeLimit, proxy, endDate); return ZhihuCrawlerParse.getZhihuData(word, timeLimit, proxy, endDate);
}catch (Exception e){ }catch (Exception e){
e.toString();
throw e; throw e;
} }
} }
...@@ -443,6 +472,7 @@ public class DataCrawler { ...@@ -443,6 +472,7 @@ public class DataCrawler {
try{ try{
return ZhihuCrawlerParse.getZhihuUser(url, proxy); return ZhihuCrawlerParse.getZhihuUser(url, proxy);
}catch (Exception e){ }catch (Exception e){
e.toString();
throw e; throw e;
} }
} }
...@@ -458,6 +488,7 @@ public class DataCrawler { ...@@ -458,6 +488,7 @@ public class DataCrawler {
try{ try{
return ZhihuUserAnswerCrawlerParse.getData(userId, proxy); return ZhihuUserAnswerCrawlerParse.getData(userId, proxy);
}catch (Exception e){ }catch (Exception e){
e.toString();
throw e; throw e;
} }
} }
......
...@@ -5,12 +5,12 @@ import java.util.HashMap; ...@@ -5,12 +5,12 @@ import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory; import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.excelpoi.excel.PoiExcelUtil; import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.media_data_crawler.crawler.WordsReadFile; import com.zhiwei.media_data_crawler.crawler.WordsReadFile;
import com.zhiwei.media_data_crawler.data.DataCrawler; import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.LunTanData; import com.zhiwei.media_data_crawler.entity.LunTanData;
import com.zhiwei.proxy.config.SimpleConfig;
/** /**
* 天涯论坛数据获取 * 天涯论坛数据获取
...@@ -25,7 +25,12 @@ public class GetTiayaDataTest { ...@@ -25,7 +25,12 @@ public class GetTiayaDataTest {
String startTime = "2019-01-01 00:00:00"; //开始时间 String startTime = "2019-01-01 00:00:00"; //开始时间
String endTime = "2019-11-08 23:59:59"; //结束时间 String endTime = "2019-11-08 23:59:59"; //结束时间
ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER, 10000008); //代理地址
String address = "zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181";
String appName = "xumaioxin";
long appId = 10000008L;
ProxyFactory.init(SimpleConfig.builder().registry(address).appName(appName).appId(appId).group("local").build());
List<String> wordList = WordsReadFile.getWords(wordFilePath); List<String> wordList = WordsReadFile.getWords(wordFilePath);
List<LunTanData> list = new ArrayList<>(); List<LunTanData> list = new ArrayList<>();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment