Commit b327c7a1 by cwy

Merge branch 'master' of http://git.zhiweidata.top/zhangzhiwei/media_data_crawler.git

Conflicts:
	src/main/java/com/zhiwei/media_data_crawler/crawler/TianYaCrawlerParse.java
	src/main/java/com/zhiwei/media_data_crawler/data/DataCrawler.java
parents 605564af a56fa9e1
package com.zhiwei.media_data_crawler.crawler; package com.zhiwei.media_data_crawler.crawler;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
...@@ -16,12 +15,10 @@ import org.jsoup.select.Elements; ...@@ -16,12 +15,10 @@ import org.jsoup.select.Elements;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils; import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.LunTanData; import com.zhiwei.media_data_crawler.entity.LunTanData;
import com.zhiwei.tools.httpclient.HeaderTool; import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.timeparse.TimeParse; import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.URLCodeUtil; import com.zhiwei.tools.tools.URLCodeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response; import okhttp3.Response;
...@@ -34,7 +31,7 @@ public class TianYaCrawlerParse { ...@@ -34,7 +31,7 @@ public class TianYaCrawlerParse {
/** /**
* @Title: getBaiduTiebaData * @Title: getBaiduTiebaData
* @author hero * @author hero
* @Description: 根據關鍵詞獲取百度貼吧數據(最多50頁) * @Description: 根据关键词获取天涯论坛数据(最多50頁)
* @param @param word * @param @param word
* @param @param proxy * @param @param proxy
* @param @param tiebaName * @param @param tiebaName
...@@ -43,18 +40,22 @@ public class TianYaCrawlerParse { ...@@ -43,18 +40,22 @@ public class TianYaCrawlerParse {
* @return List<TiebaData> 返回类型 * @return List<TiebaData> 返回类型
*/ */
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
public static List<LunTanData> getLunTanData(String word, ProxyHolder proxy, String endTime) throws Exception { public static List<LunTanData> getLunTanData(String word, ProxyHolder proxy, String startTime, String endTime) {
List<LunTanData> list = new ArrayList<LunTanData>(); List<LunTanData> list = new ArrayList<LunTanData>();
int page = 0; int page = 0;
boolean more = true; boolean more = true;
while (more) { while (more) {
// 最大页数为20 for(int i = 0; i < 4; i++) {
if (page > 50) { // 最大页数为50
more = false;
}
String htmlBody = downloadHtml(word, proxy, page); String htmlBody = downloadHtml(word, proxy, page);
/** 解析页面 */
if (htmlBody != null) { if (htmlBody != null) {
Map<String, Object> dataMap = analysisData(htmlBody, proxy, word, endTime); Document document = Jsoup.parse(htmlBody);
if(!document.select("div.long-pages").select("a").text().contains("下一页") && (i < 3)) {
continue;
}
Map<String, Object> dataMap = analysisData(document, word, startTime, endTime);
List<LunTanData> dataList = (List<LunTanData>) dataMap.get("data"); List<LunTanData> dataList = (List<LunTanData>) dataMap.get("data");
list.addAll(dataList); list.addAll(dataList);
more = (Boolean) dataMap.get("more"); more = (Boolean) dataMap.get("more");
...@@ -62,10 +63,13 @@ public class TianYaCrawlerParse { ...@@ -62,10 +63,13 @@ public class TianYaCrawlerParse {
more = false; more = false;
} }
page++; page++;
if(DataCrawler.sleepTime!=null){
ZhiWeiTools.sleep(DataCrawler.sleepTime); if (page > 50) {
more = false;
} }
break;
}
} }
return list; return list;
} }
...@@ -74,7 +78,7 @@ public class TianYaCrawlerParse { ...@@ -74,7 +78,7 @@ public class TianYaCrawlerParse {
/** /**
* @Title: downloadHtml * @Title: downloadHtml
* @author hero * @author hero
* @Description: 下載百度貼吧數據 * @Description: 下载天涯论坛数据
* @param @param word * @param @param word
* @param @param proxy * @param @param proxy
* @param @param tiebaName * @param @param tiebaName
...@@ -83,8 +87,7 @@ public class TianYaCrawlerParse { ...@@ -83,8 +87,7 @@ public class TianYaCrawlerParse {
* @param @throws Exception 设定文件 * @param @throws Exception 设定文件
* @return String 返回类型 * @return String 返回类型
*/ */
private static String downloadHtml(String word, ProxyHolder proxy, private static String downloadHtml(String word, ProxyHolder proxy, int page) {
int page) throws Exception{
// 获取通用请求头 // 获取通用请求头
Map<String, String> headerMap = HeaderTool.getCommonHead(); Map<String, String> headerMap = HeaderTool.getCommonHead();
// 获取链接地址 // 获取链接地址
...@@ -93,21 +96,10 @@ public class TianYaCrawlerParse { ...@@ -93,21 +96,10 @@ public class TianYaCrawlerParse {
headerMap.put("Referer", url); headerMap.put("Referer", url);
// 下载数据页面 // 下载数据页面
for (int i = 1; i <= 3; i++) { for (int i = 1; i <= 3; i++) {
try { try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), proxy == null ? ProxyHolder.NAT_HEAVY_PROXY : proxy )) {
Response response = null;
if(proxy != null) {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy);
}else {
response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), ProxyHolder.NAT_HEAVY_PROXY);
}
return response.body().string(); return response.body().string();
} catch (Exception e) { } catch (Exception e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace()); logger.error("获取数据时出现问题,问题为:{}", e.toString());
if(i==3){
throw e;
}else{
continue;
}
} }
} }
return null; return null;
...@@ -117,20 +109,18 @@ public class TianYaCrawlerParse { ...@@ -117,20 +109,18 @@ public class TianYaCrawlerParse {
/** /**
* @Title: analysisData * @Title: analysisData
* @author hero * @author hero
* @Description: 解析Baidu貼吧數據 * @Description: 解析天涯论坛数据
* @param @param htmlBody * @param @param htmlBody
* @param @param proxy
* @param @param word * @param @param word
* @param @return * @param @return
* @param @throws Exception 设定文件 * @param @throws Exception 设定文件
* @return Map<String,Object> 返回类型 * @return Map<String,Object> 返回类型
*/ */
private static Map<String, Object> analysisData(String htmlBody, ProxyHolder proxy, String word, String endTime) throws Exception{ private static Map<String, Object> analysisData(Document document, String word, String startTime, String endTime) {
Map<String, Object> resultMap = new HashMap<String, Object>(); Map<String, Object> resultMap = new HashMap<String, Object>();
try {
List<LunTanData> list = new ArrayList<LunTanData>(); List<LunTanData> list = new ArrayList<LunTanData>();
boolean more = true; boolean more = true;
/** 解析页面 */
Document document = Jsoup.parse(htmlBody);
/** 判断是否有下一页 **/ /** 判断是否有下一页 **/
if (!document.select("div.long-pages").select("a").text().contains("下一页")) { if (!document.select("div.long-pages").select("a").text().contains("下一页")) {
more = false; more = false;
...@@ -143,30 +133,38 @@ public class TianYaCrawlerParse { ...@@ -143,30 +133,38 @@ public class TianYaCrawlerParse {
String title = null; String title = null;
String content = null; String content = null;
String author = null; String author = null;
Integer reply_count = 0; Integer replyCount = 0;
long startDate = TimeParse.stringFormartDate(startTime).getTime();
long endDate = TimeParse.stringFormartDate(endTime).getTime();
for(Element element : elementes) { for(Element element : elementes) {
if(element.toString().contains("search_msg")) {
break;
}
title = element.select("div").select("h3").select("a").text(); title = element.select("div").select("h3").select("a").text();
link = element.select("div").select("h3").select("a").attr("href"); link = element.select("div").select("h3").select("a").attr("href");
content = element.select("div").select("p").text(); content = element.select("div").select("p").text();
source = element.select("p.source").select("a").get(0).text(); source = element.select("p.source").select("a:nth-child(1)").text();
author = element.select("p.source").select("a").get(1).text(); author = element.select("p.source").select("a:nth-child(2)").text();
time = element.select("p.source").select("span").get(0).text(); time = element.select("p.source").select("span").get(0).text();
reply_count = Integer.valueOf(element.select("p.source").select("span").get(1).text()); replyCount = Integer.valueOf(element.select("p.source").select("span").get(1).text());
LunTanData luntanData = new LunTanData(link, title, time, source, author, content, reply_count, pt, word); LunTanData luntanData = new LunTanData(link, title, time, source, author, content, replyCount, pt, word);
Date date = TimeParse.stringFormartDate(time); long date = TimeParse.stringFormartDate(time).getTime();
Date endDate = TimeParse.stringFormartDate(endTime);
if(date.before(endDate)){ if(date >= startDate && (date <= endDate)){
more = false;
}else{
// System.out.println(luntanData);
list.add(luntanData); list.add(luntanData);
} else if(date < startDate){
more = false;
} }
} }
if(elementes.size()==0){ if(elementes.isEmpty()){
more = false; more = false;
} }
resultMap.put("data", list); resultMap.put("data", list);
resultMap.put("more", more); resultMap.put("more", more);
} catch (Exception e) {
e.toString();
}
return resultMap; return resultMap;
} }
...@@ -186,7 +184,7 @@ public class TianYaCrawlerParse { ...@@ -186,7 +184,7 @@ public class TianYaCrawlerParse {
url = "http://search.tianya.cn/bbs?q="+ URLCodeUtil.getURLEncode(word, "utf-8") url = "http://search.tianya.cn/bbs?q="+ URLCodeUtil.getURLEncode(word, "utf-8")
+"&s=4&f=0&pn="+page; +"&s=4&f=0&pn="+page;
} }
System.out.println(url); System.out.println( word + " == " + url);
return url; return url;
} }
......
...@@ -329,9 +329,9 @@ public class DataCrawler { ...@@ -329,9 +329,9 @@ public class DataCrawler {
* 设定文件 * 设定文件
* @return List<LunTanData> 返回类型 * @return List<LunTanData> 返回类型
*/ */
public static List<LunTanData> getLunTanData(String word, ProxyHolder proxy, String endTime) { public static List<LunTanData> getLunTanData(String word, ProxyHolder proxy, String startTime, String endTime) {
try { try {
return TianYaCrawlerParse.getLunTanData(word, proxy, endTime); return TianYaCrawlerParse.getLunTanData(word, proxy, startTime, endTime);
} catch (Exception e) { } catch (Exception e) {
return Collections.emptyList(); return Collections.emptyList();
} }
......
//package com.zhiwei.media_data_crawler.test;
//
//import java.util.ArrayList;
//import java.util.HashMap;
//import java.util.List;
//import java.util.Map;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.media_data_crawler.crawler.WordsReadFile;
//import com.zhiwei.media_data_crawler.data.DataCrawler;
//import com.zhiwei.media_data_crawler.entity.LunTanData;
//
///**
// * 天涯论坛数据获取
// * @author xMx
// * @date 2019年11月8日 下午4:08:29
// */
//public class GetTiayaDataTest {
//
// public static void main(String[] args) {
// String wordFilePath = "D:\\crawlerdata\\关键词6.txt"; //关键词
// String filePath = "D:\\crawlerdata\\天涯论坛-精装房.xlsx";
// String startTime = "2019-01-01 00:00:00"; //开始时间
// String endTime = "2019-11-08 23:59:59"; //结束时间
//
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER, 10000008);
// List<String> wordList = WordsReadFile.getWords(wordFilePath);
//
// List<LunTanData> list = new ArrayList<>();
// wordList.forEach(word ->{
// list.addAll(DataCrawler.getLunTanData(word, null, startTime, endTime));
// });
//
// List<Map<String, Object>> bodyList = new ArrayList<>();
//
// list.forEach(data ->{
// Map<String, Object> map = new HashMap<>();
// map.put("地址", data.getUrl());
// map.put("标题", data.getTitle());
// map.put("时间", data.getTime());
// map.put("来源", data.getSource());
// map.put("回复者或楼主", data.getAuthor());
// map.put("回复内容", data.getContent());
// map.put("回复数", data.getReply_count());
// map.put("平台", data.getPt());
// map.put("关键词", data.getWord());
//
// bodyList.add(map);
// });
//
// List<String> headList = new ArrayList<>();
// headList.add("地址");
// headList.add("标题");
// headList.add("时间");
// headList.add("来源");
// headList.add("回复者或楼主");
// headList.add("回复内容");
// headList.add("回复数");
// headList.add("平台");
// headList.add("关键词");
//
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel(filePath, "数据", headList, bodyList);
// System.out.println("导出成功");
// }
//
//}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment