Commit 4986288a by zhiwei

Merge branch 'master' of

http://git.zhiweidata.top/zhangzhiwei/media_data_crawler.git

Conflicts:
	src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduNewsCrawlerParse.java
	src/main/java/com/zhiwei/media_data_crawler/data/DataCrawler.java
	src/test/java/com/zhiwei/media_data_crawler/test/DataCrawlerTest.java
parents 3e60233c daa0d81c
...@@ -16,12 +16,12 @@ ...@@ -16,12 +16,12 @@
</arguments> </arguments>
</buildCommand> </buildCommand>
<buildCommand> <buildCommand>
<name>org.eclipse.m2e.core.maven2Builder</name> <name>org.eclipse.wst.validation.validationbuilder</name>
<arguments> <arguments>
</arguments> </arguments>
</buildCommand> </buildCommand>
<buildCommand> <buildCommand>
<name>org.eclipse.wst.validation.validationbuilder</name> <name>org.eclipse.m2e.core.maven2Builder</name>
<arguments> <arguments>
</arguments> </arguments>
</buildCommand> </buildCommand>
......
...@@ -76,6 +76,36 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -76,6 +76,36 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
/**
 * Reads the number of matching articles reported on the first Baidu News result page.
 *
 * <p>The page is downloaded with {@code tn = "newsdy"} and page {@code 1}. The count is the
 * text between the marker "找到相关新闻" and the next "篇", with "," and "约" removed.
 *
 * @param word      search keyword, passed through to the page download
 * @param startTime start of the time range, passed through to the page download
 * @param endTime   end of the time range, passed through to the page download
 * @param proxy     proxy used for the request
 * @param cookie    value sent as the request's {@code cookie} header
 * @return the parsed article count, or {@code -1} when the page could not be downloaded,
 *         the marker is missing, or the text between the markers is not a number
 * @throws Exception kept in the signature for existing callers; failures are reported
 *                   through the {@code -1} return value
 */
public static int getBaiduNewsCount(String word, String startTime, String endTime, Proxy proxy, String cookie) throws Exception {
    final String countPrefix = "找到相关新闻";
    final String countSuffix = "篇";
    try {
        String html = downloadHtml(word, startTime, endTime, proxy, "newsdy", 1, cookie);
        if (html == null) {
            return -1;
        }
        int prefixAt = html.indexOf(countPrefix);
        if (prefixAt < 0) {
            // The page carries no result counter at all.
            return -1;
        }
        int digitsFrom = prefixAt + countPrefix.length();
        int suffixAt = html.indexOf(countSuffix, digitsFrom);
        if (suffixAt < 0) {
            return -1;
        }
        String digits = html.substring(digitsFrom, suffixAt)
                .replace(",", "")
                .replace("约", "")
                .trim();
        return Integer.parseInt(digits);
    } catch (Exception e) {
        // Best effort: callers only see -1, so record the cause here instead of dropping it.
        logger.error("Failed to read Baidu news count for word=" + word, e);
        return -1;
    }
}
/**
* @Title: getBaiduNewsData
* @author hero
* @Description: 根据关键词获取数据 * @Description: 根据关键词获取数据
* @param @param word * @param @param word
* @param @param startTime * @param @param startTime
...@@ -160,6 +190,52 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -160,6 +190,52 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
* @return String 返回类型 * @return String 返回类型
*/ */
private static String downloadHtml(String word, String startTime, String endTime, Proxy proxy, String tn, private static String downloadHtml(String word, String startTime, String endTime, Proxy proxy, String tn,
int page,String cookie) throws Exception{
// 获取通用请求头
Map<String, String> headerMap = HeaderTool.getCommonHead();
// 获取链接地址
String url = getUrl(word, startTime, endTime, tn, page);
System.out.println(url);
headerMap.put("Host", "news.baidu.com");
headerMap.put("cookie",cookie);
// 下载数据页面
for (int i = 1; i <= 3; i++) {
try {
return get(url, proxy, headerMap);
} catch (IOException e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){
throw e;
}else{
continue;
}
}
}
return null;
}
/**
* @Title: downloadHtml
* @author hero
* @Description: 获取数据流
* @param @param
* word
* @param @param
* startTime
* @param @param
* endTime
* @param @param
* proxy
* @param @param
* tn (tn=newsdy 为 全文匹配, tn=newstitle 为标题匹配)
* @param @param
* page
* @param @return
* 设定文件
* @return String 返回类型
*/
private static String downloadHtml(String word, String startTime, String endTime, Proxy proxy, String tn,
int page) throws Exception{ int page) throws Exception{
// 获取通用请求头 // 获取通用请求头
Map<String, String> headerMap = HeaderTool.getCommonHead(); Map<String, String> headerMap = HeaderTool.getCommonHead();
...@@ -263,6 +339,7 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -263,6 +339,7 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
/** 文章发布时间处理 **/ /** 文章发布时间处理 **/
time = time.replaceAll(" ", ""); time = time.replaceAll(" ", "");
time = TimeParse.dateFormartString(TimeParse.stringFormartDate(time), "yyyy-MM-dd HH:mm:ss"); time = TimeParse.dateFormartString(TimeParse.stringFormartDate(time), "yyyy-MM-dd HH:mm:ss");
time = TimeParse.dateFormartString(TimeParse.stringFormartDate(time.trim()), "yyyy-MM-dd HH:mm:ss");
// 处理文章简介 // 处理文章简介
if (element.select("div.c-row") != null) { if (element.select("div.c-row") != null) {
descript = element.select("div.c-row").text(); descript = element.select("div.c-row").text();
......
...@@ -6,6 +6,9 @@ import java.util.ArrayList; ...@@ -6,6 +6,9 @@ import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
...@@ -61,6 +64,151 @@ public class BaiduTiebaCrawlerParse extends HttpClientTemplateOK { ...@@ -61,6 +64,151 @@ public class BaiduTiebaCrawlerParse extends HttpClientTemplateOK {
return list; return list;
} }
/**
 * Collects the replies of one Tieba thread by walking its pages ({@code ?pn=1}, {@code ?pn=2}, ...).
 *
 * <p>Any query string on {@code url} is dropped, and the fifth "/"-separated segment of the
 * remaining URL is used as the thread id. Paging stops when a page cannot be downloaded, when
 * the parsed page reports no further page, or after 50 pages. The crawler waits 3000 ms
 * between page requests.
 *
 * @param url   thread URL, e.g. {@code http://tieba.baidu.com/p/<id>}
 * @param proxy proxy used for the requests
 * @return all replies parsed from the visited pages; empty when nothing was parsed
 * @throws IllegalArgumentException when {@code url} is null or has no fifth path segment
 * @throws Exception                any failure raised while downloading a page
 */
public static List<TiebaData> getBaiduTiebaAnswerDataByUrl(String url, Proxy proxy) throws Exception {
    final int maxPages = 50;
    if (url == null) {
        throw new IllegalArgumentException("url must not be null");
    }
    int queryAt = url.indexOf('?');
    String baseUrl = queryAt >= 0 ? url.substring(0, queryAt) : url;
    String[] segments = baseUrl.split("/");
    if (segments.length < 5) {
        throw new IllegalArgumentException("Cannot extract thread id from url: " + url);
    }
    String aid = segments[4];

    List<TiebaData> replies = new ArrayList<TiebaData>();
    // The page limit is the loop bound, so a parsed "more" flag can never override it.
    for (int page = 1; page <= maxPages; page++) {
        if (page > 1) {
            // Throttle between consecutive requests only.
            ZhiWeiTools.sleep(3000);
        }
        String htmlBody = downloadHtml(baseUrl + "?pn=" + page, proxy);
        if (htmlBody == null) {
            break;
        }
        Map<String, Object> parsed = analysisDataAnswer(htmlBody, aid);
        @SuppressWarnings("unchecked") // analysisDataAnswer stores a List<TiebaData> under "data"
        List<TiebaData> pageReplies = (List<TiebaData>) parsed.get("data");
        replies.addAll(pageReplies);
        if (!Boolean.TRUE.equals(parsed.get("more"))) {
            break;
        }
    }
    return replies;
}
/**
 * Parses one page of a Tieba thread into reply records.
 *
 * <p>The thread title is read from the page header. For every child {@code div} of
 * {@code div.p_postlist} the author, content, time and post id are extracted. Posts for
 * which no time longer than one character could be found are skipped.
 *
 * @param htmlBody HTML of the thread page
 * @param aid      thread id; used to build the record URL and passed to every record
 * @return a map with two entries: {@code "data"} holds the {@code List<TiebaData>} parsed
 *         from this page, {@code "more"} holds a {@code Boolean} that is true only when the
 *         pager text contains "下一页" and the page contained at least one post element
 */
private static Map<String, Object> analysisDataAnswer(String htmlBody,
        String aid) {
    List<TiebaData> replies = new ArrayList<TiebaData>();
    Document document = Jsoup.parse(htmlBody);

    // A further page exists only when the pager offers a "next page" link.
    boolean hasNextPage = document.select("li.l_pager.pager_theme_4.pb_list_pager").text().contains("下一页");

    Elements posts = document.select("div.p_postlist > div");

    String title = document.select("div.core_title.core_title_theme_bright > h1").text();
    if (title == null || title.length() < 1) {
        // Fall back to the alternative title markup.
        title = document.select("#j_core_title_wrap > h3").text();
    }

    // Compiled once per page instead of once per post.
    Pattern postIdPattern = Pattern.compile("post_id&quot(.*?),&quot");
    for (Element post : posts) {
        String author = post.select("li.d_name").select("a").text();

        String content = post.select("div.p_content_nameplate").select("cc").select("div.clearfix").text();
        if (content == null || content.length() < 1) {
            content = post.select("div.j_d_post_content").text();
        }

        String time = getTime(post);

        // The post id is embedded in the element's escaped markup; keep null when the
        // match does not have the expected "&quot;:<id>,&quot" shape instead of failing.
        String tid = null;
        Matcher postIdMatcher = postIdPattern.matcher(post.toString());
        if (postIdMatcher.find()) {
            String[] afterKey = postIdMatcher.group(0).split("&quot;:");
            if (afterKey.length > 1) {
                String[] beforeNextField = afterKey[1].split(",&quot");
                if (beforeNextField.length > 0) {
                    tid = beforeNextField[0];
                }
            }
        }

        if (time != null && time.length() > 1) {
            replies.add(new TiebaData("http://tieba.baidu.com/p/" + aid, title, time, tid, null, author, content, aid));
        }
    }

    if (posts.size() == 0) {
        // An empty page ends the crawl regardless of what the pager says.
        hasNextPage = false;
    }

    Map<String, Object> resultMap = new HashMap<String, Object>();
    resultMap.put("data", replies);
    resultMap.put("more", hasNextPage);
    return resultMap;
}
/**
 * Extracts the time text of a single Tieba post element.
 *
 * <p>The text of {@code span.tail-info} is tried first; when it contains "楼", only the
 * trimmed part after the first "楼" is kept. When that yields nothing, the value of the
 * {@code date} field found in the element's escaped markup is used instead.
 *
 * @param element post element to inspect
 * @return the time text; may be empty when neither source provides one
 */
private static String getTime(Element element) {
    String time = element.select("span.tail-info").text();
    if (time.contains("楼")) {
        String[] aroundFloor = time.split("楼");
        // Nothing after the floor marker (e.g. "2楼"): treat as missing so the fallback runs.
        time = aroundFloor.length > 1 ? aroundFloor[1].trim() : "";
    }
    if (time == null || time.trim().length() < 1) {
        Matcher dateMatcher = Pattern.compile("date&quot;:&quot;(.*?)&quot").matcher(element.toString());
        if (dateMatcher.find()) {
            // The capture group is the value between the quotes; it is safe for an empty date too.
            time = dateMatcher.group(1);
        }
    }
    return time;
}
/**
 * Downloads a Tieba page by URL.
 *
 * <p>The request uses the common headers from {@code HeaderTool.getCommonHead()} plus
 * {@code Host: tieba.baidu.com}. The download is attempted up to three times; only
 * {@link IOException} triggers a retry.
 *
 * @param url   address of the page to download
 * @param proxy proxy used for the request
 * @return whatever {@code get(url, proxy, headerMap)} returns for the page
 * @throws Exception the {@link IOException} of the third failed attempt, or any other
 *                   exception raised while performing the request
 */
private static String downloadHtml(String url, Proxy proxy) throws Exception {
    final int maxAttempts = 3;
    Map<String, String> headerMap = HeaderTool.getCommonHead();
    headerMap.put("Host", "tieba.baidu.com");
    for (int attempt = 1; ; attempt++) {
        try {
            return get(url, proxy, headerMap);
        } catch (IOException e) {
            // Log the exception as thrown; fillInStackTrace() would replace its original
            // stack trace with this frame and hide where the failure happened.
            logger.error("获取数据时出现问题,问题为:{}", e);
            if (attempt >= maxAttempts) {
                throw e;
            }
        }
    }
}
/** /**
* @Title: downloadHtml * @Title: downloadHtml
...@@ -172,11 +320,11 @@ public class BaiduTiebaCrawlerParse extends HttpClientTemplateOK { ...@@ -172,11 +320,11 @@ public class BaiduTiebaCrawlerParse extends HttpClientTemplateOK {
String url = null; String url = null;
if (word != null) { if (word != null) {
if(tiebaName!=null){ if(tiebaName!=null){
url = "http://tieba.baidu.com/f/search/res?isnew=1&kw="+URLCodeUtil.getURLEncode(word, "utf-8")+"&qw="+ url = "http://tieba.baidu.com/f/search/res?isnew=1&kw="+URLCodeUtil.getURLEncode(word, "GBK")+"&qw="+
URLCodeUtil.getURLEncode(word, "utf-8")+"&rn=30&un=&only_thread=0&sm=1&sd=&ed=&pn="+page; URLCodeUtil.getURLEncode(word, "GBK")+"&rn=10&un=&only_thread=0&sm=1&sd=&ed=&pn="+page;
}else{ }else{
url = "http://tieba.baidu.com/f/search/res?isnew=1&kw=&qw="+ url = "http://tieba.baidu.com/f/search/res?isnew=1&kw=&qw="+
URLCodeUtil.getURLEncode(word, "utf-8")+"&rn=30&un=&only_thread=0&sm=1&sd=&ed=&pn="+page; URLCodeUtil.getURLEncode(word, "GBK")+"&rn=10&un=&only_thread=0&sm=1&sd=&ed=&pn="+page;
} }
} }
System.out.println(url); System.out.println(url);
......
...@@ -232,7 +232,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -232,7 +232,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
//添加到数据集合中 //添加到数据集合中
if(title != null && !title.equals("") && source!=null && time!=null){ if(title != null && !title.equals("") && source!=null && time!=null){
NewsData newsData = new NewsData(link, title, source, time, content, pt, word); NewsData newsData = new NewsData(link, title, source, time, content, pt, word);
logger.info("搜狗新闻数据:{}", newsData); // logger.info("搜狗新闻数据:{}", newsData);
list.add(newsData); list.add(newsData);
} }
if(!type.equals("other")){ if(!type.equals("other")){
......
...@@ -50,6 +50,27 @@ public class DataCrawler { ...@@ -50,6 +50,27 @@ public class DataCrawler {
/**
 * Returns the Baidu News article count for a keyword and time range.
 *
 * <p>Delegates to {@code BaiduNewsCrawlerParse.getBaiduNewsCount}. Any exception raised by
 * the delegate is printed to standard error and reported as {@code -1}.
 *
 * @param word      search keyword
 * @param startTime start of the time range
 * @param endTime   end of the time range
 * @param proxy     proxy used for the request
 * @param cookie    cookie value forwarded to the crawler
 * @return the count returned by the delegate, or {@code -1} when it threw
 */
public static int getBaiduNewsCount(String word, String startTime, String endTime, Proxy proxy,String cookie){
    int count = -1;
    try {
        count = BaiduNewsCrawlerParse.getBaiduNewsCount(word, startTime, endTime, proxy, cookie);
    } catch (Exception failure) {
        failure.printStackTrace();
    }
    return count;
}
/**
*
* @Title: getBaiduNewsDataByTitle * @Title: getBaiduNewsDataByTitle
* @author hero * @author hero
* @Description: 根据关键词和时间,标题匹配百度新闻数据 * @Description: 根据关键词和时间,标题匹配百度新闻数据
...@@ -216,6 +237,31 @@ public class DataCrawler { ...@@ -216,6 +237,31 @@ public class DataCrawler {
* tiebaName * tiebaName
* @param @return * @param @return
* 设定文件 * 设定文件
* @Title: getBaiduTiebaData
* @author hero
* @Description: 根据关键词采集贴吧数据
* @param @param word
* @param @param proxy
* @param @return 设定文件
* @return List<TiebaData> 返回类型
*/
/**
 * Collects the replies of the Tieba thread at the given URL.
 *
 * <p>Delegates to {@code BaiduTiebaCrawlerParse.getBaiduTiebaAnswerDataByUrl}. Any exception
 * raised by the delegate is printed to standard error and reported as {@code null}.
 *
 * @param url   thread URL
 * @param proxy proxy used for the requests
 * @return the list returned by the delegate, or {@code null} when it threw
 */
public static List<TiebaData> getBaiduTiebaAnswserDataByUrl(String url, Proxy proxy){
    List<TiebaData> replies = null;
    try {
        replies = BaiduTiebaCrawlerParse.getBaiduTiebaAnswerDataByUrl(url, proxy);
    } catch (Exception failure) {
        failure.printStackTrace();
    }
    return replies;
}
/**
* @Title: getBaiduTiebaData
* @author hero
* @Description: 根据关键词采集指定贴吧内数据
* @param @param word
* @param @param proxy
* @param @param tiebaName
* @param @return 设定文件
* @return List<TiebaData> 返回类型 * @return List<TiebaData> 返回类型
*/ */
public static List<TiebaData> getBaiduTiebaData(String word, Proxy proxy, String tiebaName) { public static List<TiebaData> getBaiduTiebaData(String word, Proxy proxy, String tiebaName) {
......
...@@ -57,5 +57,4 @@ public class DataCrawlerTest { ...@@ -57,5 +57,4 @@ public class DataCrawlerTest {
e.printStackTrace(); e.printStackTrace();
} }
} }
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment