Commit 4986288a by zhiwei

Merge branch 'master' of

http://git.zhiweidata.top/zhangzhiwei/media_data_crawler.git

Conflicts:
	src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduNewsCrawlerParse.java
	src/main/java/com/zhiwei/media_data_crawler/data/DataCrawler.java
	src/test/java/com/zhiwei/media_data_crawler/test/DataCrawlerTest.java
parents 3e60233c daa0d81c
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<classpath> <classpath>
<classpathentry kind="src" output="target/classes" path="src/main/java"> <classpathentry kind="src" output="target/classes" path="src/main/java">
<attributes> <attributes>
<attribute name="optional" value="true"/> <attribute name="optional" value="true"/>
<attribute name="maven.pomderived" value="true"/> <attribute name="maven.pomderived" value="true"/>
</attributes> </attributes>
</classpathentry> </classpathentry>
<classpathentry kind="src" output="target/test-classes" path="src/test/java"> <classpathentry kind="src" output="target/test-classes" path="src/test/java">
<attributes> <attributes>
<attribute name="optional" value="true"/> <attribute name="optional" value="true"/>
<attribute name="maven.pomderived" value="true"/> <attribute name="maven.pomderived" value="true"/>
</attributes> </attributes>
</classpathentry> </classpathentry>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.8"> <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.8">
<attributes> <attributes>
<attribute name="maven.pomderived" value="true"/> <attribute name="maven.pomderived" value="true"/>
</attributes> </attributes>
</classpathentry> </classpathentry>
<classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER"> <classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER">
<attributes> <attributes>
<attribute name="maven.pomderived" value="true"/> <attribute name="maven.pomderived" value="true"/>
<attribute name="org.eclipse.jst.component.nondependency" value=""/> <attribute name="org.eclipse.jst.component.nondependency" value=""/>
</attributes> </attributes>
</classpathentry> </classpathentry>
<classpathentry kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/> <classpathentry kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/>
<classpathentry kind="output" path="target/classes"/> <classpathentry kind="output" path="target/classes"/>
</classpath> </classpath>
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<projectDescription> <projectDescription>
<name>media_data_crawler</name> <name>media_data_crawler</name>
<comment></comment> <comment></comment>
<projects> <projects>
</projects> </projects>
<buildSpec> <buildSpec>
<buildCommand> <buildCommand>
<name>org.eclipse.wst.common.project.facet.core.builder</name> <name>org.eclipse.wst.common.project.facet.core.builder</name>
<arguments> <arguments>
</arguments> </arguments>
</buildCommand> </buildCommand>
<buildCommand> <buildCommand>
<name>org.eclipse.jdt.core.javabuilder</name> <name>org.eclipse.jdt.core.javabuilder</name>
<arguments> <arguments>
</arguments> </arguments>
</buildCommand> </buildCommand>
<buildCommand> <buildCommand>
<name>org.eclipse.m2e.core.maven2Builder</name> <name>org.eclipse.wst.validation.validationbuilder</name>
<arguments> <arguments>
</arguments> </arguments>
</buildCommand> </buildCommand>
<buildCommand> <buildCommand>
<name>org.eclipse.wst.validation.validationbuilder</name> <name>org.eclipse.m2e.core.maven2Builder</name>
<arguments> <arguments>
</arguments> </arguments>
</buildCommand> </buildCommand>
</buildSpec> </buildSpec>
<natures> <natures>
<nature>org.eclipse.jem.workbench.JavaEMFNature</nature> <nature>org.eclipse.jem.workbench.JavaEMFNature</nature>
<nature>org.eclipse.wst.common.modulecore.ModuleCoreNature</nature> <nature>org.eclipse.wst.common.modulecore.ModuleCoreNature</nature>
<nature>org.eclipse.jdt.core.javanature</nature> <nature>org.eclipse.jdt.core.javanature</nature>
<nature>org.eclipse.m2e.core.maven2Nature</nature> <nature>org.eclipse.m2e.core.maven2Nature</nature>
<nature>org.eclipse.wst.common.project.facet.core.nature</nature> <nature>org.eclipse.wst.common.project.facet.core.nature</nature>
</natures> </natures>
</projectDescription> </projectDescription>
eclipse.preferences.version=1 eclipse.preferences.version=1
org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8 org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8
org.eclipse.jdt.core.compiler.compliance=1.8 org.eclipse.jdt.core.compiler.compliance=1.8
org.eclipse.jdt.core.compiler.problem.assertIdentifier=error org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
org.eclipse.jdt.core.compiler.problem.enumIdentifier=error org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
org.eclipse.jdt.core.compiler.source=1.8 org.eclipse.jdt.core.compiler.source=1.8
activeProfiles= activeProfiles=
eclipse.preferences.version=1 eclipse.preferences.version=1
resolveWorkspaceProjects=true resolveWorkspaceProjects=true
version=1 version=1
<?xml version="1.0" encoding="UTF-8"?><project-modules id="moduleCoreId" project-version="1.5.0"> <?xml version="1.0" encoding="UTF-8"?><project-modules id="moduleCoreId" project-version="1.5.0">
<wb-module deploy-name="media_data_crawler"> <wb-module deploy-name="media_data_crawler">
<wb-resource deploy-path="/" source-path="/src/main/java"/> <wb-resource deploy-path="/" source-path="/src/main/java"/>
</wb-module> </wb-module>
</project-modules> </project-modules>
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<faceted-project> <faceted-project>
<installed facet="java" version="1.8"/> <installed facet="java" version="1.8"/>
<installed facet="jst.utility" version="1.0"/> <installed facet="jst.utility" version="1.0"/>
</faceted-project> </faceted-project>
...@@ -74,6 +74,36 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -74,6 +74,36 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
} }
/**
 * Reads the total number of matching articles reported by Baidu News for a keyword.
 *
 * <p>Downloads the first result page in full-text mode ({@code tn=newsdy}) and parses the
 * number found between the markers "找到相关新闻" and "篇", ignoring thousands separators
 * and the "约" (approximately) prefix.
 *
 * @author hero
 * @param word      search keyword
 * @param startTime start of the time range, passed through to the page download
 * @param endTime   end of the time range, passed through to the page download
 * @param proxy     proxy used for the request
 * @param cookie    value sent as the {@code cookie} request header
 * @return the parsed article count, or {@code -1} when the page could not be downloaded
 *         or the count could not be parsed
 * @throws Exception declared for caller compatibility; failures are reported as {@code -1}
 */
public static int getBaiduNewsCount(String word, String startTime, String endTime, Proxy proxy,String cookie) throws Exception {
    try {
        String result = downloadHtml(word, startTime, endTime, proxy, "newsdy", 1, cookie);
        if (result == null) {
            logger.error("Baidu News returned no page for word={}", word);
            return -1;
        }
        // Text after the "found related news" marker, up to the article counter word.
        String afterMarker = result.split("找到相关新闻")[1];
        String rawCount = afterMarker.split("篇")[0];
        // Strip thousands separators, the "approximately" prefix and stray whitespace.
        rawCount = rawCount.replace(",", "").replace("约", "").trim();
        return Integer.parseInt(rawCount);
    } catch (Exception e) {
        // Best effort: callers treat -1 as "unknown", but the cause must not vanish silently.
        logger.error("Failed to read Baidu News count for word={}", word, e);
        return -1;
    }
}
/**
* @Title: getBaiduNewsData * @Title: getBaiduNewsData
* @author hero * @author hero
* @Description: 根据关键词获取数据 * @Description: 根据关键词获取数据
...@@ -138,7 +168,53 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -138,7 +168,53 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
} }
return list; return list;
} }
/**
 * Downloads one Baidu News search result page, sending the given cookie.
 *
 * <p>The request is attempted up to three times; an {@link IOException} on the last
 * attempt is rethrown to the caller.
 *
 * @author hero
 * @param word      search keyword
 * @param startTime start of the time range, passed to {@code getUrl}
 * @param endTime   end of the time range, passed to {@code getUrl}
 * @param proxy     proxy used for the request
 * @param tn        match mode ({@code newsdy} = full-text match, {@code newstitle} = title match)
 * @param page      page number, passed to {@code getUrl}
 * @param cookie    value sent as the {@code cookie} request header
 * @return whatever {@code get} returns for the page
 * @throws Exception the last {@link IOException} when all attempts fail, or any other
 *                   exception raised while building or executing the request
 */
private static String downloadHtml(String word, String startTime, String endTime, Proxy proxy, String tn,
        int page,String cookie) throws Exception{
    // Common request headers shared by the crawlers.
    Map<String, String> headerMap = HeaderTool.getCommonHead();
    // Build the search URL.
    String url = getUrl(word, startTime, endTime, tn, page);
    System.out.println(url);
    headerMap.put("Host", "news.baidu.com");
    headerMap.put("cookie", cookie);

    final int maxAttempts = 3;
    for (int attempt = 1; ; attempt++) {
        try {
            return get(url, proxy, headerMap);
        } catch (IOException e) {
            // Pass the exception itself: calling fillInStackTrace() here would overwrite
            // the original failure location before the exception is rethrown.
            logger.error("获取数据时出现问题,问题为:{}", e.toString(), e);
            if (attempt >= maxAttempts) {
                throw e;
            }
        }
    }
}
/** /**
* @Title: downloadHtml * @Title: downloadHtml
* @author hero * @author hero
...@@ -263,6 +339,7 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -263,6 +339,7 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
/** 文章发布时间处理 **/ /** 文章发布时间处理 **/
time = time.replaceAll(" ", ""); time = time.replaceAll(" ", "");
time = TimeParse.dateFormartString(TimeParse.stringFormartDate(time), "yyyy-MM-dd HH:mm:ss"); time = TimeParse.dateFormartString(TimeParse.stringFormartDate(time), "yyyy-MM-dd HH:mm:ss");
time = TimeParse.dateFormartString(TimeParse.stringFormartDate(time.trim()), "yyyy-MM-dd HH:mm:ss");
// 处理文章简介 // 处理文章简介
if (element.select("div.c-row") != null) { if (element.select("div.c-row") != null) {
descript = element.select("div.c-row").text(); descript = element.select("div.c-row").text();
......
...@@ -6,6 +6,9 @@ import java.util.ArrayList; ...@@ -6,6 +6,9 @@ import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
...@@ -61,8 +64,153 @@ public class BaiduTiebaCrawlerParse extends HttpClientTemplateOK { ...@@ -61,8 +64,153 @@ public class BaiduTiebaCrawlerParse extends HttpClientTemplateOK {
return list; return list;
} }
/**
 * Collects the posts of a single Tieba thread, page by page.
 *
 * <p>Any query string is removed from {@code url}; the fifth "/"-separated segment of the
 * remaining URL is used as the thread id. Pages are requested as {@code url?pn=N} starting
 * at 1 until a page reports no further pages, a download yields {@code null}, or the page
 * cap is reached. A 3 second pause follows every parsed page.
 *
 * @param url   thread URL (expected shape: {@code http://tieba.baidu.com/p/<id>})
 * @param proxy proxy used for the requests
 * @return all posts parsed from the visited pages, never {@code null}
 * @throws IllegalArgumentException when {@code url} has no thread id segment
 * @throws Exception                when a page download or parse fails
 */
@SuppressWarnings("unchecked")
public static List<TiebaData> getBaiduTiebaAnswerDataByUrl(String url, Proxy proxy) throws Exception {
    // Hard cap on visited pages. The previous "page > 50" guard was overwritten by the
    // parsed "more" flag in the same iteration and therefore never stopped the loop.
    final int maxPages = 50;

    List<TiebaData> list = new ArrayList<TiebaData>();

    // Drop the query string, if any.
    int queryStart = url.indexOf('?');
    if (queryStart >= 0) {
        url = url.substring(0, queryStart);
    }

    String[] segments = url.split("\\/");
    if (segments.length < 5) {
        throw new IllegalArgumentException("Not a Tieba thread URL: " + url);
    }
    String aid = segments[4];

    for (int page = 1; page <= maxPages; page++) {
        String pageUrl = url + "?pn=" + page;
        String htmlBody = downloadHtml(pageUrl, proxy);
        System.out.println(url + "------------" + aid);
        if (htmlBody == null) {
            break;
        }
        Map<String, Object> dataMap = analysisDataAnswer(htmlBody, aid);
        // analysisDataAnswer stores a List<TiebaData> under "data" and a Boolean under "more".
        list.addAll((List<TiebaData>) dataMap.get("data"));
        System.out.println(list.size());
        boolean more = (Boolean) dataMap.get("more");
        // Throttle between requests.
        ZhiWeiTools.sleep(3000);
        if (!more) {
            break;
        }
    }
    return list;
}
/**
 * Parses one HTML page of a Tieba thread into post records.
 *
 * @param htmlBody raw HTML of the thread page
 * @param aid      thread id, used to build each record's link
 * @return a map with two entries: "data" (the {@code List<TiebaData>} of posts that have a
 *         time value longer than one character) and "more" ({@code Boolean}, true only when
 *         the pager text contains "下一页" and the page holds at least one post element)
 */
private static Map<String, Object> analysisDataAnswer(String htmlBody,
        String aid) {
    Document page = Jsoup.parse(htmlBody);

    // The pager shows a "next page" entry while further pages exist.
    boolean hasNextLink = page.select("li.l_pager.pager_theme_4.pb_list_pager").text().contains("下一页");

    Elements posts = page.select("div.p_postlist > div");

    // Thread title: primary selector first, then the alternative layout.
    String title = page.select("div.core_title.core_title_theme_bright > h1").text();
    if (title == null || title.isEmpty()) {
        title = page.select("#j_core_title_wrap > h3").text();
    }
    System.out.println(title);

    Pattern postIdPattern = Pattern.compile("post_id&quot(.*?),&quot");
    List<TiebaData> collected = new ArrayList<TiebaData>();
    for (Element post : posts) {
        String author = post.select("li.d_name").select("a").text();

        String content = post.select("div.p_content_nameplate").select("cc").select("div.clearfix").text();
        if (content == null || content.isEmpty()) {
            content = post.select("div.j_d_post_content").text();
        }

        String time = getTime(post);

        // Post id: the value between "&quot;:" and ",&quot" in the first match, if any.
        String tid = null;
        Matcher idMatcher = postIdPattern.matcher(post.toString());
        if (idMatcher.find()) {
            tid = idMatcher.group(0).split("&quot;:")[1].split(",&quot")[0];
        }

        // Elements without a usable time are skipped.
        boolean hasTime = time != null && time.length() > 1;
        if (!hasTime) {
            continue;
        }
        TiebaData row = new TiebaData("http://tieba.baidu.com/p/" + aid, title, time, tid, null, author, content, aid);
        System.out.println(row.toString());
        collected.add(row);
    }

    Map<String, Object> resultMap = new HashMap<String, Object>();
    resultMap.put("data", collected);
    resultMap.put("more", hasNextLink && posts.size() != 0);
    return resultMap;
}
/**
 * Extracts the publish time of a Tieba post element.
 *
 * <p>First reads the text of {@code span.tail-info}; when it contains "楼" (floor marker),
 * the part following the first marker is used. If that yields nothing, falls back to the
 * {@code date} value found in the element's serialized markup.
 *
 * @param element post element to inspect
 * @return the time text, or an empty string when neither source provides one
 */
private static String getTime(Element element) {
    String time = element.select("span.tail-info").text();
    if (time != null && time.contains("楼")) {
        // limit -1 keeps trailing empty strings, so a text ending in "楼" yields ""
        // instead of throwing ArrayIndexOutOfBoundsException.
        time = time.split("楼", -1)[1].trim();
    }
    if (time == null || time.trim().length() < 1) {
        // NOTE(review): relies on the serialized element containing the date as
        // date&quot;:&quot;<value>&quot — confirm against current page markup.
        Matcher ma = Pattern.compile("date&quot;:&quot;(.*?)&quot").matcher(element.toString());
        if (ma.find()) {
            // The capture group is the value itself; safe even when the value is empty.
            time = ma.group(1);
        }
    }
    return time;
}
/**
 * Downloads a Tieba page by URL.
 *
 * <p>The request is attempted up to three times; an {@link IOException} on the last
 * attempt is rethrown to the caller.
 *
 * @author hero
 * @param url   page URL to fetch
 * @param proxy proxy used for the request
 * @return whatever {@code get} returns for the page
 * @throws Exception the last {@link IOException} when all attempts fail, or any other
 *                   exception raised while executing the request
 */
private static String downloadHtml(String url, Proxy proxy) throws Exception{
    // Common request headers shared by the crawlers.
    Map<String, String> headerMap = HeaderTool.getCommonHead();
    headerMap.put("Host", "tieba.baidu.com");

    final int maxAttempts = 3;
    for (int attempt = 1; ; attempt++) {
        try {
            return get(url, proxy, headerMap);
        } catch (IOException e) {
            // Pass the exception itself: calling fillInStackTrace() here would overwrite
            // the original failure location before the exception is rethrown.
            logger.error("获取数据时出现问题,问题为:{}", e.toString(), e);
            if (attempt >= maxAttempts) {
                throw e;
            }
        }
    }
}
/**
* @Title: downloadHtml * @Title: downloadHtml
* @author hero * @author hero
* @Description: 下載百度貼吧數據 * @Description: 下載百度貼吧數據
...@@ -172,11 +320,11 @@ public class BaiduTiebaCrawlerParse extends HttpClientTemplateOK { ...@@ -172,11 +320,11 @@ public class BaiduTiebaCrawlerParse extends HttpClientTemplateOK {
String url = null; String url = null;
if (word != null) { if (word != null) {
if(tiebaName!=null){ if(tiebaName!=null){
url = "http://tieba.baidu.com/f/search/res?isnew=1&kw="+URLCodeUtil.getURLEncode(word, "utf-8")+"&qw="+ url = "http://tieba.baidu.com/f/search/res?isnew=1&kw="+URLCodeUtil.getURLEncode(word, "GBK")+"&qw="+
URLCodeUtil.getURLEncode(word, "utf-8")+"&rn=30&un=&only_thread=0&sm=1&sd=&ed=&pn="+page; URLCodeUtil.getURLEncode(word, "GBK")+"&rn=10&un=&only_thread=0&sm=1&sd=&ed=&pn="+page;
}else{ }else{
url = "http://tieba.baidu.com/f/search/res?isnew=1&kw=&qw="+ url = "http://tieba.baidu.com/f/search/res?isnew=1&kw=&qw="+
URLCodeUtil.getURLEncode(word, "utf-8")+"&rn=30&un=&only_thread=0&sm=1&sd=&ed=&pn="+page; URLCodeUtil.getURLEncode(word, "GBK")+"&rn=10&un=&only_thread=0&sm=1&sd=&ed=&pn="+page;
} }
} }
System.out.println(url); System.out.println(url);
......
...@@ -232,7 +232,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK { ...@@ -232,7 +232,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
//添加到数据集合中 //添加到数据集合中
if(title != null && !title.equals("") && source!=null && time!=null){ if(title != null && !title.equals("") && source!=null && time!=null){
NewsData newsData = new NewsData(link, title, source, time, content, pt, word); NewsData newsData = new NewsData(link, title, source, time, content, pt, word);
logger.info("搜狗新闻数据:{}", newsData); // logger.info("搜狗新闻数据:{}", newsData);
list.add(newsData); list.add(newsData);
} }
if(!type.equals("other")){ if(!type.equals("other")){
......
...@@ -49,6 +49,27 @@ public class DataCrawler { ...@@ -49,6 +49,27 @@ public class DataCrawler {
} }
/**
 * Returns the number of Baidu News results for a keyword within a time range.
 *
 * <p>Delegates to {@code BaiduNewsCrawlerParse.getBaiduNewsCount}; any exception raised by
 * the delegate is printed and reported as {@code -1}.
 *
 * @author hero
 * @param word      search keyword
 * @param startTime start of the time range
 * @param endTime   end of the time range
 * @param proxy     proxy used for the request
 * @param cookie    cookie value forwarded to the crawler
 * @return the result count, or {@code -1} on failure
 */
public static int getBaiduNewsCount(String word, String startTime, String endTime, Proxy proxy,String cookie){
    int count;
    try {
        count = BaiduNewsCrawlerParse.getBaiduNewsCount(word, startTime, endTime, proxy, cookie);
    } catch (Exception failure) {
        failure.printStackTrace();
        count = -1;
    }
    return count;
}
/**
* *
* @Title: getBaiduNewsDataByTitle * @Title: getBaiduNewsDataByTitle
* @author hero * @author hero
...@@ -216,6 +237,31 @@ public class DataCrawler { ...@@ -216,6 +237,31 @@ public class DataCrawler {
* tiebaName * tiebaName
* @param @return * @param @return
* 设定文件 * 设定文件
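* @Title: getBaiduTiebaAnswserDataByUrl
* @author hero
* @Description: Collects the posts of a single Tieba thread identified by its URL
* @param url thread URL
* @param proxy proxy used for the requests
* @return List<TiebaData> the collected posts, or null when collection fails
*/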
public static List<TiebaData> getBaiduTiebaAnswserDataByUrl(String url, Proxy proxy){
    // Thin wrapper over BaiduTiebaCrawlerParse: a failure is printed and reported
    // to the caller as null rather than propagated.
    List<TiebaData> posts = null;
    try {
        posts = BaiduTiebaCrawlerParse.getBaiduTiebaAnswerDataByUrl(url, proxy);
    } catch (Exception failure) {
        failure.printStackTrace();
    }
    return posts;
}
/**
* @Title: getBaiduTiebaData
* @author hero
* @Description: 根据关键词采集指定贴吧内数据
* @param @param word
* @param @param proxy
* @param @param tiebaName
* @param @return 设定文件
* @return List<TiebaData> 返回类型 * @return List<TiebaData> 返回类型
*/ */
public static List<TiebaData> getBaiduTiebaData(String word, Proxy proxy, String tiebaName) { public static List<TiebaData> getBaiduTiebaData(String word, Proxy proxy, String tiebaName) {
......
...@@ -57,5 +57,4 @@ public class DataCrawlerTest { ...@@ -57,5 +57,4 @@ public class DataCrawlerTest {
e.printStackTrace(); e.printStackTrace();
} }
} }
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment