Merge branch 'master' of

http://git.zhiweidata.top/zhangzhiwei/media_data_crawler.git Conflicts: src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduNewsCrawlerParse.java src/main/java/com/zhiwei/media_data_crawler/data/DataCrawler.java src/test/java/com/zhiwei/media_data_crawler/test/DataCrawlerTest.java

Merge branch 'master' of
http://git.zhiweidata.top/zhangzhiwei/media_data_crawler.git Conflicts: src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduNewsCrawlerParse.java src/main/java/com/zhiwei/media_data_crawler/data/DataCrawler.java src/test/java/com/zhiwei/media_data_crawler/test/DataCrawlerTest.java
4986288a · zhiwei · 3e60233c · daa0d81c · 4986288a · 4986288a
Commit 4986288a authored Aug 02, 2018 by zhiwei
12 changed files
--- a/.classpath
+++ b/.classpath
 <?xml version="1.0" encoding="UTF-8"?>
 <classpath>
 	<classpathentry kind="src" output="target/classes" path="src/main/java">
 		<attributes>
 			<attribute name="optional" value="true"/>
 			<attribute name="maven.pomderived" value="true"/>
 		</attributes>
 	</classpathentry>
 	<classpathentry kind="src" output="target/test-classes" path="src/test/java">
 		<attributes>
 			<attribute name="optional" value="true"/>
 			<attribute name="maven.pomderived" value="true"/>
 		</attributes>
 	</classpathentry>
 	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.8">
 		<attributes>
 			<attribute name="maven.pomderived" value="true"/>
 		</attributes>
 	</classpathentry>
 	<classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER">
 		<attributes>
 			<attribute name="maven.pomderived" value="true"/>
 			<attribute name="org.eclipse.jst.component.nondependency" value=""/>
 		</attributes>
 	</classpathentry>
 	<classpathentry kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/>
 	<classpathentry kind="output" path="target/classes"/>
 </classpath>
--- a/.gitignore
+++ b/.gitignore
 /target/
--- a/.project
+++ b/.project
 <?xml version="1.0" encoding="UTF-8"?>
 <projectDescription>
 	<name>media_data_crawler</name>
 	<comment></comment>
 	<projects>
 	</projects>
 	<buildSpec>
 		<buildCommand>
 			<name>org.eclipse.wst.common.project.facet.core.builder</name>
 			<arguments>
 			</arguments>
 		</buildCommand>
 		<buildCommand>
 			<name>org.eclipse.jdt.core.javabuilder</name>
 			<arguments>
 			</arguments>
 		</buildCommand>
 		<buildCommand>
-			<name>org.eclipse.m2e.core.maven2Builder</name>
+			<name>org.eclipse.wst.validation.validationbuilder</name>
 			<arguments>
 			</arguments>
 		</buildCommand>
 		<buildCommand>
-			<name>org.eclipse.wst.validation.validationbuilder</name>
+			<name>org.eclipse.m2e.core.maven2Builder</name>
 			<arguments>
 			</arguments>
 		</buildCommand>
 	</buildSpec>
 	<natures>
 		<nature>org.eclipse.jem.workbench.JavaEMFNature</nature>
 		<nature>org.eclipse.wst.common.modulecore.ModuleCoreNature</nature>
 		<nature>org.eclipse.jdt.core.javanature</nature>
 		<nature>org.eclipse.m2e.core.maven2Nature</nature>
 		<nature>org.eclipse.wst.common.project.facet.core.nature</nature>
 	</natures>
 </projectDescription>
--- a/.settings/org.eclipse.jdt.core.prefs
+++ b/.settings/org.eclipse.jdt.core.prefs
 eclipse.preferences.version=1
 org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
 org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8
 org.eclipse.jdt.core.compiler.compliance=1.8
 org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
 org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
 org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
 org.eclipse.jdt.core.compiler.source=1.8
--- a/.settings/org.eclipse.m2e.core.prefs
+++ b/.settings/org.eclipse.m2e.core.prefs
 activeProfiles=
 eclipse.preferences.version=1
 resolveWorkspaceProjects=true
 version=1
--- a/.settings/org.eclipse.wst.common.component
+++ b/.settings/org.eclipse.wst.common.component
 <?xml version="1.0" encoding="UTF-8"?><project-modules id="moduleCoreId" project-version="1.5.0">
    <wb-module deploy-name="media_data_crawler">
        <wb-resource deploy-path="/" source-path="/src/main/java"/>
    </wb-module>
 </project-modules>
--- a/.settings/org.eclipse.wst.common.project.facet.core.xml
+++ b/.settings/org.eclipse.wst.common.project.facet.core.xml
 <?xml version="1.0" encoding="UTF-8"?>
 <faceted-project>
  <installed facet="java" version="1.8"/>
  <installed facet="jst.utility" version="1.0"/>
 </faceted-project>
--- a/src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduNewsCrawlerParse.java
+++ b/src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduNewsCrawlerParse.java
@@ -74,6 +74,36 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
 	}
 	/**
+     * Downloads the first Baidu News result page for a keyword (full-text
+     * match, tn=newsdy) and extracts the total article count that the page reports.
+     * <p>
+     * The count is the text found between the markers "找到相关新闻" and "篇";
+     * thousands separators (",") and the "约" prefix are removed before parsing.
+     *
+     * @param word      search keyword
+     * @param startTime start of the time range, passed through to downloadHtml
+     * @param endTime   end of the time range, passed through to downloadHtml
+     * @param proxy     proxy used for the HTTP request
+     * @param cookie    value sent as the "cookie" request header
+     * @return the parsed count, or -1 when the page cannot be downloaded or
+     *         contains no parsable count
+     * @throws Exception kept in the signature for existing callers; failures are
+     *                   logged and reported through the -1 return value
+     */
+    public static int getBaiduNewsCount(String word, String startTime, String endTime, Proxy proxy,String cookie) throws Exception {
+        final String countPrefix = "找到相关新闻";
+        final String countSuffix = "篇";
+        try {
+            String result = downloadHtml(word, startTime, endTime, proxy, "newsdy", 1,cookie);
+            if (result == null) {
+                logger.error("Baidu news page is empty for word: {}", word);
+                return -1;
+            }
+            // Locate the count explicitly instead of relying on an
+            // ArrayIndexOutOfBoundsException when the marker is missing.
+            int prefixAt = result.indexOf(countPrefix);
+            if (prefixAt < 0) {
+                logger.error("Baidu news count marker not found for word: {}", word);
+                return -1;
+            }
+            int from = prefixAt + countPrefix.length();
+            int to = result.indexOf(countSuffix, from);
+            String rawCount = to >= 0 ? result.substring(from, to) : result.substring(from);
+            rawCount = rawCount.replace(",", "").replace("约", "").trim();
+            return Integer.parseInt(rawCount);
+        } catch (Exception e) {
+            // Callers rely on -1 as the failure signal, but the cause must not be lost.
+            logger.error("Failed to get Baidu news count for word: " + word, e);
+            return -1;
+        }
+    }
+	/**
 	 * @Title: getBaiduNewsData 
 	 * @author hero 
 	 * @Description: 根据关键词获取数据
@@ -138,7 +168,53 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
 		}
 		return list;
 	}
+	/**
+    * Downloads one Baidu News search-result page and returns its body.
+    * <p>
+    * The request URL is built by getUrl from the search arguments. The common
+    * request headers are extended with "Host: news.baidu.com" and the supplied
+    * cookie. The request is attempted up to three times when it fails with an
+    * IOException.
+    *
+    * @param word      search keyword
+    * @param startTime start of the time range
+    * @param endTime   end of the time range
+    * @param proxy     proxy used for the HTTP request
+    * @param tn        match mode: "newsdy" = full-text match, "newstitle" = title match
+    * @param page      result page number
+    * @param cookie    value sent as the "cookie" request header
+    * @return the response body returned by get
+    * @throws Exception the IOException of the last attempt once all three
+    *                   attempts have failed, or any other exception raised
+    *                   while building or sending the request
+    */
+   private static String downloadHtml(String word, String startTime, String endTime, Proxy proxy, String tn,
+           int page,String cookie) throws Exception{
+       final int maxAttempts = 3;
+       // Common request headers
+       Map<String, String> headerMap = HeaderTool.getCommonHead();
+       // Request URL
+       String url = getUrl(word, startTime, endTime, tn, page);
+       System.out.println(url);
+       headerMap.put("Host", "news.baidu.com");
+       headerMap.put("cookie",cookie);
+       IOException lastFailure = null;
+       for (int attempt = 1; attempt <= maxAttempts; attempt++) {
+           try {
+               return get(url, proxy, headerMap);
+           } catch (IOException e) {
+               // Log the exception itself: fillInStackTrace() would replace its
+               // original stack trace with this logging site.
+               logger.error("获取数据时出现问题,问题为：{}", e);
+               lastFailure = e;
+           }
+       }
+       // Every attempt failed; surface the last failure with its original trace.
+       throw lastFailure;
+   }
 	/**
 	 * @Title: downloadHtml
 	 * @author hero
@@ -263,6 +339,7 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
 				/** 文章发布时间处理 **/
 				time = time.replaceAll(" ", "");
 				time = TimeParse.dateFormartString(TimeParse.stringFormartDate(time), "yyyy-MM-dd HH:mm:ss");
+				time = TimeParse.dateFormartString(TimeParse.stringFormartDate(time.trim()), "yyyy-MM-dd HH:mm:ss");
 				// 处理文章简介
 				if (element.select("div.c-row") != null) {
 					descript = element.select("div.c-row").text();

--- a/src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduTiebaCrawlerParse.java
+++ b/src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduTiebaCrawlerParse.java
@@ -6,6 +6,9 @@ import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
@@ -61,8 +64,153 @@ public class BaiduTiebaCrawlerParse extends HttpClientTemplateOK {
 		return list;
 	}
+	/**
+	 * Collects the posts of one Tieba thread by requesting its pages
+	 * ("?pn=1", "?pn=2", ...) one after another.
+	 * <p>
+	 * Paging stops when a page cannot be downloaded, when the parsed page
+	 * reports that there is no next page, or after 50 pages.
+	 *
+	 * @param url   thread URL; any query string is dropped, and the fifth
+	 *              "/"-separated segment is used as the thread id
+	 * @param proxy proxy used for the HTTP requests
+	 * @return the posts parsed from all fetched pages (possibly empty)
+	 * @throws IllegalArgumentException when the URL has no thread-id segment
+	 * @throws Exception when downloading a page fails
+	 */
+	@SuppressWarnings("unchecked")
+    public static List<TiebaData> getBaiduTiebaAnswerDataByUrl(String url, Proxy proxy) throws Exception {
+        // Hard upper bound on the number of pages fetched for one thread.
+        final int maxPage = 50;
+        List<TiebaData> list = new ArrayList<TiebaData>();
+        // Drop the query string so that "?pn=" can be appended cleanly.
+        int queryAt = url.indexOf('?');
+        if (queryAt >= 0) {
+            url = url.substring(0, queryAt);
+        }
+        // Thread id, e.g. the trailing segment of http://tieba.baidu.com/p/<id>.
+        String[] segments = url.split("\\/");
+        if (segments.length < 5) {
+            throw new IllegalArgumentException("Not a Tieba thread URL: " + url);
+        }
+        String aid = segments[4];
+        // The bound lives in the loop condition: previously the "stop" flag set
+        // by the page check was overwritten by the parsed "more" flag, so the
+        // limit never took effect.
+        for (int page = 1; page <= maxPage; page++) {
+            String pageUrl = url + "?pn=" + page;
+            String htmlBody = downloadHtml(pageUrl, proxy);
+            System.out.println(url + "------------" + aid);
+            if (htmlBody == null) {
+                break;
+            }
+            Map<String, Object> dataMap = analysisDataAnswer(htmlBody,aid);
+            List<TiebaData> dataList = (List<TiebaData>) dataMap.get("data");
+            list.addAll(dataList);
+            System.out.println(list.size());
+            boolean more = (Boolean) dataMap.get("more");
+            // Pause between page requests.
+            ZhiWeiTools.sleep(3000);
+            if (!more) {
+                break;
+            }
+        }
+        return list;
+	}
+	/**
+	 * Parses one page of a Tieba thread.
+	 *
+	 * @param htmlBody HTML of the thread page
+	 * @param aid      thread id; used to build the post URL and stored on every post
+	 * @return a map with two entries: "data" (List of TiebaData parsed from the
+	 *         page) and "more" (Boolean, true when the pager contains "下一页"
+	 *         and at least one post container was found)
+	 */
+	private static Map<String, Object> analysisDataAnswer(String htmlBody,
+            String aid) {
+	    Map<String, Object> resultMap = new HashMap<String, Object>();
+        List<TiebaData> list = new ArrayList<TiebaData>();
+        /** Parse the page */
+        Document document = Jsoup.parse(htmlBody);
+        /** A next page exists only if the pager shows the "next page" link */
+        boolean more = document.select("li.l_pager.pager_theme_4.pb_list_pager").text().contains("下一页");
+        // Post containers
+        Elements elementes = document.select("div.p_postlist > div");
+        String title = document.select("div.core_title.core_title_theme_bright > h1").text();
+        if(title == null || title.length() < 1) {
+            // Alternative title markup
+            title = document.select("#j_core_title_wrap > h3").text();
+        }
+        System.out.println(title);
+        // Compiled once per page instead of once per post.
+        Pattern postIdPattern = Pattern.compile("post_id&quot(.*?),&quot");
+        for(Element element : elementes) {
+            String author = element.select("li.d_name").select("a").text();
+            String content = element.select("div.p_content_nameplate").select("cc").select("div.clearfix").text();
+            if(content == null ||content.length() < 1) {
+                content = element.select("div.j_d_post_content").text();
+            }
+            String time = getTime(element);
+            // The post id is embedded in the element's serialized markup.
+            String tid = null;
+            Matcher postIdMatcher = postIdPattern.matcher(element.toString());
+            if(postIdMatcher.find()) {
+                // Guard both splits so an unexpected match leaves tid null
+                // instead of aborting the whole page with an index error.
+                String[] keyAndValue = postIdMatcher.group(0).split("&quot;:");
+                if(keyAndValue.length > 1) {
+                    String[] valueAndRest = keyAndValue[1].split(",&quot");
+                    if(valueAndRest.length > 0) {
+                        tid = valueAndRest[0];
+                    }
+                }
+            }
+            // Containers without a usable time are skipped.
+            if(time != null && time.length() > 1) {
+                TiebaData tbd = new TiebaData("http://tieba.baidu.com/p/"+aid, title, time, tid, null, author, content, aid);
+                System.out.println(tbd.toString());
+                list.add(tbd);
+            }
+        }
+        if(elementes.size()==0){
+            more = false;
+        }
+        resultMap.put("data", list);
+        resultMap.put("more", more);
+        return resultMap;
+    }
 	/**
+	 * Extracts the post time of one Tieba post element.
+	 * <p>
+	 * The text of "span.tail-info" is tried first; when it contains a floor
+	 * marker ("楼"), the trimmed text between that marker and the next one (or
+	 * the end of the text) is used. If that yields nothing, the value of the
+	 * "date" field embedded in the element's serialized markup is used instead.
+	 *
+	 * @param element post container element
+	 * @return the extracted time text; may be empty or blank when neither
+	 *         source provides a time
+	 */
+	private static String getTime(Element element) {
+	    String time = element.select("span.tail-info").text();
+	    int floorAt = time.indexOf("楼");
+	    if(floorAt >= 0) {
+	        // Same segment that split("楼")[1] selected, but without the index
+	        // error when nothing follows the marker; that case now reaches
+	        // the fallback below.
+	        String afterFloor = time.substring(floorAt + 1);
+	        int nextFloorAt = afterFloor.indexOf("楼");
+	        if(nextFloorAt >= 0) {
+	            afterFloor = afterFloor.substring(0, nextFloorAt);
+	        }
+	        time = afterFloor.trim();
+	    }
+        if(time.trim().length() < 1) {
+            Pattern pa = Pattern.compile("date&quot;:&quot;(.*?)&quot");
+            Matcher ma = pa.matcher(element.toString());
+            if(ma.find()) {
+                // The capture group is exactly the date value; an empty value
+                // gives "" rather than an index error.
+                time = ma.group(1);
+            }
+        }
+        return time;
+	}
+    /**
+     * Downloads one Baidu Tieba page and returns its body.
+     * <p>
+     * The common request headers are extended with "Host: tieba.baidu.com".
+     * The request is attempted up to three times when it fails with an
+     * IOException.
+     *
+     * @param url   address of the page to download
+     * @param proxy proxy used for the HTTP request
+     * @return the response body returned by get
+     * @throws Exception the IOException of the last attempt once all three
+     *                   attempts have failed, or any other exception raised
+     *                   while sending the request
+     */
+    private static String downloadHtml(String url, Proxy proxy) throws Exception{
+        final int maxAttempts = 3;
+        // Common request headers
+        Map<String, String> headerMap = HeaderTool.getCommonHead();
+        headerMap.put("Host", "tieba.baidu.com");
+        IOException lastFailure = null;
+        for (int attempt = 1; attempt <= maxAttempts; attempt++) {
+            try {
+                return get(url, proxy, headerMap);
+            } catch (IOException e) {
+                // Log the exception itself: fillInStackTrace() would replace its
+                // original stack trace with this logging site.
+                logger.error("获取数据时出现问题,问题为：{}", e);
+                lastFailure = e;
+            }
+        }
+        // Every attempt failed; surface the last failure with its original trace.
+        throw lastFailure;
+    }
+    /**
 	 * @Title: downloadHtml 
 	 * @author hero 
 	 * @Description: 下載百度貼吧數據
@@ -172,11 +320,11 @@ public class BaiduTiebaCrawlerParse extends HttpClientTemplateOK {
 		String url = null;
 		if (word != null) {
 			if(tiebaName!=null){
-				url = "http://tieba.baidu.com/f/search/res?isnew=1&kw="+URLCodeUtil.getURLEncode(word, "utf-8")+"&qw="+
+				url = "http://tieba.baidu.com/f/search/res?isnew=1&kw="+URLCodeUtil.getURLEncode(word, "GBK")+"&qw="+
-						URLCodeUtil.getURLEncode(word, "utf-8")+"&rn=30&un=&only_thread=0&sm=1&sd=&ed=&pn="+page;
+						URLCodeUtil.getURLEncode(word, "GBK")+"&rn=10&un=&only_thread=0&sm=1&sd=&ed=&pn="+page;
 			}else{
 				url = "http://tieba.baidu.com/f/search/res?isnew=1&kw=&qw="+
-						URLCodeUtil.getURLEncode(word, "utf-8")+"&rn=30&un=&only_thread=0&sm=1&sd=&ed=&pn="+page;
+						URLCodeUtil.getURLEncode(word, "GBK")+"&rn=10&un=&only_thread=0&sm=1&sd=&ed=&pn="+page;
 			}
 		}
 		System.out.println(url);

--- a/src/main/java/com/zhiwei/media_data_crawler/crawler/SougouNewsCrawlerParse.java
+++ b/src/main/java/com/zhiwei/media_data_crawler/crawler/SougouNewsCrawlerParse.java
@@ -232,7 +232,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
 				//添加到数据集合中
 				if(title != null && !title.equals("") && source!=null && time!=null){
 					NewsData newsData = new NewsData(link, title, source, time, content, pt, word);
-					logger.info("搜狗新闻数据:{}", newsData);
+//					logger.info("搜狗新闻数据:{}", newsData);
 					list.add(newsData);
 				}
 				if(!type.equals("other")){

--- a/src/main/java/com/zhiwei/media_data_crawler/data/DataCrawler.java
+++ b/src/main/java/com/zhiwei/media_data_crawler/data/DataCrawler.java
@@ -49,6 +49,27 @@ public class DataCrawler {
 	}
 	/**
+     * Returns the number of Baidu News articles reported for a keyword and
+     * time range, delegating to BaiduNewsCrawlerParse.getBaiduNewsCount.
+     *
+     * @param word      search keyword
+     * @param startTime start of the time range
+     * @param endTime   end of the time range
+     * @param proxy     proxy used for the HTTP request
+     * @param cookie    cookie passed through to the crawler
+     * @return the count returned by the crawler, or -1 if the crawler throws
+     */
+    public static int getBaiduNewsCount(String word, String startTime, String endTime, Proxy proxy,String cookie){
+        // -1 is the failure value handed back when the crawler throws.
+        int count = -1;
+        try {
+            count = BaiduNewsCrawlerParse.getBaiduNewsCount(word, startTime, endTime, proxy,cookie);
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+        return count;
+    }
+	/**
 	 * 
 	 * @Title: getBaiduNewsDataByTitle
 	 * @author hero
@@ -216,6 +237,31 @@ public class DataCrawler {
 	 *            tiebaName
 	 * @param @return
 	 *            设定文件
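+     * @Title: getBaiduTiebaAnswserDataByUrl
+     * @author hero
+     * @Description: Collects the posts of a single Tieba thread identified by its URL
+     * @param url   URL of the Tieba thread
+     * @param proxy proxy used for the HTTP requests
+     * @return List<TiebaData> the collected posts, or null if collection fails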
+     */
+    public static List<TiebaData> getBaiduTiebaAnswserDataByUrl(String url, Proxy proxy){
+        // Stays null when the crawler throws; callers receive null as the failure value.
+        List<TiebaData> answers = null;
+        try {
+            answers = BaiduTiebaCrawlerParse.getBaiduTiebaAnswerDataByUrl(url, proxy);
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+        return answers;
+    }
+	/**
+	 * @Title: getBaiduTiebaData 
+	 * @author hero 
+	 * @Description: 根据关键词采集指定贴吧内数据 
+	 * @param @param word
+	 * @param @param proxy
+	 * @param @param tiebaName
+	 * @param @return 设定文件 
 	 * @return List<TiebaData> 返回类型
 	 */
 	public static List<TiebaData> getBaiduTiebaData(String word, Proxy proxy, String tiebaName) {

--- a/src/test/java/com/zhiwei/media_data_crawler/test/DataCrawlerTest.java
+++ b/src/test/java/com/zhiwei/media_data_crawler/test/DataCrawlerTest.java
@@ -57,5 +57,4 @@ public class DataCrawlerTest {
 			e.printStackTrace();
 		}
 	}
 }