修改代理ip及爬虫核心包

20ce0e8c · [zhangzhiwei] · 9ef31c31 · 20ce0e8c · 20ce0e8c · 20ce0e8c
Commit 20ce0e8c authored Nov 17, 2018 by [zhangzhiwei]
6 changed files
--- a/pom.xml
+++ b/pom.xml
@@ -65,7 +65,12 @@
 		<dependency>
 			<groupId>com.zhiwei.tools</groupId>
 			<artifactId>zhiwei-tools</artifactId>
-			<version>0.0.8-SNAPSHOT</version>
+			<version>0.0.9-SNAPSHOT</version>
+		</dependency>
+		<dependency>
+			<groupId>com.zhiwei</groupId>
+			<artifactId>excelpoi</artifactId>
+			<version>0.0.1-SNAPSHOT</version>
 		</dependency>
 	</dependencies>
 </project>
\ No newline at end of file
--- a/src/main/java/com/zhiwei/media_data_crawler/crawler/SougouZhihuCrawlerParse.java
+++ b/src/main/java/com/zhiwei/media_data_crawler/crawler/SougouZhihuCrawlerParse.java
@@ -195,7 +195,7 @@ public class SougouZhihuCrawlerParse{
 						}
 						comment_count = Integer.valueOf(commentCount);
 					}
-					zhihu = new ZhiHuData(link, title, pt, type, null, source, null, attitudes_count, null, comment_count, word);
+					zhihu = new ZhiHuData(link, title, pt, type, null, source, null, attitudes_count, null, comment_count, null,word);
 					zhihu = analysisZhihuArticle(link, proxy, zhihu);
 				}else {
 					Integer answer_count = 0;
@@ -206,7 +206,7 @@ public class SougouZhihuCrawlerParse{
 						}
 						answer_count = Integer.valueOf(answerText);
 					}
-					zhihu = new ZhiHuData(link, title, pt, type, null, null, null, null, answer_count, null, word);
+					zhihu = new ZhiHuData(link, title, pt, type, null, null, null, null, answer_count, null, null,word);
 					zhihu = analysisZhihuAnswer(link, proxy, zhihu);
 				}
 				list.add(zhihu);
@@ -241,6 +241,15 @@ public class SougouZhihuCrawlerParse{
 				Document document = Jsoup.parse(htmlBody);
 				String content = document.select("div.QuestionHeader-main").select("div.QuestionHeader-detail").text();
 				String commentCountText = document.select("div.QuestionHeader-Comment").text();
+				String time = "";
+				if(htmlBody.contains("pubDate")){
+					time = htmlBody.split("&quot;pubDate&quot;: &quot;")[1].split("&quot;")[0];
+					if(time!=null){
+						time = time.replaceAll("T", " ");
+					}
+				}else{
+					System.out.println("+++++++++++++++++++++++");
+				}
 				String regEx="[^0-9]";  
 				Pattern p = Pattern.compile(regEx);   
 				Matcher m = p.matcher(commentCountText);   
@@ -251,6 +260,7 @@ public class SougouZhihuCrawlerParse{
 				}
 				zhihu.setContent(content);
 				zhihu.setComment_count(comment_count);
+				zhihu.setTime(time);
 			}
 			return zhihu;
 		} catch (Exception e) {
@@ -274,7 +284,7 @@ public class SougouZhihuCrawlerParse{
 		try {
 			String htmlBody = downloadHtml(url, proxy, "文章");
 			Document document = Jsoup.parse(htmlBody);
-			String time = document.select("div.HoverTitle").first().select("time").attr("datetime");
+			String time = htmlBody.split("&quot;updated&quot;:")[1].split(",&quot;reviewers")[0];
 			Date date = new Date(time);
 			time = TimeParse.dateFormartString(date, "yyyy-MM-dd HH:mm:ss");
 			String content = document.select("[class=\"RichText PostIndex-content av-paddingSide av-card\"]").text();

--- a/src/main/java/com/zhiwei/media_data_crawler/crawler/ZhihuCrawlerParse.java
+++ b/src/main/java/com/zhiwei/media_data_crawler/crawler/ZhihuCrawlerParse.java
+package com.zhiwei.media_data_crawler.crawler;
+import java.net.Proxy;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import com.alibaba.fastjson.JSONArray;
+import com.alibaba.fastjson.JSONObject;
+import com.zhiwei.crawler.core.HttpBoot;
+import com.zhiwei.crawler.core.RequestUtils;
+import com.zhiwei.media_data_crawler.data.DataCrawler;
+import com.zhiwei.media_data_crawler.entity.ZhiHuData;
+import com.zhiwei.tools.httpclient.HeaderTool;
+import com.zhiwei.tools.timeparse.TimeParse;
+import com.zhiwei.tools.tools.URLCodeUtil;
+import com.zhiwei.tools.tools.ZhiWeiTools;
+import okhttp3.Response;
+/**
+ * Keyword crawler for the Zhihu search API ({@code /api/v4/search_v3}).
+ * <p>
+ * Pages through the JSON search results for one keyword and converts each
+ * entry into a {@link ZhiHuData} record.
+ */
+public class ZhihuCrawlerParse {
+	// Registered under this class's own name (previously TianYaCrawlerParse, a copy-paste slip).
+	private static final Logger logger = LogManager.getLogger(ZhihuCrawlerParse.class);
+	private static HttpBoot httpBoot = new HttpBoot();
+	/** Highest zero-based page index requested for one keyword. */
+	private static final int MAX_PAGE = 20;
+	/** Results requested per page; also the step of the {@code offset} query parameter. */
+	private static final int PAGE_SIZE = 50;
+	/** Download attempts per page before the last failure is rethrown. */
+	private static final int MAX_ATTEMPTS = 3;
+	/** Pause between page requests in milliseconds, applied only while {@code DataCrawler.sleepTime} is null. */
+	private static final int DEFAULT_SLEEP_MS = 3000;
+	/**
+	 * Collects Zhihu search results for a keyword, page by page.
+	 * <p>
+	 * Paging stops at the first page that yields no records, when the parser
+	 * reports that there is nothing more to read, or after page index
+	 * {@link #MAX_PAGE}. The previous implementation overwrote its own page-limit
+	 * flag with the parser result, so the limit never took effect.
+	 *
+	 * @param word      search keyword; must not be null
+	 * @param timeLimit value sent as the {@code time_zone} query parameter
+	 * @param proxy     proxy handed to the HTTP client
+	 * @param endTime   forwarded to the parser, which currently does not use it
+	 * @return the records of all pages read, possibly empty, never null
+	 * @throws Exception if a page cannot be downloaded after {@link #MAX_ATTEMPTS} attempts
+	 */
+	@SuppressWarnings("unchecked")
+	public static List<ZhiHuData> getZhihuData(String word, String timeLimit,Proxy proxy, Date endTime) throws Exception {
+		List<ZhiHuData> list = new ArrayList<ZhiHuData>();
+		for (int page = 0; page <= MAX_PAGE; page++) {
+			String htmlBody = downloadHtml(word, timeLimit, proxy, page);
+			if (htmlBody == null) {
+				break;
+			}
+			Map<String, Object> dataMap = analysisData(htmlBody, proxy, word, endTime);
+			List<ZhiHuData> dataList = (List<ZhiHuData>) dataMap.get("data");
+			if (dataList == null || dataList.isEmpty()) {
+				break;
+			}
+			list.addAll(dataList);
+			if (!Boolean.TRUE.equals(dataMap.get("more"))) {
+				break;
+			}
+			// NOTE(review): no pause at all happens when DataCrawler.sleepTime is set;
+			// confirm whether that value was meant to be used as the delay here.
+			if (DataCrawler.sleepTime == null) {
+				ZhiWeiTools.sleep(DEFAULT_SLEEP_MS);
+			}
+		}
+		return list;
+	}
+	/**
+	 * Downloads one page of search results, retrying on failure.
+	 *
+	 * @param word      search keyword; must not be null
+	 * @param timeLimit value sent as the {@code time_zone} query parameter
+	 * @param proxy     proxy handed to the HTTP client
+	 * @param page      zero-based page index
+	 * @return the response body as text
+	 * @throws IllegalArgumentException if no URL can be built because {@code word} is null
+	 * @throws Exception the failure of the final attempt, with its original stack trace
+	 */
+	private static String downloadHtml(String word, String timeLimit,Proxy proxy,
+			int page) throws Exception{
+		// Common request headers.
+		Map<String, String> headerMap = HeaderTool.getCommonHead();
+		// Request URL for this page.
+		String url = getUrl(word, timeLimit, page);
+		if (url == null) {
+			// Fail fast instead of sending a null URL through every retry.
+			throw new IllegalArgumentException("word must not be null");
+		}
+		Exception lastError = null;
+		for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
+			try {
+				Response response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
+				return response.body().string();
+			} catch (Exception e) {
+				// Pass the exception itself as the trailing argument so the logger prints its
+				// trace; fillInStackTrace() would have overwritten the trace of the rethrown error.
+				logger.error("获取数据时出现问题,问题为：{}", e.getMessage(), e);
+				lastError = e;
+			}
+		}
+		throw lastError;
+	}
+	/**
+	 * Parses one JSON response of the search API.
+	 * <p>
+	 * Entries that cannot be converted are logged and skipped, so a single
+	 * malformed entry no longer discards the rest of the page.
+	 *
+	 * @param htmlBody JSON text of the response
+	 * @param proxy    not used by the parser
+	 * @param word     keyword stored on every record
+	 * @param endTime  not used by the parser
+	 *                 NOTE(review): presumably meant as a cut-off date; confirm with callers
+	 * @return a map with key {@code "data"} (the {@code List<ZhiHuData>} parsed from this page)
+	 *         and key {@code "more"} (a Boolean, false when the response has no usable
+	 *         {@code data} array or cannot be parsed)
+	 * @throws Exception declared for interface compatibility; parse failures are handled internally
+	 */
+	private static Map<String, Object> analysisData(String htmlBody, Proxy proxy, String word, Date endTime) throws Exception{
+		Map<String, Object> resultMap = new HashMap<String, Object>();
+		List<ZhiHuData> list = new ArrayList<ZhiHuData>();
+		boolean more = true;
+		try {
+			JSONArray dataJson = JSONObject.parseObject(htmlBody).getJSONArray("data");
+			if (dataJson == null || dataJson.isEmpty()) {
+				more = false;
+			} else {
+				for (int i = 0; i < dataJson.size(); i++) {
+					JSONObject itemJson = null;
+					try {
+						itemJson = dataJson.getJSONObject(i);
+						ZhiHuData zhihuData = parseItem(itemJson, word);
+						if (zhihuData != null) {
+							list.add(zhihuData);
+						}
+					} catch (Exception e) {
+						logger.warn("Skipping Zhihu search entry that could not be parsed: {}", itemJson, e);
+					}
+				}
+			}
+		} catch (Exception e) {
+			logger.error("Failed to parse Zhihu search response", e);
+			more = false;
+		}
+		resultMap.put("data", list);
+		resultMap.put("more", more);
+		return resultMap;
+	}
+	/**
+	 * Converts one entry of the {@code data} array into a record.
+	 *
+	 * @param itemJson entry of the {@code data} array
+	 * @param word     keyword stored on the record
+	 * @return the record, or null for entries that carry a {@code data_list} key, which are skipped
+	 * @throws Exception if a required field such as {@code object} or {@code created_time} is missing
+	 */
+	private static ZhiHuData parseItem(JSONObject itemJson, String word) throws Exception {
+		if (itemJson.containsKey("data_list")) {
+			return null;
+		}
+		JSONObject objectJson = itemJson.getJSONObject("object");
+		// created_time is multiplied by 1000 before being used as a millisecond timestamp.
+		Date date = new Date(objectJson.getLong("created_time")*1000);
+		String time = TimeParse.dateFormartString(date, "yyyy-MM-dd HH:mm:ss");
+		JSONObject authorJson = objectJson.getJSONObject("author");
+		String source = authorJson != null ? authorJson.getString("name") : null;   // publisher
+		String type = objectJson.getString("type");
+		Integer attitudes_count = intOrZero(objectJson, "voteup_count");    // upvotes
+		Integer follower_count = intOrZero(objectJson, "follower_count");   // followers
+		Integer comment_count = intOrZero(objectJson, "comment_count");     // comments
+		Integer answer_count = intOrZero(objectJson, "answer_count");       // answers
+		// Entries that embed a question take the question name as their title.
+		JSONObject questionJson = objectJson.getJSONObject("question");
+		String title = questionJson != null ? questionJson.getString("name") : objectJson.getString("title");
+		String content = joinNonNull(objectJson.getString("content"), objectJson.getString("excerpt"));
+		String url = null;
+		if ("answer".equals(type)) {
+			// Path is /question/{question id}/answer/{answer id}; the two ids used to be swapped.
+			url = "https://www.zhihu.com/question/"+questionJson.getLong("id")+"/answer/"+objectJson.getLong("id");
+		} else if ("article".equals(type)) {
+			url = "https://zhuanlan.zhihu.com/p/"+objectJson.getLong("id");
+		} else if ("question".equals(type)) {
+			url = "https://www.zhihu.com/question/"+objectJson.getLong("id");
+		}
+		content = ZhiWeiTools.delHTMLTag(content);
+		title = ZhiWeiTools.delHTMLTag(title);
+		return new ZhiHuData(url, title, "知乎", type, time, source, content, attitudes_count, answer_count, comment_count, follower_count,word);
+	}
+	/** Reads an integer field, substituting 0 when the field is absent or null. */
+	private static Integer intOrZero(JSONObject json, String key) {
+		Integer value = json.getInteger(key);
+		return value != null ? value : 0;
+	}
+	/** Concatenates two strings, treating null as empty instead of the text "null". */
+	private static String joinNonNull(String first, String second) {
+		StringBuilder joined = new StringBuilder();
+		if (first != null) {
+			joined.append(first);
+		}
+		if (second != null) {
+			joined.append(second);
+		}
+		return joined.toString();
+	}
+	/**
+	 * Builds the search request URL for one page.
+	 *
+	 * @param word      search keyword, URL-encoded as UTF-8 into the {@code q} parameter
+	 * @param timeLimit value appended as the {@code time_zone} parameter
+	 * @param page      zero-based page index; the offset is {@code page * PAGE_SIZE}
+	 * @return the request URL, or null when {@code word} is null
+	 */
+	private static String getUrl(String word, String timeLimit,int page) {
+		String url = null;
+		if (word != null) {
+			url = "https://www.zhihu.com/api/v4/search_v3?t=general&correction=1&limit="+PAGE_SIZE+"&show_all_topics=0&q="+ URLCodeUtil.getURLEncode(word, "utf-8")
+					+"&show_all_topics=0&time_zone="+ timeLimit +"&offset="+page*PAGE_SIZE;
+		}
+		logger.debug("Zhihu search url: {}", url);
+		return url;
+	}
+}
--- a/src/main/java/com/zhiwei/media_data_crawler/data/DataCrawler.java
+++ b/src/main/java/com/zhiwei/media_data_crawler/data/DataCrawler.java
@@ -369,4 +369,24 @@ public class DataCrawler {
+	/**
+	 * Collects Zhihu search results for a keyword.
+	 * <p>
+	 * Delegates to {@code ZhihuCrawlerParse.getZhihuData}; note that the delegate
+	 * takes the proxy before the end date.
+	 *
+	 * @param word      search keyword
+	 * @param timeLimit a_day = within one day, a_week = within one week,
+	 *                  three_months = within three months
+	 * @param endDate   passed to the delegate as its end time
+	 * @param proxy     proxy passed to the delegate
+	 * @return the list returned by the delegate
+	 * @throws Exception any exception raised by the delegate, propagated unchanged
+	 */
+	public static List<ZhiHuData> getZhihuByWord(String word, String timeLimit,Date endDate, Proxy proxy) throws Exception{
+		List<ZhiHuData> zhihuList = ZhihuCrawlerParse.getZhihuData(word, timeLimit, proxy, endDate);
+		return zhihuList;
+	}
 }
--- a/src/main/java/com/zhiwei/media_data_crawler/entity/ZhiHuData.java
+++ b/src/main/java/com/zhiwei/media_data_crawler/entity/ZhiHuData.java
@@ -26,6 +26,8 @@ public class ZhiHuData implements Serializable{
 	private Integer comment_count;   //评论数
+	private Integer follower_count;
 	@Override
 	public String toString(){
 		return "new ZhiHuData["
@@ -39,6 +41,7 @@ public class ZhiHuData implements Serializable{
 				+ ", attitudes_count = " + attitudes_count
 				+ ", answer_count = " + answer_count
 				+ ", comment_count = " + comment_count
+				+ ", follower_count = " + follower_count
 				+ ", word = " + word
 				+ "]";
 	}
@@ -47,7 +50,7 @@ public class ZhiHuData implements Serializable{
 	public ZhiHuData(String url, String title, String pt, String type, String time, String source,
 			String content, Integer attitudes_count, Integer answer_count, Integer comment_count
-			,String word){
+			,Integer follower_count,String word){
 				this.url = url;
 				this.title = title;
 				this.pt = pt;
@@ -58,6 +61,7 @@ public class ZhiHuData implements Serializable{
 				this.attitudes_count = attitudes_count;
 				this.answer_count = answer_count;
 				this.comment_count = comment_count;
+				this.follower_count = follower_count;
 				this.word = word;
 	}
@@ -151,5 +155,11 @@ private String word;      //采集关键词
 		this.comment_count = comment_count;
 	}
+	/**
+	 * @return the stored follower count; may be null when none was set
+	 */
+	public Integer getFollower_count() {
+		return this.follower_count;
+	}
+	/**
+	 * Replaces the stored follower count.
+	 *
+	 * @param follower_count new value, stored as given
+	 */
+	public void setFollower_count(Integer follower_count) {
+		this.follower_count = follower_count;
+	}
 }
--- a/src/test/java/com/zhiwei/media_data_crawler/test/DataCrawlerTest.java
+++ b/src/test/java/com/zhiwei/media_data_crawler/test/DataCrawlerTest.java
 package com.zhiwei.media_data_crawler.test;
 import java.net.Proxy;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
-import com.zhiwei.media_data_crawler.crawler.BaiduTiebaCrawlerParse;
+import com.zhiwei.excelpoi.excel.PoiExcelUtil;
-import com.zhiwei.media_data_crawler.crawler.SougouZhihuCrawlerParse;
 import com.zhiwei.media_data_crawler.data.DataCrawler;
-import com.zhiwei.media_data_crawler.entity.DouBanData;
-import com.zhiwei.media_data_crawler.entity.LunTanData;
-import com.zhiwei.media_data_crawler.entity.NewsData;
-import com.zhiwei.media_data_crawler.entity.TiebaData;
 import com.zhiwei.media_data_crawler.entity.ZhiHuData;
+import com.zhiwei.tools.timeparse.TimeParse;
 public class DataCrawlerTest {
+	public static void main(String[] args) {
+		DataCrawlerTest.getSoNewsTest();
+	}
+	public static void getSoNewsTest(){
-	public void getSoNewsTest(){
+		String word = "58同城";     //关键词
-		String word = "马云";     //关键词
+		String startTime = "2018-10-23 23:00:00";  //开始时间
-		String startTime = "2017-03-01 00:00:00";  //开始时间
+		String endTime = "2018-10-23 23:59:59";    //结束时间
-		String endTime = "2017-03-01 23:59:59";    //结束时间
 		Proxy proxy = null;      //代理IP，不用可不填写
 		try {
 //			//百度新闻采集demo
-			List<NewsData> list = DataCrawler.getBaiduNewsData(word, startTime, endTime, proxy);
+//			List<NewsData> list = DataCrawler.getBaiduNewsData(word, startTime, endTime, proxy);
 //			//搜狗新闻关键词采集demo
 //			List<NewsData> sogouNewsList = DataCrawler.getSougouNewsData(word, proxy);
 //			//360新闻采集demo
 //			List<NewsData> soNewsList = DataCrawler.getSoNewsData(word, proxy);
-//			//搜狗知乎采集
-//			List<ZhiHuData> zhihuList = DataCrawler.getSougouZhihuData(word, proxy);
-//			System.out.println(zhihuList.size());
 //			//Baidu貼吧採集
 //			String tiebaName = "京东";  //贴吧名称，指定贴吧内采集，无则为null
 //			List<TiebaData> tiebaList = DataCrawler.getBaiduTiebaData(word, proxy, tiebaName);
@@ -41,17 +40,64 @@ public class DataCrawlerTest {
 			//豆瓣采集
 //			String type = "topic";   //topic 为指定话题采集，note为指定日记采集
 //			List<DouBanData> list = DataCrawler.getDouBanData(word, type, proxy);
+//			List<NewsData> list = DataCrawler.getSoData("京东", "www.toutiao.com", "d", proxy);
+			Date endDate = TimeParse.stringFormartDate(endTime);
+			PoiExcelUtil poi = PoiExcelUtil.getInstance();
+			List<Map<String,Object>> dataList = new ArrayList<>();
+			List<String> headList = new ArrayList<>();
+			headList.add("url");
+			headList.add("title");
+			headList.add("pt");
+			headList.add("type");
+			headList.add("time");
+			headList.add("source");
+			headList.add("content");
+			headList.add("attitudes_count");
+			headList.add("answer_count");
+			headList.add("comment_count");
+			headList.add("word");
+			//搜狗知乎采集
+			String[] words = word.split("\\|");
-//			List<NewsData> list = DataCrawler.getSoData("京东", "www.toutiao.com", "d", proxy);
+			for(int i=0;i<words.length;i++){
-			for(NewsData newsData : list) {
+				System.out.println(words[i]+"   开始采集");
-				System.out.println(newsData);
+				List<ZhiHuData> zhihuList = DataCrawler.getZhihuByWord(words[i], "a_week",endDate, proxy);
+				System.out.println(words[i]+"=============="+zhihuList.size());
+				for(ZhiHuData zhiHuData : zhihuList) {
+					Map<String,Object> map = new HashMap<String,Object>();
+					map.put("url", zhiHuData.getUrl());
+					map.put("title", zhiHuData.getTitle());
+					map.put("pt", zhiHuData.getPt());
+					map.put("type", zhiHuData.getType());
+					map.put("time", zhiHuData.getTime());
+					map.put("source", zhiHuData.getSource());
+					map.put("content", zhiHuData.getContent());
+					map.put("attitudes_count", zhiHuData.getAttitudes_count());
+					map.put("answer_count", zhiHuData.getAnswer_count());
+					map.put("comment_count", zhiHuData.getComment_count());                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     
+					map.put("word", zhiHuData.getWord());
+					dataList.add(map);
 				}
+			}
+			poi.exportExcel("F://知乎数据采集.xlsx", "0", headList, dataList);;
 		} catch (Exception e) {
 			// TODO Auto-generated catch block
 			e.printStackTrace();
 		}
 	}
 }