Commit 20ce0e8c by [zhangzhiwei]

修改代理ip及爬虫核心包

parent 9ef31c31
...@@ -65,7 +65,12 @@ ...@@ -65,7 +65,12 @@
<dependency> <dependency>
<groupId>com.zhiwei.tools</groupId> <groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId> <artifactId>zhiwei-tools</artifactId>
<version>0.0.8-SNAPSHOT</version> <version>0.0.9-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.zhiwei</groupId>
<artifactId>excelpoi</artifactId>
<version>0.0.1-SNAPSHOT</version>
</dependency> </dependency>
</dependencies> </dependencies>
</project> </project>
\ No newline at end of file
...@@ -195,7 +195,7 @@ public class SougouZhihuCrawlerParse{ ...@@ -195,7 +195,7 @@ public class SougouZhihuCrawlerParse{
} }
comment_count = Integer.valueOf(commentCount); comment_count = Integer.valueOf(commentCount);
} }
zhihu = new ZhiHuData(link, title, pt, type, null, source, null, attitudes_count, null, comment_count, word); zhihu = new ZhiHuData(link, title, pt, type, null, source, null, attitudes_count, null, comment_count, null,word);
zhihu = analysisZhihuArticle(link, proxy, zhihu); zhihu = analysisZhihuArticle(link, proxy, zhihu);
}else { }else {
Integer answer_count = 0; Integer answer_count = 0;
...@@ -206,7 +206,7 @@ public class SougouZhihuCrawlerParse{ ...@@ -206,7 +206,7 @@ public class SougouZhihuCrawlerParse{
} }
answer_count = Integer.valueOf(answerText); answer_count = Integer.valueOf(answerText);
} }
zhihu = new ZhiHuData(link, title, pt, type, null, null, null, null, answer_count, null, word); zhihu = new ZhiHuData(link, title, pt, type, null, null, null, null, answer_count, null, null,word);
zhihu = analysisZhihuAnswer(link, proxy, zhihu); zhihu = analysisZhihuAnswer(link, proxy, zhihu);
} }
list.add(zhihu); list.add(zhihu);
...@@ -241,6 +241,15 @@ public class SougouZhihuCrawlerParse{ ...@@ -241,6 +241,15 @@ public class SougouZhihuCrawlerParse{
Document document = Jsoup.parse(htmlBody); Document document = Jsoup.parse(htmlBody);
String content = document.select("div.QuestionHeader-main").select("div.QuestionHeader-detail").text(); String content = document.select("div.QuestionHeader-main").select("div.QuestionHeader-detail").text();
String commentCountText = document.select("div.QuestionHeader-Comment").text(); String commentCountText = document.select("div.QuestionHeader-Comment").text();
String time = "";
if(htmlBody.contains("pubDate")){
time = htmlBody.split("&quot;pubDate&quot;: &quot;")[1].split("&quot;")[0];
if(time!=null){
time = time.replaceAll("T", " ");
}
}else{
System.out.println("+++++++++++++++++++++++");
}
String regEx="[^0-9]"; String regEx="[^0-9]";
Pattern p = Pattern.compile(regEx); Pattern p = Pattern.compile(regEx);
Matcher m = p.matcher(commentCountText); Matcher m = p.matcher(commentCountText);
...@@ -251,6 +260,7 @@ public class SougouZhihuCrawlerParse{ ...@@ -251,6 +260,7 @@ public class SougouZhihuCrawlerParse{
} }
zhihu.setContent(content); zhihu.setContent(content);
zhihu.setComment_count(comment_count); zhihu.setComment_count(comment_count);
zhihu.setTime(time);
} }
return zhihu; return zhihu;
} catch (Exception e) { } catch (Exception e) {
...@@ -274,7 +284,7 @@ public class SougouZhihuCrawlerParse{ ...@@ -274,7 +284,7 @@ public class SougouZhihuCrawlerParse{
try { try {
String htmlBody = downloadHtml(url, proxy, "文章"); String htmlBody = downloadHtml(url, proxy, "文章");
Document document = Jsoup.parse(htmlBody); Document document = Jsoup.parse(htmlBody);
String time = document.select("div.HoverTitle").first().select("time").attr("datetime"); String time = htmlBody.split("&quot;updated&quot;:")[1].split(",&quot;reviewers")[0];
Date date = new Date(time); Date date = new Date(time);
time = TimeParse.dateFormartString(date, "yyyy-MM-dd HH:mm:ss"); time = TimeParse.dateFormartString(date, "yyyy-MM-dd HH:mm:ss");
String content = document.select("[class=\"RichText PostIndex-content av-paddingSide av-card\"]").text(); String content = document.select("[class=\"RichText PostIndex-content av-paddingSide av-card\"]").text();
......
package com.zhiwei.media_data_crawler.crawler;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.ZhiHuData;
import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.URLCodeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response;
public class ZhihuCrawlerParse {
// Logger bound to this class so that log records are attributed to the Zhihu crawler.
private static final Logger logger = LogManager.getLogger(ZhihuCrawlerParse.class);
// HTTP client shared by every request issued from this class.
private static final HttpBoot httpBoot = new HttpBoot();
/**
 * Searches Zhihu for a keyword and collects the parsed results page by page.
 *
 * <p>Paging stops as soon as one of the following holds: a page body is
 * {@code null}, a page yields no items, the page parser reports that there is
 * nothing more to read, or the page index has reached the hard cap of 20.
 *
 * @param word      search keyword
 * @param timeLimit time filter forwarded to the search request
 * @param proxy     proxy forwarded to the HTTP client
 * @param endTime   forwarded unchanged to the page parser
 * @return every item parsed from the fetched pages; never {@code null}
 * @throws Exception if downloading or parsing a page fails
 */
@SuppressWarnings("unchecked")
public static List<ZhiHuData> getZhihuData(String word, String timeLimit,Proxy proxy, Date endTime) throws Exception {
    // Highest page index that is ever requested. The cap is enforced by the loop
    // condition so that the parser's "more" flag can no longer override it.
    final int maxPageIndex = 20;
    List<ZhiHuData> list = new ArrayList<ZhiHuData>();
    for (int page = 0; page <= maxPageIndex; page++) {
        String htmlBody = downloadHtml(word, timeLimit, proxy, page);
        if (htmlBody == null) {
            break;
        }
        Map<String, Object> dataMap = analysisData(htmlBody, proxy, word, endTime);
        List<ZhiHuData> dataList = (List<ZhiHuData>) dataMap.get("data");
        if (dataList == null || dataList.isEmpty()) {
            break;
        }
        list.addAll(dataList);
        if (!Boolean.TRUE.equals(dataMap.get("more"))) {
            break;
        }
        // Pause between consecutive requests only; there is no point in waiting after the last page.
        // NOTE(review): the pause is applied only when DataCrawler.sleepTime is unset — confirm that is intended.
        if (DataCrawler.sleepTime == null) {
            ZhiWeiTools.sleep(3000);
        }
    }
    return list;
}
/**
 * Downloads one page of Zhihu search results, retrying up to three times.
 *
 * @param word      search keyword
 * @param timeLimit time filter placed in the request URL
 * @param proxy     proxy handed to the HTTP client
 * @param page      zero-based page index
 * @return the response body as text
 * @throws Exception the failure of the last attempt when every attempt fails
 */
private static String downloadHtml(String word, String timeLimit,Proxy proxy,
        int page) throws Exception{
    final int maxAttempts = 3;
    // Common request headers.
    Map<String, String> headerMap = HeaderTool.getCommonHead();
    // Request address for this page.
    String url = getUrl(word, timeLimit, page);
    Exception lastFailure = null;
    for (int attempt = 1; attempt <= maxAttempts; attempt++) {
        // try-with-resources releases the connection on every path, including failures while reading the body.
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false)) {
            return response.body().string();
        } catch (Exception e) {
            // Pass the exception itself as the trailing argument so the original stack trace is logged;
            // calling fillInStackTrace() here would overwrite it with this frame.
            logger.error("获取数据时出现问题,问题为:{}", e.toString(), e);
            lastFailure = e;
        }
    }
    // Reached only when every attempt failed, so lastFailure is set.
    throw lastFailure;
}
/**
 * Parses one page of the Zhihu search response into {@link ZhiHuData} items.
 *
 * <p>Entries that carry a {@code data_list} key are ignored. An entry that cannot be
 * converted is logged and skipped without aborting the rest of the page.
 *
 * @param htmlBody raw JSON body of the search response
 * @param proxy    not used by this method
 * @param word     search keyword recorded on every produced item
 * @param endTime  not used by this method
 * @return a map with two entries: {@code "data"} holds the parsed {@code List<ZhiHuData>},
 *         {@code "more"} holds a {@code Boolean} that is {@code false} when the body has no
 *         (or an empty) {@code data} array or cannot be parsed
 * @throws Exception declared for callers; parsing failures are handled inside the method
 */
private static Map<String, Object> analysisData(String htmlBody, Proxy proxy, String word, Date endTime) throws Exception{
    Map<String, Object> resultMap = new HashMap<String, Object>();
    List<ZhiHuData> list = new ArrayList<ZhiHuData>();
    boolean more = true;
    try {
        JSONArray dataJson = JSONObject.parseObject(htmlBody).getJSONArray("data");
        if (dataJson == null || dataJson.isEmpty()) {
            more = false;
        } else {
            for (int i = 0; i < dataJson.size(); i++) {
                JSONObject objectJson = null;
                try {
                    JSONObject itemJson = dataJson.getJSONObject(i);
                    if (itemJson.containsKey("data_list")) {
                        continue;
                    }
                    objectJson = itemJson.getJSONObject("object");
                    list.add(toZhiHuData(objectJson, word));
                } catch (Exception e) {
                    // One malformed entry must not discard the whole page.
                    logger.error("Skipping Zhihu search item that could not be parsed: {}", objectJson, e);
                }
            }
        }
    } catch (Exception e) {
        logger.error("Failed to parse Zhihu search response", e);
        more = false;
    }
    resultMap.put("data", list);
    resultMap.put("more", more);
    return resultMap;
}

/** Converts the {@code object} node of one search entry into a {@link ZhiHuData}. */
private static ZhiHuData toZhiHuData(JSONObject objectJson, String word) throws Exception {
    String type = objectJson.getString("type");
    if (type == null) {
        // Entries without a type were never emitted; keep skipping them, but explicitly.
        throw new IllegalArgumentException("search item has no type");
    }
    // created_time is multiplied by 1000 to obtain the millisecond value java.util.Date expects.
    Date date = new Date(objectJson.getLong("created_time") * 1000);
    String time = TimeParse.dateFormartString(date, "yyyy-MM-dd HH:mm:ss");
    String source = objectJson.getJSONObject("author").getString("name");
    Integer attitudes_count = intOrZero(objectJson, "voteup_count");     // up-votes
    Integer follower_count = intOrZero(objectJson, "follower_count");   // followers
    Integer comment_count = intOrZero(objectJson, "comment_count");     // comments
    Integer answer_count = intOrZero(objectJson, "answer_count");       // answers
    String title = objectJson.containsKey("question")
            ? objectJson.getJSONObject("question").getString("name")
            : objectJson.getString("title");
    // Join the two text fields without turning a missing one into the literal text "null".
    String content = emptyIfNull(objectJson.getString("content")) + emptyIfNull(objectJson.getString("excerpt"));
    String url = buildItemUrl(type, objectJson);
    content = ZhiWeiTools.delHTMLTag(content);
    title = ZhiWeiTools.delHTMLTag(title);
    return new ZhiHuData(url, title, "知乎", type, time, source, content, attitudes_count, answer_count, comment_count, follower_count,word);
}

/** Builds the public page address for an entry, or returns {@code null} for an unrecognised type. */
private static String buildItemUrl(String type, JSONObject objectJson) {
    Long id = objectJson.getLong("id");
    if ("answer".equals(type)) {
        // Answer pages are addressed as /question/{questionId}/answer/{answerId}.
        Long questionId = objectJson.getJSONObject("question").getLong("id");
        return "https://www.zhihu.com/question/" + questionId + "/answer/" + id;
    }
    if ("article".equals(type)) {
        return "https://zhuanlan.zhihu.com/p/" + id;
    }
    if ("question".equals(type)) {
        return "https://www.zhihu.com/question/" + id;
    }
    return null;
}

/** Reads an integer field, substituting 0 when the field is absent or null. */
private static Integer intOrZero(JSONObject json, String key) {
    Integer value = json.getInteger(key);
    return value != null ? value : Integer.valueOf(0);
}

/** Maps {@code null} to the empty string. */
private static String emptyIfNull(String text) {
    return text != null ? text : "";
}
/**
 * Builds the Zhihu search request URL for one result page.
 *
 * @param word      search keyword; it is URL-encoded as UTF-8
 * @param timeLimit value of the {@code time_zone} query parameter; the parameter is
 *                  omitted when this is {@code null}
 * @param page      zero-based page index, converted to an item offset
 * @return the request URL, or {@code null} when {@code word} is {@code null}
 */
private static String getUrl(String word, String timeLimit,int page) {
    // Items per page; used both for the limit parameter and for the offset calculation.
    final int pageSize = 50;
    String url = null;
    if (word != null) {
        StringBuilder builder = new StringBuilder("https://www.zhihu.com/api/v4/search_v3?t=general&correction=1");
        builder.append("&limit=").append(pageSize);
        builder.append("&show_all_topics=0");
        builder.append("&q=").append(URLCodeUtil.getURLEncode(word, "utf-8"));
        if (timeLimit != null) {
            // Avoid sending the literal text "null" as the filter value.
            builder.append("&time_zone=").append(timeLimit);
        }
        builder.append("&offset=").append(page * pageSize);
        url = builder.toString();
    }
    logger.debug("Zhihu search url: {}", url);
    return url;
}
}
...@@ -369,4 +369,24 @@ public class DataCrawler { ...@@ -369,4 +369,24 @@ public class DataCrawler {
/**
 * Collects Zhihu search results for a keyword.
 *
 * @param word      search keyword
 * @param timeLimit time window: a_day = within one day, a_week = within one week,
 *                  three_months = within three months
 * @param endDate   passed through to {@code ZhihuCrawlerParse.getZhihuData}
 * @param proxy     passed through to {@code ZhihuCrawlerParse.getZhihuData}
 * @return the list produced by {@code ZhihuCrawlerParse.getZhihuData}
 * @throws Exception any exception raised by the crawler, propagated unchanged
 */
public static List<ZhiHuData> getZhihuByWord(String word, String timeLimit,Date endDate, Proxy proxy) throws Exception{
    // Note the argument order expected by the crawler: proxy comes before the end date.
    List<ZhiHuData> results = ZhihuCrawlerParse.getZhihuData(word, timeLimit, proxy, endDate);
    return results;
}
} }
...@@ -26,6 +26,8 @@ public class ZhiHuData implements Serializable{ ...@@ -26,6 +26,8 @@ public class ZhiHuData implements Serializable{
private Integer comment_count; //评论数 private Integer comment_count; //评论数
private Integer follower_count;
@Override @Override
public String toString(){ public String toString(){
return "new ZhiHuData[" return "new ZhiHuData["
...@@ -39,6 +41,7 @@ public class ZhiHuData implements Serializable{ ...@@ -39,6 +41,7 @@ public class ZhiHuData implements Serializable{
+ ", attitudes_count = " + attitudes_count + ", attitudes_count = " + attitudes_count
+ ", answer_count = " + answer_count + ", answer_count = " + answer_count
+ ", comment_count = " + comment_count + ", comment_count = " + comment_count
+ ", follower_count = " + follower_count
+ ", word = " + word + ", word = " + word
+ "]"; + "]";
} }
...@@ -47,7 +50,7 @@ public class ZhiHuData implements Serializable{ ...@@ -47,7 +50,7 @@ public class ZhiHuData implements Serializable{
public ZhiHuData(String url, String title, String pt, String type, String time, String source, public ZhiHuData(String url, String title, String pt, String type, String time, String source,
String content, Integer attitudes_count, Integer answer_count, Integer comment_count String content, Integer attitudes_count, Integer answer_count, Integer comment_count
,String word){ ,Integer follower_count,String word){
this.url = url; this.url = url;
this.title = title; this.title = title;
this.pt = pt; this.pt = pt;
...@@ -58,6 +61,7 @@ public class ZhiHuData implements Serializable{ ...@@ -58,6 +61,7 @@ public class ZhiHuData implements Serializable{
this.attitudes_count = attitudes_count; this.attitudes_count = attitudes_count;
this.answer_count = answer_count; this.answer_count = answer_count;
this.comment_count = comment_count; this.comment_count = comment_count;
this.follower_count = follower_count;
this.word = word; this.word = word;
} }
...@@ -151,5 +155,11 @@ private String word; //采集关键词 ...@@ -151,5 +155,11 @@ private String word; //采集关键词
this.comment_count = comment_count; this.comment_count = comment_count;
} }
/**
 * Returns the follower count stored on this record.
 *
 * @return the follower count; may be {@code null}, since the field is a boxed Integer
 */
public Integer getFollower_count() {
return follower_count;
}
/**
 * Replaces the follower count stored on this record.
 *
 * @param follower_count the new follower count
 */
public void setFollower_count(Integer follower_count) {
this.follower_count = follower_count;
}
} }
package com.zhiwei.media_data_crawler.test; package com.zhiwei.media_data_crawler.test;
import java.net.Proxy; import java.net.Proxy;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map;
import com.zhiwei.media_data_crawler.crawler.BaiduTiebaCrawlerParse; import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.media_data_crawler.crawler.SougouZhihuCrawlerParse;
import com.zhiwei.media_data_crawler.data.DataCrawler; import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.DouBanData;
import com.zhiwei.media_data_crawler.entity.LunTanData;
import com.zhiwei.media_data_crawler.entity.NewsData;
import com.zhiwei.media_data_crawler.entity.TiebaData;
import com.zhiwei.media_data_crawler.entity.ZhiHuData; import com.zhiwei.media_data_crawler.entity.ZhiHuData;
import com.zhiwei.tools.timeparse.TimeParse;
public class DataCrawlerTest { public class DataCrawlerTest {
/**
 * Manual entry point: runs the {@code getSoNewsTest()} demo.
 *
 * @param args command-line arguments; not read here
 */
public static void main(String[] args) {
DataCrawlerTest.getSoNewsTest();
}
public static void getSoNewsTest(){
public void getSoNewsTest(){ String word = "58同城"; //关键词
String word = "马云"; //关键词 String startTime = "2018-10-23 23:00:00"; //开始时间
String startTime = "2017-03-01 00:00:00"; //开始时间 String endTime = "2018-10-23 23:59:59"; //结束时间
String endTime = "2017-03-01 23:59:59"; //结束时间
Proxy proxy = null; //代理IP,不用可不填写 Proxy proxy = null; //代理IP,不用可不填写
try { try {
// //百度新闻采集demo // //百度新闻采集demo
List<NewsData> list = DataCrawler.getBaiduNewsData(word, startTime, endTime, proxy); // List<NewsData> list = DataCrawler.getBaiduNewsData(word, startTime, endTime, proxy);
// //搜狗新闻关键词采集demo // //搜狗新闻关键词采集demo
// List<NewsData> sogouNewsList = DataCrawler.getSougouNewsData(word, proxy); // List<NewsData> sogouNewsList = DataCrawler.getSougouNewsData(word, proxy);
// //360新闻采集demo // //360新闻采集demo
// List<NewsData> soNewsList = DataCrawler.getSoNewsData(word, proxy); // List<NewsData> soNewsList = DataCrawler.getSoNewsData(word, proxy);
// //搜狗知乎采集
// List<ZhiHuData> zhihuList = DataCrawler.getSougouZhihuData(word, proxy);
// System.out.println(zhihuList.size());
// //Baidu貼吧採集 // //Baidu貼吧採集
// String tiebaName = "京东"; //贴吧名称,指定贴吧内采集,无则为null // String tiebaName = "京东"; //贴吧名称,指定贴吧内采集,无则为null
// List<TiebaData> tiebaList = DataCrawler.getBaiduTiebaData(word, proxy, tiebaName); // List<TiebaData> tiebaList = DataCrawler.getBaiduTiebaData(word, proxy, tiebaName);
...@@ -41,17 +40,64 @@ public class DataCrawlerTest { ...@@ -41,17 +40,64 @@ public class DataCrawlerTest {
//豆瓣采集 //豆瓣采集
// String type = "topic"; //topic 为指定话题采集,note为指定日记采集 // String type = "topic"; //topic 为指定话题采集,note为指定日记采集
// List<DouBanData> list = DataCrawler.getDouBanData(word, type, proxy); // List<DouBanData> list = DataCrawler.getDouBanData(word, type, proxy);
// List<NewsData> list = DataCrawler.getSoData("京东", "www.toutiao.com", "d", proxy);
Date endDate = TimeParse.stringFormartDate(endTime);
PoiExcelUtil poi = PoiExcelUtil.getInstance();
List<Map<String,Object>> dataList = new ArrayList<>();
List<String> headList = new ArrayList<>();
headList.add("url");
headList.add("title");
headList.add("pt");
headList.add("type");
headList.add("time");
headList.add("source");
headList.add("content");
headList.add("attitudes_count");
headList.add("answer_count");
headList.add("comment_count");
headList.add("word");
//搜狗知乎采集
String[] words = word.split("\\|");
// List<NewsData> list = DataCrawler.getSoData("京东", "www.toutiao.com", "d", proxy); for(int i=0;i<words.length;i++){
for(NewsData newsData : list) { System.out.println(words[i]+" 开始采集");
System.out.println(newsData); List<ZhiHuData> zhihuList = DataCrawler.getZhihuByWord(words[i], "a_week",endDate, proxy);
System.out.println(words[i]+"=============="+zhihuList.size());
for(ZhiHuData zhiHuData : zhihuList) {
Map<String,Object> map = new HashMap<String,Object>();
map.put("url", zhiHuData.getUrl());
map.put("title", zhiHuData.getTitle());
map.put("pt", zhiHuData.getPt());
map.put("type", zhiHuData.getType());
map.put("time", zhiHuData.getTime());
map.put("source", zhiHuData.getSource());
map.put("content", zhiHuData.getContent());
map.put("attitudes_count", zhiHuData.getAttitudes_count());
map.put("answer_count", zhiHuData.getAnswer_count());
map.put("comment_count", zhiHuData.getComment_count());
map.put("word", zhiHuData.getWord());
dataList.add(map);
} }
}
poi.exportExcel("F://知乎数据采集.xlsx", "0", headList, dataList);;
} catch (Exception e) { } catch (Exception e) {
// TODO Auto-generated catch block // TODO Auto-generated catch block
e.printStackTrace(); e.printStackTrace();
} }
} }
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment