Commit eaa3a775 by zhiwei

添加搜狗知乎采集程序

parent fc1372d5
...@@ -26,6 +26,8 @@ ...@@ -26,6 +26,8 @@
List<NewsData> sogouNewsList = DataCrawler.getSougouNewsData(word, proxy); List<NewsData> sogouNewsList = DataCrawler.getSougouNewsData(word, proxy);
//360新闻采集demo //360新闻采集demo
List<NewsData> soNewsList = DataCrawler.getSoNewsData(word, proxy); List<NewsData> soNewsList = DataCrawler.getSoNewsData(word, proxy);
//搜狗知乎采集
List<ZhiHuData> zhihuList = DataCrawler.getSougouZhihuData(word, proxy);
......
package com.zhiwei.media_data_crawler.crawler;
import java.io.IOException;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.media_data_crawler.entity.ZhiHuData;
import com.zhiwei.zhiweiTools.httpClient.HeaderTool;
import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
import com.zhiwei.zhiweiTools.tools.URLCodeUtil;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
/**
 * Collects Zhihu entries for a keyword by paging through Sogou's Zhihu search
 * (zhihu.sogou.com) and then visiting each hit on Zhihu itself to fill in the
 * detail fields (content, publish time, comment count).
 *
 * <p>All HTTP traffic goes through the inherited {@code get(url, proxy, headers)}
 * helper of {@link HttpClientTemplateOK}.
 */
public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {

    private static final Logger logger = LoggerFactory.getLogger(SougouZhihuCrawlerParse.class);

    /** Platform label stored on every collected record. */
    private static final String pt = "搜狗知乎";

    /** Highest search-result page requested for a single keyword. */
    private static final int MAX_PAGE = 50;

    /** Number of attempts made for one download before giving up. */
    private static final int MAX_DOWNLOAD_ATTEMPTS = 3;

    /** Matches every character that is not an ASCII digit; compiled once and reused. */
    private static final Pattern NON_DIGIT = Pattern.compile("[^0-9]");

    /** Typed outcome of parsing one search-result page. */
    private static final class PageResult {
        /** Records extracted from the page (never null, possibly empty). */
        private final List<ZhiHuData> data;
        /** Whether the page advertises a following page. */
        private final boolean more;

        private PageResult(List<ZhiHuData> data, boolean more) {
            this.data = data;
            this.more = more;
        }
    }

    /**
     * Collects Zhihu data for a keyword from Sogou's Zhihu search.
     *
     * <p>Pages are requested one after another, starting at page 1, until a page
     * cannot be downloaded, a page no longer offers a "next page" link, or
     * {@value #MAX_PAGE} pages have been processed.
     *
     * @param word  search keyword
     * @param proxy proxy handed to the HTTP helper; the visible test passes {@code null}
     * @return all records collected so far; never {@code null}, possibly empty
     */
    public static List<ZhiHuData> getSougouZhihuData(String word, Proxy proxy) {
        List<ZhiHuData> list = new ArrayList<ZhiHuData>();
        boolean more = true;
        // The page cap is part of the loop condition so that a later "more" value
        // read from the page can never override it.
        for (int page = 1; more && page <= MAX_PAGE; page++) {
            String htmlBody = downloadHtml(word, proxy, page);
            if (htmlBody == null || htmlBody.equals("")) {
                break;
            }
            PageResult result = analysisData(htmlBody, proxy, word);
            list.addAll(result.data);
            logger.info("当前采集关键词:{}, 采集到第:{}页,采集到的数据总量为:{}", word, page, list.size());
            more = result.more;
            // Throttle between result pages (value is presumably milliseconds).
            ZhiWeiTools.sleep(5000);
        }
        return list;
    }

    /**
     * Downloads one Sogou search-result page for the keyword.
     *
     * @param word  search keyword
     * @param proxy proxy handed to the HTTP helper
     * @param page  1-based result page number
     * @return the response body, or {@code null} when every attempt failed
     */
    private static String downloadHtml(String word, Proxy proxy, int page) {
        Map<String, String> headerMap = HeaderTool.getCommonHead();
        headerMap.put("Host", "zhihu.sogou.com");
        return fetchWithRetry(getUrl(word, page), proxy, headerMap);
    }

    /**
     * Downloads a Zhihu detail page.
     *
     * @param url   address of the detail page; also sent as the Referer header
     * @param proxy proxy handed to the HTTP helper
     * @param type  entry type; a value containing "文章" selects the article host,
     *              anything else the main Zhihu host
     * @return the response body, or {@code null} when every attempt failed
     */
    private static String downloadHtml(String url, Proxy proxy, String type) {
        Map<String, String> headerMap = HeaderTool.getCommonHead();
        if (type.contains("文章")) {
            headerMap.put("Host", "zhuanlan.zhihu.com");
        } else {
            headerMap.put("Host", "www.zhihu.com");
        }
        headerMap.put("Referer", url);
        return fetchWithRetry(url, proxy, headerMap);
    }

    /**
     * Issues a GET request, retrying on {@link IOException}.
     *
     * @return the response body, or {@code null} after {@value #MAX_DOWNLOAD_ATTEMPTS}
     *         failed attempts
     */
    private static String fetchWithRetry(String url, Proxy proxy, Map<String, String> headerMap) {
        for (int attempt = 1; attempt <= MAX_DOWNLOAD_ATTEMPTS; attempt++) {
            try {
                return get(url, proxy, headerMap);
            } catch (IOException e) {
                // Pass the exception as the throwable argument so the stack trace is logged.
                logger.error("获取搜狗知乎数据时出现问题,链接为:" + url, e);
            }
        }
        return null;
    }

    /**
     * Parses one Sogou search-result page and enriches every hit with data from
     * its Zhihu detail page.
     *
     * <p>A hit that fails to parse is logged and skipped; the remaining hits are
     * still processed.
     *
     * @param htmlBody HTML of the search-result page
     * @param proxy    proxy used for the detail-page requests
     * @param word     keyword recorded on each result
     * @return the parsed records together with the "has next page" flag
     */
    private static PageResult analysisData(String htmlBody, Proxy proxy, String word) {
        List<ZhiHuData> list = new ArrayList<ZhiHuData>();
        Document document = Jsoup.parse(htmlBody);
        // Jsoup's select() never returns null, so the pager text is the only
        // meaningful signal for a following page.
        boolean more = document.select("div.result-page").text().contains("下一页");
        Elements elements = document.select("div.box-result").select("div.result-about-list");
        for (Element element : elements) {
            try {
                list.add(parseResult(element, proxy, word));
                // Throttle between detail-page requests.
                ZhiWeiTools.sleep(1000);
            } catch (Exception e) {
                logger.error("搜狗知乎数据解析时出现问题", e);
            }
        }
        return new PageResult(list, more);
    }

    /**
     * Builds one record from a single search hit and fills in its detail fields.
     *
     * @throws NumberFormatException when a counter on the hit is present but not a plain integer
     */
    private static ZhiHuData parseResult(Element element, Proxy proxy, String word) {
        Elements titleLink = element.select("h4.about-list-title").select("a");
        String link = titleLink.attr("href");
        String title = titleLink.text();
        Elements answerNum = element.select("div.about-text").select("span.answer-num");
        String typeAndAnswerText = answerNum.text();
        String answerText = answerNum.select("a").text();
        // Literal replacement: the scraped text must not be interpreted as a regex.
        String type = typeAndAnswerText.replace(answerText, "");
        if (type.contains("文章")) {
            Elements about = element.select("p.about-answer");
            String source = about.select("cite").text();
            Integer attitudes_count = parseCount(about.select("span.count").text(), "个赞");
            Integer comment_count = parseCount(answerText, "个评论");
            ZhiHuData article = new ZhiHuData(link, title, pt, type, null, source, null,
                    attitudes_count, null, comment_count, word);
            return analysisZhihuArticle(link, proxy, article);
        }
        Integer answer_count = parseCount(answerText, "个回答");
        ZhiHuData question = new ZhiHuData(link, title, pt, type, null, null, null,
                null, answer_count, null, word);
        return analysisZhihuAnswer(link, proxy, question);
    }

    /**
     * Parses a counter such as "12个回答" after removing its unit suffix.
     *
     * @param text   raw counter text
     * @param suffix unit text to strip
     * @return the parsed number, or 0 when nothing remains after stripping
     * @throws NumberFormatException when the remainder is not a plain integer
     */
    private static Integer parseCount(String text, String suffix) {
        String digits = text.replace(suffix, "").trim();
        if (digits.isEmpty()) {
            return Integer.valueOf(0);
        }
        return Integer.valueOf(digits);
    }

    /**
     * Fills content and comment count of a question entry from its Zhihu page.
     *
     * @param url   address of the question page
     * @param proxy proxy handed to the HTTP helper
     * @param zhihu record to enrich
     * @return the same record; left as far as it was filled when the page could
     *         not be downloaded or parsed
     */
    private static ZhiHuData analysisZhihuAnswer(String url, Proxy proxy, ZhiHuData zhihu) {
        try {
            String htmlBody = downloadHtml(url, proxy, "问答");
            if (htmlBody == null) {
                return zhihu;
            }
            Document document = Jsoup.parse(htmlBody);
            String content = document.select("div.QuestionHeader-main")
                    .select("div.QuestionHeader-detail").text();
            zhihu.setContent(content);
            // Keep only the digits of the comment label.
            String digits = NON_DIGIT.matcher(document.select("div.QuestionHeader-Comment").text())
                    .replaceAll("").trim();
            int comment_count = 0;
            if (!digits.isEmpty()) {
                comment_count = Integer.parseInt(digits);
            }
            zhihu.setComment_count(comment_count);
        } catch (Exception e) {
            logger.error("解析知乎问答页面时出现问题,链接为:" + url, e);
        }
        return zhihu;
    }

    /**
     * Fills content and publish time of an article entry from its Zhihu page.
     *
     * <p>The content is stored before the time is parsed, so an unparseable
     * timestamp no longer discards the content.
     *
     * @param url   address of the article page
     * @param proxy proxy handed to the HTTP helper
     * @param zhihu record to enrich
     * @return the same record; left as far as it was filled when the page could
     *         not be downloaded or parsed
     */
    @SuppressWarnings("deprecation")
    private static ZhiHuData analysisZhihuArticle(String url, Proxy proxy, ZhiHuData zhihu) {
        try {
            String htmlBody = downloadHtml(url, proxy, "文章");
            if (htmlBody == null) {
                return zhihu;
            }
            Document document = Jsoup.parse(htmlBody);
            String content = document
                    .select("[class=\"RichText PostIndex-content av-paddingSide av-card\"]").text();
            zhihu.setContent(content);
            Element hoverTitle = document.select("div.HoverTitle").first();
            if (hoverTitle != null) {
                String time = hoverTitle.select("time").attr("datetime");
                if (!time.isEmpty()) {
                    // NOTE(review): relies on the deprecated Date(String) parser, exactly as
                    // before; confirm it accepts the datetime format the page uses.
                    Date date = new Date(time);
                    zhihu.setTime(TimeParse.dateFormartString(date, "yyyy-MM-dd HH:mm:ss"));
                }
            }
        } catch (Exception e) {
            logger.warn("解析知乎文章页面时出现问题,链接为:" + url, e);
        }
        return zhihu;
    }

    /**
     * Builds the Sogou Zhihu search address for a keyword and page.
     *
     * @param word search keyword; encoded with {@code URLCodeUtil.getURLEncode(word, "utf-8")}
     * @param page 1-based result page number
     * @return the search address, or {@code null} when {@code word} is {@code null}
     */
    private static String getUrl(String word, int page) {
        String url = null;
        if (word != null) {
            url = "http://zhihu.sogou.com/zhihu?query=" + URLCodeUtil.getURLEncode(word, "utf-8") + "&page=" + page;
        }
        return url;
    }
}
...@@ -6,7 +6,9 @@ import java.util.List; ...@@ -6,7 +6,9 @@ import java.util.List;
import com.zhiwei.media_data_crawler.crawler.BaiduNewsCrawlerParse; import com.zhiwei.media_data_crawler.crawler.BaiduNewsCrawlerParse;
import com.zhiwei.media_data_crawler.crawler.SoNewsCrawlerParse; import com.zhiwei.media_data_crawler.crawler.SoNewsCrawlerParse;
import com.zhiwei.media_data_crawler.crawler.SougouNewsCrawlerParse; import com.zhiwei.media_data_crawler.crawler.SougouNewsCrawlerParse;
import com.zhiwei.media_data_crawler.crawler.SougouZhihuCrawlerParse;
import com.zhiwei.media_data_crawler.entity.NewsData; import com.zhiwei.media_data_crawler.entity.NewsData;
import com.zhiwei.media_data_crawler.entity.ZhiHuData;
public class DataCrawler { public class DataCrawler {
...@@ -132,6 +134,22 @@ public class DataCrawler { ...@@ -132,6 +134,22 @@ public class DataCrawler {
} }
} }
/**
 * Collects Zhihu data for a keyword through Sogou's Zhihu search by delegating
 * to {@code SougouZhihuCrawlerParse.getSougouZhihuData}.
 *
 * @param word  search keyword
 * @param proxy proxy forwarded unchanged to the crawler
 * @return the list produced by the crawler, or {@code null} when the crawler
 *         threw an exception (its stack trace is printed)
 */
public static List<ZhiHuData> getSougouZhihuData(String word, Proxy proxy){
    List<ZhiHuData> collected = null;
    try {
        collected = SougouZhihuCrawlerParse.getSougouZhihuData(word, proxy);
    } catch (Exception e) {
        // Failure is reported on stderr; the null result signals it to the caller.
        e.printStackTrace();
    }
    return collected;
}
} }
package com.zhiwei.media_data_crawler.entity;
import java.io.Serializable;
/**
 * Plain serializable value holder for one Zhihu entry (question or article)
 * found for a crawl keyword.
 */
public class ZhiHuData implements Serializable {

    private static final long serialVersionUID = 1L;

    /** Address of the entry. */
    private String url;
    /** Title of the entry. */
    private String title;
    /** Platform label. */
    private String pt;
    /** Entry type. */
    private String type;
    /** Publish time. */
    private String time;
    /** Publisher. */
    private String source;
    /** Body text. */
    private String content;
    /** Number of likes. */
    private Integer attitudes_count;
    /** Number of answers. */
    private Integer answer_count;
    /** Number of comments. */
    private Integer comment_count;
    /** Keyword the entry was collected for. */
    private String word;

    /** Creates an instance with every field left {@code null}. */
    public ZhiHuData() {
    }

    /**
     * Creates an instance with every field supplied.
     *
     * @param url             address of the entry
     * @param title           title of the entry
     * @param pt              platform label
     * @param type            entry type
     * @param time            publish time
     * @param source          publisher
     * @param content         body text
     * @param attitudes_count number of likes
     * @param answer_count    number of answers
     * @param comment_count   number of comments
     * @param word            keyword the entry was collected for
     */
    public ZhiHuData(String url, String title, String pt, String type, String time, String source,
            String content, Integer attitudes_count, Integer answer_count, Integer comment_count,
            String word) {
        this.word = word;
        this.comment_count = comment_count;
        this.answer_count = answer_count;
        this.attitudes_count = attitudes_count;
        this.content = content;
        this.source = source;
        this.time = time;
        this.type = type;
        this.pt = pt;
        this.title = title;
        this.url = url;
    }

    /** Renders every field as {@code name = value}, comma separated, inside brackets. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("new ZhiHuData[");
        text.append("url = ").append(url);
        text.append(", title = ").append(title);
        text.append(", pt = ").append(pt);
        text.append(", type = ").append(type);
        text.append(", time = ").append(time);
        text.append(", source = ").append(source);
        text.append(", content = ").append(content);
        text.append(", attitudes_count = ").append(attitudes_count);
        text.append(", answer_count = ").append(answer_count);
        text.append(", comment_count = ").append(comment_count);
        text.append(", word = ").append(word);
        text.append("]");
        return text.toString();
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getPt() {
        return pt;
    }

    public void setPt(String pt) {
        this.pt = pt;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getTime() {
        return time;
    }

    public void setTime(String time) {
        this.time = time;
    }

    public String getSource() {
        return source;
    }

    public void setSource(String source) {
        this.source = source;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public Integer getAttitudes_count() {
        return attitudes_count;
    }

    public void setAttitudes_count(Integer attitudes_count) {
        this.attitudes_count = attitudes_count;
    }

    public Integer getAnswer_count() {
        return answer_count;
    }

    public void setAnswer_count(Integer answer_count) {
        this.answer_count = answer_count;
    }

    public Integer getComment_count() {
        return comment_count;
    }

    public void setComment_count(Integer comment_count) {
        this.comment_count = comment_count;
    }

    public String getWord() {
        return word;
    }

    public void setWord(String word) {
        this.word = word;
    }
}
package com.zhiwei.media_data_crawler.test; package com.zhiwei.media_data_crawler.test;
import java.net.Proxy;
import java.util.List;
import org.junit.Test;
import com.zhiwei.media_data_crawler.crawler.SougouZhihuCrawlerParse;
import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.NewsData;
import com.zhiwei.media_data_crawler.entity.ZhiHuData;
public class DataCrawlerTest { public class DataCrawlerTest {
// @Test @Test
// public void getSoNewsTest(){ public void getSoNewsTest(){
// String word = "马云"; String word = "马云"; //关键词
// List<NewsData> list = DataCrawler.getSoNewsData(word, null); String startTime = "2017-03-01 00:00:00"; //开始时间
// for(NewsData newsData : list){ String endTime = "2017-03-01 23:59:59"; //结束时间
// System.out.println(newsData); Proxy proxy = null; //代理IP,不用可不填写
// } //百度新闻采集demo
// } // List<NewsData> baiduNewsList = DataCrawler.getBaiduNewsData(word, startTime, endTime, proxy);
// //搜狗新闻关键词采集demo
// List<NewsData> sogouNewsList = DataCrawler.getSougouNewsData(word, proxy);
// //360新闻采集demo
// List<NewsData> soNewsList = DataCrawler.getSoNewsData(word, proxy);
//搜狗知乎采集
List<ZhiHuData> zhihuList = DataCrawler.getSougouZhihuData(word, proxy);
System.out.println(zhihuList.size());
}
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment