Commit 41547bad by zhiwei

搜索引擎采集项目初次提交,项目中主要包含

1.百度新闻采集
2.360新闻采集
3.搜狗新闻采集
parents
<!-- Maven build descriptor for the media_data_crawler module. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>media_data_crawler</artifactId>
<version>0.0.1-SNAPSHOT</version>
<name>media_data_crawler</name>
<description>网媒数据抓取,包含百度新闻、搜狗新闻、360新闻等</description>
<!-- The only declared dependency is the in-house zhiweiTools artifact. -->
<dependencies>
<dependency>
<groupId>com.zhiwei</groupId>
<artifactId>zhiweiTools</artifactId>
<version>0.0.6-SNAPSHOT</version>
</dependency>
</dependencies>
<!-- Build and packaging configuration -->
<build>
<plugins>
<!-- Publish the sources: build and attach a sources jar (bound to the compile phase) -->
<plugin>
<artifactId>maven-source-plugin</artifactId>
<version>2.4</version>
<configuration>
<attach>true</attach>
</configuration>
<executions>
<execution>
<phase>compile</phase>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
<!-- Pins the Javadoc plugin version; no extra configuration is supplied -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>2.10.4</version>
</plugin>
<!-- Fixes garbled Chinese console output when running "mvn test": the forked test JVM is started with UTF-8 as file.encoding -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.19.1</version>
<configuration>
<forkMode>once</forkMode>
<argLine>-Dfile.encoding=UTF-8</argLine>
</configuration>
</plugin>
</plugins>
</build>
<!-- Distribution management: the repositories that snapshot and release artifacts are deployed to -->
<distributionManagement>
<!-- NOTE(review): this snapshot repository uses the same id as the release repository below; confirm that it matches the intended server entry in settings.xml -->
<snapshotRepository>
<id>nexus-releases</id>
<name>User Porject Snapshot</name>
<url>http://192.168.0.30:8081/nexus/content/repositories/snapshots/</url>
<uniqueVersion>true</uniqueVersion>
</snapshotRepository>
<repository>
<id>nexus-releases</id>
<name>User Porject Release</name>
<url>http://192.168.0.30:8081/nexus/content/repositories/releases/</url>
</repository>
</distributionManagement>
</project>
\ No newline at end of file
package com.zhiwei.media_data_crawler.crawler;
import java.io.IOException;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.media_data_crawler.entity.NewsData;
import com.zhiwei.zhiweiTools.httpClient.HeaderTool;
import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
import com.zhiwei.zhiweiTools.tools.URLCodeUtil;
/**
 * Downloads Baidu News search result pages and converts every result entry into a
 * {@link NewsData} record.
 *
 * <p>All entry points are static. A page that cannot be downloaded ends the crawl and
 * whatever has been collected so far is returned; a single result that cannot be parsed
 * is logged and skipped.
 */
public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {

    private static final Logger logger = LoggerFactory.getLogger(BaiduNewsCrawlerParse.class);

    /** Platform label stored on every record produced by this crawler. */
    private static final String pt = "百度新闻";

    /** Highest page index that is requested; the crawl stops once the index exceeds it. */
    private static final int MAX_PAGE = 20;

    /** Number of download attempts per page before giving up. */
    private static final int MAX_ATTEMPTS = 3;

    /** Output format of {@link NewsData#getTime()}. */
    private static final String TIME_FORMAT = "yyyy-MM-dd HH:mm:ss";

    /** Matches the "N identical news items" marker that is stripped from the summary. */
    private static final Pattern SAME_NEWS_PATTERN = Pattern.compile("\\d*条相同新闻");

    /** Result of parsing one page: the records found and whether a next page is offered. */
    private static final class ParsedPage {
        private final List<NewsData> data;
        private final boolean more;

        ParsedPage(List<NewsData> data, boolean more) {
            this.data = data;
            this.more = more;
        }
    }

    /**
     * Collects Baidu News results whose full text matches the keyword.
     *
     * @param word      search keyword; when {@code null} an empty list is returned
     * @param startTime lower bound of the publish time, or {@code null} for none
     * @param endTime   upper bound of the publish time, or {@code null} for none
     * @param proxy     proxy handed to the HTTP client
     * @return the collected records, never {@code null}
     */
    public static List<NewsData> getBaiduNewsData(String word, String startTime, String endTime, Proxy proxy) {
        return crawlSearch(word, startTime, endTime, proxy, "newsdy");
    }

    /**
     * Collects Baidu News results whose title matches the keyword.
     *
     * @param word      search keyword; when {@code null} an empty list is returned
     * @param startTime lower bound of the publish time, or {@code null} for none
     * @param endTime   upper bound of the publish time, or {@code null} for none
     * @param proxy     proxy handed to the HTTP client
     * @return the collected records, never {@code null}
     */
    public static List<NewsData> getBaiduNewsDataByTitle(String word, String startTime, String endTime, Proxy proxy) {
        return crawlSearch(word, startTime, endTime, proxy, "newstitle");
    }

    /**
     * Collects the "similar news" listing that a search result links to.
     *
     * @param url   absolute URL of the listing, without the paging parameter
     * @param word  keyword stored on the produced records
     * @param proxy proxy handed to the HTTP client for the listing pages
     * @return the collected records, never {@code null}
     */
    public static List<NewsData> getOherBaiduNewsData(String url, String word, Proxy proxy) {
        List<NewsData> list = new ArrayList<NewsData>();
        if (url == null) {
            return list;
        }
        boolean more = true;
        // The cap is part of the loop condition so that the parser's "more" flag cannot override it.
        for (int page = 0; more && page <= MAX_PAGE; page++) {
            String htmlBody = downloadHtml(url, proxy, page);
            if (htmlBody == null) {
                break;
            }
            // NOTE(review): the original code passes null instead of the proxy here, so similar-news
            // links found inside a similar-news listing are fetched without a proxy; kept as is, confirm.
            ParsedPage parsed = analysisData(htmlBody, null, word);
            list.addAll(parsed.data);
            more = parsed.more;
        }
        return list;
    }

    /** Pages through the search results for the given match mode ({@code tn}). */
    private static List<NewsData> crawlSearch(String word, String startTime, String endTime, Proxy proxy, String tn) {
        List<NewsData> list = new ArrayList<NewsData>();
        boolean more = true;
        for (int page = 0; more && page <= MAX_PAGE; page++) {
            String htmlBody = downloadHtml(word, startTime, endTime, proxy, tn, page);
            if (htmlBody == null) {
                break;
            }
            ParsedPage parsed = analysisData(htmlBody, proxy, word);
            list.addAll(parsed.data);
            more = parsed.more;
        }
        return list;
    }

    /**
     * Downloads one search result page.
     *
     * @param tn   match mode: {@code newsdy} for full text, {@code newstitle} for title
     * @param page zero-based page index
     * @return the page body, or {@code null} when no URL could be built or every attempt failed
     */
    private static String downloadHtml(String word, String startTime, String endTime, Proxy proxy, String tn, int page) {
        return fetch(getUrl(word, startTime, endTime, tn, page), proxy);
    }

    /**
     * Downloads one page of a "similar news" listing.
     *
     * @param page zero-based page index; converted to the {@code pn} offset in steps of 30
     * @return the page body, or {@code null} when every attempt failed
     */
    private static String downloadHtml(String url, Proxy proxy, int page) {
        return fetch(url + "&pn=" + page * 30, proxy);
    }

    /** Requests the URL up to {@link #MAX_ATTEMPTS} times; returns {@code null} on failure. */
    private static String fetch(String url, Proxy proxy) {
        if (url == null) {
            return null;
        }
        Map<String, String> headerMap = HeaderTool.getCommonHead();
        headerMap.put("Host", "news.baidu.com");
        headerMap.put("Referer", url);
        for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
            try {
                return get(url, proxy, headerMap);
            } catch (IOException e) {
                // Pass the exception itself as the last argument so the original stack trace is logged.
                logger.error("获取数据时出现问题,问题为:{}", e.getMessage(), e);
            }
        }
        return null;
    }

    /**
     * Parses one Baidu News result page.
     *
     * @param htmlBody page markup
     * @param proxy    proxy used when following "similar news" links
     * @param word     keyword stored on the produced records
     * @return the parsed records plus the next-page flag
     */
    private static ParsedPage analysisData(String htmlBody, Proxy proxy, String word) {
        Document document = Jsoup.parse(htmlBody);
        // jsoup never returns null from select(); a missing pager simply yields empty text.
        boolean more = document.select("p#page").text().contains("下一页");
        List<NewsData> list = new ArrayList<NewsData>();
        for (Element element : document.select("div.result")) {
            try {
                Elements titleAnchor = element.select("h3.c-title").select("a");
                String link = titleAnchor.attr("href");
                String title = titleAnchor.text();
                Elements row = element.select("div.c-row");
                Elements author = row.select("p.c-author");

                // Source and time are local to each result so that a value from the
                // previous result can never leak into this one.
                String source = null;
                String time;
                String authorHtml = author.html();
                if (authorHtml.contains("&nbsp;&nbsp;")) {
                    String[] parts = authorHtml.split("&nbsp;&nbsp;");
                    time = parts[1];
                    source = parts[0];
                } else {
                    time = author.text();
                }
                time = TimeParse.dateFormartString(TimeParse.stringFormartDate(time), TIME_FORMAT);

                // The summary is the row text minus the leading author line and the Baidu decorations.
                String descript = row.text();
                String content = descript.substring(author.text().length(), descript.length());
                content = SAME_NEWS_PATTERN.matcher(content).replaceAll("").replace("-", "").replace("百度快照", "");

                list.add(new NewsData(link, title, source, time, content, pt, word));

                // Follow the "similar news" link only when the result actually has one.
                String moreHref = row.select("a.c-more_link").attr("href");
                if (!moreHref.isEmpty()) {
                    list.addAll(getOherBaiduNewsData("http://news.baidu.com" + moreHref, word, proxy));
                }
            } catch (Exception e) {
                logger.error("百度新闻数据解析时出现问题,问题为:{}", e.getMessage(), e);
            }
        }
        return new ParsedPage(list, more);
    }

    /**
     * Builds the search URL.
     *
     * @param word      search keyword
     * @param startTime lower time bound, sent as epoch seconds; 0 when {@code null}
     * @param endTime   upper time bound, sent as epoch seconds; 0 when {@code null}
     * @param tn        match mode: {@code newsdy} for full text, {@code newstitle} for title
     * @param page      zero-based page index; converted to the {@code pn} offset in steps of 50
     * @return the URL, or {@code null} when {@code word} is {@code null}
     */
    private static String getUrl(String word, String startTime, String endTime, String tn, int page) {
        if (word == null) {
            return null;
        }
        long bt = 0;
        long et = 0;
        if (startTime != null) {
            bt = TimeParse.stringFormartDate(startTime).getTime() / 1000;
        }
        if (endTime != null) {
            et = TimeParse.stringFormartDate(endTime).getTime() / 1000;
        }
        return "http://news.baidu.com/ns?from=news&cl=2&bt=" + bt
                + "&et=" + et + "&q1=" + URLCodeUtil.getURLEncode(word, "utf-8") + "&q3=&q4=&tn=" + tn
                + "&ct=0&rn=50&clk=sortbytime&q6=&pn=" + page * 50;
    }
}
package com.zhiwei.media_data_crawler.crawler;
import java.io.IOException;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.media_data_crawler.entity.NewsData;
import com.zhiwei.zhiweiTools.httpClient.HeaderTool;
import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
import com.zhiwei.zhiweiTools.tools.URLCodeUtil;
/**
 * Downloads 360 News (news.so.com) search result pages and converts every result entry
 * into a {@link NewsData} record.
 *
 * <p>All entry points are static. A page that cannot be downloaded ends the crawl and
 * whatever has been collected so far is returned; a single result that cannot be parsed
 * is logged and skipped.
 */
public class SoNewsCrawlerParse extends HttpClientTemplateOK {

    private static final Logger logger = LoggerFactory.getLogger(SoNewsCrawlerParse.class);

    /** Platform label stored on every record produced by this crawler. */
    private static final String pt = "360新闻";

    /** Highest page number that is requested; the crawl stops once the number exceeds it. */
    private static final int MAX_PAGE = 50;

    /** Number of download attempts per page before giving up. */
    private static final int MAX_ATTEMPTS = 3;

    /** Output format of {@link NewsData#getTime()}. */
    private static final String TIME_FORMAT = "yyyy-MM-dd HH:mm:ss";

    /** {@code tn} value for full-text matching. */
    private static final String TN_FULL_TEXT = "news";

    /** {@code tn} value for title matching. */
    private static final String TN_TITLE = "newstitle";

    /**
     * List items carrying exactly this class attribute are skipped by the full-text parser.
     * NOTE(review): presumably sponsored entries; confirm against the live markup.
     */
    private static final String SKIPPED_ITEM_CLASS = "res-list hasimg hasmediav";

    /** Result of parsing one page: the records found and whether a next page is offered. */
    private static final class ParsedPage {
        private final List<NewsData> data;
        private final boolean more;

        ParsedPage(List<NewsData> data, boolean more) {
            this.data = data;
            this.more = more;
        }
    }

    /**
     * Collects 360 News results whose full text matches the keyword.
     *
     * @param word  search keyword; when {@code null} an empty list is returned
     * @param proxy proxy handed to the HTTP client
     * @return the collected records, never {@code null}
     */
    public static List<NewsData> getSoNewsData(String word, Proxy proxy) {
        return crawl(word, TN_FULL_TEXT, proxy);
    }

    /**
     * Collects 360 News results whose title matches the keyword.
     *
     * @param word  search keyword; when {@code null} an empty list is returned
     * @param proxy proxy handed to the HTTP client
     * @return the collected records, never {@code null}
     */
    public static List<NewsData> getSoNewsDataByTitle(String word, Proxy proxy) {
        return crawl(word, TN_TITLE, proxy);
    }

    /** Pages through the search results, choosing the parser that fits the match mode. */
    private static List<NewsData> crawl(String word, String tn, Proxy proxy) {
        List<NewsData> list = new ArrayList<NewsData>();
        boolean more = true;
        // The cap is part of the loop condition so that the parser's "more" flag cannot override it.
        for (int page = 1; more && page <= MAX_PAGE; page++) {
            String htmlBody = downloadHtml(word, tn, proxy, page);
            if (htmlBody == null) {
                break;
            }
            ParsedPage parsed = TN_TITLE.equals(tn)
                    ? analysisDataByTitle(htmlBody, word)
                    : analysisData(htmlBody, word);
            list.addAll(parsed.data);
            more = parsed.more;
        }
        return list;
    }

    /**
     * Downloads one search result page, retrying on I/O failure.
     *
     * @param word  search keyword
     * @param tn    match mode: {@code news} for full text, {@code newstitle} for title
     * @param proxy proxy handed to the HTTP client
     * @param page  one-based page number
     * @return the page body, or {@code null} when no URL could be built or every attempt failed
     */
    private static String downloadHtml(String word, String tn, Proxy proxy, int page) {
        String url = getUrl(word, tn, page);
        if (url == null) {
            return null;
        }
        Map<String, String> headerMap = HeaderTool.getCommonHead();
        // The Host header must name the server that is actually requested (news.so.com).
        headerMap.put("Host", "news.so.com");
        headerMap.put("Referer", url);
        for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
            try {
                return get(url, proxy, headerMap);
            } catch (IOException e) {
                // Pass the exception itself as the last argument so the original stack trace is logged.
                logger.error("获取360新闻数据时出现问题,问题为:{}", e.getMessage(), e);
            }
        }
        return null;
    }

    /** Returns whether the pager of the page offers a "next page" link. */
    private static boolean hasNextPage(Document document) {
        // jsoup never returns null from select(); a missing pager simply yields empty text.
        return document.select("div#page").text().contains("下一页");
    }

    /**
     * Parses a full-text search result page.
     *
     * @param htmlBody page markup
     * @param word     keyword stored on the produced records
     * @return the parsed records plus the next-page flag
     */
    private static ParsedPage analysisData(String htmlBody, String word) {
        Document document = Jsoup.parse(htmlBody);
        List<NewsData> list = new ArrayList<NewsData>();
        for (Element element : document.select("ul#news").select("li")) {
            try {
                if (SKIPPED_ITEM_CLASS.equals(element.attr("class"))) {
                    continue;
                }
                Elements titleAnchor = element.select("h3").select("a");
                Elements info = element.select("p.newsinfo");
                String link = titleAnchor.attr("href");
                String title = titleAnchor.text();
                String source = info.select("span.sitename").text();
                // The publish time is read from the title attribute of the time label.
                String time = TimeParse.dateFormartString(
                        TimeParse.stringFormartDate(info.select("span.posttime").attr("title")), TIME_FORMAT);
                String content = element.select("p.content").text();
                list.add(new NewsData(link, title, source, time, content, pt, word));
            } catch (Exception e) {
                logger.error("360新闻数据解析时出现问题,问题为:{}", e.getMessage(), e);
            }
        }
        return new ParsedPage(list, hasNextPage(document));
    }

    /**
     * Parses a title search result page. This layout carries no summary, so the content
     * of the produced records is {@code null}.
     *
     * @param htmlBody page markup
     * @param word     keyword stored on the produced records
     * @return the parsed records plus the next-page flag
     */
    private static ParsedPage analysisDataByTitle(String htmlBody, String word) {
        Document document = Jsoup.parse(htmlBody);
        List<NewsData> list = new ArrayList<NewsData>();
        for (Element element : document.select("ul#news").select("li")) {
            try {
                Elements titleAnchor = element.select("a.news_title");
                Elements info = element.select("div.ntinfo");
                String link = titleAnchor.attr("href");
                String title = titleAnchor.text();
                String source = info.select("span.stname").text();
                String time = TimeParse.dateFormartString(
                        TimeParse.stringFormartDate(info.select("span.pdate").text()), TIME_FORMAT);
                list.add(new NewsData(link, title, source, time, null, pt, word));
            } catch (Exception e) {
                logger.error("360新闻数据解析时出现问题,问题为:{}", e.getMessage(), e);
            }
        }
        return new ParsedPage(list, hasNextPage(document));
    }

    /**
     * Builds the search URL.
     *
     * @param word search keyword
     * @param tn   match mode: {@code news} for full text, {@code newstitle} for title
     * @param page one-based page number
     * @return the URL, or {@code null} when {@code word} is {@code null}
     */
    private static String getUrl(String word, String tn, int page) {
        if (word == null) {
            return null;
        }
        return "https://news.so.com/ns?q=" + URLCodeUtil.getURLEncode(word, "utf-8") + "&tn=" + tn
                + "&rank=rank&j=0&nso=8&tp=10&nc=0&src=page&pn=" + page;
    }
}
package com.zhiwei.media_data_crawler.crawler;
import java.io.IOException;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.media_data_crawler.entity.NewsData;
import com.zhiwei.zhiweiTools.httpClient.HeaderTool;
import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
import com.zhiwei.zhiweiTools.tools.URLCodeUtil;
/**
 * Downloads Sogou News search result pages and converts every result entry into a
 * {@link NewsData} record.
 *
 * <p>All entry points are static. A page that cannot be downloaded ends the crawl and
 * whatever has been collected so far is returned; a single result that cannot be parsed
 * is logged and skipped.
 */
public class SougouNewsCrawlerParse extends HttpClientTemplateOK {

    private static final Logger logger = LoggerFactory.getLogger(SougouNewsCrawlerParse.class);

    /** Platform label stored on every record produced by this crawler. */
    private static final String pt = "搜狗新闻";

    /** Highest page number requested by the full-text search. */
    private static final int MAX_PAGE_FULL_TEXT = 100;

    /** Highest page number requested by the title search. */
    private static final int MAX_PAGE_TITLE = 20;

    /** Highest page number requested for a "similar news" listing. */
    private static final int MAX_PAGE_SIMILAR = 10;

    /** Number of download attempts per page before giving up. */
    private static final int MAX_ATTEMPTS = 3;

    /** Output format of {@link NewsData#getTime()}. */
    private static final String TIME_FORMAT = "yyyy-MM-dd HH:mm:ss";

    /** Result of parsing one page: the records found and whether a next page is offered. */
    private static final class ParsedPage {
        private final List<NewsData> data;
        private final boolean more;

        ParsedPage(List<NewsData> data, boolean more) {
            this.data = data;
            this.more = more;
        }
    }

    /**
     * Collects Sogou News results whose full text matches the keyword.
     *
     * @param word  search keyword; when {@code null} an empty list is returned
     * @param proxy proxy handed to the HTTP client
     * @return the collected records, never {@code null}
     */
    public static List<NewsData> getSougouNewsData(String word, Proxy proxy) {
        return crawlSearch(word, 1, proxy, MAX_PAGE_FULL_TEXT);
    }

    /**
     * Collects Sogou News results whose title matches the keyword.
     *
     * @param word  search keyword; when {@code null} an empty list is returned
     * @param proxy proxy handed to the HTTP client
     * @return the collected records, never {@code null}
     */
    public static List<NewsData> getSougouNewsDataByTitle(String word, Proxy proxy) {
        return crawlSearch(word, 2, proxy, MAX_PAGE_TITLE);
    }

    /**
     * Collects the "similar news" listing that a search result links to.
     *
     * @param url   absolute URL of the listing, without the paging parameter
     * @param word  keyword stored on the produced records
     * @param proxy proxy handed to the HTTP client for the listing pages
     * @return the collected records, never {@code null}
     */
    public static List<NewsData> getOherSougouNewsData(String url, String word, Proxy proxy) {
        List<NewsData> list = new ArrayList<NewsData>();
        if (url == null) {
            return list;
        }
        boolean more = true;
        // The cap is part of the loop condition so that the parser's "more" flag cannot override it.
        for (int page = 1; more && page <= MAX_PAGE_SIMILAR; page++) {
            String htmlBody = downloadHtml(url, proxy, page);
            if (htmlBody == null) {
                break;
            }
            // NOTE(review): the original code passes null instead of the proxy here, so similar-news
            // links found inside a similar-news listing are fetched without a proxy; kept as is, confirm.
            ParsedPage parsed = analysisData(htmlBody, null, word);
            list.addAll(parsed.data);
            more = parsed.more;
        }
        return list;
    }

    /**
     * Pages through the search results. Paging starts at 1 for both match modes, in line
     * with the other page loops of this class.
     */
    private static List<NewsData> crawlSearch(String word, int mode, Proxy proxy, int maxPage) {
        List<NewsData> list = new ArrayList<NewsData>();
        boolean more = true;
        for (int page = 1; more && page <= maxPage; page++) {
            String htmlBody = downloadHtml(word, mode, proxy, page);
            if (htmlBody == null) {
                break;
            }
            ParsedPage parsed = analysisData(htmlBody, proxy, word);
            list.addAll(parsed.data);
            more = parsed.more;
        }
        return list;
    }

    /**
     * Downloads one search result page. The Referer is the URL of the preceding page.
     *
     * @param word  search keyword
     * @param mode  match rule: 1 for full text, 2 for title
     * @param proxy proxy handed to the HTTP client
     * @param page  one-based page number
     * @return the page body, or {@code null} when no URL could be built or every attempt failed
     */
    private static String downloadHtml(String word, int mode, Proxy proxy, int page) {
        String url = getUrl(word, mode, page);
        if (url == null) {
            return null;
        }
        return fetch(url, getUrl(word, mode, page - 1), proxy);
    }

    /**
     * Downloads one page of a "similar news" listing.
     *
     * @param url   listing URL without the paging parameter
     * @param proxy proxy handed to the HTTP client
     * @param page  one-based page number
     * @return the page body, or {@code null} when every attempt failed
     */
    private static String downloadHtml(String url, Proxy proxy, int page) {
        // The parameter needs its "=" sign; "&page3" would not be read as a page number.
        String pageUrl = url + "&page=" + page;
        return fetch(pageUrl, pageUrl, proxy);
    }

    /** Requests the URL up to {@link #MAX_ATTEMPTS} times; returns {@code null} on failure. */
    private static String fetch(String url, String referer, Proxy proxy) {
        Map<String, String> headerMap = HeaderTool.getCommonHead();
        headerMap.put("Host", "news.sogou.com");
        headerMap.put("Referer", referer);
        for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
            try {
                return get(url, proxy, headerMap);
            } catch (IOException e) {
                // Pass the exception itself as the last argument so the original stack trace is logged.
                logger.error("获取搜狗新闻数据时出现问题,问题为:{}", e.getMessage(), e);
            }
        }
        return null;
    }

    /**
     * Parses one Sogou News result page.
     *
     * @param htmlBody page markup
     * @param proxy    proxy used when following "similar news" links
     * @param word     keyword stored on the produced records
     * @return the parsed records plus the next-page flag
     */
    private static ParsedPage analysisData(String htmlBody, Proxy proxy, String word) {
        Document document = Jsoup.parse(htmlBody);
        // jsoup never returns null from select(); a missing pager simply yields empty text.
        boolean more = document.select("div#pagebar_container").text().contains("下一页");
        List<NewsData> list = new ArrayList<NewsData>();
        for (Element element : document.select("div.results").select("div.vrwrap")) {
            try {
                Elements titleAnchor = element.select("h3.vrTitle").select("a");
                String link = titleAnchor.attr("href");
                String title = titleAnchor.text();
                Elements info = element.select("div.news-detail").select("div.news-info");
                Elements from = info.select("p.news-from");
                Elements text = info.select("p.news-txt");

                // Source and time are local to each result so that a value from the
                // previous result can never leak into this one.
                String source = null;
                String time;
                String fromHtml = from.html();
                if (fromHtml.contains("&nbsp;")) {
                    String[] parts = fromHtml.split("&nbsp;");
                    time = parts[1];
                    source = parts[0];
                } else {
                    time = from.text();
                }
                time = TimeParse.dateFormartString(TimeParse.stringFormartDate(time), TIME_FORMAT);

                String content = text.select("span#summary_1").text();

                // text() never returns null, so an absent title shows up as an empty string.
                if (!title.isEmpty()) {
                    list.add(new NewsData(link, title, source, time, content, pt, word));
                }

                // Follow the "similar news" link only when the result actually has one.
                String similarHref = text.select("a#news_similar").attr("href");
                if (!similarHref.isEmpty()) {
                    list.addAll(getOherSougouNewsData("http://news.sogou.com/news" + similarHref, word, proxy));
                }
            } catch (Exception e) {
                logger.error("搜狗新闻数据解析时出现问题,问题为:{}", e.getMessage(), e);
            }
        }
        return new ParsedPage(list, more);
    }

    /**
     * Builds the search URL. The page number is sent exactly once.
     *
     * @param word search keyword
     * @param mode match rule: 1 for full text, 2 for title
     * @param page one-based page number
     * @return the URL, or {@code null} when {@code word} is {@code null}
     */
    private static String getUrl(String word, int mode, int page) {
        if (word == null) {
            return null;
        }
        return "http://news.sogou.com/news?mode=" + mode + "&media=&query="
                + URLCodeUtil.getURLEncode(word, "utf-8") + "&time=0&clusterId=&sort=1&dp=1&page=" + page;
    }
}
package com.zhiwei.media_data_crawler.data;
import java.net.Proxy;
import java.util.List;
import com.zhiwei.media_data_crawler.crawler.BaiduNewsCrawlerParse;
import com.zhiwei.media_data_crawler.crawler.SoNewsCrawlerParse;
import com.zhiwei.media_data_crawler.crawler.SougouNewsCrawlerParse;
import com.zhiwei.media_data_crawler.entity.NewsData;
/**
 * Entry points for the news crawlers. Every method delegates to the matching crawler
 * class and turns any exception into a {@code null} result after printing its stack trace.
 */
public class DataCrawler {

    /**
     * Full-text search on Baidu News for a keyword within a time range.
     *
     * @param word      search keyword
     * @param startTime start of the time range
     * @param endTime   end of the time range
     * @param proxy     proxy passed on to the crawler
     * @return the crawled records, or {@code null} when the crawl threw an exception
     */
    public static List<NewsData> getBaiduNewsData(String word, String startTime, String endTime, Proxy proxy) {
        List<NewsData> result = null;
        try {
            result = BaiduNewsCrawlerParse.getBaiduNewsData(word, startTime, endTime, proxy);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return result;
    }

    /**
     * Title search on Baidu News for a keyword within a time range.
     *
     * @param word      search keyword
     * @param startTime start of the time range
     * @param endTime   end of the time range
     * @param proxy     proxy passed on to the crawler
     * @return the crawled records, or {@code null} when the crawl threw an exception
     */
    public static List<NewsData> getBaiduNewsDataByTitle(String word, String startTime, String endTime, Proxy proxy) {
        List<NewsData> result = null;
        try {
            result = BaiduNewsCrawlerParse.getBaiduNewsDataByTitle(word, startTime, endTime, proxy);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return result;
    }

    /**
     * Full-text search on 360 News.
     *
     * @param word  search keyword
     * @param proxy proxy passed on to the crawler
     * @return the crawled records, or {@code null} when the crawl threw an exception
     */
    public static List<NewsData> getSoNewsData(String word, Proxy proxy) {
        List<NewsData> result = null;
        try {
            result = SoNewsCrawlerParse.getSoNewsData(word, proxy);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return result;
    }

    /**
     * Title search on 360 News.
     *
     * @param word  search keyword
     * @param proxy proxy passed on to the crawler
     * @return the crawled records, or {@code null} when the crawl threw an exception
     */
    public static List<NewsData> getSoNewsDataByTitle(String word, Proxy proxy) {
        List<NewsData> result = null;
        try {
            result = SoNewsCrawlerParse.getSoNewsDataByTitle(word, proxy);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return result;
    }

    /**
     * Full-text search on Sogou News.
     *
     * @param word  search keyword
     * @param proxy proxy passed on to the crawler
     * @return the crawled records, or {@code null} when the crawl threw an exception
     */
    public static List<NewsData> getSougouNewsData(String word, Proxy proxy) {
        List<NewsData> result = null;
        try {
            result = SougouNewsCrawlerParse.getSougouNewsData(word, proxy);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return result;
    }

    /**
     * Title search on Sogou News.
     *
     * @param word  search keyword
     * @param proxy proxy passed on to the crawler
     * @return the crawled records, or {@code null} when the crawl threw an exception
     */
    public static List<NewsData> getSougouNewsDataByTitle(String word, Proxy proxy) {
        List<NewsData> result = null;
        try {
            result = SougouNewsCrawlerParse.getSougouNewsDataByTitle(word, proxy);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return result;
    }
}
package com.zhiwei.media_data_crawler.entity;
import java.io.Serializable;
/**
* @ClassName: NewsData
* @Description:数据实体类
* @author hero
* @date 2018年2月24日 下午5:51:31
*/
/**
 * Serializable value holder for one crawled news entry.
 */
public class NewsData implements Serializable {

    private static final long serialVersionUID = -4767006433365382515L;

    /** Address of the article. */
    private String url;
    /** Headline of the article. */
    private String title;
    /** Publisher the article is attributed to. */
    private String source;
    /** Publish time of the article. */
    private String time;
    /** Short summary of the article. */
    private String content;
    /** Platform the entry was crawled from. */
    private String pt;
    /** Keyword that was searched for. */
    private String word;

    /** Creates an entry with every field unset. */
    public NewsData() {
    }

    /**
     * Creates a fully populated entry.
     *
     * @param url     article address
     * @param title   article headline
     * @param source  article publisher
     * @param time    article publish time
     * @param content article summary
     * @param pt      platform the entry was crawled from
     * @param word    keyword that was searched for
     */
    public NewsData(String url, String title, String source, String time,
            String content, String pt, String word) {
        this.url = url;
        this.title = title;
        this.source = source;
        this.time = time;
        this.content = content;
        this.pt = pt;
        this.word = word;
    }

    /** @return the article address */
    public String getUrl() {
        return this.url;
    }

    /** @param url the article address */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the article headline */
    public String getTitle() {
        return this.title;
    }

    /** @param title the article headline */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the article publisher */
    public String getSource() {
        return this.source;
    }

    /** @param source the article publisher */
    public void setSource(String source) {
        this.source = source;
    }

    /** @return the article publish time */
    public String getTime() {
        return this.time;
    }

    /** @param time the article publish time */
    public void setTime(String time) {
        this.time = time;
    }

    /** @return the article summary */
    public String getContent() {
        return this.content;
    }

    /** @param content the article summary */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the platform the entry was crawled from */
    public String getPt() {
        return this.pt;
    }

    /** @param pt the platform the entry was crawled from */
    public void setPt(String pt) {
        this.pt = pt;
    }

    /** @return the keyword that was searched for */
    public String getWord() {
        return this.word;
    }

    /** @param word the keyword that was searched for */
    public void setWord(String word) {
        this.word = word;
    }

    /** Lists every field as {@code name = value}; unset fields are rendered as {@code null}. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("new NewsData[");
        text.append("url = ").append(this.url);
        text.append(", title = ").append(this.title);
        text.append(", source = ").append(this.source);
        text.append(", time = ").append(this.time);
        text.append(", content = ").append(this.content);
        text.append(", pt = ").append(this.pt);
        text.append(", word = ").append(this.word);
        return text.append("]").toString();
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment