Commit 3e60233c by zhiwei

添加360新闻采集今日头条

parent 0537cf5a
......@@ -66,7 +66,7 @@
<dependency>
<groupId>com.zhiwei</groupId>
<artifactId>zhiweiTools</artifactId>
<version>0.0.6-SNAPSHOT</version>
<version>0.0.7-SNAPSHOT</version>
</dependency>
</dependencies>
</project>
\ No newline at end of file
......@@ -165,7 +165,6 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
Map<String, String> headerMap = HeaderTool.getCommonHead();
// 获取链接地址
String url = getUrl(word, startTime, endTime, tn, page);
System.out.println(url);
headerMap.put("Host", "news.baidu.com");
headerMap.put("Referer", url);
// 下载数据页面
......@@ -286,6 +285,7 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
ZhiWeiTools.sleep(100);
}
} catch (Exception e) {
System.out.println("soureAndtime======"+soureAndtime);
e.printStackTrace();
logger.error("百度新闻数据解析时出现问题,问题为:{}", e.fillInStackTrace());
continue;
......
package com.zhiwei.media_data_crawler.crawler;
import java.io.IOException;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.HashMap;
......@@ -124,18 +123,18 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件
* @return String 返回类型
*/
private static String downloadHtml(String word, String tn, Proxy proxy, int page)throws IOException {
private static String downloadHtml(String word, String tn, Proxy proxy, int page)throws Exception {
// 获取通用请求头
Map<String, String> headerMap = HeaderTool.getCommonHead();
// 获取链接地址
String url = getUrl(word, tn, page);
headerMap.put("Host", "news.baidu.com");
headerMap.put("Host", "www.so.com");
headerMap.put("Referer", url);
// 下载数据页面
for (int i = 1; i <= 3; i++) {
try {
return get(url, proxy, headerMap);
} catch (IOException e) {
} catch (Exception e) {
logger.error("获取360新闻数据时出现问题,问题为:{}", e.fillInStackTrace());
if(i==3){
throw e;
......
......@@ -6,6 +6,7 @@ import java.util.List;
import com.zhiwei.media_data_crawler.crawler.BaiduNewsCrawlerParse;
import com.zhiwei.media_data_crawler.crawler.BaiduTiebaCrawlerParse;
import com.zhiwei.media_data_crawler.crawler.DoubanCrawlerParse;
import com.zhiwei.media_data_crawler.crawler.SoCrawlerParse;
import com.zhiwei.media_data_crawler.crawler.SoNewsCrawlerParse;
import com.zhiwei.media_data_crawler.crawler.SougouNewsCrawlerParse;
import com.zhiwei.media_data_crawler.crawler.SougouZhihuCrawlerParse;
......@@ -20,10 +21,6 @@ public class DataCrawler {
public static Long sleepTime;
/**
 * Stores the given value in the static {@code DataCrawler.sleepTime} field.
 * This is an instance method, but the field it writes is static, so the
 * value is shared by every user of the class.
 *
 * @param sleepTime value to assign to {@code DataCrawler.sleepTime}
 *                  (presumably a pause duration; the unit is not visible here — TODO confirm)
 */
public void setSleepTime(Long sleepTime) {
DataCrawler.sleepTime = sleepTime;
}
/**
*
* @Title: getBaiduNewsData
......@@ -278,4 +275,26 @@ public class DataCrawler {
}
}
/**
 * Fetches news data by delegating to {@code SoCrawlerParse.getSoData}
 * (original note: matches the data source by domain name).
 *
 * <p>Any exception thrown by the delegate is printed to standard error and
 * swallowed; in that case this method returns {@code null}, so callers must
 * null-check the result before iterating.
 *
 * @param word  search keyword, passed through unchanged
 * @param site  presumably the domain used to select the data source,
 *              e.g. {@code "www.toutiao.com"} — confirm against SoCrawlerParse
 * @param time  time selector, passed through unchanged (its accepted values
 *              are defined by SoCrawlerParse)
 * @param proxy proxy handed to the delegate
 * @return the list produced by the delegate, or {@code null} if it threw
 */
public static List<NewsData> getSoData(String word, String site, String time, Proxy proxy) {
    List<NewsData> collected = null;
    try {
        collected = SoCrawlerParse.getSoData(word, site, time, proxy);
    } catch (Exception failure) {
        // Best-effort wrapper: report the failure and fall through with null.
        failure.printStackTrace();
    }
    return collected;
}
}
......@@ -18,10 +18,10 @@ public class NewsData implements Serializable{
private String time; //文章时间
private String content; //文章简介
private String pt; //采集来源
private String user_id; //用户id
private String word; //采集关键词
public NewsData() {}
......@@ -36,6 +36,18 @@ public class NewsData implements Serializable{
this.word = word;
}
/**
 * Creates a news item with every field populated, including the user id.
 *
 * @param url     value for the {@code url} field
 * @param title   value for the {@code title} field
 * @param source  value for the {@code source} field
 * @param time    article time
 * @param content article summary
 * @param pt      collection source
 * @param word    keyword used for the collection
 * @param user_id user id
 */
public NewsData(
        String url,
        String title,
        String source,
        String time,
        String content,
        String pt,
        String word,
        String user_id) {
    // Identification of the article itself.
    this.title = title;
    this.url = url;
    this.source = source;
    this.user_id = user_id;

    // Article payload.
    this.content = content;
    this.time = time;

    // How the item was collected.
    this.word = word;
    this.pt = pt;
}
@Override
public String toString(){
return "new NewsData["
......@@ -46,6 +58,7 @@ public class NewsData implements Serializable{
+ ", content = " + content
+ ", pt = " + pt
+ ", word = " + word
+ ", user_id = " + user_id
+ "]";
}
......@@ -94,4 +107,12 @@ public class NewsData implements Serializable{
this.word = word;
}
/**
 * Returns the user id recorded on this news item.
 *
 * @return the current value of {@code user_id}; {@code null} if it was never set
 */
public String getUser_id() {
    return this.user_id;
}
/**
 * Sets the user id recorded on this news item.
 *
 * @param user_id the user id to store
 */
public void setUser_id(String user_id) {
this.user_id = user_id;
}
}
......@@ -46,6 +46,12 @@ public class DataCrawlerTest {
// List<DouBanData> list = DataCrawler.getDouBanData(word, type, proxy);
List<NewsData> list = DataCrawler.getSoData("京东", "www.toutiao.com", "d", proxy);
for(NewsData newsData : list) {
System.out.println(newsData);
}
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment