Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
M
media_data_crawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
media_data_crawler
Commits
3e60233c
Commit
3e60233c
authored
Aug 02, 2018
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
添加360新闻采集今日头条
parent
0537cf5a
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
57 additions
and
11 deletions
+57
-11
pom.xml
+2
-1
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduNewsCrawlerParse.java
+1
-1
src/main/java/com/zhiwei/media_data_crawler/crawler/SoNewsCrawlerParse.java
+3
-4
src/main/java/com/zhiwei/media_data_crawler/data/DataCrawler.java
+23
-4
src/main/java/com/zhiwei/media_data_crawler/entity/NewsData.java
+22
-1
src/test/java/com/zhiwei/media_data_crawler/test/DataCrawlerTest.java
+6
-0
No files found.
pom.xml
View file @
3e60233c
...
@@ -66,7 +66,7 @@
...
@@ -66,7 +66,7 @@
<dependency>
<dependency>
<groupId>
com.zhiwei
</groupId>
<groupId>
com.zhiwei
</groupId>
<artifactId>
zhiweiTools
</artifactId>
<artifactId>
zhiweiTools
</artifactId>
<version>
0.0.
6
-SNAPSHOT
</version>
<version>
0.0.
7
-SNAPSHOT
</version>
</dependency>
</dependency>
</dependencies>
</dependencies>
</project>
</project>
\ No newline at end of file
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduNewsCrawlerParse.java
View file @
3e60233c
...
@@ -165,7 +165,6 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
...
@@ -165,7 +165,6 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
// 获取链接地址
// 获取链接地址
String
url
=
getUrl
(
word
,
startTime
,
endTime
,
tn
,
page
);
String
url
=
getUrl
(
word
,
startTime
,
endTime
,
tn
,
page
);
System
.
out
.
println
(
url
);
headerMap
.
put
(
"Host"
,
"news.baidu.com"
);
headerMap
.
put
(
"Host"
,
"news.baidu.com"
);
headerMap
.
put
(
"Referer"
,
url
);
headerMap
.
put
(
"Referer"
,
url
);
// 下载数据页面
// 下载数据页面
...
@@ -286,6 +285,7 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
...
@@ -286,6 +285,7 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
ZhiWeiTools
.
sleep
(
100
);
ZhiWeiTools
.
sleep
(
100
);
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
System
.
out
.
println
(
"soureAndtime======"
+
soureAndtime
);
e
.
printStackTrace
();
e
.
printStackTrace
();
logger
.
error
(
"百度新闻数据解析时出现问题,问题为:{}"
,
e
.
fillInStackTrace
());
logger
.
error
(
"百度新闻数据解析时出现问题,问题为:{}"
,
e
.
fillInStackTrace
());
continue
;
continue
;
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/SoNewsCrawlerParse.java
View file @
3e60233c
package
com
.
zhiwei
.
media_data_crawler
.
crawler
;
package
com
.
zhiwei
.
media_data_crawler
.
crawler
;
import
java.io.IOException
;
import
java.net.Proxy
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.HashMap
;
...
@@ -124,18 +123,18 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
...
@@ -124,18 +123,18 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件
* @param @return 设定文件
* @return String 返回类型
* @return String 返回类型
*/
*/
private
static
String
downloadHtml
(
String
word
,
String
tn
,
Proxy
proxy
,
int
page
)
throws
IO
Exception
{
private
static
String
downloadHtml
(
String
word
,
String
tn
,
Proxy
proxy
,
int
page
)
throws
Exception
{
// 获取通用请求头
// 获取通用请求头
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
// 获取链接地址
// 获取链接地址
String
url
=
getUrl
(
word
,
tn
,
page
);
String
url
=
getUrl
(
word
,
tn
,
page
);
headerMap
.
put
(
"Host"
,
"
news.baidu
.com"
);
headerMap
.
put
(
"Host"
,
"
www.so
.com"
);
headerMap
.
put
(
"Referer"
,
url
);
headerMap
.
put
(
"Referer"
,
url
);
// 下载数据页面
// 下载数据页面
for
(
int
i
=
1
;
i
<=
3
;
i
++)
{
for
(
int
i
=
1
;
i
<=
3
;
i
++)
{
try
{
try
{
return
get
(
url
,
proxy
,
headerMap
);
return
get
(
url
,
proxy
,
headerMap
);
}
catch
(
IO
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取360新闻数据时出现问题,问题为:{}"
,
e
.
fillInStackTrace
());
logger
.
error
(
"获取360新闻数据时出现问题,问题为:{}"
,
e
.
fillInStackTrace
());
if
(
i
==
3
){
if
(
i
==
3
){
throw
e
;
throw
e
;
...
...
src/main/java/com/zhiwei/media_data_crawler/data/DataCrawler.java
View file @
3e60233c
...
@@ -6,6 +6,7 @@ import java.util.List;
...
@@ -6,6 +6,7 @@ import java.util.List;
import
com.zhiwei.media_data_crawler.crawler.BaiduNewsCrawlerParse
;
import
com.zhiwei.media_data_crawler.crawler.BaiduNewsCrawlerParse
;
import
com.zhiwei.media_data_crawler.crawler.BaiduTiebaCrawlerParse
;
import
com.zhiwei.media_data_crawler.crawler.BaiduTiebaCrawlerParse
;
import
com.zhiwei.media_data_crawler.crawler.DoubanCrawlerParse
;
import
com.zhiwei.media_data_crawler.crawler.DoubanCrawlerParse
;
import
com.zhiwei.media_data_crawler.crawler.SoCrawlerParse
;
import
com.zhiwei.media_data_crawler.crawler.SoNewsCrawlerParse
;
import
com.zhiwei.media_data_crawler.crawler.SoNewsCrawlerParse
;
import
com.zhiwei.media_data_crawler.crawler.SougouNewsCrawlerParse
;
import
com.zhiwei.media_data_crawler.crawler.SougouNewsCrawlerParse
;
import
com.zhiwei.media_data_crawler.crawler.SougouZhihuCrawlerParse
;
import
com.zhiwei.media_data_crawler.crawler.SougouZhihuCrawlerParse
;
...
@@ -20,10 +21,6 @@ public class DataCrawler {
...
@@ -20,10 +21,6 @@ public class DataCrawler {
public
static
Long
sleepTime
;
public
static
Long
sleepTime
;
public
void
setSleepTime
(
Long
sleepTime
)
{
DataCrawler
.
sleepTime
=
sleepTime
;
}
/**
/**
*
*
* @Title: getBaiduNewsData
* @Title: getBaiduNewsData
...
@@ -277,5 +274,27 @@ public class DataCrawler {
...
@@ -277,5 +274,27 @@ public class DataCrawler {
return
null
;
return
null
;
}
}
}
}
/**
 * Fetches news records for a keyword by delegating to
 * {@code SoCrawlerParse.getSoData} with the same arguments.
 *
 * <p>Per the original author's note, results are matched to a data source by
 * domain name.
 *
 * @author hero
 * @param word  search keyword, passed through unchanged
 * @param site  passed through unchanged; presumably the domain used to select
 *              the data source - TODO confirm against SoCrawlerParse
 * @param time  passed through unchanged; presumably a time-range selector -
 *              TODO confirm against SoCrawlerParse
 * @param proxy proxy handed to the crawler
 * @return the list produced by the crawler, or {@code null} if the crawler
 *         throws any exception
 */
public static List<NewsData> getSoData(String word, String site, String time, Proxy proxy) {
    List<NewsData> result;
    try {
        result = SoCrawlerParse.getSoData(word, site, time, proxy);
    } catch (Exception e) {
        // The failure is only printed; callers receive null and must check for it.
        e.printStackTrace();
        result = null;
    }
    return result;
}
}
}
src/main/java/com/zhiwei/media_data_crawler/entity/NewsData.java
View file @
3e60233c
...
@@ -18,10 +18,10 @@ public class NewsData implements Serializable{
...
@@ -18,10 +18,10 @@ public class NewsData implements Serializable{
private
String
time
;
//文章时间
private
String
time
;
//文章时间
private
String
content
;
//文章简介
private
String
content
;
//文章简介
private
String
pt
;
//采集来源
private
String
pt
;
//采集来源
private
String
user_id
;
//用户id
private
String
word
;
//采集关键词
private
String
word
;
//采集关键词
public
NewsData
()
{}
public
NewsData
()
{}
...
@@ -36,6 +36,18 @@ public class NewsData implements Serializable{
...
@@ -36,6 +36,18 @@ public class NewsData implements Serializable{
this
.
word
=
word
;
this
.
word
=
word
;
}
}
/**
 * Creates a record with every field supplied, including the user id.
 *
 * @param url     value stored in {@code url}
 * @param title   value stored in {@code title}
 * @param source  value stored in {@code source}
 * @param time    article time
 * @param content article summary
 * @param pt      collection source
 * @param word    keyword used for the collection
 * @param user_id user id
 */
public NewsData(String url, String title, String source, String time,
        String content, String pt, String word, String user_id) {
    // Plain field copies; the assignments are independent of one another.
    this.user_id = user_id;
    this.word = word;
    this.pt = pt;
    this.content = content;
    this.time = time;
    this.source = source;
    this.title = title;
    this.url = url;
}
@Override
@Override
public
String
toString
(){
public
String
toString
(){
return
"new NewsData["
return
"new NewsData["
...
@@ -46,6 +58,7 @@ public class NewsData implements Serializable{
...
@@ -46,6 +58,7 @@ public class NewsData implements Serializable{
+
", content = "
+
content
+
", content = "
+
content
+
", pt = "
+
pt
+
", pt = "
+
pt
+
", word = "
+
word
+
", word = "
+
word
+
", user_id = "
+
user_id
+
"]"
;
+
"]"
;
}
}
...
@@ -94,4 +107,12 @@ public class NewsData implements Serializable{
...
@@ -94,4 +107,12 @@ public class NewsData implements Serializable{
this
.
word
=
word
;
this
.
word
=
word
;
}
}
/**
 * Returns the user id held by this record.
 *
 * @return the current value of {@code user_id}
 */
public String getUser_id() {
    return this.user_id;
}
/**
 * Replaces the user id held by this record.
 *
 * @param user_id the new value for {@code user_id}; stored without validation
 */
public void setUser_id(String user_id) {
    this.user_id = user_id;
}
}
}
src/test/java/com/zhiwei/media_data_crawler/test/DataCrawlerTest.java
View file @
3e60233c
...
@@ -46,6 +46,12 @@ public class DataCrawlerTest {
...
@@ -46,6 +46,12 @@ public class DataCrawlerTest {
// List<DouBanData> list = DataCrawler.getDouBanData(word, type, proxy);
// List<DouBanData> list = DataCrawler.getDouBanData(word, type, proxy);
List
<
NewsData
>
list
=
DataCrawler
.
getSoData
(
"京东"
,
"www.toutiao.com"
,
"d"
,
proxy
);
for
(
NewsData
newsData
:
list
)
{
System
.
out
.
println
(
newsData
);
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
// TODO Auto-generated catch block
// TODO Auto-generated catch block
e
.
printStackTrace
();
e
.
printStackTrace
();
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment