Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
M
media_data_crawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
media_data_crawler
Commits
38bcf00d
Commit
38bcf00d
authored
Mar 06, 2018
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
添加自助翻页功能,如使用请添加休眠时间
parent
0930c2aa
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
87 additions
and
54 deletions
+87
-54
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduNewsCrawlerParse.java
+0
-0
src/main/java/com/zhiwei/media_data_crawler/crawler/SoNewsCrawlerParse.java
+19
-5
src/main/java/com/zhiwei/media_data_crawler/crawler/SougouNewsCrawlerParse.java
+17
-6
src/main/java/com/zhiwei/media_data_crawler/crawler/SougouZhihuCrawlerParse.java
+14
-6
src/test/java/com/zhiwei/media_data_crawler/test/DataCrawlerTest.java
+37
-37
No files found.
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduNewsCrawlerParse.java
View file @
38bcf00d
This diff is collapsed.
Click to expand it.
src/main/java/com/zhiwei/media_data_crawler/crawler/SoNewsCrawlerParse.java
View file @
38bcf00d
...
@@ -32,9 +32,10 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
...
@@ -32,9 +32,10 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
* @param @param proxy
* @param @param proxy
* @param @return 设定文件
* @param @return 设定文件
* @return List<NewsData> 返回类型
* @return List<NewsData> 返回类型
* @throws Exception
*/
*/
@SuppressWarnings
(
"unchecked"
)
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
NewsData
>
getSoNewsData
(
String
word
,
Proxy
proxy
)
{
public
static
List
<
NewsData
>
getSoNewsData
(
String
word
,
Proxy
proxy
)
throws
Exception
{
List
<
NewsData
>
list
=
new
ArrayList
<
NewsData
>();
List
<
NewsData
>
list
=
new
ArrayList
<
NewsData
>();
int
page
=
1
;
int
page
=
1
;
boolean
more
=
true
;
boolean
more
=
true
;
...
@@ -59,6 +60,19 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
...
@@ -59,6 +60,19 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
return
list
;
return
list
;
}
}
public
static
Map
<
String
,
Object
>
getSoNewsData
(
String
word
,
Proxy
proxy
,
int
page
)
throws
Exception
{
String
htmlBody
=
downloadHtml
(
word
,
"news"
,
proxy
,
page
);
if
(
htmlBody
!=
null
)
{
return
analysisData
(
htmlBody
,
proxy
,
word
);
}
return
null
;
}
/**
/**
* @Title: getSoNewsDataByTitle
* @Title: getSoNewsDataByTitle
* @author hero
* @author hero
...
@@ -69,7 +83,7 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
...
@@ -69,7 +83,7 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
* @return List<NewsData> 返回类型
* @return List<NewsData> 返回类型
*/
*/
@SuppressWarnings
(
"unchecked"
)
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
NewsData
>
getSoNewsDataByTitle
(
String
word
,
Proxy
proxy
)
{
public
static
List
<
NewsData
>
getSoNewsDataByTitle
(
String
word
,
Proxy
proxy
)
throws
Exception
{
List
<
NewsData
>
list
=
new
ArrayList
<
NewsData
>();
List
<
NewsData
>
list
=
new
ArrayList
<
NewsData
>();
int
page
=
1
;
int
page
=
1
;
boolean
more
=
true
;
boolean
more
=
true
;
...
@@ -104,7 +118,7 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
...
@@ -104,7 +118,7 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件
* @param @return 设定文件
* @return String 返回类型
* @return String 返回类型
*/
*/
private
static
String
downloadHtml
(
String
word
,
String
tn
,
Proxy
proxy
,
int
page
)
{
private
static
String
downloadHtml
(
String
word
,
String
tn
,
Proxy
proxy
,
int
page
)
throws
Exception
{
// 获取通用请求头
// 获取通用请求头
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
// 获取链接地址
// 获取链接地址
...
@@ -138,7 +152,7 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
...
@@ -138,7 +152,7 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
* 设定文件
* 设定文件
* @return Map<String,Object> 返回类型
* @return Map<String,Object> 返回类型
*/
*/
private
static
Map
<
String
,
Object
>
analysisData
(
String
htmlBody
,
Proxy
proxy
,
String
word
)
{
private
static
Map
<
String
,
Object
>
analysisData
(
String
htmlBody
,
Proxy
proxy
,
String
word
)
throws
Exception
{
Map
<
String
,
Object
>
resultMap
=
new
HashMap
<
String
,
Object
>();
Map
<
String
,
Object
>
resultMap
=
new
HashMap
<
String
,
Object
>();
List
<
NewsData
>
list
=
new
ArrayList
<
NewsData
>();
List
<
NewsData
>
list
=
new
ArrayList
<
NewsData
>();
boolean
more
=
true
;
boolean
more
=
true
;
...
@@ -199,7 +213,7 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
...
@@ -199,7 +213,7 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件
* @param @return 设定文件
* @return Map<String,Object> 返回类型
* @return Map<String,Object> 返回类型
*/
*/
private
static
Map
<
String
,
Object
>
analysisDataByTitle
(
String
htmlBody
,
Proxy
proxy
,
String
word
)
{
private
static
Map
<
String
,
Object
>
analysisDataByTitle
(
String
htmlBody
,
Proxy
proxy
,
String
word
)
throws
Exception
{
Map
<
String
,
Object
>
resultMap
=
new
HashMap
<
String
,
Object
>();
Map
<
String
,
Object
>
resultMap
=
new
HashMap
<
String
,
Object
>();
List
<
NewsData
>
list
=
new
ArrayList
<
NewsData
>();
List
<
NewsData
>
list
=
new
ArrayList
<
NewsData
>();
boolean
more
=
true
;
boolean
more
=
true
;
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/SougouNewsCrawlerParse.java
View file @
38bcf00d
...
@@ -35,9 +35,10 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
...
@@ -35,9 +35,10 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
* @param @param proxy
* @param @param proxy
* @param @return 设定文件
* @param @return 设定文件
* @return List<NewsData> 返回类型
* @return List<NewsData> 返回类型
* @throws Exception
*/
*/
@SuppressWarnings
(
"unchecked"
)
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
NewsData
>
getSougouNewsData
(
String
word
,
Proxy
proxy
){
public
static
List
<
NewsData
>
getSougouNewsData
(
String
word
,
Proxy
proxy
)
throws
Exception
{
List
<
NewsData
>
list
=
new
ArrayList
<
NewsData
>();
List
<
NewsData
>
list
=
new
ArrayList
<
NewsData
>();
int
page
=
1
;
int
page
=
1
;
boolean
more
=
true
;
boolean
more
=
true
;
...
@@ -62,8 +63,18 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
...
@@ -62,8 +63,18 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
}
}
public
static
Map
<
String
,
Object
>
getSougouNewsData
(
String
word
,
Proxy
proxy
,
int
page
)
throws
Exception
{
String
htmlBody
=
downloadHtml
(
word
,
1
,
proxy
,
page
);
if
(
htmlBody
!=
null
&&
!
htmlBody
.
equals
(
""
)){
return
analysisData
(
htmlBody
,
proxy
,
word
);
}
return
null
;
}
@SuppressWarnings
(
"unchecked"
)
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
NewsData
>
getSougouNewsDataByTitle
(
String
word
,
Proxy
proxy
){
public
static
List
<
NewsData
>
getSougouNewsDataByTitle
(
String
word
,
Proxy
proxy
)
throws
Exception
{
List
<
NewsData
>
list
=
new
ArrayList
<
NewsData
>();
List
<
NewsData
>
list
=
new
ArrayList
<
NewsData
>();
int
page
=
0
;
int
page
=
0
;
boolean
more
=
true
;
boolean
more
=
true
;
...
@@ -99,7 +110,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
...
@@ -99,7 +110,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件
* @param @return 设定文件
* @return String 返回类型
* @return String 返回类型
*/
*/
private
static
String
downloadHtml
(
String
word
,
int
mode
,
Proxy
proxy
,
int
page
)
{
private
static
String
downloadHtml
(
String
word
,
int
mode
,
Proxy
proxy
,
int
page
)
throws
Exception
{
//获取通用请求头
//获取通用请求头
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
//获取链接地址
//获取链接地址
...
@@ -118,7 +129,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
...
@@ -118,7 +129,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
return
null
;
return
null
;
}
}
private
static
String
downloadHtml
(
String
url
,
Proxy
proxy
,
int
page
)
{
private
static
String
downloadHtml
(
String
url
,
Proxy
proxy
,
int
page
)
throws
Exception
{
//获取通用请求头
//获取通用请求头
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
//获取链接地址
//获取链接地址
...
@@ -149,7 +160,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
...
@@ -149,7 +160,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件
* @param @return 设定文件
* @return Map<String,Object> 返回类型
* @return Map<String,Object> 返回类型
*/
*/
private
static
Map
<
String
,
Object
>
analysisData
(
String
htmlBody
,
Proxy
proxy
,
String
word
)
{
private
static
Map
<
String
,
Object
>
analysisData
(
String
htmlBody
,
Proxy
proxy
,
String
word
)
throws
Exception
{
Map
<
String
,
Object
>
resultMap
=
new
HashMap
<
String
,
Object
>();
Map
<
String
,
Object
>
resultMap
=
new
HashMap
<
String
,
Object
>();
List
<
NewsData
>
list
=
new
ArrayList
<
NewsData
>();
List
<
NewsData
>
list
=
new
ArrayList
<
NewsData
>();
boolean
more
=
true
;
boolean
more
=
true
;
...
@@ -241,7 +252,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
...
@@ -241,7 +252,7 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
* @return List<NewsData> 返回类型
* @return List<NewsData> 返回类型
*/
*/
@SuppressWarnings
(
"unchecked"
)
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
NewsData
>
getOherSougouNewsData
(
String
url
,
String
word
,
Proxy
proxy
){
public
static
List
<
NewsData
>
getOherSougouNewsData
(
String
url
,
String
word
,
Proxy
proxy
)
throws
Exception
{
List
<
NewsData
>
list
=
new
ArrayList
<
NewsData
>();
List
<
NewsData
>
list
=
new
ArrayList
<
NewsData
>();
int
page
=
1
;
int
page
=
1
;
boolean
more
=
true
;
boolean
more
=
true
;
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/SougouZhihuCrawlerParse.java
View file @
38bcf00d
...
@@ -38,9 +38,10 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
...
@@ -38,9 +38,10 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
* @param @param proxy
* @param @param proxy
* @param @return 设定文件
* @param @return 设定文件
* @return List<ZhiHuData> 返回类型
* @return List<ZhiHuData> 返回类型
* @throws Exception
*/
*/
@SuppressWarnings
(
"unchecked"
)
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
ZhiHuData
>
getSougouZhihuData
(
String
word
,
Proxy
proxy
){
public
static
List
<
ZhiHuData
>
getSougouZhihuData
(
String
word
,
Proxy
proxy
)
throws
Exception
{
List
<
ZhiHuData
>
list
=
new
ArrayList
<
ZhiHuData
>();
List
<
ZhiHuData
>
list
=
new
ArrayList
<
ZhiHuData
>();
int
page
=
1
;
int
page
=
1
;
boolean
more
=
true
;
boolean
more
=
true
;
...
@@ -66,6 +67,13 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
...
@@ -66,6 +67,13 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
}
}
public
static
Map
<
String
,
Object
>
getSougouZhihuData
(
String
word
,
Proxy
proxy
,
int
page
)
throws
Exception
{
String
htmlBody
=
downloadHtml
(
word
,
proxy
,
page
);
if
(
htmlBody
!=
null
&&
!
htmlBody
.
equals
(
""
)){
return
analysisData
(
htmlBody
,
proxy
,
word
);
}
return
null
;
}
/**
/**
*
*
...
@@ -79,7 +87,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
...
@@ -79,7 +87,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件
* @param @return 设定文件
* @return String 返回类型
* @return String 返回类型
*/
*/
private
static
String
downloadHtml
(
String
word
,
Proxy
proxy
,
int
page
)
{
private
static
String
downloadHtml
(
String
word
,
Proxy
proxy
,
int
page
)
throws
Exception
{
//获取通用请求头
//获取通用请求头
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
//获取链接地址
//获取链接地址
...
@@ -97,7 +105,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
...
@@ -97,7 +105,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
return
null
;
return
null
;
}
}
private
static
String
downloadHtml
(
String
url
,
Proxy
proxy
,
String
type
)
{
private
static
String
downloadHtml
(
String
url
,
Proxy
proxy
,
String
type
)
throws
Exception
{
//获取通用请求头
//获取通用请求头
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
//获取链接地址
//获取链接地址
...
@@ -131,7 +139,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
...
@@ -131,7 +139,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件
* @param @return 设定文件
* @return Map<String,Object> 返回类型
* @return Map<String,Object> 返回类型
*/
*/
private
static
Map
<
String
,
Object
>
analysisData
(
String
htmlBody
,
Proxy
proxy
,
String
word
)
{
private
static
Map
<
String
,
Object
>
analysisData
(
String
htmlBody
,
Proxy
proxy
,
String
word
)
throws
Exception
{
Map
<
String
,
Object
>
resultMap
=
new
HashMap
<
String
,
Object
>();
Map
<
String
,
Object
>
resultMap
=
new
HashMap
<
String
,
Object
>();
List
<
ZhiHuData
>
list
=
new
ArrayList
<
ZhiHuData
>();
List
<
ZhiHuData
>
list
=
new
ArrayList
<
ZhiHuData
>();
boolean
more
=
true
;
boolean
more
=
true
;
...
@@ -203,7 +211,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
...
@@ -203,7 +211,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
* @param @return 设定文件
* @param @return 设定文件
* @return ZhiHuData 返回类型
* @return ZhiHuData 返回类型
*/
*/
private
static
ZhiHuData
analysisZhihuAnswer
(
String
url
,
Proxy
proxy
,
ZhiHuData
zhihu
){
private
static
ZhiHuData
analysisZhihuAnswer
(
String
url
,
Proxy
proxy
,
ZhiHuData
zhihu
)
throws
Exception
{
try
{
try
{
String
htmlBody
=
downloadHtml
(
url
,
proxy
,
"问答"
);
String
htmlBody
=
downloadHtml
(
url
,
proxy
,
"问答"
);
if
(
htmlBody
!=
null
){
if
(
htmlBody
!=
null
){
...
@@ -239,7 +247,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
...
@@ -239,7 +247,7 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
* @return ZhiHuData 返回类型
* @return ZhiHuData 返回类型
*/
*/
@SuppressWarnings
(
"deprecation"
)
@SuppressWarnings
(
"deprecation"
)
private
static
ZhiHuData
analysisZhihuArticle
(
String
url
,
Proxy
proxy
,
ZhiHuData
zhihu
){
private
static
ZhiHuData
analysisZhihuArticle
(
String
url
,
Proxy
proxy
,
ZhiHuData
zhihu
)
throws
Exception
{
try
{
try
{
String
htmlBody
=
downloadHtml
(
url
,
proxy
,
"文章"
);
String
htmlBody
=
downloadHtml
(
url
,
proxy
,
"文章"
);
Document
document
=
Jsoup
.
parse
(
htmlBody
);
Document
document
=
Jsoup
.
parse
(
htmlBody
);
...
...
src/test/java/com/zhiwei/media_data_crawler/test/DataCrawlerTest.java
View file @
38bcf00d
package
com
.
zhiwei
.
media_data_crawler
.
test
;
//
package com.zhiwei.media_data_crawler.test;
//
import
java.net.Proxy
;
//
import java.net.Proxy;
import
java.util.List
;
//
import java.util.List;
//
import
org.junit.Test
;
//
import org.junit.Test;
//
import
com.zhiwei.media_data_crawler.crawler.SougouZhihuCrawlerParse
;
//
import com.zhiwei.media_data_crawler.crawler.SougouZhihuCrawlerParse;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
//
import com.zhiwei.media_data_crawler.data.DataCrawler;
import
com.zhiwei.media_data_crawler.entity.NewsData
;
//
import com.zhiwei.media_data_crawler.entity.NewsData;
import
com.zhiwei.media_data_crawler.entity.ZhiHuData
;
//
import com.zhiwei.media_data_crawler.entity.ZhiHuData;
//
public
class
DataCrawlerTest
{
//
public class DataCrawlerTest {
//
//
//
//
//
@Test
//
@Test
public
void
getSoNewsTest
(){
//
public void getSoNewsTest(){
String
word
=
"马云"
;
//关键词
//
String word = "马云"; //关键词
String
startTime
=
"2017-03-01 00:00:00"
;
//开始时间
//
String startTime = "2017-03-01 00:00:00"; //开始时间
String
endTime
=
"2017-03-01 23:59:59"
;
//结束时间
//
String endTime = "2017-03-01 23:59:59"; //结束时间
Proxy
proxy
=
null
;
//代理IP,不用可不填写
//
Proxy proxy = null; //代理IP,不用可不填写
//百度新闻采集demo
//
//百度新闻采集demo
// List<NewsData> baiduNewsList = DataCrawler.getBaiduNewsData(word, startTime, endTime, proxy);
//
//
List<NewsData> baiduNewsList = DataCrawler.getBaiduNewsData(word, startTime, endTime, proxy);
// //搜狗新闻关键词采集demo
//
//
//搜狗新闻关键词采集demo
// List<NewsData> sogouNewsList = DataCrawler.getSougouNewsData(word, proxy);
//
//
List<NewsData> sogouNewsList = DataCrawler.getSougouNewsData(word, proxy);
// //360新闻采集demo
//
//
//360新闻采集demo
// List<NewsData> soNewsList = DataCrawler.getSoNewsData(word, proxy);
//
//
List<NewsData> soNewsList = DataCrawler.getSoNewsData(word, proxy);
//搜狗知乎采集
////
//搜狗知乎采集
List
<
ZhiHuData
>
zhihuList
=
DataCrawler
.
getSougouZhihuData
(
word
,
proxy
);
////
List<ZhiHuData> zhihuList = DataCrawler.getSougouZhihuData(word, proxy);
System
.
out
.
println
(
zhihuList
.
size
());
////
System.out.println(zhihuList.size());
//
}
//
}
//
}
//
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment