Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
M
media_data_crawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
media_data_crawler
Commits
9453f8d6
Commit
9453f8d6
authored
Sep 05, 2018
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
修改http-core核心包,增加程序稳定性及处理乱码
parent
cc347740
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
196 additions
and
184 deletions
+196
-184
README.md
+1
-1
pom.xml
+5
-5
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduNewsCrawlerParse.java
+27
-24
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduTiebaCrawlerParse.java
+24
-24
src/main/java/com/zhiwei/media_data_crawler/crawler/DoubanCrawlerParse.java
+25
-24
src/main/java/com/zhiwei/media_data_crawler/crawler/SoCrawlerParse.java
+25
-20
src/main/java/com/zhiwei/media_data_crawler/crawler/SoNewsCrawlerParse.java
+20
-17
src/main/java/com/zhiwei/media_data_crawler/crawler/SougouNewsCrawlerParse.java
+24
-21
src/main/java/com/zhiwei/media_data_crawler/crawler/SougouZhihuCrawlerParse.java
+25
-26
src/main/java/com/zhiwei/media_data_crawler/crawler/TianYaCrawlerParse.java
+20
-22
No files found.
README.md
View file @
9453f8d6
...
...
@@ -4,7 +4,7 @@
#####更新提示2018-03-06
本次更新内容为添加搜狗知乎采集
添加自助翻页功能,如使用请添加休眠时间,以下是使用例子,百度为例
public static List
<NewsData>
getBaiduNewsData(String word, String startTime, String
endTime, Proxy proxy) throws Exception {
public static List
<NewsData>
getBaiduNewsData(String word, String startTime, String endTime, Proxy proxy) throws Exception {
List
<NewsData>
list = new ArrayList
<NewsData>
();
int page = 0;
boolean more = true;
...
...
pom.xml
View file @
9453f8d6
...
...
@@ -60,13 +60,12 @@
<url>
http://192.168.0.30:8081/nexus/content/repositories/releases/
</url>
</repository>
</distributionManagement>
<dependencies>
<dependency>
<groupId>
com.zhiwei
</groupId>
<artifactId>
zhiwei
T
ools
</artifactId>
<version>
0.0.
7
-SNAPSHOT
</version>
<groupId>
com.zhiwei
.tools
</groupId>
<artifactId>
zhiwei
-t
ools
</artifactId>
<version>
0.0.
4
-SNAPSHOT
</version>
</dependency>
</dependencies>
</project>
\ No newline at end of file
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduNewsCrawlerParse.java
View file @
9453f8d6
package
com
.
zhiwei
.
media_data_crawler
.
crawler
;
import
java.io.IOException
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.entity.NewsData
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Response
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
...
...
@@ -9,24 +24,10 @@ import java.util.Map;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.entity.NewsData
;
import
com.zhiwei.zhiweiTools.httpClient.HeaderTool
;
import
com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK
;
import
com.zhiwei.zhiweiTools.timeParse.TimeParse
;
import
com.zhiwei.zhiweiTools.tools.URLCodeUtil
;
import
com.zhiwei.zhiweiTools.tools.ZhiWeiTools
;
public
class
BaiduNewsCrawlerParse
extends
HttpClientTemplateOK
{
public
class
BaiduNewsCrawlerParse
{
private
static
Logger
logger
=
Log
gerFactory
.
getLogger
(
BaiduNewsCrawlerParse
.
class
);
private
static
Logger
logger
=
Log
Manager
.
getLogger
(
BaiduNewsCrawlerParse
.
class
);
private
static
final
String
pt
=
"百度新闻"
;
/**
...
...
@@ -194,14 +195,14 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
// 获取链接地址
String
url
=
getUrl
(
word
,
startTime
,
endTime
,
tn
,
page
);
System
.
out
.
println
(
url
);
headerMap
.
put
(
"Host"
,
"news.baidu.com"
);
headerMap
.
put
(
"cookie"
,
cookie
);
// 下载数据页面
for
(
int
i
=
1
;
i
<=
3
;
i
++)
{
try
{
return
get
(
url
,
proxy
,
headerMap
);
}
catch
(
IOException
e
)
{
Response
response
=
HttpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
));
return
response
.
body
().
toString
();
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取数据时出现问题,问题为:{}"
,
e
.
fillInStackTrace
());
if
(
i
==
3
){
throw
e
;
...
...
@@ -245,8 +246,9 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
// 下载数据页面
for
(
int
i
=
1
;
i
<=
3
;
i
++)
{
try
{
return
get
(
url
,
proxy
,
headerMap
);
}
catch
(
IOException
e
)
{
Response
response
=
HttpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
,
false
);
return
response
.
body
().
toString
();
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取数据时出现问题,问题为:{}"
,
e
.
fillInStackTrace
());
if
(
i
==
3
){
throw
e
;
...
...
@@ -268,8 +270,9 @@ public class BaiduNewsCrawlerParse extends HttpClientTemplateOK {
// 下载数据页面
for
(
int
i
=
1
;
i
<=
3
;
i
++)
{
try
{
return
get
(
url
,
proxy
,
headerMap
);
}
catch
(
IOException
e
)
{
Response
response
=
HttpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
,
false
);
return
response
.
body
().
toString
();
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取数据时出现问题,问题为:{}"
,
e
.
fillInStackTrace
());
if
(
i
==
3
){
throw
e
;
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduTiebaCrawlerParse.java
View file @
9453f8d6
package
com
.
zhiwei
.
media_data_crawler
.
crawler
;
import
java.io.IOException
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.entity.TiebaData
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Response
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
...
...
@@ -9,23 +23,9 @@ import java.util.Map;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.entity.TiebaData
;
import
com.zhiwei.zhiweiTools.httpClient.HeaderTool
;
import
com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK
;
import
com.zhiwei.zhiweiTools.tools.URLCodeUtil
;
import
com.zhiwei.zhiweiTools.tools.ZhiWeiTools
;
public
class
BaiduTiebaCrawlerParse
extends
HttpClientTemplateOK
{
public
class
BaiduTiebaCrawlerParse
{
private
static
Logger
logger
=
Log
gerFactory
.
getLogger
(
BaiduTiebaCrawlerParse
.
class
);
private
static
Logger
logger
=
Log
Manager
.
getLogger
(
BaiduTiebaCrawlerParse
.
class
);
/**
* @Title: getBaiduTiebaData
* @author hero
...
...
@@ -107,7 +107,6 @@ public class BaiduTiebaCrawlerParse extends HttpClientTemplateOK {
Map
<
String
,
Object
>
dataMap
=
analysisDataAnswer
(
htmlBody
,
aid
);
List
<
TiebaData
>
dataList
=
(
List
<
TiebaData
>)
dataMap
.
get
(
"data"
);
list
.
addAll
(
dataList
);
System
.
out
.
println
(
list
.
size
());
more
=
(
Boolean
)
dataMap
.
get
(
"more"
);
}
else
{
more
=
false
;
...
...
@@ -218,8 +217,9 @@ public class BaiduTiebaCrawlerParse extends HttpClientTemplateOK {
// 下载数据页面
for
(
int
i
=
1
;
i
<=
3
;
i
++)
{
try
{
return
get
(
url
,
proxy
,
headerMap
);
}
catch
(
IOException
e
)
{
Response
response
=
HttpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
,
false
);
return
response
.
body
().
toString
();
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取数据时出现问题,问题为:{}"
,
e
.
fillInStackTrace
());
if
(
i
==
3
){
throw
e
;
...
...
@@ -255,8 +255,9 @@ public class BaiduTiebaCrawlerParse extends HttpClientTemplateOK {
// 下载数据页面
for
(
int
i
=
1
;
i
<=
3
;
i
++)
{
try
{
return
get
(
url
,
proxy
,
headerMap
);
}
catch
(
IOException
e
)
{
Response
response
=
HttpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
,
false
);
return
response
.
body
().
toString
();
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取数据时出现问题,问题为:{}"
,
e
.
fillInStackTrace
());
if
(
i
==
3
){
throw
e
;
...
...
@@ -341,14 +342,13 @@ public class BaiduTiebaCrawlerParse extends HttpClientTemplateOK {
String
url
=
null
;
if
(
word
!=
null
)
{
if
(
tiebaName
!=
null
){
url
=
"http://tieba.baidu.com/f/search/res?isnew=1&kw="
+
URLCodeUtil
.
getURLEncode
(
tiebaName
,
"GBK"
)+
"&qw="
+
url
=
"http://tieba.baidu.com/f/search/res?isnew=1&kw="
+
URLCodeUtil
.
getURLEncode
(
tiebaName
,
"GBK"
)+
"&qw="
+
URLCodeUtil
.
getURLEncode
(
word
,
"GBK"
)+
"&rn=10&un=&only_thread=0&sm=1&sd=&ed=&pn="
+
page
;
}
else
{
url
=
"http://tieba.baidu.com/f/search/res?isnew=1&kw=&qw="
+
URLCodeUtil
.
getURLEncode
(
word
,
"GBK"
)+
"&rn=10&un=&only_thread=0&sm=1&sd=&ed=&pn="
+
page
;
}
}
System
.
out
.
println
(
url
);
return
url
;
}
}
src/main/java/com/zhiwei/media_data_crawler/crawler/DoubanCrawlerParse.java
View file @
9453f8d6
package
com
.
zhiwei
.
media_data_crawler
.
crawler
;
import
java.io.IOException
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.entity.DouBanData
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Response
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.entity.DouBanData
;
import
com.zhiwei.zhiweiTools.httpClient.HeaderTool
;
import
com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK
;
import
com.zhiwei.zhiweiTools.tools.URLCodeUtil
;
import
com.zhiwei.zhiweiTools.tools.ZhiWeiTools
;
public
class
DoubanCrawlerParse
extends
HttpClientTemplateOK
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
DoubanCrawlerParse
.
class
);
public
class
DoubanCrawlerParse
{
private
static
Logger
logger
=
LogManager
.
getLogger
(
BaiduTiebaCrawlerParse
.
class
);
/**
*
* @Title: getDoubanData
...
...
@@ -93,8 +93,9 @@ public class DoubanCrawlerParse extends HttpClientTemplateOK {
// 下载数据页面
for
(
int
i
=
1
;
i
<=
3
;
i
++)
{
try
{
return
get
(
url
,
proxy
,
headerMap
);
}
catch
(
IOException
e
)
{
Response
response
=
HttpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
,
false
);
return
response
.
body
().
toString
();
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取数据时出现问题,问题为:{}"
,
e
.
fillInStackTrace
());
if
(
i
==
3
){
throw
e
;
...
...
@@ -114,8 +115,9 @@ public class DoubanCrawlerParse extends HttpClientTemplateOK {
// 下载数据页面
for
(
int
i
=
1
;
i
<=
3
;
i
++)
{
try
{
return
get
(
url
,
proxy
,
headerMap
);
}
catch
(
IOException
e
)
{
Response
response
=
HttpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
,
false
);
return
response
.
body
().
toString
();
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取数据时出现问题,问题为:{}"
,
e
.
fillInStackTrace
());
if
(
i
==
3
){
throw
e
;
...
...
@@ -294,12 +296,11 @@ public class DoubanCrawlerParse extends HttpClientTemplateOK {
String
url
=
null
;
if
(
word
!=
null
)
{
if
(
type
.
equals
(
"topic"
)){
url
=
"https://www.douban.com/group/search?q="
+
URLCodeUtil
.
getURLEncode
(
word
,
"utf-8"
)+
"&start="
+
page
*
50
+
"&cat=1013&sort=time"
;
url
=
"https://www.douban.com/group/search?q="
+
URLCodeUtil
.
getURLEncode
(
word
,
"utf-8"
)+
"&start="
+
page
*
50
+
"&cat=1013&sort=time"
;
}
else
if
(
type
.
equals
(
"note"
)){
url
=
"https://www.douban.com/j/search?q="
+
URLCodeUtil
.
getURLEncode
(
word
,
"utf-8"
)+
"&start="
+
page
*
20
+
"&cat=1015"
;
}
}
System
.
out
.
println
(
url
);
return
url
;
}
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/SoCrawlerParse.java
View file @
9453f8d6
package
com
.
zhiwei
.
media_data_crawler
.
crawler
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.entity.NewsData
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
com.zhiwei.tools.httpclient.HttpClientTemplateOK
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Response
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.entity.NewsData
;
import
com.zhiwei.zhiweiTools.httpClient.HeaderTool
;
import
com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK
;
import
com.zhiwei.zhiweiTools.timeParse.TimeParse
;
import
com.zhiwei.zhiweiTools.tools.URLCodeUtil
;
import
com.zhiwei.zhiweiTools.tools.ZhiWeiTools
;
public
class
SoCrawlerParse
extends
HttpClientTemplateOK
{
public
class
SoCrawlerParse
{
private
static
Logger
logger
=
Log
gerFactory
.
getLogger
(
SoCrawlerParse
.
class
);
private
static
Logger
logger
=
Log
Manager
.
getLogger
(
SoCrawlerParse
.
class
);
private
static
final
String
pt
=
"360网页"
;
/**
...
...
@@ -99,7 +103,8 @@ public class SoCrawlerParse extends HttpClientTemplateOK {
// 下载数据页面
for
(
int
i
=
1
;
i
<=
3
;
i
++)
{
try
{
return
get
(
url
,
proxy
,
headerMap
);
Response
response
=
HttpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
,
false
);
return
response
.
body
().
toString
();
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取360新闻数据时出现问题,问题为:{}"
,
e
.
fillInStackTrace
());
if
(
i
==
3
){
...
...
@@ -155,7 +160,6 @@ public class SoCrawlerParse extends HttpClientTemplateOK {
if
(!
element
.
attr
(
"class"
).
equals
(
"res-list hasimg hasmediav"
)){
link
=
element
.
select
(
"h3.res-title"
).
select
(
"a"
).
attr
(
"href"
);
title
=
element
.
select
(
"h3.res-title"
).
select
(
"a"
).
text
();
System
.
out
.
println
(
title
+
"============"
+
link
);
NewsData
newsData
=
null
;
String
realUrl
=
link
;
if
(
link
.
contains
(
"www.so.com/link"
))
{
...
...
@@ -257,7 +261,7 @@ public class SoCrawlerParse extends HttpClientTemplateOK {
private
static
String
getUrl
(
String
word
,
String
site
,
String
time
,
int
page
)
{
String
url
=
null
;
if
(
word
!=
null
)
{
url
=
"https://www.so.com/s?q="
+
URLCodeUtil
.
getURLEncode
(
word
,
"utf-8"
);
url
=
"https://www.so.com/s?q="
+
URLCodeUtil
.
getURLEncode
(
word
,
"utf-8"
);
if
(
site
!=
null
)
{
url
=
url
+
"+site%3A"
+
site
;
}
...
...
@@ -279,7 +283,8 @@ public class SoCrawlerParse extends HttpClientTemplateOK {
String
url
=
null
;
if
(
link
!=
null
)
{
try
{
String
htmlBody
=
HttpClientTemplateOK
.
get
(
link
,
proxy
,
null
);
Response
response
=
HttpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
proxy
,
false
);
String
htmlBody
=
response
.
body
().
toString
();
if
(
htmlBody
!=
null
)
{
url
=
htmlBody
.
split
(
"window.location.replace\\(\""
)[
1
].
split
(
"\"\\)"
)[
0
];
url
=
url
.
replaceAll
(
"http"
,
"https"
);
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/SoNewsCrawlerParse.java
View file @
9453f8d6
package
com
.
zhiwei
.
media_data_crawler
.
crawler
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.entity.NewsData
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Response
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.entity.NewsData
;
import
com.zhiwei.zhiweiTools.httpClient.HeaderTool
;
import
com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK
;
import
com.zhiwei.zhiweiTools.timeParse.TimeParse
;
import
com.zhiwei.zhiweiTools.tools.URLCodeUtil
;
import
com.zhiwei.zhiweiTools.tools.ZhiWeiTools
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
public
class
SoNewsCrawlerParse
extends
HttpClientTemplateOK
{
public
class
SoNewsCrawlerParse
{
private
static
Logger
logger
=
Log
gerFactory
.
getLogger
(
SoNewsCrawlerParse
.
class
);
private
static
Logger
logger
=
Log
Manager
.
getLogger
(
SoNewsCrawlerParse
.
class
);
private
static
final
String
pt
=
"360新闻"
;
/**
...
...
@@ -133,7 +135,8 @@ public class SoNewsCrawlerParse extends HttpClientTemplateOK {
// 下载数据页面
for
(
int
i
=
1
;
i
<=
3
;
i
++)
{
try
{
return
get
(
url
,
proxy
,
headerMap
);
Response
response
=
HttpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
,
false
);
return
response
.
body
().
toString
();
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取360新闻数据时出现问题,问题为:{}"
,
e
.
fillInStackTrace
());
if
(
i
==
3
){
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/SougouNewsCrawlerParse.java
View file @
9453f8d6
package
com
.
zhiwei
.
media_data_crawler
.
crawler
;
import
java.io.IOException
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.entity.NewsData
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Response
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.entity.NewsData
;
import
com.zhiwei.zhiweiTools.httpClient.HeaderTool
;
import
com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK
;
import
com.zhiwei.zhiweiTools.timeParse.TimeParse
;
import
com.zhiwei.zhiweiTools.tools.URLCodeUtil
;
import
com.zhiwei.zhiweiTools.tools.ZhiWeiTools
;
public
class
SougouNewsCrawlerParse
{
public
class
SougouNewsCrawlerParse
extends
HttpClientTemplateOK
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
SougouNewsCrawlerParse
.
class
);
private
static
Logger
logger
=
LogManager
.
getLogger
(
BaiduTiebaCrawlerParse
.
class
);
private
static
final
String
pt
=
"搜狗新闻"
;
...
...
@@ -127,8 +128,9 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
//下载数据页面
for
(
int
i
=
1
;
i
<=
3
;
i
++){
try
{
return
get
(
url
,
proxy
,
headerMap
);
}
catch
(
IOException
e
)
{
Response
response
=
HttpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
,
false
);
return
response
.
body
().
toString
();
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取搜狗新闻数据时出现问题,问题为:{}"
,
e
.
fillInStackTrace
());
if
(
i
==
3
){
throw
e
;
...
...
@@ -150,8 +152,9 @@ public class SougouNewsCrawlerParse extends HttpClientTemplateOK {
//下载数据页面
for
(
int
i
=
1
;
i
<=
3
;
i
++){
try
{
return
get
(
url
,
proxy
,
headerMap
);
}
catch
(
IOException
e
)
{
Response
response
=
HttpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
,
false
);
return
response
.
body
().
toString
();
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取搜狗新闻数据时出现问题,问题为:{}"
,
e
.
fillInStackTrace
());
if
(
i
==
3
){
throw
e
;
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/SougouZhihuCrawlerParse.java
View file @
9453f8d6
package
com
.
zhiwei
.
media_data_crawler
.
crawler
;
import
java.io.IOException
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.entity.ZhiHuData
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Response
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.entity.ZhiHuData
;
import
com.zhiwei.zhiweiTools.httpClient.HeaderTool
;
import
com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK
;
import
com.zhiwei.zhiweiTools.timeParse.TimeParse
;
import
com.zhiwei.zhiweiTools.tools.URLCodeUtil
;
import
com.zhiwei.zhiweiTools.tools.ZhiWeiTools
;
import
java.net.Proxy
;
import
java.util.*
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
public
class
SougouZhihuCrawlerParse
extends
HttpClientTemplateOK
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
SougouZhihuCrawlerParse
.
class
);
public
class
SougouZhihuCrawlerParse
{
private
static
Logger
logger
=
LogManager
.
getLogger
(
SougouZhihuCrawlerParse
.
class
);
private
static
final
String
pt
=
"搜狗知乎"
;
...
...
@@ -99,8 +96,9 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
//下载数据页面
for
(
int
i
=
1
;
i
<=
3
;
i
++){
try
{
return
get
(
url
,
proxy
,
headerMap
);
}
catch
(
IOException
e
)
{
Response
response
=
HttpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
,
false
);
return
response
.
body
().
toString
();
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取搜狗新闻数据时出现问题,问题为:{}"
,
e
.
fillInStackTrace
());
if
(
i
==
3
){
throw
e
;
...
...
@@ -125,8 +123,9 @@ public class SougouZhihuCrawlerParse extends HttpClientTemplateOK {
//下载数据页面
for
(
int
i
=
1
;
i
<=
3
;
i
++){
try
{
return
get
(
url
,
proxy
,
headerMap
);
}
catch
(
IOException
e
)
{
Response
response
=
HttpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
,
false
);
return
response
.
body
().
toString
();
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取搜狗新闻数据时出现问题,问题为:{}"
,
e
.
fillInStackTrace
());
if
(
i
==
3
){
throw
e
;
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/TianYaCrawlerParse.java
View file @
9453f8d6
package
com
.
zhiwei
.
media_data_crawler
.
crawler
;
import
java.io.IOException
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.entity.LunTanData
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Response
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.entity.LunTanData
;
import
com.zhiwei.zhiweiTools.httpClient.HeaderTool
;
import
com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK
;
import
com.zhiwei.zhiweiTools.timeParse.TimeParse
;
import
com.zhiwei.zhiweiTools.tools.URLCodeUtil
;
import
com.zhiwei.zhiweiTools.tools.ZhiWeiTools
;
import
java.net.Proxy
;
import
java.util.*
;
public
class
TianYaCrawlerParse
extends
HttpClientTemplateOK
{
private
static
Logger
logger
=
Log
gerFactory
.
getLogger
(
TianYaCrawlerParse
.
class
);
public
class
TianYaCrawlerParse
{
private
static
Logger
logger
=
Log
Manager
.
getLogger
(
TianYaCrawlerParse
.
class
);
private
static
final
String
pt
=
"天涯论坛"
;
/**
* @Title: getBaiduTiebaData
...
...
@@ -89,8 +86,9 @@ public class TianYaCrawlerParse extends HttpClientTemplateOK {
// 下载数据页面
for
(
int
i
=
1
;
i
<=
3
;
i
++)
{
try
{
return
get
(
url
,
proxy
,
headerMap
);
}
catch
(
IOException
e
)
{
Response
response
=
HttpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
,
false
);
return
response
.
body
().
toString
();
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取数据时出现问题,问题为:{}"
,
e
.
fillInStackTrace
());
if
(
i
==
3
){
throw
e
;
...
...
@@ -172,7 +170,7 @@ public class TianYaCrawlerParse extends HttpClientTemplateOK {
private
static
String
getUrl
(
String
word
,
int
page
)
{
String
url
=
null
;
if
(
word
!=
null
)
{
url
=
"http://search.tianya.cn/bbs?q="
+
URLCodeUtil
.
getURLEncode
(
word
,
"utf-8"
)
url
=
"http://search.tianya.cn/bbs?q="
+
URLCodeUtil
.
getURLEncode
(
word
,
"utf-8"
)
+
"&s=4&f=0&pn="
+
page
;
}
System
.
out
.
println
(
url
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment