Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
M
media_data_crawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
media_data_crawler
Commits
491f1e25
Commit
491f1e25
authored
Aug 02, 2018
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
添加360采集今日头条数据
parent
4986288a
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
296 additions
and
0 deletions
+296
-0
src/main/java/com/zhiwei/media_data_crawler/crawler/SoCrawlerParse.java
+296
-0
No files found.
src/main/java/com/zhiwei/media_data_crawler/crawler/SoCrawlerParse.java
0 → 100644
View file @
491f1e25
package
com
.
zhiwei
.
media_data_crawler
.
crawler
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.media_data_crawler.data.DataCrawler
;
import
com.zhiwei.media_data_crawler.entity.NewsData
;
import
com.zhiwei.zhiweiTools.httpClient.HeaderTool
;
import
com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK
;
import
com.zhiwei.zhiweiTools.timeParse.TimeParse
;
import
com.zhiwei.zhiweiTools.tools.URLCodeUtil
;
import
com.zhiwei.zhiweiTools.tools.ZhiWeiTools
;
/**
 * Crawls 360 search (www.so.com) result pages for a keyword and turns each
 * result entry into a {@link NewsData} record.
 *
 * <p>Entries that point at www.toutiao.com are not built from the search
 * snippet; the target page is downloaded and the record is built from the
 * JSON embedded in that page instead.
 */
public class SoCrawlerParse extends HttpClientTemplateOK {

    private static final Logger logger = LoggerFactory.getLogger(SoCrawlerParse.class);

    /** Platform label stored on records built directly from a 360 result entry. */
    private static final String pt = "360网页";

    /** Last result page that {@link #getSoData(String, String, String, Proxy)} will request. */
    private static final int MAX_PAGE = 50;

    /** Number of download attempts made for one result page before giving up. */
    private static final int MAX_DOWNLOAD_ATTEMPTS = 3;

    /**
     * Collects the records of all result pages for a keyword, starting at page 1.
     *
     * <p>Paging stops when a page has no "next page" link, when a page yields
     * no body, or after page {@value #MAX_PAGE}, whichever comes first.
     *
     * @param word  search keyword
     * @param site  domain to restrict the search to, or {@code null} for no restriction
     * @param time  time-window code appended to the query as {@code adv_t}:
     *              d = 1 day, w = 1 week, m = 1 month, 3m = 3 months, y = 1 year
     * @param proxy proxy handed to the HTTP client
     * @return the records of every page visited, in page order; never {@code null}
     * @throws Exception if a page cannot be downloaded after all retries
     */
    public static List<NewsData> getSoData(String word, String site, String time, Proxy proxy) throws Exception {
        List<NewsData> list = new ArrayList<NewsData>();
        int page = 1;
        boolean more = true;
        // The page cap lives in the loop condition: the parser's "more" flag is
        // reassigned on every iteration and must not be able to override it.
        while (more && page <= MAX_PAGE) {
            String htmlBody = downloadHtml(word, site, time, proxy, page);
            if (htmlBody == null) {
                more = false;
            } else {
                Map<String, Object> dataMap = analysisData(htmlBody, proxy, word);
                // analysisData always stores a List<NewsData> under "data".
                @SuppressWarnings("unchecked")
                List<NewsData> dataList = (List<NewsData>) dataMap.get("data");
                logger.info("当前采集页数:{},当前采集关键词:{},当页数据量{}", page, word, dataList.size());
                list.addAll(dataList);
                more = (Boolean) dataMap.get("more");
            }
            page++;
            // Pause between pages only when no sleep time is configured on DataCrawler.
            // NOTE(review): presumably callers throttle themselves otherwise - confirm.
            if (DataCrawler.sleepTime == null) {
                ZhiWeiTools.sleep(5000);
            }
        }
        return list;
    }

    /**
     * Downloads and parses a single result page.
     *
     * @param word  search keyword
     * @param site  domain to restrict the search to, or {@code null} for no restriction
     * @param time  time-window code (d, w, m, 3m, y)
     * @param proxy proxy handed to the HTTP client
     * @param page  page number to request
     * @return a map with the page's records under {@code "data"} (a {@code List<NewsData>})
     *         and the next-page flag under {@code "more"} (a {@code Boolean}),
     *         or {@code null} when the download yields no body
     * @throws Exception if the page cannot be downloaded after all retries
     */
    public static Map<String, Object> getSoData(String word, String site, String time, Proxy proxy, int page)
            throws Exception {
        String htmlBody = downloadHtml(word, site, time, proxy, page);
        if (htmlBody == null) {
            return null;
        }
        return analysisData(htmlBody, proxy, word);
    }

    /**
     * Downloads the HTML of one result page, retrying on failure.
     *
     * @param word  search keyword
     * @param site  domain to restrict the search to, or {@code null}
     * @param time  time-window code: d = 1 day, w = 1 week, m = 1 month,
     *              3m = 3 months, y = 1 year
     * @param proxy proxy handed to the HTTP client
     * @param page  page number to request
     * @return the response body, or {@code null} when no URL can be built
     *         (null keyword) or the HTTP client returns {@code null}
     * @throws Exception the failure of the last attempt, with its original stack trace
     */
    private static String downloadHtml(String word, String site, String time, Proxy proxy, int page)
            throws Exception {
        String url = getUrl(word, site, time, page);
        if (url == null) {
            // getUrl returns null only for a null keyword; there is nothing to request.
            logger.warn("No 360 search URL could be built (keyword is null); skipping download");
            return null;
        }
        Map<String, String> headerMap = HeaderTool.getCommonHead();
        headerMap.put("Host", "www.so.com");
        headerMap.put("Referer", url);
        for (int attempt = 1; attempt <= MAX_DOWNLOAD_ATTEMPTS; attempt++) {
            try {
                return get(url, proxy, headerMap);
            } catch (Exception e) {
                // Pass the exception itself as the trailing argument so the logger prints
                // the real stack trace; fillInStackTrace() would have overwritten it.
                logger.error("获取360新闻数据时出现问题,问题为:{}", e.toString(), e);
                if (attempt == MAX_DOWNLOAD_ATTEMPTS) {
                    throw e;
                }
            }
        }
        return null;
    }

    /**
     * Parses one result page into records.
     *
     * <p>Entries whose class attribute is exactly {@code "res-list hasimg hasmediav"}
     * are skipped. Links through {@code www.so.com/link} are resolved to their
     * target first. A failure on one entry is logged and does not affect the others.
     *
     * @param htmlBody HTML of the result page
     * @param proxy    proxy used for the follow-up requests (redirect and Toutiao pages)
     * @param word     search keyword, stored on every record
     * @return a map with the records under {@code "data"} and, under {@code "more"},
     *         whether the pager text contains a next-page link
     * @throws Exception declared for callers; per-entry failures are caught and logged
     */
    private static Map<String, Object> analysisData(String htmlBody, Proxy proxy, String word) throws Exception {
        Map<String, Object> resultMap = new HashMap<String, Object>();
        List<NewsData> list = new ArrayList<NewsData>();

        Document document = Jsoup.parse(htmlBody);
        // select() never returns null; a missing pager simply yields empty text.
        boolean more = document.select("div#page").text().contains("下一页");

        Elements elementes = document.select("ul.result").select("li.res-list");
        logger.info("关键词:::{},抓取回来的数据量为:::{}", word, elementes.size());
        for (Element element : elementes) {
            try {
                if (element.attr("class").equals("res-list hasimg hasmediav")) {
                    continue;
                }
                String link = element.select("h3.res-title").select("a").attr("href");
                String title = element.select("h3.res-title").select("a").text();
                logger.debug("{}============{}", title, link);

                String realUrl = link;
                if (link.contains("www.so.com/link")) {
                    realUrl = getRealURL(link, proxy);
                }
                if (realUrl == null) {
                    logger.warn("Could not resolve the target of 360 redirect link {}; entry skipped", link);
                    continue;
                }

                NewsData newsData;
                if (realUrl.contains("www.toutiao.com")) {
                    newsData = getTouTiaoInfo(realUrl, proxy, word);
                } else {
                    String time = element.select("span.gray").text();
                    String source = element.select("p.res-linkinfo").select("a.mingpian").text();
                    // Normalise the publication time shown in the entry.
                    time = TimeParse.dateFormartString(TimeParse.stringFormartDate(time), "yyyy-MM-dd HH:mm:ss");
                    // Summary text of the entry.
                    String content = element.select("[class=\"res-rich so-rich-news clearfix\"]").text();
                    newsData = new NewsData(realUrl, title, source, time, content, pt, word);
                }

                if (newsData == null) {
                    // NOTE(review): a failed Toutiao lookup abandons the rest of this page.
                    // Kept as in the original; confirm whether "continue" was intended.
                    break;
                }
                list.add(newsData);
            } catch (Exception e) {
                logger.error("360新闻数据解析时出现问题,问题为:{}", e.toString(), e);
            }
        }
        resultMap.put("data", list);
        resultMap.put("more", more);
        return resultMap;
    }

    /**
     * Downloads a Toutiao page and builds a record from the JSON embedded in it.
     *
     * <p>Two page layouts are handled: pages containing {@code "question"}
     * (read from {@code var __wenda_data}) and pages containing
     * {@code var BASE_DATA} (read from its {@code articleInfo} and {@code mediaInfo}).
     *
     * @param url   address of the Toutiao page
     * @param proxy proxy handed to the HTTP client
     * @param word  search keyword, stored on the record
     * @return the record, or {@code null} when the page has no body, matches
     *         neither layout, is a 404 page, or any step fails
     */
    private static NewsData getTouTiaoInfo(String url, Proxy proxy, String word) {
        try {
            Map<String, String> headMap = HeaderTool.getCommonHead();
            headMap.put("accept-encoding", "deflate, br");
            headMap.put("User-Agent",
                    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36");
            String htmlBody = HttpClientTemplateOK.get(url, proxy, headMap);
            if (htmlBody == null) {
                return null;
            }
            if (htmlBody.contains("question")) {
                // Cut the object literal assigned to __wenda_data out of the script and
                // re-append the terminator that the second split consumed.
                String html = htmlBody.split("var __wenda_data =")[1].split("\"err_tips\":\"\"};")[0]
                        + "\"err_tips\":\"\"}";
                JSONObject question = JSONObject.parseObject(html)
                        .getJSONObject("data").getJSONArray("question").getJSONObject(0);
                JSONObject user = question.getJSONObject("user");
                String title = question.getString("title");
                String content = question.getJSONObject("content").getString("text");
                String time = question.getString("create_time");
                String source = user.getString("uname");
                String user_id = user.getString("user_id");
                String link = "https://www.toutiao.com/a" + question.getString("qid") + "/";
                return new NewsData(link, title, source, time, content, "头条问答", word, user_id);
            } else if (htmlBody.contains("var BASE_DATA = ")) {
                // Keep everything before "pgcInfo" and close the object with a dummy
                // member so the truncated literal can be parsed.
                String html = htmlBody.split("var BASE_DATA = ")[1].split("pgcInfo")[0] + "data:{}}";
                JSONObject dataJson = JSONObject.parseObject(html);
                JSONObject articleInfo = dataJson.getJSONObject("articleInfo");
                JSONObject mediaInfo = dataJson.getJSONObject("mediaInfo");
                String title = articleInfo.getString("title");
                String content = articleInfo.getString("content");
                String time = articleInfo.getJSONObject("subInfo").getString("time");
                String source = mediaInfo.getString("name");
                String user_id = mediaInfo.getString("uid");
                String link = "https://www.toutiao.com/a" + articleInfo.getString("groupId") + "/";
                return new NewsData(link, title, source, time, content, "今日头条", word, user_id);
            } else if (htmlBody.contains("404错误页")) {
                logger.info("{}:::数据有问题,该文章已被删除}", url);
            } else {
                logger.info("{}:::数据有问题,页面中无文章信息,页面", url);
            }
        } catch (Exception e) {
            // Log through the logger with the exception as trailing argument instead of
            // printStackTrace()/fillInStackTrace(), so the original stack trace is kept.
            logger.error("{}:::拉取页面信心出现错误,错误为::{}", url, e.toString(), e);
            return null;
        }
        return null;
    }

    /**
     * Builds the 360 search URL for one result page.
     *
     * @param word search keyword; URL-encoded as UTF-8
     * @param site domain to restrict the search to; appended as a {@code site:} filter
     *             when not {@code null}
     * @param time time-window code appended as {@code adv_t}: d = 1 day, w = 1 week,
     *             m = 1 month, 3m = 3 months, y = 1 year
     * @param page page number appended as {@code pn}
     * @return the URL, or {@code null} when {@code word} is {@code null}
     */
    private static String getUrl(String word, String site, String time, int page) {
        if (word == null) {
            return null;
        }
        String url = "https://www.so.com/s?q=" + URLCodeUtil.getURLEncode(word, "utf-8");
        if (site != null) {
            url = url + "+site%3A" + site;
        }
        url = url + "&src=srp&fr=tab_news&psid=5fd92fac25104eda591f0de2029a346b&adv_t=" + time + "&pn=" + page;
        logger.debug("360 search url: {}", url);
        return url;
    }

    /**
     * Resolves a 360 redirect link to the address it forwards to.
     *
     * <p>The redirect page is downloaded and the argument of its
     * {@code window.location.replace("...")} call is extracted. A target that
     * starts with {@code http://} is rewritten to {@code https://}; any other
     * target is returned as extracted.
     *
     * @param link  the redirect link
     * @param proxy proxy handed to the HTTP client
     * @return the target address, or {@code null} when {@code link} is {@code null},
     *         the page has no body, or the target cannot be extracted
     */
    private static String getRealURL(String link, Proxy proxy) {
        if (link == null) {
            return null;
        }
        try {
            String htmlBody = HttpClientTemplateOK.get(link, proxy, null);
            if (htmlBody == null) {
                return null;
            }
            String url = htmlBody.split("window.location.replace\\(\"")[1].split("\"\\)")[0];
            // Upgrade the scheme only. Replacing every "http" would turn an https
            // target into "httpss://" and also rewrite URLs embedded in the query.
            if (url.startsWith("http://")) {
                url = "https://" + url.substring("http://".length());
            }
            return url;
        } catch (Exception e) {
            logger.warn("Failed to resolve 360 redirect link {}: {}", link, e.toString(), e);
            return null;
        }
    }
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment