Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
searchhotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
searchhotcrawler
Commits
4b9aee87
Commit
4b9aee87
authored
Nov 10, 2020
by
马黎滨
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
微博热搜导语更新+腾讯较真热榜采集
parent
e2f0cb6f
Hide whitespace changes
Inline
Side-by-side
Showing
15 changed files
with
633 additions
and
17 deletions
+633
-17
README.md
+3
-0
pom.xml
+7
-0
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchCache.java
+37
-0
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchList.java
+19
-1
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchType.java
+2
-1
src/main/java/com/zhiwei/searchhotcrawler/config/RedisConfig.java
+36
-0
src/main/java/com/zhiwei/searchhotcrawler/crawler/TengXunCrawler.java
+35
-0
src/main/java/com/zhiwei/searchhotcrawler/crawler/ToutiaoHotSearchCrawler.java
+85
-0
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
+111
-3
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuHotSearchCrawler.java
+1
-1
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchCacheDAO.java
+79
-4
src/main/java/com/zhiwei/searchhotcrawler/dao/RedisDao.java
+72
-0
src/main/java/com/zhiwei/searchhotcrawler/timer/TouTiaoExecutor.java
+55
-0
src/main/java/com/zhiwei/searchhotcrawler/timer/quartz/GatherTimer.java
+73
-7
src/main/resources/redis.properties
+18
-0
No files found.
README.md
View file @
4b9aee87
...
...
@@ -33,6 +33,7 @@
17.
网易新闻跟帖热议
18.
搜狗微信热搜
19.
微博话题
20.
微博预热榜
#### Mongo内网
192.
168.0.101,192.168.0.106,192.168.0.108
...
...
@@ -42,6 +43,8 @@
30000
#### Mongo数据表名
hot_search_list
#### zookeeper
zookeeper://192.168.0.203:2182?backup=192.168.0.104:2182,192.168.0.105:2182
...
...
pom.xml
View file @
4b9aee87
...
...
@@ -112,6 +112,13 @@
<artifactId>
spring-tx
</artifactId>
<version>
${spring.version}
</version>
</dependency>
<!-- redis写 -->
<dependency>
<groupId>
redis.clients
</groupId>
<artifactId>
jedis
</artifactId>
<version>
2.8.1
</version>
</dependency>
</dependencies>
<build>
...
...
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchCache.java
View file @
4b9aee87
...
...
@@ -80,7 +80,20 @@ public class HotSearchCache {
*/
private
Boolean
recommend
;
/**
* 阅读量
*/
private
Integer
readCount
;
/**
* 讨论量
*/
private
Integer
discussCount
;
/**
* 话题真假(腾讯较真榜使用)
*/
private
String
topicResult
;
public
HotSearchCache
(
String
url
,
String
name
,
String
topicLead
,
Integer
highestCount
,
Integer
lastCount
,
Boolean
hot
,
Date
startTime
,
Date
endTime
,
Integer
highestRank
,
Integer
lastRank
,
String
type
,
Integer
duration
){
...
...
@@ -107,4 +120,28 @@ public class HotSearchCache {
/**
 * Sets the recommend flag of this cache entry.
 *
 * @param recommend the new flag value; stored as given
 */
public void setRecommend(Boolean recommend) { this.recommend = recommend; }
/**
 * Returns the read count of this cache entry.
 *
 * @return the stored read count
 */
public Integer getReadCount() { return this.readCount; }
/**
 * Sets the read count of this cache entry.
 *
 * @param readCount the new read count; stored as given
 */
public void setReadCount(Integer readCount) { this.readCount = readCount; }
/**
 * Returns the discussion count of this cache entry.
 *
 * @return the stored discussion count
 */
public Integer getDiscussCount() { return this.discussCount; }
/**
 * Sets the discussion count of this cache entry.
 *
 * @param discussCount the new discussion count; stored as given
 */
public void setDiscussCount(Integer discussCount) { this.discussCount = discussCount; }
/**
 * Returns the topic lead text of this cache entry.
 *
 * @return the stored topic lead
 */
public String getTopicLead() { return this.topicLead; }
/**
 * Sets the topic lead text of this cache entry.
 *
 * @param topicLead the new topic lead; stored as given
 */
public void setTopicLead(String topicLead) { this.topicLead = topicLead; }
}
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchList.java
View file @
4b9aee87
...
...
@@ -75,10 +75,15 @@ public class HotSearchList implements Serializable{
private
String
icon
;
/**
* 话题讨论量
* 话题讨论量
或阅读量
*/
private
Integer
commentCount
;
/**
* 话题真假结果(腾讯较真榜使用)
*/
private
String
topicResult
;
public
HotSearchList
(){}
public
HotSearchList
(
String
url
,
String
name
,
Integer
count
,
Boolean
hot
,
Integer
rank
,
String
type
,
String
icon
,
Date
date
){
...
...
@@ -122,4 +127,17 @@ public class HotSearchList implements Serializable{
this
.
topicLead
=
topicLead
;
}
/**
 * Creates a hot-search entry that also carries a topic verdict (used by the
 * Tencent fact-check list).
 *
 * <p>The id is built as {@code name_<current time in millis>_type}; the day field
 * is the given date formatted as {@code yyyy-MM-dd}.
 *
 * @param url         link of the entry
 * @param name        title of the entry
 * @param count       heat value
 * @param hot         hot flag
 * @param rank        position in the list
 * @param type        hot-search type name
 * @param date        collection time, also used to derive the day string
 * @param icon        icon marker
 * @param topicResult topic verdict
 */
public HotSearchList(String url, String name, Integer count, Boolean hot, Integer rank,
        String type, Date date, String icon, String topicResult) {
    // Identity: name, creation instant and type joined by underscores.
    long createdAtMillis = System.currentTimeMillis();
    this.id = name + "_" + createdAtMillis + "_" + type;

    // Descriptive fields.
    this.name = name;
    this.url = url;
    this.type = type;
    this.icon = icon;
    this.topicResult = topicResult;

    // Ranking fields.
    this.rank = rank;
    this.count = count;
    this.hot = hot;

    // Time fields.
    this.time = date;
    this.day = TimeParse.dateFormartString(date, "yyyy-MM-dd");
}
}
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchType.java
View file @
4b9aee87
...
...
@@ -17,5 +17,6 @@ public enum HotSearchType {
凤凰新闻热搜
,
网易热榜
,
网易跟帖热议
,
微博预热榜
微博预热榜
,
腾讯较真榜
}
src/main/java/com/zhiwei/searchhotcrawler/config/RedisConfig.java
0 → 100644
View file @
4b9aee87
package
com
.
zhiwei
.
searchhotcrawler
.
config
;
import
java.io.IOException
;
import
java.util.Properties
;
/**
 * Static holder for the Redis connection settings read from the
 * {@code redis.properties} classpath resource.
 *
 * <p>All values are loaded once, when this class is initialised. Loading fails
 * fast with an {@link IllegalStateException} (surfacing as an
 * {@link ExceptionInInitializerError}) when the resource is missing, cannot be
 * read, or a required numeric setting is absent or malformed, so that a broken
 * configuration is reported where it occurs instead of as a later
 * {@link NullPointerException}.
 */
public class RedisConfig {

    /** Name of the classpath resource the settings are read from. */
    private static final String RESOURCE_NAME = "redis.properties";

    /** Value of {@code redis.host}; {@code null} when the key is not configured. */
    public static String redisHost;
    /** Value of {@code redis.port}. */
    public static Integer redisPort;
    /** Value of {@code redis.password}; {@code null} when the key is not configured. */
    public static String redisPassword;
    /** Value of {@code redis.database}. */
    public static Integer redisDataBase;
    /** Value of {@code redis.maxIdle}. */
    public static Integer redisMaxIdle;
    /** Value of {@code redis.minIdle}. */
    public static Integer redisMinIdle;
    /** Value of {@code redis.maxTotal}. */
    public static Integer redisMaxTotal;
    /** Value of {@code redis.timeout}. */
    public static Integer redisTimeout;

    /** Redis key under which the ids of collected Weibo hot searches are kept. */
    public static final String WEIBO_HOTSEARCHIDS = "weibo_hotsearchIds";

    static {
        Properties redisProperties = new Properties();
        // try-with-resources closes the stream, which was previously leaked.
        try (java.io.InputStream in =
                RedisConfig.class.getClassLoader().getResourceAsStream(RESOURCE_NAME)) {
            if (in == null) {
                throw new IllegalStateException(
                        "Classpath resource " + RESOURCE_NAME + " not found");
            }
            redisProperties.load(in);
        } catch (IOException e) {
            throw new IllegalStateException("Failed to read " + RESOURCE_NAME, e);
        }
        redisHost = redisProperties.getProperty("redis.host");
        redisPort = requiredInt(redisProperties, "redis.port");
        redisPassword = redisProperties.getProperty("redis.password");
        redisDataBase = requiredInt(redisProperties, "redis.database");
        redisMaxIdle = requiredInt(redisProperties, "redis.maxIdle");
        redisMinIdle = requiredInt(redisProperties, "redis.minIdle");
        redisMaxTotal = requiredInt(redisProperties, "redis.maxTotal");
        redisTimeout = requiredInt(redisProperties, "redis.timeout");
    }

    /**
     * Reads a mandatory integer setting.
     *
     * @param props loaded properties
     * @param key   property name
     * @return the parsed value (surrounding whitespace is ignored)
     * @throws IllegalStateException if the key is missing, blank or not an integer
     */
    private static Integer requiredInt(Properties props, String key) {
        String raw = props.getProperty(key);
        if (raw == null || raw.trim().isEmpty()) {
            throw new IllegalStateException(
                    "Missing required property '" + key + "' in " + RESOURCE_NAME);
        }
        try {
            return Integer.valueOf(raw.trim());
        } catch (NumberFormatException e) {
            throw new IllegalStateException(
                    "Property '" + key + "' in " + RESOURCE_NAME + " is not an integer: " + raw, e);
        }
    }
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/TengXunCrawler.java
View file @
4b9aee87
...
...
@@ -79,4 +79,39 @@ public class TengXunCrawler {
log
.
info
(
"腾讯新闻采集结束"
);
return
list
;
}
/**
 * Collects the Tencent fact-check (rumour-refuting) hot list.
 *
 * <p>The endpoint is requested up to three times. The first response whose JSON
 * carries a {@code data} array is converted into {@link HotSearchList} entries and
 * returned; responses that cannot be read or that have no {@code data} array
 * count as a failed attempt.
 *
 * @param date collection time stored on every returned entry
 * @return the parsed entries, or an empty list when no attempt produced a usable
 *         payload; never {@code null}
 */
public static List<HotSearchList> getTengXunVerificationList(Date date) {
    List<HotSearchList> list = new ArrayList<>();
    String url = "https://vp.fact.qq.com/hotlistData?num=20";
    Request request = RequestUtils.wrapGet(url);
    // Retry at most 3 times while the response is empty or unusable.
    for (int t = 0; t < 3; t++) {
        // Scoped per attempt so a failed request never re-evaluates an older body.
        String htmlBody = null;
        try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
            htmlBody = response.body().string();
        } catch (IOException e) {
            log.error("腾讯较真辟谣榜请求失败", e);
        }
        if (htmlBody == null || !htmlBody.contains("data")) {
            continue;
        }
        JSONArray jsonArray = JSONObject.parseObject(htmlBody).getJSONArray("data");
        if (jsonArray == null) {
            // "data" appeared in the text but not as a top-level array: try again.
            continue;
        }
        for (int i = 0; i < jsonArray.size(); i++) {
            JSONObject jsonObject = jsonArray.getJSONObject(i);
            if (jsonObject == null) {
                continue;
            }
            Integer rank = jsonObject.getIntValue("index");
            String name = jsonObject.getString("title");
            Integer count = jsonObject.getIntValue("score");
            String tengxunUrl = jsonObject.getString("link");
            String topicResult = jsonObject.getString("result");
            list.add(new HotSearchList(tengxunUrl, name, count, false, rank,
                    HotSearchType.腾讯较真榜.name(), date, null, topicResult));
        }
        return list;
    }
    return list;
}
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/ToutiaoHotSearchCrawler.java
View file @
4b9aee87
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
com.alibaba.fastjson.JSON
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
...
...
@@ -7,13 +8,17 @@ import com.zhiwei.crawler.proxy.ProxyHolder;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
java.io.IOException
;
import
java.util.*
;
...
...
@@ -96,6 +101,86 @@ public class ToutiaoHotSearchCrawler {
}
// /**
// * 采集今日头条数据
// * @param date
// * @return
// */
// public static List<HotSearchList> toutiaoHotSearchByPhone(Date date){
// List<HotSearchList> hotSearchLists = new ArrayList<>();
// //采集头条内容
// String url = "https://api5-normal-c-lq.snssdk.com/api/feed/hotboard_online/v1/?category=hotboard_online&count=50";
// Map<String,Object> headerMap = new HashMap<>();
// headerMap.put("upgrade-insecure-requests","1");
// headerMap.put("user-agent","Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Mobile Safari/537.36");
// String htmlBody = null;
// Request request = RequestUtils.wrapGet(url,headerMap);
// try(Response response = httpBoot.syncCall(request,ProxyHolder.NAT_HEAVY_PROXY)) {
// htmlBody = response.body().string();
// log.info(htmlBody);
// } catch (IOException e1) {
// log.error("解析今日头条实时热搜时出现连接失败",e1);
// }
// if (StringUtils.isNotBlank(htmlBody) && htmlBody.contains("data")) {
// try {
// JSONArray words = JSONObject.parseObject(htmlBody).getJSONArray("data");
// for (int i = 0; i < words.size(); i++) {
// JSONObject jsonObject = JSON.parseObject(words.get(i).toString());
// int rank = i+1;
// String name =jsonObject.getJSONObject("content").getJSONObject("raw_data").getString("title");
// String link = jsonObject.getJSONObject("content").getJSONObject("raw_data").getString("schema");
// if(link.contains("keyword=")) {
// link = "https://so.toutiao.com/search/?"+link.substring(link.indexOf("keyword="), link.indexOf("&search_json="));
// }else{
// link = null;
// }
// HotSearchList hotSearch = new HotSearchList(link, name, null, true, rank, HotSearchType.今日头条热搜.name(), null,date);
// hotSearchLists.add(hotSearch);
// }
// } catch (Exception e) {
// log.error("解析今日头条实时热搜时出现解析错误,数据不是json结构", e);
// }
// } else {
// log.info("解析今日头条实时热搜时出现解析错误,页面结构有问题");
// }
// return hotSearchLists;
// }
/**
 * Fetches the read count shown on a Toutiao hot-search detail page and stores it
 * on the entry via {@code setCommentCount}.
 *
 * <p>Nothing is requested when the entry has no URL. Otherwise the page is
 * requested up to six times, pausing one second before each request, until the
 * read-count element is found.
 *
 * @param hotSearchList entry to enrich
 * @return the same entry, with the comment count set when a read count was found
 */
public static HotSearchList toutiaoReadCount(HotSearchList hotSearchList) {
    String detailUrl = hotSearchList.getUrl();
    if (detailUrl == null) {
        return hotSearchList;
    }
    Request request = RequestUtils.wrapGet(detailUrl);
    // Kept outside the loop: a failed request re-evaluates the last body received.
    String pageHtml = null;
    int attempt = 0;
    while (attempt <= 5) {
        attempt++;
        ZhiWeiTools.sleep(1000L);
        try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
            pageHtml = response.body().string();
        } catch (IOException e1) {
            log.error("解析今日头条热搜详情页面出现连接失败", e1);
        }
        if (!StringUtils.isNotBlank(pageHtml)) {
            continue;
        }
        Elements matches = Jsoup.parse(pageHtml)
                .select(".result-content .cs-view .cs-topone-tail .cs-view .margin-bottom-m .margin-left-m");
        if (matches == null || matches.isEmpty()) {
            continue;
        }
        String readText = matches.first().text().replaceAll("阅读", "");
        Integer count = TipsUtils.getHotCount(readText);
        log.info("{},阅读量:{}", hotSearchList.getName(), count);
        hotSearchList.setCommentCount(count);
        return hotSearchList;
    }
    return hotSearchList;
}
/**
* 热搜类型
* @param wordsType
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
View file @
4b9aee87
...
...
@@ -5,12 +5,16 @@ import java.util.*;
import
java.util.stream.Collectors
;
import
com.alibaba.fastjson.JSON
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchCache
;
import
com.zhiwei.searchhotcrawler.config.RedisConfig
;
import
com.zhiwei.searchhotcrawler.dao.RedisDao
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.bson.Document
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
org.slf4j.Logger
;
...
...
@@ -25,6 +29,7 @@ import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.mail.SendMailWeibo
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
org.springframework.beans.factory.annotation.Autowired
;
/**
* @ClassName: WeiboHotSearch
...
...
@@ -34,7 +39,7 @@ import com.zhiwei.tools.tools.URLCodeUtil;
*/
@Log4j2
public
class
WeiboHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
/**
* @Title: weiboHotSearchTest
...
...
@@ -107,6 +112,7 @@ public class WeiboHotSearchCrawler {
* @return void 返回类型
*/
public
static
List
<
HotSearchList
>
weiboHotSearchByPhone
(
Date
date
){
RedisDao
redisDao
=
new
RedisDao
();
String
url
=
"https://m.weibo.cn/api/container/getIndex?containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot&title=%E5%BE%AE%E5%8D%9A%E7%83%AD%E6%90%9C&extparam=pos%3D0_0%26mi_cid%3D100103%26cate%3D10103%26filter_type%3Drealtimehot%26c_type%3D30&luicode=10000011&lfid=231583"
;
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>();
headerMap
.
put
(
"User-Agent"
,
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
);
...
...
@@ -147,10 +153,12 @@ public class WeiboHotSearchCrawler {
if
(
StringUtils
.
isNotBlank
(
icon
))
{
icon
=
icon
.
split
(
"_"
)[
1
].
split
(
".png"
)[
0
];
}
String
id
=
"http://s.weibo.com/weibo/"
+
URLCodeUtil
.
getURLEncode
(
name
,
"utf-8"
)
+
"&Refer=top"
;
// String id = "http://s.weibo.com/weibo/" + URLCodeUtil.getURLEncode(name, "utf-8") + "&Refer=top";
String
id
=
cardInfo
.
getString
(
"scheme"
);
HotSearchList
hotSearch
=
new
HotSearchList
(
id
,
name
,
hotCount
,
hot
,
rank
,
HotSearchType
.
微博热搜
.
name
(),
icon
,
date
);
result
.
add
(
hotSearch
);
rank
++;
redisDao
.
addDataToSet
(
RedisConfig
.
WEIBO_HOTSEARCHIDS
,
name
+
"_微博热搜"
);
}
}
else
{
log
.
info
(
"card 数据结构为:{}"
,
card
);
...
...
@@ -214,4 +222,104 @@ public class WeiboHotSearchCrawler {
return
result
;
}
/**
 * Refreshes the topic lead, read count and discussion count of a Weibo hot-search
 * document from the mobile container API.
 *
 * <p>The request URL is built from the first query parameter of the document's
 * {@code url} field. The API is called up to six times. The first response whose
 * JSON has a {@code data} object ends the loop: any {@code desc} and
 * {@code midtext} values found under {@code cardlistInfo} are written to the
 * document as {@code topicLead}, {@code readCount} and {@code discussCount}.
 *
 * @param document hot-search document; must contain {@code name} and {@code url}
 * @return the same document (possibly unchanged when the response carried no
 *         card information), or {@code null} when the stored URL has no query
 *         string or no attempt produced a usable response; callers must check
 *         for {@code null}
 */
public static Document weiboUpdate(Document document) {
    log.info("更新微博热搜{}导语阅读量和讨论量", document.getString("name"));
    String sourceUrl = document.getString("url");
    int queryStart = sourceUrl == null ? -1 : sourceUrl.indexOf('?');
    if (queryStart < 0) {
        log.warn("微博热搜{}的url无法解析:{}", document.getString("name"), sourceUrl);
        return null;
    }
    // Keep only the first query parameter; tolerate URLs with no further '&'.
    int queryEnd = sourceUrl.indexOf('&', queryStart + 1);
    String firstParam = queryEnd < 0
            ? sourceUrl.substring(queryStart + 1)
            : sourceUrl.substring(queryStart + 1, queryEnd);
    String url = "https://m.weibo.cn/api/container/getIndex?" + firstParam;

    Request request = RequestUtils.wrapGet(url);
    for (int attempt = 0; attempt <= 5; attempt++) {
        String htmlBody = null;
        try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
            htmlBody = response.body().string();
        } catch (IOException e) {
            log.error("解析微博热搜详情页面时出现连接失败", e);
        }
        if (htmlBody == null || !htmlBody.contains("data")) {
            continue;
        }
        JSONObject data = JSONObject.parseObject(htmlBody).getJSONObject("data");
        if (data == null) {
            // "data" appeared in the text but not as a JSON object: try again.
            continue;
        }
        JSONObject cardlistInfo = data.getJSONObject("cardlistInfo");
        if (cardlistInfo != null) {
            applyWeiboTopicLead(document, cardlistInfo);
            applyWeiboCounts(document, cardlistInfo);
        }
        return document;
    }
    return null;
}

/** Copies a non-empty {@code desc} value into the document as {@code topicLead}. */
private static void applyWeiboTopicLead(Document document, JSONObject cardlistInfo) {
    String topicLead = cardlistInfo.getString("desc");
    if (topicLead != null && !topicLead.isEmpty()) {
        document.put("topicLead", topicLead);
    }
}

/** Parses read and discussion counts out of the first head card's {@code midtext}. */
private static void applyWeiboCounts(Document document, JSONObject cardlistInfo) {
    com.alibaba.fastjson.JSONArray headCards = cardlistInfo.getJSONArray("cardlist_head_cards");
    if (headCards == null || headCards.isEmpty()) {
        return;
    }
    JSONObject firstCard = headCards.getJSONObject(0);
    JSONObject headData = firstCard == null ? null : firstCard.getJSONObject("head_data");
    String midText = headData == null ? null : headData.getString("midtext");
    if (midText == null) {
        return;
    }
    // The read count sits between "阅读" and "讨论"; the discussion count between "讨论" and "详情".
    String read = midText.replaceAll("阅读", "").replaceAll("讨论.*", "").trim();
    String discussCount = midText.replaceAll(".*讨论", "").replaceAll("详情.*", "").trim();
    document.put("readCount", TipsUtils.getHotCount(read));
    document.put("discussCount", TipsUtils.getHotCount(discussCount));
}
// /**
// * 微博更新历史数据
// * @param hotSearch
// * @return
// */
// public static Document updateWeiBoTopic(Document hotSearch){
// String hotUrl = hotSearch.getString("url");
// String htmlBody = null;
// if(hotUrl != null && !"http://s.weibo.comjavascript:void(0);".equals(hotUrl)) {
// Request request = RequestUtils.wrapGet(hotUrl);
// try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
// htmlBody = response.body().string();
// org.jsoup.nodes.Document document = Jsoup.parse(htmlBody);
// Element element = document.select(".m-wrap .m-con-r .card-wrap .card-interest .card-content .item-topic .info h2 a").first();
// if (element != null) {
// String topicLeadUrl = element.attr("href");
// return analyHtml(hotSearch,topicLeadUrl);
// } else{
// return analyHtml(hotSearch,hotUrl);
// }
// } catch (Exception e) {
// log.error("解析微博话题榜时出现解析错误,页面结构有问题", e);
// }
// }
// return null;
// }
//
// public static Document analyHtml(Document hotSearch,String topicUrl){
// String htmlBody = null;
// Request request = RequestUtils.wrapGet(topicUrl);
// try(Response response = httpBoot.syncCall(request,ProxyHolder.NAT_HEAVY_PROXY)) {
// htmlBody = response.body().string();
// org.jsoup.nodes.Document document = Jsoup.parse(htmlBody);
// if(document != null) {
// Element body = document.body();
// Element read = body.select(".card-topic-a .info .total span").first();
// if(read != null) {
// String readCount = read.text().replaceAll("阅读", "");
// hotSearch.put("readCount",TipsUtils.getHotCount(readCount));
// }
// Element dis = body.select(".card-topic-a .info .total span").last();
// if(dis != null) {
// String disCount = dis.text().replaceAll("讨论", "");
// hotSearch.put("discussCount",TipsUtils.getHotCount(disCount));
// }
// Element topicLead = body.select(".m-wrap .card-wrap .card-topic-lead p").first();
// if(topicLead != null) {
// String topicLeadString = topicLead.html().replaceAll("<strong>导语:</strong>", "");
// topicLeadString = topicLeadString.length() > 150 ? topicLeadString.substring(0,150) : topicLeadString;
// hotSearch.put("topicLead",topicLeadString);
// }
// return hotSearch;
// }
// } catch (Exception e) {
// log.error("解析微博话题导语时出现解析错误,页面结构有问题", e);
// }
// return null;
// }
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuHotSearchCrawler.java
View file @
4b9aee87
...
...
@@ -121,7 +121,7 @@ public class ZhihuHotSearchCrawler {
hotCount
=
(
int
)
(
Double
.
parseDouble
(
hotText
)
*
10000
);
}
else
if
(
hotText
.
contains
(
"亿"
))
{
hotText
=
hotText
.
replaceAll
(
"亿.*"
,
""
).
trim
();
hotCount
=
(
int
)
(
Double
.
parseDouble
(
hotText
)
*
10000000
);
hotCount
=
(
int
)
(
Double
.
parseDouble
(
hotText
)
*
10000000
0
);
}
else
{
hotCount
=
Integer
.
getInteger
(
hotText
);
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchCacheDAO.java
View file @
4b9aee87
package
com
.
zhiwei
.
searchhotcrawler
.
dao
;
import
com.mongodb.BasicDBObject
;
import
com.mongodb.client.FindIterable
;
import
com.mongodb.client.MongoCollection
;
import
com.mongodb.client.MongoCursor
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.config.DBConfig
;
import
com.zhiwei.searchhotcrawler.crawler.WeiboHotSearchCrawler
;
import
com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate
;
import
lombok.extern.log4j.Log4j2
;
import
org.bson.Document
;
...
...
@@ -44,6 +48,12 @@ public class HotSearchCacheDAO {
document
.
put
(
"topic_lead"
,
hotSearch
.
getTopicLead
());
document
.
put
(
"comment_count"
,
hotSearch
.
getCommentCount
());
}
if
(
"今日头条热搜"
.
equals
(
hotSearch
.
getType
())){
document
.
put
(
"comment_count"
,
hotSearch
.
getCommentCount
());
}
if
(
"腾讯较真榜"
.
equals
(
hotSearch
.
getType
())){
document
.
put
(
"topic_result"
,
hotSearch
.
getTopicResult
());
}
addAndUpdateData
(
document
);
dataes
.
add
(
document
);
});
...
...
@@ -69,8 +79,10 @@ public class HotSearchCacheDAO {
String
topicLead
=
document
.
getString
(
"topic_lead"
)!=
null
?
document
.
getString
(
"topic_lead"
):
null
;
boolean
hot
=
document
.
getBoolean
(
"hot"
)!=
null
?
document
.
getBoolean
(
"hot"
):
true
;
String
url
=
document
.
getString
(
"url"
)!=
null
?
document
.
getString
(
"url"
):
null
;
String
topicResult
=
document
.
getString
(
"topic_result"
)!=
null
?
document
.
getString
(
"topic_result"
):
null
;
String
id
=
name
+
"_"
+
type
;
boolean
recommend
=
false
;
Integer
readCount
=
document
.
getInteger
(
"comment_count"
);
if
(
"微博热搜"
.
equals
(
type
)){
String
icon
=
document
.
getString
(
"icon"
);
if
(
"recom"
.
equals
(
icon
)
||
"jian"
.
equals
(
icon
)){
...
...
@@ -87,7 +99,7 @@ public class HotSearchCacheDAO {
Integer
highestCount
=
nowDoc
.
getInteger
(
"highestCount"
);
Integer
preRank
=
nowDoc
.
getInteger
(
"lastRank"
);
Integer
preCount
=
nowDoc
.
getInteger
(
"lastCount"
);
String
lastUrl
=
nowDoc
.
getString
(
"url"
);
//判断最大热度值
if
(
Objects
.
nonNull
(
lastCount
)
&&
Objects
.
nonNull
(
highestCount
)
&&
lastCount
>
highestCount
)
{
highestCount
=
lastCount
;
...
...
@@ -104,6 +116,9 @@ public class HotSearchCacheDAO {
int
durationNow
=
getDuration
(
type
,
duration
);
// endTime = getEndTime(type, new Date());
//更新相应信息
if
(
url
!=
null
&&
!
url
.
equals
(
lastUrl
)){
nowDoc
.
put
(
"url"
,
url
);
}
nowDoc
.
put
(
"endTime"
,
endTime
);
nowDoc
.
put
(
"lastRank"
,
lastRank
);
nowDoc
.
put
(
"lastCount"
,
lastCount
);
...
...
@@ -113,6 +128,12 @@ public class HotSearchCacheDAO {
nowDoc
.
put
(
"preCount"
,
preCount
);
nowDoc
.
put
(
"duration"
,
durationNow
);
nowDoc
.
put
(
"recommend"
,
recommend
);
if
(
readCount
!=
null
){
nowDoc
.
put
(
"readCount"
,
readCount
);
}
if
(
topicResult
!=
null
){
nowDoc
.
put
(
"topicResult"
,
topicResult
);
}
collection
.
replaceOne
(
query
,
nowDoc
);
}
else
{
nowDoc
=
new
Document
();
...
...
@@ -133,12 +154,25 @@ public class HotSearchCacheDAO {
nowDoc
.
put
(
"preRank"
,
null
);
nowDoc
.
put
(
"preCount"
,
null
);
nowDoc
.
put
(
"recommend"
,
recommend
);
if
(
readCount
!=
null
){
nowDoc
.
put
(
"readCount"
,
readCount
);
}
if
(
topicResult
!=
null
){
nowDoc
.
put
(
"topicResult"
,
topicResult
);
}
if
(
"微博热搜"
.
equals
(
type
)){
nowDoc
=
WeiboHotSearchCrawler
.
weiboUpdate
(
nowDoc
);
if
(
nowDoc
.
containsKey
(
"topicLead"
)){
nowDoc
.
put
(
"topicLead"
,
nowDoc
.
getString
(
"topicLead"
));
}
if
(
nowDoc
.
containsKey
(
"readCount"
)
&&
nowDoc
.
containsKey
(
"discussCount"
))
{
nowDoc
.
put
(
"readCount"
,
nowDoc
.
getInteger
(
"readCount"
));
nowDoc
.
put
(
"discussCount"
,
nowDoc
.
getInteger
(
"discussCount"
));
}
}
collection
.
insertOne
(
nowDoc
);
}
}
}
catch
(
Exception
e
){
log
.
info
(
"数据存储时出错:{}"
,
e
);
}
...
...
@@ -158,6 +192,19 @@ public class HotSearchCacheDAO {
}
}
// public List<Document> getHotSearchList(){
// List<Document> documentList = new ArrayList<>();
// Document query = new Document("type","微博热搜");
// query.put("endTime",new BasicDBObject("$gte", new Date(1604851200000L)).append("$lt",new Date(1604973600000L)));
// query.put("readCount",new BasicDBObject("$exists",false));
// FindIterable<Document> findIterable = collection.find(query);
// MongoCursor<Document> mongoCursor = findIterable.iterator();
// while(mongoCursor.hasNext()){
// documentList.add(mongoCursor.next());
// }
// return documentList;
// }
/**
* 计算热搜时长
...
...
@@ -232,6 +279,34 @@ public class HotSearchCacheDAO {
return
new
Date
(
timeLong
);
}
/**
 * Looks up a hot-search document by its primary key.
 *
 * @param id value of the {@code _id} field
 * @return the first matching document, or {@code null} when none matches
 */
public Document getHotSearchById(String id) {
    return (Document) collection.find(new Document("_id", id)).first();
}
/**
 * Writes the refreshed Weibo details of a hot search back to the collection.
 *
 * <p>Only the refreshed fields are written, with a single {@code $set} update,
 * so values changed by other jobs between the read and the write (rank, count,
 * end time, ...) are not overwritten. {@code topicLead} is written when it is
 * non-null; {@code readCount} and {@code discussCount} are written only when
 * both keys are present. Nothing happens when there is nothing to write or when
 * no document has the given id.
 *
 * @param document document carrying the refreshed values; ignored when {@code null}
 * @param id       value of the {@code _id} field of the stored document
 */
public void updateWeibo(Document document, String id) {
    if (document == null) {
        return;
    }
    Document changes = new Document();
    String topicLead = document.getString("topicLead");
    if (topicLead != null) {
        changes.put("topicLead", topicLead);
    }
    if (document.containsKey("readCount") && document.containsKey("discussCount")) {
        changes.put("readCount", document.getInteger("readCount"));
        changes.put("discussCount", document.getInteger("discussCount"));
    }
    if (changes.isEmpty()) {
        return;
    }
    collection.updateOne(new Document("_id", id), new Document("$set", changes));
}
}
src/main/java/com/zhiwei/searchhotcrawler/dao/RedisDao.java
0 → 100644
View file @
4b9aee87
package
com
.
zhiwei
.
searchhotcrawler
.
dao
;
import
com.zhiwei.searchhotcrawler.config.RedisConfig
;
import
lombok.extern.log4j.Log4j2
;
import
redis.clients.jedis.Jedis
;
import
redis.clients.jedis.JedisPool
;
import
redis.clients.jedis.JedisPoolConfig
;
import
java.util.Set
;
/**
 * Basic Redis operations backed by one connection pool shared by all instances.
 *
 * <p>The pool is created on first use from the settings in {@link RedisConfig}.
 * Every operation borrows a connection and returns it to the pool when done, so
 * creating many {@code RedisDao} instances does not open additional pools or
 * leak connections. The configured database index is selected by the pool; a
 * blank {@code redis.password} is treated as "no authentication".
 */
@Log4j2
public class RedisDao {

    /** Initialization-on-demand holder: the pool is built on first access only. */
    private static final class PoolHolder {
        private static final JedisPool POOL = createPool();

        private static JedisPool createPool() {
            JedisPoolConfig poolConfig = new JedisPoolConfig();
            poolConfig.setMaxIdle(RedisConfig.redisMaxIdle);
            poolConfig.setMaxTotal(RedisConfig.redisMaxTotal);
            poolConfig.setMinIdle(RedisConfig.redisMinIdle);
            String password = RedisConfig.redisPassword;
            if (password != null && password.trim().isEmpty()) {
                // A non-null password makes the client send AUTH; skip it for blank values.
                password = null;
            }
            return new JedisPool(poolConfig, RedisConfig.redisHost, RedisConfig.redisPort,
                    RedisConfig.redisTimeout, password, RedisConfig.redisDataBase);
        }
    }

    /** Creates a DAO; no connection is opened until an operation is invoked. */
    public RedisDao() {
    }

    /**
     * Stores a string value.
     *
     * @param key   Redis key
     * @param value value to store
     */
    public void setRedisData(String key, String value) {
        try (Jedis jedis = PoolHolder.POOL.getResource()) {
            jedis.set(key, value);
        }
    }

    /**
     * Reads a string value.
     *
     * @param key Redis key
     * @return the value returned by Redis for the key
     */
    public String getRedisData(String key) {
        try (Jedis jedis = PoolHolder.POOL.getResource()) {
            return jedis.get(key);
        }
    }

    /**
     * Adds a member to a Redis set.
     *
     * @param key   Redis key of the set
     * @param value member to add
     */
    public void addDataToSet(String key, String value) {
        try (Jedis jedis = PoolHolder.POOL.getResource()) {
            jedis.sadd(key, value);
        }
    }

    /**
     * Reads all members of a Redis set.
     *
     * @param key Redis key of the set
     * @return the members returned by Redis for the key
     */
    public Set<String> getRedisSetData(String key) {
        try (Jedis jedis = PoolHolder.POOL.getResource()) {
            return jedis.smembers(key);
        }
    }

    /**
     * Deletes a key.
     *
     * @param key Redis key to delete
     */
    public void removeRedis(String key) {
        try (Jedis jedis = PoolHolder.POOL.getResource()) {
            jedis.del(key);
        }
    }
}
src/main/java/com/zhiwei/searchhotcrawler/timer/TouTiaoExecutor.java
0 → 100644
View file @
4b9aee87
package
com
.
zhiwei
.
searchhotcrawler
.
timer
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.crawler.ToutiaoHotSearchCrawler
;
import
lombok.extern.log4j.Log4j2
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.concurrent.ExecutorService
;
import
java.util.concurrent.Executors
;
import
java.util.concurrent.TimeUnit
;
@Log4j2
public
class
TouTiaoExecutor
extends
Thread
{
private
HotSearchList
hotSearchList
;
private
static
List
<
HotSearchList
>
resultList
;
public
TouTiaoExecutor
(
HotSearchList
hotSearchList
){
this
.
hotSearchList
=
hotSearchList
;
}
@Override
public
void
run
()
{
try
{
hotSearchList
=
ToutiaoHotSearchCrawler
.
toutiaoReadCount
(
hotSearchList
);
resultList
.
add
(
hotSearchList
);
}
catch
(
Exception
e
){
e
.
printStackTrace
();
}
}
/**
* 今日头条阅读量统计
* @param list
* @return
*/
public
static
List
<
HotSearchList
>
countTouTiaoReadCount
(
List
<
HotSearchList
>
list
){
resultList
=
new
ArrayList
<>();
ExecutorService
service
=
Executors
.
newFixedThreadPool
(
list
.
size
());
for
(
int
i
=
0
;
i
<
list
.
size
();
i
++){
service
.
execute
(
new
TouTiaoExecutor
(
list
.
get
(
i
)));
}
service
.
shutdown
();
try
{
if
(!
service
.
awaitTermination
(
1
,
TimeUnit
.
MINUTES
)){
log
.
info
(
"查询今日头条阅读量超时"
);
}
}
catch
(
InterruptedException
e
)
{
log
.
info
(
e
.
fillInStackTrace
());
}
return
resultList
;
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/quartz/GatherTimer.java
View file @
4b9aee87
...
...
@@ -3,11 +3,15 @@ package com.zhiwei.searchhotcrawler.timer.quartz;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.bean.WeiboSuperTopic
;
import
com.zhiwei.searchhotcrawler.config.RedisConfig
;
import
com.zhiwei.searchhotcrawler.crawler.*
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO
;
import
com.zhiwei.searchhotcrawler.dao.RedisDao
;
import
com.zhiwei.searchhotcrawler.dao.WeiboSuperTopicDAO
;
import
com.zhiwei.searchhotcrawler.timer.TouTiaoExecutor
;
import
com.zhiwei.searchhotcrawler.util.DateUtils
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
org.bson.Document
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
...
...
@@ -17,9 +21,7 @@ import org.springframework.scheduling.annotation.EnableScheduling;
import
org.springframework.scheduling.annotation.Scheduled
;
import
org.springframework.stereotype.Component
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.*
;
@Component
@EnableScheduling
...
...
@@ -49,6 +51,32 @@ public class GatherTimer {
}
/**
 * Scheduled job that refreshes the lead, read count and discussion count of the
 * Weibo hot searches whose ids were recorded in Redis since the previous run.
 *
 * <p>The id set is read and then deleted, and each id is processed on its own:
 * a failed lookup, a failed crawl (a {@code null} result) or an exception for
 * one id is logged and skipped, so the remaining ids of the already-deleted set
 * are still processed. The job pauses three seconds after each crawled entry.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "45 0/10 * * * ? ")
public void updateWeiBo() {
    logger.info("微博热搜导语更新...");
    RedisDao redisDao = new RedisDao();
    HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
    Set<String> hotSearchIdSet = redisDao.getRedisSetData(RedisConfig.WEIBO_HOTSEARCHIDS);
    redisDao.removeRedis(RedisConfig.WEIBO_HOTSEARCHIDS);
    if (hotSearchIdSet != null) {
        for (String id : hotSearchIdSet) {
            boolean crawled = false;
            try {
                Document document = hotSearchCacheDAO.getHotSearchById(id);
                if (document != null) {
                    crawled = true;
                    Document updated = WeiboHotSearchCrawler.weiboUpdate(document);
                    // weiboUpdate returns null when every attempt failed.
                    if (updated != null
                            && (updated.containsKey("topicLead")
                                || updated.containsKey("readCount")
                                || updated.containsKey("discussCount"))) {
                        hotSearchCacheDAO.updateWeibo(updated, id);
                    }
                }
            } catch (Exception e) {
                // Job boundary: one bad entry must not drop the rest of the batch.
                logger.error("微博热搜{}导语更新失败", id, e);
            }
            if (crawled) {
                // Throttle requests to the Weibo API.
                ZhiWeiTools.sleep(3000L);
            }
        }
    }
    logger.info("微博热搜导语更新结束...");
}
/**
* 今日头条热搜的采集
*/
@Async
(
value
=
"myScheduler"
)
...
...
@@ -57,8 +85,10 @@ public class GatherTimer {
logger
.
info
(
"今日头条热搜开始采集..."
);
Date
date
=
DateUtils
.
getMillSecondTime
(
new
Date
());
List
<
HotSearchList
>
toutiaoList
=
ToutiaoHotSearchCrawler
.
toutiaoHotSearchByPhone
(
date
);
logger
.
info
(
"{}, 今日头条此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
toutiaoList
!=
null
?
toutiaoList
.
size
()
:
0
));
TipsUtils
.
addHotList
(
HotSearchType
.
今日头条热搜
.
name
(),
toutiaoList
);
List
<
HotSearchList
>
toutiaoResult
=
new
ArrayList
<>();
toutiaoResult
=
TouTiaoExecutor
.
countTouTiaoReadCount
(
toutiaoList
);
logger
.
info
(
"{}, 今日头条此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
toutiaoResult
!=
null
?
toutiaoResult
.
size
()
:
0
));
TipsUtils
.
addHotList
(
HotSearchType
.
今日头条热搜
.
name
(),
toutiaoResult
);
logger
.
info
(
"今日头条热搜采集结束..."
);
}
...
...
@@ -228,8 +258,8 @@ public class GatherTimer {
/**
* 凤凰新闻热搜的采集
*/
@Async
(
value
=
"myScheduler"
)
@Scheduled
(
cron
=
"10 * * * * ? "
)
//
@Async(value = "myScheduler")
//
@Scheduled(cron = "10 * * * * ? ")
public
void
crawlerFengHuangHotSearch
(){
Date
date
=
DateUtils
.
getMillSecondTime
(
new
Date
());
List
<
HotSearchList
>
list
=
FengHuangSearchCrawler
.
getFengHuangHotSearch
(
date
);
...
...
@@ -237,6 +267,20 @@ public class GatherTimer {
}
/**
 * Scheduled job that collects the Tencent fact-check (rumour-refuting) hot list
 * and hands the entries to {@code TipsUtils.addHotList}.
 */
@Async(value = "myScheduler")
@Scheduled(cron = "10 * * * * ? ")
public void crawlerTengXunVerificationHotSearch() {
    logger.info("{},腾讯较真辟谣榜开始采集", new Date());
    Date collectedAt = DateUtils.getMillSecondTime(new Date());
    List<HotSearchList> entries = TengXunCrawler.getTengXunVerificationList(collectedAt);
    int entryCount = entries.size();
    logger.info("腾讯较真辟谣榜本轮采集数量:{}", entryCount);
    String typeName = HotSearchType.腾讯较真榜.name();
    TipsUtils.addHotList(typeName, entries);
    logger.info("{},腾讯较真辟谣榜采集结束", new Date());
}
/**
* 搜狐话题的采集
*/
@Async
(
value
=
"myScheduler"
)
...
...
@@ -261,6 +305,9 @@ public class GatherTimer {
logger
.
info
(
"知乎热搜话题采集结束..."
);
}
/**
* 微博预热榜的采集
*/
@Async
(
value
=
"myScheduler"
)
@Scheduled
(
cron
=
"20 * * * * ? "
)
public
void
crawlerWeiBoPreheat
(){
...
...
@@ -330,6 +377,25 @@ public class GatherTimer {
logger
.
info
(
"微博话题采集结束........"
);
}
// @Async(value = "myScheduler")
// @Scheduled(cron = "0 05 09 * * ? ")
// public void updateWeiboHistory(){
// HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
// List<Document> documentList = hotSearchCacheDAO.getHotSearchList();
// int i=0;
// for (Document document : documentList){
// document = WeiboHotSearchCrawler.updateWeiBoTopic(document);
// if(document != null){
// hotSearchCacheDAO.updateWeibo(document,document.getString("_id"));
// ZhiWeiTools.sleep(500L);
// }
// i++;
// logger.info("更新进度:{}",i*100/documentList.size());
// }
// logger.info("更新结束");
// }
/**
* 知乎子类采集函数
* @param type
...
...
src/main/resources/redis.properties
0 → 100644
View file @
4b9aee87
#redis.host=127.0.0.1
#redis.port=6379
#redis.password=
#redis
redis.host
=
192.168.0.39
redis.port
=
6379
redis.database
=
1
#maxIdle
redis.maxIdle
=
10
#minIdle
redis.minIdle
=
5
#maxTotal
redis.maxTotal
=
10
#timeout
redis.timeout
=
5000
redis.testOnBorrow
=
false
redis.testOnReturn
=
false
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment