Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
searchhotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
searchhotcrawler
Commits
b528f200
Commit
b528f200
authored
Jul 12, 2019
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
添加百度热搜、微信热搜、抖音热搜帮
parent
ca20b119
Hide whitespace changes
Inline
Side-by-side
Showing
18 changed files
with
217 additions
and
234 deletions
+217
-234
pom.xml
+2
-2
src/main/java/com/zhiwei/searchhotcrawler/bean/DouyinHotSearch.java
+14
-0
src/main/java/com/zhiwei/searchhotcrawler/crawler/BaiDuHotSearchCrawler.java
+80
-86
src/main/java/com/zhiwei/searchhotcrawler/crawler/DouyinHotSearchCrawler.java
+28
-46
src/main/java/com/zhiwei/searchhotcrawler/crawler/SougoHotSearchCrawler.java
+1
-1
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
+1
-1
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuHotSearchCrawler.java
+20
-25
src/main/java/com/zhiwei/searchhotcrawler/dao/BaiduHotSearchDAO.java
+8
-12
src/main/java/com/zhiwei/searchhotcrawler/dao/DouyinHotSearchDAO.java
+9
-1
src/main/java/com/zhiwei/searchhotcrawler/dao/SougoHotSearchDAO.java
+0
-9
src/main/java/com/zhiwei/searchhotcrawler/dao/WeiboHotSearchDAO.java
+8
-9
src/main/java/com/zhiwei/searchhotcrawler/run/HotSearchRun.java
+3
-3
src/main/java/com/zhiwei/searchhotcrawler/timer/BaiduHotSearchRun.java
+18
-15
src/main/java/com/zhiwei/searchhotcrawler/timer/DouyinHotSearchRun.java
+4
-3
src/main/java/com/zhiwei/searchhotcrawler/timer/SougoHotSearchRun.java
+3
-6
src/main/java/com/zhiwei/searchhotcrawler/timer/WeiboHotSearchRun.java
+1
-1
src/main/java/com/zhiwei/searchhotcrawler/util/WechatCodeUtil.java
+10
-7
src/main/resources/db.properties
+7
-7
No files found.
pom.xml
View file @
b528f200
...
...
@@ -38,12 +38,12 @@
<dependency>
<groupId>
com.zhiwei.crawler
</groupId>
<artifactId>
crawler-core
</artifactId>
<version>
0.
3.0
-RELEASE
</version>
<version>
0.
5.2
-RELEASE
</version>
</dependency>
<dependency>
<groupId>
com.zhiwei.tools
</groupId>
<artifactId>
zhiwei-tools
</artifactId>
<version>
0.1.
2
-SNAPSHOT
</version>
<version>
0.1.
4
-SNAPSHOT
</version>
</dependency>
</dependencies>
...
...
src/main/java/com/zhiwei/searchhotcrawler/bean/DouyinHotSearch.java
View file @
b528f200
...
...
@@ -3,6 +3,8 @@ package com.zhiwei.searchhotcrawler.bean;
import
java.io.Serializable
;
import
java.util.Date
;
import
com.zhiwei.tools.timeparse.TimeParse
;
public
class
DouyinHotSearch
implements
Serializable
{
...
...
@@ -22,6 +24,8 @@ public class DouyinHotSearch implements Serializable {
private
int
changeCount
;
//据上分钟变化量
private
String
day
;
public
DouyinHotSearch
(){}
...
...
@@ -31,6 +35,7 @@ public class DouyinHotSearch implements Serializable {
this
.
word
=
word
;
this
.
hot_value
=
hot_value
;
this
.
time
=
new
Date
();
this
.
day
=
TimeParse
.
dateFormartString
(
new
Date
(),
"yyyy-MM-dd"
);
}
@Override
...
...
@@ -104,4 +109,13 @@ public class DouyinHotSearch implements Serializable {
public
void
setChangeCount
(
int
changeCount
)
{
this
.
changeCount
=
changeCount
;
}
public
String
getDay
()
{
return
day
;
}
public
void
setDay
(
String
day
)
{
this
.
day
=
day
;
}
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/BaiDuHotSearchCrawler.java
View file @
b528f200
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.List
;
import
java.util.Objects
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
...
...
@@ -17,104 +17,97 @@ import com.zhiwei.crawler.utils.RequestUtils;
import
com.zhiwei.searchhotcrawler.bean.BaiDuHotSearch
;
/**
* @ClassName:BaiDuHotSearch
* @Description: TODO(百度风云榜热搜采集)
* @author hero
* @ClassName:BaiDuHotSearch
* @Description: TODO(百度风云榜热搜采集)
* @author hero
* @date 2019年7月10日 上午10:54:31
*/
public
class
BaiDuHotSearchCrawler
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
BaiDuHotSearchCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
/**
* @Title: BaiDuHotSearchTest
* @author hero
* @Description: TODO(PC端百度风云榜采集)
* @param
设定文件
* @Title: BaiDuHotSearchTest
* @author hero
* @Description: TODO(PC端百度风云榜采集)
* @param
设定文件
* @return void 返回类型
*/
public
static
List
<
BaiDuHotSearch
>
baiduHotSearch
()
{
public
static
List
<
BaiDuHotSearch
>
baiduHotSearch
()
{
String
url
=
"http://top.baidu.com/buzz?b=1&fr=topindex"
;
List
<
BaiDuHotSearch
>
list
=
new
ArrayList
<
BaiDuHotSearch
>();
for
(
int
i
=
0
;
i
<
3
;
i
++){
String
htmlBody
=
null
;
try
{
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
)).
body
().
string
();
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"mainBody"
)){
try
{
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
)).
body
().
string
();
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"mainBody"
))
{
return
ansysData
(
htmlBody
);
}
else
{
logger
.
info
(
"解析百度风云榜时出现解析错误,页面结构有问题"
);
}
}
catch
(
Exception
e
)
{
logger
.
error
(
"解析百度风云榜时出现解析错误,页面结构有问题"
,
e
);
}
return
Collections
.
emptyList
();
}
/**
* 解析数据
* @param htmlBody
* @return
*/
private
static
List
<
BaiDuHotSearch
>
ansysData
(
String
htmlBody
){
List
<
BaiDuHotSearch
>
list
=
new
ArrayList
<>();
try
{
Document
document
=
Jsoup
.
parse
(
htmlBody
);
Elements
elements
=
document
.
select
(
"table.list-table"
).
select
(
"tr"
);
if
(
Objects
.
nonNull
(
elements
)
&&
!
elements
.
isEmpty
())
{
elements
.
forEach
(
element
->
{
try
{
Document
document
=
Jsoup
.
parse
(
htmlBody
);
Elements
elements
=
document
.
select
(
"table.list-table"
).
select
(
"tr"
);
for
(
Element
element
:
elements
)
{
try
{
//获取排名rank
String
rankStr
=
null
;
//根据网页标签,给rankStr做判断
if
(!
element
.
select
(
"td.first"
).
select
(
"span.num-top"
).
isEmpty
())
{
rankStr
=
element
.
select
(
"td.first"
).
select
(
"span.num-top"
).
text
();
}
else
if
(!
element
.
select
(
"td.first"
).
select
(
"span.num-normal"
).
isEmpty
())
{
rankStr
=
element
.
select
(
"td.first"
).
select
(
"span.num-normal"
).
text
();
}
Integer
rank
=
null
;
//判断rankStr是否为空
if
(
StringUtils
.
isNoneBlank
(
rankStr
))
{
rank
=
Integer
.
valueOf
(
rankStr
);
}
//获取id(主键String)
// String id = element.select("td.keyword").select("a").text() + "_" +
// TimeParse.dateFormartString(new Date(), "yyyy-MM-dd HH:mm:ss");
//获取关键词(String)
String
kw
=
element
.
select
(
"td.keyword"
).
select
(
"a.list-title"
).
text
();
logger
.
info
(
"关键词:{}"
,
kw
);
//获取关键词相关链接everurl(String)
String
everurl
=
element
.
select
(
"td.keyword"
).
select
(
"a.list-title"
).
attr
(
"href"
);
//获取搜索指数count(int)
String
hot
=
null
;
//判断热度值所在的规则是否为null
if
(!
element
.
select
(
"td.last"
).
select
(
"span.icon-fall"
).
isEmpty
())
{
hot
=
element
.
select
(
"td.last"
).
select
(
"span.icon-fall"
).
text
();
}
else
if
(!
element
.
select
(
"td.last"
).
select
(
"span.icon-rise"
).
isEmpty
())
{
hot
=
element
.
select
(
"td.last"
).
select
(
"span.icon-rise"
).
text
();
}
int
count
=
0
;
//判断hot是否为空
if
(
StringUtils
.
isNotBlank
(
hot
))
{
count
=
Integer
.
valueOf
(
hot
);
}
BaiDuHotSearch
hotSearch
=
new
BaiDuHotSearch
(
rank
,
kw
,
everurl
,
count
);
if
(
Objects
.
nonNull
(
rank
))
{
list
.
add
(
hotSearch
);
}
}
catch
(
Exception
e
)
{
// SendMailWeibo.sendMail("百度风云榜采集出现问题", "859548429@qq.com");
logger
.
error
(
"解析百度风云榜时出现解析错误"
,
e
);
continue
;
// 获取排名rank
String
rankStr
=
null
;
// 根据网页标签,给rankStr做判断
if
(!
element
.
select
(
"td.first"
).
select
(
"span.num-top"
).
isEmpty
())
{
rankStr
=
element
.
select
(
"td.first"
).
select
(
"span.num-top"
).
text
();
}
else
if
(!
element
.
select
(
"td.first"
).
select
(
"span.num-normal"
).
isEmpty
())
{
rankStr
=
element
.
select
(
"td.first"
).
select
(
"span.num-normal"
).
text
();
}
Integer
rank
=
null
;
// 判断rankStr是否为空
if
(
StringUtils
.
isNoneBlank
(
rankStr
))
{
rank
=
Integer
.
valueOf
(
rankStr
);
}
// 获取关键词(String)
String
kw
=
element
.
select
(
"td.keyword"
).
select
(
"a.list-title"
).
text
();
logger
.
info
(
"关键词:{}"
,
kw
);
// 获取关键词相关链接everurl(String)
String
everurl
=
element
.
select
(
"td.keyword"
).
select
(
"a.list-title"
).
attr
(
"href"
);
// 获取搜索指数count(int)
String
hot
=
null
;
// 判断热度值所在的规则是否为null
if
(!
element
.
select
(
"td.last"
).
select
(
"span.icon-fall"
).
isEmpty
())
{
hot
=
element
.
select
(
"td.last"
).
select
(
"span.icon-fall"
).
text
();
}
else
if
(!
element
.
select
(
"td.last"
).
select
(
"span.icon-rise"
).
isEmpty
())
{
hot
=
element
.
select
(
"td.last"
).
select
(
"span.icon-rise"
).
text
();
}
}
}
catch
(
Exception
e
)
{
logger
.
error
(
"解析百度风云榜时出现解析错误,数据不是json结构"
,
e
.
fillInStackTrace
());
// SendMailWeibo.sendMail("百度风云榜采集出现问题", "859548429@qq.com");
return
null
;
int
count
=
0
;
// 判断hot是否为空
if
(
StringUtils
.
isNotBlank
(
hot
))
{
count
=
Integer
.
valueOf
(
hot
);
}
}
else
{
// SendMailWeibo.sendMail("百度风云榜采集出现问题", "859548429@qq.com");
logger
.
info
(
"解析百度风云榜时出现解析错误,页面结构有问题"
);
}
break
;
}
catch
(
Exception
e
)
{
logger
.
error
(
"解析百度风云榜时出现解析错误,页面结构有问题"
,
e
);
BaiDuHotSearch
hotSearch
=
new
BaiDuHotSearch
(
rank
,
kw
,
everurl
,
count
);
if
(
Objects
.
nonNull
(
rank
))
{
list
.
add
(
hotSearch
);
}
}
catch
(
Exception
e
)
{
logger
.
error
(
"解析百度风云榜时出现解析错误"
,
e
);
}
});
}
}
catch
(
Exception
e
)
{
logger
.
error
(
"解析百度风云榜时出现解析错误,数据不是json结构"
,
e
);
}
logger
.
info
(
"次轮采集的数据量为:"
,
list
.
size
());
return
list
;
}
}
\ No newline at end of file
src/main/java/com/zhiwei/searchhotcrawler/crawler/DouyinHotSearchCrawler.java
View file @
b528f200
...
...
@@ -4,6 +4,7 @@ import java.io.IOException;
import
java.util.ArrayList
;
import
java.util.List
;
import
org.apache.commons.lang3.StringUtils
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
...
...
@@ -24,7 +25,7 @@ import com.zhiwei.searchhotcrawler.bean.DouyinHotSearch;
public
class
DouyinHotSearchCrawler
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
ZhihuHotSearchCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
/**
* @Title: getMobileDouyinHotList
...
...
@@ -36,53 +37,34 @@ public class DouyinHotSearchCrawler {
public
static
List
<
DouyinHotSearch
>
getMobileDouyinHotList
(){
List
<
DouyinHotSearch
>
list
=
null
;
String
url
=
"https://api.amemv.com/aweme/v1/hot/search/list/"
;
// Map<String,String> headerMap = HeaderTool.getCommonHead();
// headerMap.put("Host", "api.zhihu.com");
// headerMap.put("Referer", url);
// headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36");
// headerMap.put("X-UDID", "AFAC3hv3vgyPTt9ZmNmqTm0yv_8NKY3S3z8=");
// headerMap.put("authorization", "oauth c3cef7c66a1843f8b3a9e6a1e3160e20");
for
(
int
j
=
0
;
j
<
3
;
j
++){
try
{
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
)).
body
().
string
();
if
(
htmlBody
!=
null
){
if
(
htmlBody
.
contains
(
"word_list"
)){
list
=
new
ArrayList
<
DouyinHotSearch
>();
JSONObject
data
=
JSONObject
.
parseObject
(
htmlBody
);
JSONArray
word_list
=
data
.
getJSONObject
(
"data"
).
getJSONArray
(
"word_list"
);
String
positionStr
=
null
;
String
word
=
null
;
String
hot_value_str
=
null
;
for
(
int
i
=
0
;
i
<
word_list
.
size
();
i
++)
{
JSONObject
wl
=
word_list
.
getJSONObject
(
i
);
//获取排名
positionStr
=
wl
.
getString
(
"position"
);
Integer
position
=
null
;
position
=
Integer
.
valueOf
(
positionStr
);
//获取关键词
word
=
wl
.
getString
(
"word"
);
//获取热度值
hot_value_str
=
wl
.
getString
(
"hot_value"
);
Integer
hot_value
=
null
;
hot_value
=
Integer
.
valueOf
(
hot_value_str
);
logger
.
info
(
"热度为:::{}"
,
hot_value
);
DouyinHotSearch
douyin
=
new
DouyinHotSearch
(
position
,
word
,
hot_value
);
list
.
add
(
douyin
);
}
break
;
}
else
{
System
.
out
.
println
(
"---------------"
);
}
try
{
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
)).
body
().
string
();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"word_list"
)){
list
=
new
ArrayList
<
DouyinHotSearch
>();
JSONObject
data
=
JSONObject
.
parseObject
(
htmlBody
);
JSONArray
wordList
=
data
.
getJSONObject
(
"data"
).
getJSONArray
(
"word_list"
);
String
positionStr
=
null
;
String
word
=
null
;
String
hotValueStr
=
null
;
for
(
int
i
=
0
;
i
<
wordList
.
size
();
i
++)
{
JSONObject
wl
=
wordList
.
getJSONObject
(
i
);
//获取排名
positionStr
=
wl
.
getString
(
"position"
);
Integer
position
=
null
;
position
=
Integer
.
valueOf
(
positionStr
);
//获取关键词
word
=
wl
.
getString
(
"word"
);
//获取热度值
hotValueStr
=
wl
.
getString
(
"hot_value"
);
Integer
hotValue
=
null
;
hotValue
=
Integer
.
valueOf
(
hotValueStr
);
// logger.info("热度为:::{}", hot_value);
DouyinHotSearch
douyin
=
new
DouyinHotSearch
(
position
,
word
,
hotValue
);
list
.
add
(
douyin
);
}
}
catch
(
IOException
e
)
{
logger
.
debug
(
"获取抖音热搜榜时出现问题:{}"
,
e
.
fillInStackTrace
());
continue
;
}
}
catch
(
IOException
e
)
{
logger
.
debug
(
"获取抖音热搜榜时出现问题:{}"
,
e
);
}
return
list
;
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/SougoHotSearchCrawler.java
View file @
b528f200
...
...
@@ -25,7 +25,7 @@ package com.zhiwei.searchhotcrawler.crawler;
public
class
SougoHotSearchCrawler
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
SougoHotSearchCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
/**
* @Title: SougoHotSearchTest
* @author hero
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
View file @
b528f200
...
...
@@ -30,7 +30,7 @@ import com.zhiwei.tools.tools.URLCodeUtil;
public
class
WeiboHotSearchCrawler
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
WeiboHotSearchCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
/**
* @Title: weiboHotSearchTest
* @author hero
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/ZhihuHotSearchCrawler.java
View file @
b528f200
...
...
@@ -26,7 +26,7 @@ import com.zhiwei.tools.tools.URLCodeUtil;
public
class
ZhihuHotSearchCrawler
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
ZhihuHotSearchCrawler
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
/**
* @Title: getZhihuHotList
* @author hero
...
...
@@ -45,32 +45,28 @@ public class ZhihuHotSearchCrawler {
headerMap
.
put
(
"accept"
,
"application/json, text/plain, */*"
);
headerMap
.
put
(
"authorization"
,
"oauth c3cef7c66a1843f8b3a9e6a1e3160e20"
);
headerMap
.
put
(
"Referer"
,
rerferer
);
for
(
int
j
=
0
;
j
<
3
;
j
++){
try
{
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
)).
body
().
string
();
if
(
htmlBody
!=
null
){
if
(
htmlBody
.
contains
(
"words"
)){
list
=
new
ArrayList
<
ZhihuHotSearch
>();
JSONObject
top_search
=
JSONObject
.
parseObject
(
htmlBody
);
JSONArray
words
=
top_search
.
getJSONObject
(
"top_search"
).
getJSONArray
(
"words"
);
String
link
=
null
;
String
display_query
=
null
;
String
query
=
null
;
for
(
int
i
=
0
;
i
<
words
.
size
();
i
++)
{
JSONObject
word
=
words
.
getJSONObject
(
i
);
query
=
word
.
getString
(
"query"
);
display_query
=
word
.
getString
(
"display_query"
);
link
=
"https://www.zhihu.com/search?q="
+
URLCodeUtil
.
getURLEncode
(
query
,
"utf-8"
)+
"&utm_content=search_hot&utm_medium=organic&utm_source=zhihu&type=content"
;
ZhihuHotSearch
zhihu
=
new
ZhihuHotSearch
(
link
,
query
,
display_query
,
new
Date
());
list
.
add
(
zhihu
);
}
break
;
try
{
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
)).
body
().
string
();
if
(
htmlBody
!=
null
){
if
(
htmlBody
.
contains
(
"words"
)){
list
=
new
ArrayList
<>();
JSONObject
topSearch
=
JSONObject
.
parseObject
(
htmlBody
);
JSONArray
words
=
topSearch
.
getJSONObject
(
"top_search"
).
getJSONArray
(
"words"
);
String
link
=
null
;
String
displayQuery
=
null
;
String
query
=
null
;
for
(
int
i
=
0
;
i
<
words
.
size
();
i
++)
{
JSONObject
word
=
words
.
getJSONObject
(
i
);
query
=
word
.
getString
(
"query"
);
displayQuery
=
word
.
getString
(
"display_query"
);
link
=
"https://www.zhihu.com/search?q="
+
URLCodeUtil
.
getURLEncode
(
query
,
"utf-8"
)+
"&utm_content=search_hot&utm_medium=organic&utm_source=zhihu&type=content"
;
ZhihuHotSearch
zhihu
=
new
ZhihuHotSearch
(
link
,
query
,
displayQuery
,
new
Date
());
list
.
add
(
zhihu
);
}
}
}
catch
(
IOException
e
)
{
logger
.
debug
(
"获取知乎热搜时出现问题:{}"
,
e
.
fillInStackTrace
());
continue
;
}
}
catch
(
IOException
e
)
{
logger
.
debug
(
"获取知乎热搜时出现问题:{}"
,
e
.
fillInStackTrace
());
}
return
list
;
}
...
...
@@ -120,7 +116,6 @@ public class ZhihuHotSearchCrawler {
}
}
catch
(
IOException
e
)
{
logger
.
debug
(
"获取知乎热搜时出现问题:{}"
,
e
.
fillInStackTrace
());
continue
;
}
}
return
list
;
...
...
src/main/java/com/zhiwei/searchhotcrawler/dao/BaiduHotSearchDAO.java
View file @
b528f200
package
com
.
zhiwei
.
searchhotcrawler
.
dao
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.Calendar
;
import
java.util.List
;
import
com.mongodb.BasicDBObject
;
import
com.mongodb.DBCursor
;
import
com.mongodb.DBObject
;
import
com.zhiwei.searchhotcrawler.bean.BaiDuHotSearch
;
import
com.zhiwei.searchhotcrawler.cache.CacheManager
;
import
com.zhiwei.searchhotcrawler.config.Config
;
import
com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
...
...
@@ -18,15 +16,13 @@ public class BaiduHotSearchDAO extends MongoDBTemplate{
public
BaiduHotSearchDAO
()
{
super
();
super
.
setDbName
(
Config
.
dbName
);
// Date date = new Date();
// String time = TimeParse.dateFormartString(date, "yyyy");
// if(Calendar.MONTH<6){
// collWeiboName = Config.collWeiboName + time+"_01";
// }else{
// collWeiboName = Config.collWeiboName + time+"_06";
// }
// System.out.println("collWeiboName========="+collWeiboName);
super
.
setCollName
(
Config
.
collBaiduName
);
String
collWeiboName
;
if
(
Calendar
.
MONTH
<
6
){
collWeiboName
=
Config
.
collBaiduName
+
Calendar
.
YEAR
+
"_01"
;
}
else
{
collWeiboName
=
Config
.
collBaiduName
+
Calendar
.
YEAR
+
"_06"
;
}
super
.
setCollName
(
collWeiboName
);
}
/**
...
...
src/main/java/com/zhiwei/searchhotcrawler/dao/DouyinHotSearchDAO.java
View file @
b528f200
package
com
.
zhiwei
.
searchhotcrawler
.
dao
;
import
java.util.Calendar
;
import
com.mongodb.BasicDBObject
;
import
com.mongodb.DBCursor
;
import
com.mongodb.DBObject
;
...
...
@@ -14,7 +16,13 @@ public class DouyinHotSearchDAO extends MongoDBTemplate{
public
DouyinHotSearchDAO
()
{
super
();
super
.
setDbName
(
Config
.
dbName
);
super
.
setCollName
(
Config
.
collDouyinName
);
String
collWeiboName
;
if
(
Calendar
.
MONTH
<
6
){
collWeiboName
=
Config
.
collDouyinName
+
Calendar
.
YEAR
+
"_01"
;
}
else
{
collWeiboName
=
Config
.
collDouyinName
+
Calendar
.
YEAR
+
"_06"
;
}
super
.
setCollName
(
collWeiboName
);
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/dao/SougoHotSearchDAO.java
View file @
b528f200
...
...
@@ -12,14 +12,6 @@ public class SougoHotSearchDAO extends MongoDBTemplate{
public
SougoHotSearchDAO
()
{
super
();
super
.
setDbName
(
Config
.
dbName
);
// Date date = new Date();
// String time = TimeParse.dateFormartString(date, "yyyy");
// if(Calendar.MONTH<6){
// collWeiboName = Config.collWeiboName + time+"_01";
// }else{
// collWeiboName = Config.collWeiboName + time+"_06";
// }
// System.out.println("collWeiboName========="+collWeiboName);
super
.
setCollName
(
Config
.
collSougoName
);
}
...
...
@@ -38,7 +30,6 @@ public class SougoHotSearchDAO extends MongoDBTemplate{
break
;
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
continue
;
}
}
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/dao/WeiboHotSearchDAO.java
View file @
b528f200
...
...
@@ -2,6 +2,7 @@ package com.zhiwei.searchhotcrawler.dao;
import
java.util.ArrayList
;
import
java.util.Calendar
;
import
java.util.Date
;
import
java.util.List
;
...
...
@@ -19,15 +20,13 @@ public class WeiboHotSearchDAO extends MongoDBTemplate{
public
WeiboHotSearchDAO
()
{
super
();
super
.
setDbName
(
Config
.
dbName
);
// Date date = new Date();
// String time = TimeParse.dateFormartString(date, "yyyy");
// if(Calendar.MONTH<6){
// collWeiboName = Config.collWeiboName + time+"_01";
// }else{
// collWeiboName = Config.collWeiboName + time+"_06";
// }
// System.out.println("collWeiboName========="+collWeiboName);
super
.
setCollName
(
Config
.
collWeiboName
);
String
collWeiboName
;
if
(
Calendar
.
MONTH
<
6
){
collWeiboName
=
Config
.
collWeiboName
+
Calendar
.
YEAR
+
"_01"
;
}
else
{
collWeiboName
=
Config
.
collWeiboName
+
Calendar
.
YEAR
+
"_06"
;
}
super
.
setCollName
(
collWeiboName
);
}
/**
...
...
src/main/java/com/zhiwei/searchhotcrawler/run/HotSearchRun.java
View file @
b528f200
...
...
@@ -21,14 +21,14 @@ public class HotSearchRun {
private
ScheduledExecutorService
scheduExec
;
public
HotSearchRun
()
{
this
.
scheduExec
=
Executors
.
newScheduledThreadPool
(
2
);
this
.
scheduExec
=
Executors
.
newScheduledThreadPool
(
5
);
}
public
void
showTimer
()
{
scheduExec
.
scheduleAtFixedRate
(
new
WeiboHotSearchRun
(),
0
,
1
,
TimeUnit
.
MINUTES
);
scheduExec
.
scheduleAtFixedRate
(
new
ZhihuHotSearchRun
(),
0
,
1
,
TimeUnit
.
MINUTES
);
scheduExec
.
scheduleAtFixedRate
(
new
ZhihuHotSearchRun
(),
0
,
5
,
TimeUnit
.
MINUTES
);
scheduExec
.
scheduleAtFixedRate
(
new
BaiduHotSearchRun
(),
0
,
5
,
TimeUnit
.
MINUTES
);
scheduExec
.
scheduleAtFixedRate
(
new
SougoHotSearchRun
(),
0
,
1
,
TimeUnit
.
MINUTES
);
scheduExec
.
scheduleAtFixedRate
(
new
SougoHotSearchRun
(),
0
,
5
,
TimeUnit
.
MINUTES
);
scheduExec
.
scheduleAtFixedRate
(
new
DouyinHotSearchRun
(),
0
,
10
,
TimeUnit
.
MINUTES
);
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/timer/BaiduHotSearchRun.java
View file @
b528f200
...
...
@@ -3,6 +3,7 @@ package com.zhiwei.searchhotcrawler.timer;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.Objects
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
...
...
@@ -22,22 +23,24 @@ public class BaiduHotSearchRun extends Thread{
public
void
run
()
{
logger
.
info
(
"百度风云榜采集开始........"
);
List
<
BaiDuHotSearch
>
list
=
BaiDuHotSearchCrawler
.
baiduHotSearch
();
logger
.
info
(
"{}, 此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
List
<
DBObject
>
data
=
new
ArrayList
<
DBObject
>();
for
(
BaiDuHotSearch
baiduHotSearch
:
list
){
int
changeCount
=
baiduHotSearchDAO
.
getChangeCount
(
baiduHotSearch
);
DBObject
doc
=
new
BasicDBObject
();
doc
.
put
(
"_id"
,
baiduHotSearch
.
getId
());
doc
.
put
(
"kw"
,
baiduHotSearch
.
getKw
());
doc
.
put
(
"everurl"
,
baiduHotSearch
.
getEverurl
());
doc
.
put
(
"count"
,
baiduHotSearch
.
getCount
());
doc
.
put
(
"day"
,
baiduHotSearch
.
getDay
());
doc
.
put
(
"time"
,
baiduHotSearch
.
getTime
());
doc
.
put
(
"changeCount"
,
changeCount
);
doc
.
put
(
"rank"
,
baiduHotSearch
.
getRank
());
data
.
add
(
doc
);
logger
.
info
(
"{}, 此轮百度风云榜采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
List
<
DBObject
>
saveDataList
=
new
ArrayList
<>();
if
(
Objects
.
nonNull
(
list
)
&&
!
list
.
isEmpty
())
{
list
.
forEach
(
baiduHotSearch
->{
int
changeCount
=
baiduHotSearchDAO
.
getChangeCount
(
baiduHotSearch
);
DBObject
doc
=
new
BasicDBObject
();
doc
.
put
(
"_id"
,
baiduHotSearch
.
getId
());
doc
.
put
(
"name"
,
baiduHotSearch
.
getKw
());
doc
.
put
(
"url"
,
baiduHotSearch
.
getEverurl
());
doc
.
put
(
"count"
,
baiduHotSearch
.
getCount
());
doc
.
put
(
"day"
,
baiduHotSearch
.
getDay
());
doc
.
put
(
"time"
,
baiduHotSearch
.
getTime
());
doc
.
put
(
"changeCount"
,
changeCount
);
doc
.
put
(
"rank"
,
baiduHotSearch
.
getRank
());
saveDataList
.
add
(
doc
);
});
}
baiduHotSearchDAO
.
addBaiduSearch
(
data
);
baiduHotSearchDAO
.
addBaiduSearch
(
saveDataList
);
logger
.
info
(
"百度风云榜采集结束........"
);
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/timer/DouyinHotSearchRun.java
View file @
b528f200
...
...
@@ -28,10 +28,11 @@ public class DouyinHotSearchRun extends Thread{
int
changeCount
=
douyinHotSearchDAO
.
getChangeCount
(
douyinHotSearch
);
DBObject
douyin
=
new
BasicDBObject
();
douyin
.
put
(
"_id"
,
douyinHotSearch
.
getId
());
douyin
.
put
(
"
word
"
,
douyinHotSearch
.
getWord
());
douyin
.
put
(
"
position
"
,
douyinHotSearch
.
getPosition
());
douyin
.
put
(
"
hot_value
"
,
douyinHotSearch
.
getHot_value
());
douyin
.
put
(
"
name
"
,
douyinHotSearch
.
getWord
());
douyin
.
put
(
"
rank
"
,
douyinHotSearch
.
getPosition
());
douyin
.
put
(
"
count
"
,
douyinHotSearch
.
getHot_value
());
// douyin.put("url", douyinHotSearch.getUrl());
douyin
.
put
(
"day"
,
douyinHotSearch
.
getDay
());
douyin
.
put
(
"time"
,
douyinHotSearch
.
getTime
());
douyin
.
put
(
"changeCount"
,
changeCount
);
data
.
add
(
douyin
);
...
...
src/main/java/com/zhiwei/searchhotcrawler/timer/SougoHotSearchRun.java
View file @
b528f200
...
...
@@ -23,17 +23,14 @@ public class SougoHotSearchRun extends Thread {
logger
.
info
(
"搜狗微信采集开始........"
);
List
<
SougoHotSearch
>
list
=
SougoHotSearchCrawler
.
sougoHotSearch
();
logger
.
info
(
"{}, 此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
List
<
DBObject
>
data
=
new
ArrayList
<
DBObject
>();
List
<
DBObject
>
data
=
new
ArrayList
<>();
for
(
SougoHotSearch
sougoHotSearch
:
list
){
// int changeCount = baiduHotSearchDAO.getChangeCount(sougoHotSearch);
DBObject
doc
=
new
BasicDBObject
();
doc
.
put
(
"_id"
,
sougoHotSearch
.
getId
());
doc
.
put
(
"kw"
,
sougoHotSearch
.
getKw
());
doc
.
put
(
"everurl"
,
sougoHotSearch
.
getEverurl
());
// doc.put("count", baiduHotSearch.getCount());
doc
.
put
(
"name"
,
sougoHotSearch
.
getKw
());
doc
.
put
(
"url"
,
sougoHotSearch
.
getEverurl
());
doc
.
put
(
"day"
,
sougoHotSearch
.
getDay
());
doc
.
put
(
"time"
,
sougoHotSearch
.
getTime
());
// doc.put("changeCount", changeCount);
doc
.
put
(
"rank"
,
sougoHotSearch
.
getRank
());
data
.
add
(
doc
);
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/timer/WeiboHotSearchRun.java
View file @
b528f200
...
...
@@ -23,7 +23,7 @@ public class WeiboHotSearchRun extends Thread{
logger
.
info
(
"微博话题采集开始........"
);
List
<
WeiboHotSearch
>
list
=
WeiboHotSearchCrawler
.
weiboHotSearch
();
logger
.
info
(
"{}, 微博此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
List
<
DBObject
>
data
=
new
ArrayList
<
DBObject
>();
List
<
DBObject
>
data
=
new
ArrayList
<>();
for
(
WeiboHotSearch
weiboHotSearch
:
list
){
int
changeCount
=
weiboHotSearchDAO
.
getChangeCount
(
weiboHotSearch
);
DBObject
doc
=
new
BasicDBObject
();
...
...
src/main/java/com/zhiwei/searchhotcrawler/util/WechatCodeUtil.java
View file @
b528f200
...
...
@@ -9,17 +9,18 @@ import org.apache.commons.lang3.StringUtils;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.dubbo.rpc.protocol.rest.support.ContentType
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.crawler.utils.RequestUtils.HttpMethod
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
okhttp3.MediaType
;
import
okhttp3.RequestBody
;
public
class
WechatCodeUtil
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
WechatCodeUtil
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
/**
* @Title: getToken
* @author hero
...
...
@@ -65,7 +66,8 @@ public class WechatCodeUtil {
int
msgid
=
0
;
String
url
=
WechatConstant
.
WECHAT_TEMPLET_SEND_URL
.
replace
(
"ACCESS_TOKEN"
,
getToken
());
try
{
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapPost
(
url
,
"application/json"
,
templateJson
.
toJSONString
())).
body
().
string
();
RequestBody
requestBody
=
RequestBody
.
create
(
MediaType
.
get
(
"application/json"
),
templateJson
.
toJSONString
());
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapPost
(
url
,
requestBody
)).
body
().
string
();
if
(
StringUtils
.
isNotBlank
(
htmlBody
))
{
JSONObject
jsonObject
=
JSONObject
.
parseObject
(
htmlBody
);
if
(
null
!=
jsonObject
)
{
...
...
@@ -103,8 +105,8 @@ public class WechatCodeUtil {
JSONObject
postData
=
new
JSONObject
();
postData
.
put
(
"tagid"
,
getGroupIp
(
groupName
));
postData
.
put
(
"next_openid"
,
""
);
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapPost
(
url
,
"application/json"
,
postData
.
toJSONString
()
)).
body
().
string
();
RequestBody
requestBody
=
RequestBody
.
create
(
MediaType
.
get
(
"application/json"
),
postData
.
toJSONString
());
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapPost
(
url
,
requestBody
)).
body
().
string
();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
))
{
JSONObject
jsonObject
=
JSONObject
.
parseObject
(
htmlBody
);
if
(
null
!=
jsonObject
)
{
...
...
@@ -136,7 +138,8 @@ public class WechatCodeUtil {
JSONObject
postData
=
new
JSONObject
();
postData
.
put
(
"tagid"
,
groupId
);
postData
.
put
(
"next_openid"
,
""
);
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapPost
(
url
,
"application/json"
,
postData
.
toJSONString
())).
body
().
string
();
RequestBody
requestBody
=
RequestBody
.
create
(
MediaType
.
get
(
"application/json"
),
postData
.
toJSONString
());
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapPost
(
url
,
requestBody
)).
body
().
string
();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"data"
))
{
JSONObject
jsonObject
=
JSONObject
.
parseObject
(
htmlBody
);
if
(
null
!=
jsonObject
)
{
...
...
src/main/resources/db.properties
View file @
b528f200
#mongoIp=202.107.192.94
mongoIp
=
192.168.0.
247
mongoIp
=
192.168.0.
81
mongoPort
=
27017
db.username
=
zzwno
db.paasword
=
zzwno1q2w3e4r
db.certifiedDB
=
oneDB
dbName
=
NetWork
collWeiboName
=
weibo_hotsearch
2018_10
collZhihuName
=
zhihu_hotsearch
2018_10
collWeiboName
=
weibo_hotsearch
collZhihuName
=
zhihu_hotsearch
collWechatUserName
=
wechat_user
collBaiduName
=
baidu_hotsearch2019_07
collSougoName
=
sougo_hotsearch2019_07
collDouyinName
=
douyin_hotsearch2019_07
\ No newline at end of file
collBaiduName
=
baidu_hotsearch
collSougoName
=
sougo_hotsearch
collDouyinName
=
douyin_hotsearch
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment