Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
searchhotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
searchhotcrawler
Commits
572289d9
Commit
572289d9
authored
Jul 11, 2019
by
win 10
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
将百度 搜狗 抖音 热搜采集遗漏的提交
parent
c21d66b0
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
472 additions
and
0 deletions
+472
-0
src/main/java/com/zhiwei/searchhotcrawler/bean/SougoHotSearch.java
+124
-0
src/main/java/com/zhiwei/searchhotcrawler/crawler/SougoHotSearchCrawler.java
+93
-0
src/main/java/com/zhiwei/searchhotcrawler/dao/DouyinHotSearchDAO.java
+94
-0
src/main/java/com/zhiwei/searchhotcrawler/dao/SougoHotSearchDAO.java
+74
-0
src/main/java/com/zhiwei/searchhotcrawler/timer/DouyinHotSearchRun.java
+43
-0
src/main/java/com/zhiwei/searchhotcrawler/timer/SougoHotSearchRun.java
+44
-0
No files found.
src/main/java/com/zhiwei/searchhotcrawler/bean/SougoHotSearch.java
0 → 100644
View file @
572289d9
package
com
.
zhiwei
.
searchhotcrawler
.
bean
;
import
java.io.Serializable
;
import
java.util.Date
;
import
com.zhiwei.tools.timeparse.TimeParse
;
/**
 * One entry of the Sogou WeChat hot-keyword list as captured by the crawler.
 *
 * <p>Plain serializable bean: every field is readable and writable through
 * its getter/setter pair.
 */
public class SougoHotSearch implements Serializable {

    private static final long serialVersionUID = 2076919584659821600L;

    /** Primary key: keyword + "_" + capture time in epoch milliseconds. */
    private String id;
    /** Main link. */
    private String url;
    /** Related link of the keyword. */
    private String everurl;
    /** Keyword. */
    private String kw;
    /** Capture day, formatted with the pattern "yyyy-MM-dd". */
    private String day;
    /** Capture time. */
    private Date time;
    /** Rank. */
    private Integer rank;

    /** Creates an empty instance; all fields are {@code null}. */
    public SougoHotSearch() {}

    /**
     * Creates an entry stamped with the current time.
     *
     * @param rank    rank of the keyword
     * @param kw      keyword text; also used as the prefix of the generated id
     * @param everurl related link of the keyword
     */
    public SougoHotSearch(Integer rank, String kw, String everurl) {
        // Take a single timestamp so that id, time and day always describe the
        // same instant (three separate "new Date()" calls could straddle a
        // millisecond or even a day boundary).
        Date now = new Date();
        this.id = kw + "_" + now.getTime();
        this.rank = rank;
        this.kw = kw;
        this.everurl = everurl;
        this.time = now;
        this.day = TimeParse.dateFormartString(now, "yyyy-MM-dd");
    }

    @Override
    public String toString() {
        return "new SougoHotSearch[" + "id = " + id + ", url = " + url + ", everurl = " + everurl
                + ", kw = " + kw + ", day = " + day + ", time = " + time + ", rank = " + rank + "]";
    }

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getEverurl() {
        return everurl;
    }

    public void setEverurl(String everurl) {
        this.everurl = everurl;
    }

    public String getKw() {
        return kw;
    }

    public void setKw(String kw) {
        this.kw = kw;
    }

    public String getDay() {
        return day;
    }

    public void setDay(String day) {
        this.day = day;
    }

    public Date getTime() {
        return time;
    }

    public void setTime(Date time) {
        this.time = time;
    }

    public Integer getRank() {
        return rank;
    }

    public void setRank(Integer rank) {
        this.rank = rank;
    }
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/SougoHotSearchCrawler.java
0 → 100644
View file @
572289d9
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Objects
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.SougoHotSearch
;
/**
 * Crawler for the hot keywords listed on the Sogou WeChat search home page.
 *
 * @author hero
 */
public class SougoHotSearchCrawler {

    private static final Logger logger = LoggerFactory.getLogger(SougoHotSearchCrawler.class);

    private static HttpBoot httpBoot = new HttpBoot();

    /**
     * Fetches the Sogou WeChat home page (PC version) and extracts the hot
     * keywords from the {@code ol#topwords} list.
     *
     * <p>The HTTP request is attempted up to three times; a retry happens only
     * when the request itself throws. Entries without a parsable rank are
     * skipped.
     *
     * @return the collected entries; never {@code null}, empty when nothing
     *         could be fetched or parsed
     */
    public static List<SougoHotSearch> sougoHotSearch() {
        String url = "https://weixin.sogou.com";
        List<SougoHotSearch> list = new ArrayList<SougoHotSearch>();
        for (int i = 0; i < 3; i++) {
            try {
                String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url)).body().string();
                if (htmlBody != null && htmlBody.contains("topwords")) {
                    try {
                        parseTopWords(htmlBody, list);
                    } catch (Exception e) {
                        // Log the exception itself: fillInStackTrace() would overwrite
                        // the original stack trace with this catch site.
                        logger.error("解析搜狗微信时出现解析错误,数据不是json结构", e);
                        // Return what was collected instead of null so callers can
                        // iterate the result without a null check.
                        return list;
                    }
                } else {
                    logger.info("解析搜狗微信时出现解析错误,页面结构有问题");
                }
                // A response was received (usable or not): do not retry.
                break;
            } catch (Exception e) {
                logger.error("解析搜狗微信时出现解析错误,页面结构有问题", e);
            }
        }
        // The "{}" placeholder is required, otherwise the size argument is dropped.
        logger.info("此轮采集的数据量为:{}", list.size());
        return list;
    }

    /**
     * Parses the {@code ol#topwords} list of the page and appends every entry
     * that carries a rank to {@code list}. A failure on one entry is logged and
     * does not stop the remaining entries from being parsed.
     */
    private static void parseTopWords(String htmlBody, List<SougoHotSearch> list) {
        Document document = Jsoup.parse(htmlBody);
        Elements elements = document.select("ol#topwords").select("li");
        for (Element element : elements) {
            try {
                // Rank: text of the <i> node, when present
                Elements rankNodes = element.select("li").select("i");
                String rankStr = rankNodes.isEmpty() ? null : rankNodes.text();
                Integer rank = null;
                if (StringUtils.isNotBlank(rankStr)) {
                    rank = Integer.valueOf(rankStr);
                }
                // Keyword and its related link come from the <a> node
                Elements links = element.select("li").select("a");
                String kw = links.text();
                logger.info("关键词:{}", kw);
                String everurl = links.attr("href");
                if (Objects.nonNull(rank)) {
                    list.add(new SougoHotSearch(rank, kw, everurl));
                }
            } catch (Exception e) {
                logger.error("解析搜狗微信时出现解析错误", e);
            }
        }
    }
}
src/main/java/com/zhiwei/searchhotcrawler/dao/DouyinHotSearchDAO.java
0 → 100644
View file @
572289d9
package
com
.
zhiwei
.
searchhotcrawler
.
dao
;
import
com.mongodb.BasicDBObject
;
import
com.mongodb.DBCursor
;
import
com.mongodb.DBObject
;
import
com.mongodb.WriteConcern
;
import
com.zhiwei.searchhotcrawler.bean.DouyinHotSearch
;
import
com.zhiwei.searchhotcrawler.config.Config
;
import
com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
/**
 * MongoDB access for Douyin hot-search records. The database and collection
 * names are taken from {@link Config}.
 */
public class DouyinHotSearchDAO extends MongoDBTemplate {

    public DouyinHotSearchDAO() {
        super();
        super.setDbName(Config.dbName);
        super.setCollName(Config.collDouyinName);
    }

    /**
     * Inserts one record, trying up to three times.
     *
     * @param douyin the document to insert
     */
    @SuppressWarnings("deprecation")
    public void addDouyinHotSearch(DBObject douyin) {
        for (int i = 0; i < 3; i++) {
            try {
                this.getReadColl().insert(douyin, WriteConcern.SAFE);
                ZhiWeiTools.sleep(200);
                break;
            } catch (Exception e) {
                // Report the failure instead of swallowing it silently, then retry.
                e.printStackTrace();
            }
        }
    }

    /**
     * Computes how much the hot value changed since the most recent stored
     * record with the same word.
     *
     * @param douyinHotSearch the freshly crawled entry
     * @return current hot value minus the stored one; {@code 0} when no earlier
     *         record exists or the lookup fails
     */
    public int getChangeCount(DouyinHotSearch douyinHotSearch) {
        int result = 0;
        DBObject query = new BasicDBObject();
        query.put("word", douyinHotSearch.getWord());
        DBObject sort = new BasicDBObject();
        sort.put("time", -1);
        DBCursor cur = null;
        try {
            cur = this.getReadColl().find(query).sort(sort).limit(1);
            // limit(1): at most one document, the newest by "time"
            if (cur.hasNext()) {
                DBObject doc = cur.next();
                result = douyinHotSearch.getHot_value() - Integer.valueOf(doc.get("hot_value").toString());
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Close in finally so the cursor is released even when reading fails.
            if (cur != null) {
                cur.close();
            }
        }
        return result;
    }

    // Disabled: returned the records whose "time" lies within the last hour.
//    public List<DBObject> getDouyinHotSearch(){
//        List<DBObject> list = null;
//        try {
//            Date date = new Date((new Date().getTime()-60*60*1000));
//            DBObject query = new BasicDBObject();
//            query.put("time", new BasicDBObject("$gte", date));
//
//            long count = this.getReadColl().count(query);
//            if(count>0){
//                list = new ArrayList<DBObject>();
//                DBCursor cur = this.getReadColl().find(query);
//                while(cur.hasNext()){
//                    DBObject doc = cur.next();
//                    list.add(doc);
//                }
//                cur.close();
//            }
//            return list;
//        } catch (Exception e) {
//            e.printStackTrace();
//            return list;
//        }
//    }
}
src/main/java/com/zhiwei/searchhotcrawler/dao/SougoHotSearchDAO.java
0 → 100644
View file @
572289d9
package
com
.
zhiwei
.
searchhotcrawler
.
dao
;
import
java.util.List
;
import
com.mongodb.DBObject
;
import
com.zhiwei.searchhotcrawler.config.Config
;
import
com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
/**
 * MongoDB access for Sogou WeChat hot-search records. The database and
 * collection names are taken from {@link Config}.
 */
public class SougoHotSearchDAO extends MongoDBTemplate {

    public SougoHotSearchDAO() {
        super();
        super.setDbName(Config.dbName);
//        Date date = new Date();
//        String time = TimeParse.dateFormartString(date, "yyyy");
//        if(Calendar.MONTH<6){
//            collWeiboName = Config.collWeiboName + time+"_01";
//        }else{
//            collWeiboName = Config.collWeiboName + time+"_06";
//        }
//        System.out.println("collWeiboName========="+collWeiboName);
        super.setCollName(Config.collSougoName);
    }

    /**
     * Inserts a batch of records, trying up to three times.
     *
     * @param list the documents to insert; a {@code null} or empty list is ignored
     */
    public void addSougoSearch(List<DBObject> list) {
        // Nothing to write: skip the insert (and its retries), which could only fail.
        if (list == null || list.isEmpty()) {
            return;
        }
        for (int i = 0; i < 3; i++) {
            try {
                this.getReadColl().insert(list);
                ZhiWeiTools.sleep(200);
                break;
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    // Disabled: change of "count" since the most recent record with the same keyword.
//    public int getChangeCount(SougoHotSearch sougoHotSearch){
//        int result = 0;
//        DBObject query = new BasicDBObject();
//        query.put("kw", sougoHotSearch.getKw());
//        DBObject sort = new BasicDBObject();
//        sort.put("time", -1);
//        try {
//            DBCursor cur = this.getReadColl().find(query).sort(sort).limit(1);
//            while(cur.hasNext()){
//                DBObject doc = cur.next();
//                result = sougoHotSearch.getCount() - Integer.valueOf(doc.get("count").toString());
//                break;
//            }
//            cur.close();
//        } catch (Exception e) {
//            e.printStackTrace();
//            return result;
//        }
//        return result;
//    }
}
src/main/java/com/zhiwei/searchhotcrawler/timer/DouyinHotSearchRun.java
0 → 100644
View file @
572289d9
package
com
.
zhiwei
.
searchhotcrawler
.
timer
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.List
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.mongodb.BasicDBObject
;
import
com.mongodb.DBObject
;
import
com.zhiwei.searchhotcrawler.bean.DouyinHotSearch
;
import
com.zhiwei.searchhotcrawler.crawler.DouyinHotSearchCrawler
;
import
com.zhiwei.searchhotcrawler.dao.DouyinHotSearchDAO
;
/**
 * Task that crawls the Douyin hot-search list once and stores every entry,
 * together with its hot-value change since the previous stored record.
 */
public class DouyinHotSearchRun extends Thread {

    private static final Logger logger = LoggerFactory.getLogger(DouyinHotSearchRun.class);

    private DouyinHotSearchDAO douyinHotSearchDAO = new DouyinHotSearchDAO();

    /** Runs one crawl-and-store round. */
    @Override
    public void run() {
        logger.info("抖音热搜榜采集开始........");
        List<DouyinHotSearch> list = DouyinHotSearchCrawler.getMobileDouyinHotList();
        logger.info("{}, 抖音热搜榜此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
        // The log line above already allows for a null result; the loop must too.
        if (list != null) {
            for (DouyinHotSearch douyinHotSearch : list) {
                int changeCount = douyinHotSearchDAO.getChangeCount(douyinHotSearch);
                DBObject douyin = new BasicDBObject();
                douyin.put("_id", douyinHotSearch.getId());
                douyin.put("word", douyinHotSearch.getWord());
                douyin.put("position", douyinHotSearch.getPosition());
                douyin.put("hot_value", douyinHotSearch.getHot_value());
//                douyin.put("url", douyinHotSearch.getUrl());
                douyin.put("time", douyinHotSearch.getTime());
                douyin.put("changeCount", changeCount);
                // Records are written one at a time
                douyinHotSearchDAO.addDouyinHotSearch(douyin);
            }
        }
        logger.info("抖音热搜榜采集结束........");
    }
}
src/main/java/com/zhiwei/searchhotcrawler/timer/SougoHotSearchRun.java
0 → 100644
View file @
572289d9
package
com
.
zhiwei
.
searchhotcrawler
.
timer
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.List
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.mongodb.BasicDBObject
;
import
com.mongodb.DBObject
;
import
com.zhiwei.searchhotcrawler.bean.SougoHotSearch
;
import
com.zhiwei.searchhotcrawler.crawler.SougoHotSearchCrawler
;
import
com.zhiwei.searchhotcrawler.dao.SougoHotSearchDAO
;
/**
 * Task that crawls the Sogou WeChat hot keywords once and stores them as a
 * single batch.
 */
public class SougoHotSearchRun extends Thread {

    private static final Logger logger = LoggerFactory.getLogger(SougoHotSearchRun.class);

    private SougoHotSearchDAO sougoHotSearchDAO = new SougoHotSearchDAO();

    /** Runs one crawl-and-store round. */
    @Override
    public void run() {
        logger.info("搜狗微信采集开始........");
        List<SougoHotSearch> list = SougoHotSearchCrawler.sougoHotSearch();
        logger.info("{}, 此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
        List<DBObject> data = new ArrayList<DBObject>();
        // The log line above already allows for a null result; the loop must too.
        if (list != null) {
            for (SougoHotSearch sougoHotSearch : list) {
//                int changeCount = baiduHotSearchDAO.getChangeCount(sougoHotSearch);
                DBObject doc = new BasicDBObject();
                doc.put("_id", sougoHotSearch.getId());
                doc.put("kw", sougoHotSearch.getKw());
                doc.put("everurl", sougoHotSearch.getEverurl());
//                doc.put("count", baiduHotSearch.getCount());
                doc.put("day", sougoHotSearch.getDay());
                doc.put("time", sougoHotSearch.getTime());
//                doc.put("changeCount", changeCount);
                doc.put("rank", sougoHotSearch.getRank());
                data.add(doc);
            }
        }
        // Only hit the database when there is something to store.
        if (!data.isEmpty()) {
            sougoHotSearchDAO.addSougoSearch(data);
        }
        logger.info("搜狗微信采集结束........");
    }
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment