Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
searchhotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
searchhotcrawler
Commits
f5589b9f
Commit
f5589b9f
authored
Apr 08, 2020
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
添加今日头条热搜榜采集
parent
b2d4bb96
Show whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
422 additions
and
87 deletions
+422
-87
pom.xml
+1
-1
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchType.java
+2
-1
src/main/java/com/zhiwei/searchhotcrawler/config/DBConfig.java
+2
-0
src/main/java/com/zhiwei/searchhotcrawler/crawler/ToutiaoHotSearchCrawler.java
+104
-0
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchCacheDAO.java
+37
-3
src/main/java/com/zhiwei/searchhotcrawler/run/HotSearchRun.java
+1
-0
src/main/java/com/zhiwei/searchhotcrawler/test/TopicTest.java
+215
-0
src/main/java/com/zhiwei/searchhotcrawler/timer/BaiduHotSearchRun.java
+2
-16
src/main/java/com/zhiwei/searchhotcrawler/timer/DouyinHotSearchRun.java
+2
-16
src/main/java/com/zhiwei/searchhotcrawler/timer/SougoHotSearchRun.java
+2
-14
src/main/java/com/zhiwei/searchhotcrawler/timer/ToutiaoHotSearchRun.java
+47
-0
src/main/java/com/zhiwei/searchhotcrawler/timer/WeiboHotSearchRun.java
+5
-20
src/main/java/com/zhiwei/searchhotcrawler/timer/ZhihuHotSearchRun.java
+2
-16
No files found.
pom.xml
View file @
f5589b9f
...
...
@@ -27,7 +27,7 @@
<dependency>
<groupId>
org.mongodb
</groupId>
<artifactId>
mongo-java-driver
</artifactId>
<version>
3.
6.3
</version>
<version>
3.
12.2
</version>
</dependency>
<dependency>
...
...
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchType.java
View file @
f5589b9f
...
...
@@ -6,5 +6,6 @@ public enum HotSearchType {
知乎热搜
,
抖音热搜
,
搜狗微信热搜
,
微博话题
微博话题
,
今日头条热搜
}
src/main/java/com/zhiwei/searchhotcrawler/config/DBConfig.java
View file @
f5589b9f
...
...
@@ -13,6 +13,7 @@ public class DBConfig {
conf
.
load
(
is
);
is
.
close
();
mongoUri
=
conf
.
getProperty
(
"mongoUri"
);
mongoLocalUri
=
conf
.
getProperty
(
"mongoLocalUri"
);
dbName
=
conf
.
getProperty
(
"dbName"
);
searchCollName
=
conf
.
getProperty
(
"searchCollName"
);
searchCacheCollName
=
conf
.
getProperty
(
"searchCacheCollName"
);
...
...
@@ -25,6 +26,7 @@ public class DBConfig {
public
static
String
mongoUri
;
public
static
String
mongoLocalUri
;
public
static
String
dbName
;
public
static
String
searchCollName
;
public
static
String
searchCacheCollName
;
...
...
src/main/java/com/zhiwei/searchhotcrawler/crawler/ToutiaoHotSearchCrawler.java
0 → 100644
View file @
f5589b9f
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
lombok.extern.log4j.Log4j2
;
import
org.apache.commons.lang3.StringUtils
;
import
java.io.IOException
;
import
java.util.*
;
/**
* @ProjectName: searchhotcrawler
* @ClassName: ToutiaoHotSearchCrawler
* @Author: hero
* @Description: 今日头条实时热搜榜单
* @Date: 2020/4/8 16:21
* @Version: 1.0
*/
@Log4j2
public
class
ToutiaoHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
/**
* @Title: weiboHotSearchByPhoneTest
* @author hero
* @Description: TODO(手机端Iphone 微博热搜采集)
* @return void 返回类型
*/
public
static
List
<
HotSearchList
>
toutiaoHotSearchByPhone
(){
for
(
int
count
=
0
;
count
<=
5
;
count
++){
String
url
=
"https://ib.snssdk.com/api/suggest_words/?business_id=10017"
;
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>();
headerMap
.
put
(
"User-Agent"
,
"Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1"
);
headerMap
.
put
(
"referer"
,
"https://ib.snssdk.com/rogue/aladdin_landingpage/template/aladdin_landingpage/hot_words.html?isBrowser=true&traffic_source="
);
String
htmlBody
;
try
{
List
<
HotSearchList
>
result
=
new
ArrayList
<
HotSearchList
>();
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
)).
body
().
string
();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"words"
)){
try
{
JSONArray
words
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONArray
(
"data"
).
getJSONObject
(
0
).
getJSONArray
(
"words"
);
int
rank
=
1
;
for
(
int
i
=
0
;
i
<
words
.
size
();
i
++){
try
{
JSONObject
word
=
words
.
getJSONObject
(
i
);
String
name
=
word
.
getString
(
"word"
);
String
link
=
"https://ib.snssdk.com/search/?keyword="
+
URLCodeUtil
.
getURLEncode
(
name
,
"utf-8"
)
+
"&pd=synthesis&source=trending_list&traffic_source="
;
Integer
hotCount
=
word
.
getJSONObject
(
"params"
).
getInteger
(
"fake_click_cnt"
);
Integer
wordsType
=
word
.
getInteger
(
"words_type"
);
String
icon
=
getIcon
(
wordsType
);
HotSearchList
hotSearch
=
new
HotSearchList
(
link
,
name
,
hotCount
,
true
,
rank
,
HotSearchType
.
今日头条热搜
.
name
(),
icon
);
result
.
add
(
hotSearch
);
rank
++;
}
catch
(
Exception
e
)
{
log
.
error
(
"解析今日头条实时热搜时出现解析错误"
,
e
);
continue
;
}
}
return
result
;
}
catch
(
Exception
e
)
{
log
.
error
(
"解析今日头条实时热搜时出现解析错误,数据不是json结构"
,
e
);
}
}
else
{
log
.
info
(
"解析今日头条实时热搜时出现解析错误,页面结构有问题"
);
}
}
catch
(
IOException
e1
)
{
log
.
error
(
"解析今日头条实时热搜时出现连接失败"
,
e1
);
}
}
return
Collections
.
emptyList
();
}
/**
* 热搜类型
* @param wordsType
* @return
*/
private
static
String
getIcon
(
Integer
wordsType
){
String
icon
=
"无"
;
if
(
Objects
.
nonNull
(
wordsType
)){
switch
(
wordsType
){
case
1
:
icon
=
"新"
;
break
;
case
2
:
icon
=
"热"
;
break
;
case
3
:
icon
=
"爆"
;
break
;
}
}
return
icon
;
}
}
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchCacheDAO.java
View file @
f5589b9f
package
com
.
zhiwei
.
searchhotcrawler
.
dao
;
import
com.mongodb.BasicDBObject
;
import
com.mongodb.DBObject
;
import
com.mongodb.client.MongoCollection
;
import
com.zhiwei.searchhotcrawler.bean.HotSearch
Type
;
import
com.zhiwei.searchhotcrawler.bean.HotSearch
List
;
import
com.zhiwei.searchhotcrawler.config.DBConfig
;
import
com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate
;
import
lombok.extern.log4j.Log4j2
;
import
org.bson.Document
;
import
javax.print.Doc
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.Objects
;
...
...
@@ -21,6 +21,37 @@ public class HotSearchCacheDAO {
private
static
MongoCollection
collection
=
MongoDBTemplate
.
getCollection
(
DBConfig
.
dbName
,
DBConfig
.
searchCacheCollName
);
/**
 * Stores a batch of crawled hot-search entries.
 *
 * <p>Each entry is copied into a {@link Document} (keys {@code _id}, {@code name},
 * {@code url}, {@code count}, {@code hot}, {@code day}, {@code time}, {@code rank},
 * {@code type}, {@code icon}), handed to {@code addAndUpdateData}, and then collected
 * into the returned list.
 *
 * @param dataList entries to store; {@code null} is treated as an empty batch, because
 *                 callers log the list size null-safely but pass the list on unchecked
 * @return the documents built from {@code dataList}, in the same order; a new mutable
 *         list that is empty when there was nothing to store
 */
public List<Document> addData(List<HotSearchList> dataList) {
    List<Document> documents = new ArrayList<>();
    if (dataList == null) {
        return documents;
    }
    for (HotSearchList hotSearch : dataList) {
        Document document = new Document();
        document.put("_id", hotSearch.getId());
        document.put("name", hotSearch.getName());
        document.put("url", hotSearch.getUrl());
        document.put("count", hotSearch.getCount());
        document.put("hot", hotSearch.getHot());
        document.put("day", hotSearch.getDay());
        document.put("time", hotSearch.getTime());
        document.put("rank", hotSearch.getRank());
        document.put("type", hotSearch.getType());
        document.put("icon", hotSearch.getIcon());
        // Update the cache collection first, then keep the document for the caller.
        addAndUpdateData(document);
        documents.add(document);
    }
    return documents;
}
/**
* 添加及更新相应数据表中的数据
* @param document
...
...
@@ -114,6 +145,9 @@ public class HotSearchCacheDAO {
case
"微博话题"
:
duration
=
duration
+
3
;
break
;
case
"今日头条热搜"
:
duration
=
duration
+
1
;
break
;
default
:
duration
=
duration
+
1
;
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/run/HotSearchRun.java
View file @
f5589b9f
...
...
@@ -50,6 +50,7 @@ public class HotSearchRun {
new
ZhihuHotSearchRun
().
start
();
new
WeiboSuperTopicRun
().
start
();
new
WeiboTopicRun
().
start
();
new
ToutiaoHotSearchRun
().
start
();
}
}
src/main/java/com/zhiwei/searchhotcrawler/test/TopicTest.java
0 → 100644
View file @
f5589b9f
//package com.zhiwei.searchhotcrawler.test;
//
//import com.mongodb.client.MongoCollection;
//import com.mongodb.client.MongoCursor;
//import com.mongodb.client.MongoDatabase;
//import com.zhiwei.searchhotcrawler.config.DBConfig;
//import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
//import com.zhiwei.tools.timeparse.TimeParse;
//import lombok.extern.log4j.Log4j2;
//import org.bson.Document;
//
//import java.util.*;
//
//@Log4j2
//public class TopicTest {
//
// private static MongoDatabase mongoDB = MongoDBTemplate.getDB(DBConfig.dbName);
//
// public static void main(String[] args) {
//// repairTopic();
//
// updateHotSearchCache();
// }
//
// /**
// * 修复热搜话题类型错误问题
// */
// public static void repairTopic(){
// MongoCollection mongoCollection = mongoDB.getCollection("hot_search_list2020_04");
// Document query = new Document("comment_count", new Document("$ne", null));
// query.put("type", "微博热搜");
// Date time = TimeParse.stringFormartDate("2020-03-12 18:00:00");
//
// long count = mongoCollection.countDocuments(query);
// log.info("count is {}", count);
// for(int i=0;i<55;i++){
// MongoCursor<Document> cursor = mongoCollection.find(query).limit(1000).iterator();
// while(cursor.hasNext()){
// Document update = cursor.next();
// update.put("type", "微博话题");
// Document query2 = new Document();
// query2.put("_id", update.getString("_id"));
// mongoCollection.findOneAndReplace(query2, update);
// time = update.getDate("time");
// }
// log.info("i========{}", i);
// }
// }
//
//
//
//
// public static void updateHotSearchCache(){
// for(int month = 3; month<=3; month++){
// String collectionName = "hot_search_list2020_0" + month;
// if(month>=10){
// collectionName = "hot_search_list2020_" + month;
// }
// log.info("collectionName is {}", collectionName);
// MongoCollection mongoCollection = mongoDB.getCollection(collectionName);
// MongoCollection mongoCollectionLocal = mongoDBLocal.getCollection("hot_search_cache");
//
// long count = mongoCollection.countDocuments();
// int pageCount = 10000;
// int pages = (int)Math.ceil((double)count/(double)pageCount);
// log.info("count====={},pages====={}",count, pages);
// Date date = TimeParse.stringFormartDate("2020-03-12 18:00:00");
//
// Map<String,Document> resultMap = new HashMap<>();
//
// for(int page = 1; page<pages; page++){
// Document query = new Document();
// if(page>1) {
// query.put("time", new Document("$gt", date));
// }
// log.info("page is {} ,query is {},coutn is {}", page ,query ,mongoCollection.countDocuments(query));
// MongoCursor<Document> cursor = mongoCollection.find(query).limit(pageCount).sort(new Document("time",1)).iterator();
// while(cursor.hasNext()){
// Document document = cursor.next();
// String name = document.getString("name");
// String type = document.getString("type");
// int lastRank = document.getInteger("rank")!=null?document.getInteger("rank"): -1;
// int lastCount = document.getInteger("count")!=null?document.getInteger("count"): -1;
// Date startTime = document.getDate("time");
// Date endTime = new Date(startTime.getTime() + (60 * 1000));
// String topicLead = document.getString("topic_lead")!=null?document.getString("topic_lead"):null;
// boolean hot = document.getBoolean("hot")!=null?document.getBoolean("hot"):true;
// String url = document.getString("url")!=null?document.getString("url"):null;
// String id = name + "_" + type;
//
// Document nowDoc = resultMap.get(id);
// if (Objects.nonNull(nowDoc)) {
// int highestRank = nowDoc.getInteger("highestRank");
// int highestCount = nowDoc.getInteger("highestCount");
// //判断最大热度值
// if (lastCount>0 && lastCount > highestCount) {
// highestCount = lastCount;
// }
// //判断最高排名
// if (lastRank>0 && lastRank < highestRank) {
// highestRank = lastRank;
// }
// //计算热搜时长
// int duration = nowDoc.getInteger("duration");
// int durationNow = getDuration(type, duration);
//
// //更新相应信息
// nowDoc.put("endTime", endTime);
// nowDoc.put("lastRank", lastRank);
// nowDoc.put("lastCount", lastCount);
// nowDoc.put("highestRank", highestRank);
// nowDoc.put("highestCount", highestCount);
// nowDoc.put("duration", durationNow);
// } else {
// nowDoc = new Document();
// int durationNow = getDuration(type, 0);
// nowDoc.put("_id", id);
// nowDoc.put("url", url);
// nowDoc.put("name", name);
// nowDoc.put("hot", hot);
// nowDoc.put("topicLead", topicLead);
// nowDoc.put("type", type);
// nowDoc.put("lastRank", lastRank);
// nowDoc.put("highestRank", lastRank);
// nowDoc.put("lastCount", lastCount);
// nowDoc.put("highestCount", lastCount);
// nowDoc.put("startTime", startTime);
// nowDoc.put("endTime", endTime);
// nowDoc.put("duration", durationNow);
// }
// resultMap.put(id, nowDoc);
// date = startTime;
// }
// cursor.close();
// }
//
// log.info("list size is {}", resultMap.size());
// for (Map.Entry<String,Document> entry: resultMap.entrySet()){
// String id = entry.getKey();
// Document document = entry.getValue();
// String name = document.getString("name");
// String type = document.getString("type");
// int lastRank = document.getInteger("lastRank");
// int lastCount = document.getInteger("lastCount");
// int highestRank = document.getInteger("highestRank");
// int highestCount = document.getInteger("highestCount");
// int duration = document.getInteger("duration");
//
// Document query = new Document("_id", id);
// Document resultDoc = (Document) mongoCollectionLocal.find(query).first();
// if(Objects.isNull(resultDoc)){
// mongoCollectionLocal.insertOne(document);
// }else{
//
// int highestRankResult = resultDoc.getInteger("highestRank");
// int highestCountResult = resultDoc.getInteger("highestCount");
// int durationResult = document.getInteger("duration");
// //判断最大热度值
// if (highestCountResult > highestCount) {
// highestCount = highestCountResult;
// }
// //判断最高排名
// if (highestRankResult < highestRank) {
// highestRank = highestRankResult;
// }
// //计算热搜时长
// int durationNow = duration + durationResult;
// Date endTime = document.getDate("endTime");
// //更新相应信息
// resultDoc.put("endTime", endTime);
// resultDoc.put("lastRank", lastRank);
// resultDoc.put("lastCount", lastCount);
// resultDoc.put("highestRank", highestRank);
// resultDoc.put("highestCount", highestCount);
// resultDoc.put("duration", durationNow);
// mongoCollectionLocal.findOneAndReplace(query, resultDoc);
// }
// }
// }
// }
//
//
// /**
// * 计算热搜时长
// * @param type
// * @param duration
// * @return
// */
// private static int getDuration(String type, int duration){
// switch (type){
// case "微博热搜" :
// duration = duration + 1;
// break;
// case "百度热搜" :
// duration = duration + 5;
// break;
// case "知乎热搜" :
// duration = duration + 10;
// break;
// case "抖音热搜" :
// duration = duration + 10;
// break;
// case "搜狗微信热搜" :
// duration = duration + 5;
// break;
// case "微博话题" :
// duration = duration + 3;
// break;
// default :
// duration = duration + 1;
// }
// return duration;
// }
//
//}
src/main/java/com/zhiwei/searchhotcrawler/timer/BaiduHotSearchRun.java
View file @
f5589b9f
...
...
@@ -44,24 +44,10 @@ public class BaiduHotSearchRun extends Thread{
HotSearchCacheDAO
hotSearchCacheDAO
=
new
HotSearchCacheDAO
();
List
<
HotSearchList
>
list
=
BaiDuHotSearchCrawler
.
baiduHotSearch
();
log
.
info
(
"{}, 此轮百度风云榜采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
List
<
Document
>
saveDataList
=
new
ArrayList
<>();
if
(
Objects
.
nonNull
(
list
)
&&
!
list
.
isEmpty
())
{
list
.
forEach
(
baiduHotSearch
->{
Document
doc
=
new
Document
();
doc
.
put
(
"_id"
,
baiduHotSearch
.
getId
());
doc
.
put
(
"name"
,
baiduHotSearch
.
getName
());
doc
.
put
(
"url"
,
baiduHotSearch
.
getUrl
());
doc
.
put
(
"count"
,
baiduHotSearch
.
getCount
());
doc
.
put
(
"day"
,
baiduHotSearch
.
getDay
());
doc
.
put
(
"time"
,
baiduHotSearch
.
getTime
());
doc
.
put
(
"rank"
,
baiduHotSearch
.
getRank
());
doc
.
put
(
"type"
,
baiduHotSearch
.
getType
());
saveDataList
.
add
(
doc
);
hotSearchCacheDAO
.
addAndUpdateData
(
doc
);
});
List
<
Document
>
data
=
hotSearchCacheDAO
.
addData
(
list
);
hotSearchDAO
.
addHotSearchList
(
data
);
}
hotSearchDAO
.
addHotSearchList
(
saveDataList
);
log
.
info
(
"百度风云榜采集结束........"
);
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/timer/DouyinHotSearchRun.java
View file @
f5589b9f
...
...
@@ -47,22 +47,8 @@ public class DouyinHotSearchRun extends Thread{
HotSearchCacheDAO
hotSearchCacheDAO
=
new
HotSearchCacheDAO
();
List
<
HotSearchList
>
list
=
DouyinHotSearchCrawler
.
getMobileDouyinHotList
();
log
.
info
(
"{}, 抖音热搜榜此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
List
<
Document
>
data
=
new
ArrayList
<>();
for
(
HotSearchList
douyinHotSearch
:
list
){
Document
douyin
=
new
Document
();
douyin
.
put
(
"_id"
,
douyinHotSearch
.
getId
());
douyin
.
put
(
"name"
,
douyinHotSearch
.
getName
());
douyin
.
put
(
"rank"
,
douyinHotSearch
.
getRank
());
douyin
.
put
(
"count"
,
douyinHotSearch
.
getCount
());
douyin
.
put
(
"hot"
,
douyinHotSearch
.
getHot
());
douyin
.
put
(
"day"
,
douyinHotSearch
.
getDay
());
douyin
.
put
(
"time"
,
douyinHotSearch
.
getTime
());
douyin
.
put
(
"url"
,
null
);
douyin
.
put
(
"type"
,
douyinHotSearch
.
getType
());
data
.
add
(
douyin
);
hotSearchDAO
.
addHotSearch
(
douyin
);
hotSearchCacheDAO
.
addAndUpdateData
(
douyin
);
}
List
<
Document
>
data
=
hotSearchCacheDAO
.
addData
(
list
);
hotSearchDAO
.
addHotSearchList
(
data
);
log
.
info
(
"抖音热搜榜采集结束........"
);
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/timer/SougoHotSearchRun.java
View file @
f5589b9f
...
...
@@ -6,6 +6,7 @@ import java.util.List;
import
java.util.concurrent.TimeUnit
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
lombok.extern.log4j.Log4j2
;
import
org.bson.Document
;
import
org.slf4j.Logger
;
...
...
@@ -15,7 +16,6 @@ import com.mongodb.BasicDBObject;
import
com.mongodb.DBObject
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.crawler.SougoHotSearchCrawler
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
@Log4j2
...
...
@@ -43,19 +43,7 @@ public class SougoHotSearchRun extends Thread {
log
.
info
(
"搜狗微信采集开始........"
);
List
<
HotSearchList
>
list
=
SougoHotSearchCrawler
.
sougoHotSearch
();
log
.
info
(
"{}, 此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
List
<
Document
>
data
=
new
ArrayList
<>();
for
(
HotSearchList
sougoHotSearch
:
list
){
Document
doc
=
new
Document
();
doc
.
put
(
"_id"
,
sougoHotSearch
.
getId
());
doc
.
put
(
"name"
,
sougoHotSearch
.
getName
());
doc
.
put
(
"url"
,
sougoHotSearch
.
getUrl
());
doc
.
put
(
"day"
,
sougoHotSearch
.
getDay
());
doc
.
put
(
"time"
,
sougoHotSearch
.
getTime
());
doc
.
put
(
"rank"
,
sougoHotSearch
.
getRank
());
doc
.
put
(
"type"
,
sougoHotSearch
.
getType
());
data
.
add
(
doc
);
hotSearchCacheDAO
.
addAndUpdateData
(
doc
);
}
List
<
Document
>
data
=
hotSearchCacheDAO
.
addData
(
list
);
hotSearchDAO
.
addHotSearchList
(
data
);
log
.
info
(
"搜狗微信采集结束........"
);
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/timer/ToutiaoHotSearchRun.java
0 → 100644
View file @
f5589b9f
package
com
.
zhiwei
.
searchhotcrawler
.
timer
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.crawler.ToutiaoHotSearchCrawler
;
import
com.zhiwei.searchhotcrawler.crawler.WeiboHotSearchCrawler
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO
;
import
com.zhiwei.searchhotcrawler.dao.HotSearchListDAO
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
lombok.extern.log4j.Log4j2
;
import
org.bson.Document
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.concurrent.TimeUnit
;
@Log4j2
public
class
ToutiaoHotSearchRun
extends
Thread
{
@Override
public
void
run
()
{
boolean
f
=
true
;
while
(
f
)
{
try
{
getHotList
();
TimeUnit
.
MINUTES
.
sleep
(
1
);
}
catch
(
Exception
e
)
{
e
.
fillInStackTrace
();
ZhiWeiTools
.
sleep
(
60
*
1000
);
}
ZhiWeiTools
.
sleep
(
50
);
}
}
private
void
getHotList
()
{
log
.
info
(
"今日头条热搜采集开始........"
);
HotSearchListDAO
hotSearchDAO
=
new
HotSearchListDAO
();
HotSearchCacheDAO
hotSearchCacheDAO
=
new
HotSearchCacheDAO
();
List
<
HotSearchList
>
list
=
ToutiaoHotSearchCrawler
.
toutiaoHotSearchByPhone
();
log
.
info
(
"{}, 今日头条此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
List
<
Document
>
data
=
hotSearchCacheDAO
.
addData
(
list
);
hotSearchDAO
.
addHotSearchList
(
data
);
log
.
info
(
"今日头条热搜采集结束........"
);
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/WeiboHotSearchRun.java
View file @
f5589b9f
...
...
@@ -33,30 +33,15 @@ public class WeiboHotSearchRun extends Thread{
private
void
getHotList
()
{
log
.
info
(
"微博
话题
采集开始........"
);
HotSearchListDAO
weiboH
otSearchDAO
=
new
HotSearchListDAO
();
log
.
info
(
"微博
热搜
采集开始........"
);
HotSearchListDAO
h
otSearchDAO
=
new
HotSearchListDAO
();
HotSearchCacheDAO
hotSearchCacheDAO
=
new
HotSearchCacheDAO
();
// List<HotSearchList> list = WeiboHotSearchCrawler.weiboHotSearch();
List
<
HotSearchList
>
list
=
WeiboHotSearchCrawler
.
weiboHotSearchByPhone
();
log
.
info
(
"{}, 微博此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
List
<
Document
>
data
=
new
ArrayList
<>();
for
(
HotSearchList
weiboHotSearch
:
list
){
Document
doc
=
new
Document
();
doc
.
put
(
"_id"
,
weiboHotSearch
.
getId
());
doc
.
put
(
"name"
,
weiboHotSearch
.
getName
());
doc
.
put
(
"url"
,
weiboHotSearch
.
getUrl
());
doc
.
put
(
"count"
,
weiboHotSearch
.
getCount
());
doc
.
put
(
"hot"
,
weiboHotSearch
.
getHot
());
doc
.
put
(
"day"
,
weiboHotSearch
.
getDay
());
doc
.
put
(
"time"
,
weiboHotSearch
.
getTime
());
doc
.
put
(
"rank"
,
weiboHotSearch
.
getRank
());
doc
.
put
(
"type"
,
weiboHotSearch
.
getType
());
doc
.
put
(
"icon"
,
weiboHotSearch
.
getIcon
());
data
.
add
(
doc
);
hotSearchCacheDAO
.
addAndUpdateData
(
doc
);
}
weiboHotSearchDAO
.
addHotSearchList
(
data
);
log
.
info
(
"微博话题采集结束........"
);
List
<
Document
>
data
=
hotSearchCacheDAO
.
addData
(
list
);
hotSearchDAO
.
addHotSearchList
(
data
);
log
.
info
(
"微博热搜采集结束........"
);
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/ZhihuHotSearchRun.java
View file @
f5589b9f
...
...
@@ -45,22 +45,8 @@ public class ZhihuHotSearchRun extends Thread{
List
<
HotSearchList
>
mobilelist
=
ZhihuHotSearchCrawler
.
getMobileZhihuHotList
();
list
.
addAll
(
mobilelist
);
log
.
info
(
"{}, 知乎此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list
!=
null
?
list
.
size
()
:
0
));
List
<
Document
>
dataList
=
new
ArrayList
<>();
for
(
HotSearchList
zhihuHotSearch
:
list
){
Document
zhihu
=
new
Document
();
zhihu
.
put
(
"_id"
,
zhihuHotSearch
.
getId
());
zhihu
.
put
(
"name"
,
zhihuHotSearch
.
getName
());
zhihu
.
put
(
"url"
,
zhihuHotSearch
.
getUrl
());
zhihu
.
put
(
"count"
,
zhihuHotSearch
.
getCount
());
zhihu
.
put
(
"hot"
,
zhihuHotSearch
.
getHot
());
zhihu
.
put
(
"day"
,
zhihuHotSearch
.
getDay
());
zhihu
.
put
(
"time"
,
zhihuHotSearch
.
getTime
());
zhihu
.
put
(
"rank"
,
zhihuHotSearch
.
getRank
());
zhihu
.
put
(
"type"
,
zhihuHotSearch
.
getType
());
dataList
.
add
(
zhihu
);
hotSearchCacheDAO
.
addAndUpdateData
(
zhihu
);
}
hotSearchDAO
.
addHotSearchList
(
dataList
);
List
<
Document
>
data
=
hotSearchCacheDAO
.
addData
(
list
);
hotSearchDAO
.
addHotSearchList
(
data
);
log
.
info
(
"知乎话题采集结束........"
);
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment