Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
searchhotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
searchhotcrawler
Commits
8d85b0c2
Commit
8d85b0c2
authored
Jun 07, 2021
by
chenweitao
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'working' into 'master'
Working See merge request
!89
parents
51117558
c495fcc6
Show whitespace changes
Inline
Side-by-side
Showing
16 changed files
with
1622 additions
and
81 deletions
+1622
-81
dependency-reduced-pom.xml
+26
-0
pom.xml
+20
-15
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchType.java
+3
-1
src/main/java/com/zhiwei/searchhotcrawler/bean/WeiBoMassage.java
+131
-0
src/main/java/com/zhiwei/searchhotcrawler/bean/WeiBoUser.java
+65
-0
src/main/java/com/zhiwei/searchhotcrawler/config/DBConfig.java
+5
-0
src/main/java/com/zhiwei/searchhotcrawler/crawler/HotSearch36KrCrawler.java
+142
-0
src/main/java/com/zhiwei/searchhotcrawler/crawler/HuXiuHotSearchCrawler.java
+112
-0
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
+272
-3
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchCacheDAO.java
+11
-5
src/main/java/com/zhiwei/searchhotcrawler/dao/WeiBoMassageDao.java
+81
-0
src/main/java/com/zhiwei/searchhotcrawler/dao/WeiBoUserDao.java
+59
-0
src/main/java/com/zhiwei/searchhotcrawler/test/HotSearch36KrCrawlerTest.java
+148
-0
src/main/java/com/zhiwei/searchhotcrawler/test/HuXiuHotSearchCrawlerTest.java
+118
-0
src/main/java/com/zhiwei/searchhotcrawler/timer/quartz/GatherTimer.java
+26
-0
src/test/java/weiboTest/WeiboHotSearchTest.java
+403
-57
No files found.
dependency-reduced-pom.xml
View file @
8d85b0c2
...
...
@@ -71,6 +71,32 @@
</plugin>
</plugins>
</build>
<dependencies>
<dependency>
<groupId>
com.zhiwei.crawler
</groupId>
<artifactId>
crawler-core
</artifactId>
<version>
0.6.7.2-RELEASE
</version>
<scope>
test
</scope>
</dependency>
<dependency>
<groupId>
junit
</groupId>
<artifactId>
junit
</artifactId>
<version>
4.13
</version>
<scope>
test
</scope>
</dependency>
<dependency>
<groupId>
org.projectlombok
</groupId>
<artifactId>
lombok
</artifactId>
<version>
1.18.20
</version>
<scope>
test
</scope>
</dependency>
<dependency>
<groupId>
org.springframework
</groupId>
<artifactId>
spring-test
</artifactId>
<version>
5.3.6
</version>
<scope>
test
</scope>
</dependency>
</dependencies>
<properties>
<project.reporting.outputEncoding>
UTF-8
</project.reporting.outputEncoding>
<project.build.sourceEncoding>
UTF-8
</project.build.sourceEncoding>
...
...
pom.xml
View file @
8d85b0c2
...
...
@@ -5,7 +5,7 @@
<version>
0.0.6-SNAPSHOT
</version>
<name>
各平台热搜榜单采集程序
</name>
<description>
各平台热搜榜单采集程序
目前包含:1.微博时时热搜采集程序、2.知乎热搜采集程序
</description>
目前包含:1.微博时时热搜采集程序、2.知乎热搜采集程序
</description>
<properties>
<project.build.sourceEncoding>
UTF-8
</project.build.sourceEncoding>
...
...
@@ -51,16 +51,16 @@
<artifactId>
lombok
</artifactId>
<version>
1.18.8
</version>
</dependency>
<!-- <dependency>-->
<!-- <groupId>org.quartz-scheduler</groupId>-->
<!-- <artifactId>quartz</artifactId>-->
<!-- <version>${quartz.version}</version>-->
<!-- </dependency>-->
<!-- <dependency>-->
<!-- <groupId>org.quartz-scheduler</groupId>-->
<!-- <artifactId>quartz-jobs</artifactId>-->
<!-- <version>${quartz.version}</version>-->
<!-- </dependency>-->
<!-- <dependency>-->
<!-- <groupId>org.quartz-scheduler</groupId>-->
<!-- <artifactId>quartz</artifactId>-->
<!-- <version>${quartz.version}</version>-->
<!-- </dependency>-->
<!-- <dependency>-->
<!-- <groupId>org.quartz-scheduler</groupId>-->
<!-- <artifactId>quartz-jobs</artifactId>-->
<!-- <version>${quartz.version}</version>-->
<!-- </dependency>-->
<!-- Spring文件配置 -->
<dependency>
<groupId>
org.springframework
</groupId>
...
...
@@ -119,6 +119,11 @@
<version>
4.12
</version>
<scope>
test
</scope>
</dependency>
<dependency>
<groupId>
org.apache.httpcomponents
</groupId>
<artifactId>
httpclient
</artifactId>
<version>
4.5.6
</version>
</dependency>
</dependencies>
...
...
@@ -147,10 +152,10 @@
</filters>
<transformers>
<!-- <transformer-->
<!-- implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">-->
<!-- <mainClass>com.zhiwei.searchhotcrawler.run.HotSearchRun</mainClass>-->
<!-- </transformer>-->
<!-- <transformer-->
<!-- implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">-->
<!-- <mainClass>com.zhiwei.searchhotcrawler.run.HotSearchRun</mainClass>-->
<!-- </transformer>-->
<!-- 不覆盖同名文件,而是追加合并同名文件 -->
<transformer
implementation=
"org.apache.maven.plugins.shade.resource.AppendingTransformer"
>
<resource>
META-INF/spring.handlers
</resource>
...
...
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchType.java
View file @
8d85b0c2
...
...
@@ -22,5 +22,7 @@ public enum HotSearchType {
腾讯较真榜
,
脉脉热榜
,
B
站排行榜
,
B
站热搜
B
站热搜
,
人气榜
36
氪
,
虎嗅热文推荐
,
}
src/main/java/com/zhiwei/searchhotcrawler/bean/WeiBoMassage.java
0 → 100644
View file @
8d85b0c2
package
com
.
zhiwei
.
searchhotcrawler
.
bean
;
/**
* @ClassName: WeiBoMassage
* @Description: 微博主要信息
* @author ll
* @date 2021年5月27日 下午2:26:11
*/
import
lombok.Data
;
import
lombok.ToString
;
import
java.io.Serializable
;
import
java.util.Date
;
import
java.util.List
;
@Data
@ToString
public class WeiBoMassage implements Serializable {

    private static final long serialVersionUID = 5640606453392799871L;

    /** Primary key; the full constructor builds it as {@code mid_微博热搜_topic}. */
    private String id;

    /** Id of the user who published the post. */
    private String userId;

    /** Text content of the post. */
    private String text;

    /** Screen name of the user who published the post. */
    private String userName;

    /** The "mid" value of the post as delivered by Weibo. */
    private String mid;

    /** Creation time of the post. */
    private Date creatTime;

    /** Last edit time of the post, if any. */
    private Date editTime;

    /** The "card_type" value of the card the post was found in. */
    private Integer cardType;

    /** Display type ("show_type") of the card. */
    private Integer showType;

    /** Number of reposts. */
    private Long repostCount;

    /** Number of comments. */
    private Long commentCount;

    /** Number of likes. */
    private Long attitudeCount;

    /** Number of plays (for video posts). */
    private Long playCount;

    /** URLs of the pictures attached to the post. */
    private List<String> pictureUrlList;

    /** Source of the post. */
    private String source;

    /** Type label of the post. */
    private String type;

    /** Hot-search topic the post belongs to. */
    private String topic;

    /** Whether the post is a repost. */
    private Integer forward;

    /** For reposts: mid of the original post. */
    private String root_mid;

    /** For reposts: id of the original post's author. */
    private String root_id;

    /** For reposts: name of the original post's author. */
    private String root_name;

    /** For reposts: text of the original post. */
    private String root_text;

    /** For reposts: source of the original post. */
    private String root_source;

    /** Creates an empty instance; all fields stay {@code null}. */
    public WeiBoMassage() {
    }

    /**
     * Creates a post record and derives {@link #id} from {@code mid} and {@code topic}.
     * Fields that are not constructor parameters (play count, pictures, repost data)
     * are left {@code null}.
     */
    public WeiBoMassage(String userId, String text, String userName, String mid,
                        Date creatTime, Date editTime, Integer cardType, Integer showType,
                        Long repostCount, Long commentCount, Long attitudeCount,
                        String source, String type, String topic) {
        this.userId = userId;
        this.userName = userName;
        this.text = text;
        this.mid = mid;
        this.topic = topic;
        this.type = type;
        this.source = source;
        this.creatTime = creatTime;
        this.editTime = editTime;
        this.cardType = cardType;
        this.showType = showType;
        this.repostCount = repostCount;
        this.commentCount = commentCount;
        this.attitudeCount = attitudeCount;
        // String.join renders null elements as "null", exactly like string concatenation.
        this.id = String.join("_", mid, HotSearchType.微博热搜.name(), topic);
    }
}
src/main/java/com/zhiwei/searchhotcrawler/bean/WeiBoUser.java
0 → 100644
View file @
8d85b0c2
package
com
.
zhiwei
.
searchhotcrawler
.
bean
;
/**
* @ClassName: WeiBoUser
* @Description: 微博用户
* @author ll
* @date 2021年5月27日 下午3:26:11
*/
import
lombok.Data
;
import
lombok.ToString
;
import
java.io.Serializable
;
import
java.util.Date
;
@Data
@ToString
public class WeiBoUser implements Serializable {

    private static final long serialVersionUID = -2856936638431788899L;

    /** Primary key; the full constructor builds it as {@code userId_微博热搜_topic}. */
    private String id;

    /** Id of the user. */
    private String userId;

    /** Verification (attestation) information of the user. */
    private String attestationMassage;

    /** Screen name of the user. */
    private String userName;

    /** Hot-search topic the user was found under. */
    private String topic;

    /** Time associated with this record. */
    private Date time;

    /** Number of followers. */
    private Long followerCount;

    /** Creates an empty instance; all fields stay {@code null}. */
    public WeiBoUser() {
    }

    /**
     * Creates a user record and derives {@link #id} from {@code userId} and {@code topic}.
     */
    public WeiBoUser(String userId, String attestationMassage, String userName,
                     String topic, Date time, Long followerCount) {
        this.userId = userId;
        this.userName = userName;
        this.attestationMassage = attestationMassage;
        this.topic = topic;
        this.time = time;
        this.followerCount = followerCount;
        // String.join renders null elements as "null", exactly like string concatenation.
        this.id = String.join("_", userId, HotSearchType.微博热搜.name(), topic);
    }
}
src/main/java/com/zhiwei/searchhotcrawler/config/DBConfig.java
View file @
8d85b0c2
...
...
@@ -19,6 +19,9 @@ public class DBConfig {
searchCacheCollName
=
conf
.
getProperty
(
"searchCacheCollName"
);
topicCollName
=
conf
.
getProperty
(
"topicCollName"
);
collWechatUserName
=
conf
.
getProperty
(
"collWechatUserName"
);
weiBoMassageCollName
=
conf
.
getProperty
(
"weiBoMassageCollName"
);
weiBoUserCollName
=
conf
.
getProperty
(
"weiBoUserCollName"
);
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
}
...
...
@@ -32,4 +35,6 @@ public class DBConfig {
public
static
String
searchCacheCollName
;
public
static
String
topicCollName
;
public
static
String
collWechatUserName
;
public
static
String
weiBoMassageCollName
;
public
static
String
weiBoUserCollName
;
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/HotSearch36KrCrawler.java
0 → 100644
View file @
8d85b0c2
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
java.time.Duration
;
import
java.util.*
;
/**
* @author ll
* @ClassName:HotSearch36KrCrawler
* @Description:
* @date 2021年5月21日 上午11:54:31
*/
@Log4j2
public
class
HotSearch36KrCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
/**
* @return void 返回类型
* @Title: hotSearch36KrCrawler
* @author hero
* @Description: PC端36Kr人气榜采集
*/
public
static
List
<
HotSearchList
>
hotSearch36Kr
(
Date
date
)
{
String
url
=
"https://www.36kr.com/hot-list/catalog"
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
log
.
error
(
"解析36Kr人气榜时出现解析错误,页面结构有问题"
,
e
);
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"article-list"
))
{
return
ansysData
(
htmlBody
,
date
);
}
else
{
log
.
info
(
"解析36Kr人气榜时出现解析错误,页面结构有问题"
);
}
return
Collections
.
emptyList
();
}
// public static List<HotSearchList> hotSearch36Kr(Date date) {
// String url = "https://www.36kr.com/hot-list/catalog";
// //建立一个新的客户端请求(创建HttpClient对象)
// CloseableHttpClient httpClient = HttpClients.createDefault();
// //创建请求对象实例
// HttpGet httpGet = new HttpGet(url);
// httpGet.addHeader("User-Agent", "spider");
// //获取响应的结果
// CloseableHttpResponse response = null;
// try {
// //调用HttpClient对象的execute方法发送请求
// response = httpClient.execute(httpGet);
//
// if (Objects.nonNull(response)) {
// //获取HttpEntity对象其中包含了响应内容(响应头)
// HttpEntity entity = response.getEntity();
//
// String htmlBody = EntityUtils.toString(entity);
// return ansysData(htmlBody,date);
// }
// } catch (Exception e) {
// e.printStackTrace();
// }
// return Collections.emptyList();
// }
/**
* 解析数据
*
* @param htmlBody
* @return
*/
private
static
List
<
HotSearchList
>
ansysData
(
String
htmlBody
,
Date
date
)
{
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
String
webSite
=
"https://www.36kr.com"
;
try
{
Document
document
=
Jsoup
.
parse
(
htmlBody
);
Elements
elements
=
document
.
select
(
"div.article-list"
).
first
().
select
(
"div.article-wrapper"
);
if
(
Objects
.
nonNull
(
elements
)
&&
!
elements
.
isEmpty
())
{
// 获取排名rank
int
rank
=
0
;
for
(
Element
element
:
elements
)
{
try
{
rank
++;
// 获取关键词(String)
String
keyWord
=
element
.
select
(
"p.title-wrapper"
).
select
(
"a.article-item-title"
).
text
();
// logger.info("关键词:{}", kw);
// 获取关键词相关链接everurl(String)
String
everurl
=
element
.
select
(
"p.title-wrapper"
).
select
(
"a.article-item-title"
).
attr
(
"href"
);
// 获取搜索指数count(int)
String
url
=
webSite
+
everurl
;
String
hot
=
null
;
// 判断热度值所在的规则是否为null
if
(!
element
.
select
(
"span"
).
isEmpty
())
{
hot
=
element
.
select
(
"span"
).
text
();
}
Long
count
=
0L
;
// 判断hot是否为空
if
(
StringUtils
.
isNotBlank
(
hot
))
{
String
[]
hots
=
hot
.
split
(
"热度"
);
String
trim
=
hots
[
1
].
trim
();
Double
num
=
Double
.
valueOf
(
trim
);
count
=
Math
.
round
(
num
);
}
if
(
Objects
.
nonNull
(
rank
))
{
if
(
count
==
0
)
{
log
.
info
(
htmlBody
);
log
.
info
(
hot
);
log
.
info
(
element
);
}
else
{
HotSearchList
hotSearch
=
new
HotSearchList
(
url
,
keyWord
,
count
,
rank
,
HotSearchType
.
人气榜
36
氪
.
name
(),
date
);
list
.
add
(
hotSearch
);
}
}
}
catch
(
Exception
e
)
{
log
.
error
(
"解析36Kr人气榜时出现解析错误"
,
e
);
}
}
}
}
catch
(
Exception
e
)
{
log
.
error
(
"解析36Kr人气榜时出现解析错误,数据不是json结构"
,
e
);
}
return
list
;
}
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/HuXiuHotSearchCrawler.java
0 → 100644
View file @
8d85b0c2
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
java.time.Duration
;
import
java.util.*
;
/**
* @author: ll
* @ClassName: HuXiuHotSearchCrawler
* @Description: pc端虎嗅热文推荐采集
* @date: 2021年5月24日 下午16:35:31
* @Title: HuXiuHotSearchCrawler
*/
@Log4j2
public
class
HuXiuHotSearchCrawler
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
public
static
List
<
HotSearchList
>
HuXiuHotArticleRecommended
(
Date
date
){
String
url
=
"https://www.huxiu.com/"
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
log
.
error
(
"解析虎嗅热文推荐时出现解析错误,页面结构有问题"
,
e
);
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"hot__list"
))
{
return
ansysData
(
htmlBody
,
date
);
}
else
{
log
.
info
(
"解析虎嗅热文推荐时出现解析错误,页面结构有问题"
);
}
return
Collections
.
emptyList
();
}
// String url="https://www.huxiu.com/";
// //创建客户端请求对象
// CloseableHttpClient httpClient = HttpClients.createDefault();
// //创建请求对象实例
// HttpGet httpGet = new HttpGet(url);
// //设置头信息
// httpGet.addHeader("User-Agent","spider");
//
// //获取响应结果
// try {
// CloseableHttpResponse response = httpClient.execute(httpGet);
// //判断响应结果是否为空
// if (Objects.nonNull(response)) {
// //获取HttpEntity对象其中包含了响应内容(响应头)
// HttpEntity entity = response.getEntity();
// String htmlBody = EntityUtils.toString(entity);
// return ansysData(htmlBody,date);
// }
// } catch (Exception e) {
// e.printStackTrace();
// }
//
// return Collections.emptyList();
// }
//解析页面数据
private
static
List
<
HotSearchList
>
ansysData
(
String
htmlBody
,
Date
date
)
{
ArrayList
<
HotSearchList
>
list
=
new
ArrayList
<>();
String
webSite
=
"https://www.huxiu.com"
;
try
{
//获取Document文档对象
Document
document
=
Jsoup
.
parse
(
htmlBody
);
//获取元素集合
Elements
elements
=
document
.
select
(
"div.hot__list"
).
select
(
"div.focus-item"
);
if
(
Objects
.
nonNull
(
elements
)
&&
!
elements
.
isEmpty
()){
// 获取排名rank
Integer
rank
=
0
;
for
(
Element
element
:
elements
)
{
try
{
rank
++;
//获取关键词
String
keyWord
=
element
.
select
(
"p"
).
text
();
//获取关键词相关链接
String
href
=
element
.
select
(
"a.focus-item__left"
).
attr
(
"href"
);
String
url
=
webSite
+
href
;
//获取讨论量
String
comment
=
element
.
select
(
"i"
).
first
().
text
();
Long
commentCount
=
Long
.
valueOf
(
comment
);
String
topicLead
=
null
;
long
count
=
0L
;
HotSearchList
hotSearchList
=
new
HotSearchList
(
url
,
keyWord
,
count
,
rank
,
HotSearchType
.
虎嗅热文推荐
.
name
(),
commentCount
,
topicLead
,
date
);
list
.
add
(
hotSearchList
);
}
catch
(
NumberFormatException
e
)
{
log
.
error
(
"解析虎嗅热文推荐时出现解析错误"
,
e
);
}
}
}
}
catch
(
Exception
e
)
{
log
.
error
(
"解析虎嗅热文推荐时出现解析错误,数据不是json结构"
,
e
);
}
return
list
;
}
}
src/main/java/com/zhiwei/searchhotcrawler/crawler/WeiboHotSearchCrawler.java
View file @
8d85b0c2
package
com
.
zhiwei
.
searchhotcrawler
.
crawler
;
import
java.io.IOException
;
import
java.text.ParseException
;
import
java.text.SimpleDateFormat
;
import
java.util.*
;
import
java.util.stream.Collectors
;
import
com.alibaba.fastjson.JSON
;
import
com.zhiwei.searchhotcrawler.bean.
HotSearchCache
;
import
com.zhiwei.searchhotcrawler.bean.
*
;
import
com.zhiwei.searchhotcrawler.config.RedisConfig
;
import
com.zhiwei.searchhotcrawler.dao.RedisDao
;
import
com.zhiwei.searchhotcrawler.dao.WeiBoMassageDao
;
import
com.zhiwei.searchhotcrawler.dao.WeiBoUserDao
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
...
...
@@ -25,12 +29,12 @@ import com.alibaba.fastjson.JSONObject;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.mail.SendMailWeibo
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
org.springframework.beans.factory.annotation.Autowired
;
import
static
java
.
util
.
Objects
.
nonNull
;
/**
* @ClassName: WeiboHotSearch
* @Description: 微博实时热搜采集
...
...
@@ -169,6 +173,7 @@ public class WeiboHotSearchCrawler {
continue
;
}
// }
return
result
;
}
catch
(
Exception
e
)
{
log
.
error
(
"解析微博时热搜时出现解析错误,数据不是json结构"
,
e
);
...
...
@@ -242,6 +247,7 @@ public class WeiboHotSearchCrawler {
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"data"
))
{
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
).
getJSONObject
(
"cardlistInfo"
);
List
<
JSONObject
>
cardsJsons
=
(
List
<
JSONObject
>)
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
).
get
(
"cards"
);
if
(
json
.
containsKey
(
"desc"
)){
String
topicLead
=
json
.
getString
(
"desc"
);
if
(!
""
.
equals
(
topicLead
))
{
...
...
@@ -266,12 +272,275 @@ public class WeiboHotSearchCrawler {
}
}
}
//调用weiBoMassageDao添加数据
WeiBoMassageDao
weiBoMassageDao
=
new
WeiBoMassageDao
();
//解析cards,获取热门微博、人物
for
(
JSONObject
jsonObject
:
cardsJsons
)
{
if
(
nonNull
(
jsonObject
)
&&
!
jsonObject
.
isEmpty
())
{
if
(
jsonObject
.
containsKey
(
"mblog"
))
{
if
(
jsonObject
.
getJSONObject
(
"mblog"
).
containsKey
(
"title"
))
{
WeiBoMassage
weiBoMassage
=
analysisWeiboMBlog
(
jsonObject
,
document
.
getString
(
"name"
));
if
(
Objects
.
nonNull
(
weiBoMassage
))
{
weiBoMassageDao
.
addWeiBoMassage
(
weiBoMassage
);
}
}
}
else
if
(
jsonObject
.
containsKey
(
"card_group"
))
{
JSONArray
cardGroup
=
jsonObject
.
getJSONArray
(
"card_group"
);
WeiBoMassage
weiBoMassage
=
analysisWeiboMassage
(
cardGroup
,
document
.
getString
(
"name"
));
if
(
Objects
.
nonNull
(
weiBoMassage
))
{
weiBoMassageDao
.
addWeiBoMassage
(
weiBoMassage
);
}
analysisWeiBoUsers
(
cardGroup
,
document
.
getString
(
"name"
));
}
}
else
{
log
.
info
(
"获取数据失败"
);
}
}
return
document
;
}
}
return
null
;
}
/**
 * Finds the first card in the group that carries a titled "mblog" and parses it.
 *
 * @param cardGroup cards of one "card_group" entry
 * @param topic     hot-search topic the cards belong to
 * @return the parsed post of the first matching card, or {@code null} when no card matches
 */
public static WeiBoMassage analysisWeiboMassage(JSONArray cardGroup, String topic) {
    for (int index = 0; index < cardGroup.size(); index++) {
        JSONObject card = cardGroup.getJSONObject(index);
        if (!card.containsKey("mblog")) {
            continue;
        }
        if (!card.getJSONObject("mblog").containsKey("title")) {
            continue;
        }
        // Only the first titled post of the group is used.
        return analysisWeiboMBlog(card, topic);
    }
    return null;
}
/**
 * Extracts the users contained in a card group and stores each of them.
 * Cards with "card_type" 3 carry a "users" array, cards with "card_type" 10 a single
 * "user" object; all other cards are ignored.
 *
 * @param cardGroup cards of one "card_group" entry
 * @param topic     hot-search topic the cards belong to
 */
public static void analysisWeiBoUsers(JSONArray cardGroup, String topic) {
    WeiBoUserDao weiBoUserDao = new WeiBoUserDao();
    Date date = new Date();
    for (int i = 0; i < cardGroup.size(); i++) {
        JSONObject card = cardGroup.getJSONObject(i);
        if (card == null) {
            continue;
        }
        // getInteger yields null for a missing key instead of failing on Integer.valueOf(null).
        Integer cardType = card.getInteger("card_type");
        if (cardType == null) {
            continue;
        }
        if (cardType == 3) {
            JSONArray users = card.getJSONArray("users");
            if (users == null) {
                continue;
            }
            for (int j = 0; j < users.size(); j++) {
                saveWeiBoUser(weiBoUserDao, users.getJSONObject(j), topic, date);
            }
        } else if (cardType == 10 && card.containsKey("user")) {
            saveWeiBoUser(weiBoUserDao, card.getJSONObject("user"), topic, date);
        }
    }
}

/**
 * Converts one user JSON object into a {@link WeiBoUser} and stores it.
 *
 * @param weiBoUserDao DAO used for storing
 * @param user         user JSON, may be {@code null}
 * @param topic        hot-search topic the user was found under
 * @param date         time recorded on the stored user
 */
private static void saveWeiBoUser(WeiBoUserDao weiBoUserDao, JSONObject user, String topic, Date date) {
    if (user == null) {
        log.info("未采集到用户信息");
        return;
    }
    String userId = user.getString("id");
    String userName = user.getString("screen_name");
    String attestationMassage = user.getString("verified_reason");
    Long followerCount = parseCount(user.getString("followers_count"));
    weiBoUserDao.addWeiBoUser(
            new WeiBoUser(userId, attestationMassage, userName, topic, date, followerCount));
}

/**
 * Parses a counter as displayed by Weibo, e.g. "1234", "12万", "12.5万", "1.2亿" or "3456次".
 * Everything from the first unit character on is ignored. Decimal values are supported,
 * which plain {@code Long.valueOf} is not able to parse.
 *
 * @param raw counter text, may be {@code null}
 * @return the numeric value, or {@code null} when the text is missing or not a number
 */
private static Long parseCount(String raw) {
    if (raw == null) {
        return null;
    }
    String value = raw.trim();
    long multiplier = 1L;
    int unitIndex = value.indexOf("亿");
    if (unitIndex >= 0) {
        multiplier = 100_000_000L;
    } else {
        unitIndex = value.indexOf("万");
        if (unitIndex >= 0) {
            multiplier = 10_000L;
        } else {
            // "次" ("times") is only a suffix and does not scale the number.
            unitIndex = value.indexOf("次");
        }
    }
    if (unitIndex >= 0) {
        value = value.substring(0, unitIndex).trim();
    }
    if (value.isEmpty()) {
        return null;
    }
    try {
        return new java.math.BigDecimal(value)
                .multiply(java.math.BigDecimal.valueOf(multiplier))
                .longValue();
    } catch (NumberFormatException e) {
        log.warn("数量解析失败: {}", raw);
        return null;
    }
}

/**
 * Parses a Weibo timestamp in the "EEE MMM dd HH:mm:ss z yyyy" format (US locale).
 *
 * @param raw timestamp text, may be {@code null}
 * @return the parsed date, or {@code null} when the text is missing or malformed
 */
private static Date parseWeiboTime(String raw) {
    if (raw == null) {
        return null;
    }
    try {
        // SimpleDateFormat is not thread-safe, so a fresh instance is used per call.
        return new SimpleDateFormat("EEE MMM dd HH:mm:ss z yyyy", java.util.Locale.US).parse(raw);
    } catch (ParseException e) {
        log.error("创建时间和编辑时间解析异常", e);
        return null;
    }
}

/**
 * Parses the "mblog" of a card into a {@link WeiBoMassage}, including repost data,
 * picture URLs and play count where present.
 *
 * @param jsonObject card JSON that contains an "mblog" object
 * @param topic      hot-search topic the card belongs to
 * @return the parsed post, or {@code null} when the card has no "mblog"
 */
public static WeiBoMassage analysisWeiboMBlog(JSONObject jsonObject, String topic) {
    JSONObject mblog = jsonObject.getJSONObject("mblog");
    if (mblog == null) {
        return null;
    }
    JSONObject title = mblog.getJSONObject("title");
    String type = title == null ? null : title.getString("text");

    // getInteger yields null for a missing key instead of failing on Integer.valueOf(null).
    Integer cardType = jsonObject.getInteger("card_type");
    Integer showType = jsonObject.getInteger("show_type");

    Long attitudeCount = parseCount(mblog.getString("attitudes_count"));
    Long commentCount = parseCount(mblog.getString("comments_count"));
    Long repostCount = parseCount(mblog.getString("reposts_count"));

    // Each timestamp is parsed on its own so a bad creation time does not discard the edit time.
    Date createTime = parseWeiboTime(mblog.getString("created_at"));
    Date editTime = mblog.containsKey("edit_at") ? parseWeiboTime(mblog.getString("edit_at")) : null;

    String mid = mblog.getString("mid");
    JSONObject user = mblog.getJSONObject("user");
    String userId = user == null ? null : user.getString("id");
    String userName = user == null ? null : user.getString("screen_name");
    String source = mblog.getString("source");

    // Strip markup only when the text actually contains a tag.
    String content = mblog.getString("text");
    if (content != null && content.contains("<")) {
        content = Jsoup.parse(content).text();
    }

    WeiBoMassage weiBoMassage = new WeiBoMassage(userId, content, userName, mid, createTime, editTime,
            cardType, showType, repostCount, commentCount, attitudeCount, source, type, topic);
    // forward: 0 = original post, 1 = repost.
    weiBoMassage.setForward(0);

    // Pictures and play count are read from the original post when this is a repost.
    JSONObject weiboJson = mblog;
    JSONObject retweeted = mblog.getJSONObject("retweeted_status");
    if (mblog.containsKey("retweeted_status") && retweeted != null) {
        weiboJson = retweeted;
        String rootText = retweeted.getString("text");
        // The original post may come without a user object.
        JSONObject rootUser = retweeted.getJSONObject("user");
        weiBoMassage.setRoot_mid(retweeted.getString("mid"));
        weiBoMassage.setRoot_id(rootUser == null ? null : rootUser.getString("id"));
        weiBoMassage.setRoot_source(retweeted.getString("source"));
        weiBoMassage.setRoot_text(rootText == null ? null : Jsoup.parse(rootText).text());
        weiBoMassage.setRoot_name(rootUser == null ? null : rootUser.getString("screen_name"));
        weiBoMassage.setForward(1);
    }

    List<String> pictureUrlList = new ArrayList<>();
    Long playCount = null;
    JSONArray picIds = weiboJson.getJSONArray("pic_ids");
    if (picIds != null && !picIds.isEmpty()) {
        JSONArray pics = weiboJson.getJSONArray("pics");
        if (pics != null) {
            for (int i = 0; i < pics.size(); i++) {
                pictureUrlList.add(pics.getJSONObject(i).getString("url"));
            }
        }
    } else if (weiboJson.containsKey("page_info")) {
        JSONObject pageInfo = weiboJson.getJSONObject("page_info");
        if (pageInfo != null && pageInfo.containsKey("play_count")) {
            playCount = parseCount(pageInfo.getString("play_count"));
        }
    }
    weiBoMassage.setPlayCount(playCount);
    weiBoMassage.setPictureUrlList(pictureUrlList);
    return weiBoMassage;
}
// /**
// * 微博更新历史数据
// * @param hotSearch
...
...
src/main/java/com/zhiwei/searchhotcrawler/dao/HotSearchCacheDAO.java
View file @
8d85b0c2
package
com
.
zhiwei
.
searchhotcrawler
.
dao
;
import
com.mongodb.BasicDBObject
;
import
com.mongodb.client.FindIterable
;
import
com.mongodb.client.MongoCollection
;
import
com.mongodb.client.MongoCursor
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.config.DBConfig
;
import
com.zhiwei.searchhotcrawler.crawler.WeiboHotSearchCrawler
;
...
...
@@ -52,6 +50,10 @@ public class HotSearchCacheDAO {
// if("今日头条热搜".equals(hotSearch.getType())){
// document.put("comment_count", hotSearch.getCommentCount());
// }
if
(
"虎嗅热文推荐"
.
equals
(
hotSearch
.
getType
())){
document
.
put
(
"comment_count"
,
hotSearch
.
getCommentCount
());
}
if
(
"腾讯较真榜"
.
equals
(
hotSearch
.
getType
())){
document
.
put
(
"topic_result"
,
hotSearch
.
getTopicResult
());
}
...
...
@@ -125,7 +127,7 @@ public class HotSearchCacheDAO {
//计算上升速度
double
riseSpeed
=
nowDoc
.
containsKey
(
"riseSpeed"
)?
nowDoc
.
getDouble
(
"riseSpeed"
):
0.00
;
if
(
nonNull
(
lastCount
)
&&
nowDoc
.
containsKey
(
"firstCount"
))
{
long
firstCount
=
Long
.
parseLong
(
nowDoc
.
get
(
"firstCount"
).
toString
()
);
long
firstCount
=
nowDoc
.
getLong
(
"firstCount"
);
riseSpeed
=
((
double
)(
lastCount
-
firstCount
)/(
double
)
firstCount
)*
1000
/((
double
)
duration
);
}
// endTime = getEndTime(type, new Date());
...
...
@@ -181,6 +183,10 @@ public class HotSearchCacheDAO {
// if(readCount != null){
// nowDoc.put("readCount",readCount);
// }
if
(
"虎嗅热文推荐"
.
equals
(
type
)){
nowDoc
.
put
(
"comment_count"
,
document
.
getLong
(
"comment_count"
));
}
if
(
topicResult
!=
null
){
nowDoc
.
put
(
"topicResult"
,
topicResult
);
}
...
...
@@ -207,7 +213,7 @@ public class HotSearchCacheDAO {
}
}
}
catch
(
Exception
e
){
log
.
error
(
"数据存储时出错:"
,
e
);
log
.
error
(
"数据存储时出错:
{}
"
,
e
);
}
}
...
...
src/main/java/com/zhiwei/searchhotcrawler/dao/WeiBoMassageDao.java
0 → 100644
View file @
8d85b0c2
package
com
.
zhiwei
.
searchhotcrawler
.
dao
;
import
com.mongodb.client.MongoCollection
;
import
com.mongodb.client.MongoDatabase
;
import
com.zhiwei.searchhotcrawler.bean.WeiBoMassage
;
import
com.zhiwei.searchhotcrawler.config.DBConfig
;
import
com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate
;
import
lombok.extern.log4j.Log4j2
;
import
org.bson.Document
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Objects
;
/**
*微博信息入库
*/
@Log4j2
public
class
WeiBoMassageDao
{
public
static
MongoDatabase
mongoDatabase
=
MongoDBTemplate
.
getDB
(
DBConfig
.
dbName
);
public
static
MongoCollection
mongoCollection
;
public
WeiBoMassageDao
()
{
String
collName
=
DBConfig
.
weiBoMassageCollName
;
mongoCollection
=
mongoDatabase
.
getCollection
(
collName
);
//给数据表创建索引
MongoDBTemplate
.
createIndex
(
DBConfig
.
dbName
,
collName
);
}
/**
* 添加数据入库
* @param weiBoMassage
*/
public
void
addWeiBoMassage
(
WeiBoMassage
weiBoMassage
){
log
.
info
(
"weiBoMassage对象开始转document对象"
);
Document
document
=
new
Document
();
document
.
put
(
"_id"
,
weiBoMassage
.
getId
());
document
.
put
(
"userId"
,
weiBoMassage
.
getUserId
());
document
.
put
(
"text"
,
weiBoMassage
.
getText
());
document
.
put
(
"userName"
,
weiBoMassage
.
getUserName
());
document
.
put
(
"mid"
,
weiBoMassage
.
getMid
());
document
.
put
(
"creatTime"
,
weiBoMassage
.
getCreatTime
());
if
(
Objects
.
nonNull
(
weiBoMassage
.
getEditTime
())){
document
.
put
(
"editTime"
,
weiBoMassage
.
getEditTime
());
}
document
.
put
(
"cardType"
,
weiBoMassage
.
getCardType
());
document
.
put
(
"showType"
,
weiBoMassage
.
getShowType
());
document
.
put
(
"repostCount"
,
weiBoMassage
.
getRepostCount
());
document
.
put
(
"commentCount"
,
weiBoMassage
.
getCommentCount
());
document
.
put
(
"attitudeCount"
,
weiBoMassage
.
getAttitudeCount
());
if
(
Objects
.
nonNull
(
weiBoMassage
.
getPlayCount
())){
document
.
put
(
"playCount"
,
weiBoMassage
.
getPlayCount
());
}
if
(
weiBoMassage
.
getPictureUrlList
().
size
()!=
0
){
document
.
put
(
"pictureUrlList"
,
weiBoMassage
.
getPictureUrlList
());
}
document
.
put
(
"source"
,
weiBoMassage
.
getSource
());
document
.
put
(
"type"
,
weiBoMassage
.
getType
());
document
.
put
(
"topic"
,
weiBoMassage
.
getTopic
());
document
.
put
(
"forward"
,
weiBoMassage
.
getForward
());
if
(
0
!=
weiBoMassage
.
getForward
()){
document
.
put
(
"root_mid"
,
weiBoMassage
.
getRoot_mid
());
document
.
put
(
"root_id"
,
weiBoMassage
.
getRoot_id
());
document
.
put
(
"root_name"
,
weiBoMassage
.
getRoot_name
());
document
.
put
(
"root_text"
,
weiBoMassage
.
getRoot_text
());
document
.
put
(
"root_source"
,
weiBoMassage
.
getRoot_source
());
}
log
.
info
(
"weiBoMassage对象转document对象完成"
);
try
{
mongoCollection
.
insertOne
(
document
);
log
.
info
(
"数据插入成功"
);
}
catch
(
Exception
e
)
{
log
.
error
(
"存储数据时出错,错误为:{}"
,
e
);
}
}
}
src/main/java/com/zhiwei/searchhotcrawler/dao/WeiBoUserDao.java
0 → 100644
View file @
8d85b0c2
package
com
.
zhiwei
.
searchhotcrawler
.
dao
;
import
com.mongodb.client.MongoCollection
;
import
com.mongodb.client.MongoDatabase
;
import
com.zhiwei.searchhotcrawler.bean.WeiBoMassage
;
import
com.zhiwei.searchhotcrawler.bean.WeiBoUser
;
import
com.zhiwei.searchhotcrawler.config.DBConfig
;
import
com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate
;
import
lombok.extern.log4j.Log4j2
;
import
org.bson.Document
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Objects
;
@Log4j2
public
class
WeiBoUserDao
{
public
static
MongoDatabase
mongoDatabase
=
MongoDBTemplate
.
getDB
(
DBConfig
.
dbName
);
public
static
MongoCollection
mongoCollection
;
public
WeiBoUserDao
()
{
String
collName
=
DBConfig
.
weiBoUserCollName
;
mongoCollection
=
mongoDatabase
.
getCollection
(
collName
);
//给数据表创建索引
MongoDBTemplate
.
createIndex
(
DBConfig
.
dbName
,
collName
);
}
/**
* 添加数据入库
* @param weiBoUser
*/
public
void
addWeiBoUser
(
WeiBoUser
weiBoUser
){
log
.
info
(
"WeiBoUser对象开始转document对象"
);
Document
document
=
new
Document
();
document
.
put
(
"_id"
,
weiBoUser
.
getId
());
document
.
put
(
"userId"
,
weiBoUser
.
getUserId
());
if
(
Objects
.
nonNull
(
weiBoUser
.
getAttestationMassage
())){
document
.
put
(
"attestationMassage"
,
weiBoUser
.
getAttestationMassage
());
}
document
.
put
(
"userName"
,
weiBoUser
.
getUserName
());
document
.
put
(
"topic"
,
weiBoUser
.
getTopic
());
document
.
put
(
"time"
,
weiBoUser
.
getTime
());
document
.
put
(
"followerCount"
,
weiBoUser
.
getFollowerCount
());
log
.
info
(
"WeiBoUser对象转document对象完成"
);
try
{
mongoCollection
.
insertOne
(
document
);
log
.
info
(
"数据插入成功"
);
}
catch
(
Exception
e
)
{
log
.
error
(
"存储数据时出错,错误为:{}"
,
e
);
}
}
}
src/main/java/com/zhiwei/searchhotcrawler/test/HotSearch36KrCrawlerTest.java
0 → 100644
View file @
8d85b0c2
package
com
.
zhiwei
.
searchhotcrawler
.
test
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.http.HttpEntity
;
import
org.apache.http.client.methods.CloseableHttpResponse
;
import
org.apache.http.client.methods.HttpGet
;
import
org.apache.http.impl.client.CloseableHttpClient
;
import
org.apache.http.impl.client.HttpClients
;
import
org.apache.http.util.EntityUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
java.time.Duration
;
import
java.util.*
;
/**
* @author ll
* @ClassName:HotSearch36KrCrawler
* @Description:
* @date 2021年5月21日 上午11:54:31
*/
@Log4j2
public
class
HotSearch36KrCrawlerTest
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
/**
* @return void 返回类型
* @Title: hotSearch36KrCrawler
* @author hero
* @Description: PC端36Kr人气榜采集
*/
public
static
List
<
HotSearchList
>
hotSearch36Kr
(
Date
date
)
{
String
url
=
"https://www.36kr.com/hot-list/catalog"
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
log
.
error
(
"解析36Kr人气榜时出现解析错误,页面结构有问题"
,
e
);
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"article-list"
))
{
return
ansysData
(
htmlBody
,
date
);
}
else
{
log
.
info
(
"解析36Kr人气榜时出现解析错误,页面结构有问题"
);
}
return
Collections
.
emptyList
();
}
// public static List<HotSearchList> hotSearch36Kr(Date date) {
// String url = "https://www.36kr.com/hot-list/catalog";
// //建立一个新的客户端请求(创建HttpClient对象)
// CloseableHttpClient httpClient = HttpClients.createDefault();
// //创建请求对象实例
// HttpGet httpGet = new HttpGet(url);
// httpGet.addHeader("User-Agent", "spider");
// //获取响应的结果
// CloseableHttpResponse response = null;
// try {
// //调用HttpClient对象的execute方法发送请求
// response = httpClient.execute(httpGet);
//
// if (Objects.nonNull(response)) {
// //获取HttpEntity对象其中包含了响应内容(响应头)
// HttpEntity entity = response.getEntity();
//
// String htmlBody = EntityUtils.toString(entity);
// return ansysData(htmlBody,date);
// }
// } catch (Exception e) {
// e.printStackTrace();
// }
// return Collections.emptyList();
// }
/**
* 解析数据
*
* @param htmlBody
* @return
*/
private
static
List
<
HotSearchList
>
ansysData
(
String
htmlBody
,
Date
date
)
{
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
String
webSite
=
"https://www.36kr.com"
;
try
{
Document
document
=
Jsoup
.
parse
(
htmlBody
);
Elements
elements
=
document
.
select
(
"div.article-list"
).
first
().
select
(
"div.article-wrapper"
);
if
(
Objects
.
nonNull
(
elements
)
&&
!
elements
.
isEmpty
())
{
// 获取排名rank
int
rank
=
0
;
for
(
Element
element
:
elements
)
{
try
{
rank
++;
// 获取关键词(String)
String
keyWord
=
element
.
select
(
"p.title-wrapper"
).
select
(
"a.article-item-title"
).
text
();
// logger.info("关键词:{}", kw);
// 获取关键词相关链接everurl(String)
String
everurl
=
element
.
select
(
"p.title-wrapper"
).
select
(
"a.article-item-title"
).
attr
(
"href"
);
// 获取搜索指数count(int)
String
url
=
webSite
+
everurl
;
String
hot
=
null
;
// 判断热度值所在的规则是否为null
if
(!
element
.
select
(
"span"
).
isEmpty
())
{
hot
=
element
.
select
(
"span"
).
text
();
}
Long
count
=
0L
;
// 判断hot是否为空
if
(
StringUtils
.
isNotBlank
(
hot
))
{
String
[]
hots
=
hot
.
split
(
"热度"
);
String
trim
=
hots
[
1
].
trim
();
Double
num
=
Double
.
valueOf
(
trim
);
count
=
Math
.
round
(
num
);
}
if
(
Objects
.
nonNull
(
rank
))
{
if
(
count
==
0
)
{
log
.
info
(
htmlBody
);
log
.
info
(
hot
);
log
.
info
(
element
);
}
else
{
HotSearchList
hotSearch
=
new
HotSearchList
(
url
,
keyWord
,
count
,
rank
,
HotSearchType
.
人气榜
36
氪
.
name
(),
date
);
list
.
add
(
hotSearch
);
}
}
}
catch
(
Exception
e
)
{
log
.
error
(
"解析36Kr人气榜时出现解析错误"
,
e
);
}
}
}
}
catch
(
Exception
e
)
{
log
.
error
(
"解析36Kr人气榜时出现解析错误,数据不是json结构"
,
e
);
}
return
list
;
}
}
src/main/java/com/zhiwei/searchhotcrawler/test/HuXiuHotSearchCrawlerTest.java
0 → 100644
View file @
8d85b0c2
package
com
.
zhiwei
.
searchhotcrawler
.
test
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.http.HttpEntity
;
import
org.apache.http.client.methods.CloseableHttpResponse
;
import
org.apache.http.client.methods.HttpGet
;
import
org.apache.http.impl.client.CloseableHttpClient
;
import
org.apache.http.impl.client.HttpClients
;
import
org.apache.http.util.EntityUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
java.time.Duration
;
import
java.util.*
;
/**
* @author: ll
* @ClassName: HuXiuHotSearchCrawler
* @Description: pc端虎嗅热文推荐采集
* @date: 2021年5月24日 下午16:35:31
* @Title: HuXiuHotSearchCrawler
*/
@Log4j2
public
class
HuXiuHotSearchCrawlerTest
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
throwException
(
false
).
retryTimes
(
3
).
connectTimeout
(
Duration
.
ofSeconds
(
60
)).
build
();
public
static
List
<
HotSearchList
>
HuXiuHotArticleRecommended
(
Date
date
){
String
url
=
"https://www.huxiu.com/"
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
log
.
error
(
"解析虎嗅热文推荐时出现解析错误,页面结构有问题"
,
e
);
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"hot__list"
))
{
return
ansysData
(
htmlBody
,
date
);
}
else
{
log
.
info
(
"解析虎嗅热文推荐时出现解析错误,页面结构有问题"
);
}
return
Collections
.
emptyList
();
}
// String url="https://www.huxiu.com/";
// //创建客户端请求对象
// CloseableHttpClient httpClient = HttpClients.createDefault();
// //创建请求对象实例
// HttpGet httpGet = new HttpGet(url);
// //设置头信息
// httpGet.addHeader("User-Agent","spider");
//
// //获取响应结果
// try {
// CloseableHttpResponse response = httpClient.execute(httpGet);
// //判断响应结果是否为空
// if (Objects.nonNull(response)) {
// //获取HttpEntity对象其中包含了响应内容(响应头)
// HttpEntity entity = response.getEntity();
// String htmlBody = EntityUtils.toString(entity);
// return ansysData(htmlBody,date);
// }
// } catch (Exception e) {
// e.printStackTrace();
// }
//
// return Collections.emptyList();
// }
//解析页面数据
private
static
List
<
HotSearchList
>
ansysData
(
String
htmlBody
,
Date
date
)
{
ArrayList
<
HotSearchList
>
list
=
new
ArrayList
<>();
String
webSite
=
"https://www.huxiu.com"
;
try
{
//获取Document文档对象
Document
document
=
Jsoup
.
parse
(
htmlBody
);
//获取元素集合
Elements
elements
=
document
.
select
(
"div.hot__list"
).
select
(
"div.focus-item"
);
if
(
Objects
.
nonNull
(
elements
)
&&
!
elements
.
isEmpty
()){
// 获取排名rank
Integer
rank
=
0
;
for
(
Element
element
:
elements
)
{
try
{
rank
++;
//获取关键词
String
keyWord
=
element
.
select
(
"p"
).
text
();
//获取关键词相关链接
String
href
=
element
.
select
(
"a.focus-item__left"
).
attr
(
"href"
);
String
url
=
webSite
+
href
;
//获取讨论量
String
comment
=
element
.
select
(
"i"
).
first
().
text
();
Long
commentCount
=
Long
.
valueOf
(
comment
);
String
topicLead
=
null
;
long
count
=
0L
;
HotSearchList
hotSearchList
=
new
HotSearchList
(
url
,
keyWord
,
count
,
rank
,
HotSearchType
.
虎嗅热文推荐
.
name
(),
commentCount
,
topicLead
,
date
);
list
.
add
(
hotSearchList
);
}
catch
(
NumberFormatException
e
)
{
log
.
error
(
"解析虎嗅热文推荐时出现解析错误"
,
e
);
}
}
}
}
catch
(
Exception
e
)
{
log
.
error
(
"解析虎嗅热文推荐时出现解析错误,数据不是json结构"
,
e
);
}
return
list
;
}
}
src/main/java/com/zhiwei/searchhotcrawler/timer/quartz/GatherTimer.java
View file @
8d85b0c2
...
...
@@ -8,6 +8,8 @@ import com.zhiwei.searchhotcrawler.crawler.*;
import
com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO
;
import
com.zhiwei.searchhotcrawler.dao.RedisDao
;
import
com.zhiwei.searchhotcrawler.dao.WeiboSuperTopicDAO
;
import
com.zhiwei.searchhotcrawler.crawler.HotSearch36KrCrawler
;
import
com.zhiwei.searchhotcrawler.crawler.HuXiuHotSearchCrawler
;
import
com.zhiwei.searchhotcrawler.timer.TouTiaoExecutor
;
import
com.zhiwei.searchhotcrawler.util.DateUtils
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
...
...
@@ -38,6 +40,30 @@ public class GatherTimer {
/** 知乎时事子分类 */
private
String
DEPTH
=
"depth"
;
/**
* 虎嗅热文推荐的采集
*/
@Async
(
value
=
"myScheduler"
)
@Scheduled
(
cron
=
"0 * * * * ?"
)
public
void
crawlerHuXiu
()
{
logger
.
info
(
"虎嗅热文推荐开始采集..."
);
Date
date
=
DateUtils
.
getMillSecondTime
(
new
Date
());
List
<
HotSearchList
>
huXiuList
=
HuXiuHotSearchCrawler
.
HuXiuHotArticleRecommended
(
date
);
logger
.
info
(
"{}, 虎嗅热文推荐此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
huXiuList
!=
null
?
huXiuList
.
size
()
:
0
));
TipsUtils
.
addHotList
(
HotSearchType
.
虎嗅热文推荐
.
name
(),
huXiuList
);
logger
.
info
(
"虎嗅热文推荐采集结束..."
);
/**
* 36氪人气榜的采集
*/
logger
.
info
(
"36氪人气榜开始采集..."
);
List
<
HotSearchList
>
list36Kr
=
HotSearch36KrCrawler
.
hotSearch36Kr
(
date
);
logger
.
info
(
"{}, 36氪人气榜此轮采集到的数据量为:{}"
,
new
Date
(),
Integer
.
valueOf
(
list36Kr
!=
null
?
list36Kr
.
size
()
:
0
));
TipsUtils
.
addHotList
(
HotSearchType
.
人气榜
36
氪
.
name
(),
list36Kr
);
logger
.
info
(
"36氪人气榜采集结束..."
);
}
/**
* 微博热搜的采集
*/
...
...
src/test/java/weiboTest/WeiboHotSearchTest.java
View file @
8d85b0c2
...
...
@@ -4,12 +4,19 @@ package weiboTest;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyFactory
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.config.RedisConfig
;
import
com.zhiwei.searchhotcrawler.bean.WeiBoMassage
;
import
com.zhiwei.searchhotcrawler.bean.WeiBoUser
;
import
com.zhiwei.searchhotcrawler.config.ProxyConfig
;
import
com.zhiwei.searchhotcrawler.dao.WeiBoMassageDao
;
import
com.zhiwei.searchhotcrawler.dao.WeiBoUserDao
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
...
...
@@ -20,9 +27,14 @@ import org.junit.Test;
import
org.junit.runner.RunWith
;
import
org.springframework.test.context.ContextConfiguration
;
import
org.springframework.test.context.junit4.SpringJUnit4ClassRunner
;
import
java.io.IOException
;
import
java.text.ParseException
;
import
java.text.SimpleDateFormat
;
import
java.util.*
;
import
java.util.concurrent.TimeUnit
;
import
static
java
.
util
.
Objects
.
nonNull
;
/**
* @author cwt
...
...
@@ -31,69 +43,161 @@ import java.util.*;
@Log4j2
@RunWith
(
SpringJUnit4ClassRunner
.
class
)
@ContextConfiguration
(
locations
=
{
"classpath:applicationContext.xml"
})
public
class
WeiboHotSearchTest
{
{
"classpath:applicationContext.xml"
})
public
class
WeiboHotSearchTest
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
@Test
public
void
test
(){
Document
document
=
Jsoup
.
parse
(
"a href=\"https://m.weibo.cn/search?containerid=231522type%3D1%26t%3D10%26q%3D%23%E9%82%93%E4%BC%A6%E8%AE%B2%E6%88%8F%E4%B8%93%E4%B8%9A%23&extparam=%23%E9%82%93%E4%BC%A6%E8%AE%B2%E6%88%8F%E4%B8%93%E4%B8%9A%23&luicode=10000011&lfid=100103type%3D1%26t%3D10%26q%3D%23%E9%82%93%E4%BC%A6%E8%AE%B2%E6%88%8F%E4%B8%93%E4%B8%9A%23\" data-hide=\"\"><span class=\"surl-text\">#邓伦讲戏专业#</span></a><a href=\"https://m.weibo.cn/search?containerid=231522type%3D1%26t%3D10%26q%3D%23%E6%9E%81%E9%99%90%E6%8C%91%E6%88%98%23&luicode=10000011&lfid=100103type%3D1%26t%3D10%26q%3D%23%E9%82%93%E4%BC%A6%E8%AE%B2%E6%88%8F%E4%B8%93%E4%B8%9A%23\" data-hide=\"\"><span class=\"surl-text\">#极限挑战#</span></a> <a href='/n/邓伦'>@邓伦</a> 和<a href='/n/景甜'>@景甜</a> 改编《甄嬛传》剧本,伦伦认真讲戏的样子让人瞬间穿越到拍摄现场。看来戏瘾上身的邓伦还过了一把导演的瘾,这专业的模样要不要考虑跨界当当导演呀~<span class=\"url-icon\"><img alt=[哈哈] src=\"https://h5.sinaimg.cn/m/emoticon/icon/default/d_haha-0ec05e6dad.png\" style=\"width:1em; height:1em;\" /></span><a data-url=\"http://t.cn/A6VJPN9w\" href=\"https://video.weibo.com/show?fid=1034:4640837901156490\" data-hide=\"\"><span class='url-icon'><img style='width: 1rem;height: 1rem' src='https://h5.sinaimg.cn/upload/2015/09/25/3/timeline_card_small_video_default.png'></span><span class=\"surl-text\">东方卫视极限挑战的微博视频</span></a>"
);
public
void
test
()
{
Document
document
=
Jsoup
.
parse
(
"<a href=\\\"https://m.weibo.cn/search?containerid=231522type%3D1%26t%3D10%26q%3D%23%E5%91%A8%E6%9F%AF%E5%AE%87%E7%88%B8%E7%88%B8%23&extparam=%23%E5%91%A8%E6%9F%AF%E5%AE%87%E7%88%B8%E7%88%B8%23&luicode=10000011&lfid=231522type%3D1%26q%3D%23%E5%91%A8%E6%9F%AF%E5%AE%87%E7%88%B8%E7%88%B8%23\\\" data-hide=\\\"\\\"><span class=\\\"surl-text\\\">#周柯宇爸爸#</span></a> \uD83E\uDDD0<a href=\\\"https://m.weibo.cn/search?containerid=231522type%3D1%26t%3D10%26q%3D%23%E5%91%A8%E6%9F%AF%E5%AE%87%E4%BC%A0%E9%94%80%E4%B9%8B%E5%AD%90%23&extparam=%23%E5%91%A8%E6%9F%AF%E5%AE%87%E4%BC%A0%E9%94%80%E4%B9%8B%E5%AD%90%23&luicode=10000011&lfid=231522type%3D1%26q%3D%23%E5%91%A8%E6%9F%AF%E5%AE%87%E7%88%B8%E7%88%B8%23\\\" data-hide=\\\"\\\"><span class=\\\"surl-text\\\">#周柯宇传销之子#</span></a> <br />周柯宇粉丝今天懂法了吗?没有我一会再来普法。周柯宇粉丝为传销洗地,周柯宇偶像失格,周柯宇粉丝素质低下,道德沦丧 \"\n"
);
System
.
out
.
println
(
document
.
text
());
}
/** Prints whether an HTML-looking fragment starts with {@code '<'}. */
@Test
public void test1() {
    final String fragment = "<a href";
    boolean startsWithTag = fragment.startsWith("<");
    System.out.println(startsWithTag);
}
@Test
public
void
testHotWeibo
(){
public
void
testHotWeibo
()
{
SimpleConfig
simpleConfig
=
SimpleConfig
.
builder
().
registry
(
ProxyConfig
.
registry
)
.
group
(
ProxyConfig
.
group
).
appId
(
10000013
).
appName
(
"hotsearch"
).
build
();
ProxyFactory
.
init
(
simpleConfig
);
Date
date
=
new
Date
();
while
(
true
)
{
try
{
Date
date
=
new
Date
();
List
<
HotSearchList
>
hotSearchLists
=
weiboHotSearchByPhone
(
date
);
for
(
HotSearchList
hotSearchList
:
hotSearchLists
)
{
try
{
org
.
bson
.
Document
document
=
new
org
.
bson
.
Document
();
//System.out.println(hotSearchList);
document
.
put
(
"url"
,
hotSearchList
.
getUrl
());
document
.
put
(
"name"
,
hotSearchList
.
getName
());
test12
(
document
);
}
catch
(
Exception
e
)
{
log
.
info
(
"数据解析异常"
,
e
);
}
}
log
.
info
(
"本轮微博话题采集解析完毕"
);
log
.
info
(
hotSearchLists
.
size
());
TimeUnit
.
MINUTES
.
sleep
(
1
);
}
catch
(
Exception
e
)
{
log
.
info
(
"微博热搜采集异常"
,
e
);
ZhiWeiTools
.
sleep
(
60
*
60
*
1000
);
}
ZhiWeiTools
.
sleep
(
50
);
}
//
// Date date = new Date();
// List<HotSearchList> hotSearchLists = weiboHotSearchByPhone(date);
// for (HotSearchList hotSearchList : hotSearchLists) {
// System.out.println(hotSearchList);
// }
}
/**
 * Manual harness: downloads the m.weibo.cn container detail for one hot-search topic,
 * copies the headline figures into {@code document}, and stores the hot posts and users
 * found in the card list through the DAOs.
 *
 * <p>Not annotated with {@code @Test} because it takes a parameter; it is driven from
 * {@link #testHotWeibo()}. The request is attempted up to three times and processing stops
 * after the first response that contains a usable "data" object.
 *
 * @param document BSON document carrying the topic "name" and "url"; the keys topicLead,
 *                 readCount, discussCount, pictureUrl and downtext are added when the
 *                 response provides them
 */
// @Test
public void test12(org.bson.Document document) {
    SimpleConfig simpleConfig = SimpleConfig.builder()
            .registry(ProxyConfig.registry)
            .group(ProxyConfig.group)
            .appId(10000013)
            .appName("hotsearch")
            .build();
    ProxyFactory.init(simpleConfig);

    String topic = document.getString("name");
    log.info("更新微博热搜{}导语阅读量和讨论量", topic);

    // The API query is the first parameter of the topic URL: everything between '?' and the
    // first '&'. When there is no '&' the rest of the URL is used instead of failing.
    String topicUrl = document.getString("url");
    int queryStart = topicUrl.indexOf("?") + 1;
    int queryEnd = topicUrl.indexOf("&");
    if (queryEnd < queryStart) {
        queryEnd = topicUrl.length();
    }
    String url = "https://m.weibo.cn/api/container/getIndex?" + topicUrl.substring(queryStart, queryEnd);

    String htmlBody = null;
    Request request = RequestUtils.wrapGet(url);
    for (int count = 0; count <= 2; count++) {
        try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
            htmlBody = response.body().string();
        } catch (IOException e) {
            log.error("解析微博热搜详情页面时出现连接失败", e);
        }
        if (htmlBody == null || !htmlBody.contains("data")) {
            continue;
        }
        JSONObject dataJson = JSONObject.parseObject(htmlBody).getJSONObject("data");
        if (dataJson == null) {
            // "data" appeared in the body text but not as a top-level object; try again.
            continue;
        }

        // cardlistInfo: lead text, read count, discussion count, portrait, host.
        JSONObject cardlistInfoJson = dataJson.getJSONObject("cardlistInfo");
        if (cardlistInfoJson != null) {
            if (cardlistInfoJson.containsKey("desc")) {
                String topicLead = cardlistInfoJson.getString("desc");
                if (!"".equals(topicLead)) {
                    document.put("topicLead", topicLead);
                }
            }
            if (cardlistInfoJson.containsKey("cardlist_head_cards")) {
                JSONObject readJson = cardlistInfoJson.getJSONArray("cardlist_head_cards").getJSONObject(0);
                if (readJson.containsKey("head_data")) {
                    JSONObject headData = readJson.getJSONObject("head_data");
                    // midtext carries both figures: "阅读<reads> 讨论<discussions> 详情…".
                    String midText = headData.getString("midtext");
                    String read = midText.replaceAll("阅读", "").replaceAll("讨论.*", "").trim();
                    String discussCount = midText.replaceAll(".*讨论", "").replaceAll("详情.*", "").trim();
                    String pictureUrl = headData.getString("portrait_url");
                    document.put("readCount", TipsUtils.getHotCount(read));
                    document.put("discussCount", TipsUtils.getHotCount(discussCount));
                    document.put("pictureUrl", pictureUrl);
                    if (headData.containsKey("downtext")) {
                        String downtext = headData.getString("downtext");
                        if (!"".equals(downtext)) {
                            document.put("downtext", downtext.replaceAll("主持人:", ""));
                        }
                    }
                }
            }
        }

        // cards: hot posts and people related to the topic.
        WeiBoMassageDao weiBoMassageDao = new WeiBoMassageDao();
        JSONArray cards = dataJson.getJSONArray("cards");
        for (int i = 0; cards != null && i < cards.size(); i++) {
            JSONObject card = cards.getJSONObject(i);
            if (card == null || card.isEmpty()) {
                log.info("获取数据失败");
                continue;
            }
            if (card.containsKey("mblog")) {
                if (card.getJSONObject("mblog").containsKey("title")) {
                    WeiBoMassage weiBoMassage = analysisWeiboMBlog(card, topic);
                    if (weiBoMassage != null) {
                        weiBoMassageDao.addWeiBoMassage(weiBoMassage);
                    }
                }
            } else if (card.containsKey("card_group")) {
                JSONArray cardGroup = card.getJSONArray("card_group");
                WeiBoMassage weiBoMassage = analysisWeiboMassage(cardGroup, topic);
                if (weiBoMassage != null) {
                    weiBoMassageDao.addWeiBoMassage(weiBoMassage);
                }
                analysisWeiBoUsers(cardGroup, topic);
            }
        }
        return;
    }
}
/**
* 微博热搜数据更新导语,阅读量,讨论量
*
* @param document
* @return
*/
public
static
org
.
bson
.
Document
weiboUpdate
(
org
.
bson
.
Document
document
)
{
log
.
info
(
"更新微博热搜{}导语阅读量和讨论量"
,
document
.
getString
(
"name"
));
String
url
=
"https://m.weibo.cn/api/container/getIndex?"
+
document
.
getString
(
"url"
).
substring
(
document
.
getString
(
"url"
).
indexOf
(
"?"
)
+
1
,
document
.
getString
(
"url"
).
indexOf
(
"&"
));
log
.
info
(
"更新微博热搜{}导语阅读量和讨论量"
,
document
.
getString
(
"name"
));
String
url
=
"https://m.weibo.cn/api/container/getIndex?"
+
document
.
getString
(
"url"
).
substring
(
document
.
getString
(
"url"
).
indexOf
(
"?"
)
+
1
,
document
.
getString
(
"url"
).
indexOf
(
"&"
));
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
count
=
0
;
count
<=
5
;
count
++)
{
for
(
int
count
=
0
;
count
<=
2
;
count
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
...
...
@@ -104,13 +208,13 @@ public class WeiboHotSearchTest{
JSONObject
cardlistInfoJson
=
dataJson
.
getJSONObject
(
"cardlistInfo"
);
List
<
JSONObject
>
cardsJsons
=
(
List
<
JSONObject
>)
dataJson
.
get
(
"cards"
);
//解析cardlistInfo,讨论、导语、阅读
if
(
cardlistInfoJson
.
containsKey
(
"desc"
))
{
if
(
cardlistInfoJson
.
containsKey
(
"desc"
))
{
String
topicLead
=
cardlistInfoJson
.
getString
(
"desc"
);
if
(!
""
.
equals
(
topicLead
))
{
if
(!
""
.
equals
(
topicLead
))
{
document
.
put
(
"topicLead"
,
topicLead
);
}
}
if
(
cardlistInfoJson
.
containsKey
(
"cardlist_head_cards"
))
{
if
(
cardlistInfoJson
.
containsKey
(
"cardlist_head_cards"
))
{
JSONObject
readJson
=
cardlistInfoJson
.
getJSONArray
(
"cardlist_head_cards"
).
getJSONObject
(
0
);
if
(
readJson
.
containsKey
(
"head_data"
))
{
String
midText
=
readJson
.
getJSONObject
(
"head_data"
).
getString
(
"midtext"
);
...
...
@@ -119,63 +223,304 @@ public class WeiboHotSearchTest{
String
pictureUrl
=
readJson
.
getJSONObject
(
"head_data"
).
getString
(
"portrait_url"
);
document
.
put
(
"readCount"
,
TipsUtils
.
getHotCount
(
read
));
document
.
put
(
"discussCount"
,
TipsUtils
.
getHotCount
(
discussCount
));
document
.
put
(
"pictureUrl"
,
pictureUrl
);
if
(
readJson
.
getJSONObject
(
"head_data"
).
containsKey
(
"downtext"
)){
document
.
put
(
"pictureUrl"
,
pictureUrl
);
if
(
readJson
.
getJSONObject
(
"head_data"
).
containsKey
(
"downtext"
))
{
String
downtext
=
readJson
.
getJSONObject
(
"head_data"
).
getString
(
"downtext"
);
if
(!
""
.
equals
(
downtext
))
{
document
.
put
(
"downtext"
,
downtext
.
replaceAll
(
"主持人:"
,
""
));
if
(!
""
.
equals
(
downtext
))
{
document
.
put
(
"downtext"
,
downtext
.
replaceAll
(
"主持人:"
,
""
));
}
}
}
}
//调用weiBoMassageDao添加数据
WeiBoMassageDao
weiBoMassageDao
=
new
WeiBoMassageDao
();
//解析cards,获取热门微博、人物
for
(
JSONObject
jsonObject
:
cardsJsons
)
{
if
(
nonNull
(
jsonObject
)
&&
!
jsonObject
.
isEmpty
())
{
if
(
jsonObject
.
containsKey
(
"mblog"
))
{
if
(
jsonObject
.
getJSONObject
(
"mblog"
).
containsKey
(
"title"
))
{
WeiBoMassage
weiBoMassage
=
analysisWeiboMBlog
(
jsonObject
,
document
.
getString
(
"name"
));
if
(
Objects
.
nonNull
(
weiBoMassage
))
{
weiBoMassageDao
.
addWeiBoMassage
(
weiBoMassage
);
}
}
}
else
if
(
jsonObject
.
containsKey
(
"card_group"
))
{
JSONArray
cardGroup
=
jsonObject
.
getJSONArray
(
"card_group"
);
WeiBoMassage
weiBoMassage
=
analysisWeiboMassage
(
cardGroup
,
document
.
getString
(
"name"
));
if
(
Objects
.
nonNull
(
weiBoMassage
))
{
weiBoMassageDao
.
addWeiBoMassage
(
weiBoMassage
);
}
analysisWeiBoUsers
(
cardGroup
,
document
.
getString
(
"name"
));
}
}
else
{
log
.
info
(
"获取数据失败"
);
}
}
return
document
;
}
}
return
null
;
}
/**
 * Finds the first card in a card group that carries a titled post and parses it.
 *
 * @param cardGroup the "card_group" array of one card
 * @param topic     hot-search topic the cards belong to
 * @return the parsed post, or {@code null} when no card has an "mblog" with a "title"
 */
public static WeiBoMassage analysisWeiboMassage(JSONArray cardGroup, String topic) {
    int total = cardGroup.size();
    for (int index = 0; index < total; index++) {
        JSONObject card = cardGroup.getJSONObject(index);
        if (!card.containsKey("mblog")) {
            continue;
        }
        if (!card.getJSONObject("mblog").containsKey("title")) {
            continue;
        }
        return analysisWeiboMBlog(card, topic);
    }
    return null;
}
/**
 * Extracts the users listed in a card group and stores them through {@link WeiBoUserDao}.
 *
 * <p>Cards with {@code card_type} 3 carry a "users" array; cards with {@code card_type} 10
 * carry a single "user" object. Other cards are ignored. All users stored by one call share
 * the same timestamp.
 *
 * @param cardGroup the "card_group" array of one card
 * @param topic     hot-search topic the users are related to
 */
public static void analysisWeiBoUsers(JSONArray cardGroup, String topic) {
    WeiBoUserDao weiBoUserDao = new WeiBoUserDao();
    Date date = new Date();
    for (int i = 0; i < cardGroup.size(); i++) {
        JSONObject card = cardGroup.getJSONObject(i);
        String cardType = card.getString("card_type");
        if ("3".equals(cardType)) {
            JSONArray users = card.getJSONArray("users");
            for (int j = 0; users != null && j < users.size(); j++) {
                storeWeiBoUser(users.getJSONObject(j), topic, date, weiBoUserDao);
            }
        } else if ("10".equals(cardType)) {
            JSONObject user = card.getJSONObject("user");
            if (user != null) {
                storeWeiBoUser(user, topic, date, weiBoUserDao);
            }
        }
    }
}

/** Builds a {@link WeiBoUser} from one user JSON object and stores it; skips it when the follower count is unreadable. */
private static void storeWeiBoUser(JSONObject user, String topic, Date date, WeiBoUserDao weiBoUserDao) {
    Long followerCount = parseFollowerCount(user.getString("followers_count"));
    if (followerCount == null) {
        log.info("未采集到用户信息");
        return;
    }
    String userId = user.getString("id");
    String userName = user.getString("screen_name");
    // Verification text; may be absent for unverified accounts.
    String attestationMassage = user.getString("verified_reason");
    WeiBoUser weiBoUser = new WeiBoUser(userId, attestationMassage, userName, topic, date, followerCount);
    weiBoUserDao.addWeiBoUser(weiBoUser);
}

/**
 * Converts a follower count such as "1234", "12.3万" or "1.2亿" to a number.
 * Returns {@code null} when the text is missing or not numeric.
 */
private static Long parseFollowerCount(String raw) {
    if (raw == null) {
        return null;
    }
    String text = raw.trim();
    long unit = 1L;
    int unitIndex = text.indexOf("亿");
    if (unitIndex >= 0) {
        // presumably the API also abbreviates hundreds of millions this way — confirm against real responses
        unit = 100_000_000L;
    } else {
        unitIndex = text.indexOf("万");
        if (unitIndex >= 0) {
            unit = 10_000L;
        }
    }
    if (unitIndex >= 0) {
        text = text.substring(0, unitIndex).trim();
    }
    try {
        // BigDecimal keeps decimal abbreviations ("12.3万") exact before scaling.
        return new java.math.BigDecimal(text).multiply(java.math.BigDecimal.valueOf(unit)).longValue();
    } catch (NumberFormatException e) {
        return null;
    }
}
public
JSONObject
analysisWeiboSon
(
JSONObject
readJson
){
/**
* 解析微博类型
*
* @param jsonObject
* @param topic
* @return
*/
public
static
WeiBoMassage
analysisWeiboMBlog
(
JSONObject
jsonObject
,
String
topic
)
{
JSONObject
mblog
=
jsonObject
.
getJSONObject
(
"mblog"
);
String
type
=
mblog
.
getJSONObject
(
"title"
).
getString
(
"text"
);
String
card_type
=
jsonObject
.
getString
(
"card_type"
);
Integer
cardType
=
Integer
.
valueOf
(
card_type
);
String
show_type
=
jsonObject
.
getString
(
"show_type"
);
Integer
showType
=
Integer
.
valueOf
(
show_type
);
//点赞数
String
attitudes_count
=
mblog
.
getString
(
"attitudes_count"
);
Long
attitudeCount
=
null
;
if
(
attitudes_count
.
contains
(
"万"
))
{
String
[]
split
=
attitudes_count
.
split
(
"万"
);
attitudeCount
=
Long
.
valueOf
(
split
[
0
])
*
10000
;
}
else
{
attitudeCount
=
Long
.
valueOf
(
attitudes_count
);
}
//评论数
String
comments_count
=
mblog
.
getString
(
"comments_count"
);
Long
commentCount
=
null
;
if
(
comments_count
.
contains
(
"万"
))
{
String
[]
split
=
comments_count
.
split
(
"万"
);
commentCount
=
Long
.
valueOf
(
split
[
0
])
*
10000
;
}
else
{
commentCount
=
Long
.
valueOf
(
comments_count
);
}
//转发数
String
reposts_count
=
mblog
.
getString
(
"reposts_count"
);
Long
repostCount
=
null
;
if
(
reposts_count
.
contains
(
"万"
)){
String
[]
split
=
reposts_count
.
split
(
"万"
);
repostCount
=
Long
.
valueOf
(
split
[
0
])
*
10000
;
}
else
{
repostCount
=
Long
.
valueOf
(
reposts_count
);
}
Date
createTime
=
null
;
Date
editTime
=
null
;
return
null
;
try
{
SimpleDateFormat
simpleDateFormat
=
new
SimpleDateFormat
(
"EEE MMM dd HH:mm:ss z yyyy"
,
java
.
util
.
Locale
.
US
);
//创建时间
String
created_at
=
mblog
.
getString
(
"created_at"
);
createTime
=
simpleDateFormat
.
parse
(
created_at
);
//编辑时间
if
(
mblog
.
containsKey
(
"edit_at"
)){
String
edit_at
=
mblog
.
getString
(
"edit_at"
);
editTime
=
simpleDateFormat
.
parse
(
edit_at
);
}
}
catch
(
ParseException
e
)
{
log
.
error
(
"创建时间和编辑时间解析异常"
,
e
);
}
String
mid
=
mblog
.
getString
(
"mid"
);
//用户id
String
userId
=
mblog
.
getJSONObject
(
"user"
).
getString
(
"id"
);
//用户名
String
userName
=
mblog
.
getJSONObject
(
"user"
).
getString
(
"screen_name"
);
//来源
String
source
=
mblog
.
getString
(
"source"
);
//内容
String
content
=
null
;
if
(
mblog
.
getString
(
"text"
).
contains
(
"<"
))
{
String
text
=
mblog
.
getString
(
"text"
);
Document
parse
=
Jsoup
.
parse
(
text
);
content
=
parse
.
text
();
}
else
{
content
=
mblog
.
getString
(
"text"
);
}
WeiBoMassage
weiBoMassage
=
new
WeiBoMassage
(
userId
,
content
,
userName
,
mid
,
createTime
,
editTime
,
cardType
,
showType
,
repostCount
,
commentCount
,
attitudeCount
,
source
,
type
,
topic
);
//默认不转发为0
weiBoMassage
.
setForward
(
0
);
JSONObject
weiboJson
=
null
;
//微博实体 是否转发
if
(
mblog
.
containsKey
(
"retweeted_status"
))
{
weiboJson
=
mblog
.
getJSONObject
(
"retweeted_status"
);
//处理转发特有的
//weiBoMassage.set
//源mid
String
rootMid
=
weiboJson
.
getString
(
"mid"
);
//源来源
String
rootSource
=
weiboJson
.
getString
(
"source"
);
//源text
String
text
=
weiboJson
.
getString
(
"text"
);
//解析
Document
parse
=
Jsoup
.
parse
(
text
);
String
rootText
=
parse
.
text
();
//源用户id
String
rootId
=
weiboJson
.
getJSONObject
(
"user"
).
getString
(
"id"
);
//源用户名
String
rootName
=
weiboJson
.
getJSONObject
(
"user"
).
getString
(
"screen_name"
);
//数据保存到对象中
weiBoMassage
.
setRoot_mid
(
rootMid
);
weiBoMassage
.
setRoot_id
(
rootId
);
weiBoMassage
.
setRoot_source
(
rootSource
);
weiBoMassage
.
setRoot_text
(
rootText
);
weiBoMassage
.
setRoot_name
(
rootName
);
//转发为1
weiBoMassage
.
setForward
(
1
);
}
else
{
weiboJson
=
mblog
;
}
List
<
String
>
pictureUrlList
=
new
ArrayList
();
Long
playCount
=
null
;
//获取播放量和图片链接
if
(
weiboJson
.
getJSONArray
(
"pic_ids"
).
size
()
>
0
)
{
JSONArray
jsonArray
=
weiboJson
.
getJSONArray
(
"pics"
);
for
(
int
i
=
0
;
i
<
jsonArray
.
size
();
i
++)
{
String
picUrl
=
jsonArray
.
getJSONObject
(
i
).
getString
(
"url"
);
pictureUrlList
.
add
(
picUrl
);
}
}
else
if
(
weiboJson
.
containsKey
(
"page_info"
))
{
if
(
weiboJson
.
getJSONObject
(
"page_info"
).
containsKey
(
"play_count"
)){
String
play
=
weiboJson
.
getJSONObject
(
"page_info"
).
getString
(
"play_count"
);
if
(
play
.
contains
(
"万"
))
{
String
[]
split
=
play
.
split
(
"万"
);
playCount
=
Long
.
valueOf
(
split
[
0
])
*
10000
;
}
else
if
(
play
.
contains
(
"次"
)){
String
[]
split
=
play
.
split
(
"次"
);
playCount
=
Long
.
valueOf
(
split
[
0
]);
}
}
}
weiBoMassage
.
setPlayCount
(
playCount
);
weiBoMassage
.
setPictureUrlList
(
pictureUrlList
);
return
weiBoMassage
;
}
/**
* @Title: weiboHotSearchByPhoneTest
* @author hero
* @Description: Collects the Weibo hot-search list from the mobile (iPhone) endpoint.
* @return List<HotSearchList> the collected hot-search entries
*/
public
static
List
<
HotSearchList
>
weiboHotSearchByPhone
(
Date
date
){
public
static
List
<
HotSearchList
>
weiboHotSearchByPhone
(
Date
date
)
{
String
url
=
"https://m.weibo.cn/api/container/getIndex?containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot&title=%E5%BE%AE%E5%8D%9A%E7%83%AD%E6%90%9C&extparam=pos%3D0_0%26mi_cid%3D100103%26cate%3D10103%26filter_type%3Drealtimehot%26c_type%3D30&luicode=10000011&lfid=231583"
;
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>();
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>();
headerMap
.
put
(
"User-Agent"
,
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
);
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
for
(
int
count
=
0
;
count
<=
5
;
count
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
for
(
int
count
=
0
;
count
<=
5
;
count
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"解析微博时热搜时出现连接失败"
,
e
);
log
.
error
(
"解析微博时热搜时出现连接失败"
,
e
);
}
List
<
HotSearchList
>
result
=
new
ArrayList
<
HotSearchList
>();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
))
{
...
...
@@ -187,11 +532,11 @@ public class WeiboHotSearchTest{
try
{
JSONObject
card
=
cards
.
getJSONObject
(
0
);
JSONArray
cardGroup
=
card
.
getJSONArray
(
"card_group"
);
JSONObject
topCard
=
cardGroup
.
getJSONObject
(
0
);
if
(!
topCard
.
containsKey
(
"pic"
))
{
JSONObject
topCard
=
cardGroup
.
getJSONObject
(
0
);
if
(!
topCard
.
containsKey
(
"pic"
))
{
rank
=
1
;
}
if
(
Objects
.
nonNull
(
cardGroup
)
&&
!
cardGroup
.
isEmpty
())
{
if
(
nonNull
(
cardGroup
)
&&
!
cardGroup
.
isEmpty
())
{
// String title = card.getString("title");
boolean
hot
=
true
;
// if (Objects.nonNull(title) && title.contains("实时上升热点")) {
...
...
@@ -232,4 +577,5 @@ public class WeiboHotSearchTest{
return
Collections
.
emptyList
();
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment