Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
searchhotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
searchhotcrawler
Commits
b21d2070
Commit
b21d2070
authored
May 26, 2021
by
chenweitao
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
测试类及项目结构文件、爬虫核心包升级、热搜基础项目构造器更新
parent
a2bf4e4f
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
459 additions
and
9 deletions
+459
-9
pom.xml
+6
-0
searchhotcrawler.iml
+82
-0
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchList.java
+5
-5
src/main/java/com/zhiwei/searchhotcrawler/run/HotSearchRun.java
+1
-1
src/main/java/com/zhiwei/searchhotcrawler/test/Job51Test.java
+96
-0
src/main/resources/redis.properties
+8
-3
src/test/java/ObjectTest.java
+26
-0
src/test/java/weiboTest/WeiboHotSearchTest.java
+235
-0
No files found.
pom.xml
View file @
b21d2070
...
@@ -113,6 +113,12 @@
...
@@ -113,6 +113,12 @@
<artifactId>
jedis
</artifactId>
<artifactId>
jedis
</artifactId>
<version>
2.8.1
</version>
<version>
2.8.1
</version>
</dependency>
</dependency>
<dependency>
<groupId>
junit
</groupId>
<artifactId>
junit
</artifactId>
<version>
4.12
</version>
<scope>
test
</scope>
</dependency>
</dependencies>
</dependencies>
...
...
searchhotcrawler.iml
0 → 100644
View file @
b21d2070
<?xml version="1.0" encoding="UTF-8"?>
<module
org.jetbrains.idea.maven.project.MavenProjectsManager.isMavenModule=
"true"
type=
"JAVA_MODULE"
version=
"4"
>
<component
name=
"FacetManager"
>
<facet
type=
"Spring"
name=
"Spring"
>
<configuration
/>
</facet>
</component>
<component
name=
"NewModuleRootManager"
LANGUAGE_LEVEL=
"JDK_1_8"
>
<output
url=
"file://$MODULE_DIR$/target/classes"
/>
<output-test
url=
"file://$MODULE_DIR$/target/test-classes"
/>
<content
url=
"file://$MODULE_DIR$"
>
<sourceFolder
url=
"file://$MODULE_DIR$/src/main/java"
isTestSource=
"false"
/>
<sourceFolder
url=
"file://$MODULE_DIR$/src/main/resources"
type=
"java-resource"
/>
<sourceFolder
url=
"file://$MODULE_DIR$/src/test/java"
isTestSource=
"true"
/>
<excludeFolder
url=
"file://$MODULE_DIR$/target"
/>
</content>
<orderEntry
type=
"inheritedJdk"
/>
<orderEntry
type=
"sourceFolder"
forTests=
"false"
/>
<orderEntry
type=
"library"
name=
"Maven: org.mongodb:mongo-java-driver:3.12.2"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: com.zhiwei:sendmail:0.0.1-SNAPSHOT"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: javax.mail:mail:1.4.7"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: javax.activation:activation:1.1"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: com.zhiwei.tools:zhiwei-tools:0.1.6-SNAPSHOT"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: com.alibaba:fastjson:1.2.58"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: de.ruedigermoeller:fst:2.57"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: com.fasterxml.jackson.core:jackson-core:2.8.8"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: org.javassist:javassist:3.21.0-GA"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: org.objenesis:objenesis:2.5.1"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: org.apache.commons:commons-lang3:3.8.1"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: com.zhiwei.crawler:crawler-core:0.6.7.4-SNAPSHOT"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: com.squareup.okhttp3:okhttp:3.14.9"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: com.squareup.okio:okio:1.17.2"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: org.jsoup:jsoup:1.13.1"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: cn.wanghaomiao:JsoupXpath:2.3.2"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: org.antlr:antlr4-runtime:4.7"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: org.apache.commons:commons-compress:1.20"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: org.brotli:dec:0.1.2"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: com.ibm.icu:icu4j:67.1"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: com.google.guava:guava:29.0-jre"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: com.google.guava:failureaccess:1.0.1"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: com.google.guava:listenablefuture:9999.0-empty-to-avoid-conflict-with-guava"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: com.google.code.findbugs:jsr305:3.0.2"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: org.checkerframework:checker-qual:2.11.1"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: com.google.errorprone:error_prone_annotations:2.3.4"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: com.google.j2objc:j2objc-annotations:1.3"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: org.apache.logging.log4j:log4j-core:2.13.3"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: org.apache.logging.log4j:log4j-api:2.13.3"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: org.apache.logging.log4j:log4j-1.2-api:2.13.3"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: org.slf4j:slf4j-log4j12:1.8.0-beta4"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: org.slf4j:slf4j-api:1.8.0-beta4"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: log4j:log4j:1.2.17"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: com.zhiwei.async:task-boot:0.0.3-SNAPSHOT"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: com.zhiwei.crawler:proxy-client:1.0.5-SNAPSHOT"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: org.apache.dubbo:dubbo:2.7.4.1"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: io.netty:netty-all:4.1.25.Final"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: com.google.code.gson:gson:2.8.5"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: org.apache.curator:curator-recipes:2.12.0"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: org.apache.curator:curator-framework:2.12.0"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: org.apache.curator:curator-client:2.12.0"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: org.apache.zookeeper:zookeeper:3.4.8"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: jline:jline:0.9.94"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: io.netty:netty:3.7.0.Final"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: com.kohlschutter.boilerpipe:boilerpipe-extractor:0.0.1-SNAPSHOT"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: org.projectlombok:lombok:1.18.8"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: org.springframework:spring-aop:4.2.2.RELEASE"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: aopalliance:aopalliance:1.0"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: org.springframework:spring-beans:4.2.2.RELEASE"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: org.springframework:spring-core:4.2.2.RELEASE"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: commons-logging:commons-logging:1.2"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: org.springframework:spring-test:4.2.2.RELEASE"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: org.springframework:spring-context:4.2.2.RELEASE"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: org.springframework:spring-expression:4.2.2.RELEASE"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: org.springframework:spring-context-support:4.2.2.RELEASE"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: org.springframework:spring-web:4.2.2.RELEASE"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: org.springframework:spring-tx:4.2.2.RELEASE"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: redis.clients:jedis:2.8.1"
level=
"project"
/>
<orderEntry
type=
"library"
name=
"Maven: org.apache.commons:commons-pool2:2.4.2"
level=
"project"
/>
<orderEntry
type=
"library"
scope=
"TEST"
name=
"Maven: junit:junit:4.12"
level=
"project"
/>
<orderEntry
type=
"library"
scope=
"TEST"
name=
"Maven: org.hamcrest:hamcrest-core:1.3"
level=
"project"
/>
</component>
</module>
\ No newline at end of file
src/main/java/com/zhiwei/searchhotcrawler/bean/HotSearchList.java
View file @
b21d2070
...
@@ -102,7 +102,7 @@ public class HotSearchList implements Serializable{
...
@@ -102,7 +102,7 @@ public class HotSearchList implements Serializable{
public
HotSearchList
(){}
public
HotSearchList
(){}
public
HotSearchList
(
String
url
,
String
name
,
Integer
count
,
Boolean
hot
,
Integer
rank
,
String
type
,
String
icon
,
Date
date
){
public
HotSearchList
(
String
url
,
String
name
,
Integer
count
,
Boolean
hot
,
Integer
rank
,
String
type
,
String
icon
,
Date
date
){
this
.
id
=
name
+
"_"
+
new
Date
().
getTime
()
+
"_"
+
type
;
this
.
id
=
name
+
"_"
+
System
.
currentTimeMillis
()
+
"_"
+
type
;
this
.
url
=
url
;
this
.
url
=
url
;
this
.
name
=
name
;
this
.
name
=
name
;
this
.
count
=
count
;
this
.
count
=
count
;
...
@@ -116,7 +116,7 @@ public class HotSearchList implements Serializable{
...
@@ -116,7 +116,7 @@ public class HotSearchList implements Serializable{
public
HotSearchList
(
String
url
,
String
name
,
Integer
count
,
Integer
rank
,
String
type
,
Date
date
){
public
HotSearchList
(
String
url
,
String
name
,
Integer
count
,
Integer
rank
,
String
type
,
Date
date
){
this
.
id
=
name
+
"_"
+
new
Date
().
getTime
()+
"_"
+
type
;
this
.
id
=
name
+
"_"
+
System
.
currentTimeMillis
()+
"_"
+
type
;
this
.
url
=
url
;
this
.
url
=
url
;
this
.
name
=
name
;
this
.
name
=
name
;
this
.
count
=
count
;
this
.
count
=
count
;
...
@@ -129,7 +129,7 @@ public class HotSearchList implements Serializable{
...
@@ -129,7 +129,7 @@ public class HotSearchList implements Serializable{
public
HotSearchList
(
String
url
,
String
name
,
Integer
count
,
Integer
rank
,
String
type
,
Integer
commentCount
,
String
topicLead
,
Date
date
){
public
HotSearchList
(
String
url
,
String
name
,
Integer
count
,
Integer
rank
,
String
type
,
Integer
commentCount
,
String
topicLead
,
Date
date
){
this
.
id
=
name
+
"_"
+
new
Date
().
getTime
()+
"_"
+
type
;
this
.
id
=
name
+
"_"
+
System
.
currentTimeMillis
()+
"_"
+
type
;
this
.
url
=
url
;
this
.
url
=
url
;
this
.
name
=
name
;
this
.
name
=
name
;
this
.
count
=
count
;
this
.
count
=
count
;
...
@@ -143,7 +143,7 @@ public class HotSearchList implements Serializable{
...
@@ -143,7 +143,7 @@ public class HotSearchList implements Serializable{
}
}
public
HotSearchList
(
String
url
,
String
name
,
Integer
count
,
Boolean
hot
,
Integer
rank
,
String
type
,
Date
date
,
String
icon
,
String
topicResult
){
public
HotSearchList
(
String
url
,
String
name
,
Integer
count
,
Boolean
hot
,
Integer
rank
,
String
type
,
Date
date
,
String
icon
,
String
topicResult
){
this
.
id
=
name
+
"_"
+
new
Date
().
getTime
()
+
"_"
+
type
;
this
.
id
=
name
+
"_"
+
System
.
currentTimeMillis
()
+
"_"
+
type
;
this
.
url
=
url
;
this
.
url
=
url
;
this
.
name
=
name
;
this
.
name
=
name
;
this
.
hot
=
hot
;
this
.
hot
=
hot
;
...
@@ -157,7 +157,7 @@ public class HotSearchList implements Serializable{
...
@@ -157,7 +157,7 @@ public class HotSearchList implements Serializable{
}
}
public
HotSearchList
(
String
url
,
String
name
,
String
topicLead
,
Integer
count
,
Boolean
hot
,
Date
time
,
Integer
rank
,
String
type
,
Integer
view
,
Integer
barrage
,
String
pictureUrl
)
{
public
HotSearchList
(
String
url
,
String
name
,
String
topicLead
,
Integer
count
,
Boolean
hot
,
Date
time
,
Integer
rank
,
String
type
,
Integer
view
,
Integer
barrage
,
String
pictureUrl
)
{
this
.
id
=
name
+
"_"
+
new
Date
().
getTime
()+
"_"
+
type
;
this
.
id
=
name
+
"_"
+
System
.
currentTimeMillis
()+
"_"
+
type
;
this
.
url
=
url
;
this
.
url
=
url
;
this
.
name
=
name
;
this
.
name
=
name
;
this
.
topicLead
=
topicLead
;
this
.
topicLead
=
topicLead
;
...
...
src/main/java/com/zhiwei/searchhotcrawler/run/HotSearchRun.java
View file @
b21d2070
package
com
.
zhiwei
.
searchhotcrawler
.
run
;
package
com
.
zhiwei
.
searchhotcrawler
.
run
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.
core.
proxy.ProxyFactory
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.searchhotcrawler.cache.CacheListener
;
import
com.zhiwei.searchhotcrawler.cache.CacheListener
;
import
com.zhiwei.searchhotcrawler.config.ProxyConfig
;
import
com.zhiwei.searchhotcrawler.config.ProxyConfig
;
...
...
src/main/java/com/zhiwei/searchhotcrawler/test/Job51Test.java
0 → 100644
View file @
b21d2070
package
com
.
zhiwei
.
searchhotcrawler
.
test
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.mongodb.client.MongoDatabase
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyFactory
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.config.DBConfig
;
import
com.zhiwei.searchhotcrawler.config.ProxyConfig
;
import
com.zhiwei.searchhotcrawler.dbtemplate.MongoDBLocalTemplate
;
import
com.zhiwei.searchhotcrawler.util.HttpClientUtils
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.springframework.context.ApplicationContext
;
import
org.springframework.context.support.ClassPathXmlApplicationContext
;
import
java.io.IOException
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
@Log4j2
public
class
Job51Test
{
public
static
void
main
(
String
[]
args
)
{
// ApplicationContext context = new ClassPathXmlApplicationContext("applicationContext.xml");
SimpleConfig
simpleConfig
=
SimpleConfig
.
builder
().
registry
(
ProxyConfig
.
registry
)
.
group
(
ProxyConfig
.
group
).
appId
(
10000013
).
appName
(
"hotsearch"
).
build
();
ProxyFactory
.
init
(
simpleConfig
);
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
// MongoDatabase mongoDBLocal = MongoDBLocalTemplate.getDB(DBConfig.dbName);
List
<
HotSearchList
>
list
=
new
ArrayList
<>();
String
url
=
"https://search.51job.com/list/080300,000000,0000,00,9,99,java,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare="
;
Map
<
String
,
Object
>
header
=
new
HashMap
<>();
header
.
put
(
"Accept"
,
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
);
header
.
put
(
"Accept-Encoding"
,
"gzip, deflate, br"
);
header
.
put
(
"Accept-Language"
,
"zh-CN,zh;q=0.9"
);
header
.
put
(
"Cache-Control"
,
"max-age=0"
);
header
.
put
(
"Connection"
,
"keep-alive"
);
header
.
put
(
"Cookie"
,
"guid=1925f996c7ae446cdf1f579f113bff6e; _ujz=MTg3NDg4MTM4MA%3D%3D; ps=needv%3D0; slife=lowbrowser%3Dnot%26%7C%26lastlogindate%3D20210318%26%7C%26securetime%3DBztcaVQzWTsEZlJrWmJdPwQ2Ajw%253D; track=registertype%3D1; 51job=cuid%3D187488138%26%7C%26cusername%3Dphone_15757871020_202103189219%26%7C%26cpassword%3D%26%7C%26cname%3D%25B3%25C2%25EC%25BF%25CC%25CE%26%7C%26cemail%3D15757871020%2540163.com%26%7C%26cemailstatus%3D0%26%7C%26cnickname%3D%26%7C%26ccry%3D.0b4qUteozwmg%26%7C%26cconfirmkey%3D%25241%2524UXfAYBHG%2524Hni.5zaFu5kr7BN.eVcOU%252F%26%7C%26cautologin%3D1%26%7C%26cenglish%3D0%26%7C%26sex%3D0%26%7C%26cnamekey%3D%25241%2524CN04lL8j%2524kCHAFcf4TNh%252F2odmIqujW1%26%7C%26to%3D8019a57bb26817913b5f3c2080ba5792605354bf%26%7C%26; nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D; search=jobarea%7E%60080300%7C%21ord_field%7E%600%7C%21recentSearch0%7E%60080300%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FAjava%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch1%7E%60080300%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21"
);
header
.
put
(
"Host"
,
"search.51job.com"
);
header
.
put
(
"Referer"
,
"https://search.51job.com/list/080300,000000,0000,00,9,99,%2B,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare="
);
header
.
put
(
"sec-ch-ua"
,
"\"Google Chrome\";v=\"89\", \"Chromium\";v=\"89\", \";Not A Brand\";v=\"99\""
);
header
.
put
(
"Sec-Fetch-Dest"
,
"document"
);
header
.
put
(
"User-Agent"
,
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36"
);
// header.put("","");
JSONObject
jsonObject
=
null
;
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
,
header
);
for
(
int
t
=
0
;
t
<
1
&&
jsonObject
==
null
;
t
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"知乎热搜页面连接异常"
,
e
);
}
if
(
htmlBody
!=
null
)
{
Document
document
=
Jsoup
.
parse
(
htmlBody
);
log
.
info
(
"document:{}"
,
document
);
log
.
info
(
"======================"
);
String
html
=
document
.
getElementsByClass
(
"j_joblist"
).
first
().
html
();
log
.
info
(
"html:{}"
,
html
);
jsonObject
=
JSONObject
.
parseObject
(
html
);
if
(
jsonObject
!=
null
)
{
// JSONArray dataJson = jsonObject.getJSONObject("initialState").getJSONObject("topsearch").getJSONArray("data");
// for (int i = 0; i < dataJson.size(); i++) {
// Integer rank = i + 1;
// JSONObject data = dataJson.getJSONObject(i);
// String name = data.getString("queryDisplay");
// String realQuery = data.getString("realQuery");
// String zhihuUrl = "https://www.zhihu.com/search?q=" + realQuery + "&utm_content=search_hot&type=content";
//
// }
}
}
else
{
log
.
error
(
"临时爬取出问题"
);
}
}
}
}
src/main/resources/redis.properties
View file @
b21d2070
#redis.host=1
27.0.0.
1
#redis.host=1
15.236.59.9
1
#redis.port=
6379
#redis.port=
7382
#redis.password=
#redis.password=
#redis
#redis
#redis.host = 192.168.0.39
#redis.port = 7382
#redis.database = 3
#redis
redis.host
=
192.168.0.39
redis.host
=
192.168.0.39
redis.port
=
6379
redis.port
=
6379
redis.database
=
1
redis.database
=
1
#maxIdle
#maxIdle
redis.maxIdle
=
20
redis.maxIdle
=
20
#minIdle
#minIdle
...
...
src/test/java/ObjectTest.java
0 → 100644
View file @
b21d2070
/**
* ***************************************************
* Copyright (C), NingBo ZhiWeiReach info. Co., Ltd. *
*****************************************************
* 类的详细说明
*
* @author 东临碣石
* @Date 2016年1月16日
* @version 1.00
*/
import
org.junit.runner.RunWith
;
import
org.springframework.test.context.ContextConfiguration
;
import
org.springframework.test.context.junit4.AbstractJUnit4SpringContextTests
;
import
org.springframework.test.context.junit4.SpringJUnit4ClassRunner
;
/**
 * Parent class for Spring tests; loads the base configuration file
 * ({@code applicationContext.xml} from the classpath).
 *
 * @date 2016-01-16 11:40:14
 */
@RunWith(SpringJUnit4ClassRunner.class)
@ContextConfiguration(locations = "classpath:applicationContext.xml")
public abstract class ObjectTest extends AbstractJUnit4SpringContextTests {
}
src/test/java/weiboTest/WeiboHotSearchTest.java
0 → 100644
View file @
b21d2070
package
weiboTest
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.utils.RequestUtils
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchList
;
import
com.zhiwei.searchhotcrawler.bean.HotSearchType
;
import
com.zhiwei.searchhotcrawler.config.RedisConfig
;
import
com.zhiwei.searchhotcrawler.util.TipsUtils
;
import
lombok.extern.log4j.Log4j2
;
import
okhttp3.Request
;
import
okhttp3.Response
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.junit.Test
;
import
org.junit.runner.RunWith
;
import
org.springframework.test.context.ContextConfiguration
;
import
org.springframework.test.context.junit4.SpringJUnit4ClassRunner
;
import
java.io.IOException
;
import
java.util.*
;
/**
* @author cwt
* @date 2021/5/26 10:35
*/
@Log4j2
@RunWith
(
SpringJUnit4ClassRunner
.
class
)
@ContextConfiguration
(
locations
=
{
"classpath:applicationContext.xml"
})
public
class
WeiboHotSearchTest
{
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
3
).
build
();
@Test
public
void
test
(){
Document
document
=
Jsoup
.
parse
(
"a href=\"https://m.weibo.cn/search?containerid=231522type%3D1%26t%3D10%26q%3D%23%E9%82%93%E4%BC%A6%E8%AE%B2%E6%88%8F%E4%B8%93%E4%B8%9A%23&extparam=%23%E9%82%93%E4%BC%A6%E8%AE%B2%E6%88%8F%E4%B8%93%E4%B8%9A%23&luicode=10000011&lfid=100103type%3D1%26t%3D10%26q%3D%23%E9%82%93%E4%BC%A6%E8%AE%B2%E6%88%8F%E4%B8%93%E4%B8%9A%23\" data-hide=\"\"><span class=\"surl-text\">#邓伦讲戏专业#</span></a><a href=\"https://m.weibo.cn/search?containerid=231522type%3D1%26t%3D10%26q%3D%23%E6%9E%81%E9%99%90%E6%8C%91%E6%88%98%23&luicode=10000011&lfid=100103type%3D1%26t%3D10%26q%3D%23%E9%82%93%E4%BC%A6%E8%AE%B2%E6%88%8F%E4%B8%93%E4%B8%9A%23\" data-hide=\"\"><span class=\"surl-text\">#极限挑战#</span></a> <a href='/n/邓伦'>@邓伦</a> 和<a href='/n/景甜'>@景甜</a> 改编《甄嬛传》剧本,伦伦认真讲戏的样子让人瞬间穿越到拍摄现场。看来戏瘾上身的邓伦还过了一把导演的瘾,这专业的模样要不要考虑跨界当当导演呀~<span class=\"url-icon\"><img alt=[哈哈] src=\"https://h5.sinaimg.cn/m/emoticon/icon/default/d_haha-0ec05e6dad.png\" style=\"width:1em; height:1em;\" /></span><a data-url=\"http://t.cn/A6VJPN9w\" href=\"https://video.weibo.com/show?fid=1034:4640837901156490\" data-hide=\"\"><span class='url-icon'><img style='width: 1rem;height: 1rem' src='https://h5.sinaimg.cn/upload/2015/09/25/3/timeline_card_small_video_default.png'></span><span class=\"surl-text\">东方卫视极限挑战的微博视频</span></a>"
);
System
.
out
.
println
(
document
.
text
());
}
@Test
public
void
testHotWeibo
(){
Date
date
=
new
Date
();
List
<
HotSearchList
>
hotSearchLists
=
weiboHotSearchByPhone
(
date
);
for
(
HotSearchList
hotSearchList
:
hotSearchLists
)
{
}
}
/**
* 微博热搜数据更新导语,阅读量,讨论量
* @param document
* @return
*/
public
static
org
.
bson
.
Document
weiboUpdate
(
org
.
bson
.
Document
document
)
{
log
.
info
(
"更新微博热搜{}导语阅读量和讨论量"
,
document
.
getString
(
"name"
));
String
url
=
"https://m.weibo.cn/api/container/getIndex?"
+
document
.
getString
(
"url"
).
substring
(
document
.
getString
(
"url"
).
indexOf
(
"?"
)+
1
,
document
.
getString
(
"url"
).
indexOf
(
"&"
));
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
);
for
(
int
count
=
0
;
count
<=
5
;
count
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"解析微博热搜详情页面时出现连接失败"
,
e
);
}
if
(
htmlBody
!=
null
&&
htmlBody
.
contains
(
"data"
))
{
JSONObject
dataJson
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
);
JSONObject
cardlistInfoJson
=
dataJson
.
getJSONObject
(
"cardlistInfo"
);
List
<
JSONObject
>
cardsJsons
=
(
List
<
JSONObject
>)
dataJson
.
get
(
"cards"
);
//解析cardlistInfo,讨论、导语、阅读
if
(
cardlistInfoJson
.
containsKey
(
"desc"
)){
String
topicLead
=
cardlistInfoJson
.
getString
(
"desc"
);
if
(!
""
.
equals
(
topicLead
))
{
document
.
put
(
"topicLead"
,
topicLead
);
}
}
if
(
cardlistInfoJson
.
containsKey
(
"cardlist_head_cards"
)){
JSONObject
readJson
=
cardlistInfoJson
.
getJSONArray
(
"cardlist_head_cards"
).
getJSONObject
(
0
);
if
(
readJson
.
containsKey
(
"head_data"
))
{
String
midText
=
readJson
.
getJSONObject
(
"head_data"
).
getString
(
"midtext"
);
String
read
=
midText
.
replaceAll
(
"阅读"
,
""
).
replaceAll
(
"讨论.*"
,
""
).
trim
();
String
discussCount
=
midText
.
replaceAll
(
".*讨论"
,
""
).
replaceAll
(
"详情.*"
,
""
).
trim
();
String
pictureUrl
=
readJson
.
getJSONObject
(
"head_data"
).
getString
(
"portrait_url"
);
document
.
put
(
"readCount"
,
TipsUtils
.
getHotCount
(
read
));
document
.
put
(
"discussCount"
,
TipsUtils
.
getHotCount
(
discussCount
));
document
.
put
(
"pictureUrl"
,
pictureUrl
);
if
(
readJson
.
getJSONObject
(
"head_data"
).
containsKey
(
"downtext"
)){
String
downtext
=
readJson
.
getJSONObject
(
"head_data"
).
getString
(
"downtext"
);
if
(!
""
.
equals
(
downtext
))
{
document
.
put
(
"downtext"
,
downtext
.
replaceAll
(
"主持人:"
,
""
));
}
}
}
}
//解析cards,获取热门微博、人物
for
(
JSONObject
jsonObject
:
cardsJsons
)
{
}
return
document
;
}
}
return
null
;
}
public
JSONObject
analysisWeiboSon
(
JSONObject
readJson
){
return
null
;
}
/**
* @Title: weiboHotSearchByPhoneTest
* @author hero
* @Description: TODO(手机端Iphone 微博热搜采集)
* @return void 返回类型
*/
public
static
List
<
HotSearchList
>
weiboHotSearchByPhone
(
Date
date
){
String
url
=
"https://m.weibo.cn/api/container/getIndex?containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot&title=%E5%BE%AE%E5%8D%9A%E7%83%AD%E6%90%9C&extparam=pos%3D0_0%26mi_cid%3D100103%26cate%3D10103%26filter_type%3Drealtimehot%26c_type%3D30&luicode=10000011&lfid=231583"
;
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>();
headerMap
.
put
(
"User-Agent"
,
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
);
String
htmlBody
=
null
;
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
for
(
int
count
=
0
;
count
<=
5
;
count
++){
try
(
Response
response
=
httpBoot
.
syncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
))
{
htmlBody
=
response
.
body
().
string
();
}
catch
(
IOException
e
)
{
log
.
error
(
"解析微博时热搜时出现连接失败"
,
e
);
}
List
<
HotSearchList
>
result
=
new
ArrayList
<
HotSearchList
>();
if
(
StringUtils
.
isNotBlank
(
htmlBody
)
&&
htmlBody
.
contains
(
"cards"
))
{
try
{
JSONObject
json
=
JSONObject
.
parseObject
(
htmlBody
).
getJSONObject
(
"data"
);
JSONArray
cards
=
json
.
getJSONArray
(
"cards"
);
int
rank
=
0
;
// for (int i = 0; i < cards.size(); i++) {
try
{
JSONObject
card
=
cards
.
getJSONObject
(
0
);
JSONArray
cardGroup
=
card
.
getJSONArray
(
"card_group"
);
JSONObject
topCard
=
cardGroup
.
getJSONObject
(
0
);
if
(!
topCard
.
containsKey
(
"pic"
)){
rank
=
1
;
}
if
(
Objects
.
nonNull
(
cardGroup
)
&&
!
cardGroup
.
isEmpty
())
{
// String title = card.getString("title");
boolean
hot
=
true
;
// if (Objects.nonNull(title) && title.contains("实时上升热点")) {
// hot = false;
// rank = 51;
// }
for
(
int
j
=
0
;
j
<
cardGroup
.
size
();
j
++)
{
JSONObject
cardInfo
=
cardGroup
.
getJSONObject
(
j
);
String
name
=
cardInfo
.
getString
(
"desc"
);
int
hotCount
=
cardInfo
.
getIntValue
(
"desc_extr"
);
String
icon
=
cardInfo
.
getString
(
"icon"
);
if
(
StringUtils
.
isNotBlank
(
icon
))
{
icon
=
icon
.
split
(
"_"
)[
1
].
split
(
".png"
)[
0
];
}
// String id = "http://s.weibo.com/weibo/" + URLCodeUtil.getURLEncode(name, "utf-8") + "&Refer=top";
String
id
=
cardInfo
.
getString
(
"scheme"
);
HotSearchList
hotSearch
=
new
HotSearchList
(
id
,
name
,
hotCount
,
hot
,
rank
,
HotSearchType
.
微博热搜
.
name
(),
icon
,
date
);
result
.
add
(
hotSearch
);
rank
++;
// redisDao.addDataToSet(RedisConfig.WEIBO_HOTSEARCHIDS,name+"_微博热搜");
}
}
else
{
log
.
info
(
"card 数据结构为:{}"
,
card
);
}
}
catch
(
Exception
e
)
{
log
.
error
(
"解析微博时热搜时出现解析错误"
,
e
);
continue
;
}
// }
return
result
;
}
catch
(
Exception
e
)
{
log
.
error
(
"解析微博时热搜时出现解析错误,数据不是json结构"
,
e
);
}
}
else
{
log
.
info
(
"解析微博时热搜时出现解析错误,页面结构有问题"
);
}
}
return
Collections
.
emptyList
();
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment