Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
A
articlenewscrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
chenweiyang
articlenewscrawler
Commits
2a35dd02
Commit
2a35dd02
authored
Feb 25, 2019
by
yangchen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
提升版本 修改脉脉采集
parent
b3d545a3
Hide whitespace changes
Inline
Side-by-side
Showing
35 changed files
with
550 additions
and
479 deletions
+550
-479
pom.xml
+1
-1
src/main/java/com/zhiwei/httpclient/HttpClient.java
+18
-1
src/main/java/com/zhiwei/parse/Aika.java
+4
-3
src/main/java/com/zhiwei/parse/Aiqiyi.java
+1
-1
src/main/java/com/zhiwei/parse/Chejia.java
+4
-4
src/main/java/com/zhiwei/parse/Fenghuang.java
+4
-3
src/main/java/com/zhiwei/parse/Maimai.java
+98
-5
src/main/java/com/zhiwei/parse/Pcauto.java
+4
-3
src/main/java/com/zhiwei/parse/QQKB.java
+0
-4
src/main/java/com/zhiwei/parse/QQKandian.java
+2
-2
src/main/java/com/zhiwei/parse/QicheHome.java
+3
-3
src/main/java/com/zhiwei/parse/SinaKeji.java
+3
-3
src/main/java/com/zhiwei/parse/SouBao.java
+172
-172
src/main/java/com/zhiwei/parse/Souhu.java
+4
-3
src/main/java/com/zhiwei/parse/TechTx.java
+4
-3
src/main/java/com/zhiwei/parse/Wangyi.java
+2
-1
src/main/java/com/zhiwei/parse/Xueqiu.java
+0
-1
src/main/java/com/zhiwei/parse/Yiche.java
+6
-5
src/main/java/com/zhiwei/parse/Youku.java
+0
-2
src/main/java/com/zhiwei/parse/analysis/AiqiyiByWordAnalysis.java
+0
-6
src/main/java/com/zhiwei/parse/analysis/FenghuangCommentAnalysis.java
+5
-5
src/main/java/com/zhiwei/parse/analysis/MaimaiBywordAnalysis.java
+9
-6
src/main/java/com/zhiwei/parse/analysis/SouhuCommentAnalysis.java
+3
-3
src/main/java/com/zhiwei/parse/shipin/QQTV.java
+2
-2
src/main/java/com/zhiwei/parse/shipin/SohuTV.java
+1
-0
src/test/java/com/zhiwei/Comment/MaimaiCommentCountTest.java
+44
-40
src/test/java/com/zhiwei/crawler/AiqiyiByWordExample.java
+45
-45
src/test/java/com/zhiwei/crawler/MaimaiBywordExample.java
+5
-11
src/test/java/com/zhiwei/crawler/SouhuCommentCountExample.java
+3
-3
src/test/java/com/zhiwei/keyword/YoukuKeyWordTest.java
+0
-68
src/test/java/com/zhiwei/shipin/BilibiliTest.java
+2
-2
src/test/java/com/zhiwei/shipin/DouyinHotExample.java
+27
-27
src/test/java/com/zhiwei/shipin/QQTVTest.java
+1
-2
src/test/java/com/zhiwei/shipin/SohuTVTest.java
+38
-39
src/test/java/com/zhiwei/shipin/YoukuKeyWordTest.java
+35
-0
No files found.
pom.xml
View file @
2a35dd02
...
...
@@ -3,7 +3,7 @@
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiwei
</groupId>
<artifactId>
articlenewscrawler
</artifactId>
<version>
0.0.
8
-SNAPSHOT
</version>
<version>
0.0.
9
-SNAPSHOT
</version>
<name>
articlenewscrawler
</name>
<description>
采集凤凰,一点资讯,搜狐历史文章和文章评论
</description>
...
...
src/main/java/com/zhiwei/httpclient/HttpClient.java
View file @
2a35dd02
...
...
@@ -9,6 +9,7 @@ import org.slf4j.LoggerFactory;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
okhttp3.Response
;
...
...
@@ -32,7 +33,23 @@ public class HttpClient {
logger
.
error
(
"httpClient 获取数据出现问题:{}"
,
e
);
return
null
;
}
}
/**
 * Performs a synchronous HTTP GET through the shared {@code httpBoot} client and
 * returns the response body as text.
 *
 * @param url       address to request
 * @param proxy     proxy holder handed to {@code httpBoot.syncCall}
 * @param headerMap request headers handed to {@code RequestUtils.wrapGet}
 * @return the response body, or {@code null} when the request or the body read fails
 */
public static String executeHttpRequestGet(String url, ProxyHolder proxy, Map<String, String> headerMap) {
    // try-with-resources closes the okhttp Response (and its body) on every path.
    try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy)) {
        return response.body().string();
    } catch (Exception e) {
        // The URL fills the "{}" placeholder and the exception goes last so SLF4J prints
        // its stack trace; with the exception as the only argument the placeholder was
        // never substituted and the failing URL was not recorded.
        logger.error("httpClient 获取数据出现问题:{}", url, e);
        return null;
    }
}
public
static
String
executeHttpRequestPost
(
String
url
,
Proxy
proxy
,
Map
<
String
,
String
>
headerMap
,
Map
<
String
,
Object
>
paramMap
)
{
...
...
src/main/java/com/zhiwei/parse/Aika.java
View file @
2a35dd02
package
com
.
zhiwei
.
parse
;
import
java.net.Proxy
;
import
static
java
.
util
.
Objects
.
nonNull
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.List
;
import
java.util.Map
;
import
static
java
.
util
.
Objects
.
nonNull
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.parse.analysis.AikaCommentAnalysis
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
...
...
@@ -22,7 +23,7 @@ public class Aika {
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
Map
<
String
,
Object
>>
getAikaComment
(
String
url
,
Proxy
proxy
)
{
public
static
List
<
Map
<
String
,
Object
>>
getAikaComment
(
String
url
,
Proxy
Holder
proxy
)
{
String
commentId
=
getCommentId
(
url
);
if
(
nonNull
(
commentId
))
{
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<>();
...
...
src/main/java/com/zhiwei/parse/Aiqiyi.java
View file @
2a35dd02
...
...
@@ -29,7 +29,7 @@ public class Aiqiyi {
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getAiqiyiBywordHeaderMap
(
null
);
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
try
{
for
(
int
i
=
1
;
i
<=
20
;
i
++)
{
for
(
int
i
=
1
;
i
<=
5
;
i
++)
{
int
count
=
dataList
.
size
();
String
url
=
"https://so.iqiyi.com/so/q_"
+
URLEncoder
.
encode
(
word
,
"UTF-8"
)+
"_ctg__t_0_page_"
+
i
+
"_p_1_qc_0_rd__site__m_4_bitrate_"
;
System
.
out
.
println
(
url
);
...
...
src/main/java/com/zhiwei/parse/Chejia.java
View file @
2a35dd02
...
...
@@ -2,7 +2,6 @@ package com.zhiwei.parse;
import
static
java
.
util
.
Objects
.
nonNull
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.Date
;
...
...
@@ -17,6 +16,7 @@ import com.alibaba.fastjson.JSONArray;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
...
...
@@ -34,7 +34,7 @@ public class Chejia {
* @param proxy
* @return
*/
public
static
int
getChejiaCommentCount
(
String
url
,
Proxy
proxy
)
{
public
static
int
getChejiaCommentCount
(
String
url
,
Proxy
Holder
proxy
)
{
String
id
=
getCommentUrl
(
url
,
proxy
);
if
(
nonNull
(
id
))
{
System
.
out
.
println
(
id
);
...
...
@@ -57,7 +57,7 @@ public class Chejia {
* @param proxy
* @return
*/
public
static
List
<
Map
<
String
,
Object
>>
getChejiaComment
(
String
url
,
Proxy
proxy
)
{
public
static
List
<
Map
<
String
,
Object
>>
getChejiaComment
(
String
url
,
Proxy
Holder
proxy
)
{
String
nUrl
=
getCommentUrl
(
url
,
proxy
);
if
(
nonNull
(
nUrl
))
{
int
page
=
1
;
...
...
@@ -98,7 +98,7 @@ public class Chejia {
return
Collections
.
emptyList
();
}
private
static
String
getCommentUrl
(
String
url
,
Proxy
proxy
)
{
private
static
String
getCommentUrl
(
String
url
,
Proxy
Holder
proxy
)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
proxy
)){
String
objectID
=
response
.
body
().
string
().
split
(
"pvTrack.object = "
)[
1
].
split
(
";"
)[
0
].
replace
(
"\""
,
""
);
return
"https://reply.autohome.com.cn/api/comments/show.json?appid=21&count=50&id="
+
objectID
;
...
...
src/main/java/com/zhiwei/parse/Fenghuang.java
View file @
2a35dd02
...
...
@@ -10,6 +10,7 @@ import java.util.Map;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.parse.analysis.FenghuangAccountAnalysis
;
...
...
@@ -64,7 +65,7 @@ public class Fenghuang {
* @param docUrl
* @return
*/
public
static
List
<
Map
<
String
,
Object
>>
getFenghuangCommentData
(
String
url
,
Proxy
proxy
)
{
public
static
List
<
Map
<
String
,
Object
>>
getFenghuangCommentData
(
String
url
,
Proxy
Holder
proxy
)
{
url
=
fenghuangCommentAnalysis
.
getdocUrl
(
url
,
proxy
);
if
(
url
==
null
)
{
return
Collections
.
emptyList
();
...
...
@@ -92,7 +93,7 @@ public class Fenghuang {
* @param proxy
* @return
*/
public
static
List
<
Map
<
String
,
Object
>>
getFenghuangCommentData2
(
String
url
,
Proxy
proxy
)
{
public
static
List
<
Map
<
String
,
Object
>>
getFenghuangCommentData2
(
String
url
,
Proxy
Holder
proxy
)
{
url
=
fenghuangCommentAnalysis
.
getdocUrl
(
url
,
proxy
);
if
(
url
==
null
)
{
return
Collections
.
emptyList
();
...
...
@@ -118,7 +119,7 @@ public class Fenghuang {
* @param url
* @return
*/
public
static
Map
<
String
,
Object
>
getFenghuangCommentCount
(
String
url
,
Proxy
proxy
)
{
public
static
Map
<
String
,
Object
>
getFenghuangCommentCount
(
String
url
,
Proxy
Holder
proxy
)
{
url
=
fenghuangCommentAnalysis
.
getdocUrl
(
url
,
proxy
);
System
.
out
.
println
(
url
);
if
(
url
==
null
)
{
...
...
src/main/java/com/zhiwei/parse/Maimai.java
View file @
2a35dd02
...
...
@@ -9,6 +9,7 @@ import java.util.Collections;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Objects
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
...
...
@@ -21,6 +22,7 @@ import com.zhiwei.crawler.proxy.ProxyHolder;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.parse.analysis.MaimaiBywordAnalysis
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Response
;
...
...
@@ -51,7 +53,7 @@ public class Maimai {
int
i
=
20
;
while
(
f
)
{
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
proxy
,
headerMap
);
Map
<
String
,
Object
>
map
=
maimaiBywordAnalysis
.
getData
(
result
,
time
);
Map
<
String
,
Object
>
map
=
maimaiBywordAnalysis
.
getData
(
result
,
time
,
key
);
f
=
(
boolean
)
map
.
get
(
"hasMore"
);
List
<
Map
<
String
,
Object
>>
daList
=
(
List
<
Map
<
String
,
Object
>>)
map
.
get
(
"data"
);
if
(
daList
!=
null
&&
!
daList
.
isEmpty
())
{
...
...
@@ -89,7 +91,7 @@ public class Maimai {
int
i
=
20
;
while
(
f
)
{
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
proxy
,
headerMap
);
Map
<
String
,
Object
>
map
=
maimaiBywordAnalysis
.
getDataByNoName
(
result
,
time
);
Map
<
String
,
Object
>
map
=
maimaiBywordAnalysis
.
getDataByNoName
(
result
,
time
,
key
);
f
=
(
boolean
)
map
.
get
(
"hasMore"
);
List
<
Map
<
String
,
Object
>>
daList
=
(
List
<
Map
<
String
,
Object
>>)
map
.
get
(
"data"
);
if
(
daList
!=
null
&&
daList
.
size
()
>
0
)
{
...
...
@@ -129,6 +131,40 @@ public class Maimai {
map
.
put
(
"gid"
,
data
.
getLong
(
"id"
));
map
.
put
(
"title"
,
data
.
getString
(
"text"
));
map
.
put
(
"author"
,
data
.
getString
(
"author"
));
map
.
put
(
"userId"
,
data
.
getString
(
"mmid"
));
return
map
;
}
catch
(
Exception
e
)
{
logger
.
error
(
" 脉脉 转评攒 获取失败 {}"
,
e
);
}
return
Collections
.
emptyMap
();
}
/**
* //https://maimai.cn/web/gossip_detail?encode_id=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpZCI6MTk2MzEyNjYsImlhdCI6MTU0ODI5NzI5NX0.N6SPmcf-fyitLNomzY-a8BEY31eseYnvG7RTUQ3jxYY
* @Description 获取脉脉转评赞
* @param url
* @param proxy
* @return
*/
public
static
Map
<
String
,
Object
>
getMaiaiCount
(
String
url
,
String
cookie
,
ProxyHolder
proxy
)
{
Map
<
String
,
Object
>
headers
=
new
HashMap
<>();
if
(
Objects
.
nonNull
(
cookie
)
&&
!
cookie
.
isEmpty
())
{
headers
.
put
(
"cookie"
,
cookie
);
}
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headers
),
proxy
)){
String
result
=
response
.
body
().
string
();
result
=
result
.
split
(
"JSON.parse\\(\""
)[
1
].
split
(
"\"\\);\\</script\\>"
)[
0
];
result
=
ZhiWeiTools
.
decodeUnicode
(
result
);
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
JSONObject
data
=
json
.
getJSONObject
(
"data"
).
getJSONObject
(
"gossip"
);
map
.
put
(
"like"
,
data
.
getInteger
(
"likes"
));
map
.
put
(
"spreads"
,
data
.
getInteger
(
"spreads"
));
map
.
put
(
"cmts"
,
data
.
getInteger
(
"cmts"
));
map
.
put
(
"gid"
,
data
.
getLong
(
"id"
));
map
.
put
(
"title"
,
data
.
getString
(
"text"
));
map
.
put
(
"author"
,
data
.
getString
(
"author"
));
map
.
put
(
"userId"
,
data
.
getString
(
"mmid"
));
return
map
;
}
catch
(
Exception
e
)
{
logger
.
error
(
" 脉脉 转评攒 获取失败 {}"
,
e
);
...
...
@@ -144,9 +180,13 @@ public class Maimai {
* @return
*/
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
Map
<
String
,
Object
>>
getMaimaiCommentList
(
String
url
,
ProxyHolder
proxy
)
{
public
static
List
<
Map
<
String
,
Object
>>
getMaimaiCommentList
(
String
url
,
String
cookie
,
ProxyHolder
proxy
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
Map
<
String
,
Object
>
mmid
=
getMaiaiCount
(
url
,
proxy
);
Map
<
String
,
Object
>
mmid
=
getMaiaiCount
(
url
,
cookie
,
proxy
);
Map
<
String
,
Object
>
headers
=
new
HashMap
<>();
if
(
Objects
.
nonNull
(
cookie
)
&&
!
cookie
.
isEmpty
())
{
headers
.
put
(
"cookie"
,
cookie
);
}
if
(
mmid
!=
null
)
{
String
gid
=
String
.
valueOf
(
mmid
.
get
(
"gid"
));
boolean
more
=
true
;
...
...
@@ -154,7 +194,10 @@ public class Maimai {
while
(
more
)
{
try
{
String
link
=
"https://maimai.cn/sdk/web/gossip/getcmts?gid="
+
gid
+
"&page="
+
page
+
"&count=50&hotcmts_limit_count=100"
;
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
link
),
proxy
).
body
().
string
();
String
htmlBody
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
link
,
headers
),
proxy
).
body
().
string
();
if
(
Objects
.
nonNull
(
cookie
)
&&
!
cookie
.
isEmpty
())
{
ZhiWeiTools
.
sleep
(
2000
);
}
if
(
htmlBody
!=
null
&&
htmlBody
.
length
()>
0
)
{
JSONObject
dataJson
=
JSONObject
.
parseObject
(
htmlBody
);
JSONArray
commentJson
=
dataJson
.
getJSONArray
(
"comments"
);
...
...
@@ -184,4 +227,54 @@ public class Maimai {
return
Collections
.
emptyList
();
}
/**
 * Searches Maimai contacts for a keyword and collects the entries of every result page.
 *
 * Pages are requested starting at 0 until a page returns an empty "contacts" array or
 * a request/parse error occurs; whatever was collected up to that point is returned.
 *
 * @param word   search keyword; it is URL-encoded before being placed in the query
 * @param cookie value for the "cookie" request header; the header is omitted when the
 *               value is null or empty
 * @param proxy  proxy handed to {@code httpBoot.syncCall}
 * @return one map per contact as produced by {@code getUserMap}; never {@code null}
 */
public static List<Map<String, Object>> getUserList(String word, String cookie, Proxy proxy) {
    // Encode once and reuse for both the request URL and the referer header.
    String encodedWord = URLCodeUtil.getURLEncode(word, "utf-8");
    String url = "https://maimai.cn/search/contacts?count=50&query=" + encodedWord
            + "&dist=0&searchTokens=&highlight=true&jsononly=1&pc=1&page=";
    List<Map<String, Object>> dataList = new ArrayList<>();
    Map<String, Object> headers = new HashMap<>();
    logger.info("{}", url);
    // Same guard the other Maimai requests use: never send a null/blank cookie header.
    if (Objects.nonNull(cookie) && !cookie.isEmpty()) {
        headers.put("cookie", cookie);
    }
    headers.put("referer", "https://maimai.cn/web/search_center?type=contact&query=" + encodedWord + "&highlight=true");
    int page = 0;
    while (true) {
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url + page, headers), proxy)) {
            String result = response.body().string();
            JSONObject json = JSONObject.parseObject(result);
            JSONArray jsonArray = json.getJSONObject("data").getJSONArray("contacts");
            for (int i = 0; i < jsonArray.size(); i++) {
                dataList.add(getUserMap(jsonArray.getJSONObject(i)));
            }
            page++;
            logger.info(" 采集到 {} 页 ,一共采集到 {} 条", page, dataList.size());
            // An empty page marks the end of the results; leave before the throttle sleep.
            if (jsonArray.isEmpty()) {
                break;
            }
            // Pause between page requests.
            ZhiWeiTools.sleep(2000);
        } catch (Exception e) {
            // Stop paging on any failure but keep the cause in the log.
            logger.error("脉脉用户列表采集出错 page {}", page, e);
            break;
        }
    }
    return dataList;
}
/**
 * Flattens one entry of the contact search result into a plain map.
 *
 * "id" is read from the entry's own "uid" field; "name", "gender", "rank", "compos"
 * and "city" are read from the nested "contact" object, and "url" is the contact detail
 * address built from its "encode_mmid".
 *
 * @param data one element of the "contacts" array
 * @return the populated map, or an empty immutable map when the entry cannot be read
 */
private static Map<String, Object> getUserMap(JSONObject data) {
    try {
        Map<String, Object> map = new HashMap<>();
        JSONObject da = data.getJSONObject("contact");
        map.put("id", data.getString("uid"));
        map.put("name", da.getString("name"));
        map.put("gender", da.getInteger("gender"));
        map.put("url", "https://maimai.cn/contact/detail/" + da.getString("encode_mmid"));
        map.put("rank", da.getInteger("rank"));
        map.put("compos", da.getString("compos"));
        map.put("city", da.getString("city"));
        return map;
    } catch (Exception e) {
        // Best effort: a malformed entry must not abort the whole page, but the cause
        // is now logged instead of being dropped.
        logger.error(" 脉脉用户解析出错 ", e);
    }
    return Collections.emptyMap();
}
}
src/main/java/com/zhiwei/parse/Pcauto.java
View file @
2a35dd02
package
com
.
zhiwei
.
parse
;
import
static
java
.
util
.
Objects
.
nonNull
;
import
java.io.IOException
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.List
;
...
...
@@ -14,6 +14,7 @@ import org.slf4j.LoggerFactory;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.parse.analysis.PcautoCommentAnalysis
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
...
...
@@ -24,7 +25,7 @@ public class Pcauto {
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
Map
<
String
,
Object
>>
getPcAutoComment
(
String
url
,
Proxy
proxy
)
{
public
static
List
<
Map
<
String
,
Object
>>
getPcAutoComment
(
String
url
,
Proxy
Holder
proxy
)
{
String
newUrl
=
getCommentUrl
(
url
,
proxy
);
if
(
nonNull
(
newUrl
))
{
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
...
...
@@ -53,7 +54,7 @@ public class Pcauto {
return
Collections
.
emptyList
();
}
private
static
String
getCommentUrl
(
String
url
,
Proxy
proxy
)
{
private
static
String
getCommentUrl
(
String
url
,
Proxy
Holder
proxy
)
{
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
try
{
String
result
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
"https://cmt.pcauto.com.cn/action/topic/get_data.jsp?url="
+
url
),
proxy
).
body
().
string
();
...
...
src/main/java/com/zhiwei/parse/QQKB.java
View file @
2a35dd02
...
...
@@ -11,7 +11,6 @@ import org.slf4j.LoggerFactory;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.bean.QQkbUser
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HttpClient
;
...
...
@@ -19,13 +18,10 @@ import com.zhiwei.parse.analysis.QQKBAccountAnalysis;
import
com.zhiwei.parse.analysis.QQKBCommentAnalysis
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Response
;
public
class
QQKB
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
QQKB
.
class
);
private
static
QQKBAccountAnalysis
qqAccountAnalysis
=
new
QQKBAccountAnalysis
();
private
static
QQKBCommentAnalysis
qqkbCommentAnalysis
=
new
QQKBCommentAnalysis
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
/**
*
...
...
src/main/java/com/zhiwei/parse/QQKandian.java
View file @
2a35dd02
...
...
@@ -32,9 +32,9 @@ public class QQKandian {
public
List
<
QQKandianUser
>
getUser
(
String
name
,
Proxy
proxy
)
{
if
(
name
!=
null
&&
name
.
length
()
>
0
)
{
List
<
QQKandianUser
>
dataList
=
new
ArrayList
<
QQKandianUser
>();
List
<
QQKandianUser
>
dataList
=
new
ArrayList
<>();
OkHttpClient
okhttp
=
HttpClientBuilder
.
newInstance
();
Map
<
String
,
String
>
map
=
new
HashMap
<
String
,
String
>();
Map
<
String
,
String
>
map
=
new
HashMap
<>();
map
.
put
(
"Host"
,
"sou.qq.com"
);
map
.
put
(
"Referer"
,
"https://sou.qq.com/kandian/kd.html?_bid=3216&_wv=3&_wwv=1293&_wvSb=0&hotword=%E7%9F%A5%E5%90%8D%E5%A4%A7V%E7%AB%A0%E6%96%87%E6%B6%89%E6%80%A7%E4%BE%B5"
);
map
.
put
(
"Cookie"
,
"skey=MUzU7gdtRz; uin=o0497332654; RK=rNiJH0RBav; pgv_pvid=8990378504; pt2gguin=o0497332654; ptcz=062d936df33011f468637ee72be262a020a8df79977df7e7bde9c105b2b2ddf6"
);
...
...
src/main/java/com/zhiwei/parse/QicheHome.java
View file @
2a35dd02
package
com
.
zhiwei
.
parse
;
import
java.io.IOException
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
...
...
@@ -11,6 +10,7 @@ import org.slf4j.LoggerFactory;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.parse.analysis.QicheHomeKwyWordAnalysis
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
...
...
@@ -21,7 +21,7 @@ public class QicheHome {
private
static
QicheHomeKwyWordAnalysis
qicheHomeKwyWordAnalysis
=
new
QicheHomeKwyWordAnalysis
();
public
static
List
<
Map
<
String
,
Object
>>
getQiCheComment
(
String
articleid
,
Proxy
proxy
)
{
public
static
List
<
Map
<
String
,
Object
>>
getQiCheComment
(
String
articleid
,
Proxy
Holder
proxy
)
{
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<>();
int
page
=
1
;
int
count
=
2
;
...
...
@@ -35,7 +35,7 @@ public class QicheHome {
}
bodyList
.
addAll
(
qicheHomeKwyWordAnalysis
.
getData
(
result
));
logger
.
info
(
"采集 articleid {} 总页数 {} 第 {} 页 , 采集总数 {}"
,
articleid
,
count
,
page
,
bodyList
.
size
());
ZhiWeiTools
.
sleep
(
30
00
);
ZhiWeiTools
.
sleep
(
2
00
);
if
(
page
>
count
)
{
break
;
}
...
...
src/main/java/com/zhiwei/parse/SinaKeji.java
View file @
2a35dd02
...
...
@@ -3,7 +3,6 @@ package com.zhiwei.parse;
import
static
java
.
util
.
Objects
.
nonNull
;
import
java.io.IOException
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.List
;
...
...
@@ -14,6 +13,7 @@ import org.slf4j.LoggerFactory;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.parse.analysis.SinaKejiCommentAnalysis
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
...
...
@@ -30,7 +30,7 @@ public class SinaKeji {
* @param proxy
* @return
*/
public
static
List
<
Map
<
String
,
Object
>>
getSinaKejiComment
(
String
url
,
Proxy
proxy
)
{
public
static
List
<
Map
<
String
,
Object
>>
getSinaKejiComment
(
String
url
,
Proxy
Holder
proxy
)
{
String
commentId
=
getCommentId
(
url
,
proxy
);
if
(
nonNull
(
commentId
))
{
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
...
...
@@ -60,7 +60,7 @@ public class SinaKeji {
return
Collections
.
emptyList
();
}
private
static
String
getCommentId
(
String
url
,
Proxy
proxy
)
{
private
static
String
getCommentId
(
String
url
,
Proxy
Holder
proxy
)
{
String
commentId
=
null
;
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
try
{
...
...
src/main/java/com/zhiwei/parse/SouBao.java
View file @
2a35dd02
package
com
.
zhiwei
.
parse
;
import
java.io.BufferedReader
;
import
java.io.FileInputStream
;
import
java.io.IOException
;
import
java.io.InputStreamReader
;
import
java.io.UnsupportedEncodingException
;
import
java.net.URLEncoder
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
/**
 * Command-line crawler for keyword search results on soubao.net.
 *
 * {@link #main} reads keywords from a text file, fetches every result page for each
 * keyword, keeps the rows whose title/content contain all space-separated parts of the
 * keyword, tags them with a brand looked up from a spreadsheet and exports the rows to
 * an Excel file.
 */
public class SouBao {

    /** Search endpoint used for both the first page (GET) and the following pages (POST). */
    private static final String SEARCH_URL = "http://www.soubao.net/search/searchList.aspx";

    /** Prefix prepended to the href of each result link. */
    private static final String SITE_ROOT = "http://www.soubao.net";

    /** A page holding fewer rows than this is treated as the last page. */
    private static final int FULL_PAGE_SIZE = 10;

    /**
     * Entry point: crawl, filter, tag with brand and export.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        List<String> wordList = getWords("D:\\crawlerdata\\关键词.txt");
        Map<String, String> brandByWord = getdata();
        String cookie = "UM_distinctid=163edb1f5e369-014b755d3bd662-6f14167a-1fa400-163edb1f5e648c; ASP.NET_SessionId=zy45xibjfmchosyskjqznwz0; CNZZDATA4625144=cnzz_eid%3D240947030-1528700717-%26ntime%3D1528965485; CNZZDATA1260939784=1605411930-1528965615-http%253A%252F%252Fwww.cnepaper.com%252F%7C1528965615";
        List<Map<String, Object>> bodyList = new ArrayList<>();
        for (String word : wordList) {
            ZhiWeiTools.sleep(2000);
            List<Map<String, Object>> dataList = getData(word, cookie, "2017-12-27", "2018-06-27");
            for (Map<String, Object> map : dataList) {
                String time = (String) map.get("time");
                String w = (String) map.get("word");
                String[] words = w.split(" ");
                String matchContent = (String) map.get("title") + "_" + (String) map.get("content");
                // Rows carrying this literal time value are skipped.
                if ("20160101".equals(time)) {
                    continue;
                }
                if (containsAllWords(matchContent, words)) {
                    System.out.println(map.toString());
                    map.put("品牌", brandByWord.get(w));
                    bodyList.add(map);
                }
            }
        }
        List<String> headList = new ArrayList<>();
        headList.add("title");
        headList.add("time");
        headList.add("content");
        headList.add("source");
        headList.add("url");
        headList.add("word");
        headList.add("品牌");
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        poi.exportExcel("D:\\crawlerdata\\搜报网-EA 品牌 关键词-06.11-06.12.xlsx", "sa", headList, bodyList);
    }

    /**
     * Loads the keyword-to-brand table from the first sheet of the brand spreadsheet.
     *
     * @return map from the "关键词" column to the "品牌" column
     */
    @SuppressWarnings("unchecked")
    public static Map<String, String> getdata() {
        Map<String, String> map = new HashMap<>();
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        Map<String, Object> m = poi.importExcel("D:\\crawlerdata\\品牌区分.xlsx", 0);
        // The importer's result is untyped; "body" is read as the list of row maps.
        List<Map<String, Object>> l = (List<Map<String, Object>>) m.get("body");
        for (Map<String, Object> mm : l) {
            map.put((String) mm.get("关键词"), (String) mm.get("品牌"));
        }
        System.out.println(map.toString());
        return map;
    }

    /**
     * Reads the non-empty lines of a GBK-encoded text file.
     *
     * @param wordFileName path of the keyword file
     * @return the non-empty lines in file order; an empty list when the file cannot be
     *         read, so callers can iterate the result without a null check
     */
    public static List<String> getWords(String wordFileName) {
        List<String> list = new ArrayList<>();
        // try-with-resources closes the reader even when readLine fails midway.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(wordFileName), "GBK"))) {
            String line;
            while ((line = br.readLine()) != null) {
                if (line.length() >= 1) {
                    list.add(line);
                }
            }
            return list;
        } catch (IOException e) {
            System.err.println("SouBao.getWords: cannot read " + wordFileName + ": " + e);
            return new ArrayList<>();
        }
    }

    /**
     * Fetches all search results for one keyword within a date range.
     *
     * The first page is requested with GET; when it is full, the remaining pages are
     * fetched by {@link #gettwoData}.
     *
     * @param word   search keyword, also stored in every row under "word"
     * @param cookie cookie string handed to {@code HeadGet.getSoubaoHeaderMap}
     * @param start  value of the "startdate" query parameter
     * @param end    value of the "enddate" query parameter
     * @return the collected rows; empty when the first request yields no body
     */
    public static List<Map<String, Object>> getData(String word, String cookie, String start, String end) {
        Map<String, String> headerMap = HeadGet.getSoubaoHeaderMap(cookie);
        List<Map<String, Object>> dataList = new ArrayList<>();
        try {
            // "&timesel" restores the parameter separator: the literal previously read
            // "×el", i.e. "&times" had been collapsed into the multiplication sign.
            String url = SEARCH_URL + "?keyword=" + URLEncoder.encode(word, "utf-8")
                    + "&startdate=" + start + "&enddate=" + end + "&timesel=custom&checkNum=";
            // NOTE(review): if HttpClient declares both a Proxy and a ProxyHolder overload of
            // executeHttpRequestGet, this bare null is ambiguous — confirm and add a cast.
            String result = HttpClient.executeHttpRequestGet(url, null, headerMap);
            if (result == null) {
                // The request failed; there is nothing to parse.
                return dataList;
            }
            Document doc = Jsoup.parse(result);
            int rows = collectResults(doc, word, dataList);
            if (rows < FULL_PAGE_SIZE) {
                return dataList;
            }
            dataList.addAll(gettwoData(word, doc, cookie, start, end));
            System.out.println("=================================");
            ZhiWeiTools.sleep(2000);
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
        return dataList;
    }

    /**
     * Fetches result pages 2..n for a keyword with POST requests.
     *
     * Each request's form parameters are built by {@code HeadGet.getSoubaoParamMap} from
     * the previously parsed page. Paging stops at the first page that is not full, at
     * the first missing response body, or at the first error.
     *
     * @param word   search keyword, also stored in every row under "word"
     * @param doc    the parsed previous page (initially page 1)
     * @param cookie cookie string handed to {@code HeadGet.getSoubaoHeaderMap}
     * @param start  start date handed to the parameter builder
     * @param end    end date handed to the parameter builder
     * @return the rows collected from page 2 onwards
     */
    public static List<Map<String, Object>> gettwoData(String word, Document doc, String cookie, String start, String end) {
        Map<String, String> headerMap = HeadGet.getSoubaoHeaderMap(cookie);
        List<Map<String, Object>> dataList = new ArrayList<>();
        int i = 2;
        while (true) {
            try {
                Map<String, Object> paramMap = HeadGet.getSoubaoParamMap(word, i, doc, start, end);
                String result = HttpClient.executeHttpRequestPost(SEARCH_URL, null, headerMap, paramMap);
                if (result == null) {
                    return dataList;
                }
                doc = Jsoup.parse(result);
                int rows = collectResults(doc, word, dataList);
                if (rows < FULL_PAGE_SIZE) {
                    return dataList;
                }
                System.out.println("=================================");
                ZhiWeiTools.sleep(2000);
                i++;
            } catch (Exception e) {
                // Previously the exception was swallowed and the same page was retried
                // immediately and forever; give up and return what has been collected.
                e.printStackTrace();
                return dataList;
            }
        }
    }

    /**
     * Reports whether the text contains every given word, ignoring case.
     * An empty word array never matches.
     */
    private static boolean containsAllWords(String text, String[] words) {
        if (words.length == 0) {
            return false;
        }
        String haystack = text.toLowerCase();
        for (String part : words) {
            if (!haystack.contains(part.toLowerCase())) {
                return false;
            }
        }
        return true;
    }

    /**
     * Appends one row map per {@code ul} under {@code div#srh_main} to the sink and
     * returns how many rows the page held.
     */
    private static int collectResults(Document doc, String word, List<Map<String, Object>> sink) {
        Elements elements = doc.select("div#srh_main").select("ul");
        for (Element element : elements) {
            Map<String, Object> map = new HashMap<>();
            map.put("title", element.select("h2").select("a").text());
            map.put("content", element.select("p.newCon").text());
            map.put("source", element.select("p.newsInfo").select("em.paperName").select("span").text());
            map.put("time", element.select("p.newsInfo").select("em.postDate").select("span").text());
            map.put("url", SITE_ROOT + element.select("h2").select("a").attr("href"));
            map.put("word", word);
            System.out.println(map.toString());
            sink.add(map);
        }
        return elements.size();
    }
}
//
package com.zhiwei.parse;
//
//
import java.io.BufferedReader;
//
import java.io.FileInputStream;
//
import java.io.IOException;
//
import java.io.InputStreamReader;
//
import java.io.UnsupportedEncodingException;
//
import java.net.URLEncoder;
//
import java.util.ArrayList;
//
import java.util.HashMap;
//
import java.util.List;
//
import java.util.Map;
//
//
import org.jsoup.Jsoup;
//
import org.jsoup.nodes.Document;
//
import org.jsoup.nodes.Element;
//
import org.jsoup.select.Elements;
//
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.httpclient.HeadGet;
//
import com.zhiwei.httpclient.HttpClient;
//
import com.zhiwei.tools.tools.ZhiWeiTools;
//
//
public class SouBao {
//
//
public static void main(String[] args) {
//
List<String> wordList = getWords("D:\\crawlerdata\\关键词.txt");
//
Map<String,String> map1 = getdata();
//
String cookie = "UM_distinctid=163edb1f5e369-014b755d3bd662-6f14167a-1fa400-163edb1f5e648c; ASP.NET_SessionId=zy45xibjfmchosyskjqznwz0; CNZZDATA4625144=cnzz_eid%3D240947030-1528700717-%26ntime%3D1528965485; CNZZDATA1260939784=1605411930-1528965615-http%253A%252F%252Fwww.cnepaper.com%252F%7C1528965615";
//
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
//
for(String word : wordList) {
//
ZhiWeiTools.sleep(2000);
//
List<Map<String,Object>> dataList = getData(word, cookie,"2017-12-27","2018-06-27");
//
for(Map<String,Object> map : dataList) {
//
boolean f = false;
//
String time = (String) map.get("time");
//
String w = (String) map.get("word");
//
String[] words = w.split(" ");
//
String matchContent = (String) map.get("title") + "_" + (String) map.get("content");
//
if("20160101".equals(time)) {
//
continue;
//
}
//
for(int i=0;i<words.length;i++){
//
if(matchContent.toLowerCase().contains(words[i].toLowerCase()))
//
{
//
f = true;
//
}else{
//
f = false;
//
break;
//
}
//
}
//
if(f){
//
System.out.println(map.toString());
//
map.put("品牌", map1.get(w));
//
bodyList.add(map);
//
}
//
}
//
}
//
List<String> headList = new ArrayList<String>();
//
headList.add("title");
//
headList.add("time");
//
headList.add("content");
//
headList.add("source");
//
headList.add("url");
//
headList.add("word");
//
headList.add("品牌");
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
poi.exportExcel("D:\\crawlerdata\\搜报网-EA 品牌 关键词-06.11-06.12.xlsx", "sa", headList, bodyList);
//
}
//
//
@SuppressWarnings("unchecked")
//
public static Map<String,String> getdata() {
//
Map<String,String> map = new HashMap<String,String>();
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
Map<String,Object> m = poi.importExcel("D:\\crawlerdata\\品牌区分.xlsx", 0);
//
List<Map<String,Object>> l = (List<Map<String, Object>>) m.get("body");
//
//
for(Map<String,Object> mm : l) {
//
map.put((String)mm.get("关键词"), (String)mm.get("品牌"));
//
}
//
System.out.println(map.toString());
//
return map;
//
}
//
//
/**
 * Reads a keyword file, one keyword per line, decoding the bytes as GBK.
 *
 * Zero-length lines are skipped; every other line is kept verbatim (no
 * trimming), in file order.
 *
 * @param wordFileName path of the keyword file
 * @return the non-empty lines of the file, or {@code null} when the file
 *         cannot be opened or read (callers must check before iterating)
 */
public static List<String> getWords(String wordFileName) {
    List<String> list = new ArrayList<String>();
    // try-with-resources: the reader (and the underlying file handle) is
    // released even when readLine() fails half-way through the file.
    try (BufferedReader br = new BufferedReader(
            new InputStreamReader(new FileInputStream(wordFileName), "GBK"))) {
        String line;
        while ((line = br.readLine()) != null) {
            if (!line.isEmpty()) {
                list.add(line);
            }
        }
        return list;
    } catch (IOException e) {
        // Keep the historical null-on-failure contract, but leave a trace
        // instead of hiding the cause completely.
        e.printStackTrace();
        return null;
    }
}
//
//
/**
 * Fetches the soubao.net search results for one keyword within a date range.
 *
 * The first result page is requested with a GET. Each {@code ul} under
 * {@code div#srh_main} becomes one map with the keys "title", "content",
 * "source", "time", "url" and "word". When the first page holds fewer than
 * 10 entries it is treated as the only page; otherwise the remaining pages
 * are collected through {@link #gettwoData} and appended.
 *
 * @param word   search keyword (URL-encoded as UTF-8 before sending)
 * @param cookie cookie string handed to {@code HeadGet.getSoubaoHeaderMap}
 * @param start  value of the {@code startdate} query parameter (the caller in
 *               this file passes dates such as "2017-12-27")
 * @param end    value of the {@code enddate} query parameter
 * @return the collected rows; empty when nothing matched or when the
 *         keyword could not be encoded
 */
public static List<Map<String,Object>> getData(String word,String cookie,String start,String end) {
    Map<String,String> headerMap = HeadGet.getSoubaoHeaderMap(cookie);
    List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
    try {
        // The "&timesel" parameter had been corrupted: its "&times" prefix was
        // turned into a multiplication sign, which removed the separator and
        // glued the remainder onto the enddate value. Restored here.
        String url = "http://www.soubao.net/search/searchList.aspx?keyword=" + URLEncoder.encode(word,"utf-8")
                + "&startdate=" + start + "&enddate=" + end + "&timesel=custom&checkNum=";
        String result = HttpClient.executeHttpRequestGet(url, null, headerMap);
        Document doc = Jsoup.parse(result);
        Elements elements = doc.select("div#srh_main").select("ul");
        for (Element element : elements) {
            Map<String,Object> map = new HashMap<String,Object>();
            map.put("title", element.select("h2").select("a").text());
            map.put("content", element.select("p.newCon").text());
            map.put("source", element.select("p.newsInfo").select("em.paperName").select("span").text());
            map.put("time", element.select("p.newsInfo").select("em.postDate").select("span").text());
            // hrefs are site-relative, so prefix the host
            map.put("url", "http://www.soubao.net" + element.select("h2").select("a").attr("href"));
            map.put("word", word);
            System.out.println(map.toString());
            dataList.add(map);
        }
        // A short first page means there is nothing further to page through.
        if (elements.size() < 10) {
            return dataList;
        }
        // The first page's document is passed on; gettwoData feeds it to
        // HeadGet.getSoubaoParamMap when building the follow-up POST.
        dataList.addAll(gettwoData(word, doc, cookie, start, end));
        System.out.println("=================================");
        ZhiWeiTools.sleep(2000);
    } catch (UnsupportedEncodingException e) {
        e.printStackTrace();
    }
    return dataList;
}
//
//
/**
 * Collects soubao.net result pages 2..n for a keyword by POSTing the paging
 * form repeatedly.
 *
 * Each request's parameters are built by {@code HeadGet.getSoubaoParamMap}
 * from the previously fetched document, so the document of every
 * successfully parsed page replaces {@code doc} for the next request. A page
 * with fewer than 10 entries is treated as the last one.
 *
 * A failing page is retried; after three consecutive failures on the same
 * page the rows gathered so far are returned instead of retrying forever.
 *
 * @param word   search keyword
 * @param doc    document of the page preceding the first one to fetch
 * @param cookie cookie string handed to {@code HeadGet.getSoubaoHeaderMap}
 * @param start  start date forwarded to the parameter builder
 * @param end    end date forwarded to the parameter builder
 * @return rows of all pages fetched here (never {@code null})
 */
public static List<Map<String,Object>> gettwoData(String word,Document doc,String cookie,String start,String end) {
    // Same retry budget as the other three-attempt loops in this project.
    final int maxAttemptsPerPage = 3;
    Map<String, String> headerMap = HeadGet.getSoubaoHeaderMap(cookie);
    List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
    int page = 2;
    int failedAttempts = 0;
    while (true) {
        try {
            Map<String,Object> paramMap = HeadGet.getSoubaoParamMap(word, page, doc, start, end);
            String result = HttpClient.executeHttpRequestPost("http://www.soubao.net/search/searchList.aspx", null, headerMap, paramMap);
            // Parse into a local first: doc must stay usable for a retry if
            // anything below throws.
            Document pageDoc = Jsoup.parse(result);
            Elements elements = pageDoc.select("div#srh_main").select("ul");
            // Rows are staged per page so a retry cannot append duplicates.
            List<Map<String,Object>> pageRows = new ArrayList<Map<String,Object>>();
            for (Element element : elements) {
                Map<String,Object> map = new HashMap<String,Object>();
                map.put("title", element.select("h2").select("a").text());
                map.put("content", element.select("p.newCon").text());
                map.put("source", element.select("p.newsInfo").select("em.paperName").select("span").text());
                map.put("time", element.select("p.newsInfo").select("em.postDate").select("span").text());
                map.put("url", "http://www.soubao.net" + element.select("h2").select("a").attr("href"));
                map.put("word", word);
                System.out.println(map.toString());
                pageRows.add(map);
            }
            dataList.addAll(pageRows);
            doc = pageDoc;
            failedAttempts = 0;
            page++;
            // A short page is the last page.
            if (elements.size() < 10) {
                return dataList;
            }
            System.out.println("=================================");
            ZhiWeiTools.sleep(2000);
        } catch (Exception e) {
            e.printStackTrace();
            failedAttempts++;
            if (failedAttempts >= maxAttemptsPerPage) {
                // Give up on this keyword but keep what was already collected.
                return dataList;
            }
            // Pause before retrying instead of hammering the site in a tight loop.
            ZhiWeiTools.sleep(2000);
        }
    }
}
//
//
}
src/main/java/com/zhiwei/parse/Souhu.java
View file @
2a35dd02
...
...
@@ -14,6 +14,7 @@ import org.slf4j.LoggerFactory;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.parse.analysis.SouhuAccountAnalysis
;
...
...
@@ -34,7 +35,7 @@ public class Souhu {
* @param url
* @return
*/
public
static
int
getSouhuCommentCount
(
String
url
,
Proxy
proxy
)
{
public
static
int
getSouhuCommentCount
(
String
url
,
Proxy
Holder
proxy
)
{
try
{
String
newurl
=
souhuCommentAnalysis
.
getSouhuURL
(
url
,
proxy
);
if
(
nonNull
(
newurl
))
{
...
...
@@ -139,13 +140,13 @@ public class Souhu {
* @param cookie
* @return
*/
public
static
List
<
Map
<
String
,
Object
>>
getSouhuCommentData
(
String
url
,
Proxy
proxy
)
{
public
static
List
<
Map
<
String
,
Object
>>
getSouhuCommentData
(
String
url
,
Proxy
Holder
proxy
)
{
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getSouhuCommentHeaderMap
(
null
);
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
int
j
=
1
;
try
{
while
(
true
)
{
String
newurl
=
souhuCommentAnalysis
.
getSouhuURL
(
url
,
ProxyFactory
.
getNatProxy
()
)
+
"&page_no="
+
j
;
String
newurl
=
souhuCommentAnalysis
.
getSouhuURL
(
url
,
proxy
)
+
"&page_no="
+
j
;
String
result
=
HttpClient
.
executeHttpRequestGet
(
newurl
,
ProxyFactory
.
getNatProxy
(),
headerMap
);
System
.
out
.
println
(
newurl
);
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
...
...
src/main/java/com/zhiwei/parse/TechTx.java
View file @
2a35dd02
package
com
.
zhiwei
.
parse
;
import
static
java
.
util
.
Objects
.
nonNull
;
import
java.io.IOException
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.List
;
...
...
@@ -13,6 +13,7 @@ import org.slf4j.LoggerFactory;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.parse.analysis.TechTxCommentAnalysis
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
...
...
@@ -23,7 +24,7 @@ public class TechTx {
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
Map
<
String
,
Object
>>
getTechTxComment
(
String
url
,
Proxy
proxy
)
{
public
static
List
<
Map
<
String
,
Object
>>
getTechTxComment
(
String
url
,
Proxy
Holder
proxy
)
{
String
commentID
=
getCommentId
(
url
,
proxy
);
String
next
=
""
;
if
(
nonNull
(
commentID
))
{
...
...
@@ -53,7 +54,7 @@ public class TechTx {
return
Collections
.
emptyList
();
}
private
static
String
getCommentId
(
String
url
,
Proxy
proxy
)
{
private
static
String
getCommentId
(
String
url
,
Proxy
Holder
proxy
)
{
String
commentID
=
null
;
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
try
{
...
...
src/main/java/com/zhiwei/parse/Wangyi.java
View file @
2a35dd02
...
...
@@ -10,6 +10,7 @@ import org.slf4j.Logger;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.parse.analysis.WangyiCommentAnalysis
;
...
...
@@ -61,7 +62,7 @@ public class Wangyi {
* @param id
* @return
*/
public
static
int
getWangyiCommentCount
(
String
id
,
Proxy
proxy
)
{
public
static
int
getWangyiCommentCount
(
String
id
,
Proxy
Holder
proxy
)
{
try
{
String
url
=
"http://comment.dy.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/"
+
id
;
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getWangyiCommentHeaderMap
(
null
);
...
...
src/main/java/com/zhiwei/parse/Xueqiu.java
View file @
2a35dd02
...
...
@@ -5,7 +5,6 @@ import java.io.UnsupportedEncodingException;
import
java.net.Proxy
;
import
java.net.URLEncoder
;
import
java.util.ArrayList
;
import
java.util.Collection
;
import
java.util.Collections
;
import
java.util.HashMap
;
import
java.util.List
;
...
...
src/main/java/com/zhiwei/parse/Yiche.java
View file @
2a35dd02
package
com
.
zhiwei
.
parse
;
import
java.net.Proxy
;
import
static
java
.
util
.
Objects
.
nonNull
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.HashMap
;
...
...
@@ -14,9 +15,9 @@ import com.alibaba.fastjson.JSONArray;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
static
java
.
util
.
Objects
.
nonNull
;
import
okhttp3.Response
;
public
class
Yiche
{
...
...
@@ -31,7 +32,7 @@ public class Yiche {
* @param proxy
* @return
*/
public
static
int
getYicheCount
(
String
url
,
Proxy
proxy
)
{
public
static
int
getYicheCount
(
String
url
,
Proxy
Holder
proxy
)
{
String
nurl
=
getnewsId
(
url
,
proxy
);
if
(
nonNull
(
nurl
))
{
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
nurl
),
proxy
)){
...
...
@@ -52,7 +53,7 @@ public class Yiche {
* @param proxy
* @return
*/
public
static
List
<
Map
<
String
,
Object
>>
getYicheComment
(
String
url
,
Proxy
proxy
)
{
public
static
List
<
Map
<
String
,
Object
>>
getYicheComment
(
String
url
,
Proxy
Holder
proxy
)
{
String
nUrl
=
getnewsId
(
url
,
proxy
);
if
(
nonNull
(
nUrl
))
{
int
page
=
1
;
...
...
@@ -92,7 +93,7 @@ public class Yiche {
return
Collections
.
emptyList
();
}
private
static
String
getnewsId
(
String
url
,
Proxy
proxy
)
{
private
static
String
getnewsId
(
String
url
,
Proxy
Holder
proxy
)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
proxy
)){
String
result
=
response
.
body
().
string
();
String
productId
=
result
.
split
(
"productId: "
)[
1
].
split
(
","
)[
0
];
...
...
src/main/java/com/zhiwei/parse/Youku.java
View file @
2a35dd02
...
...
@@ -56,8 +56,6 @@ public class Youku {
}
catch
(
Exception
e
)
{
logger
.
error
(
" Exception {} "
,
e
);
}
}
return
list
;
...
...
src/main/java/com/zhiwei/parse/analysis/AiqiyiByWordAnalysis.java
View file @
2a35dd02
...
...
@@ -13,14 +13,8 @@ import org.jsoup.select.Elements;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
okhttp3.Response
;
public
class
AiqiyiByWordAnalysis
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
AiqiyiByWordAnalysis
.
class
);
...
...
src/main/java/com/zhiwei/parse/analysis/FenghuangCommentAnalysis.java
View file @
2a35dd02
package
com
.
zhiwei
.
parse
.
analysis
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.Date
;
...
...
@@ -15,6 +14,7 @@ import com.alibaba.fastjson.JSONArray;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.tools.timeparse.TimeParse
;
...
...
@@ -25,7 +25,7 @@ public class FenghuangCommentAnalysis {
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
public
Map
<
String
,
Object
>
getFenghuangCommentCount
(
String
url
,
Proxy
proxy
)
{
public
Map
<
String
,
Object
>
getFenghuangCommentCount
(
String
url
,
Proxy
Holder
proxy
)
{
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
proxy
)){
String
result
=
response
.
body
().
string
();
...
...
@@ -46,7 +46,7 @@ public class FenghuangCommentAnalysis {
* @param url
* @return
*/
public
String
getdocUrl
(
String
url
,
Proxy
proxy
)
{
public
String
getdocUrl
(
String
url
,
Proxy
Holder
proxy
)
{
String
docUrl
=
null
;
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
proxy
)){
...
...
@@ -79,7 +79,7 @@ public class FenghuangCommentAnalysis {
* @param url
* @return
*/
public
List
<
Map
<
String
,
Object
>>
getData
(
String
url
,
Proxy
proxy
)
{
public
List
<
Map
<
String
,
Object
>>
getData
(
String
url
,
Proxy
Holder
proxy
)
{
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getFenghuangCommentHeaderMap
(
null
);
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
)){
...
...
@@ -106,7 +106,7 @@ public class FenghuangCommentAnalysis {
* @param proxy
* @return
*/
public
List
<
Map
<
String
,
Object
>>
getData2
(
String
url
,
Proxy
proxy
)
{
public
List
<
Map
<
String
,
Object
>>
getData2
(
String
url
,
Proxy
Holder
proxy
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
proxy
)){
String
result
=
response
.
body
().
string
();
...
...
src/main/java/com/zhiwei/parse/analysis/MaimaiBywordAnalysis.java
View file @
2a35dd02
...
...
@@ -11,17 +11,17 @@ import com.alibaba.fastjson.JSONObject;
public
class
MaimaiBywordAnalysis
{
public
Map
<
String
,
Object
>
getData
(
String
result
,
String
time
)
{
Map
<
String
,
Object
>
map1
=
new
HashMap
<
String
,
Object
>();
public
Map
<
String
,
Object
>
getData
(
String
result
,
String
time
,
String
key
)
{
Map
<
String
,
Object
>
map1
=
new
HashMap
<>();
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
JSONArray
jsonArry
=
json
.
getJSONObject
(
"data"
).
getJSONArray
(
"feeds"
);
boolean
f
=
true
;
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>
>();
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
f
=
json
.
getJSONObject
(
"data"
).
getInteger
(
"more"
)==
1
?
true
:
false
;
for
(
int
i
=
0
;
i
<
jsonArry
.
size
();
i
++)
{
JSONObject
data
=
jsonArry
.
getJSONObject
(
i
);
Map
<
String
,
Object
>
map
=
new
HashMap
<
String
,
Object
>();
String
url
=
"https://maimai.cn/article/detail?fid="
+
data
.
getJSONObject
(
"feed"
).
getString
(
"id"
);
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
String
url
=
"https://maimai.cn/article/detail?fid="
+
data
.
getJSONObject
(
"feed"
).
getString
(
"id"
)
+
"&efid="
+
data
.
getString
(
"efid"
)
;
String
atime
=
data
.
getJSONObject
(
"feed"
).
getString
(
"crtime_string"
);
if
(
time
.
compareTo
(
atime
)
>
-
1
)
{
f
=
false
;
...
...
@@ -34,6 +34,8 @@ public class MaimaiBywordAnalysis {
map
.
put
(
"like"
,
data
.
getJSONObject
(
"feed"
).
getInteger
(
"likes"
));
map
.
put
(
"comment_count"
,
data
.
getJSONObject
(
"feed"
).
getInteger
(
"total_cnt"
));
map
.
put
(
"spreads"
,
data
.
getJSONObject
(
"feed"
).
getInteger
(
"spreads"
));
//传播数
map
.
put
(
"career"
,
data
.
getJSONObject
(
"contact"
).
getString
(
"career"
));
map
.
put
(
"word"
,
key
);
// System.out.println(map.toString());
dataList
.
add
(
map
);
}
...
...
@@ -42,7 +44,7 @@ public class MaimaiBywordAnalysis {
return
map1
;
}
public
Map
<
String
,
Object
>
getDataByNoName
(
String
result
,
String
time
)
{
public
Map
<
String
,
Object
>
getDataByNoName
(
String
result
,
String
time
,
String
key
)
{
Map
<
String
,
Object
>
map1
=
new
HashMap
<
String
,
Object
>();
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
JSONArray
jsonArry
=
json
.
getJSONObject
(
"data"
).
getJSONArray
(
"gossips"
);
...
...
@@ -65,6 +67,7 @@ public class MaimaiBywordAnalysis {
map
.
put
(
"like"
,
data
.
getJSONObject
(
"gossip"
).
getInteger
(
"likes"
));
map
.
put
(
"comment_count"
,
data
.
getJSONObject
(
"gossip"
).
getInteger
(
"total_cnt"
));
map
.
put
(
"spreads"
,
data
.
getJSONObject
(
"gossip"
).
getInteger
(
"search_order"
));
//传播数
map
.
put
(
"word"
,
key
);
// System.out.println(map.toString());
dataList
.
add
(
map
);
}
...
...
src/main/java/com/zhiwei/parse/analysis/SouhuCommentAnalysis.java
View file @
2a35dd02
package
com
.
zhiwei
.
parse
.
analysis
;
import
java.net.Proxy
;
import
java.util.Date
;
import
java.util.HashMap
;
import
java.util.Map
;
...
...
@@ -11,6 +10,7 @@ import org.slf4j.LoggerFactory;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HttpClient
;
...
...
@@ -27,7 +27,7 @@ public class SouhuCommentAnalysis {
* @param url
* @return
*/
public
String
getSouhuURL
(
String
url
,
Proxy
proxy
)
{
public
String
getSouhuURL
(
String
url
,
Proxy
Holder
proxy
)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
proxy
)){
String
result
=
response
.
body
().
string
();
String
source_id
=
result
.
split
(
"news_id: \""
)[
1
].
split
(
"\","
)[
0
];
...
...
@@ -39,7 +39,7 @@ public class SouhuCommentAnalysis {
return
null
;
}
public
int
getSouhuCommentCount
(
String
url
,
Proxy
proxy
)
{
public
int
getSouhuCommentCount
(
String
url
,
Proxy
Holder
proxy
)
{
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getSouhuCommentHeaderMap
(
null
);
int
i
;
try
{
...
...
src/main/java/com/zhiwei/parse/shipin/QQTV.java
View file @
2a35dd02
...
...
@@ -4,7 +4,6 @@ import java.net.Proxy;
import
java.net.URLEncoder
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.Date
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
...
...
@@ -55,7 +54,7 @@ public class QQTV {
String
nurl
=
element
.
select
(
"h2.result_title"
).
select
(
"a"
).
attr
(
"href"
);
Map
<
String
,
Object
>
map
=
getUrlData
(
nurl
,
ProxyFactory
.
getNatProxy
());
if
(
Objects
.
nonNull
(
map
)
&&
time
.
compareTo
(
String
.
valueOf
(
map
.
get
(
"time"
)))
<
1
)
{
System
.
out
.
println
(
map
.
toString
());
//
System.out.println(map.toString());
dataList
.
add
(
map
);
}
ZhiWeiTools
.
sleep
(
50
);
...
...
@@ -64,6 +63,7 @@ public class QQTV {
if
(
count
!=
dataList
.
size
())
{
continue
;
}
break
;
}
catch
(
Exception
e
)
{
logger
.
error
(
" 数据采集出错 {} "
,
e
);
...
...
src/main/java/com/zhiwei/parse/shipin/SohuTV.java
View file @
2a35dd02
...
...
@@ -35,6 +35,7 @@ public class SohuTV {
headers
.
put
(
"cookie"
,
cookie
);
while
(
true
)
{
int
count
=
dataList
.
size
();
System
.
out
.
println
(
url
+
page
);
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
+
page
,
headers
),
proxy
)){
String
result
=
response
.
body
().
string
();
Document
document
=
Jsoup
.
parse
(
result
);
...
...
src/test/java/com/zhiwei/Comment/MaimaiCommentCountTest.java
View file @
2a35dd02
//package com.zhiwei.Comment;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.testng.annotations.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Maimai;
//import com.zhiwei.parse.Yiche;
//import com.zhiwei.tools.tools.ZhiWeiTools;
//
//public class MaimaiCommentCountTest {
// @Test
// public void f() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",
// GroupType.PROVIDER);
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
// Map<String, Object> map = poi
// .importExcel("C:\\Users\\byte-zbs\\Desktop\\tx\\脉脉#美团 裁员#汇总截至12月20日10点30分.xlsx(1).xlsx", 0);
// List<Map<String, Object>> list = (List<Map<String, Object>>) map.get("body");
// List<Map<String, Object>> bodyList = new ArrayList<Map<String, Object>>();
// List<String> headList = (List<String>) map.get("head");
// for (Map<String, Object> map1 : list) {
// String url = map1.get("地址") + "";
// Map<String,Object> map3 = Maimai.getMaiaiCount(url, ProxyFactory.getNatProxy());
// map1.putAll(map3);
// ZhiWeiTools.sleep(100);
// }
// headList.add("like");
// headList.add("spreads");
// headList.add("cmts");
// poi.exportExcel("C:\\Users\\byte-zbs\\Desktop\\tx\\脉脉#美团 裁员#汇总截至12月20日10点30分.xlsx(1).xlsx", "评论采集", headList,
// list);
// }
//}
package
com
.
zhiwei
.
Comment
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.testng.annotations.Test
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Maimai
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
public
class
MaimaiCommentCountTest
{
@Test
public
void
f
()
{
ProxyFactory
.
init
(
"zookeeper://192.168.0.36:2181"
,
"local"
,
GroupType
.
PROVIDER
);
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
Map
<
String
,
Object
>
map
=
poi
.
importExcel
(
"C:\\Users\\byte-zbs\\Desktop\\tx\\脉脉公司圈.xlsx"
,
0
);
List
<
Map
<
String
,
Object
>>
list
=
(
List
<
Map
<
String
,
Object
>>)
map
.
get
(
"body"
);
String
cookie
=
"_buuid=ba30f54f-57ed-4dd4-af5f-31cb08d2eacf; sessionid=lejfy3gdu5tf9x9zowxfhtq5o73dubc5; guid=GxsfBBgZGwQYGx4EGBkeVgcYGxkdHhMeHhkcVhwZBB0ZHwVDWEtMS3kKEhMEEh0fGQQaBBsdBU9HRVhCaQoDRUFJT20KT0FDRgoGZmd+YmECChwZBB0ZHwVeQ2FIT31PRlpaawoDHhx9ZX0KERkEHAp+ZApZXUVOREN9AgoaBB8FS0ZGQ1BFZw==; seid=s1550814253444; token=\"rhItcea5qkO6WCSnVcczW/NRVLLCTsq3kQbpUCGAwQ0ceLunVJRjT5rgoFVYrIBA8CKuzcDfAvoCmBm7+jVysA==\"; uid=\"aa0Zx+VbwC41ceG8bxvIefAirs3A3wL6ApgZu/o1crA=\"; session=eyJ1IjoiMTczMzMzNTM2Iiwic2VjcmV0IjoiVjJuNHdCVDBncVNacTRxVllGM29jRUVwIiwic3RhdHVzIjp0cnVlLCJfZXhwaXJlIjoxNTUwOTAyMTY3MDQ5LCJfbWF4QWdlIjo4NjQwMDAwMH0=; session.sig=zbs4cHtzTcHWvjtkpjAZmoqLXsQ"
;
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
List
<
String
>
headList
=
(
List
<
String
>)
map
.
get
(
"head"
);
for
(
Map
<
String
,
Object
>
map1
:
list
)
{
String
url
=
map1
.
get
(
"地址"
)
+
""
;
Map
<
String
,
Object
>
map3
=
Maimai
.
getMaiaiCount
(
url
,
null
,
ProxyHolder
.
NAT_PROXY
);
System
.
out
.
println
(
map3
.
toString
());
System
.
out
.
println
(
url
);
map1
.
putAll
(
map3
);
ZhiWeiTools
.
sleep
(
500
);
System
.
out
.
println
(
"--------------------------"
);
}
headList
.
add
(
"like"
);
headList
.
add
(
"spreads"
);
headList
.
add
(
"cmts"
);
poi
.
exportExcel
(
"C:\\Users\\byte-zbs\\Desktop\\tx\\脉脉公司圈.xlsx"
,
"评论采集"
,
headList
,
list
);
}
}
src/test/java/com/zhiwei/crawler/AiqiyiByWordExample.java
View file @
2a35dd02
package
com
.
zhiwei
.
crawler
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Aiqiyi
;
import
com.zhiwei.util.WordReadFile
;
public
class
AiqiyiByWordExample
{
@Test
public
void
aiqiyiByWordTest
()
{
ProxyFactory
.
init
(
"zookeeper://192.168.0.36:2181"
,
"local"
,
GroupType
.
PROVIDER
);
List
<
String
>
wordList
=
WordReadFile
.
getWords
(
"D://crawlerdata//关键词.txt"
);
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
for
(
String
w
:
wordList
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
Aiqiyi
.
getAiqiyiByWordData
(
w
,
null
);
if
(
dataList
!=
null
&&
dataList
.
size
()
>=
1
)
{
bodyList
.
addAll
(
dataList
);
}
}
List
<
String
>
headList
=
new
ArrayList
<
String
>();
headList
.
add
(
"count"
);
headList
.
add
(
"time"
);
headList
.
add
(
"source"
);
headList
.
add
(
"content"
);
headList
.
add
(
"url"
);
headList
.
add
(
"title"
);
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
poi
.
exportExcel
(
"D://crawlerdata/爱奇艺关键词采集.xlsx"
,
"数据"
,
headList
,
bodyList
);
}
}
//
package com.zhiwei.crawler;
//
//
import java.util.ArrayList;
//
import java.util.List;
//
import java.util.Map;
//
//
import org.junit.Test;
//
//
import com.zhiwei.common.config.GroupType;
//
import com.zhiwei.crawler.proxy.ProxyFactory;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.Aiqiyi;
//
import com.zhiwei.util.WordReadFile;
//
//
/**
 * Example harness: runs the iQiyi keyword crawler for every keyword in the
 * local keyword file and exports the combined rows to an Excel workbook.
 */
public class AiqiyiByWordExample {

    /**
     * Initialises the proxy factory, crawls each keyword through
     * {@code Aiqiyi.getAiqiyiByWordData} (second argument {@code null}) and
     * writes all non-empty results to the output spreadsheet.
     */
    @Test
    public void aiqiyiByWordTest() {
        ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);

        List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
        for (String keyword : WordReadFile.getWords("D://crawlerdata//关键词.txt")) {
            List<Map<String,Object>> hits = Aiqiyi.getAiqiyiByWordData(keyword, null);
            // Skip keywords for which the crawler produced nothing.
            if (hits == null || hits.isEmpty()) {
                continue;
            }
            bodyList.addAll(hits);
        }

        // Column order of the exported sheet.
        List<String> headList = new ArrayList<String>();
        for (String column : new String[] {"count", "time", "source", "content", "url", "title"}) {
            headList.add(column);
        }

        PoiExcelUtil.getInstance().exportExcel("D://crawlerdata/爱奇艺关键词采集.xlsx", "数据", headList, bodyList);
    }
}
src/test/java/com/zhiwei/crawler/MaimaiBywordExample.java
View file @
2a35dd02
package
com
.
zhiwei
.
crawler
;
import
java.util.ArrayList
;
import
java.util.Arrays
;
import
java.util.List
;
import
java.util.Map
;
...
...
@@ -11,8 +12,8 @@ public class MaimaiBywordExample {
public
static
void
main
(
String
[]
args
)
{
String
word
=
"美团|某团|MT|大众点评|新美大|美团点评"
;
String
cookie
=
"_buuid=ba30f54f-57ed-4dd4-af5f-31cb08d2eacf; sessionid=
3oatshv55and4kwcz9gdpie7qdpj27yt; guid=GxsfBBgZGwQYGx4EGBkeVgcYGxkdHxwdGRMcVhwZBB0ZHwVDWEtMS3kKEhMEEh0fGQQaBBsdBU9HRVhCaQoDRUFJT20KT0FDRgoGZmd+YmECChwZBB0ZHwVeQ2FIT31PRlpaawoDHhx9ZX0KERkEHAp+ZApZXUVOREN9AgoaBB8FS0ZGQ1BFZw==; seid=s1548984672861; token=\"Ap1u6QzIdn8FCrohEAEPI86n9mNSKk1qJWlauQ8KeSbn7fDKTu6bN2Yv6B9V19nO8CKuzcDfAvoCmBm7+jVysA==\"; uid=\"aa0Zx+VbwC41ceG8bxvIefAirs3A3wL6ApgZu/o1crA=\"; session=eyJ1IjoiMTczMzMzNTM2Iiwic2VjcmV0IjoibVVlSlRTUW1NdVdUTUUtRjV0SjBZbExtIiwibWlkNDU2ODc2MCI6ZmFsc2UsInN0YXR1cyI6dHJ1ZSwiX2V4cGlyZSI6MTU0OTA3MTEzOTA2NywiX21heEFnZSI6ODY0MDAwMDB9; session.sig=UOz44C2rF-uJFxFvSwHyII5aJxM
"
;
String
time
=
"2019-0
1-24
00:00:00"
;
String
cookie
=
"_buuid=ba30f54f-57ed-4dd4-af5f-31cb08d2eacf; sessionid=
lejfy3gdu5tf9x9zowxfhtq5o73dubc5; guid=GxsfBBgZGwQYGx4EGBkeVgcYGxkdHhMeHhkcVhwZBB0ZHwVDWEtMS3kKEhMEEh0fGQQaBBsdBU9HRVhCaQoDRUFJT20KT0FDRgoGZmd+YmECChwZBB0ZHwVeQ2FIT31PRlpaawoDHhx9ZX0KERkEHAp+ZApZXUVOREN9AgoaBB8FS0ZGQ1BFZw==; seid=s1550814253444; token=\"G8eNNNylPoi3oIPLUr/d/RDaMgtnpZCskxT7wu1pRRrkiy3J8G7StHgTx9DQBq4O8CKuzcDfAvoCmBm7+jVysA==\"; uid=\"aa0Zx+VbwC41ceG8bxvIefAirs3A3wL6ApgZu/o1crA=\"; session=eyJ1IjoiMTczMzMzNTM2Iiwic2VjcmV0IjoiVjJuNHdCVDBncVNacTRxVllGM29jRUVwIiwic3RhdHVzIjp0cnVlLCJfZXhwaXJlIjoxNTUwOTAwNjY1Njg4LCJfbWF4QWdlIjo4NjQwMDAwMH0=; session.sig=b_tga85tZskxsgKX8YIM_JKByi0
"
;
String
time
=
"2019-0
2-15
00:00:00"
;
String
[]
words
=
word
.
split
(
"\\|"
);
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
for
(
String
w
:
words
)
{
...
...
@@ -23,16 +24,9 @@ public class MaimaiBywordExample {
// bodyList.addAll(c);
bodyList
.
addAll
(
c2
);
}
List
<
String
>
headList
=
new
ArrayList
<
String
>();
headList
.
add
(
"time"
);
headList
.
add
(
"url"
);
headList
.
add
(
"text"
);
headList
.
add
(
"name"
);
headList
.
add
(
"like"
);
headList
.
add
(
"comment_count"
);
headList
.
add
(
"spreads"
);
List
<
String
>
headList
=
Arrays
.
asList
(
"time"
,
"url"
,
"text"
,
"name"
,
"like"
,
"comment_count"
,
"spreads"
,
"word"
);
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
poi
.
exportExcel
(
"D:\\crawlerdata\\自媒体\\脉脉关键词采集-美团-02
01
.xlsx"
,
"脉脉关键词"
,
headList
,
bodyList
);
poi
.
exportExcel
(
"D:\\crawlerdata\\自媒体\\脉脉关键词采集-美团-02
22
.xlsx"
,
"脉脉关键词"
,
headList
,
bodyList
);
}
}
src/test/java/com/zhiwei/crawler/SouhuCommentCountExample.java
View file @
2a35dd02
package
com
.
zhiwei
.
crawler
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
...
...
@@ -8,13 +7,14 @@ import org.junit.Test;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Souhu
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
public
class
SouhuCommentCountExample
{
@SuppressWarnings
(
"unchecked"
)
@Test
public
void
souhuCommentCountTest
()
{
ProxyFactory
.
init
(
"zookeeper://192.168.0.36:2181"
,
"local"
,
...
...
@@ -28,7 +28,7 @@ public class SouhuCommentCountExample {
try
{
url
=
map1
.
get
(
"url"
)+
""
;
System
.
out
.
println
(
url
);
int
i
=
Souhu
.
getSouhuCommentCount
(
url
,
Proxy
Factory
.
getNatProxy
()
);
int
i
=
Souhu
.
getSouhuCommentCount
(
url
,
Proxy
Holder
.
NAT_PROXY
);
map1
.
put
(
"count"
,
i
);
System
.
out
.
println
(
map1
.
toString
());
}
catch
(
Exception
e
)
{
...
...
src/test/java/com/zhiwei/keyword/YoukuKeyWordTest.java
deleted
100644 → 0
View file @
b3d545a3
package
com
.
zhiwei
.
keyword
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.testng.annotations.Test
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Youku
;
public
class
YoukuKeyWordTest
{
@Test
public
void
f
()
{
ProxyFactory
.
init
(
"zookeeper://192.168.0.36:2181"
,
"local"
,
GroupType
.
PROVIDER
);
String
word
=
"帮宝适 二噁英,"
+
"帮宝适 二恶英,"
+
"帮宝适 甲醛,"
+
"帮宝适 荧光,"
+
"帮宝适 致癌,"
+
"帮宝适 有毒,"
+
"帮宝适 超标,"
+
"帮宝适 防腐剂,"
+
"帮宝适 起诉,"
+
"帮宝适 伤害,"
+
"帮宝适 气味,"
+
"帮宝适 异味,"
+
"帮宝适 起坨,"
+
"帮宝适 异物,"
+
"帮宝适 漏,"
+
"帮宝适 刺鼻,"
+
"帮宝适 勒,"
+
"帮宝适 脱皮,"
+
"帮宝适 划伤,"
+
"绿帮 二噁英,"
+
"绿帮 二恶英,"
+
"绿帮 甲醛,"
+
"绿帮 荧光,"
+
"绿帮 致癌,"
+
"绿帮 有毒,"
+
"绿帮 超标,"
+
"绿帮 起诉,"
+
"绿帮 气味,"
+
"绿帮 异味,"
+
"绿帮 异物,"
+
"绿帮 漏,"
+
"绿帮 刺鼻,"
+
"绿帮 勒,"
+
"绿帮 脱皮"
;
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<>();
String
[]
words
=
word
.
split
(
","
);
for
(
String
w
:
words
)
{
System
.
out
.
println
(
w
);
bodyList
.
addAll
(
Youku
.
getDataList
(
w
));
}
List
<
String
>
headList
=
new
ArrayList
<>();
headList
.
add
(
"title"
);
headList
.
add
(
"time"
);
headList
.
add
(
"url"
);
headList
.
add
(
"uper"
);
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
poi
.
exportExcel
(
"C:\\Users\\byte-zbs\\Desktop\\tx\\优酷数据-txh-0121.xlsx"
,
"数据"
,
headList
,
bodyList
);
}
}
src/test/java/com/zhiwei/shipin/BilibiliTest.java
View file @
2a35dd02
...
...
@@ -13,7 +13,7 @@ import com.zhiwei.util.WordReadFile;
public
class
BilibiliTest
{
@Test
public
void
f
()
{
List
<
String
>
wordList
=
WordReadFile
.
getWords
(
"D://crawlerdata//关键词.txt"
);
List
<
String
>
wordList
=
WordReadFile
.
getWords
(
"D://crawlerdata//关键词
-2
.txt"
);
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<>();
String
cookie
=
"LIVE_BUVID=AUTO8715300758995538; sid=kp5rluge; fts=1530161621; im_notify_type_35324319=0; buvid3=08ABE6AE-5061-4CE5-B34F-1A8AAB64DB3320712infoc; rpdid=olppsmkxmpdoskwoxiwww; finger=edc6ecda; stardustvideo=1; UM_distinctid=164fe68fb31996-01f161c3523abe-6114167a-1fa400-164fe68fb32274"
;
for
(
String
word
:
wordList
)
{
...
...
@@ -32,7 +32,7 @@ public class BilibiliTest {
headlist
.
add
(
"title"
);
headlist
.
add
(
"url"
);
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
poi
.
exportExcel
(
"D://crawlerdata//bilibili关键词采集数据-txh-0
130
.xlsx"
,
"B站数据"
,
headlist
,
bodyList
);
poi
.
exportExcel
(
"D://crawlerdata//bilibili关键词采集数据-txh-0
219-农药
.xlsx"
,
"B站数据"
,
headlist
,
bodyList
);
}
}
src/test/java/com/zhiwei/shipin/DouyinHotExample.java
View file @
2a35dd02
package
com
.
zhiwei
.
shipin
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Douyin
;
public
class
DouyinHotExample
{
public
static
void
main
(
String
[]
args
)
{
List
<
Map
<
String
,
Object
>>
bodyList
=
Douyin
.
getDouyinHotData
(
"https://www.iesdouyin.com/share/challenge/1604239741363223?utm_campaign=client_share&app=aweme&utm_medium=ios&tt_from=qq&utm_source=qq&iid=36454376501"
,
null
);
List
<
String
>
headList
=
new
ArrayList
<
String
>();
headList
.
add
(
"text"
);
headList
.
add
(
"url"
);
headList
.
add
(
"time"
);
headList
.
add
(
"author"
);
headList
.
add
(
"comment_count"
);
headList
.
add
(
"like_count"
);
headList
.
add
(
"share_count"
);
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
poi
.
exportExcel
(
"D:\\crawlerdata\\抖音热门采集测试.xlsx"
,
"asd"
,
headList
,
bodyList
);
}
}
//
package com.zhiwei.shipin;
//
//
import java.util.ArrayList;
//
import java.util.List;
//
import java.util.Map;
//
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.Douyin;
//
//
/**
 * Example harness: crawls one Douyin challenge page and exports the rows
 * returned by {@code Douyin.getDouyinHotData} to an Excel workbook.
 */
public class DouyinHotExample {

    /**
     * Fetches the hard-coded challenge URL (second argument {@code null}) and
     * writes the result to the test spreadsheet.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String challengeUrl = "https://www.iesdouyin.com/share/challenge/1604239741363223?utm_campaign=client_share&app=aweme&utm_medium=ios&tt_from=qq&utm_source=qq&iid=36454376501";
        List<Map<String,Object>> bodyList = Douyin.getDouyinHotData(challengeUrl, null);

        // Column order of the exported sheet.
        List<String> headList = new ArrayList<String>();
        for (String column : new String[] {"text", "url", "time", "author", "comment_count", "like_count", "share_count"}) {
            headList.add(column);
        }

        PoiExcelUtil.getInstance().exportExcel("D:\\crawlerdata\\抖音热门采集测试.xlsx", "asd", headList, bodyList);
    }
}
src/test/java/com/zhiwei/shipin/QQTVTest.java
View file @
2a35dd02
...
...
@@ -11,7 +11,6 @@ import com.zhiwei.crawler.proxy.ProxyFactory;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.shipin.QQTV
;
import
com.zhiwei.parse.shipin.SohuTV
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.util.WordReadFile
;
...
...
@@ -19,7 +18,7 @@ public class QQTVTest {
@Test
public
void
f
()
{
ProxyFactory
.
init
(
"zookeeper://192.168.0.36:2181"
,
"local"
,
GroupType
.
PROVIDER
);
String
time
=
"
2018
-07-01 00:00:00"
;
String
time
=
"
1970
-07-01 00:00:00"
;
List
<
String
>
wordList
=
WordReadFile
.
getWords
(
"D://crawlerdata//关键词.txt"
);
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<>();
for
(
String
word
:
wordList
)
{
...
...
src/test/java/com/zhiwei/shipin/SohuTVTest.java
View file @
2a35dd02
package
com
.
zhiwei
.
shipin
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.testng.annotations.Test
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.BiliBili
;
import
com.zhiwei.parse.shipin.SohuTV
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.util.WordReadFile
;
/**
 * Manual TestNG driver that reads keywords from a local text file, queries
 * {@code SohuTV.sohuTVData} once per keyword, and exports every collected row
 * to an Excel workbook through {@code PoiExcelUtil}.
 */
public class SohuTVTest {

    /**
     * Runs the keyword collection and writes the combined result to a fixed
     * path on the local D: drive.
     */
    @Test
    public void f() {
        List<String> keywords = WordReadFile.getWords("D://crawlerdata//关键词.txt");
        List<Map<String, Object>> collected = new ArrayList<>();
        // Hard-coded session cookie forwarded with every request.
        String cookie = "SUV=1901101134139015; IPLOC=CN3301; gidinf=x099980109ee0f08567b42835000336ade2ef3762611; fuid=15474616189304048886; newpuid=15474616191372936893; beans_mz_userid=UBThg01XRPg8; pmai=dad35c1c318bdd22; ifoxinstalled=false; beans_freq=1; beans_dmp=%7B%22admaster%22%3A1547461620%2C%22shunfei%22%3A1547461620%2C%22reachmax%22%3A1548816807%2C%22lingji%22%3A1547461620%2C%22yoyi%22%3A1547461620%2C%22ipinyou%22%3A1547461620%2C%22ipinyou_admaster%22%3A1547461620%2C%22miaozhen%22%3A1548816807%2C%22diantong%22%3A1547461620%2C%22huayang%22%3A1547461620%7D; beans_dmp_done=1; reqtype=pc; sokey=%5B%7B%22key%22%3A%22%E7%BE%8E%E5%9B%A2%22%7D%2C%7B%22key%22%3A%22%E5%B8%AE%E5%AE%9D%E9%80%82%22%7D%2C%7B%22key%22%3A%22%E5%B8%AE%E5%AE%9D%E9%80%82%20%E4%BA%8C%E5%99%81%E8%8B%B1%22%7D%5D; t=1548817812321";

        for (String keyword : keywords) {
            List<Map<String, Object>> hits = SohuTV.sohuTVData(keyword, cookie, null);
            // A null result is skipped; only non-null results are logged and kept.
            if (hits != null) {
                System.out.println(keyword + " ----- " + hits.size());
                collected.addAll(hits);
            }
            // Pause after every keyword, whether or not it returned data.
            // NOTE(review): the argument is presumably milliseconds - confirm
            // against ZhiWeiTools.sleep.
            ZhiWeiTools.sleep(1000);
        }

        List<String> columns = headerColumns();
        PoiExcelUtil exporter = PoiExcelUtil.getInstance();
        // NOTE(review): the sheet name "B站数据" looks carried over from the
        // Bilibili test; kept unchanged here - confirm whether it is intended.
        exporter.exportExcel("D://crawlerdata//搜狐视频关键词采集数据-txh-0130.xlsx", "B站数据", columns, collected);
    }

    /** Builds the header column names in the order handed to the exporter. */
    private static List<String> headerColumns() {
        String[] names = {"playCount", "time", "source", "title", "url"};
        List<String> columns = new ArrayList<>();
        for (String name : names) {
            columns.add(name);
        }
        return columns;
    }
}
//package com.zhiwei.shipin;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.testng.annotations.Test;
//
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.shipin.SohuTV;
//import com.zhiwei.tools.tools.ZhiWeiTools;
//import com.zhiwei.util.WordReadFile;
//
//public class SohuTVTest {
// @Test
// public void f() {
// List<String> wordList = WordReadFile.getWords("D://crawlerdata//关键词.txt");
// List<Map<String, Object>> bodyList = new ArrayList<>();
// String cookie = "SUV=1901101134139015; IPLOC=CN3301; gidinf=x099980109ee0f08567b42835000336ade2ef3762611; fuid=15474616189304048886; newpuid=15474616191372936893; beans_mz_userid=UBThg01XRPg8; pmai=dad35c1c318bdd22; ifoxinstalled=false; beans_freq=1; beans_dmp=%7B%22admaster%22%3A1547461620%2C%22shunfei%22%3A1547461620%2C%22reachmax%22%3A1548816807%2C%22lingji%22%3A1547461620%2C%22yoyi%22%3A1547461620%2C%22ipinyou%22%3A1547461620%2C%22ipinyou_admaster%22%3A1547461620%2C%22miaozhen%22%3A1548816807%2C%22diantong%22%3A1547461620%2C%22huayang%22%3A1547461620%7D; beans_dmp_done=1; reqtype=pc; sokey=%5B%7B%22key%22%3A%22%E7%BE%8E%E5%9B%A2%22%7D%2C%7B%22key%22%3A%22%E5%B8%AE%E5%AE%9D%E9%80%82%22%7D%2C%7B%22key%22%3A%22%E5%B8%AE%E5%AE%9D%E9%80%82%20%E4%BA%8C%E5%99%81%E8%8B%B1%22%7D%5D; t=1548817812321";
// for (String word : wordList) {
// List<Map<String, Object>> dataList = SohuTV.sohuTVData(word, cookie, null);
// if (dataList != null) {
// System.out.println(word + " ----- " + dataList.size());
// bodyList.addAll(dataList);
// }
// ZhiWeiTools.sleep(1000);
// }
// List<String> headlist = new ArrayList<>();
// headlist.add("playCount");
// headlist.add("time");
// headlist.add("source");
// headlist.add("title");
// headlist.add("url");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("D://crawlerdata//搜狐视频关键词采集数据-txh-0219.xlsx", "B站数据", headlist, bodyList);
//
// }
//}
src/test/java/com/zhiwei/shipin/YoukuKeyWordTest.java
0 → 100644
View file @
2a35dd02
//package com.zhiwei.shipin;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.testng.annotations.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Youku;
//import com.zhiwei.util.WordReadFile;
//
//public class YoukuKeyWordTest {
// @Test
// public void f() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",
// GroupType.PROVIDER);
// List<String> words = WordReadFile.getWords("D://crawlerdata//关键词.txt");
// List<Map<String,Object>> bodyList = new ArrayList<>();
// for(String w : words) {
// System.out.println(w);
// bodyList.addAll(Youku.getDataList(w));
// }
// List<String> headList = new ArrayList<>();
// headList.add("title");
// headList.add("time");
// headList.add("url");
// headList.add("uper");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("D://crawlerdata//优酷数据-txh-0219.xlsx", "数据", headList, bodyList);
//
// }
//}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment