Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
A
articlenewscrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
chenweiyang
articlenewscrawler
Commits
6018f0b3
Commit
6018f0b3
authored
Nov 18, 2019
by
yangchen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
提交修改
parent
3e350f8b
Show whitespace changes
Inline
Side-by-side
Showing
28 changed files
with
559 additions
and
391 deletions
+559
-391
pom.xml
+2
-2
src/main/java/com/zhiwei/httpclient/HeadGet.java
+4
-5
src/main/java/com/zhiwei/httpclient/HttpClient.java
+66
-5
src/main/java/com/zhiwei/parse/Baijia.java
+8
-6
src/main/java/com/zhiwei/parse/BiliBili.java
+6
-7
src/main/java/com/zhiwei/parse/Dayu.java
+34
-1
src/main/java/com/zhiwei/parse/Douban.java
+2
-3
src/main/java/com/zhiwei/parse/Maimai.java
+40
-0
src/main/java/com/zhiwei/parse/QQKB.java
+2
-3
src/main/java/com/zhiwei/parse/QQKandian.java
+252
-253
src/main/java/com/zhiwei/parse/SinaKeji.java
+19
-8
src/main/java/com/zhiwei/parse/SinaTousu.java
+1
-1
src/main/java/com/zhiwei/parse/Souhu.java
+2
-8
src/main/java/com/zhiwei/parse/TXNews.java
+3
-1
src/main/java/com/zhiwei/parse/Wangyi.java
+17
-1
src/main/java/com/zhiwei/parse/analysis/AiqiyiByWordAnalysis.java
+5
-7
src/main/java/com/zhiwei/parse/analysis/BaijiaAccountAnalysis.java
+5
-7
src/main/java/com/zhiwei/parse/analysis/BilibilikeyWordAnalysis.java
+2
-2
src/main/java/com/zhiwei/parse/analysis/SouhuCommentAnalysis.java
+20
-12
src/test/java/com/zhiwei/Comment/MaimaiCommentCountTest.java
+2
-2
src/test/java/com/zhiwei/Comment/SinaKejiComment.java
+9
-3
src/test/java/com/zhiwei/crawler/QQAccountExample.java
+1
-1
src/test/java/com/zhiwei/hsitory/BaijiaAccountExample.java
+8
-4
src/test/java/com/zhiwei/shipin/AiqiyiTest.java
+2
-2
src/test/java/com/zhiwei/shipin/BilibiliTest.java
+42
-42
src/test/java/com/zhiwei/shipin/QQTVTest.java
+3
-3
src/test/java/com/zhiwei/shipin/SohuTVTest.java
+1
-1
src/test/java/com/zhiwei/shipin/YoukuKeyWordTest.java
+1
-1
No files found.
pom.xml
View file @
6018f0b3
...
...
@@ -3,7 +3,7 @@
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiwei
</groupId>
<artifactId>
articlenewscrawler
</artifactId>
<version>
0.
1.7
-SNAPSHOT
</version>
<version>
0.
2.2
-SNAPSHOT
</version>
<name>
articlenewscrawler
</name>
<description>
采集凤凰,一点资讯,搜狐历时文章和文章评论
</description>
...
...
@@ -21,7 +21,7 @@
<dependency>
<groupId>
com.zhiwei.crawler
</groupId>
<artifactId>
crawler-core
</artifactId>
<version>
0.
3.6-RELEASE
</version>
<version>
0.
5.5.6-SNAPSHOT
</version>
<scope>
provided
</scope>
</dependency>
</dependencies>
...
...
src/main/java/com/zhiwei/httpclient/HeadGet.java
View file @
6018f0b3
...
...
@@ -67,7 +67,7 @@ public class HeadGet {
public
static
Map
<
String
,
String
>
getYidianzixunAccountHeaderMap
(
String
cookie
,
String
referer
)
{
Map
<
String
,
String
>
headerMap
=
new
HashMap
<
String
,
String
>();
headerMap
.
put
(
"User-Agent"
,
"Mozilla/5.0 (
Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36
"
);
"Mozilla/5.0 (
iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1
"
);
headerMap
.
put
(
"Accept"
,
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
);
headerMap
.
put
(
"Accept-Language"
,
"zh-CN,zh;q=0.9"
);
...
...
@@ -254,14 +254,13 @@ public class HeadGet {
* @throws IOException
*/
public
static
Map
<
String
,
String
>
getDayuCommentHeaderMap
(
String
cookie
)
{
Map
<
String
,
String
>
headerMap
=
new
HashMap
<
String
,
String
>();
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>();
headerMap
.
put
(
"User-Agent"
,
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"
);
headerMap
.
put
(
"Accept"
,
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
);
headerMap
.
put
(
"Accept-Language"
,
"zh-CN,zh;q=0.9"
);
headerMap
.
put
(
"Connection"
,
"keep-alive"
);
headerMap
.
put
(
"Host"
,
"m.uczzd.cn"
);
if
(
cookie
!=
null
)
{
headerMap
.
put
(
"Cookie"
,
cookie
);
}
...
...
@@ -293,13 +292,13 @@ public class HeadGet {
}
/**
 * Builds a request parameter map holding the given {@code chlid} value.
 *
 * @param chlid value stored under the {@code "chlid"} key (may be {@code null})
 * @return a new, mutable map with the single entry {@code "chlid" -> chlid}
 */
public static Map<String, Object> getQQAccountOneParamMap(String chlid) {
    // Only one declaration: the old explicit-type-argument line and the new
    // diamond line must not both be present.
    Map<String, Object> paramMap = new HashMap<>();
    paramMap.put("chlid", chlid);
    return paramMap;
}
/**
 * Builds a request parameter map holding the given {@code ids} value.
 *
 * @param ids value stored under the {@code "ids"} key (may be {@code null})
 * @return a new, mutable map with the single entry {@code "ids" -> ids}
 */
public static Map<String, Object> getQQAccountOtherParamMap(String ids) {
    // Only one declaration: the old explicit-type-argument line and the new
    // diamond line must not both be present.
    Map<String, Object> paramMap = new HashMap<>();
    paramMap.put("ids", ids);
    return paramMap;
}
...
...
src/main/java/com/zhiwei/httpclient/HttpClient.java
View file @
6018f0b3
...
...
@@ -3,6 +3,7 @@ package com.zhiwei.httpclient;
import
java.io.IOException
;
import
java.net.Proxy
;
import
java.util.Map
;
import
java.util.Objects
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
...
...
@@ -11,12 +12,14 @@ import com.zhiwei.crawler.core.HttpBoot;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
okhttp3.FormBody
;
import
okhttp3.Headers
;
import
okhttp3.Response
;
public
class
HttpClient
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
HttpClient
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
throwException
(
false
).
build
();
/**
*
...
...
@@ -43,6 +46,25 @@ public class HttpClient {
* @return
* @throws IOException
*/
/**
 * Issues a GET request through the shared {@code httpBoot} client and returns
 * the response body as text.
 *
 * <p>The request is attempted at most three times; every failure is logged and
 * the next attempt starts immediately.
 *
 * @param url   address to fetch
 * @param proxy proxy holder handed to {@code httpBoot.syncCall}
 * @return the response body, or {@code null} when all three attempts failed
 */
public static String executeHttpRequestGet(String url, ProxyHolder proxy) {
    int attemptsLeft = 3;
    while (attemptsLeft > 0) {
        attemptsLeft--;
        try (Response reply = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)) {
            String payload = reply.body().string();
            return payload;
        } catch (Exception failure) {
            logger.error("httpClient 获取数据出现问题:{}", failure);
        }
    }
    return null;
}
/**
*
* @Description (TODO这里用一句话描述这个方法的作用)
* @param url
* @param cookie
* @return
* @throws IOException
*/
public
static
String
executeHttpRequestGet
(
String
url
,
ProxyHolder
proxy
,
Map
<
String
,
String
>
headerMap
)
{
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
)){
...
...
@@ -54,8 +76,37 @@ public class HttpClient {
return
null
;
}
public
static
String
executeHttpRequestPost
(
String
url
,
Proxy
proxy
,
Map
<
String
,
String
>
headerMap
,
Map
<
String
,
Object
>
paramMap
)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapPost
(
url
,
headerMap
,
paramMap
),
proxy
)){
/**
 * Issues a GET request with the supplied OkHttp headers through the shared
 * {@code httpBoot} client and returns the response body as text.
 *
 * <p>The request is attempted at most three times; every failure is logged and
 * the next attempt starts immediately.
 *
 * @param url    address to fetch
 * @param proxy  proxy holder handed to {@code httpBoot.syncCall}
 * @param header request headers passed to {@code RequestUtils.wrapGet}
 * @return the response body, or {@code null} when all three attempts failed
 */
public static String executeHttpRequestGet(String url, ProxyHolder proxy, Headers header) {
    for (int remaining = 3; remaining > 0; remaining--) {
        try (Response reply = httpBoot.syncCall(RequestUtils.wrapGet(url, header), proxy)) {
            String payload = reply.body().string();
            return payload;
        } catch (Exception failure) {
            logger.error("httpClient 获取数据出现问题:{}", failure);
        }
    }
    return null;
}
public
static
String
executeHttpRequestPost
(
String
url
,
Proxy
proxy
,
Map
<
String
,
String
>
headerMap
,
Map
<
String
,
Object
>
params
)
{
FormBody
body
=
null
;
if
(
Objects
.
nonNull
(
params
)
&&
!
params
.
isEmpty
())
{
FormBody
.
Builder
builder
=
new
FormBody
.
Builder
();
params
.
forEach
((
lt
,
rt
)
->
{
if
(
Objects
.
nonNull
(
lt
))
{
builder
.
add
(
String
.
valueOf
(
lt
),
Objects
.
isNull
(
rt
)
?
""
:
String
.
valueOf
(
rt
));
}
});
body
=
builder
.
build
();
}
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapPost
(
url
,
headerMap
,
body
),
proxy
)){
return
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
logger
.
error
(
"httpClient 获取数据出现问题:{}"
,
e
);
...
...
@@ -64,8 +115,18 @@ public class HttpClient {
}
public
static
String
executeHttpRequestPost
(
String
url
,
ProxyHolder
proxy
,
Map
<
String
,
String
>
headerMap
,
Map
<
String
,
Object
>
paramMap
)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapPost
(
url
,
headerMap
,
paramMap
),
proxy
)){
public
static
String
executeHttpRequestPost
(
String
url
,
ProxyHolder
proxy
,
Map
<
String
,
String
>
headerMap
,
Map
<
String
,
Object
>
params
)
{
FormBody
body
=
null
;
if
(
Objects
.
nonNull
(
params
)
&&
!
params
.
isEmpty
())
{
FormBody
.
Builder
builder
=
new
FormBody
.
Builder
();
params
.
forEach
((
lt
,
rt
)
->
{
if
(
Objects
.
nonNull
(
lt
))
{
builder
.
add
(
String
.
valueOf
(
lt
),
Objects
.
isNull
(
rt
)
?
""
:
String
.
valueOf
(
rt
));
}
});
body
=
builder
.
build
();
}
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapPost
(
url
,
headerMap
,
body
),
proxy
)){
return
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
logger
.
error
(
"httpClient 获取数据出现问题:{}"
,
e
);
...
...
src/main/java/com/zhiwei/parse/Baijia.java
View file @
6018f0b3
...
...
@@ -89,20 +89,21 @@ public class Baijia {
while
(
f
)
{
for
(
int
i
=
1
;
i
<
3
;
i
++)
{
try
{
String
url
=
"https://
author.baidu.com/list?type=article&tab=2&uk="
+
uk
+
"&ctime="
+
ctime
+
"&num=50
"
;
String
url
=
"https://
mbd.baidu.com/webpage?tab=article&num=10&uk="
+
uk
+
"&ctime="
+
ctime
+
"&type=newhome&action=dynamic&format=json
"
;
String
result
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
();
Map
<
String
,
Object
>
dMap
=
baijiaAccountAnalysis
.
getBaijiaAccountData3
(
result
,
name
,
startTime
);
List
<
Map
<
String
,
Object
>>
dList
=
(
List
<
Map
<
String
,
Object
>>)
dMap
.
get
(
"data"
);
if
(
Objects
.
nonNull
(
dList
))
dataList
.
addAll
(
dList
);
logger
.
info
(
"{} 数据采集结果 {}"
,
appId
,
dataList
.
size
());
if
(!(
boolean
)
dMap
.
get
(
"more"
))
{
f
=
false
;
}
ctime
=
String
.
valueOf
(
dMap
.
get
(
"ctime"
));
ZhiWeiTools
.
sleep
(
3
000
);
ZhiWeiTools
.
sleep
(
1
000
);
break
;
}
catch
(
Exception
e
)
{
ZhiWeiTools
.
sleep
(
3
000
);
ZhiWeiTools
.
sleep
(
2
000
);
}
}
}
...
...
@@ -111,15 +112,16 @@ public class Baijia {
}
private
static
String
getUkData
(
String
appId
,
ProxyHolder
proxy
,
String
cookie
)
{
String
url
=
"https://author.baidu.com/profile?context={\"from\":0,\"app_id\":\""
+
appId
+
"\"}&cmdType=&pagelets[]=root&reqID=0&ispeed=1#"
;
// String url = "https://author.baidu.com/profile?context={\"from\":0,\"app_id\":\""
// +appId+"\"}&cmdType=&pagelets[]=root&reqID=0&ispeed=1#";
String
url
=
"https://author.baidu.com/home/"
+
appId
;
Map
<
String
,
Object
>
headers
=
new
HashMap
<>();
headers
.
put
(
"Host"
,
"author.baidu.com"
);
headers
.
put
(
"cookie"
,
cookie
);
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
try
{
String
result
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headers
),
proxy
).
body
().
string
();
return
result
.
split
(
"uk\
\\\\":\\\\\""
)[
1
].
split
(
"\\\\
\","
)[
0
];
return
result
.
split
(
"uk\
":\""
)[
1
].
split
(
"
\","
)[
0
];
}
catch
(
Exception
e
)
{
logger
.
error
(
"百家号uk 获取失败"
);
}
...
...
src/main/java/com/zhiwei/parse/BiliBili.java
View file @
6018f0b3
...
...
@@ -4,7 +4,6 @@ import java.io.UnsupportedEncodingException;
import
java.net.Proxy
;
import
java.net.URLEncoder
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.List
;
import
java.util.Map
;
...
...
@@ -13,7 +12,7 @@ import org.slf4j.LoggerFactory;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.
crawler.utils.RequestUtils
;
import
com.zhiwei.
httpclient.HttpClient
;
import
com.zhiwei.parse.analysis.BilibilikeyWordAnalysis
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
...
...
@@ -22,16 +21,16 @@ import okhttp3.Headers;
public
class
BiliBili
{
private
static
final
Logger
logger
=
LoggerFactory
.
getLogger
(
BiliBili
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
useCookieJar
(
true
).
build
();
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
Map
<
String
,
Object
>>
getData
(
String
word
,
Proxy
proxy
,
String
endTime
,
String
cookie
)
{
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<>();
try
{
//
String
url
=
"https://search.bilibili.com/all?keyword="
+
URLEncoder
.
encode
(
word
,
"utf-8"
)+
"&order=pubdate&duration=0&tids_1=0"
;
String
url
=
"https://search.bilibili.com/all?keyword="
+
URLEncoder
.
encode
(
word
,
"utf-8"
)+
"&single_column=1&order=stow&duration=0&tids_1=0"
;
System
.
out
.
println
(
url
);
Headers
header
=
Headers
.
of
(
"cookie"
,
cookie
,
"Referer"
,
"https://www.bilibili.com/"
,
"Host"
,
"search.bilibili.com"
);
String
result
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
header
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
(
);
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
ProxyHolder
.
NAT_HEAVY_PROXY
,
header
);
ZhiWeiTools
.
sleep
(
100
);
Map
<
String
,
Object
>
map
=
BilibilikeyWordAnalysis
.
getData
(
result
,
word
,
endTime
);
boolean
more
=
(
boolean
)
map
.
get
(
"more"
);
...
...
@@ -43,7 +42,7 @@ public class BiliBili {
while
(
more
)
{
map
.
clear
();
String
ur
=
url
+
"&page="
+
n
;
String
result2
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
ur
,
header
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
(
);
String
result2
=
HttpClient
.
executeHttpRequestGet
(
ur
,
ProxyHolder
.
NAT_HEAVY_PROXY
,
header
);
map
=
BilibilikeyWordAnalysis
.
getData
(
result2
,
word
,
endTime
);
List
<
Map
<
String
,
Object
>>
dataList2
=
(
List
<
Map
<
String
,
Object
>>)
map
.
get
(
"data"
);
if
(
dataList2
!=
null
)
{
...
...
@@ -60,7 +59,7 @@ public class BiliBili {
}
catch
(
Exception
e
)
{
logger
.
error
(
"e {}"
,
e
);
}
return
Collections
.
emptyList
()
;
return
bodyList
;
}
...
...
src/main/java/com/zhiwei/parse/Dayu.java
View file @
6018f0b3
...
...
@@ -102,7 +102,7 @@ public class Dayu {
* @param articleId
* @return
*/
public
static
int
getDayuCommentCount
(
String
articleId
,
Proxy
proxy
)
{
public
static
int
getDayuCommentCount
(
String
articleId
,
Proxy
Holder
proxy
)
{
String
url
=
"http://m.uczzd.cn/iflow/api/v2/cmt/article/"
+
articleId
+
"/comments/byhot"
;
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getDayuCommentHeaderMap
(
null
);
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
proxy
,
headerMap
);
...
...
@@ -110,6 +110,39 @@ public class Dayu {
return
json
.
getJSONObject
(
"data"
).
getInteger
(
"comment_cnt"
);
}
/**
 * Fetches the read count of a Dayu article.
 *
 * <p>The article id is taken from the part of {@code url} that follows
 * {@code wm_aid=} (when the URL contains {@code !wm_aid=}) or {@code wm_cid=};
 * the two forms are looked up on different {@code ff.dayu.com} endpoints. The
 * result is the sum of the {@code click1} and {@code click2} counters found
 * under {@code data._incrs} in the JSON answer.
 *
 * @param url   article URL containing {@code !wm_aid=} or {@code wm_cid=}
 * @param proxy proxy holder used for the HTTP request
 * @return {@code click1 + click2}, or {@code -1} when the URL is {@code null},
 *         carries neither marker, the request fails, or the answer lacks the
 *         expected structure
 */
public static int getDayuReadCount(String url, ProxyHolder proxy) {
    if (url == null) {
        return -1;
    }
    try {
        if (url.contains("!wm_aid=")) {
            return fetchDayuClickSum("https://ff.dayu.com/contents/origin/", url.split("wm_aid=")[1], proxy);
        }
        if (url.contains("wm_cid=")) {
            return fetchDayuClickSum("https://ff.dayu.com/contents/", url.split("wm_cid=")[1], proxy);
        }
    } catch (Exception e) {
        // Best effort: malformed URLs or unparsable answers yield -1.
        e.printStackTrace();
    }
    return -1;
}

/**
 * Requests the counters of one article from the given endpoint and sums
 * {@code click1} and {@code click2}; returns {@code -1} if anything is missing.
 */
private static int fetchDayuClickSum(String endpoint, String articleId, ProxyHolder proxy) {
    String eUrl = endpoint + articleId
            + "?biz_id=1002&_fetch_author=1&_incr_fields=click1,click2,click3,click_total,play,like";
    // NOTE(review): the shared header map is built for the comment API; confirm
    // its headers are appropriate for ff.dayu.com as well.
    Map<String, String> headerMap = HeadGet.getDayuCommentHeaderMap(null);
    String result = HttpClient.executeHttpRequestGet(eUrl, proxy, headerMap);
    if (result == null) {
        // executeHttpRequestGet reports a failed request with null.
        return -1;
    }
    JSONObject json = JSONObject.parseObject(result);
    JSONObject data = json == null ? null : json.getJSONObject("data");
    JSONObject incrs = data == null ? null : data.getJSONObject("_incrs");
    if (incrs == null) {
        return -1;
    }
    return incrs.getIntValue("click2") + incrs.getIntValue("click1");
}
/**
*
* @Description 大鱼号依据关键词采集
...
...
src/main/java/com/zhiwei/parse/Douban.java
View file @
6018f0b3
...
...
@@ -38,7 +38,7 @@ public class Douban {
*/
public
static
List
<
Map
<
String
,
Object
>>
doubanTopicGetByWord
(
String
word
,
ProxyHolder
proxy
,
String
cookie
,
String
stime
)
{
int
page
=
0
;
int
count
=
2
0
;
int
count
=
5
0
;
boolean
more
=
true
;
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>();
headerMap
.
put
(
"Host"
,
"www.douban.com"
);
...
...
@@ -77,10 +77,9 @@ public class Douban {
map
.
put
(
"time"
,
time
);
map
.
put
(
"reply_count"
,
replyCount
);
bodyList
.
add
(
map
);
// System.out.println(map.toString());
}
}
if
(
bodyList
.
size
()
-
cou
<
3
0
){
if
(
bodyList
.
size
()
-
cou
<
10
||
page
>
50
0
){
more
=
false
;
}
logger
.
info
(
"采集到 第 {} 页 ,一共采集到 {} 条 是否有下一页 {}"
,
page
,
bodyList
.
size
(),
more
);
...
...
src/main/java/com/zhiwei/parse/Maimai.java
View file @
6018f0b3
...
...
@@ -16,7 +16,9 @@ import org.slf4j.LoggerFactory;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.httpclient.HeadGet
;
...
...
@@ -139,6 +141,44 @@ public class Maimai {
return
Collections
.
emptyMap
();
}
/**
 * Manual smoke test: initialises the proxy factory, fetches the counters of a
 * fixed Maimai feed URL via {@code getMaiaiCount2} and prints the result.
 *
 * @param args unused
 */
public static void main(String[] args) {
    ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER, 10000002L);
    String url = "https://maimai.cn/web/feed_detail?fid=1353566056&efid=QTa45Y1e-oQzyn1dZ5ozlQ";
    Map<String, Object> counters = getMaiaiCount2(url, ProxyHolder.NAT_HEAVY_PROXY);
    System.out.println(counters);
}
/**
 * Reads the like / share / comment counters of a Maimai feed detail page.
 *
 * <p>Example URL:
 * https://maimai.cn/web/feed_detail?fid=1304191535&efid=0CQbJXhoYLXdC87NFIkRMA
 *
 * <p>The page embeds its data as the string argument of a {@code JSON.parse("...")}
 * call; that argument is cut out of the HTML, unicode-unescaped and parsed, and
 * the values are read from {@code data.feed}. The request is attempted twice.
 *
 * @param url   feed detail page URL
 * @param proxy proxy holder used for the HTTP request
 * @return a map with the keys {@code like}, {@code spreads}, {@code cmts},
 *         {@code gid}, {@code title}, {@code author}, {@code userId} and
 *         {@code company}; an empty map when both attempts failed
 */
public static Map<String, Object> getMaiaiCount2(String url, ProxyHolder proxy) {
    int attempt = 1;
    while (attempt < 3) {
        try (Response reply = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)) {
            String html = reply.body().string();
            // Isolate the escaped JSON literal between JSON.parse(" and ");</script>.
            String escaped = html.split("JSON.parse\\(\"")[1].split("\"\\);\\</script\\>")[0];
            String decoded = ZhiWeiTools.decodeUnicode(escaped);
            JSONObject feed = JSONObject.parseObject(decoded).getJSONObject("data").getJSONObject("feed");

            Map<String, Object> counters = new HashMap<>();
            counters.put("like", feed.getJSONObject("likes").getInteger("n"));
            counters.put("spreads", feed.getJSONObject("spreads").getInteger("n"));
            counters.put("cmts", feed.getJSONObject("comments").getInteger("n"));
            counters.put("gid", feed.getLong("id"));

            JSONObject main = feed.getJSONObject("main");
            counters.put("title", main.getString("text"));
            JSONObject user = main.getJSONObject("u");
            counters.put("author", user.getString("name"));
            counters.put("userId", user.getString("mmid"));
            counters.put("company", user.getString("career_str"));
            return counters;
        } catch (Exception failure) {
            logger.error(" 脉脉 转评攒 获取失败 {}", failure);
        }
        attempt++;
    }
    return Collections.emptyMap();
}
/**
* //https://maimai.cn/web/gossip_detail?encode_id=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpZCI6MTk2MzEyNjYsImlhdCI6MTU0ODI5NzI5NX0.N6SPmcf-fyitLNomzY-a8BEY31eseYnvG7RTUQ3jxYY
* @Description 获取脉脉转评赞
...
...
src/main/java/com/zhiwei/parse/QQKB.java
View file @
6018f0b3
...
...
@@ -60,8 +60,8 @@ public class QQKB {
try
{
for
(
int
j
=
1
;
j
<
3
;
j
++)
{
ids
=
ids
.
substring
(
0
,
ids
.
length
()-
1
);
System
.
out
.
println
(
ids
);
ZhiWeiTools
.
sleep
(
7
000
);
logger
.
info
(
"data {}"
,
ids
);
ZhiWeiTools
.
sleep
(
1
000
);
paramMap
.
clear
();
paramMap
=
HeadGet
.
getQQAccountOtherParamMap
(
ids
);
result
=
HttpClient
.
executeHttpRequestPost
(
url
,
proxy
,
headerMap
,
paramMap
);
...
...
@@ -76,7 +76,6 @@ public class QQKB {
}
catch
(
Exception
e
)
{
ids
=
""
;
paramMap
.
clear
();
continue
;
}
}
}
...
...
src/main/java/com/zhiwei/parse/QQKandian.java
View file @
6018f0b3
package
com
.
zhiwei
.
parse
;
import
java.io.IOException
;
import
java.net.Proxy
;
import
java.net.URLEncoder
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.bean.HistortyBean
;
import
com.zhiwei.bean.QQKandianUser
;
import
com.zhiwei.crawler.core.HttpClientBuilder
;
import
com.zhiwei.crawler.core.HttpRequestBuilder
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Headers
;
import
okhttp3.OkHttpClient
;
import
okhttp3.Request
;
/**
 * Crawler for QQ Kandian: account search, account article history and
 * keyword search.
 *
 * <p>All public methods keep the historical contract of returning {@code null}
 * when the request set-up or the first parse step fails.
 *
 * <p>NOTE(review): the request headers contain hard-coded session cookies;
 * they presumably expire and should come from configuration — confirm.
 */
public class QQKandian {

    /** Matches {@code data-timestamp="..."}; group 1 is the attribute value. */
    private static final Pattern TIMESTAMP_PATTERN = Pattern.compile("data-timestamp=\"(.*?)\"");

    /** Matches {@code ata-author="..."} (and thus {@code data-author}); group 1 is the value. */
    private static final Pattern AUTHOR_PATTERN = Pattern.compile("ata-author=\"(.*?)\"");

    /**
     * Searches Kandian accounts and keeps those whose name equals {@code name}.
     *
     * @param name  exact account name to look for
     * @param proxy proxy used for the request
     * @return the matching accounts (possibly empty), or {@code null} when
     *         {@code name} is {@code null}/empty or the request or parsing fails
     */
    public List<QQKandianUser> getUser(String name, Proxy proxy) {
        if (name == null || name.length() == 0) {
            return null;
        }
        List<QQKandianUser> dataList = new ArrayList<>();
        OkHttpClient baseClient = HttpClientBuilder.newInstance();
        Map<String, String> headers = new HashMap<>();
        headers.put("Host", "sou.qq.com");
        headers.put("Referer", "https://sou.qq.com/kandian/kd.html?_bid=3216&_wv=3&_wwv=1293&_wvSb=0&hotword=%E7%9F%A5%E5%90%8D%E5%A4%A7V%E7%AB%A0%E6%96%87%E6%B6%89%E6%80%A7%E4%BE%B5");
        headers.put("Cookie", "skey=MUzU7gdtRz; uin=o0497332654; RK=rNiJH0RBav; pgv_pvid=8990378504; pt2gguin=o0497332654; ptcz=062d936df33011f468637ee72be262a020a8df79977df7e7bde9c105b2b2ddf6");
        try {
            // Example: https://sou.qq.com/cgi-bin/kandian/tab_search?key_word=%E9%98%BF%E9%87%8C&business=64&page_size=20&cookie=&Group_masks=1003&bkn=1215238072
            String url = "https://sou.qq.com/cgi-bin/kandian/tab_search?key_word="
                    + URLEncoder.encode(name, "utf-8")
                    + "&business=64&page_size=20&cookie=&Group_masks=1003&bkn=1215238072";
            Request request = HttpRequestBuilder.newGetRequest(url, Headers.of(headers));
            OkHttpClient client = baseClient.newBuilder().proxy(proxy).build();
            String result = fetchBody(client, request);
            System.out.println(result);
            JSONObject firstGroup = JSONObject.parseObject(result)
                    .getJSONObject("result")
                    .getJSONArray("item_groups")
                    .getJSONObject(0);
            JSONArray items = firstGroup.getJSONArray("result_items");
            for (int i = 0; i < items.size(); i++) {
                JSONObject data = items.getJSONObject(i);
                if (!name.equals(data.getString("name"))) {
                    continue;
                }
                QQKandianUser kandianUser = new QQKandianUser();
                String layoutContent = data.getString("layout_content");
                if (layoutContent != null) {
                    // layout_content is itself a JSON document stored as a string.
                    JSONObject layout = JSONObject.parseObject(layoutContent);
                    kandianUser.setVerify(layout.getBoolean("verify"));
                    kandianUser.setDesc(layout.getString("secondLineText"));
                }
                System.out.println(data.toString());
                kandianUser.setId(data.getString("result_id"));
                kandianUser.setName(data.getString("name"));
                kandianUser.setUrl(data.getString("jmp_url"));
                dataList.add(kandianUser);
            }
            return dataList;
        } catch (Exception e) {
            // Historical contract: any failure is reported as null.
            return null;
        }
    }

    /**
     * Walks the paginated article list of an account and loads every article page.
     *
     * <p>Pagination stops on the first error or on the first page without
     * articles; whatever was collected up to that point is returned.
     *
     * @param uid   account id placed in the {@code uin} query parameter
     * @param proxy proxy used for the list requests
     * @return the collected articles, or {@code null} if the set-up fails
     */
    public List<HistortyBean> getHistoryData(String uid, Proxy proxy) {
        String url = "https://kandian.qq.com/cgi-bin/social/getHomePage?uin=" + uid
                + "&pageNo=1&pageSize=10&pageCookies=&is715=1&isInQQ=1&g_tk=1066845421&bkn=1066845421&_="
                + new Date().getTime();
        List<HistortyBean> dataList = new ArrayList<HistortyBean>();
        OkHttpClient baseClient = HttpClientBuilder.newInstance();
        Map<String, String> headers = new HashMap<String, String>();
        headers.put("Host", "kandian.qq.com");
        headers.put("Referer", "https://kandian.qq.com/mqq/vue/main?_wv=10145&_bid=2378&adfrom=search&x5PreFetch=1&accountId=MjY2MTY0MjM4Ng%3D%3D");
        headers.put("Cookie", "skey=MQmBo5A1N7; uin=o0497332654; pgv_pvid=8990378504");
        try {
            OkHttpClient client = baseClient.newBuilder().proxy(proxy).build();
            while (true) {
                try {
                    Request request = HttpRequestBuilder.newGetRequest(url, Headers.of(headers));
                    String result = fetchBody(client, request);
                    JSONObject json = JSONObject.parseObject(result).getJSONObject("result");
                    JSONArray articles = json.getJSONArray("articleinfos");
                    if (articles == null || articles.isEmpty()) {
                        // No more articles: stop instead of requesting further pages forever.
                        break;
                    }
                    for (int i = 0; i < articles.size(); i++) {
                        String articleUrl = articles.getJSONObject(i).getString("articleurl");
                        HistortyBean history = getOnhistoryData(articleUrl);
                        if (history != null) {
                            dataList.add(history);
                        }
                        ZhiWeiTools.sleep(1500);
                    }
                    // Build the next page URL by bumping pageNo and swapping in the new page cookie.
                    // NOTE(review): queryParameter() returns the decoded value while the URL text is
                    // encoded; the replace may not match for cookies with escaped characters — confirm.
                    String pageCookies = json.getString("pageCookies");
                    String previousCookies = request.url().queryParameter("pageCookies");
                    int pageNo = Integer.parseInt(request.url().queryParameter("pageNo"));
                    url = request.url().toString()
                            .replace("pageNo=" + pageNo, "pageNo=" + (pageNo + 1))
                            .replace("&pageCookies=" + previousCookies, "&pageCookies=" + pageCookies);
                    ZhiWeiTools.sleep(5000);
                } catch (Exception e) {
                    // Best effort: the first failing page ends the crawl with what we have.
                    break;
                }
            }
            return dataList;
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Loads one article page and extracts time, source, title and content.
     *
     * <p>NOTE(review): this request uses a fresh client without the caller's
     * proxy — confirm that is intended.
     *
     * @param url article page URL
     * @return the populated bean, or {@code null} when the page has no
     *         timestamp or author attribute or the request fails with an I/O error
     */
    private static HistortyBean getOnhistoryData(String url) {
        OkHttpClient client = HttpClientBuilder.newInstance();
        Request request = HttpRequestBuilder.newGetRequest(url, Headers.of("Host", "post.mp.qq.com"));
        try {
            String result = fetchBody(client, request);
            Date date = getTime(result);
            String source = getSource(result);
            if (date == null || source == null) {
                return null;
            }
            Document doc = Jsoup.parse(result);
            String content = doc.select("div#main-content").select("section").text();
            String title = doc.select("meta[itemprop=name]").attr("content");
            HistortyBean histortyBean = new HistortyBean();
            histortyBean.setSource(source);
            histortyBean.setTime(date);
            histortyBean.setTitle(title);
            histortyBean.setContent(content);
            histortyBean.setUrl(url);
            return histortyBean;
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Runs a keyword search and collects one article per result group.
     *
     * <p>Groups named 视频, 小视频, 相关搜索 and 话题 are ignored. Paging continues
     * with the {@code cookie} value of each answer until a page adds no new
     * article URL or an error occurs.
     *
     * @param word  search keyword
     * @param proxy proxy used for the requests
     * @return the collected articles, or {@code null} if the set-up fails
     */
    public List<HistortyBean> getDataByword(String word, Proxy proxy) {
        List<HistortyBean> dataList = new ArrayList<HistortyBean>();
        OkHttpClient baseClient = HttpClientBuilder.newInstance();
        Map<String, String> headers = new HashMap<String, String>();
        headers.put("Host", "sou.qq.com");
        headers.put("Referer", "https://sou.qq.com/kandian/kd.html?_bid=3216&_wv=3&_wwv=1293&_wvSb=0&hotword=%E8%BF%AA%E4%B8%BD%E7%83%AD%E5%B7%B4%E9%82%93%E4%BC%A6%E7%94%B5%E6%A2%AF%E5%90%BB");
        headers.put("Cookie", "skey=MU7zbaRXu8; uin=o0497332654; RK=rNiJH0RBav; pgv_pvid=8990378504; pt2gguin=o0497332654; ptcz=062d936df33011f468637ee72be262a020a8df79977df7e7bde9c105b2b2ddf6");
        try {
            // Example: https://sou.qq.com/cgi-bin/kandian/unite_search?key_word=%E5%94%90%E5%AB%A3&business=64&page_size=20&cookie=&bkn=2000031506
            String url = "https://sou.qq.com/cgi-bin/kandian/unite_search?key_word="
                    + URLEncoder.encode(word, "utf-8")
                    + "&business=64&page_size=20&cookie=&bkn=2000031506";
            // The proxied client does not change between pages, so build it once.
            OkHttpClient client = baseClient.newBuilder().proxy(proxy).build();
            List<String> seenUrls = new ArrayList<String>();
            while (true) {
                try {
                    Request request = HttpRequestBuilder.newGetRequest(url, Headers.of(headers));
                    String result = fetchBody(client, request);
                    JSONObject json = JSONObject.parseObject(result).getJSONObject("result");
                    JSONArray groups = json.getJSONArray("item_groups");
                    int seenBefore = seenUrls.size();
                    for (int i = 0; i < groups.size(); i++) {
                        JSONObject group = groups.getJSONObject(i);
                        String type = group.getString("group_name");
                        if ("视频".equals(type) || "小视频".equals(type)
                                || "相关搜索".equals(type) || "话题".equals(type)) {
                            continue;
                        }
                        JSONArray items = group.getJSONArray("result_items");
                        if (items == null || items.isEmpty()) {
                            // An empty group must not abort the whole crawl.
                            continue;
                        }
                        JSONObject item = items.getJSONObject(0);
                        String title = item.getString("name");
                        String jumpUrl = item.getString("jmp_url");
                        // De-duplicate on the URL without its query string.
                        String bareUrl = jumpUrl.split("\\?")[0];
                        if (seenUrls.contains(bareUrl)) {
                            continue;
                        }
                        seenUrls.add(bareUrl);
                        JSONObject extension = JSONObject.parseObject(item.getString("extension"));
                        String time = extension.getString("create_time");
                        String content = extension.getString("content");
                        if (content == null) {
                            content = extension.getString("brief");
                        }
                        String source = extension.getString("from");
                        System.out.println(title + " -- " + bareUrl);
                        HistortyBean histortyBean = new HistortyBean();
                        // "000" is appended before parsing, as for the article page timestamp.
                        histortyBean.setTime(TimeParse.stringFormartDate(time + "000"));
                        histortyBean.setContent(content);
                        histortyBean.setTitle(title);
                        histortyBean.setSource(source);
                        histortyBean.setUrl(jumpUrl);
                        dataList.add(histortyBean);
                    }
                    if (seenBefore == seenUrls.size()) {
                        // The page produced nothing new: end of results.
                        break;
                    }
                    String pageCookies = json.getString("cookie");
                    String previousCookies = request.url().queryParameter("cookie");
                    url = request.url().toString()
                            .replace("&cookie=" + previousCookies, "&cookie=" + pageCookies);
                    ZhiWeiTools.sleep(3000);
                } catch (Exception e) {
                    // Best effort: the first failing page ends the crawl with what we have.
                    break;
                }
            }
            return dataList;
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Executes the request and returns the body text, closing the response
     * explicitly in every case.
     */
    private static String fetchBody(OkHttpClient client, Request request) throws IOException {
        try (okhttp3.Response response = client.newCall(request).execute()) {
            return response.body().string();
        }
    }

    /**
     * Extracts the publication time from the first {@code data-timestamp}
     * attribute of an article page.
     *
     * @param result article page HTML
     * @return the parsed time, or {@code null} when the attribute is absent or empty
     */
    private static Date getTime(String result) {
        Matcher matcher = TIMESTAMP_PATTERN.matcher(result);
        if (!matcher.find()) {
            return null;
        }
        String timestamp = matcher.group(1);
        if (timestamp.isEmpty()) {
            return null;
        }
        return TimeParse.stringFormartDate(timestamp + "000");
    }

    /**
     * Extracts the source (author) from the first {@code data-author}
     * attribute of an article page.
     *
     * @param result article page HTML
     * @return the author value, or {@code null} when the attribute is absent or empty
     */
    private static String getSource(String result) {
        Matcher matcher = AUTHOR_PATTERN.matcher(result);
        if (!matcher.find()) {
            return null;
        }
        String author = matcher.group(1);
        return author.isEmpty() ? null : author;
    }
}
//package com.zhiwei.parse;
//
//import java.io.IOException;
//import java.net.Proxy;
//import java.net.URLEncoder;
//import java.util.ArrayList;
//import java.util.Date;
//import java.util.HashMap;
//import java.util.List;
//import java.util.Map;
//import java.util.regex.Matcher;
//import java.util.regex.Pattern;
//
//import org.jsoup.Jsoup;
//import org.jsoup.nodes.Document;
//
//import com.alibaba.fastjson.JSONArray;
//import com.alibaba.fastjson.JSONObject;
//import com.zhiwei.bean.HistortyBean;
//import com.zhiwei.bean.QQKandianUser;
//import com.zhiwei.crawler.core.HttpClientBuilder;
//import com.zhiwei.crawler.core.HttpRequestBuilder;
//import com.zhiwei.tools.timeparse.TimeParse;
//import com.zhiwei.tools.tools.ZhiWeiTools;
//
//import okhttp3.Headers;
//import okhttp3.OkHttpClient;
//import okhttp3.Request;
//
//public class QQKandian {
//
// public List<QQKandianUser> getUser(String name,Proxy proxy) {
// if(name != null && name.length() > 0) {
// List<QQKandianUser> dataList = new ArrayList<>();
// OkHttpClient okhttp = HttpClientBuilder.newInstance();
// Map<String,String> map = new HashMap<>();
// map.put("Host", "sou.qq.com");
// map.put("Referer", "https://sou.qq.com/kandian/kd.html?_bid=3216&_wv=3&_wwv=1293&_wvSb=0&hotword=%E7%9F%A5%E5%90%8D%E5%A4%A7V%E7%AB%A0%E6%96%87%E6%B6%89%E6%80%A7%E4%BE%B5");
// map.put("Cookie", "skey=MUzU7gdtRz; uin=o0497332654; RK=rNiJH0RBav; pgv_pvid=8990378504; pt2gguin=o0497332654; ptcz=062d936df33011f468637ee72be262a020a8df79977df7e7bde9c105b2b2ddf6");
// try {
// //https://sou.qq.com/cgi-bin/kandian/tab_search?key_word=%E9%98%BF%E9%87%8C&business=64&page_size=20&cookie=&Group_masks=1003&bkn=1215238072
// String url = "https://sou.qq.com/cgi-bin/kandian/tab_search?key_word="+URLEncoder.encode(name, "utf-8")+"&business=64&page_size=20&cookie=&Group_masks=1003&bkn=1215238072";
// Request request = HttpRequestBuilder.newGetRequest(url, Headers.of(map));
// okhttp = okhttp.newBuilder().proxy(proxy).build();
// String result = okhttp.newCall(request).execute().body().string();
// System.out.println(result);
// JSONObject json = JSONObject.parseObject(result);
// JSONObject json2 = json.getJSONObject("result").getJSONArray("item_groups").getJSONObject(0);
// JSONArray jsonArray = json2.getJSONArray("result_items");
// for(int i = 0;i < jsonArray.size();i++) {
// JSONObject data = jsonArray.getJSONObject(i);
// if(name.equals(data.getString("name"))) {
// QQKandianUser kandianUser = new QQKandianUser();
// if(data.getString("layout_content")!=null) {
// String m = data.getString("layout_content");
// JSONObject m1 = JSONObject.parseObject(m);
// kandianUser.setVerify(m1.getBoolean("verify"));
// kandianUser.setDesc(m1.getString("secondLineText"));
// }
// String nam = data.getString("name");
// String ur = data.getString("jmp_url");
// String id = data.getString("result_id");
// System.out.println(data.toString());
// kandianUser.setId(id);
// kandianUser.setName(nam);
// kandianUser.setUrl(ur);
// dataList.add(kandianUser);
// }
// }
// return dataList;
// } catch (Exception e) {
// return null;
// }
// }
// return null;
// }
//
// public List<HistortyBean> getHistoryData(String uid,Proxy proxy) {
// String url = "https://kandian.qq.com/cgi-bin/social/getHomePage?uin="+uid+"&pageNo=1&pageSize=10&pageCookies=&is715=1&isInQQ=1&g_tk=1066845421&bkn=1066845421&_="+new Date().getTime();
// List<HistortyBean> dataList = new ArrayList<HistortyBean>();
// OkHttpClient okhttp = HttpClientBuilder.newInstance();
// Map<String,String> map = new HashMap<String,String>();
// map.put("Host", "kandian.qq.com");
// map.put("Referer", "https://kandian.qq.com/mqq/vue/main?_wv=10145&_bid=2378&adfrom=search&x5PreFetch=1&accountId=MjY2MTY0MjM4Ng%3D%3D");
// map.put("Cookie", "skey=MQmBo5A1N7; uin=o0497332654; pgv_pvid=8990378504");
// try {
// okhttp = okhttp.newBuilder().proxy(proxy).build();
// while(true) {
// try {
// Request request = HttpRequestBuilder.newGetRequest(url, Headers.of(map));
// String result = okhttp.newCall(request).execute().body().string();
// JSONObject json = JSONObject.parseObject(result).getJSONObject("result");
// JSONArray jsonArray = json.getJSONArray("articleinfos");
// for(int i = 0;i < jsonArray.size();i++) {
// JSONObject data = jsonArray.getJSONObject(i);
// String ur = data.getString("articleurl");
// HistortyBean history = getOnhistoryData(ur);
// if(history != null) {
// dataList.add(history);
// }
// ZhiWeiTools.sleep(1500);
// }
// String pageCookies = json.getString("pageCookies");
// String pacs = request.url().queryParameter("pageCookies");
// int pageno = Integer.valueOf(request.url().queryParameter("pageNo"));
// url = request.url().toString().replace("pageNo="+pageno, "pageNo="+(pageno+1)).replace("&pageCookies="+pacs, "&pageCookies="+pageCookies);
// ZhiWeiTools.sleep(5000);
// } catch (Exception e) {
// break;
// }
// }
// return dataList;
// } catch (Exception e) {
// e.printStackTrace();
// }
//
// return null;
// }
//
//
// private static HistortyBean getOnhistoryData(String url) {
// OkHttpClient okhttp = HttpClientBuilder.newInstance();
// Request request = HttpRequestBuilder.newGetRequest(url, Headers.of("Host","post.mp.qq.com"));
// try {
// HistortyBean histortyBean = new HistortyBean();
// String result = okhttp.newCall(request).execute().body().string();
// Date date = getTime(result);
// String source = getSource(result);
// if(date != null && source != null) {
//
// Document doc = Jsoup.parse(result);
// String content = doc.select("div#main-content").select("section").text();
// String title = doc.select("meta[itemprop=name]").attr("content");
//
// histortyBean.setSource(source);
// histortyBean.setTime(date);
// histortyBean.setTitle(title);
// histortyBean.setContent(content);
// histortyBean.setUrl(url);
// return histortyBean;
// }
// } catch (IOException e) {
// // TODO Auto-generated catch block
// e.printStackTrace();
// }
// return null;
// }
//
// public List<HistortyBean> getDataByword(String word,Proxy proxy) {
// List<HistortyBean> dataList = new ArrayList<HistortyBean>();
// OkHttpClient okhttp = HttpClientBuilder.newInstance();
// Map<String,String> map = new HashMap<String,String>();
// map.put("Host", "sou.qq.com");
// map.put("Referer", "https://sou.qq.com/kandian/kd.html?_bid=3216&_wv=3&_wwv=1293&_wvSb=0&hotword=%E8%BF%AA%E4%B8%BD%E7%83%AD%E5%B7%B4%E9%82%93%E4%BC%A6%E7%94%B5%E6%A2%AF%E5%90%BB");
// map.put("Cookie", "skey=MU7zbaRXu8; uin=o0497332654; RK=rNiJH0RBav; pgv_pvid=8990378504; pt2gguin=o0497332654; ptcz=062d936df33011f468637ee72be262a020a8df79977df7e7bde9c105b2b2ddf6");
// try {
// String url = "https://sou.qq.com/cgi-bin/kandian/unite_search?key_word="+URLEncoder.encode(word, "utf-8")+"&business=64&page_size=20&cookie=&bkn=2000031506";
// //https://sou.qq.com/cgi-bin/kandian/unite_search?key_word=%E5%94%90%E5%AB%A3&business=64&page_size=20&cookie=&bkn=2000031506
// List<String> urlList = new ArrayList<String>();
// int count = 0;
// while(true) {
// try {
// okhttp = okhttp.newBuilder().proxy(proxy).build();
// Request request = HttpRequestBuilder.newGetRequest(url, Headers.of(map));
// String result = okhttp.newCall(request).execute().body().string();
// JSONObject json = JSONObject.parseObject(result).getJSONObject("result");
// JSONArray jsonArray = json.getJSONArray("item_groups");
// count = urlList.size();
// for(int i = 0;i < jsonArray.size();i++) {
// JSONObject data = jsonArray.getJSONObject(i);
// String type = data.getString("group_name");
// if("视频".equals(type) || "小视频".equals(type) || "相关搜索".equals(type) || "话题".equals(type)) {
//
// }else {
// JSONObject da = data.getJSONArray("result_items").getJSONObject(0);
// String title = da.getString("name");
// String ur = da.getString("jmp_url");
// if(!urlList.contains(ur.split("\\?")[0])) {
// urlList.add(ur.split("\\?")[0]);
// String extension = da.getString("extension");
// JSONObject obj = JSONObject.parseObject(extension);
// String time = obj.getString("create_time");
// String content = obj.getString("content");
// if(content == null) {
// content = obj.getString("brief");
// }
// String source = obj.getString("from");
// HistortyBean histortyBean = new HistortyBean();
// System.out.println(title + " -- " + ur.split("\\?")[0]);
// histortyBean.setTime(TimeParse.stringFormartDate(time+"000"));
// histortyBean.setContent(content);
// histortyBean.setTitle(title);
// histortyBean.setSource(source);
// histortyBean.setUrl(ur);
// dataList.add(histortyBean);
// }
// }
// }
// if(count == urlList.size()) {
// break;
// }
// String pageCookies = json.getString("cookie");
// String pacs = request.url().queryParameter("cookie");
// url = request.url().toString().replace("&cookie="+pacs, "&cookie="+pageCookies);
// ZhiWeiTools.sleep(3000);
// } catch (Exception e) {
// break;
// }
// }
// return dataList;
// } catch (Exception e) {
// e.printStackTrace();
// }
//
// return null;
// }
//
// /**
// *
// * @Description 获取时间
// * @param result
// * @return
// */
// private static Date getTime(String result) {
// Pattern pa = Pattern.compile("data-timestamp=\"(.*?)\"");
// Matcher ma = pa.matcher(result);
// while(ma.find()) {
// String t = ma.group(0);
// t = t.split("ata-timestamp=\"")[1].split("\"")[0];
// return TimeParse.stringFormartDate(t+"000");
// }
// return null;
// }
//
// /**
// *
// * @Description 获取来源
// * @param result
// * @return
// */
// private static String getSource(String result) {
// Pattern pa = Pattern.compile("ata-author=\"(.*?)\"");
// Matcher ma = pa.matcher(result);
// while(ma.find()) {
// String t = ma.group(0);
// t = t.split("ata-author=\"")[1].split("\"")[0];
// return t;
// }
// return null;
// }
//
//}
src/main/java/com/zhiwei/parse/SinaKeji.java
View file @
6018f0b3
...
...
@@ -34,15 +34,15 @@ public class SinaKeji {
* @return
*/
public
static
List
<
Map
<
String
,
Object
>>
getSinaKejiComment
(
String
url
,
ProxyHolder
proxy
)
{
String
com
mentId
=
getCommentId
(
url
,
proxy
);
if
(
nonNull
(
com
mentId
))
{
String
com
Url
=
getCommentId
(
url
,
proxy
);
if
(
nonNull
(
com
Url
))
{
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
int
page
=
1
;
int
count
=
1
;
while
(
true
)
{
try
{
ZhiWeiTools
.
sleep
(
3
000
);
String
newUrl
=
"http://comment.sina.com.cn/page/info?version=1&format=json&channel=kj&newsid="
+
commentId
+
"&compress=0&ie=utf-8&oe=utf-8&page_size=20&t_size=3&h_size=3&thread=1&page="
+
page
;
ZhiWeiTools
.
sleep
(
1
000
);
String
newUrl
=
comUrl
+
page
;
String
result
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
newUrl
),
proxy
).
body
().
string
();
List
<
Map
<
String
,
Object
>>
list
=
sinaKejiCommentAnalysis
.
getSinaCommet
(
result
);
dataList
.
addAll
(
list
);
...
...
@@ -63,16 +63,27 @@ public class SinaKeji {
return
Collections
.
emptyList
();
}
/**
** 获取新浪评论链接
* @param url
* @param proxy
* @return
* @return String
*/
private
static
String
getCommentId
(
String
url
,
ProxyHolder
proxy
)
{
String
commentId
=
null
;
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
try
{
String
result
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
proxy
).
body
().
string
();
//list?channel=
if
(
result
.
contains
(
"newsid:"
))
{
commentId
=
result
.
split
(
"newsid: '"
)[
1
].
split
(
"'"
)[
0
];
if
(
nonNull
(
commentId
))
{
return
commentId
;
String
commentId
=
result
.
split
(
"newsid: '"
)[
1
].
split
(
"'"
)[
0
];
String
channel
=
result
.
split
(
"channel: '"
)[
1
].
split
(
"'"
)[
0
];
if
(
nonNull
(
commentId
)
&&
nonNull
(
channel
))
{
return
"http://comment.sina.com.cn/page/info?version=1&format=json&channel="
+
channel
+
"&newsid="
+
commentId
+
"&compress=0&ie=utf-8&oe=utf-8&page_size=20&t_size=3&h_size=3&thread=1&page="
;
}
}
else
if
(
result
.
contains
(
"__cmntId"
)){
String
key
=
result
.
split
(
"__cmntId\":\""
)[
1
].
split
(
"\","
)[
0
];
return
"http://comment.sina.com.cn/page/info?version=1&format=json&channel="
+
key
.
split
(
":"
)[
0
]+
"&newsid="
+
key
.
split
(
":"
)[
1
]+
"&compress=0&ie=utf-8&oe=utf-8&page_size=20&t_size=3&h_size=3&thread=1&page="
;
}
}
catch
(
IOException
e
)
{
logger
.
error
(
"获取 文章评论 id 失败"
);
...
...
src/main/java/com/zhiwei/parse/SinaTousu.java
View file @
6018f0b3
...
...
@@ -29,7 +29,7 @@ public class SinaTousu {
int
count
=
1
;
while
(
true
)
{
try
{
if
(
count
>
3
)
{
if
(
count
>
3
||
page
>
200
)
{
break
;
}
String
url
=
"https://tousu.sina.com.cn/api/index/s?keywords="
+
URLEncoder
.
encode
(
word
,
"utf-8"
)+
"&page_size=100&page="
;
...
...
src/main/java/com/zhiwei/parse/Souhu.java
View file @
6018f0b3
package
com
.
zhiwei
.
parse
;
import
static
java
.
util
.
Objects
.
nonNull
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.List
;
...
...
@@ -12,7 +10,6 @@ import org.slf4j.LoggerFactory;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HttpClient
;
...
...
@@ -35,10 +32,7 @@ public class Souhu {
*/
public
static
int
getSouhuCommentCount
(
String
url
,
ProxyHolder
proxy
)
{
try
{
String
newurl
=
souhuCommentAnalysis
.
getSouhuURL
(
url
,
proxy
);
if
(
nonNull
(
newurl
))
{
return
souhuCommentAnalysis
.
getSouhuCommentCount
(
newurl
,
proxy
);
}
return
souhuCommentAnalysis
.
getSouhuCommentCount
(
url
,
proxy
);
}
catch
(
Exception
e
)
{
logger
.
error
(
"搜狐获取评论数出错了 {}"
,
e
);
}
...
...
@@ -83,7 +77,7 @@ public class Souhu {
if
(
isCulling
)
{
url
=
url
+
"&columnId=-1"
;
}
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
proxy
,
null
);
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
proxy
);
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
JSONArray
jsonArray
=
json
.
getJSONObject
(
"data"
).
getJSONArray
(
"pcArticleVOS"
);
List
<
Map
<
String
,
Object
>>
dataList1
=
souhuAccountAnalysis
.
analysisData
(
jsonArray
,
name
);
...
...
src/main/java/com/zhiwei/parse/TXNews.java
View file @
6018f0b3
...
...
@@ -21,6 +21,8 @@ import com.zhiwei.parse.analysis.TXNewsByWordAnalysis;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.MediaType
;
import
okhttp3.RequestBody
;
import
okhttp3.Response
;
public
class
TXNews
{
...
...
@@ -71,7 +73,7 @@ public class TXNews {
String
content
=
StringUtils
.
join
(
"coral_uin="
,
coralUin
,
"&coral_uid="
,
coralUid
,
"&reply_id="
,
replayId
);
//eca55388bbbb596e632bca03a2378efe94b83142fd046f1f70 876579532
System
.
out
.
println
(
content
);
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapPost
(
"https://r.inews.qq.com/getMyComments"
,
"application/json"
,
content
),
proxy
)){
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapPost
(
"https://r.inews.qq.com/getMyComments"
,
RequestBody
.
create
(
MediaType
.
get
(
"application/json"
),
content
)
),
proxy
)){
JSONObject
json
=
JSONObject
.
parseObject
(
response
.
body
().
string
());
JSONArray
jsonArray
=
json
.
getJSONObject
(
"comments"
).
getJSONArray
(
"new"
);
for
(
int
i
=
0
;
i
<
jsonArray
.
size
();
i
++)
{
...
...
src/main/java/com/zhiwei/parse/Wangyi.java
View file @
6018f0b3
...
...
@@ -2,6 +2,7 @@ package com.zhiwei.parse;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
...
...
@@ -27,7 +28,7 @@ public class Wangyi {
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
Wangyi
.
class
);
private
static
WangyiCommentAnalysis
wangyiCommentAnalysis
=
new
WangyiCommentAnalysis
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
throwException
(
false
).
build
();
private
static
WangyiHistoryAnalysis
wangyiHistoryAnalysis
=
new
WangyiHistoryAnalysis
();
/**
...
...
@@ -81,6 +82,21 @@ public class Wangyi {
}
}
/**
 * Downloads a page and scrapes its comment count and like count from the markup.
 *
 * <p>The comment count is the text between the first {@code data-count="} and the following
 * {@code " data-hidead}; the like count is the text between the first {@code data-like="} and
 * the following {@code "><}. Both are stored as the raw strings found in the page.
 *
 * @param url   address of the page to request
 * @param proxy proxy passed through to {@code httpBoot.syncCall}
 * @return a map with the keys {@code "commentCount"} and {@code "likes"}, or an empty map when
 *         the request or the extraction fails
 */
public static Map<String, Object> getReadAndLikeCount(String url, ProxyHolder proxy) {
    try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)) {
        String result = response.body().string();
        String cCount = result.split("data-count=\"")[1].split("\" data-hidead")[0];
        String lCount = result.split("data-like=\"")[1].split("\"><")[0];
        Map<String, Object> rMap = new HashMap<>();
        rMap.put("commentCount", cCount);
        rMap.put("likes", lCount);
        return rMap;
    } catch (Exception e) {
        // A missing marker surfaces here as an index error; report it through the class
        // logger (with the URL) instead of printing a bare stack trace to stderr.
        logger.error("Failed to read comment/like counts from {}", url, e);
    }
    return Collections.emptyMap();
}
/**
*
* @Description 网易网页版数据
...
...
src/main/java/com/zhiwei/parse/analysis/AiqiyiByWordAnalysis.java
View file @
6018f0b3
...
...
@@ -22,15 +22,13 @@ public class AiqiyiByWordAnalysis {
List
<
Map
<
String
,
Object
>>
dataMap
=
new
ArrayList
<>();
try
{
Document
doc
=
Jsoup
.
parse
(
result
);
Elements
elements
=
doc
.
select
(
"
li.list_item
"
);
Elements
elements
=
doc
.
select
(
"
div.layout-main > div
"
);
for
(
Element
element
:
elements
)
{
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
String
title
=
element
.
select
(
"li"
).
attr
(
"data-widget-searchlist-tvname"
);
String
time
=
element
.
select
(
"em.result_info_desc"
).
text
().
split
(
" "
)[
0
];
if
(
element
.
select
(
"label.result_info_lbl"
).
text
().
contains
(
"上传者"
))
{
map
.
put
(
"source"
,
element
.
select
(
"a.result_info_link"
).
text
());
}
String
uurl
=
element
.
select
(
"h3.result_title > a"
).
attr
(
"href"
);
String
title
=
element
.
select
(
"a.main-tit"
).
attr
(
"title"
);
String
time
=
element
.
select
(
"span.info-des"
).
text
().
split
(
" "
)[
0
];
String
uurl
=
element
.
select
(
"a.main-tit"
).
attr
(
"href"
);
map
.
put
(
"source"
,
element
.
select
(
"a.uploader-name"
).
text
());
map
.
put
(
"time"
,
TimeParse
.
stringFormartDate
(
time
));
map
.
put
(
"url"
,
uurl
);
map
.
put
(
"title"
,
title
);
...
...
src/main/java/com/zhiwei/parse/analysis/BaijiaAccountAnalysis.java
View file @
6018f0b3
...
...
@@ -58,25 +58,23 @@ public class BaijiaAccountAnalysis {
try
{
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
JSONArray
jsonArry
=
json
.
getJSONObject
(
"data"
).
getJSONArray
(
"list"
);
if
(
json
.
getJSONObject
(
"data"
).
getBoolean
(
"has
_m
ore"
)
!=
null
&&
json
.
getJSONObject
(
"data"
).
getBoolean
(
"has
_m
ore"
)
)
{
if
(
json
.
getJSONObject
(
"data"
).
getBoolean
(
"has
M
ore"
)
!=
null
&&
json
.
getJSONObject
(
"data"
).
getBoolean
(
"has
M
ore"
)
)
{
more
=
true
;
rmap
.
put
(
"ctime"
,
json
.
getJSONObject
(
"data"
).
getString
(
"ctime"
));
rmap
.
put
(
"ctime"
,
json
.
getJSONObject
(
"data"
).
get
JSONObject
(
"query"
).
get
String
(
"ctime"
));
}
// String name = json.getJSONObject("data").getJSONObject("author").getString("display_name");
for
(
int
i
=
0
;
i
<
jsonArry
.
size
();
i
++)
{
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
JSONObject
data
=
jsonArry
.
getJSONObject
(
i
);
JSONObject
data
=
jsonArry
.
getJSONObject
(
i
)
.
getJSONObject
(
"itemData"
)
;
String
id
=
data
.
getString
(
"article_id"
);
int
t
=
data
.
getInteger
(
"updated_at"
);
String
time
=
TimeParse
.
dateFormartString
(
new
Date
(
t
*
1000L
),
"yyyy-MM-dd HH:mm:ss"
);
System
.
out
.
println
(
time
);
if
(
startTime
!=
null
&&
startTime
.
length
()
>
1
)
{
if
(
time
.
compareTo
(
startTime
)
<
1
)
{
if
(
startTime
!=
null
&&
startTime
.
length
()
>
1
&&
time
.
compareTo
(
startTime
)
<
1
)
{
more
=
false
;
continue
;
}
}
map
.
put
(
"title"
,
data
.
getString
(
"title"
));
String
url
=
"http://baijiahao.baidu.com/s?id="
+
id
;
map
.
put
(
"content"
,
ZhiWeiTools
.
delHTMLTag
(
getContent3
(
data
)));
...
...
src/main/java/com/zhiwei/parse/analysis/BilibilikeyWordAnalysis.java
View file @
6018f0b3
...
...
@@ -18,7 +18,7 @@ public class BilibilikeyWordAnalysis {
try
{
Document
doc
=
Jsoup
.
parse
(
result
);
boolean
more
=
false
;
if
(
doc
.
select
(
"#
server-search-app > div.contain > div.body-contain > div
> div.page-wrap > div > ul > li.page-item.next > button"
).
text
().
contains
(
"下一页"
))
{
if
(
doc
.
select
(
"#
all-list > div.flow-loader
> div.page-wrap > div > ul > li.page-item.next > button"
).
text
().
contains
(
"下一页"
))
{
more
=
true
;
}
...
...
@@ -28,7 +28,7 @@ public class BilibilikeyWordAnalysis {
String
playcount
=
null
;
String
source
=
null
;
String
submitcount
=
null
;
Elements
elements
=
doc
.
select
(
"ul.video-
contain
.clearfix"
).
select
(
"li"
);
Elements
elements
=
doc
.
select
(
"ul.video-
list
.clearfix"
).
select
(
"li"
);
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
for
(
Element
element
:
elements
)
{
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
...
...
src/main/java/com/zhiwei/parse/analysis/SouhuCommentAnalysis.java
View file @
6018f0b3
...
...
@@ -77,9 +77,10 @@ public class SouhuCommentAnalysis {
public
int
getSouhuCommentCount
(
String
url
,
ProxyHolder
proxy
)
{
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getSouhuCommentHeaderMap
(
null
);
try
{
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
proxy
,
headerMap
);
String
id
=
getUrlId
(
url
);
String
result
=
HttpClient
.
executeHttpRequestGet
(
"https://apiv2.sohu.com/api/comment/list?callback=&page_size=10&topic_id=&page_no=1&source_id=mp_"
+
id
,
proxy
,
headerMap
);
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
if
(
json
.
getInteger
(
"code"
)
==
500
)
{
if
(
Objects
.
nonNull
(
json
.
get
(
"code"
))
&&
json
.
getInteger
(
"code"
)
==
500
)
{
return
0
;
}
return
json
.
getJSONObject
(
"jsonObject"
).
getInteger
(
"cmt_sum"
);
...
...
@@ -116,21 +117,28 @@ public class SouhuCommentAnalysis {
}
public
int
getReadNum
(
String
url
,
ProxyHolder
proxy
)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
proxy
)){
String
result
=
response
.
body
().
string
();
String
sourceId
=
getNewsId
(
result
);
url
=
"http://v2.sohu.com/public-api/articles/pv?articleIds="
+
sourceId
;
result
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
proxy
).
body
().
string
();
return
JSONObject
.
parseObject
(
result
).
getInteger
(
sourceId
);
String
id
=
getUrlId
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
"http://v2.sohu.com/public-api/articles/"
+
id
+
"/pv"
),
proxy
)){
return
Integer
.
parseInt
(
response
.
body
().
string
());
}
catch
(
Exception
e
)
{
logger
.
error
(
"Exception {} "
,
e
);
}
return
-
1
;
}
/**
 * Extracts the Sohu article id from an article link.
 *
 * <p>The id is the portion of the link that follows the first {@code /a/} separator, cut at the
 * first underscore.
 *
 * @param url article link to parse
 * @return the extracted id, or {@code null} (after logging an error) when the link is
 *         {@code null} or does not contain the expected parts
 */
private String getUrlId(String url) {
    if (url != null) {
        String[] aroundMarker = url.split("/a/");
        if (aroundMarker.length > 1) {
            String[] idAndRest = aroundMarker[1].split("_");
            if (idAndRest.length > 0) {
                return idAndRest[0];
            }
        }
    }
    logger.error("搜狐链接解析失败");
    return null;
}
}
src/test/java/com/zhiwei/Comment/MaimaiCommentCountTest.java
View file @
6018f0b3
...
...
@@ -27,9 +27,9 @@
//// List<String> headList = (List<String>) map.get("head");
//// for (Map<String, Object> map1 : list) {
//// String url = map1.get("地址") + "";
// String cookie = "
_buuid=0668b664-13b3-4bd0-aa37-99d747432e85; guid=HBoEGxgEGBscBBsZGlYHGBsZHRsYExwZHFYcGQQdGR8FQ1hLTEt5ChITBBIdHxkEGgQbHQVPR0VYQmkKA0VBSU9tCk9BQ0YKBmZnfmJhAgocGQQdGR8FXkNhSE99T0ZaWmsKAx4cfWV9ChEZBBwKfmQKWV1FTkRDfQIKGgQfBUtGRkNQRWc=; token=\"ou+mv1hjxjm0uOOTss1vgck9+h6OCS/lYQUeFnJgSK70FHprmw6GmjBGwk2qPQH88CKuzcDfAvoCmBm7+jVysA==\"; uid=\"A8ELjewCDRgHnZ5bX0Vy0/Airs3A3wL6ApgZu/o1crA=\"; session=eyJ1IjoiMjI3NjU0NTI0Iiwic2VjcmV0IjoiV0wyZmEtZDZxbkx2TEkzZHF2dTN4UG5SIiwiX2V4cGlyZSI6MTU2MDU5Mzg4Mjc5NCwiX21heEFnZSI6ODY0MDAwMDB9; session.sig=ujhqvC3wPAn-WlCPXfB6C5ZJIgY
";
// String cookie = "";
// String url = "https://maimai.cn/web/gossip_detail?src=app&webid=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJlZ2lkIjoiMjU5OTg4YmE4YzI3MTFlOTllMjEyNDZlOTZiNDgwODgiLCJ1IjoxNzY2NDQ3NjUsImlkIjoyMjIwMDM4NH0.gBdp3i99L0xUKLglaVIJf07OrrPk6yoG9HAo-3Yftqk";
//
Map<String,Object> map3 = Maimai.getMaiaiCount(url,
cookie, ProxyHolder.NAT_HEAVY_PROXY);
//
List<Map<String, Object>> map3 = Maimai.getMaimaiCommentList(url,
cookie, ProxyHolder.NAT_HEAVY_PROXY);
// System.out.println(map3.toString());
// System.out.println(url);
//// map1.putAll(map3);
...
...
src/test/java/com/zhiwei/Comment/SinaKejiComment.java
View file @
6018f0b3
//package com.zhiwei.Comment;
//
//import org.
testng.annotations
.Test;
//import org.
junit
.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.crawler.proxy.ProxyHolder;
//import com.zhiwei.parse.SinaKeji;
//
//public class SinaKejiComment {
//
// @Test
// public void f() {
// String url = "https://tech.sina.com.cn/it/2018-07-17/doc-ihfkffam0632649.shtml";
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
// String url = "https://tech.sina.com.cn/d/v/2019-08-19/doc-ihytcitn0207512.shtml";
//
// SinaKeji.getSinaKejiComment(url,
null
);
// SinaKeji.getSinaKejiComment(url,
ProxyHolder.NAT_HEAVY_PROXY
);
//
// }
//
//}
src/test/java/com/zhiwei/crawler/QQAccountExample.java
View file @
6018f0b3
...
...
@@ -25,7 +25,7 @@
//// System.out.println(child.split("chlid=")[1]);
// System.out.println(child.split("=")[1]);
//
// List<Map<String,Object>> lists = QQKB.getQQAccountData("5
456950
", cookie,null);
// List<Map<String,Object>> lists = QQKB.getQQAccountData("5
060059
", cookie,null);
// if(lists != null) {
// for(Map<String,Object> map1 : lists) {
// map1.put("name", map.get("呢称"));
...
...
src/test/java/com/zhiwei/hsitory/BaijiaAccountExample.java
View file @
6018f0b3
...
...
@@ -6,14 +6,17 @@
//
//import org.junit.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.crawler.proxy.ProxyHolder;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Baijia;
//import com.zhiwei.tools.tools.ZhiWeiTools;
//
//public class BaijiaAccountExample {
//
// @Test
// public void test3() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
// String path = "D://crawlerdata//自媒体/百家号采集.xlsx";
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// String startTime = "2018-05-01 00:00:00";
...
...
@@ -23,9 +26,10 @@
// for(Map<String,Object> m : list) {
// try {
// String app_id = m.get("id").toString();
// app_id = "1594158489045754";
// String cookie = "__cfduid=d847baca85b97d1967b3da02ebb345b831535524251; BAIDUID=C0F0F81EF770C5219AB9C178654135EC:FG=1; PSTM=1536376257; BIDUPSID=250CCE0442BEBCB3568D8EC515953434; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; locale=zh; delPer=0; H_PS_PSSID=1447_21117_20930; PSINO=5";
// List<Map<String,Object>> lists = Baijia.getBaijiaAccountByBaiduData(app_id, startTime,cookie, null);
// app_id = "1602674438508810";
// String cookie = "BAIDUID=7D453C932433A93F7AD1F3B8ABC8B0E1:FG=1; BIDUPSID=7D453C932433A93F7AD1F3B8ABC8B0E1; PSTM=1570766401; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BDSFRCVID=eH-OJeCmH6VwoRJwCdmehrB7leKK0gOTHllvCh8hmwLadLIVJeC6EG0Ptf8g0KubFTPRogKK0gOTH6KF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF=tJkD_I_hJKt3qn7I5KToh4Athxob2bbXHDo-LIvHWT6cOR5JhfA-3R-e046f3-3L5CbH5D3s5lvvhb3O3M7ShbKdMa732RbrKCnraxQF5l8-sq0x0bOte-bQypoa0q3TLDOMahkM5h7xOKQoQlPK5JkgMx6MqpQJQeQ-5KQN3KJmfbL9bT3YjjISKx-_J5LJJxK; H_PS_PSSID=1442_21103_29567_29699_29220_22158; delPer=0; PSINO=5; ZD_ENTRY=baidu; yjs_js_security_passport=9687699d4b0965c0be1e6e312fc59ff5cf3d03a2_1571106914_js; Hmery-Time=1215393878";
// System.out.println(app_id);
// List<Map<String,Object>> lists = Baijia.getBaijiaAccountByBaiduData(app_id,"聚富财经", startTime,cookie, ProxyHolder.NAT_HEAVY_PROXY);
// if(lists != null) {
// bodyList.addAll(lists);
// }
...
...
src/test/java/com/zhiwei/shipin/AiqiyiTest.java
View file @
6018f0b3
...
...
@@ -17,7 +17,7 @@
// @Test
// public void aiqiyiTest() {
//
// ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
// ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER
,10000002
);
// List<String> wordList = WordReadFile.getWords("D://crawlerdata//关键词.txt");
// List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
// for(String w : wordList) {
...
...
@@ -34,7 +34,7 @@
// headList.add("title");
// headList.add("word");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("D://crawlerdata//视频/爱奇艺关键词采集-
毓婷-0716
.xlsx", "数据", headList, bodyList);
// poi.exportExcel("D://crawlerdata//视频/爱奇艺关键词采集-
精装修
.xlsx", "数据", headList, bodyList);
//
//
//
...
...
src/test/java/com/zhiwei/shipin/BilibiliTest.java
View file @
6018f0b3
//
package com.zhiwei.shipin;
//
//
import java.util.ArrayList;
//
import java.util.List;
//
import java.util.Map;
//
//
import org.junit.Test;
//
//
import com.zhiwei.common.config.GroupType;
//
import com.zhiwei.crawler.proxy.ProxyFactory;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.BiliBili;
//
import com.zhiwei.util.WordReadFile;
//
//
public class BilibiliTest {
//
@Test
//
public void f() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER
);
// List<String> wordList = WordReadFile.getWords("D://crawlerdata//关键词-1
.txt");
//
List<Map<String, Object>> bodyList = new ArrayList<>();
//
String cookie = "LIVE_BUVID=AUTO8715300758995538; sid=kp5rluge; fts=1530161621; im_notify_type_35324319=0; buvid3=08ABE6AE-5061-4CE5-B34F-1A8AAB64DB3320712infoc; rpdid=olppsmkxmpdoskwoxiwww; finger=edc6ecda; stardustvideo=1; UM_distinctid=164fe68fb31996-01f161c3523abe-6114167a-1fa400-164fe68fb32274";
//
for (String word : wordList) {
// List<Map<String, Object>> dataList = BiliBili.getData(word, null, "2019-07-18
00:00:00",
//
cookie);
//
if (dataList != null) {
//
System.out.println(word + " ----- " + dataList.size());
//
bodyList.addAll(dataList);
//
}
//
}
//
List<String> headlist = new ArrayList<>();
//
headlist.add("submitcount");
//
headlist.add("playcount");
//
headlist.add("time");
//
headlist.add("source");
//
headlist.add("title");
//
headlist.add("url");
//
headlist.add("word");
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("D://crawlerdata//视频//bilibili关键词采集数据-吃鸡否-0722
.xlsx", "B站数据", headlist, bodyList);
//
//
}
//
}
package
com
.
zhiwei
.
shipin
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.BiliBili
;
import
com.zhiwei.util.WordReadFile
;
public
class
BilibiliTest
{
@Test
public
void
f
()
{
ProxyFactory
.
init
(
"zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181"
,
"local"
,
GroupType
.
PROVIDER
,
10000002L
);
List
<
String
>
wordList
=
WordReadFile
.
getWords
(
"D://crawlerdata//关键词
.txt"
);
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<>();
String
cookie
=
"LIVE_BUVID=AUTO8715300758995538; sid=kp5rluge; fts=1530161621; im_notify_type_35324319=0; buvid3=08ABE6AE-5061-4CE5-B34F-1A8AAB64DB3320712infoc; rpdid=olppsmkxmpdoskwoxiwww; finger=edc6ecda; stardustvideo=1; UM_distinctid=164fe68fb31996-01f161c3523abe-6114167a-1fa400-164fe68fb32274"
;
for
(
String
word
:
wordList
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
BiliBili
.
getData
(
word
,
null
,
"2001-01-14
00:00:00"
,
cookie
);
if
(
dataList
!=
null
)
{
System
.
out
.
println
(
word
+
" ----- "
+
dataList
.
size
());
bodyList
.
addAll
(
dataList
);
}
}
List
<
String
>
headlist
=
new
ArrayList
<>();
headlist
.
add
(
"submitcount"
);
headlist
.
add
(
"playcount"
);
headlist
.
add
(
"time"
);
headlist
.
add
(
"source"
);
headlist
.
add
(
"title"
);
headlist
.
add
(
"url"
);
headlist
.
add
(
"word"
);
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
poi
.
exportExcel
(
"D://crawlerdata//视频//bilibili关键词采集数据-封神神话-收藏
.xlsx"
,
"B站数据"
,
headlist
,
bodyList
);
}
}
src/test/java/com/zhiwei/shipin/QQTVTest.java
View file @
6018f0b3
...
...
@@ -17,8 +17,8 @@
//public class QQTVTest {
// @Test
// public void f() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",GroupType.PROVIDER);
// String time = "2019-0
4
-11 00:00:00";
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",GroupType.PROVIDER
, 10000002
);
// String time = "2019-0
1
-11 00:00:00";
// List<String> wordList = WordReadFile.getWords("D://crawlerdata//关键词.txt");
// List<Map<String, Object>> bodyList = new ArrayList<>();
// for (String word : wordList) {
...
...
@@ -37,7 +37,7 @@
// headlist.add("url");
// headlist.add("word");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("D://crawlerdata//视频//腾讯视频关键词采集数据-
毓婷-0716
.xlsx", "腾讯视频数据", headlist, bodyList);
// poi.exportExcel("D://crawlerdata//视频//腾讯视频关键词采集数据-
精装修
.xlsx", "腾讯视频数据", headlist, bodyList);
//
//
//
...
...
src/test/java/com/zhiwei/shipin/SohuTVTest.java
View file @
6018f0b3
...
...
@@ -33,7 +33,7 @@
// headlist.add("url");
// headlist.add("word");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("D://crawlerdata//视频//搜狐视频关键词采集数据-
毓婷-0716
.xlsx", "搜狐数据", headlist, bodyList);
// poi.exportExcel("D://crawlerdata//视频//搜狐视频关键词采集数据-
华为-0812
.xlsx", "搜狐数据", headlist, bodyList);
//
// }
//}
src/test/java/com/zhiwei/shipin/YoukuKeyWordTest.java
View file @
6018f0b3
...
...
@@ -30,7 +30,7 @@
// headList.add("uper");
// headList.add("word");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("D://crawlerdata//视频//优酷数据-
毓婷-0716
.xlsx", "数据", headList, bodyList);
// poi.exportExcel("D://crawlerdata//视频//优酷数据-
华为-0812
.xlsx", "数据", headList, bodyList);
//
// }
//}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment