Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
A
articlenewscrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
chenweiyang
articlenewscrawler
Commits
9234d24c
Commit
9234d24c
authored
Jul 26, 2019
by
yangchen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
更新
parent
cb5516a0
Hide whitespace changes
Inline
Side-by-side
Showing
90 changed files
with
2247 additions
and
1901 deletions
+2247
-1901
pom.xml
+5
-20
src/main/java/com/zhiwei/httpclient/HeadGet.java
+5
-5
src/main/java/com/zhiwei/httpclient/HttpClient.java
+15
-3
src/main/java/com/zhiwei/parse/Aika.java
+2
-1
src/main/java/com/zhiwei/parse/Aiqiyi.java
+1
-1
src/main/java/com/zhiwei/parse/Baijia.java
+30
-9
src/main/java/com/zhiwei/parse/BiliBili.java
+14
-18
src/main/java/com/zhiwei/parse/Chejia.java
+1
-1
src/main/java/com/zhiwei/parse/Dayu.java
+5
-7
src/main/java/com/zhiwei/parse/Douban.java
+1
-1
src/main/java/com/zhiwei/parse/Fenghuang.java
+6
-5
src/main/java/com/zhiwei/parse/Gftai.java
+1
-1
src/main/java/com/zhiwei/parse/KuaiTousu.java
+1
-1
src/main/java/com/zhiwei/parse/Maimai.java
+1
-1
src/main/java/com/zhiwei/parse/Pcauto.java
+1
-1
src/main/java/com/zhiwei/parse/QQKB.java
+3
-2
src/main/java/com/zhiwei/parse/QQNews.java
+1
-1
src/main/java/com/zhiwei/parse/QicheHome.java
+1
-1
src/main/java/com/zhiwei/parse/SinaKeji.java
+1
-1
src/main/java/com/zhiwei/parse/SinaTousu.java
+1
-1
src/main/java/com/zhiwei/parse/Souhu.java
+14
-39
src/main/java/com/zhiwei/parse/TXNews.java
+44
-1
src/main/java/com/zhiwei/parse/TechTx.java
+1
-1
src/main/java/com/zhiwei/parse/Wangyi.java
+67
-5
src/main/java/com/zhiwei/parse/Xueqiu.java
+21
-13
src/main/java/com/zhiwei/parse/Yangshi.java
+69
-0
src/main/java/com/zhiwei/parse/Yiche.java
+1
-1
src/main/java/com/zhiwei/parse/Yidianzixun.java
+8
-9
src/main/java/com/zhiwei/parse/Youku.java
+3
-3
src/main/java/com/zhiwei/parse/analysis/BaijiaAccountAnalysis.java
+8
-10
src/main/java/com/zhiwei/parse/analysis/BilibilikeyWordAnalysis.java
+7
-4
src/main/java/com/zhiwei/parse/analysis/DayuAccountAnalysis.java
+1
-1
src/main/java/com/zhiwei/parse/analysis/DayuByWordAnalysis.java
+1
-1
src/main/java/com/zhiwei/parse/analysis/FenghuangAccountAnalysis.java
+47
-58
src/main/java/com/zhiwei/parse/analysis/FenghuangCommentAnalysis.java
+1
-1
src/main/java/com/zhiwei/parse/analysis/SouhuAccountAnalysis.java
+9
-15
src/main/java/com/zhiwei/parse/analysis/SouhuCommentAnalysis.java
+1
-1
src/main/java/com/zhiwei/parse/shipin/QQTV.java
+26
-23
src/main/java/com/zhiwei/parse/shipin/SohuTV.java
+1
-1
src/test/java/com/zhiwei/Comment/AikaComment.java
+2
-4
src/test/java/com/zhiwei/Comment/AiqiyiHotCountTest.java
+33
-33
src/test/java/com/zhiwei/Comment/MaimaiCommentCountTest.java
+21
-20
src/test/java/com/zhiwei/Comment/YoukuHotCountTest.java
+36
-37
src/test/java/com/zhiwei/TestHttpBoot.java
+21
-38
src/test/java/com/zhiwei/crawler/DayuAccountExample.java
+54
-50
src/test/java/com/zhiwei/crawler/DayuCommentCountExample.java
+19
-19
src/test/java/com/zhiwei/crawler/DayuCommentExample.java
+65
-65
src/test/java/com/zhiwei/crawler/FenghuangAccountExample.java
+0
-39
src/test/java/com/zhiwei/crawler/FenghuangByWordExample.java
+45
-45
src/test/java/com/zhiwei/crawler/FenghuangCommentCountExample.java
+23
-23
src/test/java/com/zhiwei/crawler/FenghuangCommentExample.java
+61
-61
src/test/java/com/zhiwei/crawler/MaimaiBywordExample.java
+32
-32
src/test/java/com/zhiwei/crawler/MeipaiByWordExample.java
+43
-43
src/test/java/com/zhiwei/crawler/MiaopaiByUrlExample.java
+54
-54
src/test/java/com/zhiwei/crawler/PearVideoByWordExample.java
+33
-33
src/test/java/com/zhiwei/crawler/QQAccountExample.java
+52
-52
src/test/java/com/zhiwei/crawler/QQKBByWordExample.java
+20
-20
src/test/java/com/zhiwei/crawler/QQKBCommentCountExample.java
+21
-21
src/test/java/com/zhiwei/crawler/QQKBCommentExample.java
+51
-51
src/test/java/com/zhiwei/crawler/QQNewsCommentListTest.java
+137
-137
src/test/java/com/zhiwei/crawler/SinaCommentListTest.java
+84
-84
src/test/java/com/zhiwei/crawler/SoKuByWordExample.java
+40
-40
src/test/java/com/zhiwei/crawler/SouhuAccountExample.java
+38
-35
src/test/java/com/zhiwei/crawler/SouhuCommentCountExample.java
+48
-48
src/test/java/com/zhiwei/crawler/SouhuCommentExample.java
+62
-62
src/test/java/com/zhiwei/crawler/TXNewsByWordExample.java
+27
-27
src/test/java/com/zhiwei/crawler/Test1.java
+20
-20
src/test/java/com/zhiwei/crawler/WangyiCommentCountExample.java
+51
-51
src/test/java/com/zhiwei/crawler/WangyiCommentExample.java
+60
-60
src/test/java/com/zhiwei/crawler/WangyiHistoryExample.java
+30
-30
src/test/java/com/zhiwei/crawler/XiaomiShequByWordExample.java
+35
-35
src/test/java/com/zhiwei/crawler/XiguaAccountExample.java
+44
-44
src/test/java/com/zhiwei/crawler/XiguaByWordExample.java
+47
-47
src/test/java/com/zhiwei/crawler/YidainzixunByWordExample.java
+1
-1
src/test/java/com/zhiwei/crawler/YidianzixunCommentExample.java
+23
-23
src/test/java/com/zhiwei/hsitory/BaijiaAccountExample.java
+46
-0
src/test/java/com/zhiwei/hsitory/FenghuangAccountExample.java
+43
-0
src/test/java/com/zhiwei/hsitory/SouhuAccountExample.java
+39
-0
src/test/java/com/zhiwei/hsitory/TxNewsHostoryExample.java
+34
-0
src/test/java/com/zhiwei/hsitory/WangyiHistoryExample.java
+33
-0
src/test/java/com/zhiwei/hsitory/XueqiuHostoryExample.java
+35
-0
src/test/java/com/zhiwei/hsitory/YidianzixunAccountExample.java
+8
-5
src/test/java/com/zhiwei/keyword/XueqiuKeyWord.java
+47
-46
src/test/java/com/zhiwei/shipin/AiqiyiTest.java
+3
-3
src/test/java/com/zhiwei/shipin/BilibiliTest.java
+7
-4
src/test/java/com/zhiwei/shipin/QQTVTest.java
+4
-4
src/test/java/com/zhiwei/shipin/SohuTVTest.java
+2
-2
src/test/java/com/zhiwei/shipin/YoukuKeyWordTest.java
+2
-2
src/test/java/com/zhiwei/user/MaimaiTest.java
+30
-28
src/test/java/com/zhiwei/user/QQkandianExample.java
+55
-55
No files found.
pom.xml
View file @
9234d24c
...
...
@@ -3,42 +3,27 @@
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiwei
</groupId>
<artifactId>
articlenewscrawler
</artifactId>
<version>
0.1.
3
-SNAPSHOT
</version>
<version>
0.1.
6
-SNAPSHOT
</version>
<name>
articlenewscrawler
</name>
<description>
采集凤凰,一点资讯,搜狐历时文章和文章评论
</description>
<dependencies>
<dependency>
<groupId>
org.testng
</groupId>
<artifactId>
testng
</artifactId>
<version>
6.14.3
</version>
</dependency>
<dependency>
<groupId>
com.alibaba
</groupId>
<artifactId>
fastjson
</artifactId>
<version>
1.2.29
</version>
</dependency>
<dependency>
<groupId>
junit
</groupId>
<artifactId>
junit
</artifactId>
<version>
4.11
</version>
</dependency>
<dependency>
<groupId>
com.zhiwei
</groupId>
<artifactId>
excelpoi
</artifactId>
<version>
0.0.
1
-SNAPSHOT
</version>
<version>
0.0.
3
-SNAPSHOT
</version>
</dependency>
<dependency>
<groupId>
com.zhiwei.tools
</groupId>
<artifactId>
zhiwei-tools
</artifactId>
<version>
0.1.
2
-SNAPSHOT
</version>
<version>
0.1.
3
-SNAPSHOT
</version>
</dependency>
<dependency>
<groupId>
com.zhiwei.crawler
</groupId>
<artifactId>
crawler-core
</artifactId>
<version>
0.3.
0
-RELEASE
</version>
<version>
0.3.
6
-RELEASE
</version>
<scope>
provided
</scope>
</dependency>
</dependency>
</dependencies>
<!-- 打包管理 -->
...
...
src/main/java/com/zhiwei/httpclient/HeadGet.java
View file @
9234d24c
...
...
@@ -113,14 +113,14 @@ public class HeadGet {
* @throws IOException
*/
public
static
Map
<
String
,
String
>
getFenghuangAccountHeaderMap
(
String
cookie
)
{
Map
<
String
,
String
>
headerMap
=
new
HashMap
<
String
,
String
>();
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>();
headerMap
.
put
(
"User-Agent"
,
"
IfengNews/6.1.8 (iPhone; iOS 11.2.1; Scale/2.00)
"
);
"
Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36
"
);
headerMap
.
put
(
"Accept"
,
"
*/*
"
);
headerMap
.
put
(
"Accept-Language"
,
"zh-
cn
"
);
"
text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
"
);
headerMap
.
put
(
"Accept-Language"
,
"zh-
CN,zh;q=0.9,en;q=0.8
"
);
headerMap
.
put
(
"Connection"
,
"keep-alive"
);
headerMap
.
put
(
"Host"
,
"
api.3g
.ifeng.com"
);
headerMap
.
put
(
"Host"
,
"
shankapi
.ifeng.com"
);
if
(
cookie
!=
null
)
{
headerMap
.
put
(
"Cookie"
,
cookie
);
}
...
...
src/main/java/com/zhiwei/httpclient/HttpClient.java
View file @
9234d24c
...
...
@@ -16,7 +16,7 @@ import okhttp3.Response;
public
class
HttpClient
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
HttpClient
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
(
false
,
2
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
(
);
/**
*
...
...
@@ -44,15 +44,27 @@ public class HttpClient {
* @throws IOException
*/
public
static
String
executeHttpRequestGet
(
String
url
,
ProxyHolder
proxy
,
Map
<
String
,
String
>
headerMap
)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
)){
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
)){
return
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
logger
.
error
(
"httpClient 获取数据出现问题:{}"
,
e
);
}
}
return
null
;
}
public
static
String
executeHttpRequestPost
(
String
url
,
Proxy
proxy
,
Map
<
String
,
String
>
headerMap
,
Map
<
String
,
Object
>
paramMap
)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapPost
(
url
,
headerMap
,
paramMap
),
proxy
)){
return
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
logger
.
error
(
"httpClient 获取数据出现问题:{}"
,
e
);
return
null
;
}
}
public
static
String
executeHttpRequestPost
(
String
url
,
Proxy
proxy
,
Map
<
String
,
String
>
headerMap
,
Map
<
String
,
Object
>
paramMap
)
{
public
static
String
executeHttpRequestPost
(
String
url
,
Proxy
Holder
proxy
,
Map
<
String
,
String
>
headerMap
,
Map
<
String
,
Object
>
paramMap
)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapPost
(
url
,
headerMap
,
paramMap
),
proxy
)){
return
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
...
...
src/main/java/com/zhiwei/parse/Aika.java
View file @
9234d24c
...
...
@@ -20,7 +20,7 @@ public class Aika {
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
Aika
.
class
);
private
static
AikaCommentAnalysis
aikaCommentAnalysis
=
new
AikaCommentAnalysis
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
Map
<
String
,
Object
>>
getAikaComment
(
String
url
,
ProxyHolder
proxy
)
{
...
...
@@ -46,6 +46,7 @@ public class Aika {
page
++;
}
catch
(
Exception
e
)
{
logger
.
error
(
"爱卡汽车 评论采集出错 {}"
,
e
);
break
;
}
}
...
...
src/main/java/com/zhiwei/parse/Aiqiyi.java
View file @
9234d24c
...
...
@@ -23,7 +23,7 @@ import okhttp3.Response;
public
class
Aiqiyi
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
Aiqiyi
.
class
);
private
static
AiqiyiByWordAnalysis
aiqiyiByWordAnalysis
=
new
AiqiyiByWordAnalysis
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
(
false
,
2
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
(
);
/**
*
...
...
src/main/java/com/zhiwei/parse/Baijia.java
View file @
9234d24c
...
...
@@ -2,8 +2,11 @@ package com.zhiwei.parse;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Objects
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
...
...
@@ -23,7 +26,7 @@ import okhttp3.Request;
public
class
Baijia
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
Baijia
.
class
);
private
static
BaijiaAccountAnalysis
baijiaAccountAnalysis
=
new
BaijiaAccountAnalysis
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
/**
*
...
...
@@ -77,27 +80,29 @@ public class Baijia {
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
headerMap
.
put
(
"cookie"
,
cookie
);
String
uk
=
getUkData
(
app_id
,
proxy
,
cookie
);
if
(
Objects
.
isNull
(
uk
))
{
return
Collections
.
emptyList
();
}
boolean
f
=
true
;
int
n
=
0
;
String
ctime
=
""
;
while
(
f
)
{
for
(
int
i
=
1
;
i
<
3
;
i
++)
{
try
{
String
url
=
"https://author.baidu.com/list?type=article&context={%22offset%22:%22-1_"
+
n
+
"%22,%22app_id%22:%22"
+
app_id
+
"%22,%22pageSize%22:20}"
;
System
.
out
.
println
(
url
);
String
url
=
"https://author.baidu.com/list?type=article&tab=2&uk="
+
uk
+
"&ctime="
+
ctime
+
"&num=50"
;
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
String
result
=
httpBoot
.
syncCall
(
request
,
proxy
,
false
).
body
().
string
();
String
result
=
httpBoot
.
syncCall
(
request
,
proxy
).
body
().
string
();
Map
<
String
,
Object
>
dMap
=
baijiaAccountAnalysis
.
getBaijiaAccountData3
(
result
,
name
,
startTime
);
List
<
Map
<
String
,
Object
>>
dList
=
(
List
<
Map
<
String
,
Object
>>)
dMap
.
get
(
"data"
);
dataList
.
addAll
(
dList
);
logger
.
info
(
"{} 数据采集结果 {}"
,
name
,
dataList
.
size
());
logger
.
info
(
"{} 数据采集结果 {}"
,
app_id
,
dataList
.
size
());
if
(!(
boolean
)
dMap
.
get
(
"more"
))
{
f
=
false
;
}
ctime
=
String
.
valueOf
(
dMap
.
get
(
"ctime"
));
ZhiWeiTools
.
sleep
(
3000
);
n
+=
20
;
break
;
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
ZhiWeiTools
.
sleep
(
3000
);
}
}
...
...
@@ -106,6 +111,22 @@ public class Baijia {
return
dataList
;
}
private
static
String
getUkData
(
String
app_id
,
Proxy
proxy
,
String
cookie
)
{
String
url
=
"https://author.baidu.com/profile?context={\"from\":0,\"app_id\":\""
+
app_id
+
"\"}&cmdType=&pagelets[]=root&reqID=0&ispeed=1#"
;
Map
<
String
,
Object
>
headers
=
new
HashMap
<>();
headers
.
put
(
"Host"
,
"author.baidu.com"
);
headers
.
put
(
"cookie"
,
cookie
);
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
try
{
String
result
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headers
),
proxy
).
body
().
string
();
return
result
.
split
(
"uk\\\\\":\\\\\""
)[
1
].
split
(
"\\\\\","
)[
0
];
}
catch
(
Exception
e
)
{
logger
.
error
(
"百家号uk 获取失败"
);
}
}
return
null
;
}
/**
*
* @Description 百家号历史文章采集
...
...
@@ -114,7 +135,7 @@ public class Baijia {
* @return
*/
public
static
List
<
Map
<
String
,
Object
>>
getBaijiaAccountData
(
String
app_id
,
String
startTime
,
Proxy
proxy
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>
>();
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
int
i
=
0
;
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getBaijiaAccountHeaderMap
(
null
);
try
{
...
...
src/main/java/com/zhiwei/parse/BiliBili.java
View file @
9234d24c
...
...
@@ -12,28 +12,28 @@ import org.slf4j.Logger;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.HttpRequestBuilder
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.parse.analysis.BilibilikeyWordAnalysis
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Headers
;
import
okhttp3.Request
;
public
class
BiliBili
{
private
static
final
Logger
logger
=
LoggerFactory
.
getLogger
(
BiliBili
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
useCookieJar
(
true
).
build
();
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
Map
<
String
,
Object
>>
getData
(
String
word
,
Proxy
proxy
,
String
cookie
)
{
public
static
List
<
Map
<
String
,
Object
>>
getData
(
String
word
,
Proxy
proxy
,
String
endTime
,
String
cookie
)
{
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<>();
try
{
//
String
url
=
"https://search.bilibili.com/all?keyword="
+
URLEncoder
.
encode
(
word
,
"utf-8"
)+
"&order=pubdate&duration=0&tids_1=0"
;
Headers
header
=
Headers
.
of
(
"cookie"
,
cookie
,
"Referer"
,
"https://www.bilibili.com/"
,
"Host"
,
"search.bilibili.com"
);
Request
request
=
HttpRequestBuilder
.
newGetRequest
(
url
,
header
);
String
result
=
httpBoot
.
syncCall
(
request
,
proxy
).
body
().
string
();
ZhiWeiTools
.
sleep
(
3000
);
Map
<
String
,
Object
>
map
=
BilibilikeyWordAnalysis
.
getData
(
result
,
word
);
String
result
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
header
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
ZhiWeiTools
.
sleep
(
100
);
Map
<
String
,
Object
>
map
=
BilibilikeyWordAnalysis
.
getData
(
result
,
word
,
endTime
);
boolean
more
=
(
boolean
)
map
.
get
(
"more"
);
List
<
Map
<
String
,
Object
>>
dataList
=
(
List
<
Map
<
String
,
Object
>>)
map
.
get
(
"data"
);
if
(
dataList
!=
null
)
{
...
...
@@ -43,27 +43,23 @@ public class BiliBili {
while
(
more
)
{
map
.
clear
();
String
ur
=
url
+
"&page="
+
n
;
System
.
out
.
println
(
ur
);
request
=
HttpRequestBuilder
.
newGetRequest
(
ur
,
header
);
String
result2
=
httpBoot
.
syncCall
(
request
,
proxy
).
body
().
string
();
map
=
BilibilikeyWordAnalysis
.
getData
(
result2
,
word
);
String
result2
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
ur
,
header
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
map
=
BilibilikeyWordAnalysis
.
getData
(
result2
,
word
,
endTime
);
List
<
Map
<
String
,
Object
>>
dataList2
=
(
List
<
Map
<
String
,
Object
>>)
map
.
get
(
"data"
);
if
(
dataList2
!=
null
)
{
bodyList
.
addAll
(
dataList2
);
}
System
.
out
.
println
(
n
+
"页,数据总量为 -- "
+
bodyList
.
size
()
);
logger
.
info
(
"word {} , {} 页,数据总量为 -- {}"
,
word
,
n
,
bodyList
.
size
()
);
more
=
(
boolean
)
map
.
get
(
"more"
);
n
++;
ZhiWeiTools
.
sleep
(
30
00
);
ZhiWeiTools
.
sleep
(
1
00
);
}
return
bodyList
;
}
catch
(
UnsupportedEncodingException
e
)
{
logger
.
error
(
"e "
,
e
);
logger
.
error
(
"e
{}
"
,
e
);
}
catch
(
Exception
e
)
{
logger
.
error
(
"e "
,
e
);
logger
.
error
(
"e
{}
"
,
e
);
}
return
Collections
.
emptyList
();
}
...
...
src/main/java/com/zhiwei/parse/Chejia.java
View file @
9234d24c
...
...
@@ -25,7 +25,7 @@ import okhttp3.Response;
public
class
Chejia
{
private
static
final
Logger
logger
=
LoggerFactory
.
getLogger
(
Chejia
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
/**
*
...
...
src/main/java/com/zhiwei/parse/Dayu.java
View file @
9234d24c
...
...
@@ -11,6 +11,7 @@ import org.slf4j.Logger;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.parse.analysis.DayuAccountAnalysis
;
...
...
@@ -30,26 +31,23 @@ public class Dayu {
* @param mid
* @return
*/
public
static
List
<
Map
<
String
,
Object
>>
getDayuAccountData
(
String
mid
,
String
name
,
String
startTime
,
Proxy
proxy
)
{
public
static
List
<
Map
<
String
,
Object
>>
getDayuAccountData
(
String
mid
,
String
name
,
String
startTime
,
Proxy
Holder
proxy
)
{
int
i
=
1
;
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getDayuAccountHeaderMap
(
null
);
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>
>();
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
try
{
while
(
true
)
{
String
url
=
"http://ff.dayu.com/contents/author/"
+
mid
+
"?biz_id=1002&_size=50&_page="
+
i
+
"&_order_type=published_at&status=1&_fetch=1"
;
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
proxy
,
headerMap
);
System
.
out
.
println
(
url
);
List
<
Map
<
String
,
Object
>>
lists
=
dayuAccountAnalysis
.
getDayuAccountData
(
result
,
name
,
startTime
);
if
(
lists
==
null
)
{
break
;
}
if
(
lists
.
size
()
<
1
)
{
if
(
lists
==
null
||
lists
.
isEmpty
())
{
break
;
}
dataList
.
addAll
(
lists
);
System
.
out
.
println
(
"================解析第"
+
i
+
"页====此时有数据=="
+
dataList
.
size
());
i
++;
ZhiWeiTools
.
sleep
(
70
00
);
ZhiWeiTools
.
sleep
(
1
00
);
}
return
dataList
;
}
catch
(
Exception
e
)
{
...
...
src/main/java/com/zhiwei/parse/Douban.java
View file @
9234d24c
...
...
@@ -25,7 +25,7 @@ public class Douban {
private
static
final
Logger
logger
=
LoggerFactory
.
getLogger
(
Double
.
class
);
private
static
DoubanCommentAnalysis
doubanCommentAnalysis
=
new
DoubanCommentAnalysis
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
/**
*
...
...
src/main/java/com/zhiwei/parse/Fenghuang.java
View file @
9234d24c
...
...
@@ -19,6 +19,7 @@ import com.zhiwei.parse.analysis.FenghuangCommentAnalysis;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
public
class
Fenghuang
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
Fenghuang
.
class
);
private
static
FenghuangAccountAnalysis
fenghuangAccountAnalysis
=
new
FenghuangAccountAnalysis
();
private
static
FenghuangCommentAnalysis
fenghuangCommentAnalysis
=
new
FenghuangCommentAnalysis
();
...
...
@@ -31,7 +32,7 @@ public class Fenghuang {
* @param startTime 可不传 格式(2017-12-09 17:53:02)
* @return
*/
public
static
List
<
Map
<
String
,
Object
>>
getFenghuangAccountData
(
String
id
,
String
startTime
,
Proxy
proxy
)
{
public
static
List
<
Map
<
String
,
Object
>>
getFenghuangAccountData
(
String
id
,
String
startTime
,
Proxy
Holder
proxy
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
int
i
=
1
;
boolean
f
=
true
;
...
...
@@ -39,17 +40,17 @@ public class Fenghuang {
try
{
for
(
int
j
=
0
;
j
<
3
;
j
++){
f
=
true
;
String
url
=
"http
://api.3g.ifeng.com/api_wemedia_index?followid=weMedia_"
+
id
+
"&page="
+
i
+
"&pagesize=20&tag=article&uid=fe659b7e510444c28a31f88dee7a2747
"
;
String
url
=
"http
s://shankapi.ifeng.com/winter/feng/author/getFengAuthorListData/"
+
id
+
"/doc/"
+
i
+
"/getFengAuthorListData
"
;
List
<
Map
<
String
,
Object
>>
list
=
fenghuangAccountAnalysis
.
getArticleData
(
url
,
startTime
,
proxy
);
if
(
list
!=
null
&&
!
list
.
isEmpty
())
{
dataList
.
addAll
(
list
);
logger
.
info
(
"
====================采集第 {} 页===共获取数据==
{}"
,
i
,
dataList
.
size
());
logger
.
info
(
"
采集第 {} 页,.共获取数据
{}"
,
i
,
dataList
.
size
());
i
++;
ZhiWeiTools
.
sleep
(
20
00
);
ZhiWeiTools
.
sleep
(
1
00
);
break
;
}
f
=
false
;
ZhiWeiTools
.
sleep
(
20
00
);
ZhiWeiTools
.
sleep
(
1
00
);
}
}
catch
(
Exception
e
)
{
logger
.
error
(
"程序出错 {}"
,
e
);
...
...
src/main/java/com/zhiwei/parse/Gftai.java
View file @
9234d24c
...
...
@@ -18,7 +18,7 @@ public class Gftai {
private
static
final
Logger
logger
=
LoggerFactory
.
getLogger
(
Gftai
.
class
);
private
static
GftaiAnalysis
gftaiAnalysis
=
new
GftaiAnalysis
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
public
static
List
<
Map
<
String
,
Object
>>
getData
(
String
word
,
Proxy
proxy
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
...
...
src/main/java/com/zhiwei/parse/KuaiTousu.java
View file @
9234d24c
...
...
@@ -19,7 +19,7 @@ public class KuaiTousu {
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
KuaiTousu
.
class
);
private
static
KuaiTousuAnalysis
kuaiTousuAnalysis
=
new
KuaiTousuAnalysis
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
public
static
List
<
Map
<
String
,
Object
>>
getData
(
String
word
,
Proxy
proxy
)
{
int
page
=
1
;
...
...
src/main/java/com/zhiwei/parse/Maimai.java
View file @
9234d24c
...
...
@@ -30,7 +30,7 @@ import okhttp3.Response;
public
class
Maimai
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
Maimai
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
private
static
MaimaiBywordAnalysis
maimaiBywordAnalysis
=
new
MaimaiBywordAnalysis
();
...
...
src/main/java/com/zhiwei/parse/Pcauto.java
View file @
9234d24c
...
...
@@ -22,7 +22,7 @@ public class Pcauto {
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
Pcauto
.
class
);
private
static
PcautoCommentAnalysis
pcautoCommentAnalysis
=
new
PcautoCommentAnalysis
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
Map
<
String
,
Object
>>
getPcAutoComment
(
String
url
,
ProxyHolder
proxy
)
{
...
...
src/main/java/com/zhiwei/parse/QQKB.java
View file @
9234d24c
...
...
@@ -13,6 +13,7 @@ import com.alibaba.fastjson.JSONArray;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.bean.QQkbUser
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.parse.analysis.QQKBAccountAnalysis
;
...
...
@@ -120,7 +121,7 @@ public class QQKB {
while
(
true
)
{
try
{
String
result
=
HttpClient
.
executeHttpRequestPost
(
"http://r.cnews.qq.com/getQQNewsComment"
,
Proxy
Factory
.
getNatProxy
()
,
headerMap
,
paramMap
);
String
result
=
HttpClient
.
executeHttpRequestPost
(
"http://r.cnews.qq.com/getQQNewsComment"
,
Proxy
Holder
.
NAT_HEAVY_PROXY
,
headerMap
,
paramMap
);
paramMap
.
clear
();
List
<
Map
<
String
,
Object
>>
lists
=
qqkbCommentAnalysis
.
getCommentData
(
result
,
null
,
comment_id
,
article_id
,
proxy
);
if
(
lists
==
null
||
lists
.
size
()
<
1
)
{
...
...
@@ -148,7 +149,7 @@ public class QQKB {
String
cookie
=
"luin=o0497332654;%20lskey=00030000d63ffaf7eba88c86106eac5f2910d45515222334b91c75a66b449c990c2be43cd202ba39b35bef60;%20uin=o0497332654;%20skey=MH3wukytS4;%20sigA2=7AB4D8DEDF73E313801FD348FD77EC3B05C06DBC4D9DA669B20CA04A8D6B80F300A69567FBD11A7B799E419BB796F22D47D3AE5FA95E708A0ABC66161061131B0B21A0031AA0807C;%20openid=oijc7uLZNVp85r-MpFBYxwxzdEkg;%20appid=wxe90c9765ad00e2cd;%20access_token=5_IWbsKfygpJ0lnbUnnFnfwmSQ3EqlyzseC9-qGEFY7Tkr0Ypk5vsnSwOaMC-IGxsBeY2K7knHrYstj_5dZpisJd5nihvLNQvCdsFhFwZQcT8;%20refresh_token=5_IWbsKfygpJ0lnbUnnFnfwgDT4pA9HEbY-wuqnqIHWf9AzdmIueZFuzHYfnZPuSNEc0ZjDuHXrtSrRBMMD-7kgj06iF0NdAOi-KRj6-mrmlA;%20unionid=onCs1uNNpjMXeYIHAhacGypamEBk;%20logintype=0"
;
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getQQkbUserHeaderMap
(
cookie
);
Map
<
String
,
Object
>
paramMap
=
HeadGet
.
getQQkbUserParamMap
(
name
);
String
result
=
HttpClient
.
executeHttpRequestPost
(
url
,
null
,
headerMap
,
paramMap
);
String
result
=
HttpClient
.
executeHttpRequestPost
(
url
,
ProxyHolder
.
NAT_HEAVY_PROXY
,
headerMap
,
paramMap
);
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
JSONObject
json1
=
json
.
getJSONObject
(
"new_list"
);
JSONObject
json2
=
json1
.
getJSONArray
(
"data"
).
getJSONObject
(
0
);
...
...
src/main/java/com/zhiwei/parse/QQNews.java
View file @
9234d24c
...
...
@@ -24,7 +24,7 @@ public class QQNews {
private
static
final
Logger
logger
=
LoggerFactory
.
getLogger
(
QQNews
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
/**
* .
...
...
src/main/java/com/zhiwei/parse/QicheHome.java
View file @
9234d24c
...
...
@@ -17,7 +17,7 @@ import com.zhiwei.tools.tools.ZhiWeiTools;
public
class
QicheHome
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
QicheHome
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
private
static
QicheHomeKwyWordAnalysis
qicheHomeKwyWordAnalysis
=
new
QicheHomeKwyWordAnalysis
();
...
...
src/main/java/com/zhiwei/parse/SinaKeji.java
View file @
9234d24c
...
...
@@ -24,7 +24,7 @@ public class SinaKeji {
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
SinaKeji
.
class
);
private
static
SinaKejiCommentAnalysis
sinaKejiCommentAnalysis
=
new
SinaKejiCommentAnalysis
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
/**
* https://tech.sina.com.cn/it/2018-07-17/doc-ihfkffam0632649.shtml
...
...
src/main/java/com/zhiwei/parse/SinaTousu.java
View file @
9234d24c
...
...
@@ -21,7 +21,7 @@ public class SinaTousu {
private
static
final
Logger
logger
=
LoggerFactory
.
getLogger
(
SinaTousu
.
class
);
private
static
SinaTousuAnalysis
sinaTousuAnalysis
=
new
SinaTousuAnalysis
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
public
static
List
<
Map
<
String
,
Object
>>
getSinaTousuData
(
String
word
,
ProxyHolder
proxy
,
String
time
)
{
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<>();
...
...
src/main/java/com/zhiwei/parse/Souhu.java
View file @
9234d24c
...
...
@@ -2,14 +2,11 @@ package com.zhiwei.parse;
import
static
java
.
util
.
Objects
.
nonNull
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.Map
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
...
...
@@ -73,36 +70,28 @@ public class Souhu {
* @param isCulling 是否采集精选
* @return
*/
public
static
List
<
Map
<
String
,
Object
>>
getSouHuAccountData
(
String
xpt
,
String
startTime
,
boolean
isCulling
,
Proxy
proxy
)
{
public
static
List
<
Map
<
String
,
Object
>>
getSouHuAccountData
(
String
id
,
String
name
,
String
startTime
,
boolean
isCulling
,
ProxyHolder
proxy
)
{
int
i
=
1
;
String
name
=
getName
(
xpt
,
proxy
);
ZhiWeiTools
.
sleep
(
2000
);
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getSouhuAccountHeaderMap
(
null
);
ZhiWeiTools
.
sleep
(
200
);
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
boolean
f
=
true
;
int
j
=
0
;
while
(
f
)
{
try
{
String
url
=
"http://mp.sohu.com/apiV2/profile/newsListAjax?xpt="
+
xpt
+
"&pageNumber="
+
i
+
"&pageSize=10"
;
String
result
=
null
;
String
url
=
"http://v2.sohu.com/author-page-api/author-articles/pc/"
+
id
+
"?pNo="
+
i
;
if
(
isCulling
)
{
url
=
url
+
"&categoryId=-1"
;
}
try
{
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
proxy
,
headerMap
);
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
url
=
url
+
"&columnId=-1"
;
}
result
=
result
.
replaceAll
(
"\\\\"
,
""
);
result
=
result
.
substring
(
1
,
result
.
length
()-
1
);
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
proxy
,
null
);
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
JSONArray
jsonArray
=
json
.
getJSON
Array
(
"data
"
);
JSONArray
jsonArray
=
json
.
getJSON
Object
(
"data"
).
getJSONArray
(
"pcArticleVOS
"
);
List
<
Map
<
String
,
Object
>>
dataList1
=
souhuAccountAnalysis
.
analysisData
(
jsonArray
,
name
);
if
(
jsonArray
.
size
()
<
1
)
{
if
(
jsonArray
.
isEmpty
()
)
{
break
;
}
if
(
startTime
==
null
)
{
j
=
0
;
dataList
.
addAll
(
dataList1
);
}
//判断时间
...
...
@@ -113,40 +102,26 @@ public class Souhu {
f
=
false
;
break
;
}
j
=
0
;
dataList
.
add
(
map
);
}
}
logger
.
info
(
"=============获取到的数据数目{}"
,
dataList
.
size
());
i
++;
ZhiWeiTools
.
sleep
(
300
0
);
ZhiWeiTools
.
sleep
(
300
);
}
catch
(
Exception
e
)
{
ZhiWeiTools
.
sleep
(
300
0
);
logger
.
error
(
"出错了
"
,
e
.
getMessage
()
);
ZhiWeiTools
.
sleep
(
300
);
logger
.
error
(
"出错了
{}"
,
e
);
j
++;
if
(
j
>
5
)
{
f
=
false
;
}
continue
;
}
}
return
dataList
;
}
private
static
String
getName
(
String
xpt
,
Proxy
proxy
)
{
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getSouhuAccountHeaderMap
(
null
);
try
{
String
result
=
HttpClient
.
executeHttpRequestGet
(
"http://mp.sohu.com/profile?xpt="
+
xpt
,
proxy
,
headerMap
);
Document
doc
=
Jsoup
.
parse
(
result
);
String
name
=
doc
.
select
(
"p#ff"
).
text
();
System
.
out
.
println
(
name
);
return
name
;
}
catch
(
Exception
e
)
{
return
null
;
}
}
/**
*
* @Description 传入搜狐文章链接和cookie 可获取此文章所有评论
...
...
@@ -161,7 +136,7 @@ public class Souhu {
try
{
while
(
true
)
{
String
newurl
=
souhuCommentAnalysis
.
getSouhuURL
(
url
,
proxy
)
+
"&page_no="
+
j
;
String
result
=
HttpClient
.
executeHttpRequestGet
(
newurl
,
Proxy
Factory
.
getNatProxy
()
,
headerMap
);
String
result
=
HttpClient
.
executeHttpRequestGet
(
newurl
,
Proxy
Holder
.
NAT_HEAVY_PROXY
,
headerMap
);
System
.
out
.
println
(
newurl
);
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
JSONArray
jsonArry
=
json
.
getJSONObject
(
"jsonObject"
).
getJSONArray
(
"comments"
);
...
...
src/main/java/com/zhiwei/parse/TXNews.java
View file @
9234d24c
...
...
@@ -19,6 +19,7 @@ import com.zhiwei.crawler.utils.RequestUtils;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.parse.analysis.TXNewsByWordAnalysis
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Response
;
...
...
@@ -28,7 +29,7 @@ public class TXNews {
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
TXNews
.
class
);
private
static
TXNewsByWordAnalysis
txNewsByWordAnalysis
=
new
TXNewsByWordAnalysis
();
public
static
boolean
txNewshasMoreData
=
true
;
p
ublic
static
HttpBoot
httpBoot
=
new
HttpBoot
();
p
rivate
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
public
static
List
<
Map
<
String
,
Object
>>
getData
(
String
word
,
String
devid
,
Proxy
proxy
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
...
...
@@ -120,5 +121,47 @@ public class TXNews {
return
-
1
;
}
/**
 * Collects the article history of a Tencent News media account by paging
 * through {@code https://pacaio.match.qq.com/om/mediaArticles}, starting at page 0.
 *
 * <p>Paging stops when a page returns fewer than 10 entries, when an article at or
 * before {@code endTime} is reached, or after more than 3 failed attempts in total.
 * A page that fails part-way is rolled back before it is retried, so a retry never
 * adds the same articles twice.
 *
 * @param mid     media account id, sent as the {@code mid} query parameter
 * @param endTime cutoff compared as a string against the article time formatted as
 *                {@code yyyy-MM-dd HH:mm:ss} (presumably given in the same format; confirm
 *                with callers); ignored when {@code null} or shorter than two characters
 * @param proxy   proxy passed through to {@code httpBoot.syncCall}
 * @return the collected articles as maps with the keys {@code title}, {@code content},
 *         {@code time}, {@code source} and {@code url}; never {@code null}
 */
public static List<Map<String, Object>> getTxNewsHistory(String mid, String endTime, ProxyHolder proxy) {
    List<Map<String, Object>> dataList = new ArrayList<>();
    boolean hasCutoff = endTime != null && endTime.length() > 1;
    int page = 0;
    int errorNum = 0;
    while (true) {
        // Size of the result list before this page; used to undo a half-parsed page.
        int pageStart = dataList.size();
        String url = "https://pacaio.match.qq.com/om/mediaArticles?mid=" + mid + "&num=30&page=" + page;
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)) {
            String result = response.body().string();
            JSONArray jsonArray = JSONObject.parseObject(result).getJSONArray("data");
            for (int i = 0, j = jsonArray.size(); i < j; i++) {
                JSONObject data = jsonArray.getJSONObject(i);
                // "timestamp" is scaled by 1000 before building the Date (presumably epoch seconds).
                String time = TimeParse.dateFormartString(
                        new Date(data.getLong("timestamp") * 1000L), "yyyy-MM-dd HH:mm:ss");
                if (hasCutoff && time.compareTo(endTime) <= 0) {
                    logger.info("超时时间采集范围 跳出采集");
                    return dataList;
                }
                Map<String, Object> map = new HashMap<>();
                map.put("title", data.getString("title"));
                map.put("content", data.getString("abstract"));
                map.put("time", time);
                map.put("source", data.getString("source"));
                map.put("url", data.getString("vurl"));
                dataList.add(map);
            }
            // The page is complete: nothing collected so far may be rolled back any more.
            pageStart = dataList.size();
            logger.info("mid = {} , cralwer count = {}", mid, dataList.size());
            page++;
            if (jsonArray.size() < 10) {
                break;
            }
        } catch (Exception e) {
            logger.error("采集数据出错 {}", e);
            errorNum++;
            if (errorNum > 3) {
                // Giving up: keep whatever was parsed from the failing page.
                break;
            }
            // The same page is fetched again, so drop its partially collected items first.
            dataList.subList(pageStart, dataList.size()).clear();
        }
    }
    return dataList;
}
}
src/main/java/com/zhiwei/parse/TechTx.java
View file @
9234d24c
...
...
@@ -21,7 +21,7 @@ public class TechTx {
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
TechTx
.
class
);
private
static
TechTxCommentAnalysis
techTxCommentAnalysis
=
new
TechTxCommentAnalysis
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
Map
<
String
,
Object
>>
getTechTxComment
(
String
url
,
ProxyHolder
proxy
)
{
...
...
src/main/java/com/zhiwei/parse/Wangyi.java
View file @
9234d24c
...
...
@@ -2,6 +2,7 @@ package com.zhiwei.parse;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
...
...
@@ -9,18 +10,24 @@ import org.jsoup.Jsoup;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.parse.analysis.WangyiCommentAnalysis
;
import
com.zhiwei.parse.analysis.WangyiHistoryAnalysis
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Response
;
public
class
Wangyi
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
Wangyi
.
class
);
private
static
WangyiCommentAnalysis
wangyiCommentAnalysis
=
new
WangyiCommentAnalysis
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
private
static
WangyiHistoryAnalysis
wangyiHistoryAnalysis
=
new
WangyiHistoryAnalysis
();
/**
...
...
@@ -74,24 +81,31 @@ public class Wangyi {
}
}
/**
*
* @Description 网易网页版数据
* @param url
* @param proxy
* @param endTime
* @return
*/
public
static
List
<
Map
<
String
,
Object
>>
getHistoryData
(
String
url
,
Proxy
proxy
,
String
endTime
)
{
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getWangyiHistoryHeaderMap
(
null
);
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<
Map
<
String
,
Object
>
>();
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<>();
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
proxy
,
headerMap
);
String
wemediaid
=
result
.
split
(
"data-wemediaid=\""
)[
1
].
split
(
"\""
)[
0
];
String
source
=
Jsoup
.
parse
(
result
).
select
(
"body > div.colum_wrap.fl > div > div.colum_des > div.normal > div.colum_info > h4"
).
text
();
boolean
f
=
true
;
url
=
"http://dy.163.com/v2/article/list.do?wemediaId="
+
wemediaid
+
"&size=
2
0&pageNo="
;
url
=
"http://dy.163.com/v2/article/list.do?wemediaId="
+
wemediaid
+
"&size=
1
0&pageNo="
;
int
i
=
1
;
ZhiWeiTools
.
sleep
(
1000
);
int
j
=
0
;
while
(
f
)
{
try
{
result
=
""
;
result
=
HttpClient
.
executeHttpRequestGet
(
url
+
i
,
proxy
,
headerMap
);
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
List
<
Map
<
String
,
Object
>>
dataList
=
wangyiHistoryAnalysis
.
getData
(
result
,
proxy
,
endTime
,
source
);
if
(
dataList
==
null
||
dataList
.
size
()
<
1
)
{
if
(
dataList
==
null
||
dataList
.
isEmpty
()
)
{
break
;
}
bodyList
.
addAll
(
dataList
);
...
...
@@ -109,10 +123,58 @@ public class Wangyi {
if
(
j
>
5
)
{
f
=
false
;
}
continue
;
}
}
return
bodyList
;
}
/**
 * Collects the article history of a NetEase (163) client subscription by paging
 * through {@code https://c.m.163.com/nc/subscribe/list/{id}/all/{offset}-20.html}.
 * The offset starts at 0 and grows by 20 per page.
 *
 * <p>Paging stops when a page returns fewer than 10 entries, when an article at or
 * before {@code endTime} is reached, or after more than 3 failed attempts in total.
 * A page that fails part-way is rolled back before it is retried, so a retry never
 * adds the same articles twice.
 *
 * @param id      subscription id inserted into the request path
 * @param proxy   proxy passed through to {@code httpBoot.syncCall}
 * @param endTime cutoff compared as a string against each entry's {@code ptime} value
 *                (presumably both use the same sortable date format; confirm with callers);
 *                ignored when {@code null} or shorter than two characters
 * @return the collected articles as maps with the keys {@code title}, {@code content},
 *         {@code time}, {@code source} and {@code url}; never {@code null}
 */
public static List<Map<String, Object>> getWangyiClientHistory(String id, ProxyHolder proxy, String endTime) {
    List<Map<String, Object>> dataList = new ArrayList<>();
    boolean hasCutoff = endTime != null && endTime.length() > 1;
    int page = 0;
    int errorNum = 0;
    while (true) {
        // Size of the result list before this page; used to undo a half-parsed page.
        int pageStart = dataList.size();
        String url = "https://c.m.163.com/nc/subscribe/list/" + id + "/all/" + page + "-20.html";
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)) {
            String result = response.body().string();
            JSONArray jsonArray = JSONObject.parseObject(result).getJSONArray("tab_list");
            for (int i = 0, j = jsonArray.size(); i < j; i++) {
                JSONObject data = jsonArray.getJSONObject(i);
                String time = data.getString("ptime");
                if (hasCutoff && time.compareTo(endTime) <= 0) {
                    logger.info("超时时间采集范围 跳出采集");
                    return dataList;
                }
                Map<String, Object> map = new HashMap<>();
                map.put("title", data.getString("title"));
                map.put("content", data.getString("aheadBody"));
                map.put("time", time);
                map.put("source", data.getString("source"));
                // Video entries link through skipID, everything else through postid.
                if ("video".equals(data.getString("skipType"))) {
                    map.put("url", "https://c.m.163.com/news/v/" + data.getString("skipID") + ".html");
                } else {
                    map.put("url", "https://c.m.163.com/news/a/" + data.getString("postid") + ".html");
                }
                dataList.add(map);
            }
            // The page is complete: nothing collected so far may be rolled back any more.
            pageStart = dataList.size();
            logger.info("id = {} , cralwer count = {}", id, dataList.size());
            page += 20;
            if (jsonArray.size() < 10) {
                break;
            }
        } catch (Exception e) {
            logger.error("采集数据出错 {}", e);
            errorNum++;
            if (errorNum > 3) {
                // Giving up: keep whatever was parsed from the failing page.
                break;
            }
            // The same page is fetched again, so drop its partially collected items first.
            dataList.subList(pageStart, dataList.size()).clear();
        }
    }
    return dataList;
}
}
src/main/java/com/zhiwei/parse/Xueqiu.java
View file @
9234d24c
...
...
@@ -26,12 +26,12 @@ import okhttp3.Response;
public
class
Xueqiu
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
Xueqiu
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
private
static
XueqiuKeyWordAnalysis
xueqiuKeyWordAnalysis
=
new
XueqiuKeyWordAnalysis
();
/**
*
* @Description 关键词采集
历史
文章
* @Description 关键词采集文章
* @param word
* @param endTime
* @param proxy
...
...
@@ -53,13 +53,16 @@ public class Xueqiu {
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
String
result
=
httpBoot
.
syncCall
(
request
,
proxy
).
body
().
string
();
List
<
Map
<
String
,
Object
>>
list
=
xueqiuKeyWordAnalysis
.
getData
(
result
,
endTime
);
ZhiWeiTools
.
sleep
(
3000
);
if
(
list
.
size
()
<
1
)
{
if
(
list
.
isEmpty
())
{
i
++;
}
else
{
int
count
=
JSONObject
.
parseObject
(
result
).
getIntValue
(
"maxPage"
);
bodyList
.
addAll
(
list
);
logger
.
info
(
"采集到第{} 页 , 一共采集到 {} 数据"
,
page
,
bodyList
.
size
());
page
++;
if
(
count
<
page
)
{
break
;
}
}
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
...
...
@@ -98,16 +101,17 @@ public class Xueqiu {
/**
*
* @Description
(TODO这里用一句话描述这个方法的作用)
* @Description
雪球历史文章采集
* @return
*/
public
List
<
Map
<
String
,
Object
>>
getXueqiuAccountData
(
String
userId
,
String
cookie
,
Proxy
proxy
)
{
public
static
List
<
Map
<
String
,
Object
>>
getXueqiuAccountData
(
String
userId
,
String
cookie
,
Proxy
proxy
)
{
Map
<
String
,
Object
>
headers
=
new
HashMap
<>();
headers
.
put
(
"cookie"
,
cookie
);
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<>();
int
page
=
1
;
int
errorCount
=
1
;
while
(
true
)
{
int
page
=
1
;
String
url
=
"https://xueqiu.com/v4/statuses/user_timeline.json?page="
+
page
+
"&user_id=6687544095&type=0"
;
String
url
=
"https://xueqiu.com/v4/statuses/user_timeline.json?page="
+
page
+
"&user_id="
+
userId
+
"&type=0"
;
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headers
),
proxy
)){
String
result
=
response
.
body
().
string
();
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
...
...
@@ -121,26 +125,30 @@ public class Xueqiu {
Date
date
=
TimeParse
.
stringFormartDate
(
timeBefore
);
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
map
.
put
(
"
nam
e"
,
ob
.
getJSONObject
(
"user"
).
getString
(
"screen_name"
));
//statuses user screen_name
map
.
put
(
"
sourc
e"
,
ob
.
getJSONObject
(
"user"
).
getString
(
"screen_name"
));
//statuses user screen_name
map
.
put
(
"time"
,
date
);
//statuses timeBefore
map
.
put
(
"source"
,
ob
.
getString
(
"source"
));
//statuses source
map
.
put
(
"content"
,
ob
.
getString
(
"description"
).
replaceAll
(
"<.*?>"
,
""
));
//statuses description
map
.
put
(
"title"
,
ob
.
getString
(
"rawTitle"
));
map
.
put
(
"repostCount"
,
ob
.
getString
(
"retweet_count"
));
//statuses retweet_count
map
.
put
(
"commentCount"
,
ob
.
getString
(
"reply_count"
));
//statuses reply_count
map
.
put
(
"likeCount"
,
ob
.
getString
(
"like_count"
));
//statuses like_count
map
.
put
(
"url"
,
"https://xueqiu.coms"
+
ob
.
getString
(
"target"
));
map
.
put
(
"url"
,
"https://xueqiu.com"
+
ob
.
getString
(
"target"
));
bodyList
.
add
(
map
);
}
int
maxPage
=
json
.
getInteger
(
"maxPage"
);
page
++;
logger
.
info
(
"userId = {} , crawler count = {} ,page = {} , maxPage = {}"
,
userId
,
bodyList
.
size
(),
page
,
maxPage
);
if
(
page
>
maxPage
)
{
break
;
}
}
catch
(
Exception
e
)
{
logger
.
error
(
"采集解析出错 {}"
,
e
);
break
;
errorCount
++;
if
(
errorCount
>
3
)
{
break
;
}
}
ZhiWeiTools
.
sleep
(
2000
);
}
return
bodyList
;
}
...
...
src/main/java/com/zhiwei/parse/Yangshi.java
0 → 100644
View file @
9234d24c
package
com
.
zhiwei
.
parse
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.HashMap
;
import
java.util.LinkedHashMap
;
import
java.util.List
;
import
java.util.Map
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
/**
 *
 * @ClassName Yangshi
 * @Description Crawler for CCTV (央视网)
 * @author byte-zbs
 * @Date 2019-07-04 18:08:12
 * @version 1.0.0
 */
public class Yangshi {

    private static final Logger logger = LoggerFactory.getLogger(Yangshi.class);
    private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();

    /**
     * Entry point for collecting data; currently a stub.
     *
     * @return always an immutable empty list
     */
    public static List<Map<String, Object>> getData() {
        return Collections.emptyList();
    }

    /**
     * Converts the {@code "list"} array of a JSON response into one map per entry.
     *
     * <p>A response without a {@code "list"} array yields an empty result. If an entry
     * cannot be read, the error is logged and the entries parsed so far are returned.
     * A {@code result} that is not valid JSON still makes {@code parseObject} throw to
     * the caller.
     *
     * @param result raw JSON response body
     * @return the parsed entries; never {@code null}
     */
    private static List<Map<String, Object>> analysisData(String result) {
        List<Map<String, Object>> bodyList = new ArrayList<>();
        JSONArray jsonArray = JSONObject.parseObject(result).getJSONArray("list");
        if (jsonArray == null) {
            return bodyList;
        }
        try {
            for (int i = 0; i < jsonArray.size(); i++) {
                JSONObject ob = jsonArray.getJSONObject(i);
                String allTitle = ob.getString("all_title");   // video title
                String urllink = ob.getString("urllink");      // link
                String channel = ob.getString("channel");      // source channel
                String uploadtime = ob.getString("uploadtime"); // upload time
                String durations = ob.getString("durations");  // duration
                Map<String, Object> map = new HashMap<>();
                map.put("视频标题", allTitle);
                map.put("链接", urllink);
                map.put("频道来源", channel);
                map.put("时间", uploadtime);
                map.put("时长", durations + " s");
                logger.debug("parsed cctv item {}", map);
                bodyList.add(map);
            }
        } catch (Exception e) {
            // Keep the entries parsed so far, but record why parsing stopped.
            logger.error("failed to parse cctv list item", e);
        }
        return bodyList;
    }
}
src/main/java/com/zhiwei/parse/Yiche.java
View file @
9234d24c
...
...
@@ -23,7 +23,7 @@ import okhttp3.Response;
public
class
Yiche
{
private
static
final
Logger
logger
=
LoggerFactory
.
getLogger
(
Yiche
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
/**
*
...
...
src/main/java/com/zhiwei/parse/Yidianzixun.java
View file @
9234d24c
...
...
@@ -33,7 +33,7 @@ public class Yidianzixun {
private
static
YidianzixunCommentAnalysis
yidianzixunCommentAnalysis
=
new
YidianzixunCommentAnalysis
();
private
static
YidianzixunByWordAnalysis
yidianzixunByWordAnalysis
=
new
YidianzixunByWordAnalysis
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
/**
*
...
...
@@ -42,19 +42,19 @@ public class Yidianzixun {
* @param startTime
* @return
*/
public
static
List
<
Map
<
String
,
Object
>>
getYidianzixunAccountData
(
String
channelid
,
String
startTime
,
Proxy
proxy
,
String
cookie
)
{
public
static
List
<
Map
<
String
,
Object
>>
getYidianzixunAccountData
(
String
channelid
,
String
startTime
,
Proxy
Holder
proxy
,
String
cookie
)
{
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getYidianzixunAccountHeaderMap
(
cookie
,
"http://www.yidianzixun.com/channel/"
+
channelid
);
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
int
j
=
0
;
boolean
f
=
true
;
try
{
while
(
f
)
{
String
url
=
"http://www.yidianzixun.com/"
+
getSpt
(
channelid
,
j
,
j
+
10
);
String
url
=
"http://www.yidianzixun.com"
+
getSpt
(
channelid
,
j
,
j
+
10
);
System
.
out
.
println
(
url
);
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
proxy
,
headerMap
);
System
.
out
.
println
(
result
);
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
JSONArray
jsonArry
=
json
.
getJSONArray
(
"result"
);
if
(
jsonArry
.
size
()
==
0
)
{
if
(
jsonArry
.
isEmpty
()
)
{
break
;
}
for
(
int
i
=
0
;
i
<
jsonArry
.
size
();
i
++)
{
...
...
@@ -70,13 +70,12 @@ public class Yidianzixun {
dataList
.
add
(
map
);
}
}
System
.
out
.
println
(
"================================"
+
dataList
.
size
());
ZhiWeiTools
.
sleep
(
30
00
);
logger
.
info
(
"channelid = {} , crawler size = {}"
,
channelid
,
dataList
.
size
());
ZhiWeiTools
.
sleep
(
1
00
);
j
=
dataList
.
size
();
}
}
catch
(
Exception
e
)
{
logger
.
error
(
"数据获取出错"
,
e
.
getMessage
());
e
.
printStackTrace
();
logger
.
error
(
"数据获取出错 {}"
,
e
);
}
return
dataList
;
}
...
...
src/main/java/com/zhiwei/parse/Youku.java
View file @
9234d24c
...
...
@@ -24,7 +24,7 @@ import okhttp3.Response;
public
class
Youku
{
private
static
final
Logger
logger
=
LoggerFactory
.
getLogger
(
Youku
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
(
false
,
2
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
(
);
public
static
List
<
Map
<
String
,
Object
>>
getDataList
(
String
word
)
{
String
aaid
=
"9cae49f0e031664b00d8f9c108e586ab"
;
...
...
@@ -33,7 +33,7 @@ public class Youku {
String
url
=
"https://so.youku.com/search_video/q_"
+
URLCodeUtil
.
getURLEncode
(
word
,
"UTF-8"
)+
"?spm=a2h0k.11417342.filter.dnew&orderfield=createtime&aaid="
+
aaid
+
"&pg="
+
i
;
System
.
out
.
println
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
Proxy
Factory
.
getNatProxy
()
)){
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
Proxy
Holder
.
NAT_HEAVY_PROXY
)){
String
result
=
response
.
body
().
string
();
String
jsondata
=
result
.
split
(
"bigview.view\\("
)[
1
].
split
(
"\\)\\</script\\>"
)[
0
];
JSONObject
json
=
JSONObject
.
parseObject
(
jsondata
);
...
...
@@ -45,7 +45,7 @@ public class Youku {
String
title
=
element
.
select
(
"div.mod-main > div.mod-header > h2 > a"
).
text
();
String
surl
=
element
.
select
(
"div.mod-main > div.mod-header > h2 > a"
).
attr
(
"href"
);
String
time
=
element
.
select
(
"div.mod-main > div.mod-info > p"
).
text
();
if
(
time
.
contains
(
"上传时间:"
))
{
if
(
time
.
contains
(
"上传时间:"
)
&&
surl
.
contains
(
"v.youku.com"
)
)
{
map
.
put
(
"title"
,
title
);
map
.
put
(
"url"
,
"https:"
+
surl
);
map
.
put
(
"time"
,
time
.
replaceAll
(
"上传时间:"
,
""
).
split
(
" "
)[
0
]);
...
...
src/main/java/com/zhiwei/parse/analysis/BaijiaAccountAnalysis.java
View file @
9234d24c
...
...
@@ -24,7 +24,7 @@ import okhttp3.Response;
public
class
BaijiaAccountAnalysis
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
BaijiaAccountAnalysis
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
public
Map
<
String
,
Object
>
getBaijiaAccount2Data
(
JSONObject
data
)
{
Map
<
String
,
Object
>
map
=
new
HashMap
<
String
,
Object
>();
...
...
@@ -57,12 +57,13 @@ public class BaijiaAccountAnalysis {
boolean
more
=
false
;
try
{
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
JSONArray
jsonArry
=
json
.
getJSONObject
(
"data"
).
getJSONArray
(
"
items
"
);
if
(
json
.
getJSONObject
(
"data"
)
!=
null
&&
json
.
getJSONObject
(
"data"
).
getBoolean
(
"has_more"
)
!=
null
)
{
if
(
json
.
getJSONObject
(
"data"
).
getBoolean
(
"has_more"
)
)
{
more
=
true
;
}
JSONArray
jsonArry
=
json
.
getJSONObject
(
"data"
).
getJSONArray
(
"
list
"
);
if
(
json
.
getJSONObject
(
"data"
)
.
getBoolean
(
"has_more"
)
!=
null
&&
json
.
getJSONObject
(
"data"
).
getBoolean
(
"has_more"
)
)
{
more
=
true
;
rmap
.
put
(
"ctime"
,
json
.
getJSONObject
(
"data"
).
getString
(
"ctime"
));
}
// String name = json.getJSONObject("data").getJSONObject("author").getString("display_name");
for
(
int
i
=
0
;
i
<
jsonArry
.
size
();
i
++)
{
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
JSONObject
data
=
jsonArry
.
getJSONObject
(
i
);
...
...
@@ -77,10 +78,7 @@ public class BaijiaAccountAnalysis {
}
}
map
.
put
(
"title"
,
data
.
getString
(
"title"
));
String
url
=
data
.
getString
(
"url"
);
if
(
url
==
null
)
{
url
=
"https://baijia.baidu.com/s?old_id="
+
id
;
}
String
url
=
"http://baijiahao.baidu.com/s?id="
+
id
;
map
.
put
(
"content"
,
ZhiWeiTools
.
delHTMLTag
(
getContent3
(
data
)));
map
.
put
(
"read_amount"
,
data
.
getString
(
"read_amount"
)==
null
?
0
:
data
.
getString
(
"read_amount"
));
map
.
put
(
"app_id"
,
data
.
getString
(
"app_id"
));
...
...
src/main/java/com/zhiwei/parse/analysis/BilibilikeyWordAnalysis.java
View file @
9234d24c
...
...
@@ -5,6 +5,7 @@ import java.util.Collections;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Objects
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
...
...
@@ -13,7 +14,7 @@ import org.jsoup.select.Elements;
public
class
BilibilikeyWordAnalysis
{
public
static
Map
<
String
,
Object
>
getData
(
String
result
,
String
word
)
{
public
static
Map
<
String
,
Object
>
getData
(
String
result
,
String
word
,
String
endTime
)
{
try
{
Document
doc
=
Jsoup
.
parse
(
result
);
boolean
more
=
false
;
...
...
@@ -28,10 +29,9 @@ public class BilibilikeyWordAnalysis {
String
source
=
null
;
String
submitcount
=
null
;
Elements
elements
=
doc
.
select
(
"ul.video-contain.clearfix"
).
select
(
"li"
);
System
.
out
.
println
(
elements
.
size
()
+
" --- "
+
more
);
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
for
(
Element
element
:
elements
)
{
Map
<
String
,
Object
>
map
=
new
HashMap
<
String
,
Object
>();
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
title
=
element
.
select
(
"a"
).
attr
(
"title"
);
url
=
element
.
select
(
"a"
).
attr
(
"href"
);
playcount
=
element
.
select
(
"div.tags"
).
select
(
"span.watch-num"
).
text
();
...
...
@@ -45,6 +45,9 @@ public class BilibilikeyWordAnalysis {
map
.
put
(
"source"
,
source
);
map
.
put
(
"submitcount"
,
submitcount
);
map
.
put
(
"word"
,
word
);
if
(
Objects
.
nonNull
(
endTime
)
&&
endTime
.
compareTo
(
time
)
>
-
1
)
{
more
=
false
;
}
dataList
.
add
(
map
);
}
Map
<
String
,
Object
>
rmap
=
new
HashMap
<>();
...
...
src/main/java/com/zhiwei/parse/analysis/DayuAccountAnalysis.java
View file @
9234d24c
...
...
@@ -49,7 +49,7 @@ public class DayuAccountAnalysis {
* @return
*/
private
Map
<
String
,
Object
>
getOneData
(
JSONObject
data
,
String
name
,
String
startTime
)
{
Map
<
String
,
Object
>
map
=
new
HashMap
<
String
,
Object
>();
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
try
{
String
time
=
data
.
getString
(
"published_at"
).
replace
(
"T"
,
" "
).
split
(
"\\."
)[
0
];
if
(
startTime
!=
null
&&
startTime
.
length
()
>
1
)
{
...
...
src/main/java/com/zhiwei/parse/analysis/DayuByWordAnalysis.java
View file @
9234d24c
...
...
@@ -24,7 +24,7 @@ import okhttp3.Response;
public
class
DayuByWordAnalysis
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
DayuByWordAnalysis
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
public
List
<
Map
<
String
,
Object
>>
getDayuByWordData
(
String
result
,
Proxy
proxy
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
...
...
src/main/java/com/zhiwei/parse/analysis/FenghuangAccountAnalysis.java
View file @
9234d24c
package
com
.
zhiwei
.
parse
.
analysis
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.List
;
...
...
@@ -12,16 +11,15 @@ import org.slf4j.LoggerFactory;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Response
;
public
class
FenghuangAccountAnalysis
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
FenghuangAccountAnalysis
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
/**
*
...
...
@@ -29,70 +27,61 @@ public class FenghuangAccountAnalysis {
* @param result
* @return
*/
public
List
<
Map
<
String
,
Object
>>
getArticleData
(
String
url
,
String
startTime
,
Proxy
proxy
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
public
List
<
Map
<
String
,
Object
>>
getArticleData
(
String
url
,
String
startTime
,
ProxyHolder
proxy
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
try
{
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getFenghuangAccountHeaderMap
(
null
);
JSONArray
jsonArry
=
null
;
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
)){
String
result
=
response
.
body
().
string
();
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
jsonArry
=
json
.
getJSONObject
(
"data"
).
getJSONObject
(
"feeds"
).
getJSONArray
(
"list"
);
if
(
jsonArry
==
null
||
jsonArry
.
size
()
<
1
)
{
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getFenghuangAccountHeaderMap
(
null
);
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
)){
String
result
=
response
.
body
().
string
();
System
.
out
.
println
(
result
);
JSONObject
json
=
JSONObject
.
parseObject
(
result
.
replace
(
"getFengAuthorListData("
,
""
).
replace
(
"]})"
,
"]}"
));
JSONArray
jsonArry
=
json
.
getJSONArray
(
"data"
);
for
(
int
j
=
0
;
j
<
jsonArry
.
size
();
j
++)
{
try
{
JSONObject
data
=
jsonArry
.
getJSONObject
(
j
);
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
map
.
put
(
"title"
,
data
.
getString
(
"title"
));
String
time
=
data
.
getString
(
"newsTime"
);
map
.
put
(
"time"
,
data
.
getString
(
"newsTime"
));
map
.
put
(
"url"
,
"https:"
+
data
.
getString
(
"url"
));
map
.
put
(
"id"
,
data
.
getString
(
"commentUrl"
));
if
(
time
.
compareTo
(
startTime
)
>=
0
)
{
dataList
.
add
(
map
);
}
}
catch
(
Exception
e
)
{
logger
.
error
(
" exception {}"
,
e
);
}
}
break
;
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
continue
;
}
}
catch
(
Exception
e
)
{
continue
;
}
}
if
(
jsonArry
==
null
||
jsonArry
.
size
()
<
1
)
{
return
dataList
;
}
for
(
int
i
=
0
;
i
<
jsonArry
.
size
();
i
++)
{
try
{
JSONObject
data
=
jsonArry
.
getJSONObject
(
i
);
String
articleurl
=
data
.
getString
(
"id"
);
String
articleResult
=
HttpClient
.
executeHttpRequestGet
(
articleurl
,
proxy
,
headerMap
);
Map
<
String
,
Object
>
dataMap
=
getArticle
(
articleResult
);
ZhiWeiTools
.
sleep
(
1000
);
if
(
dataMap
!=
null
)
{
String
time
=
(
String
)
dataMap
.
get
(
"time"
);
if
(
time
.
compareTo
(
startTime
)
>=
0
)
{
dataList
.
add
(
dataMap
);
continue
;
}
}
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
continue
;
}
}
return
dataList
;
}
catch
(
Exception
e1
)
{
e1
.
printStackTrace
();
return
dataList
;
}
}
private
static
Map
<
String
,
Object
>
getArticle
(
String
articleResult
)
{
JSONObject
json
=
JSONObject
.
parseObject
(
articleResult
).
getJSONObject
(
"body"
);
Map
<
String
,
Object
>
map
=
new
HashMap
<
String
,
Object
>();
try
{
map
.
put
(
"title"
,
json
.
getString
(
"title"
));
String
time
=
json
.
getString
(
"cTime"
).
replaceAll
(
"/"
,
"-"
);
map
.
put
(
"time"
,
time
);
map
.
put
(
"text"
,
json
.
getString
(
"text"
).
replaceAll
(
"<.*?>"
,
""
));
map
.
put
(
"source"
,
json
.
getString
(
"source"
));
map
.
put
(
"url"
,
json
.
getString
(
"shareurl
"
));
map
.
put
(
"id"
,
json
.
getString
(
"aid"
));
}
catch
(
Exception
e
)
{
logger
.
error
(
"解析具体文章的时候出错 {}"
,
e
);
return
null
;
}
return
map
;
}
//
private static Map<String,Object> getArticle(String articleResult) {
// try {
// Map<String,Object> map = new HashMap<
>();
// JSONObject json = JSONObject.parseObject(articleResult).getJSONObject("body");
//
map.put("title", json.getString("title"));
//
String time = json.getString("cTime").replaceAll("/", "-");
//
map.put("time", time);
//
map.put("text", json.getString("text").replaceAll("<.*?>", ""));
//
map.put("source", json.getString("source"));
// map.put("url", "https://share.iclient.ifeng.com/news/shareNews?aid=sub_" + json.getString("aid
"));
//
map.put("id", json.getString("aid"));
// return map;
// } catch (Exception e) {
// logger.error("解析具体文章的时候出错 {}",e)
;
// return null;
// }
//
}
...
...
src/main/java/com/zhiwei/parse/analysis/FenghuangCommentAnalysis.java
View file @
9234d24c
...
...
@@ -23,7 +23,7 @@ import okhttp3.Response;
public
class
FenghuangCommentAnalysis
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
FenghuangCommentAnalysis
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
public
Map
<
String
,
Object
>
getFenghuangCommentCount
(
String
url
,
ProxyHolder
proxy
)
{
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
...
...
src/main/java/com/zhiwei/parse/analysis/SouhuAccountAnalysis.java
View file @
9234d24c
package
com
.
zhiwei
.
parse
.
analysis
;
import
java.net.URLDecoder
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.HashMap
;
...
...
@@ -26,7 +25,7 @@ public class SouhuAccountAnalysis {
* @return
*/
public
List
<
Map
<
String
,
Object
>>
analysisData
(
JSONArray
jsonArray
,
String
name
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>
>();
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
for
(
int
i
=
0
;
i
<
jsonArray
.
size
();
i
++)
{
JSONObject
data
=
jsonArray
.
getJSONObject
(
i
);
Map
<
String
,
Object
>
map
=
parseHtmlByAccount
(
data
,
name
);
...
...
@@ -46,19 +45,15 @@ public class SouhuAccountAnalysis {
* @return
*/
private
static
Map
<
String
,
Object
>
parseHtmlByAccount
(
JSONObject
data
,
String
name
)
{
Map
<
String
,
Object
>
map
=
new
HashMap
<
String
,
Object
>();
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
try
{
String
title
=
data
.
getString
(
"title"
);
map
.
put
(
"title"
,
URLDecoder
.
decode
(
title
,
"UTF-8"
));
map
.
put
(
"title"
,
data
.
getString
(
"title"
));
map
.
put
(
"source"
,
name
);
String
content
=
data
.
getString
(
"brief"
);
map
.
put
(
"content"
,
URLDecoder
.
decode
(
content
,
"UTF-8"
));
map
.
put
(
"content"
,
data
.
getString
(
"brief"
));
map
.
put
(
"newsPv"
,
data
.
getString
(
"newsPv"
));
map
.
put
(
"url"
,
data
.
getString
(
"url"
));
long
timelong
=
Long
.
valueOf
(
data
.
getString
(
"postTime"
));
map
.
put
(
"time"
,
new
Date
(
timelong
));
map
.
put
(
"comment"
,
data
.
getString
(
"commentsCnt"
));
JSONArray
jsonArry
=
data
.
getJSONArray
(
"tags"
);
map
.
put
(
"url"
,
data
.
getString
(
"link"
));
map
.
put
(
"time"
,
new
Date
(
data
.
getLong
(
"publicTime"
)));
JSONArray
jsonArry
=
data
.
getJSONArray
(
"tagDetails"
);
String
tags
=
""
;
for
(
int
i
=
0
;
i
<
jsonArry
.
size
();
i
++)
{
JSONObject
ob
=
jsonArry
.
getJSONObject
(
i
);
...
...
@@ -68,10 +63,9 @@ public class SouhuAccountAnalysis {
tags
=
tags
.
substring
(
0
,
tags
.
length
()-
1
);
}
map
.
put
(
"tags"
,
tags
);
map
.
put
(
"newsid"
,
data
.
getString
(
"
news
id"
));
map
.
put
(
"newsid"
,
data
.
getString
(
"id"
));
}
catch
(
Exception
e
)
{
logger
.
error
(
"搜狐历史文章解析出错了"
,
e
.
getMessage
());
System
.
out
.
println
(
data
.
toString
());
logger
.
error
(
"搜狐历史文章解析出错了 {}"
,
e
.
getMessage
());
return
null
;
}
...
...
src/main/java/com/zhiwei/parse/analysis/SouhuCommentAnalysis.java
View file @
9234d24c
...
...
@@ -20,7 +20,7 @@ import okhttp3.Response;
public
class
SouhuCommentAnalysis
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
SouhuCommentAnalysis
.
class
);
private
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
/**
*
...
...
src/main/java/com/zhiwei/parse/shipin/QQTV.java
View file @
9234d24c
package
com
.
zhiwei
.
parse
.
shipin
;
import
java.net.Proxy
;
import
java.net.URLEncoder
;
import
java.util.ArrayList
;
import
java.util.Collections
;
...
...
@@ -18,7 +17,6 @@ import org.slf4j.LoggerFactory;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
...
...
@@ -36,7 +34,7 @@ import okhttp3.Response;
public
class
QQTV
{
private
static
final
Logger
logger
=
LoggerFactory
.
getLogger
(
QQTV
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
public
static
List
<
Map
<
String
,
Object
>>
getData
(
String
word
,
String
time
,
ProxyHolder
proxy
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
...
...
@@ -52,8 +50,8 @@ public class QQTV {
logger
.
info
(
" 关键词 {} 量 {} 页 数 {} 此页量 {} "
,
word
,
dataList
.
size
(),
page
,
elements
.
size
());
for
(
Element
element
:
elements
)
{
String
nurl
=
element
.
select
(
"h2.result_title"
).
select
(
"a"
).
attr
(
"href"
);
Map
<
String
,
Object
>
map
=
getUrlData
(
nurl
,
Proxy
Factory
.
getNatProxy
()
);
if
(
Objects
.
nonNull
(
map
)
&&
time
.
compareTo
(
String
.
valueOf
(
map
.
get
(
"time"
)))
<
1
)
{
Map
<
String
,
Object
>
map
=
getUrlData
(
nurl
,
Proxy
Holder
.
NAT_HEAVY_PROXY
);
if
(
Objects
.
nonNull
(
map
)
&&
!
map
.
isEmpty
()
&&
time
.
compareTo
(
String
.
valueOf
(
map
.
get
(
"time"
)))
<
1
)
{
map
.
put
(
"word"
,
word
);
dataList
.
add
(
map
);
}
...
...
@@ -61,6 +59,9 @@ public class QQTV {
}
page
++;
if
(
count
!=
dataList
.
size
())
{
if
(
page
>
20
)
{
break
;
}
continue
;
}
...
...
@@ -76,24 +77,26 @@ public class QQTV {
return
dataList
;
}
private
static
Map
<
String
,
Object
>
getUrlData
(
String
url
,
Proxy
proxy
)
{
for
(
int
i
=
1
;
i
<
3
;
i
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
proxy
)){
String
result
=
response
.
body
().
string
();
String
source
=
result
.
split
(
"\\<span class=\"user_name\"\\>"
)[
1
].
split
(
"\\</span\\>"
)[
0
];
result
=
result
.
split
(
"var VIDEO_INFO ="
)[
1
].
split
(
"\\</script\\>"
)[
0
];
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
map
.
put
(
"playCount"
,
json
.
getInteger
(
"view_all_count"
));
map
.
put
(
"title"
,
json
.
getString
(
"title"
));
map
.
put
(
"time"
,
json
.
getString
(
"video_checkup_time"
));
map
.
put
(
"source"
,
source
);
map
.
put
(
"url"
,
url
);
return
map
;
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
}
/**
 * Downloads one video page and extracts a few metadata fields from its HTML.
 *
 * @param url   page address; anything that does not contain "v.qq.com" is rejected
 * @param proxy proxy handed to {@code httpBoot.syncCall}
 * @return {@code null} when the URL does not contain "v.qq.com"; a map with the keys
 *         "playCount", "title", "time", "source" and "url" on success; an empty map when
 *         the request or the parsing throws
 */
private static Map<String, Object> getUrlData(String url, ProxyHolder proxy) {
    boolean isTencentVideo = url.contains("v.qq.com");
    if (!isTencentVideo) {
        return null;
    }
    System.out.println(url);
    try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)) {
        String html = response.body().string();

        // The uploader name is the text between the user_name span and its closing tag.
        String[] afterUserName = html.split("\\<span class=\"user_name\"\\>");
        String uploader = afterUserName[1].split("\\</span\\>")[0];

        // The page embeds its metadata as a JSON literal assigned to VIDEO_INFO.
        String[] afterVideoInfo = html.split("var VIDEO_INFO =");
        String videoInfoJson = afterVideoInfo[1].split("\\</script\\>")[0];
        JSONObject videoInfo = JSONObject.parseObject(videoInfoJson);

        Map<String, Object> fields = new HashMap<>();
        fields.put("playCount", videoInfo.getInteger("view_all_count"));
        fields.put("title", videoInfo.getString("title"));
        fields.put("time", videoInfo.getString("video_checkup_time"));
        fields.put("source", uploader);
        fields.put("url", url);
        return fields;
    } catch (Exception e) {
        // Any network or parsing failure is reported and mapped to an empty result.
        e.printStackTrace();
        return Collections.emptyMap();
    }
}
...
...
src/main/java/com/zhiwei/parse/shipin/SohuTV.java
View file @
9234d24c
...
...
@@ -24,7 +24,7 @@ import okhttp3.Response;
public
class
SohuTV
{
private
static
final
Logger
logger
=
LoggerFactory
.
getLogger
(
SohuTV
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
public
static
List
<
Map
<
String
,
Object
>>
sohuTVData
(
String
word
,
String
cookie
,
Proxy
proxy
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
...
...
src/test/java/com/zhiwei/Comment/AikaComment.java
View file @
9234d24c
//package com.zhiwei.Comment;
//
//import org.
testng.annotations
.Test;
//import org.
junit
.Test;
//
//import com.zhiwei.parse.Aika;
//import com.zhiwei.tools.timeparse.TimeExtraction;
//import com.zhiwei.tools.timeparse.TimeParse;
//
//public class AikaComment {
// @Test
// public void f() {
// String url = "http://
newcar.xcar.com.cn/201809/news_2021765
_1.html";
// String url = "http://
info.xcar.com.cn/201906/news_2039730
_1.html";
//
// Aika.getAikaComment(url, null);
//
...
...
src/test/java/com/zhiwei/Comment/AiqiyiHotCountTest.java
View file @
9234d24c
package
com
.
zhiwei
.
Comment
;
import
java.util.List
;
import
java.util.Map
;
import
org.testng.annotations
.Test
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Aiqiyi
;
public class AiqiyiHotCountTest {

    /**
     * Loads a workbook, asks {@code Aiqiyi.aiqiyiHotCount} for the hot count of the link stored
     * under the "链接" key of every row, and exports the rows again with an added "count" column.
     */
    @Test
    public void f() {
        String path = "C:\\Users\\byte-zbs\\Documents\\WXWork\\1688854025129101\\Cache\\File\\2019-03\\爱奇艺.xlsx";
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        Map<String, Object> sheet = poi.importExcel(path, 0);
        List<Map<String, Object>> rows = (List<Map<String, Object>>) sheet.get("body");
        ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
        List<String> headers = (List<String>) sheet.get("head");
        headers.add("count");
        for (Map<String, Object> row : rows) {
            String link = String.valueOf(row.get("链接"));
            int hotCount = Aiqiyi.aiqiyiHotCount(link, ProxyHolder.NAT_PROXY);
            System.out.println(link + " -- " + hotCount);
            row.put("count", hotCount);
        }
        poi.exportExcel(path, "data", headers, rows);
    }
}
//
package com.zhiwei.Comment;
//
//
import java.util.List;
//
import java.util.Map;
//
//import org.junit
.Test;
//
//
import com.zhiwei.common.config.GroupType;
//
import com.zhiwei.crawler.proxy.ProxyFactory;
//
import com.zhiwei.crawler.proxy.ProxyHolder;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.Aiqiyi;
//
//
public class AiqiyiHotCountTest {
//
@Test
//
public void f() {
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
String path = "C:\\Users\\byte-zbs\\Documents\\WXWork\\1688854025129101\\Cache\\File\\2019-03\\爱奇艺.xlsx";
//
Map<String,Object> map = poi.importExcel(path, 0);
//
List<Map<String,Object>> dataList = (List<Map<String, Object>>) map.get("body");
//
ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
//
List<String> headList = (List<String>) map.get("head");
//
headList.add("count");
//
dataList.forEach(m -> {
//
String url = String.valueOf(m.get("链接"));
//
//
int i = Aiqiyi.aiqiyiHotCount(url, ProxyHolder.NAT_PROXY);
//
System.out.println(url + " -- " + i);
//
m.put("count", i);
//
});
//
poi.exportExcel(path, "data", headList, dataList);
//
}
//
}
src/test/java/com/zhiwei/Comment/MaimaiCommentCountTest.java
View file @
9234d24c
...
...
@@ -4,7 +4,7 @@
//import java.util.List;
//import java.util.Map;
//
//import org.
testng.annotations
.Test;
//import org.
junit
.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
...
...
@@ -18,27 +18,28 @@
// public void f() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",
// GroupType.PROVIDER);
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
// Map<String, Object> map = poi
// .importExcel("C:\\Users\\byte-zbs\\Desktop\\tx\\脉脉公司圈.xlsx", 0);
// List<Map<String, Object>> list = (List<Map<String, Object>>) map.get("body");
// String cookie = "_buuid=ba30f54f-57ed-4dd4-af5f-31cb08d2eacf; sessionid=lejfy3gdu5tf9x9zowxfhtq5o73dubc5; guid=GxsfBBgZGwQYGx4EGBkeVgcYGxkdHhMeHhkcVhwZBB0ZHwVDWEtMS3kKEhMEEh0fGQQaBBsdBU9HRVhCaQoDRUFJT20KT0FDRgoGZmd+YmECChwZBB0ZHwVeQ2FIT31PRlpaawoDHhx9ZX0KERkEHAp+ZApZXUVOREN9AgoaBB8FS0ZGQ1BFZw==; seid=s1550814253444; token=\"rhItcea5qkO6WCSnVcczW/NRVLLCTsq3kQbpUCGAwQ0ceLunVJRjT5rgoFVYrIBA8CKuzcDfAvoCmBm7+jVysA==\"; uid=\"aa0Zx+VbwC41ceG8bxvIefAirs3A3wL6ApgZu/o1crA=\"; session=eyJ1IjoiMTczMzMzNTM2Iiwic2VjcmV0IjoiVjJuNHdCVDBncVNacTRxVllGM29jRUVwIiwic3RhdHVzIjp0cnVlLCJfZXhwaXJlIjoxNTUwOTAyMTY3MDQ5LCJfbWF4QWdlIjo4NjQwMDAwMH0=; session.sig=zbs4cHtzTcHWvjtkpjAZmoqLXsQ";
// List<Map<String, Object>> bodyList = new ArrayList<Map<String, Object>>();
// List<String> headList = (List<String>) map.get("head");
// for (Map<String, Object> map1 : list) {
// String url = map1.get("地址") + "";
// Map<String,Object> map3 = Maimai.getMaiaiCount(url,null, ProxyHolder.NAT_PROXY);
//// Map<String, Object> map = poi
//// .importExcel("C:\\Users\\byte-zbs\\Desktop\\tx\\脉脉公司圈.xlsx", 0);
//// List<Map<String, Object>> list = (List<Map<String, Object>>) map.get("body");
//// List<Map<String, Object>> bodyList = new ArrayList<Map<String, Object>>();
//// List<String> headList = (List<String>) map.get("head");
//// for (Map<String, Object> map1 : list) {
//// String url = map1.get("地址") + "";
// String cookie = "_buuid=0668b664-13b3-4bd0-aa37-99d747432e85; guid=HBoEGxgEGBscBBsZGlYHGBsZHRsYExwZHFYcGQQdGR8FQ1hLTEt5ChITBBIdHxkEGgQbHQVPR0VYQmkKA0VBSU9tCk9BQ0YKBmZnfmJhAgocGQQdGR8FXkNhSE99T0ZaWmsKAx4cfWV9ChEZBBwKfmQKWV1FTkRDfQIKGgQfBUtGRkNQRWc=; token=\"ou+mv1hjxjm0uOOTss1vgck9+h6OCS/lYQUeFnJgSK70FHprmw6GmjBGwk2qPQH88CKuzcDfAvoCmBm7+jVysA==\"; uid=\"A8ELjewCDRgHnZ5bX0Vy0/Airs3A3wL6ApgZu/o1crA=\"; session=eyJ1IjoiMjI3NjU0NTI0Iiwic2VjcmV0IjoiV0wyZmEtZDZxbkx2TEkzZHF2dTN4UG5SIiwiX2V4cGlyZSI6MTU2MDU5Mzg4Mjc5NCwiX21heEFnZSI6ODY0MDAwMDB9; session.sig=ujhqvC3wPAn-WlCPXfB6C5ZJIgY";
// String url = "https://maimai.cn/web/gossip_detail?src=app&webid=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJlZ2lkIjoiMjU5OTg4YmE4YzI3MTFlOTllMjEyNDZlOTZiNDgwODgiLCJ1IjoxNzY2NDQ3NjUsImlkIjoyMjIwMDM4NH0.gBdp3i99L0xUKLglaVIJf07OrrPk6yoG9HAo-3Yftqk";
// Map<String,Object> map3 = Maimai.getMaiaiCount(url,cookie, ProxyHolder.NAT_HEAVY_PROXY);
// System.out.println(map3.toString());
// System.out.println(url);
// map1.putAll(map3);
// ZhiWeiTools.sleep(500);
// System.out.println("--------------------------");
// }
// headList.add("like");
// headList.add("spreads");
// headList.add("cmts");
// poi.exportExcel("C:\\Users\\byte-zbs\\Desktop\\tx\\脉脉公司圈.xlsx", "评论采集", headList,
// list);
//
//
map1.putAll(map3);
//
//
ZhiWeiTools.sleep(500);
//
//
System.out.println("--------------------------");
//
//
}
//
//
headList.add("like");
//
//
headList.add("spreads");
//
//
headList.add("cmts");
//
//
poi.exportExcel("C:\\Users\\byte-zbs\\Desktop\\tx\\脉脉公司圈.xlsx", "评论采集", headList,
//
//
list);
// }
//}
src/test/java/com/zhiwei/Comment/YoukuHotCountTest.java
View file @
9234d24c
package
com
.
zhiwei
.
Comment
;
import
java.util.List
;
import
java.util.Map
;
import
org.testng.annotations.Test
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Aiqiyi
;
import
com.zhiwei.parse.Youku
;
public class YoukuHotCountTest {

    /**
     * Loads a workbook, asks {@code Youku.getYoukuHotCount} for the hot count of the link stored
     * under the "链接" key of every row, and exports the rows again with an added "count" column.
     */
    @Test
    public void f() {
        String path = "C:\\Users\\byte-zbs\\Documents\\WXWork\\1688854025129101\\Cache\\File\\2019-03\\优酷.xlsx";
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        Map<String, Object> sheet = poi.importExcel(path, 0);
        List<Map<String, Object>> rows = (List<Map<String, Object>>) sheet.get("body");
        ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
        List<String> headers = (List<String>) sheet.get("head");
        headers.add("count");
        for (Map<String, Object> row : rows) {
            String link = String.valueOf(row.get("链接"));
            int hotCount = Youku.getYoukuHotCount(link, ProxyHolder.NAT_PROXY);
            System.out.println(link + " -- " + hotCount);
            row.put("count", hotCount);
        }
        poi.exportExcel(path, "data", headers, rows);
    }
}
//package com.zhiwei.Comment;
//
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.crawler.proxy.ProxyHolder;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Youku;
//
//public class YoukuHotCountTest {
// @Test
// public void f() {
//
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// String path = "C:\\Users\\byte-zbs\\Documents\\WXWork\\1688854025129101\\Cache\\File\\2019-03\\视频奶粉.xlsx";
// Map<String,Object> map = poi.importExcel(path, 0);
// List<Map<String,Object>> dataList = (List<Map<String, Object>>) map.get("body");
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
// List<String> headList = (List<String>) map.get("head");
// headList.add("count");
// dataList.forEach(m -> {
// String url = String.valueOf(m.get("url"));
//
// int i = Youku.getYoukuHotCount(url, ProxyHolder.NAT_PROXY);
// System.out.println(url + " -- " + i);
// m.put("count", i);
// });
// poi.exportExcel(path, "data", headList, dataList);
//
//
// }
//}
src/test/java/com/zhiwei/TestHttpBoot.java
View file @
9234d24c
//package com.zhiwei;
//
//import java.io.IOException;
//import java.util.HashMap;
//import java.util.Map;
//
//import java.util.HashMap;
//
//import org.testng.annotations.Test;
//
//import com.zhiwei.crawler.core.HttpBoot;
//import com.zhiwei.crawler.core.RequestUtils;
//
//public class TestHttpBoot {
// @Test
// public void f() {
// HttpBoot httpBoot = new HttpBoot();
// String url = "https://www.toutiao.com/c/user/following/?user_id=1034006366&count=20&_signature=wp5wPBAVmXlosTC8Fobui8KecC";
// Map<String,Object> headers = new HashMap<>();
// headers.put("referer", "https://www.qctt.cn/news/349056");
// headers.put("cookie", "PHPSESSID=3rd6bvonb4g15t1fp777mjums0; Hm_lvt_70af9ea91e7adc8195f6d49511b9a2f1=1542253722; open_ad=1; Hm_lpvt_70af9ea91e7adc8195f6d49511b9a2f1=1542271394; vcode=sqmm; XSRF-TOKEN=eyJpdiI6IlFTNzkyYWNcLzB2SUwyN2dcL1hhUlpsZz09IiwidmFsdWUiOiJRSUpycjZJNGx3d1hUWkpOQUl1R2psSStuVU0yYW8xT1YxXC9QOFY1NjdyRXNrMWpFVE1kSm9IQ1o5Nm5keXlMTEFnZXdCOHVvWDg0U2picTE1cjZzMkE9PSIsIm1hYyI6IjZlYzk5NDI3ODEzMzA3ZTJjNDc3M2ZjMjBlNDJhZjc2YjU2ODFmYmY3YWRlMzdlMzM1NTBlNWMxNDk3MjFiZDEifQ%3D%3D; laravel_session=eyJpdiI6InJQMnByeFlIbXVhaUVVVVBLK1wvaXlRPT0iLCJ2YWx1ZSI6IlhUOUtIS2ZQZ0ZKNFh1RDVQYjBjSVZkVkpQZTdYRDNpa1wvV0o5QlJPbk8xZE0rQ3dZdnFMdjcya011ejVkdWEwUk1Qa29Zb2Y3OU0yUGkrWDF4Wk5adz09IiwibWFjIjoiZGJiYjlkNWZhNmJhMDFiMjkxYTAyMmUwZTEyMWVmZTQ0NmJiZDQ2ZGU3ZjNjNmUzNTIwZGI0NTc4NDJlZjNiMCJ9");
// headers.put("origin", "https://www.qctt.cn");
// Map<String,Object> params = new HashMap<>();
// params.put("id", "349056");
// params.put("page", "3");
// params.put("_token", "EJ58V0qilRw7P77czp0U6iO9QW2IOS1ZGiBk4wH1");
// try {
// String result = httpBoot.syncCall(RequestUtils.wrapGet(url)).body().string();
// System.out.println(result);
//
// } catch (IOException e) {
// // TODO Auto-generated catch block
// e.printStackTrace();
// }
//
//
// }
//}
package
com
.
zhiwei
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
okhttp3.Response
;
public class TestHttpBoot {

    /**
     * Manual smoke check: issues one GET through an {@code HttpBoot} built with two retries and
     * SSL redirects disabled, then prints the response body.
     */
    public static void main(String[] args) {
        String url = "http://v.youku.com/v_show/id_XMzg1ODAwOTcwOA==.html";
        HttpBoot.Builder builder = new HttpBoot.Builder().retryTimes(2).followSslRedirects(false);
        HttpBoot httpBoot = builder.build();
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url))) {
            String body = response.body().string();
            System.out.println(body);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
src/test/java/com/zhiwei/crawler/DayuAccountExample.java
View file @
9234d24c
//package com.zhiwei.crawler;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Dayu;
//
//public class DayuAccountExample {
//
//
// @Test
// public void dayuAccountTest() {
// //https://api.m.sm.cn/rest?method=Subscribe.list&format=html&from=wh10331&uc_biz_str=S:custom%7CC:search%7CN:true
//
//
//// String mid = "d7300311c1504d24a229c3da345785c6";
//// String name = "大鱼海棠雨";
// String startTime = "2017-01-01 00:00:00";
// String path = "D:\\crawlerdata\\自媒体\\大鱼历史文章.xlsx";
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// Map<String,Object> map = poi.importExcel(path, 0);
// List<Map<String,Object>> lists = (List<Map<String, Object>>) map.get("body");
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("time");
// headList.add("content");
// headList.add("source");
// headList.add("url");
//// headList.add("content_id");
//// headList.add("origin_id");
//// headList.add("xss_item_id");
// for(Map<String,Object> data : lists) {
// String mid = data.get("mid")+"";
// String name = data.get("name")+"";
// if(mid.length() < 1 && name.length() < 1) {
// continue;
// }
// List<Map<String,Object>> dataList = Dayu.getDayuAccountData(mid,name,null,null);
// poi.exportExcel(path, name, headList, dataList);
// }
//
//
// }
//
//
//}
package
com
.
zhiwei
.
crawler
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Dayu
;
public class DayuAccountExample {

    /**
     * Reads account rows from a workbook and, for each row, exports the result of
     * {@code Dayu.getDayuAccountData} to a sheet named after the account.
     */
    @Test
    public void dayuAccountTest() {
        //https://api.m.sm.cn/rest?method=Subscribe.list&format=html&from=wh10331&uc_biz_str=S:custom%7CC:search%7CN:true
        ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
        // Declared but not passed to any call below.
        String startTime = "2017-01-01 00:00:00";
        String path = "D:\\crawlerdata\\自媒体\\大鱼历史文章.xlsx";
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        Map<String, Object> sheet = poi.importExcel(path, 0);
        List<Map<String, Object>> rows = (List<Map<String, Object>>) sheet.get("body");

        List<String> headers = new ArrayList<String>();
        for (String column : new String[] {"title", "time", "content", "source", "url"}) {
            headers.add(column);
        }

        for (Map<String, Object> row : rows) {
            String mid = String.valueOf(row.get("mid"));
            String name = String.valueOf(row.get("name"));
            // The row values are immediately replaced by one fixed account.
            mid = "7b345070c4124574b9cbcab8c4a1aeb8";
            name = "国魂";
            boolean bothEmpty = mid.isEmpty() && name.isEmpty();
            if (!bothEmpty) {
                List<Map<String, Object>> articles = Dayu.getDayuAccountData(mid, name, null, null);
                poi.exportExcel(path, name, headers, articles);
            }
        }
    }
}
src/test/java/com/zhiwei/crawler/DayuCommentCountExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
import
org.junit.Test
;
import
com.zhiwei.parse.Dayu
;
public class DayuCommentCountExample {

    /** Prints the value returned by {@code Dayu.getDayuCommentCount} for one fixed article id. */
    @Test
    public void dayuCommentCountTest() {
        final String articleId = "6987993456991247474";
        int commentCount = Dayu.getDayuCommentCount(articleId, null);
        System.out.println(commentCount);
    }
}
//
package com.zhiwei.crawler;
//
//
import org.junit.Test;
//
//
import com.zhiwei.parse.Dayu;
//
//
public class DayuCommentCountExample {
//
//
@Test
//
public void dayuCommentCountTest() {
//
String articleId = "6987993456991247474";
//
//
int i = Dayu.getDayuCommentCount(articleId,null);
//
System.out.println(i);
//
}
//
//
//
//
}
src/test/java/com/zhiwei/crawler/DayuCommentExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Dayu
;
public class DayuCommentExample {

    /**
     * Reads rows from a workbook, fetches comments for each row through
     * {@code Dayu.getDayuCommentData}, prints the ids that produced no comments and exports
     * all collected comments to a second workbook.
     */
    @Test
    public void getDayuCommentTest() {
        // If the historical articles were already fetched, the article id is available as a
        // field there; for other articles it is the numeric part of a URL such as
        // http://m.uczzd.cn/iflow/api/v2/cmt/article/14180961224021425316/comments/byhot
        // (14180961224021425316 is the value of this parameter).
        PoiExcelUtil poi = PoiExcelUtil.getInstance();

        Map<String, Object> map = poi.importExcel("D://crawlerdata//自媒体//UC评论采集-1.xlsx", 0);
        List<Map<String, Object>> list = (List<Map<String, Object>>) map.get("body");
        List<Map<String, Object>> bodyList = new ArrayList<Map<String, Object>>();
        List<String> urlList = new ArrayList<String>();
        for (Map<String, Object> map1 : list) {
            String url = "";
            try {
                url = map1.get("url") + "";
                String articleId = "";
                // NOTE(review): this fixed id overrides the value read from the sheet;
                // it looks like a debugging leftover - confirm before relying on it.
                url = "16848608935470442496";
                if (url.contains("aid")) {
                    articleId = url.split("aid=")[1].split("&")[0];
                } else {
                    articleId = url;
                }
                List<Map<String, Object>> dataList = Dayu.getDayuCommentData(articleId, null);
                // Check for null BEFORE touching the list: the previous order called size()
                // first, so a null result threw NullPointerException and the id was never
                // recorded as empty.
                if (dataList == null || dataList.isEmpty()) {
                    urlList.add(url);
                } else {
                    bodyList.addAll(dataList);
                }
            } catch (Exception e) {
                System.out.println(url);
                e.printStackTrace();
            }
        }
        List<String> headList = new ArrayList<String>();
        headList.add("nickname");
        headList.add("content");
        headList.add("id");
        headList.add("url");
        headList.add("like");
        headList.add("time");
        headList.add("replay_count");
        // Report every id for which no comments were collected.
        for (String s : urlList) {
            System.out.println(s);
        }
        poi.exportExcel("D://crawlerdata/UC评论采集.xlsx", "评论", headList, bodyList);
    }
}
//
package com.zhiwei.crawler;
//
//
import java.util.ArrayList;
//
import java.util.List;
//
import java.util.Map;
//
//
import org.junit.Test;
//
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.Dayu;
//
//
public class DayuCommentExample {
//
//
@Test
//
public void getDayuCommentTest() {
//
//若已获取历史文章 哪里有这个字段 其他文章的
//
//http://m.uczzd.cn/iflow/api/v2/cmt/article/14180961224021425316/comments/byhot
//
//14180961224021425316 这个为此参数
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
//
Map<String,Object> map = poi.importExcel("D://crawlerdata//自媒体//UC评论采集-1.xlsx", 0);
//
List<Map<String,Object>> list = (List<Map<String,Object>>)map.get("body");
//
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
//
List<String> urlList = new ArrayList<String>();
//
for(Map<String,Object> map1 : list) {
//
String url = "";
//
try {
//
url = map1.get("url")+"";
//
String articleId = "";
//
url = "16848608935470442496";
//
if(url.contains("aid")) {
//
articleId = url.split("aid=")[1].split("&")[0];
//
}else {
//
articleId = url;
//
}
//
List<Map<String,Object>> dataList = Dayu.getDayuCommentData(articleId,null);
//
if(dataList.size() <= 0) {
//
urlList.add(url);
//
}
//
if(dataList != null) {
//
bodyList.addAll(dataList);
//
}
//
} catch (Exception e) {
//
System.out.println(url);
//
e.printStackTrace();
//
continue;
//
}
//
}
//
List<String> headList = new ArrayList<String>();
//
headList.add("nickname");
//
headList.add("content");
//
headList.add("id");
//
headList.add("url");
//
headList.add("like");
//
headList.add("time");
//
headList.add("replay_count");
//
for(String s : urlList) {
//
System.out.println(s);
//
}
//
poi.exportExcel("D://crawlerdata/UC评论采集.xlsx", "评论", headList, bodyList);
//
//
}
//
//
//
}
src/test/java/com/zhiwei/crawler/FenghuangAccountExample.java
deleted
100644 → 0
View file @
cb5516a0
package
com
.
zhiwei
.
crawler
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Fenghuang
;
public class FenghuangAccountExample {

    /**
     * For every id in a comma-separated list, fetches the account's articles through
     * {@code Fenghuang.getFenghuangAccountData} and exports them to a sheet named after the id.
     */
    @Test
    public void fenghuangAccountTest() {
        // Slow: roughly one article per second.
        // https://api.3g.ifeng.com/client_search_subscribe?k=号外财经
        String id = "6452";
        String[] ids = id.split(",");
        for (int i = 0; i < ids.length; i++) {
            try {
                // Optional - may be left empty.
                String startTime = "2010-05-01 00:00:00";
                List<Map<String, Object>> dataList = Fenghuang.getFenghuangAccountData(ids[i], startTime, null);
                PoiExcelUtil poi = PoiExcelUtil.getInstance();
                List<String> headList = new ArrayList<String>();
                headList.add("title");
                headList.add("time");
                headList.add("text");
                headList.add("source");
                headList.add("url");
                headList.add("id");
                poi.exportExcel("D://crawlerdata/凤凰-6452.xlsx", ids[i], headList, dataList);
            } catch (Exception e) {
                // Keep going with the next id, but do not hide the failure: the previous
                // empty catch made a broken fetch or export indistinguishable from success.
                System.out.println(ids[i]);
                e.printStackTrace();
            }
        }
    }
}
src/test/java/com/zhiwei/crawler/FenghuangByWordExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Fenghuang
;
import
com.zhiwei.parse.Yidianzixun
;
import
com.zhiwei.util.WordReadFile
;
public class FenghuangByWordExample {

    /**
     * Reads keywords from a text file, collects the results of
     * {@code Fenghuang.getFenghuangByWord} for each keyword and exports them to one workbook.
     */
    @Test
    public void fenghuangByWordTest() {
        List<String> wordList = WordReadFile.getWords("D://crawlerdata/关键词.txt");
        List<Map<String, Object>> listAll = new ArrayList<Map<String, Object>>();
        for (String word : wordList) {
            try {
                List<Map<String, Object>> dataList = Fenghuang.getFenghuangByWord(word, null);
                // Compute the size null-safely: the progress line used to dereference
                // dataList right after the null check and threw NullPointerException
                // whenever the lookup returned null.
                int fetched = 0;
                if (dataList != null && !dataList.isEmpty()) {
                    listAll.addAll(dataList);
                    fetched = dataList.size();
                }
                System.out.println(fetched + "===========" + listAll.size());
            } catch (Exception e) {
                // Continue with the next keyword, but report which one failed and why
                // instead of swallowing the exception.
                System.out.println(word);
                e.printStackTrace();
            }
        }

        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        List<String> headList = new ArrayList<String>();
        headList.add("title");
        headList.add("content");
        headList.add("source");
        headList.add("time");
        headList.add("url");
        System.out.println(listAll.size());
        poi.exportExcel("D://crawlerdata/凤凰-美林.xlsx", "asd", headList, listAll);
    }
}
//
package com.zhiwei.crawler;
//
//
import java.util.ArrayList;
//
import java.util.List;
//
import java.util.Map;
//
//
import org.junit.Test;
//
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.Fenghuang;
//
import com.zhiwei.parse.Yidianzixun;
//
import com.zhiwei.util.WordReadFile;
//
//
public class FenghuangByWordExample {
//
//
@Test
//
public void fenghuangByWordTest() {
//
List<String> wordList = WordReadFile.getWords("D://crawlerdata/关键词.txt");
//
List<Map<String,Object>> listAll = new ArrayList<Map<String,Object>>();
//
for(String word : wordList) {
//
try {
//
List<Map<String,Object>> dataList = Fenghuang.getFenghuangByWord(word,null);
//
if(dataList != null && dataList.size() > 0) {
//
listAll.addAll(dataList);
//
}
//
System.out.println(dataList.size()+"==========="+listAll.size());
//
} catch (Exception e) {
//
continue;
//
}
//
}
//
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
List<String> headList = new ArrayList<String>();
//
headList.add("title");
//
headList.add("content");
//
headList.add("source");
//
headList.add("time");
//
headList.add("url");
//
System.out.println(listAll.size());
//
poi.exportExcel("D://crawlerdata/凤凰-美林.xlsx", "asd", headList, listAll);
//
}
//
//
//
//
}
src/test/java/com/zhiwei/crawler/FenghuangCommentCountExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.parse.Fenghuang
;
public class FenghuangCommentCountExample {

    /** Prints the map returned by {@code Fenghuang.getFenghuangCommentCount} for one URL. */
    @Test
    public void fenghuangCommentCountTest() {
        // Other URLs that were tried here:
        //   http://news.ifeng.com/a/20161229/50492484_0.shtml
        //   http://wemedia.ifeng.com/4096977/wemedia.shtml
        final String url = "http://tech.ifeng.com/a/20181113/45222352_0.shtml";
        Map<String, Object> counts = Fenghuang.getFenghuangCommentCount(url, null);
        String rendered = counts.toString();
        System.out.println(rendered);
    }
}
//
package com.zhiwei.crawler;
//
//
import java.util.Map;
//
//
import org.junit.Test;
//
//
import com.zhiwei.parse.Fenghuang;
//
//
//
public class FenghuangCommentCountExample {
//
//
@Test
//
public void fenghuangCommentCountTest() {
//
String url = "http://tech.ifeng.com/a/20181113/45222352_0.shtml";
//
//http://news.ifeng.com/a/20161229/50492484_0.shtml
//
//http://wemedia.ifeng.com/4096977/wemedia.shtml
//
Map<String,Object> map = Fenghuang.getFenghuangCommentCount(url,null);
//
System.out.println(map.toString());
//
//
}
//
//
//
}
src/test/java/com/zhiwei/crawler/FenghuangCommentExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Fenghuang
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
public class FenghuangCommentExample {

    /**
     * Reads URLs from a workbook, fetches comments for each through
     * {@code Fenghuang.getFenghuangCommentData2}, tags every comment with the URL it came from,
     * prints the URLs that yielded nothing and exports the comments to a sheet of the same
     * workbook.
     */
    @Test
    public void fenghuangCommentTest() {
        PoiExcelUtil poi = PoiExcelUtil.getInstance();

        Map<String, Object> sheet = poi.importExcel("D://crawlerdata//自媒体/凤凰评论采集.xlsx", 0);
        List<Map<String, Object>> rows = (List<Map<String, Object>>) sheet.get("body");
        List<Map<String, Object>> collected = new ArrayList<Map<String, Object>>();
        List<String> emptyUrls = new ArrayList<String>();

        for (Map<String, Object> row : rows) {
            String url = "";
            try {
                url = row.get("url") + "";
                System.out.println(url);
                List<Map<String, Object>> comments = Fenghuang.getFenghuangCommentData2(url, null);
                boolean nothingFound = comments == null || comments.isEmpty();
                if (nothingFound) {
                    emptyUrls.add(url);
                } else {
                    for (Map<String, Object> comment : comments) {
                        comment.put("from_url", url);
                        collected.add(comment);
                    }
                }
            } catch (Exception e) {
                System.out.println(url);
                e.printStackTrace();
                // A failed URL skips the pause below and moves straight to the next row.
                continue;
            }
            ZhiWeiTools.sleep(1000);
        }

        List<String> headers = new ArrayList<String>();
        for (String column : new String[] {"nickname", "content", "id", "like", "from", "time", "from_url"}) {
            headers.add(column);
        }

        for (String emptyUrl : emptyUrls) {
            System.out.println(emptyUrl);
        }
        poi.exportExcel("D://crawlerdata//自媒体/凤凰评论采集.xlsx", "评论采集", headers, collected);
    }
}
//
package com.zhiwei.crawler;
//
//
import java.util.ArrayList;
//
import java.util.List;
//
import java.util.Map;
//
//
import org.junit.Test;
//
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.Fenghuang;
//
import com.zhiwei.tools.tools.ZhiWeiTools;
//
//
public class FenghuangCommentExample {
//
//
@Test
//
public void fenghuangCommentTest() {
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
//
Map<String,Object> map = poi.importExcel("D://crawlerdata//自媒体/凤凰评论采集.xlsx", 0);
//
List<Map<String,Object>> list = (List<Map<String,Object>>)map.get("body");
//
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
//
List<String> urlList = new ArrayList<String>();
//
for(Map<String,Object> map1 : list) {
//
String url = "";
//
try {
//
url = map1.get("url")+"";
//
System.out.println(url);
//
List<Map<String,Object>> dataList = Fenghuang.getFenghuangCommentData2(url,null);
//
if(dataList == null || dataList.size() <= 0) {
//
urlList.add(url);
//
}
//
if(dataList != null) {
//
for(Map<String,Object> m : dataList) {
//
m.put("from_url", url);
//
bodyList.add(m);
//
}
//
}
//
} catch (Exception e) {
//
System.out.println(url);
//
e.printStackTrace();
//
continue;
//
}
//
ZhiWeiTools.sleep(1000);
//
}
//
List<String> headList = new ArrayList<String>();
//
headList.add("nickname");
//
headList.add("content");
//
headList.add("id");
//
headList.add("like");
//
headList.add("from");
//
headList.add("time");
//
headList.add("from_url");
//
for(String s : urlList) {
//
System.out.println(s);
//
}
//
poi.exportExcel("D://crawlerdata//自媒体/凤凰评论采集.xlsx", "评论采集", headList, bodyList);
//
//
}
//
//
//
}
src/test/java/com/zhiwei/crawler/MaimaiBywordExample.java
View file @
9234d24c
//
package com.zhiwei.crawler;
//
//
import java.util.ArrayList;
//
import java.util.Arrays;
//
import java.util.List;
//
import java.util.Map;
//
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.Maimai;
//
//
public class MaimaiBywordExample {
//
//
public static void main(String[] args) {
//
String word = "美团|某团|MT|大众点评|新美大|美团点评";
// String cookie = "_buuid=ba30f54f-57ed-4dd4-af5f-31cb08d2eacf; sessionid=lejfy3gdu5tf9x9zowxfhtq5o73dubc5; guid=GxsfBBgZGwQYGx4EGBkeVgcYGxkdHhMeHhkcVhwZBB0ZHwVDWEtMS3kKEhMEEh0fGQQaBBsdBU9HRVhCaQoDRUFJT20KT0FDRgoGZmd+YmECChwZBB0ZHwVeQ2FIT31PRlpaawoDHhx9ZX0KERkEHAp+ZApZXUVOREN9AgoaBB8FS0ZGQ1BFZw==; seid=s1550814253444; token=\"G8eNNNylPoi3oIPLUr/d/RDaMgtnpZCskxT7wu1pRRrkiy3J8G7StHgTx9DQBq4O8CKuzcDfAvoCmBm7+jVysA==\"; uid=\"aa0Zx+VbwC41ceG8bxvIefAirs3A3wL6ApgZu/o1crA=\"; session=eyJ1IjoiMTczMzMzNTM2Iiwic2VjcmV0IjoiVjJuNHdCVDBncVNacTRxVllGM29jRUVwIiwic3RhdHVzIjp0cnVlLCJfZXhwaXJlIjoxNTUwOTAwNjY1Njg4LCJfbWF4QWdlIjo4NjQwMDAwMH0=; session.sig=b_tga85tZskxsgKX8YIM_JKByi0
";
//
String time = "2019-02-15 00:00:00";
//
String[] words = word.split("\\|");
//
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
//
for(String w : words) {
//
//实名动态
//
//
List<Map<String,Object>> c = Maimai.getData(w, cookie, time, null);
//
//职言交流
//
List<Map<String,Object>> c2 = Maimai.getDataByNoName(w, cookie, time, null);
//
//
bodyList.addAll(c);
//
bodyList.addAll(c2);
//
}
//
List<String> headList = Arrays.asList("time","url","text","name","like","comment_count","spreads","word");
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
poi.exportExcel("D:\\crawlerdata\\自媒体\\脉脉关键词采集-美团-0222.xlsx", "脉脉关键词", headList, bodyList);
//
}
//
//
}
package
com
.
zhiwei
.
crawler
;
import
java.util.ArrayList
;
import
java.util.Arrays
;
import
java.util.List
;
import
java.util.Map
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Maimai
;
public
class
MaimaiBywordExample
{
public
static
void
main
(
String
[]
args
)
{
String
word
=
"美团|某团|MT|大众点评|新美大|美团点评"
;
String
cookie
=
"_buuid=ba30f54f-57ed-4dd4-af5f-31cb08d2eacf; sessionid=8d1sx8i4gj0ocmtyc86x2yj0467ymayv; token=\"wl8U6GizDpoS6uzZ1ug93sJjfBucfB7IOoDxDVWOy+g7egJdXL/riMlMlHuQj+gM8CKuzcDfAvoCmBm7+jVysA==\"; uid=\"aa0Zx+VbwC41ceG8bxvIefAirs3A3wL6ApgZu/o1crA=\"; session=eyJ1IjoiMTczMzMzNTM2Iiwic2VjcmV0IjoiLVctRlpDLXg3N1h4ZEhkeEs0Qi1NR0VDIiwibWlkNDU2ODc2MCI6ZmFsc2UsInN0YXR1cyI6dHJ1ZSwiX2V4cGlyZSI6MTU1NzEyNDAxMzA0NSwiX21heEFnZSI6ODY0MDAwMDB9; session.sig=NZ2D9ZQU_Wlx6JGAFap4Znviz6k
"
;
String
time
=
"2019-02-15 00:00:00"
;
String
[]
words
=
word
.
split
(
"\\|"
);
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
for
(
String
w
:
words
)
{
//实名动态
// List<Map<String,Object>> c = Maimai.getData(w, cookie, time, null);
//职言交流
List
<
Map
<
String
,
Object
>>
c2
=
Maimai
.
getDataByNoName
(
w
,
cookie
,
time
,
null
);
// bodyList.addAll(c);
bodyList
.
addAll
(
c2
);
}
List
<
String
>
headList
=
Arrays
.
asList
(
"time"
,
"url"
,
"text"
,
"name"
,
"like"
,
"comment_count"
,
"spreads"
,
"word"
);
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
poi
.
exportExcel
(
"D:\\crawlerdata\\自媒体\\脉脉关键词采集-美团-0222.xlsx"
,
"脉脉关键词"
,
headList
,
bodyList
);
}
}
src/test/java/com/zhiwei/crawler/MeipaiByWordExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Meipai
;
/**
 * Example test: queries {@code Meipai.getMeipaiByWordData} for a few keywords
 * and passes the combined rows to {@code PoiExcelUtil.exportExcel}.
 */
public class MeipaiByWordExample {

    /** Column keys handed to the exporter, in output order. */
    private static final String[] COLUMN_KEYS = {
        "time", "video_count", "content", "url", "like", "comment_count", "source", "source_url"
    };

    @Test
    public void meipaiByWordTest() {
        List<Map<String, Object>> collected = new ArrayList<Map<String, Object>>();
        for (String keyword : "美食,吃,菜".split(",")) {
            List<Map<String, Object>> hits = Meipai.getMeipaiByWordData(keyword, null);
            if (hits == null) {
                // Nothing returned for this keyword; move on to the next one.
                continue;
            }
            collected.addAll(hits);
        }

        List<String> columns = new ArrayList<String>();
        for (String key : COLUMN_KEYS) {
            columns.add(key);
        }

        PoiExcelUtil exporter = PoiExcelUtil.getInstance();
        exporter.exportExcel("D://crawlerdata/美拍关键词采集.xlsx", "美拍数据", columns, collected);
    }
}
//
package com.zhiwei.crawler;
//
//
import java.util.ArrayList;
//
import java.util.List;
//
import java.util.Map;
//
//
import org.junit.Test;
//
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.Meipai;
//
//
public class MeipaiByWordExample {
//
//
@Test
//
public void meipaiByWordTest() {
//
String word = "美食,吃,菜";
//
String[] words = word.split(",");
//
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
//
for(String w : words) {
//
List<Map<String,Object>> dataList = Meipai.getMeipaiByWordData(w,null);
//
if(dataList != null) {
//
bodyList.addAll(dataList);
//
}
//
}
//
List<String> headList = new ArrayList<String>();
//
headList.add("time");
//
headList.add("video_count");
//
headList.add("content");
//
headList.add("url");
//
headList.add("like");
//
headList.add("comment_count");
//
headList.add("source");
//
headList.add("source_url");
//
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
//
poi.exportExcel("D://crawlerdata/美拍关键词采集.xlsx", "美拍数据", headList, bodyList);
//
//
}
//
//
//
//
}
src/test/java/com/zhiwei/crawler/MiaopaiByUrlExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Miaopai
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
/**
 * Example test: reads URLs from a workbook, fetches each distinct URL once via
 * {@code Miaopai.getMiaopaiDataByURL}, and exports the fetched rows.
 */
public class MiaopaiByUrlExample {

    @Test
    public void miaopaiByUrlTest() {
        PoiExcelUtil excel = PoiExcelUtil.getInstance();
        Map<String, Object> sheet = excel.importExcel("D://crawlerdata/秒拍美食.xlsx", 0);
        List<Map<String, Object>> inputRows = (List<Map<String, Object>>) sheet.get("body");

        List<Map<String, Object>> results = new ArrayList<Map<String, Object>>();
        List<String> visited = new ArrayList<String>();
        for (Map<String, Object> row : inputRows) {
            String url = "";
            try {
                url = row.get("url") + "";
                if (!visited.contains(url)) {
                    visited.add(url);
                    // Pause before every request.
                    ZhiWeiTools.sleep(5000);
                    System.out.println(url);
                    Map<String, Object> video = Miaopai.getMiaopaiDataByURL(url, null);
                    if (video != null) {
                        results.add(video);
                    }
                }
            } catch (Exception e) {
                // Report the failing URL and carry on with the remaining rows.
                System.out.println(url);
                e.printStackTrace();
            }
        }

        String[] columnKeys = {"time", "source", "title", "url", "video_count"};
        List<String> columns = new ArrayList<String>();
        for (String key : columnKeys) {
            columns.add(key);
        }
        // The export path is the same file the URLs were read from.
        excel.exportExcel("D://crawlerdata/秒拍美食.xlsx", "asd", columns, results);
    }
}
//
package com.zhiwei.crawler;
//
//
import java.util.ArrayList;
//
import java.util.List;
//
import java.util.Map;
//
//
import org.junit.Test;
//
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.Miaopai;
//
import com.zhiwei.tools.tools.ZhiWeiTools;
//
//
public class MiaopaiByUrlExample {
//
//
@Test
//
public void miaopaiByUrlTest() {
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
//
Map<String,Object> map = poi.importExcel("D://crawlerdata/秒拍美食.xlsx", 0);
//
List<Map<String,Object>> list = (List<Map<String,Object>>)map.get("body");
//
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
//
List<String> urlList = new ArrayList<String>();
//
for(Map<String,Object> map1 : list) {
//
String url = "";
//
try {
//
url = map1.get("url")+"";
//
if(urlList.contains(url)) {
//
continue;
//
}
//
urlList.add(url);
//
ZhiWeiTools.sleep(5000);
//
System.out.println(url);
//
Map<String,Object> dataMap = Miaopai.getMiaopaiDataByURL(url,null);
//
if(dataMap != null) {
//
bodyList.add(dataMap);
//
}
//
} catch (Exception e) {
//
System.out.println(url);
//
e.printStackTrace();
//
continue;
//
}
//
}
//
List<String> headList = new ArrayList<String>();
//
headList.add("time");
//
headList.add("source");
//
headList.add("title");
//
headList.add("url");
//
headList.add("video_count");
//
poi.exportExcel("D://crawlerdata/秒拍美食.xlsx", "asd", headList, bodyList);
//
//
}
//
//
//
}
src/test/java/com/zhiwei/crawler/PearVideoByWordExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.PearVideo
;
/**
 * Example test: runs one {@code PearVideo.getPearVideoData} keyword query and
 * passes the result to {@code PoiExcelUtil.exportExcel}.
 */
public class PearVideoByWordExample {

    @Test
    public void pearVideoByWordTest() {
        List<Map<String, Object>> rows = PearVideo.getPearVideoData("大宝 甲醛", null);

        // Column keys handed to the exporter, in output order.
        String[] columnKeys = {"time", "title", "content", "url", "like", "source"};
        List<String> columns = new ArrayList<String>();
        for (String key : columnKeys) {
            columns.add(key);
        }

        PoiExcelUtil exporter = PoiExcelUtil.getInstance();
        exporter.exportExcel("D://crawlerdata/梨视频关键词采集.xlsx", "梨视频采集结果", columns, rows);
    }
}
//
package com.zhiwei.crawler;
//
//
import java.util.ArrayList;
//
import java.util.List;
//
import java.util.Map;
//
//
import org.junit.Test;
//
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.PearVideo;
//
//
public class PearVideoByWordExample {
//
//
@Test
//
public void pearVideoByWordTest() {
//
String word = "大宝 甲醛";
//
//
List<Map<String,Object>> bodyList = PearVideo.getPearVideoData(word,null);
//
List<String> headList = new ArrayList<String>();
//
headList.add("time");
//
headList.add("title");
//
headList.add("content");
//
headList.add("url");
//
headList.add("like");
//
headList.add("source");
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
poi.exportExcel("D://crawlerdata/梨视频关键词采集.xlsx", "梨视频采集结果", headList, bodyList);
//
//
//
}
//
//
//
}
src/test/java/com/zhiwei/crawler/QQAccountExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.QQKB
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
public
class
QQAccountExample
{
@Test
public
void
qqAccountTest
()
{
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
Map
<
String
,
Object
>
dataMap
=
poi
.
importExcel
(
"D://crawlerdata//自媒体/天天快报历史文章采集.xlsx"
,
0
);
List
<
Map
<
String
,
Object
>>
dataList
=
(
List
<
Map
<
String
,
Object
>>)
dataMap
.
get
(
"body"
);
String
cookie
=
"phone_id=;%20phone_token=;%20luin=o0497332654;%20lskey=000300006218e8444698ebbad28deed8243ef28e87f247b133bf5d19f160bce91d4e3e06ec2003cd4f92aff0;%20uin=o0497332654;%20skey=MQL8ScQBxu;%20sigA2=2EBEFFCC5C22FD27B32E6F21C76CA4A2AD8BF5D626B629A38C50923810CCD19C574CD093CFAD29C845084C8CD3E04B57DD24A69E4418E060C899696A728FB3B8B14C577F1FF93340;%20openid=oijc7uLZNVp85r-MpFBYxwxzdEkg;%20appid=wxe90c9765ad00e2cd;%20access_token=5_IWbsKfygpJ0lnbUnnFnfwmSQ3EqlyzseC9-qGEFY7Tkr0Ypk5vsnSwOaMC-IGxsBeY2K7knHrYstj_5dZpisJd5nihvLNQvCdsFhFwZQcT8;%20refresh_token=5_IWbsKfygpJ0lnbUnnFnfwgDT4pA9HEbY-wuqnqIHWf9AzdmIueZFuzHYfnZPuSNEc0ZjDuHXrtSrRBMMD-7kgj06iF0NdAOi-KRj6-mrmlA;%20unionid=onCs1uNNpjMXeYIHAhacGypamEBk;%20logintype=0"
;
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
for
(
Map
<
String
,
Object
>
map
:
dataList
)
{
String
child
=
map
.
get
(
"帐号链接"
)+
""
;
// System.out.println(child.split("chlid=")[1]);
System
.
out
.
println
(
child
.
split
(
"="
)[
1
]);
List
<
Map
<
String
,
Object
>>
lists
=
QQKB
.
getQQAccountData
(
"5001789
"
,
cookie
,
null
);
if
(
lists
!=
null
)
{
for
(
Map
<
String
,
Object
>
map1
:
lists
)
{
map1
.
put
(
"name"
,
map
.
get
(
"呢称"
));
map1
.
put
(
"主页地址"
,
map
.
get
(
"帐号链接"
));
bodyList
.
add
(
map1
);
}
}
System
.
out
.
println
(
"采集到的历史文章数总和============="
+
bodyList
.
size
());
ZhiWeiTools
.
sleep
(
5000
);
}
System
.
out
.
println
(
dataList
.
size
());
List
<
String
>
headList
=
new
ArrayList
<
String
>();
headList
.
add
(
"name"
);
headList
.
add
(
"主页地址"
);
headList
.
add
(
"title"
);
headList
.
add
(
"time"
);
headList
.
add
(
"content"
);
headList
.
add
(
"url"
);
headList
.
add
(
"commentid"
);
poi
.
exportExcel
(
"D://crawlerdata//自媒体/天天快报采集-科技编年史.xlsx"
,
"asd"
,
headList
,
bodyList
);
}
}
//
package com.zhiwei.crawler;
//
//
import java.util.ArrayList;
//
import java.util.List;
//
import java.util.Map;
//
//
import org.junit.Test;
//
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.QQKB;
//
import com.zhiwei.tools.tools.ZhiWeiTools;
//
//
public class QQAccountExample {
//
//
@Test
//
public void qqAccountTest() {
//
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
Map<String,Object> dataMap = poi.importExcel("D://crawlerdata//自媒体/天天快报历史文章采集.xlsx", 0);
//
List<Map<String,Object>> dataList = (List<Map<String, Object>>) dataMap.get("body");
//
String cookie = "phone_id=;%20phone_token=;%20luin=o0497332654;%20lskey=000300006218e8444698ebbad28deed8243ef28e87f247b133bf5d19f160bce91d4e3e06ec2003cd4f92aff0;%20uin=o0497332654;%20skey=MQL8ScQBxu;%20sigA2=2EBEFFCC5C22FD27B32E6F21C76CA4A2AD8BF5D626B629A38C50923810CCD19C574CD093CFAD29C845084C8CD3E04B57DD24A69E4418E060C899696A728FB3B8B14C577F1FF93340;%20openid=oijc7uLZNVp85r-MpFBYxwxzdEkg;%20appid=wxe90c9765ad00e2cd;%20access_token=5_IWbsKfygpJ0lnbUnnFnfwmSQ3EqlyzseC9-qGEFY7Tkr0Ypk5vsnSwOaMC-IGxsBeY2K7knHrYstj_5dZpisJd5nihvLNQvCdsFhFwZQcT8;%20refresh_token=5_IWbsKfygpJ0lnbUnnFnfwgDT4pA9HEbY-wuqnqIHWf9AzdmIueZFuzHYfnZPuSNEc0ZjDuHXrtSrRBMMD-7kgj06iF0NdAOi-KRj6-mrmlA;%20unionid=onCs1uNNpjMXeYIHAhacGypamEBk;%20logintype=0";
//
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
//
for(Map<String,Object> map : dataList) {
//
String child = map.get("帐号链接")+"";
//
//
System.out.println(child.split("chlid=")[1]);
//
System.out.println(child.split("=")[1]);
//
// List<Map<String,Object>> lists = QQKB.getQQAccountData("5456950
", cookie,null);
//
if(lists != null) {
//
for(Map<String,Object> map1 : lists) {
//
map1.put("name", map.get("呢称"));
//
map1.put("主页地址", map.get("帐号链接"));
//
bodyList.add(map1);
//
}
//
}
//
System.out.println("采集到的历史文章数总和============="+bodyList.size());
//
ZhiWeiTools.sleep(5000);
//
}
//
System.out.println(dataList.size());
//
List<String> headList = new ArrayList<String>();
//
headList.add("name");
//
headList.add("主页地址");
//
headList.add("title");
//
headList.add("time");
//
headList.add("content");
//
headList.add("url");
//
headList.add("commentid");
//
poi.exportExcel("D://crawlerdata//自媒体/天天快报采集-科技编年史.xlsx", "asd", headList, bodyList);
//
}
//
//
//
}
src/test/java/com/zhiwei/crawler/QQKBByWordExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
import
org.junit.Test
;
import
com.zhiwei.parse.QQKB
;
/**
 * Example test for the QQKB keyword search. The actual call is disabled, so
 * the method currently only sets up its inputs.
 */
public class QQKBByWordExample {

    @Test
    public void qqkbByWordTest() {
        String cookie = "phone_id=;%20phone_token=;%20luin=o0497332654;%20lskey=00030000a7aa9ad0c4636295c0f484bf54820005db056be0651296f399e092bc56e4910bbedaf413ced3fb8a;%20uin=o0497332654;%20skey=MSF4MCe62n;%20sigA2=71400747BF93A51F5F103AC6180C723E940637B87127C104AA325F5709FD378BEB014D649D6B653031F6B2E0962C0F8D9C06807A1CE509983C1C70B641606FEA84B90777926863E4;%20openid=oijc7uLZNVp85r-MpFBYxwxzdEkg;%20appid=wxe90c9765ad00e2cd;%20access_token=5_IWbsKfygpJ0lnbUnnFnfwsY93PCkKQ3oQ4nAAU9-VNf0eKb4SlfVLb-YW7bRaEL2jS3XDtJGO8m9DoDrFZ4bFyv96Pb5ZfvVIyehq-jQ65o;%20refresh_token=5_IWbsKfygpJ0lnbUnnFnfwgDT4pA9HEbY-wuqnqIHWf9AzdmIueZFuzHYfnZPuSNEc0ZjDuHXrtSrRBMMD-7kgj06iF0NdAOi-KRj6-mrmlA;%20unionid=onCs1uNNpjMXeYIHAhacGypamEBk;%20logintype=0";
        String word = "麦当劳";
        // Disabled: the next page could not be found.
        // QQKB.getQQKBByWordData(word, cookie);
    }
}
//
package com.zhiwei.crawler;
//
//
import org.junit.Test;
//
//
import com.zhiwei.parse.QQKB;
//
//
public class QQKBByWordExample {
//
//
@Test
//
public void qqkbByWordTest() {
//
String word = "麦当劳";
//
String cookie = "phone_id=;%20phone_token=;%20luin=o0497332654;%20lskey=00030000a7aa9ad0c4636295c0f484bf54820005db056be0651296f399e092bc56e4910bbedaf413ced3fb8a;%20uin=o0497332654;%20skey=MSF4MCe62n;%20sigA2=71400747BF93A51F5F103AC6180C723E940637B87127C104AA325F5709FD378BEB014D649D6B653031F6B2E0962C0F8D9C06807A1CE509983C1C70B641606FEA84B90777926863E4;%20openid=oijc7uLZNVp85r-MpFBYxwxzdEkg;%20appid=wxe90c9765ad00e2cd;%20access_token=5_IWbsKfygpJ0lnbUnnFnfwsY93PCkKQ3oQ4nAAU9-VNf0eKb4SlfVLb-YW7bRaEL2jS3XDtJGO8m9DoDrFZ4bFyv96Pb5ZfvVIyehq-jQ65o;%20refresh_token=5_IWbsKfygpJ0lnbUnnFnfwgDT4pA9HEbY-wuqnqIHWf9AzdmIueZFuzHYfnZPuSNEc0ZjDuHXrtSrRBMMD-7kgj06iF0NdAOi-KRj6-mrmlA;%20unionid=onCs1uNNpjMXeYIHAhacGypamEBk;%20logintype=0";
//
//无法找到下一页
//
//
QQKB.getQQKBByWordData(word, cookie);
//
//
}
//
//
//
//
}
src/test/java/com/zhiwei/crawler/QQKBCommentCountExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
import
org.junit.Test
;
import
com.zhiwei.parse.QQKB
;
/**
 * Example test: prints the value returned by {@code QQKB.getCommentCount}
 * for a single article URL.
 */
public class QQKBCommentCountExample {

    @Test
    public void qqkbCommentCountTest() {
        // The unused empty "cookie" local was removed; the call below never took it.
        String url = "https://kuaibao.qq.com/s/20190305A16P6L00";
        int commentCount = QQKB.getCommentCount(url, null);
        System.out.println(commentCount);
    }
}
//
package com.zhiwei.crawler;
//
//
import org.junit.Test;
//
//
import com.zhiwei.parse.QQKB;
//
//
public class QQKBCommentCountExample {
//
//
//
@Test
//
public void qqkbCommentCountTest() {
//
String cookie = "";
//
String url = "https://kuaibao.qq.com/s/20190305A16P6L00";
//
//
int i = QQKB.getCommentCount(url,null);
//
System.out.println(i);
//
//
}
//
//
//
}
src/test/java/com/zhiwei/crawler/QQKBCommentExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.QQKB
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
/**
 * Example test: reads article addresses from a workbook, fetches the comments
 * of each one via {@code QQKB.getQQKBCommentData}, and exports them.
 */
public class QQKBCommentExample {

    // Works for both Tiantian Kuaibao and Tencent News; no cookie is needed.
    @Test
    public void qqkbCommentTest() {
        // Sample article URLs (the unused local that held the first one was removed):
        //   https://kuaibao.qq.com/s/20181122A11WQB00
        //   https://kuaibao.qq.com/s/20180423A1PI7400?refer=kb_news
        //   https://kuaibao.qq.com/s/20180423A0L60800?refer=kb_news
        ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        Map<String, Object> map = poi.importExcel("D://crawlerdata//自媒体/快报评论采集.xlsx", 0);
        List<Map<String, Object>> list = (List<Map<String, Object>>) map.get("body");
        List<Map<String, Object>> bodyList = new ArrayList<Map<String, Object>>();
        for (Map<String, Object> m : list) {
            Object address = m.get("地址");
            if (address == null) {
                // A row without an address used to abort the whole run with a
                // NullPointerException; skip it instead.
                continue;
            }
            String u = address.toString();
            System.out.println(u);
            // Pause before every request.
            ZhiWeiTools.sleep(2000);
            List<Map<String, Object>> dataList = QQKB.getQQKBCommentData(u, null);
            if (dataList != null) {
                bodyList.addAll(dataList);
            }
        }

        List<String> headList = new ArrayList<String>();
        headList.add("reply_id");  // id
        headList.add("like");      // number of likes
        headList.add("name");      // nickname
        headList.add("reply_num"); // number of replies
        headList.add("time");      // time
        headList.add("content");   // content
        System.out.println(bodyList.size());
        // Written to the same workbook the addresses were read from.
        poi.exportExcel("D:\\crawlerdata\\自媒体\\快报评论采集.xlsx", "sada", headList, bodyList);
    }
}
//
package com.zhiwei.crawler;
//
//
import java.util.ArrayList;
//
import java.util.List;
//
import java.util.Map;
//
//
import org.junit.Test;
//
//
import com.zhiwei.common.config.GroupType;
//
import com.zhiwei.crawler.proxy.ProxyFactory;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.QQKB;
//
import com.zhiwei.tools.tools.ZhiWeiTools;
//
//
public class QQKBCommentExample {
//
//
//天天快报与腾讯新闻都可用 不用cookie
//
@Test
//
public void qqkbCommentTest() {
//
String url = "https://kuaibao.qq.com/s/20181122A11WQB00";
//
//https://kuaibao.qq.com/s/20180423A1PI7400?refer=kb_news
//
//
https://kuaibao.qq.com/s/20180423A0L60800?refer=kb_news
//
ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",
//
GroupType.PROVIDER);
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
Map<String,Object> map = poi.importExcel("D://crawlerdata//自媒体/快报评论采集.xlsx", 0);
//
List<Map<String,Object>> list = (List<Map<String,Object>>)map.get("body");
//
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
//
for(Map<String,Object> m : list) {
//
String u = m.get("地址").toString();
//
System.out.println(u);
//
ZhiWeiTools.sleep(2000);
//
List<Map<String,Object>> dataList = QQKB.getQQKBCommentData(u,null);
//
if(dataList!= null) {
//
bodyList.addAll(dataList);
//
}
//
}
//
List<String> headList = new ArrayList<String>();
//
headList.add("reply_id"); //id
//
headList.add("like"); //点赞数
//
headList.add("name"); //呢称
//
headList.add("reply_num"); //回复数
//
headList.add("time"); //时间
//
headList.add("content"); //内容
//
System.out.println(bodyList.size());
//
poi.exportExcel("D:\\crawlerdata\\自媒体\\快报评论采集.xlsx", "sada", headList, bodyList);
//
//
}
//
//
//
}
src/test/java/com/zhiwei/crawler/QQNewsCommentListTest.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
import
java.io.IOException
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
com.zhiwei.tools.httpclient.HttpClientTemplateOK
;
import
com.zhiwei.tools.timeparse.TimeParse
;
/**
 * Fetches the comments of Tencent News articles from coral.qq.com and prints
 * them to standard output.
 *
 * @author hero
 * @date 2017-08-10 18:08:41
 */
public class QQNewsCommentListTest {

    /** Comments requested per page; also used to derive the page count. */
    private static final int PAGE_SIZE = 30;

    /**
     * Runs {@link #qqNewsCommentListTest(String)} for every URL in the list.
     * The list is empty as written, so nothing is fetched until URLs are added.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        List<String> urlList = new ArrayList<String>();
        for (String url : urlList) {
            qqNewsCommentListTest(url);
        }
    }

    /**
     * Prints every comment of one article: resolves the comment id from the
     * article page, requests the first comment page, then the follow-up pages.
     * Any exception while fetching or parsing is printed and ends the run for
     * this article.
     *
     * @param url article URL
     */
    public static void qqNewsCommentListTest(String url) {
        Map<String, String> headerMap = HeaderTool.getCommonHead();
        String newsId = getCommentId(url);
        if (newsId == null) {
            // Previously the literal "null" was concatenated into the request URL.
            System.out.println("cmt_id not found, skipping " + url);
            return;
        }
        // Name of the JSONP callback; the JSON payload follows it in the response.
        String splitId = "_article" + newsId + "commentv2";
        System.out.println(splitId);
        int pages = 0;
        try {
            String comment_url = buildCommentUrl(newsId, 0);
            System.out.println("commenturl========" + comment_url);
            String html = HttpClientTemplateOK.get(comment_url, null, headerMap);
            if (html != null) {
                pages = printCommentPage(html, splitId, url);
            } else {
                System.out.println("--------------");
            }

            // The bound is re-read from every response, as in the original loop.
            for (int i = 1; i <= pages; i++) {
                comment_url = buildCommentUrl(newsId, i);
                html = HttpClientTemplateOK.get(comment_url, null, headerMap);
                if (html != null) {
                    pages = printCommentPage(html, splitId, url);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Builds the comment-list request URL for the given article id and page flag. */
    private static String buildCommentUrl(String newsId, int pageFlag) {
        return "http://coral.qq.com/article/" + newsId + "/comment/v2?callback=_article" + newsId
                + "commentv2&orinum=" + PAGE_SIZE + "&oriorder=t&pageflag=" + pageFlag
                + "&source=1&_=" + System.currentTimeMillis();
    }

    /**
     * Unwraps one JSONP response, prints each comment in it, and returns the
     * page count derived from the response's {@code oritotal} field.
     */
    private static int printCommentPage(String html, String splitId, String url) {
        // Keep what follows the callback name, then drop the first and last character around it.
        String payload = html.split(splitId)[1];
        payload = payload.substring(1, payload.length() - 1);
        System.out.println(payload);

        JSONObject data = JSONObject.parseObject(payload).getJSONObject("data");
        JSONArray jsonArray = data.getJSONArray("oriCommList");
        JSONObject userData = data.getJSONObject("userList");
        int pages = (int) Math.ceil((double) data.getIntValue("oritotal") / PAGE_SIZE);

        for (int a = 0; a < jsonArray.size(); a++) {
            Map<String, Object> doc = new HashMap<String, Object>();
            JSONObject json = jsonArray.getJSONObject(a);
            // User details are looked up in "userList" by the comment's userid.
            JSONObject user = userData.getJSONObject(json.getString("userid"));
            if (user != null) {
                doc.put("nick", user.getString("nick"));
                doc.put("gender", user.getString("gender"));
                doc.put("localtion", user.getString("region")); // key spelling kept as is
            }
            // Follow-up pages used to read "mid" here while the first page read "id".
            doc.put("_id", json.getString("id"));
            doc.put("content", json.getString("content"));
            // "time" is multiplied by 1000 before being passed to Date.
            doc.put("time", TimeParse.dateFormartString(new Date(json.getLong("time") * 1000), "yyyy-MM-dd HH:mm:ss"));
            doc.put("up", json.getInteger("up"));
            doc.put("pokenum", json.getInteger("pokenum"));
            doc.put("repnum", json.getInteger("repnum"));
            doc.put("fromUrl", url);
            System.out.println("doc===========" + doc);
        }
        return pages;
    }

    /**
     * Downloads the article page and extracts the text between
     * {@code "cmt_id = "} and the next {@code ';'}.
     *
     * @param url article URL
     * @return the extracted id, or {@code null} if the page could not be
     *         fetched or does not contain the marker
     */
    public static String getCommentId(String url) {
        String cmt_id = null;
        Map<String, String> headerMap = HeaderTool.getCommonHead();
        System.out.println(url);
        try {
            String html = HttpClientTemplateOK.get(url, null, headerMap);
            if (html != null && html.contains("cmt_id = ")) {
                cmt_id = html.split("cmt_id = ")[1].split(";")[0];
                System.out.println("cmt_id============" + cmt_id);
            }
        } catch (Exception e) {
            // I/O failures used to be dropped silently; report them, still return null.
            e.printStackTrace();
        }
        return cmt_id;
    }
}
//
package com.zhiwei.crawler;
//
//
import java.io.IOException;
//
import java.util.ArrayList;
//
import java.util.Date;
//
import java.util.HashMap;
//
import java.util.List;
//
import java.util.Map;
//
//
import com.alibaba.fastjson.JSONArray;
//
import com.alibaba.fastjson.JSONObject;
//
import com.zhiwei.tools.httpclient.HeaderTool;
//
import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
//
import com.zhiwei.tools.timeparse.TimeParse;
//
/
//
**
//
* @ClassName: QQNewsCommentListTest
//
* @Description: TODO(腾讯新闻评论抓取)
//
* @author hero
//
* @date 2017年8月10日 下午6:08:41
//
*/
//
public class QQNewsCommentListTest {
//
//
//
public static void main(String[] args) {
//
//
List<String> urlList = new ArrayList<String>();
//
for(String url : urlList){
//
qqNewsCommentListTest(url);
//
}
//
//
}
//
//
//
//
public static void qqNewsCommentListTest(String url) {
//
Map<String,String> headerMap = HeaderTool.getCommonHead();
//
String newsId = getCommentId(url);
//
String splitId = "_article"+newsId+"commentv2";
//
System.out.println(splitId);
//
int pages = 0;
//
try {
//
String comment_url = "http://coral.qq.com/article/"+newsId+"/comment/v2?callback=_article"+newsId+"commentv2&orinum=30&oriorder=t&pageflag=0&source=1&_="+System.currentTimeMillis();
//
System.out.println("commenturl========"+comment_url);
//
String html = HttpClientTemplateOK.get(comment_url, null, headerMap);
//
if(html!=null){
//
html = html.split(splitId)[1];
//
html = html.substring(1, html.length()-1);
//
System.out.println(html);
//
JSONObject data = JSONObject.parseObject(html).getJSONObject("data");
//
JSONArray jsonArray = data.getJSONArray("oriCommList");
//
JSONObject userData = data.getJSONObject("userList");
//
pages = (int)Math.ceil((double)data.getIntValue("oritotal")/30.0);
//
for(int a = 0;a<jsonArray.size();a++){
//
Map<String,Object> doc = new HashMap<String, Object>();
//
JSONObject json = jsonArray.getJSONObject(a);
//
//
JSONObject user = userData.getJSONObject(json.getString("userid"));
//
if(user!=null){
//
doc.put("nick", user.getString("nick"));
//
doc.put("gender", user.getString("gender"));
//
doc.put("localtion", user.getString("region"));
//
}
//
doc.put("_id", json.getString("id"));
//
doc.put("content", json.getString("content"));
//
doc.put("time", TimeParse.dateFormartString(new Date(json.getLong("time")*1000), "yyyy-MM-dd HH:mm:ss"));
//
doc.put("up", json.getInteger("up"));
//
doc.put("pokenum", json.getInteger("pokenum"));
//
doc.put("repnum", json.getInteger("repnum"));
//
doc.put("fromUrl", url);
//
System.out.println("doc==========="+doc);
//
//
}
//
}else{
//
System.out.println("--------------");
//
}
//
//
for(int i=1;i<=pages;i++){
//
comment_url = "http://coral.qq.com/article/"+newsId+"/comment/v2?callback=_article"+newsId+"commentv2&orinum=30&oriorder=t&pageflag="+i+"&source=1&_="+System.currentTimeMillis();
//
html = HttpClientTemplateOK.get(comment_url, null, headerMap);
//
if(html!=null){
//
html = html.split(splitId)[1];
//
html = html.substring(1, html.length()-1);
//
System.out.println(html);
//
System.out.println(html);
//
JSONObject data = JSONObject.parseObject(html).getJSONObject("data");
//
JSONArray jsonArray = data.getJSONArray("oriCommList");
//
JSONObject userData = data.getJSONObject("userList");
//
pages = (int)Math.ceil((double)data.getIntValue("oritotal")/30.0);
//
for(int a = 0;a<jsonArray.size();a++){
//
Map<String,Object> doc = new HashMap<String, Object>();
//
JSONObject json = jsonArray.getJSONObject(a);
//
//
JSONObject user = userData.getJSONObject(json.getString("userid"));
//
if(user!=null){
//
doc.put("nick", user.getString("nick"));
//
doc.put("gender", user.getString("gender"));
//
doc.put("localtion", user.getString("region"));
//
}
//
doc.put("_id", json.getString("mid"));
//
doc.put("content", json.getString("content"));
//
doc.put("time", TimeParse.dateFormartString(new Date(json.getLong("time")*1000), "yyyy-MM-dd HH:mm:ss"));
//
doc.put("up", json.getInteger("up"));
//
doc.put("pokenum", json.getInteger("pokenum"));
//
doc.put("repnum", json.getInteger("repnum"));
//
doc.put("fromUrl", url);
//
System.out.println("doc==========="+doc);
//
//
}
//
}
//
}
//
} catch (Exception e) {
//
e.printStackTrace();
//
}
//
}
//
//
//
public static String getCommentId(String url){
//
String cmt_id = null;
//
Map<String,String> headerMap = HeaderTool.getCommonHead();
//
System.out.println(url);
//
try {
//
String html = HttpClientTemplateOK.get(url, null, headerMap);
//
if(html!=null && html.contains("cmt_id = ")){
//
cmt_id = html.split("cmt_id = ")[1].split(";")[0];
//
System.out.println("cmt_id============"+cmt_id);
//
return cmt_id;
//
}
//
} catch (IOException e) {
//
return null;
//
} catch (Exception e) {
//
e.printStackTrace();
//
}
//
return cmt_id;
//
}
//
//
}
src/test/java/com/zhiwei/crawler/SinaCommentListTest.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
import
java.io.IOException
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
com.zhiwei.tools.httpclient.HttpClientTemplateOK
;
/**
 * Fetches the reader comments of a Sina news article and prints them.
 *
 * @author hero
 * @date 2017-08-10 18:08:41
 */
public class SinaCommentListTest {

    /**
     * Downloads the first page of comments for the article at {@code url} and
     * prints one document per comment to standard output.
     *
     * <p>The article page is fetched once through {@link #getCommentId(String)}
     * to discover the channel and news id; if they cannot be determined the
     * method reports it and returns without requesting any comments.
     *
     * @param url address of the Sina news article
     */
    public static void sinaCommentListTest(String url) {
        Map<String, String> headerMap = HeaderTool.getCommonHead();

        // Resolve the identifiers once: every call to getCommentId downloads the page.
        String commentId = getCommentId(url);
        if (commentId == null) {
            System.out.println("no comment id found for " + url);
            return;
        }
        String[] idParts = commentId.split("=====");
        if (idParts.length < 2) {
            // getCommentId may hand back a bare news id when the channel is missing.
            System.out.println("incomplete comment id '" + commentId + "' for " + url);
            return;
        }
        String channel = idParts[0];
        String newsId = idParts[1];
        int page = 1;

        try {
            String comment_url = "http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel="
                    + channel + "&newsid=" + newsId
                    + "&group=0&compress=0&ie=gbk&oe=gbk&page=" + page
                    + "&page_size=20&jsvar=loader_1525576000752_30189682";
            System.out.println("commenturl========" + comment_url);
            String html = HttpClientTemplateOK.get(comment_url, null, headerMap);
            if (html != null) {
                // The response has the form "<jsvar>=<json>"; keep only the JSON part.
                html = html.substring(html.indexOf("=", 0) + 1, html.length());
                System.out.println(html);
                JSONObject data = JSONObject.parseObject(html).getJSONObject("result");
                JSONArray jsonArray = data.getJSONArray("cmntlist");
                for (int a = 0; a < jsonArray.size(); a++) {
                    Map<String, Object> doc = new HashMap<String, Object>();
                    JSONObject json = jsonArray.getJSONObject(a);
                    doc.put("_id", json.getString("mid"));
                    doc.put("content", json.getString("content"));
                    doc.put("area", json.getString("area"));
                    doc.put("nick", json.getString("nick"));
                    doc.put("time", json.getString("time"));
                    doc.put("agree", json.getInteger("agree"));
                    doc.put("against", json.getInteger("against"));
                    doc.put("vote", json.getInteger("vote"));
                    doc.put("fromUrl", url);
                    System.out.println("doc===========" + doc);
                }
            } else {
                System.out.println("--------------");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Downloads the article page and extracts its comment identifiers.
     *
     * @param url address of the Sina news article
     * @return {@code channel + "=====" + newsid} when both values are found in
     *         the page; {@code null} when the page cannot be fetched or does
     *         not mention a news id; the bare news id when only that value
     *         could be extracted before parsing failed
     */
    public static String getCommentId(String url) {
        String newsid = null;
        String channel = null;
        Map<String, String> headerMap = HeaderTool.getCommonHead();
        System.out.println(url);
        try {
            String html = HttpClientTemplateOK.get(url, null, headerMap);
            if (html != null && html.contains("newsid")) {
                newsid = html.split("newsid: '")[1].split("',")[0];
                channel = html.split("channel: '")[1].split("',")[0];
                System.out.println(channel + "============" + newsid);
                return channel + "=====" + newsid;
            }
        } catch (IOException e) {
            // A failed download is reported to the caller as "no id".
            return null;
        } catch (Exception e) {
            e.printStackTrace();
        }
        return newsid;
    }
}
//
package com.zhiwei.crawler;
//
//
import java.io.IOException;
//
import java.util.ArrayList;
//
import java.util.HashMap;
//
import java.util.List;
//
import java.util.Map;
//
//
import com.alibaba.fastjson.JSONArray;
//
import com.alibaba.fastjson.JSONObject;
//
import com.zhiwei.tools.httpclient.HeaderTool;
//
import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
//
/
//
**
//
* @ClassName: SinaCommentListTest
//
* @Description: TODO(新浪新闻评论抓取)
//
* @author hero
//
* @date 2017年8月10日 下午6:08:41
//
*/
//
public class SinaCommentListTest {
//
//
//
public static void sinaCommentListTest(String url) {
//
Map<String,String> headerMap = HeaderTool.getCommonHead();
//
String newsId = getCommentId(url).split("=====")[1];
//
String channel = getCommentId(url).split("=====")[0];
//
int page = 1;
//
try {
//
String comment_url = "http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel="+channel+"&newsid="+newsId+"&group=0&compress=0&ie=gbk&oe=gbk&page="+page+"&page_size=20&jsvar=loader_1525576000752_30189682";
//
System.out.println("commenturl========"+comment_url);
//
String html = HttpClientTemplateOK.get(comment_url, null, headerMap);
//
if(html!=null){
//
html = html.substring(html.indexOf("=",0)+1,html.length());
//
System.out.println(html);
//
JSONObject data = JSONObject.parseObject(html).getJSONObject("result");
//
JSONArray jsonArray = data.getJSONArray("cmntlist");
//
for(int a = 0;a<jsonArray.size();a++){
//
Map<String,Object> doc = new HashMap<String, Object>();
//
JSONObject json = jsonArray.getJSONObject(a);
//
doc.put("_id", json.getString("mid"));
//
doc.put("content", json.getString("content"));
//
doc.put("area", json.getString("area"));
//
doc.put("nick", json.getString("nick"));
//
doc.put("time", json.getString("time"));
//
doc.put("agree", json.getInteger("agree"));
//
doc.put("against", json.getInteger("against"));
//
doc.put("vote", json.getInteger("vote"));
//
doc.put("fromUrl", url);
//
System.out.println("doc==========="+doc);
//
//
}
//
}else{
//
System.out.println("--------------");
//
}
//
//
} catch (Exception e) {
//
e.printStackTrace();
//
}
//
}
//
//
//
//
public static String getCommentId(String url){
//
String newsid = null;
//
String channel = null;
//
Map<String,String> headerMap = HeaderTool.getCommonHead();
//
System.out.println(url);
//
try {
//
String html = HttpClientTemplateOK.get(url, null, headerMap);
//
if(html!=null && html.contains("newsid")){
//
newsid = html.split("newsid: '")[1].split("',")[0];
//
channel = html.split("channel: '")[1].split("',")[0];
//
System.out.println(channel+"============"+newsid);
//
return channel+"====="+newsid;
//
}
//
} catch (IOException e) {
//
return null;
//
} catch (Exception e) {
//
e.printStackTrace();
//
}
//
return newsid;
//
}
//
//
}
src/test/java/com/zhiwei/crawler/SoKuByWordExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Soku
;
public class SoKuByWordExample {

    /**
     * Queries Soku for every keyword/category combination and writes all
     * returned rows into a single Excel workbook.
     */
    @Test
    public void sokuByWordTest() {
        String word = "美食,味道,吃,试吃,美味,好吃";
        String type = "174,103,176";

        // Column keys of the exported sheet, in output order.
        List<String> headList = new ArrayList<String>();
        for (String column : new String[] {"title", "time", "play_count", "url", "source"}) {
            headList.add(column);
        }

        List<Map<String, Object>> bodyList = new ArrayList<Map<String, Object>>();
        String[] categories = type.split(",");
        for (String keyword : word.split(",")) {
            for (String category : categories) {
                List<Map<String, Object>> found = Soku.getSoKuByWordData(keyword, category, null);
                if (found == null || found.size() <= 0) {
                    continue;
                }
                bodyList.addAll(found);
            }
        }

        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        poi.exportExcel("D://crawlerdata/优酷采集.xlsx", "优酷数据", headList, bodyList);
    }
}
//
package com.zhiwei.crawler;
//
//
import java.util.ArrayList;
//
import java.util.List;
//
import java.util.Map;
//
//
import org.junit.Test;
//
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.Soku;
//
//
public class SoKuByWordExample {
//
//
@Test
//
public void sokuByWordTest() {
//
String word = "美食,味道,吃,试吃,美味,好吃";
//
String type = "174,103,176";
//
String[] words = word.split(",");
//
String[] types = type.split(",");
//
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
//
for(String w : words ) {
//
for(String t : types) {
//
List<Map<String,Object>> list = Soku.getSoKuByWordData(w, t,null);
//
if(list != null && list.size() > 0) {
//
bodyList.addAll(list);
//
}
//
}
//
}
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
List<String> headList = new ArrayList<String>();
//
headList.add("title");
//
headList.add("time");
//
headList.add("play_count");
//
headList.add("url");
//
headList.add("source");
//
poi.exportExcel("D://crawlerdata/优酷采集.xlsx", "优酷数据", headList, bodyList);
//
//
}
//
//
}
src/test/java/com/zhiwei/crawler/SouhuAccountExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Souhu
;
public class SouhuAccountExample {

    //http://search.sohu.com/?keyword=%E8%99%8E%E5%97%85&source=article&queryType=edit&ie=utf8

    /**
     * Fetches the article history of one Sohu account starting from a fixed
     * date, prints how many rows came back and exports them to Excel.
     */
    @Test
    public void souhuAccountTest() {
        List<Map<String, Object>> articles = Souhu.getSouHuAccountData(
                "c29odXptdHNmbjZ0cnRAc29odS5jb20=", "2018-05-01 00:00:00", false, null);
        System.out.println(articles.size());

        // Column keys of the exported sheet, in output order.
        String[] columns = {
            "title", "time", "content", "url", "comment", "tags", "newsid", "source", "newsPv"
        };
        List<String> headList = new ArrayList<String>();
        for (String column : columns) {
            headList.add(column);
        }

        PoiExcelUtil.getInstance()
                .exportExcel("D:\\crawlerdata\\搜狐号历史文章-乔.xlsx", "乔", headList, articles);
    }
}
//package com.zhiwei.crawler;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Souhu;
//
//
//public class SouhuAccountExample {
//
// //http://search.sohu.com/?keyword=%E8%99%8E%E5%97%85&source=article&queryType=edit&ie=utf8
//
// @Test
// public void souhuAccountTest() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",GroupType.PROVIDER);
// List<Map<String,Object>> lists = Souhu.getSouHuAccountData("99938933","浅黑科技","2018-05-01 00:00:00",false,null);
// System.out.println(lists.size());
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("time");
// headList.add("content");
// headList.add("url");
// headList.add("comment");
// headList.add("tags");
// headList.add("newsid");
// headList.add("source");
// headList.add("newsPv");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("D:\\crawlerdata\\搜狐号历史文章-乔.xlsx", "乔", headList, lists);
// }
//
//}
src/test/java/com/zhiwei/crawler/SouhuCommentCountExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
import
java.util.List
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Souhu
;
public class SouhuCommentCountExample {

    /**
     * Reads article URLs from the first sheet of the input workbook, looks up
     * the comment count and read count of each article, and exports the
     * enriched rows to a new sheet.
     *
     * <p>Rows whose lookup fails are logged and left without the extra values;
     * processing continues with the next row.
     */
    @SuppressWarnings("unchecked")
    @Test
    public void souhuCommentCountTest() {
        ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        Map<String, Object> map = poi.importExcel("D://crawlerdata//自媒体//搜狐评论采集.xlsx", 0);
        List<Map<String, Object>> list = (List<Map<String, Object>>) map.get("body");
        List<String> headList = (List<String>) map.get("head");

        for (Map<String, Object> map1 : list) {
            String url = "";
            try {
                // Use the URL of the current row; a leftover debug assignment used
                // to replace it with one fixed article for every row.
                url = map1.get("url") + "";
                System.out.println(url);
                int i = Souhu.getSouhuCommentCount(url, ProxyHolder.NAT_PROXY);
                int j = Souhu.getSohuReadNum(url, ProxyHolder.NAT_PROXY);
                map1.put("count", i);
                map1.put("redNum", j);
                System.out.println(map1.toString());
            } catch (Exception e) {
                // Report the failing URL and move on to the next row.
                System.out.println(url);
                e.printStackTrace();
                continue;
            }
        }

        headList.add("count");
        // The read count is stored per row under "redNum", so it needs a column too.
        headList.add("redNum");
        poi.exportExcel("D://crawlerdata//自媒体//搜狐评论采集.xlsx", "sheet2", headList, list);
    }
}
//
package com.zhiwei.crawler;
//
//
import java.util.List;
//
import java.util.Map;
//
//
import org.junit.Test;
//
//
import com.zhiwei.common.config.GroupType;
//
import com.zhiwei.crawler.proxy.ProxyFactory;
//
import com.zhiwei.crawler.proxy.ProxyHolder;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.Souhu;
//
//
public class SouhuCommentCountExample {
//
//
//
@SuppressWarnings("unchecked")
//
@Test
//
public void souhuCommentCountTest() {
//
ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",
//
GroupType.PROVIDER);
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
Map<String,Object> map = poi.importExcel("D://crawlerdata//自媒体//搜狐评论采集.xlsx", 0);
//
List<Map<String,Object>> list = (List<Map<String,Object>>)map.get("body");
//
List<String> headList = (List<String>) map.get("head");
//
for(Map<String,Object> map1 : list) {
//
String url = "";
//
try {
//
url = map1.get("url")+"";
//
System.out.println(url);
//
url = "http://m.sohu.com/a/299389309_114988";
//
int i = Souhu.getSouhuCommentCount(url,ProxyHolder.NAT_PROXY);
//
int j = Souhu.getSohuReadNum(url, ProxyHolder.NAT_PROXY);
//
map1.put("count", i);
//
map1.put("redNum", j);
//
System.out.println(map1.toString());
//
} catch (Exception e) {
//
System.out.println(url);
//
e.printStackTrace();
//
continue;
//
}
//
}
//
headList.add("count");
//
poi.exportExcel("D://crawlerdata//自媒体//搜狐评论采集.xlsx", "sheet2", headList, list);
//
}
//
//
//
}
src/test/java/com/zhiwei/crawler/SouhuCommentExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Fenghuang
;
import
com.zhiwei.parse.Souhu
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
public class SouhuCommentExample {

    /**
     * Reads article URLs from the first sheet of the input workbook, downloads
     * the comments of every article and exports all comments to Excel.
     *
     * <p>URLs that yield no comments (a {@code null} or empty result) are
     * collected and printed before the export so they can be retried manually.
     */
    @SuppressWarnings("unchecked")
    @Test
    public void souhuCommentTest() {
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
        Map<String, Object> map = poi.importExcel("D://crawlerdata//自媒体//搜狐评论采集.xlsx", 0);
        List<Map<String, Object>> list = (List<Map<String, Object>>) map.get("body");
        List<Map<String, Object>> bodyList = new ArrayList<Map<String, Object>>();
        // URLs for which no comment data came back.
        List<String> urlList = new ArrayList<String>();

        for (Map<String, Object> map1 : list) {
            String url = "";
            try {
                url = map1.get("url") + "";
                System.out.println(url);
                List<Map<String, Object>> dataList = Souhu.getSouhuCommentData(url, null);
                // Check for null before touching the list; a null result counts as "no data".
                if (dataList == null || dataList.isEmpty()) {
                    urlList.add(url);
                } else {
                    bodyList.addAll(dataList);
                }
                ZhiWeiTools.sleep(100);
            } catch (Exception e) {
                // Report the failing URL and move on to the next row.
                System.out.println(url);
                e.printStackTrace();
                continue;
            }
        }

        List<String> headList = new ArrayList<String>();
        headList.add("nickname");
        headList.add("content");
        headList.add("user_id");
        headList.add("loaction");
        headList.add("support_count");
        headList.add("comment_id");
        headList.add("reply_id");
        headList.add("time");

        for (String s : urlList) {
            System.out.println(s);
        }
        poi.exportExcel("D://crawlerdata//自媒体//搜狐评论采集.xlsx", "搜狐评论", headList, bodyList);
    }
}
//
package com.zhiwei.crawler;
//
//
import java.util.ArrayList;
//
import java.util.List;
//
import java.util.Map;
//
//
import org.junit.Test;
//
//
import com.zhiwei.common.config.GroupType;
//
import com.zhiwei.crawler.proxy.ProxyFactory;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.Fenghuang;
//
import com.zhiwei.parse.Souhu;
//
import com.zhiwei.tools.tools.ZhiWeiTools;
//
//
public class SouhuCommentExample {
//
//
@Test
//
public void souhuCommentTest() {
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",
//
GroupType.PROVIDER);
//
Map<String,Object> map = poi.importExcel("D://crawlerdata//自媒体//搜狐评论采集.xlsx", 0);
//
List<Map<String,Object>> list = (List<Map<String,Object>>)map.get("body");
//
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
//
List<String> urlList = new ArrayList<String>();
//
for(Map<String,Object> map1 : list) {
//
String url = "";
//
try {
//
url = map1.get("url")+"";
//
System.out.println(url);
//
List<Map<String,Object>> dataList = Souhu.getSouhuCommentData(url,null);
//
if(dataList.size() <= 0) {
//
urlList.add(url);
//
}
//
ZhiWeiTools.sleep(100);
//
if(dataList != null) {
//
bodyList.addAll(dataList);
//
}
//
} catch (Exception e) {
//
System.out.println(url);
//
e.printStackTrace();
//
continue;
//
}
//
}
//
List<String> headList = new ArrayList<String>();
//
headList.add("nickname");
//
headList.add("content");
//
headList.add("user_id");
//
headList.add("loaction");
//
headList.add("support_count");
//
headList.add("comment_id");
//
headList.add("reply_id");
//
headList.add("time");
//
for(String s : urlList) {
//
System.out.println(s);
//
}
//
poi.exportExcel("D://crawlerdata//自媒体//搜狐评论采集.xlsx", "搜狐评论", headList, bodyList);
//
}
//
//
//
}
src/test/java/com/zhiwei/crawler/TXNewsByWordExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.TXNews
;
public class TXNewsByWordExample {

    /**
     * Searches Tencent News for a fixed keyword and exports the results to Excel.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String word = "唐嫣";
        String devid = "6D33F35F-880D-42A6-A23F-881BEC6960EC";

        // Column keys of the exported sheet, in output order.
        List<String> headList = new ArrayList<String>();
        for (String column : new String[] {"title", "time", "content", "url", "id", "source"}) {
            headList.add(column);
        }

        List<Map<String, Object>> results = TXNews.getData(word, devid, null);
        PoiExcelUtil exporter = PoiExcelUtil.getInstance();
        exporter.exportExcel("D://crawlerdata/腾讯新闻-唐嫣-1.xlsx", "腾讯新闻数据", headList, results);
    }
}
//
package com.zhiwei.crawler;
//
//
import java.util.ArrayList;
//
import java.util.List;
//
import java.util.Map;
//
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.TXNews;
//
//
public class TXNewsByWordExample {
//
//
public static void main(String[] args) {
//
String word = "唐嫣";
//
String devid = "6D33F35F-880D-42A6-A23F-881BEC6960EC";
//
List<Map<String,Object>> dataList = TXNews.getData(word,devid,null);
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
List<String> headList = new ArrayList<String>();
//
headList.add("title");
//
headList.add("time");
//
headList.add("content");
//
headList.add("url");
//
headList.add("id");
//
headList.add("source");
//
poi.exportExcel("D://crawlerdata/腾讯新闻-唐嫣-1.xlsx", "腾讯新闻数据", headList, dataList);
//
}
//
//
}
src/test/java/com/zhiwei/crawler/Test1.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
import
java.io.UnsupportedEncodingException
;
import
java.net.URLEncoder
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
org.junit.Test
;
public class Test1 {

    /**
     * Prints the fifth "/"-separated segment of a sample article URL, which is
     * the article id at the end of the path.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String articleUrl = "https://view.inews.qq.com/a/NEW2018021000440002";
        // Segments: "https:", "", host, "a", article id.
        String[] segments = articleUrl.split("/");
        String articleId = segments[4];
        System.out.println(articleId);
    }
}
//
package com.zhiwei.crawler;
//
//
import java.io.UnsupportedEncodingException;
//
import java.net.URLEncoder;
//
import java.util.regex.Matcher;
//
import java.util.regex.Pattern;
//
//
import org.junit.Test;
//
//
public class Test1 {
//
//
//
public static void main(String[] args) {
//
String time = "https://view.inews.qq.com/a/NEW2018021000440002";
//
//
System.out.println(time.split("/")[4]);
//
//
}
//
//
}
src/test/java/com/zhiwei/crawler/WangyiCommentCountExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Wangyi
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
public class WangyiCommentCountExample {

    /**
     * Reads article links from the first sheet of the input workbook, looks up
     * the comment count of each article and exports one row per article
     * (article id, comment count, source link).
     */
    @SuppressWarnings("unchecked")
    @Test
    public void wangyiCommentCountTest() {
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        String path = "D:\\crawlerdata\\自媒体\\网易评论采集-1.xlsx";
        Map<String, Object> map = poi.importExcel(path, 0);
        List<Map<String, Object>> list = (List<Map<String, Object>>) map.get("body");

        List<String> urlList = new ArrayList<String>();
        for (Map<String, Object> u : list) {
            String url = u.get("链接") + "";
            urlList.add(url);
        }

        List<Map<String, Object>> bodyList = new ArrayList<>();
        for (String url : urlList) {
            // Use the link of the current row; a leftover debug assignment used to
            // replace it with one fixed article for every row.
            String id = extractArticleId(url);
            System.out.println(id);
            int count = Wangyi.getWangyiCommentCount(id, null);
            System.out.println(count);

            // Keep the result so the export is not empty.
            Map<String, Object> row = new java.util.HashMap<>();
            row.put("id", id);
            row.put("count", count);
            row.put("from_url", url);
            bodyList.add(row);

            ZhiWeiTools.sleep(3000);
        }

        // Only the columns that the rows above actually carry.
        List<String> headList = new ArrayList<String>();
        headList.add("id");
        headList.add("count");
        headList.add("from_url");
        poi.exportExcel(path, "评论数据", headList, bodyList);
    }

    /**
     * Returns the last path segment of {@code url} with everything from the
     * literal text ".ht" onwards removed (e.g. ".../ABC123.html" gives "ABC123").
     */
    private static String extractArticleId(String url) {
        String[] segments = url.split("/");
        if (segments.length == 0) {
            return "";
        }
        String lastSegment = segments[segments.length - 1];
        // indexOf matches ".ht" literally; split(".ht") would treat '.' as a wildcard.
        int suffixStart = lastSegment.indexOf(".ht");
        return suffixStart >= 0 ? lastSegment.substring(0, suffixStart) : lastSegment;
    }
}
//
package com.zhiwei.crawler;
//
//
import java.util.ArrayList;
//
import java.util.List;
//
import java.util.Map;
//
//
import org.junit.Test;
//
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.Wangyi;
//
import com.zhiwei.tools.tools.ZhiWeiTools;
//
//
public class WangyiCommentCountExample {
//
//
@Test
//
public void wangyiCommentCountTest() {
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
String path = "D:\\crawlerdata\\自媒体\\网易评论采集-1.xlsx";
//
Map<String,Object> map = poi.importExcel(path, 0);
//
//
List<Map<String,Object>> list = (List<Map<String, Object>>) map.get("body");
//
List<String> urlList = new ArrayList<String>();
//
for(Map<String,Object> u : list) {
//
String url = u.get("链接")+"";
//
urlList.add(url);
//
}
//
//
List<Map<String,Object>> bodyList = new ArrayList<>();
//
for(String url : urlList) {
//
url = "https://3g.163.com/all/article/E9GAO0PK051188EC.html";
//
String id = url.split("/")[url.split("/").length-1].split(".ht")[0];
//
System.out.println(id);
//
int lists = Wangyi.getWangyiCommentCount(id, null);
//
System.out.println(lists);
//
ZhiWeiTools.sleep(3000);
//
}
//
List<String> headList = new ArrayList<String>();
//
headList.add("content");
//
headList.add("id");
//
headList.add("time");
//
headList.add("name");
//
headList.add("like");
//
headList.add("unlike");
//
headList.add("from_url");
//
//
poi.exportExcel(path, "评论数据", headList, bodyList);
//
//
}
//
//
//
}
src/test/java/com/zhiwei/crawler/WangyiCommentExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Wangyi
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
public class WangyiCommentExample {

    /**
     * Reads article links from the first sheet of the input workbook, downloads
     * the comments of every article, tags each comment with its source link and
     * exports all comments to Excel.
     *
     * <p>If the export fails the data may contain duplicates; the id is authoritative.
     */
    @SuppressWarnings("unchecked")
    @Test
    public void wangyiCommentTest() {
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        String path = "D:\\crawlerdata\\自媒体\\网易评论采集-1.xlsx";
        Map<String, Object> map = poi.importExcel(path, 0);
        List<Map<String, Object>> list = (List<Map<String, Object>>) map.get("body");

        List<String> urlList = new ArrayList<String>();
        for (Map<String, Object> u : list) {
            String url = u.get("链接") + "";
            urlList.add(url);
        }

        List<Map<String, Object>> bodyList = new ArrayList<>();
        for (String url : urlList) {
            String id = extractArticleId(url);
            System.out.println(id);
            List<Map<String, Object>> lists = Wangyi.getWangyiCommentData(id, null);
            // Resolve the size null-safely; the result was previously dereferenced
            // before the null check.
            int fetched = (lists == null) ? 0 : lists.size();
            System.out.println(url + "=====" + fetched);
            if (lists != null) {
                for (Map<String, Object> m : lists) {
                    m.put("from_url", url);
                    bodyList.add(m);
                }
            }
            ZhiWeiTools.sleep(3000);
        }

        List<String> headList = new ArrayList<String>();
        headList.add("content");
        headList.add("id");
        headList.add("time");
        headList.add("name");
        headList.add("like");
        headList.add("unlike");
        headList.add("from_url");
        poi.exportExcel(path, "评论数据", headList, bodyList);
    }

    /**
     * Returns the last path segment of {@code url} with everything from the
     * literal text ".ht" onwards removed (e.g. ".../ABC123.html" gives "ABC123").
     */
    private static String extractArticleId(String url) {
        String[] segments = url.split("/");
        if (segments.length == 0) {
            return "";
        }
        String lastSegment = segments[segments.length - 1];
        // indexOf matches ".ht" literally; split(".ht") would treat '.' as a wildcard.
        int suffixStart = lastSegment.indexOf(".ht");
        return suffixStart >= 0 ? lastSegment.substring(0, suffixStart) : lastSegment;
    }
}
//
package com.zhiwei.crawler;
//
//
import java.util.ArrayList;
//
import java.util.List;
//
import java.util.Map;
//
//
import org.junit.Test;
//
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.Wangyi;
//
import com.zhiwei.tools.tools.ZhiWeiTools;
//
//
public class WangyiCommentExample {
//
//
//若出错 可能数据有重复 以id为准
//
@Test
//
public void wangyiCommentTest() {
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
String path = "D:\\crawlerdata\\自媒体\\网易评论采集-1.xlsx";
//
Map<String,Object> map = poi.importExcel(path, 0);
//
//
List<Map<String,Object>> list = (List<Map<String, Object>>) map.get("body");
//
List<String> urlList = new ArrayList<String>();
//
for(Map<String,Object> u : list) {
//
String url = u.get("链接")+"";
//
urlList.add(url);
//
}
//
//
List<Map<String,Object>> bodyList = new ArrayList<>();
//
for(String url : urlList) {
//
String id = url.split("/")[url.split("/").length-1].split(".ht")[0];
//
System.out.println(id);
//
List<Map<String,Object>> lists = Wangyi.getWangyiCommentData(id,null);
//
System.out.println(url+"====="+lists.size());
//
if(lists != null) {
//
for(Map<String,Object> m : lists) {
//
m.put("from_url", url);
//
bodyList.add(m);
//
}
//
}
//
ZhiWeiTools.sleep(3000);
//
}
//
List<String> headList = new ArrayList<String>();
//
headList.add("content");
//
headList.add("id");
//
headList.add("time");
//
headList.add("name");
//
headList.add("like");
//
headList.add("unlike");
//
headList.add("from_url");
//
//
poi.exportExcel(path, "评论数据", headList, bodyList);
//
//
}
//
//
//
//
//
//
}
src/test/java/com/zhiwei/crawler/WangyiHistoryExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Wangyi
;
public
class
WangyiHistoryExample
{
public
static
void
main
(
String
[]
args
)
{
String
url
=
"http://dy.163.com/v2/article/detail/DPLAOP1605198CJN
.html"
;
List
<
Map
<
String
,
Object
>>
list
=
Wangyi
.
getHistoryData
(
url
,
null
,
"2018-05-01 00:00:00"
);
List
<
String
>
headList
=
new
ArrayList
<
String
>();
headList
.
add
(
"title"
);
headList
.
add
(
"time"
);
headList
.
add
(
"content"
);
headList
.
add
(
"source"
);
headList
.
add
(
"url"
);
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
poi
.
exportExcel
(
"D://crawlerdata//自媒体/网易-财联社.xlsx"
,
"财联社"
,
headList
,
list
);
}
}
//
package com.zhiwei.crawler;
//
//
import java.util.ArrayList;
//
import java.util.List;
//
import java.util.Map;
//
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.Wangyi;
//
//
public class WangyiHistoryExample {
//
//
public static void main(String[] args) {
//
// String url = "http://dy.163.com/v2/article/detail/EBR9PF6J0512MLBG
.html";
//
//
List<Map<String,Object>> list = Wangyi.getHistoryData(url, null, "2018-05-01 00:00:00");
//
//
List<String> headList = new ArrayList<String>();
//
headList.add("title");
//
headList.add("time");
//
headList.add("content");
//
headList.add("source");
//
headList.add("url");
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
poi.exportExcel("D://crawlerdata//自媒体/网易-财联社.xlsx", "财联社", headList, list);
//
//
}
//
//
//
}
src/test/java/com/zhiwei/crawler/XiaomiShequByWordExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Xiaomi
;
public class XiaomiShequByWordExample {

    /**
     * Searches the Xiaomi community for each keyword phrase and exports all
     * hits to a single Excel workbook.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String word = "小米 电饭煲 锅体开裂,小米 电饭煲 开裂,小米 电饭煲 裂缝,小米 电饭煲 烫伤,小米 电饭煲 变形";

        List<Map<String, Object>> bodyList = new ArrayList<Map<String, Object>>();
        for (String phrase : word.split(",")) {
            List<Map<String, Object>> hits = Xiaomi.getXiaomiByWordData(phrase, null);
            if (hits == null || hits.size() <= 0) {
                continue;
            }
            bodyList.addAll(hits);
        }

        // Column keys of the exported sheet, in output order.
        List<String> headList = new ArrayList<String>();
        for (String column : new String[] {"title", "time", "source", "url", "content"}) {
            headList.add(column);
        }

        PoiExcelUtil exporter = PoiExcelUtil.getInstance();
        exporter.exportExcel("D:\\crawlerdata\\小米-关键词电饭煲相关.xlsx", "小米社区采集", headList, bodyList);
    }
}
//
package com.zhiwei.crawler;
//
//
import java.util.ArrayList;
//
import java.util.List;
//
import java.util.Map;
//
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.Xiaomi;
//
//
public class XiaomiShequByWordExample {
//
//
public static void main(String[] args) {
//
String word = "小米 电饭煲 锅体开裂,小米 电饭煲 开裂,小米 电饭煲 裂缝,小米 电饭煲 烫伤,小米 电饭煲 变形";
//
//
//
String[] words = word.split(",");
//
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
//
for(String w : words) {
//
List<Map<String,Object>> dataList = Xiaomi.getXiaomiByWordData(w,null);
//
if(dataList != null && dataList.size() > 0) {
//
bodyList.addAll(dataList);
//
}
//
}
//
List<String> headList = new ArrayList<String>();
//
headList.add("title");
//
headList.add("time");
//
headList.add("source");
//
headList.add("url");
//
headList.add("content");
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
//
poi.exportExcel("D:\\crawlerdata\\小米-关键词电饭煲相关.xlsx", "小米社区采集", headList, bodyList);
//
//
}
//
//
}
src/test/java/com/zhiwei/crawler/XiguaAccountExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.XiGua
;
public class XiguaAccountExample {

    /**
     * Reads account home-page links from the first sheet of the input workbook,
     * downloads the videos each account published since the start time and
     * exports all of them to Excel.
     */
    @SuppressWarnings("unchecked")
    @Test
    public void xiguaAccountTest() {
        String path = "D:\\crawlerdata\\西瓜视频采集12.28.xlsx";
        String startTime = "2017-01-01 00:00:00";
        //2017-01-01 00:00:00
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        List<Map<String, Object>> bodyList = new ArrayList<Map<String, Object>>();
        Map<String, Object> map = poi.importExcel(path, 0);
        List<Map<String, Object>> lists = (List<Map<String, Object>>) map.get("body");

        for (Map<String, Object> map1 : lists) {
            // Concatenation never yields null; a missing cell becomes "null",
            // which the length check below filters out.
            String url = map1.get("主页") + "";
            if (url.length() > 5) {
                List<Map<String, Object>> lists1 = XiGua.getXiguaAccountData(url, startTime, null);
                // Test the fetched data, not the input rows.
                if (lists1 != null && lists1.size() > 0) {
                    bodyList.addAll(lists1);
                }
            }
        }

        List<String> headList = new ArrayList<String>();
        headList.add("title");
        headList.add("comments_count");
        headList.add("time");
        headList.add("content");
        headList.add("url");
        headList.add("video_watch_count");
        headList.add("source");
        poi.exportExcel(path, "数据采集结果", headList, bodyList);
    }
}
//
package com.zhiwei.crawler;
//
//
import java.util.ArrayList;
//
import java.util.List;
//
import java.util.Map;
//
//
import org.junit.Test;
//
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.XiGua;
//
//
public class XiguaAccountExample {
//
//
@Test
//
public void xiguaAccountTest() {
//
String path = "D:\\crawlerdata\\西瓜视频采集12.28.xlsx";
//
String startTime = "2017-01-01 00:00:00";
//
//2017-01-01 00:00:00
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
//
Map<String,Object> map = poi.importExcel(path, 0);
//
List<Map<String,Object>> lists = (List<Map<String, Object>>) map.get("body");
//
for(Map<String,Object> map1 : lists ) {
//
String url = map1.get("主页")+"";
//
if(url != null && url.length() > 5) {
//
List<Map<String,Object>> lists1 = XiGua.getXiguaAccountData(url,startTime,null);
//
if(lists1 != null && lists.size() > 0) {
//
bodyList.addAll(lists1);
//
}
//
}
//
}
//
List<String> headList = new ArrayList<String>();
//
headList.add("title");
//
headList.add("comments_count");
//
headList.add("time");
//
headList.add("content");
//
headList.add("url");
//
headList.add("video_watch_count");
//
headList.add("source");
//
poi.exportExcel(path, "数据采集结果", headList, bodyList);
//
}
//
//
//
}
src/test/java/com/zhiwei/crawler/XiguaByWordExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.XiGua
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
/**
 * Sample runner that gathers Xigua video rows for a fixed list of keywords
 * and writes everything that was collected into one Excel workbook.
 */
public class XiguaByWordExample {

    /**
     * Calls {@code XiGua.getXiguaVideoByWordData} once per keyword, keeps every
     * non-empty result, and exports the accumulated rows through
     * {@code PoiExcelUtil.exportExcel}.
     */
    @Test
    public void XiguaByWordTest() {
        String keywordCsv = "美食,味道,吃,试吃,美味,好吃";
        List<Map<String, Object>> bodyList = new ArrayList<Map<String, Object>>();

        for (String keyword : keywordCsv.split(",")) {
            List<Map<String, Object>> found = XiGua.getXiguaVideoByWordData(keyword, null);
            if (found != null && !found.isEmpty()) {
                bodyList.addAll(found);
            }
            // Pause after every keyword (presumably milliseconds -- confirm against ZhiWeiTools).
            ZhiWeiTools.sleep(5000);
            System.out.println("============总数" + bodyList.size());
        }

        PoiExcelUtil poi = PoiExcelUtil.getInstance();

        // Column keys handed to the exporter, in output order.
        String[] columns = {
            "title", "time", "content", "like", "unlike",
            "play_count", "source", "comment_count", "url"
        };
        List<String> headList = new ArrayList<String>();
        for (String column : columns) {
            headList.add(column);
        }

        poi.exportExcel("D://crawlerdata/西瓜美食-1.xlsx", "西瓜好吃不", headList, bodyList);
    }
}
//
package com.zhiwei.crawler;
//
//
import java.util.ArrayList;
//
import java.util.List;
//
import java.util.Map;
//
//
import org.junit.Test;
//
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.XiGua;
//
import com.zhiwei.tools.tools.ZhiWeiTools;
//
//
public class XiguaByWordExample {
//
//
//
@Test
//
public void XiguaByWordTest() {
//
String word = "美食,味道,吃,试吃,美味,好吃";
//
String[] words = word.split(",");
//
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
//
for(String w : words) {
//
List<Map<String,Object>> list = XiGua.getXiguaVideoByWordData(w,null);
//
if(list != null && list.size() > 0) {
//
bodyList.addAll(list);
//
}
//
ZhiWeiTools.sleep(5000);
//
System.out.println("============总数" + bodyList.size());
//
}
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
List<String> headList = new ArrayList<String>();
//
headList.add("title");
//
headList.add("time");
//
headList.add("content");
//
headList.add("like");
//
headList.add("unlike");
//
headList.add("play_count");
//
headList.add("source");
//
headList.add("comment_count");
//
headList.add("url");
//
//
poi.exportExcel("D://crawlerdata/西瓜美食-1.xlsx", "西瓜好吃不", headList, bodyList);
//
//
}
//
//
//
//
}
src/test/java/com/zhiwei/crawler/YidainzixunByWordExample.java
View file @
9234d24c
...
...
@@ -35,7 +35,7 @@ public class YidainzixunByWordExample {
headList
.
add
(
"time"
);
headList
.
add
(
"url"
);
System
.
out
.
println
(
listAll
.
size
());
poi
.
exportExcel
(
"D://crawlerdata/一点资讯-
美食
.xlsx"
,
"asd"
,
headList
,
listAll
);
poi
.
exportExcel
(
"D://crawlerdata/一点资讯-
软博会
.xlsx"
,
"asd"
,
headList
,
listAll
);
}
...
...
src/test/java/com/zhiwei/crawler/YidianzixunCommentExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
import
java.util.List
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.parse.Yidianzixun
;
/**
 * Sample runner that fetches the comments of one Yidianzixun article and
 * prints them to standard output.
 */
public class YidianzixunCommentExample {

    /**
     * Requests the comment rows for a hard-coded article URL via
     * {@code Yidianzixun.getYidianzixunCommentData}, prints how many rows came
     * back, then prints each row.
     */
    @Test
    public void yidianzixunCommentTest() {
        String articleUrl = "http://www.yidianzixun.com/article/0ILHigvv";
        List<Map<String, Object>> comments =
                Yidianzixun.getYidianzixunCommentData(articleUrl, null);

        System.out.println(comments.size());
        for (Map<String, Object> comment : comments) {
            String rendered = comment.toString();
            System.out.println(rendered);
        }
    }
}
//
package com.zhiwei.crawler;
//
//
import java.util.List;
//
import java.util.Map;
//
//
import org.junit.Test;
//
//
import com.zhiwei.parse.Yidianzixun;
//
//
public class YidianzixunCommentExample {
//
//
@Test
//
public void yidianzixunCommentTest() {
//
String url = "http://www.yidianzixun.com/article/0ILHigvv";
//
List<Map<String,Object>> lists = Yidianzixun.getYidianzixunCommentData(url,null);
//
System.out.println(lists.size());
//
for(Map<String,Object> map : lists) {
//
System.out.println(map.toString());
//
}
//
}
//
//
//
}
src/test/java/com/zhiwei/hsitory/BaijiaAccountExample.java
0 → 100644
View file @
9234d24c
//package com.zhiwei.hsitory;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Baijia;
//import com.zhiwei.tools.tools.ZhiWeiTools;
//
//public class BaijiaAccountExample {
//
// @Test
// public void test3() {
// String path = "D://crawlerdata//自媒体/百家号采集.xlsx";
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// String startTime = "2018-05-01 00:00:00";
// Map<String,Object> map = poi.importExcel(path, 0);
// List<Map<String,Object>> list = (List<Map<String, Object>>) map.get("body");
// List<Map<String,Object>> bodyList = new ArrayList<>();
// for(Map<String,Object> m : list) {
// try {
// String app_id = m.get("id").toString();
// app_id = "1594158489045754";
// String cookie = "__cfduid=d847baca85b97d1967b3da02ebb345b831535524251; BAIDUID=C0F0F81EF770C5219AB9C178654135EC:FG=1; PSTM=1536376257; BIDUPSID=250CCE0442BEBCB3568D8EC515953434; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; locale=zh; delPer=0; H_PS_PSSID=1447_21117_20930; PSINO=5";
// List<Map<String,Object>> lists = Baijia.getBaijiaAccountByBaiduData(app_id, startTime,cookie, null);
// if(lists != null) {
// bodyList.addAll(lists);
// }
// break;
// } catch (Exception e) {
// }
// }
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("time");
// headList.add("source");
// headList.add("url");
// headList.add("content");
// headList.add("read_amount");
// poi.exportExcel("D://crawlerdata//历史文章采集/百家号-lxj-2.xlsx", "娱乐资本论", headList, bodyList);
// }
//
//}
src/test/java/com/zhiwei/hsitory/FenghuangAccountExample.java
0 → 100644
View file @
9234d24c
//package com.zhiwei.hsitory;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.crawler.proxy.ProxyHolder;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Fenghuang;
//
//public class FenghuangAccountExample {
//
// @Test
// public void fenghuangAccountTest() {
// //所用时间长 1s1篇文章吧
// //https://api.3g.ifeng.com/client_search_subscribe?k=号外财经
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
// String id = "1165210";
// String[] ids = id.split(",");
// String startTime = "2010-05-01 00:00:00"; //可为空
// for(int i = 0;i < ids.length;i++) {
// try {
// List<Map<String,Object>> dataList = Fenghuang.getFenghuangAccountData(ids[i], startTime,ProxyHolder.NAT_HEAVY_PROXY);
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("time");
// headList.add("content");
// headList.add("source");
// headList.add("url");
// headList.add("id");
// poi.exportExcel("D://crawlerdata//历史文章采集/凤凰-三言财经.xlsx", ids[i], headList, dataList);
// } catch (Exception e) {
// continue;
// }
// }
// }
//
//}
src/test/java/com/zhiwei/hsitory/SouhuAccountExample.java
0 → 100644
View file @
9234d24c
package
com
.
zhiwei
.
hsitory
;
//package com.zhiwei.crawler;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Souhu;
//
//
//public class SouhuAccountExample {
//
// //http://search.sohu.com/?keyword=%E8%99%8E%E5%97%85&source=article&queryType=edit&ie=utf8
//
// @Test
// public void souhuAccountTest() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",GroupType.PROVIDER);
// List<Map<String,Object>> lists = Souhu.getSouHuAccountData("99938933","浅黑科技","2018-05-01 00:00:00",false,null);
// System.out.println(lists.size());
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("time");
// headList.add("content");
// headList.add("url");
// headList.add("comment");
// headList.add("tags");
// headList.add("newsid");
// headList.add("source");
// headList.add("newsPv");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("D:\\crawlerdata\\搜狐号历史文章-乔.xlsx", "乔", headList, lists);
// }
//
//}
src/test/java/com/zhiwei/hsitory/TxNewsHostoryExample.java
0 → 100644
View file @
9234d24c
//package com.zhiwei.hsitory;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.crawler.proxy.ProxyHolder;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.TXNews;
//
//public class TxNewsHostoryExample {
//
// public static void main(String[] args) {
//
//
// String url = "6839743";
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
// List<Map<String,Object>> list = TXNews.getTxNewsHistory(url, null,ProxyHolder.NAT_PROXY);
//
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("time");
// headList.add("content");
// headList.add("source");
// headList.add("url");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("D://crawlerdata//历史文章采集/腾讯网-三言财经-right.xlsx", "财联社", headList, list);
//
//
// }
//
//}
src/test/java/com/zhiwei/hsitory/WangyiHistoryExample.java
0 → 100644
View file @
9234d24c
//package com.zhiwei.hsitory;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.crawler.proxy.ProxyHolder;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Wangyi;
//
//public class WangyiHistoryExample {
//
// public static void main(String[] args) {
//
// String url = "T1520579168852";
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
// List<Map<String,Object>> list = Wangyi.getWangyiClientHistory(url, ProxyHolder.NAT_PROXY, "2019-01-01 00:00:00");
//
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("time");
// headList.add("content");
// headList.add("source");
// headList.add("url");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("D://crawlerdata//历史文章采集/网易-三言财经.xlsx", "财联社", headList, list);
//
// }
//
//
//}
src/test/java/com/zhiwei/hsitory/XueqiuHostoryExample.java
0 → 100644
View file @
9234d24c
//package com.zhiwei.hsitory;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Xueqiu;
//
//public class XueqiuHostoryExample {
//
// public static void main(String[] args) {
//
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",GroupType.PROVIDER);
// String cookie = "_ga=GA1.2.2045733994.1547169202; device_id=5a986a59915983c3e2ef8074f80112ec; s=e618lxk3qw; __utmz=1.1547185990.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utma=1.2045733994.1547169202.1548122251.1553047746.3; aliyungf_tc=AQAAAJHA7Vrq7AYAgtgMPALb3ZCQP9o+; _gid=GA1.2.334283760.1554779038; Hm_lvt_1db88642e346389874251b5a1eded6e3=1553046552,1553046993,1553150890,1554779038; _gat=1; remember=1; remember.sig=K4F3faYzmVuqC0iXIERCQf55g2Y; xq_a_token=fed387c342aedea5c7883d1062ae6faf167975d8; xq_a_token.sig=j47ktDdYWr1FOgeL74U6yMCPhOY; xqat=fed387c342aedea5c7883d1062ae6faf167975d8; xqat.sig=oZPD4-6V_GPw-KsnR04L7vxf5oM; xq_r_token=6ffffd472dc300e2f89195a77b8e7064da45d78d; xq_r_token.sig=TPd7Y11kYPcQeOgzXVDApbRQauQ; xq_is_login=1; xq_is_login.sig=J3LxgPVPUzbBg3Kee_PquUfih7Q; u=5878436335; u.sig=j_g6RZ9GzzrgOfIsGHi9O9M1wvc; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1554791719";
// String userId = "7441422641";
//
// List<Map<String,Object>> dataList = Xueqiu.getXueqiuAccountData(userId, cookie, null);
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("time");
// headList.add("source");
// headList.add("content");
// headList.add("repostCount");
// headList.add("commentCount");
// headList.add("likeCount");
// headList.add("url");
// poi.exportExcel("D://crawlerdata//历史文章采集/雪球-三言财经.xlsx", "三言财经", headList, dataList);
//
// }
//
//}
src/test/java/com/zhiwei/
crawler
/YidianzixunAccountExample.java
→
src/test/java/com/zhiwei/
hsitory
/YidianzixunAccountExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
package
com
.
zhiwei
.
hsitory
;
import
java.util.ArrayList
;
import
java.util.List
;
...
...
@@ -6,6 +6,9 @@ import java.util.Map;
import
org.junit.Test
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Yidianzixun
;
...
...
@@ -14,10 +17,10 @@ public class YidianzixunAccountExample {
@Test
public
void
yidianzixunAccountTest
()
{
String
channelid
=
"m23315"
;
ProxyFactory
.
init
(
"zookeeper://192.168.0.36:2181"
,
"local"
,
GroupType
.
PROVIDER
);
String
channelid
=
"m190159"
;
String
startTime
=
"2007-01-01 00:00:00"
;
String
cookie
=
"wuid=90742539356820; wuid_createAt=2019-01-10 11:45:41; UM_distinctid=16835dd9ba11cb-0ef8d17063d93f-671b197c-1fa400-16835dd9ba2243; JSESSIONID=174b8df350cb5400283abedf2c26076357b0b7af0581024f2e39e90532b4edc9; weather_auth=2; DID=node82eee6d174caf2d4; Hm_lvt_15fafbae2b9b11d280c79eff3b840e45=1551686450,1551686458; CNZZDATA1255169715=931563543-1547087800-%7C1551761063; captcha=s%3A6e56492ffceaf88d9f131fa79435464a.TLAhZ1cfwj0vBTjKTO9Qf5qc6QLuipitrEMZjiqm8BM; Hm_lpvt_15fafbae2b9b11d280c79eff3b840e45=1551764582; cn_1255169715_dplus=%7B%22distinct_id%22%3A%20%2216835dd9ba11cb-0ef8d17063d93f-671b197c-1fa400-16835dd9ba2243%22%2C%22sp%22%3A%20%7B%22%24_sessionid%22%3A%200%2C%22%24_sessionTime%22%3A%201547544080%2C%22%24dp%22%3A%200%2C%22%24_sessionPVTime%22%3A%201547544080%7D%2C%22%24_sessionid%22%3A%200%2C%22%24_sessionTime%22%3A%201551765057%2C%22%24dp%22%3A%200%2C%22%24_sessionPVTime%22%3A%201551765057%7D"
;
List
<
Map
<
String
,
Object
>>
dataList
=
Yidianzixun
.
getYidianzixunAccountData
(
channelid
,
startTime
,
null
,
cookie
);
List
<
Map
<
String
,
Object
>>
dataList
=
Yidianzixun
.
getYidianzixunAccountData
(
channelid
,
startTime
,
ProxyHolder
.
NAT_HEAVY_PROXY
,
null
);
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
List
<
String
>
headList
=
new
ArrayList
<
String
>();
headList
.
add
(
"title"
);
...
...
@@ -27,7 +30,7 @@ public class YidianzixunAccountExample {
headList
.
add
(
"source"
);
headList
.
add
(
"url"
);
headList
.
add
(
"summary"
);
poi
.
exportExcel
(
"D://crawlerdata/
一点资讯-m23315.xlsx"
,
"虎嗅
"
,
headList
,
dataList
);
poi
.
exportExcel
(
"D://crawlerdata/
/历史文章采集/一点资讯-新华社中国新三板.xlsx"
,
"新华社中国新三板
"
,
headList
,
dataList
);
}
...
...
src/test/java/com/zhiwei/keyword/XueqiuKeyWord.java
View file @
9234d24c
//package com.zhiwei.keyword;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.testng.annotations.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Xueqiu;
//
//public class XueqiuKeyWord {
// @Test
// public void f() {
//// ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
// String word = "腾讯 股价|腾讯 市值|腾讯 估值|腾讯 投资|腾讯 财报";
// String endTime = "2018-01-01 00:00:00";
// String cookie = "_ga=GA1.2.590296572.1538275545; device_id=812aac42a8874c32ca97893a7b270684; s=ff12lgxbt1; __utma=1.590296572.1538275545.1538280279.1538280279.1; __utmz=1.1538280279.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); aliyungf_tc=AQAAAPtocWy01ggA6tbncwYt4OIDsaiG; xq_a_token=088c6ad5e275496d7c91b8b5b2ecb929bee15772; xq_a_token.sig=NcpAm7FRsAVoMGRMOLrLRveBT7U; xq_r_token=08131eb9a4f33b43b2340fd782f3776c9823d74a; xq_r_token.sig=AZ7au6ICJtTgQJVPybkOJs_Fr54; Hm_lvt_1db88642e346389874251b5a1eded6e3=1539599526,1539913171,1539913178,1541157360; _gid=GA1.2.1817290343.1541157360; u=721541157360383; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1541212289";
//
//
//
// String[] words = word.split("\\|");
//
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
// for(String w : words) {
// System.out.println(w);
//
// List<Map<String, Object>> dataList = Xueqiu.getData(w, endTime, null, cookie);
// System.out.println(w + " ---- " + dataList.size());
// bodyList.addAll(dataList);
// }
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("time");
// headList.add("content");
// headList.add("uper");
// headList.add("url");
// headList.add("likeCount");
// headList.add("replyCount");
// poi.exportExcel("D:\\crawlerdata\\自媒体\\雪球网页采集-腾讯-relevance.xlsx", "马化腾", headList, bodyList);
//
// }
//}
package
com
.
zhiwei
.
keyword
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Xueqiu
;
/**
 * Sample runner that searches Xueqiu for a set of keywords and exports the
 * combined result rows to an Excel workbook.
 */
public class XueqiuKeyWord {

    /**
     * Initialises the proxy factory, runs {@code Xueqiu.getData} once per
     * keyword (keywords are separated by {@code |}), and exports every row that
     * was collected.
     *
     * <p>A keyword whose lookup yields {@code null} is reported and skipped so
     * that rows already gathered for the other keywords are still exported.
     */
    @Test
    public void f() {
        ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
        String word = "软博会|软件博览会";
        String endTime = "2018-01-01 00:00:00";
        // NOTE(review): hard-coded logged-in session cookie; consider loading it from local configuration.
        String cookie = "aliyungf_tc=AQAAADi3G3I7GAEAgtgMPF/W3FeFauXk; device_id=3221abfd50f9f8be2abca9182c338c9e; Hm_lvt_1db88642e346389874251b5a1eded6e3=1561962809; s=d911zeymzo; remember=1; remember.sig=K4F3faYzmVuqC0iXIERCQf55g2Y; xq_a_token=b2a8d68c56272cca3b71e1bcaf04b744b16e9497; xq_a_token.sig=hj7OrhYid1GuAaF-AmLYrPuyFDk; xqat=b2a8d68c56272cca3b71e1bcaf04b744b16e9497; xqat.sig=gker8dcZJ2Ez4s58eIGPa5zCd0w; xq_r_token=b0ad7b059411acac8c9e2f0a40336804bb60d047; xq_r_token.sig=DwNSLIigE8xtwQKLygagcM28Dd0; xq_is_login=1; xq_is_login.sig=J3LxgPVPUzbBg3Kee_PquUfih7Q; u=1273068356; u.sig=UyqQW-br8hcsOyT6anFmS_SFpCs; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1561963129";
        String[] words = word.split("\\|");
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        List<Map<String, Object>> bodyList = new ArrayList<Map<String, Object>>();
        for (String w : words) {
            System.out.println(w);
            List<Map<String, Object>> dataList = Xueqiu.getData(w, endTime, null, cookie);
            if (dataList == null) {
                // Without this guard size()/addAll would throw and discard all rows collected so far.
                System.out.println(w + " ---- null");
                continue;
            }
            System.out.println(w + " ---- " + dataList.size());
            bodyList.addAll(dataList);
        }
        List<String> headList = new ArrayList<String>();
        headList.add("title");
        headList.add("time");
        headList.add("content");
        headList.add("uper");
        headList.add("url");
        headList.add("likeCount");
        headList.add("replyCount");
        poi.exportExcel("D:\\crawlerdata\\自媒体\\雪球网页采集-软博会-relevance.xlsx", "马化腾", headList, bodyList);
    }
}
src/test/java/com/zhiwei/shipin/AiqiyiTest.java
View file @
9234d24c
...
...
@@ -4,7 +4,7 @@ import java.util.ArrayList;
import
java.util.List
;
import
java.util.Map
;
import
org.
testng.annotations
.Test
;
import
org.
junit
.Test
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
...
...
@@ -21,7 +21,7 @@ public class AiqiyiTest {
List
<
String
>
wordList
=
WordReadFile
.
getWords
(
"D://crawlerdata//关键词.txt"
);
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
for
(
String
w
:
wordList
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
Aiqiyi
.
getAiqiyiByWordData
(
w
,
ProxyHolder
.
NAT_PROXY
);
List
<
Map
<
String
,
Object
>>
dataList
=
Aiqiyi
.
getAiqiyiByWordData
(
w
,
ProxyHolder
.
NAT_
HEAVY_
PROXY
);
if
(
dataList
!=
null
&&
dataList
.
size
()
>=
1
)
{
bodyList
.
addAll
(
dataList
);
}
...
...
@@ -34,7 +34,7 @@ public class AiqiyiTest {
headList
.
add
(
"title"
);
headList
.
add
(
"word"
);
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
poi
.
exportExcel
(
"D://crawlerdata/
爱奇艺关键词采集-txh-0320
.xlsx"
,
"数据"
,
headList
,
bodyList
);
poi
.
exportExcel
(
"D://crawlerdata/
/视频/爱奇艺关键词采集-毓婷-0716
.xlsx"
,
"数据"
,
headList
,
bodyList
);
...
...
src/test/java/com/zhiwei/shipin/BilibiliTest.java
View file @
9234d24c
...
...
@@ -4,8 +4,10 @@ import java.util.ArrayList;
import
java.util.List
;
import
java.util.Map
;
import
org.
testng.annotations
.Test
;
import
org.
junit
.Test
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.BiliBili
;
import
com.zhiwei.util.WordReadFile
;
...
...
@@ -13,11 +15,12 @@ import com.zhiwei.util.WordReadFile;
public
class
BilibiliTest
{
@Test
public
void
f
()
{
List
<
String
>
wordList
=
WordReadFile
.
getWords
(
"D://crawlerdata//关键词.txt"
);
ProxyFactory
.
init
(
"zookeeper://192.168.0.36:2181"
,
"local"
,
GroupType
.
PROVIDER
);
List
<
String
>
wordList
=
WordReadFile
.
getWords
(
"D://crawlerdata//关键词-1.txt"
);
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<>();
String
cookie
=
"LIVE_BUVID=AUTO8715300758995538; sid=kp5rluge; fts=1530161621; im_notify_type_35324319=0; buvid3=08ABE6AE-5061-4CE5-B34F-1A8AAB64DB3320712infoc; rpdid=olppsmkxmpdoskwoxiwww; finger=edc6ecda; stardustvideo=1; UM_distinctid=164fe68fb31996-01f161c3523abe-6114167a-1fa400-164fe68fb32274"
;
for
(
String
word
:
wordList
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
BiliBili
.
getData
(
word
,
null
,
List
<
Map
<
String
,
Object
>>
dataList
=
BiliBili
.
getData
(
word
,
null
,
"2019-07-18 00:00:00"
,
cookie
);
if
(
dataList
!=
null
)
{
System
.
out
.
println
(
word
+
" ----- "
+
dataList
.
size
());
...
...
@@ -33,7 +36,7 @@ public class BilibiliTest {
headlist
.
add
(
"url"
);
headlist
.
add
(
"word"
);
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
poi
.
exportExcel
(
"D://crawlerdata//
bilibili关键词采集数据-txh-0320
.xlsx"
,
"B站数据"
,
headlist
,
bodyList
);
poi
.
exportExcel
(
"D://crawlerdata//
视频//bilibili关键词采集数据-吃鸡否-0722
.xlsx"
,
"B站数据"
,
headlist
,
bodyList
);
}
}
src/test/java/com/zhiwei/shipin/QQTVTest.java
View file @
9234d24c
...
...
@@ -4,7 +4,7 @@ import java.util.ArrayList;
import
java.util.List
;
import
java.util.Map
;
import
org.
testng.annotations
.Test
;
import
org.
junit
.Test
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
...
...
@@ -18,11 +18,11 @@ public class QQTVTest {
@Test
public
void
f
()
{
ProxyFactory
.
init
(
"zookeeper://192.168.0.36:2181"
,
"local"
,
GroupType
.
PROVIDER
);
String
time
=
"201
8-01-0
1 00:00:00"
;
String
time
=
"201
9-04-1
1 00:00:00"
;
List
<
String
>
wordList
=
WordReadFile
.
getWords
(
"D://crawlerdata//关键词.txt"
);
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<>();
for
(
String
word
:
wordList
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
QQTV
.
getData
(
word
,
time
,
ProxyHolder
.
NAT_PROXY
);
List
<
Map
<
String
,
Object
>>
dataList
=
QQTV
.
getData
(
word
,
time
,
ProxyHolder
.
NAT_
HEAVY_
PROXY
);
if
(
dataList
!=
null
)
{
System
.
out
.
println
(
word
+
" ----- "
+
dataList
.
size
());
bodyList
.
addAll
(
dataList
);
...
...
@@ -37,7 +37,7 @@ public class QQTVTest {
headlist
.
add
(
"url"
);
headlist
.
add
(
"word"
);
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
poi
.
exportExcel
(
"D://crawlerdata//
腾讯视频关键词采集数据-txh-0320
.xlsx"
,
"腾讯视频数据"
,
headlist
,
bodyList
);
poi
.
exportExcel
(
"D://crawlerdata//
视频//腾讯视频关键词采集数据-毓婷-0716
.xlsx"
,
"腾讯视频数据"
,
headlist
,
bodyList
);
...
...
src/test/java/com/zhiwei/shipin/SohuTVTest.java
View file @
9234d24c
...
...
@@ -4,7 +4,7 @@ import java.util.ArrayList;
import
java.util.List
;
import
java.util.Map
;
import
org.
testng.annotations
.Test
;
import
org.
junit
.Test
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.shipin.SohuTV
;
...
...
@@ -33,7 +33,7 @@ public class SohuTVTest {
headlist
.
add
(
"url"
);
headlist
.
add
(
"word"
);
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
poi
.
exportExcel
(
"D://crawlerdata//
搜狐视频关键词采集数据-txh-0320
.xlsx"
,
"搜狐数据"
,
headlist
,
bodyList
);
poi
.
exportExcel
(
"D://crawlerdata//
视频//搜狐视频关键词采集数据-毓婷-0716
.xlsx"
,
"搜狐数据"
,
headlist
,
bodyList
);
}
}
src/test/java/com/zhiwei/shipin/YoukuKeyWordTest.java
View file @
9234d24c
...
...
@@ -4,7 +4,7 @@ import java.util.ArrayList;
import
java.util.List
;
import
java.util.Map
;
import
org.
testng.annotations
.Test
;
import
org.
junit
.Test
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
...
...
@@ -30,7 +30,7 @@ public class YoukuKeyWordTest {
headList
.
add
(
"uper"
);
headList
.
add
(
"word"
);
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
poi
.
exportExcel
(
"D://crawlerdata//
优酷数据-txh-0320
.xlsx"
,
"数据"
,
headList
,
bodyList
);
poi
.
exportExcel
(
"D://crawlerdata//
视频//优酷数据-毓婷-0716
.xlsx"
,
"数据"
,
headList
,
bodyList
);
}
}
src/test/java/com/zhiwei/user/MaimaiTest.java
View file @
9234d24c
//package com.zhiwei.user;
//
//import java.util.ArrayList;
//import java.util.Arrays;
//import java.util.List;
//import java.util.Map;
//
//import org.testng.annotations.Test;
//
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Maimai;
//
//public class MaimaiTest {
// @Test
// public void maimaiUserCrawler() {
// String path = "D:\\crawlerdata\\脉脉用户.xlsx";
// String word = "美团|美团网|大众点评|美团点评|摩拜|猫眼|榛果|三快科技|三快在线";
// String cookie = "_buuid=ba30f54f-57ed-4dd4-af5f-31cb08d2eacf; sessionid=lejfy3gdu5tf9x9zowxfhtq5o73dubc5; guid=GxsfBBgZGwQYGx4EGBkeVgcYGxkdHhMeHhkcVhwZBB0ZHwVDWEtMS3kKEhMEEh0fGQQaBBsdBU9HRVhCaQoDRUFJT20KT0FDRgoGZmd+YmECChwZBB0ZHwVeQ2FIT31PRlpaawoDHhx9ZX0KERkEHAp+ZApZXUVOREN9AgoaBB8FS0ZGQ1BFZw==; seid=s1550629286782; token=\"OCY36EFdeYzGytlQFyKRdM0DcXNdViYI02kT4QbUMpaSk/CqMXrqBOx8EFo5/fQU8CKuzcDfAvoCmBm7+jVysA==\"; uid=\"q1bNxxk8WW3MzjbCfKr/hfAirs3A3wL6ApgZu/o1crA=\"; session=eyJ1IjoiMTc2NjQ0NzY1Iiwic2VjcmV0IjoiLXFsV2c2Ym9feEJqOWxQbWdWTjcwWWg3Iiwic3RhdHVzIjp0cnVlLCJtaWQ0NTY4NzYwIjpmYWxzZSwiX2V4cGlyZSI6MTU1MDcxNTc2NzgwMSwiX21heEFnZSI6ODY0MDAwMDB9; session.sig=lVCTA7DLvo1K_r_bTjbQOH13Alc";
// String[] words = word.split("\\|");
// List<Map<String,Object>> bodyList = new ArrayList<>();
// for(String w : words) {
// bodyList.addAll(Maimai.getUserList(w, cookie, null));
// }
// List<String> headList = Arrays.asList("id","name","gender","url","rank","compos","city");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel(path, "result", headList, bodyList);
// }
//}
package
com
.
zhiwei
.
user
;
import
java.util.ArrayList
;
import
java.util.Arrays
;
import
java.util.List
;
import
java.util.Map
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Maimai
;
/**
 * Command-line runner that searches Maimai users for a set of keywords and
 * exports the combined user rows to an Excel workbook.
 */
public class MaimaiTest {

    /**
     * Runs {@code Maimai.getUserList} once per keyword (keywords are separated
     * by {@code |}) and exports every row that was collected.
     *
     * <p>A keyword whose lookup yields {@code null} is reported and skipped so
     * that rows already gathered for the other keywords are still exported.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String path = "D:\\crawlerdata\\用户采集\\脉脉用户.xlsx";
        String word = "巨量引擎|巨量 引擎|巨 量 引 擎|巨 量 引擎|巨量引 擎";
        // NOTE(review): hard-coded logged-in session cookie; consider loading it from local configuration.
        String cookie = "_buuid=ba30f54f-57ed-4dd4-af5f-31cb08d2eacf; guid=HBoEGxgEGBscBBsZGlYHGBseHxoYGhIZHFYcGQQdGR8FQ1hLTEt5ChITBBIdHxkEGgQbHQVPR0VYQmkKA0VBSU9tCk9BQ0YKBmZnfmJhAgocGQQdGR8FXkNhSE99T0ZaWmsKAx4cfWV9ChEZBBwKfmQKWV1FTkRDfQIKGgQfBUtGRkNQRWc=; seid=s1553309971270; token=\"iUifMkpE9YKuFpz0yEj+jiWpUqM6IXvEvwWKzdd/jK8YgrWsT1/Ku7k9bkIRRYvG8CKuzcDfAvoCmBm7+jVysA==\"; uid=\"aa0Zx+VbwC41ceG8bxvIefAirs3A3wL6ApgZu/o1crA=\"; session=eyJ1IjoiMTczMzMzNTM2Iiwic2VjcmV0IjoidzdPUkhMelktVS1iN1Nsb3VxLXZQV2JvIiwic3RhdHVzIjp0cnVlLCJfZXhwaXJlIjoxNTUzMzk2Mzk0MzczLCJfbWF4QWdlIjo4NjQwMDAwMH0=; session.sig=zGIN7VMizkYf1v48nLqTGAG1k8U";
        String[] words = word.split("\\|");
        List<Map<String, Object>> bodyList = new ArrayList<>();
        for (String w : words) {
            List<Map<String, Object>> users = Maimai.getUserList(w, cookie, null);
            if (users == null) {
                // addAll(null) would throw and discard all rows collected so far.
                System.out.println(w + " ---- null");
                continue;
            }
            bodyList.addAll(users);
        }
        List<String> headList = Arrays.asList("id", "name", "gender", "url", "rank", "compos", "city");
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        poi.exportExcel(path, "result", headList, bodyList);
    }
}
src/test/java/com/zhiwei/user/QQkandianExample.java
View file @
9234d24c
package
com
.
zhiwei
.
user
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
org.testng.annotations
.Test
;
import
com.zhiwei.bean.QQKandianUser
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.QQKandian
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
/**
 * Sample runner that reads channel names from an Excel sheet, looks up the
 * matching QQ Kandian users, and writes the users back to the same workbook
 * path.
 */
public class QQkandianExample {

    /**
     * Imports sheet 0 of the workbook, calls {@code QQKandian.getUser} for the
     * value in each row's "渠道" column, and exports name / url / verify flag /
     * description for every user found.
     */
    @Test
    public void f() {
        QQKandian crawler = new QQKandian();
        String path = "D:\\crawlerdata\\用户采集\\qq看点用户.xlsx";
        PoiExcelUtil poi = PoiExcelUtil.getInstance();

        Map<String, Object> sheet = poi.importExcel(path, 0);
        List<Map<String, Object>> rows = (List<Map<String, Object>>) sheet.get("body");

        List<QQKandianUser> collected = new ArrayList<QQKandianUser>();
        for (Map<String, Object> row : rows) {
            String channel = row.get("渠道") + "";
            System.out.println(channel);

            List<QQKandianUser> users = crawler.getUser(channel, null);
            if (users == null) {
                System.out.println(channel + "--- null");
            } else {
                System.out.println(users.size());
                collected.addAll(users);
            }
            // Pause after every lookup (presumably milliseconds -- confirm against ZhiWeiTools).
            ZhiWeiTools.sleep(3000);
        }

        List<String> headList = new ArrayList<String>();
        headList.add("name");
        headList.add("url");
        headList.add("verity");
        headList.add("desc");

        // Flatten each user bean into a column-key -> value row for the exporter.
        List<Map<String, Object>> bodyList = new ArrayList<Map<String, Object>>();
        for (QQKandianUser user : collected) {
            Map<String, Object> out = new HashMap<String, Object>();
            out.put("name", user.getName());
            out.put("url", user.getUrl());
            out.put("verity", user.isVerify());
            out.put("desc", user.getDesc());
            bodyList.add(out);
        }

        poi.exportExcel(path, "数据完成后", headList, bodyList);
    }
}
//
package com.zhiwei.user;
//
//
import java.util.ArrayList;
//
import java.util.HashMap;
//
import java.util.List;
//
import java.util.Map;
//
//import org.junit
.Test;
//
//
import com.zhiwei.bean.QQKandianUser;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.QQKandian;
//
import com.zhiwei.tools.tools.ZhiWeiTools;
//
//
public class QQkandianExample {
//
//
@Test
//
public void f() {
//
QQKandian qqKandian = new QQKandian();
//
String path = "D:\\crawlerdata\\用户采集\\qq看点用户.xlsx";
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
Map<String,Object> map = poi.importExcel(path, 0);
//
//
List<Map<String,Object>> dataList = (List<Map<String, Object>>) map.get("body");
//
List<QQKandianUser> allList = new ArrayList<QQKandianUser>();
//
for(Map<String,Object> m : dataList) {
//
String name = m.get("渠道")+"";
//
System.out.println(name);
//
List<QQKandianUser> qqKandianUsers = qqKandian.getUser(name, null);
//
if(qqKandianUsers != null) {
//
System.out.println(qqKandianUsers.size());
//
allList.addAll(qqKandianUsers);
//
}else {
//
System.out.println( name + "--- null");
//
}
//
ZhiWeiTools.sleep(3000);
//
}
//
List<String> headList = new ArrayList<String>();
//
headList.add("name");
//
headList.add("url");
//
headList.add("verity");
//
headList.add("desc");
//
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
//
for(QQKandianUser qqKandianUser : allList) {
//
Map<String,Object> m = new HashMap<String,Object>();
//
m.put("name", qqKandianUser.getName());
//
m.put("url", qqKandianUser.getUrl());
//
m.put("verity", qqKandianUser.isVerify());
//
m.put("desc", qqKandianUser.getDesc());
//
bodyList.add(m);
//
}
//
poi.exportExcel(path, "数据完成后", headList, bodyList);
//
}
//
//
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment