Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
A
articlenewscrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
chenweiyang
articlenewscrawler
Commits
9234d24c
Commit
9234d24c
authored
Jul 26, 2019
by
yangchen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
更新
parent
cb5516a0
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
90 changed files
with
2110 additions
and
1764 deletions
+2110
-1764
pom.xml
+5
-20
src/main/java/com/zhiwei/httpclient/HeadGet.java
+5
-5
src/main/java/com/zhiwei/httpclient/HttpClient.java
+15
-3
src/main/java/com/zhiwei/parse/Aika.java
+2
-1
src/main/java/com/zhiwei/parse/Aiqiyi.java
+1
-1
src/main/java/com/zhiwei/parse/Baijia.java
+30
-9
src/main/java/com/zhiwei/parse/BiliBili.java
+14
-18
src/main/java/com/zhiwei/parse/Chejia.java
+1
-1
src/main/java/com/zhiwei/parse/Dayu.java
+5
-7
src/main/java/com/zhiwei/parse/Douban.java
+1
-1
src/main/java/com/zhiwei/parse/Fenghuang.java
+6
-5
src/main/java/com/zhiwei/parse/Gftai.java
+1
-1
src/main/java/com/zhiwei/parse/KuaiTousu.java
+1
-1
src/main/java/com/zhiwei/parse/Maimai.java
+1
-1
src/main/java/com/zhiwei/parse/Pcauto.java
+1
-1
src/main/java/com/zhiwei/parse/QQKB.java
+3
-2
src/main/java/com/zhiwei/parse/QQNews.java
+1
-1
src/main/java/com/zhiwei/parse/QicheHome.java
+1
-1
src/main/java/com/zhiwei/parse/SinaKeji.java
+1
-1
src/main/java/com/zhiwei/parse/SinaTousu.java
+1
-1
src/main/java/com/zhiwei/parse/Souhu.java
+14
-39
src/main/java/com/zhiwei/parse/TXNews.java
+44
-1
src/main/java/com/zhiwei/parse/TechTx.java
+1
-1
src/main/java/com/zhiwei/parse/Wangyi.java
+67
-5
src/main/java/com/zhiwei/parse/Xueqiu.java
+21
-13
src/main/java/com/zhiwei/parse/Yangshi.java
+69
-0
src/main/java/com/zhiwei/parse/Yiche.java
+1
-1
src/main/java/com/zhiwei/parse/Yidianzixun.java
+8
-9
src/main/java/com/zhiwei/parse/Youku.java
+3
-3
src/main/java/com/zhiwei/parse/analysis/BaijiaAccountAnalysis.java
+8
-10
src/main/java/com/zhiwei/parse/analysis/BilibilikeyWordAnalysis.java
+7
-4
src/main/java/com/zhiwei/parse/analysis/DayuAccountAnalysis.java
+1
-1
src/main/java/com/zhiwei/parse/analysis/DayuByWordAnalysis.java
+1
-1
src/main/java/com/zhiwei/parse/analysis/FenghuangAccountAnalysis.java
+47
-58
src/main/java/com/zhiwei/parse/analysis/FenghuangCommentAnalysis.java
+1
-1
src/main/java/com/zhiwei/parse/analysis/SouhuAccountAnalysis.java
+9
-15
src/main/java/com/zhiwei/parse/analysis/SouhuCommentAnalysis.java
+1
-1
src/main/java/com/zhiwei/parse/shipin/QQTV.java
+26
-23
src/main/java/com/zhiwei/parse/shipin/SohuTV.java
+1
-1
src/test/java/com/zhiwei/Comment/AikaComment.java
+2
-4
src/test/java/com/zhiwei/Comment/AiqiyiHotCountTest.java
+33
-33
src/test/java/com/zhiwei/Comment/MaimaiCommentCountTest.java
+21
-20
src/test/java/com/zhiwei/Comment/YoukuHotCountTest.java
+36
-37
src/test/java/com/zhiwei/TestHttpBoot.java
+21
-38
src/test/java/com/zhiwei/crawler/DayuAccountExample.java
+54
-50
src/test/java/com/zhiwei/crawler/DayuCommentCountExample.java
+19
-19
src/test/java/com/zhiwei/crawler/DayuCommentExample.java
+65
-65
src/test/java/com/zhiwei/crawler/FenghuangAccountExample.java
+0
-39
src/test/java/com/zhiwei/crawler/FenghuangByWordExample.java
+45
-45
src/test/java/com/zhiwei/crawler/FenghuangCommentCountExample.java
+23
-23
src/test/java/com/zhiwei/crawler/FenghuangCommentExample.java
+61
-61
src/test/java/com/zhiwei/crawler/MaimaiBywordExample.java
+32
-32
src/test/java/com/zhiwei/crawler/MeipaiByWordExample.java
+43
-43
src/test/java/com/zhiwei/crawler/MiaopaiByUrlExample.java
+54
-54
src/test/java/com/zhiwei/crawler/PearVideoByWordExample.java
+33
-33
src/test/java/com/zhiwei/crawler/QQAccountExample.java
+52
-52
src/test/java/com/zhiwei/crawler/QQKBByWordExample.java
+20
-20
src/test/java/com/zhiwei/crawler/QQKBCommentCountExample.java
+21
-21
src/test/java/com/zhiwei/crawler/QQKBCommentExample.java
+51
-51
src/test/java/com/zhiwei/crawler/QQNewsCommentListTest.java
+0
-0
src/test/java/com/zhiwei/crawler/SinaCommentListTest.java
+84
-84
src/test/java/com/zhiwei/crawler/SoKuByWordExample.java
+40
-40
src/test/java/com/zhiwei/crawler/SouhuAccountExample.java
+38
-35
src/test/java/com/zhiwei/crawler/SouhuCommentCountExample.java
+48
-48
src/test/java/com/zhiwei/crawler/SouhuCommentExample.java
+62
-62
src/test/java/com/zhiwei/crawler/TXNewsByWordExample.java
+27
-27
src/test/java/com/zhiwei/crawler/Test1.java
+20
-20
src/test/java/com/zhiwei/crawler/WangyiCommentCountExample.java
+51
-51
src/test/java/com/zhiwei/crawler/WangyiCommentExample.java
+60
-60
src/test/java/com/zhiwei/crawler/WangyiHistoryExample.java
+30
-30
src/test/java/com/zhiwei/crawler/XiaomiShequByWordExample.java
+35
-35
src/test/java/com/zhiwei/crawler/XiguaAccountExample.java
+44
-44
src/test/java/com/zhiwei/crawler/XiguaByWordExample.java
+47
-47
src/test/java/com/zhiwei/crawler/YidainzixunByWordExample.java
+1
-1
src/test/java/com/zhiwei/crawler/YidianzixunCommentExample.java
+23
-23
src/test/java/com/zhiwei/hsitory/BaijiaAccountExample.java
+46
-0
src/test/java/com/zhiwei/hsitory/FenghuangAccountExample.java
+43
-0
src/test/java/com/zhiwei/hsitory/SouhuAccountExample.java
+39
-0
src/test/java/com/zhiwei/hsitory/TxNewsHostoryExample.java
+34
-0
src/test/java/com/zhiwei/hsitory/WangyiHistoryExample.java
+33
-0
src/test/java/com/zhiwei/hsitory/XueqiuHostoryExample.java
+35
-0
src/test/java/com/zhiwei/hsitory/YidianzixunAccountExample.java
+8
-5
src/test/java/com/zhiwei/keyword/XueqiuKeyWord.java
+47
-46
src/test/java/com/zhiwei/shipin/AiqiyiTest.java
+3
-3
src/test/java/com/zhiwei/shipin/BilibiliTest.java
+7
-4
src/test/java/com/zhiwei/shipin/QQTVTest.java
+4
-4
src/test/java/com/zhiwei/shipin/SohuTVTest.java
+2
-2
src/test/java/com/zhiwei/shipin/YoukuKeyWordTest.java
+2
-2
src/test/java/com/zhiwei/user/MaimaiTest.java
+30
-28
src/test/java/com/zhiwei/user/QQkandianExample.java
+55
-55
No files found.
pom.xml
View file @
9234d24c
...
@@ -3,42 +3,27 @@
...
@@ -3,42 +3,27 @@
<modelVersion>
4.0.0
</modelVersion>
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiwei
</groupId>
<groupId>
com.zhiwei
</groupId>
<artifactId>
articlenewscrawler
</artifactId>
<artifactId>
articlenewscrawler
</artifactId>
<version>
0.1.
3
-SNAPSHOT
</version>
<version>
0.1.
6
-SNAPSHOT
</version>
<name>
articlenewscrawler
</name>
<name>
articlenewscrawler
</name>
<description>
采集凤凰,一点资讯,搜狐历时文章和文章评论
</description>
<description>
采集凤凰,一点资讯,搜狐历时文章和文章评论
</description>
<dependencies>
<dependencies>
<dependency>
<dependency>
<groupId>
org.testng
</groupId>
<artifactId>
testng
</artifactId>
<version>
6.14.3
</version>
</dependency>
<dependency>
<groupId>
com.alibaba
</groupId>
<artifactId>
fastjson
</artifactId>
<version>
1.2.29
</version>
</dependency>
<dependency>
<groupId>
junit
</groupId>
<artifactId>
junit
</artifactId>
<version>
4.11
</version>
</dependency>
<dependency>
<groupId>
com.zhiwei
</groupId>
<groupId>
com.zhiwei
</groupId>
<artifactId>
excelpoi
</artifactId>
<artifactId>
excelpoi
</artifactId>
<version>
0.0.
1
-SNAPSHOT
</version>
<version>
0.0.
3
-SNAPSHOT
</version>
</dependency>
</dependency>
<dependency>
<dependency>
<groupId>
com.zhiwei.tools
</groupId>
<groupId>
com.zhiwei.tools
</groupId>
<artifactId>
zhiwei-tools
</artifactId>
<artifactId>
zhiwei-tools
</artifactId>
<version>
0.1.
2
-SNAPSHOT
</version>
<version>
0.1.
3
-SNAPSHOT
</version>
</dependency>
</dependency>
<dependency>
<dependency>
<groupId>
com.zhiwei.crawler
</groupId>
<groupId>
com.zhiwei.crawler
</groupId>
<artifactId>
crawler-core
</artifactId>
<artifactId>
crawler-core
</artifactId>
<version>
0.3.
0
-RELEASE
</version>
<version>
0.3.
6
-RELEASE
</version>
<scope>
provided
</scope>
<scope>
provided
</scope>
</dependency>
</dependency>
</dependencies>
</dependencies>
<!-- 打包管理 -->
<!-- 打包管理 -->
...
...
src/main/java/com/zhiwei/httpclient/HeadGet.java
View file @
9234d24c
...
@@ -113,14 +113,14 @@ public class HeadGet {
...
@@ -113,14 +113,14 @@ public class HeadGet {
* @throws IOException
* @throws IOException
*/
*/
public
static
Map
<
String
,
String
>
getFenghuangAccountHeaderMap
(
String
cookie
)
{
public
static
Map
<
String
,
String
>
getFenghuangAccountHeaderMap
(
String
cookie
)
{
Map
<
String
,
String
>
headerMap
=
new
HashMap
<
String
,
String
>();
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>();
headerMap
.
put
(
"User-Agent"
,
headerMap
.
put
(
"User-Agent"
,
"
IfengNews/6.1.8 (iPhone; iOS 11.2.1; Scale/2.00)
"
);
"
Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36
"
);
headerMap
.
put
(
"Accept"
,
headerMap
.
put
(
"Accept"
,
"
*/*
"
);
"
text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
"
);
headerMap
.
put
(
"Accept-Language"
,
"zh-
cn
"
);
headerMap
.
put
(
"Accept-Language"
,
"zh-
CN,zh;q=0.9,en;q=0.8
"
);
headerMap
.
put
(
"Connection"
,
"keep-alive"
);
headerMap
.
put
(
"Connection"
,
"keep-alive"
);
headerMap
.
put
(
"Host"
,
"
api.3g
.ifeng.com"
);
headerMap
.
put
(
"Host"
,
"
shankapi
.ifeng.com"
);
if
(
cookie
!=
null
)
{
if
(
cookie
!=
null
)
{
headerMap
.
put
(
"Cookie"
,
cookie
);
headerMap
.
put
(
"Cookie"
,
cookie
);
}
}
...
...
src/main/java/com/zhiwei/httpclient/HttpClient.java
View file @
9234d24c
...
@@ -16,7 +16,7 @@ import okhttp3.Response;
...
@@ -16,7 +16,7 @@ import okhttp3.Response;
public
class
HttpClient
{
public
class
HttpClient
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
HttpClient
.
class
);
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
HttpClient
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
(
false
,
2
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
(
);
/**
/**
*
*
...
@@ -44,15 +44,27 @@ public class HttpClient {
...
@@ -44,15 +44,27 @@ public class HttpClient {
* @throws IOException
* @throws IOException
*/
*/
public
static
String
executeHttpRequestGet
(
String
url
,
ProxyHolder
proxy
,
Map
<
String
,
String
>
headerMap
)
{
public
static
String
executeHttpRequestGet
(
String
url
,
ProxyHolder
proxy
,
Map
<
String
,
String
>
headerMap
)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
)){
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
)){
return
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
logger
.
error
(
"httpClient 获取数据出现问题:{}"
,
e
);
}
}
return
null
;
}
public
static
String
executeHttpRequestPost
(
String
url
,
Proxy
proxy
,
Map
<
String
,
String
>
headerMap
,
Map
<
String
,
Object
>
paramMap
)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapPost
(
url
,
headerMap
,
paramMap
),
proxy
)){
return
response
.
body
().
string
();
return
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
"httpClient 获取数据出现问题:{}"
,
e
);
logger
.
error
(
"httpClient 获取数据出现问题:{}"
,
e
);
return
null
;
return
null
;
}
}
}
}
public
static
String
executeHttpRequestPost
(
String
url
,
Proxy
proxy
,
Map
<
String
,
String
>
headerMap
,
Map
<
String
,
Object
>
paramMap
)
{
public
static
String
executeHttpRequestPost
(
String
url
,
Proxy
Holder
proxy
,
Map
<
String
,
String
>
headerMap
,
Map
<
String
,
Object
>
paramMap
)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapPost
(
url
,
headerMap
,
paramMap
),
proxy
)){
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapPost
(
url
,
headerMap
,
paramMap
),
proxy
)){
return
response
.
body
().
string
();
return
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
...
...
src/main/java/com/zhiwei/parse/Aika.java
View file @
9234d24c
...
@@ -20,7 +20,7 @@ public class Aika {
...
@@ -20,7 +20,7 @@ public class Aika {
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
Aika
.
class
);
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
Aika
.
class
);
private
static
AikaCommentAnalysis
aikaCommentAnalysis
=
new
AikaCommentAnalysis
();
private
static
AikaCommentAnalysis
aikaCommentAnalysis
=
new
AikaCommentAnalysis
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
@SuppressWarnings
(
"unchecked"
)
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
Map
<
String
,
Object
>>
getAikaComment
(
String
url
,
ProxyHolder
proxy
)
{
public
static
List
<
Map
<
String
,
Object
>>
getAikaComment
(
String
url
,
ProxyHolder
proxy
)
{
...
@@ -46,6 +46,7 @@ public class Aika {
...
@@ -46,6 +46,7 @@ public class Aika {
page
++;
page
++;
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
"爱卡汽车 评论采集出错 {}"
,
e
);
logger
.
error
(
"爱卡汽车 评论采集出错 {}"
,
e
);
break
;
}
}
}
}
...
...
src/main/java/com/zhiwei/parse/Aiqiyi.java
View file @
9234d24c
...
@@ -23,7 +23,7 @@ import okhttp3.Response;
...
@@ -23,7 +23,7 @@ import okhttp3.Response;
public
class
Aiqiyi
{
public
class
Aiqiyi
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
Aiqiyi
.
class
);
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
Aiqiyi
.
class
);
private
static
AiqiyiByWordAnalysis
aiqiyiByWordAnalysis
=
new
AiqiyiByWordAnalysis
();
private
static
AiqiyiByWordAnalysis
aiqiyiByWordAnalysis
=
new
AiqiyiByWordAnalysis
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
(
false
,
2
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
(
);
/**
/**
*
*
...
...
src/main/java/com/zhiwei/parse/Baijia.java
View file @
9234d24c
...
@@ -2,8 +2,11 @@ package com.zhiwei.parse;
...
@@ -2,8 +2,11 @@ package com.zhiwei.parse;
import
java.net.Proxy
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
import
java.util.Objects
;
import
org.slf4j.Logger
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
org.slf4j.LoggerFactory
;
...
@@ -23,7 +26,7 @@ import okhttp3.Request;
...
@@ -23,7 +26,7 @@ import okhttp3.Request;
public
class
Baijia
{
public
class
Baijia
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
Baijia
.
class
);
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
Baijia
.
class
);
private
static
BaijiaAccountAnalysis
baijiaAccountAnalysis
=
new
BaijiaAccountAnalysis
();
private
static
BaijiaAccountAnalysis
baijiaAccountAnalysis
=
new
BaijiaAccountAnalysis
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
/**
/**
*
*
...
@@ -77,27 +80,29 @@ public class Baijia {
...
@@ -77,27 +80,29 @@ public class Baijia {
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
headerMap
.
put
(
"cookie"
,
cookie
);
headerMap
.
put
(
"cookie"
,
cookie
);
String
uk
=
getUkData
(
app_id
,
proxy
,
cookie
);
if
(
Objects
.
isNull
(
uk
))
{
return
Collections
.
emptyList
();
}
boolean
f
=
true
;
boolean
f
=
true
;
int
n
=
0
;
String
ctime
=
""
;
while
(
f
)
{
while
(
f
)
{
for
(
int
i
=
1
;
i
<
3
;
i
++)
{
for
(
int
i
=
1
;
i
<
3
;
i
++)
{
try
{
try
{
String
url
=
"https://author.baidu.com/list?type=article&context={%22offset%22:%22-1_"
+
n
+
"%22,%22app_id%22:%22"
+
app_id
+
"%22,%22pageSize%22:20}"
;
String
url
=
"https://author.baidu.com/list?type=article&tab=2&uk="
+
uk
+
"&ctime="
+
ctime
+
"&num=50"
;
System
.
out
.
println
(
url
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
String
result
=
httpBoot
.
syncCall
(
request
,
proxy
,
false
).
body
().
string
();
String
result
=
httpBoot
.
syncCall
(
request
,
proxy
).
body
().
string
();
Map
<
String
,
Object
>
dMap
=
baijiaAccountAnalysis
.
getBaijiaAccountData3
(
result
,
name
,
startTime
);
Map
<
String
,
Object
>
dMap
=
baijiaAccountAnalysis
.
getBaijiaAccountData3
(
result
,
name
,
startTime
);
List
<
Map
<
String
,
Object
>>
dList
=
(
List
<
Map
<
String
,
Object
>>)
dMap
.
get
(
"data"
);
List
<
Map
<
String
,
Object
>>
dList
=
(
List
<
Map
<
String
,
Object
>>)
dMap
.
get
(
"data"
);
dataList
.
addAll
(
dList
);
dataList
.
addAll
(
dList
);
logger
.
info
(
"{} 数据采集结果 {}"
,
name
,
dataList
.
size
());
logger
.
info
(
"{} 数据采集结果 {}"
,
app_id
,
dataList
.
size
());
if
(!(
boolean
)
dMap
.
get
(
"more"
))
{
if
(!(
boolean
)
dMap
.
get
(
"more"
))
{
f
=
false
;
f
=
false
;
}
}
ctime
=
String
.
valueOf
(
dMap
.
get
(
"ctime"
));
ZhiWeiTools
.
sleep
(
3000
);
ZhiWeiTools
.
sleep
(
3000
);
n
+=
20
;
break
;
break
;
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
ZhiWeiTools
.
sleep
(
3000
);
ZhiWeiTools
.
sleep
(
3000
);
}
}
}
}
...
@@ -106,6 +111,22 @@ public class Baijia {
...
@@ -106,6 +111,22 @@ public class Baijia {
return
dataList
;
return
dataList
;
}
}
private
static
String
getUkData
(
String
app_id
,
Proxy
proxy
,
String
cookie
)
{
String
url
=
"https://author.baidu.com/profile?context={\"from\":0,\"app_id\":\""
+
app_id
+
"\"}&cmdType=&pagelets[]=root&reqID=0&ispeed=1#"
;
Map
<
String
,
Object
>
headers
=
new
HashMap
<>();
headers
.
put
(
"Host"
,
"author.baidu.com"
);
headers
.
put
(
"cookie"
,
cookie
);
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
try
{
String
result
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headers
),
proxy
).
body
().
string
();
return
result
.
split
(
"uk\\\\\":\\\\\""
)[
1
].
split
(
"\\\\\","
)[
0
];
}
catch
(
Exception
e
)
{
logger
.
error
(
"百家号uk 获取失败"
);
}
}
return
null
;
}
/**
/**
*
*
* @Description 百家号历史文章采集
* @Description 百家号历史文章采集
...
@@ -114,7 +135,7 @@ public class Baijia {
...
@@ -114,7 +135,7 @@ public class Baijia {
* @return
* @return
*/
*/
public
static
List
<
Map
<
String
,
Object
>>
getBaijiaAccountData
(
String
app_id
,
String
startTime
,
Proxy
proxy
)
{
public
static
List
<
Map
<
String
,
Object
>>
getBaijiaAccountData
(
String
app_id
,
String
startTime
,
Proxy
proxy
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>
>();
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
int
i
=
0
;
int
i
=
0
;
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getBaijiaAccountHeaderMap
(
null
);
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getBaijiaAccountHeaderMap
(
null
);
try
{
try
{
...
...
src/main/java/com/zhiwei/parse/BiliBili.java
View file @
9234d24c
...
@@ -12,28 +12,28 @@ import org.slf4j.Logger;
...
@@ -12,28 +12,28 @@ import org.slf4j.Logger;
import
org.slf4j.LoggerFactory
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.HttpRequestBuilder
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.parse.analysis.BilibilikeyWordAnalysis
;
import
com.zhiwei.parse.analysis.BilibilikeyWordAnalysis
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Headers
;
import
okhttp3.Headers
;
import
okhttp3.Request
;
public
class
BiliBili
{
public
class
BiliBili
{
private
static
final
Logger
logger
=
LoggerFactory
.
getLogger
(
BiliBili
.
class
);
private
static
final
Logger
logger
=
LoggerFactory
.
getLogger
(
BiliBili
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
useCookieJar
(
true
).
build
();
@SuppressWarnings
(
"unchecked"
)
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
Map
<
String
,
Object
>>
getData
(
String
word
,
Proxy
proxy
,
String
cookie
)
{
public
static
List
<
Map
<
String
,
Object
>>
getData
(
String
word
,
Proxy
proxy
,
String
endTime
,
String
cookie
)
{
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<>();
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<>();
try
{
try
{
//
String
url
=
"https://search.bilibili.com/all?keyword="
+
URLEncoder
.
encode
(
word
,
"utf-8"
)+
"&order=pubdate&duration=0&tids_1=0"
;
String
url
=
"https://search.bilibili.com/all?keyword="
+
URLEncoder
.
encode
(
word
,
"utf-8"
)+
"&order=pubdate&duration=0&tids_1=0"
;
Headers
header
=
Headers
.
of
(
"cookie"
,
cookie
,
"Referer"
,
"https://www.bilibili.com/"
,
"Host"
,
"search.bilibili.com"
);
Headers
header
=
Headers
.
of
(
"cookie"
,
cookie
,
"Referer"
,
"https://www.bilibili.com/"
,
"Host"
,
"search.bilibili.com"
);
Request
request
=
HttpRequestBuilder
.
newGetRequest
(
url
,
header
);
String
result
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
header
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
String
result
=
httpBoot
.
syncCall
(
request
,
proxy
).
body
().
string
();
ZhiWeiTools
.
sleep
(
100
);
ZhiWeiTools
.
sleep
(
3000
);
Map
<
String
,
Object
>
map
=
BilibilikeyWordAnalysis
.
getData
(
result
,
word
,
endTime
);
Map
<
String
,
Object
>
map
=
BilibilikeyWordAnalysis
.
getData
(
result
,
word
);
boolean
more
=
(
boolean
)
map
.
get
(
"more"
);
boolean
more
=
(
boolean
)
map
.
get
(
"more"
);
List
<
Map
<
String
,
Object
>>
dataList
=
(
List
<
Map
<
String
,
Object
>>)
map
.
get
(
"data"
);
List
<
Map
<
String
,
Object
>>
dataList
=
(
List
<
Map
<
String
,
Object
>>)
map
.
get
(
"data"
);
if
(
dataList
!=
null
)
{
if
(
dataList
!=
null
)
{
...
@@ -43,27 +43,23 @@ public class BiliBili {
...
@@ -43,27 +43,23 @@ public class BiliBili {
while
(
more
)
{
while
(
more
)
{
map
.
clear
();
map
.
clear
();
String
ur
=
url
+
"&page="
+
n
;
String
ur
=
url
+
"&page="
+
n
;
System
.
out
.
println
(
ur
);
String
result2
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
ur
,
header
),
ProxyHolder
.
NAT_HEAVY_PROXY
).
body
().
string
();
request
=
HttpRequestBuilder
.
newGetRequest
(
ur
,
header
);
map
=
BilibilikeyWordAnalysis
.
getData
(
result2
,
word
,
endTime
);
String
result2
=
httpBoot
.
syncCall
(
request
,
proxy
).
body
().
string
();
map
=
BilibilikeyWordAnalysis
.
getData
(
result2
,
word
);
List
<
Map
<
String
,
Object
>>
dataList2
=
(
List
<
Map
<
String
,
Object
>>)
map
.
get
(
"data"
);
List
<
Map
<
String
,
Object
>>
dataList2
=
(
List
<
Map
<
String
,
Object
>>)
map
.
get
(
"data"
);
if
(
dataList2
!=
null
)
{
if
(
dataList2
!=
null
)
{
bodyList
.
addAll
(
dataList2
);
bodyList
.
addAll
(
dataList2
);
}
}
System
.
out
.
println
(
n
+
"页,数据总量为 -- "
+
bodyList
.
size
()
);
logger
.
info
(
"word {} , {} 页,数据总量为 -- {}"
,
word
,
n
,
bodyList
.
size
()
);
more
=
(
boolean
)
map
.
get
(
"more"
);
more
=
(
boolean
)
map
.
get
(
"more"
);
n
++;
n
++;
ZhiWeiTools
.
sleep
(
30
00
);
ZhiWeiTools
.
sleep
(
1
00
);
}
}
return
bodyList
;
return
bodyList
;
}
catch
(
UnsupportedEncodingException
e
)
{
}
catch
(
UnsupportedEncodingException
e
)
{
logger
.
error
(
"e "
,
e
);
logger
.
error
(
"e
{}
"
,
e
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
"e "
,
e
);
logger
.
error
(
"e
{}
"
,
e
);
}
}
return
Collections
.
emptyList
();
return
Collections
.
emptyList
();
}
}
...
...
src/main/java/com/zhiwei/parse/Chejia.java
View file @
9234d24c
...
@@ -25,7 +25,7 @@ import okhttp3.Response;
...
@@ -25,7 +25,7 @@ import okhttp3.Response;
public
class
Chejia
{
public
class
Chejia
{
private
static
final
Logger
logger
=
LoggerFactory
.
getLogger
(
Chejia
.
class
);
private
static
final
Logger
logger
=
LoggerFactory
.
getLogger
(
Chejia
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
/**
/**
*
*
...
...
src/main/java/com/zhiwei/parse/Dayu.java
View file @
9234d24c
...
@@ -11,6 +11,7 @@ import org.slf4j.Logger;
...
@@ -11,6 +11,7 @@ import org.slf4j.Logger;
import
org.slf4j.LoggerFactory
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.parse.analysis.DayuAccountAnalysis
;
import
com.zhiwei.parse.analysis.DayuAccountAnalysis
;
...
@@ -30,26 +31,23 @@ public class Dayu {
...
@@ -30,26 +31,23 @@ public class Dayu {
* @param mid
* @param mid
* @return
* @return
*/
*/
public
static
List
<
Map
<
String
,
Object
>>
getDayuAccountData
(
String
mid
,
String
name
,
String
startTime
,
Proxy
proxy
)
{
public
static
List
<
Map
<
String
,
Object
>>
getDayuAccountData
(
String
mid
,
String
name
,
String
startTime
,
Proxy
Holder
proxy
)
{
int
i
=
1
;
int
i
=
1
;
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getDayuAccountHeaderMap
(
null
);
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getDayuAccountHeaderMap
(
null
);
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>
>();
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
try
{
try
{
while
(
true
)
{
while
(
true
)
{
String
url
=
"http://ff.dayu.com/contents/author/"
+
mid
+
"?biz_id=1002&_size=50&_page="
+
i
+
"&_order_type=published_at&status=1&_fetch=1"
;
String
url
=
"http://ff.dayu.com/contents/author/"
+
mid
+
"?biz_id=1002&_size=50&_page="
+
i
+
"&_order_type=published_at&status=1&_fetch=1"
;
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
proxy
,
headerMap
);
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
proxy
,
headerMap
);
System
.
out
.
println
(
url
);
System
.
out
.
println
(
url
);
List
<
Map
<
String
,
Object
>>
lists
=
dayuAccountAnalysis
.
getDayuAccountData
(
result
,
name
,
startTime
);
List
<
Map
<
String
,
Object
>>
lists
=
dayuAccountAnalysis
.
getDayuAccountData
(
result
,
name
,
startTime
);
if
(
lists
==
null
)
{
if
(
lists
==
null
||
lists
.
isEmpty
())
{
break
;
}
if
(
lists
.
size
()
<
1
)
{
break
;
break
;
}
}
dataList
.
addAll
(
lists
);
dataList
.
addAll
(
lists
);
System
.
out
.
println
(
"================解析第"
+
i
+
"页====此时有数据=="
+
dataList
.
size
());
System
.
out
.
println
(
"================解析第"
+
i
+
"页====此时有数据=="
+
dataList
.
size
());
i
++;
i
++;
ZhiWeiTools
.
sleep
(
70
00
);
ZhiWeiTools
.
sleep
(
1
00
);
}
}
return
dataList
;
return
dataList
;
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
...
...
src/main/java/com/zhiwei/parse/Douban.java
View file @
9234d24c
...
@@ -25,7 +25,7 @@ public class Douban {
...
@@ -25,7 +25,7 @@ public class Douban {
private
static
final
Logger
logger
=
LoggerFactory
.
getLogger
(
Double
.
class
);
private
static
final
Logger
logger
=
LoggerFactory
.
getLogger
(
Double
.
class
);
private
static
DoubanCommentAnalysis
doubanCommentAnalysis
=
new
DoubanCommentAnalysis
();
private
static
DoubanCommentAnalysis
doubanCommentAnalysis
=
new
DoubanCommentAnalysis
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
/**
/**
*
*
...
...
src/main/java/com/zhiwei/parse/Fenghuang.java
View file @
9234d24c
...
@@ -19,6 +19,7 @@ import com.zhiwei.parse.analysis.FenghuangCommentAnalysis;
...
@@ -19,6 +19,7 @@ import com.zhiwei.parse.analysis.FenghuangCommentAnalysis;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
public
class
Fenghuang
{
public
class
Fenghuang
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
Fenghuang
.
class
);
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
Fenghuang
.
class
);
private
static
FenghuangAccountAnalysis
fenghuangAccountAnalysis
=
new
FenghuangAccountAnalysis
();
private
static
FenghuangAccountAnalysis
fenghuangAccountAnalysis
=
new
FenghuangAccountAnalysis
();
private
static
FenghuangCommentAnalysis
fenghuangCommentAnalysis
=
new
FenghuangCommentAnalysis
();
private
static
FenghuangCommentAnalysis
fenghuangCommentAnalysis
=
new
FenghuangCommentAnalysis
();
...
@@ -31,7 +32,7 @@ public class Fenghuang {
...
@@ -31,7 +32,7 @@ public class Fenghuang {
* @param startTime 可不传 格式(2017-12-09 17:53:02)
* @param startTime 可不传 格式(2017-12-09 17:53:02)
* @return
* @return
*/
*/
public
static
List
<
Map
<
String
,
Object
>>
getFenghuangAccountData
(
String
id
,
String
startTime
,
Proxy
proxy
)
{
public
static
List
<
Map
<
String
,
Object
>>
getFenghuangAccountData
(
String
id
,
String
startTime
,
Proxy
Holder
proxy
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
int
i
=
1
;
int
i
=
1
;
boolean
f
=
true
;
boolean
f
=
true
;
...
@@ -39,17 +40,17 @@ public class Fenghuang {
...
@@ -39,17 +40,17 @@ public class Fenghuang {
try
{
try
{
for
(
int
j
=
0
;
j
<
3
;
j
++){
for
(
int
j
=
0
;
j
<
3
;
j
++){
f
=
true
;
f
=
true
;
String
url
=
"http
://api.3g.ifeng.com/api_wemedia_index?followid=weMedia_"
+
id
+
"&page="
+
i
+
"&pagesize=20&tag=article&uid=fe659b7e510444c28a31f88dee7a2747
"
;
String
url
=
"http
s://shankapi.ifeng.com/winter/feng/author/getFengAuthorListData/"
+
id
+
"/doc/"
+
i
+
"/getFengAuthorListData
"
;
List
<
Map
<
String
,
Object
>>
list
=
fenghuangAccountAnalysis
.
getArticleData
(
url
,
startTime
,
proxy
);
List
<
Map
<
String
,
Object
>>
list
=
fenghuangAccountAnalysis
.
getArticleData
(
url
,
startTime
,
proxy
);
if
(
list
!=
null
&&
!
list
.
isEmpty
())
{
if
(
list
!=
null
&&
!
list
.
isEmpty
())
{
dataList
.
addAll
(
list
);
dataList
.
addAll
(
list
);
logger
.
info
(
"
====================采集第 {} 页===共获取数据==
{}"
,
i
,
dataList
.
size
());
logger
.
info
(
"
采集第 {} 页,.共获取数据
{}"
,
i
,
dataList
.
size
());
i
++;
i
++;
ZhiWeiTools
.
sleep
(
20
00
);
ZhiWeiTools
.
sleep
(
1
00
);
break
;
break
;
}
}
f
=
false
;
f
=
false
;
ZhiWeiTools
.
sleep
(
20
00
);
ZhiWeiTools
.
sleep
(
1
00
);
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
"程序出错 {}"
,
e
);
logger
.
error
(
"程序出错 {}"
,
e
);
...
...
src/main/java/com/zhiwei/parse/Gftai.java
View file @
9234d24c
...
@@ -18,7 +18,7 @@ public class Gftai {
...
@@ -18,7 +18,7 @@ public class Gftai {
private
static
final
Logger
logger
=
LoggerFactory
.
getLogger
(
Gftai
.
class
);
private
static
final
Logger
logger
=
LoggerFactory
.
getLogger
(
Gftai
.
class
);
private
static
GftaiAnalysis
gftaiAnalysis
=
new
GftaiAnalysis
();
private
static
GftaiAnalysis
gftaiAnalysis
=
new
GftaiAnalysis
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
public
static
List
<
Map
<
String
,
Object
>>
getData
(
String
word
,
Proxy
proxy
)
{
public
static
List
<
Map
<
String
,
Object
>>
getData
(
String
word
,
Proxy
proxy
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
...
...
src/main/java/com/zhiwei/parse/KuaiTousu.java
View file @
9234d24c
...
@@ -19,7 +19,7 @@ public class KuaiTousu {
...
@@ -19,7 +19,7 @@ public class KuaiTousu {
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
KuaiTousu
.
class
);
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
KuaiTousu
.
class
);
private
static
KuaiTousuAnalysis
kuaiTousuAnalysis
=
new
KuaiTousuAnalysis
();
private
static
KuaiTousuAnalysis
kuaiTousuAnalysis
=
new
KuaiTousuAnalysis
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
public
static
List
<
Map
<
String
,
Object
>>
getData
(
String
word
,
Proxy
proxy
)
{
public
static
List
<
Map
<
String
,
Object
>>
getData
(
String
word
,
Proxy
proxy
)
{
int
page
=
1
;
int
page
=
1
;
...
...
src/main/java/com/zhiwei/parse/Maimai.java
View file @
9234d24c
...
@@ -30,7 +30,7 @@ import okhttp3.Response;
...
@@ -30,7 +30,7 @@ import okhttp3.Response;
public
class
Maimai
{
public
class
Maimai
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
Maimai
.
class
);
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
Maimai
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
private
static
MaimaiBywordAnalysis
maimaiBywordAnalysis
=
new
MaimaiBywordAnalysis
();
private
static
MaimaiBywordAnalysis
maimaiBywordAnalysis
=
new
MaimaiBywordAnalysis
();
...
...
src/main/java/com/zhiwei/parse/Pcauto.java
View file @
9234d24c
...
@@ -22,7 +22,7 @@ public class Pcauto {
...
@@ -22,7 +22,7 @@ public class Pcauto {
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
Pcauto
.
class
);
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
Pcauto
.
class
);
private
static
PcautoCommentAnalysis
pcautoCommentAnalysis
=
new
PcautoCommentAnalysis
();
private
static
PcautoCommentAnalysis
pcautoCommentAnalysis
=
new
PcautoCommentAnalysis
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
@SuppressWarnings
(
"unchecked"
)
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
Map
<
String
,
Object
>>
getPcAutoComment
(
String
url
,
ProxyHolder
proxy
)
{
public
static
List
<
Map
<
String
,
Object
>>
getPcAutoComment
(
String
url
,
ProxyHolder
proxy
)
{
...
...
src/main/java/com/zhiwei/parse/QQKB.java
View file @
9234d24c
...
@@ -13,6 +13,7 @@ import com.alibaba.fastjson.JSONArray;
...
@@ -13,6 +13,7 @@ import com.alibaba.fastjson.JSONArray;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.bean.QQkbUser
;
import
com.zhiwei.bean.QQkbUser
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.parse.analysis.QQKBAccountAnalysis
;
import
com.zhiwei.parse.analysis.QQKBAccountAnalysis
;
...
@@ -120,7 +121,7 @@ public class QQKB {
...
@@ -120,7 +121,7 @@ public class QQKB {
while
(
true
)
{
while
(
true
)
{
try
{
try
{
String
result
=
HttpClient
.
executeHttpRequestPost
(
"http://r.cnews.qq.com/getQQNewsComment"
,
Proxy
Factory
.
getNatProxy
()
,
headerMap
,
paramMap
);
String
result
=
HttpClient
.
executeHttpRequestPost
(
"http://r.cnews.qq.com/getQQNewsComment"
,
Proxy
Holder
.
NAT_HEAVY_PROXY
,
headerMap
,
paramMap
);
paramMap
.
clear
();
paramMap
.
clear
();
List
<
Map
<
String
,
Object
>>
lists
=
qqkbCommentAnalysis
.
getCommentData
(
result
,
null
,
comment_id
,
article_id
,
proxy
);
List
<
Map
<
String
,
Object
>>
lists
=
qqkbCommentAnalysis
.
getCommentData
(
result
,
null
,
comment_id
,
article_id
,
proxy
);
if
(
lists
==
null
||
lists
.
size
()
<
1
)
{
if
(
lists
==
null
||
lists
.
size
()
<
1
)
{
...
@@ -148,7 +149,7 @@ public class QQKB {
...
@@ -148,7 +149,7 @@ public class QQKB {
String
cookie
=
"luin=o0497332654;%20lskey=00030000d63ffaf7eba88c86106eac5f2910d45515222334b91c75a66b449c990c2be43cd202ba39b35bef60;%20uin=o0497332654;%20skey=MH3wukytS4;%20sigA2=7AB4D8DEDF73E313801FD348FD77EC3B05C06DBC4D9DA669B20CA04A8D6B80F300A69567FBD11A7B799E419BB796F22D47D3AE5FA95E708A0ABC66161061131B0B21A0031AA0807C;%20openid=oijc7uLZNVp85r-MpFBYxwxzdEkg;%20appid=wxe90c9765ad00e2cd;%20access_token=5_IWbsKfygpJ0lnbUnnFnfwmSQ3EqlyzseC9-qGEFY7Tkr0Ypk5vsnSwOaMC-IGxsBeY2K7knHrYstj_5dZpisJd5nihvLNQvCdsFhFwZQcT8;%20refresh_token=5_IWbsKfygpJ0lnbUnnFnfwgDT4pA9HEbY-wuqnqIHWf9AzdmIueZFuzHYfnZPuSNEc0ZjDuHXrtSrRBMMD-7kgj06iF0NdAOi-KRj6-mrmlA;%20unionid=onCs1uNNpjMXeYIHAhacGypamEBk;%20logintype=0"
;
String
cookie
=
"luin=o0497332654;%20lskey=00030000d63ffaf7eba88c86106eac5f2910d45515222334b91c75a66b449c990c2be43cd202ba39b35bef60;%20uin=o0497332654;%20skey=MH3wukytS4;%20sigA2=7AB4D8DEDF73E313801FD348FD77EC3B05C06DBC4D9DA669B20CA04A8D6B80F300A69567FBD11A7B799E419BB796F22D47D3AE5FA95E708A0ABC66161061131B0B21A0031AA0807C;%20openid=oijc7uLZNVp85r-MpFBYxwxzdEkg;%20appid=wxe90c9765ad00e2cd;%20access_token=5_IWbsKfygpJ0lnbUnnFnfwmSQ3EqlyzseC9-qGEFY7Tkr0Ypk5vsnSwOaMC-IGxsBeY2K7knHrYstj_5dZpisJd5nihvLNQvCdsFhFwZQcT8;%20refresh_token=5_IWbsKfygpJ0lnbUnnFnfwgDT4pA9HEbY-wuqnqIHWf9AzdmIueZFuzHYfnZPuSNEc0ZjDuHXrtSrRBMMD-7kgj06iF0NdAOi-KRj6-mrmlA;%20unionid=onCs1uNNpjMXeYIHAhacGypamEBk;%20logintype=0"
;
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getQQkbUserHeaderMap
(
cookie
);
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getQQkbUserHeaderMap
(
cookie
);
Map
<
String
,
Object
>
paramMap
=
HeadGet
.
getQQkbUserParamMap
(
name
);
Map
<
String
,
Object
>
paramMap
=
HeadGet
.
getQQkbUserParamMap
(
name
);
String
result
=
HttpClient
.
executeHttpRequestPost
(
url
,
null
,
headerMap
,
paramMap
);
String
result
=
HttpClient
.
executeHttpRequestPost
(
url
,
ProxyHolder
.
NAT_HEAVY_PROXY
,
headerMap
,
paramMap
);
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
JSONObject
json1
=
json
.
getJSONObject
(
"new_list"
);
JSONObject
json1
=
json
.
getJSONObject
(
"new_list"
);
JSONObject
json2
=
json1
.
getJSONArray
(
"data"
).
getJSONObject
(
0
);
JSONObject
json2
=
json1
.
getJSONArray
(
"data"
).
getJSONObject
(
0
);
...
...
src/main/java/com/zhiwei/parse/QQNews.java
View file @
9234d24c
...
@@ -24,7 +24,7 @@ public class QQNews {
...
@@ -24,7 +24,7 @@ public class QQNews {
private
static
final
Logger
logger
=
LoggerFactory
.
getLogger
(
QQNews
.
class
);
private
static
final
Logger
logger
=
LoggerFactory
.
getLogger
(
QQNews
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
/**
/**
* .
* .
...
...
src/main/java/com/zhiwei/parse/QicheHome.java
View file @
9234d24c
...
@@ -17,7 +17,7 @@ import com.zhiwei.tools.tools.ZhiWeiTools;
...
@@ -17,7 +17,7 @@ import com.zhiwei.tools.tools.ZhiWeiTools;
public
class
QicheHome
{
public
class
QicheHome
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
QicheHome
.
class
);
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
QicheHome
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
private
static
QicheHomeKwyWordAnalysis
qicheHomeKwyWordAnalysis
=
new
QicheHomeKwyWordAnalysis
();
private
static
QicheHomeKwyWordAnalysis
qicheHomeKwyWordAnalysis
=
new
QicheHomeKwyWordAnalysis
();
...
...
src/main/java/com/zhiwei/parse/SinaKeji.java
View file @
9234d24c
...
@@ -24,7 +24,7 @@ public class SinaKeji {
...
@@ -24,7 +24,7 @@ public class SinaKeji {
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
SinaKeji
.
class
);
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
SinaKeji
.
class
);
private
static
SinaKejiCommentAnalysis
sinaKejiCommentAnalysis
=
new
SinaKejiCommentAnalysis
();
private
static
SinaKejiCommentAnalysis
sinaKejiCommentAnalysis
=
new
SinaKejiCommentAnalysis
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
/**
/**
* https://tech.sina.com.cn/it/2018-07-17/doc-ihfkffam0632649.shtml
* https://tech.sina.com.cn/it/2018-07-17/doc-ihfkffam0632649.shtml
...
...
src/main/java/com/zhiwei/parse/SinaTousu.java
View file @
9234d24c
...
@@ -21,7 +21,7 @@ public class SinaTousu {
...
@@ -21,7 +21,7 @@ public class SinaTousu {
private
static
final
Logger
logger
=
LoggerFactory
.
getLogger
(
SinaTousu
.
class
);
private
static
final
Logger
logger
=
LoggerFactory
.
getLogger
(
SinaTousu
.
class
);
private
static
SinaTousuAnalysis
sinaTousuAnalysis
=
new
SinaTousuAnalysis
();
private
static
SinaTousuAnalysis
sinaTousuAnalysis
=
new
SinaTousuAnalysis
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
public
static
List
<
Map
<
String
,
Object
>>
getSinaTousuData
(
String
word
,
ProxyHolder
proxy
,
String
time
)
{
public
static
List
<
Map
<
String
,
Object
>>
getSinaTousuData
(
String
word
,
ProxyHolder
proxy
,
String
time
)
{
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<>();
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<>();
...
...
src/main/java/com/zhiwei/parse/Souhu.java
View file @
9234d24c
...
@@ -2,14 +2,11 @@ package com.zhiwei.parse;
...
@@ -2,14 +2,11 @@ package com.zhiwei.parse;
import
static
java
.
util
.
Objects
.
nonNull
;
import
static
java
.
util
.
Objects
.
nonNull
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.slf4j.Logger
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
org.slf4j.LoggerFactory
;
...
@@ -73,36 +70,28 @@ public class Souhu {
...
@@ -73,36 +70,28 @@ public class Souhu {
* @param isCulling 是否采集精选
* @param isCulling 是否采集精选
* @return
* @return
*/
*/
public
static
List
<
Map
<
String
,
Object
>>
getSouHuAccountData
(
String
xpt
,
String
startTime
,
boolean
isCulling
,
Proxy
proxy
)
{
public
static
List
<
Map
<
String
,
Object
>>
getSouHuAccountData
(
String
id
,
String
name
,
String
startTime
,
boolean
isCulling
,
ProxyHolder
proxy
)
{
int
i
=
1
;
int
i
=
1
;
String
name
=
getName
(
xpt
,
proxy
);
ZhiWeiTools
.
sleep
(
200
);
ZhiWeiTools
.
sleep
(
2000
);
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getSouhuAccountHeaderMap
(
null
);
boolean
f
=
true
;
boolean
f
=
true
;
int
j
=
0
;
int
j
=
0
;
while
(
f
)
{
while
(
f
)
{
try
{
try
{
String
url
=
"http://mp.sohu.com/apiV2/profile/newsListAjax?xpt="
+
xpt
+
"&pageNumber="
+
i
+
"&pageSize=10"
;
String
url
=
"http://v2.sohu.com/author-page-api/author-articles/pc/"
+
id
+
"?pNo="
+
i
;
String
result
=
null
;
if
(
isCulling
)
{
if
(
isCulling
)
{
url
=
url
+
"&categoryId=-1"
;
url
=
url
+
"&columnId=-1"
;
}
try
{
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
proxy
,
headerMap
);
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
}
}
result
=
result
.
replaceAll
(
"\\\\"
,
""
);
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
proxy
,
null
);
result
=
result
.
substring
(
1
,
result
.
length
()-
1
);
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
JSONArray
jsonArray
=
json
.
getJSON
Array
(
"data
"
);
JSONArray
jsonArray
=
json
.
getJSON
Object
(
"data"
).
getJSONArray
(
"pcArticleVOS
"
);
List
<
Map
<
String
,
Object
>>
dataList1
=
souhuAccountAnalysis
.
analysisData
(
jsonArray
,
name
);
List
<
Map
<
String
,
Object
>>
dataList1
=
souhuAccountAnalysis
.
analysisData
(
jsonArray
,
name
);
if
(
jsonArray
.
size
()
<
1
)
{
if
(
jsonArray
.
isEmpty
()
)
{
break
;
break
;
}
}
if
(
startTime
==
null
)
{
if
(
startTime
==
null
)
{
j
=
0
;
dataList
.
addAll
(
dataList1
);
dataList
.
addAll
(
dataList1
);
}
}
//判断时间
//判断时间
...
@@ -113,40 +102,26 @@ public class Souhu {
...
@@ -113,40 +102,26 @@ public class Souhu {
f
=
false
;
f
=
false
;
break
;
break
;
}
}
j
=
0
;
dataList
.
add
(
map
);
dataList
.
add
(
map
);
}
}
}
}
logger
.
info
(
"=============获取到的数据数目{}"
,
dataList
.
size
());
logger
.
info
(
"=============获取到的数据数目{}"
,
dataList
.
size
());
i
++;
i
++;
ZhiWeiTools
.
sleep
(
300
0
);
ZhiWeiTools
.
sleep
(
300
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
ZhiWeiTools
.
sleep
(
300
0
);
ZhiWeiTools
.
sleep
(
300
);
logger
.
error
(
"出错了
"
,
e
.
getMessage
()
);
logger
.
error
(
"出错了
{}"
,
e
);
j
++;
j
++;
if
(
j
>
5
)
{
if
(
j
>
5
)
{
f
=
false
;
f
=
false
;
}
}
continue
;
}
}
}
}
return
dataList
;
return
dataList
;
}
}
private
static
String
getName
(
String
xpt
,
Proxy
proxy
)
{
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getSouhuAccountHeaderMap
(
null
);
try
{
String
result
=
HttpClient
.
executeHttpRequestGet
(
"http://mp.sohu.com/profile?xpt="
+
xpt
,
proxy
,
headerMap
);
Document
doc
=
Jsoup
.
parse
(
result
);
String
name
=
doc
.
select
(
"p#ff"
).
text
();
System
.
out
.
println
(
name
);
return
name
;
}
catch
(
Exception
e
)
{
return
null
;
}
}
/**
/**
*
*
* @Description 传入搜狐文章链接和cookie 可获取此文章所有评论
* @Description 传入搜狐文章链接和cookie 可获取此文章所有评论
...
@@ -161,7 +136,7 @@ public class Souhu {
...
@@ -161,7 +136,7 @@ public class Souhu {
try
{
try
{
while
(
true
)
{
while
(
true
)
{
String
newurl
=
souhuCommentAnalysis
.
getSouhuURL
(
url
,
proxy
)
+
"&page_no="
+
j
;
String
newurl
=
souhuCommentAnalysis
.
getSouhuURL
(
url
,
proxy
)
+
"&page_no="
+
j
;
String
result
=
HttpClient
.
executeHttpRequestGet
(
newurl
,
Proxy
Factory
.
getNatProxy
()
,
headerMap
);
String
result
=
HttpClient
.
executeHttpRequestGet
(
newurl
,
Proxy
Holder
.
NAT_HEAVY_PROXY
,
headerMap
);
System
.
out
.
println
(
newurl
);
System
.
out
.
println
(
newurl
);
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
JSONArray
jsonArry
=
json
.
getJSONObject
(
"jsonObject"
).
getJSONArray
(
"comments"
);
JSONArray
jsonArry
=
json
.
getJSONObject
(
"jsonObject"
).
getJSONArray
(
"comments"
);
...
...
src/main/java/com/zhiwei/parse/TXNews.java
View file @
9234d24c
...
@@ -19,6 +19,7 @@ import com.zhiwei.crawler.utils.RequestUtils;
...
@@ -19,6 +19,7 @@ import com.zhiwei.crawler.utils.RequestUtils;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.parse.analysis.TXNewsByWordAnalysis
;
import
com.zhiwei.parse.analysis.TXNewsByWordAnalysis
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Response
;
import
okhttp3.Response
;
...
@@ -28,7 +29,7 @@ public class TXNews {
...
@@ -28,7 +29,7 @@ public class TXNews {
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
TXNews
.
class
);
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
TXNews
.
class
);
private
static
TXNewsByWordAnalysis
txNewsByWordAnalysis
=
new
TXNewsByWordAnalysis
();
private
static
TXNewsByWordAnalysis
txNewsByWordAnalysis
=
new
TXNewsByWordAnalysis
();
public
static
boolean
txNewshasMoreData
=
true
;
public
static
boolean
txNewshasMoreData
=
true
;
p
ublic
static
HttpBoot
httpBoot
=
new
HttpBoot
();
p
rivate
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
public
static
List
<
Map
<
String
,
Object
>>
getData
(
String
word
,
String
devid
,
Proxy
proxy
)
{
public
static
List
<
Map
<
String
,
Object
>>
getData
(
String
word
,
String
devid
,
Proxy
proxy
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
...
@@ -120,5 +121,47 @@ public class TXNews {
...
@@ -120,5 +121,47 @@ public class TXNews {
return
-
1
;
return
-
1
;
}
}
public
static
List
<
Map
<
String
,
Object
>>
getTxNewsHistory
(
String
mid
,
String
endTime
,
ProxyHolder
proxy
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
int
page
=
0
;
int
errorNum
=
0
;
while
(
true
)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
"https://pacaio.match.qq.com/om/mediaArticles?mid="
+
mid
+
"&num=30&page="
+
page
),
proxy
)){
String
result
=
response
.
body
().
string
();
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
JSONArray
jsonArray
=
json
.
getJSONArray
(
"data"
);
for
(
int
i
=
0
,
j
=
jsonArray
.
size
();
i
<
j
;
i
++)
{
JSONObject
data
=
jsonArray
.
getJSONObject
(
i
);
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
String
time
=
TimeParse
.
dateFormartString
(
new
Date
(
data
.
getLong
(
"timestamp"
)*
1000L
),
"yyyy-MM-dd HH:mm:ss"
);
if
(
endTime
!=
null
&&
endTime
.
length
()
>
1
)
{
System
.
out
.
println
(
time
);
if
(
time
.
compareTo
(
endTime
)
<=
0
)
{
logger
.
info
(
"超时时间采集范围 跳出采集"
);
return
dataList
;
}
}
map
.
put
(
"title"
,
data
.
getString
(
"title"
));
map
.
put
(
"content"
,
data
.
getString
(
"abstract"
));
map
.
put
(
"time"
,
time
);
map
.
put
(
"source"
,
data
.
getString
(
"source"
));
map
.
put
(
"url"
,
data
.
getString
(
"vurl"
));
dataList
.
add
(
map
);
}
logger
.
info
(
"mid = {} , cralwer count = {}"
,
mid
,
dataList
.
size
()
);
page
++;
if
(
jsonArray
.
size
()
<
10
)
{
break
;
}
}
catch
(
Exception
e
)
{
logger
.
info
(
"采集数据出错 {}"
,
e
);
errorNum
++;
if
(
errorNum
>
3
)
{
break
;
}
}
}
return
dataList
;
}
}
}
src/main/java/com/zhiwei/parse/TechTx.java
View file @
9234d24c
...
@@ -21,7 +21,7 @@ public class TechTx {
...
@@ -21,7 +21,7 @@ public class TechTx {
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
TechTx
.
class
);
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
TechTx
.
class
);
private
static
TechTxCommentAnalysis
techTxCommentAnalysis
=
new
TechTxCommentAnalysis
();
private
static
TechTxCommentAnalysis
techTxCommentAnalysis
=
new
TechTxCommentAnalysis
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
@SuppressWarnings
(
"unchecked"
)
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
Map
<
String
,
Object
>>
getTechTxComment
(
String
url
,
ProxyHolder
proxy
)
{
public
static
List
<
Map
<
String
,
Object
>>
getTechTxComment
(
String
url
,
ProxyHolder
proxy
)
{
...
...
src/main/java/com/zhiwei/parse/Wangyi.java
View file @
9234d24c
...
@@ -2,6 +2,7 @@ package com.zhiwei.parse;
...
@@ -2,6 +2,7 @@ package com.zhiwei.parse;
import
java.net.Proxy
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
...
@@ -9,18 +10,24 @@ import org.jsoup.Jsoup;
...
@@ -9,18 +10,24 @@ import org.jsoup.Jsoup;
import
org.slf4j.Logger
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.parse.analysis.WangyiCommentAnalysis
;
import
com.zhiwei.parse.analysis.WangyiCommentAnalysis
;
import
com.zhiwei.parse.analysis.WangyiHistoryAnalysis
;
import
com.zhiwei.parse.analysis.WangyiHistoryAnalysis
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Response
;
public
class
Wangyi
{
public
class
Wangyi
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
Wangyi
.
class
);
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
Wangyi
.
class
);
private
static
WangyiCommentAnalysis
wangyiCommentAnalysis
=
new
WangyiCommentAnalysis
();
private
static
WangyiCommentAnalysis
wangyiCommentAnalysis
=
new
WangyiCommentAnalysis
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
private
static
WangyiHistoryAnalysis
wangyiHistoryAnalysis
=
new
WangyiHistoryAnalysis
();
private
static
WangyiHistoryAnalysis
wangyiHistoryAnalysis
=
new
WangyiHistoryAnalysis
();
/**
/**
...
@@ -74,24 +81,31 @@ public class Wangyi {
...
@@ -74,24 +81,31 @@ public class Wangyi {
}
}
}
}
/**
*
* @Description 网易网页版数据
* @param url
* @param proxy
* @param endTime
* @return
*/
public
static
List
<
Map
<
String
,
Object
>>
getHistoryData
(
String
url
,
Proxy
proxy
,
String
endTime
)
{
public
static
List
<
Map
<
String
,
Object
>>
getHistoryData
(
String
url
,
Proxy
proxy
,
String
endTime
)
{
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getWangyiHistoryHeaderMap
(
null
);
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getWangyiHistoryHeaderMap
(
null
);
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<
Map
<
String
,
Object
>
>();
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<>();
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
proxy
,
headerMap
);
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
proxy
,
headerMap
);
String
wemediaid
=
result
.
split
(
"data-wemediaid=\""
)[
1
].
split
(
"\""
)[
0
];
String
wemediaid
=
result
.
split
(
"data-wemediaid=\""
)[
1
].
split
(
"\""
)[
0
];
String
source
=
Jsoup
.
parse
(
result
).
select
(
"body > div.colum_wrap.fl > div > div.colum_des > div.normal > div.colum_info > h4"
).
text
();
String
source
=
Jsoup
.
parse
(
result
).
select
(
"body > div.colum_wrap.fl > div > div.colum_des > div.normal > div.colum_info > h4"
).
text
();
boolean
f
=
true
;
boolean
f
=
true
;
url
=
"http://dy.163.com/v2/article/list.do?wemediaId="
+
wemediaid
+
"&size=
2
0&pageNo="
;
url
=
"http://dy.163.com/v2/article/list.do?wemediaId="
+
wemediaid
+
"&size=
1
0&pageNo="
;
int
i
=
1
;
int
i
=
1
;
ZhiWeiTools
.
sleep
(
1000
);
ZhiWeiTools
.
sleep
(
1000
);
int
j
=
0
;
int
j
=
0
;
while
(
f
)
{
while
(
f
)
{
try
{
try
{
result
=
""
;
result
=
HttpClient
.
executeHttpRequestGet
(
url
+
i
,
proxy
,
headerMap
);
result
=
HttpClient
.
executeHttpRequestGet
(
url
+
i
,
proxy
,
headerMap
);
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
List
<
Map
<
String
,
Object
>>
dataList
=
wangyiHistoryAnalysis
.
getData
(
result
,
proxy
,
endTime
,
source
);
List
<
Map
<
String
,
Object
>>
dataList
=
wangyiHistoryAnalysis
.
getData
(
result
,
proxy
,
endTime
,
source
);
if
(
dataList
==
null
||
dataList
.
size
()
<
1
)
{
if
(
dataList
==
null
||
dataList
.
isEmpty
()
)
{
break
;
break
;
}
}
bodyList
.
addAll
(
dataList
);
bodyList
.
addAll
(
dataList
);
...
@@ -109,10 +123,58 @@ public class Wangyi {
...
@@ -109,10 +123,58 @@ public class Wangyi {
if
(
j
>
5
)
{
if
(
j
>
5
)
{
f
=
false
;
f
=
false
;
}
}
continue
;
}
}
}
}
return
bodyList
;
return
bodyList
;
}
}
public
static
List
<
Map
<
String
,
Object
>>
getWangyiClientHistory
(
String
id
,
ProxyHolder
proxy
,
String
endTime
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
int
page
=
0
;
int
errorNum
=
0
;
while
(
true
)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
"https://c.m.163.com/nc/subscribe/list/"
+
id
+
"/all/"
+
page
+
"-20.html"
),
proxy
)){
String
result
=
response
.
body
().
string
();
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
JSONArray
jsonArray
=
json
.
getJSONArray
(
"tab_list"
);
for
(
int
i
=
0
,
j
=
jsonArray
.
size
();
i
<
j
;
i
++)
{
JSONObject
data
=
jsonArray
.
getJSONObject
(
i
);
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
String
time
=
data
.
getString
(
"ptime"
);
if
(
endTime
!=
null
&&
endTime
.
length
()
>
1
)
{
System
.
out
.
println
(
time
);
if
(
time
.
compareTo
(
endTime
)
<=
0
)
{
logger
.
info
(
"超时时间采集范围 跳出采集"
);
return
dataList
;
}
}
map
.
put
(
"title"
,
data
.
getString
(
"title"
));
map
.
put
(
"content"
,
data
.
getString
(
"aheadBody"
));
map
.
put
(
"time"
,
time
);
map
.
put
(
"source"
,
data
.
getString
(
"source"
));
if
(
"video"
.
equals
(
data
.
getString
(
"skipType"
)))
{
map
.
put
(
"url"
,
"https://c.m.163.com/news/v/"
+
data
.
getString
(
"skipID"
)
+
".html"
);
}
else
{
map
.
put
(
"url"
,
"https://c.m.163.com/news/a/"
+
data
.
getString
(
"postid"
)
+
".html"
);
}
// System.out.println(map.toString());
dataList
.
add
(
map
);
}
logger
.
info
(
"id = {} , cralwer count = {}"
,
id
,
dataList
.
size
()
);
page
+=
20
;
if
(
jsonArray
.
size
()
<
10
)
{
break
;
}
}
catch
(
Exception
e
)
{
logger
.
info
(
"采集数据出错 {}"
,
e
);
errorNum
++;
if
(
errorNum
>
3
)
{
break
;
}
}
}
return
dataList
;
}
}
}
src/main/java/com/zhiwei/parse/Xueqiu.java
View file @
9234d24c
...
@@ -26,12 +26,12 @@ import okhttp3.Response;
...
@@ -26,12 +26,12 @@ import okhttp3.Response;
public
class
Xueqiu
{
public
class
Xueqiu
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
Xueqiu
.
class
);
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
Xueqiu
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
private
static
XueqiuKeyWordAnalysis
xueqiuKeyWordAnalysis
=
new
XueqiuKeyWordAnalysis
();
private
static
XueqiuKeyWordAnalysis
xueqiuKeyWordAnalysis
=
new
XueqiuKeyWordAnalysis
();
/**
/**
*
*
* @Description 关键词采集
历史
文章
* @Description 关键词采集文章
* @param word
* @param word
* @param endTime
* @param endTime
* @param proxy
* @param proxy
...
@@ -53,13 +53,16 @@ public class Xueqiu {
...
@@ -53,13 +53,16 @@ public class Xueqiu {
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headers
);
String
result
=
httpBoot
.
syncCall
(
request
,
proxy
).
body
().
string
();
String
result
=
httpBoot
.
syncCall
(
request
,
proxy
).
body
().
string
();
List
<
Map
<
String
,
Object
>>
list
=
xueqiuKeyWordAnalysis
.
getData
(
result
,
endTime
);
List
<
Map
<
String
,
Object
>>
list
=
xueqiuKeyWordAnalysis
.
getData
(
result
,
endTime
);
ZhiWeiTools
.
sleep
(
3000
);
if
(
list
.
isEmpty
())
{
if
(
list
.
size
()
<
1
)
{
i
++;
i
++;
}
else
{
}
else
{
int
count
=
JSONObject
.
parseObject
(
result
).
getIntValue
(
"maxPage"
);
bodyList
.
addAll
(
list
);
bodyList
.
addAll
(
list
);
logger
.
info
(
"采集到第{} 页 , 一共采集到 {} 数据"
,
page
,
bodyList
.
size
());
logger
.
info
(
"采集到第{} 页 , 一共采集到 {} 数据"
,
page
,
bodyList
.
size
());
page
++;
page
++;
if
(
count
<
page
)
{
break
;
}
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
e
.
printStackTrace
();
...
@@ -98,16 +101,17 @@ public class Xueqiu {
...
@@ -98,16 +101,17 @@ public class Xueqiu {
/**
/**
*
*
* @Description
(TODO这里用一句话描述这个方法的作用)
* @Description
雪球历史文章采集
* @return
* @return
*/
*/
public
List
<
Map
<
String
,
Object
>>
getXueqiuAccountData
(
String
userId
,
String
cookie
,
Proxy
proxy
)
{
public
static
List
<
Map
<
String
,
Object
>>
getXueqiuAccountData
(
String
userId
,
String
cookie
,
Proxy
proxy
)
{
Map
<
String
,
Object
>
headers
=
new
HashMap
<>();
Map
<
String
,
Object
>
headers
=
new
HashMap
<>();
headers
.
put
(
"cookie"
,
cookie
);
headers
.
put
(
"cookie"
,
cookie
);
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<>();
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<>();
int
page
=
1
;
int
errorCount
=
1
;
while
(
true
)
{
while
(
true
)
{
int
page
=
1
;
String
url
=
"https://xueqiu.com/v4/statuses/user_timeline.json?page="
+
page
+
"&user_id="
+
userId
+
"&type=0"
;
String
url
=
"https://xueqiu.com/v4/statuses/user_timeline.json?page="
+
page
+
"&user_id=6687544095&type=0"
;
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headers
),
proxy
)){
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headers
),
proxy
)){
String
result
=
response
.
body
().
string
();
String
result
=
response
.
body
().
string
();
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
...
@@ -121,26 +125,30 @@ public class Xueqiu {
...
@@ -121,26 +125,30 @@ public class Xueqiu {
Date
date
=
TimeParse
.
stringFormartDate
(
timeBefore
);
Date
date
=
TimeParse
.
stringFormartDate
(
timeBefore
);
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
map
.
put
(
"
nam
e"
,
ob
.
getJSONObject
(
"user"
).
getString
(
"screen_name"
));
//statuses user screen_name
map
.
put
(
"
sourc
e"
,
ob
.
getJSONObject
(
"user"
).
getString
(
"screen_name"
));
//statuses user screen_name
map
.
put
(
"time"
,
date
);
//statuses timeBefore
map
.
put
(
"time"
,
date
);
//statuses timeBefore
map
.
put
(
"source"
,
ob
.
getString
(
"source"
));
//statuses source
map
.
put
(
"content"
,
ob
.
getString
(
"description"
).
replaceAll
(
"<.*?>"
,
""
));
//statuses description
map
.
put
(
"content"
,
ob
.
getString
(
"description"
).
replaceAll
(
"<.*?>"
,
""
));
//statuses description
map
.
put
(
"title"
,
ob
.
getString
(
"rawTitle"
));
map
.
put
(
"repostCount"
,
ob
.
getString
(
"retweet_count"
));
//statuses retweet_count
map
.
put
(
"repostCount"
,
ob
.
getString
(
"retweet_count"
));
//statuses retweet_count
map
.
put
(
"commentCount"
,
ob
.
getString
(
"reply_count"
));
//statuses reply_count
map
.
put
(
"commentCount"
,
ob
.
getString
(
"reply_count"
));
//statuses reply_count
map
.
put
(
"likeCount"
,
ob
.
getString
(
"like_count"
));
//statuses like_count
map
.
put
(
"likeCount"
,
ob
.
getString
(
"like_count"
));
//statuses like_count
map
.
put
(
"url"
,
"https://xueqiu.coms"
+
ob
.
getString
(
"target"
));
map
.
put
(
"url"
,
"https://xueqiu.com"
+
ob
.
getString
(
"target"
));
bodyList
.
add
(
map
);
bodyList
.
add
(
map
);
}
}
int
maxPage
=
json
.
getInteger
(
"maxPage"
);
int
maxPage
=
json
.
getInteger
(
"maxPage"
);
page
++;
page
++;
logger
.
info
(
"userId = {} , crawler count = {} ,page = {} , maxPage = {}"
,
userId
,
bodyList
.
size
(),
page
,
maxPage
);
if
(
page
>
maxPage
)
{
if
(
page
>
maxPage
)
{
break
;
break
;
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
"采集解析出错 {}"
,
e
);
logger
.
error
(
"采集解析出错 {}"
,
e
);
break
;
errorCount
++;
if
(
errorCount
>
3
)
{
break
;
}
}
}
ZhiWeiTools
.
sleep
(
2000
);
}
}
return
bodyList
;
return
bodyList
;
}
}
...
...
src/main/java/com/zhiwei/parse/Yangshi.java
0 → 100644
View file @
9234d24c
package
com
.
zhiwei
.
parse
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.HashMap
;
import
java.util.LinkedHashMap
;
import
java.util.List
;
import
java.util.Map
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
/**
*
* @ClassName Yangshi
* @Description 央视网 采集
* @author byte-zbs
* @Date 2019年7月4日 下午6:08:12
* @version 1.0.0
*/
public
class
Yangshi
{
private
static
final
Logger
logger
=
LoggerFactory
.
getLogger
(
Yangshi
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
public
static
List
<
Map
<
String
,
Object
>>
getData
()
{
return
Collections
.
emptyList
();
}
private
static
List
<
Map
<
String
,
Object
>>
analysisData
(
String
result
)
{
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<>();
JSONArray
jsonArray
=
JSONObject
.
parseObject
(
result
).
getJSONArray
(
"list"
);
try
{
for
(
int
i
=
0
;
i
<
jsonArray
.
size
();
i
++)
{
JSONObject
ob
=
jsonArray
.
getJSONObject
(
i
);
String
allTitle
=
ob
.
getString
(
"all_title"
);
//视频标题
String
urllink
=
ob
.
getString
(
"urllink"
);
//链接
String
channel
=
ob
.
getString
(
"channel"
);
//频道来源
String
uploadtime
=
ob
.
getString
(
"uploadtime"
);
//时间
String
durations
=
ob
.
getString
(
"durations"
);
//时长
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
map
.
put
(
"视频标题"
,
allTitle
);
map
.
put
(
"链接"
,
urllink
);
map
.
put
(
"频道来源"
,
channel
);
map
.
put
(
"时间"
,
uploadtime
);
map
.
put
(
"时长"
,
durations
+
" s"
);
System
.
out
.
println
(
map
.
toString
());
bodyList
.
add
(
map
);
}
}
catch
(
Exception
e
)
{
// TODO: handle exception
e
.
printStackTrace
();
}
return
bodyList
;
}
}
src/main/java/com/zhiwei/parse/Yiche.java
View file @
9234d24c
...
@@ -23,7 +23,7 @@ import okhttp3.Response;
...
@@ -23,7 +23,7 @@ import okhttp3.Response;
public
class
Yiche
{
public
class
Yiche
{
private
static
final
Logger
logger
=
LoggerFactory
.
getLogger
(
Yiche
.
class
);
private
static
final
Logger
logger
=
LoggerFactory
.
getLogger
(
Yiche
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
/**
/**
*
*
...
...
src/main/java/com/zhiwei/parse/Yidianzixun.java
View file @
9234d24c
...
@@ -33,7 +33,7 @@ public class Yidianzixun {
...
@@ -33,7 +33,7 @@ public class Yidianzixun {
private
static
YidianzixunCommentAnalysis
yidianzixunCommentAnalysis
=
new
YidianzixunCommentAnalysis
();
private
static
YidianzixunCommentAnalysis
yidianzixunCommentAnalysis
=
new
YidianzixunCommentAnalysis
();
private
static
YidianzixunByWordAnalysis
yidianzixunByWordAnalysis
=
new
YidianzixunByWordAnalysis
();
private
static
YidianzixunByWordAnalysis
yidianzixunByWordAnalysis
=
new
YidianzixunByWordAnalysis
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
/**
/**
*
*
...
@@ -42,19 +42,19 @@ public class Yidianzixun {
...
@@ -42,19 +42,19 @@ public class Yidianzixun {
* @param startTime
* @param startTime
* @return
* @return
*/
*/
public
static
List
<
Map
<
String
,
Object
>>
getYidianzixunAccountData
(
String
channelid
,
String
startTime
,
Proxy
proxy
,
String
cookie
)
{
public
static
List
<
Map
<
String
,
Object
>>
getYidianzixunAccountData
(
String
channelid
,
String
startTime
,
Proxy
Holder
proxy
,
String
cookie
)
{
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getYidianzixunAccountHeaderMap
(
cookie
,
"http://www.yidianzixun.com/channel/"
+
channelid
);
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getYidianzixunAccountHeaderMap
(
cookie
,
"http://www.yidianzixun.com/channel/"
+
channelid
);
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
int
j
=
0
;
int
j
=
0
;
boolean
f
=
true
;
boolean
f
=
true
;
try
{
try
{
while
(
f
)
{
while
(
f
)
{
String
url
=
"http://www.yidianzixun.com/"
+
getSpt
(
channelid
,
j
,
j
+
10
);
String
url
=
"http://www.yidianzixun.com"
+
getSpt
(
channelid
,
j
,
j
+
10
);
System
.
out
.
println
(
url
);
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
proxy
,
headerMap
);
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
proxy
,
headerMap
);
System
.
out
.
println
(
result
);
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
JSONArray
jsonArry
=
json
.
getJSONArray
(
"result"
);
JSONArray
jsonArry
=
json
.
getJSONArray
(
"result"
);
if
(
jsonArry
.
size
()
==
0
)
{
if
(
jsonArry
.
isEmpty
()
)
{
break
;
break
;
}
}
for
(
int
i
=
0
;
i
<
jsonArry
.
size
();
i
++)
{
for
(
int
i
=
0
;
i
<
jsonArry
.
size
();
i
++)
{
...
@@ -70,13 +70,12 @@ public class Yidianzixun {
...
@@ -70,13 +70,12 @@ public class Yidianzixun {
dataList
.
add
(
map
);
dataList
.
add
(
map
);
}
}
}
}
System
.
out
.
println
(
"================================"
+
dataList
.
size
());
logger
.
info
(
"channelid = {} , crawler size = {}"
,
channelid
,
dataList
.
size
());
ZhiWeiTools
.
sleep
(
30
00
);
ZhiWeiTools
.
sleep
(
1
00
);
j
=
dataList
.
size
();
j
=
dataList
.
size
();
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
"数据获取出错"
,
e
.
getMessage
());
logger
.
error
(
"数据获取出错 {}"
,
e
);
e
.
printStackTrace
();
}
}
return
dataList
;
return
dataList
;
}
}
...
...
src/main/java/com/zhiwei/parse/Youku.java
View file @
9234d24c
...
@@ -24,7 +24,7 @@ import okhttp3.Response;
...
@@ -24,7 +24,7 @@ import okhttp3.Response;
public
class
Youku
{
public
class
Youku
{
private
static
final
Logger
logger
=
LoggerFactory
.
getLogger
(
Youku
.
class
);
private
static
final
Logger
logger
=
LoggerFactory
.
getLogger
(
Youku
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
(
false
,
2
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
(
);
public
static
List
<
Map
<
String
,
Object
>>
getDataList
(
String
word
)
{
public
static
List
<
Map
<
String
,
Object
>>
getDataList
(
String
word
)
{
String
aaid
=
"9cae49f0e031664b00d8f9c108e586ab"
;
String
aaid
=
"9cae49f0e031664b00d8f9c108e586ab"
;
...
@@ -33,7 +33,7 @@ public class Youku {
...
@@ -33,7 +33,7 @@ public class Youku {
String
url
=
"https://so.youku.com/search_video/q_"
+
URLCodeUtil
.
getURLEncode
(
word
,
"UTF-8"
)+
"?spm=a2h0k.11417342.filter.dnew&orderfield=createtime&aaid="
+
aaid
+
"&pg="
+
i
;
String
url
=
"https://so.youku.com/search_video/q_"
+
URLCodeUtil
.
getURLEncode
(
word
,
"UTF-8"
)+
"?spm=a2h0k.11417342.filter.dnew&orderfield=createtime&aaid="
+
aaid
+
"&pg="
+
i
;
System
.
out
.
println
(
url
);
System
.
out
.
println
(
url
);
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
Proxy
Factory
.
getNatProxy
()
)){
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
Proxy
Holder
.
NAT_HEAVY_PROXY
)){
String
result
=
response
.
body
().
string
();
String
result
=
response
.
body
().
string
();
String
jsondata
=
result
.
split
(
"bigview.view\\("
)[
1
].
split
(
"\\)\\</script\\>"
)[
0
];
String
jsondata
=
result
.
split
(
"bigview.view\\("
)[
1
].
split
(
"\\)\\</script\\>"
)[
0
];
JSONObject
json
=
JSONObject
.
parseObject
(
jsondata
);
JSONObject
json
=
JSONObject
.
parseObject
(
jsondata
);
...
@@ -45,7 +45,7 @@ public class Youku {
...
@@ -45,7 +45,7 @@ public class Youku {
String
title
=
element
.
select
(
"div.mod-main > div.mod-header > h2 > a"
).
text
();
String
title
=
element
.
select
(
"div.mod-main > div.mod-header > h2 > a"
).
text
();
String
surl
=
element
.
select
(
"div.mod-main > div.mod-header > h2 > a"
).
attr
(
"href"
);
String
surl
=
element
.
select
(
"div.mod-main > div.mod-header > h2 > a"
).
attr
(
"href"
);
String
time
=
element
.
select
(
"div.mod-main > div.mod-info > p"
).
text
();
String
time
=
element
.
select
(
"div.mod-main > div.mod-info > p"
).
text
();
if
(
time
.
contains
(
"上传时间:"
))
{
if
(
time
.
contains
(
"上传时间:"
)
&&
surl
.
contains
(
"v.youku.com"
)
)
{
map
.
put
(
"title"
,
title
);
map
.
put
(
"title"
,
title
);
map
.
put
(
"url"
,
"https:"
+
surl
);
map
.
put
(
"url"
,
"https:"
+
surl
);
map
.
put
(
"time"
,
time
.
replaceAll
(
"上传时间:"
,
""
).
split
(
" "
)[
0
]);
map
.
put
(
"time"
,
time
.
replaceAll
(
"上传时间:"
,
""
).
split
(
" "
)[
0
]);
...
...
src/main/java/com/zhiwei/parse/analysis/BaijiaAccountAnalysis.java
View file @
9234d24c
...
@@ -24,7 +24,7 @@ import okhttp3.Response;
...
@@ -24,7 +24,7 @@ import okhttp3.Response;
public
class
BaijiaAccountAnalysis
{
public
class
BaijiaAccountAnalysis
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
BaijiaAccountAnalysis
.
class
);
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
BaijiaAccountAnalysis
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
public
Map
<
String
,
Object
>
getBaijiaAccount2Data
(
JSONObject
data
)
{
public
Map
<
String
,
Object
>
getBaijiaAccount2Data
(
JSONObject
data
)
{
Map
<
String
,
Object
>
map
=
new
HashMap
<
String
,
Object
>();
Map
<
String
,
Object
>
map
=
new
HashMap
<
String
,
Object
>();
...
@@ -57,12 +57,13 @@ public class BaijiaAccountAnalysis {
...
@@ -57,12 +57,13 @@ public class BaijiaAccountAnalysis {
boolean
more
=
false
;
boolean
more
=
false
;
try
{
try
{
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
JSONArray
jsonArry
=
json
.
getJSONObject
(
"data"
).
getJSONArray
(
"
items
"
);
JSONArray
jsonArry
=
json
.
getJSONObject
(
"data"
).
getJSONArray
(
"
list
"
);
if
(
json
.
getJSONObject
(
"data"
)
!=
null
&&
json
.
getJSONObject
(
"data"
).
getBoolean
(
"has_more"
)
!=
null
)
{
if
(
json
.
getJSONObject
(
"data"
)
.
getBoolean
(
"has_more"
)
!=
null
&&
if
(
json
.
getJSONObject
(
"data"
).
getBoolean
(
"has_more"
)
)
{
json
.
getJSONObject
(
"data"
).
getBoolean
(
"has_more"
)
)
{
more
=
true
;
more
=
true
;
}
rmap
.
put
(
"ctime"
,
json
.
getJSONObject
(
"data"
).
getString
(
"ctime"
));
}
}
// String name = json.getJSONObject("data").getJSONObject("author").getString("display_name");
for
(
int
i
=
0
;
i
<
jsonArry
.
size
();
i
++)
{
for
(
int
i
=
0
;
i
<
jsonArry
.
size
();
i
++)
{
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
JSONObject
data
=
jsonArry
.
getJSONObject
(
i
);
JSONObject
data
=
jsonArry
.
getJSONObject
(
i
);
...
@@ -77,10 +78,7 @@ public class BaijiaAccountAnalysis {
...
@@ -77,10 +78,7 @@ public class BaijiaAccountAnalysis {
}
}
}
}
map
.
put
(
"title"
,
data
.
getString
(
"title"
));
map
.
put
(
"title"
,
data
.
getString
(
"title"
));
String
url
=
data
.
getString
(
"url"
);
String
url
=
"http://baijiahao.baidu.com/s?id="
+
id
;
if
(
url
==
null
)
{
url
=
"https://baijia.baidu.com/s?old_id="
+
id
;
}
map
.
put
(
"content"
,
ZhiWeiTools
.
delHTMLTag
(
getContent3
(
data
)));
map
.
put
(
"content"
,
ZhiWeiTools
.
delHTMLTag
(
getContent3
(
data
)));
map
.
put
(
"read_amount"
,
data
.
getString
(
"read_amount"
)==
null
?
0
:
data
.
getString
(
"read_amount"
));
map
.
put
(
"read_amount"
,
data
.
getString
(
"read_amount"
)==
null
?
0
:
data
.
getString
(
"read_amount"
));
map
.
put
(
"app_id"
,
data
.
getString
(
"app_id"
));
map
.
put
(
"app_id"
,
data
.
getString
(
"app_id"
));
...
...
src/main/java/com/zhiwei/parse/analysis/BilibilikeyWordAnalysis.java
View file @
9234d24c
...
@@ -5,6 +5,7 @@ import java.util.Collections;
...
@@ -5,6 +5,7 @@ import java.util.Collections;
import
java.util.HashMap
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
import
java.util.Objects
;
import
org.jsoup.Jsoup
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Document
;
...
@@ -13,7 +14,7 @@ import org.jsoup.select.Elements;
...
@@ -13,7 +14,7 @@ import org.jsoup.select.Elements;
public
class
BilibilikeyWordAnalysis
{
public
class
BilibilikeyWordAnalysis
{
public
static
Map
<
String
,
Object
>
getData
(
String
result
,
String
word
)
{
public
static
Map
<
String
,
Object
>
getData
(
String
result
,
String
word
,
String
endTime
)
{
try
{
try
{
Document
doc
=
Jsoup
.
parse
(
result
);
Document
doc
=
Jsoup
.
parse
(
result
);
boolean
more
=
false
;
boolean
more
=
false
;
...
@@ -28,10 +29,9 @@ public class BilibilikeyWordAnalysis {
...
@@ -28,10 +29,9 @@ public class BilibilikeyWordAnalysis {
String
source
=
null
;
String
source
=
null
;
String
submitcount
=
null
;
String
submitcount
=
null
;
Elements
elements
=
doc
.
select
(
"ul.video-contain.clearfix"
).
select
(
"li"
);
Elements
elements
=
doc
.
select
(
"ul.video-contain.clearfix"
).
select
(
"li"
);
System
.
out
.
println
(
elements
.
size
()
+
" --- "
+
more
);
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
for
(
Element
element
:
elements
)
{
for
(
Element
element
:
elements
)
{
Map
<
String
,
Object
>
map
=
new
HashMap
<
String
,
Object
>();
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
title
=
element
.
select
(
"a"
).
attr
(
"title"
);
title
=
element
.
select
(
"a"
).
attr
(
"title"
);
url
=
element
.
select
(
"a"
).
attr
(
"href"
);
url
=
element
.
select
(
"a"
).
attr
(
"href"
);
playcount
=
element
.
select
(
"div.tags"
).
select
(
"span.watch-num"
).
text
();
playcount
=
element
.
select
(
"div.tags"
).
select
(
"span.watch-num"
).
text
();
...
@@ -45,6 +45,9 @@ public class BilibilikeyWordAnalysis {
...
@@ -45,6 +45,9 @@ public class BilibilikeyWordAnalysis {
map
.
put
(
"source"
,
source
);
map
.
put
(
"source"
,
source
);
map
.
put
(
"submitcount"
,
submitcount
);
map
.
put
(
"submitcount"
,
submitcount
);
map
.
put
(
"word"
,
word
);
map
.
put
(
"word"
,
word
);
if
(
Objects
.
nonNull
(
endTime
)
&&
endTime
.
compareTo
(
time
)
>
-
1
)
{
more
=
false
;
}
dataList
.
add
(
map
);
dataList
.
add
(
map
);
}
}
Map
<
String
,
Object
>
rmap
=
new
HashMap
<>();
Map
<
String
,
Object
>
rmap
=
new
HashMap
<>();
...
...
src/main/java/com/zhiwei/parse/analysis/DayuAccountAnalysis.java
View file @
9234d24c
...
@@ -49,7 +49,7 @@ public class DayuAccountAnalysis {
...
@@ -49,7 +49,7 @@ public class DayuAccountAnalysis {
* @return
* @return
*/
*/
private
Map
<
String
,
Object
>
getOneData
(
JSONObject
data
,
String
name
,
String
startTime
)
{
private
Map
<
String
,
Object
>
getOneData
(
JSONObject
data
,
String
name
,
String
startTime
)
{
Map
<
String
,
Object
>
map
=
new
HashMap
<
String
,
Object
>();
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
try
{
try
{
String
time
=
data
.
getString
(
"published_at"
).
replace
(
"T"
,
" "
).
split
(
"\\."
)[
0
];
String
time
=
data
.
getString
(
"published_at"
).
replace
(
"T"
,
" "
).
split
(
"\\."
)[
0
];
if
(
startTime
!=
null
&&
startTime
.
length
()
>
1
)
{
if
(
startTime
!=
null
&&
startTime
.
length
()
>
1
)
{
...
...
src/main/java/com/zhiwei/parse/analysis/DayuByWordAnalysis.java
View file @
9234d24c
...
@@ -24,7 +24,7 @@ import okhttp3.Response;
...
@@ -24,7 +24,7 @@ import okhttp3.Response;
public
class
DayuByWordAnalysis
{
public
class
DayuByWordAnalysis
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
DayuByWordAnalysis
.
class
);
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
DayuByWordAnalysis
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
public
List
<
Map
<
String
,
Object
>>
getDayuByWordData
(
String
result
,
Proxy
proxy
)
{
public
List
<
Map
<
String
,
Object
>>
getDayuByWordData
(
String
result
,
Proxy
proxy
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
...
...
src/main/java/com/zhiwei/parse/analysis/FenghuangAccountAnalysis.java
View file @
9234d24c
package
com
.
zhiwei
.
parse
.
analysis
;
package
com
.
zhiwei
.
parse
.
analysis
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.List
;
...
@@ -12,16 +11,15 @@ import org.slf4j.LoggerFactory;
...
@@ -12,16 +11,15 @@ import org.slf4j.LoggerFactory;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Response
;
import
okhttp3.Response
;
public
class
FenghuangAccountAnalysis
{
public
class
FenghuangAccountAnalysis
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
FenghuangAccountAnalysis
.
class
);
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
FenghuangAccountAnalysis
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
/**
/**
*
*
...
@@ -29,70 +27,61 @@ public class FenghuangAccountAnalysis {
...
@@ -29,70 +27,61 @@ public class FenghuangAccountAnalysis {
* @param result
* @param result
* @return
* @return
*/
*/
public
List
<
Map
<
String
,
Object
>>
getArticleData
(
String
url
,
String
startTime
,
Proxy
proxy
)
{
public
List
<
Map
<
String
,
Object
>>
getArticleData
(
String
url
,
String
startTime
,
ProxyHolder
proxy
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
try
{
try
{
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getFenghuangAccountHeaderMap
(
null
);
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getFenghuangAccountHeaderMap
(
null
);
JSONArray
jsonArry
=
null
;
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
)){
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
)){
String
result
=
response
.
body
().
string
();
String
result
=
response
.
body
().
string
();
System
.
out
.
println
(
result
);
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
JSONObject
json
=
JSONObject
.
parseObject
(
result
.
replace
(
"getFengAuthorListData("
,
""
).
replace
(
"]})"
,
"]}"
));
jsonArry
=
json
.
getJSONObject
(
"data"
).
getJSONObject
(
"feeds"
).
getJSONArray
(
"list"
);
JSONArray
jsonArry
=
json
.
getJSONArray
(
"data"
);
if
(
jsonArry
==
null
||
jsonArry
.
size
()
<
1
)
{
for
(
int
j
=
0
;
j
<
jsonArry
.
size
();
j
++)
{
try
{
JSONObject
data
=
jsonArry
.
getJSONObject
(
j
);
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
map
.
put
(
"title"
,
data
.
getString
(
"title"
));
String
time
=
data
.
getString
(
"newsTime"
);
map
.
put
(
"time"
,
data
.
getString
(
"newsTime"
));
map
.
put
(
"url"
,
"https:"
+
data
.
getString
(
"url"
));
map
.
put
(
"id"
,
data
.
getString
(
"commentUrl"
));
if
(
time
.
compareTo
(
startTime
)
>=
0
)
{
dataList
.
add
(
map
);
}
}
catch
(
Exception
e
)
{
logger
.
error
(
" exception {}"
,
e
);
}
}
break
;
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
continue
;
continue
;
}
}
}
catch
(
Exception
e
)
{
continue
;
}
}
}
if
(
jsonArry
==
null
||
jsonArry
.
size
()
<
1
)
{
return
dataList
;
return
dataList
;
}
for
(
int
i
=
0
;
i
<
jsonArry
.
size
();
i
++)
{
try
{
JSONObject
data
=
jsonArry
.
getJSONObject
(
i
);
String
articleurl
=
data
.
getString
(
"id"
);
String
articleResult
=
HttpClient
.
executeHttpRequestGet
(
articleurl
,
proxy
,
headerMap
);
Map
<
String
,
Object
>
dataMap
=
getArticle
(
articleResult
);
ZhiWeiTools
.
sleep
(
1000
);
if
(
dataMap
!=
null
)
{
String
time
=
(
String
)
dataMap
.
get
(
"time"
);
if
(
time
.
compareTo
(
startTime
)
>=
0
)
{
dataList
.
add
(
dataMap
);
continue
;
}
}
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
continue
;
}
}
return
dataList
;
}
catch
(
Exception
e1
)
{
}
catch
(
Exception
e1
)
{
e1
.
printStackTrace
();
return
dataList
;
return
dataList
;
}
}
}
}
private
static
Map
<
String
,
Object
>
getArticle
(
String
articleResult
)
{
//
private static Map<String,Object> getArticle(String articleResult) {
JSONObject
json
=
JSONObject
.
parseObject
(
articleResult
).
getJSONObject
(
"body"
);
// try {
Map
<
String
,
Object
>
map
=
new
HashMap
<
String
,
Object
>();
// Map<String,Object> map = new HashMap<
>();
try
{
// JSONObject json = JSONObject.parseObject(articleResult).getJSONObject("body");
map
.
put
(
"title"
,
json
.
getString
(
"title"
));
//
map.put("title", json.getString("title"));
String
time
=
json
.
getString
(
"cTime"
).
replaceAll
(
"/"
,
"-"
);
//
String time = json.getString("cTime").replaceAll("/", "-");
map
.
put
(
"time"
,
time
);
//
map.put("time", time);
map
.
put
(
"text"
,
json
.
getString
(
"text"
).
replaceAll
(
"<.*?>"
,
""
));
//
map.put("text", json.getString("text").replaceAll("<.*?>", ""));
map
.
put
(
"source"
,
json
.
getString
(
"source"
));
//
map.put("source", json.getString("source"));
map
.
put
(
"url"
,
json
.
getString
(
"shareurl
"
));
// map.put("url", "https://share.iclient.ifeng.com/news/shareNews?aid=sub_" + json.getString("aid
"));
map
.
put
(
"id"
,
json
.
getString
(
"aid"
));
//
map.put("id", json.getString("aid"));
}
catch
(
Exception
e
)
{
// return map;
logger
.
error
(
"解析具体文章的时候出错 {}"
,
e
);
// } catch (Exception e) {
return
null
;
// logger.error("解析具体文章的时候出错 {}",e)
;
}
// return null;
return
map
;
// }
}
//
}
...
...
src/main/java/com/zhiwei/parse/analysis/FenghuangCommentAnalysis.java
View file @
9234d24c
...
@@ -23,7 +23,7 @@ import okhttp3.Response;
...
@@ -23,7 +23,7 @@ import okhttp3.Response;
public
class
FenghuangCommentAnalysis
{
public
class
FenghuangCommentAnalysis
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
FenghuangCommentAnalysis
.
class
);
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
FenghuangCommentAnalysis
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
public
Map
<
String
,
Object
>
getFenghuangCommentCount
(
String
url
,
ProxyHolder
proxy
)
{
public
Map
<
String
,
Object
>
getFenghuangCommentCount
(
String
url
,
ProxyHolder
proxy
)
{
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
...
...
src/main/java/com/zhiwei/parse/analysis/SouhuAccountAnalysis.java
View file @
9234d24c
package
com
.
zhiwei
.
parse
.
analysis
;
package
com
.
zhiwei
.
parse
.
analysis
;
import
java.net.URLDecoder
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.Date
;
import
java.util.HashMap
;
import
java.util.HashMap
;
...
@@ -26,7 +25,7 @@ public class SouhuAccountAnalysis {
...
@@ -26,7 +25,7 @@ public class SouhuAccountAnalysis {
* @return
* @return
*/
*/
public
List
<
Map
<
String
,
Object
>>
analysisData
(
JSONArray
jsonArray
,
String
name
)
{
public
List
<
Map
<
String
,
Object
>>
analysisData
(
JSONArray
jsonArray
,
String
name
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>
>();
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
for
(
int
i
=
0
;
i
<
jsonArray
.
size
();
i
++)
{
for
(
int
i
=
0
;
i
<
jsonArray
.
size
();
i
++)
{
JSONObject
data
=
jsonArray
.
getJSONObject
(
i
);
JSONObject
data
=
jsonArray
.
getJSONObject
(
i
);
Map
<
String
,
Object
>
map
=
parseHtmlByAccount
(
data
,
name
);
Map
<
String
,
Object
>
map
=
parseHtmlByAccount
(
data
,
name
);
...
@@ -46,19 +45,15 @@ public class SouhuAccountAnalysis {
...
@@ -46,19 +45,15 @@ public class SouhuAccountAnalysis {
* @return
* @return
*/
*/
private
static
Map
<
String
,
Object
>
parseHtmlByAccount
(
JSONObject
data
,
String
name
)
{
private
static
Map
<
String
,
Object
>
parseHtmlByAccount
(
JSONObject
data
,
String
name
)
{
Map
<
String
,
Object
>
map
=
new
HashMap
<
String
,
Object
>();
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
try
{
try
{
String
title
=
data
.
getString
(
"title"
);
map
.
put
(
"title"
,
data
.
getString
(
"title"
));
map
.
put
(
"title"
,
URLDecoder
.
decode
(
title
,
"UTF-8"
));
map
.
put
(
"source"
,
name
);
map
.
put
(
"source"
,
name
);
String
content
=
data
.
getString
(
"brief"
);
map
.
put
(
"content"
,
data
.
getString
(
"brief"
));
map
.
put
(
"content"
,
URLDecoder
.
decode
(
content
,
"UTF-8"
));
map
.
put
(
"newsPv"
,
data
.
getString
(
"newsPv"
));
map
.
put
(
"newsPv"
,
data
.
getString
(
"newsPv"
));
map
.
put
(
"url"
,
data
.
getString
(
"url"
));
map
.
put
(
"url"
,
data
.
getString
(
"link"
));
long
timelong
=
Long
.
valueOf
(
data
.
getString
(
"postTime"
));
map
.
put
(
"time"
,
new
Date
(
data
.
getLong
(
"publicTime"
)));
map
.
put
(
"time"
,
new
Date
(
timelong
));
JSONArray
jsonArry
=
data
.
getJSONArray
(
"tagDetails"
);
map
.
put
(
"comment"
,
data
.
getString
(
"commentsCnt"
));
JSONArray
jsonArry
=
data
.
getJSONArray
(
"tags"
);
String
tags
=
""
;
String
tags
=
""
;
for
(
int
i
=
0
;
i
<
jsonArry
.
size
();
i
++)
{
for
(
int
i
=
0
;
i
<
jsonArry
.
size
();
i
++)
{
JSONObject
ob
=
jsonArry
.
getJSONObject
(
i
);
JSONObject
ob
=
jsonArry
.
getJSONObject
(
i
);
...
@@ -68,10 +63,9 @@ public class SouhuAccountAnalysis {
...
@@ -68,10 +63,9 @@ public class SouhuAccountAnalysis {
tags
=
tags
.
substring
(
0
,
tags
.
length
()-
1
);
tags
=
tags
.
substring
(
0
,
tags
.
length
()-
1
);
}
}
map
.
put
(
"tags"
,
tags
);
map
.
put
(
"tags"
,
tags
);
map
.
put
(
"newsid"
,
data
.
getString
(
"
news
id"
));
map
.
put
(
"newsid"
,
data
.
getString
(
"id"
));
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
"搜狐历史文章解析出错了"
,
e
.
getMessage
());
logger
.
error
(
"搜狐历史文章解析出错了 {}"
,
e
.
getMessage
());
System
.
out
.
println
(
data
.
toString
());
return
null
;
return
null
;
}
}
...
...
src/main/java/com/zhiwei/parse/analysis/SouhuCommentAnalysis.java
View file @
9234d24c
...
@@ -20,7 +20,7 @@ import okhttp3.Response;
...
@@ -20,7 +20,7 @@ import okhttp3.Response;
public
class
SouhuCommentAnalysis
{
public
class
SouhuCommentAnalysis
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
SouhuCommentAnalysis
.
class
);
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
SouhuCommentAnalysis
.
class
);
private
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
/**
/**
*
*
...
...
src/main/java/com/zhiwei/parse/shipin/QQTV.java
View file @
9234d24c
package
com
.
zhiwei
.
parse
.
shipin
;
package
com
.
zhiwei
.
parse
.
shipin
;
import
java.net.Proxy
;
import
java.net.URLEncoder
;
import
java.net.URLEncoder
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.Collections
;
...
@@ -18,7 +17,6 @@ import org.slf4j.LoggerFactory;
...
@@ -18,7 +17,6 @@ import org.slf4j.LoggerFactory;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
...
@@ -36,7 +34,7 @@ import okhttp3.Response;
...
@@ -36,7 +34,7 @@ import okhttp3.Response;
public
class
QQTV
{
public
class
QQTV
{
private
static
final
Logger
logger
=
LoggerFactory
.
getLogger
(
QQTV
.
class
);
private
static
final
Logger
logger
=
LoggerFactory
.
getLogger
(
QQTV
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
public
static
List
<
Map
<
String
,
Object
>>
getData
(
String
word
,
String
time
,
ProxyHolder
proxy
)
{
public
static
List
<
Map
<
String
,
Object
>>
getData
(
String
word
,
String
time
,
ProxyHolder
proxy
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
...
@@ -52,8 +50,8 @@ public class QQTV {
...
@@ -52,8 +50,8 @@ public class QQTV {
logger
.
info
(
" 关键词 {} 量 {} 页 数 {} 此页量 {} "
,
word
,
dataList
.
size
(),
page
,
elements
.
size
());
logger
.
info
(
" 关键词 {} 量 {} 页 数 {} 此页量 {} "
,
word
,
dataList
.
size
(),
page
,
elements
.
size
());
for
(
Element
element
:
elements
)
{
for
(
Element
element
:
elements
)
{
String
nurl
=
element
.
select
(
"h2.result_title"
).
select
(
"a"
).
attr
(
"href"
);
String
nurl
=
element
.
select
(
"h2.result_title"
).
select
(
"a"
).
attr
(
"href"
);
Map
<
String
,
Object
>
map
=
getUrlData
(
nurl
,
Proxy
Factory
.
getNatProxy
()
);
Map
<
String
,
Object
>
map
=
getUrlData
(
nurl
,
Proxy
Holder
.
NAT_HEAVY_PROXY
);
if
(
Objects
.
nonNull
(
map
)
&&
time
.
compareTo
(
String
.
valueOf
(
map
.
get
(
"time"
)))
<
1
)
{
if
(
Objects
.
nonNull
(
map
)
&&
!
map
.
isEmpty
()
&&
time
.
compareTo
(
String
.
valueOf
(
map
.
get
(
"time"
)))
<
1
)
{
map
.
put
(
"word"
,
word
);
map
.
put
(
"word"
,
word
);
dataList
.
add
(
map
);
dataList
.
add
(
map
);
}
}
...
@@ -61,6 +59,9 @@ public class QQTV {
...
@@ -61,6 +59,9 @@ public class QQTV {
}
}
page
++;
page
++;
if
(
count
!=
dataList
.
size
())
{
if
(
count
!=
dataList
.
size
())
{
if
(
page
>
20
)
{
break
;
}
continue
;
continue
;
}
}
...
@@ -76,24 +77,26 @@ public class QQTV {
...
@@ -76,24 +77,26 @@ public class QQTV {
return
dataList
;
return
dataList
;
}
}
private
static
Map
<
String
,
Object
>
getUrlData
(
String
url
,
Proxy
proxy
)
{
private
static
Map
<
String
,
Object
>
getUrlData
(
String
url
,
ProxyHolder
proxy
)
{
for
(
int
i
=
1
;
i
<
3
;
i
++)
{
if
(!
url
.
contains
(
"v.qq.com"
))
{
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
proxy
)){
return
null
;
String
result
=
response
.
body
().
string
();
}
String
source
=
result
.
split
(
"\\<span class=\"user_name\"\\>"
)[
1
].
split
(
"\\</span\\>"
)[
0
];
System
.
out
.
println
(
url
);
result
=
result
.
split
(
"var VIDEO_INFO ="
)[
1
].
split
(
"\\</script\\>"
)[
0
];
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
proxy
)){
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
String
result
=
response
.
body
().
string
();
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
String
source
=
result
.
split
(
"\\<span class=\"user_name\"\\>"
)[
1
].
split
(
"\\</span\\>"
)[
0
];
result
=
result
.
split
(
"var VIDEO_INFO ="
)[
1
].
split
(
"\\</script\\>"
)[
0
];
map
.
put
(
"playCount"
,
json
.
getInteger
(
"view_all_count"
));
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
map
.
put
(
"title"
,
json
.
getString
(
"title"
));
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
map
.
put
(
"time"
,
json
.
getString
(
"video_checkup_time"
));
map
.
put
(
"source"
,
source
);
map
.
put
(
"playCount"
,
json
.
getInteger
(
"view_all_count"
));
map
.
put
(
"url"
,
url
);
map
.
put
(
"title"
,
json
.
getString
(
"title"
));
return
map
;
map
.
put
(
"time"
,
json
.
getString
(
"video_checkup_time"
));
}
catch
(
Exception
e
)
{
map
.
put
(
"source"
,
source
);
e
.
printStackTrace
();
map
.
put
(
"url"
,
url
);
}
return
map
;
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
}
}
return
Collections
.
emptyMap
();
return
Collections
.
emptyMap
();
}
}
...
...
src/main/java/com/zhiwei/parse/shipin/SohuTV.java
View file @
9234d24c
...
@@ -24,7 +24,7 @@ import okhttp3.Response;
...
@@ -24,7 +24,7 @@ import okhttp3.Response;
public
class
SohuTV
{
public
class
SohuTV
{
private
static
final
Logger
logger
=
LoggerFactory
.
getLogger
(
SohuTV
.
class
);
private
static
final
Logger
logger
=
LoggerFactory
.
getLogger
(
SohuTV
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
public
static
List
<
Map
<
String
,
Object
>>
sohuTVData
(
String
word
,
String
cookie
,
Proxy
proxy
)
{
public
static
List
<
Map
<
String
,
Object
>>
sohuTVData
(
String
word
,
String
cookie
,
Proxy
proxy
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
...
...
src/test/java/com/zhiwei/Comment/AikaComment.java
View file @
9234d24c
//package com.zhiwei.Comment;
//package com.zhiwei.Comment;
//
//
//import org.
testng.annotations
.Test;
//import org.
junit
.Test;
//
//
//import com.zhiwei.parse.Aika;
//import com.zhiwei.parse.Aika;
//import com.zhiwei.tools.timeparse.TimeExtraction;
//import com.zhiwei.tools.timeparse.TimeParse;
//
//
//public class AikaComment {
//public class AikaComment {
// @Test
// @Test
// public void f() {
// public void f() {
// String url = "http://
newcar.xcar.com.cn/201809/news_2021765
_1.html";
// String url = "http://
info.xcar.com.cn/201906/news_2039730
_1.html";
//
//
// Aika.getAikaComment(url, null);
// Aika.getAikaComment(url, null);
//
//
...
...
src/test/java/com/zhiwei/Comment/AiqiyiHotCountTest.java
View file @
9234d24c
package
com
.
zhiwei
.
Comment
;
//
package com.zhiwei.Comment;
//
import
java.util.List
;
//
import java.util.List;
import
java.util.Map
;
//
import java.util.Map;
//
import
org.testng.annotations
.Test
;
//import org.junit
.Test;
//
import
com.zhiwei.common.config.GroupType
;
//
import com.zhiwei.common.config.GroupType;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
//
import com.zhiwei.crawler.proxy.ProxyFactory;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
//
import com.zhiwei.crawler.proxy.ProxyHolder;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import
com.zhiwei.parse.Aiqiyi
;
//
import com.zhiwei.parse.Aiqiyi;
//
public
class
AiqiyiHotCountTest
{
//
public class AiqiyiHotCountTest {
@Test
//
@Test
public
void
f
()
{
//
public void f() {
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
String
path
=
"C:\\Users\\byte-zbs\\Documents\\WXWork\\1688854025129101\\Cache\\File\\2019-03\\爱奇艺.xlsx"
;
//
String path = "C:\\Users\\byte-zbs\\Documents\\WXWork\\1688854025129101\\Cache\\File\\2019-03\\爱奇艺.xlsx";
Map
<
String
,
Object
>
map
=
poi
.
importExcel
(
path
,
0
);
//
Map<String,Object> map = poi.importExcel(path, 0);
List
<
Map
<
String
,
Object
>>
dataList
=
(
List
<
Map
<
String
,
Object
>>)
map
.
get
(
"body"
);
//
List<Map<String,Object>> dataList = (List<Map<String, Object>>) map.get("body");
ProxyFactory
.
init
(
"zookeeper://192.168.0.36:2181"
,
"local"
,
GroupType
.
PROVIDER
);
//
ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
List
<
String
>
headList
=
(
List
<
String
>)
map
.
get
(
"head"
);
//
List<String> headList = (List<String>) map.get("head");
headList
.
add
(
"count"
);
//
headList.add("count");
dataList
.
forEach
(
m
->
{
//
dataList.forEach(m -> {
String
url
=
String
.
valueOf
(
m
.
get
(
"链接"
));
//
String url = String.valueOf(m.get("链接"));
//
int
i
=
Aiqiyi
.
aiqiyiHotCount
(
url
,
ProxyHolder
.
NAT_PROXY
);
//
int i = Aiqiyi.aiqiyiHotCount(url, ProxyHolder.NAT_PROXY);
System
.
out
.
println
(
url
+
" -- "
+
i
);
//
System.out.println(url + " -- " + i);
m
.
put
(
"count"
,
i
);
//
m.put("count", i);
});
//
});
poi
.
exportExcel
(
path
,
"data"
,
headList
,
dataList
);
//
poi.exportExcel(path, "data", headList, dataList);
}
//
}
}
//
}
src/test/java/com/zhiwei/Comment/MaimaiCommentCountTest.java
View file @
9234d24c
...
@@ -4,7 +4,7 @@
...
@@ -4,7 +4,7 @@
//import java.util.List;
//import java.util.List;
//import java.util.Map;
//import java.util.Map;
//
//
//import org.
testng.annotations
.Test;
//import org.
junit
.Test;
//
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.crawler.proxy.ProxyFactory;
...
@@ -18,27 +18,28 @@
...
@@ -18,27 +18,28 @@
// public void f() {
// public void f() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",
// GroupType.PROVIDER);
// GroupType.PROVIDER);
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
//
// Map<String, Object> map = poi
//// Map<String, Object> map = poi
// .importExcel("C:\\Users\\byte-zbs\\Desktop\\tx\\脉脉公司圈.xlsx", 0);
//// .importExcel("C:\\Users\\byte-zbs\\Desktop\\tx\\脉脉公司圈.xlsx", 0);
// List<Map<String, Object>> list = (List<Map<String, Object>>) map.get("body");
//// List<Map<String, Object>> list = (List<Map<String, Object>>) map.get("body");
// String cookie = "_buuid=ba30f54f-57ed-4dd4-af5f-31cb08d2eacf; sessionid=lejfy3gdu5tf9x9zowxfhtq5o73dubc5; guid=GxsfBBgZGwQYGx4EGBkeVgcYGxkdHhMeHhkcVhwZBB0ZHwVDWEtMS3kKEhMEEh0fGQQaBBsdBU9HRVhCaQoDRUFJT20KT0FDRgoGZmd+YmECChwZBB0ZHwVeQ2FIT31PRlpaawoDHhx9ZX0KERkEHAp+ZApZXUVOREN9AgoaBB8FS0ZGQ1BFZw==; seid=s1550814253444; token=\"rhItcea5qkO6WCSnVcczW/NRVLLCTsq3kQbpUCGAwQ0ceLunVJRjT5rgoFVYrIBA8CKuzcDfAvoCmBm7+jVysA==\"; uid=\"aa0Zx+VbwC41ceG8bxvIefAirs3A3wL6ApgZu/o1crA=\"; session=eyJ1IjoiMTczMzMzNTM2Iiwic2VjcmV0IjoiVjJuNHdCVDBncVNacTRxVllGM29jRUVwIiwic3RhdHVzIjp0cnVlLCJfZXhwaXJlIjoxNTUwOTAyMTY3MDQ5LCJfbWF4QWdlIjo4NjQwMDAwMH0=; session.sig=zbs4cHtzTcHWvjtkpjAZmoqLXsQ";
//// List<Map<String, Object>> bodyList = new ArrayList<Map<String, Object>>();
// List<Map<String, Object>> bodyList = new ArrayList<Map<String, Object>>();
//// List<String> headList = (List<String>) map.get("head");
// List<String> headList = (List<String>) map.get("head");
//// for (Map<String, Object> map1 : list) {
// for (Map<String, Object> map1 : list) {
//// String url = map1.get("地址") + "";
// String url = map1.get("地址") + "";
// String cookie = "_buuid=0668b664-13b3-4bd0-aa37-99d747432e85; guid=HBoEGxgEGBscBBsZGlYHGBsZHRsYExwZHFYcGQQdGR8FQ1hLTEt5ChITBBIdHxkEGgQbHQVPR0VYQmkKA0VBSU9tCk9BQ0YKBmZnfmJhAgocGQQdGR8FXkNhSE99T0ZaWmsKAx4cfWV9ChEZBBwKfmQKWV1FTkRDfQIKGgQfBUtGRkNQRWc=; token=\"ou+mv1hjxjm0uOOTss1vgck9+h6OCS/lYQUeFnJgSK70FHprmw6GmjBGwk2qPQH88CKuzcDfAvoCmBm7+jVysA==\"; uid=\"A8ELjewCDRgHnZ5bX0Vy0/Airs3A3wL6ApgZu/o1crA=\"; session=eyJ1IjoiMjI3NjU0NTI0Iiwic2VjcmV0IjoiV0wyZmEtZDZxbkx2TEkzZHF2dTN4UG5SIiwiX2V4cGlyZSI6MTU2MDU5Mzg4Mjc5NCwiX21heEFnZSI6ODY0MDAwMDB9; session.sig=ujhqvC3wPAn-WlCPXfB6C5ZJIgY";
// Map<String,Object> map3 = Maimai.getMaiaiCount(url,null, ProxyHolder.NAT_PROXY);
// String url = "https://maimai.cn/web/gossip_detail?src=app&webid=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJlZ2lkIjoiMjU5OTg4YmE4YzI3MTFlOTllMjEyNDZlOTZiNDgwODgiLCJ1IjoxNzY2NDQ3NjUsImlkIjoyMjIwMDM4NH0.gBdp3i99L0xUKLglaVIJf07OrrPk6yoG9HAo-3Yftqk";
// Map<String,Object> map3 = Maimai.getMaiaiCount(url,cookie, ProxyHolder.NAT_HEAVY_PROXY);
// System.out.println(map3.toString());
// System.out.println(map3.toString());
// System.out.println(url);
// System.out.println(url);
// map1.putAll(map3);
//
//
map1.putAll(map3);
// ZhiWeiTools.sleep(500);
//
//
ZhiWeiTools.sleep(500);
// System.out.println("--------------------------");
//
//
System.out.println("--------------------------");
// }
//
//
}
// headList.add("like");
//
//
headList.add("like");
// headList.add("spreads");
//
//
headList.add("spreads");
// headList.add("cmts");
//
//
headList.add("cmts");
// poi.exportExcel("C:\\Users\\byte-zbs\\Desktop\\tx\\脉脉公司圈.xlsx", "评论采集", headList,
//
//
poi.exportExcel("C:\\Users\\byte-zbs\\Desktop\\tx\\脉脉公司圈.xlsx", "评论采集", headList,
// list);
//
//
list);
// }
// }
//}
//}
src/test/java/com/zhiwei/Comment/YoukuHotCountTest.java
View file @
9234d24c
package
com
.
zhiwei
.
Comment
;
//package com.zhiwei.Comment;
//
import
java.util.List
;
//import java.util.List;
import
java.util.Map
;
//import java.util.Map;
//
import
org.testng.annotations.Test
;
//import org.junit.Test;
//
import
com.zhiwei.common.config.GroupType
;
//import com.zhiwei.common.config.GroupType;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
//import com.zhiwei.crawler.proxy.ProxyFactory;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
//import com.zhiwei.crawler.proxy.ProxyHolder;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import
com.zhiwei.parse.Aiqiyi
;
//import com.zhiwei.parse.Youku;
import
com.zhiwei.parse.Youku
;
//
//public class YoukuHotCountTest {
public
class
YoukuHotCountTest
{
// @Test
@Test
// public void f() {
public
void
f
()
{
//
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
// String path = "C:\\Users\\byte-zbs\\Documents\\WXWork\\1688854025129101\\Cache\\File\\2019-03\\视频奶粉.xlsx";
String
path
=
"C:\\Users\\byte-zbs\\Documents\\WXWork\\1688854025129101\\Cache\\File\\2019-03\\优酷.xlsx"
;
// Map<String,Object> map = poi.importExcel(path, 0);
Map
<
String
,
Object
>
map
=
poi
.
importExcel
(
path
,
0
);
// List<Map<String,Object>> dataList = (List<Map<String, Object>>) map.get("body");
List
<
Map
<
String
,
Object
>>
dataList
=
(
List
<
Map
<
String
,
Object
>>)
map
.
get
(
"body"
);
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
ProxyFactory
.
init
(
"zookeeper://192.168.0.36:2181"
,
"local"
,
GroupType
.
PROVIDER
);
// List<String> headList = (List<String>) map.get("head");
List
<
String
>
headList
=
(
List
<
String
>)
map
.
get
(
"head"
);
// headList.add("count");
headList
.
add
(
"count"
);
// dataList.forEach(m -> {
dataList
.
forEach
(
m
->
{
// String url = String.valueOf(m.get("url"));
String
url
=
String
.
valueOf
(
m
.
get
(
"链接"
));
//
// int i = Youku.getYoukuHotCount(url, ProxyHolder.NAT_PROXY);
int
i
=
Youku
.
getYoukuHotCount
(
url
,
ProxyHolder
.
NAT_PROXY
);
// System.out.println(url + " -- " + i);
System
.
out
.
println
(
url
+
" -- "
+
i
);
// m.put("count", i);
m
.
put
(
"count"
,
i
);
// });
});
// poi.exportExcel(path, "data", headList, dataList);
poi
.
exportExcel
(
path
,
"data"
,
headList
,
dataList
);
//
//
// }
}
//}
}
src/test/java/com/zhiwei/TestHttpBoot.java
View file @
9234d24c
//package com.zhiwei;
package
com
.
zhiwei
;
//
//import java.io.IOException;
import
com.zhiwei.crawler.core.HttpBoot
;
//import java.util.HashMap;
import
com.zhiwei.crawler.utils.RequestUtils
;
//import java.util.Map;
//
import
okhttp3.Response
;
//import java.util.HashMap;
//
public
class
TestHttpBoot
{
//import org.testng.annotations.Test;
//
public
static
void
main
(
String
[]
args
)
{
//import com.zhiwei.crawler.core.HttpBoot;
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
followSslRedirects
(
false
).
build
();
//import com.zhiwei.crawler.core.RequestUtils;
String
url
=
"http://v.youku.com/v_show/id_XMzg1ODAwOTcwOA==.html"
;
//
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
))){
//public class TestHttpBoot {
url
=
response
.
body
().
string
();
// @Test
System
.
out
.
println
(
url
);
// public void f() {
}
catch
(
Exception
e
)
{
// HttpBoot httpBoot = new HttpBoot();
e
.
printStackTrace
();
// String url = "https://www.toutiao.com/c/user/following/?user_id=1034006366&count=20&_signature=wp5wPBAVmXlosTC8Fobui8KecC";
}
// Map<String,Object> headers = new HashMap<>();
}
// headers.put("referer", "https://www.qctt.cn/news/349056");
// headers.put("cookie", "PHPSESSID=3rd6bvonb4g15t1fp777mjums0; Hm_lvt_70af9ea91e7adc8195f6d49511b9a2f1=1542253722; open_ad=1; Hm_lpvt_70af9ea91e7adc8195f6d49511b9a2f1=1542271394; vcode=sqmm; XSRF-TOKEN=eyJpdiI6IlFTNzkyYWNcLzB2SUwyN2dcL1hhUlpsZz09IiwidmFsdWUiOiJRSUpycjZJNGx3d1hUWkpOQUl1R2psSStuVU0yYW8xT1YxXC9QOFY1NjdyRXNrMWpFVE1kSm9IQ1o5Nm5keXlMTEFnZXdCOHVvWDg0U2picTE1cjZzMkE9PSIsIm1hYyI6IjZlYzk5NDI3ODEzMzA3ZTJjNDc3M2ZjMjBlNDJhZjc2YjU2ODFmYmY3YWRlMzdlMzM1NTBlNWMxNDk3MjFiZDEifQ%3D%3D; laravel_session=eyJpdiI6InJQMnByeFlIbXVhaUVVVVBLK1wvaXlRPT0iLCJ2YWx1ZSI6IlhUOUtIS2ZQZ0ZKNFh1RDVQYjBjSVZkVkpQZTdYRDNpa1wvV0o5QlJPbk8xZE0rQ3dZdnFMdjcya011ejVkdWEwUk1Qa29Zb2Y3OU0yUGkrWDF4Wk5adz09IiwibWFjIjoiZGJiYjlkNWZhNmJhMDFiMjkxYTAyMmUwZTEyMWVmZTQ0NmJiZDQ2ZGU3ZjNjNmUzNTIwZGI0NTc4NDJlZjNiMCJ9");
}
// headers.put("origin", "https://www.qctt.cn");
// Map<String,Object> params = new HashMap<>();
// params.put("id", "349056");
// params.put("page", "3");
// params.put("_token", "EJ58V0qilRw7P77czp0U6iO9QW2IOS1ZGiBk4wH1");
// try {
// String result = httpBoot.syncCall(RequestUtils.wrapGet(url)).body().string();
// System.out.println(result);
//
// } catch (IOException e) {
// // TODO Auto-generated catch block
// e.printStackTrace();
// }
//
//
// }
//}
src/test/java/com/zhiwei/crawler/DayuAccountExample.java
View file @
9234d24c
//package com.zhiwei.crawler;
package
com
.
zhiwei
.
crawler
;
//
//import java.util.ArrayList;
import
java.util.ArrayList
;
//import java.util.List;
import
java.util.List
;
//import java.util.Map;
import
java.util.Map
;
//
//import org.junit.Test;
import
org.junit.Test
;
//
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import
com.zhiwei.common.config.GroupType
;
//import com.zhiwei.parse.Dayu;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
//
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
//public class DayuAccountExample {
import
com.zhiwei.parse.Dayu
;
//
//
public
class
DayuAccountExample
{
// @Test
// public void dayuAccountTest() {
// //https://api.m.sm.cn/rest?method=Subscribe.list&format=html&from=wh10331&uc_biz_str=S:custom%7CC:search%7CN:true
@Test
//
public
void
dayuAccountTest
()
{
//
//https://api.m.sm.cn/rest?method=Subscribe.list&format=html&from=wh10331&uc_biz_str=S:custom%7CC:search%7CN:true
//// String mid = "d7300311c1504d24a229c3da345785c6";
ProxyFactory
.
init
(
"zookeeper://192.168.0.36:2181"
,
"local"
,
GroupType
.
PROVIDER
);
//// String name = "大鱼海棠雨";
// String startTime = "2017-01-01 00:00:00";
// String mid = "0a8b2360fd8b4ded971cd324a56d32f0";
// String path = "D:\\crawlerdata\\自媒体\\大鱼历史文章.xlsx";
// String name = "大鱼海棠雨";
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
String
startTime
=
"2017-01-01 00:00:00"
;
// Map<String,Object> map = poi.importExcel(path, 0);
String
path
=
"D:\\crawlerdata\\自媒体\\大鱼历史文章.xlsx"
;
// List<Map<String,Object>> lists = (List<Map<String, Object>>) map.get("body");
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
// List<String> headList = new ArrayList<String>();
Map
<
String
,
Object
>
map
=
poi
.
importExcel
(
path
,
0
);
// headList.add("title");
List
<
Map
<
String
,
Object
>>
lists
=
(
List
<
Map
<
String
,
Object
>>)
map
.
get
(
"body"
);
// headList.add("time");
List
<
String
>
headList
=
new
ArrayList
<
String
>();
// headList.add("content");
headList
.
add
(
"title"
);
// headList.add("source");
headList
.
add
(
"time"
);
// headList.add("url");
headList
.
add
(
"content"
);
//// headList.add("content_id");
headList
.
add
(
"source"
);
//// headList.add("origin_id");
headList
.
add
(
"url"
);
//// headList.add("xss_item_id");
// headList.add("content_id");
// for(Map<String,Object> data : lists) {
// headList.add("origin_id");
// String mid = data.get("mid")+"";
// headList.add("xss_item_id");
// String name = data.get("name")+"";
for
(
Map
<
String
,
Object
>
data
:
lists
)
{
// if(mid.length() < 1 && name.length() < 1) {
String
mid
=
data
.
get
(
"mid"
)+
""
;
// continue;
String
name
=
data
.
get
(
"name"
)+
""
;
// }
mid
=
"7b345070c4124574b9cbcab8c4a1aeb8"
;
// List<Map<String,Object>> dataList = Dayu.getDayuAccountData(mid,name,null,null);
name
=
"国魂"
;
// poi.exportExcel(path, name, headList, dataList);
if
(
mid
.
length
()
<
1
&&
name
.
length
()
<
1
)
{
// }
continue
;
//
}
//
List
<
Map
<
String
,
Object
>>
dataList
=
Dayu
.
getDayuAccountData
(
mid
,
name
,
null
,
null
);
// }
poi
.
exportExcel
(
path
,
name
,
headList
,
dataList
);
//
}
//
//}
}
}
src/test/java/com/zhiwei/crawler/DayuCommentCountExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
//
package com.zhiwei.crawler;
//
import
org.junit.Test
;
//
import org.junit.Test;
//
import
com.zhiwei.parse.Dayu
;
//
import com.zhiwei.parse.Dayu;
//
public
class
DayuCommentCountExample
{
//
public class DayuCommentCountExample {
//
@Test
//
@Test
public
void
dayuCommentCountTest
()
{
//
public void dayuCommentCountTest() {
String
articleId
=
"6987993456991247474"
;
//
String articleId = "6987993456991247474";
//
int
i
=
Dayu
.
getDayuCommentCount
(
articleId
,
null
);
//
int i = Dayu.getDayuCommentCount(articleId,null);
System
.
out
.
println
(
i
);
//
System.out.println(i);
}
//
}
//
//
//
}
//
}
src/test/java/com/zhiwei/crawler/DayuCommentExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
//
package com.zhiwei.crawler;
//
import
java.util.ArrayList
;
//
import java.util.ArrayList;
import
java.util.List
;
//
import java.util.List;
import
java.util.Map
;
//
import java.util.Map;
//
import
org.junit.Test
;
//
import org.junit.Test;
//
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import
com.zhiwei.parse.Dayu
;
//
import com.zhiwei.parse.Dayu;
//
public
class
DayuCommentExample
{
//
public class DayuCommentExample {
//
@Test
//
@Test
public
void
getDayuCommentTest
()
{
//
public void getDayuCommentTest() {
//若已获取历史文章 哪里有这个字段 其他文章的
//
//若已获取历史文章 哪里有这个字段 其他文章的
//http://m.uczzd.cn/iflow/api/v2/cmt/article/14180961224021425316/comments/byhot
//
//http://m.uczzd.cn/iflow/api/v2/cmt/article/14180961224021425316/comments/byhot
//14180961224021425316 这个为此参数
//
//14180961224021425316 这个为此参数
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
Map
<
String
,
Object
>
map
=
poi
.
importExcel
(
"D://crawlerdata//自媒体//UC评论采集-1.xlsx"
,
0
);
//
Map<String,Object> map = poi.importExcel("D://crawlerdata//自媒体//UC评论采集-1.xlsx", 0);
List
<
Map
<
String
,
Object
>>
list
=
(
List
<
Map
<
String
,
Object
>>)
map
.
get
(
"body"
);
//
List<Map<String,Object>> list = (List<Map<String,Object>>)map.get("body");
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
//
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
List
<
String
>
urlList
=
new
ArrayList
<
String
>();
//
List<String> urlList = new ArrayList<String>();
for
(
Map
<
String
,
Object
>
map1
:
list
)
{
//
for(Map<String,Object> map1 : list) {
String
url
=
""
;
//
String url = "";
try
{
//
try {
url
=
map1
.
get
(
"url"
)+
""
;
//
url = map1.get("url")+"";
String
articleId
=
""
;
//
String articleId = "";
url
=
"16848608935470442496"
;
//
url = "16848608935470442496";
if
(
url
.
contains
(
"aid"
))
{
//
if(url.contains("aid")) {
articleId
=
url
.
split
(
"aid="
)[
1
].
split
(
"&"
)[
0
];
//
articleId = url.split("aid=")[1].split("&")[0];
}
else
{
//
}else {
articleId
=
url
;
//
articleId = url;
}
//
}
List
<
Map
<
String
,
Object
>>
dataList
=
Dayu
.
getDayuCommentData
(
articleId
,
null
);
//
List<Map<String,Object>> dataList = Dayu.getDayuCommentData(articleId,null);
if
(
dataList
.
size
()
<=
0
)
{
//
if(dataList.size() <= 0) {
urlList
.
add
(
url
);
//
urlList.add(url);
}
//
}
if
(
dataList
!=
null
)
{
//
if(dataList != null) {
bodyList
.
addAll
(
dataList
);
//
bodyList.addAll(dataList);
}
//
}
}
catch
(
Exception
e
)
{
//
} catch (Exception e) {
System
.
out
.
println
(
url
);
//
System.out.println(url);
e
.
printStackTrace
();
//
e.printStackTrace();
continue
;
//
continue;
}
//
}
}
//
}
List
<
String
>
headList
=
new
ArrayList
<
String
>();
//
List<String> headList = new ArrayList<String>();
headList
.
add
(
"nickname"
);
//
headList.add("nickname");
headList
.
add
(
"content"
);
//
headList.add("content");
headList
.
add
(
"id"
);
//
headList.add("id");
headList
.
add
(
"url"
);
//
headList.add("url");
headList
.
add
(
"like"
);
//
headList.add("like");
headList
.
add
(
"time"
);
//
headList.add("time");
headList
.
add
(
"replay_count"
);
//
headList.add("replay_count");
for
(
String
s
:
urlList
)
{
//
for(String s : urlList) {
System
.
out
.
println
(
s
);
//
System.out.println(s);
}
//
}
poi
.
exportExcel
(
"D://crawlerdata/UC评论采集.xlsx"
,
"评论"
,
headList
,
bodyList
);
//
poi.exportExcel("D://crawlerdata/UC评论采集.xlsx", "评论", headList, bodyList);
//
}
//
}
//
//
}
//
}
src/test/java/com/zhiwei/crawler/FenghuangAccountExample.java
deleted
100644 → 0
View file @
cb5516a0
package
com
.
zhiwei
.
crawler
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Fenghuang
;
public
class
FenghuangAccountExample
{
@Test
public
void
fenghuangAccountTest
()
{
//所用时间长 1s1篇文章吧
//https://api.3g.ifeng.com/client_search_subscribe?k=号外财经
String
id
=
"6452"
;
String
[]
ids
=
id
.
split
(
","
);
for
(
int
i
=
0
;
i
<
ids
.
length
;
i
++)
{
try
{
String
startTime
=
"2010-05-01 00:00:00"
;
//可为空
List
<
Map
<
String
,
Object
>>
dataList
=
Fenghuang
.
getFenghuangAccountData
(
ids
[
i
],
startTime
,
null
);
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
List
<
String
>
headList
=
new
ArrayList
<
String
>();
headList
.
add
(
"title"
);
headList
.
add
(
"time"
);
headList
.
add
(
"text"
);
headList
.
add
(
"source"
);
headList
.
add
(
"url"
);
headList
.
add
(
"id"
);
poi
.
exportExcel
(
"D://crawlerdata/凤凰-6452.xlsx"
,
ids
[
i
],
headList
,
dataList
);
}
catch
(
Exception
e
)
{
continue
;
}
}
}
}
src/test/java/com/zhiwei/crawler/FenghuangByWordExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
//
package com.zhiwei.crawler;
//
import
java.util.ArrayList
;
//
import java.util.ArrayList;
import
java.util.List
;
//
import java.util.List;
import
java.util.Map
;
//
import java.util.Map;
//
import
org.junit.Test
;
//
import org.junit.Test;
//
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import
com.zhiwei.parse.Fenghuang
;
//
import com.zhiwei.parse.Fenghuang;
import
com.zhiwei.parse.Yidianzixun
;
//
import com.zhiwei.parse.Yidianzixun;
import
com.zhiwei.util.WordReadFile
;
//
import com.zhiwei.util.WordReadFile;
//
public
class
FenghuangByWordExample
{
//
public class FenghuangByWordExample {
//
@Test
//
@Test
public
void
fenghuangByWordTest
()
{
//
public void fenghuangByWordTest() {
List
<
String
>
wordList
=
WordReadFile
.
getWords
(
"D://crawlerdata/关键词.txt"
);
//
List<String> wordList = WordReadFile.getWords("D://crawlerdata/关键词.txt");
List
<
Map
<
String
,
Object
>>
listAll
=
new
ArrayList
<
Map
<
String
,
Object
>>();
//
List<Map<String,Object>> listAll = new ArrayList<Map<String,Object>>();
for
(
String
word
:
wordList
)
{
//
for(String word : wordList) {
try
{
//
try {
List
<
Map
<
String
,
Object
>>
dataList
=
Fenghuang
.
getFenghuangByWord
(
word
,
null
);
//
List<Map<String,Object>> dataList = Fenghuang.getFenghuangByWord(word,null);
if
(
dataList
!=
null
&&
dataList
.
size
()
>
0
)
{
//
if(dataList != null && dataList.size() > 0) {
listAll
.
addAll
(
dataList
);
//
listAll.addAll(dataList);
}
//
}
System
.
out
.
println
(
dataList
.
size
()+
"==========="
+
listAll
.
size
());
//
System.out.println(dataList.size()+"==========="+listAll.size());
}
catch
(
Exception
e
)
{
//
} catch (Exception e) {
continue
;
//
continue;
}
//
}
}
//
}
//
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
List
<
String
>
headList
=
new
ArrayList
<
String
>();
//
List<String> headList = new ArrayList<String>();
headList
.
add
(
"title"
);
//
headList.add("title");
headList
.
add
(
"content"
);
//
headList.add("content");
headList
.
add
(
"source"
);
//
headList.add("source");
headList
.
add
(
"time"
);
//
headList.add("time");
headList
.
add
(
"url"
);
//
headList.add("url");
System
.
out
.
println
(
listAll
.
size
());
//
System.out.println(listAll.size());
poi
.
exportExcel
(
"D://crawlerdata/凤凰-美林.xlsx"
,
"asd"
,
headList
,
listAll
);
//
poi.exportExcel("D://crawlerdata/凤凰-美林.xlsx", "asd", headList, listAll);
}
//
}
//
//
//
}
//
}
src/test/java/com/zhiwei/crawler/FenghuangCommentCountExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
//
package com.zhiwei.crawler;
//
import
java.util.Map
;
//
import java.util.Map;
//
import
org.junit.Test
;
//
import org.junit.Test;
//
import
com.zhiwei.parse.Fenghuang
;
//
import com.zhiwei.parse.Fenghuang;
//
//
public
class
FenghuangCommentCountExample
{
//
public class FenghuangCommentCountExample {
//
@Test
//
@Test
public
void
fenghuangCommentCountTest
()
{
//
public void fenghuangCommentCountTest() {
String
url
=
"http://tech.ifeng.com/a/20181113/45222352_0.shtml"
;
//
String url = "http://tech.ifeng.com/a/20181113/45222352_0.shtml";
//http://news.ifeng.com/a/20161229/50492484_0.shtml
//
//http://news.ifeng.com/a/20161229/50492484_0.shtml
//http://wemedia.ifeng.com/4096977/wemedia.shtml
//
//http://wemedia.ifeng.com/4096977/wemedia.shtml
Map
<
String
,
Object
>
map
=
Fenghuang
.
getFenghuangCommentCount
(
url
,
null
);
//
Map<String,Object> map = Fenghuang.getFenghuangCommentCount(url,null);
System
.
out
.
println
(
map
.
toString
());
//
System.out.println(map.toString());
//
}
//
}
//
//
}
//
}
src/test/java/com/zhiwei/crawler/FenghuangCommentExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
//
package com.zhiwei.crawler;
//
import
java.util.ArrayList
;
//
import java.util.ArrayList;
import
java.util.List
;
//
import java.util.List;
import
java.util.Map
;
//
import java.util.Map;
//
import
org.junit.Test
;
//
import org.junit.Test;
//
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import
com.zhiwei.parse.Fenghuang
;
//
import com.zhiwei.parse.Fenghuang;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
//
import com.zhiwei.tools.tools.ZhiWeiTools;
//
public
class
FenghuangCommentExample
{
//
public class FenghuangCommentExample {
//
@Test
//
@Test
public
void
fenghuangCommentTest
()
{
//
public void fenghuangCommentTest() {
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
Map
<
String
,
Object
>
map
=
poi
.
importExcel
(
"D://crawlerdata//自媒体/凤凰评论采集.xlsx"
,
0
);
//
Map<String,Object> map = poi.importExcel("D://crawlerdata//自媒体/凤凰评论采集.xlsx", 0);
List
<
Map
<
String
,
Object
>>
list
=
(
List
<
Map
<
String
,
Object
>>)
map
.
get
(
"body"
);
//
List<Map<String,Object>> list = (List<Map<String,Object>>)map.get("body");
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
//
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
List
<
String
>
urlList
=
new
ArrayList
<
String
>();
//
List<String> urlList = new ArrayList<String>();
for
(
Map
<
String
,
Object
>
map1
:
list
)
{
//
for(Map<String,Object> map1 : list) {
String
url
=
""
;
//
String url = "";
try
{
//
try {
url
=
map1
.
get
(
"url"
)+
""
;
//
url = map1.get("url")+"";
System
.
out
.
println
(
url
);
//
System.out.println(url);
List
<
Map
<
String
,
Object
>>
dataList
=
Fenghuang
.
getFenghuangCommentData2
(
url
,
null
);
//
List<Map<String,Object>> dataList = Fenghuang.getFenghuangCommentData2(url,null);
if
(
dataList
==
null
||
dataList
.
size
()
<=
0
)
{
//
if(dataList == null || dataList.size() <= 0) {
urlList
.
add
(
url
);
//
urlList.add(url);
}
//
}
if
(
dataList
!=
null
)
{
//
if(dataList != null) {
for
(
Map
<
String
,
Object
>
m
:
dataList
)
{
//
for(Map<String,Object> m : dataList) {
m
.
put
(
"from_url"
,
url
);
//
m.put("from_url", url);
bodyList
.
add
(
m
);
//
bodyList.add(m);
}
//
}
}
//
}
}
catch
(
Exception
e
)
{
//
} catch (Exception e) {
System
.
out
.
println
(
url
);
//
System.out.println(url);
e
.
printStackTrace
();
//
e.printStackTrace();
continue
;
//
continue;
}
//
}
ZhiWeiTools
.
sleep
(
1000
);
//
ZhiWeiTools.sleep(1000);
}
//
}
List
<
String
>
headList
=
new
ArrayList
<
String
>();
//
List<String> headList = new ArrayList<String>();
headList
.
add
(
"nickname"
);
//
headList.add("nickname");
headList
.
add
(
"content"
);
//
headList.add("content");
headList
.
add
(
"id"
);
//
headList.add("id");
headList
.
add
(
"like"
);
//
headList.add("like");
headList
.
add
(
"from"
);
//
headList.add("from");
headList
.
add
(
"time"
);
//
headList.add("time");
headList
.
add
(
"from_url"
);
//
headList.add("from_url");
for
(
String
s
:
urlList
)
{
//
for(String s : urlList) {
System
.
out
.
println
(
s
);
//
System.out.println(s);
}
//
}
poi
.
exportExcel
(
"D://crawlerdata//自媒体/凤凰评论采集.xlsx"
,
"评论采集"
,
headList
,
bodyList
);
//
poi.exportExcel("D://crawlerdata//自媒体/凤凰评论采集.xlsx", "评论采集", headList, bodyList);
//
}
//
}
//
//
}
//
}
src/test/java/com/zhiwei/crawler/MaimaiBywordExample.java
View file @
9234d24c
//
package com.zhiwei.crawler;
package
com
.
zhiwei
.
crawler
;
//
//
import java.util.ArrayList;
import
java.util.ArrayList
;
//
import java.util.Arrays;
import
java.util.Arrays
;
//
import java.util.List;
import
java.util.List
;
//
import java.util.Map;
import
java.util.Map
;
//
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
//
import com.zhiwei.parse.Maimai;
import
com.zhiwei.parse.Maimai
;
//
//
public class MaimaiBywordExample {
public
class
MaimaiBywordExample
{
//
//
public static void main(String[] args) {
public
static
void
main
(
String
[]
args
)
{
//
String word = "美团|某团|MT|大众点评|新美大|美团点评";
String
word
=
"美团|某团|MT|大众点评|新美大|美团点评"
;
// String cookie = "_buuid=ba30f54f-57ed-4dd4-af5f-31cb08d2eacf; sessionid=lejfy3gdu5tf9x9zowxfhtq5o73dubc5; guid=GxsfBBgZGwQYGx4EGBkeVgcYGxkdHhMeHhkcVhwZBB0ZHwVDWEtMS3kKEhMEEh0fGQQaBBsdBU9HRVhCaQoDRUFJT20KT0FDRgoGZmd+YmECChwZBB0ZHwVeQ2FIT31PRlpaawoDHhx9ZX0KERkEHAp+ZApZXUVOREN9AgoaBB8FS0ZGQ1BFZw==; seid=s1550814253444; token=\"G8eNNNylPoi3oIPLUr/d/RDaMgtnpZCskxT7wu1pRRrkiy3J8G7StHgTx9DQBq4O8CKuzcDfAvoCmBm7+jVysA==\"; uid=\"aa0Zx+VbwC41ceG8bxvIefAirs3A3wL6ApgZu/o1crA=\"; session=eyJ1IjoiMTczMzMzNTM2Iiwic2VjcmV0IjoiVjJuNHdCVDBncVNacTRxVllGM29jRUVwIiwic3RhdHVzIjp0cnVlLCJfZXhwaXJlIjoxNTUwOTAwNjY1Njg4LCJfbWF4QWdlIjo4NjQwMDAwMH0=; session.sig=b_tga85tZskxsgKX8YIM_JKByi0
";
String
cookie
=
"_buuid=ba30f54f-57ed-4dd4-af5f-31cb08d2eacf; sessionid=8d1sx8i4gj0ocmtyc86x2yj0467ymayv; token=\"wl8U6GizDpoS6uzZ1ug93sJjfBucfB7IOoDxDVWOy+g7egJdXL/riMlMlHuQj+gM8CKuzcDfAvoCmBm7+jVysA==\"; uid=\"aa0Zx+VbwC41ceG8bxvIefAirs3A3wL6ApgZu/o1crA=\"; session=eyJ1IjoiMTczMzMzNTM2Iiwic2VjcmV0IjoiLVctRlpDLXg3N1h4ZEhkeEs0Qi1NR0VDIiwibWlkNDU2ODc2MCI6ZmFsc2UsInN0YXR1cyI6dHJ1ZSwiX2V4cGlyZSI6MTU1NzEyNDAxMzA0NSwiX21heEFnZSI6ODY0MDAwMDB9; session.sig=NZ2D9ZQU_Wlx6JGAFap4Znviz6k
"
;
//
String time = "2019-02-15 00:00:00";
String
time
=
"2019-02-15 00:00:00"
;
//
String[] words = word.split("\\|");
String
[]
words
=
word
.
split
(
"\\|"
);
//
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
//
for(String w : words) {
for
(
String
w
:
words
)
{
//
//实名动态
//实名动态
//
//
List<Map<String,Object>> c = Maimai.getData(w, cookie, time, null);
// List<Map<String,Object>> c = Maimai.getData(w, cookie, time, null);
//
//职言交流
//职言交流
//
List<Map<String,Object>> c2 = Maimai.getDataByNoName(w, cookie, time, null);
List
<
Map
<
String
,
Object
>>
c2
=
Maimai
.
getDataByNoName
(
w
,
cookie
,
time
,
null
);
//
//
bodyList.addAll(c);
// bodyList.addAll(c);
//
bodyList.addAll(c2);
bodyList
.
addAll
(
c2
);
//
}
}
//
List<String> headList = Arrays.asList("time","url","text","name","like","comment_count","spreads","word");
List
<
String
>
headList
=
Arrays
.
asList
(
"time"
,
"url"
,
"text"
,
"name"
,
"like"
,
"comment_count"
,
"spreads"
,
"word"
);
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
//
poi.exportExcel("D:\\crawlerdata\\自媒体\\脉脉关键词采集-美团-0222.xlsx", "脉脉关键词", headList, bodyList);
poi
.
exportExcel
(
"D:\\crawlerdata\\自媒体\\脉脉关键词采集-美团-0222.xlsx"
,
"脉脉关键词"
,
headList
,
bodyList
);
//
}
}
//
//
}
}
src/test/java/com/zhiwei/crawler/MeipaiByWordExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
//
package com.zhiwei.crawler;
//
import
java.util.ArrayList
;
//
import java.util.ArrayList;
import
java.util.List
;
//
import java.util.List;
import
java.util.Map
;
//
import java.util.Map;
//
import
org.junit.Test
;
//
import org.junit.Test;
//
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import
com.zhiwei.parse.Meipai
;
//
import com.zhiwei.parse.Meipai;
//
public
class
MeipaiByWordExample
{
//
public class MeipaiByWordExample {
//
@Test
//
@Test
public
void
meipaiByWordTest
()
{
//
public void meipaiByWordTest() {
String
word
=
"美食,吃,菜"
;
//
String word = "美食,吃,菜";
String
[]
words
=
word
.
split
(
","
);
//
String[] words = word.split(",");
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
//
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
for
(
String
w
:
words
)
{
//
for(String w : words) {
List
<
Map
<
String
,
Object
>>
dataList
=
Meipai
.
getMeipaiByWordData
(
w
,
null
);
//
List<Map<String,Object>> dataList = Meipai.getMeipaiByWordData(w,null);
if
(
dataList
!=
null
)
{
//
if(dataList != null) {
bodyList
.
addAll
(
dataList
);
//
bodyList.addAll(dataList);
}
//
}
}
//
}
List
<
String
>
headList
=
new
ArrayList
<
String
>();
//
List<String> headList = new ArrayList<String>();
headList
.
add
(
"time"
);
//
headList.add("time");
headList
.
add
(
"video_count"
);
//
headList.add("video_count");
headList
.
add
(
"content"
);
//
headList.add("content");
headList
.
add
(
"url"
);
//
headList.add("url");
headList
.
add
(
"like"
);
//
headList.add("like");
headList
.
add
(
"comment_count"
);
//
headList.add("comment_count");
headList
.
add
(
"source"
);
//
headList.add("source");
headList
.
add
(
"source_url"
);
//
headList.add("source_url");
//
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
poi
.
exportExcel
(
"D://crawlerdata/美拍关键词采集.xlsx"
,
"美拍数据"
,
headList
,
bodyList
);
//
poi.exportExcel("D://crawlerdata/美拍关键词采集.xlsx", "美拍数据", headList, bodyList);
//
}
//
}
//
//
//
}
//
}
src/test/java/com/zhiwei/crawler/MiaopaiByUrlExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
//
package com.zhiwei.crawler;
//
import
java.util.ArrayList
;
//
import java.util.ArrayList;
import
java.util.List
;
//
import java.util.List;
import
java.util.Map
;
//
import java.util.Map;
//
import
org.junit.Test
;
//
import org.junit.Test;
//
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import
com.zhiwei.parse.Miaopai
;
//
import com.zhiwei.parse.Miaopai;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
//
import com.zhiwei.tools.tools.ZhiWeiTools;
//
public
class
MiaopaiByUrlExample
{
//
public class MiaopaiByUrlExample {
//
@Test
//
@Test
public
void
miaopaiByUrlTest
()
{
//
public void miaopaiByUrlTest() {
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
Map
<
String
,
Object
>
map
=
poi
.
importExcel
(
"D://crawlerdata/秒拍美食.xlsx"
,
0
);
//
Map<String,Object> map = poi.importExcel("D://crawlerdata/秒拍美食.xlsx", 0);
List
<
Map
<
String
,
Object
>>
list
=
(
List
<
Map
<
String
,
Object
>>)
map
.
get
(
"body"
);
//
List<Map<String,Object>> list = (List<Map<String,Object>>)map.get("body");
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
//
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
List
<
String
>
urlList
=
new
ArrayList
<
String
>();
//
List<String> urlList = new ArrayList<String>();
for
(
Map
<
String
,
Object
>
map1
:
list
)
{
//
for(Map<String,Object> map1 : list) {
String
url
=
""
;
//
String url = "";
try
{
//
try {
url
=
map1
.
get
(
"url"
)+
""
;
//
url = map1.get("url")+"";
if
(
urlList
.
contains
(
url
))
{
//
if(urlList.contains(url)) {
continue
;
//
continue;
}
//
}
urlList
.
add
(
url
);
//
urlList.add(url);
ZhiWeiTools
.
sleep
(
5000
);
//
ZhiWeiTools.sleep(5000);
System
.
out
.
println
(
url
);
//
System.out.println(url);
Map
<
String
,
Object
>
dataMap
=
Miaopai
.
getMiaopaiDataByURL
(
url
,
null
);
//
Map<String,Object> dataMap = Miaopai.getMiaopaiDataByURL(url,null);
if
(
dataMap
!=
null
)
{
//
if(dataMap != null) {
bodyList
.
add
(
dataMap
);
//
bodyList.add(dataMap);
}
//
}
}
catch
(
Exception
e
)
{
//
} catch (Exception e) {
System
.
out
.
println
(
url
);
//
System.out.println(url);
e
.
printStackTrace
();
//
e.printStackTrace();
continue
;
//
continue;
}
//
}
}
//
}
List
<
String
>
headList
=
new
ArrayList
<
String
>();
//
List<String> headList = new ArrayList<String>();
headList
.
add
(
"time"
);
//
headList.add("time");
headList
.
add
(
"source"
);
//
headList.add("source");
headList
.
add
(
"title"
);
//
headList.add("title");
headList
.
add
(
"url"
);
//
headList.add("url");
headList
.
add
(
"video_count"
);
//
headList.add("video_count");
poi
.
exportExcel
(
"D://crawlerdata/秒拍美食.xlsx"
,
"asd"
,
headList
,
bodyList
);
//
poi.exportExcel("D://crawlerdata/秒拍美食.xlsx", "asd", headList, bodyList);
//
}
//
}
//
//
}
//
}
src/test/java/com/zhiwei/crawler/PearVideoByWordExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
//
package com.zhiwei.crawler;
//
import
java.util.ArrayList
;
//
import java.util.ArrayList;
import
java.util.List
;
//
import java.util.List;
import
java.util.Map
;
//
import java.util.Map;
//
import
org.junit.Test
;
//
import org.junit.Test;
//
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import
com.zhiwei.parse.PearVideo
;
//
import com.zhiwei.parse.PearVideo;
//
public
class
PearVideoByWordExample
{
//
public class PearVideoByWordExample {
//
@Test
//
@Test
public
void
pearVideoByWordTest
()
{
//
public void pearVideoByWordTest() {
String
word
=
"大宝 甲醛"
;
//
String word = "大宝 甲醛";
//
List
<
Map
<
String
,
Object
>>
bodyList
=
PearVideo
.
getPearVideoData
(
word
,
null
);
//
List<Map<String,Object>> bodyList = PearVideo.getPearVideoData(word,null);
List
<
String
>
headList
=
new
ArrayList
<
String
>();
//
List<String> headList = new ArrayList<String>();
headList
.
add
(
"time"
);
//
headList.add("time");
headList
.
add
(
"title"
);
//
headList.add("title");
headList
.
add
(
"content"
);
//
headList.add("content");
headList
.
add
(
"url"
);
//
headList.add("url");
headList
.
add
(
"like"
);
//
headList.add("like");
headList
.
add
(
"source"
);
//
headList.add("source");
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
poi
.
exportExcel
(
"D://crawlerdata/梨视频关键词采集.xlsx"
,
"梨视频采集结果"
,
headList
,
bodyList
);
//
poi.exportExcel("D://crawlerdata/梨视频关键词采集.xlsx", "梨视频采集结果", headList, bodyList);
//
//
}
//
}
//
//
}
//
}
src/test/java/com/zhiwei/crawler/QQAccountExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
//
package com.zhiwei.crawler;
//
import
java.util.ArrayList
;
//
import java.util.ArrayList;
import
java.util.List
;
//
import java.util.List;
import
java.util.Map
;
//
import java.util.Map;
//
import
org.junit.Test
;
//
import org.junit.Test;
//
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import
com.zhiwei.parse.QQKB
;
//
import com.zhiwei.parse.QQKB;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
//
import com.zhiwei.tools.tools.ZhiWeiTools;
//
public
class
QQAccountExample
{
//
public class QQAccountExample {
//
@Test
//
@Test
public
void
qqAccountTest
()
{
//
public void qqAccountTest() {
//
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
Map
<
String
,
Object
>
dataMap
=
poi
.
importExcel
(
"D://crawlerdata//自媒体/天天快报历史文章采集.xlsx"
,
0
);
//
Map<String,Object> dataMap = poi.importExcel("D://crawlerdata//自媒体/天天快报历史文章采集.xlsx", 0);
List
<
Map
<
String
,
Object
>>
dataList
=
(
List
<
Map
<
String
,
Object
>>)
dataMap
.
get
(
"body"
);
//
List<Map<String,Object>> dataList = (List<Map<String, Object>>) dataMap.get("body");
String
cookie
=
"phone_id=;%20phone_token=;%20luin=o0497332654;%20lskey=000300006218e8444698ebbad28deed8243ef28e87f247b133bf5d19f160bce91d4e3e06ec2003cd4f92aff0;%20uin=o0497332654;%20skey=MQL8ScQBxu;%20sigA2=2EBEFFCC5C22FD27B32E6F21C76CA4A2AD8BF5D626B629A38C50923810CCD19C574CD093CFAD29C845084C8CD3E04B57DD24A69E4418E060C899696A728FB3B8B14C577F1FF93340;%20openid=oijc7uLZNVp85r-MpFBYxwxzdEkg;%20appid=wxe90c9765ad00e2cd;%20access_token=5_IWbsKfygpJ0lnbUnnFnfwmSQ3EqlyzseC9-qGEFY7Tkr0Ypk5vsnSwOaMC-IGxsBeY2K7knHrYstj_5dZpisJd5nihvLNQvCdsFhFwZQcT8;%20refresh_token=5_IWbsKfygpJ0lnbUnnFnfwgDT4pA9HEbY-wuqnqIHWf9AzdmIueZFuzHYfnZPuSNEc0ZjDuHXrtSrRBMMD-7kgj06iF0NdAOi-KRj6-mrmlA;%20unionid=onCs1uNNpjMXeYIHAhacGypamEBk;%20logintype=0"
;
//
String cookie = "phone_id=;%20phone_token=;%20luin=o0497332654;%20lskey=000300006218e8444698ebbad28deed8243ef28e87f247b133bf5d19f160bce91d4e3e06ec2003cd4f92aff0;%20uin=o0497332654;%20skey=MQL8ScQBxu;%20sigA2=2EBEFFCC5C22FD27B32E6F21C76CA4A2AD8BF5D626B629A38C50923810CCD19C574CD093CFAD29C845084C8CD3E04B57DD24A69E4418E060C899696A728FB3B8B14C577F1FF93340;%20openid=oijc7uLZNVp85r-MpFBYxwxzdEkg;%20appid=wxe90c9765ad00e2cd;%20access_token=5_IWbsKfygpJ0lnbUnnFnfwmSQ3EqlyzseC9-qGEFY7Tkr0Ypk5vsnSwOaMC-IGxsBeY2K7knHrYstj_5dZpisJd5nihvLNQvCdsFhFwZQcT8;%20refresh_token=5_IWbsKfygpJ0lnbUnnFnfwgDT4pA9HEbY-wuqnqIHWf9AzdmIueZFuzHYfnZPuSNEc0ZjDuHXrtSrRBMMD-7kgj06iF0NdAOi-KRj6-mrmlA;%20unionid=onCs1uNNpjMXeYIHAhacGypamEBk;%20logintype=0";
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
//
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
for
(
Map
<
String
,
Object
>
map
:
dataList
)
{
//
for(Map<String,Object> map : dataList) {
String
child
=
map
.
get
(
"帐号链接"
)+
""
;
//
String child = map.get("帐号链接")+"";
// System.out.println(child.split("chlid=")[1]);
//
//
System.out.println(child.split("chlid=")[1]);
System
.
out
.
println
(
child
.
split
(
"="
)[
1
]);
//
System.out.println(child.split("=")[1]);
//
List
<
Map
<
String
,
Object
>>
lists
=
QQKB
.
getQQAccountData
(
"5001789
"
,
cookie
,
null
);
// List<Map<String,Object>> lists = QQKB.getQQAccountData("5456950
", cookie,null);
if
(
lists
!=
null
)
{
//
if(lists != null) {
for
(
Map
<
String
,
Object
>
map1
:
lists
)
{
//
for(Map<String,Object> map1 : lists) {
map1
.
put
(
"name"
,
map
.
get
(
"呢称"
));
//
map1.put("name", map.get("呢称"));
map1
.
put
(
"主页地址"
,
map
.
get
(
"帐号链接"
));
//
map1.put("主页地址", map.get("帐号链接"));
bodyList
.
add
(
map1
);
//
bodyList.add(map1);
}
//
}
}
//
}
System
.
out
.
println
(
"采集到的历史文章数总和============="
+
bodyList
.
size
());
//
System.out.println("采集到的历史文章数总和============="+bodyList.size());
ZhiWeiTools
.
sleep
(
5000
);
//
ZhiWeiTools.sleep(5000);
}
//
}
System
.
out
.
println
(
dataList
.
size
());
//
System.out.println(dataList.size());
List
<
String
>
headList
=
new
ArrayList
<
String
>();
//
List<String> headList = new ArrayList<String>();
headList
.
add
(
"name"
);
//
headList.add("name");
headList
.
add
(
"主页地址"
);
//
headList.add("主页地址");
headList
.
add
(
"title"
);
//
headList.add("title");
headList
.
add
(
"time"
);
//
headList.add("time");
headList
.
add
(
"content"
);
//
headList.add("content");
headList
.
add
(
"url"
);
//
headList.add("url");
headList
.
add
(
"commentid"
);
//
headList.add("commentid");
poi
.
exportExcel
(
"D://crawlerdata//自媒体/天天快报采集-科技编年史.xlsx"
,
"asd"
,
headList
,
bodyList
);
//
poi.exportExcel("D://crawlerdata//自媒体/天天快报采集-科技编年史.xlsx", "asd", headList, bodyList);
}
//
}
//
//
}
//
}
src/test/java/com/zhiwei/crawler/QQKBByWordExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
//
package com.zhiwei.crawler;
//
import
org.junit.Test
;
//
import org.junit.Test;
//
import
com.zhiwei.parse.QQKB
;
//
import com.zhiwei.parse.QQKB;
//
public
class
QQKBByWordExample
{
//
public class QQKBByWordExample {
//
@Test
//
@Test
public
void
qqkbByWordTest
()
{
//
public void qqkbByWordTest() {
String
word
=
"麦当劳"
;
//
String word = "麦当劳";
String
cookie
=
"phone_id=;%20phone_token=;%20luin=o0497332654;%20lskey=00030000a7aa9ad0c4636295c0f484bf54820005db056be0651296f399e092bc56e4910bbedaf413ced3fb8a;%20uin=o0497332654;%20skey=MSF4MCe62n;%20sigA2=71400747BF93A51F5F103AC6180C723E940637B87127C104AA325F5709FD378BEB014D649D6B653031F6B2E0962C0F8D9C06807A1CE509983C1C70B641606FEA84B90777926863E4;%20openid=oijc7uLZNVp85r-MpFBYxwxzdEkg;%20appid=wxe90c9765ad00e2cd;%20access_token=5_IWbsKfygpJ0lnbUnnFnfwsY93PCkKQ3oQ4nAAU9-VNf0eKb4SlfVLb-YW7bRaEL2jS3XDtJGO8m9DoDrFZ4bFyv96Pb5ZfvVIyehq-jQ65o;%20refresh_token=5_IWbsKfygpJ0lnbUnnFnfwgDT4pA9HEbY-wuqnqIHWf9AzdmIueZFuzHYfnZPuSNEc0ZjDuHXrtSrRBMMD-7kgj06iF0NdAOi-KRj6-mrmlA;%20unionid=onCs1uNNpjMXeYIHAhacGypamEBk;%20logintype=0"
;
//
String cookie = "phone_id=;%20phone_token=;%20luin=o0497332654;%20lskey=00030000a7aa9ad0c4636295c0f484bf54820005db056be0651296f399e092bc56e4910bbedaf413ced3fb8a;%20uin=o0497332654;%20skey=MSF4MCe62n;%20sigA2=71400747BF93A51F5F103AC6180C723E940637B87127C104AA325F5709FD378BEB014D649D6B653031F6B2E0962C0F8D9C06807A1CE509983C1C70B641606FEA84B90777926863E4;%20openid=oijc7uLZNVp85r-MpFBYxwxzdEkg;%20appid=wxe90c9765ad00e2cd;%20access_token=5_IWbsKfygpJ0lnbUnnFnfwsY93PCkKQ3oQ4nAAU9-VNf0eKb4SlfVLb-YW7bRaEL2jS3XDtJGO8m9DoDrFZ4bFyv96Pb5ZfvVIyehq-jQ65o;%20refresh_token=5_IWbsKfygpJ0lnbUnnFnfwgDT4pA9HEbY-wuqnqIHWf9AzdmIueZFuzHYfnZPuSNEc0ZjDuHXrtSrRBMMD-7kgj06iF0NdAOi-KRj6-mrmlA;%20unionid=onCs1uNNpjMXeYIHAhacGypamEBk;%20logintype=0";
//无法找到下一页
//
//无法找到下一页
// QQKB.getQQKBByWordData(word, cookie);
//
//
QQKB.getQQKBByWordData(word, cookie);
//
}
//
}
//
//
//
}
//
}
src/test/java/com/zhiwei/crawler/QQKBCommentCountExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
//
package com.zhiwei.crawler;
//
import
org.junit.Test
;
//
import org.junit.Test;
//
import
com.zhiwei.parse.QQKB
;
//
import com.zhiwei.parse.QQKB;
//
public
class
QQKBCommentCountExample
{
//
public class QQKBCommentCountExample {
//
//
@Test
//
@Test
public
void
qqkbCommentCountTest
()
{
//
public void qqkbCommentCountTest() {
String
cookie
=
""
;
//
String cookie = "";
String
url
=
"https://kuaibao.qq.com/s/20190305A16P6L00"
;
//
String url = "https://kuaibao.qq.com/s/20190305A16P6L00";
//
int
i
=
QQKB
.
getCommentCount
(
url
,
null
);
//
int i = QQKB.getCommentCount(url,null);
System
.
out
.
println
(
i
);
//
System.out.println(i);
//
}
//
}
//
//
}
//
}
src/test/java/com/zhiwei/crawler/QQKBCommentExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
//
package com.zhiwei.crawler;
//
import
java.util.ArrayList
;
//
import java.util.ArrayList;
import
java.util.List
;
//
import java.util.List;
import
java.util.Map
;
//
import java.util.Map;
//
import
org.junit.Test
;
//
import org.junit.Test;
//
import
com.zhiwei.common.config.GroupType
;
//
import com.zhiwei.common.config.GroupType;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
//
import com.zhiwei.crawler.proxy.ProxyFactory;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import
com.zhiwei.parse.QQKB
;
//
import com.zhiwei.parse.QQKB;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
//
import com.zhiwei.tools.tools.ZhiWeiTools;
//
public
class
QQKBCommentExample
{
//
public class QQKBCommentExample {
//
//天天快报与腾讯新闻都可用 不用cookie
//
//天天快报与腾讯新闻都可用 不用cookie
@Test
//
@Test
public
void
qqkbCommentTest
()
{
//
public void qqkbCommentTest() {
String
url
=
"https://kuaibao.qq.com/s/20181122A11WQB00"
;
//
String url = "https://kuaibao.qq.com/s/20181122A11WQB00";
//https://kuaibao.qq.com/s/20180423A1PI7400?refer=kb_news
//
//https://kuaibao.qq.com/s/20180423A1PI7400?refer=kb_news
// https://kuaibao.qq.com/s/20180423A0L60800?refer=kb_news
//
//
https://kuaibao.qq.com/s/20180423A0L60800?refer=kb_news
ProxyFactory
.
init
(
"zookeeper://192.168.0.36:2181"
,
"local"
,
//
ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",
GroupType
.
PROVIDER
);
//
GroupType.PROVIDER);
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
Map
<
String
,
Object
>
map
=
poi
.
importExcel
(
"D://crawlerdata//自媒体/快报评论采集.xlsx"
,
0
);
//
Map<String,Object> map = poi.importExcel("D://crawlerdata//自媒体/快报评论采集.xlsx", 0);
List
<
Map
<
String
,
Object
>>
list
=
(
List
<
Map
<
String
,
Object
>>)
map
.
get
(
"body"
);
//
List<Map<String,Object>> list = (List<Map<String,Object>>)map.get("body");
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
//
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
for
(
Map
<
String
,
Object
>
m
:
list
)
{
//
for(Map<String,Object> m : list) {
String
u
=
m
.
get
(
"地址"
).
toString
();
//
String u = m.get("地址").toString();
System
.
out
.
println
(
u
);
//
System.out.println(u);
ZhiWeiTools
.
sleep
(
2000
);
//
ZhiWeiTools.sleep(2000);
List
<
Map
<
String
,
Object
>>
dataList
=
QQKB
.
getQQKBCommentData
(
u
,
null
);
//
List<Map<String,Object>> dataList = QQKB.getQQKBCommentData(u,null);
if
(
dataList
!=
null
)
{
//
if(dataList!= null) {
bodyList
.
addAll
(
dataList
);
//
bodyList.addAll(dataList);
}
//
}
}
//
}
List
<
String
>
headList
=
new
ArrayList
<
String
>();
//
List<String> headList = new ArrayList<String>();
headList
.
add
(
"reply_id"
);
//id
//
headList.add("reply_id"); //id
headList
.
add
(
"like"
);
//点赞数
//
headList.add("like"); //点赞数
headList
.
add
(
"name"
);
//呢称
//
headList.add("name"); //呢称
headList
.
add
(
"reply_num"
);
//回复数
//
headList.add("reply_num"); //回复数
headList
.
add
(
"time"
);
//时间
//
headList.add("time"); //时间
headList
.
add
(
"content"
);
//内容
//
headList.add("content"); //内容
System
.
out
.
println
(
bodyList
.
size
());
//
System.out.println(bodyList.size());
poi
.
exportExcel
(
"D:\\crawlerdata\\自媒体\\快报评论采集.xlsx"
,
"sada"
,
headList
,
bodyList
);
//
poi.exportExcel("D:\\crawlerdata\\自媒体\\快报评论采集.xlsx", "sada", headList, bodyList);
//
}
//
}
//
//
}
//
}
src/test/java/com/zhiwei/crawler/QQNewsCommentListTest.java
View file @
9234d24c
This diff is collapsed.
Click to expand it.
src/test/java/com/zhiwei/crawler/SinaCommentListTest.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
//
package com.zhiwei.crawler;
//
import
java.io.IOException
;
//
import java.io.IOException;
import
java.util.ArrayList
;
//
import java.util.ArrayList;
import
java.util.HashMap
;
//
import java.util.HashMap;
import
java.util.List
;
//
import java.util.List;
import
java.util.Map
;
//
import java.util.Map;
//
import
com.alibaba.fastjson.JSONArray
;
//
import com.alibaba.fastjson.JSONArray;
import
com.alibaba.fastjson.JSONObject
;
//
import com.alibaba.fastjson.JSONObject;
import
com.zhiwei.tools.httpclient.HeaderTool
;
//
import com.zhiwei.tools.httpclient.HeaderTool;
import
com.zhiwei.tools.httpclient.HttpClientTemplateOK
;
//
import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
//
/**
/
//
**
* @ClassName: SinaCommentListTest
//
* @ClassName: SinaCommentListTest
* @Description: TODO(新浪新闻评论抓取)
//
* @Description: TODO(新浪新闻评论抓取)
* @author hero
//
* @author hero
* @date 2017年8月10日 下午6:08:41
//
* @date 2017年8月10日 下午6:08:41
*/
//
*/
public
class
SinaCommentListTest
{
//
public class SinaCommentListTest {
//
//
public
static
void
sinaCommentListTest
(
String
url
)
{
//
public static void sinaCommentListTest(String url) {
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
//
Map<String,String> headerMap = HeaderTool.getCommonHead();
String
newsId
=
getCommentId
(
url
).
split
(
"====="
)[
1
];
//
String newsId = getCommentId(url).split("=====")[1];
String
channel
=
getCommentId
(
url
).
split
(
"====="
)[
0
];
//
String channel = getCommentId(url).split("=====")[0];
int
page
=
1
;
//
int page = 1;
try
{
//
try {
String
comment_url
=
"http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel="
+
channel
+
"&newsid="
+
newsId
+
"&group=0&compress=0&ie=gbk&oe=gbk&page="
+
page
+
"&page_size=20&jsvar=loader_1525576000752_30189682"
;
//
String comment_url = "http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel="+channel+"&newsid="+newsId+"&group=0&compress=0&ie=gbk&oe=gbk&page="+page+"&page_size=20&jsvar=loader_1525576000752_30189682";
System
.
out
.
println
(
"commenturl========"
+
comment_url
);
//
System.out.println("commenturl========"+comment_url);
String
html
=
HttpClientTemplateOK
.
get
(
comment_url
,
null
,
headerMap
);
//
String html = HttpClientTemplateOK.get(comment_url, null, headerMap);
if
(
html
!=
null
){
//
if(html!=null){
html
=
html
.
substring
(
html
.
indexOf
(
"="
,
0
)+
1
,
html
.
length
());
//
html = html.substring(html.indexOf("=",0)+1,html.length());
System
.
out
.
println
(
html
);
//
System.out.println(html);
JSONObject
data
=
JSONObject
.
parseObject
(
html
).
getJSONObject
(
"result"
);
//
JSONObject data = JSONObject.parseObject(html).getJSONObject("result");
JSONArray
jsonArray
=
data
.
getJSONArray
(
"cmntlist"
);
//
JSONArray jsonArray = data.getJSONArray("cmntlist");
for
(
int
a
=
0
;
a
<
jsonArray
.
size
();
a
++){
//
for(int a = 0;a<jsonArray.size();a++){
Map
<
String
,
Object
>
doc
=
new
HashMap
<
String
,
Object
>();
//
Map<String,Object> doc = new HashMap<String, Object>();
JSONObject
json
=
jsonArray
.
getJSONObject
(
a
);
//
JSONObject json = jsonArray.getJSONObject(a);
doc
.
put
(
"_id"
,
json
.
getString
(
"mid"
));
//
doc.put("_id", json.getString("mid"));
doc
.
put
(
"content"
,
json
.
getString
(
"content"
));
//
doc.put("content", json.getString("content"));
doc
.
put
(
"area"
,
json
.
getString
(
"area"
));
//
doc.put("area", json.getString("area"));
doc
.
put
(
"nick"
,
json
.
getString
(
"nick"
));
//
doc.put("nick", json.getString("nick"));
doc
.
put
(
"time"
,
json
.
getString
(
"time"
));
//
doc.put("time", json.getString("time"));
doc
.
put
(
"agree"
,
json
.
getInteger
(
"agree"
));
//
doc.put("agree", json.getInteger("agree"));
doc
.
put
(
"against"
,
json
.
getInteger
(
"against"
));
//
doc.put("against", json.getInteger("against"));
doc
.
put
(
"vote"
,
json
.
getInteger
(
"vote"
));
//
doc.put("vote", json.getInteger("vote"));
doc
.
put
(
"fromUrl"
,
url
);
//
doc.put("fromUrl", url);
System
.
out
.
println
(
"doc==========="
+
doc
);
//
System.out.println("doc==========="+doc);
//
}
//
}
}
else
{
//
}else{
System
.
out
.
println
(
"--------------"
);
//
System.out.println("--------------");
}
//
}
//
}
catch
(
Exception
e
)
{
//
} catch (Exception e) {
e
.
printStackTrace
();
//
e.printStackTrace();
}
//
}
}
//
}
//
//
//
public
static
String
getCommentId
(
String
url
){
//
public static String getCommentId(String url){
String
newsid
=
null
;
//
String newsid = null;
String
channel
=
null
;
//
String channel = null;
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
//
Map<String,String> headerMap = HeaderTool.getCommonHead();
System
.
out
.
println
(
url
);
//
System.out.println(url);
try
{
//
try {
String
html
=
HttpClientTemplateOK
.
get
(
url
,
null
,
headerMap
);
//
String html = HttpClientTemplateOK.get(url, null, headerMap);
if
(
html
!=
null
&&
html
.
contains
(
"newsid"
)){
//
if(html!=null && html.contains("newsid")){
newsid
=
html
.
split
(
"newsid: '"
)[
1
].
split
(
"',"
)[
0
];
//
newsid = html.split("newsid: '")[1].split("',")[0];
channel
=
html
.
split
(
"channel: '"
)[
1
].
split
(
"',"
)[
0
];
//
channel = html.split("channel: '")[1].split("',")[0];
System
.
out
.
println
(
channel
+
"============"
+
newsid
);
//
System.out.println(channel+"============"+newsid);
return
channel
+
"====="
+
newsid
;
//
return channel+"====="+newsid;
}
//
}
}
catch
(
IOException
e
)
{
//
} catch (IOException e) {
return
null
;
//
return null;
}
catch
(
Exception
e
)
{
//
} catch (Exception e) {
e
.
printStackTrace
();
//
e.printStackTrace();
}
//
}
return
newsid
;
//
return newsid;
}
//
}
//
}
//
}
src/test/java/com/zhiwei/crawler/SoKuByWordExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
//
package com.zhiwei.crawler;
//
import
java.util.ArrayList
;
//
import java.util.ArrayList;
import
java.util.List
;
//
import java.util.List;
import
java.util.Map
;
//
import java.util.Map;
//
import
org.junit.Test
;
//
import org.junit.Test;
//
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import
com.zhiwei.parse.Soku
;
//
import com.zhiwei.parse.Soku;
//
public
class
SoKuByWordExample
{
//
public class SoKuByWordExample {
//
@Test
//
@Test
public
void
sokuByWordTest
()
{
//
public void sokuByWordTest() {
String
word
=
"美食,味道,吃,试吃,美味,好吃"
;
//
String word = "美食,味道,吃,试吃,美味,好吃";
String
type
=
"174,103,176"
;
//
String type = "174,103,176";
String
[]
words
=
word
.
split
(
","
);
//
String[] words = word.split(",");
String
[]
types
=
type
.
split
(
","
);
//
String[] types = type.split(",");
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
//
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
for
(
String
w
:
words
)
{
//
for(String w : words ) {
for
(
String
t
:
types
)
{
//
for(String t : types) {
List
<
Map
<
String
,
Object
>>
list
=
Soku
.
getSoKuByWordData
(
w
,
t
,
null
);
//
List<Map<String,Object>> list = Soku.getSoKuByWordData(w, t,null);
if
(
list
!=
null
&&
list
.
size
()
>
0
)
{
//
if(list != null && list.size() > 0) {
bodyList
.
addAll
(
list
);
//
bodyList.addAll(list);
}
//
}
}
//
}
}
//
}
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
List
<
String
>
headList
=
new
ArrayList
<
String
>();
//
List<String> headList = new ArrayList<String>();
headList
.
add
(
"title"
);
//
headList.add("title");
headList
.
add
(
"time"
);
//
headList.add("time");
headList
.
add
(
"play_count"
);
//
headList.add("play_count");
headList
.
add
(
"url"
);
//
headList.add("url");
headList
.
add
(
"source"
);
//
headList.add("source");
poi
.
exportExcel
(
"D://crawlerdata/优酷采集.xlsx"
,
"优酷数据"
,
headList
,
bodyList
);
//
poi.exportExcel("D://crawlerdata/优酷采集.xlsx", "优酷数据", headList, bodyList);
//
}
//
}
//
}
//
}
src/test/java/com/zhiwei/crawler/SouhuAccountExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
//package com.zhiwei.crawler;
//
import
java.util.ArrayList
;
//import java.util.ArrayList;
import
java.util.List
;
//import java.util.List;
import
java.util.Map
;
//import java.util.Map;
//
import
org.junit.Test
;
//import org.junit.Test;
//
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
//import com.zhiwei.common.config.GroupType;
import
com.zhiwei.parse.Souhu
;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Souhu;
public
class
SouhuAccountExample
{
//
//
//http://search.sohu.com/?keyword=%E8%99%8E%E5%97%85&source=article&queryType=edit&ie=utf8
//public class SouhuAccountExample {
//
@Test
// //http://search.sohu.com/?keyword=%E8%99%8E%E5%97%85&source=article&queryType=edit&ie=utf8
public
void
souhuAccountTest
()
{
//
List
<
Map
<
String
,
Object
>>
lists
=
Souhu
.
getSouHuAccountData
(
"c29odXptdHNmbjZ0cnRAc29odS5jb20="
,
"2018-05-01 00:00:00"
,
false
,
null
);
// @Test
System
.
out
.
println
(
lists
.
size
());
// public void souhuAccountTest() {
List
<
String
>
headList
=
new
ArrayList
<
String
>();
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",GroupType.PROVIDER);
headList
.
add
(
"title"
);
// List<Map<String,Object>> lists = Souhu.getSouHuAccountData("99938933","浅黑科技","2018-05-01 00:00:00",false,null);
headList
.
add
(
"time"
);
// System.out.println(lists.size());
headList
.
add
(
"content"
);
// List<String> headList = new ArrayList<String>();
headList
.
add
(
"url"
);
// headList.add("title");
headList
.
add
(
"comment"
);
// headList.add("time");
headList
.
add
(
"tags"
);
// headList.add("content");
headList
.
add
(
"newsid"
);
// headList.add("url");
headList
.
add
(
"source"
);
// headList.add("comment");
headList
.
add
(
"newsPv"
);
// headList.add("tags");
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
// headList.add("newsid");
poi
.
exportExcel
(
"D:\\crawlerdata\\搜狐号历史文章-乔.xlsx"
,
"乔"
,
headList
,
lists
);
// headList.add("source");
}
// headList.add("newsPv");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
}
// poi.exportExcel("D:\\crawlerdata\\搜狐号历史文章-乔.xlsx", "乔", headList, lists);
// }
//
//}
src/test/java/com/zhiwei/crawler/SouhuCommentCountExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
//
package com.zhiwei.crawler;
//
import
java.util.List
;
//
import java.util.List;
import
java.util.Map
;
//
import java.util.Map;
//
import
org.junit.Test
;
//
import org.junit.Test;
//
import
com.zhiwei.common.config.GroupType
;
//
import com.zhiwei.common.config.GroupType;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
//
import com.zhiwei.crawler.proxy.ProxyFactory;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
//
import com.zhiwei.crawler.proxy.ProxyHolder;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import
com.zhiwei.parse.Souhu
;
//
import com.zhiwei.parse.Souhu;
//
public
class
SouhuCommentCountExample
{
//
public class SouhuCommentCountExample {
//
//
@SuppressWarnings
(
"unchecked"
)
//
@SuppressWarnings("unchecked")
@Test
//
@Test
public
void
souhuCommentCountTest
()
{
//
public void souhuCommentCountTest() {
ProxyFactory
.
init
(
"zookeeper://192.168.0.36:2181"
,
"local"
,
//
ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",
GroupType
.
PROVIDER
);
//
GroupType.PROVIDER);
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
Map
<
String
,
Object
>
map
=
poi
.
importExcel
(
"D://crawlerdata//自媒体//搜狐评论采集.xlsx"
,
0
);
//
Map<String,Object> map = poi.importExcel("D://crawlerdata//自媒体//搜狐评论采集.xlsx", 0);
List
<
Map
<
String
,
Object
>>
list
=
(
List
<
Map
<
String
,
Object
>>)
map
.
get
(
"body"
);
//
List<Map<String,Object>> list = (List<Map<String,Object>>)map.get("body");
List
<
String
>
headList
=
(
List
<
String
>)
map
.
get
(
"head"
);
//
List<String> headList = (List<String>) map.get("head");
for
(
Map
<
String
,
Object
>
map1
:
list
)
{
//
for(Map<String,Object> map1 : list) {
String
url
=
""
;
//
String url = "";
try
{
//
try {
url
=
map1
.
get
(
"url"
)+
""
;
//
url = map1.get("url")+"";
System
.
out
.
println
(
url
);
//
System.out.println(url);
url
=
"http://m.sohu.com/a/299389309_114988"
;
//
url = "http://m.sohu.com/a/299389309_114988";
int
i
=
Souhu
.
getSouhuCommentCount
(
url
,
ProxyHolder
.
NAT_PROXY
);
//
int i = Souhu.getSouhuCommentCount(url,ProxyHolder.NAT_PROXY);
int
j
=
Souhu
.
getSohuReadNum
(
url
,
ProxyHolder
.
NAT_PROXY
);
//
int j = Souhu.getSohuReadNum(url, ProxyHolder.NAT_PROXY);
map1
.
put
(
"count"
,
i
);
//
map1.put("count", i);
map1
.
put
(
"redNum"
,
j
);
//
map1.put("redNum", j);
System
.
out
.
println
(
map1
.
toString
());
//
System.out.println(map1.toString());
}
catch
(
Exception
e
)
{
//
} catch (Exception e) {
System
.
out
.
println
(
url
);
//
System.out.println(url);
e
.
printStackTrace
();
//
e.printStackTrace();
continue
;
//
continue;
}
//
}
}
//
}
headList
.
add
(
"count"
);
//
headList.add("count");
poi
.
exportExcel
(
"D://crawlerdata//自媒体//搜狐评论采集.xlsx"
,
"sheet2"
,
headList
,
list
);
//
poi.exportExcel("D://crawlerdata//自媒体//搜狐评论采集.xlsx", "sheet2", headList, list);
}
//
}
//
//
}
//
}
src/test/java/com/zhiwei/crawler/SouhuCommentExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
//
package com.zhiwei.crawler;
//
import
java.util.ArrayList
;
//
import java.util.ArrayList;
import
java.util.List
;
//
import java.util.List;
import
java.util.Map
;
//
import java.util.Map;
//
import
org.junit.Test
;
//
import org.junit.Test;
//
import
com.zhiwei.common.config.GroupType
;
//
import com.zhiwei.common.config.GroupType;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
//
import com.zhiwei.crawler.proxy.ProxyFactory;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import
com.zhiwei.parse.Fenghuang
;
//
import com.zhiwei.parse.Fenghuang;
import
com.zhiwei.parse.Souhu
;
//
import com.zhiwei.parse.Souhu;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
//
import com.zhiwei.tools.tools.ZhiWeiTools;
//
public
class
SouhuCommentExample
{
//
public class SouhuCommentExample {
//
@Test
//
@Test
public
void
souhuCommentTest
()
{
//
public void souhuCommentTest() {
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
ProxyFactory
.
init
(
"zookeeper://192.168.0.36:2181"
,
"local"
,
//
ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",
GroupType
.
PROVIDER
);
//
GroupType.PROVIDER);
Map
<
String
,
Object
>
map
=
poi
.
importExcel
(
"D://crawlerdata//自媒体//搜狐评论采集.xlsx"
,
0
);
//
Map<String,Object> map = poi.importExcel("D://crawlerdata//自媒体//搜狐评论采集.xlsx", 0);
List
<
Map
<
String
,
Object
>>
list
=
(
List
<
Map
<
String
,
Object
>>)
map
.
get
(
"body"
);
//
List<Map<String,Object>> list = (List<Map<String,Object>>)map.get("body");
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
//
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
List
<
String
>
urlList
=
new
ArrayList
<
String
>();
//
List<String> urlList = new ArrayList<String>();
for
(
Map
<
String
,
Object
>
map1
:
list
)
{
//
for(Map<String,Object> map1 : list) {
String
url
=
""
;
//
String url = "";
try
{
//
try {
url
=
map1
.
get
(
"url"
)+
""
;
//
url = map1.get("url")+"";
System
.
out
.
println
(
url
);
//
System.out.println(url);
List
<
Map
<
String
,
Object
>>
dataList
=
Souhu
.
getSouhuCommentData
(
url
,
null
);
//
List<Map<String,Object>> dataList = Souhu.getSouhuCommentData(url,null);
if
(
dataList
.
size
()
<=
0
)
{
//
if(dataList.size() <= 0) {
urlList
.
add
(
url
);
//
urlList.add(url);
}
//
}
ZhiWeiTools
.
sleep
(
100
);
//
ZhiWeiTools.sleep(100);
if
(
dataList
!=
null
)
{
//
if(dataList != null) {
bodyList
.
addAll
(
dataList
);
//
bodyList.addAll(dataList);
}
//
}
}
catch
(
Exception
e
)
{
//
} catch (Exception e) {
System
.
out
.
println
(
url
);
//
System.out.println(url);
e
.
printStackTrace
();
//
e.printStackTrace();
continue
;
//
continue;
}
//
}
}
//
}
List
<
String
>
headList
=
new
ArrayList
<
String
>();
//
List<String> headList = new ArrayList<String>();
headList
.
add
(
"nickname"
);
//
headList.add("nickname");
headList
.
add
(
"content"
);
//
headList.add("content");
headList
.
add
(
"user_id"
);
//
headList.add("user_id");
headList
.
add
(
"loaction"
);
//
headList.add("loaction");
headList
.
add
(
"support_count"
);
//
headList.add("support_count");
headList
.
add
(
"comment_id"
);
//
headList.add("comment_id");
headList
.
add
(
"reply_id"
);
//
headList.add("reply_id");
headList
.
add
(
"time"
);
//
headList.add("time");
for
(
String
s
:
urlList
)
{
//
for(String s : urlList) {
System
.
out
.
println
(
s
);
//
System.out.println(s);
}
//
}
poi
.
exportExcel
(
"D://crawlerdata//自媒体//搜狐评论采集.xlsx"
,
"搜狐评论"
,
headList
,
bodyList
);
//
poi.exportExcel("D://crawlerdata//自媒体//搜狐评论采集.xlsx", "搜狐评论", headList, bodyList);
}
//
}
//
//
}
//
}
src/test/java/com/zhiwei/crawler/TXNewsByWordExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
//
package com.zhiwei.crawler;
//
import
java.util.ArrayList
;
//
import java.util.ArrayList;
import
java.util.List
;
//
import java.util.List;
import
java.util.Map
;
//
import java.util.Map;
//
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import
com.zhiwei.parse.TXNews
;
//
import com.zhiwei.parse.TXNews;
//
public
class
TXNewsByWordExample
{
//
public class TXNewsByWordExample {
//
public
static
void
main
(
String
[]
args
)
{
//
public static void main(String[] args) {
String
word
=
"唐嫣"
;
//
String word = "唐嫣";
String
devid
=
"6D33F35F-880D-42A6-A23F-881BEC6960EC"
;
//
String devid = "6D33F35F-880D-42A6-A23F-881BEC6960EC";
List
<
Map
<
String
,
Object
>>
dataList
=
TXNews
.
getData
(
word
,
devid
,
null
);
//
List<Map<String,Object>> dataList = TXNews.getData(word,devid,null);
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
List
<
String
>
headList
=
new
ArrayList
<
String
>();
//
List<String> headList = new ArrayList<String>();
headList
.
add
(
"title"
);
//
headList.add("title");
headList
.
add
(
"time"
);
//
headList.add("time");
headList
.
add
(
"content"
);
//
headList.add("content");
headList
.
add
(
"url"
);
//
headList.add("url");
headList
.
add
(
"id"
);
//
headList.add("id");
headList
.
add
(
"source"
);
//
headList.add("source");
poi
.
exportExcel
(
"D://crawlerdata/腾讯新闻-唐嫣-1.xlsx"
,
"腾讯新闻数据"
,
headList
,
dataList
);
//
poi.exportExcel("D://crawlerdata/腾讯新闻-唐嫣-1.xlsx", "腾讯新闻数据", headList, dataList);
}
//
}
//
}
//
}
src/test/java/com/zhiwei/crawler/Test1.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
//
package com.zhiwei.crawler;
//
import
java.io.UnsupportedEncodingException
;
//
import java.io.UnsupportedEncodingException;
import
java.net.URLEncoder
;
//
import java.net.URLEncoder;
import
java.util.regex.Matcher
;
//
import java.util.regex.Matcher;
import
java.util.regex.Pattern
;
//
import java.util.regex.Pattern;
//
import
org.junit.Test
;
//
import org.junit.Test;
//
/** Scratch class that prints the fifth "/"-separated segment of a fixed article URL. */
public class Test1 {

    /**
     * Splits the hard-coded URL on "/" and prints the element at index 4
     * (the last path segment of this particular URL).
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String articleUrl = "https://view.inews.qq.com/a/NEW2018021000440002";
        String[] segments = articleUrl.split("/");
        System.out.println(segments[4]);
    }

}
src/test/java/com/zhiwei/crawler/WangyiCommentCountExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
//
package com.zhiwei.crawler;
//
import
java.util.ArrayList
;
//
import java.util.ArrayList;
import
java.util.List
;
//
import java.util.List;
import
java.util.Map
;
//
import java.util.Map;
//
import
org.junit.Test
;
//
import org.junit.Test;
//
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import
com.zhiwei.parse.Wangyi
;
//
import com.zhiwei.parse.Wangyi;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
//
import com.zhiwei.tools.tools.ZhiWeiTools;
//
public
class
WangyiCommentCountExample
{
//
public class WangyiCommentCountExample {
//
@Test
//
@Test
public
void
wangyiCommentCountTest
()
{
//
public void wangyiCommentCountTest() {
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
String
path
=
"D:\\crawlerdata\\自媒体\\网易评论采集-1.xlsx"
;
//
String path = "D:\\crawlerdata\\自媒体\\网易评论采集-1.xlsx";
Map
<
String
,
Object
>
map
=
poi
.
importExcel
(
path
,
0
);
//
Map<String,Object> map = poi.importExcel(path, 0);
//
List
<
Map
<
String
,
Object
>>
list
=
(
List
<
Map
<
String
,
Object
>>)
map
.
get
(
"body"
);
//
List<Map<String,Object>> list = (List<Map<String, Object>>) map.get("body");
List
<
String
>
urlList
=
new
ArrayList
<
String
>();
//
List<String> urlList = new ArrayList<String>();
for
(
Map
<
String
,
Object
>
u
:
list
)
{
//
for(Map<String,Object> u : list) {
String
url
=
u
.
get
(
"链接"
)+
""
;
//
String url = u.get("链接")+"";
urlList
.
add
(
url
);
//
urlList.add(url);
}
//
}
//
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<>();
//
List<Map<String,Object>> bodyList = new ArrayList<>();
for
(
String
url
:
urlList
)
{
//
for(String url : urlList) {
url
=
"https://3g.163.com/all/article/E9GAO0PK051188EC.html"
;
//
url = "https://3g.163.com/all/article/E9GAO0PK051188EC.html";
String
id
=
url
.
split
(
"/"
)[
url
.
split
(
"/"
).
length
-
1
].
split
(
".ht"
)[
0
];
//
String id = url.split("/")[url.split("/").length-1].split(".ht")[0];
System
.
out
.
println
(
id
);
//
System.out.println(id);
int
lists
=
Wangyi
.
getWangyiCommentCount
(
id
,
null
);
//
int lists = Wangyi.getWangyiCommentCount(id, null);
System
.
out
.
println
(
lists
);
//
System.out.println(lists);
ZhiWeiTools
.
sleep
(
3000
);
//
ZhiWeiTools.sleep(3000);
}
//
}
List
<
String
>
headList
=
new
ArrayList
<
String
>();
//
List<String> headList = new ArrayList<String>();
headList
.
add
(
"content"
);
//
headList.add("content");
headList
.
add
(
"id"
);
//
headList.add("id");
headList
.
add
(
"time"
);
//
headList.add("time");
headList
.
add
(
"name"
);
//
headList.add("name");
headList
.
add
(
"like"
);
//
headList.add("like");
headList
.
add
(
"unlike"
);
//
headList.add("unlike");
headList
.
add
(
"from_url"
);
//
headList.add("from_url");
//
poi
.
exportExcel
(
path
,
"评论数据"
,
headList
,
bodyList
);
//
poi.exportExcel(path, "评论数据", headList, bodyList);
//
}
//
}
//
//
}
//
}
src/test/java/com/zhiwei/crawler/WangyiCommentExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
//
package com.zhiwei.crawler;
//
import
java.util.ArrayList
;
//
import java.util.ArrayList;
import
java.util.List
;
//
import java.util.List;
import
java.util.Map
;
//
import java.util.Map;
//
import
org.junit.Test
;
//
import org.junit.Test;
//
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import
com.zhiwei.parse.Wangyi
;
//
import com.zhiwei.parse.Wangyi;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
//
import com.zhiwei.tools.tools.ZhiWeiTools;
//
public
class
WangyiCommentExample
{
//
public class WangyiCommentExample {
//
//若出错 可能数据有重复 以id为准
//
//若出错 可能数据有重复 以id为准
@Test
//
@Test
public
void
wangyiCommentTest
()
{
//
public void wangyiCommentTest() {
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
String
path
=
"D:\\crawlerdata\\自媒体\\网易评论采集-1.xlsx"
;
//
String path = "D:\\crawlerdata\\自媒体\\网易评论采集-1.xlsx";
Map
<
String
,
Object
>
map
=
poi
.
importExcel
(
path
,
0
);
//
Map<String,Object> map = poi.importExcel(path, 0);
//
List
<
Map
<
String
,
Object
>>
list
=
(
List
<
Map
<
String
,
Object
>>)
map
.
get
(
"body"
);
//
List<Map<String,Object>> list = (List<Map<String, Object>>) map.get("body");
List
<
String
>
urlList
=
new
ArrayList
<
String
>();
//
List<String> urlList = new ArrayList<String>();
for
(
Map
<
String
,
Object
>
u
:
list
)
{
//
for(Map<String,Object> u : list) {
String
url
=
u
.
get
(
"链接"
)+
""
;
//
String url = u.get("链接")+"";
urlList
.
add
(
url
);
//
urlList.add(url);
}
//
}
//
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<>();
//
List<Map<String,Object>> bodyList = new ArrayList<>();
for
(
String
url
:
urlList
)
{
//
for(String url : urlList) {
String
id
=
url
.
split
(
"/"
)[
url
.
split
(
"/"
).
length
-
1
].
split
(
".ht"
)[
0
];
//
String id = url.split("/")[url.split("/").length-1].split(".ht")[0];
System
.
out
.
println
(
id
);
//
System.out.println(id);
List
<
Map
<
String
,
Object
>>
lists
=
Wangyi
.
getWangyiCommentData
(
id
,
null
);
//
List<Map<String,Object>> lists = Wangyi.getWangyiCommentData(id,null);
System
.
out
.
println
(
url
+
"====="
+
lists
.
size
());
//
System.out.println(url+"====="+lists.size());
if
(
lists
!=
null
)
{
//
if(lists != null) {
for
(
Map
<
String
,
Object
>
m
:
lists
)
{
//
for(Map<String,Object> m : lists) {
m
.
put
(
"from_url"
,
url
);
//
m.put("from_url", url);
bodyList
.
add
(
m
);
//
bodyList.add(m);
}
//
}
}
//
}
ZhiWeiTools
.
sleep
(
3000
);
//
ZhiWeiTools.sleep(3000);
}
//
}
List
<
String
>
headList
=
new
ArrayList
<
String
>();
//
List<String> headList = new ArrayList<String>();
headList
.
add
(
"content"
);
//
headList.add("content");
headList
.
add
(
"id"
);
//
headList.add("id");
headList
.
add
(
"time"
);
//
headList.add("time");
headList
.
add
(
"name"
);
//
headList.add("name");
headList
.
add
(
"like"
);
//
headList.add("like");
headList
.
add
(
"unlike"
);
//
headList.add("unlike");
headList
.
add
(
"from_url"
);
//
headList.add("from_url");
//
poi
.
exportExcel
(
path
,
"评论数据"
,
headList
,
bodyList
);
//
poi.exportExcel(path, "评论数据", headList, bodyList);
//
}
//
}
//
//
//
//
//
}
//
}
src/test/java/com/zhiwei/crawler/WangyiHistoryExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
//
package com.zhiwei.crawler;
//
import
java.util.ArrayList
;
//
import java.util.ArrayList;
import
java.util.List
;
//
import java.util.List;
import
java.util.Map
;
//
import java.util.Map;
//
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import
com.zhiwei.parse.Wangyi
;
//
import com.zhiwei.parse.Wangyi;
//
public
class
WangyiHistoryExample
{
//
public class WangyiHistoryExample {
//
public
static
void
main
(
String
[]
args
)
{
//
public static void main(String[] args) {
//
String
url
=
"http://dy.163.com/v2/article/detail/DPLAOP1605198CJN
.html"
;
// String url = "http://dy.163.com/v2/article/detail/EBR9PF6J0512MLBG
.html";
//
List
<
Map
<
String
,
Object
>>
list
=
Wangyi
.
getHistoryData
(
url
,
null
,
"2018-05-01 00:00:00"
);
//
List<Map<String,Object>> list = Wangyi.getHistoryData(url, null, "2018-05-01 00:00:00");
//
List
<
String
>
headList
=
new
ArrayList
<
String
>();
//
List<String> headList = new ArrayList<String>();
headList
.
add
(
"title"
);
//
headList.add("title");
headList
.
add
(
"time"
);
//
headList.add("time");
headList
.
add
(
"content"
);
//
headList.add("content");
headList
.
add
(
"source"
);
//
headList.add("source");
headList
.
add
(
"url"
);
//
headList.add("url");
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
poi
.
exportExcel
(
"D://crawlerdata//自媒体/网易-财联社.xlsx"
,
"财联社"
,
headList
,
list
);
//
poi.exportExcel("D://crawlerdata//自媒体/网易-财联社.xlsx", "财联社", headList, list);
//
}
//
}
//
//
}
//
}
src/test/java/com/zhiwei/crawler/XiaomiShequByWordExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
//
package com.zhiwei.crawler;
//
import
java.util.ArrayList
;
//
import java.util.ArrayList;
import
java.util.List
;
//
import java.util.List;
import
java.util.Map
;
//
import java.util.Map;
//
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import
com.zhiwei.parse.Xiaomi
;
//
import com.zhiwei.parse.Xiaomi;
//
public
class
XiaomiShequByWordExample
{
//
public class XiaomiShequByWordExample {
//
public
static
void
main
(
String
[]
args
)
{
//
public static void main(String[] args) {
String
word
=
"小米 电饭煲 锅体开裂,小米 电饭煲 开裂,小米 电饭煲 裂缝,小米 电饭煲 烫伤,小米 电饭煲 变形"
;
//
String word = "小米 电饭煲 锅体开裂,小米 电饭煲 开裂,小米 电饭煲 裂缝,小米 电饭煲 烫伤,小米 电饭煲 变形";
//
//
//
String
[]
words
=
word
.
split
(
","
);
//
String[] words = word.split(",");
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
//
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
for
(
String
w
:
words
)
{
//
for(String w : words) {
List
<
Map
<
String
,
Object
>>
dataList
=
Xiaomi
.
getXiaomiByWordData
(
w
,
null
);
//
List<Map<String,Object>> dataList = Xiaomi.getXiaomiByWordData(w,null);
if
(
dataList
!=
null
&&
dataList
.
size
()
>
0
)
{
//
if(dataList != null && dataList.size() > 0) {
bodyList
.
addAll
(
dataList
);
//
bodyList.addAll(dataList);
}
//
}
}
//
}
List
<
String
>
headList
=
new
ArrayList
<
String
>();
//
List<String> headList = new ArrayList<String>();
headList
.
add
(
"title"
);
//
headList.add("title");
headList
.
add
(
"time"
);
//
headList.add("time");
headList
.
add
(
"source"
);
//
headList.add("source");
headList
.
add
(
"url"
);
//
headList.add("url");
headList
.
add
(
"content"
);
//
headList.add("content");
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
poi
.
exportExcel
(
"D:\\crawlerdata\\小米-关键词电饭煲相关.xlsx"
,
"小米社区采集"
,
headList
,
bodyList
);
//
poi.exportExcel("D:\\crawlerdata\\小米-关键词电饭煲相关.xlsx", "小米社区采集", headList, bodyList);
//
}
//
}
//
}
//
}
src/test/java/com/zhiwei/crawler/XiguaAccountExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
//
package com.zhiwei.crawler;
//
import
java.util.ArrayList
;
//
import java.util.ArrayList;
import
java.util.List
;
//
import java.util.List;
import
java.util.Map
;
//
import java.util.Map;
//
import
org.junit.Test
;
//
import org.junit.Test;
//
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import
com.zhiwei.parse.XiGua
;
//
import com.zhiwei.parse.XiGua;
//
public
class
XiguaAccountExample
{
//
public class XiguaAccountExample {
//
@Test
//
@Test
public
void
xiguaAccountTest
()
{
//
public void xiguaAccountTest() {
String
path
=
"D:\\crawlerdata\\西瓜视频采集12.28.xlsx"
;
//
String path = "D:\\crawlerdata\\西瓜视频采集12.28.xlsx";
String
startTime
=
"2017-01-01 00:00:00"
;
//
String startTime = "2017-01-01 00:00:00";
//2017-01-01 00:00:00
//
//2017-01-01 00:00:00
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
//
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
Map
<
String
,
Object
>
map
=
poi
.
importExcel
(
path
,
0
);
//
Map<String,Object> map = poi.importExcel(path, 0);
List
<
Map
<
String
,
Object
>>
lists
=
(
List
<
Map
<
String
,
Object
>>)
map
.
get
(
"body"
);
//
List<Map<String,Object>> lists = (List<Map<String, Object>>) map.get("body");
for
(
Map
<
String
,
Object
>
map1
:
lists
)
{
//
for(Map<String,Object> map1 : lists ) {
String
url
=
map1
.
get
(
"主页"
)+
""
;
//
String url = map1.get("主页")+"";
if
(
url
!=
null
&&
url
.
length
()
>
5
)
{
//
if(url != null && url.length() > 5) {
List
<
Map
<
String
,
Object
>>
lists1
=
XiGua
.
getXiguaAccountData
(
url
,
startTime
,
null
);
//
List<Map<String,Object>> lists1 = XiGua.getXiguaAccountData(url,startTime,null);
if
(
lists1
!=
null
&&
lists
.
size
()
>
0
)
{
//
if(lists1 != null && lists.size() > 0) {
bodyList
.
addAll
(
lists1
);
//
bodyList.addAll(lists1);
}
//
}
}
//
}
}
//
}
List
<
String
>
headList
=
new
ArrayList
<
String
>();
//
List<String> headList = new ArrayList<String>();
headList
.
add
(
"title"
);
//
headList.add("title");
headList
.
add
(
"comments_count"
);
//
headList.add("comments_count");
headList
.
add
(
"time"
);
//
headList.add("time");
headList
.
add
(
"content"
);
//
headList.add("content");
headList
.
add
(
"url"
);
//
headList.add("url");
headList
.
add
(
"video_watch_count"
);
//
headList.add("video_watch_count");
headList
.
add
(
"source"
);
//
headList.add("source");
poi
.
exportExcel
(
path
,
"数据采集结果"
,
headList
,
bodyList
);
//
poi.exportExcel(path, "数据采集结果", headList, bodyList);
}
//
}
//
//
}
//
}
src/test/java/com/zhiwei/crawler/XiguaByWordExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
//
package com.zhiwei.crawler;
//
import
java.util.ArrayList
;
//
import java.util.ArrayList;
import
java.util.List
;
//
import java.util.List;
import
java.util.Map
;
//
import java.util.Map;
//
import
org.junit.Test
;
//
import org.junit.Test;
//
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import
com.zhiwei.parse.XiGua
;
//
import com.zhiwei.parse.XiGua;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
//
import com.zhiwei.tools.tools.ZhiWeiTools;
//
public
class
XiguaByWordExample
{
//
public class XiguaByWordExample {
//
//
@Test
//
@Test
public
void
XiguaByWordTest
()
{
//
public void XiguaByWordTest() {
String
word
=
"美食,味道,吃,试吃,美味,好吃"
;
//
String word = "美食,味道,吃,试吃,美味,好吃";
String
[]
words
=
word
.
split
(
","
);
//
String[] words = word.split(",");
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
//
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
for
(
String
w
:
words
)
{
//
for(String w : words) {
List
<
Map
<
String
,
Object
>>
list
=
XiGua
.
getXiguaVideoByWordData
(
w
,
null
);
//
List<Map<String,Object>> list = XiGua.getXiguaVideoByWordData(w,null);
if
(
list
!=
null
&&
list
.
size
()
>
0
)
{
//
if(list != null && list.size() > 0) {
bodyList
.
addAll
(
list
);
//
bodyList.addAll(list);
}
//
}
ZhiWeiTools
.
sleep
(
5000
);
//
ZhiWeiTools.sleep(5000);
System
.
out
.
println
(
"============总数"
+
bodyList
.
size
());
//
System.out.println("============总数" + bodyList.size());
}
//
}
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
List
<
String
>
headList
=
new
ArrayList
<
String
>();
//
List<String> headList = new ArrayList<String>();
headList
.
add
(
"title"
);
//
headList.add("title");
headList
.
add
(
"time"
);
//
headList.add("time");
headList
.
add
(
"content"
);
//
headList.add("content");
headList
.
add
(
"like"
);
//
headList.add("like");
headList
.
add
(
"unlike"
);
//
headList.add("unlike");
headList
.
add
(
"play_count"
);
//
headList.add("play_count");
headList
.
add
(
"source"
);
//
headList.add("source");
headList
.
add
(
"comment_count"
);
//
headList.add("comment_count");
headList
.
add
(
"url"
);
//
headList.add("url");
//
poi
.
exportExcel
(
"D://crawlerdata/西瓜美食-1.xlsx"
,
"西瓜好吃不"
,
headList
,
bodyList
);
//
poi.exportExcel("D://crawlerdata/西瓜美食-1.xlsx", "西瓜好吃不", headList, bodyList);
//
}
//
}
//
//
//
}
//
}
src/test/java/com/zhiwei/crawler/YidainzixunByWordExample.java
View file @
9234d24c
...
@@ -35,7 +35,7 @@ public class YidainzixunByWordExample {
...
@@ -35,7 +35,7 @@ public class YidainzixunByWordExample {
headList
.
add
(
"time"
);
headList
.
add
(
"time"
);
headList
.
add
(
"url"
);
headList
.
add
(
"url"
);
System
.
out
.
println
(
listAll
.
size
());
System
.
out
.
println
(
listAll
.
size
());
poi
.
exportExcel
(
"D://crawlerdata/一点资讯-
美食
.xlsx"
,
"asd"
,
headList
,
listAll
);
poi
.
exportExcel
(
"D://crawlerdata/一点资讯-
软博会
.xlsx"
,
"asd"
,
headList
,
listAll
);
}
}
...
...
src/test/java/com/zhiwei/crawler/YidianzixunCommentExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
//
package com.zhiwei.crawler;
//
import
java.util.List
;
//
import java.util.List;
import
java.util.Map
;
//
import java.util.Map;
//
import
org.junit.Test
;
//
import org.junit.Test;
//
import
com.zhiwei.parse.Yidianzixun
;
//
import com.zhiwei.parse.Yidianzixun;
//
public
class
YidianzixunCommentExample
{
//
public class YidianzixunCommentExample {
//
@Test
//
@Test
public
void
yidianzixunCommentTest
()
{
//
public void yidianzixunCommentTest() {
String
url
=
"http://www.yidianzixun.com/article/0ILHigvv"
;
//
String url = "http://www.yidianzixun.com/article/0ILHigvv";
List
<
Map
<
String
,
Object
>>
lists
=
Yidianzixun
.
getYidianzixunCommentData
(
url
,
null
);
//
List<Map<String,Object>> lists = Yidianzixun.getYidianzixunCommentData(url,null);
System
.
out
.
println
(
lists
.
size
());
//
System.out.println(lists.size());
for
(
Map
<
String
,
Object
>
map
:
lists
)
{
//
for(Map<String,Object> map : lists) {
System
.
out
.
println
(
map
.
toString
());
//
System.out.println(map.toString());
}
//
}
}
//
}
//
//
}
//
}
src/test/java/com/zhiwei/hsitory/BaijiaAccountExample.java
0 → 100644
View file @
9234d24c
//package com.zhiwei.hsitory;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Baijia;
//import com.zhiwei.tools.tools.ZhiWeiTools;
//
//public class BaijiaAccountExample {
//
// @Test
// public void test3() {
// String path = "D://crawlerdata//自媒体/百家号采集.xlsx";
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// String startTime = "2018-05-01 00:00:00";
// Map<String,Object> map = poi.importExcel(path, 0);
// List<Map<String,Object>> list = (List<Map<String, Object>>) map.get("body");
// List<Map<String,Object>> bodyList = new ArrayList<>();
// for(Map<String,Object> m : list) {
// try {
// String app_id = m.get("id").toString();
// app_id = "1594158489045754";
// String cookie = "__cfduid=d847baca85b97d1967b3da02ebb345b831535524251; BAIDUID=C0F0F81EF770C5219AB9C178654135EC:FG=1; PSTM=1536376257; BIDUPSID=250CCE0442BEBCB3568D8EC515953434; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; locale=zh; delPer=0; H_PS_PSSID=1447_21117_20930; PSINO=5";
// List<Map<String,Object>> lists = Baijia.getBaijiaAccountByBaiduData(app_id, startTime,cookie, null);
// if(lists != null) {
// bodyList.addAll(lists);
// }
// break;
// } catch (Exception e) {
// }
// }
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("time");
// headList.add("source");
// headList.add("url");
// headList.add("content");
// headList.add("read_amount");
// poi.exportExcel("D://crawlerdata//历史文章采集/百家号-lxj-2.xlsx", "娱乐资本论", headList, bodyList);
// }
//
//}
src/test/java/com/zhiwei/hsitory/FenghuangAccountExample.java
0 → 100644
View file @
9234d24c
//package com.zhiwei.hsitory;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.crawler.proxy.ProxyHolder;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Fenghuang;
//
//public class FenghuangAccountExample {
//
// @Test
// public void fenghuangAccountTest() {
// //所用时间长 1s1篇文章吧
// //https://api.3g.ifeng.com/client_search_subscribe?k=号外财经
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
// String id = "1165210";
// String[] ids = id.split(",");
// String startTime = "2010-05-01 00:00:00"; //可为空
// for(int i = 0;i < ids.length;i++) {
// try {
// List<Map<String,Object>> dataList = Fenghuang.getFenghuangAccountData(ids[i], startTime,ProxyHolder.NAT_HEAVY_PROXY);
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("time");
// headList.add("content");
// headList.add("source");
// headList.add("url");
// headList.add("id");
// poi.exportExcel("D://crawlerdata//历史文章采集/凤凰-三言财经.xlsx", ids[i], headList, dataList);
// } catch (Exception e) {
// continue;
// }
// }
// }
//
//}
src/test/java/com/zhiwei/hsitory/SouhuAccountExample.java
0 → 100644
View file @
9234d24c
package
com
.
zhiwei
.
hsitory
;
//package com.zhiwei.crawler;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Souhu;
//
//
//public class SouhuAccountExample {
//
// //http://search.sohu.com/?keyword=%E8%99%8E%E5%97%85&source=article&queryType=edit&ie=utf8
//
// @Test
// public void souhuAccountTest() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",GroupType.PROVIDER);
// List<Map<String,Object>> lists = Souhu.getSouHuAccountData("99938933","浅黑科技","2018-05-01 00:00:00",false,null);
// System.out.println(lists.size());
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("time");
// headList.add("content");
// headList.add("url");
// headList.add("comment");
// headList.add("tags");
// headList.add("newsid");
// headList.add("source");
// headList.add("newsPv");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("D:\\crawlerdata\\搜狐号历史文章-乔.xlsx", "乔", headList, lists);
// }
//
//}
src/test/java/com/zhiwei/hsitory/TxNewsHostoryExample.java
0 → 100644
View file @
9234d24c
//package com.zhiwei.hsitory;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.crawler.proxy.ProxyHolder;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.TXNews;
//
//public class TxNewsHostoryExample {
//
// public static void main(String[] args) {
//
//
// String url = "6839743";
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
// List<Map<String,Object>> list = TXNews.getTxNewsHistory(url, null,ProxyHolder.NAT_PROXY);
//
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("time");
// headList.add("content");
// headList.add("source");
// headList.add("url");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("D://crawlerdata//历史文章采集/腾讯网-三言财经-right.xlsx", "财联社", headList, list);
//
//
// }
//
//}
src/test/java/com/zhiwei/hsitory/WangyiHistoryExample.java
0 → 100644
View file @
9234d24c
//package com.zhiwei.hsitory;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.crawler.proxy.ProxyHolder;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Wangyi;
//
//public class WangyiHistoryExample {
//
// public static void main(String[] args) {
//
// String url = "T1520579168852";
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
// List<Map<String,Object>> list = Wangyi.getWangyiClientHistory(url, ProxyHolder.NAT_PROXY, "2019-01-01 00:00:00");
//
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("time");
// headList.add("content");
// headList.add("source");
// headList.add("url");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("D://crawlerdata//历史文章采集/网易-三言财经.xlsx", "财联社", headList, list);
//
// }
//
//
//}
src/test/java/com/zhiwei/hsitory/XueqiuHostoryExample.java
0 → 100644
View file @
9234d24c
//package com.zhiwei.hsitory;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Xueqiu;
//
//public class XueqiuHostoryExample {
//
// public static void main(String[] args) {
//
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",GroupType.PROVIDER);
// String cookie = "_ga=GA1.2.2045733994.1547169202; device_id=5a986a59915983c3e2ef8074f80112ec; s=e618lxk3qw; __utmz=1.1547185990.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utma=1.2045733994.1547169202.1548122251.1553047746.3; aliyungf_tc=AQAAAJHA7Vrq7AYAgtgMPALb3ZCQP9o+; _gid=GA1.2.334283760.1554779038; Hm_lvt_1db88642e346389874251b5a1eded6e3=1553046552,1553046993,1553150890,1554779038; _gat=1; remember=1; remember.sig=K4F3faYzmVuqC0iXIERCQf55g2Y; xq_a_token=fed387c342aedea5c7883d1062ae6faf167975d8; xq_a_token.sig=j47ktDdYWr1FOgeL74U6yMCPhOY; xqat=fed387c342aedea5c7883d1062ae6faf167975d8; xqat.sig=oZPD4-6V_GPw-KsnR04L7vxf5oM; xq_r_token=6ffffd472dc300e2f89195a77b8e7064da45d78d; xq_r_token.sig=TPd7Y11kYPcQeOgzXVDApbRQauQ; xq_is_login=1; xq_is_login.sig=J3LxgPVPUzbBg3Kee_PquUfih7Q; u=5878436335; u.sig=j_g6RZ9GzzrgOfIsGHi9O9M1wvc; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1554791719";
// String userId = "7441422641";
//
// List<Map<String,Object>> dataList = Xueqiu.getXueqiuAccountData(userId, cookie, null);
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("time");
// headList.add("source");
// headList.add("content");
// headList.add("repostCount");
// headList.add("commentCount");
// headList.add("likeCount");
// headList.add("url");
// poi.exportExcel("D://crawlerdata//历史文章采集/雪球-三言财经.xlsx", "三言财经", headList, dataList);
//
// }
//
//}
src/test/java/com/zhiwei/
crawler
/YidianzixunAccountExample.java
→
src/test/java/com/zhiwei/
hsitory
/YidianzixunAccountExample.java
View file @
9234d24c
package
com
.
zhiwei
.
crawler
;
package
com
.
zhiwei
.
hsitory
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.List
;
...
@@ -6,6 +6,9 @@ import java.util.Map;
...
@@ -6,6 +6,9 @@ import java.util.Map;
import
org.junit.Test
;
import
org.junit.Test
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Yidianzixun
;
import
com.zhiwei.parse.Yidianzixun
;
...
@@ -14,10 +17,10 @@ public class YidianzixunAccountExample {
...
@@ -14,10 +17,10 @@ public class YidianzixunAccountExample {
@Test
@Test
public
void
yidianzixunAccountTest
()
{
public
void
yidianzixunAccountTest
()
{
String
channelid
=
"m23315"
;
ProxyFactory
.
init
(
"zookeeper://192.168.0.36:2181"
,
"local"
,
GroupType
.
PROVIDER
);
String
channelid
=
"m190159"
;
String
startTime
=
"2007-01-01 00:00:00"
;
String
startTime
=
"2007-01-01 00:00:00"
;
String
cookie
=
"wuid=90742539356820; wuid_createAt=2019-01-10 11:45:41; UM_distinctid=16835dd9ba11cb-0ef8d17063d93f-671b197c-1fa400-16835dd9ba2243; JSESSIONID=174b8df350cb5400283abedf2c26076357b0b7af0581024f2e39e90532b4edc9; weather_auth=2; DID=node82eee6d174caf2d4; Hm_lvt_15fafbae2b9b11d280c79eff3b840e45=1551686450,1551686458; CNZZDATA1255169715=931563543-1547087800-%7C1551761063; captcha=s%3A6e56492ffceaf88d9f131fa79435464a.TLAhZ1cfwj0vBTjKTO9Qf5qc6QLuipitrEMZjiqm8BM; Hm_lpvt_15fafbae2b9b11d280c79eff3b840e45=1551764582; cn_1255169715_dplus=%7B%22distinct_id%22%3A%20%2216835dd9ba11cb-0ef8d17063d93f-671b197c-1fa400-16835dd9ba2243%22%2C%22sp%22%3A%20%7B%22%24_sessionid%22%3A%200%2C%22%24_sessionTime%22%3A%201547544080%2C%22%24dp%22%3A%200%2C%22%24_sessionPVTime%22%3A%201547544080%7D%2C%22%24_sessionid%22%3A%200%2C%22%24_sessionTime%22%3A%201551765057%2C%22%24dp%22%3A%200%2C%22%24_sessionPVTime%22%3A%201551765057%7D"
;
List
<
Map
<
String
,
Object
>>
dataList
=
Yidianzixun
.
getYidianzixunAccountData
(
channelid
,
startTime
,
ProxyHolder
.
NAT_HEAVY_PROXY
,
null
);
List
<
Map
<
String
,
Object
>>
dataList
=
Yidianzixun
.
getYidianzixunAccountData
(
channelid
,
startTime
,
null
,
cookie
);
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
List
<
String
>
headList
=
new
ArrayList
<
String
>();
List
<
String
>
headList
=
new
ArrayList
<
String
>();
headList
.
add
(
"title"
);
headList
.
add
(
"title"
);
...
@@ -27,7 +30,7 @@ public class YidianzixunAccountExample {
...
@@ -27,7 +30,7 @@ public class YidianzixunAccountExample {
headList
.
add
(
"source"
);
headList
.
add
(
"source"
);
headList
.
add
(
"url"
);
headList
.
add
(
"url"
);
headList
.
add
(
"summary"
);
headList
.
add
(
"summary"
);
poi
.
exportExcel
(
"D://crawlerdata/
一点资讯-m23315.xlsx"
,
"虎嗅
"
,
headList
,
dataList
);
poi
.
exportExcel
(
"D://crawlerdata/
/历史文章采集/一点资讯-新华社中国新三板.xlsx"
,
"新华社中国新三板
"
,
headList
,
dataList
);
}
}
...
...
src/test/java/com/zhiwei/keyword/XueqiuKeyWord.java
View file @
9234d24c
//package com.zhiwei.keyword;
package
com
.
zhiwei
.
keyword
;
//
//import java.util.ArrayList;
import
java.util.ArrayList
;
//import java.util.List;
import
java.util.List
;
//import java.util.Map;
import
java.util.Map
;
//
//import org.testng.annotations.Test;
import
org.junit.Test
;
//
//import com.zhiwei.common.config.GroupType;
import
com.zhiwei.common.config.GroupType
;
//import com.zhiwei.crawler.proxy.ProxyFactory;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
//import com.zhiwei.parse.Xueqiu;
import
com.zhiwei.parse.Xueqiu
;
//
//public class XueqiuKeyWord {
public
class
XueqiuKeyWord
{
// @Test
// public void f() {
@Test
//// ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
public
void
f
()
{
// String word = "腾讯 股价|腾讯 市值|腾讯 估值|腾讯 投资|腾讯 财报";
ProxyFactory
.
init
(
"zookeeper://192.168.0.36:2181"
,
"local"
,
GroupType
.
PROVIDER
);
// String endTime = "2018-01-01 00:00:00";
String
word
=
"软博会|软件博览会"
;
// String cookie = "_ga=GA1.2.590296572.1538275545; device_id=812aac42a8874c32ca97893a7b270684; s=ff12lgxbt1; __utma=1.590296572.1538275545.1538280279.1538280279.1; __utmz=1.1538280279.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); aliyungf_tc=AQAAAPtocWy01ggA6tbncwYt4OIDsaiG; xq_a_token=088c6ad5e275496d7c91b8b5b2ecb929bee15772; xq_a_token.sig=NcpAm7FRsAVoMGRMOLrLRveBT7U; xq_r_token=08131eb9a4f33b43b2340fd782f3776c9823d74a; xq_r_token.sig=AZ7au6ICJtTgQJVPybkOJs_Fr54; Hm_lvt_1db88642e346389874251b5a1eded6e3=1539599526,1539913171,1539913178,1541157360; _gid=GA1.2.1817290343.1541157360; u=721541157360383; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1541212289";
String
endTime
=
"2018-01-01 00:00:00"
;
//
String
cookie
=
"aliyungf_tc=AQAAADi3G3I7GAEAgtgMPF/W3FeFauXk; device_id=3221abfd50f9f8be2abca9182c338c9e; Hm_lvt_1db88642e346389874251b5a1eded6e3=1561962809; s=d911zeymzo; remember=1; remember.sig=K4F3faYzmVuqC0iXIERCQf55g2Y; xq_a_token=b2a8d68c56272cca3b71e1bcaf04b744b16e9497; xq_a_token.sig=hj7OrhYid1GuAaF-AmLYrPuyFDk; xqat=b2a8d68c56272cca3b71e1bcaf04b744b16e9497; xqat.sig=gker8dcZJ2Ez4s58eIGPa5zCd0w; xq_r_token=b0ad7b059411acac8c9e2f0a40336804bb60d047; xq_r_token.sig=DwNSLIigE8xtwQKLygagcM28Dd0; xq_is_login=1; xq_is_login.sig=J3LxgPVPUzbBg3Kee_PquUfih7Q; u=1273068356; u.sig=UyqQW-br8hcsOyT6anFmS_SFpCs; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1561963129"
;
//
//
// String[] words = word.split("\\|");
//
String
[]
words
=
word
.
split
(
"\\|"
);
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
// for(String w : words) {
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
// System.out.println(w);
for
(
String
w
:
words
)
{
//
System
.
out
.
println
(
w
);
// List<Map<String, Object>> dataList = Xueqiu.getData(w, endTime, null, cookie);
// System.out.println(w + " ---- " + dataList.size());
List
<
Map
<
String
,
Object
>>
dataList
=
Xueqiu
.
getData
(
w
,
endTime
,
null
,
cookie
);
// bodyList.addAll(dataList);
System
.
out
.
println
(
w
+
" ---- "
+
dataList
.
size
());
// }
bodyList
.
addAll
(
dataList
);
// List<String> headList = new ArrayList<String>();
}
// headList.add("title");
List
<
String
>
headList
=
new
ArrayList
<
String
>();
// headList.add("time");
headList
.
add
(
"title"
);
// headList.add("content");
headList
.
add
(
"time"
);
// headList.add("uper");
headList
.
add
(
"content"
);
// headList.add("url");
headList
.
add
(
"uper"
);
// headList.add("likeCount");
headList
.
add
(
"url"
);
// headList.add("replyCount");
headList
.
add
(
"likeCount"
);
// poi.exportExcel("D:\\crawlerdata\\自媒体\\雪球网页采集-腾讯-relevance.xlsx", "马化腾", headList, bodyList);
headList
.
add
(
"replyCount"
);
//
poi
.
exportExcel
(
"D:\\crawlerdata\\自媒体\\雪球网页采集-软博会-relevance.xlsx"
,
"马化腾"
,
headList
,
bodyList
);
// }
//}
}
}
src/test/java/com/zhiwei/shipin/AiqiyiTest.java
View file @
9234d24c
...
@@ -4,7 +4,7 @@ import java.util.ArrayList;
...
@@ -4,7 +4,7 @@ import java.util.ArrayList;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
import
org.
testng.annotations
.Test
;
import
org.
junit
.Test
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
...
@@ -21,7 +21,7 @@ public class AiqiyiTest {
...
@@ -21,7 +21,7 @@ public class AiqiyiTest {
List
<
String
>
wordList
=
WordReadFile
.
getWords
(
"D://crawlerdata//关键词.txt"
);
List
<
String
>
wordList
=
WordReadFile
.
getWords
(
"D://crawlerdata//关键词.txt"
);
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
for
(
String
w
:
wordList
)
{
for
(
String
w
:
wordList
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
Aiqiyi
.
getAiqiyiByWordData
(
w
,
ProxyHolder
.
NAT_PROXY
);
List
<
Map
<
String
,
Object
>>
dataList
=
Aiqiyi
.
getAiqiyiByWordData
(
w
,
ProxyHolder
.
NAT_
HEAVY_
PROXY
);
if
(
dataList
!=
null
&&
dataList
.
size
()
>=
1
)
{
if
(
dataList
!=
null
&&
dataList
.
size
()
>=
1
)
{
bodyList
.
addAll
(
dataList
);
bodyList
.
addAll
(
dataList
);
}
}
...
@@ -34,7 +34,7 @@ public class AiqiyiTest {
...
@@ -34,7 +34,7 @@ public class AiqiyiTest {
headList
.
add
(
"title"
);
headList
.
add
(
"title"
);
headList
.
add
(
"word"
);
headList
.
add
(
"word"
);
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
poi
.
exportExcel
(
"D://crawlerdata/
爱奇艺关键词采集-txh-0320
.xlsx"
,
"数据"
,
headList
,
bodyList
);
poi
.
exportExcel
(
"D://crawlerdata/
/视频/爱奇艺关键词采集-毓婷-0716
.xlsx"
,
"数据"
,
headList
,
bodyList
);
...
...
src/test/java/com/zhiwei/shipin/BilibiliTest.java
View file @
9234d24c
...
@@ -4,8 +4,10 @@ import java.util.ArrayList;
...
@@ -4,8 +4,10 @@ import java.util.ArrayList;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
import
org.
testng.annotations
.Test
;
import
org.
junit
.Test
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.BiliBili
;
import
com.zhiwei.parse.BiliBili
;
import
com.zhiwei.util.WordReadFile
;
import
com.zhiwei.util.WordReadFile
;
...
@@ -13,11 +15,12 @@ import com.zhiwei.util.WordReadFile;
...
@@ -13,11 +15,12 @@ import com.zhiwei.util.WordReadFile;
public
class
BilibiliTest
{
public
class
BilibiliTest
{
@Test
@Test
public
void
f
()
{
public
void
f
()
{
List
<
String
>
wordList
=
WordReadFile
.
getWords
(
"D://crawlerdata//关键词.txt"
);
ProxyFactory
.
init
(
"zookeeper://192.168.0.36:2181"
,
"local"
,
GroupType
.
PROVIDER
);
List
<
String
>
wordList
=
WordReadFile
.
getWords
(
"D://crawlerdata//关键词-1.txt"
);
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<>();
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<>();
String
cookie
=
"LIVE_BUVID=AUTO8715300758995538; sid=kp5rluge; fts=1530161621; im_notify_type_35324319=0; buvid3=08ABE6AE-5061-4CE5-B34F-1A8AAB64DB3320712infoc; rpdid=olppsmkxmpdoskwoxiwww; finger=edc6ecda; stardustvideo=1; UM_distinctid=164fe68fb31996-01f161c3523abe-6114167a-1fa400-164fe68fb32274"
;
String
cookie
=
"LIVE_BUVID=AUTO8715300758995538; sid=kp5rluge; fts=1530161621; im_notify_type_35324319=0; buvid3=08ABE6AE-5061-4CE5-B34F-1A8AAB64DB3320712infoc; rpdid=olppsmkxmpdoskwoxiwww; finger=edc6ecda; stardustvideo=1; UM_distinctid=164fe68fb31996-01f161c3523abe-6114167a-1fa400-164fe68fb32274"
;
for
(
String
word
:
wordList
)
{
for
(
String
word
:
wordList
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
BiliBili
.
getData
(
word
,
null
,
List
<
Map
<
String
,
Object
>>
dataList
=
BiliBili
.
getData
(
word
,
null
,
"2019-07-18 00:00:00"
,
cookie
);
cookie
);
if
(
dataList
!=
null
)
{
if
(
dataList
!=
null
)
{
System
.
out
.
println
(
word
+
" ----- "
+
dataList
.
size
());
System
.
out
.
println
(
word
+
" ----- "
+
dataList
.
size
());
...
@@ -33,7 +36,7 @@ public class BilibiliTest {
...
@@ -33,7 +36,7 @@ public class BilibiliTest {
headlist
.
add
(
"url"
);
headlist
.
add
(
"url"
);
headlist
.
add
(
"word"
);
headlist
.
add
(
"word"
);
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
poi
.
exportExcel
(
"D://crawlerdata//
bilibili关键词采集数据-txh-0320
.xlsx"
,
"B站数据"
,
headlist
,
bodyList
);
poi
.
exportExcel
(
"D://crawlerdata//
视频//bilibili关键词采集数据-吃鸡否-0722
.xlsx"
,
"B站数据"
,
headlist
,
bodyList
);
}
}
}
}
src/test/java/com/zhiwei/shipin/QQTVTest.java
View file @
9234d24c
...
@@ -4,7 +4,7 @@ import java.util.ArrayList;
...
@@ -4,7 +4,7 @@ import java.util.ArrayList;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
import
org.
testng.annotations
.Test
;
import
org.
junit
.Test
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
...
@@ -18,11 +18,11 @@ public class QQTVTest {
...
@@ -18,11 +18,11 @@ public class QQTVTest {
@Test
@Test
public
void
f
()
{
public
void
f
()
{
ProxyFactory
.
init
(
"zookeeper://192.168.0.36:2181"
,
"local"
,
GroupType
.
PROVIDER
);
ProxyFactory
.
init
(
"zookeeper://192.168.0.36:2181"
,
"local"
,
GroupType
.
PROVIDER
);
String
time
=
"201
8-01-0
1 00:00:00"
;
String
time
=
"201
9-04-1
1 00:00:00"
;
List
<
String
>
wordList
=
WordReadFile
.
getWords
(
"D://crawlerdata//关键词.txt"
);
List
<
String
>
wordList
=
WordReadFile
.
getWords
(
"D://crawlerdata//关键词.txt"
);
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<>();
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<>();
for
(
String
word
:
wordList
)
{
for
(
String
word
:
wordList
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
QQTV
.
getData
(
word
,
time
,
ProxyHolder
.
NAT_PROXY
);
List
<
Map
<
String
,
Object
>>
dataList
=
QQTV
.
getData
(
word
,
time
,
ProxyHolder
.
NAT_
HEAVY_
PROXY
);
if
(
dataList
!=
null
)
{
if
(
dataList
!=
null
)
{
System
.
out
.
println
(
word
+
" ----- "
+
dataList
.
size
());
System
.
out
.
println
(
word
+
" ----- "
+
dataList
.
size
());
bodyList
.
addAll
(
dataList
);
bodyList
.
addAll
(
dataList
);
...
@@ -37,7 +37,7 @@ public class QQTVTest {
...
@@ -37,7 +37,7 @@ public class QQTVTest {
headlist
.
add
(
"url"
);
headlist
.
add
(
"url"
);
headlist
.
add
(
"word"
);
headlist
.
add
(
"word"
);
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
poi
.
exportExcel
(
"D://crawlerdata//
腾讯视频关键词采集数据-txh-0320
.xlsx"
,
"腾讯视频数据"
,
headlist
,
bodyList
);
poi
.
exportExcel
(
"D://crawlerdata//
视频//腾讯视频关键词采集数据-毓婷-0716
.xlsx"
,
"腾讯视频数据"
,
headlist
,
bodyList
);
...
...
src/test/java/com/zhiwei/shipin/SohuTVTest.java
View file @
9234d24c
...
@@ -4,7 +4,7 @@ import java.util.ArrayList;
...
@@ -4,7 +4,7 @@ import java.util.ArrayList;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
import
org.
testng.annotations
.Test
;
import
org.
junit
.Test
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.shipin.SohuTV
;
import
com.zhiwei.parse.shipin.SohuTV
;
...
@@ -33,7 +33,7 @@ public class SohuTVTest {
...
@@ -33,7 +33,7 @@ public class SohuTVTest {
headlist
.
add
(
"url"
);
headlist
.
add
(
"url"
);
headlist
.
add
(
"word"
);
headlist
.
add
(
"word"
);
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
poi
.
exportExcel
(
"D://crawlerdata//
搜狐视频关键词采集数据-txh-0320
.xlsx"
,
"搜狐数据"
,
headlist
,
bodyList
);
poi
.
exportExcel
(
"D://crawlerdata//
视频//搜狐视频关键词采集数据-毓婷-0716
.xlsx"
,
"搜狐数据"
,
headlist
,
bodyList
);
}
}
}
}
src/test/java/com/zhiwei/shipin/YoukuKeyWordTest.java
View file @
9234d24c
...
@@ -4,7 +4,7 @@ import java.util.ArrayList;
...
@@ -4,7 +4,7 @@ import java.util.ArrayList;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
import
org.
testng.annotations
.Test
;
import
org.
junit
.Test
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
...
@@ -30,7 +30,7 @@ public class YoukuKeyWordTest {
...
@@ -30,7 +30,7 @@ public class YoukuKeyWordTest {
headList
.
add
(
"uper"
);
headList
.
add
(
"uper"
);
headList
.
add
(
"word"
);
headList
.
add
(
"word"
);
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
poi
.
exportExcel
(
"D://crawlerdata//
优酷数据-txh-0320
.xlsx"
,
"数据"
,
headList
,
bodyList
);
poi
.
exportExcel
(
"D://crawlerdata//
视频//优酷数据-毓婷-0716
.xlsx"
,
"数据"
,
headList
,
bodyList
);
}
}
}
}
src/test/java/com/zhiwei/user/MaimaiTest.java
View file @
9234d24c
//package com.zhiwei.user;
package
com
.
zhiwei
.
user
;
//
//import java.util.ArrayList;
import
java.util.ArrayList
;
//import java.util.Arrays;
import
java.util.Arrays
;
//import java.util.List;
import
java.util.List
;
//import java.util.Map;
import
java.util.Map
;
//
//import org.testng.annotations.Test;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
//
import
com.zhiwei.parse.Maimai
;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Maimai;
public
class
MaimaiTest
{
//
//public class MaimaiTest {
public
static
void
main
(
String
[]
args
)
{
// @Test
// public void maimaiUserCrawler() {
String
path
=
"D:\\crawlerdata\\用户采集\\脉脉用户.xlsx"
;
// String path = "D:\\crawlerdata\\脉脉用户.xlsx";
String
word
=
"巨量引擎|巨量 引擎|巨 量 引 擎|巨 量 引擎|巨量引 擎"
;
// String word = "美团|美团网|大众点评|美团点评|摩拜|猫眼|榛果|三快科技|三快在线";
String
cookie
=
"_buuid=ba30f54f-57ed-4dd4-af5f-31cb08d2eacf; guid=HBoEGxgEGBscBBsZGlYHGBseHxoYGhIZHFYcGQQdGR8FQ1hLTEt5ChITBBIdHxkEGgQbHQVPR0VYQmkKA0VBSU9tCk9BQ0YKBmZnfmJhAgocGQQdGR8FXkNhSE99T0ZaWmsKAx4cfWV9ChEZBBwKfmQKWV1FTkRDfQIKGgQfBUtGRkNQRWc=; seid=s1553309971270; token=\"iUifMkpE9YKuFpz0yEj+jiWpUqM6IXvEvwWKzdd/jK8YgrWsT1/Ku7k9bkIRRYvG8CKuzcDfAvoCmBm7+jVysA==\"; uid=\"aa0Zx+VbwC41ceG8bxvIefAirs3A3wL6ApgZu/o1crA=\"; session=eyJ1IjoiMTczMzMzNTM2Iiwic2VjcmV0IjoidzdPUkhMelktVS1iN1Nsb3VxLXZQV2JvIiwic3RhdHVzIjp0cnVlLCJfZXhwaXJlIjoxNTUzMzk2Mzk0MzczLCJfbWF4QWdlIjo4NjQwMDAwMH0=; session.sig=zGIN7VMizkYf1v48nLqTGAG1k8U"
;
// String cookie = "_buuid=ba30f54f-57ed-4dd4-af5f-31cb08d2eacf; sessionid=lejfy3gdu5tf9x9zowxfhtq5o73dubc5; guid=GxsfBBgZGwQYGx4EGBkeVgcYGxkdHhMeHhkcVhwZBB0ZHwVDWEtMS3kKEhMEEh0fGQQaBBsdBU9HRVhCaQoDRUFJT20KT0FDRgoGZmd+YmECChwZBB0ZHwVeQ2FIT31PRlpaawoDHhx9ZX0KERkEHAp+ZApZXUVOREN9AgoaBB8FS0ZGQ1BFZw==; seid=s1550629286782; token=\"OCY36EFdeYzGytlQFyKRdM0DcXNdViYI02kT4QbUMpaSk/CqMXrqBOx8EFo5/fQU8CKuzcDfAvoCmBm7+jVysA==\"; uid=\"q1bNxxk8WW3MzjbCfKr/hfAirs3A3wL6ApgZu/o1crA=\"; session=eyJ1IjoiMTc2NjQ0NzY1Iiwic2VjcmV0IjoiLXFsV2c2Ym9feEJqOWxQbWdWTjcwWWg3Iiwic3RhdHVzIjp0cnVlLCJtaWQ0NTY4NzYwIjpmYWxzZSwiX2V4cGlyZSI6MTU1MDcxNTc2NzgwMSwiX21heEFnZSI6ODY0MDAwMDB9; session.sig=lVCTA7DLvo1K_r_bTjbQOH13Alc";
String
[]
words
=
word
.
split
(
"\\|"
);
// String[] words = word.split("\\|");
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<>();
// List<Map<String,Object>> bodyList = new ArrayList<>();
for
(
String
w
:
words
)
{
// for(String w : words) {
bodyList
.
addAll
(
Maimai
.
getUserList
(
w
,
cookie
,
null
));
// bodyList.addAll(Maimai.getUserList(w, cookie, null));
}
// }
List
<
String
>
headList
=
Arrays
.
asList
(
"id"
,
"name"
,
"gender"
,
"url"
,
"rank"
,
"compos"
,
"city"
);
// List<String> headList = Arrays.asList("id","name","gender","url","rank","compos","city");
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
poi
.
exportExcel
(
path
,
"result"
,
headList
,
bodyList
);
// poi.exportExcel(path, "result", headList, bodyList);
// }
}
//}
}
src/test/java/com/zhiwei/user/QQkandianExample.java
View file @
9234d24c
package
com
.
zhiwei
.
user
;
//
package com.zhiwei.user;
//
import
java.util.ArrayList
;
//
import java.util.ArrayList;
import
java.util.HashMap
;
//
import java.util.HashMap;
import
java.util.List
;
//
import java.util.List;
import
java.util.Map
;
//
import java.util.Map;
//
import
org.testng.annotations
.Test
;
//import org.junit
.Test;
//
import
com.zhiwei.bean.QQKandianUser
;
//
import com.zhiwei.bean.QQKandianUser;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import
com.zhiwei.parse.QQKandian
;
//
import com.zhiwei.parse.QQKandian;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
//
import com.zhiwei.tools.tools.ZhiWeiTools;
//
public
class
QQkandianExample
{
//
public class QQkandianExample {
//
@Test
//
@Test
public
void
f
()
{
//
public void f() {
QQKandian
qqKandian
=
new
QQKandian
();
//
QQKandian qqKandian = new QQKandian();
String
path
=
"D:\\crawlerdata\\用户采集\\qq看点用户.xlsx"
;
//
String path = "D:\\crawlerdata\\用户采集\\qq看点用户.xlsx";
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
Map
<
String
,
Object
>
map
=
poi
.
importExcel
(
path
,
0
);
//
Map<String,Object> map = poi.importExcel(path, 0);
//
List
<
Map
<
String
,
Object
>>
dataList
=
(
List
<
Map
<
String
,
Object
>>)
map
.
get
(
"body"
);
//
List<Map<String,Object>> dataList = (List<Map<String, Object>>) map.get("body");
List
<
QQKandianUser
>
allList
=
new
ArrayList
<
QQKandianUser
>();
//
List<QQKandianUser> allList = new ArrayList<QQKandianUser>();
for
(
Map
<
String
,
Object
>
m
:
dataList
)
{
//
for(Map<String,Object> m : dataList) {
String
name
=
m
.
get
(
"渠道"
)+
""
;
//
String name = m.get("渠道")+"";
System
.
out
.
println
(
name
);
//
System.out.println(name);
List
<
QQKandianUser
>
qqKandianUsers
=
qqKandian
.
getUser
(
name
,
null
);
//
List<QQKandianUser> qqKandianUsers = qqKandian.getUser(name, null);
if
(
qqKandianUsers
!=
null
)
{
//
if(qqKandianUsers != null) {
System
.
out
.
println
(
qqKandianUsers
.
size
());
//
System.out.println(qqKandianUsers.size());
allList
.
addAll
(
qqKandianUsers
);
//
allList.addAll(qqKandianUsers);
}
else
{
//
}else {
System
.
out
.
println
(
name
+
"--- null"
);
//
System.out.println( name + "--- null");
}
//
}
ZhiWeiTools
.
sleep
(
3000
);
//
ZhiWeiTools.sleep(3000);
}
//
}
List
<
String
>
headList
=
new
ArrayList
<
String
>();
//
List<String> headList = new ArrayList<String>();
headList
.
add
(
"name"
);
//
headList.add("name");
headList
.
add
(
"url"
);
//
headList.add("url");
headList
.
add
(
"verity"
);
//
headList.add("verity");
headList
.
add
(
"desc"
);
//
headList.add("desc");
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
//
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
for
(
QQKandianUser
qqKandianUser
:
allList
)
{
//
for(QQKandianUser qqKandianUser : allList) {
Map
<
String
,
Object
>
m
=
new
HashMap
<
String
,
Object
>();
//
Map<String,Object> m = new HashMap<String,Object>();
m
.
put
(
"name"
,
qqKandianUser
.
getName
());
//
m.put("name", qqKandianUser.getName());
m
.
put
(
"url"
,
qqKandianUser
.
getUrl
());
//
m.put("url", qqKandianUser.getUrl());
m
.
put
(
"verity"
,
qqKandianUser
.
isVerify
());
//
m.put("verity", qqKandianUser.isVerify());
m
.
put
(
"desc"
,
qqKandianUser
.
getDesc
());
//
m.put("desc", qqKandianUser.getDesc());
bodyList
.
add
(
m
);
//
bodyList.add(m);
}
//
}
poi
.
exportExcel
(
path
,
"数据完成后"
,
headList
,
bodyList
);
//
poi.exportExcel(path, "数据完成后", headList, bodyList);
}
//
}
//
}
//
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment