Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
A
articlenewscrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
chenweiyang
articlenewscrawler
Commits
d979d793
Commit
d979d793
authored
Jan 24, 2019
by
yangchen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
脉脉 评论采集 和部分视频采集
parent
1116d3c5
Show whitespace changes
Inline
Side-by-side
Showing
51 changed files
with
1106 additions
and
602 deletions
+1106
-602
pom.xml
+1
-1
src/main/java/com/zhiwei/httpclient/HeadGet.java
+7
-27
src/main/java/com/zhiwei/httpclient/HttpClient.java
+12
-9
src/main/java/com/zhiwei/parse/Aiqiyi.java
+1
-1
src/main/java/com/zhiwei/parse/BiliBili.java
+8
-7
src/main/java/com/zhiwei/parse/Chejia.java
+111
-0
src/main/java/com/zhiwei/parse/Douyin.java
+1
-0
src/main/java/com/zhiwei/parse/Gftai.java
+0
-1
src/main/java/com/zhiwei/parse/Maimai.java
+113
-5
src/main/java/com/zhiwei/parse/PearVideo.java
+17
-10
src/main/java/com/zhiwei/parse/QQKB.java
+11
-2
src/main/java/com/zhiwei/parse/QQKandian.java
+0
-3
src/main/java/com/zhiwei/parse/QicheHome.java
+0
-1
src/main/java/com/zhiwei/parse/SinaTousu.java
+0
-1
src/main/java/com/zhiwei/parse/SouBao.java
+1
-0
src/main/java/com/zhiwei/parse/Souhu.java
+4
-3
src/main/java/com/zhiwei/parse/Toutiao.java
+0
-4
src/main/java/com/zhiwei/parse/Xueqiu.java
+23
-2
src/main/java/com/zhiwei/parse/Yiche.java
+1
-0
src/main/java/com/zhiwei/parse/Youku.java
+67
-0
src/main/java/com/zhiwei/parse/analysis/AiqiyiByWordAnalysis.java
+10
-4
src/main/java/com/zhiwei/parse/analysis/BaijiaAccountAnalysis.java
+7
-3
src/main/java/com/zhiwei/parse/analysis/DayuByWordAnalysis.java
+12
-11
src/main/java/com/zhiwei/parse/analysis/DayuCommentAnalysis.java
+64
-68
src/main/java/com/zhiwei/parse/analysis/DoubanCommentAnalysis.java
+0
-2
src/main/java/com/zhiwei/parse/analysis/DouyinHotDataAnalysis.java
+0
-4
src/main/java/com/zhiwei/parse/analysis/FenghuangAccountAnalysis.java
+8
-3
src/main/java/com/zhiwei/parse/analysis/FenghuangCommentAnalysis.java
+20
-27
src/main/java/com/zhiwei/parse/analysis/MaimaiBywordAnalysis.java
+2
-6
src/main/java/com/zhiwei/parse/analysis/MeipaiByWordAnalysis.java
+0
-1
src/main/java/com/zhiwei/parse/analysis/QQKBCommentAnalysis.java
+13
-16
src/main/java/com/zhiwei/parse/analysis/QicheHomeKwyWordAnalysis.java
+0
-4
src/main/java/com/zhiwei/parse/analysis/WangyiHistoryAnalysis.java
+3
-2
src/main/resources/log4j.properties
+2
-1
src/test/java/com/zhiwei/Comment/ChejiaCommentCountTest.java
+42
-0
src/test/java/com/zhiwei/Comment/MaimaiCommentCountTest.java
+40
-0
src/test/java/com/zhiwei/Comment/XueqiuCommentCountTest.java
+48
-0
src/test/java/com/zhiwei/crawler/AiqiyiByWordExample.java
+42
-42
src/test/java/com/zhiwei/crawler/BaijiaAccountExample.java
+89
-89
src/test/java/com/zhiwei/crawler/DayuAccountExample.java
+50
-50
src/test/java/com/zhiwei/crawler/DayuByWordExample.java
+25
-25
src/test/java/com/zhiwei/crawler/MaimaiBywordExample.java
+10
-7
src/test/java/com/zhiwei/crawler/PearVideoByWordExample.java
+1
-1
src/test/java/com/zhiwei/crawler/QQKBCommentExample.java
+5
-2
src/test/java/com/zhiwei/crawler/SouhuCommentCountExample.java
+30
-4
src/test/java/com/zhiwei/crawler/SouhuCommentExample.java
+5
-2
src/test/java/com/zhiwei/hsitory/QQkandianHistoryExample.java
+42
-42
src/test/java/com/zhiwei/keyword/GftaiTest.java
+33
-33
src/test/java/com/zhiwei/keyword/KuaiTousuTest.java
+38
-38
src/test/java/com/zhiwei/keyword/SinaTousuTest.java
+38
-38
src/test/java/com/zhiwei/keyword/YoukuKeyWordTest.java
+49
-0
No files found.
pom.xml
View file @
d979d793
...
...
@@ -3,7 +3,7 @@
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiwei
</groupId>
<artifactId>
articlenewscrawler
</artifactId>
<version>
0.0.
4
-SNAPSHOT
</version>
<version>
0.0.
8
-SNAPSHOT
</version>
<name>
articlenewscrawler
</name>
<description>
采集凤凰,一点资讯,搜狐历时文章和文章评论
</description>
...
...
src/main/java/com/zhiwei/httpclient/HeadGet.java
View file @
d979d793
package
com
.
zhiwei
.
httpclient
;
import
java.io.IOException
;
import
java.io.UnsupportedEncodingException
;
import
java.net.InetSocketAddress
;
import
java.net.Proxy
;
import
java.net.SocketAddress
;
import
java.net.URLEncoder
;
import
java.net.Proxy.Type
;
import
java.util.HashMap
;
import
java.util.Map
;
import
org.jsoup.nodes.Document
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
public
class
HeadGet
{
/**
...
...
@@ -409,12 +401,10 @@ public class HeadGet {
* @return
*/
public
static
Map
<
String
,
String
>
getPearVideoByWordHeaderMap
(
String
cookie
)
{
Map
<
String
,
String
>
headerMap
=
new
HashMap
<
String
,
String
>();
headerMap
.
put
(
"Host"
,
"www.pearvideo.com"
);
headerMap
.
put
(
"Connection"
,
"keep-alive"
);
headerMap
.
put
(
"Accept-Language"
,
"zh-CN,zh;q=0.9"
);
headerMap
.
put
(
"Accept"
,
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
);
headerMap
.
put
(
"User-Agent"
,
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"
);
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>();
headerMap
.
put
(
"Accept-Language"
,
"zh-CN,zh;q=0.9,en;q=0.8"
);
headerMap
.
put
(
"Accept"
,
"text/html, */*; q=0.01"
);
headerMap
.
put
(
"User-Agent"
,
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
);
if
(
cookie
!=
null
)
{
headerMap
.
put
(
"Cookie"
,
cookie
);
}
...
...
@@ -492,8 +482,8 @@ public class HeadGet {
*/
public
static
Map
<
String
,
String
>
getQQKBCommentHeaderMap
(
String
cookie
)
{
Map
<
String
,
String
>
headerMap
=
new
HashMap
<
String
,
String
>();
headerMap
.
put
(
"User-Agent"
,
"天天快报 4.6.2 qnreading (iPhone8,1; iOS 11.2.1; zh_CN; 4.6.2.89)"
);
//
headerMap.put("User-Agent",
//
"天天快报 4.6.2 qnreading (iPhone8,1; iOS 11.2.1; zh_CN; 4.6.2.89)");
headerMap
.
put
(
"Accept"
,
"*/*"
);
headerMap
.
put
(
"Accept-Language"
,
"zh-Hans-CN;q=1"
);
...
...
@@ -514,7 +504,7 @@ public class HeadGet {
* @return
*/
public
static
Map
<
String
,
Object
>
getQQKBCommentParamMap
(
String
comment_id
,
String
article_id
){
Map
<
String
,
Object
>
param
=
new
HashMap
<
String
,
Object
>();
Map
<
String
,
Object
>
param
=
new
HashMap
<>();
param
.
put
(
"chlid"
,
"daily_timeline"
);
param
.
put
(
"comment_id"
,
comment_id
);
param
.
put
(
"page"
,
1
);
...
...
@@ -944,15 +934,5 @@ public class HeadGet {
return
paramMap
;
}
public
static
void
main
(
String
[]
args
)
throws
UnsupportedEncodingException
{
String
url
=
"http://180.186.38.200/rest/n/feed/profile2"
;
System
.
out
.
println
(
url
);
String
cookie
=
""
;
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
Map
<
String
,
Object
>
paramMap
=
HeadGet
.
getKuaishouParamMap
();
String
result
=
HttpClient
.
executeHttpRequestPost
(
url
,
null
,
headerMap
,
paramMap
);
System
.
out
.
println
(
result
);
System
.
out
.
println
(
result
.
length
());
}
}
src/main/java/com/zhiwei/httpclient/HttpClient.java
View file @
d979d793
...
...
@@ -7,11 +7,16 @@ import java.util.Map;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.tools.httpclient.HttpClientTemplateOK
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
okhttp3.Response
;
public
class
HttpClient
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
HttpClient
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
/**
*
* @Description (TODO这里用一句话描述这个方法的作用)
...
...
@@ -21,22 +26,20 @@ public class HttpClient {
* @throws IOException
*/
public
static
String
executeHttpRequestGet
(
String
url
,
Proxy
proxy
,
Map
<
String
,
String
>
headerMap
)
{
try
{
String
result
=
HttpClientTemplateOK
.
get
(
url
,
proxy
,
headerMap
);
return
result
;
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
)){
return
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
logger
.
error
(
"httpClient 获取数据出现问题:{}"
,
e
.
getMessage
()
);
logger
.
error
(
"httpClient 获取数据出现问题:{}"
,
e
);
return
null
;
}
}
public
static
String
executeHttpRequestPost
(
String
url
,
Proxy
proxy
,
Map
<
String
,
String
>
headerMap
,
Map
<
String
,
Object
>
paramMap
)
{
try
{
String
result
=
HttpClientTemplateOK
.
post
(
url
,
proxy
,
headerMap
,
paramMap
);
return
result
;
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapPost
(
url
,
headerMap
,
paramMap
),
proxy
)){
return
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
logger
.
error
(
"httpClient 获取数据出现问题:{}"
,
e
.
getMessage
()
);
logger
.
error
(
"httpClient 获取数据出现问题:{}"
,
e
);
return
null
;
}
...
...
src/main/java/com/zhiwei/parse/Aiqiyi.java
View file @
d979d793
...
...
@@ -28,7 +28,7 @@ public class Aiqiyi {
public
static
List
<
Map
<
String
,
Object
>>
getAiqiyiByWordData
(
String
word
,
Proxy
proxy
)
{
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getAiqiyiBywordHeaderMap
(
null
);
Map
<
String
,
String
>
headerMap1
=
HeadGet
.
getAiqiyiHeaderMap
(
null
);
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>
>();
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
try
{
for
(
int
i
=
1
;
i
<=
20
;
i
++)
{
String
url
=
"http://so.iqiyi.com/so/q_"
+
URLEncoder
.
encode
(
word
,
"UTF-8"
)+
"_ctg_%E7%94%9F%E6%B4%BB_t_0_page_"
+
i
+
"_p_1_qc_0_rd__site__m_11_bitrate_?af=true"
;
...
...
src/main/java/com/zhiwei/parse/BiliBili.java
View file @
d979d793
package
com
.
zhiwei
.
parse
;
import
java.io.IOException
;
import
java.io.UnsupportedEncodingException
;
import
java.net.Proxy
;
import
java.net.URLEncoder
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.List
;
import
java.util.Map
;
...
...
@@ -23,7 +23,7 @@ import okhttp3.Request;
public
class
BiliBili
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
BiliBili
.
class
);
private
static
final
Logger
logger
=
LoggerFactory
.
getLogger
(
BiliBili
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
@SuppressWarnings
(
"unchecked"
)
...
...
@@ -46,6 +46,7 @@ public class BiliBili {
while
(
more
)
{
map
.
clear
();
String
ur
=
url
+
"&page="
+
n
;
System
.
out
.
println
(
ur
);
request
=
HttpRequestBuilder
.
newGetRequest
(
ur
,
header
);
String
result2
=
httpBoot
.
syncCall
(
request
,
proxy
).
body
().
string
();
map
=
BilibilikeyWordAnalysis
.
getData
(
result2
);
...
...
@@ -60,13 +61,13 @@ public class BiliBili {
}
return
bodyList
;
}
catch
(
UnsupportedEncodingException
e
)
{
e
.
printStackTrace
(
);
}
catch
(
IO
Exception
e
)
{
e
.
printStackTrace
(
);
logger
.
error
(
"e "
,
e
);
}
catch
(
Exception
e
)
{
logger
.
error
(
"e "
,
e
);
}
return
null
;
return
Collections
.
emptyList
()
;
}
public
static
void
main
(
String
[]
args
)
{
...
...
@@ -88,7 +89,7 @@ public class BiliBili {
headlist
.
add
(
"title"
);
headlist
.
add
(
"url"
);
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
poi
.
exportExcel
(
"D://crawlerdata//bilibili关键词采集数据-
竹鼠
.xlsx"
,
"B站数据"
,
headlist
,
bodyList
);
poi
.
exportExcel
(
"D://crawlerdata//bilibili关键词采集数据-
txh
.xlsx"
,
"B站数据"
,
headlist
,
bodyList
);
}
...
...
src/main/java/com/zhiwei/parse/Chejia.java
0 → 100644
View file @
d979d793
package
com
.
zhiwei
.
parse
;
import
static
java
.
util
.
Objects
.
nonNull
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.Date
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Response
;
/**
 * Crawler for comments attached to Chejia (autohome "车家号") articles.
 *
 * <p>Both public entry points first download the article page to discover the
 * id used by the comment API, then query
 * {@code reply.autohome.com.cn/api/comments/show.json}.
 */
public class Chejia {

    private static final Logger logger = LoggerFactory.getLogger(Chejia.class);

    /** Comments requested per page; also drives the "last page reached" check. */
    private static final int PAGE_SIZE = 50;

    /** Pause between two consecutive page requests, in milliseconds. */
    private static final int PAGE_INTERVAL_MILLIS = 2000;

    private static HttpBoot httpBoot = new HttpBoot();

    /**
     * Returns the total number of comments of a Chejia article.
     *
     * @param url   article page url
     * @param proxy proxy used for both the page request and the API request
     * @return the value of the {@code commentcount} field of the comment API
     *         response, or {@code -1} when the article page or the API
     *         response could not be fetched or parsed
     */
    public static int getChejiaCommentCount(String url, Proxy proxy) {
        String commentUrl = getCommentUrl(url, proxy);
        if (nonNull(commentUrl)) {
            logger.debug("chejia comment api url {}", commentUrl);
            try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(commentUrl), proxy)) {
                JSONObject json = JSONObject.parseObject(response.body().string());
                // Read the boxed value so a missing field is reported explicitly
                // instead of surfacing as a NullPointerException on unboxing.
                Integer count = json.getInteger("commentcount");
                if (count != null) {
                    return count;
                }
                logger.warn("commentcount missing in response, url = {}", commentUrl);
            } catch (Exception e) {
                // Throwable goes last so SLF4J prints the stack trace.
                logger.error("error url = {}", commentUrl, e);
            }
        }
        return -1;
    }

    /**
     * Collects all comments of a Chejia article, page by page.
     *
     * <p>Each comment is returned as a map with the keys {@code source},
     * {@code time} (formatted {@code yyyy-MM-dd HH:mm:ss}), {@code content},
     * {@code like} and {@code id}. Paging stops when the collected pages cover
     * the reported {@code commentcount}, when a page comes back empty, or on
     * the first failed request; comments gathered so far are still returned.
     *
     * @param url   article page url
     * @param proxy proxy used for all requests
     * @return the collected comments; an empty list when the comment API url
     *         could not be derived from the article page
     */
    public static List<Map<String, Object>> getChejiaComment(String url, Proxy proxy) {
        String commentUrl = getCommentUrl(url, proxy);
        if (!nonNull(commentUrl)) {
            return Collections.emptyList();
        }
        List<Map<String, Object>> bodyList = new ArrayList<>();
        int page = 1;
        while (true) {
            String pageUrl = commentUrl + "&page=" + page;
            try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(pageUrl), proxy)) {
                JSONObject json = JSONObject.parseObject(response.body().string());
                JSONArray comments = json.getJSONArray("commentlist");
                int fetched = comments == null ? 0 : comments.size();
                for (int i = 0; i < fetched; i++) {
                    bodyList.add(toCommentMap(comments.getJSONObject(i)));
                }
                // getIntValue yields 0 for a missing field, which ends paging below.
                int total = json.getIntValue("commentcount");
                logger.info(" 一共采集 了 {} 条 采集到 {} 页 一共有 {} 条", bodyList.size(), page, total);
                if (fetched == 0 || page * PAGE_SIZE >= total) {
                    break;
                }
            } catch (Exception e) {
                logger.error("error url = {}", pageUrl, e);
                break;
            }
            // Only throttle when another page is actually going to be requested.
            ZhiWeiTools.sleep(PAGE_INTERVAL_MILLIS);
            page++;
        }
        return bodyList;
    }

    /**
     * Converts one element of the API's {@code commentlist} array into the
     * flat map handed back to callers.
     */
    private static Map<String, Object> toCommentMap(JSONObject data) {
        Map<String, Object> map = new HashMap<>();
        map.put("source", data.getString("RMemberName"));
        long millis = parseReplyMillis(data.getString("RReplyDate"));
        map.put("time", TimeParse.dateFormartString(new Date(millis), "yyyy-MM-dd HH:mm:ss"));
        map.put("content", data.getString("RContent"));
        map.put("like", data.get("RUp"));
        map.put("id", data.getString("ReplyId"));
        return map;
    }

    /**
     * Extracts the epoch-millisecond value from a {@code /Date(...)} string.
     *
     * <p>Accepts the value with or without a trailing time-zone offset, e.g.
     * {@code /Date(1548297295000+0800)/} or {@code /Date(1548297295000)/}.
     * NOTE(review): the exact shape the API sends is assumed from the original
     * split on {@code "/Date("} and {@code "+"} - confirm against a live response.
     */
    private static long parseReplyMillis(String rawDate) {
        String tail = rawDate.split("/Date\\(")[1];
        int end = (!tail.isEmpty() && tail.charAt(0) == '-') ? 1 : 0;
        while (end < tail.length() && Character.isDigit(tail.charAt(end))) {
            end++;
        }
        return Long.parseLong(tail.substring(0, end));
    }

    /**
     * Downloads the article page and builds the comment API url from the
     * {@code pvTrack.object} value embedded in it.
     *
     * @return the API url (without the {@code page} parameter), or
     *         {@code null} when the page could not be fetched or does not
     *         contain the marker
     */
    private static String getCommentUrl(String url, Proxy proxy) {
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)) {
            String html = response.body().string();
            String objectID = html.split("pvTrack.object = ")[1].split(";")[0].replace("\"", "");
            return "https://reply.autohome.com.cn/api/comments/show.json?appid=21&count=" + PAGE_SIZE
                    + "&id=" + objectID;
        } catch (Exception e) {
            logger.error("error url = {}", url, e);
        }
        return null;
    }
}
src/main/java/com/zhiwei/parse/Douyin.java
View file @
d979d793
...
...
@@ -25,6 +25,7 @@ public class Douyin {
* @param url
* @return
*/
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
Map
<
String
,
Object
>>
getDouyinHotData
(
String
url
,
Proxy
proxy
)
{
String
iid
=
url
.
split
(
"iid="
)[
1
].
split
(
"&"
)[
0
];
String
ch_id
=
url
.
split
(
"challenge/"
)[
1
].
split
(
"\\?"
)[
0
];
...
...
src/main/java/com/zhiwei/parse/Gftai.java
View file @
d979d793
...
...
@@ -3,7 +3,6 @@ package com.zhiwei.parse;
import
java.net.Proxy
;
import
java.net.URLEncoder
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.List
;
import
java.util.Map
;
...
...
src/main/java/com/zhiwei/parse/Maimai.java
View file @
d979d793
package
com
.
zhiwei
.
parse
;
import
static
com
.
alibaba
.
fastjson
.
JSON
.
toJavaObject
;
import
java.net.Proxy
;
import
java.net.URLEncoder
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.parse.analysis.MaimaiBywordAnalysis
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Response
;
public
class
Maimai
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
Maimai
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
private
static
MaimaiBywordAnalysis
maimaiBywordAnalysis
=
new
MaimaiBywordAnalysis
();
/**
*
* @Description 实名动态
* @param key
* @param cookie
* @param time
* @param proxy
* @return
*/
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
Map
<
String
,
Object
>>
getData
(
String
key
,
String
cookie
,
String
time
,
Proxy
proxy
)
{
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getMaimaiKeywordHeaderMap
(
cookie
);
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>
>();
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
boolean
f
=
true
;
try
{
String
url
=
"https://maimai.cn/search/feeds?query="
+
URLEncoder
.
encode
(
key
,
"utf-8"
)+
"&limit=20&offset=0&highlight=true&sortby=time&jsononly=1"
;
...
...
@@ -32,11 +54,11 @@ public class Maimai {
Map
<
String
,
Object
>
map
=
maimaiBywordAnalysis
.
getData
(
result
,
time
);
f
=
(
boolean
)
map
.
get
(
"hasMore"
);
List
<
Map
<
String
,
Object
>>
daList
=
(
List
<
Map
<
String
,
Object
>>)
map
.
get
(
"data"
);
if
(
daList
!=
null
&&
daList
.
size
()
>
0
)
{
if
(
daList
!=
null
&&
!
daList
.
isEmpty
()
)
{
dataList
.
addAll
(
daList
);
url
=
"https://maimai.cn/search/feeds?query="
+
URLEncoder
.
encode
(
key
,
"utf-8"
)+
"&limit=20&offset="
+
i
+
"&highlight=true&sortby=time&jsononly=1"
;
i
+=
20
;
logger
.
info
(
"{}
==采集到的数据量=="
+
dataList
.
size
(),
key
);
logger
.
info
(
"{}
==采集到的数据量== {}"
,
dataList
.
size
(),
key
);
ZhiWeiTools
.
sleep
(
2000
);
}
else
{
break
;
...
...
@@ -48,9 +70,19 @@ public class Maimai {
return
dataList
;
}
/**
*
* @Description 职言交流
* @param key
* @param cookie
* @param time
* @param proxy
* @return
*/
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
Map
<
String
,
Object
>>
getDataByNoName
(
String
key
,
String
cookie
,
String
time
,
Proxy
proxy
)
{
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getMaimaiKeywordHeaderMap
(
cookie
);
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>
>();
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
boolean
f
=
true
;
try
{
String
url
=
"https://maimai.cn/search/gossips?query="
+
URLEncoder
.
encode
(
key
,
"utf-8"
)+
"&limit=20&offset=0&highlight=true&sortby=time&jsononly=1"
;
...
...
@@ -64,7 +96,7 @@ public class Maimai {
dataList
.
addAll
(
daList
);
url
=
"https://maimai.cn/search/gossips?query="
+
URLEncoder
.
encode
(
key
,
"utf-8"
)+
"&limit=20&offset="
+
i
+
"highlight=true&sortby=time&jsononly=1"
;
i
+=
20
;
logger
.
info
(
"{}
==采集到的数据量=="
+
dataList
.
size
(),
key
);
logger
.
info
(
"{}
==采集到的数据量== {} "
,
dataList
.
size
(),
key
);
ZhiWeiTools
.
sleep
(
2000
);
}
else
{
break
;
...
...
@@ -76,4 +108,80 @@ public class Maimai {
return
dataList
;
}
/**
 * Fetches the like / spread / comment counters of one Maimai gossip post.
 *
 * <p>Example url:
 * https://maimai.cn/web/gossip_detail?encode_id=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpZCI6MTk2MzEyNjYsImlhdCI6MTU0ODI5NzI5NX0.N6SPmcf-fyitLNomzY-a8BEY31eseYnvG7RTUQ3jxYY
 *
 * <p>The detail page embeds its data as the string argument of a
 * {@code JSON.parse("...")} call; that argument is cut out of the html,
 * passed through {@code ZhiWeiTools.decodeUnicode} and parsed as JSON.
 *
 * @param url   gossip detail page url
 * @param proxy proxy holder used for the request
 * @return a map with the keys {@code like}, {@code spreads}, {@code cmts},
 *         {@code gid}, {@code title} and {@code author} read from
 *         {@code data.gossip}; an empty immutable map when the page could not
 *         be fetched or parsed
 */
public static Map<String, Object> getMaiaiCount(String url, ProxyHolder proxy) {
    try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)) {
        String html = response.body().string();
        String embedded = html.split("JSON.parse\\(\"")[1].split("\"\\);\\</script\\>")[0];
        JSONObject json = JSONObject.parseObject(ZhiWeiTools.decodeUnicode(embedded));
        JSONObject data = json.getJSONObject("data").getJSONObject("gossip");
        Map<String, Object> map = new HashMap<>();
        map.put("like", data.getInteger("likes"));
        map.put("spreads", data.getInteger("spreads"));
        map.put("cmts", data.getInteger("cmts"));
        map.put("gid", data.getLong("id"));
        map.put("title", data.getString("text"));
        map.put("author", data.getString("author"));
        return map;
    } catch (Exception e) {
        // The throwable is the last argument so SLF4J logs the stack trace;
        // the placeholder carries the url that failed.
        logger.error(" 脉脉 转评攒 获取失败 url = {}", url, e);
    }
    return Collections.emptyMap();
}
/**
 * Collects the comments of one Maimai gossip post.
 *
 * <p>Example url:
 * https://maimai.cn/web/gossip_detail?encode_id=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpZCI6MTk2MzEyNjYsImlhdCI6MTU0ODI5NzI5NX0.N6SPmcf-fyitLNomzY-a8BEY31eseYnvG7RTUQ3jxYY
 *
 * <p>The post's counters are looked up first via {@code getMaiaiCount}; its
 * {@code gid} is then used to page through the {@code getcmts} endpoint,
 * starting at page 0. Every comment is returned as a map of the raw comment
 * JSON plus {@code fromUrl} and all entries of the post's counter map.
 * Paging stops when the response reports {@code more == 0}, when a page has
 * no comments or an empty body, or on the first failed request; comments
 * gathered so far are still returned.
 *
 * @param url   gossip detail page url
 * @param proxy proxy holder used for all requests
 * @return the collected comments; an empty list when the post's {@code gid}
 *         could not be determined
 */
@SuppressWarnings("unchecked")
public static List<Map<String, Object>> getMaimaiCommentList(String url, ProxyHolder proxy) {
    Map<String, Object> mmid = getMaiaiCount(url, proxy);
    // A failed lookup yields an empty map, not null, so the gid itself must be
    // checked; otherwise the comment API would be queried with gid=null.
    if (mmid == null || mmid.get("gid") == null) {
        return Collections.emptyList();
    }
    String gid = String.valueOf(mmid.get("gid"));
    List<Map<String, Object>> dataList = new ArrayList<>();
    int page = 0;
    boolean more = true;
    while (more) {
        String link = "https://maimai.cn/sdk/web/gossip/getcmts?gid=" + gid + "&page=" + page
                + "&count=50&hotcmts_limit_count=100";
        // try-with-resources releases the connection after every page.
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(link), proxy)) {
            String htmlBody = response.body().string();
            if (htmlBody == null || htmlBody.isEmpty()) {
                break;
            }
            JSONObject dataJson = JSONObject.parseObject(htmlBody);
            JSONArray commentJson = dataJson.getJSONArray("comments");
            if (commentJson == null || commentJson.isEmpty()) {
                break;
            }
            for (int i = 0; i < commentJson.size(); i++) {
                Map<String, Object> dataMap = toJavaObject(commentJson.getJSONObject(i), Map.class);
                dataMap.put("fromUrl", url);
                dataMap.putAll(mmid);
                dataList.add(dataMap);
            }
            page++;
            more = dataJson.getIntValue("more") != 0;
        } catch (Exception e) {
            // Stop instead of retrying the same page forever without a delay.
            logger.error("数据采集出错 link = {}", link, e);
            more = false;
        }
    }
    return dataList;
}
}
src/main/java/com/zhiwei/parse/PearVideo.java
View file @
d979d793
package
com
.
zhiwei
.
parse
;
import
java.io.UnsupportedEncodingException
;
import
java.net.Proxy
;
import
java.net.URLEncoder
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.parse.analysis.PearVideoByWordAnalysis
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
...
...
@@ -26,24 +26,31 @@ public class PearVideo {
* @return
*/
public
static
List
<
Map
<
String
,
Object
>>
getPearVideoData
(
String
word
,
Proxy
proxy
)
{
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getPearVideoByWordHeaderMap
(
null
);
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>();
headerMap
.
put
(
"Accept-Language"
,
"zh-CN,zh;q=0.9,en;q=0.8"
);
headerMap
.
put
(
"Accept"
,
"text/html, */*; q=0.01"
);
headerMap
.
put
(
"User-Agent"
,
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
);
headerMap
.
put
(
":authority"
,
"www.pearvideo.com"
);
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
try
{
headerMap
.
put
(
"referer"
,
"https://www.pearvideo.com/search.jsp?start=0&k="
+
URLEncoder
.
encode
(
word
,
"UTF-8"
));
for
(
int
i
=
0
;
i
<=
9000
;
i
+=
10
)
{
String
url
=
"http
://www.pearvideo.com/search_loading.jsp?start="
+
i
+
"&k="
+
URLEncoder
.
encode
(
word
,
"UTF-8"
)
;
String
url
=
"http
s://www.pearvideo.com/search_loading.jsp?start="
+
i
+
"&k="
+
URLEncoder
.
encode
(
word
,
"UTF-8"
)
+
"&sort=first_publish_time"
;
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
proxy
,
headerMap
);
List
<
Map
<
String
,
Object
>>
dataList1
=
pearVideoByWordAnalysis
.
getPearVideoData
(
result
);
if
(
dataList1
!=
null
&&
dataList1
.
size
()
>
0
)
{
if
(
dataList1
!=
null
&&
!
dataList1
.
isEmpty
()
)
{
dataList
.
addAll
(
dataList1
);
}
System
.
out
.
println
(
i
+
"=========="
+
dataList
.
size
());
ZhiWeiTools
.
sleep
(
4000
);
}
return
dataList
;
}
catch
(
UnsupportedEncodingException
e
)
{
logger
.
error
(
"获取数据出错"
,
e
.
getMessage
());
e
.
printStackTrace
();
return
null
;
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取数据出错 {}"
,
e
);
return
Collections
.
emptyList
();
}
}
...
...
src/main/java/com/zhiwei/parse/QQKB.java
View file @
d979d793
...
...
@@ -11,16 +11,21 @@ import org.slf4j.LoggerFactory;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.bean.QQkbUser
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.parse.analysis.QQKBAccountAnalysis
;
import
com.zhiwei.parse.analysis.QQKBCommentAnalysis
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Response
;
public
class
QQKB
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
QQKB
.
class
);
private
static
QQKBAccountAnalysis
qqAccountAnalysis
=
new
QQKBAccountAnalysis
();
private
static
QQKBCommentAnalysis
qqkbCommentAnalysis
=
new
QQKBCommentAnalysis
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
/**
*
...
...
@@ -113,8 +118,9 @@ public class QQKB {
Map
<
String
,
Object
>
paramMap
=
HeadGet
.
getQQKBCommentParamMap
(
comment_id
,
article_id
);
int
i
=
1
;
while
(
true
)
{
try
{
String
result
=
HttpClient
.
executeHttpRequestPost
(
"http://r.cnews.qq.com/getQQNewsComment"
,
proxy
,
headerMap
,
paramMap
);
String
result
=
HttpClient
.
executeHttpRequestPost
(
"http://r.cnews.qq.com/getQQNewsComment"
,
ProxyFactory
.
getNatProxy
()
,
headerMap
,
paramMap
);
paramMap
.
clear
();
List
<
Map
<
String
,
Object
>>
lists
=
qqkbCommentAnalysis
.
getCommentData
(
result
,
null
,
comment_id
,
article_id
,
proxy
);
if
(
lists
==
null
||
lists
.
size
()
<
1
)
{
...
...
@@ -124,7 +130,10 @@ public class QQKB {
paramMap
=
qqkbCommentAnalysis
.
getParamMap
(
result
,
i
,
comment_id
,
article_id
);
i
++;
ZhiWeiTools
.
sleep
(
5000
);
ZhiWeiTools
.
sleep
(
300
);
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
}
}
return
dataList
;
}
catch
(
Exception
e
)
{
...
...
src/main/java/com/zhiwei/parse/QQKandian.java
View file @
d979d793
...
...
@@ -13,8 +13,6 @@ import java.util.regex.Pattern;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
...
...
@@ -31,7 +29,6 @@ import okhttp3.Request;
public
class
QQKandian
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
QQKandian
.
class
);
public
List
<
QQKandianUser
>
getUser
(
String
name
,
Proxy
proxy
)
{
if
(
name
!=
null
&&
name
.
length
()
>
0
)
{
...
...
src/main/java/com/zhiwei/parse/QicheHome.java
View file @
d979d793
...
...
@@ -3,7 +3,6 @@ package com.zhiwei.parse;
import
java.io.IOException
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.List
;
import
java.util.Map
;
...
...
src/main/java/com/zhiwei/parse/SinaTousu.java
View file @
d979d793
...
...
@@ -5,7 +5,6 @@ import java.io.UnsupportedEncodingException;
import
java.net.Proxy
;
import
java.net.URLEncoder
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.List
;
import
java.util.Map
;
...
...
src/main/java/com/zhiwei/parse/SouBao.java
View file @
d979d793
...
...
@@ -68,6 +68,7 @@ public class SouBao {
poi
.
exportExcel
(
"D:\\crawlerdata\\搜报网-EA 品牌 关键词-06.11-06.12.xlsx"
,
"sa"
,
headList
,
bodyList
);
}
@SuppressWarnings
(
"unchecked"
)
public
static
Map
<
String
,
String
>
getdata
()
{
Map
<
String
,
String
>
map
=
new
HashMap
<
String
,
String
>();
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
...
...
src/main/java/com/zhiwei/parse/Souhu.java
View file @
d979d793
...
...
@@ -13,6 +13,7 @@ import org.slf4j.LoggerFactory;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.parse.analysis.SouhuAccountAnalysis
;
...
...
@@ -144,8 +145,8 @@ public class Souhu {
int
j
=
1
;
try
{
while
(
true
)
{
String
newurl
=
souhuCommentAnalysis
.
getSouhuURL
(
url
,
proxy
)
+
"&page_no="
+
j
;
String
result
=
HttpClient
.
executeHttpRequestGet
(
newurl
,
proxy
,
headerMap
);
String
newurl
=
souhuCommentAnalysis
.
getSouhuURL
(
url
,
ProxyFactory
.
getNatProxy
()
)
+
"&page_no="
+
j
;
String
result
=
HttpClient
.
executeHttpRequestGet
(
newurl
,
ProxyFactory
.
getNatProxy
()
,
headerMap
);
System
.
out
.
println
(
newurl
);
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
JSONArray
jsonArry
=
json
.
getJSONObject
(
"jsonObject"
).
getJSONArray
(
"comments"
);
...
...
@@ -158,7 +159,7 @@ public class Souhu {
dataList
.
add
(
map
);
}
j
++;
ZhiWeiTools
.
sleep
(
300
0
);
ZhiWeiTools
.
sleep
(
300
);
}
}
catch
(
Exception
e
)
{
...
...
src/main/java/com/zhiwei/parse/Toutiao.java
View file @
d979d793
...
...
@@ -7,9 +7,6 @@ import java.util.ArrayList;
import
java.util.List
;
import
java.util.Map
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.parse.analysis.ToutiaoKeyWordAnalysis
;
...
...
@@ -17,7 +14,6 @@ import com.zhiwei.tools.tools.ZhiWeiTools;
public
class
Toutiao
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
Toutiao
.
class
);
private
static
ToutiaoKeyWordAnalysis
toutiaoKeyWordAnalysis
=
new
ToutiaoKeyWordAnalysis
();
...
...
src/main/java/com/zhiwei/parse/Xueqiu.java
View file @
d979d793
...
...
@@ -5,6 +5,8 @@ import java.io.UnsupportedEncodingException;
import
java.net.Proxy
;
import
java.net.URLEncoder
;
import
java.util.ArrayList
;
import
java.util.Collection
;
import
java.util.Collections
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
...
...
@@ -12,13 +14,14 @@ import java.util.Map;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.parse.analysis.XueqiuKeyWordAnalysis
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Request
;
import
okhttp3.Response
;
public
class
Xueqiu
{
...
...
@@ -60,8 +63,26 @@ public class Xueqiu {
break
;
}
}
return
bodyList
;
}
/**
 * Reads the like / repost / comment counters of one Xueqiu post.
 *
 * <p>The post page assigns its data to {@code window.SNOWMAN_STATUS}; the
 * text between that assignment and {@code window.SNOWMAN_TARGET} is cut out,
 * trimmed at its last {@code ;} and parsed as JSON.
 *
 * @param url   post page url
 * @param proxy proxy used for the request
 * @return a map with the keys {@code like}, {@code repostCount} and
 *         {@code commentCount}; an empty immutable map when the page could
 *         not be fetched or parsed
 */
public static Map<String, Object> getUrlData(String url, Proxy proxy) {
    try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)) {
        String html = response.body().string();
        String statusJs = html.split("window.SNOWMAN_STATUS = ")[1].split("window.SNOWMAN_TARGET")[0];
        // Drop the trailing statement terminator and anything after it.
        String statusJson = statusJs.substring(0, statusJs.lastIndexOf(";"));
        JSONObject json = JSONObject.parseObject(statusJson);
        Map<String, Object> map = new HashMap<>();
        map.put("like", json.getInteger("like_count"));
        map.put("repostCount", json.getInteger("retweet_count"));
        map.put("commentCount", json.getInteger("reply_count"));
        return map;
    } catch (Exception e) {
        // The throwable must be the last argument for SLF4J to log its stack trace.
        logger.error(" 雪球 数据转评赞获取失败 url = {}", url, e);
    }
    return Collections.emptyMap();
}
}
src/main/java/com/zhiwei/parse/Yiche.java
View file @
d979d793
...
...
@@ -86,6 +86,7 @@ public class Yiche {
ZhiWeiTools
.
sleep
(
2000
);
page
++;
}
return
bodyList
;
}
return
Collections
.
emptyList
();
...
...
src/main/java/com/zhiwei/parse/Youku.java
0 → 100644
View file @
d979d793
package
com
.
zhiwei
.
parse
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
okhttp3.Response
;
/**
 * Keyword search crawler for Youku: walks the search result pages for a word
 * and collects title, link, upload time and uploader of each listed video.
 */
public class Youku {
    private static final Logger logger = LoggerFactory.getLogger(Youku.class);
    private static final HttpBoot httpBoot = new HttpBoot();

    /** Fixed {@code aaid} query parameter sent with every search request. */
    private static final String AAID = "9cae49f0e031664b00d8f9c108e586ab";
    /** Number of result pages requested per keyword. */
    private static final int MAX_PAGE = 20;
    /** Selector of the title link inside one result entry. */
    private static final String TITLE_LINK_SELECTOR = "div.mod-main > div.mod-header > h2 > a";
    /** Label preceding the upload time in the info paragraph. */
    private static final String UPLOAD_TIME_LABEL = "上传时间:";
    /** Label preceding the uploader name in the info paragraph. */
    private static final String UPLOADER_LABEL = "上传者:";

    /**
     * Searches Youku for {@code word} (ordered by creation time) and returns
     * the entries found on pages 1 to {@value #MAX_PAGE}.
     *
     * <p>A page that fails to download or parse is logged and skipped; the
     * remaining pages are still requested.
     *
     * @param word search keyword; it is URL-encoded as UTF-8 before use
     * @return one map per video with the keys {@code title}, {@code url},
     *         {@code time} and {@code uper}; never {@code null}
     */
    public static List<Map<String, Object>> getDataList(String word) {
        List<Map<String, Object>> list = new ArrayList<>();
        // The encoded keyword does not change between pages, so encode it once.
        String encodedWord = URLCodeUtil.getURLEncode(word, "UTF-8");
        for (int i = 1; i <= MAX_PAGE; i++) {
            String url = "https://so.youku.com/search_video/q_" + encodedWord
                    + "?spm=a2h0k.11417342.filter.dnew&orderfield=createtime&aaid=" + AAID
                    + "&pg=" + i;
            logger.debug(" url = {} ", url);
            try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyFactory.getNatProxy())) {
                String result = response.body().string();
                // The result markup is delivered as a JSON argument of a
                // bigview.view(...) call inside a <script> element.
                String jsondata = result.split("bigview.view\\(")[1].split("\\)\\</script\\>")[0];
                JSONObject json = JSONObject.parseObject(jsondata);
                Document doc = Jsoup.parse(json.getString("html"));
                Elements elements = doc.select("div.sk-result-list").select("div.sk-mod");
                for (Element element : elements) {
                    Map<String, Object> item = parseItem(element);
                    if (item != null) {
                        list.add(item);
                    }
                }
                logger.info(" i = {} dataSize = {} ", i, list.size());
            } catch (Exception e) {
                // Throwable goes last so SLF4J prints the stack trace.
                logger.error(" Exception page = {} url = {} ", i, url, e);
            }
        }
        return list;
    }

    /**
     * Converts one search result element into a data map.
     *
     * @param element a {@code div.sk-mod} entry of the result list
     * @return the populated map, or {@code null} when the entry's info text
     *         carries no upload-time label and is therefore skipped
     */
    private static Map<String, Object> parseItem(Element element) {
        Elements titleLink = element.select(TITLE_LINK_SELECTOR);
        String title = titleLink.text();
        String surl = titleLink.attr("href");
        String time = element.select("div.mod-main > div.mod-info > p").text();
        if (!time.contains(UPLOAD_TIME_LABEL)) {
            return null;
        }
        Map<String, Object> map = new HashMap<>();
        map.put("title", title);
        // NOTE(review): presumably the href is protocol-relative ("//...") — confirm.
        map.put("url", "https:" + surl);
        map.put("time", time.replaceAll(UPLOAD_TIME_LABEL, "").split(" ")[0]);
        // Drops everything in front of the uploader label, keeping the label itself.
        map.put("uper", time.replace(time.split(UPLOADER_LABEL)[0], ""));
        return map;
    }
}
src/main/java/com/zhiwei/parse/analysis/AiqiyiByWordAnalysis.java
View file @
d979d793
...
...
@@ -13,13 +13,19 @@ import org.jsoup.select.Elements;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Response
;
public
class
AiqiyiByWordAnalysis
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
AiqiyiByWordAnalysis
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
/**
*
* @Description 解析出所有有用链接
...
...
@@ -45,9 +51,9 @@ public class AiqiyiByWordAnalysis {
}
public
Map
<
String
,
Object
>
getAiqiyiData
(
String
url
,
Map
<
String
,
String
>
headerMap
,
Proxy
proxy
)
{
Map
<
String
,
Object
>
dataMap
=
new
HashMap
<
String
,
Object
>();
try
{
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
proxy
,
headerMap
);
Map
<
String
,
Object
>
dataMap
=
new
HashMap
<>();
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
))
{
String
result
=
response
.
body
().
string
(
);
Document
doc
=
Jsoup
.
parse
(
result
);
String
time
=
doc
.
select
(
"#widget-vshort-ptime"
).
text
();
if
(!
time
.
contains
(
"2017"
))
{
...
...
@@ -68,7 +74,7 @@ public class AiqiyiByWordAnalysis {
System
.
out
.
println
(
dataMap
.
toString
());
return
dataMap
;
}
catch
(
Exception
e
)
{
logger
.
error
(
"解析出错
"
,
e
.
getMessage
()
);
logger
.
error
(
"解析出错
{}"
,
e
);
return
dataMap
;
}
}
...
...
src/main/java/com/zhiwei/parse/analysis/BaijiaAccountAnalysis.java
View file @
d979d793
...
...
@@ -14,13 +14,17 @@ import org.slf4j.LoggerFactory;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Response
;
public
class
BaijiaAccountAnalysis
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
BaijiaAccountAnalysis
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
public
Map
<
String
,
Object
>
getBaijiaAccount2Data
(
JSONObject
data
)
{
Map
<
String
,
Object
>
map
=
new
HashMap
<
String
,
Object
>();
...
...
@@ -159,8 +163,8 @@ public class BaijiaAccountAnalysis {
public
String
getBaijiaContent
(
String
url
,
Proxy
proxy
)
{
ZhiWeiTools
.
sleep
(
2000
);
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getBaijiaAccountHeaderMap
(
null
);
try
{
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
proxy
,
headerMap
);
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
))
{
String
result
=
response
.
body
().
string
(
);
Document
document
=
Jsoup
.
parse
(
result
);
return
document
.
select
(
"section.news-content"
).
text
();
}
catch
(
Exception
e
)
{
...
...
src/main/java/com/zhiwei/parse/analysis/DayuByWordAnalysis.java
View file @
d979d793
...
...
@@ -14,13 +14,17 @@ import org.slf4j.LoggerFactory;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Response
;
public
class
DayuByWordAnalysis
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
DayuByWordAnalysis
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
public
List
<
Map
<
String
,
Object
>>
getDayuByWordData
(
String
result
,
Proxy
proxy
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
...
...
@@ -28,7 +32,7 @@ public class DayuByWordAnalysis {
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
JSONArray
jsonArry
=
json
.
getJSONObject
(
"data"
).
getJSONArray
(
"iflowItems"
);
for
(
int
i
=
0
;
i
<
jsonArry
.
size
();
i
++)
{
Map
<
String
,
Object
>
map
=
new
HashMap
<
String
,
Object
>();
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
JSONObject
data
=
jsonArry
.
getJSONObject
(
i
);
map
.
put
(
"title"
,
data
.
getString
(
"title"
).
replaceAll
(
"<.*?>"
,
""
));
String
url
=
data
.
getString
(
"zzd_url"
);
...
...
@@ -42,7 +46,7 @@ public class DayuByWordAnalysis {
}
return
dataList
;
}
catch
(
Exception
e
)
{
logger
.
error
(
"解析出错
"
,
e
.
getMessage
()
);
logger
.
error
(
"解析出错
{}"
,
e
);
return
dataList
;
}
...
...
@@ -51,22 +55,19 @@ public class DayuByWordAnalysis {
public
String
getContent
(
String
url
,
Proxy
proxy
)
{
ZhiWeiTools
.
sleep
(
2000
);
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getDayuCommentHeaderMap
(
null
);
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
proxy
,
headerMap
);
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
)){
String
result
=
response
.
body
().
string
();
Pattern
pat
=
Pattern
.
compile
(
"xissJsonData = (.*);"
);
Matcher
matcher
=
pat
.
matcher
(
result
);
try
{
if
(
matcher
.
find
())
{
String
s
=
matcher
.
group
(
0
);
JSONObject
json
=
JSONObject
.
parseObject
(
s
.
substring
(
15
,
s
.
length
()
-
1
));
String
content
=
json
.
getString
(
"content"
).
replaceAll
(
"<.*?>"
,
""
);
return
content
;
return
json
.
getString
(
"content"
).
replaceAll
(
"<.*?>"
,
""
);
}
return
null
;
}
catch
(
Exception
e
)
{
logger
.
error
(
"解析文本出错"
,
e
.
getMessage
());
System
.
out
.
println
(
result
);
return
null
;
e
.
printStackTrace
();
}
return
null
;
}
}
src/main/java/com/zhiwei/parse/analysis/DayuCommentAnalysis.java
View file @
d979d793
...
...
@@ -10,12 +10,8 @@ import java.util.Map;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
public
class
DayuCommentAnalysis
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
DayuCommentAnalysis
.
class
);
...
...
@@ -33,7 +29,7 @@ public class DayuCommentAnalysis {
JSONObject
json
=
JSONObject
.
parseObject
(
result
).
getJSONObject
(
"data"
).
getJSONObject
(
"comments_map"
);
Map
<
String
,
Object
>
map
=
(
Map
<
String
,
Object
>)
json
;
for
(
Map
.
Entry
<
String
,
Object
>
entry
:
map
.
entrySet
()
)
{
Map
<
String
,
Object
>
dataMap
=
new
HashMap
<
String
,
Object
>();
Map
<
String
,
Object
>
dataMap
=
new
HashMap
<>();
JSONObject
data
=
JSONObject
.
parseObject
(
entry
.
getValue
().
toString
());
dataMap
.
put
(
"content"
,
data
.
getString
(
"content"
));
dataMap
.
put
(
"nickname"
,
data
.
getJSONObject
(
"user"
).
getString
(
"nickname"
));
...
...
@@ -45,78 +41,78 @@ public class DayuCommentAnalysis {
dataMap
.
put
(
"time"
,
TimeParse
.
dateFormartString
(
new
Date
(
time
),
"yyyy-MM-dd HH:mm:ss"
));
int
i
=
data
.
getInteger
(
"reply_cnt"
);
dataMap
.
put
(
"replay_count"
,
i
);
if
(
i
>
0
)
{
dataList
.
addAll
(
getReplayData
(
id
,
articleId
,
proxy
));
}
//
if(i > 0) {
//
dataList.addAll(getReplayData(id,articleId,proxy));
//
}
dataList
.
add
(
dataMap
);
}
return
dataList
;
}
catch
(
Exception
e
)
{
logger
.
error
(
"解析出错
"
,
e
.
getMessage
()
);
logger
.
error
(
"解析出错
{}"
,
e
);
return
dataList
;
}
}
/**
 * Collects the replies posted under one comment by paging through the reply
 * endpoint.
 *
 * <p>The first request uses {@code -1} as the {@code ts} value; every further
 * request uses the value returned by {@code analysisReplayData} for the page
 * just parsed. Paging stops when that value is 0 or has been seen before.
 *
 * @param id        identifier of the comment whose replies are fetched
 * @param articleId identifier of the article the comment belongs to
 * @param proxy     proxy passed through to the HTTP calls
 * @return the replies gathered from all parsed pages
 */
private List<Map<String, Object>> getReplayData(String id, String articleId, Proxy proxy) {
    final Map<String, String> headers = HeadGet.getDayuCommentHeaderMap(null);
    final String baseUrl = "http://m.uczzd.cn/iflow/api/v2/cmt/detail/" + id
            + "/comments?articleId=" + articleId + "&count=10&ts=";
    String page = HttpClient.executeHttpRequestGet(baseUrl + "-1", proxy, headers);
    final List<Map<String, Object>> replies = new ArrayList<Map<String, Object>>();
    final List<String> seenCursors = new ArrayList<String>();

    boolean morePages = true;
    while (morePages) {
        ZhiWeiTools.sleep(2000);
        final long cursor = analysisReplayData(page, replies);
        final String cursorKey = String.valueOf(cursor);
        if (seenCursors.contains(cursorKey)) {
            // Same cursor as an earlier page: stop paging.
            morePages = false;
        } else {
            seenCursors.add(cursorKey);
            if (cursor == 0) {
                morePages = false;
            } else {
                page = HttpClient.executeHttpRequestGet(baseUrl + cursor, proxy, headers);
            }
        }
    }
    System.out.println("=====================评论下回复获取数==" + replies.size());
    return replies;
}
//
/**
//
*
//
* @Description 解析
//
* @param id
//
* @param articleId
//
* @return
//
*/
//
private List<Map<String,Object>> getReplayData(String id,String articleId,Proxy proxy) {
//
Map<String,String> headerMap = HeadGet.getDayuCommentHeaderMap(null);
//
String url = "http://m.uczzd.cn/iflow/api/v2/cmt/detail/"+id+"/comments?articleId="+articleId+"&count=10&ts=";
//
String result = HttpClient.executeHttpRequestGet(url+"-1",proxy, headerMap);
//
List<Map<String,Object>> data = new ArrayList<Map<String,Object>>();
//
List<String> timeList = new ArrayList<String>();
//
while(true) {
//
ZhiWeiTools.sleep(2000);
//
long time = analysisReplayData(result,data);
//
if(timeList.contains(String.valueOf(time))){
//
break;
//
}
//
timeList.add(String.valueOf(time));
//
if(time == 0) {
//
break;
//
}
//
result = HttpClient.executeHttpRequestGet(url+time,proxy, headerMap);
//
}
//
System.out.println("=====================评论下回复获取数=="+data.size());
//
return data;
//
}
/**
 * Parses one page of reply JSON and appends every reply to {@code dataList}.
 *
 * @param result   raw JSON text of one reply page
 * @param dataList list that receives one map per reply (keys {@code content},
 *                 {@code nickname}, {@code like}, {@code id}, {@code url},
 *                 {@code time}, {@code replay_count})
 * @return the {@code timeShow} value of the last reply on the page; 0 when the
 *         page holds no replies or parsing fails (replies added before the
 *         failure stay in {@code dataList})
 */
private long analysisReplayData(String result, List<Map<String, Object>> dataList) {
    long time = 0;
    try {
        JSONObject json = JSONObject.parseObject(result);
        JSONArray jsonArry = json.getJSONObject("data").getJSONArray("replies");
        for (int i = 0; i < jsonArry.size(); i++) {
            Map<String, Object> map = new HashMap<>();
            JSONObject data = jsonArry.getJSONObject(i);
            map.put("content", data.getString("content"));
            map.put("nickname", data.getString("nickname"));
            map.put("like", data.getString("up_cnt"));
            map.put("id", data.getString("commentId"));
            map.put("url", data.getString("shareUrl"));
            time = data.getLong("timeShow");
            map.put("time", TimeParse.dateFormartString(new Date(time), "yyyy-MM-dd HH:mm:ss"));
            map.put("replay_count", data.getInteger("replyCnt"));
            dataList.add(map);
        }
        return time;
    } catch (Exception e) {
        // Pass the throwable itself: the message has no placeholder, so passing
        // e.getMessage() was silently discarded and no stack trace was logged.
        logger.error("获取大鱼号评论出错--回复的", e);
        return 0;
    }
}
//
/**
//
*
//
* @Description 解析
//
* @param result
//
* @param dataList
//
* @return
//
*/
//
private long analysisReplayData(String result,List<Map<String,Object>> dataList) {
//
long time = 0;
//
try {
//
JSONObject json = JSONObject.parseObject(result);
//
JSONArray jsonArry = json.getJSONObject("data").getJSONArray("replies");
//
for(int i = 0; i < jsonArry.size();i++) {
//
Map<String,Object> map = new HashMap<String, Object>();
//
JSONObject data = jsonArry.getJSONObject(i);
//
map.put("content", data.getString("content"));
//
map.put("nickname", data.getString("nickname"));
//
map.put("like", data.getString("up_cnt"));
//
map.put("id", data.getString("commentId"));
//
map.put("url", data.getString("shareUrl"));
//
time = data.getLong("timeShow");
//
map.put("time", TimeParse.dateFormartString(new Date(time), "yyyy-MM-dd HH:mm:ss"));
//
map.put("replay_count", data.getInteger("replyCnt"));
//
dataList.add(map);
//
}
//
return time;
//
} catch (Exception e) {
//
logger.error("获取大鱼号评论出错--回复的",e.getMessage());
//
return 0;
//
}
//
}
...
...
src/main/java/com/zhiwei/parse/analysis/DoubanCommentAnalysis.java
View file @
d979d793
...
...
@@ -10,8 +10,6 @@ import java.util.Map;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
javax.swing.plaf.synth.SynthSpinnerUI
;
import
org.apache.commons.lang3.math.NumberUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
...
...
src/main/java/com/zhiwei/parse/analysis/DouyinHotDataAnalysis.java
View file @
d979d793
...
...
@@ -6,16 +6,12 @@ import java.util.HashMap;
import
java.util.List
;
import
java.util.Map
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.tools.timeparse.TimeParse
;
public
class
DouyinHotDataAnalysis
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
DouyinHotDataAnalysis
.
class
);
public
Map
<
String
,
Object
>
getData
(
String
result
)
{
try
{
...
...
src/main/java/com/zhiwei/parse/analysis/FenghuangAccountAnalysis.java
View file @
d979d793
...
...
@@ -11,12 +11,17 @@ import org.slf4j.LoggerFactory;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Response
;
public
class
FenghuangAccountAnalysis
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
FenghuangAccountAnalysis
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
/**
*
...
...
@@ -31,8 +36,8 @@ public class FenghuangAccountAnalysis {
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getFenghuangAccountHeaderMap
(
null
);
JSONArray
jsonArry
=
null
;
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
try
{
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
proxy
,
headerMap
);
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
))
{
String
result
=
response
.
body
().
string
(
);
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
jsonArry
=
json
.
getJSONObject
(
"data"
).
getJSONObject
(
"feeds"
).
getJSONArray
(
"list"
);
if
(
jsonArry
==
null
||
jsonArry
.
size
()
<
1
)
{
...
...
@@ -83,7 +88,7 @@ public class FenghuangAccountAnalysis {
map
.
put
(
"url"
,
json
.
getString
(
"shareurl"
));
map
.
put
(
"id"
,
json
.
getString
(
"aid"
));
}
catch
(
Exception
e
)
{
logger
.
error
(
"解析具体文章的时候出错
"
,
e
.
getMessage
()
);
logger
.
error
(
"解析具体文章的时候出错
{}"
,
e
);
return
null
;
}
return
map
;
...
...
src/main/java/com/zhiwei/parse/analysis/FenghuangCommentAnalysis.java
View file @
d979d793
...
...
@@ -13,17 +13,22 @@ import org.slf4j.LoggerFactory;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
okhttp3.Response
;
public
class
FenghuangCommentAnalysis
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
FenghuangCommentAnalysis
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
public
Map
<
String
,
Object
>
getFenghuangCommentCount
(
String
url
,
Proxy
proxy
)
{
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
try
{
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
proxy
,
null
);
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
proxy
))
{
String
result
=
response
.
body
().
string
(
);
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
map
.
put
(
"real_count"
,
json
.
getInteger
(
"count"
));
map
.
put
(
"comment_num"
,
json
.
getInteger
(
"join_count"
));
...
...
@@ -44,8 +49,8 @@ public class FenghuangCommentAnalysis {
public
String
getdocUrl
(
String
url
,
Proxy
proxy
)
{
String
docUrl
=
null
;
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
try
{
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
proxy
,
null
);
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
proxy
))
{
String
result
=
response
.
body
().
string
(
);
if
(
result
.
contains
(
"commentUrl\": \""
))
{
docUrl
=
result
.
split
(
"commentUrl\": \""
)[
1
].
split
(
"\","
)[
0
];
break
;
...
...
@@ -76,24 +81,18 @@ public class FenghuangCommentAnalysis {
*/
public
List
<
Map
<
String
,
Object
>>
getData
(
String
url
,
Proxy
proxy
)
{
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getFenghuangCommentHeaderMap
(
null
);
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
String
result
;
try
{
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
proxy
,
headerMap
);
}
catch
(
Exception
e
)
{
logger
.
error
(
"链接获取信息失败"
,
e
.
getMessage
());
return
null
;
}
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
)){
String
result
=
response
.
body
().
string
();
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
JSONArray
jsonArry
=
json
.
getJSONArray
(
"data"
);
try
{
for
(
int
i
=
0
;
i
<
jsonArry
.
size
();
i
++)
{
Map
<
String
,
Object
>
map
=
getcommentData
(
jsonArry
.
getJSONObject
(
i
));
dataList
.
add
(
map
);
}
}
catch
(
Exception
e
)
{
logger
.
error
(
"
获取信息出错"
,
e
.
getMessage
()
);
return
null
;
logger
.
error
(
"
链接获取信息失败"
,
e
);
return
Collections
.
emptyList
()
;
}
return
dataList
;
...
...
@@ -109,22 +108,16 @@ public class FenghuangCommentAnalysis {
*/
public
List
<
Map
<
String
,
Object
>>
getData2
(
String
url
,
Proxy
proxy
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
String
result
;
try
{
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
proxy
,
null
);
}
catch
(
Exception
e
)
{
logger
.
error
(
"链接获取信息失败 {}"
,
e
);
return
Collections
.
emptyList
();
}
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
proxy
)){
String
result
=
response
.
body
().
string
();
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
JSONArray
jsonArry
=
json
.
getJSONArray
(
"comments"
);
try
{
for
(
int
i
=
0
;
i
<
jsonArry
.
size
();
i
++)
{
Map
<
String
,
Object
>
map
=
getcommentData2
(
jsonArry
.
getJSONObject
(
i
));
dataList
.
add
(
map
);
}
}
catch
(
Exception
e
)
{
logger
.
error
(
"
获取信息出错
{}"
,
e
);
logger
.
error
(
"
链接获取信息失败
{}"
,
e
);
return
Collections
.
emptyList
();
}
return
dataList
;
...
...
@@ -154,7 +147,7 @@ public class FenghuangCommentAnalysis {
* @return
*/
private
Map
<
String
,
Object
>
getcommentData
(
JSONObject
json
)
{
Map
<
String
,
Object
>
map
=
new
HashMap
<
String
,
Object
>();
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
try
{
JSONObject
data
=
json
.
getJSONObject
(
"data"
);
map
.
put
(
"nickname"
,
json
.
getString
(
"nickname"
));
...
...
@@ -169,7 +162,7 @@ public class FenghuangCommentAnalysis {
long
time
=
data
.
getLong
(
"add_time"
)
*
1000
;
map
.
put
(
"time"
,
TimeParse
.
dateFormartString
(
new
Date
(
time
),
"yyyy-MM-dd HH:mm:ss"
));
}
catch
(
Exception
e
)
{
logger
.
error
(
"具体解析一条数据出错
"
,
e
.
getMessage
()
);
logger
.
error
(
"具体解析一条数据出错
{}"
,
e
);
return
null
;
}
return
map
;
...
...
src/main/java/com/zhiwei/parse/analysis/MaimaiBywordAnalysis.java
View file @
d979d793
...
...
@@ -5,15 +5,11 @@ import java.util.HashMap;
import
java.util.List
;
import
java.util.Map
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
public
class
MaimaiBywordAnalysis
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
MaimaiBywordAnalysis
.
class
);
public
Map
<
String
,
Object
>
getData
(
String
result
,
String
time
)
{
Map
<
String
,
Object
>
map1
=
new
HashMap
<
String
,
Object
>();
...
...
@@ -38,7 +34,7 @@ public class MaimaiBywordAnalysis {
map
.
put
(
"like"
,
data
.
getJSONObject
(
"feed"
).
getInteger
(
"likes"
));
map
.
put
(
"comment_count"
,
data
.
getJSONObject
(
"feed"
).
getInteger
(
"total_cnt"
));
map
.
put
(
"spreads"
,
data
.
getJSONObject
(
"feed"
).
getInteger
(
"spreads"
));
//传播数
System
.
out
.
println
(
map
.
toString
());
//
System.out.println(map.toString());
dataList
.
add
(
map
);
}
map1
.
put
(
"data"
,
dataList
);
...
...
@@ -69,7 +65,7 @@ public class MaimaiBywordAnalysis {
map
.
put
(
"like"
,
data
.
getJSONObject
(
"gossip"
).
getInteger
(
"likes"
));
map
.
put
(
"comment_count"
,
data
.
getJSONObject
(
"gossip"
).
getInteger
(
"total_cnt"
));
map
.
put
(
"spreads"
,
data
.
getJSONObject
(
"gossip"
).
getInteger
(
"search_order"
));
//传播数
System
.
out
.
println
(
map
.
toString
());
//
System.out.println(map.toString());
dataList
.
add
(
map
);
}
map1
.
put
(
"data"
,
dataList
);
...
...
src/main/java/com/zhiwei/parse/analysis/MeipaiByWordAnalysis.java
View file @
d979d793
...
...
@@ -19,7 +19,6 @@ import com.zhiwei.util.TimeUtil;
public
class
MeipaiByWordAnalysis
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
MeipaiByWordAnalysis
.
class
);
/**
*
* @Description 解析此页
...
...
src/main/java/com/zhiwei/parse/analysis/QQKBCommentAnalysis.java
View file @
d979d793
...
...
@@ -20,7 +20,6 @@ import com.zhiwei.tools.tools.ZhiWeiTools;
public
class
QQKBCommentAnalysis
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
QQKBCommentAnalysis
.
class
);
/**
*
* @Description 获取post信息
...
...
@@ -37,10 +36,9 @@ public class QQKBCommentAnalysis {
JSONObject
data
=
jsonArry
.
getJSONArray
(
jsonArry
.
size
()-
1
).
getJSONObject
(
0
);
String
coral_scorem
=
data
.
getString
(
"coral_score"
);
String
reply_id
=
data
.
getString
(
"reply_id"
);
Map
<
String
,
Object
>
paMap
=
HeadGet
.
getQQKBCommentParamMap2
(
comment_id
,
page
,
coral_scorem
,
article_id
,
reply_id
);
return
paMap
;
return
HeadGet
.
getQQKBCommentParamMap2
(
comment_id
,
page
,
coral_scorem
,
article_id
,
reply_id
);
}
catch
(
Exception
e
)
{
logger
.
error
(
"构造post请求信息失败
"
,
e
.
getMessage
()
);
logger
.
error
(
"构造post请求信息失败
{}"
,
e
);
return
null
;
}
}
...
...
@@ -52,13 +50,13 @@ public class QQKBCommentAnalysis {
* @return
*/
public
List
<
Map
<
String
,
Object
>>
getCommentData
(
String
result
,
String
cookie
,
String
comment_id
,
String
article_id
,
Proxy
proxy
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>
>();
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
try
{
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
JSONArray
jsonArry
=
json
.
getJSONObject
(
"comments"
).
getJSONArray
(
"hot"
);
for
(
int
i
=
0
;
i
<
jsonArry
.
size
()
;
i
++)
{
JSONObject
data
=
jsonArry
.
getJSONArray
(
i
).
getJSONObject
(
0
);
Map
<
String
,
Object
>
map
=
new
HashMap
<
String
,
Object
>();
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
map
.
put
(
"content"
,
data
.
getString
(
"reply_content"
));
map
.
put
(
"time"
,
TimeParse
.
dateFormartString
(
new
Date
(
Long
.
valueOf
(
data
.
getString
(
"tipstime"
))
*
1000L
),
"yyyy-MM-dd HH:mm:ss"
));
map
.
put
(
"name"
,
data
.
getString
(
"nick"
));
...
...
@@ -66,12 +64,11 @@ public class QQKBCommentAnalysis {
int
replay_num
=
0
;
String
reply_id
=
data
.
getString
(
"reply_id"
);
if
(
data
.
toString
().
contains
(
"reply_num"
))
{
replay_num
=
data
.
getInteger
(
"reply_num"
);
List
<
Map
<
String
,
Object
>>
lists
=
getReplyCommentData
(
cookie
,
reply_id
,
comment_id
,
article_id
,
proxy
);
if
(
lists
!=
null
&&
lists
.
size
()
>
0
)
{
dataList
.
addAll
(
lists
);
}
map
.
put
(
"reply_num"
,
replay_num
);
// replay_num = data.getInteger("reply_num");
// List<Map<String,Object>> lists = getReplyCommentData(cookie,reply_id,comment_id, article_id,proxy);
// if(lists != null && lists.size() > 0) {
// dataList.addAll(lists);
// }
}
map
.
put
(
"reply_id"
,
reply_id
);
map
.
put
(
"reply_num"
,
replay_num
);
...
...
@@ -80,7 +77,7 @@ public class QQKBCommentAnalysis {
}
return
dataList
;
}
catch
(
Exception
e
)
{
logger
.
error
(
"解析数据出错
"
,
e
.
getMessage
()
);
logger
.
error
(
"解析数据出错
{}"
,
e
);
return
dataList
;
}
...
...
@@ -93,7 +90,7 @@ public class QQKBCommentAnalysis {
* @return
*/
public
Map
<
String
,
Object
>
getOneReplyComment
(
JSONObject
data
)
{
Map
<
String
,
Object
>
map
=
new
HashMap
<
String
,
Object
>();
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
try
{
map
.
put
(
"content"
,
data
.
getString
(
"reply_content"
));
map
.
put
(
"time"
,
TimeParse
.
dateFormartString
(
new
Date
(
Long
.
valueOf
(
data
.
getString
(
"tipstime"
))
*
1000L
),
"yyyy-MM-dd HH:mm:ss"
));
...
...
@@ -103,13 +100,13 @@ public class QQKBCommentAnalysis {
System
.
out
.
println
(
map
.
toString
());
return
map
;
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取单个回复评论出错
"
,
e
.
getMessage
()
);
logger
.
error
(
"获取单个回复评论出错
{}"
,
e
);
return
null
;
}
}
public
List
<
Map
<
String
,
Object
>>
getReplyCommentData
(
String
cookie
,
String
reply_id
,
String
comment_id
,
String
article_id
,
Proxy
proxy
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>
>();
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getQQKBCommentHeaderMap
(
cookie
);
try
{
String
old_reply_id
=
""
;
...
...
src/main/java/com/zhiwei/parse/analysis/QicheHomeKwyWordAnalysis.java
View file @
d979d793
...
...
@@ -6,16 +6,12 @@ import java.util.HashMap;
import
java.util.List
;
import
java.util.Map
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.tools.timeparse.TimeParse
;
public
class
QicheHomeKwyWordAnalysis
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
QicheHomeKwyWordAnalysis
.
class
);
public
List
<
Map
<
String
,
Object
>>
getData
(
String
result
)
{
try
{
...
...
src/main/java/com/zhiwei/parse/analysis/WangyiHistoryAnalysis.java
View file @
d979d793
...
...
@@ -21,15 +21,16 @@ public class WangyiHistoryAnalysis {
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
WangyiHistoryAnalysis
.
class
);
public
List
<
Map
<
String
,
Object
>>
getData
(
String
result
,
Proxy
proxy
,
String
endTime
,
String
source
)
{
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
JSONArray
jsonArry
=
json
.
getJSONObject
(
"data"
).
getJSONArray
(
"list"
);
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getWangyiHistoryHeaderMap
(
null
);
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>
>();
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
for
(
int
i
=
0
;
i
<
jsonArry
.
size
();
i
++)
{
try
{
JSONObject
data
=
jsonArry
.
getJSONObject
(
i
);
Map
<
String
,
Object
>
map
=
new
HashMap
<
String
,
Object
>();
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
map
.
put
(
"title"
,
data
.
getString
(
"title"
));
if
(
endTime
!=
null
&&
endTime
.
length
()
>
1
)
{
if
(
data
.
getString
(
"ptime"
).
compareTo
(
endTime
)
<=
0
)
{
...
...
src/main/resources/log4j.properties
View file @
d979d793
...
...
@@ -4,7 +4,7 @@
log4j.appender.stdout.layout.ConversionPattern
=
<%d>[%5p] %c - %m%n
log4j.appender.ROLLING_FILE
=
org.apache.log4j.DailyRollingFileAppender
log4j.appender.ROLLING_FILE.Threshold
=
stdout
log4j.appender.ROLLING_FILE.File
=
./Log/
wechatcrawler
.log
log4j.appender.ROLLING_FILE.File
=
./Log/
artivleData
.log
log4j.appender.ROLLING_FILE.Append
=
true
log4j.appender.ROLLING_FILE.layout
=
org.apache.log4j.PatternLayout
log4j.appender.ROLLING_FILE.layout.ConversionPattern
=
<%d>[%5p] %c - %m%n
\ No newline at end of file
src/test/java/com/zhiwei/Comment/ChejiaCommentCountTest.java
0 → 100644
View file @
d979d793
//package com.zhiwei.Comment;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.testng.annotations.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Chejia;
//import com.zhiwei.tools.tools.ZhiWeiTools;
//
//public class ChejiaCommentCountTest {
// @Test
// public void f() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",
// GroupType.PROVIDER);
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
// Map<String, Object> map = poi
// .importExcel("D://crawlerdata//自媒体/车家号.xlsx", 0);
// List<Map<String, Object>> list = (List<Map<String, Object>>) map.get("body");
// List<Map<String, Object>> bodyList = new ArrayList<Map<String, Object>>();
// List<String> headList = (List<String>) map.get("head");
// for (Map<String, Object> map1 : list) {
// String url = map1.get("地址") + "";
//// url = "https://chejiahao.autohome.com.cn/info/3073188#reply";
// System.out.println(url);
// Chejia.getChejiaComment(url, ProxyFactory.getNatProxy());
//// int i = Chejia.getChejiaCommentCount(url, ProxyFactory.getNatProxy());
//// System.out.println(i);
//// map1.put("count", i);
// ZhiWeiTools.sleep(100);
// }
// headList.add("count");
// poi.exportExcel("D://crawlerdata//自媒体/车家号.xlsx", "评论采集", headList,
// list);
//
// }
//}
src/test/java/com/zhiwei/Comment/MaimaiCommentCountTest.java
0 → 100644
View file @
d979d793
//package com.zhiwei.Comment;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.testng.annotations.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Maimai;
//import com.zhiwei.parse.Yiche;
//import com.zhiwei.tools.tools.ZhiWeiTools;
//
//public class MaimaiCommentCountTest {
// @Test
// public void f() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",
// GroupType.PROVIDER);
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
// Map<String, Object> map = poi
// .importExcel("C:\\Users\\byte-zbs\\Desktop\\tx\\脉脉#美团 裁员#汇总截至12月20日10点30分.xlsx(1).xlsx", 0);
// List<Map<String, Object>> list = (List<Map<String, Object>>) map.get("body");
// List<Map<String, Object>> bodyList = new ArrayList<Map<String, Object>>();
// List<String> headList = (List<String>) map.get("head");
// for (Map<String, Object> map1 : list) {
// String url = map1.get("地址") + "";
// Map<String,Object> map3 = Maimai.getMaiaiCount(url, ProxyFactory.getNatProxy());
// map1.putAll(map3);
// ZhiWeiTools.sleep(100);
// }
// headList.add("like");
// headList.add("spreads");
// headList.add("cmts");
// poi.exportExcel("C:\\Users\\byte-zbs\\Desktop\\tx\\脉脉#美团 裁员#汇总截至12月20日10点30分.xlsx(1).xlsx", "评论采集", headList,
// list);
// }
//}
src/test/java/com/zhiwei/Comment/XueqiuCommentCountTest.java
0 → 100644
View file @
d979d793
//package com.zhiwei.Comment;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//import java.util.Objects;
//
//import org.testng.annotations.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Xueqiu;
//import com.zhiwei.tools.tools.ZhiWeiTools;
//
//public class XueqiuCommentCountTest {
// @Test
// public void f() {
//
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",
// GroupType.PROVIDER);
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
// Map<String, Object> map = poi
// .importExcel("C:\\Users\\byte-zbs\\Desktop\\tx\\雪球-腾讯.xlsx", 0);
// List<Map<String, Object>> list = (List<Map<String, Object>>) map.get("body");
// List<Map<String, Object>> bodyList = new ArrayList<Map<String, Object>>();
// List<String> headList = (List<String>) map.get("head");
// for (Map<String, Object> map1 : list) {
// for(int i = 1;i < 5;i++) {
// String url = map1.get("地址") + "";
// Map<String,Object> map3 = Xueqiu.getUrlData(url, ProxyFactory.getNatProxy());
// ZhiWeiTools.sleep(100);
// if(Objects.nonNull(map3)) {
// System.out.println(map3.toString());
// map1.putAll(map3);
// break;
// }
// }
// }
// headList.add("like");
// headList.add("repostCount");
// headList.add("commentCount");
// poi.exportExcel("C:\\Users\\byte-zbs\\Desktop\\tx\\雪球-腾讯.xlsx", "评论数采集", headList,
// list);
//
// }
//}
src/test/java/com/zhiwei/crawler/AiqiyiByWordExample.java
View file @
d979d793
package
com
.
zhiwei
.
crawler
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Aiqiyi
;
/**
 * Manual example: runs {@code Aiqiyi.getAiqiyiByWordData} for a fixed set of keywords and
 * writes all returned rows into a single Excel sheet.
 */
public class AiqiyiByWordExample {

    /**
     * Splits the comma-separated keyword string, collects the non-empty result list of every
     * keyword, and exports the merged rows through {@code PoiExcelUtil}.
     */
    @Test
    public void aiqiyiByWordTest() {
        String keywordCsv = "美食,味道,菜";
        List<Map<String, Object>> rows = new ArrayList<>();
        for (String keyword : keywordCsv.split(",")) {
            List<Map<String, Object>> found = Aiqiyi.getAiqiyiByWordData(keyword, null);
            // Skip keywords for which the crawler returned nothing (null or empty list).
            if (found == null || found.isEmpty()) {
                continue;
            }
            rows.addAll(found);
        }

        // Column headers, in the order they are handed to the exporter.
        List<String> columns = new ArrayList<>();
        for (String column : new String[] {"count", "time", "source", "content", "url", "title"}) {
            columns.add(column);
        }

        PoiExcelUtil excel = PoiExcelUtil.getInstance();
        excel.exportExcel("D://crawlerdata/爱奇艺关键词采集.xlsx", "数据", columns, rows);
    }
}
//
package com.zhiwei.crawler;
//
//
import java.util.ArrayList;
//
import java.util.List;
//
import java.util.Map;
//
//
import org.junit.Test;
//
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.Aiqiyi;
//
//
public class AiqiyiByWordExample {
//
//
//
@Test
//
public void aiqiyiByWordTest() {
//
String word = "美食,味道,菜";
//
String[] words = word.split(",");
//
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
//
for(String w : words) {
//
List<Map<String,Object>> dataList = Aiqiyi.getAiqiyiByWordData(w,null);
//
if(dataList != null && dataList.size() >= 1) {
//
bodyList.addAll(dataList);
//
}
//
}
//
List<String> headList = new ArrayList<String>();
//
headList.add("count");
//
headList.add("time");
//
headList.add("source");
//
headList.add("content");
//
headList.add("url");
//
headList.add("title");
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
poi.exportExcel("D://crawlerdata/爱奇艺关键词采集.xlsx", "数据", headList, bodyList);
//
//
//
//
}
//
//
//
//
}
src/test/java/com/zhiwei/crawler/BaijiaAccountExample.java
View file @
d979d793
package
com
.
zhiwei
.
crawler
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Baijia
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
/**
 * Manual examples that pull Baijia account articles through {@code Baijia} and export the
 * returned rows to Excel files via {@code PoiExcelUtil}.
 *
 * <p>Only {@link #test3()} carries an active {@code @Test} annotation; the other two methods
 * have theirs commented out.
 */
public class BaijiaAccountExample {

    /**
     * Exports the rows returned by {@code Baijia.getBaijiaAccountData} for one hard-coded
     * account id.
     */
    // @Test
    public void baijiaAccountTest() {
        String app_id = "1536766276004443";
        String startTime = "2015-01-01 00:00:00";
        //2017-11-30 17:48:17
        List<Map<String, Object>> lists = Baijia.getBaijiaAccountData(app_id, startTime, null);
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        List<String> headList =
                columns("title", "time", "read_amount", "app_id", "source", "url", "content");
        poi.exportExcel("D://crawlerdata/百家号-马继华.xlsx", "马继华", headList, lists);
    }

    /**
     * Calls {@code Baijia.getBaijiaAccount2Data} for several hard-coded account ids, pausing
     * between requests, and exports the merged rows.
     */
    // @Test
    public void baijiaAccount2Test() {
        String startTime = "2018-01-01 00:00:00";
        //2017-11-30 17:48:17
        List<String> idList = columns(
                "b_1548519002063358",
                "b_1536766292852334",
                "b_1536766781763274",
                "b_1536766200338498");
        List<Map<String, Object>> bodyList = new ArrayList<>();
        for (String id : idList) {
            ZhiWeiTools.sleep(5000);
            List<Map<String, Object>> lists = Baijia.getBaijiaAccount2Data(id, startTime, null);
            // Same null guard as test3(): a null result must not abort the remaining ids.
            if (lists != null) {
                bodyList.addAll(lists);
            }
        }
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        List<String> headList = columns("title", "time", "source", "url", "content");
        poi.exportExcel("D://crawlerdata//自媒体/百家号-all.xlsx", "科学的fan", headList, bodyList);
    }

    /**
     * Reads account rows from an input workbook, calls
     * {@code Baijia.getBaijiaAccountByBaiduData} once per row, and exports the merged result.
     * A row that fails is reported on stderr and skipped.
     */
    @Test
    public void test3() {
        String path = "D://crawlerdata//自媒体/百家号采集.xlsx";
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        String startTime = "2018-05-01 00:00:00";
        String cookie = "__cfduid=d847baca85b97d1967b3da02ebb345b831535524251; BAIDUID=C0F0F81EF770C5219AB9C178654135EC:FG=1; PSTM=1536376257; BIDUPSID=250CCE0442BEBCB3568D8EC515953434; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; locale=zh; delPer=0; H_PS_PSSID=1447_21117_20930; PSINO=5";
        Map<String, Object> map = poi.importExcel(path, 0);
        @SuppressWarnings("unchecked") // the "body" entry is consumed as a list of row maps
        List<Map<String, Object>> list = (List<Map<String, Object>>) map.get("body");
        List<Map<String, Object>> bodyList = new ArrayList<>();
        for (Map<String, Object> m : list) {
            try {
                String app_id = m.get("id").toString();
                // NOTE(review): the id read from the sheet is immediately replaced by this
                // hard-coded value - looks like a debugging leftover; confirm before relying on it.
                app_id = "1594158489045754";
                String name = m.get("name").toString();
                List<Map<String, Object>> lists =
                        Baijia.getBaijiaAccountByBaiduData(app_id, name, startTime, cookie, null);
                if (lists != null) {
                    bodyList.addAll(lists);
                }
            } catch (Exception e) {
                // Best effort per row: keep going, but make the failure visible instead of
                // silently dropping the row.
                System.err.println("Skipping row " + m + ": " + e);
            }
        }
        List<String> headList =
                columns("title", "time", "source", "url", "content", "read_amount");
        poi.exportExcel("D://crawlerdata//自媒体/百家号-lxj-2.xlsx", "娱乐资本论", headList, bodyList);
    }

    /** Builds a mutable list holding the given values in order. */
    private static List<String> columns(String... names) {
        List<String> result = new ArrayList<String>(names.length);
        for (String name : names) {
            result.add(name);
        }
        return result;
    }
}
// public void test3() {
// String path = "D://crawlerdata//自媒体/百家号采集.xlsx";
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// String startTime = "2018-05-01 00:00:00";
// Map<String,Object> map = poi.importExcel(path, 0);
// List<Map<String,Object>> list = (List<Map<String, Object>>) map.get("body");
// List<Map<String,Object>> bodyList = new ArrayList<>();
// for(Map<String,Object> m : list) {
// try {
// String app_id = m.get("id").toString();
// app_id = "1594158489045754";
// String name = m.get("name").toString();
// String cookie = "__cfduid=d847baca85b97d1967b3da02ebb345b831535524251; BAIDUID=C0F0F81EF770C5219AB9C178654135EC:FG=1; PSTM=1536376257; BIDUPSID=250CCE0442BEBCB3568D8EC515953434; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; locale=zh; delPer=0; H_PS_PSSID=1447_21117_20930; PSINO=5";
// List<Map<String,Object>> lists = Baijia.getBaijiaAccountByBaiduData(app_id,name, startTime,cookie, null);
// if(lists != null) {
// bodyList.addAll(lists);
// }
// } catch (Exception e) {
// }
// }
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("time");
// headList.add("source");
// headList.add("url");
// headList.add("content");
// headList.add("read_amount");
// poi.exportExcel("D://crawlerdata//自媒体/百家号-lxj-2.xlsx", "娱乐资本论", headList, bodyList);
// }
//
//}
src/test/java/com/zhiwei/crawler/DayuAccountExample.java
View file @
d979d793
package
com
.
zhiwei
.
crawler
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Dayu
;
/**
 * Manual example: reads account rows from a workbook and, for each row, writes the result of
 * {@code Dayu.getDayuAccountData} back into the same workbook as a sheet named after the account.
 */
public class DayuAccountExample {

    /**
     * Iterates the "body" rows of the input workbook, taking the "mid" and "name" cells of each
     * row as the arguments for {@code Dayu.getDayuAccountData}.
     */
    @Test
    public void dayuAccountTest() {
        //https://api.m.sm.cn/rest?method=Subscribe.list&format=html&from=wh10331&uc_biz_str=S:custom%7CC:search%7CN:true
        // String mid = "d7300311c1504d24a229c3da345785c6";
        // String name = "大鱼海棠雨";
        String startTime = "2017-01-01 00:00:00";
        String path = "D:\\crawlerdata\\自媒体\\大鱼历史文章.xlsx";
        PoiExcelUtil excel = PoiExcelUtil.getInstance();
        Map<String, Object> workbook = excel.importExcel(path, 0);
        List<Map<String, Object>> accounts = (List<Map<String, Object>>) workbook.get("body");

        List<String> columns = new ArrayList<String>();
        for (String column : new String[] {"title", "time", "content", "source", "url"}) {
            columns.add(column);
        }
        // columns.add("content_id");
        // columns.add("origin_id");
        // columns.add("xss_item_id");

        for (Map<String, Object> account : accounts) {
            // String.valueOf turns a missing cell into the text "null", same as `+ ""`.
            String mid = String.valueOf(account.get("mid"));
            String name = String.valueOf(account.get("name"));
            // A row is processed unless both values are empty strings.
            if (!mid.isEmpty() || !name.isEmpty()) {
                List<Map<String, Object>> articles = Dayu.getDayuAccountData(mid, name, null, null);
                excel.exportExcel(path, name, columns, articles);
            }
        }
    }
}
//
package com.zhiwei.crawler;
//
//
import java.util.ArrayList;
//
import java.util.List;
//
import java.util.Map;
//
//
import org.junit.Test;
//
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.Dayu;
//
//
public class DayuAccountExample {
//
//
//
@Test
//
public void dayuAccountTest() {
//
//https://api.m.sm.cn/rest?method=Subscribe.list&format=html&from=wh10331&uc_biz_str=S:custom%7CC:search%7CN:true
//
//
//
//
String mid = "d7300311c1504d24a229c3da345785c6";
//
//
String name = "大鱼海棠雨";
//
String startTime = "2017-01-01 00:00:00";
//
String path = "D:\\crawlerdata\\自媒体\\大鱼历史文章.xlsx";
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
Map<String,Object> map = poi.importExcel(path, 0);
//
List<Map<String,Object>> lists = (List<Map<String, Object>>) map.get("body");
//
List<String> headList = new ArrayList<String>();
//
headList.add("title");
//
headList.add("time");
//
headList.add("content");
//
headList.add("source");
//
headList.add("url");
//
//
headList.add("content_id");
//
//
headList.add("origin_id");
//
//
headList.add("xss_item_id");
//
for(Map<String,Object> data : lists) {
//
String mid = data.get("mid")+"";
//
String name = data.get("name")+"";
//
if(mid.length() < 1 && name.length() < 1) {
//
continue;
//
}
//
List<Map<String,Object>> dataList = Dayu.getDayuAccountData(mid,name,null,null);
//
poi.exportExcel(path, name, headList, dataList);
//
}
//
//
//
}
//
//
//
}
src/test/java/com/zhiwei/crawler/DayuByWordExample.java
View file @
d979d793
package
com
.
zhiwei
.
crawler
;
import
java.util.List
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.parse.Dayu
;
/**
 * Manual example: prints how many rows {@code Dayu.getDayuByWordData} returns for one keyword.
 */
public class DayuByWordExample {

    /** Runs the keyword query for "11" and prints the size of the returned list. */
    @Test
    public void dayuByWordTest() {
        List<Map<String, Object>> hits = Dayu.getDayuByWordData("11", null);
        int hitCount = hits.size();
        System.out.println(hitCount);
    }
}
//
package com.zhiwei.crawler;
//
//
import java.util.List;
//
import java.util.Map;
//
//
import org.junit.Test;
//
//
import com.zhiwei.parse.Dayu;
//
//
public class DayuByWordExample {
//
//
//
@Test
//
public void dayuByWordTest() {
//
String word = "11";
//
//
List<Map<String,Object>> dataList = Dayu.getDayuByWordData(word,null);
//
//
System.out.println(dataList.size());
//
//
//
}
//
//
//
}
src/test/java/com/zhiwei/crawler/MaimaiBywordExample.java
View file @
d979d793
...
...
@@ -10,15 +10,18 @@ import com.zhiwei.parse.Maimai;
public
class
MaimaiBywordExample
{
public
static
void
main
(
String
[]
args
)
{
String
word
=
"美团
晋升
"
;
String
cookie
=
"
sessionid=y87knknqrc3fi6xto2zv0s4kugmleepk; guid=GxsfBBgZGwQYGx4EGBkeVgcYGx4fHhwcGhgbVhwZBB0ZHwVDWEtMS3kKGhobBB0THhkEGgQTHAVPR0VYQmkKA0VBSU9tCk9BQ0YKBmZnfmJhAgocGQQdGR8FXkNhSE99T0ZaWmsKAx4cfWV9ChEZBBwKfmQKWV1FTkRDfQIKGgQfBUtGRkNQRWc=; seid=s1539933372113; token=\"ZTjnEij9jsL4ZCdnKF2CaUAwcJHgcem/zHvAbXp3MXdY+uSPva8scjbe2zHl2gE98CKuzcDfAvoCmBm7+jVysA==\"; uid=\"aa0Zx+VbwC41ceG8bxvIefAirs3A3wL6ApgZu/o1crA=\"; session=eyJ1IjoiMTczMzMzNTM2Iiwic2VjcmV0IjoiSFVMLVhKb2g5TkJGNHRJanljUW5Qa1V5IiwiX2V4cGlyZSI6MTU0MDAxOTc5MTUwNSwiX21heEFnZSI6ODY0MDAwMDB9; session.sig=dJmy52LHX-stqroAbm66u2zJaZA
"
;
String
time
=
"201
8-10-15
00:00:00"
;
String
word
=
"美团
|某团|MT|大众点评|新美大|美团点评
"
;
String
cookie
=
"
guid=GxsfBBgZGwQYGx4EGBkeVhsfGB4aHBpWHBkEHRkfBUNYS0xLeQoSEwQSHR8ZBBoEGx0FT0dFWEJpCgNFQUlPbQpPQUNGCgZmZ35iYQIKHBkEHRkfBV5DYUhPfU9GWlprCgMeHH1lfQoRGQQcCn5kClldRU5EQ30CChoEHwVLRkZDUEVn; token=\"7IGuqjEwgJ2gXX5PZ0UYSxvn81Aws6v5OFrwpSErsbctlSd1e/7+AzYEMMMeeFJJ8CKuzcDfAvoCmBm7+jVysA==\"; uid=\"aa0Zx+VbwC41ceG8bxvIefAirs3A3wL6ApgZu/o1crA=\"; _buuid=ba30f54f-57ed-4dd4-af5f-31cb08d2eacf; session=eyJ1IjoiMTczMzMzNTM2Iiwic2VjcmV0IjoiOGtDSnF6VG5QcFk0R3ZmVFB4MThIMW1ZIiwiX2V4cGlyZSI6MTU0ODMwODU0MTMyNCwiX21heEFnZSI6ODY0MDAwMDB9; session.sig=cnQ0i1LwYxhjO3_BvQ4Coh0f9PQ
"
;
String
time
=
"201
9-01-17
00:00:00"
;
String
[]
words
=
word
.
split
(
"\\|"
);
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
for
(
String
w
:
words
)
{
List
<
Map
<
String
,
Object
>>
c
=
Maimai
.
getData
(
w
,
cookie
,
time
,
null
);
// List<Map<String,Object>> c = Maimai.getDataByNoName(w, cookie, time, null);
bodyList
.
addAll
(
c
);
//实名动态
// List<Map<String,Object>> c = Maimai.getData(w, cookie, time, null);
//职言交流
List
<
Map
<
String
,
Object
>>
c2
=
Maimai
.
getDataByNoName
(
w
,
cookie
,
time
,
null
);
// bodyList.addAll(c);
bodyList
.
addAll
(
c2
);
}
List
<
String
>
headList
=
new
ArrayList
<
String
>();
headList
.
add
(
"time"
);
...
...
@@ -29,7 +32,7 @@ public class MaimaiBywordExample {
headList
.
add
(
"comment_count"
);
headList
.
add
(
"spreads"
);
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
poi
.
exportExcel
(
"D:\\crawlerdata\\自媒体\\脉脉关键词采集-美团
晋升-1015
.xlsx"
,
"脉脉关键词"
,
headList
,
bodyList
);
poi
.
exportExcel
(
"D:\\crawlerdata\\自媒体\\脉脉关键词采集-美团
-0123
.xlsx"
,
"脉脉关键词"
,
headList
,
bodyList
);
}
}
src/test/java/com/zhiwei/crawler/PearVideoByWordExample.java
View file @
d979d793
...
...
@@ -13,7 +13,7 @@ public class PearVideoByWordExample {
@Test
public
void
pearVideoByWordTest
()
{
String
word
=
"
美食
"
;
String
word
=
"
大宝 甲醛
"
;
List
<
Map
<
String
,
Object
>>
bodyList
=
PearVideo
.
getPearVideoData
(
word
,
null
);
List
<
String
>
headList
=
new
ArrayList
<
String
>();
...
...
src/test/java/com/zhiwei/crawler/QQKBCommentExample.java
View file @
d979d793
...
...
@@ -6,6 +6,8 @@ import java.util.Map;
import
org.junit.Test
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.QQKB
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
...
...
@@ -18,7 +20,8 @@ public class QQKBCommentExample {
String
url
=
"https://kuaibao.qq.com/s/20181122A11WQB00"
;
//https://kuaibao.qq.com/s/20180423A1PI7400?refer=kb_news
// https://kuaibao.qq.com/s/20180423A0L60800?refer=kb_news
ProxyFactory
.
init
(
"zookeeper://192.168.0.36:2181"
,
"local"
,
GroupType
.
PROVIDER
);
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
Map
<
String
,
Object
>
map
=
poi
.
importExcel
(
"D://crawlerdata//自媒体/快报评论采集.xlsx"
,
0
);
List
<
Map
<
String
,
Object
>>
list
=
(
List
<
Map
<
String
,
Object
>>)
map
.
get
(
"body"
);
...
...
@@ -40,7 +43,7 @@ public class QQKBCommentExample {
headList
.
add
(
"time"
);
//时间
headList
.
add
(
"content"
);
//内容
System
.
out
.
println
(
bodyList
.
size
());
poi
.
exportExcel
(
"D:\\crawlerdata\\自媒体\\快报评论采集
-zhj
.xlsx"
,
"sada"
,
headList
,
bodyList
);
poi
.
exportExcel
(
"D:\\crawlerdata\\自媒体\\快报评论采集.xlsx"
,
"sada"
,
headList
,
bodyList
);
}
...
...
src/test/java/com/zhiwei/crawler/SouhuCommentCountExample.java
View file @
d979d793
package
com
.
zhiwei
.
crawler
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Souhu
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
public
class
SouhuCommentCountExample
{
@Test
public
void
souhuCommentCountTest
()
{
String
url
=
"http://www.sohu.com/a/281414426_133392"
;
int
i
=
Souhu
.
getSouhuCommentCount
(
url
,
null
);
System
.
out
.
println
(
i
);
ProxyFactory
.
init
(
"zookeeper://192.168.0.36:2181"
,
"local"
,
GroupType
.
PROVIDER
);
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
Map
<
String
,
Object
>
map
=
poi
.
importExcel
(
"D://crawlerdata//自媒体//搜狐评论采集.xlsx"
,
0
);
List
<
Map
<
String
,
Object
>>
list
=
(
List
<
Map
<
String
,
Object
>>)
map
.
get
(
"body"
);
List
<
String
>
headList
=
(
List
<
String
>)
map
.
get
(
"head"
);
for
(
Map
<
String
,
Object
>
map1
:
list
)
{
String
url
=
""
;
try
{
url
=
map1
.
get
(
"url"
)+
""
;
System
.
out
.
println
(
url
);
int
i
=
Souhu
.
getSouhuCommentCount
(
url
,
ProxyFactory
.
getNatProxy
());
map1
.
put
(
"count"
,
i
);
System
.
out
.
println
(
map1
.
toString
());
}
catch
(
Exception
e
)
{
System
.
out
.
println
(
url
);
e
.
printStackTrace
();
continue
;
}
}
headList
.
add
(
"count"
);
poi
.
exportExcel
(
"D://crawlerdata//自媒体//搜狐评论采集.xlsx"
,
"sheet2"
,
headList
,
list
);
}
...
...
src/test/java/com/zhiwei/crawler/SouhuCommentExample.java
View file @
d979d793
...
...
@@ -6,6 +6,8 @@ import java.util.Map;
import
org.junit.Test
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Fenghuang
;
import
com.zhiwei.parse.Souhu
;
...
...
@@ -16,7 +18,8 @@ public class SouhuCommentExample {
@Test
public
void
souhuCommentTest
()
{
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
ProxyFactory
.
init
(
"zookeeper://192.168.0.36:2181"
,
"local"
,
GroupType
.
PROVIDER
);
Map
<
String
,
Object
>
map
=
poi
.
importExcel
(
"D://crawlerdata//自媒体//搜狐评论采集.xlsx"
,
0
);
List
<
Map
<
String
,
Object
>>
list
=
(
List
<
Map
<
String
,
Object
>>)
map
.
get
(
"body"
);
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
...
...
@@ -30,7 +33,7 @@ public class SouhuCommentExample {
if
(
dataList
.
size
()
<=
0
)
{
urlList
.
add
(
url
);
}
ZhiWeiTools
.
sleep
(
20
00
);
ZhiWeiTools
.
sleep
(
1
00
);
if
(
dataList
!=
null
)
{
bodyList
.
addAll
(
dataList
);
}
...
...
src/test/java/com/zhiwei/hsitory/QQkandianHistoryExample.java
View file @
d979d793
package
com
.
zhiwei
.
hsitory
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
org.testng.annotations.Test
;
import
com.zhiwei.bean.HistortyBean
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.QQKandian
;
/**
 * Manual example: fetches the history list of one account through
 * {@code QQKandian.getHistoryData} and exports it to an Excel sheet.
 */
public class QQkandianHistoryExample {

    /**
     * Converts every returned {@code HistortyBean} into a row map keyed by the sheet's column
     * headers and writes all rows through {@code PoiExcelUtil}.
     */
    @Test
    public void f() {
        String uid = "2661642386";
        List<HistortyBean> history = new QQKandian().getHistoryData(uid, null);
        PoiExcelUtil excel = PoiExcelUtil.getInstance();

        List<Map<String, Object>> rows = new ArrayList<>();
        for (HistortyBean entry : history) {
            rows.add(toRow(entry));
        }

        // Header order handed to the exporter.
        List<String> columns = new ArrayList<>();
        for (String column : new String[] {"标题", "来源", "链接", "正文", "时间"}) {
            columns.add(column);
        }

        excel.exportExcel("D:\\crawlerdata\\自媒体\\qq看点-数据-2661642386.xlsx", "数据", columns, rows);
    }

    /** Copies the bean's title, time, source, content and url into a row map. */
    private static Map<String, Object> toRow(HistortyBean bean) {
        Map<String, Object> row = new HashMap<>();
        row.put("标题", bean.getTitle());
        row.put("时间", bean.getTime());
        row.put("来源", bean.getSource());
        row.put("正文", bean.getContent());
        row.put("链接", bean.getUrl());
        return row;
    }
}
//
package com.zhiwei.hsitory;
//
//
import java.util.ArrayList;
//
import java.util.HashMap;
//
import java.util.List;
//
import java.util.Map;
//
//
import org.testng.annotations.Test;
//
//
import com.zhiwei.bean.HistortyBean;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.QQKandian;
//
//
public class QQkandianHistoryExample {
//
@Test
//
public void f() {
//
String uid = "2661642386";
//
//
QQKandian qqKandian = new QQKandian();
//
List<HistortyBean> dataList = qqKandian.getHistoryData(uid, null);
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
//
for(HistortyBean h : dataList) {
//
Map<String, Object> map = new HashMap<String,Object>();
//
map.put("标题", h.getTitle());
//
map.put("时间", h.getTime());
//
map.put("来源", h.getSource());
//
map.put("正文", h.getContent());
//
map.put("链接", h.getUrl());
//
bodyList.add(map);
//
}
//
List<String> headList = new ArrayList<String>();
//
headList.add("标题");
//
headList.add("来源");
//
headList.add("链接");
//
headList.add("正文");
//
headList.add("时间");
//
poi.exportExcel("D:\\crawlerdata\\自媒体\\qq看点-数据-2661642386.xlsx", "数据", headList, bodyList);
//
//
//
}
//
}
src/test/java/com/zhiwei/keyword/GftaiTest.java
View file @
d979d793
//
package com.zhiwei.keyword;
//
//
import java.util.ArrayList;
//
import java.util.List;
//
import java.util.Map;
//
//
import org.testng.annotations.Test;
//
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.Gftai;
//
//
public class GftaiTest {
//
@Test
//
public void f() {
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
// String words = "民宿|短租|住宿|途家|爱彼迎|小猪短租|榛果民宿|Airbnb
";
//
String[] ws = words.split("\\|");
//
List<Map<String,Object>> bodyList = new ArrayList<>();
//
for(String word : ws) {
//
List<Map<String,Object>> list = Gftai.getData(word, null);
//
bodyList.addAll(list);
//
System.out.println(word + " --------- " + bodyList.size());
//
}
//
List<String> headList = new ArrayList<>();
//
headList.add("title");
//
headList.add("time");
//
headList.add("content");
//
headList.add("source");
//
headList.add("url");
//
// poi.exportExcel("D:\\crawlerdata\\自媒体\\投诉\\国富泰信用
.xlsx", "数据", headList, bodyList);
//
}
//
}
package
com
.
zhiwei
.
keyword
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.testng.annotations.Test
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Gftai
;
public
class
GftaiTest
{
@Test
public
void
f
()
{
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
String
words
=
"美团 催收|美团 借款|美团 还钱|三快 借钱|三快 生活费|三快 借款|美团 征信
"
;
String
[]
ws
=
words
.
split
(
"\\|"
);
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<>();
for
(
String
word
:
ws
)
{
List
<
Map
<
String
,
Object
>>
list
=
Gftai
.
getData
(
word
,
null
);
bodyList
.
addAll
(
list
);
System
.
out
.
println
(
word
+
" --------- "
+
bodyList
.
size
());
}
List
<
String
>
headList
=
new
ArrayList
<>();
headList
.
add
(
"title"
);
headList
.
add
(
"time"
);
headList
.
add
(
"content"
);
headList
.
add
(
"source"
);
headList
.
add
(
"url"
);
poi
.
exportExcel
(
"D:\\crawlerdata\\自媒体\\投诉\\国富泰信用-美团-2
.xlsx"
,
"数据"
,
headList
,
bodyList
);
}
}
src/test/java/com/zhiwei/keyword/KuaiTousuTest.java
View file @
d979d793
//
package com.zhiwei.keyword;
//
//
import java.util.ArrayList;
//
import java.util.List;
//
import java.util.Map;
//
//
import org.testng.annotations.Test;
//
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.Gftai;
//
import com.zhiwei.parse.KuaiTousu;
//
//
public class KuaiTousuTest {
//
@Test
//
public void f() {
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
// String words = "民宿|短租|住宿|途家|爱彼迎|小猪短租|榛果民宿|Airbnb
";
//
String[] ws = words.split("\\|");
//
List<Map<String,Object>> bodyList = new ArrayList<>();
//
for(String word : ws) {
//
List<Map<String,Object>> list = KuaiTousu.getData(word, null);
//
bodyList.addAll(list);
//
System.out.println(word + " --------- " + bodyList.size());
//
}
//
List<String> headList = new ArrayList<>();
//
headList.add("title");
//
headList.add("time");
//
headList.add("content");
//
headList.add("source");
//
headList.add("url");
//
// poi.exportExcel("D:\\crawlerdata\\自媒体\\投诉\\新浪广东快投诉
.xlsx", "数据", headList, bodyList);
//
//
//
//
//
}
//
}
package
com
.
zhiwei
.
keyword
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.testng.annotations.Test
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Gftai
;
import
com.zhiwei.parse.KuaiTousu
;
public
class
KuaiTousuTest
{
@Test
public
void
f
()
{
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
String
words
=
"美团 催收|美团 借款|美团 还钱|三快 借钱|三快 生活费|三快 借款|美团 征信
"
;
String
[]
ws
=
words
.
split
(
"\\|"
);
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<>();
for
(
String
word
:
ws
)
{
List
<
Map
<
String
,
Object
>>
list
=
KuaiTousu
.
getData
(
word
,
null
);
bodyList
.
addAll
(
list
);
System
.
out
.
println
(
word
+
" --------- "
+
bodyList
.
size
());
}
List
<
String
>
headList
=
new
ArrayList
<>();
headList
.
add
(
"title"
);
headList
.
add
(
"time"
);
headList
.
add
(
"content"
);
headList
.
add
(
"source"
);
headList
.
add
(
"url"
);
poi
.
exportExcel
(
"D:\\crawlerdata\\自媒体\\投诉\\新浪广东快投诉-美团-2
.xlsx"
,
"数据"
,
headList
,
bodyList
);
}
}
src/test/java/com/zhiwei/keyword/SinaTousuTest.java
View file @
d979d793
//
package com.zhiwei.keyword;
//
//
import java.util.ArrayList;
//
import java.util.List;
//
import java.util.Map;
//
//
import org.testng.annotations.Test;
//
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.KuaiTousu;
//
import com.zhiwei.parse.SinaTousu;
//
//
public class SinaTousuTest {
//
//
@Test
//
public void getSinaTousuData() {
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
// String words = "民宿|短租|住宿|途家|爱彼迎|小猪短租|榛果民宿|Airbnb
";
//
String[] ws = words.split("\\|");
//
List<Map<String,Object>> bodyList = new ArrayList<>();
//
for(String word : ws) {
// List<Map<String,Object>> list = SinaTousu.getSinaTousuData(word, null, "2018-01
-01 00:00:00");
//
bodyList.addAll(list);
//
System.out.println(word + " --------- " + bodyList.size());
//
}
//
List<String> headList = new ArrayList<>();
//
headList.add("title");
//
headList.add("time");
//
headList.add("content");
//
headList.add("source");
//
headList.add("url");
//
// poi.exportExcel("D:\\crawlerdata\\自媒体\\投诉\\黑猫投诉
.xlsx", "数据", headList, bodyList);
//
//
//
//
}
//
}
package
com
.
zhiwei
.
keyword
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.testng.annotations.Test
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.KuaiTousu
;
import
com.zhiwei.parse.SinaTousu
;
public
class
SinaTousuTest
{
@Test
public
void
getSinaTousuData
()
{
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
String
words
=
"美团 催收|美团 借款|美团 还钱|三快 借钱|三快 生活费|三快 借款|美团 征信
"
;
String
[]
ws
=
words
.
split
(
"\\|"
);
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<>();
for
(
String
word
:
ws
)
{
List
<
Map
<
String
,
Object
>>
list
=
SinaTousu
.
getSinaTousuData
(
word
,
null
,
"2018-07
-01 00:00:00"
);
bodyList
.
addAll
(
list
);
System
.
out
.
println
(
word
+
" --------- "
+
bodyList
.
size
());
}
List
<
String
>
headList
=
new
ArrayList
<>();
headList
.
add
(
"title"
);
headList
.
add
(
"time"
);
headList
.
add
(
"content"
);
headList
.
add
(
"source"
);
headList
.
add
(
"url"
);
poi
.
exportExcel
(
"D:\\crawlerdata\\自媒体\\投诉\\黑猫投诉-美团-2
.xlsx"
,
"数据"
,
headList
,
bodyList
);
}
}
src/test/java/com/zhiwei/keyword/YoukuKeyWordTest.java
0 → 100644
View file @
d979d793
//package com.zhiwei.keyword;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.testng.annotations.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Youku;
//
//public class YoukuKeyWordTest {
// @Test
// public void f() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",
// GroupType.PROVIDER);
// String word = "帮宝适 二噁英," +
// "帮宝适 二恶英," +
// "帮宝适 有毒," +
// "帮宝适 剧毒," +
// "帮宝适 致癌," +
// "宝洁 二噁英," +
// "宝洁 二恶英," +
// "宝洁 有毒," +
// "宝洁 剧毒," +
// "宝洁 致癌," +
// "纸尿裤 二噁英," +
// "纸尿裤 二恶英," +
// "纸尿裤 有毒," +
// "纸尿裤 剧毒," +
// "纸尿裤 致癌";
// List<Map<String,Object>> bodyList = new ArrayList<>();
// String[] words = word.split(",");
// for(String w : words) {
// System.out.println(w);
// bodyList.addAll(Youku.getDataList(w));
// }
// List<String> headList = new ArrayList<>();
// headList.add("title");
// headList.add("time");
// headList.add("url");
// headList.add("uper");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("C:\\Users\\byte-zbs\\Desktop\\tx\\优酷数据-txh-0121.xlsx", "数据", headList, bodyList);
//
// }
//}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment