Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
A
articlenewscrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
chenweiyang
articlenewscrawler
Commits
a731c54c
Commit
a731c54c
authored
Nov 14, 2018
by
yangchen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
修改 腾讯新闻关键词采集
parent
d555bdda
Hide whitespace changes
Inline
Side-by-side
Showing
16 changed files
with
128 additions
and
40 deletions
+128
-40
pom.xml
+1
-1
src/main/java/com/zhiwei/httpclient/HeadGet.java
+17
-4
src/main/java/com/zhiwei/parse/Baijia.java
+7
-1
src/main/java/com/zhiwei/parse/BiliBili.java
+9
-9
src/main/java/com/zhiwei/parse/Maimai.java
+28
-0
src/main/java/com/zhiwei/parse/QQKandian.java
+2
-2
src/main/java/com/zhiwei/parse/TXNews.java
+8
-5
src/main/java/com/zhiwei/parse/Yidianzixun.java
+1
-0
src/main/java/com/zhiwei/parse/analysis/BaijiaAccountAnalysis.java
+6
-4
src/main/java/com/zhiwei/parse/analysis/BilibilikeyWordAnalysis.java
+1
-1
src/main/java/com/zhiwei/parse/analysis/MaimaiBywordAnalysis.java
+32
-0
src/main/java/com/zhiwei/parse/analysis/TXNewsByWordAnalysis.java
+3
-3
src/test/java/com/zhiwei/crawler/BaijiaAccountExample.java
+4
-3
src/test/java/com/zhiwei/crawler/DayuByWordExample.java
+1
-1
src/test/java/com/zhiwei/crawler/MaimaiBywordExample.java
+5
-4
src/test/java/com/zhiwei/crawler/TXNewsByWordExample.java
+3
-2
No files found.
pom.xml
View file @
a731c54c
...
@@ -36,7 +36,7 @@
...
@@ -36,7 +36,7 @@
<dependency>
<dependency>
<groupId>
com.zhiwei.tools
</groupId>
<groupId>
com.zhiwei.tools
</groupId>
<artifactId>
zhiwei-tools
</artifactId>
<artifactId>
zhiwei-tools
</artifactId>
<version>
0.0.
3
-SNAPSHOT
</version>
<version>
0.0.
8
-SNAPSHOT
</version>
</dependency>
</dependency>
</dependencies>
</dependencies>
...
...
src/main/java/com/zhiwei/httpclient/HeadGet.java
View file @
a731c54c
...
@@ -12,6 +12,8 @@ import java.util.Map;
...
@@ -12,6 +12,8 @@ import java.util.Map;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Document
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
public
class
HeadGet
{
public
class
HeadGet
{
/**
/**
...
@@ -931,12 +933,23 @@ public class HeadGet {
...
@@ -931,12 +933,23 @@ public class HeadGet {
return
paramMap
;
return
paramMap
;
}
}
/**
 * Builds the fixed form parameters for a Kuaishou request.
 *
 * <p>The returned map always holds the same six entries: {@code count},
 * {@code user_id}, {@code client_key}, {@code token}, {@code sig} and
 * {@code __NStokensig}. A fresh, mutable map is created on every call.
 *
 * <p>NOTE(review): the user id, token and both signatures are literals
 * committed to source; presumably they were captured from a single app
 * session and will expire - confirm, and consider passing them in.
 *
 * @return a new map containing the request parameters
 */
public static Map<String, Object> getKuaishouParamMap() {
    final Map<String, Object> kuaishouParams = new HashMap<>();

    // Page size requested from the endpoint.
    kuaishouParams.put("count", 20);

    // Account / client identification.
    kuaishouParams.put("user_id", "475195458");
    kuaishouParams.put("client_key", "56c3713c");

    // Session token and the two request signatures.
    kuaishouParams.put("token", "10e4f33e55c0488e99ae750c5f3d46ff-1032060898");
    kuaishouParams.put("sig", "ebd688038026858f30cdde57045996f9");
    kuaishouParams.put("__NStokensig", "f768b1f8d0ad8f0491be35c102742b278194faaa41f4ecd25a8f3ae44c7daa0a");

    return kuaishouParams;
}
public
static
void
main
(
String
[]
args
)
throws
UnsupportedEncodingException
{
public
static
void
main
(
String
[]
args
)
throws
UnsupportedEncodingException
{
String
url
=
"http
s://r.cnews.qq.com/searchByType
"
;
String
url
=
"http
://180.186.38.200/rest/n/feed/profile2
"
;
System
.
out
.
println
(
url
);
System
.
out
.
println
(
url
);
String
cookie
=
"
luin=o0497332654;%20lskey=00030000d63ffaf7eba88c86106eac5f2910d45515222334b91c75a66b449c990c2be43cd202ba39b35bef60;%20uin=o0497332654;%20skey=MH3wukytS4;%20sigA2=7AB4D8DEDF73E313801FD348FD77EC3B05C06DBC4D9DA669B20CA04A8D6B80F300A69567FBD11A7B799E419BB796F22D47D3AE5FA95E708A0ABC66161061131B0B21A0031AA0807C;%20openid=oijc7uLZNVp85r-MpFBYxwxzdEkg;%20appid=wxe90c9765ad00e2cd;%20access_token=5_IWbsKfygpJ0lnbUnnFnfwmSQ3EqlyzseC9-qGEFY7Tkr0Ypk5vsnSwOaMC-IGxsBeY2K7knHrYstj_5dZpisJd5nihvLNQvCdsFhFwZQcT8;%20refresh_token=5_IWbsKfygpJ0lnbUnnFnfwgDT4pA9HEbY-wuqnqIHWf9AzdmIueZFuzHYfnZPuSNEc0ZjDuHXrtSrRBMMD-7kgj06iF0NdAOi-KRj6-mrmlA;%20unionid=onCs1uNNpjMXeYIHAhacGypamEBk;%20logintype=0
"
;
String
cookie
=
""
;
Map
<
String
,
String
>
headerMap
=
Head
Get
.
getQQkbUserHeaderMap
(
cookie
);
Map
<
String
,
String
>
headerMap
=
Head
erTool
.
getCommonHead
(
);
Map
<
String
,
Object
>
paramMap
=
HeadGet
.
get
QQkbUserParamMap
(
"虎嗅"
);
Map
<
String
,
Object
>
paramMap
=
HeadGet
.
get
KuaishouParamMap
(
);
String
result
=
HttpClient
.
executeHttpRequestPost
(
url
,
null
,
headerMap
,
paramMap
);
String
result
=
HttpClient
.
executeHttpRequestPost
(
url
,
null
,
headerMap
,
paramMap
);
System
.
out
.
println
(
result
);
System
.
out
.
println
(
result
);
System
.
out
.
println
(
result
.
length
());
System
.
out
.
println
(
result
.
length
());
...
...
src/main/java/com/zhiwei/parse/Baijia.java
View file @
a731c54c
...
@@ -10,15 +10,20 @@ import org.slf4j.LoggerFactory;
...
@@ -10,15 +10,20 @@ import org.slf4j.LoggerFactory;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.parse.analysis.BaijiaAccountAnalysis
;
import
com.zhiwei.parse.analysis.BaijiaAccountAnalysis
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Request
;
public
class
Baijia
{
public
class
Baijia
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
Baijia
.
class
);
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
Baijia
.
class
);
private
static
BaijiaAccountAnalysis
baijiaAccountAnalysis
=
new
BaijiaAccountAnalysis
();
private
static
BaijiaAccountAnalysis
baijiaAccountAnalysis
=
new
BaijiaAccountAnalysis
();
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
/**
/**
*
*
...
@@ -77,7 +82,8 @@ public class Baijia {
...
@@ -77,7 +82,8 @@ public class Baijia {
for
(
int
i
=
1
;
i
<
3
;
i
++)
{
for
(
int
i
=
1
;
i
<
3
;
i
++)
{
try
{
try
{
String
url
=
"https://author.baidu.com/list?type=article&context={%22offset%22:%22-1_"
+
n
+
"%22,%22app_id%22:%22"
+
app_id
+
"%22,%22pageSize%22:20}"
;
String
url
=
"https://author.baidu.com/list?type=article&context={%22offset%22:%22-1_"
+
n
+
"%22,%22app_id%22:%22"
+
app_id
+
"%22,%22pageSize%22:20}"
;
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
proxy
,
headerMap
);
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
String
result
=
httpBoot
.
syncCall
(
request
,
proxy
,
false
).
body
().
string
();
Map
<
String
,
Object
>
dMap
=
baijiaAccountAnalysis
.
getBaijiaAccountData3
(
result
,
name
,
startTime
);
Map
<
String
,
Object
>
dMap
=
baijiaAccountAnalysis
.
getBaijiaAccountData3
(
result
,
name
,
startTime
);
List
<
Map
<
String
,
Object
>>
dList
=
(
List
<
Map
<
String
,
Object
>>)
dMap
.
get
(
"data"
);
List
<
Map
<
String
,
Object
>>
dList
=
(
List
<
Map
<
String
,
Object
>>)
dMap
.
get
(
"data"
);
dataList
.
addAll
(
dList
);
dataList
.
addAll
(
dList
);
...
...
src/main/java/com/zhiwei/parse/BiliBili.java
View file @
a731c54c
...
@@ -11,10 +11,11 @@ import java.util.Map;
...
@@ -11,10 +11,11 @@ import java.util.Map;
import
org.slf4j.Logger
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.HttpClientBuilder
;
import
com.zhiwei.crawler.core.HttpRequestBuilder
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.analysis.BilibilikeyWordAnalysis
;
import
com.zhiwei.parse.analysis.BilibilikeyWordAnalysis
;
import
com.zhiwei.tools.httpclient.HttpClientBuilder
;
import
com.zhiwei.tools.httpclient.HttpRequestBuilder
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.util.WordReadFile
;
import
com.zhiwei.util.WordReadFile
;
...
@@ -25,17 +26,16 @@ import okhttp3.Request;
...
@@ -25,17 +26,16 @@ import okhttp3.Request;
public
class
BiliBili
{
public
class
BiliBili
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
BiliBili
.
class
);
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
BiliBili
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
public
static
List
<
Map
<
String
,
Object
>>
getData
(
String
word
,
Proxy
proxy
,
String
cookie
)
{
public
static
List
<
Map
<
String
,
Object
>>
getData
(
String
word
,
Proxy
proxy
,
String
cookie
)
{
OkHttpClient
client
=
HttpClientBuilder
.
newInstance
();
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
try
{
try
{
String
url
=
"https://search.bilibili.com/all?keyword="
+
URLEncoder
.
encode
(
word
,
"utf-8"
)+
"&
from_source=banner_search
"
;
String
url
=
"https://search.bilibili.com/all?keyword="
+
URLEncoder
.
encode
(
word
,
"utf-8"
)+
"&
order=pubdate&duration=0&tids_1=0
"
;
Headers
header
=
Headers
.
of
(
"cookie"
,
cookie
,
"Referer"
,
"https://www.bilibili.com/"
,
"Host"
,
"search.bilibili.com"
);
Headers
header
=
Headers
.
of
(
"cookie"
,
cookie
,
"Referer"
,
"https://www.bilibili.com/"
,
"Host"
,
"search.bilibili.com"
);
System
.
out
.
println
(
url
);
Request
request
=
HttpRequestBuilder
.
newGetRequest
(
url
,
header
);
Request
request
=
HttpRequestBuilder
.
newGetRequest
(
url
,
header
);
client
=
client
.
newBuilder
().
proxy
(
proxy
).
build
();
String
result
=
httpBoot
.
syncCall
(
request
,
proxy
).
body
().
string
();
String
result
=
client
.
newCall
(
request
).
execute
().
body
().
string
(
);
// System.out.println(result
);
ZhiWeiTools
.
sleep
(
3000
);
ZhiWeiTools
.
sleep
(
3000
);
Map
<
String
,
Object
>
map
=
BilibilikeyWordAnalysis
.
getData
(
result
);
Map
<
String
,
Object
>
map
=
BilibilikeyWordAnalysis
.
getData
(
result
);
boolean
more
=
(
boolean
)
map
.
get
(
"more"
);
boolean
more
=
(
boolean
)
map
.
get
(
"more"
);
...
@@ -48,7 +48,7 @@ public class BiliBili {
...
@@ -48,7 +48,7 @@ public class BiliBili {
map
.
clear
();
map
.
clear
();
String
ur
=
url
+
"&page="
+
n
;
String
ur
=
url
+
"&page="
+
n
;
request
=
HttpRequestBuilder
.
newGetRequest
(
ur
,
header
);
request
=
HttpRequestBuilder
.
newGetRequest
(
ur
,
header
);
String
result2
=
client
.
newCall
(
request
).
execute
(
).
body
().
string
();
String
result2
=
httpBoot
.
syncCall
(
request
,
proxy
).
body
().
string
();
map
=
BilibilikeyWordAnalysis
.
getData
(
result2
);
map
=
BilibilikeyWordAnalysis
.
getData
(
result2
);
List
<
Map
<
String
,
Object
>>
dataList2
=
(
List
<
Map
<
String
,
Object
>>)
map
.
get
(
"data"
);
List
<
Map
<
String
,
Object
>>
dataList2
=
(
List
<
Map
<
String
,
Object
>>)
map
.
get
(
"data"
);
if
(
dataList2
!=
null
)
{
if
(
dataList2
!=
null
)
{
...
@@ -89,7 +89,7 @@ public class BiliBili {
...
@@ -89,7 +89,7 @@ public class BiliBili {
headlist
.
add
(
"title"
);
headlist
.
add
(
"title"
);
headlist
.
add
(
"url"
);
headlist
.
add
(
"url"
);
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
poi
.
exportExcel
(
"D://crawlerdata//bilibili关键词采集数据.xlsx"
,
"B站数据"
,
headlist
,
bodyList
);
poi
.
exportExcel
(
"D://crawlerdata//bilibili关键词采集数据
-竹鼠
.xlsx"
,
"B站数据"
,
headlist
,
bodyList
);
}
}
...
...
src/main/java/com/zhiwei/parse/Maimai.java
View file @
a731c54c
...
@@ -48,4 +48,32 @@ public class Maimai {
...
@@ -48,4 +48,32 @@ public class Maimai {
return
dataList
;
return
dataList
;
}
}
/**
 * Collects Maimai gossip search results for a keyword, requesting the search
 * endpoint page by page (20 items per request, sorted by time) until the
 * analysis step reports no more data or a page yields no rows.
 *
 * <p>Each response is handed to {@code maimaiBywordAnalysis.getDataByNoName},
 * whose result map supplies the rows under {@code "data"} and the
 * continuation flag under {@code "hasMore"}. The method pauses 2 seconds
 * after every page that produced rows.
 *
 * <p>Failures (encoding, HTTP, parsing) do not propagate: they are logged and
 * whatever was collected before the failure is returned.
 *
 * @param key    search keyword; URL-encoded as UTF-8 before being sent
 * @param cookie cookie used to build the request headers via
 *               {@code HeadGet.getMaimaiKeywordHeaderMap}
 * @param time   passed unchanged to the analysis step; presumably a lower
 *               bound on the post time - confirm against the analysis class
 * @param proxy  proxy handed to {@code HttpClient.executeHttpRequestGet}
 * @return the rows gathered so far; never {@code null}
 */
public static List<Map<String, Object>> getDataByNoName(String key, String cookie, String time, Proxy proxy) {
    Map<String, String> headerMap = HeadGet.getMaimaiKeywordHeaderMap(cookie);
    List<Map<String, Object>> dataList = new ArrayList<Map<String, Object>>();
    try {
        // Every page shares the same prefix and suffix; only the offset changes.
        // The suffix must start with '&' - the previous follow-up URL glued
        // "highlight=true" directly onto the offset value.
        String urlPrefix = "https://maimai.cn/search/gossips?query=" + URLEncoder.encode(key, "utf-8") + "&limit=20&offset=";
        String urlSuffix = "&highlight=true&sortby=time&jsononly=1";

        int offset = 0;
        boolean hasMore = true;
        while (hasMore) {
            String url = urlPrefix + offset + urlSuffix;
            String result = HttpClient.executeHttpRequestGet(url, proxy, headerMap);
            Map<String, Object> map = maimaiBywordAnalysis.getDataByNoName(result, time);
            hasMore = Boolean.TRUE.equals(map.get("hasMore"));

            @SuppressWarnings("unchecked")
            List<Map<String, Object>> daList = (List<Map<String, Object>>) map.get("data");
            if (daList == null || daList.isEmpty()) {
                // An empty page ends the crawl regardless of the hasMore flag.
                break;
            }

            dataList.addAll(daList);
            offset += 20;
            logger.info("{}==采集到的数据量=={}", key, dataList.size());
            ZhiWeiTools.sleep(2000);
        }
    } catch (Exception e) {
        // Best effort: keep what was collected, but do not hide the failure.
        logger.error("{}==采集出错", key, e);
    }
    return dataList;
}
}
}
src/main/java/com/zhiwei/parse/QQKandian.java
View file @
a731c54c
...
@@ -20,8 +20,8 @@ import com.alibaba.fastjson.JSONArray;
...
@@ -20,8 +20,8 @@ import com.alibaba.fastjson.JSONArray;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.bean.HistortyBean
;
import
com.zhiwei.bean.HistortyBean
;
import
com.zhiwei.bean.QQKandianUser
;
import
com.zhiwei.bean.QQKandianUser
;
import
com.zhiwei.
tools.httpclient
.HttpClientBuilder
;
import
com.zhiwei.
crawler.core
.HttpClientBuilder
;
import
com.zhiwei.
tools.httpclient
.HttpRequestBuilder
;
import
com.zhiwei.
crawler.core
.HttpRequestBuilder
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
...
...
src/main/java/com/zhiwei/parse/TXNews.java
View file @
a731c54c
...
@@ -17,23 +17,26 @@ public class TXNews {
...
@@ -17,23 +17,26 @@ public class TXNews {
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
TXNews
.
class
);
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
TXNews
.
class
);
private
static
TXNewsByWordAnalysis
txNewsByWordAnalysis
=
new
TXNewsByWordAnalysis
();
private
static
TXNewsByWordAnalysis
txNewsByWordAnalysis
=
new
TXNewsByWordAnalysis
();
public
static
boolean
hasMore
=
true
;
public
static
boolean
txNewshasMoreData
=
true
;
public
static
List
<
Map
<
String
,
Object
>>
getData
(
String
word
,
Proxy
proxy
)
{
public
static
List
<
Map
<
String
,
Object
>>
getData
(
String
word
,
String
devid
,
Proxy
proxy
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getTxNewspage1HeaderMap
(
null
);
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getTxNewspage1HeaderMap
(
null
);
Map
<
String
,
Object
>
paramMap
=
HeadGet
.
getTxNewspage1ParamMap
(
word
);
Map
<
String
,
Object
>
paramMap
=
HeadGet
.
getTxNewspage1ParamMap
(
word
);
String
result
=
HttpClient
.
executeHttpRequestPost
(
"http://r.inews.qq.com/search?appver=11.2.1_qqnews_5.5.60&devid=6D33F35F-880D-42A6-A23F-881BEC6960EC"
,
proxy
,
headerMap
,
paramMap
);
// b3dd1e7d-9d3c-4e75-bf3e-3a76f326ee34
String
result
=
HttpClient
.
executeHttpRequestPost
(
"http://r.inews.qq.com/search?appver=11.2.1_qqnews_5.5.60&devid="
+
devid
,
proxy
,
headerMap
,
paramMap
);
List
<
Map
<
String
,
Object
>>
dList
=
txNewsByWordAnalysis
.
getData
(
result
);
List
<
Map
<
String
,
Object
>>
dList
=
txNewsByWordAnalysis
.
getData
(
result
);
dataList
.
addAll
(
dList
);
dataList
.
addAll
(
dList
);
int
page
=
2
;
int
page
=
2
;
int
count
=
0
;
int
count
=
0
;
Map
<
String
,
String
>
header2Map
=
HeadGet
.
getTxNewspage2HeaderMap
(
null
);
Map
<
String
,
String
>
header2Map
=
HeadGet
.
getTxNewspage2HeaderMap
(
null
);
while
(
hasMore
)
{
while
(
txNewshasMoreData
)
{
try
{
try
{
ZhiWeiTools
.
sleep
(
5000
);
ZhiWeiTools
.
sleep
(
5000
);
//
Map
<
String
,
Object
>
param2Map
=
HeadGet
.
getTxNewspagemoreParamMap
(
word
,
page
);
Map
<
String
,
Object
>
param2Map
=
HeadGet
.
getTxNewspagemoreParamMap
(
word
,
page
);
String
result2
=
HttpClient
.
executeHttpRequestPost
(
"http://r.inews.qq.com/searchMore?appver=11.2.1_qqnews_5.5.60&devid=6D33F35F-880D-42A6-A23F-881BEC6960EC"
,
proxy
,
header2Map
,
param2Map
);
//6D33F35F-880D-42A6-A23F-881BEC6960EC
String
result2
=
HttpClient
.
executeHttpRequestPost
(
"http://r.inews.qq.com/searchMore?appver=11.2.1_qqnews_5.5.60&devid=496d3626-9684-45ef-8d22-7a71fbfd22da"
,
proxy
,
header2Map
,
param2Map
);
page
++;
page
++;
List
<
Map
<
String
,
Object
>>
dList2
=
txNewsByWordAnalysis
.
getData
(
result2
);
List
<
Map
<
String
,
Object
>>
dList2
=
txNewsByWordAnalysis
.
getData
(
result2
);
dataList
.
addAll
(
dList2
);
dataList
.
addAll
(
dList2
);
...
...
src/main/java/com/zhiwei/parse/Yidianzixun.java
View file @
a731c54c
...
@@ -126,6 +126,7 @@ public class Yidianzixun {
...
@@ -126,6 +126,7 @@ public class Yidianzixun {
int
i
=
0
;
int
i
=
0
;
while
(
true
)
{
while
(
true
)
{
String
url
=
"http://www.yidianzixun.com/home/q/news_list_for_keyword?display="
+
URLEncoder
.
encode
(
word
,
"UTF-8"
)+
"&cstart="
+
i
+
"&cend="
+(
i
+
10
)+
"&word_type=token"
;
String
url
=
"http://www.yidianzixun.com/home/q/news_list_for_keyword?display="
+
URLEncoder
.
encode
(
word
,
"UTF-8"
)+
"&cstart="
+
i
+
"&cend="
+(
i
+
10
)+
"&word_type=token"
;
System
.
out
.
println
(
url
);
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getYidianzixunWordHeaderMap
(
null
);
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getYidianzixunWordHeaderMap
(
null
);
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
proxy
,
headerMap
);
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
proxy
,
headerMap
);
List
<
Map
<
String
,
Object
>>
list
=
yidianzixunByWordAnalysis
.
getOnePageData
(
result
);
List
<
Map
<
String
,
Object
>>
list
=
yidianzixunByWordAnalysis
.
getOnePageData
(
result
);
...
...
src/main/java/com/zhiwei/parse/analysis/BaijiaAccountAnalysis.java
View file @
a731c54c
...
@@ -54,11 +54,16 @@ public class BaijiaAccountAnalysis {
...
@@ -54,11 +54,16 @@ public class BaijiaAccountAnalysis {
try
{
try
{
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
JSONArray
jsonArry
=
json
.
getJSONObject
(
"data"
).
getJSONArray
(
"items"
);
JSONArray
jsonArry
=
json
.
getJSONObject
(
"data"
).
getJSONArray
(
"items"
);
if
(
json
.
getJSONObject
(
"data"
)
!=
null
&&
json
.
getJSONObject
(
"data"
).
getBoolean
(
"has_more"
)
!=
null
)
{
if
(
json
.
getJSONObject
(
"data"
).
getBoolean
(
"has_more"
))
{
more
=
true
;
}
}
for
(
int
i
=
0
;
i
<
jsonArry
.
size
();
i
++)
{
for
(
int
i
=
0
;
i
<
jsonArry
.
size
();
i
++)
{
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
JSONObject
data
=
jsonArry
.
getJSONObject
(
i
);
JSONObject
data
=
jsonArry
.
getJSONObject
(
i
);
String
id
=
data
.
getString
(
"article_id"
);
String
id
=
data
.
getString
(
"article_id"
);
int
t
=
data
.
getInteger
(
"
cre
ated_at"
);
int
t
=
data
.
getInteger
(
"
upd
ated_at"
);
String
time
=
TimeParse
.
dateFormartString
(
new
Date
(
t
*
1000L
),
"yyyy-MM-dd HH:mm:ss"
);
String
time
=
TimeParse
.
dateFormartString
(
new
Date
(
t
*
1000L
),
"yyyy-MM-dd HH:mm:ss"
);
System
.
out
.
println
(
time
);
System
.
out
.
println
(
time
);
if
(
startTime
!=
null
&&
startTime
.
length
()
>
1
)
{
if
(
startTime
!=
null
&&
startTime
.
length
()
>
1
)
{
...
@@ -80,9 +85,6 @@ public class BaijiaAccountAnalysis {
...
@@ -80,9 +85,6 @@ public class BaijiaAccountAnalysis {
map
.
put
(
"source"
,
name
);
map
.
put
(
"source"
,
name
);
dataList
.
add
(
map
);
dataList
.
add
(
map
);
}
}
if
(
json
.
getJSONObject
(
"data"
).
getBoolean
(
"has_more"
))
{
more
=
true
;
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
e
.
printStackTrace
();
}
}
...
...
src/main/java/com/zhiwei/parse/analysis/BilibilikeyWordAnalysis.java
View file @
a731c54c
...
@@ -44,7 +44,7 @@ public class BilibilikeyWordAnalysis {
...
@@ -44,7 +44,7 @@ public class BilibilikeyWordAnalysis {
map
.
put
(
"source"
,
source
);
map
.
put
(
"source"
,
source
);
map
.
put
(
"submitcount"
,
submitcount
);
map
.
put
(
"submitcount"
,
submitcount
);
dataList
.
add
(
map
);
dataList
.
add
(
map
);
//
System.out.println(map.toString());
System
.
out
.
println
(
map
.
toString
());
}
}
Map
<
String
,
Object
>
rmap
=
new
HashMap
<
String
,
Object
>();
Map
<
String
,
Object
>
rmap
=
new
HashMap
<
String
,
Object
>();
rmap
.
put
(
"more"
,
more
);
rmap
.
put
(
"more"
,
more
);
...
...
src/main/java/com/zhiwei/parse/analysis/MaimaiBywordAnalysis.java
View file @
a731c54c
...
@@ -38,6 +38,38 @@ public class MaimaiBywordAnalysis {
...
@@ -38,6 +38,38 @@ public class MaimaiBywordAnalysis {
map
.
put
(
"like"
,
data
.
getJSONObject
(
"feed"
).
getInteger
(
"likes"
));
map
.
put
(
"like"
,
data
.
getJSONObject
(
"feed"
).
getInteger
(
"likes"
));
map
.
put
(
"comment_count"
,
data
.
getJSONObject
(
"feed"
).
getInteger
(
"total_cnt"
));
map
.
put
(
"comment_count"
,
data
.
getJSONObject
(
"feed"
).
getInteger
(
"total_cnt"
));
map
.
put
(
"spreads"
,
data
.
getJSONObject
(
"feed"
).
getInteger
(
"spreads"
));
//传播数
map
.
put
(
"spreads"
,
data
.
getJSONObject
(
"feed"
).
getInteger
(
"spreads"
));
//传播数
System
.
out
.
println
(
map
.
toString
());
dataList
.
add
(
map
);
}
map1
.
put
(
"data"
,
dataList
);
map1
.
put
(
"hasMore"
,
f
);
return
map1
;
}
public
Map
<
String
,
Object
>
getDataByNoName
(
String
result
,
String
time
)
{
Map
<
String
,
Object
>
map1
=
new
HashMap
<
String
,
Object
>();
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
JSONArray
jsonArry
=
json
.
getJSONObject
(
"data"
).
getJSONArray
(
"gossips"
);
boolean
f
=
true
;
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
f
=
json
.
getJSONObject
(
"data"
).
getInteger
(
"more"
)==
1
?
true
:
false
;
for
(
int
i
=
0
;
i
<
jsonArry
.
size
();
i
++)
{
JSONObject
data
=
jsonArry
.
getJSONObject
(
i
);
Map
<
String
,
Object
>
map
=
new
HashMap
<
String
,
Object
>();
String
url
=
"https://maimai.cn/web/gossip_detail?encode_id="
+
data
.
getJSONObject
(
"gossip"
).
getString
(
"encode_id"
);
String
atime
=
data
.
getJSONObject
(
"gossip"
).
getString
(
"crtime_string"
);
if
(
time
.
compareTo
(
atime
)
>
-
1
)
{
f
=
false
;
continue
;
}
map
.
put
(
"time"
,
atime
);
map
.
put
(
"url"
,
url
);
map
.
put
(
"text"
,
data
.
getJSONObject
(
"gossip"
).
getString
(
"text"
));
map
.
put
(
"name"
,
data
.
getJSONObject
(
"gossip"
).
getString
(
"username"
));
map
.
put
(
"like"
,
data
.
getJSONObject
(
"gossip"
).
getInteger
(
"likes"
));
map
.
put
(
"comment_count"
,
data
.
getJSONObject
(
"gossip"
).
getInteger
(
"total_cnt"
));
map
.
put
(
"spreads"
,
data
.
getJSONObject
(
"gossip"
).
getInteger
(
"search_order"
));
//传播数
System
.
out
.
println
(
map
.
toString
());
dataList
.
add
(
map
);
dataList
.
add
(
map
);
}
}
map1
.
put
(
"data"
,
dataList
);
map1
.
put
(
"data"
,
dataList
);
...
...
src/main/java/com/zhiwei/parse/analysis/TXNewsByWordAnalysis.java
View file @
a731c54c
...
@@ -21,9 +21,9 @@ public class TXNewsByWordAnalysis {
...
@@ -21,9 +21,9 @@ public class TXNewsByWordAnalysis {
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
JSONArray
jsonArry
=
json
.
getJSONArray
(
"secList"
);
JSONArray
jsonArry
=
json
.
getJSONArray
(
"secList"
);
if
(
json
.
getInteger
(
"hasMore"
)
==
1
)
{
if
(
json
.
getInteger
(
"hasMore"
)
==
1
)
{
TXNews
.
hasMore
=
true
;
TXNews
.
txNewshasMoreData
=
true
;
}
else
{
}
else
{
TXNews
.
hasMore
=
false
;
TXNews
.
txNewshasMoreData
=
false
;
}
}
for
(
int
i
=
0
;
i
<
jsonArry
.
size
();
i
++)
{
for
(
int
i
=
0
;
i
<
jsonArry
.
size
();
i
++)
{
JSONObject
js
=
jsonArry
.
getJSONObject
(
i
);
JSONObject
js
=
jsonArry
.
getJSONObject
(
i
);
...
@@ -40,7 +40,7 @@ public class TXNewsByWordAnalysis {
...
@@ -40,7 +40,7 @@ public class TXNewsByWordAnalysis {
map
.
put
(
"id"
,
js2
.
getString
(
"id"
));
map
.
put
(
"id"
,
js2
.
getString
(
"id"
));
map
.
put
(
"url"
,
js2
.
getString
(
"url"
));
map
.
put
(
"url"
,
js2
.
getString
(
"url"
));
dataList
.
add
(
map
);
dataList
.
add
(
map
);
//
System.out.println(map.toString());
System
.
out
.
println
(
map
.
toString
());
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
"采集出错:{}"
,
e
.
getMessage
());
logger
.
error
(
"采集出错:{}"
,
e
.
getMessage
());
System
.
out
.
println
(
js2
.
toString
());
System
.
out
.
println
(
js2
.
toString
());
...
...
src/test/java/com/zhiwei/crawler/BaijiaAccountExample.java
View file @
a731c54c
...
@@ -60,15 +60,16 @@ public class BaijiaAccountExample {
...
@@ -60,15 +60,16 @@ public class BaijiaAccountExample {
public
void
test3
()
{
public
void
test3
()
{
String
path
=
"D://crawlerdata//自媒体/百家号采集.xlsx"
;
String
path
=
"D://crawlerdata//自媒体/百家号采集.xlsx"
;
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
String
startTime
=
"2018-0
1
-01 00:00:00"
;
String
startTime
=
"2018-0
5
-01 00:00:00"
;
Map
<
String
,
Object
>
map
=
poi
.
importExcel
(
path
,
0
);
Map
<
String
,
Object
>
map
=
poi
.
importExcel
(
path
,
0
);
List
<
Map
<
String
,
Object
>>
list
=
(
List
<
Map
<
String
,
Object
>>)
map
.
get
(
"body"
);
List
<
Map
<
String
,
Object
>>
list
=
(
List
<
Map
<
String
,
Object
>>)
map
.
get
(
"body"
);
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<>();
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<>();
for
(
Map
<
String
,
Object
>
m
:
list
)
{
for
(
Map
<
String
,
Object
>
m
:
list
)
{
try
{
try
{
String
app_id
=
m
.
get
(
"id"
).
toString
();
String
app_id
=
m
.
get
(
"id"
).
toString
();
app_id
=
"1563725611969509"
;
String
name
=
m
.
get
(
"name"
).
toString
();
String
name
=
m
.
get
(
"name"
).
toString
();
String
cookie
=
"
BAIDUID=BA1090A5857735165A2A419CBA37957A:FG=1
"
;
String
cookie
=
"
__cfduid=d847baca85b97d1967b3da02ebb345b831535524251; BAIDUID=C0F0F81EF770C5219AB9C178654135EC:FG=1; PSTM=1536376257; BIDUPSID=250CCE0442BEBCB3568D8EC515953434; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; locale=zh; delPer=0; H_PS_PSSID=1447_21117_20930; PSINO=5
"
;
List
<
Map
<
String
,
Object
>>
lists
=
Baijia
.
getBaijiaAccountByBaiduData
(
app_id
,
name
,
startTime
,
cookie
,
null
);
List
<
Map
<
String
,
Object
>>
lists
=
Baijia
.
getBaijiaAccountByBaiduData
(
app_id
,
name
,
startTime
,
cookie
,
null
);
if
(
lists
!=
null
)
{
if
(
lists
!=
null
)
{
bodyList
.
addAll
(
lists
);
bodyList
.
addAll
(
lists
);
...
@@ -83,7 +84,7 @@ public class BaijiaAccountExample {
...
@@ -83,7 +84,7 @@ public class BaijiaAccountExample {
headList
.
add
(
"url"
);
headList
.
add
(
"url"
);
headList
.
add
(
"content"
);
headList
.
add
(
"content"
);
headList
.
add
(
"read_amount"
);
headList
.
add
(
"read_amount"
);
poi
.
exportExcel
(
"D://crawlerdata//自媒体/百家号-lxj.xlsx"
,
"娱乐资本论"
,
headList
,
bodyList
);
poi
.
exportExcel
(
"D://crawlerdata//自媒体/百家号-lxj
-2
.xlsx"
,
"娱乐资本论"
,
headList
,
bodyList
);
}
}
}
}
src/test/java/com/zhiwei/crawler/DayuByWordExample.java
View file @
a731c54c
...
@@ -12,7 +12,7 @@ public class DayuByWordExample {
...
@@ -12,7 +12,7 @@ public class DayuByWordExample {
@Test
@Test
public
void
dayuByWordTest
()
{
public
void
dayuByWordTest
()
{
String
word
=
"
沃尔玛
"
;
String
word
=
"
11
"
;
List
<
Map
<
String
,
Object
>>
dataList
=
Dayu
.
getDayuByWordData
(
word
,
null
);
List
<
Map
<
String
,
Object
>>
dataList
=
Dayu
.
getDayuByWordData
(
word
,
null
);
...
...
src/test/java/com/zhiwei/crawler/MaimaiBywordExample.java
View file @
a731c54c
...
@@ -10,13 +10,14 @@ import com.zhiwei.parse.Maimai;
...
@@ -10,13 +10,14 @@ import com.zhiwei.parse.Maimai;
public
class
MaimaiBywordExample
{
public
class
MaimaiBywordExample
{
public
static
void
main
(
String
[]
args
)
{
public
static
void
main
(
String
[]
args
)
{
String
word
=
"
小米 上市|小米 IPO|雷军 IPO|小米 招股书|雷军 上市
"
;
String
word
=
"
美团 晋升
"
;
String
cookie
=
"sessionid=
njbswswdrvwf4vpg0836xu6m7ve4ziso; guid=GxsfBBgZGwQYGx4EGBkeVgcYGx4bGRIdEx4bVhwZBB0ZHwVDWEtMS3kKExkbBBMfGRkEGgQcHAVPR0VYQmkKA0VBSU9tCk9BQ0YKBmZnfmJhAgocGQQdGR8FXkNhSE99T0ZaWmsKAx4cfWV9ChEZBBwKfmQKWV1FTkRDfQIKGgQfBUtGRkNQRWc=; seid=s1526952692556; token=\"nv0ZM3AICKHOmB1sdBi2QrvA0fFDgtRwdZJV+DzF3KsZdPIsvD1I2HOdRVyurjQi8CKuzcDfAvoCmBm7+jVysA==\"; uid=\"aa0Zx+VbwC41ceG8bxvIefAirs3A3wL6ApgZu/o1crA=\"; session=eyJ1IjoiMTczMzMzNTM2Iiwic2VjcmV0Ijoid2s0MWRLbDBtWlFwTlJoWmdwc1JUZHR2IiwiX2V4cGlyZSI6MTUyNzAzOTEwMzE5MiwiX21heEFnZSI6ODY0MDAwMDB9; session.sig=ssvF7IeeQYlwCjdh8GaY3mhr0SY
"
;
String
cookie
=
"sessionid=
y87knknqrc3fi6xto2zv0s4kugmleepk; guid=GxsfBBgZGwQYGx4EGBkeVgcYGx4fHhwcGhgbVhwZBB0ZHwVDWEtMS3kKGhobBB0THhkEGgQTHAVPR0VYQmkKA0VBSU9tCk9BQ0YKBmZnfmJhAgocGQQdGR8FXkNhSE99T0ZaWmsKAx4cfWV9ChEZBBwKfmQKWV1FTkRDfQIKGgQfBUtGRkNQRWc=; seid=s1539933372113; token=\"ZTjnEij9jsL4ZCdnKF2CaUAwcJHgcem/zHvAbXp3MXdY+uSPva8scjbe2zHl2gE98CKuzcDfAvoCmBm7+jVysA==\"; uid=\"aa0Zx+VbwC41ceG8bxvIefAirs3A3wL6ApgZu/o1crA=\"; session=eyJ1IjoiMTczMzMzNTM2Iiwic2VjcmV0IjoiSFVMLVhKb2g5TkJGNHRJanljUW5Qa1V5IiwiX2V4cGlyZSI6MTU0MDAxOTc5MTUwNSwiX21heEFnZSI6ODY0MDAwMDB9; session.sig=dJmy52LHX-stqroAbm66u2zJaZA
"
;
String
time
=
"2018-
05-01
00:00:00"
;
String
time
=
"2018-
10-15
00:00:00"
;
String
[]
words
=
word
.
split
(
"\\|"
);
String
[]
words
=
word
.
split
(
"\\|"
);
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
for
(
String
w
:
words
)
{
for
(
String
w
:
words
)
{
List
<
Map
<
String
,
Object
>>
c
=
Maimai
.
getData
(
w
,
cookie
,
time
,
null
);
List
<
Map
<
String
,
Object
>>
c
=
Maimai
.
getData
(
w
,
cookie
,
time
,
null
);
// List<Map<String,Object>> c = Maimai.getDataByNoName(w, cookie, time, null);
bodyList
.
addAll
(
c
);
bodyList
.
addAll
(
c
);
}
}
List
<
String
>
headList
=
new
ArrayList
<
String
>();
List
<
String
>
headList
=
new
ArrayList
<
String
>();
...
@@ -28,7 +29,7 @@ public class MaimaiBywordExample {
...
@@ -28,7 +29,7 @@ public class MaimaiBywordExample {
headList
.
add
(
"comment_count"
);
headList
.
add
(
"comment_count"
);
headList
.
add
(
"spreads"
);
headList
.
add
(
"spreads"
);
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
poi
.
exportExcel
(
"D:\\crawlerdata\\自媒体\\脉脉关键词采集-
1
.xlsx"
,
"脉脉关键词"
,
headList
,
bodyList
);
poi
.
exportExcel
(
"D:\\crawlerdata\\自媒体\\脉脉关键词采集-
美团 晋升-1015
.xlsx"
,
"脉脉关键词"
,
headList
,
bodyList
);
}
}
}
}
src/test/java/com/zhiwei/crawler/TXNewsByWordExample.java
View file @
a731c54c
...
@@ -11,7 +11,8 @@ public class TXNewsByWordExample {
...
@@ -11,7 +11,8 @@ public class TXNewsByWordExample {
public
static
void
main
(
String
[]
args
)
{
public
static
void
main
(
String
[]
args
)
{
String
word
=
"唐嫣"
;
String
word
=
"唐嫣"
;
List
<
Map
<
String
,
Object
>>
dataList
=
TXNews
.
getData
(
word
,
null
);
String
devid
=
"6D33F35F-880D-42A6-A23F-881BEC6960EC"
;
List
<
Map
<
String
,
Object
>>
dataList
=
TXNews
.
getData
(
word
,
devid
,
null
);
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
List
<
String
>
headList
=
new
ArrayList
<
String
>();
List
<
String
>
headList
=
new
ArrayList
<
String
>();
headList
.
add
(
"title"
);
headList
.
add
(
"title"
);
...
@@ -20,7 +21,7 @@ public class TXNewsByWordExample {
...
@@ -20,7 +21,7 @@ public class TXNewsByWordExample {
headList
.
add
(
"url"
);
headList
.
add
(
"url"
);
headList
.
add
(
"id"
);
headList
.
add
(
"id"
);
headList
.
add
(
"source"
);
headList
.
add
(
"source"
);
poi
.
exportExcel
(
"D://crawlerdata/腾讯新闻-唐嫣.xlsx"
,
"腾讯新闻数据"
,
headList
,
dataList
);
poi
.
exportExcel
(
"D://crawlerdata/腾讯新闻-唐嫣
-1
.xlsx"
,
"腾讯新闻数据"
,
headList
,
dataList
);
}
}
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment