Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
A
articlenewscrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
chenweiyang
articlenewscrawler
Commits
7bf0e1d7
Commit
7bf0e1d7
authored
Jan 04, 2018
by
yangchen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
增加部分自媒体采集
parent
f09aa1c9
Show whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
341 additions
and
59 deletions
+341
-59
src/main/java/com/zhiwei/httpclient/HeadGet.java
+144
-35
src/main/java/com/zhiwei/parse/Baijia.java
+8
-1
src/main/java/com/zhiwei/parse/Dayu.java
+44
-1
src/main/java/com/zhiwei/parse/Fenghuang.java
+7
-2
src/main/java/com/zhiwei/parse/Meipai.java
+7
-1
src/main/java/com/zhiwei/parse/Miaopai.java
+6
-2
src/main/java/com/zhiwei/parse/QQKB.java
+97
-5
src/main/java/com/zhiwei/parse/Souhu.java
+9
-1
src/main/java/com/zhiwei/parse/Yidianzixun.java
+0
-1
src/main/java/com/zhiwei/parse/analysis/QQKBAccountAnalysis.java
+2
-2
src/main/java/com/zhiwei/parse/analysis/SouhuCommentAnalysis.java
+1
-0
src/test/java/com/zhiwei/crawler/QQAccountExample.java
+2
-2
src/test/java/com/zhiwei/crawler/SouhuAccountExample.java
+14
-6
No files found.
src/main/java/com/zhiwei/httpclient/HeadGet.java
View file @
7bf0e1d7
...
@@ -453,39 +453,145 @@ public class HeadGet {
...
@@ -453,39 +453,145 @@ public class HeadGet {
return
headerMap
;
return
headerMap
;
}
}
public
static
Map
<
String
,
String
>
getJikeComment39HeaderMap
(
String
cookie
)
{
/**
*
* @Description 大鱼号关键词采集头信息
* @param url
* @param cookie
* @return
* @throws IOException
*/
public
static
Map
<
String
,
String
>
getDayuByWordHeaderMap
(
String
cookie
)
{
Map
<
String
,
String
>
headerMap
=
new
HashMap
<
String
,
String
>();
Map
<
String
,
String
>
headerMap
=
new
HashMap
<
String
,
String
>();
headerMap
.
put
(
"Host"
,
"app.jike.ruguoapp.com"
);
headerMap
.
put
(
"User-Agent"
,
headerMap
.
put
(
"Accept-Language"
,
"zh-cn"
);
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"
);
headerMap
.
put
(
"Accept"
,
"*/*"
);
headerMap
.
put
(
"Accept"
,
headerMap
.
put
(
"User-Agent"
,
"%E5%8D%B3%E5%88%BB/989 CFNetwork/811.5.4 Darwin/16.7.0"
);
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
);
headerMap
.
put
(
"App-BuildNo"
,
"989"
);
headerMap
.
put
(
"Accept-Language"
,
"zh-CN,zh;q=0.9"
);
headerMap
.
put
(
"App-Version"
,
"3.9.1"
);
headerMap
.
put
(
"Content-Type"
,
"application/json"
);
headerMap
.
put
(
"Manufacturer"
,
"Apple"
);
headerMap
.
put
(
"Content-Length"
,
"39"
);
headerMap
.
put
(
"Connection"
,
"keep-alive"
);
headerMap
.
put
(
"Connection"
,
"keep-alive"
);
headerMap
.
put
(
"OS-Version"
,
"Version 10.3.3 (Build 14G60)"
);
headerMap
.
put
(
"Host"
,
"zzd.sm.cn"
);
if
(
cookie
!=
null
)
{
headerMap
.
put
(
"Cookie"
,
cookie
);
}
return
headerMap
;
}
/**
*
* @Description 天天快报评论采集头信息
* @param url
* @param cookie
* @return
* @throws IOException
*/
public
static
Map
<
String
,
String
>
getQQKBCommentHeaderMap
(
String
cookie
)
{
Map
<
String
,
String
>
headerMap
=
new
HashMap
<
String
,
String
>();
headerMap
.
put
(
"User-Agent"
,
"天天快报 4.6.2 qnreading (iPhone8,1; iOS 11.2.1; zh_CN; 4.6.2.89)"
);
headerMap
.
put
(
"Accept"
,
"*/*"
);
headerMap
.
put
(
"Accept-Language"
,
"zh-Hans-CN;q=1"
);
headerMap
.
put
(
"Connection"
,
"keep-alive"
);
headerMap
.
put
(
"Host"
,
"r.cnews.qq.com"
);
headerMap
.
put
(
"Referer"
,
"http://r.cnews.qq.com/inews/iphone/"
);
if
(
cookie
!=
null
)
{
if
(
cookie
!=
null
)
{
headerMap
.
put
(
"Cookie"
,
cookie
);
headerMap
.
put
(
"Cookie"
,
cookie
);
}
}
return
headerMap
;
return
headerMap
;
}
}
public
static
Map
<
String
,
String
>
getJikeComment94HeaderMap
(
String
cookie
)
{
/**
 * Builds the POST form parameters used to request the first page of
 * Tiantian Kuaibao (QQKB) comments.
 *
 * @param comment_id value stored under the {@code comment_id} key
 * @param article_id value stored under the {@code article_id} key
 * @return a new mutable map containing {@code chlid}, {@code showType},
 *         {@code page} (always {@code 1}), {@code comment_id} and {@code article_id}
 */
public static Map<String, Object> getQQKBCommentParamMap(String comment_id, String article_id) {
    final Map<String, Object> firstPage = new HashMap<>();

    // Constant fields of the request.
    firstPage.put("showType", "orig");
    firstPage.put("chlid", "daily_timeline");
    firstPage.put("page", 1);

    // Caller-supplied identifiers.
    firstPage.put("article_id", article_id);
    firstPage.put("comment_id", comment_id);

    return firstPage;
}
/**
 * Builds the POST form parameters used to request a follow-up page of
 * Tiantian Kuaibao (QQKB) comments.
 *
 * @param comment_id   value stored under the {@code comment_id} key
 * @param page         page number stored under the {@code page} key
 * @param coral_scorem value stored under the {@code coral_score} key
 * @param article_id   value stored under the {@code article_id} key
 * @param reply_id     value stored under the {@code reply_id} key
 * @return a new mutable map with the five values above plus the fixed fields
 *         {@code chlid}, {@code showType}, {@code sortType} and {@code c_type}
 */
public static Map<String, Object> getQQKBCommentParamMap2(String comment_id, int page, String coral_scorem,
        String article_id, String reply_id) {
    final Map<String, Object> nextPage = new HashMap<>();

    // Constant fields of the request.
    nextPage.put("c_type", "comment");
    nextPage.put("sortType", "hot");
    nextPage.put("showType", "orig");
    nextPage.put("chlid", "daily_timeline");

    // Paging cursor and identifiers supplied by the caller.
    nextPage.put("page", page);
    nextPage.put("coral_score", coral_scorem);
    nextPage.put("reply_id", reply_id);
    nextPage.put("comment_id", comment_id);
    nextPage.put("article_id", article_id);

    return nextPage;
}
/**
 * Builds the POST form parameters used to request the next page of replies
 * to a Tiantian Kuaibao (QQKB) comment.
 *
 * @param old_reply_id stored under {@code old_reply_id}; the key is omitted when this is {@code null}
 * @param comment_id   value stored under the {@code comment_id} key
 * @param article_id   value stored under the {@code article_id} key
 * @param reply_id     value stored under the {@code orig_id} key
 * @return a new mutable map with the values above plus the fixed {@code pageflag} field
 */
public static Map<String, Object> getQQKBCommentReplyParamMap(String old_reply_id, String comment_id,
        String article_id, String reply_id) {
    final Map<String, Object> replyPage = new HashMap<>();

    replyPage.put("pageflag", "old");
    replyPage.put("orig_id", reply_id);
    replyPage.put("article_id", article_id);
    replyPage.put("comment_id", comment_id);

    // The previous-reply cursor is only sent when the caller has one.
    final boolean hasCursor = old_reply_id != null;
    if (hasCursor) {
        replyPage.put("old_reply_id", old_reply_id);
    }

    return replyPage;
}
/**
 * Builds the POST form parameters for a keyword search request.
 *
 * <p>{@code query} is always present. {@code sid}, {@code queryid} and
 * {@code page} are added together, and only when both {@code sid} and
 * {@code queryid} are non-null.
 *
 * @param word    value stored under the {@code query} key
 * @param sid     value stored under the {@code sid} key, may be {@code null}
 * @param queryid value stored under the {@code queryid} key, may be {@code null}
 * @param page    page number; ignored unless both {@code sid} and {@code queryid} are given
 * @return a new mutable map with the fields described above
 */
public static Map<String, Object> getQQKBByWordParamMap(String word, String sid, String queryid, int page) {
    final Map<String, Object> search = new HashMap<>();
    search.put("query", word);

    // Without both identifiers only the bare query is sent.
    if (sid == null || queryid == null) {
        return search;
    }

    search.put("page", page);
    search.put("queryid", queryid);
    search.put("sid", sid);
    return search;
}
public
static
Map
<
String
,
String
>
getQQKBByWordHeaderMap
(
String
cookie
)
{
Map
<
String
,
String
>
headerMap
=
new
HashMap
<
String
,
String
>();
Map
<
String
,
String
>
headerMap
=
new
HashMap
<
String
,
String
>();
headerMap
.
put
(
"Host"
,
"app.jike.ruguoapp.com"
);
headerMap
.
put
(
"User-Agent"
,
headerMap
.
put
(
"Accept-Language"
,
"zh-cn"
);
"天天快报 4.6.2 qnreading (iPhone8,1; iOS 11.2.1; zh_CN; 4.6.2.89)"
);
headerMap
.
put
(
"Accept"
,
"*/*"
);
headerMap
.
put
(
"Accept"
,
headerMap
.
put
(
"User-Agent"
,
"%E5%8D%B3%E5%88%BB/989 CFNetwork/811.5.4 Darwin/16.7.0"
);
"*/*"
);
headerMap
.
put
(
"App-BuildNo"
,
"989"
);
headerMap
.
put
(
"Accept-Language"
,
"zh-Hans-CN;q=1"
);
headerMap
.
put
(
"App-Version"
,
"3.9.1"
);
// headerMap.put("Content-Type", "application/json");
headerMap
.
put
(
"Manufacturer"
,
"Apple"
);
headerMap
.
put
(
"Content-Length"
,
"94"
);
headerMap
.
put
(
"Connection"
,
"keep-alive"
);
headerMap
.
put
(
"Connection"
,
"keep-alive"
);
headerMap
.
put
(
"OS-Version"
,
"Version 10.3.3 (Build 14G60)"
);
headerMap
.
put
(
"Host"
,
"r.cnews.qq.com"
);
headerMap
.
put
(
"Referer"
,
"http://r.cnews.qq.com/inews/iphone/"
);
headerMap
.
put
(
"deviceToken"
,
"<585bee8d f6739b65 1248b40d 7be9dc4a 126bbf27 85ad470e ce6b7923 bbcb7c1c>"
);
headerMap
.
put
(
"qn-rid"
,
"9343AF22-FE03-4DFF-BC91-1D41997AA9B4"
);
headerMap
.
put
(
"qn-sig"
,
"8D2B15DA2D55970187106A58C1966986"
);
headerMap
.
put
(
"omgbizid"
,
"5144dee3f39a8d4dad994e5391fcebd1a0d50090112b14"
);
headerMap
.
put
(
"omgid"
,
"0f63f8e68f041746372b9ceecc8e97f028e90010112b14"
);
headerMap
.
put
(
"idfa"
,
"FE659B7E-5104-44C2-8A31-F88DEE7A2747"
);
headerMap
.
put
(
"appver"
,
"11.2.1_qnreading_4.6.2"
);
headerMap
.
put
(
"devid"
,
"6D33F35F-880D-42A6-A23F-881BEC6960EC"
);
if
(
cookie
!=
null
)
{
if
(
cookie
!=
null
)
{
headerMap
.
put
(
"Cookie"
,
cookie
);
headerMap
.
put
(
"Cookie"
,
cookie
);
...
@@ -493,23 +599,26 @@ public class HeadGet {
...
@@ -493,23 +599,26 @@ public class HeadGet {
return
headerMap
;
return
headerMap
;
}
}
public
static
Map
<
String
,
Object
>
getJikeCommentParamMap
(
String
targetId
,
String
time
)
{
public
static
Map
<
String
,
String
>
getWangyiCommentHeaderMap
(
String
cookie
)
{
Map
<
String
,
Object
>
paramMap
=
new
HashMap
<
String
,
Object
>();
Map
<
String
,
String
>
headerMap
=
new
HashMap
<
String
,
String
>();
JSONObject
json
=
new
JSONObject
();
headerMap
.
put
(
"User-Agent"
,
if
(
time
!=
null
)
{
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"
);
json
.
put
(
"createdAt"
,
time
);
headerMap
.
put
(
"Accept"
,
paramMap
.
put
(
"loadMoreKey"
,
json
);
"*/*"
);
headerMap
.
put
(
"Accept-Language"
,
"zh-CN,zh;q=0.9"
);
headerMap
.
put
(
"Connection"
,
"keep-alive"
);
headerMap
.
put
(
"Host"
,
"comment.dy.163.com"
);
if
(
cookie
!=
null
)
{
headerMap
.
put
(
"Cookie"
,
cookie
);
}
}
paramMap
.
put
(
"targetId"
,
targetId
);
return
headerMap
;
return
paramMap
;
}
}
public
static
void
main
(
String
[]
args
)
{
public
static
void
main
(
String
[]
args
)
{
String
url
=
"https://app.jike.ruguoapp.com/1.0/messageComments/listPrimary"
;
String
url
=
"http://comment.dy.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/D75MDLL10524H5KD/comments/newList?offset=0&limit=30&showLevelThreshold=72&headLimit=1&tailLimit=2&callback=getData&ibc=newspc&_=1514966469573"
;
String
cookie
=
"jike:config:searchPlaceholderLastInfo=1514465731446#0; jike:sess=eyJfdWlkIjoiNWE0NGRmMTlmOWM4NWYwMDExODJhMjkwIiwiX3Nlc3Npb25Ub2tlbiI6InQ5cExKaEpiTFdVeDFsbUxKZW9vMUlKMEsifQ==; jike:sess.sig=HBuRKsTsMIIR9aMDUdkNV_mGH1E"
;
String
cookie
=
"phone_id=;%20phone_token=;%20luin=o0497332654;%20lskey=00030000fafc45b92e51a92d1a2d1c0536594402729a928137fe205f823d71e18c3e786e6f368baff37f7edc;%20uin=o0497332654;%20skey=MSF4MCe62n;%20sigA2=75E9AE34BD844F7CD19AC30353DE6116A767F02C50C78ABA2FB11B5B1D74324CCEDA1C9D13B6B3719AAA7875B14DBE4C560CB5FB99A5D63390B8041F6C83A48401EA8D5DA7B04E7A;%20openid=oijc7uLZNVp85r-MpFBYxwxzdEkg;%20appid=wxe90c9765ad00e2cd;%20access_token=5_IWbsKfygpJ0lnbUnnFnfwvJbQ-Gsn52dfcob8V66AgcW1SAGy8xloQk1nVWfjVvR0b637c-qcRWE7M2QtFLKLsZP8o6dBVABpDhbzRQ92tw;%20refresh_token=5_IWbsKfygpJ0lnbUnnFnfwgDT4pA9HEbY-wuqnqIHWf9AzdmIueZFuzHYfnZPuSNEc0ZjDuHXrtSrRBMMD-7kgj06iF0NdAOi-KRj6-mrmlA;%20unionid=onCs1uNNpjMXeYIHAhacGypamEBk;%20logintype=0"
;
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getJikeComment94HeaderMap
(
cookie
);
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getWangyiCommentHeaderMap
(
null
);
Map
<
String
,
Object
>
paramMap
=
HeadGet
.
getJikeCommentParamMap
(
"5a449a3d580d23001148412e"
,
"2017-12-28T10:17:50.601Z"
);
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
headerMap
);
String
result
=
HttpClient
.
executeHttpRequestPost
(
url
,
headerMap
,
paramMap
);
System
.
out
.
println
(
result
);
System
.
out
.
println
(
result
);
System
.
out
.
println
(
result
.
length
());
System
.
out
.
println
(
result
.
length
());
}
}
...
...
src/main/java/com/zhiwei/parse/Baijia.java
View file @
7bf0e1d7
...
@@ -16,6 +16,13 @@ public class Baijia {
...
@@ -16,6 +16,13 @@ public class Baijia {
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
Baijia
.
class
);
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
Baijia
.
class
);
private
static
BaijiaAccountAnalysis
baijiaAccountAnalysis
=
new
BaijiaAccountAnalysis
();
private
static
BaijiaAccountAnalysis
baijiaAccountAnalysis
=
new
BaijiaAccountAnalysis
();
/**
*
* @Description 百家号历史文章采集
* @param app_id
* @param startTime
* @return
*/
public
static
List
<
Map
<
String
,
Object
>>
getBaijiaAccountData
(
String
app_id
,
String
startTime
)
{
public
static
List
<
Map
<
String
,
Object
>>
getBaijiaAccountData
(
String
app_id
,
String
startTime
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
int
i
=
0
;
int
i
=
0
;
...
@@ -23,7 +30,7 @@ public class Baijia {
...
@@ -23,7 +30,7 @@ public class Baijia {
while
(
true
)
{
while
(
true
)
{
try
{
try
{
String
url
=
"https://baijia.baidu.com/writerlistarticle?ajax=json&app_id="
+
app_id
+
"&_limit=20&_skip="
;
String
url
=
"https://baijia.baidu.com/writerlistarticle?ajax=json&app_id="
+
app_id
+
"&_limit=20&_skip="
;
System
.
out
.
println
(
url
+
i
);
logger
.
info
(
url
+
i
);
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getBaijiaAccountHeaderMap
(
null
);
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getBaijiaAccountHeaderMap
(
null
);
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
+
i
,
headerMap
);
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
+
i
,
headerMap
);
List
<
Map
<
String
,
Object
>>
list
=
baijiaAccountAnalysis
.
getBaijiaAccountData
(
result
,
startTime
);
List
<
Map
<
String
,
Object
>>
list
=
baijiaAccountAnalysis
.
getBaijiaAccountData
(
result
,
startTime
);
...
...
src/main/java/com/zhiwei/parse/Dayu.java
View file @
7bf0e1d7
package
com
.
zhiwei
.
parse
;
package
com
.
zhiwei
.
parse
;
import
java.net.URLEncoder
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.List
;
...
@@ -12,6 +13,7 @@ import com.alibaba.fastjson.JSONObject;
...
@@ -12,6 +13,7 @@ import com.alibaba.fastjson.JSONObject;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.parse.analysis.DayuAccountAnalysis
;
import
com.zhiwei.parse.analysis.DayuAccountAnalysis
;
import
com.zhiwei.parse.analysis.DayuByWordAnalysis
;
import
com.zhiwei.parse.analysis.DayuCommentAnalysis
;
import
com.zhiwei.parse.analysis.DayuCommentAnalysis
;
import
com.zhiwei.zhiweiTools.tools.ZhiWeiTools
;
import
com.zhiwei.zhiweiTools.tools.ZhiWeiTools
;
...
@@ -19,6 +21,7 @@ public class Dayu {
...
@@ -19,6 +21,7 @@ public class Dayu {
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
Dayu
.
class
);
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
Dayu
.
class
);
private
static
DayuAccountAnalysis
dayuAccountAnalysis
=
new
DayuAccountAnalysis
();
private
static
DayuAccountAnalysis
dayuAccountAnalysis
=
new
DayuAccountAnalysis
();
private
static
DayuCommentAnalysis
dayuCommentAnalysis
=
new
DayuCommentAnalysis
();
private
static
DayuCommentAnalysis
dayuCommentAnalysis
=
new
DayuCommentAnalysis
();
private
static
DayuByWordAnalysis
dayuByWordAnalysis
=
new
DayuByWordAnalysis
();
/**
/**
*
*
...
@@ -45,7 +48,7 @@ public class Dayu {
...
@@ -45,7 +48,7 @@ public class Dayu {
dataList
.
addAll
(
lists
);
dataList
.
addAll
(
lists
);
System
.
out
.
println
(
"================解析第"
+
i
+
"页====此时有数据=="
+
dataList
.
size
());
System
.
out
.
println
(
"================解析第"
+
i
+
"页====此时有数据=="
+
dataList
.
size
());
i
++;
i
++;
ZhiWeiTools
.
sleep
(
8
000
);
ZhiWeiTools
.
sleep
(
7
000
);
}
}
return
dataList
;
return
dataList
;
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
...
@@ -94,6 +97,12 @@ public class Dayu {
...
@@ -94,6 +97,12 @@ public class Dayu {
}
}
/**
*
* @Description 获取文章评论数
* @param articleId
* @return
*/
public
static
int
getDayuCommentCount
(
String
articleId
)
{
public
static
int
getDayuCommentCount
(
String
articleId
)
{
String
url
=
"http://m.uczzd.cn/iflow/api/v2/cmt/article/"
+
articleId
+
"/comments/byhot"
;
String
url
=
"http://m.uczzd.cn/iflow/api/v2/cmt/article/"
+
articleId
+
"/comments/byhot"
;
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getDayuCommentHeaderMap
(
null
);
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getDayuCommentHeaderMap
(
null
);
...
@@ -102,4 +111,38 @@ public class Dayu {
...
@@ -102,4 +111,38 @@ public class Dayu {
return
json
.
getJSONObject
(
"data"
).
getInteger
(
"comment_cnt"
);
return
json
.
getJSONObject
(
"data"
).
getInteger
(
"comment_cnt"
);
}
}
/**
 * Collects Dayu search results for a keyword by requesting successive result
 * pages until a page yields no parsed entries.
 *
 * <p>Any exception ends the crawl; whatever was gathered up to that point is
 * returned, so the result is never {@code null}.
 *
 * @param word keyword to search for; it is URL-encoded as UTF-8 before use
 * @return the parsed entries of all pages fetched so far (possibly empty)
 */
public static List<Map<String, Object>> getDayuByWordData(String word) {
    Map<String, String> headerMap = HeadGet.getDayuByWordHeaderMap(null);
    List<Map<String, Object>> dataList = new ArrayList<Map<String, Object>>();
    int i = 1;
    try {
        // The encoded keyword does not change between pages, so encode it once.
        String encodedWord = URLEncoder.encode(word, "UTF-8");
        while (true) {
            String url = "http://zzd.sm.cn/iflow/api/v1/article/fsearch?page=" + i
                    + "&size=20&sid=&q=" + encodedWord + "&scene=0";
            logger.info(url);
            String result = HttpClient.executeHttpRequestGet(url, headerMap);
            List<Map<String, Object>> lists = dayuByWordAnalysis.getDayuByWordData(result);
            // An empty or missing page marks the end of the result set.
            if (lists == null || lists.isEmpty()) {
                break;
            }
            dataList.addAll(lists);
            // Pause between pages before requesting the next one.
            ZhiWeiTools.sleep(5000);
            i++;
        }
        return dataList;
    } catch (Exception e) {
        // Pass the throwable itself: the message has no "{}" placeholder, so
        // e.getMessage() would be discarded and the failure cause lost.
        logger.error("关键词获取大鱼信息出错", e);
        return dataList;
    }
}
}
}
src/main/java/com/zhiwei/parse/Fenghuang.java
View file @
7bf0e1d7
package
com
.
zhiwei
.
parse
;
package
com
.
zhiwei
.
parse
;
import
java.io.IOException
;
import
java.io.UnsupportedEncodingException
;
import
java.io.UnsupportedEncodingException
;
import
java.net.URLEncoder
;
import
java.net.URLEncoder
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
...
@@ -93,6 +92,12 @@ public class Fenghuang {
...
@@ -93,6 +92,12 @@ public class Fenghuang {
return
map
;
return
map
;
}
}
/**
*
* @Description 凤凰关键词采集
* @param word
* @return
*/
public
static
List
<
Map
<
String
,
Object
>>
getFenghuangByWord
(
String
word
)
{
public
static
List
<
Map
<
String
,
Object
>>
getFenghuangByWord
(
String
word
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
int
i
=
1
;
int
i
=
1
;
...
@@ -120,7 +125,7 @@ public class Fenghuang {
...
@@ -120,7 +125,7 @@ public class Fenghuang {
logger
.
error
(
"依据关键词获取凤凰文章出错"
,
e
.
getMessage
());
logger
.
error
(
"依据关键词获取凤凰文章出错"
,
e
.
getMessage
());
e
.
printStackTrace
();
e
.
printStackTrace
();
return
dataList
;
return
dataList
;
}
catch
(
IO
Exception
e
)
{
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
e
.
printStackTrace
();
logger
.
error
(
"链接获取凤凰信息出错"
,
e
.
getMessage
());
logger
.
error
(
"链接获取凤凰信息出错"
,
e
.
getMessage
());
return
dataList
;
return
dataList
;
...
...
src/main/java/com/zhiwei/parse/Meipai.java
View file @
7bf0e1d7
package
com
.
zhiwei
.
parse
;
package
com
.
zhiwei
.
parse
;
import
java.io.UnsupportedEncodingException
;
import
java.net.URLEncoder
;
import
java.net.URLEncoder
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.List
;
...
@@ -18,6 +17,12 @@ public class Meipai {
...
@@ -18,6 +17,12 @@ public class Meipai {
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
Meipai
.
class
);
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
Meipai
.
class
);
private
static
MeipaiByWordAnalysis
meipaiByWordAnalysis
=
new
MeipaiByWordAnalysis
();
private
static
MeipaiByWordAnalysis
meipaiByWordAnalysis
=
new
MeipaiByWordAnalysis
();
/**
*
* @Description 美拍关键词获取视频数据
* @param word
* @return
*/
public
static
List
<
Map
<
String
,
Object
>>
getMeipaiByWordData
(
String
word
)
{
public
static
List
<
Map
<
String
,
Object
>>
getMeipaiByWordData
(
String
word
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
try
{
try
{
...
@@ -44,6 +49,7 @@ public class Meipai {
...
@@ -44,6 +49,7 @@ public class Meipai {
return
dataList
;
return
dataList
;
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取数据出错"
,
e
.
getMessage
());
e
.
printStackTrace
();
e
.
printStackTrace
();
return
dataList
;
return
dataList
;
}
}
...
...
src/main/java/com/zhiwei/parse/Miaopai.java
View file @
7bf0e1d7
package
com
.
zhiwei
.
parse
;
package
com
.
zhiwei
.
parse
;
import
java.util.HashMap
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
import
org.jsoup.Jsoup
;
import
org.jsoup.Jsoup
;
...
@@ -15,7 +14,12 @@ import com.zhiwei.httpclient.HttpClient;
...
@@ -15,7 +14,12 @@ import com.zhiwei.httpclient.HttpClient;
public
class
Miaopai
{
public
class
Miaopai
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
Miaopai
.
class
);
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
Miaopai
.
class
);
/**
*
* @Description 秒拍依据链接获取数据
* @param url
* @return
*/
public
static
Map
<
String
,
Object
>
getMiaopaiDataByURL
(
String
url
)
{
public
static
Map
<
String
,
Object
>
getMiaopaiDataByURL
(
String
url
)
{
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getMiaoPaiByURlHeaderMap
(
null
);
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getMiaoPaiByURlHeaderMap
(
null
);
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
headerMap
);
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
headerMap
);
...
...
src/main/java/com/zhiwei/parse/QQ.java
→
src/main/java/com/zhiwei/parse/QQ
KB
.java
View file @
7bf0e1d7
...
@@ -7,15 +7,25 @@ import java.util.Map;
...
@@ -7,15 +7,25 @@ import java.util.Map;
import
org.slf4j.Logger
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.parse.analysis.QQAccountAnalysis
;
import
com.zhiwei.parse.analysis.QQKBAccountAnalysis
;
import
com.zhiwei.parse.analysis.QQKBCommentAnalysis
;
import
com.zhiwei.zhiweiTools.tools.ZhiWeiTools
;
import
com.zhiwei.zhiweiTools.tools.ZhiWeiTools
;
public
class
QQ
{
public
class
QQKB
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
QQ
.
class
);
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
QQKB
.
class
);
private
static
QQAccountAnalysis
qqAccountAnalysis
=
new
QQAccountAnalysis
();
private
static
QQKBAccountAnalysis
qqAccountAnalysis
=
new
QQKBAccountAnalysis
();
private
static
QQKBCommentAnalysis
qqkbCommentAnalysis
=
new
QQKBCommentAnalysis
();
/**
*
* @Description 采集天天快报历史文章
* @param child
* @param cookie
* @return
*/
public
static
List
<
Map
<
String
,
Object
>>
getQQAccountData
(
String
child
,
String
cookie
)
{
public
static
List
<
Map
<
String
,
Object
>>
getQQAccountData
(
String
child
,
String
cookie
)
{
String
url
=
"http://r.cnews.qq.com/getSubNewsIndex"
;
String
url
=
"http://r.cnews.qq.com/getSubNewsIndex"
;
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getQQAccountHeaderMap
(
cookie
);
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getQQAccountHeaderMap
(
cookie
);
...
@@ -35,7 +45,7 @@ public class QQ {
...
@@ -35,7 +45,7 @@ public class QQ {
try
{
try
{
ids
=
ids
.
substring
(
0
,
ids
.
length
()-
1
);
ids
=
ids
.
substring
(
0
,
ids
.
length
()-
1
);
System
.
out
.
println
(
ids
);
System
.
out
.
println
(
ids
);
ZhiWeiTools
.
sleep
(
8
000
);
ZhiWeiTools
.
sleep
(
7
000
);
paramMap
.
clear
();
paramMap
.
clear
();
paramMap
=
HeadGet
.
getQQAccountOtherParamMap
(
ids
);
paramMap
=
HeadGet
.
getQQAccountOtherParamMap
(
ids
);
result
=
HttpClient
.
executeHttpRequestPost
(
url
,
headerMap
,
paramMap
);
result
=
HttpClient
.
executeHttpRequestPost
(
url
,
headerMap
,
paramMap
);
...
@@ -68,5 +78,87 @@ public class QQ {
...
@@ -68,5 +78,87 @@ public class QQ {
}
}
/**
 * Collects Tiantian Kuaibao (QQKB) comments for an article by posting to the
 * comment endpoint page after page until a page yields no parsed comments.
 *
 * <p>Any exception ends the crawl; whatever was gathered up to that point is
 * returned, so the result is never {@code null}.
 *
 * @param cookie     cookie value used to build the request headers; also passed to the parser
 * @param comment_id comment thread identifier sent with every request
 * @param article_id article identifier sent with every request
 * @return the parsed comments of all pages fetched so far (possibly empty)
 */
public static List<Map<String, Object>> getQQKBCommentData(String cookie, String comment_id, String article_id) {
    List<Map<String, Object>> dataList = new ArrayList<Map<String, Object>>();
    Map<String, String> headerMap = HeadGet.getQQKBCommentHeaderMap(cookie);
    try {
        Map<String, Object> paramMap = HeadGet.getQQKBCommentParamMap(comment_id, article_id);
        int i = 1;
        while (true) {
            String result = HttpClient.executeHttpRequestPost("http://r.cnews.qq.com/getQQNewsComment", headerMap, paramMap);
            logger.debug(result);
            List<Map<String, Object>> lists = qqkbCommentAnalysis.getCommentData(result, cookie, comment_id, article_id);
            // An empty or missing page marks the end of the comment list.
            if (lists == null || lists.isEmpty()) {
                break;
            }
            dataList.addAll(lists);
            // The parameters for the following page are derived from the current response.
            paramMap = qqkbCommentAnalysis.getParamMap(result, i, comment_id, article_id);
            if (paramMap == null) {
                // No parameters for a further page: stop instead of posting without a body.
                break;
            }
            i++;
            // Pause between pages before requesting the next one.
            ZhiWeiTools.sleep(5000);
        }
        return dataList;
    } catch (Exception e) {
        // Pass the throwable itself: the message has no "{}" placeholder, so
        // e.getMessage() would be discarded and the failure cause lost.
        logger.error("解析天天快报评论出错", e);
        return dataList;
    }
}
/**
 * Reads the total comment count of a Tiantian Kuaibao (QQKB) article from the
 * first page of the comment endpoint ({@code comments.count} in the JSON reply).
 *
 * @param cookie     cookie value used to build the request headers
 * @param comment_id comment thread identifier sent with the request
 * @param article_id article identifier sent with the request
 * @return the reported comment count, or {@code 0} when the request or parsing fails
 */
public static int getCommentCount(String cookie, String comment_id, String article_id) {
    Map<String, String> headerMap = HeadGet.getQQKBCommentHeaderMap(cookie);
    try {
        Map<String, Object> paramMap = HeadGet.getQQKBCommentParamMap(comment_id, article_id);
        String result = HttpClient.executeHttpRequestPost("http://r.cnews.qq.com/getQQNewsComment", headerMap, paramMap);
        JSONObject json = JSONObject.parseObject(result);
        // A missing "comments" object or "count" field raises here and falls through to the 0 fallback.
        return json.getJSONObject("comments").getInteger("count");
    } catch (Exception e) {
        // Pass the throwable itself: the message has no "{}" placeholder, so
        // e.getMessage() would be discarded and the failure cause lost.
        logger.error("解析天天快报评论出错", e);
        return 0;
    }
}
// public static List<Map<String,Object>> getQQKBByWordData(String word,String cookie) {
// List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
// Map<String,String> headerMap = HeadGet.getQQKBByWordHeaderMap(cookie);
// Map<String,Object> paramMap = HeadGet.getQQKBByWordParamMap(word,null,null,0);
// int i = 1;
// try {
// String result = HttpClient.executeHttpRequestPost("http://r.cnews.qq.com/search", headerMap, paramMap);
// System.out.println(result);
// JSONObject json = JSONObject.parseObject(result);
// String sid = json.getString("sid");
// String queryid = json.getString("queryid");
// System.out.println(sid + "================" + queryid);
// while(true) {
// ZhiWeiTools.sleep(5000);
// i++;
// paramMap.clear();
// paramMap = HeadGet.getQQKBByWordParamMap(word, sid, queryid, i);
// result = HttpClient.executeHttpRequestPost("http://r.cnews.qq.com/searchMore", headerMap, paramMap);
// System.out.println(result);
// }
// } catch (Exception e) {
// logger.error("天天快报关键词采集出错",e.getMessage());
// return dataList;
// }
// }
}
}
src/main/java/com/zhiwei/parse/Souhu.java
View file @
7bf0e1d7
package
com
.
zhiwei
.
parse
;
package
com
.
zhiwei
.
parse
;
import
java.io.IOException
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.List
;
...
@@ -23,6 +22,12 @@ public class Souhu {
...
@@ -23,6 +22,12 @@ public class Souhu {
private
static
SouhuAccountAnalysis
souhuAccountAnalysis
=
new
SouhuAccountAnalysis
();
private
static
SouhuAccountAnalysis
souhuAccountAnalysis
=
new
SouhuAccountAnalysis
();
private
static
SouhuCommentAnalysis
souhuCommentAnalysis
=
new
SouhuCommentAnalysis
();
private
static
SouhuCommentAnalysis
souhuCommentAnalysis
=
new
SouhuCommentAnalysis
();
/**
*
* @Description 获取链接评论数
* @param url
* @return
*/
public
static
int
getSouhuCommentCount
(
String
url
)
{
public
static
int
getSouhuCommentCount
(
String
url
)
{
String
newurl
=
souhuCommentAnalysis
.
getSouhuURL
(
url
);
String
newurl
=
souhuCommentAnalysis
.
getSouhuURL
(
url
);
int
i
;
int
i
;
...
@@ -70,7 +75,9 @@ public class Souhu {
...
@@ -70,7 +75,9 @@ public class Souhu {
if
(
jsonArray
.
size
()
<
1
)
{
if
(
jsonArray
.
size
()
<
1
)
{
break
;
break
;
}
}
if
(
startTime
==
null
)
{
dataList
.
addAll
(
dataList1
);
dataList
.
addAll
(
dataList1
);
}
//判断时间
//判断时间
if
(
startTime
!=
null
)
{
if
(
startTime
!=
null
)
{
for
(
Map
<
String
,
Object
>
map
:
dataList1
)
{
for
(
Map
<
String
,
Object
>
map
:
dataList1
)
{
...
@@ -82,6 +89,7 @@ public class Souhu {
...
@@ -82,6 +89,7 @@ public class Souhu {
dataList
.
add
(
map
);
dataList
.
add
(
map
);
}
}
}
}
logger
.
info
(
"=============获取到的数据数目{}"
,
dataList
.
size
());
i
++;
i
++;
ZhiWeiTools
.
sleep
(
3000
);
ZhiWeiTools
.
sleep
(
3000
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
...
...
src/main/java/com/zhiwei/parse/Yidianzixun.java
View file @
7bf0e1d7
package
com
.
zhiwei
.
parse
;
package
com
.
zhiwei
.
parse
;
import
java.io.IOException
;
import
java.net.URLEncoder
;
import
java.net.URLEncoder
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.List
;
...
...
src/main/java/com/zhiwei/parse/analysis/QQAccountAnalysis.java
→
src/main/java/com/zhiwei/parse/analysis/QQ
KB
AccountAnalysis.java
View file @
7bf0e1d7
...
@@ -11,8 +11,8 @@ import org.slf4j.LoggerFactory;
...
@@ -11,8 +11,8 @@ import org.slf4j.LoggerFactory;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
public
class
QQAccountAnalysis
{
public
class
QQ
KB
AccountAnalysis
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
QQAccountAnalysis
.
class
);
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
QQ
KB
AccountAnalysis
.
class
);
/**
/**
*
*
...
...
src/main/java/com/zhiwei/parse/analysis/SouhuCommentAnalysis.java
View file @
7bf0e1d7
...
@@ -33,6 +33,7 @@ public class SouhuCommentAnalysis {
...
@@ -33,6 +33,7 @@ public class SouhuCommentAnalysis {
source_id
=
s
.
split
(
"_"
)[
0
];
source_id
=
s
.
split
(
"_"
)[
0
];
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
"链接解析错误"
,
e
.
getMessage
());
logger
.
error
(
"链接解析错误"
,
e
.
getMessage
());
return
null
;
}
}
String
newurl
=
"http://apiv2.sohu.com/api/comment/list?page_size=10&topic_id="
+
topic_id
+
"&source_id=mp_"
+
source_id
;
String
newurl
=
"http://apiv2.sohu.com/api/comment/list?page_size=10&topic_id="
+
topic_id
+
"&source_id=mp_"
+
source_id
;
return
newurl
;
return
newurl
;
...
...
src/test/java/com/zhiwei/crawler/QQAccountExample.java
View file @
7bf0e1d7
...
@@ -7,7 +7,7 @@ import java.util.Map;
...
@@ -7,7 +7,7 @@ import java.util.Map;
import
org.junit.Test
;
import
org.junit.Test
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.QQ
;
import
com.zhiwei.parse.QQ
KB
;
public
class
QQAccountExample
{
public
class
QQAccountExample
{
...
@@ -16,7 +16,7 @@ public class QQAccountExample {
...
@@ -16,7 +16,7 @@ public class QQAccountExample {
String
child
=
"5002744"
;
String
child
=
"5002744"
;
String
cookie
=
"phone_id=;%20phone_token=;%20luin=o0497332654;%20lskey=00030000db3c2ec2393ea968f523f50144db7ab5aec60e79d2509c271bdacdf784e88ac1f58b7493c23ceb15;%20uin=o0497332654;%20skey=M67MOgvFQJ;%20sigA2=D3046D543D9BA50CFE749D63B1F05AF28A281C29B4F1353374AB7A19D9527497A67E507C6829AE44F67C1EA032C2A3728301D2ABC864DA32BCA7D4C7A61609F9F3BC9AE0A7243003;%20openid=oijc7uLZNVp85r-MpFBYxwxzdEkg;%20appid=wxe90c9765ad00e2cd;%20access_token=5_IWbsKfygpJ0lnbUnnFnfwmUT_jxJCnY5yVwhmL3e2K5FOTRth6jz8SKVHGseA3v9s8UIZxw00LpF1uC9l7W5WL2trdb69LlCvE1s7twReOw;%20refresh_token=5_IWbsKfygpJ0lnbUnnFnfwgDT4pA9HEbY-wuqnqIHWf9AzdmIueZFuzHYfnZPuSNEc0ZjDuHXrtSrRBMMD-7kgj06iF0NdAOi-KRj6-mrmlA;%20unionid=onCs1uNNpjMXeYIHAhacGypamEBk;%20logintype=0"
;
String
cookie
=
"phone_id=;%20phone_token=;%20luin=o0497332654;%20lskey=00030000db3c2ec2393ea968f523f50144db7ab5aec60e79d2509c271bdacdf784e88ac1f58b7493c23ceb15;%20uin=o0497332654;%20skey=M67MOgvFQJ;%20sigA2=D3046D543D9BA50CFE749D63B1F05AF28A281C29B4F1353374AB7A19D9527497A67E507C6829AE44F67C1EA032C2A3728301D2ABC864DA32BCA7D4C7A61609F9F3BC9AE0A7243003;%20openid=oijc7uLZNVp85r-MpFBYxwxzdEkg;%20appid=wxe90c9765ad00e2cd;%20access_token=5_IWbsKfygpJ0lnbUnnFnfwmUT_jxJCnY5yVwhmL3e2K5FOTRth6jz8SKVHGseA3v9s8UIZxw00LpF1uC9l7W5WL2trdb69LlCvE1s7twReOw;%20refresh_token=5_IWbsKfygpJ0lnbUnnFnfwgDT4pA9HEbY-wuqnqIHWf9AzdmIueZFuzHYfnZPuSNEc0ZjDuHXrtSrRBMMD-7kgj06iF0NdAOi-KRj6-mrmlA;%20unionid=onCs1uNNpjMXeYIHAhacGypamEBk;%20logintype=0"
;
List
<
Map
<
String
,
Object
>>
dataList
=
QQ
.
getQQAccountData
(
child
,
cookie
);
List
<
Map
<
String
,
Object
>>
dataList
=
QQ
KB
.
getQQAccountData
(
child
,
cookie
);
System
.
out
.
println
(
dataList
.
size
());
System
.
out
.
println
(
dataList
.
size
());
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
List
<
String
>
headList
=
new
ArrayList
<
String
>();
List
<
String
>
headList
=
new
ArrayList
<
String
>();
...
...
src/test/java/com/zhiwei/crawler/SouhuAccountExample.java
View file @
7bf0e1d7
package
com
.
zhiwei
.
crawler
;
package
com
.
zhiwei
.
crawler
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
import
org.junit.Test
;
import
org.junit.Test
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Souhu
;
import
com.zhiwei.parse.Souhu
;
...
@@ -12,13 +14,19 @@ public class SouhuAccountExample {
...
@@ -12,13 +14,19 @@ public class SouhuAccountExample {
@Test
@Test
public
void
souhuAccountTest
()
{
public
void
souhuAccountTest
()
{
List
<
Map
<
String
,
Object
>>
lists
=
Souhu
.
getSouHuAccountData
(
"
MjQ4MDQ5Nzg2MEBzaW5hLnNvaHUuY29t"
,
null
,
tru
e
);
List
<
Map
<
String
,
Object
>>
lists
=
Souhu
.
getSouHuAccountData
(
"
c29odXptdGh5YXRieUBzb2h1LmNvbQ=="
,
"2017-01-01 00:00:00"
,
fals
e
);
System
.
out
.
println
(
lists
.
size
());
System
.
out
.
println
(
lists
.
size
());
int
i
=
0
;
List
<
String
>
headList
=
new
ArrayList
<
String
>();
for
(
Map
<
String
,
Object
>
map
:
lists
)
{
headList
.
add
(
"title"
);
System
.
out
.
println
(
map
.
toString
());
headList
.
add
(
"time"
);
System
.
out
.
println
(
i
++);
headList
.
add
(
"content"
);
}
headList
.
add
(
"url"
);
headList
.
add
(
"comment"
);
headList
.
add
(
"tags"
);
headList
.
add
(
"newsid"
);
headList
.
add
(
"newsPv"
);
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
poi
.
exportExcel
(
"D:\\crawlerdata\\搜狐号历史文章.xlsx"
,
"sasd"
,
headList
,
lists
);
}
}
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment