Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
A
articlenewscrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
chenweiyang
articlenewscrawler
Commits
89439323
Commit
89439323
authored
Apr 24, 2018
by
yangchen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
1
parent
132e6350
Hide whitespace changes
Inline
Side-by-side
Showing
16 changed files
with
417 additions
and
58 deletions
+417
-58
src/main/java/com/zhiwei/httpclient/HeadGet.java
+128
-10
src/main/java/com/zhiwei/parse/Baijia.java
+45
-2
src/main/java/com/zhiwei/parse/Fenghuang.java
+20
-20
src/main/java/com/zhiwei/parse/QQKB.java
+2
-2
src/main/java/com/zhiwei/parse/TXNews.java
+53
-0
src/main/java/com/zhiwei/parse/analysis/BaijiaAccountAnalysis.java
+18
-0
src/main/java/com/zhiwei/parse/analysis/DayuAccountAnalysis.java
+0
-1
src/main/java/com/zhiwei/parse/analysis/TXNewsByWordAnalysis.java
+55
-0
src/test/java/com/zhiwei/crawler/BaijiaAccountExample.java
+18
-2
src/test/java/com/zhiwei/crawler/DayuAccountExample.java
+27
-5
src/test/java/com/zhiwei/crawler/DayuByWordExample.java
+1
-1
src/test/java/com/zhiwei/crawler/FenghuangAccountExample.java
+2
-2
src/test/java/com/zhiwei/crawler/QQAccountExample.java
+5
-4
src/test/java/com/zhiwei/crawler/SouhuAccountExample.java
+2
-2
src/test/java/com/zhiwei/crawler/TXNewsByWordExample.java
+26
-0
src/test/java/com/zhiwei/crawler/WangyiCommentExample.java
+15
-7
No files found.
src/main/java/com/zhiwei/httpclient/HeadGet.java
View file @
89439323
package
com
.
zhiwei
.
httpclient
;
import
java.io.IOException
;
import
java.io.UnsupportedEncodingException
;
import
java.net.URLEncoder
;
import
java.util.Date
;
import
java.util.HashMap
;
import
java.util.Map
;
...
...
@@ -9,6 +11,7 @@ import org.jsoup.Jsoup;
import
org.jsoup.nodes.Document
;
import
com.alibaba.fastjson.JSONObject
;
import
com.sun.net.httpserver.Headers
;
public
class
HeadGet
{
...
...
@@ -645,37 +648,152 @@ public class HeadGet {
return
headerMap
;
}
public
static
Map
<
String
,
String
>
getQQkuaiCommentHeaderMap
(
String
cookie
)
{
Map
<
String
,
String
>
headerMap
=
new
HashMap
<
String
,
String
>();
// public static Map<String,String> getQQkuaiCommentHeaderMap(String cookie) {
// Map<String,String> headerMap = new HashMap<String, String>();
// if(cookie != null) {
// headerMap.put("Cookie", cookie);
// }
// return headerMap;
// }
/**
 * Builds the fixed set of request headers whose Host is d.weibo.com.
 *
 * @param cookie value for the Cookie header; the header is left out when {@code null}
 * @return a new mutable map from header name to header value
 */
public static Map<String, String> getweiboHeaderMap(String cookie) {
    // Name/value pairs, kept in the order they are inserted into the map.
    final String[][] fixedHeaders = {
        {"User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"},
        {"Accept", "*/*"},
        {"Accept-Language", "zh-CN,zh;q=0.9"},
        {"Connection", "keep-alive"},
        {"Content-Type", "application/x-www-form-urlencoded"},
        {"Host", "d.weibo.com"},
    };

    Map<String, String> headers = new HashMap<String, String>();
    for (String[] pair : fixedHeaders) {
        headers.put(pair[0], pair[1]);
    }
    if (cookie == null) {
        return headers;
    }
    headers.put("Cookie", cookie);
    return headers;
}
public
static
Map
<
String
,
String
>
get
weibo
HeaderMap
(
String
cookie
)
{
public
static
Map
<
String
,
String
>
get
TxNewspage1
HeaderMap
(
String
cookie
)
{
Map
<
String
,
String
>
headerMap
=
new
HashMap
<
String
,
String
>();
headerMap
.
put
(
"User-Agent"
,
"
Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36
"
);
"
QQNews/5.5.60 (iPhone; iOS 11.2.1; Scale/2.00)
"
);
headerMap
.
put
(
"Accept"
,
"*/*"
);
headerMap
.
put
(
"Accept-Language"
,
"zh-
CN,zh;q=0.9
"
);
headerMap
.
put
(
"Accept-Language"
,
"zh-
Hans-CN;q=1
"
);
headerMap
.
put
(
"Connection"
,
"keep-alive"
);
headerMap
.
put
(
"Content-Type"
,
"application/x-www-form-urlencoded"
);
headerMap
.
put
(
"Host"
,
"d.weibo.com"
);
headerMap
.
put
(
"devid"
,
"6d33f35f-880d-42a6-a23f-881bec6960ec"
);
headerMap
.
put
(
"Host"
,
"r.inews.qq.com"
);
headerMap
.
put
(
"Referer"
,
"http://inews.qq.com/inews/iphone/"
);
headerMap
.
put
(
"idft"
,
"60EE914A-6E8E-41FA-BC69-C44D47DDC4A0"
);
headerMap
.
put
(
"qn-sig"
,
"7697A692D78C878B70DD2CFE90610113"
);
headerMap
.
put
(
"idfa"
,
"FE659B7E-5104-44C2-8A31-F88DEE7A2747"
);
headerMap
.
put
(
"appver"
,
"11.2.1_qqnews_5.5.60"
);
headerMap
.
put
(
"deviceToken"
,
"<6428c8bd 9d302f00 cf30071d e72da40e 79d4f96b 58838dec f8bdbdae bcaa89a6>"
);
headerMap
.
put
(
"qn-rid"
,
"206cdadb83e8"
);
if
(
cookie
!=
null
)
{
headerMap
.
put
(
"Cookie"
,
cookie
);
}
return
headerMap
;
}
/**
 * Builds the request headers for the r.inews.qq.com host using the QQNews iPhone
 * User-Agent, including a fixed "Content-Length" of "95".
 *
 * NOTE(review): the Content-Length value is a constant although request bodies
 * presumably vary in size - confirm how the HTTP helper treats this header.
 *
 * @param cookie value for the Cookie header; the header is left out when {@code null}
 * @return a new mutable map from header name to header value
 */
public static Map<String, String> getTxNewspage2HeaderMap(String cookie) {
    // Name/value pairs, kept in the order they are inserted into the map.
    final String[][] fixedHeaders = {
        {"User-Agent", "QQNews/5.5.60 (iPhone; iOS 11.2.1; Scale/2.00)"},
        {"Accept", "*/*"},
        {"Accept-Language", "zh-Hans-CN;q=1"},
        {"Connection", "keep-alive"},
        {"Content-Type", "application/x-www-form-urlencoded"},
        {"devid", "6d33f35f-880d-42a6-a23f-881bec6960ec"},
        {"Host", "r.inews.qq.com"},
        {"Referer", "http://inews.qq.com/inews/iphone/"},
        {"idft", "60EE914A-6E8E-41FA-BC69-C44D47DDC4A0"},
        {"qn-sig", "D4BE31B3F37B6F2670094D2C6EF825B7"},
        {"idfa", "FE659B7E-5104-44C2-8A31-F88DEE7A2747"},
        {"appver", "11.2.1_qqnews_5.5.60"},
        {"deviceToken", "<6428c8bd 9d302f00 cf30071d e72da40e 79d4f96b 58838dec f8bdbdae bcaa89a6>"},
        {"qn-rid", "206ce00acce9"},
        {"Content-Length", "95"},
    };

    Map<String, String> headers = new HashMap<String, String>();
    for (String[] pair : fixedHeaders) {
        headers.put(pair[0], pair[1]);
    }
    if (cookie == null) {
        return headers;
    }
    headers.put("Cookie", cookie);
    return headers;
}
/**
 * Builds the form parameters for the first search request: a fixed
 * "c" = "searchEnterPage" entry plus the search word under "query".
 *
 * @param word search keyword stored under the "query" key (stored as given, may be {@code null})
 * @return a new mutable parameter map with exactly two entries
 */
public static Map<String, Object> getTxNewspage1ParamMap(String word) {
    Map<String, Object> searchParams = new HashMap<String, Object>();
    searchParams.put("c", "searchEnterPage");
    searchParams.put("query", word);
    return searchParams;
}
/**
 * Builds the form parameters for a follow-up ("more results") search request.
 *
 * Besides the keyword and page number, the map carries fixed values:
 * "id" = "", "count" = 20, "timeline" = 0, "type" = 0 and "secId" = 2.
 * A "queryid" parameter existed as commented-out code and is not sent.
 *
 * @param word search keyword stored under "query"
 * @param page page number stored under "page"
 * @return a new mutable parameter map with seven entries
 */
public static Map<String, Object> getTxNewspagemoreParamMap(String word, int page) {
    Map<String, Object> moreParams = new HashMap<String, Object>();
    moreParams.put("id", "");
    moreParams.put("count", 20);
    moreParams.put("query", word);
    moreParams.put("timeline", 0);
    moreParams.put("type", 0);
    moreParams.put("secId", 2);
    moreParams.put("page", page);
    return moreParams;
}
/**
 * Builds the request headers for the r.inews.qq.com host using the QQNews iPhone
 * User-Agent, including "store" = "1" and a fixed "Content-Length" of "83".
 *
 * NOTE(review): the Content-Length value is a constant although request bodies
 * presumably vary in size - confirm how the HTTP helper treats this header.
 *
 * @param cookie value for the Cookie header; the header is left out when {@code null}
 * @return a new mutable map from header name to header value
 */
public static Map<String, String> getTXNewsAccountHeaderMap(String cookie) {
    // Name/value pairs, kept in the order they are inserted into the map.
    final String[][] fixedHeaders = {
        {"User-Agent", "QQNews/5.5.60 (iPhone; iOS 11.2.1; Scale/2.00)"},
        {"Accept", "*/*"},
        {"Accept-Language", "zh-Hans-CN;q=1"},
        {"Connection", "keep-alive"},
        {"Content-Type", "application/x-www-form-urlencoded"},
        {"devid", "6d33f35f-880d-42a6-a23f-881bec6960ec"},
        {"Host", "r.inews.qq.com"},
        {"Referer", "http://inews.qq.com/inews/iphone/"},
        {"store", "1"},
        {"idft", "60EE914A-6E8E-41FA-BC69-C44D47DDC4A0"},
        {"qn-sig", "BA8985D7C7CF361FB42F9692F8E86605"},
        {"idfa", "FE659B7E-5104-44C2-8A31-F88DEE7A2747"},
        {"appver", "11.2.1_qqnews_5.5.60"},
        {"deviceToken", "<6428c8bd 9d302f00 cf30071d e72da40e 79d4f96b 58838dec f8bdbdae bcaa89a6>"},
        {"qn-rid", "2073271e7f49"},
        {"Content-Length", "83"},
    };

    Map<String, String> headers = new HashMap<String, String>();
    for (String[] pair : fixedHeaders) {
        headers.put(pair[0], pair[1]);
    }
    if (cookie == null) {
        return headers;
    }
    headers.put("Cookie", cookie);
    return headers;
}
/**
 * Builds the form parameters for an account page request.
 *
 * The map holds the given value under "child" plus three fixed entries:
 * a constant "uid", an empty "media_openid" and "commentBucketId" = 0.
 *
 * @param child value stored under the "child" key
 * @return a new mutable parameter map with four entries
 */
public static Map<String, Object> getTxNewsAccountpageParamMap(String child) {
    Map<String, Object> accountParams = new HashMap<String, Object>();
    accountParams.put("child", child);
    accountParams.put("uid", "8506EAF5-3678-4D3E-A9D6-E2A8DCF14F41");
    accountParams.put("media_openid", "");
    accountParams.put("commentBucketId", 0);
    return accountParams;
}
/**
 * Builds the request headers whose Host is news.baidu.com, using a desktop
 * Chrome User-Agent and a JSON-preferring Accept header.
 *
 * @param cookie value for the Cookie header; the header is left out when {@code null}
 * @return a new mutable map from header name to header value
 */
public static Map<String, String> getBaijiaAccount2HeaderMap(String cookie) {
    // Name/value pairs, kept in the order they are inserted into the map.
    final String[][] fixedHeaders = {
        {"User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"},
        {"Accept", "application/json, text/javascript, */*; q=0.01"},
        {"Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8"},
        {"Connection", "keep-alive"},
        {"Host", "news.baidu.com"},
    };

    Map<String, String> headers = new HashMap<String, String>();
    for (String[] pair : fixedHeaders) {
        headers.put(pair[0], pair[1]);
    }
    if (cookie == null) {
        return headers;
    }
    headers.put("Cookie", cookie);
    return headers;
}
public
static
void
main
(
String
[]
args
)
{
String
url
=
"https://d.weibo.com/1087030002_2975_1003_0?pids=Pl_Core_F4RightUserList__4&page=2&ajaxpagelet=1&__ref=/1087030002_2975_1003_0&_t=FM_151825274677918"
;
String
cookie
=
"SINAGLOBAL=7701198867685.262.1517207017616; _s_tentry=login.sina.com.cn; Apache=6842405326379.926.1517796423994; ULV=1517796424127:3:1:3:6842405326379.926.1517796423994:1517209523882; ULOGIN_IMG=15177972786361; UOR=,,login.sina.com.cn; YF-Page-G0=23b9d9eac864b0d725a27007679967df; SCF=Ag8PQSV7wMV9Lc8UOZupWW2l6wfI5N2imvtjcwFE3ovIEsRCuG5QaKQhPx4ByaNkpC5LpYocPBPnOJT2NSZMkiU.; SUHB=0C1CJFGk8jNm31; SUB=_2AkMtIj0odcPxrABWn_0WzGPhbYhH-jye91TeAn7uJhMyAxgv7lMFqSVutBF-XFWUFIfrHOaUSPWy_1IBv_YbyS5_; SUBP=0033WrSXqPxfM72wWs9jqgMF55529P9D9WWr5b4iYaaqYk4kfrcubkrT5JpVF02ReoMpSo.XeK.f; login_sid_t=10c8fe00b1833b7414093404448d2330; cross_origin_proto=SSL"
;
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getweiboHeaderMap
(
null
);
String
url
=
"https://news.baidu.com/sn/api/homesubcribe?forum_id=b_1560023960896882&page=1"
;
String
cookie
=
"BAIDUID=4DB3FA13736131DBC2094C010E6EBCB0:FG=1; BIDUPSID=250CCE0442BEBCB3568D8EC515953434; PSTM=1522304033; BDUSS=zJEdDI0WFBCUE05M3BVTlhSbnozYkpUflZveW9aaGZ3ODBVTC1WRzVwaUxkZlphQVFBQUFBJCQAAAAAAAAAAAEAAADTCNY9Y3k5MDkyMDk5NTEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIvozlqL6M5ac; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BD_CK_SAM=1; BDSVRTM=98; BDSFRCVID=9g8sJeC62rdtQM7AdMI6hrB7leHy_qbTH6aoIgcaD_KjQB22bioFEG0PDU8g0KubMyQBogKKKgOTHIjP; H_BDCLCKID_SF=tJPOoD-bJI83fP36qRj8hPCsqxby26nQB2ceaJ5nJDoAoqOVWR5N-T-_-f7H3jbQ5RRb3CnvQpP-HJ7TyfCWM5_PhMbhhUcHKaufKl0MLpbYbb0xynoD-lFzLfnMBMni52OnapT_LIFaMII6D5DaejPShMr2aK6KaI58LRu8Kb7VbIOgDbbkbfJBD4QqhR5na26b3R3v2PoIMnRvhbQDD4t7yajK2-bmaN6A3lQ8aI3oD45HDTopQT8rKqAOK5OibCrpaC_Eab3vOpvTXpO1ytIreGLjt5LHJnFOVbD8bRrEDnukhtu_-P4DePjK-nJZ5m7mXp0b04TPjljgqj7jKU_mBpJbW60qXKb7BPF5BDOkbC86D6K5jjjM-f8X-PcKaD70LPI8Kb7VbprDXbbkbfJBDxc4-U_jB26b3tbe2PoIMnRNjl5tQU47yajK2-tfK64qXl5CyPOJftjT3-opQT8rQb_OK5Oib4jZ-fo9ab3vOpvTXpO1ytIreGKJtTF8fnuOV-35b5rtHJrwMtJo5DCHbq8sq4-O-2Q-5KL--JbMVqC6LtOYyjKJK4Kf2PQ7MGOD3fbdJJjoOJ3n-fOryPIuLGKH5tcy3eTxoUJgQCnJhhvG-xcB0fDebPRiB-b9QgbABftLK-oj-DLmD60h3e; PSINO=5; locale=zh; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; FP_UID=f9e064a71741aa2e821e58ca2b30c3da; H_PS_PSSID=1433_21104_20882_20927; userId=1524191310247; Hm_lvt_348091a80fe10e213d94a7de762bbd44=1524191312; Hm_lpvt_348091a80fe10e213d94a7de762bbd44=1524191395"
;
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getBaijiaAccount2HeaderMap
(
null
);
// Map<String,Object> paramMap = HeadGet.getTxNewsAccountpageParamMap("1979");
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
headerMap
);
System
.
out
.
println
(
result
);
//
System.out.println(result);
System
.
out
.
println
(
result
.
length
());
}
...
...
src/main/java/com/zhiwei/parse/Baijia.java
View file @
89439323
package
com
.
zhiwei
.
parse
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.Map
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.parse.analysis.BaijiaAccountAnalysis
;
import
com.zhiwei.zhiweiTools.timeParse.TimeParse
;
import
com.zhiwei.zhiweiTools.tools.ZhiWeiTools
;
public
class
Baijia
{
...
...
@@ -19,6 +23,45 @@ public class Baijia {
/**
 * Collects the historical articles of a Baijiahao account by paging through
 * the news.baidu.com "homesubcribe" API until the API reports no more pages,
 * an article older than {@code startTime} is reached, or a response carries
 * no "data.news" array.
 *
 * @param app_id    forum id appended to the API URL (original note: the trailing
 *                  part obtained from a Baidu News share link - TODO confirm)
 * @param startTime lower time bound, compared lexicographically against each
 *                  article's "time" value; {@code null} disables the bound
 * @return the collected article maps, newest pages first; never {@code null}
 */
public static List<Map<String, Object>> getBaijiaAccount2Data(String app_id, String startTime) {
    List<Map<String, Object>> dataList = new ArrayList<Map<String, Object>>();
    Map<String, String> headerMap = HeadGet.getBaijiaAccount2HeaderMap(null);
    String baseUrl = "https://news.baidu.com/sn/api/homesubcribe?forum_id=" + app_id;
    String url = baseUrl;
    // Same condition the original used to decide whether per-article times are logged.
    boolean logTimes = startTime != null && startTime.length() > 5;

    while (true) {
        String result = HttpClient.executeHttpRequestGet(url, headerMap);
        JSONObject json = JSONObject.parseObject(result);
        JSONObject payload = json == null ? null : json.getJSONObject("data");
        JSONArray news = payload == null ? null : payload.getJSONArray("news");
        if (news == null) {
            // Previously this ended in a NullPointerException; stop with what we have.
            logger.error("Baijia response has no data.news array, url={}", url);
            break;
        }

        boolean reachedStartTime = false;
        for (int i = 0; i < news.size(); i++) {
            Map<String, Object> m = baijiaAccountAnalysis.getBaijiaAccount2Data(news.getJSONObject(i));
            String time = (String) m.get("time");
            // Guard both operands: a null startTime or a null time used to throw here.
            if (startTime != null && time != null && startTime.compareTo(time) > 0) {
                reachedStartTime = true;
                break;
            }
            dataList.add(m);
            if (logTimes) {
                logger.info("采集到的时间为:{}", time);
            }
        }
        logger.info("采集到的数据总量:{}", dataList.size());

        // Leave immediately once the cut-off is reached instead of sleeping first.
        if (reachedStartTime || !payload.getBooleanValue("hasMore")) {
            break;
        }
        url = baseUrl + "&page=" + (payload.getIntValue("page") + 1);
        ZhiWeiTools.sleep(2000);
    }
    return dataList;
}
/**
*
* @Description 百家号历史文章采集
* @param app_id
* @param startTime
* @return
...
...
@@ -37,12 +80,12 @@ public class Baijia {
break
;
}
i
+=
20
;
ZhiWeiTools
.
sleep
(
5
000
);
ZhiWeiTools
.
sleep
(
4
000
);
dataList
.
addAll
(
list
);
logger
.
info
(
url
+
i
+
"=============="
+
dataList
.
size
());
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
ZhiWeiTools
.
sleep
(
4
000
);
ZhiWeiTools
.
sleep
(
3
000
);
logger
.
error
(
"此页解析出错"
,
e
.
getMessage
());
continue
;
}
...
...
src/main/java/com/zhiwei/parse/Fenghuang.java
View file @
89439323
...
...
@@ -109,31 +109,31 @@ public class Fenghuang {
int
i
=
1
;
try
{
while
(
true
)
{
String
url
=
"http://search.ifeng.com/sofeng/search.action?q="
+
URLEncoder
.
encode
(
word
,
"UTF-8"
)+
"&c=1&p="
;
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getFenghuangWordHeaderMap
(
null
);
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
+
i
,
headerMap
);
List
<
Map
<
String
,
Object
>>
lists
=
fenghuangByWordAnalysis
.
getFenghuangByWord
(
result
);
if
(
lists
==
null
||
lists
.
size
()
<
1
)
{
break
;
}
if
(
lists
!=
null
&&
lists
.
size
()
>
0
)
{
dataList
.
addAll
(
lists
);
}
System
.
out
.
println
(
word
+
"===================以获取的数据==:"
+
dataList
.
size
());
i
++;
if
(
i
==
76
)
{
break
;
try
{
String
url
=
"http://search.ifeng.com/sofeng/search.action?q="
+
URLEncoder
.
encode
(
word
,
"UTF-8"
)+
"&c=1&p="
;
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getFenghuangWordHeaderMap
(
null
);
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
+
i
,
headerMap
);
List
<
Map
<
String
,
Object
>>
lists
=
fenghuangByWordAnalysis
.
getFenghuangByWord
(
result
);
if
(
lists
==
null
||
lists
.
size
()
<
1
)
{
break
;
}
if
(
lists
!=
null
&&
lists
.
size
()
>
0
)
{
dataList
.
addAll
(
lists
);
}
System
.
out
.
println
(
word
+
"===================以获取的数据==:"
+
dataList
.
size
());
i
++;
if
(
i
==
76
)
{
break
;
}
ZhiWeiTools
.
sleep
(
4000
);
}
catch
(
Exception
e
)
{
continue
;
}
ZhiWeiTools
.
sleep
(
4000
);
}
return
dataList
;
}
catch
(
UnsupportedEncodingException
e
)
{
logger
.
error
(
"依据关键词获取凤凰文章出错"
,
e
.
getMessage
());
e
.
printStackTrace
();
return
dataList
;
}
catch
(
Exception
e
)
{
logger
.
error
(
"依据关键词获取凤凰文章出错"
,
e
.
getMessage
());
e
.
printStackTrace
();
logger
.
error
(
"链接获取凤凰信息出错"
,
e
.
getMessage
());
return
dataList
;
}
}
...
...
src/main/java/com/zhiwei/parse/QQKB.java
View file @
89439323
...
...
@@ -59,12 +59,12 @@ public class QQKB {
paramMap
=
HeadGet
.
getQQAccountOtherParamMap
(
ids
);
result
=
HttpClient
.
executeHttpRequestPost
(
url
,
headerMap
,
paramMap
);
List
<
Map
<
String
,
Object
>>
list
=
qqAccountAnalysis
.
analysisQQAccountData
(
result
);
ids
=
""
;
i
=
0
;
if
(
list
!=
null
)
{
dataList
.
addAll
(
list
);
break
;
}
ids
=
""
;
i
=
0
;
}
}
catch
(
Exception
e
)
{
ids
=
""
;
...
...
src/main/java/com/zhiwei/parse/TXNews.java
0 → 100644
View file @
89439323
package
com
.
zhiwei
.
parse
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.parse.analysis.TXNewsByWordAnalysis
;
import
com.zhiwei.zhiweiTools.tools.ZhiWeiTools
;
/**
 * Keyword search crawler for the r.inews.qq.com search endpoints.
 *
 * The first page is fetched from the "search" endpoint, subsequent pages from
 * "searchMore". Paging continues while {@link #hasMore} is true; that flag is
 * written by {@link TXNewsByWordAnalysis#getData(String)} on every parsed response.
 */
public class TXNews {
    private static final Logger logger = LoggerFactory.getLogger(TXNews.class);
    private static final TXNewsByWordAnalysis txNewsByWordAnalysis = new TXNewsByWordAnalysis();

    private static final String SEARCH_URL =
            "http://r.inews.qq.com/search?appver=11.2.1_qqnews_5.5.60&devid=6D33F35F-880D-42A6-A23F-881BEC6960EC";
    private static final String SEARCH_MORE_URL =
            "http://r.inews.qq.com/searchMore?appver=11.2.1_qqnews_5.5.60&devid=6D33F35F-880D-42A6-A23F-881BEC6960EC";

    /** Give up paging after this many failed page requests in a row. */
    private static final int MAX_CONSECUTIVE_FAILURES = 3;

    /**
     * Whether the last parsed response announced further pages. Shared mutable
     * state: it is updated by the response parser, so concurrent searches would
     * interfere with each other.
     */
    public static boolean hasMore = true;

    /**
     * Searches for {@code word} and returns every result item from all pages.
     *
     * @param word search keyword
     * @return the parsed result items of all pages that could be fetched
     */
    public static List<Map<String, Object>> getData(String word) {
        List<Map<String, Object>> dataList = new ArrayList<Map<String, Object>>();

        Map<String, String> headerMap = HeadGet.getTxNewspage1HeaderMap(null);
        Map<String, Object> paramMap = HeadGet.getTxNewspage1ParamMap(word);
        String result = HttpClient.executeHttpRequestPost(SEARCH_URL, headerMap, paramMap);
        dataList.addAll(txNewsByWordAnalysis.getData(result));

        int page = 2;
        int consecutiveFailures = 0;
        Map<String, String> header2Map = HeadGet.getTxNewspage2HeaderMap(null);
        while (hasMore) {
            try {
                ZhiWeiTools.sleep(5000);
                Map<String, Object> param2Map = HeadGet.getTxNewspagemoreParamMap(word, page);
                String result2 = HttpClient.executeHttpRequestPost(SEARCH_MORE_URL, header2Map, param2Map);
                page++;
                dataList.addAll(txNewsByWordAnalysis.getData(result2));
                logger.info("采集到数据======={}", dataList.size());
                consecutiveFailures = 0;
            } catch (Exception e) {
                // The counter must grow on every failure; previously it was only
                // incremented inside the branch that breaks, so it never left 0
                // and a persistent error looped forever.
                consecutiveFailures++;
                logger.error("TXNews page request failed ({} consecutive failures)", consecutiveFailures, e);
                if (consecutiveFailures >= MAX_CONSECUTIVE_FAILURES) {
                    break;
                }
            }
        }
        return dataList;
    }
}
src/main/java/com/zhiwei/parse/analysis/BaijiaAccountAnalysis.java
View file @
89439323
...
...
@@ -14,11 +14,29 @@ import com.alibaba.fastjson.JSONArray;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.zhiweiTools.timeParse.TimeParse
;
import
com.zhiwei.zhiweiTools.tools.ZhiWeiTools
;
public
class
BaijiaAccountAnalysis
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
BaijiaAccountAnalysis
.
class
);
/**
 * Converts one article JSON object into a flat map with the keys
 * "title", "url", "source" (taken from "site"), "time" (from "pulltime",
 * reformatted as "yyyy-MM-dd HH:mm:ss") and "content".
 *
 * "content" is the concatenation of the "data" of every element of the
 * "content" array whose "type" is "text", with anything matching
 * {@code <.*?>} removed. A missing array yields an empty content string.
 *
 * @param data article JSON object
 * @return a new map holding the extracted fields
 */
public Map<String, Object> getBaijiaAccount2Data(JSONObject data) {
    Map<String, Object> map = new HashMap<String, Object>();
    map.put("title", data.getString("title"));
    map.put("url", data.getString("url"));
    map.put("source", data.getString("site"));
    map.put("time", TimeParse.dateFormartString(
            TimeParse.stringFormartDate(data.getString("pulltime")), "yyyy-MM-dd HH:mm:ss"));

    StringBuilder text = new StringBuilder();
    JSONArray parts = data.getJSONArray("content");
    if (parts != null) {
        for (int i = 0; i < parts.size(); i++) {
            JSONObject part = parts.getJSONObject(i);
            // Constant-first equals: an element without "type" is skipped instead of throwing.
            if (part == null || !"text".equals(part.getString("type"))) {
                continue;
            }
            String fragment = part.getString("data");
            if (fragment != null) {
                // Skip null fragments so the literal "null" never ends up in the content.
                text.append(fragment);
            }
        }
    }
    map.put("content", text.toString().replaceAll("<.*?>", ""));
    return map;
}
/**
*
...
...
src/main/java/com/zhiwei/parse/analysis/DayuAccountAnalysis.java
View file @
89439323
...
...
@@ -52,7 +52,6 @@ public class DayuAccountAnalysis {
Map
<
String
,
Object
>
map
=
new
HashMap
<
String
,
Object
>();
try
{
String
time
=
data
.
getString
(
"published_at"
).
replace
(
"T"
,
" "
).
split
(
"\\."
)[
0
];
System
.
out
.
println
(
time
);
if
(
startTime
!=
null
&&
startTime
.
length
()
>
1
)
{
if
(
time
.
compareTo
(
startTime
)
<
0
)
{
return
null
;
...
...
src/main/java/com/zhiwei/parse/analysis/TXNewsByWordAnalysis.java
0 → 100644
View file @
89439323
package
com
.
zhiwei
.
parse
.
analysis
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.parse.TXNews
;
/**
 * Parses search responses for {@link TXNews} and records in
 * {@link TXNews#hasMore} whether the response announced further pages.
 */
public class TXNewsByWordAnalysis {
    private static final Logger logger = LoggerFactory.getLogger(TXNewsByWordAnalysis.class);

    /**
     * Extracts the news items from one search response.
     *
     * Only sections of "secList" whose "secType" is 0 are read; each entry of
     * their "newsList" becomes a map with the keys "title", "content" (from
     * "abstract"), "time", "source", "id" and "url".
     *
     * Side effect: sets {@link TXNews#hasMore} to true only when the response
     * has "hasMore" equal to 1; a missing flag or an empty response sets it to false.
     *
     * @param result raw JSON response text
     * @return the extracted items; empty when the response has no usable sections
     */
    public List<Map<String, Object>> getData(String result) {
        List<Map<String, Object>> dataList = new ArrayList<Map<String, Object>>();

        JSONObject json = JSONObject.parseObject(result);
        if (json == null) {
            // Nothing parseable came back: stop paging instead of throwing.
            TXNews.hasMore = false;
            return dataList;
        }
        // getIntValue yields 0 for a missing key, avoiding the unboxing NPE of getInteger(..) == 1.
        TXNews.hasMore = json.getIntValue("hasMore") == 1;

        JSONArray sections = json.getJSONArray("secList");
        if (sections == null) {
            return dataList;
        }
        for (int i = 0; i < sections.size(); i++) {
            JSONObject section = sections.getJSONObject(i);
            Integer secType = section == null ? null : section.getInteger("secType");
            if (secType == null || secType != 0) {
                continue;
            }
            JSONArray newsList = section.getJSONArray("newsList");
            if (newsList == null) {
                continue;
            }
            for (int j = 0; j < newsList.size(); j++) {
                JSONObject news = newsList.getJSONObject(j);
                if (news == null) {
                    continue;
                }
                try {
                    dataList.add(toRecord(news));
                } catch (Exception e) {
                    logger.error("采集出错:{}", e.getMessage());
                    logger.error("Unparseable news item: {}", news);
                }
            }
        }
        return dataList;
    }

    /** Copies the fields of one news item into a flat map. */
    private static Map<String, Object> toRecord(JSONObject news) {
        Map<String, Object> map = new HashMap<String, Object>();
        map.put("title", news.getString("title"));
        map.put("content", news.getString("abstract"));
        map.put("time", news.getString("time"));
        map.put("source", news.getString("source"));
        map.put("id", news.getString("id"));
        map.put("url", news.getString("url"));
        return map;
    }
}
src/test/java/com/zhiwei/crawler/BaijiaAccountExample.java
View file @
89439323
...
...
@@ -13,7 +13,7 @@ public class BaijiaAccountExample {
@Test
public
void
baijiaAccountTest
()
{
String
app_id
=
"1536766
731827943
"
;
String
app_id
=
"1536766
390576806
"
;
String
startTime
=
"2016-01-01 00:00:00"
;
//2017-11-30 17:48:17
List
<
Map
<
String
,
Object
>>
lists
=
Baijia
.
getBaijiaAccountData
(
app_id
,
startTime
);
...
...
@@ -26,7 +26,23 @@ public class BaijiaAccountExample {
headList
.
add
(
"source"
);
headList
.
add
(
"url"
);
headList
.
add
(
"content"
);
poi
.
exportExcel
(
"D://crawlerdata/百家号-蓝鲸TMT网.xlsx"
,
"蓝鲸TMT网"
,
headList
,
lists
);
poi
.
exportExcel
(
"D://crawlerdata/百家号-太保.xlsx"
,
"太保"
,
headList
,
lists
);
}
// @Test
/**
 * Manual driver: collects the articles of one Baijiahao account via
 * {@code Baijia.getBaijiaAccount2Data} and exports them to an Excel file.
 * Not annotated as a test (the annotation is commented out in the file).
 */
public void baijiaAccount2Test() {
    final String app_id = "b_1536766390576806";
    final String startTime = "2016-01-01 00:00:00"; // e.g. 2017-11-30 17:48:17
    List<Map<String, Object>> articles = Baijia.getBaijiaAccount2Data(app_id, startTime);

    PoiExcelUtil excel = PoiExcelUtil.getInstance();
    // Column keys, in export order.
    List<String> columns = new ArrayList<String>();
    for (String column : new String[] {"title", "time", "source", "url", "content"}) {
        columns.add(column);
    }
    excel.exportExcel("D://crawlerdata/百家号-俊世太保.xlsx", "俊世太保", columns, articles);
}
}
src/test/java/com/zhiwei/crawler/DayuAccountExample.java
View file @
89439323
package
com
.
zhiwei
.
crawler
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Dayu
;
public
class
DayuAccountExample
{
...
...
@@ -15,12 +17,32 @@ public class DayuAccountExample {
//https://api.m.sm.cn/rest?method=Subscribe.list&format=html&from=wh10331&uc_biz_str=S:custom%7CC:search%7CN:true
String
mid
=
"d7300311c1504d24a229c3da345785c6"
;
String
name
=
"大鱼海棠雨"
;
String
startTime
=
"2017-12-05 22:08:01"
;
// String mid = "d7300311c1504d24a229c3da345785c6";
// String name = "大鱼海棠雨";
String
startTime
=
"2018-03-16 00:00:00"
;
String
path
=
"D:\\crawlerdata\\自媒体\\大鱼历史文章.xlsx"
;
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
Map
<
String
,
Object
>
map
=
poi
.
importExcel
(
path
,
0
);
List
<
Map
<
String
,
Object
>>
lists
=
(
List
<
Map
<
String
,
Object
>>)
map
.
get
(
"body"
);
List
<
String
>
headList
=
new
ArrayList
<
String
>();
headList
.
add
(
"title"
);
headList
.
add
(
"time"
);
headList
.
add
(
"content"
);
headList
.
add
(
"source"
);
headList
.
add
(
"url"
);
// headList.add("content_id");
// headList.add("origin_id");
// headList.add("xss_item_id");
for
(
Map
<
String
,
Object
>
data
:
lists
)
{
String
mid
=
data
.
get
(
"mid"
)+
""
;
String
name
=
data
.
get
(
"name"
)+
""
;
if
(
mid
.
length
()
<
1
&&
name
.
length
()
<
1
)
{
continue
;
}
List
<
Map
<
String
,
Object
>>
dataList
=
Dayu
.
getDayuAccountData
(
mid
,
name
,
null
);
poi
.
exportExcel
(
path
,
name
,
headList
,
dataList
);
}
List
<
Map
<
String
,
Object
>>
dataList
=
Dayu
.
getDayuAccountData
(
mid
,
name
,
null
);
System
.
out
.
println
(
dataList
.
size
());
}
...
...
src/test/java/com/zhiwei/crawler/DayuByWordExample.java
View file @
89439323
...
...
@@ -12,7 +12,7 @@ public class DayuByWordExample {
@Test
public
void
dayuByWordTest
()
{
String
word
=
"
京东
"
;
String
word
=
"
沃尔玛
"
;
List
<
Map
<
String
,
Object
>>
dataList
=
Dayu
.
getDayuByWordData
(
word
);
...
...
src/test/java/com/zhiwei/crawler/FenghuangAccountExample.java
View file @
89439323
...
...
@@ -15,7 +15,7 @@ public class FenghuangAccountExample {
public
void
fenghuangAccountTest
()
{
//所用时间长 1s1篇文章吧
//https://api.3g.ifeng.com/client_search_subscribe?k=(凤凰号名称拿id)
String
id
=
"
276718
"
;
String
id
=
"
724
"
;
String
[]
ids
=
id
.
split
(
","
);
for
(
int
i
=
0
;
i
<
ids
.
length
;
i
++)
{
try
{
...
...
@@ -29,7 +29,7 @@ public class FenghuangAccountExample {
headList
.
add
(
"source"
);
headList
.
add
(
"url"
);
headList
.
add
(
"id"
);
poi
.
exportExcel
(
"D://crawlerdata/凤凰-
另眼看世界
.xlsx"
,
ids
[
i
],
headList
,
dataList
);
poi
.
exportExcel
(
"D://crawlerdata/凤凰-
电商报
.xlsx"
,
ids
[
i
],
headList
,
dataList
);
}
catch
(
Exception
e
)
{
continue
;
}
...
...
src/test/java/com/zhiwei/crawler/QQAccountExample.java
View file @
89439323
...
...
@@ -16,14 +16,15 @@ public class QQAccountExample {
public
void
qqAccountTest
()
{
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
Map
<
String
,
Object
>
dataMap
=
poi
.
importExcel
(
"D://crawlerdata/天天快报历史文章采集.xlsx"
,
0
);
Map
<
String
,
Object
>
dataMap
=
poi
.
importExcel
(
"D://crawlerdata/
/自媒体/
天天快报历史文章采集.xlsx"
,
0
);
List
<
Map
<
String
,
Object
>>
dataList
=
(
List
<
Map
<
String
,
Object
>>)
dataMap
.
get
(
"body"
);
String
cookie
=
"phone_id=;%20phone_token=;%20luin=o0497332654;%20lskey=000300006218e8444698ebbad28deed8243ef28e87f247b133bf5d19f160bce91d4e3e06ec2003cd4f92aff0;%20uin=o0497332654;%20skey=MQL8ScQBxu;%20sigA2=2EBEFFCC5C22FD27B32E6F21C76CA4A2AD8BF5D626B629A38C50923810CCD19C574CD093CFAD29C845084C8CD3E04B57DD24A69E4418E060C899696A728FB3B8B14C577F1FF93340;%20openid=oijc7uLZNVp85r-MpFBYxwxzdEkg;%20appid=wxe90c9765ad00e2cd;%20access_token=5_IWbsKfygpJ0lnbUnnFnfwmSQ3EqlyzseC9-qGEFY7Tkr0Ypk5vsnSwOaMC-IGxsBeY2K7knHrYstj_5dZpisJd5nihvLNQvCdsFhFwZQcT8;%20refresh_token=5_IWbsKfygpJ0lnbUnnFnfwgDT4pA9HEbY-wuqnqIHWf9AzdmIueZFuzHYfnZPuSNEc0ZjDuHXrtSrRBMMD-7kgj06iF0NdAOi-KRj6-mrmlA;%20unionid=onCs1uNNpjMXeYIHAhacGypamEBk;%20logintype=0"
;
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
for
(
Map
<
String
,
Object
>
map
:
dataList
)
{
String
child
=
map
.
get
(
"帐号链接"
)+
""
;
System
.
out
.
println
(
child
.
split
(
"chlid="
)[
1
]);
List
<
Map
<
String
,
Object
>>
lists
=
QQKB
.
getQQAccountData
(
child
.
split
(
"chlid="
)[
1
],
cookie
);
// System.out.println(child.split("chlid=")[1]);
System
.
out
.
println
((
String
)
map
.
get
(
"child"
));
List
<
Map
<
String
,
Object
>>
lists
=
QQKB
.
getQQAccountData
((
String
)
map
.
get
(
"child"
),
cookie
);
if
(
lists
!=
null
)
{
for
(
Map
<
String
,
Object
>
map1
:
lists
)
{
map1
.
put
(
"name"
,
map
.
get
(
"呢称"
));
...
...
@@ -43,7 +44,7 @@ public class QQAccountExample {
headList
.
add
(
"content"
);
headList
.
add
(
"url"
);
headList
.
add
(
"commentid"
);
poi
.
exportExcel
(
"D://crawlerdata/
天天快报采集
.xlsx"
,
"asd"
,
headList
,
bodyList
);
poi
.
exportExcel
(
"D://crawlerdata/
/自媒体/天天快报采集-科技编年史
.xlsx"
,
"asd"
,
headList
,
bodyList
);
}
...
...
src/test/java/com/zhiwei/crawler/SouhuAccountExample.java
View file @
89439323
...
...
@@ -16,7 +16,7 @@ public class SouhuAccountExample {
@Test
public
void
souhuAccountTest
()
{
List
<
Map
<
String
,
Object
>>
lists
=
Souhu
.
getSouHuAccountData
(
"
cHBhZzUyMTNjZjAzZTczYUBzb2h1LmNvbQ=="
,
"2017
-01-01 00:00:00"
,
false
);
List
<
Map
<
String
,
Object
>>
lists
=
Souhu
.
getSouHuAccountData
(
"
MjI5MzAyOTMyMEBzaW5hLnNvaHUuY29t"
,
"2016
-01-01 00:00:00"
,
false
);
System
.
out
.
println
(
lists
.
size
());
List
<
String
>
headList
=
new
ArrayList
<
String
>();
headList
.
add
(
"title"
);
...
...
@@ -28,7 +28,7 @@ public class SouhuAccountExample {
headList
.
add
(
"newsid"
);
headList
.
add
(
"newsPv"
);
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
poi
.
exportExcel
(
"D:\\crawlerdata\\搜狐号历史文章-
蓝媒汇.xlsx"
,
"蓝媒汇
"
,
headList
,
lists
);
poi
.
exportExcel
(
"D:\\crawlerdata\\搜狐号历史文章-
太保乱谈.xlsx"
,
"太保乱谈
"
,
headList
,
lists
);
}
}
src/test/java/com/zhiwei/crawler/TXNewsByWordExample.java
0 → 100644
View file @
89439323
package
com
.
zhiwei
.
crawler
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.TXNews
;
/**
 * Manual driver: runs a keyword search through {@code TXNews.getData} and
 * exports the collected items to an Excel file.
 */
public class TXNewsByWordExample {

    /**
     * Searches for a fixed keyword and writes the results to a fixed path.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        final String word = "唐嫣";
        List<Map<String, Object>> items = TXNews.getData(word);

        PoiExcelUtil excel = PoiExcelUtil.getInstance();
        // Column keys, in export order.
        List<String> columns = new ArrayList<String>();
        for (String column : new String[] {"title", "time", "content", "url", "id", "source"}) {
            columns.add(column);
        }
        excel.exportExcel("D://crawlerdata/腾讯新闻-唐嫣.xlsx", "腾讯新闻数据", columns, items);
    }
}
src/test/java/com/zhiwei/crawler/WangyiCommentExample.java
View file @
89439323
...
...
@@ -14,12 +14,20 @@ public class WangyiCommentExample {
//若出错 可能数据有重复 以id为准
@Test
public
void
wangyiCommentTest
()
{
String
url
=
"http://news.163.com/18/0210/09/DA9B8PVJ000189FH.html"
;
String
id
=
url
.
split
(
"/"
)[
6
].
split
(
".ht"
)[
0
];
List
<
Map
<
String
,
Object
>>
lists
=
Wangyi
.
getWangyiCommentData
(
id
);
System
.
out
.
println
(
lists
.
size
());
List
<
String
>
urlList
=
new
ArrayList
<
String
>();
urlList
.
add
(
"https://c.m.163.com/news/a/DCQ42REV05118O92.html?spss=newsapp"
);
urlList
.
add
(
"https://c.m.163.com/news/a/DCPLJ5GB05198R91.html?spss=newsapp"
);
urlList
.
add
(
"https://c.m.163.com/news/a/DCRNI7020511CPVM.html?spss=newsapp"
);
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<>();
for
(
String
url
:
urlList
)
{
String
id
=
url
.
split
(
"a/"
)[
1
].
split
(
".ht"
)[
0
];
List
<
Map
<
String
,
Object
>>
lists
=
Wangyi
.
getWangyiCommentData
(
id
);
System
.
out
.
println
(
lists
.
size
());
if
(
lists
!=
null
)
{
bodyList
.
addAll
(
lists
);
}
}
List
<
String
>
headList
=
new
ArrayList
<
String
>();
headList
.
add
(
"content"
);
headList
.
add
(
"id"
);
...
...
@@ -29,7 +37,7 @@ public class WangyiCommentExample {
headList
.
add
(
"unlike"
);
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
poi
.
exportExcel
(
"D:\\crawlerdata\\网易评论采集
测试.xlsx"
,
"asd"
,
headList
,
lists
);
poi
.
exportExcel
(
"D:\\crawlerdata\\网易评论采集
-3.xlsx"
,
"asd"
,
headList
,
bodyList
);
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment