Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
A
articlenewscrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
chenweiyang
articlenewscrawler
Commits
132e6350
Commit
132e6350
authored
Feb 11, 2018
by
yangchen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
天天快报历史文章采集修正
parent
e5ce0110
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
65 additions
and
14 deletions
+65
-14
src/main/java/com/zhiwei/httpclient/HeadGet.java
+22
-4
src/main/java/com/zhiwei/parse/QQKB.java
+20
-5
src/test/java/com/zhiwei/crawler/QQAccountExample.java
+23
-5
No files found.
src/main/java/com/zhiwei/httpclient/HeadGet.java
View file @
132e6350
package
com
.
zhiwei
.
httpclient
;
package
com
.
zhiwei
.
httpclient
;
import
java.io.IOException
;
import
java.io.IOException
;
import
java.util.Date
;
import
java.util.HashMap
;
import
java.util.HashMap
;
import
java.util.Map
;
import
java.util.Map
;
...
@@ -652,11 +653,28 @@ public class HeadGet {
...
@@ -652,11 +653,28 @@ public class HeadGet {
return
headerMap
;
return
headerMap
;
}
}
public
static
Map
<
String
,
String
>
getweiboHeaderMap
(
String
cookie
)
{
Map
<
String
,
String
>
headerMap
=
new
HashMap
<
String
,
String
>();
headerMap
.
put
(
"User-Agent"
,
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"
);
headerMap
.
put
(
"Accept"
,
"*/*"
);
headerMap
.
put
(
"Accept-Language"
,
"zh-CN,zh;q=0.9"
);
headerMap
.
put
(
"Connection"
,
"keep-alive"
);
headerMap
.
put
(
"Content-Type"
,
"application/x-www-form-urlencoded"
);
headerMap
.
put
(
"Host"
,
"d.weibo.com"
);
if
(
cookie
!=
null
)
{
headerMap
.
put
(
"Cookie"
,
cookie
);
}
return
headerMap
;
}
public
static
void
main
(
String
[]
args
)
{
public
static
void
main
(
String
[]
args
)
{
String
url
=
"https://
view.inews.qq.com/a/NEW2018021000440002
"
;
String
url
=
"https://
d.weibo.com/1087030002_2975_1003_0?pids=Pl_Core_F4RightUserList__4&page=2&ajaxpagelet=1&__ref=/1087030002_2975_1003_0&_t=FM_151825274677918
"
;
String
cookie
=
"
mstuid=1518141097798_2540; Hm_lvt_71558e7b4aa822e282e758f8dc0b88b0=1518141098; lastsource=so.bbs.xiaomi.cn; mstz=||795199218.38||http%3A%2F%2Fso.bbs.xiaomi.cn%2F%3Fq%3D%25e5%25b0%258f%25e7%25b1%25b3%2520%25e7%2594%25b5%25e9%25a5%25ad%25e7%2585%25b2%2520%25e5%25bc%2580%25e8%25a3%2582%7Cp%3D1%7Cfid%3D0%7Ctime%3D31536000%7Corder%3D1|http%3A%2F%2Fso.bbs.xiaomi.cn%2F%3Fq%3D%25e5%25b0%258f%25e7%25b1%25b3%2520%25e7%2594%25b5%25e9%25a5%25ad%25e7%2585%25b2%2520%25e5%25bc%2580%25e8%25a3%2582%7Cp%3D1%7Cfid%3D0%7Ctime%3D63072000%7Corder%3D1; xm_vistor=1518141097798_2540_1518141097798-1518142530797; msttime=http%3A%2F%2Fso.bbs.xiaomi.cn%2F%3Fq%3D%25E5%25B0%258F%25E7%25B1%25B3%2520%25E7%2594%25B5%25E9%25A5%25AD%25E7%2585%25B2%2520%25E5%25BC%2580%25E8%25A3%2582%26p%3D1%26fid%3D0%26time%3D63072000%26order%3D1; msttime1=http%3A%2F%2Fso.bbs.xiaomi.cn%2F%3Fq%3D%25E5%25B0%258F%25E7%25B1%25B3%2520%25E7%2594%25B5%25E9%25A5%25AD%25E7%2585%25B2%2520%25E5%25BC%2580%25E8%25A3%2582%26p%3D1%26fid%3D0%26time%3D63072000%26order%3D1; Hm_lpvt_71558e7b4aa822e282e758f8dc0b88b0=1518142531
"
;
String
cookie
=
"
SINAGLOBAL=7701198867685.262.1517207017616; _s_tentry=login.sina.com.cn; Apache=6842405326379.926.1517796423994; ULV=1517796424127:3:1:3:6842405326379.926.1517796423994:1517209523882; ULOGIN_IMG=15177972786361; UOR=,,login.sina.com.cn; YF-Page-G0=23b9d9eac864b0d725a27007679967df; SCF=Ag8PQSV7wMV9Lc8UOZupWW2l6wfI5N2imvtjcwFE3ovIEsRCuG5QaKQhPx4ByaNkpC5LpYocPBPnOJT2NSZMkiU.; SUHB=0C1CJFGk8jNm31; SUB=_2AkMtIj0odcPxrABWn_0WzGPhbYhH-jye91TeAn7uJhMyAxgv7lMFqSVutBF-XFWUFIfrHOaUSPWy_1IBv_YbyS5_; SUBP=0033WrSXqPxfM72wWs9jqgMF55529P9D9WWr5b4iYaaqYk4kfrcubkrT5JpVF02ReoMpSo.XeK.f; login_sid_t=10c8fe00b1833b7414093404448d2330; cross_origin_proto=SSL
"
;
Map
<
String
,
String
>
headerMap
=
HeadGet
.
get
QQkuaiComment
HeaderMap
(
null
);
Map
<
String
,
String
>
headerMap
=
HeadGet
.
get
weibo
HeaderMap
(
null
);
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
null
);
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
headerMap
);
System
.
out
.
println
(
result
);
System
.
out
.
println
(
result
);
System
.
out
.
println
(
result
.
length
());
System
.
out
.
println
(
result
.
length
());
}
}
...
...
src/main/java/com/zhiwei/parse/QQKB.java
View file @
132e6350
...
@@ -33,9 +33,16 @@ public class QQKB {
...
@@ -33,9 +33,16 @@ public class QQKB {
Map
<
String
,
Object
>
paramMap
=
HeadGet
.
getQQAccountOneParamMap
(
child
);
Map
<
String
,
Object
>
paramMap
=
HeadGet
.
getQQAccountOneParamMap
(
child
);
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
try
{
try
{
String
result
=
HttpClient
.
executeHttpRequestPost
(
url
,
headerMap
,
paramMap
);
String
result
=
""
;
List
<
String
>
idsList
=
qqAccountAnalysis
.
getQQAllIds
(
result
);
List
<
String
>
idsList
=
new
ArrayList
<
String
>();
System
.
out
.
println
(
idsList
.
size
());
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
result
=
HttpClient
.
executeHttpRequestPost
(
url
,
headerMap
,
paramMap
);
idsList
=
qqAccountAnalysis
.
getQQAllIds
(
result
);
if
(
idsList
.
size
()
>
1
)
{
break
;
}
}
System
.
out
.
println
(
"此帐号可采集的历史文章数==============="
+
idsList
.
size
());
url
=
"http://r.cnews.qq.com/getSubNewsListItems"
;
url
=
"http://r.cnews.qq.com/getSubNewsListItems"
;
String
ids
=
""
;
String
ids
=
""
;
int
i
=
0
;
int
i
=
0
;
...
@@ -44,6 +51,7 @@ public class QQKB {
...
@@ -44,6 +51,7 @@ public class QQKB {
i
++;
i
++;
if
(
i
>=
20
)
{
if
(
i
>=
20
)
{
try
{
try
{
for
(
int
j
=
1
;
j
<
3
;
j
++)
{
ids
=
ids
.
substring
(
0
,
ids
.
length
()-
1
);
ids
=
ids
.
substring
(
0
,
ids
.
length
()-
1
);
System
.
out
.
println
(
ids
);
System
.
out
.
println
(
ids
);
ZhiWeiTools
.
sleep
(
7000
);
ZhiWeiTools
.
sleep
(
7000
);
...
@@ -51,9 +59,13 @@ public class QQKB {
...
@@ -51,9 +59,13 @@ public class QQKB {
paramMap
=
HeadGet
.
getQQAccountOtherParamMap
(
ids
);
paramMap
=
HeadGet
.
getQQAccountOtherParamMap
(
ids
);
result
=
HttpClient
.
executeHttpRequestPost
(
url
,
headerMap
,
paramMap
);
result
=
HttpClient
.
executeHttpRequestPost
(
url
,
headerMap
,
paramMap
);
List
<
Map
<
String
,
Object
>>
list
=
qqAccountAnalysis
.
analysisQQAccountData
(
result
);
List
<
Map
<
String
,
Object
>>
list
=
qqAccountAnalysis
.
analysisQQAccountData
(
result
);
if
(
list
!=
null
)
{
dataList
.
addAll
(
list
);
dataList
.
addAll
(
list
);
break
;
}
ids
=
""
;
ids
=
""
;
i
=
0
;
i
=
0
;
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
ids
=
""
;
ids
=
""
;
paramMap
.
clear
();
paramMap
.
clear
();
...
@@ -62,23 +74,26 @@ public class QQKB {
...
@@ -62,23 +74,26 @@ public class QQKB {
}
}
}
}
if
(
ids
.
length
()
>
1
)
{
if
(
ids
.
length
()
>
1
)
{
for
(
int
j
=
1
;
j
<
3
;
j
++)
{
ids
=
ids
.
substring
(
0
,
ids
.
length
()-
1
);
ids
=
ids
.
substring
(
0
,
ids
.
length
()-
1
);
ZhiWeiTools
.
sleep
(
8000
);
ZhiWeiTools
.
sleep
(
8000
);
paramMap
.
clear
();
paramMap
.
clear
();
paramMap
=
HeadGet
.
getQQAccountOtherParamMap
(
ids
);
paramMap
=
HeadGet
.
getQQAccountOtherParamMap
(
ids
);
result
=
HttpClient
.
executeHttpRequestPost
(
url
,
headerMap
,
paramMap
);
result
=
HttpClient
.
executeHttpRequestPost
(
url
,
headerMap
,
paramMap
);
List
<
Map
<
String
,
Object
>>
list
=
qqAccountAnalysis
.
analysisQQAccountData
(
result
);
List
<
Map
<
String
,
Object
>>
list
=
qqAccountAnalysis
.
analysisQQAccountData
(
result
);
if
(
list
!=
null
)
{
dataList
.
addAll
(
list
);
dataList
.
addAll
(
list
);
break
;
}
}
}
}
return
dataList
;
return
dataList
;
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取企鹅号历史文章未完全成功"
,
e
.
getMessage
());
logger
.
error
(
"获取企鹅号历史文章未完全成功"
,
e
.
getMessage
());
e
.
printStackTrace
();
return
dataList
;
return
dataList
;
}
}
}
}
/**
/**
*
*
* @Description 获取天天快报评论
* @Description 获取天天快报评论
...
...
src/test/java/com/zhiwei/crawler/QQAccountExample.java
View file @
132e6350
...
@@ -8,24 +8,42 @@ import org.junit.Test;
...
@@ -8,24 +8,42 @@ import org.junit.Test;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.QQKB
;
import
com.zhiwei.parse.QQKB
;
import
com.zhiwei.zhiweiTools.tools.ZhiWeiTools
;
public
class
QQAccountExample
{
public
class
QQAccountExample
{
@Test
@Test
public
void
qqAccountTest
()
{
public
void
qqAccountTest
()
{
String
child
=
"5975325"
;
String
cookie
=
"phone_id=;%20phone_token=;%20luin=o0497332654;%20lskey=000300006218e8444698ebbad28deed8243ef28e87f247b133bf5d19f160bce91d4e3e06ec2003cd4f92aff0;%20uin=o0497332654;%20skey=MQL8ScQBxu;%20sigA2=2EBEFFCC5C22FD27B32E6F21C76CA4A2AD8BF5D626B629A38C50923810CCD19C574CD093CFAD29C845084C8CD3E04B57DD24A69E4418E060C899696A728FB3B8B14C577F1FF93340;%20openid=oijc7uLZNVp85r-MpFBYxwxzdEkg;%20appid=wxe90c9765ad00e2cd;%20access_token=5_IWbsKfygpJ0lnbUnnFnfwmSQ3EqlyzseC9-qGEFY7Tkr0Ypk5vsnSwOaMC-IGxsBeY2K7knHrYstj_5dZpisJd5nihvLNQvCdsFhFwZQcT8;%20refresh_token=5_IWbsKfygpJ0lnbUnnFnfwgDT4pA9HEbY-wuqnqIHWf9AzdmIueZFuzHYfnZPuSNEc0ZjDuHXrtSrRBMMD-7kgj06iF0NdAOi-KRj6-mrmlA;%20unionid=onCs1uNNpjMXeYIHAhacGypamEBk;%20logintype=0"
;
List
<
Map
<
String
,
Object
>>
dataList
=
QQKB
.
getQQAccountData
(
child
,
cookie
);
System
.
out
.
println
(
dataList
.
size
());
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
Map
<
String
,
Object
>
dataMap
=
poi
.
importExcel
(
"D://crawlerdata/天天快报历史文章采集.xlsx"
,
0
);
List
<
Map
<
String
,
Object
>>
dataList
=
(
List
<
Map
<
String
,
Object
>>)
dataMap
.
get
(
"body"
);
String
cookie
=
"phone_id=;%20phone_token=;%20luin=o0497332654;%20lskey=000300006218e8444698ebbad28deed8243ef28e87f247b133bf5d19f160bce91d4e3e06ec2003cd4f92aff0;%20uin=o0497332654;%20skey=MQL8ScQBxu;%20sigA2=2EBEFFCC5C22FD27B32E6F21C76CA4A2AD8BF5D626B629A38C50923810CCD19C574CD093CFAD29C845084C8CD3E04B57DD24A69E4418E060C899696A728FB3B8B14C577F1FF93340;%20openid=oijc7uLZNVp85r-MpFBYxwxzdEkg;%20appid=wxe90c9765ad00e2cd;%20access_token=5_IWbsKfygpJ0lnbUnnFnfwmSQ3EqlyzseC9-qGEFY7Tkr0Ypk5vsnSwOaMC-IGxsBeY2K7knHrYstj_5dZpisJd5nihvLNQvCdsFhFwZQcT8;%20refresh_token=5_IWbsKfygpJ0lnbUnnFnfwgDT4pA9HEbY-wuqnqIHWf9AzdmIueZFuzHYfnZPuSNEc0ZjDuHXrtSrRBMMD-7kgj06iF0NdAOi-KRj6-mrmlA;%20unionid=onCs1uNNpjMXeYIHAhacGypamEBk;%20logintype=0"
;
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
for
(
Map
<
String
,
Object
>
map
:
dataList
)
{
String
child
=
map
.
get
(
"帐号链接"
)+
""
;
System
.
out
.
println
(
child
.
split
(
"chlid="
)[
1
]);
List
<
Map
<
String
,
Object
>>
lists
=
QQKB
.
getQQAccountData
(
child
.
split
(
"chlid="
)[
1
],
cookie
);
if
(
lists
!=
null
)
{
for
(
Map
<
String
,
Object
>
map1
:
lists
)
{
map1
.
put
(
"name"
,
map
.
get
(
"呢称"
));
map1
.
put
(
"主页地址"
,
map
.
get
(
"帐号链接"
));
bodyList
.
add
(
map1
);
}
}
System
.
out
.
println
(
"采集到的历史文章数总和============="
+
bodyList
.
size
());
ZhiWeiTools
.
sleep
(
5000
);
}
System
.
out
.
println
(
dataList
.
size
());
List
<
String
>
headList
=
new
ArrayList
<
String
>();
List
<
String
>
headList
=
new
ArrayList
<
String
>();
headList
.
add
(
"name"
);
headList
.
add
(
"主页地址"
);
headList
.
add
(
"title"
);
headList
.
add
(
"title"
);
headList
.
add
(
"time"
);
headList
.
add
(
"time"
);
headList
.
add
(
"content"
);
headList
.
add
(
"content"
);
headList
.
add
(
"url"
);
headList
.
add
(
"url"
);
headList
.
add
(
"commentid"
);
headList
.
add
(
"commentid"
);
poi
.
exportExcel
(
"D://crawlerdata/
qq-5975325.xlsx"
,
"asd"
,
headList
,
data
List
);
poi
.
exportExcel
(
"D://crawlerdata/
天天快报采集.xlsx"
,
"asd"
,
headList
,
body
List
);
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment