Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
A
articlenewscrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
chenweiyang
articlenewscrawler
Commits
a8ebdd2c
Commit
a8ebdd2c
authored
Feb 09, 2018
by
yangchen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
自媒体采集部分优化
parent
a205f946
Show whitespace changes
Inline
Side-by-side
Showing
16 changed files
with
263 additions
and
44 deletions
+263
-44
src/main/java/com/zhiwei/httpclient/HeadGet.java
+33
-3
src/main/java/com/zhiwei/parse/Baijia.java
+4
-4
src/main/java/com/zhiwei/parse/Fenghuang.java
+11
-5
src/main/java/com/zhiwei/parse/Souhu.java
+1
-0
src/main/java/com/zhiwei/parse/Xiaomi.java
+46
-0
src/main/java/com/zhiwei/parse/analysis/BaijiaAccountAnalysis.java
+3
-3
src/main/java/com/zhiwei/parse/analysis/FenghuangAccountAnalysis.java
+17
-3
src/main/java/com/zhiwei/parse/analysis/XiaomiShequByWordAnalysis.java
+51
-0
src/main/java/com/zhiwei/util/TimeUtil.java
+5
-6
src/test/java/com/zhiwei/crawler/BaijiaAccountExample.java
+14
-3
src/test/java/com/zhiwei/crawler/FenghuangAccountExample.java
+21
-6
src/test/java/com/zhiwei/crawler/QQAccountExample.java
+3
-3
src/test/java/com/zhiwei/crawler/SouhuAccountExample.java
+4
-2
src/test/java/com/zhiwei/crawler/SouhuCommentExample.java
+1
-1
src/test/java/com/zhiwei/crawler/XiaomiShequByWordExample.java
+35
-0
src/test/java/com/zhiwei/crawler/YidianzixunAccountExample.java
+14
-5
No files found.
src/main/java/com/zhiwei/httpclient/HeadGet.java
View file @
a8ebdd2c
...
@@ -614,10 +614,40 @@ public class HeadGet {
...
@@ -614,10 +614,40 @@ public class HeadGet {
return
headerMap
;
return
headerMap
;
}
}
/**
 * Builds the fixed set of HTTP request headers sent to {@code api.k.sohu.com}.
 *
 * @param cookie value for the {@code Cookie} header; when {@code null} no
 *               {@code Cookie} entry is added
 * @return a new mutable map of header name to header value
 */
public static Map<String, String> getSouhuByWordHeaderMap(String cookie) {
    Map<String, String> headers = new HashMap<String, String>();
    headers.put("Host", "api.k.sohu.com");
    headers.put("Connection", "keep-alive");
    headers.put("Accept-Language", "zh-CN,zh;q=0.9");
    headers.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
    headers.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36");
    // The cookie is optional: callers pass null for anonymous requests.
    if (null == cookie) {
        return headers;
    }
    headers.put("Cookie", cookie);
    return headers;
}
/**
 * Builds the fixed set of HTTP request headers sent to {@code so.bbs.xiaomi.cn}.
 *
 * @param cookie value for the {@code Cookie} header; when {@code null} no
 *               {@code Cookie} entry is added
 * @return a new mutable map of header name to header value
 */
public static Map<String, String> getxiaomiShequByWordHeaderMap(String cookie) {
    Map<String, String> headers = new HashMap<String, String>();
    headers.put("Host", "so.bbs.xiaomi.cn");
    headers.put("Connection", "keep-alive");
    headers.put("Accept-Language", "zh-CN,zh;q=0.9");
    headers.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
    headers.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36");
    // The cookie is optional: callers pass null for anonymous requests.
    if (null == cookie) {
        return headers;
    }
    headers.put("Cookie", cookie);
    return headers;
}
public
static
void
main
(
String
[]
args
)
{
public
static
void
main
(
String
[]
args
)
{
String
url
=
"http://
comment.dy.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/D75MDLL10524H5KD/comments/newList?offset=0&limit=30&showLevelThreshold=72&headLimit=1&tailLimit=2&callback=getData&ibc=newspc&_=1514966469573
"
;
String
url
=
"http://
so.bbs.xiaomi.cn/?q=%E5%B0%8F%E7%B1%B3%20%E7%94%B5%E9%A5%AD%E7%85%B2%20%E5%BC%80%E8%A3%82&p=1&fid=0&time=63072000&order=1
"
;
String
cookie
=
"
phone_id=;%20phone_token=;%20luin=o0497332654;%20lskey=00030000fafc45b92e51a92d1a2d1c0536594402729a928137fe205f823d71e18c3e786e6f368baff37f7edc;%20uin=o0497332654;%20skey=MSF4MCe62n;%20sigA2=75E9AE34BD844F7CD19AC30353DE6116A767F02C50C78ABA2FB11B5B1D74324CCEDA1C9D13B6B3719AAA7875B14DBE4C560CB5FB99A5D63390B8041F6C83A48401EA8D5DA7B04E7A;%20openid=oijc7uLZNVp85r-MpFBYxwxzdEkg;%20appid=wxe90c9765ad00e2cd;%20access_token=5_IWbsKfygpJ0lnbUnnFnfwvJbQ-Gsn52dfcob8V66AgcW1SAGy8xloQk1nVWfjVvR0b637c-qcRWE7M2QtFLKLsZP8o6dBVABpDhbzRQ92tw;%20refresh_token=5_IWbsKfygpJ0lnbUnnFnfwgDT4pA9HEbY-wuqnqIHWf9AzdmIueZFuzHYfnZPuSNEc0ZjDuHXrtSrRBMMD-7kgj06iF0NdAOi-KRj6-mrmlA;%20unionid=onCs1uNNpjMXeYIHAhacGypamEBk;%20logintype=0
"
;
String
cookie
=
"
mstuid=1518141097798_2540; Hm_lvt_71558e7b4aa822e282e758f8dc0b88b0=1518141098; lastsource=so.bbs.xiaomi.cn; mstz=||795199218.38||http%3A%2F%2Fso.bbs.xiaomi.cn%2F%3Fq%3D%25e5%25b0%258f%25e7%25b1%25b3%2520%25e7%2594%25b5%25e9%25a5%25ad%25e7%2585%25b2%2520%25e5%25bc%2580%25e8%25a3%2582%7Cp%3D1%7Cfid%3D0%7Ctime%3D31536000%7Corder%3D1|http%3A%2F%2Fso.bbs.xiaomi.cn%2F%3Fq%3D%25e5%25b0%258f%25e7%25b1%25b3%2520%25e7%2594%25b5%25e9%25a5%25ad%25e7%2585%25b2%2520%25e5%25bc%2580%25e8%25a3%2582%7Cp%3D1%7Cfid%3D0%7Ctime%3D63072000%7Corder%3D1; xm_vistor=1518141097798_2540_1518141097798-1518142530797; msttime=http%3A%2F%2Fso.bbs.xiaomi.cn%2F%3Fq%3D%25E5%25B0%258F%25E7%25B1%25B3%2520%25E7%2594%25B5%25E9%25A5%25AD%25E7%2585%25B2%2520%25E5%25BC%2580%25E8%25A3%2582%26p%3D1%26fid%3D0%26time%3D63072000%26order%3D1; msttime1=http%3A%2F%2Fso.bbs.xiaomi.cn%2F%3Fq%3D%25E5%25B0%258F%25E7%25B1%25B3%2520%25E7%2594%25B5%25E9%25A5%25AD%25E7%2585%25B2%2520%25E5%25BC%2580%25E8%25A3%2582%26p%3D1%26fid%3D0%26time%3D63072000%26order%3D1; Hm_lpvt_71558e7b4aa822e282e758f8dc0b88b0=1518142531
"
;
Map
<
String
,
String
>
headerMap
=
HeadGet
.
get
WangyiComment
HeaderMap
(
null
);
Map
<
String
,
String
>
headerMap
=
HeadGet
.
get
xiaomiShequByWord
HeaderMap
(
null
);
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
headerMap
);
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
headerMap
);
System
.
out
.
println
(
result
);
System
.
out
.
println
(
result
);
System
.
out
.
println
(
result
.
length
());
System
.
out
.
println
(
result
.
length
());
...
...
src/main/java/com/zhiwei/parse/Baijia.java
View file @
a8ebdd2c
...
@@ -26,23 +26,23 @@ public class Baijia {
...
@@ -26,23 +26,23 @@ public class Baijia {
public
static
List
<
Map
<
String
,
Object
>>
getBaijiaAccountData
(
String
app_id
,
String
startTime
)
{
public
static
List
<
Map
<
String
,
Object
>>
getBaijiaAccountData
(
String
app_id
,
String
startTime
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
int
i
=
0
;
int
i
=
0
;
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getBaijiaAccountHeaderMap
(
null
);
try
{
try
{
while
(
true
)
{
while
(
true
)
{
try
{
try
{
String
url
=
"https://baijia.baidu.com/writerlistarticle?ajax=json&app_id="
+
app_id
+
"&_limit=20&_skip="
;
String
url
=
"https://baijia.baidu.com/writerlistarticle?ajax=json&app_id="
+
app_id
+
"&_limit=20&_skip="
;
logger
.
info
(
url
+
i
);
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getBaijiaAccountHeaderMap
(
null
);
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
+
i
,
headerMap
);
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
+
i
,
headerMap
);
List
<
Map
<
String
,
Object
>>
list
=
baijiaAccountAnalysis
.
getBaijiaAccountData
(
result
,
startTime
);
List
<
Map
<
String
,
Object
>>
list
=
baijiaAccountAnalysis
.
getBaijiaAccountData
(
result
,
startTime
);
if
(
list
==
null
||
list
.
size
()
<
1
){
if
(
list
==
null
||
list
.
size
()
<
1
){
break
;
break
;
}
}
i
+=
20
;
i
+=
20
;
ZhiWeiTools
.
sleep
(
6
000
);
ZhiWeiTools
.
sleep
(
5
000
);
dataList
.
addAll
(
list
);
dataList
.
addAll
(
list
);
logger
.
info
(
url
+
i
+
"=============="
+
dataList
.
size
());
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
e
.
printStackTrace
();
ZhiWeiTools
.
sleep
(
5
000
);
ZhiWeiTools
.
sleep
(
4
000
);
logger
.
error
(
"此页解析出错"
,
e
.
getMessage
());
logger
.
error
(
"此页解析出错"
,
e
.
getMessage
());
continue
;
continue
;
}
}
...
...
src/main/java/com/zhiwei/parse/Fenghuang.java
View file @
a8ebdd2c
...
@@ -32,17 +32,23 @@ public class Fenghuang {
...
@@ -32,17 +32,23 @@ public class Fenghuang {
public
static
List
<
Map
<
String
,
Object
>>
getFenghuangAccountData
(
String
id
,
String
startTime
)
{
public
static
List
<
Map
<
String
,
Object
>>
getFenghuangAccountData
(
String
id
,
String
startTime
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
int
i
=
1
;
int
i
=
1
;
while
(
true
){
boolean
f
=
true
;
while
(
f
){
try
{
try
{
for
(
int
j
=
0
;
j
<
4
;
j
++){
f
=
true
;
String
url
=
"http://api.3g.ifeng.com/api_wemedia_index?followid=weMedia_"
+
id
+
"&page="
+
i
+
"&pagesize=20&tag=article"
;
String
url
=
"http://api.3g.ifeng.com/api_wemedia_index?followid=weMedia_"
+
id
+
"&page="
+
i
+
"&pagesize=20&tag=article"
;
System
.
out
.
println
(
"====================采集第"
+
i
+
"页"
);
List
<
Map
<
String
,
Object
>>
list
=
fenghuangAccountAnalysis
.
getArticleData
(
url
,
startTime
);
List
<
Map
<
String
,
Object
>>
list
=
fenghuangAccountAnalysis
.
getArticleData
(
url
,
startTime
);
if
(
list
==
null
||
list
.
size
()
<
1
)
{
if
(
list
!=
null
&&
list
.
size
()
>
0
)
{
dataList
.
addAll
(
list
);
System
.
out
.
println
(
"====================采集第"
+
i
+
"页===共获取数据=="
+
dataList
.
size
());
i
++;
ZhiWeiTools
.
sleep
(
2000
);
break
;
break
;
}
}
dataList
.
addAll
(
list
)
;
f
=
false
;
ZhiWeiTools
.
sleep
(
2000
);
ZhiWeiTools
.
sleep
(
2000
);
i
++;
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
"程序出错"
,
e
.
getMessage
());
logger
.
error
(
"程序出错"
,
e
.
getMessage
());
return
dataList
;
return
dataList
;
...
...
src/main/java/com/zhiwei/parse/Souhu.java
View file @
a8ebdd2c
...
@@ -22,6 +22,7 @@ public class Souhu {
...
@@ -22,6 +22,7 @@ public class Souhu {
private
static
SouhuAccountAnalysis
souhuAccountAnalysis
=
new
SouhuAccountAnalysis
();
private
static
SouhuAccountAnalysis
souhuAccountAnalysis
=
new
SouhuAccountAnalysis
();
private
static
SouhuCommentAnalysis
souhuCommentAnalysis
=
new
SouhuCommentAnalysis
();
private
static
SouhuCommentAnalysis
souhuCommentAnalysis
=
new
SouhuCommentAnalysis
();
/**
/**
*
*
* @Description 获取链接评论数
* @Description 获取链接评论数
...
...
src/main/java/com/zhiwei/parse/Xiaomi.java
0 → 100644
View file @
a8ebdd2c
package
com
.
zhiwei
.
parse
;
import
java.io.UnsupportedEncodingException
;
import
java.net.URLEncoder
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.parse.analysis.XiaomiShequByWordAnalysis
;
import
com.zhiwei.zhiweiTools.tools.ZhiWeiTools
;
/**
 * Keyword-search crawler for the Xiaomi community search site
 * ({@code so.bbs.xiaomi.cn}).
 */
public class Xiaomi {
    private static final Logger logger = LoggerFactory.getLogger(Xiaomi.class);
    private static final XiaomiShequByWordAnalysis xiaomiShequByWordAnalysis = new XiaomiShequByWordAnalysis();

    /**
     * Fetches search-result pages for {@code word}, starting at page 1, and
     * accumulates the parsed rows until a page yields no rows.
     *
     * @param word search keyword; it is URL-encoded as UTF-8 before being placed
     *             in the query string. A {@code null} keyword yields an empty list.
     * @return all rows parsed from the fetched pages, in page order; never
     *         {@code null}, possibly empty
     */
    public static List<Map<String, Object>> getXiaomiByWordData(String word) {
        List<Map<String, Object>> bodyList = new ArrayList<Map<String, Object>>();
        if (word == null) {
            logger.warn("search keyword is null, nothing to crawl");
            return bodyList;
        }

        // The encoded keyword does not change between pages, so encode it once.
        String encodedWord;
        try {
            encodedWord = URLEncoder.encode(word, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            logger.error("failed to URL-encode search keyword: " + word, e);
            return bodyList;
        }

        Map<String, String> headerMap = HeadGet.getxiaomiShequByWordHeaderMap(null);
        int page = 1;
        while (true) {
            // NOTE(review): fid=457 and time=63072000 are fixed filters copied from
            // the site's search URL; their exact meaning is not visible here.
            String url = "http://so.bbs.xiaomi.cn/?q=" + encodedWord + "&p=" + page
                    + "&fid=457&time=63072000&order=1";
            String result = HttpClient.executeHttpRequestGet(url, headerMap);
            List<Map<String, Object>> dataList = xiaomiShequByWordAnalysis.getdata(result);
            // A null (parse failure) or empty page ends the crawl.
            if (dataList == null || dataList.isEmpty()) {
                break;
            }
            bodyList.addAll(dataList);
            // Pause between pages; presumably milliseconds — verify against ZhiWeiTools.
            ZhiWeiTools.sleep(5000);
            logger.info("第{}页==========={}", page, bodyList.size());
            page++;
        }
        return bodyList;
    }
}
src/main/java/com/zhiwei/parse/analysis/BaijiaAccountAnalysis.java
View file @
a8ebdd2c
package
com
.
zhiwei
.
parse
.
analysis
;
package
com
.
zhiwei
.
parse
.
analysis
;
import
java.io.IOException
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.List
;
...
@@ -49,13 +48,14 @@ public class BaijiaAccountAnalysis {
...
@@ -49,13 +48,14 @@ public class BaijiaAccountAnalysis {
if
(
url
==
null
)
{
if
(
url
==
null
)
{
url
=
"https://baijia.baidu.com/s?old_id="
+
id
;
url
=
"https://baijia.baidu.com/s?old_id="
+
id
;
}
}
map
.
put
(
"content"
,
getBaijiaContent
(
url
));
// map.put("content", getBaijiaContent(url));
map
.
put
(
"content"
,
data
.
getString
(
"abstract"
));
map
.
put
(
"read_amount"
,
data
.
getString
(
"read_amount"
)==
null
?
0
:
data
.
getString
(
"read_amount"
));
map
.
put
(
"read_amount"
,
data
.
getString
(
"read_amount"
)==
null
?
0
:
data
.
getString
(
"read_amount"
));
map
.
put
(
"app_id"
,
data
.
getString
(
"app_id"
));
map
.
put
(
"app_id"
,
data
.
getString
(
"app_id"
));
map
.
put
(
"time"
,
time
);
map
.
put
(
"time"
,
time
);
map
.
put
(
"url"
,
url
);
map
.
put
(
"url"
,
url
);
map
.
put
(
"source"
,
data
.
getString
(
"writer_name"
));
map
.
put
(
"source"
,
data
.
getString
(
"writer_name"
));
System
.
out
.
println
(
map
.
toString
());
//
System.out.println(map.toString());
dataList
.
add
(
map
);
dataList
.
add
(
map
);
}
}
return
dataList
;
return
dataList
;
...
...
src/main/java/com/zhiwei/parse/analysis/FenghuangAccountAnalysis.java
View file @
a8ebdd2c
...
@@ -26,11 +26,25 @@ public class FenghuangAccountAnalysis {
...
@@ -26,11 +26,25 @@ public class FenghuangAccountAnalysis {
*/
*/
public
List
<
Map
<
String
,
Object
>>
getArticleData
(
String
url
,
String
startTime
)
{
public
List
<
Map
<
String
,
Object
>>
getArticleData
(
String
url
,
String
startTime
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
try
{
try
{
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getFenghuangAccountHeaderMap
(
null
);
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getFenghuangAccountHeaderMap
(
null
);
JSONArray
jsonArry
=
null
;
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
try
{
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
headerMap
);
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
headerMap
);
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
JSONArray
jsonArry
=
json
.
getJSONObject
(
"data"
).
getJSONObject
(
"feeds"
).
getJSONArray
(
"list"
);
jsonArry
=
json
.
getJSONObject
(
"data"
).
getJSONObject
(
"feeds"
).
getJSONArray
(
"list"
);
if
(
jsonArry
==
null
||
jsonArry
.
size
()
<
1
)
{
continue
;
}
}
catch
(
Exception
e
)
{
continue
;
}
}
if
(
jsonArry
==
null
||
jsonArry
.
size
()
<
1
)
{
return
dataList
;
}
for
(
int
i
=
0
;
i
<
jsonArry
.
size
();
i
++)
{
for
(
int
i
=
0
;
i
<
jsonArry
.
size
();
i
++)
{
try
{
try
{
JSONObject
data
=
jsonArry
.
getJSONObject
(
i
);
JSONObject
data
=
jsonArry
.
getJSONObject
(
i
);
...
@@ -50,12 +64,12 @@ public class FenghuangAccountAnalysis {
...
@@ -50,12 +64,12 @@ public class FenghuangAccountAnalysis {
continue
;
continue
;
}
}
}
}
return
dataList
;
}
catch
(
Exception
e1
)
{
}
catch
(
Exception
e1
)
{
e1
.
printStackTrace
();
e1
.
printStackTrace
();
return
null
;
}
return
dataList
;
return
dataList
;
}
}
}
private
static
Map
<
String
,
Object
>
getArticle
(
String
articleResult
)
{
private
static
Map
<
String
,
Object
>
getArticle
(
String
articleResult
)
{
JSONObject
json
=
JSONObject
.
parseObject
(
articleResult
).
getJSONObject
(
"body"
);
JSONObject
json
=
JSONObject
.
parseObject
(
articleResult
).
getJSONObject
(
"body"
);
...
...
src/main/java/com/zhiwei/parse/analysis/XiaomiShequByWordAnalysis.java
0 → 100644
View file @
a8ebdd2c
package
com
.
zhiwei
.
parse
.
analysis
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
/**
 * Parses the HTML of a Xiaomi community search-result page into row maps.
 */
public class XiaomiShequByWordAnalysis {
    private static final Logger logger = LoggerFactory.getLogger(XiaomiShequByWordAnalysis.class);

    /**
     * Extracts one map per {@code dl} entry found under {@code div.search_list}.
     * Each map carries the keys {@code title}, {@code source}, {@code url},
     * {@code time} and {@code content}. Entries whose info line has fewer than
     * six space-separated tokens are skipped.
     *
     * @param result raw HTML of a search-result page
     * @return the parsed rows (possibly empty), or {@code null} when parsing
     *         throws; callers must handle both
     */
    public List<Map<String, Object>> getdata(String result) {
        List<Map<String, Object>> dataList = new ArrayList<Map<String, Object>>();
        try {
            Document doc = Jsoup.parse(result);
            Elements elements = doc.select("div.search_list").select("dl");
            logger.debug("search result entries found: {}", elements.size());
            for (Element element : elements) {
                // NOTE(review): assumes tokens 4 and 5 of the info line are the date
                // and the time of the post — confirm against the live page markup.
                String[] infoParts = element.select("dd").select("div.info").text().split(" ");
                if (infoParts.length < 6) {
                    // Info line too short to contain a timestamp: skip this entry.
                    continue;
                }
                Elements titleLink = element.select("dt").select("a");
                Map<String, Object> map = new HashMap<String, Object>();
                map.put("title", titleLink.text());
                map.put("source", "小米社区");
                map.put("url", titleLink.attr("href"));
                map.put("time", infoParts[4] + " " + infoParts[5]);
                map.put("content", element.select("dd").select("p").select("a").text());
                dataList.add(map);
            }
            return dataList;
        } catch (Exception e) {
            // Keep the cause in the log so parse failures can be diagnosed.
            logger.error("解析出错", e);
            return null;
        }
    }
}
src/main/java/com/zhiwei/util/TimeUtil.java
View file @
a8ebdd2c
package
com
.
zhiwei
.
util
;
package
com
.
zhiwei
.
util
;
import
org.slf4j.Logger
;
import
java.util.Calendar
;
import
org.slf4j.LoggerFactory
;
public
class
TimeUtil
{
public
class
TimeUtil
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
TimeUtil
.
class
);
public
static
String
timeUtil
(
String
time
)
{
public
static
String
timeUtil
(
String
time
)
{
Calendar
calendar
=
Calendar
.
getInstance
();
if
(
time
.
split
(
"-"
).
length
==
2
)
{
if
(
time
.
split
(
"-"
).
length
==
2
)
{
time
=
"2017-"
+
time
+
":00"
;
time
=
calendar
.
getWeekYear
()
+
"-"
+
time
+
":00"
;
}
else
{
}
else
{
return
null
;
return
"20"
+
time
+
":00"
;
}
}
return
time
;
return
time
;
}
}
public
static
void
main
(
String
[]
args
)
{
public
static
void
main
(
String
[]
args
)
{
String
time
=
"1
7-1
2-12 15:01"
;
String
time
=
"12-12 15:01"
;
System
.
out
.
println
(
timeUtil
(
time
));
System
.
out
.
println
(
timeUtil
(
time
));
}
}
...
...
src/test/java/com/zhiwei/crawler/BaijiaAccountExample.java
View file @
a8ebdd2c
package
com
.
zhiwei
.
crawler
;
package
com
.
zhiwei
.
crawler
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
import
org.junit.Test
;
import
org.junit.Test
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Baijia
;
import
com.zhiwei.parse.Baijia
;
public
class
BaijiaAccountExample
{
public
class
BaijiaAccountExample
{
@Test
@Test
public
void
baijiaAccountTest
()
{
public
void
baijiaAccountTest
()
{
String
app_id
=
"153676
7984069926
"
;
String
app_id
=
"153676
6731827943
"
;
String
startTime
=
""
;
String
startTime
=
"
2016-01-01 00:00:00
"
;
//2017-11-30 17:48:17
//2017-11-30 17:48:17
List
<
Map
<
String
,
Object
>>
lists
=
Baijia
.
getBaijiaAccountData
(
app_id
,
startTime
);
List
<
Map
<
String
,
Object
>>
lists
=
Baijia
.
getBaijiaAccountData
(
app_id
,
startTime
);
System
.
out
.
println
(
lists
.
size
());
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
List
<
String
>
headList
=
new
ArrayList
<
String
>();
headList
.
add
(
"title"
);
headList
.
add
(
"time"
);
headList
.
add
(
"read_amount"
);
headList
.
add
(
"app_id"
);
headList
.
add
(
"source"
);
headList
.
add
(
"url"
);
headList
.
add
(
"content"
);
poi
.
exportExcel
(
"D://crawlerdata/百家号-蓝鲸TMT网.xlsx"
,
"蓝鲸TMT网"
,
headList
,
lists
);
}
}
}
}
src/test/java/com/zhiwei/crawler/FenghuangAccountExample.java
View file @
a8ebdd2c
package
com
.
zhiwei
.
crawler
;
package
com
.
zhiwei
.
crawler
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
import
org.junit.Test
;
import
org.junit.Test
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Fenghuang
;
import
com.zhiwei.parse.Fenghuang
;
public
class
FenghuangAccountExample
{
public
class
FenghuangAccountExample
{
...
@@ -12,13 +14,26 @@ public class FenghuangAccountExample {
...
@@ -12,13 +14,26 @@ public class FenghuangAccountExample {
@Test
@Test
public
void
fenghuangAccountTest
()
{
public
void
fenghuangAccountTest
()
{
//所用时间长 1s1篇文章吧
//所用时间长 1s1篇文章吧
String
id
=
"733691"
;
//https://api.3g.ifeng.com/client_search_subscribe?k=(凤凰号名称拿id)
String
startTime
=
"2017-11-15 00:00:00"
;
//可为空
String
id
=
"276718"
;
List
<
Map
<
String
,
Object
>>
dataList
=
Fenghuang
.
getFenghuangAccountData
(
id
,
startTime
);
String
[]
ids
=
id
.
split
(
","
);
for
(
Map
<
String
,
Object
>
map
:
dataList
)
{
for
(
int
i
=
0
;
i
<
ids
.
length
;
i
++)
{
System
.
out
.
println
(
map
.
toString
());
try
{
String
startTime
=
"2017-01-01 00:00:00"
;
//可为空
List
<
Map
<
String
,
Object
>>
dataList
=
Fenghuang
.
getFenghuangAccountData
(
ids
[
i
],
startTime
);
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
List
<
String
>
headList
=
new
ArrayList
<
String
>();
headList
.
add
(
"title"
);
headList
.
add
(
"time"
);
headList
.
add
(
"text"
);
headList
.
add
(
"source"
);
headList
.
add
(
"url"
);
headList
.
add
(
"id"
);
poi
.
exportExcel
(
"D://crawlerdata/凤凰-另眼看世界.xlsx"
,
ids
[
i
],
headList
,
dataList
);
}
catch
(
Exception
e
)
{
continue
;
}
}
}
System
.
out
.
println
(
dataList
.
size
());
}
}
}
}
src/test/java/com/zhiwei/crawler/QQAccountExample.java
View file @
a8ebdd2c
...
@@ -13,8 +13,8 @@ public class QQAccountExample {
...
@@ -13,8 +13,8 @@ public class QQAccountExample {
@Test
@Test
public
void
qqAccountTest
()
{
public
void
qqAccountTest
()
{
String
child
=
"5
002744
"
;
String
child
=
"5
975325
"
;
String
cookie
=
"phone_id=;%20phone_token=;%20luin=o0497332654;%20lskey=00030000
db3c2ec2393ea968f523f50144db7ab5aec60e79d2509c271bdacdf784e88ac1f58b7493c23ceb15;%20uin=o0497332654;%20skey=M67MOgvFQJ;%20sigA2=D3046D543D9BA50CFE749D63B1F05AF28A281C29B4F1353374AB7A19D9527497A67E507C6829AE44F67C1EA032C2A3728301D2ABC864DA32BCA7D4C7A61609F9F3BC9AE0A7243003;%20openid=oijc7uLZNVp85r-MpFBYxwxzdEkg;%20appid=wxe90c9765ad00e2cd;%20access_token=5_IWbsKfygpJ0lnbUnnFnfwmUT_jxJCnY5yVwhmL3e2K5FOTRth6jz8SKVHGseA3v9s8UIZxw00LpF1uC9l7W5WL2trdb69LlCvE1s7twReOw
;%20refresh_token=5_IWbsKfygpJ0lnbUnnFnfwgDT4pA9HEbY-wuqnqIHWf9AzdmIueZFuzHYfnZPuSNEc0ZjDuHXrtSrRBMMD-7kgj06iF0NdAOi-KRj6-mrmlA;%20unionid=onCs1uNNpjMXeYIHAhacGypamEBk;%20logintype=0"
;
String
cookie
=
"phone_id=;%20phone_token=;%20luin=o0497332654;%20lskey=00030000
6218e8444698ebbad28deed8243ef28e87f247b133bf5d19f160bce91d4e3e06ec2003cd4f92aff0;%20uin=o0497332654;%20skey=MQL8ScQBxu;%20sigA2=2EBEFFCC5C22FD27B32E6F21C76CA4A2AD8BF5D626B629A38C50923810CCD19C574CD093CFAD29C845084C8CD3E04B57DD24A69E4418E060C899696A728FB3B8B14C577F1FF93340;%20openid=oijc7uLZNVp85r-MpFBYxwxzdEkg;%20appid=wxe90c9765ad00e2cd;%20access_token=5_IWbsKfygpJ0lnbUnnFnfwmSQ3EqlyzseC9-qGEFY7Tkr0Ypk5vsnSwOaMC-IGxsBeY2K7knHrYstj_5dZpisJd5nihvLNQvCdsFhFwZQcT8
;%20refresh_token=5_IWbsKfygpJ0lnbUnnFnfwgDT4pA9HEbY-wuqnqIHWf9AzdmIueZFuzHYfnZPuSNEc0ZjDuHXrtSrRBMMD-7kgj06iF0NdAOi-KRj6-mrmlA;%20unionid=onCs1uNNpjMXeYIHAhacGypamEBk;%20logintype=0"
;
List
<
Map
<
String
,
Object
>>
dataList
=
QQKB
.
getQQAccountData
(
child
,
cookie
);
List
<
Map
<
String
,
Object
>>
dataList
=
QQKB
.
getQQAccountData
(
child
,
cookie
);
System
.
out
.
println
(
dataList
.
size
());
System
.
out
.
println
(
dataList
.
size
());
...
@@ -25,7 +25,7 @@ public class QQAccountExample {
...
@@ -25,7 +25,7 @@ public class QQAccountExample {
headList
.
add
(
"content"
);
headList
.
add
(
"content"
);
headList
.
add
(
"url"
);
headList
.
add
(
"url"
);
headList
.
add
(
"commentid"
);
headList
.
add
(
"commentid"
);
poi
.
exportExcel
(
"D://crawlerdata/qq-5
002744
.xlsx"
,
"asd"
,
headList
,
dataList
);
poi
.
exportExcel
(
"D://crawlerdata/qq-5
975325
.xlsx"
,
"asd"
,
headList
,
dataList
);
}
}
...
...
src/test/java/com/zhiwei/crawler/SouhuAccountExample.java
View file @
a8ebdd2c
...
@@ -12,9 +12,11 @@ import com.zhiwei.parse.Souhu;
...
@@ -12,9 +12,11 @@ import com.zhiwei.parse.Souhu;
public
class
SouhuAccountExample
{
public
class
SouhuAccountExample
{
//http://search.sohu.com/?keyword=%E8%99%8E%E5%97%85&source=article&queryType=edit&ie=utf8
@Test
@Test
public
void
souhuAccountTest
()
{
public
void
souhuAccountTest
()
{
List
<
Map
<
String
,
Object
>>
lists
=
Souhu
.
getSouHuAccountData
(
"c
29odXptdGh5YXRie
UBzb2h1LmNvbQ=="
,
"2017-01-01 00:00:00"
,
false
);
List
<
Map
<
String
,
Object
>>
lists
=
Souhu
.
getSouHuAccountData
(
"c
HBhZzUyMTNjZjAzZTczY
UBzb2h1LmNvbQ=="
,
"2017-01-01 00:00:00"
,
false
);
System
.
out
.
println
(
lists
.
size
());
System
.
out
.
println
(
lists
.
size
());
List
<
String
>
headList
=
new
ArrayList
<
String
>();
List
<
String
>
headList
=
new
ArrayList
<
String
>();
headList
.
add
(
"title"
);
headList
.
add
(
"title"
);
...
@@ -26,7 +28,7 @@ public class SouhuAccountExample {
...
@@ -26,7 +28,7 @@ public class SouhuAccountExample {
headList
.
add
(
"newsid"
);
headList
.
add
(
"newsid"
);
headList
.
add
(
"newsPv"
);
headList
.
add
(
"newsPv"
);
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
poi
.
exportExcel
(
"D:\\crawlerdata\\搜狐号历史文章
.xlsx"
,
"sasd
"
,
headList
,
lists
);
poi
.
exportExcel
(
"D:\\crawlerdata\\搜狐号历史文章
-蓝媒汇.xlsx"
,
"蓝媒汇
"
,
headList
,
lists
);
}
}
}
}
src/test/java/com/zhiwei/crawler/SouhuCommentExample.java
View file @
a8ebdd2c
...
@@ -14,7 +14,7 @@ public class SouhuCommentExample {
...
@@ -14,7 +14,7 @@ public class SouhuCommentExample {
@Test
@Test
public
void
souhuCommentTest
()
{
public
void
souhuCommentTest
()
{
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
Map
<
String
,
Object
>
map
=
poi
.
importExcel
(
"D://crawlerdata/搜狐评论采集.xlsx"
,
0
);
Map
<
String
,
Object
>
map
=
poi
.
importExcel
(
"D://crawlerdata/搜狐评论采集.xlsx"
,
0
);
List
<
Map
<
String
,
Object
>>
list
=
(
List
<
Map
<
String
,
Object
>>)
map
.
get
(
"body"
);
List
<
Map
<
String
,
Object
>>
list
=
(
List
<
Map
<
String
,
Object
>>)
map
.
get
(
"body"
);
...
...
src/test/java/com/zhiwei/crawler/XiaomiShequByWordExample.java
0 → 100644
View file @
a8ebdd2c
package
com
.
zhiwei
.
crawler
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Xiaomi
;
/**
 * Manual driver: crawls the Xiaomi community search for a fixed list of
 * keywords and exports the combined rows to an Excel workbook.
 */
public class XiaomiShequByWordExample {
    public static void main(String[] args) {
        // Comma-separated search keywords; each one is crawled separately.
        String word = "小米 电饭煲 锅体开裂,小米 电饭煲 开裂,小米 电饭煲 裂缝,小米 电饭煲 烫伤,小米 电饭煲 变形";
        String[] keywords = word.split(",");

        List<Map<String, Object>> collected = new ArrayList<Map<String, Object>>();
        for (int k = 0; k < keywords.length; k++) {
            List<Map<String, Object>> perKeyword = Xiaomi.getXiaomiByWordData(keywords[k]);
            if (perKeyword == null || perKeyword.size() <= 0) {
                continue;
            }
            collected.addAll(perKeyword);
        }

        // Column headers, in the order they are passed to the export.
        List<String> columns = new ArrayList<String>();
        for (String column : new String[] {"title", "time", "source", "url", "content"}) {
            columns.add(column);
        }

        PoiExcelUtil exporter = PoiExcelUtil.getInstance();
        exporter.exportExcel("D:\\crawlerdata\\小米-关键词电饭煲相关.xlsx", "小米社区采集", columns, collected);
    }
}
src/test/java/com/zhiwei/crawler/YidianzixunAccountExample.java
View file @
a8ebdd2c
package
com
.
zhiwei
.
crawler
;
package
com
.
zhiwei
.
crawler
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
import
org.junit.Test
;
import
org.junit.Test
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Yidianzixun
;
import
com.zhiwei.parse.Yidianzixun
;
...
@@ -12,12 +14,19 @@ public class YidianzixunAccountExample {
...
@@ -12,12 +14,19 @@ public class YidianzixunAccountExample {
@Test
@Test
public
void
yidianzixunAccountTest
()
{
public
void
yidianzixunAccountTest
()
{
String
channelid
=
"m1
33695
"
;
String
channelid
=
"m1
43901
"
;
String
startTime
=
"2017-0
9-10 09:42:05
"
;
String
startTime
=
"2017-0
1-01 00:00:00
"
;
List
<
Map
<
String
,
Object
>>
dataList
=
Yidianzixun
.
getYidianzixunAccountData
(
channelid
,
startTime
);
List
<
Map
<
String
,
Object
>>
dataList
=
Yidianzixun
.
getYidianzixunAccountData
(
channelid
,
startTime
);
for
(
Map
<
String
,
Object
>
map
:
dataList
)
{
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
System
.
out
.
println
(
map
.
toString
());
List
<
String
>
headList
=
new
ArrayList
<
String
>();
}
headList
.
add
(
"title"
);
headList
.
add
(
"time"
);
headList
.
add
(
"comment_count"
);
headList
.
add
(
"ctype"
);
headList
.
add
(
"source"
);
headList
.
add
(
"url"
);
headList
.
add
(
"summary"
);
poi
.
exportExcel
(
"D://crawlerdata/一点资讯-虎嗅.xlsx"
,
"虎嗅"
,
headList
,
dataList
);
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment