Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
A
articlenewscrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
chenweiyang
articlenewscrawler
Commits
89439323
Commit
89439323
authored
Apr 24, 2018
by
yangchen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
1
parent
132e6350
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
16 changed files
with
271 additions
and
30 deletions
+271
-30
src/main/java/com/zhiwei/httpclient/HeadGet.java
+0
-0
src/main/java/com/zhiwei/parse/Baijia.java
+45
-2
src/main/java/com/zhiwei/parse/Fenghuang.java
+5
-5
src/main/java/com/zhiwei/parse/QQKB.java
+2
-2
src/main/java/com/zhiwei/parse/TXNews.java
+53
-0
src/main/java/com/zhiwei/parse/analysis/BaijiaAccountAnalysis.java
+18
-0
src/main/java/com/zhiwei/parse/analysis/DayuAccountAnalysis.java
+0
-1
src/main/java/com/zhiwei/parse/analysis/TXNewsByWordAnalysis.java
+55
-0
src/test/java/com/zhiwei/crawler/BaijiaAccountExample.java
+18
-2
src/test/java/com/zhiwei/crawler/DayuAccountExample.java
+27
-5
src/test/java/com/zhiwei/crawler/DayuByWordExample.java
+1
-1
src/test/java/com/zhiwei/crawler/FenghuangAccountExample.java
+2
-2
src/test/java/com/zhiwei/crawler/QQAccountExample.java
+5
-4
src/test/java/com/zhiwei/crawler/SouhuAccountExample.java
+2
-2
src/test/java/com/zhiwei/crawler/TXNewsByWordExample.java
+26
-0
src/test/java/com/zhiwei/crawler/WangyiCommentExample.java
+12
-4
No files found.
src/main/java/com/zhiwei/httpclient/HeadGet.java
View file @
89439323
This diff is collapsed.
Click to expand it.
src/main/java/com/zhiwei/parse/Baijia.java
View file @
89439323
package
com
.
zhiwei
.
parse
;
package
com
.
zhiwei
.
parse
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
import
org.slf4j.Logger
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.parse.analysis.BaijiaAccountAnalysis
;
import
com.zhiwei.parse.analysis.BaijiaAccountAnalysis
;
import
com.zhiwei.zhiweiTools.timeParse.TimeParse
;
import
com.zhiwei.zhiweiTools.tools.ZhiWeiTools
;
import
com.zhiwei.zhiweiTools.tools.ZhiWeiTools
;
public
class
Baijia
{
public
class
Baijia
{
...
@@ -19,6 +23,45 @@ public class Baijia {
...
@@ -19,6 +23,45 @@ public class Baijia {
/**
/**
*
*
* @Description 百家号历史文章采集
* @Description 百家号历史文章采集
* @param app_id 百度新闻转发获取后面的数据
* @param startTime
* @return
*/
/**
 * Collects a Baijiahao account's historical articles from the Baidu News
 * subscription endpoint, following the paging information of each response.
 *
 * <p>Collection stops at the first article whose "time" value sorts before
 * {@code startTime} (plain string comparison), when the response reports no
 * further pages, or when a response lacks the "data"/"news" payload.
 *
 * @param app_id    value sent as the {@code forum_id} query parameter (per the
 *                  original note: taken from the tail of a Baidu News share link)
 * @param startTime lower bound for the article time; {@code null} disables the cutoff
 * @return the collected article maps; possibly empty, never {@code null}
 */
public static List<Map<String, Object>> getBaijiaAccount2Data(String app_id, String startTime) {
    final String baseUrl = "https://news.baidu.com/sn/api/homesubcribe?forum_id=" + app_id;
    List<Map<String, Object>> dataList = new ArrayList<Map<String, Object>>();
    Map<String, String> headerMap = HeadGet.getBaijiaAccount2HeaderMap(null);
    String url = baseUrl;
    while (true) {
        String result = HttpClient.executeHttpRequestGet(url, headerMap);
        JSONObject json = JSONObject.parseObject(result);
        JSONObject payload = json == null ? null : json.getJSONObject("data");
        JSONArray news = payload == null ? null : payload.getJSONArray("news");
        if (news == null) {
            // An empty or unexpected response used to surface as a NullPointerException;
            // return what has been collected so far instead.
            logger.warn("百家号返回数据缺少data/news字段:{}", url);
            break;
        }

        boolean reachedCutoff = false;
        for (int i = 0; i < news.size(); i++) {
            Map<String, Object> article = baijiaAccountAnalysis.getBaijiaAccount2Data(news.getJSONObject(i));
            String time = (String) article.get("time");
            // Guard the comparison: a null startTime means "no cutoff" and must not throw.
            if (startTime != null && time != null && startTime.compareTo(time) > 0) {
                reachedCutoff = true;
                break;
            }
            dataList.add(article);
            if (startTime != null && startTime.length() > 5) {
                logger.info("采集到的时间为:{}", time);
            }
        }
        logger.info("采集到的数据总量:{}", dataList.size());

        // Leave before building the next URL and sleeping when nothing more will be fetched.
        if (reachedCutoff || !payload.getBooleanValue("hasMore")) {
            break;
        }
        url = baseUrl + "&page=" + (payload.getIntValue("page") + 1);
        ZhiWeiTools.sleep(2000);
    }
    return dataList;
}
/**
*
* @Description 百家号历史文章采集
* @param app_id
* @param app_id
* @param startTime
* @param startTime
* @return
* @return
...
@@ -37,12 +80,12 @@ public class Baijia {
...
@@ -37,12 +80,12 @@ public class Baijia {
break
;
break
;
}
}
i
+=
20
;
i
+=
20
;
ZhiWeiTools
.
sleep
(
5
000
);
ZhiWeiTools
.
sleep
(
4
000
);
dataList
.
addAll
(
list
);
dataList
.
addAll
(
list
);
logger
.
info
(
url
+
i
+
"=============="
+
dataList
.
size
());
logger
.
info
(
url
+
i
+
"=============="
+
dataList
.
size
());
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
e
.
printStackTrace
();
ZhiWeiTools
.
sleep
(
4
000
);
ZhiWeiTools
.
sleep
(
3
000
);
logger
.
error
(
"此页解析出错"
,
e
.
getMessage
());
logger
.
error
(
"此页解析出错"
,
e
.
getMessage
());
continue
;
continue
;
}
}
...
...
src/main/java/com/zhiwei/parse/Fenghuang.java
View file @
89439323
...
@@ -109,6 +109,7 @@ public class Fenghuang {
...
@@ -109,6 +109,7 @@ public class Fenghuang {
int
i
=
1
;
int
i
=
1
;
try
{
try
{
while
(
true
)
{
while
(
true
)
{
try
{
String
url
=
"http://search.ifeng.com/sofeng/search.action?q="
+
URLEncoder
.
encode
(
word
,
"UTF-8"
)+
"&c=1&p="
;
String
url
=
"http://search.ifeng.com/sofeng/search.action?q="
+
URLEncoder
.
encode
(
word
,
"UTF-8"
)+
"&c=1&p="
;
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getFenghuangWordHeaderMap
(
null
);
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getFenghuangWordHeaderMap
(
null
);
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
+
i
,
headerMap
);
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
+
i
,
headerMap
);
...
@@ -125,15 +126,14 @@ public class Fenghuang {
...
@@ -125,15 +126,14 @@ public class Fenghuang {
break
;
break
;
}
}
ZhiWeiTools
.
sleep
(
4000
);
ZhiWeiTools
.
sleep
(
4000
);
}
catch
(
Exception
e
)
{
continue
;
}
}
}
return
dataList
;
}
catch
(
UnsupportedEncodingException
e
)
{
logger
.
error
(
"依据关键词获取凤凰文章出错"
,
e
.
getMessage
());
e
.
printStackTrace
();
return
dataList
;
return
dataList
;
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
"依据关键词获取凤凰文章出错"
,
e
.
getMessage
());
e
.
printStackTrace
();
e
.
printStackTrace
();
logger
.
error
(
"链接获取凤凰信息出错"
,
e
.
getMessage
());
return
dataList
;
return
dataList
;
}
}
}
}
...
...
src/main/java/com/zhiwei/parse/QQKB.java
View file @
89439323
...
@@ -59,12 +59,12 @@ public class QQKB {
...
@@ -59,12 +59,12 @@ public class QQKB {
paramMap
=
HeadGet
.
getQQAccountOtherParamMap
(
ids
);
paramMap
=
HeadGet
.
getQQAccountOtherParamMap
(
ids
);
result
=
HttpClient
.
executeHttpRequestPost
(
url
,
headerMap
,
paramMap
);
result
=
HttpClient
.
executeHttpRequestPost
(
url
,
headerMap
,
paramMap
);
List
<
Map
<
String
,
Object
>>
list
=
qqAccountAnalysis
.
analysisQQAccountData
(
result
);
List
<
Map
<
String
,
Object
>>
list
=
qqAccountAnalysis
.
analysisQQAccountData
(
result
);
ids
=
""
;
i
=
0
;
if
(
list
!=
null
)
{
if
(
list
!=
null
)
{
dataList
.
addAll
(
list
);
dataList
.
addAll
(
list
);
break
;
break
;
}
}
ids
=
""
;
i
=
0
;
}
}
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
ids
=
""
;
ids
=
""
;
...
...
src/main/java/com/zhiwei/parse/TXNews.java
0 → 100644
View file @
89439323
package
com
.
zhiwei
.
parse
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.parse.analysis.TXNewsByWordAnalysis
;
import
com.zhiwei.zhiweiTools.tools.ZhiWeiTools
;
/**
 * Keyword search against the Tencent News app endpoints.
 */
public class TXNews {

    private static final Logger logger = LoggerFactory.getLogger(TXNews.class);

    private static final String SEARCH_URL =
            "http://r.inews.qq.com/search?appver=11.2.1_qqnews_5.5.60&devid=6D33F35F-880D-42A6-A23F-881BEC6960EC";

    private static final String SEARCH_MORE_URL =
            "http://r.inews.qq.com/searchMore?appver=11.2.1_qqnews_5.5.60&devid=6D33F35F-880D-42A6-A23F-881BEC6960EC";

    /** Number of consecutive failed page requests after which paging is abandoned. */
    private static final int MAX_CONSECUTIVE_FAILURES = 3;

    private static TXNewsByWordAnalysis txNewsByWordAnalysis = new TXNewsByWordAnalysis();

    /**
     * Paging flag. It is written by {@link TXNewsByWordAnalysis#getData(String)} every
     * time a response is parsed and read by the paging loop in {@link #getData(String)}.
     */
    public static boolean hasMore = true;

    /**
     * Fetches the first result page for {@code word} and then keeps requesting
     * further pages for as long as {@link #hasMore} stays {@code true}.
     *
     * @param word search keyword
     * @return all article maps collected from the parsed pages
     */
    public static List<Map<String, Object>> getData(String word) {
        List<Map<String, Object>> dataList = new ArrayList<Map<String, Object>>();

        Map<String, String> headerMap = HeadGet.getTxNewspage1HeaderMap(null);
        Map<String, Object> paramMap = HeadGet.getTxNewspage1ParamMap(word);
        String result = HttpClient.executeHttpRequestPost(SEARCH_URL, headerMap, paramMap);
        // Parsing the first page also initialises hasMore for the loop below.
        dataList.addAll(txNewsByWordAnalysis.getData(result));

        int page = 2;
        int count = 0; // consecutive failures
        Map<String, String> header2Map = HeadGet.getTxNewspage2HeaderMap(null);
        while (hasMore) {
            try {
                ZhiWeiTools.sleep(5000);
                Map<String, Object> param2Map = HeadGet.getTxNewspagemoreParamMap(word, page);
                String result2 = HttpClient.executeHttpRequestPost(SEARCH_MORE_URL, header2Map, param2Map);
                page++;
                dataList.addAll(txNewsByWordAnalysis.getData(result2));
                logger.info("采集到数据======={}", dataList.size());
                count = 0;
            } catch (Exception e) {
                // The counter used to be incremented only inside the "count > 2" branch,
                // so it never left 0 and a persistent failure looped forever.
                count++;
                logger.error("采集出错,连续失败{}次:{}", count, e.getMessage(), e);
                if (count >= MAX_CONSECUTIVE_FAILURES) {
                    break;
                }
            }
        }
        return dataList;
    }
}
src/main/java/com/zhiwei/parse/analysis/BaijiaAccountAnalysis.java
View file @
89439323
...
@@ -14,11 +14,29 @@ import com.alibaba.fastjson.JSONArray;
...
@@ -14,11 +14,29 @@ import com.alibaba.fastjson.JSONArray;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.zhiweiTools.timeParse.TimeParse
;
import
com.zhiwei.zhiweiTools.tools.ZhiWeiTools
;
import
com.zhiwei.zhiweiTools.tools.ZhiWeiTools
;
public
class
BaijiaAccountAnalysis
{
public
class
BaijiaAccountAnalysis
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
BaijiaAccountAnalysis
.
class
);
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
BaijiaAccountAnalysis
.
class
);
/**
 * Converts one article object of the Baidu News subscription response into a flat map
 * with the keys "title", "url", "source", "time" and "content".
 *
 * <p>"time" is the article's "pulltime" value re-formatted as
 * {@code yyyy-MM-dd HH:mm:ss}. "content" is the concatenation of all fragments of
 * the "content" array whose "type" is "text", with anything matching {@code <.*?>}
 * removed.
 *
 * @param data one element of the response's news array
 * @return the flattened article
 */
public Map<String, Object> getBaijiaAccount2Data(JSONObject data) {
    Map<String, Object> map = new HashMap<String, Object>();
    map.put("title", data.getString("title"));
    map.put("url", data.getString("url"));
    map.put("source", data.getString("site"));
    map.put("time", TimeParse.dateFormartString(
            TimeParse.stringFormartDate(data.getString("pulltime")), "yyyy-MM-dd HH:mm:ss"));

    StringBuilder content = new StringBuilder();
    JSONArray fragments = data.getJSONArray("content");
    if (fragments != null) { // an article without a content array yields empty content
        for (int i = 0; i < fragments.size(); i++) {
            JSONObject fragment = fragments.getJSONObject(i);
            // Constant-first equals: a fragment without "type" is skipped instead of throwing.
            if (fragment == null || !"text".equals(fragment.getString("type"))) {
                continue;
            }
            String text = fragment.getString("data");
            if (text != null) { // avoid appending the literal "null"
                content.append(text);
            }
        }
    }
    map.put("content", content.toString().replaceAll("<.*?>", ""));
    return map;
}
/**
/**
*
*
...
...
src/main/java/com/zhiwei/parse/analysis/DayuAccountAnalysis.java
View file @
89439323
...
@@ -52,7 +52,6 @@ public class DayuAccountAnalysis {
...
@@ -52,7 +52,6 @@ public class DayuAccountAnalysis {
Map
<
String
,
Object
>
map
=
new
HashMap
<
String
,
Object
>();
Map
<
String
,
Object
>
map
=
new
HashMap
<
String
,
Object
>();
try
{
try
{
String
time
=
data
.
getString
(
"published_at"
).
replace
(
"T"
,
" "
).
split
(
"\\."
)[
0
];
String
time
=
data
.
getString
(
"published_at"
).
replace
(
"T"
,
" "
).
split
(
"\\."
)[
0
];
System
.
out
.
println
(
time
);
if
(
startTime
!=
null
&&
startTime
.
length
()
>
1
)
{
if
(
startTime
!=
null
&&
startTime
.
length
()
>
1
)
{
if
(
time
.
compareTo
(
startTime
)
<
0
)
{
if
(
time
.
compareTo
(
startTime
)
<
0
)
{
return
null
;
return
null
;
...
...
src/main/java/com/zhiwei/parse/analysis/TXNewsByWordAnalysis.java
0 → 100644
View file @
89439323
package
com
.
zhiwei
.
parse
.
analysis
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.parse.TXNews
;
/**
 * Parses Tencent News keyword-search responses into flat article maps.
 */
public class TXNewsByWordAnalysis {

    private static final Logger logger = LoggerFactory.getLogger(TXNewsByWordAnalysis.class);

    /**
     * Parses one search response.
     *
     * <p>Side effect: sets {@link TXNews#hasMore} to {@code true} only when the response's
     * "hasMore" field equals 1; a missing field counts as "no more pages".
     *
     * <p>Only sections of "secList" whose "secType" is 0 are read; each entry of such a
     * section's "newsList" becomes one map with the keys "title", "content" (taken from
     * "abstract"), "time", "source", "id" and "url".
     *
     * @param result raw JSON text of the response
     * @return the parsed articles; empty when the response has no "secList"
     * @throws IllegalArgumentException if {@code result} does not parse to a JSON object
     */
    public List<Map<String, Object>> getData(String result) {
        JSONObject json = JSONObject.parseObject(result);
        if (json == null) {
            // Fail with a clear message instead of a bare NullPointerException, so the
            // caller's error handling still sees this as a failed page.
            throw new IllegalArgumentException("empty or unparseable search response");
        }
        List<Map<String, Object>> dataList = new ArrayList<Map<String, Object>>();

        // Compare via a null check: unboxing a missing Integer field would throw.
        Integer more = json.getInteger("hasMore");
        TXNews.hasMore = more != null && more == 1;

        JSONArray sections = json.getJSONArray("secList");
        if (sections == null) {
            return dataList;
        }
        for (int i = 0; i < sections.size(); i++) {
            JSONObject section = sections.getJSONObject(i);
            if (section == null) {
                continue;
            }
            Integer secType = section.getInteger("secType");
            if (secType == null || secType != 0) {
                continue;
            }
            JSONArray newsList = section.getJSONArray("newsList");
            if (newsList == null) {
                continue;
            }
            for (int j = 0; j < newsList.size(); j++) {
                JSONObject news = newsList.getJSONObject(j);
                if (news == null) {
                    continue;
                }
                try {
                    dataList.add(toArticleMap(news));
                } catch (Exception e) {
                    logger.error("采集出错:{}", e.getMessage(), e);
                    // The offending payload goes to the log rather than to stdout.
                    logger.error("出错数据:{}", news);
                }
            }
        }
        return dataList;
    }

    /** Copies the fields of one "newsList" entry into a fresh map. */
    private Map<String, Object> toArticleMap(JSONObject news) {
        Map<String, Object> map = new HashMap<String, Object>();
        map.put("title", news.getString("title"));
        map.put("content", news.getString("abstract"));
        map.put("time", news.getString("time"));
        map.put("source", news.getString("source"));
        map.put("id", news.getString("id"));
        map.put("url", news.getString("url"));
        return map;
    }
}
src/test/java/com/zhiwei/crawler/BaijiaAccountExample.java
View file @
89439323
...
@@ -13,7 +13,7 @@ public class BaijiaAccountExample {
...
@@ -13,7 +13,7 @@ public class BaijiaAccountExample {
@Test
@Test
public
void
baijiaAccountTest
()
{
public
void
baijiaAccountTest
()
{
String
app_id
=
"1536766
731827943
"
;
String
app_id
=
"1536766
390576806
"
;
String
startTime
=
"2016-01-01 00:00:00"
;
String
startTime
=
"2016-01-01 00:00:00"
;
//2017-11-30 17:48:17
//2017-11-30 17:48:17
List
<
Map
<
String
,
Object
>>
lists
=
Baijia
.
getBaijiaAccountData
(
app_id
,
startTime
);
List
<
Map
<
String
,
Object
>>
lists
=
Baijia
.
getBaijiaAccountData
(
app_id
,
startTime
);
...
@@ -26,7 +26,23 @@ public class BaijiaAccountExample {
...
@@ -26,7 +26,23 @@ public class BaijiaAccountExample {
headList
.
add
(
"source"
);
headList
.
add
(
"source"
);
headList
.
add
(
"url"
);
headList
.
add
(
"url"
);
headList
.
add
(
"content"
);
headList
.
add
(
"content"
);
poi
.
exportExcel
(
"D://crawlerdata/百家号-蓝鲸TMT网.xlsx"
,
"蓝鲸TMT网"
,
headList
,
lists
);
poi
.
exportExcel
(
"D://crawlerdata/百家号-太保.xlsx"
,
"太保"
,
headList
,
lists
);
}
// @Test
/**
 * Manual driver: crawls one Baijiahao account through
 * {@code Baijia.getBaijiaAccount2Data} and exports the rows to an Excel workbook.
 */
public void baijiaAccount2Test() {
    // Column order of the exported sheet.
    List<String> columns = new ArrayList<String>();
    columns.add("title");
    columns.add("time");
    columns.add("source");
    columns.add("url");
    columns.add("content");

    // Sample timestamp left by the original author: 2017-11-30 17:48:17
    final String accountId = "b_1536766390576806";
    final String since = "2016-01-01 00:00:00";
    List<Map<String, Object>> rows = Baijia.getBaijiaAccount2Data(accountId, since);

    PoiExcelUtil exporter = PoiExcelUtil.getInstance();
    exporter.exportExcel("D://crawlerdata/百家号-俊世太保.xlsx", "俊世太保", columns, rows);
}
}
}
}
src/test/java/com/zhiwei/crawler/DayuAccountExample.java
View file @
89439323
package
com
.
zhiwei
.
crawler
;
package
com
.
zhiwei
.
crawler
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
import
org.junit.Test
;
import
org.junit.Test
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Dayu
;
import
com.zhiwei.parse.Dayu
;
public
class
DayuAccountExample
{
public
class
DayuAccountExample
{
...
@@ -15,12 +17,32 @@ public class DayuAccountExample {
...
@@ -15,12 +17,32 @@ public class DayuAccountExample {
//https://api.m.sm.cn/rest?method=Subscribe.list&format=html&from=wh10331&uc_biz_str=S:custom%7CC:search%7CN:true
//https://api.m.sm.cn/rest?method=Subscribe.list&format=html&from=wh10331&uc_biz_str=S:custom%7CC:search%7CN:true
String
mid
=
"d7300311c1504d24a229c3da345785c6"
;
// String mid = "d7300311c1504d24a229c3da345785c6";
String
name
=
"大鱼海棠雨"
;
// String name = "大鱼海棠雨";
String
startTime
=
"2017-12-05 22:08:01"
;
String
startTime
=
"2018-03-16 00:00:00"
;
String
path
=
"D:\\crawlerdata\\自媒体\\大鱼历史文章.xlsx"
;
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
Map
<
String
,
Object
>
map
=
poi
.
importExcel
(
path
,
0
);
List
<
Map
<
String
,
Object
>>
lists
=
(
List
<
Map
<
String
,
Object
>>)
map
.
get
(
"body"
);
List
<
String
>
headList
=
new
ArrayList
<
String
>();
headList
.
add
(
"title"
);
headList
.
add
(
"time"
);
headList
.
add
(
"content"
);
headList
.
add
(
"source"
);
headList
.
add
(
"url"
);
// headList.add("content_id");
// headList.add("origin_id");
// headList.add("xss_item_id");
for
(
Map
<
String
,
Object
>
data
:
lists
)
{
String
mid
=
data
.
get
(
"mid"
)+
""
;
String
name
=
data
.
get
(
"name"
)+
""
;
if
(
mid
.
length
()
<
1
&&
name
.
length
()
<
1
)
{
continue
;
}
List
<
Map
<
String
,
Object
>>
dataList
=
Dayu
.
getDayuAccountData
(
mid
,
name
,
null
);
List
<
Map
<
String
,
Object
>>
dataList
=
Dayu
.
getDayuAccountData
(
mid
,
name
,
null
);
System
.
out
.
println
(
dataList
.
size
());
poi
.
exportExcel
(
path
,
name
,
headList
,
dataList
);
}
}
}
...
...
src/test/java/com/zhiwei/crawler/DayuByWordExample.java
View file @
89439323
...
@@ -12,7 +12,7 @@ public class DayuByWordExample {
...
@@ -12,7 +12,7 @@ public class DayuByWordExample {
@Test
@Test
public
void
dayuByWordTest
()
{
public
void
dayuByWordTest
()
{
String
word
=
"
京东
"
;
String
word
=
"
沃尔玛
"
;
List
<
Map
<
String
,
Object
>>
dataList
=
Dayu
.
getDayuByWordData
(
word
);
List
<
Map
<
String
,
Object
>>
dataList
=
Dayu
.
getDayuByWordData
(
word
);
...
...
src/test/java/com/zhiwei/crawler/FenghuangAccountExample.java
View file @
89439323
...
@@ -15,7 +15,7 @@ public class FenghuangAccountExample {
...
@@ -15,7 +15,7 @@ public class FenghuangAccountExample {
public
void
fenghuangAccountTest
()
{
public
void
fenghuangAccountTest
()
{
//所用时间长 1s1篇文章吧
//所用时间长 1s1篇文章吧
//https://api.3g.ifeng.com/client_search_subscribe?k=(凤凰号名称拿id)
//https://api.3g.ifeng.com/client_search_subscribe?k=(凤凰号名称拿id)
String
id
=
"
276718
"
;
String
id
=
"
724
"
;
String
[]
ids
=
id
.
split
(
","
);
String
[]
ids
=
id
.
split
(
","
);
for
(
int
i
=
0
;
i
<
ids
.
length
;
i
++)
{
for
(
int
i
=
0
;
i
<
ids
.
length
;
i
++)
{
try
{
try
{
...
@@ -29,7 +29,7 @@ public class FenghuangAccountExample {
...
@@ -29,7 +29,7 @@ public class FenghuangAccountExample {
headList
.
add
(
"source"
);
headList
.
add
(
"source"
);
headList
.
add
(
"url"
);
headList
.
add
(
"url"
);
headList
.
add
(
"id"
);
headList
.
add
(
"id"
);
poi
.
exportExcel
(
"D://crawlerdata/凤凰-
另眼看世界
.xlsx"
,
ids
[
i
],
headList
,
dataList
);
poi
.
exportExcel
(
"D://crawlerdata/凤凰-
电商报
.xlsx"
,
ids
[
i
],
headList
,
dataList
);
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
continue
;
continue
;
}
}
...
...
src/test/java/com/zhiwei/crawler/QQAccountExample.java
View file @
89439323
...
@@ -16,14 +16,15 @@ public class QQAccountExample {
...
@@ -16,14 +16,15 @@ public class QQAccountExample {
public
void
qqAccountTest
()
{
public
void
qqAccountTest
()
{
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
Map
<
String
,
Object
>
dataMap
=
poi
.
importExcel
(
"D://crawlerdata/天天快报历史文章采集.xlsx"
,
0
);
Map
<
String
,
Object
>
dataMap
=
poi
.
importExcel
(
"D://crawlerdata/
/自媒体/
天天快报历史文章采集.xlsx"
,
0
);
List
<
Map
<
String
,
Object
>>
dataList
=
(
List
<
Map
<
String
,
Object
>>)
dataMap
.
get
(
"body"
);
List
<
Map
<
String
,
Object
>>
dataList
=
(
List
<
Map
<
String
,
Object
>>)
dataMap
.
get
(
"body"
);
String
cookie
=
"phone_id=;%20phone_token=;%20luin=o0497332654;%20lskey=000300006218e8444698ebbad28deed8243ef28e87f247b133bf5d19f160bce91d4e3e06ec2003cd4f92aff0;%20uin=o0497332654;%20skey=MQL8ScQBxu;%20sigA2=2EBEFFCC5C22FD27B32E6F21C76CA4A2AD8BF5D626B629A38C50923810CCD19C574CD093CFAD29C845084C8CD3E04B57DD24A69E4418E060C899696A728FB3B8B14C577F1FF93340;%20openid=oijc7uLZNVp85r-MpFBYxwxzdEkg;%20appid=wxe90c9765ad00e2cd;%20access_token=5_IWbsKfygpJ0lnbUnnFnfwmSQ3EqlyzseC9-qGEFY7Tkr0Ypk5vsnSwOaMC-IGxsBeY2K7knHrYstj_5dZpisJd5nihvLNQvCdsFhFwZQcT8;%20refresh_token=5_IWbsKfygpJ0lnbUnnFnfwgDT4pA9HEbY-wuqnqIHWf9AzdmIueZFuzHYfnZPuSNEc0ZjDuHXrtSrRBMMD-7kgj06iF0NdAOi-KRj6-mrmlA;%20unionid=onCs1uNNpjMXeYIHAhacGypamEBk;%20logintype=0"
;
String
cookie
=
"phone_id=;%20phone_token=;%20luin=o0497332654;%20lskey=000300006218e8444698ebbad28deed8243ef28e87f247b133bf5d19f160bce91d4e3e06ec2003cd4f92aff0;%20uin=o0497332654;%20skey=MQL8ScQBxu;%20sigA2=2EBEFFCC5C22FD27B32E6F21C76CA4A2AD8BF5D626B629A38C50923810CCD19C574CD093CFAD29C845084C8CD3E04B57DD24A69E4418E060C899696A728FB3B8B14C577F1FF93340;%20openid=oijc7uLZNVp85r-MpFBYxwxzdEkg;%20appid=wxe90c9765ad00e2cd;%20access_token=5_IWbsKfygpJ0lnbUnnFnfwmSQ3EqlyzseC9-qGEFY7Tkr0Ypk5vsnSwOaMC-IGxsBeY2K7knHrYstj_5dZpisJd5nihvLNQvCdsFhFwZQcT8;%20refresh_token=5_IWbsKfygpJ0lnbUnnFnfwgDT4pA9HEbY-wuqnqIHWf9AzdmIueZFuzHYfnZPuSNEc0ZjDuHXrtSrRBMMD-7kgj06iF0NdAOi-KRj6-mrmlA;%20unionid=onCs1uNNpjMXeYIHAhacGypamEBk;%20logintype=0"
;
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
for
(
Map
<
String
,
Object
>
map
:
dataList
)
{
for
(
Map
<
String
,
Object
>
map
:
dataList
)
{
String
child
=
map
.
get
(
"帐号链接"
)+
""
;
String
child
=
map
.
get
(
"帐号链接"
)+
""
;
System
.
out
.
println
(
child
.
split
(
"chlid="
)[
1
]);
// System.out.println(child.split("chlid=")[1]);
List
<
Map
<
String
,
Object
>>
lists
=
QQKB
.
getQQAccountData
(
child
.
split
(
"chlid="
)[
1
],
cookie
);
System
.
out
.
println
((
String
)
map
.
get
(
"child"
));
List
<
Map
<
String
,
Object
>>
lists
=
QQKB
.
getQQAccountData
((
String
)
map
.
get
(
"child"
),
cookie
);
if
(
lists
!=
null
)
{
if
(
lists
!=
null
)
{
for
(
Map
<
String
,
Object
>
map1
:
lists
)
{
for
(
Map
<
String
,
Object
>
map1
:
lists
)
{
map1
.
put
(
"name"
,
map
.
get
(
"呢称"
));
map1
.
put
(
"name"
,
map
.
get
(
"呢称"
));
...
@@ -43,7 +44,7 @@ public class QQAccountExample {
...
@@ -43,7 +44,7 @@ public class QQAccountExample {
headList
.
add
(
"content"
);
headList
.
add
(
"content"
);
headList
.
add
(
"url"
);
headList
.
add
(
"url"
);
headList
.
add
(
"commentid"
);
headList
.
add
(
"commentid"
);
poi
.
exportExcel
(
"D://crawlerdata/
天天快报采集
.xlsx"
,
"asd"
,
headList
,
bodyList
);
poi
.
exportExcel
(
"D://crawlerdata/
/自媒体/天天快报采集-科技编年史
.xlsx"
,
"asd"
,
headList
,
bodyList
);
}
}
...
...
src/test/java/com/zhiwei/crawler/SouhuAccountExample.java
View file @
89439323
...
@@ -16,7 +16,7 @@ public class SouhuAccountExample {
...
@@ -16,7 +16,7 @@ public class SouhuAccountExample {
@Test
@Test
public
void
souhuAccountTest
()
{
public
void
souhuAccountTest
()
{
List
<
Map
<
String
,
Object
>>
lists
=
Souhu
.
getSouHuAccountData
(
"
cHBhZzUyMTNjZjAzZTczYUBzb2h1LmNvbQ=="
,
"2017
-01-01 00:00:00"
,
false
);
List
<
Map
<
String
,
Object
>>
lists
=
Souhu
.
getSouHuAccountData
(
"
MjI5MzAyOTMyMEBzaW5hLnNvaHUuY29t"
,
"2016
-01-01 00:00:00"
,
false
);
System
.
out
.
println
(
lists
.
size
());
System
.
out
.
println
(
lists
.
size
());
List
<
String
>
headList
=
new
ArrayList
<
String
>();
List
<
String
>
headList
=
new
ArrayList
<
String
>();
headList
.
add
(
"title"
);
headList
.
add
(
"title"
);
...
@@ -28,7 +28,7 @@ public class SouhuAccountExample {
...
@@ -28,7 +28,7 @@ public class SouhuAccountExample {
headList
.
add
(
"newsid"
);
headList
.
add
(
"newsid"
);
headList
.
add
(
"newsPv"
);
headList
.
add
(
"newsPv"
);
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
poi
.
exportExcel
(
"D:\\crawlerdata\\搜狐号历史文章-
蓝媒汇.xlsx"
,
"蓝媒汇
"
,
headList
,
lists
);
poi
.
exportExcel
(
"D:\\crawlerdata\\搜狐号历史文章-
太保乱谈.xlsx"
,
"太保乱谈
"
,
headList
,
lists
);
}
}
}
}
src/test/java/com/zhiwei/crawler/TXNewsByWordExample.java
0 → 100644
View file @
89439323
package
com
.
zhiwei
.
crawler
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.TXNews
;
/**
 * Manual driver: runs a Tencent News keyword search and writes the result to Excel.
 */
public class TXNewsByWordExample {

    /**
     * Searches for a fixed keyword via {@code TXNews.getData} and exports the rows.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        final String keyword = "唐嫣";
        List<Map<String, Object>> rows = TXNews.getData(keyword);
        PoiExcelUtil exporter = PoiExcelUtil.getInstance();
        exporter.exportExcel("D://crawlerdata/腾讯新闻-唐嫣.xlsx", "腾讯新闻数据", buildHeader(), rows);
    }

    /** Builds the column list of the exported sheet, in output order. */
    private static List<String> buildHeader() {
        List<String> columns = new ArrayList<String>();
        for (String column : new String[] {"title", "time", "content", "url", "id", "source"}) {
            columns.add(column);
        }
        return columns;
    }
}
src/test/java/com/zhiwei/crawler/WangyiCommentExample.java
View file @
89439323
...
@@ -14,12 +14,20 @@ public class WangyiCommentExample {
...
@@ -14,12 +14,20 @@ public class WangyiCommentExample {
//若出错 可能数据有重复 以id为准
//若出错 可能数据有重复 以id为准
@Test
@Test
public
void
wangyiCommentTest
()
{
public
void
wangyiCommentTest
()
{
String
url
=
"http://news.163.com/18/0210/09/DA9B8PVJ000189FH.html"
;
List
<
String
>
urlList
=
new
ArrayList
<
String
>();
urlList
.
add
(
"https://c.m.163.com/news/a/DCQ42REV05118O92.html?spss=newsapp"
);
String
id
=
url
.
split
(
"/"
)[
6
].
split
(
".ht"
)[
0
];
urlList
.
add
(
"https://c.m.163.com/news/a/DCPLJ5GB05198R91.html?spss=newsapp"
);
urlList
.
add
(
"https://c.m.163.com/news/a/DCRNI7020511CPVM.html?spss=newsapp"
);
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<>();
for
(
String
url
:
urlList
)
{
String
id
=
url
.
split
(
"a/"
)[
1
].
split
(
".ht"
)[
0
];
List
<
Map
<
String
,
Object
>>
lists
=
Wangyi
.
getWangyiCommentData
(
id
);
List
<
Map
<
String
,
Object
>>
lists
=
Wangyi
.
getWangyiCommentData
(
id
);
System
.
out
.
println
(
lists
.
size
());
System
.
out
.
println
(
lists
.
size
());
if
(
lists
!=
null
)
{
bodyList
.
addAll
(
lists
);
}
}
List
<
String
>
headList
=
new
ArrayList
<
String
>();
List
<
String
>
headList
=
new
ArrayList
<
String
>();
headList
.
add
(
"content"
);
headList
.
add
(
"content"
);
headList
.
add
(
"id"
);
headList
.
add
(
"id"
);
...
@@ -29,7 +37,7 @@ public class WangyiCommentExample {
...
@@ -29,7 +37,7 @@ public class WangyiCommentExample {
headList
.
add
(
"unlike"
);
headList
.
add
(
"unlike"
);
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
poi
.
exportExcel
(
"D:\\crawlerdata\\网易评论采集
测试.xlsx"
,
"asd"
,
headList
,
lists
);
poi
.
exportExcel
(
"D:\\crawlerdata\\网易评论采集
-3.xlsx"
,
"asd"
,
headList
,
bodyList
);
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment