Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
A
articlenewscrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
chenweiyang
articlenewscrawler
Commits
7bf0e1d7
Commit
7bf0e1d7
authored
Jan 04, 2018
by
yangchen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
增加部分自媒体采集
parent
f09aa1c9
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
343 additions
and
61 deletions
+343
-61
src/main/java/com/zhiwei/httpclient/HeadGet.java
+145
-36
src/main/java/com/zhiwei/parse/Baijia.java
+8
-1
src/main/java/com/zhiwei/parse/Dayu.java
+44
-1
src/main/java/com/zhiwei/parse/Fenghuang.java
+7
-2
src/main/java/com/zhiwei/parse/Meipai.java
+7
-1
src/main/java/com/zhiwei/parse/Miaopai.java
+6
-2
src/main/java/com/zhiwei/parse/QQKB.java
+97
-5
src/main/java/com/zhiwei/parse/Souhu.java
+10
-2
src/main/java/com/zhiwei/parse/Yidianzixun.java
+0
-1
src/main/java/com/zhiwei/parse/analysis/QQKBAccountAnalysis.java
+2
-2
src/main/java/com/zhiwei/parse/analysis/SouhuCommentAnalysis.java
+1
-0
src/test/java/com/zhiwei/crawler/QQAccountExample.java
+2
-2
src/test/java/com/zhiwei/crawler/SouhuAccountExample.java
+14
-6
No files found.
src/main/java/com/zhiwei/httpclient/HeadGet.java
View file @
7bf0e1d7
This diff is collapsed.
Click to expand it.
src/main/java/com/zhiwei/parse/Baijia.java
View file @
7bf0e1d7
...
...
@@ -16,6 +16,13 @@ public class Baijia {
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
Baijia
.
class
);
private
static
BaijiaAccountAnalysis
baijiaAccountAnalysis
=
new
BaijiaAccountAnalysis
();
/**
*
* @Description 百家号历史文章采集
* @param app_id
* @param startTime
* @return
*/
public
static
List
<
Map
<
String
,
Object
>>
getBaijiaAccountData
(
String
app_id
,
String
startTime
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
int
i
=
0
;
...
...
@@ -23,7 +30,7 @@ public class Baijia {
while
(
true
)
{
try
{
String
url
=
"https://baijia.baidu.com/writerlistarticle?ajax=json&app_id="
+
app_id
+
"&_limit=20&_skip="
;
System
.
out
.
println
(
url
+
i
);
logger
.
info
(
url
+
i
);
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getBaijiaAccountHeaderMap
(
null
);
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
+
i
,
headerMap
);
List
<
Map
<
String
,
Object
>>
list
=
baijiaAccountAnalysis
.
getBaijiaAccountData
(
result
,
startTime
);
...
...
src/main/java/com/zhiwei/parse/Dayu.java
View file @
7bf0e1d7
package
com
.
zhiwei
.
parse
;
import
java.net.URLEncoder
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.List
;
...
...
@@ -12,6 +13,7 @@ import com.alibaba.fastjson.JSONObject;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.parse.analysis.DayuAccountAnalysis
;
import
com.zhiwei.parse.analysis.DayuByWordAnalysis
;
import
com.zhiwei.parse.analysis.DayuCommentAnalysis
;
import
com.zhiwei.zhiweiTools.tools.ZhiWeiTools
;
...
...
@@ -19,6 +21,7 @@ public class Dayu {
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
Dayu
.
class
);
private
static
DayuAccountAnalysis
dayuAccountAnalysis
=
new
DayuAccountAnalysis
();
private
static
DayuCommentAnalysis
dayuCommentAnalysis
=
new
DayuCommentAnalysis
();
private
static
DayuByWordAnalysis
dayuByWordAnalysis
=
new
DayuByWordAnalysis
();
/**
*
...
...
@@ -45,7 +48,7 @@ public class Dayu {
dataList
.
addAll
(
lists
);
System
.
out
.
println
(
"================解析第"
+
i
+
"页====此时有数据=="
+
dataList
.
size
());
i
++;
ZhiWeiTools
.
sleep
(
8
000
);
ZhiWeiTools
.
sleep
(
7
000
);
}
return
dataList
;
}
catch
(
Exception
e
)
{
...
...
@@ -94,6 +97,12 @@ public class Dayu {
}
/**
*
* @Description 获取文章评论数
* @param articleId
* @return
*/
public
static
int
getDayuCommentCount
(
String
articleId
)
{
String
url
=
"http://m.uczzd.cn/iflow/api/v2/cmt/article/"
+
articleId
+
"/comments/byhot"
;
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getDayuCommentHeaderMap
(
null
);
...
...
@@ -102,4 +111,38 @@ public class Dayu {
return
json
.
getJSONObject
(
"data"
).
getInteger
(
"comment_cnt"
);
}
/**
 * Collects Dayu articles that match a search keyword.
 *
 * <p>Requests result pages from the zzd.sm.cn search endpoint (20 items per
 * page, starting at page 1) and parses each response with
 * {@code dayuByWordAnalysis}. Paging stops at the first page that parses to
 * {@code null} or to an empty list. {@code ZhiWeiTools.sleep(5000)} is called
 * between pages (presumably a 5 second pause; confirm the unit in ZhiWeiTools).
 *
 * @param word search keyword; URL-encoded as UTF-8 before it is sent
 * @return all records parsed so far, never {@code null}; if an exception is
 *         thrown, the records gathered before the failure are returned
 */
public static List<Map<String, Object>> getDayuByWordData(String word) {
    Map<String, String> headerMap = HeadGet.getDayuByWordHeaderMap(null);
    List<Map<String, Object>> dataList = new ArrayList<Map<String, Object>>();
    try {
        // The keyword is the same for every page, so encode it only once.
        String encodedWord = URLEncoder.encode(word, "UTF-8");
        int i = 1;
        while (true) {
            String url = "http://zzd.sm.cn/iflow/api/v1/article/fsearch?page=" + i
                    + "&size=20&sid=&q=" + encodedWord + "&scene=0";
            System.out.println(url);
            String result = HttpClient.executeHttpRequestGet(url, headerMap);
            List<Map<String, Object>> lists = dayuByWordAnalysis.getDayuByWordData(result);
            // A null or empty page marks the end of the search results.
            if (lists == null || lists.isEmpty()) {
                break;
            }
            dataList.addAll(lists);
            ZhiWeiTools.sleep(5000);
            i++;
        }
    } catch (Exception e) {
        // Pass the exception itself: the message has no "{}" placeholder, so
        // a plain argument would be dropped and the cause would be lost.
        logger.error("关键词获取大鱼信息出错", e);
    }
    return dataList;
}
}
src/main/java/com/zhiwei/parse/Fenghuang.java
View file @
7bf0e1d7
package
com
.
zhiwei
.
parse
;
import
java.io.IOException
;
import
java.io.UnsupportedEncodingException
;
import
java.net.URLEncoder
;
import
java.util.ArrayList
;
...
...
@@ -93,6 +92,12 @@ public class Fenghuang {
return
map
;
}
/**
*
* @Description 凤凰关键词采集
* @param word
* @return
*/
public
static
List
<
Map
<
String
,
Object
>>
getFenghuangByWord
(
String
word
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
int
i
=
1
;
...
...
@@ -120,7 +125,7 @@ public class Fenghuang {
logger
.
error
(
"依据关键词获取凤凰文章出错"
,
e
.
getMessage
());
e
.
printStackTrace
();
return
dataList
;
}
catch
(
IO
Exception
e
)
{
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
logger
.
error
(
"链接获取凤凰信息出错"
,
e
.
getMessage
());
return
dataList
;
...
...
src/main/java/com/zhiwei/parse/Meipai.java
View file @
7bf0e1d7
package
com
.
zhiwei
.
parse
;
import
java.io.UnsupportedEncodingException
;
import
java.net.URLEncoder
;
import
java.util.ArrayList
;
import
java.util.List
;
...
...
@@ -18,6 +17,12 @@ public class Meipai {
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
Meipai
.
class
);
private
static
MeipaiByWordAnalysis
meipaiByWordAnalysis
=
new
MeipaiByWordAnalysis
();
/**
*
* @Description 美拍关键词获取视频数据
* @param word
* @return
*/
public
static
List
<
Map
<
String
,
Object
>>
getMeipaiByWordData
(
String
word
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
try
{
...
...
@@ -44,6 +49,7 @@ public class Meipai {
return
dataList
;
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取数据出错"
,
e
.
getMessage
());
e
.
printStackTrace
();
return
dataList
;
}
...
...
src/main/java/com/zhiwei/parse/Miaopai.java
View file @
7bf0e1d7
package
com
.
zhiwei
.
parse
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
org.jsoup.Jsoup
;
...
...
@@ -15,7 +14,12 @@ import com.zhiwei.httpclient.HttpClient;
public
class
Miaopai
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
Miaopai
.
class
);
/**
*
* @Description 秒拍依据链接获取数据
* @param url
* @return
*/
public
static
Map
<
String
,
Object
>
getMiaopaiDataByURL
(
String
url
)
{
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getMiaoPaiByURlHeaderMap
(
null
);
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
headerMap
);
...
...
src/main/java/com/zhiwei/parse/QQ.java
→
src/main/java/com/zhiwei/parse/QQ
KB
.java
View file @
7bf0e1d7
...
...
@@ -7,15 +7,25 @@ import java.util.Map;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.parse.analysis.QQAccountAnalysis
;
import
com.zhiwei.parse.analysis.QQKBAccountAnalysis
;
import
com.zhiwei.parse.analysis.QQKBCommentAnalysis
;
import
com.zhiwei.zhiweiTools.tools.ZhiWeiTools
;
public
class
QQ
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
QQ
.
class
);
private
static
QQAccountAnalysis
qqAccountAnalysis
=
new
QQAccountAnalysis
();
public
class
QQKB
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
QQKB
.
class
);
private
static
QQKBAccountAnalysis
qqAccountAnalysis
=
new
QQKBAccountAnalysis
();
private
static
QQKBCommentAnalysis
qqkbCommentAnalysis
=
new
QQKBCommentAnalysis
();
/**
*
* @Description 采集天天快报历史文章
* @param child
* @param cookie
* @return
*/
public
static
List
<
Map
<
String
,
Object
>>
getQQAccountData
(
String
child
,
String
cookie
)
{
String
url
=
"http://r.cnews.qq.com/getSubNewsIndex"
;
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getQQAccountHeaderMap
(
cookie
);
...
...
@@ -35,7 +45,7 @@ public class QQ {
try
{
ids
=
ids
.
substring
(
0
,
ids
.
length
()-
1
);
System
.
out
.
println
(
ids
);
ZhiWeiTools
.
sleep
(
8
000
);
ZhiWeiTools
.
sleep
(
7
000
);
paramMap
.
clear
();
paramMap
=
HeadGet
.
getQQAccountOtherParamMap
(
ids
);
result
=
HttpClient
.
executeHttpRequestPost
(
url
,
headerMap
,
paramMap
);
...
...
@@ -68,5 +78,87 @@ public class QQ {
}
/**
 * Fetches the comments of a Tiantian Kuaibao article.
 *
 * <p>Repeatedly POSTs to {@code http://r.cnews.qq.com/getQQNewsComment}. Each
 * response is parsed by {@code qqkbCommentAnalysis.getCommentData}; the loop
 * ends at the first response that parses to {@code null} or to an empty list.
 * The parameters for the next request are derived from the current response
 * through {@code qqkbCommentAnalysis.getParamMap}.
 *
 * @param cookie     cookie value used to build the request headers
 * @param comment_id comment identifier of the article
 * @param article_id article identifier
 * @return all comment records parsed so far, never {@code null}; if an
 *         exception is thrown, the records gathered before the failure are
 *         returned
 */
public static List<Map<String, Object>> getQQKBCommentData(String cookie, String comment_id, String article_id) {
    List<Map<String, Object>> dataList = new ArrayList<Map<String, Object>>();
    Map<String, String> headerMap = HeadGet.getQQKBCommentHeaderMap(cookie);
    try {
        Map<String, Object> paramMap = HeadGet.getQQKBCommentParamMap(comment_id, article_id);
        int i = 1;
        while (true) {
            String result = HttpClient.executeHttpRequestPost(
                    "http://r.cnews.qq.com/getQQNewsComment", headerMap, paramMap);
            System.out.println(result);
            // Kept from the original: the used parameters are cleared here and
            // a fresh map is obtained from getParamMap further down.
            paramMap.clear();
            List<Map<String, Object>> lists =
                    qqkbCommentAnalysis.getCommentData(result, cookie, comment_id, article_id);
            // A null or empty batch means there are no more comments to read.
            if (lists == null || lists.isEmpty()) {
                break;
            }
            dataList.addAll(lists);
            paramMap = qqkbCommentAnalysis.getParamMap(result, i, comment_id, article_id);
            i++;
            ZhiWeiTools.sleep(5000);
        }
    } catch (Exception e) {
        // Pass the exception itself: the message has no "{}" placeholder, so
        // a plain argument would be dropped and the cause would be lost.
        logger.error("解析天天快报评论出错", e);
    }
    return dataList;
}
/**
 * Reads the comment count of a Tiantian Kuaibao article.
 *
 * <p>Sends one POST to {@code http://r.cnews.qq.com/getQQNewsComment} and
 * returns the {@code count} field of the {@code comments} object in the JSON
 * response.
 *
 * @param cookie     cookie value used to build the request headers
 * @param comment_id comment identifier of the article
 * @param article_id article identifier
 * @return the reported comment count, or {@code 0} when the request or the
 *         parsing throws (including a missing {@code comments} or
 *         {@code count} field)
 */
public static int getCommentCount(String cookie, String comment_id, String article_id) {
    Map<String, String> headerMap = HeadGet.getQQKBCommentHeaderMap(cookie);
    try {
        Map<String, Object> paramMap = HeadGet.getQQKBCommentParamMap(comment_id, article_id);
        String result = HttpClient.executeHttpRequestPost(
                "http://r.cnews.qq.com/getQQNewsComment", headerMap, paramMap);
        JSONObject json = JSONObject.parseObject(result);
        JSONObject comments = json.getJSONObject("comments");
        // Unboxing a missing count throws and is handled by the catch below.
        return comments.getInteger("count");
    } catch (Exception e) {
        // Pass the exception itself: the message has no "{}" placeholder, so
        // a plain argument would be dropped and the cause would be lost.
        logger.error("解析天天快报评论出错", e);
        return 0;
    }
}
// public static List<Map<String,Object>> getQQKBByWordData(String word,String cookie) {
// List<Map<String,Object>> dataList = new ArrayList<Map<String,Object>>();
// Map<String,String> headerMap = HeadGet.getQQKBByWordHeaderMap(cookie);
// Map<String,Object> paramMap = HeadGet.getQQKBByWordParamMap(word,null,null,0);
// int i = 1;
// try {
// String result = HttpClient.executeHttpRequestPost("http://r.cnews.qq.com/search", headerMap, paramMap);
// System.out.println(result);
// JSONObject json = JSONObject.parseObject(result);
// String sid = json.getString("sid");
// String queryid = json.getString("queryid");
// System.out.println(sid + "================" + queryid);
// while(true) {
// ZhiWeiTools.sleep(5000);
// i++;
// paramMap.clear();
// paramMap = HeadGet.getQQKBByWordParamMap(word, sid, queryid, i);
// result = HttpClient.executeHttpRequestPost("http://r.cnews.qq.com/searchMore", headerMap, paramMap);
// System.out.println(result);
// }
// } catch (Exception e) {
// logger.error("天天快报关键词采集出错",e.getMessage());
// return dataList;
// }
// }
}
src/main/java/com/zhiwei/parse/Souhu.java
View file @
7bf0e1d7
package
com
.
zhiwei
.
parse
;
import
java.io.IOException
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.List
;
...
...
@@ -23,6 +22,12 @@ public class Souhu {
private
static
SouhuAccountAnalysis
souhuAccountAnalysis
=
new
SouhuAccountAnalysis
();
private
static
SouhuCommentAnalysis
souhuCommentAnalysis
=
new
SouhuCommentAnalysis
();
/**
*
* @Description 获取链接评论数
* @param url
* @return
*/
public
static
int
getSouhuCommentCount
(
String
url
)
{
String
newurl
=
souhuCommentAnalysis
.
getSouhuURL
(
url
);
int
i
;
...
...
@@ -70,7 +75,9 @@ public class Souhu {
if
(
jsonArray
.
size
()
<
1
)
{
break
;
}
dataList
.
addAll
(
dataList1
);
if
(
startTime
==
null
)
{
dataList
.
addAll
(
dataList1
);
}
//判断时间
if
(
startTime
!=
null
)
{
for
(
Map
<
String
,
Object
>
map
:
dataList1
)
{
...
...
@@ -82,6 +89,7 @@ public class Souhu {
dataList
.
add
(
map
);
}
}
logger
.
info
(
"=============获取到的数据数目{}"
,
dataList
.
size
());
i
++;
ZhiWeiTools
.
sleep
(
3000
);
}
catch
(
Exception
e
)
{
...
...
src/main/java/com/zhiwei/parse/Yidianzixun.java
View file @
7bf0e1d7
package
com
.
zhiwei
.
parse
;
import
java.io.IOException
;
import
java.net.URLEncoder
;
import
java.util.ArrayList
;
import
java.util.List
;
...
...
src/main/java/com/zhiwei/parse/analysis/QQAccountAnalysis.java
→
src/main/java/com/zhiwei/parse/analysis/QQ
KB
AccountAnalysis.java
View file @
7bf0e1d7
...
...
@@ -11,8 +11,8 @@ import org.slf4j.LoggerFactory;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
public
class
QQAccountAnalysis
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
QQAccountAnalysis
.
class
);
public
class
QQ
KB
AccountAnalysis
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
QQ
KB
AccountAnalysis
.
class
);
/**
*
...
...
src/main/java/com/zhiwei/parse/analysis/SouhuCommentAnalysis.java
View file @
7bf0e1d7
...
...
@@ -33,6 +33,7 @@ public class SouhuCommentAnalysis {
source_id
=
s
.
split
(
"_"
)[
0
];
}
catch
(
Exception
e
)
{
logger
.
error
(
"链接解析错误"
,
e
.
getMessage
());
return
null
;
}
String
newurl
=
"http://apiv2.sohu.com/api/comment/list?page_size=10&topic_id="
+
topic_id
+
"&source_id=mp_"
+
source_id
;
return
newurl
;
...
...
src/test/java/com/zhiwei/crawler/QQAccountExample.java
View file @
7bf0e1d7
...
...
@@ -7,7 +7,7 @@ import java.util.Map;
import
org.junit.Test
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.QQ
;
import
com.zhiwei.parse.QQ
KB
;
public
class
QQAccountExample
{
...
...
@@ -16,7 +16,7 @@ public class QQAccountExample {
String
child
=
"5002744"
;
String
cookie
=
"phone_id=;%20phone_token=;%20luin=o0497332654;%20lskey=00030000db3c2ec2393ea968f523f50144db7ab5aec60e79d2509c271bdacdf784e88ac1f58b7493c23ceb15;%20uin=o0497332654;%20skey=M67MOgvFQJ;%20sigA2=D3046D543D9BA50CFE749D63B1F05AF28A281C29B4F1353374AB7A19D9527497A67E507C6829AE44F67C1EA032C2A3728301D2ABC864DA32BCA7D4C7A61609F9F3BC9AE0A7243003;%20openid=oijc7uLZNVp85r-MpFBYxwxzdEkg;%20appid=wxe90c9765ad00e2cd;%20access_token=5_IWbsKfygpJ0lnbUnnFnfwmUT_jxJCnY5yVwhmL3e2K5FOTRth6jz8SKVHGseA3v9s8UIZxw00LpF1uC9l7W5WL2trdb69LlCvE1s7twReOw;%20refresh_token=5_IWbsKfygpJ0lnbUnnFnfwgDT4pA9HEbY-wuqnqIHWf9AzdmIueZFuzHYfnZPuSNEc0ZjDuHXrtSrRBMMD-7kgj06iF0NdAOi-KRj6-mrmlA;%20unionid=onCs1uNNpjMXeYIHAhacGypamEBk;%20logintype=0"
;
List
<
Map
<
String
,
Object
>>
dataList
=
QQ
.
getQQAccountData
(
child
,
cookie
);
List
<
Map
<
String
,
Object
>>
dataList
=
QQ
KB
.
getQQAccountData
(
child
,
cookie
);
System
.
out
.
println
(
dataList
.
size
());
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
List
<
String
>
headList
=
new
ArrayList
<
String
>();
...
...
src/test/java/com/zhiwei/crawler/SouhuAccountExample.java
View file @
7bf0e1d7
package
com
.
zhiwei
.
crawler
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Souhu
;
...
...
@@ -12,13 +14,19 @@ public class SouhuAccountExample {
@Test
public
void
souhuAccountTest
()
{
List
<
Map
<
String
,
Object
>>
lists
=
Souhu
.
getSouHuAccountData
(
"
MjQ4MDQ5Nzg2MEBzaW5hLnNvaHUuY29t"
,
null
,
tru
e
);
List
<
Map
<
String
,
Object
>>
lists
=
Souhu
.
getSouHuAccountData
(
"
c29odXptdGh5YXRieUBzb2h1LmNvbQ=="
,
"2017-01-01 00:00:00"
,
fals
e
);
System
.
out
.
println
(
lists
.
size
());
int
i
=
0
;
for
(
Map
<
String
,
Object
>
map
:
lists
)
{
System
.
out
.
println
(
map
.
toString
());
System
.
out
.
println
(
i
++);
}
List
<
String
>
headList
=
new
ArrayList
<
String
>();
headList
.
add
(
"title"
);
headList
.
add
(
"time"
);
headList
.
add
(
"content"
);
headList
.
add
(
"url"
);
headList
.
add
(
"comment"
);
headList
.
add
(
"tags"
);
headList
.
add
(
"newsid"
);
headList
.
add
(
"newsPv"
);
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
poi
.
exportExcel
(
"D:\\crawlerdata\\搜狐号历史文章.xlsx"
,
"sasd"
,
headList
,
lists
);
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment