Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
M
media_data_crawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
media_data_crawler
Commits
ea9efe8f
Commit
ea9efe8f
authored
Jul 25, 2019
by
yangchen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
修改 知乎回答下评论采集
parent
45483734
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
33 additions
and
44 deletions
+33
-44
.classpath
+1
-1
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduInforCrawlerParse.java
+9
-10
src/main/java/com/zhiwei/media_data_crawler/crawler/JianshuCrawler.java
+0
-5
src/main/java/com/zhiwei/media_data_crawler/crawler/SoNewsCrawlerParse.java
+3
-6
src/main/java/com/zhiwei/media_data_crawler/crawler/ZhihuAnswerCommentParse.java
+20
-22
No files found.
.classpath
View file @
ea9efe8f
...
...
@@ -10,6 +10,7 @@
<attributes>
<attribute
name=
"optional"
value=
"true"
/>
<attribute
name=
"maven.pomderived"
value=
"true"
/>
<attribute
name=
"test"
value=
"true"
/>
</attributes>
</classpathentry>
<classpathentry
kind=
"con"
path=
"org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.8"
>
...
...
@@ -20,7 +21,6 @@
<classpathentry
kind=
"con"
path=
"org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER"
>
<attributes>
<attribute
name=
"maven.pomderived"
value=
"true"
/>
<attribute
name=
"org.eclipse.jst.component.nondependency"
value=
""
/>
</attributes>
</classpathentry>
<classpathentry
kind=
"con"
path=
"org.eclipse.jdt.junit.JUNIT_CONTAINER/4"
/>
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduInforCrawlerParse.java
View file @
ea9efe8f
...
...
@@ -221,16 +221,12 @@ public class BaiduInforCrawlerParse {
return
resultMap
;
}
private
static
String
downloadHtml
(
String
url
,
int
page
)
throws
Exception
{
// 获取通用请求头
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
private
static
String
downloadHtml
(
String
url
,
int
page
)
{
// 获取链接地址
url
=
url
+
"&pn="
+
page
*
10
;
headerMap
.
put
(
"Host"
,
"news.baidu.com"
);
headerMap
.
put
(
"Referer"
,
url
);
// 下载数据页面
for
(
int
i
=
1
;
i
<=
3
;
i
++)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
ProxyHolder
.
NAT_HEAVY_PROXY
)){
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
ProxyHolder
.
NAT_HEAVY_PROXY
)){
return
response
.
body
().
string
();
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取数据时出现问题,问题为:{}"
,
e
.
fillInStackTrace
());
...
...
@@ -259,10 +255,6 @@ public class BaiduInforCrawlerParse {
int
page
=
0
;
boolean
more
=
true
;
while
(
more
)
{
// 最大页数为20
if
(
page
>
30
)
{
more
=
false
;
}
String
htmlBody
=
downloadHtml
(
url
,
page
);
if
(
htmlBody
!=
null
)
{
Map
<
String
,
Object
>
dataMap
=
analysisData
(
htmlBody
,
word
);
...
...
@@ -273,6 +265,10 @@ public class BaiduInforCrawlerParse {
more
=
false
;
}
page
++;
// 最大页数为20
if
(
page
>
10
)
{
more
=
false
;
}
}
return
list
;
}
...
...
@@ -306,9 +302,12 @@ public class BaiduInforCrawlerParse {
//https://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd=%E6%B5%99%E6%B1%9F&medium=1&rn=50&gpc=stf%3D1546272000%2C1548950400%7Cstftype%3D2&tngroupname=organic_news&rsv_dl=news_b_pn&pn=0
// public static void main(String[] args) throws Exception {
// String url = "http://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=1&wd=%E5%A5%94%E9%A9%B0+%E6%BC%8F%E6%B1%BD%E6%B2%B9&medium=0&rn=50&gpc=stf%3D0%2C1496246399%7Cstftype%3D2&tngroupname=organic_news&rsv_dl=news_l_more&x_bfe_rqs=03E80&x_bfe_tjscore=0.332314&scs=2546086922&sortBy=0&pn=0";
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
// List<NewsData> ndList = getBaiduInforData("马云","2019-07-04 23:59:59");
// System.out.println(ndList.size());
// String result = downloadHtml(url,0);
// System.out.println(result);
// }
}
src/main/java/com/zhiwei/media_data_crawler/crawler/JianshuCrawler.java
View file @
ea9efe8f
package
com
.
zhiwei
.
media_data_crawler
.
crawler
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
...
...
@@ -12,9 +11,7 @@ import org.apache.logging.log4j.Logger;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.media_data_crawler.entity.JianshuUser
;
...
...
@@ -78,7 +75,6 @@ public class JianshuCrawler {
}
break
;
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
logger
.
error
(
"简书用户采集出错{}"
,
e
);
}
...
...
@@ -88,6 +84,5 @@ public class JianshuCrawler {
}
public
static
void
main
(
String
[]
args
)
{}
}
src/main/java/com/zhiwei/media_data_crawler/crawler/SoNewsCrawlerParse.java
View file @
ea9efe8f
...
...
@@ -214,7 +214,6 @@ public class SoNewsCrawlerParse {
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
logger
.
error
(
"360新闻数据解析时出现问题,问题为:{}"
,
e
.
fillInStackTrace
());
continue
;
}
}
resultMap
.
put
(
"data"
,
list
);
...
...
@@ -236,8 +235,8 @@ public class SoNewsCrawlerParse {
* @return Map<String,Object> 返回类型
*/
private
static
Map
<
String
,
Object
>
analysisDataByTitle
(
String
htmlBody
,
Proxy
proxy
,
String
word
)
throws
Exception
{
Map
<
String
,
Object
>
resultMap
=
new
HashMap
<
String
,
Object
>();
List
<
NewsData
>
list
=
new
ArrayList
<
NewsData
>();
Map
<
String
,
Object
>
resultMap
=
new
HashMap
<>();
List
<
NewsData
>
list
=
new
ArrayList
<>();
boolean
more
=
true
;
/** 解析页面 */
...
...
@@ -269,9 +268,7 @@ public class SoNewsCrawlerParse {
NewsData
newsData
=
new
NewsData
(
link
,
title
,
source
,
time
,
content
,
pt
,
word
);
list
.
add
(
newsData
);
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
logger
.
error
(
"360新闻数据解析时出现问题,问题为:{}"
,
e
.
fillInStackTrace
());
continue
;
logger
.
error
(
"360新闻数据解析时出现问题,问题为:{}"
,
e
);
}
}
resultMap
.
put
(
"data"
,
list
);
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/ZhihuAnswerCommentParse.java
View file @
ea9efe8f
...
...
@@ -15,7 +15,6 @@ import com.zhiwei.crawler.core.HttpBoot;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.media_data_crawler.entity.ZhihuAnswerComment
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Response
;
...
...
@@ -45,10 +44,9 @@ public class ZhihuAnswerCommentParse {
int
count
=
-
1
;
for
(
int
i
=
1
;
i
<
3
;
i
++)
{
count
=
zacList
.
size
();
String
nurl
=
"https://www.zhihu.com/api/v4/answers/"
+
id
+
"/root_comments?"
+
"include=data%5B*%5D.author%2Ccollapsed%2Creply_to_author%2Cdisliked%2"
+
"Ccontent%2Cvoting%2Cvote_count%2Cis_parent_author%2Cis_author&order=norma"
+
"l&limit=50&offset="
+
pages
+
"&status=open"
;
//https://www.zhihu.com/api/v4/answers/708507274/root_comments?order=normal&limit=20&offset=20&status=open
String
nurl
=
"https://www.zhihu.com/api/v4/answers/"
+
id
+
"/root_comments?order=norma"
+
"l&limit=20&offset="
+
pages
+
"&status=open"
;
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
nurl
),
proxy
)){
String
result
=
response
.
body
().
string
();
zacList
.
addAll
(
getData
(
result
));
...
...
@@ -92,23 +90,23 @@ public class ZhihuAnswerCommentParse {
zac
.
setTime
(
new
Date
(
createdTime
*
1000L
));
zac
.
setChildCommentCount
(
childCommentCount
);
dataList
.
add
(
zac
);
if
(
childCommentCount
>
0
)
{
for
(
int
g
=
0
;
g
<
childCommentCount
;
g
+=
20
)
{
for
(
int
n
=
1
;
n
<
5
;
n
++)
{
//避免太快,ip被封,导致数据无法获取
ZhiWeiTools
.
sleep
(
200
);
String
url2
=
"https://www.zhihu.com/api/v4/comments/"
+
id
+
"/child_comments?include=%24%5B%2A%5D."
+
"author%2Creply_to_author%2Ccontent%2Cvote_count&limit="
+
"50&offset="
+
g
+
"&include=%24%5B*%5D.author%2Creply_to_author%2Ccontent%2Cvote_count"
;
//获取回答中的回复列表
List
<
ZhihuAnswerComment
>
replayList
=
getReplayList
(
url2
,
id
);
if
(!
replayList
.
isEmpty
())
{
dataList
.
addAll
(
replayList
);
break
;
}
}
}
}
//
if (childCommentCount > 0) {
//
for (int g = 0; g < childCommentCount; g += 20) {
//
for(int n = 1;n < 5;n++) {
//
//避免太快,ip被封,导致数据无法获取
//
ZhiWeiTools.sleep(200);
//
String url2 = "https://www.zhihu.com/api/v4/comments/" + id + "/child_comments?include=%24%5B%2A%5D." +
//
"author%2Creply_to_author%2Ccontent%2Cvote_count&limit=" +
//
"50&offset=" + g + "&include=%24%5B*%5D.author%2Creply_to_author%2Ccontent%2Cvote_count";
//
//获取回答中的回复列表
//
List<ZhihuAnswerComment> replayList = getReplayList(url2,id);
//
if(!replayList.isEmpty()) {
//
dataList.addAll(replayList);
//
break;
//
}
//
}
//
}
//
}
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment