Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
A
articlenewscrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
chenweiyang
articlenewscrawler
Commits
1116d3c5
Commit
1116d3c5
authored
Dec 18, 2018
by
yangchen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
易车网 评论采集添加
parent
7fb5554a
Show whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
263 additions
and
87 deletions
+263
-87
pom.xml
+8
-7
src/main/java/com/zhiwei/parse/Douban.java
+6
-7
src/main/java/com/zhiwei/parse/Souhu.java
+10
-8
src/main/java/com/zhiwei/parse/Wangyi.java
+4
-0
src/main/java/com/zhiwei/parse/Yiche.java
+106
-0
src/main/java/com/zhiwei/parse/analysis/DoubanCommentAnalysis.java
+1
-1
src/main/java/com/zhiwei/parse/analysis/SouhuCommentAnalysis.java
+17
-17
src/test/java/com/zhiwei/Comment/DoubanCommentTest.java
+29
-29
src/test/java/com/zhiwei/Comment/YicheCommentCountTest.java
+45
-0
src/test/java/com/zhiwei/crawler/SinaCommentListTest.java
+0
-11
src/test/java/com/zhiwei/crawler/SouhuCommentCountExample.java
+1
-1
src/test/java/com/zhiwei/crawler/WangyiCommentCountExample.java
+33
-3
src/test/java/com/zhiwei/keyword/DoubanTopicTest.java
+3
-3
No files found.
pom.xml
View file @
1116d3c5
...
...
@@ -3,7 +3,7 @@
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiwei
</groupId>
<artifactId>
articlenewscrawler
</artifactId>
<version>
0.0.
3
-SNAPSHOT
</version>
<version>
0.0.
4
-SNAPSHOT
</version>
<name>
articlenewscrawler
</name>
<description>
采集凤凰,一点资讯,搜狐历史文章和文章评论
</description>
...
...
@@ -14,11 +14,6 @@
<version>
6.14.3
</version>
</dependency>
<dependency>
<groupId>
org.jsoup
</groupId>
<artifactId>
jsoup
</artifactId>
<version>
1.8.3
</version>
</dependency>
<dependency>
<groupId>
com.alibaba
</groupId>
<artifactId>
fastjson
</artifactId>
<version>
1.2.29
</version>
...
...
@@ -36,7 +31,13 @@
<dependency>
<groupId>
com.zhiwei.tools
</groupId>
<artifactId>
zhiwei-tools
</artifactId>
<version>
0.0.9-SNAPSHOT
</version>
<version>
0.1.1-SNAPSHOT
</version>
</dependency>
<dependency>
<groupId>
com.zhiwei.crawler
</groupId>
<artifactId>
crawler-core
</artifactId>
<version>
0.1.1-RELEASE
</version>
<scope>
provided
</scope>
</dependency>
</dependencies>
...
...
src/main/java/com/zhiwei/parse/Douban.java
View file @
1116d3c5
package
com
.
zhiwei
.
parse
;
import
java.io.IOException
;
import
static
java
.
util
.
Objects
.
nonNull
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
org.apache.poi.xwpf.usermodel.BodyElementType
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
...
...
@@ -16,7 +15,6 @@ import org.jsoup.select.Elements;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
static
java
.
util
.
Objects
.
nonNull
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.parse.analysis.DoubanCommentAnalysis
;
...
...
@@ -66,7 +64,7 @@ public class Douban {
if
(
nonNull
(
elements
))
{
for
(
Element
element
:
elements
)
{
link
=
element
.
select
(
"td.td-subject"
).
select
(
"a"
).
attr
(
"href"
);
title
=
element
.
select
(
"td.td-subject"
).
select
(
"a"
).
text
(
);
title
=
element
.
select
(
"td.td-subject"
).
select
(
"a"
).
attr
(
"title"
);
time
=
element
.
select
(
"td.td-time"
).
attr
(
"title"
);
replyCount
=
Integer
.
valueOf
(
element
.
select
(
"td.td-reply"
).
select
(
"span"
).
text
().
split
(
"回应"
)[
0
].
trim
());
group
=
element
.
select
(
"td"
).
get
(
3
).
text
();
...
...
@@ -79,9 +77,10 @@ public class Douban {
map
.
put
(
"time"
,
time
);
map
.
put
(
"reply_count"
,
replyCount
);
bodyList
.
add
(
map
);
// System.out.println(map.toString());
}
}
if
(
cou
==
bodyList
.
size
()
){
if
(
bodyList
.
size
()
-
cou
<
30
){
more
=
false
;
}
logger
.
info
(
"采集到 第 {} 页 ,一共采集到 {} 条 是否有下一页 {}"
,
page
,
bodyList
.
size
(),
more
);
...
...
@@ -95,7 +94,7 @@ public class Douban {
logger
.
error
(
"豆瓣 topic 采集出错 {}"
,
e
);
}
}
return
Collections
.
emptyList
()
;
return
bodyList
;
}
/**
...
...
src/main/java/com/zhiwei/parse/Souhu.java
View file @
1116d3c5
...
...
@@ -19,6 +19,7 @@ import com.zhiwei.parse.analysis.SouhuAccountAnalysis;
import
com.zhiwei.parse.analysis.SouhuCommentAnalysis
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
static
java
.
util
.
Objects
.
nonNull
;
public
class
Souhu
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
Souhu
.
class
);
...
...
@@ -33,15 +34,17 @@ public class Souhu {
* @return
*/
public
static
int
getSouhuCommentCount
(
String
url
,
Proxy
proxy
)
{
String
newurl
=
souhuCommentAnalysis
.
getSouhuURL
(
url
);
int
i
;
try
{
String
newurl
=
souhuCommentAnalysis
.
getSouhuURL
(
url
,
proxy
);
if
(
nonNull
(
newurl
))
{
int
i
;
i
=
souhuCommentAnalysis
.
getSouhuCommentCount
(
newurl
,
proxy
);
return
i
;
}
}
catch
(
Exception
e
)
{
logger
.
error
(
"搜狐获取评论数出错了"
,
e
.
getMessage
());
return
0
;
logger
.
error
(
"搜狐获取评论数出错了 {}"
,
e
);
}
return
-
1
;
}
...
...
@@ -137,11 +140,11 @@ public class Souhu {
*/
public
static
List
<
Map
<
String
,
Object
>>
getSouhuCommentData
(
String
url
,
Proxy
proxy
)
{
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getSouhuCommentHeaderMap
(
null
);
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<
Map
<
String
,
Object
>
>();
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
int
j
=
1
;
try
{
while
(
true
)
{
String
newurl
=
souhuCommentAnalysis
.
getSouhuURL
(
url
)
+
"&page_no="
+
j
;
String
newurl
=
souhuCommentAnalysis
.
getSouhuURL
(
url
,
proxy
)
+
"&page_no="
+
j
;
String
result
=
HttpClient
.
executeHttpRequestGet
(
newurl
,
proxy
,
headerMap
);
System
.
out
.
println
(
newurl
);
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
...
...
@@ -159,8 +162,7 @@ public class Souhu {
}
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
logger
.
error
(
"获取搜狐文章评论出错"
,
e
.
getMessage
());
logger
.
error
(
"获取搜狐文章评论出错 {}"
,
e
);
}
return
dataList
;
...
...
src/main/java/com/zhiwei/parse/Wangyi.java
View file @
1116d3c5
...
...
@@ -62,11 +62,15 @@ public class Wangyi {
* @return
*/
public static int getWangyiCommentCount(String id, Proxy proxy) {
    // Thread endpoint of the NetEase comment API for the given article id.
    final String threadUrl =
            "http://comment.dy.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/" + id;
    try {
        final String responseText = HttpClient.executeHttpRequestGet(
                threadUrl, proxy, HeadGet.getWangyiCommentHeaderMap(null));
        // "tcount" is read from the top level of the response; a missing value
        // fails on unboxing and is reported as -1 like every other error.
        return JSONObject.parseObject(responseText).getInteger("tcount");
    } catch (Exception ignored) {
        // Any failure (request, parsing, missing field) yields -1.
        return -1;
    }
}
public
static
List
<
Map
<
String
,
Object
>>
getHistoryData
(
String
url
,
Proxy
proxy
,
String
endTime
)
{
...
...
src/main/java/com/zhiwei/parse/Yiche.java
0 → 100644
View file @
1116d3c5
package
com
.
zhiwei
.
parse
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
static
java
.
util
.
Objects
.
nonNull
;
import
okhttp3.Response
;
/**
 * Crawler helpers for Yiche (bitauto.com) news comments.
 *
 * <p>Both public entry points first download the article page to build the
 * comment-API URL (see {@link #getnewsId(String, Proxy)}) and then query that
 * API through the shared {@link HttpBoot} instance.
 */
public class Yiche {
    private static final Logger logger = LoggerFactory.getLogger(Yiche.class);
    private static final HttpBoot httpBoot = new HttpBoot();

    /** Number of comments requested per API page. */
    private static final int PAGE_SIZE = 50;

    /** Pause between two consecutive page requests, in milliseconds. */
    private static final int PAGE_INTERVAL_MILLIS = 2000;

    /**
     * Fetches the total number of comments of a Yiche article.
     *
     * @param url   article page URL
     * @param proxy proxy passed to every HTTP call
     * @return the {@code result.total} value of the comment API, or {@code -1}
     *         when the API URL cannot be derived or the request/parsing fails
     */
    public static int getYicheCount(String url, Proxy proxy) {
        String apiUrl = getnewsId(url, proxy);
        if (nonNull(apiUrl)) {
            try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(apiUrl), proxy)) {
                JSONObject json = JSONObject.parseObject(response.body().string());
                return json.getJSONObject("result").getInteger("total");
            } catch (Exception e) {
                // The URL fills the placeholder; the trailing throwable is logged with its stack trace.
                logger.error("error {} ", apiUrl, e);
            }
        }
        return -1;
    }

    /**
     * Collects the comments of a Yiche article, page by page.
     *
     * <p>Each comment becomes a map with the keys {@code source}, {@code time},
     * {@code content}, {@code like} and {@code id}. Paging stops when the pages
     * requested so far cover {@code result.total}, when a page comes back empty,
     * or on the first error; comments gathered before an error are still returned.
     *
     * @param url   article page URL
     * @param proxy proxy passed to every HTTP call
     * @return the collected comments; an empty list when the API URL cannot be
     *         derived or nothing could be fetched
     */
    public static List<Map<String, Object>> getYicheComment(String url, Proxy proxy) {
        String apiUrl = getnewsId(url, proxy);
        if (!nonNull(apiUrl)) {
            return Collections.emptyList();
        }

        List<Map<String, Object>> bodyList = new ArrayList<>();
        int page = 1;
        boolean hasMore = true;
        while (hasMore) {
            String pageUrl = apiUrl + "&pageSize=" + PAGE_SIZE + "&isHot=false&pageIndex=" + page;
            try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(pageUrl), proxy)) {
                JSONObject result = JSONObject.parseObject(response.body().string()).getJSONObject("result");
                JSONArray comments = result.getJSONArray("list");
                for (int i = 0; i < comments.size(); i++) {
                    bodyList.add(toCommentMap(comments.getJSONObject(i)));
                }
                int total = result.getInteger("total");
                logger.info(" 一共采集 了 {} 条 采集到 {} 页 一共有 {} 条", bodyList.size(), page, total);
                // Continue only while unseen comments remain; the empty-page check
                // keeps a misreported total from causing endless requests.
                hasMore = !comments.isEmpty() && page * PAGE_SIZE < total;
            } catch (Exception e) {
                logger.error("error {} ", pageUrl, e);
                hasMore = false;
            }
            if (hasMore) {
                // Throttle only when another request actually follows.
                ZhiWeiTools.sleep(PAGE_INTERVAL_MILLIS);
                page++;
            }
        }
        // The original returned Collections.emptyList() here and lost every comment.
        return bodyList;
    }

    /** Copies the fields of one API comment object into the export map. */
    private static Map<String, Object> toCommentMap(JSONObject data) {
        Map<String, Object> map = new HashMap<>();
        map.put("source", data.getString("showName"));
        map.put("time", data.getString("createTime"));
        map.put("content", data.getString("content"));
        map.put("like", data.get("likeCount"));
        map.put("id", data.getString("id"));
        return map;
    }

    /**
     * Downloads the article page and builds the comment-API URL from it.
     *
     * <p>The product id is the text between {@code "productId: "} and the next
     * comma; the object id is the text between {@code "newsId = '"} and the next
     * {@code "',"}. A page without these markers makes the split fail, which is
     * logged and reported as {@code null}.
     *
     * @param url   article page URL
     * @param proxy proxy passed to the HTTP call
     * @return the comment-API URL without paging parameters, or {@code null} on any failure
     */
    private static String getnewsId(String url, Proxy proxy) {
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)) {
            String result = response.body().string();
            String productId = result.split("productId: ")[1].split(",")[0];
            String objectId = result.split("newsId = '")[1].split("',")[0];
            return "http://newsapi.bitauto.com/comment/comment/getdata?productId=" + productId
                    + "&objectId=" + objectId;
        } catch (Exception e) {
            logger.error("error {} ", url, e);
        }
        return null;
    }
}
src/main/java/com/zhiwei/parse/analysis/DoubanCommentAnalysis.java
View file @
1116d3c5
...
...
@@ -42,7 +42,7 @@ public class DoubanCommentAnalysis {
map
.
put
(
"content"
,
content
);
map
.
put
(
"id"
,
id
);
map
.
put
(
"like"
,
getLikeNum
(
result
,
"c"
+
id
));
System
.
out
.
println
(
map
.
toString
());
//
System.out.println(map.toString());
bodyList
.
add
(
map
);
}
}
...
...
src/main/java/com/zhiwei/parse/analysis/SouhuCommentAnalysis.java
View file @
1116d3c5
...
...
@@ -9,12 +9,17 @@ import org.slf4j.Logger;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HttpClient
;
import
okhttp3.Response
;
public
class
SouhuCommentAnalysis
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
SouhuCommentAnalysis
.
class
);
private
HttpBoot
httpBoot
=
new
HttpBoot
();
/**
*
...
...
@@ -22,35 +27,30 @@ public class SouhuCommentAnalysis {
* @param url
* @return
*/
public
String
getSouhuURL
(
String
url
)
{
String
topic_id
=
""
;
String
source_id
=
""
;
try
{
if
(
url
.
contains
(
"?"
)){
url
=
url
.
split
(
"\\?"
)[
0
];
}
String
s
=
url
.
split
(
"a/"
)[
1
];
topic_id
=
s
.
split
(
"_"
)[
1
];
source_id
=
s
.
split
(
"_"
)[
0
];
public
String
getSouhuURL
(
String
url
,
Proxy
proxy
)
{
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
),
proxy
)){
String
result
=
response
.
body
().
string
();
String
source_id
=
result
.
split
(
"news_id: \""
)[
1
].
split
(
"\","
)[
0
];
String
topic_id
=
result
.
split
(
"media_id: \""
)[
1
].
split
(
"\","
)[
0
];
return
"http://apiv2.sohu.com/api/comment/list?page_size=10&topic_id="
+
topic_id
+
"&source_id=mp_"
+
source_id
;
}
catch
(
Exception
e
)
{
logger
.
error
(
"链接解析错误"
,
e
.
getMessage
());
return
null
;
logger
.
error
(
"Exception {} "
,
e
);
}
String
newurl
=
"http://apiv2.sohu.com/api/comment/list?page_size=10&topic_id="
+
topic_id
+
"&source_id=mp_"
+
source_id
;
return
newurl
;
return
null
;
}
public
int
getSouhuCommentCount
(
String
url
,
Proxy
proxy
)
{
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getSouhuCommentHeaderMap
(
null
);
int
i
;
try
{
System
.
out
.
println
(
url
);
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
proxy
,
headerMap
);
JSONObject
json
=
JSONObject
.
parseObject
(
result
);
i
=
json
.
getJSONObject
(
"jsonObject"
).
getInteger
(
"
participation
_sum"
);
i
=
json
.
getJSONObject
(
"jsonObject"
).
getInteger
(
"
cmt
_sum"
);
return
i
;
}
catch
(
Exception
e
)
{
logger
.
error
(
"获取搜狐评论数信息出错
"
,
e
.
getMessage
()
);
return
0
;
logger
.
error
(
"获取搜狐评论数信息出错
{}"
,
e
);
return
-
1
;
}
}
...
...
src/test/java/com/zhiwei/Comment/DoubanCommentTest.java
View file @
1116d3c5
package
com
.
zhiwei
.
Comment
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.testng.annotations.Test
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Douban
;
public
class
DoubanCommentTest
{
@Test
public
void
f
()
{
String
url
=
"https://www.douban.com/group/topic/128726395/"
;
String
cookie
=
"bid=rymxzs5aojg; ps=y; ll=\"118173\"; __utmc=30149280; dbcl2=\"188038058:9IHyVcSobVc\"; ck=_RvF; push_noty_num=0; push_doumail_num=0; __utmv=30149280.18803; douban-fav-remind=1; __yadk_uid=qLflXyj3R14ro9e0cLoZOQlJoMGVN32j; douban-profile-remind=1; _vwo_uuid_v2=D85F60C118B0AF465035D9CC7BBFDA7A6|4bf255e1e3a2e9aeede3708192f5f1bc; __utmz=30149280.1543564973.3.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; ap_v=0,6.0; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1543823236%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DPk19bxnhsVWwfnrcnwT0PquON7D1JpLsbfSu9rRowalyi4pOeM3rMHKFaJo9jJF7%26wd%3D%26eqid%3De7f262650001ef98000000045c00e64f%22%5D; _pk_id.100001.8cb4=6828fef49f6bcf34.1543559455.5.1543823236.1543820463.; _pk_ses.100001.8cb4=*; __utma=30149280.824403997.1543559458.1543818802.1543823236.6; __utmt=1; __utmb=30149280.5.7.1543823236"
;
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
List
<
Map
<
String
,
Object
>>
bodyList
=
Douban
.
getDoubanComment
(
url
,
null
,
cookie
);
List
<
String
>
headList
=
new
ArrayList
<>();
headList
.
add
(
"source"
);
headList
.
add
(
"time"
);
headList
.
add
(
"like"
);
headList
.
add
(
"content"
);
headList
.
add
(
"id"
);
poi
.
exportExcel
(
"D://crawlerdata//自媒体/douban评论采集-2.xlsx"
,
"asd"
,
headList
,
bodyList
);
}
}
//
package com.zhiwei.Comment;
//
//
import java.util.ArrayList;
//
import java.util.List;
//
import java.util.Map;
//
//
import org.testng.annotations.Test;
//
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.Douban;
//
//
public class DoubanCommentTest {
//
@Test
//
public void f() {
//
String url = "https://www.douban.com/group/topic/128726395/";
//
String cookie = "bid=rymxzs5aojg; ps=y; ll=\"118173\"; __utmc=30149280; dbcl2=\"188038058:9IHyVcSobVc\"; ck=_RvF; push_noty_num=0; push_doumail_num=0; __utmv=30149280.18803; douban-fav-remind=1; __yadk_uid=qLflXyj3R14ro9e0cLoZOQlJoMGVN32j; douban-profile-remind=1; _vwo_uuid_v2=D85F60C118B0AF465035D9CC7BBFDA7A6|4bf255e1e3a2e9aeede3708192f5f1bc; __utmz=30149280.1543564973.3.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; ap_v=0,6.0; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1543823236%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DPk19bxnhsVWwfnrcnwT0PquON7D1JpLsbfSu9rRowalyi4pOeM3rMHKFaJo9jJF7%26wd%3D%26eqid%3De7f262650001ef98000000045c00e64f%22%5D; _pk_id.100001.8cb4=6828fef49f6bcf34.1543559455.5.1543823236.1543820463.; _pk_ses.100001.8cb4=*; __utma=30149280.824403997.1543559458.1543818802.1543823236.6; __utmt=1; __utmb=30149280.5.7.1543823236";
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
//
List<Map<String,Object>> bodyList = Douban.getDoubanComment(url, null, cookie);
//
List<String> headList = new ArrayList<>();
//
headList.add("source");
//
headList.add("time");
//
headList.add("like");
//
headList.add("content");
//
headList.add("id");
//
//
poi.exportExcel("D://crawlerdata//自媒体/douban评论采集-2.xlsx", "asd", headList, bodyList);
//
}
//
}
src/test/java/com/zhiwei/Comment/YicheCommentCountTest.java
0 → 100644
View file @
1116d3c5
//package com.zhiwei.Comment;
//
//import java.util.ArrayList;
//import java.util.HashMap;
//import java.util.List;
//import java.util.Map;
//
//import org.testng.annotations.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Fenghuang;
//import com.zhiwei.parse.Yiche;
//import com.zhiwei.tools.tools.ZhiWeiTools;
//
//public class YicheCommentCountTest {
// @SuppressWarnings("unchecked")
// @Test
// public void f() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",
// GroupType.PROVIDER);
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
// Map<String, Object> map = poi
// .importExcel("D://crawlerdata//自媒体/易车链接.xlsx", 0);
// List<Map<String, Object>> list = (List<Map<String, Object>>) map.get("body");
// List<Map<String, Object>> bodyList = new ArrayList<Map<String, Object>>();
// List<String> headList = (List<String>) map.get("head");
// for (Map<String, Object> map1 : list) {
// String url = map1.get("易车链接") + "";
// url = "http://news.bitauto.com/xinchexiaoxi/20181212/2309130374.html#comment";
// System.out.println(url);
//// int i = Yiche.getYicheCount(url, ProxyFactory.getNatProxy());
//// System.out.println(i);
//// map1.put("count", i);
// Yiche.getYicheComment(url, ProxyFactory.getNatProxy());
// ZhiWeiTools.sleep(500);
// }
// headList.add("count");
// poi.exportExcel("D://crawlerdata//自媒体/易车链接.xlsx", "评论采集", headList,
// list);
//
// }
//}
src/test/java/com/zhiwei/crawler/SinaCommentListTest.java
View file @
1116d3c5
...
...
@@ -20,17 +20,6 @@ import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
public
class
SinaCommentListTest
{
public
static
void
main
(
String
[]
args
)
{
List
<
String
>
urlList
=
new
ArrayList
<
String
>();
for
(
String
url
:
urlList
){
sinaCommentListTest
(
url
);
}
}
public
static
void
sinaCommentListTest
(
String
url
)
{
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
String
newsId
=
getCommentId
(
url
).
split
(
"====="
)[
1
];
...
...
src/test/java/com/zhiwei/crawler/SouhuCommentCountExample.java
View file @
1116d3c5
...
...
@@ -9,7 +9,7 @@ public class SouhuCommentCountExample {
@Test
public
void
souhuCommentCountTest
()
{
String
url
=
"http
s://www.sohu.com/a/210588884_267106?_f=index_news_7
"
;
String
url
=
"http
://www.sohu.com/a/281414426_133392
"
;
int
i
=
Souhu
.
getSouhuCommentCount
(
url
,
null
);
System
.
out
.
println
(
i
);
...
...
src/test/java/com/zhiwei/crawler/WangyiCommentCountExample.java
View file @
1116d3c5
package
com
.
zhiwei
.
crawler
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Wangyi
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
public
class
WangyiCommentCountExample
{
@Test
public
void
wangyiCommentCountTest
()
{
String
id
=
"D77CENT50001875P"
;
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
String
path
=
"D:\\crawlerdata\\自媒体\\网易评论采集-1.xlsx"
;
Map
<
String
,
Object
>
map
=
poi
.
importExcel
(
path
,
0
);
List
<
Map
<
String
,
Object
>>
list
=
(
List
<
Map
<
String
,
Object
>>)
map
.
get
(
"body"
);
List
<
String
>
urlList
=
new
ArrayList
<
String
>();
for
(
Map
<
String
,
Object
>
u
:
list
)
{
String
url
=
u
.
get
(
"链接"
)+
""
;
urlList
.
add
(
url
);
}
int
i
=
Wangyi
.
getWangyiCommentCount
(
id
,
null
);
System
.
out
.
println
(
i
);
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<>();
for
(
String
url
:
urlList
)
{
String
id
=
url
.
split
(
"/"
)[
url
.
split
(
"/"
).
length
-
1
].
split
(
".ht"
)[
0
];
System
.
out
.
println
(
id
);
int
lists
=
Wangyi
.
getWangyiCommentCount
(
id
,
null
);
ZhiWeiTools
.
sleep
(
3000
);
}
List
<
String
>
headList
=
new
ArrayList
<
String
>();
headList
.
add
(
"content"
);
headList
.
add
(
"id"
);
headList
.
add
(
"time"
);
headList
.
add
(
"name"
);
headList
.
add
(
"like"
);
headList
.
add
(
"unlike"
);
headList
.
add
(
"from_url"
);
poi
.
exportExcel
(
path
,
"评论数据"
,
headList
,
bodyList
);
}
...
...
src/test/java/com/zhiwei/keyword/DoubanTopicTest.java
View file @
1116d3c5
...
...
@@ -7,9 +7,9 @@
//public class DoubanTopicTest {
// @Test
// public void f() {
// String word = "
唐嫣
";
// String cookie = "bid=rymxzs5aojg; ps=y; ll=\"118173\"; __utmc=30149280;
__utmz=30149280.1543559458.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); dbcl2=\"188038058:9IHyVcSobVc\"; ck=_RvF; ap_v=0,6.0; push_noty_num=0; push_doumail_num=0; __utmv=30149280.18803; douban-fav-remind=1; __yadk_uid=qLflXyj3R14ro9e0cLoZOQlJoMGVN32j; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1543562805%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DPk19bxnhsVWwfnrcnwT0PquON7D1JpLsbfSu9rRowalyi4pOeM3rMHKFaJo9jJF7%26wd%3D%26eqid%3De7f262650001ef98000000045c00e64f%22%5D; _pk_ses.100001.8cb4=*; douban-profile-remind=1; __utma=30149280.824403997.1543559458.1543559458.1543562809.2; __utmt=1; _pk_id.100001.8cb4=6828fef49f6bcf34.1543559455.2.1543564606.1543559542.; __utmb=30149280.227.9.1543564257221
";
// String time = "2018-11-
27 15:47:41
";
// String word = "
胡歌
";
// String cookie = "bid=rymxzs5aojg; ps=y; ll=\"118173\"; __utmc=30149280;
dbcl2=\"188038058:9IHyVcSobVc\"; ck=_RvF; push_noty_num=0; push_doumail_num=0; __utmv=30149280.18803; douban-fav-remind=1; __yadk_uid=qLflXyj3R14ro9e0cLoZOQlJoMGVN32j; douban-profile-remind=1; _vwo_uuid_v2=D85F60C118B0AF465035D9CC7BBFDA7A6|4bf255e1e3a2e9aeede3708192f5f1bc; __utmz=30149280.1543564973.3.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1543908324%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DPk19bxnhsVWwfnrcnwT0PquON7D1JpLsbfSu9rRowalyi4pOeM3rMHKFaJo9jJF7%26wd%3D%26eqid%3De7f262650001ef98000000045c00e64f%22%5D; _pk_ses.100001.8cb4=*; ap_v=0,6.0; __utma=30149280.824403997.1543559458.1543885946.1543908324.10; __utmt=1; _pk_id.100001.8cb4=6828fef49f6bcf34.1543559455.9.1543908331.1543885945.; __utmb=30149280.9.7.1543908324
";
// String time = "2018-11-
16 00:00:00
";
//
// Douban.doubanTopicGetByWord(word, null, cookie,time);
//
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment