Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
A
articlenewscrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
chenweiyang
articlenewscrawler
Commits
7fb5554a
Commit
7fb5554a
authored
Dec 03, 2018
by
yangchen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
豆瓣评论点赞数 解析错误修改
parent
247e637d
Show whitespace changes
Inline
Side-by-side
Showing
17 changed files
with
477 additions
and
419 deletions
+477
-419
src/main/java/com/zhiwei/parse/Douban.java
+2
-2
src/main/java/com/zhiwei/parse/analysis/DoubanCommentAnalysis.java
+4
-1
src/test/java/com/zhiwei/Comment/AikaComment.java
+19
-19
src/test/java/com/zhiwei/Comment/DoubanCommentTest.java
+2
-2
src/test/java/com/zhiwei/Comment/FenghuangCommentExample.java
+61
-61
src/test/java/com/zhiwei/Comment/PcautoComment.java
+18
-18
src/test/java/com/zhiwei/Comment/QicheComment.java
+15
-15
src/test/java/com/zhiwei/Comment/SinaKejiComment.java
+15
-15
src/test/java/com/zhiwei/Comment/TechTxComment.java
+29
-29
src/test/java/com/zhiwei/TestHttpBoot.java
+38
-0
src/test/java/com/zhiwei/keyword/DoubanTopicTest.java
+17
-0
src/test/java/com/zhiwei/keyword/GftaiTest.java
+33
-33
src/test/java/com/zhiwei/keyword/KuaiTousuTest.java
+38
-38
src/test/java/com/zhiwei/keyword/QQKandianKeyWordExample.java
+48
-48
src/test/java/com/zhiwei/keyword/SinaTousuTest.java
+38
-38
src/test/java/com/zhiwei/keyword/ToutiaoKeyWordExample.java
+54
-54
src/test/java/com/zhiwei/keyword/XueqiuKeyWord.java
+46
-46
No files found.
src/main/java/com/zhiwei/parse/Douban.java
View file @
7fb5554a
...
...
@@ -87,7 +87,7 @@ public class Douban {
logger
.
info
(
"采集到 第 {} 页 ,一共采集到 {} 条 是否有下一页 {}"
,
page
,
bodyList
.
size
(),
more
);
}
}
ZhiWeiTools
.
sleep
(
15
00
);
ZhiWeiTools
.
sleep
(
30
00
);
page
++;
}
catch
(
Exception
e
)
{
...
...
@@ -127,7 +127,7 @@ public class Douban {
if
(
dataList
.
size
()
-
count
<=
95
||
dataList
.
size
()
-
count
>=
105
)
{
more
=
false
;
}
ZhiWeiTools
.
sleep
(
15
00
);
ZhiWeiTools
.
sleep
(
30
00
);
logger
.
info
(
"评论采集到 第 {} 页 ,一共采集到 {} 条数据 ,more : {}"
,
page
,
dataList
.
size
(),
more
);
}
catch
(
Exception
e
)
{
logger
.
error
(
"Exception {}"
,
e
);
...
...
src/main/java/com/zhiwei/parse/analysis/DoubanCommentAnalysis.java
View file @
7fb5554a
...
...
@@ -10,6 +10,8 @@ import java.util.Map;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
javax.swing.plaf.synth.SynthSpinnerUI
;
import
org.apache.commons.lang3.math.NumberUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
...
...
@@ -40,6 +42,7 @@ public class DoubanCommentAnalysis {
map
.
put
(
"content"
,
content
);
map
.
put
(
"id"
,
id
);
map
.
put
(
"like"
,
getLikeNum
(
result
,
"c"
+
id
));
System
.
out
.
println
(
map
.
toString
());
bodyList
.
add
(
map
);
}
}
...
...
@@ -52,7 +55,7 @@ public class DoubanCommentAnalysis {
}
private
int
getLikeNum
(
String
result
,
String
id
)
{
Matcher
matcher
=
Pattern
.
compile
(
id
+
"\":[\\
D\\d][0,5]
"
).
matcher
(
result
);
Matcher
matcher
=
Pattern
.
compile
(
id
+
"\":[\\
d]{0,5}
"
).
matcher
(
result
);
while
(
matcher
.
find
())
{
String
ret
=
matcher
.
group
(
0
);
ret
=
ret
.
split
(
":"
)[
1
].
split
(
","
)[
0
];
...
...
src/test/java/com/zhiwei/Comment/AikaComment.java
View file @
7fb5554a
package
com
.
zhiwei
.
Comment
;
import
org.testng.annotations.Test
;
import
com.zhiwei.parse.Aika
;
import
com.zhiwei.tools.timeparse.TimeExtraction
;
import
com.zhiwei.tools.timeparse.TimeParse
;
/**
 * Manual driver that runs the Aika comment crawler against one fixed article page.
 */
public class AikaComment {

    /**
     * Calls {@code Aika.getAikaComment} for a hard-coded article URL, passing
     * {@code null} as the second argument. The result is not inspected.
     */
    @Test
    public void f() {
        final String articleUrl = "http://newcar.xcar.com.cn/201809/news_2021765_1.html";
        Aika.getAikaComment(articleUrl, null);
    }
}
//
package com.zhiwei.Comment;
//
//
import org.testng.annotations.Test;
//
//
import com.zhiwei.parse.Aika;
//
import com.zhiwei.tools.timeparse.TimeExtraction;
//
import com.zhiwei.tools.timeparse.TimeParse;
//
//
public class AikaComment {
//
@Test
//
public void f() {
//
String url = "http://newcar.xcar.com.cn/201809/news_2021765_1.html";
//
//
Aika.getAikaComment(url, null);
//
//
//
System.out.println(TimeExtraction.parseFormatTime("09月12日", "MM dd"));
//
//
}
//
}
src/test/java/com/zhiwei/Comment/DoubanCommentTest.java
View file @
7fb5554a
...
...
@@ -12,8 +12,8 @@ import com.zhiwei.parse.Douban;
public
class
DoubanCommentTest
{
@Test
public
void
f
()
{
String
url
=
"https://www.douban.com/group/topic/
72528866
/"
;
String
cookie
=
"bid=rymxzs5aojg; ps=y; ll=\"118173\"; __utmc=30149280; dbcl2=\"188038058:9IHyVcSobVc\"; ck=_RvF;
ap_v=0,6.0; push_noty_num=0; push_doumail_num=0; __utmv=30149280.18803; douban-fav-remind=1; __yadk_uid=qLflXyj3R14ro9e0cLoZOQlJoMGVN32j; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1543562805%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DPk19bxnhsVWwfnrcnwT0PquON7D1JpLsbfSu9rRowalyi4pOeM3rMHKFaJo9jJF7%26wd%3D%26eqid%3De7f262650001ef98000000045c00e64f%22%5D; _pk_ses.100001.8cb4=*; douban-profile-remind=1; loc-last-index-location-id=\"118173\"; _vwo_uuid_v2=D85F60C118B0AF465035D9CC7BBFDA7A6|4bf255e1e3a2e9aeede3708192f5f1bc; __utma=30149280.824403997.1543559458.1543562809.1543564973.3; __utmz=30149280.1543564973.3.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _pk_id.100001.8cb4=6828fef49f6bcf34.1543559455.2.1543566557.1543559542.; __utmb=30149280.70.5.1543566539352
"
;
String
url
=
"https://www.douban.com/group/topic/
128726395
/"
;
String
cookie
=
"bid=rymxzs5aojg; ps=y; ll=\"118173\"; __utmc=30149280; dbcl2=\"188038058:9IHyVcSobVc\"; ck=_RvF;
push_noty_num=0; push_doumail_num=0; __utmv=30149280.18803; douban-fav-remind=1; __yadk_uid=qLflXyj3R14ro9e0cLoZOQlJoMGVN32j; douban-profile-remind=1; _vwo_uuid_v2=D85F60C118B0AF465035D9CC7BBFDA7A6|4bf255e1e3a2e9aeede3708192f5f1bc; __utmz=30149280.1543564973.3.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; ap_v=0,6.0; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1543823236%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DPk19bxnhsVWwfnrcnwT0PquON7D1JpLsbfSu9rRowalyi4pOeM3rMHKFaJo9jJF7%26wd%3D%26eqid%3De7f262650001ef98000000045c00e64f%22%5D; _pk_id.100001.8cb4=6828fef49f6bcf34.1543559455.5.1543823236.1543820463.; _pk_ses.100001.8cb4=*; __utma=30149280.824403997.1543559458.1543818802.1543823236.6; __utmt=1; __utmb=30149280.5.7.1543823236
"
;
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
List
<
Map
<
String
,
Object
>>
bodyList
=
Douban
.
getDoubanComment
(
url
,
null
,
cookie
);
...
...
src/test/java/com/zhiwei/Comment/FenghuangCommentExample.java
View file @
7fb5554a
package
com
.
zhiwei
.
Comment
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Fenghuang
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
/**
 * Manual driver: reads article URLs from an Excel sheet, crawls the comments of
 * each article through {@code Fenghuang.getFenghuangCommentData2}, and writes
 * all collected rows back to the same workbook.
 */
public class FenghuangCommentExample {

    /**
     * Crawls every URL found under the {@code "body"} entry of the imported sheet.
     * URLs that yield no data are printed at the end; URLs that throw are printed
     * together with the stack trace and skipped (without the one-second pause).
     */
    @Test
    public void fenghuangCommentTest() {
        PoiExcelUtil excel = PoiExcelUtil.getInstance();

        Map<String, Object> sheet = excel.importExcel("D://crawlerdata//自媒体/凤凰评论采集.xlsx", 0);
        List<Map<String, Object>> rows = (List<Map<String, Object>>) sheet.get("body");

        List<Map<String, Object>> bodyList = new ArrayList<>();
        List<String> emptyUrls = new ArrayList<>();

        for (Map<String, Object> row : rows) {
            String url = "";
            try {
                url = String.valueOf(row.get("url"));
                System.out.println(url);

                List<Map<String, Object>> comments = Fenghuang.getFenghuangCommentData2(url, null);
                if (comments == null || comments.isEmpty()) {
                    // Remember URLs that produced nothing so they can be listed afterwards.
                    emptyUrls.add(url);
                } else {
                    for (Map<String, Object> comment : comments) {
                        comment.put("from_url", url);
                        bodyList.add(comment);
                    }
                }
            } catch (Exception e) {
                System.out.println(url);
                e.printStackTrace();
                continue;
            }
            // Pause between successful requests only; a failed URL jumps straight to the next one.
            ZhiWeiTools.sleep(1000);
        }

        List<String> headList = new ArrayList<>();
        headList.add("source");
        headList.add("content");
        headList.add("id");
        headList.add("like");
        headList.add("from");
        headList.add("time");
        headList.add("from_url");

        for (String emptyUrl : emptyUrls) {
            System.out.println(emptyUrl);
        }

        excel.exportExcel("D://crawlerdata//自媒体/凤凰评论采集.xlsx", "评论采集", headList, bodyList);
    }
}
//
package com.zhiwei.Comment;
//
//
import java.util.ArrayList;
//
import java.util.List;
//
import java.util.Map;
//
//
import org.junit.Test;
//
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.Fenghuang;
//
import com.zhiwei.tools.tools.ZhiWeiTools;
//
//
public class FenghuangCommentExample {
//
//
@Test
//
public void fenghuangCommentTest() {
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
//
Map<String,Object> map = poi.importExcel("D://crawlerdata//自媒体/凤凰评论采集.xlsx", 0);
//
List<Map<String,Object>> list = (List<Map<String,Object>>)map.get("body");
//
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
//
List<String> urlList = new ArrayList<String>();
//
for(Map<String,Object> map1 : list) {
//
String url = "";
//
try {
//
url = map1.get("url")+"";
//
System.out.println(url);
//
List<Map<String,Object>> dataList = Fenghuang.getFenghuangCommentData2(url,null);
//
if(dataList == null || dataList.size() <= 0) {
//
urlList.add(url);
//
}
//
if(dataList != null) {
//
for(Map<String,Object> m : dataList) {
//
m.put("from_url", url);
//
bodyList.add(m);
//
}
//
}
//
} catch (Exception e) {
//
System.out.println(url);
//
e.printStackTrace();
//
continue;
//
}
//
ZhiWeiTools.sleep(1000);
//
}
//
List<String> headList = new ArrayList<String>();
//
headList.add("source");
//
headList.add("content");
//
headList.add("id");
//
headList.add("like");
//
headList.add("from");
//
headList.add("time");
//
headList.add("from_url");
//
for(String s : urlList) {
//
System.out.println(s);
//
}
//
poi.exportExcel("D://crawlerdata//自媒体/凤凰评论采集.xlsx", "评论采集", headList, bodyList);
//
//
}
//
//
//
}
src/test/java/com/zhiwei/Comment/PcautoComment.java
View file @
7fb5554a
package
com
.
zhiwei
.
Comment
;
import
java.util.List
;
import
java.util.Map
;
import
org.testng.annotations.Test
;
import
com.zhiwei.parse.Pcauto
;
/**
 * Manual driver that runs the Pcauto comment crawler against one fixed article page.
 */
public class PcautoComment {

    /**
     * Fetches the comments for a hard-coded article URL and prints how many
     * entries were returned.
     */
    @Test
    public void f() {
        final String articleUrl = "https://www.pcauto.com.cn/nation/1352/13523485.html";
        final List<Map<String, Object>> comments = Pcauto.getPcAutoComment(articleUrl, null);
        System.out.println(comments.size());
    }
}
//
package com.zhiwei.Comment;
//
//
import java.util.List;
//
import java.util.Map;
//
//
import org.testng.annotations.Test;
//
//
import com.zhiwei.parse.Pcauto;
//
//
public class PcautoComment {
//
@Test
//
public void f() {
//
String url = "https://www.pcauto.com.cn/nation/1352/13523485.html";
//
//
List<Map<String,Object>> data = Pcauto.getPcAutoComment(url, null);
//
System.out.println(data.size());
//
}
//
}
src/test/java/com/zhiwei/Comment/QicheComment.java
View file @
7fb5554a
package
com
.
zhiwei
.
Comment
;
import
org.testng.annotations.Test
;
import
com.zhiwei.parse.QicheHome
;
/**
 * Manual driver that runs the QicheHome comment crawler for one fixed article id.
 */
public class QicheComment {

    /**
     * Calls {@code QicheHome.getQiCheComment} with a hard-coded article id and
     * {@code null} as the second argument. The result is not inspected.
     */
    @Test
    public void f() {
        final String id = "922761";
        QicheHome.getQiCheComment(id, null);
    }
}
//
package com.zhiwei.Comment;
//
//
import org.testng.annotations.Test;
//
//
import com.zhiwei.parse.QicheHome;
//
//
public class QicheComment {
//
@Test
//
public void f() {
//
String articleid = "922761";
//
//
QicheHome.getQiCheComment(articleid, null);
//
//
}
//
}
src/test/java/com/zhiwei/Comment/SinaKejiComment.java
View file @
7fb5554a
package
com
.
zhiwei
.
Comment
;
import
org.testng.annotations.Test
;
import
com.zhiwei.parse.SinaKeji
;
/**
 * Manual driver that runs the SinaKeji comment crawler against one fixed article page.
 */
public class SinaKejiComment {

    /**
     * Calls {@code SinaKeji.getSinaKejiComment} for a hard-coded article URL,
     * passing {@code null} as the second argument. The result is not inspected.
     */
    @Test
    public void f() {
        final String articleUrl = "https://tech.sina.com.cn/it/2018-07-17/doc-ihfkffam0632649.shtml";
        SinaKeji.getSinaKejiComment(articleUrl, null);
    }
}
//
package com.zhiwei.Comment;
//
//
import org.testng.annotations.Test;
//
//
import com.zhiwei.parse.SinaKeji;
//
//
public class SinaKejiComment {
//
@Test
//
public void f() {
//
String url = "https://tech.sina.com.cn/it/2018-07-17/doc-ihfkffam0632649.shtml";
//
//
SinaKeji.getSinaKejiComment(url, null);
//
//
}
//
}
src/test/java/com/zhiwei/Comment/TechTxComment.java
View file @
7fb5554a
package
com
.
zhiwei
.
Comment
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.testng.annotations.Test
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.TechTx
;
/**
 * Manual driver: crawls the comments of one fixed tech.qq.com article and
 * exports them to an Excel workbook.
 */
public class TechTxComment {

    /**
     * Fetches the comments, writes them to the workbook with a fixed column
     * list, then prints the number of rows fetched.
     */
    @Test
    public void f() {
        final String articleUrl = "http://tech.qq.com/a/20170629/005621.htm";
        List<Map<String, Object>> comments = TechTx.getTechTxComment(articleUrl, null);

        PoiExcelUtil excel = PoiExcelUtil.getInstance();

        // Column headers, in the order they are handed to the exporter.
        List<String> columns = new ArrayList<>();
        columns.add("time");
        columns.add("content");
        columns.add("source");
        columns.add("like");
        columns.add("userId");
        columns.add("id");

        excel.exportExcel("D://crawlerdata//自媒体/腾讯科技评论采集.xlsx", "ces", columns, comments);
        System.out.println(comments.size());
    }
}
//
package com.zhiwei.Comment;
//
//
import java.util.ArrayList;
//
import java.util.List;
//
import java.util.Map;
//
//
import org.testng.annotations.Test;
//
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.TechTx;
//
//
public class TechTxComment {
//
@Test
//
public void f() {
//
String url = "http://tech.qq.com/a/20170629/005621.htm";
//
//
List<Map<String,Object>> bodyList = TechTx.getTechTxComment(url, null);
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
List<String> headList = new ArrayList<>();
//
headList.add("time");
//
headList.add("content");
//
headList.add("source");
//
headList.add("like");
//
headList.add("userId");
//
headList.add("id");
//
poi.exportExcel("D://crawlerdata//自媒体/腾讯科技评论采集.xlsx", "ces", headList, bodyList);
//
System.out.println(bodyList.size());
//
}
//
}
src/test/java/com/zhiwei/TestHttpBoot.java
0 → 100644
View file @
7fb5554a
package
com
.
zhiwei
;
import
java.io.IOException
;
import
java.util.HashMap
;
import
java.util.Map
;
import
java.util.HashMap
;
import
org.testng.annotations.Test
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
/**
 * Manual driver that issues a single GET request through {@code HttpBoot} and
 * prints the response body.
 */
public class TestHttpBoot {

    /**
     * Sends a GET request to a hard-coded URL and prints the body as a string.
     *
     * <p>The previous version also built {@code headers} and {@code params} maps
     * (including session cookies and a CSRF token) that were never attached to
     * the request; those dead locals have been removed. The I/O failure is now
     * propagated instead of being printed and ignored, so the test runner
     * reports a failed request as a failed test.
     *
     * @throws IOException if the request or reading the response body fails
     */
    @Test
    public void f() throws IOException {
        HttpBoot httpBoot = new HttpBoot();
        String url = "https://www.toutiao.com/c/user/following/?user_id=1034006366&count=20&_signature=wp5wPBAVmXlosTC8Fobui8KecC";

        String result = httpBoot.syncCall(RequestUtils.wrapGet(url)).body().string();
        System.out.println(result);
    }
}
src/test/java/com/zhiwei/keyword/DoubanTopicTest.java
0 → 100644
View file @
7fb5554a
//package com.zhiwei.keyword;
//
//import org.testng.annotations.Test;
//
//import com.zhiwei.parse.Douban;
//
//public class DoubanTopicTest {
// @Test
// public void f() {
// String word = "唐嫣";
// String cookie = "bid=rymxzs5aojg; ps=y; ll=\"118173\"; __utmc=30149280; __utmz=30149280.1543559458.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); dbcl2=\"188038058:9IHyVcSobVc\"; ck=_RvF; ap_v=0,6.0; push_noty_num=0; push_doumail_num=0; __utmv=30149280.18803; douban-fav-remind=1; __yadk_uid=qLflXyj3R14ro9e0cLoZOQlJoMGVN32j; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1543562805%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DPk19bxnhsVWwfnrcnwT0PquON7D1JpLsbfSu9rRowalyi4pOeM3rMHKFaJo9jJF7%26wd%3D%26eqid%3De7f262650001ef98000000045c00e64f%22%5D; _pk_ses.100001.8cb4=*; douban-profile-remind=1; __utma=30149280.824403997.1543559458.1543559458.1543562809.2; __utmt=1; _pk_id.100001.8cb4=6828fef49f6bcf34.1543559455.2.1543564606.1543559542.; __utmb=30149280.227.9.1543564257221";
// String time = "2018-11-27 15:47:41";
//
// Douban.doubanTopicGetByWord(word, null, cookie,time);
//
// }
//}
src/test/java/com/zhiwei/keyword/GftaiTest.java
View file @
7fb5554a
package
com
.
zhiwei
.
keyword
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.testng.annotations.Test
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Gftai
;
/**
 * Manual driver: queries {@code Gftai.getData} for a fixed list of keywords and
 * exports the combined results to an Excel workbook.
 */
public class GftaiTest {

    /**
     * Runs one query per keyword (keywords are separated by {@code |} in the
     * literal below), prints the running total after each keyword, and writes
     * all rows to the workbook.
     */
    @Test
    public void f() {
        PoiExcelUtil excel = PoiExcelUtil.getInstance();

        String[] keywords = "民宿|短租|住宿|途家|爱彼迎|小猪短租|榛果民宿|Airbnb".split("\\|");

        List<Map<String, Object>> rows = new ArrayList<>();
        for (String keyword : keywords) {
            List<Map<String, Object>> hits = Gftai.getData(keyword, null);
            rows.addAll(hits);
            // The printed number is the cumulative row count, not the count for this keyword.
            System.out.println(keyword + " --------- " + rows.size());
        }

        List<String> columns = new ArrayList<>();
        columns.add("title");
        columns.add("time");
        columns.add("content");
        columns.add("source");
        columns.add("url");

        excel.exportExcel("D:\\crawlerdata\\自媒体\\投诉\\国富泰信用.xlsx", "数据", columns, rows);
    }
}
//
package com.zhiwei.keyword;
//
//
import java.util.ArrayList;
//
import java.util.List;
//
import java.util.Map;
//
//
import org.testng.annotations.Test;
//
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.Gftai;
//
//
public class GftaiTest {
//
@Test
//
public void f() {
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
String words = "民宿|短租|住宿|途家|爱彼迎|小猪短租|榛果民宿|Airbnb";
//
String[] ws = words.split("\\|");
//
List<Map<String,Object>> bodyList = new ArrayList<>();
//
for(String word : ws) {
//
List<Map<String,Object>> list = Gftai.getData(word, null);
//
bodyList.addAll(list);
//
System.out.println(word + " --------- " + bodyList.size());
//
}
//
List<String> headList = new ArrayList<>();
//
headList.add("title");
//
headList.add("time");
//
headList.add("content");
//
headList.add("source");
//
headList.add("url");
//
//
poi.exportExcel("D:\\crawlerdata\\自媒体\\投诉\\国富泰信用.xlsx", "数据", headList, bodyList);
//
}
//
}
src/test/java/com/zhiwei/keyword/KuaiTousuTest.java
View file @
7fb5554a
package
com
.
zhiwei
.
keyword
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.testng.annotations.Test
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Gftai
;
import
com.zhiwei.parse.KuaiTousu
;
/**
 * Manual driver: queries {@code KuaiTousu.getData} for a fixed list of keywords
 * and exports the combined results to an Excel workbook.
 */
public class KuaiTousuTest {

    /**
     * Runs one query per keyword (keywords are separated by {@code |} in the
     * literal below), prints the running total after each keyword, and writes
     * all rows to the workbook.
     */
    @Test
    public void f() {
        PoiExcelUtil excel = PoiExcelUtil.getInstance();

        String[] keywords = "民宿|短租|住宿|途家|爱彼迎|小猪短租|榛果民宿|Airbnb".split("\\|");

        List<Map<String, Object>> rows = new ArrayList<>();
        for (String keyword : keywords) {
            List<Map<String, Object>> hits = KuaiTousu.getData(keyword, null);
            rows.addAll(hits);
            // The printed number is the cumulative row count, not the count for this keyword.
            System.out.println(keyword + " --------- " + rows.size());
        }

        List<String> columns = new ArrayList<>();
        columns.add("title");
        columns.add("time");
        columns.add("content");
        columns.add("source");
        columns.add("url");

        excel.exportExcel("D:\\crawlerdata\\自媒体\\投诉\\新浪广东快投诉.xlsx", "数据", columns, rows);
    }
}
//
package com.zhiwei.keyword;
//
//
import java.util.ArrayList;
//
import java.util.List;
//
import java.util.Map;
//
//
import org.testng.annotations.Test;
//
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.Gftai;
//
import com.zhiwei.parse.KuaiTousu;
//
//
public class KuaiTousuTest {
//
@Test
//
public void f() {
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
String words = "民宿|短租|住宿|途家|爱彼迎|小猪短租|榛果民宿|Airbnb";
//
String[] ws = words.split("\\|");
//
List<Map<String,Object>> bodyList = new ArrayList<>();
//
for(String word : ws) {
//
List<Map<String,Object>> list = KuaiTousu.getData(word, null);
//
bodyList.addAll(list);
//
System.out.println(word + " --------- " + bodyList.size());
//
}
//
List<String> headList = new ArrayList<>();
//
headList.add("title");
//
headList.add("time");
//
headList.add("content");
//
headList.add("source");
//
headList.add("url");
//
//
poi.exportExcel("D:\\crawlerdata\\自媒体\\投诉\\新浪广东快投诉.xlsx", "数据", headList, bodyList);
//
//
//
//
//
}
//
}
src/test/java/com/zhiwei/keyword/QQKandianKeyWordExample.java
View file @
7fb5554a
package
com
.
zhiwei
.
keyword
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
org.testng.annotations.Test
;
import
com.zhiwei.bean.HistortyBean
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.QQKandian
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
/**
 * Manual driver: searches QQ Kandian for a fixed list of keywords, flattens the
 * returned {@code HistortyBean} objects into row maps, and exports them to an
 * Excel workbook.
 */
public class QQKandianKeyWordExample {

    /**
     * Runs one search per keyword (keywords are separated by {@code |} in the
     * literal below), pausing three seconds after each keyword, then writes all
     * rows to the workbook.
     */
    @Test
    public void f() {
        String[] keywords =
                "今日头条 算法|今日头条 侵权|今日头条 起诉|字节跳动|张一鸣|抖音 涉黄|抖音 未成年|抖音".split("\\|");

        PoiExcelUtil excel = PoiExcelUtil.getInstance();
        QQKandian crawler = new QQKandian();

        List<Map<String, Object>> rows = new ArrayList<>();
        for (String keyword : keywords) {
            System.out.println(keyword);

            List<HistortyBean> beans = crawler.getDataByword(keyword, null);
            System.out.println(keyword + " ---- " + beans.size());

            for (HistortyBean bean : beans) {
                // Row keys must match the column headers added below.
                Map<String, Object> row = new HashMap<>();
                row.put("标题", bean.getTitle());
                row.put("时间", bean.getTime());
                row.put("来源", bean.getSource());
                row.put("正文", bean.getContent());
                row.put("链接", bean.getUrl());
                rows.add(row);
            }

            ZhiWeiTools.sleep(3000);
        }

        List<String> columns = new ArrayList<>();
        columns.add("标题");
        columns.add("来源");
        columns.add("链接");
        columns.add("正文");
        columns.add("时间");

        excel.exportExcel("D:\\crawlerdata\\自媒体\\qq看点-今日头条 算法.xlsx", "马化腾", columns, rows);
    }
}
//
package com.zhiwei.keyword;
//
//
import java.util.ArrayList;
//
import java.util.HashMap;
//
import java.util.List;
//
import java.util.Map;
//
//
import org.testng.annotations.Test;
//
//
import com.zhiwei.bean.HistortyBean;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.QQKandian;
//
import com.zhiwei.tools.tools.ZhiWeiTools;
//
//
public class QQKandianKeyWordExample {
//
@Test
//
public void f() {
//
String word = "今日头条 算法|今日头条 侵权|今日头条 起诉|字节跳动|张一鸣|抖音 涉黄|抖音 未成年|抖音";
//
String[] words = word.split("\\|");
//
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
QQKandian qqKandian = new QQKandian();
//
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
//
for(String w : words) {
//
System.out.println(w);
//
List<HistortyBean> dataList = qqKandian.getDataByword(w, null);
//
System.out.println(w + " ---- " + dataList.size());
//
for(HistortyBean h : dataList) {
//
Map<String, Object> map = new HashMap<String,Object>();
//
map.put("标题", h.getTitle());
//
map.put("时间", h.getTime());
//
map.put("来源", h.getSource());
//
map.put("正文", h.getContent());
//
map.put("链接", h.getUrl());
//
bodyList.add(map);
//
}
//
ZhiWeiTools.sleep(3000);
//
}
//
List<String> headList = new ArrayList<String>();
//
headList.add("标题");
//
headList.add("来源");
//
headList.add("链接");
//
headList.add("正文");
//
headList.add("时间");
//
poi.exportExcel("D:\\crawlerdata\\自媒体\\qq看点-今日头条 算法.xlsx", "马化腾", headList, bodyList);
//
}
//
//
}
src/test/java/com/zhiwei/keyword/SinaTousuTest.java
View file @
7fb5554a
package
com
.
zhiwei
.
keyword
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.testng.annotations.Test
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.KuaiTousu
;
import
com.zhiwei.parse.SinaTousu
;
/**
 * Manual driver: queries {@code SinaTousu.getSinaTousuData} for a fixed list of
 * keywords and exports the combined results to an Excel workbook.
 */
public class SinaTousuTest {

    /**
     * Runs one query per keyword (keywords are separated by {@code |} in the
     * literal below) with a fixed timestamp argument, prints the running total
     * after each keyword, and writes all rows to the workbook.
     */
    @Test
    public void getSinaTousuData() {
        PoiExcelUtil excel = PoiExcelUtil.getInstance();

        String[] keywords = "民宿|短租|住宿|途家|爱彼迎|小猪短租|榛果民宿|Airbnb".split("\\|");

        List<Map<String, Object>> rows = new ArrayList<>();
        for (String keyword : keywords) {
            List<Map<String, Object>> hits =
                    SinaTousu.getSinaTousuData(keyword, null, "2018-01-01 00:00:00");
            rows.addAll(hits);
            // The printed number is the cumulative row count, not the count for this keyword.
            System.out.println(keyword + " --------- " + rows.size());
        }

        List<String> columns = new ArrayList<>();
        columns.add("title");
        columns.add("time");
        columns.add("content");
        columns.add("source");
        columns.add("url");

        excel.exportExcel("D:\\crawlerdata\\自媒体\\投诉\\黑猫投诉.xlsx", "数据", columns, rows);
    }
}
//
package com.zhiwei.keyword;
//
//
import java.util.ArrayList;
//
import java.util.List;
//
import java.util.Map;
//
//
import org.testng.annotations.Test;
//
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.KuaiTousu;
//
import com.zhiwei.parse.SinaTousu;
//
//
public class SinaTousuTest {
//
//
@Test
//
public void getSinaTousuData() {
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
String words = "民宿|短租|住宿|途家|爱彼迎|小猪短租|榛果民宿|Airbnb";
//
String[] ws = words.split("\\|");
//
List<Map<String,Object>> bodyList = new ArrayList<>();
//
for(String word : ws) {
//
List<Map<String,Object>> list = SinaTousu.getSinaTousuData(word, null, "2018-01-01 00:00:00");
//
bodyList.addAll(list);
//
System.out.println(word + " --------- " + bodyList.size());
//
}
//
List<String> headList = new ArrayList<>();
//
headList.add("title");
//
headList.add("time");
//
headList.add("content");
//
headList.add("source");
//
headList.add("url");
//
//
poi.exportExcel("D:\\crawlerdata\\自媒体\\投诉\\黑猫投诉.xlsx", "数据", headList, bodyList);
//
//
//
//
}
//
}
src/test/java/com/zhiwei/keyword/ToutiaoKeyWordExample.java
View file @
7fb5554a
package
com
.
zhiwei
.
keyword
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Toutiao
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
/**
 * Command-line driver: reads keywords from an Excel sheet, searches Toutiao for
 * each one, keeps the hits that fall inside a fixed time window and mention the
 * keyword, and writes them back to the same workbook.
 */
public class ToutiaoKeyWordExample {

    /**
     * Entry point. Timestamps are compared as strings, so the window check
     * relies on lexicographic ordering of the {@code "time"} values.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        PoiExcelUtil excel = PoiExcelUtil.getInstance();
        String path = "D:\\crawlerdata\\关键词.xlsx";

        Map<String, Object> sheet = excel.importExcel(path, 0);
        List<Map<String, Object>> keywordRows = (List<Map<String, Object>>) sheet.get("body");
        List<Map<String, Object>> kept = new ArrayList<>();

        String windowStart = "2018-06-28 00:00:00";
        String windowEnd = "2018-06-28 23:59:59";
        String devoid = "54381805805";

        for (Map<String, Object> keywordRow : keywordRows) {
            String word = String.valueOf(keywordRow.get("关键词"));
            List<Map<String, Object>> hits = Toutiao.getKeyWordData(word, null, devoid);

            if (hits != null) {
                for (Map<String, Object> hit : hits) {
                    String time = String.valueOf(hit.get("time"));
                    System.out.println(time);

                    // Every hit is tagged with its keyword, whether or not it is kept.
                    hit.put("word", word);
                    String titleAndContent = hit.get("title") + "--" + hit.get("content");

                    boolean insideWindow =
                            time.compareTo(windowStart) >= 0 && time.compareTo(windowEnd) <= 0;
                    if (insideWindow) {
                        System.out.println(1);
                        if (titleAndContent.contains(word)) {
                            kept.add(hit);
                        }
                    }
                }
            }

            ZhiWeiTools.sleep(2000);
        }

        List<String> columns = new ArrayList<>();
        columns.add("title");
        columns.add("time");
        columns.add("content");
        columns.add("source");
        columns.add("url");
        columns.add("word");

        excel.exportExcel(path, "雅培", columns, kept);
    }
}
//
package com.zhiwei.keyword;
//
//
import java.util.ArrayList;
//
import java.util.List;
//
import java.util.Map;
//
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.Toutiao;
//
import com.zhiwei.tools.tools.ZhiWeiTools;
//
//
public class ToutiaoKeyWordExample {
//
//
public static void main(String[] args) {
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
String path = "D:\\crawlerdata\\关键词.xlsx";
//
Map<String,Object> map = poi.importExcel(path, 0);
//
List<Map<String,Object>> list = (List<Map<String, Object>>) map.get("body");
//
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
//
String startTime = "2018-06-28 00:00:00";
//
String endTime = "2018-06-28 23:59:59";
//
String devoid = "54381805805";
//
for(Map<String,Object> map1 : list) {
//
String word = map1.get("关键词")+"";
//
List<Map<String,Object>> dataList = Toutiao.getKeyWordData(word, null,devoid);
//
if(dataList != null) {
//
for(Map<String,Object> m : dataList) {
//
String time = m.get("time")+"";
//
System.out.println(time);
//
m.put("word", word);
//
String ma = m.get("title") + "--" + m.get("content");
//
if(time.compareTo(startTime) > -1 && time.compareTo(endTime) < 1) {
//
System.out.println(1);
//
if(ma.contains(word)) {
//
bodyList.add(m);
//
}
//
}
//
}
//
}
//
ZhiWeiTools.sleep(2000);
//
}
//
//
List<String> headList = new ArrayList<String>();
//
headList.add("title");
//
headList.add("time");
//
headList.add("content");
//
headList.add("source");
//
headList.add("url");
//
headList.add("word");
//
//
poi.exportExcel(path, "雅培", headList, bodyList);
//
//
}
//
//
}
src/test/java/com/zhiwei/keyword/XueqiuKeyWord.java
View file @
7fb5554a
package
com
.
zhiwei
.
keyword
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.testng.annotations.Test
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Xueqiu
;
/**
 * Manual driver: searches Xueqiu for a fixed list of keywords using a
 * hard-coded cookie and exports the combined results to an Excel workbook.
 */
public class XueqiuKeyWord {

    /**
     * Runs one search per keyword (keywords are separated by {@code |} in the
     * literal below), prints the per-keyword hit count, and writes all rows to
     * the workbook. The {@code ProxyFactory.init} call that once preceded the
     * crawl is disabled in this version.
     */
    @Test
    public void f() {
        String keywordSpec = "腾讯 股价|腾讯 市值|腾讯 估值|腾讯 投资|腾讯 财报";
        String endTime = "2018-01-01 00:00:00";
        String cookie = "_ga=GA1.2.590296572.1538275545; device_id=812aac42a8874c32ca97893a7b270684; s=ff12lgxbt1; __utma=1.590296572.1538275545.1538280279.1538280279.1; __utmz=1.1538280279.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); aliyungf_tc=AQAAAPtocWy01ggA6tbncwYt4OIDsaiG; xq_a_token=088c6ad5e275496d7c91b8b5b2ecb929bee15772; xq_a_token.sig=NcpAm7FRsAVoMGRMOLrLRveBT7U; xq_r_token=08131eb9a4f33b43b2340fd782f3776c9823d74a; xq_r_token.sig=AZ7au6ICJtTgQJVPybkOJs_Fr54; Hm_lvt_1db88642e346389874251b5a1eded6e3=1539599526,1539913171,1539913178,1541157360; _gid=GA1.2.1817290343.1541157360; u=721541157360383; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1541212289";

        String[] keywords = keywordSpec.split("\\|");
        PoiExcelUtil excel = PoiExcelUtil.getInstance();

        List<Map<String, Object>> rows = new ArrayList<>();
        for (String keyword : keywords) {
            System.out.println(keyword);
            List<Map<String, Object>> hits = Xueqiu.getData(keyword, endTime, null, cookie);
            System.out.println(keyword + " ---- " + hits.size());
            rows.addAll(hits);
        }

        List<String> columns = new ArrayList<>();
        columns.add("title");
        columns.add("time");
        columns.add("content");
        columns.add("uper");
        columns.add("url");
        columns.add("likeCount");
        columns.add("replyCount");

        excel.exportExcel("D:\\crawlerdata\\自媒体\\雪球网页采集-腾讯-relevance.xlsx", "马化腾", columns, rows);
    }
}
//
package com.zhiwei.keyword;
//
//
import java.util.ArrayList;
//
import java.util.List;
//
import java.util.Map;
//
//
import org.testng.annotations.Test;
//
//
import com.zhiwei.common.config.GroupType;
//
import com.zhiwei.crawler.proxy.ProxyFactory;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.Xueqiu;
//
//
public class XueqiuKeyWord {
//
@Test
//
public void f() {
//
//
ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
//
String word = "腾讯 股价|腾讯 市值|腾讯 估值|腾讯 投资|腾讯 财报";
//
String endTime = "2018-01-01 00:00:00";
//
String cookie = "_ga=GA1.2.590296572.1538275545; device_id=812aac42a8874c32ca97893a7b270684; s=ff12lgxbt1; __utma=1.590296572.1538275545.1538280279.1538280279.1; __utmz=1.1538280279.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); aliyungf_tc=AQAAAPtocWy01ggA6tbncwYt4OIDsaiG; xq_a_token=088c6ad5e275496d7c91b8b5b2ecb929bee15772; xq_a_token.sig=NcpAm7FRsAVoMGRMOLrLRveBT7U; xq_r_token=08131eb9a4f33b43b2340fd782f3776c9823d74a; xq_r_token.sig=AZ7au6ICJtTgQJVPybkOJs_Fr54; Hm_lvt_1db88642e346389874251b5a1eded6e3=1539599526,1539913171,1539913178,1541157360; _gid=GA1.2.1817290343.1541157360; u=721541157360383; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1541212289";
//
//
//
//
String[] words = word.split("\\|");
//
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
//
for(String w : words) {
//
System.out.println(w);
//
//
List<Map<String, Object>> dataList = Xueqiu.getData(w, endTime, null, cookie);
//
System.out.println(w + " ---- " + dataList.size());
//
bodyList.addAll(dataList);
//
}
//
List<String> headList = new ArrayList<String>();
//
headList.add("title");
//
headList.add("time");
//
headList.add("content");
//
headList.add("uper");
//
headList.add("url");
//
headList.add("likeCount");
//
headList.add("replyCount");
//
poi.exportExcel("D:\\crawlerdata\\自媒体\\雪球网页采集-腾讯-relevance.xlsx", "马化腾", headList, bodyList);
//
//
}
//
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment