Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
A
articlenewscrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
chenweiyang
articlenewscrawler
Commits
3e350f8b
Commit
3e350f8b
authored
Aug 05, 2019
by
yangchen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
修改部分代理使用方式 并升级版本
parent
096c4f21
Show whitespace changes
Inline
Side-by-side
Showing
17 changed files
with
408 additions
and
412 deletions
+408
-412
pom.xml
+1
-1
src/main/java/com/zhiwei/parse/Baijia.java
+9
-10
src/main/java/com/zhiwei/parse/Douban.java
+3
-3
src/main/java/com/zhiwei/parse/Maimai.java
+3
-3
src/main/java/com/zhiwei/parse/QQKB.java
+1
-2
src/main/java/com/zhiwei/parse/TXNews.java
+1
-2
src/main/java/com/zhiwei/parse/Wangyi.java
+1
-1
src/main/java/com/zhiwei/parse/analysis/Ts21cnAnalysis.java
+0
-1
src/test/java/com/zhiwei/crawler/DayuAccountExample.java
+54
-54
src/test/java/com/zhiwei/crawler/YidainzixunByWordExample.java
+44
-44
src/test/java/com/zhiwei/hsitory/YidianzixunAccountExample.java
+37
-37
src/test/java/com/zhiwei/keyword/XueqiuKeyWord.java
+47
-47
src/test/java/com/zhiwei/shipin/AiqiyiTest.java
+44
-44
src/test/java/com/zhiwei/shipin/BilibiliTest.java
+42
-42
src/test/java/com/zhiwei/shipin/QQTVTest.java
+46
-46
src/test/java/com/zhiwei/shipin/SohuTVTest.java
+39
-39
src/test/java/com/zhiwei/shipin/YoukuKeyWordTest.java
+36
-36
No files found.
pom.xml
View file @
3e350f8b
...
...
@@ -3,7 +3,7 @@
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiwei
</groupId>
<artifactId>
articlenewscrawler
</artifactId>
<version>
0.1.
6
-SNAPSHOT
</version>
<version>
0.1.
7
-SNAPSHOT
</version>
<name>
articlenewscrawler
</name>
<description>
采集凤凰,一点资讯,搜狐历史文章和文章评论
</description>
...
...
src/main/java/com/zhiwei/parse/Baijia.java
View file @
3e350f8b
...
...
@@ -14,6 +14,7 @@ import org.slf4j.LoggerFactory;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HttpClient
;
...
...
@@ -21,8 +22,6 @@ import com.zhiwei.parse.analysis.BaijiaAccountAnalysis;
import
com.zhiwei.tools.httpclient.HeaderTool
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Request
;
public
class
Baijia
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
Baijia
.
class
);
private
static
BaijiaAccountAnalysis
baijiaAccountAnalysis
=
new
BaijiaAccountAnalysis
();
...
...
@@ -70,17 +69,18 @@ public class Baijia {
/**
*
* @Description 获取百家号第三种方法
* @param app
_i
d
* @param app
I
d
* @param startTime
* @param proxy
* @return
*/
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
Map
<
String
,
Object
>>
getBaijiaAccountByBaiduData
(
String
app_id
,
String
name
,
String
startTime
,
String
cookie
,
Proxy
proxy
)
{
public
static
List
<
Map
<
String
,
Object
>>
getBaijiaAccountByBaiduData
(
String
appId
,
String
name
,
String
startTime
,
String
cookie
,
ProxyHolder
proxy
)
{
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
headerMap
.
put
(
"cookie"
,
cookie
);
String
uk
=
getUkData
(
app
_i
d
,
proxy
,
cookie
);
String
uk
=
getUkData
(
app
I
d
,
proxy
,
cookie
);
if
(
Objects
.
isNull
(
uk
))
{
return
Collections
.
emptyList
();
}
...
...
@@ -90,12 +90,11 @@ public class Baijia {
for
(
int
i
=
1
;
i
<
3
;
i
++)
{
try
{
String
url
=
"https://author.baidu.com/list?type=article&tab=2&uk="
+
uk
+
"&ctime="
+
ctime
+
"&num=50"
;
Request
request
=
RequestUtils
.
wrapGet
(
url
,
headerMap
);
String
result
=
httpBoot
.
syncCall
(
request
,
proxy
).
body
().
string
();
String
result
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
();
Map
<
String
,
Object
>
dMap
=
baijiaAccountAnalysis
.
getBaijiaAccountData3
(
result
,
name
,
startTime
);
List
<
Map
<
String
,
Object
>>
dList
=
(
List
<
Map
<
String
,
Object
>>)
dMap
.
get
(
"data"
);
dataList
.
addAll
(
dList
);
logger
.
info
(
"{} 数据采集结果 {}"
,
app
_i
d
,
dataList
.
size
());
logger
.
info
(
"{} 数据采集结果 {}"
,
app
I
d
,
dataList
.
size
());
if
(!(
boolean
)
dMap
.
get
(
"more"
))
{
f
=
false
;
}
...
...
@@ -111,9 +110,9 @@ public class Baijia {
return
dataList
;
}
private
static
String
getUkData
(
String
app
_id
,
Proxy
proxy
,
String
cookie
)
{
private
static
String
getUkData
(
String
app
Id
,
ProxyHolder
proxy
,
String
cookie
)
{
String
url
=
"https://author.baidu.com/profile?context={\"from\":0,\"app_id\":\""
+
app
_i
d
+
"\"}&cmdType=&pagelets[]=root&reqID=0&ispeed=1#"
;
+
app
I
d
+
"\"}&cmdType=&pagelets[]=root&reqID=0&ispeed=1#"
;
Map
<
String
,
Object
>
headers
=
new
HashMap
<>();
headers
.
put
(
"Host"
,
"author.baidu.com"
);
headers
.
put
(
"cookie"
,
cookie
);
...
...
src/main/java/com/zhiwei/parse/Douban.java
View file @
3e350f8b
...
...
@@ -2,7 +2,6 @@ package com.zhiwei.parse;
import
static
java
.
util
.
Objects
.
nonNull
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.List
;
...
...
@@ -16,6 +15,7 @@ import org.slf4j.Logger;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.parse.analysis.DoubanCommentAnalysis
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
...
...
@@ -36,7 +36,7 @@ public class Douban {
* @param stime
* @return
*/
public
static
List
<
Map
<
String
,
Object
>>
doubanTopicGetByWord
(
String
word
,
Proxy
proxy
,
String
cookie
,
String
stime
)
{
public
static
List
<
Map
<
String
,
Object
>>
doubanTopicGetByWord
(
String
word
,
Proxy
Holder
proxy
,
String
cookie
,
String
stime
)
{
int
page
=
0
;
int
count
=
20
;
boolean
more
=
true
;
...
...
@@ -105,7 +105,7 @@ public class Douban {
* @param cookie
* @return
*/
public
static
List
<
Map
<
String
,
Object
>>
getDoubanComment
(
String
url
,
Proxy
proxy
,
String
cookie
)
{
public
static
List
<
Map
<
String
,
Object
>>
getDoubanComment
(
String
url
,
Proxy
Holder
proxy
,
String
cookie
)
{
if
(
url
.
contains
(
"#"
))
{
url
=
url
.
split
(
"#"
)[
0
];
}
...
...
src/main/java/com/zhiwei/parse/Maimai.java
View file @
3e350f8b
...
...
@@ -44,7 +44,7 @@ public class Maimai {
* @return
*/
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
Map
<
String
,
Object
>>
getData
(
String
key
,
String
cookie
,
String
time
,
Proxy
proxy
)
{
public
static
List
<
Map
<
String
,
Object
>>
getData
(
String
key
,
String
cookie
,
String
time
,
Proxy
Holder
proxy
)
{
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getMaimaiKeywordHeaderMap
(
cookie
);
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
boolean
f
=
true
;
...
...
@@ -82,7 +82,7 @@ public class Maimai {
* @return
*/
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
Map
<
String
,
Object
>>
getDataByNoName
(
String
key
,
String
cookie
,
String
time
,
Proxy
proxy
)
{
public
static
List
<
Map
<
String
,
Object
>>
getDataByNoName
(
String
key
,
String
cookie
,
String
time
,
Proxy
Holder
proxy
)
{
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getMaimaiKeywordHeaderMap
(
cookie
);
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
boolean
f
=
true
;
...
...
@@ -94,7 +94,7 @@ public class Maimai {
Map
<
String
,
Object
>
map
=
maimaiBywordAnalysis
.
getDataByNoName
(
result
,
time
,
key
);
f
=
(
boolean
)
map
.
get
(
"hasMore"
);
List
<
Map
<
String
,
Object
>>
daList
=
(
List
<
Map
<
String
,
Object
>>)
map
.
get
(
"data"
);
if
(
daList
!=
null
&&
daList
.
size
()
>
0
)
{
if
(
daList
!=
null
&&
!
daList
.
isEmpty
()
)
{
dataList
.
addAll
(
daList
);
url
=
"https://maimai.cn/search/gossips?query="
+
URLEncoder
.
encode
(
key
,
"utf-8"
)+
"&limit=20&offset="
+
i
+
"highlight=true&sortby=time&jsononly=1"
;
i
+=
20
;
...
...
src/main/java/com/zhiwei/parse/QQKB.java
View file @
3e350f8b
...
...
@@ -12,7 +12,6 @@ import org.slf4j.LoggerFactory;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.bean.QQkbUser
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HttpClient
;
...
...
@@ -32,7 +31,7 @@ public class QQKB {
* @param cookie
* @return
*/
public
static
List
<
Map
<
String
,
Object
>>
getQQAccountData
(
String
child
,
String
cookie
,
Proxy
proxy
)
{
public
static
List
<
Map
<
String
,
Object
>>
getQQAccountData
(
String
child
,
String
cookie
,
Proxy
Holder
proxy
)
{
String
url
=
"http://r.cnews.qq.com/getSubNewsIndex"
;
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getQQAccountHeaderMap
(
cookie
);
Map
<
String
,
Object
>
paramMap
=
HeadGet
.
getQQAccountOneParamMap
(
child
);
...
...
src/main/java/com/zhiwei/parse/TXNews.java
View file @
3e350f8b
package
com
.
zhiwei
.
parse
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.HashMap
;
...
...
@@ -31,7 +30,7 @@ public class TXNews {
public
static
boolean
txNewshasMoreData
=
true
;
private
static
HttpBoot
httpBoot
=
new
HttpBoot
.
Builder
().
retryTimes
(
2
).
build
();
public
static
List
<
Map
<
String
,
Object
>>
getData
(
String
word
,
String
devid
,
Proxy
proxy
)
{
public
static
List
<
Map
<
String
,
Object
>>
getData
(
String
word
,
String
devid
,
Proxy
Holder
proxy
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getTxNewspage1HeaderMap
(
null
);
Map
<
String
,
Object
>
paramMap
=
HeadGet
.
getTxNewspage1ParamMap
(
word
);
...
...
src/main/java/com/zhiwei/parse/Wangyi.java
View file @
3e350f8b
...
...
@@ -36,7 +36,7 @@ public class Wangyi {
* @param id
* @return
*/
public
static
List
<
Map
<
String
,
Object
>>
getWangyiCommentData
(
String
id
,
Proxy
proxy
)
{
public
static
List
<
Map
<
String
,
Object
>>
getWangyiCommentData
(
String
id
,
Proxy
Holder
proxy
)
{
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getWangyiCommentHeaderMap
(
null
);
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
int
i
=
0
;
...
...
src/main/java/com/zhiwei/parse/analysis/Ts21cnAnalysis.java
View file @
3e350f8b
package
com
.
zhiwei
.
parse
.
analysis
;
import
static
java
.
util
.
Objects
.
isNull
;
import
static
java
.
util
.
Objects
.
nonNull
;
import
java.util.ArrayList
;
...
...
src/test/java/com/zhiwei/crawler/DayuAccountExample.java
View file @
3e350f8b
package
com
.
zhiwei
.
crawler
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Dayu
;
/**
 * Manual example that crawls Dayu account articles for the accounts listed in a local workbook.
 *
 * <p>It calls {@code ProxyFactory.init} against a fixed ZooKeeper address and reads/writes a
 * workbook at a fixed Windows path, so it only works on a machine prepared for it.
 */
public class DayuAccountExample {

    /** Workbook the account rows are imported from and the crawled articles are exported to. */
    private static final String WORKBOOK_PATH = "D:\\crawlerdata\\自媒体\\大鱼历史文章.xlsx";

    /**
     * Imports the {@code "body"} rows of the workbook, crawls every row's account through
     * {@code Dayu.getDayuAccountData} and exports each result with the account name as the
     * second {@code exportExcel} argument.
     *
     * <p>Rows whose {@code mid} and {@code name} cells are both empty are skipped.
     */
    @Test
    public void dayuAccountTest() {
        //https://api.m.sm.cn/rest?method=Subscribe.list&format=html&from=wh10331&uc_biz_str=S:custom%7CC:search%7CN:true
        ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);

        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        Map<String, Object> workbook = poi.importExcel(WORKBOOK_PATH, 0);

        // NOTE(review): assumes the "body" entry is a list of row maps - confirm against PoiExcelUtil.
        @SuppressWarnings("unchecked")
        List<Map<String, Object>> rows = (List<Map<String, Object>>) workbook.get("body");
        if (rows == null) {
            // Nothing was imported, so there is no account to crawl.
            return;
        }

        List<String> headList = new ArrayList<>();
        headList.add("title");
        headList.add("time");
        headList.add("content");
        headList.add("source");
        headList.add("url");

        for (Map<String, Object> row : rows) {
            // Use the values of the current row; the previous hard-coded debug account
            // made every iteration crawl the same account and disabled the check below.
            String mid = cellText(row, "mid");
            String name = cellText(row, "name");
            if (mid.isEmpty() && name.isEmpty()) {
                continue;
            }
            List<Map<String, Object>> dataList = Dayu.getDayuAccountData(mid, name, null, null);
            poi.exportExcel(WORKBOOK_PATH, name, headList, dataList);
        }
    }

    /** Returns the cell as text, mapping a missing cell to an empty string instead of "null". */
    private static String cellText(Map<String, Object> row, String column) {
        Object value = row.get(column);
        return value == null ? "" : value.toString();
    }
}
//
package com.zhiwei.crawler;
//
//
import java.util.ArrayList;
//
import java.util.List;
//
import java.util.Map;
//
//
import org.junit.Test;
//
//
import com.zhiwei.common.config.GroupType;
//
import com.zhiwei.crawler.proxy.ProxyFactory;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.Dayu;
//
//
public class DayuAccountExample {
//
//
//
@Test
//
public void dayuAccountTest() {
//
//https://api.m.sm.cn/rest?method=Subscribe.list&format=html&from=wh10331&uc_biz_str=S:custom%7CC:search%7CN:true
//
ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
//
//
//
String mid = "0a8b2360fd8b4ded971cd324a56d32f0";
//
//
String name = "大鱼海棠雨";
//
String startTime = "2017-01-01 00:00:00";
//
String path = "D:\\crawlerdata\\自媒体\\大鱼历史文章.xlsx";
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
Map<String,Object> map = poi.importExcel(path, 0);
//
List<Map<String,Object>> lists = (List<Map<String, Object>>) map.get("body");
//
List<String> headList = new ArrayList<String>();
//
headList.add("title");
//
headList.add("time");
//
headList.add("content");
//
headList.add("source");
//
headList.add("url");
//
//
headList.add("content_id");
//
//
headList.add("origin_id");
//
//
headList.add("xss_item_id");
//
for(Map<String,Object> data : lists) {
//
String mid = data.get("mid")+"";
//
String name = data.get("name")+"";
//
mid = "7b345070c4124574b9cbcab8c4a1aeb8";
//
name = "国魂";
//
if(mid.length() < 1 && name.length() < 1) {
//
continue;
//
}
//
List<Map<String,Object>> dataList = Dayu.getDayuAccountData(mid,name,null,null);
//
poi.exportExcel(path, name, headList, dataList);
//
}
//
//
//
}
//
//
//
}
src/test/java/com/zhiwei/crawler/YidainzixunByWordExample.java
View file @
3e350f8b
package
com
.
zhiwei
.
crawler
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Yidianzixun
;
import
com.zhiwei.util.WordReadFile
;
/**
 * Manual example that searches Yidianzixun for every keyword in a local word file and exports
 * all collected rows into one workbook. Paths are fixed local Windows paths.
 */
public class YidainzixunByWordExample {

    /**
     * Calls {@code Yidianzixun.getYidianzixunDataByWord} once per keyword, printing the size of
     * each result and the running total, then exports everything in a single call.
     */
    @Test
    public void yidianzixunByWordTest() {
        List<String> keywords = WordReadFile.getWords("D://crawlerdata/关键词.txt");
        List<Map<String, Object>> collected = new ArrayList<>();

        for (String keyword : keywords) {
            List<Map<String, Object>> hits = Yidianzixun.getYidianzixunDataByWord(keyword, null);
            System.out.println(hits.size());
            collected.addAll(hits);
            System.out.println(collected.size());
        }

        PoiExcelUtil excel = PoiExcelUtil.getInstance();

        // Column keys handed to exportExcel, in output order.
        String[] columnKeys = {
            "docid", "title", "comment_count", "summary", "source", "wm_copyright", "time", "url"
        };
        List<String> columns = new ArrayList<>();
        for (String key : columnKeys) {
            columns.add(key);
        }

        System.out.println(collected.size());
        excel.exportExcel("D://crawlerdata/一点资讯-软博会.xlsx", "asd", columns, collected);
    }
}
//
package com.zhiwei.crawler;
//
//
import java.util.ArrayList;
//
import java.util.List;
//
import java.util.Map;
//
//
import org.junit.Test;
//
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.Yidianzixun;
//
import com.zhiwei.util.WordReadFile;
//
//
public class YidainzixunByWordExample {
//
//
@Test
//
public void yidianzixunByWordTest() {
//
//
List<String> wordList = WordReadFile.getWords("D://crawlerdata/关键词.txt");
//
List<Map<String,Object>> listAll = new ArrayList<Map<String,Object>>();
//
for(String word : wordList) {
//
List<Map<String,Object>> dataList = Yidianzixun.getYidianzixunDataByWord(word,null);
//
System.out.println(dataList.size());
//
listAll.addAll(dataList);
//
System.out.println(listAll.size());
//
}
//
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
List<String> headList = new ArrayList<String>();
//
headList.add("docid");
//
headList.add("title");
//
headList.add("comment_count");
//
headList.add("summary");
//
headList.add("source");
//
headList.add("wm_copyright");
//
headList.add("time");
//
headList.add("url");
//
System.out.println(listAll.size());
//
poi.exportExcel("D://crawlerdata/一点资讯-软博会.xlsx", "asd", headList, listAll);
//
//
//
}
//
//
//
}
src/test/java/com/zhiwei/hsitory/YidianzixunAccountExample.java
View file @
3e350f8b
package
com
.
zhiwei
.
hsitory
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Yidianzixun
;
/**
 * Manual example that fetches the articles of one Yidianzixun channel and exports them to a
 * workbook at a fixed local Windows path.
 */
public class YidianzixunAccountExample {

    /**
     * Initialises the proxy factory, requests the data of channel {@code m190159} through
     * {@code ProxyHolder.NAT_HEAVY_PROXY} and exports the returned rows.
     */
    @Test
    public void yidianzixunAccountTest() {
        ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);

        final String channel = "m190159";
        final String since = "2007-01-01 00:00:00";
        List<Map<String, Object>> articles =
                Yidianzixun.getYidianzixunAccountData(channel, since, ProxyHolder.NAT_HEAVY_PROXY, null);

        PoiExcelUtil excel = PoiExcelUtil.getInstance();

        // Column keys handed to exportExcel, in output order.
        String[] columnKeys = {"title", "time", "comment_count", "ctype", "source", "url", "summary"};
        List<String> columns = new ArrayList<>();
        for (String key : columnKeys) {
            columns.add(key);
        }

        excel.exportExcel(
                "D://crawlerdata//历史文章采集/一点资讯-新华社中国新三板.xlsx",
                "新华社中国新三板",
                columns,
                articles);
    }
}
//
package com.zhiwei.hsitory;
//
//
import java.util.ArrayList;
//
import java.util.List;
//
import java.util.Map;
//
//
import org.junit.Test;
//
//
import com.zhiwei.common.config.GroupType;
//
import com.zhiwei.crawler.proxy.ProxyFactory;
//
import com.zhiwei.crawler.proxy.ProxyHolder;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.Yidianzixun;
//
//
//
public class YidianzixunAccountExample {
//
//
@Test
//
public void yidianzixunAccountTest() {
//
ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",GroupType.PROVIDER);
//
String channelid = "m190159";
//
String startTime = "2007-01-01 00:00:00";
//
List<Map<String,Object>> dataList = Yidianzixun.getYidianzixunAccountData(channelid, startTime,ProxyHolder.NAT_HEAVY_PROXY,null);
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
List<String> headList = new ArrayList<String>();
//
headList.add("title");
//
headList.add("time");
//
headList.add("comment_count");
//
headList.add("ctype");
//
headList.add("source");
//
headList.add("url");
//
headList.add("summary");
//
poi.exportExcel("D://crawlerdata//历史文章采集/一点资讯-新华社中国新三板.xlsx", "新华社中国新三板", headList, dataList);
//
}
//
//
//
}
src/test/java/com/zhiwei/keyword/XueqiuKeyWord.java
View file @
3e350f8b
package
com
.
zhiwei
.
keyword
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Xueqiu
;
/**
 * Manual example that queries Xueqiu for each keyword of a {@code |}-separated list and exports
 * the combined rows to a workbook at a fixed local Windows path.
 */
public class XueqiuKeyWord {

    /**
     * Splits the keyword list on {@code |}, calls {@code Xueqiu.getData} per keyword with a fixed
     * end time and cookie, prints each keyword and its result size, then exports all rows.
     */
    @Test
    public void f() {
        ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);

        final String keywordList = "软博会|软件博览会";
        final String endTime = "2018-01-01 00:00:00";
        // NOTE(review): hard-coded session cookie; it appears to contain login tokens - confirm it should be in source control.
        final String cookie = "aliyungf_tc=AQAAADi3G3I7GAEAgtgMPF/W3FeFauXk; device_id=3221abfd50f9f8be2abca9182c338c9e; Hm_lvt_1db88642e346389874251b5a1eded6e3=1561962809; s=d911zeymzo; remember=1; remember.sig=K4F3faYzmVuqC0iXIERCQf55g2Y; xq_a_token=b2a8d68c56272cca3b71e1bcaf04b744b16e9497; xq_a_token.sig=hj7OrhYid1GuAaF-AmLYrPuyFDk; xqat=b2a8d68c56272cca3b71e1bcaf04b744b16e9497; xqat.sig=gker8dcZJ2Ez4s58eIGPa5zCd0w; xq_r_token=b0ad7b059411acac8c9e2f0a40336804bb60d047; xq_r_token.sig=DwNSLIigE8xtwQKLygagcM28Dd0; xq_is_login=1; xq_is_login.sig=J3LxgPVPUzbBg3Kee_PquUfih7Q; u=1273068356; u.sig=UyqQW-br8hcsOyT6anFmS_SFpCs; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1561963129";

        String[] keywords = keywordList.split("\\|");
        PoiExcelUtil excel = PoiExcelUtil.getInstance();
        List<Map<String, Object>> collected = new ArrayList<>();

        for (int idx = 0; idx < keywords.length; idx++) {
            String keyword = keywords[idx];
            System.out.println(keyword);
            List<Map<String, Object>> hits = Xueqiu.getData(keyword, endTime, null, cookie);
            System.out.println(keyword + " ---- " + hits.size());
            collected.addAll(hits);
        }

        // Column keys handed to exportExcel, in output order.
        String[] columnKeys = {"title", "time", "content", "uper", "url", "likeCount", "replyCount"};
        List<String> columns = new ArrayList<>();
        for (String key : columnKeys) {
            columns.add(key);
        }

        excel.exportExcel(
                "D:\\crawlerdata\\自媒体\\雪球网页采集-软博会-relevance.xlsx",
                "马化腾",
                columns,
                collected);
    }
}
//
package com.zhiwei.keyword;
//
//
import java.util.ArrayList;
//
import java.util.List;
//
import java.util.Map;
//
//
import org.junit.Test;
//
//
import com.zhiwei.common.config.GroupType;
//
import com.zhiwei.crawler.proxy.ProxyFactory;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.Xueqiu;
//
//
public class XueqiuKeyWord {
//
//
@Test
//
public void f() {
//
ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
//
String word = "软博会|软件博览会";
//
String endTime = "2018-01-01 00:00:00";
//
String cookie = "aliyungf_tc=AQAAADi3G3I7GAEAgtgMPF/W3FeFauXk; device_id=3221abfd50f9f8be2abca9182c338c9e; Hm_lvt_1db88642e346389874251b5a1eded6e3=1561962809; s=d911zeymzo; remember=1; remember.sig=K4F3faYzmVuqC0iXIERCQf55g2Y; xq_a_token=b2a8d68c56272cca3b71e1bcaf04b744b16e9497; xq_a_token.sig=hj7OrhYid1GuAaF-AmLYrPuyFDk; xqat=b2a8d68c56272cca3b71e1bcaf04b744b16e9497; xqat.sig=gker8dcZJ2Ez4s58eIGPa5zCd0w; xq_r_token=b0ad7b059411acac8c9e2f0a40336804bb60d047; xq_r_token.sig=DwNSLIigE8xtwQKLygagcM28Dd0; xq_is_login=1; xq_is_login.sig=J3LxgPVPUzbBg3Kee_PquUfih7Q; u=1273068356; u.sig=UyqQW-br8hcsOyT6anFmS_SFpCs; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1561963129";
//
//
//
//
String[] words = word.split("\\|");
//
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
//
for(String w : words) {
//
System.out.println(w);
//
//
List<Map<String, Object>> dataList = Xueqiu.getData(w, endTime, null, cookie);
//
System.out.println(w + " ---- " + dataList.size());
//
bodyList.addAll(dataList);
//
}
//
List<String> headList = new ArrayList<String>();
//
headList.add("title");
//
headList.add("time");
//
headList.add("content");
//
headList.add("uper");
//
headList.add("url");
//
headList.add("likeCount");
//
headList.add("replyCount");
//
poi.exportExcel("D:\\crawlerdata\\自媒体\\雪球网页采集-软博会-relevance.xlsx", "马化腾", headList, bodyList);
//
//
}
//
}
src/test/java/com/zhiwei/shipin/AiqiyiTest.java
View file @
3e350f8b
package
com
.
zhiwei
.
shipin
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Aiqiyi
;
import
com.zhiwei.util.WordReadFile
;
/**
 * Manual example that searches Aiqiyi for every keyword in a local word file and exports the
 * combined rows to a workbook at a fixed local Windows path.
 */
public class AiqiyiTest {

    /**
     * Calls {@code Aiqiyi.getAiqiyiByWordData} per keyword through
     * {@code ProxyHolder.NAT_HEAVY_PROXY}, keeps only non-null, non-empty results and exports them.
     */
    @Test
    public void aiqiyiTest() {
        ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);

        List<String> keywords = WordReadFile.getWords("D://crawlerdata//关键词.txt");
        List<Map<String, Object>> collected = new ArrayList<>();
        for (String keyword : keywords) {
            List<Map<String, Object>> hits =
                    Aiqiyi.getAiqiyiByWordData(keyword, ProxyHolder.NAT_HEAVY_PROXY);
            if (hits == null || hits.isEmpty()) {
                // Nothing to merge for this keyword.
                continue;
            }
            collected.addAll(hits);
        }

        // Column keys handed to exportExcel, in output order.
        String[] columnKeys = {"time", "source", "content", "url", "title", "word"};
        List<String> columns = new ArrayList<>();
        for (String key : columnKeys) {
            columns.add(key);
        }

        PoiExcelUtil excel = PoiExcelUtil.getInstance();
        excel.exportExcel("D://crawlerdata//视频/爱奇艺关键词采集-毓婷-0716.xlsx", "数据", columns, collected);
    }
}
//
package com.zhiwei.shipin;
//
//
import java.util.ArrayList;
//
import java.util.List;
//
import java.util.Map;
//
//
import org.junit.Test;
//
//
import com.zhiwei.common.config.GroupType;
//
import com.zhiwei.crawler.proxy.ProxyFactory;
//
import com.zhiwei.crawler.proxy.ProxyHolder;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.Aiqiyi;
//
import com.zhiwei.util.WordReadFile;
//
//
public class AiqiyiTest {
//
@Test
//
public void aiqiyiTest() {
//
//
ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
//
List<String> wordList = WordReadFile.getWords("D://crawlerdata//关键词.txt");
//
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
//
for(String w : wordList) {
//
List<Map<String,Object>> dataList = Aiqiyi.getAiqiyiByWordData(w,ProxyHolder.NAT_HEAVY_PROXY);
//
if(dataList != null && dataList.size() >= 1) {
//
bodyList.addAll(dataList);
//
}
//
}
//
List<String> headList = new ArrayList<String>();
//
headList.add("time");
//
headList.add("source");
//
headList.add("content");
//
headList.add("url");
//
headList.add("title");
//
headList.add("word");
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
poi.exportExcel("D://crawlerdata//视频/爱奇艺关键词采集-毓婷-0716.xlsx", "数据", headList, bodyList);
//
//
//
//
//
}
//
//
}
src/test/java/com/zhiwei/shipin/BilibiliTest.java
View file @
3e350f8b
package
com
.
zhiwei
.
shipin
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.BiliBili
;
import
com.zhiwei.util.WordReadFile
;
/**
 * Manual example that searches Bilibili for every keyword in a local word file and exports the
 * combined rows to a workbook at a fixed local Windows path.
 */
public class BilibiliTest {

    /**
     * Calls {@code BiliBili.getData} per keyword with a fixed time string and cookie, prints the
     * size of every non-null result, merges it, and finally exports all rows.
     */
    @Test
    public void f() {
        ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);

        List<String> keywords = WordReadFile.getWords("D://crawlerdata//关键词-1.txt");
        List<Map<String, Object>> collected = new ArrayList<Map<String, Object>>();
        final String cookie = "LIVE_BUVID=AUTO8715300758995538; sid=kp5rluge; fts=1530161621; im_notify_type_35324319=0; buvid3=08ABE6AE-5061-4CE5-B34F-1A8AAB64DB3320712infoc; rpdid=olppsmkxmpdoskwoxiwww; finger=edc6ecda; stardustvideo=1; UM_distinctid=164fe68fb31996-01f161c3523abe-6114167a-1fa400-164fe68fb32274";

        for (String keyword : keywords) {
            List<Map<String, Object>> hits =
                    BiliBili.getData(keyword, null, "2019-07-18 00:00:00", cookie);
            if (hits == null) {
                // No result object for this keyword; nothing to print or merge.
                continue;
            }
            System.out.println(keyword + " ----- " + hits.size());
            collected.addAll(hits);
        }

        // Column keys handed to exportExcel, in output order.
        String[] columnKeys = {"submitcount", "playcount", "time", "source", "title", "url", "word"};
        List<String> columns = new ArrayList<String>();
        for (String key : columnKeys) {
            columns.add(key);
        }

        PoiExcelUtil excel = PoiExcelUtil.getInstance();
        excel.exportExcel(
                "D://crawlerdata//视频//bilibili关键词采集数据-吃鸡否-0722.xlsx",
                "B站数据",
                columns,
                collected);
    }
}
//
package com.zhiwei.shipin;
//
//
import java.util.ArrayList;
//
import java.util.List;
//
import java.util.Map;
//
//
import org.junit.Test;
//
//
import com.zhiwei.common.config.GroupType;
//
import com.zhiwei.crawler.proxy.ProxyFactory;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.BiliBili;
//
import com.zhiwei.util.WordReadFile;
//
//
public class BilibiliTest {
//
@Test
//
public void f() {
//
ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
//
List<String> wordList = WordReadFile.getWords("D://crawlerdata//关键词-1.txt");
//
List<Map<String, Object>> bodyList = new ArrayList<>();
//
String cookie = "LIVE_BUVID=AUTO8715300758995538; sid=kp5rluge; fts=1530161621; im_notify_type_35324319=0; buvid3=08ABE6AE-5061-4CE5-B34F-1A8AAB64DB3320712infoc; rpdid=olppsmkxmpdoskwoxiwww; finger=edc6ecda; stardustvideo=1; UM_distinctid=164fe68fb31996-01f161c3523abe-6114167a-1fa400-164fe68fb32274";
//
for (String word : wordList) {
//
List<Map<String, Object>> dataList = BiliBili.getData(word, null, "2019-07-18 00:00:00",
//
cookie);
//
if (dataList != null) {
//
System.out.println(word + " ----- " + dataList.size());
//
bodyList.addAll(dataList);
//
}
//
}
//
List<String> headlist = new ArrayList<>();
//
headlist.add("submitcount");
//
headlist.add("playcount");
//
headlist.add("time");
//
headlist.add("source");
//
headlist.add("title");
//
headlist.add("url");
//
headlist.add("word");
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
poi.exportExcel("D://crawlerdata//视频//bilibili关键词采集数据-吃鸡否-0722.xlsx", "B站数据", headlist, bodyList);
//
//
}
//
}
src/test/java/com/zhiwei/shipin/QQTVTest.java
View file @
3e350f8b
package
com
.
zhiwei
.
shipin
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.shipin.QQTV
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.util.WordReadFile
;
/**
 * Manual example that searches Tencent Video (QQTV) for every keyword in a local word file and
 * exports the combined rows to a workbook at a fixed local Windows path.
 */
public class QQTVTest {

    /**
     * Calls {@code QQTV.getData} per keyword through {@code ProxyHolder.NAT_HEAVY_PROXY}, prints
     * and merges every non-null result, calls {@code ZhiWeiTools.sleep(1000)} after each keyword,
     * and finally exports all rows.
     */
    @Test
    public void f() {
        ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);

        final String since = "2019-04-11 00:00:00";
        List<String> keywords = WordReadFile.getWords("D://crawlerdata//关键词.txt");
        List<Map<String, Object>> collected = new ArrayList<Map<String, Object>>();

        for (String keyword : keywords) {
            List<Map<String, Object>> hits =
                    QQTV.getData(keyword, since, ProxyHolder.NAT_HEAVY_PROXY);
            boolean gotResult = hits != null;
            if (gotResult) {
                System.out.println(keyword + " ----- " + hits.size());
                collected.addAll(hits);
            }
            // The pause runs after every keyword, whether or not a result came back.
            ZhiWeiTools.sleep(1000);
        }

        // Column keys handed to exportExcel, in output order.
        String[] columnKeys = {"playCount", "time", "source", "title", "url", "word"};
        List<String> columns = new ArrayList<String>();
        for (String key : columnKeys) {
            columns.add(key);
        }

        PoiExcelUtil excel = PoiExcelUtil.getInstance();
        excel.exportExcel(
                "D://crawlerdata//视频//腾讯视频关键词采集数据-毓婷-0716.xlsx",
                "腾讯视频数据",
                columns,
                collected);
    }
}
//
package com.zhiwei.shipin;
//
//
import java.util.ArrayList;
//
import java.util.List;
//
import java.util.Map;
//
//
import org.junit.Test;
//
//
import com.zhiwei.common.config.GroupType;
//
import com.zhiwei.crawler.proxy.ProxyFactory;
//
import com.zhiwei.crawler.proxy.ProxyHolder;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.shipin.QQTV;
//
import com.zhiwei.tools.tools.ZhiWeiTools;
//
import com.zhiwei.util.WordReadFile;
//
//
public class QQTVTest {
//
@Test
//
public void f() {
//
ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",GroupType.PROVIDER);
//
String time = "2019-04-11 00:00:00";
//
List<String> wordList = WordReadFile.getWords("D://crawlerdata//关键词.txt");
//
List<Map<String, Object>> bodyList = new ArrayList<>();
//
for (String word : wordList) {
//
List<Map<String, Object>> dataList = QQTV.getData(word,time, ProxyHolder.NAT_HEAVY_PROXY);
//
if (dataList != null) {
//
System.out.println(word + " ----- " + dataList.size());
//
bodyList.addAll(dataList);
//
}
//
ZhiWeiTools.sleep(1000);
//
}
//
List<String> headlist = new ArrayList<>();
//
headlist.add("playCount");
//
headlist.add("time");
//
headlist.add("source");
//
headlist.add("title");
//
headlist.add("url");
//
headlist.add("word");
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
poi.exportExcel("D://crawlerdata//视频//腾讯视频关键词采集数据-毓婷-0716.xlsx", "腾讯视频数据", headlist, bodyList);
//
//
//
//
//
}
//
}
src/test/java/com/zhiwei/shipin/SohuTVTest.java
View file @
3e350f8b
package
com
.
zhiwei
.
shipin
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.shipin.SohuTV
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.util.WordReadFile
;
/**
 * Manual keyword-collection run against Sohu video search.
 *
 * <p>Reads keywords from a local text file, queries {@code SohuTV.sohuTVData}
 * once per keyword, and exports everything that came back to an Excel workbook.
 */
public class SohuTVTest {

    /** Cookie header value handed to every {@code SohuTV.sohuTVData} call. */
    private static final String COOKIE = "SUV=1901101134139015; IPLOC=CN3301; gidinf=x099980109ee0f08567b42835000336ade2ef3762611; fuid=15474616189304048886; newpuid=15474616191372936893; beans_mz_userid=UBThg01XRPg8; pmai=dad35c1c318bdd22; ifoxinstalled=false; beans_freq=1; beans_dmp=%7B%22admaster%22%3A1547461620%2C%22shunfei%22%3A1547461620%2C%22reachmax%22%3A1548816807%2C%22lingji%22%3A1547461620%2C%22yoyi%22%3A1547461620%2C%22ipinyou%22%3A1547461620%2C%22ipinyou_admaster%22%3A1547461620%2C%22miaozhen%22%3A1548816807%2C%22diantong%22%3A1547461620%2C%22huayang%22%3A1547461620%7D; beans_dmp_done=1; reqtype=pc; sokey=%5B%7B%22key%22%3A%22%E7%BE%8E%E5%9B%A2%22%7D%2C%7B%22key%22%3A%22%E5%B8%AE%E5%AE%9D%E9%80%82%22%7D%2C%7B%22key%22%3A%22%E5%B8%AE%E5%AE%9D%E9%80%82%20%E4%BA%8C%E5%99%81%E8%8B%B1%22%7D%5D; t=1548817812321";

    /**
     * Collects Sohu video results for every keyword in the keyword file and
     * writes them to the export workbook.
     */
    @Test
    public void f() {
        List<String> keywords = WordReadFile.getWords("D://crawlerdata//关键词.txt");
        List<Map<String, Object>> rows = new ArrayList<>();

        for (String keyword : keywords) {
            // NOTE(review): the third argument is passed as null; it is presumably
            // the proxy (the QQTV test passes a proxy in that position) - confirm.
            List<Map<String, Object>> hits = SohuTV.sohuTVData(keyword, COOKIE, null);
            if (hits != null) {
                System.out.println(keyword + " ----- " + hits.size());
                rows.addAll(hits);
            }
            // Pause between keywords whether or not this one produced data.
            ZhiWeiTools.sleep(1000);
        }

        List<String> columns = headerColumns();
        PoiExcelUtil.getInstance().exportExcel(
                "D://crawlerdata//视频//搜狐视频关键词采集数据-毓婷-0716.xlsx",
                "搜狐数据",
                columns,
                rows);
    }

    /** Builds the ordered list of column keys passed to the Excel export. */
    private static List<String> headerColumns() {
        List<String> columns = new ArrayList<>();
        columns.add("playCount");
        columns.add("time");
        columns.add("source");
        columns.add("title");
        columns.add("url");
        columns.add("word");
        return columns;
    }
}
//
package com.zhiwei.shipin;
//
//
import java.util.ArrayList;
//
import java.util.List;
//
import java.util.Map;
//
//
import org.junit.Test;
//
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.shipin.SohuTV;
//
import com.zhiwei.tools.tools.ZhiWeiTools;
//
import com.zhiwei.util.WordReadFile;
//
//
public class SohuTVTest {
//
@Test
//
public void f() {
//
List<String> wordList = WordReadFile.getWords("D://crawlerdata//关键词.txt");
//
List<Map<String, Object>> bodyList = new ArrayList<>();
//
String cookie = "SUV=1901101134139015; IPLOC=CN3301; gidinf=x099980109ee0f08567b42835000336ade2ef3762611; fuid=15474616189304048886; newpuid=15474616191372936893; beans_mz_userid=UBThg01XRPg8; pmai=dad35c1c318bdd22; ifoxinstalled=false; beans_freq=1; beans_dmp=%7B%22admaster%22%3A1547461620%2C%22shunfei%22%3A1547461620%2C%22reachmax%22%3A1548816807%2C%22lingji%22%3A1547461620%2C%22yoyi%22%3A1547461620%2C%22ipinyou%22%3A1547461620%2C%22ipinyou_admaster%22%3A1547461620%2C%22miaozhen%22%3A1548816807%2C%22diantong%22%3A1547461620%2C%22huayang%22%3A1547461620%7D; beans_dmp_done=1; reqtype=pc; sokey=%5B%7B%22key%22%3A%22%E7%BE%8E%E5%9B%A2%22%7D%2C%7B%22key%22%3A%22%E5%B8%AE%E5%AE%9D%E9%80%82%22%7D%2C%7B%22key%22%3A%22%E5%B8%AE%E5%AE%9D%E9%80%82%20%E4%BA%8C%E5%99%81%E8%8B%B1%22%7D%5D; t=1548817812321";
//
for (String word : wordList) {
//
List<Map<String, Object>> dataList = SohuTV.sohuTVData(word, cookie, null);
//
if (dataList != null) {
//
System.out.println(word + " ----- " + dataList.size());
//
bodyList.addAll(dataList);
//
}
//
ZhiWeiTools.sleep(1000);
//
}
//
List<String> headlist = new ArrayList<>();
//
headlist.add("playCount");
//
headlist.add("time");
//
headlist.add("source");
//
headlist.add("title");
//
headlist.add("url");
//
headlist.add("word");
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
poi.exportExcel("D://crawlerdata//视频//搜狐视频关键词采集数据-毓婷-0716.xlsx", "搜狐数据", headlist, bodyList);
//
//
}
//
}
src/test/java/com/zhiwei/shipin/YoukuKeyWordTest.java
View file @
3e350f8b
package
com
.
zhiwei
.
shipin
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.Youku
;
import
com.zhiwei.util.WordReadFile
;
/**
 * Manual keyword-collection run against Youku.
 *
 * <p>Initialises the proxy factory, reads keywords from a local text file,
 * queries {@code Youku.getDataList} once per keyword, and exports the combined
 * results to an Excel workbook.
 */
public class YoukuKeyWordTest {

    /**
     * Collects Youku results for every keyword in the keyword file and writes
     * them to the export workbook.
     */
    @Test
    public void f() {
        ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
        List<String> words = WordReadFile.getWords("D://crawlerdata//关键词.txt");
        List<Map<String, Object>> bodyList = new ArrayList<>();
        for (String w : words) {
            System.out.println(w);
            // Declared with the widest type addAll accepts, so this compiles
            // whatever concrete collection type getDataList returns.
            java.util.Collection<? extends Map<String, Object>> dataList = Youku.getDataList(w);
            // addAll(null) throws NullPointerException; skip keywords with no
            // result, matching the null guard in the sibling video tests.
            if (dataList != null) {
                bodyList.addAll(dataList);
            }
        }

        // Column keys, in the order handed to the export.
        List<String> headList = new ArrayList<>();
        headList.add("title");
        headList.add("time");
        headList.add("url");
        headList.add("uper");
        headList.add("word");

        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        poi.exportExcel("D://crawlerdata//视频//优酷数据-毓婷-0716.xlsx", "数据", headList, bodyList);
    }
}
//
package com.zhiwei.shipin;
//
//
import java.util.ArrayList;
//
import java.util.List;
//
import java.util.Map;
//
//
import org.junit.Test;
//
//
import com.zhiwei.common.config.GroupType;
//
import com.zhiwei.crawler.proxy.ProxyFactory;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//
import com.zhiwei.parse.Youku;
//
import com.zhiwei.util.WordReadFile;
//
//
public class YoukuKeyWordTest {
//
@Test
//
public void f() {
//
ProxyFactory.init("zookeeper://192.168.0.36:2181", "local",
//
GroupType.PROVIDER);
//
List<String> words = WordReadFile.getWords("D://crawlerdata//关键词.txt");
//
List<Map<String,Object>> bodyList = new ArrayList<>();
//
for(String w : words) {
//
System.out.println(w);
//
bodyList.addAll(Youku.getDataList(w));
//
}
//
List<String> headList = new ArrayList<>();
//
headList.add("title");
//
headList.add("time");
//
headList.add("url");
//
headList.add("uper");
//
headList.add("word");
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
poi.exportExcel("D://crawlerdata//视频//优酷数据-毓婷-0716.xlsx", "数据", headList, bodyList);
//
//
}
//
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment