Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
A
articlenewscrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
chenweiyang
articlenewscrawler
Commits
b3d545a3
Commit
b3d545a3
authored
Feb 01, 2019
by
yangchen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
b站关键词 和 搜狐等视频采集
parent
05c92686
Show whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
566 additions
and
294 deletions
+566
-294
src/main/java/com/zhiwei/parse/Aiqiyi.java
+8
-8
src/main/java/com/zhiwei/parse/BiliBili.java
+0
-24
src/main/java/com/zhiwei/parse/analysis/AiqiyiByWordAnalysis.java
+29
-59
src/main/java/com/zhiwei/parse/shipin/QQTV.java
+101
-0
src/main/java/com/zhiwei/parse/shipin/SohuTV.java
+80
-0
src/test/java/com/zhiwei/crawler/AiqiyiByWordExample.java
+45
-42
src/test/java/com/zhiwei/crawler/MaimaiBywordExample.java
+3
-3
src/test/java/com/zhiwei/keyword/GftaiTest.java
+33
-33
src/test/java/com/zhiwei/keyword/KuaiTousuTest.java
+38
-38
src/test/java/com/zhiwei/keyword/SinaTousuTest.java
+38
-38
src/test/java/com/zhiwei/keyword/YoukuKeyWordTest.java
+68
-49
src/test/java/com/zhiwei/shipin/BilibiliTest.java
+38
-0
src/test/java/com/zhiwei/shipin/QQTVTest.java
+46
-0
src/test/java/com/zhiwei/shipin/SohuTVTest.java
+39
-0
No files found.
src/main/java/com/zhiwei/parse/Aiqiyi.java
View file @
b3d545a3
...
@@ -27,25 +27,25 @@ public class Aiqiyi {
...
@@ -27,25 +27,25 @@ public class Aiqiyi {
*/
*/
public
static
List
<
Map
<
String
,
Object
>>
getAiqiyiByWordData
(
String
word
,
Proxy
proxy
)
{
public
static
List
<
Map
<
String
,
Object
>>
getAiqiyiByWordData
(
String
word
,
Proxy
proxy
)
{
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getAiqiyiBywordHeaderMap
(
null
);
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getAiqiyiBywordHeaderMap
(
null
);
Map
<
String
,
String
>
headerMap1
=
HeadGet
.
getAiqiyiHeaderMap
(
null
);
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
try
{
try
{
for
(
int
i
=
1
;
i
<=
20
;
i
++)
{
for
(
int
i
=
1
;
i
<=
20
;
i
++)
{
String
url
=
"http://so.iqiyi.com/so/q_"
+
URLEncoder
.
encode
(
word
,
"UTF-8"
)+
"_ctg_%E7%94%9F%E6%B4%BB_t_0_page_"
+
i
+
"_p_1_qc_0_rd__site__m_11_bitrate_?af=true"
;
int
count
=
dataList
.
size
();
String
url
=
"https://so.iqiyi.com/so/q_"
+
URLEncoder
.
encode
(
word
,
"UTF-8"
)+
"_ctg__t_0_page_"
+
i
+
"_p_1_qc_0_rd__site__m_4_bitrate_"
;
System
.
out
.
println
(
url
);
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
proxy
,
headerMap
);
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
proxy
,
headerMap
);
List
<
String
>
urlList
=
aiqiyiByWordAnalysis
.
getAiqiyiUrlList
(
result
);
List
<
Map
<
String
,
Object
>>
map
=
aiqiyiByWordAnalysis
.
getAiqiyiData
(
result
);
for
(
String
newurl
:
urlList
)
{
ZhiWeiTools
.
sleep
(
2000
);
Map
<
String
,
Object
>
map
=
aiqiyiByWordAnalysis
.
getAiqiyiData
(
newurl
,
headerMap1
,
proxy
);
if
(
map
!=
null
)
{
if
(
map
!=
null
)
{
dataList
.
add
(
map
);
dataList
.
addAll
(
map
);
}
}
if
(
count
==
dataList
.
size
())
{
break
;
}
}
System
.
out
.
println
(
"=============="
+
dataList
.
size
());
System
.
out
.
println
(
"=============="
+
dataList
.
size
());
ZhiWeiTools
.
sleep
(
2000
);
}
}
return
dataList
;
return
dataList
;
}
catch
(
UnsupportedEncodingException
e
)
{
}
catch
(
UnsupportedEncodingException
e
)
{
e
.
printStackTrace
();
logger
.
info
(
"采集数据出错:{}"
,
e
.
getMessage
());
logger
.
info
(
"采集数据出错:{}"
,
e
.
getMessage
());
return
dataList
;
return
dataList
;
}
}
...
...
src/main/java/com/zhiwei/parse/BiliBili.java
View file @
b3d545a3
...
@@ -13,10 +13,8 @@ import org.slf4j.LoggerFactory;
...
@@ -13,10 +13,8 @@ import org.slf4j.LoggerFactory;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.HttpRequestBuilder
;
import
com.zhiwei.crawler.core.HttpRequestBuilder
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.analysis.BilibilikeyWordAnalysis
;
import
com.zhiwei.parse.analysis.BilibilikeyWordAnalysis
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.util.WordReadFile
;
import
okhttp3.Headers
;
import
okhttp3.Headers
;
import
okhttp3.Request
;
import
okhttp3.Request
;
...
@@ -34,7 +32,6 @@ public class BiliBili {
...
@@ -34,7 +32,6 @@ public class BiliBili {
Headers
header
=
Headers
.
of
(
"cookie"
,
cookie
,
"Referer"
,
"https://www.bilibili.com/"
,
"Host"
,
"search.bilibili.com"
);
Headers
header
=
Headers
.
of
(
"cookie"
,
cookie
,
"Referer"
,
"https://www.bilibili.com/"
,
"Host"
,
"search.bilibili.com"
);
Request
request
=
HttpRequestBuilder
.
newGetRequest
(
url
,
header
);
Request
request
=
HttpRequestBuilder
.
newGetRequest
(
url
,
header
);
String
result
=
httpBoot
.
syncCall
(
request
,
proxy
).
body
().
string
();
String
result
=
httpBoot
.
syncCall
(
request
,
proxy
).
body
().
string
();
// System.out.println(result);
ZhiWeiTools
.
sleep
(
3000
);
ZhiWeiTools
.
sleep
(
3000
);
Map
<
String
,
Object
>
map
=
BilibilikeyWordAnalysis
.
getData
(
result
);
Map
<
String
,
Object
>
map
=
BilibilikeyWordAnalysis
.
getData
(
result
);
boolean
more
=
(
boolean
)
map
.
get
(
"more"
);
boolean
more
=
(
boolean
)
map
.
get
(
"more"
);
...
@@ -70,28 +67,7 @@ public class BiliBili {
...
@@ -70,28 +67,7 @@ public class BiliBili {
return
Collections
.
emptyList
();
return
Collections
.
emptyList
();
}
}
public
static
void
main
(
String
[]
args
)
{
List
<
String
>
wordList
=
WordReadFile
.
getWords
(
"D://crawlerdata//关键词.txt"
);
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
String
cookie
=
"LIVE_BUVID=AUTO8715300758995538; sid=kp5rluge; fts=1530161621; im_notify_type_35324319=0; buvid3=08ABE6AE-5061-4CE5-B34F-1A8AAB64DB3320712infoc; rpdid=olppsmkxmpdoskwoxiwww; finger=edc6ecda; stardustvideo=1; UM_distinctid=164fe68fb31996-01f161c3523abe-6114167a-1fa400-164fe68fb32274"
;
for
(
String
word
:
wordList
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
BiliBili
.
getData
(
word
,
null
,
cookie
);
if
(
dataList
!=
null
)
{
System
.
out
.
println
(
word
+
" ----- "
+
dataList
.
size
());
bodyList
.
addAll
(
dataList
);
}
}
List
<
String
>
headlist
=
new
ArrayList
<
String
>();
headlist
.
add
(
"submitcount"
);
headlist
.
add
(
"playcount"
);
headlist
.
add
(
"time"
);
headlist
.
add
(
"source"
);
headlist
.
add
(
"title"
);
headlist
.
add
(
"url"
);
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
poi
.
exportExcel
(
"D://crawlerdata//bilibili关键词采集数据-txh.xlsx"
,
"B站数据"
,
headlist
,
bodyList
);
}
}
}
src/main/java/com/zhiwei/parse/analysis/AiqiyiByWordAnalysis.java
View file @
b3d545a3
package
com
.
zhiwei
.
parse
.
analysis
;
package
com
.
zhiwei
.
parse
.
analysis
;
import
java.net.Proxy
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.HashMap
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Map
;
...
@@ -15,82 +15,52 @@ import org.slf4j.LoggerFactory;
...
@@ -15,82 +15,52 @@ import org.slf4j.LoggerFactory;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HttpClient
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Response
;
import
okhttp3.Response
;
public
class
AiqiyiByWordAnalysis
{
public
class
AiqiyiByWordAnalysis
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
AiqiyiByWordAnalysis
.
class
);
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
AiqiyiByWordAnalysis
.
class
);
private
static
HttpBoot
httpBoot
=
new
HttpBoot
();
/**
public
List
<
Map
<
String
,
Object
>>
getAiqiyiData
(
String
result
)
{
*
List
<
Map
<
String
,
Object
>>
dataMap
=
new
ArrayList
<>();
* @Description 解析出所有有用链接
* @param result
* @return
*/
public
List
<
String
>
getAiqiyiUrlList
(
String
result
)
{
List
<
String
>
urlList
=
new
ArrayList
<
String
>();
try
{
try
{
Document
doc
=
Jsoup
.
parse
(
result
);
Document
doc
=
Jsoup
.
parse
(
result
);
Elements
elements
=
doc
.
select
(
"ul.mod_result_list"
).
select
(
"li.list_item"
);
Elements
elements
=
doc
.
select
(
"li.list_item"
);
for
(
Element
element
:
elements
)
{
for
(
Element
element
:
elements
)
{
String
url
=
element
.
select
(
"a.figure-180101"
).
attr
(
"href"
);
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
if
(
url
!=
null
&&
url
.
length
()
>
1
)
{
String
title
=
element
.
select
(
"li"
).
attr
(
"data-widget-searchlist-tvname"
);
urlList
.
add
(
url
);
String
time
=
element
.
select
(
"em.result_info_desc"
).
text
().
split
(
" "
)[
0
];
String
uurl
=
element
.
select
(
"h3.result_title > a"
).
attr
(
"href"
);
map
.
put
(
"time"
,
TimeParse
.
stringFormartDate
(
time
));
map
.
put
(
"url"
,
uurl
);
map
.
put
(
"title"
,
title
);
System
.
out
.
println
(
map
.
toString
());
dataMap
.
add
(
map
);
}
}
}
return
urlList
;
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
return
urlList
;
}
}
public
Map
<
String
,
Object
>
getAiqiyiData
(
String
url
,
Map
<
String
,
String
>
headerMap
,
Proxy
proxy
)
{
Map
<
String
,
Object
>
dataMap
=
new
HashMap
<>();
try
(
Response
response
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
)){
String
result
=
response
.
body
().
string
();
Document
doc
=
Jsoup
.
parse
(
result
);
String
time
=
doc
.
select
(
"#widget-vshort-ptime"
).
text
();
if
(!
time
.
contains
(
"2017"
))
{
return
null
;
}
dataMap
.
put
(
"time"
,
time
.
split
(
"发布时间:"
)[
1
]);
String
source
=
doc
.
select
(
"#widget-vshort-un-inner"
).
attr
(
"title"
);
dataMap
.
put
(
"source"
,
source
);
String
content
=
doc
.
select
(
"#widget-vshort-lesswrap"
).
text
();
dataMap
.
put
(
"content"
,
content
);
dataMap
.
put
(
"url"
,
url
);
String
title
=
doc
.
select
(
"#widget-videotitle"
).
attr
(
"title"
);
String
id
=
result
.
split
(
" tvId: "
)[
1
].
split
(
","
)[
0
];
ZhiWeiTools
.
sleep
(
2000
);
int
count
=
getVideo_count
(
id
,
proxy
);
dataMap
.
put
(
"count"
,
count
);
dataMap
.
put
(
"title"
,
title
);
System
.
out
.
println
(
dataMap
.
toString
());
return
dataMap
;
return
dataMap
;
}
catch
(
Exception
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
"解析出错 {}"
,
e
);
logger
.
error
(
"解析出错 {}"
,
e
);
return
dataMap
;
return
Collections
.
emptyList
()
;
}
}
}
}
public
int
getVideo_count
(
String
id
,
Proxy
proxy
)
{
// public String getSource(String url,ProxyHolder proxy) {
try
{
// Map<String,String> headerMap = HeadGet.getAiqiyiForCountHeaderMap(null);
String
url
=
"http://cache.video.iqiyi.com/jp/pc/"
+
id
+
"/"
;
// System.out.println(url);
Map
<
String
,
String
>
headerMap
=
HeadGet
.
getAiqiyiForCountHeaderMap
(
null
);
// try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy)){
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
proxy
,
headerMap
);
// String result = response.body().string();
String
count
=
result
.
split
(
":"
)[
1
].
split
(
"\\}"
)[
0
];
//// System.out.println(result);
return
Integer
.
valueOf
(
count
);
// return "aiqiyi";
}
catch
(
Exception
e
)
{
// } catch (Exception e) {
return
0
;
// e.printStackTrace();
}
// return "";
}
// }
// }
...
...
src/main/java/com/zhiwei/parse/shipin/QQTV.java
0 → 100644
View file @
b3d545a3
package
com
.
zhiwei
.
parse
.
shipin
;
import
java.net.Proxy
;
import
java.net.URLEncoder
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.Date
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Objects
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Response
;
/**
 * Keyword-search crawler for Tencent Video (v.qq.com).
 *
 * <p>Walks the search result pages for a keyword, opens every result link and extracts the
 * play count, title, time, uploader name and URL from the video page.
 *
 * @ClassName QQTV
 * @Description Tencent Video collection
 * @author byte-zbs
 * @Date 2019-01-30 15:01:47
 * @version 1.0.0
 */
public class QQTV {

    private static final Logger logger = LoggerFactory.getLogger(QQTV.class);

    /** Number of times a single video page is requested before it is given up on. */
    private static final int DETAIL_FETCH_ATTEMPTS = 2;

    private static HttpBoot httpBoot = new HttpBoot();

    /**
     * Collects video records for a search keyword.
     *
     * <p>Pages are requested one after another starting at page 1. Paging stops as soon as a
     * page contributes no new record, or as soon as a page request/parse fails (the records
     * gathered so far are still returned).
     *
     * @param word  search keyword; it is URL-encoded as UTF-8 before being sent
     * @param time  lower bound for the record's {@code "time"} value; the comparison is a plain
     *              {@link String#compareTo(String)}, so both values presumably share a sortable
     *              format such as {@code yyyy-MM-dd HH:mm:ss} - TODO confirm against callers
     * @param proxy proxy used for the search result pages (video pages use
     *              {@code ProxyFactory.getNatProxy()})
     * @return the accepted records, each holding the keys {@code playCount}, {@code title},
     *         {@code time}, {@code source} and {@code url}; never {@code null}
     */
    public static List<Map<String, Object>> getData(String word, String time, ProxyHolder proxy) {
        List<Map<String, Object>> dataList = new ArrayList<>();
        try {
            String url = "https://v.qq.com/x/search/?ses=qid%3DdKzxiFfC7NqpC6z2jq4m-KGeQjb_Th556Yrz24cQaZo1MUTw2PK4XA%26last_query%3D%E7%BE%8E%E5%9B%A2%26tabid_list%3D0%7C1%7C5%7C13%7C11%7C7%7C2%7C3%7C4%7C6%7C12%7C21%7C14%7C17%7C8%7C15%7C20%26tabname_list%3D%E5%85%A8%E9%83%A8%7C%E7%94%B5%E5%BD%B1%7C%E9%9F%B3%E4%B9%90%7C%E8%B4%A2%E7%BB%8F%7C%E6%96%B0%E9%97%BB%7C%E5%85%B6%E4%BB%96%7C%E7%94%B5%E8%A7%86%E5%89%A7%7C%E7%BB%BC%E8%89%BA%7C%E5%8A%A8%E6%BC%AB%7C%E7%BA%AA%E5%BD%95%E7%89%87%7C%E5%A8%B1%E4%B9%90%7C%E6%B1%BD%E8%BD%A6%7C%E4%BD%93%E8%82%B2%7C%E6%B8%B8%E6%88%8F%7C%E5%8E%9F%E5%88%9B%7C%E6%95%99%E8%82%B2%7C%E6%AF%8D%E5%A9%B4%26resolution_tabid_list%3D0%7C1%7C2%7C3%7C4%7C5%26resolution_tabname_list%3D%E5%85%A8%E9%83%A8%7C%E6%A0%87%E6%B8%85%7C%E9%AB%98%E6%B8%85%7C%E8%B6%85%E6%B8%85%7C%E8%93%9D%E5%85%89%7CVR&q="
                    + URLEncoder.encode(word, "UTF-8")
                    + "&stag=4&filter=sort%3D1%26pubfilter%3D0%26duration%3D0%26tabid%3D0%26resolution%3D0&cur=";
            int page = 1;
            while (true) {
                int count = dataList.size();
                try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url + page), proxy)) {
                    String result = response.body().string();
                    Document doc = Jsoup.parse(result);
                    Elements elements = doc.select("div._quickopen");
                    logger.info(" 关键词 {} 量 {} 页 数 {} 此页量 {} ", word, dataList.size(), page, elements.size());
                    for (Element element : elements) {
                        String nurl = element.select("h2.result_title").select("a").attr("href");
                        Map<String, Object> map = getUrlData(nurl, ProxyFactory.getNatProxy());
                        // getUrlData signals failure with an empty map; such a map must not be
                        // stored as a record, and must not count as progress for paging.
                        if (Objects.nonNull(map)
                                && !map.isEmpty()
                                && time.compareTo(String.valueOf(map.get("time"))) < 1) {
                            System.out.println(map.toString());
                            dataList.add(map);
                        }
                        ZhiWeiTools.sleep(50);
                    }
                } catch (Exception e) {
                    logger.error(" 数据采集出错 {} ", e);
                    // The page counter is only advanced after a successful page, so looping
                    // again here would request the same failing page forever.
                    break;
                }
                page++;
                if (count == dataList.size()) {
                    // The page added nothing new: treat it as the end of useful results.
                    break;
                }
            }
        } catch (Exception e) {
            logger.error("QQTV keyword collection failed for word " + word, e);
        }
        return dataList;
    }

    /**
     * Downloads one video page and extracts its record.
     *
     * <p>The uploader name is cut out of the {@code user_name} span, the remaining fields come
     * from the JSON assigned to {@code var VIDEO_INFO} in the page. A page lacking either marker
     * raises an exception inside the attempt and is retried like any other failure.
     *
     * @param url   address of the video page
     * @param proxy proxy used for the request
     * @return a map with {@code playCount}, {@code title}, {@code time}, {@code source} and
     *         {@code url}, or an empty map when every attempt failed
     */
    private static Map<String, Object> getUrlData(String url, Proxy proxy) {
        for (int attempt = 1; attempt <= DETAIL_FETCH_ATTEMPTS; attempt++) {
            try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)) {
                String result = response.body().string();
                String source = result.split("\\<span class=\"user_name\"\\>")[1].split("\\</span\\>")[0];
                result = result.split("var VIDEO_INFO =")[1].split("\\</script\\>")[0];
                JSONObject json = JSONObject.parseObject(result);
                Map<String, Object> map = new HashMap<>();
                map.put("playCount", json.getInteger("view_all_count"));
                map.put("title", json.getString("title"));
                map.put("time", json.getString("video_checkup_time"));
                map.put("source", source);
                map.put("url", url);
                return map;
            } catch (Exception e) {
                logger.warn("QQTV video page attempt " + attempt + " failed for url " + url, e);
            }
        }
        return Collections.emptyMap();
    }
}
src/main/java/com/zhiwei/parse/shipin/SohuTV.java
0 → 100644
View file @
b3d545a3
package
com
.
zhiwei
.
parse
.
shipin
;
import
java.net.Proxy
;
import
java.net.URLEncoder
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.RequestUtils
;
import
com.zhiwei.tools.timeparse.TimeParse
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
okhttp3.Response
;
/**
 * Keyword-search crawler for Sohu TV (so.tv.sohu.com).
 *
 * <p>Every record is read straight from the search result list; the individual video pages
 * are not opened.
 */
public class SohuTV {

    private static final Logger logger = LoggerFactory.getLogger(SohuTV.class);

    private static HttpBoot httpBoot = new HttpBoot();

    /**
     * Collects the search results for a keyword, page by page, until a page adds no record.
     *
     * <p>A page whose request or parsing fails is logged and adds nothing further, which ends
     * the paging as well. The method pauses two seconds after every page.
     *
     * @param word   search keyword; it is URL-encoded as utf-8 before being sent
     * @param cookie value sent as the {@code cookie} request header
     * @param proxy  proxy used for the requests
     * @return the collected records, each holding the keys {@code title}, {@code source},
     *         {@code time}, {@code url} and {@code playCount}; never {@code null}
     */
    public static List<Map<String, Object>> sohuTVData(String word, String cookie, Proxy proxy) {
        List<Map<String, Object>> collected = new ArrayList<>();
        try {
            String searchUrl = "https://so.tv.sohu.com/mts?wd="
                    + URLEncoder.encode(word, "utf-8")
                    + "&c=0&v=0&length=0&limit=0&site=0&o=3&st=0&suged=&p=";
            Map<String, Object> requestHeaders = new HashMap<>();
            requestHeaders.put("cookie", cookie);

            for (int pageNo = 1; ; pageNo++) {
                int sizeBefore = collected.size();
                try (Response response =
                        httpBoot.syncCall(RequestUtils.wrapGet(searchUrl + pageNo, requestHeaders), proxy)) {
                    String html = response.body().string();
                    Document page = Jsoup.parse(html);
                    Elements items = page.select("body > div.wrap.cfix > div.ssList.area").select("li");
                    for (Element item : items) {
                        collected.add(toRecord(item));
                    }
                } catch (Exception e) {
                    logger.error("获取数据出错", e);
                }
                logger.info(" 采集关键词 {} 采集到底 {} 页 ,采集到 {} 条 ", word, pageNo, collected.size());
                ZhiWeiTools.sleep(2000);
                boolean nothingAdded = sizeBefore == collected.size();
                if (nothingAdded) {
                    break;
                }
            }
        } catch (Exception e) {
            logger.error("采集出错 {}", e);
        }
        return collected;
    }

    /**
     * Builds the record for one {@code li} element of the search result list.
     *
     * @param item one entry of the result list
     * @return the record; the {@code time} value is whatever
     *         {@code TimeParse.stringFormartDate} returns for the entry's time text
     */
    private static Map<String, Object> toRecord(Element item) {
        Map<String, Object> record = new HashMap<>();
        Elements titleLink = item.select("strong.lt-title").select("a");
        String title = titleLink.text();
        String uploader = item.select(" p > a.name").text();
        String timeText = item.select(" p > a.tcount").text();
        String href = item.select("strong.lt-title").select("a").attr("href");
        String playCount = item.select("div > a > span.acount").text();
        record.put("title", title);
        record.put("source", uploader);
        record.put("time", TimeParse.stringFormartDate(timeText));
        record.put("url", "https://" + href);
        record.put("playCount", playCount);
        return record;
    }

    /**
     * Manual entry point: runs one collection for a fixed keyword with a fixed cookie and no
     * proxy.
     */
    public static void main(String[] args) {
        String cookie = "SUV=1901101134139015; IPLOC=CN3301; gidinf=x099980109ee0f08567b42835000336ade2ef3762611; fuid=15474616189304048886; newpuid=15474616191372936893; beans_mz_userid=UBThg01XRPg8; pmai=dad35c1c318bdd22; ifoxinstalled=false; beans_freq=1; beans_dmp=%7B%22admaster%22%3A1547461620%2C%22shunfei%22%3A1547461620%2C%22reachmax%22%3A1548816807%2C%22lingji%22%3A1547461620%2C%22yoyi%22%3A1547461620%2C%22ipinyou%22%3A1547461620%2C%22ipinyou_admaster%22%3A1547461620%2C%22miaozhen%22%3A1548816807%2C%22diantong%22%3A1547461620%2C%22huayang%22%3A1547461620%7D; beans_dmp_done=1; reqtype=pc; sokey=%5B%7B%22key%22%3A%22%E7%BE%8E%E5%9B%A2%22%7D%2C%7B%22key%22%3A%22%E5%B8%AE%E5%AE%9D%E9%80%82%22%7D%2C%7B%22key%22%3A%22%E5%B8%AE%E5%AE%9D%E9%80%82%20%E4%BA%8C%E5%99%81%E8%8B%B1%22%7D%5D; t=1548817812321";
        sohuTVData("美团", cookie, null);
    }
}
src/test/java/com/zhiwei/crawler/AiqiyiByWordExample.java
View file @
b3d545a3
//package com.zhiwei.crawler;
package
com
.
zhiwei
.
crawler
;
//
//import java.util.ArrayList;
import
java.util.ArrayList
;
//import java.util.List;
import
java.util.List
;
//import java.util.Map;
import
java.util.Map
;
//
//import org.junit.Test;
import
org.junit.Test
;
//
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import
com.zhiwei.common.config.GroupType
;
//import com.zhiwei.parse.Aiqiyi;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
//
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
//public class AiqiyiByWordExample {
import
com.zhiwei.parse.Aiqiyi
;
//
import
com.zhiwei.util.WordReadFile
;
//
// @Test
/**
 * Manually run example: collects iQiyi search results for every keyword in a local word file
 * and exports them to an Excel workbook.
 */
public class AiqiyiByWordExample {

    /**
     * Initialises the proxy factory, reads the keyword file, collects the records of every
     * keyword (without a proxy) and writes all of them to one sheet.
     */
    @Test
    public void aiqiyiByWordTest() {
        ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);

        List<String> keywords = WordReadFile.getWords("D://crawlerdata//关键词.txt");
        List<Map<String, Object>> rows = new ArrayList<Map<String, Object>>();
        for (String keyword : keywords) {
            List<Map<String, Object>> found = Aiqiyi.getAiqiyiByWordData(keyword, null);
            if (found == null || found.isEmpty()) {
                continue;
            }
            rows.addAll(found);
        }

        // Column order of the exported sheet.
        String[] columnNames = {"count", "time", "source", "content", "url", "title"};
        List<String> columns = new ArrayList<String>();
        for (String columnName : columnNames) {
            columns.add(columnName);
        }

        PoiExcelUtil exporter = PoiExcelUtil.getInstance();
        exporter.exportExcel("D://crawlerdata/爱奇艺关键词采集.xlsx", "数据", columns, rows);
    }
}
src/test/java/com/zhiwei/crawler/MaimaiBywordExample.java
View file @
b3d545a3
...
@@ -11,8 +11,8 @@ public class MaimaiBywordExample {
...
@@ -11,8 +11,8 @@ public class MaimaiBywordExample {
public
static
void
main
(
String
[]
args
)
{
public
static
void
main
(
String
[]
args
)
{
String
word
=
"美团|某团|MT|大众点评|新美大|美团点评"
;
String
word
=
"美团|某团|MT|大众点评|新美大|美团点评"
;
String
cookie
=
"
guid=GxsfBBgZGwQYGx4EGBkeVhsfGB4aHBpWHBkEHRkfBUNYS0xLeQoSEwQSHR8ZBBoEGx0FT0dFWEJpCgNFQUlPbQpPQUNGCgZmZ35iYQIKHBkEHRkfBV5DYUhPfU9GWlprCgMeHH1lfQoRGQQcCn5kClldRU5EQ30CChoEHwVLRkZDUEVn; token=\"7IGuqjEwgJ2gXX5PZ0UYSxvn81Aws6v5OFrwpSErsbctlSd1e/7+AzYEMMMeeFJJ8CKuzcDfAvoCmBm7+jVysA==\"; uid=\"aa0Zx+VbwC41ceG8bxvIefAirs3A3wL6ApgZu/o1crA=\"; _buuid=ba30f54f-57ed-4dd4-af5f-31cb08d2eacf; session=eyJ1IjoiMTczMzMzNTM2Iiwic2VjcmV0IjoiOGtDSnF6VG5QcFk0R3ZmVFB4MThIMW1ZIiwiX2V4cGlyZSI6MTU0ODMwODU0MTMyNCwiX21heEFnZSI6ODY0MDAwMDB9; session.sig=cnQ0i1LwYxhjO3_BvQ4Coh0f9PQ
"
;
String
cookie
=
"
_buuid=ba30f54f-57ed-4dd4-af5f-31cb08d2eacf; sessionid=3oatshv55and4kwcz9gdpie7qdpj27yt; guid=GxsfBBgZGwQYGx4EGBkeVgcYGxkdHxwdGRMcVhwZBB0ZHwVDWEtMS3kKEhMEEh0fGQQaBBsdBU9HRVhCaQoDRUFJT20KT0FDRgoGZmd+YmECChwZBB0ZHwVeQ2FIT31PRlpaawoDHhx9ZX0KERkEHAp+ZApZXUVOREN9AgoaBB8FS0ZGQ1BFZw==; seid=s1548984672861; token=\"Ap1u6QzIdn8FCrohEAEPI86n9mNSKk1qJWlauQ8KeSbn7fDKTu6bN2Yv6B9V19nO8CKuzcDfAvoCmBm7+jVysA==\"; uid=\"aa0Zx+VbwC41ceG8bxvIefAirs3A3wL6ApgZu/o1crA=\"; session=eyJ1IjoiMTczMzMzNTM2Iiwic2VjcmV0IjoibVVlSlRTUW1NdVdUTUUtRjV0SjBZbExtIiwibWlkNDU2ODc2MCI6ZmFsc2UsInN0YXR1cyI6dHJ1ZSwiX2V4cGlyZSI6MTU0OTA3MTEzOTA2NywiX21heEFnZSI6ODY0MDAwMDB9; session.sig=UOz44C2rF-uJFxFvSwHyII5aJxM
"
;
String
time
=
"2019-01-
17
00:00:00"
;
String
time
=
"2019-01-
24
00:00:00"
;
String
[]
words
=
word
.
split
(
"\\|"
);
String
[]
words
=
word
.
split
(
"\\|"
);
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<
Map
<
String
,
Object
>>();
for
(
String
w
:
words
)
{
for
(
String
w
:
words
)
{
...
@@ -32,7 +32,7 @@ public class MaimaiBywordExample {
...
@@ -32,7 +32,7 @@ public class MaimaiBywordExample {
headList
.
add
(
"comment_count"
);
headList
.
add
(
"comment_count"
);
headList
.
add
(
"spreads"
);
headList
.
add
(
"spreads"
);
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
poi
.
exportExcel
(
"D:\\crawlerdata\\自媒体\\脉脉关键词采集-美团-0
123
.xlsx"
,
"脉脉关键词"
,
headList
,
bodyList
);
poi
.
exportExcel
(
"D:\\crawlerdata\\自媒体\\脉脉关键词采集-美团-0
201
.xlsx"
,
"脉脉关键词"
,
headList
,
bodyList
);
}
}
}
}
src/test/java/com/zhiwei/keyword/GftaiTest.java
View file @
b3d545a3
package
com
.
zhiwei
.
keyword
;
//
package com.zhiwei.keyword;
//
import
java.util.ArrayList
;
//
import java.util.ArrayList;
import
java.util.List
;
//
import java.util.List;
import
java.util.Map
;
//
import java.util.Map;
//
import
org.testng.annotations.Test
;
//
import org.testng.annotations.Test;
//
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import
com.zhiwei.parse.Gftai
;
//
import com.zhiwei.parse.Gftai;
//
public
class
GftaiTest
{
//
public class GftaiTest {
@Test
//
@Test
public
void
f
()
{
//
public void f() {
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
String
words
=
"美团 催收|美团 借款|美团 还钱|三快 借钱|三快 生活费|三快 借款|美团 征信
"
;
// String words = "花呗|借呗|京东白条|京东金条|京东金融
";
String
[]
ws
=
words
.
split
(
"\\|"
);
//
String[] ws = words.split("\\|");
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<>();
//
List<Map<String,Object>> bodyList = new ArrayList<>();
for
(
String
word
:
ws
)
{
//
for(String word : ws) {
List
<
Map
<
String
,
Object
>>
list
=
Gftai
.
getData
(
word
,
null
);
//
List<Map<String,Object>> list = Gftai.getData(word, null);
bodyList
.
addAll
(
list
);
//
bodyList.addAll(list);
System
.
out
.
println
(
word
+
" --------- "
+
bodyList
.
size
());
//
System.out.println(word + " --------- " + bodyList.size());
}
//
}
List
<
String
>
headList
=
new
ArrayList
<>();
//
List<String> headList = new ArrayList<>();
headList
.
add
(
"title"
);
//
headList.add("title");
headList
.
add
(
"time"
);
//
headList.add("time");
headList
.
add
(
"content"
);
//
headList.add("content");
headList
.
add
(
"source"
);
//
headList.add("source");
headList
.
add
(
"url"
);
//
headList.add("url");
//
poi
.
exportExcel
(
"D:\\crawlerdata\\自媒体\\投诉\\国富泰信用-美团-2
.xlsx"
,
"数据"
,
headList
,
bodyList
);
// poi.exportExcel("D:\\crawlerdata\\自媒体\\投诉\\国富泰信用-美团-3
.xlsx", "数据", headList, bodyList);
}
//
}
}
//
}
src/test/java/com/zhiwei/keyword/KuaiTousuTest.java
View file @
b3d545a3
package
com
.
zhiwei
.
keyword
;
//
package com.zhiwei.keyword;
//
import
java.util.ArrayList
;
//
import java.util.ArrayList;
import
java.util.List
;
//
import java.util.List;
import
java.util.Map
;
//
import java.util.Map;
//
import
org.testng.annotations.Test
;
//
import org.testng.annotations.Test;
//
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import
com.zhiwei.parse.Gftai
;
//
import com.zhiwei.parse.Gftai;
import
com.zhiwei.parse.KuaiTousu
;
//
import com.zhiwei.parse.KuaiTousu;
//
public
class
KuaiTousuTest
{
//
public class KuaiTousuTest {
@Test
//
@Test
public
void
f
()
{
//
public void f() {
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
String
words
=
"美团 催收|美团 借款|美团 还钱|三快 借钱|三快 生活费|三快 借款|美团 征信
"
;
// String words = "花呗|借呗|京东白条|京东金条|京东金融
";
String
[]
ws
=
words
.
split
(
"\\|"
);
//
String[] ws = words.split("\\|");
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<>();
//
List<Map<String,Object>> bodyList = new ArrayList<>();
for
(
String
word
:
ws
)
{
//
for(String word : ws) {
List
<
Map
<
String
,
Object
>>
list
=
KuaiTousu
.
getData
(
word
,
null
);
//
List<Map<String,Object>> list = KuaiTousu.getData(word, null);
bodyList
.
addAll
(
list
);
//
bodyList.addAll(list);
System
.
out
.
println
(
word
+
" --------- "
+
bodyList
.
size
());
//
System.out.println(word + " --------- " + bodyList.size());
}
//
}
List
<
String
>
headList
=
new
ArrayList
<>();
//
List<String> headList = new ArrayList<>();
headList
.
add
(
"title"
);
//
headList.add("title");
headList
.
add
(
"time"
);
//
headList.add("time");
headList
.
add
(
"content"
);
//
headList.add("content");
headList
.
add
(
"source"
);
//
headList.add("source");
headList
.
add
(
"url"
);
//
headList.add("url");
//
poi
.
exportExcel
(
"D:\\crawlerdata\\自媒体\\投诉\\新浪广东快投诉-美团-2
.xlsx"
,
"数据"
,
headList
,
bodyList
);
// poi.exportExcel("D:\\crawlerdata\\自媒体\\投诉\\新浪广东快投诉-美团-3
.xlsx", "数据", headList, bodyList);
//
//
//
//
}
//
}
}
//
}
src/test/java/com/zhiwei/keyword/SinaTousuTest.java
View file @
b3d545a3
package
com
.
zhiwei
.
keyword
;
//
package com.zhiwei.keyword;
//
import
java.util.ArrayList
;
//
import java.util.ArrayList;
import
java.util.List
;
//
import java.util.List;
import
java.util.Map
;
//
import java.util.Map;
//
import
org.testng.annotations.Test
;
//
import org.testng.annotations.Test;
//
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import
com.zhiwei.parse.KuaiTousu
;
//
import com.zhiwei.parse.KuaiTousu;
import
com.zhiwei.parse.SinaTousu
;
//
import com.zhiwei.parse.SinaTousu;
//
public
class
SinaTousuTest
{
//
public class SinaTousuTest {
//
@Test
//
@Test
public
void
getSinaTousuData
()
{
//
public void getSinaTousuData() {
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
//
PoiExcelUtil poi = PoiExcelUtil.getInstance();
String
words
=
"美团 催收|美团 借款|美团 还钱|三快 借钱|三快 生活费|三快 借款|美团 征信
"
;
// String words = "花呗|借呗|京东白条|京东金条|京东金融
";
String
[]
ws
=
words
.
split
(
"\\|"
);
//
String[] ws = words.split("\\|");
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<>();
//
List<Map<String,Object>> bodyList = new ArrayList<>();
for
(
String
word
:
ws
)
{
//
for(String word : ws) {
List
<
Map
<
String
,
Object
>>
list
=
SinaTousu
.
getSinaTousuData
(
word
,
null
,
"2018-07-01 00:00:00"
);
//
List<Map<String,Object>> list = SinaTousu.getSinaTousuData(word, null, "2018-07-01 00:00:00");
bodyList
.
addAll
(
list
);
//
bodyList.addAll(list);
System
.
out
.
println
(
word
+
" --------- "
+
bodyList
.
size
());
//
System.out.println(word + " --------- " + bodyList.size());
}
//
}
List
<
String
>
headList
=
new
ArrayList
<>();
//
List<String> headList = new ArrayList<>();
headList
.
add
(
"title"
);
//
headList.add("title");
headList
.
add
(
"time"
);
//
headList.add("time");
headList
.
add
(
"content"
);
//
headList.add("content");
headList
.
add
(
"source"
);
//
headList.add("source");
headList
.
add
(
"url"
);
//
headList.add("url");
//
poi
.
exportExcel
(
"D:\\crawlerdata\\自媒体\\投诉\\黑猫投诉-美团-2
.xlsx"
,
"数据"
,
headList
,
bodyList
);
// poi.exportExcel("D:\\crawlerdata\\自媒体\\投诉\\黑猫投诉-美团-3
.xlsx", "数据", headList, bodyList);
//
//
//
}
//
}
}
//
}
src/test/java/com/zhiwei/keyword/YoukuKeyWordTest.java
View file @
b3d545a3
//package com.zhiwei.keyword;
package
com
.
zhiwei
.
keyword
;
//
//import java.util.ArrayList;
import
java.util.ArrayList
;
//import java.util.List;
import
java.util.List
;
//import java.util.Map;
import
java.util.Map
;
//
//import org.testng.annotations.Test;
import
org.testng.annotations.Test
;
//
//import com.zhiwei.common.config.GroupType;
import
com.zhiwei.common.config.GroupType
;
//import com.zhiwei.crawler.proxy.ProxyFactory;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
//import com.zhiwei.parse.Youku;
import
com.zhiwei.parse.Youku
;
//
//public class YoukuKeyWordTest {
/**
 * Manually run example: collects Youku search results for a fixed list of keywords and exports
 * them to an Excel workbook.
 */
public class YoukuKeyWordTest {

    /**
     * Initialises the proxy factory, collects the records of every comma-separated keyword and
     * writes all of them to one sheet.
     */
    @Test
    public void f() {
        ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);

        // Comma-separated keyword list; split below into the individual search terms.
        String word = "帮宝适 二噁英,"
                + "帮宝适 二恶英,"
                + "帮宝适 甲醛,"
                + "帮宝适 荧光,"
                + "帮宝适 致癌,"
                + "帮宝适 有毒,"
                + "帮宝适 超标,"
                + "帮宝适 防腐剂,"
                + "帮宝适 起诉,"
                + "帮宝适 伤害,"
                + "帮宝适 气味,"
                + "帮宝适 异味,"
                + "帮宝适 起坨,"
                + "帮宝适 异物,"
                + "帮宝适 漏,"
                + "帮宝适 刺鼻,"
                + "帮宝适 勒,"
                + "帮宝适 脱皮,"
                + "帮宝适 划伤,"
                + "绿帮 二噁英,"
                + "绿帮 二恶英,"
                + "绿帮 甲醛,"
                + "绿帮 荧光,"
                + "绿帮 致癌,"
                + "绿帮 有毒,"
                + "绿帮 超标,"
                + "绿帮 起诉,"
                + "绿帮 气味,"
                + "绿帮 异味,"
                + "绿帮 异物,"
                + "绿帮 漏,"
                + "绿帮 刺鼻,"
                + "绿帮 勒,"
                + "绿帮 脱皮";

        List<Map<String, Object>> rows = new ArrayList<>();
        for (String keyword : word.split(",")) {
            System.out.println(keyword);
            rows.addAll(Youku.getDataList(keyword));
        }

        // Column order of the exported sheet.
        String[] columnNames = {"title", "time", "url", "uper"};
        List<String> columns = new ArrayList<>();
        for (String columnName : columnNames) {
            columns.add(columnName);
        }

        PoiExcelUtil exporter = PoiExcelUtil.getInstance();
        exporter.exportExcel("C:\\Users\\byte-zbs\\Desktop\\tx\\优酷数据-txh-0121.xlsx", "数据", columns, rows);
    }
}
src/test/java/com/zhiwei/shipin/BilibiliTest.java
0 → 100644
View file @
b3d545a3
package
com
.
zhiwei
.
shipin
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.testng.annotations.Test
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.BiliBili
;
import
com.zhiwei.util.WordReadFile
;
/**
 * Manual keyword crawl against Bilibili.
 *
 * <p>Reads the keyword list from a local text file, calls {@code BiliBili.getData} for each
 * keyword with a fixed cookie string, and passes the merged rows to
 * {@code PoiExcelUtil.exportExcel}.
 */
public class BilibiliTest {

    @Test
    public void f() {
        List<String> keywords = WordReadFile.getWords("D://crawlerdata//关键词.txt");
        List<Map<String, Object>> rows = new ArrayList<>();
        // Cookie header value forwarded unchanged to BiliBili.getData.
        String cookie = "LIVE_BUVID=AUTO8715300758995538; sid=kp5rluge; fts=1530161621; im_notify_type_35324319=0; buvid3=08ABE6AE-5061-4CE5-B34F-1A8AAB64DB3320712infoc; rpdid=olppsmkxmpdoskwoxiwww; finger=edc6ecda; stardustvideo=1; UM_distinctid=164fe68fb31996-01f161c3523abe-6114167a-1fa400-164fe68fb32274";

        for (String keyword : keywords) {
            List<Map<String, Object>> hits = BiliBili.getData(keyword, null, cookie);
            if (hits == null) {
                // Nothing came back for this keyword; move on to the next one.
                continue;
            }
            System.out.println(keyword + " ----- " + hits.size());
            rows.addAll(hits);
        }

        // Header names handed to the exporter, in this order.
        List<String> columns = new ArrayList<>();
        for (String column :
                new String[] {"submitcount", "playcount", "time", "source", "title", "url"}) {
            columns.add(column);
        }

        PoiExcelUtil exporter = PoiExcelUtil.getInstance();
        exporter.exportExcel(
                "D://crawlerdata//bilibili关键词采集数据-txh-0130.xlsx", "B站数据", columns, rows);
    }
}
src/test/java/com/zhiwei/shipin/QQTVTest.java
0 → 100644
View file @
b3d545a3
package
com
.
zhiwei
.
shipin
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.testng.annotations.Test
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.shipin.QQTV
;
import
com.zhiwei.parse.shipin.SohuTV
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.util.WordReadFile
;
/**
 * Manual keyword crawl against Tencent Video (QQTV).
 *
 * <p>Initialises the proxy factory, reads the keyword list from a local text file, calls
 * {@code QQTV.getData} for each keyword and passes the merged rows to
 * {@code PoiExcelUtil.exportExcel}.
 */
public class QQTVTest {

    @Test
    public void f() {
        ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);

        // Time argument forwarded to QQTV.getData.
        // NOTE(review): presumably the earliest publish time to collect — confirm in QQTV.
        String time = "2018-07-01 00:00:00";
        List<String> wordList = WordReadFile.getWords("D://crawlerdata//关键词.txt");
        List<Map<String, Object>> bodyList = new ArrayList<>();

        for (String word : wordList) {
            List<Map<String, Object>> dataList = QQTV.getData(word, time, ProxyHolder.NAT_PROXY);
            if (dataList != null) {
                System.out.println(word + " ----- " + dataList.size());
                bodyList.addAll(dataList);
            }
            // Pause between keywords whether or not data came back.
            // NOTE(review): the argument is presumably milliseconds — confirm in ZhiWeiTools.
            ZhiWeiTools.sleep(1000);
        }

        // Header names handed to the exporter, in this order.
        List<String> headlist = new ArrayList<>();
        headlist.add("playCount");
        headlist.add("time");
        headlist.add("source");
        headlist.add("title");
        headlist.add("url");

        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        // The sheet name was "B站数据" (Bilibili data), copied over from BilibiliTest; this
        // workbook holds Tencent Video rows, so the sheet is named to match the file.
        poi.exportExcel(
                "D://crawlerdata//腾讯视频关键词采集数据-txh-0130.xlsx",
                "腾讯视频数据",
                headlist,
                bodyList);
    }
}
src/test/java/com/zhiwei/shipin/SohuTVTest.java
0 → 100644
View file @
b3d545a3
package
com
.
zhiwei
.
shipin
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.testng.annotations.Test
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.BiliBili
;
import
com.zhiwei.parse.shipin.SohuTV
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
import
com.zhiwei.util.WordReadFile
;
/**
 * Manual keyword crawl against Sohu TV.
 *
 * <p>Reads the keyword list from a local text file, calls {@code SohuTV.sohuTVData} for each
 * keyword with a fixed cookie string, and passes the merged rows to
 * {@code PoiExcelUtil.exportExcel}.
 */
public class SohuTVTest {

    @Test
    public void f() {
        List<String> wordList = WordReadFile.getWords("D://crawlerdata//关键词.txt");
        List<Map<String, Object>> bodyList = new ArrayList<>();
        // Cookie header value forwarded unchanged to SohuTV.sohuTVData.
        String cookie = "SUV=1901101134139015; IPLOC=CN3301; gidinf=x099980109ee0f08567b42835000336ade2ef3762611; fuid=15474616189304048886; newpuid=15474616191372936893; beans_mz_userid=UBThg01XRPg8; pmai=dad35c1c318bdd22; ifoxinstalled=false; beans_freq=1; beans_dmp=%7B%22admaster%22%3A1547461620%2C%22shunfei%22%3A1547461620%2C%22reachmax%22%3A1548816807%2C%22lingji%22%3A1547461620%2C%22yoyi%22%3A1547461620%2C%22ipinyou%22%3A1547461620%2C%22ipinyou_admaster%22%3A1547461620%2C%22miaozhen%22%3A1548816807%2C%22diantong%22%3A1547461620%2C%22huayang%22%3A1547461620%7D; beans_dmp_done=1; reqtype=pc; sokey=%5B%7B%22key%22%3A%22%E7%BE%8E%E5%9B%A2%22%7D%2C%7B%22key%22%3A%22%E5%B8%AE%E5%AE%9D%E9%80%82%22%7D%2C%7B%22key%22%3A%22%E5%B8%AE%E5%AE%9D%E9%80%82%20%E4%BA%8C%E5%99%81%E8%8B%B1%22%7D%5D; t=1548817812321";

        for (String word : wordList) {
            List<Map<String, Object>> dataList = SohuTV.sohuTVData(word, cookie, null);
            if (dataList != null) {
                System.out.println(word + " ----- " + dataList.size());
                bodyList.addAll(dataList);
            }
            // Pause between keywords whether or not data came back.
            // NOTE(review): the argument is presumably milliseconds — confirm in ZhiWeiTools.
            ZhiWeiTools.sleep(1000);
        }

        // Header names handed to the exporter, in this order.
        List<String> headlist = new ArrayList<>();
        headlist.add("playCount");
        headlist.add("time");
        headlist.add("source");
        headlist.add("title");
        headlist.add("url");

        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        // The sheet name was "B站数据" (Bilibili data), copied over from BilibiliTest; this
        // workbook holds Sohu TV rows, so the sheet is named to match the file.
        poi.exportExcel(
                "D://crawlerdata//搜狐视频关键词采集数据-txh-0130.xlsx",
                "搜狐视频数据",
                headlist,
                bodyList);
    }
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment