Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
A
articlenewscrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
chenweiyang
articlenewscrawler
Commits
d7dce3fc
Commit
d7dce3fc
authored
Apr 03, 2020
by
cwy
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
本地提交
parent
3fdd0d2c
Hide whitespace changes
Inline
Side-by-side
Showing
15 changed files
with
226 additions
and
225 deletions
+226
-225
pom.xml
+2
-2
src/main/java/com/zhiwei/parse/Baijia.java
+4
-2
src/main/java/com/zhiwei/parse/BiliBili.java
+2
-2
src/main/java/com/zhiwei/parse/Maimai.java
+5
-7
src/main/java/com/zhiwei/parse/Xueqiu.java
+60
-60
src/main/java/com/zhiwei/parse/analysis/BaijiaAccountAnalysis.java
+0
-2
src/main/java/com/zhiwei/parse/shipin/QQTV.java
+1
-0
src/test/java/com/zhiwei/CrawlerTest.java
+16
-16
src/test/java/com/zhiwei/hsitory/BTimeHistoryExample.java
+18
-18
src/test/java/com/zhiwei/hsitory/BaijiaAccountExample.java
+16
-15
src/test/java/com/zhiwei/hsitory/KuaiDataHistoryExample.java
+18
-18
src/test/java/com/zhiwei/hsitory/ZakerHistoryExample.java
+20
-20
src/test/java/com/zhiwei/hsitory/ZhihuArticleHistoryExample.java
+18
-18
src/test/java/com/zhiwei/shipin/BilibiliTest.java
+43
-42
src/test/java/com/zhiwei/shipin/QQTVTest.java
+3
-3
No files found.
pom.xml
View file @
d7dce3fc
...
...
@@ -3,7 +3,7 @@
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiwei
</groupId>
<artifactId>
articlenewscrawler
</artifactId>
<version>
0.2.
3
-SNAPSHOT
</version>
<version>
0.2.
4
-SNAPSHOT
</version>
<name>
articlenewscrawler
</name>
<description>
采集凤凰,一点资讯,搜狐历时文章和文章评论
</description>
...
...
@@ -21,7 +21,7 @@
<dependency>
<groupId>
com.zhiwei.crawler
</groupId>
<artifactId>
crawler-core
</artifactId>
<version>
0.
5.5.6
-SNAPSHOT
</version>
<version>
0.
6.0.1
-SNAPSHOT
</version>
<scope>
provided
</scope>
</dependency>
</dependencies>
...
...
src/main/java/com/zhiwei/parse/Baijia.java
View file @
d7dce3fc
...
...
@@ -77,9 +77,10 @@ public class Baijia {
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
Map
<
String
,
Object
>>
getBaijiaAccountByBaiduData
(
String
appId
,
String
name
,
String
startTime
,
String
cookie
,
ProxyHolder
proxy
)
{
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>
();
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
headerMap
.
put
(
"cookie"
,
cookie
);
headerMap
.
put
(
"Host"
,
"mbd.baidu.com"
);
String
uk
=
getUkData
(
appId
,
proxy
,
cookie
);
if
(
Objects
.
isNull
(
uk
))
{
return
Collections
.
emptyList
();
...
...
@@ -90,13 +91,14 @@ public class Baijia {
for
(
int
i
=
1
;
i
<
3
;
i
++)
{
try
{
String
url
=
"https://mbd.baidu.com/webpage?tab=article&num=10&uk="
+
uk
+
"&ctime="
+
ctime
+
"&type=newhome&action=dynamic&format=json"
;
logger
.
info
(
"ctime = {} url === {}"
,
ctime
,
url
);
String
result
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
();
Map
<
String
,
Object
>
dMap
=
baijiaAccountAnalysis
.
getBaijiaAccountData3
(
result
,
name
,
startTime
);
List
<
Map
<
String
,
Object
>>
dList
=
(
List
<
Map
<
String
,
Object
>>)
dMap
.
get
(
"data"
);
if
(
Objects
.
nonNull
(
dList
))
dataList
.
addAll
(
dList
);
logger
.
info
(
"{} 数据采集结果 {}"
,
appId
,
dataList
.
size
());
if
(!(
boolean
)
dMap
.
get
(
"more"
))
{
if
(!(
boolean
)
dMap
.
get
(
"more"
)
||
ctime
.
equals
(
String
.
valueOf
(
String
.
valueOf
(
dMap
.
get
(
"ctime"
))))
)
{
f
=
false
;
}
ctime
=
String
.
valueOf
(
dMap
.
get
(
"ctime"
));
...
...
src/main/java/com/zhiwei/parse/BiliBili.java
View file @
d7dce3fc
...
...
@@ -23,11 +23,11 @@ public class BiliBili {
private
static
final
Logger
logger
=
LoggerFactory
.
getLogger
(
BiliBili
.
class
);
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
Map
<
String
,
Object
>>
getData
(
String
word
,
Proxy
proxy
,
String
endTime
,
String
cookie
)
{
public
static
List
<
Map
<
String
,
Object
>>
getData
(
String
word
,
Proxy
proxy
,
String
endTime
,
String
cookie
,
String
type
)
{
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<>();
try
{
//
String
url
=
"https://search.bilibili.com/all?keyword="
+
URLEncoder
.
encode
(
word
,
"utf-8"
)+
"&single_column=1&order=
click
&duration=0&tids_1=0"
;
String
url
=
"https://search.bilibili.com/all?keyword="
+
URLEncoder
.
encode
(
word
,
"utf-8"
)+
"&single_column=1&order=
"
+
type
+
"
&duration=0&tids_1=0"
;
System
.
out
.
println
(
url
);
Headers
header
=
Headers
.
of
(
"cookie"
,
cookie
,
"Referer"
,
"https://www.bilibili.com/"
,
"Host"
,
"search.bilibili.com"
);
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
ProxyHolder
.
NAT_HEAVY_PROXY
,
header
);
...
...
src/main/java/com/zhiwei/parse/Maimai.java
View file @
d7dce3fc
...
...
@@ -16,9 +16,7 @@ import org.slf4j.LoggerFactory;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.httpclient.HeadGet
;
...
...
@@ -141,11 +139,11 @@ public class Maimai {
return
Collections
.
emptyMap
();
}
public
static
void
main
(
String
[]
args
)
{
ProxyFactory
.
init
(
"zookeeper://192.168.0.36:2181"
,
"local"
,
GroupType
.
PROVIDER
,
10000002L
);
String
url
=
"https://maimai.cn/web/feed_detail?fid=1353566056&efid=QTa45Y1e-oQzyn1dZ5ozlQ"
;
System
.
out
.
println
(
getMaiaiCount2
(
url
,
ProxyHolder
.
NAT_HEAVY_PROXY
));
}
//
public static void main(String[] args) {
//
ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER, 10000002L);
//
String url = "https://maimai.cn/web/feed_detail?fid=1353566056&efid=QTa45Y1e-oQzyn1dZ5ozlQ";
//
System.out.println(getMaiaiCount2(url, ProxyHolder.NAT_HEAVY_PROXY));
//
}
/**
* https://maimai.cn/web/feed_detail?fid=1304191535&efid=0CQbJXhoYLXdC87NFIkRMA
...
...
src/main/java/com/zhiwei/parse/Xueqiu.java
View file @
d7dce3fc
...
...
@@ -165,70 +165,70 @@ public class Xueqiu {
List
<
Map
<
String
,
Object
>>
resultList
=
new
ArrayList
<>();
int
page
=
1
;
boolean
next
=
true
;
int
errCount
=
1
;
while
(
next
)
{
for
(
int
j
=
0
;
j
<
4
;
j
++)
{
try
{
String
url
=
"https://xueqiu.com/v4/statuses/user_timeline.json?page="
+
page
+
"&user_id="
+
uid
+
"&type=0"
;
logger
.
info
(
"重试次数:{},第{}页,JSON地址为:{}"
,
j
,
page
,
url
);
JSONObject
json
=
getJson
(
url
,
cookie
);
//获取json数据
JSONArray
jsonArray
=
json
.
getJSONArray
(
"statuses"
);
if
(
Objects
.
nonNull
(
jsonArray
))
{
for
(
int
i
=
0
;
i
<
jsonArray
.
size
();
i
++)
{
JSONObject
ob
=
jsonArray
.
getJSONObject
(
i
);
//得到json数组的第i个数组
String
timeBefore
=
ob
.
getString
(
"timeBefore"
);
//时间
Date
date
=
TimeParse
.
stringFormartDate
(
timeBefore
);
Date
endDate
=
TimeParse
.
stringFormartDate
(
endTime
);
//获取规定时间内的数据
if
(
date
.
getTime
()
<
endDate
.
getTime
())
{
next
=
false
;
break
;
}
String
screenName
=
ob
.
getJSONObject
(
"user"
).
getString
(
"screen_name"
);
//用户名
String
source
=
ob
.
getString
(
"source"
);
//发布消息的手机平台
String
description
=
ob
.
getString
(
"description"
).
replaceAll
(
"<.*?>"
,
""
);
//帖子正文
int
retweetCount
=
ob
.
getInteger
(
"retweet_count"
);
//转发数
int
replyCount
=
ob
.
getInteger
(
"reply_count"
);
//评论数
int
likeCount
=
ob
.
getInteger
(
"like_count"
);
//点赞数
String
targetLink
=
"https://xueqiu.com"
+
ob
.
getString
(
"target"
);
//帖子链接
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
map
.
put
(
"screenName"
,
screenName
);
//用户名
map
.
put
(
"uid"
,
uid
);
//帖子链接
map
.
put
(
"time"
,
date
);
//时间
map
.
put
(
"source"
,
source
);
//发布消息的手机平台
map
.
put
(
"description"
,
description
);
//帖子正文
map
.
put
(
"retweetCount"
,
retweetCount
);
//转发数
map
.
put
(
"replyCount"
,
replyCount
);
//评论数
map
.
put
(
"likeCount"
,
likeCount
);
//点赞数
map
.
put
(
"targetLink"
,
targetLink
);
//帖子链接
map
.
put
(
"pt"
,
"雪球"
);
//帖子链接
resultList
.
add
(
map
);
}
}
//超出时间则停止采集
if
(
next
)
{
int
maxPag
=
json
.
getInteger
(
"maxPage"
);
//获取最大页数
boolean
flag
=
page
<
maxPag
;
//当前页数小于最大页数时,翻页
if
(
flag
)
{
page
++;
//页数+1
}
else
{
try
{
String
url
=
"https://xueqiu.com/v4/statuses/user_timeline.json?page="
+
page
+
"&user_id="
+
uid
+
"&type=0"
;
logger
.
info
(
"第{}页,JSON地址为:{}"
,
page
,
url
);
JSONObject
json
=
getJson
(
url
,
cookie
);
//获取json数据
JSONArray
jsonArray
=
json
.
getJSONArray
(
"statuses"
);
if
(
Objects
.
nonNull
(
jsonArray
))
{
for
(
int
i
=
0
;
i
<
jsonArray
.
size
();
i
++)
{
JSONObject
ob
=
jsonArray
.
getJSONObject
(
i
);
//得到json数组的第i个数组
String
timeBefore
=
ob
.
getString
(
"timeBefore"
);
//时间
Date
date
=
TimeParse
.
stringFormartDate
(
timeBefore
);
Date
endDate
=
TimeParse
.
stringFormartDate
(
endTime
);
//获取规定时间内的数据
if
(
date
.
getTime
()
<
endDate
.
getTime
())
{
next
=
false
;
break
;
}
}
break
;
}
catch
(
Exception
e
)
{
logger
.
error
(
"解析JSON出错 "
,
e
);
}
}
String
screenName
=
ob
.
getJSONObject
(
"user"
).
getString
(
"screen_name"
);
//用户名
String
source
=
ob
.
getString
(
"source"
);
//发布消息的手机平台
String
description
=
ob
.
getString
(
"description"
).
replaceAll
(
"<.*?>"
,
""
);
//帖子正文
int
retweetCount
=
ob
.
getInteger
(
"retweet_count"
);
//转发数
int
replyCount
=
ob
.
getInteger
(
"reply_count"
);
//评论数
int
likeCount
=
ob
.
getInteger
(
"like_count"
);
//点赞数
String
targetLink
=
"https://xueqiu.com"
+
ob
.
getString
(
"target"
);
//帖子链接
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
map
.
put
(
"screenName"
,
screenName
);
//用户名
map
.
put
(
"uid"
,
uid
);
//帖子链接
map
.
put
(
"time"
,
date
);
//时间
map
.
put
(
"source"
,
source
);
//发布消息的手机平台
map
.
put
(
"description"
,
description
);
//帖子正文
map
.
put
(
"retweetCount"
,
retweetCount
);
//转发数
map
.
put
(
"replyCount"
,
replyCount
);
//评论数
map
.
put
(
"likeCount"
,
likeCount
);
//点赞数
map
.
put
(
"targetLink"
,
targetLink
);
//帖子链接
map
.
put
(
"pt"
,
"雪球"
);
//帖子链接
resultList
.
add
(
map
);
}
}
//超出时间则停止采集
if
(
next
)
{
int
maxPag
=
json
.
getInteger
(
"maxPage"
);
//获取最大页数
boolean
flag
=
page
<
maxPag
;
//当前页数小于最大页数时,翻页
if
(
flag
)
{
page
++;
//页数+1
errCount
=
1
;
}
}
errCount
++;
}
catch
(
Exception
e
)
{
logger
.
error
(
"解析JSON出错 "
,
e
);
errCount
++;
}
if
(
errCount
>
4
)
break
;
}
return
resultList
;
...
...
src/main/java/com/zhiwei/parse/analysis/BaijiaAccountAnalysis.java
View file @
d7dce3fc
...
...
@@ -63,14 +63,12 @@ public class BaijiaAccountAnalysis {
more
=
true
;
rmap
.
put
(
"ctime"
,
json
.
getJSONObject
(
"data"
).
getJSONObject
(
"query"
).
getString
(
"ctime"
));
}
// String name = json.getJSONObject("data").getJSONObject("author").getString("display_name");
for
(
int
i
=
0
;
i
<
jsonArry
.
size
();
i
++)
{
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
JSONObject
data
=
jsonArry
.
getJSONObject
(
i
).
getJSONObject
(
"itemData"
);
String
id
=
data
.
getString
(
"article_id"
);
int
t
=
data
.
getInteger
(
"updated_at"
);
String
time
=
TimeParse
.
dateFormartString
(
new
Date
(
t
*
1000L
),
"yyyy-MM-dd HH:mm:ss"
);
System
.
out
.
println
(
time
);
if
(
startTime
!=
null
&&
startTime
.
length
()
>
1
&&
time
.
compareTo
(
startTime
)
<
1
)
{
more
=
false
;
continue
;
...
...
src/main/java/com/zhiwei/parse/shipin/QQTV.java
View file @
d7dce3fc
...
...
@@ -40,6 +40,7 @@ public class QQTV {
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
try
{
String
url
=
"https://v.qq.com/x/search/?ses=qid%3DdKzxiFfC7NqpC6z2jq4m-KGeQjb_Th556Yrz24cQaZo1MUTw2PK4XA%26last_query%3D%E7%BE%8E%E5%9B%A2%26tabid_list%3D0%7C1%7C5%7C13%7C11%7C7%7C2%7C3%7C4%7C6%7C12%7C21%7C14%7C17%7C8%7C15%7C20%26tabname_list%3D%E5%85%A8%E9%83%A8%7C%E7%94%B5%E5%BD%B1%7C%E9%9F%B3%E4%B9%90%7C%E8%B4%A2%E7%BB%8F%7C%E6%96%B0%E9%97%BB%7C%E5%85%B6%E4%BB%96%7C%E7%94%B5%E8%A7%86%E5%89%A7%7C%E7%BB%BC%E8%89%BA%7C%E5%8A%A8%E6%BC%AB%7C%E7%BA%AA%E5%BD%95%E7%89%87%7C%E5%A8%B1%E4%B9%90%7C%E6%B1%BD%E8%BD%A6%7C%E4%BD%93%E8%82%B2%7C%E6%B8%B8%E6%88%8F%7C%E5%8E%9F%E5%88%9B%7C%E6%95%99%E8%82%B2%7C%E6%AF%8D%E5%A9%B4%26resolution_tabid_list%3D0%7C1%7C2%7C3%7C4%7C5%26resolution_tabname_list%3D%E5%85%A8%E9%83%A8%7C%E6%A0%87%E6%B8%85%7C%E9%AB%98%E6%B8%85%7C%E8%B6%85%E6%B8%85%7C%E8%93%9D%E5%85%89%7CVR&q="
+
URLEncoder
.
encode
(
word
,
"UTF-8"
)+
"&stag=4&filter=sort%3D1%26pubfilter%3D0%26duration%3D0%26tabid%3D0%26resolution%3D0&cur="
;
System
.
out
.
println
(
url
);
int
page
=
1
;
while
(
true
)
{
int
count
=
dataList
.
size
();
...
...
src/test/java/com/zhiwei/CrawlerTest.java
View file @
d7dce3fc
package
com
.
zhiwei
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.parse.TechTx
;
public
class
CrawlerTest
{
public
static
void
main
(
String
[]
args
)
{
ProxyFactory
.
init
(
"zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181"
,
"local"
,
GroupType
.
PROVIDER
,
10000002L
);
String
cookie
=
"bid=rymxzs5aojg; ps=y; ll=\"118173\"; __utmc=30149280; dbcl2=\"188038058:9IHyVcSobVc\"; ck=_RvF; push_noty_num=0; push_doumail_num=0; __utmv=30149280.18803; douban-fav-remind=1; __yadk_uid=qLflXyj3R14ro9e0cLoZOQlJoMGVN32j; douban-profile-remind=1; _vwo_uuid_v2=D85F60C118B0AF465035D9CC7BBFDA7A6|4bf255e1e3a2e9aeede3708192f5f1bc; __utmz=30149280.1543564973.3.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; ap_v=0,6.0; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1543823236%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DPk19bxnhsVWwfnrcnwT0PquON7D1JpLsbfSu9rRowalyi4pOeM3rMHKFaJo9jJF7%26wd%3D%26eqid%3De7f262650001ef98000000045c00e64f%22%5D; _pk_id.100001.8cb4=6828fef49f6bcf34.1543559455.5.1543823236.1543820463.; _pk_ses.100001.8cb4=*; __utma=30149280.824403997.1543559458.1543818802.1543823236.6; __utmt=1; __utmb=30149280.5.7.1543823236"
;
String
url
=
"E2S95LEA0008856R"
;
TechTx
.
getTechTxComment
(
url
,
null
);
}
}
//
package com.zhiwei;
//
//
import com.zhiwei.common.config.GroupType;
//
import com.zhiwei.crawler.proxy.ProxyFactory;
//
import com.zhiwei.parse.TechTx;
//
//
public class CrawlerTest {
//
//
public static void main(String[] args) {
//
ProxyFactory.init("zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181", "local", GroupType.PROVIDER , 10000002L);
//
String cookie = "bid=rymxzs5aojg; ps=y; ll=\"118173\"; __utmc=30149280; dbcl2=\"188038058:9IHyVcSobVc\"; ck=_RvF; push_noty_num=0; push_doumail_num=0; __utmv=30149280.18803; douban-fav-remind=1; __yadk_uid=qLflXyj3R14ro9e0cLoZOQlJoMGVN32j; douban-profile-remind=1; _vwo_uuid_v2=D85F60C118B0AF465035D9CC7BBFDA7A6|4bf255e1e3a2e9aeede3708192f5f1bc; __utmz=30149280.1543564973.3.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; ap_v=0,6.0; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1543823236%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DPk19bxnhsVWwfnrcnwT0PquON7D1JpLsbfSu9rRowalyi4pOeM3rMHKFaJo9jJF7%26wd%3D%26eqid%3De7f262650001ef98000000045c00e64f%22%5D; _pk_id.100001.8cb4=6828fef49f6bcf34.1543559455.5.1543823236.1543820463.; _pk_ses.100001.8cb4=*; __utma=30149280.824403997.1543559458.1543818802.1543823236.6; __utmt=1; __utmb=30149280.5.7.1543823236";
//
String url = "E2S95LEA0008856R";
//
TechTx.getTechTxComment(url, null);
//
}
//
//
}
src/test/java/com/zhiwei/hsitory/BTimeHistoryExample.java
View file @
d7dce3fc
package
com
.
zhiwei
.
hsitory
;
import
java.util.List
;
import
java.util.Map
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.parse.BTime
;
public
class
BTimeHistoryExample
{
public
static
void
main
(
String
[]
args
)
{
ProxyFactory
.
init
(
"zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181"
,
"local"
,
GroupType
.
PROVIDER
,
10000002L
);
List
<
Map
<
String
,
Object
>>
dataList
=
BTime
.
getHistoryData
(
"1608238"
,
0L
);
// dataList.forEach(System.out::println);
}
}
//
package com.zhiwei.hsitory;
//
//
import java.util.List;
//
import java.util.Map;
//
//
import com.zhiwei.common.config.GroupType;
//
import com.zhiwei.crawler.proxy.ProxyFactory;
//
import com.zhiwei.parse.BTime;
//
//
public class BTimeHistoryExample {
//
//
public static void main(String[] args) {
//
ProxyFactory.init("zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181", "local", GroupType.PROVIDER , 10000002L);
//
List<Map<String, Object>> dataList = BTime.getHistoryData("1608238", 0L);
//
//
dataList.forEach(System.out::println);
//
}
//
//
}
src/test/java/com/zhiwei/hsitory/BaijiaAccountExample.java
View file @
d7dce3fc
...
...
@@ -6,37 +6,38 @@
//
//import org.junit.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.crawler.proxy.ProxyHolder;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Baijia;
//import com.zhiwei.proxy.config.SimpleConfig;
//
//public class BaijiaAccountExample {
//
// @Test
// public void test3() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
// // , "local", GroupType.PROVIDER , 10000002L
// ProxyFactory.init(SimpleConfig.builder().registry("zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181").appName("actool").appId(10000002).group("local").build());
// String path = "D://crawlerdata//自媒体/百家号采集.xlsx";
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// String startTime = "2018-05-01 00:00:00";
// Map<String,Object> map = poi.importExcel(path, 0);
// List<Map<String,Object>> list = (List<Map<String, Object>>) map.get("body");
//
//
Map<String,Object> map = poi.importExcel(path, 0);
//
//
List<Map<String,Object>> list = (List<Map<String, Object>>) map.get("body");
// List<Map<String,Object>> bodyList = new ArrayList<>();
// for(Map<String,Object> m : list) {
// try {
// String app_id = m.get("id").toString();
//
app_id = "1602674438508810
";
// String cookie = "BAIDUID=
7D453C932433A93F7AD1F3B8ABC8B0E1:FG=1; BIDUPSID=7D453C932433A93F7AD1F3B8ABC8B0E1; PSTM=1570766401; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BDSFRCVID=eH-OJeCmH6VwoRJwCdmehrB7leKK0gOTHllvCh8hmwLadLIVJeC6EG0Ptf8g0KubFTPRogKK0gOTH6KF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF=tJkD_I_hJKt3qn7I5KToh4Athxob2bbXHDo-LIvHWT6cOR5JhfA-3R-e046f3-3L5CbH5D3s5lvvhb3O3M7ShbKdMa732RbrKCnraxQF5l8-sq0x0bOte-bQypoa0q3TLDOMahkM5h7xOKQoQlPK5JkgMx6MqpQJQeQ-5KQN3KJmfbL9bT3YjjISKx-_J5LJJxK; H_PS_PSSID=1442_21103_29567_29699_29220_22158; delPer=0; PSINO=5; ZD_ENTRY=baidu; yjs_js_security_passport=9687699d4b0965c0be1e6e312fc59ff5cf3d03a2_1571106914_js; Hmery-Time=121539387
8";
//
//
for(Map<String,Object> m : list) {
//
//
try {
//
//
String app_id = m.get("id").toString();
//
String app_id = "1565848819560927
";
// String cookie = "BAIDUID=
A46414BD701A3738E17E0212A6C2FEEA:FG=1; Hmery-Time=2269711404; BIDUPSID=A46414BD701A3738E17E0212A6C2FEEA; PSTM=1583375258; delPer=0; H_PS_PSSID=30972_1439_21095_30839_30998_30823; BDORZ=B490B5EBF6F3CD402E515D22BCDA159
8";
// System.out.println(app_id);
// List<Map<String,Object>> lists = Baijia.getBaijiaAccountByBaiduData(app_id,
"聚富财经", startTime,
cookie, ProxyHolder.NAT_HEAVY_PROXY);
// List<Map<String,Object>> lists = Baijia.getBaijiaAccountByBaiduData(app_id,
"聚富财经", startTime,
cookie, ProxyHolder.NAT_HEAVY_PROXY);
// if(lists != null) {
// bodyList.addAll(lists);
// }
// break;
// } catch (Exception e) {
// }
// }
//
//
break;
//
//
} catch (Exception e) {
//
//
}
//
//
}
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("time");
...
...
@@ -44,7 +45,7 @@
// headList.add("url");
// headList.add("content");
// headList.add("read_amount");
// poi.exportExcel("
D://crawlerdata//历史文章采集/
百家号-lxj-2.xlsx", "娱乐资本论", headList, bodyList);
// poi.exportExcel("
E://crawlerdata//历史
百家号-lxj-2.xlsx", "娱乐资本论", headList, bodyList);
// }
//
//}
src/test/java/com/zhiwei/hsitory/KuaiDataHistoryExample.java
View file @
d7dce3fc
package
com
.
zhiwei
.
hsitory
;
import
java.util.List
;
import
java.util.Map
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.parse.KuaiData
;
public
class
KuaiDataHistoryExample
{
public
static
void
main
(
String
[]
args
)
{
ProxyFactory
.
init
(
"zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181"
,
"local"
,
GroupType
.
PROVIDER
,
10000002L
);
List
<
Map
<
String
,
Object
>>
dataList
=
KuaiData
.
getArticleHistory
(
"5c19954ccb14fabc153971e3f924bf36"
,
"2686798288"
,
0L
);
// dataList.forEach(System.out::println);
}
}
//
package com.zhiwei.hsitory;
//
//
import java.util.List;
//
import java.util.Map;
//
//
import com.zhiwei.common.config.GroupType;
//
import com.zhiwei.crawler.proxy.ProxyFactory;
//
import com.zhiwei.parse.KuaiData;
//
//
public class KuaiDataHistoryExample {
//
//
public static void main(String[] args) {
//
ProxyFactory.init("zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181", "local", GroupType.PROVIDER , 10000002L);
//
List<Map<String, Object>> dataList = KuaiData.getArticleHistory("5c19954ccb14fabc153971e3f924bf36", "2686798288", 0L);
//
//
dataList.forEach(System.out::println);
//
}
//
//
}
src/test/java/com/zhiwei/hsitory/ZakerHistoryExample.java
View file @
d7dce3fc
package
com
.
zhiwei
.
hsitory
;
import
java.util.List
;
import
java.util.Map
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.parse.BTime
;
import
com.zhiwei.parse.MyZaker
;
public
class
ZakerHistoryExample
{
public
static
void
main
(
String
[]
args
)
{
ProxyFactory
.
init
(
"zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181"
,
"local"
,
GroupType
.
PROVIDER
,
10000002L
);
List
<
Map
<
String
,
Object
>>
dataList
=
MyZaker
.
getHistoryData
(
"13584"
,
0L
);
// dataList.forEach(System.out::println);
}
}
//
package com.zhiwei.hsitory;
//
//
import java.util.List;
//
import java.util.Map;
//
//
import com.zhiwei.common.config.GroupType;
//
import com.zhiwei.crawler.proxy.ProxyFactory;
//
import com.zhiwei.parse.BTime;
//
import com.zhiwei.parse.MyZaker;
//
//
public class ZakerHistoryExample {
//
//
//
public static void main(String[] args) {
//
ProxyFactory.init("zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181", "local", GroupType.PROVIDER , 10000002L);
//
List<Map<String, Object>> dataList = MyZaker.getHistoryData("13584", 0L);
//
//
dataList.forEach(System.out::println);
//
}
//
//
}
src/test/java/com/zhiwei/hsitory/ZhihuArticleHistoryExample.java
View file @
d7dce3fc
package
com
.
zhiwei
.
hsitory
;
import
java.util.List
;
import
java.util.Map
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.parse.Zhihu
;
public
class
ZhihuArticleHistoryExample
{
public
static
void
main
(
String
[]
args
)
{
ProxyFactory
.
init
(
"zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181"
,
"local"
,
GroupType
.
PROVIDER
,
10000002L
);
List
<
Map
<
String
,
Object
>>
dataList
=
Zhihu
.
getArticleHistory
(
"da-bai-xin-wen-27"
,
0L
);
// dataList.forEach(System.out::println);
}
}
//
package com.zhiwei.hsitory;
//
//
import java.util.List;
//
import java.util.Map;
//
//
import com.zhiwei.common.config.GroupType;
//
import com.zhiwei.crawler.proxy.ProxyFactory;
//
import com.zhiwei.parse.Zhihu;
//
//
public class ZhihuArticleHistoryExample {
//
//
public static void main(String[] args) {
//
ProxyFactory.init("zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181", "local", GroupType.PROVIDER , 10000002L);
//
List<Map<String, Object>> dataList = Zhihu.getArticleHistory("da-bai-xin-wen-27", 0L);
//
//
dataList.forEach(System.out::println);
//
}
//
//
}
src/test/java/com/zhiwei/shipin/BilibiliTest.java
View file @
d7dce3fc
package
com
.
zhiwei
.
shipin
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Map
;
import
org.junit.Test
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.parse.BiliBili
;
import
com.zhiwei.util.WordReadFile
;
public
class
BilibiliTest
{
@Test
public
void
f
()
{
ProxyFactory
.
init
(
"zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181"
,
"local"
,
GroupType
.
PROVIDER
,
10000002L
);
List
<
String
>
wordList
=
WordReadFile
.
getWords
(
"E://crawlerdata//关键词.txt"
);
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<>();
String
cookie
=
"LIVE_BUVID=AUTO8715300758995538; sid=kp5rluge; fts=1530161621; im_notify_type_35324319=0; buvid3=08ABE6AE-5061-4CE5-B34F-1A8AAB64DB3320712infoc; rpdid=olppsmkxmpdoskwoxiwww; finger=edc6ecda; stardustvideo=1; UM_distinctid=164fe68fb31996-01f161c3523abe-6114167a-1fa400-164fe68fb32274"
;
for
(
String
word
:
wordList
)
{
List
<
Map
<
String
,
Object
>>
dataList
=
BiliBili
.
getData
(
word
,
null
,
"2001-01-14 00:00:00"
,
cookie
);
if
(
dataList
!=
null
)
{
System
.
out
.
println
(
word
+
" ----- "
+
dataList
.
size
());
bodyList
.
addAll
(
dataList
);
}
}
List
<
String
>
headlist
=
new
ArrayList
<>();
headlist
.
add
(
"submitcount"
);
headlist
.
add
(
"playcount"
);
headlist
.
add
(
"time"
);
headlist
.
add
(
"source"
);
headlist
.
add
(
"title"
);
headlist
.
add
(
"url"
);
headlist
.
add
(
"word"
);
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
poi
.
exportExcel
(
"E://crawlerdata//视频//bilibili关键词采集数据-dnf手游-点击-20200204.xlsx"
,
"B站数据"
,
headlist
,
bodyList
);
}
}
//package com.zhiwei.shipin;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.BiliBili;
//import com.zhiwei.util.WordReadFile;
//
//public class BilibiliTest {
// @Test
// public void f() {
// ProxyFactory.init("zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181", "local", GroupType.PROVIDER , 10000002L);
// List<String> wordList = WordReadFile.getWords("E://crawlerdata//关键词.txt");
// List<Map<String, Object>> bodyList = new ArrayList<>();
// String cookie = "LIVE_BUVID=AUTO8715300758995538; sid=kp5rluge; fts=1530161621; im_notify_type_35324319=0; buvid3=08ABE6AE-5061-4CE5-B34F-1A8AAB64DB3320712infoc; rpdid=olppsmkxmpdoskwoxiwww; finger=edc6ecda; stardustvideo=1; UM_distinctid=164fe68fb31996-01f161c3523abe-6114167a-1fa400-164fe68fb32274";
// for (String word : wordList) {
//// pubdate 时间 totalrank 综合
// List<Map<String, Object>> dataList = BiliBili.getData(word, null, "2001-01-14 00:00:00",
// cookie, "pubdate");
// if (dataList != null) {
// System.out.println(word + " ----- " + dataList.size());
// bodyList.addAll(dataList);
// }
// }
// List<String> headlist = new ArrayList<>();
// headlist.add("submitcount");
// headlist.add("playcount");
// headlist.add("time");
// headlist.add("source");
// headlist.add("title");
// headlist.add("url");
// headlist.add("word");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("E://crawlerdata//视频//bilibili关键词采集数据-腾讯会议-time-20200218.xlsx", "B站数据", headlist, bodyList);
//
// }
//}
src/test/java/com/zhiwei/shipin/QQTVTest.java
View file @
d7dce3fc
...
...
@@ -17,9 +17,9 @@
//public class QQTVTest {
// @Test
// public void f() {
// ProxyFactory.init("zookeeper://192.168.0.
36
:2181", "local",GroupType.PROVIDER, 10000002);
// ProxyFactory.init("zookeeper://192.168.0.
11:2181?backup=192.168.0.30:2181,192.168.0.35
:2181", "local",GroupType.PROVIDER, 10000002);
// String time = "2019-01-11 00:00:00";
// List<String> wordList = WordReadFile.getWords("
D
://crawlerdata//关键词.txt");
// List<String> wordList = WordReadFile.getWords("
E
://crawlerdata//关键词.txt");
// List<Map<String, Object>> bodyList = new ArrayList<>();
// for (String word : wordList) {
// List<Map<String, Object>> dataList = QQTV.getData(word,time, ProxyHolder.NAT_HEAVY_PROXY);
...
...
@@ -37,7 +37,7 @@
// headlist.add("url");
// headlist.add("word");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("
D
://crawlerdata//视频//腾讯视频关键词采集数据-精装修.xlsx", "腾讯视频数据", headlist, bodyList);
// poi.exportExcel("
E
://crawlerdata//视频//腾讯视频关键词采集数据-精装修.xlsx", "腾讯视频数据", headlist, bodyList);
//
//
//
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment