Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
A
articlenewscrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
chenweiyang
articlenewscrawler
Commits
d7dce3fc
Commit
d7dce3fc
authored
Apr 03, 2020
by
cwy
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
本地提交
parent
3fdd0d2c
Hide whitespace changes
Inline
Side-by-side
Showing
15 changed files
with
226 additions
and
225 deletions
+226
-225
pom.xml
+2
-2
src/main/java/com/zhiwei/parse/Baijia.java
+4
-2
src/main/java/com/zhiwei/parse/BiliBili.java
+2
-2
src/main/java/com/zhiwei/parse/Maimai.java
+5
-7
src/main/java/com/zhiwei/parse/Xueqiu.java
+60
-60
src/main/java/com/zhiwei/parse/analysis/BaijiaAccountAnalysis.java
+0
-2
src/main/java/com/zhiwei/parse/shipin/QQTV.java
+1
-0
src/test/java/com/zhiwei/CrawlerTest.java
+16
-16
src/test/java/com/zhiwei/hsitory/BTimeHistoryExample.java
+18
-18
src/test/java/com/zhiwei/hsitory/BaijiaAccountExample.java
+16
-15
src/test/java/com/zhiwei/hsitory/KuaiDataHistoryExample.java
+18
-18
src/test/java/com/zhiwei/hsitory/ZakerHistoryExample.java
+20
-20
src/test/java/com/zhiwei/hsitory/ZhihuArticleHistoryExample.java
+18
-18
src/test/java/com/zhiwei/shipin/BilibiliTest.java
+43
-42
src/test/java/com/zhiwei/shipin/QQTVTest.java
+3
-3
No files found.
pom.xml
View file @
d7dce3fc
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
<modelVersion>
4.0.0
</modelVersion>
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiwei
</groupId>
<groupId>
com.zhiwei
</groupId>
<artifactId>
articlenewscrawler
</artifactId>
<artifactId>
articlenewscrawler
</artifactId>
<version>
0.2.
3
-SNAPSHOT
</version>
<version>
0.2.
4
-SNAPSHOT
</version>
<name>
articlenewscrawler
</name>
<name>
articlenewscrawler
</name>
<description>
采集凤凰,一点资讯,搜狐历时文章和文章评论
</description>
<description>
采集凤凰,一点资讯,搜狐历时文章和文章评论
</description>
...
@@ -21,7 +21,7 @@
...
@@ -21,7 +21,7 @@
<dependency>
<dependency>
<groupId>
com.zhiwei.crawler
</groupId>
<groupId>
com.zhiwei.crawler
</groupId>
<artifactId>
crawler-core
</artifactId>
<artifactId>
crawler-core
</artifactId>
<version>
0.
5.5.6
-SNAPSHOT
</version>
<version>
0.
6.0.1
-SNAPSHOT
</version>
<scope>
provided
</scope>
<scope>
provided
</scope>
</dependency>
</dependency>
</dependencies>
</dependencies>
...
...
src/main/java/com/zhiwei/parse/Baijia.java
View file @
d7dce3fc
...
@@ -77,9 +77,10 @@ public class Baijia {
...
@@ -77,9 +77,10 @@ public class Baijia {
@SuppressWarnings
(
"unchecked"
)
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
Map
<
String
,
Object
>>
getBaijiaAccountByBaiduData
(
String
appId
,
String
name
,
String
startTime
,
public
static
List
<
Map
<
String
,
Object
>>
getBaijiaAccountByBaiduData
(
String
appId
,
String
name
,
String
startTime
,
String
cookie
,
ProxyHolder
proxy
)
{
String
cookie
,
ProxyHolder
proxy
)
{
Map
<
String
,
String
>
headerMap
=
HeaderTool
.
getCommonHead
();
Map
<
String
,
String
>
headerMap
=
new
HashMap
<>
();
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
headerMap
.
put
(
"cookie"
,
cookie
);
headerMap
.
put
(
"cookie"
,
cookie
);
headerMap
.
put
(
"Host"
,
"mbd.baidu.com"
);
String
uk
=
getUkData
(
appId
,
proxy
,
cookie
);
String
uk
=
getUkData
(
appId
,
proxy
,
cookie
);
if
(
Objects
.
isNull
(
uk
))
{
if
(
Objects
.
isNull
(
uk
))
{
return
Collections
.
emptyList
();
return
Collections
.
emptyList
();
...
@@ -90,13 +91,14 @@ public class Baijia {
...
@@ -90,13 +91,14 @@ public class Baijia {
for
(
int
i
=
1
;
i
<
3
;
i
++)
{
for
(
int
i
=
1
;
i
<
3
;
i
++)
{
try
{
try
{
String
url
=
"https://mbd.baidu.com/webpage?tab=article&num=10&uk="
+
uk
+
"&ctime="
+
ctime
+
"&type=newhome&action=dynamic&format=json"
;
String
url
=
"https://mbd.baidu.com/webpage?tab=article&num=10&uk="
+
uk
+
"&ctime="
+
ctime
+
"&type=newhome&action=dynamic&format=json"
;
logger
.
info
(
"ctime = {} url === {}"
,
ctime
,
url
);
String
result
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
();
String
result
=
httpBoot
.
syncCall
(
RequestUtils
.
wrapGet
(
url
,
headerMap
),
proxy
).
body
().
string
();
Map
<
String
,
Object
>
dMap
=
baijiaAccountAnalysis
.
getBaijiaAccountData3
(
result
,
name
,
startTime
);
Map
<
String
,
Object
>
dMap
=
baijiaAccountAnalysis
.
getBaijiaAccountData3
(
result
,
name
,
startTime
);
List
<
Map
<
String
,
Object
>>
dList
=
(
List
<
Map
<
String
,
Object
>>)
dMap
.
get
(
"data"
);
List
<
Map
<
String
,
Object
>>
dList
=
(
List
<
Map
<
String
,
Object
>>)
dMap
.
get
(
"data"
);
if
(
Objects
.
nonNull
(
dList
))
if
(
Objects
.
nonNull
(
dList
))
dataList
.
addAll
(
dList
);
dataList
.
addAll
(
dList
);
logger
.
info
(
"{} 数据采集结果 {}"
,
appId
,
dataList
.
size
());
logger
.
info
(
"{} 数据采集结果 {}"
,
appId
,
dataList
.
size
());
if
(!(
boolean
)
dMap
.
get
(
"more"
))
{
if
(!(
boolean
)
dMap
.
get
(
"more"
)
||
ctime
.
equals
(
String
.
valueOf
(
String
.
valueOf
(
dMap
.
get
(
"ctime"
))))
)
{
f
=
false
;
f
=
false
;
}
}
ctime
=
String
.
valueOf
(
dMap
.
get
(
"ctime"
));
ctime
=
String
.
valueOf
(
dMap
.
get
(
"ctime"
));
...
...
src/main/java/com/zhiwei/parse/BiliBili.java
View file @
d7dce3fc
...
@@ -23,11 +23,11 @@ public class BiliBili {
...
@@ -23,11 +23,11 @@ public class BiliBili {
private
static
final
Logger
logger
=
LoggerFactory
.
getLogger
(
BiliBili
.
class
);
private
static
final
Logger
logger
=
LoggerFactory
.
getLogger
(
BiliBili
.
class
);
@SuppressWarnings
(
"unchecked"
)
@SuppressWarnings
(
"unchecked"
)
public
static
List
<
Map
<
String
,
Object
>>
getData
(
String
word
,
Proxy
proxy
,
String
endTime
,
String
cookie
)
{
public
static
List
<
Map
<
String
,
Object
>>
getData
(
String
word
,
Proxy
proxy
,
String
endTime
,
String
cookie
,
String
type
)
{
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<>();
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<>();
try
{
try
{
//
//
String
url
=
"https://search.bilibili.com/all?keyword="
+
URLEncoder
.
encode
(
word
,
"utf-8"
)+
"&single_column=1&order=
click
&duration=0&tids_1=0"
;
String
url
=
"https://search.bilibili.com/all?keyword="
+
URLEncoder
.
encode
(
word
,
"utf-8"
)+
"&single_column=1&order=
"
+
type
+
"
&duration=0&tids_1=0"
;
System
.
out
.
println
(
url
);
System
.
out
.
println
(
url
);
Headers
header
=
Headers
.
of
(
"cookie"
,
cookie
,
"Referer"
,
"https://www.bilibili.com/"
,
"Host"
,
"search.bilibili.com"
);
Headers
header
=
Headers
.
of
(
"cookie"
,
cookie
,
"Referer"
,
"https://www.bilibili.com/"
,
"Host"
,
"search.bilibili.com"
);
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
ProxyHolder
.
NAT_HEAVY_PROXY
,
header
);
String
result
=
HttpClient
.
executeHttpRequestGet
(
url
,
ProxyHolder
.
NAT_HEAVY_PROXY
,
header
);
...
...
src/main/java/com/zhiwei/parse/Maimai.java
View file @
d7dce3fc
...
@@ -16,9 +16,7 @@ import org.slf4j.LoggerFactory;
...
@@ -16,9 +16,7 @@ import org.slf4j.LoggerFactory;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.httpclient.HeadGet
;
import
com.zhiwei.httpclient.HeadGet
;
...
@@ -141,11 +139,11 @@ public class Maimai {
...
@@ -141,11 +139,11 @@ public class Maimai {
return
Collections
.
emptyMap
();
return
Collections
.
emptyMap
();
}
}
public
static
void
main
(
String
[]
args
)
{
//
public static void main(String[] args) {
ProxyFactory
.
init
(
"zookeeper://192.168.0.36:2181"
,
"local"
,
GroupType
.
PROVIDER
,
10000002L
);
//
ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER, 10000002L);
String
url
=
"https://maimai.cn/web/feed_detail?fid=1353566056&efid=QTa45Y1e-oQzyn1dZ5ozlQ"
;
//
String url = "https://maimai.cn/web/feed_detail?fid=1353566056&efid=QTa45Y1e-oQzyn1dZ5ozlQ";
System
.
out
.
println
(
getMaiaiCount2
(
url
,
ProxyHolder
.
NAT_HEAVY_PROXY
));
//
System.out.println(getMaiaiCount2(url, ProxyHolder.NAT_HEAVY_PROXY));
}
//
}
/**
/**
* https://maimai.cn/web/feed_detail?fid=1304191535&efid=0CQbJXhoYLXdC87NFIkRMA
* https://maimai.cn/web/feed_detail?fid=1304191535&efid=0CQbJXhoYLXdC87NFIkRMA
...
...
src/main/java/com/zhiwei/parse/Xueqiu.java
View file @
d7dce3fc
...
@@ -165,70 +165,70 @@ public class Xueqiu {
...
@@ -165,70 +165,70 @@ public class Xueqiu {
List
<
Map
<
String
,
Object
>>
resultList
=
new
ArrayList
<>();
List
<
Map
<
String
,
Object
>>
resultList
=
new
ArrayList
<>();
int
page
=
1
;
int
page
=
1
;
boolean
next
=
true
;
boolean
next
=
true
;
int
errCount
=
1
;
while
(
next
)
{
while
(
next
)
{
for
(
int
j
=
0
;
j
<
4
;
j
++)
{
try
{
try
{
String
url
=
"https://xueqiu.com/v4/statuses/user_timeline.json?page="
+
page
+
"&user_id="
+
uid
+
"&type=0"
;
String
url
=
"https://xueqiu.com/v4/statuses/user_timeline.json?page="
+
page
+
"&user_id="
+
uid
+
"&type=0"
;
logger
.
info
(
"第{}页,JSON地址为:{}"
,
page
,
url
);
logger
.
info
(
"重试次数:{},第{}页,JSON地址为:{}"
,
j
,
page
,
url
);
JSONObject
json
=
getJson
(
url
,
cookie
);
//获取json数据
JSONObject
json
=
getJson
(
url
,
cookie
);
//获取json数据
JSONArray
jsonArray
=
json
.
getJSONArray
(
"statuses"
);
JSONArray
jsonArray
=
json
.
getJSONArray
(
"statuses"
);
if
(
Objects
.
nonNull
(
jsonArray
))
{
if
(
Objects
.
nonNull
(
jsonArray
))
{
for
(
int
i
=
0
;
i
<
jsonArray
.
size
();
i
++)
{
for
(
int
i
=
0
;
i
<
jsonArray
.
size
();
i
++)
{
JSONObject
ob
=
jsonArray
.
getJSONObject
(
i
);
//得到json数组的第i个数组
JSONObject
ob
=
jsonArray
.
getJSONObject
(
i
);
//得到json数组的第i个数组
String
timeBefore
=
ob
.
getString
(
"timeBefore"
);
//时间
String
timeBefore
=
ob
.
getString
(
"timeBefore"
);
//时间
Date
date
=
TimeParse
.
stringFormartDate
(
timeBefore
);
Date
date
=
TimeParse
.
stringFormartDate
(
timeBefore
);
Date
endDate
=
TimeParse
.
stringFormartDate
(
endTime
);
Date
endDate
=
TimeParse
.
stringFormartDate
(
endTime
);
//获取规定时间内的数据
//获取规定时间内的数据
if
(
date
.
getTime
()
<
endDate
.
getTime
())
{
if
(
date
.
getTime
()
<
endDate
.
getTime
())
{
next
=
false
;
break
;
}
String
screenName
=
ob
.
getJSONObject
(
"user"
).
getString
(
"screen_name"
);
//用户名
String
source
=
ob
.
getString
(
"source"
);
//发布消息的手机平台
String
description
=
ob
.
getString
(
"description"
).
replaceAll
(
"<.*?>"
,
""
);
//帖子正文
int
retweetCount
=
ob
.
getInteger
(
"retweet_count"
);
//转发数
int
replyCount
=
ob
.
getInteger
(
"reply_count"
);
//评论数
int
likeCount
=
ob
.
getInteger
(
"like_count"
);
//点赞数
String
targetLink
=
"https://xueqiu.com"
+
ob
.
getString
(
"target"
);
//帖子链接
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
map
.
put
(
"screenName"
,
screenName
);
//用户名
map
.
put
(
"uid"
,
uid
);
//帖子链接
map
.
put
(
"time"
,
date
);
//时间
map
.
put
(
"source"
,
source
);
//发布消息的手机平台
map
.
put
(
"description"
,
description
);
//帖子正文
map
.
put
(
"retweetCount"
,
retweetCount
);
//转发数
map
.
put
(
"replyCount"
,
replyCount
);
//评论数
map
.
put
(
"likeCount"
,
likeCount
);
//点赞数
map
.
put
(
"targetLink"
,
targetLink
);
//帖子链接
map
.
put
(
"pt"
,
"雪球"
);
//帖子链接
resultList
.
add
(
map
);
}
}
//超出时间则停止采集
if
(
next
)
{
int
maxPag
=
json
.
getInteger
(
"maxPage"
);
//获取最大页数
boolean
flag
=
page
<
maxPag
;
//当前页数小于最大页数时,翻页
if
(
flag
)
{
page
++;
//页数+1
}
else
{
next
=
false
;
next
=
false
;
break
;
}
}
}
break
;
String
screenName
=
ob
.
getJSONObject
(
"user"
).
getString
(
"screen_name"
);
//用户名
}
catch
(
Exception
e
)
{
String
source
=
ob
.
getString
(
"source"
);
//发布消息的手机平台
logger
.
error
(
"解析JSON出错 "
,
e
);
String
description
=
ob
.
getString
(
"description"
).
replaceAll
(
"<.*?>"
,
""
);
//帖子正文
}
int
retweetCount
=
ob
.
getInteger
(
"retweet_count"
);
//转发数
}
int
replyCount
=
ob
.
getInteger
(
"reply_count"
);
//评论数
int
likeCount
=
ob
.
getInteger
(
"like_count"
);
//点赞数
String
targetLink
=
"https://xueqiu.com"
+
ob
.
getString
(
"target"
);
//帖子链接
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
map
.
put
(
"screenName"
,
screenName
);
//用户名
map
.
put
(
"uid"
,
uid
);
//帖子链接
map
.
put
(
"time"
,
date
);
//时间
map
.
put
(
"source"
,
source
);
//发布消息的手机平台
map
.
put
(
"description"
,
description
);
//帖子正文
map
.
put
(
"retweetCount"
,
retweetCount
);
//转发数
map
.
put
(
"replyCount"
,
replyCount
);
//评论数
map
.
put
(
"likeCount"
,
likeCount
);
//点赞数
map
.
put
(
"targetLink"
,
targetLink
);
//帖子链接
map
.
put
(
"pt"
,
"雪球"
);
//帖子链接
resultList
.
add
(
map
);
}
}
//超出时间则停止采集
if
(
next
)
{
int
maxPag
=
json
.
getInteger
(
"maxPage"
);
//获取最大页数
boolean
flag
=
page
<
maxPag
;
//当前页数小于最大页数时,翻页
if
(
flag
)
{
page
++;
//页数+1
errCount
=
1
;
}
}
errCount
++;
}
catch
(
Exception
e
)
{
logger
.
error
(
"解析JSON出错 "
,
e
);
errCount
++;
}
if
(
errCount
>
4
)
break
;
}
}
return
resultList
;
return
resultList
;
...
...
src/main/java/com/zhiwei/parse/analysis/BaijiaAccountAnalysis.java
View file @
d7dce3fc
...
@@ -63,14 +63,12 @@ public class BaijiaAccountAnalysis {
...
@@ -63,14 +63,12 @@ public class BaijiaAccountAnalysis {
more
=
true
;
more
=
true
;
rmap
.
put
(
"ctime"
,
json
.
getJSONObject
(
"data"
).
getJSONObject
(
"query"
).
getString
(
"ctime"
));
rmap
.
put
(
"ctime"
,
json
.
getJSONObject
(
"data"
).
getJSONObject
(
"query"
).
getString
(
"ctime"
));
}
}
// String name = json.getJSONObject("data").getJSONObject("author").getString("display_name");
for
(
int
i
=
0
;
i
<
jsonArry
.
size
();
i
++)
{
for
(
int
i
=
0
;
i
<
jsonArry
.
size
();
i
++)
{
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
JSONObject
data
=
jsonArry
.
getJSONObject
(
i
).
getJSONObject
(
"itemData"
);
JSONObject
data
=
jsonArry
.
getJSONObject
(
i
).
getJSONObject
(
"itemData"
);
String
id
=
data
.
getString
(
"article_id"
);
String
id
=
data
.
getString
(
"article_id"
);
int
t
=
data
.
getInteger
(
"updated_at"
);
int
t
=
data
.
getInteger
(
"updated_at"
);
String
time
=
TimeParse
.
dateFormartString
(
new
Date
(
t
*
1000L
),
"yyyy-MM-dd HH:mm:ss"
);
String
time
=
TimeParse
.
dateFormartString
(
new
Date
(
t
*
1000L
),
"yyyy-MM-dd HH:mm:ss"
);
System
.
out
.
println
(
time
);
if
(
startTime
!=
null
&&
startTime
.
length
()
>
1
&&
time
.
compareTo
(
startTime
)
<
1
)
{
if
(
startTime
!=
null
&&
startTime
.
length
()
>
1
&&
time
.
compareTo
(
startTime
)
<
1
)
{
more
=
false
;
more
=
false
;
continue
;
continue
;
...
...
src/main/java/com/zhiwei/parse/shipin/QQTV.java
View file @
d7dce3fc
...
@@ -40,6 +40,7 @@ public class QQTV {
...
@@ -40,6 +40,7 @@ public class QQTV {
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
try
{
try
{
String
url
=
"https://v.qq.com/x/search/?ses=qid%3DdKzxiFfC7NqpC6z2jq4m-KGeQjb_Th556Yrz24cQaZo1MUTw2PK4XA%26last_query%3D%E7%BE%8E%E5%9B%A2%26tabid_list%3D0%7C1%7C5%7C13%7C11%7C7%7C2%7C3%7C4%7C6%7C12%7C21%7C14%7C17%7C8%7C15%7C20%26tabname_list%3D%E5%85%A8%E9%83%A8%7C%E7%94%B5%E5%BD%B1%7C%E9%9F%B3%E4%B9%90%7C%E8%B4%A2%E7%BB%8F%7C%E6%96%B0%E9%97%BB%7C%E5%85%B6%E4%BB%96%7C%E7%94%B5%E8%A7%86%E5%89%A7%7C%E7%BB%BC%E8%89%BA%7C%E5%8A%A8%E6%BC%AB%7C%E7%BA%AA%E5%BD%95%E7%89%87%7C%E5%A8%B1%E4%B9%90%7C%E6%B1%BD%E8%BD%A6%7C%E4%BD%93%E8%82%B2%7C%E6%B8%B8%E6%88%8F%7C%E5%8E%9F%E5%88%9B%7C%E6%95%99%E8%82%B2%7C%E6%AF%8D%E5%A9%B4%26resolution_tabid_list%3D0%7C1%7C2%7C3%7C4%7C5%26resolution_tabname_list%3D%E5%85%A8%E9%83%A8%7C%E6%A0%87%E6%B8%85%7C%E9%AB%98%E6%B8%85%7C%E8%B6%85%E6%B8%85%7C%E8%93%9D%E5%85%89%7CVR&q="
+
URLEncoder
.
encode
(
word
,
"UTF-8"
)+
"&stag=4&filter=sort%3D1%26pubfilter%3D0%26duration%3D0%26tabid%3D0%26resolution%3D0&cur="
;
String
url
=
"https://v.qq.com/x/search/?ses=qid%3DdKzxiFfC7NqpC6z2jq4m-KGeQjb_Th556Yrz24cQaZo1MUTw2PK4XA%26last_query%3D%E7%BE%8E%E5%9B%A2%26tabid_list%3D0%7C1%7C5%7C13%7C11%7C7%7C2%7C3%7C4%7C6%7C12%7C21%7C14%7C17%7C8%7C15%7C20%26tabname_list%3D%E5%85%A8%E9%83%A8%7C%E7%94%B5%E5%BD%B1%7C%E9%9F%B3%E4%B9%90%7C%E8%B4%A2%E7%BB%8F%7C%E6%96%B0%E9%97%BB%7C%E5%85%B6%E4%BB%96%7C%E7%94%B5%E8%A7%86%E5%89%A7%7C%E7%BB%BC%E8%89%BA%7C%E5%8A%A8%E6%BC%AB%7C%E7%BA%AA%E5%BD%95%E7%89%87%7C%E5%A8%B1%E4%B9%90%7C%E6%B1%BD%E8%BD%A6%7C%E4%BD%93%E8%82%B2%7C%E6%B8%B8%E6%88%8F%7C%E5%8E%9F%E5%88%9B%7C%E6%95%99%E8%82%B2%7C%E6%AF%8D%E5%A9%B4%26resolution_tabid_list%3D0%7C1%7C2%7C3%7C4%7C5%26resolution_tabname_list%3D%E5%85%A8%E9%83%A8%7C%E6%A0%87%E6%B8%85%7C%E9%AB%98%E6%B8%85%7C%E8%B6%85%E6%B8%85%7C%E8%93%9D%E5%85%89%7CVR&q="
+
URLEncoder
.
encode
(
word
,
"UTF-8"
)+
"&stag=4&filter=sort%3D1%26pubfilter%3D0%26duration%3D0%26tabid%3D0%26resolution%3D0&cur="
;
System
.
out
.
println
(
url
);
int
page
=
1
;
int
page
=
1
;
while
(
true
)
{
while
(
true
)
{
int
count
=
dataList
.
size
();
int
count
=
dataList
.
size
();
...
...
src/test/java/com/zhiwei/CrawlerTest.java
View file @
d7dce3fc
package
com
.
zhiwei
;
//
package com.zhiwei;
//
import
com.zhiwei.common.config.GroupType
;
//
import com.zhiwei.common.config.GroupType;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
//
import com.zhiwei.crawler.proxy.ProxyFactory;
import
com.zhiwei.parse.TechTx
;
//
import com.zhiwei.parse.TechTx;
//
public
class
CrawlerTest
{
//
public class CrawlerTest {
//
public
static
void
main
(
String
[]
args
)
{
//
public static void main(String[] args) {
ProxyFactory
.
init
(
"zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181"
,
"local"
,
GroupType
.
PROVIDER
,
10000002L
);
//
ProxyFactory.init("zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181", "local", GroupType.PROVIDER , 10000002L);
String
cookie
=
"bid=rymxzs5aojg; ps=y; ll=\"118173\"; __utmc=30149280; dbcl2=\"188038058:9IHyVcSobVc\"; ck=_RvF; push_noty_num=0; push_doumail_num=0; __utmv=30149280.18803; douban-fav-remind=1; __yadk_uid=qLflXyj3R14ro9e0cLoZOQlJoMGVN32j; douban-profile-remind=1; _vwo_uuid_v2=D85F60C118B0AF465035D9CC7BBFDA7A6|4bf255e1e3a2e9aeede3708192f5f1bc; __utmz=30149280.1543564973.3.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; ap_v=0,6.0; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1543823236%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DPk19bxnhsVWwfnrcnwT0PquON7D1JpLsbfSu9rRowalyi4pOeM3rMHKFaJo9jJF7%26wd%3D%26eqid%3De7f262650001ef98000000045c00e64f%22%5D; _pk_id.100001.8cb4=6828fef49f6bcf34.1543559455.5.1543823236.1543820463.; _pk_ses.100001.8cb4=*; __utma=30149280.824403997.1543559458.1543818802.1543823236.6; __utmt=1; __utmb=30149280.5.7.1543823236"
;
//
String cookie = "bid=rymxzs5aojg; ps=y; ll=\"118173\"; __utmc=30149280; dbcl2=\"188038058:9IHyVcSobVc\"; ck=_RvF; push_noty_num=0; push_doumail_num=0; __utmv=30149280.18803; douban-fav-remind=1; __yadk_uid=qLflXyj3R14ro9e0cLoZOQlJoMGVN32j; douban-profile-remind=1; _vwo_uuid_v2=D85F60C118B0AF465035D9CC7BBFDA7A6|4bf255e1e3a2e9aeede3708192f5f1bc; __utmz=30149280.1543564973.3.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; ap_v=0,6.0; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1543823236%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DPk19bxnhsVWwfnrcnwT0PquON7D1JpLsbfSu9rRowalyi4pOeM3rMHKFaJo9jJF7%26wd%3D%26eqid%3De7f262650001ef98000000045c00e64f%22%5D; _pk_id.100001.8cb4=6828fef49f6bcf34.1543559455.5.1543823236.1543820463.; _pk_ses.100001.8cb4=*; __utma=30149280.824403997.1543559458.1543818802.1543823236.6; __utmt=1; __utmb=30149280.5.7.1543823236";
String
url
=
"E2S95LEA0008856R"
;
//
String url = "E2S95LEA0008856R";
TechTx
.
getTechTxComment
(
url
,
null
);
//
TechTx.getTechTxComment(url, null);
}
//
}
//
}
//
}
src/test/java/com/zhiwei/hsitory/BTimeHistoryExample.java
View file @
d7dce3fc
package
com
.
zhiwei
.
hsitory
;
//
package com.zhiwei.hsitory;
//
import
java.util.List
;
//
import java.util.List;
import
java.util.Map
;
//
import java.util.Map;
//
import
com.zhiwei.common.config.GroupType
;
//
import com.zhiwei.common.config.GroupType;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
//
import com.zhiwei.crawler.proxy.ProxyFactory;
import
com.zhiwei.parse.BTime
;
//
import com.zhiwei.parse.BTime;
//
public
class
BTimeHistoryExample
{
//
public class BTimeHistoryExample {
//
public
static
void
main
(
String
[]
args
)
{
//
public static void main(String[] args) {
ProxyFactory
.
init
(
"zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181"
,
"local"
,
GroupType
.
PROVIDER
,
10000002L
);
//
ProxyFactory.init("zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181", "local", GroupType.PROVIDER , 10000002L);
List
<
Map
<
String
,
Object
>>
dataList
=
BTime
.
getHistoryData
(
"1608238"
,
0L
);
//
List<Map<String, Object>> dataList = BTime.getHistoryData("1608238", 0L);
// dataList.forEach(System.out::println);
//
//
dataList.forEach(System.out::println);
}
//
}
//
}
//
}
src/test/java/com/zhiwei/hsitory/BaijiaAccountExample.java
View file @
d7dce3fc
...
@@ -6,37 +6,38 @@
...
@@ -6,37 +6,38 @@
//
//
//import org.junit.Test;
//import org.junit.Test;
//
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.crawler.proxy.ProxyHolder;
//import com.zhiwei.crawler.proxy.ProxyHolder;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.parse.Baijia;
//import com.zhiwei.parse.Baijia;
//import com.zhiwei.proxy.config.SimpleConfig;
//
//
//public class BaijiaAccountExample {
//public class BaijiaAccountExample {
//
//
// @Test
// @Test
// public void test3() {
// public void test3() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
// // , "local", GroupType.PROVIDER , 10000002L
// ProxyFactory.init(SimpleConfig.builder().registry("zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181").appName("actool").appId(10000002).group("local").build());
// String path = "D://crawlerdata//自媒体/百家号采集.xlsx";
// String path = "D://crawlerdata//自媒体/百家号采集.xlsx";
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// String startTime = "2018-05-01 00:00:00";
// String startTime = "2018-05-01 00:00:00";
// Map<String,Object> map = poi.importExcel(path, 0);
//
//
Map<String,Object> map = poi.importExcel(path, 0);
// List<Map<String,Object>> list = (List<Map<String, Object>>) map.get("body");
//
//
List<Map<String,Object>> list = (List<Map<String, Object>>) map.get("body");
// List<Map<String,Object>> bodyList = new ArrayList<>();
// List<Map<String,Object>> bodyList = new ArrayList<>();
// for(Map<String,Object> m : list) {
//
//
for(Map<String,Object> m : list) {
// try {
//
//
try {
// String app_id = m.get("id").toString();
//
//
String app_id = m.get("id").toString();
//
app_id = "1602674438508810
";
//
String app_id = "1565848819560927
";
// String cookie = "BAIDUID=
7D453C932433A93F7AD1F3B8ABC8B0E1:FG=1; BIDUPSID=7D453C932433A93F7AD1F3B8ABC8B0E1; PSTM=1570766401; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BDSFRCVID=eH-OJeCmH6VwoRJwCdmehrB7leKK0gOTHllvCh8hmwLadLIVJeC6EG0Ptf8g0KubFTPRogKK0gOTH6KF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF=tJkD_I_hJKt3qn7I5KToh4Athxob2bbXHDo-LIvHWT6cOR5JhfA-3R-e046f3-3L5CbH5D3s5lvvhb3O3M7ShbKdMa732RbrKCnraxQF5l8-sq0x0bOte-bQypoa0q3TLDOMahkM5h7xOKQoQlPK5JkgMx6MqpQJQeQ-5KQN3KJmfbL9bT3YjjISKx-_J5LJJxK; H_PS_PSSID=1442_21103_29567_29699_29220_22158; delPer=0; PSINO=5; ZD_ENTRY=baidu; yjs_js_security_passport=9687699d4b0965c0be1e6e312fc59ff5cf3d03a2_1571106914_js; Hmery-Time=121539387
8";
// String cookie = "BAIDUID=
A46414BD701A3738E17E0212A6C2FEEA:FG=1; Hmery-Time=2269711404; BIDUPSID=A46414BD701A3738E17E0212A6C2FEEA; PSTM=1583375258; delPer=0; H_PS_PSSID=30972_1439_21095_30839_30998_30823; BDORZ=B490B5EBF6F3CD402E515D22BCDA159
8";
// System.out.println(app_id);
// System.out.println(app_id);
// List<Map<String,Object>> lists = Baijia.getBaijiaAccountByBaiduData(app_id,
"聚富财经", startTime,
cookie, ProxyHolder.NAT_HEAVY_PROXY);
// List<Map<String,Object>> lists = Baijia.getBaijiaAccountByBaiduData(app_id,
"聚富财经", startTime,
cookie, ProxyHolder.NAT_HEAVY_PROXY);
// if(lists != null) {
// if(lists != null) {
// bodyList.addAll(lists);
// bodyList.addAll(lists);
// }
// }
// break;
//
//
break;
// } catch (Exception e) {
//
//
} catch (Exception e) {
// }
//
//
}
// }
//
//
}
// List<String> headList = new ArrayList<String>();
// List<String> headList = new ArrayList<String>();
// headList.add("title");
// headList.add("title");
// headList.add("time");
// headList.add("time");
...
@@ -44,7 +45,7 @@
...
@@ -44,7 +45,7 @@
// headList.add("url");
// headList.add("url");
// headList.add("content");
// headList.add("content");
// headList.add("read_amount");
// headList.add("read_amount");
// poi.exportExcel("
D://crawlerdata//历史文章采集/
百家号-lxj-2.xlsx", "娱乐资本论", headList, bodyList);
// poi.exportExcel("
E://crawlerdata//历史
百家号-lxj-2.xlsx", "娱乐资本论", headList, bodyList);
// }
// }
//
//
//}
//}
src/test/java/com/zhiwei/hsitory/KuaiDataHistoryExample.java
View file @
d7dce3fc
package
com
.
zhiwei
.
hsitory
;
//
package com.zhiwei.hsitory;
//
import
java.util.List
;
//
import java.util.List;
import
java.util.Map
;
//
import java.util.Map;
//
import
com.zhiwei.common.config.GroupType
;
//
import com.zhiwei.common.config.GroupType;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
//
import com.zhiwei.crawler.proxy.ProxyFactory;
import
com.zhiwei.parse.KuaiData
;
//
import com.zhiwei.parse.KuaiData;
//
public
class
KuaiDataHistoryExample
{
//
public class KuaiDataHistoryExample {
//
public
static
void
main
(
String
[]
args
)
{
//
public static void main(String[] args) {
ProxyFactory
.
init
(
"zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181"
,
"local"
,
GroupType
.
PROVIDER
,
10000002L
);
//
ProxyFactory.init("zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181", "local", GroupType.PROVIDER , 10000002L);
List
<
Map
<
String
,
Object
>>
dataList
=
KuaiData
.
getArticleHistory
(
"5c19954ccb14fabc153971e3f924bf36"
,
"2686798288"
,
0L
);
//
List<Map<String, Object>> dataList = KuaiData.getArticleHistory("5c19954ccb14fabc153971e3f924bf36", "2686798288", 0L);
// dataList.forEach(System.out::println);
//
//
dataList.forEach(System.out::println);
}
//
}
//
}
//
}
src/test/java/com/zhiwei/hsitory/ZakerHistoryExample.java
View file @
d7dce3fc
package
com
.
zhiwei
.
hsitory
;
//
package com.zhiwei.hsitory;
//
import
java.util.List
;
//
import java.util.List;
import
java.util.Map
;
//
import java.util.Map;
//
import
com.zhiwei.common.config.GroupType
;
//
import com.zhiwei.common.config.GroupType;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
//
import com.zhiwei.crawler.proxy.ProxyFactory;
import
com.zhiwei.parse.BTime
;
//
import com.zhiwei.parse.BTime;
import
com.zhiwei.parse.MyZaker
;
//
import com.zhiwei.parse.MyZaker;
//
public
class
ZakerHistoryExample
{
//
public class ZakerHistoryExample {
//
//
public
static
void
main
(
String
[]
args
)
{
//
public static void main(String[] args) {
ProxyFactory
.
init
(
"zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181"
,
"local"
,
GroupType
.
PROVIDER
,
10000002L
);
//
ProxyFactory.init("zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181", "local", GroupType.PROVIDER , 10000002L);
List
<
Map
<
String
,
Object
>>
dataList
=
MyZaker
.
getHistoryData
(
"13584"
,
0L
);
//
List<Map<String, Object>> dataList = MyZaker.getHistoryData("13584", 0L);
// dataList.forEach(System.out::println);
//
//
dataList.forEach(System.out::println);
}
//
}
//
}
//
}
src/test/java/com/zhiwei/hsitory/ZhihuArticleHistoryExample.java
View file @
d7dce3fc
package
com
.
zhiwei
.
hsitory
;
//
package com.zhiwei.hsitory;
//
import
java.util.List
;
//
import java.util.List;
import
java.util.Map
;
//
import java.util.Map;
//
import
com.zhiwei.common.config.GroupType
;
//
import com.zhiwei.common.config.GroupType;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
//
import com.zhiwei.crawler.proxy.ProxyFactory;
import
com.zhiwei.parse.Zhihu
;
//
import com.zhiwei.parse.Zhihu;
//
public
class
ZhihuArticleHistoryExample
{
//
public class ZhihuArticleHistoryExample {
//
public
static
void
main
(
String
[]
args
)
{
//
public static void main(String[] args) {
ProxyFactory
.
init
(
"zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181"
,
"local"
,
GroupType
.
PROVIDER
,
10000002L
);
//
ProxyFactory.init("zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181", "local", GroupType.PROVIDER , 10000002L);
List
<
Map
<
String
,
Object
>>
dataList
=
Zhihu
.
getArticleHistory
(
"da-bai-xin-wen-27"
,
0L
);
//
List<Map<String, Object>> dataList = Zhihu.getArticleHistory("da-bai-xin-wen-27", 0L);
// dataList.forEach(System.out::println);
//
//
dataList.forEach(System.out::println);
}
//
}
//
}
//
}
src/test/java/com/zhiwei/shipin/BilibiliTest.java
View file @
d7dce3fc
package
com
.
zhiwei
.
shipin
;
//package com.zhiwei.shipin;
//
import
java.util.ArrayList
;
//import java.util.ArrayList;
import
java.util.List
;
//import java.util.List;
import
java.util.Map
;
//import java.util.Map;
//
import
org.junit.Test
;
//import org.junit.Test;
//
import
com.zhiwei.common.config.GroupType
;
//import com.zhiwei.common.config.GroupType;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
//import com.zhiwei.crawler.proxy.ProxyFactory;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import
com.zhiwei.parse.BiliBili
;
//import com.zhiwei.parse.BiliBili;
import
com.zhiwei.util.WordReadFile
;
//import com.zhiwei.util.WordReadFile;
//
public
class
BilibiliTest
{
//public class BilibiliTest {
@Test
// @Test
public
void
f
()
{
// public void f() {
ProxyFactory
.
init
(
"zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181"
,
"local"
,
GroupType
.
PROVIDER
,
10000002L
);
// ProxyFactory.init("zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181", "local", GroupType.PROVIDER , 10000002L);
List
<
String
>
wordList
=
WordReadFile
.
getWords
(
"E://crawlerdata//关键词.txt"
);
// List<String> wordList = WordReadFile.getWords("E://crawlerdata//关键词.txt");
List
<
Map
<
String
,
Object
>>
bodyList
=
new
ArrayList
<>();
// List<Map<String, Object>> bodyList = new ArrayList<>();
String
cookie
=
"LIVE_BUVID=AUTO8715300758995538; sid=kp5rluge; fts=1530161621; im_notify_type_35324319=0; buvid3=08ABE6AE-5061-4CE5-B34F-1A8AAB64DB3320712infoc; rpdid=olppsmkxmpdoskwoxiwww; finger=edc6ecda; stardustvideo=1; UM_distinctid=164fe68fb31996-01f161c3523abe-6114167a-1fa400-164fe68fb32274"
;
// String cookie = "LIVE_BUVID=AUTO8715300758995538; sid=kp5rluge; fts=1530161621; im_notify_type_35324319=0; buvid3=08ABE6AE-5061-4CE5-B34F-1A8AAB64DB3320712infoc; rpdid=olppsmkxmpdoskwoxiwww; finger=edc6ecda; stardustvideo=1; UM_distinctid=164fe68fb31996-01f161c3523abe-6114167a-1fa400-164fe68fb32274";
for
(
String
word
:
wordList
)
{
// for (String word : wordList) {
List
<
Map
<
String
,
Object
>>
dataList
=
BiliBili
.
getData
(
word
,
null
,
"2001-01-14 00:00:00"
,
//// pubdate 时间 totalrank 综合
cookie
);
// List<Map<String, Object>> dataList = BiliBili.getData(word, null, "2001-01-14 00:00:00",
if
(
dataList
!=
null
)
{
// cookie, "pubdate");
System
.
out
.
println
(
word
+
" ----- "
+
dataList
.
size
());
// if (dataList != null) {
bodyList
.
addAll
(
dataList
);
// System.out.println(word + " ----- " + dataList.size());
}
// bodyList.addAll(dataList);
}
// }
List
<
String
>
headlist
=
new
ArrayList
<>();
// }
headlist
.
add
(
"submitcount"
);
// List<String> headlist = new ArrayList<>();
headlist
.
add
(
"playcount"
);
// headlist.add("submitcount");
headlist
.
add
(
"time"
);
// headlist.add("playcount");
headlist
.
add
(
"source"
);
// headlist.add("time");
headlist
.
add
(
"title"
);
// headlist.add("source");
headlist
.
add
(
"url"
);
// headlist.add("title");
headlist
.
add
(
"word"
);
// headlist.add("url");
PoiExcelUtil
poi
=
PoiExcelUtil
.
getInstance
();
// headlist.add("word");
poi
.
exportExcel
(
"E://crawlerdata//视频//bilibili关键词采集数据-dnf手游-点击-20200204.xlsx"
,
"B站数据"
,
headlist
,
bodyList
);
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("E://crawlerdata//视频//bilibili关键词采集数据-腾讯会议-time-20200218.xlsx", "B站数据", headlist, bodyList);
}
//
}
// }
//}
src/test/java/com/zhiwei/shipin/QQTVTest.java
View file @
d7dce3fc
...
@@ -17,9 +17,9 @@
...
@@ -17,9 +17,9 @@
//public class QQTVTest {
//public class QQTVTest {
// @Test
// @Test
// public void f() {
// public void f() {
// ProxyFactory.init("zookeeper://192.168.0.
36
:2181", "local",GroupType.PROVIDER, 10000002);
// ProxyFactory.init("zookeeper://192.168.0.
11:2181?backup=192.168.0.30:2181,192.168.0.35
:2181", "local",GroupType.PROVIDER, 10000002);
// String time = "2019-01-11 00:00:00";
// String time = "2019-01-11 00:00:00";
// List<String> wordList = WordReadFile.getWords("
D
://crawlerdata//关键词.txt");
// List<String> wordList = WordReadFile.getWords("
E
://crawlerdata//关键词.txt");
// List<Map<String, Object>> bodyList = new ArrayList<>();
// List<Map<String, Object>> bodyList = new ArrayList<>();
// for (String word : wordList) {
// for (String word : wordList) {
// List<Map<String, Object>> dataList = QQTV.getData(word,time, ProxyHolder.NAT_HEAVY_PROXY);
// List<Map<String, Object>> dataList = QQTV.getData(word,time, ProxyHolder.NAT_HEAVY_PROXY);
...
@@ -37,7 +37,7 @@
...
@@ -37,7 +37,7 @@
// headlist.add("url");
// headlist.add("url");
// headlist.add("word");
// headlist.add("word");
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// poi.exportExcel("
D
://crawlerdata//视频//腾讯视频关键词采集数据-精装修.xlsx", "腾讯视频数据", headlist, bodyList);
// poi.exportExcel("
E
://crawlerdata//视频//腾讯视频关键词采集数据-精装修.xlsx", "腾讯视频数据", headlist, bodyList);
//
//
//
//
//
//
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment