Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
M
media_data_crawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
media_data_crawler
Commits
47654569
Commit
47654569
authored
Mar 27, 2019
by
yangchen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
增加简书用户采集 和 修改百度知道 关键词采集死循环 问题
parent
8e2e2cc2
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
201 additions
and
1 deletions
+201
-1
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduZhidaoCrawlerParse.java
+5
-1
src/main/java/com/zhiwei/media_data_crawler/crawler/JianshuCrawler.java
+93
-0
src/main/java/com/zhiwei/media_data_crawler/entity/JianshuUser.java
+103
-0
No files found.
src/main/java/com/zhiwei/media_data_crawler/crawler/BaiduZhidaoCrawlerParse.java
View file @
47654569
...
...
@@ -30,6 +30,7 @@ public class BaiduZhidaoCrawlerParse {
public
static
List
<
Map
<
String
,
Object
>>
getData
(
String
word
,
ProxyHolder
proxy
)
{
try
{
List
<
Map
<
String
,
Object
>>
dataList
=
new
ArrayList
<>();
List
<
String
>
urlList
=
new
ArrayList
<>();
int
i
=
0
;
int
count
=
-
1
;
while
(
true
)
{
...
...
@@ -42,12 +43,16 @@ public class BaiduZhidaoCrawlerParse {
for
(
Element
element
:
elements
)
{
Map
<
String
,
Object
>
map
=
new
HashMap
<>();
String
ur
=
element
.
select
(
"a.ti"
).
attr
(
"href"
).
split
(
"\\?"
)[
0
];
if
(
urlList
.
contains
(
ur
))
{
continue
;
}
String
title
=
element
.
select
(
"a.ti"
).
text
();
String
content
=
element
.
select
(
"dd.answer"
).
text
();
String
time
=
element
.
select
(
"dd.dd.explain.f-light > span:nth-child(1)"
).
text
();
String
source
=
element
.
select
(
"dd.dd.explain.f-light > span:nth-child(2) > a"
).
text
();
String
answerCount
=
element
.
select
(
"dd.dd.explain.f-light > span:nth-child(3) > a"
).
text
();
String
like
=
element
.
select
(
"dd.dd.explain.f-light > span:nth-child(4)"
).
text
();
urlList
.
add
(
ur
);
map
.
put
(
"url"
,
ur
);
map
.
put
(
"title"
,
title
);
map
.
put
(
"content"
,
content
);
...
...
@@ -56,7 +61,6 @@ public class BaiduZhidaoCrawlerParse {
map
.
put
(
"answerCount"
,
answerCount
);
map
.
put
(
"like"
,
like
);
map
.
put
(
"word"
,
word
);
System
.
out
.
println
(
map
.
toString
());
dataList
.
add
(
map
);
}
if
(
dataList
.
size
()
-
count
<
8
)
{
...
...
src/main/java/com/zhiwei/media_data_crawler/crawler/JianshuCrawler.java
0 → 100644
View file @
47654569
package
com
.
zhiwei
.
media_data_crawler
.
crawler
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Objects
;
import
org.apache.logging.log4j.LogManager
;
import
org.apache.logging.log4j.Logger
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.media_data_crawler.entity.JianshuUser
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
okhttp3.Response
;
/**
 * Crawler for Jianshu (jianshu.com); currently collects the users returned by
 * the site's search endpoint for a keyword.
 *
 * @author byte-zbs
 * @version 1.0.0
 */
public class JianshuCrawler {

    private static final Logger logger = LogManager.getLogger(JianshuCrawler.class);

    private static final HttpBoot httpBoot = new HttpBoot(false, 2);

    /** Give up after this many failed attempts in a row (rate limit or exception). */
    private static final int MAX_CONSECUTIVE_FAILURES = 5;

    /** Base pause between retries; multiplied by the number of consecutive failures. */
    private static final long RETRY_BACKOFF_MILLIS = 2000L;

    /**
     * Searches Jianshu users by keyword, page by page, and returns everything
     * collected so far when paging ends or when too many attempts fail in a row.
     *
     * <p>Paging stops when the response has no {@code entries} array, when the
     * page counter runs ahead of the collected results ({@code page * 10 >
     * collected + 30}), or after {@value #MAX_CONSECUTIVE_FAILURES} consecutive
     * failures. Failed attempts are retried with a growing pause instead of
     * immediately, so a persistent error can no longer loop forever.
     *
     * @param word   search keyword
     * @param cookie value sent as the {@code cookie} request header
     * @return the users collected; never {@code null}, possibly empty
     */
    public static List<JianshuUser> getUserList(String word, String cookie) {
        List<JianshuUser> jsList = new ArrayList<>();

        // The headers do not change between pages, so build them once.
        Map<String, Object> headers = new HashMap<>();
        headers.put("cookie", cookie);
        headers.put("origin", "https://www.jianshu.com");
        headers.put("accept", "application/json");
        headers.put("user-agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36");

        int page = 1;
        int failures = 0;
        while (true) {
            // NOTE(review): a query parameter would normally be URL-encoded, not decoded;
            // left as-is because URLCodeUtil's API is not visible here - confirm.
            String url = "https://www.jianshu.com/search/do?q=" + URLCodeUtil.getURLDecode(word, "utf-8")
                    + "&type=user&page=" + page + "&order_by=default";
            boolean failed = false;
            try (Response response = httpBoot.syncCall(RequestUtils.wrapPost(url, headers, null), ProxyHolder.NAT_PROXY)) {
                String result = response.body().string();
                logger.debug("{}", result);
                if (result.contains("搜索过于频繁")) {
                    // Rate limited: retry the same page, but only after backing off below.
                    failed = true;
                    logger.warn("简书搜索过于频繁 关键词 {} 第 {} 页, 稍后重试", word, page);
                } else {
                    JSONObject json = JSONObject.parseObject(result);
                    JSONArray jsonArray = json == null ? null : json.getJSONArray("entries");
                    if (Objects.isNull(jsonArray)) {
                        break;
                    }
                    for (int i = 0; i < jsonArray.size(); i++) {
                        jsList.add(toUser(jsonArray.getJSONObject(i)));
                    }
                    failures = 0;
                    logger.info("{} 页 一共采集到 {} 关键词 {}", page, jsList.size(), word);
                    page++;
                    if (page * 10 > jsList.size() + 30) {
                        break;
                    }
                }
            } catch (Exception e) {
                failed = true;
                // The throwable is passed beyond the placeholders so the stack trace is logged.
                logger.error("简书用户采集出错 关键词 {} 第 {} 页", word, page, e);
            }

            if (failed) {
                failures++;
                // Pause outside the try block so the response is already closed while waiting.
                if (failures >= MAX_CONSECUTIVE_FAILURES || !pause(RETRY_BACKOFF_MILLIS * failures)) {
                    logger.warn("简书用户采集连续失败 {} 次, 停止采集 关键词 {}", failures, word);
                    break;
                }
            }
        }
        return jsList;
    }

    /** Maps one element of the {@code entries} array to a {@link JianshuUser}. */
    private static JianshuUser toUser(JSONObject data) {
        JianshuUser jsu = new JianshuUser();
        jsu.setId(data.getString("id"));
        jsu.setName(data.getString("nickname"));
        jsu.setUrl("https://www.jianshu.com/u/" + data.getString("slug"));
        jsu.setImgUrl(data.getString("avatar_url"));
        // getIntValue yields 0 for a missing key; getInteger would return null and
        // throw a NullPointerException when unboxed into the int setters.
        jsu.setFensi(data.getIntValue("followers_count"));
        jsu.setGuangzhu(data.getIntValue("following_users_count"));
        jsu.setZishu(data.getIntValue("total_wordage"));
        jsu.setLike(data.getIntValue("total_likes_count"));
        // NOTE(review): presumably the article count is "public_notes_count" - TODO confirm
        // against a real response; a missing key simply leaves articles at 0.
        jsu.setArticles(data.getIntValue("public_notes_count"));
        return jsu;
    }

    /**
     * Sleeps for the given time.
     *
     * @return {@code false} if the thread was interrupted (interrupt flag restored)
     */
    private static boolean pause(long millis) {
        try {
            Thread.sleep(millis);
            return true;
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            return false;
        }
    }

    public static void main(String[] args) {
    }
}
src/main/java/com/zhiwei/media_data_crawler/entity/JianshuUser.java
0 → 100644
View file @
47654569
package
com
.
zhiwei
.
media_data_crawler
.
entity
;
/**
 * Plain mutable data holder for one Jianshu user.
 */
public class JianshuUser {

    private String id;
    private String name;
    private String url;
    private String imgUrl;

    /** Followers. */
    private int fensi;

    /** Following. */
    private int guangzhu;

    /** Number of articles. */
    private int articles;

    /** Number of characters written. */
    private int zishu;

    /** Number of likes. */
    private int like;

    public String getId() {
        return this.id;
    }

    public void setId(String id) {
        this.id = id;
    }

    public String getName() {
        return this.name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getUrl() {
        return this.url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getImgUrl() {
        return this.imgUrl;
    }

    public void setImgUrl(String imgUrl) {
        this.imgUrl = imgUrl;
    }

    public int getFensi() {
        return this.fensi;
    }

    public void setFensi(int fensi) {
        this.fensi = fensi;
    }

    public int getGuangzhu() {
        return this.guangzhu;
    }

    public void setGuangzhu(int guangzhu) {
        this.guangzhu = guangzhu;
    }

    public int getArticles() {
        return this.articles;
    }

    public void setArticles(int articles) {
        this.articles = articles;
    }

    public int getZishu() {
        return this.zishu;
    }

    public void setZishu(int zishu) {
        this.zishu = zishu;
    }

    public int getLike() {
        return this.like;
    }

    public void setLike(int like) {
        this.like = like;
    }

    /** Renders every field as {@code JianshuUser [field=value, ...]}. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("JianshuUser [");
        text.append("id=").append(this.id);
        text.append(", name=").append(this.name);
        text.append(", url=").append(this.url);
        text.append(", imgUrl=").append(this.imgUrl);
        text.append(", fensi=").append(this.fensi);
        text.append(", guangzhu=").append(this.guangzhu);
        text.append(", articles=").append(this.articles);
        text.append(", zishu=").append(this.zishu);
        text.append(", like=").append(this.like);
        return text.append("]").toString();
    }
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment