Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
D
discover-mediaself-account
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
discover-mediaself-account
Commits
8d7380c6
Commit
8d7380c6
authored
Mar 21, 2019
by
zhiwei
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
自媒体号采集
parents
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
437 additions
and
0 deletions
+437
-0
pom.xml
+27
-0
src/main/java/com/zhiwei/account/crawler/SohuAccount.java
+91
-0
src/main/java/com/zhiwei/account/crawler/UCAccount.java
+94
-0
src/main/java/com/zhiwei/account/crawler/YiDianZiXunAccount.java
+203
-0
src/main/resources/log4j2.xml
+22
-0
No files found.
pom.xml
0 → 100644
View file @
8d7380c6
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.zhiwei.account</groupId>
  <artifactId>discover-mediaself-account</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <description>根据关键词查询相应的账号数据</description>

  <properties>
    <!-- The sources contain non-ASCII string literals; pin the encoding so the
         build does not depend on the platform default charset.
         NOTE(review): assumes the .java files are stored as UTF-8 - confirm. -->
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <!-- The crawler classes use the diamond operator and java.util.Objects.nonNull,
         so the language level must be set explicitly instead of relying on the
         compiler plugin's default. -->
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
  </properties>

  <dependencies>
    <!-- HTTP client, proxy handling and request helpers used by the crawlers. -->
    <dependency>
      <groupId>com.zhiwei.crawler</groupId>
      <artifactId>crawler-core</artifactId>
      <version>0.3.0-RELEASE</version>
    </dependency>
    <!-- URL-encoding and HTML-stripping helpers. -->
    <dependency>
      <groupId>com.zhiwei.tools</groupId>
      <artifactId>zhiwei-tools</artifactId>
      <version>0.1.2-SNAPSHOT</version>
    </dependency>
    <!-- Excel export helper. -->
    <dependency>
      <groupId>com.zhiwei</groupId>
      <artifactId>excelpoi</artifactId>
      <version>0.0.3-SNAPSHOT</version>
    </dependency>
  </dependencies>
</project>
\ No newline at end of file
src/main/java/com/zhiwei/account/crawler/SohuAccount.java
0 → 100644
View file @
8d7380c6
package
com
.
zhiwei
.
account
.
crawler
;
import
java.io.IOException
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
org.apache.commons.lang3.StringUtils
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
/**
 * Collects Sohu media accounts that match a search keyword.
 *
 * @author qq859
 */
public class SohuAccount {

    /** HTTP client shared by every request issued from this class. */
    private static final HttpBoot httpBoot = new HttpBoot();

    /** First argument passed to {@code ProxyFactory.init} in {@link #main(String[])}. */
    private static final String registry = "zookeeper://192.168.0.36:2181";

    /** Second argument passed to {@code ProxyFactory.init} in {@link #main(String[])}. */
    private static final String group = "local";

    /** Value of the {@code size} query parameter and step of the {@code from} offset. */
    private static final int PAGE_SIZE = 50;

    /**
     * Manual entry point: initialises the proxy factory and runs one search
     * for a fixed sample keyword, printing every account that is found.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        ProxyFactory.init(registry, group, GroupType.PROVIDER);
        String word = "京东";
        try {
            SohuAccount.getSohuAccountByWord(word);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Searches Sohu for media accounts matching the keyword and pages through
     * the results until the service reports the end ({@code esEnd}), returns an
     * empty page, or returns a body without any {@code userName} entry.
     *
     * <p>Each account is returned as a map with the keys {@code link},
     * {@code id}, {@code mail}, {@code name}, {@code description},
     * {@code avatorUrl}, {@code totalReadNum} and {@code newsCount}. Values are
     * {@code null} when the response does not carry the corresponding field.
     *
     * @param word search keyword (URL-encoded by this method)
     * @return the accounts collected so far; never {@code null}, possibly empty
     * @throws IOException if a request fails or a response body cannot be read
     */
    public static List<Map<String, Object>> getSohuAccountByWord(String word) throws IOException {
        List<Map<String, Object>> dataList = new ArrayList<>();
        // The keyword does not change between pages, so encode it once.
        String encodedWord = URLCodeUtil.getURLEncode(word, "utf-8");
        int page = 0;
        while (true) {
            String url = "http://search.sohu.com/search/meta?keyword=" + encodedWord
                    + "&spm-pre=smpc.csrpage.0.0.15522844808206u49PLo&from=" + (page * PAGE_SIZE)
                    + "&size=" + PAGE_SIZE + "&searchType=media&queryType=edit";
            System.out.println("page============" + page);
            String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyHolder.NAT_PROXY).body().string();
            if (StringUtils.isBlank(htmlBody) || !htmlBody.contains("userName")) {
                break;
            }
            // The substring check above does not guarantee the expected structure,
            // so both the "data" object and its "media" array may still be absent.
            JSONObject dataJson = JSONObject.parseObject(htmlBody).getJSONObject("data");
            JSONArray mediaArray = dataJson == null ? null : dataJson.getJSONArray("media");
            if (mediaArray == null || mediaArray.isEmpty()) {
                break;
            }
            for (int i = 0; i < mediaArray.size(); i++) {
                Map<String, Object> dataMap = toAccountMap(mediaArray.getJSONObject(i));
                System.out.println(dataMap);
                dataList.add(dataMap);
            }
            if (dataJson.getBooleanValue("esEnd")) {
                break;
            }
            page++;
        }
        return dataList;
    }

    /** Copies the fields of one {@code media} entry into a plain map. */
    private static Map<String, Object> toAccountMap(JSONObject json) {
        Map<String, Object> dataMap = new HashMap<String, Object>();
        dataMap.put("link", json.getString("weiboUrl"));
        dataMap.put("id", json.getInteger("id"));
        dataMap.put("mail", json.getString("passport"));
        dataMap.put("name", json.getString("userName"));
        dataMap.put("description", json.getString("description"));
        dataMap.put("avatorUrl", json.getString("avatorUrl"));
        // An entry without statistics must not abort the whole crawl.
        JSONObject scoreMap = json.getJSONObject("scoreMap");
        dataMap.put("totalReadNum", scoreMap == null ? null : scoreMap.getInteger("totalPv"));
        dataMap.put("newsCount", scoreMap == null ? null : scoreMap.getInteger("newsCount"));
        return dataMap;
    }
}
src/main/java/com/zhiwei/account/crawler/UCAccount.java
0 → 100644
View file @
8d7380c6
package
com
.
zhiwei
.
account
.
crawler
;
import
java.io.IOException
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
org.apache.commons.lang3.StringUtils
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.select.Elements
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
import
com.zhiwei.tools.tools.ZhiWeiTools
;
/**
 * Collects UC Dayu accounts that match a search keyword.
 *
 * @author qq859
 */
public class UCAccount {

    /** HTTP client shared by every request issued from this class. */
    private static final HttpBoot httpBoot = new HttpBoot();

    /** First argument passed to {@code ProxyFactory.init} in {@link #main(String[])}. */
    private static final String registry = "zookeeper://192.168.0.36:2181";

    /** Second argument passed to {@code ProxyFactory.init} in {@link #main(String[])}. */
    private static final String group = "local";

    /**
     * Manual entry point: initialises the proxy factory and runs one search
     * for a fixed sample keyword, printing every account that is found.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        ProxyFactory.init(registry, group, GroupType.PROVIDER);
        String word = "京东";
        try {
            UCAccount.getUCAccountByWord(word);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Searches the subscription feed for accounts matching the keyword,
     * starting at page 1 and advancing until a response has no
     * {@code feed_html} content or yields no {@code cell-wrapper} elements.
     *
     * <p>Each account is returned as a map with the keys {@code link},
     * {@code name}, {@code avatorUrl}, {@code summary} and
     * {@code follow_count} (the follower text with its suffix removed).
     *
     * @param word search keyword (URL-encoded by this method)
     * @return the accounts collected so far; never {@code null}, possibly empty
     * @throws IOException if a request fails or a response body cannot be read
     */
    public static List<Map<String, Object>> getUCAccountByWord(String word) throws IOException {
        List<Map<String, Object>> dataList = new ArrayList<>();
        // The keyword does not change between pages, so encode it once.
        String encodedWord = URLCodeUtil.getURLEncode(word, "utf-8");
        int page = 1;
        while (true) {
            String url = "https://m.sm.cn/api/rest?method=Subscribe.feed&q=" + encodedWord
                    + "&format=json&by=submit&snum=0&page=" + page;
            System.out.println("page============" + page);
            String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyHolder.NAT_PROXY).body().string();
            if (StringUtils.isBlank(htmlBody) || !htmlBody.contains("feed_html")) {
                break;
            }
            // The substring check above does not guarantee the expected structure:
            // "data" may be missing and "feed_html" may be null, which Jsoup rejects.
            JSONObject dataJson = JSONObject.parseObject(htmlBody).getJSONObject("data");
            String dataText = dataJson == null ? null : dataJson.getString("feed_html");
            if (dataText == null) {
                break;
            }
            Elements elements = Jsoup.parse(dataText).getElementsByClass("cell-wrapper");
            if (elements.isEmpty()) {
                break;
            }
            for (Element element : elements) {
                Map<String, Object> dataMap = toAccountMap(element);
                System.out.println(dataMap);
                dataList.add(dataMap);
            }
            page++;
        }
        return dataList;
    }

    /** Extracts the account fields from one {@code cell-wrapper} element. */
    private static Map<String, Object> toAccountMap(Element element) {
        String name = element.select("div.info>p.title").text().replace("大鱼号", "");
        String description = element.select("div.info>p.summary").text();
        String avatorUrl = element.select("div.img").attr("data-image");
        String link = element.select("a.cell").attr("href");
        String followCount = element.select("div.info>div.icons>span").text().replace("人关注", "");

        Map<String, Object> dataMap = new HashMap<String, Object>();
        dataMap.put("link", link);
        dataMap.put("name", name);
        dataMap.put("avatorUrl", avatorUrl);
        dataMap.put("summary", ZhiWeiTools.delHTMLTag(description));
        dataMap.put("follow_count", followCount);
        return dataMap;
    }
}
src/main/java/com/zhiwei/account/crawler/YiDianZiXunAccount.java
0 → 100644
View file @
8d7380c6
package
com
.
zhiwei
.
account
.
crawler
;
import
java.io.IOException
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
java.util.Objects
;
import
javax.script.Invocable
;
import
javax.script.ScriptEngine
;
import
javax.script.ScriptEngineManager
;
import
org.apache.commons.lang3.StringUtils
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.common.config.GroupType
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.excelpoi.excel.PoiExcelUtil
;
import
com.zhiwei.tools.tools.URLCodeUtil
;
/**
 * Collects Yidian Zixun accounts, either by search keyword or by walking
 * the channels listed on the site's media-list page.
 *
 * @author qq859
 */
public class YiDianZiXunAccount {

    /** HTTP client shared by every request issued from this class. */
    private static final HttpBoot httpBoot = new HttpBoot();

    /** First argument passed to {@code ProxyFactory.init} in {@link #main(String[])}. */
    private static final String registry = "zookeeper://192.168.0.36:2181";

    /** Second argument passed to {@code ProxyFactory.init} in {@link #main(String[])}. */
    private static final String group = "local";

    /** Text that precedes the embedded JSON document on the media-list page. */
    private static final String DOC_INFO_MARKER = "window.yidian.docinfo = ";

    /** Upper-case hex digits used for percent-encoding. */
    private static final char[] HEX = "0123456789ABCDEF".toCharArray();

    /**
     * Manual entry point: initialises the proxy factory, collects the accounts
     * of every channel on the media-list page and exports them to an Excel file.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        ProxyFactory.init(registry, group, GroupType.PROVIDER);
        try {
            List<Map<String, Object>> bodyList = YiDianZiXunAccount.getYiDianZiXunAccountByType();
            System.out.println("bodyList size is :" + bodyList.size());
            // Column order of the exported sheet; the names match the map keys
            // produced by getYidianAccountInfo.
            List<String> headList = new ArrayList<>();
            headList.add("media_id");
            headList.add("media_name");
            headList.add("avatorUrl");
            headList.add("userid");
            headList.add("postcount");
            headList.add("bookcount");
            headList.add("media_domain");
            headList.add("authentication");
            headList.add("summary");
            PoiExcelUtil poi = PoiExcelUtil.getInstance();
            String path = "D://一点号账号信息.xlsx";
            poi.exportExcel(path, "账号信息", headList, bodyList);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Searches channels by keyword and fetches the account details of every
     * channel in the result that carries an {@code id}.
     *
     * @param word search keyword (URL-encoded by this method)
     * @return one map per channel (see {@code getYidianAccountInfo} for the keys);
     *         never {@code null}, empty when the response contains no channels,
     *         consistent with the other crawlers in this package
     * @throws IOException if the search request fails or its body cannot be read
     */
    public static List<Map<String, Object>> getYiDianZiXunAccountByWord(String word) throws IOException {
        List<Map<String, Object>> dataList = new ArrayList<>();
        String url = "http://www.yidianzixun.com/home/q/search_channel?word="
                + URLCodeUtil.getURLEncode(word, "utf-8") + "&&appid=web_yidian";
        String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyHolder.NAT_PROXY).body().string();
        if (StringUtils.isBlank(htmlBody) || !htmlBody.contains("channels")) {
            return dataList;
        }
        collectAccounts(JSONObject.parseObject(htmlBody).getJSONArray("channels"), dataList);
        return dataList;
    }

    /**
     * Reads the category/channel tree embedded in the media-list page and
     * fetches the account details of every channel that carries an {@code id}.
     *
     * @return one map per channel (see {@code getYidianAccountInfo} for the keys);
     *         never {@code null}, empty when the page does not contain the
     *         expected embedded document
     * @throws IOException if the page request fails or its body cannot be read
     */
    public static List<Map<String, Object>> getYiDianZiXunAccountByType() throws IOException {
        List<Map<String, Object>> dataList = new ArrayList<>();
        String url = "http://www.yidianzixun.com/medialist";
        String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), ProxyHolder.NAT_PROXY).body().string();
        if (StringUtils.isBlank(htmlBody) || !htmlBody.contains("channels")) {
            return dataList;
        }
        // Cut out the JSON between the marker and the end of its <script> element.
        // A page without the marker is treated as "no data" instead of failing
        // with an ArrayIndexOutOfBoundsException.
        int start = htmlBody.indexOf(DOC_INFO_MARKER);
        if (start < 0) {
            return dataList;
        }
        start += DOC_INFO_MARKER.length();
        int end = htmlBody.indexOf("</script>", start);
        String docInfo = end < 0 ? htmlBody.substring(start) : htmlBody.substring(start, end);

        JSONObject docJson = JSONObject.parseObject(docInfo);
        JSONArray categories = docJson == null ? null : docJson.getJSONArray("categories");
        if (categories == null) {
            return dataList;
        }
        for (int i = 0; i < categories.size(); i++) {
            JSONObject category = categories.getJSONObject(i);
            if (category != null) {
                collectAccounts(category.getJSONArray("channels"), dataList);
            }
        }
        return dataList;
    }

    /**
     * Fetches the account details for every entry of {@code channels} that has
     * an {@code id}, prints each result and appends it to {@code dataList}.
     * A {@code null} array is ignored.
     */
    private static void collectAccounts(JSONArray channels, List<Map<String, Object>> dataList) {
        if (channels == null) {
            return;
        }
        for (int i = 0; i < channels.size(); i++) {
            JSONObject json = channels.getJSONObject(i);
            if (json != null && json.containsKey("id")) {
                Map<String, Object> dataMap = getYidianAccountInfo(json.getString("id"));
                System.out.println(dataMap);
                dataList.add(dataMap);
            }
        }
    }

    /**
     * Fetches the details of one account by channel id.
     *
     * <p>The result uses the keys {@code bookcount}, {@code media_id},
     * {@code media_name}, {@code avatorUrl}, {@code summary} and, when the
     * response has a {@code channel_media} object, {@code authentication},
     * {@code media_domain}, {@code postcount} and {@code userid}. Fields missing
     * from the response are stored as {@code null} instead of aborting the crawl.
     *
     * @param id channel id
     * @return the account details; an empty map when the request fails or the
     *         response does not mention {@code channel_media}
     */
    private static Map<String, Object> getYidianAccountInfo(String id) {
        Map<String, Object> dataMap = new HashMap<String, Object>();
        String url = "http://www.yidianzixun.com" + getSpt(id, 0, 10) + "&appid=web_yidian";
        Map<String, Object> headMap = new HashMap<String, Object>();
        headMap.put("Referer", "http://www.yidianzixun.com/channel/" + id);
        try {
            String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headMap), ProxyHolder.NAT_PROXY).body().string();
            if (StringUtils.isBlank(htmlBody) || !htmlBody.contains("channel_media")) {
                return dataMap;
            }
            JSONObject dataJson = JSONObject.parseObject(htmlBody);
            dataMap.put("bookcount", parseBookCount(dataJson.getString("bookcount")));
            dataMap.put("media_id", dataJson.getString("channel_id"));
            dataMap.put("media_name", dataJson.getString("channel_name"));
            dataMap.put("avatorUrl", dataJson.getString("channel_image"));
            dataMap.put("summary", dataJson.getString("channel_summary"));

            JSONObject channelMedia = dataJson.getJSONObject("channel_media");
            if (channelMedia != null) {
                dataMap.put("authentication", channelMedia.getString("authentication"));
                dataMap.put("media_domain", channelMedia.getString("media_domain"));
                // Kept boxed on purpose: unboxing a missing value would throw.
                dataMap.put("postcount", channelMedia.getInteger("postcount"));
                dataMap.put("userid", channelMedia.getLong("userid"));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return dataMap;
    }

    /**
     * Converts the subscriber text of a response into a number: the
     * {@code 人订阅} suffix is dropped and a value carrying the {@code 万} unit is
     * multiplied by 10000 (rounded, so binary floating-point error cannot
     * shave one subscriber off the result).
     *
     * @param raw subscriber text, may be {@code null}
     * @return the subscriber count, or 0 when the text is missing or not numeric
     */
    private static int parseBookCount(String raw) {
        if (StringUtils.isBlank(raw)) {
            return 0;
        }
        String text = raw.replace("人订阅", "");
        try {
            if (text.contains("万")) {
                return (int) Math.round(Double.parseDouble(text.replace("万", "")) * 10000);
            }
            return (int) Double.parseDouble(text);
        } catch (NumberFormatException e) {
            System.err.println("Unparseable bookcount: " + raw);
            return 0;
        }
    }

    /**
     * Builds the channel news-list path including its {@code _spt} parameter.
     *
     * <p>The token is the string {@code "sptoken" + channel_id + cstart + cend}
     * with every UTF-16 code unit XOR-ed with 10, then percent-encoded the way
     * JavaScript's {@code encodeURIComponent} does. This is a pure-Java port of
     * the script this method previously evaluated through
     * {@code javax.script}, so it no longer depends on a JavaScript engine
     * being available in the running JDK and never returns {@code null}.
     *
     * @param channel_id channel id
     * @param cstart     start offset, used in the path and in the token
     * @param cend       end offset, used in the token only
     * @return the request path and query string, ending with the {@code _spt} parameter
     */
    private static String getSpt(String channel_id, int cstart, int cend) {
        // NOTE(review): the path sends cstart + 10 as cend while the token is
        // derived from the cend argument; the only caller passes (0, 10) so the
        // two agree - confirm before calling this with other values.
        String path = "/home/q/news_list_for_channel?channel_id=" + channel_id + "&cstart=" + cstart
                + "&cend=" + (cstart + 10) + "&infinite=true&refresh=1&__from__=pc&multi=5";
        String seed = "sptoken" + channel_id + cstart + cend;
        StringBuilder token = new StringBuilder(seed.length());
        for (int i = 0; i < seed.length(); i++) {
            token.append((char) (10 ^ seed.charAt(i)));
        }
        String separator = path.indexOf('?') >= 0 ? "&_spt=" : "?_spt=";
        return path + separator + encodeUriComponent(token.toString());
    }

    /**
     * Percent-encodes a string like JavaScript's {@code encodeURIComponent}:
     * ASCII letters, digits and {@code - _ . ! ~ * ' ( )} are kept, every other
     * UTF-8 byte becomes {@code %XX} with upper-case hex digits.
     */
    private static String encodeUriComponent(String value) {
        StringBuilder out = new StringBuilder(value.length() * 3);
        // Fully qualified so this class needs no additional import.
        for (byte b : value.getBytes(java.nio.charset.StandardCharsets.UTF_8)) {
            int c = b & 0xFF;
            boolean unreserved = (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
                    || (c >= '0' && c <= '9') || "-_.!~*'()".indexOf(c) >= 0;
            if (unreserved) {
                out.append((char) c);
            } else {
                out.append('%').append(HEX[c >> 4]).append(HEX[c & 0x0F]);
            }
        }
        return out.toString();
    }
}
src/main/resources/log4j2.xml
0 → 100644
View file @
8d7380c6
<?xml version="1.0" encoding="UTF-8"?>
<!-- Log level of log4j2's own internal status messages -->
<Configuration
status=
"WARN"
>
<Appenders>
<!-- Defines where log output is written -->
<Console
name=
"Console"
target=
"SYSTEM_OUT"
>
<PatternLayout
pattern=
"%d{yyyy-MM-dd HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n"
/>
</Console>
</Appenders>
<Loggers>
<Root
level=
"info"
>
<AppenderRef
ref=
"Console"
/>
</Root>
<!-- Every logger inherits from Root.
When additivity is true, both the parent and the child logger print the event.
When additivity is false, only the child logger prints it. -->
<Logger
name =
"mylog"
level=
"error"
additivity=
"false"
>
<AppenderRef
ref=
"Console"
/>
</Logger>
</Loggers>
</Configuration>
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment