Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
source_forward
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
source_forward
Commits
849ef5fe
Commit
849ef5fe
authored
Sep 19, 2018
by
yangchen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
修改判断链接是否存活判断
parent
98e0d120
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
227 additions
and
28 deletions
+227
-28
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
+31
-7
src/main/java/com/zhiwei/source_forward/run/URLLive.java
+1
-1
src/main/java/com/zhiwei/source_forward/util/ReadMediaData.java
+159
-0
src/test/java/com/zhiwei/source_forward/sourceforward/test/URLLiveTest.java
+36
-20
No files found.
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
View file @
849ef5fe
...
@@ -74,12 +74,16 @@ public class UrlLiveCrawler {
...
@@ -74,12 +74,16 @@ public class UrlLiveCrawler {
}
}
}
}
}
else
{
}
else
{
if
(
attr
.
getCount
()
>
3
)
{
if
(
future
.
cause
().
getMessage
().
contains
(
"status code: 301"
))
{
callBack
(
callback
,
attr
,
-
1
);
callBack
(
callback
,
attr
,
1
);
logger
.
info
(
"{} 搜索结果访问失败: {}"
,
request
.
url
().
url
(),
future
.
cause
().
getMessage
());
}
else
{
}
else
{
attr
.
AddCount
();
if
(
attr
.
getCount
()
>
3
)
{
search
(
counter
,
attr
.
getAttr
().
toString
(),
attr
,
callback
);
callBack
(
callback
,
attr
,
-
1
);
logger
.
info
(
"{} 搜索结果访问失败: {}"
,
request
.
url
().
url
(),
future
.
cause
().
getMessage
());
}
else
{
attr
.
AddCount
();
search
(
counter
,
attr
.
getAttr
().
toString
(),
attr
,
callback
);
}
}
}
}
}
counter
.
reduce
();
counter
.
reduce
();
...
@@ -116,6 +120,12 @@ public class UrlLiveCrawler {
...
@@ -116,6 +120,12 @@ public class UrlLiveCrawler {
logger
.
info
(
"url 解析出错 {}"
,
url
);
logger
.
info
(
"url 解析出错 {}"
,
url
);
return
url
;
return
url
;
}
}
}
else
if
(
url
.
contains
(
"mp.weixin.qq.com"
))
{
if
(
url
.
contains
(
"https"
))
{
}
else
{
url
=
url
.
replace
(
"http"
,
"https"
);
}
}
}
return
url
;
return
url
;
}
}
...
@@ -206,6 +216,11 @@ public class UrlLiveCrawler {
...
@@ -206,6 +216,11 @@ public class UrlLiveCrawler {
logger
.
info
(
"{}检测规则:第{}步"
,
url
,
step
);
logger
.
info
(
"{}检测规则:第{}步"
,
url
,
step
);
return
true
;
return
true
;
}
}
step
++;
if
(
rulerWechatWeigui
(
doc
))
{
logger
.
info
(
"{}检测规则:第{}步"
,
url
,
step
);
return
true
;
}
return
false
;
return
false
;
}
}
...
@@ -240,13 +255,22 @@ public class UrlLiveCrawler {
...
@@ -240,13 +255,22 @@ public class UrlLiveCrawler {
private
boolean
rulerWeigui
(
Document
doc
)
private
boolean
rulerWeigui
(
Document
doc
)
{
{
boolean
flg
=
false
;
boolean
flg
=
false
;
if
((
doc
.
select
(
"p.title"
).
text
()).
contains
(
"此内容因违规无法查看"
))
if
((
doc
.
select
(
"p.title"
).
text
()).
contains
(
"此内容因违规无法查看"
)
||
doc
.
select
(
"p.title"
).
text
().
contains
(
"此帐号在冻结期,内容无法查看"
)
)
{
{
flg
=
true
;
flg
=
true
;
}
}
return
flg
;
return
flg
;
}
}
/**
 * Rule for WeChat pages whose content was taken down after a complaint.
 *
 * Reads the text of every {@code h3.msg-title} element in the document and
 * reports whether it contains the notice
 * "此内容被投诉且经审核涉嫌侵权,无法查看" (roughly: "this content was reported
 * and, after review, is suspected of infringement; it cannot be viewed").
 *
 * @param doc parsed page to inspect (presumably a Jsoup document - confirm
 *            against the file's imports)
 * @return {@code true} when the notice is present, {@code false} otherwise
 */
private boolean rulerWechatWeigui(Document doc) {
    String messageTitle = doc.select("h3.msg-title").text();
    return messageTitle.contains("此内容被投诉且经审核涉嫌侵权,无法查看");
}
/**
/**
*
*
* ( 微信内容违规的无效网址筛选规则)
* ( 微信内容违规的无效网址筛选规则)
...
...
src/main/java/com/zhiwei/source_forward/run/URLLive.java
View file @
849ef5fe
...
@@ -66,7 +66,7 @@ public class URLLive {
...
@@ -66,7 +66,7 @@ public class URLLive {
public
static
void
main
(
String
[]
args
)
{
public
static
void
main
(
String
[]
args
)
{
List
<
String
>
urlList
=
new
ArrayList
<>();
List
<
String
>
urlList
=
new
ArrayList
<>();
urlList
.
add
(
"http://
www.zyzpes.com/toutiao/5048828/20180419A1AFBC00.html
"
);
urlList
.
add
(
"http://
mp.weixin.qq.com/s?__biz=MzA3MzY1NjMxMw==&mid=2652054872&idx=1&sn=d67630a6b55d0eebd353cc90242fd784&3rd=MzA3MDU4NTYzMw==&scene=6#rd
"
);
List
<
UrlLiveBean
>
u
=
URLLive
.
verificationURLLive
(
urlList
);
List
<
UrlLiveBean
>
u
=
URLLive
.
verificationURLLive
(
urlList
);
for
(
UrlLiveBean
b
:
u
)
{
for
(
UrlLiveBean
b
:
u
)
{
System
.
out
.
println
(
b
.
toString
());
System
.
out
.
println
(
b
.
toString
());
...
...
src/main/java/com/zhiwei/source_forward/util/ReadMediaData.java
0 → 100644
View file @
849ef5fe
package
com
.
zhiwei
.
source_forward
.
util
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
import
com.zhiwei.source_forward.util.SourceData
;
/**
 * Helpers that turn spreadsheet-style rows (one {@code Map<String, Object>}
 * per row, keyed by column header) into URL-indexed or category-indexed
 * collections.
 *
 * <p>Shared conventions for all three entry points:
 * <ul>
 *   <li>a {@code null} row list yields {@code null} (the historical contract);</li>
 *   <li>{@code null} rows and rows without a link are skipped instead of
 *       failing the whole batch;</li>
 *   <li>links that contain {@code sh.qihoo.com} and {@code &url=} are replaced
 *       by the target that follows the first {@code &url=};</li>
 *   <li>the row maps are stored by reference, never copied.</li>
 * </ul>
 */
public class ReadMediaData {

    /** Column holding the media source name. */
    private static final String KEY_SOURCE = "来源";
    /** Column holding the article link (used by getUrl / getUrlJD). */
    private static final String KEY_LINK = "链接";
    /** Column holding the platform name. */
    private static final String KEY_PLATFORM = "平台";
    /** Column holding the link in the sheets consumed by getUrlLive. */
    private static final String KEY_ADDRESS = "地址";
    /** Column holding the remark written back onto classified rows. */
    private static final String KEY_REMARK = "备注";

    /** Source name used for Toutiao rows. */
    private static final String TOUTIAO_NAME = "今日头条";

    private static final String QIHOO_HOST = "sh.qihoo.com";
    private static final String QIHOO_TARGET_MARKER = "&url=";

    /** URL fragments that mark a "网络媒体" row as an app (client) article. */
    private static final String[] APP_HOSTS = {
        "baijia.baidu.com",
        "v.mp.uc.cn",
        "m.uczzd.cn",
        "a.mp.uc.cn",
        "share.iclient.ifeng.com",
        "m.ifeng.com",
        "kuaibao.qq.com",
        "sina.cn",
        "3g.163.com",
        "m.sohu.com"
    };

    /** URL fragments that mark a "网络媒体" row as a print-media article. */
    private static final String[] PAPER_HOSTS = {
        "e.xfrb.com.cn",
        "paper.people.com.cn",
        "bhsb.tjbhnews.com",
        "www.time-weekly.com",
        "bjrb.bjd.com.cn",
        "zqb.cyol.com",
        "hzdaily.hangzhou.com.cn",
        "shfinancialnews.com",
        "dz.xdkb.net",
        "njcb.xhby.net",
        "ctdsb.cnhubei.com",
        "bjwb.bjd.com.cn",
        "bjcb.morningpost.com.cn",
        "e.chengdu.cn"
    };

    /**
     * Collects the links that need to be verified, indexed by URL.
     *
     * <p>For every usable row the (possibly unwrapped) link is mapped to the
     * row itself and the row's upper-cased source is passed to
     * {@code SourceData.addUserSource} (the original comment describes this as
     * adding the source to the custom source list). When two rows share a
     * URL the later row replaces the earlier one.
     *
     * @author hero
     * @param dataList rows to read; each row is expected to carry the
     *                 "来源" (source) and "链接" (link) columns
     * @return map from URL to row, or {@code null} when {@code dataList} is
     *         {@code null} or iteration fails unexpectedly
     */
    public static Map<String, Map<String, Object>> getUrl(List<Map<String, Object>> dataList) {
        if (dataList == null) {
            return null;
        }
        try {
            Map<String, Map<String, Object>> result = new HashMap<>();
            for (Map<String, Object> dataMap : dataList) {
                if (dataMap == null) {
                    continue;
                }
                try {
                    String source = upperCased(trimmedValue(dataMap, KEY_SOURCE));
                    String url = unwrapQihooUrl(trimmedValue(dataMap, KEY_LINK));
                    if (url == null) {
                        // No link: nothing to verify for this row.
                        continue;
                    }
                    result.put(url, dataMap);
                    SourceData.addUserSource(source);
                } catch (Exception e) {
                    // Best effort per row: report the failure and keep going
                    // rather than silently dropping it.
                    e.printStackTrace();
                }
            }
            return result;
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Splits rows into per-channel collections.
     *
     * <p>The returned map always contains these entries, even for an empty
     * input:
     * <ul>
     *   <li>{@code "weibo"}, {@code "app"}, {@code "toutiao"},
     *       {@code "wechat"}, {@code "paper"} - {@code List<Map<String, Object>>};</li>
     *   <li>{@code "media"} - {@code Map<String, Map<String, Object>>} keyed by URL.</li>
     * </ul>
     *
     * <p>Classification, in order:
     * <ol>
     *   <li>URL contains {@code toutiao.com}: source becomes
     *       {@code "今日头条-" + source}; when the source was exactly
     *       "今日头条" the remark is set to "修改头条名称"; row goes to
     *       {@code toutiao}.</li>
     *   <li>Platform equals "网络媒体": app hosts go to {@code app}, print
     *       hosts go to {@code paper} (remark cleared in both cases); anything
     *       else goes to {@code media} and its source is passed to
     *       {@code SourceData.addUserSource}.</li>
     *   <li>Platform equals "微博": {@code weibo}.</li>
     *   <li>Platform contains "客户端": source becomes
     *       {@code platform + "-" + source}; {@code app}.</li>
     *   <li>Platform equals "微信公众平台": {@code wechat}.</li>
     *   <li>Platform equals "报刊" or "平媒": {@code paper}.</li>
     * </ol>
     * Rows matching none of the above (including rows without a platform) are
     * left out. Rows are modified in place.
     *
     * @param dataList rows to classify; each row is expected to carry the
     *                 "来源", "链接" and "平台" columns
     * @return the category map described above, or {@code null} when
     *         {@code dataList} is {@code null} or processing fails unexpectedly
     */
    public static Map<String, Object> getUrlJD(List<Map<String, Object>> dataList) {
        if (dataList == null) {
            return null;
        }
        try {
            List<Map<String, Object>> weiboList = new ArrayList<>();   // Weibo rows
            List<Map<String, Object>> appList = new ArrayList<>();     // app (client) rows
            List<Map<String, Object>> wechatList = new ArrayList<>();  // WeChat rows
            List<Map<String, Object>> paperList = new ArrayList<>();   // print-media rows
            List<Map<String, Object>> toutiaoList = new ArrayList<>(); // Toutiao rows
            // Online-media rows keyed by URL.
            Map<String, Map<String, Object>> mediaList = new HashMap<>();

            for (Map<String, Object> dataMap : dataList) {
                if (dataMap == null) {
                    continue;
                }
                String source = upperCased(trimmedValue(dataMap, KEY_SOURCE));
                String url = unwrapQihooUrl(trimmedValue(dataMap, KEY_LINK));
                String pt = trimmedValue(dataMap, KEY_PLATFORM);
                if (url == null) {
                    // Every rule below needs the link; skip the row instead of
                    // aborting the whole batch.
                    continue;
                }

                if (url.contains("toutiao.com")) {
                    // Concatenation kept as-is: a missing source yields the
                    // literal suffix "null", exactly as string concatenation does.
                    dataMap.put(KEY_SOURCE, TOUTIAO_NAME + "-" + source);
                    if (TOUTIAO_NAME.equals(source)) {
                        dataMap.put(KEY_REMARK, "修改头条名称");
                    }
                    toutiaoList.add(dataMap);
                } else if (pt == null) {
                    // No platform and not a Toutiao link: cannot be classified.
                    continue;
                } else if (pt.equals("网络媒体")) {
                    if (containsAny(url, APP_HOSTS)) {
                        // The link identifies an app (client) article.
                        dataMap.put(KEY_REMARK, "");
                        appList.add(dataMap);
                    } else if (containsAny(url, PAPER_HOSTS)) {
                        // The link identifies a print-media article.
                        dataMap.put(KEY_REMARK, "");
                        paperList.add(dataMap);
                    } else {
                        mediaList.put(url, dataMap);
                        SourceData.addUserSource(source);
                    }
                } else if (pt.equals("微博")) {
                    dataMap.put(KEY_REMARK, "");
                    weiboList.add(dataMap);
                } else if (pt.contains("客户端")) {
                    dataMap.put(KEY_REMARK, "");
                    dataMap.put(KEY_SOURCE, pt + "-" + source);
                    appList.add(dataMap);
                } else if (pt.equals("微信公众平台")) {
                    dataMap.put(KEY_REMARK, "");
                    wechatList.add(dataMap);
                } else if (pt.equals("报刊") || pt.equals("平媒")) {
                    dataMap.put(KEY_REMARK, "");
                    paperList.add(dataMap);
                }
            }

            // Published once, after the loop, so every key exists even when
            // dataList is empty.
            Map<String, Object> result = new HashMap<>();
            result.put("weibo", weiboList);
            result.put("media", mediaList);
            result.put("app", appList);
            result.put("toutiao", toutiaoList);
            result.put("wechat", wechatList);
            result.put("paper", paperList);
            return result;
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Indexes rows by the link stored in their "地址" column.
     *
     * <p>When two rows share a URL the later row replaces the earlier one.
     *
     * @author hero
     * @param dataList rows to index
     * @return map from URL to row, or {@code null} when {@code dataList} is
     *         {@code null} or processing fails unexpectedly
     */
    public static Map<String, Map<String, Object>> getUrlLive(List<Map<String, Object>> dataList) {
        if (dataList == null) {
            return null;
        }
        try {
            Map<String, Map<String, Object>> result = new HashMap<>();
            for (Map<String, Object> dataMap : dataList) {
                if (dataMap == null) {
                    continue;
                }
                String url = unwrapQihooUrl(trimmedValue(dataMap, KEY_ADDRESS));
                if (url == null) {
                    // No address: skip the row instead of failing the batch.
                    continue;
                }
                result.put(url, dataMap);
            }
            return result;
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /** Returns the trimmed string form of {@code row.get(key)}, or {@code null} when absent. */
    private static String trimmedValue(Map<String, Object> row, String key) {
        Object value = row.get(key);
        return value != null ? value.toString().trim() : null;
    }

    /** Upper-cases {@code text} with the default locale, passing {@code null} through. */
    private static String upperCased(String text) {
        return text != null ? text.toUpperCase() : null;
    }

    /**
     * Returns the target of a sh.qihoo.com redirect link (the segment after
     * the first {@code &url=}); any other input, including {@code null} and
     * a redirect link with nothing after the marker, is returned unchanged.
     */
    private static String unwrapQihooUrl(String url) {
        if (url == null || !url.contains(QIHOO_HOST) || !url.contains(QIHOO_TARGET_MARKER)) {
            return url;
        }
        // split() drops trailing empty strings, so a link ending in "&url="
        // produces a single element; guard the index instead of throwing.
        String[] parts = url.split(QIHOO_TARGET_MARKER);
        return parts.length > 1 ? parts[1] : url;
    }

    /** Reports whether {@code url} contains at least one of {@code fragments}. */
    private static boolean containsAny(String url, String[] fragments) {
        for (String fragment : fragments) {
            if (url.contains(fragment)) {
                return true;
            }
        }
        return false;
    }
}
src/test/java/com/zhiwei/source_forward/sourceforward/test/URLLiveTest.java
View file @
849ef5fe
//package com.zhiwei.source_forward.sourceforward.test;
//package com.zhiwei.source_forward.sourceforward.test;
//
//
//import java.util.ArrayList;
//import java.util.ArrayList;
//import java.util.HashMap;
//import java.util.List;
//import java.util.List;
//import java.util.Map;
//import java.util.Map;
//import java.util.Map.Entry;
//import java.util.Map.Entry;
//
//
//import org.
junit
.Test;
//import org.
testng.annotations
.Test;
//
//
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.source_forward.bean.UrlLiveBean;
//import com.zhiwei.source_forward.run.URLLive;
//import com.zhiwei.source_forward.run.URLLive;
//import com.zhiwei.source_forward.util.ReadMediaData;
//
//
///**
///**
// * @ClassName: URLLiveTest
// * @ClassName: URLLiveTest
...
@@ -18,25 +22,37 @@
...
@@ -18,25 +22,37 @@
//public class URLLiveTest {
//public class URLLiveTest {
//
//
//
//
//// @Test
// @Test
//// public void urlLiveTest(){
// public void urlLiveTest(){
//// String path = "E://稿件汇总网媒数据//福莱网媒.xlsx";
// String path = "D://crawlerdata//链接删除2.xlsx";
//// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
//// Map<String,Object> data = poi.importExcel(path, 0);
// Map<String,Object> data = poi.importExcel(path, 0);
//// @SuppressWarnings("unchecked")
// @SuppressWarnings("unchecked")
//// List<String> headList = (List<String>)data.get("head");
// List<String> headList = (List<String>)data.get("head");
//// headList.add("是否删除");
// headList.add("是否删除");
//// @SuppressWarnings("unchecked")
// @SuppressWarnings("unchecked")
//// List<Map<String,Object>> dataList = (List<Map<String,Object>>)data.get("body");
// List<Map<String,Object>> dataList = (List<Map<String,Object>>)data.get("body");
//// Map<String,Map<String,Object>> dataMap = ReadMediaData.getUrlLive(dataList);
// List<String> uList = new ArrayList<>();
//// dataMap = URLLive.verificationURLLive(dataMap);
// for(Map<String,Object> m : dataList) {
////
// uList.add(m.get("地址").toString());
//// List<Map<String,Object>> bodyList = new ArrayList<>();
// }
//// for(Entry<String,Map<String,Object>> dataEntry : dataMap.entrySet()){
// List<UrlLiveBean> lb = URLLive.verificationURLLive(uList);
//// bodyList.add(dataEntry.getValue());
//
//// }
// List<Map<String,Object>> bodyList = new ArrayList<>();
//// poi.exportExcel(path ,"匹配后数据", headList, bodyList);
// for(UrlLiveBean dataEntry : lb){
//// }
// Map<String,Object> map = new HashMap<>();
// map.put("地址", dataEntry.getUrl());
// if(dataEntry.isLive() == 1) {
// map.put("是否删除", true);
// }else if(dataEntry.isLive() == 0) {
// map.put("是否删除", false);
// }else if(dataEntry.isLive() == -1) {
// map.put("是否删除", -1);
// }
// bodyList.add(map);
// }
// poi.exportExcel(path ,"匹配后数据", headList, bodyList);
// }
//
//
//
//
//}
//}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment