Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
source_forward
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
source_forward
Commits
72bdcd09
Commit
72bdcd09
authored
Jul 08, 2021
by
chenweiyang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
头条是否删除做特殊处理
parent
e9aa812f
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
84 additions
and
84 deletions
+84
-84
pom.xml
+2
-2
src/main/java/com/zhiwei/source_forward/bean/UrlLiveBean.java
+3
-0
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
+29
-27
src/main/java/com/zhiwei/source_forward/run/MediaSelfSource.java
+1
-1
src/main/java/com/zhiwei/source_forward/run/URLLive.java
+1
-1
src/main/java/com/zhiwei/source_forward/util/MatchSource.java
+3
-1
src/test/java/com/zhiwei/source_forward/sourceforward/test/SourceForwardTest.java
+45
-52
No files found.
pom.xml
View file @
72bdcd09
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
<modelVersion>
4.0.0
</modelVersion>
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiwei
</groupId>
<groupId>
com.zhiwei
</groupId>
<artifactId>
source-forward
</artifactId>
<artifactId>
source-forward
</artifactId>
<version>
0.
2.9
-SNAPSHOT
</version>
<version>
0.
3.0
-SNAPSHOT
</version>
<name>
source-forward
</name>
<name>
source-forward
</name>
<description>
验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)
</description>
<description>
验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)
</description>
...
@@ -30,7 +30,7 @@
...
@@ -30,7 +30,7 @@
<dependency>
<dependency>
<groupId>
com.zhiwei.tools
</groupId>
<groupId>
com.zhiwei.tools
</groupId>
<artifactId>
zhiwei-tools
</artifactId>
<artifactId>
zhiwei-tools
</artifactId>
<version>
0.2.
0
-SNAPSHOT
</version>
<version>
0.2.
4
-SNAPSHOT
</version>
</dependency>
</dependency>
<dependency>
<dependency>
<groupId>
com.zhiwei.crawler
</groupId>
<groupId>
com.zhiwei.crawler
</groupId>
...
...
src/main/java/com/zhiwei/source_forward/bean/UrlLiveBean.java
View file @
72bdcd09
...
@@ -4,6 +4,9 @@ public class UrlLiveBean {
...
@@ -4,6 +4,9 @@ public class UrlLiveBean {
private
String
url
;
private
String
url
;
/**
* 1 = deleted
*/
private
Integer
isLive
;
private
Integer
isLive
;
private
String
title
;
private
String
title
;
...
...
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
View file @
72bdcd09
...
@@ -16,6 +16,7 @@ import org.jsoup.Jsoup;
...
@@ -16,6 +16,7 @@ import org.jsoup.Jsoup;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Document
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONObject
;
import
com.alibaba.fastjson.JSONPath
;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
import
com.zhiwei.crawler.core.proxy.ProxyHolder
;
...
@@ -70,20 +71,8 @@ public class UrlLiveCrawler {
...
@@ -70,20 +71,8 @@ public class UrlLiveCrawler {
url
=
dealUrl
(
url
);
url
=
dealUrl
(
url
);
logger
.
info
(
"当前处理 URL: {}"
,
url
);
logger
.
info
(
"当前处理 URL: {}"
,
url
);
Map
<
String
,
String
>
headers
=
new
HashMap
<>();
Map
<
String
,
String
>
headers
=
new
HashMap
<>();
ProxyHolder
ph
=
null
;
ProxyHolder
ph
=
ProxyHolder
.
NAT_HEAVY_PROXY
;
if
(
url
.
contains
(
"toutiao.com"
)){
if
(
url
.
contains
(
"toutiao.com"
)){
// headers.put("referer", url);
// headers.put("cookie", "csrftoken=6d0e5967684dbb57cea14dc12858d263; WEATHER_CITY=%E5%8C%97%E4%BA%AC; __utmz=24953151.1576744435.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _ga=GA1.2.1570444666.1576744435; ttcid=d5fa0cec2bca426cbbddeb27a8bc72f629; UM_distinctid=170f5ad3a5b43e-04a6930812e3c5-f313f6d-240000-170f5ad3a5c40b; SLARDAR_WEB_ID=770be4fe-dd54-4701-a4ed-396b861a237d; CNZZDATA1259612802=2091325281-1587691681-%7C1589960650; s_v_web_id=verify_kaqh0jch_kZtSHRI9_GfBj_403X_8i5y_VOyWz4ATIx0d; __utmc=24953151; __utma=24953151.1570444666.1576744435.1591353562.1591579877.8; tt_webid=6837283963338622477; tt_webid=6837283963338622477; __tasessionId=dsei2aty41591951911851; tt_scid=M--AJ-FYwZ0qcYTzQCLyMeS5MlykLS6ktMkvqKKJmq-ghRxX4waEBhJ3YbheuNmi2b8a");
// headers.put("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
// headers.put("accept-encoding", "gzip, deflate, br");
// headers.put("accept-language", "zh-CN,zh;q=0.9");
// headers.put("cache-control", "no-cache");
// headers.put("sec-fetch-dest", "document");
// headers.put("sec-fetch-mode", "navigate");
// headers.put("sec-fetch-site", "same-origin");
// headers.put("sec-fetch-user", "?1");
// headers.put("upgrade-insecure-requests", "1");
// headers.put("user-agent", "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Mobile Safari/537.36");
ph
=
ProxyHolder
.
NAT_HEAVY_PROXY
;
ph
=
ProxyHolder
.
NAT_HEAVY_PROXY
;
}
else
if
(
url
.
contains
(
"zhihu.com"
))
{
}
else
if
(
url
.
contains
(
"zhihu.com"
))
{
url
=
treatZhihuUrl
(
url
);
url
=
treatZhihuUrl
(
url
);
...
@@ -138,20 +127,8 @@ public class UrlLiveCrawler {
...
@@ -138,20 +127,8 @@ public class UrlLiveCrawler {
private
String
dealUrl
(
String
url
)
{
private
String
dealUrl
(
String
url
)
{
try
{
try
{
if
(
url
.
contains
(
"www.toutiao.com"
))
{
if
(
url
.
contains
(
"toutiao.com"
))
{
if
(
url
.
contains
(
"www.toutiao.com"
))
{
return
dealToutiaoUrl
(
url
);
}
else
{
url
=
url
.
replace
(
"toutiao.com"
,
"www.toutiao.com"
);
}
if
(
url
.
contains
(
"https"
))
{
}
else
{
url
=
url
.
replace
(
"http"
,
"https"
);
}
if
(
url
.
contains
(
"group"
))
{
url
=
"https://www.toutiao.com/a"
+
url
.
split
(
"/"
)[
4
]
+
"/"
;
}
}
else
if
(
url
.
contains
(
"mp.weixin.qq.com"
))
{
}
else
if
(
url
.
contains
(
"mp.weixin.qq.com"
))
{
if
(
url
.
contains
(
"https"
))
{
if
(
url
.
contains
(
"https"
))
{
...
@@ -170,6 +147,22 @@ public class UrlLiveCrawler {
...
@@ -170,6 +147,22 @@ public class UrlLiveCrawler {
}
}
}
}
private
static
Pattern
pa
=
Pattern
.
compile
(
"\\d+"
);
/**
 * Rewrites a Toutiao URL into the {@code m.toutiao.com} "info" URL form.
 *
 * <p>Everything from the first {@code ?} onward is ignored, and the first run of
 * digits found in the remaining text is used as the id. If no digits are found,
 * or any exception is raised while extracting them, the input is returned as is
 * (exceptions are logged, not propagated).
 *
 * @param url the URL to rewrite
 * @return {@code https://m.toutiao.com/i<id>/info/?_signature=&i=<id>} when an id
 *         is found; otherwise the unchanged {@code url}
 */
private String dealToutiaoUrl(String url) {
    try {
        // Search for the id only in the part that precedes the query string.
        String withoutQuery = url.split("\\?")[0];
        Matcher idMatcher = pa.matcher(withoutQuery);
        if (!idMatcher.find()) {
            return url;
        }
        // group() is the whole match, i.e. the first run of digits.
        String id = idMatcher.group();
        String prefix = "https://m.toutiao.com/i";
        String middle = "/info/?_signature=&i=";
        return prefix + id + middle + id;
    } catch (Exception e) {
        logger.error("数据获取id出错", e);
        return url;
    }
}
/**
/**
*
*
* @Description 判断是否删除
* @Description 判断是否删除
...
@@ -229,6 +222,10 @@ public class UrlLiveCrawler {
...
@@ -229,6 +222,10 @@ public class UrlLiveCrawler {
if
(
ma5
.
find
())
{
if
(
ma5
.
find
())
{
title
=
ma5
.
group
(
1
).
replaceAll
(
" "
,
" "
).
trim
();
title
=
ma5
.
group
(
1
).
replaceAll
(
" "
,
" "
).
trim
();
}
}
if
(
result
.
contains
(
"此帐号已被屏蔽, 内容无法查看"
)
||
result
.
contains
(
"该公众号已迁移"
)
||
result
.
contains
(
"此帐号已自主注销,内容无法查看"
)
||
result
.
contains
(
"此帐号处于帐号迁移流程中"
)
||
result
.
contains
(
"该内容已被发布者删除"
))
{
title
=
"网页已删除"
;
}
}
else
if
(
url
.
contains
(
"kuaibao"
)){
}
else
if
(
url
.
contains
(
"kuaibao"
)){
title
=
doc
.
select
(
"p.title"
).
text
().
replaceAll
(
" "
,
""
);
title
=
doc
.
select
(
"p.title"
).
text
().
replaceAll
(
" "
,
""
);
}
else
if
(
url
.
contains
(
"chinadaily.com.cn"
)){
}
else
if
(
url
.
contains
(
"chinadaily.com.cn"
)){
...
@@ -283,6 +280,11 @@ public class UrlLiveCrawler {
...
@@ -283,6 +280,11 @@ public class UrlLiveCrawler {
title
=
doc
.
select
(
"h1"
).
text
().
replaceAll
(
" "
,
""
);
title
=
doc
.
select
(
"h1"
).
text
().
replaceAll
(
" "
,
""
);
}
}
if
(
result
.
contains
(
"\"success\":false"
)
&&
attr
.
getAttr
().
toString
().
contains
(
"toutiao.com"
))
{
title
=
"网页已删除"
;
}
else
{
title
=
String
.
valueOf
(
JSONPath
.
read
(
result
,
"$..title"
));
}
//若title 为拿到 用 此方法 无法获取标题不进行程序迷惑性判断
//若title 为拿到 用 此方法 无法获取标题不进行程序迷惑性判断
// if(Objects.isNull(title) || title.length() < 1 || result.length() < 200) {
// if(Objects.isNull(title) || title.length() < 1 || result.length() < 200) {
// title = "网页已删除";
// title = "网页已删除";
...
...
src/main/java/com/zhiwei/source_forward/run/MediaSelfSource.java
View file @
72bdcd09
...
@@ -32,7 +32,7 @@ public class MediaSelfSource {
...
@@ -32,7 +32,7 @@ public class MediaSelfSource {
ProxyInit
.
initProxy
();
ProxyInit
.
initProxy
();
List
<
String
>
urlList
=
new
ArrayList
<>();
List
<
String
>
urlList
=
new
ArrayList
<>();
urlList
.
add
(
"http
s://k.sina.com.cn/article_1060093724_3f2fbf1c00100vsqd.html?from=mood
"
);
urlList
.
add
(
"http
://baijiahao.baidu.com/s?id=1665770738503315058&wfr=spider&for=pc
"
);
List
<
MediaSelfSourceBean
>
u
=
MediaSelfSource
.
getMediaSelfSource
(
urlList
);
List
<
MediaSelfSourceBean
>
u
=
MediaSelfSource
.
getMediaSelfSource
(
urlList
);
for
(
MediaSelfSourceBean
b
:
u
)
{
for
(
MediaSelfSourceBean
b
:
u
)
{
...
...
src/main/java/com/zhiwei/source_forward/run/URLLive.java
View file @
72bdcd09
...
@@ -72,7 +72,7 @@ public class URLLive {
...
@@ -72,7 +72,7 @@ public class URLLive {
public
static
void
main
(
String
[]
args
)
{
public
static
void
main
(
String
[]
args
)
{
ProxyInit
.
initProxy
();
ProxyInit
.
initProxy
();
List
<
String
>
urlList
=
new
ArrayList
<>();
List
<
String
>
urlList
=
new
ArrayList
<>();
urlList
.
add
(
"http
://mp.weixin.qq.com/s?__biz=Mzg3MDMzNTc5Mg==&mid=2247485220&idx=1&sn=9118543ca120489cccbdc102be58f881
"
);
urlList
.
add
(
"http
s://www.toutiao.com/a6982350814614405670/
"
);
// urlList.add("http://www.yidianzixun.com/article/0PYO4Gbh");
// urlList.add("http://www.yidianzixun.com/article/0PYO4Gbh");
List
<
UrlLiveBean
>
u
=
URLLive
.
verificationURLLive
(
urlList
);
List
<
UrlLiveBean
>
u
=
URLLive
.
verificationURLLive
(
urlList
);
...
...
src/main/java/com/zhiwei/source_forward/util/MatchSource.java
View file @
72bdcd09
...
@@ -508,7 +508,9 @@ public class MatchSource {
...
@@ -508,7 +508,9 @@ public class MatchSource {
source
=
document
.
select
(
"p.author-name:nth-child(1)"
).
text
();
source
=
document
.
select
(
"p.author-name:nth-child(1)"
).
text
();
}
else
if
(
StringUtils
.
isNotBlank
(
document
.
select
(
"a.authorName"
).
text
())){
}
else
if
(
StringUtils
.
isNotBlank
(
document
.
select
(
"a.authorName"
).
text
())){
source
=
document
.
select
(
"a.authorName"
).
text
();
source
=
document
.
select
(
"a.authorName"
).
text
();
}
}
else
if
(
StringUtils
.
isNotBlank
(
document
.
select
(
"div.author-name > a"
).
text
())){
source
=
document
.
select
(
"div.author-name > a"
).
text
();
}
if
(
StringUtils
.
isNotBlank
(
source
)){
if
(
StringUtils
.
isNotBlank
(
source
)){
source
=
"百度百家-"
+
source
;
source
=
"百度百家-"
+
source
;
}
}
...
...
src/test/java/com/zhiwei/source_forward/sourceforward/test/SourceForwardTest.java
View file @
72bdcd09
//package com.zhiwei.source_forward.sourceforward.test;
//package com.zhiwei.source_forward.sourceforward.test;
//
//
//import java.util.ArrayList;
//import java.util.ArrayList;
//import java.util.List;
//import java.util.List;
//import java.util.Map;
//import java.util.Map;
//import java.util.Map.Entry;
//import java.util.Map.Entry;
//
//
//import org.junit.Test;
//import org.junit.Test;
//
//
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.source_forward.run.SourceForward;
//import com.zhiwei.source_forward.run.SourceForward;
//import com.zhiwei.source_forward.util.ReadMediaData;
//import com.zhiwei.source_forward.util.ReadMediaData;
//
//
///**
///**
// * @ClassName: SourceForwardTest
// * @ClassName: SourceForwardTest
// * @Description: 来源验证
// * @Description: 来源验证
// * @author hero
// * @author hero
// * @date 2017年12月6日 上午9:55:13
// * @date 2017年12月6日 上午9:55:13
// */
// */
//public class SourceForwardTest {
//public class SourceForwardTest {
//
//
// @Test
// @Test
// public void sourceForwardTest(){
// public void sourceForwardTest(){
// String path = "E://稿件汇总网媒数据//JD稿件转载情况-1206.xlsx";
// String path = "E://稿件汇总网媒数据//JD稿件转载情况-1206.xlsx";
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// Map<String,Object> data = poi.importExcel(path, 0);
// Map<String,Object> data = poi.importExcel(path, 0);
// @SuppressWarnings("unchecked")
// @SuppressWarnings("unchecked")
// List<String> headList = (List<String>)data.get("head");
// List<String> headList = (List<String>)data.get("head");
// headList.add("频道");
// headList.add("频道");
// headList.add("原来源");
// headList.add("原来源");
// headList.add("是否转发");
// headList.add("是否转发");
// @SuppressWarnings("unchecked")
// @SuppressWarnings("unchecked")
// List<Map<String,Object>> dataList = (List<Map<String,Object>>)data.get("body");
// List<Map<String,Object>> dataList = (List<Map<String,Object>>)data.get("body");
//
//
// Map<String,Map<String,Object>> dataMap = ReadMediaData.getUrl(dataList);
// Map<String,Map<String,Object>> dataMap = ReadMediaData.getUrl(dataList);
// dataMap = SourceForward.getSourceForward(dataMap);
// dataMap = SourceForward.getSourceForward(dataMap);
//
//
// List<Map<String,Object>> bodyList = new ArrayList<>();
// List<Map<String,Object>> bodyList = new ArrayList<>();
// for(Entry<String,Map<String,Object>> dataEntry : dataMap.entrySet()){
// for(Entry<String,Map<String,Object>> dataEntry : dataMap.entrySet()){
// bodyList.add(dataEntry.getValue());
// bodyList.add(dataEntry.getValue());
// }
// }
// poi.exportExcel(path ,"匹配后数据", headList, bodyList);
// poi.exportExcel(path ,"匹配后数据", headList, bodyList);
// }
// }
//
//
//
//}
//
//
//
//
//
//
//}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment