Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
source_forward
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhiwei
source_forward
Commits
4a661c59
Commit
4a661c59
authored
Jun 12, 2020
by
chenweiyang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
升级爬虫核心包 升级版本 0.2.6
parent
a94682af
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
17 additions
and
20 deletions
+17
-20
pom.xml
+2
-2
src/main/java/com/zhiwei/source_forward/crawler/ContentCrawler.java
+2
-2
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
+2
-2
src/main/java/com/zhiwei/source_forward/crawler/SourceForwardCrawler.java
+2
-2
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
+5
-8
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawlerNew.java
+1
-1
src/main/java/com/zhiwei/source_forward/run/URLLive.java
+2
-2
src/main/java/com/zhiwei/source_forward/util/ProxyInit.java
+1
-1
No files found.
pom.xml
View file @
4a661c59
...
...
@@ -3,7 +3,7 @@
<modelVersion>
4.0.0
</modelVersion>
<groupId>
com.zhiwei
</groupId>
<artifactId>
source-forward
</artifactId>
<version>
0.2.
5
-SNAPSHOT
</version>
<version>
0.2.
6
-SNAPSHOT
</version>
<name>
source-forward
</name>
<description>
验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)
</description>
...
...
@@ -29,7 +29,7 @@
<dependency>
<groupId>
com.zhiwei.crawler
</groupId>
<artifactId>
crawler-core
</artifactId>
<version>
0.6.
3.1
-SNAPSHOT
</version>
<version>
0.6.
6.3
-SNAPSHOT
</version>
</dependency>
</dependencies>
...
...
src/main/java/com/zhiwei/source_forward/crawler/ContentCrawler.java
View file @
4a661c59
...
...
@@ -7,8 +7,8 @@ import org.apache.logging.log4j.Logger;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.crawler.
core.
proxy.ProxyHolder
;
import
com.zhiwei.crawler.
core.
utils.RequestUtils
;
import
com.zhiwei.source_forward.bean.ContentBean
;
import
com.zhiwei.source_forward.bean.ContentBean.Attribution
;
import
com.zhiwei.source_forward.util.ContentDataCallback
;
...
...
src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
View file @
4a661c59
...
...
@@ -12,8 +12,8 @@ import org.jsoup.nodes.Node;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.crawler.
core.
proxy.ProxyHolder
;
import
com.zhiwei.crawler.
core.
utils.RequestUtils
;
import
com.zhiwei.source_forward.bean.MediaSelfSourceBean
;
import
com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution
;
import
com.zhiwei.source_forward.util.MatchChannel
;
...
...
src/main/java/com/zhiwei/source_forward/crawler/SourceForwardCrawler.java
View file @
4a661c59
...
...
@@ -13,8 +13,8 @@ import org.jsoup.nodes.Node;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.crawler.
core.
proxy.ProxyHolder
;
import
com.zhiwei.crawler.
core.
utils.RequestUtils
;
import
com.zhiwei.source_forward.bean.SourceForwardBean
;
import
com.zhiwei.source_forward.bean.SourceForwardBean.Attribution
;
import
com.zhiwei.source_forward.util.MatchChannel
;
...
...
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
View file @
4a661c59
...
...
@@ -2,9 +2,6 @@ package com.zhiwei.source_forward.crawler;
import
static
java
.
util
.
Objects
.
nonNull
;
import
java.net.InetSocketAddress
;
import
java.net.Proxy
;
import
java.net.Proxy.Type
;
import
java.util.Arrays
;
import
java.util.HashMap
;
import
java.util.List
;
...
...
@@ -21,8 +18,8 @@ import org.jsoup.nodes.Document;
import
com.alibaba.fastjson.JSONObject
;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.proxy.ProxyHolder
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.crawler.
core.
proxy.ProxyHolder
;
import
com.zhiwei.crawler.
core.
utils.RequestUtils
;
import
com.zhiwei.source_forward.bean.UrlLiveBean
;
import
com.zhiwei.source_forward.bean.UrlLiveBean.Attribution
;
import
com.zhiwei.source_forward.util.UrlLiveDataCallback
;
...
...
@@ -75,7 +72,7 @@ public class UrlLiveCrawler {
ProxyHolder
ph
=
null
;
if
(
url
.
contains
(
"toutiao.com"
)){
headers
.
put
(
"referer"
,
url
);
// headers.put("cookie", "csrftoken=6d0e5967684dbb57cea14dc12858d263; WEATHER_CITY=%E5%8C%97%E4%BA%AC; __utmz=24953151.1576744435.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _ga=GA1.2.1570444666.1576744435; ttcid=d5fa0cec2bca426cbbddeb27a8bc72f629; UM_distinctid=170f5ad3a5b43e-04a6930812e3c5-f313f6d-240000-170f5ad3a5c40b; SLARDAR_WEB_ID=770be4fe-dd54-4701-a4ed-396b861a237d; CNZZDATA1259612802=2091325281-1587691681-%7C1589960650; s_v_web_id=verify_kaqh0jch_kZtSHRI9_GfBj_403X_8i5y_VOyWz4ATIx0d; tt_webid=6833273737980659213; tt_webid=6833273737980659213; __utmc=24953151; __utma=24953151.1570444666.1576744435.1591353562.1591579877.8; tt_scid=
KdPOCLtoSVDQTnptuiejH4SkyYa7RodIcBHFpAGwf17X9rUWJJadFYALAeJ5C8xI71e5; __ac_nonce=05ee037380054152ddc38; __ac_signature=6C1-YAAgEBB40vzLiGE95-gsf3AALbYjxEHG0FQERCcxB-9tebz.fovM7gew-AHObLDUegpmF7k8G57XzXokCbi72klNkdvS.ukzrfuuFk3UL836QudGNHE6IJQ47kFRkiT; __tasessionId=nz5ags6bk1591752505915
");
// headers.put("cookie", "csrftoken=6d0e5967684dbb57cea14dc12858d263; WEATHER_CITY=%E5%8C%97%E4%BA%AC; __utmz=24953151.1576744435.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _ga=GA1.2.1570444666.1576744435; ttcid=d5fa0cec2bca426cbbddeb27a8bc72f629; UM_distinctid=170f5ad3a5b43e-04a6930812e3c5-f313f6d-240000-170f5ad3a5c40b; SLARDAR_WEB_ID=770be4fe-dd54-4701-a4ed-396b861a237d; CNZZDATA1259612802=2091325281-1587691681-%7C1589960650; s_v_web_id=verify_kaqh0jch_kZtSHRI9_GfBj_403X_8i5y_VOyWz4ATIx0d; tt_webid=6833273737980659213; tt_webid=6833273737980659213; __utmc=24953151; __utma=24953151.1570444666.1576744435.1591353562.1591579877.8; tt_scid=
a56VD6ALatPbD63MlXw5skpZx9olxW6X.mRiDJBvVfZyQF2lfw8-lNeLPqqPPuCH4c68; __ac_nonce=05ee1cac300c6be6af0fe; __ac_signature=YvewuAAgEBDyCDITpF4SsmL2saAADwawXLDdrzlqO4hucxtXaZyI1l3ZReIsXb1OnF3koe7MdMwhnGPBA-mn5X5ERtMmQrb7RY5NqiBu.g3p0.oY6nNsvIT3NNbIsViZXz3; __tasessionId=48abvzgub1591855812394
");
headers
.
put
(
"accept"
,
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
);
headers
.
put
(
"accept-encoding"
,
"gzip, deflate, br"
);
headers
.
put
(
"accept-language"
,
"zh-CN,zh;q=0.9"
);
...
...
@@ -86,7 +83,7 @@ public class UrlLiveCrawler {
headers
.
put
(
"sec-fetch-user"
,
"?1"
);
headers
.
put
(
"upgrade-insecure-requests"
,
"1"
);
headers
.
put
(
"user-agent"
,
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
);
ph
=
ProxyHolder
.
SOUGOU_OUTER
_PROXY
;
ph
=
ProxyHolder
.
NAT_HEAVY
_PROXY
;
}
else
if
(
url
.
contains
(
"zhihu.com"
))
{
url
=
treatZhihuUrl
(
url
);
ph
=
ProxyHolder
.
NAT_HEAVY_PROXY
;
...
...
@@ -96,7 +93,7 @@ public class UrlLiveCrawler {
if
(
Objects
.
nonNull
(
request
))
{
counter
.
add
();
// , new Proxy(Type.HTTP, new InetSocketAddress("119.3.38.9", 31128))
httpBoot
.
asyncCall
(
request
,
ProxyHolder
.
NAT_HEAVY_PROXY
).
whenComplete
((
rs
,
ex
)
->
{
httpBoot
.
asyncCall
(
request
,
ph
).
whenComplete
((
rs
,
ex
)
->
{
try
{
if
(
Objects
.
isNull
(
ex
))
{
if
(
rs
.
isSuccessful
())
{
...
...
src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawlerNew.java
View file @
4a661c59
...
...
@@ -16,7 +16,7 @@ import com.alibaba.fastjson.JSONObject;
import
com.zhiwei.async.GroupSync
;
import
com.zhiwei.async.TaskBoot
;
import
com.zhiwei.crawler.core.HttpBoot
;
import
com.zhiwei.crawler.utils.RequestUtils
;
import
com.zhiwei.crawler.
core.
utils.RequestUtils
;
import
com.zhiwei.source_forward.bean.UrlLiveBean
;
import
okhttp3.Request
;
...
...
src/main/java/com/zhiwei/source_forward/run/URLLive.java
View file @
4a661c59
...
...
@@ -72,8 +72,8 @@ public class URLLive {
public
static
void
main
(
String
[]
args
)
{
ProxyInit
.
initProxy
();
List
<
String
>
urlList
=
new
ArrayList
<>();
//
urlList.add("http://www.toutiao.com/a1665677841741827");
urlList
.
add
(
"http://www.yidianzixun.com/article/0PYO4Gbh"
);
urlList
.
add
(
"http://www.toutiao.com/a1665677841741827"
);
//
urlList.add("http://www.yidianzixun.com/article/0PYO4Gbh");
List
<
UrlLiveBean
>
u
=
URLLive
.
verificationURLLive
(
urlList
);
for
(
UrlLiveBean
b
:
u
)
{
...
...
src/main/java/com/zhiwei/source_forward/util/ProxyInit.java
View file @
4a661c59
package
com
.
zhiwei
.
source_forward
.
util
;
import
com.zhiwei.crawler.proxy.ProxyFactory
;
import
com.zhiwei.crawler.
core.
proxy.ProxyFactory
;
import
com.zhiwei.proxy.config.SimpleConfig
;
import
com.zhiwei.source_forward.config.ProxyConfig
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment