代理升级版本

2c9d4fa2 · chenweiyang · 4860f41e · 2c9d4fa2 · 2c9d4fa2 · 2c9d4fa2
Commit 2c9d4fa2 authored Nov 02, 2022 by chenweiyang
10 changed files
--- a/pom.xml
+++ b/pom.xml
@@ -3,13 +3,17 @@
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.zhiwei</groupId>
    <artifactId>source-forward</artifactId>
-    <version>0.3.0-SNAPSHOT</version>
+    <version>0.3.1-SNAPSHOT</version>
    <name>source-forward</name>
    <description>验证网媒的转发关系及链接的有效性（转发验证微信及自媒体匹配率不高）</description>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
+        <http-boot.version>0.1.0.8-SNAPSHOT</http-boot.version>
+        <task-boot.version>1.1.2-SNAPSHOT</task-boot.version>
+        <boilerpipe.version>0.0.1-SNAPSHOT</boilerpipe.version>
+        <conomys-consumer.version>0.0.3-SNAPSHOT</conomys-consumer.version>
    </properties>

    <developers>
@@ -30,12 +34,30 @@
        <dependency>
            <groupId>com.zhiwei.tools</groupId>
            <artifactId>zhiwei-tools</artifactId>
-            <version>0.2.4-SNAPSHOT</version>
+            <version>0.4.5-SNAPSHOT</version>
        </dependency>
        <dependency>
-        	<groupId>com.zhiwei.crawler</groupId>
-        	<artifactId>crawler-core</artifactId>
-        	<version>0.6.6.8-SNAPSHOT</version>
+            <groupId>com.kohlschutter.boilerpipe</groupId>
+            <artifactId>boilerpipe-extractor</artifactId>
+            <version>${boilerpipe.version}</version>
+        </dependency>
+        
+        <dependency>
+            <groupId>com.zhiwei.http</groupId>
+            <artifactId>http-boot</artifactId>
+            <version>${http-boot.version}</version>
+        </dependency>
+
+        <dependency>
+            <groupId>com.zhiwei.async</groupId>
+            <artifactId>task-boot</artifactId>
+            <version>${task-boot.version}</version>
+        </dependency>
+
+        <dependency>
+            <groupId>com.zhiwei.network</groupId>
+            <artifactId>cynomys-consumer</artifactId>
+            <version>${conomys-consumer.version}</version>
        </dependency>
    </dependencies>


--- a/src/main/java/com/zhiwei/source_forward/crawler/ContentCrawler.java
+++ b/src/main/java/com/zhiwei/source_forward/crawler/ContentCrawler.java
 package com.zhiwei.source_forward.crawler;

 import java.util.Objects;
+import java.util.concurrent.Semaphore;

-import com.zhiwei.tools.tools.ZhiWeiTools;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;

-import com.zhiwei.async.GroupSync;
-import com.zhiwei.crawler.core.HttpBoot;
-import com.zhiwei.crawler.core.proxy.ProxyHolder;
-import com.zhiwei.crawler.core.utils.RequestUtils;
+import com.zhiwei.http.boot.HttpBoot;
+import com.zhiwei.http.proxy.ProxyServerSupplier;
+import com.zhiwei.http.util.RequestUtils;
 import com.zhiwei.source_forward.bean.ContentBean;
 import com.zhiwei.source_forward.bean.ContentBean.Attribution;
 import com.zhiwei.source_forward.util.ContentDataCallback;
 import com.zhiwei.source_forward.util.MatchContent;
+import com.zhiwei.task.sync.GroupSync;
+import com.zhiwei.tools.tools.ZhiWeiTools;

 import okhttp3.Request;

 public class ContentCrawler {

    private static Logger logger = LogManager.getLogger(ContentCrawler.class);
-    private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
-    
+    private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(2).build();
+    private static Semaphore semaphore = new Semaphore(5);
+
    /**
     * 
     * @Description 链接传入 并 返回采集完信号
@@ -51,9 +53,12 @@ public class ContentCrawler {
                ZhiWeiTools.sleep(100);
                if (url != null) {
                    try {
+                        semaphore.acquire();
                        search(counter, url, Attribution.of(url), callback);
                    } catch (Exception e) {
                        logger.error("搜索创建出错", e);
+                    } finally {
+                        semaphore.release();
                    }
                }
            }
@@ -75,15 +80,15 @@ public class ContentCrawler {
        Request request = RequestUtils.wrapGet(url);
        counter.add();
        
-        httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY, true).whenComplete((rs,ex) -> {
+        httpBoot.asyncCall(request, ProxyServerSupplier.NAT_HEAVY_PROXY, true).whenComplete((rs,ex) -> {
            try {
                if (Objects.isNull(ex)) {
-                    parseHtml(rs.body().string(), attr, callback);
+                    parseHtml(rs.bodyString(), attr, callback);
                } else {
                    logger.info("{} 搜索结果访问失败: {}", request.url().url(),ex);
                }
            } catch (Exception e) {
-                logger.info("搜索结果访问失败: {}", ex);
+                logger.info("搜索结果访问失败: ", ex);
            } finally {
                counter.done();
            }

--- a/src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
+++ b/src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
@@ -4,22 +4,23 @@ import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Objects;
+import java.util.concurrent.Semaphore;

-import com.zhiwei.tools.tools.ZhiWeiTools;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Node;

-import com.zhiwei.async.GroupSync;
-import com.zhiwei.crawler.core.HttpBoot;
-import com.zhiwei.crawler.core.proxy.ProxyHolder;
-import com.zhiwei.crawler.core.utils.RequestUtils;
+import com.zhiwei.http.boot.HttpBoot;
+import com.zhiwei.http.proxy.ProxyServerSupplier;
+import com.zhiwei.http.util.RequestUtils;
 import com.zhiwei.source_forward.bean.MediaSelfSourceBean;
 import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution;
 import com.zhiwei.source_forward.util.MatchChannel;
 import com.zhiwei.source_forward.util.MatchSource;
 import com.zhiwei.source_forward.util.MediaSelfSourceDataCallBack;
+import com.zhiwei.task.sync.GroupSync;
+import com.zhiwei.tools.tools.ZhiWeiTools;

 import okhttp3.Request;

@@ -34,8 +35,9 @@ import okhttp3.Request;
 public class MediaSelfSourceCrawler {
    
    private static final Logger logger = LogManager.getLogger(MediaSelfSourceCrawler.class);
-    private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
-    
+    private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(2).build();
+    private static Semaphore semaphore = new Semaphore(5);
+
    /**
     * 
     * @Description 链接传入 并 返回采集完信号
@@ -69,9 +71,12 @@ public class MediaSelfSourceCrawler {
                counter.add();
                if (url != null) {
                    try {
+                        semaphore.acquire();
                        search(counter, url, Attribution.of(url), callback);
                    } catch (Exception e) {
                        logger.error("搜索创建出错", e);
+                    } finally {
+                        semaphore.release();
                    }
                }
                counter.done();
@@ -91,7 +96,7 @@ public class MediaSelfSourceCrawler {
    private GroupSync search(GroupSync counter, String url,Attribution attr, MediaSelfSourceDataCallBack callback) {
        logger.info("当前处理 URL: {}", attr.get());
        Map<String,Object> map = new HashMap<>();
-        ProxyHolder ph = ProxyHolder.NAT_HEAVY_PROXY;
+        ProxyServerSupplier ph = ProxyServerSupplier.NAT_HEAVY_PROXY;
        if(url.contains("toutiao.com")) {
            map.put("referer", url);
        }
@@ -104,7 +109,7 @@ public class MediaSelfSourceCrawler {
                try {
                    if (Objects.isNull(ex)) {
                        try {
-                            parseHtml(rs.body().string(), attr, callback, rs.request().url().uri().toString());
+                            parseHtml(rs.bodyString(), attr, callback, rs.bootRequest().url().uri().toString());
                        } catch (Exception e) {
                            logger.error("解析出错", e);
                        }

--- a/src/main/java/com/zhiwei/source_forward/crawler/SourceForwardCrawler.java
+++ b/src/main/java/com/zhiwei/source_forward/crawler/SourceForwardCrawler.java
@@ -4,6 +4,7 @@ import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Objects;
+import java.util.concurrent.Semaphore;

 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
@@ -11,16 +12,16 @@ import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Node;

-import com.zhiwei.async.GroupSync;
-import com.zhiwei.crawler.core.HttpBoot;
-import com.zhiwei.crawler.core.proxy.ProxyHolder;
-import com.zhiwei.crawler.core.utils.RequestUtils;
+import com.zhiwei.http.boot.HttpBoot;
+import com.zhiwei.http.proxy.ProxyServerSupplier;
+import com.zhiwei.http.util.RequestUtils;
 import com.zhiwei.source_forward.bean.SourceForwardBean;
 import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution;
 import com.zhiwei.source_forward.util.MatchChannel;
 import com.zhiwei.source_forward.util.MatchSource;
 import com.zhiwei.source_forward.util.SourceData;
 import com.zhiwei.source_forward.util.SourceForwardDataCallBack;
+import com.zhiwei.task.sync.GroupSync;
 import com.zhiwei.tools.tools.ZhiWeiTools;

 import okhttp3.Request;
@@ -29,9 +30,10 @@ public class SourceForwardCrawler {
    
    private static final Logger logger = LogManager.getLogger(SourceForwardCrawler.class);
    
-    private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
+    private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(2).build();
    private static List<String> sourceList = SourceData.getSourceList();
-    
+    private static Semaphore semaphore = new Semaphore(5);
+
    public GroupSync submitTask(SourceForwardDataCallBack callback,String... urls) {
        try {
            GroupSync counter = new GroupSync();
@@ -50,10 +52,13 @@ public class SourceForwardCrawler {
                ZhiWeiTools.sleep(100);
                if (url != null) {
                    try {
+                        semaphore.acquire();
                        search(counter, url, Attribution.of(url), callback);
                    } catch (Exception e) {
                        logger.error("搜索创建出错", e);
-                    } 
+                    } finally {
+                        semaphore.release();
+                    }
                }
                counter.done();
            }
@@ -77,10 +82,10 @@ public class SourceForwardCrawler {
        }
        Request request = RequestUtils.wrapGet(url, headers);
        counter.add();
-        httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY, true).whenComplete((rs,ex) -> {
+        httpBoot.asyncCall(request, ProxyServerSupplier.NAT_HEAVY_PROXY, true).whenComplete((rs,ex) -> {
            try {
                if (Objects.isNull(ex)) {
-                    parseHtml(rs.body().string(), attr, callback);
+                    parseHtml(rs.bodyString(), attr, callback);
                } else {
                    logger.info("{} 搜索结果访问失败: {}", request.url().url(), ex);
                }

--- a/src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
+++ b/src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
 package com.zhiwei.source_forward.crawler;

+import static java.util.Objects.nonNull;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.concurrent.Semaphore;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+
 import com.alibaba.fastjson.JSONObject;
 import com.alibaba.fastjson.JSONPath;
-import com.zhiwei.async.GroupSync;
-import com.zhiwei.crawler.core.HttpBoot;
-import com.zhiwei.crawler.core.proxy.ProxyHolder;
-import com.zhiwei.crawler.core.utils.RequestUtils;
+import com.zhiwei.http.boot.HttpBoot;
+import com.zhiwei.http.proxy.ProxyServerSupplier;
+import com.zhiwei.http.util.RequestUtils;
 import com.zhiwei.source_forward.bean.UrlLiveBean;
 import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution;
 import com.zhiwei.source_forward.util.UrlLiveDataCallback;
+import com.zhiwei.task.sync.GroupSync;
 import com.zhiwei.tools.tools.ZhiWeiTools;
-import okhttp3.Request;
-import org.apache.logging.log4j.LogManager;
-import org.apache.logging.log4j.Logger;
-import org.jsoup.Jsoup;
-import org.jsoup.nodes.Document;

-import java.util.*;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import static java.util.Objects.nonNull;
+import okhttp3.Request;

 /**
 * @author byte-zbs
@@ -32,8 +39,9 @@ import static java.util.Objects.nonNull;
 public class UrlLiveCrawler {

    private static final Logger logger = LogManager.getLogger(UrlLiveCrawler.class);
-    private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).throwException(true).build();
-
+    private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(2).build();
+    private static Semaphore semaphore = new Semaphore(5);
+    
    public GroupSync submitTask(UrlLiveDataCallback callback, String... urls) {
        GroupSync counter = new GroupSync();
        start(counter, callback, urls);
@@ -43,17 +51,20 @@ public class UrlLiveCrawler {
    private void start(GroupSync counter, UrlLiveDataCallback callback, String... urls) {
        if (nonNull(urls) && urls.length > 0) {
            for (String url : urls) {
-                counter.add();
-                ZhiWeiTools.sleep(100);
-                if (nonNull(url)) {
-                    try {
+                try {
+                    counter.add();
+                    semaphore.acquire();
+                    ZhiWeiTools.sleep(200);
+                    if (nonNull(url)) {
 //                        ZhiWeiTools.sleep(3000);
                        search(counter, url, Attribution.of(url, 1), callback);
-                    } catch (Exception e) {
-                        logger.error("搜索创建出错:", e);
                    }
+                } catch (Exception e) {
+                    logger.error("搜索创建出错:", e);
+                } finally {
+                    counter.done();
+                    semaphore.release();
                }
-                counter.done();
            }
        }
    }
@@ -63,12 +74,12 @@ public class UrlLiveCrawler {
        url = dealUrl(url);
        logger.info("当前处理 URL: {}", url);
        Map<String, String> headers = new HashMap<>();
-        ProxyHolder ph = ProxyHolder.NAT_HEAVY_PROXY;
+        ProxyServerSupplier ph = ProxyServerSupplier.NAT_HEAVY_PROXY;
        if (url.contains("toutiao.com")) {
-            ph = ProxyHolder.NAT_HEAVY_PROXY;
+            ph = ProxyServerSupplier.NAT_HEAVY_PROXY;
        } else if (url.contains("zhihu.com")) {
            url = treatZhihuUrl(url);
-            ph = ProxyHolder.NAT_HEAVY_PROXY;
+            ph = ProxyServerSupplier.NAT_HEAVY_PROXY;
        }
        try {
            Request request = RequestUtils.wrapGet(url, headers);
@@ -80,7 +91,7 @@ public class UrlLiveCrawler {
                        System.out.println(rs.code());
                        if (Objects.isNull(ex)) {
                            if (rs.isSuccessful()) {
-                                parseHtml(rs.body().string(), attr, callback);
+                                parseHtml(rs.bodyString(), attr, callback);
                            } else if (rs.code() == 404) {
                                callBack(callback, attr, 1, String.valueOf(rs.code()));
                            } else {
@@ -91,7 +102,7 @@ public class UrlLiveCrawler {
                            callBack(callback, attr, -1, "程序无法判断");
                        }
                    } catch (Exception e) {
-                        logger.error(" 数据是否删除 采集出错 {} ", e);
+                        logger.error(" 数据是否删除 采集出错 ", e);
                    } finally {
                        counter.done();
                    }
@@ -99,7 +110,7 @@ public class UrlLiveCrawler {
                return counter;
            }
        } catch (Exception e2) {
-            logger.error("数据出错 {}", e2);
+            logger.error("数据出错 ", e2);
        }
        return counter;
    }

--- a/src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawlerNew.java
+++ b/src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawlerNew.java
-package com.zhiwei.source_forward.crawler;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Objects;
-
-import org.apache.logging.log4j.LogManager;
-import org.apache.logging.log4j.Logger;
-import org.jsoup.Jsoup;
-import org.jsoup.nodes.Document;
-
-import com.alibaba.fastjson.JSONObject;
-import com.zhiwei.async.GroupSync;
-import com.zhiwei.async.TaskBoot;
-import com.zhiwei.crawler.core.HttpBoot;
-import com.zhiwei.crawler.core.utils.RequestUtils;
-import com.zhiwei.source_forward.bean.UrlLiveBean;
-
-import okhttp3.Request;
-import okhttp3.Response;
-
-public class UrlLiveCrawlerNew {
-    
-    private static final Logger logger = LogManager.getLogger(UrlLiveCrawlerNew.class);
-    private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).throwException(false).build();
-    
-    public List<UrlLiveBean> judgeIsDelete(List<String> urlList) {
-        GroupSync counter = new GroupSync();
-        List<UrlLiveBean> ulbList = new ArrayList<>();
-        urlList.forEach(url -> {
-            try {
-                counter.add();
-                TaskBoot.blockingAsync(() -> {
-                    try {
-                        counter.add();
-                        UrlLiveBean ulb = dealUrlLive(url);
-                        if(Objects.nonNull(ulb)) {
-                            ulbList.add(ulb);
-                        }
-                    } catch (Exception e) {
-                        logger.error("链接是否删除新", e);
-                    } finally {
-                        counter.done();
-                    }
-                });
-            } catch (Exception e2) {
-                logger.error("数据出错 {}" ,e2);
-            } finally {
-                counter.done();
-            }
-        });
-        try {
-            counter.await();
-        } catch (InterruptedException e) {
-            e.printStackTrace();
-        }
-        return ulbList;
-    }
-    
-    private UrlLiveBean dealUrlLive(String url) {
-        try {
-            url = dealUrl(url);
-            logger.info("当前处理 URL: {}", url);
-            Map<String,String> headers = new HashMap<>();
-//            Map<String,String> headers = HeaderTool.getCommonHead();
-            if(url.contains("www.toutiao.com")){
-                headers.put("referer", url);
-            }else if(url.contains("zhihu.com")) {
-                url = treatZhihuUrl(url);
-            }
-            Request request = RequestUtils.wrapGet(url, headers);
-            int code = 404;
-            for(int i = 0; i < 2; i++) {
-                try (Response response = httpBoot.syncCall(request)){
-                    if(response.isSuccessful()) {
-                        return matchDel(response.body().string(), url);
-                    }else {
-                        code = response.code();
-                    }
-                } catch (Exception e) {
-                    logger.error("解析", e);
-                }
-            }
-            if(code == 403){
-                return callBack(url, -1, String.valueOf(code));
-            }else {
-                return callBack(url, 1, String.valueOf(code));
-            }
-        } catch (Exception e) {
-            e.printStackTrace();
-            return null;
-        }
-    }
-    
-    private UrlLiveBean callBack(String url,int i,String title) {
-        if(i == 1) {
-            return new UrlLiveBean(url, true, title);
-        }else {
-            return new UrlLiveBean(url, i, title);
-        }
-    }
-    
-    private String dealUrl(String url) {
-        try {
-            if(url.contains("toutiao.com")) {
-                if(url.contains("www.toutiao.com")) {
-                    
-                }else {
-                    url = url.replace("toutiao.com", "www.toutiao.com");
-                }
-                if(url.contains("https")) {
-                    
-                }else {
-                    url = url.replace("http", "https");
-                }
-                if(url.contains("group")) {
-                    url = "https://www.toutiao.com/a" + url.split("/")[4] + "/";
-                }
-            }else if(url.contains("mp.weixin.qq.com")) {
-                if(url.contains("https")) {
-                
-                }else {
-                    url = url.replace("http", "https");
-                }
-            }else if(url.contains("a.mp.uc.cn") && url.contains("wm_aid")) {
-                url = "http://ff.dayu.com/contents/origin/"+url.split("wm_aid=")[1].split("!!wm_id")[0]+"?biz_id=1002&_fetch_author=1&_fetch_incrs=1";
-            }
-            return url;
-        } catch (Exception e) {
-            return url;
-        }
-    }
-    
-    /***
-     * @Title: matchDel 
-     * @author hero 
-     * @Description: 验证链接是否有效
-     * @param @param page
-     * @param @return 设定文件 
-     * @return boolean 返回类型
-     */
-    public UrlLiveBean matchDel(String result,String url){
-        try {
-            Document doc = Jsoup.parse(result);
-            String title = null;
-            if(url.contains("mp.weixin.qq.com") || url.contains("post.mp.qq.com") || url.contains("weixin.sogou.com")){
-                title = doc.select("h2.rich_media_title").text().replaceAll(" ", "");
-                if(Objects.isNull(title) || title.isEmpty()) {
-                    title = doc.select("p.title").text();
-                }
-                if(Objects.isNull(title) || title.isEmpty()) {
-                    title = doc.select("h3.msg-title").text();
-                }
-                if(Objects.isNull(title) || title.isEmpty()) {
-                    title = doc.select("div.global_error_msg.warn").text();
-                }
-                if(Objects.isNull(title) || title.isEmpty()) {
-                    title = doc.select("p.tips").text();
-                }
-                if(Objects.isNull(title) || title.isEmpty()) {
-                    title = doc.select("h2").text();
-                }
-            }else if(url.contains("kuaibao")){
-                title = doc.select("p.title").text().replaceAll(" ", "");
-            }else if(url.contains("chinadaily.com.cn")){
-                title = doc.select("p.style1").text().replaceAll(" ", "");
-            }else if(url.contains("baidu.com") || url.contains("hao123.com")) {
-                title = doc.select("p#contaniner").text();
-            }else if(url.contains("kanfanews.com")) {
-                title = doc.select("p#tit").text();
-            }else if(url.contains("ifeng.com") && result.contains("url=http://www.ifeng.com/")) {
-                title = "网页已删除";
-            }else if(url.contains("a.mp.uc.cn")) {
-                try {
-                    JSONObject json = JSONObject.parseObject(result);
-                    title = json.getJSONObject("data").getString("title");
-                    if(Objects.isNull(title) || title.length() < 1) {
-                        title = "网页已删除";
-                    }
-                } catch (Exception e) {
-                    logger.error(" uc 数据 json 转换失败", e);
-                }
-            }else if(url.contains("huanqiu.com") && result.contains("www.huanqiu.com/404.html")) {
-                title = "网页已删除";
-            }else if(url.contains("zhihu.com")) {
-                JSONObject resultJson = JSONObject.parseObject(result);
-                
-                title = resultJson.getString("title")!=null?resultJson.getString("title"):resultJson.getString("message");
-            }
-            
-            //若title 为拿到 用 此方法
-            if(Objects.isNull(title) || title.length() < 1) {
-                title = doc.select("div.adiv > p > span").text().replaceAll(" ", "");
-            }
-            
-            //若title 为拿到 用 此方法
-            if(Objects.isNull(title) || title.length() < 1) {
-                title = doc.select("title").text().replaceAll(" ", "");
-            }
-            
-            //若title 为拿到 用 此方法
-            if(Objects.isNull(title) || title.length() < 1) {
-                title = doc.select("h1").text().replaceAll(" ", "");
-            }
-            
-           //若title 为拿到 用 此方法
-            if(Objects.isNull(title) || title.length() < 1 || result.length() < 200) {
-                title = "网页已删除";
-            }
-        
-            if(Objects.nonNull(title) && title.length() > 1){
-                return new UrlLiveBean(url, isDelete(title),title);
-            } else {
-                return null;
-            }
-        } catch (Exception e) {
-            return null;
-        }
-    }
-    
-    /**
-     * 
-     * @Description 标题判断
-     * @param title
-     * @return
-     */
-    private boolean isDelete(String title) {
-        List<String> eList = Arrays.asList("系统出错","该内容已被发布者删除","网页已删除"
-                ,"此帐号已自主注销，内容无法查看","页面提示","正在维护中"
-                ,"此文章被第三方评估为不实信息","财经头条","知识100题","502BadGateway"
-                ,"提示信息","跳转页","跳转中...","此帐号在冻结期,内容无法查看","东北新闻网"
-                ,"百度一下，你就知道","帐号已迁移","手机百度","内容被删除","亚博国际|首页"
-                ,"中国软件网","云广网","新浪首页","文章暂时找不到了","-法易网"
-                ,"【一点资讯】www.yidianzixun.com","错误页面","网站暂停通知","【快资讯】你的专属资讯平台"
-                ,"百度新闻——全球最大的中文新闻平台","以上文章由以下机构判定为不实信息","该公众号已迁移"
-                ,"财经网-CAIJING.COM.CN","蚂蚁资讯","参数错误","时尚头条_YOKA时尚网","该文章已经被删除"
-                ,"网易","链接已过期","找不到页面","今晚网","该文章已被删除", "该回答已被删除-知乎", "资源不存在");
-        
-        List<String> cList = Arrays.asList("提示信息-","此内容因违规无法查看","微信公众号不存在"
-                ,"此内容被投诉且经审核涉嫌侵权，无法查看","thepageyourequestedwasnotfound","未知错误"
-                ,"Objectmoved","404","页面没有找到","页面未找到","301MovedPermanently","加载异常",
-                "此帐号已被屏蔽, 内容无法查看","链接不存在");
-        
-        return cList.stream().anyMatch(title::contains) || eList.stream().anyMatch(title::equals);
-    }
-    
-    
-    /**
-     * 处理知乎链接
-     * 
-     * */
-    private static String treatZhihuUrl(String url) {
-        if(url.contains("/answer/")) {
-            url = "https://api.zhihu.com/answers/" + url.replaceAll(".*/answer/", "");
-        }else if(url.contains("/question/") && !url.contains("/answer/")) {
-            url = "https://api.zhihu.com/questions/" + url.replaceAll(".*/question/", "");
-        }else if(url.contains("/p/")) {
-            url = "https://api.zhihu.com/articles/" + url.replaceAll(".*/p/", "");
-        }
-        return url;
-    }
-
-    
-}
+//package com.zhiwei.source_forward.crawler;
+//
+//import java.util.ArrayList;
+//import java.util.Arrays;
+//import java.util.HashMap;
+//import java.util.List;
+//import java.util.Map;
+//import java.util.Objects;
+//import java.util.concurrent.Semaphore;
+//
+//import org.apache.logging.log4j.LogManager;
+//import org.apache.logging.log4j.Logger;
+//import org.jsoup.Jsoup;
+//import org.jsoup.nodes.Document;
+//
+//import com.alibaba.fastjson.JSONObject;
+//import com.zhiwei.http.boot.HttpBoot;
+//import com.zhiwei.http.util.RequestUtils;
+//import com.zhiwei.source_forward.bean.UrlLiveBean;
+//import com.zhiwei.task.async.TaskBoot;
+//import com.zhiwei.task.sync.GroupSync;
+//
+//import okhttp3.Request;
+//import okhttp3.Response;
+//
+//public class UrlLiveCrawlerNew {
+//    
+//    private static final Logger logger = LogManager.getLogger(UrlLiveCrawlerNew.class);
+//    private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(2).throwException(false).build();
+//    private static Semaphore semaphore = new Semaphore(5);
+//
+//    
+//    public List<UrlLiveBean> judgeIsDelete(List<String> urlList) {
+//        GroupSync counter = new GroupSync();
+//        List<UrlLiveBean> ulbList = new ArrayList<>();
+//        urlList.forEach(url -> {
+//            try {
+//                counter.add();
+//                TaskBoot.blockingAsync(() -> {
+//                    try {
+//                        counter.add();
+//                        UrlLiveBean ulb = dealUrlLive(url);
+//                        if(Objects.nonNull(ulb)) {
+//                            ulbList.add(ulb);
+//                        }
+//                    } catch (Exception e) {
+//                        logger.error("链接是否删除新", e);
+//                    } finally {
+//                        counter.done();
+//                    }
+//                });
+//            } catch (Exception e2) {
+//                logger.error("数据出错 {}" ,e2);
+//            } finally {
+//                counter.done();
+//            }
+//        });
+//        try {
+//            counter.await();
+//        } catch (InterruptedException e) {
+//            e.printStackTrace();
+//        }
+//        return ulbList;
+//    }
+//    
+//    private UrlLiveBean dealUrlLive(String url) {
+//        try {
+//            url = dealUrl(url);
+//            logger.info("当前处理 URL: {}", url);
+//            Map<String,String> headers = new HashMap<>();
+////            Map<String,String> headers = HeaderTool.getCommonHead();
+//            if(url.contains("www.toutiao.com")){
+//                headers.put("referer", url);
+//            }else if(url.contains("zhihu.com")) {
+//                url = treatZhihuUrl(url);
+//            }
+//            Request request = RequestUtils.wrapGet(url, headers);
+//            int code = 404;
+//            for(int i = 0; i < 2; i++) {
+//                try (Response response = httpBoot.syncCall(request)){
+//                    if(response.isSuccessful()) {
+//                        return matchDel(response.body().string(), url);
+//                    }else {
+//                        code = response.code();
+//                    }
+//                } catch (Exception e) {
+//                    logger.error("解析", e);
+//                }
+//            }
+//            if(code == 403){
+//                return callBack(url, -1, String.valueOf(code));
+//            }else {
+//                return callBack(url, 1, String.valueOf(code));
+//            }
+//        } catch (Exception e) {
+//            e.printStackTrace();
+//            return null;
+//        }
+//    }
+//    
+//    private UrlLiveBean callBack(String url,int i,String title) {
+//        if(i == 1) {
+//            return new UrlLiveBean(url, true, title);
+//        }else {
+//            return new UrlLiveBean(url, i, title);
+//        }
+//    }
+//    
+//    private String dealUrl(String url) {
+//        try {
+//            if(url.contains("toutiao.com")) {
+//                if(url.contains("www.toutiao.com")) {
+//                    
+//                }else {
+//                    url = url.replace("toutiao.com", "www.toutiao.com");
+//                }
+//                if(url.contains("https")) {
+//                    
+//                }else {
+//                    url = url.replace("http", "https");
+//                }
+//                if(url.contains("group")) {
+//                    url = "https://www.toutiao.com/a" + url.split("/")[4] + "/";
+//                }
+//            }else if(url.contains("mp.weixin.qq.com")) {
+//                if(url.contains("https")) {
+//                
+//                }else {
+//                    url = url.replace("http", "https");
+//                }
+//            }else if(url.contains("a.mp.uc.cn") && url.contains("wm_aid")) {
+//                url = "http://ff.dayu.com/contents/origin/"+url.split("wm_aid=")[1].split("!!wm_id")[0]+"?biz_id=1002&_fetch_author=1&_fetch_incrs=1";
+//            }
+//            return url;
+//        } catch (Exception e) {
+//            return url;
+//        }
+//    }
+//    
+//    /***
+//     * @Title: matchDel 
+//     * @author hero 
+//     * @Description: 验证链接是否有效
+//     * @param @param page
+//     * @param @return 设定文件 
+//     * @return boolean 返回类型
+//     */
+//    public UrlLiveBean matchDel(String result,String url){
+//        try {
+//            Document doc = Jsoup.parse(result);
+//            String title = null;
+//            if(url.contains("mp.weixin.qq.com") || url.contains("post.mp.qq.com") || url.contains("weixin.sogou.com")){
+//                title = doc.select("h2.rich_media_title").text().replaceAll(" ", "");
+//                if(Objects.isNull(title) || title.isEmpty()) {
+//                    title = doc.select("p.title").text();
+//                }
+//                if(Objects.isNull(title) || title.isEmpty()) {
+//                    title = doc.select("h3.msg-title").text();
+//                }
+//                if(Objects.isNull(title) || title.isEmpty()) {
+//                    title = doc.select("div.global_error_msg.warn").text();
+//                }
+//                if(Objects.isNull(title) || title.isEmpty()) {
+//                    title = doc.select("p.tips").text();
+//                }
+//                if(Objects.isNull(title) || title.isEmpty()) {
+//                    title = doc.select("h2").text();
+//                }
+//            }else if(url.contains("kuaibao")){
+//                title = doc.select("p.title").text().replaceAll(" ", "");
+//            }else if(url.contains("chinadaily.com.cn")){
+//                title = doc.select("p.style1").text().replaceAll(" ", "");
+//            }else if(url.contains("baidu.com") || url.contains("hao123.com")) {
+//                title = doc.select("p#contaniner").text();
+//            }else if(url.contains("kanfanews.com")) {
+//                title = doc.select("p#tit").text();
+//            }else if(url.contains("ifeng.com") && result.contains("url=http://www.ifeng.com/")) {
+//                title = "网页已删除";
+//            }else if(url.contains("a.mp.uc.cn")) {
+//                try {
+//                    JSONObject json = JSONObject.parseObject(result);
+//                    title = json.getJSONObject("data").getString("title");
+//                    if(Objects.isNull(title) || title.length() < 1) {
+//                        title = "网页已删除";
+//                    }
+//                } catch (Exception e) {
+//                    logger.error(" uc 数据 json 转换失败", e);
+//                }
+//            }else if(url.contains("huanqiu.com") && result.contains("www.huanqiu.com/404.html")) {
+//                title = "网页已删除";
+//            }else if(url.contains("zhihu.com")) {
+//                JSONObject resultJson = JSONObject.parseObject(result);
+//                
+//                title = resultJson.getString("title")!=null?resultJson.getString("title"):resultJson.getString("message");
+//            }
+//            
+//            //若title 为拿到 用 此方法
+//            if(Objects.isNull(title) || title.length() < 1) {
+//                title = doc.select("div.adiv > p > span").text().replaceAll(" ", "");
+//            }
+//            
+//            //若title 为拿到 用 此方法
+//            if(Objects.isNull(title) || title.length() < 1) {
+//                title = doc.select("title").text().replaceAll(" ", "");
+//            }
+//            
+//            //若title 为拿到 用 此方法
+//            if(Objects.isNull(title) || title.length() < 1) {
+//                title = doc.select("h1").text().replaceAll(" ", "");
+//            }
+//            
+//           //若title 为拿到 用 此方法
+//            if(Objects.isNull(title) || title.length() < 1 || result.length() < 200) {
+//                title = "网页已删除";
+//            }
+//        
+//            if(Objects.nonNull(title) && title.length() > 1){
+//                return new UrlLiveBean(url, isDelete(title),title);
+//            } else {
+//                return null;
+//            }
+//        } catch (Exception e) {
+//            return null;
+//        }
+//    }
+//    
+//    /**
+//     * 
+//     * @Description 标题判断
+//     * @param title
+//     * @return
+//     */
+//    private boolean isDelete(String title) {
+//        List<String> eList = Arrays.asList("系统出错","该内容已被发布者删除","网页已删除"
+//                ,"此帐号已自主注销，内容无法查看","页面提示","正在维护中"
+//                ,"此文章被第三方评估为不实信息","财经头条","知识100题","502BadGateway"
+//                ,"提示信息","跳转页","跳转中...","此帐号在冻结期,内容无法查看","东北新闻网"
+//                ,"百度一下，你就知道","帐号已迁移","手机百度","内容被删除","亚博国际|首页"
+//                ,"中国软件网","云广网","新浪首页","文章暂时找不到了","-法易网"
+//                ,"【一点资讯】www.yidianzixun.com","错误页面","网站暂停通知","【快资讯】你的专属资讯平台"
+//                ,"百度新闻——全球最大的中文新闻平台","以上文章由以下机构判定为不实信息","该公众号已迁移"
+//                ,"财经网-CAIJING.COM.CN","蚂蚁资讯","参数错误","时尚头条_YOKA时尚网","该文章已经被删除"
+//                ,"网易","链接已过期","找不到页面","今晚网","该文章已被删除", "该回答已被删除-知乎", "资源不存在");
+//        
+//        List<String> cList = Arrays.asList("提示信息-","此内容因违规无法查看","微信公众号不存在"
+//                ,"此内容被投诉且经审核涉嫌侵权，无法查看","thepageyourequestedwasnotfound","未知错误"
+//                ,"Objectmoved","404","页面没有找到","页面未找到","301MovedPermanently","加载异常",
+//                "此帐号已被屏蔽, 内容无法查看","链接不存在");
+//        
+//        return cList.stream().anyMatch(title::contains) || eList.stream().anyMatch(title::equals);
+//    }
+//    
+//    
+//    /**
+//     * 处理知乎链接
+//     * 
+//     * */
+//    private static String treatZhihuUrl(String url) {
+//        if(url.contains("/answer/")) {
+//            url = "https://api.zhihu.com/answers/" + url.replaceAll(".*/answer/", "");
+//        }else if(url.contains("/question/") && !url.contains("/answer/")) {
+//            url = "https://api.zhihu.com/questions/" + url.replaceAll(".*/question/", "");
+//        }else if(url.contains("/p/")) {
+//            url = "https://api.zhihu.com/articles/" + url.replaceAll(".*/p/", "");
+//        }
+//        return url;
+//    }
+//
+//    
+//}
--- a/src/main/java/com/zhiwei/source_forward/run/SourceForward.java
+++ b/src/main/java/com/zhiwei/source_forward/run/SourceForward.java
@@ -80,7 +80,7 @@ public class SourceForward {
 	public static void main(String[] args) {
        ProxyInit.initProxy();
        List<String> urlList = new ArrayList<>();
-        urlList.add("http://gu.qq.com/resources/shy/news/detail-v2/index.html?#/index?id=SN202006091653447945411f");
+        urlList.add("https://ypstatic.cnnb.com.cn/yppage-share/news/share/news_detail?newsId=627223d9e4b042b45e211c5a");
        List<SourceForwardBean> da = SourceForward.getSourceForward(urlList);
        for(SourceForwardBean sfb : da) {
            System.out.println("=============="+sfb.toString());

--- a/src/main/java/com/zhiwei/source_forward/run/URLLive.java
+++ b/src/main/java/com/zhiwei/source_forward/run/URLLive.java
@@ -98,7 +98,7 @@ public class URLLive {
              };
              crawler.submitTask(callback,urlList.toArray(new String[urlList.size()])).await();
          }catch (Exception e){
-              logger.error(" 数据采集运行有问题 {} ", e);
+              logger.error(" 数据采集运行有问题 ", e);
          }
          return list;
      }

--- a/src/main/java/com/zhiwei/source_forward/util/MatchContent.java
+++ b/src/main/java/com/zhiwei/source_forward/util/MatchContent.java
-package com.zhiwei.source_forward.util;
-
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.jsoup.Jsoup;
-import org.jsoup.nodes.Document;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import com.kohlschutter.boilerpipe.extractors.ArticleExtractor;
-import com.zhiwei.tools.tools.ZhiWeiTools;
-
-/**
- * @ClassName: MatchChannel 
- * @Description: 匹配频道
- * @author hero 
- * @date 2018年6月30日 上午10:27:58
- */
-public class MatchContent {
-
-	private static Logger logger = LoggerFactory.getLogger(MatchContent.class);
-	/**
-	 * @Title: matchContent 
-	 * @author hero 
-	 * @Description: 匹配文章正文
-	 * @param @param url
-	 * @param @param html
-	 * @param @return 设定文件 
-	 * @return String 返回类型
-	 */
-	public static String matchContent(String url,String html) {
-		String content = null;
-		try {
-		    Document document = Jsoup.parse(html);
-		    if(url.contains("weixin.qq.com")) {
-		        content = matchContentWeixin(html);
-		    }else if(url.contains("toutiao.com")) {
-		        content = matchContentToutiao(html);
-		    }
-		    if(content == null || content.length() < 10) {
-		        content = mathchContent(html, document);
-		    }
-			return ZhiWeiTools.delHTMLTag(content);
-		} catch (Exception e) {
-			logger.error("获取全文失败",e);
-			content = null;
-		}
-		return content;
-	}
-	
-	/**
-	 * 
-	 * @Description 头条正文获取
-	 * @param html
-	 * @return
-	 */
-	private static String matchContentToutiao(String html) {
-	    Pattern pa = Pattern.compile("content:(.*?)',");
-	    Matcher ma = pa.matcher(html);
-	    while(ma.find()) {
-	        return ma.group(1);
-	    }
-        return null;
-    }
-
-    /**
-	 * 
-	 * @Description 微信文本获取
-	 * @param html
-	 * @return
-	 */
-    private static String matchContentWeixin(String contentHtml) {
-        try {
-            Document document = Jsoup.parse(contentHtml);
-            if (contentHtml.contains("js_article")) {
-                return document.select("div#js_article").text();
-            } else if (contentHtml.contains("js_share_content")) {
-                return document.select("div#js_share_content").text();
-            }
-            if (contentHtml.contains("content_tpl")) {
-                String text = document.select("script#content_tpl").html();
-                return Jsoup.parse(text).text();
-            }
-        } catch (Exception e) {
-            logger.error("微信全文解析出错 {}", e);
-        }
-        return "";
-    }
-
-
-    /**
-	 * @Title: mathchContent 
-	 * @author hero 
-	 * @Description: 匹配正文数据
-	 * @param @param html
-	 * @param @param document
-	 * @param @return 设定文件 
-	 * @return String 返回类型
-	 */
-	private static String mathchContent(String html,Document document){
-		/** 正文抽取,目的是避免正文中存在相应匹配规则 **/
-	    String content = null;
-        try {
-          content = ArticleExtractor.getInstance().getText(html);
-        } catch (Exception e) {
-            logger.error("正文抽取失败,获取全文文本:",e);
-            content = document.text();
-        }
-	    
-//		String content = null;
-//		try {
-//			News news = ContentExtractor.getNewsByHtml(html);
-//			content = TreateData.filterSpecialCharacter(news.getContent());
-//		} catch (Exception e) {
-//			logger.error("正文抽取失败,获取全文文本:",e);
-//			content = document.text();
-//		}
-		return content;
-	}
-}
+package com.zhiwei.source_forward.util;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.kohlschutter.boilerpipe.extractors.ArticleExtractor;
+import com.zhiwei.tools.tools.ZhiWeiTools;
+
+/**
+ * @ClassName: MatchChannel 
+ * @Description: 匹配频道
+ * @author hero 
+ * @date 2018年6月30日 上午10:27:58
+ */
+public class MatchContent {
+
+	private static Logger logger = LoggerFactory.getLogger(MatchContent.class);
+	/**
+	 * @Title: matchContent 
+	 * @author hero 
+	 * @Description: 匹配文章正文
+	 * @param @param url
+	 * @param @param html
+	 * @param @return 设定文件 
+	 * @return String 返回类型
+	 */
+	public static String matchContent(String url,String html) {
+		String content = null;
+		try {
+		    Document document = Jsoup.parse(html);
+		    if(url.contains("weixin.qq.com")) {
+		        content = matchContentWeixin(html);
+		    }else if(url.contains("toutiao.com")) {
+		        content = matchContentToutiao(html);
+		    }
+		    if(content == null || content.length() < 10) {
+		        content = mathchContent(html, document);
+		    }
+			return ZhiWeiTools.delHTMLTag(content);
+		} catch (Exception e) {
+			logger.error("获取全文失败",e);
+			content = null;
+		}
+		return content;
+	}
+	
+	/**
+	 * 
+	 * @Description 头条正文获取
+	 * @param html
+	 * @return
+	 */
+	private static String matchContentToutiao(String html) {
+	    Pattern pa = Pattern.compile("content:(.*?)',");
+	    Matcher ma = pa.matcher(html);
+	    while(ma.find()) {
+	        return ma.group(1);
+	    }
+        return null;
+    }
+
+    /**
+	 * 
+	 * @Description 微信文本获取
+	 * @param html
+	 * @return
+	 */
+    private static String matchContentWeixin(String contentHtml) {
+        try {
+            Document document = Jsoup.parse(contentHtml);
+            if (contentHtml.contains("js_article")) {
+                return document.select("div#js_article").text();
+            } else if (contentHtml.contains("js_share_content")) {
+                return document.select("div#js_share_content").text();
+            }
+            if (contentHtml.contains("content_tpl")) {
+                String text = document.select("script#content_tpl").html();
+                return Jsoup.parse(text).text();
+            }
+        } catch (Exception e) {
+            logger.error("微信全文解析出错 {}", e);
+        }
+        return "";
+    }
+
+
+    /**
+	 * @Title: mathchContent 
+	 * @author hero 
+	 * @Description: 匹配正文数据
+	 * @param @param html
+	 * @param @param document
+	 * @param @return 设定文件 
+	 * @return String 返回类型
+	 */
+	private static String mathchContent(String html,Document document){
+		/** 正文抽取,目的是避免正文中存在相应匹配规则 **/
+	    String content = null;
+        try {
+            content = ArticleExtractor.getInstance().getText(html);
+        } catch (Exception e) {
+            logger.error("正文抽取失败,获取全文文本:",e);
+            content = document.text();
+        }
+	    
+//		String content = null;
+//		try {
+//			News news = ContentExtractor.getNewsByHtml(html);
+//			content = TreateData.filterSpecialCharacter(news.getContent());
+//		} catch (Exception e) {
+//			logger.error("正文抽取失败,获取全文文本:",e);
+//			content = document.text();
+//		}
+		return content;
+	}
+}
--- a/src/main/java/com/zhiwei/source_forward/util/ProxyInit.java
+++ b/src/main/java/com/zhiwei/source_forward/util/ProxyInit.java
 package com.zhiwei.source_forward.util;

-import com.zhiwei.crawler.core.proxy.ProxyFactory;
-import com.zhiwei.proxy.config.SimpleConfig;
-import com.zhiwei.source_forward.config.ProxyConfig;
+import org.apache.dubbo.config.ApplicationConfig;
+import org.apache.dubbo.config.ConsumerConfig;
+import org.apache.dubbo.config.RegistryConfig;
+
+import com.zhiwei.http.proxy.CynomysFactory;
+import com.zhiwei.network.cynomys.consumer.CynomysConsumer;
+import com.zhiwei.network.cynomys.consumer.CynomysConsumerFactory;

 /**
 * 初始化代理
@@ -16,10 +20,18 @@ public class ProxyInit {
     * void
     */
    public static void initProxy() {
-        String address = ProxyConfig.registry;
-        String appName = "xumiaoxin";
-        long appId = ProxyConfig.proxyid;
-        ProxyFactory.init(SimpleConfig.builder().registry(address).appName(appName).appId(appId).group(ProxyConfig.group).build());
+        ApplicationConfig applicationConfig = new ApplicationConfig();
+        applicationConfig.setName("actool");
+        RegistryConfig registryConfig = new RegistryConfig();
+        registryConfig.setAddress("zookeeper://192.168.0.30:2181"); // NOTE(review): registry address is now hard-coded; the removed code read it from ProxyConfig
+        ConsumerConfig consumerConfig = new ConsumerConfig();
+// Set the consumer group
+        consumerConfig.setGroup("local"); // NOTE(review): the removed code took the group from ProxyConfig.group
+        String username = "18271694195"; // NOTE(review): credential hard-coded in source; consider moving it to external configuration
+        String password = "Zhiwei289"; // NOTE(review): credential hard-coded in source; consider moving it to external configuration
+// Create the consumer; applicationConfig is an optional argument
+        CynomysConsumer consumer = CynomysConsumerFactory.create(applicationConfig, registryConfig, consumerConfig, username, password);
+        CynomysFactory.init(consumer); // takes the place of the removed ProxyFactory.init(...) call
    }
    
 }