各个采集验证添加休眠，避免数据过多导致程序阻塞

9fcfba2d · zhiwei · aa059934 · 9fcfba2d · 9fcfba2d · 9fcfba2d
Commit 9fcfba2d authored Aug 13, 2020 by zhiwei
5 changed files
--- a/src/main/java/com/zhiwei/source_forward/crawler/ContentCrawler.java
+++ b/src/main/java/com/zhiwei/source_forward/crawler/ContentCrawler.java
-package com.zhiwei.source_forward.crawler;
-
-import java.util.Objects;
-
-import org.apache.logging.log4j.LogManager;
-import org.apache.logging.log4j.Logger;
-
-import com.zhiwei.async.GroupSync;
-import com.zhiwei.crawler.core.HttpBoot;
-import com.zhiwei.crawler.core.proxy.ProxyHolder;
-import com.zhiwei.crawler.core.utils.RequestUtils;
-import com.zhiwei.source_forward.bean.ContentBean;
-import com.zhiwei.source_forward.bean.ContentBean.Attribution;
-import com.zhiwei.source_forward.util.ContentDataCallback;
-import com.zhiwei.source_forward.util.MatchContent;
-
-import okhttp3.Request;
-
-public class ContentCrawler {
-
-    private static Logger logger = LogManager.getLogger(ContentCrawler.class);
-    private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
-    
-    /**
-     * 
-     * @Description 链接传入 并 返回采集完信号
-     * @param callback
-     * @param urls
-     * @return
-     * @throws Exception
-     */
-    public GroupSync submitTask(ContentDataCallback callback,
-            String... urls) {
-        GroupSync counter = new GroupSync();
-        start(counter, callback, urls);
-        return counter;
-    }
-
-    /**
-     * 
-     * @Description 提交链接
-     * @param counter
-     * @param callback
-     * @param urls
-     */
-    private void start(GroupSync counter,
-            ContentDataCallback callback, String... urls) {
-        if (urls != null && urls.length > 0) {
-            for (String url : urls) {
-                if (url != null) {
-                    try {
-                        search(counter, url, Attribution.of(url), callback);
-                    } catch (Exception e) {
-                        logger.error("搜索创建出错", e);
-                    }
-                }
-            }
-        }
-    }
-
-    /**
-     * 
-     * @Description 链接获取文章信息
-     * @param counter
-     * @param url
-     * @param attr
-     * @param callback
-     * @return
-     */
-    private GroupSync search(GroupSync counter,
-            String url, Attribution attr, ContentDataCallback callback) {
-        logger.info("当前处理 URL: {}", url);
-        Request request = RequestUtils.wrapGet(url);
-        counter.add();
-        
-        httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY, true).whenComplete((rs,ex) -> {
-            try {
-                if (Objects.isNull(ex)) {
-                    parseHtml(rs.body().string(), attr, callback);
-                } else {
-                    logger.info("{} 搜索结果访问失败: {}", request.url().url(),ex);
-                }
-            } catch (Exception e) {
-                logger.info("搜索结果访问失败: {}", ex);
-            } finally {
-                counter.done();
-            }
-            
-        });
-        
-        return counter;
-    }
-
-    /**
-     * 
-     * 
-     * @Description 获取正文解析
-     * @param response
-     * @param attr
-     * @param callback
-     */
-    private void parseHtml(String result, Attribution attr,
-            ContentDataCallback callback) {
-        try {
-            String content = MatchContent.matchContent(attr.get().toString(),
-                    result);
-            ContentBean cb = new ContentBean(attr.get().toString(), content);
-            if (callback == null) {
-                logger.warn("DataCallback 对象为 null，无法保存数据");
-            } else {
-                callback.onData(cb, attr);
-            }
-        } catch (Exception e) {
-            logger.error("网页链接失效", e);
-        }
-
-    }
-
-}
+package com.zhiwei.source_forward.crawler;
+
+import java.util.Objects;
+
+import com.zhiwei.tools.tools.ZhiWeiTools;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+
+import com.zhiwei.async.GroupSync;
+import com.zhiwei.crawler.core.HttpBoot;
+import com.zhiwei.crawler.core.proxy.ProxyHolder;
+import com.zhiwei.crawler.core.utils.RequestUtils;
+import com.zhiwei.source_forward.bean.ContentBean;
+import com.zhiwei.source_forward.bean.ContentBean.Attribution;
+import com.zhiwei.source_forward.util.ContentDataCallback;
+import com.zhiwei.source_forward.util.MatchContent;
+
+import okhttp3.Request;
+
+/**
+ * Fetches article pages asynchronously and passes the extracted body text to a
+ * {@link ContentDataCallback}.
+ *
+ * <p>Progress is tracked with a {@link GroupSync}: it is incremented once per
+ * issued request and decremented when that request's completion handler ends.
+ */
+public class ContentCrawler {
+
+    private static final Logger logger = LogManager.getLogger(ContentCrawler.class);
+    private static final HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
+
+    /**
+     * Submits the given links for crawling.
+     *
+     * @param callback receiver for each extracted article; when {@code null} the
+     *                 result is dropped and a warning is logged
+     * @param urls     links to crawl; {@code null} entries are skipped
+     * @return the counter tracking the requests issued by this call
+     */
+    public GroupSync submitTask(ContentDataCallback callback,
+            String... urls) {
+        GroupSync counter = new GroupSync();
+        start(counter, callback, urls);
+        return counter;
+    }
+
+    /**
+     * Issues one request per non-null link, pausing between entries.
+     *
+     * @param counter  counter shared by all requests of this submission
+     * @param callback receiver for extracted articles
+     * @param urls     links to crawl; may be {@code null} or empty
+     */
+    private void start(GroupSync counter,
+            ContentDataCallback callback, String... urls) {
+        if (urls != null && urls.length > 0) {
+            for (String url : urls) {
+                // Throttle submissions so a large batch does not stall the program.
+                // The argument is presumably milliseconds - TODO confirm against ZhiWeiTools.sleep.
+                ZhiWeiTools.sleep(100);
+                if (url != null) {
+                    try {
+                        search(counter, url, Attribution.of(url), callback);
+                    } catch (Exception e) {
+                        logger.error("搜索创建出错", e);
+                    }
+                }
+            }
+        }
+    }
+
+    /**
+     * Requests a single link and parses the response when it arrives.
+     *
+     * @param counter  incremented before the request is issued and decremented
+     *                 once the completion handler has finished
+     * @param url      link to fetch
+     * @param attr     attribution passed through to the callback
+     * @param callback receiver for the extracted article
+     * @return the same {@code counter} instance
+     */
+    private GroupSync search(GroupSync counter,
+            String url, Attribution attr, ContentDataCallback callback) {
+        logger.info("当前处理 URL: {}", url);
+        Request request = RequestUtils.wrapGet(url);
+        counter.add();
+
+        try {
+            httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY, true).whenComplete((rs, ex) -> {
+                try {
+                    if (Objects.isNull(ex)) {
+                        parseHtml(rs.body().string(), attr, callback);
+                    } else {
+                        logger.info("{} 搜索结果访问失败: {}", request.url().url(), ex);
+                    }
+                } catch (Exception e) {
+                    // Log the exception caught here; 'ex' is null on this path.
+                    // The trailing Throwable makes the logger print its stack trace.
+                    logger.info("搜索结果访问失败: {}", url, e);
+                } finally {
+                    counter.done();
+                }
+            });
+        } catch (RuntimeException e) {
+            // The request was never scheduled, so no completion handler will run:
+            // balance the add() above before propagating to the caller.
+            counter.done();
+            throw e;
+        }
+
+        return counter;
+    }
+
+    /**
+     * Extracts the article body from the page source and delivers it to the callback.
+     *
+     * @param result   raw page source returned by the request
+     * @param attr     attribution whose value is used as the article link
+     * @param callback receiver for the extracted article; when {@code null} the
+     *                 result is dropped and a warning is logged
+     */
+    private void parseHtml(String result, Attribution attr,
+            ContentDataCallback callback) {
+        try {
+            String content = MatchContent.matchContent(attr.get().toString(),
+                    result);
+            ContentBean cb = new ContentBean(attr.get().toString(), content);
+            if (callback == null) {
+                logger.warn("DataCallback 对象为 null，无法保存数据");
+            } else {
+                callback.onData(cb, attr);
+            }
+        } catch (Exception e) {
+            logger.error("网页链接失效", e);
+        }
+
+    }
+
+}
--- a/src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
+++ b/src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
@@ -5,6 +5,7 @@ import java.util.List;
 import java.util.Map;
 import java.util.Objects;

+import com.zhiwei.tools.tools.ZhiWeiTools;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 import org.jsoup.Jsoup;
@@ -64,6 +65,7 @@ public class MediaSelfSourceCrawler {
    private void start(GroupSync counter,MediaSelfSourceDataCallBack callback, String... urls) {
        if (urls != null && urls.length > 0) {
            for (String url : urls) {
+                ZhiWeiTools.sleep(100);
                counter.add();
                if (url != null) {
                    try {

--- a/src/main/java/com/zhiwei/source_forward/crawler/SourceForwardCrawler.java
+++ b/src/main/java/com/zhiwei/source_forward/crawler/SourceForwardCrawler.java
-package com.zhiwei.source_forward.crawler;
-
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Objects;
-
-import org.apache.logging.log4j.LogManager;
-import org.apache.logging.log4j.Logger;
-import org.jsoup.Jsoup;
-import org.jsoup.nodes.Document;
-import org.jsoup.nodes.Node;
-
-import com.zhiwei.async.GroupSync;
-import com.zhiwei.crawler.core.HttpBoot;
-import com.zhiwei.crawler.core.proxy.ProxyHolder;
-import com.zhiwei.crawler.core.utils.RequestUtils;
-import com.zhiwei.source_forward.bean.SourceForwardBean;
-import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution;
-import com.zhiwei.source_forward.util.MatchChannel;
-import com.zhiwei.source_forward.util.MatchSource;
-import com.zhiwei.source_forward.util.SourceData;
-import com.zhiwei.source_forward.util.SourceForwardDataCallBack;
-
-import okhttp3.Request;
-
-public class SourceForwardCrawler {
-    
-    private static final Logger logger = LogManager.getLogger(SourceForwardCrawler.class);
-    
-    private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
-    private static List<String> sourceList = SourceData.getSourceList();
-    
-    public GroupSync submitTask(SourceForwardDataCallBack callback,String... urls) {
-        try {
-            GroupSync counter = new GroupSync();
-            start(counter, callback, urls);
-            return counter;
-        } catch (Exception e) {
-            logger.error(" exception  ", e);
-            return null;
-        }
-    }
-
-    private void start(GroupSync counter,SourceForwardDataCallBack callback, String... urls) {
-        if (urls != null && urls.length > 0) {
-            for (String url : urls) {
-                counter.add();
-                if (url != null) {
-                    try {
-                        search(counter, url, Attribution.of(url), callback);
-                    } catch (Exception e) {
-                        logger.error("搜索创建出错", e);
-                    } 
-                }
-                counter.done();
-            }
-        }
-    }
-
-    private GroupSync search(GroupSync counter, String url,Attribution attr, SourceForwardDataCallBack callback) {
-        logger.info("当前处理 URL: {}", url);
-        Map<String,String> headers = new HashMap<>();
-//        Map<String,String> headers = HeaderTool.getCommonHead();
-        if(url.contains("www.toutiao.com")){
-        	headers.put("referer", url);
-        }
-        if(url.contains("china.prcfe.com")) {
-            url = "http://china.prcfe.com/e/extend/ShowSource/?id=" + url.split("/")[url.split("/").length-1].split("\\.")[0];
-        }
-        Request request = RequestUtils.wrapGet(url, headers);
-        counter.add();
-        httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY, true).whenComplete((rs,ex) -> {
-            try {
-                if (Objects.isNull(ex)) {
-                    parseHtml(rs.body().string(), attr, callback);
-                } else {
-                    logger.info("{} 搜索结果访问失败: {}", request.url().url(), ex);
-                }
-            } catch (Exception e1) {
-                logger.error("解析出错",e1);
-            } finally {
-                counter.done();
-            }
-        });
-        return counter;
-    }
-
-    private void parseHtml(String body, Attribution attr,
-            SourceForwardDataCallBack callback) {
-        String source = null;
-        String channel = "新闻";
-        String isforward = "未知";
-        try {
-            Document document = Jsoup.parse(body);
-            if(attr.get().toString().contains("mp.weixin.qq.com")){
-                isforward = document.select("div#meta_content").select("span#copyright_logo").text();
-                if(isforward.contains("原创")){
-                    isforward = "原创";
-                }else {
-                    isforward = "未知";
-                }
-            }else if(attr.get().toString().contains("www.toutiao.com")){
-            	if(body.contains("isOriginal") && body.contains("isOriginal: true")){
-            	    isforward = "原创";
-				}
-            }else{
-                channel = MatchChannel.verifyChannel(attr.get().toString());
-                if(channel==null){
-                    List<Node> nodeList = document.head().childNodes();
-                    channel = MatchChannel.matchChannel(nodeList);
-                }
-                source = MatchSource.matchSource(attr.get().toString(),document.toString(), sourceList);
-            }
-        } catch (Exception e) {
-            source = null;
-            channel = "新闻";
-        }
-        logger.info(attr.get().toString()+"======="+channel+"================="+source);   
-        SourceForwardBean sfb = new SourceForwardBean(attr.get().toString(), channel, source,isforward);
-        if (callback == null) {
-            logger.warn("DataCallback 对象为 null，无法保存数据");
-        } else {
-            callback.onData(sfb, attr);
-        }
-    }
-    
-}
+package com.zhiwei.source_forward.crawler;
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+
+import com.zhiwei.tools.tools.ZhiWeiTools;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Node;
+
+import com.zhiwei.async.GroupSync;
+import com.zhiwei.crawler.core.HttpBoot;
+import com.zhiwei.crawler.core.proxy.ProxyHolder;
+import com.zhiwei.crawler.core.utils.RequestUtils;
+import com.zhiwei.source_forward.bean.SourceForwardBean;
+import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution;
+import com.zhiwei.source_forward.util.MatchChannel;
+import com.zhiwei.source_forward.util.MatchSource;
+import com.zhiwei.source_forward.util.SourceData;
+import com.zhiwei.source_forward.util.SourceForwardDataCallBack;
+
+import okhttp3.Request;
+
+/**
+ * Fetches pages asynchronously and reports, per link, the detected channel,
+ * source and original/forwarded flag to a {@link SourceForwardDataCallBack}.
+ */
+public class SourceForwardCrawler {
+
+    private static final Logger logger = LogManager.getLogger(SourceForwardCrawler.class);
+
+    private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
+    private static List<String> sourceList = SourceData.getSourceList();
+
+    /**
+     * Submits the given links for crawling.
+     *
+     * @param callback receiver for each result; when {@code null} the result is
+     *                 dropped and a warning is logged
+     * @param urls     links to crawl; {@code null} entries are skipped
+     * @return the counter tracking the issued requests, or {@code null} if
+     *         submission failed with an exception
+     */
+    public GroupSync submitTask(SourceForwardDataCallBack callback, String... urls) {
+        try {
+            GroupSync counter = new GroupSync();
+            start(counter, callback, urls);
+            return counter;
+        } catch (Exception e) {
+            logger.error(" exception  ", e);
+            return null;
+        }
+    }
+
+    /**
+     * Issues one request per non-null link, pausing between entries.
+     *
+     * @param counter  counter shared by all requests of this submission
+     * @param callback receiver for results
+     * @param urls     links to crawl; may be {@code null} or empty
+     */
+    private void start(GroupSync counter, SourceForwardDataCallBack callback, String... urls) {
+        if (urls != null && urls.length > 0) {
+            for (String url : urls) {
+                // This add()/done() pair brackets the iteration itself; search()
+                // performs its own add() for the request it issues.
+                counter.add();
+                // Throttle submissions so a large batch does not stall the program.
+                // The argument is presumably milliseconds - TODO confirm against ZhiWeiTools.sleep.
+                ZhiWeiTools.sleep(100);
+                if (url != null) {
+                    try {
+                        search(counter, url, Attribution.of(url), callback);
+                    } catch (Exception e) {
+                        logger.error("搜索创建出错", e);
+                    }
+                }
+                counter.done();
+            }
+        }
+    }
+
+    /**
+     * Requests a single link and parses the response when it arrives.
+     *
+     * @param counter  incremented before the request is issued and decremented
+     *                 once the completion handler has finished
+     * @param url      link to fetch; rewritten for china.prcfe.com pages
+     * @param attr     attribution passed through to the callback
+     * @param callback receiver for the result
+     * @return the same {@code counter} instance
+     */
+    private GroupSync search(GroupSync counter, String url, Attribution attr, SourceForwardDataCallBack callback) {
+        logger.info("当前处理 URL: {}", url);
+        Map<String, String> headers = new HashMap<>();
+        if (url.contains("www.toutiao.com")) {
+            headers.put("referer", url);
+        }
+        if (url.contains("china.prcfe.com")) {
+            // Build the ShowSource URL from the last path segment with its extension removed.
+            String[] segments = url.split("/");
+            url = "http://china.prcfe.com/e/extend/ShowSource/?id="
+                    + segments[segments.length - 1].split("\\.")[0];
+        }
+        Request request = RequestUtils.wrapGet(url, headers);
+        counter.add();
+        try {
+            httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY, true).whenComplete((rs, ex) -> {
+                try {
+                    if (Objects.isNull(ex)) {
+                        parseHtml(rs.body().string(), attr, callback);
+                    } else {
+                        logger.info("{} 搜索结果访问失败: {}", request.url().url(), ex);
+                    }
+                } catch (Exception e1) {
+                    logger.error("解析出错", e1);
+                } finally {
+                    counter.done();
+                }
+            });
+        } catch (RuntimeException e) {
+            // The request was never scheduled, so no completion handler will run:
+            // balance the add() above before propagating to the caller.
+            counter.done();
+            throw e;
+        }
+        return counter;
+    }
+
+    /**
+     * Derives channel, source and the original/forwarded flag from the page and
+     * delivers them to the callback.
+     *
+     * <p>WeChat and Toutiao links only get the original flag evaluated; all other
+     * links get channel and source matching. If parsing throws, the defaults
+     * (channel "新闻", no source) are reported instead.
+     *
+     * @param body     raw page source
+     * @param attr     attribution whose value is used as the page link
+     * @param callback receiver for the result; when {@code null} the result is
+     *                 dropped and a warning is logged
+     */
+    private void parseHtml(String body, Attribution attr,
+            SourceForwardDataCallBack callback) {
+        String url = attr.get().toString();
+        String source = null;
+        String channel = "新闻";
+        String isforward = "未知";
+        try {
+            Document document = Jsoup.parse(body);
+            if (url.contains("mp.weixin.qq.com")) {
+                isforward = document.select("div#meta_content").select("span#copyright_logo").text();
+                if (isforward.contains("原创")) {
+                    isforward = "原创";
+                } else {
+                    isforward = "未知";
+                }
+            } else if (url.contains("www.toutiao.com")) {
+                if (body.contains("isOriginal: true")) {
+                    isforward = "原创";
+                }
+            } else {
+                channel = MatchChannel.verifyChannel(url);
+                if (channel == null) {
+                    List<Node> nodeList = document.head().childNodes();
+                    channel = MatchChannel.matchChannel(nodeList);
+                }
+                source = MatchSource.matchSource(url, document.toString(), sourceList);
+            }
+        } catch (Exception e) {
+            // Fall back to the defaults, but leave a trace of what went wrong.
+            logger.warn("{} 来源解析出错", url, e);
+            source = null;
+            channel = "新闻";
+        }
+        logger.info(url + "=======" + channel + "=================" + source);
+        SourceForwardBean sfb = new SourceForwardBean(url, channel, source, isforward);
+        if (callback == null) {
+            logger.warn("DataCallback 对象为 null，无法保存数据");
+        } else {
+            callback.onData(sfb, attr);
+        }
+    }
+
+}
--- a/src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
+++ b/src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
-package com.zhiwei.source_forward.crawler;
-
-import static java.util.Objects.nonNull;
-
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Objects;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.logging.log4j.LogManager;
-import org.apache.logging.log4j.Logger;
-import org.jsoup.Jsoup;
-import org.jsoup.nodes.Document;
-
-import com.alibaba.fastjson.JSONObject;
-import com.zhiwei.async.GroupSync;
-import com.zhiwei.crawler.core.HttpBoot;
-import com.zhiwei.crawler.core.proxy.ProxyHolder;
-import com.zhiwei.crawler.core.utils.RequestUtils;
-import com.zhiwei.source_forward.bean.UrlLiveBean;
-import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution;
-import com.zhiwei.source_forward.util.UrlLiveDataCallback;
-import com.zhiwei.tools.tools.ZhiWeiTools;
-
-import okhttp3.Request;
-
-/**
- * 
- * @ClassName UrlLiveCrawler
- * @Description 判断页面是否存在
- * @author byte-zbs
- * @Date 2018年8月20日 下午3:34:57
- * @version 1.0.0
- */
-public class UrlLiveCrawler {
-
-    private static final Logger logger = LogManager.getLogger(UrlLiveCrawler.class);
-    private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).throwException(true).build();
-    
-    public GroupSync submitTask(UrlLiveDataCallback callback,String... urls) {
-        GroupSync counter = new GroupSync();
-        start(counter, callback, urls);
-        return counter;
-    }
-    
-    private void start(GroupSync counter,UrlLiveDataCallback callback, String... urls) {
-        if (nonNull(urls) && urls.length > 0) {
-            for (String url : urls) {
-                counter.add();
-                if (nonNull(url)) {
-                    try {
-//                        ZhiWeiTools.sleep(3000);
-                        search(counter, url, Attribution.of(url, 1), callback);
-                    } catch (Exception e) {
-                        logger.error("搜索创建出错:", e);
-                    }
-                }
-                counter.done();
-            }
-        }
-    }
-    
-    private GroupSync search(GroupSync counter, String url,
-            Attribution attr, UrlLiveDataCallback callback) {
-//        System.out.println(url);
-        url = dealUrl(url);
-        logger.info("当前处理 URL: {}", url);
-        Map<String,String> headers = new HashMap<>();
-        ProxyHolder ph = null;
-        if(url.contains("toutiao.com")){
-//        	headers.put("referer", url);
-//        	headers.put("cookie", "csrftoken=6d0e5967684dbb57cea14dc12858d263; WEATHER_CITY=%E5%8C%97%E4%BA%AC; __utmz=24953151.1576744435.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _ga=GA1.2.1570444666.1576744435; ttcid=d5fa0cec2bca426cbbddeb27a8bc72f629; UM_distinctid=170f5ad3a5b43e-04a6930812e3c5-f313f6d-240000-170f5ad3a5c40b; SLARDAR_WEB_ID=770be4fe-dd54-4701-a4ed-396b861a237d; CNZZDATA1259612802=2091325281-1587691681-%7C1589960650; s_v_web_id=verify_kaqh0jch_kZtSHRI9_GfBj_403X_8i5y_VOyWz4ATIx0d; __utmc=24953151; __utma=24953151.1570444666.1576744435.1591353562.1591579877.8; tt_webid=6837283963338622477; tt_webid=6837283963338622477; __tasessionId=dsei2aty41591951911851; tt_scid=M--AJ-FYwZ0qcYTzQCLyMeS5MlykLS6ktMkvqKKJmq-ghRxX4waEBhJ3YbheuNmi2b8a");
-//            headers.put("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
-//            headers.put("accept-encoding", "gzip, deflate, br");
-//            headers.put("accept-language", "zh-CN,zh;q=0.9");
-//            headers.put("cache-control", "no-cache");
-//            headers.put("sec-fetch-dest", "document");
-//            headers.put("sec-fetch-mode", "navigate");
-//            headers.put("sec-fetch-site", "same-origin");
-//            headers.put("sec-fetch-user", "?1");
-//            headers.put("upgrade-insecure-requests", "1");
-//            headers.put("user-agent", "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Mobile Safari/537.36");
-            ph = ProxyHolder.NAT_HEAVY_PROXY;
-        }else if(url.contains("zhihu.com")) {
-        	url = treatZhihuUrl(url);
-        	ph = ProxyHolder.NAT_HEAVY_PROXY;
-        }
-        try {
-            Request request = RequestUtils.wrapGet(url, headers);
-            if(Objects.nonNull(request)) {
-                counter.add();
-                // , new Proxy(Type.HTTP, new InetSocketAddress("119.3.38.9", 31128))
-                httpBoot.asyncCall(request, ph).whenComplete((rs,ex) -> {
-                    try {
-                        if (Objects.isNull(ex)) {
-                            if(rs.isSuccessful()) {
-                            	parseHtml(rs.body().string(), attr, callback);
-                            }else if(rs.code() == 404){
-                            	callBack(callback, attr, 1, String.valueOf(rs.code()));
-                            }else {
-                                callBack(callback, attr, -1, "程序无法判断");
-                            }
-                        } else {
-                            logger.error("e", ex);
-                            callBack(callback, attr, -1, "程序无法判断");
-                        }
-                    } catch (Exception e) {
-                        logger.error(" 数据是否删除 采集出错 {} ",e);
-                    }finally {
-                        counter.done();
-                    }
-                });
-                return counter;
-            }
-        } catch (Exception e2) {
-            logger.error("数据出错 {}" ,e2);
-        }
-        return counter;
-    }
-    
-    private void callBack(UrlLiveDataCallback callback,Attribution attr,int i,String title) {
-        UrlLiveBean ulb = null;
-        if(i == 1) {
-            ulb = new UrlLiveBean(attr.getAttr().toString(), true, title);
-        }else {
-            ulb = new UrlLiveBean(attr.getAttr().toString(), i, title);
-        }
-        if (callback == null) {
-            logger.warn("DataCallback 对象为 null，无法保存数据");
-        } else {
-            callback.onData(ulb, attr);
-        }
-    }
-    
-    private String dealUrl(String url) {
-        try {
-            if(url.contains("www.toutiao.com")) {
-                if(url.contains("www.toutiao.com")) {
-                    
-                }else {
-                    url = url.replace("toutiao.com", "www.toutiao.com");
-                }
-                if(url.contains("https")) {
-                    
-                }else {
-                    url = url.replace("http", "https");
-                }
-                if(url.contains("group")) {
-                    url = "https://www.toutiao.com/a" + url.split("/")[4] + "/";
-                }
-            }else if(url.contains("mp.weixin.qq.com")) {
-                if(url.contains("https")) {
-                
-                }else {
-                    url = url.replace("http", "https");
-                }
-            }else if(url.contains("a.mp.uc.cn") && url.contains("wm_aid")) {
-                url = "http://ff.dayu.com/contents/origin/"+url.split("wm_aid=")[1].split("!!wm_id")[0]+"?biz_id=1002&_fetch_author=1&_fetch_incrs=1";
-            }else if(url.contains("tznew.58.com/view") && url.contains("infoid=")) {
-                // https://tznew.58.com/view/c/sharingDetailNew?infoid=117073473
-                url = "https://tznew.58.com/tznew/c/info-detail?infoid=" + url.split("infoid=")[1].split("&")[0];
-            }
-            return url;
-        } catch (Exception e) {
-            return url;
-        }
-    }
-    
-    /**
-     * 
-     * @Description 判断是否删除
-     * @param html
-     * @param attr
-     * @param callback
-     */
-    private void parseHtml(String html, Attribution attr,
-            UrlLiveDataCallback callback) {
-        if (callback == null) {
-            logger.warn("DataCallback 对象为 null，无法保存数据");
-        } else {
-            UrlLiveBean ulb = matchDel(html, attr, attr.getAttr().toString());
-            if(Objects.nonNull(ulb)) {
-                callback.onData(ulb, attr);
-            }else {
-                callBack(callback, attr, -1, "程序无法判断");
-            }
-        }
-    }
-    
-    /***
-     * @Title: matchDel 
-     * @author hero 
-     * @Description: 验证链接是否有效
-     * @param @param page
-     * @param @return 设定文件 
-     * @return boolean 返回类型
-     */
-    public UrlLiveBean matchDel(String result,Attribution attr,String url){
-        try {
-            Document doc = Jsoup.parse(result);
-            String title = null;
-            if(url.contains("mp.weixin.qq.com") || url.contains("post.mp.qq.com") || url.contains("weixin.sogou.com")){
-            	title = doc.select("h2.rich_media_title").text().replaceAll(" ", "");
-            	if(Objects.isNull(title) || title.isEmpty()) {
-            	    title = doc.select("p.title").text();
-            	}
-            	if(Objects.isNull(title) || title.isEmpty()) {
-                    title = doc.select("h3.msg-title").text();
-                }
-            	if(Objects.isNull(title) || title.isEmpty()) {
-                    title = doc.select("div.global_error_msg.warn").text();
-                }
-            	if(Objects.isNull(title) || title.isEmpty()) {
-                    title = doc.select("p.tips").text();
-                }
-            	if(Objects.isNull(title) || title.isEmpty()) {
-                    title = doc.select("h2").text();
-                }
-            	// 获取title
-                Matcher ma5 = Pattern.compile("var msg_title = \'(.*)\'")
-                        .matcher(result);
-                if (ma5.find()) {
-                    title = ma5.group(1).replaceAll("  ", " ").trim();
-                }
-            }else if(url.contains("kuaibao")){
-                title = doc.select("p.title").text().replaceAll(" ", "");
-            }else if(url.contains("chinadaily.com.cn")){
-                title = doc.select("p.style1").text().replaceAll(" ", "");
-            }else if(url.contains("baidu.com") || url.contains("hao123.com")) {
-                title = doc.select("p#contaniner").text();
-            }else if(url.contains("kanfanews.com")) {
-                title = doc.select("p#tit").text();
-            }else if(url.contains("ifeng.com") && result.contains("url=http://www.ifeng.com/")) {
-                title = "网页已删除";
-            }else if(url.contains("a.mp.uc.cn")) {
-                try {
-                    JSONObject json = JSONObject.parseObject(result);
-                    title = json.getJSONObject("data").getString("title");
-                } catch (Exception e) {
-                    logger.error(" uc 数据 json 转换失败", e);
-                }
-            }else if(url.contains("huanqiu.com") && result.contains("www.huanqiu.com/404.html")) {
-                title = "网页已删除";
-            }else if(url.contains("zhihu.com")) {
-            	JSONObject resultJson = JSONObject.parseObject(result);
-            	if(url.contains("/answer/")) {
-            	    title = resultJson.getJSONObject("question").getString("title");
-                }else if(url.contains("/question/") && !url.contains("/answer/") || url.contains("/p/")) {
-                    title = resultJson.getString("title");
-                }
-            }else if(url.contains("360kuai.com") && result.contains("您访问的文章走失了")) {
-                title = String.valueOf("404");
-            }else if(result.contains("文章没有找到哦") && url.contains("yidianzixun.com")) {
-                title = "文章未找到";
-            }else if(url.contains("tznew.58.com/view")) {
-                try {
-                    JSONObject json = JSONObject.parseObject(result);
-                    title = json.getJSONObject("result").getString("title");
-                } catch (Exception e) {
-                    logger.error(" uc 数据 json 转换失败", e);
-                }
-            }
-            
-            //若title 为拿到 用 此方法
-            if(Objects.isNull(title) || title.length() < 1) {
-                title = doc.select("div.adiv > p > span").text().replaceAll(" ", "");
-            }
-            
-            //若title 为拿到 用 此方法
-            if(Objects.isNull(title) || title.length() < 1) {
-                title = doc.select("title").text().replaceAll(" ", "");
-            }
-            
-            //若title 为拿到 用 此方法
-            if(Objects.isNull(title) || title.length() < 1) {
-                title = doc.select("h1").text().replaceAll(" ", "");
-            }
-            
-            //若title 为拿到 用 此方法  无法获取标题不进行程序迷惑性判断
-//            if(Objects.isNull(title) || title.length() < 1 || result.length() < 200) {
-//                title = "网页已删除";
-//            }
-        
-            if(Objects.nonNull(title) && title.length() > 1){
-                return new UrlLiveBean(attr.getAttr().toString(), isDelete(title),title);
-            } else {
-                return null;
-            }
-        } catch (Exception e) {
-            return null;
-        }
-    }
-    
-    /**
-     * 
-     * @Description 标题判断
-     * @param title
-     * @return
-     */
-    private boolean isDelete(String title) {
-        List<String> eList = Arrays.asList("系统出错","该内容已被发布者删除","网页已删除"
-                ,"此帐号已自主注销，内容无法查看","页面提示","正在维护中"
-                ,"此文章被第三方评估为不实信息","财经头条","知识100题","502BadGateway"
-                ,"提示信息","跳转页","跳转中...","此帐号在冻结期,内容无法查看","东北新闻网"
-                ,"百度一下，你就知道","帐号已迁移","手机百度","内容被删除","亚博国际|首页"
-                ,"中国软件网","云广网","新浪首页","文章暂时找不到了","-法易网"
-                ,"【一点资讯】www.yidianzixun.com","错误页面","网站暂停通知","【快资讯】你的专属资讯平台"
-                ,"百度新闻——全球最大的中文新闻平台","以上文章由以下机构判定为不实信息","该公众号已迁移"
-                ,"财经网-CAIJING.COM.CN","蚂蚁资讯","参数错误","时尚头条_YOKA时尚网","该文章已经被删除"
-                ,"网易","链接已过期","找不到页面","今晚网","该文章已被删除", "该回答已被删除-知乎", "资源不存在","文章未找到"
-                , "UC头条");
-        
-        List<String> cList = Arrays.asList("提示信息-","此内容因违规无法查看","微信公众号不存在"
-                ,"此内容被投诉且经审核涉嫌侵权，无法查看","thepageyourequestedwasnotfound","未知错误"
-                ,"Objectmoved","404","页面没有找到","页面未找到","301MovedPermanently","加载异常",
-                "此帐号已被屏蔽, 内容无法查看","链接不存在", "新闻已删除");
-        
-        return cList.stream().anyMatch(title::contains) || eList.stream().anyMatch(title::equals);
-    }
-    
-    
-    /**
-     * 处理知乎链接
-     * 
-     * */
-    private static String treatZhihuUrl(String url) {
-    	if(url.contains("/answer/")) {
-    		url = "https://api.zhihu.com/answers/" + url.replaceAll(".*/answer/", "");
-    	}else if(url.contains("/question/") && !url.contains("/answer/")) {
-    		url = "https://api.zhihu.com/questions/" + url.replaceAll(".*/question/", "");
-    	}else if(url.contains("/p/")) {
-    		url = "https://api.zhihu.com/articles/" + url.replaceAll(".*/p/", "");
-    	}
-    	return url;
-    }
-    
-}
+package com.zhiwei.source_forward.crawler;
+
+import static java.util.Objects.nonNull;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+
+import com.alibaba.fastjson.JSONObject;
+import com.zhiwei.async.GroupSync;
+import com.zhiwei.crawler.core.HttpBoot;
+import com.zhiwei.crawler.core.proxy.ProxyHolder;
+import com.zhiwei.crawler.core.utils.RequestUtils;
+import com.zhiwei.source_forward.bean.UrlLiveBean;
+import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution;
+import com.zhiwei.source_forward.util.UrlLiveDataCallback;
+import com.zhiwei.tools.tools.ZhiWeiTools;
+
+import okhttp3.Request;
+
+/**
+ * 
+ * @ClassName UrlLiveCrawler
+ * @Description 判断页面是否存在
+ * @author byte-zbs
+ * @Date 2018年8月20日 下午3:34:57
+ * @version 1.0.0
+ */
+public class UrlLiveCrawler {
+
+    private static final Logger logger = LogManager.getLogger(UrlLiveCrawler.class);
+    private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).throwException(true).build();
+    
+    /**
+     * Submits the given URLs for a liveness check and returns the tracker
+     * created for this batch.
+     *
+     * @param callback receiver of the per-URL verdicts
+     * @param urls     URLs to check; a null or empty array submits nothing
+     * @return the {@code GroupSync} that was handed to the submission loop
+     */
+    public GroupSync submitTask(UrlLiveDataCallback callback, String... urls) {
+        final GroupSync tracker = new GroupSync();
+        this.start(tracker, callback, urls);
+        return tracker;
+    }
+    
+    private void start(GroupSync counter,UrlLiveDataCallback callback, String... urls) {
+        if (nonNull(urls) && urls.length > 0) {
+            for (String url : urls) {
+                counter.add();
+                ZhiWeiTools.sleep(100);
+                if (nonNull(url)) {
+                    try {
+//                        ZhiWeiTools.sleep(3000);
+                        search(counter, url, Attribution.of(url, 1), callback);
+                    } catch (Exception e) {
+                        logger.error("搜索创建出错:", e);
+                    }
+                }
+                counter.done();
+            }
+        }
+    }
+    
+    /**
+     * Normalises the URL, issues an asynchronous GET and reports the outcome
+     * through {@code callback}.
+     * <ul>
+     *   <li>successful response: the body is handed to {@code parseHtml};</li>
+     *   <li>HTTP 404: reported with flag {@code 1} and the status code as title;</li>
+     *   <li>any other status or a transport error: reported with flag {@code -1}.</li>
+     * </ul>
+     * {@code counter.add()} is called once per dispatched request and is always
+     * paired with exactly one {@code counter.done()}, including when dispatching
+     * itself throws.
+     *
+     * @param counter  tracker incremented for the in-flight request
+     * @param url      URL to check (rewritten by {@code dealUrl} / {@code treatZhihuUrl})
+     * @param attr     attribution passed through to the callback
+     * @param callback receiver of the verdict
+     * @return the same {@code counter} instance
+     */
+    private GroupSync search(GroupSync counter, String url,
+            Attribution attr, UrlLiveDataCallback callback) {
+        url = dealUrl(url);
+        logger.info("当前处理 URL: {}", url);
+        Map<String,String> headers = new HashMap<>();
+        ProxyHolder ph = null;
+        if (url.contains("toutiao.com")) {
+            ph = ProxyHolder.NAT_HEAVY_PROXY;
+        } else if (url.contains("zhihu.com")) {
+            // zhihu pages are checked through the API endpoints instead of the HTML page.
+            url = treatZhihuUrl(url);
+            ph = ProxyHolder.NAT_HEAVY_PROXY;
+        }
+        // True between counter.add() and the moment the completion handler is attached;
+        // while it is true this method still owes the counter a done().
+        boolean pending = false;
+        try {
+            Request request = RequestUtils.wrapGet(url, headers);
+            if (Objects.nonNull(request)) {
+                counter.add();
+                pending = true;
+                httpBoot.asyncCall(request, ph).whenComplete((rs, ex) -> {
+                    try {
+                        if (Objects.isNull(ex)) {
+                            if (rs.isSuccessful()) {
+                                parseHtml(rs.body().string(), attr, callback);
+                            } else if (rs.code() == 404) {
+                                callBack(callback, attr, 1, String.valueOf(rs.code()));
+                            } else {
+                                callBack(callback, attr, -1, "程序无法判断");
+                            }
+                            // NOTE(review): on the non-successful branches the response body is
+                            // never read or closed here - confirm whether HttpBoot releases it.
+                        } else {
+                            logger.error("e", ex);
+                            callBack(callback, attr, -1, "程序无法判断");
+                        }
+                    } catch (Exception e) {
+                        // Throwable goes last with no placeholder so the stack trace is logged.
+                        logger.error(" 数据是否删除 采集出错 ", e);
+                    } finally {
+                        counter.done();
+                    }
+                });
+                // From here on the completion handler owns the counter.done() call.
+                pending = false;
+            }
+        } catch (Exception e2) {
+            logger.error("数据出错, url: {}", url, e2);
+            if (pending) {
+                // Dispatch failed before the handler was attached: balance the add() above,
+                // otherwise the counter would stay unbalanced.
+                counter.done();
+            }
+        }
+        return counter;
+    }
+    
+    private void callBack(UrlLiveDataCallback callback,Attribution attr,int i,String title) {
+        UrlLiveBean ulb = null;
+        if(i == 1) {
+            ulb = new UrlLiveBean(attr.getAttr().toString(), true, title);
+        }else {
+            ulb = new UrlLiveBean(attr.getAttr().toString(), i, title);
+        }
+        if (callback == null) {
+            logger.warn("DataCallback 对象为 null，无法保存数据");
+        } else {
+            callback.onData(ulb, attr);
+        }
+    }
+    
+    private String dealUrl(String url) {
+        try {
+            if(url.contains("www.toutiao.com")) {
+                if(url.contains("www.toutiao.com")) {
+                    
+                }else {
+                    url = url.replace("toutiao.com", "www.toutiao.com");
+                }
+                if(url.contains("https")) {
+                    
+                }else {
+                    url = url.replace("http", "https");
+                }
+                if(url.contains("group")) {
+                    url = "https://www.toutiao.com/a" + url.split("/")[4] + "/";
+                }
+            }else if(url.contains("mp.weixin.qq.com")) {
+                if(url.contains("https")) {
+                
+                }else {
+                    url = url.replace("http", "https");
+                }
+            }else if(url.contains("a.mp.uc.cn") && url.contains("wm_aid")) {
+                url = "http://ff.dayu.com/contents/origin/"+url.split("wm_aid=")[1].split("!!wm_id")[0]+"?biz_id=1002&_fetch_author=1&_fetch_incrs=1";
+            }else if(url.contains("tznew.58.com/view") && url.contains("infoid=")) {
+                // https://tznew.58.com/view/c/sharingDetailNew?infoid=117073473
+                url = "https://tznew.58.com/tznew/c/info-detail?infoid=" + url.split("infoid=")[1].split("&")[0];
+            }
+            return url;
+        } catch (Exception e) {
+            return url;
+        }
+    }
+    
+    /**
+     * 
+     * @Description 判断是否删除
+     * @param html
+     * @param attr
+     * @param callback
+     */
+    private void parseHtml(String html, Attribution attr,
+            UrlLiveDataCallback callback) {
+        if (callback == null) {
+            logger.warn("DataCallback 对象为 null，无法保存数据");
+        } else {
+            UrlLiveBean ulb = matchDel(html, attr, attr.getAttr().toString());
+            if(Objects.nonNull(ulb)) {
+                callback.onData(ulb, attr);
+            }else {
+                callBack(callback, attr, -1, "程序无法判断");
+            }
+        }
+    }
+    
+    /***
+     * @Title: matchDel 
+     * @author hero 
+     * @Description: 验证链接是否有效
+     * @param @param page
+     * @param @return 设定文件 
+     * @return boolean 返回类型
+     */
+    public UrlLiveBean matchDel(String result,Attribution attr,String url){
+        try {
+            Document doc = Jsoup.parse(result);
+            String title = null;
+            if(url.contains("mp.weixin.qq.com") || url.contains("post.mp.qq.com") || url.contains("weixin.sogou.com")){
+            	title = doc.select("h2.rich_media_title").text().replaceAll(" ", "");
+            	if(Objects.isNull(title) || title.isEmpty()) {
+            	    title = doc.select("p.title").text();
+            	}
+            	if(Objects.isNull(title) || title.isEmpty()) {
+                    title = doc.select("h3.msg-title").text();
+                }
+            	if(Objects.isNull(title) || title.isEmpty()) {
+                    title = doc.select("div.global_error_msg.warn").text();
+                }
+            	if(Objects.isNull(title) || title.isEmpty()) {
+                    title = doc.select("p.tips").text();
+                }
+            	if(Objects.isNull(title) || title.isEmpty()) {
+                    title = doc.select("h2").text();
+                }
+            	// 获取title
+                Matcher ma5 = Pattern.compile("var msg_title = \'(.*)\'")
+                        .matcher(result);
+                if (ma5.find()) {
+                    title = ma5.group(1).replaceAll("  ", " ").trim();
+                }
+            }else if(url.contains("kuaibao")){
+                title = doc.select("p.title").text().replaceAll(" ", "");
+            }else if(url.contains("chinadaily.com.cn")){
+                title = doc.select("p.style1").text().replaceAll(" ", "");
+            }else if(url.contains("baidu.com") || url.contains("hao123.com")) {
+                title = doc.select("p#contaniner").text();
+            }else if(url.contains("kanfanews.com")) {
+                title = doc.select("p#tit").text();
+            }else if(url.contains("ifeng.com") && result.contains("url=http://www.ifeng.com/")) {
+                title = "网页已删除";
+            }else if(url.contains("a.mp.uc.cn")) {
+                try {
+                    JSONObject json = JSONObject.parseObject(result);
+                    title = json.getJSONObject("data").getString("title");
+                } catch (Exception e) {
+                    logger.error(" uc 数据 json 转换失败", e);
+                }
+            }else if(url.contains("huanqiu.com") && result.contains("www.huanqiu.com/404.html")) {
+                title = "网页已删除";
+            }else if(url.contains("zhihu.com")) {
+            	JSONObject resultJson = JSONObject.parseObject(result);
+            	if(url.contains("/answer/")) {
+            	    title = resultJson.getJSONObject("question").getString("title");
+                }else if(url.contains("/question/") && !url.contains("/answer/") || url.contains("/p/")) {
+                    title = resultJson.getString("title");
+                }
+            }else if(url.contains("360kuai.com") && result.contains("您访问的文章走失了")) {
+                title = String.valueOf("404");
+            }else if(result.contains("文章没有找到哦") && url.contains("yidianzixun.com")) {
+                title = "文章未找到";
+            }else if(url.contains("tznew.58.com/view")) {
+                try {
+                    JSONObject json = JSONObject.parseObject(result);
+                    title = json.getJSONObject("result").getString("title");
+                } catch (Exception e) {
+                    logger.error(" uc 数据 json 转换失败", e);
+                }
+            }
+            
+            //若title 为拿到 用 此方法
+            if(Objects.isNull(title) || title.length() < 1) {
+                title = doc.select("div.adiv > p > span").text().replaceAll(" ", "");
+            }
+            
+            //若title 为拿到 用 此方法
+            if(Objects.isNull(title) || title.length() < 1) {
+                title = doc.select("title").text().replaceAll(" ", "");
+            }
+            
+            //若title 为拿到 用 此方法
+            if(Objects.isNull(title) || title.length() < 1) {
+                title = doc.select("h1").text().replaceAll(" ", "");
+            }
+            
+            //若title 为拿到 用 此方法  无法获取标题不进行程序迷惑性判断
+//            if(Objects.isNull(title) || title.length() < 1 || result.length() < 200) {
+//                title = "网页已删除";
+//            }
+        
+            if(Objects.nonNull(title) && title.length() > 1){
+                return new UrlLiveBean(attr.getAttr().toString(), isDelete(title),title);
+            } else {
+                return null;
+            }
+        } catch (Exception e) {
+            return null;
+        }
+    }
+    
+    /**
+     * 
+     * @Description 标题判断
+     * @param title
+     * @return
+     */
+    private boolean isDelete(String title) {
+        List<String> eList = Arrays.asList("系统出错","该内容已被发布者删除","网页已删除"
+                ,"此帐号已自主注销，内容无法查看","页面提示","正在维护中"
+                ,"此文章被第三方评估为不实信息","财经头条","知识100题","502BadGateway"
+                ,"提示信息","跳转页","跳转中...","此帐号在冻结期,内容无法查看","东北新闻网"
+                ,"百度一下，你就知道","帐号已迁移","手机百度","内容被删除","亚博国际|首页"
+                ,"中国软件网","云广网","新浪首页","文章暂时找不到了","-法易网"
+                ,"【一点资讯】www.yidianzixun.com","错误页面","网站暂停通知","【快资讯】你的专属资讯平台"
+                ,"百度新闻——全球最大的中文新闻平台","以上文章由以下机构判定为不实信息","该公众号已迁移"
+                ,"财经网-CAIJING.COM.CN","蚂蚁资讯","参数错误","时尚头条_YOKA时尚网","该文章已经被删除"
+                ,"网易","链接已过期","找不到页面","今晚网","该文章已被删除", "该回答已被删除-知乎", "资源不存在","文章未找到"
+                , "UC头条");
+        
+        List<String> cList = Arrays.asList("提示信息-","此内容因违规无法查看","微信公众号不存在"
+                ,"此内容被投诉且经审核涉嫌侵权，无法查看","thepageyourequestedwasnotfound","未知错误"
+                ,"Objectmoved","404","页面没有找到","页面未找到","301MovedPermanently","加载异常",
+                "此帐号已被屏蔽, 内容无法查看","链接不存在", "新闻已删除");
+        
+        return cList.stream().anyMatch(title::contains) || eList.stream().anyMatch(title::equals);
+    }
+    
+    
+    /**
+     * Maps a zhihu page URL onto the matching api.zhihu.com endpoint.
+     * Answer links are checked first, then question links, then {@code /p/}
+     * article links; anything else is returned unchanged.
+     *
+     * @param url zhihu URL
+     * @return the API URL for answers, questions or articles, or {@code url} itself
+     */
+    private static String treatZhihuUrl(String url) {
+        if (url.contains("/answer/")) {
+            return "https://api.zhihu.com/answers/" + url.replaceAll(".*/answer/", "");
+        }
+        // Reaching here already means the URL holds no "/answer/" segment.
+        if (url.contains("/question/")) {
+            return "https://api.zhihu.com/questions/" + url.replaceAll(".*/question/", "");
+        }
+        if (url.contains("/p/")) {
+            return "https://api.zhihu.com/articles/" + url.replaceAll(".*/p/", "");
+        }
+        return url;
+    }
+    
+}
--- a/src/main/java/com/zhiwei/source_forward/util/MatchSource.java
+++ b/src/main/java/com/zhiwei/source_forward/util/MatchSource.java
@@ -325,7 +325,6 @@ public class MatchSource {
 					source = source.replaceAll(".*来源：|）", "");
 				}
 			}
-			
            if(Objects.nonNull(source) && source.length() != 0) {
            	return source;
            }