sourceforward 链接匹配修改

98e0d120 · yangchen · aa2a108b · 98e0d120 · 98e0d120 · 98e0d120
Commit 98e0d120 authored Sep 11, 2018 by yangchen
10 changed files
--- a/pom.xml
+++ b/pom.xml
@@ -24,12 +24,12 @@
        <dependency>
            <groupId>com.zhiwei.tools</groupId>
            <artifactId>zhiwei-tools</artifactId>
-            <version>0.0.2-SNAPSHOT</version>
+            <version>0.0.5-SNAPSHOT</version>
        </dependency>
        <dependency>
            <groupId>com.zhiwei.middleware</groupId>
            <artifactId>proxy-client</artifactId>
-            <version>0.0.1-RELEASE</version>
+            <version>0.0.2-RELEASE</version>
        </dependency>
    </dependencies>

@@ -89,13 +89,4 @@



-    <dependencyManagement>
-        <dependencies>
-            <dependency>
-                <groupId>com.squareup.okhttp3</groupId>
-                <artifactId>okhttp</artifactId>
-                <version>3.11.0</version>
-            </dependency>
-        </dependencies>
-    </dependencyManagement>
 </project>
\ No newline at end of file
--- a/src/main/java/com/zhiwei/source_forward/bean/UrlLiveBean.java
+++ b/src/main/java/com/zhiwei/source_forward/bean/UrlLiveBean.java
@@ -4,17 +4,27 @@ public class UrlLiveBean {
    
    private String url;
    
-    private boolean isLive;
+    private Integer isLive;
    
    public UrlLiveBean() {
        super();
    }

-    public UrlLiveBean(String url, boolean isLive) {
+    public UrlLiveBean(String url, Integer isLive) {
        super();
        this.url = url;
        this.isLive = isLive;
    }
+    
+    public UrlLiveBean(String url, boolean isLive) {
+        super();
+        this.url = url;
+        if(isLive) {
+            this.isLive = 1;      //已删除
+        }else {
+            this.isLive = 0;
+        }
+    }

    public String getUrl() {
        return url;
@@ -24,11 +34,11 @@ public class UrlLiveBean {
        this.url = url;
    }

-    public boolean isLive() {
+    public Integer isLive() {
        return isLive;
    }

-    public void setLive(boolean isLive) {
+    public void setLive(Integer isLive) {
        this.isLive = isLive;
    }

@@ -46,6 +56,8 @@ public class UrlLiveBean {
    public static class Attribution {
        private Object attr;
        
+        private Integer count;
+        
        /** 
         * Constructor
         * 
@@ -55,6 +67,17 @@ public class UrlLiveBean {
            this.attr = attr;
        }
        
+        /**
+         * 
+         * @Description 使用属性和计数创建 Attribution
+         * @param attr
+         * @param count
+         */
+        private Attribution(Object attr,Integer count){
+            this.attr = attr;
+            this.count = count;
+        }
+        
        /** 
         * 创建属性
         * 
@@ -66,13 +89,36 @@ public class UrlLiveBean {
        }
        
        /** 
+         * 创建属性
+         * 
+         * @param attr
+         * @return Attribution
+         */
+        public static Attribution of(Object attr,Integer count) {
+            return new Attribution(attr,count);
+        }
+        
+        /** 
         * 获取属性
         * 
         * @return Object
         */
-        public Object get() {
+        public Object getAttr() {
            return attr;
        }
+        
+        /** 
+         * 获取计数
+         * 
+         * @return Integer
+         */
+        public Integer getCount() {
+            return count;
+        }
+        
+        public void AddCount() {
+            count++;
+        }
    }
    
 }
--- a/src/main/java/com/zhiwei/source_forward/content/ContentExtractor.java
+++ b/src/main/java/com/zhiwei/source_forward/content/ContentExtractor.java
@@ -143,9 +143,9 @@ public class ContentExtractor {
                content = tag;
            }
        }
-        if (content == null) {
-            throw new Exception("extraction failed");
-        }
+//        if (content == null) {
+//            throw new Exception("extraction failed");
+//        }
        return content;
    }

@@ -164,17 +164,17 @@ public class ContentExtractor {
            news.setUrl(doc.baseUri());
        }

-        try {
-            news.setTime(getTime(contentElement));
-        } catch (Exception ex) {
-            LOG.info("news title extraction failed", ex);
-        }
-
-        try {
-            news.setTitle(getTitle(contentElement));
-        } catch (Exception ex) {
-            LOG.info("title extraction failed", ex);
-        }
+//        try {
+//            news.setTime(getTime(contentElement));
+//        } catch (Exception ex) {
+//            LOG.info("news title extraction failed", ex);
+//        }
+
+//        try {
+//            news.setTitle(getTitle(contentElement));
+//        } catch (Exception ex) {
+//            LOG.info("title extraction failed", ex);
+//        }
        return news;
    }


--- a/src/main/java/com/zhiwei/source_forward/crawler/ContentCrawler.java
+++ b/src/main/java/com/zhiwei/source_forward/crawler/ContentCrawler.java
@@ -3,23 +3,22 @@ package com.zhiwei.source_forward.crawler;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;

+import com.zhiwei.crawler.async.MultiThreadingCounter;
+import com.zhiwei.crawler.core.HttpBoot;
+import com.zhiwei.crawler.core.HttpRequestBuilder;
 import com.zhiwei.source_forward.bean.ContentBean;
 import com.zhiwei.source_forward.bean.ContentBean.Attribution;
 import com.zhiwei.source_forward.util.ContentDataCallback;
 import com.zhiwei.source_forward.util.MatchContent;
 import com.zhiwei.source_forward.util.ProxyClientUtil;
-import com.zhiwei.tools.httpclient.HttpBoot;
-import com.zhiwei.tools.httpclient.HttpRequestBuilder;
-import com.zhiwei.tools.httpclient.asyn.MultiThreadingCounter;

-import okhttp3.Headers;
 import okhttp3.Request;
 import okhttp3.Response;

 public class ContentCrawler {
-    
+
    private static Logger logger = LogManager.getLogger(ContentCrawler.class);
-    
+
    /**
     * 
     * @Description 链接传入 并 返回采集完信号
@@ -28,12 +27,13 @@ public class ContentCrawler {
     * @return
     * @throws Exception
     */
-    public MultiThreadingCounter submitTask(ContentDataCallback callback,String... urls) throws Exception {
+    public MultiThreadingCounter submitTask(ContentDataCallback callback,
+            String... urls) throws Exception {
        MultiThreadingCounter counter = new MultiThreadingCounter();
        start(counter, callback, urls);
        return counter;
    }
-    
+
    /**
     * 
     * @Description 提交链接
@@ -41,17 +41,15 @@ public class ContentCrawler {
     * @param callback
     * @param urls
     */
-    private void start(MultiThreadingCounter counter,ContentDataCallback callback, String... urls) {
+    private void start(MultiThreadingCounter counter,
+            ContentDataCallback callback, String... urls) {
        if (urls != null && urls.length > 0) {
            for (String url : urls) {
                if (url != null) {
                    try {
-                        counter.increase();
                        search(counter, url, Attribution.of(url), callback);
                    } catch (Exception e) {
                        logger.error("关键词 {} 搜索创建出错: {}", e.getMessage());
-                    } finally {
-                        counter.reduce();
                    }
                }
            }
@@ -67,7 +65,8 @@ public class ContentCrawler {
     * @param callback
     * @return
     */
-    private MultiThreadingCounter search(MultiThreadingCounter counter, String url,Attribution attr, ContentDataCallback callback) {
+    private MultiThreadingCounter search(MultiThreadingCounter counter,
+            String url, Attribution attr, ContentDataCallback callback) {
        logger.info("当前处理 URL: {}", url);
        Request request = HttpRequestBuilder.newGetRequest(url, null);
        counter.increase();
@@ -75,22 +74,23 @@ public class ContentCrawler {
            if (future.isSuccess()) {
                Response response = future.result();
                try {
-                    parseHtml(response, attr, callback);
+                     parseHtml(response, attr, callback);
                } catch (Exception e) {
                    logger.error("解析出错", e);
                }
            } else {
-                logger.info("{} 搜索结果访问失败: {}", request.url().url(), future.cause().getMessage());
+                logger.info("{} 搜索结果访问失败: {}", request.url().url(),future.cause().getMessage());
            }
+            
            counter.reduce();
        });
        return counter;
    }
-    
+
    /**
     * 
     * 
-     * @Description 获取正文解析 
+     * @Description 获取正文解析
     * @param response
     * @param attr
     * @param callback
@@ -99,14 +99,15 @@ public class ContentCrawler {
            ContentDataCallback callback) {
        String content = null;
        try {
-            if(response.isSuccessful()){
+            if (response.isSuccessful()) {
                String html = response.body().string();
-                content = MatchContent.matchContent(attr.get().toString(), html);
+                content = MatchContent.matchContent(attr.get().toString(),
+                        html);
            }
        } catch (Exception e) {
-            logger.info("网页链接失效",e.fillInStackTrace());
-        }finally {
-            if(response != null) {
+            logger.info("网页链接失效", e.fillInStackTrace());
+        } finally {
+            if (response != null) {
                response.close();
            }
        }
@@ -116,7 +117,7 @@ public class ContentCrawler {
        } else {
            callback.onData(cb, attr);
        }
-        
+
    }
-    
+
 }
--- a/src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
+++ b/src/main/java/com/zhiwei/source_forward/crawler/MediaSelfSourceCrawler.java
@@ -7,15 +7,15 @@ import org.apache.logging.log4j.Logger;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Node;

+import com.zhiwei.crawler.async.MultiThreadingCounter;
+import com.zhiwei.crawler.core.HttpBoot;
+import com.zhiwei.crawler.core.HttpRequestBuilder;
 import com.zhiwei.source_forward.bean.MediaSelfSourceBean;
 import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution;
 import com.zhiwei.source_forward.util.MatchChannel;
 import com.zhiwei.source_forward.util.MatchSource;
 import com.zhiwei.source_forward.util.MediaSelfSourceDataCallBack;
 import com.zhiwei.source_forward.util.ProxyClientUtil;
-import com.zhiwei.tools.httpclient.HttpBoot;
-import com.zhiwei.tools.httpclient.HttpRequestBuilder;
-import com.zhiwei.tools.httpclient.asyn.MultiThreadingCounter;

 import okhttp3.Request;
 import okhttp3.Response;

--- a/src/main/java/com/zhiwei/source_forward/crawler/SourceForwardCrawler.java
+++ b/src/main/java/com/zhiwei/source_forward/crawler/SourceForwardCrawler.java
@@ -8,6 +8,9 @@ import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Node;

+import com.zhiwei.crawler.async.MultiThreadingCounter;
+import com.zhiwei.crawler.core.HttpBoot;
+import com.zhiwei.crawler.core.HttpRequestBuilder;
 import com.zhiwei.source_forward.bean.SourceForwardBean;
 import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution;
 import com.zhiwei.source_forward.util.MatchChannel;
@@ -15,9 +18,6 @@ import com.zhiwei.source_forward.util.MatchSource;
 import com.zhiwei.source_forward.util.ProxyClientUtil;
 import com.zhiwei.source_forward.util.SourceData;
 import com.zhiwei.source_forward.util.SourceForwardDataCallBack;
-import com.zhiwei.tools.httpclient.HttpBoot;
-import com.zhiwei.tools.httpclient.HttpRequestBuilder;
-import com.zhiwei.tools.httpclient.asyn.MultiThreadingCounter;

 import okhttp3.Request;
 import okhttp3.Response;

--- a/src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
+++ b/src/main/java/com/zhiwei/source_forward/crawler/UrlLiveCrawler.java
 package com.zhiwei.source_forward.crawler;

-import java.io.IOException;
 import java.util.List;

 import org.apache.logging.log4j.LogManager;
@@ -9,13 +8,13 @@ import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Node;

+import com.zhiwei.crawler.async.MultiThreadingCounter;
+import com.zhiwei.crawler.core.HttpBoot;
+import com.zhiwei.crawler.core.HttpRequestBuilder;
 import com.zhiwei.source_forward.bean.UrlLiveBean;
 import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution;
 import com.zhiwei.source_forward.util.ProxyClientUtil;
 import com.zhiwei.source_forward.util.UrlLiveDataCallback;
-import com.zhiwei.tools.httpclient.HttpBoot;
-import com.zhiwei.tools.httpclient.HttpRequestBuilder;
-import com.zhiwei.tools.httpclient.asyn.MultiThreadingCounter;

 import okhttp3.Request;
 import okhttp3.Response;
@@ -43,12 +42,9 @@ public class UrlLiveCrawler {
            for (String url : urls) {
                if (url != null) {
                    try {
-                        counter.increase();
-                        search(counter, url, Attribution.of(url), callback);
+                        search(counter, url, Attribution.of(url,1), callback);
                    } catch (Exception e) {
                        logger.error("关键词 {} 搜索创建出错: {}", e.getMessage());
-                    } finally {
-                        counter.reduce();
                    }
                }
            }
@@ -57,6 +53,7 @@ public class UrlLiveCrawler {

    private MultiThreadingCounter search(MultiThreadingCounter counter, String url,
            Attribution attr, UrlLiveDataCallback callback) {
+        url = dealUrl(url);
        logger.info("当前处理 URL: {}", url);
        Request request = HttpRequestBuilder.newGetRequest(url, null);
        counter.increase();
@@ -64,36 +61,82 @@ public class UrlLiveCrawler {
            if (future.isSuccess()) {
                Response response = future.result();
                try {
-                    parseHtml(response, attr, callback);
+                    if(response.code() == 200) {
+                        parseHtml(response.body().string(), attr, callback);
+                    }else {
+                        callBack(callback, attr, 1);
+                    }
                } catch (Exception e) {
                    logger.error("解析出错", e);
+                }finally {
+                    if(response != null) {
+                        response.close();
+                    }
                }
            } else {
-                logger.info("{} 搜索结果访问失败: {}", request.url().url(), future.cause().getMessage());
+                if(attr.getCount() > 3) {
+                    callBack(callback, attr, -1);
+                    logger.info("{} 搜索结果访问失败: {}", request.url().url(), future.cause().getMessage());
+                }else {
+                    attr.AddCount();
+                    search(counter, attr.getAttr().toString(), attr, callback);
+                }
            }
            counter.reduce();
        });
        return counter;
    }
-
-    private void parseHtml(Response response, Attribution attr,
-            UrlLiveDataCallback callback) {
-        /***验证网页是否能够连通*/
-        boolean f = true;
-        if(!response.isSuccessful()){
+    
+    private void callBack(UrlLiveDataCallback callback,Attribution attr,int i) {
+        UrlLiveBean ulb = new UrlLiveBean(attr.getAttr().toString(), i);
+        if (callback == null) {
+            logger.warn("DataCallback 对象为 null，无法保存数据");
+        } else {
+            callback.onData(ulb, attr);
+        }
+    }
+    
+    private String dealUrl(String url) {
+        if(url.contains("toutiao.com")) {
            try {
-                f = matchDel(response.body().string(),attr.get().toString());
-            } catch (IOException e) {
-                logger.info("数据判断出错 {}",e.getMessage());
-            }finally {
-                if(response != null) {
-                    response.close();
+                if(url.contains("www.toutiao.com")) {
+                    
+                }else {
+                    url = url.replace("toutiao.com", "www.toutiao.com");
                }
+                if(url.contains("https")) {
+                    
+                }else {
+                    url = url.replace("http", "https");
+                }
+                if(url.contains("group")) {
+                    url = "https://www.toutiao.com/a" + url.split("/")[4] + "/";
+                }
+            } catch (Exception e) {
+                logger.info("url 解析出错  {}",url);
+                return url;
            }
-        }else{
-            f = false;
        }
-        UrlLiveBean ulb = new UrlLiveBean(attr.get().toString(), f);
+        return url;
+    }
+    
+    /**
+     * 
+     * @Description 判断是否删除
+     * @param html
+     * @param attr
+     * @param callback
+     */
+    private void parseHtml(String html, Attribution attr,
+            UrlLiveDataCallback callback) {
+        /***验证网页是否能够连通*/
+        boolean f = true;
+        try {
+            f = matchDel(html,attr.getAttr().toString());
+        } catch (Exception e) {
+            logger.info("数据判断出错 {}",e.getMessage());
+        }
+        UrlLiveBean ulb = new UrlLiveBean(attr.getAttr().toString(), f);
        if (callback == null) {
            logger.warn("DataCallback 对象为 null，无法保存数据");
        } else {
@@ -123,12 +166,6 @@ public class UrlLiveCrawler {
            return true;
        }
        step++;
-        if (rulerWeigui(doc))
-        {
-            logger.info("{}检测规则：第{}步",url,step);
-            return true;
-        }
-        step++;
        if (rulerTousu(doc))
        {
            logger.info("{}检测规则：第{}步",url,step);
@@ -158,6 +195,11 @@ public class UrlLiveCrawler {
            logger.info("{}检测规则：第{}步",url,step);
            return true;
        }
+        step++;//10
+        if(rulerWeigui(doc)) {
+            logger.info("{}检测规则：第{}步",url,step);
+            return true;
+        }
        step++;//11
        if (rulerYidian(doc))
        {
@@ -169,7 +211,7 @@ public class UrlLiveCrawler {
    
      /**
     * 
-     * @TODO(TODO 微信谣言的无效网址筛选规则)
+     * ( 微信谣言的无效网址筛选规则)
     * @author 陈炜涛
     * @param doc
     * @return
@@ -188,7 +230,7 @@ public class UrlLiveCrawler {

    /**
     * 
-     * @TODO(TODO 微信内容违规的无效网址筛选规则)
+     * ( 微信内容违规的无效网址筛选规则)
     * @author 陈炜涛
     * @param doc
     * @return
@@ -198,8 +240,7 @@ public class UrlLiveCrawler {
    private boolean rulerWeigui(Document doc)
    {
        boolean flg = false;
-        if ("此内容因违规无法查看".equals(doc.select(".text_area > p:nth-child(1)")
-                .text()))
+        if ((doc.select("p.title").text()).contains("此内容因违规无法查看"))
        {
            flg = true;
        }
@@ -208,7 +249,7 @@ public class UrlLiveCrawler {

    /**
     * 
-     * @TODO(TODO 微信内容违规的无效网址筛选规则)
+     * ( 微信内容违规的无效网址筛选规则)
     * @author 陈炜涛
     * @param doc
     * @return
@@ -227,7 +268,7 @@ public class UrlLiveCrawler {

    /**
     * 
-     * @TODO(TODO 环球的无效网址筛选规则)
+     * ( 环球的无效网址筛选规则)
     * @author 陈炜涛
     * @param doc
     * @return
@@ -246,7 +287,7 @@ public class UrlLiveCrawler {

    /**
     * 
-     * @TODO(TODO 空的无效网址筛选规则)
+     * ( 空的无效网址筛选规则)
     * @author 陈炜涛
     * @param doc
     * @return
@@ -267,7 +308,7 @@ public class UrlLiveCrawler {

    /**
     * 
-     * @TODO(TODO 内容不存在)
+     * ( 内容不存在)
     * @author 陈炜涛
     * @param doc
     * @return
@@ -286,7 +327,7 @@ public class UrlLiveCrawler {

    /**
     * 
-     * @TODO(TODO 招商网的无效网址筛选规则)
+     * ( 招商网的无效网址筛选规则)
     * @author 陈炜涛
     * @param doc
     * @return
@@ -315,7 +356,7 @@ public class UrlLiveCrawler {

    /**
     * 
-     * @TODO(TODO 一点资讯的无效网址筛选规则)
+     * ( 一点资讯的无效网址筛选规则)
     * @author 陈炜涛
     * @param doc
     * @return
@@ -334,7 +375,7 @@ public class UrlLiveCrawler {
        }
        catch (Exception e)
        {
-            // TODO: handle exception
+            // : handle exception
        }
        return flg;
    }
@@ -354,7 +395,7 @@ public class UrlLiveCrawler {
            for (Node node : nodeList) {
                if (node.outerHtml().contains("<title>")) {
                    String title = node.toString().split("<title>")[1].split("</title>")[0];
-                    if(title.contains("404")){
+                    if(title.contains("未知错误") || title.contains("Object moved") || title.contains("404") || title.contains("页面没有找到") || title.contains("页面未找到") || title.contains("301 Moved Permanently")){
                        return true;
                    }
                }

--- a/src/main/java/com/zhiwei/source_forward/run/ContentMatch.java
+++ b/src/main/java/com/zhiwei/source_forward/run/ContentMatch.java
@@ -55,14 +55,14 @@ public class ContentMatch {
        return dataList;
    }
    
-//    public static void main(String[] args) {
-//        List<String> urlList = new ArrayList<>();
-//        urlList.add("http://www.toutiao.com/a6571343464292680196/");
-//        List<ContentBean> l = getContentMatch(urlList);
-//        for(ContentBean cb : l) {
-//            System.out.println(cb.getContent());
-//        }
-//    }
+    public static void main(String[] args) {
+        List<String> urlList = new ArrayList<>();
+        urlList.add("https://mp.weixin.qq.com/s?src=11&timestamp=1535697915&ver=1093&signature=HNXpB8owyjfkyX-p2UDMga5R-qEpgjEpRQAjVmy7xqdrfsjZNdW0xa56dgCWMD9I*eo**yak46juxNEzryhKVLRT48DG0g9SUJSVrKSaPrhHEuJ1JOA86mSaY7TrHMMT&new=1");
+        List<ContentBean> l = getContentMatch(urlList);
+        for(ContentBean cb : l) {
+            System.out.println(cb.getContent());
+        }
+    }
    
    static class ContentMatchCrawlerThread extends Thread{


--- a/src/main/java/com/zhiwei/source_forward/run/URLLive.java
+++ b/src/main/java/com/zhiwei/source_forward/run/URLLive.java
@@ -33,27 +33,46 @@ public class URLLive {
        for(Entry<String,Map<String,Object>> entry : dataMap.entrySet()){
            urlList.add(entry.getKey());
        }
-        
+        System.out.println(urlList.size());
        //验证数据是否已删除
        List<UrlLiveBean> dataList = UrlLiveCrawlerThread.getUrlLiveCrawle(urlList);
        for(UrlLiveBean ub : dataList){
            String url = ub.getUrl();
-            boolean live = ub.isLive();
+            int i = ub.isLive();
            if(dataMap.containsKey(url)){
                Map<String,Object> map = dataMap.get(url);
-                map.put("是否删除", live);
+                if(i == 1) {
+                    map.put("是否删除", true);
+                }else if(i == 0) {
+                    map.put("是否删除", false);
+                }
                dataMap.put(url, map);
            }
        }
        return dataMap;
    }

+    /**
+     * 
+     * @Description 验证链接是否有效(是否已删除)
+     * @param urlList
+     * @return UrlLiveBean  1 已删除  0 未删除 -1 访问失败
+     */
 	public static List<UrlLiveBean> verificationURLLive(List<String> urlList){
        //启动验证链接是否有效程序程序
 	    List<UrlLiveBean> dataList = UrlLiveCrawlerThread.getUrlLiveCrawle(urlList);
 	    return dataList;
    }
 	
+	public static void main(String[] args) {
+	    List<String> urlList = new ArrayList<>();
+	    urlList.add("http://www.zyzpes.com/toutiao/5048828/20180419A1AFBC00.html");
+        List<UrlLiveBean> u = URLLive.verificationURLLive(urlList);
+        for(UrlLiveBean b : u) {
+            System.out.println(b.toString());
+        }
+    }
+	
 	static class UrlLiveCrawlerThread extends Thread{

      private static List<UrlLiveBean> getUrlLiveCrawle(List<String> urlList){

--- a/src/main/java/com/zhiwei/source_forward/util/MatchContent.java
+++ b/src/main/java/com/zhiwei/source_forward/util/MatchContent.java
@@ -38,7 +38,8 @@ public class MatchContent {
 		        content = matchContentWeixin(document);
 		    }else if(url.contains("toutiao.com")) {
 		        content = matchContentToutiao(html);
-		    }else {
+		    }
+		    if(content == null || content.length() < 10) {
 		        content = mathchContent(html, document);
 		    }
 			return ZhiWeiTools.delHTMLTag(content);
@@ -71,7 +72,21 @@ public class MatchContent {
 	 * @return
 	 */
 	private static String matchContentWeixin(Document document) {
-        return document.select("div.rich_media_content").text();
+	    try {
+    	    String content = document.select("div.rich_media_content").text();
+    	    if(document.toString().contains("<script id=\"content_tpl\"")) {
+    	        Pattern pa = Pattern.compile("\\<script id=\"content_tpl(.*?)\\</script\\>");
+    	        Matcher ma = pa.matcher(document.toString());
+    	        while(ma.find()) {
+    	            return ma.group(0).replaceAll("<script id=\"content_tpl\" type=\"text/html\">", "").replaceAll("</script>", "");
+    	        }
+    	        return content;
+    	    }
+    	    return content;
+	    } catch (Exception e) {
+	        e.printStackTrace();
+	        return "";
+	    }
    }