处理微信链接出现重复拼接问题

ab9c3fd4 · zhiwei · 1cbcc794 · ab9c3fd4
Commit ab9c3fd4 authored Aug 25, 2020 by zhiwei
Hide whitespace changes
Inline Side-by-side

Showing with 308 additions and 322 deletions

src/main/java/com/zhiwei/wechat/search/WechatAritcleSearch.java
+308 -322

No files found.
--- a/src/main/java/com/zhiwei/wechat/search/WechatAritcleSearch.java
+++ b/src/main/java/com/zhiwei/wechat/search/WechatAritcleSearch.java
@@ -7,6 +7,7 @@ import java.net.URLEncoder;
 import java.util.*;;
 import com.alibaba.fastjson.JSONObject;
 import com.zhiwei.crawler.core.proxy.ProxyHolder;
+import com.zhiwei.crawler.core.utils.URIUtils;
 import com.zhiwei.wechat.util.HtmlDownUtil;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.logging.log4j.LogManager;
@@ -20,347 +21,332 @@ import org.seimicrawler.xpath.JXDocument;
 import org.seimicrawler.xpath.JXNode;

 /**
+ * @author Bewilder Z
 * @ClassName: WechatAritcleSearch
 * @Description: TODO(在搜索接口根据关键词采集微信文章)
- * @author Bewilder Z
 * @date 2016年10月14日 上午9:40:18
 */
 public class WechatAritcleSearch {

-	private static Logger logger = LogManager.getLogger(WechatAritcleSearch.class);
+    private static Logger logger = LogManager.getLogger(WechatAritcleSearch.class);
+
+    /**
+     * 根据关键词在搜狗微信搜索微信文章,不包含全文
+     *
+     * @param word  关键词
+     * @param proxy 代理
+     * @param pages 需要限制返回页数的总页数（如返回前20页则传21）,如没有限制页数则传null
+     * @return List<WechatAricle> 返回类型
+     * @throws Exception
+     * @Title: wechatKeywordSearch
+     */
+    public static List<WechatAricle> wechatKeywordSearch(String word, Proxy proxy, Integer pages) throws Exception {
+        List<WechatAricle> result = new ArrayList<>();
+        Map<String, String> headerMap = HeaderTool.getCommonHead();
+        headerMap.put("Host", "weixin.sogou.com");
+        headerMap.put("cookie", "com_sohu_websearch_ITEM_PER_PAGE=100;");
+        boolean f = true;
+        int page = 1;
+
+        while (f) {
+            String searchUrl = "https://weixin.sogou.com/weixin?type=2&s_from=input&query=" + URLEncoder.encode(word, "UTF-8") + "&ie=utf8&_sug_=n&_sug_type_=&page=" + page;
+            headerMap.put("Referer", searchUrl);
+            // 获取数据
+            try {
+                String htmlBody = HtmlDownUtil.downloadHtml(searchUrl, headerMap, proxy);
+                // 解析数据
+                if (StringUtils.isNotBlank(htmlBody)) {
+                    JXDocument jxDocument = JXDocument.create(htmlBody);
+                    result.addAll(analysis(jxDocument));
+                    // 解析最大可寻页码
+                    String pageNext = jxDocument.selNOne("//a[@id='sogou_next']").asElement().text();
+                    if (pageNext.contains("下一页")) {
+                        page++;
+                    } else {
+                        f = false;
+                    }
+                } else {
+                    logger.info("根据关键词获取微信文章失败，返回的数据结果集: {}", htmlBody);
+                }
+                if (pages != null && pages == page) {
+                    break;
+                }
+            } catch (IOException e) {
+                logger.error("根据关键词获取微信文章失败，错误为: {}", e);
+            }
+
+        }
+        return result;
+    }
+
+
+    /**
+     * 根据关键词在搜狗微信搜索微信文章, 包含全文
+     * 注: 原注释中的 tsn(采集时间范围：1(1天内);2(一周内);3(一月内);4(一年内);
+     * 5(某一时间段内与startTime和endTime配合使用))、startTime/endTime(格式为yyyy-MM-dd)
+     * 以及 @throws ZhiWeiException 在当前方法签名中不存在(签名仅声明 throws Exception)
+     *
+     * @param word        关键词
+     * @param proxy       代理
+     * @param proxyHolder 代理持有者(方法体中未使用)
+     * @return List<WechatAricle> 返回类型
+     * @throws Exception
+     * @Title: wechatKeywordSearch
+     */
+    public static List<WechatAricle> wechatKeywordSearch(String word,
+                                                         Proxy proxy, ProxyHolder proxyHolder) throws Exception {
+        List<WechatAricle> result = new ArrayList<WechatAricle>();
+        Map<String, String> headerMap = HeaderTool.getCommonHead();
+        headerMap.put("Host", "weixin.sogou.com");
+        headerMap.put("cookie", "com_sohu_websearch_ITEM_PER_PAGE=100;");
+        boolean f = true;
+        int page = 1;
+        while (f) {
+            String searchUrl = "https://weixin.sogou.com/weixin?type=2&s_from=input&query=" + URLEncoder.encode(word, "UTF-8") + "&ie=utf8&_sug_=n&_sug_type_=&page=" + page;
+            headerMap.put("Referer", searchUrl);
+            // 获取数据
+            String htmlBody = HtmlDownUtil.downloadHtml(searchUrl, headerMap, proxy);
+            // 解析数据
+            if (StringUtils.isNotBlank(htmlBody)) {
+                JXDocument jxDocument = JXDocument.create(htmlBody);
+                result.addAll(analysis(jxDocument));
+                // 解析最大可寻页码
+                String pageNext = jxDocument.selNOne("//a[@id='sogou_next']").asElement().text();
+                if (pageNext.contains("下一页")) {
+                    page++;
+                } else {
+                    f = false;
+                }
+            } else {
+                logger.info("根据关键词获取微信文章失败，返回的数据结果集: {}", htmlBody);
+            }
+        }
+        return result;
+    }
+
+    /**
+     * 获取全文及来源
+     *
+     * @param url
+     * @param proxy
+     * @param wechatAricle
+     * @return
+     * @throws IOException
+     */
+    private static WechatAricle getWechatAricleInfo(String url, ProxyHolder proxy, WechatAricle wechatAricle) {
+        try {
+            String contentHtml = HtmlDownUtil.downloadHtml(url, HeaderTool.getCommonHead(), proxy.getProxy());
+            String content = null;
+            String time = null;
+            String source = null;
+            String biz = null;
+            String title = null;
+            String user_name = null;
+            String wxId = null;
+            List<String> imgUrls = null;
+            String rootSource = null;
+            if (contentHtml != null) {
+                JXDocument jxDocument = JXDocument.create(contentHtml);
+                title = jxDocument.selNOne("//h2[@id='activity-name']").asElement().text();
+                wxId = jxDocument.selNOne("//p[@class='profile_meta'][1]/span[@class='profile_meta_value']").asElement().text();

-	/**
-	 * 根据关键词在搜狗微信搜索微信文章,不包含全文
-	 * @Title: wechatKeywordSearch
-	 * @param 
-	 *            word 关键词
-	 * @param
-	 *            proxy 代理
-	 * @param
-	 * 			  pages  需要限制返回页数的总页数（如返回前20页则传21）,如没有限制页数则传null
-	 * @throws
-	 *            Exception
-	 * @return List<Wechat> 返回类型
-	 */
-	public static List<WechatAricle> wechatKeywordSearch(String word, Proxy proxy, Integer pages) throws Exception{
-		List<WechatAricle> result = new ArrayList<>();
-		Map<String, String> headerMap = HeaderTool.getCommonHead();
-		headerMap.put("Host", "weixin.sogou.com");
-		headerMap.put("cookie", "com_sohu_websearch_ITEM_PER_PAGE=100;");
-		boolean f = true;
-		int page = 1;
+                if (contentHtml.contains("js_content")) {
+                    content = jxDocument.selNOne("//div[@id='js_content']").asElement().text();
+                } else if (contentHtml.contains("js_share_content")) {
+                    content = jxDocument.selNOne("//div[@id='js_share_content']").asElement().text();
+                }
+                if (contentHtml.contains("content_tpl")) {
+                    String text = jxDocument.selNOne("//script[@id='content_tpl']").asElement().text();
+                    content = Jsoup.parse(text).text();
+                }
+                //解析文章图片地址
+                if (Objects.nonNull(jxDocument.selN("//div[@id='js_content']//img"))) {
+                    imgUrls = new ArrayList<>();
+                    List<JXNode> imgNodeList = jxDocument.selN("//div[@id='js_content']//img");
+                    for (JXNode imgNode : imgNodeList) {
+                        String imgUrl = imgNode.selOne("//img").asElement().attr("href");
+                        imgUrls.add(imgUrl);
+                    }
+                }
+                //解析来源
+                if (Objects.nonNull(jxDocument.selNOne("//span[@id='copyright_logo']"))) {
+                    rootSource = jxDocument.selNOne("//span[@id='profileBt']/a[@id='js_name']").asElement().text();
+                }

-		while (f) {
-		    String searchUrl = "https://weixin.sogou.com/weixin?type=2&s_from=input&query="+URLEncoder.encode(word, "UTF-8")+"&ie=utf8&_sug_=n&_sug_type_=&page="+ page;
-			headerMap.put("Referer", searchUrl);
-			// 获取数据
-			try{
-				String htmlBody = HtmlDownUtil.downloadHtml(searchUrl, headerMap, proxy);
-				// 解析数据
-				if (StringUtils.isNotBlank(htmlBody)) {
-					JXDocument jxDocument = JXDocument.create(htmlBody);
-					result.addAll(analysis(jxDocument));
-					// 解析最大可寻页码
-					String pageNext = jxDocument.selNOne("//a[@id='sogou_next']").asElement().text();
-					if (pageNext.contains("下一页")) {
-						page++;
-					} else {
-						f = false;
-					}
-				} else {
-					logger.info("根据关键词获取微信文章失败，返回的数据结果集: {}", htmlBody);
-				}
-				if(pages!=null && pages==page) {
-					break;
-				}
-			}catch (IOException e){
-				logger.error("根据关键词获取微信文章失败，错误为: {}", e);
-			}

-		}
-		return result;
-	}
-	
-	
-	
-	
+                if (contentHtml.contains("d.nick_name = ")) {
+                    time = contentHtml.split("d.ct = \"")[1].split("\";")[0];
+                    source = contentHtml.split("d.nick_name = \"")[1].split("\";")[0];
+                    biz = contentHtml.split("d.biz = \"")[1].split("\"")[0];
+                    user_name = contentHtml.split("d.user_name = \"")[1].split("\"")[0];
+                } else if (contentHtml.contains("var nickname = ")) {
+                    time = contentHtml.split("var ct = \"")[1].split("\";")[0];
+                    source = contentHtml.split("var nickname = \"")[1].split("\";")[0];
+                    biz = contentHtml.split("var appuin = \"\"||\"")[1].split("\"")[0];
+                    user_name = contentHtml.split("var user_name = \"")[1].split("\"")[0];
+                }
+            }
+            if (wechatAricle == null) {
+                wechatAricle = new WechatAricle();
+                wechatAricle.setTitle(title);
+                wechatAricle.setTime(new Date(Long.valueOf(time) * 1000));
+                wechatAricle.setSource(source);
+            }

-	/**
-	 * 
-	 * @Title: wechatKeywordSearch
-	 * @Description: 根据关键词在搜狗微信搜索微信文章,包含全文
-	 * @param @param
-	 *            word 关键词
-	 * @param @param
-	 *            tsn 采集时间范围：1(1天内);2(一周内);3(一月内);4(一年内);
-	 *            5(某一时间段内与startTime和endTime配合使用)
-	 * @param @param
-	 *            startTime 开始时间 格式为yyyy-MM-dd
-	 * @param @param
-	 *            endTime 结束时间 格式为yyyy-MM-dd
-	 * @param @return
-	 * @param @throws
-	 *            ZhiWeiException
-	 * @param @throws
-	 *            UnsupportedEncodingException 设定文件
-	 * @return List<Wechat> 返回类型
-	 */
-	public static List<WechatAricle> wechatKeywordSearch(String word,
-														 Proxy proxy, ProxyHolder proxyHolder) throws Exception {
-		List<WechatAricle> result = new ArrayList<WechatAricle>();
-		Map<String, String> headerMap = HeaderTool.getCommonHead();
-		headerMap.put("Host", "weixin.sogou.com");
-		headerMap.put("cookie", "com_sohu_websearch_ITEM_PER_PAGE=100;");
-		boolean f = true;
-		int page = 1;
-		while (f) {
-			String searchUrl = "https://weixin.sogou.com/weixin?type=2&s_from=input&query="+URLEncoder.encode(word, "UTF-8")+"&ie=utf8&_sug_=n&_sug_type_=&page="+ page;
-			headerMap.put("Referer", searchUrl);
-			// 获取数据
-			String htmlBody = HtmlDownUtil.downloadHtml(searchUrl, headerMap, proxy);
-			// 解析数据
-			if (StringUtils.isNotBlank(htmlBody)) {
-				JXDocument jxDocument = JXDocument.create(htmlBody);
-				result.addAll(analysis(jxDocument));
-				// 解析最大可寻页码
-				String pageNext = jxDocument.selNOne("//a[@id='sogou_next']").asElement().text();
-				if (pageNext.contains("下一页")) {
-					page++;
-				} else {
-					f = false;
-				}
-			} else {
-				logger.info("根据关键词获取微信文章失败，返回的数据结果集: {}", htmlBody);
-			}
-		}
-		return result;
-	}
+            wechatAricle.setImgUrls(imgUrls);
+            wechatAricle.setRootSource(rootSource);
+            wechatAricle.setBiz(biz);
+            wechatAricle.setContent(content);
+            wechatAricle.setWxId(wxId);
+            wechatAricle.setUser_name(user_name);

-	/**
-	 * 获取全文及来源
-	 * @param url
-	 * @param proxy
-	 * @param wechatAricle
-	 * @return
-	 * @throws IOException
-	 */
-	private static WechatAricle getWechatAricleInfo(String url, ProxyHolder proxy,WechatAricle wechatAricle){
-		try {
-			String contentHtml = HtmlDownUtil.downloadHtml(url, HeaderTool.getCommonHead(), proxy.getProxy());
-			String content = null;
-			String time = null;
-			String source = null;
-			String biz = null;
-			String title = null;
-			String user_name = null;
-			String wxId = null;
-			List<String> imgUrls = null;
-			String rootSource = null;
-			if(contentHtml!=null){
-				JXDocument jxDocument = JXDocument.create(contentHtml);
-				title = jxDocument.selNOne("//h2[@id='activity-name']").asElement().text();
-				wxId = jxDocument.selNOne("//p[@class='profile_meta'][1]/span[@class='profile_meta_value']").asElement().text();
+        } catch (Exception e) {
+            e.printStackTrace();
+            return wechatAricle;
+        }
+        return wechatAricle;
+    }

-				if(contentHtml.contains("js_content")){
-					content = jxDocument.selNOne("//div[@id='js_content']").asElement().text();
-				}else if(contentHtml.contains("js_share_content")){
-					content = jxDocument.selNOne("//div[@id='js_share_content']").asElement().text();
-				}
-				if(contentHtml.contains("content_tpl")){
-					String text = jxDocument.selNOne("//script[@id='content_tpl']").asElement().text();
-					content = Jsoup.parse(text).text();
-				}
-				//解析文章图片地址
-				if(Objects.nonNull(jxDocument.selN("//div[@id='js_content']//img"))){
-					imgUrls = new ArrayList<>();
-					List<JXNode> imgNodeList = jxDocument.selN("//div[@id='js_content']//img");
-					for(JXNode imgNode : imgNodeList){
-						String imgUrl = imgNode.selOne("//img").asElement().attr("href");
-						imgUrls.add(imgUrl);
-					}
-				}
-				//解析来源
-				if(Objects.nonNull(jxDocument.selNOne("//span[@id='copyright_logo']"))){
-					rootSource = jxDocument.selNOne("//span[@id='profileBt']/a[@id='js_name']").asElement().text();
-				}

+    /**
+     * 根据关键词采集指定时间+账号的数据
+     *
+     * @param word
+     * @param idOrName
+     * @param startTime
+     * @param endTime
+     * @param proxyHolder
+     * @return
+     * @throws Exception
+     * @throws UnsupportedEncodingException
+     */
+    public static List<WechatAricle> wechatKeywordSearchByAccount(String word, String idOrName, String startTime, String endTime,
+                                                                  ProxyHolder proxyHolder) throws Exception, UnsupportedEncodingException {
+        List<WechatAricle> result = new ArrayList<WechatAricle>();
+        Map<String, String> headerMap = HeaderTool.getCommonHead();
+        headerMap.put("Host", "weixin.sogou.com");
+        if (idOrName == null || idOrName.equals("")) {
+            throw new IllegalArgumentException("要检索的昵称或id不能为空");
+        }
+        String openId = getOpenId(idOrName, proxyHolder);
+        boolean f = false;
+        if (openId != null) {
+            f = true;
+        }
+        int page = 1;

-				if(contentHtml.contains("d.nick_name = ")){
-					time = contentHtml.split("d.ct = \"")[1].split("\";")[0];
-					source = contentHtml.split("d.nick_name = \"")[1].split("\";")[0];
-					biz = contentHtml.split("d.biz = \"")[1].split("\"")[0];
-					user_name = contentHtml.split("d.user_name = \"")[1].split("\"")[0];
-				}else if(contentHtml.contains("var nickname = ")){
-					time = contentHtml.split("var ct = \"")[1].split("\";")[0];
-					source = contentHtml.split("var nickname = \"")[1].split("\";")[0];
-					biz = contentHtml.split("var appuin = \"\"||\"")[1].split("\"")[0];
-					user_name = contentHtml.split("var user_name = \"")[1].split("\"")[0];
-				}
-			}
-			if(wechatAricle == null) {
-				wechatAricle = new WechatAricle();
-				wechatAricle.setTitle(title);
-				wechatAricle.setTime(new Date(Long.valueOf(time)*1000));
-				wechatAricle.setSource(source);
-			}
+        while (f) {
+            String searchUrl = "https://weixin.sogou.com/weixin?type=2&ie=utf8&query=" + URLEncoder.encode(word, "UTF-8")
+                    + "&tsn=5&ft=" + startTime + "&et=" + endTime + "&interation=&page=" + page + "&wxid=" + openId
+                    + "&usip=" + URLEncoder.encode(idOrName, "UTF-8");

-			wechatAricle.setImgUrls(imgUrls);
-			wechatAricle.setRootSource(rootSource);
-			wechatAricle.setBiz(biz);
-			wechatAricle.setContent(content);
-			wechatAricle.setWxId(wxId);
-			wechatAricle.setUser_name(user_name);
-			
-		} catch (Exception e) {
-			e.printStackTrace();
-			return wechatAricle;
-		}
-		return wechatAricle;
-	}
-	
-	
-	
-	/**
-	 * 根据关键词采集指定时间+账号的数据
-	 * @param word
-	 * @param idOrName
-	 * @param startTime
-	 * @param endTime
-	 * @param proxyHolder
-	 * @return
-	 * @throws Exception
-	 * @throws UnsupportedEncodingException
-	 */
-	public static List<WechatAricle> wechatKeywordSearchByAccount(String word, String idOrName, String startTime, String endTime,
-			 ProxyHolder proxyHolder) throws Exception, UnsupportedEncodingException {
-		List<WechatAricle> result = new ArrayList<WechatAricle>();
-		Map<String, String> headerMap = HeaderTool.getCommonHead();
-		headerMap.put("Host", "weixin.sogou.com");
-		if(idOrName==null || idOrName.equals("")){
-			throw new IllegalArgumentException("要检索的昵称或id不能为空");
-		}
-		String openId = getOpenId(idOrName, proxyHolder);
-		boolean f = false;
-		if(openId!=null){
-			f = true;
-		}
-		int page = 1;
+            headerMap.put("Referer", searchUrl);
+            // 获取数据
+            String htmlBody = HtmlDownUtil.downloadHtml(searchUrl, headerMap, proxyHolder);
+            if (StringUtils.isNotBlank(htmlBody)) {
+                JXDocument jxDocument = JXDocument.create(htmlBody);
+                result.addAll(analysis(jxDocument));
+                // 解析最大可寻页码
+                String pageNext = jxDocument.selNOne("//a[@id='sogou_next']").asElement().text();
+                if (pageNext.contains("下一页")) {
+                    page++;
+                } else {
+                    f = false;
+                }
+            } else {
+                logger.info("根据关键词获取微信文章失败，返回的数据结果集: {}", htmlBody);
+            }
+        }
+        return result;
+    }

-		while (f) {
-			String searchUrl = "https://weixin.sogou.com/weixin?type=2&ie=utf8&query=" + URLEncoder.encode(word, "UTF-8")
-			+ "&tsn=5&ft=" + startTime + "&et=" + endTime +"&interation=&page=" + page+"&wxid="+ openId 
-			+"&usip=" + URLEncoder.encode(idOrName, "UTF-8");
-			
-			headerMap.put("Referer", searchUrl);
-			// 获取数据
-			String htmlBody = HtmlDownUtil.downloadHtml(searchUrl, headerMap, proxyHolder);
-			if (StringUtils.isNotBlank(htmlBody)) {
-				JXDocument jxDocument = JXDocument.create(htmlBody);
-				result.addAll(analysis(jxDocument));
-				// 解析最大可寻页码
-				String pageNext = jxDocument.selNOne("//a[@id='sogou_next']").asElement().text();
-				if (pageNext.contains("下一页")) {
-					page++;
-				} else {
-					f = false;
-				}
-			} else {
-				logger.info("根据关键词获取微信文章失败，返回的数据结果集: {}", htmlBody);
-			}
-		}
-		return result;
-	}

+    /**
+     * 解析数据
+     *
+     * @param jxDocument
+     * @return
+     */
+    private static List<WechatAricle> analysis(JXDocument jxDocument) {
+        List<WechatAricle> result = new ArrayList<WechatAricle>();
+        // 解析数据
+        try {
+            // 解析数据
+            List<JXNode> jxNodeList = jxDocument.selN("//div[@class='news-box']/ul[@class='news-list']/li");
+            String title = null;
+            String link = null;
+            String content = null;
+            String source = null;
+            String openid = null;
+            String putDate = null;
+            Date date = null;
+            WechatAricle wechat = null;
+            if (Objects.nonNull(jxNodeList) && !jxNodeList.isEmpty()) {
+                for (JXNode jxNode : jxNodeList) {
+                    try {
+                        title = jxNode.selOne("//div[@class='txt-box']/h3/a").asElement().text();
+                        link = jxNode.selOne("//div[@class='txt-box']/h3/a").asElement().attr("href");
+                        link = URIUtils.resolve("https://weixin.sogou.com", link);
+                        if (Objects.nonNull(jxNode.selOne("//div[@class='txt-box']/p"))) {
+                            content = jxNode.selOne("//div[@class='txt-box']/p").asElement().text();
+                        }
+                        source = jxNode.selOne("//div[@class='s-p']/a").asElement().text();
+                        openid = jxNode.selOne("//div[@class='s-p']/a").asElement().attr("i");
+                        putDate = jxNode.selOne("//div[@class='s-p']").asElement().attr("t");
+                        date = new Date(Long.valueOf(putDate) * 1000);
+                        int readNum = 0;
+                        try {
+                            readNum = Integer.valueOf(jxNode.selOne("//div[@class='s-p']/span[@class='s1']").asElement().text().trim());
+                        } catch (Exception e) {
+                            readNum = 0;
+                        }
+                        title = ZhiWeiTools.SBC2DBC(title);
+                        content = ZhiWeiTools.SBC2DBC(content);
+                        if (StringUtils.isNotBlank(title)) {
+                            wechat = new WechatAricle(link, title, source, content, date, null, null, readNum, 0, openid, "unknow");
+                            result.add(wechat);
+                        }
+                    } catch (Exception e) {
+                        logger.debug("解析数据出现错误:{}", e.getMessage());
+                        continue;
+                    }
+                }

-	/**
-	 * 解析数据
-	 * @param jxDocument
-	 * @return
-	 */
-	private static List<WechatAricle> analysis(JXDocument jxDocument){
-		List<WechatAricle> result = new ArrayList<WechatAricle>();
-		// 解析数据
-		try {
-			// 解析数据
-			List<JXNode> jxNodeList = jxDocument.selN("//div[@class='news-box']/ul[@class='news-list']/li");
-			String title = null;
-			String link = null;
-			String content = null;
-			String source = null;
-			String openid = null;
-			String putDate = null;
-			Date date = null;
-			WechatAricle wechat = null;
-			if(Objects.nonNull(jxNodeList) && !jxNodeList.isEmpty()){
-				for (JXNode jxNode : jxNodeList) {
-					try {
-						title = jxNode.selOne("//div[@class='txt-box']/h3/a").asElement().text();
-						link = jxNode.selOne("//div[@class='txt-box']/h3/a").asElement().attr("href");
-						if(!link.contains("weixin.sogou.com")){
-							link = "https://weixin.sogou.com" + link;
-						}
-						if (Objects.nonNull(jxNode.selOne("//div[@class='txt-box']/p"))) {
-							content = jxNode.selOne("//div[@class='txt-box']/p").asElement().text();
-						}
-						source = jxNode.selOne("//div[@class='s-p']/a").asElement().text();
-						openid = jxNode.selOne("//div[@class='s-p']/a").asElement().attr("i");
-						putDate = jxNode.selOne("//div[@class='s-p']").asElement().attr("t");
-						date = new Date(Long.valueOf(putDate) * 1000);
-						int readNum = 0;
-						try {
-							readNum = Integer.valueOf(jxNode.selOne("//div[@class='s-p']/span[@class='s1']").asElement().text().trim());
-						} catch (Exception e) {
-							readNum = 0;
-						}
-						title = ZhiWeiTools.SBC2DBC(title);
-						content = ZhiWeiTools.SBC2DBC(content);
-						if(StringUtils.isNotBlank(title)){
-							wechat = new WechatAricle(link, title, source, content, date, null, null,readNum, 0, openid, "unknow");
-							result.add(wechat);
-						}
-					} catch (Exception e) {
-						logger.debug("解析数据出现错误:{}", e.getMessage());
-						continue;
-					}
-				}
+            }
+            // logger.info("数据总页数为:{}", page);
+        } catch (Exception e) {
+            logger.debug("获取数据出现问题:{}", e.getMessage());
+            return null;
+        }
+        return result;
+    }

-			}
-			// logger.info("数据总页数为:{}", page);
-		} catch (Exception e) {
-			logger.debug("获取数据出现问题:{}", e.getMessage());
-			return null;
-		}
-		return result;
-	}
+    /**
+     * @param idOrName    要检索的微信昵称或id
+     * @param proxyHolder 代理持有者, 传给HtmlDownUtil.downloadHtml
+     * @return String 返回类型(接口返回的openid, 获取失败时为null)
+     * @Title: getOpenId
+     * @Description: 获取微信wxID
+     */
+    public static String getOpenId(String idOrName, ProxyHolder proxyHolder) {
+        String openId = null;
+        String url = "https://weixin.sogou.com/weixin?zhnss=1&type=1&ie=utf8&query=" + URLCodeUtil.getURLEncode(idOrName, "utf-8");
+        String htmlBody;
+        for (int i = 1; i < 3; i++) {

-	/**
-	 * @Title: getOpenId
-	 * @Description: 获取微信wxID
-	 * @param @param
-	 *            wxId
-	 * @param @return
-	 *            设定文件
-	 * @return String 返回类型
-	 */
-	public static String getOpenId(String idOrName, ProxyHolder proxyHolder) {
-		String openId = null;
-		String url = "https://weixin.sogou.com/weixin?zhnss=1&type=1&ie=utf8&query=" + URLCodeUtil.getURLEncode(idOrName, "utf-8");
-		String htmlBody;
-		for(int i = 1;i < 3;i++) {
-		    
-		try {
-			htmlBody = HtmlDownUtil.downloadHtml(url,null, proxyHolder);
-			if (StringUtils.isNotBlank(htmlBody)) {
-				JSONObject jsonObject = JSONObject.parseObject(htmlBody);
-				openId = jsonObject.getString("openid");
-				return openId;
-			}
-		} catch (Exception e) {
-		    e.printStackTrace();
-			openId = null;
-		}
-		}
-		return openId;
-	}
+            try {
+                htmlBody = HtmlDownUtil.downloadHtml(url, null, proxyHolder);
+                if (StringUtils.isNotBlank(htmlBody)) {
+                    JSONObject jsonObject = JSONObject.parseObject(htmlBody);
+                    openId = jsonObject.getString("openid");
+                    return openId;
+                }
+            } catch (Exception e) {
+                e.printStackTrace();
+                openId = null;
+            }
+        }
+        return openId;
+    }

 }