处理微信链接出现重复拼接问题

ab9c3fd4 · zhiwei · 1cbcc794 · ab9c3fd4
Commit ab9c3fd4 authored Aug 25, 2020 by zhiwei
Show whitespace changes
Inline Side-by-side

Showing with 51 additions and 65 deletions

src/main/java/com/zhiwei/wechat/search/WechatAritcleSearch.java
+51 -65

No files found.
--- a/src/main/java/com/zhiwei/wechat/search/WechatAritcleSearch.java
+++ b/src/main/java/com/zhiwei/wechat/search/WechatAritcleSearch.java
@@ -7,6 +7,7 @@ import java.net.URLEncoder;
 import java.util.*;;
 import com.alibaba.fastjson.JSONObject;
 import com.zhiwei.crawler.core.proxy.ProxyHolder;
+import com.zhiwei.crawler.core.utils.URIUtils;
 import com.zhiwei.wechat.util.HtmlDownUtil;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.logging.log4j.LogManager;
@@ -20,9 +21,9 @@ import org.seimicrawler.xpath.JXDocument;
 import org.seimicrawler.xpath.JXNode;

 /**
+ * @author Bewilder Z
 * @ClassName: WechatAritcleSearch
 * @Description: TODO(在搜索接口根据关键词采集微信文章)
- * @author Bewilder Z
 * @date 2016年10月14日 上午9:40:18
 */
 public class WechatAritcleSearch {
@@ -31,18 +32,15 @@ public class WechatAritcleSearch {

    /**
     * 根据关键词在搜狗微信搜索微信文章,不包含全文
-	 * @Title: wechatKeywordSearch
-	 * @param 
-	 *            word 关键词
-	 * @param
-	 *            proxy 代理
-	 * @param
-	 * 			  pages  需要限制返回页数的总页数（如返回前20页则传21）,如没有限制页数则传null
-	 * @throws
-	 *            Exception
+     *
+     * @param word  关键词
+     * @param proxy 代理
+     * @param pages 需要限制返回页数的总页数（如返回前20页则传21）,如没有限制页数则传null
     * @return List<Wechat> 返回类型
+     * @throws Exception
+     * @Title: wechatKeywordSearch
     */
-	public static List<WechatAricle> wechatKeywordSearch(String word, Proxy proxy, Integer pages) throws Exception{
+    public static List<WechatAricle> wechatKeywordSearch(String word, Proxy proxy, Integer pages) throws Exception {
        List<WechatAricle> result = new ArrayList<>();
        Map<String, String> headerMap = HeaderTool.getCommonHead();
        headerMap.put("Host", "weixin.sogou.com");
@@ -51,10 +49,10 @@ public class WechatAritcleSearch {
        int page = 1;

        while (f) {
-		    String searchUrl = "https://weixin.sogou.com/weixin?type=2&s_from=input&query="+URLEncoder.encode(word, "UTF-8")+"&ie=utf8&_sug_=n&_sug_type_=&page="+ page;
+            String searchUrl = "https://weixin.sogou.com/weixin?type=2&s_from=input&query=" + URLEncoder.encode(word, "UTF-8") + "&ie=utf8&_sug_=n&_sug_type_=&page=" + page;
            headerMap.put("Referer", searchUrl);
            // 获取数据
-			try{
+            try {
                String htmlBody = HtmlDownUtil.downloadHtml(searchUrl, headerMap, proxy);
                // 解析数据
                if (StringUtils.isNotBlank(htmlBody)) {
@@ -70,10 +68,10 @@ public class WechatAritcleSearch {
                } else {
                    logger.info("根据关键词获取微信文章失败，返回的数据结果集: {}", htmlBody);
                }
-				if(pages!=null && pages==page) {
+                if (pages != null && pages == page) {
                    break;
                }
-			}catch (IOException e){
+            } catch (IOException e) {
                logger.error("根据关键词获取微信文章失败，错误为: {}", e);
            }

@@ -82,28 +80,18 @@ public class WechatAritcleSearch {
    }


-	
-	
-
    /**
-	 * 
-	 * @Title: wechatKeywordSearch
-	 * @Description: 根据关键词在搜狗微信搜索微信文章,包含全文
-	 * @param @param
-	 *            word 关键词
-	 * @param @param
-	 *            tsn 采集时间范围：1(1天内);2(一周内);3(一月内);4(一年内);
+     * @param word 关键词
+     * @param tsn  采集时间范围：1(1天内);2(一周内);3(一月内);4(一年内);
     *                5(某一时间段内与startTime和endTime配合使用)
-	 * @param @param
-	 *            startTime 开始时间 格式为yyyy-MM-dd
-	 * @param @param
-	 *            endTime 结束时间 格式为yyyy-MM-dd
+     * @param startTime 开始时间 格式为yyyy-MM-dd
+     * @param endTime   结束时间 格式为yyyy-MM-dd
     * @param @return
-	 * @param @throws
-	 *            ZhiWeiException
-	 * @param @throws
-	 *            UnsupportedEncodingException 设定文件
+     * @throws ZhiWeiException
+     * @throws UnsupportedEncodingException 设定文件
     * @return List<Wechat> 返回类型
+     * @Title: wechatKeywordSearch
+     * @Description: 根据关键词在搜狗微信搜索微信文章, 包含全文
     */
    public static List<WechatAricle> wechatKeywordSearch(String word,
                                                         Proxy proxy, ProxyHolder proxyHolder) throws Exception {
@@ -114,7 +102,7 @@ public class WechatAritcleSearch {
        boolean f = true;
        int page = 1;
        while (f) {
-			String searchUrl = "https://weixin.sogou.com/weixin?type=2&s_from=input&query="+URLEncoder.encode(word, "UTF-8")+"&ie=utf8&_sug_=n&_sug_type_=&page="+ page;
+            String searchUrl = "https://weixin.sogou.com/weixin?type=2&s_from=input&query=" + URLEncoder.encode(word, "UTF-8") + "&ie=utf8&_sug_=n&_sug_type_=&page=" + page;
            headerMap.put("Referer", searchUrl);
            // 获取数据
            String htmlBody = HtmlDownUtil.downloadHtml(searchUrl, headerMap, proxy);
@@ -138,13 +126,14 @@ public class WechatAritcleSearch {

    /**
     * 获取全文及来源
+     *
     * @param url
     * @param proxy
     * @param wechatAricle
     * @return
     * @throws IOException
     */
-	private static WechatAricle getWechatAricleInfo(String url, ProxyHolder proxy,WechatAricle wechatAricle){
+    private static WechatAricle getWechatAricleInfo(String url, ProxyHolder proxy, WechatAricle wechatAricle) {
        try {
            String contentHtml = HtmlDownUtil.downloadHtml(url, HeaderTool.getCommonHead(), proxy.getProxy());
            String content = null;
@@ -156,51 +145,51 @@ public class WechatAritcleSearch {
            String wxId = null;
            List<String> imgUrls = null;
            String rootSource = null;
-			if(contentHtml!=null){
+            if (contentHtml != null) {
                JXDocument jxDocument = JXDocument.create(contentHtml);
                title = jxDocument.selNOne("//h2[@id='activity-name']").asElement().text();
                wxId = jxDocument.selNOne("//p[@class='profile_meta'][1]/span[@class='profile_meta_value']").asElement().text();

-				if(contentHtml.contains("js_content")){
+                if (contentHtml.contains("js_content")) {
                    content = jxDocument.selNOne("//div[@id='js_content']").asElement().text();
-				}else if(contentHtml.contains("js_share_content")){
+                } else if (contentHtml.contains("js_share_content")) {
                    content = jxDocument.selNOne("//div[@id='js_share_content']").asElement().text();
                }
-				if(contentHtml.contains("content_tpl")){
+                if (contentHtml.contains("content_tpl")) {
                    String text = jxDocument.selNOne("//script[@id='content_tpl']").asElement().text();
                    content = Jsoup.parse(text).text();
                }
                //解析文章图片地址
-				if(Objects.nonNull(jxDocument.selN("//div[@id='js_content']//img"))){
+                if (Objects.nonNull(jxDocument.selN("//div[@id='js_content']//img"))) {
                    imgUrls = new ArrayList<>();
                    List<JXNode> imgNodeList = jxDocument.selN("//div[@id='js_content']//img");
-					for(JXNode imgNode : imgNodeList){
+                    for (JXNode imgNode : imgNodeList) {
                        String imgUrl = imgNode.selOne("//img").asElement().attr("href");
                        imgUrls.add(imgUrl);
                    }
                }
                //解析来源
-				if(Objects.nonNull(jxDocument.selNOne("//span[@id='copyright_logo']"))){
+                if (Objects.nonNull(jxDocument.selNOne("//span[@id='copyright_logo']"))) {
                    rootSource = jxDocument.selNOne("//span[@id='profileBt']/a[@id='js_name']").asElement().text();
                }


-				if(contentHtml.contains("d.nick_name = ")){
+                if (contentHtml.contains("d.nick_name = ")) {
                    time = contentHtml.split("d.ct = \"")[1].split("\";")[0];
                    source = contentHtml.split("d.nick_name = \"")[1].split("\";")[0];
                    biz = contentHtml.split("d.biz = \"")[1].split("\"")[0];
                    user_name = contentHtml.split("d.user_name = \"")[1].split("\"")[0];
-				}else if(contentHtml.contains("var nickname = ")){
+                } else if (contentHtml.contains("var nickname = ")) {
                    time = contentHtml.split("var ct = \"")[1].split("\";")[0];
                    source = contentHtml.split("var nickname = \"")[1].split("\";")[0];
                    biz = contentHtml.split("var appuin = \"\"||\"")[1].split("\"")[0];
                    user_name = contentHtml.split("var user_name = \"")[1].split("\"")[0];
                }
            }
-			if(wechatAricle == null) {
+            if (wechatAricle == null) {
                wechatAricle = new WechatAricle();
                wechatAricle.setTitle(title);
-				wechatAricle.setTime(new Date(Long.valueOf(time)*1000));
+                wechatAricle.setTime(new Date(Long.valueOf(time) * 1000));
                wechatAricle.setSource(source);
            }

@@ -219,9 +208,9 @@ public class WechatAritcleSearch {
    }


-	
    /**
     * 根据关键词采集指定时间+账号的数据
+     *
     * @param word
     * @param idOrName
     * @param startTime
@@ -236,20 +225,20 @@ public class WechatAritcleSearch {
        List<WechatAricle> result = new ArrayList<WechatAricle>();
        Map<String, String> headerMap = HeaderTool.getCommonHead();
        headerMap.put("Host", "weixin.sogou.com");
-		if(idOrName==null || idOrName.equals("")){
+        if (idOrName == null || idOrName.equals("")) {
            throw new IllegalArgumentException("要检索的昵称或id不能为空");
        }
        String openId = getOpenId(idOrName, proxyHolder);
        boolean f = false;
-		if(openId!=null){
+        if (openId != null) {
            f = true;
        }
        int page = 1;

        while (f) {
            String searchUrl = "https://weixin.sogou.com/weixin?type=2&ie=utf8&query=" + URLEncoder.encode(word, "UTF-8")
-			+ "&tsn=5&ft=" + startTime + "&et=" + endTime +"&interation=&page=" + page+"&wxid="+ openId 
-			+"&usip=" + URLEncoder.encode(idOrName, "UTF-8");
+                    + "&tsn=5&ft=" + startTime + "&et=" + endTime + "&interation=&page=" + page + "&wxid=" + openId
+                    + "&usip=" + URLEncoder.encode(idOrName, "UTF-8");

            headerMap.put("Referer", searchUrl);
            // 获取数据
@@ -274,10 +263,11 @@ public class WechatAritcleSearch {

    /**
     * 解析数据
+     *
     * @param jxDocument
     * @return
     */
-	private static List<WechatAricle> analysis(JXDocument jxDocument){
+    private static List<WechatAricle> analysis(JXDocument jxDocument) {
        List<WechatAricle> result = new ArrayList<WechatAricle>();
        // 解析数据
        try {
@@ -291,14 +281,12 @@ public class WechatAritcleSearch {
            String putDate = null;
            Date date = null;
            WechatAricle wechat = null;
-			if(Objects.nonNull(jxNodeList) && !jxNodeList.isEmpty()){
+            if (Objects.nonNull(jxNodeList) && !jxNodeList.isEmpty()) {
                for (JXNode jxNode : jxNodeList) {
                    try {
                        title = jxNode.selOne("//div[@class='txt-box']/h3/a").asElement().text();
                        link = jxNode.selOne("//div[@class='txt-box']/h3/a").asElement().attr("href");
-						if(!link.contains("weixin.sogou.com")){
-							link = "https://weixin.sogou.com" + link;
-						}
+                        link = URIUtils.resolve("https://weixin.sogou.com", link);
                        if (Objects.nonNull(jxNode.selOne("//div[@class='txt-box']/p"))) {
                            content = jxNode.selOne("//div[@class='txt-box']/p").asElement().text();
                        }
@@ -314,8 +302,8 @@ public class WechatAritcleSearch {
                        }
                        title = ZhiWeiTools.SBC2DBC(title);
                        content = ZhiWeiTools.SBC2DBC(content);
-						if(StringUtils.isNotBlank(title)){
-							wechat = new WechatAricle(link, title, source, content, date, null, null,readNum, 0, openid, "unknow");
+                        if (StringUtils.isNotBlank(title)) {
+                            wechat = new WechatAricle(link, title, source, content, date, null, null, readNum, 0, openid, "unknow");
                            result.add(wechat);
                        }
                    } catch (Exception e) {
@@ -334,22 +322,20 @@ public class WechatAritcleSearch {
    }

    /**
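+     * @param idOrName    the account id or nickname to look up (URL-encoded into the query string)
+     * @param proxyHolder proxy holder handed to HtmlDownUtil.downloadHtml for the request
+     * @return String 返回类型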
     * @Title: getOpenId
     * @Description: 获取微信wxID
-	 * @param @param
-	 *            wxId
-	 * @param @return
-	 *            设定文件
-	 * @return String 返回类型
     */
    public static String getOpenId(String idOrName, ProxyHolder proxyHolder) {
        String openId = null;
        String url = "https://weixin.sogou.com/weixin?zhnss=1&type=1&ie=utf8&query=" + URLCodeUtil.getURLEncode(idOrName, "utf-8");
        String htmlBody;
-		for(int i = 1;i < 3;i++) {
+        for (int i = 1; i < 3; i++) {

            try {
-			htmlBody = HtmlDownUtil.downloadHtml(url,null, proxyHolder);
+                htmlBody = HtmlDownUtil.downloadHtml(url, null, proxyHolder);
                if (StringUtils.isNotBlank(htmlBody)) {
                    JSONObject jsonObject = JSONObject.parseObject(htmlBody);
                    openId = jsonObject.getString("openid");