Commit 09b58307 by zhiwei

处理搜狗微信搜索链接中出现两次https的问题

parent 6c9f649a
......@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>wechat</artifactId>
<version>1.3.1-SNAPSHOT</version>
<version>1.3.2-SNAPSHOT</version>
<description>
知微微信采集程序,包含
1.微信历史文章采集
......@@ -85,13 +85,13 @@
<dependency>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.1.4-SNAPSHOT</version>
<version>0.1.5-SNAPSHOT</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId>
<version>0.5.5.6-SNAPSHOT</version>
<version>0.5.6.3-RELEASE</version>
<scope>provided</scope>
</dependency>
</dependencies>
......
......@@ -301,11 +301,9 @@ public class WechatAritcleSearch {
try {
title = element.select("div.txt-box").select("h3").text();
link = element.select("div.txt-box").select("h3 >a").attr("href");
if(!link.contains("https")){
if(!link.contains("weixin.sogou.com")){
link = "https://weixin.sogou.com" + link;
}
content = "";
if (element.select("p.txt-info").isEmpty()) {
content = element.select("p.txt-info").text();
} else {
......@@ -325,8 +323,10 @@ public class WechatAritcleSearch {
}
title = ZhiWeiTools.SBC2DBC(title);
content = ZhiWeiTools.SBC2DBC(content);
wechat = new WechatAricle(link, title, source, content, date, readNum, 0, openid, "unknow");
result.add(wechat);
if(StringUtils.isNotBlank(title)){
wechat = new WechatAricle(link, title, source, content, date, readNum, 0, openid, "unknow");
result.add(wechat);
}
} catch (Exception e) {
logger.debug("解析数据出现错误:{}", e.getMessage());
continue;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment