Commit ab9c3fd4 by zhiwei

处理微信链接出现重复拼接问题

parent 1cbcc794
...@@ -7,6 +7,7 @@ import java.net.URLEncoder; ...@@ -7,6 +7,7 @@ import java.net.URLEncoder;
import java.util.*;; import java.util.*;;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.URIUtils;
import com.zhiwei.wechat.util.HtmlDownUtil; import com.zhiwei.wechat.util.HtmlDownUtil;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
...@@ -20,9 +21,9 @@ import org.seimicrawler.xpath.JXDocument; ...@@ -20,9 +21,9 @@ import org.seimicrawler.xpath.JXDocument;
import org.seimicrawler.xpath.JXNode; import org.seimicrawler.xpath.JXNode;
/** /**
* @author Bewilder Z
* @ClassName: WechatAritcleSearch * @ClassName: WechatAritcleSearch
* @Description: TODO(在搜索接口根据关键词采集微信文章) * @Description: TODO(在搜索接口根据关键词采集微信文章)
* @author Bewilder Z
* @date 2016年10月14日 上午9:40:18 * @date 2016年10月14日 上午9:40:18
*/ */
public class WechatAritcleSearch { public class WechatAritcleSearch {
...@@ -31,18 +32,15 @@ public class WechatAritcleSearch { ...@@ -31,18 +32,15 @@ public class WechatAritcleSearch {
/** /**
* 根据关键词在搜狗微信搜索微信文章,不包含全文 * 根据关键词在搜狗微信搜索微信文章,不包含全文
* @Title: wechatKeywordSearch *
* @param * @param word 关键词
* word 关键词 * @param proxy 代理
* @param * @param pages 需要限制返回页数的总页数(如返回前20页则传21),如没有限制页数则传null
* proxy 代理
* @param
* pages 需要限制返回页数的总页数(如返回前20页则传21),如没有限制页数则传null
* @throws
* Exception
* @return List<Wechat> 返回类型 * @return List<Wechat> 返回类型
* @throws Exception
* @Title: wechatKeywordSearch
*/ */
public static List<WechatAricle> wechatKeywordSearch(String word, Proxy proxy, Integer pages) throws Exception{ public static List<WechatAricle> wechatKeywordSearch(String word, Proxy proxy, Integer pages) throws Exception {
List<WechatAricle> result = new ArrayList<>(); List<WechatAricle> result = new ArrayList<>();
Map<String, String> headerMap = HeaderTool.getCommonHead(); Map<String, String> headerMap = HeaderTool.getCommonHead();
headerMap.put("Host", "weixin.sogou.com"); headerMap.put("Host", "weixin.sogou.com");
...@@ -51,10 +49,10 @@ public class WechatAritcleSearch { ...@@ -51,10 +49,10 @@ public class WechatAritcleSearch {
int page = 1; int page = 1;
while (f) { while (f) {
String searchUrl = "https://weixin.sogou.com/weixin?type=2&s_from=input&query="+URLEncoder.encode(word, "UTF-8")+"&ie=utf8&_sug_=n&_sug_type_=&page="+ page; String searchUrl = "https://weixin.sogou.com/weixin?type=2&s_from=input&query=" + URLEncoder.encode(word, "UTF-8") + "&ie=utf8&_sug_=n&_sug_type_=&page=" + page;
headerMap.put("Referer", searchUrl); headerMap.put("Referer", searchUrl);
// 获取数据 // 获取数据
try{ try {
String htmlBody = HtmlDownUtil.downloadHtml(searchUrl, headerMap, proxy); String htmlBody = HtmlDownUtil.downloadHtml(searchUrl, headerMap, proxy);
// 解析数据 // 解析数据
if (StringUtils.isNotBlank(htmlBody)) { if (StringUtils.isNotBlank(htmlBody)) {
...@@ -70,10 +68,10 @@ public class WechatAritcleSearch { ...@@ -70,10 +68,10 @@ public class WechatAritcleSearch {
} else { } else {
logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", htmlBody); logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", htmlBody);
} }
if(pages!=null && pages==page) { if (pages != null && pages == page) {
break; break;
} }
}catch (IOException e){ } catch (IOException e) {
logger.error("根据关键词获取微信文章失败,错误为: {}", e); logger.error("根据关键词获取微信文章失败,错误为: {}", e);
} }
...@@ -82,28 +80,18 @@ public class WechatAritcleSearch { ...@@ -82,28 +80,18 @@ public class WechatAritcleSearch {
} }
/** /**
* * @param @param word 关键词
* @Title: wechatKeywordSearch * @param @param tsn 采集时间范围:1(1天内);2(一周内);3(一月内);4(一年内);
* @Description: 根据关键词在搜狗微信搜索微信文章,包含全文
* @param @param
* word 关键词
* @param @param
* tsn 采集时间范围:1(1天内);2(一周内);3(一月内);4(一年内);
* 5(某一时间段内与startTime和endTime配合使用) * 5(某一时间段内与startTime和endTime配合使用)
* @param @param * @param @param startTime 开始时间 格式为yyyy-MM-dd
* startTime 开始时间 格式为yyyy-MM-dd * @param @param endTime 结束时间 格式为yyyy-MM-dd
* @param @param
* endTime 结束时间 格式为yyyy-MM-dd
* @param @return * @param @return
* @param @throws * @param @throws ZhiWeiException
* ZhiWeiException * @param @throws UnsupportedEncodingException 设定文件
* @param @throws
* UnsupportedEncodingException 设定文件
* @return List<Wechat> 返回类型 * @return List<Wechat> 返回类型
* @Title: wechatKeywordSearch
* @Description: 根据关键词在搜狗微信搜索微信文章, 包含全文
*/ */
public static List<WechatAricle> wechatKeywordSearch(String word, public static List<WechatAricle> wechatKeywordSearch(String word,
Proxy proxy, ProxyHolder proxyHolder) throws Exception { Proxy proxy, ProxyHolder proxyHolder) throws Exception {
...@@ -114,7 +102,7 @@ public class WechatAritcleSearch { ...@@ -114,7 +102,7 @@ public class WechatAritcleSearch {
boolean f = true; boolean f = true;
int page = 1; int page = 1;
while (f) { while (f) {
String searchUrl = "https://weixin.sogou.com/weixin?type=2&s_from=input&query="+URLEncoder.encode(word, "UTF-8")+"&ie=utf8&_sug_=n&_sug_type_=&page="+ page; String searchUrl = "https://weixin.sogou.com/weixin?type=2&s_from=input&query=" + URLEncoder.encode(word, "UTF-8") + "&ie=utf8&_sug_=n&_sug_type_=&page=" + page;
headerMap.put("Referer", searchUrl); headerMap.put("Referer", searchUrl);
// 获取数据 // 获取数据
String htmlBody = HtmlDownUtil.downloadHtml(searchUrl, headerMap, proxy); String htmlBody = HtmlDownUtil.downloadHtml(searchUrl, headerMap, proxy);
...@@ -138,13 +126,14 @@ public class WechatAritcleSearch { ...@@ -138,13 +126,14 @@ public class WechatAritcleSearch {
/** /**
* 获取全文及来源 * 获取全文及来源
*
* @param url * @param url
* @param proxy * @param proxy
* @param wechatAricle * @param wechatAricle
* @return * @return
* @throws IOException * @throws IOException
*/ */
private static WechatAricle getWechatAricleInfo(String url, ProxyHolder proxy,WechatAricle wechatAricle){ private static WechatAricle getWechatAricleInfo(String url, ProxyHolder proxy, WechatAricle wechatAricle) {
try { try {
String contentHtml = HtmlDownUtil.downloadHtml(url, HeaderTool.getCommonHead(), proxy.getProxy()); String contentHtml = HtmlDownUtil.downloadHtml(url, HeaderTool.getCommonHead(), proxy.getProxy());
String content = null; String content = null;
...@@ -156,51 +145,51 @@ public class WechatAritcleSearch { ...@@ -156,51 +145,51 @@ public class WechatAritcleSearch {
String wxId = null; String wxId = null;
List<String> imgUrls = null; List<String> imgUrls = null;
String rootSource = null; String rootSource = null;
if(contentHtml!=null){ if (contentHtml != null) {
JXDocument jxDocument = JXDocument.create(contentHtml); JXDocument jxDocument = JXDocument.create(contentHtml);
title = jxDocument.selNOne("//h2[@id='activity-name']").asElement().text(); title = jxDocument.selNOne("//h2[@id='activity-name']").asElement().text();
wxId = jxDocument.selNOne("//p[@class='profile_meta'][1]/span[@class='profile_meta_value']").asElement().text(); wxId = jxDocument.selNOne("//p[@class='profile_meta'][1]/span[@class='profile_meta_value']").asElement().text();
if(contentHtml.contains("js_content")){ if (contentHtml.contains("js_content")) {
content = jxDocument.selNOne("//div[@id='js_content']").asElement().text(); content = jxDocument.selNOne("//div[@id='js_content']").asElement().text();
}else if(contentHtml.contains("js_share_content")){ } else if (contentHtml.contains("js_share_content")) {
content = jxDocument.selNOne("//div[@id='js_share_content']").asElement().text(); content = jxDocument.selNOne("//div[@id='js_share_content']").asElement().text();
} }
if(contentHtml.contains("content_tpl")){ if (contentHtml.contains("content_tpl")) {
String text = jxDocument.selNOne("//script[@id='content_tpl']").asElement().text(); String text = jxDocument.selNOne("//script[@id='content_tpl']").asElement().text();
content = Jsoup.parse(text).text(); content = Jsoup.parse(text).text();
} }
//解析文章图片地址 //解析文章图片地址
if(Objects.nonNull(jxDocument.selN("//div[@id='js_content']//img"))){ if (Objects.nonNull(jxDocument.selN("//div[@id='js_content']//img"))) {
imgUrls = new ArrayList<>(); imgUrls = new ArrayList<>();
List<JXNode> imgNodeList = jxDocument.selN("//div[@id='js_content']//img"); List<JXNode> imgNodeList = jxDocument.selN("//div[@id='js_content']//img");
for(JXNode imgNode : imgNodeList){ for (JXNode imgNode : imgNodeList) {
String imgUrl = imgNode.selOne("//img").asElement().attr("href"); String imgUrl = imgNode.selOne("//img").asElement().attr("href");
imgUrls.add(imgUrl); imgUrls.add(imgUrl);
} }
} }
//解析来源 //解析来源
if(Objects.nonNull(jxDocument.selNOne("//span[@id='copyright_logo']"))){ if (Objects.nonNull(jxDocument.selNOne("//span[@id='copyright_logo']"))) {
rootSource = jxDocument.selNOne("//span[@id='profileBt']/a[@id='js_name']").asElement().text(); rootSource = jxDocument.selNOne("//span[@id='profileBt']/a[@id='js_name']").asElement().text();
} }
if(contentHtml.contains("d.nick_name = ")){ if (contentHtml.contains("d.nick_name = ")) {
time = contentHtml.split("d.ct = \"")[1].split("\";")[0]; time = contentHtml.split("d.ct = \"")[1].split("\";")[0];
source = contentHtml.split("d.nick_name = \"")[1].split("\";")[0]; source = contentHtml.split("d.nick_name = \"")[1].split("\";")[0];
biz = contentHtml.split("d.biz = \"")[1].split("\"")[0]; biz = contentHtml.split("d.biz = \"")[1].split("\"")[0];
user_name = contentHtml.split("d.user_name = \"")[1].split("\"")[0]; user_name = contentHtml.split("d.user_name = \"")[1].split("\"")[0];
}else if(contentHtml.contains("var nickname = ")){ } else if (contentHtml.contains("var nickname = ")) {
time = contentHtml.split("var ct = \"")[1].split("\";")[0]; time = contentHtml.split("var ct = \"")[1].split("\";")[0];
source = contentHtml.split("var nickname = \"")[1].split("\";")[0]; source = contentHtml.split("var nickname = \"")[1].split("\";")[0];
biz = contentHtml.split("var appuin = \"\"||\"")[1].split("\"")[0]; biz = contentHtml.split("var appuin = \"\"||\"")[1].split("\"")[0];
user_name = contentHtml.split("var user_name = \"")[1].split("\"")[0]; user_name = contentHtml.split("var user_name = \"")[1].split("\"")[0];
} }
} }
if(wechatAricle == null) { if (wechatAricle == null) {
wechatAricle = new WechatAricle(); wechatAricle = new WechatAricle();
wechatAricle.setTitle(title); wechatAricle.setTitle(title);
wechatAricle.setTime(new Date(Long.valueOf(time)*1000)); wechatAricle.setTime(new Date(Long.valueOf(time) * 1000));
wechatAricle.setSource(source); wechatAricle.setSource(source);
} }
...@@ -219,9 +208,9 @@ public class WechatAritcleSearch { ...@@ -219,9 +208,9 @@ public class WechatAritcleSearch {
} }
/** /**
* 根据关键词采集指定时间+账号的数据 * 根据关键词采集指定时间+账号的数据
*
* @param word * @param word
* @param idOrName * @param idOrName
* @param startTime * @param startTime
...@@ -236,20 +225,20 @@ public class WechatAritcleSearch { ...@@ -236,20 +225,20 @@ public class WechatAritcleSearch {
List<WechatAricle> result = new ArrayList<WechatAricle>(); List<WechatAricle> result = new ArrayList<WechatAricle>();
Map<String, String> headerMap = HeaderTool.getCommonHead(); Map<String, String> headerMap = HeaderTool.getCommonHead();
headerMap.put("Host", "weixin.sogou.com"); headerMap.put("Host", "weixin.sogou.com");
if(idOrName==null || idOrName.equals("")){ if (idOrName == null || idOrName.equals("")) {
throw new IllegalArgumentException("要检索的昵称或id不能为空"); throw new IllegalArgumentException("要检索的昵称或id不能为空");
} }
String openId = getOpenId(idOrName, proxyHolder); String openId = getOpenId(idOrName, proxyHolder);
boolean f = false; boolean f = false;
if(openId!=null){ if (openId != null) {
f = true; f = true;
} }
int page = 1; int page = 1;
while (f) { while (f) {
String searchUrl = "https://weixin.sogou.com/weixin?type=2&ie=utf8&query=" + URLEncoder.encode(word, "UTF-8") String searchUrl = "https://weixin.sogou.com/weixin?type=2&ie=utf8&query=" + URLEncoder.encode(word, "UTF-8")
+ "&tsn=5&ft=" + startTime + "&et=" + endTime +"&interation=&page=" + page+"&wxid="+ openId + "&tsn=5&ft=" + startTime + "&et=" + endTime + "&interation=&page=" + page + "&wxid=" + openId
+"&usip=" + URLEncoder.encode(idOrName, "UTF-8"); + "&usip=" + URLEncoder.encode(idOrName, "UTF-8");
headerMap.put("Referer", searchUrl); headerMap.put("Referer", searchUrl);
// 获取数据 // 获取数据
...@@ -274,10 +263,11 @@ public class WechatAritcleSearch { ...@@ -274,10 +263,11 @@ public class WechatAritcleSearch {
/** /**
* 解析数据 * 解析数据
*
* @param jxDocument * @param jxDocument
* @return * @return
*/ */
private static List<WechatAricle> analysis(JXDocument jxDocument){ private static List<WechatAricle> analysis(JXDocument jxDocument) {
List<WechatAricle> result = new ArrayList<WechatAricle>(); List<WechatAricle> result = new ArrayList<WechatAricle>();
// 解析数据 // 解析数据
try { try {
...@@ -291,14 +281,12 @@ public class WechatAritcleSearch { ...@@ -291,14 +281,12 @@ public class WechatAritcleSearch {
String putDate = null; String putDate = null;
Date date = null; Date date = null;
WechatAricle wechat = null; WechatAricle wechat = null;
if(Objects.nonNull(jxNodeList) && !jxNodeList.isEmpty()){ if (Objects.nonNull(jxNodeList) && !jxNodeList.isEmpty()) {
for (JXNode jxNode : jxNodeList) { for (JXNode jxNode : jxNodeList) {
try { try {
title = jxNode.selOne("//div[@class='txt-box']/h3/a").asElement().text(); title = jxNode.selOne("//div[@class='txt-box']/h3/a").asElement().text();
link = jxNode.selOne("//div[@class='txt-box']/h3/a").asElement().attr("href"); link = jxNode.selOne("//div[@class='txt-box']/h3/a").asElement().attr("href");
if(!link.contains("weixin.sogou.com")){ link = URIUtils.resolve("https://weixin.sogou.com", link);
link = "https://weixin.sogou.com" + link;
}
if (Objects.nonNull(jxNode.selOne("//div[@class='txt-box']/p"))) { if (Objects.nonNull(jxNode.selOne("//div[@class='txt-box']/p"))) {
content = jxNode.selOne("//div[@class='txt-box']/p").asElement().text(); content = jxNode.selOne("//div[@class='txt-box']/p").asElement().text();
} }
...@@ -314,8 +302,8 @@ public class WechatAritcleSearch { ...@@ -314,8 +302,8 @@ public class WechatAritcleSearch {
} }
title = ZhiWeiTools.SBC2DBC(title); title = ZhiWeiTools.SBC2DBC(title);
content = ZhiWeiTools.SBC2DBC(content); content = ZhiWeiTools.SBC2DBC(content);
if(StringUtils.isNotBlank(title)){ if (StringUtils.isNotBlank(title)) {
wechat = new WechatAricle(link, title, source, content, date, null, null,readNum, 0, openid, "unknow"); wechat = new WechatAricle(link, title, source, content, date, null, null, readNum, 0, openid, "unknow");
result.add(wechat); result.add(wechat);
} }
} catch (Exception e) { } catch (Exception e) {
...@@ -334,22 +322,20 @@ public class WechatAritcleSearch { ...@@ -334,22 +322,20 @@ public class WechatAritcleSearch {
} }
/** /**
* @param @param wxId
* @param @return 设定文件
* @return String 返回类型
* @Title: getOpenId * @Title: getOpenId
* @Description: 获取微信wxID * @Description: 获取微信wxID
* @param @param
* wxId
* @param @return
* 设定文件
* @return String 返回类型
*/ */
public static String getOpenId(String idOrName, ProxyHolder proxyHolder) { public static String getOpenId(String idOrName, ProxyHolder proxyHolder) {
String openId = null; String openId = null;
String url = "https://weixin.sogou.com/weixin?zhnss=1&type=1&ie=utf8&query=" + URLCodeUtil.getURLEncode(idOrName, "utf-8"); String url = "https://weixin.sogou.com/weixin?zhnss=1&type=1&ie=utf8&query=" + URLCodeUtil.getURLEncode(idOrName, "utf-8");
String htmlBody; String htmlBody;
for(int i = 1;i < 3;i++) { for (int i = 1; i < 3; i++) {
try { try {
htmlBody = HtmlDownUtil.downloadHtml(url,null, proxyHolder); htmlBody = HtmlDownUtil.downloadHtml(url, null, proxyHolder);
if (StringUtils.isNotBlank(htmlBody)) { if (StringUtils.isNotBlank(htmlBody)) {
JSONObject jsonObject = JSONObject.parseObject(htmlBody); JSONObject jsonObject = JSONObject.parseObject(htmlBody);
openId = jsonObject.getString("openid"); openId = jsonObject.getString("openid");
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment