Commit ab9c3fd4 by zhiwei

处理微信链接出现重复拼接问题

parent 1cbcc794
......@@ -7,6 +7,7 @@ import java.net.URLEncoder;
import java.util.*;;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.URIUtils;
import com.zhiwei.wechat.util.HtmlDownUtil;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
......@@ -20,347 +21,332 @@ import org.seimicrawler.xpath.JXDocument;
import org.seimicrawler.xpath.JXNode;
/**
 * Collects WeChat articles by keyword through the Sogou WeChat search interface.
 *
 * @ClassName: WechatAritcleSearch
 * @author Bewilder Z
 * @date 2016-10-14 09:40:18
 */
public class WechatAritcleSearch {
private static Logger logger = LogManager.getLogger(WechatAritcleSearch.class);
private static Logger logger = LogManager.getLogger(WechatAritcleSearch.class);
/**
 * Searches weixin.sogou.com for articles matching a keyword and collects the
 * entries listed on the result pages (the article pages themselves are not fetched here).
 *
 * <p>Pages are requested one after another, starting at page 1, for as long as the
 * result page contains a next-page link ({@code //a[@id='sogou_next']}) whose text
 * contains "下一页". A page without that link ends the search normally.
 *
 * <p>A blank response or an {@link IOException} during download is logged and the same
 * page is retried; after three consecutive failures the search stops and whatever has
 * been collected so far is returned, instead of retrying forever.
 *
 * @param word  keyword to search for
 * @param proxy proxy used for the download
 * @param pages page limit expressed as "last wanted page + 1" (pass 21 to get the first
 *              20 pages); {@code null} for no limit
 * @return the articles parsed from every page that was fetched; never {@code null}
 * @throws Exception if the keyword cannot be URL-encoded, or on any failure other than
 *                   an {@link IOException} raised while downloading/parsing a page
 * @Title: wechatKeywordSearch
 */
public static List<WechatAricle> wechatKeywordSearch(String word, Proxy proxy, Integer pages) throws Exception {
    // Upper bound on back-to-back failed downloads of the same page.
    final int maxConsecutiveFailures = 3;
    final String nextPageXpath = "//a[@id='sogou_next']";

    List<WechatAricle> result = new ArrayList<>();
    Map<String, String> headerMap = HeaderTool.getCommonHead();
    headerMap.put("Host", "weixin.sogou.com");
    headerMap.put("cookie", "com_sohu_websearch_ITEM_PER_PAGE=100;");

    // The encoded keyword does not change between pages, so encode it once.
    String encodedWord = URLEncoder.encode(word, "UTF-8");
    int page = 1;
    int failures = 0;
    boolean hasMore = true;
    while (hasMore && failures < maxConsecutiveFailures) {
        String searchUrl = "https://weixin.sogou.com/weixin?type=2&s_from=input&query=" + encodedWord + "&ie=utf8&_sug_=n&_sug_type_=&page=" + page;
        headerMap.put("Referer", searchUrl);
        try {
            String htmlBody = HtmlDownUtil.downloadHtml(searchUrl, headerMap, proxy);
            if (StringUtils.isNotBlank(htmlBody)) {
                failures = 0;
                JXDocument jxDocument = JXDocument.create(htmlBody);
                // analysis may hand back null when parsing fails; addAll(null) would throw.
                List<WechatAricle> parsed = analysis(jxDocument);
                if (parsed != null) {
                    result.addAll(parsed);
                }
                // selNOne yields null when the page has no next-page link (last page),
                // so test for presence before reading the link text.
                if (Objects.nonNull(jxDocument.selNOne(nextPageXpath))
                        && jxDocument.selNOne(nextPageXpath).asElement().text().contains("下一页")) {
                    page++;
                } else {
                    hasMore = false;
                }
            } else {
                failures++;
                logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", htmlBody);
            }
            // page has already been advanced, hence the "limit + 1" convention of pages.
            if (pages != null && pages == page) {
                break;
            }
        } catch (IOException e) {
            failures++;
            logger.error("根据关键词获取微信文章失败,错误为: {}", e);
        }
    }
    return result;
}
/**
 * Searches weixin.sogou.com for articles matching a keyword and collects the
 * entries listed on every result page.
 *
 * <p>Pages are requested one after another, starting at page 1, for as long as the
 * result page contains a next-page link ({@code //a[@id='sogou_next']}) whose text
 * contains "下一页". A page without that link ends the search normally.
 *
 * <p>A blank response is logged and the same page is retried; after three consecutive
 * blank responses the search stops and the articles collected so far are returned,
 * instead of retrying forever.
 *
 * @param word        keyword to search for
 * @param proxy       proxy used for the download
 * @param proxyHolder not used by this method; kept so existing callers keep compiling
 * @return the articles parsed from every page that was fetched; never {@code null}
 * @throws Exception if the keyword cannot be URL-encoded or a page download fails
 * @Title: wechatKeywordSearch
 */
public static List<WechatAricle> wechatKeywordSearch(String word,
        Proxy proxy, ProxyHolder proxyHolder) throws Exception {
    // Upper bound on back-to-back blank responses for the same page.
    final int maxConsecutiveFailures = 3;
    final String nextPageXpath = "//a[@id='sogou_next']";

    List<WechatAricle> result = new ArrayList<WechatAricle>();
    Map<String, String> headerMap = HeaderTool.getCommonHead();
    headerMap.put("Host", "weixin.sogou.com");
    headerMap.put("cookie", "com_sohu_websearch_ITEM_PER_PAGE=100;");

    // The encoded keyword does not change between pages, so encode it once.
    String encodedWord = URLEncoder.encode(word, "UTF-8");
    int page = 1;
    int failures = 0;
    boolean hasMore = true;
    while (hasMore && failures < maxConsecutiveFailures) {
        String searchUrl = "https://weixin.sogou.com/weixin?type=2&s_from=input&query=" + encodedWord + "&ie=utf8&_sug_=n&_sug_type_=&page=" + page;
        headerMap.put("Referer", searchUrl);
        String htmlBody = HtmlDownUtil.downloadHtml(searchUrl, headerMap, proxy);
        if (StringUtils.isBlank(htmlBody)) {
            failures++;
            logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", htmlBody);
            continue;
        }
        failures = 0;
        JXDocument jxDocument = JXDocument.create(htmlBody);
        // analysis may hand back null when parsing fails; addAll(null) would throw.
        List<WechatAricle> parsed = analysis(jxDocument);
        if (parsed != null) {
            result.addAll(parsed);
        }
        // selNOne yields null when the page has no next-page link (last page),
        // so test for presence before reading the link text.
        if (Objects.nonNull(jxDocument.selNOne(nextPageXpath))
                && jxDocument.selNOne(nextPageXpath).asElement().text().contains("下一页")) {
            page++;
        } else {
            hasMore = false;
        }
    }
    return result;
}
/**
* 获取全文及来源
*
* @param url
* @param proxy
* @param wechatAricle
* @return
* @throws IOException
*/
private static WechatAricle getWechatAricleInfo(String url, ProxyHolder proxy, WechatAricle wechatAricle) {
try {
String contentHtml = HtmlDownUtil.downloadHtml(url, HeaderTool.getCommonHead(), proxy.getProxy());
String content = null;
String time = null;
String source = null;
String biz = null;
String title = null;
String user_name = null;
String wxId = null;
List<String> imgUrls = null;
String rootSource = null;
if (contentHtml != null) {
JXDocument jxDocument = JXDocument.create(contentHtml);
title = jxDocument.selNOne("//h2[@id='activity-name']").asElement().text();
wxId = jxDocument.selNOne("//p[@class='profile_meta'][1]/span[@class='profile_meta_value']").asElement().text();
/**
* 根据关键词在搜狗微信搜索微信文章,不包含全文
* @Title: wechatKeywordSearch
* @param
* word 关键词
* @param
* proxy 代理
* @param
* pages 需要限制返回页数的总页数(如返回前20页则传21),如没有限制页数则传null
* @throws
* Exception
* @return List<Wechat> 返回类型
*/
public static List<WechatAricle> wechatKeywordSearch(String word, Proxy proxy, Integer pages) throws Exception{
List<WechatAricle> result = new ArrayList<>();
Map<String, String> headerMap = HeaderTool.getCommonHead();
headerMap.put("Host", "weixin.sogou.com");
headerMap.put("cookie", "com_sohu_websearch_ITEM_PER_PAGE=100;");
boolean f = true;
int page = 1;
if (contentHtml.contains("js_content")) {
content = jxDocument.selNOne("//div[@id='js_content']").asElement().text();
} else if (contentHtml.contains("js_share_content")) {
content = jxDocument.selNOne("//div[@id='js_share_content']").asElement().text();
}
if (contentHtml.contains("content_tpl")) {
String text = jxDocument.selNOne("//script[@id='content_tpl']").asElement().text();
content = Jsoup.parse(text).text();
}
//解析文章图片地址
if (Objects.nonNull(jxDocument.selN("//div[@id='js_content']//img"))) {
imgUrls = new ArrayList<>();
List<JXNode> imgNodeList = jxDocument.selN("//div[@id='js_content']//img");
for (JXNode imgNode : imgNodeList) {
String imgUrl = imgNode.selOne("//img").asElement().attr("href");
imgUrls.add(imgUrl);
}
}
//解析来源
if (Objects.nonNull(jxDocument.selNOne("//span[@id='copyright_logo']"))) {
rootSource = jxDocument.selNOne("//span[@id='profileBt']/a[@id='js_name']").asElement().text();
}
while (f) {
String searchUrl = "https://weixin.sogou.com/weixin?type=2&s_from=input&query="+URLEncoder.encode(word, "UTF-8")+"&ie=utf8&_sug_=n&_sug_type_=&page="+ page;
headerMap.put("Referer", searchUrl);
// 获取数据
try{
String htmlBody = HtmlDownUtil.downloadHtml(searchUrl, headerMap, proxy);
// 解析数据
if (StringUtils.isNotBlank(htmlBody)) {
JXDocument jxDocument = JXDocument.create(htmlBody);
result.addAll(analysis(jxDocument));
// 解析最大可寻页码
String pageNext = jxDocument.selNOne("//a[@id='sogou_next']").asElement().text();
if (pageNext.contains("下一页")) {
page++;
} else {
f = false;
}
} else {
logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", htmlBody);
}
if(pages!=null && pages==page) {
break;
}
}catch (IOException e){
logger.error("根据关键词获取微信文章失败,错误为: {}", e);
}
}
return result;
}
if (contentHtml.contains("d.nick_name = ")) {
time = contentHtml.split("d.ct = \"")[1].split("\";")[0];
source = contentHtml.split("d.nick_name = \"")[1].split("\";")[0];
biz = contentHtml.split("d.biz = \"")[1].split("\"")[0];
user_name = contentHtml.split("d.user_name = \"")[1].split("\"")[0];
} else if (contentHtml.contains("var nickname = ")) {
time = contentHtml.split("var ct = \"")[1].split("\";")[0];
source = contentHtml.split("var nickname = \"")[1].split("\";")[0];
biz = contentHtml.split("var appuin = \"\"||\"")[1].split("\"")[0];
user_name = contentHtml.split("var user_name = \"")[1].split("\"")[0];
}
}
if (wechatAricle == null) {
wechatAricle = new WechatAricle();
wechatAricle.setTitle(title);
wechatAricle.setTime(new Date(Long.valueOf(time) * 1000));
wechatAricle.setSource(source);
}
/**
 * Searches weixin.sogou.com for articles matching a keyword and collects the
 * entries listed on every result page.
 *
 * <p>Pages are requested one after another, starting at page 1, for as long as the
 * result page contains a next-page link ({@code //a[@id='sogou_next']}) whose text
 * contains "下一页". A page without that link ends the search normally.
 *
 * <p>A blank response is logged and the same page is retried; after three consecutive
 * blank responses the search stops and the articles collected so far are returned,
 * instead of retrying forever.
 *
 * @param word        keyword to search for
 * @param proxy       proxy used for the download
 * @param proxyHolder not used by this method; kept so existing callers keep compiling
 * @return the articles parsed from every page that was fetched; never {@code null}
 * @throws Exception if the keyword cannot be URL-encoded or a page download fails
 * @Title: wechatKeywordSearch
 */
public static List<WechatAricle> wechatKeywordSearch(String word,
        Proxy proxy, ProxyHolder proxyHolder) throws Exception {
    // Upper bound on back-to-back blank responses for the same page.
    final int maxConsecutiveFailures = 3;
    final String nextPageXpath = "//a[@id='sogou_next']";

    List<WechatAricle> result = new ArrayList<WechatAricle>();
    Map<String, String> headerMap = HeaderTool.getCommonHead();
    headerMap.put("Host", "weixin.sogou.com");
    headerMap.put("cookie", "com_sohu_websearch_ITEM_PER_PAGE=100;");

    // The encoded keyword does not change between pages, so encode it once.
    String encodedWord = URLEncoder.encode(word, "UTF-8");
    int page = 1;
    int failures = 0;
    boolean hasMore = true;
    while (hasMore && failures < maxConsecutiveFailures) {
        String searchUrl = "https://weixin.sogou.com/weixin?type=2&s_from=input&query="+encodedWord+"&ie=utf8&_sug_=n&_sug_type_=&page="+ page;
        headerMap.put("Referer", searchUrl);
        String htmlBody = HtmlDownUtil.downloadHtml(searchUrl, headerMap, proxy);
        if (StringUtils.isBlank(htmlBody)) {
            failures++;
            logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", htmlBody);
            continue;
        }
        failures = 0;
        JXDocument jxDocument = JXDocument.create(htmlBody);
        // analysis may hand back null when parsing fails; addAll(null) would throw.
        List<WechatAricle> parsed = analysis(jxDocument);
        if (parsed != null) {
            result.addAll(parsed);
        }
        // selNOne yields null when the page has no next-page link (last page),
        // so test for presence before reading the link text.
        if (Objects.nonNull(jxDocument.selNOne(nextPageXpath))
                && jxDocument.selNOne(nextPageXpath).asElement().text().contains("下一页")) {
            page++;
        } else {
            hasMore = false;
        }
    }
    return result;
}
wechatAricle.setImgUrls(imgUrls);
wechatAricle.setRootSource(rootSource);
wechatAricle.setBiz(biz);
wechatAricle.setContent(content);
wechatAricle.setWxId(wxId);
wechatAricle.setUser_name(user_name);
/**
* 获取全文及来源
* @param url
* @param proxy
* @param wechatAricle
* @return
* @throws IOException
*/
private static WechatAricle getWechatAricleInfo(String url, ProxyHolder proxy,WechatAricle wechatAricle){
try {
String contentHtml = HtmlDownUtil.downloadHtml(url, HeaderTool.getCommonHead(), proxy.getProxy());
String content = null;
String time = null;
String source = null;
String biz = null;
String title = null;
String user_name = null;
String wxId = null;
List<String> imgUrls = null;
String rootSource = null;
if(contentHtml!=null){
JXDocument jxDocument = JXDocument.create(contentHtml);
title = jxDocument.selNOne("//h2[@id='activity-name']").asElement().text();
wxId = jxDocument.selNOne("//p[@class='profile_meta'][1]/span[@class='profile_meta_value']").asElement().text();
} catch (Exception e) {
e.printStackTrace();
return wechatAricle;
}
return wechatAricle;
}
if(contentHtml.contains("js_content")){
content = jxDocument.selNOne("//div[@id='js_content']").asElement().text();
}else if(contentHtml.contains("js_share_content")){
content = jxDocument.selNOne("//div[@id='js_share_content']").asElement().text();
}
if(contentHtml.contains("content_tpl")){
String text = jxDocument.selNOne("//script[@id='content_tpl']").asElement().text();
content = Jsoup.parse(text).text();
}
//解析文章图片地址
if(Objects.nonNull(jxDocument.selN("//div[@id='js_content']//img"))){
imgUrls = new ArrayList<>();
List<JXNode> imgNodeList = jxDocument.selN("//div[@id='js_content']//img");
for(JXNode imgNode : imgNodeList){
String imgUrl = imgNode.selOne("//img").asElement().attr("href");
imgUrls.add(imgUrl);
}
}
//解析来源
if(Objects.nonNull(jxDocument.selNOne("//span[@id='copyright_logo']"))){
rootSource = jxDocument.selNOne("//span[@id='profileBt']/a[@id='js_name']").asElement().text();
}
/**
* 根据关键词采集指定时间+账号的数据
*
* @param word
* @param idOrName
* @param startTime
* @param endTime
* @param proxyHolder
* @return
* @throws Exception
* @throws UnsupportedEncodingException
*/
public static List<WechatAricle> wechatKeywordSearchByAccount(String word, String idOrName, String startTime, String endTime,
ProxyHolder proxyHolder) throws Exception, UnsupportedEncodingException {
List<WechatAricle> result = new ArrayList<WechatAricle>();
Map<String, String> headerMap = HeaderTool.getCommonHead();
headerMap.put("Host", "weixin.sogou.com");
if (idOrName == null || idOrName.equals("")) {
throw new IllegalArgumentException("要检索的昵称或id不能为空");
}
String openId = getOpenId(idOrName, proxyHolder);
boolean f = false;
if (openId != null) {
f = true;
}
int page = 1;
if(contentHtml.contains("d.nick_name = ")){
time = contentHtml.split("d.ct = \"")[1].split("\";")[0];
source = contentHtml.split("d.nick_name = \"")[1].split("\";")[0];
biz = contentHtml.split("d.biz = \"")[1].split("\"")[0];
user_name = contentHtml.split("d.user_name = \"")[1].split("\"")[0];
}else if(contentHtml.contains("var nickname = ")){
time = contentHtml.split("var ct = \"")[1].split("\";")[0];
source = contentHtml.split("var nickname = \"")[1].split("\";")[0];
biz = contentHtml.split("var appuin = \"\"||\"")[1].split("\"")[0];
user_name = contentHtml.split("var user_name = \"")[1].split("\"")[0];
}
}
if(wechatAricle == null) {
wechatAricle = new WechatAricle();
wechatAricle.setTitle(title);
wechatAricle.setTime(new Date(Long.valueOf(time)*1000));
wechatAricle.setSource(source);
}
while (f) {
String searchUrl = "https://weixin.sogou.com/weixin?type=2&ie=utf8&query=" + URLEncoder.encode(word, "UTF-8")
+ "&tsn=5&ft=" + startTime + "&et=" + endTime + "&interation=&page=" + page + "&wxid=" + openId
+ "&usip=" + URLEncoder.encode(idOrName, "UTF-8");
wechatAricle.setImgUrls(imgUrls);
wechatAricle.setRootSource(rootSource);
wechatAricle.setBiz(biz);
wechatAricle.setContent(content);
wechatAricle.setWxId(wxId);
wechatAricle.setUser_name(user_name);
} catch (Exception e) {
e.printStackTrace();
return wechatAricle;
}
return wechatAricle;
}
/**
* 根据关键词采集指定时间+账号的数据
* @param word
* @param idOrName
* @param startTime
* @param endTime
* @param proxyHolder
* @return
* @throws Exception
* @throws UnsupportedEncodingException
*/
public static List<WechatAricle> wechatKeywordSearchByAccount(String word, String idOrName, String startTime, String endTime,
ProxyHolder proxyHolder) throws Exception, UnsupportedEncodingException {
List<WechatAricle> result = new ArrayList<WechatAricle>();
Map<String, String> headerMap = HeaderTool.getCommonHead();
headerMap.put("Host", "weixin.sogou.com");
if(idOrName==null || idOrName.equals("")){
throw new IllegalArgumentException("要检索的昵称或id不能为空");
}
String openId = getOpenId(idOrName, proxyHolder);
boolean f = false;
if(openId!=null){
f = true;
}
int page = 1;
headerMap.put("Referer", searchUrl);
// 获取数据
String htmlBody = HtmlDownUtil.downloadHtml(searchUrl, headerMap, proxyHolder);
if (StringUtils.isNotBlank(htmlBody)) {
JXDocument jxDocument = JXDocument.create(htmlBody);
result.addAll(analysis(jxDocument));
// 解析最大可寻页码
String pageNext = jxDocument.selNOne("//a[@id='sogou_next']").asElement().text();
if (pageNext.contains("下一页")) {
page++;
} else {
f = false;
}
} else {
logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", htmlBody);
}
}
return result;
}
while (f) {
String searchUrl = "https://weixin.sogou.com/weixin?type=2&ie=utf8&query=" + URLEncoder.encode(word, "UTF-8")
+ "&tsn=5&ft=" + startTime + "&et=" + endTime +"&interation=&page=" + page+"&wxid="+ openId
+"&usip=" + URLEncoder.encode(idOrName, "UTF-8");
headerMap.put("Referer", searchUrl);
// 获取数据
String htmlBody = HtmlDownUtil.downloadHtml(searchUrl, headerMap, proxyHolder);
if (StringUtils.isNotBlank(htmlBody)) {
JXDocument jxDocument = JXDocument.create(htmlBody);
result.addAll(analysis(jxDocument));
// 解析最大可寻页码
String pageNext = jxDocument.selNOne("//a[@id='sogou_next']").asElement().text();
if (pageNext.contains("下一页")) {
page++;
} else {
f = false;
}
} else {
logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", htmlBody);
}
}
return result;
}
/**
* 解析数据
*
* @param jxDocument
* @return
*/
private static List<WechatAricle> analysis(JXDocument jxDocument) {
List<WechatAricle> result = new ArrayList<WechatAricle>();
// 解析数据
try {
// 解析数据
List<JXNode> jxNodeList = jxDocument.selN("//div[@class='news-box']/ul[@class='news-list']/li");
String title = null;
String link = null;
String content = null;
String source = null;
String openid = null;
String putDate = null;
Date date = null;
WechatAricle wechat = null;
if (Objects.nonNull(jxNodeList) && !jxNodeList.isEmpty()) {
for (JXNode jxNode : jxNodeList) {
try {
title = jxNode.selOne("//div[@class='txt-box']/h3/a").asElement().text();
link = jxNode.selOne("//div[@class='txt-box']/h3/a").asElement().attr("href");
link = URIUtils.resolve("https://weixin.sogou.com", link);
if (Objects.nonNull(jxNode.selOne("//div[@class='txt-box']/p"))) {
content = jxNode.selOne("//div[@class='txt-box']/p").asElement().text();
}
source = jxNode.selOne("//div[@class='s-p']/a").asElement().text();
openid = jxNode.selOne("//div[@class='s-p']/a").asElement().attr("i");
putDate = jxNode.selOne("//div[@class='s-p']").asElement().attr("t");
date = new Date(Long.valueOf(putDate) * 1000);
int readNum = 0;
try {
readNum = Integer.valueOf(jxNode.selOne("//div[@class='s-p']/span[@class='s1']").asElement().text().trim());
} catch (Exception e) {
readNum = 0;
}
title = ZhiWeiTools.SBC2DBC(title);
content = ZhiWeiTools.SBC2DBC(content);
if (StringUtils.isNotBlank(title)) {
wechat = new WechatAricle(link, title, source, content, date, null, null, readNum, 0, openid, "unknow");
result.add(wechat);
}
} catch (Exception e) {
logger.debug("解析数据出现错误:{}", e.getMessage());
continue;
}
}
/**
 * Parses one search-result page into article entries.
 *
 * <p>Every {@code li} under {@code div.news-box / ul.news-list} is read for title, link,
 * summary, source account, the account's {@code i} attribute, publish time (the {@code t}
 * attribute, multiplied by 1000 before being turned into a {@link Date}) and read count.
 * An item that cannot be parsed is logged at debug level and skipped; items with a blank
 * title are dropped.
 *
 * @param jxDocument parsed result page
 * @return the entries parsed from the page; never {@code null} — if the page as a whole
 *         cannot be processed, the entries gathered before the failure (possibly none)
 *         are returned
 */
private static List<WechatAricle> analysis(JXDocument jxDocument) {
    final String sogouBase = "https://weixin.sogou.com";
    List<WechatAricle> result = new ArrayList<WechatAricle>();
    try {
        List<JXNode> jxNodeList = jxDocument.selN("//div[@class='news-box']/ul[@class='news-list']/li");
        if (Objects.nonNull(jxNodeList) && !jxNodeList.isEmpty()) {
            for (JXNode jxNode : jxNodeList) {
                try {
                    // All fields are local to the item so that a value missing on this
                    // item (e.g. no summary paragraph) cannot be inherited from the previous one.
                    String title = jxNode.selOne("//div[@class='txt-box']/h3/a").asElement().text();
                    String link = jxNode.selOne("//div[@class='txt-box']/h3/a").asElement().attr("href");
                    // Only prefix the host onto links that are not already absolute. A substring
                    // test on the host name would also prefix absolute links to other hosts.
                    if (link.startsWith("//")) {
                        link = "https:" + link;
                    } else if (!link.startsWith("http://") && !link.startsWith("https://")) {
                        link = sogouBase + (link.startsWith("/") ? link : "/" + link);
                    }
                    String content = null;
                    if (Objects.nonNull(jxNode.selOne("//div[@class='txt-box']/p"))) {
                        content = jxNode.selOne("//div[@class='txt-box']/p").asElement().text();
                    }
                    String source = jxNode.selOne("//div[@class='s-p']/a").asElement().text();
                    String openid = jxNode.selOne("//div[@class='s-p']/a").asElement().attr("i");
                    String putDate = jxNode.selOne("//div[@class='s-p']").asElement().attr("t");
                    Date date = new Date(Long.valueOf(putDate) * 1000);
                    int readNum = 0;
                    try {
                        readNum = Integer.valueOf(jxNode.selOne("//div[@class='s-p']/span[@class='s1']").asElement().text().trim());
                    } catch (Exception e) {
                        // Read count is optional: missing or non-numeric falls back to 0.
                        readNum = 0;
                    }
                    title = ZhiWeiTools.SBC2DBC(title);
                    content = ZhiWeiTools.SBC2DBC(content);
                    if (StringUtils.isNotBlank(title)) {
                        result.add(new WechatAricle(link, title, source, content, date, null, null, readNum, 0, openid, "unknow"));
                    }
                } catch (Exception e) {
                    logger.debug("解析数据出现错误:{}", e.getMessage());
                    continue;
                }
            }
        }
    } catch (Exception e) {
        logger.debug("获取数据出现问题:{}", e.getMessage());
        // Callers feed the return value straight into addAll, so never return null.
        return result;
    }
    return result;
}
}
// logger.info("数据总页数为:{}", page);
} catch (Exception e) {
logger.debug("获取数据出现问题:{}", e.getMessage());
return null;
}
return result;
}
/**
* @param @param wxId
* @param @return 设定文件
* @return String 返回类型
* @Title: getOpenId
* @Description: 获取微信wxID
*/
public static String getOpenId(String idOrName, ProxyHolder proxyHolder) {
String openId = null;
String url = "https://weixin.sogou.com/weixin?zhnss=1&type=1&ie=utf8&query=" + URLCodeUtil.getURLEncode(idOrName, "utf-8");
String htmlBody;
for (int i = 1; i < 3; i++) {
/**
 * Looks up the "openid" value for a WeChat account through weixin.sogou.com.
 *
 * <p>The lookup URL is downloaded at most twice. The first non-blank response is parsed
 * as JSON and its {@code openid} field is returned. A blank response or any exception
 * (the stack trace is printed) moves on to the next attempt.
 *
 * @param idOrName    account id or nickname, URL-encoded into the query
 * @param proxyHolder proxy holder handed to the downloader
 * @return the {@code openid} field of the response, or {@code null} when no attempt succeeded
 * @Title: getOpenId
 */
public static String getOpenId(String idOrName, ProxyHolder proxyHolder) {
    final String lookupUrl = "https://weixin.sogou.com/weixin?zhnss=1&type=1&ie=utf8&query=" + URLCodeUtil.getURLEncode(idOrName, "utf-8");
    int attempt = 1;
    while (attempt < 3) {
        try {
            String body = HtmlDownUtil.downloadHtml(lookupUrl, null, proxyHolder);
            if (!StringUtils.isBlank(body)) {
                // First usable response wins, whatever its openid field holds.
                return JSONObject.parseObject(body).getString("openid");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        attempt++;
    }
    return null;
}
try {
htmlBody = HtmlDownUtil.downloadHtml(url, null, proxyHolder);
if (StringUtils.isNotBlank(htmlBody)) {
JSONObject jsonObject = JSONObject.parseObject(htmlBody);
openId = jsonObject.getString("openid");
return openId;
}
} catch (Exception e) {
e.printStackTrace();
openId = null;
}
}
return openId;
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment