Commit e6b2adc7 by [zhangzhiwei]

添加采集微信全文及文章内来源

parent f7e200cf
package com.zhiwei.wechat.search;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.Proxy;
import java.net.URLEncoder;
......@@ -19,6 +20,7 @@ import org.jsoup.select.Elements;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
import com.zhiwei.tools.tools.ZhiWeiTools;
import com.zhiwei.wechat.entity.WechatAricle;
......@@ -54,9 +56,7 @@ public class WechatAritcleSearch {
throws Exception, UnsupportedEncodingException
{
List<WechatAricle> result = new ArrayList<WechatAricle>();
Map<String,String> headerMap = new HashMap<String,String>();
headerMap.put("Upgrade-Insecure-Requests", "1");
headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36");
Map<String,String> headerMap = HeaderTool.getCommonHead();
headerMap.put("Host","weixin.sogou.com");
if(cookie!=null){
headerMap.put("Cookie",cookie);
......@@ -126,6 +126,7 @@ public class WechatAritcleSearch {
title = ZhiWeiTools.SBC2DBC(title);
content = ZhiWeiTools.SBC2DBC(content);
wechat = new WechatAricle(link, title, source, content, date, readNum, 0,openid,"unknow");
wechat = getContentAndSource(url, proxy, headerMap, wechat);
result.add(wechat);
} catch (Exception e) {
logger.debug("解析数据出现错误:{}",e.getMessage());
......@@ -153,6 +154,38 @@ public class WechatAritcleSearch {
return result;
}
/**
 * Fetches the page at {@code url} and fills in the article's full text and source
 * from it.
 *
 * <p>The full text is read from {@code div#js_content} and the source from
 * {@code a#js_name}. A field is only overwritten when the corresponding element
 * yields non-empty text, so values already set on {@code wechatAricle} survive a
 * page that lacks those elements.
 *
 * @param url address of the page to download
 * @param proxy proxy handed to {@code httpBoot.syncCall}
 * @param headerMap request headers handed to {@code RequestUtils.wrapGet}
 * @param wechatAricle article to update; it is modified in place
 * @return the same {@code wechatAricle} instance that was passed in
 * @throws IOException propagated from the HTTP call or from reading the response body
 */
private static WechatAricle getContentAndSource(String url, Proxy proxy, Map<String,String> headerMap,WechatAricle wechatAricle) throws IOException{
    String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy, false).body().string();
    if(htmlBody==null){
        return wechatAricle;
    }
    Document document = Jsoup.parse(htmlBody);
    String content = document.select("div#js_content").text();
    String source = document.select("a#js_name").text();
    // Elements.text() returns "" (not null) when the selector matches nothing, so a
    // null check alone would replace the existing values with empty strings.
    if(content!=null && !content.isEmpty()){
        wechatAricle.setContent(content);
    }
    if(source!=null && !source.isEmpty()){
        wechatAricle.setSource(source);
    }
    return wechatAricle;
}
/**
* @Title: getOpenId
* @Description: TODO(获取微信wxID)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment