Commit 97909cfe by [zhangzhiwei]

添加代理采集全文功能

parent 87c407d1
......@@ -19,6 +19,7 @@ import org.jsoup.select.Elements;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
import com.zhiwei.tools.tools.ZhiWeiTools;
......@@ -36,10 +37,15 @@ public class WechatAritcleSearch {
private static Logger logger = LogManager.getLogger(WechatAritcleSearch.class);
private static HttpBoot httpBoot = new HttpBoot();
/**
*
* @Title: wechatKeywordSearch
* @Description: TODO(根据关键词在搜狗微信搜索微信文章)
* @Description: TODO(根据关键词在搜狗微信搜索微信文章,不包含全文)
* @param @param
* word 关键词
* @param @param
......@@ -119,7 +125,120 @@ public class WechatAritcleSearch {
title = ZhiWeiTools.SBC2DBC(title);
content = ZhiWeiTools.SBC2DBC(content);
wechat = new WechatAricle(link, title, source, content, date, readNum, 0, openid, "unknow");
wechat = getContentAndSource(link, proxy, wechat);
result.add(wechat);
} catch (Exception e) {
logger.debug("解析数据出现错误:{}", e.getMessage());
continue;
}
}
// 解析最大可寻页码
String pageNext = document.select("[id=pagebar_container]>a").text();
if (pageNext.contains("下一页")) {
page++;
} else {
f = false;
}
// logger.info("数据总页数为:{}", page);
} catch (Exception e) {
logger.debug("获取数据出现问题:{}", e.getMessage());
return null;
}
} else {
logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", htmlBody);
}
// ZhiWeiTools.sleep(100);
}
return result;
}
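/**
 * Searches Sogou WeChat (weixin.sogou.com) for WeChat articles matching a keyword,
 * including the full text of each article.
 *
 * @param word
 *            the search keyword; it is URL-encoded as UTF-8 before being sent
 * @param tsn
 *            the time range to search: 1 (within one day); 2 (within one week);
 *            3 (within one month); 4 (within one year);
 *            5 (a custom period, used together with {@code startTime} and {@code endTime})
 * @param startTime
 *            start date in {@code yyyy-MM-dd} format; only used when {@code tsn} is 5
 * @param endTime
 *            end date in {@code yyyy-MM-dd} format; only used when {@code tsn} is 5
 * @param cookie
 *            cookie of a logged-in user, or {@code null} to search without logging in
 *            (at most 10 pages are available without login)
 * @param proxy
 *            proxy used for the search result page requests
 * @param proxyHolder
 *            proxy holder used when fetching the full text of each article
 * @return the list of {@code WechatAricle} entries parsed from the search results
 * @throws Exception
 *             declared by the method signature for failures while fetching a search page
 * @throws UnsupportedEncodingException
 *             if the keyword cannot be URL-encoded as UTF-8
 */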
public static List<WechatAricle> wechatKeywordSearch(String word, int tsn, String startTime, String endTime,
String cookie, Proxy proxy,ProxyHolder proxyHolder) throws Exception, UnsupportedEncodingException {
List<WechatAricle> result = new ArrayList<WechatAricle>();
Map<String, String> headerMap = HeaderTool.getCommonHead();
headerMap.put("Host", "weixin.sogou.com");
if (cookie != null) {
headerMap.put("Cookie", cookie);
}
boolean f = true;
int page = 1;
while (f) {
String url = "http://weixin.sogou.com/weixin?type=2&query=" + URLEncoder.encode(word, "UTF-8")
+ "&ie=utf8&_sug_=n&_sug_type_=" + "&ri=1&sourceid=sugg&sst0=" + System.currentTimeMillis()
+ "&tsn=" + tsn + "&page=" + page;
if (tsn == 5) {
url = url + "&ft=" + startTime + "&et=" + endTime + "&wxid=&usip=&interation=&from=tool";
}
headerMap.put("Referer", url);
// 获取数据
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy, false).body().string();
// 解析数据
if (htmlBody != null) {
try {
// 解析数据
Document document = Jsoup.parse(htmlBody);
Elements elements = document.select("div.news-box").select("ul.news-list").select("li");
String title = null;
String link = null;
String content = null;
String source = null;
String openid = null;
String putDate = null;
Date date = null;
WechatAricle wechat = null;
for (Element element : elements) {
try {
title = element.select("div.txt-box").select("h3").text();
link = element.select("div.txt-box").select("h3 >a").attr("href");
content = "";
if (element.select("p.txt-info").isEmpty()) {
content = element.select("p.txt-info").text();
} else {
content = element.select("div.txt-box").select("p.txt-info").text();
}
// System.out.println("content======================"+content);
source = element.select("div.txt-box").select("div.s-p").select("a").text();
openid = element.select("div.txt-box").select("div.s-p").select("a").attr("i");
putDate = element.select("div.txt-box").select("div.s-p").attr("t");
date = new Date(Long.valueOf(putDate) * 1000);
int readNum = 0;
try {
readNum = Integer.valueOf(element.select("div.txt-box").select("div.s-p")
.select("span.s1").text().trim());
} catch (Exception e) {
readNum = 0;
}
title = ZhiWeiTools.SBC2DBC(title);
content = ZhiWeiTools.SBC2DBC(content);
wechat = new WechatAricle(link, title, source, content, date, readNum, 0, openid, "unknow");
wechat = getContentAndSource(link, proxyHolder, wechat);
result.add(wechat);
} catch (Exception e) {
logger.debug("解析数据出现错误:{}", e.getMessage());
......@@ -155,7 +274,7 @@ public class WechatAritcleSearch {
* @return
* @throws IOException
*/
private static WechatAricle getContentAndSource(String url, Proxy proxy,WechatAricle wechatAricle){
private static WechatAricle getContentAndSource(String url, ProxyHolder proxy,WechatAricle wechatAricle){
try {
// String htmlBody = HttpClientTemplateOK.get(url, proxy, null);
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy).body().string();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment