Commit 87c407d1 by [zhangzhiwei]

微信添加全文及来源采集

parent fd3dac6f
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId> <groupId>com.zhiwei</groupId>
<artifactId>wechat</artifactId> <artifactId>wechat</artifactId>
<version>1.1.0-SNAPSHOT</version> <version>1.1.1-SNAPSHOT</version>
<description> <description>
知微微信采集程序,包含 知微微信采集程序,包含
1.微信历史文章采集 1.微信历史文章采集
......
...@@ -26,134 +26,126 @@ import com.zhiwei.wechat.entity.WechatAricle; ...@@ -26,134 +26,126 @@ import com.zhiwei.wechat.entity.WechatAricle;
import com.zhiwei.wechat.util.Tools; import com.zhiwei.wechat.util.Tools;
/** /**
* @ClassName: WechatAritcleSearch * @ClassName: WechatAritcleSearch
* @Description: TODO(在搜索接口根据关键词采集微信文章) * @Description: TODO(在搜索接口根据关键词采集微信文章)
* @author Bewilder Z * @author Bewilder Z
* @date 2016年10月14日 上午9:40:18 * @date 2016年10月14日 上午9:40:18
*/ */
public class WechatAritcleSearch { public class WechatAritcleSearch {
private static Logger logger = LogManager.getLogger(WechatAritcleSearch.class); private static Logger logger = LogManager.getLogger(WechatAritcleSearch.class);
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot();
/** /**
* *
* @Title: wechatKeywordSearch * @Title: wechatKeywordSearch
* @Description: TODO(根据关键词在搜狗微信搜索微信文章) * @Description: TODO(根据关键词在搜狗微信搜索微信文章)
* @param @param word 关键词 * @param @param
* @param @param tsn 采集时间范围:1(1天内);2(一周内);3(一月内);4(一年内); * word 关键词
* 5(某一时间段内与startTime和endTime配合使用) * @param @param
* @param @param startTime 开始时间 格式为yyyy-MM-dd * tsn 采集时间范围:1(1天内);2(一周内);3(一月内);4(一年内);
* @param @param endTime 结束时间 格式为yyyy-MM-dd * 5(某一时间段内与startTime和endTime配合使用)
* @param @param cookie 用户登录后的cookie(不登录最多10页) * @param @param
* startTime 开始时间 格式为yyyy-MM-dd
* @param @param
* endTime 结束时间 格式为yyyy-MM-dd
* @param @param
* cookie 用户登录后的cookie(不登录最多10页)
* @param @return * @param @return
* @param @throws ZhiWeiException * @param @throws
* @param @throws UnsupportedEncodingException 设定文件 * ZhiWeiException
* @param @throws
* UnsupportedEncodingException 设定文件
* @return List<Wechat> 返回类型 * @return List<Wechat> 返回类型
*/ */
public static List<WechatAricle> wechatKeywordSearch(String word, int tsn, public static List<WechatAricle> wechatKeywordSearch(String word, int tsn, String startTime, String endTime,
String startTime,String endTime,String cookie,Proxy proxy) String cookie, Proxy proxy) throws Exception, UnsupportedEncodingException {
throws Exception, UnsupportedEncodingException
{
List<WechatAricle> result = new ArrayList<WechatAricle>(); List<WechatAricle> result = new ArrayList<WechatAricle>();
Map<String,String> headerMap = HeaderTool.getCommonHead(); Map<String, String> headerMap = HeaderTool.getCommonHead();
headerMap.put("Host","weixin.sogou.com"); headerMap.put("Host", "weixin.sogou.com");
if(cookie!=null){ if (cookie != null) {
headerMap.put("Cookie",cookie); headerMap.put("Cookie", cookie);
} }
boolean f = true; boolean f = true;
int page = 1; int page = 1;
while(f) while (f) {
{ String url = "http://weixin.sogou.com/weixin?type=2&query=" + URLEncoder.encode(word, "UTF-8")
String url = "http://weixin.sogou.com/weixin?type=2&query=" + "&ie=utf8&_sug_=n&_sug_type_=" + "&ri=1&sourceid=sugg&sst0=" + System.currentTimeMillis()
+ URLEncoder.encode(word,"UTF-8")+"&ie=utf8&_sug_=n&_sug_type_=" + "&tsn=" + tsn + "&page=" + page;
+ "&ri=1&sourceid=sugg&sst0="+System.currentTimeMillis() if (tsn == 5) {
+"&tsn="+tsn + "&page="+page; url = url + "&ft=" + startTime + "&et=" + endTime + "&wxid=&usip=&interation=&from=tool";
if(tsn==5)
{
url = url + "&ft="+startTime + "&et="+ endTime
+ "&wxid=&usip=&interation=&from=tool";
} }
headerMap.put("Referer",url); headerMap.put("Referer", url);
//获取数据 // 获取数据
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy, false).body().string(); String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy, false).body().string();
//解析数据 // 解析数据
if(htmlBody != null) if (htmlBody != null) {
{
try { try {
// 解析数据 // 解析数据
Document document = Jsoup.parse(htmlBody); Document document = Jsoup.parse(htmlBody);
Elements elements = document.select("div.news-box") Elements elements = document.select("div.news-box").select("ul.news-list").select("li");
.select("ul.news-list").select("li");
String title = null; String title = null;
String link = null; String link = null;
String content = null; String content = null;
String source = null; String source = null;
String openid = null; String openid = null;
String putDate = null; String putDate = null;
Date date = null; Date date = null;
WechatAricle wechat = null; WechatAricle wechat = null;
for (Element element : elements) for (Element element : elements) {
{
try { try {
title = element.select("div.txt-box").select("h3").text(); title = element.select("div.txt-box").select("h3").text();
link = element.select("div.txt-box").select("h3 >a").attr("href"); link = element.select("div.txt-box").select("h3 >a").attr("href");
content = ""; content = "";
if(element.select("p.txt-info").isEmpty()) if (element.select("p.txt-info").isEmpty()) {
{
content = element.select("p.txt-info").text(); content = element.select("p.txt-info").text();
}else } else {
{
content = element.select("div.txt-box").select("p.txt-info").text(); content = element.select("div.txt-box").select("p.txt-info").text();
} }
// System.out.println("content======================"+content); // System.out.println("content======================"+content);
source = element.select("div.txt-box").select("div.s-p") source = element.select("div.txt-box").select("div.s-p").select("a").text();
.select("a").text(); openid = element.select("div.txt-box").select("div.s-p").select("a").attr("i");
openid = element.select("div.txt-box") putDate = element.select("div.txt-box").select("div.s-p").attr("t");
.select("div.s-p").select("a").attr("i");
putDate = element.select("div.txt-box")
.select("div.s-p").attr("t");
date = new Date(Long.valueOf(putDate) * 1000); date = new Date(Long.valueOf(putDate) * 1000);
int readNum = 0; int readNum = 0;
try { try {
readNum = Integer.valueOf(element.select("div.txt-box") readNum = Integer.valueOf(element.select("div.txt-box").select("div.s-p")
.select("div.s-p").select("span.s1").text().trim()); .select("span.s1").text().trim());
} catch (Exception e) { } catch (Exception e) {
readNum = 0; readNum = 0;
} }
title = ZhiWeiTools.SBC2DBC(title); title = ZhiWeiTools.SBC2DBC(title);
content = ZhiWeiTools.SBC2DBC(content); content = ZhiWeiTools.SBC2DBC(content);
wechat = new WechatAricle(link, title, source, content, date, readNum, 0,openid,"unknow"); wechat = new WechatAricle(link, title, source, content, date, readNum, 0, openid, "unknow");
wechat = getContentAndSource(url, proxy, headerMap, wechat); wechat = getContentAndSource(link, proxy, wechat);
result.add(wechat); result.add(wechat);
} catch (Exception e) { } catch (Exception e) {
logger.debug("解析数据出现错误:{}",e.getMessage()); logger.debug("解析数据出现错误:{}", e.getMessage());
continue; continue;
} }
} }
// 解析最大可寻页码 // 解析最大可寻页码
String pageNext = document.select("[id=pagebar_container]>a").text(); String pageNext = document.select("[id=pagebar_container]>a").text();
if(pageNext.contains("下一页")){ if (pageNext.contains("下一页")) {
page++; page++;
}else{ } else {
f = false; f = false;
} }
// logger.info("数据总页数为:{}", page); // logger.info("数据总页数为:{}", page);
} catch (Exception e) { } catch (Exception e) {
logger.debug("获取数据出现问题:{}",e.getMessage()); logger.debug("获取数据出现问题:{}", e.getMessage());
return null; return null;
} }
}else } else {
{
logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", htmlBody); logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", htmlBody);
} }
// ZhiWeiTools.sleep(100); // ZhiWeiTools.sleep(100);
} }
return result; return result;
} }
/** /**
* 获取全文及来源 * 获取全文及来源
* @param url * @param url
...@@ -163,45 +155,68 @@ public class WechatAritcleSearch { ...@@ -163,45 +155,68 @@ public class WechatAritcleSearch {
* @return * @return
* @throws IOException * @throws IOException
*/ */
private static WechatAricle getContentAndSource(String url, Proxy proxy, Map<String,String> headerMap,WechatAricle wechatAricle) throws IOException{ private static WechatAricle getContentAndSource(String url, Proxy proxy,WechatAricle wechatAricle){
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy, false).body().string(); try {
if(htmlBody!=null){ // String htmlBody = HttpClientTemplateOK.get(url, proxy, null);
Document document = Jsoup.parse(htmlBody); String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy).body().string();
String content = document.select("div#js_content").text(); if(htmlBody!=null){
String source = document.select("a#js_name").text(); Document document = Jsoup.parse(htmlBody);
if(content!=null){ String content = null;
wechatAricle.setContent(content); String source = null;
} String text = null;
if(source!=null){ if(htmlBody.contains("js_article")){
wechatAricle.setSource(source); content = document.select("div#js_article").text();
}else if(htmlBody.contains("js_share_content")){
content = document.select("div#js_share_content").text();
}
if(htmlBody.contains("content_tpl")){
text = document.select("script#content_tpl").html();
text = Jsoup.parse(text).text();
}
content = content+text;
if(htmlBody.contains("js_name")){
source = document.select("a#js_name").text().trim();
}else if(htmlBody.contains("account_nickname")){
source = document.select("div.account_nickname").text().trim();
}
// System.out.println(source+"=========="+content);
if(content!=null && content.length()>50){
wechatAricle.setContent(content);
}
if(source!=null && content.length()>0){
wechatAricle.setSource(source);
}
} }
} catch (Exception e) {
e.printStackTrace();
return wechatAricle;
} }
return wechatAricle; return wechatAricle;
} }
// public static void main(String[] args) {
// String url = "https://mp.weixin.qq.com/s?src=11&timestamp=1540521001&ver=1205&signature=12dtyhMA3Xi7lzUhGUFyEpJmWPlnaLAwDVXMUi-tcFXHJbIYDKuLm76sdQUAZxkEjyGby22amJ4AnxIM4oS0ivtAS6ibs4F3OO8-jwoFLk4Pd6d8AhZdj94Z1gQdhdIQ&new=1";
// getContentAndSource(url, null, null);
// }
/** /**
* @Title: getOpenId * @Title: getOpenId
* @Description: TODO(获取微信wxID) * @Description: TODO(获取微信wxID)
* @param @param wxId * @param @param
* @param @return 设定文件 * wxId
* @param @return
* 设定文件
* @return String 返回类型 * @return String 返回类型
*/ */
public static String getOpenId(String wxId) public static String getOpenId(String wxId) {
{
String openId = null; String openId = null;
String url = "http://weixin.sogou.com/weixin?zhnss=1&type=1&ie=utf8&query="+wxId; String url = "http://weixin.sogou.com/weixin?zhnss=1&type=1&ie=utf8&query=" + wxId;
Map<String,String> headerMap = Tools.getWechatHeader(); Map<String, String> headerMap = Tools.getWechatHeader();
String htmlBody; String htmlBody;
try { try {
htmlBody = HttpClientTemplateOK.get(url, null,headerMap); htmlBody = HttpClientTemplateOK.get(url, null, headerMap);
if(htmlBody!=null) if (htmlBody != null) {
{
JSONObject json = JSONObject.parseObject(htmlBody); JSONObject json = JSONObject.parseObject(htmlBody);
openId = json.getString("openid"); openId = json.getString("openid");
} }
...@@ -209,8 +224,8 @@ public class WechatAritcleSearch { ...@@ -209,8 +224,8 @@ public class WechatAritcleSearch {
openId = null; openId = null;
e.printStackTrace(); e.printStackTrace();
} }
return openId; return openId;
} }
} }
//package com.zhiwei.wechat.example; package com.zhiwei.wechat.example;
//
//import java.io.UnsupportedEncodingException; import java.io.UnsupportedEncodingException;
//import java.net.UnknownHostException; import java.net.UnknownHostException;
//import java.util.ArrayList; import java.util.ArrayList;
//import java.util.List; import java.util.List;
//
//import org.junit.Test; import org.slf4j.Logger;
//import org.slf4j.Logger; import org.slf4j.LoggerFactory;
//import org.slf4j.LoggerFactory;
// import com.zhiwei.wechat.entity.WechatAricle;
//import com.zhiwei.wechat.entity.WechatAricle; import com.zhiwei.wechat.search.WechatAritcleSearch;
//import com.zhiwei.wechat.search.WechatAritcleSearch; import com.zhiwei.wechat.util.Tools;
//import com.zhiwei.wechat.util.Tools;
// /**
///** * @ClassName: WechatSearchExample
// * @ClassName: WechatSearchExample * @Description: TODO(根据关键词等采集数据)
// * @Description: TODO(根据关键词等采集数据) * @author hero
// * @author hero * @date 2016年12月16日 上午9:15:42
// * @date 2016年12月16日 上午9:15:42 */
// */ public class WechatSearchExample{
//public class WechatSearchExample{
// private Logger logger = LoggerFactory.getLogger(WechatSearchExample.class);
// private Logger logger = LoggerFactory.getLogger(WechatSearchExample.class);
//
// public static String cookie = "IPLOC=CN3302; SUID=EAD6E7733220910A000000005941E93A; SUV=1497491773102567; ABTEST=7|1497603317|v1; weixinIndexVisited=1; ppinf=5|1498107937|1499317537|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZToyNzolRTYlOUElOTclRTYlQkElOUYlRTYlODMlOTF8Y3J0OjEwOjE0OTgxMDc5Mzd8cmVmbmljazoyNzolRTYlOUElOTclRTYlQkElOUYlRTYlODMlOTF8dXNlcmlkOjQ0Om85dDJsdUJ6dUhpQ2IxcnB3OUZ0QWk4WTN5S0lAd2VpeGluLnNvaHUuY29tfA; pprdig=I4bAcCm_wsn8RDnyejcfFQ-1gxkd2q3VhMOcLSGlyEXZaT3Oq0fbbNN1wslhlmUEMSAMcqhwDG46ZYpKwnHMjFWGtWLqB0qzu8HfI0uCja08CIEt6hWICe66kYCzJNvEiXuu26wBjE47Zivcb8p4XD1CSxh5qRl59DYYDFXIrzM; sgid=08-27429961-AVlLUCFlKgO0FEox1ElfuR0; ld=Jlllllllll2ByW6ElllllVOUXJkllllltMKQfkllllwlllll4ylll5@@@@@@@@@@; LSTMV=405%2C353; LCLKINT=8709; SNUID=B08DBC295B5F0970DCAD6F2C5B1D68B2; ppmdig=1498817001000000c7e9b5e47114b70495487a6f03e36c6c; JSESSIONID=aaavdFFFwNH4Y_-_f0OZv; sct=10"; public static void main(String[] args) {
// try {
// @Test WechatSearchExample.wechatSearchExample();
// public void wechatSearchExample() throws UnknownHostException } catch (UnknownHostException e) {
// { e.printStackTrace();
// List<String> wordList = new ArrayList<String>(); }
// wordList.add("王石"); }
//
// String fileName = "E:\\微博mid.csv";
// List<String> wechatIds= Tools.getFileName(fileName); public static void wechatSearchExample() throws UnknownHostException
// {
// for(String word : wordList) List<String> wordList = new ArrayList<String>();
// { wordList.add("马化腾 知乎");
//
// try { for(String word : wordList)
// List<WechatAricle> list = WechatAritcleSearch.wechatKeywordSearch(word, 5, "2017-06-25", "2017-06-25", cookie); {
// } catch (UnsupportedEncodingException e) { try {
// e.printStackTrace(); List<WechatAricle> list = WechatAritcleSearch.wechatKeywordSearch(word, 5, "2018-10-25", "2018-10-25",null,null);
// } catch (Exception e) { } catch (UnsupportedEncodingException e) {
// e.printStackTrace(); e.printStackTrace();
} catch (Exception e) {
e.printStackTrace();
}
// for(String wxId : wechatIds)
// {
// try {
// logger.info("需要采集的wxId:::{}", wxId);
//
// } catch (UnsupportedEncodingException e) {
// e.printStackTrace();
// } catch (Exception e) {
// e.printStackTrace();
// }
// } // }
//// for(String wxId : wechatIds) }
//// { }
//// try {
//// logger.info("需要采集的wxId:::{}", wxId);
//// }
//// } catch (UnsupportedEncodingException e) {
//// e.printStackTrace();
//// } catch (Exception e) {
//// e.printStackTrace();
//// }
//// }
// }
// }
//
//
//}
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment