Commit 87c407d1 by [zhangzhiwei]

微信添加全文及来源采集

parent fd3dac6f
......@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>wechat</artifactId>
<version>1.1.0-SNAPSHOT</version>
<version>1.1.1-SNAPSHOT</version>
<description>
知微微信采集程序,包含
1.微信历史文章采集
......
......@@ -26,134 +26,126 @@ import com.zhiwei.wechat.entity.WechatAricle;
import com.zhiwei.wechat.util.Tools;
/**
* @ClassName: WechatAritcleSearch
* @Description: TODO(在搜索接口根据关键词采集微信文章)
* @author Bewilder Z
* @ClassName: WechatAritcleSearch
* @Description: TODO(在搜索接口根据关键词采集微信文章)
* @author Bewilder Z
* @date 2016年10月14日 上午9:40:18
*/
public class WechatAritcleSearch {
private static Logger logger = LogManager.getLogger(WechatAritcleSearch.class);
private static HttpBoot httpBoot = new HttpBoot();
/**
*
* @Title: wechatKeywordSearch
* @Description: TODO(根据关键词在搜狗微信搜索微信文章)
* @param @param word 关键词
* @param @param tsn 采集时间范围:1(1天内);2(一周内);3(一月内);4(一年内);
* 5(某一时间段内与startTime和endTime配合使用)
* @param @param startTime 开始时间 格式为yyyy-MM-dd
* @param @param endTime 结束时间 格式为yyyy-MM-dd
* @param @param cookie 用户登录后的cookie(不登录最多10页)
* @Title: wechatKeywordSearch
* @Description: TODO(根据关键词在搜狗微信搜索微信文章)
* @param @param
* word 关键词
* @param @param
* tsn 采集时间范围:1(1天内);2(一周内);3(一月内);4(一年内);
* 5(某一时间段内与startTime和endTime配合使用)
* @param @param
* startTime 开始时间 格式为yyyy-MM-dd
* @param @param
* endTime 结束时间 格式为yyyy-MM-dd
* @param @param
* cookie 用户登录后的cookie(不登录最多10页)
* @param @return
* @param @throws ZhiWeiException
* @param @throws UnsupportedEncodingException 设定文件
* @param @throws
* ZhiWeiException
* @param @throws
* UnsupportedEncodingException 设定文件
* @return List<Wechat> 返回类型
*/
public static List<WechatAricle> wechatKeywordSearch(String word, int tsn,
String startTime,String endTime,String cookie,Proxy proxy)
throws Exception, UnsupportedEncodingException
{
public static List<WechatAricle> wechatKeywordSearch(String word, int tsn, String startTime, String endTime,
String cookie, Proxy proxy) throws Exception, UnsupportedEncodingException {
List<WechatAricle> result = new ArrayList<WechatAricle>();
Map<String,String> headerMap = HeaderTool.getCommonHead();
headerMap.put("Host","weixin.sogou.com");
if(cookie!=null){
headerMap.put("Cookie",cookie);
Map<String, String> headerMap = HeaderTool.getCommonHead();
headerMap.put("Host", "weixin.sogou.com");
if (cookie != null) {
headerMap.put("Cookie", cookie);
}
boolean f = true;
int page = 1;
while(f)
{
String url = "http://weixin.sogou.com/weixin?type=2&query="
+ URLEncoder.encode(word,"UTF-8")+"&ie=utf8&_sug_=n&_sug_type_="
+ "&ri=1&sourceid=sugg&sst0="+System.currentTimeMillis()
+"&tsn="+tsn + "&page="+page;
if(tsn==5)
{
url = url + "&ft="+startTime + "&et="+ endTime
+ "&wxid=&usip=&interation=&from=tool";
while (f) {
String url = "http://weixin.sogou.com/weixin?type=2&query=" + URLEncoder.encode(word, "UTF-8")
+ "&ie=utf8&_sug_=n&_sug_type_=" + "&ri=1&sourceid=sugg&sst0=" + System.currentTimeMillis()
+ "&tsn=" + tsn + "&page=" + page;
if (tsn == 5) {
url = url + "&ft=" + startTime + "&et=" + endTime + "&wxid=&usip=&interation=&from=tool";
}
headerMap.put("Referer",url);
//获取数据
headerMap.put("Referer", url);
// 获取数据
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy, false).body().string();
//解析数据
if(htmlBody != null)
{
// 解析数据
if (htmlBody != null) {
try {
// 解析数据
Document document = Jsoup.parse(htmlBody);
Elements elements = document.select("div.news-box")
.select("ul.news-list").select("li");
Elements elements = document.select("div.news-box").select("ul.news-list").select("li");
String title = null;
String link = null;
String link = null;
String content = null;
String source = null;
String openid = null;
String putDate = null;
Date date = null;
WechatAricle wechat = null;
for (Element element : elements)
{
for (Element element : elements) {
try {
title = element.select("div.txt-box").select("h3").text();
link = element.select("div.txt-box").select("h3 >a").attr("href");
content = "";
if(element.select("p.txt-info").isEmpty())
{
if (element.select("p.txt-info").isEmpty()) {
content = element.select("p.txt-info").text();
}else
{
} else {
content = element.select("div.txt-box").select("p.txt-info").text();
}
// System.out.println("content======================"+content);
source = element.select("div.txt-box").select("div.s-p")
.select("a").text();
openid = element.select("div.txt-box")
.select("div.s-p").select("a").attr("i");
putDate = element.select("div.txt-box")
.select("div.s-p").attr("t");
source = element.select("div.txt-box").select("div.s-p").select("a").text();
openid = element.select("div.txt-box").select("div.s-p").select("a").attr("i");
putDate = element.select("div.txt-box").select("div.s-p").attr("t");
date = new Date(Long.valueOf(putDate) * 1000);
int readNum = 0;
try {
readNum = Integer.valueOf(element.select("div.txt-box")
.select("div.s-p").select("span.s1").text().trim());
readNum = Integer.valueOf(element.select("div.txt-box").select("div.s-p")
.select("span.s1").text().trim());
} catch (Exception e) {
readNum = 0;
}
title = ZhiWeiTools.SBC2DBC(title);
content = ZhiWeiTools.SBC2DBC(content);
wechat = new WechatAricle(link, title, source, content, date, readNum, 0,openid,"unknow");
wechat = getContentAndSource(url, proxy, headerMap, wechat);
wechat = new WechatAricle(link, title, source, content, date, readNum, 0, openid, "unknow");
wechat = getContentAndSource(link, proxy, wechat);
result.add(wechat);
} catch (Exception e) {
logger.debug("解析数据出现错误:{}",e.getMessage());
logger.debug("解析数据出现错误:{}", e.getMessage());
continue;
}
}
// 解析最大可寻页码
String pageNext = document.select("[id=pagebar_container]>a").text();
if(pageNext.contains("下一页")){
if (pageNext.contains("下一页")) {
page++;
}else{
} else {
f = false;
}
// logger.info("数据总页数为:{}", page);
// logger.info("数据总页数为:{}", page);
} catch (Exception e) {
logger.debug("获取数据出现问题:{}",e.getMessage());
logger.debug("获取数据出现问题:{}", e.getMessage());
return null;
}
}else
{
} else {
logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", htmlBody);
}
// ZhiWeiTools.sleep(100);
// ZhiWeiTools.sleep(100);
}
return result;
}
/**
* 获取全文及来源
* @param url
......@@ -163,45 +155,68 @@ public class WechatAritcleSearch {
* @return
* @throws IOException
*/
private static WechatAricle getContentAndSource(String url, Proxy proxy, Map<String,String> headerMap,WechatAricle wechatAricle) throws IOException{
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy, false).body().string();
if(htmlBody!=null){
Document document = Jsoup.parse(htmlBody);
String content = document.select("div#js_content").text();
String source = document.select("a#js_name").text();
if(content!=null){
wechatAricle.setContent(content);
}
if(source!=null){
wechatAricle.setSource(source);
/**
 * Downloads an article page and copies its full text and account name onto {@code wechatAricle}.
 *
 * <p>The text is taken from {@code div#js_article}, or from {@code div#js_share_content} when
 * the former yields nothing, followed by the text of the {@code script#content_tpl} template
 * if there is one. The account name is taken from {@code a#js_name}, or from
 * {@code div.account_nickname} when the former yields nothing. Values already present on the
 * article are only overwritten when the page actually provides a replacement.
 *
 * @param url          address of the article page
 * @param proxy        proxy handed to the HTTP client
 * @param wechatAricle article to update in place
 * @return the same {@code wechatAricle} instance, updated where possible; failures are logged
 *         and leave the article unchanged
 */
private static WechatAricle getContentAndSource(String url, Proxy proxy, WechatAricle wechatAricle) {
    try {
        String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy).body().string();
        if (htmlBody == null) {
            return wechatAricle;
        }
        Document document = Jsoup.parse(htmlBody);

        // Build the text from the parts that exist, so a missing part never contributes "null".
        StringBuilder content = new StringBuilder();
        String articleText = document.select("div#js_article").text();
        if (articleText.isEmpty()) {
            articleText = document.select("div#js_share_content").text();
        }
        content.append(articleText);
        // The template script holds HTML markup; parse it again to get plain text.
        String template = document.select("script#content_tpl").html();
        if (!template.isEmpty()) {
            content.append(Jsoup.parse(template).text());
        }
        // Threshold kept from the original; presumably it filters out stub pages - TODO confirm.
        if (content.length() > 50) {
            wechatAricle.setContent(content.toString());
        }

        String source = document.select("a#js_name").text().trim();
        if (source.isEmpty()) {
            source = document.select("div.account_nickname").text().trim();
        }
        // Only replace the existing source when the page really names one.
        if (!source.isEmpty()) {
            wechatAricle.setSource(source);
        }
    } catch (Exception e) {
        // Best effort: the caller keeps the data it already has.
        logger.warn("Failed to fetch full text and source from " + url, e);
    }
    return wechatAricle;
}
// public static void main(String[] args) {
// String url = "https://mp.weixin.qq.com/s?src=11&timestamp=1540521001&ver=1205&signature=12dtyhMA3Xi7lzUhGUFyEpJmWPlnaLAwDVXMUi-tcFXHJbIYDKuLm76sdQUAZxkEjyGby22amJ4AnxIM4oS0ivtAS6ibs4F3OO8-jwoFLk4Pd6d8AhZdj94Z1gQdhdIQ&new=1";
// getContentAndSource(url, null, null);
// }
/**
* @Title: getOpenId
* @Description: TODO(获取微信wxID)
* @param @param wxId
* @param @return 设定文件
* @Title: getOpenId
* @Description: TODO(获取微信wxID)
* @param @param
* wxId
* @param @return
* 设定文件
* @return String 返回类型
*/
public static String getOpenId(String wxId)
{
public static String getOpenId(String wxId) {
String openId = null;
String url = "http://weixin.sogou.com/weixin?zhnss=1&type=1&ie=utf8&query="+wxId;
Map<String,String> headerMap = Tools.getWechatHeader();
String url = "http://weixin.sogou.com/weixin?zhnss=1&type=1&ie=utf8&query=" + wxId;
Map<String, String> headerMap = Tools.getWechatHeader();
String htmlBody;
try {
htmlBody = HttpClientTemplateOK.get(url, null,headerMap);
if(htmlBody!=null)
{
htmlBody = HttpClientTemplateOK.get(url, null, headerMap);
if (htmlBody != null) {
JSONObject json = JSONObject.parseObject(htmlBody);
openId = json.getString("openid");
}
......@@ -209,8 +224,8 @@ public class WechatAritcleSearch {
openId = null;
e.printStackTrace();
}
return openId;
}
}
//package com.zhiwei.wechat.example;
//
//import java.io.UnsupportedEncodingException;
//import java.net.UnknownHostException;
//import java.util.ArrayList;
//import java.util.List;
//
//import org.junit.Test;
//import org.slf4j.Logger;
//import org.slf4j.LoggerFactory;
//
//import com.zhiwei.wechat.entity.WechatAricle;
//import com.zhiwei.wechat.search.WechatAritcleSearch;
//import com.zhiwei.wechat.util.Tools;
//
///**
// * @ClassName: WechatSearchExample
// * @Description: TODO(根据关键词等采集数据)
// * @author hero
// * @date 2016年12月16日 上午9:15:42
// */
//public class WechatSearchExample{
//
// private Logger logger = LoggerFactory.getLogger(WechatSearchExample.class);
//
// public static String cookie = "IPLOC=CN3302; SUID=EAD6E7733220910A000000005941E93A; SUV=1497491773102567; ABTEST=7|1497603317|v1; weixinIndexVisited=1; ppinf=5|1498107937|1499317537|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZToyNzolRTYlOUElOTclRTYlQkElOUYlRTYlODMlOTF8Y3J0OjEwOjE0OTgxMDc5Mzd8cmVmbmljazoyNzolRTYlOUElOTclRTYlQkElOUYlRTYlODMlOTF8dXNlcmlkOjQ0Om85dDJsdUJ6dUhpQ2IxcnB3OUZ0QWk4WTN5S0lAd2VpeGluLnNvaHUuY29tfA; pprdig=I4bAcCm_wsn8RDnyejcfFQ-1gxkd2q3VhMOcLSGlyEXZaT3Oq0fbbNN1wslhlmUEMSAMcqhwDG46ZYpKwnHMjFWGtWLqB0qzu8HfI0uCja08CIEt6hWICe66kYCzJNvEiXuu26wBjE47Zivcb8p4XD1CSxh5qRl59DYYDFXIrzM; sgid=08-27429961-AVlLUCFlKgO0FEox1ElfuR0; ld=Jlllllllll2ByW6ElllllVOUXJkllllltMKQfkllllwlllll4ylll5@@@@@@@@@@; LSTMV=405%2C353; LCLKINT=8709; SNUID=B08DBC295B5F0970DCAD6F2C5B1D68B2; ppmdig=1498817001000000c7e9b5e47114b70495487a6f03e36c6c; JSESSIONID=aaavdFFFwNH4Y_-_f0OZv; sct=10";
//
// @Test
// public void wechatSearchExample() throws UnknownHostException
// {
// List<String> wordList = new ArrayList<String>();
// wordList.add("王石");
//
// String fileName = "E:\\微博mid.csv";
// List<String> wechatIds= Tools.getFileName(fileName);
//
// for(String word : wordList)
// {
//
// try {
// List<WechatAricle> list = WechatAritcleSearch.wechatKeywordSearch(word, 5, "2017-06-25", "2017-06-25", cookie);
// } catch (UnsupportedEncodingException e) {
// e.printStackTrace();
// } catch (Exception e) {
// e.printStackTrace();
package com.zhiwei.wechat.example;
import java.io.UnsupportedEncodingException;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.wechat.entity.WechatAricle;
import com.zhiwei.wechat.search.WechatAritcleSearch;
import com.zhiwei.wechat.util.Tools;
/**
 * Manual example that runs a keyword search through
 * {@link WechatAritcleSearch#wechatKeywordSearch} and logs how many articles came back.
 *
 * @author hero
 */
public class WechatSearchExample {

    private static final Logger logger = LoggerFactory.getLogger(WechatSearchExample.class);

    /**
     * Runs the example from the command line.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            WechatSearchExample.wechatSearchExample();
        } catch (UnknownHostException e) {
            logger.error("WeChat search example failed", e);
        }
    }

    /**
     * Searches each sample keyword with the custom time range (tsn 5) for 2018-10-25, without
     * cookie and without proxy, and logs the number of articles found per keyword.
     *
     * @throws UnknownHostException kept in the signature for existing callers
     */
    public static void wechatSearchExample() throws UnknownHostException {
        List<String> wordList = new ArrayList<String>();
        wordList.add("马化腾 知乎");
        for (String word : wordList) {
            try {
                List<WechatAricle> list = WechatAritcleSearch.wechatKeywordSearch(word, 5, "2018-10-25",
                        "2018-10-25", null, null);
                // The search reports a page-level failure as null.
                int found = (list == null) ? 0 : list.size();
                logger.info("keyword [{}] returned {} articles", word, found);
            } catch (Exception e) {
                // Keep going with the next keyword.
                logger.error("search failed for keyword [" + word + "]", e);
            }
        }
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment