Commit 7db2a9e8 by zhiwei

分享链接消失并失效,解析改为有验证码的链接

parent 7ad96e77
<!-- Maven build descriptor for the com.zhiwei:wechat artifact (state before this commit). -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- Project coordinates -->
<groupId>com.zhiwei</groupId>
<artifactId>wechat</artifactId>
<version>1.1.7-SNAPSHOT</version>
<description>
知微微信采集程序,包含
1.微信历史文章采集
2.搜狗微信接口关键词采集
3.点赞阅读更新接口
4.根据关键词或微信id查询帐号信息
5.根据文章链接采集评论列表及评论数
</description>
<!-- Source files and generated reports are both treated as UTF-8 -->
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
</properties>
<developers>
<developer>
<id>Bewilder</id>
<name>zhiwei zhang</name>
<email>zhangzhiwei@zhiweidata.com</email>
</developer>
</developers>
<!-- Build / packaging configuration -->
<build>
<plugins>
<!-- Publish sources: the source plugin's "jar" goal is bound to the compile phase and the result is attached to the build -->
<plugin>
<artifactId>maven-source-plugin</artifactId>
<version>2.4</version>
<configuration>
<attach>true</attach>
</configuration>
<executions>
<execution>
<phase>compile</phase>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
<!-- Javadoc plugin: only the version is pinned here, no execution is bound -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>2.10.4</version>
</plugin>
<!-- Avoids garbled Chinese characters on the console when running "mvn test": the test JVM is started with -Dfile.encoding=UTF-8 -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.19.1</version>
<configuration>
<forkMode>once</forkMode>
<argLine>-Dfile.encoding=UTF-8</argLine>
</configuration>
</plugin>
</plugins>
</build>
<!-- Distribution management: the repositories that snapshot and release artifacts are deployed to -->
<!-- NOTE(review): the snapshot repository reuses the id "nexus-releases"; presumably intentional so both share one credentials entry in settings.xml - confirm -->
<distributionManagement>
<snapshotRepository>
<id>nexus-releases</id>
<name>User Porject Snapshot</name>
<url>http://192.168.0.30:8081/nexus/content/repositories/snapshots/</url>
<uniqueVersion>true</uniqueVersion>
</snapshotRepository>
<repository>
<id>nexus-releases</id>
<name>User Porject Release</name>
<url>http://192.168.0.30:8081/nexus/content/repositories/releases/</url>
</repository>
</distributionManagement>
<!-- Both dependencies use "provided" scope: they are on the compile classpath but must be supplied by whoever consumes this artifact -->
<dependencies>
<dependency>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.1.4-SNAPSHOT</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId>
<version>0.5.2-SNAPSHOT</version>
<scope>provided</scope>
</dependency>
</dependencies>
<!-- Maven build descriptor for the com.zhiwei:wechat artifact (state after this commit). -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- Project coordinates -->
<groupId>com.zhiwei</groupId>
<artifactId>wechat</artifactId>
<version>1.1.8-SNAPSHOT</version>
<description>
知微微信采集程序,包含
1.微信历史文章采集
2.搜狗微信接口关键词采集
3.点赞阅读更新接口
4.根据关键词或微信id查询帐号信息
5.根据文章链接采集评论列表及评论数
</description>
<!-- Source files and generated reports are both treated as UTF-8 -->
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
</properties>
<developers>
<developer>
<id>Bewilder</id>
<name>zhiwei zhang</name>
<email>zhangzhiwei@zhiweidata.com</email>
</developer>
</developers>
<!-- Build / packaging configuration -->
<build>
<plugins>
<!-- Publish sources: the source plugin's "jar" goal is bound to the compile phase and the result is attached to the build -->
<plugin>
<artifactId>maven-source-plugin</artifactId>
<version>2.4</version>
<configuration>
<attach>true</attach>
</configuration>
<executions>
<execution>
<phase>compile</phase>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
<!-- Javadoc plugin: only the version is pinned here, no execution is bound -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>2.10.4</version>
</plugin>
<!-- Avoids garbled Chinese characters on the console when running "mvn test": the test JVM is started with -Dfile.encoding=UTF-8 -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.19.1</version>
<configuration>
<forkMode>once</forkMode>
<argLine>-Dfile.encoding=UTF-8</argLine>
</configuration>
</plugin>
</plugins>
</build>
<!-- Distribution management: the repositories that snapshot and release artifacts are deployed to -->
<!-- NOTE(review): the snapshot repository reuses the id "nexus-releases"; presumably intentional so both share one credentials entry in settings.xml - confirm -->
<distributionManagement>
<snapshotRepository>
<id>nexus-releases</id>
<name>User Porject Snapshot</name>
<url>http://192.168.0.30:8081/nexus/content/repositories/snapshots/</url>
<uniqueVersion>true</uniqueVersion>
</snapshotRepository>
<repository>
<id>nexus-releases</id>
<name>User Porject Release</name>
<url>http://192.168.0.30:8081/nexus/content/repositories/releases/</url>
</repository>
</distributionManagement>
<!-- Both dependencies use "provided" scope: they are on the compile classpath but must be supplied by whoever consumes this artifact -->
<dependencies>
<dependency>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.1.4-SNAPSHOT</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId>
<version>0.5.5.6-SNAPSHOT</version>
<scope>provided</scope>
</dependency>
</dependencies>
</project>
\ No newline at end of file
package com.zhiwei.wechat.search;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.Proxy;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.tools.URLCodeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools;
import com.zhiwei.wechat.entity.WechatAricle;
/**
 * Collects WeChat articles by keyword through the Sogou WeChat search pages
 * ({@code weixin.sogou.com}).
 *
 * <p>Three public searches are offered: a plain keyword search that returns only what the
 * result list shows, a keyword search that additionally downloads each article page, and a
 * keyword search restricted to a single account. All of them walk the result pages for as long
 * as the pager offers a "下一页" (next page) link.
 *
 * <p>Created 2016-10-14.
 *
 * @author Bewilder Z
 */
public class WechatAritcleSearch {

    private static final Logger logger = LogManager.getLogger(WechatAritcleSearch.class);

    private static final HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();

    /**
     * Searches Sogou WeChat for articles matching a keyword; the article full text is not fetched.
     *
     * @param word      keyword; URL-encoded as UTF-8 before it is sent
     * @param tsn       time range selector: 1 (within a day), 2 (within a week), 3 (within a
     *                  month), 4 (within a year), 5 (custom range, uses {@code startTime} and
     *                  {@code endTime})
     * @param cookie    cookie of a logged-in user, sent only when not blank (without a login at
     *                  most 10 pages are available, according to the original author)
     * @param startTime range start, format {@code yyyy-MM-dd}; only used when {@code tsn == 5}
     * @param endTime   range end, format {@code yyyy-MM-dd}; only used when {@code tsn == 5}
     * @param proxy     proxy handed to the HTTP client
     * @param pages     page limit expressed as "first page NOT to fetch" (pass 21 to get the first
     *                  20 pages), or {@code null} for no limit
     * @return the articles parsed so far; never {@code null}. If a page cannot be processed the
     *         articles collected up to that point are returned.
     * @throws Exception if encoding the keyword or the HTTP call fails
     */
    public static List<WechatAricle> wechatKeywordSearch(String word, int tsn, String cookie, String startTime,
            String endTime, Proxy proxy, Integer pages) throws Exception {
        List<WechatAricle> result = new ArrayList<>();
        Map<String, String> headerMap = HeaderTool.getCommonHead();
        headerMap.put("Host", "weixin.sogou.com");
        if (StringUtils.isNotBlank(cookie)) {
            headerMap.put("cookie", cookie);
        }
        int page = 1;
        boolean hasNext = true;
        while (hasNext) {
            String searchUrl = buildKeywordSearchUrl(word, tsn, startTime, endTime, page);
            logger.debug("搜狗微信搜索请求: {}", searchUrl);
            headerMap.put("Referer", searchUrl);
            String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(searchUrl, headerMap), proxy, false)
                    .body().string();
            if (htmlBody == null) {
                logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", htmlBody);
                // Without a body the page counter cannot advance; stop instead of re-requesting forever.
                break;
            }
            try {
                hasNext = collectPage(htmlBody, result, true, false, null);
            } catch (Exception e) {
                logger.debug("获取数据出现问题:{}", e.getMessage());
                return result;
            }
            if (hasNext) {
                page++;
            }
            // ">=" so that limits of 1 or less still stop the crawl after the first page.
            if (pages != null && page >= pages) {
                break;
            }
        }
        return result;
    }

    /**
     * Searches Sogou WeChat for articles matching a keyword and downloads every hit's article
     * page to fill in full text and account details.
     *
     * @param word        keyword; URL-encoded as UTF-8 before it is sent
     * @param tsn         time range selector: 1 (within a day), 2 (within a week), 3 (within a
     *                    month), 4 (within a year), 5 (custom range, uses {@code startTime} and
     *                    {@code endTime})
     * @param startTime   range start, format {@code yyyy-MM-dd}; only used when {@code tsn == 5}
     * @param endTime     range end, format {@code yyyy-MM-dd}; only used when {@code tsn == 5}
     * @param proxy       proxy used for the search-page requests
     * @param proxyHolder supplies the proxy used for the article-page requests
     * @return the collected articles, or {@code null} if processing a result page failed (kept
     *         for compatibility with existing callers)
     * @throws Exception if encoding the keyword or the HTTP call fails
     * @throws UnsupportedEncodingException declared for compatibility
     */
    public static List<WechatAricle> wechatKeywordSearch(String word, int tsn, String startTime, String endTime,
            Proxy proxy, ProxyHolder proxyHolder) throws Exception, UnsupportedEncodingException {
        List<WechatAricle> result = new ArrayList<WechatAricle>();
        Map<String, String> headerMap = HeaderTool.getCommonHead();
        headerMap.put("Host", "weixin.sogou.com");
        int page = 1;
        boolean hasNext = true;
        while (hasNext) {
            String searchUrl = buildKeywordSearchUrl(word, tsn, startTime, endTime, page);
            headerMap.put("Referer", searchUrl);
            String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(searchUrl, headerMap), proxy, false)
                    .body().string();
            if (htmlBody == null) {
                logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", htmlBody);
                // Without a body the page counter cannot advance; stop instead of re-requesting forever.
                break;
            }
            try {
                hasNext = collectPage(htmlBody, result, false, true, proxyHolder);
            } catch (Exception e) {
                logger.debug("获取数据出现问题:{}", e.getMessage());
                return null;
            }
            if (hasNext) {
                page++;
            }
        }
        return result;
    }

    /**
     * Downloads an article page and copies full text and account details onto the article.
     *
     * <p>Best effort: any failure (request error, missing profile block, unexpected page layout)
     * is logged and the article is returned with whatever had been set before the failure.
     *
     * @param url          article page address
     * @param proxy        supplies the proxy for the request
     * @param wechatAricle article to enrich; when {@code null} a new one is created and its
     *                     title, time and source are taken from the page as well
     * @return the enriched (or, on failure, partially filled or unchanged) article
     */
    private static WechatAricle getWechatAricleInfo(String url, ProxyHolder proxy, WechatAricle wechatAricle) {
        try {
            String contentHtml = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy.getProxy()).body().string();
            String content = null;
            String time = null;
            String source = null;
            String biz = null;
            String title = null;
            String user_name = null;
            String wxId = null;
            if (contentHtml != null) {
                Document document = Jsoup.parse(contentHtml);
                title = document.select("title").text();
                // Throws when the profile block is absent; the catch below then leaves the article as it was.
                wxId = document.select("p.profile_meta").get(0).select("span.profile_meta_value").text();
                if (contentHtml.contains("js_article")) {
                    content = document.select("div#js_article").text();
                } else if (contentHtml.contains("js_share_content")) {
                    content = document.select("div#js_share_content").text();
                }
                // A content template script, when present, takes precedence over the containers above.
                if (contentHtml.contains("content_tpl")) {
                    String text = document.select("script#content_tpl").html();
                    content = Jsoup.parse(text).text();
                }
                // Account metadata is cut out of the inline script; two variable naming schemes are handled.
                if (contentHtml.contains("d.nick_name = ")) {
                    time = contentHtml.split("d.ct = \"")[1].split("\";")[0];
                    source = contentHtml.split("d.nick_name = \"")[1].split("\";")[0];
                    biz = contentHtml.split("d.biz = \"")[1].split("\"")[0];
                    user_name = contentHtml.split("d.user_name = \"")[1].split("\"")[0];
                } else if (contentHtml.contains("var nickname = ")) {
                    time = contentHtml.split("var ct = \"")[1].split("\";")[0];
                    source = contentHtml.split("var nickname = \"")[1].split("\";")[0];
                    biz = contentHtml.split("var appuin = \"\"||\"")[1].split("\"")[0];
                    user_name = contentHtml.split("var user_name = \"")[1].split("\"")[0];
                }
            }
            if (wechatAricle == null) {
                wechatAricle = new WechatAricle();
                wechatAricle.setTitle(title);
                // The page value is interpreted as epoch seconds.
                wechatAricle.setTime(new Date(Long.valueOf(time) * 1000));
                wechatAricle.setSource(source);
            }
            wechatAricle.setBiz(biz);
            wechatAricle.setContent(content);
            wechatAricle.setWxId(wxId);
            wechatAricle.setUser_name(user_name);
        } catch (Exception e) {
            logger.debug("获取微信文章全文失败: {}", url, e);
        }
        return wechatAricle;
    }

    /**
     * Searches one account's articles for a keyword within a date range and downloads every
     * hit's article page.
     *
     * @param word        keyword; URL-encoded as UTF-8 before it is sent
     * @param idOrName    account id or nickname; resolved to an openid through {@link #getOpenId}
     * @param startTime   range start, inserted into the URL as the {@code ft} parameter
     * @param endTime     range end, inserted into the URL as the {@code et} parameter
     * @param proxyHolder proxy source for both the search and the article requests
     * @return the collected articles; an empty list when no openid could be resolved; or
     *         {@code null} if processing a result page failed (kept for compatibility)
     * @throws IllegalArgumentException if {@code idOrName} is {@code null} or empty
     * @throws Exception if encoding or the HTTP call fails
     * @throws UnsupportedEncodingException declared for compatibility
     */
    public static List<WechatAricle> wechatKeywordSearchByAccount(String word, String idOrName, String startTime,
            String endTime, ProxyHolder proxyHolder) throws Exception, UnsupportedEncodingException {
        if (idOrName == null || idOrName.isEmpty()) {
            throw new IllegalArgumentException("要检索的昵称或id不能为空");
        }
        List<WechatAricle> result = new ArrayList<WechatAricle>();
        Map<String, String> headerMap = HeaderTool.getCommonHead();
        headerMap.put("Host", "weixin.sogou.com");
        String openId = getOpenId(idOrName, proxyHolder);
        boolean hasNext = openId != null;
        int page = 1;
        while (hasNext) {
            String searchUrl = "https://weixin.sogou.com/weixin?type=2&ie=utf8&query="
                    + URLEncoder.encode(word, "UTF-8")
                    + "&tsn=5&ft=" + startTime + "&et=" + endTime + "&interation=&page=" + page
                    + "&wxid=" + openId
                    + "&usip=" + URLEncoder.encode(idOrName, "UTF-8");
            headerMap.put("Referer", searchUrl);
            logger.debug("搜狗微信搜索请求: {}", searchUrl);
            String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(searchUrl, headerMap), proxyHolder, true)
                    .body().string();
            if (htmlBody == null) {
                logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", htmlBody);
                // Without a body the page counter cannot advance; stop instead of re-requesting forever.
                break;
            }
            try {
                hasNext = collectPage(htmlBody, result, false, true, proxyHolder);
            } catch (Exception e) {
                logger.debug("获取数据出现问题:{}", e.getMessage());
                return null;
            }
            if (hasNext) {
                page++;
            }
        }
        return result;
    }

    /**
     * Resolves an account id or nickname to the {@code openid} reported by Sogou.
     *
     * <p>The response body is parsed as JSON and its {@code openid} field is returned. The
     * request is attempted at most twice.
     *
     * @param idOrName    account id or nickname
     * @param proxyHolder proxy source for the request
     * @return the {@code openid} value of the first response that could be parsed (may itself be
     *         {@code null} if the field is missing), or {@code null} when both attempts fail
     */
    public static String getOpenId(String idOrName, ProxyHolder proxyHolder) {
        String url = "https://weixin.sogou.com/weixin?zhnss=1&type=1&ie=utf8&query="
                + URLCodeUtil.getURLEncode(idOrName, "utf-8");
        for (int attempt = 1; attempt < 3; attempt++) {
            try {
                String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), proxyHolder).body().string();
                if (htmlBody != null) {
                    return JSONObject.parseObject(htmlBody).getString("openid");
                }
            } catch (Exception e) {
                logger.debug("获取openid失败(第{}次): {}", attempt, e.getMessage(), e);
            }
        }
        return null;
    }

    /** Builds the keyword search URL for one result page; the date range is appended only for {@code tsn == 5}. */
    private static String buildKeywordSearchUrl(String word, int tsn, String startTime, String endTime, int page)
            throws UnsupportedEncodingException {
        String url = "http://weixin.sogou.com/weixin?type=2&query=" + URLEncoder.encode(word, "UTF-8")
                + "&ie=utf8&_sug_=n&_sug_type_=&ri=1&sourceid=sugg&sst0=" + System.currentTimeMillis()
                + "&tsn=" + tsn + "&page=" + page;
        if (tsn == 5) {
            url = url + "&ft=" + startTime + "&et=" + endTime + "&wxid=&usip=&interation=&from=tool";
        }
        return url;
    }

    /**
     * Parses one result page, appends every item that parses cleanly to {@code result} and
     * reports whether the pager contains a "下一页" (next page) link. Items that fail to parse are
     * logged and skipped.
     */
    private static boolean collectPage(String htmlBody, List<WechatAricle> result, boolean stripSourceSpaces,
            boolean fetchFullText, ProxyHolder proxyHolder) {
        Document document = Jsoup.parse(htmlBody);
        Elements items = document.select("div.news-box").select("ul.news-list").select("li");
        for (Element item : items) {
            try {
                result.add(parseSearchItem(item, stripSourceSpaces, fetchFullText, proxyHolder));
            } catch (Exception e) {
                logger.debug("解析数据出现错误:{}", e.getMessage(), e);
            }
        }
        return document.select("[id=pagebar_container]>a").text().contains("下一页");
    }

    /**
     * Converts one result-list entry into an article, optionally enriching it from the article
     * page. Throws {@link NumberFormatException} when the publish timestamp attribute is not
     * numeric, which makes the caller skip the entry.
     */
    private static WechatAricle parseSearchItem(Element item, boolean stripSourceSpaces, boolean fetchFullText,
            ProxyHolder proxyHolder) {
        Elements textBox = item.select("div.txt-box");
        Elements accountBar = textBox.select("div.s-p");
        Elements accountLink = accountBar.select("a");

        String link = textBox.select("h3 >a").attr("data-share");
        String source = accountLink.text();
        if (stripSourceSpaces) {
            source = source.replaceAll(" ", "").trim();
        }
        String openid = accountLink.attr("i");
        // The "t" attribute is interpreted as epoch seconds.
        Date date = new Date(Long.parseLong(accountBar.attr("t")) * 1000);
        int readNum;
        try {
            readNum = Integer.parseInt(accountBar.select("span.s1").text().trim());
        } catch (NumberFormatException ignored) {
            // The read counter is optional on the page; treat anything non-numeric as 0.
            readNum = 0;
        }
        String title = ZhiWeiTools.SBC2DBC(textBox.select("h3").text());
        // Selecting on an empty set yields "", so no separate "summary missing" branch is needed.
        String content = ZhiWeiTools.SBC2DBC(textBox.select("p.txt-info").text());

        WechatAricle article = new WechatAricle(link, title, source, content, date, readNum, 0, openid, "unknow");
        return fetchFullText ? getWechatAricleInfo(link, proxyHolder, article) : article;
    }
}
package com.zhiwei.wechat.search;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.Proxy;
import java.net.URLEncoder;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.tools.URLCodeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools;
import com.zhiwei.wechat.entity.WechatAricle;
/**
* @ClassName: WechatAritcleSearch
* @Description: TODO(在搜索接口根据关键词采集微信文章)
* @author Bewilder Z
* @date 2016年10月14日 上午9:40:18
*/
public class WechatAritcleSearch {
private static Logger logger = LogManager.getLogger(WechatAritcleSearch.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/**
* 根据关键词在搜狗微信搜索微信文章,不包含全文
* @Title: wechatKeywordSearch
* @param
* word 关键词
* @param
* tsn 采集时间范围:1(1天内);2(一周内);3(一月内);4(一年内);
* 5(某一时间段内与startTime和endTime配合使用)
* @param
* startTime 开始时间 格式为yyyy-MM-dd
* @param
* endTime 结束时间 格式为yyyy-MM-dd
* @param
* cookie 用户登录后的cookie(不登录最多10页)
* @param
* pages 需要限制返回页数的总页数(如返回前20页则传21),如没有限制页数则传null
* @throws
* Exception
* @return List<Wechat> 返回类型
*/
public static List<WechatAricle> wechatKeywordSearch(String word, int tsn, String cookie,String startTime, String endTime,
Proxy proxy, Integer pages) throws Exception{
List<WechatAricle> result = new ArrayList<>();
Map<String, String> headerMap = HeaderTool.getCommonHead();
headerMap.put("Host", "weixin.sogou.com");
if(StringUtils.isNotBlank(cookie)) {
headerMap.put("cookie", cookie);
}
boolean f = true;
int page = 1;
while (f) {
String searchUrl = "http://weixin.sogou.com/weixin?type=2&query=" + URLEncoder.encode(word, "UTF-8")
+ "&ie=utf8&_sug_=n&_sug_type_=&ri=1&sourceid=sugg&sst0=" + System.currentTimeMillis()
+ "&tsn=" + tsn + "&page=" + page;
if (tsn == 5) {
searchUrl = searchUrl + "&ft=" + startTime + "&et=" + endTime + "&wxid=&usip=&interation=&from=tool";
}
headerMap.put("Referer", searchUrl);
// 获取数据
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(searchUrl, headerMap), proxy, false).body().string();
// 解析数据
if (htmlBody != null) {
try {
// 解析数据
Document document = Jsoup.parse(htmlBody);
Elements elements = document.select("div.news-box").select("ul.news-list").select("li");
String title = null;
String link = null;
String content = null;
String source = null;
String openid = null;
String putDate = null;
Date date = null;
WechatAricle wechat = null;
for (Element element : elements) {
try {
title = element.select("div.txt-box").select("h3").text();
link = element.select("div.txt-box").select("h3 >a").attr("href");
// link = getRealLink(link, searchUrl);
content = "";
if (element.select("p.txt-info").isEmpty()) {
content = element.select("p.txt-info").text();
} else {
content = element.select("div.txt-box").select("p.txt-info").text();
}
// System.out.println("content======================"+content);
source = element.select("div.txt-box").select("div.s-p").select("a").text();
source = source.replaceAll(" ", "").trim();
openid = element.select("div.txt-box").select("div.s-p").select("a").attr("i");
putDate = element.select("div.txt-box").select("div.s-p").attr("t");
date = new Date(Long.valueOf(putDate) * 1000);
int readNum = 0;
try {
readNum = Integer.valueOf(element.select("div.txt-box").select("div.s-p")
.select("span.s1").text().trim());
} catch (Exception e) {
readNum = 0;
}
title = ZhiWeiTools.SBC2DBC(title);
content = ZhiWeiTools.SBC2DBC(content);
wechat = new WechatAricle(link, title, source, content, date, readNum, 0, openid, "unknow");
result.add(wechat);
} catch (Exception e) {
e.printStackTrace();
logger.debug("解析数据出现错误:{}", e);
}
}
// 解析最大可寻页码
String pageNext = document.select("[id=pagebar_container]>a").text();
if (pageNext.contains("下一页")) {
// logger.info("采集到 {} 页" , page);
page++;
} else {
f = false;
}
// logger.info("数据总页数为:{}", page);
} catch (Exception e) {
logger.debug("获取数据出现问题:{}", e.getMessage());
return result;
}
} else {
logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", htmlBody);
}
// ZhiWeiTools.sleep(100);
if(pages!=null && pages==page) {
break;
}
}
return result;
}
/**
*
* @Title: wechatKeywordSearch
* @Description: TODO(根据关键词在搜狗微信搜索微信文章,包含全文)
* @param @param
* word 关键词
* @param @param
* tsn 采集时间范围:1(1天内);2(一周内);3(一月内);4(一年内);
* 5(某一时间段内与startTime和endTime配合使用)
* @param @param
* startTime 开始时间 格式为yyyy-MM-dd
* @param @param
* endTime 结束时间 格式为yyyy-MM-dd
* @param @param
* cookie 用户登录后的cookie(不登录最多10页)
* @param @return
* @param @throws
* ZhiWeiException
* @param @throws
* UnsupportedEncodingException 设定文件
* @return List<Wechat> 返回类型
*/
public static List<WechatAricle> wechatKeywordSearch(String word, int tsn, String startTime, String endTime,
Proxy proxy,ProxyHolder proxyHolder) throws Exception, UnsupportedEncodingException {
List<WechatAricle> result = new ArrayList<WechatAricle>();
Map<String, String> headerMap = HeaderTool.getCommonHead();
headerMap.put("Host", "weixin.sogou.com");
boolean f = true;
int page = 1;
while (f) {
String searchUrl = "http://weixin.sogou.com/weixin?type=2&query=" + URLEncoder.encode(word, "UTF-8")
+ "&ie=utf8&_sug_=n&_sug_type_=" + "&ri=1&sourceid=sugg&sst0=" + System.currentTimeMillis()
+ "&tsn=" + tsn + "&page=" + page;
if (tsn == 5) {
searchUrl = searchUrl + "&ft=" + startTime + "&et=" + endTime + "&wxid=&usip=&interation=&from=tool";
}
headerMap.put("Referer", searchUrl);
// 获取数据
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(searchUrl, headerMap), proxy, false).body().string();
// 解析数据
if (htmlBody != null) {
try {
// 解析数据
Document document = Jsoup.parse(htmlBody);
Elements elements = document.select("div.news-box").select("ul.news-list").select("li");
String title = null;
String link = null;
String content = null;
String source = null;
String openid = null;
String putDate = null;
Date date = null;
WechatAricle wechat = null;
for (Element element : elements) {
try {
title = element.select("div.txt-box").select("h3").text();
link = element.select("div.txt-box").select("h3 >a").attr("href");
// link = getRealLink(link, searchUrl);
content = "";
if (element.select("p.txt-info").isEmpty()) {
content = element.select("p.txt-info").text();
} else {
content = element.select("div.txt-box").select("p.txt-info").text();
}
// System.out.println("content======================"+content);
source = element.select("div.txt-box").select("div.s-p").select("a").text();
openid = element.select("div.txt-box").select("div.s-p").select("a").attr("i");
putDate = element.select("div.txt-box").select("div.s-p").attr("t");
date = new Date(Long.valueOf(putDate) * 1000);
int readNum = 0;
try {
readNum = Integer.valueOf(element.select("div.txt-box").select("div.s-p")
.select("span.s1").text().trim());
} catch (Exception e) {
readNum = 0;
}
title = ZhiWeiTools.SBC2DBC(title);
content = ZhiWeiTools.SBC2DBC(content);
wechat = new WechatAricle(link, title, source, content, date, readNum, 0, openid, "unknow");
wechat = getWechatAricleInfo(link, proxyHolder, wechat);
result.add(wechat);
} catch (Exception e) {
logger.debug("解析数据出现错误:{}", e.getMessage());
continue;
}
}
// 解析最大可寻页码
String pageNext = document.select("[id=pagebar_container]>a").text();
if (pageNext.contains("下一页")) {
page++;
} else {
f = false;
}
// logger.info("数据总页数为:{}", page);
} catch (Exception e) {
logger.debug("获取数据出现问题:{}", e.getMessage());
return null;
}
} else {
logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", htmlBody);
}
// ZhiWeiTools.sleep(100);
}
return result;
}
/**
* 获取全文及来源
* @param url
* @param proxy
* @param wechatAricle
* @return
* @throws IOException
*/
private static WechatAricle getWechatAricleInfo(String url, ProxyHolder proxy,WechatAricle wechatAricle){
try {
String contentHtml = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy.getProxy()).body().string();
String content = null;
String time = null;
String source = null;
String biz = null;
String title = null;
String user_name = null;
String wxId = null;
if(contentHtml!=null){
Document document = Jsoup.parse(contentHtml);
title = document.select("title").text();
wxId = document.select("p.profile_meta").get(0).select("span.profile_meta_value").text();
if(contentHtml.contains("js_article")){
content = document.select("div#js_article").text();
}else if(contentHtml.contains("js_share_content")){
content = document.select("div#js_share_content").text();
}
if(contentHtml.contains("content_tpl")){
String text = document.select("script#content_tpl").html();
content = Jsoup.parse(text).text();
}
if(contentHtml.contains("d.nick_name = ")){
time = contentHtml.split("d.ct = \"")[1].split("\";")[0];
source = contentHtml.split("d.nick_name = \"")[1].split("\";")[0];
biz = contentHtml.split("d.biz = \"")[1].split("\"")[0];
user_name = contentHtml.split("d.user_name = \"")[1].split("\"")[0];
}else if(contentHtml.contains("var nickname = ")){
time = contentHtml.split("var ct = \"")[1].split("\";")[0];
source = contentHtml.split("var nickname = \"")[1].split("\";")[0];
biz = contentHtml.split("var appuin = \"\"||\"")[1].split("\"")[0];
user_name = contentHtml.split("var user_name = \"")[1].split("\"")[0];
}
}
if(wechatAricle == null) {
wechatAricle = new WechatAricle();
wechatAricle.setTitle(title);
wechatAricle.setTime(new Date(Long.valueOf(time)*1000));
wechatAricle.setSource(source);
}
wechatAricle.setBiz(biz);
wechatAricle.setContent(content);
wechatAricle.setWxId(wxId);
wechatAricle.setUser_name(user_name);
} catch (Exception e) {
e.printStackTrace();
return wechatAricle;
}
return wechatAricle;
}
/**
* 根据关键词采集指定时间+账号的数据
* @param word
* @param idOrName
* @param startTime
* @param endTime
* @param proxyHolder
* @return
* @throws Exception
* @throws UnsupportedEncodingException
*/
public static List<WechatAricle> wechatKeywordSearchByAccount(String word, String idOrName, String startTime, String endTime,
ProxyHolder proxyHolder) throws Exception, UnsupportedEncodingException {
List<WechatAricle> result = new ArrayList<WechatAricle>();
Map<String, String> headerMap = HeaderTool.getCommonHead();
headerMap.put("Host", "weixin.sogou.com");
if(idOrName==null || idOrName.equals("")){
throw new IllegalArgumentException("要检索的昵称或id不能为空");
}
String openId = getOpenId(idOrName, proxyHolder);
boolean f = false;
if(openId!=null){
f = true;
}
int page = 1;
while (f) {
String searchUrl = "https://weixin.sogou.com/weixin?type=2&ie=utf8&query=" + URLEncoder.encode(word, "UTF-8")
+ "&tsn=5&ft=" + startTime + "&et=" + endTime +"&interation=&page=" + page+"&wxid="+ openId
+"&usip=" + URLEncoder.encode(idOrName, "UTF-8");
headerMap.put("Referer", searchUrl);
// 获取数据
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(searchUrl, headerMap), proxyHolder, true).body().string();
// 解析数据
if (htmlBody != null) {
try {
// 解析数据
Document document = Jsoup.parse(htmlBody);
Elements elements = document.select("div.news-box").select("ul.news-list").select("li");
String title = null;
String link = null;
String content = null;
String source = null;
String openid = null;
String putDate = null;
Date date = null;
WechatAricle wechat = null;
for (Element element : elements) {
try {
title = element.select("div.txt-box").select("h3").text();
link = element.select("div.txt-box").select("h3 >a").attr("href");
// link = getRealLink(link, searchUrl);
content = "";
if (element.select("p.txt-info").isEmpty()) {
content = element.select("p.txt-info").text();
} else {
content = element.select("div.txt-box").select("p.txt-info").text();
}
// System.out.println("content======================"+content);
source = element.select("div.txt-box").select("div.s-p").select("a").text();
openid = element.select("div.txt-box").select("div.s-p").select("a").attr("i");
putDate = element.select("div.txt-box").select("div.s-p").attr("t");
date = new Date(Long.valueOf(putDate) * 1000);
int readNum = 0;
try {
readNum = Integer.valueOf(element.select("div.txt-box").select("div.s-p")
.select("span.s1").text().trim());
} catch (Exception e) {
readNum = 0;
}
title = ZhiWeiTools.SBC2DBC(title);
content = ZhiWeiTools.SBC2DBC(content);
wechat = new WechatAricle(link, title, source, content, date, readNum, 0, openid, "unknow");
wechat = getWechatAricleInfo(link, proxyHolder, wechat);
result.add(wechat);
} catch (Exception e) {
logger.debug("解析数据出现错误:{}", e.getMessage());
continue;
}
}
// 解析最大可寻页码
String pageNext = document.select("[id=pagebar_container]>a").text();
if (pageNext.contains("下一页")) {
page++;
} else {
f = false;
}
// logger.info("数据总页数为:{}", page);
} catch (Exception e) {
logger.debug("获取数据出现问题:{}", e.getMessage());
return null;
}
} else {
logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", htmlBody);
}
// ZhiWeiTools.sleep(100);
}
return result;
}
/**
* 获取真实链接
* @param originalUrl
* @param searchUrl
* @return
* @throws IOException
*/
public static String getRealLink(String originalUrl,String searchUrl) throws Exception{
originalUrl = "https://weixin.sogou.com" + originalUrl;
int b = (int) (Math.floor(100 * Math.random()) + 1);
int a = originalUrl.indexOf("url=");
int c = originalUrl.indexOf("&k=");
String d = null;
if (a != -1 && -1 == c) {
d = originalUrl.substring(a + 25 + b, a + 26 + b);
}
originalUrl += "&k=" + b + "&h=" + d;
originalUrl = getFinalUrl(originalUrl, searchUrl);
return originalUrl;
}
/**
 * Downloads a Sogou redirect page and reassembles the target URL embedded in it.
 *
 * <p>The page is requested through {@code ProxyHolder.NAT_HEAVY_PROXY} with a fixed set
 * of browser-like headers. Every statement of the form {@code url += '...';} found in
 * the response body contributes its quoted fragment; the fragments are concatenated in
 * the order they appear.
 *
 * <p>NOTE(review): the Cookie header is a hard-coded value; presumably a captured
 * browser session that may expire - confirm it is still accepted.
 *
 * @param originalUrl address of the redirect page to fetch
 * @param rerferer    value sent as the {@code Referer} header
 * @return the concatenated fragments (an empty string when none are found), or
 *         {@code null} when the response body is blank
 * @throws Exception if the HTTP call or reading the body fails
 */
public static String getFinalUrl(String originalUrl, String rerferer) throws Exception {
    Map<String, String> requestHeaders = new HashMap<>();
    requestHeaders.put("Sec-Fetch-Mode", "navigate");
    requestHeaders.put("Sec-Fetch-User", "?1");
    requestHeaders.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3");
    requestHeaders.put("Sec-Fetch-Site", "same-origin");
    requestHeaders.put("Referer", rerferer);
    requestHeaders.put("Cookie", "SUID=EAD6E7733765860A5AEAE09C000ACA78; SUV=00C351E873E7D6EA5AEBCB68E5B81671; wuid=AAGyrPzuHwAAAAqLFD3eFgAAGwY=; pgv_pvi=5713931264; GOTO=; ssuid=5316643370; pex=C864C03270DED3DD8A06887A372DA219231FFAC25A9D64AE09E82AED12E416AC; weixinIndexVisited=1; ABTEST=8|1572271712|v1; SNUID=C5F9D7432F2ABAD638CB0A7A30803056; sct=917; JSESSIONID=aaaR-8KOdPrlZ_KSPKs4w; PHPSESSID=oc296ck54mc3jbgvnu2mar6r40; IPLOC=CN3302");

    String pageSource = httpBoot
            .syncCall(RequestUtils.wrapGet(originalUrl, requestHeaders), ProxyHolder.NAT_HEAVY_PROXY)
            .body()
            .string();
    // Nothing to parse: signal the missing page with null.
    if (StringUtils.isBlank(pageSource)) {
        return null;
    }

    // Collect the quoted piece of every "url += '...';" statement, in document order.
    Matcher fragmentMatcher = Pattern.compile("url \\+= \'(.*?)\';").matcher(pageSource);
    StringBuilder assembledUrl = new StringBuilder();
    while (fragmentMatcher.find()) {
        assembledUrl.append(fragmentMatcher.group(1));
    }
    return assembledUrl.toString();
}
/**
 * Looks up the {@code openid} of a WeChat account through the Sogou WeChat search URL.
 *
 * <p>The request is attempted at most twice. The first response that has a body and
 * parses to a JSON object ends the lookup, whether or not it contains an
 * {@code openid} field. A failed attempt prints its stack trace and the next attempt
 * proceeds.
 *
 * <p>NOTE(review): the body is parsed as JSON although the queried URL looks like the
 * regular search page - confirm the endpoint still answers with JSON.
 *
 * @param idOrName    WeChat id or account name used as the query; URL-encoded as UTF-8
 * @param proxyHolder proxy configuration handed to the HTTP call
 * @return the {@code openid} field of the parsed response, or {@code null} when the
 *         field is absent or no attempt produced a parsable response
 */
public static String getOpenId(String idOrName, ProxyHolder proxyHolder) {
    String url = "https://weixin.sogou.com/weixin?zhnss=1&type=1&ie=utf8&query="
            + URLCodeUtil.getURLEncode(idOrName, "utf-8");
    for (int attempt = 1; attempt < 3; attempt++) {
        try {
            String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), proxyHolder).body().string();
            if (htmlBody == null) {
                continue;
            }
            JSONObject json = JSONObject.parseObject(htmlBody);
            // parseObject may yield null (e.g. for an empty body); retry explicitly
            // instead of relying on a NullPointerException reaching the catch below.
            if (json != null) {
                return json.getString("openid");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    return null;
}
}
//package com.zhiwei.wechat.example;
//
//import java.io.UnsupportedEncodingException;
//import java.net.UnknownHostException;
//import java.util.ArrayList;
//import java.util.List;
//
//import org.slf4j.Logger;
//import org.slf4j.LoggerFactory;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.proxy.ProxyFactory;
//import com.zhiwei.crawler.proxy.ProxyHolder;
//import com.zhiwei.wechat.entity.WechatAricle;
//import com.zhiwei.wechat.search.WechatAritcleSearch;
//
///**
// * @ClassName: WechatSearchExample
// * @Description: TODO(根据关键词等采集数据)
// * @author hero
// * @date 2016年12月16日 上午9:15:42
// */
//public class WechatSearchExample{
//
// private Logger logger = LoggerFactory.getLogger(WechatSearchExample.class);
// private static final String registry = "zookeeper://192.168.0.36:2181";
// private static final String group = "local";
//
// public static void main(String[] args) {
// ProxyFactory.init(registry, group, GroupType.PROVIDER);
// try {
// WechatSearchExample.wechatSearchExample();
// } catch (UnknownHostException e) {
// e.printStackTrace();
// }
// }
//
//
// public static void wechatSearchExample() throws UnknownHostException
// {
// List<String> wordList = new ArrayList<String>();
// wordList.add("京东");
// for(String word : wordList)
// {
// try {
// List<WechatAricle> list = WechatAritcleSearch.wechatKeywordSearch(word, 5, null,"2019-07-24", "2019-07-24", ProxyHolder.SOUGOU_INNER_PROXY.getProxy(), 21);
// System.out.println("======"+list.size());
// for(WechatAricle wechat : list){
//// System.out.println(wechat.getTitle());
// }
// } catch (UnsupportedEncodingException e) {
// e.printStackTrace();
// } catch (Exception e) {
// e.printStackTrace();
// }
//// for(String wxId : wechatIds)
//// {
//// try {
//// logger.info("需要采集的wxId:::{}", wxId);
////
//// } catch (UnsupportedEncodingException e) {
//// e.printStackTrace();
//// } catch (Exception e) {
//// e.printStackTrace();
//// }
//// }
// }
// }
//
//
//}
package com.zhiwei.wechat.example;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.Proxy;
import java.net.URLEncoder;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.utils.RequestUtils;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.wechat.entity.WechatAricle;
import com.zhiwei.wechat.search.WechatAritcleSearch;
/**
 * Runnable example: searches WeChat articles by keyword through
 * {@code WechatAritcleSearch.wechatKeywordSearch} and prints the number of hits
 * followed by the id of every returned article.
 *
 * @author hero
 * @date 2016-12-16 09:15:42
 */
public class WechatSearchExample {

    private static final Logger logger = LoggerFactory.getLogger(WechatSearchExample.class);
    private static final String REGISTRY = "zookeeper://192.168.0.36:2181";
    private static final String GROUP = "local";
    // Not referenced by the example itself; kept so class initialisation is unchanged.
    private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
    // Assigned in main() once the proxy factory has been initialised.
    private static Proxy proxy = null;

    /**
     * Initialises the proxy factory, picks the Sogou inner proxy and runs the example.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        ProxyFactory.init(REGISTRY, GROUP, GroupType.PROVIDER, 10000018);
        proxy = ProxyHolder.SOUGOU_INNER_PROXY.getProxy();
        try {
            WechatSearchExample.wechatSearchExample();
        } catch (UnknownHostException e) {
            logger.error("Keyword search example failed", e);
        }
    }

    /**
     * Runs the keyword search for each sample keyword and prints the results to
     * standard output. A failure for one keyword is logged and does not stop the
     * remaining keywords.
     *
     * @throws UnknownHostException declared for callers; not thrown by the visible code
     */
    public static void wechatSearchExample() throws UnknownHostException {
        List<String> wordList = new ArrayList<String>();
        wordList.add("京东");
        for (String word : wordList) {
            try {
                List<WechatAricle> list = WechatAritcleSearch.wechatKeywordSearch(
                        word, 5, null, "2019-10-28", "2019-10-28", proxy, 51);
                // The search may hand back null when fetching fails, so guard before
                // dereferencing instead of tripping a NullPointerException.
                if (list == null) {
                    logger.warn("Keyword search returned no result list for '{}'", word);
                    continue;
                }
                System.out.println("======" + list.size());
                for (WechatAricle wechat : list) {
                    System.out.println(wechat.getId());
                }
            } catch (Exception e) {
                logger.error("Keyword search failed for '" + word + "'", e);
            }
        }
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment