Commit 7db2a9e8 by zhiwei

分享链接消失并失效,解析改为有验证码的链接

parent 7ad96e77
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId> <groupId>com.zhiwei</groupId>
<artifactId>wechat</artifactId> <artifactId>wechat</artifactId>
<version>1.1.7-SNAPSHOT</version> <version>1.1.8-SNAPSHOT</version>
<description> <description>
知微微信采集程序,包含 知微微信采集程序,包含
1.微信历史文章采集 1.微信历史文章采集
2.搜狗微信接口关键词采集 2.搜狗微信接口关键词采集
3.点赞阅读更新接口 3.点赞阅读更新接口
4.根据关键词或微信id查询帐号信息 4.根据关键词或微信id查询帐号信息
5.根据文章链接采集评论列表及评论数 5.根据文章链接采集评论列表及评论数
</description> </description>
<properties> <properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding> <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
</properties> </properties>
<developers> <developers>
<developer> <developer>
<id>Bewilder</id> <id>Bewilder</id>
<name>zhiwei zhang</name> <name>zhiwei zhang</name>
<email>zhangzhiwei@zhiweidata.com</email> <email>zhangzhiwei@zhiweidata.com</email>
</developer> </developer>
</developers> </developers>
<!-- 打包管理 --> <!-- 打包管理 -->
<build> <build>
<plugins> <plugins>
<!-- 发布源码 --> <!-- 发布源码 -->
<plugin> <plugin>
<artifactId>maven-source-plugin</artifactId> <artifactId>maven-source-plugin</artifactId>
<version>2.4</version> <version>2.4</version>
<configuration> <configuration>
<attach>true</attach> <attach>true</attach>
</configuration> </configuration>
<executions> <executions>
<execution> <execution>
<phase>compile</phase> <phase>compile</phase>
<goals> <goals>
<goal>jar</goal> <goal>jar</goal>
</goals> </goals>
</execution> </execution>
</executions> </executions>
</plugin> </plugin>
<plugin> <plugin>
<groupId>org.apache.maven.plugins</groupId> <groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId> <artifactId>maven-javadoc-plugin</artifactId>
<version>2.10.4</version> <version>2.10.4</version>
</plugin> </plugin>
<!-- 解决maven test命令时console出现中文乱码乱码 --> <!-- 解决maven test命令时console出现中文乱码乱码 -->
<plugin> <plugin>
<groupId>org.apache.maven.plugins</groupId> <groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId> <artifactId>maven-surefire-plugin</artifactId>
<version>2.19.1</version> <version>2.19.1</version>
<configuration> <configuration>
<forkMode>once</forkMode> <forkMode>once</forkMode>
<argLine>-Dfile.encoding=UTF-8</argLine> <argLine>-Dfile.encoding=UTF-8</argLine>
</configuration> </configuration>
</plugin> </plugin>
</plugins> </plugins>
</build> </build>
<!-- 分发管理:管理distribution和supporting files --> <!-- 分发管理:管理distribution和supporting files -->
<distributionManagement> <distributionManagement>
<snapshotRepository> <snapshotRepository>
<id>nexus-releases</id> <id>nexus-releases</id>
<name>User Porject Snapshot</name> <name>User Porject Snapshot</name>
<url>http://192.168.0.30:8081/nexus/content/repositories/snapshots/</url> <url>http://192.168.0.30:8081/nexus/content/repositories/snapshots/</url>
<uniqueVersion>true</uniqueVersion> <uniqueVersion>true</uniqueVersion>
</snapshotRepository> </snapshotRepository>
<repository> <repository>
<id>nexus-releases</id> <id>nexus-releases</id>
<name>User Porject Release</name> <name>User Porject Release</name>
<url>http://192.168.0.30:8081/nexus/content/repositories/releases/</url> <url>http://192.168.0.30:8081/nexus/content/repositories/releases/</url>
</repository> </repository>
</distributionManagement> </distributionManagement>
<dependencies> <dependencies>
<dependency> <dependency>
<groupId>com.zhiwei.tools</groupId> <groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId> <artifactId>zhiwei-tools</artifactId>
<version>0.1.4-SNAPSHOT</version> <version>0.1.4-SNAPSHOT</version>
<scope>provided</scope> <scope>provided</scope>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.zhiwei.crawler</groupId> <groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId> <artifactId>crawler-core</artifactId>
<version>0.5.2-SNAPSHOT</version> <version>0.5.5.6-SNAPSHOT</version>
<scope>provided</scope> <scope>provided</scope>
</dependency> </dependency>
</dependencies> </dependencies>
</project> </project>
\ No newline at end of file
package com.zhiwei.wechat.search; package com.zhiwei.wechat.search;
import java.io.IOException; import java.io.IOException;
import java.io.UnsupportedEncodingException; import java.io.UnsupportedEncodingException;
import java.net.Proxy; import java.net.Proxy;
import java.net.URLEncoder; import java.net.URLEncoder;
import java.util.ArrayList; import java.util.*;
import java.util.Date; import java.util.regex.Matcher;
import java.util.List; import java.util.regex.Pattern;
import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.Logger; import org.jsoup.Jsoup;
import org.jsoup.Jsoup; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Element;
import org.jsoup.nodes.Element; import org.jsoup.select.Elements;
import org.jsoup.select.Elements;
import com.alibaba.fastjson.JSONObject;
import com.alibaba.fastjson.JSONObject; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.proxy.ProxyHolder; import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.crawler.utils.RequestUtils; import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.httpclient.HeaderTool; import com.zhiwei.tools.tools.URLCodeUtil;
import com.zhiwei.tools.tools.URLCodeUtil; import com.zhiwei.tools.tools.ZhiWeiTools;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.wechat.entity.WechatAricle;
import com.zhiwei.wechat.entity.WechatAricle;
/**
/** * @ClassName: WechatAritcleSearch
* @ClassName: WechatAritcleSearch * @Description: TODO(在搜索接口根据关键词采集微信文章)
* @Description: TODO(在搜索接口根据关键词采集微信文章) * @author Bewilder Z
* @author Bewilder Z * @date 2016年10月14日 上午9:40:18
* @date 2016年10月14日 上午9:40:18 */
*/ public class WechatAritcleSearch {
public class WechatAritcleSearch {
/** Class-wide log4j2 logger. */
private static final Logger logger = LogManager.getLogger(WechatAritcleSearch.class);

/** HTTP client shared by every request in this class; built with {@code retryTimes(3)}. */
private static final HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/**
/** * 根据关键词在搜狗微信搜索微信文章,不包含全文
* 根据关键词在搜狗微信搜索微信文章,不包含全文 * @Title: wechatKeywordSearch
* @Title: wechatKeywordSearch * @param
* @param * word 关键词
* word 关键词 * @param
* @param * tsn 采集时间范围:1(1天内);2(一周内);3(一月内);4(一年内);
* tsn 采集时间范围:1(1天内);2(一周内);3(一月内);4(一年内); * 5(某一时间段内与startTime和endTime配合使用)
* 5(某一时间段内与startTime和endTime配合使用) * @param
* @param * startTime 开始时间 格式为yyyy-MM-dd
* startTime 开始时间 格式为yyyy-MM-dd * @param
* @param * endTime 结束时间 格式为yyyy-MM-dd
* endTime 结束时间 格式为yyyy-MM-dd * @param
* @param * cookie 用户登录后的cookie(不登录最多10页)
* cookie 用户登录后的cookie(不登录最多10页) * @param
* @param * pages 需要限制返回页数的总页数(如返回前20页则传21),如没有限制页数则传null
* pages 需要限制返回页数的总页数(如返回前20页则传21),如没有限制页数则传null * @throws
* @throws * Exception
* ZhiWeiException * @return List<Wechat> 返回类型
* @return List<Wechat> 返回类型 */
*/ public static List<WechatAricle> wechatKeywordSearch(String word, int tsn, String cookie,String startTime, String endTime,
public static List<WechatAricle> wechatKeywordSearch(String word, int tsn, String cookie,String startTime, String endTime, Proxy proxy, Integer pages) throws Exception{
Proxy proxy, Integer pages) throws Exception{ List<WechatAricle> result = new ArrayList<>();
List<WechatAricle> result = new ArrayList<>(); Map<String, String> headerMap = HeaderTool.getCommonHead();
Map<String, String> headerMap = HeaderTool.getCommonHead(); headerMap.put("Host", "weixin.sogou.com");
headerMap.put("Host", "weixin.sogou.com"); if(StringUtils.isNotBlank(cookie)) {
if(StringUtils.isNotBlank(cookie)) { headerMap.put("cookie", cookie);
headerMap.put("cookie", cookie); }
} boolean f = true;
boolean f = true; int page = 1;
int page = 1; while (f) {
while (f) { String searchUrl = "http://weixin.sogou.com/weixin?type=2&query=" + URLEncoder.encode(word, "UTF-8")
String url = "http://weixin.sogou.com/weixin?type=2&query=" + URLEncoder.encode(word, "UTF-8") + "&ie=utf8&_sug_=n&_sug_type_=&ri=1&sourceid=sugg&sst0=" + System.currentTimeMillis()
+ "&ie=utf8&_sug_=n&_sug_type_=&ri=1&sourceid=sugg&sst0=" + System.currentTimeMillis() + "&tsn=" + tsn + "&page=" + page;
+ "&tsn=" + tsn + "&page=" + page; if (tsn == 5) {
if (tsn == 5) { searchUrl = searchUrl + "&ft=" + startTime + "&et=" + endTime + "&wxid=&usip=&interation=&from=tool";
url = url + "&ft=" + startTime + "&et=" + endTime + "&wxid=&usip=&interation=&from=tool"; }
} headerMap.put("Referer", searchUrl);
System.out.println(url); // 获取数据
headerMap.put("Referer", url); String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(searchUrl, headerMap), proxy, false).body().string();
// 获取数据 // 解析数据
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy, false).body().string(); if (htmlBody != null) {
// 解析数据 try {
if (htmlBody != null) { // 解析数据
try { Document document = Jsoup.parse(htmlBody);
// 解析数据 Elements elements = document.select("div.news-box").select("ul.news-list").select("li");
Document document = Jsoup.parse(htmlBody); String title = null;
Elements elements = document.select("div.news-box").select("ul.news-list").select("li"); String link = null;
String title = null; String content = null;
String link = null; String source = null;
String content = null; String openid = null;
String source = null; String putDate = null;
String openid = null; Date date = null;
String putDate = null; WechatAricle wechat = null;
Date date = null; for (Element element : elements) {
WechatAricle wechat = null; try {
for (Element element : elements) { title = element.select("div.txt-box").select("h3").text();
try { link = element.select("div.txt-box").select("h3 >a").attr("href");
title = element.select("div.txt-box").select("h3").text();
link = element.select("div.txt-box").select("h3 >a").attr("data-share"); // link = getRealLink(link, searchUrl);
content = "";
if (element.select("p.txt-info").isEmpty()) { content = "";
content = element.select("p.txt-info").text(); if (element.select("p.txt-info").isEmpty()) {
} else { content = element.select("p.txt-info").text();
content = element.select("div.txt-box").select("p.txt-info").text(); } else {
} content = element.select("div.txt-box").select("p.txt-info").text();
// System.out.println("content======================"+content); }
source = element.select("div.txt-box").select("div.s-p").select("a").text(); // System.out.println("content======================"+content);
source = source.replaceAll(" ", "").trim(); source = element.select("div.txt-box").select("div.s-p").select("a").text();
openid = element.select("div.txt-box").select("div.s-p").select("a").attr("i"); source = source.replaceAll(" ", "").trim();
putDate = element.select("div.txt-box").select("div.s-p").attr("t"); openid = element.select("div.txt-box").select("div.s-p").select("a").attr("i");
date = new Date(Long.valueOf(putDate) * 1000); putDate = element.select("div.txt-box").select("div.s-p").attr("t");
int readNum = 0; date = new Date(Long.valueOf(putDate) * 1000);
try { int readNum = 0;
readNum = Integer.valueOf(element.select("div.txt-box").select("div.s-p") try {
.select("span.s1").text().trim()); readNum = Integer.valueOf(element.select("div.txt-box").select("div.s-p")
} catch (Exception e) { .select("span.s1").text().trim());
readNum = 0; } catch (Exception e) {
} readNum = 0;
}
title = ZhiWeiTools.SBC2DBC(title);
content = ZhiWeiTools.SBC2DBC(content); title = ZhiWeiTools.SBC2DBC(title);
wechat = new WechatAricle(link, title, source, content, date, readNum, 0, openid, "unknow"); content = ZhiWeiTools.SBC2DBC(content);
result.add(wechat); wechat = new WechatAricle(link, title, source, content, date, readNum, 0, openid, "unknow");
} catch (Exception e) { result.add(wechat);
logger.debug("解析数据出现错误:{}", e); } catch (Exception e) {
} e.printStackTrace();
} logger.debug("解析数据出现错误:{}", e);
// 解析最大可寻页码 }
String pageNext = document.select("[id=pagebar_container]>a").text(); }
if (pageNext.contains("下一页")) { // 解析最大可寻页码
// logger.info("采集到 {} 页" , page); String pageNext = document.select("[id=pagebar_container]>a").text();
page++; if (pageNext.contains("下一页")) {
} else { // logger.info("采集到 {} 页" , page);
f = false; page++;
} } else {
// logger.info("数据总页数为:{}", page); f = false;
} catch (Exception e) { }
logger.debug("获取数据出现问题:{}", e.getMessage()); // logger.info("数据总页数为:{}", page);
return result; } catch (Exception e) {
} logger.debug("获取数据出现问题:{}", e.getMessage());
} else { return result;
logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", htmlBody); }
} } else {
// ZhiWeiTools.sleep(100); logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", htmlBody);
if(pages!=null && pages==page) { }
break; // ZhiWeiTools.sleep(100);
} if(pages!=null && pages==page) {
} break;
return result; }
} }
return result;
}
/**
*
* @Title: wechatKeywordSearch /**
* @Description: TODO(根据关键词在搜狗微信搜索微信文章,包含全文) *
* @param @param * @Title: wechatKeywordSearch
* word 关键词 * @Description: TODO(根据关键词在搜狗微信搜索微信文章,包含全文)
* @param @param * @param @param
* tsn 采集时间范围:1(1天内);2(一周内);3(一月内);4(一年内); * word 关键词
* 5(某一时间段内与startTime和endTime配合使用) * @param @param
* @param @param * tsn 采集时间范围:1(1天内);2(一周内);3(一月内);4(一年内);
* startTime 开始时间 格式为yyyy-MM-dd * 5(某一时间段内与startTime和endTime配合使用)
* @param @param * @param @param
* endTime 结束时间 格式为yyyy-MM-dd * startTime 开始时间 格式为yyyy-MM-dd
* @param @param * @param @param
* cookie 用户登录后的cookie(不登录最多10页) * endTime 结束时间 格式为yyyy-MM-dd
* @param @return * @param @param
* @param @throws * cookie 用户登录后的cookie(不登录最多10页)
* ZhiWeiException * @param @return
* @param @throws * @param @throws
* UnsupportedEncodingException 设定文件 * ZhiWeiException
* @return List<Wechat> 返回类型 * @param @throws
*/ * UnsupportedEncodingException 设定文件
public static List<WechatAricle> wechatKeywordSearch(String word, int tsn, String startTime, String endTime, * @return List<Wechat> 返回类型
Proxy proxy,ProxyHolder proxyHolder) throws Exception, UnsupportedEncodingException { */
List<WechatAricle> result = new ArrayList<WechatAricle>(); public static List<WechatAricle> wechatKeywordSearch(String word, int tsn, String startTime, String endTime,
Map<String, String> headerMap = HeaderTool.getCommonHead(); Proxy proxy,ProxyHolder proxyHolder) throws Exception, UnsupportedEncodingException {
headerMap.put("Host", "weixin.sogou.com"); List<WechatAricle> result = new ArrayList<WechatAricle>();
Map<String, String> headerMap = HeaderTool.getCommonHead();
boolean f = true; headerMap.put("Host", "weixin.sogou.com");
int page = 1;
while (f) { boolean f = true;
String url = "http://weixin.sogou.com/weixin?type=2&query=" + URLEncoder.encode(word, "UTF-8") int page = 1;
+ "&ie=utf8&_sug_=n&_sug_type_=" + "&ri=1&sourceid=sugg&sst0=" + System.currentTimeMillis() while (f) {
+ "&tsn=" + tsn + "&page=" + page; String searchUrl = "http://weixin.sogou.com/weixin?type=2&query=" + URLEncoder.encode(word, "UTF-8")
if (tsn == 5) { + "&ie=utf8&_sug_=n&_sug_type_=" + "&ri=1&sourceid=sugg&sst0=" + System.currentTimeMillis()
url = url + "&ft=" + startTime + "&et=" + endTime + "&wxid=&usip=&interation=&from=tool"; + "&tsn=" + tsn + "&page=" + page;
} if (tsn == 5) {
headerMap.put("Referer", url); searchUrl = searchUrl + "&ft=" + startTime + "&et=" + endTime + "&wxid=&usip=&interation=&from=tool";
// 获取数据 }
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy, false).body().string(); headerMap.put("Referer", searchUrl);
// 解析数据 // 获取数据
if (htmlBody != null) { String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(searchUrl, headerMap), proxy, false).body().string();
try { // 解析数据
// 解析数据 if (htmlBody != null) {
Document document = Jsoup.parse(htmlBody); try {
Elements elements = document.select("div.news-box").select("ul.news-list").select("li"); // 解析数据
String title = null; Document document = Jsoup.parse(htmlBody);
String link = null; Elements elements = document.select("div.news-box").select("ul.news-list").select("li");
String content = null; String title = null;
String source = null; String link = null;
String openid = null; String content = null;
String putDate = null; String source = null;
Date date = null; String openid = null;
WechatAricle wechat = null; String putDate = null;
for (Element element : elements) { Date date = null;
try { WechatAricle wechat = null;
title = element.select("div.txt-box").select("h3").text(); for (Element element : elements) {
link = element.select("div.txt-box").select("h3 >a").attr("data-share"); try {
content = ""; title = element.select("div.txt-box").select("h3").text();
if (element.select("p.txt-info").isEmpty()) { link = element.select("div.txt-box").select("h3 >a").attr("href");
content = element.select("p.txt-info").text(); // link = getRealLink(link, searchUrl);
} else { content = "";
content = element.select("div.txt-box").select("p.txt-info").text(); if (element.select("p.txt-info").isEmpty()) {
} content = element.select("p.txt-info").text();
// System.out.println("content======================"+content); } else {
source = element.select("div.txt-box").select("div.s-p").select("a").text(); content = element.select("div.txt-box").select("p.txt-info").text();
openid = element.select("div.txt-box").select("div.s-p").select("a").attr("i"); }
putDate = element.select("div.txt-box").select("div.s-p").attr("t"); // System.out.println("content======================"+content);
date = new Date(Long.valueOf(putDate) * 1000); source = element.select("div.txt-box").select("div.s-p").select("a").text();
int readNum = 0; openid = element.select("div.txt-box").select("div.s-p").select("a").attr("i");
try { putDate = element.select("div.txt-box").select("div.s-p").attr("t");
readNum = Integer.valueOf(element.select("div.txt-box").select("div.s-p") date = new Date(Long.valueOf(putDate) * 1000);
.select("span.s1").text().trim()); int readNum = 0;
} catch (Exception e) { try {
readNum = 0; readNum = Integer.valueOf(element.select("div.txt-box").select("div.s-p")
} .select("span.s1").text().trim());
} catch (Exception e) {
title = ZhiWeiTools.SBC2DBC(title); readNum = 0;
content = ZhiWeiTools.SBC2DBC(content); }
wechat = new WechatAricle(link, title, source, content, date, readNum, 0, openid, "unknow");
wechat = getWechatAricleInfo(link, proxyHolder, wechat); title = ZhiWeiTools.SBC2DBC(title);
result.add(wechat); content = ZhiWeiTools.SBC2DBC(content);
} catch (Exception e) { wechat = new WechatAricle(link, title, source, content, date, readNum, 0, openid, "unknow");
logger.debug("解析数据出现错误:{}", e.getMessage()); wechat = getWechatAricleInfo(link, proxyHolder, wechat);
continue; result.add(wechat);
} } catch (Exception e) {
} logger.debug("解析数据出现错误:{}", e.getMessage());
// 解析最大可寻页码 continue;
String pageNext = document.select("[id=pagebar_container]>a").text(); }
if (pageNext.contains("下一页")) { }
page++; // 解析最大可寻页码
} else { String pageNext = document.select("[id=pagebar_container]>a").text();
f = false; if (pageNext.contains("下一页")) {
} page++;
// logger.info("数据总页数为:{}", page); } else {
} catch (Exception e) { f = false;
logger.debug("获取数据出现问题:{}", e.getMessage()); }
return null; // logger.info("数据总页数为:{}", page);
} } catch (Exception e) {
} else { logger.debug("获取数据出现问题:{}", e.getMessage());
logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", htmlBody); return null;
} }
// ZhiWeiTools.sleep(100); } else {
} logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", htmlBody);
return result; }
} // ZhiWeiTools.sleep(100);
}
/** return result;
* 获取全文及来源 }
* @param url
* @param proxy /**
* @param headerMap * 获取全文及来源
* @param wechatAricle * @param url
* @return * @param proxy
* @throws IOException * @param wechatAricle
*/ * @return
private static WechatAricle getWechatAricleInfo(String url, ProxyHolder proxy,WechatAricle wechatAricle){ * @throws IOException
try { */
String contentHtml = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy.getProxy()).body().string(); private static WechatAricle getWechatAricleInfo(String url, ProxyHolder proxy,WechatAricle wechatAricle){
String content = null; try {
String time = null; String contentHtml = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy.getProxy()).body().string();
String source = null; String content = null;
String biz = null; String time = null;
String title = null; String source = null;
String user_name = null; String biz = null;
String wxId = null; String title = null;
if(contentHtml!=null){ String user_name = null;
Document document = Jsoup.parse(contentHtml); String wxId = null;
title = document.select("title").text(); if(contentHtml!=null){
wxId = document.select("p.profile_meta").get(0).select("span.profile_meta_value").text(); Document document = Jsoup.parse(contentHtml);
title = document.select("title").text();
if(contentHtml.contains("js_article")){ wxId = document.select("p.profile_meta").get(0).select("span.profile_meta_value").text();
content = document.select("div#js_article").text();
}else if(contentHtml.contains("js_share_content")){ if(contentHtml.contains("js_article")){
content = document.select("div#js_share_content").text(); content = document.select("div#js_article").text();
} }else if(contentHtml.contains("js_share_content")){
if(contentHtml.contains("content_tpl")){ content = document.select("div#js_share_content").text();
String text = document.select("script#content_tpl").html(); }
content = Jsoup.parse(text).text(); if(contentHtml.contains("content_tpl")){
} String text = document.select("script#content_tpl").html();
content = Jsoup.parse(text).text();
if(contentHtml.contains("d.nick_name = ")){ }
time = contentHtml.split("d.ct = \"")[1].split("\";")[0];
source = contentHtml.split("d.nick_name = \"")[1].split("\";")[0]; if(contentHtml.contains("d.nick_name = ")){
biz = contentHtml.split("d.biz = \"")[1].split("\"")[0]; time = contentHtml.split("d.ct = \"")[1].split("\";")[0];
user_name = contentHtml.split("d.user_name = \"")[1].split("\"")[0]; source = contentHtml.split("d.nick_name = \"")[1].split("\";")[0];
}else if(contentHtml.contains("var nickname = ")){ biz = contentHtml.split("d.biz = \"")[1].split("\"")[0];
time = contentHtml.split("var ct = \"")[1].split("\";")[0]; user_name = contentHtml.split("d.user_name = \"")[1].split("\"")[0];
source = contentHtml.split("var nickname = \"")[1].split("\";")[0]; }else if(contentHtml.contains("var nickname = ")){
biz = contentHtml.split("var appuin = \"\"||\"")[1].split("\"")[0]; time = contentHtml.split("var ct = \"")[1].split("\";")[0];
user_name = contentHtml.split("var user_name = \"")[1].split("\"")[0]; source = contentHtml.split("var nickname = \"")[1].split("\";")[0];
} biz = contentHtml.split("var appuin = \"\"||\"")[1].split("\"")[0];
} user_name = contentHtml.split("var user_name = \"")[1].split("\"")[0];
if(wechatAricle == null) { }
wechatAricle = new WechatAricle(); }
wechatAricle.setTitle(title); if(wechatAricle == null) {
wechatAricle.setTime(new Date(Long.valueOf(time)*1000)); wechatAricle = new WechatAricle();
wechatAricle.setSource(source); wechatAricle.setTitle(title);
} wechatAricle.setTime(new Date(Long.valueOf(time)*1000));
wechatAricle.setBiz(biz); wechatAricle.setSource(source);
wechatAricle.setContent(content); }
wechatAricle.setWxId(wxId); wechatAricle.setBiz(biz);
wechatAricle.setUser_name(user_name); wechatAricle.setContent(content);
wechatAricle.setWxId(wxId);
} catch (Exception e) { wechatAricle.setUser_name(user_name);
e.printStackTrace();
return wechatAricle; } catch (Exception e) {
} e.printStackTrace();
return wechatAricle; return wechatAricle;
} }
return wechatAricle;
}
/**
* 根据关键词采集指定时间+账号的数据
* @param word /**
* @param idOrName * 根据关键词采集指定时间+账号的数据
* @param tsn * @param word
* @param startTime * @param idOrName
* @param endTime * @param startTime
* @param proxy * @param endTime
* @param proxyHolder * @param proxyHolder
* @return * @return
* @throws Exception * @throws Exception
* @throws UnsupportedEncodingException * @throws UnsupportedEncodingException
*/ */
public static List<WechatAricle> wechatKeywordSearchByAccount(String word, String idOrName, String startTime, String endTime, public static List<WechatAricle> wechatKeywordSearchByAccount(String word, String idOrName, String startTime, String endTime,
ProxyHolder proxyHolder) throws Exception, UnsupportedEncodingException { ProxyHolder proxyHolder) throws Exception, UnsupportedEncodingException {
List<WechatAricle> result = new ArrayList<WechatAricle>(); List<WechatAricle> result = new ArrayList<WechatAricle>();
Map<String, String> headerMap = HeaderTool.getCommonHead(); Map<String, String> headerMap = HeaderTool.getCommonHead();
headerMap.put("Host", "weixin.sogou.com"); headerMap.put("Host", "weixin.sogou.com");
if(idOrName==null || idOrName.equals("")){ if(idOrName==null || idOrName.equals("")){
throw new IllegalArgumentException("要检索的昵称或id不能为空"); throw new IllegalArgumentException("要检索的昵称或id不能为空");
} }
String openId = getOpenId(idOrName, proxyHolder); String openId = getOpenId(idOrName, proxyHolder);
boolean f = false; boolean f = false;
if(openId!=null){ if(openId!=null){
f = true; f = true;
} }
int page = 1; int page = 1;
while (f) { while (f) {
String url = "https://weixin.sogou.com/weixin?type=2&ie=utf8&query=" + URLEncoder.encode(word, "UTF-8") String searchUrl = "https://weixin.sogou.com/weixin?type=2&ie=utf8&query=" + URLEncoder.encode(word, "UTF-8")
+ "&tsn=5&ft=" + startTime + "&et=" + endTime +"&interation=&page=" + page+"&wxid="+ openId + "&tsn=5&ft=" + startTime + "&et=" + endTime +"&interation=&page=" + page+"&wxid="+ openId
+"&usip=" + URLEncoder.encode(idOrName, "UTF-8"); +"&usip=" + URLEncoder.encode(idOrName, "UTF-8");
headerMap.put("Referer", url); headerMap.put("Referer", searchUrl);
System.out.println(url); // 获取数据
// 获取数据 String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(searchUrl, headerMap), proxyHolder, true).body().string();
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxyHolder, true).body().string(); // 解析数据
// 解析数据 if (htmlBody != null) {
if (htmlBody != null) { try {
try { // 解析数据
// 解析数据 Document document = Jsoup.parse(htmlBody);
Document document = Jsoup.parse(htmlBody); Elements elements = document.select("div.news-box").select("ul.news-list").select("li");
Elements elements = document.select("div.news-box").select("ul.news-list").select("li"); String title = null;
String title = null; String link = null;
String link = null; String content = null;
String content = null; String source = null;
String source = null; String openid = null;
String openid = null; String putDate = null;
String putDate = null; Date date = null;
Date date = null; WechatAricle wechat = null;
WechatAricle wechat = null; for (Element element : elements) {
for (Element element : elements) { try {
try { title = element.select("div.txt-box").select("h3").text();
title = element.select("div.txt-box").select("h3").text(); link = element.select("div.txt-box").select("h3 >a").attr("href");
link = element.select("div.txt-box").select("h3 >a").attr("data-share"); // link = getRealLink(link, searchUrl);
content = ""; content = "";
if (element.select("p.txt-info").isEmpty()) { if (element.select("p.txt-info").isEmpty()) {
content = element.select("p.txt-info").text(); content = element.select("p.txt-info").text();
} else { } else {
content = element.select("div.txt-box").select("p.txt-info").text(); content = element.select("div.txt-box").select("p.txt-info").text();
} }
// System.out.println("content======================"+content); // System.out.println("content======================"+content);
source = element.select("div.txt-box").select("div.s-p").select("a").text(); source = element.select("div.txt-box").select("div.s-p").select("a").text();
openid = element.select("div.txt-box").select("div.s-p").select("a").attr("i"); openid = element.select("div.txt-box").select("div.s-p").select("a").attr("i");
putDate = element.select("div.txt-box").select("div.s-p").attr("t"); putDate = element.select("div.txt-box").select("div.s-p").attr("t");
date = new Date(Long.valueOf(putDate) * 1000); date = new Date(Long.valueOf(putDate) * 1000);
int readNum = 0; int readNum = 0;
try { try {
readNum = Integer.valueOf(element.select("div.txt-box").select("div.s-p") readNum = Integer.valueOf(element.select("div.txt-box").select("div.s-p")
.select("span.s1").text().trim()); .select("span.s1").text().trim());
} catch (Exception e) { } catch (Exception e) {
readNum = 0; readNum = 0;
} }
title = ZhiWeiTools.SBC2DBC(title); title = ZhiWeiTools.SBC2DBC(title);
content = ZhiWeiTools.SBC2DBC(content); content = ZhiWeiTools.SBC2DBC(content);
wechat = new WechatAricle(link, title, source, content, date, readNum, 0, openid, "unknow"); wechat = new WechatAricle(link, title, source, content, date, readNum, 0, openid, "unknow");
wechat = getWechatAricleInfo(link, proxyHolder, wechat); wechat = getWechatAricleInfo(link, proxyHolder, wechat);
result.add(wechat); result.add(wechat);
} catch (Exception e) { } catch (Exception e) {
logger.debug("解析数据出现错误:{}", e.getMessage()); logger.debug("解析数据出现错误:{}", e.getMessage());
continue; continue;
} }
} }
// 解析最大可寻页码 // 解析最大可寻页码
String pageNext = document.select("[id=pagebar_container]>a").text(); String pageNext = document.select("[id=pagebar_container]>a").text();
if (pageNext.contains("下一页")) { if (pageNext.contains("下一页")) {
page++; page++;
} else { } else {
f = false; f = false;
} }
// logger.info("数据总页数为:{}", page); // logger.info("数据总页数为:{}", page);
} catch (Exception e) { } catch (Exception e) {
logger.debug("获取数据出现问题:{}", e.getMessage()); logger.debug("获取数据出现问题:{}", e.getMessage());
return null; return null;
} }
} else { } else {
logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", htmlBody); logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", htmlBody);
} }
// ZhiWeiTools.sleep(100); // ZhiWeiTools.sleep(100);
} }
return result; return result;
} }
/**
* 获取真实链接
/** * @param originalUrl
* @Title: getOpenId * @param searchUrl
* @Description: TODO(获取微信wxID) * @return
* @param @param * @throws IOException
* wxId */
* @param @return public static String getRealLink(String originalUrl,String searchUrl) throws Exception{
* 设定文件 originalUrl = "https://weixin.sogou.com" + originalUrl;
* @return String 返回类型
*/ int b = (int) (Math.floor(100 * Math.random()) + 1);
public static String getOpenId(String idOrName, ProxyHolder proxyHolder) { int a = originalUrl.indexOf("url=");
String openId = null; int c = originalUrl.indexOf("&k=");
String url = "https://weixin.sogou.com/weixin?zhnss=1&type=1&ie=utf8&query=" + URLCodeUtil.getURLEncode(idOrName, "utf-8"); String d = null;
String htmlBody; if (a != -1 && -1 == c) {
for(int i = 1;i < 3;i++) { d = originalUrl.substring(a + 25 + b, a + 26 + b);
}
try { originalUrl += "&k=" + b + "&h=" + d;
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), proxyHolder).body().string();
if (htmlBody != null) { originalUrl = getFinalUrl(originalUrl, searchUrl);
JSONObject json = JSONObject.parseObject(htmlBody); return originalUrl;
openId = json.getString("openid"); }
return openId;
}
} catch (Exception e) { /**
e.printStackTrace(); * 获取真实链接
openId = null; * @param originalUrl
} * @param rerferer
} * @return
* @throws Exception
return openId; */
} public static String getFinalUrl(String originalUrl,String rerferer) throws Exception{
Map<String,String> headerMap = new HashMap<>();
} headerMap.put("Sec-Fetch-Mode", "navigate");
headerMap.put("Sec-Fetch-User", "?1");
headerMap.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3");
headerMap.put("Sec-Fetch-Site", "same-origin");
headerMap.put("Referer", rerferer);
headerMap.put("Cookie", "SUID=EAD6E7733765860A5AEAE09C000ACA78; SUV=00C351E873E7D6EA5AEBCB68E5B81671; wuid=AAGyrPzuHwAAAAqLFD3eFgAAGwY=; pgv_pvi=5713931264; GOTO=; ssuid=5316643370; pex=C864C03270DED3DD8A06887A372DA219231FFAC25A9D64AE09E82AED12E416AC; weixinIndexVisited=1; ABTEST=8|1572271712|v1; SNUID=C5F9D7432F2ABAD638CB0A7A30803056; sct=917; JSESSIONID=aaaR-8KOdPrlZ_KSPKs4w; PHPSESSID=oc296ck54mc3jbgvnu2mar6r40; IPLOC=CN3302");
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(originalUrl, headerMap),ProxyHolder.NAT_HEAVY_PROXY).body().string();
if(StringUtils.isNotBlank(htmlBody) ){
StringBuilder furl = new StringBuilder();
Pattern pa1 = Pattern.compile("url \\+= \'(.*?)\';");
Matcher ma1 = pa1.matcher(htmlBody);
while (ma1.find()) {
furl.append(ma1.group(1));
}
return furl.toString();
}
return null;
}
/**
 * Looks up the {@code openid} for a WeChat account id or name via the Sogou account search.
 *
 * <p>At most two attempts are made. The first attempt that yields a response body has it
 * parsed as JSON and its {@code openid} field returned, even when that field is absent.
 * An attempt that throws prints the stack trace and moves on to the next one.
 *
 * @param idOrName    account id or name, URL-encoded as UTF-8 into the query
 * @param proxyHolder proxy selection handed to the HTTP client
 * @return the {@code openid} value from the response, or {@code null} when it is missing
 *         or every attempt failed
 */
public static String getOpenId(String idOrName, ProxyHolder proxyHolder) {
    final String requestUrl = "https://weixin.sogou.com/weixin?zhnss=1&type=1&ie=utf8&query="
            + URLCodeUtil.getURLEncode(idOrName, "utf-8");

    int attemptsLeft = 2;
    while (attemptsLeft-- > 0) {
        try {
            String responseText = httpBoot.syncCall(RequestUtils.wrapGet(requestUrl), proxyHolder).body().string();
            if (responseText == null) {
                continue;
            }
            return JSONObject.parseObject(responseText).getString("openid");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    return null;
}
}
//package com.zhiwei.wechat.example; package com.zhiwei.wechat.example;
//
//import java.io.UnsupportedEncodingException; import java.io.IOException;
//import java.net.UnknownHostException; import java.io.UnsupportedEncodingException;
//import java.util.ArrayList; import java.net.Proxy;
//import java.util.List; import java.net.URLEncoder;
// import java.net.UnknownHostException;
//import org.slf4j.Logger; import java.util.ArrayList;
//import org.slf4j.LoggerFactory; import java.util.HashMap;
// import java.util.List;
//import com.zhiwei.common.config.GroupType; import java.util.Map;
//import com.zhiwei.crawler.proxy.ProxyFactory; import java.util.regex.Matcher;
//import com.zhiwei.crawler.proxy.ProxyHolder; import java.util.regex.Pattern;
//import com.zhiwei.wechat.entity.WechatAricle;
//import com.zhiwei.wechat.search.WechatAritcleSearch; import com.zhiwei.crawler.core.HttpBoot;
// import com.zhiwei.crawler.utils.RequestUtils;
///** import org.apache.commons.lang3.StringUtils;
// * @ClassName: WechatSearchExample import org.slf4j.Logger;
// * @Description: TODO(根据关键词等采集数据) import org.slf4j.LoggerFactory;
// * @author hero
// * @date 2016年12月16日 上午9:15:42 import com.zhiwei.common.config.GroupType;
// */ import com.zhiwei.crawler.proxy.ProxyFactory;
//public class WechatSearchExample{ import com.zhiwei.crawler.proxy.ProxyHolder;
// import com.zhiwei.wechat.entity.WechatAricle;
// private Logger logger = LoggerFactory.getLogger(WechatSearchExample.class); import com.zhiwei.wechat.search.WechatAritcleSearch;
// private static final String registry = "zookeeper://192.168.0.36:2181";
// private static final String group = "local"; /**
// * @ClassName: WechatSearchExample
// public static void main(String[] args) { * @Description: TODO(根据关键词等采集数据)
// ProxyFactory.init(registry, group, GroupType.PROVIDER); * @author hero
// try { * @date 2016年12月16日 上午9:15:42
// WechatSearchExample.wechatSearchExample(); */
// } catch (UnknownHostException e) { public class WechatSearchExample{
// e.printStackTrace();
// } private Logger logger = LoggerFactory.getLogger(WechatSearchExample.class);
// } private static final String registry = "zookeeper://192.168.0.36:2181";
// private static final String group = "local";
// private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
// public static void wechatSearchExample() throws UnknownHostException private static Proxy proxy = null;
// {
// List<String> wordList = new ArrayList<String>(); public static void main(String[] args) {
// wordList.add("京东"); ProxyFactory.init(registry, group, GroupType.PROVIDER,10000018);
// for(String word : wordList) proxy = ProxyHolder.SOUGOU_INNER_PROXY.getProxy();
// { try {
// try { WechatSearchExample.wechatSearchExample();
// List<WechatAricle> list = WechatAritcleSearch.wechatKeywordSearch(word, 5, null,"2019-07-24", "2019-07-24", ProxyHolder.SOUGOU_INNER_PROXY.getProxy(), 21); } catch (UnknownHostException e) {
// System.out.println("======"+list.size()); e.printStackTrace();
// for(WechatAricle wechat : list){ }
//// System.out.println(wechat.getTitle()); }
// }
// } catch (UnsupportedEncodingException e) {
// e.printStackTrace(); public static void wechatSearchExample() throws UnknownHostException
// } catch (Exception e) { {
// e.printStackTrace(); List<String> wordList = new ArrayList<String>();
// } wordList.add("京东");
//// for(String wxId : wechatIds) for(String word : wordList)
//// { {
//// try { try {
//// logger.info("需要采集的wxId:::{}", wxId); List<WechatAricle> list = WechatAritcleSearch.wechatKeywordSearch(word, 5, null,"2019-10-28", "2019-10-28",proxy, 51);
//// System.out.println("======"+list.size());
//// } catch (UnsupportedEncodingException e) { for(WechatAricle wechat : list){
//// e.printStackTrace(); System.out.println(wechat.getId());
//// } catch (Exception e) { }
//// e.printStackTrace(); } catch (UnsupportedEncodingException e) {
//// } e.printStackTrace();
//// } } catch (Exception e) {
// } e.printStackTrace();
// } }
// // for(String wxId : wechatIds)
// // {
//} // try {
// logger.info("需要采集的wxId:::{}", wxId);
//
// } catch (UnsupportedEncodingException e) {
// e.printStackTrace();
// } catch (Exception e) {
// e.printStackTrace();
// }
// }
}
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment