Commit 7db2a9e8 by zhiwei

分享链接消失并失效,解析改为有验证码的链接

parent 7ad96e77
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId> <groupId>com.zhiwei</groupId>
<artifactId>wechat</artifactId> <artifactId>wechat</artifactId>
<version>1.1.7-SNAPSHOT</version> <version>1.1.8-SNAPSHOT</version>
<description> <description>
知微微信采集程序,包含 知微微信采集程序,包含
1.微信历史文章采集 1.微信历史文章采集
...@@ -91,7 +91,7 @@ ...@@ -91,7 +91,7 @@
<dependency> <dependency>
<groupId>com.zhiwei.crawler</groupId> <groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId> <artifactId>crawler-core</artifactId>
<version>0.5.2-SNAPSHOT</version> <version>0.5.5.6-SNAPSHOT</version>
<scope>provided</scope> <scope>provided</scope>
</dependency> </dependency>
</dependencies> </dependencies>
......
...@@ -4,10 +4,9 @@ import java.io.IOException; ...@@ -4,10 +4,9 @@ import java.io.IOException;
import java.io.UnsupportedEncodingException; import java.io.UnsupportedEncodingException;
import java.net.Proxy; import java.net.Proxy;
import java.net.URLEncoder; import java.net.URLEncoder;
import java.util.ArrayList; import java.util.*;
import java.util.Date; import java.util.regex.Matcher;
import java.util.List; import java.util.regex.Pattern;
import java.util.Map;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
...@@ -54,7 +53,7 @@ public class WechatAritcleSearch { ...@@ -54,7 +53,7 @@ public class WechatAritcleSearch {
* @param * @param
* pages 需要限制返回页数的总页数(如返回前20页则传21),如没有限制页数则传null * pages 需要限制返回页数的总页数(如返回前20页则传21),如没有限制页数则传null
* @throws * @throws
* ZhiWeiException * Exception
* @return List<Wechat> 返回类型 * @return List<Wechat> 返回类型
*/ */
public static List<WechatAricle> wechatKeywordSearch(String word, int tsn, String cookie,String startTime, String endTime, public static List<WechatAricle> wechatKeywordSearch(String word, int tsn, String cookie,String startTime, String endTime,
...@@ -68,16 +67,15 @@ public class WechatAritcleSearch { ...@@ -68,16 +67,15 @@ public class WechatAritcleSearch {
boolean f = true; boolean f = true;
int page = 1; int page = 1;
while (f) { while (f) {
String url = "http://weixin.sogou.com/weixin?type=2&query=" + URLEncoder.encode(word, "UTF-8") String searchUrl = "http://weixin.sogou.com/weixin?type=2&query=" + URLEncoder.encode(word, "UTF-8")
+ "&ie=utf8&_sug_=n&_sug_type_=&ri=1&sourceid=sugg&sst0=" + System.currentTimeMillis() + "&ie=utf8&_sug_=n&_sug_type_=&ri=1&sourceid=sugg&sst0=" + System.currentTimeMillis()
+ "&tsn=" + tsn + "&page=" + page; + "&tsn=" + tsn + "&page=" + page;
if (tsn == 5) { if (tsn == 5) {
url = url + "&ft=" + startTime + "&et=" + endTime + "&wxid=&usip=&interation=&from=tool"; searchUrl = searchUrl + "&ft=" + startTime + "&et=" + endTime + "&wxid=&usip=&interation=&from=tool";
} }
System.out.println(url); headerMap.put("Referer", searchUrl);
headerMap.put("Referer", url);
// 获取数据 // 获取数据
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy, false).body().string(); String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(searchUrl, headerMap), proxy, false).body().string();
// 解析数据 // 解析数据
if (htmlBody != null) { if (htmlBody != null) {
try { try {
...@@ -95,7 +93,10 @@ public class WechatAritcleSearch { ...@@ -95,7 +93,10 @@ public class WechatAritcleSearch {
for (Element element : elements) { for (Element element : elements) {
try { try {
title = element.select("div.txt-box").select("h3").text(); title = element.select("div.txt-box").select("h3").text();
link = element.select("div.txt-box").select("h3 >a").attr("data-share"); link = element.select("div.txt-box").select("h3 >a").attr("href");
// link = getRealLink(link, searchUrl);
content = ""; content = "";
if (element.select("p.txt-info").isEmpty()) { if (element.select("p.txt-info").isEmpty()) {
content = element.select("p.txt-info").text(); content = element.select("p.txt-info").text();
...@@ -121,6 +122,7 @@ public class WechatAritcleSearch { ...@@ -121,6 +122,7 @@ public class WechatAritcleSearch {
wechat = new WechatAricle(link, title, source, content, date, readNum, 0, openid, "unknow"); wechat = new WechatAricle(link, title, source, content, date, readNum, 0, openid, "unknow");
result.add(wechat); result.add(wechat);
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace();
logger.debug("解析数据出现错误:{}", e); logger.debug("解析数据出现错误:{}", e);
} }
} }
...@@ -183,15 +185,15 @@ public class WechatAritcleSearch { ...@@ -183,15 +185,15 @@ public class WechatAritcleSearch {
boolean f = true; boolean f = true;
int page = 1; int page = 1;
while (f) { while (f) {
String url = "http://weixin.sogou.com/weixin?type=2&query=" + URLEncoder.encode(word, "UTF-8") String searchUrl = "http://weixin.sogou.com/weixin?type=2&query=" + URLEncoder.encode(word, "UTF-8")
+ "&ie=utf8&_sug_=n&_sug_type_=" + "&ri=1&sourceid=sugg&sst0=" + System.currentTimeMillis() + "&ie=utf8&_sug_=n&_sug_type_=" + "&ri=1&sourceid=sugg&sst0=" + System.currentTimeMillis()
+ "&tsn=" + tsn + "&page=" + page; + "&tsn=" + tsn + "&page=" + page;
if (tsn == 5) { if (tsn == 5) {
url = url + "&ft=" + startTime + "&et=" + endTime + "&wxid=&usip=&interation=&from=tool"; searchUrl = searchUrl + "&ft=" + startTime + "&et=" + endTime + "&wxid=&usip=&interation=&from=tool";
} }
headerMap.put("Referer", url); headerMap.put("Referer", searchUrl);
// 获取数据 // 获取数据
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy, false).body().string(); String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(searchUrl, headerMap), proxy, false).body().string();
// 解析数据 // 解析数据
if (htmlBody != null) { if (htmlBody != null) {
try { try {
...@@ -209,7 +211,8 @@ public class WechatAritcleSearch { ...@@ -209,7 +211,8 @@ public class WechatAritcleSearch {
for (Element element : elements) { for (Element element : elements) {
try { try {
title = element.select("div.txt-box").select("h3").text(); title = element.select("div.txt-box").select("h3").text();
link = element.select("div.txt-box").select("h3 >a").attr("data-share"); link = element.select("div.txt-box").select("h3 >a").attr("href");
// link = getRealLink(link, searchUrl);
content = ""; content = "";
if (element.select("p.txt-info").isEmpty()) { if (element.select("p.txt-info").isEmpty()) {
content = element.select("p.txt-info").text(); content = element.select("p.txt-info").text();
...@@ -263,7 +266,6 @@ public class WechatAritcleSearch { ...@@ -263,7 +266,6 @@ public class WechatAritcleSearch {
* 获取全文及来源 * 获取全文及来源
* @param url * @param url
* @param proxy * @param proxy
* @param headerMap
* @param wechatAricle * @param wechatAricle
* @return * @return
* @throws IOException * @throws IOException
...@@ -329,10 +331,8 @@ public class WechatAritcleSearch { ...@@ -329,10 +331,8 @@ public class WechatAritcleSearch {
* 根据关键词采集指定时间+账号的数据 * 根据关键词采集指定时间+账号的数据
* @param word * @param word
* @param idOrName * @param idOrName
* @param tsn
* @param startTime * @param startTime
* @param endTime * @param endTime
* @param proxy
* @param proxyHolder * @param proxyHolder
* @return * @return
* @throws Exception * @throws Exception
...@@ -355,14 +355,13 @@ public class WechatAritcleSearch { ...@@ -355,14 +355,13 @@ public class WechatAritcleSearch {
int page = 1; int page = 1;
while (f) { while (f) {
String url = "https://weixin.sogou.com/weixin?type=2&ie=utf8&query=" + URLEncoder.encode(word, "UTF-8") String searchUrl = "https://weixin.sogou.com/weixin?type=2&ie=utf8&query=" + URLEncoder.encode(word, "UTF-8")
+ "&tsn=5&ft=" + startTime + "&et=" + endTime +"&interation=&page=" + page+"&wxid="+ openId + "&tsn=5&ft=" + startTime + "&et=" + endTime +"&interation=&page=" + page+"&wxid="+ openId
+"&usip=" + URLEncoder.encode(idOrName, "UTF-8"); +"&usip=" + URLEncoder.encode(idOrName, "UTF-8");
headerMap.put("Referer", url); headerMap.put("Referer", searchUrl);
System.out.println(url);
// 获取数据 // 获取数据
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxyHolder, true).body().string(); String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(searchUrl, headerMap), proxyHolder, true).body().string();
// 解析数据 // 解析数据
if (htmlBody != null) { if (htmlBody != null) {
try { try {
...@@ -380,7 +379,8 @@ public class WechatAritcleSearch { ...@@ -380,7 +379,8 @@ public class WechatAritcleSearch {
for (Element element : elements) { for (Element element : elements) {
try { try {
title = element.select("div.txt-box").select("h3").text(); title = element.select("div.txt-box").select("h3").text();
link = element.select("div.txt-box").select("h3 >a").attr("data-share"); link = element.select("div.txt-box").select("h3 >a").attr("href");
// link = getRealLink(link, searchUrl);
content = ""; content = "";
if (element.select("p.txt-info").isEmpty()) { if (element.select("p.txt-info").isEmpty()) {
content = element.select("p.txt-info").text(); content = element.select("p.txt-info").text();
...@@ -431,11 +431,64 @@ public class WechatAritcleSearch { ...@@ -431,11 +431,64 @@ public class WechatAritcleSearch {
} }
/**
 * Turns the link found in a Sogou WeChat search result into the final article URL.
 *
 * <p>The link is prefixed with {@code https://weixin.sogou.com} (so it is assumed to be a
 * host-relative path - TODO confirm against callers). If it contains a {@code url=} parameter
 * and no {@code &k=} parameter yet, two query parameters are appended: {@code k}, a random
 * integer in [1, 100], and {@code h}, the single character located {@code 25 + k} positions
 * after the start of {@code url=}. The resulting link is then resolved through
 * {@code getFinalUrl}.
 *
 * @param originalUrl link taken from the search result, appended verbatim to the Sogou host
 * @param searchUrl   URL of the search result page; forwarded to {@code getFinalUrl} as the Referer
 * @return the value returned by {@code getFinalUrl} for the signed link
 * @throws Exception propagated from {@code getFinalUrl}
 */
public static String getRealLink(String originalUrl, String searchUrl) throws Exception {
    String link = "https://weixin.sogou.com" + originalUrl;
    int k = (int) (Math.floor(100 * Math.random()) + 1);
    int urlParamIndex = link.indexOf("url=");
    // Only sign links that carry url= and have not been signed already; otherwise the
    // link is resolved as-is instead of receiving a bogus "&h=null" suffix.
    if (urlParamIndex != -1 && !link.contains("&k=")) {
        // NOTE(review): the 25 + k offset presumably mirrors the click script of the search
        // page ("url=" length plus a fixed 21) - confirm against the live page.
        int hIndex = urlParamIndex + 25 + k;
        // A short link may end before the sampled position; use an empty h rather than
        // failing with StringIndexOutOfBoundsException.
        String h = hIndex < link.length() ? link.substring(hIndex, hIndex + 1) : "";
        link += "&k=" + k + "&h=" + h;
    }
    return getFinalUrl(link, searchUrl);
}
/**
 * Downloads the page behind a Sogou redirect link and reassembles the target URL from it.
 *
 * <p>The request is sent through {@code ProxyHolder.NAT_HEAVY_PROXY} with a fixed set of
 * browser-like headers, including a hard-coded Cookie. Every fragment matched by
 * {@code url += '...';} in the response body is concatenated in order of appearance.
 *
 * @param originalUrl link to request
 * @param rerferer    value sent as the {@code Referer} header
 * @return the concatenated fragments (an empty string when none match), or {@code null}
 *         when the response body is blank
 * @throws Exception if the request or reading the response body fails
 */
public static String getFinalUrl(String originalUrl, String rerferer) throws Exception {
    Map<String, String> requestHeaders = new HashMap<>();
    requestHeaders.put("Sec-Fetch-Mode", "navigate");
    requestHeaders.put("Sec-Fetch-User", "?1");
    requestHeaders.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3");
    requestHeaders.put("Sec-Fetch-Site", "same-origin");
    requestHeaders.put("Referer", rerferer);
    requestHeaders.put("Cookie", "SUID=EAD6E7733765860A5AEAE09C000ACA78; SUV=00C351E873E7D6EA5AEBCB68E5B81671; wuid=AAGyrPzuHwAAAAqLFD3eFgAAGwY=; pgv_pvi=5713931264; GOTO=; ssuid=5316643370; pex=C864C03270DED3DD8A06887A372DA219231FFAC25A9D64AE09E82AED12E416AC; weixinIndexVisited=1; ABTEST=8|1572271712|v1; SNUID=C5F9D7432F2ABAD638CB0A7A30803056; sct=917; JSESSIONID=aaaR-8KOdPrlZ_KSPKs4w; PHPSESSID=oc296ck54mc3jbgvnu2mar6r40; IPLOC=CN3302");

    String page = httpBoot
            .syncCall(RequestUtils.wrapGet(originalUrl, requestHeaders), ProxyHolder.NAT_HEAVY_PROXY)
            .body()
            .string();
    if (StringUtils.isBlank(page)) {
        return null;
    }

    // The page builds its redirect target piecewise; stitch the pieces back together.
    Matcher fragments = Pattern.compile("url \\+= \'(.*?)\';").matcher(page);
    StringBuilder assembled = new StringBuilder();
    while (fragments.find()) {
        assembled.append(fragments.group(1));
    }
    return assembled.toString();
}
/** /**
* @Title: getOpenId * @Title: getOpenId
* @Description: TODO(获取微信wxID) * @Description: 获取微信wxID
* @param @param * @param @param
* wxId * wxId
* @param @return * @param @return
......
//package com.zhiwei.wechat.example; package com.zhiwei.wechat.example;
//
//import java.io.UnsupportedEncodingException; import java.io.IOException;
//import java.net.UnknownHostException; import java.io.UnsupportedEncodingException;
//import java.util.ArrayList; import java.net.Proxy;
//import java.util.List; import java.net.URLEncoder;
// import java.net.UnknownHostException;
//import org.slf4j.Logger; import java.util.ArrayList;
//import org.slf4j.LoggerFactory; import java.util.HashMap;
// import java.util.List;
//import com.zhiwei.common.config.GroupType; import java.util.Map;
//import com.zhiwei.crawler.proxy.ProxyFactory; import java.util.regex.Matcher;
//import com.zhiwei.crawler.proxy.ProxyHolder; import java.util.regex.Pattern;
//import com.zhiwei.wechat.entity.WechatAricle;
//import com.zhiwei.wechat.search.WechatAritcleSearch; import com.zhiwei.crawler.core.HttpBoot;
// import com.zhiwei.crawler.utils.RequestUtils;
///** import org.apache.commons.lang3.StringUtils;
// * @ClassName: WechatSearchExample import org.slf4j.Logger;
// * @Description: TODO(根据关键词等采集数据) import org.slf4j.LoggerFactory;
// * @author hero
// * @date 2016年12月16日 上午9:15:42 import com.zhiwei.common.config.GroupType;
// */ import com.zhiwei.crawler.proxy.ProxyFactory;
//public class WechatSearchExample{ import com.zhiwei.crawler.proxy.ProxyHolder;
// import com.zhiwei.wechat.entity.WechatAricle;
// private Logger logger = LoggerFactory.getLogger(WechatSearchExample.class); import com.zhiwei.wechat.search.WechatAritcleSearch;
// private static final String registry = "zookeeper://192.168.0.36:2181";
// private static final String group = "local"; /**
// * @ClassName: WechatSearchExample
// public static void main(String[] args) { * @Description: TODO(根据关键词等采集数据)
// ProxyFactory.init(registry, group, GroupType.PROVIDER); * @author hero
// try { * @date 2016年12月16日 上午9:15:42
// WechatSearchExample.wechatSearchExample(); */
// } catch (UnknownHostException e) { public class WechatSearchExample{
// e.printStackTrace();
// } private Logger logger = LoggerFactory.getLogger(WechatSearchExample.class);
// } private static final String registry = "zookeeper://192.168.0.36:2181";
// private static final String group = "local";
// private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
// public static void wechatSearchExample() throws UnknownHostException private static Proxy proxy = null;
// {
// List<String> wordList = new ArrayList<String>(); public static void main(String[] args) {
// wordList.add("京东"); ProxyFactory.init(registry, group, GroupType.PROVIDER,10000018);
// for(String word : wordList) proxy = ProxyHolder.SOUGOU_INNER_PROXY.getProxy();
try {
WechatSearchExample.wechatSearchExample();
} catch (UnknownHostException e) {
e.printStackTrace();
}
}
public static void wechatSearchExample() throws UnknownHostException
{
List<String> wordList = new ArrayList<String>();
wordList.add("京东");
for(String word : wordList)
{
try {
List<WechatAricle> list = WechatAritcleSearch.wechatKeywordSearch(word, 5, null,"2019-10-28", "2019-10-28",proxy, 51);
System.out.println("======"+list.size());
for(WechatAricle wechat : list){
System.out.println(wechat.getId());
}
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
} catch (Exception e) {
e.printStackTrace();
}
// for(String wxId : wechatIds)
// { // {
// try { // try {
// List<WechatAricle> list = WechatAritcleSearch.wechatKeywordSearch(word, 5, null,"2019-07-24", "2019-07-24", ProxyHolder.SOUGOU_INNER_PROXY.getProxy(), 21); // logger.info("需要采集的wxId:::{}", wxId);
// System.out.println("======"+list.size()); //
// for(WechatAricle wechat : list){
//// System.out.println(wechat.getTitle());
// }
// } catch (UnsupportedEncodingException e) { // } catch (UnsupportedEncodingException e) {
// e.printStackTrace(); // e.printStackTrace();
// } catch (Exception e) { // } catch (Exception e) {
// e.printStackTrace(); // e.printStackTrace();
// } // }
//// for(String wxId : wechatIds)
//// {
//// try {
//// logger.info("需要采集的wxId:::{}", wxId);
////
//// } catch (UnsupportedEncodingException e) {
//// e.printStackTrace();
//// } catch (Exception e) {
//// e.printStackTrace();
//// }
//// }
// }
// } // }
// }
// }
//}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment