Commit 81ade9e7 by zhiwei

添加搜狗微信由link获取真实链接(带有效期)方法

parent 4c2e31e6
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId> <groupId>com.zhiwei</groupId>
<artifactId>wechat</artifactId> <artifactId>wechat</artifactId>
<version>1.2.1-SNAPSHOT</version> <version>1.2.2-SNAPSHOT</version>
<description> <description>
知微微信采集程序,包含 知微微信采集程序,包含
1.微信历史文章采集 1.微信历史文章采集
......
...@@ -2,17 +2,9 @@ package com.zhiwei.wechat.search; ...@@ -2,17 +2,9 @@ package com.zhiwei.wechat.search;
import java.io.IOException; import java.io.IOException;
import java.io.UnsupportedEncodingException; import java.io.UnsupportedEncodingException;
import java.net.InetSocketAddress;
import java.net.Proxy; import java.net.Proxy;
import java.net.URLEncoder; import java.net.URLEncoder;
import java.security.MessageDigest; import java.util.*;;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import okhttp3.Headers;
import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
...@@ -71,6 +63,7 @@ public class WechatAritcleSearch { ...@@ -71,6 +63,7 @@ public class WechatAritcleSearch {
} }
boolean f = true; boolean f = true;
int page = 1; int page = 1;
while (f) { while (f) {
String searchUrl = "https://weixin.sogou.com/weixin?type=2&s_from=input&query="+URLEncoder.encode(word, "UTF-8")+"&ie=utf8&_sug_=n&_sug_type_=&page="+ page; String searchUrl = "https://weixin.sogou.com/weixin?type=2&s_from=input&query="+URLEncoder.encode(word, "UTF-8")+"&ie=utf8&_sug_=n&_sug_type_=&page="+ page;
...@@ -84,68 +77,16 @@ public class WechatAritcleSearch { ...@@ -84,68 +77,16 @@ public class WechatAritcleSearch {
// 获取数据 // 获取数据
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(searchUrl, headerMap), proxy, false).body().string(); String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(searchUrl, headerMap), proxy, false).body().string();
// 解析数据 // 解析数据
if (htmlBody != null) { if (StringUtils.isNotBlank(htmlBody) && !htmlBody.contains("输入验证码")) {
try {
// 解析数据
Document document = Jsoup.parse(htmlBody); Document document = Jsoup.parse(htmlBody);
Elements elements = document.select("div.news-box").select("ul.news-list").select("li"); result.addAll(analysis(document));
String title = null;
String link = null;
String content = null;
String source = null;
String openid = null;
String putDate = null;
Date date = null;
WechatAricle wechat = null;
for (Element element : elements) {
try {
title = element.select("div.txt-box").select("h3").text();
link = "https://weixin.sogou.com" + element.select("div.txt-box").select("h3 >a").attr("href");
// link = getRealLink(link, searchUrl);
content = "";
if (element.select("p.txt-info").isEmpty()) {
content = element.select("p.txt-info").text();
} else {
content = element.select("div.txt-box").select("p.txt-info").text();
}
// System.out.println("content======================"+content);
source = element.select("div.txt-box").select("div.s-p").select("a").text();
source = source.replaceAll(" ", "").trim();
openid = element.select("div.txt-box").select("div.s-p").select("a").attr("i");
putDate = element.select("div.txt-box").select("div.s-p").attr("t");
date = new Date(Long.valueOf(putDate) * 1000);
int readNum = 0;
try {
readNum = Integer.valueOf(element.select("div.txt-box").select("div.s-p")
.select("span.s1").text().trim());
} catch (Exception e) {
readNum = 0;
}
title = ZhiWeiTools.SBC2DBC(title);
content = ZhiWeiTools.SBC2DBC(content);
wechat = new WechatAricle(link, title, source, content, date, readNum, 0, openid, "unknow");
result.add(wechat);
} catch (Exception e) {
e.printStackTrace();
logger.debug("解析数据出现错误:{}", e);
}
}
// 解析最大可寻页码 // 解析最大可寻页码
String pageNext = document.select("[id=pagebar_container]>a").text(); String pageNext = document.select("[id=pagebar_container]>a").text();
if (pageNext.contains("下一页")) { if (pageNext.contains("下一页")) {
// logger.info("采集到 {} 页" , page);
page++; page++;
} else { } else {
f = false; f = false;
} }
// logger.info("数据总页数为:{}", page);
} catch (Exception e) {
logger.debug("获取数据出现问题:{}", e.getMessage());
return result;
}
} else { } else {
logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", htmlBody); logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", htmlBody);
} }
...@@ -202,53 +143,9 @@ public class WechatAritcleSearch { ...@@ -202,53 +143,9 @@ public class WechatAritcleSearch {
// 获取数据 // 获取数据
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(searchUrl, headerMap), proxy, false).body().string(); String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(searchUrl, headerMap), proxy, false).body().string();
// 解析数据 // 解析数据
if (htmlBody != null) { if (StringUtils.isNotBlank(htmlBody) && !htmlBody.contains("输入验证码")) {
try {
// 解析数据
Document document = Jsoup.parse(htmlBody); Document document = Jsoup.parse(htmlBody);
Elements elements = document.select("div.news-box").select("ul.news-list").select("li"); result.addAll(analysis(document));
String title = null;
String link = null;
String content = null;
String source = null;
String openid = null;
String putDate = null;
Date date = null;
WechatAricle wechat = null;
for (Element element : elements) {
try {
title = element.select("div.txt-box").select("h3").text();
link = link = "https://weixin.sogou.com" + element.select("div.txt-box").select("h3 >a").attr("href");
// link = getRealLink(link, searchUrl);
content = "";
if (element.select("p.txt-info").isEmpty()) {
content = element.select("p.txt-info").text();
} else {
content = element.select("div.txt-box").select("p.txt-info").text();
}
// System.out.println("content======================"+content);
source = element.select("div.txt-box").select("div.s-p").select("a").text();
openid = element.select("div.txt-box").select("div.s-p").select("a").attr("i");
putDate = element.select("div.txt-box").select("div.s-p").attr("t");
date = new Date(Long.valueOf(putDate) * 1000);
int readNum = 0;
try {
readNum = Integer.valueOf(element.select("div.txt-box").select("div.s-p")
.select("span.s1").text().trim());
} catch (Exception e) {
readNum = 0;
}
title = ZhiWeiTools.SBC2DBC(title);
content = ZhiWeiTools.SBC2DBC(content);
wechat = new WechatAricle(link, title, source, content, date, readNum, 0, openid, "unknow");
wechat = getWechatAricleInfo(link, proxyHolder, wechat);
result.add(wechat);
} catch (Exception e) {
logger.debug("解析数据出现错误:{}", e.getMessage());
continue;
}
}
// 解析最大可寻页码 // 解析最大可寻页码
String pageNext = document.select("[id=pagebar_container]>a").text(); String pageNext = document.select("[id=pagebar_container]>a").text();
if (pageNext.contains("下一页")) { if (pageNext.contains("下一页")) {
...@@ -256,15 +153,9 @@ public class WechatAritcleSearch { ...@@ -256,15 +153,9 @@ public class WechatAritcleSearch {
} else { } else {
f = false; f = false;
} }
// logger.info("数据总页数为:{}", page);
} catch (Exception e) {
logger.debug("获取数据出现问题:{}", e.getMessage());
return null;
}
} else { } else {
logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", htmlBody); logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", htmlBody);
} }
// ZhiWeiTools.sleep(100);
} }
return result; return result;
} }
...@@ -369,11 +260,34 @@ public class WechatAritcleSearch { ...@@ -369,11 +260,34 @@ public class WechatAritcleSearch {
headerMap.put("Referer", searchUrl); headerMap.put("Referer", searchUrl);
// 获取数据 // 获取数据
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(searchUrl, headerMap), proxyHolder, true).body().string(); String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(searchUrl, headerMap), proxyHolder, true).body().string();
if (StringUtils.isNotBlank(htmlBody) && !htmlBody.contains("输入验证码")) {
Document document = Jsoup.parse(htmlBody);
result.addAll(analysis(document));
// 解析最大可寻页码
String pageNext = document.select("[id=pagebar_container]>a").text();
if (pageNext.contains("下一页")) {
page++;
} else {
f = false;
}
} else {
logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", htmlBody);
}
}
return result;
}
/**
* 解析数据
* @param document
* @return
*/
private static List<WechatAricle> analysis(Document document){
List<WechatAricle> result = new ArrayList<WechatAricle>();
// 解析数据 // 解析数据
if (htmlBody != null) {
try { try {
// 解析数据 // 解析数据
Document document = Jsoup.parse(htmlBody);
Elements elements = document.select("div.news-box").select("ul.news-list").select("li"); Elements elements = document.select("div.news-box").select("ul.news-list").select("li");
String title = null; String title = null;
String link = null; String link = null;
...@@ -387,7 +301,7 @@ public class WechatAritcleSearch { ...@@ -387,7 +301,7 @@ public class WechatAritcleSearch {
try { try {
title = element.select("div.txt-box").select("h3").text(); title = element.select("div.txt-box").select("h3").text();
link = "https://weixin.sogou.com" + element.select("div.txt-box").select("h3 >a").attr("href"); link = "https://weixin.sogou.com" + element.select("div.txt-box").select("h3 >a").attr("href");
// link = getRealLink(link, searchUrl); link = WechatReal.getRealLink(link);
content = ""; content = "";
if (element.select("p.txt-info").isEmpty()) { if (element.select("p.txt-info").isEmpty()) {
content = element.select("p.txt-info").text(); content = element.select("p.txt-info").text();
...@@ -406,116 +320,23 @@ public class WechatAritcleSearch { ...@@ -406,116 +320,23 @@ public class WechatAritcleSearch {
} catch (Exception e) { } catch (Exception e) {
readNum = 0; readNum = 0;
} }
title = ZhiWeiTools.SBC2DBC(title); title = ZhiWeiTools.SBC2DBC(title);
content = ZhiWeiTools.SBC2DBC(content); content = ZhiWeiTools.SBC2DBC(content);
wechat = new WechatAricle(link, title, source, content, date, readNum, 0, openid, "unknow"); wechat = new WechatAricle(link, title, source, content, date, readNum, 0, openid, "unknow");
wechat = getWechatAricleInfo(link, proxyHolder, wechat);
result.add(wechat); result.add(wechat);
} catch (Exception e) { } catch (Exception e) {
logger.debug("解析数据出现错误:{}", e.getMessage()); logger.debug("解析数据出现错误:{}", e.getMessage());
continue; continue;
} }
} }
// 解析最大可寻页码
String pageNext = document.select("[id=pagebar_container]>a").text();
if (pageNext.contains("下一页")) {
page++;
} else {
f = false;
}
// logger.info("数据总页数为:{}", page); // logger.info("数据总页数为:{}", page);
} catch (Exception e) { } catch (Exception e) {
logger.debug("获取数据出现问题:{}", e.getMessage()); logger.debug("获取数据出现问题:{}", e.getMessage());
return null; return null;
} }
} else {
logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", htmlBody);
}
// ZhiWeiTools.sleep(100);
}
return result; return result;
} }
/**
* 获取真实链接
* @param originalUrl
* @return
* @throws IOException
*/
public static String getRealLink(String originalUrl,String cookie) throws Exception{
int b = (int) (Math.floor(100 * Math.random()) + 1);
int a = originalUrl.indexOf("url=");
int c = originalUrl.indexOf("&k=");
String d = null;
if (a != -1 && -1 == c) {
d = originalUrl.substring(a + 25 + b, a + 26 + b);
}
originalUrl += "&k=" + b + "&h=" + d;
String realUrl = getFinalUrl(originalUrl, cookie);
return realUrl;
}
/**
* 获取真实链接
* @param originalUrl
* @return
* @throws Exception
*/
private static String getFinalUrl(String originalUrl,String cookie) throws Exception{
String word = originalUrl.split("query=")[1];
String searchUrl = "https://weixin.sogou.com/weixin?query="+ word;
if(StringUtils.isNotBlank(cookie)){
Map<String,String> headerMap = new HashMap<>();
headerMap.put("Sec-Fetch-Mode", "navigate");
headerMap.put("Sec-Fetch-User", "?1");
headerMap.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3");
headerMap.put("Sec-Fetch-Site", "same-origin");
headerMap.put("Host", "weixin.sogou.com");
headerMap.put("Referer", searchUrl);
headerMap.put("Cookie", cookie);
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(originalUrl, headerMap), ProxyHolder.NAT_HEAVY_PROXY).body().string();
if(StringUtils.isNotBlank(htmlBody) ){
StringBuilder furl = new StringBuilder();
Pattern pa1 = Pattern.compile("url \\+= \'(.*?)\';");
Matcher ma1 = pa1.matcher(htmlBody);
while (ma1.find()) {
furl.append(ma1.group(1));
}
return furl.toString();
}
}
return null;
}
public static String getRealUrlCookie(String originalUrl) throws Exception{
String word = originalUrl.split("query=")[1];
String searchUrl = "https://weixin.sogou.com/weixin?type=2&s_from=input&query="+ word +"&ie=utf8&_sug_=n&_sug_type_=&page=1";
Map<String,String> headerMap = HeaderTool.getCommonHead();
headerMap.put("Host", "weixin.sogou.com");
headerMap.put("Referer", searchUrl);
String cookie = "";
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(searchUrl, headerMap),ProxyHolder.NAT_HEAVY_PROXY, false).body().string();
if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("snuid")){
StringBuilder furl = new StringBuilder();
Pattern pa1 = Pattern.compile("\"snuid\" : \"(.*?)\",");
Matcher ma1 = pa1.matcher(htmlBody);
while (ma1.find()) {
furl.append(ma1.group(1));
}
return "SNUID=" + furl.toString();
}
return null;
}
/** /**
* @Title: getOpenId * @Title: getOpenId
* @Description: 获取微信wxID * @Description: 获取微信wxID
...@@ -543,7 +364,6 @@ public class WechatAritcleSearch { ...@@ -543,7 +364,6 @@ public class WechatAritcleSearch {
openId = null; openId = null;
} }
} }
return openId; return openId;
} }
......
package com.zhiwei.wechat.search;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.io.IOException;
import java.net.Proxy;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Resolves a Sogou WeChat search-result redirect link
 * ({@code https://weixin.sogou.com/link?url=...&query=...}) into the article URL
 * that the redirect page assembles in its inline script.
 *
 * <p>All HTTP traffic goes through one shared {@code HttpBoot} instance built with
 * a cookie jar and three transport-level retries.
 */
public class WechatReal {

    private static final Logger logger = LogManager.getLogger(WechatReal.class);

    private static final HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).useCookieJar(true).build();

    /** Matches every {@code url += '...';} fragment of the redirect page; compiled once instead of per attempt. */
    private static final Pattern URL_FRAGMENT = Pattern.compile("url \\+= \'(.*?)\';");

    /** Number of times the search-page + redirect-page request pair is attempted. */
    private static final int MAX_ATTEMPTS = 3;

    /** Fixed distance from the start of {@code url=} to the base position of the {@code h} character. */
    private static final int SIGN_OFFSET = 25;

    private static final String QUERY_KEY = "query=";

    /**
     * Resolves a Sogou redirect link to the real article URL.
     *
     * @param originalUrl the redirect link taken from a search-result page; it must
     *                    contain a {@code query=} parameter
     * @return the resolved URL, or {@code null} when {@code originalUrl} is blank or
     *         no attempt produced a usable redirect page
     * @throws IllegalArgumentException if {@code originalUrl} has no {@code query=} parameter
     * @throws Exception declared for backward compatibility with existing callers
     */
    public static String getRealLink(String originalUrl) throws Exception {
        if (StringUtils.isBlank(originalUrl)) {
            return null;
        }
        Proxy proxy = ProxyHolder.NAT_HEAVY_PROXY.getProxy();
        return getFinalUrl(sign(originalUrl), proxy);
    }

    /**
     * Appends the {@code k}/{@code h} parameters to a redirect link: {@code k} is a
     * random integer in [1, 100] and {@code h} is the single character found
     * {@code SIGN_OFFSET + k} positions after the start of {@code url=}.
     *
     * <p>A link without {@code url=} or one that already carries {@code &k=} is
     * returned unchanged (the previous code appended {@code &h=null} to it).
     * NOTE(review): this presumably mirrors the click handler of the Sogou result
     * page - confirm against the live page script.
     */
    private static String sign(String originalUrl) {
        int urlParamStart = originalUrl.indexOf("url=");
        if (urlParamStart == -1 || originalUrl.contains("&k=")) {
            return originalUrl;
        }
        int k = (int) (Math.floor(100 * Math.random()) + 1);
        int hIndex = urlParamStart + SIGN_OFFSET + k;
        // A short link must not blow up with StringIndexOutOfBoundsException; fall back to an empty h.
        String h = hIndex < originalUrl.length() ? originalUrl.substring(hIndex, hIndex + 1) : "";
        return originalUrl + "&k=" + k + "&h=" + h;
    }

    /**
     * Returns the value of the {@code query=} parameter, stopping at the next
     * {@code &} so that parameters appended after it (such as {@code k}/{@code h})
     * do not leak into the search URL.
     *
     * @throws IllegalArgumentException if the link has no {@code query=} parameter
     */
    private static String extractQuery(String url) {
        int keyStart = url.indexOf(QUERY_KEY);
        if (keyStart == -1) {
            throw new IllegalArgumentException("Sogou link has no query parameter: " + url);
        }
        int valueStart = keyStart + QUERY_KEY.length();
        int valueEnd = url.indexOf('&', valueStart);
        return valueEnd == -1 ? url.substring(valueStart) : url.substring(valueStart, valueEnd);
    }

    /**
     * Fetches the redirect page and concatenates the URL fragments found in it.
     *
     * <p>Each attempt first requests the search page for the same keyword and then
     * the redirect link itself, both with the search page as referer. An attempt
     * counts as failed when the body is blank, contains the captcha marker, yields
     * no URL fragment, or the request throws.
     *
     * @param originalUrl the signed redirect link
     * @param proxy       the proxy used for both requests
     * @return the assembled URL, or {@code null} when every attempt failed
     * @throws IllegalArgumentException if {@code originalUrl} has no {@code query=} parameter
     * @throws Exception declared for backward compatibility
     */
    private static String getFinalUrl(String originalUrl, Proxy proxy) throws Exception {
        String searchUrl = "https://weixin.sogou.com/weixin?query=" + extractQuery(originalUrl);

        Map<String, String> headerMap = new HashMap<>();
        headerMap.put("Sec-Fetch-Mode", "navigate");
        headerMap.put("Sec-Fetch-User", "?1");
        headerMap.put("Sec-Fetch-Site", "same-origin");
        headerMap.put("Host", "weixin.sogou.com");
        headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36");

        for (int attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
            try {
                headerMap.put("referer", searchUrl);
                // Search page is requested first and its body discarded; presumably this lets
                // the cookie jar pick up the session cookies the redirect page checks - TODO confirm.
                httpBoot.syncCall(RequestUtils.wrapGet(searchUrl, headerMap), proxy).body().string();
                String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(originalUrl, headerMap), proxy).body().string();
                if (StringUtils.isBlank(htmlBody) || htmlBody.contains("验证码")) {
                    continue;
                }
                StringBuilder realUrl = new StringBuilder();
                Matcher fragments = URL_FRAGMENT.matcher(htmlBody);
                while (fragments.find()) {
                    realUrl.append(fragments.group(1));
                }
                // A page without any fragment is not a usable answer; retry instead of returning "".
                if (realUrl.length() > 0) {
                    return realUrl.toString();
                }
            } catch (Exception e) {
                // Pass the exception as the trailing argument so the stack trace is logged as well.
                logger.error("获取真实链接时出现错误,错误为:{}", e.getMessage(), e);
            }
        }
        return null;
    }
}
...@@ -5,16 +5,17 @@ import java.io.UnsupportedEncodingException; ...@@ -5,16 +5,17 @@ import java.io.UnsupportedEncodingException;
import java.net.Proxy; import java.net.Proxy;
import java.net.URLEncoder; import java.net.URLEncoder;
import java.net.UnknownHostException; import java.net.UnknownHostException;
import java.util.ArrayList; import java.util.*;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.utils.RequestUtils; import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.tools.URLCodeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
...@@ -36,50 +37,173 @@ public class WechatSearchExample{ ...@@ -36,50 +37,173 @@ public class WechatSearchExample{
private Logger logger = LoggerFactory.getLogger(WechatSearchExample.class); private Logger logger = LoggerFactory.getLogger(WechatSearchExample.class);
private static final String registry = "zookeeper://192.168.0.36:2181"; private static final String registry = "zookeeper://192.168.0.36:2181";
private static final String group = "local"; private static final String group = "local";
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).useCookieJar(true).build();
private static Proxy proxy = null;
public static void main(String[] args) { public static void main(String[] args) {
ProxyFactory.init(registry, group, GroupType.PROVIDER,10000018); ProxyFactory.init(registry, group, GroupType.PROVIDER,10000018);
proxy = ProxyHolder.SOUGOU_INNER_PROXY.getProxy();
String url = "https://weixin.sogou.com/link?url=dn9a_-gY295K0Rci_xozVXfdMkSQTLW6cwJThYulHEtVjXrGTiVgS8CMDfv9wh9qo5s-_tRRSYjdmlThuLl1UVqXa8Fplpd9fM3bn57YTm93DADHLmE53r3LNleAx90O6EdWFlMiLgABVb5FRuhnTbO_GzJrBhvROBdUYdPJ-HwpjtEvi_VZoCFXsP2Y8IMUHXuJCM5s6KSDiXUXG84daahQ5c0PemEIN_1vJiPn8w7tlTjPfiB-Z3QxFXEDiCN7KNRVfuxuX7N535pjGOOjYg..&type=2&query=%E8%85%BE%E8%AE%AF"; Proxy proxy = ProxyHolder.SOUGOU_INNER_PROXY.getProxy();
try{ // String url = "https://weixin.sogou.com/link?url=dn9a_-gY295K0Rci_xozVXfdMkSQTLW6cwJThYulHEtVjXrGTiVgSzcttWBfUfRyBteZJZKwOQZcZaXkLh7iD1qXa8Fplpd9OIASmEBDDgpc-DopMAxHDRa5rMUETB5W4jcmy1RslCj6dRdWlI71gTiuwjp2qvcTJ8ryfwJWyrd9awnq8kg4J-jH9rgNij43NIxLSEyMEC0OFckdi_fmA1TpUaYEJzIlQ9H-i95UM3h5UwmbSJx95X6FkyXmgknK9g_68U3LLV9hlgeRt7bSzA..&type=2&query=%E4%BA%BA%E6%B0%91%E6%97%A5%E6%8A%A5";
String cookie = WechatAritcleSearch.getRealUrlCookie(url); // try {
if(StringUtils.isNotBlank(cookie)){ //// String cookie = getRealUrlCookie(url, proxy);
System.out.println("cookie============="+cookie); //// System.out.println("cookie==================="+cookie);
boolean f = true; //
int i = 0; // String realUrl = getRealLink(url, proxy);
while(f){ //
// System.out.println("realUrl==================="+realUrl);
// } catch (Exception e) {
// e.printStackTrace();
// }
try{ try{
String link = WechatAritcleSearch.getRealLink(url, cookie); WechatAritcleSearch.wechatKeywordSearch("京东", 5, null, "2019-10-01", "2019-10-01", proxy, 10);
if(StringUtils.isNotBlank(link) && link.contains("s?src=")){
System.out.println(i+++"=========="+link);
}else{
System.out.println(i+++"=========="+link);
}
ZhiWeiTools.sleep(3000);
}catch (Exception e){ }catch (Exception e){
ZhiWeiTools.sleep(50);
e.printStackTrace(); e.printStackTrace();
} }
}
/**
* 获取真实链接
* @param originalUrl
* @return
* @throws IOException
*/
public static String getRealLink(String originalUrl,Proxy proxy) throws Exception{
int b = (int) (Math.floor(100 * Math.random()) + 1);
int a = originalUrl.indexOf("url=");
int c = originalUrl.indexOf("&k=");
String d = null;
if (a != -1 && -1 == c) {
d = originalUrl.substring(a + 25 + b, a + 26 + b);
}
originalUrl += "&k=" + b + "&h=" + d;
String realUrl = getFinalUrl(originalUrl, proxy);
return realUrl;
}
/**
* 获取真实链接
* @param originalUrl
* @return
* @throws Exception
*/
private static String getFinalUrl(String originalUrl, Proxy proxy) throws Exception{
Map<String,String> headerMap = new HashMap<>();
headerMap.put("Sec-Fetch-Mode", "navigate");
headerMap.put("Sec-Fetch-User", "?1");
headerMap.put("Sec-Fetch-Site", "same-origin");
headerMap.put("Host", "weixin.sogou.com");
headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36");
String word = originalUrl.split("query=")[1];
String searchUrl = "https://weixin.sogou.com/weixin?query="+ word;
headerMap.put("referer", searchUrl);
httpBoot.syncCall(RequestUtils.wrapGet(searchUrl, headerMap), proxy).body().string();
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(originalUrl, headerMap), proxy).body().string();
headerMap.put("referer", searchUrl);
if(StringUtils.isNotBlank(htmlBody) && !htmlBody.contains("验证码")){
StringBuilder furl = new StringBuilder();
Pattern pa1 = Pattern.compile("url \\+= \'(.*?)\';");
Matcher ma1 = pa1.matcher(htmlBody);
while (ma1.find()) {
furl.append(ma1.group(1));
}
return furl.toString();
} }
}else { return null;
System.out.println("cookie============="+cookie); }
public static String getRealUrlCookie(String originalUrl, Proxy proxy) throws Exception{
String word = originalUrl.split("query=")[1];
String searchUrl = "https://weixin.sogou.com/weixin?type=2&s_from=input&query="+ word +"&ie=utf8&_sug_=n&_sug_type_=&page=1";
Map<String,String> headerMap = HeaderTool.getCommonHead();
headerMap.put("Host", "weixin.sogou.com");
headerMap.put("Referer", searchUrl);
headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36");
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(searchUrl, headerMap), proxy,false).body().string();
if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("snuid")){
Matcher matcher = Pattern.compile("var uigs_para = [\\s\\S]+?;").matcher(htmlBody);
matcher.find();
String str = matcher.group().replaceAll("var uigs_para = |;", "");
try {
str = str.toString().replaceAll(" passportUserId \\? \"1\" \\:", "");
JSONObject data = JSONObject.parseObject(str);
str = str.replaceAll("\\s","").replaceAll("\":\"","=").replaceAll("\",\"","&").replaceAll("\\{\"|\"\\}","");
String ac = URLCodeUtil.getURLEncode(str, "utf-8");
String cookieUrl = "https://pb.sogou.com/cl.gif?uigs_cl=article_title_6&href="+originalUrl+"&uigs_refer=https://weixin.sogou.com/&uigs_t="+ ac +"&right=right0_0&exp_id=null_0-null_1-null_2-null_3-null_4-null_5-null_6-null_7-null_8-null_9";
headerMap.put("Host", "pb.sogou.com");
headerMap.put("Referer", searchUrl);
headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36");
Response response = httpBoot.syncCall(RequestUtils.wrapGet(cookieUrl, headerMap), proxy,false);
if(Objects.nonNull(response)){
String cookie = "";
List<String> headersResponse = response.headers("Set-Cookie");
for (String header : headersResponse) {
System.out.println("head======"+header);
cookie += header.split(";")[0];
}
return "uuid=" + data.getString("uuid")+ ";snuid="+ data.getString("snuid")+ ";" +cookie + "IPLOC=CN3302; sct=1;weixinIndexVisited=1";
} }
}catch (Exception e){ }catch (Exception e){
e.printStackTrace(); e.printStackTrace();
} }
}else{
System.out.println("-----------------"+ htmlBody);
}
return null;
}
public static String getCookie(String htmlBody) throws Exception{
// try { String cookie = "";
// WechatSearchExample.wechatSearchExample(); if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("snuid")){
// } catch (UnknownHostException e) { StringBuilder furl = new StringBuilder();
// e.printStackTrace(); Pattern pa1 = Pattern.compile("\"snuid\" : \"(.*?)\",");
// } Matcher ma1 = pa1.matcher(htmlBody);
while (ma1.find()) {
furl.append(ma1.group(1));
}
return "SNUID=" + furl.toString();
}
return null;
} }
public static void wechatSearchExample() throws UnknownHostException public static void wechatSearchExample() throws UnknownHostException
{ {
List<String> wordList = new ArrayList<String>(); List<String> wordList = new ArrayList<String>();
...@@ -87,7 +211,7 @@ public class WechatSearchExample{ ...@@ -87,7 +211,7 @@ public class WechatSearchExample{
for(String word : wordList) for(String word : wordList)
{ {
try { try {
List<WechatAricle> list = WechatAritcleSearch.wechatKeywordSearch(word, 5, null,"2019-10-28", "2019-10-28",proxy, 3); List<WechatAricle> list = WechatAritcleSearch.wechatKeywordSearch(word, 5, null,"2019-10-28", "2019-10-28", null, 3);
System.out.println("======"+list.size()); System.out.println("======"+list.size());
for(WechatAricle wechat : list){ for(WechatAricle wechat : list){
System.out.println(wechat.getId()); System.out.println(wechat.getId());
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment