Commit 0348e2a1 by shenjunjie

调整微信链接获取方式

parent 4bd06f63
...@@ -6,94 +6,126 @@ import java.util.HashMap; ...@@ -6,94 +6,126 @@ import java.util.HashMap;
import java.util.Map; import java.util.Map;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils; import com.zhiwei.crawler.core.utils.RequestUtils;
import okhttp3.Request;
import okhttp3.Response;
public class WechatReal { public class WechatReal {
private static Logger logger = LogManager.getLogger(WechatReal.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().useCookieJar(true).build(); private static HttpBoot httpBoot = new HttpBoot.Builder().useCookieJar(true).build();
/** private static final Pattern PATTERN = Pattern.compile("url \\+= \'(.*?)\';");
* 获取真实链接 private String cookie = null;
* @param originalUrl
* @return
* @throws IOException
*/
public static String getRealLink(String originalUrl) throws Exception{
    // Resolves the real link for originalUrl using the proxy supplied by
    // ProxyHolder.NAT_HEAVY_PROXY. The proxy is fetched first, the URL is
    // then passed through getOriginalUrl, and the resulting URL is handed to
    // getFinalUrl, whose return value is returned unchanged.
    final Proxy natProxy = ProxyHolder.NAT_HEAVY_PROXY.getProxy();
    final String intermediateUrl = getOriginalUrl(originalUrl);
    return getFinalUrl(intermediateUrl, natProxy);
}
/** /**
* 通过白名单代理ip获取真实链接 * 获取真实链接
* @param originalUrl *
* @param proxy * @param originalUrl
* @return * @return
* @throws Exception * @throws IOException
*/ */
public static String getRealLink(String originalUrl,Proxy proxy) throws Exception{ public String getRealLink(String originalUrl) throws Exception {
originalUrl = getOriginalUrl(originalUrl); Proxy proxy = ProxyHolder.NAT_HEAVY_PROXY.getProxy();
String realUrl = getFinalUrl(originalUrl, proxy); return getFinalUrl(originalUrl, proxy);
return realUrl; }
}
/**
 * Resolves the real link for the given URL through a caller-supplied proxy.
 * <p>
 * NOTE(review): the original comment describes the proxy as a whitelisted
 * proxy IP; nothing in this method enforces that — confirm with callers.
 *
 * @param originalUrl the link to resolve; passed to {@code getFinalUrl} as-is
 * @param proxy       the proxy the request is sent through
 * @return whatever {@code getFinalUrl} returns for this URL and proxy
 * @throws Exception if {@code getFinalUrl} fails
 */
public String getRealLink(String originalUrl, Proxy proxy) throws Exception {
    final String realUrl = getFinalUrl(originalUrl, proxy);
    return realUrl;
}
/** private String getFinalUrl(String url, Proxy proxy) throws IOException {
* 获取转链接的中间跳转链接 Map<String, Object> headers = new HashMap<>();
* @param originalUrl headers.put("Referer", "https://weixin.sogou.com/weixin");
* @return headers.put("User-Agent",
*/ "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36");
private static String getOriginalUrl(String originalUrl){ // 初次获取cookie
int b = (int) (Math.floor(100 * Math.random()) + 1); if (null == cookie) {
int a = originalUrl.indexOf("url="); initCookie(url, headers, proxy);
int c = originalUrl.indexOf("&k="); }
String d = null; headers.put("cookie", cookie);
if (a != -1 && -1 == c) { for (int i = 0; i < 2; i++) {
d = originalUrl.substring(a + 25 + b, a + 26 + b); String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headers), proxy).body().string();
} if (StringUtils.isNotBlank(htmlBody) && !htmlBody.contains("验证码")) {
originalUrl += "&k=" + b + "&h=" + d; StringBuilder furl = new StringBuilder();
return originalUrl; Matcher ma1 = PATTERN.matcher(htmlBody);
} while (ma1.find()) {
furl.append(ma1.group(1));
}
return furl.toString();
} else {
// cookie过期
initCookie(url, headers, proxy);
}
}
throw new NullPointerException("ip:" + proxy.address() + "获取临时链接失败, 出现输入验证码");
}
/**
 * Requests the search page and stores the cookies it sets in the
 * {@code cookie} field.
 * <p>
 * Every {@code set-cookie} response header is reduced to its leading
 * {@code name=value} pair (everything from the first {@code ;} is dropped)
 * and the pairs are joined with {@code "; "}. The joined value is also
 * written into {@code headers} under the {@code "cookie"} key so that a
 * caller reusing the same map sends the refreshed cookie on its next request
 * instead of the previous one.
 *
 * @param url     the link being resolved; only used to build the search URL
 * @param headers request headers for the search request; updated in place
 *                with the new {@code "cookie"} value
 * @param proxy   the proxy the request is sent through
 * @throws IOException if the HTTP call fails
 */
private void initCookie(String url, Map<String, Object> headers, Proxy proxy) throws IOException {
    Request request = RequestUtils.wrapGet(getSearchUrl(url), headers);
    // Close the response once the headers are read; only headers are needed
    // and an unclosed OkHttp response keeps its connection checked out.
    try (Response response = httpBoot.syncCall(request, proxy)) {
        cookie = response.headers("set-cookie").stream().map(s -> s.replaceAll(";.*", ""))
                .collect(Collectors.joining("; "));
    }
    // Keep the caller's header map in sync with the refreshed cookie.
    headers.put("cookie", cookie);
    logger.info("ip:{},初始化cookie:{}", proxy.address(), cookie);
}
/** /**
* 通过普通代理获取临时链接 *
* @param originalUrl * 获取cookie所需的搜索链接
* @param proxy *
* @return * @param url
* @throws Exception * @return String
*/ */
private static String getFinalUrl(String originalUrl, Proxy proxy) throws Exception{ private static String getSearchUrl(String url) {
Map<String,String> headerMap = new HashMap<>(); // return "https://weixin.sogou.com/weixin?query=" + getOriginalUrl(url).split("query=")[1];
headerMap.put("Sec-Fetch-Mode", "navigate"); String timestamp = Long.toString(System.currentTimeMillis());
headerMap.put("Sec-Fetch-User", "?1"); return StringUtils.join(
headerMap.put("Sec-Fetch-Site", "same-origin"); "https://weixin.sogou.com/weixin?type=2&query=%E8%90%A5%E9%94%80&ie=utf8&s_from=input&_sug_=y&_sug_type_=&w=01019900&sut=1314&sst0=",
headerMap.put("Host", "weixin.sogou.com"); timestamp, "&lkt=1%2C", timestamp, "%2C", timestamp);
headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36"); }
String word = originalUrl.split("query=")[1]; /**
String searchUrl = "https://weixin.sogou.com/weixin?query="+ word; * 获取转链接的中间跳转链接
*
* @param originalUrl
* @return
*/
@Deprecated
private static String getOriginalUrl(String originalUrl) {
    // Builds the intermediate redirect link by appending "&k=<k>&h=<h>",
    // where k is a random integer in [1, 100] and h is the single character
    // found k + 25 positions after the start of "url=".
    //
    // The link is returned unchanged when it has no "url=" parameter or
    // already carries "&k=". Appending in those cases would produce
    // "&h=null" and a duplicate "&k=".
    int urlParamIndex = originalUrl.indexOf("url=");
    boolean alreadySigned = originalUrl.indexOf("&k=") != -1;
    if (urlParamIndex == -1 || alreadySigned) {
        return originalUrl;
    }

    int k = (int) (Math.floor(100 * Math.random()) + 1);
    int hIndex = urlParamIndex + 25 + k;
    // A link too short to contain position hIndex yields an empty h value
    // instead of a StringIndexOutOfBoundsException.
    String h = hIndex < originalUrl.length() ? originalUrl.substring(hIndex, hIndex + 1) : "";
    return originalUrl + "&k=" + k + "&h=" + h;
}
headerMap.put("referer", searchUrl); // public static void main(String[] args) throws Exception {
httpBoot.syncCall(RequestUtils.wrapGet(searchUrl, headerMap), proxy).body().string(); // WechatReal real = new WechatReal();
// Proxy proxy = new Proxy(Type.HTTP, new InetSocketAddress("119.3.86.205", 31128));
// String url = "https://weixin.sogou.com/link?url=dn9a_-gY295K0Rci_xozVXfdMkSQTLW6cwJThYulHEtVjXrGTiVgS8FzrTzGEPrGvM6hiNXA4ZFfuz5MvdMSLVqXa8Fplpd9gowHZ2-xDps585u2obuOVGC2ke8iAlwOUW5Vlcs1qv8YeB2DBj_2dTSVEmgoED-M4y9lx6Ykc9IjDA2sWjYtSyDfEXs2p-nZB6QB9v1FTm3sgVx8MYuQh6L7kx32DJ4fKy9a6PM182aN3M2SXrGSIqAH50L-W7WN8EgDyGxD5NruL0unUdKkuw..&type=2&query=%E8%90%A5%E9%94%80&token=3ABD0306D5E9D84C3F3A954539751A493F10FC545F1FCD9F&k=61&h=M";
// String url1 = real.getRealLink(url, proxy);
// System.out.println(url1);
// }
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(originalUrl, headerMap), proxy).body().string();
headerMap.put("referer", searchUrl);
if(StringUtils.isNotBlank(htmlBody) && !htmlBody.contains("验证码")){
StringBuilder furl = new StringBuilder();
Pattern pa1 = Pattern.compile("url \\+= \'(.*?)\';");
Matcher ma1 = pa1.matcher(htmlBody);
while (ma1.find()) {
furl.append(ma1.group(1));
}
return furl.toString();
}else{
throw new NullPointerException("获取临时链接失败, 出现输入验证码");
}
}
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment