Commit 46c203b9 by chenweiyang

爬虫核心包 升级 版本升级至 1.3.3

parent 09b58307
......@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>wechat</artifactId>
<version>1.3.2-SNAPSHOT</version>
<version>1.3.3-SNAPSHOT</version>
<description>
知微微信采集程序,包含
1.微信历史文章采集
......@@ -91,7 +91,7 @@
<dependency>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId>
<version>0.5.6.3-RELEASE</version>
<version>0.6.6.3-SNAPSHOT</version>
<scope>provided</scope>
</dependency>
</dependencies>
......
......@@ -6,7 +6,7 @@ import java.util.Map;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.crawler.core.utils.RequestUtils;
/**
* @ClassName: WechatAccountFans
......
......@@ -14,7 +14,7 @@ import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.tools.tools.URLCodeUtil;
import com.zhiwei.wechat.entity.WechatAccount;
......
......@@ -18,7 +18,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.wechat.util.Tools;
/**
......
......@@ -4,7 +4,11 @@ import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.Proxy;
import java.net.URLEncoder;
import java.util.*;;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
......@@ -15,8 +19,8 @@ import org.jsoup.select.Elements;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.tools.URLCodeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools;
......
......@@ -9,7 +9,7 @@ import org.jsoup.Jsoup;
import org.jsoup.select.Elements;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.crawler.core.utils.RequestUtils;
public class WechatCount {
......
......@@ -6,7 +6,7 @@ import java.util.Map;
import java.util.Map.Entry;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.URLCodeUtil;
......
package com.zhiwei.wechat.search;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.io.IOException;
import java.net.Proxy;
import java.util.HashMap;
......@@ -14,9 +7,14 @@ import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
public class WechatReal {
private static Logger logger = LogManager.getLogger(WechatReal.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().useCookieJar(true).build();
/**
* 获取真实链接
......
package com.zhiwei.wechat.example;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.Proxy;
import java.net.URLEncoder;
import java.net.UnknownHostException;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.tools.URLCodeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.wechat.entity.WechatAricle;
import com.zhiwei.wechat.search.WechatAritcleSearch;
/**
* @ClassName: WechatSearchExample
* @Description: Collects data by keyword and similar criteria
* @author hero
* @date 2016-12-16 09:15:42
*/
public class WechatSearchExample{
// Shared SLF4J logger. Static because every method in this class is static and
// could not otherwise reference it.
private static final Logger logger = LoggerFactory.getLogger(WechatSearchExample.class);
// Registry address handed to ProxyFactory.init in main (hard-coded for this example).
private static final String registry = "zookeeper://192.168.0.36:2181";
// Group name handed to ProxyFactory.init in main.
private static final String group = "local";
// HTTP client shared by all requests in this class, built with retryTimes(3) and useCookieJar(true).
private static final HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).useCookieJar(true).build();
/**
* Example entry point: initializes the proxy factory, takes a proxy from
* ProxyHolder.NAT_HEAVY_PROXY, resolves one hard-coded Sogou WeChat link through
* getRealLink and prints the result to standard output.
* Any exception raised while resolving is printed via printStackTrace and not rethrown.
*
* @param args command-line arguments; not read by this method
*/
public static void main(String[] args) {
// Proxy factory is initialized with the class-level registry and group constants.
// NOTE(review): the meaning of the literal 10000018 is not visible here - confirm against ProxyFactory.init.
ProxyFactory.init(registry, group, GroupType.PROVIDER,10000018);
Proxy proxy = ProxyHolder.NAT_HEAVY_PROXY.getProxy();
// Hard-coded sample link used as the input of getRealLink.
String url = "https://weixin.sogou.com/link?url=dn9a_-gY295K0Rci_xozVXfdMkSQTLW6cwJThYulHEtVjXrGTiVgSzcttWBfUfRyBteZJZKwOQZcZaXkLh7iD1qXa8Fplpd9OIASmEBDDgpc-DopMAxHDRa5rMUETB5W4jcmy1RslCj6dRdWlI71gTiuwjp2qvcTJ8ryfwJWyrd9awnq8kg4J-jH9rgNij43NIxLSEyMEC0OFckdi_fmA1TpUaYEJzIlQ9H-i95UM3h5UwmbSJx95X6FkyXmgknK9g_68U3LLV9hlgeRt7bSzA..&type=2&query=%E4%BA%BA%E6%B0%91%E6%97%A5%E6%8A%A5";
try {
// String cookie = getRealUrlCookie(url, proxy);
// System.out.println("cookie==================="+cookie);
String realUrl = getRealLink(url, proxy);
System.out.println("realUrl==================="+realUrl);
} catch (Exception e) {
e.printStackTrace();
}
// Everything below, up to the closing brace, is disabled code kept as found.
// NOTE(review): it looks like a commented-out copy of this same file - confirm whether it can be deleted.
// try{
// WechatAritcleSearch.wechatKeywordSearch("京东", 5, null, "2019-10-01", "2019-10-01", proxy, 10);
//
// }catch (Exception e){
//package com.zhiwei.wechat.example;
//
//import java.io.IOException;
//import java.io.UnsupportedEncodingException;
//import java.net.Proxy;
//import java.net.UnknownHostException;
//import java.util.ArrayList;
//import java.util.HashMap;
//import java.util.List;
//import java.util.Map;
//import java.util.regex.Matcher;
//import java.util.regex.Pattern;
//
//import org.apache.commons.lang3.StringUtils;
//import org.slf4j.Logger;
//import org.slf4j.LoggerFactory;
//
//import com.zhiwei.common.config.GroupType;
//import com.zhiwei.crawler.core.HttpBoot;
//import com.zhiwei.crawler.core.proxy.ProxyFactory;
//import com.zhiwei.crawler.core.proxy.ProxyHolder;
//import com.zhiwei.crawler.core.utils.RequestUtils;
//import com.zhiwei.wechat.entity.WechatAricle;
//import com.zhiwei.wechat.search.WechatAritcleSearch;
//
///**
// * @ClassName: WechatSearchExample
// * @Description: Collects data by keyword and similar criteria
// * @author hero
// * @date 2016-12-16 09:15:42
// */
//public class WechatSearchExample{
//
// private Logger logger = LoggerFactory.getLogger(WechatSearchExample.class);
// private static final String registry = "zookeeper://192.168.0.36:2181";
// private static final String group = "local";
// private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).useCookieJar(true).build();
//
// public static void main(String[] args) {
// ProxyFactory.init(registry, group, GroupType.PROVIDER,10000018);
//
// Proxy proxy = ProxyHolder.NAT_HEAVY_PROXY.getProxy();
// String url = "https://weixin.sogou.com/link?url=dn9a_-gY295K0Rci_xozVXfdMkSQTLW6cwJThYulHEtVjXrGTiVgSzcttWBfUfRyBteZJZKwOQZcZaXkLh7iD1qXa8Fplpd9OIASmEBDDgpc-DopMAxHDRa5rMUETB5W4jcmy1RslCj6dRdWlI71gTiuwjp2qvcTJ8ryfwJWyrd9awnq8kg4J-jH9rgNij43NIxLSEyMEC0OFckdi_fmA1TpUaYEJzIlQ9H-i95UM3h5UwmbSJx95X6FkyXmgknK9g_68U3LLV9hlgeRt7bSzA..&type=2&query=%E4%BA%BA%E6%B0%91%E6%97%A5%E6%8A%A5";
// try {
//// String cookie = getRealUrlCookie(url, proxy);
//// System.out.println("cookie==================="+cookie);
//
// String realUrl = getRealLink(url, proxy);
// System.out.println("realUrl==================="+realUrl);
// } catch (Exception e) {
// e.printStackTrace();
// }
}
/**
* Appends the "k" and "h" query parameters to a link and resolves it to its final URL.
*
* <p>"k" is a random integer in [1, 100]; "h" is the single character of the link located
* at offset {@code indexOf("url=") + 25 + k}. Both parameters are appended only when the
* link contains "url=" and does not already contain "&k=", so a link is never extended
* twice and never receives a literal "h=null".
*
* @param originalUrl link to resolve; must not be null
* @param proxy proxy forwarded to the HTTP requests
* @return the value produced by {@code getFinalUrl} for the resulting link
* @throws Exception if the underlying requests fail
*/
public static String getRealLink(String originalUrl,Proxy proxy) throws Exception{
    int k = (int) (Math.floor(100 * Math.random()) + 1);
    int urlIndex = originalUrl.indexOf("url=");
    boolean alreadyExtended = originalUrl.contains("&k=");
    if (urlIndex != -1 && !alreadyExtended) {
        int hIndex = urlIndex + 25 + k;
        // A link too short to contain the offset contributes an empty "h"
        // instead of failing with StringIndexOutOfBoundsException.
        String h = hIndex < originalUrl.length()
                ? originalUrl.substring(hIndex, hIndex + 1)
                : "";
        originalUrl += "&k=" + k + "&h=" + h;
    }
    return getFinalUrl(originalUrl, proxy);
}
/**
* Requests a Sogou WeChat link page and reassembles the target URL from the
* {@code url += '...';} statements found in the response.
*
* <p>Two requests are sent through the shared {@code httpBoot}: first the search page built
* from the text that follows "query=" in the link, then the link itself, both with the
* search URL as "referer".
*
* @param originalUrl link to request; must contain a non-empty "query=" value
* @param proxy proxy used for both requests
* @return the concatenated fragments in order of appearance (empty when nothing matches),
*         or {@code null} when the response is blank or contains the captcha marker text
* @throws IllegalArgumentException if {@code originalUrl} has no "query=" value
* @throws Exception if a request fails
*/
private static String getFinalUrl(String originalUrl, Proxy proxy) throws Exception{
    String[] queryParts = originalUrl.split("query=");
    if (queryParts.length < 2) {
        // Fail with the offending link instead of a bare ArrayIndexOutOfBoundsException.
        throw new IllegalArgumentException("No query= value in url: " + originalUrl);
    }
    String word = queryParts[1];
    String searchUrl = "https://weixin.sogou.com/weixin?query="+ word;

    Map<String,String> headerMap = new HashMap<>();
    headerMap.put("Sec-Fetch-Mode", "navigate");
    headerMap.put("Sec-Fetch-User", "?1");
    headerMap.put("Sec-Fetch-Site", "same-origin");
    headerMap.put("Host", "weixin.sogou.com");
    headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36");
    headerMap.put("referer", searchUrl);

    // The search page body is read and discarded.
    // NOTE(review): presumably this request exists to populate the cookie jar before the link request - confirm.
    httpBoot.syncCall(RequestUtils.wrapGet(searchUrl, headerMap), proxy).body().string();

    String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(originalUrl, headerMap), proxy).body().string();
    if (StringUtils.isBlank(htmlBody) || htmlBody.contains("验证码")) {
        return null;
    }

    StringBuilder finalUrl = new StringBuilder();
    Matcher fragmentMatcher = Pattern.compile("url \\+= \'(.*?)\';").matcher(htmlBody);
    while (fragmentMatcher.find()) {
        finalUrl.append(fragmentMatcher.group(1));
    }
    return finalUrl.toString();
}
/**
* Example keyword search: calls WechatAritcleSearch.wechatKeywordSearch once for each
* entry of a hard-coded one-element keyword list, then prints the number of returned
* articles and the id of each article to standard output.
* Exceptions raised inside the loop are printed via printStackTrace and not rethrown.
*
* @throws UnknownHostException declared by the signature; exceptions raised inside the loop are caught there
*/
public static void wechatSearchExample() throws UnknownHostException
{
List<String> wordList = new ArrayList<String>();
wordList.add("京东");
for(String word : wordList)
{
try {
// NOTE(review): the meaning of the positional arguments (5, null, the two dates, null, 3)
// is not visible here - confirm against WechatAritcleSearch.wechatKeywordSearch.
List<WechatAricle> list = WechatAritcleSearch.wechatKeywordSearch(word, 5, null,"2019-10-28", "2019-10-28", null, 3);
System.out.println("======"+list.size());
for(WechatAricle wechat : list){
System.out.println(wechat.getId());
}
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
} catch (Exception e) {
e.printStackTrace();
}
// Everything below, up to the closing braces, is disabled code kept as found.
// NOTE(review): it looks like a commented-out copy of this same file - confirm whether it can be deleted.
// for(String wxId : wechatIds)
//
//// try{
//// WechatAritcleSearch.wechatKeywordSearch("京东", 5, null, "2019-10-01", "2019-10-01", proxy, 10);
////
//// }catch (Exception e){
//// e.printStackTrace();
//// }
//
//
//
// }
//
//
//
//
//
//
//
//
//
// /**
// * Get the real link
// * @param originalUrl
// * @return
// * @throws IOException
// */
// public static String getRealLink(String originalUrl,Proxy proxy) throws Exception{
// int b = (int) (Math.floor(100 * Math.random()) + 1);
// int a = originalUrl.indexOf("url=");
// int c = originalUrl.indexOf("&k=");
// String d = null;
// if (a != -1 && -1 == c) {
// d = originalUrl.substring(a + 25 + b, a + 26 + b);
// }
// originalUrl += "&k=" + b + "&h=" + d;
// String realUrl = getFinalUrl(originalUrl, proxy);
// return realUrl;
// }
//
//
// /**
// * Get the real link
// * @param originalUrl
// * @return
// * @throws Exception
// */
// private static String getFinalUrl(String originalUrl, Proxy proxy) throws Exception{
// Map<String,String> headerMap = new HashMap<>();
// headerMap.put("Sec-Fetch-Mode", "navigate");
// headerMap.put("Sec-Fetch-User", "?1");
// headerMap.put("Sec-Fetch-Site", "same-origin");
// headerMap.put("Host", "weixin.sogou.com");
// headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36");
//
// String word = originalUrl.split("query=")[1];
// String searchUrl = "https://weixin.sogou.com/weixin?query="+ word;
//
// headerMap.put("referer", searchUrl);
// httpBoot.syncCall(RequestUtils.wrapGet(searchUrl, headerMap), proxy).body().string();
//
// String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(originalUrl, headerMap), proxy).body().string();
// headerMap.put("referer", searchUrl);
//
// if(StringUtils.isNotBlank(htmlBody) && !htmlBody.contains("验证码")){
// StringBuilder furl = new StringBuilder();
// Pattern pa1 = Pattern.compile("url \\+= \'(.*?)\';");
// Matcher ma1 = pa1.matcher(htmlBody);
// while (ma1.find()) {
// furl.append(ma1.group(1));
// }
// return furl.toString();
// }
// return null;
// }
//
//
// public static void wechatSearchExample() throws UnknownHostException
// {
// List<String> wordList = new ArrayList<String>();
// wordList.add("京东");
// for(String word : wordList)
// {
// try {
// logger.info("需要采集的wxId:::{}", wxId);
//
// List<WechatAricle> list = WechatAritcleSearch.wechatKeywordSearch(word, 5, null,"2019-10-28", "2019-10-28", null, 3);
// System.out.println("======"+list.size());
// for(WechatAricle wechat : list){
// System.out.println(wechat.getId());
// }
// } catch (UnsupportedEncodingException e) {
// e.printStackTrace();
// } catch (Exception e) {
// e.printStackTrace();
// }
//// for(String wxId : wechatIds)
//// {
//// try {
//// logger.info("需要采集的wxId:::{}", wxId);
////
//// } catch (UnsupportedEncodingException e) {
//// e.printStackTrace();
//// } catch (Exception e) {
//// e.printStackTrace();
//// }
//// }
// }
// }
}
}
}
//
//
//}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment