Commit 95487743 by [zhangzhiwei]

因修改采集核心包版本,修改相应的方法

parent f09faf1a
......@@ -25,22 +25,17 @@
<version>3.8.1</version>
</dependency>
<dependency>
<groupId>com.zhiwei.middleware</groupId>
<artifactId>proxy-client</artifactId>
<version>0.0.2-RELEASE</version>
</dependency>
<dependency>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.0.5-SNAPSHOT</version>
<version>0.0.8-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.zhiwei.middleware</groupId>
<artifactId>cleaner-unified-urlfilter</artifactId>
<version>1.0.0.RELEASE</version>
<version>1.0.6.RELEASE</version>
</dependency>
<dependency>
......
......@@ -4,9 +4,9 @@ import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.config.ProxyConfig;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.proxy.common.Definition.GroupType;
public class MainRun {
private ScheduledExecutorService scheduExec;
......@@ -21,8 +21,7 @@ public class MainRun {
/**
 * Program entry point: initialises the proxy factory, then creates a
 * {@code MainRun} and calls {@code showTimer()} on it.
 *
 * NOTE(review): {@code ProxyFactory.init} is invoked twice below, once with a
 * {@code ProxyFactory::getNatProxy} argument and once without it. This looks
 * like the removed and added sides of a diff shown together - confirm which
 * call is meant to remain.
 *
 * @param args command-line arguments (not read here)
 */
public static void main(String[] args) {
/** Initialise the proxy IP source. **/
ProxyFactory.init(ProxyConfig.registry, ProxyConfig.group, GroupType.PROVIDER,
ProxyFactory::getNatProxy);
ProxyFactory.init(ProxyConfig.registry, ProxyConfig.group, GroupType.PROVIDER);
new MainRun().showTimer();
}
......
/**
* @Title: Crawler.java
* @Package com.zhiwei.crawler.soubao
* @author 0xff
* @date 2018年6月28日 上午9:49:32
*/
package com.zhiwei.crawler.soubao;
import java.net.Proxy;
import java.net.URLEncoder;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.zhiwei.crawler.download.HttpClientBuilder;
import com.zhiwei.crawler.download.HttpRequestBuilder;
import com.zhiwei.crawler.util.TreatData;
import okhttp3.FormBody;
import okhttp3.Headers;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;
/**
 * Crawler for the soubao.net newspaper search.
 *
 * <p>Runs a keyword search over a date window, walks the result pages through the
 * site's postback pager, resolves each hit's redirect page to the article's real
 * URL and hands the extracted record to {@code TreatData.treatDataAccount}.
 *
 * @author 0xff
 * @date 2018-06-28 09:49:32
 */
public class Crawler {

    private static final Logger logger = LogManager.getLogger(Crawler.class);

    /** Scheme and host that relative result links are resolved against. */
    private static final String SITE = "http://www.soubao.net";

    /** Search endpoint, used both for the initial GET and for every paging POST. */
    private static final String SEARCH_URL = SITE + "/search/searchList.aspx";

    /** Text that precedes the real article URL inside a redirect page. */
    private static final String REDIRECT_MARKER = "window.location='";

    /** Text that follows the real article URL inside a redirect page. */
    private static final String REDIRECT_END = "'</script>";

    /** URL schemes regarded as plausible for a resolved article address. */
    private static final String[] SUPPORTED_SCHEMES = {"http:", "https:", "ftp:"};

    /**
     * Searches soubao.net for {@code keyword} over the last {@code days} days and
     * stores every hit that can be resolved to a real URL.
     *
     * <p>Failures raised while searching are logged and not rethrown.
     *
     * @param days    size of the search window in days, counted back from today; must not be negative
     * @param keyword search keyword
     * @param proxy   proxy handed to the HTTP client builder
     * @throws IllegalArgumentException if {@code days} is negative
     * @throws Exception if the keyword cannot be URL-encoded
     */
    public static void start(int days, String keyword, Proxy proxy) throws Exception {
        if (days < 0) {
            throw new IllegalArgumentException("搜索天数不能小于 0");
        }
        SimpleDateFormat dayFormat = new SimpleDateFormat("yyyy-MM-dd");
        Calendar calendar = Calendar.getInstance();
        String endDate = dayFormat.format(calendar.getTime());
        calendar.add(Calendar.DAY_OF_YEAR, -days);
        String startDate = dayFormat.format(calendar.getTime());

        String url = SEARCH_URL + "?timesel=custom&checkNum="
                + "&startdate=" + startDate + "&enddate=" + endDate
                + "&keyword=" + URLEncoder.encode(keyword, "UTF-8");
        try {
            logger.info("关键词 {} 搜索链接 {}", keyword, url);
            search(url, keyword, startDate, endDate, proxy);
        } catch (InterruptedException e) {
            // Restore the flag so the calling thread can still observe the interruption.
            Thread.currentThread().interrupt();
            logger.error("关键词 {} 采集出错", keyword, e);
        } catch (Exception e) {
            logger.error("关键词 {} 采集出错", keyword, e);
        }
    }

    /**
     * Loads the search page, then posts the pager form once per result page and
     * parses every page that contains results.
     *
     * @param url       initial search URL (GET)
     * @param keyword   search keyword, re-sent in every paging form
     * @param startDate window start, formatted {@code yyyy-MM-dd}
     * @param endDate   window end, formatted {@code yyyy-MM-dd}
     * @param proxy     proxy handed to the HTTP client builder
     * @throws IllegalStateException if a required form field or the page count is missing
     * @throws Exception on I/O failure or interruption
     */
    private static void search(String url, String keyword, String startDate, String endDate, Proxy proxy)
            throws Exception {
        OkHttpClient client = HttpClientBuilder.newInstanceWithCookieJar(proxy);

        Map<String, String> headerMap = new HashMap<String, String>();
        headerMap.put("Referer", "http://www.soubao.net/search/searchList.aspx");
        headerMap.put("Cookie", DevKit.buildSoubaoCookie());
        headerMap.put("Host", "www.soubao.net");
        headerMap.put("Origin", "http://www.soubao.net");
        headerMap.put("Content-Type", "application/x-www-form-urlencoded");

        Request request = HttpRequestBuilder.newGetRequest(url, Headers.of(headerMap));
        String body = fetchBody(client, request);
        logger.info("关键词 {} 搜索成功", keyword);
        Document html = Jsoup.parse(body);

        int recordCount = 0;
        boolean needRepair = true;
        // Start by assuming a single page; the real total is read from the first result page.
        int page = 1;
        for (int i = 1; i <= page; i++) {
            // The hidden fields are taken from the most recently parsed page.
            FormBody formBody = new FormBody.Builder()
                    .add("__VIEWSTATE", hiddenValue(html, "__VIEWSTATE"))
                    .add("__VIEWSTATEGENERATOR", hiddenValue(html, "__VIEWSTATEGENERATOR"))
                    .add("__EVENTTARGET", "AspNetPager1")
                    .add("__EVENTARGUMENT", i + "")
                    .add("__EVENTVALIDATION", hiddenValue(html, "__EVENTVALIDATION"))
                    .add("HidTimeSelect", hiddenValue(html, "HidTimeSelect"))
                    .add("HiddenMsg", hiddenValue(html, "HiddenMsg"))
                    .add("txtKeyword", keyword)
                    .add("checkNum", "")
                    .add("timesel", "on")
                    .add("txtStartDate", startDate)
                    .add("txtEndDate", endDate)
                    .build();
            request = HttpRequestBuilder.newPostRequest(SEARCH_URL, request.headers(), formBody);
            body = fetchBody(client, request);
            if (!body.contains("rptRetList_ctl01_HLinkBT")) {
                page = 0;
                logger.info("关键词 {} 无数据,退出搜索", keyword);
                break;
            }
            html = Jsoup.parse(body);
            if (needRepair) {
                // Correct the loop bound with the total page count shown on the page.
                page = parsePageCount(html, keyword);
                logger.info("关键词 {} 搜索结果页数: {}", keyword, page);
                needRepair = false;
            }
            logger.info("关键词 {} 翻页页数: {} 访问成功, 页面长度:{}", keyword, i, body.length());
            recordCount += parse(client, request.headers(), html);
            TimeUnit.SECONDS.sleep(2);
        }
        logger.info("关键词 {} 爬取完毕,总页数: {},数据条数: {}", keyword, page, recordCount);
    }

    /**
     * Executes {@code request} and returns the response body as text, closing the
     * response even when reading the body fails.
     */
    private static String fetchBody(OkHttpClient client, Request request) throws Exception {
        try (Response response = client.newCall(request).execute()) {
            return response.body().string();
        }
    }

    /**
     * Returns the {@code value} attribute of the element with the given id.
     *
     * @throws IllegalStateException if the page has no element with that id
     */
    private static String hiddenValue(Document html, String id) {
        Element field = html.getElementById(id);
        if (field == null) {
            throw new IllegalStateException("搜索页面缺少表单字段 " + id);
        }
        return field.attr("value");
    }

    /**
     * Reads the total page count from the third {@code span} under {@code #LbKeyword}.
     *
     * @throws IllegalStateException if the count is missing or not a number
     */
    private static int parsePageCount(Document html, String keyword) {
        try {
            String pagerText = html.getElementById("LbKeyword").select("span").get(2).text();
            return Integer.parseInt(pagerText.replaceAll(".*?/|页", ""));
        } catch (Exception e) {
            throw new IllegalStateException("关键词 " + keyword + " 获取搜索结果页数失败", e);
        }
    }

    /**
     * Extracts every result entry from one result page and stores those whose real
     * URL can be resolved. A failure on one entry is logged and does not stop the
     * remaining entries.
     *
     * @param client  HTTP client used to resolve the redirect pages
     * @param headers headers of the request that produced {@code html}
     * @param html    parsed result page
     * @return number of records handed to {@code TreatData.treatDataAccount}
     */
    private static int parse(OkHttpClient client, Headers headers, Document html) {
        int stored = 0;
        try {
            Elements elements = html.select("ul.newList").select("li");
            logger.info("数据大小:::{}", elements.size());
            for (Element element : elements) {
                try {
                    String link = SITE + element.select("h2").select("a").attr("href");
                    String realUrl = matchRealUrl(client, headers, link);
                    if (realUrl == null) {
                        logger.info("链接为:{},真实地址解析出现错误", link);
                        continue;
                    }
                    Map<String, Object> dataMap = new HashMap<String, Object>();
                    dataMap.put("title", element.select("h2").select("a").text());
                    dataMap.put("content", element.select("p.newCon").text());
                    dataMap.put("source",
                            element.select("p.newsInfo").select("em.paperName").select("span").text());
                    dataMap.put("time",
                            element.select("p.newsInfo").select("em.postDate").select("span").text());
                    dataMap.put("_id", realUrl);
                    TreatData.treatDataAccount(dataMap);
                    stored++;
                } catch (Exception e) {
                    // Log the exception itself; fillInStackTrace() would replace its original trace.
                    logger.debug("解析数据结构出现问题::", e);
                }
            }
        } catch (Exception e) {
            logger.info("页面正文提取出错", e);
        }
        return stored;
    }

    /**
     * Fetches a soubao.net redirect page and extracts the article's real URL from
     * its {@code window.location='...'} script.
     *
     * @param client  HTTP client used for the request
     * @param headers not used; the request is sent without extra headers
     * @param url     address of the redirect page
     * @return the real URL, or {@code null} when the page holds no redirect target
     *         or the request fails
     */
    public static String matchRealUrl(OkHttpClient client, Headers headers, String url) {
        try {
            TimeUnit.MILLISECONDS.sleep(500);
            Request request = HttpRequestBuilder.newGetRequest(url, null);
            return extractRedirectTarget(fetchBody(client, request));
        } catch (InterruptedException e) {
            // Restore the flag so the caller can still observe the interruption.
            Thread.currentThread().interrupt();
            logger.warn("链接 {} 解析真实地址时线程被中断", url, e);
            return null;
        } catch (Exception e) {
            logger.error("链接 {} 解析真实地址失败", url, e);
            return null;
        }
    }

    /**
     * Returns the text between {@code window.location='} and the following
     * {@code '</script>} with every literal {@code /./} collapsed to {@code /},
     * or {@code null} when the marker is absent or the target is empty.
     */
    private static String extractRedirectTarget(String html) {
        if (html == null) {
            return null;
        }
        int from = html.indexOf(REDIRECT_MARKER);
        if (from < 0) {
            return null;
        }
        from += REDIRECT_MARKER.length();
        int to = html.indexOf(REDIRECT_END, from);
        String target = to < 0 ? html.substring(from) : html.substring(from, to);
        // Literal replace: as a regex, "/./" would also delete any one-character path segment.
        target = target.replace("/./", "/");
        if (target.isEmpty()) {
            return null;
        }
        // The previous regex check never affected the returned value and could backtrack
        // exponentially on non-matching input, so only a cheap scheme check is kept and
        // it is used for diagnostics only.
        if (!hasSupportedScheme(target)) {
            logger.warn("真实地址 {} 不是 http/https/ftp 链接", target);
        }
        return target;
    }

    /** Tells whether {@code target} starts with http:, https: or ftp:, ignoring case. */
    private static boolean hasSupportedScheme(String target) {
        for (String scheme : SUPPORTED_SCHEMES) {
            if (target.regionMatches(true, 0, scheme, 0, scheme.length())) {
                return true;
            }
        }
        return false;
    }

    /**
     * Manual smoke run: crawls one day of results for a fixed keyword without a proxy.
     *
     * @param args not used
     */
    public static void main(String[] args) {
        try {
            start(1, "京东", null);
        } catch (Exception e) {
            logger.error("关键词 {} 采集出错", "京东", e);
        }
    }
}
/**
* @Title: Crawler.java
* @Package com.zhiwei.crawler.soubao
* @author 0xff
* @date 2018年6月28日 上午9:49:32
*/
package com.zhiwei.crawler.soubao;
import java.net.Proxy;
import java.net.URLEncoder;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.zhiwei.crawler.download.HttpClientBuilder;
import com.zhiwei.crawler.download.HttpRequestBuilder;
import com.zhiwei.crawler.util.TreatData;
import okhttp3.FormBody;
import okhttp3.Headers;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;
/**
 * Crawler for the soubao.net newspaper search.
 *
 * <p>Runs a keyword search over a date window, walks the result pages through the
 * site's postback pager, resolves each hit's redirect page to the article's real
 * URL and hands the extracted record to {@code TreatData.treatDataAccount}.
 *
 * @author 0xff
 * @date 2018-06-28 09:49:32
 */
public class Crawler {

    private static final Logger logger = LogManager.getLogger(Crawler.class);

    /** Scheme and host that relative result links are resolved against. */
    private static final String SITE = "http://www.soubao.net";

    /** Search endpoint, used both for the initial GET and for every paging POST. */
    private static final String SEARCH_URL = SITE + "/search/searchList.aspx";

    /** Text that precedes the real article URL inside a redirect page. */
    private static final String REDIRECT_MARKER = "window.location='";

    /** Text that follows the real article URL inside a redirect page. */
    private static final String REDIRECT_END = "'</script>";

    /** URL schemes regarded as plausible for a resolved article address. */
    private static final String[] SUPPORTED_SCHEMES = {"http:", "https:", "ftp:"};

    /**
     * Searches soubao.net for {@code keyword} over the last {@code days} days and
     * stores every hit that can be resolved to a real URL.
     *
     * <p>Failures raised while searching are logged and not rethrown.
     *
     * @param days    size of the search window in days, counted back from today; must not be negative
     * @param keyword search keyword
     * @param proxy   proxy handed to the HTTP client builder
     * @throws IllegalArgumentException if {@code days} is negative
     * @throws Exception if the keyword cannot be URL-encoded
     */
    public static void start(int days, String keyword, Proxy proxy) throws Exception {
        if (days < 0) {
            throw new IllegalArgumentException("搜索天数不能小于 0");
        }
        SimpleDateFormat dayFormat = new SimpleDateFormat("yyyy-MM-dd");
        Calendar calendar = Calendar.getInstance();
        String endDate = dayFormat.format(calendar.getTime());
        calendar.add(Calendar.DAY_OF_YEAR, -days);
        String startDate = dayFormat.format(calendar.getTime());

        String url = SEARCH_URL + "?timesel=custom&checkNum="
                + "&startdate=" + startDate + "&enddate=" + endDate
                + "&keyword=" + URLEncoder.encode(keyword, "UTF-8");
        try {
            logger.info("关键词 {} 搜索链接 {}", keyword, url);
            search(url, keyword, startDate, endDate, proxy);
        } catch (InterruptedException e) {
            // Restore the flag so the calling thread can still observe the interruption.
            Thread.currentThread().interrupt();
            logger.error("关键词 {} 采集出错", keyword, e);
        } catch (Exception e) {
            logger.error("关键词 {} 采集出错", keyword, e);
        }
    }

    /**
     * Loads the search page, then posts the pager form once per result page and
     * parses every page that contains results.
     *
     * @param url       initial search URL (GET)
     * @param keyword   search keyword, re-sent in every paging form
     * @param startDate window start, formatted {@code yyyy-MM-dd}
     * @param endDate   window end, formatted {@code yyyy-MM-dd}
     * @param proxy     proxy handed to the HTTP client builder
     * @throws IllegalStateException if a required form field or the page count is missing
     * @throws Exception on I/O failure or interruption
     */
    private static void search(String url, String keyword, String startDate, String endDate, Proxy proxy)
            throws Exception {
        OkHttpClient client = HttpClientBuilder.newInstanceWithCookieJar(proxy);

        Map<String, String> headerMap = new HashMap<String, String>();
        headerMap.put("Referer", "http://www.soubao.net/search/searchList.aspx");
        headerMap.put("Cookie", DevKit.buildSoubaoCookie());
        headerMap.put("Host", "www.soubao.net");
        headerMap.put("Origin", "http://www.soubao.net");
        headerMap.put("Content-Type", "application/x-www-form-urlencoded");

        Request request = HttpRequestBuilder.newGetRequest(url, Headers.of(headerMap));
        String body = fetchBody(client, request);
        logger.info("关键词 {} 搜索成功", keyword);
        Document html = Jsoup.parse(body);

        int recordCount = 0;
        boolean needRepair = true;
        // Start by assuming a single page; the real total is read from the first result page.
        int page = 1;
        for (int i = 1; i <= page; i++) {
            // The hidden fields are taken from the most recently parsed page.
            FormBody formBody = new FormBody.Builder()
                    .add("__VIEWSTATE", hiddenValue(html, "__VIEWSTATE"))
                    .add("__VIEWSTATEGENERATOR", hiddenValue(html, "__VIEWSTATEGENERATOR"))
                    .add("__EVENTTARGET", "AspNetPager1")
                    .add("__EVENTARGUMENT", i + "")
                    .add("__EVENTVALIDATION", hiddenValue(html, "__EVENTVALIDATION"))
                    .add("HidTimeSelect", hiddenValue(html, "HidTimeSelect"))
                    .add("HiddenMsg", hiddenValue(html, "HiddenMsg"))
                    .add("txtKeyword", keyword)
                    .add("checkNum", "")
                    .add("timesel", "on")
                    .add("txtStartDate", startDate)
                    .add("txtEndDate", endDate)
                    .build();
            request = HttpRequestBuilder.newPostRequest(SEARCH_URL, request.headers(), formBody);
            body = fetchBody(client, request);
            if (!body.contains("rptRetList_ctl01_HLinkBT")) {
                page = 0;
                logger.info("关键词 {} 无数据,退出搜索", keyword);
                break;
            }
            html = Jsoup.parse(body);
            if (needRepair) {
                // Correct the loop bound with the total page count shown on the page.
                page = parsePageCount(html, keyword);
                logger.info("关键词 {} 搜索结果页数: {}", keyword, page);
                needRepair = false;
            }
            logger.info("关键词 {} 翻页页数: {} 访问成功, 页面长度:{}", keyword, i, body.length());
            recordCount += parse(client, request.headers(), html);
            TimeUnit.SECONDS.sleep(2);
        }
        logger.info("关键词 {} 爬取完毕,总页数: {},数据条数: {}", keyword, page, recordCount);
    }

    /**
     * Executes {@code request} and returns the response body as text, closing the
     * response even when reading the body fails.
     */
    private static String fetchBody(OkHttpClient client, Request request) throws Exception {
        try (Response response = client.newCall(request).execute()) {
            return response.body().string();
        }
    }

    /**
     * Returns the {@code value} attribute of the element with the given id.
     *
     * @throws IllegalStateException if the page has no element with that id
     */
    private static String hiddenValue(Document html, String id) {
        Element field = html.getElementById(id);
        if (field == null) {
            throw new IllegalStateException("搜索页面缺少表单字段 " + id);
        }
        return field.attr("value");
    }

    /**
     * Reads the total page count from the third {@code span} under {@code #LbKeyword}.
     *
     * @throws IllegalStateException if the count is missing or not a number
     */
    private static int parsePageCount(Document html, String keyword) {
        try {
            String pagerText = html.getElementById("LbKeyword").select("span").get(2).text();
            return Integer.parseInt(pagerText.replaceAll(".*?/|页", ""));
        } catch (Exception e) {
            throw new IllegalStateException("关键词 " + keyword + " 获取搜索结果页数失败", e);
        }
    }

    /**
     * Extracts every result entry from one result page and stores those whose real
     * URL can be resolved. A failure on one entry is logged and does not stop the
     * remaining entries.
     *
     * @param client  HTTP client used to resolve the redirect pages
     * @param headers headers of the request that produced {@code html}
     * @param html    parsed result page
     * @return number of records handed to {@code TreatData.treatDataAccount}
     */
    private static int parse(OkHttpClient client, Headers headers, Document html) {
        int stored = 0;
        try {
            Elements elements = html.select("ul.newList").select("li");
            logger.info("数据大小:::{}", elements.size());
            for (Element element : elements) {
                try {
                    String link = SITE + element.select("h2").select("a").attr("href");
                    String realUrl = matchRealUrl(client, headers, link);
                    if (realUrl == null) {
                        logger.info("链接为:{},真实地址解析出现错误", link);
                        continue;
                    }
                    Map<String, Object> dataMap = new HashMap<String, Object>();
                    dataMap.put("title", element.select("h2").select("a").text());
                    dataMap.put("content", element.select("p.newCon").text());
                    dataMap.put("source",
                            element.select("p.newsInfo").select("em.paperName").select("span").text());
                    dataMap.put("time",
                            element.select("p.newsInfo").select("em.postDate").select("span").text());
                    dataMap.put("_id", realUrl);
                    TreatData.treatDataAccount(dataMap);
                    stored++;
                } catch (Exception e) {
                    // Log the exception itself; fillInStackTrace() would replace its original trace.
                    logger.debug("解析数据结构出现问题::", e);
                }
            }
        } catch (Exception e) {
            logger.info("页面正文提取出错", e);
        }
        return stored;
    }

    /**
     * Fetches a soubao.net redirect page and extracts the article's real URL from
     * its {@code window.location='...'} script.
     *
     * @param client  HTTP client used for the request
     * @param headers not used; the request is sent without extra headers
     * @param url     address of the redirect page
     * @return the real URL, or {@code null} when the page holds no redirect target
     *         or the request fails
     */
    public static String matchRealUrl(OkHttpClient client, Headers headers, String url) {
        try {
            TimeUnit.MILLISECONDS.sleep(500);
            Request request = HttpRequestBuilder.newGetRequest(url, null);
            return extractRedirectTarget(fetchBody(client, request));
        } catch (InterruptedException e) {
            // Restore the flag so the caller can still observe the interruption.
            Thread.currentThread().interrupt();
            logger.warn("链接 {} 解析真实地址时线程被中断", url, e);
            return null;
        } catch (Exception e) {
            logger.error("链接 {} 解析真实地址失败", url, e);
            return null;
        }
    }

    /**
     * Returns the text between {@code window.location='} and the following
     * {@code '</script>} with every literal {@code /./} collapsed to {@code /},
     * or {@code null} when the marker is absent or the target is empty.
     */
    private static String extractRedirectTarget(String html) {
        if (html == null) {
            return null;
        }
        int from = html.indexOf(REDIRECT_MARKER);
        if (from < 0) {
            return null;
        }
        from += REDIRECT_MARKER.length();
        int to = html.indexOf(REDIRECT_END, from);
        String target = to < 0 ? html.substring(from) : html.substring(from, to);
        // Literal replace: as a regex, "/./" would also delete any one-character path segment.
        target = target.replace("/./", "/");
        if (target.isEmpty()) {
            return null;
        }
        // The previous regex check never affected the returned value and could backtrack
        // exponentially on non-matching input, so only a cheap scheme check is kept and
        // it is used for diagnostics only.
        if (!hasSupportedScheme(target)) {
            logger.warn("真实地址 {} 不是 http/https/ftp 链接", target);
        }
        return target;
    }

    /** Tells whether {@code target} starts with http:, https: or ftp:, ignoring case. */
    private static boolean hasSupportedScheme(String target) {
        for (String scheme : SUPPORTED_SCHEMES) {
            if (target.regionMatches(true, 0, scheme, 0, scheme.length())) {
                return true;
            }
        }
        return false;
    }

    /**
     * Manual smoke run: crawls one day of results for a fixed keyword without a proxy.
     *
     * @param args not used
     */
    public static void main(String[] args) {
        try {
            start(1, "京东", null);
        } catch (Exception e) {
            logger.error("关键词 {} 采集出错", "京东", e);
        }
    }
}
package com.zhiwei.crawler.soubao;
import java.net.Proxy;
import java.util.concurrent.BlockingQueue;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.run.SoubaoCrawlerRun;
import com.zhiwei.tools.tools.ZhiWeiTools;
......@@ -19,7 +18,6 @@ import com.zhiwei.tools.tools.ZhiWeiTools;
public class SouBaoCrawlerThread extends Thread{
private static final Logger logger = LogManager.getLogger(SoubaoCrawlerRun.class);
private BlockingQueue<String> wordsQueue;
/**
 * Creates a crawler thread bound to the given keyword queue.
 *
 * @param wordsQueue queue of keywords; the reference is stored as-is, not copied
 */
public SouBaoCrawlerThread(BlockingQueue<String> wordsQueue) {
this.wordsQueue = wordsQueue;
}
......@@ -30,11 +28,10 @@ public class SouBaoCrawlerThread extends Thread{
while(wordsQueue!=null && wordsQueue.size()>0){
try {
String word = wordsQueue.take();
Proxy proxy = ProxyFactory.proxyCallback().getProxy();
/***开始采集**/
logger.info("开始采集:{}搜报网关键词,目前未采集的关键词为:{}", word, wordsQueue.size());
long s = System.currentTimeMillis();
Crawler.start(1, word, proxy);
Crawler.start(1, word, ProxyHolder.NAT_PROXY.getProxy());
long e = System.currentTimeMillis();
logger.info("采集:::{}搜报网关键词结束,采集所用时间为:{}", word, (e-s));
} catch (Exception e) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment