Commit 9fcfba2d by zhiwei

各个采集验证添加休眠,避免数据过多导致程序阻塞

parent aa059934
package com.zhiwei.source_forward.crawler; package com.zhiwei.source_forward.crawler;
import java.util.Objects; import java.util.Objects;
import org.apache.logging.log4j.LogManager; import com.zhiwei.tools.tools.ZhiWeiTools;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.zhiwei.async.GroupSync;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.async.GroupSync;
import com.zhiwei.crawler.core.proxy.ProxyHolder; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.utils.RequestUtils; import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.source_forward.bean.ContentBean; import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.source_forward.bean.ContentBean.Attribution; import com.zhiwei.source_forward.bean.ContentBean;
import com.zhiwei.source_forward.util.ContentDataCallback; import com.zhiwei.source_forward.bean.ContentBean.Attribution;
import com.zhiwei.source_forward.util.MatchContent; import com.zhiwei.source_forward.util.ContentDataCallback;
import com.zhiwei.source_forward.util.MatchContent;
import okhttp3.Request;
import okhttp3.Request;
public class ContentCrawler {
public class ContentCrawler {
private static Logger logger = LogManager.getLogger(ContentCrawler.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build(); private static Logger logger = LogManager.getLogger(ContentCrawler.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
/**
* /**
* @Description 链接传入 并 返回采集完信号 *
* @param callback * @Description 链接传入 并 返回采集完信号
* @param urls * @param callback
* @return * @param urls
* @throws Exception * @return
*/ * @throws Exception
public GroupSync submitTask(ContentDataCallback callback, */
String... urls) { public GroupSync submitTask(ContentDataCallback callback,
GroupSync counter = new GroupSync(); String... urls) {
start(counter, callback, urls); GroupSync counter = new GroupSync();
return counter; start(counter, callback, urls);
} return counter;
}
/**
* /**
* @Description 提交链接 *
* @param counter * @Description 提交链接
* @param callback * @param counter
* @param urls * @param callback
*/ * @param urls
private void start(GroupSync counter, */
ContentDataCallback callback, String... urls) { private void start(GroupSync counter,
if (urls != null && urls.length > 0) { ContentDataCallback callback, String... urls) {
for (String url : urls) { if (urls != null && urls.length > 0) {
if (url != null) { for (String url : urls) {
try { ZhiWeiTools.sleep(100);
search(counter, url, Attribution.of(url), callback); if (url != null) {
} catch (Exception e) { try {
logger.error("搜索创建出错", e); search(counter, url, Attribution.of(url), callback);
} } catch (Exception e) {
} logger.error("搜索创建出错", e);
} }
} }
} }
}
/** }
*
* @Description 链接获取文章信息 /**
* @param counter *
* @param url * @Description 链接获取文章信息
* @param attr * @param counter
* @param callback * @param url
* @return * @param attr
*/ * @param callback
private GroupSync search(GroupSync counter, * @return
String url, Attribution attr, ContentDataCallback callback) { */
logger.info("当前处理 URL: {}", url); private GroupSync search(GroupSync counter,
Request request = RequestUtils.wrapGet(url); String url, Attribution attr, ContentDataCallback callback) {
counter.add(); logger.info("当前处理 URL: {}", url);
Request request = RequestUtils.wrapGet(url);
httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY, true).whenComplete((rs,ex) -> { counter.add();
try {
if (Objects.isNull(ex)) { httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY, true).whenComplete((rs,ex) -> {
parseHtml(rs.body().string(), attr, callback); try {
} else { if (Objects.isNull(ex)) {
logger.info("{} 搜索结果访问失败: {}", request.url().url(),ex); parseHtml(rs.body().string(), attr, callback);
} } else {
} catch (Exception e) { logger.info("{} 搜索结果访问失败: {}", request.url().url(),ex);
logger.info("搜索结果访问失败: {}", ex); }
} finally { } catch (Exception e) {
counter.done(); logger.info("搜索结果访问失败: {}", ex);
} } finally {
counter.done();
}); }
return counter; });
}
return counter;
/** }
*
* /**
* @Description 获取正文解析 *
* @param response *
* @param attr * @Description 获取正文解析
* @param callback * @param response
*/ * @param attr
private void parseHtml(String result, Attribution attr, * @param callback
ContentDataCallback callback) { */
try { private void parseHtml(String result, Attribution attr,
String content = MatchContent.matchContent(attr.get().toString(), ContentDataCallback callback) {
result); try {
ContentBean cb = new ContentBean(attr.get().toString(), content); String content = MatchContent.matchContent(attr.get().toString(),
if (callback == null) { result);
logger.warn("DataCallback 对象为 null,无法保存数据"); ContentBean cb = new ContentBean(attr.get().toString(), content);
} else { if (callback == null) {
callback.onData(cb, attr); logger.warn("DataCallback 对象为 null,无法保存数据");
} } else {
} catch (Exception e) { callback.onData(cb, attr);
logger.error("网页链接失效", e); }
} } catch (Exception e) {
logger.error("网页链接失效", e);
} }
} }
}
...@@ -5,6 +5,7 @@ import java.util.List; ...@@ -5,6 +5,7 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Objects; import java.util.Objects;
import com.zhiwei.tools.tools.ZhiWeiTools;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
...@@ -64,6 +65,7 @@ public class MediaSelfSourceCrawler { ...@@ -64,6 +65,7 @@ public class MediaSelfSourceCrawler {
private void start(GroupSync counter,MediaSelfSourceDataCallBack callback, String... urls) { private void start(GroupSync counter,MediaSelfSourceDataCallBack callback, String... urls) {
if (urls != null && urls.length > 0) { if (urls != null && urls.length > 0) {
for (String url : urls) { for (String url : urls) {
ZhiWeiTools.sleep(100);
counter.add(); counter.add();
if (url != null) { if (url != null) {
try { try {
......
...@@ -325,7 +325,6 @@ public class MatchSource { ...@@ -325,7 +325,6 @@ public class MatchSource {
source = source.replaceAll(".*来源:|)", ""); source = source.replaceAll(".*来源:|)", "");
} }
} }
if(Objects.nonNull(source) && source.length() != 0) { if(Objects.nonNull(source) && source.length() != 0) {
return source; return source;
} }
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment