Commit b8ed38f4 by chenweiyang

链接是否删除部分修改

parents bd0353ac 7003572f
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>source-forward</artifactId>
<version>0.2.7-SNAPSHOT</version>
<version>0.2.8-SNAPSHOT</version>
<name>source-forward</name>
<description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description>
......
package com.zhiwei.source_forward.crawler;
import java.util.Objects;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.zhiwei.async.GroupSync;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.source_forward.bean.ContentBean;
import com.zhiwei.source_forward.bean.ContentBean.Attribution;
import com.zhiwei.source_forward.util.ContentDataCallback;
import com.zhiwei.source_forward.util.MatchContent;
import okhttp3.Request;
/**
 * Fetches article pages asynchronously and passes the extracted body text to a
 * {@link ContentDataCallback}.
 *
 * <p>Every submitted URL is registered on a {@link GroupSync} via {@code add()} and
 * released via {@code done()} once its request has been handled (presumably a
 * pending-task counter the caller waits on — confirm against GroupSync).
 */
public class ContentCrawler {

    private static final Logger logger = LogManager.getLogger(ContentCrawler.class);

    /** Shared HTTP client, built with {@code retryTimes(2)}. */
    private static final HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();

    /**
     * Submits the given URLs for crawling and returns the completion signal.
     *
     * @param callback receiver of each parsed {@link ContentBean}; when {@code null}
     *                 the parsed data is dropped and a warning is logged
     * @param urls     URLs to fetch; {@code null} entries are skipped
     * @return the {@link GroupSync} on which every started request is registered
     */
    public GroupSync submitTask(ContentDataCallback callback,
            String... urls) {
        GroupSync counter = new GroupSync();
        start(counter, callback, urls);
        return counter;
    }

    /**
     * Starts one request per non-null URL. A failure while creating a single request
     * is logged and does not prevent the remaining URLs from being submitted.
     *
     * @param counter  completion signal shared by all requests
     * @param callback receiver of the parsed content
     * @param urls     URLs to fetch; may be {@code null} or empty
     */
    private void start(GroupSync counter,
            ContentDataCallback callback, String... urls) {
        if (urls == null || urls.length == 0) {
            return;
        }
        for (String url : urls) {
            if (url == null) {
                continue;
            }
            try {
                search(counter, url, Attribution.of(url), callback);
            } catch (Exception e) {
                logger.error("搜索创建出错", e);
            }
        }
    }

    /**
     * Issues an asynchronous GET for {@code url} and parses the response body when it
     * arrives.
     *
     * @param counter  completion signal; incremented before the request is issued and
     *                 released exactly once when the request finishes or cannot start
     * @param url      page to fetch
     * @param attr     attribution carried through to the callback
     * @param callback receiver of the parsed content
     * @return the same {@code counter} instance
     */
    private GroupSync search(GroupSync counter,
            String url, Attribution attr, ContentDataCallback callback) {
        logger.info("当前处理 URL: {}", url);
        Request request = RequestUtils.wrapGet(url);
        counter.add();
        try {
            httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY, true).whenComplete((rs, ex) -> {
                try {
                    if (Objects.isNull(ex)) {
                        parseHtml(rs.body().string(), attr, callback);
                    } else {
                        logger.info("{} 搜索结果访问失败: {}", request.url().url(), ex);
                    }
                } catch (Exception e) {
                    // Log the exception actually caught here; the outer `ex` is null on
                    // this path, so logging it would hide the real cause.
                    logger.error("{} 搜索结果访问失败", request.url().url(), e);
                } finally {
                    counter.done();
                }
            });
        } catch (RuntimeException e) {
            // The completion handler was never registered, so release the slot taken
            // by counter.add() here; otherwise the signal could stay pending forever.
            counter.done();
            throw e;
        }
        return counter;
    }

    /**
     * Extracts the article body from the fetched HTML and forwards it to the callback.
     * Any exception raised during extraction or inside the callback is logged and
     * swallowed.
     *
     * @param result   raw response body
     * @param attr     attribution whose {@code get()} value is used as the article URL
     * @param callback receiver of the parsed content; may be {@code null}
     */
    private void parseHtml(String result, Attribution attr,
            ContentDataCallback callback) {
        try {
            String content = MatchContent.matchContent(attr.get().toString(),
                    result);
            ContentBean cb = new ContentBean(attr.get().toString(), content);
            if (callback == null) {
                logger.warn("DataCallback 对象为 null,无法保存数据");
            } else {
                callback.onData(cb, attr);
            }
        } catch (Exception e) {
            logger.error("网页链接失效", e);
        }
    }
}
package com.zhiwei.source_forward.crawler;
import java.util.Objects;
import com.zhiwei.tools.tools.ZhiWeiTools;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.zhiwei.async.GroupSync;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.source_forward.bean.ContentBean;
import com.zhiwei.source_forward.bean.ContentBean.Attribution;
import com.zhiwei.source_forward.util.ContentDataCallback;
import com.zhiwei.source_forward.util.MatchContent;
import okhttp3.Request;
/**
 * Fetches article pages asynchronously and passes the extracted body text to a
 * {@link ContentDataCallback}.
 *
 * <p>Every submitted URL is registered on a {@link GroupSync} via {@code add()} and
 * released via {@code done()} once its request has been handled (presumably a
 * pending-task counter the caller waits on — confirm against GroupSync).
 */
public class ContentCrawler {

    private static final Logger logger = LogManager.getLogger(ContentCrawler.class);

    /** Shared HTTP client, built with {@code retryTimes(2)}. */
    private static final HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();

    /**
     * Submits the given URLs for crawling and returns the completion signal.
     *
     * @param callback receiver of each parsed {@link ContentBean}; when {@code null}
     *                 the parsed data is dropped and a warning is logged
     * @param urls     URLs to fetch; {@code null} entries are skipped
     * @return the {@link GroupSync} on which every started request is registered
     */
    public GroupSync submitTask(ContentDataCallback callback,
            String... urls) {
        GroupSync counter = new GroupSync();
        start(counter, callback, urls);
        return counter;
    }

    /**
     * Starts one request per non-null URL, calling {@code ZhiWeiTools.sleep(100)}
     * before each entry (presumably a 100 ms throttle — confirm the unit). A failure
     * while creating a single request is logged and does not prevent the remaining
     * URLs from being submitted.
     *
     * @param counter  completion signal shared by all requests
     * @param callback receiver of the parsed content
     * @param urls     URLs to fetch; may be {@code null} or empty
     */
    private void start(GroupSync counter,
            ContentDataCallback callback, String... urls) {
        if (urls == null || urls.length == 0) {
            return;
        }
        for (String url : urls) {
            ZhiWeiTools.sleep(100);
            if (url == null) {
                continue;
            }
            try {
                search(counter, url, Attribution.of(url), callback);
            } catch (Exception e) {
                logger.error("搜索创建出错", e);
            }
        }
    }

    /**
     * Issues an asynchronous GET for {@code url} and parses the response body when it
     * arrives.
     *
     * @param counter  completion signal; incremented before the request is issued and
     *                 released exactly once when the request finishes or cannot start
     * @param url      page to fetch
     * @param attr     attribution carried through to the callback
     * @param callback receiver of the parsed content
     * @return the same {@code counter} instance
     */
    private GroupSync search(GroupSync counter,
            String url, Attribution attr, ContentDataCallback callback) {
        logger.info("当前处理 URL: {}", url);
        Request request = RequestUtils.wrapGet(url);
        counter.add();
        try {
            httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY, true).whenComplete((rs, ex) -> {
                try {
                    if (Objects.isNull(ex)) {
                        parseHtml(rs.body().string(), attr, callback);
                    } else {
                        logger.info("{} 搜索结果访问失败: {}", request.url().url(), ex);
                    }
                } catch (Exception e) {
                    // Log the exception actually caught here; the outer `ex` is null on
                    // this path, so logging it would hide the real cause.
                    logger.error("{} 搜索结果访问失败", request.url().url(), e);
                } finally {
                    counter.done();
                }
            });
        } catch (RuntimeException e) {
            // The completion handler was never registered, so release the slot taken
            // by counter.add() here; otherwise the signal could stay pending forever.
            counter.done();
            throw e;
        }
        return counter;
    }

    /**
     * Extracts the article body from the fetched HTML and forwards it to the callback.
     * Any exception raised during extraction or inside the callback is logged and
     * swallowed.
     *
     * @param result   raw response body
     * @param attr     attribution whose {@code get()} value is used as the article URL
     * @param callback receiver of the parsed content; may be {@code null}
     */
    private void parseHtml(String result, Attribution attr,
            ContentDataCallback callback) {
        try {
            String content = MatchContent.matchContent(attr.get().toString(),
                    result);
            ContentBean cb = new ContentBean(attr.get().toString(), content);
            if (callback == null) {
                logger.warn("DataCallback 对象为 null,无法保存数据");
            } else {
                callback.onData(cb, attr);
            }
        } catch (Exception e) {
            logger.error("网页链接失效", e);
        }
    }
}
......@@ -5,6 +5,7 @@ import java.util.List;
import java.util.Map;
import java.util.Objects;
import com.zhiwei.tools.tools.ZhiWeiTools;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
......@@ -64,6 +65,7 @@ public class MediaSelfSourceCrawler {
private void start(GroupSync counter,MediaSelfSourceDataCallBack callback, String... urls) {
if (urls != null && urls.length > 0) {
for (String url : urls) {
ZhiWeiTools.sleep(100);
counter.add();
if (url != null) {
try {
......@@ -89,12 +91,9 @@ public class MediaSelfSourceCrawler {
private GroupSync search(GroupSync counter, String url,Attribution attr, MediaSelfSourceDataCallBack callback) {
logger.info("当前处理 URL: {}", attr.get());
Map<String,Object> map = new HashMap<>();
ProxyHolder ph = null;
ProxyHolder ph = ProxyHolder.NAT_HEAVY_PROXY;
if(url.contains("toutiao.com")) {
map.put("referer", url);
ph = ProxyHolder.SOUGOU_OUTER_PROXY;
}else {
ph = ProxyHolder.NAT_HEAVY_PROXY;
}
url = dealUrl(url);
if(Objects.nonNull(url)) {
......@@ -168,7 +167,6 @@ public class MediaSelfSourceCrawler {
String url = attr.get().toString();
try {
source = MatchSource.matchMediaSelfSource(url + eUrl,result);
logger.info(url+"=======" + source);
channel = MatchChannel.verifyChannel(url);
if(channel==null){
List<Node> nodeList = Jsoup.parse(result).head().childNodes();
......
......@@ -32,7 +32,7 @@ public class MediaSelfSource {
ProxyInit.initProxy();
List<String> urlList = new ArrayList<>();
urlList.add("https://new.qq.com/omn/20200507/20200507A0Q9JV00.html");
urlList.add("https://k.sina.com.cn/article_1060093724_3f2fbf1c00100vsqd.html?from=mood");
List<MediaSelfSourceBean> u = MediaSelfSource.getMediaSelfSource(urlList);
for(MediaSelfSourceBean b : u) {
......
......@@ -80,10 +80,10 @@ public class SourceForward {
/**
 * Manual smoke-test entry point: initializes the proxy configuration, runs
 * {@code SourceForward.getSourceForward} on a hard-coded URL list and prints
 * each returned bean to standard output.
 *
 * @param args unused
 */
public static void main(String[] args) {
ProxyInit.initProxy();
List<String> urlList = new ArrayList<>();
// Hard-coded sample URLs used only for this manual run.
urlList.add("http://www.wangjiaozixun.com/html/zx20/2020/0730/1396388.html");
urlList.add("http://gu.qq.com/resources/shy/news/detail-v2/index.html?#/index?id=SN202006091653447945411f");
List<SourceForwardBean> da = SourceForward.getSourceForward(urlList);
for(SourceForwardBean sfb : da) {
System.out.println(sfb.toString());
System.out.println("=============="+sfb.toString());
}
}
......@@ -94,7 +94,6 @@ public class SourceForward {
try{
SourceForwardCrawler crawler = new SourceForwardCrawler();
SourceForwardDataCallBack callback = new SourceForwardDataCallBack() {
@Override
public void onData(SourceForwardBean data, Attribution attr) {
list.add(data);
......
......@@ -72,13 +72,13 @@ public class URLLive {
/**
 * Manual smoke-test entry point: initializes the proxy configuration, runs
 * {@code URLLive.verificationURLLive} on a hard-coded URL list and prints
 * each returned bean to standard output.
 *
 * @param args unused
 */
public static void main(String[] args) {
ProxyInit.initProxy();
List<String> urlList = new ArrayList<>();
// Hard-coded sample URLs used only for this manual run.
urlList.add("http://www.toutiao.com/item/1668646006370318/");
urlList.add("http://mp.weixin.qq.com/s?__biz=Mzg3MDMzNTc5Mg==&mid=2247485220&idx=1&sn=9118543ca120489cccbdc102be58f881");
// urlList.add("http://www.yidianzixun.com/article/0PYO4Gbh");
List<UrlLiveBean> u = URLLive.verificationURLLive(urlList);
for(UrlLiveBean b : u) {
System.out.println(b.toString());
}
}
}
static class UrlLiveCrawlerThread extends Thread{
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment