Commit 7f4a87a2 by yangchen

修改 crawler-core 提升版本

parent 1b20782c
......@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>source-forward</artifactId>
<version>0.1.1-SNAPSHOT</version>
<version>0.1.3-SNAPSHOT</version>
<name>source-forward</name>
<description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description>
......@@ -24,12 +24,12 @@
<dependency>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.1.1-SNAPSHOT</version>
<version>0.1.2-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId>
<version>0.1.1-RELEASE</version>
<version>0.3.0-RELEASE</version>
<scope>provided</scope>
</dependency>
</dependencies>
......
package com.zhiwei.source_forward.crawler;
import java.util.concurrent.TimeUnit;
import java.util.Objects;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.zhiwei.crawler.async.MultiThreadingCounter;
import com.zhiwei.async.GroupSync;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.source_forward.bean.ContentBean;
import com.zhiwei.source_forward.bean.ContentBean.Attribution;
import com.zhiwei.source_forward.util.ContentDataCallback;
import com.zhiwei.source_forward.util.MatchContent;
import okhttp3.Request;
import okhttp3.Response;
public class ContentCrawler {
......@@ -30,10 +29,10 @@ public class ContentCrawler {
* @return
* @throws Exception
*/
public MultiThreadingCounter submitTask(ContentDataCallback callback,
public GroupSync submitTask(ContentDataCallback callback,
String... urls) {
try {
MultiThreadingCounter counter = new MultiThreadingCounter(15,TimeUnit.MINUTES,false);
GroupSync counter = new GroupSync();
start(counter, callback, urls);
return counter;
} catch (Exception e) {
......@@ -49,7 +48,7 @@ public class ContentCrawler {
* @param callback
* @param urls
*/
private void start(MultiThreadingCounter counter,
private void start(GroupSync counter,
ContentDataCallback callback, String... urls) {
if (urls != null && urls.length > 0) {
for (String url : urls) {
......@@ -73,23 +72,25 @@ public class ContentCrawler {
* @param callback
* @return
*/
private MultiThreadingCounter search(MultiThreadingCounter counter,
private GroupSync search(GroupSync counter,
String url, Attribution attr, ContentDataCallback callback) {
logger.info("当前处理 URL: {}", url);
Request request = RequestUtils.wrapGet(url);
counter.increase();
httpBoot.asyncCall(request, ProxyHolder.NAT_PROXY, true).addListener(future -> {
if (future.isSuccess()) {
Response response = future.result();
counter.add();
httpBoot.asyncCall(request, ProxyHolder.NAT_PROXY, true).whenComplete((rs,ex) -> {
try {
parseHtml(response, attr, callback);
} catch (Exception e) {
logger.error("解析出错", e);
}
if (Objects.isNull(ex)) {
parseHtml(rs.body().string(), attr, callback);
} else {
logger.info("{} 搜索结果访问失败: {}", request.url().url(),future.cause().getMessage());
logger.info("{} 搜索结果访问失败: {}", request.url().url(),ex);
}
} catch (Exception e) {
logger.info("搜索结果访问失败: {}", ex);
} finally {
counter.done();
}
counter.reduce();
});
return counter;
......@@ -103,28 +104,20 @@ public class ContentCrawler {
* @param attr
* @param callback
*/
private void parseHtml(Response response, Attribution attr,
private void parseHtml(String result, Attribution attr,
ContentDataCallback callback) {
String content = null;
try {
if (response.isSuccessful()) {
String html = response.body().string();
content = MatchContent.matchContent(attr.get().toString(),
html);
}
} catch (Exception e) {
logger.error("网页链接失效", e);
} finally {
if (response != null) {
response.close();
}
}
String content = MatchContent.matchContent(attr.get().toString(),
result);
ContentBean cb = new ContentBean(attr.get().toString(), content);
if (callback == null) {
logger.warn("DataCallback 对象为 null,无法保存数据");
} else {
callback.onData(cb, attr);
}
} catch (Exception e) {
logger.error("网页链接失效", e);
}
}
......
......@@ -3,17 +3,17 @@ package com.zhiwei.source_forward.crawler;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.Objects;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Node;
import com.zhiwei.crawler.async.MultiThreadingCounter;
import com.zhiwei.async.GroupSync;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution;
import com.zhiwei.source_forward.util.MatchChannel;
......@@ -21,7 +21,6 @@ import com.zhiwei.source_forward.util.MatchSource;
import com.zhiwei.source_forward.util.MediaSelfSourceDataCallBack;
import okhttp3.Request;
import okhttp3.Response;
/**
*
......@@ -44,9 +43,9 @@ public class MediaSelfSourceCrawler {
* @return
* @throws Exception
*/
public MultiThreadingCounter submitTask(MediaSelfSourceDataCallBack callback,String... urls) {
public GroupSync submitTask(MediaSelfSourceDataCallBack callback,String... urls) {
try {
MultiThreadingCounter counter = new MultiThreadingCounter("任务======= ", 15,TimeUnit.MINUTES,true);
GroupSync counter = new GroupSync();
start(counter, callback, urls);
return counter;
} catch (Exception e) {
......@@ -62,10 +61,10 @@ public class MediaSelfSourceCrawler {
* @param callback
* @param urls
*/
private void start(MultiThreadingCounter counter,MediaSelfSourceDataCallBack callback, String... urls) {
private void start(GroupSync counter,MediaSelfSourceDataCallBack callback, String... urls) {
if (urls != null && urls.length > 0) {
for (String url : urls) {
counter.increase();
counter.add();
if (url != null) {
try {
search(counter, url, Attribution.of(url), callback);
......@@ -73,7 +72,7 @@ public class MediaSelfSourceCrawler {
logger.error("搜索创建出错", e);
}
}
counter.reduce();
counter.done();
}
}
}
......@@ -87,28 +86,28 @@ public class MediaSelfSourceCrawler {
* @param callback
* @return
*/
private MultiThreadingCounter search(MultiThreadingCounter counter, String url,Attribution attr, MediaSelfSourceDataCallBack callback) {
private GroupSync search(GroupSync counter, String url,Attribution attr, MediaSelfSourceDataCallBack callback) {
logger.info("当前处理 URL: {}", url);
Map<String,Object> map = new HashMap<>();
if(url.contains("toutiao.com")) {
map.put("referer", url);
}
Request request = RequestUtils.wrapGet(url, map);
counter.increase();
httpBoot.asyncCall(request, ProxyHolder.NAT_PROXY, true).addListener(future -> {
counter.add();
httpBoot.asyncCall(request, ProxyHolder.NAT_PROXY).whenComplete((rs,ex) -> {
try {
if (future.isSuccess()) {
Response response = future.result();
if (Objects.isNull(ex)) {
try {
parseHtml(response, attr, callback);
parseHtml(rs.body().string(), attr, callback);
} catch (Exception e) {
logger.error("解析出错", e);
}
} else {
logger.info("{} 搜索结果访问失败: {}", request.url().url(), future.cause().getMessage());
logger.info("{} 搜索结果访问失败: {}", request.url().url(),ex);
}
} finally {
counter.reduce();
counter.done();
}
});
return counter;
......@@ -121,30 +120,23 @@ public class MediaSelfSourceCrawler {
* @param attr
* @param callback
*/
private void parseHtml(Response response, Attribution attr,
private void parseHtml(String result, Attribution attr,
MediaSelfSourceDataCallBack callback) {
String source = null;
String channel = null;
try {
if(response.isSuccessful()){
String html = response.body().string();
source = MatchSource.matchMediaSelfSource(attr.get().toString(),html);
source = MatchSource.matchMediaSelfSource(attr.get().toString(),result);
if(source==null || source.equals("")){
source = null;
}
channel = MatchChannel.verifyChannel(attr.get().toString());
if(channel==null){
List<Node> nodeList = Jsoup.parse(html).head().childNodes();
List<Node> nodeList = Jsoup.parse(result).head().childNodes();
channel = MatchChannel.matchChannel(nodeList);
}
}
} catch (Exception e) {
logger.error("exception ",e);
source = null;
}finally {
if(response != null) {
response.close();
}
}
logger.info(attr.get()+"================="+source);
MediaSelfSourceBean msfb = new MediaSelfSourceBean(attr.get().toString(), source, channel);
......
......@@ -2,7 +2,7 @@ package com.zhiwei.source_forward.crawler;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.Objects;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
......@@ -10,10 +10,10 @@ import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node;
import com.zhiwei.crawler.async.MultiThreadingCounter;
import com.zhiwei.async.GroupSync;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.source_forward.bean.SourceForwardBean;
import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution;
import com.zhiwei.source_forward.util.MatchChannel;
......@@ -23,7 +23,6 @@ import com.zhiwei.source_forward.util.SourceForwardDataCallBack;
import com.zhiwei.tools.httpclient.HeaderTool;
import okhttp3.Request;
import okhttp3.Response;
public class SourceForwardCrawler {
......@@ -32,9 +31,9 @@ public class SourceForwardCrawler {
private static HttpBoot httpBoot = new HttpBoot();
private static List<String> sourceList = SourceData.getSourceList();
public MultiThreadingCounter submitTask(SourceForwardDataCallBack callback,String... urls) {
public GroupSync submitTask(SourceForwardDataCallBack callback,String... urls) {
try {
MultiThreadingCounter counter = new MultiThreadingCounter(15,TimeUnit.MINUTES,false);
GroupSync counter = new GroupSync();
start(counter, callback, urls);
return counter;
} catch (Exception e) {
......@@ -43,10 +42,10 @@ public class SourceForwardCrawler {
}
}
private void start(MultiThreadingCounter counter,SourceForwardDataCallBack callback, String... urls) {
private void start(GroupSync counter,SourceForwardDataCallBack callback, String... urls) {
if (urls != null && urls.length > 0) {
for (String url : urls) {
counter.increase();
counter.add();
if (url != null) {
try {
search(counter, url, Attribution.of(url), callback);
......@@ -54,12 +53,12 @@ public class SourceForwardCrawler {
logger.error("搜索创建出错", e);
}
}
counter.reduce();
counter.done();
}
}
}
private MultiThreadingCounter search(MultiThreadingCounter counter, String url,Attribution attr, SourceForwardDataCallBack callback) {
private GroupSync search(GroupSync counter, String url,Attribution attr, SourceForwardDataCallBack callback) {
logger.info("当前处理 URL: {}", url);
Map<String,String> headers = HeaderTool.getCommonHead();
if(url.contains("www.toutiao.com")){
......@@ -67,34 +66,29 @@ public class SourceForwardCrawler {
}
Request request = RequestUtils.wrapGet(url, headers);
counter.increase();
httpBoot.asyncCall(request, ProxyHolder.NAT_PROXY, true).addListener(future -> {
counter.add();
httpBoot.asyncCall(request, ProxyHolder.NAT_PROXY, true).whenComplete((rs,ex) -> {
try {
if (future.isSuccess()) {
Response response = future.result();
try {
parseHtml(response, attr, callback);
} catch (Exception e) {
logger.error("解析出错", e);
}
if (Objects.isNull(ex)) {
parseHtml(rs.body().string(), attr, callback);
} else {
logger.info("{} 搜索结果访问失败: {}", request.url().url(), future.cause().getMessage());
logger.info("{} 搜索结果访问失败: {}", request.url().url(), ex);
}
} catch (Exception e1) {
logger.error("解析出错",e1);
} finally {
counter.reduce();
counter.done();
}
});
return counter;
}
private void parseHtml(Response response, Attribution attr,
private void parseHtml(String body, Attribution attr,
SourceForwardDataCallBack callback) {
String source = null;
String channel = "新闻";
String isforward = "未知";
try {
if(response.isSuccessful()){
String body = response.body().string();
Document document = Jsoup.parse(body);
if(attr.get().toString().contains("mp.weixin.qq.com")){
isforward = document.select("div#meta_content").select("span#copyright_logo").text();
......@@ -115,15 +109,9 @@ public class SourceForwardCrawler {
}
source = MatchSource.matchSource(attr.get().toString(),document.toString(), sourceList);
}
}
} catch (Exception e) {
e.printStackTrace();
source = null;
channel = "新闻";
}finally {
if(response != null) {
response.close();
}
}
logger.info(attr.get().toString()+"======="+channel+"================="+source);
SourceForwardBean sfb = new SourceForwardBean(attr.get().toString(), channel, source,isforward);
......
......@@ -6,7 +6,6 @@ import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.TimeUnit;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
......@@ -14,17 +13,16 @@ import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.async.MultiThreadingCounter;
import com.zhiwei.async.GroupSync;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.source_forward.bean.UrlLiveBean;
import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution;
import com.zhiwei.source_forward.util.UrlLiveDataCallback;
import com.zhiwei.tools.httpclient.HeaderTool;
import okhttp3.Request;
import okhttp3.Response;
/**
*
......@@ -39,9 +37,9 @@ public class UrlLiveCrawler {
private static final Logger logger = LogManager.getLogger(UrlLiveCrawler.class);
private static HttpBoot httpBoot = new HttpBoot();
public MultiThreadingCounter submitTask(UrlLiveDataCallback callback,String... urls) {
public GroupSync submitTask(UrlLiveDataCallback callback,String... urls) {
try {
MultiThreadingCounter counter = new MultiThreadingCounter(10,TimeUnit.MINUTES,false);
GroupSync counter = new GroupSync();
start(counter, callback, urls);
return counter;
} catch (Exception e) {
......@@ -50,10 +48,10 @@ public class UrlLiveCrawler {
}
}
private void start(MultiThreadingCounter counter,UrlLiveDataCallback callback, String... urls) {
private void start(GroupSync counter,UrlLiveDataCallback callback, String... urls) {
if (nonNull(urls) && urls.length > 0) {
for (String url : urls) {
counter.increase();
counter.add();
if (nonNull(url)) {
try {
search(counter, url, Attribution.of(url,1), callback);
......@@ -61,12 +59,12 @@ public class UrlLiveCrawler {
logger.error("搜索创建出错:", e);
}
}
counter.reduce();
counter.done();
}
}
}
private MultiThreadingCounter search(MultiThreadingCounter counter, String url,
private GroupSync search(GroupSync counter, String url,
Attribution attr, UrlLiveDataCallback callback) {
url = dealUrl(url);
logger.info("当前处理 URL: {}", url);
......@@ -75,46 +73,33 @@ public class UrlLiveCrawler {
headers.put("referer", url);
}
Request request = RequestUtils.wrapGet(url, headers);
counter.increase();
httpBoot.asyncCall(request, ProxyHolder.NAT_PROXY).addListener(future -> {
counter.add();
httpBoot.asyncCall(request, ProxyHolder.NAT_PROXY).whenComplete((rs,ex) -> {
try {
if (future.isSuccess()) {
Response response = future.result();
try {
if(response.code() == 200) {
parseHtml(response.body().string(), attr, callback,counter);
if (Objects.isNull(ex)) {
if(rs.code() == 200) {
parseHtml(rs.body().string(), attr, callback,counter);
}else {
if(attr.getCount() > 2) {
callBack(callback, attr, 1,String.valueOf(response.code()));
callBack(callback, attr, 1,String.valueOf(rs.code()));
}else {
attr.AddCount();
search(counter, attr.getAttr().toString(), attr, callback);
}
}
} catch (Exception e) {
logger.error("解析出错 {}", e);
}finally {
if(response != null) {
response.close();
}
}
} else {
if(future.cause().getMessage().contains("status code: ")) {
callBack(callback, attr, 1,null);
}else {
if(attr.getCount() > 3) {
callBack(callback, attr, -1,null);
logger.info("{} 搜索结果访问失败: {}", request.url().url(), future.cause().getMessage());
logger.info("{} 搜索结果访问失败: {}", request.url().url(), ex);
}else {
attr.AddCount();
search(counter, attr.getAttr().toString(), attr, callback);
}
}
}
} catch (Exception e) {
logger.error(" 数据是否删除 采集出错 {} ",e);
}finally {
counter.reduce();
counter.done();
}
});
return counter;
......@@ -178,7 +163,7 @@ public class UrlLiveCrawler {
* @param callback
*/
private void parseHtml(String html, Attribution attr,
UrlLiveDataCallback callback,MultiThreadingCounter counter) {
UrlLiveDataCallback callback,GroupSync counter) {
if (callback == null) {
logger.warn("DataCallback 对象为 null,无法保存数据");
} else {
......
......@@ -25,7 +25,7 @@ public class MediaSelfSource {
public static void main(String[] args) {
ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
List<String> urlList = new ArrayList<>();
urlList.add("https://sports.qq.com/a/20190227/001177.htm");
urlList.add("https://www.toutiao.com/a6669697912458445059/");
List<MediaSelfSourceBean> u = MediaSelfSource.getMediaSelfSource(urlList);
for(MediaSelfSourceBean b : u) {
System.out.println(b.toString());
......
package com.zhiwei.source_forward.util;
import java.util.List;
import java.util.Objects;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
......@@ -148,6 +149,16 @@ public class MatchSource {
source = "东方头条-" + source;
}
}else if(url.contains("fashion.eastday.com")){
//处理东方头条网-自媒体号匹配
source = document.select("div.J-title_detail.title_detail > div > div.fl > i:nth-child(2)").text().trim();
if(Objects.isNull(source) || source.length() < 1) {
source = document.select("div.J-title_detail.title_detail > div > div.fl > a").text().trim();
}
if(source!=null && source.length()>1){
source = "东方看点-" + source;
}
}else if(url.contains("sh.qihoo.com")){
//今日报点解析
source = document.select("span.source").text().trim();
......@@ -237,6 +248,18 @@ public class MatchSource {
if(source!=null && !source.equals("")){
source = "网易号-" + source;
}
}else if(url.contains("myzaker.com")){
source = document.select("div.article_header > div > a > span.auther")
.text();
if(source!=null && !source.equals("")){
source = "zaker-" + source;
}
}else if(url.contains("edushi.com")){
source = document.select("div.eds-name-box > div.eds-name > a > div.name")
.text();
if(source!=null && !source.equals("")){
source = "今日潮闻-" + source;
}
}
return source;
} catch (Exception e) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment