Commit 5a79e3d2 by yangchen

发布版本 修改

parent 4a1a7343
......@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>source-forward</artifactId>
<version>0.0.5-SNAPSHOT</version>
<version>0.0.7-SNAPSHOT</version>
<name>source-forward</name>
<description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description>
......@@ -24,7 +24,12 @@
<dependency>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.0.9-SNAPSHOT</version>
<version>0.1.1-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId>
<version>0.1.1-RELEASE</version>
</dependency>
</dependencies>
......
......@@ -7,7 +7,7 @@ import org.apache.logging.log4j.Logger;
import com.zhiwei.crawler.async.MultiThreadingCounter;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.HttpRequestBuilder;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.source_forward.bean.ContentBean;
import com.zhiwei.source_forward.bean.ContentBean.Attribution;
......@@ -31,8 +31,8 @@ public class ContentCrawler {
* @throws Exception
*/
public MultiThreadingCounter submitTask(ContentDataCallback callback,
String... urls) throws Exception {
MultiThreadingCounter counter = new MultiThreadingCounter(20,TimeUnit.MINUTES,false);
String... urls) {
MultiThreadingCounter counter = new MultiThreadingCounter(15,TimeUnit.MINUTES,false);
start(counter, callback, urls);
return counter;
}
......@@ -52,7 +52,7 @@ public class ContentCrawler {
try {
search(counter, url, Attribution.of(url), callback);
} catch (Exception e) {
logger.error("关键词 {} 搜索创建出错: {}", e.getMessage());
logger.error("搜索创建出错", e);
}
}
}
......@@ -71,7 +71,7 @@ public class ContentCrawler {
private MultiThreadingCounter search(MultiThreadingCounter counter,
String url, Attribution attr, ContentDataCallback callback) {
logger.info("当前处理 URL: {}", url);
Request request = HttpRequestBuilder.newGetRequest(url, null);
Request request = RequestUtils.wrapGet(url);
counter.increase();
httpBoot.asyncCall(request, ProxyHolder.NAT_PROXY, true).addListener(future -> {
if (future.isSuccess()) {
......@@ -84,9 +84,9 @@ public class ContentCrawler {
} else {
logger.info("{} 搜索结果访问失败: {}", request.url().url(),future.cause().getMessage());
}
counter.reduce();
});
return counter;
}
......@@ -104,12 +104,11 @@ public class ContentCrawler {
try {
if (response.isSuccessful()) {
String html = response.body().string();
System.out.println(html);
content = MatchContent.matchContent(attr.get().toString(),
html);
}
} catch (Exception e) {
logger.info("网页链接失效", e.fillInStackTrace());
logger.error("网页链接失效", e);
} finally {
if (response != null) {
response.close();
......
......@@ -45,7 +45,7 @@ public class MediaSelfSourceCrawler {
* @throws Exception
*/
public MultiThreadingCounter submitTask(MediaSelfSourceDataCallBack callback,String... urls) throws Exception {
MultiThreadingCounter counter = new MultiThreadingCounter("任务======= ", 10,TimeUnit.SECONDS,true);
MultiThreadingCounter counter = new MultiThreadingCounter("任务======= ", 15,TimeUnit.SECONDS,true);
start(counter, callback, urls);
return counter;
}
......@@ -65,7 +65,7 @@ public class MediaSelfSourceCrawler {
counter.increase();
search(counter, url, Attribution.of(url), callback);
} catch (Exception e) {
logger.error("关键词 {} 搜索创建出错: {}", e.getMessage());
logger.error("搜索创建出错", e);
} finally {
counter.reduce();
}
......@@ -135,6 +135,7 @@ public class MediaSelfSourceCrawler {
}
}
} catch (Exception e) {
e.printStackTrace();
source = null;
}finally {
if(response != null) {
......
......@@ -31,7 +31,7 @@ public class SourceForwardCrawler {
private static List<String> sourceList = SourceData.getSourceList();
public MultiThreadingCounter submitTask(SourceForwardDataCallBack callback,String... urls) throws Exception {
MultiThreadingCounter counter = new MultiThreadingCounter(20,TimeUnit.MINUTES,false);
MultiThreadingCounter counter = new MultiThreadingCounter(5,TimeUnit.MINUTES,false);
start(counter, callback, urls);
return counter;
}
......@@ -44,7 +44,7 @@ public class SourceForwardCrawler {
counter.increase();
search(counter, url, Attribution.of(url), callback);
} catch (Exception e) {
logger.error("关键词 {} 搜索创建出错: {}", e.getMessage());
logger.error("搜索创建出错", e);
} finally {
counter.reduce();
}
......@@ -58,17 +58,20 @@ public class SourceForwardCrawler {
Request request = HttpRequestBuilder.newGetRequest(url, null);
counter.increase();
httpBoot.asyncCall(request, ProxyHolder.NAT_PROXY, true).addListener(future -> {
if (future.isSuccess()) {
Response response = future.result();
try {
parseHtml(response, attr, callback);
} catch (Exception e) {
logger.error("解析出错", e);
try {
if (future.isSuccess()) {
Response response = future.result();
try {
parseHtml(response, attr, callback);
} catch (Exception e) {
logger.error("解析出错", e);
}
} else {
logger.info("{} 搜索结果访问失败: {}", request.url().url(), future.cause().getMessage());
}
} else {
logger.info("{} 搜索结果访问失败: {}", request.url().url(), future.cause().getMessage());
} finally {
counter.reduce();
}
counter.reduce();
});
return counter;
}
......@@ -98,6 +101,7 @@ public class SourceForwardCrawler {
}
}
} catch (Exception e) {
e.printStackTrace();
source = null;
channel = "新闻";
}finally {
......
......@@ -46,7 +46,7 @@ public class UrlLiveCrawler {
try {
search(counter, url, Attribution.of(url,1), callback);
} catch (Exception e) {
logger.error("关键词 {} 搜索创建出错: {}", e.getMessage());
logger.error("搜索创建出错:", e);
}
}
}
......@@ -124,7 +124,7 @@ public class UrlLiveCrawler {
url = "https://www.toutiao.com/a" + url.split("/")[4] + "/";
}
} catch (Exception e) {
logger.info("url 解析出错 {}",url);
logger.error("url 解析出错 ",e);
return url;
}
}else if(url.contains("mp.weixin.qq.com")) {
......@@ -151,7 +151,7 @@ public class UrlLiveCrawler {
try {
f = matchDel(html,attr.getAttr().toString());
} catch (Exception e) {
logger.info("数据判断出错 {}",e.getMessage());
logger.error("数据判断出错 ",e);
}
UrlLiveBean ulb = new UrlLiveBean(attr.getAttr().toString(), f);
if (callback == null) {
......@@ -378,6 +378,7 @@ public class UrlLiveCrawler {
}
catch (Exception e)
{
e.printStackTrace();
// TODO: handle exception
}
......@@ -406,6 +407,7 @@ public class UrlLiveCrawler {
}
catch (Exception e)
{
e.printStackTrace();
// : handle exception
}
return flg;
......@@ -438,6 +440,7 @@ public class UrlLiveCrawler {
}
}
} catch (Exception e) {
e.printStackTrace();
return false;
}
return false;
......
......@@ -51,25 +51,25 @@ public class ContentMatch {
public static List<ContentBean> getContentMatch(List<String> urlList){
//启动获取链接正文
List<ContentBean> dataList = ContentMatchCrawlerThread.getContentMatch(urlList);
return dataList;
return ContentMatchCrawlerThread.getContentMatch(urlList);
}
public static void main(String[] args) {
List<String> urlList = new ArrayList<>();
urlList.add("https://mp.weixin.qq.com/s?src=11&timestamp=1539828001&ver=1189&signature=SAyiGuX8VfwlPsIlq*V7I8epXKcMc9Zr6RptkDT34vDk7tSYQCwix6qJxMm25JK9gxo0t9HKAeqm70V2J1FhcDiSlf1eMhSSMz8EiCk*Hu50B7sJFkoH46HHo1uiC4f7&new=1");
List<ContentBean> l = getContentMatch(urlList);
for(ContentBean cb : l) {
System.out.println(cb.getContent());
}
// ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
// List<String> urlList = new ArrayList<>();
// urlList.add("http://www.egsea.com/news/detail?id=324048");
// List<ContentBean> l = getContentMatch(urlList);
// for(ContentBean cb : l) {
// System.out.println(cb.getContent());
// }
}
static class ContentMatchCrawlerThread extends Thread{
private static List<ContentBean> getContentMatch(List<String> urlList){
List<ContentBean> list = Collections.synchronizedList(new ArrayList<ContentBean>());
try{
ContentCrawler crawler = new ContentCrawler();
List<ContentBean> list = Collections.synchronizedList(new ArrayList<ContentBean>());
ContentDataCallback callback = new ContentDataCallback() {
@Override
......@@ -80,11 +80,10 @@ public class ContentMatch {
};
crawler.submitTask(callback,urlList.toArray(new String[urlList.size()])).await();
return list;
}catch (Exception e){
e.fillInStackTrace();
logger.error(" Exception {} ",e);
}
return null;
return list;
}
}
......
......@@ -7,8 +7,6 @@ import java.util.List;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution;
import com.zhiwei.source_forward.crawler.MediaSelfSourceCrawler;
......@@ -24,13 +22,13 @@ public class MediaSelfSource {
}
public static void main(String[] args) {
ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
List<String> urlList = new ArrayList<>();
urlList.add("https://www.toutiao.com/a6452936157751968013/");
List<MediaSelfSourceBean> u = MediaSelfSource.getMediaSelfSource(urlList);
for(MediaSelfSourceBean b : u) {
System.out.println(b.toString());
}
// ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
// List<String> urlList = new ArrayList<>();
// urlList.add("https://www.toutiao.com/a6452936157751968013/");
// List<MediaSelfSourceBean> u = MediaSelfSource.getMediaSelfSource(urlList);
// for(MediaSelfSourceBean b : u) {
// System.out.println(b.toString());
// }
}
static class MediaSelfSourceCrawlerThread extends Thread{
......@@ -49,9 +47,8 @@ public class MediaSelfSource {
};
crawler.submitTask(callback,urlList.toArray(new String[urlList.size()])).await();
return list;
}catch (Exception e){
e.fillInStackTrace();
e.printStackTrace();
}
return list;
}
......
......@@ -10,8 +10,6 @@ import java.util.Map.Entry;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean;
import com.zhiwei.source_forward.bean.SourceForwardBean;
import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution;
......@@ -119,8 +117,9 @@ public class SourceForward {
for(Entry<String,Map<String,Object>> entry : dataMap.entrySet()){
urlList.add(entry.getKey());
}
System.out.println(urlList.size());
List<SourceForwardBean> dataList = SourceForwardCrawlerThread.getSourceForward(urlList);
System.out.println(dataList.size());
for(SourceForwardBean sfb : dataList){
String url = sfb.getUrl();
String root_source = sfb.getRoot_source();
......@@ -144,7 +143,7 @@ public class SourceForward {
dataMap.put(url, data);
}
}
System.out.println("success");
return dataMap;
}
......@@ -156,26 +155,25 @@ public class SourceForward {
*/
public static List<SourceForwardBean> getSourceForward(List<String> urlList){
//启动获取链接来源
List<SourceForwardBean> dataList = SourceForwardCrawlerThread.getSourceForward(urlList);
return dataList;
return SourceForwardCrawlerThread.getSourceForward(urlList);
}
public static void main(String[] args) {
ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
List<String> urlList = new ArrayList<>();
urlList.add("http://www.toutiao.com/a6452936157751968013/");
List<SourceForwardBean> da = SourceForward.getSourceForward(urlList);
for(SourceForwardBean sfb : da) {
System.out.println(sfb.toString());
}
// ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
// List<String> urlList = new ArrayList<>();
// urlList.add("http://www.toutiao.com/a6452936157751968013/");
// List<SourceForwardBean> da = SourceForward.getSourceForward(urlList);
// for(SourceForwardBean sfb : da) {
// System.out.println(sfb.toString());
// }
}
static class SourceForwardCrawlerThread extends Thread{
private static List<SourceForwardBean> getSourceForward(List<String> urlList){
List<SourceForwardBean> list = Collections.synchronizedList(new ArrayList<SourceForwardBean>());
try{
SourceForwardCrawler crawler = new SourceForwardCrawler();
List<SourceForwardBean> list = Collections.synchronizedList(new ArrayList<SourceForwardBean>());
SourceForwardDataCallBack callback = new SourceForwardDataCallBack() {
@Override
......@@ -186,11 +184,10 @@ public class SourceForward {
};
crawler.submitTask(callback,urlList.toArray(new String[urlList.size()])).await();
return list;
}catch (Exception e){
e.fillInStackTrace();
e.printStackTrace();
}
return null;
return list;
}
}
......
......@@ -76,9 +76,9 @@ public class URLLive {
static class UrlLiveCrawlerThread extends Thread{
private static List<UrlLiveBean> getUrlLiveCrawle(List<String> urlList){
List<UrlLiveBean> list = Collections.synchronizedList(new ArrayList<UrlLiveBean>());
try{
UrlLiveCrawler crawler = new UrlLiveCrawler();
List<UrlLiveBean> list = Collections.synchronizedList(new ArrayList<UrlLiveBean>());
UrlLiveDataCallback callback = new UrlLiveDataCallback() {
@Override
......@@ -89,11 +89,10 @@ public class URLLive {
};
crawler.submitTask(callback,urlList.toArray(new String[urlList.size()])).await();
return list;
}catch (Exception e){
e.fillInStackTrace();
e.printStackTrace();
}
return null;
return list;
}
}
......
......@@ -40,7 +40,7 @@ public class MatchChannel {
}
}
} catch (Exception e) {
return channel;
e.printStackTrace();
}
return channel;
}
......
......@@ -44,7 +44,7 @@ public class MatchContent {
}
return ZhiWeiTools.delHTMLTag(content);
} catch (Exception e) {
logger.debug("获取全文失败",e.fillInStackTrace());
logger.error("获取全文失败",e);
content = null;
}
return content;
......@@ -106,7 +106,7 @@ public class MatchContent {
News news = ContentExtractor.getNewsByHtml(html);
content = TreateData.filterSpecialCharacter(news.getContent());
} catch (Exception e) {
logger.info("正文抽取失败,获取全文文本:{}");
logger.error("正文抽取失败,获取全文文本:",e);
content = document.text();
}
return content;
......
......@@ -192,6 +192,7 @@ public class MatchSource {
}
return source;
} catch (Exception e) {
e.printStackTrace();
return null;
}
}
......@@ -352,7 +353,9 @@ public class MatchSource {
* @return String 返回类型
*/
private static String getSourceByTime(String timeSource, String htmlBody,List<String> sourceList){
try {
/**以时间做分割,匹配来源信息。
* 主要匹配 YYYY-MM-dd xx日报
* 或 xx日报 YYYY-MM-dd
......@@ -381,5 +384,9 @@ public class MatchSource {
}
}
return null;
} catch (Exception e) {
e.printStackTrace();
return null;
}
}
}
......@@ -32,7 +32,7 @@ public class ReadMediaData {
//添加来源到自定义来源列表
SourceData.addUserSource(source);
} catch (Exception e) {
e.printStackTrace();
}
}
return result;
......
......@@ -43,6 +43,7 @@ public class TreateData {
Matcher m = p.matcher(str);
return m.replaceAll("");
} catch (Exception ex) {
ex.printStackTrace();
return str;
}
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment