Commit a71c606b by yangchen

链接是否删除 修改

parent 37ac4e23
......@@ -6,6 +6,8 @@ public class UrlLiveBean {
// Status code of the link; the boolean constructor stores 1 for "deleted" and 0 otherwise.
private Integer isLive;
// Title carried alongside the URL; may be null.
private String title;
public UrlLiveBean() {
super();
}
......@@ -16,15 +18,31 @@ public class UrlLiveBean {
this.isLive = isLive;
}
/**
 * Creates a bean from an explicit status code.
 *
 * @param url    the URL this result belongs to
 * @param isLive status code, stored as-is
 * @param title  title to carry with the result; may be null
 */
public UrlLiveBean(String url, Integer isLive, String title) {
    this.url = url;
    this.isLive = isLive;
    this.title = title;
}
/**
 * Creates a bean from a deleted / not-deleted flag.
 *
 * @param url    the URL this result belongs to
 * @param isLive {@code true} when the link is considered deleted
 * @param title  title to carry with the result; may be null
 */
public UrlLiveBean(String url, boolean isLive, String title) {
    this.url = url;
    this.title = title;
    // 1 = deleted, 0 = not deleted
    this.isLive = isLive ? 1 : 0;
}
/** Returns the title carried by this bean, or null when none was set. */
public String getTitle() {
    return this.title;
}
/**
 * Replaces the title carried by this bean.
 *
 * @param title the new title; may be null
 */
public void setTitle(final String title) {
    this.title = title;
}
public String getUrl() {
return url;
......@@ -44,7 +62,8 @@ public class UrlLiveBean {
/** Returns a debug representation listing url, isLive and title. */
@Override
public String toString() {
    // Only one return is kept: the earlier title-less return made this one unreachable.
    return "UrlLiveBean [url=" + url + ", isLive=" + isLive + ", title=" + title + "]";
}
/**
......
package com.zhiwei.source_forward.crawler;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.TimeUnit;
import org.apache.logging.log4j.LogManager;
......@@ -34,7 +35,7 @@ public class UrlLiveCrawler {
// HTTP client shared by every instance of this crawler; used to issue the asynchronous requests.
private static HttpBoot httpBoot = new HttpBoot();
/**
 * Creates a fresh counter, starts checking the given URLs with it and returns it.
 *
 * @param callback receiver for each URL's result
 * @param urls     URLs to check
 * @return the counter handed to {@code start} for this batch
 * @throws Exception propagated from {@code start}
 */
public MultiThreadingCounter submitTask(UrlLiveDataCallback callback, String... urls) throws Exception {
    // Single declaration: the counter was previously declared twice (20 and 10 minutes).
    // NOTE(review): the arguments look like a 10-minute limit - confirm against MultiThreadingCounter.
    MultiThreadingCounter counter = new MultiThreadingCounter(10, TimeUnit.MINUTES, false);
    start(counter, callback, urls);
    return counter;
}
......@@ -57,21 +58,29 @@ public class UrlLiveCrawler {
Attribution attr, UrlLiveDataCallback callback) {
url = dealUrl(url);
logger.info("当前处理 URL: {}", url);
// Map<String,String> headers = new HashMap<>();
Map<String,String> headers = HeaderTool.getCommonHead();
if(url.contains("www.toutiao.com")){
headers.put("referer", url);
}
Request request = RequestUtils.wrapGet(url, headers);
counter.increase();
httpBoot.asyncCall(request, ProxyHolder.NAT_PROXY, false).addListener(future -> {
httpBoot.asyncCall(request, ProxyHolder.NAT_PROXY).addListener(future -> {
try {
if (future.isSuccess()) {
Response response = future.result();
try {
if(response.code() == 200) {
parseHtml(response.body().string(), attr, callback);
parseHtml(response.body().string(), attr, callback,counter);
}else if(response.code() == 404){
if(attr.getCount() > 2) {
callBack(callback, attr, 1,"404");
}else {
attr.AddCount();
search(counter, attr.getAttr().toString(), attr, callback);
}
}else {
callBack(callback, attr, 1);
callBack(callback, attr, -2,String.valueOf(response.code()));
}
} catch (Exception e) {
logger.error("解析出错", e);
......@@ -82,10 +91,10 @@ public class UrlLiveCrawler {
}
} else {
if(future.cause().getMessage().contains("status code: ")) {
callBack(callback, attr, 1);
callBack(callback, attr, 1,null);
}else {
if(attr.getCount() > 3) {
callBack(callback, attr, -1);
callBack(callback, attr, -1,null);
logger.info("{} 搜索结果访问失败: {}", request.url().url(), future.cause().getMessage());
}else {
attr.AddCount();
......@@ -102,8 +111,13 @@ public class UrlLiveCrawler {
return counter;
}
private void callBack(UrlLiveDataCallback callback,Attribution attr,int i) {
UrlLiveBean ulb = new UrlLiveBean(attr.getAttr().toString(), i);
private void callBack(UrlLiveDataCallback callback,Attribution attr,int i,String title) {
UrlLiveBean ulb = null;
if(i == 1) {
ulb = new UrlLiveBean(attr.getAttr().toString(), true, title);
}else {
ulb = new UrlLiveBean(attr.getAttr().toString(), i);
}
if (callback == null) {
logger.warn("DataCallback 对象为 null,无法保存数据");
} else {
......@@ -149,19 +163,21 @@ public class UrlLiveCrawler {
* @param callback
*/
private void parseHtml(String html, Attribution attr,
UrlLiveDataCallback callback) {
/***验证网页是否能够连通*/
boolean f = true;
try {
f = matchDel(html,attr.getAttr().toString());
} catch (Exception e) {
logger.error("数据判断出错 ",e);
}
UrlLiveBean ulb = new UrlLiveBean(attr.getAttr().toString(), f);
UrlLiveDataCallback callback,MultiThreadingCounter counter) {
if (callback == null) {
logger.warn("DataCallback 对象为 null,无法保存数据");
} else {
callback.onData(ulb, attr);
UrlLiveBean ulb = matchDel(html,attr,attr.getAttr().toString());
if(Objects.nonNull(ulb)) {
callback.onData(ulb, attr);
}else {
if(attr.getCount() > 3) {
callBack(callback, attr, -1,null);
}else {
attr.AddCount();
search(counter, attr.getAttr().toString(), attr, callback);
}
}
}
}
......@@ -173,22 +189,65 @@ public class UrlLiveCrawler {
* @param @return 设定文件
* @return boolean 返回类型
*/
public boolean matchDel(String result,String url){
public UrlLiveBean matchDel(String result,Attribution attr,String url){
Document doc = Jsoup.parse(result);
String title = null;
boolean f = false;
if(url.contains("mp.weixin.qq.com") || url.contains("post.mp.qq.com")){
title = doc.select("h2.rich_media_title").text().replaceAll(" ", "");
if(Objects.isNull(title) || title.length() < 1) {
title = doc.select("p.title").text();
}
if(Objects.isNull(title) || title.length() < 1) {
title = doc.select("h3.msg-title").text();
}
}else if(url.contains("kuaibao")){
title = doc.select("p.title").text().replaceAll(" ", "");;
}else {
title = doc.select("title").text().replaceAll(" ", "");;
title = doc.select("p.title").text().replaceAll(" ", "");
}else if(url.contains("chinadaily.com.cn")){
title = doc.select("p.style1").text().replaceAll(" ", "");
}else if(url.contains("baidu.com") || url.contains("hao123.com")) {
title = doc.select("p#contaniner").text();
}else if(url.contains("kanfanews.com")) {
title = doc.select("p#tit").text();
}
//若title 为拿到 用 此方法
if(Objects.isNull(title) || title.length() < 1) {
title = doc.select("div.adiv > p > span").text().replaceAll(" ", "");
}
if(title!=null && !title.equals("")){
if(title.contains("未知错误") || title.contains("Object moved") || title.contains("404") || title.contains("页面没有找到") || title.contains("页面未找到") || title.contains("301 Moved Permanently")){
return true;
//若title 为拿到 用 此方法
if(Objects.isNull(title) || title.length() < 1) {
title = doc.select("title").text().replaceAll(" ", "");
}
if(title!=null && title.length() > 1){
if(Objects.equals("网页已删除", title)
|| Objects.equals("页面提示", title) || title.contains("正在维护中")
|| Objects.equals("此文章被第三方评估为不实信息", title) || title.contains("提示信息-")
|| Objects.equals("财经头条", title) || Objects.equals("知识100题", title)
|| Objects.equals("502BadGateway", title) || Objects.equals("提示信息", title)
|| Objects.equals("跳转页", title) || Objects.equals("跳转中...", title)
|| Objects.equals("此帐号在冻结期,内容无法查看", title) || Objects.equals("东北新闻网", title)
|| Objects.equals("百度一下,你就知道", title) || Objects.equals("帐号已迁移", title)
|| Objects.equals("手机百度", title) || Objects.equals("内容被删除", title)
|| Objects.equals("亚博国际|首页", title) || Objects.equals("中国软件网", title)
|| Objects.equals("云广网", title) || Objects.equals("新浪首页", title)
|| Objects.equals("文章暂时找不到了", title) || title.contains("此内容因违规无法查看")
|| title.contains("微信公众号不存在") || title.contains("此内容被投诉且经审核涉嫌侵权,无法查看")
|| Objects.equals("-法易网", title) || Objects.equals("【一点资讯】www.yidianzixun.com", title)
|| title.contains("您访问的链接不存在") || Objects.equals("文章暂时不能查看", title)
|| Objects.equals("错误页面", title) || title.contains("thepageyourequestedwasnotfound")
|| Objects.equals("此帐号已被屏蔽, 内容无法查看", title) || Objects.equals("网站暂停通知", title)
|| title.contains("未知错误") || title.contains("Object moved")
|| title.contains("404") || title.contains("页面没有找到")
|| title.contains("页面未找到") || title.contains("301MovedPermanently")){
f = true;
}
} else {
return null;
}
return false;
return new UrlLiveBean(attr.getAttr().toString(), f,title);
}
// /**
......@@ -200,8 +259,7 @@ public class UrlLiveCrawler {
// * @time 2016年6月3日上午9:54:00
// * @return boolean
// */
// private boolean rulerYaoyan(Document doc)
// {
// private boolean rulerYaoyan(Document doc){
// boolean flg = false;
// if ("谣言".equals(doc.select(".pic_rumor").text()))
// {
......@@ -219,24 +277,16 @@ public class UrlLiveCrawler {
// * @time 2016年6月3日上午9:59:54
// * @return boolean
// */
// private boolean rulerWeigui(Document doc)
// private boolean rulerWechat(Document doc)
// {
// boolean flg = false;
// if ((doc.select("p.title").text()).contains("此内容因违规无法查看") || doc.select("p.title").text().contains("此帐号在冻结期,内容无法查看"))
// if ((doc.select("h3.msg-title").text()).contains("此内容被投诉且经审核涉嫌侵权,无法查看") || (doc.select("p.title").text()).contains("此内容因违规无法查看") || doc.select("p.title").text().contains("此帐号在冻结期,内容无法查看"))
// {
// flg = true;
// }
// return flg;
// }
//
// private boolean rulerWechatWeigui(Document doc) {
// boolean flg = false;
// if ((doc.select("h3.msg-title").text()).contains("此内容被投诉且经审核涉嫌侵权,无法查看"))
// {
// flg = true;
// }
// return flg;
// }
//
// /**
// *
......
package com.zhiwei.source_forward.run;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.source_forward.bean.UrlLiveBean;
import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution;
import com.zhiwei.source_forward.crawler.UrlLiveCrawler;
......@@ -46,6 +51,7 @@ public class URLLive {
}else if(i == 0) {
map.put("是否删除", false);
}
map.put("title", ub.getTitle());
dataMap.put(url, map);
}
}
......@@ -60,13 +66,15 @@ public class URLLive {
*/
public static List<UrlLiveBean> verificationURLLive(List<String> urlList){
    // Start the link-validity check and return its results directly.
    // The former local variable and its return made this statement unreachable.
    return UrlLiveCrawlerThread.getUrlLiveCrawle(urlList);
}
public static void main(String[] args) {
ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
List<String> urlList = new ArrayList<>();
urlList.add("http://www.teso.cc/html/zixun/201606/233848.html");
urlList.add("https://www.hao123.com/mid/16981890690654602094");
List<UrlLiveBean> u = URLLive.verificationURLLive(urlList);
for(UrlLiveBean b : u) {
System.out.println(b.toString());
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment