Commit d6f4e440 by zhiwei

验证是否删除添加知乎验证

parent b7e91b0a
...@@ -29,8 +29,7 @@ ...@@ -29,8 +29,7 @@
<dependency> <dependency>
<groupId>com.zhiwei.crawler</groupId> <groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId> <artifactId>crawler-core</artifactId>
<version>0.3.6-RELEASE</version> <version>0.5.2-RELEASE</version>
<scope>provided</scope>
</dependency> </dependency>
</dependencies> </dependencies>
......
...@@ -77,6 +77,8 @@ public class UrlLiveBean { ...@@ -77,6 +77,8 @@ public class UrlLiveBean {
private Integer count; private Integer count;
private Integer code;
/** /**
* Constructor * Constructor
* *
...@@ -92,6 +94,12 @@ public class UrlLiveBean { ...@@ -92,6 +94,12 @@ public class UrlLiveBean {
* @param attr * @param attr
* @param count * @param count
*/ */
private Attribution(Object attr,Integer count, Integer code){
this.attr = attr;
this.count = count;
this.code = code;
}
private Attribution(Object attr,Integer count){ private Attribution(Object attr,Integer count){
this.attr = attr; this.attr = attr;
this.count = count; this.count = count;
...@@ -114,7 +122,11 @@ public class UrlLiveBean { ...@@ -114,7 +122,11 @@ public class UrlLiveBean {
* @return Attribution * @return Attribution
*/ */
public static Attribution of(Object attr,Integer count) { public static Attribution of(Object attr,Integer count) {
return new Attribution(attr,count); return new Attribution(attr, count);
}
public static Attribution of(Object attr,Integer count,Integer code) {
return new Attribution(attr, count, code);
} }
/** /**
...@@ -135,7 +147,11 @@ public class UrlLiveBean { ...@@ -135,7 +147,11 @@ public class UrlLiveBean {
return count; return count;
} }
public void AddCount() { public Integer getCode() {
return code;
}
public void addCount() {
count++; count++;
} }
} }
......
...@@ -51,7 +51,7 @@ public class UrlLiveCrawler { ...@@ -51,7 +51,7 @@ public class UrlLiveCrawler {
if (nonNull(url)) { if (nonNull(url)) {
try { try {
ZhiWeiTools.sleep(10); ZhiWeiTools.sleep(10);
search(counter, url, Attribution.of(url,1), callback); search(counter, url, Attribution.of(url, 1), callback);
} catch (Exception e) { } catch (Exception e) {
logger.error("搜索创建出错:", e); logger.error("搜索创建出错:", e);
} }
...@@ -68,6 +68,8 @@ public class UrlLiveCrawler { ...@@ -68,6 +68,8 @@ public class UrlLiveCrawler {
Map<String,String> headers = HeaderTool.getCommonHead(); Map<String,String> headers = HeaderTool.getCommonHead();
if(url.contains("www.toutiao.com")){ if(url.contains("www.toutiao.com")){
headers.put("referer", url); headers.put("referer", url);
}else if(url.contains("zhihu.com")) {
url = treatZhihuUrl(url);
} }
try { try {
Request request = RequestUtils.wrapGet(url, headers); Request request = RequestUtils.wrapGet(url, headers);
...@@ -76,10 +78,12 @@ public class UrlLiveCrawler { ...@@ -76,10 +78,12 @@ public class UrlLiveCrawler {
httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY).whenComplete((rs,ex) -> { httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY).whenComplete((rs,ex) -> {
try { try {
if (Objects.isNull(ex)) { if (Objects.isNull(ex)) {
if(rs.code() == 200) { if(rs.isSuccessful()) {
parseHtml(rs.body().string(), attr, callback); parseHtml(rs.body().string(), attr, callback);
}else if(rs.code() == 403){
callBack(callback, attr, -1, String.valueOf(rs.code()));
}else { }else {
callBack(callback, attr, 1,String.valueOf(rs.code())); callBack(callback, attr, 1, String.valueOf(rs.code()));
} }
} else { } else {
callBack(callback, attr, 1,"未访问成功"); callBack(callback, attr, 1,"未访问成功");
...@@ -155,7 +159,7 @@ public class UrlLiveCrawler { ...@@ -155,7 +159,7 @@ public class UrlLiveCrawler {
if (callback == null) { if (callback == null) {
logger.warn("DataCallback 对象为 null,无法保存数据"); logger.warn("DataCallback 对象为 null,无法保存数据");
} else { } else {
UrlLiveBean ulb = matchDel(html,attr,attr.getAttr().toString()); UrlLiveBean ulb = matchDel(html, attr, attr.getAttr().toString());
if(Objects.nonNull(ulb)) { if(Objects.nonNull(ulb)) {
callback.onData(ulb, attr); callback.onData(ulb, attr);
}else { }else {
...@@ -215,6 +219,10 @@ public class UrlLiveCrawler { ...@@ -215,6 +219,10 @@ public class UrlLiveCrawler {
} }
}else if(url.contains("huanqiu.com") && result.contains("www.huanqiu.com/404.html")) { }else if(url.contains("huanqiu.com") && result.contains("www.huanqiu.com/404.html")) {
title = "网页已删除"; title = "网页已删除";
}else if(url.contains("zhihu.com")) {
JSONObject resultJson = JSONObject.parseObject(result);
title = resultJson.getString("title")!=null?resultJson.getString("title"):resultJson.getString("message");
} }
//若title 为拿到 用 此方法 //若title 为拿到 用 此方法
...@@ -263,7 +271,7 @@ public class UrlLiveCrawler { ...@@ -263,7 +271,7 @@ public class UrlLiveCrawler {
,"【一点资讯】www.yidianzixun.com","错误页面","网站暂停通知","【快资讯】你的专属资讯平台" ,"【一点资讯】www.yidianzixun.com","错误页面","网站暂停通知","【快资讯】你的专属资讯平台"
,"百度新闻——全球最大的中文新闻平台","以上文章由以下机构判定为不实信息","该公众号已迁移" ,"百度新闻——全球最大的中文新闻平台","以上文章由以下机构判定为不实信息","该公众号已迁移"
,"财经网-CAIJING.COM.CN","蚂蚁资讯","参数错误","时尚头条_YOKA时尚网","该文章已经被删除" ,"财经网-CAIJING.COM.CN","蚂蚁资讯","参数错误","时尚头条_YOKA时尚网","该文章已经被删除"
,"网易","链接已过期","找不到页面","今晚网","该文章已被删除"); ,"网易","链接已过期","找不到页面","今晚网","该文章已被删除", "该回答已被删除-知乎", "资源不存在");
List<String> cList = Arrays.asList("提示信息-","此内容因违规无法查看","微信公众号不存在" List<String> cList = Arrays.asList("提示信息-","此内容因违规无法查看","微信公众号不存在"
,"此内容被投诉且经审核涉嫌侵权,无法查看","thepageyourequestedwasnotfound","未知错误" ,"此内容被投诉且经审核涉嫌侵权,无法查看","thepageyourequestedwasnotfound","未知错误"
...@@ -273,210 +281,26 @@ public class UrlLiveCrawler { ...@@ -273,210 +281,26 @@ public class UrlLiveCrawler {
return cList.stream().anyMatch(title::contains) || eList.stream().anyMatch(title::equals); return cList.stream().anyMatch(title::contains) || eList.stream().anyMatch(title::equals);
} }
// /**
// * /**
// * ( 微信谣言的无效网址筛选规则) * 处理知乎链接
// * @author 陈炜涛 *
// * @param doc * */
// * @return private static String treatZhihuUrl(String url) {
// * @time 2016年6月3日上午9:54:00 if(url.contains("/answer/")) {
// * @return boolean url = "https://api.zhihu.com/answers/" + url.replaceAll(".*/answer/", "");
// */ }else if(url.contains("/question/") && !url.contains("/answer/")) {
// private boolean rulerYaoyan(Document doc){ url = "https://api.zhihu.com/questions/" + url.replaceAll(".*/question/", "");
// boolean flg = false; }else if(url.contains("/p/")) {
// if ("谣言".equals(doc.select(".pic_rumor").text())) url = "https://api.zhihu.com/articles/" + url.replaceAll(".*/p/", "");
// { }
// flg = true; return url;
// } }
// return flg;
// }
//
// /**
// *
// * ( 微信内容违规的无效网址筛选规则)
// * @author 陈炜涛
// * @param doc
// * @return
// * @time 2016年6月3日上午9:59:54
// * @return boolean
// */
// private boolean rulerWechat(Document doc)
// {
// boolean flg = false;
// if ((doc.select("h3.msg-title").text()).contains("此内容被投诉且经审核涉嫌侵权,无法查看") || (doc.select("p.title").text()).contains("此内容因违规无法查看") || doc.select("p.title").text().contains("此帐号在冻结期,内容无法查看"))
// {
// flg = true;
// }
// return flg;
// }
//
//
// /**
// *
// * ( 微信内容违规的无效网址筛选规则)
// * @author 陈炜涛
// * @param doc
// * @return
// * @time 2016年6月3日上午9:59:54
// * @return boolean
// */
// private boolean rulerTousu(Document doc)
// {
// boolean flg = false;
// if (0 < doc.select("i[class=\"icon_msg warn\"]").size())
// {
// flg = true;
// }
// return flg;
// }
//
// /**
// *
// * ( 环球的无效网址筛选规则)
// * @author 陈炜涛
// * @param doc
// * @return
// * @time 2016年6月3日上午9:59:54
// * @return boolean
// */
// private boolean rulerHuanqiuWuxiao(Document doc)
// {
// boolean flg = false;
// if (0 < doc.select("div[class=\"errMsg\"]").size())
// {
// flg = true;
// }
// return flg;
// }
//
// /**
// *
// * ( 空的无效网址筛选规则)
// * @author 陈炜涛
// * @param doc
// * @return
// * @time 2016年6月3日上午9:59:54
// * @return boolean
// */
// private boolean rulerKong(Document doc)
// {
// boolean flg = false;
// if (14 > doc.select("body").toString().length()
// &&
// 14 > doc.select("head").toString().length())
// {
// flg = true;
// }
// return flg;
// }
//
// /**
// *
// * ( 内容不存在)
// * @author 陈炜涛
// * @param doc
// * @return
// * @time 2016年6月3日上午9:59:54
// * @return boolean
// */
// private boolean rulerBucunzai(Document doc)
// {
// boolean flg = false;
// if (doc.text().contains("很抱歉,您访问的页面不存在")||doc.text().contains("该内容已被发布者删除"))
// {
// flg = true;
// }
// return flg;
// }
//
// /**
// *
// * ( 招商网的无效网址筛选规则)
// * @author 陈炜涛
// * @param doc
// * @return
// * @time 2016年6月3日上午9:59:54
// * @return boolean
// */
// private boolean rulerZhaoshang(Document doc)
// {
// boolean flg = false;
// try
// {
// if ("<a href=\"\"> </a>".equals(doc.select("div[class=\"paths\"]")
// .first().child(2).toString()))
// {
// flg = true;
// }
// }
// catch (Exception e)
// {
// e.printStackTrace();
// // TODO: handle exception
// }
//
// return flg;
// }
//
//
// /**
// *
// * ( 一点资讯的无效网址筛选规则)
// * @author 陈炜涛
// * @param doc
// * @return
// * @time 2016年6月3日上午9:59:54
// * @return boolean
// */
// private boolean rulerYidian(Document doc)
// {
// boolean flg = false;
// try
// {
// if (doc.select("div[class=\"content\"]").text().contains("文章没有找到"))
// {
// flg = true;
// }
// }
// catch (Exception e)
// {
// e.printStackTrace();
// // : handle exception
// }
// return flg;
// }
//
// /**
// * @Title: rulerHead
// * @author hero
// * @Description: 验证链接头部
// * @param @param doc
// * @param @return 设定文件
// * @return boolean 返回类型
// */
// private boolean rulerHead(Document doc)
// {
// List<Node> nodeList = doc.head().childNodes();
// try {
// for (Node node : nodeList) {
// if (node.outerHtml().contains("<title>")) {
// String title = node.toString().split("<title>")[1].split("</title>")[0];
// if(title.contains("未知错误") || title.contains("Object moved") || title.contains("404") || title.contains("页面没有找到") || title.contains("页面未找到") || title.contains("301 Moved Permanently")){
// return true;
// }
// }
// if (node.outerHtml().contains("meta")) {
// String meta = node.toString();
// if(meta.contains("公益404页面")) {
// return true;
// }
// }
// }
// } catch (Exception e) {
// e.printStackTrace();
// return false;
// }
// return false;
// }
} }
...@@ -11,6 +11,7 @@ import org.apache.logging.log4j.Logger; ...@@ -11,6 +11,7 @@ import org.apache.logging.log4j.Logger;
import com.zhiwei.common.config.GroupType; import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory; import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.source_forward.bean.UrlLiveBean; import com.zhiwei.source_forward.bean.UrlLiveBean;
import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution; import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution;
import com.zhiwei.source_forward.crawler.UrlLiveCrawler; import com.zhiwei.source_forward.crawler.UrlLiveCrawler;
...@@ -74,8 +75,7 @@ public class URLLive { ...@@ -74,8 +75,7 @@ public class URLLive {
public static void main(String[] args) { public static void main(String[] args) {
ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER); ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
List<String> urlList = new ArrayList<>(); List<String> urlList = new ArrayList<>();
urlList.add("http://www.ebrun.com/ebrungo/zb/316384.shtml"); urlList.add("https://www.zhihu.com/question/340524333");
List<UrlLiveBean> u = URLLive.verificationURLLive(urlList); List<UrlLiveBean> u = URLLive.verificationURLLive(urlList);
for(UrlLiveBean b : u) { for(UrlLiveBean b : u) {
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment