Commit 849ef5fe by yangchen

修改判断链接是否存活判断

parent 98e0d120
...@@ -74,12 +74,16 @@ public class UrlLiveCrawler { ...@@ -74,12 +74,16 @@ public class UrlLiveCrawler {
} }
} }
} else { } else {
if(attr.getCount() > 3) { if(future.cause().getMessage().contains("status code: 301")) {
callBack(callback, attr, -1); callBack(callback, attr, 1);
logger.info("{} 搜索结果访问失败: {}", request.url().url(), future.cause().getMessage());
}else { }else {
attr.AddCount(); if(attr.getCount() > 3) {
search(counter, attr.getAttr().toString(), attr, callback); callBack(callback, attr, -1);
logger.info("{} 搜索结果访问失败: {}", request.url().url(), future.cause().getMessage());
}else {
attr.AddCount();
search(counter, attr.getAttr().toString(), attr, callback);
}
} }
} }
counter.reduce(); counter.reduce();
...@@ -116,6 +120,12 @@ public class UrlLiveCrawler { ...@@ -116,6 +120,12 @@ public class UrlLiveCrawler {
logger.info("url 解析出错 {}",url); logger.info("url 解析出错 {}",url);
return url; return url;
} }
}else if(url.contains("mp.weixin.qq.com")) {
if(url.contains("https")) {
}else {
url = url.replace("http", "https");
}
} }
return url; return url;
} }
...@@ -206,6 +216,11 @@ public class UrlLiveCrawler { ...@@ -206,6 +216,11 @@ public class UrlLiveCrawler {
logger.info("{}检测规则:第{}步",url,step); logger.info("{}检测规则:第{}步",url,step);
return true; return true;
} }
step++;
if(rulerWechatWeigui(doc)) {
logger.info("{}检测规则:第{}步",url,step);
return true;
}
return false; return false;
} }
...@@ -240,13 +255,22 @@ public class UrlLiveCrawler { ...@@ -240,13 +255,22 @@ public class UrlLiveCrawler {
private boolean rulerWeigui(Document doc) private boolean rulerWeigui(Document doc)
{ {
boolean flg = false; boolean flg = false;
if ((doc.select("p.title").text()).contains("此内容因违规无法查看")) if ((doc.select("p.title").text()).contains("此内容因违规无法查看") || doc.select("p.title").text().contains("此帐号在冻结期,内容无法查看"))
{ {
flg = true; flg = true;
} }
return flg; return flg;
} }
/**
 * Rule: checks the page for the WeChat notice saying the content was
 * complained about and, after review, is suspected of infringement.
 *
 * @param doc parsed page to inspect
 * @return {@code true} when the combined text of the {@code h3.msg-title}
 *         elements contains the notice, {@code false} otherwise
 */
private boolean rulerWechatWeigui(Document doc) {
    final String headline = doc.select("h3.msg-title").text();
    return headline.contains("此内容被投诉且经审核涉嫌侵权,无法查看");
}
/** /**
* *
* ( 微信内容违规的无效网址筛选规则) * ( 微信内容违规的无效网址筛选规则)
......
...@@ -66,7 +66,7 @@ public class URLLive { ...@@ -66,7 +66,7 @@ public class URLLive {
public static void main(String[] args) { public static void main(String[] args) {
List<String> urlList = new ArrayList<>(); List<String> urlList = new ArrayList<>();
urlList.add("http://www.zyzpes.com/toutiao/5048828/20180419A1AFBC00.html"); urlList.add("http://mp.weixin.qq.com/s?__biz=MzA3MzY1NjMxMw==&mid=2652054872&idx=1&sn=d67630a6b55d0eebd353cc90242fd784&3rd=MzA3MDU4NTYzMw==&scene=6#rd");
List<UrlLiveBean> u = URLLive.verificationURLLive(urlList); List<UrlLiveBean> u = URLLive.verificationURLLive(urlList);
for(UrlLiveBean b : u) { for(UrlLiveBean b : u) {
System.out.println(b.toString()); System.out.println(b.toString());
......
package com.zhiwei.source_forward.util;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.zhiwei.source_forward.util.SourceData;
/**
 * Static helpers that index or classify data rows by the URL they carry.
 *
 * <p>Each row is a {@code Map<String,Object>} whose values are read through
 * {@code toString()}. The keys used are "来源" (source), "链接" / "地址" (URL),
 * "平台" (platform) and "备注" (remark).
 *
 * <p>Rows that are {@code null}, have no URL value, or carry a
 * {@code sh.qihoo.com} redirect link with nothing after {@code &url=} are
 * skipped by every method. Any other unexpected failure makes a method print
 * the stack trace and return {@code null}.
 */
public class ReadMediaData {

    /** Host fragment identifying links that wrap another URL. */
    private static final String QIHOO_HOST = "sh.qihoo.com";

    /** Marker after which a {@link #QIHOO_HOST} link carries the wrapped URL. */
    private static final String QIHOO_TARGET_MARKER = "&url=";

    /** URL fragments that classify a "网络媒体" row as client/app data. */
    private static final String[] APP_URL_MARKERS = {
        "baijia.baidu.com", "v.mp.uc.cn", "m.uczzd.cn", "a.mp.uc.cn",
        "share.iclient.ifeng.com", "m.ifeng.com", "kuaibao.qq.com",
        "sina.cn", "3g.163.com", "m.sohu.com"
    };

    /** URL fragments that classify a "网络媒体" row as print-media data. */
    private static final String[] PAPER_URL_MARKERS = {
        "e.xfrb.com.cn", "paper.people.com.cn", "bhsb.tjbhnews.com",
        "www.time-weekly.com", "bjrb.bjd.com.cn", "zqb.cyol.com",
        "hzdaily.hangzhou.com.cn", "shfinancialnews.com", "dz.xdkb.net",
        "njcb.xhby.net", "ctdsb.cnhubei.com", "bjwb.bjd.com.cn",
        "bjcb.morningpost.com.cn", "e.chengdu.cn"
    };

    /**
     * Indexes rows by the URL found under "链接" and registers each row's
     * source through {@code SourceData.addUserSource}.
     *
     * <p>Processing is best-effort per row: a row that cannot be used is
     * skipped, and an unexpected exception while handling one row is printed
     * and does not stop the remaining rows.
     *
     * @author hero
     * @param dataList rows to index
     * @return map from URL to its row (a later row with the same URL replaces
     *         an earlier one), or {@code null} if the list itself cannot be
     *         processed (for example when {@code dataList} is {@code null})
     */
    public static Map<String,Map<String,Object>> getUrl(List<Map<String,Object>> dataList){
        try {
            // Online-media rows keyed by URL.
            Map<String,Map<String,Object>> result = new HashMap<String,Map<String,Object>>();
            for (Map<String,Object> dataMap : dataList) {
                if (dataMap == null) {
                    continue;
                }
                try {
                    String source = readSource(dataMap);
                    String url = readUrl(dataMap, "链接");
                    if (url == null) {
                        // No usable URL: nothing to key the row by.
                        continue;
                    }
                    result.put(url, dataMap);
                    // Add the source to the user-defined source list.
                    SourceData.addUserSource(source);
                } catch (Exception e) {
                    // Keep going with the other rows, but do not hide the failure.
                    e.printStackTrace();
                }
            }
            return result;
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Splits rows into categories based on their URL and platform.
     *
     * <p>The returned map always contains these keys:
     * <ul>
     *   <li>"weibo", "app", "toutiao", "wechat", "paper" - {@code List} of rows</li>
     *   <li>"media" - {@code Map} from URL to row</li>
     * </ul>
     *
     * <p>Classification order: a URL containing {@code toutiao.com} wins over
     * everything else; otherwise the "平台" value decides, and for "网络媒体"
     * rows the URL is checked against the app and print-media fragments
     * before the row is treated as online media. Rows matching no category
     * are left out. Classified rows are modified in place ("来源" / "备注").
     *
     * @param dataList rows to classify
     * @return the category map described above, or {@code null} if an
     *         unexpected exception occurs (for example when {@code dataList}
     *         is {@code null})
     */
    public static Map<String,Object> getUrlJD(List<Map<String,Object>> dataList){
        try {
            List<Map<String,Object>> weiboList = new ArrayList<Map<String,Object>>();   // Weibo rows
            List<Map<String,Object>> appList = new ArrayList<Map<String,Object>>();     // client/app rows
            List<Map<String,Object>> wechatList = new ArrayList<Map<String,Object>>();  // WeChat rows
            List<Map<String,Object>> paperList = new ArrayList<Map<String,Object>>();   // print-media rows
            List<Map<String,Object>> toutiaoList = new ArrayList<Map<String,Object>>(); // Toutiao rows
            // Online-media rows keyed by URL.
            Map<String,Map<String,Object>> mediaList = new HashMap<String,Map<String,Object>>();

            for (Map<String,Object> dataMap : dataList) {
                if (dataMap == null) {
                    continue;
                }
                String source = readSource(dataMap);
                String url = readUrl(dataMap, "链接");
                String pt = readTrimmed(dataMap, "平台");
                if (url == null) {
                    // Without a URL the row cannot be classified; skip it
                    // instead of failing the whole batch.
                    continue;
                }

                if (url.contains("toutiao.com")) {
                    // NOTE(review): a missing source yields the text "null"
                    // here, same as in the client branch below - confirm
                    // whether that is acceptable downstream.
                    dataMap.put("来源", "今日头条" + "-" + source);
                    if ("今日头条".equals(source)) {
                        dataMap.put("备注", "修改头条名称");
                    }
                    toutiaoList.add(dataMap);
                } else if ("网络媒体".equals(pt)) {
                    if (containsAny(url, APP_URL_MARKERS)) {
                        // The URL identifies a client/app page.
                        dataMap.put("备注", "");
                        appList.add(dataMap);
                    } else if (containsAny(url, PAPER_URL_MARKERS)) {
                        // The URL identifies a print-media page.
                        dataMap.put("备注", "");
                        paperList.add(dataMap);
                    } else {
                        mediaList.put(url, dataMap);
                        // Add the source to the user-defined source list.
                        SourceData.addUserSource(source);
                    }
                } else if ("微博".equals(pt)) {
                    dataMap.put("备注", "");
                    weiboList.add(dataMap);
                } else if (pt != null && pt.contains("客户端")) {
                    dataMap.put("备注", "");
                    dataMap.put("来源", pt + "-" + source);
                    appList.add(dataMap);
                } else if ("微信公众平台".equals(pt)) {
                    dataMap.put("备注", "");
                    wechatList.add(dataMap);
                } else if ("报刊".equals(pt) || "平媒".equals(pt)) {
                    dataMap.put("备注", "");
                    paperList.add(dataMap);
                }
            }

            // Filled once, after the loop, so every key exists even when the
            // input list is empty.
            Map<String,Object> result = new HashMap<String,Object>();
            result.put("weibo", weiboList);
            result.put("media", mediaList);
            result.put("app", appList);
            result.put("toutiao", toutiaoList);
            result.put("wechat", wechatList);
            result.put("paper", paperList);
            return result;
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Indexes rows by the URL found under "地址".
     *
     * <p>URLs are used as given apart from trimming and qihoo unwrapping; no
     * scheme or {@code www} prefix is added.
     *
     * @author hero
     * @param dataList rows to index
     * @return map from URL to its row (a later row with the same URL replaces
     *         an earlier one), or {@code null} if an unexpected exception
     *         occurs (for example when {@code dataList} is {@code null})
     */
    public static Map<String,Map<String,Object>> getUrlLive(List<Map<String,Object>> dataList){
        try {
            Map<String,Map<String,Object>> result = new HashMap<String,Map<String,Object>>();
            for (Map<String,Object> dataMap : dataList) {
                if (dataMap == null) {
                    continue;
                }
                String url = readUrl(dataMap, "地址");
                if (url == null) {
                    // One row without a URL must not discard all the others.
                    continue;
                }
                result.put(url, dataMap);
            }
            return result;
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /** Returns the trimmed string form of {@code row.get(key)}, or {@code null} when the value is absent. */
    private static String readTrimmed(Map<String,Object> row, String key) {
        Object value = row.get(key);
        return value != null ? value.toString().trim() : null;
    }

    /** Returns the trimmed, upper-cased "来源" value of the row, or {@code null} when it is absent. */
    private static String readSource(Map<String,Object> row) {
        String source = readTrimmed(row, "来源");
        return source != null ? source.toUpperCase() : null;
    }

    /**
     * Reads the URL stored under {@code key} and unwraps qihoo redirect links.
     *
     * @return the trimmed URL; for a link containing both {@code sh.qihoo.com}
     *         and {@code &url=}, the segment that follows the first
     *         {@code &url=}; {@code null} when the value is absent or the
     *         redirect link has no such segment
     */
    private static String readUrl(Map<String,Object> row, String key) {
        String url = readTrimmed(row, key);
        if (url == null) {
            return null;
        }
        if (url.contains(QIHOO_HOST) && url.contains(QIHOO_TARGET_MARKER)) {
            // split() drops trailing empty strings, so a link that ends with
            // the marker yields a single element and has no target to use.
            String[] parts = url.split(QIHOO_TARGET_MARKER);
            return parts.length > 1 ? parts[1] : null;
        }
        return url;
    }

    /** Reports whether {@code text} contains at least one of the given fragments. */
    private static boolean containsAny(String text, String[] fragments) {
        for (String fragment : fragments) {
            if (text.contains(fragment)) {
                return true;
            }
        }
        return false;
    }
}
//package com.zhiwei.source_forward.sourceforward.test; //package com.zhiwei.source_forward.sourceforward.test;
// //
//import java.util.ArrayList; //import java.util.ArrayList;
//import java.util.HashMap;
//import java.util.List; //import java.util.List;
//import java.util.Map; //import java.util.Map;
//import java.util.Map.Entry; //import java.util.Map.Entry;
// //
//import org.junit.Test; //import org.testng.annotations.Test;
// //
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.source_forward.bean.UrlLiveBean;
//import com.zhiwei.source_forward.run.URLLive; //import com.zhiwei.source_forward.run.URLLive;
//import com.zhiwei.source_forward.util.ReadMediaData;
// //
///** ///**
// * @ClassName: URLLiveTest // * @ClassName: URLLiveTest
...@@ -18,25 +22,37 @@ ...@@ -18,25 +22,37 @@
//public class URLLiveTest { //public class URLLiveTest {
// //
// //
//// @Test // @Test
//// public void urlLiveTest(){ // public void urlLiveTest(){
//// String path = "E://稿件汇总网媒数据//福莱网媒.xlsx"; // String path = "D://crawlerdata//链接删除2.xlsx";
//// PoiExcelUtil poi = PoiExcelUtil.getInstance(); // PoiExcelUtil poi = PoiExcelUtil.getInstance();
//// Map<String,Object> data = poi.importExcel(path, 0); // Map<String,Object> data = poi.importExcel(path, 0);
//// @SuppressWarnings("unchecked") // @SuppressWarnings("unchecked")
//// List<String> headList = (List<String>)data.get("head"); // List<String> headList = (List<String>)data.get("head");
//// headList.add("是否删除"); // headList.add("是否删除");
//// @SuppressWarnings("unchecked") // @SuppressWarnings("unchecked")
//// List<Map<String,Object>> dataList = (List<Map<String,Object>>)data.get("body"); // List<Map<String,Object>> dataList = (List<Map<String,Object>>)data.get("body");
//// Map<String,Map<String,Object>> dataMap = ReadMediaData.getUrlLive(dataList); // List<String> uList = new ArrayList<>();
//// dataMap = URLLive.verificationURLLive(dataMap); // for(Map<String,Object> m : dataList) {
//// // uList.add(m.get("地址").toString());
//// List<Map<String,Object>> bodyList = new ArrayList<>(); // }
//// for(Entry<String,Map<String,Object>> dataEntry : dataMap.entrySet()){ // List<UrlLiveBean> lb = URLLive.verificationURLLive(uList);
//// bodyList.add(dataEntry.getValue()); //
//// } // List<Map<String,Object>> bodyList = new ArrayList<>();
//// poi.exportExcel(path ,"匹配后数据", headList, bodyList); // for(UrlLiveBean dataEntry : lb){
//// } // Map<String,Object> map = new HashMap<>();
// map.put("地址", dataEntry.getUrl());
// if(dataEntry.isLive() == 1) {
// map.put("是否删除", true);
// }else if(dataEntry.isLive() == 0) {
// map.put("是否删除", false);
// }else if(dataEntry.isLive() == -1) {
// map.put("是否删除", -1);
// }
// bodyList.add(map);
// }
// poi.exportExcel(path ,"匹配后数据", headList, bodyList);
// }
// //
// //
//} //}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment