Commit 98e0d120 by yangchen

sourceforward 链接匹配修改

parent aa2a108b
...@@ -24,12 +24,12 @@ ...@@ -24,12 +24,12 @@
<dependency> <dependency>
<groupId>com.zhiwei.tools</groupId> <groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId> <artifactId>zhiwei-tools</artifactId>
<version>0.0.2-SNAPSHOT</version> <version>0.0.5-SNAPSHOT</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.zhiwei.middleware</groupId> <groupId>com.zhiwei.middleware</groupId>
<artifactId>proxy-client</artifactId> <artifactId>proxy-client</artifactId>
<version>0.0.1-RELEASE</version> <version>0.0.2-RELEASE</version>
</dependency> </dependency>
</dependencies> </dependencies>
...@@ -89,13 +89,4 @@ ...@@ -89,13 +89,4 @@
<dependencyManagement>
<dependencies>
<dependency>
<groupId>com.squareup.okhttp3</groupId>
<artifactId>okhttp</artifactId>
<version>3.11.0</version>
</dependency>
</dependencies>
</dependencyManagement>
</project> </project>
\ No newline at end of file
...@@ -4,17 +4,27 @@ public class UrlLiveBean { ...@@ -4,17 +4,27 @@ public class UrlLiveBean {
private String url; private String url;
private boolean isLive; private Integer isLive;
public UrlLiveBean() { public UrlLiveBean() {
super(); super();
} }
public UrlLiveBean(String url, boolean isLive) { public UrlLiveBean(String url, Integer isLive) {
super(); super();
this.url = url; this.url = url;
this.isLive = isLive; this.isLive = isLive;
} }
/**
 * Builds a bean from a boolean flag, converting it to the integer status
 * stored in {@code isLive}.
 *
 * @param url    the URL this result belongs to
 * @param isLive {@code true} is stored as 1 (marked "deleted" by the original
 *               inline comment), {@code false} is stored as 0
 */
public UrlLiveBean(String url, boolean isLive) {
    super();
    this.url = url;
    // 1 = deleted, 0 = otherwise
    this.isLive = isLive ? 1 : 0;
}
public String getUrl() { public String getUrl() {
return url; return url;
...@@ -24,11 +34,11 @@ public class UrlLiveBean { ...@@ -24,11 +34,11 @@ public class UrlLiveBean {
this.url = url; this.url = url;
} }
public boolean isLive() { public Integer isLive() {
return isLive; return isLive;
} }
public void setLive(boolean isLive) { public void setLive(Integer isLive) {
this.isLive = isLive; this.isLive = isLive;
} }
...@@ -46,6 +56,8 @@ public class UrlLiveBean { ...@@ -46,6 +56,8 @@ public class UrlLiveBean {
public static class Attribution { public static class Attribution {
private Object attr; private Object attr;
private Integer count;
/** /**
* Constructor * Constructor
* *
...@@ -55,6 +67,17 @@ public class UrlLiveBean { ...@@ -55,6 +67,17 @@ public class UrlLiveBean {
this.attr = attr; this.attr = attr;
} }
/**
* Creates an attribution that carries both an attribute value and a count.
*
* @param attr  value stored as the attribute
* @param count value stored as the count
*/
private Attribution(Object attr,Integer count){
this.attr = attr;
this.count = count;
}
/** /**
* 创建属性 * 创建属性
* *
...@@ -66,13 +89,36 @@ public class UrlLiveBean { ...@@ -66,13 +89,36 @@ public class UrlLiveBean {
} }
/** /**
* 创建属性
*
* @param attr
* @return Attribution
*/
public static Attribution of(Object attr,Integer count) {
return new Attribution(attr,count);
}
/**
* 获取属性 * 获取属性
* *
* @return Object * @return Object
*/ */
public Object get() { public Object getAttr() {
return attr; return attr;
} }
/**
* Returns the count stored in this attribution.
*
* @return the count; may be {@code null} if it was never set
*/
public Integer getCount() {
return count;
}
/**
 * Increments the count by one.
 *
 * <p>{@code count} is a boxed {@code Integer} with no initializer, so it is
 * {@code null} for instances created without a count. A {@code null} count is
 * treated as zero here instead of failing on auto-unboxing.
 */
public void AddCount() {
    count = (count == null) ? 1 : count + 1;
}
} }
} }
...@@ -143,9 +143,9 @@ public class ContentExtractor { ...@@ -143,9 +143,9 @@ public class ContentExtractor {
content = tag; content = tag;
} }
} }
if (content == null) { // if (content == null) {
throw new Exception("extraction failed"); // throw new Exception("extraction failed");
} // }
return content; return content;
} }
...@@ -164,17 +164,17 @@ public class ContentExtractor { ...@@ -164,17 +164,17 @@ public class ContentExtractor {
news.setUrl(doc.baseUri()); news.setUrl(doc.baseUri());
} }
try { // try {
news.setTime(getTime(contentElement)); // news.setTime(getTime(contentElement));
} catch (Exception ex) { // } catch (Exception ex) {
LOG.info("news title extraction failed", ex); // LOG.info("news title extraction failed", ex);
} // }
try { // try {
news.setTitle(getTitle(contentElement)); // news.setTitle(getTitle(contentElement));
} catch (Exception ex) { // } catch (Exception ex) {
LOG.info("title extraction failed", ex); // LOG.info("title extraction failed", ex);
} // }
return news; return news;
} }
......
...@@ -3,23 +3,22 @@ package com.zhiwei.source_forward.crawler; ...@@ -3,23 +3,22 @@ package com.zhiwei.source_forward.crawler;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
import com.zhiwei.crawler.async.MultiThreadingCounter;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.HttpRequestBuilder;
import com.zhiwei.source_forward.bean.ContentBean; import com.zhiwei.source_forward.bean.ContentBean;
import com.zhiwei.source_forward.bean.ContentBean.Attribution; import com.zhiwei.source_forward.bean.ContentBean.Attribution;
import com.zhiwei.source_forward.util.ContentDataCallback; import com.zhiwei.source_forward.util.ContentDataCallback;
import com.zhiwei.source_forward.util.MatchContent; import com.zhiwei.source_forward.util.MatchContent;
import com.zhiwei.source_forward.util.ProxyClientUtil; import com.zhiwei.source_forward.util.ProxyClientUtil;
import com.zhiwei.tools.httpclient.HttpBoot;
import com.zhiwei.tools.httpclient.HttpRequestBuilder;
import com.zhiwei.tools.httpclient.asyn.MultiThreadingCounter;
import okhttp3.Headers;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response; import okhttp3.Response;
public class ContentCrawler { public class ContentCrawler {
private static Logger logger = LogManager.getLogger(ContentCrawler.class); private static Logger logger = LogManager.getLogger(ContentCrawler.class);
/** /**
* *
* @Description 链接传入 并 返回采集完信号 * @Description 链接传入 并 返回采集完信号
...@@ -28,12 +27,13 @@ public class ContentCrawler { ...@@ -28,12 +27,13 @@ public class ContentCrawler {
* @return * @return
* @throws Exception * @throws Exception
*/ */
public MultiThreadingCounter submitTask(ContentDataCallback callback,String... urls) throws Exception { public MultiThreadingCounter submitTask(ContentDataCallback callback,
String... urls) throws Exception {
MultiThreadingCounter counter = new MultiThreadingCounter(); MultiThreadingCounter counter = new MultiThreadingCounter();
start(counter, callback, urls); start(counter, callback, urls);
return counter; return counter;
} }
/** /**
* *
* @Description 提交链接 * @Description 提交链接
...@@ -41,17 +41,15 @@ public class ContentCrawler { ...@@ -41,17 +41,15 @@ public class ContentCrawler {
* @param callback * @param callback
* @param urls * @param urls
*/ */
private void start(MultiThreadingCounter counter,ContentDataCallback callback, String... urls) { private void start(MultiThreadingCounter counter,
ContentDataCallback callback, String... urls) {
if (urls != null && urls.length > 0) { if (urls != null && urls.length > 0) {
for (String url : urls) { for (String url : urls) {
if (url != null) { if (url != null) {
try { try {
counter.increase();
search(counter, url, Attribution.of(url), callback); search(counter, url, Attribution.of(url), callback);
} catch (Exception e) { } catch (Exception e) {
logger.error("关键词 {} 搜索创建出错: {}", e.getMessage()); logger.error("关键词 {} 搜索创建出错: {}", e.getMessage());
} finally {
counter.reduce();
} }
} }
} }
...@@ -67,7 +65,8 @@ public class ContentCrawler { ...@@ -67,7 +65,8 @@ public class ContentCrawler {
* @param callback * @param callback
* @return * @return
*/ */
private MultiThreadingCounter search(MultiThreadingCounter counter, String url,Attribution attr, ContentDataCallback callback) { private MultiThreadingCounter search(MultiThreadingCounter counter,
String url, Attribution attr, ContentDataCallback callback) {
logger.info("当前处理 URL: {}", url); logger.info("当前处理 URL: {}", url);
Request request = HttpRequestBuilder.newGetRequest(url, null); Request request = HttpRequestBuilder.newGetRequest(url, null);
counter.increase(); counter.increase();
...@@ -75,22 +74,23 @@ public class ContentCrawler { ...@@ -75,22 +74,23 @@ public class ContentCrawler {
if (future.isSuccess()) { if (future.isSuccess()) {
Response response = future.result(); Response response = future.result();
try { try {
parseHtml(response, attr, callback); parseHtml(response, attr, callback);
} catch (Exception e) { } catch (Exception e) {
logger.error("解析出错", e); logger.error("解析出错", e);
} }
} else { } else {
logger.info("{} 搜索结果访问失败: {}", request.url().url(), future.cause().getMessage()); logger.info("{} 搜索结果访问失败: {}", request.url().url(),future.cause().getMessage());
} }
counter.reduce(); counter.reduce();
}); });
return counter; return counter;
} }
/** /**
* *
* *
* @Description 获取正文解析 * @Description 获取正文解析
* @param response * @param response
* @param attr * @param attr
* @param callback * @param callback
...@@ -99,14 +99,15 @@ public class ContentCrawler { ...@@ -99,14 +99,15 @@ public class ContentCrawler {
ContentDataCallback callback) { ContentDataCallback callback) {
String content = null; String content = null;
try { try {
if(response.isSuccessful()){ if (response.isSuccessful()) {
String html = response.body().string(); String html = response.body().string();
content = MatchContent.matchContent(attr.get().toString(), html); content = MatchContent.matchContent(attr.get().toString(),
html);
} }
} catch (Exception e) { } catch (Exception e) {
logger.info("网页链接失效",e.fillInStackTrace()); logger.info("网页链接失效", e.fillInStackTrace());
}finally { } finally {
if(response != null) { if (response != null) {
response.close(); response.close();
} }
} }
...@@ -116,7 +117,7 @@ public class ContentCrawler { ...@@ -116,7 +117,7 @@ public class ContentCrawler {
} else { } else {
callback.onData(cb, attr); callback.onData(cb, attr);
} }
} }
} }
...@@ -7,15 +7,15 @@ import org.apache.logging.log4j.Logger; ...@@ -7,15 +7,15 @@ import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Node; import org.jsoup.nodes.Node;
import com.zhiwei.crawler.async.MultiThreadingCounter;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.HttpRequestBuilder;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean; import com.zhiwei.source_forward.bean.MediaSelfSourceBean;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution; import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution;
import com.zhiwei.source_forward.util.MatchChannel; import com.zhiwei.source_forward.util.MatchChannel;
import com.zhiwei.source_forward.util.MatchSource; import com.zhiwei.source_forward.util.MatchSource;
import com.zhiwei.source_forward.util.MediaSelfSourceDataCallBack; import com.zhiwei.source_forward.util.MediaSelfSourceDataCallBack;
import com.zhiwei.source_forward.util.ProxyClientUtil; import com.zhiwei.source_forward.util.ProxyClientUtil;
import com.zhiwei.tools.httpclient.HttpBoot;
import com.zhiwei.tools.httpclient.HttpRequestBuilder;
import com.zhiwei.tools.httpclient.asyn.MultiThreadingCounter;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response; import okhttp3.Response;
......
...@@ -8,6 +8,9 @@ import org.jsoup.Jsoup; ...@@ -8,6 +8,9 @@ import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node; import org.jsoup.nodes.Node;
import com.zhiwei.crawler.async.MultiThreadingCounter;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.HttpRequestBuilder;
import com.zhiwei.source_forward.bean.SourceForwardBean; import com.zhiwei.source_forward.bean.SourceForwardBean;
import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution; import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution;
import com.zhiwei.source_forward.util.MatchChannel; import com.zhiwei.source_forward.util.MatchChannel;
...@@ -15,9 +18,6 @@ import com.zhiwei.source_forward.util.MatchSource; ...@@ -15,9 +18,6 @@ import com.zhiwei.source_forward.util.MatchSource;
import com.zhiwei.source_forward.util.ProxyClientUtil; import com.zhiwei.source_forward.util.ProxyClientUtil;
import com.zhiwei.source_forward.util.SourceData; import com.zhiwei.source_forward.util.SourceData;
import com.zhiwei.source_forward.util.SourceForwardDataCallBack; import com.zhiwei.source_forward.util.SourceForwardDataCallBack;
import com.zhiwei.tools.httpclient.HttpBoot;
import com.zhiwei.tools.httpclient.HttpRequestBuilder;
import com.zhiwei.tools.httpclient.asyn.MultiThreadingCounter;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response; import okhttp3.Response;
......
package com.zhiwei.source_forward.crawler; package com.zhiwei.source_forward.crawler;
import java.io.IOException;
import java.util.List; import java.util.List;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
...@@ -9,13 +8,13 @@ import org.jsoup.Jsoup; ...@@ -9,13 +8,13 @@ import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node; import org.jsoup.nodes.Node;
import com.zhiwei.crawler.async.MultiThreadingCounter;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.HttpRequestBuilder;
import com.zhiwei.source_forward.bean.UrlLiveBean; import com.zhiwei.source_forward.bean.UrlLiveBean;
import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution; import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution;
import com.zhiwei.source_forward.util.ProxyClientUtil; import com.zhiwei.source_forward.util.ProxyClientUtil;
import com.zhiwei.source_forward.util.UrlLiveDataCallback; import com.zhiwei.source_forward.util.UrlLiveDataCallback;
import com.zhiwei.tools.httpclient.HttpBoot;
import com.zhiwei.tools.httpclient.HttpRequestBuilder;
import com.zhiwei.tools.httpclient.asyn.MultiThreadingCounter;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response; import okhttp3.Response;
...@@ -43,12 +42,9 @@ public class UrlLiveCrawler { ...@@ -43,12 +42,9 @@ public class UrlLiveCrawler {
for (String url : urls) { for (String url : urls) {
if (url != null) { if (url != null) {
try { try {
counter.increase(); search(counter, url, Attribution.of(url,1), callback);
search(counter, url, Attribution.of(url), callback);
} catch (Exception e) { } catch (Exception e) {
logger.error("关键词 {} 搜索创建出错: {}", e.getMessage()); logger.error("关键词 {} 搜索创建出错: {}", e.getMessage());
} finally {
counter.reduce();
} }
} }
} }
...@@ -57,6 +53,7 @@ public class UrlLiveCrawler { ...@@ -57,6 +53,7 @@ public class UrlLiveCrawler {
private MultiThreadingCounter search(MultiThreadingCounter counter, String url, private MultiThreadingCounter search(MultiThreadingCounter counter, String url,
Attribution attr, UrlLiveDataCallback callback) { Attribution attr, UrlLiveDataCallback callback) {
url = dealUrl(url);
logger.info("当前处理 URL: {}", url); logger.info("当前处理 URL: {}", url);
Request request = HttpRequestBuilder.newGetRequest(url, null); Request request = HttpRequestBuilder.newGetRequest(url, null);
counter.increase(); counter.increase();
...@@ -64,36 +61,82 @@ public class UrlLiveCrawler { ...@@ -64,36 +61,82 @@ public class UrlLiveCrawler {
if (future.isSuccess()) { if (future.isSuccess()) {
Response response = future.result(); Response response = future.result();
try { try {
parseHtml(response, attr, callback); if(response.code() == 200) {
parseHtml(response.body().string(), attr, callback);
}else {
callBack(callback, attr, 1);
}
} catch (Exception e) { } catch (Exception e) {
logger.error("解析出错", e); logger.error("解析出错", e);
}finally {
if(response != null) {
response.close();
}
} }
} else { } else {
logger.info("{} 搜索结果访问失败: {}", request.url().url(), future.cause().getMessage()); if(attr.getCount() > 3) {
callBack(callback, attr, -1);
logger.info("{} 搜索结果访问失败: {}", request.url().url(), future.cause().getMessage());
}else {
attr.AddCount();
search(counter, attr.getAttr().toString(), attr, callback);
}
} }
counter.reduce(); counter.reduce();
}); });
return counter; return counter;
} }
private void parseHtml(Response response, Attribution attr, private void callBack(UrlLiveDataCallback callback,Attribution attr,int i) {
UrlLiveDataCallback callback) { UrlLiveBean ulb = new UrlLiveBean(attr.getAttr().toString(), i);
/***验证网页是否能够连通*/ if (callback == null) {
boolean f = true; logger.warn("DataCallback 对象为 null,无法保存数据");
if(!response.isSuccessful()){ } else {
callback.onData(ulb, attr);
}
}
private String dealUrl(String url) {
if(url.contains("toutiao.com")) {
try { try {
f = matchDel(response.body().string(),attr.get().toString()); if(url.contains("www.toutiao.com")) {
} catch (IOException e) {
logger.info("数据判断出错 {}",e.getMessage()); }else {
}finally { url = url.replace("toutiao.com", "www.toutiao.com");
if(response != null) {
response.close();
} }
if(url.contains("https")) {
}else {
url = url.replace("http", "https");
}
if(url.contains("group")) {
url = "https://www.toutiao.com/a" + url.split("/")[4] + "/";
}
} catch (Exception e) {
logger.info("url 解析出错 {}",url);
return url;
} }
}else{
f = false;
} }
UrlLiveBean ulb = new UrlLiveBean(attr.get().toString(), f); return url;
}
/**
*
* @Description 判断是否删除
* @param html
* @param attr
* @param callback
*/
private void parseHtml(String html, Attribution attr,
UrlLiveDataCallback callback) {
/***验证网页是否能够连通*/
boolean f = true;
try {
f = matchDel(html,attr.getAttr().toString());
} catch (Exception e) {
logger.info("数据判断出错 {}",e.getMessage());
}
UrlLiveBean ulb = new UrlLiveBean(attr.getAttr().toString(), f);
if (callback == null) { if (callback == null) {
logger.warn("DataCallback 对象为 null,无法保存数据"); logger.warn("DataCallback 对象为 null,无法保存数据");
} else { } else {
...@@ -123,12 +166,6 @@ public class UrlLiveCrawler { ...@@ -123,12 +166,6 @@ public class UrlLiveCrawler {
return true; return true;
} }
step++; step++;
if (rulerWeigui(doc))
{
logger.info("{}检测规则:第{}步",url,step);
return true;
}
step++;
if (rulerTousu(doc)) if (rulerTousu(doc))
{ {
logger.info("{}检测规则:第{}步",url,step); logger.info("{}检测规则:第{}步",url,step);
...@@ -158,6 +195,11 @@ public class UrlLiveCrawler { ...@@ -158,6 +195,11 @@ public class UrlLiveCrawler {
logger.info("{}检测规则:第{}步",url,step); logger.info("{}检测规则:第{}步",url,step);
return true; return true;
} }
step++;//10
if(rulerWeigui(doc)) {
logger.info("{}检测规则:第{}步",url,step);
return true;
}
step++;//11 step++;//11
if (rulerYidian(doc)) if (rulerYidian(doc))
{ {
...@@ -169,7 +211,7 @@ public class UrlLiveCrawler { ...@@ -169,7 +211,7 @@ public class UrlLiveCrawler {
/** /**
* *
* @TODO(TODO 微信谣言的无效网址筛选规则) * ( 微信谣言的无效网址筛选规则)
* @author 陈炜涛 * @author 陈炜涛
* @param doc * @param doc
* @return * @return
...@@ -188,7 +230,7 @@ public class UrlLiveCrawler { ...@@ -188,7 +230,7 @@ public class UrlLiveCrawler {
/** /**
* *
* @TODO(TODO 微信内容违规的无效网址筛选规则) * ( 微信内容违规的无效网址筛选规则)
* @author 陈炜涛 * @author 陈炜涛
* @param doc * @param doc
* @return * @return
...@@ -198,8 +240,7 @@ public class UrlLiveCrawler { ...@@ -198,8 +240,7 @@ public class UrlLiveCrawler {
private boolean rulerWeigui(Document doc) private boolean rulerWeigui(Document doc)
{ {
boolean flg = false; boolean flg = false;
if ("此内容因违规无法查看".equals(doc.select(".text_area > p:nth-child(1)") if ((doc.select("p.title").text()).contains("此内容因违规无法查看"))
.text()))
{ {
flg = true; flg = true;
} }
...@@ -208,7 +249,7 @@ public class UrlLiveCrawler { ...@@ -208,7 +249,7 @@ public class UrlLiveCrawler {
/** /**
* *
* @TODO(TODO 微信内容违规的无效网址筛选规则) * ( 微信内容违规的无效网址筛选规则)
* @author 陈炜涛 * @author 陈炜涛
* @param doc * @param doc
* @return * @return
...@@ -227,7 +268,7 @@ public class UrlLiveCrawler { ...@@ -227,7 +268,7 @@ public class UrlLiveCrawler {
/** /**
* *
* @TODO(TODO 环球的无效网址筛选规则) * ( 环球的无效网址筛选规则)
* @author 陈炜涛 * @author 陈炜涛
* @param doc * @param doc
* @return * @return
...@@ -246,7 +287,7 @@ public class UrlLiveCrawler { ...@@ -246,7 +287,7 @@ public class UrlLiveCrawler {
/** /**
* *
* @TODO(TODO 空的无效网址筛选规则) * ( 空的无效网址筛选规则)
* @author 陈炜涛 * @author 陈炜涛
* @param doc * @param doc
* @return * @return
...@@ -267,7 +308,7 @@ public class UrlLiveCrawler { ...@@ -267,7 +308,7 @@ public class UrlLiveCrawler {
/** /**
* *
* @TODO(TODO 内容不存在) * ( 内容不存在)
* @author 陈炜涛 * @author 陈炜涛
* @param doc * @param doc
* @return * @return
...@@ -286,7 +327,7 @@ public class UrlLiveCrawler { ...@@ -286,7 +327,7 @@ public class UrlLiveCrawler {
/** /**
* *
* @TODO(TODO 招商网的无效网址筛选规则) * ( 招商网的无效网址筛选规则)
* @author 陈炜涛 * @author 陈炜涛
* @param doc * @param doc
* @return * @return
...@@ -315,7 +356,7 @@ public class UrlLiveCrawler { ...@@ -315,7 +356,7 @@ public class UrlLiveCrawler {
/** /**
* *
* @TODO(TODO 一点资讯的无效网址筛选规则) * ( 一点资讯的无效网址筛选规则)
* @author 陈炜涛 * @author 陈炜涛
* @param doc * @param doc
* @return * @return
...@@ -334,7 +375,7 @@ public class UrlLiveCrawler { ...@@ -334,7 +375,7 @@ public class UrlLiveCrawler {
} }
catch (Exception e) catch (Exception e)
{ {
// TODO: handle exception // : handle exception
} }
return flg; return flg;
} }
...@@ -354,7 +395,7 @@ public class UrlLiveCrawler { ...@@ -354,7 +395,7 @@ public class UrlLiveCrawler {
for (Node node : nodeList) { for (Node node : nodeList) {
if (node.outerHtml().contains("<title>")) { if (node.outerHtml().contains("<title>")) {
String title = node.toString().split("<title>")[1].split("</title>")[0]; String title = node.toString().split("<title>")[1].split("</title>")[0];
if(title.contains("404")){ if(title.contains("未知错误") || title.contains("Object moved") || title.contains("404") || title.contains("页面没有找到") || title.contains("页面未找到") || title.contains("301 Moved Permanently")){
return true; return true;
} }
} }
......
...@@ -55,14 +55,14 @@ public class ContentMatch { ...@@ -55,14 +55,14 @@ public class ContentMatch {
return dataList; return dataList;
} }
// public static void main(String[] args) { public static void main(String[] args) {
// List<String> urlList = new ArrayList<>(); List<String> urlList = new ArrayList<>();
// urlList.add("http://www.toutiao.com/a6571343464292680196/"); urlList.add("https://mp.weixin.qq.com/s?src=11&timestamp=1535697915&ver=1093&signature=HNXpB8owyjfkyX-p2UDMga5R-qEpgjEpRQAjVmy7xqdrfsjZNdW0xa56dgCWMD9I*eo**yak46juxNEzryhKVLRT48DG0g9SUJSVrKSaPrhHEuJ1JOA86mSaY7TrHMMT&new=1");
// List<ContentBean> l = getContentMatch(urlList); List<ContentBean> l = getContentMatch(urlList);
// for(ContentBean cb : l) { for(ContentBean cb : l) {
// System.out.println(cb.getContent()); System.out.println(cb.getContent());
// } }
// } }
static class ContentMatchCrawlerThread extends Thread{ static class ContentMatchCrawlerThread extends Thread{
......
...@@ -33,27 +33,46 @@ public class URLLive { ...@@ -33,27 +33,46 @@ public class URLLive {
for(Entry<String,Map<String,Object>> entry : dataMap.entrySet()){ for(Entry<String,Map<String,Object>> entry : dataMap.entrySet()){
urlList.add(entry.getKey()); urlList.add(entry.getKey());
} }
System.out.println(urlList.size());
//验证数据是否已删除 //验证数据是否已删除
List<UrlLiveBean> dataList = UrlLiveCrawlerThread.getUrlLiveCrawle(urlList); List<UrlLiveBean> dataList = UrlLiveCrawlerThread.getUrlLiveCrawle(urlList);
for(UrlLiveBean ub : dataList){ for(UrlLiveBean ub : dataList){
String url = ub.getUrl(); String url = ub.getUrl();
boolean live = ub.isLive(); int i = ub.isLive();
if(dataMap.containsKey(url)){ if(dataMap.containsKey(url)){
Map<String,Object> map = dataMap.get(url); Map<String,Object> map = dataMap.get(url);
map.put("是否删除", live); if(i == 1) {
map.put("是否删除", true);
}else if(i == 0) {
map.put("是否删除", false);
}
dataMap.put(url, map); dataMap.put(url, map);
} }
} }
return dataMap; return dataMap;
} }
/**
*
* @Description (TODO这里用一句话描述这个方法的作用)
* @param urlList
* @return list of UrlLiveBean; isLive is 1 = deleted, 0 = not deleted, -1 = request failed
*/
public static List<UrlLiveBean> verificationURLLive(List<String> urlList){ public static List<UrlLiveBean> verificationURLLive(List<String> urlList){
//启动验证链接是否有效程序程序 //启动验证链接是否有效程序程序
List<UrlLiveBean> dataList = UrlLiveCrawlerThread.getUrlLiveCrawle(urlList); List<UrlLiveBean> dataList = UrlLiveCrawlerThread.getUrlLiveCrawle(urlList);
return dataList; return dataList;
} }
/**
 * Manual smoke test: checks a single hard-coded URL and prints each
 * resulting bean to standard output.
 *
 * @param args unused
 */
public static void main(String[] args) {
    List<String> candidates = new ArrayList<>();
    candidates.add("http://www.zyzpes.com/toutiao/5048828/20180419A1AFBC00.html");
    for (UrlLiveBean bean : URLLive.verificationURLLive(candidates)) {
        System.out.println(bean.toString());
    }
}
static class UrlLiveCrawlerThread extends Thread{ static class UrlLiveCrawlerThread extends Thread{
private static List<UrlLiveBean> getUrlLiveCrawle(List<String> urlList){ private static List<UrlLiveBean> getUrlLiveCrawle(List<String> urlList){
......
...@@ -38,7 +38,8 @@ public class MatchContent { ...@@ -38,7 +38,8 @@ public class MatchContent {
content = matchContentWeixin(document); content = matchContentWeixin(document);
}else if(url.contains("toutiao.com")) { }else if(url.contains("toutiao.com")) {
content = matchContentToutiao(html); content = matchContentToutiao(html);
}else { }
if(content == null || content.length() < 10) {
content = mathchContent(html, document); content = mathchContent(html, document);
} }
return ZhiWeiTools.delHTMLTag(content); return ZhiWeiTools.delHTMLTag(content);
...@@ -71,7 +72,21 @@ public class MatchContent { ...@@ -71,7 +72,21 @@ public class MatchContent {
* @return * @return
*/ */
private static String matchContentWeixin(Document document) { private static String matchContentWeixin(Document document) {
return document.select("div.rich_media_content").text(); try {
String content = document.select("div.rich_media_content").text();
if(document.toString().contains("<script id=\"content_tpl\"")) {
Pattern pa = Pattern.compile("\\<script id=\"content_tpl(.*?)\\</script\\>");
Matcher ma = pa.matcher(document.toString());
while(ma.find()) {
return ma.group(0).replaceAll("<script id=\"content_tpl\" type=\"text/html\">", "").replaceAll("</script>", "");
}
return content;
}
return content;
} catch (Exception e) {
e.printStackTrace();
return "";
}
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment