Commit 98e0d120 by yangchen

sourceforward 链接匹配修改

parent aa2a108b
......@@ -24,12 +24,12 @@
<dependency>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.0.2-SNAPSHOT</version>
<version>0.0.5-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.zhiwei.middleware</groupId>
<artifactId>proxy-client</artifactId>
<version>0.0.1-RELEASE</version>
<version>0.0.2-RELEASE</version>
</dependency>
</dependencies>
......@@ -89,13 +89,4 @@
<dependencyManagement>
<dependencies>
<dependency>
<groupId>com.squareup.okhttp3</groupId>
<artifactId>okhttp</artifactId>
<version>3.11.0</version>
</dependency>
</dependencies>
</dependencyManagement>
</project>
\ No newline at end of file
......@@ -4,17 +4,27 @@ public class UrlLiveBean {
private String url;
private boolean isLive;
private Integer isLive;
public UrlLiveBean() {
super();
}
public UrlLiveBean(String url, boolean isLive) {
public UrlLiveBean(String url, Integer isLive) {
super();
this.url = url;
this.isLive = isLive;
}
/**
 * Creates a bean from a boolean flag, storing the flag as an integer code.
 *
 * @param url    the URL this bean describes
 * @param isLive {@code true} is stored as 1 (labelled "deleted" by the original
 *               comment); {@code false} is stored as 0
 */
public UrlLiveBean(String url, boolean isLive) {
    super();
    this.url = url;
    // 1 = deleted, 0 = otherwise
    this.isLive = isLive ? 1 : 0;
}
public String getUrl() {
return url;
......@@ -24,11 +34,11 @@ public class UrlLiveBean {
this.url = url;
}
public boolean isLive() {
public Integer isLive() {
return isLive;
}
public void setLive(boolean isLive) {
public void setLive(Integer isLive) {
this.isLive = isLive;
}
......@@ -46,6 +56,8 @@ public class UrlLiveBean {
public static class Attribution {
private Object attr;
private Integer count;
/**
* Constructor
*
......@@ -55,6 +67,17 @@ public class UrlLiveBean {
this.attr = attr;
}
/**
 * Builds an attribution that carries both an attribute value and a counter.
 *
 * @param attr  the attribute value to hold
 * @param count the initial counter value
 */
private Attribution(Object attr, Integer count) {
    this.count = count;
    this.attr = attr;
}
/**
* 创建属性
*
......@@ -66,13 +89,36 @@ public class UrlLiveBean {
}
/**
 * Creates an attribution with an explicit counter value.
 *
 * @param attr  the attribute value to hold
 * @param count the initial counter value
 * @return a new Attribution wrapping {@code attr} and {@code count}
 */
public static Attribution of(Object attr, Integer count) {
    Attribution created = new Attribution(attr, count);
    return created;
}
/**
* 获取属性
*
* @return Object
*/
public Object get() {
public Object getAttr() {
return attr;
}
/**
 * Returns the counter held by this attribution.
 *
 * @return the current counter value; NOTE(review): presumably {@code null}
 *         when the instance was created without a count - confirm
 */
public Integer getCount() {
    return this.count;
}
/**
 * Increments the counter by one.
 *
 * <p>The counter is a boxed {@code Integer} and appears to be left unset by
 * the constructor that takes only an attribute, so a missing value is treated
 * as zero here instead of failing with a NullPointerException on unboxing.
 */
public void AddCount() {
    // count + 1 is only evaluated when count is non-null, so no unboxing of null occurs.
    count = (count == null) ? 1 : count + 1;
}
}
}
......@@ -143,9 +143,9 @@ public class ContentExtractor {
content = tag;
}
}
if (content == null) {
throw new Exception("extraction failed");
}
// if (content == null) {
// throw new Exception("extraction failed");
// }
return content;
}
......@@ -164,17 +164,17 @@ public class ContentExtractor {
news.setUrl(doc.baseUri());
}
try {
news.setTime(getTime(contentElement));
} catch (Exception ex) {
LOG.info("news title extraction failed", ex);
}
try {
news.setTitle(getTitle(contentElement));
} catch (Exception ex) {
LOG.info("title extraction failed", ex);
}
// try {
// news.setTime(getTime(contentElement));
// } catch (Exception ex) {
// LOG.info("news title extraction failed", ex);
// }
// try {
// news.setTitle(getTitle(contentElement));
// } catch (Exception ex) {
// LOG.info("title extraction failed", ex);
// }
return news;
}
......
......@@ -3,23 +3,22 @@ package com.zhiwei.source_forward.crawler;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.zhiwei.crawler.async.MultiThreadingCounter;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.HttpRequestBuilder;
import com.zhiwei.source_forward.bean.ContentBean;
import com.zhiwei.source_forward.bean.ContentBean.Attribution;
import com.zhiwei.source_forward.util.ContentDataCallback;
import com.zhiwei.source_forward.util.MatchContent;
import com.zhiwei.source_forward.util.ProxyClientUtil;
import com.zhiwei.tools.httpclient.HttpBoot;
import com.zhiwei.tools.httpclient.HttpRequestBuilder;
import com.zhiwei.tools.httpclient.asyn.MultiThreadingCounter;
import okhttp3.Headers;
import okhttp3.Request;
import okhttp3.Response;
public class ContentCrawler {
private static Logger logger = LogManager.getLogger(ContentCrawler.class);
/**
*
* @Description 链接传入 并 返回采集完信号
......@@ -28,12 +27,13 @@ public class ContentCrawler {
* @return
* @throws Exception
*/
public MultiThreadingCounter submitTask(ContentDataCallback callback,String... urls) throws Exception {
public MultiThreadingCounter submitTask(ContentDataCallback callback,
String... urls) throws Exception {
MultiThreadingCounter counter = new MultiThreadingCounter();
start(counter, callback, urls);
return counter;
}
/**
*
* @Description 提交链接
......@@ -41,17 +41,15 @@ public class ContentCrawler {
* @param callback
* @param urls
*/
private void start(MultiThreadingCounter counter,ContentDataCallback callback, String... urls) {
private void start(MultiThreadingCounter counter,
ContentDataCallback callback, String... urls) {
if (urls != null && urls.length > 0) {
for (String url : urls) {
if (url != null) {
try {
counter.increase();
search(counter, url, Attribution.of(url), callback);
} catch (Exception e) {
logger.error("关键词 {} 搜索创建出错: {}", e.getMessage());
} finally {
counter.reduce();
}
}
}
......@@ -67,7 +65,8 @@ public class ContentCrawler {
* @param callback
* @return
*/
private MultiThreadingCounter search(MultiThreadingCounter counter, String url,Attribution attr, ContentDataCallback callback) {
private MultiThreadingCounter search(MultiThreadingCounter counter,
String url, Attribution attr, ContentDataCallback callback) {
logger.info("当前处理 URL: {}", url);
Request request = HttpRequestBuilder.newGetRequest(url, null);
counter.increase();
......@@ -75,22 +74,23 @@ public class ContentCrawler {
if (future.isSuccess()) {
Response response = future.result();
try {
parseHtml(response, attr, callback);
parseHtml(response, attr, callback);
} catch (Exception e) {
logger.error("解析出错", e);
}
} else {
logger.info("{} 搜索结果访问失败: {}", request.url().url(), future.cause().getMessage());
logger.info("{} 搜索结果访问失败: {}", request.url().url(),future.cause().getMessage());
}
counter.reduce();
});
return counter;
}
/**
*
*
* @Description 获取正文解析
* @Description 获取正文解析
* @param response
* @param attr
* @param callback
......@@ -99,14 +99,15 @@ public class ContentCrawler {
ContentDataCallback callback) {
String content = null;
try {
if(response.isSuccessful()){
if (response.isSuccessful()) {
String html = response.body().string();
content = MatchContent.matchContent(attr.get().toString(), html);
content = MatchContent.matchContent(attr.get().toString(),
html);
}
} catch (Exception e) {
logger.info("网页链接失效",e.fillInStackTrace());
}finally {
if(response != null) {
logger.info("网页链接失效", e.fillInStackTrace());
} finally {
if (response != null) {
response.close();
}
}
......@@ -116,7 +117,7 @@ public class ContentCrawler {
} else {
callback.onData(cb, attr);
}
}
}
......@@ -7,15 +7,15 @@ import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Node;
import com.zhiwei.crawler.async.MultiThreadingCounter;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.HttpRequestBuilder;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution;
import com.zhiwei.source_forward.util.MatchChannel;
import com.zhiwei.source_forward.util.MatchSource;
import com.zhiwei.source_forward.util.MediaSelfSourceDataCallBack;
import com.zhiwei.source_forward.util.ProxyClientUtil;
import com.zhiwei.tools.httpclient.HttpBoot;
import com.zhiwei.tools.httpclient.HttpRequestBuilder;
import com.zhiwei.tools.httpclient.asyn.MultiThreadingCounter;
import okhttp3.Request;
import okhttp3.Response;
......
......@@ -8,6 +8,9 @@ import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node;
import com.zhiwei.crawler.async.MultiThreadingCounter;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.HttpRequestBuilder;
import com.zhiwei.source_forward.bean.SourceForwardBean;
import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution;
import com.zhiwei.source_forward.util.MatchChannel;
......@@ -15,9 +18,6 @@ import com.zhiwei.source_forward.util.MatchSource;
import com.zhiwei.source_forward.util.ProxyClientUtil;
import com.zhiwei.source_forward.util.SourceData;
import com.zhiwei.source_forward.util.SourceForwardDataCallBack;
import com.zhiwei.tools.httpclient.HttpBoot;
import com.zhiwei.tools.httpclient.HttpRequestBuilder;
import com.zhiwei.tools.httpclient.asyn.MultiThreadingCounter;
import okhttp3.Request;
import okhttp3.Response;
......
package com.zhiwei.source_forward.crawler;
import java.io.IOException;
import java.util.List;
import org.apache.logging.log4j.LogManager;
......@@ -9,13 +8,13 @@ import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node;
import com.zhiwei.crawler.async.MultiThreadingCounter;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.HttpRequestBuilder;
import com.zhiwei.source_forward.bean.UrlLiveBean;
import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution;
import com.zhiwei.source_forward.util.ProxyClientUtil;
import com.zhiwei.source_forward.util.UrlLiveDataCallback;
import com.zhiwei.tools.httpclient.HttpBoot;
import com.zhiwei.tools.httpclient.HttpRequestBuilder;
import com.zhiwei.tools.httpclient.asyn.MultiThreadingCounter;
import okhttp3.Request;
import okhttp3.Response;
......@@ -43,12 +42,9 @@ public class UrlLiveCrawler {
for (String url : urls) {
if (url != null) {
try {
counter.increase();
search(counter, url, Attribution.of(url), callback);
search(counter, url, Attribution.of(url,1), callback);
} catch (Exception e) {
logger.error("关键词 {} 搜索创建出错: {}", e.getMessage());
} finally {
counter.reduce();
}
}
}
......@@ -57,6 +53,7 @@ public class UrlLiveCrawler {
private MultiThreadingCounter search(MultiThreadingCounter counter, String url,
Attribution attr, UrlLiveDataCallback callback) {
url = dealUrl(url);
logger.info("当前处理 URL: {}", url);
Request request = HttpRequestBuilder.newGetRequest(url, null);
counter.increase();
......@@ -64,36 +61,82 @@ public class UrlLiveCrawler {
if (future.isSuccess()) {
Response response = future.result();
try {
parseHtml(response, attr, callback);
if(response.code() == 200) {
parseHtml(response.body().string(), attr, callback);
}else {
callBack(callback, attr, 1);
}
} catch (Exception e) {
logger.error("解析出错", e);
}finally {
if(response != null) {
response.close();
}
}
} else {
logger.info("{} 搜索结果访问失败: {}", request.url().url(), future.cause().getMessage());
if(attr.getCount() > 3) {
callBack(callback, attr, -1);
logger.info("{} 搜索结果访问失败: {}", request.url().url(), future.cause().getMessage());
}else {
attr.AddCount();
search(counter, attr.getAttr().toString(), attr, callback);
}
}
counter.reduce();
});
return counter;
}
private void parseHtml(Response response, Attribution attr,
UrlLiveDataCallback callback) {
/***验证网页是否能够连通*/
boolean f = true;
if(!response.isSuccessful()){
/**
 * Wraps the checked URL and a status code in a UrlLiveBean and hands it to the callback.
 *
 * @param callback receiver of the result; when {@code null} a warning is logged
 *                 and the result is dropped
 * @param attr     attribution whose attribute value supplies the URL string
 * @param i        status code stored in the bean
 */
private void callBack(UrlLiveDataCallback callback, Attribution attr, int i) {
    // The bean is built before the null check, exactly as before, so a bad attr fails first.
    String checkedUrl = attr.getAttr().toString();
    UrlLiveBean result = new UrlLiveBean(checkedUrl, i);
    if (callback != null) {
        callback.onData(result, attr);
        return;
    }
    logger.warn("DataCallback 对象为 null,无法保存数据");
}
private String dealUrl(String url) {
if(url.contains("toutiao.com")) {
try {
f = matchDel(response.body().string(),attr.get().toString());
} catch (IOException e) {
logger.info("数据判断出错 {}",e.getMessage());
}finally {
if(response != null) {
response.close();
if(url.contains("www.toutiao.com")) {
}else {
url = url.replace("toutiao.com", "www.toutiao.com");
}
if(url.contains("https")) {
}else {
url = url.replace("http", "https");
}
if(url.contains("group")) {
url = "https://www.toutiao.com/a" + url.split("/")[4] + "/";
}
} catch (Exception e) {
logger.info("url 解析出错 {}",url);
return url;
}
}else{
f = false;
}
UrlLiveBean ulb = new UrlLiveBean(attr.get().toString(), f);
return url;
}
/**
*
* @Description 判断是否删除
* @param html
* @param attr
* @param callback
*/
private void parseHtml(String html, Attribution attr,
UrlLiveDataCallback callback) {
/***验证网页是否能够连通*/
boolean f = true;
try {
f = matchDel(html,attr.getAttr().toString());
} catch (Exception e) {
logger.info("数据判断出错 {}",e.getMessage());
}
UrlLiveBean ulb = new UrlLiveBean(attr.getAttr().toString(), f);
if (callback == null) {
logger.warn("DataCallback 对象为 null,无法保存数据");
} else {
......@@ -123,12 +166,6 @@ public class UrlLiveCrawler {
return true;
}
step++;
if (rulerWeigui(doc))
{
logger.info("{}检测规则:第{}步",url,step);
return true;
}
step++;
if (rulerTousu(doc))
{
logger.info("{}检测规则:第{}步",url,step);
......@@ -158,6 +195,11 @@ public class UrlLiveCrawler {
logger.info("{}检测规则:第{}步",url,step);
return true;
}
step++;//10
if(rulerWeigui(doc)) {
logger.info("{}检测规则:第{}步",url,step);
return true;
}
step++;//11
if (rulerYidian(doc))
{
......@@ -169,7 +211,7 @@ public class UrlLiveCrawler {
/**
*
* @TODO(TODO 微信谣言的无效网址筛选规则)
* ( 微信谣言的无效网址筛选规则)
* @author 陈炜涛
* @param doc
* @return
......@@ -188,7 +230,7 @@ public class UrlLiveCrawler {
/**
*
* @TODO(TODO 微信内容违规的无效网址筛选规则)
* ( 微信内容违规的无效网址筛选规则)
* @author 陈炜涛
* @param doc
* @return
......@@ -198,8 +240,7 @@ public class UrlLiveCrawler {
private boolean rulerWeigui(Document doc)
{
boolean flg = false;
if ("此内容因违规无法查看".equals(doc.select(".text_area > p:nth-child(1)")
.text()))
if ((doc.select("p.title").text()).contains("此内容因违规无法查看"))
{
flg = true;
}
......@@ -208,7 +249,7 @@ public class UrlLiveCrawler {
/**
*
* @TODO(TODO 微信内容违规的无效网址筛选规则)
* ( 微信内容违规的无效网址筛选规则)
* @author 陈炜涛
* @param doc
* @return
......@@ -227,7 +268,7 @@ public class UrlLiveCrawler {
/**
*
* @TODO(TODO 环球的无效网址筛选规则)
* ( 环球的无效网址筛选规则)
* @author 陈炜涛
* @param doc
* @return
......@@ -246,7 +287,7 @@ public class UrlLiveCrawler {
/**
*
* @TODO(TODO 空的无效网址筛选规则)
* ( 空的无效网址筛选规则)
* @author 陈炜涛
* @param doc
* @return
......@@ -267,7 +308,7 @@ public class UrlLiveCrawler {
/**
*
* @TODO(TODO 内容不存在)
* ( 内容不存在)
* @author 陈炜涛
* @param doc
* @return
......@@ -286,7 +327,7 @@ public class UrlLiveCrawler {
/**
*
* @TODO(TODO 招商网的无效网址筛选规则)
* ( 招商网的无效网址筛选规则)
* @author 陈炜涛
* @param doc
* @return
......@@ -315,7 +356,7 @@ public class UrlLiveCrawler {
/**
*
* @TODO(TODO 一点资讯的无效网址筛选规则)
* ( 一点资讯的无效网址筛选规则)
* @author 陈炜涛
* @param doc
* @return
......@@ -334,7 +375,7 @@ public class UrlLiveCrawler {
}
catch (Exception e)
{
// TODO: handle exception
// : handle exception
}
return flg;
}
......@@ -354,7 +395,7 @@ public class UrlLiveCrawler {
for (Node node : nodeList) {
if (node.outerHtml().contains("<title>")) {
String title = node.toString().split("<title>")[1].split("</title>")[0];
if(title.contains("404")){
if(title.contains("未知错误") || title.contains("Object moved") || title.contains("404") || title.contains("页面没有找到") || title.contains("页面未找到") || title.contains("301 Moved Permanently")){
return true;
}
}
......
......@@ -55,14 +55,14 @@ public class ContentMatch {
return dataList;
}
// public static void main(String[] args) {
// List<String> urlList = new ArrayList<>();
// urlList.add("http://www.toutiao.com/a6571343464292680196/");
// List<ContentBean> l = getContentMatch(urlList);
// for(ContentBean cb : l) {
// System.out.println(cb.getContent());
// }
// }
/**
 * Manual smoke run: extracts the content of one sample WeChat article URL
 * and prints each extracted body to standard output.
 *
 * @param args unused
 */
public static void main(String[] args) {
    List<String> sampleUrls = new ArrayList<>();
    sampleUrls.add("https://mp.weixin.qq.com/s?src=11&timestamp=1535697915&ver=1093&signature=HNXpB8owyjfkyX-p2UDMga5R-qEpgjEpRQAjVmy7xqdrfsjZNdW0xa56dgCWMD9I*eo**yak46juxNEzryhKVLRT48DG0g9SUJSVrKSaPrhHEuJ1JOA86mSaY7TrHMMT&new=1");
    for (ContentBean bean : getContentMatch(sampleUrls)) {
        System.out.println(bean.getContent());
    }
}
static class ContentMatchCrawlerThread extends Thread{
......
......@@ -33,27 +33,46 @@ public class URLLive {
for(Entry<String,Map<String,Object>> entry : dataMap.entrySet()){
urlList.add(entry.getKey());
}
System.out.println(urlList.size());
//验证数据是否已删除
List<UrlLiveBean> dataList = UrlLiveCrawlerThread.getUrlLiveCrawle(urlList);
for(UrlLiveBean ub : dataList){
String url = ub.getUrl();
boolean live = ub.isLive();
int i = ub.isLive();
if(dataMap.containsKey(url)){
Map<String,Object> map = dataMap.get(url);
map.put("是否删除", live);
if(i == 1) {
map.put("是否删除", true);
}else if(i == 0) {
map.put("是否删除", false);
}
dataMap.put(url, map);
}
}
return dataMap;
}
/**
 * Checks a list of URLs by delegating to {@code UrlLiveCrawlerThread.getUrlLiveCrawle}.
 *
 * <p>Status codes carried by the returned beans: 1 = deleted, -1 = access failed.
 * NOTE(review): the previous comment listed 2 for "not deleted", but the
 * {@code UrlLiveBean(String, boolean)} constructor stores 0 for {@code false} -
 * confirm which value callers should expect.
 *
 * @param urlList the URLs to verify
 * @return the beans produced by the crawler for the given URLs
 */
public static List<UrlLiveBean> verificationURLLive(List<String> urlList) {
    // Start the link-validity check and hand its results straight back.
    return UrlLiveCrawlerThread.getUrlLiveCrawle(urlList);
}
/**
 * Manual smoke run: verifies one sample URL and prints each resulting bean.
 *
 * @param args unused
 */
public static void main(String[] args) {
    List<String> sampleUrls = new ArrayList<>();
    sampleUrls.add("http://www.zyzpes.com/toutiao/5048828/20180419A1AFBC00.html");
    for (UrlLiveBean bean : URLLive.verificationURLLive(sampleUrls)) {
        System.out.println(bean.toString());
    }
}
static class UrlLiveCrawlerThread extends Thread{
private static List<UrlLiveBean> getUrlLiveCrawle(List<String> urlList){
......
......@@ -38,7 +38,8 @@ public class MatchContent {
content = matchContentWeixin(document);
}else if(url.contains("toutiao.com")) {
content = matchContentToutiao(html);
}else {
}
if(content == null || content.length() < 10) {
content = mathchContent(html, document);
}
return ZhiWeiTools.delHTMLTag(content);
......@@ -71,7 +72,21 @@ public class MatchContent {
* @return
*/
private static String matchContentWeixin(Document document) {
return document.select("div.rich_media_content").text();
try {
String content = document.select("div.rich_media_content").text();
if(document.toString().contains("<script id=\"content_tpl\"")) {
Pattern pa = Pattern.compile("\\<script id=\"content_tpl(.*?)\\</script\\>");
Matcher ma = pa.matcher(document.toString());
while(ma.find()) {
return ma.group(0).replaceAll("<script id=\"content_tpl\" type=\"text/html\">", "").replaceAll("</script>", "");
}
return content;
}
return content;
} catch (Exception e) {
e.printStackTrace();
return "";
}
}
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment