Commit 19bb2414 by yangchen

修改ok初版提交

parent 76581f38
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
<modelVersion>4.0.0</modelVersion> xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<groupId>com.zhiwei</groupId> <modelVersion>4.0.0</modelVersion>
<artifactId>source_forward</artifactId> <groupId>com.zhiwei</groupId>
<version>0.0.2-SNAPSHOT</version> <artifactId>source-forward</artifactId>
<name>source_forward</name> <version>0.0.3-SNAPSHOT</version>
<description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description> <name>source-forward</name>
<description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
</properties>
<developers> <properties>
<developer> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<id>Bewilder</id> <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<name>zhiwei zhang</name> </properties>
<email>zhangzhiwei@zhiweidata.com</email>
</developer>
</developers>
<dependencies>
<dependency>
<groupId>cn.edu.hfut.dmic.webcollector</groupId>
<artifactId>WebCollector</artifactId>
<version>2.71</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.6.1</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.6.1</version>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-saxon</artifactId>
<version>0.6.1</version>
</dependency>
<dependency>
<groupId>com.zhiwei</groupId>
<artifactId>zhiweiTools</artifactId>
<version>0.0.6-SNAPSHOT</version>
</dependency>
</dependencies>
<!-- 打包管理 -->
<build>
<plugins>
<!-- 发布源码 -->
<plugin>
<artifactId>maven-source-plugin</artifactId>
<version>2.4</version>
<configuration>
<attach>true</attach>
</configuration>
<executions>
<execution>
<phase>compile</phase>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>2.10.4</version>
</plugin>
<!-- 解决maven test命令时console出现中文乱码乱码 --> <developers>
<plugin> <developer>
<groupId>org.apache.maven.plugins</groupId> <id>Bewilder</id>
<artifactId>maven-surefire-plugin</artifactId> <name>zhiwei zhang</name>
<version>2.19.1</version> <email>zhangzhiwei@zhiweidata.com</email>
<configuration> </developer>
<forkMode>once</forkMode> </developers>
<argLine>-Dfile.encoding=UTF-8</argLine>
</configuration>
</plugin>
</plugins>
</build>
<dependencies>
<dependency>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.0.2-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.zhiwei.middleware</groupId>
<artifactId>proxy-client</artifactId>
<version>0.0.1-RELEASE</version>
</dependency>
</dependencies>
<!-- 分发管理:管理distribution和supporting files --> <!-- 打包管理 -->
<distributionManagement> <build>
<snapshotRepository> <plugins>
<id>nexus-releases</id> <!-- 发布源码 -->
<name>User Porject Snapshot</name> <plugin>
<url>http://192.168.0.30:8081/nexus/content/repositories/snapshots/</url> <artifactId>maven-source-plugin</artifactId>
<uniqueVersion>true</uniqueVersion> <version>2.4</version>
</snapshotRepository> <configuration>
<repository> <attach>true</attach>
<id>nexus-releases</id> </configuration>
<name>User Porject Release</name> <executions>
<url>http://192.168.0.30:8081/nexus/content/repositories/releases/</url> <execution>
</repository> <phase>compile</phase>
</distributionManagement> <goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>2.10.4</version>
</plugin>
<!-- Prevent garbled Chinese console output when running `mvn test` -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.19.1</version>
<configuration>
<forkMode>once</forkMode>
<argLine>-Dfile.encoding=UTF-8</argLine>
</configuration>
</plugin>
</plugins>
</build>
<!-- 分发管理:管理distribution和supporting files -->
<distributionManagement>
<snapshotRepository>
<id>nexus-releases</id>
<name>User Project Snapshot</name>
<url>http://192.168.0.30:8081/nexus/content/repositories/snapshots/</url>
<uniqueVersion>true</uniqueVersion>
</snapshotRepository>
<repository>
<id>nexus-releases</id>
<name>User Project Release</name>
<url>http://192.168.0.30:8081/nexus/content/repositories/releases/</url>
</repository>
</distributionManagement>
<dependencyManagement>
<dependencies>
<dependency>
<groupId>com.squareup.okhttp3</groupId>
<artifactId>okhttp</artifactId>
<version>3.11.0</version>
</dependency>
</dependencies>
</dependencyManagement>
</project> </project>
\ No newline at end of file
package com.zhiwei.source_forward.bean;
/**
 * Mutable value holder that pairs a URL with the text content associated with it.
 */
public class ContentBean {

    private String url;
    private String content;

    /** Creates a bean whose fields are all {@code null}. */
    public ContentBean() {
    }

    /**
     * Creates a fully populated bean.
     *
     * @param url     the URL
     * @param content the content stored for that URL
     */
    public ContentBean(String url, String content) {
        this.url = url;
        this.content = content;
    }

    /** @return the stored URL, possibly {@code null} */
    public String getUrl() {
        return this.url;
    }

    /** @param url the URL to store */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the stored content, possibly {@code null} */
    public String getContent() {
        return this.content;
    }

    /** @param content the content to store */
    public void setContent(String content) {
        this.content = content;
    }

    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("ContentBean [url=");
        text.append(url);
        text.append(", content=");
        text.append(content);
        text.append("]");
        return text.toString();
    }

    /**
     * Wrapper around a single arbitrary attribute object.
     *
     * <p>Instances are obtained through {@link #of(Object)}; the constructor is private.
     *
     * @author 0xff
     */
    public static class Attribution {

        private Object attr;

        /**
         * @param attr the object to wrap
         */
        private Attribution(Object attr) {
            this.attr = attr;
        }

        /**
         * Wraps the given object in a new {@code Attribution}.
         *
         * @param attr the object to wrap
         * @return a new wrapper holding {@code attr}
         */
        public static Attribution of(Object attr) {
            Attribution wrapper = new Attribution(attr);
            return wrapper;
        }

        /**
         * Returns the wrapped object.
         *
         * @return the object passed to {@link #of(Object)}
         */
        public Object get() {
            return this.attr;
        }
    }
}
package com.zhiwei.source_forward.bean;
/**
 * Mutable value holder carrying a URL together with its source and channel strings.
 */
public class MediaSelfSourceBean {

    private String url;
    private String source;
    private String channel;

    /** Creates a bean whose fields are all {@code null}. */
    public MediaSelfSourceBean() {
    }

    /**
     * Creates a fully populated bean.
     *
     * @param url     the URL
     * @param source  the source value stored for that URL
     * @param channel the channel value stored for that URL
     */
    public MediaSelfSourceBean(String url, String source, String channel) {
        this.url = url;
        this.source = source;
        this.channel = channel;
    }

    /** @return the stored URL, possibly {@code null} */
    public String getUrl() {
        return this.url;
    }

    /** @param url the URL to store */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the stored source, possibly {@code null} */
    public String getSource() {
        return this.source;
    }

    /** @param source the source to store */
    public void setSource(String source) {
        this.source = source;
    }

    /** @return the stored channel, possibly {@code null} */
    public String getChannel() {
        return this.channel;
    }

    /** @param channel the channel to store */
    public void setChannel(String channel) {
        this.channel = channel;
    }

    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("MediaSelfSourceBean [url=");
        text.append(url);
        text.append(", source=");
        text.append(source);
        text.append(", channel=");
        text.append(channel);
        text.append("]");
        return text.toString();
    }

    /**
     * Wrapper around a single arbitrary attribute object.
     *
     * <p>Instances are obtained through {@link #of(Object)}; the constructor is private.
     *
     * @author 0xff
     */
    public static class Attribution {

        private Object attr;

        /**
         * @param attr the object to wrap
         */
        private Attribution(Object attr) {
            this.attr = attr;
        }

        /**
         * Wraps the given object in a new {@code Attribution}.
         *
         * @param attr the object to wrap
         * @return a new wrapper holding {@code attr}
         */
        public static Attribution of(Object attr) {
            Attribution wrapper = new Attribution(attr);
            return wrapper;
        }

        /**
         * Returns the wrapped object.
         *
         * @return the object passed to {@link #of(Object)}
         */
        public Object get() {
            return this.attr;
        }
    }
}
package com.zhiwei.source_forward.bean;
/**
 * Mutable value holder describing one URL: its channel, its root source and an
 * "is forward" marker string.
 */
public class SourceForwardBean {

    private String url;
    private String channel;
    private String root_source;
    private String isforward;

    /** Creates a bean whose fields are all {@code null}. */
    public SourceForwardBean() {
        super();
    }

    /**
     * Creates a fully populated bean.
     *
     * @param url         the URL
     * @param channel     the channel value stored for that URL
     * @param root_source the root source value stored for that URL
     * @param isforward   the forward marker stored for that URL
     */
    public SourceForwardBean(String url, String channel, String root_source,
            String isforward) {
        super();
        this.url = url;
        this.channel = channel;
        this.root_source = root_source;
        this.isforward = isforward;
    }

    /** @return the stored forward marker, possibly {@code null} */
    public String getIsforward() {
        return isforward;
    }

    /** @param isforward the forward marker to store */
    public void setIsforward(String isforward) {
        this.isforward = isforward;
    }

    /** @return the stored URL, possibly {@code null} */
    public String getUrl() {
        return url;
    }

    /** @param url the URL to store */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the stored channel, possibly {@code null} */
    public String getChannel() {
        return channel;
    }

    /** @param channel the channel to store */
    public void setChannel(String channel) {
        this.channel = channel;
    }

    /** @return the stored root source, possibly {@code null} */
    public String getRoot_source() {
        return root_source;
    }

    /** @param root_source the root source to store */
    public void setRoot_source(String root_source) {
        this.root_source = root_source;
    }

    /**
     * Renders every field of the bean, including {@code isforward}, so that log output
     * reflects the complete state.
     */
    @Override
    public String toString() {
        return "SourceForwardBean [url=" + url + ", channel=" + channel
                + ", root_source=" + root_source + ", isforward=" + isforward + "]";
    }

    /**
     * Wrapper around a single arbitrary attribute object.
     *
     * <p>Instances are obtained through {@link #of(Object)}; the constructor is private.
     *
     * @author 0xff
     */
    public static class Attribution {

        private Object attr;

        /**
         * @param attr the object to wrap
         */
        private Attribution(Object attr) {
            this.attr = attr;
        }

        /**
         * Wraps the given object in a new {@code Attribution}.
         *
         * @param attr the object to wrap
         * @return a new wrapper holding {@code attr}
         */
        public static Attribution of(Object attr) {
            return new Attribution(attr);
        }

        /**
         * Returns the wrapped object.
         *
         * @return the object passed to {@link #of(Object)}
         */
        public Object get() {
            return attr;
        }
    }
}
package com.zhiwei.source_forward.bean;
/**
 * Mutable value holder that pairs a URL with a boolean "live" flag.
 */
public class UrlLiveBean {

    private String url;
    private boolean live;

    /** Creates a bean with a {@code null} URL and the flag set to {@code false}. */
    public UrlLiveBean() {
    }

    /**
     * Creates a fully populated bean.
     *
     * @param url    the URL
     * @param isLive the flag stored for that URL
     */
    public UrlLiveBean(String url, boolean isLive) {
        this.url = url;
        this.live = isLive;
    }

    /** @return the stored URL, possibly {@code null} */
    public String getUrl() {
        return this.url;
    }

    /** @param url the URL to store */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the stored flag */
    public boolean isLive() {
        return this.live;
    }

    /** @param isLive the flag to store */
    public void setLive(boolean isLive) {
        this.live = isLive;
    }

    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("UrlLiveBean [url=");
        text.append(url);
        text.append(", isLive=");
        text.append(live);
        text.append("]");
        return text.toString();
    }

    /**
     * Wrapper around a single arbitrary attribute object.
     *
     * <p>Instances are obtained through {@link #of(Object)}; the constructor is private.
     *
     * @author 0xff
     */
    public static class Attribution {

        private Object attr;

        /**
         * @param attr the object to wrap
         */
        private Attribution(Object attr) {
            this.attr = attr;
        }

        /**
         * Wraps the given object in a new {@code Attribution}.
         *
         * @param attr the object to wrap
         * @return a new wrapper holding {@code attr}
         */
        public static Attribution of(Object attr) {
            Attribution wrapper = new Attribution(attr);
            return wrapper;
        }

        /**
         * Returns the wrapped object.
         *
         * @return the object passed to {@link #of(Object)}
         */
        public Object get() {
            return this.attr;
        }
    }
}
package com.zhiwei.source_forward.content;
import org.jsoup.nodes.Element;
/**
*
* @author hu
*/
/**
 * Holder for one news article: URL, title, time, plain-text content and the jsoup element
 * the content came from.
 *
 * @author hu
 */
public class News {

    protected String url;
    protected String title;
    protected String content;
    protected String time;
    protected Element contentElement;

    /** @return the article URL, possibly {@code null} */
    public String getUrl() {
        return this.url;
    }

    /** @param url the article URL */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the article title, possibly {@code null} */
    public String getTitle() {
        return this.title;
    }

    /** @param title the article title */
    public void setTitle(String title) {
        this.title = title;
    }

    /**
     * Returns the article text. When no text has been set yet but a content element is
     * available, the element's text is computed once, stored, and returned.
     *
     * @return the article text, or {@code null} when neither text nor element is set
     */
    public String getContent() {
        boolean needsDerivation = this.content == null && this.contentElement != null;
        if (needsDerivation) {
            this.content = this.contentElement.text();
        }
        return this.content;
    }

    /** @param content the article text */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the article time string, possibly {@code null} */
    public String getTime() {
        return this.time;
    }

    /** @param time the article time string */
    public void setTime(String time) {
        this.time = time;
    }

    /** @return the element holding the article content, possibly {@code null} */
    public Element getContentElement() {
        return this.contentElement;
    }

    /** @param contentElement the element holding the article content */
    public void setContentElement(Element contentElement) {
        this.contentElement = contentElement;
    }

    @Override
    public String toString() {
        StringBuilder text = new StringBuilder();
        text.append("URL:\n").append(url);
        text.append("\nTITLE:\n").append(title);
        text.append("\nTIME:\n").append(time);
        text.append("\nCONTENT:\n").append(getContent());
        text.append("\nCONTENT(SOURCE):\n").append(contentElement);
        return text.toString();
    }
}
package com.zhiwei.source_forward.crawler;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.zhiwei.source_forward.bean.ContentBean;
import com.zhiwei.source_forward.bean.ContentBean.Attribution;
import com.zhiwei.source_forward.util.ContentDataCallback;
import com.zhiwei.source_forward.util.MatchContent;
import com.zhiwei.source_forward.util.ProxyClientUtil;
import com.zhiwei.tools.httpclient.HttpBoot;
import com.zhiwei.tools.httpclient.HttpRequestBuilder;
import com.zhiwei.tools.httpclient.asyn.MultiThreadingCounter;
import okhttp3.Request;
import okhttp3.Response;
/**
 * Downloads pages asynchronously and passes the article text extracted from each page to a
 * {@link ContentDataCallback}.
 */
public class ContentCrawler {

    private static final Logger logger = LogManager.getLogger(ContentCrawler.class);

    /**
     * Submits the given URLs for crawling and returns the counter that tracks the
     * outstanding requests.
     *
     * @param callback receiver of one {@link ContentBean} per successfully fetched URL
     * @param urls     the URLs to crawl; {@code null} entries are skipped
     * @return the counter tracking the in-flight requests
     * @throws Exception declared for callers; nothing in this method throws a checked exception
     */
    public MultiThreadingCounter submitTask(ContentDataCallback callback, String... urls) throws Exception {
        MultiThreadingCounter counter = new MultiThreadingCounter();
        start(counter, callback, urls);
        return counter;
    }

    /**
     * Starts one asynchronous request per non-null URL.
     *
     * @param counter  the counter tracking in-flight work
     * @param callback receiver of the extracted data
     * @param urls     the URLs to crawl
     */
    private void start(MultiThreadingCounter counter, ContentDataCallback callback, String... urls) {
        if (urls == null || urls.length == 0) {
            return;
        }
        for (String url : urls) {
            if (url == null) {
                continue;
            }
            try {
                // Balanced by the reduce() in finally; the request itself is counted in search().
                counter.increase();
                search(counter, url, Attribution.of(url), callback);
            } catch (Exception e) {
                // Both placeholders are filled and the throwable is passed so the stack trace is kept.
                logger.error("关键词 {} 搜索创建出错: {}", url, e.getMessage(), e);
            } finally {
                counter.reduce();
            }
        }
    }

    /**
     * Issues an asynchronous GET for {@code url} and parses the response in the listener.
     *
     * @param counter  the counter tracking in-flight work; raised before the call and
     *                 lowered when the listener finishes
     * @param url      the URL to fetch
     * @param attr     attribute wrapper forwarded to the parser and the callback
     * @param callback receiver of the extracted data
     * @return the same {@code counter} instance
     */
    private MultiThreadingCounter search(MultiThreadingCounter counter, String url, Attribution attr,
            ContentDataCallback callback) {
        logger.info("当前处理 URL: {}", url);
        Request request = HttpRequestBuilder.newGetRequest(url, null);
        counter.increase();
        HttpBoot.asyncCall(request, ProxyClientUtil.getNATProxy(), false).addListeners(future -> {
            try {
                if (future.isSuccess()) {
                    Response response = future.result();
                    try {
                        parseHtml(response, attr, callback);
                    } catch (Exception e) {
                        logger.error("解析出错", e);
                    }
                } else {
                    logger.info("{} 搜索结果访问失败: {}", request.url().url(), future.cause().getMessage());
                }
            } finally {
                // Always release the slot, even if the failure branch itself throws.
                counter.reduce();
            }
        });
        return counter;
    }

    /**
     * Extracts the article text from a response and delivers it to the callback. The
     * response is closed before the callback runs. When the response is unsuccessful or
     * extraction fails, the bean is delivered with {@code null} content.
     *
     * @param response the HTTP response to read
     * @param attr     attribute wrapper whose value is used as the page URL
     * @param callback receiver of the resulting {@link ContentBean}; may be {@code null}
     */
    private void parseHtml(Response response, Attribution attr,
            ContentDataCallback callback) {
        String content = null;
        try {
            if (response.isSuccessful()) {
                String html = response.body().string();
                content = MatchContent.matchContent(attr.get().toString(), html);
            }
        } catch (Exception e) {
            // Log the exception itself: fillInStackTrace() would overwrite the original trace.
            logger.info("网页链接失效", e);
        } finally {
            if (response != null) {
                response.close();
            }
        }
        ContentBean cb = new ContentBean(attr.get().toString(), content);
        if (callback == null) {
            logger.warn("DataCallback 对象为 null,无法保存数据");
        } else {
            callback.onData(cb, attr);
        }
    }
}
package com.zhiwei.source_forward.crawler;
import java.util.HashMap;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.source_forward.util.MatchContent;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
/**
* @ClassName: ContentPageProcessor
* @Description: 获取文章内容
* @author hero
* @date 2018年6月30日 上午9:54:02
*/
/**
 * WebMagic page processor that extracts the article content of a page and stores it,
 * together with the page URL, under the result field {@code "content"}.
 *
 * @author hero
 */
public class ContentPageProcessor implements PageProcessor {

    private static final Logger logger = LoggerFactory.getLogger(ContentPageProcessor.class);

    private Site site = Site.me().setCycleRetryTimes(3).setSleepTime(1500)
            .setTimeOut(10000)
            .setUserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0")
            .addHeader("Accept-Encoding", "deflate, br");

    /** @return the site configuration used for every request of this processor */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Extracts the content of {@code page} unless its status code is 404. When extraction
     * throws, the content is recorded as {@code null}.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        Map<String, String> data = new HashMap<String, String>();
        String content = null;
        try {
            if (page.getStatusCode() != 404) {
                content = MatchContent.matchContent(page.getUrl().get(), page.getHtml().toString());
            }
        } catch (Exception e) {
            // Log the exception itself: fillInStackTrace() would overwrite the original trace.
            logger.info("网页链接失效", e);
        }
        data.put("url", page.getUrl().get());
        data.put("content", content);
        page.putField("content", data);
    }
}
package com.zhiwei.source_forward.crawler;
import java.util.List;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Node;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution;
import com.zhiwei.source_forward.util.MatchChannel;
import com.zhiwei.source_forward.util.MatchSource;
import com.zhiwei.source_forward.util.MediaSelfSourceDataCallBack;
import com.zhiwei.source_forward.util.ProxyClientUtil;
import com.zhiwei.tools.httpclient.HttpBoot;
import com.zhiwei.tools.httpclient.HttpRequestBuilder;
import com.zhiwei.tools.httpclient.asyn.MultiThreadingCounter;
import okhttp3.Request;
import okhttp3.Response;
/**
*
* @ClassName MediaSelfSourceCrawler
* @Description 自媒体号匹配
* @author byte-zbs
* @Date 2018年8月21日 下午3:54:03
* @version 1.0.0
*/
/**
 * Downloads pages asynchronously and passes the self-media source and channel matched for
 * each page to a {@link MediaSelfSourceDataCallBack}.
 *
 * @author byte-zbs
 * @version 1.0.0
 */
public class MediaSelfSourceCrawler {

    private static final Logger logger = LogManager.getLogger(MediaSelfSourceCrawler.class);

    /**
     * Submits the given URLs for crawling and returns the counter that tracks the
     * outstanding requests.
     *
     * @param callback receiver of one {@link MediaSelfSourceBean} per successfully fetched URL
     * @param urls     the URLs to crawl; {@code null} entries are skipped
     * @return the counter tracking the in-flight requests
     * @throws Exception declared for callers; nothing in this method throws a checked exception
     */
    public MultiThreadingCounter submitTask(MediaSelfSourceDataCallBack callback, String... urls) throws Exception {
        MultiThreadingCounter counter = new MultiThreadingCounter();
        start(counter, callback, urls);
        return counter;
    }

    /**
     * Starts one asynchronous request per non-null URL.
     *
     * @param counter  the counter tracking in-flight work
     * @param callback receiver of the extracted data
     * @param urls     the URLs to crawl
     */
    private void start(MultiThreadingCounter counter, MediaSelfSourceDataCallBack callback, String... urls) {
        if (urls == null || urls.length == 0) {
            return;
        }
        for (String url : urls) {
            if (url == null) {
                continue;
            }
            try {
                // Balanced by the reduce() in finally; the request itself is counted in search().
                counter.increase();
                search(counter, url, Attribution.of(url), callback);
            } catch (Exception e) {
                // Both placeholders are filled and the throwable is passed so the stack trace is kept.
                logger.error("关键词 {} 搜索创建出错: {}", url, e.getMessage(), e);
            } finally {
                counter.reduce();
            }
        }
    }

    /**
     * Issues an asynchronous GET for {@code url} and parses the response in the listener.
     *
     * @param counter  the counter tracking in-flight work; raised before the call and
     *                 lowered when the listener finishes
     * @param url      the URL to fetch
     * @param attr     attribute wrapper forwarded to the parser and the callback
     * @param callback receiver of the extracted data
     * @return the same {@code counter} instance
     */
    private MultiThreadingCounter search(MultiThreadingCounter counter, String url, Attribution attr,
            MediaSelfSourceDataCallBack callback) {
        logger.info("当前处理 URL: {}", url);
        Request request = HttpRequestBuilder.newGetRequest(url, null);
        counter.increase();
        HttpBoot.asyncCall(request, ProxyClientUtil.getNATProxy(), false).addListeners(future -> {
            try {
                if (future.isSuccess()) {
                    Response response = future.result();
                    try {
                        parseHtml(response, attr, callback);
                    } catch (Exception e) {
                        logger.error("解析出错", e);
                    }
                } else {
                    logger.info("{} 搜索结果访问失败: {}", request.url().url(), future.cause().getMessage());
                }
            } finally {
                // Always release the slot, even if the failure branch itself throws.
                counter.reduce();
            }
        });
        return counter;
    }

    /**
     * Matches the self-media source and the channel of a response and delivers them to the
     * callback. The response is closed before the callback runs. An empty source is
     * normalised to {@code null}; when matching throws, the source is reset to {@code null}.
     *
     * @param response the HTTP response to read
     * @param attr     attribute wrapper whose value is used as the page URL
     * @param callback receiver of the resulting {@link MediaSelfSourceBean}; may be {@code null}
     */
    private void parseHtml(Response response, Attribution attr,
            MediaSelfSourceDataCallBack callback) {
        String source = null;
        String channel = null;
        try {
            if (response.isSuccessful()) {
                String html = response.body().string();
                source = MatchSource.matchMediaSelfSource(attr.get().toString(), html);
                if (source == null || source.equals("")) {
                    source = null;
                }
                channel = MatchChannel.verifyChannel(attr.get().toString());
                if (channel == null) {
                    // Fall back to matching against the nodes of the document head.
                    List<Node> nodeList = Jsoup.parse(html).head().childNodes();
                    channel = MatchChannel.matchChannel(nodeList);
                }
            }
        } catch (Exception e) {
            // Previously swallowed silently; keep the fallback but record why it happened.
            logger.warn("解析出错", e);
            source = null;
        } finally {
            if (response != null) {
                response.close();
            }
        }
        logger.info("{}================={}", attr.get(), source);
        MediaSelfSourceBean msfb = new MediaSelfSourceBean(attr.get().toString(), source, channel);
        if (callback == null) {
            logger.warn("DataCallback 对象为 null,无法保存数据");
        } else {
            callback.onData(msfb, attr);
        }
    }
}
package com.zhiwei.source_forward.crawler;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.nodes.Node;
import com.zhiwei.source_forward.util.MatchChannel;
import com.zhiwei.source_forward.util.MatchSource;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 * WebMagic page processor that matches the self-media source and the channel of a page
 * and stores them, together with the page URL, under the result field {@code "mediaSelf"}.
 */
public class MediaSelfSourcePageProcessor implements PageProcessor {

    private Site site = Site.me()
            .setCycleRetryTimes(3)
            .setSleepTime(1500)
            .setTimeOut(10000)
            .setUserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0")
            .addHeader("Accept-Encoding", "gzip, deflate, br");

    /** @return the site configuration used for every request of this processor */
    @Override
    public Site getSite() {
        return this.site;
    }

    /**
     * Matches source and channel for {@code page} unless its status code is 404. An empty
     * source is normalised to {@code null}; when matching throws, the source is reset to
     * {@code null}. The outcome is printed to standard output and put on the page.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        String source = null;
        String channel = null;
        try {
            boolean found = page.getStatusCode() != 404;
            if (found) {
                source = MatchSource.matchMediaSelfSource(page.getUrl().get(), page.getHtml().toString());
                if ("".equals(source)) {
                    source = null;
                }
                channel = MatchChannel.verifyChannel(page.getUrl().get());
                if (channel == null) {
                    // Fall back to matching against the nodes of the document head.
                    List<Node> headNodes = page.getHtml().getDocument().head().childNodes();
                    channel = MatchChannel.matchChannel(headNodes);
                }
            }
        } catch (Exception e) {
            source = null;
        }
        System.out.println(page.getUrl().get() + "=================" + source);

        Map<String, String> result = new HashMap<String, String>();
        result.put("url", page.getUrl().get());
        result.put("mediaself", source);
        result.put("channel", channel);
        page.putField("mediaSelf", result);
    }
}
package com.zhiwei.source_forward.crawler;
import java.util.List;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node;
import com.zhiwei.source_forward.bean.SourceForwardBean;
import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution;
import com.zhiwei.source_forward.util.MatchChannel;
import com.zhiwei.source_forward.util.MatchSource;
import com.zhiwei.source_forward.util.ProxyClientUtil;
import com.zhiwei.source_forward.util.SourceData;
import com.zhiwei.source_forward.util.SourceForwardDataCallBack;
import com.zhiwei.tools.httpclient.HttpBoot;
import com.zhiwei.tools.httpclient.HttpRequestBuilder;
import com.zhiwei.tools.httpclient.asyn.MultiThreadingCounter;
import okhttp3.Request;
import okhttp3.Response;
/**
 * Downloads pages asynchronously and passes the channel, root source and forward marker
 * determined for each page to a {@link SourceForwardDataCallBack}.
 */
public class SourceForwardCrawler {

    private static final Logger logger = LogManager.getLogger(SourceForwardCrawler.class);

    /** Source list handed to {@code MatchSource.matchSource}; loaded once per class. */
    private static List<String> sourceList = SourceData.getSourceList();

    /**
     * Submits the given URLs for crawling and returns the counter that tracks the
     * outstanding requests.
     *
     * @param callback receiver of one {@link SourceForwardBean} per successfully fetched URL
     * @param urls     the URLs to crawl; {@code null} entries are skipped
     * @return the counter tracking the in-flight requests
     * @throws Exception declared for callers; nothing in this method throws a checked exception
     */
    public MultiThreadingCounter submitTask(SourceForwardDataCallBack callback, String... urls) throws Exception {
        MultiThreadingCounter counter = new MultiThreadingCounter();
        start(counter, callback, urls);
        return counter;
    }

    /**
     * Starts one asynchronous request per non-null URL.
     *
     * @param counter  the counter tracking in-flight work
     * @param callback receiver of the extracted data
     * @param urls     the URLs to crawl
     */
    private void start(MultiThreadingCounter counter, SourceForwardDataCallBack callback, String... urls) {
        if (urls == null || urls.length == 0) {
            return;
        }
        for (String url : urls) {
            if (url == null) {
                continue;
            }
            try {
                // Balanced by the reduce() in finally; the request itself is counted in search().
                counter.increase();
                search(counter, url, Attribution.of(url), callback);
            } catch (Exception e) {
                // Both placeholders are filled and the throwable is passed so the stack trace is kept.
                logger.error("关键词 {} 搜索创建出错: {}", url, e.getMessage(), e);
            } finally {
                counter.reduce();
            }
        }
    }

    /**
     * Issues an asynchronous GET for {@code url} and parses the response in the listener.
     *
     * @param counter  the counter tracking in-flight work; raised before the call and
     *                 lowered when the listener finishes
     * @param url      the URL to fetch
     * @param attr     attribute wrapper forwarded to the parser and the callback
     * @param callback receiver of the extracted data
     * @return the same {@code counter} instance
     */
    private MultiThreadingCounter search(MultiThreadingCounter counter, String url, Attribution attr,
            SourceForwardDataCallBack callback) {
        logger.info("当前处理 URL: {}", url);
        Request request = HttpRequestBuilder.newGetRequest(url, null);
        counter.increase();
        HttpBoot.asyncCall(request, ProxyClientUtil.getNATProxy(), false).addListeners(future -> {
            try {
                if (future.isSuccess()) {
                    Response response = future.result();
                    try {
                        parseHtml(response, attr, callback);
                    } catch (Exception e) {
                        logger.error("解析出错", e);
                    }
                } else {
                    logger.info("{} 搜索结果访问失败: {}", request.url().url(), future.cause().getMessage());
                }
            } finally {
                // Always release the slot, even if the failure branch itself throws.
                counter.reduce();
            }
        });
        return counter;
    }

    /**
     * Determines channel, root source and forward marker for a response and delivers them
     * to the callback. The response is closed before the callback runs.
     *
     * <p>For URLs containing {@code mp.weixin.qq.com} only the forward marker is read from
     * the page; for all other URLs the channel and the source are matched. When parsing
     * throws, the source is reset to {@code null} and the channel to its default.
     *
     * @param response the HTTP response to read
     * @param attr     attribute wrapper whose value is used as the page URL
     * @param callback receiver of the resulting {@link SourceForwardBean}; may be {@code null}
     */
    private void parseHtml(Response response, Attribution attr,
            SourceForwardDataCallBack callback) {
        String source = null;
        String channel = "新闻";
        String isforward = "未知";
        try {
            if (response.isSuccessful()) {
                Document document = Jsoup.parse(response.body().string());
                if (attr.get().toString().contains("mp.weixin.qq.com")) {
                    isforward = document.select("div#meta_content").select("span#copyright_logo").text();
                    if (!"原创".equals(isforward)) {
                        isforward = "未知";
                    }
                } else {
                    channel = MatchChannel.verifyChannel(attr.get().toString());
                    if (channel == null) {
                        // Fall back to matching against the nodes of the document head.
                        List<Node> nodeList = document.head().childNodes();
                        channel = MatchChannel.matchChannel(nodeList);
                    }
                    source = MatchSource.matchSource(attr.get().toString(), document.toString(), sourceList);
                }
            }
        } catch (Exception e) {
            // Previously swallowed silently; keep the fallback but record why it happened.
            logger.warn("解析出错", e);
            source = null;
            channel = "新闻";
        } finally {
            if (response != null) {
                response.close();
            }
        }
        logger.info("{}======={}================={}", attr.get().toString(), channel, source);
        SourceForwardBean sfb = new SourceForwardBean(attr.get().toString(), channel, source, isforward);
        if (callback == null) {
            logger.warn("DataCallback 对象为 null,无法保存数据");
        } else {
            callback.onData(sfb, attr);
        }
    }
}
package com.zhiwei.source_forward.crawler;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node;
import com.zhiwei.source_forward.util.MatchChannel;
import com.zhiwei.source_forward.util.MatchSource;
import com.zhiwei.source_forward.util.SourceData;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 * WebMagic page processor that determines channel, root source and (for WeChat article
 * URLs) the forward marker of a page and stores them under the result field
 * {@code "sourceForward"}.
 */
public class SourceForwardPageProcessor implements PageProcessor {

    /** Source list handed to {@code MatchSource.matchSource}; loaded once per class. */
    private static List<String> sourceList = SourceData.getSourceList();

    private Site site = Site.me()
            .setCycleRetryTimes(3)
            .setSleepTime(1500)
            .setTimeOut(10000)
            .addHeader("User-Agent",
                    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0")
            .addHeader("Accept",
                    "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");

    /** @return the site configuration used for every request of this processor */
    @Override
    public Site getSite() {
        return this.site;
    }

    /**
     * Processes {@code page} unless its status code is 404.
     *
     * <p>For URLs containing {@code mp.weixin.qq.com} only the {@code "isforward"} entry
     * is derived from the page; for all other URLs the channel and the source are matched.
     * When processing throws, the source is reset to {@code null} and the channel to its
     * default. The outcome is printed to standard output and put on the page.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        Map<String, String> result = new HashMap<String, String>();
        String source = null;
        String channel = "新闻";
        try {
            if (page.getStatusCode() != 404) {
                boolean weixinArticle = page.getUrl().get().contains("mp.weixin.qq.com");
                if (weixinArticle) {
                    Document document = page.getHtml().getDocument();
                    String logoText = document.select("div#meta_content").select("span#copyright_logo").text();
                    // Anything other than the exact "original" marker is reported as unknown.
                    String isforward = "原创".equals(logoText) ? logoText : "未知";
                    result.put("isforward", isforward);
                } else {
                    channel = MatchChannel.verifyChannel(page.getUrl().get());
                    if (channel == null) {
                        // Fall back to matching against the nodes of the document head.
                        List<Node> headNodes = page.getHtml().getDocument().head().childNodes();
                        channel = MatchChannel.matchChannel(headNodes);
                    }
                    source = MatchSource.matchSource(page.getUrl().get(), page.getHtml().toString(), sourceList);
                }
            }
        } catch (Exception e) {
            source = null;
            channel = "新闻";
        }
        System.out.println(page.getUrl().get() + "=======" + channel + "=================" + source);
        result.put("url", page.getUrl().get());
        result.put("channel", channel);
        result.put("root_source", source);
        page.putField("sourceForward", result);
    }
}
package com.zhiwei.source_forward.crawler; package com.zhiwei.source_forward.crawler;
import java.util.HashMap;
import java.io.IOException;
import java.util.List; import java.util.List;
import java.util.Map;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node; import org.jsoup.nodes.Node;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
public class UrlLivePageProcessor implements PageProcessor{ import com.zhiwei.source_forward.bean.UrlLiveBean;
private static Logger logger = LoggerFactory.getLogger(UrlLivePageProcessor.class); import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution;
private Site site = Site.me().setCycleRetryTimes(3).setSleepTime(1500) import com.zhiwei.source_forward.util.ProxyClientUtil;
.setTimeOut(15000) import com.zhiwei.source_forward.util.UrlLiveDataCallback;
.addHeader("User-Agent", import com.zhiwei.tools.httpclient.HttpBoot;
"Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0") import com.zhiwei.tools.httpclient.HttpRequestBuilder;
.addHeader("Accept", import com.zhiwei.tools.httpclient.asyn.MultiThreadingCounter;
"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
@Override import okhttp3.Request;
public void process(Page page) { import okhttp3.Response;
/***验证网页是否能够连通*/
boolean f = true; /**
if(page!=null){ *
if(page.getStatusCode()==200){ * @ClassName UrlLiveCrawler
f = matchDel(page); * @Description 判断页面是否存在
}else if(page.getStatusCode()==404){ * @author byte-zbs
f = true; * @Date 2018年8月20日 下午3:34:57
}else{ * @version 1.0.0
f = false; */
} public class UrlLiveCrawler {
}
private static final Logger logger = LogManager.getLogger(UrlLiveCrawler.class);
Map<String,Object> data = new HashMap<String,Object>();
data.put("url", page.getUrl().get()); public MultiThreadingCounter submitTask(UrlLiveDataCallback callback,String... urls) throws Exception {
data.put("live", f); MultiThreadingCounter counter = new MultiThreadingCounter();
page.putField("urlLive", data); start(counter, callback, urls);
} return counter;
}
/**
 * Starts one asynchronous request per non-null URL.
 *
 * @param counter  the counter tracking in-flight work
 * @param callback receiver of the extracted data
 * @param urls     the URLs to check; {@code null} entries are skipped
 */
private void start(MultiThreadingCounter counter, UrlLiveDataCallback callback, String... urls) {
    if (urls == null || urls.length == 0) {
        return;
    }
    for (String url : urls) {
        if (url == null) {
            continue;
        }
        try {
            // Balanced by the reduce() in finally; the request itself is counted in search().
            counter.increase();
            search(counter, url, Attribution.of(url), callback);
        } catch (Exception e) {
            // Both placeholders are filled and the throwable is passed so the stack trace is kept.
            logger.error("关键词 {} 搜索创建出错: {}", url, e.getMessage(), e);
        } finally {
            counter.reduce();
        }
    }
}
/**
 * Issues an asynchronous GET for {@code url} and parses the response in the listener.
 *
 * @param counter  the counter tracking in-flight work; raised before the call and
 *                 lowered when the listener finishes
 * @param url      the URL to fetch
 * @param attr     attribute wrapper forwarded to the parser and the callback
 * @param callback receiver of the extracted data
 * @return the same {@code counter} instance
 */
private MultiThreadingCounter search(MultiThreadingCounter counter, String url,
        Attribution attr, UrlLiveDataCallback callback) {
    logger.info("当前处理 URL: {}", url);
    Request request = HttpRequestBuilder.newGetRequest(url, null);
    counter.increase();
    HttpBoot.asyncCall(request, ProxyClientUtil.getNATProxy(), false).addListeners(future -> {
        try {
            if (future.isSuccess()) {
                Response response = future.result();
                try {
                    parseHtml(response, attr, callback);
                } catch (Exception e) {
                    logger.error("解析出错", e);
                }
            } else {
                logger.info("{} 搜索结果访问失败: {}", request.url().url(), future.cause().getMessage());
            }
        } finally {
            // Always release the slot, even if the failure branch itself throws.
            counter.reduce();
        }
    });
    return counter;
}
@Override private void parseHtml(Response response, Attribution attr,
public Site getSite() { UrlLiveDataCallback callback) {
return site; /***验证网页是否能够连通*/
} boolean f = true;
if(!response.isSuccessful()){
try {
f = matchDel(response.body().string(),attr.get().toString());
/*** } catch (IOException e) {
* @Title: matchDel logger.info("数据判断出错 {}",e.getMessage());
* @author hero }finally {
* @Description: 验证链接是否有效 if(response != null) {
* @param @param page response.close();
* @param @return 设定文件 }
* @return boolean 返回类型 }
*/ }else{
public boolean matchDel(Page page){ f = false;
int step = 1; }
Document doc = page.getHtml().getDocument(); UrlLiveBean ulb = new UrlLiveBean(attr.get().toString(), f);
if (callback == null) {
logger.warn("DataCallback 对象为 null,无法保存数据");
} else {
callback.onData(ulb, attr);
}
}
/***
* @Title: matchDel
* @author hero
* @Description: 验证链接是否有效
* @param @param page
* @param @return 设定文件
* @return boolean 返回类型
*/
public boolean matchDel(String result,String url){
int step = 1;
Document doc = Jsoup.parse(result);
if(rulerHead(doc)){ if(rulerHead(doc)){
logger.info("{}检测规则:第{}步",page.getUrl(),step); logger.info("{}检测规则:第{}步",url,step);
return true; return true;
} }
step++; step++;
if (rulerYaoyan(doc)) if (rulerYaoyan(doc))
{ {
logger.info("{}检测规则:第{}步",page.getUrl(),step); logger.info("{}检测规则:第{}步",url,step);
return true; return true;
} }
step++; step++;
if (rulerWeigui(doc)) if (rulerWeigui(doc))
{ {
logger.info("{}检测规则:第{}步",page.getUrl(),step); logger.info("{}检测规则:第{}步",url,step);
return true; return true;
} }
step++; step++;
if (rulerTousu(doc)) if (rulerTousu(doc))
{ {
logger.info("{}检测规则:第{}步",page.getUrl(),step); logger.info("{}检测规则:第{}步",url,step);
return true; return true;
} }
step++; step++;
if (page.getUrl().get().contains("huanqiu.com")) if (url.contains("huanqiu.com"))
{ {
logger.info("{}检测规则:第{}步",page.getUrl(),step); logger.info("{}检测规则:第{}步",url,step);
return rulerHuanqiuWuxiao(doc); return rulerHuanqiuWuxiao(doc);
} }
step++;//7 step++;//7
if (rulerBucunzai(doc)) if (rulerBucunzai(doc))
{ {
logger.info("{}检测规则:第{}步",page.getUrl(),step); logger.info("{}检测规则:第{}步",url,step);
return true; return true;
} }
step++;//8 step++;//8
if (rulerKong(doc)) if (rulerKong(doc))
{ {
logger.info("{}检测规则:第{}步",page.getUrl(),step); logger.info("{}检测规则:第{}步",url,step);
return true; return true;
} }
step++;//9 step++;//9
if (rulerZhaoshang(doc)) if (rulerZhaoshang(doc))
{ {
logger.info("{}检测规则:第{}步",page.getUrl(),step); logger.info("{}检测规则:第{}步",url,step);
return true; return true;
} }
step++;//11 step++;//11
if (rulerYidian(doc)) if (rulerYidian(doc))
{ {
logger.info("{}检测规则:第{}步",page.getUrl(),step); logger.info("{}检测规则:第{}步",url,step);
return true; return true;
} }
return false; return false;
} }
/** /**
* *
* @TODO(TODO 微信谣言的无效网址筛选规则) * @TODO(TODO 微信谣言的无效网址筛选规则)
* @author 陈炜涛 * @author 陈炜涛
...@@ -282,31 +338,37 @@ public class UrlLivePageProcessor implements PageProcessor{ ...@@ -282,31 +338,37 @@ public class UrlLivePageProcessor implements PageProcessor{
} }
return flg; return flg;
} }
/** /**
* @Title: rulerHead * @Title: rulerHead
* @author hero * @author hero
* @Description: 验证链接头部 * @Description: 验证链接头部
* @param @param doc * @param @param doc
* @param @return 设定文件 * @param @return 设定文件
* @return boolean 返回类型 * @return boolean 返回类型
*/ */
private boolean rulerHead(Document doc) private boolean rulerHead(Document doc)
{ {
List<Node> nodeList = doc.head().childNodes(); List<Node> nodeList = doc.head().childNodes();
try { try {
for (Node node : nodeList) { for (Node node : nodeList) {
if (node.outerHtml().contains("<title>")) { if (node.outerHtml().contains("<title>")) {
String title = node.toString().split("<title>")[1].split("</title>")[0]; String title = node.toString().split("<title>")[1].split("</title>")[0];
if(title.contains("404")){ if(title.contains("404")){
return true; return true;
} }
} }
} if (node.outerHtml().contains("meta")) {
} catch (Exception e) { String meta = node.toString();
return false; if(meta.contains("公益404页面")) {
} return true;
}
}
}
} catch (Exception e) {
return false;
}
return false; return false;
} }
} }
package com.zhiwei.source_forward.pipeline;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
/**
 * Pipeline that accumulates the result maps produced by the page processors.
 *
 * <p>For every processed page the pipeline looks up the result keys
 * {@code "content"}, {@code "mediaSelf"}, {@code "sourceForward"} and
 * {@code "urlLive"} and appends each non-null value to the matching list.
 * The lists are read through the getters, typically after the spider has
 * finished running.
 *
 * <p>The default lists are synchronized because a spider configured with more
 * than one worker thread may call {@link #process} concurrently, and
 * concurrent appends to a plain {@code ArrayList} can lose elements or fail.
 * Callers that iterate a list while a spider is still running must synchronize
 * on that list themselves. A list installed through a setter is used as-is.
 *
 * @author hero
 * @date 2018-06-30 09:54:27
 */
public class DataPipeline implements Pipeline {

    private List<Map<String, Object>> contentDataList = newSynchronizedList();
    private List<Map<String, Object>> mediaSelfDataList = newSynchronizedList();
    private List<Map<String, Object>> sourceForwardDataList = newSynchronizedList();
    private List<Map<String, Object>> urlLivedataList = newSynchronizedList();

    public DataPipeline() {
        super();
    }

    /**
     * Stores the result maps carried by {@code resultItems}.
     *
     * @param resultItems results of one processed page; absent keys are ignored
     * @param task the task the page belongs to (not used)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        Map<String, Object> contentData = resultItems.get("content");
        Map<String, Object> mediaSelfData = resultItems.get("mediaSelf");
        Map<String, Object> sourceForwardData = resultItems.get("sourceForward");
        Map<String, Object> urlLivedata = resultItems.get("urlLive");
        addIfPresent(contentDataList, contentData);
        addIfPresent(mediaSelfDataList, mediaSelfData);
        addIfPresent(sourceForwardDataList, sourceForwardData);
        addIfPresent(urlLivedataList, urlLivedata);
    }

    /** Appends {@code data} to {@code target} unless the page produced no such result. */
    private static void addIfPresent(List<Map<String, Object>> target, Map<String, Object> data) {
        if (data != null) {
            target.add(data);
        }
    }

    /** Creates an empty list whose individual operations are safe to call from several threads. */
    private static List<Map<String, Object>> newSynchronizedList() {
        // Fully qualified so that the file's import block does not have to change.
        return java.util.Collections.synchronizedList(new ArrayList<Map<String, Object>>());
    }

    /** @return the collected {@code "content"} results */
    public List<Map<String, Object>> getContentDataList() {
        return contentDataList;
    }

    public void setContentDataList(List<Map<String, Object>> contentDataList) {
        this.contentDataList = contentDataList;
    }

    /** @return the collected {@code "mediaSelf"} results */
    public List<Map<String, Object>> getMediaSelfDataList() {
        return mediaSelfDataList;
    }

    public void setMediaSelfDataList(List<Map<String, Object>> mediaSelfDataList) {
        this.mediaSelfDataList = mediaSelfDataList;
    }

    /** @return the collected {@code "sourceForward"} results */
    public List<Map<String, Object>> getSourceForwardDataList() {
        return sourceForwardDataList;
    }

    public void setSourceForwardDataList(List<Map<String, Object>> sourceForwardDataList) {
        this.sourceForwardDataList = sourceForwardDataList;
    }

    /** @return the collected {@code "urlLive"} results */
    public List<Map<String, Object>> getUrlLivedataList() {
        return urlLivedataList;
    }

    public void setUrlLivedataList(List<Map<String, Object>> urlLivedataList) {
        this.urlLivedataList = urlLivedataList;
    }
}
package com.zhiwei.source_forward.run; package com.zhiwei.source_forward.run;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List; import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import com.zhiwei.source_forward.crawler.ContentPageProcessor; import org.apache.logging.log4j.LogManager;
import com.zhiwei.source_forward.downloader.MyDownLoader; import org.apache.logging.log4j.Logger;
import com.zhiwei.source_forward.pipeline.DataPipeline;
import us.codecraft.webmagic.Spider; import com.zhiwei.source_forward.bean.ContentBean;
import com.zhiwei.source_forward.bean.ContentBean.Attribution;
import com.zhiwei.source_forward.crawler.ContentCrawler;
import com.zhiwei.source_forward.util.ContentDataCallback;
public class ContentMatch { public class ContentMatch {
/**
* @Title: getSourceForward
* @author hero private static Logger logger = LogManager.getLogger(ContentMatch.class);
* @Description: 验证文章是否转发
* @param @param dataMap public static List<ContentBean> getContentMatch(List<String> urlList){
* @param @return 设定文件 //启动获取链接来源
* @return Map<String,Map<String,Object>> 返回类型 List<ContentBean> dataList = ContentMatchCrawlerThread.getContentMatch(urlList);
*/ return dataList;
public static Map<String,Map<String,Object>> getContent(Map<String,Map<String,Object>> dataMap){ }
//启动验证来源程序
DataPipeline pipeline = new DataPipeline(); public static void main(String[] args) {
Spider spider = Spider.create(new ContentPageProcessor()); List<String> urlList = new ArrayList<>();
for(Entry<String,Map<String,Object>> entry : dataMap.entrySet()){ urlList.add("http://sh.qihoo.com/pc/99493b3bf136d8e20?sign=360_e39369d1");
spider.addUrl(entry.getKey()); urlList.add("http://news.ctocio.com.cn/383/14543883.shtml");
} urlList.add("http://www.jn001.com/news/2018-07/05/content_561091.htm");
spider.setDownloader(new MyDownLoader()); urlList.add("http://www.ca800.com/fFa8D/bOTUBC1QfF/40944.aspx");
spider.addPipeline(pipeline); urlList.add("http://sh.qihoo.com/pc/988470164f6c5ca14?sign=360_e39369d1");
spider.thread(5).run(); urlList.add("http://news.jstv.com/a/20180705/1530731642686.shtml?jsbcApp=1");
urlList.add("https://tech.sina.cn/i/gn/2018-07-05/detail-ihexfcvi8155439.d.html?pos=18");
List<Map<String,Object>> contentList = pipeline.getContentDataList(); urlList.add("http://sh.qihoo.com/pc/983b3d157f91af18b?sign=360_e39369d1");
for(Map<String,Object> contentMap : contentList){ urlList.add("http://china.rednet.cn/c/2018/07/05/4671927.htm");
String url = contentMap.get("url")+""; urlList.add("http://news.enorth.com.cn/system/2018/07/05/035782857.shtml");
//搜集原文 urlList.add("https://www.toutiao.com/i6573922350037729796/");
if(dataMap.containsKey(url)){ urlList.add("http://news.cnhubei.com/xw/sh/201807/t4132048.shtml");
Map<String,Object> data = dataMap.get(url); urlList.add("https://www.toutiao.com/a6573774143949373956/");
String content = contentMap.get("content")+""; List<ContentBean> da = ContentMatch.getContentMatch(urlList);
data.put("content", content); for(ContentBean sfb : da) {
dataMap.put(url, data); System.out.println(sfb.toString());
} }
} }
return dataMap;
} static class ContentMatchCrawlerThread extends Thread{
private static List<ContentBean> getContentMatch(List<String> urlList){
try{
ContentCrawler crawler = new ContentCrawler();
List<ContentBean> list = Collections.synchronizedList(new ArrayList<ContentBean>());
ContentDataCallback callback = new ContentDataCallback() {
@Override
public void onData(ContentBean data, Attribution attr) {
list.add(data);
logger.info("列表大小:::{}",list.size());
}
};
crawler.submitTask(callback,urlList.toArray(new String[urlList.size()])).await();
return list;
}catch (Exception e){
e.fillInStackTrace();
}
return null;
}
}
} }
package com.zhiwei.source_forward.run;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution;
import com.zhiwei.source_forward.crawler.MediaSelfSourceCrawler;
import com.zhiwei.source_forward.util.MediaSelfSourceDataCallBack;
/**
 * Entry point for resolving the self-media source of a batch of article URLs.
 *
 * <p>The work is delegated to {@link MediaSelfSourceCrawler}; each bean the
 * crawler reports through its callback is collected into the returned list.
 */
public class MediaSelfSource {

    private static final Logger logger = LogManager.getLogger(MediaSelfSource.class);

    /**
     * Crawls the given URLs and returns the beans reported by the crawler.
     *
     * @param urlList URLs to crawl; {@code null} is treated as "nothing to crawl"
     * @return the collected beans, never {@code null}; if the crawl fails the
     *         list holds whatever had been collected before the failure
     */
    public static List<MediaSelfSourceBean> getMediaSelfSource(List<String> urlList) {
        return MediaSelfSourceCrawlerThread.getMediaSelfSource(urlList);
    }

    /** Manual smoke test: crawls one sample URL and prints the resulting beans. */
    public static void main(String[] args) {
        List<String> urlList = new ArrayList<>();
        urlList.add("https://baijiahao.baidu.com/s?id=1606950814338460255&wfr=spider&for=pc&qq-pf-to=pcqq.c2c");
        List<MediaSelfSourceBean> da = MediaSelfSource.getMediaSelfSource(urlList);
        for (MediaSelfSourceBean mssb : da) {
            System.out.println(mssb.toString());
        }
    }

    static class MediaSelfSourceCrawlerThread extends Thread {

        /**
         * Submits the URLs to a new crawler and blocks until the submitted task completes.
         *
         * @param urlList URLs to crawl, may be {@code null}
         * @return the beans received through the callback, never {@code null}
         */
        private static List<MediaSelfSourceBean> getMediaSelfSource(List<String> urlList) {
            // Synchronized because the callback is presumably invoked from crawler worker threads.
            final List<MediaSelfSourceBean> list =
                    Collections.synchronizedList(new ArrayList<MediaSelfSourceBean>());
            if (urlList == null) {
                return list;
            }
            try {
                MediaSelfSourceCrawler crawler = new MediaSelfSourceCrawler();
                MediaSelfSourceDataCallBack callback = new MediaSelfSourceDataCallBack() {
                    @Override
                    public void onData(MediaSelfSourceBean data, Attribution attr) {
                        list.add(data);
                        logger.info("列表大小:::{}", list.size());
                    }
                };
                crawler.submitTask(callback, urlList.toArray(new String[urlList.size()])).await();
            } catch (Exception e) {
                if (e instanceof InterruptedException) {
                    // Keep the interrupt visible to the caller instead of silently clearing it.
                    Thread.currentThread().interrupt();
                }
                // The previous e.fillInStackTrace() call discarded the failure without reporting it.
                logger.error("Media self-source crawl failed; returning the results collected so far", e);
            }
            return list;
        }
    }
}
package com.zhiwei.source_forward.run; package com.zhiwei.source_forward.run;
import java.util.HashMap; import java.util.ArrayList;
import java.util.Collections;
import java.util.List; import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import com.zhiwei.source_forward.crawler.MediaSelfSourcePageProcessor; import org.apache.logging.log4j.LogManager;
import com.zhiwei.source_forward.crawler.SourceForwardPageProcessor; import org.apache.logging.log4j.Logger;
import com.zhiwei.source_forward.downloader.MyDownLoader;
import com.zhiwei.source_forward.pipeline.DataPipeline;
import us.codecraft.webmagic.Spider; import com.zhiwei.source_forward.bean.SourceForwardBean;
import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution;
import com.zhiwei.source_forward.crawler.SourceForwardCrawler;
import com.zhiwei.source_forward.util.SourceForwardDataCallBack;
/** /**
* @ClassName: SourceForward * @ClassName: SourceForward
...@@ -20,149 +20,57 @@ import us.codecraft.webmagic.Spider; ...@@ -20,149 +20,57 @@ import us.codecraft.webmagic.Spider;
*/ */
public class SourceForward { public class SourceForward {
/** private static Logger logger = LogManager.getLogger(SourceForward.class);
* @Title: getSourceForward
* @author hero public static List<SourceForwardBean> getSourceForward(List<String> urlList){
* @Description: 验证文章是否转发 //启动获取链接来源
* @param @param dataMap List<SourceForwardBean> dataList = SourceForwardCrawlerThread.getSourceForward(urlList);
* @param @return 设定文件 return dataList;
* @return Map<String,Map<String,Object>> 返回类型 }
*/
public static Map<String,Map<String,Object>> getSourceForward(Map<String,Map<String,Object>> dataMap){ public static void main(String[] args) {
//启动验证来源程序 List<String> urlList = new ArrayList<>();
DataPipeline pipeline = new DataPipeline(); urlList.add("http://sh.qihoo.com/pc/99493b3bf136d8e20?sign=360_e39369d1");
Spider spider = Spider.create(new SourceForwardPageProcessor()); urlList.add("http://news.ctocio.com.cn/383/14543883.shtml");
for(Entry<String,Map<String,Object>> entry : dataMap.entrySet()){ urlList.add("http://www.jn001.com/news/2018-07/05/content_561091.htm");
spider.addUrl(entry.getKey()); urlList.add("http://www.ca800.com/fFa8D/bOTUBC1QfF/40944.aspx");
} urlList.add("http://sh.qihoo.com/pc/988470164f6c5ca14?sign=360_e39369d1");
spider.setDownloader(new MyDownLoader()); urlList.add("http://news.jstv.com/a/20180705/1530731642686.shtml?jsbcApp=1");
spider.addPipeline(pipeline); urlList.add("https://tech.sina.cn/i/gn/2018-07-05/detail-ihexfcvi8155439.d.html?pos=18");
spider.thread(5).run(); urlList.add("http://sh.qihoo.com/pc/983b3d157f91af18b?sign=360_e39369d1");
urlList.add("http://china.rednet.cn/c/2018/07/05/4671927.htm");
List<Map<String,Object>> sourceForwardList = pipeline.getSourceForwardDataList(); urlList.add("http://news.enorth.com.cn/system/2018/07/05/035782857.shtml");
for(Map<String,Object> sourceMap : sourceForwardList){ urlList.add("https://www.toutiao.com/i6573922350037729796/");
String url = sourceMap.get("url")+""; urlList.add("http://news.cnhubei.com/xw/sh/201807/t4132048.shtml");
String root_source = sourceMap.get("root_source")!=null?sourceMap.get("root_source").toString():null; urlList.add("https://www.toutiao.com/a6573774143949373956/");
String isForwardWX = sourceMap.get("isforward")!=null?sourceMap.get("isforward").toString():null; List<SourceForwardBean> da = SourceForward.getSourceForward(urlList);
String channel = sourceMap.get("channel")+""; for(SourceForwardBean sfb : da) {
//整合数据及验证转发原创 System.out.println(sfb.toString());
if(dataMap.containsKey(url)){ }
Map<String,Object> data = dataMap.get(url); }
String source = data.get("来源")+"";
String isForward = "转发";
if(root_source == null){
isForward = "原创";
}else if(root_source.toUpperCase().trim().equals(source.toUpperCase().trim())){
isForward = "原创";
}
if(url.contains("mp.weixin.qq.com")){
isForward = isForwardWX;
}else{
data.put("原来源", root_source);
data.put("频道", channel);
}
data.put("是否转发", isForward);
dataMap.put(url, data);
}
}
return dataMap;
}
static class SourceForwardCrawlerThread extends Thread{
/**
* @Title: getMediaSelfSource
* @author hero
* @Description: 根据链接匹配自媒体号名称
* @param @param dataMap
* @param @return 设定文件
* @return Map<String,Map<String,Object>> 返回类型
*/
public static Map<String,Map<String,Object>> getMediaSelfSource(Map<String,Map<String,Object>> dataMap){
//启动验证来源程序
DataPipeline pipeline = new DataPipeline();
Spider spider = Spider.create(new MediaSelfSourcePageProcessor());
for(Entry<String,Map<String,Object>> entry : dataMap.entrySet()){
spider.addUrl(entry.getKey());
}
spider.setDownloader(new MyDownLoader());
spider.addPipeline(pipeline);
spider.thread(5).run();
List<Map<String,Object>> sourceForwardList = pipeline.getMediaSelfDataList();
for(Map<String,Object> sourceMap : sourceForwardList){
String url = sourceMap.get("url")+"";
//整合数据及验证转发原创
if(dataMap.containsKey(url)){
Map<String,Object> data = dataMap.get(url);
data.put("自媒体号", sourceMap.get("mediaself"));
data.put("频道", sourceMap.get("channel"));
dataMap.put(url, data);
}
}
return dataMap;
}
/**
* @Title: getMediaSelfSource
* @author hero
* @Description: 根据链接匹配自媒体账号
* @param @param urlList
* @param @return 设定文件
* @return Map<String,String> 返回类型
*/
public static Map<String,String> getMediaSelfSource(List<String> urlList){
//启动验证来源程序
Map<String,String> dataMap = new HashMap<String,String>();
DataPipeline pipeline = new DataPipeline();
Spider spider = Spider.create(new MediaSelfSourcePageProcessor());
for(String url : urlList){
spider.addUrl(url);
dataMap.put(url, null);
}
spider.setDownloader(new MyDownLoader());
spider.addPipeline(pipeline);
spider.thread(5).run();
List<Map<String,Object>> sourceForwardList = pipeline.getMediaSelfDataList();
for(Map<String,Object> sourceMap : sourceForwardList){
String url = sourceMap.get("url")+"";
//整合数据及验证转发原创
if(dataMap.containsKey(url)){
dataMap.put(url, sourceMap.get("mediaself").toString());
}
}
return dataMap;
}
/**
*
* @Title: getMediaSelfSource
* @author hero
* @Description: 根据链接匹配自媒体账号
* @param @param url
* @param @return 设定文件
* @return String 返回类型
*/
public static String getMediaSelfSource(String url){
//启动验证来源程序
DataPipeline pipeline = new DataPipeline();
Spider spider = Spider.create(new MediaSelfSourcePageProcessor());
spider.addUrl(url);
spider.setDownloader(new MyDownLoader());
spider.addPipeline(pipeline);
spider.thread(1).run();
List<Map<String,Object>> sourceForwardList = pipeline.getMediaSelfDataList();
for(Map<String,Object> sourceMap : sourceForwardList){
return sourceMap.get("mediaself").toString();
}
return null;
}
private static List<SourceForwardBean> getSourceForward(List<String> urlList){
try{
SourceForwardCrawler crawler = new SourceForwardCrawler();
List<SourceForwardBean> list = Collections.synchronizedList(new ArrayList<SourceForwardBean>());
SourceForwardDataCallBack callback = new SourceForwardDataCallBack() {
@Override
public void onData(SourceForwardBean data, Attribution attr) {
list.add(data);
logger.info("列表大小:::{}",list.size());
}
};
crawler.submitTask(callback,urlList.toArray(new String[urlList.size()])).await();
return list;
}catch (Exception e){
e.fillInStackTrace();
}
return null;
}
}
} }
package com.zhiwei.source_forward.run; package com.zhiwei.source_forward.run;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Map.Entry; import java.util.Map.Entry;
import com.zhiwei.source_forward.crawler.UrlLivePageProcessor; import com.zhiwei.source_forward.bean.UrlLiveBean;
import com.zhiwei.source_forward.pipeline.DataPipeline; import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution;
import com.zhiwei.source_forward.crawler.UrlLiveCrawler;
import us.codecraft.webmagic.Spider; import com.zhiwei.source_forward.util.UrlLiveDataCallback;
/** /**
* @ClassName: URLLive * @ClassName: URLLive
...@@ -16,46 +18,70 @@ import us.codecraft.webmagic.Spider; ...@@ -16,46 +18,70 @@ import us.codecraft.webmagic.Spider;
* @date 2017年12月6日 上午9:22:49 * @date 2017年12月6日 上午9:22:49
*/ */
public class URLLive { public class URLLive {
/**
 * Checks every URL that is a key of {@code dataMap} and writes the result
 * into that URL's row under the key {@code "是否删除"}.
 *
 * <p>The URL reported by the crawler is normalised before the lookup:
 * {@code "http://"} is prepended when the URL does not contain {@code "http"},
 * and {@code "www."} is inserted after {@code "://"} when it does not contain
 * {@code "www"}. Results whose normalised URL is not a key of {@code dataMap}
 * are ignored.
 *
 * <p>If the crawl fails (the crawler helper returns {@code null}) the map is
 * returned unchanged instead of throwing a {@code NullPointerException}.
 *
 * @author hero
 * @param dataMap rows keyed by URL; the rows are modified in place
 * @return the same {@code dataMap} instance
 */
public static Map<String,Map<String,Object>> verificationURLLive(Map<String,Map<String,Object>> dataMap){
    // Every key of the map is a URL to verify.
    List<String> urlList = new ArrayList<>(dataMap.keySet());
    List<UrlLiveBean> dataList = UrlLiveCrawlerThread.getUrlLiveCrawle(urlList);
    if (dataList == null) {
        // The crawler helper returns null when it hit an exception; nothing to merge.
        return dataMap;
    }
    for (UrlLiveBean ub : dataList) {
        String url = ub.getUrl();
        if (url == null) {
            continue;
        }
        if (!url.contains("http")) {
            url = "http://" + url;
        }
        if (!url.contains("www")) {
            url = url.replace("://", "://www.");
        }
        Map<String,Object> row = dataMap.get(url);
        if (row != null) {
            // NOTE(review): the key reads "is deleted" but the stored value is isLive() - confirm the intended polarity.
            // The row is mutated in place, so it does not need to be put back into the map.
            row.put("是否删除", ub.isLive());
        }
    }
    return dataMap;
}
/**
 * Checks whether the given URLs are still reachable.
 *
 * @param urlList URLs to verify
 * @return the beans produced by the URL-live crawler, or {@code null} when
 *         the crawler helper failed
 */
public static List<UrlLiveBean> verificationURLLive(List<String> urlList){
    // Delegate straight to the crawler helper that runs the check.
    return UrlLiveCrawlerThread.getUrlLiveCrawle(urlList);
}
static class UrlLiveCrawlerThread extends Thread{
/**
* @Title: verificationURLLive private static List<UrlLiveBean> getUrlLiveCrawle(List<String> urlList){
* @author hero try{
* @Description: 验证数据是否已删除 UrlLiveCrawler crawler = new UrlLiveCrawler();
* @param @param dataMap List<UrlLiveBean> list = Collections.synchronizedList(new ArrayList<UrlLiveBean>());
* @param @return 设定文件 UrlLiveDataCallback callback = new UrlLiveDataCallback() {
* @return Map<String,Map<String,Object>> 返回类型
*/
public static Map<String,Map<String,Object>> verificationURLLive(Map<String,Map<String,Object>> dataMap){
//启动验证链接是否有效程序程序
DataPipeline pipeline = new DataPipeline();
Spider spider = Spider.create(new UrlLivePageProcessor());
for(Entry<String,Map<String,Object>> entry : dataMap.entrySet()){
spider.addUrl(entry.getKey());
}
spider.addPipeline(pipeline);
spider.thread(5).run();
//验证数据是否已删除
List<Map<String,Object>> dataList = pipeline.getUrlLivedataList();
for(Map<String,Object> data : dataList){
String url = data.get("url")+"";
if(!url.contains("http")){
url = "http://"+url;
}
if(!url.contains("www")){
url = url.replace("://", "://www.");
}
boolean live = (boolean)data.get("live");
if(dataMap.containsKey(url)){
Map<String,Object> map = dataMap.get(url);
map.put("是否删除", live);
dataMap.put(url, map);
}
}
return dataMap;
}
@Override
public void onData(UrlLiveBean data, Attribution attr) {
list.add(data);
System.out.println("列表大小:::"+list.size());
}
};
crawler.submitTask(callback,urlList.toArray(new String[urlList.size()])).await();
return list;
}catch (Exception e){
e.fillInStackTrace();
}
return null;
}
}
} }
package com.zhiwei.source_forward.util;
import com.zhiwei.source_forward.bean.ContentBean;
import com.zhiwei.source_forward.bean.ContentBean.Attribution;
/**
 * Callback through which a content crawl hands each extracted
 * {@link ContentBean} back to the caller.
 *
 * <p>The interface has a single abstract method, so it can also be
 * implemented with a lambda.
 */
@FunctionalInterface
public interface ContentDataCallback {

    /**
     * Called when a result is available.
     *
     * @param data the extracted content bean
     * @param attr the {@link Attribution} value delivered together with the bean
     */
    void onData(ContentBean data, Attribution attr);
}
...@@ -5,8 +5,8 @@ import org.jsoup.nodes.Document; ...@@ -5,8 +5,8 @@ import org.jsoup.nodes.Document;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import cn.edu.hfut.dmic.contentextractor.ContentExtractor; import com.zhiwei.source_forward.content.ContentExtractor;
import cn.edu.hfut.dmic.contentextractor.News; import com.zhiwei.source_forward.content.News;
/** /**
* @ClassName: MatchChannel * @ClassName: MatchChannel
......
...@@ -5,8 +5,8 @@ import java.util.List; ...@@ -5,8 +5,8 @@ import java.util.List;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import cn.edu.hfut.dmic.contentextractor.ContentExtractor; import com.zhiwei.source_forward.content.ContentExtractor;
import cn.edu.hfut.dmic.contentextractor.News; import com.zhiwei.source_forward.content.News;
/** /**
* @ClassName: MatchSource * @ClassName: MatchSource
......
package com.zhiwei.source_forward.util;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution;
/**
 * Callback through which a self-media source crawl hands each
 * {@link MediaSelfSourceBean} back to the caller.
 *
 * <p>The interface has a single abstract method, so it can also be
 * implemented with a lambda.
 */
@FunctionalInterface
public interface MediaSelfSourceDataCallBack {

    /**
     * Called when a result is available.
     *
     * @param data the resolved self-media source bean
     * @param attr the {@link Attribution} value delivered together with the bean
     */
    void onData(MediaSelfSourceBean data, Attribution attr);
}
package com.zhiwei.source_forward.util;
import java.net.Proxy;
import com.zhiwei.proxy.common.Definition.GroupType;
import com.zhiwei.proxy.core.ProxyClient;
import com.zhiwei.proxy.core.ProxyClientFactory;
/**
 * Static access point to a single, lazily created {@link ProxyClient}.
 */
public class ProxyClientUtil {

    /** Shared client; volatile so the unsynchronized read in {@link #getClient()} is safe. */
    private static volatile ProxyClient client;

    /**
     * Fetches a NAT proxy from the shared client.
     *
     * @author hero
     * @return the proxy handed out by {@link ProxyClient#getNATProxy()}
     */
    public static Proxy getNATProxy(){
        ProxyClient shared = getClient();
        return shared.getNATProxy();
    }

    /**
     * Returns the shared client, building it on first use.
     *
     * <p>Uses double-checked locking on the class object so that the client is
     * built at most once even when several threads ask for it at the same time.
     *
     * @return the shared {@link ProxyClient}
     */
    public static ProxyClient getClient() {
        ProxyClient current = client;
        if (current != null) {
            return current;
        }
        synchronized (ProxyClientUtil.class) {
            current = client;
            if (current == null) {
                current = ProxyClientFactory.build("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
                client = current;
            }
            return current;
        }
    }
}
...@@ -9,7 +9,8 @@ import java.util.List; ...@@ -9,7 +9,8 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Map.Entry; import java.util.Map.Entry;
import com.zhiwei.zhiweiTools.order.TreatOrder; import com.zhiwei.tools.order.TreatOrder;
/** /**
* @ClassName: SourceData * @ClassName: SourceData
...@@ -82,7 +83,7 @@ public class SourceData { ...@@ -82,7 +83,7 @@ public class SourceData {
public static List<String> getSourceList(){ public static List<String> getSourceList(){
List<String> result = null; List<String> result = null;
if(sourceMap!=null && sourceMap.size()>0){ if(sourceMap!=null && sourceMap.size()>0){
result = new ArrayList<String>(); result = new ArrayList<>();
List<Entry<String,Integer>> dataList = TreatOrder.treatOrderByCountDesc(sourceMap); List<Entry<String,Integer>> dataList = TreatOrder.treatOrderByCountDesc(sourceMap);
for(Entry<String,Integer> entry : dataList){ for(Entry<String,Integer> entry : dataList){
result.add(entry.getKey()); result.add(entry.getKey());
......
package com.zhiwei.source_forward.util;
import com.zhiwei.source_forward.bean.SourceForwardBean;
import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution;
/**
 * Callback through which a source/forward crawl hands each
 * {@link SourceForwardBean} back to the caller.
 *
 * <p>The interface has a single abstract method, so it can also be
 * implemented with a lambda.
 */
@FunctionalInterface
public interface SourceForwardDataCallBack {

    /**
     * Called when a result is available.
     *
     * @param data the source/forward bean produced for one URL
     * @param attr the {@link Attribution} value delivered together with the bean
     */
    void onData(SourceForwardBean data, Attribution attr);
}
...@@ -46,5 +46,5 @@ public class TreateData { ...@@ -46,5 +46,5 @@ public class TreateData {
return str; return str;
} }
} }
} }
/**
 * @Title: UrlLiveDataCallback.java
 * @Package com.zhiwei.source_forward.util
 * @author 0xff
 * @date 2018-06-29 16:44:38
 */
package com.zhiwei.source_forward.util;
import com.zhiwei.source_forward.bean.UrlLiveBean;
import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution;
/**
 * Callback through which a URL-live crawl hands each {@link UrlLiveBean}
 * (the result of checking whether a link has been deleted) back to the caller.
 *
 * <p>The interface has a single abstract method, so it can also be
 * implemented with a lambda.
 *
 * @author 0xff
 * @date 2018-06-29 16:44:38
 */
@FunctionalInterface
public interface UrlLiveDataCallback {

    /**
     * Called when a result is available.
     *
     * @param data the check result for one URL
     * @param attr the {@link Attribution} value delivered together with the bean
     */
    void onData(UrlLiveBean data, Attribution attr);
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- log4j2 自身的日志级别 -->
<Configuration status="WARN">
<properties>
<property name="LOG_HOME">Log/</property>
<property name="LOG_FILE">crawler</property>
</properties>
<Appenders>
<!-- 定义日志输出地 -->
<Console name="Console" target="SYSTEM_OUT">
<PatternLayout pattern="%d{yyyy-MM-dd HH:mm:ss.SSS} %-5level %logger{36} - %msg%n" />
</Console>
<RollingRandomAccessFile name="LogFile"
fileName="${LOG_HOME}/${LOG_FILE}.log"
filePattern="${LOG_HOME}/$${date:yyyy-MM}/${LOG_FILE}-%d{yyyy-MM-dd}-%i.log">
<PatternLayout
pattern="%d{yyyy-MM-dd HH:mm:ss.SSS} %-5level %logger{36} - %msg%n" />
<Policies>
<TimeBasedTriggeringPolicy interval="1" />
<SizeBasedTriggeringPolicy size="20 MB" />
</Policies>
<DefaultRolloverStrategy max="20" />
</RollingRandomAccessFile>
</Appenders>
<Loggers>
<Root level="all">
<AppenderRef ref="Console" level="info" />
<AppenderRef ref="LogFile" level="info" />
</Root>
</Loggers>
</Configuration>
\ No newline at end of file
package com.zhiwei.source_forward.sourceforward.test; //package com.zhiwei.source_forward.sourceforward.test;
//
import java.util.ArrayList; //import java.util.ArrayList;
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
import java.util.Map.Entry; //import java.util.Map.Entry;
//
import org.junit.Test; //import org.junit.Test;
//
import com.zhiwei.source_forward.run.URLLive; //import com.zhiwei.source_forward.run.URLLive;
//
/** ///**
* @ClassName: URLLiveTest // * @ClassName: URLLiveTest
* @Description: 验证链接有效性 // * @Description: 验证链接有效性
* @author hero // * @author hero
* @date 2017年12月6日 下午1:30:26 // * @date 2017年12月6日 下午1:30:26
*/ // */
public class URLLiveTest { //public class URLLiveTest {
//
//
// @Test //// @Test
// public void urlLiveTest(){ //// public void urlLiveTest(){
// String path = "E://稿件汇总网媒数据//福莱网媒.xlsx"; //// String path = "E://稿件汇总网媒数据//福莱网媒.xlsx";
// PoiExcelUtil poi = PoiExcelUtil.getInstance(); //// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// Map<String,Object> data = poi.importExcel(path, 0); //// Map<String,Object> data = poi.importExcel(path, 0);
// @SuppressWarnings("unchecked") //// @SuppressWarnings("unchecked")
// List<String> headList = (List<String>)data.get("head"); //// List<String> headList = (List<String>)data.get("head");
// headList.add("是否删除"); //// headList.add("是否删除");
// @SuppressWarnings("unchecked") //// @SuppressWarnings("unchecked")
// List<Map<String,Object>> dataList = (List<Map<String,Object>>)data.get("body"); //// List<Map<String,Object>> dataList = (List<Map<String,Object>>)data.get("body");
// Map<String,Map<String,Object>> dataMap = ReadMediaData.getUrlLive(dataList); //// Map<String,Map<String,Object>> dataMap = ReadMediaData.getUrlLive(dataList);
// dataMap = URLLive.verificationURLLive(dataMap); //// dataMap = URLLive.verificationURLLive(dataMap);
// ////
// List<Map<String,Object>> bodyList = new ArrayList<>(); //// List<Map<String,Object>> bodyList = new ArrayList<>();
// for(Entry<String,Map<String,Object>> dataEntry : dataMap.entrySet()){ //// for(Entry<String,Map<String,Object>> dataEntry : dataMap.entrySet()){
// bodyList.add(dataEntry.getValue()); //// bodyList.add(dataEntry.getValue());
// } //// }
// poi.exportExcel(path ,"匹配后数据", headList, bodyList); //// poi.exportExcel(path ,"匹配后数据", headList, bodyList);
// } //// }
//
//
} //}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment