Commit 0f93a339 by zhiwei

第一次git项目提交测试,来源验证及是否删除验证程序

parents
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>source_forward</artifactId>
<version>0.0.1-SNAPSHOT</version>
<name>source_forward</name>
<description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
</properties>
<developers>
<developer>
<id>Bewilder</id>
<name>zhiwei zhang</name>
<email>zhangzhiwei@zhiweidata.com</email>
</developer>
</developers>
<dependencies>
<dependency>
<groupId>cn.edu.hfut.dmic.webcollector</groupId>
<artifactId>WebCollector</artifactId>
<version>2.71</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.6.1</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.6.1</version>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-saxon</artifactId>
<version>0.6.1</version>
</dependency>
<dependency>
<groupId>com.zhiwei</groupId>
<artifactId>zhiweiTools</artifactId>
<version>0.0.6-SNAPSHOT</version>
</dependency>
</dependencies>
<!-- Build and packaging configuration -->
<build>
<plugins>
<!-- Attach and publish the sources jar -->
<plugin>
<artifactId>maven-source-plugin</artifactId>
<version>2.4</version>
<configuration>
<attach>true</attach>
</configuration>
<executions>
<execution>
<phase>compile</phase>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>2.10.4</version>
</plugin>
<!-- Force UTF-8 in the forked test JVM so Chinese console output is not garbled when running "mvn test" -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.19.1</version>
<configuration>
<forkMode>once</forkMode>
<argLine>-Dfile.encoding=UTF-8</argLine>
</configuration>
</plugin>
</plugins>
</build>
<!-- Distribution management: repositories that snapshot and release artifacts are deployed to -->
<distributionManagement>
<snapshotRepository>
<id>nexus-releases</id>
<name>User Project Snapshot</name>
<url>http://192.168.0.30:8081/nexus/content/repositories/snapshots/</url>
<uniqueVersion>true</uniqueVersion>
</snapshotRepository>
<repository>
<id>nexus-releases</id>
<name>User Project Release</name>
<url>http://192.168.0.30:8081/nexus/content/repositories/releases/</url>
</repository>
</distributionManagement>
</project>
\ No newline at end of file
package com.zhiwei.source_forward.crawler;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.nodes.Node;
import com.zhiwei.source_forward.util.SourceData;
import com.zhiwei.source_forward.util.TreateData;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 * WebMagic page processor that asks {@code TreateData} for the source and the channel of every
 * crawled page.
 *
 * <p>For each page a map with the keys {@code "url"}, {@code "channel"} and
 * {@code "root_source"} is stored in the page's result fields under the key {@code "data"}.
 * {@code "root_source"} is {@code null} and {@code "channel"} falls back to the default channel
 * when the page answered with HTTP 404 or when matching threw an exception.
 */
public class SourceForwardPageProcessor implements PageProcessor {

    /** Channel reported when the page is a 404 or when matching fails. */
    private static final String DEFAULT_CHANNEL = "新闻";

    /** Snapshot of the known source names, taken once when this class is loaded. */
    private static List<String> knownSources = SourceData.getSourceList();

    /** Crawl settings: 3 cycle retries, 1.5 s sleep, 10 s timeout, browser-like headers. */
    private Site site = Site.me()
            .setCycleRetryTimes(3)
            .setSleepTime(1500)
            .setTimeOut(10000)
            .addHeader("User-Agent",
                    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0")
            .addHeader("Accept",
                    "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");

    /**
     * @return the crawl settings used for every request of this processor
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Matches source and channel for one page and publishes them under the {@code "data"} field.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        String rootSource = null;
        String channel = DEFAULT_CHANNEL;
        try {
            if (page.getStatusCode() != 404) {
                List<Node> headNodes = page.getHtml().getDocument().head().childNodes();
                String matchedSource = TreateData.matchSource(
                        page.getUrl().get(), page.getHtml().toString(), knownSources);
                String matchedChannel = TreateData.matchChannel(headNodes);
                // Publish both values only when both lookups succeeded, so a failure in either
                // one leaves the defaults in place.
                rootSource = matchedSource;
                channel = matchedChannel;
            }
        } catch (Exception ignored) {
            // Best effort: any failure while matching keeps the defaults (null source, default
            // channel).
        }

        String url = page.getUrl().get();
        System.out.println(url + "=======" + channel + "=================" + rootSource);

        Map<String, String> data = new HashMap<String, String>();
        data.put("url", url);
        data.put("channel", channel);
        data.put("root_source", rootSource);
        page.putField("data", data);
    }
}
package com.zhiwei.source_forward.crawler;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 * WebMagic page processor that decides whether the content behind a URL has been removed.
 *
 * <p>For every page a map is stored in the result fields under the key {@code "data"}:
 * {@code "url"} holds the page URL and {@code "live"} holds the verdict. Despite its name,
 * {@code "live"} is {@code true} when the page is considered removed or invalid (HTTP 404, or
 * HTTP 200 with a body matching one of the rules in {@link #matchDel(Page)}) and {@code false}
 * otherwise, including every other HTTP status.
 */
public class UrlLivePageProcessor implements PageProcessor {

    private static final Logger logger = LoggerFactory.getLogger(UrlLivePageProcessor.class);

    /** Crawl settings: 3 cycle retries, 1.5 s sleep, 15 s timeout, browser-like headers. */
    private final Site site = Site.me().setCycleRetryTimes(3).setSleepTime(1500)
            .setTimeOut(15000)
            .addHeader("User-Agent",
                    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0")
            .addHeader("Accept",
                    "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");

    /**
     * Classifies one page and publishes the verdict under the {@code "data"} field.
     *
     * @param page the downloaded page; a {@code null} page is ignored
     */
    @Override
    public void process(Page page) {
        if (page == null) {
            // There is no URL to report and no page to attach a result to. Previously the null
            // check was followed by an unconditional page.getUrl(), which threw.
            return;
        }
        int status = page.getStatusCode();
        boolean removed;
        if (status == 200) {
            removed = matchDel(page);
        } else {
            // 404 counts as removed; any other status is reported as not removed.
            removed = status == 404;
        }
        Map<String, Object> data = new HashMap<String, Object>();
        data.put("url", page.getUrl().get());
        data.put("live", removed);
        page.putField("data", data);
    }

    /**
     * @return the crawl settings used for every request of this processor
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Runs the "content removed" rules against a successfully downloaded page, in a fixed order,
     * and logs the 1-based number of the rule that decided the outcome.
     *
     * <p>Rule 5 is special: for URLs containing {@code huanqiu.com} the huanqiu rule alone
     * decides the result and the remaining rules are not evaluated.
     *
     * @author hero
     * @param page the downloaded page
     * @return {@code true} if the page matches a rule, i.e. its content is considered removed
     */
    public boolean matchDel(Page page) {
        Document doc = page.getHtml().getDocument();
        if (rulerHead(doc)) {
            logStep(page, 1);
            return true;
        }
        if (rulerYaoyan(doc)) {
            logStep(page, 2);
            return true;
        }
        if (rulerWeigui(doc)) {
            logStep(page, 3);
            return true;
        }
        if (rulerTousu(doc)) {
            logStep(page, 4);
            return true;
        }
        if (page.getUrl().get().contains("huanqiu.com")) {
            logStep(page, 5);
            return rulerHuanqiuWuxiao(doc);
        }
        if (rulerBucunzai(doc)) {
            logStep(page, 6);
            return true;
        }
        if (rulerKong(doc)) {
            logStep(page, 7);
            return true;
        }
        if (rulerZhaoshang(doc)) {
            logStep(page, 8);
            return true;
        }
        if (rulerYidian(doc)) {
            logStep(page, 9);
            return true;
        }
        return false;
    }

    /** Logs which rule number decided the verdict for the given page. */
    private void logStep(Page page, int step) {
        logger.info("{}检测规则:第{}步", page.getUrl(), step);
    }

    /**
     * Rule for WeChat articles flagged as rumor: the text of {@code .pic_rumor} equals "谣言".
     *
     * @author 陈炜涛
     * @param doc parsed page
     * @return {@code true} if the rule matches
     */
    private boolean rulerYaoyan(Document doc) {
        return "谣言".equals(doc.select(".pic_rumor").text());
    }

    /**
     * Rule for WeChat articles blocked for a content violation: the first paragraph of
     * {@code .text_area} equals the "content cannot be viewed due to violation" notice.
     *
     * @author 陈炜涛
     * @param doc parsed page
     * @return {@code true} if the rule matches
     */
    private boolean rulerWeigui(Document doc) {
        return "此内容因违规无法查看".equals(doc.select(".text_area > p:nth-child(1)").text());
    }

    /**
     * Rule matching pages that contain a warning icon ({@code i[class="icon_msg warn"]}).
     * NOTE(review): the original comment described this as the WeChat content-violation rule,
     * while the method name suggests a complaint notice; confirm which is intended.
     *
     * @author 陈炜涛
     * @param doc parsed page
     * @return {@code true} if at least one such icon is present
     */
    private boolean rulerTousu(Document doc) {
        return !doc.select("i[class=\"icon_msg warn\"]").isEmpty();
    }

    /**
     * Rule for huanqiu.com pages: the page contains a {@code div[class="errMsg"]} element.
     *
     * @author 陈炜涛
     * @param doc parsed page
     * @return {@code true} if the error element is present
     */
    private boolean rulerHuanqiuWuxiao(Document doc) {
        return !doc.select("div[class=\"errMsg\"]").isEmpty();
    }

    /**
     * Rule for empty pages: the serialized {@code body} and {@code head} are both shorter than
     * 14 characters. {@code <body></body>} and {@code <head></head>} are 13 characters each, so
     * this presumably means both elements have no content.
     *
     * @author 陈炜涛
     * @param doc parsed page
     * @return {@code true} if both head and body are (nearly) empty
     */
    private boolean rulerKong(Document doc) {
        return doc.select("body").toString().length() < 14
                && doc.select("head").toString().length() < 14;
    }

    /**
     * Rule for "content does not exist": the page text contains either the "page you visited
     * does not exist" or the "content was deleted by the publisher" notice.
     *
     * @author 陈炜涛
     * @param doc parsed page
     * @return {@code true} if one of the notices is present
     */
    private boolean rulerBucunzai(Document doc) {
        String text = doc.text();
        return text.contains("很抱歉,您访问的页面不存在") || text.contains("该内容已被发布者删除");
    }

    /**
     * Rule for the Zhaoshang site: the third child of the first {@code div[class="paths"]} is an
     * empty link.
     *
     * @author 陈炜涛
     * @param doc parsed page
     * @return {@code true} if the rule matches; {@code false} also when the element is missing
     */
    private boolean rulerZhaoshang(Document doc) {
        try {
            return "<a href=\"\"> </a>".equals(
                    doc.select("div[class=\"paths\"]").first().child(2).toString());
        } catch (Exception ignored) {
            // No "paths" div, or fewer than three children: the page is simply not of the
            // expected shape, so the rule does not match.
            return false;
        }
    }

    /**
     * Rule for Yidian Zixun: the text of {@code div[class="content"]} contains the "article not
     * found" notice.
     *
     * @author 陈炜涛
     * @param doc parsed page
     * @return {@code true} if the notice is present
     */
    private boolean rulerYidian(Document doc) {
        return doc.select("div[class=\"content\"]").text().contains("文章没有找到");
    }

    /**
     * Rule for soft 404 pages: a {@code <title>} in the document head contains "404".
     *
     * @author hero
     * @param doc parsed page
     * @return {@code true} if a head title contains "404"; {@code false} otherwise, including
     *         when the head or the title cannot be read
     */
    private boolean rulerHead(Document doc) {
        try {
            List<Node> headNodes = doc.head().childNodes();
            for (Node node : headNodes) {
                String html = node.outerHtml();
                if (html.contains("<title>")) {
                    String title = html.split("<title>")[1].split("</title>")[0];
                    if (title.contains("404")) {
                        return true;
                    }
                }
            }
        } catch (Exception ignored) {
            // Missing head or an empty/malformed title: treated as "no 404 title".
        }
        return false;
    }
}
package com.zhiwei.source_forward.pipeline;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
/**
 * WebMagic pipeline that collects the {@code "data"} map of every processed page into a list.
 *
 * <p>All access to the backing list goes through one lock. A spider configured with several
 * worker threads (for example {@code spider.thread(5)}) invokes {@link #process} from those
 * threads, and an unsynchronized {@code ArrayList} can lose or corrupt entries under concurrent
 * appends. Callers that supplied their own list should read it only after the spider has
 * finished.
 */
public class SourceForwardDataPipeline implements Pipeline {

    /** Guards every read and write of {@link #dataList}. */
    private final Object lock = new Object();

    private List<Map<String, Object>> dataList;

    /**
     * Creates a pipeline that appends to the given list.
     *
     * @param dataList list that receives one map per processed page
     */
    public SourceForwardDataPipeline(List<Map<String, Object>> dataList) {
        super();
        this.dataList = dataList;
    }

    /** Creates a pipeline backed by a new, empty list. */
    public SourceForwardDataPipeline() {
        super();
        this.dataList = new ArrayList<>();
    }

    /**
     * @return the list the collected maps are appended to (the live list, not a copy)
     */
    public List<Map<String, Object>> getDataList() {
        synchronized (lock) {
            return dataList;
        }
    }

    /**
     * Replaces the list the collected maps are appended to.
     *
     * @param dataList the new target list
     */
    public void setDataList(List<Map<String, Object>> dataList) {
        synchronized (lock) {
            this.dataList = dataList;
        }
    }

    /**
     * Appends the page's {@code "data"} field to the list; pages without that field are skipped.
     *
     * @param resultItems result fields produced by the page processor
     * @param task the running task (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        Map<String, Object> data = resultItems.get("data");
        if (data == null) {
            return;
        }
        synchronized (lock) {
            dataList.add(data);
        }
    }
}
package com.zhiwei.source_forward.pipeline;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
/**
 * WebMagic pipeline that collects the {@code "data"} map of every processed page into a list.
 *
 * <p>All access to the backing list goes through one lock. A spider configured with several
 * worker threads (for example {@code spider.thread(5)}) invokes {@link #process} from those
 * threads, and an unsynchronized {@code ArrayList} can lose or corrupt entries under concurrent
 * appends. Callers that supplied their own list should read it only after the spider has
 * finished.
 */
public class UrlLivePipeline implements Pipeline {

    /** Guards every read and write of {@link #dataList}. */
    private final Object lock = new Object();

    private List<Map<String, Object>> dataList;

    /**
     * Creates a pipeline that appends to the given list.
     *
     * @param dataList list that receives one map per processed page
     */
    public UrlLivePipeline(List<Map<String, Object>> dataList) {
        super();
        this.dataList = dataList;
    }

    /** Creates a pipeline backed by a new, empty list. */
    public UrlLivePipeline() {
        super();
        this.dataList = new ArrayList<>();
    }

    /**
     * @return the list the collected maps are appended to (the live list, not a copy)
     */
    public List<Map<String, Object>> getDataList() {
        synchronized (lock) {
            return dataList;
        }
    }

    /**
     * Replaces the list the collected maps are appended to.
     *
     * @param dataList the new target list
     */
    public void setDataList(List<Map<String, Object>> dataList) {
        synchronized (lock) {
            this.dataList = dataList;
        }
    }

    /**
     * Appends the page's {@code "data"} field to the list; pages without that field are skipped.
     *
     * @param resultItems result fields produced by the page processor
     * @param task the running task (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        Map<String, Object> data = resultItems.get("data");
        if (data == null) {
            return;
        }
        synchronized (lock) {
            dataList.add(data);
        }
    }
}
package com.zhiwei.source_forward.run;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import com.zhiwei.source_forward.crawler.SourceForwardPageProcessor;
import com.zhiwei.source_forward.downloader.MyDownLoader;
import com.zhiwei.source_forward.pipeline.SourceForwardDataPipeline;
import us.codecraft.webmagic.Spider;
/**
 * Entry point that checks whether articles are forwarded (republished) or original.
 *
 * @author hero
 */
public class SourceForward {

    /**
     * Crawls every URL that is a key of {@code dataMap} and annotates the matching rows.
     *
     * <p>For each crawled URL that is a key of {@code dataMap}, the row receives three entries:
     * "是否转发" ("原创" when no root source was found or when it equals the row's "来源" value,
     * otherwise "转发"), "原来源" (the root source, possibly {@code null}) and "频道" (the
     * channel).
     *
     * @author hero
     * @param dataMap rows keyed by article URL; modified in place
     * @return the same {@code dataMap} instance
     */
    public static Map<String, Map<String, Object>> getSourceForward(Map<String, Map<String, Object>> dataMap) {
        // Run the source-matching crawl over all URLs with five worker threads.
        SourceForwardDataPipeline collector = new SourceForwardDataPipeline();
        Spider crawler = Spider.create(new SourceForwardPageProcessor());
        for (String articleUrl : dataMap.keySet()) {
            crawler.addUrl(articleUrl);
        }
        crawler.setDownloader(new MyDownLoader());
        crawler.addPipeline(collector);
        crawler.thread(5).run();

        // Merge the crawl results back into the caller's rows.
        List<Map<String, Object>> crawledRows = collector.getDataList();
        for (Map<String, Object> crawled : crawledRows) {
            String url = crawled.get("url") + "";
            Object rawRootSource = crawled.get("root_source");
            String rootSource = rawRootSource == null ? null : rawRootSource.toString();
            String channel = crawled.get("channel") + "";

            if (!dataMap.containsKey(url)) {
                continue;
            }
            Map<String, Object> row = dataMap.get(url);
            String declaredSource = row.get("来源") + "";
            // Original when nothing was matched, or when the match is the declared source.
            boolean original = rootSource == null || rootSource.equals(declaredSource);

            row.put("是否转发", original ? "原创" : "转发");
            row.put("原来源", rootSource);
            row.put("频道", channel);
            dataMap.put(url, row);
        }
        return dataMap;
    }
}
package com.zhiwei.source_forward.run;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import com.zhiwei.source_forward.crawler.UrlLivePageProcessor;
import com.zhiwei.source_forward.pipeline.UrlLivePipeline;
import us.codecraft.webmagic.Spider;
/**
 * Entry point that checks whether the content behind a set of URLs has been removed.
 *
 * @author hero
 */
public class URLLive {

    /**
     * Crawls every URL that is a key of {@code dataMap} and stores the verdict in the matching
     * row under the key "是否删除" ({@code true} when the crawl flagged the page as removed).
     *
     * <p>A crawled URL is matched to its row by the exact URL first, because the crawl is seeded
     * with the map keys themselves. Only if that fails is the legacy rewritten form tried
     * (prefix {@code http://} when "http" is absent, insert {@code www.} when "www" is absent).
     * Previously only the rewritten form was looked up, so rows whose key contained no "www"
     * never received a verdict.
     *
     * @author hero
     * @param dataMap rows keyed by URL; modified in place. A {@code null} or empty map is
     *        returned unchanged without starting a crawl.
     * @return the same {@code dataMap} instance
     */
    public static Map<String, Map<String, Object>> verificationURLLive(Map<String, Map<String, Object>> dataMap) {
        if (dataMap == null || dataMap.isEmpty()) {
            return dataMap;
        }

        // Run the removal check over all URLs with five worker threads.
        UrlLivePipeline pipeline = new UrlLivePipeline();
        Spider spider = Spider.create(new UrlLivePageProcessor());
        for (Entry<String, Map<String, Object>> entry : dataMap.entrySet()) {
            spider.addUrl(entry.getKey());
        }
        spider.addPipeline(pipeline);
        spider.thread(5).run();

        // Merge the verdicts back into the caller's rows.
        List<Map<String, Object>> dataList = pipeline.getDataList();
        for (Map<String, Object> data : dataList) {
            String url = data.get("url") + "";
            String key = dataMap.containsKey(url) ? url : withSchemeAndWww(url);
            if (!dataMap.containsKey(key)) {
                continue;
            }
            // Boolean.TRUE.equals avoids an unboxing NPE if the flag is missing.
            boolean live = Boolean.TRUE.equals(data.get("live"));
            Map<String, Object> map = dataMap.get(key);
            map.put("是否删除", live);
            dataMap.put(key, map);
        }
        return dataMap;
    }

    /**
     * Legacy key form: adds {@code http://} when the URL does not contain "http" and inserts
     * {@code www.} after {@code ://} when it does not contain "www".
     */
    private static String withSchemeAndWww(String url) {
        String normalized = url;
        if (!normalized.contains("http")) {
            normalized = "http://" + normalized;
        }
        if (!normalized.contains("www")) {
            normalized = normalized.replace("://", "://www.");
        }
        return normalized;
    }
}
package com.zhiwei.source_forward.util;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import com.zhiwei.zhiweiTools.order.TreatOrder;
/**
 * Registry of known source names, loaded once from the classpath resource
 * {@code sourceList.txt} (one name per line) when the class is initialized.
 *
 * <p>Names are stored upper-cased; the value kept for each name is its {@code hashCode()}.
 *
 * @author hero
 */
public class SourceData {

    /** Upper-cased source name mapped to its hash code. */
    private static Map<String, Integer> sourceMap;

    static {
        initSourceList();
    }

    /**
     * Loads the base source list from {@code sourceList.txt}.
     *
     * <p>The stream is always closed, blank lines are skipped so that the empty string is never
     * registered as a source, and the file is decoded as UTF-8 instead of the platform default.
     * NOTE(review): UTF-8 is assumed because the build declares UTF-8 as its source encoding;
     * confirm the resource is stored that way.
     *
     * @author hero
     * @return the populated map, or {@code null} if reading failed (the registry then keeps
     *         whatever was read before the failure)
     */
    private static Map<String, Integer> initSourceList() {
        sourceMap = new HashMap<String, Integer>();
        try (InputStream is = Thread.currentThread().getContextClassLoader()
                .getResourceAsStream("sourceList.txt")) {
            if (is == null) {
                // Report the real cause instead of an NPE stack trace from the reader.
                System.err.println("sourceList.txt was not found on the classpath; the source list is empty");
                return sourceMap;
            }
            BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"));
            String line;
            while ((line = br.readLine()) != null) {
                if (line.trim().isEmpty()) {
                    continue;
                }
                String source = line.toUpperCase();
                sourceMap.put(source, source.hashCode());
            }
            return sourceMap;
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Registers an additional, user-defined source name.
     *
     * @author hero
     * @param source the source name; it is upper-cased before being stored
     * @return {@code true} if the name was added; {@code false} if it is {@code null}, already
     *         present, or the registry is unavailable
     */
    public static boolean addUserSource(String source) {
        if (sourceMap == null || source == null) {
            return false;
        }
        String key = source.toUpperCase();
        if (sourceMap.containsKey(key)) {
            return false;
        }
        sourceMap.put(key, key.hashCode());
        return true;
    }

    /**
     * Returns the registered source names in the order produced by
     * {@code TreatOrder.treatOrderByCountDesc}.
     * NOTE(review): presumably descending by the stored value, which here is the name's hash
     * code; confirm against TreatOrder.
     *
     * @author hero
     * @return a new list of source names, or {@code null} when no source is registered
     */
    public static List<String> getSourceList() {
        if (sourceMap == null || sourceMap.isEmpty()) {
            // Callers have always received null for "no sources"; kept for compatibility.
            return null;
        }
        List<String> result = new ArrayList<String>();
        List<Entry<String, Integer>> dataList = TreatOrder.treatOrderByCountDesc(sourceMap);
        for (Entry<String, Integer> entry : dataList) {
            result.add(entry.getKey());
        }
        return result;
    }
}
//package com.zhiwei.source_forward.sourceforward.test;
//
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//import java.util.Map.Entry;
//
//import org.junit.Test;
//
//import com.zhiwei.excelpoi.excel.PoiExcelUtil;
//import com.zhiwei.source_forward.run.SourceForward;
//import com.zhiwei.source_forward.util.ReadMediaData;
//
///**
// * @ClassName: SourceForwardTest
// * @Description: 来源验证
// * @author hero
// * @date 2017年12月6日 上午9:55:13
// */
//public class SourceForwardTest {
//
// @Test
// public void sourceForwardTest(){
// String path = "E://稿件汇总网媒数据//JD稿件转载情况-1206.xlsx";
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// Map<String,Object> data = poi.importExcel(path, 0);
// @SuppressWarnings("unchecked")
// List<String> headList = (List<String>)data.get("head");
// headList.add("频道");
// headList.add("原来源");
// headList.add("是否转发");
// @SuppressWarnings("unchecked")
// List<Map<String,Object>> dataList = (List<Map<String,Object>>)data.get("body");
//
// Map<String,Map<String,Object>> dataMap = ReadMediaData.getUrl(dataList);
// dataMap = SourceForward.getSourceForward(dataMap);
//
// List<Map<String,Object>> bodyList = new ArrayList<>();
// for(Entry<String,Map<String,Object>> dataEntry : dataMap.entrySet()){
// bodyList.add(dataEntry.getValue());
// }
// poi.exportExcel(path ,"匹配后数据", headList, bodyList);
// }
//
//
//
//
//
//
//
//
//}
package com.zhiwei.source_forward.sourceforward.test;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.junit.Test;
import com.zhiwei.source_forward.run.URLLive;
/**
 * Test holder for the URL removal check ({@code URLLive}).
 *
 * <p>The only test method is currently commented out: it reads and writes an Excel file at a
 * hard-coded local path and uses {@code PoiExcelUtil} and {@code ReadMediaData}, neither of
 * which is imported in this file.
 *
 * @author hero
 */
public class URLLiveTest {
// Disabled manual test: import an Excel sheet, run URLLive.verificationURLLive on its rows and
// export the rows again with an extra result column.
// @Test
// public void urlLiveTest(){
// String path = "E://稿件汇总网媒数据//福莱网媒.xlsx";
// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// Map<String,Object> data = poi.importExcel(path, 0);
// @SuppressWarnings("unchecked")
// List<String> headList = (List<String>)data.get("head");
// headList.add("是否删除");
// @SuppressWarnings("unchecked")
// List<Map<String,Object>> dataList = (List<Map<String,Object>>)data.get("body");
// Map<String,Map<String,Object>> dataMap = ReadMediaData.getUrlLive(dataList);
// dataMap = URLLive.verificationURLLive(dataMap);
//
// List<Map<String,Object>> bodyList = new ArrayList<>();
// for(Entry<String,Map<String,Object>> dataEntry : dataMap.entrySet()){
// bodyList.add(dataEntry.getValue());
// }
// poi.exportExcel(path ,"匹配后数据", headList, bodyList);
// }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment