Commit 132b70cb by cwy

Merge branch 'source-forward-chen' of…

Merge branch 'source-forward-chen' of http://git.zhiweidata.top/zhangzhiwei/source_forward.git into source-forward-chen
parents f0fbf66b 0abfbd4a
This source diff could not be displayed because it is too large. You can view the blob instead.
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>source-forward</artifactId>
<version>0.2.1-SNAPSHOT</version>
<name>source-forward</name>
<description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
</properties>
<developers>
<developer>
<id>Bewilder</id>
<name>zhiwei zhang</name>
<email>zhangzhiwei@zhiweidata.com</email>
</developer>
</developers>
<dependencies>
<dependency>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.1.3-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId>
<version>0.5.5.6-SNAPSHOT</version>
</dependency>
</dependencies>
<!-- 打包管理 -->
<build>
<plugins>
<!-- 发布源码 -->
<plugin>
<artifactId>maven-source-plugin</artifactId>
<version>2.4</version>
<configuration>
<attach>true</attach>
</configuration>
<executions>
<execution>
<phase>compile</phase>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>2.10.4</version>
</plugin>
<!-- 解决maven test命令时console出现中文乱码 -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.19.1</version>
<configuration>
<forkMode>once</forkMode>
<argLine>-Dfile.encoding=UTF-8</argLine>
</configuration>
</plugin>
</plugins>
</build>
<!-- 分发管理:管理distribution和supporting files -->
<distributionManagement>
<snapshotRepository>
<id>nexus-releases</id>
<name>User Porject Snapshot</name>
<url>http://192.168.0.30:8081/nexus/content/repositories/snapshots/</url>
<uniqueVersion>true</uniqueVersion>
</snapshotRepository>
<repository>
<id>nexus-releases</id>
<name>User Porject Release</name>
<url>http://192.168.0.30:8081/nexus/content/repositories/releases/</url>
</repository>
</distributionManagement>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>source-forward</artifactId>
<version>0.2.2-SNAPSHOT</version>
<name>source-forward</name>
<description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
</properties>
<developers>
<developer>
<id>Bewilder</id>
<name>zhiwei zhang</name>
<email>zhangzhiwei@zhiweidata.com</email>
</developer>
</developers>
<dependencies>
<dependency>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.1.6-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId>
<version>0.6.1.0-SNAPSHOT</version>
</dependency>
</dependencies>
<!-- 打包管理 -->
<build>
<plugins>
<!-- 发布源码 -->
<plugin>
<artifactId>maven-source-plugin</artifactId>
<version>2.4</version>
<configuration>
<attach>true</attach>
</configuration>
<executions>
<execution>
<phase>compile</phase>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>2.10.4</version>
</plugin>
<!-- 解决maven test命令时console出现中文乱码 -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.19.1</version>
<configuration>
<forkMode>once</forkMode>
<argLine>-Dfile.encoding=UTF-8</argLine>
</configuration>
</plugin>
</plugins>
</build>
<!-- 分发管理:管理distribution和supporting files -->
<distributionManagement>
<snapshotRepository>
<id>nexus-releases</id>
<name>User Porject Snapshot</name>
<url>http://192.168.0.30:8081/nexus/content/repositories/snapshots/</url>
<uniqueVersion>true</uniqueVersion>
</snapshotRepository>
<repository>
<id>nexus-releases</id>
<name>User Porject Release</name>
<url>http://192.168.0.30:8081/nexus/content/repositories/releases/</url>
</repository>
</distributionManagement>
</project>
\ No newline at end of file
package com.zhiwei.source_forward.config;
import java.io.InputStream;
import java.util.Properties;
/**
 * Proxy settings read once, at class-initialization time, from the classpath
 * resource {@code proxyip.properties}.
 *
 * <p>The resource is looked up through the current thread's context class loader.
 * If it is missing or cannot be read, the problem is reported on standard error and
 * the static fields keep their default value {@code null}.
 */
public class ProxyConfig {

    /** Value of the {@code registry} property, or {@code null} if it could not be loaded. */
    public static String registry;

    /** Value of the {@code group} property, or {@code null} if it could not be loaded. */
    public static String group;

    static {
        // try-with-resources closes the stream even when load() fails half-way;
        // a null resource (file not on the classpath) is simply not closed.
        try (InputStream is = Thread.currentThread().getContextClassLoader()
                .getResourceAsStream("proxyip.properties")) {
            if (is == null) {
                System.err.println("proxyip.properties not found on the classpath; ProxyConfig is left unset");
            } else {
                Properties conf = new Properties();
                conf.load(is);
                registry = conf.getProperty("registry");
                group = conf.getProperty("group");
            }
        } catch (Exception e) {
            // Best effort: a broken configuration must not fail class initialization.
            e.printStackTrace();
        }
    }
}
package com.zhiwei.source_forward.config;
import java.io.InputStream;
import java.util.Properties;
/**
 * Proxy settings read once, at class-initialization time, from the classpath
 * resource {@code proxyip.properties}.
 *
 * <p>The resource is looked up through the current thread's context class loader.
 * If it is missing or cannot be read, the problem is reported on standard error and
 * the static fields keep their default value {@code null}. A missing or non-numeric
 * {@code proxyid} only leaves {@link #proxyid} unset; it does not prevent
 * {@link #registry} and {@link #group} from being loaded.
 */
public class ProxyConfig {

    /** Value of the {@code registry} property, or {@code null} if it could not be loaded. */
    public static String registry;

    /** Numeric value of the {@code proxyid} property, or {@code null} if absent or not a number. */
    public static Long proxyid;

    /** Value of the {@code group} property, or {@code null} if it could not be loaded. */
    public static String group;

    static {
        // try-with-resources closes the stream even when load() fails half-way;
        // a null resource (file not on the classpath) is simply not closed.
        try (InputStream is = Thread.currentThread().getContextClassLoader()
                .getResourceAsStream("proxyip.properties")) {
            if (is == null) {
                System.err.println("proxyip.properties not found on the classpath; ProxyConfig is left unset");
            } else {
                Properties conf = new Properties();
                conf.load(is);
                // Plain string properties first, so a bad proxyid cannot hide them.
                registry = conf.getProperty("registry");
                group = conf.getProperty("group");
                proxyid = parseProxyId(conf.getProperty("proxyid"));
            }
        } catch (Exception e) {
            // Best effort: a broken configuration must not fail class initialization.
            e.printStackTrace();
        }
    }

    /** Converts the raw {@code proxyid} text to a {@code Long}; returns {@code null} when absent or invalid. */
    private static Long parseProxyId(String raw) {
        if (raw == null || raw.trim().isEmpty()) {
            return null;
        }
        try {
            return Long.valueOf(raw.trim());
        } catch (NumberFormatException e) {
            System.err.println("proxyip.properties: proxyid is not a valid number: " + raw);
            return null;
        }
    }
}
package com.zhiwei.source_forward.crawler;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Node;
import com.zhiwei.async.GroupSync;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution;
import com.zhiwei.source_forward.util.MatchChannel;
import com.zhiwei.source_forward.util.MatchSource;
import com.zhiwei.source_forward.util.MediaSelfSourceDataCallBack;
import okhttp3.Request;
/**
 * Fetches article pages of self-media accounts and reports, for every URL, the matched
 * source and channel through a {@link MediaSelfSourceDataCallBack}.
 *
 * <p>Requests are sent asynchronously through a shared {@link HttpBoot}. The
 * {@link GroupSync} handed back by {@link #submitTask} has {@code add()} called before
 * each request is scheduled and {@code done()} called when its completion handler ends.
 *
 * @author byte-zbs
 * @version 1.0.0
 */
public class MediaSelfSourceCrawler {

    private static final Logger logger = LogManager.getLogger(MediaSelfSourceCrawler.class);

    /** Shared HTTP client, built with {@code retryTimes(2)}. */
    private static final HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();

    /**
     * Submits the given URLs for crawling.
     *
     * @param callback receives one {@link MediaSelfSourceBean} per successfully fetched page
     * @param urls     article URLs; {@code null} entries are skipped
     * @return the {@link GroupSync} tracking the submitted requests, or {@code null} if
     *         setting up the tasks threw an exception
     */
    public GroupSync submitTask(MediaSelfSourceDataCallBack callback, String... urls) {
        try {
            GroupSync counter = new GroupSync();
            start(counter, callback, urls);
            return counter;
        } catch (Exception e) {
            // The original message carried a "{}" that was never filled because the
            // exception is consumed as the Throwable argument.
            logger.error(" exception ", e);
            return null;
        }
    }

    /**
     * Schedules one request per non-null URL.
     *
     * @param counter  synchronizer shared by all requests of this submission
     * @param callback receiver for the parsed results
     * @param urls     article URLs, may be {@code null} or empty
     */
    private void start(GroupSync counter, MediaSelfSourceDataCallBack callback, String... urls) {
        if (urls == null || urls.length == 0) {
            return;
        }
        for (String url : urls) {
            // The add()/done() pair surrounds the scheduling step of this URL.
            counter.add();
            if (url != null) {
                try {
                    search(counter, url, Attribution.of(url), callback);
                } catch (Exception e) {
                    logger.error("搜索创建出错", e);
                }
            }
            counter.done();
        }
    }

    /**
     * Sends the asynchronous request for a single article and parses the response.
     *
     * @param counter  synchronizer; {@code add()} is called before the request is
     *                 scheduled and {@code done()} once its completion handler ends
     * @param url      article URL as submitted by the caller
     * @param attr     attribution object created from the submitted URL
     * @param callback receiver for the parsed result
     * @return the same {@code counter} instance
     */
    private GroupSync search(GroupSync counter, String url, Attribution attr,
            MediaSelfSourceDataCallBack callback) {
        logger.info("当前处理 URL: {}", url);
        Map<String, Object> headers = new HashMap<>();
        if (url.contains("toutiao.com")) {
            headers.put("referer", url);
        }
        headers.put("Connection", "close");

        String target = dealUrl(url);
        if (Objects.isNull(target)) {
            return counter;
        }

        Request request = RequestUtils.wrapGet(target, headers);
        counter.add();
        try {
            httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY).whenComplete((rs, ex) -> {
                try {
                    if (Objects.isNull(ex)) {
                        try {
                            parseHtml(rs.body().string(), attr, callback);
                        } catch (Exception e) {
                            logger.error("解析出错", e);
                        }
                    } else {
                        logger.info("{} 搜索结果访问失败: {}", request.url().url(), ex);
                    }
                } finally {
                    counter.done();
                }
            });
        } catch (RuntimeException e) {
            // If the request could not even be scheduled, no completion handler will
            // ever run; balance the add() above so that waiters are not left hanging.
            counter.done();
            throw e;
        }
        return counter;
    }

    /**
     * Maps a public article URL to the URL that is actually requested.
     *
     * <p>URLs of wap.peopleapp.com, a.mp.uc.cn (with a {@code wm_cid} parameter) and
     * tznew.58.com are rewritten to a different endpoint built from an id extracted
     * from the original URL; every other {@code http...} URL is returned unchanged.
     *
     * @param url article URL
     * @return the URL to request, or {@code null} if {@code url} does not start with
     *         {@code "http"} or the expected id could not be extracted
     */
    private String dealUrl(String url) {
        try {
            if (!url.startsWith("http")) {
                return null;
            }
            if (url.contains("wap.peopleapp.com/article")) {
                return "https://app.peopleapp.com/WapApi/610/ArtInfoApi/getInfoUp?id="
                        + url.split("article")[1].split("/")[1];
            }
            if (url.contains("a.mp.uc.cn/") && url.contains("wm_cid=")) {
                return "https://ff.dayu.com/contents/" + url.split("wm_cid=")[1].split("&")[0]
                        + "?biz_id=1002&_fetch_author=1";
            }
            if (url.contains("tznew.58.com")) {
                return "https://tznew.58.com/tznew/c/info-detail?infoid="
                        + url.split("infoid=")[1].split("&")[0];
            }
            return url;
        } catch (RuntimeException e) {
            // Typically an index error because the id segment is missing; the URL is
            // skipped, but no longer silently.
            logger.warn("dealUrl could not rewrite {}", url, e);
            return null;
        }
    }

    /**
     * Extracts source and channel from a fetched page and hands them to the callback.
     *
     * <p>If matching throws, the error is logged and the source is reported as
     * {@code null}. An empty source string is normalized to {@code null}.
     *
     * @param result   response body of the article request
     * @param attr     attribution of the request; {@code attr.get().toString()} is used as the URL
     * @param callback receiver for the result; when {@code null} only a warning is logged
     */
    private void parseHtml(String result, Attribution attr,
            MediaSelfSourceDataCallBack callback) {
        String source = null;
        String channel = null;
        try {
            source = MatchSource.matchMediaSelfSource(attr.get().toString(), result);
            if ("".equals(source)) {
                source = null;
            }
            channel = MatchChannel.verifyChannel(attr.get().toString());
            if (channel == null) {
                // Fall back to the nodes of the document's <head>.
                List<Node> nodeList = Jsoup.parse(result).head().childNodes();
                channel = MatchChannel.matchChannel(nodeList);
            }
        } catch (Exception e) {
            logger.error("exception ", e);
            source = null;
        }
        logger.info(attr.get() + "=================来源" + source);
        MediaSelfSourceBean msfb = new MediaSelfSourceBean(attr.get().toString(), source, channel);
        if (callback == null) {
            logger.warn("DataCallback 对象为 null,无法保存数据");
        } else {
            callback.onData(msfb, attr);
        }
    }
}
package com.zhiwei.source_forward.crawler;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Node;
import com.zhiwei.async.GroupSync;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution;
import com.zhiwei.source_forward.util.MatchChannel;
import com.zhiwei.source_forward.util.MatchSource;
import com.zhiwei.source_forward.util.MediaSelfSourceDataCallBack;
import okhttp3.Request;
/**
 * Fetches article pages of self-media accounts and reports, for every URL, the matched
 * source and channel through a {@link MediaSelfSourceDataCallBack}.
 *
 * <p>Requests are sent asynchronously through a shared {@link HttpBoot}. The
 * {@link GroupSync} handed back by {@link #submitTask} has {@code add()} called before
 * each request is scheduled and {@code done()} called when its completion handler ends.
 *
 * @author byte-zbs
 * @version 1.0.0
 */
public class MediaSelfSourceCrawler {

    private static final Logger logger = LogManager.getLogger(MediaSelfSourceCrawler.class);

    /** Shared HTTP client, built with {@code retryTimes(2)}. */
    private static final HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();

    /**
     * Submits the given URLs for crawling.
     *
     * @param callback receives one {@link MediaSelfSourceBean} per successfully fetched page
     * @param urls     article URLs; {@code null} entries are skipped
     * @return the {@link GroupSync} tracking the submitted requests, or {@code null} if
     *         setting up the tasks threw an exception
     */
    public GroupSync submitTask(MediaSelfSourceDataCallBack callback, String... urls) {
        try {
            GroupSync counter = new GroupSync();
            start(counter, callback, urls);
            return counter;
        } catch (Exception e) {
            // The original message carried a "{}" that was never filled because the
            // exception is consumed as the Throwable argument.
            logger.error(" exception ", e);
            return null;
        }
    }

    /**
     * Schedules one request per non-null URL.
     *
     * @param counter  synchronizer shared by all requests of this submission
     * @param callback receiver for the parsed results
     * @param urls     article URLs, may be {@code null} or empty
     */
    private void start(GroupSync counter, MediaSelfSourceDataCallBack callback, String... urls) {
        if (urls == null || urls.length == 0) {
            return;
        }
        for (String url : urls) {
            // The add()/done() pair surrounds the scheduling step of this URL.
            counter.add();
            if (url != null) {
                try {
                    search(counter, url, Attribution.of(url), callback);
                } catch (Exception e) {
                    logger.error("搜索创建出错", e);
                }
            }
            counter.done();
        }
    }

    /**
     * Sends the asynchronous request for a single article and parses the response.
     *
     * @param counter  synchronizer; {@code add()} is called before the request is
     *                 scheduled and {@code done()} once its completion handler ends
     * @param url      article URL as submitted by the caller
     * @param attr     attribution object created from the submitted URL
     * @param callback receiver for the parsed result
     * @return the same {@code counter} instance
     */
    private GroupSync search(GroupSync counter, String url, Attribution attr,
            MediaSelfSourceDataCallBack callback) {
        logger.info("当前处理 URL: {}", attr.get());
        Map<String, Object> headers = new HashMap<>();
        if (url.contains("toutiao.com")) {
            headers.put("referer", url);
        }

        String target = dealUrl(url);
        if (Objects.isNull(target)) {
            return counter;
        }

        Request request = RequestUtils.wrapGet(target, headers);
        counter.add();
        try {
            httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY).whenComplete((rs, ex) -> {
                try {
                    if (Objects.isNull(ex)) {
                        try {
                            parseHtml(rs.body().string(), attr, callback);
                        } catch (Exception e) {
                            logger.error("解析出错", e);
                        }
                    } else {
                        logger.info("{} 搜索结果访问失败: {}", request.url().url(), ex);
                    }
                } finally {
                    counter.done();
                }
            });
        } catch (RuntimeException e) {
            // If the request could not even be scheduled, no completion handler will
            // ever run; balance the add() above so that waiters are not left hanging.
            counter.done();
            throw e;
        }
        return counter;
    }

    /**
     * Maps a public article URL to the URL that is actually requested.
     *
     * <p>URLs of wap.peopleapp.com, a.mp.uc.cn (with a {@code wm_cid} parameter) and
     * tznew.58.com are rewritten to a different endpoint built from an id extracted
     * from the original URL; every other {@code http...} URL is returned unchanged.
     *
     * @param url article URL
     * @return the URL to request, or {@code null} if {@code url} does not start with
     *         {@code "http"} or the expected id could not be extracted
     */
    private String dealUrl(String url) {
        try {
            if (!url.startsWith("http")) {
                return null;
            }
            if (url.contains("wap.peopleapp.com/article")) {
                return "https://app.peopleapp.com/WapApi/610/ArtInfoApi/getInfoUp?id="
                        + url.split("article")[1].split("/")[1];
            }
            if (url.contains("a.mp.uc.cn/") && url.contains("wm_cid=")) {
                return "https://ff.dayu.com/contents/" + url.split("wm_cid=")[1].split("&")[0]
                        + "?biz_id=1002&_fetch_author=1";
            }
            if (url.contains("tznew.58.com")) {
                return "https://tznew.58.com/tznew/c/info-detail?infoid="
                        + url.split("infoid=")[1].split("&")[0];
            }
            return url;
        } catch (RuntimeException e) {
            // Typically an index error because the id segment is missing; the URL is
            // skipped, but no longer silently.
            logger.warn("dealUrl could not rewrite {}", url, e);
            return null;
        }
    }

    /**
     * Extracts source and channel from a fetched page and hands them to the callback.
     *
     * <p>If matching throws, the error is logged and the source is reported as
     * {@code null}.
     *
     * @param result   response body of the article request
     * @param attr     attribution of the request; {@code attr.get().toString()} is used as the URL
     * @param callback receiver for the result; when {@code null} only a warning is logged
     */
    private void parseHtml(String result, Attribution attr,
            MediaSelfSourceDataCallBack callback) {
        String source = null;
        String channel = null;
        String url = attr.get().toString();
        try {
            source = MatchSource.matchMediaSelfSource(url, result);
            logger.info(url + "=======" + source);
            channel = MatchChannel.verifyChannel(url);
            if (channel == null) {
                // Fall back to the nodes of the document's <head>.
                List<Node> nodeList = Jsoup.parse(result).head().childNodes();
                channel = MatchChannel.matchChannel(nodeList);
            }
        } catch (Exception e) {
            logger.error("exception ", e);
            source = null;
        }
        MediaSelfSourceBean msfb = new MediaSelfSourceBean(url, source, channel);
        if (callback == null) {
            logger.warn("DataCallback 对象为 null,无法保存数据");
        } else {
            callback.onData(msfb, attr);
        }
    }
}
package com.zhiwei.source_forward.run;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.source_forward.bean.SourceForwardBean;
import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution;
import com.zhiwei.source_forward.crawler.SourceForwardCrawler;
import com.zhiwei.source_forward.util.SourceForwardDataCallBack;
/**
 * Entry points for checking whether articles are reposts ("转发") or originals ("原创").
 *
 * @author hero
 */
public class SourceForward {

    private static Logger logger = LogManager.getLogger(SourceForward.class);

    /**
     * Crawls every URL used as a key of {@code dataMap} and enriches the matching value
     * map in place.
     *
     * <p>For each crawled URL present in {@code dataMap} the key "是否转发" is set. It is
     * "原创" when the crawled root source is {@code null} or equals the map's "来源" value
     * ignoring case and surrounding whitespace, otherwise "转发". For URLs containing
     * {@code mp.weixin.qq.com} the bean's own {@code getIsforward()} value is used
     * instead; for all other URLs "原来源" and "频道" are filled in as well.
     *
     * @param dataMap URL to record map; returned unchanged when {@code null} or empty
     * @return the same {@code dataMap} instance
     */
    public static Map<String, Map<String, Object>> getSourceForward(Map<String, Map<String, Object>> dataMap) {
        if (dataMap == null || dataMap.isEmpty()) {
            return dataMap;
        }
        List<String> urlList = new ArrayList<>(dataMap.keySet());
        List<SourceForwardBean> dataList = SourceForwardCrawlerThread.getSourceForward(urlList);
        for (SourceForwardBean sfb : dataList) {
            String url = sfb.getUrl();
            Map<String, Object> data = dataMap.get(url);
            if (data == null) {
                // URL not requested by the caller (or mapped to null): nothing to enrich.
                continue;
            }
            String rootSource = sfb.getRoot_source();
            String source = String.valueOf(data.get("来源"));
            String isForward = "转发";
            if (rootSource == null
                    || rootSource.toUpperCase().trim().equals(source.toUpperCase().trim())) {
                isForward = "原创";
            }
            if (url.contains("mp.weixin.qq.com")) {
                isForward = sfb.getIsforward();
            } else {
                data.put("原来源", rootSource);
                data.put("频道", sfb.getChannel());
            }
            // 'data' is the map already stored under 'url', so no re-put is needed.
            data.put("是否转发", isForward);
        }
        return dataMap;
    }

    /**
     * Crawls the given URLs and returns the collected results.
     *
     * @param urlList URLs to check
     * @return the beans delivered by the crawler callback; empty if crawling failed
     */
    public static List<SourceForwardBean> getSourceForward(List<String> urlList) {
        return SourceForwardCrawlerThread.getSourceForward(urlList);
    }

    /** Manual smoke test against a single hard-coded URL. */
    public static void main(String[] args) {
        ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER, 10000002);
        List<String> urlList = new ArrayList<>();
        urlList.add("http://software.it168.com/a2019/0621/6005/000006005693.shtml");
        List<SourceForwardBean> da = SourceForward.getSourceForward(urlList);
        for (SourceForwardBean sfb : da) {
            System.out.println(sfb.toString());
        }
    }

    /** Holder of the blocking crawl helper; it is only used through its static method here. */
    static class SourceForwardCrawlerThread extends Thread {

        /**
         * Submits all URLs to a new {@link SourceForwardCrawler} and waits for completion.
         *
         * @param urlList URLs to crawl
         * @return synchronized list of the beans received so far; never {@code null}
         */
        private static List<SourceForwardBean> getSourceForward(List<String> urlList) {
            List<SourceForwardBean> list = Collections.synchronizedList(new ArrayList<SourceForwardBean>());
            try {
                SourceForwardCrawler crawler = new SourceForwardCrawler();
                SourceForwardDataCallBack callback = new SourceForwardDataCallBack() {
                    @Override
                    public void onData(SourceForwardBean data, Attribution attr) {
                        list.add(data);
                        logger.info("列表大小:::{}", list.size());
                    }
                };
                crawler.submitTask(callback, urlList.toArray(new String[urlList.size()])).await();
            } catch (Exception e) {
                if (e instanceof InterruptedException) {
                    // Keep the interrupt visible to the caller instead of swallowing it.
                    Thread.currentThread().interrupt();
                }
                logger.error(" 来源判断 出错 ", e);
            }
            return list;
        }
    }
}
package com.zhiwei.source_forward.run;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import com.zhiwei.source_forward.util.ProxyInit;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.zhiwei.source_forward.bean.SourceForwardBean;
import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution;
import com.zhiwei.source_forward.crawler.SourceForwardCrawler;
import com.zhiwei.source_forward.util.SourceForwardDataCallBack;
/**
 * Entry points for checking whether articles are reposts ("转发") or originals ("原创").
 *
 * @author hero
 */
public class SourceForward {

    private static Logger logger = LogManager.getLogger(SourceForward.class);

    /**
     * Crawls every URL used as a key of {@code dataMap} and enriches the matching value
     * map in place.
     *
     * <p>For each crawled URL present in {@code dataMap} the key "是否转发" is set. It is
     * "原创" when the crawled root source is {@code null} or equals the map's "来源" value
     * ignoring case and surrounding whitespace, otherwise "转发". For URLs containing
     * {@code mp.weixin.qq.com} the bean's own {@code getIsforward()} value is used
     * instead; for all other URLs "原来源" and "频道" are filled in as well.
     *
     * @param dataMap URL to record map; returned unchanged when {@code null} or empty
     * @return the same {@code dataMap} instance
     */
    public static Map<String, Map<String, Object>> getSourceForward(Map<String, Map<String, Object>> dataMap) {
        if (dataMap == null || dataMap.isEmpty()) {
            return dataMap;
        }
        List<String> urlList = new ArrayList<>(dataMap.keySet());
        List<SourceForwardBean> dataList = SourceForwardCrawlerThread.getSourceForward(urlList);
        for (SourceForwardBean sfb : dataList) {
            String url = sfb.getUrl();
            Map<String, Object> data = dataMap.get(url);
            if (data == null) {
                // URL not requested by the caller (or mapped to null): nothing to enrich.
                continue;
            }
            String rootSource = sfb.getRoot_source();
            String source = String.valueOf(data.get("来源"));
            String isForward = "转发";
            if (rootSource == null
                    || rootSource.toUpperCase().trim().equals(source.toUpperCase().trim())) {
                isForward = "原创";
            }
            if (url.contains("mp.weixin.qq.com")) {
                isForward = sfb.getIsforward();
            } else {
                data.put("原来源", rootSource);
                data.put("频道", sfb.getChannel());
            }
            // 'data' is the map already stored under 'url', so no re-put is needed.
            data.put("是否转发", isForward);
        }
        return dataMap;
    }

    /**
     * Crawls the given URLs and returns the collected results.
     *
     * @param urlList URLs to check
     * @return the beans delivered by the crawler callback; empty if crawling failed
     */
    public static List<SourceForwardBean> getSourceForward(List<String> urlList) {
        return SourceForwardCrawlerThread.getSourceForward(urlList);
    }

    /** Manual smoke test against a single hard-coded URL. */
    public static void main(String[] args) {
        ProxyInit.initProxy();
        List<String> urlList = new ArrayList<>();
        urlList.add("http://software.it168.com/a2019/0621/6005/000006005693.shtml");
        List<SourceForwardBean> da = SourceForward.getSourceForward(urlList);
        for (SourceForwardBean sfb : da) {
            System.out.println(sfb.toString());
        }
    }

    /** Holder of the blocking crawl helper; it is only used through its static method here. */
    static class SourceForwardCrawlerThread extends Thread {

        /**
         * Submits all URLs to a new {@link SourceForwardCrawler} and waits for completion.
         *
         * @param urlList URLs to crawl
         * @return synchronized list of the beans received so far; never {@code null}
         */
        private static List<SourceForwardBean> getSourceForward(List<String> urlList) {
            List<SourceForwardBean> list = Collections.synchronizedList(new ArrayList<SourceForwardBean>());
            try {
                SourceForwardCrawler crawler = new SourceForwardCrawler();
                SourceForwardDataCallBack callback = new SourceForwardDataCallBack() {
                    @Override
                    public void onData(SourceForwardBean data, Attribution attr) {
                        list.add(data);
                        logger.info("列表大小:::{}", list.size());
                    }
                };
                crawler.submitTask(callback, urlList.toArray(new String[urlList.size()])).await();
            } catch (Exception e) {
                if (e instanceof InterruptedException) {
                    // Keep the interrupt visible to the caller instead of swallowing it.
                    Thread.currentThread().interrupt();
                }
                logger.error(" 来源判断 出错 ", e);
            }
            return list;
        }
    }
}
package com.zhiwei.source_forward.run;
import java.io.IOException;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.zhiwei.async.TaskBoot;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.source_forward.bean.UrlLiveBean;
import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution;
import com.zhiwei.source_forward.crawler.UrlLiveCrawler;
import com.zhiwei.source_forward.crawler.UrlLiveCrawlerNew;
import com.zhiwei.source_forward.util.UrlLiveDataCallback;
import okhttp3.Request;
import okhttp3.Response;
/**
 * Entry points for checking whether article links have been deleted.
 *
 * @author hero
 */
public class URLLive {

    private static Logger logger = LogManager.getLogger(URLLive.class);

    // NOTE(review): not referenced anywhere in this class; kept because building it may
    // have side effects that other code relies on — confirm before removing.
    private static HttpBoot httpBoot = new HttpBoot.Builder().build();

    /**
     * Crawls every URL used as a key of {@code dataMap} and enriches the matching value
     * map in place.
     *
     * <p>For each crawled URL present in {@code dataMap}, "是否删除" is set to {@code true}
     * when the bean's {@code isLive()} is 1 and to {@code false} when it is 0 (any other
     * value leaves the key untouched), and "title" is set to the bean's title.
     *
     * @param dataMap URL to record map; returned unchanged when {@code null} or empty
     * @return the same {@code dataMap} instance
     */
    public static Map<String, Map<String, Object>> verificationURLLive(Map<String, Map<String, Object>> dataMap) {
        if (dataMap == null || dataMap.isEmpty()) {
            return dataMap;
        }
        List<String> urlList = new ArrayList<>(dataMap.keySet());
        logger.info("urls to verify: {}", urlList.size());
        List<UrlLiveBean> dataList = UrlLiveCrawlerThread.getUrlLiveCrawle(urlList);
        for (UrlLiveBean ub : dataList) {
            String url = ub.getUrl();
            int live = ub.isLive();
            Map<String, Object> map = dataMap.get(url);
            if (map == null) {
                // URL not requested by the caller (or mapped to null): nothing to enrich.
                continue;
            }
            if (live == 1) {
                map.put("是否删除", true);
            } else if (live == 0) {
                map.put("是否删除", false);
            }
            // 'map' is the instance already stored under 'url', so no re-put is needed.
            map.put("title", ub.getTitle());
        }
        return dataMap;
    }

    /**
     * Crawls the given URLs and returns one bean per checked link.
     *
     * <p>NOTE(review): the original comment documented the status codes as 1 = deleted,
     * 2 = not deleted, -1 = access failed, while the map overload of this method treats
     * 0 as "not deleted" — confirm the actual values against {@link UrlLiveBean}.
     *
     * @param urlList URLs to check
     * @return the beans delivered by the crawler callback; empty if crawling failed
     */
    public static List<UrlLiveBean> verificationURLLive(List<String> urlList) {
        return UrlLiveCrawlerThread.getUrlLiveCrawle(urlList);
    }

    /** Manual smoke test against a single hard-coded URL. */
    public static void main(String[] args) {
        ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER, 10000002);
        List<String> urlList = new ArrayList<>();
        urlList.add("http://a.mp.uc.cn/article.html?uc_param_str=frdnsnpfvecpntnwprdssskt#!wm_aid=038b8207b444418c845f43e4d2d3a754");
        List<UrlLiveBean> u = URLLive.verificationURLLive(urlList);
        for (UrlLiveBean b : u) {
            System.out.println(b.toString());
        }
    }

    /** Holder of the blocking crawl helper; it is only used through its static method here. */
    static class UrlLiveCrawlerThread extends Thread {

        /**
         * Submits all URLs to a new {@link UrlLiveCrawler} and waits for completion.
         *
         * @param urlList URLs to crawl
         * @return synchronized list of the beans received so far; never {@code null}
         */
        private static List<UrlLiveBean> getUrlLiveCrawle(List<String> urlList) {
            List<UrlLiveBean> list = Collections.synchronizedList(new ArrayList<UrlLiveBean>());
            try {
                UrlLiveCrawler crawler = new UrlLiveCrawler();
                UrlLiveDataCallback callback = new UrlLiveDataCallback() {
                    @Override
                    public void onData(UrlLiveBean data, Attribution attr) {
                        list.add(data);
                        logger.info("列表大小:::{}", list.size());
                    }
                };
                crawler.submitTask(callback, urlList.toArray(new String[urlList.size()])).await();
            } catch (Exception e) {
                if (e instanceof InterruptedException) {
                    // Keep the interrupt visible to the caller instead of swallowing it.
                    Thread.currentThread().interrupt();
                }
                logger.error(" 数据采集运行有问题 ", e);
            }
            return list;
        }
    }
}
package com.zhiwei.source_forward.run;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import com.zhiwei.source_forward.util.ProxyInit;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.source_forward.bean.UrlLiveBean;
import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution;
import com.zhiwei.source_forward.crawler.UrlLiveCrawler;
import com.zhiwei.source_forward.util.UrlLiveDataCallback;
/**
 * Entry points for checking whether article links have been deleted.
 *
 * @author hero
 */
public class URLLive {

    private static Logger logger = LogManager.getLogger(URLLive.class);

    // NOTE(review): not referenced anywhere in this class; kept because building it may
    // have side effects that other code relies on — confirm before removing.
    private static HttpBoot httpBoot = new HttpBoot.Builder().build();

    /**
     * Crawls every URL used as a key of {@code dataMap} and enriches the matching value
     * map in place.
     *
     * <p>For each crawled URL present in {@code dataMap}, "是否删除" is set to {@code true}
     * when the bean's {@code isLive()} is 1 and to {@code false} when it is 0 (any other
     * value leaves the key untouched), and "title" is set to the bean's title.
     *
     * @param dataMap URL to record map; returned unchanged when {@code null} or empty
     * @return the same {@code dataMap} instance
     */
    public static Map<String, Map<String, Object>> verificationURLLive(Map<String, Map<String, Object>> dataMap) {
        if (dataMap == null || dataMap.isEmpty()) {
            return dataMap;
        }
        List<String> urlList = new ArrayList<>(dataMap.keySet());
        logger.info("urls to verify: {}", urlList.size());
        List<UrlLiveBean> dataList = UrlLiveCrawlerThread.getUrlLiveCrawle(urlList);
        for (UrlLiveBean ub : dataList) {
            String url = ub.getUrl();
            int live = ub.isLive();
            Map<String, Object> map = dataMap.get(url);
            if (map == null) {
                // URL not requested by the caller (or mapped to null): nothing to enrich.
                continue;
            }
            if (live == 1) {
                map.put("是否删除", true);
            } else if (live == 0) {
                map.put("是否删除", false);
            }
            // 'map' is the instance already stored under 'url', so no re-put is needed.
            map.put("title", ub.getTitle());
        }
        return dataMap;
    }

    /**
     * Crawls the given URLs and returns one bean per checked link.
     *
     * <p>NOTE(review): the original comment documented the status codes as 1 = deleted,
     * 2 = not deleted, -1 = access failed, while the map overload of this method treats
     * 0 as "not deleted" — confirm the actual values against {@link UrlLiveBean}.
     *
     * @param urlList URLs to check
     * @return the beans delivered by the crawler callback; empty if crawling failed
     */
    public static List<UrlLiveBean> verificationURLLive(List<String> urlList) {
        return UrlLiveCrawlerThread.getUrlLiveCrawle(urlList);
    }

    /** Manual smoke test against a single hard-coded URL. */
    public static void main(String[] args) {
        ProxyInit.initProxy();
        List<String> urlList = new ArrayList<>();
        urlList.add("http://a.mp.uc.cn/article.html?uc_param_str=frdnsnpfvecpntnwprdssskt#!wm_aid=038b8207b444418c845f43e4d2d3a754");
        List<UrlLiveBean> u = URLLive.verificationURLLive(urlList);
        for (UrlLiveBean b : u) {
            System.out.println(b.toString());
        }
    }

    /** Holder of the blocking crawl helper; it is only used through its static method here. */
    static class UrlLiveCrawlerThread extends Thread {

        /**
         * Submits all URLs to a new {@link UrlLiveCrawler} and waits for completion.
         *
         * @param urlList URLs to crawl
         * @return synchronized list of the beans received so far; never {@code null}
         */
        private static List<UrlLiveBean> getUrlLiveCrawle(List<String> urlList) {
            List<UrlLiveBean> list = Collections.synchronizedList(new ArrayList<UrlLiveBean>());
            try {
                UrlLiveCrawler crawler = new UrlLiveCrawler();
                UrlLiveDataCallback callback = new UrlLiveDataCallback() {
                    @Override
                    public void onData(UrlLiveBean data, Attribution attr) {
                        list.add(data);
                        logger.info("列表大小:::{}", list.size());
                    }
                };
                crawler.submitTask(callback, urlList.toArray(new String[urlList.size()])).await();
            } catch (Exception e) {
                if (e instanceof InterruptedException) {
                    // Keep the interrupt visible to the caller instead of swallowing it.
                    Thread.currentThread().interrupt();
                }
                logger.error(" 数据采集运行有问题 ", e);
            }
            return list;
        }
    }
}
package com.zhiwei.source_forward.util;
import java.util.List;
import java.util.Objects;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.source_forward.content.ContentExtractor;
import com.zhiwei.source_forward.content.News;
/**
* @ClassName: MatchSource
* @Description: 匹配来源
* @author hero
* @date 2018年6月30日 上午10:27:29
*/
public class MatchSource {
// Alternation of source-attribution labels (e.g. "来源:", "本文来源于", "来自:", "出自:",
// "转自:", "出处/作者:", "稿源:"), each followed by (\S)*(\s)*[\S]* — a run of
// non-whitespace, optional whitespace, then another run of non-whitespace.
// Every alternative is its own capturing group, so group numbers depend on the order below.
// NOTE(review): several alternatives look textually identical here (e.g. "来源:" twice,
// "出处:" three times); presumably they once differed in full-width vs half-width
// punctuation — verify against the repository before de-duplicating.
private static String fromRegex = "(来源:(\\S)*(\\s)*[\\S]*)|(来源:(\\S)*(\\s)*[\\S]*)|(本文来源于(\\S)*(\\s)*[\\S]*)"
+ "|(源:(\\S)*(\\s)*[\\S]*)|(来自:(\\S)*(\\s)*[\\S]*)|(来自:(\\S)*(\\s)*[\\S]*)"
+ "|(\\[来源\\]:(\\S)*(\\s)*[\\S]*)|(\\[来源\\]:(\\S)*(\\s)*[\\S]*)"
+ "|(出自:(\\S)*(\\s)*[\\S]*)|(出自:(\\S)*(\\s)*[\\S]*)" + "|(转自:(\\S)*(\\s)*[\\S]*)|(转自:(\\S)*(\\s)*[\\S]*)"
+ "|(出处\\/作者:(\\S)*(\\s)*[\\S]*)|(出处\\/作者:(\\S)*(\\s)*[\\S]*)"
+ "|(出处:(\\S)*(\\s)*[\\S]*)|(出处:(\\S)*(\\s)*[\\S]*)|(稿源:(\\S)*(\\s)*[\\S]*)|(出处:(\\S)*(\\s)*[\\S]*)";
// Alternation of date / date-time shapes: numeric year-month-day with 1-5 arbitrary
// non-digit separators (optionally followed by three more numeric parts), the Chinese
// "yyyy年M月d日" and "M月d日" forms with optional trailing numeric parts, and a generic
// "4 digits + non-digits + up to 2 digits ..." form.
// NOTE(review): the second alternative starts with exactly the first alternative's
// pattern, so when this regex is applied with find() the shorter first alternative
// matches first and the longer one is presumably never selected — confirm how the
// pattern is used before reordering.
private static String timeRegex = ""
+ "([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})"
+ "|([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-2]?[1-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-9]{1,2})"
+ "|(\\d{4}年\\d{0,2}月\\d{0,2}日\\D+\\d{0,2}\\D+\\d{0,2})"
+ "|(\\d{4}年\\d{0,2}月\\d{0,2}日\\D+\\d{0,2})"
+ "|(\\d{4}年\\d{0,2}月\\d{0,2}日)"
+ "|(\\d{0,2}月\\d{0,2}日\\D+\\d{0,2}\\D+\\d{0,2})"
+ "|(\\d{0,2}月\\d{0,2}日\\D+\\d{0,2})"
+ "|(\\d{0,2}月\\d{0,2}日)"
+ "|(\\d{4}\\D+\\d{0,2}\\D+\\d{0,2}\\D+\\d{0,2}\\D+\\d{0,2})"
+ "|(\\d{4}\\D+\\d{0,2}\\D+\\d{0,2}\\D+\\d{0,2})"
+ "|(\\d{4}\\D+\\d{0,2}\\D+\\d{0,2})"
;
/**
* @Title: findURLs
* @author hero
* @Description: (验证并匹配数据)
* @param @param
* s
* @param @param
* regex
* @param @return
* 设定文件
* @return String 返回类型
*/
public static String matchSource(String url,String html, List<String> sourceList) {
String source = null;
Document document = Jsoup.parse(html);
String htmlBody = TreateData.filterSpecialCharacter(document.select("body").text().toUpperCase());
try {
/***特定网站单独处理**/
if(url.contains("thepaper.cn")){
//单独处理澎湃数据
source = document.select("div.news_about").select("p").select("span").text().replaceAll(".*来源:", "");
if(source.length() == 0) {
source = document.select("div.news_about").text().replaceAll(" \\d{4}.*|.*/", "");
}
}else if(url.contains("sports.eastday.com")){
//单独处理东方体育网
source = document.select("div.article").select("span").text();
}else if(url.contains("lesports.com")){
//单独处理乐视网数据
source = document.select("div.article-source").select("strong").text();
}else if(url.contains("myzaker.com")){
//单独处理扎克网数据
source = document.select("div#article").select("span.auther").text();
}else if(url.contains("sina.com.cn") || url.contains("sohu.com")){
//单独处理新浪网
if(html.contains("<meta name=\"mediaid\"")){
source = html.split("<meta name=\"mediaid\" content=\"")[1].split("\"")[0];
}
}else if(url.contains("a.mini.eastday.com")){
//处理东方头条网-自媒体号匹配
// source = document.select("[class=\"share_cnt_p clearfix\"]").select("div.fl").select("i").get(1).text();
source = "东方头条";
}else if(url.contains("orz520.com")){
//千寻生活网解析
source = "千寻生活";
}else if(url.contains("sh.qihoo.com")){
//今日报点解析
source = "今日爆点";
}else if(url.contains("itouchtv.cn")){
//触电新闻解析
source = "触电新闻";
}else if(url.contains("yidianzixun.com")){
//一点资讯
if(html.contains("related_wemedia")){
source = "一点资讯";
}else{
source = html.split("source\":\"")[1].split("\",\"")[0];
}
}else if(url.contains("tech.china.com")){
//中华网科技
source = document.select("#chan_newsInfo").text().split("来源:")[1];
}else if(url.contains("caijing.com.cn")){
//财经网产经
source = document.select("#source_baidu").text();
}else if(url.contains("news.eastday.com")){
//单独处理东方网
source = document.select("div#sectionleft").select("div").select("p").select("a").text();
}else if(url.contains("ny.chinacenn.com")){
//单独处理中企网
source = document.select("td").select("span.ltutext3").text().replaceAll(" \\d{4}.*", "");
}else if(url.contains("ebrun.com")){
//单独处理亿邦动力网
source = document.select("div.post-header").select("p.source").select("span.f-left").text().replaceAll(".*来源: ", "");
}else if(url.contains("www.mnw.cn")){
//单独处理闽南网
source = document.select("div.il").select("span").text().replaceAll("来源:|\\d{4}.*", "");
}else if(url.contains("sn.cri.cn")){
//单独处理国际在线
source = document.select("span.asource").select("a").text();
}else if(url.contains("sh.sina.com.cn")){
//单独处理新浪上海
source = document.select("p.source-time").select("span").get(1).select("a").text();
}else if(url.contains("kaixian.tv")){
//单独处理汉丰网
source = document.select("div.content").select("h2.font_gray").text().replaceAll(".*来源:", "");
}else if(url.contains("lanjingtmt.com")){
//单独处理蓝鲸TMT
source = "蓝鲸TMT网";
}else if(url.contains("tech.huanqiu.com")){
//单独处理环球网
source = document.select("span.la_t_b").select("a").text();
}else if(url.contains("china.qianlong.com")){
//单独处理千龙网
source = document.select("span.source").select("a").text();
}else if(url.contains("m.mnw.cn")){
//单独处理手机闽南网
source = document.select("article.info").select("header").select("div").select("span").text().replaceAll("\\d{4}.*| ", "");
}else if(url.contains("mydrivers.com")){
//单独处理快科技
source = document.select("div.news_bt1_left").text().replaceAll(".*出处:| 作者:[\\w\\W]*", "");
}else if(url.contains("3dmgame.com")){
//单独处理3DMGAME
source = document.select("ul.intem").select("li").select("span.weibo").text();
}else if(url.contains("99it.com.cn")){
//单独处理99科技
source = document.select("div.mate").select("span").text().replaceAll(".*来源:|编辑.*", "");
}else if(url.contains("ciotimes.com")){
//单独处理CIO时代网
source = document.select("p.ly.visible-xs.text-left").text().replaceAll(".*来源:", "");
}else if(url.contains("ithome.com")){
//单独处理IT之家
source = document.select("span#source_baidu").select("a").text();
}else if(url.contains("techweb.com.cn")){
//单独处理TechWeb
source = document.select("span.from").select("a").text();
}else if(url.contains("cniteyes.com")){
//单独处理T客帮
source = document.select("div.item-date").select("span").text();
}else if(url.contains("enorth.com.cn")){
//单独处理北方网
source = document.select("p.col-sm-8.info").select("span").text().replaceAll(".*来源:|编辑.*", "");
}else if(url.contains("btime.com")){
//单独处理北京时间
source = document.select("span.col.cite").text();
}else if(url.contains("bianews.com")){
//单独处理鞭牛士
source = document.select("span.name.fl").text();
}else if(url.contains("dzwww.com")){
//单独处理大众网
source = document.select("div.layout").select("div.left").text().replaceAll(".*来源: |作者.*", "");
}else if(url.contains("dsb.cn")){
//单独处理电商报
source = document.select("div.new-content-info.clearfix").select("span").text().replaceAll(".*作者:", "");
}else if(url.contains("finance.eastmoney.com")){
//单独处理东方财富网
source = document.select("div.source.data-source").attr("data-source");
}else if(url.contains("emwap.eastmoney.com")){
//单独处理东方财富网客户端
source = document.select("div.where").select("span.source").attr("title");
}else if(url.contains("mini.eastday.com")){
//单独处理东方头条
source = document.select("div.article-src-time").select("span").text().replaceAll(".*来源:", "");
}else if(url.contains("tech.ifeng.com")){
//单独处理凤凰科技
source = document.select("p.p_time").select("span").select("span.ss03").text();
}else if(url.contains("finance.ifeng.com")){
//单独处理凤凰网
source = document.select("p.p_time").select("span").select("span").select("a").text();
if(Objects.isNull(source) || source.length() < 1) {
source = html.split("source\":\"")[1].split("\"")[0];
}
}else if(url.contains("iphone.265g.com")){
//单独处理265G网
source = document.select("div.article_info").select("span").text().replaceAll(".*来源:|QQ群号.*", "");
}else if(url.contains("yicai.com")){
//单独处理第一财经
source = document.select("div.title.f-pr").select("p").select("span").text();
}else if(url.contains("cnblogs.com")){
//单独处理博客园
source = document.select("div#come_from").text().replaceAll(".*来自:", "");
}else if(url.contains("chinaxiaokang.com")){
//单独处理中国小康网
source = document.select("span#arturl").select("a").text();
}else if(url.contains("chinabaogao.com")) {
//单独处理中国报告网
source = document.select("p.cbg-a-d-info").select("a").text().replaceAll("大 中 小 | ", "");
}else if(url.contains("anyv.net")) {
//单独处理爱妮微
source = document.select("span.cor666").select("a").text();
}else if(url.contains("yingxiao360.com")){
//单独处理第一赢销网
source = "第一赢销网";
}else if(url.contains("cctime.com")){
//单独处理飞象网
source = document.select("td.dateAndSource").text().replaceAll(".*\\d{2}|作 者.*| ", "");
}else if(url.contains("news.hexun.com")){
//单独处理和讯网
source = document.select("div.tip.fl").select("a").text();
}else if(url.contains("finance.jrj.com.cn")){
//单独处理金融界
source = document.select("p.inftop").select("span").get(1).select("a").text().replaceAll("价值.*| ", "");
}else if(url.contains("tech.china.com.cn")){
//单独处理中国网
source = document.select("span.fl.time2").select("a").text();
}else if(url.contains("news.china.com.cn")){
//单独处理中国网
source = document.select("div.pub_date").select("span#source_baidu").text().replaceAll(".*来源:", "");
}else if(url.contains("admin5.com")){
//单独处理站长网
source = document.select("div.source").select("span").text().replaceAll(".*来源:| ", "");
}else if(url.contains("stock.qq.com")){
//单独处理腾讯证券
source = document.select("div.a_Info").select("span.a_source").text();
}else if(url.contains("n.cztv.com")){
//单独处理新蓝网
source = document.select("div.publish").select("ul").select("li").text().replaceAll("\\d{4}.*", "");
}else if(url.contains("news.paidai.com")){
//单独处理派代网
source = document.select("p.t_info").select("span").select("a").text();
}else if(url.contains("news.mydrivers.com")){
//单独处理快科技
source = document.select("div.news_bt1_left").text().replaceAll(".*出处:| 作者.*", "");
}else if(url.contains("www.chinaz.com")){
//单独处理站长之家
source = document.select("div.meta").select("span.source").select("a").text();
}else if(url.contains("yuncaijing.com")){
//单独处理云财经
source = document.select("section.news-wrap").select("header").select("div").text().replaceAll(".*消息来源: |\\[阅读原文.*| ", "");
}else if(url.contains("itmsc.cn")){
//单独处理科技传媒网
source = document.select("div.arc_sc").select("p").select("a").text();
}else if(url.contains("nbd.com.cn")){
//单独处理每日经济新闻
source = document.select("span.source").text();
}else if(url.contains("pintu360.com")){
//单独处理品途商业评论
source = "品途商业评论";
}else if(url.contains("news.qudong.com")){
//单独处理驱动中国
source = document.select("div.news_right").select("dd").select("li").select("span").select("a").text().replaceAll(" .*", "");
}else if(url.contains("shobserver.com")){
//单独处理上海观察
source = document.select("span.max-words").get(0).text();
}else if(url.contains("g.pconline.com.cn")){
//单独处理太平洋电脑网
source = document.select("div.art-info").text().replaceAll("手机|\\d{4}.*| ", "");
}else if(url.contains("news.xtol.cn")){
//单独处理湘潭在线
source = document.select("span.date").text().replaceAll(".*来源:", "");
}else if(url.contains("bjnews.com.cn")){
//单独处理新京报网
source = document.select("span.author").text().replaceAll(" 记者.*", "");
}else if(url.contains("telworld.com.cn")){
//单独处理运营商世界
source = document.select("div.news_xiang_tit_2_left").select("a").text();
}else if(url.contains("thehour.cn")){
//单独处理浙江24小时
source = document.select("div.newsInfo").select("span").select("a").text();
}else if(url.contains("sh.zol.com.cn")){
//单独处理中关村在线
source = document.select("div.article-aboute").select("span.source_baidu").text();
}else if(url.contains("ec.com.cn")){
//单独处理中国国际电子商务网
source = document.select("span.article_resource").text().replaceAll(".*来源:", "");
}else if(url.contains("cqn.com.cn")){
//单独处理中国质量新闻网
source = document.select("span.from").text().replaceAll("-.*", "");
}else if(url.contains("sc.stock.cnfol.com")){
//单独处理中金在线
source = document.select("div.artDes").select("span").select("a").text();
}else if(url.contains("zczj.com")){
//单独处理众筹之家
source = document.select("div.news-info").select("span").text().replaceAll("来源:|作者.*", "");
}else if(url.contains("cqcb.com")){
//单独处理重庆晨报
source = document.select("span.label_nr").text();
}else if(url.contains("stock.10jqka.com.cn")){
//单独处理重庆晨报
source = document.select("span.label_nr").text();
}else if(url.contains("jiemian.com") ){
//单独处理界面新闻
// source = document.select("div.article-info").select("span").text().replaceAll(".*来源:| 字体[\\w\\W]*", "");
return "界面新闻";
}else if(url.contains("finance.youth.cn")){
//单独处理中国青年网
source = document.select("span#source_baidu").text().replaceAll("来源:|作者.*", "");
}else if(url.contains("china.com")) {
//中国金融商报
source = document.select("#chan_newsInfo > a").text();
}else if(url.contains("xw.qq.com")) {
//腾讯网客户端
source = document.select("div.tpl_header_author").text();
}else if(url.contains("china.prcfe.com")) {
source = html.split("\"")[1];
}
if(Objects.nonNull(source) && source.length() != 0) {
return source;
}
else{
//其他网站处理
source = mathchOtherSource(html, htmlBody, sourceList);
if(source!=null){
//验证来源
for (String sourceMatch : sourceList) {
if (source.contains(sourceMatch)) {
return sourceMatch;
}
}
}
}
} catch (Exception e) {
e.toString();
}
return null;
}
/**
* Extracts the self-media (publisher account) name from a fetched page by applying
* a site-specific rule selected from the URL.
* <p>
* The branches below are tested in order with {@code url.contains(...)}, so the first
* matching substring wins and more specific hosts must stay ahead of more general ones
* (for example {@code app.myzaker.com} before {@code myzaker.com}, and
* {@code mp.qq.com} / {@code v.qq.com} before {@code qq.com/}).
* Most branches prepend a platform label followed by {@code -} to the extracted name.
* <p>
* NOTE(review): {@code Jsoup.parse(html)} runs before the {@code try} block, so a
* failure there is not converted to {@code null} like the failures inside it.
*
* @author hero
* @param url  address of the page; only used to choose the extraction rule
* @param html raw response body; HTML for most branches, a JSON document for the
*             branches that pass it to {@code JSONObject.parseObject}
* @return the extracted account name (usually with a platform prefix); the unprefixed
*         extraction result when it fails the branch's length/emptiness check;
*         {@code null} when no branch matches the URL or when any exception is thrown
*         while extracting (the stack trace is printed)
*/
public static String matchMediaSelfSource(String url,String html) {
String source = null;
Document document = Jsoup.parse(html);
try {
/*** Site-specific handling **/
if(url.contains("toutiao.com")){
// Toutiao account matching: the name is cut out of inline script text
if(html.contains("name: '") && html.contains("mediaInfo")){
source = html.split("mediaInfo:")[1].split("name: '")[1].split("',")[0].trim();
}else if(html.contains("name: '") && html.contains("ugcInfo")){
source = html.split("ugcInfo:")[1].split("name: '")[1].split("',")[0].trim();
}else if(html.contains("screen_name:")){
source = html.split("screen_name:'")[1].split("',")[0].trim();
}
// NOTE(review): length()>1 means a one-character name is returned without the
// prefix; the same check is used by many branches below - confirm this is intended.
if(source!=null && source.length()>1){
source = "今日头条-" + source;
}
}else if(url.contains("sohu.com")){
// Sohu self-media account, read from the "mediaid" meta tag
if(html.contains("<meta name=\"mediaid\"")){
source = html.split("<meta name=\"mediaid\" content=\"")[1].split("\"")[0].trim();
if(source!=null && source.length()>1){
source = "搜狐-" + source;
}
}
}else if(url.contains("tznew.58.com")){
// 58: the response is parsed as JSON, author is at result.author
source = JSONObject.parseObject(html).getJSONObject("result").getString("author");
if(source!=null && source.length()>1){
source = "58-" + source;
}
}else if(url.contains("c.m.163.com")){
// NetEase News mobile page
source = document.select("section.g-article.js-article > div.js-article-inner > div > b").text();
if(source!=null && source.length()>1){
source = "网易新闻-" + source;
}
}else if(url.contains("a.mini.eastday.com")){
// Eastday Toutiao: self-media account matching.
// get(1) throws when fewer than two <i> elements match; that is caught below.
source = document.select("[class=\"share_cnt_p clearfix\"]").select("div.fl").select("i").get(1).text().trim();
if(source!=null && source.length()>1){
source = "东方头条-" + source;
}
}else if(url.contains("fashion.eastday.com")){
// Eastday fashion channel: self-media account matching, with a second selector
// tried when the first yields nothing
source = document.select("div.J-title_detail.title_detail > div > div.fl > i:nth-child(2)").text().trim();
if(Objects.isNull(source) || source.length() < 1) {
source = document.select("div.J-title_detail.title_detail > div > div.fl > a").text().trim();
}
if(source!=null && source.length()>1){
source = "东方看点-" + source;
}
}else if(url.contains("sh.qihoo.com") || url.contains("360kuai.com")){
// Qihoo / 360kuai parsing: try two selectors, then the embedded page-state JSON
source = document.select("span.source").text().trim();
if(source.length() < 1) {
source = document.select("p.article-info").select("a").text().trim();
}
if(source.length() < 1 && html.contains("window.__INITIAL_DATA__ =")) {
Matcher ma = Pattern.compile("window.__INITIAL_DATA__ =[\\s\\S]+?}};").matcher(html);
if(ma.find()) {
// Strip the assignment prefix and any closing script tag to leave the JSON text
String result = ma.group().replaceAll("window.__INITIAL_DATA__ =|\\</script\\>|", "").trim();
if(result.contains("window.autohomePVDDWhiteList")) {
result = result.split("window.autohomePVDDWhiteList")[0];
}
// Drop everything from the last ';' onward before parsing
JSONObject json = JSONObject.parseObject(result.trim().substring(0,result.lastIndexOf(";")));
source = json.getJSONObject("detail").getString("sec_src");
if(Objects.isNull(source) || source.length() < 1) {
source = json.getJSONObject("detail").getString("src");
}
}
}
if(Objects.nonNull(source) && source.length()>1){
source = "快资讯-" + source;
}
}else if(url.contains("cj.sina.com.cn") || url.contains("finance.sina.cn") ||
url.contains("tech.sina.cn") || url.contains("news.sina.cn")){
// Sina finance/tech/news pages: three extraction attempts in order
source = document.select("h2.weibo_user").text();
if((Objects.isNull(source) || source.length() < 1) && html.contains("<meta name=\"mediaid\"")){
// Sina Tech headline account
source = html.split("<meta name=\"mediaid\" content=\"")[1].split("\"")[0].trim();
}
if(Objects.isNull(source) || source.length() < 1){
// Sina Finance headline account
source = document.select("body > main > section.j_main_art > section > article > time > cite").text();
}
if(source!=null && source.length()>1){
source = "新浪-" + source;
}
}else if(url.contains("auto.sina.cn") || url.contains("auto.sina.com.cn")){
// Sina Auto pages
source = document.select("div.art_title > div > span:nth-child(1)").text();
if(Objects.isNull(source) || source.length() < 1){
source = document.select("#top_bar > div > div.date-source > a").text();
}
if(source!=null && source.length()>1){
source = "新浪-" + source;
}
}else if(url.contains("baijiahao.baidu.com")){
// Baidu Baijia
// NOTE(review): first() presumably yields null when nothing matches, so the
// resulting exception is what makes this branch return null - confirm.
source = document.select("p.author-name").first().text().trim();
if(source!=null && source.length()>1){
source = "百度百家-" + source;
}
}else if(url.contains("app.myzaker.com")){
// ZAKER client (must precede the broader myzaker.com branch further down)
source = document.select("#tpl_author").first().text().trim();
if(source!=null && source.length()>1){
source = "zaker客户端-" + source;
}
}else if(url.contains("yidianzixun.com")){
// Yidian Zixun
if(html.contains("related_wemedia")){
source = html.split("media_name\":\"")[1].split("\",\"")[0].trim();
if(source!=null && source.length()>1){
source = "一点资讯-" + source;
}
}else{
// No self-media marker in the page: the "source" value is returned without a prefix
source = html.split("source\":\"")[1].split("\",\"")[0];
}
}else if(url.contains("news.bitauto.com")){
// Bitauto (Yiche)
source = document.select("[class=\"gz-box clearfix\"]").select("div.txt-box")
.select("p.p-n").select("a").text();
if(source!=null && source.length()>1){
source = "易车网-" + source;
}
}else if(url.contains("ittime.com.cn")){
// IT Times (ittime.com.cn)
source = document.select("div.top.author > dl > dd > p > a").text();
if(Objects.nonNull(source) && !source.isEmpty()){
source = "it时代网-" + source;
}
}else if(url.contains("wap.peopleapp.com")){
// People's Daily client: the response is parsed as JSON, authors at data.authors
JSONObject json = JSONObject.parseObject(html);
source = json.getJSONObject("data").getString("authors");
if(Objects.nonNull(source) && !source.isEmpty()){
source = "人民日报客户端-" + source;
}
}else if(url.contains("guancha.cn")){
// Guancha: community (Fengwen) author first, then the regular author block,
// each with its own prefix
source = document.select("div.main-tow > div.box-left > div.article-content > div:nth-child(3) > div.user-main > h4 > a").text();
if(Objects.nonNull(source) && !source.isEmpty()){
source = "风闻社区-" + source;
}else {
source = document.select("div.author-intro.fix > p > a").text();
if(Objects.nonNull(source) && !source.isEmpty()){
source = "观察者-" + source;
}
}
}else if(url.contains("yesky.com")){
// Yesky self-media
source = document.select("div.elf > dl > dd.bt > a").text();
if(Objects.nonNull(source) && !source.isEmpty()){
source = "天极自媒体-" + source;
}
}else if(url.contains("nkj.cn")){
// nkj.cn (Niu Keji)
source = document.select("div.widget.suxingme_post_author > div > div.author_name > a").text();
if(Objects.nonNull(source) && !source.isEmpty()){
source = "牛科技-" + source;
}
}else if(url.contains("chejiahao.autohome.com.cn")){
// Autohome
source = document.select("div.authorMes").select("[class=\"name text-overflow\"]")
.select("a").text();
if(source!=null && source.length()>1){
source = "汽车之家-" + source;
}
}else if(url.contains("item.btime.com")){
// Btime (Beijing Time)
source = document.select("a.author").text();
if(Objects.isNull(source) || source.length() < 1){
source = document.select("div.content-info > span.col.cite").text();
}
if(source!=null && source.length()>1){
source = "北京时间-" + source;
}
}else if(url.contains("mp.qq.com")){
// QQ Kandian account page (must precede the generic qq.com/ branch)
source = document.select("div#account_top > div.puin_text > div.pname").text();
if(source!=null && !source.equals("")){
source = "QQ看点-" + source;
}
}else if(url.contains("v.qq.com")) {
// Tencent Video (must precede the generic qq.com/ branch)
source = document.select("span.user_name").text();
if(source!=null && !source.equals("")){
source = "腾讯视频-" + source;
}
}else if(url.contains("qq.com/")){
// Tencent: Penguin (Qi'e) account, cut out of inline script text
source = html.split("media\": \"")[1].split("\",")[0];
if(source!=null && source.length()>1){
source = "企鹅号-" + source;
}
}else if(url.contains("feng.ifeng.com")){
// ifeng: Dafeng account
source = html.split("source\":\"")[1].split("\",\"")[0];
if(source!=null && source.length()>1){
source = "大风号-" + source;
}
}else if(url.contains("dy.163.com")){
// NetEase subscriptions: NetEase account
source = document.select("div.normal > div.colum_info > h4").text();
if(source!=null && source.length()>1){
source = "网易-" + source;
}
}else if(url.contains("qctt.cn")){
// Qctt (auto headlines)
source = document.select("div.part2>a").text();
if(source!=null && source.length()>1){
source = "汽车头条-" + source;
}
}else if(url.contains("maiche.com")){
// Maiche
source = document.select("div.info-left > div:nth-child(2) > span > a").text();
if(source!=null && source.length()>1){
source = "买车网-" + source;
}
}else if(url.contains("3g.163.com")){
// NetEase mobile (3g) pages
source = document.select("div.info").select("[class=\"source js-source\"]")
.text();
if(source!=null && !source.equals("")){
source = "网易号-" + source;
}
}else if(url.contains("myzaker.com")){
// ZAKER web pages (app.myzaker.com is handled earlier)
source = document.select("div.article_header > div > a > span.auther")
.text();
if(source!=null && !source.equals("")){
source = "zaker-" + source;
}
}else if(url.contains("edushi.com")){
// edushi.com
source = document.select("div.eds-name-box > div.eds-name > a > div.name")
.text();
if(source!=null && !source.equals("")){
source = "今日潮闻-" + source;
}
}else if(url.contains("ijiandao.com")){
// ijiandao.com
source = document.select("div.article-author > span.author-name > a")
.text();
if(source!=null && !source.equals("")){
source = "爱尖刀-" + source;
}
}else if(url.contains("chuangyejia.com")){
// chuangyejia.com
source = document.select("div.article-title > ul.article-author > li:nth-child(1)")
.text();
if(source!=null && !source.equals("")){
source = "创业家-" + source;
}
}else if(url.contains("kejixun.com")){
// kejixun.com
source = document.select("div.r-box.r-box-none.clearfix > div > div.big-man > h3 > a")
.text();
if(source!=null && !source.equals("")){
source = "科技讯-" + source;
}
}else if(url.contains("tmtpost.com")){
// TMTPost
source = document.select("article > div.post-info > a")
.text();
if(source!=null && !source.equals("")){
source = "钛媒体-" + source;
}
}else if(url.contains("cyzone.cn")){
// cyzone.cn
source = document.select("div.article-author-info > div.author-main > div > div.a-word > div.a-name > a")
.text();
if(source!=null && !source.equals("")){
source = "创业邦-" + source;
}
}else if(url.contains("36kr.com")){
// 36Kr: three selectors tried in order; the first non-empty one is returned
// immediately. If all are empty, the last (empty) result reaches the final return.
source = document.select("div.info-header-text > a.author-name").text();
if(source!=null && !source.equals("")){
return "36氪-" + source;
}
source = document.select("h4.author-name").text();
if(source!=null && !source.equals("")){
return "36氪-" + source;
}
source = document.select("span.author-nickname").text();
if(source!=null && !source.equals("")){
return "36氪-" + source;
}
}else if(url.contains("lianxianjia.com")){
// lianxianjia.com
source = document.select("span.author-name").text();
if(source!=null && !source.equals("")){
source = "连线家-" + source;
}
}else if(url.contains("itouchtv.cn")){
// itouchtv.cn
// NOTE(review): the class name ends in what looks like a build-generated suffix,
// so this selector may stop matching after a site redeploy - confirm.
source = document.select("div.index__article-media-20Tg_ > span:nth-child(1)").text();
if(source!=null && !source.equals("")){
source = "触电新闻-" + source;
}
}else if(url.contains("whb.cn")){
// whb.cn
source = document.select("div.yidian-info > span:nth-child(1)").text();
if(source!=null && !source.equals("")){
source = "文汇APP-" + source;
}
}else if(url.contains("blogchina.com")){
// blogchina.com
source = document.select("div.meta-top > label.lm_name > span > a").text();
if(source!=null && !source.equals("")){
source = "博客中国-" + source;
}
}else if(url.contains(".iqiyi.com")) {
// iQiyi: user name is read from the JSON held in the page-info='...' attribute
source = JSONObject.parseObject(html.split("page-info='")[1].split("'")[0]).getJSONObject("user").getString("name");
if(source!=null && !source.equals("")){
source = "爱奇艺-" + source;
}
}else if(url.contains("v.youku.com")) {
// Youku video pages
source = document.select("a.sub-name").text();
if(source!=null && !source.equals("")){
source = "优酷-" + source;
}
}else if(url.contains("jiemian.com")) {
// Jiemian
source = document.select("div.article-info > p > span.author > a").text();
if(source!=null && !source.equals("")){
source = "界面新闻-" + source;
}
}else if (url.contains("iyiou.com")) {
// iyiou.com
source = document.select("div#post_author > a").text();
if(source!=null && !source.equals("")) {
source = "亿欧网-" + source;
}
}else if (url.contains("lanjingtmt.com")) {
// lanjingtmt.com
source = document.select("div.scd-title > a:nth-child(2)").text();
if(source!=null && !source.equals("")) {
source = "蓝鲸-" + source;
}
}else if (url.contains("lanjinger.com")) {
// lanjinger.com: only handled when the page's header span contains the column marker;
// otherwise source stays null
if(document.select("div.content_left > div:nth-child(2) > span").text().contains("专栏")) {
source = document.select("a.author_name").text().replaceAll(".*编辑| ", "");
if(source!=null && !source.equals("")) {
source = "蓝鲸财经-" + source;
}
}
}else if (url.contains("huxiu.com")) {
// Huxiu
source = document.select("div.article__author-info-box > a.article-author-info > span.author-info__username").text();
if(source!=null && !source.equals("")) {
source = "虎嗅-" + source;
}
}else if (url.contains("chuansongme.com")) {
// chuansongme.com
source = document.select("div.rich_media_meta_list > span.rich_media_meta.rich_media_meta_text").text();
if(source!=null && !source.equals("")) {
source = "传送门-" + source;
}
}else if (url.contains("a.mp.uc.cn")) {
// UC: the response is parsed as JSON, name at data._author.author_name
JSONObject json = JSONObject.parseObject(html);
source = json.getJSONObject("data").getJSONObject("_author").getString("author_name");
if(source!=null && !source.equals("")) {
source = "uc-" + source;
}
}else if (url.contains("kd.youth.cn")) {
// kd.youth.cn
source = document.select("body > div > div > div.rich_media_meta_list > a").text();
if(source!=null && !source.equals("")) {
source = "中青在线-" + source;
}
}
return source;
} catch (Exception e) {
// Any extraction failure (missing marker, missing element, malformed JSON, ...)
// is reported on stderr and turned into "no source".
e.printStackTrace();
return null;
}
}
/**
* Generic source matching for pages that have no site-specific rule.
* <p>
* The article body is extracted and cut out of {@code htmlBody} so that the text
* before and after the article can be searched first; this avoids picking up
* "source:"-like phrases that merely occur inside the article. The search order is:
* <ol>
* <li>a source label ({@code fromRegex}) in the text before, then after, the article;</li>
* <li>a known source next to a date ({@code timeRegex}) in those same two pieces,
*     looking at the segments around the title;</li>
* <li>the same two checks inside the article body itself.</li>
* </ol>
* If the article cannot be isolated with text on both sides, or anything throws while
* doing the above, the whole {@code htmlBody} is searched instead (label first, then
* date).
* <p>
* Once a source label has been found, the result is decided by that label alone:
* when none of {@code sourceList} occurs in it, {@code null} is returned without
* trying the date-based checks.
*
* @author hero
* @param html       raw page HTML, passed to the content extractor
* @param htmlBody   normalised text of the page body (the caller upper-cases and
*                   filters it the same way the extracted content/title are here)
* @param sourceList known source names, checked in list order
* @return the first matching entry of {@code sourceList}, or {@code null} when nothing matches
*/
private static String mathchOtherSource(String html,String htmlBody,List<String> sourceList){
    try {
        News news = ContentExtractor.getNewsByHtml(html);
        String content = TreateData.filterSpecialCharacter(news.getContent().toUpperCase());
        String title = TreateData.filterSpecialCharacter(news.getTitle().toUpperCase());

        // Replace the article body with a marker and split on it: parts[0] is the text
        // before the article, parts[1] the text after it.
        final String marker = "@@@@@@@@@@";
        String[] parts = htmlBody.replace(content, marker).split(marker);

        // Each lookup is performed once and reused.
        // NOTE(review): TreateData.regex presumably returns the first match or null - confirm.
        String label = parts.length > 0 ? TreateData.regex(fromRegex, parts[0]) : null;
        if (label == null && parts.length > 1) {
            label = TreateData.regex(fromRegex, parts[1]);
        }
        if (label != null) {
            return firstListedSource(label, sourceList);
        }

        if (parts.length > 1) {
            // No label around the article: look for "date + source" near the title,
            // before the article first and then after it.
            String nearDate = sourceNearTitleDates(parts[0], title, sourceList);
            if (nearDate == null) {
                nearDate = sourceNearTitleDates(parts[1], title, sourceList);
            }
            if (nearDate != null) {
                return nearDate;
            }

            // Nothing outside the article: search the article body itself.
            String labelInContent = TreateData.regex(fromRegex, content);
            if (labelInContent != null) {
                return firstListedSource(labelInContent, sourceList);
            }
            if (content.contains(title)) {
                return sourceNearTitleDates(content, title, sourceList);
            }
            String timeSource = TreateData.regex(timeRegex, content);
            return timeSource == null ? null : getSourceByTime(timeSource, content, sourceList);
        }
        // Fewer than two pieces and no label: the article could not be isolated
        // (not found in htmlBody, or nothing follows it). Use the whole-body fallback.
    } catch (Exception e) {
        // Extraction or matching failed; the whole-body fallback below is the recovery.
    }

    System.out.println("正文抽取失败处理........");
    String bodyLabel = TreateData.regex(fromRegex, htmlBody);
    if (bodyLabel != null) {
        return firstListedSource(bodyLabel, sourceList);
    }
    String bodyTime = TreateData.regex(timeRegex, htmlBody);
    return bodyTime == null ? null : getSourceByTime(bodyTime, htmlBody, sourceList);
}

/** Returns the first entry of {@code sourceList} contained in {@code candidate}, or {@code null}. */
private static String firstListedSource(String candidate, List<String> sourceList){
    for (String sourceMatch : sourceList) {
        if (candidate.contains(sourceMatch)) {
            return sourceMatch;
        }
    }
    return null;
}

/**
* Splits {@code text} at every literal occurrence of {@code title} and, for each segment
* that contains a date, looks for a known source next to that date. Returns {@code null}
* when {@code text} does not contain the title or no segment yields a source.
*/
private static String sourceNearTitleDates(String text, String title, List<String> sourceList){
    if (!text.contains(title)) {
        return null;
    }
    // The title is page text, not a pattern: quote it so characters such as
    // '?', '(' or '[' cannot change or break the split.
    for (String segment : text.split(Pattern.quote(title))) {
        String timeSource = TreateData.regex(timeRegex, segment);
        if (timeSource != null) {
            String found = getSourceByTime(timeSource, segment, sourceList);
            if (found != null) {
                return found;
            }
        }
    }
    return null;
}
/**
* Looks for a known source name in the text immediately surrounding a matched
* date/time string.
* <p>
* {@code htmlBody} is cut at every literal occurrence of {@code timeSource}. Of the
* piece before the first occurrence only the trailing 30 characters are inspected; of
* every later piece only the leading 30 characters are inspected. This targets layouts
* such as "SOURCE yyyy-MM-dd" and "yyyy-MM-dd SOURCE".
*
* @author hero
* @param timeSource the date/time text previously matched in {@code htmlBody}; it is
*                   used as a literal, never as a regular expression
* @param htmlBody   the text to search
* @param sourceList candidate source names, checked in list order; {@code null}
*                   entries are skipped
* @return the first candidate found in an inspected window, or {@code null} when none
*         is found or any argument is {@code null}
*/
private static String getSourceByTime(String timeSource, String htmlBody,List<String> sourceList){
    if (timeSource == null || htmlBody == null || sourceList == null) {
        return null;
    }
    final int window = 30;
    // timeSource is text taken from the page (e.g. "2018.06.30" or "2018-06-30(10"),
    // so it must be quoted: unquoted, '.' matches any character and an unbalanced
    // '(' or '[' makes split() throw.
    String[] pieces = htmlBody.split(Pattern.quote(timeSource));
    for (int i = 0; i < pieces.length; i++) {
        String piece = pieces[i];
        String nearby;
        if (piece.length() <= window) {
            nearby = piece;
        } else if (i == 0) {
            // Text before the first date: the source, if any, sits right before it.
            nearby = piece.substring(piece.length() - window);
        } else {
            // Text after a date: the source, if any, sits right after it.
            nearby = piece.substring(0, window);
        }
        for (String sourceMatch : sourceList) {
            if (sourceMatch != null && nearby.contains(sourceMatch)) {
                return sourceMatch;
            }
        }
    }
    return null;
}
}
package com.zhiwei.source_forward.util;
import java.util.List;
import java.util.Objects;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.source_forward.content.ContentExtractor;
import com.zhiwei.source_forward.content.News;
/**
* @ClassName: MatchSource
* @Description: 匹配来源
* @author hero
* @date 2018年6月30日 上午10:27:29
*/
public class MatchSource {
// Pattern for a "source label" phrase: one of several label prefixes followed by a run
// of non-whitespace, optional whitespace, and another run of non-whitespace.
// NOTE(review): several alternatives are textually identical as written here (the same
// label with the same colon appears two or three times). Possibly they were meant to
// differ in full-width vs. half-width punctuation - confirm against the original file
// encoding before removing any of them.
private static String fromRegex = "(来源:(\\S)*(\\s)*[\\S]*)|(来源:(\\S)*(\\s)*[\\S]*)|(本文来源于(\\S)*(\\s)*[\\S]*)"
+ "|(源:(\\S)*(\\s)*[\\S]*)|(来自:(\\S)*(\\s)*[\\S]*)|(来自:(\\S)*(\\s)*[\\S]*)"
+ "|(\\[来源\\]:(\\S)*(\\s)*[\\S]*)|(\\[来源\\]:(\\S)*(\\s)*[\\S]*)"
+ "|(出自:(\\S)*(\\s)*[\\S]*)|(出自:(\\S)*(\\s)*[\\S]*)" + "|(转自:(\\S)*(\\s)*[\\S]*)|(转自:(\\S)*(\\s)*[\\S]*)"
+ "|(出处\\/作者:(\\S)*(\\s)*[\\S]*)|(出处\\/作者:(\\S)*(\\s)*[\\S]*)"
+ "|(出处:(\\S)*(\\s)*[\\S]*)|(出处:(\\S)*(\\s)*[\\S]*)|(稿源:(\\S)*(\\s)*[\\S]*)|(出处:(\\S)*(\\s)*[\\S]*)";
// Pattern for date / date-time text in several layouts: digit groups separated by
// short non-digit runs, and forms written with the Chinese year/month/day markers,
// with or without a year and with or without trailing time fields.
// NOTE(review): the second alternative starts with exactly the first alternative, and
// Java tries alternatives left to right, so the shorter first form wins whenever the
// longer one could match - confirm whether the longer form was meant to come first.
private static String timeRegex = ""
+ "([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})"
+ "|([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-2]?[1-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-9]{1,2})"
+ "|(\\d{4}年\\d{0,2}月\\d{0,2}日\\D+\\d{0,2}\\D+\\d{0,2})"
+ "|(\\d{4}年\\d{0,2}月\\d{0,2}日\\D+\\d{0,2})"
+ "|(\\d{4}年\\d{0,2}月\\d{0,2}日)"
+ "|(\\d{0,2}月\\d{0,2}日\\D+\\d{0,2}\\D+\\d{0,2})"
+ "|(\\d{0,2}月\\d{0,2}日\\D+\\d{0,2})"
+ "|(\\d{0,2}月\\d{0,2}日)"
+ "|(\\d{4}\\D+\\d{0,2}\\D+\\d{0,2}\\D+\\d{0,2}\\D+\\d{0,2})"
+ "|(\\d{4}\\D+\\d{0,2}\\D+\\d{0,2}\\D+\\d{0,2})"
+ "|(\\d{4}\\D+\\d{0,2}\\D+\\d{0,2})"
;
/**
* @Title: findURLs
* @author hero
* @Description: (验证并匹配数据)
* @param @param
* s
* @param @param
* regex
* @param @return
* 设定文件
* @return String 返回类型
*/
public static String matchSource(String url,String html, List<String> sourceList) {
String source = null;
Document document = Jsoup.parse(html);
String htmlBody = TreateData.filterSpecialCharacter(document.select("body").text().toUpperCase());
try {
/***特定网站单独处理**/
if(url.contains("thepaper.cn")){
//单独处理澎湃数据
source = document.select("div.news_about").select("p").select("span").text().replaceAll(".*来源:", "");
if(StringUtils.isNotBlank(source)) {
source = document.select("div.news_about").text().replaceAll(" \\d{4}.*|.*/", "");
}
}else if(url.contains("sports.eastday.com")){
//单独处理东方体育网
source = document.select("div.article").select("span").text();
}else if(url.contains("lesports.com")){
//单独处理乐视网数据
source = document.select("div.article-source").select("strong").text();
}else if(url.contains("myzaker.com")){
//单独处理扎克网数据
source = document.select("div#article").select("span.auther").text();
}else if(url.contains("sina.com.cn") || url.contains("sohu.com")){
//单独处理新浪网
if(html.contains("<meta name=\"mediaid\"")){
source = html.split("<meta name=\"mediaid\" content=\"")[1].split("\"")[0];
}
}else if(url.contains("a.mini.eastday.com")){
//处理东方头条网-自媒体号匹配
// source = document.select("[class=\"share_cnt_p clearfix\"]").select("div.fl").select("i").get(1).text();
source = "东方头条";
}else if(url.contains("orz520.com")){
//千寻生活网解析
source = "千寻生活";
}else if(url.contains("sh.qihoo.com")){
//今日报点解析
source = "今日爆点";
}else if(url.contains("itouchtv.cn")){
//触电新闻解析
source = "触电新闻";
}else if(url.contains("yidianzixun.com")){
//一点资讯
if(html.contains("related_wemedia")){
source = "一点资讯";
}else{
source = html.split("source\":\"")[1].split("\",\"")[0];
}
}else if(url.contains("tech.china.com")){
//中华网科技
source = document.select("#chan_newsInfo").text().split("来源:")[1];
}else if(url.contains("caijing.com.cn")){
//财经网产经
source = document.select("#source_baidu").text();
}else if(url.contains("news.eastday.com")){
//单独处理东方网
source = document.select("div#sectionleft").select("div").select("p").select("a").text();
}else if(url.contains("ny.chinacenn.com")){
//单独处理中企网
source = document.select("td").select("span.ltutext3").text().replaceAll(" \\d{4}.*", "");
}else if(url.contains("ebrun.com")){
//单独处理亿邦动力网
source = document.select("div.post-header").select("p.source").select("span.f-left").text().replaceAll(".*来源: ", "");
}else if(url.contains("www.mnw.cn")){
//单独处理闽南网
source = document.select("div.il").select("span").text().replaceAll("来源:|\\d{4}.*", "");
}else if(url.contains("sn.cri.cn")){
//单独处理国际在线
source = document.select("span.asource").select("a").text();
}else if(url.contains("sh.sina.com.cn")){
//单独处理新浪上海
source = document.select("p.source-time").select("span").get(1).select("a").text();
}else if(url.contains("kaixian.tv")){
//单独处理汉丰网
source = document.select("div.content").select("h2.font_gray").text().replaceAll(".*来源:", "");
}else if(url.contains("lanjingtmt.com")){
//单独处理蓝鲸TMT
source = "蓝鲸TMT网";
}else if(url.contains("tech.huanqiu.com")){
//单独处理环球网
source = document.select("span.la_t_b").select("a").text();
}else if(url.contains("china.qianlong.com")){
//单独处理千龙网
source = document.select("span.source").select("a").text();
}else if(url.contains("m.mnw.cn")){
//单独处理手机闽南网
source = document.select("article.info").select("header").select("div").select("span").text().replaceAll("\\d{4}.*| ", "");
}else if(url.contains("mydrivers.com")){
//单独处理快科技
source = document.select("div.news_bt1_left").text().replaceAll(".*出处:| 作者:[\\w\\W]*", "");
}else if(url.contains("3dmgame.com")){
//单独处理3DMGAME
source = document.select("ul.intem").select("li").select("span.weibo").text();
}else if(url.contains("99it.com.cn")){
//单独处理99科技
source = document.select("div.mate").select("span").text().replaceAll(".*来源:|编辑.*", "");
}else if(url.contains("ciotimes.com")){
//单独处理CIO时代网
source = document.select("p.ly.visible-xs.text-left").text().replaceAll(".*来源:", "");
}else if(url.contains("ithome.com")){
//单独处理IT之家
source = document.select("span#source_baidu").select("a").text();
}else if(url.contains("techweb.com.cn")){
//单独处理TechWeb
source = document.select("span.from").select("a").text();
}else if(url.contains("cniteyes.com")){
//单独处理T客帮
source = document.select("div.item-date").select("span").text();
}else if(url.contains("enorth.com.cn")){
//单独处理北方网
source = document.select("p.col-sm-8.info").select("span").text().replaceAll(".*来源:|编辑.*", "");
}else if(url.contains("btime.com")){
//单独处理北京时间
source = document.select("span.col.cite").text();
}else if(url.contains("bianews.com")){
//单独处理鞭牛士
source = document.select("span.name.fl").text();
}else if(url.contains("dzwww.com")){
//单独处理大众网
source = document.select("div.layout").select("div.left").text().replaceAll(".*来源: |作者.*", "");
}else if(url.contains("dsb.cn")){
//单独处理电商报
source = document.select("div.new-content-info.clearfix").select("span").text().replaceAll(".*作者:", "");
}else if(url.contains("finance.eastmoney.com")){
//单独处理东方财富网
source = document.select("div.source.data-source").attr("data-source");
}else if(url.contains("emwap.eastmoney.com")){
//单独处理东方财富网客户端
source = document.select("div.where").select("span.source").attr("title");
}else if(url.contains("mini.eastday.com")){
//单独处理东方头条
source = document.select("div.article-src-time").select("span").text().replaceAll(".*来源:", "");
}else if(url.contains("tech.ifeng.com")){
//单独处理凤凰科技
source = document.select("p.p_time").select("span").select("span.ss03").text();
}else if(url.contains("finance.ifeng.com")){
//单独处理凤凰网
source = document.select("p.p_time").select("span").select("span").select("a").text();
if(Objects.isNull(source) || source.length() < 1) {
source = html.split("source\":\"")[1].split("\"")[0];
}
}else if(url.contains("iphone.265g.com")){
//单独处理265G网
source = document.select("div.article_info").select("span").text().replaceAll(".*来源:|QQ群号.*", "");
}else if(url.contains("yicai.com")){
//单独处理第一财经
source = document.select("div.title.f-pr").select("p").select("span").text();
}else if(url.contains("cnblogs.com")){
//单独处理博客园
source = document.select("div#come_from").text().replaceAll(".*来自:", "");
}else if(url.contains("chinaxiaokang.com")){
//单独处理中国小康网
source = document.select("span#arturl").select("a").text();
}else if(url.contains("chinabaogao.com")) {
//单独处理中国报告网
source = document.select("p.cbg-a-d-info").select("a").text().replaceAll("大 中 小 | ", "");
}else if(url.contains("anyv.net")) {
//单独处理爱妮微
source = document.select("span.cor666").select("a").text();
}else if(url.contains("yingxiao360.com")){
//单独处理第一赢销网
source = "第一赢销网";
}else if(url.contains("cctime.com")){
//单独处理飞象网
source = document.select("td.dateAndSource").text().replaceAll(".*\\d{2}|作 者.*| ", "");
}else if(url.contains("news.hexun.com")){
//单独处理和讯网
source = document.select("div.tip.fl").select("a").text();
}else if(url.contains("finance.jrj.com.cn")){
//单独处理金融界
source = document.select("p.inftop").select("span").get(1).select("a").text().replaceAll("价值.*| ", "");
}else if(url.contains("tech.china.com.cn")){
//单独处理中国网
source = document.select("span.fl.time2").select("a").text();
}else if(url.contains("news.china.com.cn")){
//单独处理中国网
source = document.select("div.pub_date").select("span#source_baidu").text().replaceAll(".*来源:", "");
}else if(url.contains("admin5.com")){
//单独处理站长网
source = document.select("div.source").select("span").text().replaceAll(".*来源:| ", "");
}else if(url.contains("stock.qq.com")){
//单独处理腾讯证券
source = document.select("div.a_Info").select("span.a_source").text();
}else if(url.contains("n.cztv.com")){
//单独处理新蓝网
source = document.select("div.publish").select("ul").select("li").text().replaceAll("\\d{4}.*", "");
}else if(url.contains("news.paidai.com")){
//单独处理派代网
source = document.select("p.t_info").select("span").select("a").text();
}else if(url.contains("news.mydrivers.com")){
//单独处理快科技
source = document.select("div.news_bt1_left").text().replaceAll(".*出处:| 作者.*", "");
}else if(url.contains("www.chinaz.com")){
//单独处理站长之家
source = document.select("div.meta").select("span.source").select("a").text();
}else if(url.contains("yuncaijing.com")){
//单独处理云财经
source = document.select("section.news-wrap").select("header").select("div").text().replaceAll(".*消息来源: |\\[阅读原文.*| ", "");
}else if(url.contains("itmsc.cn")){
//单独处理科技传媒网
source = document.select("div.arc_sc").select("p").select("a").text();
}else if(url.contains("nbd.com.cn")){
//单独处理每日经济新闻
source = document.select("span.source").text();
}else if(url.contains("pintu360.com")){
//单独处理品途商业评论
source = "品途商业评论";
}else if(url.contains("news.qudong.com")){
//单独处理驱动中国
source = document.select("div.news_right").select("dd").select("li").select("span").select("a").text().replaceAll(" .*", "");
}else if(url.contains("shobserver.com")){
//单独处理上海观察
source = document.select("span.max-words").get(0).text();
}else if(url.contains("g.pconline.com.cn")){
//单独处理太平洋电脑网
source = document.select("div.art-info").text().replaceAll("手机|\\d{4}.*| ", "");
}else if(url.contains("news.xtol.cn")){
//单独处理湘潭在线
source = document.select("span.date").text().replaceAll(".*来源:", "");
}else if(url.contains("bjnews.com.cn")){
//单独处理新京报网
source = document.select("span.author").text().replaceAll(" 记者.*", "");
}else if(url.contains("telworld.com.cn")){
//单独处理运营商世界
source = document.select("div.news_xiang_tit_2_left").select("a").text();
}else if(url.contains("thehour.cn")){
//单独处理浙江24小时
source = document.select("div.newsInfo").select("span").select("a").text();
}else if(url.contains("sh.zol.com.cn")){
//单独处理中关村在线
source = document.select("div.article-aboute").select("span.source_baidu").text();
}else if(url.contains("ec.com.cn")){
//单独处理中国国际电子商务网
source = document.select("span.article_resource").text().replaceAll(".*来源:", "");
}else if(url.contains("cqn.com.cn")){
//单独处理中国质量新闻网
source = document.select("span.from").text().replaceAll("-.*", "");
}else if(url.contains("sc.stock.cnfol.com")){
//单独处理中金在线
source = document.select("div.artDes").select("span").select("a").text();
}else if(url.contains("zczj.com")){
//单独处理众筹之家
source = document.select("div.news-info").select("span").text().replaceAll("来源:|作者.*", "");
}else if(url.contains("cqcb.com")){
//单独处理重庆晨报
source = document.select("span.label_nr").text();
}else if(url.contains("stock.10jqka.com.cn")){
//单独处理重庆晨报
source = document.select("span.label_nr").text();
}else if(url.contains("jiemian.com") ){
//单独处理界面新闻
// source = document.select("div.article-info").select("span").text().replaceAll(".*来源:| 字体[\\w\\W]*", "");
return "界面新闻";
}else if(url.contains("finance.youth.cn")){
//单独处理中国青年网
source = document.select("span#source_baidu").text().replaceAll("来源:|作者.*", "");
}else if(url.contains("china.com")) {
//中国金融商报
source = document.select("#chan_newsInfo > a").text();
}else if(url.contains("xw.qq.com")) {
//腾讯网客户端
source = document.select("div.tpl_header_author").text();
}else if(url.contains("china.prcfe.com")) {
source = html.split("\"")[1];
}
if(Objects.nonNull(source) && source.length() != 0) {
return source;
}
else{
//其他网站处理
source = mathchOtherSource(html, htmlBody, sourceList);
if(source!=null){
//验证来源
for (String sourceMatch : sourceList) {
if (source.contains(sourceMatch)) {
return sourceMatch;
}
}
}
}
} catch (Exception e) {
e.toString();
}
return null;
}
/**
 * Extracts the self-media (author / account) name from a fetched page and
 * prefixes it with a platform label such as "今日头条-" or "搜狐-".
 *
 * <p>The branch is selected by substring tests on {@code url}; the first
 * matching branch wins, so the order of the branches matters (for example
 * "app.myzaker.com" is tested before "myzaker.com", and "mp.qq.com" /
 * "v.qq.com" before "qq.com/").
 *
 * <p>Notes visible from the code:
 * <ul>
 *   <li>Branches that guard the prefix with {@code length()>1} leave a
 *       one-character or empty value unprefixed.</li>
 *   <li>The non-wemedia yidianzixun.com path returns the raw value without a prefix.</li>
 *   <li>If no branch matches the url, {@code null} is returned.</li>
 *   <li>Any exception raised inside the try block is printed with
 *       {@code printStackTrace()} and turned into {@code null}.</li>
 *   <li>{@code Jsoup.parse(html)} runs before the try block, so anything it
 *       throws propagates to the caller.</li>
 * </ul>
 *
 * @author hero
 * @param url  address of the page; only used for substring matching
 * @param html raw response body (HTML, or JSON for some API-style urls)
 * @return the labelled account name, an unprefixed/empty value as described
 *         above, or {@code null} when nothing matched or an exception occurred
 */
public static String matchMediaSelfSource(String url,String html) {
String source = null;
// Parsed once up front; the JSON-based branches below ignore this document.
Document document = Jsoup.parse(html);
try {
/* Site-specific handling */
if(url.contains("toutiao.com")){
// Toutiao account: try mediaInfo, then ugcInfo, then screen_name in the inline script.
if(html.contains("name: '") && html.contains("mediaInfo")){
source = html.split("mediaInfo:")[1].split("name: '")[1].split("',")[0].trim();
}else if(html.contains("name: '") && html.contains("ugcInfo")){
source = html.split("ugcInfo:")[1].split("name: '")[1].split("',")[0].trim();
}else if(html.contains("screen_name:")){
source = html.split("screen_name:'")[1].split("',")[0].trim();
}
if(source!=null && source.length()>1){
source = "今日头条-" + source;
}
}else if(url.contains("sohu.com")){
// Sohu self-media account, read from the "mediaid" meta tag.
if(html.contains("<meta name=\"mediaid\"")){
source = html.split("<meta name=\"mediaid\" content=\"")[1].split("\"")[0].trim();
if(source!=null && source.length()>1){
source = "搜狐-" + source;
}
}
}else if(url.contains("tznew.58.com")){
// The body is parsed as JSON here: result.author.
source = JSONObject.parseObject(html).getJSONObject("result").getString("author");
if(source!=null && source.length()>1){
source = "58-" + source;
}
}else if(url.contains("c.m.163.com")){
source = document.select("section.g-article.js-article > div.js-article-inner > div > b").text();
// Second selector is tried when the first one yields blank text.
if(StringUtils.isBlank(source)){
source = document.select("div.info > h3").text();
}
if(source!=null && source.length()>1){
source = "网易新闻-" + source;
}
}else if(url.contains("a.mini.eastday.com")){
// Eastday Toutiao self-media account; get(1) throws if fewer than two <i> elements
// match, which ends up in the catch block below.
source = document.select("[class=\"share_cnt_p clearfix\"]").select("div.fl").select("i").get(1).text().trim();
if(source!=null && source.length()>1){
source = "东方头条-" + source;
}
}else if(url.contains("fashion.eastday.com")){
// Eastday (fashion channel) self-media account; falls back to the <a> element.
source = document.select("div.J-title_detail.title_detail > div > div.fl > i:nth-child(2)").text().trim();
if(Objects.isNull(source) || source.length() < 1) {
source = document.select("div.J-title_detail.title_detail > div > div.fl > a").text().trim();
}
if(source!=null && source.length()>1){
source = "东方看点-" + source;
}
}else if(url.contains("sh.qihoo.com") || url.contains("360kuai.com")){
// 360 Kuai Zixun: DOM selectors first, then the embedded window.__INITIAL_DATA__ JSON.
source = document.select("span.source").text().trim();
if(source.length() < 1) {
source = document.select("p.article-info").select("a").text().trim();
}
if(source.length() < 1 && html.contains("window.__INITIAL_DATA__ =")) {
Matcher ma = Pattern.compile("window.__INITIAL_DATA__ =[\\s\\S]+?}};").matcher(html);
if(ma.find()) {
// Strip the assignment prefix and any closing script tag so only the JSON text remains.
String result = ma.group().replaceAll("window.__INITIAL_DATA__ =|\\</script\\>|", "").trim();
if(result.contains("window.autohomePVDDWhiteList")) {
result = result.split("window.autohomePVDDWhiteList")[0];
}
// Cut at the last ';' before parsing.
JSONObject json = JSONObject.parseObject(result.trim().substring(0,result.lastIndexOf(";")));
// detail.sec_src is preferred, detail.src is the fallback.
source = json.getJSONObject("detail").getString("sec_src");
if(Objects.isNull(source) || source.length() < 1) {
source = json.getJSONObject("detail").getString("src");
}
}
}
if(Objects.nonNull(source) && source.length()>1){
source = "快资讯-" + source;
}
}else if(url.contains("cj.sina.com.cn") || url.contains("finance.sina.cn") ||
url.contains("tech.sina.cn") || url.contains("news.sina.cn")){
source = document.select("h2.weibo_user").text();
if((Objects.isNull(source) || source.length() < 1) && html.contains("<meta name=\"mediaid\"")){
// Sina Tech headline account
source = html.split("<meta name=\"mediaid\" content=\"")[1].split("\"")[0].trim();
}
if(Objects.isNull(source) || source.length() < 1){
// Sina Finance headline account
source = document.select("body > main > section.j_main_art > section > article > time > cite").text();
}
if(source!=null && source.length()>1){
source = "新浪-" + source;
}
}else if(url.contains("auto.sina.cn") || url.contains("auto.sina.com.cn")){
source = document.select("div.art_title > div > span:nth-child(1)").text();
if(Objects.isNull(source) || source.length() < 1){
source = document.select("#top_bar > div > div.date-source > a").text();
}
if(source!=null && source.length()>1){
source = "新浪-" + source;
}
}else if(url.contains("k.sina.cn")){
source = document.select("h2.weibo_user").text();
if(source!=null && source.length()>1){
source = "新浪-" + source;
}
}else if(url.contains("blog.sina.com.cn")){
source = document.select("strong#ownernick").text();
if(source!=null && source.length()>1){
source = "新浪博客-" + source;
}
}else if(url.contains("baijiahao.baidu.com") || url.contains("mbd.baidu.com")){
// Baidu Baijia
source = document.select("span.userNameSpan").text();
if(StringUtils.isBlank(source)){
source = document.select("p.author-name:nth-child(1)").text();
}
if(StringUtils.isNotBlank(source)){
source = "百度百家-" + source;
}
}else if(url.contains("app.myzaker.com")){
// ZAKER client; must stay ahead of the generic "myzaker.com" branch further down.
// first() returns null when nothing matches, so text() then throws into the catch block.
source = document.select("#tpl_author").first().text().trim();
if(source!=null && source.length()>1){
source = "zaker客户端-" + source;
}
}else if(url.contains("yidianzixun.com")){
// Yidian Zixun
if(html.contains("related_wemedia")){
source = html.split("media_name\":\"")[1].split("\",\"")[0].trim();
if(source!=null && source.length()>1){
source = "一点资讯-" + source;
}
}else{
// No prefix is added on this path.
source = html.split("source\":\"")[1].split("\",\"")[0];
}
}else if(url.contains("news.bitauto.com")){
// Bitauto (Yiche)
source = document.select("[class=\"gz-box clearfix\"]").select("div.txt-box")
.select("p.p-n").select("a").text();
if(source!=null && source.length()>1){
source = "易车网-" + source;
}
}else if(url.contains("ittime.com.cn")){
// ITTime
source = document.select("div.top.author > dl > dd > p > a").text();
if(Objects.nonNull(source) && !source.isEmpty()){
source = "it时代网-" + source;
}
}else if(url.contains("wap.peopleapp.com")){
// People's Daily client; body is parsed as JSON: data.authors.
JSONObject json = JSONObject.parseObject(html);
source = json.getJSONObject("data").getString("authors");
if(Objects.nonNull(source) && !source.isEmpty()){
source = "人民日报客户端-" + source;
}
}else if(url.contains("guancha.cn")){
// Guancha: community ("风闻社区") author first, otherwise the columnist block ("观察者").
source = document.select("div.main-tow > div.box-left > div.article-content > div:nth-child(3) > div.user-main > h4 > a").text();
if(Objects.nonNull(source) && !source.isEmpty()){
source = "风闻社区-" + source;
}else {
source = document.select("div.author-intro.fix > p > a").text();
if(Objects.nonNull(source) && !source.isEmpty()){
source = "观察者-" + source;
}
}
}else if(url.contains("yesky.com")){
// Yesky self-media
source = document.select("div.elf > dl > dd.bt > a").text();
if(Objects.nonNull(source) && !source.isEmpty()){
source = "天极自媒体-" + source;
}
}else if(url.contains("nkj.cn")){
// Niu Keji
source = document.select("div.widget.suxingme_post_author > div > div.author_name > a").text();
if(Objects.nonNull(source) && !source.isEmpty()){
source = "牛科技-" + source;
}
}else if(url.contains("chejiahao.autohome.com.cn")){
// Autohome
source = document.select("div.authorMes").select("[class=\"name text-overflow\"]")
.select("a").text();
if(source!=null && source.length()>1){
source = "汽车之家-" + source;
}
}else if(url.contains("item.btime.com")){
// Beijing Time (btime)
source = document.select("a.author").text();
if(Objects.isNull(source) || source.length() < 1){
source = document.select("div.content-info > span.col.cite").text();
}
if(source!=null && source.length()>1){
source = "北京时间-" + source;
}
}else if(url.contains("mp.qq.com")){
source = document.select("div#account_top > div.puin_text > div.pname").text();
if(StringUtils.isNotBlank(source)){
source = "QQ看点-" + source;
}
}else if(url.contains("v.qq.com")) {
source = document.select("span.user_name").text();
if(StringUtils.isNotBlank(source)){
source = "腾讯视频-" + source;
}
}else if(url.contains("qq.com/")){
// Tencent Penguin account; catches every remaining qq.com url, so it must stay
// after the mp.qq.com and v.qq.com branches.
source = html.split("media\": \"")[1].split("\",")[0];
if(source!=null && source.length()>1){
source = "企鹅号-" + source;
}
}else if(url.contains("feng.ifeng.com")){
// ifeng Dafeng account
source = html.split("source\":\"")[1].split("\",\"")[0];
if(source!=null && source.length()>1){
source = "大风号-" + source;
}
}else if(url.contains("dy.163.com")){
// NetEase subscription account
source = document.select("div.normal > div.colum_info > h4").text();
if(source!=null && source.length()>1){
source = "网易-" + source;
}
}else if(url.contains("qctt.cn")){
// Qiche Toutiao
source = document.select("div.part2>a").text();
if(source!=null && source.length()>1){
source = "汽车头条-" + source;
}
}else if(url.contains("maiche.com")){
// Maiche
source = document.select("div.info-left > div:nth-child(2) > span > a").text();
if(source!=null && source.length()>1){
source = "买车网-" + source;
}
}else if(url.contains("3g.163.com")){
source = document.select("div.info").select("[class=\"source js-source\"]")
.text();
if(StringUtils.isNotBlank(source)){
source = "网易号-" + source;
}
}else if(url.contains("myzaker.com")){
// Reached only for myzaker.com urls that did not match "app.myzaker.com" above.
source = document.select("div.article_header > div > a > span.auther")
.text();
if(StringUtils.isNotBlank(source)){
source = "zaker-" + source;
}
}else if(url.contains("edushi.com")){
source = document.select("div.eds-name-box > div.eds-name > a > div.name")
.text();
if(StringUtils.isNotBlank(source)){
source = "今日潮闻-" + source;
}
}else if(url.contains("ijiandao.com")){
source = document.select("div.article-author > span.author-name > a")
.text();
if(StringUtils.isNotBlank(source)){
source = "爱尖刀-" + source;
}
}else if(url.contains("chuangyejia.com")){
source = document.select("div.article-title > ul.article-author > li:nth-child(1)")
.text();
if(StringUtils.isNotBlank(source)){
source = "创业家-" + source;
}
}else if(url.contains("kejixun.com")){
source = document.select("div.r-box.r-box-none.clearfix > div > div.big-man > h3 > a")
.text();
if(StringUtils.isNotBlank(source)){
source = "科技讯-" + source;
}
}else if(url.contains("tmtpost.com")){
source = document.select("article > div.post-info > a")
.text();
if(StringUtils.isNotBlank(source)){
source = "钛媒体-" + source;
}
}else if(url.contains("cyzone.cn")){
source = document.select("div.article-author-info > div.author-main > div > div.a-word > div.a-name > a")
.text();
if(StringUtils.isNotBlank(source)){
source = "创业邦-" + source;
}
}else if(url.contains("36kr.com")){
// Three selectors are tried in turn; the first non-blank one is returned immediately.
// If all are blank, the text of the last selector is what reaches the final return.
source = document.select("div.info-header-text > a.author-name").text();
if(StringUtils.isNotBlank(source)){
return "36氪-" + source;
}
source = document.select("h4.author-name").text();
if(StringUtils.isNotBlank(source)){
return "36氪-" + source;
}
source = document.select("span.author-nickname").text();
if(StringUtils.isNotBlank(source)){
return "36氪-" + source;
}
}else if(url.contains("lianxianjia.com")){
source = document.select("span.author-name").text();
if(StringUtils.isNotBlank(source)){
source = "连线家-" + source;
}
}else if(url.contains("itouchtv.cn")){
source = document.select("div.index__article-media-20Tg_ > span:nth-child(1)").text();
if(StringUtils.isNotBlank(source)){
source = "触电新闻-" + source;
}
}else if(url.contains("whb.cn")){
source = document.select("div.yidian-info > span:nth-child(1)").text();
if(StringUtils.isNotBlank(source)){
source = "文汇APP-" + source;
}
}else if(url.contains("blogchina.com")){
source = document.select("div.meta-top > label.lm_name > span > a").text();
if(StringUtils.isNotBlank(source)){
source = "博客中国-" + source;
}
}else if(url.contains(".iqiyi.com")) {
// The page-info='...' attribute value is parsed as JSON: user.name.
source = JSONObject.parseObject(html.split("page-info='")[1].split("'")[0]).getJSONObject("user").getString("name");
if(StringUtils.isNotBlank(source)){
source = "爱奇艺-" + source;
}
}else if(url.contains("v.youku.com")) {
source = document.select("a.sub-name").text();
if(StringUtils.isNotBlank(source)){
source = "优酷-" + source;
}
}else if(url.contains("jiemian.com")) {
source = document.select("div.article-info > p > span.author > a").text();
if(StringUtils.isNotBlank(source)){
source = "界面新闻-" + source;
}
}else if (url.contains("iyiou.com")) {
source = document.select("div#post_author > a").text();
if(StringUtils.isNotBlank(source)) {
source = "亿欧网-" + source;
}
}else if (url.contains("lanjingtmt.com")) {
source = document.select("div.scd-title > a:nth-child(2)").text();
if(StringUtils.isNotBlank(source)) {
source = "蓝鲸-" + source;
}
}else if (url.contains("lanjinger.com")) {
// Only handled when the page text at this selector contains "专栏".
if(document.select("div.content_left > div:nth-child(2) > span").text().contains("专栏")) {
source = document.select("a.author_name").text().replaceAll(".*编辑| ", "");
if(StringUtils.isNotBlank(source)) {
source = "蓝鲸财经-" + source;
}
}
}else if (url.contains("huxiu.com")) {
source = document.select("div.article__author-info-box > a.article-author-info > span.author-info__username").text();
if(StringUtils.isNotBlank(source)) {
source = "虎嗅-" + source;
}
}else if (url.contains("chuansongme.com")) {
source = document.select("div.rich_media_meta_list > span.rich_media_meta.rich_media_meta_text").text();
if(StringUtils.isNotBlank(source)) {
source = "传送门-" + source;
}
}else if (url.contains("a.mp.uc.cn")) {
// Body is parsed as JSON: data._author.author_name.
JSONObject json = JSONObject.parseObject(html);
source = json.getJSONObject("data").getJSONObject("_author").getString("author_name");
if(StringUtils.isNotBlank(source)) {
source = "uc-" + source;
}
}else if (url.contains("m.uczzd.cn")) {
if(html.contains("var xissJsonData =")){
// Reassigns the html parameter to the extracted JSON text before parsing it.
html = html.split("var xissJsonData = ")[1].split("};")[0]+"}";
source = JSONObject.parseObject(html).getString("source_name");
}
if(StringUtils.isNotBlank(source)) {
source = "uc-" + source;
}
}else if (url.contains("kd.youth.cn")) {
source = document.select("body > div > div > div.rich_media_meta_list > a").text();
if(StringUtils.isNotBlank(source)) {
source = "中青在线-" + source;
}
}else if (url.contains("zhuanlan.zhihu.com")) {
source = document.select("a.UserLink-link").text();
if(StringUtils.isNotBlank(source)) {
source = "知乎专栏-" + source;
}
}else if (url.contains("wulizixun.com")) {
source = document.select("span.newdetailOrigin").text();
if(StringUtils.isNotBlank(source)) {
source = "唔哩头条-" + source;
}
}else if(url.contains("t.10jqka.com.cn")){
source = document.select("a[class=\"link777 post-author db fl\"]").text();
if(StringUtils.isNotBlank(source)) {
source = "同花顺-" + source;
}
}else if(url.contains("shangyexinzhi.com")){
source = document.select("span.hover-color_change").text();
if(StringUtils.isNotBlank(source)) {
source = "商业新知-" + source;
}
}else if(url.contains("thepaper.cn")){
source = document.select("a> div.name").text();
if(StringUtils.isNotBlank(source)){
source = "澎湃新闻-" + source;
}
}else if(url.contains("tuicool.com")){
source = document.select("span.from> a").text();
if(StringUtils.isNotBlank(source)){
source = "推酷-" + source;
}
}
return source;
} catch (Exception e) {
// Any parsing failure (missing split token, missing element, malformed JSON, ...)
// is reported on stderr and mapped to "no source".
e.printStackTrace();
return null;
}
}
/**
 * Generic source matcher for pages that have no site-specific rule.
 *
 * <p>The article content and title are extracted from {@code html}; the
 * content is then cut out of {@code htmlBody} so that the text before and
 * after the article can be inspected without the article body itself
 * triggering the patterns. The search order is:
 * <ol>
 *   <li>{@code fromRegex} on the text before the content, then on the text after it;</li>
 *   <li>a time match near a known source in the title-delimited pieces of the
 *       text before, then after, the content;</li>
 *   <li>the same two strategies applied to the content itself.</li>
 * </ol>
 * When extraction throws, or the content cannot be located inside
 * {@code htmlBody} (no text after it), the whole {@code htmlBody} is
 * matched instead.
 *
 * <p>If a {@code fromRegex} hit is found but contains no entry of
 * {@code sourceList}, the search stops and {@code null} is returned.
 *
 * @author hero
 * @param html       raw page html handed to the content extractor
 * @param htmlBody   page text searched for source information
 * @param sourceList known source names; the first one contained in a hit is returned
 * @return the matching entry of {@code sourceList}, or {@code null} if none was found
 */
private static String mathchOtherSource(String html,String htmlBody,List<String> sourceList){
    // Placeholder that replaces the article content inside htmlBody.
    final String contentMarker = "@@@@@@@@@@";
    try {
        News news = ContentExtractor.getNewsByHtml(html);
        String content = TreateData.filterSpecialCharacter(news.getContent().toUpperCase());
        String title = TreateData.filterSpecialCharacter(news.getTitle().toUpperCase());

        // Cut the content out and keep what surrounds it.
        String[] aroundContent = htmlBody.replace(content, contentMarker).split(contentMarker);
        if (aroundContent.length > 0) {
            String before = aroundContent[0];
            String fromBefore = TreateData.regex(fromRegex, before);
            if (fromBefore != null) {
                return firstListedSource(fromBefore, sourceList);
            }
            // Without a second piece the content was not located in htmlBody
            // (or nothing follows it); that case is handled by the whole-body
            // fallback below, exactly as before, but without relying on an
            // ArrayIndexOutOfBoundsException to get there.
            if (aroundContent.length > 1) {
                String after = aroundContent[1];
                String fromAfter = TreateData.regex(fromRegex, after);
                if (fromAfter != null) {
                    return firstListedSource(fromAfter, sourceList);
                }
                String nearTime = sourceNearTimeByTitle(before, title, sourceList);
                if (nearTime == null) {
                    nearTime = sourceNearTimeByTitle(after, title, sourceList);
                }
                if (nearTime != null) {
                    return nearTime;
                }
                // Nothing usable outside the content: look inside it.
                return sourceInsideContent(content, title, sourceList);
            }
        }
    } catch (Exception ignored) {
        // Best effort: extraction or matching failed, fall through to the
        // whole-body fallback.
    }
    return sourceFromWholeBody(htmlBody, sourceList);
}

/** Returns the first entry of {@code sourceList} contained in {@code text}, or {@code null}. */
private static String firstListedSource(String text, List<String> sourceList) {
    for (String candidate : sourceList) {
        if (text.contains(candidate)) {
            return candidate;
        }
    }
    return null;
}

/**
 * Splits {@code text} at the title and looks for a known source next to a time
 * match ("YYYY-MM-dd xx日报" or "xx日报 YYYY-MM-dd") in each piece.
 * Returns {@code null} when {@code text} does not contain the title.
 */
private static String sourceNearTimeByTitle(String text, String title, List<String> sourceList) {
    if (!text.contains(title)) {
        return null;
    }
    // The title is literal text, not a pattern: quote it so characters such as
    // '?', '(', '*' or '+' cannot change or break the split.
    for (String piece : text.split(Pattern.quote(title))) {
        String timeSource = TreateData.regex(timeRegex, piece);
        if (timeSource != null) {
            String found = getSourceByTime(timeSource, piece, sourceList);
            if (found != null) {
                return found;
            }
        }
    }
    return null;
}

/** Applies the from-pattern, then the time-based search, to the article content itself. */
private static String sourceInsideContent(String content, String title, List<String> sourceList) {
    String fromContent = TreateData.regex(fromRegex, content);
    if (fromContent != null) {
        return firstListedSource(fromContent, sourceList);
    }
    if (content.contains(title)) {
        return sourceNearTimeByTitle(content, title, sourceList);
    }
    String timeSource = TreateData.regex(timeRegex, content);
    return timeSource == null ? null : getSourceByTime(timeSource, content, sourceList);
}

/** Fallback used when the content could not be isolated: matches against all of {@code htmlBody}. */
private static String sourceFromWholeBody(String htmlBody, List<String> sourceList) {
    System.out.println("正文抽取失败处理........");
    String fromBody = TreateData.regex(fromRegex, htmlBody);
    if (fromBody != null) {
        return firstListedSource(fromBody, sourceList);
    }
    String timeSource = TreateData.regex(timeRegex, htmlBody);
    return timeSource == null ? null : getSourceByTime(timeSource, htmlBody, sourceList);
}
/**
 * Looks for a known source name in the text directly next to a matched time string.
 *
 * <p>{@code htmlBody} is split at every literal occurrence of
 * {@code timeSource}. For the first piece the last 30 characters are
 * inspected (text just before the first time stamp); for every later piece
 * the first 30 characters are inspected (text just after a time stamp).
 * This targets layouts such as "YYYY-MM-dd xx日报" and "xx日报 YYYY-MM-dd".
 *
 * @author hero
 * @param timeSource the time text previously matched in {@code htmlBody}; treated literally
 * @param htmlBody   text to search
 * @param sourceList known source names; {@code null} entries are skipped
 * @return the first entry of {@code sourceList} found in an inspected window,
 *         or {@code null} if there is none or any argument is {@code null}
 */
private static String getSourceByTime(String timeSource, String htmlBody,List<String> sourceList){
    if (timeSource == null || htmlBody == null || sourceList == null) {
        return null;
    }
    // Number of characters inspected on either side of a time stamp.
    final int window = 30;
    // Quote the time text: String.split takes a regex, and an unquoted "2018.01.02"
    // would also split on "2018-01-02" or any other character in place of the dots.
    String[] pieces = htmlBody.split(Pattern.quote(timeSource));
    for (int i = 0; i < pieces.length; i++) {
        String piece = pieces[i];
        String nearby;
        if (piece.length() <= window) {
            nearby = piece;
        } else if (i == 0) {
            // Text leading up to the first time stamp.
            nearby = piece.substring(piece.length() - window);
        } else {
            // Text immediately following a time stamp.
            nearby = piece.substring(0, window);
        }
        for (String candidate : sourceList) {
            if (candidate != null && nearby.contains(candidate)) {
                return candidate;
            }
        }
    }
    return null;
}
}
package com.zhiwei.source_forward.util;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.proxy.config.SimpleConfig;
import com.zhiwei.source_forward.config.ProxyConfig;
/**
 * Sets up the crawler proxy factory from the values held in {@link ProxyConfig}.
 *
 * @author xMx
 * @date 2020-01-06 09:29:04
 */
public class ProxyInit {

    /**
     * Builds a {@code SimpleConfig} from {@code ProxyConfig.registry}, the fixed
     * application name "xumiaoxin", {@code ProxyConfig.proxyid} and
     * {@code ProxyConfig.group}, and passes it to {@code ProxyFactory.init}.
     */
    public static void initProxy() {
        final long proxyAppId = ProxyConfig.proxyid;
        final String applicationName = "xumiaoxin";
        final String registryAddress = ProxyConfig.registry;

        ProxyFactory.init(
                SimpleConfig.builder()
                        .registry(registryAddress)
                        .appName(applicationName)
                        .appId(proxyAppId)
                        .group(ProxyConfig.group)
                        .build());
    }
}
#registry=zookeeper://192.168.0.203:2181;zookeeper://192.168.0.104:2181;zookeeper://192.168.0.105:2181
#group=hangzhou
##########################测试地址##############################
registry=zookeeper://192.168.0.36:2181
#registry=zookeeper://192.168.0.203:2181;zookeeper://192.168.0.104:2181;zookeeper://192.168.0.105:2181
#group=hangzhou
##########################测试地址##############################
registry=zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181
proxyid=10000002
group=local
\ No newline at end of file
//package com.zhiwei.source_forward.sourceforward.test;
//
//import java.util.HashMap;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.source_forward.run.SourceForward;
//
///**
// * @ClassName: SourceForwardTest
// * @Description: 来源验证
// * @author hero
// * @date 2017年12月6日 上午9:55:13
// */
//public class MediaSelfSourceTest {
//
// @Test
// public void sourceForwardTest(){
// Map<String,Map<String,Object>> dataMap = new HashMap<String,Map<String,Object>>();
// String url = "https://www.toutiao.com/a6549872248428167687/";
// Map<String,Object> data = new HashMap<String,Object>();
// dataMap.put(url, data);
//
// SourceForward.getMediaSelfSource(dataMap);
//
// }
//
//
//
//
//
//
//
//
//}
//package com.zhiwei.source_forward.sourceforward.test;
//
//import java.util.HashMap;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.source_forward.run.SourceForward;
//
///**
// * @ClassName: SourceForwardTest
// * @Description: 来源验证
// * @author hero
// * @date 2017年12月6日 上午9:55:13
// */
//public class MediaSelfSourceTest {
//
// @Test
// public void sourceForwardTest(){
// Map<String,Map<String,Object>> dataMap = new HashMap<String,Map<String,Object>>();
// String url = "https://www.toutiao.com/a6549872248428167687/";
// Map<String,Object> data = new HashMap<String,Object>();
// dataMap.put(url, data);
//
// SourceForward.getMediaSelfSource(dataMap);
//
// }
//
//
//
//
//
//
//
//
//}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment