Commit 132b70cb by cwy

Merge branch 'source-forward-chen' of…

Merge branch 'source-forward-chen' of http://git.zhiweidata.top/zhangzhiwei/source_forward.git into source-forward-chen
parents f0fbf66b 0abfbd4a
This source diff could not be displayed because it is too large. You can view the blob instead.
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId> <groupId>com.zhiwei</groupId>
<artifactId>source-forward</artifactId> <artifactId>source-forward</artifactId>
<version>0.2.1-SNAPSHOT</version> <version>0.2.2-SNAPSHOT</version>
<name>source-forward</name> <name>source-forward</name>
<description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description> <description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description>
<properties> <properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding> <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
</properties> </properties>
<developers> <developers>
<developer> <developer>
<id>Bewilder</id> <id>Bewilder</id>
<name>zhiwei zhang</name> <name>zhiwei zhang</name>
<email>zhangzhiwei@zhiweidata.com</email> <email>zhangzhiwei@zhiweidata.com</email>
</developer> </developer>
</developers> </developers>
<dependencies> <dependencies>
<dependency> <dependency>
<groupId>com.zhiwei.tools</groupId> <groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId> <artifactId>zhiwei-tools</artifactId>
<version>0.1.3-SNAPSHOT</version> <version>0.1.6-SNAPSHOT</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.zhiwei.crawler</groupId> <groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId> <artifactId>crawler-core</artifactId>
<version>0.5.5.6-SNAPSHOT</version> <version>0.6.1.0-SNAPSHOT</version>
</dependency> </dependency>
</dependencies> </dependencies>
<!-- 打包管理 --> <!-- 打包管理 -->
<build> <build>
<plugins> <plugins>
<!-- 发布源码 --> <!-- 发布源码 -->
<plugin> <plugin>
<artifactId>maven-source-plugin</artifactId> <artifactId>maven-source-plugin</artifactId>
<version>2.4</version> <version>2.4</version>
<configuration> <configuration>
<attach>true</attach> <attach>true</attach>
</configuration> </configuration>
<executions> <executions>
<execution> <execution>
<phase>compile</phase> <phase>compile</phase>
<goals> <goals>
<goal>jar</goal> <goal>jar</goal>
</goals> </goals>
</execution> </execution>
</executions> </executions>
</plugin> </plugin>
<plugin> <plugin>
<groupId>org.apache.maven.plugins</groupId> <groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId> <artifactId>maven-javadoc-plugin</artifactId>
<version>2.10.4</version> <version>2.10.4</version>
</plugin> </plugin>
<!-- 解决maven test命令时console出现中文乱码 --> <!-- 解决maven test命令时console出现中文乱码 -->
<plugin> <plugin>
<groupId>org.apache.maven.plugins</groupId> <groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId> <artifactId>maven-surefire-plugin</artifactId>
<version>2.19.1</version> <version>2.19.1</version>
<configuration> <configuration>
<forkMode>once</forkMode> <forkMode>once</forkMode>
<argLine>-Dfile.encoding=UTF-8</argLine> <argLine>-Dfile.encoding=UTF-8</argLine>
</configuration> </configuration>
</plugin> </plugin>
</plugins> </plugins>
</build> </build>
<!-- 分发管理:管理distribution和supporting files --> <!-- 分发管理:管理distribution和supporting files -->
<distributionManagement> <distributionManagement>
<snapshotRepository> <snapshotRepository>
<id>nexus-releases</id> <id>nexus-releases</id>
<name>User Porject Snapshot</name> <name>User Porject Snapshot</name>
<url>http://192.168.0.30:8081/nexus/content/repositories/snapshots/</url> <url>http://192.168.0.30:8081/nexus/content/repositories/snapshots/</url>
<uniqueVersion>true</uniqueVersion> <uniqueVersion>true</uniqueVersion>
</snapshotRepository> </snapshotRepository>
<repository> <repository>
<id>nexus-releases</id> <id>nexus-releases</id>
<name>User Porject Release</name> <name>User Porject Release</name>
<url>http://192.168.0.30:8081/nexus/content/repositories/releases/</url> <url>http://192.168.0.30:8081/nexus/content/repositories/releases/</url>
</repository> </repository>
</distributionManagement> </distributionManagement>
</project> </project>
\ No newline at end of file
package com.zhiwei.source_forward.config; package com.zhiwei.source_forward.config;
import java.io.InputStream; import java.io.InputStream;
import java.util.Properties; import java.util.Properties;
/**
 * Static holder for the proxy settings read from the {@code proxyip.properties}
 * classpath resource when the class is initialized.
 *
 * <p>Loading is best-effort: a missing resource, an I/O error or a malformed
 * {@code proxyid} is reported on standard error and leaves the affected field
 * {@code null}; class initialization itself never fails.
 */
public class ProxyConfig {

    /** Name of the classpath resource the settings are read from. */
    private static final String RESOURCE_NAME = "proxyip.properties";

    /** Value of the {@code registry} property, or {@code null} when absent. */
    public static String registry;

    /** Value of the {@code proxyid} property parsed as a {@link Long}, or {@code null} when absent or not numeric. */
    public static Long proxyid;

    /** Value of the {@code group} property, or {@code null} when absent. */
    public static String group;

    static {
        Properties conf = new Properties();
        // try-with-resources closes the stream even when load() throws.
        try (InputStream is = Thread.currentThread().getContextClassLoader()
                .getResourceAsStream(RESOURCE_NAME)) {
            if (is == null) {
                // getResourceAsStream returns null for a missing resource.
                System.err.println("ProxyConfig: resource " + RESOURCE_NAME + " not found on the classpath");
            } else {
                conf.load(is);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        // Each property is read independently so one bad value does not hide the others.
        registry = conf.getProperty("registry");
        group = conf.getProperty("group");
        proxyid = parseProxyId(conf.getProperty("proxyid"));
    }

    /**
     * Parses the raw {@code proxyid} property.
     *
     * @param raw the property text, may be {@code null}
     * @return the parsed id, or {@code null} when {@code raw} is blank or not a valid long
     */
    private static Long parseProxyId(String raw) {
        if (raw == null || raw.trim().isEmpty()) {
            return null;
        }
        try {
            return Long.valueOf(raw.trim());
        } catch (NumberFormatException e) {
            System.err.println("ProxyConfig: proxyid is not a valid number: " + raw);
            return null;
        }
    }
}
package com.zhiwei.source_forward.crawler; package com.zhiwei.source_forward.crawler;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Objects; import java.util.Objects;
import org.apache.logging.log4j.LogManager; import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.LogManager;
import org.jsoup.Jsoup; import org.apache.logging.log4j.Logger;
import org.jsoup.nodes.Node; import org.jsoup.Jsoup;
import org.jsoup.nodes.Node;
import com.zhiwei.async.GroupSync;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.async.GroupSync;
import com.zhiwei.crawler.proxy.ProxyHolder; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.utils.RequestUtils; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean; import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution; import com.zhiwei.source_forward.bean.MediaSelfSourceBean;
import com.zhiwei.source_forward.util.MatchChannel; import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution;
import com.zhiwei.source_forward.util.MatchSource; import com.zhiwei.source_forward.util.MatchChannel;
import com.zhiwei.source_forward.util.MediaSelfSourceDataCallBack; import com.zhiwei.source_forward.util.MatchSource;
import com.zhiwei.source_forward.util.MediaSelfSourceDataCallBack;
import okhttp3.Request;
import okhttp3.Request;
/**
* /**
* @ClassName MediaSelfSourceCrawler *
* @Description 自媒体号匹配 * @ClassName MediaSelfSourceCrawler
* @author byte-zbs * @Description 自媒体号匹配
* @Date 2018年8月21日 下午3:54:03 * @author byte-zbs
* @version 1.0.0 * @Date 2018年8月21日 下午3:54:03
*/ * @version 1.0.0
public class MediaSelfSourceCrawler { */
public class MediaSelfSourceCrawler {
private static final Logger logger = LogManager.getLogger(MediaSelfSourceCrawler.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build(); private static final Logger logger = LogManager.getLogger(MediaSelfSourceCrawler.class);
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();
/**
* /**
* @Description 链接传入 并 返回采集完信号 *
* @param callback * @Description 链接传入 并 返回采集完信号
* @param urls * @param callback
* @return * @param urls
* @throws Exception * @return
*/ * @throws Exception
public GroupSync submitTask(MediaSelfSourceDataCallBack callback,String... urls) { */
try { public GroupSync submitTask(MediaSelfSourceDataCallBack callback,String... urls) {
GroupSync counter = new GroupSync(); try {
start(counter, callback, urls); GroupSync counter = new GroupSync();
return counter; start(counter, callback, urls);
} catch (Exception e) { return counter;
logger.error(" exception {}",e); } catch (Exception e) {
return null; logger.error(" exception {}",e);
} return null;
} }
}
/**
* /**
* @Description 提交链接 *
* @param counter * @Description 提交链接
* @param callback * @param counter
* @param urls * @param callback
*/ * @param urls
private void start(GroupSync counter,MediaSelfSourceDataCallBack callback, String... urls) { */
if (urls != null && urls.length > 0) { private void start(GroupSync counter,MediaSelfSourceDataCallBack callback, String... urls) {
for (String url : urls) { if (urls != null && urls.length > 0) {
counter.add(); for (String url : urls) {
if (url != null) { counter.add();
try { if (url != null) {
search(counter, url, Attribution.of(url), callback); try {
} catch (Exception e) { search(counter, url, Attribution.of(url), callback);
logger.error("搜索创建出错", e); } catch (Exception e) {
} logger.error("搜索创建出错", e);
} }
counter.done(); }
} counter.done();
} }
} }
}
/**
* /**
* @Description 链接获取文章信息 *
* @param counter * @Description 链接获取文章信息
* @param url * @param counter
* @param attr * @param url
* @param callback * @param attr
* @return * @param callback
*/ * @return
private GroupSync search(GroupSync counter, String url,Attribution attr, MediaSelfSourceDataCallBack callback) { */
logger.info("当前处理 URL: {}", url); private GroupSync search(GroupSync counter, String url,Attribution attr, MediaSelfSourceDataCallBack callback) {
Map<String,Object> map = new HashMap<>(); logger.info("当前处理 URL: {}", attr.get());
if(url.contains("toutiao.com")) { Map<String,Object> map = new HashMap<>();
map.put("referer", url); if(url.contains("toutiao.com")) {
} map.put("referer", url);
map.put("Connection", "close"); }
url = dealUrl(url); url = dealUrl(url);
if(Objects.nonNull(url)) { if(Objects.nonNull(url)) {
Request request = RequestUtils.wrapGet(url, map); Request request = RequestUtils.wrapGet(url, map);
counter.add(); counter.add();
httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY).whenComplete((rs,ex) -> { httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY).whenComplete((rs,ex) -> {
try { try {
if (Objects.isNull(ex)) { if (Objects.isNull(ex)) {
try { try {
parseHtml(rs.body().string(), attr, callback); parseHtml(rs.body().string(), attr, callback);
} catch (Exception e) { } catch (Exception e) {
logger.error("解析出错", e); logger.error("解析出错", e);
} }
} else { } else {
logger.info("{} 搜索结果访问失败: {}", request.url().url(),ex); logger.info("{} 搜索结果访问失败: {}", request.url().url(),ex);
} }
} finally { } finally {
counter.done(); counter.done();
} }
}); });
} }
return counter; return counter;
} }
/** /**
** 链接处理 ** 链接处理
* @param url * @param url
* @return * @return
* @return String * @return String
*/ */
private String dealUrl(String url) { private String dealUrl(String url) {
try { try {
if(url.startsWith("http")) { if(url.startsWith("http")) {
if(url.contains("wap.peopleapp.com/article")) { if(url.contains("wap.peopleapp.com/article")) {
return "https://app.peopleapp.com/WapApi/610/ArtInfoApi/getInfoUp?id=" + url.split("article")[1].split("/")[1]; return "https://app.peopleapp.com/WapApi/610/ArtInfoApi/getInfoUp?id=" + url.split("article")[1].split("/")[1];
} }
if(url.contains("a.mp.uc.cn/") && url.contains("wm_cid=")) { if(url.contains("a.mp.uc.cn/") && url.contains("wm_cid=")) {
return "https://ff.dayu.com/contents/" + url.split("wm_cid=")[1].split("&")[0] + "?biz_id=1002&_fetch_author=1"; return "https://ff.dayu.com/contents/" + url.split("wm_cid=")[1].split("&")[0] + "?biz_id=1002&_fetch_author=1";
} }
if(url.contains("tznew.58.com")) { if(url.contains("tznew.58.com")) {
return "https://tznew.58.com/tznew/c/info-detail?infoid=" + url.split("infoid=")[1].split("&")[0]; return "https://tznew.58.com/tznew/c/info-detail?infoid=" + url.split("infoid=")[1].split("&")[0];
} }
if(url.contains("wap.peopleapp.com/article")) { if(url.contains("wap.peopleapp.com/article")) {
return "https://app.peopleapp.com/WapApi/610/ArtInfoApi/getInfoUp?id=" + url.split("article")[1].split("/")[1]; return "https://app.peopleapp.com/WapApi/610/ArtInfoApi/getInfoUp?id=" + url.split("article")[1].split("/")[1];
} }
return url; return url;
} }
} catch (Exception e) { } catch (Exception e) {
} }
return null; return null;
} }
/** /**
* *
* @Description 解析文章获取相关数据 * @Description 解析文章获取相关数据
* @param response * @param attr
* @param attr * @param callback
* @param callback */
*/ private void parseHtml(String result, Attribution attr,
private void parseHtml(String result, Attribution attr, MediaSelfSourceDataCallBack callback) {
MediaSelfSourceDataCallBack callback) { String source = null;
String source = null; String channel = null;
String channel = null; String url = attr.get().toString();
try { try {
source = MatchSource.matchMediaSelfSource(attr.get().toString(),result); source = MatchSource.matchMediaSelfSource(url,result);
if(source==null || source.equals("")){ logger.info(url+"=======" + source);
source = null; channel = MatchChannel.verifyChannel(url);
} if(channel==null){
channel = MatchChannel.verifyChannel(attr.get().toString()); List<Node> nodeList = Jsoup.parse(result).head().childNodes();
if(channel==null){ channel = MatchChannel.matchChannel(nodeList);
List<Node> nodeList = Jsoup.parse(result).head().childNodes(); }
channel = MatchChannel.matchChannel(nodeList); } catch (Exception e) {
} logger.error("exception ",e);
} catch (Exception e) { source = null;
logger.error("exception ",e); }
source = null; MediaSelfSourceBean msfb = new MediaSelfSourceBean(url, source, channel);
} if (callback == null) {
logger.info(attr.get()+"=================来源" + source); logger.warn("DataCallback 对象为 null,无法保存数据");
MediaSelfSourceBean msfb = new MediaSelfSourceBean(attr.get().toString(), source, channel); } else {
if (callback == null) { callback.onData(msfb, attr);
logger.warn("DataCallback 对象为 null,无法保存数据"); }
} else {
callback.onData(msfb, attr); }
}
}
}
}
package com.zhiwei.source_forward.run; package com.zhiwei.source_forward.run;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Map.Entry; import java.util.Map.Entry;
import org.apache.logging.log4j.LogManager; import com.zhiwei.source_forward.util.ProxyInit;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory; import com.zhiwei.source_forward.bean.SourceForwardBean;
import com.zhiwei.source_forward.bean.SourceForwardBean; import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution;
import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution; import com.zhiwei.source_forward.crawler.SourceForwardCrawler;
import com.zhiwei.source_forward.crawler.SourceForwardCrawler; import com.zhiwei.source_forward.util.SourceForwardDataCallBack;
import com.zhiwei.source_forward.util.SourceForwardDataCallBack;
/**
/** * @ClassName: SourceForward
* @ClassName: SourceForward * @Description: 验证文章是否为转发
* @Description: 验证文章是否为转发 * @author hero
* @author hero * @date 2017年12月5日 下午7:03:57
* @date 2017年12月5日 下午7:03:57 */
*/ public class SourceForward {
public class SourceForward {
private static Logger logger = LogManager.getLogger(SourceForward.class);
private static Logger logger = LogManager.getLogger(SourceForward.class);
/**
/** * @Title: getSourceForward
* @Title: getSourceForward * @author hero
* @author hero * @Description: 验证文章是否转发
* @Description: 验证文章是否转发 * @param @param dataMap
* @param @param dataMap * @param @return 设定文件
* @param @return 设定文件 * @return Map<String,Map<String,Object>> 返回类型
* @return Map<String,Map<String,Object>> 返回类型 */
*/ public static Map<String,Map<String,Object>> getSourceForward(Map<String,Map<String,Object>> dataMap){
public static Map<String,Map<String,Object>> getSourceForward(Map<String,Map<String,Object>> dataMap){ //启动验证来源程序
//启动验证来源程序 List<String> urlList = new ArrayList<>();
List<String> urlList = new ArrayList<>(); for(Entry<String,Map<String,Object>> entry : dataMap.entrySet()){
for(Entry<String,Map<String,Object>> entry : dataMap.entrySet()){ urlList.add(entry.getKey());
urlList.add(entry.getKey()); }
} List<SourceForwardBean> dataList = SourceForwardCrawlerThread.getSourceForward(urlList);
List<SourceForwardBean> dataList = SourceForwardCrawlerThread.getSourceForward(urlList); for(SourceForwardBean sfb : dataList){
for(SourceForwardBean sfb : dataList){ String url = sfb.getUrl();
String url = sfb.getUrl(); String root_source = sfb.getRoot_source();
String root_source = sfb.getRoot_source(); //整合数据及验证转发原创
//整合数据及验证转发原创 if(dataMap.containsKey(url)){
if(dataMap.containsKey(url)){ Map<String,Object> data = dataMap.get(url);
Map<String,Object> data = dataMap.get(url); String source = data.get("来源")+"";
String source = data.get("来源")+""; String isForward = "转发";
String isForward = "转发"; if(root_source == null || root_source.toUpperCase().trim().equals(source.toUpperCase().trim())){
if(root_source == null || root_source.toUpperCase().trim().equals(source.toUpperCase().trim())){ isForward = "原创";
isForward = "原创"; }
}
if(url.contains("mp.weixin.qq.com")){
if(url.contains("mp.weixin.qq.com")){ isForward = sfb.getIsforward();
isForward = sfb.getIsforward(); }else{
}else{ data.put("原来源", root_source);
data.put("原来源", root_source); data.put("频道", sfb.getChannel());
data.put("频道", sfb.getChannel()); }
}
data.put("是否转发", isForward);
data.put("是否转发", isForward); dataMap.put(url, data);
dataMap.put(url, data); }
} }
} return dataMap;
return dataMap; }
}
/**
/** *
* * @Description 批量传入链接获取数据
* @Description 批量传入链接获取数据 * @param urlList
* @param urlList * @return
* @return */
*/ public static List<SourceForwardBean> getSourceForward(List<String> urlList){
public static List<SourceForwardBean> getSourceForward(List<String> urlList){ //启动获取链接来源
//启动获取链接来源 return SourceForwardCrawlerThread.getSourceForward(urlList);
return SourceForwardCrawlerThread.getSourceForward(urlList); }
}
public static void main(String[] args) {
public static void main(String[] args) { ProxyInit.initProxy();
ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER, 10000002); List<String> urlList = new ArrayList<>();
List<String> urlList = new ArrayList<>(); urlList.add("http://software.it168.com/a2019/0621/6005/000006005693.shtml");
urlList.add("http://software.it168.com/a2019/0621/6005/000006005693.shtml"); List<SourceForwardBean> da = SourceForward.getSourceForward(urlList);
List<SourceForwardBean> da = SourceForward.getSourceForward(urlList); for(SourceForwardBean sfb : da) {
for(SourceForwardBean sfb : da) { System.out.println(sfb.toString());
System.out.println(sfb.toString()); }
} }
}
static class SourceForwardCrawlerThread extends Thread{
static class SourceForwardCrawlerThread extends Thread{
private static List<SourceForwardBean> getSourceForward(List<String> urlList){
private static List<SourceForwardBean> getSourceForward(List<String> urlList){ List<SourceForwardBean> list = Collections.synchronizedList(new ArrayList<SourceForwardBean>());
List<SourceForwardBean> list = Collections.synchronizedList(new ArrayList<SourceForwardBean>()); try{
try{ SourceForwardCrawler crawler = new SourceForwardCrawler();
SourceForwardCrawler crawler = new SourceForwardCrawler(); SourceForwardDataCallBack callback = new SourceForwardDataCallBack() {
SourceForwardDataCallBack callback = new SourceForwardDataCallBack() {
@Override
@Override public void onData(SourceForwardBean data, Attribution attr) {
public void onData(SourceForwardBean data, Attribution attr) { list.add(data);
list.add(data); logger.info("列表大小:::{}",list.size());
logger.info("列表大小:::{}",list.size()); }
}
};
}; crawler.submitTask(callback,urlList.toArray(new String[urlList.size()])).await();
crawler.submitTask(callback,urlList.toArray(new String[urlList.size()])).await(); }catch (Exception e){
}catch (Exception e){ logger.error(" 来源判断 出错 {} ",e);
logger.error(" 来源判断 出错 {} ",e); }
} return list;
return list; }
} }
}
}
}
package com.zhiwei.source_forward.run; package com.zhiwei.source_forward.run;
import java.io.IOException; import java.util.ArrayList;
import java.net.Proxy; import java.util.Collections;
import java.util.ArrayList; import java.util.List;
import java.util.Collections; import java.util.Map;
import java.util.List; import java.util.Map.Entry;
import java.util.Map;
import java.util.Map.Entry; import com.zhiwei.source_forward.util.ProxyInit;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.Logger;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.async.TaskBoot; import com.zhiwei.source_forward.bean.UrlLiveBean;
import com.zhiwei.common.config.GroupType; import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.source_forward.crawler.UrlLiveCrawler;
import com.zhiwei.crawler.proxy.ProxyFactory; import com.zhiwei.source_forward.util.UrlLiveDataCallback;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils; /**
import com.zhiwei.source_forward.bean.UrlLiveBean; * @ClassName: URLLive
import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution; * @Description: 验证链接是否已删除
import com.zhiwei.source_forward.crawler.UrlLiveCrawler; * @author hero
import com.zhiwei.source_forward.crawler.UrlLiveCrawlerNew; * @date 2017年12月6日 上午9:22:49
import com.zhiwei.source_forward.util.UrlLiveDataCallback; */
public class URLLive {
import okhttp3.Request;
import okhttp3.Response; private static Logger logger = LogManager.getLogger(URLLive.class);
/** private static HttpBoot httpBoot = new HttpBoot.Builder().build();
* @ClassName: URLLive
* @Description: 验证链接是否已删除 /**
* @author hero * @Title: verificationURLLive
* @date 2017年12月6日 上午9:22:49 * @author hero
*/ * @Description: 验证数据是否已删除
public class URLLive { * @param @param dataMap
* @param @return 设定文件
private static Logger logger = LogManager.getLogger(URLLive.class); * @return Map<String,Map<String,Object>> 返回类型
*/
private static HttpBoot httpBoot = new HttpBoot.Builder().build(); public static Map<String,Map<String,Object>> verificationURLLive(Map<String,Map<String,Object>> dataMap){
List<String> urlList = new ArrayList<>();
/** //启动验证链接是否有效程序程序
* @Title: verificationURLLive for(Entry<String,Map<String,Object>> entry : dataMap.entrySet()){
* @author hero urlList.add(entry.getKey());
* @Description: 验证数据是否已删除 }
* @param @param dataMap System.out.println(urlList.size());
* @param @return 设定文件 //验证数据是否已删除
* @return Map<String,Map<String,Object>> 返回类型 List<UrlLiveBean> dataList = UrlLiveCrawlerThread.getUrlLiveCrawle(urlList);
*/ for(UrlLiveBean ub : dataList){
public static Map<String,Map<String,Object>> verificationURLLive(Map<String,Map<String,Object>> dataMap){ String url = ub.getUrl();
List<String> urlList = new ArrayList<>(); int i = ub.isLive();
//启动验证链接是否有效程序程序 if(dataMap.containsKey(url)){
for(Entry<String,Map<String,Object>> entry : dataMap.entrySet()){ Map<String,Object> map = dataMap.get(url);
urlList.add(entry.getKey()); if(i == 1) {
} map.put("是否删除", true);
System.out.println(urlList.size()); }else if(i == 0) {
//验证数据是否已删除 map.put("是否删除", false);
List<UrlLiveBean> dataList = UrlLiveCrawlerThread.getUrlLiveCrawle(urlList); }
for(UrlLiveBean ub : dataList){ map.put("title", ub.getTitle());
String url = ub.getUrl(); dataMap.put(url, map);
int i = ub.isLive(); }
if(dataMap.containsKey(url)){ }
Map<String,Object> map = dataMap.get(url); return dataMap;
if(i == 1) { }
map.put("是否删除", true);
}else if(i == 0) { /**
map.put("是否删除", false); *
} * @Description (TODO这里用一句话描述这个方法的作用)
map.put("title", ub.getTitle()); * @param urlList
dataMap.put(url, map); * @return UrlLiveBean 1 已删除 2 未删除 -1 访问失败
} */
} public static List<UrlLiveBean> verificationURLLive(List<String> urlList){
return dataMap; //启动验证链接是否有效程序程序
} return UrlLiveCrawlerThread.getUrlLiveCrawle(urlList);
}
/**
* public static void main(String[] args) {
* @Description (TODO这里用一句话描述这个方法的作用) ProxyInit.initProxy();
* @param urlList List<String> urlList = new ArrayList<>();
* @return UrlLiveBean 1 已删除 2 未删除 -1 访问失败 urlList.add("http://a.mp.uc.cn/article.html?uc_param_str=frdnsnpfvecpntnwprdssskt#!wm_aid=038b8207b444418c845f43e4d2d3a754");
*/
public static List<UrlLiveBean> verificationURLLive(List<String> urlList){ List<UrlLiveBean> u = URLLive.verificationURLLive(urlList);
//启动验证链接是否有效程序程序 for(UrlLiveBean b : u) {
return UrlLiveCrawlerThread.getUrlLiveCrawle(urlList); System.out.println(b.toString());
} }
}
public static void main(String[] args) {
ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER, 10000002); static class UrlLiveCrawlerThread extends Thread{
List<String> urlList = new ArrayList<>();
urlList.add("http://a.mp.uc.cn/article.html?uc_param_str=frdnsnpfvecpntnwprdssskt#!wm_aid=038b8207b444418c845f43e4d2d3a754"); private static List<UrlLiveBean> getUrlLiveCrawle(List<String> urlList){
List<UrlLiveBean> list = Collections.synchronizedList(new ArrayList<UrlLiveBean>());
List<UrlLiveBean> u = URLLive.verificationURLLive(urlList); try{
for(UrlLiveBean b : u) { UrlLiveCrawler crawler = new UrlLiveCrawler();
System.out.println(b.toString()); UrlLiveDataCallback callback = new UrlLiveDataCallback() {
}
} @Override
public void onData(UrlLiveBean data, Attribution attr) {
static class UrlLiveCrawlerThread extends Thread{ list.add(data);
System.out.println("列表大小:::"+list.size());
private static List<UrlLiveBean> getUrlLiveCrawle(List<String> urlList){ }
List<UrlLiveBean> list = Collections.synchronizedList(new ArrayList<UrlLiveBean>());
try{ };
UrlLiveCrawler crawler = new UrlLiveCrawler(); crawler.submitTask(callback,urlList.toArray(new String[urlList.size()])).await();
UrlLiveDataCallback callback = new UrlLiveDataCallback() { }catch (Exception e){
logger.error(" 数据采集运行有问题 {} ", e);
@Override }
public void onData(UrlLiveBean data, Attribution attr) { return list;
list.add(data); }
System.out.println("列表大小:::"+list.size()); }
}
}
};
crawler.submitTask(callback,urlList.toArray(new String[urlList.size()])).await();
}catch (Exception e){
logger.error(" 数据采集运行有问题 {} ", e);
}
return list;
}
}
}
package com.zhiwei.source_forward.util; package com.zhiwei.source_forward.util;
import java.util.List; import java.util.List;
import java.util.Objects; import java.util.Objects;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import org.jsoup.Jsoup; import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Document; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.source_forward.content.ContentExtractor; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.source_forward.content.News; import com.zhiwei.source_forward.content.ContentExtractor;
import com.zhiwei.source_forward.content.News;
/**
* @ClassName: MatchSource /**
* @Description: 匹配来源 * @ClassName: MatchSource
* @author hero * @Description: 匹配来源
* @date 2018年6月30日 上午10:27:29 * @author hero
*/ * @date 2018年6月30日 上午10:27:29
public class MatchSource { */
private static String fromRegex = "(来源:(\\S)*(\\s)*[\\S]*)|(来源:(\\S)*(\\s)*[\\S]*)|(本文来源于(\\S)*(\\s)*[\\S]*)" public class MatchSource {
+ "|(源:(\\S)*(\\s)*[\\S]*)|(来自:(\\S)*(\\s)*[\\S]*)|(来自:(\\S)*(\\s)*[\\S]*)" private static String fromRegex = "(来源:(\\S)*(\\s)*[\\S]*)|(来源:(\\S)*(\\s)*[\\S]*)|(本文来源于(\\S)*(\\s)*[\\S]*)"
+ "|(\\[来源\\]:(\\S)*(\\s)*[\\S]*)|(\\[来源\\]:(\\S)*(\\s)*[\\S]*)" + "|(源:(\\S)*(\\s)*[\\S]*)|(来自:(\\S)*(\\s)*[\\S]*)|(来自:(\\S)*(\\s)*[\\S]*)"
+ "|(出自:(\\S)*(\\s)*[\\S]*)|(出自:(\\S)*(\\s)*[\\S]*)" + "|(转自:(\\S)*(\\s)*[\\S]*)|(转自:(\\S)*(\\s)*[\\S]*)" + "|(\\[来源\\]:(\\S)*(\\s)*[\\S]*)|(\\[来源\\]:(\\S)*(\\s)*[\\S]*)"
+ "|(出处\\/作者:(\\S)*(\\s)*[\\S]*)|(出处\\/作者:(\\S)*(\\s)*[\\S]*)" + "|(出自:(\\S)*(\\s)*[\\S]*)|(出自:(\\S)*(\\s)*[\\S]*)" + "|(转自:(\\S)*(\\s)*[\\S]*)|(转自:(\\S)*(\\s)*[\\S]*)"
+ "|(出处:(\\S)*(\\s)*[\\S]*)|(出处:(\\S)*(\\s)*[\\S]*)|(稿源:(\\S)*(\\s)*[\\S]*)|(出处:(\\S)*(\\s)*[\\S]*)"; + "|(出处\\/作者:(\\S)*(\\s)*[\\S]*)|(出处\\/作者:(\\S)*(\\s)*[\\S]*)"
+ "|(出处:(\\S)*(\\s)*[\\S]*)|(出处:(\\S)*(\\s)*[\\S]*)|(稿源:(\\S)*(\\s)*[\\S]*)|(出处:(\\S)*(\\s)*[\\S]*)";
private static String timeRegex = ""
+ "([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})" private static String timeRegex = ""
+ "|([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-2]?[1-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-9]{1,2})" + "([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})"
+ "|(\\d{4}年\\d{0,2}月\\d{0,2}日\\D+\\d{0,2}\\D+\\d{0,2})" + "|([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-2]?[1-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-9]{1,2})"
+ "|(\\d{4}年\\d{0,2}月\\d{0,2}日\\D+\\d{0,2})" + "|(\\d{4}年\\d{0,2}月\\d{0,2}日\\D+\\d{0,2}\\D+\\d{0,2})"
+ "|(\\d{4}年\\d{0,2}月\\d{0,2}日)" + "|(\\d{4}年\\d{0,2}月\\d{0,2}日\\D+\\d{0,2})"
+ "|(\\d{0,2}月\\d{0,2}日\\D+\\d{0,2}\\D+\\d{0,2})" + "|(\\d{4}年\\d{0,2}月\\d{0,2}日)"
+ "|(\\d{0,2}月\\d{0,2}日\\D+\\d{0,2})" + "|(\\d{0,2}月\\d{0,2}日\\D+\\d{0,2}\\D+\\d{0,2})"
+ "|(\\d{0,2}月\\d{0,2}日)" + "|(\\d{0,2}月\\d{0,2}日\\D+\\d{0,2})"
+ "|(\\d{4}\\D+\\d{0,2}\\D+\\d{0,2}\\D+\\d{0,2}\\D+\\d{0,2})" + "|(\\d{0,2}月\\d{0,2}日)"
+ "|(\\d{4}\\D+\\d{0,2}\\D+\\d{0,2}\\D+\\d{0,2})" + "|(\\d{4}\\D+\\d{0,2}\\D+\\d{0,2}\\D+\\d{0,2}\\D+\\d{0,2})"
+ "|(\\d{4}\\D+\\d{0,2}\\D+\\d{0,2})" + "|(\\d{4}\\D+\\d{0,2}\\D+\\d{0,2}\\D+\\d{0,2})"
; + "|(\\d{4}\\D+\\d{0,2}\\D+\\d{0,2})"
/** ;
* @Title: findURLs /**
* @author hero * @Title: findURLs
* @Description: (验证并匹配数据) * @author hero
* @param @param * @Description: (验证并匹配数据)
* s * @param @param
* @param @param * s
* regex * @param @param
* @param @return * regex
* 设定文件 * @param @return
* @return String 返回类型 * 设定文件
*/ * @return String 返回类型
public static String matchSource(String url,String html, List<String> sourceList) { */
String source = null; public static String matchSource(String url,String html, List<String> sourceList) {
Document document = Jsoup.parse(html); String source = null;
String htmlBody = TreateData.filterSpecialCharacter(document.select("body").text().toUpperCase()); Document document = Jsoup.parse(html);
try { String htmlBody = TreateData.filterSpecialCharacter(document.select("body").text().toUpperCase());
/***特定网站单独处理**/ try {
if(url.contains("thepaper.cn")){ /***特定网站单独处理**/
//单独处理澎湃数据 if(url.contains("thepaper.cn")){
source = document.select("div.news_about").select("p").select("span").text().replaceAll(".*来源:", ""); //单独处理澎湃数据
if(source.length() == 0) { source = document.select("div.news_about").select("p").select("span").text().replaceAll(".*来源:", "");
source = document.select("div.news_about").text().replaceAll(" \\d{4}.*|.*/", ""); if(StringUtils.isNotBlank(source)) {
} source = document.select("div.news_about").text().replaceAll(" \\d{4}.*|.*/", "");
}else if(url.contains("sports.eastday.com")){ }
//单独处理东方体育网 }else if(url.contains("sports.eastday.com")){
source = document.select("div.article").select("span").text(); //单独处理东方体育网
}else if(url.contains("lesports.com")){ source = document.select("div.article").select("span").text();
//单独处理乐视网数据 }else if(url.contains("lesports.com")){
source = document.select("div.article-source").select("strong").text(); //单独处理乐视网数据
}else if(url.contains("myzaker.com")){ source = document.select("div.article-source").select("strong").text();
//单独处理扎克网数据 }else if(url.contains("myzaker.com")){
source = document.select("div#article").select("span.auther").text(); //单独处理扎克网数据
}else if(url.contains("sina.com.cn") || url.contains("sohu.com")){ source = document.select("div#article").select("span.auther").text();
//单独处理新浪网 }else if(url.contains("sina.com.cn") || url.contains("sohu.com")){
if(html.contains("<meta name=\"mediaid\"")){ //单独处理新浪网
source = html.split("<meta name=\"mediaid\" content=\"")[1].split("\"")[0]; if(html.contains("<meta name=\"mediaid\"")){
} source = html.split("<meta name=\"mediaid\" content=\"")[1].split("\"")[0];
}else if(url.contains("a.mini.eastday.com")){ }
//处理东方头条网-自媒体号匹配 }else if(url.contains("a.mini.eastday.com")){
// source = document.select("[class=\"share_cnt_p clearfix\"]").select("div.fl").select("i").get(1).text(); //处理东方头条网-自媒体号匹配
source = "东方头条"; // source = document.select("[class=\"share_cnt_p clearfix\"]").select("div.fl").select("i").get(1).text();
}else if(url.contains("orz520.com")){ source = "东方头条";
//千寻生活网解析 }else if(url.contains("orz520.com")){
source = "千寻生活"; //千寻生活网解析
}else if(url.contains("sh.qihoo.com")){ source = "千寻生活";
//今日报点解析 }else if(url.contains("sh.qihoo.com")){
source = "今日爆点"; //今日报点解析
}else if(url.contains("itouchtv.cn")){ source = "今日爆点";
//触电新闻解析 }else if(url.contains("itouchtv.cn")){
source = "触电新闻"; //触电新闻解析
}else if(url.contains("yidianzixun.com")){ source = "触电新闻";
//一点资讯 }else if(url.contains("yidianzixun.com")){
if(html.contains("related_wemedia")){ //一点资讯
source = "一点资讯"; if(html.contains("related_wemedia")){
}else{ source = "一点资讯";
source = html.split("source\":\"")[1].split("\",\"")[0]; }else{
} source = html.split("source\":\"")[1].split("\",\"")[0];
}else if(url.contains("tech.china.com")){ }
//中华网科技 }else if(url.contains("tech.china.com")){
source = document.select("#chan_newsInfo").text().split("来源:")[1]; //中华网科技
}else if(url.contains("caijing.com.cn")){ source = document.select("#chan_newsInfo").text().split("来源:")[1];
//财经网产经 }else if(url.contains("caijing.com.cn")){
source = document.select("#source_baidu").text(); //财经网产经
}else if(url.contains("news.eastday.com")){ source = document.select("#source_baidu").text();
//单独处理东方网 }else if(url.contains("news.eastday.com")){
source = document.select("div#sectionleft").select("div").select("p").select("a").text(); //单独处理东方网
}else if(url.contains("ny.chinacenn.com")){ source = document.select("div#sectionleft").select("div").select("p").select("a").text();
//单独处理中企网 }else if(url.contains("ny.chinacenn.com")){
source = document.select("td").select("span.ltutext3").text().replaceAll(" \\d{4}.*", ""); //单独处理中企网
}else if(url.contains("ebrun.com")){ source = document.select("td").select("span.ltutext3").text().replaceAll(" \\d{4}.*", "");
//单独处理亿邦动力网 }else if(url.contains("ebrun.com")){
source = document.select("div.post-header").select("p.source").select("span.f-left").text().replaceAll(".*来源: ", ""); //单独处理亿邦动力网
}else if(url.contains("www.mnw.cn")){ source = document.select("div.post-header").select("p.source").select("span.f-left").text().replaceAll(".*来源: ", "");
//单独处理闽南网 }else if(url.contains("www.mnw.cn")){
source = document.select("div.il").select("span").text().replaceAll("来源:|\\d{4}.*", ""); //单独处理闽南网
}else if(url.contains("sn.cri.cn")){ source = document.select("div.il").select("span").text().replaceAll("来源:|\\d{4}.*", "");
//单独处理国际在线 }else if(url.contains("sn.cri.cn")){
source = document.select("span.asource").select("a").text(); //单独处理国际在线
}else if(url.contains("sh.sina.com.cn")){ source = document.select("span.asource").select("a").text();
//单独处理新浪上海 }else if(url.contains("sh.sina.com.cn")){
source = document.select("p.source-time").select("span").get(1).select("a").text(); //单独处理新浪上海
}else if(url.contains("kaixian.tv")){ source = document.select("p.source-time").select("span").get(1).select("a").text();
//单独处理汉丰网 }else if(url.contains("kaixian.tv")){
source = document.select("div.content").select("h2.font_gray").text().replaceAll(".*来源:", ""); //单独处理汉丰网
}else if(url.contains("lanjingtmt.com")){ source = document.select("div.content").select("h2.font_gray").text().replaceAll(".*来源:", "");
//单独处理蓝鲸TMT }else if(url.contains("lanjingtmt.com")){
source = "蓝鲸TMT网"; //单独处理蓝鲸TMT
}else if(url.contains("tech.huanqiu.com")){ source = "蓝鲸TMT网";
//单独处理环球网 }else if(url.contains("tech.huanqiu.com")){
source = document.select("span.la_t_b").select("a").text(); //单独处理环球网
}else if(url.contains("china.qianlong.com")){ source = document.select("span.la_t_b").select("a").text();
//单独处理千龙网 }else if(url.contains("china.qianlong.com")){
source = document.select("span.source").select("a").text(); //单独处理千龙网
}else if(url.contains("m.mnw.cn")){ source = document.select("span.source").select("a").text();
//单独处理手机闽南网 }else if(url.contains("m.mnw.cn")){
source = document.select("article.info").select("header").select("div").select("span").text().replaceAll("\\d{4}.*| ", ""); //单独处理手机闽南网
}else if(url.contains("mydrivers.com")){ source = document.select("article.info").select("header").select("div").select("span").text().replaceAll("\\d{4}.*| ", "");
//单独处理快科技 }else if(url.contains("mydrivers.com")){
source = document.select("div.news_bt1_left").text().replaceAll(".*出处:| 作者:[\\w\\W]*", ""); //单独处理快科技
}else if(url.contains("3dmgame.com")){ source = document.select("div.news_bt1_left").text().replaceAll(".*出处:| 作者:[\\w\\W]*", "");
//单独处理3DMGAME }else if(url.contains("3dmgame.com")){
source = document.select("ul.intem").select("li").select("span.weibo").text(); //单独处理3DMGAME
}else if(url.contains("99it.com.cn")){ source = document.select("ul.intem").select("li").select("span.weibo").text();
//单独处理99科技 }else if(url.contains("99it.com.cn")){
source = document.select("div.mate").select("span").text().replaceAll(".*来源:|编辑.*", ""); //单独处理99科技
}else if(url.contains("ciotimes.com")){ source = document.select("div.mate").select("span").text().replaceAll(".*来源:|编辑.*", "");
//单独处理CIO时代网 }else if(url.contains("ciotimes.com")){
source = document.select("p.ly.visible-xs.text-left").text().replaceAll(".*来源:", ""); //单独处理CIO时代网
}else if(url.contains("ithome.com")){ source = document.select("p.ly.visible-xs.text-left").text().replaceAll(".*来源:", "");
//单独处理IT之家 }else if(url.contains("ithome.com")){
source = document.select("span#source_baidu").select("a").text(); //单独处理IT之家
}else if(url.contains("techweb.com.cn")){ source = document.select("span#source_baidu").select("a").text();
//单独处理TechWeb }else if(url.contains("techweb.com.cn")){
source = document.select("span.from").select("a").text(); //单独处理TechWeb
}else if(url.contains("cniteyes.com")){ source = document.select("span.from").select("a").text();
//单独处理T客帮 }else if(url.contains("cniteyes.com")){
source = document.select("div.item-date").select("span").text(); //单独处理T客帮
}else if(url.contains("enorth.com.cn")){ source = document.select("div.item-date").select("span").text();
//单独处理北方网 }else if(url.contains("enorth.com.cn")){
source = document.select("p.col-sm-8.info").select("span").text().replaceAll(".*来源:|编辑.*", ""); //单独处理北方网
}else if(url.contains("btime.com")){ source = document.select("p.col-sm-8.info").select("span").text().replaceAll(".*来源:|编辑.*", "");
//单独处理北京时间 }else if(url.contains("btime.com")){
source = document.select("span.col.cite").text(); //单独处理北京时间
}else if(url.contains("bianews.com")){ source = document.select("span.col.cite").text();
//单独处理鞭牛士 }else if(url.contains("bianews.com")){
source = document.select("span.name.fl").text(); //单独处理鞭牛士
}else if(url.contains("dzwww.com")){ source = document.select("span.name.fl").text();
//单独处理大众网 }else if(url.contains("dzwww.com")){
source = document.select("div.layout").select("div.left").text().replaceAll(".*来源: |作者.*", ""); //单独处理大众网
}else if(url.contains("dsb.cn")){ source = document.select("div.layout").select("div.left").text().replaceAll(".*来源: |作者.*", "");
//单独处理电商报 }else if(url.contains("dsb.cn")){
source = document.select("div.new-content-info.clearfix").select("span").text().replaceAll(".*作者:", ""); //单独处理电商报
}else if(url.contains("finance.eastmoney.com")){ source = document.select("div.new-content-info.clearfix").select("span").text().replaceAll(".*作者:", "");
//单独处理东方财富网 }else if(url.contains("finance.eastmoney.com")){
source = document.select("div.source.data-source").attr("data-source"); //单独处理东方财富网
}else if(url.contains("emwap.eastmoney.com")){ source = document.select("div.source.data-source").attr("data-source");
//单独处理东方财富网客户端 }else if(url.contains("emwap.eastmoney.com")){
source = document.select("div.where").select("span.source").attr("title"); //单独处理东方财富网客户端
}else if(url.contains("mini.eastday.com")){ source = document.select("div.where").select("span.source").attr("title");
//单独处理东方头条 }else if(url.contains("mini.eastday.com")){
source = document.select("div.article-src-time").select("span").text().replaceAll(".*来源:", ""); //单独处理东方头条
}else if(url.contains("tech.ifeng.com")){ source = document.select("div.article-src-time").select("span").text().replaceAll(".*来源:", "");
//单独处理凤凰科技 }else if(url.contains("tech.ifeng.com")){
source = document.select("p.p_time").select("span").select("span.ss03").text(); //单独处理凤凰科技
}else if(url.contains("finance.ifeng.com")){ source = document.select("p.p_time").select("span").select("span.ss03").text();
//单独处理凤凰网 }else if(url.contains("finance.ifeng.com")){
source = document.select("p.p_time").select("span").select("span").select("a").text(); //单独处理凤凰网
if(Objects.isNull(source) || source.length() < 1) { source = document.select("p.p_time").select("span").select("span").select("a").text();
source = html.split("source\":\"")[1].split("\"")[0]; if(Objects.isNull(source) || source.length() < 1) {
} source = html.split("source\":\"")[1].split("\"")[0];
}else if(url.contains("iphone.265g.com")){ }
//单独处理265G网 }else if(url.contains("iphone.265g.com")){
source = document.select("div.article_info").select("span").text().replaceAll(".*来源:|QQ群号.*", ""); //单独处理265G网
}else if(url.contains("yicai.com")){ source = document.select("div.article_info").select("span").text().replaceAll(".*来源:|QQ群号.*", "");
//单独处理第一财经 }else if(url.contains("yicai.com")){
source = document.select("div.title.f-pr").select("p").select("span").text(); //单独处理第一财经
}else if(url.contains("cnblogs.com")){ source = document.select("div.title.f-pr").select("p").select("span").text();
//单独处理博客园 }else if(url.contains("cnblogs.com")){
source = document.select("div#come_from").text().replaceAll(".*来自:", ""); //单独处理博客园
}else if(url.contains("chinaxiaokang.com")){ source = document.select("div#come_from").text().replaceAll(".*来自:", "");
//单独处理中国小康网 }else if(url.contains("chinaxiaokang.com")){
source = document.select("span#arturl").select("a").text(); //单独处理中国小康网
}else if(url.contains("chinabaogao.com")) { source = document.select("span#arturl").select("a").text();
//单独处理中国报告网 }else if(url.contains("chinabaogao.com")) {
source = document.select("p.cbg-a-d-info").select("a").text().replaceAll("大 中 小 | ", ""); //单独处理中国报告网
}else if(url.contains("anyv.net")) { source = document.select("p.cbg-a-d-info").select("a").text().replaceAll("大 中 小 | ", "");
//单独处理爱妮微 }else if(url.contains("anyv.net")) {
source = document.select("span.cor666").select("a").text(); //单独处理爱妮微
}else if(url.contains("yingxiao360.com")){ source = document.select("span.cor666").select("a").text();
//单独处理第一赢销网 }else if(url.contains("yingxiao360.com")){
source = "第一赢销网"; //单独处理第一赢销网
}else if(url.contains("cctime.com")){ source = "第一赢销网";
//单独处理飞象网 }else if(url.contains("cctime.com")){
source = document.select("td.dateAndSource").text().replaceAll(".*\\d{2}|作 者.*| ", ""); //单独处理飞象网
}else if(url.contains("news.hexun.com")){ source = document.select("td.dateAndSource").text().replaceAll(".*\\d{2}|作 者.*| ", "");
//单独处理和讯网 }else if(url.contains("news.hexun.com")){
source = document.select("div.tip.fl").select("a").text(); //单独处理和讯网
}else if(url.contains("finance.jrj.com.cn")){ source = document.select("div.tip.fl").select("a").text();
//单独处理金融界 }else if(url.contains("finance.jrj.com.cn")){
source = document.select("p.inftop").select("span").get(1).select("a").text().replaceAll("价值.*| ", ""); //单独处理金融界
}else if(url.contains("tech.china.com.cn")){ source = document.select("p.inftop").select("span").get(1).select("a").text().replaceAll("价值.*| ", "");
//单独处理中国网 }else if(url.contains("tech.china.com.cn")){
source = document.select("span.fl.time2").select("a").text(); //单独处理中国网
}else if(url.contains("news.china.com.cn")){ source = document.select("span.fl.time2").select("a").text();
//单独处理中国网 }else if(url.contains("news.china.com.cn")){
source = document.select("div.pub_date").select("span#source_baidu").text().replaceAll(".*来源:", ""); //单独处理中国网
}else if(url.contains("admin5.com")){ source = document.select("div.pub_date").select("span#source_baidu").text().replaceAll(".*来源:", "");
//单独处理站长网 }else if(url.contains("admin5.com")){
source = document.select("div.source").select("span").text().replaceAll(".*来源:| ", ""); //单独处理站长网
}else if(url.contains("stock.qq.com")){ source = document.select("div.source").select("span").text().replaceAll(".*来源:| ", "");
//单独处理腾讯证券 }else if(url.contains("stock.qq.com")){
source = document.select("div.a_Info").select("span.a_source").text(); //单独处理腾讯证券
}else if(url.contains("n.cztv.com")){ source = document.select("div.a_Info").select("span.a_source").text();
//单独处理新蓝网 }else if(url.contains("n.cztv.com")){
source = document.select("div.publish").select("ul").select("li").text().replaceAll("\\d{4}.*", ""); //单独处理新蓝网
}else if(url.contains("news.paidai.com")){ source = document.select("div.publish").select("ul").select("li").text().replaceAll("\\d{4}.*", "");
//单独处理派代网 }else if(url.contains("news.paidai.com")){
source = document.select("p.t_info").select("span").select("a").text(); //单独处理派代网
}else if(url.contains("news.mydrivers.com")){ source = document.select("p.t_info").select("span").select("a").text();
//单独处理快科技 }else if(url.contains("news.mydrivers.com")){
source = document.select("div.news_bt1_left").text().replaceAll(".*出处:| 作者.*", ""); //单独处理快科技
}else if(url.contains("www.chinaz.com")){ source = document.select("div.news_bt1_left").text().replaceAll(".*出处:| 作者.*", "");
//单独处理站长之家 }else if(url.contains("www.chinaz.com")){
source = document.select("div.meta").select("span.source").select("a").text(); //单独处理站长之家
}else if(url.contains("yuncaijing.com")){ source = document.select("div.meta").select("span.source").select("a").text();
//单独处理云财经 }else if(url.contains("yuncaijing.com")){
source = document.select("section.news-wrap").select("header").select("div").text().replaceAll(".*消息来源: |\\[阅读原文.*| ", ""); //单独处理云财经
}else if(url.contains("itmsc.cn")){ source = document.select("section.news-wrap").select("header").select("div").text().replaceAll(".*消息来源: |\\[阅读原文.*| ", "");
//单独处理科技传媒网 }else if(url.contains("itmsc.cn")){
source = document.select("div.arc_sc").select("p").select("a").text(); //单独处理科技传媒网
}else if(url.contains("nbd.com.cn")){ source = document.select("div.arc_sc").select("p").select("a").text();
//单独处理每日经济新闻 }else if(url.contains("nbd.com.cn")){
source = document.select("span.source").text(); //单独处理每日经济新闻
}else if(url.contains("pintu360.com")){ source = document.select("span.source").text();
//单独处理品途商业评论 }else if(url.contains("pintu360.com")){
source = "品途商业评论"; //单独处理品途商业评论
}else if(url.contains("news.qudong.com")){ source = "品途商业评论";
//单独处理驱动中国 }else if(url.contains("news.qudong.com")){
source = document.select("div.news_right").select("dd").select("li").select("span").select("a").text().replaceAll(" .*", ""); //单独处理驱动中国
}else if(url.contains("shobserver.com")){ source = document.select("div.news_right").select("dd").select("li").select("span").select("a").text().replaceAll(" .*", "");
//单独处理上海观察 }else if(url.contains("shobserver.com")){
source = document.select("span.max-words").get(0).text(); //单独处理上海观察
}else if(url.contains("g.pconline.com.cn")){ source = document.select("span.max-words").get(0).text();
//单独处理太平洋电脑网 }else if(url.contains("g.pconline.com.cn")){
source = document.select("div.art-info").text().replaceAll("手机|\\d{4}.*| ", ""); //单独处理太平洋电脑网
}else if(url.contains("news.xtol.cn")){ source = document.select("div.art-info").text().replaceAll("手机|\\d{4}.*| ", "");
//单独处理湘潭在线 }else if(url.contains("news.xtol.cn")){
source = document.select("span.date").text().replaceAll(".*来源:", ""); //单独处理湘潭在线
}else if(url.contains("bjnews.com.cn")){ source = document.select("span.date").text().replaceAll(".*来源:", "");
//单独处理新京报网 }else if(url.contains("bjnews.com.cn")){
source = document.select("span.author").text().replaceAll(" 记者.*", ""); //单独处理新京报网
}else if(url.contains("telworld.com.cn")){ source = document.select("span.author").text().replaceAll(" 记者.*", "");
//单独处理运营商世界 }else if(url.contains("telworld.com.cn")){
source = document.select("div.news_xiang_tit_2_left").select("a").text(); //单独处理运营商世界
}else if(url.contains("thehour.cn")){ source = document.select("div.news_xiang_tit_2_left").select("a").text();
//单独处理浙江24小时 }else if(url.contains("thehour.cn")){
source = document.select("div.newsInfo").select("span").select("a").text(); //单独处理浙江24小时
}else if(url.contains("sh.zol.com.cn")){ source = document.select("div.newsInfo").select("span").select("a").text();
//单独处理中关村在线 }else if(url.contains("sh.zol.com.cn")){
source = document.select("div.article-aboute").select("span.source_baidu").text(); //单独处理中关村在线
}else if(url.contains("ec.com.cn")){ source = document.select("div.article-aboute").select("span.source_baidu").text();
//单独处理中国国际电子商务网 }else if(url.contains("ec.com.cn")){
source = document.select("span.article_resource").text().replaceAll(".*来源:", ""); //单独处理中国国际电子商务网
}else if(url.contains("cqn.com.cn")){ source = document.select("span.article_resource").text().replaceAll(".*来源:", "");
//单独处理中国质量新闻网 }else if(url.contains("cqn.com.cn")){
source = document.select("span.from").text().replaceAll("-.*", ""); //单独处理中国质量新闻网
}else if(url.contains("sc.stock.cnfol.com")){ source = document.select("span.from").text().replaceAll("-.*", "");
//单独处理中金在线 }else if(url.contains("sc.stock.cnfol.com")){
source = document.select("div.artDes").select("span").select("a").text(); //单独处理中金在线
}else if(url.contains("zczj.com")){ source = document.select("div.artDes").select("span").select("a").text();
//单独处理众筹之家 }else if(url.contains("zczj.com")){
source = document.select("div.news-info").select("span").text().replaceAll("来源:|作者.*", ""); //单独处理众筹之家
}else if(url.contains("cqcb.com")){ source = document.select("div.news-info").select("span").text().replaceAll("来源:|作者.*", "");
//单独处理重庆晨报 }else if(url.contains("cqcb.com")){
source = document.select("span.label_nr").text(); //单独处理重庆晨报
}else if(url.contains("stock.10jqka.com.cn")){ source = document.select("span.label_nr").text();
//单独处理重庆晨报 }else if(url.contains("stock.10jqka.com.cn")){
source = document.select("span.label_nr").text(); //单独处理重庆晨报
}else if(url.contains("jiemian.com") ){ source = document.select("span.label_nr").text();
//单独处理界面新闻 }else if(url.contains("jiemian.com") ){
// source = document.select("div.article-info").select("span").text().replaceAll(".*来源:| 字体[\\w\\W]*", ""); //单独处理界面新闻
return "界面新闻"; // source = document.select("div.article-info").select("span").text().replaceAll(".*来源:| 字体[\\w\\W]*", "");
}else if(url.contains("finance.youth.cn")){ return "界面新闻";
//单独处理中国青年网 }else if(url.contains("finance.youth.cn")){
source = document.select("span#source_baidu").text().replaceAll("来源:|作者.*", ""); //单独处理中国青年网
}else if(url.contains("china.com")) { source = document.select("span#source_baidu").text().replaceAll("来源:|作者.*", "");
//中国金融商报 }else if(url.contains("china.com")) {
source = document.select("#chan_newsInfo > a").text(); //中国金融商报
}else if(url.contains("xw.qq.com")) { source = document.select("#chan_newsInfo > a").text();
//腾讯网客户端 }else if(url.contains("xw.qq.com")) {
source = document.select("div.tpl_header_author").text(); //腾讯网客户端
}else if(url.contains("china.prcfe.com")) { source = document.select("div.tpl_header_author").text();
source = html.split("\"")[1]; }else if(url.contains("china.prcfe.com")) {
} source = html.split("\"")[1];
}
if(Objects.nonNull(source) && source.length() != 0) {
return source; if(Objects.nonNull(source) && source.length() != 0) {
} return source;
else{ }
//其他网站处理 else{
source = mathchOtherSource(html, htmlBody, sourceList); //其他网站处理
if(source!=null){ source = mathchOtherSource(html, htmlBody, sourceList);
//验证来源 if(source!=null){
for (String sourceMatch : sourceList) { //验证来源
if (source.contains(sourceMatch)) { for (String sourceMatch : sourceList) {
return sourceMatch; if (source.contains(sourceMatch)) {
} return sourceMatch;
} }
} }
} }
} catch (Exception e) { }
e.toString(); } catch (Exception e) {
} e.toString();
return null; }
} return null;
}
/**
* @Title: matchMediaSelfSource /**
* @author hero * @Title: matchMediaSelfSource
* @Description: 验证及匹配自媒体号 * @author hero
* @param @param url * @Description: 验证及匹配自媒体号
* @param @param html * @param @param url
* @param @return 设定文件 * @param @param html
* @return String 返回类型 * @param @return 设定文件
*/ * @return String 返回类型
public static String matchMediaSelfSource(String url,String html) { */
String source = null; public static String matchMediaSelfSource(String url,String html) {
Document document = Jsoup.parse(html); String source = null;
try { Document document = Jsoup.parse(html);
/***特定网站单独处理**/ try {
if(url.contains("toutiao.com")){ /***特定网站单独处理**/
//今日头条帐号匹配 if(url.contains("toutiao.com")){
if(html.contains("name: '") && html.contains("mediaInfo")){ //今日头条帐号匹配
source = html.split("mediaInfo:")[1].split("name: '")[1].split("',")[0].trim(); if(html.contains("name: '") && html.contains("mediaInfo")){
}else if(html.contains("name: '") && html.contains("ugcInfo")){ source = html.split("mediaInfo:")[1].split("name: '")[1].split("',")[0].trim();
source = html.split("ugcInfo:")[1].split("name: '")[1].split("',")[0].trim(); }else if(html.contains("name: '") && html.contains("ugcInfo")){
}else if(html.contains("screen_name:")){ source = html.split("ugcInfo:")[1].split("name: '")[1].split("',")[0].trim();
source = html.split("screen_name:'")[1].split("',")[0].trim(); }else if(html.contains("screen_name:")){
} source = html.split("screen_name:'")[1].split("',")[0].trim();
if(source!=null && source.length()>1){ }
source = "今日头条-" + source; if(source!=null && source.length()>1){
} source = "今日头条-" + source;
}else if(url.contains("sohu.com")){ }
//搜狐自媒体号 }else if(url.contains("sohu.com")){
if(html.contains("<meta name=\"mediaid\"")){ //搜狐自媒体号
source = html.split("<meta name=\"mediaid\" content=\"")[1].split("\"")[0].trim(); if(html.contains("<meta name=\"mediaid\"")){
if(source!=null && source.length()>1){ source = html.split("<meta name=\"mediaid\" content=\"")[1].split("\"")[0].trim();
source = "搜狐-" + source; if(source!=null && source.length()>1){
} source = "搜狐-" + source;
} }
}else if(url.contains("tznew.58.com")){ }
//58 }else if(url.contains("tznew.58.com")){
source = JSONObject.parseObject(html).getJSONObject("result").getString("author"); source = JSONObject.parseObject(html).getJSONObject("result").getString("author");
if(source!=null && source.length()>1){ if(source!=null && source.length()>1){
source = "58-" + source; source = "58-" + source;
} }
}else if(url.contains("c.m.163.com")){ }else if(url.contains("c.m.163.com")){
//58 source = document.select("section.g-article.js-article > div.js-article-inner > div > b").text();
source = document.select("section.g-article.js-article > div.js-article-inner > div > b").text(); if(StringUtils.isBlank(source)){
if(source!=null && source.length()>1){ source = document.select("div.info > h3").text();
source = "网易新闻-" + source; }
} if(source!=null && source.length()>1){
}else if(url.contains("a.mini.eastday.com")){ source = "网易新闻-" + source;
//处理东方头条网-自媒体号匹配 }
source = document.select("[class=\"share_cnt_p clearfix\"]").select("div.fl").select("i").get(1).text().trim(); }else if(url.contains("a.mini.eastday.com")){
if(source!=null && source.length()>1){ //处理东方头条网-自媒体号匹配
source = "东方头条-" + source; source = document.select("[class=\"share_cnt_p clearfix\"]").select("div.fl").select("i").get(1).text().trim();
if(source!=null && source.length()>1){
} source = "东方头条-" + source;
}else if(url.contains("fashion.eastday.com")){
//处理东方头条网-自媒体号匹配 }
source = document.select("div.J-title_detail.title_detail > div > div.fl > i:nth-child(2)").text().trim(); }else if(url.contains("fashion.eastday.com")){
if(Objects.isNull(source) || source.length() < 1) { //处理东方头条网-自媒体号匹配
source = document.select("div.J-title_detail.title_detail > div > div.fl > a").text().trim(); source = document.select("div.J-title_detail.title_detail > div > div.fl > i:nth-child(2)").text().trim();
} if(Objects.isNull(source) || source.length() < 1) {
if(source!=null && source.length()>1){ source = document.select("div.J-title_detail.title_detail > div > div.fl > a").text().trim();
source = "东方看点-" + source; }
if(source!=null && source.length()>1){
} source = "东方看点-" + source;
}else if(url.contains("sh.qihoo.com") || url.contains("360kuai.com")){
//今日报点解析 }
source = document.select("span.source").text().trim(); }else if(url.contains("sh.qihoo.com") || url.contains("360kuai.com")){
if(source.length() < 1) { //今日报点解析
source = document.select("p.article-info").select("a").text().trim(); source = document.select("span.source").text().trim();
} if(source.length() < 1) {
if(source.length() < 1 && html.contains("window.__INITIAL_DATA__ =")) { source = document.select("p.article-info").select("a").text().trim();
Matcher ma = Pattern.compile("window.__INITIAL_DATA__ =[\\s\\S]+?}};").matcher(html); }
if(ma.find()) { if(source.length() < 1 && html.contains("window.__INITIAL_DATA__ =")) {
String result = ma.group().replaceAll("window.__INITIAL_DATA__ =|\\</script\\>|", "").trim(); Matcher ma = Pattern.compile("window.__INITIAL_DATA__ =[\\s\\S]+?}};").matcher(html);
if(result.contains("window.autohomePVDDWhiteList")) { if(ma.find()) {
result = result.split("window.autohomePVDDWhiteList")[0]; String result = ma.group().replaceAll("window.__INITIAL_DATA__ =|\\</script\\>|", "").trim();
} if(result.contains("window.autohomePVDDWhiteList")) {
JSONObject json = JSONObject.parseObject(result.trim().substring(0,result.lastIndexOf(";"))); result = result.split("window.autohomePVDDWhiteList")[0];
source = json.getJSONObject("detail").getString("sec_src"); }
if(Objects.isNull(source) || source.length() < 1) { JSONObject json = JSONObject.parseObject(result.trim().substring(0,result.lastIndexOf(";")));
source = json.getJSONObject("detail").getString("src"); source = json.getJSONObject("detail").getString("sec_src");
} if(Objects.isNull(source) || source.length() < 1) {
} source = json.getJSONObject("detail").getString("src");
} }
if(Objects.nonNull(source) && source.length()>1){ }
source = "快资讯-" + source; }
} if(Objects.nonNull(source) && source.length()>1){
}else if(url.contains("cj.sina.com.cn") || url.contains("finance.sina.cn") || source = "快资讯-" + source;
url.contains("tech.sina.cn") || url.contains("news.sina.cn")){ }
source = document.select("h2.weibo_user").text(); }else if(url.contains("cj.sina.com.cn") || url.contains("finance.sina.cn") ||
if((Objects.isNull(source) || source.length() < 1) && html.contains("<meta name=\"mediaid\"")){ url.contains("tech.sina.cn") || url.contains("news.sina.cn")){
//新浪科技头条号 source = document.select("h2.weibo_user").text();
source = html.split("<meta name=\"mediaid\" content=\"")[1].split("\"")[0].trim(); if((Objects.isNull(source) || source.length() < 1) && html.contains("<meta name=\"mediaid\"")){
} //新浪科技头条号
if(Objects.isNull(source) || source.length() < 1){ source = html.split("<meta name=\"mediaid\" content=\"")[1].split("\"")[0].trim();
//新浪财经头条号 }
source = document.select("body > main > section.j_main_art > section > article > time > cite").text(); if(Objects.isNull(source) || source.length() < 1){
} //新浪财经头条号
if(source!=null && source.length()>1){ source = document.select("body > main > section.j_main_art > section > article > time > cite").text();
source = "新浪-" + source; }
} if(source!=null && source.length()>1){
}else if(url.contains("auto.sina.cn") || url.contains("auto.sina.com.cn")){ source = "新浪-" + source;
source = document.select("div.art_title > div > span:nth-child(1)").text(); }
if(Objects.isNull(source) || source.length() < 1){ }else if(url.contains("auto.sina.cn") || url.contains("auto.sina.com.cn")){
source = document.select("#top_bar > div > div.date-source > a").text(); source = document.select("div.art_title > div > span:nth-child(1)").text();
} if(Objects.isNull(source) || source.length() < 1){
if(source!=null && source.length()>1){ source = document.select("#top_bar > div > div.date-source > a").text();
source = "新浪-" + source; }
} if(source!=null && source.length()>1){
}else if(url.contains("baijiahao.baidu.com")){ source = "新浪-" + source;
//百度百家 }
source = document.select("p.author-name").first().text().trim(); }else if(url.contains("k.sina.cn")){
if(source!=null && source.length()>1){ source = document.select("h2.weibo_user").text();
source = "百度百家-" + source; if(source!=null && source.length()>1){
} source = "新浪-" + source;
}else if(url.contains("app.myzaker.com")){ }
// zaker客户端 }else if(url.contains("blog.sina.com.cn")){
source = document.select("#tpl_author").first().text().trim(); source = document.select("strong#ownernick").text();
if(source!=null && source.length()>1){ if(source!=null && source.length()>1){
source = "zaker客户端-" + source; source = "新浪博客-" + source;
} }
}else if(url.contains("yidianzixun.com")){ }else if(url.contains("baijiahao.baidu.com") || url.contains("mbd.baidu.com")){
//一点资讯 //百度百家
if(html.contains("related_wemedia")){ source = document.select("span.userNameSpan").text();
source = html.split("media_name\":\"")[1].split("\",\"")[0].trim(); if(StringUtils.isBlank(source)){
if(source!=null && source.length()>1){ source = document.select("p.author-name:nth-child(1)").text();
source = "一点资讯-" + source; }
} if(StringUtils.isNotBlank(source)){
}else{ source = "百度百家-" + source;
source = html.split("source\":\"")[1].split("\",\"")[0]; }
} }else if(url.contains("app.myzaker.com")){
}else if(url.contains("news.bitauto.com")){ // zaker客户端
//易车网 source = document.select("#tpl_author").first().text().trim();
source = document.select("[class=\"gz-box clearfix\"]").select("div.txt-box") if(source!=null && source.length()>1){
.select("p.p-n").select("a").text(); source = "zaker客户端-" + source;
if(source!=null && source.length()>1){ }
source = "易车网-" + source; }else if(url.contains("yidianzixun.com")){
} //一点资讯
}else if(url.contains("ittime.com.cn")){ if(html.contains("related_wemedia")){
//it时代网 source = html.split("media_name\":\"")[1].split("\",\"")[0].trim();
source = document.select("div.top.author > dl > dd > p > a").text(); if(source!=null && source.length()>1){
if(Objects.nonNull(source) && !source.isEmpty()){ source = "一点资讯-" + source;
source = "it时代网-" + source; }
} }else{
}else if(url.contains("wap.peopleapp.com")){ source = html.split("source\":\"")[1].split("\",\"")[0];
// 人民日报客户端 }
JSONObject json = JSONObject.parseObject(html); }else if(url.contains("news.bitauto.com")){
source = json.getJSONObject("data").getString("authors"); //易车网
if(Objects.nonNull(source) && !source.isEmpty()){ source = document.select("[class=\"gz-box clearfix\"]").select("div.txt-box")
source = "人民日报客户端-" + source; .select("p.p-n").select("a").text();
} if(source!=null && source.length()>1){
}else if(url.contains("guancha.cn")){ source = "易车网-" + source;
// 风闻社区 }
source = document.select("div.main-tow > div.box-left > div.article-content > div:nth-child(3) > div.user-main > h4 > a").text(); }else if(url.contains("ittime.com.cn")){
if(Objects.nonNull(source) && !source.isEmpty()){ //it时代网
source = "风闻社区-" + source; source = document.select("div.top.author > dl > dd > p > a").text();
}else { if(Objects.nonNull(source) && !source.isEmpty()){
source = document.select("div.author-intro.fix > p > a").text(); source = "it时代网-" + source;
if(Objects.nonNull(source) && !source.isEmpty()){ }
source = "观察者-" + source; }else if(url.contains("wap.peopleapp.com")){
} // 人民日报客户端
} JSONObject json = JSONObject.parseObject(html);
}else if(url.contains("yesky.com")){ source = json.getJSONObject("data").getString("authors");
// 天极自媒体 if(Objects.nonNull(source) && !source.isEmpty()){
source = document.select("div.elf > dl > dd.bt > a").text(); source = "人民日报客户端-" + source;
if(Objects.nonNull(source) && !source.isEmpty()){ }
source = "天极自媒体-" + source; }else if(url.contains("guancha.cn")){
} // 风闻社区
}else if(url.contains("nkj.cn")){ source = document.select("div.main-tow > div.box-left > div.article-content > div:nth-child(3) > div.user-main > h4 > a").text();
// 牛科技 if(Objects.nonNull(source) && !source.isEmpty()){
source = document.select("div.widget.suxingme_post_author > div > div.author_name > a").text(); source = "风闻社区-" + source;
if(Objects.nonNull(source) && !source.isEmpty()){ }else {
source = "牛科技-" + source; source = document.select("div.author-intro.fix > p > a").text();
} if(Objects.nonNull(source) && !source.isEmpty()){
}else if(url.contains("chejiahao.autohome.com.cn")){ source = "观察者-" + source;
//汽车之家 }
source = document.select("div.authorMes").select("[class=\"name text-overflow\"]") }
.select("a").text(); }else if(url.contains("yesky.com")){
if(source!=null && source.length()>1){ // 天极自媒体
source = "汽车之家-" + source; source = document.select("div.elf > dl > dd.bt > a").text();
} if(Objects.nonNull(source) && !source.isEmpty()){
}else if(url.contains("item.btime.com")){ source = "天极自媒体-" + source;
//北京时间 }
source = document.select("a.author").text(); }else if(url.contains("nkj.cn")){
if(Objects.isNull(source) || source.length() < 1){ // 牛科技
source = document.select("div.content-info > span.col.cite").text(); source = document.select("div.widget.suxingme_post_author > div > div.author_name > a").text();
} if(Objects.nonNull(source) && !source.isEmpty()){
if(source!=null && source.length()>1){ source = "牛科技-" + source;
source = "北京时间-" + source; }
} }else if(url.contains("chejiahao.autohome.com.cn")){
}else if(url.contains("mp.qq.com")){ //汽车之家
source = document.select("div#account_top > div.puin_text > div.pname").text(); source = document.select("div.authorMes").select("[class=\"name text-overflow\"]")
if(source!=null && !source.equals("")){ .select("a").text();
source = "QQ看点-" + source; if(source!=null && source.length()>1){
} source = "汽车之家-" + source;
}else if(url.contains("v.qq.com")) { }
source = document.select("span.user_name").text(); }else if(url.contains("item.btime.com")){
if(source!=null && !source.equals("")){ //北京时间
source = "腾讯视频-" + source; source = document.select("a.author").text();
} if(Objects.isNull(source) || source.length() < 1){
}else if(url.contains("qq.com/")){ source = document.select("div.content-info > span.col.cite").text();
//腾讯网-企鹅号 }
source = html.split("media\": \"")[1].split("\",")[0]; if(source!=null && source.length()>1){
if(source!=null && source.length()>1){ source = "北京时间-" + source;
source = "企鹅号-" + source; }
} }else if(url.contains("mp.qq.com")){
}else if(url.contains("feng.ifeng.com")){ source = document.select("div#account_top > div.puin_text > div.pname").text();
//凤凰网-大风号 if(StringUtils.isNotBlank(source)){
source = html.split("source\":\"")[1].split("\",\"")[0]; source = "QQ看点-" + source;
if(source!=null && source.length()>1){ }
source = "大风号-" + source; }else if(url.contains("v.qq.com")) {
} source = document.select("span.user_name").text();
}else if(url.contains("dy.163.com")){ if(StringUtils.isNotBlank(source)){
//网易订阅-网易号 source = "腾讯视频-" + source;
source = document.select("div.normal > div.colum_info > h4").text(); }
if(source!=null && source.length()>1){ }else if(url.contains("qq.com/")){
source = "网易-" + source; //腾讯网-企鹅号
} source = html.split("media\": \"")[1].split("\",")[0];
}else if(url.contains("qctt.cn")){ if(source!=null && source.length()>1){
//汽车头条 source = "企鹅号-" + source;
source = document.select("div.part2>a").text(); }
if(source!=null && source.length()>1){ }else if(url.contains("feng.ifeng.com")){
source = "汽车头条-" + source; //凤凰网-大风号
} source = html.split("source\":\"")[1].split("\",\"")[0];
}else if(url.contains("maiche.com")){ if(source!=null && source.length()>1){
//买车网 source = "大风号-" + source;
source = document.select("div.info-left > div:nth-child(2) > span > a").text(); }
if(source!=null && source.length()>1){ }else if(url.contains("dy.163.com")){
source = "买车网-" + source; //网易订阅-网易号
} source = document.select("div.normal > div.colum_info > h4").text();
}else if(url.contains("3g.163.com")){ if(source!=null && source.length()>1){
source = document.select("div.info").select("[class=\"source js-source\"]") source = "网易-" + source;
.text(); }
if(source!=null && !source.equals("")){ }else if(url.contains("qctt.cn")){
source = "网易号-" + source; //汽车头条
} source = document.select("div.part2>a").text();
}else if(url.contains("myzaker.com")){ if(source!=null && source.length()>1){
source = document.select("div.article_header > div > a > span.auther") source = "汽车头条-" + source;
.text(); }
if(source!=null && !source.equals("")){ }else if(url.contains("maiche.com")){
source = "zaker-" + source; //买车网
} source = document.select("div.info-left > div:nth-child(2) > span > a").text();
}else if(url.contains("edushi.com")){ if(source!=null && source.length()>1){
source = document.select("div.eds-name-box > div.eds-name > a > div.name") source = "买车网-" + source;
.text(); }
if(source!=null && !source.equals("")){ }else if(url.contains("3g.163.com")){
source = "今日潮闻-" + source; source = document.select("div.info").select("[class=\"source js-source\"]")
} .text();
}else if(url.contains("ijiandao.com")){ if(StringUtils.isNotBlank(source)){
source = document.select("div.article-author > span.author-name > a") source = "网易号-" + source;
.text(); }
if(source!=null && !source.equals("")){ }else if(url.contains("myzaker.com")){
source = "爱尖刀-" + source; source = document.select("div.article_header > div > a > span.auther")
} .text();
}else if(url.contains("chuangyejia.com")){ if(StringUtils.isNotBlank(source)){
source = document.select("div.article-title > ul.article-author > li:nth-child(1)") source = "zaker-" + source;
.text(); }
if(source!=null && !source.equals("")){ }else if(url.contains("edushi.com")){
source = "创业家-" + source; source = document.select("div.eds-name-box > div.eds-name > a > div.name")
} .text();
}else if(url.contains("kejixun.com")){ if(StringUtils.isNotBlank(source)){
source = document.select("div.r-box.r-box-none.clearfix > div > div.big-man > h3 > a") source = "今日潮闻-" + source;
.text(); }
if(source!=null && !source.equals("")){ }else if(url.contains("ijiandao.com")){
source = "科技讯-" + source; source = document.select("div.article-author > span.author-name > a")
} .text();
}else if(url.contains("tmtpost.com")){ if(StringUtils.isNotBlank(source)){
source = document.select("article > div.post-info > a") source = "爱尖刀-" + source;
.text(); }
if(source!=null && !source.equals("")){ }else if(url.contains("chuangyejia.com")){
source = "钛媒体-" + source; source = document.select("div.article-title > ul.article-author > li:nth-child(1)")
} .text();
}else if(url.contains("cyzone.cn")){ if(StringUtils.isNotBlank(source)){
source = document.select("div.article-author-info > div.author-main > div > div.a-word > div.a-name > a") source = "创业家-" + source;
.text(); }
if(source!=null && !source.equals("")){ }else if(url.contains("kejixun.com")){
source = "创业邦-" + source; source = document.select("div.r-box.r-box-none.clearfix > div > div.big-man > h3 > a")
} .text();
}else if(url.contains("36kr.com")){ if(StringUtils.isNotBlank(source)){
source = document.select("div.info-header-text > a.author-name").text(); source = "科技讯-" + source;
if(source!=null && !source.equals("")){ }
return "36氪-" + source; }else if(url.contains("tmtpost.com")){
} source = document.select("article > div.post-info > a")
source = document.select("h4.author-name").text(); .text();
if(source!=null && !source.equals("")){ if(StringUtils.isNotBlank(source)){
return "36氪-" + source; source = "钛媒体-" + source;
} }
source = document.select("span.author-nickname").text(); }else if(url.contains("cyzone.cn")){
if(source!=null && !source.equals("")){ source = document.select("div.article-author-info > div.author-main > div > div.a-word > div.a-name > a")
return "36氪-" + source; .text();
} if(StringUtils.isNotBlank(source)){
}else if(url.contains("lianxianjia.com")){ source = "创业邦-" + source;
source = document.select("span.author-name").text(); }
if(source!=null && !source.equals("")){ }else if(url.contains("36kr.com")){
source = "连线家-" + source; source = document.select("div.info-header-text > a.author-name").text();
} if(StringUtils.isNotBlank(source)){
}else if(url.contains("itouchtv.cn")){ return "36氪-" + source;
source = document.select("div.index__article-media-20Tg_ > span:nth-child(1)").text(); }
if(source!=null && !source.equals("")){ source = document.select("h4.author-name").text();
source = "触电新闻-" + source; if(StringUtils.isNotBlank(source)){
} return "36氪-" + source;
}else if(url.contains("whb.cn")){ }
source = document.select("div.yidian-info > span:nth-child(1)").text(); source = document.select("span.author-nickname").text();
if(source!=null && !source.equals("")){ if(StringUtils.isNotBlank(source)){
source = "文汇APP-" + source; return "36氪-" + source;
} }
}else if(url.contains("blogchina.com")){ }else if(url.contains("lianxianjia.com")){
source = document.select("div.meta-top > label.lm_name > span > a").text(); source = document.select("span.author-name").text();
if(source!=null && !source.equals("")){ if(StringUtils.isNotBlank(source)){
source = "博客中国-" + source; source = "连线家-" + source;
} }
}else if(url.contains(".iqiyi.com")) { }else if(url.contains("itouchtv.cn")){
source = JSONObject.parseObject(html.split("page-info='")[1].split("'")[0]).getJSONObject("user").getString("name"); source = document.select("div.index__article-media-20Tg_ > span:nth-child(1)").text();
if(source!=null && !source.equals("")){ if(StringUtils.isNotBlank(source)){
source = "爱奇艺-" + source; source = "触电新闻-" + source;
} }
}else if(url.contains("v.youku.com")) { }else if(url.contains("whb.cn")){
source = document.select("a.sub-name").text(); source = document.select("div.yidian-info > span:nth-child(1)").text();
if(source!=null && !source.equals("")){ if(StringUtils.isNotBlank(source)){
source = "优酷-" + source; source = "文汇APP-" + source;
} }
}else if(url.contains("jiemian.com")) { }else if(url.contains("blogchina.com")){
source = document.select("div.article-info > p > span.author > a").text(); source = document.select("div.meta-top > label.lm_name > span > a").text();
if(source!=null && !source.equals("")){ if(StringUtils.isNotBlank(source)){
source = "界面新闻-" + source; source = "博客中国-" + source;
} }
}else if (url.contains("iyiou.com")) { }else if(url.contains(".iqiyi.com")) {
source = document.select("div#post_author > a").text(); source = JSONObject.parseObject(html.split("page-info='")[1].split("'")[0]).getJSONObject("user").getString("name");
if(source!=null && !source.equals("")) { if(StringUtils.isNotBlank(source)){
source = "亿欧网-" + source; source = "爱奇艺-" + source;
} }
}else if (url.contains("lanjingtmt.com")) { }else if(url.contains("v.youku.com")) {
source = document.select("div.scd-title > a:nth-child(2)").text(); source = document.select("a.sub-name").text();
if(source!=null && !source.equals("")) { if(StringUtils.isNotBlank(source)){
source = "蓝鲸-" + source; source = "优酷-" + source;
} }
}else if (url.contains("lanjinger.com")) { }else if(url.contains("jiemian.com")) {
if(document.select("div.content_left > div:nth-child(2) > span").text().contains("专栏")) { source = document.select("div.article-info > p > span.author > a").text();
source = document.select("a.author_name").text().replaceAll(".*编辑| ", ""); if(StringUtils.isNotBlank(source)){
if(source!=null && !source.equals("")) { source = "界面新闻-" + source;
source = "蓝鲸财经-" + source; }
} }else if (url.contains("iyiou.com")) {
} source = document.select("div#post_author > a").text();
}else if (url.contains("huxiu.com")) { if(StringUtils.isNotBlank(source)) {
source = document.select("div.article__author-info-box > a.article-author-info > span.author-info__username").text(); source = "亿欧网-" + source;
if(source!=null && !source.equals("")) { }
source = "虎嗅-" + source; }else if (url.contains("lanjingtmt.com")) {
} source = document.select("div.scd-title > a:nth-child(2)").text();
}else if (url.contains("chuansongme.com")) { if(StringUtils.isNotBlank(source)) {
source = document.select("div.rich_media_meta_list > span.rich_media_meta.rich_media_meta_text").text(); source = "蓝鲸-" + source;
if(source!=null && !source.equals("")) { }
source = "传送门-" + source; }else if (url.contains("lanjinger.com")) {
} if(document.select("div.content_left > div:nth-child(2) > span").text().contains("专栏")) {
}else if (url.contains("a.mp.uc.cn")) { source = document.select("a.author_name").text().replaceAll(".*编辑| ", "");
JSONObject json = JSONObject.parseObject(html); if(StringUtils.isNotBlank(source)) {
source = json.getJSONObject("data").getJSONObject("_author").getString("author_name"); source = "蓝鲸财经-" + source;
if(source!=null && !source.equals("")) { }
source = "uc-" + source; }
} }else if (url.contains("huxiu.com")) {
}else if (url.contains("kd.youth.cn")) { source = document.select("div.article__author-info-box > a.article-author-info > span.author-info__username").text();
source = document.select("body > div > div > div.rich_media_meta_list > a").text(); if(StringUtils.isNotBlank(source)) {
if(source!=null && !source.equals("")) { source = "虎嗅-" + source;
source = "中青在线-" + source; }
} }else if (url.contains("chuansongme.com")) {
} source = document.select("div.rich_media_meta_list > span.rich_media_meta.rich_media_meta_text").text();
return source; if(StringUtils.isNotBlank(source)) {
} catch (Exception e) { source = "传送门-" + source;
e.printStackTrace(); }
return null; }else if (url.contains("a.mp.uc.cn")) {
} JSONObject json = JSONObject.parseObject(html);
} source = json.getJSONObject("data").getJSONObject("_author").getString("author_name");
if(StringUtils.isNotBlank(source)) {
source = "uc-" + source;
}
/** }else if (url.contains("m.uczzd.cn")) {
* if(html.contains("var xissJsonData =")){
* @Title: mathchOtherSource html = html.split("var xissJsonData = ")[1].split("};")[0]+"}";
* @author hero source = JSONObject.parseObject(html).getString("source_name");
* @Description: 匹配通用结果数据 }
* @param @param html if(StringUtils.isNotBlank(source)) {
* @param @param htmlBody source = "uc-" + source;
* @param @param length }
* @param @return 设定文件 }else if (url.contains("kd.youth.cn")) {
* @return String 返回类型 source = document.select("body > div > div > div.rich_media_meta_list > a").text();
*/ if(StringUtils.isNotBlank(source)) {
private static String mathchOtherSource(String html,String htmlBody,List<String> sourceList){ source = "中青在线-" + source;
/** 正文抽取,目的是避免正文中存在相应匹配规则 **/ }
String source = null; }else if (url.contains("zhuanlan.zhihu.com")) {
try { source = document.select("a.UserLink-link").text();
News news = ContentExtractor.getNewsByHtml(html); if(StringUtils.isNotBlank(source)) {
String content = TreateData.filterSpecialCharacter(news.getContent().toUpperCase()); source = "知乎专栏-" + source;
String title = TreateData.filterSpecialCharacter(news.getTitle().toUpperCase()); }
/**剔除正文**/ }else if (url.contains("wulizixun.com")) {
String text = htmlBody.replace(content, "@@@@@@@@@@"); source = document.select("span.newdetailOrigin").text();
/**分割正文**/ if(StringUtils.isNotBlank(source)) {
String[] matchTextArr = text.split("@@@@@@@@@@"); source = "唔哩头条-" + source;
if(TreateData.regex(fromRegex, matchTextArr[0]) != null || TreateData.regex(fromRegex, matchTextArr[1])!=null){ }
if(TreateData.regex(fromRegex, matchTextArr[0])!=null){ }else if(url.contains("t.10jqka.com.cn")){
source = TreateData.regex(fromRegex, matchTextArr[0]); source = document.select("a[class=\"link777 post-author db fl\"]").text();
for (String sourceMatch : sourceList) { if(StringUtils.isNotBlank(source)) {
if (source.contains(sourceMatch)) { source = "同花顺-" + source;
return sourceMatch; }
} }else if(url.contains("shangyexinzhi.com")){
} source = document.select("span.hover-color_change").text();
}else if(TreateData.regex(fromRegex, matchTextArr[1])!=null){ if(StringUtils.isNotBlank(source)) {
source = TreateData.regex(fromRegex, matchTextArr[1]); source = "商业新知-" + source;
for (String sourceMatch : sourceList) { }
if (source.contains(sourceMatch)) { }else if(url.contains("thepaper.cn")){
return sourceMatch; source = document.select("a> div.name").text();
} if(StringUtils.isNotBlank(source)){
} source = "澎湃新闻-" + source;
} }
}else{ }else if(url.contains("tuicool.com")){
if(matchTextArr[0].contains(title)){ source = document.select("span.from> a").text();
/***判断是否包含标题,如果包含标题则以标题截取数据 if(StringUtils.isNotBlank(source)){
* 验证数据为 主要匹配 YYYY-MM-dd xx日报 source = "推酷-" + source;
* 或 xx日报 YYYY-MM-dd }
* ***/ }
String[] titlesArr = matchTextArr[0].split(title); return source;
for(int j = 0;j<titlesArr.length; j++){ } catch (Exception e) {
String timeSource = TreateData.regex(timeRegex, titlesArr[j]); e.printStackTrace();
if(timeSource!=null){ return null;
source = getSourceByTime(timeSource, titlesArr[j], sourceList); }
if(source != null){ }
return source;
}
}
} /**
} *
* @Title: mathchOtherSource
if(matchTextArr[1].contains(title)){ * @author hero
/***判断是否包含标题,如果包含标题则以标题截取数据 * @Description: 匹配通用结果数据
* 验证数据为 主要匹配 YYYY-MM-dd xx日报 * @param @param html
* 或 xx日报 YYYY-MM-dd * @param @param htmlBody
* ***/ * @param @param length
String[] titlesArr = matchTextArr[1].split(title); * @param @return 设定文件
for(int j = 0;j<titlesArr.length; j++){ * @return String 返回类型
String timeSource = TreateData.regex(timeRegex, titlesArr[j]); */
if(timeSource!=null){ private static String mathchOtherSource(String html,String htmlBody,List<String> sourceList){
source = getSourceByTime(timeSource, titlesArr[j], sourceList); /** 正文抽取,目的是避免正文中存在相应匹配规则 **/
if(source != null){ String source = null;
return source; try {
} News news = ContentExtractor.getNewsByHtml(html);
} String content = TreateData.filterSpecialCharacter(news.getContent().toUpperCase());
} String title = TreateData.filterSpecialCharacter(news.getTitle().toUpperCase());
} /**剔除正文**/
} String text = htmlBody.replace(content, "@@@@@@@@@@");
/**分割正文**/
/***正文外无相关数据,匹配正文**/ String[] matchTextArr = text.split("@@@@@@@@@@");
if(source == null ){ if(TreateData.regex(fromRegex, matchTextArr[0]) != null || TreateData.regex(fromRegex, matchTextArr[1])!=null){
/*** if(TreateData.regex(fromRegex, matchTextArr[0])!=null){
* 匹配命中包含来源等规则的数据 source = TreateData.regex(fromRegex, matchTextArr[0]);
*/ for (String sourceMatch : sourceList) {
source = TreateData.regex(fromRegex, content); if (source.contains(sourceMatch)) {
if(source!=null){ return sourceMatch;
for (String sourceMatch : sourceList) { }
if (source.contains(sourceMatch)) { }
return sourceMatch; }else if(TreateData.regex(fromRegex, matchTextArr[1])!=null){
} source = TreateData.regex(fromRegex, matchTextArr[1]);
} for (String sourceMatch : sourceList) {
}else { if (source.contains(sourceMatch)) {
/***判断是否包含标题,如果包含标题则以标题截取数据 return sourceMatch;
* 验证数据为 主要匹配 YYYY-MM-dd xx日报 }
* 或 xx日报 YYYY-MM-dd }
* ***/ }
if(content.contains(title)){ /**正文中包含标题**/ }else{
String[] titlesArr = content.split(title); if(matchTextArr[0].contains(title)){
for(int j = 0;j<titlesArr.length; j++){ /***判断是否包含标题,如果包含标题则以标题截取数据
String timeSource = TreateData.regex(timeRegex, titlesArr[j]); * 验证数据为 主要匹配 YYYY-MM-dd xx日报
if(timeSource!=null){ * 或 xx日报 YYYY-MM-dd
source = getSourceByTime(timeSource, titlesArr[j], sourceList); * ***/
if(source != null){ String[] titlesArr = matchTextArr[0].split(title);
return source; for(int j = 0;j<titlesArr.length; j++){
} String timeSource = TreateData.regex(timeRegex, titlesArr[j]);
} if(timeSource!=null){
} source = getSourceByTime(timeSource, titlesArr[j], sourceList);
}else{ /**正文中不包含标题**/ if(source != null){
String timeSource = TreateData.regex(timeRegex, content); return source;
if(timeSource!=null){ }
source = getSourceByTime(timeSource, content, sourceList); }
if(source != null){ }
return source; }
}
} if(matchTextArr[1].contains(title)){
} /***判断是否包含标题,如果包含标题则以标题截取数据
} * 验证数据为 主要匹配 YYYY-MM-dd xx日报
} * 或 xx日报 YYYY-MM-dd
} catch (Exception e) { * ***/
System.out.println("正文抽取失败处理........"); String[] titlesArr = matchTextArr[1].split(title);
e.toString(); for(int j = 0;j<titlesArr.length; j++){
/*** String timeSource = TreateData.regex(timeRegex, titlesArr[j]);
* 匹配正文失败 if(timeSource!=null){
* 匹配命中包含来源等规则的数据 source = getSourceByTime(timeSource, titlesArr[j], sourceList);
*/ if(source != null){
source = TreateData.regex(fromRegex, htmlBody); return source;
if (source != null) { }
for (String sourceMatch : sourceList) { }
if (source.contains(sourceMatch)) { }
return sourceMatch; }
} }
}
} else { /***正文外无相关数据,匹配正文**/
/***判断是否包含标题,如果包含标题则以标题截取数据 if(source == null ){
* 验证数据为 主要匹配 YYYY-MM-dd xx日报 /***
* 或 xx日报 YYYY-MM-dd * 匹配命中包含来源等规则的数据
* ***/ */
String timeSource = TreateData.regex(timeRegex, htmlBody); source = TreateData.regex(fromRegex, content);
if(timeSource!=null){ if(source!=null){
source = getSourceByTime(timeSource, htmlBody, sourceList); for (String sourceMatch : sourceList) {
if(source != null){ if (source.contains(sourceMatch)) {
return source; return sourceMatch;
} }
} }
} }else {
} /***判断是否包含标题,如果包含标题则以标题截取数据
return null; * 验证数据为 主要匹配 YYYY-MM-dd xx日报
} * 或 xx日报 YYYY-MM-dd
* ***/
if(content.contains(title)){ /**正文中包含标题**/
/** String[] titlesArr = content.split(title);
* @Title: getSourceByTime for(int j = 0;j<titlesArr.length; j++){
* @author hero String timeSource = TreateData.regex(timeRegex, titlesArr[j]);
* @Description: TODO(根据匹配时间截取数据) if(timeSource!=null){
* @param @param htmlBody source = getSourceByTime(timeSource, titlesArr[j], sourceList);
* @param @return 设定文件 if(source != null){
* @return String 返回类型 return source;
*/ }
private static String getSourceByTime(String timeSource, String htmlBody,List<String> sourceList){ }
}
try { }else{ /**正文中不包含标题**/
String timeSource = TreateData.regex(timeRegex, content);
/**以时间做分割,匹配来源信息。 if(timeSource!=null){
* 主要匹配 YYYY-MM-dd xx日报 source = getSourceByTime(timeSource, content, sourceList);
* 或 xx日报 YYYY-MM-dd if(source != null){
***/ return source;
String[] times = htmlBody.split(timeSource); }
for (int j = 0; j < times.length; j++) { }
String timecontent = times[j]; }
if (j == 0) { }
if (timecontent.length() >= 30) { }
timecontent = timecontent.substring(timecontent.length() - 30, timecontent.length()); } catch (Exception e) {
} else { System.out.println("正文抽取失败处理........");
timecontent = timecontent.substring(0, timecontent.length()); e.toString();
} /***
} else { * 匹配正文失败
if (timecontent.length() >= 30) { * 匹配命中包含来源等规则的数据
timecontent = timecontent.substring(0, 30); */
} else { source = TreateData.regex(fromRegex, htmlBody);
timecontent = timecontent.substring(0, timecontent.length()); if (source != null) {
} for (String sourceMatch : sourceList) {
} if (source.contains(sourceMatch)) {
return sourceMatch;
for (String sourceMatch : sourceList) { }
if (timecontent.contains(sourceMatch)) { }
return sourceMatch; } else {
} /***判断是否包含标题,如果包含标题则以标题截取数据
} * 验证数据为 主要匹配 YYYY-MM-dd xx日报
} * 或 xx日报 YYYY-MM-dd
return null; * ***/
} catch (Exception e) { String timeSource = TreateData.regex(timeRegex, htmlBody);
e.toString(); if(timeSource!=null){
return null; source = getSourceByTime(timeSource, htmlBody, sourceList);
} if(source != null){
} return source;
} }
}
}
}
return null;
}
/**
 * Looks for a known source name in the text immediately surrounding a matched
 * time string.
 *
 * <p>{@code htmlBody} is cut at every literal occurrence of {@code timeSource}.
 * For the piece before the first occurrence, only its last 30 characters are
 * inspected; for every later piece, only its first 30 characters are inspected.
 * This targets layouts such as "YYYY-MM-dd xx Daily" or "xx Daily YYYY-MM-dd".
 *
 * @author hero
 * @param timeSource the time text to cut on; treated as a literal, not a regex
 *                   (presumably the text a time pattern matched in
 *                   {@code htmlBody} - confirm against callers)
 * @param htmlBody   the text to search
 * @param sourceList candidate source names, checked in list order
 * @return the first candidate found inside an inspected window, or
 *         {@code null} when nothing matches or any argument is {@code null}
 */
private static String getSourceByTime(String timeSource, String htmlBody,List<String> sourceList){
    if (timeSource == null || htmlBody == null || sourceList == null) {
        return null;
    }
    // Number of characters inspected on each side of the time string.
    final int window = 30;
    // Quote the time text: String.split takes a regex, and an unquoted value
    // containing characters such as '[', '(' or '.' would either match the
    // wrong text or throw PatternSyntaxException.
    String[] times = htmlBody.split(java.util.regex.Pattern.quote(timeSource));
    for (int j = 0; j < times.length; j++) {
        String segment = times[j];
        String timecontent;
        if (segment.length() < window) {
            timecontent = segment;
        } else if (j == 0) {
            // Text before the first time occurrence: keep the tail next to it.
            timecontent = segment.substring(segment.length() - window);
        } else {
            // Text after a time occurrence: keep the head next to it.
            timecontent = segment.substring(0, window);
        }
        for (String sourceMatch : sourceList) {
            // Skip null candidates instead of aborting the whole search.
            if (sourceMatch != null && timecontent.contains(sourceMatch)) {
                return sourceMatch;
            }
        }
    }
    return null;
}
}
package com.zhiwei.source_forward.util;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.proxy.config.SimpleConfig;
import com.zhiwei.source_forward.config.ProxyConfig;
/**
 * Proxy initialization helper.
 *
 * @author xMx
 * @date 2020-01-06 09:29:04
 */
public class ProxyInit {

    /**
     * Initializes the proxy: builds a {@code SimpleConfig} from the registry
     * address, application id and group held in {@code ProxyConfig}, together
     * with a fixed application name, and hands it to {@code ProxyFactory.init}.
     */
    public static void initProxy() {
        final String registryAddress = ProxyConfig.registry;
        final String applicationName = "xumiaoxin";
        final long applicationId = ProxyConfig.proxyid;
        ProxyFactory.init(
                SimpleConfig.builder()
                        .registry(registryAddress)
                        .appName(applicationName)
                        .appId(applicationId)
                        .group(ProxyConfig.group)
                        .build());
    }
}
#registry=zookeeper://192.168.0.203:2181;zookeeper://192.168.0.104:2181;zookeeper://192.168.0.105:2181 #registry=zookeeper://192.168.0.203:2181;zookeeper://192.168.0.104:2181;zookeeper://192.168.0.105:2181
#group=hangzhou #group=hangzhou
##########################测试地址############################## ##########################测试地址##############################
registry=zookeeper://192.168.0.36:2181 registry=zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181
proxyid=10000002
group=local group=local
\ No newline at end of file
//package com.zhiwei.source_forward.sourceforward.test; //package com.zhiwei.source_forward.sourceforward.test;
// //
//import java.util.HashMap; //import java.util.HashMap;
//import java.util.Map; //import java.util.Map;
// //
//import org.junit.Test; //import org.junit.Test;
// //
//import com.zhiwei.source_forward.run.SourceForward; //import com.zhiwei.source_forward.run.SourceForward;
// //
///** ///**
// * @ClassName: SourceForwardTest // * @ClassName: SourceForwardTest
// * @Description: 来源验证 // * @Description: 来源验证
// * @author hero // * @author hero
// * @date 2017年12月6日 上午9:55:13 // * @date 2017年12月6日 上午9:55:13
// */ // */
//public class MediaSelfSourceTest { //public class MediaSelfSourceTest {
// //
// @Test // @Test
// public void sourceForwardTest(){ // public void sourceForwardTest(){
// Map<String,Map<String,Object>> dataMap = new HashMap<String,Map<String,Object>>(); // Map<String,Map<String,Object>> dataMap = new HashMap<String,Map<String,Object>>();
// String url = "https://www.toutiao.com/a6549872248428167687/"; // String url = "https://www.toutiao.com/a6549872248428167687/";
// Map<String,Object> data = new HashMap<String,Object>(); // Map<String,Object> data = new HashMap<String,Object>();
// dataMap.put(url, data); // dataMap.put(url, data);
// //
// SourceForward.getMediaSelfSource(dataMap); // SourceForward.getMediaSelfSource(dataMap);
// //
// } // }
// //
// //
// //
// //
// //
// //
// //
// //
//} //}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment