Commit 0abfbd4a by zhiwei

添加自媒体匹配

parent 4e02a60f
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId> <groupId>com.zhiwei</groupId>
<artifactId>source-forward</artifactId> <artifactId>source-forward</artifactId>
<version>0.2.1-SNAPSHOT</version> <version>0.2.2-SNAPSHOT</version>
<name>source-forward</name> <name>source-forward</name>
<description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description> <description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description>
...@@ -24,12 +24,12 @@ ...@@ -24,12 +24,12 @@
<dependency> <dependency>
<groupId>com.zhiwei.tools</groupId> <groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId> <artifactId>zhiwei-tools</artifactId>
<version>0.1.3-SNAPSHOT</version> <version>0.1.6-SNAPSHOT</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.zhiwei.crawler</groupId> <groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId> <artifactId>crawler-core</artifactId>
<version>0.5.5.6-SNAPSHOT</version> <version>0.6.1.0-SNAPSHOT</version>
</dependency> </dependency>
</dependencies> </dependencies>
......
...@@ -13,7 +13,9 @@ public class ProxyConfig { ...@@ -13,7 +13,9 @@ public class ProxyConfig {
conf.load(is); conf.load(is);
is.close(); is.close();
registry = conf.getProperty("registry"); registry = conf.getProperty("registry");
proxyid = Long.valueOf(conf.getProperty("proxyid"));
group = conf.getProperty("group"); group = conf.getProperty("group");
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.printStackTrace();
} }
...@@ -21,6 +23,7 @@ public class ProxyConfig { ...@@ -21,6 +23,7 @@ public class ProxyConfig {
public static String registry; public static String registry;
public static Long proxyid;
public static String group; public static String group;
} }
...@@ -5,6 +5,7 @@ import java.util.List; ...@@ -5,6 +5,7 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Objects; import java.util.Objects;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
...@@ -87,12 +88,11 @@ public class MediaSelfSourceCrawler { ...@@ -87,12 +88,11 @@ public class MediaSelfSourceCrawler {
* @return * @return
*/ */
private GroupSync search(GroupSync counter, String url,Attribution attr, MediaSelfSourceDataCallBack callback) { private GroupSync search(GroupSync counter, String url,Attribution attr, MediaSelfSourceDataCallBack callback) {
logger.info("当前处理 URL: {}", url); logger.info("当前处理 URL: {}", attr.get());
Map<String,Object> map = new HashMap<>(); Map<String,Object> map = new HashMap<>();
if(url.contains("toutiao.com")) { if(url.contains("toutiao.com")) {
map.put("referer", url); map.put("referer", url);
} }
map.put("Connection", "close");
url = dealUrl(url); url = dealUrl(url);
if(Objects.nonNull(url)) { if(Objects.nonNull(url)) {
Request request = RequestUtils.wrapGet(url, map); Request request = RequestUtils.wrapGet(url, map);
...@@ -148,7 +148,6 @@ public class MediaSelfSourceCrawler { ...@@ -148,7 +148,6 @@ public class MediaSelfSourceCrawler {
/** /**
* *
* @Description 解析文章获取相关数据 * @Description 解析文章获取相关数据
* @param response
* @param attr * @param attr
* @param callback * @param callback
*/ */
...@@ -156,12 +155,11 @@ public class MediaSelfSourceCrawler { ...@@ -156,12 +155,11 @@ public class MediaSelfSourceCrawler {
MediaSelfSourceDataCallBack callback) { MediaSelfSourceDataCallBack callback) {
String source = null; String source = null;
String channel = null; String channel = null;
String url = attr.get().toString();
try { try {
source = MatchSource.matchMediaSelfSource(attr.get().toString(),result); source = MatchSource.matchMediaSelfSource(url,result);
if(source==null || source.equals("")){ logger.info(url+"=======" + source);
source = null; channel = MatchChannel.verifyChannel(url);
}
channel = MatchChannel.verifyChannel(attr.get().toString());
if(channel==null){ if(channel==null){
List<Node> nodeList = Jsoup.parse(result).head().childNodes(); List<Node> nodeList = Jsoup.parse(result).head().childNodes();
channel = MatchChannel.matchChannel(nodeList); channel = MatchChannel.matchChannel(nodeList);
...@@ -170,8 +168,7 @@ public class MediaSelfSourceCrawler { ...@@ -170,8 +168,7 @@ public class MediaSelfSourceCrawler {
logger.error("exception ",e); logger.error("exception ",e);
source = null; source = null;
} }
logger.info(attr.get()+"=================来源" + source); MediaSelfSourceBean msfb = new MediaSelfSourceBean(url, source, channel);
MediaSelfSourceBean msfb = new MediaSelfSourceBean(attr.get().toString(), source, channel);
if (callback == null) { if (callback == null) {
logger.warn("DataCallback 对象为 null,无法保存数据"); logger.warn("DataCallback 对象为 null,无法保存数据");
} else { } else {
......
...@@ -4,11 +4,11 @@ import java.util.ArrayList; ...@@ -4,11 +4,11 @@ import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
import java.util.List; import java.util.List;
import com.zhiwei.proxy.config.SimpleConfig;
import com.zhiwei.source_forward.util.ProxyInit;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean; import com.zhiwei.source_forward.bean.MediaSelfSourceBean;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution; import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution;
import com.zhiwei.source_forward.crawler.MediaSelfSourceCrawler; import com.zhiwei.source_forward.crawler.MediaSelfSourceCrawler;
...@@ -30,9 +30,10 @@ public class MediaSelfSource { ...@@ -30,9 +30,10 @@ public class MediaSelfSource {
} }
public static void main(String[] args) { public static void main(String[] args) {
ProxyFactory.init("zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181", "local", GroupType.PROVIDER, 10000002L); ProxyInit.initProxy();
List<String> urlList = new ArrayList<>(); List<String> urlList = new ArrayList<>();
urlList.add("https://wap.peopleapp.com/article/rmh12074926/0"); urlList.add("https://www.tuicool.com/articles/nIfmu2B");
List<MediaSelfSourceBean> u = MediaSelfSource.getMediaSelfSource(urlList); List<MediaSelfSourceBean> u = MediaSelfSource.getMediaSelfSource(urlList);
for(MediaSelfSourceBean b : u) { for(MediaSelfSourceBean b : u) {
System.out.println(b.toString()); System.out.println(b.toString());
......
...@@ -6,11 +6,10 @@ import java.util.List; ...@@ -6,11 +6,10 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Map.Entry; import java.util.Map.Entry;
import com.zhiwei.source_forward.util.ProxyInit;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.source_forward.bean.SourceForwardBean; import com.zhiwei.source_forward.bean.SourceForwardBean;
import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution; import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution;
import com.zhiwei.source_forward.crawler.SourceForwardCrawler; import com.zhiwei.source_forward.crawler.SourceForwardCrawler;
...@@ -79,7 +78,7 @@ public class SourceForward { ...@@ -79,7 +78,7 @@ public class SourceForward {
} }
public static void main(String[] args) { public static void main(String[] args) {
ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER, 10000002); ProxyInit.initProxy();
List<String> urlList = new ArrayList<>(); List<String> urlList = new ArrayList<>();
urlList.add("http://software.it168.com/a2019/0621/6005/000006005693.shtml"); urlList.add("http://software.it168.com/a2019/0621/6005/000006005693.shtml");
List<SourceForwardBean> da = SourceForward.getSourceForward(urlList); List<SourceForwardBean> da = SourceForward.getSourceForward(urlList);
......
package com.zhiwei.source_forward.run; package com.zhiwei.source_forward.run;
import java.io.IOException;
import java.net.Proxy;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Map.Entry; import java.util.Map.Entry;
import com.zhiwei.source_forward.util.ProxyInit;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
import com.zhiwei.async.TaskBoot;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.source_forward.bean.UrlLiveBean; import com.zhiwei.source_forward.bean.UrlLiveBean;
import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution; import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution;
import com.zhiwei.source_forward.crawler.UrlLiveCrawler; import com.zhiwei.source_forward.crawler.UrlLiveCrawler;
import com.zhiwei.source_forward.crawler.UrlLiveCrawlerNew;
import com.zhiwei.source_forward.util.UrlLiveDataCallback; import com.zhiwei.source_forward.util.UrlLiveDataCallback;
import okhttp3.Request;
import okhttp3.Response;
/** /**
* @ClassName: URLLive * @ClassName: URLLive
* @Description: 验证链接是否已删除 * @Description: 验证链接是否已删除
...@@ -84,7 +74,7 @@ public class URLLive { ...@@ -84,7 +74,7 @@ public class URLLive {
} }
public static void main(String[] args) { public static void main(String[] args) {
ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER, 10000002); ProxyInit.initProxy();
List<String> urlList = new ArrayList<>(); List<String> urlList = new ArrayList<>();
urlList.add("http://a.mp.uc.cn/article.html?uc_param_str=frdnsnpfvecpntnwprdssskt#!wm_aid=038b8207b444418c845f43e4d2d3a754"); urlList.add("http://a.mp.uc.cn/article.html?uc_param_str=frdnsnpfvecpntnwprdssskt#!wm_aid=038b8207b444418c845f43e4d2d3a754");
......
...@@ -5,6 +5,7 @@ import java.util.Objects; ...@@ -5,6 +5,7 @@ import java.util.Objects;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
...@@ -60,7 +61,7 @@ public class MatchSource { ...@@ -60,7 +61,7 @@ public class MatchSource {
if(url.contains("thepaper.cn")){ if(url.contains("thepaper.cn")){
//单独处理澎湃数据 //单独处理澎湃数据
source = document.select("div.news_about").select("p").select("span").text().replaceAll(".*来源:", ""); source = document.select("div.news_about").select("p").select("span").text().replaceAll(".*来源:", "");
if(source.length() == 0) { if(StringUtils.isNotBlank(source)) {
source = document.select("div.news_about").text().replaceAll(" \\d{4}.*|.*/", ""); source = document.select("div.news_about").text().replaceAll(" \\d{4}.*|.*/", "");
} }
}else if(url.contains("sports.eastday.com")){ }else if(url.contains("sports.eastday.com")){
...@@ -372,14 +373,15 @@ public class MatchSource { ...@@ -372,14 +373,15 @@ public class MatchSource {
} }
} }
}else if(url.contains("tznew.58.com")){ }else if(url.contains("tznew.58.com")){
//58
source = JSONObject.parseObject(html).getJSONObject("result").getString("author"); source = JSONObject.parseObject(html).getJSONObject("result").getString("author");
if(source!=null && source.length()>1){ if(source!=null && source.length()>1){
source = "58-" + source; source = "58-" + source;
} }
}else if(url.contains("c.m.163.com")){ }else if(url.contains("c.m.163.com")){
//58
source = document.select("section.g-article.js-article > div.js-article-inner > div > b").text(); source = document.select("section.g-article.js-article > div.js-article-inner > div > b").text();
if(StringUtils.isBlank(source)){
source = document.select("div.info > h3").text();
}
if(source!=null && source.length()>1){ if(source!=null && source.length()>1){
source = "网易新闻-" + source; source = "网易新闻-" + source;
} }
...@@ -445,10 +447,23 @@ public class MatchSource { ...@@ -445,10 +447,23 @@ public class MatchSource {
if(source!=null && source.length()>1){ if(source!=null && source.length()>1){
source = "新浪-" + source; source = "新浪-" + source;
} }
}else if(url.contains("baijiahao.baidu.com")){ }else if(url.contains("k.sina.cn")){
//百度百家 source = document.select("h2.weibo_user").text();
source = document.select("p.author-name").first().text().trim();
if(source!=null && source.length()>1){ if(source!=null && source.length()>1){
source = "新浪-" + source;
}
}else if(url.contains("blog.sina.com.cn")){
source = document.select("strong#ownernick").text();
if(source!=null && source.length()>1){
source = "新浪博客-" + source;
}
}else if(url.contains("baijiahao.baidu.com") || url.contains("mbd.baidu.com")){
//百度百家
source = document.select("span.userNameSpan").text();
if(StringUtils.isBlank(source)){
source = document.select("p.author-name:nth-child(1)").text();
}
if(StringUtils.isNotBlank(source)){
source = "百度百家-" + source; source = "百度百家-" + source;
} }
}else if(url.contains("app.myzaker.com")){ }else if(url.contains("app.myzaker.com")){
...@@ -528,12 +543,12 @@ public class MatchSource { ...@@ -528,12 +543,12 @@ public class MatchSource {
} }
}else if(url.contains("mp.qq.com")){ }else if(url.contains("mp.qq.com")){
source = document.select("div#account_top > div.puin_text > div.pname").text(); source = document.select("div#account_top > div.puin_text > div.pname").text();
if(source!=null && !source.equals("")){ if(StringUtils.isNotBlank(source)){
source = "QQ看点-" + source; source = "QQ看点-" + source;
} }
}else if(url.contains("v.qq.com")) { }else if(url.contains("v.qq.com")) {
source = document.select("span.user_name").text(); source = document.select("span.user_name").text();
if(source!=null && !source.equals("")){ if(StringUtils.isNotBlank(source)){
source = "腾讯视频-" + source; source = "腾讯视频-" + source;
} }
}else if(url.contains("qq.com/")){ }else if(url.contains("qq.com/")){
...@@ -569,137 +584,175 @@ public class MatchSource { ...@@ -569,137 +584,175 @@ public class MatchSource {
}else if(url.contains("3g.163.com")){ }else if(url.contains("3g.163.com")){
source = document.select("div.info").select("[class=\"source js-source\"]") source = document.select("div.info").select("[class=\"source js-source\"]")
.text(); .text();
if(source!=null && !source.equals("")){ if(StringUtils.isNotBlank(source)){
source = "网易号-" + source; source = "网易号-" + source;
} }
}else if(url.contains("myzaker.com")){ }else if(url.contains("myzaker.com")){
source = document.select("div.article_header > div > a > span.auther") source = document.select("div.article_header > div > a > span.auther")
.text(); .text();
if(source!=null && !source.equals("")){ if(StringUtils.isNotBlank(source)){
source = "zaker-" + source; source = "zaker-" + source;
} }
}else if(url.contains("edushi.com")){ }else if(url.contains("edushi.com")){
source = document.select("div.eds-name-box > div.eds-name > a > div.name") source = document.select("div.eds-name-box > div.eds-name > a > div.name")
.text(); .text();
if(source!=null && !source.equals("")){ if(StringUtils.isNotBlank(source)){
source = "今日潮闻-" + source; source = "今日潮闻-" + source;
} }
}else if(url.contains("ijiandao.com")){ }else if(url.contains("ijiandao.com")){
source = document.select("div.article-author > span.author-name > a") source = document.select("div.article-author > span.author-name > a")
.text(); .text();
if(source!=null && !source.equals("")){ if(StringUtils.isNotBlank(source)){
source = "爱尖刀-" + source; source = "爱尖刀-" + source;
} }
}else if(url.contains("chuangyejia.com")){ }else if(url.contains("chuangyejia.com")){
source = document.select("div.article-title > ul.article-author > li:nth-child(1)") source = document.select("div.article-title > ul.article-author > li:nth-child(1)")
.text(); .text();
if(source!=null && !source.equals("")){ if(StringUtils.isNotBlank(source)){
source = "创业家-" + source; source = "创业家-" + source;
} }
}else if(url.contains("kejixun.com")){ }else if(url.contains("kejixun.com")){
source = document.select("div.r-box.r-box-none.clearfix > div > div.big-man > h3 > a") source = document.select("div.r-box.r-box-none.clearfix > div > div.big-man > h3 > a")
.text(); .text();
if(source!=null && !source.equals("")){ if(StringUtils.isNotBlank(source)){
source = "科技讯-" + source; source = "科技讯-" + source;
} }
}else if(url.contains("tmtpost.com")){ }else if(url.contains("tmtpost.com")){
source = document.select("article > div.post-info > a") source = document.select("article > div.post-info > a")
.text(); .text();
if(source!=null && !source.equals("")){ if(StringUtils.isNotBlank(source)){
source = "钛媒体-" + source; source = "钛媒体-" + source;
} }
}else if(url.contains("cyzone.cn")){ }else if(url.contains("cyzone.cn")){
source = document.select("div.article-author-info > div.author-main > div > div.a-word > div.a-name > a") source = document.select("div.article-author-info > div.author-main > div > div.a-word > div.a-name > a")
.text(); .text();
if(source!=null && !source.equals("")){ if(StringUtils.isNotBlank(source)){
source = "创业邦-" + source; source = "创业邦-" + source;
} }
}else if(url.contains("36kr.com")){ }else if(url.contains("36kr.com")){
source = document.select("div.info-header-text > a.author-name").text(); source = document.select("div.info-header-text > a.author-name").text();
if(source!=null && !source.equals("")){ if(StringUtils.isNotBlank(source)){
return "36氪-" + source; return "36氪-" + source;
} }
source = document.select("h4.author-name").text(); source = document.select("h4.author-name").text();
if(source!=null && !source.equals("")){ if(StringUtils.isNotBlank(source)){
return "36氪-" + source; return "36氪-" + source;
} }
source = document.select("span.author-nickname").text(); source = document.select("span.author-nickname").text();
if(source!=null && !source.equals("")){ if(StringUtils.isNotBlank(source)){
return "36氪-" + source; return "36氪-" + source;
} }
}else if(url.contains("lianxianjia.com")){ }else if(url.contains("lianxianjia.com")){
source = document.select("span.author-name").text(); source = document.select("span.author-name").text();
if(source!=null && !source.equals("")){ if(StringUtils.isNotBlank(source)){
source = "连线家-" + source; source = "连线家-" + source;
} }
}else if(url.contains("itouchtv.cn")){ }else if(url.contains("itouchtv.cn")){
source = document.select("div.index__article-media-20Tg_ > span:nth-child(1)").text(); source = document.select("div.index__article-media-20Tg_ > span:nth-child(1)").text();
if(source!=null && !source.equals("")){ if(StringUtils.isNotBlank(source)){
source = "触电新闻-" + source; source = "触电新闻-" + source;
} }
}else if(url.contains("whb.cn")){ }else if(url.contains("whb.cn")){
source = document.select("div.yidian-info > span:nth-child(1)").text(); source = document.select("div.yidian-info > span:nth-child(1)").text();
if(source!=null && !source.equals("")){ if(StringUtils.isNotBlank(source)){
source = "文汇APP-" + source; source = "文汇APP-" + source;
} }
}else if(url.contains("blogchina.com")){ }else if(url.contains("blogchina.com")){
source = document.select("div.meta-top > label.lm_name > span > a").text(); source = document.select("div.meta-top > label.lm_name > span > a").text();
if(source!=null && !source.equals("")){ if(StringUtils.isNotBlank(source)){
source = "博客中国-" + source; source = "博客中国-" + source;
} }
}else if(url.contains(".iqiyi.com")) { }else if(url.contains(".iqiyi.com")) {
source = JSONObject.parseObject(html.split("page-info='")[1].split("'")[0]).getJSONObject("user").getString("name"); source = JSONObject.parseObject(html.split("page-info='")[1].split("'")[0]).getJSONObject("user").getString("name");
if(source!=null && !source.equals("")){ if(StringUtils.isNotBlank(source)){
source = "爱奇艺-" + source; source = "爱奇艺-" + source;
} }
}else if(url.contains("v.youku.com")) { }else if(url.contains("v.youku.com")) {
source = document.select("a.sub-name").text(); source = document.select("a.sub-name").text();
if(source!=null && !source.equals("")){ if(StringUtils.isNotBlank(source)){
source = "优酷-" + source; source = "优酷-" + source;
} }
}else if(url.contains("jiemian.com")) { }else if(url.contains("jiemian.com")) {
source = document.select("div.article-info > p > span.author > a").text(); source = document.select("div.article-info > p > span.author > a").text();
if(source!=null && !source.equals("")){ if(StringUtils.isNotBlank(source)){
source = "界面新闻-" + source; source = "界面新闻-" + source;
} }
}else if (url.contains("iyiou.com")) { }else if (url.contains("iyiou.com")) {
source = document.select("div#post_author > a").text(); source = document.select("div#post_author > a").text();
if(source!=null && !source.equals("")) { if(StringUtils.isNotBlank(source)) {
source = "亿欧网-" + source; source = "亿欧网-" + source;
} }
}else if (url.contains("lanjingtmt.com")) { }else if (url.contains("lanjingtmt.com")) {
source = document.select("div.scd-title > a:nth-child(2)").text(); source = document.select("div.scd-title > a:nth-child(2)").text();
if(source!=null && !source.equals("")) { if(StringUtils.isNotBlank(source)) {
source = "蓝鲸-" + source; source = "蓝鲸-" + source;
} }
}else if (url.contains("lanjinger.com")) { }else if (url.contains("lanjinger.com")) {
if(document.select("div.content_left > div:nth-child(2) > span").text().contains("专栏")) { if(document.select("div.content_left > div:nth-child(2) > span").text().contains("专栏")) {
source = document.select("a.author_name").text().replaceAll(".*编辑| ", ""); source = document.select("a.author_name").text().replaceAll(".*编辑| ", "");
if(source!=null && !source.equals("")) { if(StringUtils.isNotBlank(source)) {
source = "蓝鲸财经-" + source; source = "蓝鲸财经-" + source;
} }
} }
}else if (url.contains("huxiu.com")) { }else if (url.contains("huxiu.com")) {
source = document.select("div.article__author-info-box > a.article-author-info > span.author-info__username").text(); source = document.select("div.article__author-info-box > a.article-author-info > span.author-info__username").text();
if(source!=null && !source.equals("")) { if(StringUtils.isNotBlank(source)) {
source = "虎嗅-" + source; source = "虎嗅-" + source;
} }
}else if (url.contains("chuansongme.com")) { }else if (url.contains("chuansongme.com")) {
source = document.select("div.rich_media_meta_list > span.rich_media_meta.rich_media_meta_text").text(); source = document.select("div.rich_media_meta_list > span.rich_media_meta.rich_media_meta_text").text();
if(source!=null && !source.equals("")) { if(StringUtils.isNotBlank(source)) {
source = "传送门-" + source; source = "传送门-" + source;
} }
}else if (url.contains("a.mp.uc.cn")) { }else if (url.contains("a.mp.uc.cn")) {
JSONObject json = JSONObject.parseObject(html); JSONObject json = JSONObject.parseObject(html);
source = json.getJSONObject("data").getJSONObject("_author").getString("author_name"); source = json.getJSONObject("data").getJSONObject("_author").getString("author_name");
if(source!=null && !source.equals("")) { if(StringUtils.isNotBlank(source)) {
source = "uc-" + source;
}
}else if (url.contains("m.uczzd.cn")) {
if(html.contains("var xissJsonData =")){
html = html.split("var xissJsonData = ")[1].split("};")[0]+"}";
source = JSONObject.parseObject(html).getString("source_name");
}
if(StringUtils.isNotBlank(source)) {
source = "uc-" + source; source = "uc-" + source;
} }
}else if (url.contains("kd.youth.cn")) { }else if (url.contains("kd.youth.cn")) {
source = document.select("body > div > div > div.rich_media_meta_list > a").text(); source = document.select("body > div > div > div.rich_media_meta_list > a").text();
if(source!=null && !source.equals("")) { if(StringUtils.isNotBlank(source)) {
source = "中青在线-" + source; source = "中青在线-" + source;
} }
}else if (url.contains("zhuanlan.zhihu.com")) {
source = document.select("a.UserLink-link").text();
if(StringUtils.isNotBlank(source)) {
source = "知乎专栏-" + source;
}
}else if (url.contains("wulizixun.com")) {
source = document.select("span.newdetailOrigin").text();
if(StringUtils.isNotBlank(source)) {
source = "唔哩头条-" + source;
}
}else if(url.contains("t.10jqka.com.cn")){
source = document.select("a[class=\"link777 post-author db fl\"]").text();
if(StringUtils.isNotBlank(source)) {
source = "同花顺-" + source;
}
}else if(url.contains("shangyexinzhi.com")){
source = document.select("span.hover-color_change").text();
if(StringUtils.isNotBlank(source)) {
source = "商业新知-" + source;
}
}else if(url.contains("thepaper.cn")){
source = document.select("a> div.name").text();
if(StringUtils.isNotBlank(source)){
source = "澎湃新闻-" + source;
}
}else if(url.contains("tuicool.com")){
source = document.select("span.from> a").text();
if(StringUtils.isNotBlank(source)){
source = "推酷-" + source;
}
} }
return source; return source;
} catch (Exception e) { } catch (Exception e) {
......
package com.zhiwei.source_forward.util;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.proxy.config.SimpleConfig;
import com.zhiwei.source_forward.config.ProxyConfig;
/**
 * Initializes the crawler proxy using the settings exposed by {@link ProxyConfig}.
 *
 * @author xMx
 * @date 2020-01-06 09:29:04
 */
public class ProxyInit {

    /** Application name passed to the proxy configuration builder. */
    private static final String APP_NAME = "xumiaoxin";

    /**
     * Builds a {@code SimpleConfig} from {@code ProxyConfig.registry},
     * {@code ProxyConfig.proxyid} and {@code ProxyConfig.group}, then hands it to
     * {@code ProxyFactory.init}.
     *
     * @throws IllegalStateException if {@code ProxyConfig.proxyid} is {@code null},
     *         e.g. because the properties file could not be loaded or does not
     *         contain a valid {@code proxyid} entry
     */
    public static void initProxy() {
        // ProxyConfig.proxyid is a boxed Long; unboxing it blindly would raise a
        // message-less NullPointerException when the configuration failed to load.
        Long configuredId = ProxyConfig.proxyid;
        if (configuredId == null) {
            throw new IllegalStateException(
                    "ProxyConfig.proxyid is not set; check that the proxy properties file "
                            + "was loaded and defines a numeric 'proxyid'");
        }
        String address = ProxyConfig.registry;
        long appId = configuredId;
        ProxyFactory.init(SimpleConfig.builder()
                .registry(address)
                .appName(APP_NAME)
                .appId(appId)
                .group(ProxyConfig.group)
                .build());
    }
}
#registry=zookeeper://192.168.0.203:2181;zookeeper://192.168.0.104:2181;zookeeper://192.168.0.105:2181 #registry=zookeeper://192.168.0.203:2181;zookeeper://192.168.0.104:2181;zookeeper://192.168.0.105:2181
#group=hangzhou #group=hangzhou
##########################测试地址############################## ##########################测试地址##############################
registry=zookeeper://192.168.0.36:2181 registry=zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181
proxyid=10000002
group=local group=local
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment