Commit 132b70cb by cwy

Merge branch 'source-forward-chen' of…

Merge branch 'source-forward-chen' of http://git.zhiweidata.top/zhangzhiwei/source_forward.git into source-forward-chen
parents f0fbf66b 0abfbd4a
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>source-forward</artifactId>
<version>0.2.1-SNAPSHOT</version>
<version>0.2.2-SNAPSHOT</version>
<name>source-forward</name>
<description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description>
......@@ -24,12 +24,12 @@
<dependency>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.1.3-SNAPSHOT</version>
<version>0.1.6-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId>
<version>0.5.5.6-SNAPSHOT</version>
<version>0.6.1.0-SNAPSHOT</version>
</dependency>
</dependencies>
......
......@@ -13,7 +13,9 @@ public class ProxyConfig {
conf.load(is);
is.close();
registry = conf.getProperty("registry");
proxyid = Long.valueOf(conf.getProperty("proxyid"));
group = conf.getProperty("group");
} catch (Exception e) {
e.printStackTrace();
}
......@@ -21,6 +23,7 @@ public class ProxyConfig {
public static String registry;
public static Long proxyid;
public static String group;
}
......@@ -5,6 +5,7 @@ import java.util.List;
import java.util.Map;
import java.util.Objects;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
......@@ -87,12 +88,11 @@ public class MediaSelfSourceCrawler {
* @return
*/
private GroupSync search(GroupSync counter, String url,Attribution attr, MediaSelfSourceDataCallBack callback) {
logger.info("当前处理 URL: {}", url);
logger.info("当前处理 URL: {}", attr.get());
Map<String,Object> map = new HashMap<>();
if(url.contains("toutiao.com")) {
map.put("referer", url);
}
map.put("Connection", "close");
url = dealUrl(url);
if(Objects.nonNull(url)) {
Request request = RequestUtils.wrapGet(url, map);
......@@ -148,7 +148,6 @@ public class MediaSelfSourceCrawler {
/**
*
* @Description 解析文章获取相关数据
* @param response
* @param attr
* @param callback
*/
......@@ -156,12 +155,11 @@ public class MediaSelfSourceCrawler {
MediaSelfSourceDataCallBack callback) {
String source = null;
String channel = null;
String url = attr.get().toString();
try {
source = MatchSource.matchMediaSelfSource(attr.get().toString(),result);
if(source==null || source.equals("")){
source = null;
}
channel = MatchChannel.verifyChannel(attr.get().toString());
source = MatchSource.matchMediaSelfSource(url,result);
logger.info(url+"=======" + source);
channel = MatchChannel.verifyChannel(url);
if(channel==null){
List<Node> nodeList = Jsoup.parse(result).head().childNodes();
channel = MatchChannel.matchChannel(nodeList);
......@@ -170,8 +168,7 @@ public class MediaSelfSourceCrawler {
logger.error("exception ",e);
source = null;
}
logger.info(attr.get()+"=================来源" + source);
MediaSelfSourceBean msfb = new MediaSelfSourceBean(attr.get().toString(), source, channel);
MediaSelfSourceBean msfb = new MediaSelfSourceBean(url, source, channel);
if (callback == null) {
logger.warn("DataCallback 对象为 null,无法保存数据");
} else {
......
......@@ -6,11 +6,10 @@ import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import com.zhiwei.source_forward.util.ProxyInit;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.source_forward.bean.SourceForwardBean;
import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution;
import com.zhiwei.source_forward.crawler.SourceForwardCrawler;
......@@ -79,7 +78,7 @@ public class SourceForward {
}
public static void main(String[] args) {
ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER, 10000002);
ProxyInit.initProxy();
List<String> urlList = new ArrayList<>();
urlList.add("http://software.it168.com/a2019/0621/6005/000006005693.shtml");
List<SourceForwardBean> da = SourceForward.getSourceForward(urlList);
......
package com.zhiwei.source_forward.run;
import java.io.IOException;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import com.zhiwei.source_forward.util.ProxyInit;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.zhiwei.async.TaskBoot;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.source_forward.bean.UrlLiveBean;
import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution;
import com.zhiwei.source_forward.crawler.UrlLiveCrawler;
import com.zhiwei.source_forward.crawler.UrlLiveCrawlerNew;
import com.zhiwei.source_forward.util.UrlLiveDataCallback;
import okhttp3.Request;
import okhttp3.Response;
/**
* @ClassName: URLLive
* @Description: 验证链接是否已删除
......@@ -84,7 +74,7 @@ public class URLLive {
}
public static void main(String[] args) {
ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER, 10000002);
ProxyInit.initProxy();
List<String> urlList = new ArrayList<>();
urlList.add("http://a.mp.uc.cn/article.html?uc_param_str=frdnsnpfvecpntnwprdssskt#!wm_aid=038b8207b444418c845f43e4d2d3a754");
......
......@@ -5,6 +5,7 @@ import java.util.Objects;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
......@@ -60,7 +61,7 @@ public class MatchSource {
if(url.contains("thepaper.cn")){
//单独处理澎湃数据
source = document.select("div.news_about").select("p").select("span").text().replaceAll(".*来源:", "");
if(source.length() == 0) {
if(StringUtils.isNotBlank(source)) {
source = document.select("div.news_about").text().replaceAll(" \\d{4}.*|.*/", "");
}
}else if(url.contains("sports.eastday.com")){
......@@ -372,14 +373,15 @@ public class MatchSource {
}
}
}else if(url.contains("tznew.58.com")){
//58
source = JSONObject.parseObject(html).getJSONObject("result").getString("author");
if(source!=null && source.length()>1){
source = "58-" + source;
}
}else if(url.contains("c.m.163.com")){
//58
source = document.select("section.g-article.js-article > div.js-article-inner > div > b").text();
if(StringUtils.isBlank(source)){
source = document.select("div.info > h3").text();
}
if(source!=null && source.length()>1){
source = "网易新闻-" + source;
}
......@@ -445,10 +447,23 @@ public class MatchSource {
if(source!=null && source.length()>1){
source = "新浪-" + source;
}
}else if(url.contains("baijiahao.baidu.com")){
//百度百家
source = document.select("p.author-name").first().text().trim();
}else if(url.contains("k.sina.cn")){
source = document.select("h2.weibo_user").text();
if(source!=null && source.length()>1){
source = "新浪-" + source;
}
}else if(url.contains("blog.sina.com.cn")){
source = document.select("strong#ownernick").text();
if(source!=null && source.length()>1){
source = "新浪博客-" + source;
}
}else if(url.contains("baijiahao.baidu.com") || url.contains("mbd.baidu.com")){
//百度百家
source = document.select("span.userNameSpan").text();
if(StringUtils.isBlank(source)){
source = document.select("p.author-name:nth-child(1)").text();
}
if(StringUtils.isNotBlank(source)){
source = "百度百家-" + source;
}
}else if(url.contains("app.myzaker.com")){
......@@ -528,12 +543,12 @@ public class MatchSource {
}
}else if(url.contains("mp.qq.com")){
source = document.select("div#account_top > div.puin_text > div.pname").text();
if(source!=null && !source.equals("")){
if(StringUtils.isNotBlank(source)){
source = "QQ看点-" + source;
}
}else if(url.contains("v.qq.com")) {
source = document.select("span.user_name").text();
if(source!=null && !source.equals("")){
if(StringUtils.isNotBlank(source)){
source = "腾讯视频-" + source;
}
}else if(url.contains("qq.com/")){
......@@ -569,137 +584,175 @@ public class MatchSource {
}else if(url.contains("3g.163.com")){
source = document.select("div.info").select("[class=\"source js-source\"]")
.text();
if(source!=null && !source.equals("")){
if(StringUtils.isNotBlank(source)){
source = "网易号-" + source;
}
}else if(url.contains("myzaker.com")){
source = document.select("div.article_header > div > a > span.auther")
.text();
if(source!=null && !source.equals("")){
if(StringUtils.isNotBlank(source)){
source = "zaker-" + source;
}
}else if(url.contains("edushi.com")){
source = document.select("div.eds-name-box > div.eds-name > a > div.name")
.text();
if(source!=null && !source.equals("")){
if(StringUtils.isNotBlank(source)){
source = "今日潮闻-" + source;
}
}else if(url.contains("ijiandao.com")){
source = document.select("div.article-author > span.author-name > a")
.text();
if(source!=null && !source.equals("")){
if(StringUtils.isNotBlank(source)){
source = "爱尖刀-" + source;
}
}else if(url.contains("chuangyejia.com")){
source = document.select("div.article-title > ul.article-author > li:nth-child(1)")
.text();
if(source!=null && !source.equals("")){
if(StringUtils.isNotBlank(source)){
source = "创业家-" + source;
}
}else if(url.contains("kejixun.com")){
source = document.select("div.r-box.r-box-none.clearfix > div > div.big-man > h3 > a")
.text();
if(source!=null && !source.equals("")){
if(StringUtils.isNotBlank(source)){
source = "科技讯-" + source;
}
}else if(url.contains("tmtpost.com")){
source = document.select("article > div.post-info > a")
.text();
if(source!=null && !source.equals("")){
if(StringUtils.isNotBlank(source)){
source = "钛媒体-" + source;
}
}else if(url.contains("cyzone.cn")){
source = document.select("div.article-author-info > div.author-main > div > div.a-word > div.a-name > a")
.text();
if(source!=null && !source.equals("")){
if(StringUtils.isNotBlank(source)){
source = "创业邦-" + source;
}
}else if(url.contains("36kr.com")){
source = document.select("div.info-header-text > a.author-name").text();
if(source!=null && !source.equals("")){
if(StringUtils.isNotBlank(source)){
return "36氪-" + source;
}
source = document.select("h4.author-name").text();
if(source!=null && !source.equals("")){
if(StringUtils.isNotBlank(source)){
return "36氪-" + source;
}
source = document.select("span.author-nickname").text();
if(source!=null && !source.equals("")){
if(StringUtils.isNotBlank(source)){
return "36氪-" + source;
}
}else if(url.contains("lianxianjia.com")){
source = document.select("span.author-name").text();
if(source!=null && !source.equals("")){
if(StringUtils.isNotBlank(source)){
source = "连线家-" + source;
}
}else if(url.contains("itouchtv.cn")){
source = document.select("div.index__article-media-20Tg_ > span:nth-child(1)").text();
if(source!=null && !source.equals("")){
if(StringUtils.isNotBlank(source)){
source = "触电新闻-" + source;
}
}else if(url.contains("whb.cn")){
source = document.select("div.yidian-info > span:nth-child(1)").text();
if(source!=null && !source.equals("")){
if(StringUtils.isNotBlank(source)){
source = "文汇APP-" + source;
}
}else if(url.contains("blogchina.com")){
source = document.select("div.meta-top > label.lm_name > span > a").text();
if(source!=null && !source.equals("")){
if(StringUtils.isNotBlank(source)){
source = "博客中国-" + source;
}
}else if(url.contains(".iqiyi.com")) {
source = JSONObject.parseObject(html.split("page-info='")[1].split("'")[0]).getJSONObject("user").getString("name");
if(source!=null && !source.equals("")){
if(StringUtils.isNotBlank(source)){
source = "爱奇艺-" + source;
}
}else if(url.contains("v.youku.com")) {
source = document.select("a.sub-name").text();
if(source!=null && !source.equals("")){
if(StringUtils.isNotBlank(source)){
source = "优酷-" + source;
}
}else if(url.contains("jiemian.com")) {
source = document.select("div.article-info > p > span.author > a").text();
if(source!=null && !source.equals("")){
if(StringUtils.isNotBlank(source)){
source = "界面新闻-" + source;
}
}else if (url.contains("iyiou.com")) {
source = document.select("div#post_author > a").text();
if(source!=null && !source.equals("")) {
if(StringUtils.isNotBlank(source)) {
source = "亿欧网-" + source;
}
}else if (url.contains("lanjingtmt.com")) {
source = document.select("div.scd-title > a:nth-child(2)").text();
if(source!=null && !source.equals("")) {
if(StringUtils.isNotBlank(source)) {
source = "蓝鲸-" + source;
}
}else if (url.contains("lanjinger.com")) {
if(document.select("div.content_left > div:nth-child(2) > span").text().contains("专栏")) {
source = document.select("a.author_name").text().replaceAll(".*编辑| ", "");
if(source!=null && !source.equals("")) {
if(StringUtils.isNotBlank(source)) {
source = "蓝鲸财经-" + source;
}
}
}else if (url.contains("huxiu.com")) {
source = document.select("div.article__author-info-box > a.article-author-info > span.author-info__username").text();
if(source!=null && !source.equals("")) {
if(StringUtils.isNotBlank(source)) {
source = "虎嗅-" + source;
}
}else if (url.contains("chuansongme.com")) {
source = document.select("div.rich_media_meta_list > span.rich_media_meta.rich_media_meta_text").text();
if(source!=null && !source.equals("")) {
if(StringUtils.isNotBlank(source)) {
source = "传送门-" + source;
}
}else if (url.contains("a.mp.uc.cn")) {
JSONObject json = JSONObject.parseObject(html);
source = json.getJSONObject("data").getJSONObject("_author").getString("author_name");
if(source!=null && !source.equals("")) {
if(StringUtils.isNotBlank(source)) {
source = "uc-" + source;
}
}else if (url.contains("m.uczzd.cn")) {
if(html.contains("var xissJsonData =")){
html = html.split("var xissJsonData = ")[1].split("};")[0]+"}";
source = JSONObject.parseObject(html).getString("source_name");
}
if(StringUtils.isNotBlank(source)) {
source = "uc-" + source;
}
}else if (url.contains("kd.youth.cn")) {
source = document.select("body > div > div > div.rich_media_meta_list > a").text();
if(source!=null && !source.equals("")) {
if(StringUtils.isNotBlank(source)) {
source = "中青在线-" + source;
}
}else if (url.contains("zhuanlan.zhihu.com")) {
source = document.select("a.UserLink-link").text();
if(StringUtils.isNotBlank(source)) {
source = "知乎专栏-" + source;
}
}else if (url.contains("wulizixun.com")) {
source = document.select("span.newdetailOrigin").text();
if(StringUtils.isNotBlank(source)) {
source = "唔哩头条-" + source;
}
}else if(url.contains("t.10jqka.com.cn")){
source = document.select("a[class=\"link777 post-author db fl\"]").text();
if(StringUtils.isNotBlank(source)) {
source = "同花顺-" + source;
}
}else if(url.contains("shangyexinzhi.com")){
source = document.select("span.hover-color_change").text();
if(StringUtils.isNotBlank(source)) {
source = "商业新知-" + source;
}
}else if(url.contains("thepaper.cn")){
source = document.select("a> div.name").text();
if(StringUtils.isNotBlank(source)){
source = "澎湃新闻-" + source;
}
}else if(url.contains("tuicool.com")){
source = document.select("span.from> a").text();
if(StringUtils.isNotBlank(source)){
source = "推酷-" + source;
}
}
return source;
} catch (Exception e) {
......
package com.zhiwei.source_forward.util;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.proxy.config.SimpleConfig;
import com.zhiwei.source_forward.config.ProxyConfig;
/**
 * Initializes the crawler proxy from the values held by {@link ProxyConfig}.
 *
 * @author xMx
 * @date 2020-01-06 09:29:04
 */
public class ProxyInit {

    /** Application name passed to the proxy configuration builder. */
    private static final String APP_NAME = "xumiaoxin";

    /**
     * Builds a {@link SimpleConfig} from {@code ProxyConfig.registry},
     * {@code ProxyConfig.proxyid} and {@code ProxyConfig.group}, and hands it
     * to {@link ProxyFactory#init}.
     *
     * @throws IllegalStateException if {@code ProxyConfig.proxyid} is {@code null}
     */
    public static void initProxy() {
        // ProxyConfig.proxyid is a boxed Long and is null when it was never
        // assigned (for example if loading the configuration failed). Unboxing
        // it directly would throw a bare NullPointerException with no hint of
        // the cause, so fail with an explicit message instead.
        Long configuredId = ProxyConfig.proxyid;
        if (configuredId == null) {
            throw new IllegalStateException(
                    "ProxyConfig.proxyid is not set; check that the proxy configuration was loaded");
        }
        // registry and group are forwarded as-is; group may legitimately be absent.
        ProxyFactory.init(SimpleConfig.builder()
                .registry(ProxyConfig.registry)
                .appName(APP_NAME)
                .appId(configuredId.longValue())
                .group(ProxyConfig.group)
                .build());
    }
}
#registry=zookeeper://192.168.0.203:2181;zookeeper://192.168.0.104:2181;zookeeper://192.168.0.105:2181
#group=hangzhou
##########################测试地址##############################
registry=zookeeper://192.168.0.36:2181
registry=zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181
proxyid=10000002
group=local
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment