Commit 0abfbd4a by zhiwei

添加自媒体匹配

parent 4e02a60f
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>source-forward</artifactId>
<version>0.2.1-SNAPSHOT</version>
<version>0.2.2-SNAPSHOT</version>
<name>source-forward</name>
<description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description>
......@@ -24,12 +24,12 @@
<dependency>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.1.3-SNAPSHOT</version>
<version>0.1.6-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId>
<version>0.5.5.6-SNAPSHOT</version>
<version>0.6.1.0-SNAPSHOT</version>
</dependency>
</dependencies>
......
......@@ -13,7 +13,9 @@ public class ProxyConfig {
conf.load(is);
is.close();
registry = conf.getProperty("registry");
proxyid = Long.valueOf(conf.getProperty("proxyid"));
group = conf.getProperty("group");
} catch (Exception e) {
e.printStackTrace();
}
......@@ -21,6 +23,7 @@ public class ProxyConfig {
public static String registry;
public static Long proxyid;
public static String group;
}
......@@ -5,6 +5,7 @@ import java.util.List;
import java.util.Map;
import java.util.Objects;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
......@@ -87,12 +88,11 @@ public class MediaSelfSourceCrawler {
* @return
*/
private GroupSync search(GroupSync counter, String url,Attribution attr, MediaSelfSourceDataCallBack callback) {
logger.info("当前处理 URL: {}", url);
logger.info("当前处理 URL: {}", attr.get());
Map<String,Object> map = new HashMap<>();
if(url.contains("toutiao.com")) {
map.put("referer", url);
}
map.put("Connection", "close");
url = dealUrl(url);
if(Objects.nonNull(url)) {
Request request = RequestUtils.wrapGet(url, map);
......@@ -148,7 +148,6 @@ public class MediaSelfSourceCrawler {
/**
*
* @Description 解析文章获取相关数据
* @param response
* @param attr
* @param callback
*/
......@@ -156,12 +155,11 @@ public class MediaSelfSourceCrawler {
MediaSelfSourceDataCallBack callback) {
String source = null;
String channel = null;
String url = attr.get().toString();
try {
source = MatchSource.matchMediaSelfSource(attr.get().toString(),result);
if(source==null || source.equals("")){
source = null;
}
channel = MatchChannel.verifyChannel(attr.get().toString());
source = MatchSource.matchMediaSelfSource(url,result);
logger.info(url+"=======" + source);
channel = MatchChannel.verifyChannel(url);
if(channel==null){
List<Node> nodeList = Jsoup.parse(result).head().childNodes();
channel = MatchChannel.matchChannel(nodeList);
......@@ -170,8 +168,7 @@ public class MediaSelfSourceCrawler {
logger.error("exception ",e);
source = null;
}
logger.info(attr.get()+"=================来源" + source);
MediaSelfSourceBean msfb = new MediaSelfSourceBean(attr.get().toString(), source, channel);
MediaSelfSourceBean msfb = new MediaSelfSourceBean(url, source, channel);
if (callback == null) {
logger.warn("DataCallback 对象为 null,无法保存数据");
} else {
......
......@@ -4,11 +4,11 @@ import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import com.zhiwei.proxy.config.SimpleConfig;
import com.zhiwei.source_forward.util.ProxyInit;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution;
import com.zhiwei.source_forward.crawler.MediaSelfSourceCrawler;
......@@ -30,9 +30,10 @@ public class MediaSelfSource {
}
public static void main(String[] args) {
ProxyFactory.init("zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181", "local", GroupType.PROVIDER, 10000002L);
ProxyInit.initProxy();
List<String> urlList = new ArrayList<>();
urlList.add("https://wap.peopleapp.com/article/rmh12074926/0");
urlList.add("https://www.tuicool.com/articles/nIfmu2B");
List<MediaSelfSourceBean> u = MediaSelfSource.getMediaSelfSource(urlList);
for(MediaSelfSourceBean b : u) {
System.out.println(b.toString());
......
......@@ -6,11 +6,10 @@ import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import com.zhiwei.source_forward.util.ProxyInit;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.source_forward.bean.SourceForwardBean;
import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution;
import com.zhiwei.source_forward.crawler.SourceForwardCrawler;
......@@ -79,7 +78,7 @@ public class SourceForward {
}
public static void main(String[] args) {
ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER, 10000002);
ProxyInit.initProxy();
List<String> urlList = new ArrayList<>();
urlList.add("http://software.it168.com/a2019/0621/6005/000006005693.shtml");
List<SourceForwardBean> da = SourceForward.getSourceForward(urlList);
......
package com.zhiwei.source_forward.run;
import java.io.IOException;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import com.zhiwei.source_forward.util.ProxyInit;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.zhiwei.async.TaskBoot;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.source_forward.bean.UrlLiveBean;
import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution;
import com.zhiwei.source_forward.crawler.UrlLiveCrawler;
import com.zhiwei.source_forward.crawler.UrlLiveCrawlerNew;
import com.zhiwei.source_forward.util.UrlLiveDataCallback;
import okhttp3.Request;
import okhttp3.Response;
/**
* @ClassName: URLLive
* @Description: 验证链接是否已删除
......@@ -84,7 +74,7 @@ public class URLLive {
}
public static void main(String[] args) {
ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER, 10000002);
ProxyInit.initProxy();
List<String> urlList = new ArrayList<>();
urlList.add("http://a.mp.uc.cn/article.html?uc_param_str=frdnsnpfvecpntnwprdssskt#!wm_aid=038b8207b444418c845f43e4d2d3a754");
......
package com.zhiwei.source_forward.util;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.proxy.config.SimpleConfig;
import com.zhiwei.source_forward.config.ProxyConfig;
/**
 * Initializes the crawler proxy from the values exposed by {@link ProxyConfig}.
 *
 * @author xMx
 * @date January 6, 2020, 9:29:04 AM
 */
public class ProxyInit {

    /** Application name passed to the proxy configuration builder. */
    private static final String APP_NAME = "xumiaoxin";

    /**
     * Builds a {@link SimpleConfig} from {@code ProxyConfig.registry},
     * {@code ProxyConfig.proxyid} and {@code ProxyConfig.group}, then passes it to
     * {@code ProxyFactory.init}.
     *
     * @throws IllegalStateException if {@code ProxyConfig.proxyid} is {@code null}
     */
    public static void initProxy() {
        // proxyid is a boxed Long. Unboxing it while it is still null (for example
        // when the configuration could not be loaded) would raise a
        // NullPointerException without any message, so check it explicitly and
        // name the missing setting instead.
        Long configuredId = ProxyConfig.proxyid;
        if (configuredId == null) {
            throw new IllegalStateException(
                    "ProxyConfig.proxyid is not set; cannot initialize the proxy");
        }
        String address = ProxyConfig.registry;
        long appId = configuredId;
        ProxyFactory.init(SimpleConfig.builder()
                .registry(address)
                .appName(APP_NAME)
                .appId(appId)
                .group(ProxyConfig.group)
                .build());
    }
}
#registry=zookeeper://192.168.0.203:2181;zookeeper://192.168.0.104:2181;zookeeper://192.168.0.105:2181
#group=hangzhou
##########################测试地址##############################
registry=zookeeper://192.168.0.36:2181
registry=zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181
proxyid=10000002
group=local
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment