Commit 132b70cb by cwy

Merge branch 'source-forward-chen' of…

Merge branch 'source-forward-chen' of http://git.zhiweidata.top/zhangzhiwei/source_forward.git into source-forward-chen
parents f0fbf66b 0abfbd4a
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId> <groupId>com.zhiwei</groupId>
<artifactId>source-forward</artifactId> <artifactId>source-forward</artifactId>
<version>0.2.1-SNAPSHOT</version> <version>0.2.2-SNAPSHOT</version>
<name>source-forward</name> <name>source-forward</name>
<description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description> <description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description>
...@@ -24,12 +24,12 @@ ...@@ -24,12 +24,12 @@
<dependency> <dependency>
<groupId>com.zhiwei.tools</groupId> <groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId> <artifactId>zhiwei-tools</artifactId>
<version>0.1.3-SNAPSHOT</version> <version>0.1.6-SNAPSHOT</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.zhiwei.crawler</groupId> <groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId> <artifactId>crawler-core</artifactId>
<version>0.5.5.6-SNAPSHOT</version> <version>0.6.1.0-SNAPSHOT</version>
</dependency> </dependency>
</dependencies> </dependencies>
......
...@@ -13,7 +13,9 @@ public class ProxyConfig { ...@@ -13,7 +13,9 @@ public class ProxyConfig {
conf.load(is); conf.load(is);
is.close(); is.close();
registry = conf.getProperty("registry"); registry = conf.getProperty("registry");
proxyid = Long.valueOf(conf.getProperty("proxyid"));
group = conf.getProperty("group"); group = conf.getProperty("group");
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.printStackTrace();
} }
...@@ -21,6 +23,7 @@ public class ProxyConfig { ...@@ -21,6 +23,7 @@ public class ProxyConfig {
public static String registry; public static String registry;
public static Long proxyid;
public static String group; public static String group;
} }
...@@ -5,6 +5,7 @@ import java.util.List; ...@@ -5,6 +5,7 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Objects; import java.util.Objects;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
...@@ -87,12 +88,11 @@ public class MediaSelfSourceCrawler { ...@@ -87,12 +88,11 @@ public class MediaSelfSourceCrawler {
* @return * @return
*/ */
private GroupSync search(GroupSync counter, String url,Attribution attr, MediaSelfSourceDataCallBack callback) { private GroupSync search(GroupSync counter, String url,Attribution attr, MediaSelfSourceDataCallBack callback) {
logger.info("当前处理 URL: {}", url); logger.info("当前处理 URL: {}", attr.get());
Map<String,Object> map = new HashMap<>(); Map<String,Object> map = new HashMap<>();
if(url.contains("toutiao.com")) { if(url.contains("toutiao.com")) {
map.put("referer", url); map.put("referer", url);
} }
map.put("Connection", "close");
url = dealUrl(url); url = dealUrl(url);
if(Objects.nonNull(url)) { if(Objects.nonNull(url)) {
Request request = RequestUtils.wrapGet(url, map); Request request = RequestUtils.wrapGet(url, map);
...@@ -148,7 +148,6 @@ public class MediaSelfSourceCrawler { ...@@ -148,7 +148,6 @@ public class MediaSelfSourceCrawler {
/** /**
* *
* @Description 解析文章获取相关数据 * @Description 解析文章获取相关数据
* @param response
* @param attr * @param attr
* @param callback * @param callback
*/ */
...@@ -156,12 +155,11 @@ public class MediaSelfSourceCrawler { ...@@ -156,12 +155,11 @@ public class MediaSelfSourceCrawler {
MediaSelfSourceDataCallBack callback) { MediaSelfSourceDataCallBack callback) {
String source = null; String source = null;
String channel = null; String channel = null;
String url = attr.get().toString();
try { try {
source = MatchSource.matchMediaSelfSource(attr.get().toString(),result); source = MatchSource.matchMediaSelfSource(url,result);
if(source==null || source.equals("")){ logger.info(url+"=======" + source);
source = null; channel = MatchChannel.verifyChannel(url);
}
channel = MatchChannel.verifyChannel(attr.get().toString());
if(channel==null){ if(channel==null){
List<Node> nodeList = Jsoup.parse(result).head().childNodes(); List<Node> nodeList = Jsoup.parse(result).head().childNodes();
channel = MatchChannel.matchChannel(nodeList); channel = MatchChannel.matchChannel(nodeList);
...@@ -170,8 +168,7 @@ public class MediaSelfSourceCrawler { ...@@ -170,8 +168,7 @@ public class MediaSelfSourceCrawler {
logger.error("exception ",e); logger.error("exception ",e);
source = null; source = null;
} }
logger.info(attr.get()+"=================来源" + source); MediaSelfSourceBean msfb = new MediaSelfSourceBean(url, source, channel);
MediaSelfSourceBean msfb = new MediaSelfSourceBean(attr.get().toString(), source, channel);
if (callback == null) { if (callback == null) {
logger.warn("DataCallback 对象为 null,无法保存数据"); logger.warn("DataCallback 对象为 null,无法保存数据");
} else { } else {
......
...@@ -6,11 +6,10 @@ import java.util.List; ...@@ -6,11 +6,10 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Map.Entry; import java.util.Map.Entry;
import com.zhiwei.source_forward.util.ProxyInit;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.source_forward.bean.SourceForwardBean; import com.zhiwei.source_forward.bean.SourceForwardBean;
import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution; import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution;
import com.zhiwei.source_forward.crawler.SourceForwardCrawler; import com.zhiwei.source_forward.crawler.SourceForwardCrawler;
...@@ -79,7 +78,7 @@ public class SourceForward { ...@@ -79,7 +78,7 @@ public class SourceForward {
} }
public static void main(String[] args) { public static void main(String[] args) {
ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER, 10000002); ProxyInit.initProxy();
List<String> urlList = new ArrayList<>(); List<String> urlList = new ArrayList<>();
urlList.add("http://software.it168.com/a2019/0621/6005/000006005693.shtml"); urlList.add("http://software.it168.com/a2019/0621/6005/000006005693.shtml");
List<SourceForwardBean> da = SourceForward.getSourceForward(urlList); List<SourceForwardBean> da = SourceForward.getSourceForward(urlList);
......
package com.zhiwei.source_forward.run; package com.zhiwei.source_forward.run;
import java.io.IOException;
import java.net.Proxy;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Map.Entry; import java.util.Map.Entry;
import com.zhiwei.source_forward.util.ProxyInit;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
import com.zhiwei.async.TaskBoot;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.source_forward.bean.UrlLiveBean; import com.zhiwei.source_forward.bean.UrlLiveBean;
import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution; import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution;
import com.zhiwei.source_forward.crawler.UrlLiveCrawler; import com.zhiwei.source_forward.crawler.UrlLiveCrawler;
import com.zhiwei.source_forward.crawler.UrlLiveCrawlerNew;
import com.zhiwei.source_forward.util.UrlLiveDataCallback; import com.zhiwei.source_forward.util.UrlLiveDataCallback;
import okhttp3.Request;
import okhttp3.Response;
/** /**
* @ClassName: URLLive * @ClassName: URLLive
* @Description: 验证链接是否已删除 * @Description: 验证链接是否已删除
...@@ -84,7 +74,7 @@ public class URLLive { ...@@ -84,7 +74,7 @@ public class URLLive {
} }
public static void main(String[] args) { public static void main(String[] args) {
ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER, 10000002); ProxyInit.initProxy();
List<String> urlList = new ArrayList<>(); List<String> urlList = new ArrayList<>();
urlList.add("http://a.mp.uc.cn/article.html?uc_param_str=frdnsnpfvecpntnwprdssskt#!wm_aid=038b8207b444418c845f43e4d2d3a754"); urlList.add("http://a.mp.uc.cn/article.html?uc_param_str=frdnsnpfvecpntnwprdssskt#!wm_aid=038b8207b444418c845f43e4d2d3a754");
......
package com.zhiwei.source_forward.util;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.proxy.config.SimpleConfig;
import com.zhiwei.source_forward.config.ProxyConfig;
/**
* 初始化代理
* @author xMx
* @date 2020年1月6日 上午9:29:04
*/
public class ProxyInit {
/**
* 初始化代理
* void
*/
public static void initProxy() {
String address = ProxyConfig.registry;
String appName = "xumiaoxin";
long appId = ProxyConfig.proxyid;
ProxyFactory.init(SimpleConfig.builder().registry(address).appName(appName).appId(appId).group(ProxyConfig.group).build());
}
}
#registry=zookeeper://192.168.0.203:2181;zookeeper://192.168.0.104:2181;zookeeper://192.168.0.105:2181 #registry=zookeeper://192.168.0.203:2181;zookeeper://192.168.0.104:2181;zookeeper://192.168.0.105:2181
#group=hangzhou #group=hangzhou
##########################测试地址############################## ##########################测试地址##############################
registry=zookeeper://192.168.0.36:2181 registry=zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181
proxyid=10000002
group=local group=local
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment