Commit bde825dd by yangchen

添加今日头条转发原创判断和链接是否删除方法修改

parent 5a79e3d2
...@@ -30,6 +30,7 @@ ...@@ -30,6 +30,7 @@
<groupId>com.zhiwei.crawler</groupId> <groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId> <artifactId>crawler-core</artifactId>
<version>0.1.1-RELEASE</version> <version>0.1.1-RELEASE</version>
<scope>provided</scope>
</dependency> </dependency>
</dependencies> </dependencies>
......
package com.zhiwei.source_forward.crawler; package com.zhiwei.source_forward.crawler;
import java.util.List; import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
...@@ -11,7 +12,7 @@ import org.jsoup.nodes.Node; ...@@ -11,7 +12,7 @@ import org.jsoup.nodes.Node;
import com.zhiwei.crawler.async.MultiThreadingCounter; import com.zhiwei.crawler.async.MultiThreadingCounter;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.HttpRequestBuilder; import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.source_forward.bean.SourceForwardBean; import com.zhiwei.source_forward.bean.SourceForwardBean;
import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution; import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution;
...@@ -19,6 +20,7 @@ import com.zhiwei.source_forward.util.MatchChannel; ...@@ -19,6 +20,7 @@ import com.zhiwei.source_forward.util.MatchChannel;
import com.zhiwei.source_forward.util.MatchSource; import com.zhiwei.source_forward.util.MatchSource;
import com.zhiwei.source_forward.util.SourceData; import com.zhiwei.source_forward.util.SourceData;
import com.zhiwei.source_forward.util.SourceForwardDataCallBack; import com.zhiwei.source_forward.util.SourceForwardDataCallBack;
import com.zhiwei.tools.httpclient.HeaderTool;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response; import okhttp3.Response;
...@@ -31,7 +33,7 @@ public class SourceForwardCrawler { ...@@ -31,7 +33,7 @@ public class SourceForwardCrawler {
private static List<String> sourceList = SourceData.getSourceList(); private static List<String> sourceList = SourceData.getSourceList();
public MultiThreadingCounter submitTask(SourceForwardDataCallBack callback,String... urls) throws Exception { public MultiThreadingCounter submitTask(SourceForwardDataCallBack callback,String... urls) throws Exception {
MultiThreadingCounter counter = new MultiThreadingCounter(5,TimeUnit.MINUTES,false); MultiThreadingCounter counter = new MultiThreadingCounter(15,TimeUnit.MINUTES,false);
start(counter, callback, urls); start(counter, callback, urls);
return counter; return counter;
} }
...@@ -55,7 +57,12 @@ public class SourceForwardCrawler { ...@@ -55,7 +57,12 @@ public class SourceForwardCrawler {
private MultiThreadingCounter search(MultiThreadingCounter counter, String url,Attribution attr, SourceForwardDataCallBack callback) { private MultiThreadingCounter search(MultiThreadingCounter counter, String url,Attribution attr, SourceForwardDataCallBack callback) {
logger.info("当前处理 URL: {}", url); logger.info("当前处理 URL: {}", url);
Request request = HttpRequestBuilder.newGetRequest(url, null); Map<String,String> headers = HeaderTool.getCommonHead();
if(url.contains("www.toutiao.com")){
headers.put("referer", url);
}
Request request = RequestUtils.wrapGet(url, headers);
counter.increase(); counter.increase();
httpBoot.asyncCall(request, ProxyHolder.NAT_PROXY, true).addListener(future -> { httpBoot.asyncCall(request, ProxyHolder.NAT_PROXY, true).addListener(future -> {
try { try {
...@@ -83,7 +90,8 @@ public class SourceForwardCrawler { ...@@ -83,7 +90,8 @@ public class SourceForwardCrawler {
String isforward = "未知"; String isforward = "未知";
try { try {
if(response.isSuccessful()){ if(response.isSuccessful()){
Document document = Jsoup.parse(response.body().string()); String body = response.body().string();
Document document = Jsoup.parse(body);
if(attr.get().toString().contains("mp.weixin.qq.com")){ if(attr.get().toString().contains("mp.weixin.qq.com")){
isforward = document.select("div#meta_content").select("span#copyright_logo").text(); isforward = document.select("div#meta_content").select("span#copyright_logo").text();
if(isforward.contains("原创")){ if(isforward.contains("原创")){
...@@ -91,6 +99,12 @@ public class SourceForwardCrawler { ...@@ -91,6 +99,12 @@ public class SourceForwardCrawler {
}else { }else {
isforward = "未知"; isforward = "未知";
} }
}else if(attr.get().toString().contains("www.toutiao.com")){
if(body.contains("isOriginal")){
if(body.contains("isOriginal: true")){
isforward = "原创";
}
}
}else{ }else{
channel = MatchChannel.verifyChannel(attr.get().toString()); channel = MatchChannel.verifyChannel(attr.get().toString());
if(channel==null){ if(channel==null){
......
...@@ -7,6 +7,8 @@ import java.util.List; ...@@ -7,6 +7,8 @@ import java.util.List;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean; import com.zhiwei.source_forward.bean.MediaSelfSourceBean;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution; import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution;
import com.zhiwei.source_forward.crawler.MediaSelfSourceCrawler; import com.zhiwei.source_forward.crawler.MediaSelfSourceCrawler;
...@@ -22,13 +24,13 @@ public class MediaSelfSource { ...@@ -22,13 +24,13 @@ public class MediaSelfSource {
} }
public static void main(String[] args) { public static void main(String[] args) {
// ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER); ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
// List<String> urlList = new ArrayList<>(); List<String> urlList = new ArrayList<>();
// urlList.add("https://www.toutiao.com/a6452936157751968013/"); urlList.add("http://sh.qihoo.com/pc/91d1d565fe552fa1e?sign=360_e39369d1");
// List<MediaSelfSourceBean> u = MediaSelfSource.getMediaSelfSource(urlList); List<MediaSelfSourceBean> u = MediaSelfSource.getMediaSelfSource(urlList);
// for(MediaSelfSourceBean b : u) { for(MediaSelfSourceBean b : u) {
// System.out.println(b.toString()); System.out.println(b.toString());
// } }
} }
static class MediaSelfSourceCrawlerThread extends Thread{ static class MediaSelfSourceCrawlerThread extends Thread{
......
...@@ -149,7 +149,10 @@ public class MatchSource { ...@@ -149,7 +149,10 @@ public class MatchSource {
} }
}else if(url.contains("sh.qihoo.com")){ }else if(url.contains("sh.qihoo.com")){
//今日报点解析 //今日报点解析
source = document.select("p.info").select("span.source").text().trim(); source = document.select("span.source").text().trim();
if(source.length() < 2) {
source = document.select("p.article-info").select("a").text().trim();
}
if(source!=null && !source.equals("")){ if(source!=null && !source.equals("")){
source = "快资讯-" + source; source = "快资讯-" + source;
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment