Commit 554dd201 by yangchen

增加自媒体 创业家 科技讯 爱尖刀 来源获取

parent 7f4a87a2
...@@ -29,7 +29,7 @@ ...@@ -29,7 +29,7 @@
<dependency> <dependency>
<groupId>com.zhiwei.crawler</groupId> <groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId> <artifactId>crawler-core</artifactId>
<version>0.3.0-RELEASE</version> <version>0.3.1-RELEASE</version>
<scope>provided</scope> <scope>provided</scope>
</dependency> </dependency>
</dependencies> </dependencies>
......
...@@ -92,6 +92,7 @@ public class MediaSelfSourceCrawler { ...@@ -92,6 +92,7 @@ public class MediaSelfSourceCrawler {
if(url.contains("toutiao.com")) { if(url.contains("toutiao.com")) {
map.put("referer", url); map.put("referer", url);
} }
map.put("Connection", "close");
Request request = RequestUtils.wrapGet(url, map); Request request = RequestUtils.wrapGet(url, map);
counter.add(); counter.add();
......
...@@ -22,8 +22,6 @@ import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution; ...@@ -22,8 +22,6 @@ import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution;
import com.zhiwei.source_forward.util.UrlLiveDataCallback; import com.zhiwei.source_forward.util.UrlLiveDataCallback;
import com.zhiwei.tools.httpclient.HeaderTool; import com.zhiwei.tools.httpclient.HeaderTool;
import okhttp3.Request;
/** /**
* *
* @ClassName UrlLiveCrawler * @ClassName UrlLiveCrawler
...@@ -35,17 +33,12 @@ import okhttp3.Request; ...@@ -35,17 +33,12 @@ import okhttp3.Request;
public class UrlLiveCrawler { public class UrlLiveCrawler {
private static final Logger logger = LogManager.getLogger(UrlLiveCrawler.class); private static final Logger logger = LogManager.getLogger(UrlLiveCrawler.class);
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot(false,2);
public GroupSync submitTask(UrlLiveDataCallback callback,String... urls) { public GroupSync submitTask(UrlLiveDataCallback callback,String... urls) {
try { GroupSync counter = new GroupSync();
GroupSync counter = new GroupSync(); start(counter, callback, urls);
start(counter, callback, urls); return counter;
return counter;
} catch (Exception e) {
logger.error(" 判断链接是否删除 {} ",e);
return null;
}
} }
private void start(GroupSync counter,UrlLiveDataCallback callback, String... urls) { private void start(GroupSync counter,UrlLiveDataCallback callback, String... urls) {
...@@ -72,9 +65,8 @@ public class UrlLiveCrawler { ...@@ -72,9 +65,8 @@ public class UrlLiveCrawler {
if(url.contains("www.toutiao.com")){ if(url.contains("www.toutiao.com")){
headers.put("referer", url); headers.put("referer", url);
} }
Request request = RequestUtils.wrapGet(url, headers);
counter.add(); counter.add();
httpBoot.asyncCall(request, ProxyHolder.NAT_PROXY).whenComplete((rs,ex) -> { httpBoot.asyncCall(RequestUtils.wrapGet(url, headers), ProxyHolder.NAT_PROXY).whenComplete((rs,ex) -> {
try { try {
if (Objects.isNull(ex)) { if (Objects.isNull(ex)) {
if(rs.code() == 200) { if(rs.code() == 200) {
...@@ -90,7 +82,7 @@ public class UrlLiveCrawler { ...@@ -90,7 +82,7 @@ public class UrlLiveCrawler {
} else { } else {
if(attr.getCount() > 3) { if(attr.getCount() > 3) {
callBack(callback, attr, -1,null); callBack(callback, attr, -1,null);
logger.info("{} 搜索结果访问失败: {}", request.url().url(), ex); logger.info("搜索结果访问失败: {}", ex);
}else { }else {
attr.AddCount(); attr.AddCount();
search(counter, attr.getAttr().toString(), attr, callback); search(counter, attr.getAttr().toString(), attr, callback);
...@@ -122,23 +114,18 @@ public class UrlLiveCrawler { ...@@ -122,23 +114,18 @@ public class UrlLiveCrawler {
private String dealUrl(String url) { private String dealUrl(String url) {
try { try {
if(url.contains("toutiao.com")) { if(url.contains("toutiao.com")) {
try { if(url.contains("www.toutiao.com")) {
if(url.contains("www.toutiao.com")) {
}else {
}else { url = url.replace("toutiao.com", "www.toutiao.com");
url = url.replace("toutiao.com", "www.toutiao.com"); }
} if(url.contains("https")) {
if(url.contains("https")) {
}else {
}else { url = url.replace("http", "https");
url = url.replace("http", "https"); }
} if(url.contains("group")) {
if(url.contains("group")) { url = "https://www.toutiao.com/a" + url.split("/")[4] + "/";
url = "https://www.toutiao.com/a" + url.split("/")[4] + "/";
}
} catch (Exception e) {
logger.error("url 解析出错 ",e);
return url;
} }
}else if(url.contains("mp.weixin.qq.com")) { }else if(url.contains("mp.weixin.qq.com")) {
if(url.contains("https")) { if(url.contains("https")) {
......
...@@ -25,7 +25,7 @@ public class MediaSelfSource { ...@@ -25,7 +25,7 @@ public class MediaSelfSource {
public static void main(String[] args) { public static void main(String[] args) {
ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER); ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
List<String> urlList = new ArrayList<>(); List<String> urlList = new ArrayList<>();
urlList.add("https://www.toutiao.com/a6669697912458445059/"); urlList.add("http://dy.163.com/v2/article/detail/EANTKV6H0512ES8F.html");
List<MediaSelfSourceBean> u = MediaSelfSource.getMediaSelfSource(urlList); List<MediaSelfSourceBean> u = MediaSelfSource.getMediaSelfSource(urlList);
for(MediaSelfSourceBean b : u) { for(MediaSelfSourceBean b : u) {
System.out.println(b.toString()); System.out.println(b.toString());
......
...@@ -226,7 +226,7 @@ public class MatchSource { ...@@ -226,7 +226,7 @@ public class MatchSource {
} }
}else if(url.contains("dy.163.com")){ }else if(url.contains("dy.163.com")){
//网易订阅-网易号 //网易订阅-网易号
source = document.select("div.colum_info>h4").text(); source = document.select("div.colum_info > h4").text();
if(source!=null && source.length()>1){ if(source!=null && source.length()>1){
source = "网易号-" + source; source = "网易号-" + source;
} }
...@@ -260,6 +260,24 @@ public class MatchSource { ...@@ -260,6 +260,24 @@ public class MatchSource {
if(source!=null && !source.equals("")){ if(source!=null && !source.equals("")){
source = "今日潮闻-" + source; source = "今日潮闻-" + source;
} }
}else if(url.contains("ijiandao.com")){
source = document.select("div.article-author > span.author-name > a")
.text();
if(source!=null && !source.equals("")){
source = "爱尖刀-" + source;
}
}else if(url.contains("chuangyejia.com")){
source = document.select("div.article-title > ul.article-author > li:nth-child(1)")
.text();
if(source!=null && !source.equals("")){
source = "创业家-" + source;
}
}else if(url.contains("kejixun.com")){
source = document.select("div.r-box.r-box-none.clearfix > div > div.big-man > h3 > a")
.text();
if(source!=null && !source.equals("")){
source = "科技讯-" + source;
}
} }
return source; return source;
} catch (Exception e) { } catch (Exception e) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment