Commit 947a2179 by cwy

增加zaker客户端获取

parent 0c98f43b
...@@ -93,28 +93,50 @@ public class MediaSelfSourceCrawler { ...@@ -93,28 +93,50 @@ public class MediaSelfSourceCrawler {
map.put("referer", url); map.put("referer", url);
} }
map.put("Connection", "close"); map.put("Connection", "close");
Request request = RequestUtils.wrapGet(url, map); url = dealUrl(url);
counter.add(); if(Objects.nonNull(url)) {
Request request = RequestUtils.wrapGet(url, map);
httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY).whenComplete((rs,ex) -> { counter.add();
try {
if (Objects.isNull(ex)) { httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY).whenComplete((rs,ex) -> {
try { try {
parseHtml(rs.body().string(), attr, callback); if (Objects.isNull(ex)) {
} catch (Exception e) { try {
logger.error("解析出错", e); parseHtml(rs.body().string(), attr, callback);
} catch (Exception e) {
logger.error("解析出错", e);
}
} else {
logger.info("{} 搜索结果访问失败: {}", request.url().url(),ex);
} }
} else { } finally {
logger.info("{} 搜索结果访问失败: {}", request.url().url(),ex); counter.done();
} }
} finally { });
counter.done(); }
}
});
return counter; return counter;
} }
/**
 * Normalizes a URL before it is requested.
 *
 * <p>Only URLs that start with {@code http} are accepted. A URL containing
 * {@code wap.peopleapp.com/article} is rewritten to the {@code app.peopleapp.com}
 * {@code getInfoUp} endpoint, using the path segment that follows the marker as the
 * {@code id} query value. Any other accepted URL is returned unchanged.
 *
 * @param url the URL to process; may be {@code null}
 * @return the URL to request, or {@code null} when {@code url} is {@code null}, does not
 *         start with {@code http}, or is a peopleapp article link with no id segment
 */
private String dealUrl(String url) {
    final String articleMarker = "wap.peopleapp.com/article";
    final String apiPrefix = "https://app.peopleapp.com/WapApi/610/ArtInfoApi/getInfoUp?id=";

    // Explicit guards instead of relying on a swallowed NullPointerException.
    if (url == null || !url.startsWith("http")) {
        return null;
    }

    int markerAt = url.indexOf(articleMarker);
    if (markerAt < 0) {
        return url;
    }

    // Anchor on the full marker rather than on the first "article" anywhere in the URL,
    // so a host or id that itself contains "article" cannot truncate or shift the id.
    String afterMarker = url.substring(markerAt + articleMarker.length());

    // "/<id>/..." splits into ["", "<id>", ...]; fewer than two parts means no id segment.
    String[] segments = afterMarker.split("/");
    if (segments.length < 2) {
        return null;
    }
    return apiPrefix + segments[1];
}
/**
* *
* @Description 解析文章获取相关数据 * @Description 解析文章获取相关数据
* @param response * @param response
......
...@@ -32,7 +32,7 @@ public class MediaSelfSource { ...@@ -32,7 +32,7 @@ public class MediaSelfSource {
public static void main(String[] args) { public static void main(String[] args) {
ProxyFactory.init("zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181", "local", GroupType.PROVIDER, 10000002L); ProxyFactory.init("zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181", "local", GroupType.PROVIDER, 10000002L);
List<String> urlList = new ArrayList<>(); List<String> urlList = new ArrayList<>();
urlList.add("https://www.360kuai.com/pc/922e4596800e5ef0a?cota=3&kuai_so=1&sign=360_e39369d1&refer_scene=so_3"); urlList.add("http://app.myzaker.com/news/article.php?pk=5dbef675b15ec03075727503");
List<MediaSelfSourceBean> u = MediaSelfSource.getMediaSelfSource(urlList); List<MediaSelfSourceBean> u = MediaSelfSource.getMediaSelfSource(urlList);
for(MediaSelfSourceBean b : u) { for(MediaSelfSourceBean b : u) {
System.out.println(b.toString()); System.out.println(b.toString());
......
...@@ -439,7 +439,13 @@ public class MatchSource { ...@@ -439,7 +439,13 @@ public class MatchSource {
if(source!=null && source.length()>1){ if(source!=null && source.length()>1){
source = "百度百家-" + source; source = "百度百家-" + source;
} }
}else if(url.contains("yidianzixun.com")){ }else if(url.contains("app.myzaker.com")){
// zaker客户端
source = document.select("#tpl_author").first().text().trim();
if(source!=null && source.length()>1){
source = "zaker客户端-" + source;
}
}else if(url.contains("yidianzixun.com")){
//一点资讯 //一点资讯
if(html.contains("related_wemedia")){ if(html.contains("related_wemedia")){
source = html.split("media_name\":\"")[1].split("\",\"")[0].trim(); source = html.split("media_name\":\"")[1].split("\",\"")[0].trim();
...@@ -462,6 +468,13 @@ public class MatchSource { ...@@ -462,6 +468,13 @@ public class MatchSource {
if(Objects.nonNull(source) && !source.isEmpty()){ if(Objects.nonNull(source) && !source.isEmpty()){
source = "it时代网-" + source; source = "it时代网-" + source;
} }
}else if(url.contains("wap.peopleapp.com")){
// 人民日报客户端
JSONObject json = JSONObject.parseObject(html);
source = json.getJSONObject("data").getString("authors");
if(Objects.nonNull(source) && !source.isEmpty()){
source = "人民日报客户端-" + source;
}
}else if(url.contains("guancha.cn")){ }else if(url.contains("guancha.cn")){
// 风闻社区 // 风闻社区
source = document.select("div.main-tow > div.box-left > div.article-content > div:nth-child(3) > div.user-main > h4 > a").text(); source = document.select("div.main-tow > div.box-left > div.article-content > div:nth-child(3) > div.user-main > h4 > a").text();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment