Commit 4e02a60f by cwy

添加部分自媒体号采集

parent 947a2179
......@@ -129,6 +129,15 @@ public class MediaSelfSourceCrawler {
if(url.contains("wap.peopleapp.com/article")) {
return "https://app.peopleapp.com/WapApi/610/ArtInfoApi/getInfoUp?id=" + url.split("article")[1].split("/")[1];
}
if(url.contains("a.mp.uc.cn/") && url.contains("wm_cid=")) {
return "https://ff.dayu.com/contents/" + url.split("wm_cid=")[1].split("&")[0] + "?biz_id=1002&_fetch_author=1";
}
if(url.contains("tznew.58.com")) {
return "https://tznew.58.com/tznew/c/info-detail?infoid=" + url.split("infoid=")[1].split("&")[0];
}
if(url.contains("wap.peopleapp.com/article")) {
return "https://app.peopleapp.com/WapApi/610/ArtInfoApi/getInfoUp?id=" + url.split("article")[1].split("/")[1];
}
return url;
}
} catch (Exception e) {
......
......@@ -32,7 +32,7 @@ public class MediaSelfSource {
public static void main(String[] args) {
ProxyFactory.init("zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181", "local", GroupType.PROVIDER, 10000002L);
List<String> urlList = new ArrayList<>();
urlList.add("http://app.myzaker.com/news/article.php?pk=5dbef675b15ec03075727503");
urlList.add("https://wap.peopleapp.com/article/rmh12074926/0");
List<MediaSelfSourceBean> u = MediaSelfSource.getMediaSelfSource(urlList);
for(MediaSelfSourceBean b : u) {
System.out.println(b.toString());
......
......@@ -371,7 +371,19 @@ public class MatchSource {
source = "搜狐-" + source;
}
}
}else if(url.contains("a.mini.eastday.com")){
}else if(url.contains("tznew.58.com")){
//58
source = JSONObject.parseObject(html).getJSONObject("result").getString("author");
if(source!=null && source.length()>1){
source = "58-" + source;
}
}else if(url.contains("c.m.163.com")){
//NetEase News (c.m.163.com)
source = document.select("section.g-article.js-article > div.js-article-inner > div > b").text();
if(source!=null && source.length()>1){
source = "网易新闻-" + source;
}
}else if(url.contains("a.mini.eastday.com")){
//处理东方头条网-自媒体号匹配
source = document.select("[class=\"share_cnt_p clearfix\"]").select("div.fl").select("i").get(1).text().trim();
if(source!=null && source.length()>1){
......@@ -677,9 +689,21 @@ public class MatchSource {
if(source!=null && !source.equals("")) {
source = "传送门-" + source;
}
}else if (url.contains("a.mp.uc.cn")) {
JSONObject json = JSONObject.parseObject(html);
source = json.getJSONObject("data").getJSONObject("_author").getString("author_name");
if(source!=null && !source.equals("")) {
source = "uc-" + source;
}
}else if (url.contains("kd.youth.cn")) {
source = document.select("body > div > div > div.rich_media_meta_list > a").text();
if(source!=null && !source.equals("")) {
source = "中青在线-" + source;
}
}
return source;
} catch (Exception e) {
e.printStackTrace();
return null;
}
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment