Commit bd0353ac by chenweiyang

冲突

parent eff378d9
...@@ -52,7 +52,6 @@ public class UrlLiveCrawler { ...@@ -52,7 +52,6 @@ public class UrlLiveCrawler {
counter.add(); counter.add();
if (nonNull(url)) { if (nonNull(url)) {
try { try {
// ZhiWeiTools.sleep(3000);
search(counter, url, Attribution.of(url, 1), callback); search(counter, url, Attribution.of(url, 1), callback);
} catch (Exception e) { } catch (Exception e) {
logger.error("搜索创建出错:", e); logger.error("搜索创建出错:", e);
......
package com.zhiwei.source_forward.util; package com.zhiwei.source_forward.util;
import java.util.List; import com.alibaba.fastjson.JSONObject;
import java.util.Objects; import com.zhiwei.source_forward.content.ContentExtractor;
import java.util.regex.Matcher; import com.zhiwei.source_forward.content.News;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import com.alibaba.fastjson.JSONObject; import java.util.List;
import com.zhiwei.source_forward.content.ContentExtractor; import java.util.Objects;
import com.zhiwei.source_forward.content.News; import java.util.regex.Matcher;
import java.util.regex.Pattern;
/** /**
* @ClassName: MatchSource * @ClassName: MatchSource
...@@ -53,7 +52,8 @@ public class MatchSource { ...@@ -53,7 +52,8 @@ public class MatchSource {
* 设定文件 * 设定文件
* @return String 返回类型 * @return String 返回类型
*/ */
public static String matchSource(String url,String html, List<String> sourceList) { public static String
matchSource(String url,String html, List<String> sourceList) {
String source = null; String source = null;
Document document = Jsoup.parse(html); Document document = Jsoup.parse(html);
String htmlBody = TreateData.filterSpecialCharacter(document.select("body").text().toUpperCase()); String htmlBody = TreateData.filterSpecialCharacter(document.select("body").text().toUpperCase());
...@@ -324,8 +324,19 @@ public class MatchSource { ...@@ -324,8 +324,19 @@ public class MatchSource {
if(StringUtils.isNotBlank(source)){ if(StringUtils.isNotBlank(source)){
source = source.replaceAll(".*来源:|)", ""); source = source.replaceAll(".*来源:|)", "");
} }
}else if(url.contains("gu.qq.com")){
source = document.select("span#news_source").text();
if(StringUtils.isNotBlank(source)){
return source;
}
}else if(url.contains("kandian.youth.cn")){
source = document.select("div.fl > a").text();
if(StringUtils.isNotBlank(source)){
return source;
}
} }
if(Objects.nonNull(source) && source.length() != 0) { if(Objects.nonNull(source) && source.length() != 0) {
return source; return source;
} }
...@@ -438,29 +449,18 @@ public class MatchSource { ...@@ -438,29 +449,18 @@ public class MatchSource {
} }
} }
} }
if(source.length() < 1 && html.contains("window.__INITIAL_DATA__ =")) {
Matcher ma = Pattern.compile("window.__INITIAL_DATA__ =[\\s\\S]+?}}").matcher(html);
if(ma.find()) {
String result = ma.group().replaceAll("window.__INITIAL_DATA__ =|\\</script\\>|", "").trim();
if(result.contains("window.autohomePVDDWhiteList")) {
result = result.split("window.autohomePVDDWhiteList")[0];
}
JSONObject json = JSONObject.parseObject(result.trim());
source = json.getJSONObject("detail").getString("sec_src");
if(Objects.isNull(source) || source.length() < 1) {
source = json.getJSONObject("detail").getString("src");
}
}
}
if(Objects.nonNull(source) && source.length()>1){ if(Objects.nonNull(source) && source.length()>1){
source = "快资讯-" + source; source = "快资讯-" + source;
} }
}else if(url.contains("cj.sina.com.cn") || url.contains("finance.sina.cn") || }else if(url.contains("k.sina.com.cn") || url.contains("cj.sina.com.cn") || url.contains("finance.sina.cn") ||
url.contains("tech.sina.cn") || url.contains("news.sina.cn") || url.contains("k.sina.cn")){ url.contains("tech.sina.cn") || url.contains("news.sina.cn") || url.contains("k.sina.cn")){
source = document.select("h2.weibo_user").text(); source = document.select("h2.weibo_user").text();
if(Objects.isNull(source) || source.length() < 1) { if(Objects.isNull(source) || source.length() < 1) {
source = document.select("#top_bar > div > div.date-source > a").text(); source = document.select("#top_bar > div > div.date-source >span > a").text();
} }
if(Objects.isNull(source) || source.length() < 1) {
source = document.select("#top_bar > div > div.date-source > a").text();
}
if((Objects.isNull(source) || source.length() < 1) && html.contains("<meta name=\"mediaid\"")){ if((Objects.isNull(source) || source.length() < 1) && html.contains("<meta name=\"mediaid\"")){
//新浪科技头条号 //新浪科技头条号
source = html.split("<meta name=\"mediaid\" content=\"")[1].split("\"")[0].trim(); source = html.split("<meta name=\"mediaid\" content=\"")[1].split("\"")[0].trim();
...@@ -502,9 +502,12 @@ public class MatchSource { ...@@ -502,9 +502,12 @@ public class MatchSource {
} }
}else if(url.contains("baijiahao.baidu.com") || url.contains("mbd.baidu.com")){ }else if(url.contains("baijiahao.baidu.com") || url.contains("mbd.baidu.com")){
//百度百家 //百度百家
source = document.select("span.userNameSpan").text(); if(StringUtils.isNotBlank(document.select("span.userNameSpan").text())){
if(StringUtils.isBlank(source)){ source = document.select("span.userNameSpan").text();
}else if(StringUtils.isNotBlank(document.select("p.author-name:nth-child(1)").text())){
source = document.select("p.author-name:nth-child(1)").text(); source = document.select("p.author-name:nth-child(1)").text();
}else if(StringUtils.isNotBlank(document.select("a.authorName").text())){
source = document.select("a.authorName").text();
} }
if(StringUtils.isNotBlank(source)){ if(StringUtils.isNotBlank(source)){
source = "百度百家-" + source; source = "百度百家-" + source;
...@@ -783,8 +786,7 @@ public class MatchSource { ...@@ -783,8 +786,7 @@ public class MatchSource {
if(StringUtils.isNotBlank(source)){ if(StringUtils.isNotBlank(source)){
source = "推酷-" + source; source = "推酷-" + source;
} }
} }else if(url.contains("36kr.com")){
if(url.contains("36kr.com")){
source = document.select("div.info-header-text > a.author-name").text(); source = document.select("div.info-header-text > a.author-name").text();
if(StringUtils.isNotBlank(source)){ if(StringUtils.isNotBlank(source)){
return "36氪-" + source; return "36氪-" + source;
...@@ -798,6 +800,7 @@ public class MatchSource { ...@@ -798,6 +800,7 @@ public class MatchSource {
return "36氪-" + source; return "36氪-" + source;
} }
} }
return source; return source;
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.printStackTrace();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment