Commit 50524098 by zk

增加k.sina.com.cn网址的source字段的解析规则

parent 2da35a84
...@@ -52,7 +52,8 @@ public class MatchSource { ...@@ -52,7 +52,8 @@ public class MatchSource {
* 设定文件 * 设定文件
* @return String 返回类型 * @return String 返回类型
*/ */
public static String matchSource(String url,String html, List<String> sourceList) { public static String
matchSource(String url,String html, List<String> sourceList) {
String source = null; String source = null;
Document document = Jsoup.parse(html); Document document = Jsoup.parse(html);
String htmlBody = TreateData.filterSpecialCharacter(document.select("body").text().toUpperCase()); String htmlBody = TreateData.filterSpecialCharacter(document.select("body").text().toUpperCase());
...@@ -452,11 +453,14 @@ public class MatchSource { ...@@ -452,11 +453,14 @@ public class MatchSource {
source = "快资讯-" + source; source = "快资讯-" + source;
} }
}else if(url.contains("cj.sina.com.cn") || url.contains("finance.sina.cn") || }else if(url.contains("cj.sina.com.cn") || url.contains("finance.sina.cn") ||
url.contains("tech.sina.cn") || url.contains("news.sina.cn") || url.contains("k.sina.cn")){ url.contains("tech.sina.cn") || url.contains("news.sina.cn") || url.contains("k.sina.cn") || url.contains("k.sina.com.cn")){
source = document.select("h2.weibo_user").text(); source = document.select("h2.weibo_user").text();
if(Objects.isNull(source) || source.length() < 1) { if(Objects.isNull(source) || source.length() < 1) {
source = document.select("#top_bar > div > div.date-source > a").text(); source = document.select("#top_bar > div > div.date-source > a").text();
} }
if(Objects.isNull(source) || source.length() < 1) {
source = document.select("#top_bar > div > div.date-source >span > a").text();
}
if((Objects.isNull(source) || source.length() < 1) && html.contains("<meta name=\"mediaid\"")){ if((Objects.isNull(source) || source.length() < 1) && html.contains("<meta name=\"mediaid\"")){
//新浪科技头条号 //新浪科技头条号
source = html.split("<meta name=\"mediaid\" content=\"")[1].split("\"")[0].trim(); source = html.split("<meta name=\"mediaid\" content=\"")[1].split("\"")[0].trim();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment