Commit e229722e by zhiwei

根据链接匹配自媒体号添加频道匹配

parent c34e21d1
package com.zhiwei.source_forward.crawler;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.nodes.Node;
import com.zhiwei.source_forward.util.TreateData;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
......@@ -26,9 +30,15 @@ public class MediaSelfSourcePageProcessor implements PageProcessor {
public void process(Page page) {
Map<String,String> data = new HashMap<String,String>();
String source = null;
String channel = null;
try {
if(page.getStatusCode()!=404){
source = TreateData.matchMediaSelfSource(page.getUrl().get(),page.getHtml().toString());
channel = TreateData.verifyChannel(page.getUrl().get());
if(channel==null){
List<Node> nodeList = page.getHtml().getDocument().head().childNodes();
channel = TreateData.matchChannel(nodeList);
}
}
} catch (Exception e) {
source = null;
......@@ -36,6 +46,7 @@ public class MediaSelfSourcePageProcessor implements PageProcessor {
System.out.println(page.getUrl().get()+"================="+source);
data.put("url", page.getUrl().get());
data.put("mediaself", source);
data.put("channel", channel);
page.putField("data", data);
}
......
......@@ -34,7 +34,7 @@ public class SourceForwardPageProcessor implements PageProcessor {
String channel = "新闻";
try {
if(page.getStatusCode()!=404){
channel = verifyChannel(page.getUrl().get());
channel = TreateData.verifyChannel(page.getUrl().get());
if(channel==null){
List<Node> nodeList = page.getHtml().getDocument().head().childNodes();
channel = TreateData.matchChannel(nodeList);
......@@ -53,57 +53,4 @@ public class SourceForwardPageProcessor implements PageProcessor {
page.putField("data", data);
}
/**
 * Ordered URL-marker rules used by {@code verifyChannel}.
 * Each row is {@code {channel, marker, marker, ...}}. Rows are evaluated top
 * to bottom and the first row owning a marker contained in the URL wins, so
 * the row order is significant (e.g. "pcedu." must be seen before "edu.").
 */
private static final String[][] CHANNEL_URL_RULES = {
    {"新闻", "news.", "cj.sina.com.cn", "wemedia.ifeng.com"},
    {"财经", "finance.", "business.", "money.", "stock.", "10jqka.com.cn"},
    {"科技", "tech.", "it.", "pcedu.", "mobile.", "vr."},
    {"体育", "sports."},
    {"娱乐", "ent.", "yule."},
    {"汽车", "auto."},
    {"时尚", "fashion."},
    {"教育", "learning.", "edu."},
    {"母婴", "baobao."},
    {"房产", "house.", "leju.", "focus."},
    {"游戏", "games."},
    {"国际", "intl."},
    {"科学", "science."},
    {"城市", "city."},
    {"市场", "sc."},
};

/**
 * Determines the article channel from the article URL.
 *
 * <p>The URL is tested against a fixed, ordered list of markers with
 * {@link String#contains(CharSequence)}; the channel of the first matching
 * marker is returned. Matching is a plain, case-sensitive substring test over
 * the whole URL (host and path), so a marker such as "it." also matches inside
 * longer words like "reddit."; this is kept as-is for compatibility.
 *
 * @author hero
 * @param url the article URL; may be {@code null}
 * @return the channel name, or {@code null} when {@code url} is {@code null}
 *         or contains none of the known markers
 */
private static String verifyChannel(String url) {
    // A missing URL simply means "no channel could be derived from the URL";
    // previously this threw a NullPointerException.
    if (url == null) {
        return null;
    }
    for (String[] rule : CHANNEL_URL_RULES) {
        // Index 0 holds the channel name, the remaining entries are markers.
        for (int i = 1; i < rule.length; i++) {
            if (url.contains(rule[i])) {
                return rule[0];
            }
        }
    }
    return null;
}
}
......@@ -459,6 +459,58 @@ public class TreateData {
}
/**
 * Ordered URL-marker rules used by {@code verifyChannel}.
 * Each row is {@code {channel, marker, marker, ...}}. Rows are evaluated top
 * to bottom and the first row owning a marker contained in the URL wins, so
 * the row order is significant (e.g. "pcedu." must be seen before "edu.").
 */
private static final String[][] CHANNEL_URL_RULES = {
    {"新闻", "news.", "cj.sina.com.cn", "wemedia.ifeng.com"},
    {"财经", "finance.", "business.", "money.", "stock.", "10jqka.com.cn"},
    {"科技", "tech.", "it.", "pcedu.", "mobile.", "vr."},
    {"体育", "sports."},
    {"娱乐", "ent.", "yule."},
    {"汽车", "auto."},
    {"时尚", "fashion."},
    {"教育", "learning.", "edu."},
    {"母婴", "baobao."},
    {"房产", "house.", "leju.", "focus."},
    {"游戏", "games."},
    {"国际", "intl."},
    {"科学", "science."},
    {"城市", "city."},
    {"市场", "sc."},
};

/**
 * Determines the article channel from the article URL.
 *
 * <p>The URL is tested against a fixed, ordered list of markers with
 * {@link String#contains(CharSequence)}; the channel of the first matching
 * marker is returned. Matching is a plain, case-sensitive substring test over
 * the whole URL (host and path), so a marker such as "it." also matches inside
 * longer words like "reddit."; this is kept as-is for compatibility.
 *
 * @author hero
 * @param url the article URL; may be {@code null}
 * @return the channel name, or {@code null} when {@code url} is {@code null}
 *         or contains none of the known markers
 */
public static String verifyChannel(String url) {
    // A missing URL simply means "no channel could be derived from the URL";
    // previously this threw a NullPointerException.
    if (url == null) {
        return null;
    }
    for (String[] rule : CHANNEL_URL_RULES) {
        // Index 0 holds the channel name, the remaining entries are markers.
        for (int i = 1; i < rule.length; i++) {
            if (url.contains(rule[i])) {
                return rule[0];
            }
        }
    }
    return null;
}
public static String filterSpecialCharacter(String str) {
try {
String regEx = "【[`~!@#$%^&*()+=|{}';'//[//].<>/?~!@#%……&*——+|{}“”;‘’,。、·]】";
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment