Commit 7c541080 by yangchen

全文获取修改为晋豪获取

parent a715f4b8
...@@ -8,8 +8,7 @@ import org.jsoup.nodes.Document; ...@@ -8,8 +8,7 @@ import org.jsoup.nodes.Document;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.zhiwei.source_forward.content.ContentExtractor; import com.kohlschutter.boilerpipe.extractors.ArticleExtractor;
import com.zhiwei.source_forward.content.News;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
/** /**
...@@ -103,12 +102,20 @@ public class MatchContent { ...@@ -103,12 +102,20 @@ public class MatchContent {
/** 正文抽取,目的是避免正文中存在相应匹配规则 **/ /** 正文抽取,目的是避免正文中存在相应匹配规则 **/
String content = null; String content = null;
try { try {
News news = ContentExtractor.getNewsByHtml(html); content = ArticleExtractor.getInstance().getText(html);
content = TreateData.filterSpecialCharacter(news.getContent());
} catch (Exception e) { } catch (Exception e) {
logger.error("正文抽取失败,获取全文文本:",e); logger.error("正文抽取失败,获取全文文本:",e);
content = document.text(); content = document.text();
} }
// String content = null;
// try {
// News news = ContentExtractor.getNewsByHtml(html);
// content = TreateData.filterSpecialCharacter(news.getContent());
// } catch (Exception e) {
// logger.error("正文抽取失败,获取全文文本:",e);
// content = document.text();
// }
return content; return content;
} }
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment