Commit 7c541080 by yangchen

全文获取修改为晋豪获取

parent a715f4b8
......@@ -8,8 +8,7 @@ import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.source_forward.content.ContentExtractor;
import com.zhiwei.source_forward.content.News;
import com.kohlschutter.boilerpipe.extractors.ArticleExtractor;
import com.zhiwei.tools.tools.ZhiWeiTools;
/**
......@@ -101,14 +100,22 @@ public class MatchContent {
*/
private static String mathchContent(String html,Document document){
    // Extracts the main article text from the given HTML. The original note states
    // that body extraction is done to avoid matching rules that occur in the body text.
    //
    // html     - markup handed to the boilerpipe ArticleExtractor
    // document - jsoup Document whose full text() is used as the fallback
    //            (presumably the same page, already parsed - verify against caller)
    // returns  - the extracted article text, or document.text() if extraction throws
    //
    // NOTE(review): the earlier ContentExtractor-based path also passed the result
    // through TreateData.filterSpecialCharacter; confirm that dropping that filter
    // is intended now that ArticleExtractor is used.
    String content;
    try {
        content = ArticleExtractor.getInstance().getText(html);
    } catch (Exception e) {
        // Best-effort: record the failure and fall back to the whole document text.
        logger.error("正文抽取失败,获取全文文本:",e);
        content = document.text();
    }
    return content;
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment