Commit 574cb605 by zhiwei

修复获取正文bug

parent ded4bfdb
......@@ -2,6 +2,10 @@ package com.zhiwei.source_forward.crawler;
import java.util.HashMap;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.source_forward.util.MatchContent;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
......@@ -15,10 +19,11 @@ import us.codecraft.webmagic.processor.PageProcessor;
*/
public class ContentPageProcessor implements PageProcessor {
private static Logger logger = LoggerFactory.getLogger(ContentPageProcessor.class);
private Site site = Site.me().setCycleRetryTimes(3).setSleepTime(1500)
.setTimeOut(10000)
.setUserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0")
.addHeader("Accept-Encoding", "gzip, deflate, br")
.addHeader("Accept-Encoding", "deflate, br")
;
@Override
......@@ -32,14 +37,14 @@ public class ContentPageProcessor implements PageProcessor {
String content = null;
try {
if(page.getStatusCode()!=404){
MatchContent.matchContent(page.getUrl().get(), page.getHtml().toString());
content = MatchContent.matchContent(page.getUrl().get(), page.getHtml().toString());
}
} catch (Exception e) {
logger.info("网页链接失效",e.fillInStackTrace());
content = null;
}
data.put("url", page.getUrl().get());
data.put("content", content);
page.putField("content", data);
}
......
package com.zhiwei.source_forward.pipeline;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
......@@ -13,20 +14,11 @@ import us.codecraft.webmagic.pipeline.Pipeline;
* @date 2018年6月30日 上午9:54:27
*/
public class DataPipeline implements Pipeline {
private List<Map<String, Object>> contentDataList;
private List<Map<String, Object>> mediaSelfDataList;
private List<Map<String, Object>> sourceForwardDataList;
private List<Map<String, Object>> urlLivedataList;
private List<Map<String, Object>> contentDataList = new ArrayList<Map<String, Object>>();
private List<Map<String, Object>> mediaSelfDataList = new ArrayList<Map<String, Object>>();
private List<Map<String, Object>> sourceForwardDataList = new ArrayList<Map<String, Object>>();
private List<Map<String, Object>> urlLivedataList = new ArrayList<Map<String, Object>>();
public DataPipeline(List<Map<String, Object>> dataList,List<Map<String, Object>> contentDataList,List<Map<String, Object>> mediaSelfDataList,
List<Map<String, Object>> sourceForwardDataList,List<Map<String, Object>> urlLivedataList) {
super();
this.contentDataList = contentDataList;
this.mediaSelfDataList = mediaSelfDataList;
this.sourceForwardDataList = sourceForwardDataList;
this.urlLivedataList = urlLivedataList;
}
public DataPipeline() {
super();
......
......@@ -31,12 +31,12 @@ public class ContentMatch {
spider.thread(5).run();
List<Map<String,Object>> contentList = pipeline.getContentDataList();
for(Map<String,Object> sourceMap : contentList){
String url = sourceMap.get("url")+"";
//整合数据及验证转发原创
for(Map<String,Object> contentMap : contentList){
String url = contentMap.get("url")+"";
//搜集原文
if(dataMap.containsKey(url)){
Map<String,Object> data = dataMap.get(url);
String content = data.get("content")+"";
String content = contentMap.get("content")+"";
data.put("content", content);
dataMap.put(url, data);
}
......
......@@ -2,6 +2,9 @@ package com.zhiwei.source_forward.util;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import cn.edu.hfut.dmic.contentextractor.ContentExtractor;
import cn.edu.hfut.dmic.contentextractor.News;
......@@ -13,7 +16,7 @@ import cn.edu.hfut.dmic.contentextractor.News;
*/
public class MatchContent {
private static Logger logger = LoggerFactory.getLogger(MatchContent.class);
/**
* @Title: matchContent
* @author hero
......@@ -25,10 +28,12 @@ public class MatchContent {
*/
public static String matchContent(String url,String html) {
String content = null;
Document document = Jsoup.parse(html);
try {
Document document = Jsoup.parse(html);
content = mathchContent(html, document);
return content;
} catch (Exception e) {
logger.debug("获取全文失败",e.fillInStackTrace());
content = null;
}
return content;
......@@ -51,9 +56,8 @@ public class MatchContent {
News news = ContentExtractor.getNewsByHtml(html);
content = TreateData.filterSpecialCharacter(news.getContent());
} catch (Exception e) {
logger.info("正文抽取失败,获取全文文本:{}");
content = document.text();
System.out.println("正文抽取失败处理........");
e.printStackTrace();
}
return content;
}
......
package com.zhiwei.source_forward.sourceforward.test;
import java.util.HashMap;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.source_forward.run.SourceForward;
/**
 * @ClassName: MediaSelfSourceTest
 * @Description: Source verification test that drives
 *               {@code SourceForward.getMediaSelfSource} with a single URL.
 * @author hero
 * @date 2017-12-06 09:55:13
 */
public class MediaSelfSourceTest {

    /**
     * Builds a one-entry map keyed by a Toutiao article URL, with an empty
     * attribute map as its value, and passes it to
     * {@code SourceForward.getMediaSelfSource}.
     *
     * The method contains no assertions, so it only checks that the call
     * completes without throwing.
     */
    @Test
    public void sourceForwardTest(){
        String articleUrl = "https://www.toutiao.com/a6549872248428167687/";

        // The entry's value starts out empty; only the URL key is supplied.
        Map<String,Map<String,Object>> urlToData = new HashMap<String,Map<String,Object>>();
        urlToData.put(articleUrl, new HashMap<String,Object>());

        SourceForward.getMediaSelfSource(urlToData);
    }
}
//package com.zhiwei.source_forward.sourceforward.test;
//
//import java.util.HashMap;
//import java.util.Map;
//
//import org.junit.Test;
//
//import com.zhiwei.source_forward.run.SourceForward;
//
///**
// * @ClassName: SourceForwardTest
// * @Description: 来源验证
// * @author hero
// * @date 2017年12月6日 上午9:55:13
// */
//public class MediaSelfSourceTest {
//
// @Test
// public void sourceForwardTest(){
// Map<String,Map<String,Object>> dataMap = new HashMap<String,Map<String,Object>>();
// String url = "https://www.toutiao.com/a6549872248428167687/";
// Map<String,Object> data = new HashMap<String,Object>();
// dataMap.put(url, data);
//
// SourceForward.getMediaSelfSource(dataMap);
//
// }
//
//
//
//
//
//
//
//
//}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment