Commit 574cb605 by zhiwei

修复获取正文bug

parent ded4bfdb
...@@ -2,6 +2,10 @@ package com.zhiwei.source_forward.crawler; ...@@ -2,6 +2,10 @@ package com.zhiwei.source_forward.crawler;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.source_forward.util.MatchContent; import com.zhiwei.source_forward.util.MatchContent;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
...@@ -15,10 +19,11 @@ import us.codecraft.webmagic.processor.PageProcessor; ...@@ -15,10 +19,11 @@ import us.codecraft.webmagic.processor.PageProcessor;
*/ */
public class ContentPageProcessor implements PageProcessor { public class ContentPageProcessor implements PageProcessor {
private static Logger logger = LoggerFactory.getLogger(ContentPageProcessor.class);
private Site site = Site.me().setCycleRetryTimes(3).setSleepTime(1500) private Site site = Site.me().setCycleRetryTimes(3).setSleepTime(1500)
.setTimeOut(10000) .setTimeOut(10000)
.setUserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0") .setUserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0")
.addHeader("Accept-Encoding", "gzip, deflate, br") .addHeader("Accept-Encoding", "deflate, br")
; ;
@Override @Override
...@@ -32,14 +37,14 @@ public class ContentPageProcessor implements PageProcessor { ...@@ -32,14 +37,14 @@ public class ContentPageProcessor implements PageProcessor {
String content = null; String content = null;
try { try {
if(page.getStatusCode()!=404){ if(page.getStatusCode()!=404){
MatchContent.matchContent(page.getUrl().get(), page.getHtml().toString()); content = MatchContent.matchContent(page.getUrl().get(), page.getHtml().toString());
} }
} catch (Exception e) { } catch (Exception e) {
logger.info("网页链接失效",e.fillInStackTrace());
content = null; content = null;
} }
data.put("url", page.getUrl().get()); data.put("url", page.getUrl().get());
data.put("content", content); data.put("content", content);
page.putField("content", data); page.putField("content", data);
} }
......
package com.zhiwei.source_forward.pipeline; package com.zhiwei.source_forward.pipeline;
import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
...@@ -13,20 +14,11 @@ import us.codecraft.webmagic.pipeline.Pipeline; ...@@ -13,20 +14,11 @@ import us.codecraft.webmagic.pipeline.Pipeline;
* @date 2018年6月30日 上午9:54:27 * @date 2018年6月30日 上午9:54:27
*/ */
public class DataPipeline implements Pipeline { public class DataPipeline implements Pipeline {
private List<Map<String, Object>> contentDataList; private List<Map<String, Object>> contentDataList = new ArrayList<Map<String, Object>>();
private List<Map<String, Object>> mediaSelfDataList; private List<Map<String, Object>> mediaSelfDataList = new ArrayList<Map<String, Object>>();
private List<Map<String, Object>> sourceForwardDataList; private List<Map<String, Object>> sourceForwardDataList = new ArrayList<Map<String, Object>>();
private List<Map<String, Object>> urlLivedataList; private List<Map<String, Object>> urlLivedataList = new ArrayList<Map<String, Object>>();
public DataPipeline(List<Map<String, Object>> dataList,List<Map<String, Object>> contentDataList,List<Map<String, Object>> mediaSelfDataList,
List<Map<String, Object>> sourceForwardDataList,List<Map<String, Object>> urlLivedataList) {
super();
this.contentDataList = contentDataList;
this.mediaSelfDataList = mediaSelfDataList;
this.sourceForwardDataList = sourceForwardDataList;
this.urlLivedataList = urlLivedataList;
}
public DataPipeline() { public DataPipeline() {
super(); super();
......
...@@ -31,12 +31,12 @@ public class ContentMatch { ...@@ -31,12 +31,12 @@ public class ContentMatch {
spider.thread(5).run(); spider.thread(5).run();
List<Map<String,Object>> contentList = pipeline.getContentDataList(); List<Map<String,Object>> contentList = pipeline.getContentDataList();
for(Map<String,Object> sourceMap : contentList){ for(Map<String,Object> contentMap : contentList){
String url = sourceMap.get("url")+""; String url = contentMap.get("url")+"";
//整合数据及验证转发原创 //搜集原文
if(dataMap.containsKey(url)){ if(dataMap.containsKey(url)){
Map<String,Object> data = dataMap.get(url); Map<String,Object> data = dataMap.get(url);
String content = data.get("content")+""; String content = contentMap.get("content")+"";
data.put("content", content); data.put("content", content);
dataMap.put(url, data); dataMap.put(url, data);
} }
......
...@@ -2,6 +2,9 @@ package com.zhiwei.source_forward.util; ...@@ -2,6 +2,9 @@ package com.zhiwei.source_forward.util;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import cn.edu.hfut.dmic.contentextractor.ContentExtractor; import cn.edu.hfut.dmic.contentextractor.ContentExtractor;
import cn.edu.hfut.dmic.contentextractor.News; import cn.edu.hfut.dmic.contentextractor.News;
...@@ -13,7 +16,7 @@ import cn.edu.hfut.dmic.contentextractor.News; ...@@ -13,7 +16,7 @@ import cn.edu.hfut.dmic.contentextractor.News;
*/ */
public class MatchContent { public class MatchContent {
private static Logger logger = LoggerFactory.getLogger(MatchContent.class);
/** /**
* @Title: matchContent * @Title: matchContent
* @author hero * @author hero
...@@ -25,10 +28,12 @@ public class MatchContent { ...@@ -25,10 +28,12 @@ public class MatchContent {
*/ */
public static String matchContent(String url,String html) { public static String matchContent(String url,String html) {
String content = null; String content = null;
Document document = Jsoup.parse(html);
try { try {
Document document = Jsoup.parse(html);
content = mathchContent(html, document); content = mathchContent(html, document);
return content;
} catch (Exception e) { } catch (Exception e) {
logger.debug("获取全文失败",e.fillInStackTrace());
content = null; content = null;
} }
return content; return content;
...@@ -51,9 +56,8 @@ public class MatchContent { ...@@ -51,9 +56,8 @@ public class MatchContent {
News news = ContentExtractor.getNewsByHtml(html); News news = ContentExtractor.getNewsByHtml(html);
content = TreateData.filterSpecialCharacter(news.getContent()); content = TreateData.filterSpecialCharacter(news.getContent());
} catch (Exception e) { } catch (Exception e) {
logger.info("正文抽取失败,获取全文文本:{}");
content = document.text(); content = document.text();
System.out.println("正文抽取失败处理........");
e.printStackTrace();
} }
return content; return content;
} }
......
package com.zhiwei.source_forward.sourceforward.test; //package com.zhiwei.source_forward.sourceforward.test;
//
import java.util.HashMap; //import java.util.HashMap;
import java.util.Map; //import java.util.Map;
//
import org.junit.Test; //import org.junit.Test;
//
import com.zhiwei.source_forward.run.SourceForward; //import com.zhiwei.source_forward.run.SourceForward;
//
/** ///**
* @ClassName: SourceForwardTest // * @ClassName: SourceForwardTest
* @Description: 来源验证 // * @Description: 来源验证
* @author hero // * @author hero
* @date 2017年12月6日 上午9:55:13 // * @date 2017年12月6日 上午9:55:13
*/ // */
public class MediaSelfSourceTest { //public class MediaSelfSourceTest {
//
@Test // @Test
public void sourceForwardTest(){ // public void sourceForwardTest(){
Map<String,Map<String,Object>> dataMap = new HashMap<String,Map<String,Object>>(); // Map<String,Map<String,Object>> dataMap = new HashMap<String,Map<String,Object>>();
String url = "https://www.toutiao.com/a6549872248428167687/"; // String url = "https://www.toutiao.com/a6549872248428167687/";
Map<String,Object> data = new HashMap<String,Object>(); // Map<String,Object> data = new HashMap<String,Object>();
dataMap.put(url, data); // dataMap.put(url, data);
//
SourceForward.getMediaSelfSource(dataMap); // SourceForward.getMediaSelfSource(dataMap);
//
} // }
//
//
//
//
//
//
//
//
} //}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment