Commit 82632f70 by zhiwei

添加根据传入链接返回自媒体名称的方法

parent 94228139
......@@ -50,6 +50,7 @@ public class MyDownLoader extends AbstractDownloader{
if (site == null) {
return httpClientGenerator.getClient(null, proxy);
}
String domain = site.getDomain();
CloseableHttpClient httpClient = httpClients.get(domain);
if (httpClient == null) {
......@@ -201,6 +202,7 @@ public class MyDownLoader extends AbstractDownloader{
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
String content = getContent(charset, httpResponse);
Page page = new Page();
page.setRawText(content);
page.setUrl(new PlainText(request.getUrl()));
......@@ -212,6 +214,7 @@ public class MyDownLoader extends AbstractDownloader{
protected String getContent(String charset, HttpResponse httpResponse) throws IOException {
if (charset == null) {
byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
String htmlCharset = getHtmlCharset(httpResponse, contentBytes);
if (htmlCharset != null) {
return new String(contentBytes, htmlCharset);
......
package com.zhiwei.source_forward.run;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
......@@ -107,7 +108,62 @@ public class SourceForward {
return dataMap;
}
/**
 * Resolves the self-media account name for each of the given links.
 *
 * <p>All URLs are queued on a spider built from a {@code MediaSelfSourcePageProcessor},
 * downloaded through {@code MyDownLoader} with 5 threads, and the records gathered by the
 * {@code MediaSelfSourceDataPipeline} are matched back to the requested URLs through their
 * {@code "url"} and {@code "mediaself"} entries.
 *
 * @author hero
 * @param urlList links to look up; a {@code null} or empty list yields an empty map
 *                without starting the spider
 * @return a map containing every requested URL as key; the value is the matched
 *         self-media account name, or {@code null} when none was collected for that URL
 */
public static Map<String,String> getMediaSelfSource(List<String> urlList){
    Map<String,String> dataMap = new HashMap<String,String>();
    if(urlList == null || urlList.isEmpty()){
        // Nothing to crawl, so do not spin up the spider at all.
        return dataMap;
    }
    MediaSelfSourceDataPipeline pipeline = new MediaSelfSourceDataPipeline();
    Spider spider = Spider.create(new MediaSelfSourcePageProcessor());
    for(String url : urlList){
        spider.addUrl(url);
        // Pre-register the URL so links without a match still appear in the result.
        dataMap.put(url, null);
    }
    spider.setDownloader(new MyDownLoader());
    spider.addPipeline(pipeline);
    // run() is invoked on the calling thread, so the pipeline is read only after it returns.
    spider.thread(5).run();
    List<Map<String,Object>> sourceForwardList = pipeline.getDataList();
    for(Map<String,Object> sourceMap : sourceForwardList){
        String url = sourceMap.get("url")+"";
        Object mediaSelf = sourceMap.get("mediaself");
        // Only fill in URLs the caller asked for, and skip records without an account name
        // instead of failing on a null value.
        if(mediaSelf != null && dataMap.containsKey(url)){
            dataMap.put(url, mediaSelf.toString());
        }
    }
    return dataMap;
}
/**
 * Resolves the self-media account name for a single link.
 *
 * <p>The URL is queued on a spider built from a {@code MediaSelfSourcePageProcessor},
 * downloaded through {@code MyDownLoader} with a single thread, and the result is read
 * from the first record gathered by the {@code MediaSelfSourceDataPipeline}.
 *
 * @author hero
 * @param url link to look up; {@code null} returns {@code null} without starting the spider
 * @return the {@code "mediaself"} value of the first collected record as a string, or
 *         {@code null} when nothing was collected or that record has no such value
 */
public static String getMediaSelfSource(String url){
    if(url == null){
        return null;
    }
    MediaSelfSourceDataPipeline pipeline = new MediaSelfSourceDataPipeline();
    Spider spider = Spider.create(new MediaSelfSourcePageProcessor());
    spider.addUrl(url);
    spider.setDownloader(new MyDownLoader());
    spider.addPipeline(pipeline);
    // run() is invoked on the calling thread, so the pipeline is read only after it returns.
    spider.thread(1).run();
    List<Map<String,Object>> sourceForwardList = pipeline.getDataList();
    if(sourceForwardList.isEmpty()){
        return null;
    }
    // Only the first record is considered; a missing account name maps to null rather
    // than a NullPointerException.
    Object mediaSelf = sourceForwardList.get(0).get("mediaself");
    return mediaSelf == null ? null : mediaSelf.toString();
}
}
......@@ -129,8 +129,8 @@ public class TreateData {
/***特定网站单独处理**/
if(url.contains("toutiao.com")){
//今日头条帐号匹配
if(html.contains(" name: '")){
source = html.split("name: '")[1].split("',")[0].trim();
if(html.contains("name: '")){
source = html.split("mediaInfo:")[1].split("name: '")[1].split("',")[0].trim();
}else if(html.contains("screen_name:")){
source = html.split("screen_name:'")[1].split("',")[0].trim();
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment