Commit 19bb2414 by yangchen

修改ok初版提交

parent 76581f38
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId> <groupId>com.zhiwei</groupId>
<artifactId>source_forward</artifactId> <artifactId>source-forward</artifactId>
<version>0.0.2-SNAPSHOT</version> <version>0.0.3-SNAPSHOT</version>
<name>source_forward</name> <name>source-forward</name>
<description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description> <description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description>
<properties> <properties>
...@@ -21,35 +22,14 @@ ...@@ -21,35 +22,14 @@
<dependencies> <dependencies>
<dependency> <dependency>
<groupId>cn.edu.hfut.dmic.webcollector</groupId> <groupId>com.zhiwei.tools</groupId>
<artifactId>WebCollector</artifactId> <artifactId>zhiwei-tools</artifactId>
<version>2.71</version> <version>0.0.2-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.6.1</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.6.1</version>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-saxon</artifactId>
<version>0.6.1</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.zhiwei</groupId> <groupId>com.zhiwei.middleware</groupId>
<artifactId>zhiweiTools</artifactId> <artifactId>proxy-client</artifactId>
<version>0.0.6-SNAPSHOT</version> <version>0.0.1-RELEASE</version>
</dependency> </dependency>
</dependencies> </dependencies>
...@@ -109,4 +89,13 @@ ...@@ -109,4 +89,13 @@
<dependencyManagement>
<dependencies>
<dependency>
<groupId>com.squareup.okhttp3</groupId>
<artifactId>okhttp</artifactId>
<version>3.11.0</version>
</dependency>
</dependencies>
</dependencyManagement>
</project> </project>
\ No newline at end of file
package com.zhiwei.source_forward.bean;
/**
 * Pairs a page URL with the text content associated with it.
 */
public class ContentBean {

    private String url;
    private String content;

    /** Creates an empty bean; fill it in through the setters. */
    public ContentBean() {
    }

    /**
     * Creates a fully populated bean.
     *
     * @param url     the page address
     * @param content the text associated with that page (may be {@code null})
     */
    public ContentBean(String url, String content) {
        this.url = url;
        this.content = content;
    }

    /** @return the page address */
    public String getUrl() {
        return this.url;
    }

    /** @param url the page address */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the text associated with the page */
    public String getContent() {
        return this.content;
    }

    /** @param content the text associated with the page */
    public void setContent(String content) {
        this.content = content;
    }

    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("ContentBean [url=");
        text.append(url).append(", content=").append(content).append("]");
        return text.toString();
    }

    /**
     * Opaque holder for a single arbitrary value ("attribute").
     * Instances are created only through {@link #of(Object)}.
     */
    public static class Attribution {

        private final Object attr;

        /**
         * Hidden constructor; use {@link #of(Object)}.
         *
         * @param attr the value to wrap
         */
        private Attribution(Object attr) {
            this.attr = attr;
        }

        /**
         * Wraps the given value.
         *
         * @param attr the value to wrap (not validated; may be {@code null})
         * @return a new holder around {@code attr}
         */
        public static Attribution of(Object attr) {
            return new Attribution(attr);
        }

        /**
         * Returns the wrapped value.
         *
         * @return the value passed to {@link #of(Object)}
         */
        public Object get() {
            return this.attr;
        }
    }
}
package com.zhiwei.source_forward.bean;
/**
 * Carries the source name and channel resolved for a page URL.
 */
public class MediaSelfSourceBean {

    private String url;
    private String source;
    private String channel;

    /** Creates an empty bean; fill it in through the setters. */
    public MediaSelfSourceBean() {
    }

    /**
     * Creates a fully populated bean.
     *
     * @param url     the page address
     * @param source  the source name (may be {@code null})
     * @param channel the channel name (may be {@code null})
     */
    public MediaSelfSourceBean(String url, String source, String channel) {
        this.url = url;
        this.source = source;
        this.channel = channel;
    }

    /** @return the page address */
    public String getUrl() {
        return this.url;
    }

    /** @param url the page address */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the source name */
    public String getSource() {
        return this.source;
    }

    /** @param source the source name */
    public void setSource(String source) {
        this.source = source;
    }

    /** @return the channel name */
    public String getChannel() {
        return this.channel;
    }

    /** @param channel the channel name */
    public void setChannel(String channel) {
        this.channel = channel;
    }

    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("MediaSelfSourceBean [url=");
        text.append(url);
        text.append(", source=").append(source);
        text.append(", channel=").append(channel);
        text.append("]");
        return text.toString();
    }

    /**
     * Opaque holder for a single arbitrary value ("attribute").
     * Instances are created only through {@link #of(Object)}.
     */
    public static class Attribution {

        private final Object attr;

        /**
         * Hidden constructor; use {@link #of(Object)}.
         *
         * @param attr the value to wrap
         */
        private Attribution(Object attr) {
            this.attr = attr;
        }

        /**
         * Wraps the given value.
         *
         * @param attr the value to wrap (not validated; may be {@code null})
         * @return a new holder around {@code attr}
         */
        public static Attribution of(Object attr) {
            return new Attribution(attr);
        }

        /**
         * Returns the wrapped value.
         *
         * @return the value passed to {@link #of(Object)}
         */
        public Object get() {
            return this.attr;
        }
    }
}
package com.zhiwei.source_forward.bean;
/**
 * Carries the channel, root source and forward/original marker resolved for a page URL.
 */
public class SourceForwardBean {

    private String url;
    private String channel;
    private String root_source;
    private String isforward;

    /** Creates an empty bean; fill it in through the setters. */
    public SourceForwardBean() {
        super();
    }

    /**
     * Creates a fully populated bean.
     *
     * @param url         the page address
     * @param channel     the channel name
     * @param root_source the root source name (may be {@code null})
     * @param isforward   the forward/original marker text
     */
    public SourceForwardBean(String url, String channel, String root_source,
            String isforward) {
        super();
        this.url = url;
        this.channel = channel;
        this.root_source = root_source;
        this.isforward = isforward;
    }

    /** @return the forward/original marker text */
    public String getIsforward() {
        return isforward;
    }

    /** @param isforward the forward/original marker text */
    public void setIsforward(String isforward) {
        this.isforward = isforward;
    }

    /** @return the page address */
    public String getUrl() {
        return url;
    }

    /** @param url the page address */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the channel name */
    public String getChannel() {
        return channel;
    }

    /** @param channel the channel name */
    public void setChannel(String channel) {
        this.channel = channel;
    }

    /** @return the root source name */
    public String getRoot_source() {
        return root_source;
    }

    /** @param root_source the root source name */
    public void setRoot_source(String root_source) {
        this.root_source = root_source;
    }

    /**
     * Renders every field of the bean, including {@code isforward}, which the
     * previous implementation left out.
     */
    @Override
    public String toString() {
        return "SourceForwardBean [url=" + url + ", channel=" + channel
                + ", root_source=" + root_source + ", isforward=" + isforward + "]";
    }

    /**
     * Opaque holder for a single arbitrary value ("attribute").
     * Instances are created only through {@link #of(Object)}.
     */
    public static class Attribution {

        private final Object attr;

        /**
         * Hidden constructor; use {@link #of(Object)}.
         *
         * @param attr the value to wrap
         */
        private Attribution(Object attr) {
            this.attr = attr;
        }

        /**
         * Wraps the given value.
         *
         * @param attr the value to wrap (not validated; may be {@code null})
         * @return a new holder around {@code attr}
         */
        public static Attribution of(Object attr) {
            return new Attribution(attr);
        }

        /**
         * Returns the wrapped value.
         *
         * @return the value passed to {@link #of(Object)}
         */
        public Object get() {
            return attr;
        }
    }
}
package com.zhiwei.source_forward.bean;
/**
 * Pairs a URL with a flag telling whether it is considered live.
 */
public class UrlLiveBean {

    private String url;
    private boolean isLive;

    /** Creates an empty bean ({@code url} is {@code null}, {@code isLive} is {@code false}). */
    public UrlLiveBean() {
    }

    /**
     * Creates a fully populated bean.
     *
     * @param url    the address
     * @param isLive the liveness flag for that address
     */
    public UrlLiveBean(String url, boolean isLive) {
        this.url = url;
        this.isLive = isLive;
    }

    /** @return the address */
    public String getUrl() {
        return this.url;
    }

    /** @param url the address */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the liveness flag */
    public boolean isLive() {
        return this.isLive;
    }

    /** @param isLive the liveness flag */
    public void setLive(boolean isLive) {
        this.isLive = isLive;
    }

    @Override
    public String toString() {
        return String.format("UrlLiveBean [url=%s, isLive=%s]", url, isLive);
    }

    /**
     * Opaque holder for a single arbitrary value ("attribute").
     * Instances are created only through {@link #of(Object)}.
     */
    public static class Attribution {

        private final Object attr;

        /**
         * Hidden constructor; use {@link #of(Object)}.
         *
         * @param attr the value to wrap
         */
        private Attribution(Object attr) {
            this.attr = attr;
        }

        /**
         * Wraps the given value.
         *
         * @param attr the value to wrap (not validated; may be {@code null})
         * @return a new holder around {@code attr}
         */
        public static Attribution of(Object attr) {
            return new Attribution(attr);
        }

        /**
         * Returns the wrapped value.
         *
         * @return the value passed to {@link #of(Object)}
         */
        public Object get() {
            return this.attr;
        }
    }
}
package com.zhiwei.source_forward.content;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.Elements;
import org.jsoup.select.NodeVisitor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Extracts the main content element, the title and the publication time from a
 * news-style web page that has been parsed with Jsoup.
 *
 * <p>The content element is chosen by scoring every element under
 * {@code <body>} from text/tag counts (see {@link #computeInfo(Node)} and
 * {@link #computeScore(Element)}). An instance mutates the document it was
 * given (see {@link #clean()}) and keeps per-element statistics in
 * {@link #infoMap}.
 *
 * @author hu
 */
public class ContentExtractor {

    public static final Logger LOG = LoggerFactory.getLogger(ContentExtractor.class);

    /**
     * Date followed by time: year, month, day, hour, minute, second, each pair
     * separated by 1-5 non-digit characters. The hour group accepts 00-29 so
     * that hours such as 00, 10 and 20 match.
     */
    private static final Pattern TIME_PATTERN = Pattern.compile(
            "([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-2]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-9]{1,2})");

    /** Date only: year, month, day separated by 1-5 non-digit characters. */
    private static final Pattern DATE_PATTERN = Pattern.compile(
            "([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})");

    /** Matches the tag names h1..h6. */
    private static final Pattern HEADING_TAG_PATTERN = Pattern.compile("h[1-6]");

    protected Document doc;

    /** Per-element statistics filled in by {@link #computeInfo(Node)}. */
    protected HashMap<Element, CountInfo> infoMap = new HashMap<Element, CountInfo>();

    ContentExtractor(Document doc) {
        this.doc = doc;
    }

    /** Text and tag statistics accumulated for one node and its subtree. */
    class CountInfo {
        int textCount = 0;
        int linkTextCount = 0;
        int tagCount = 0;
        int linkTagCount = 0;
        double density = 0;
        double densitySum = 0;
        double score = 0;
        int pCount = 0;
        ArrayList<Integer> leafList = new ArrayList<Integer>();
    }

    /** Removes script, noscript, style, iframe and br elements from the document. */
    protected void clean() {
        doc.select("script,noscript,style,iframe,br").remove();
    }

    /**
     * Recursively computes the statistics of {@code node} and its subtree.
     * Results for elements are also stored in {@link #infoMap}.
     *
     * @param node the node to analyse
     * @return the statistics of {@code node}; an all-zero instance for nodes
     *         that are neither elements nor text nodes
     */
    protected CountInfo computeInfo(Node node) {
        if (node instanceof Element) {
            Element tag = (Element) node;
            CountInfo countInfo = new CountInfo();
            for (Node childNode : tag.childNodes()) {
                CountInfo childCountInfo = computeInfo(childNode);
                countInfo.textCount += childCountInfo.textCount;
                countInfo.linkTextCount += childCountInfo.linkTextCount;
                countInfo.tagCount += childCountInfo.tagCount;
                countInfo.linkTagCount += childCountInfo.linkTagCount;
                countInfo.leafList.addAll(childCountInfo.leafList);
                countInfo.densitySum += childCountInfo.density;
                countInfo.pCount += childCountInfo.pCount;
            }
            countInfo.tagCount++;
            String tagName = tag.tagName();
            if (tagName.equals("a")) {
                // All text inside a link counts as link text.
                countInfo.linkTextCount = countInfo.textCount;
                countInfo.linkTagCount++;
            } else if (tagName.equals("p")) {
                countInfo.pCount++;
            }
            // Density = non-link characters per non-link tag.
            int pureLen = countInfo.textCount - countInfo.linkTextCount;
            int len = countInfo.tagCount - countInfo.linkTagCount;
            if (pureLen == 0 || len == 0) {
                countInfo.density = 0;
            } else {
                countInfo.density = (pureLen + 0.0) / len;
            }
            infoMap.put(tag, countInfo);
            return countInfo;
        } else if (node instanceof TextNode) {
            TextNode tn = (TextNode) node;
            CountInfo countInfo = new CountInfo();
            int len = tn.text().length();
            countInfo.textCount = len;
            countInfo.leafList.add(len);
            return countInfo;
        } else {
            return new CountInfo();
        }
    }

    /**
     * Scores an element from the statistics stored in {@link #infoMap}.
     *
     * @param tag an element previously visited by {@link #computeInfo(Node)}
     * @return the score; larger means more likely to be the main content
     */
    protected double computeScore(Element tag) {
        CountInfo countInfo = infoMap.get(tag);
        double var = Math.sqrt(computeVar(countInfo.leafList) + 1);
        double score = Math.log(var) * countInfo.densitySum
                * Math.log(countInfo.textCount - countInfo.linkTextCount + 1)
                * Math.log10(countInfo.pCount + 2);
        return score;
    }

    /**
     * Computes the population variance of the given values.
     *
     * @param data text-leaf lengths
     * @return 0 for an empty list, half the value for a single-element list,
     *         otherwise the population variance
     */
    protected double computeVar(ArrayList<Integer> data) {
        if (data.size() == 0) {
            return 0;
        }
        if (data.size() == 1) {
            // Integer division: odd values are truncated. Left unchanged so
            // that existing scores stay stable.
            return data.get(0) / 2;
        }
        double sum = 0;
        for (Integer i : data) {
            sum += i;
        }
        double ave = sum / data.size();
        sum = 0;
        for (Integer i : data) {
            sum += (i - ave) * (i - ave);
        }
        sum = sum / data.size();
        return sum;
    }

    /**
     * Cleans the document, scores every element under {@code <body>} and
     * returns the best one. Links and {@code <body>} itself are never chosen.
     *
     * @return the highest-scoring element
     * @throws Exception if no element has a score above zero
     */
    public Element getContentElement() throws Exception {
        clean();
        computeInfo(doc.body());
        double maxScore = 0;
        Element content = null;
        for (Map.Entry<Element, CountInfo> entry : infoMap.entrySet()) {
            Element tag = entry.getKey();
            if (tag.tagName().equals("a") || tag == doc.body()) {
                continue;
            }
            double score = computeScore(tag);
            if (score > maxScore) {
                maxScore = score;
                content = tag;
            }
        }
        if (content == null) {
            throw new Exception("extraction failed");
        }
        return content;
    }

    /**
     * Extracts content, URL, time and title into a {@link News}. Failure to
     * find the time or the title is logged and leaves that field unset.
     *
     * @return the extracted news
     * @throws Exception if the content element cannot be determined
     */
    public News getNews() throws Exception {
        News news = new News();
        Element contentElement;
        try {
            contentElement = getContentElement();
            news.setContentElement(contentElement);
        } catch (Exception ex) {
            LOG.info("news content extraction failed,extraction abort", ex);
            throw new Exception(ex);
        }
        if (doc.baseUri() != null) {
            news.setUrl(doc.baseUri());
        }
        try {
            news.setTime(getTime(contentElement));
        } catch (Exception ex) {
            // This branch handles the time, not the title.
            LOG.info("news time extraction failed", ex);
        }
        try {
            news.setTitle(getTitle(contentElement));
        } catch (Exception ex) {
            LOG.info("title extraction failed", ex);
        }
        return news;
    }

    /**
     * Searches the HTML around the content element for a date-time and, if
     * none is found, falls back to {@link #getDate(Element)}.
     *
     * @param contentElement the content element
     * @return {@code year-month-day hour:minute:second} built from the matched
     *         groups, or the result of {@link #getDate(Element)}
     * @throws Exception if neither a date-time nor a date is found
     */
    protected String getTime(Element contentElement) throws Exception {
        Matcher matcher = findInAncestors(contentElement, TIME_PATTERN);
        if (matcher != null) {
            return matcher.group(1) + "-" + matcher.group(2) + "-" + matcher.group(3) + " "
                    + matcher.group(4) + ":" + matcher.group(5) + ":" + matcher.group(6);
        }
        try {
            return getDate(contentElement);
        } catch (Exception ex) {
            throw new Exception("time not found", ex);
        }
    }

    /**
     * Searches the HTML around the content element for a date.
     *
     * @param contentElement the content element
     * @return {@code year-month-day} built from the matched groups
     * @throws Exception if no date is found
     */
    protected String getDate(Element contentElement) throws Exception {
        Matcher matcher = findInAncestors(contentElement, DATE_PATTERN);
        if (matcher != null) {
            return matcher.group(1) + "-" + matcher.group(2) + "-" + matcher.group(3);
        }
        throw new Exception("date not found");
    }

    /**
     * Moves up to two levels above {@code contentElement} (never past
     * {@code <body>}), then tests the outer HTML of that element and of up to
     * five further ancestors against {@code pattern}.
     *
     * @return the first successful matcher, or {@code null} if none matched
     */
    private Matcher findInAncestors(Element contentElement, Pattern pattern) {
        Element current = contentElement;
        for (int i = 0; i < 2; i++) {
            if (current != null && current != doc.body()) {
                Element parent = current.parent();
                if (parent != null) {
                    current = parent;
                }
            }
        }
        for (int i = 0; i < 6 && current != null; i++) {
            Matcher matcher = pattern.matcher(current.outerHtml());
            if (matcher.find()) {
                return matcher;
            }
            if (current == doc.body()) {
                // The search never leaves <body>; scanning it again would
                // give the same negative result.
                break;
            }
            current = current.parent();
        }
        return null;
    }

    /**
     * Similarity of two strings: LCS length divided by the longer length.
     *
     * @return 0 if either string is empty or one is at least three times as
     *         long as the other; otherwise a value in (0, 1]
     */
    protected double strSim(String a, String b) {
        int len1 = a.length();
        int len2 = b.length();
        if (len1 == 0 || len2 == 0) {
            return 0;
        }
        double ratio;
        if (len1 > len2) {
            ratio = (len1 + 0.0) / len2;
        } else {
            ratio = (len2 + 0.0) / len1;
        }
        if (ratio >= 3) {
            return 0;
        }
        return (lcs(a, b) + 0.0) / Math.max(len1, len2);
    }

    /**
     * Finds the page title. Strategies, in order: the h1-h6 heading before the
     * content element that best matches the document title (later headings are
     * weighted higher); the first element whose id or class starts or ends
     * with "title" and whose text is 6-39 characters long; finally
     * {@link #getTitleByEditDistance(Element)}.
     *
     * @param contentElement the content element
     * @return the title text
     * @throws Exception if no strategy yields a title
     */
    protected String getTitle(final Element contentElement) throws Exception {
        final ArrayList<Element> titleList = new ArrayList<Element>();
        final ArrayList<Double> titleSim = new ArrayList<Double>();
        final AtomicInteger contentIndex = new AtomicInteger();
        final String metaTitle = doc.title().trim();
        if (!metaTitle.isEmpty()) {
            doc.body().traverse(new NodeVisitor() {
                @Override
                public void head(Node node, int i) {
                    if (node instanceof Element) {
                        Element tag = (Element) node;
                        if (tag == contentElement) {
                            // Remember how many headings precede the content.
                            contentIndex.set(titleList.size());
                            return;
                        }
                        String tagName = tag.tagName();
                        if (HEADING_TAG_PATTERN.matcher(tagName).matches()) {
                            String title = tag.text().trim();
                            double sim = strSim(title, metaTitle);
                            titleSim.add(sim);
                            titleList.add(tag);
                        }
                    }
                }

                @Override
                public void tail(Node node, int i) {
                }
            });
            int index = contentIndex.get();
            if (index > 0) {
                double maxScore = 0;
                int maxIndex = -1;
                for (int i = 0; i < index; i++) {
                    double score = (i + 1) * titleSim.get(i);
                    if (score > maxScore) {
                        maxScore = score;
                        maxIndex = i;
                    }
                }
                if (maxIndex != -1) {
                    return titleList.get(maxIndex).text();
                }
            }
        }
        Elements titles = doc.body().select("*[id^=title],*[id$=title],*[class^=title],*[class$=title]");
        if (titles.size() > 0) {
            String title = titles.first().text();
            if (title.length() > 5 && title.length() < 40) {
                return title;
            }
        }
        try {
            return getTitleByEditDistance(contentElement);
        } catch (Exception ex) {
            throw new Exception("title not found", ex);
        }
    }

    /**
     * Returns the text node under {@code <body>} most similar to the document
     * title according to {@link #strSim(String, String)}.
     *
     * @param contentElement unused
     * @return the trimmed text of the best-matching text node
     * @throws Exception if no text node has a similarity above zero
     */
    protected String getTitleByEditDistance(Element contentElement) throws Exception {
        final String metaTitle = doc.title();
        // Single-element list used as a mutable holder inside the visitor.
        final ArrayList<Double> max = new ArrayList<Double>();
        max.add(0.0);
        final StringBuilder sb = new StringBuilder();
        doc.body().traverse(new NodeVisitor() {
            @Override
            public void head(Node node, int i) {
                if (node instanceof TextNode) {
                    TextNode tn = (TextNode) node;
                    String text = tn.text().trim();
                    double sim = strSim(text, metaTitle);
                    if (sim > 0) {
                        if (sim > max.get(0)) {
                            max.set(0, sim);
                            sb.setLength(0);
                            sb.append(text);
                        }
                    }
                }
            }

            @Override
            public void tail(Node node, int i) {
            }
        });
        if (sb.length() > 0) {
            return sb.toString();
        }
        throw new Exception("no text node resembles the document title");
    }

    /**
     * Length of the longest common subsequence of two strings.
     *
     * @return the LCS length; 0 if either string is empty
     */
    protected int lcs(String x, String y) {
        int M = x.length();
        int N = y.length();
        if (M == 0 || N == 0) {
            return 0;
        }
        int[][] opt = new int[M + 1][N + 1];
        for (int i = M - 1; i >= 0; i--) {
            for (int j = N - 1; j >= 0; j--) {
                if (x.charAt(i) == y.charAt(j)) {
                    opt[i][j] = opt[i + 1][j + 1] + 1;
                } else {
                    opt[i][j] = Math.max(opt[i + 1][j], opt[i][j + 1]);
                }
            }
        }
        return opt[0][0];
    }

    /**
     * Edit distance between two strings, where insert, delete and replace each
     * cost 1.
     *
     * @return the minimum number of edits turning {@code word1} into {@code word2}
     */
    protected int editDistance(String word1, String word2) {
        int len1 = word1.length();
        int len2 = word2.length();
        int[][] dp = new int[len1 + 1][len2 + 1];
        for (int i = 0; i <= len1; i++) {
            dp[i][0] = i;
        }
        for (int j = 0; j <= len2; j++) {
            dp[0][j] = j;
        }
        for (int i = 0; i < len1; i++) {
            char c1 = word1.charAt(i);
            for (int j = 0; j < len2; j++) {
                char c2 = word2.charAt(j);
                if (c1 == c2) {
                    dp[i + 1][j + 1] = dp[i][j];
                } else {
                    int replace = dp[i][j] + 1;
                    int insert = dp[i][j + 1] + 1;
                    int delete = dp[i + 1][j] + 1;
                    int min = replace > insert ? insert : replace;
                    min = delete > min ? min : delete;
                    dp[i + 1][j + 1] = min;
                }
            }
        }
        return dp[len1][len2];
    }

    /* Given a Jsoup Document, returns the element holding the main content. */
    public static Element getContentElementByDoc(Document doc) throws Exception {
        ContentExtractor ce = new ContentExtractor(doc);
        return ce.getContentElement();
    }

    /* Given HTML, returns the element holding the main content. */
    public static Element getContentElementByHtml(String html) throws Exception {
        Document doc = Jsoup.parse(html);
        return getContentElementByDoc(doc);
    }

    /* Given HTML and its URL, returns the element holding the main content. */
    public static Element getContentElementByHtml(String html, String url) throws Exception {
        Document doc = Jsoup.parse(html, url);
        return getContentElementByDoc(doc);
    }

    /* Given a Jsoup Document, returns the main content as text. */
    public static String getContentByDoc(Document doc) throws Exception {
        ContentExtractor ce = new ContentExtractor(doc);
        return ce.getContentElement().text();
    }

    /* Given HTML, returns the main content as text. */
    public static String getContentByHtml(String html) throws Exception {
        Document doc = Jsoup.parse(html);
        return getContentElementByDoc(doc).text();
    }

    /* Given HTML and its URL, returns the main content as text. */
    public static String getContentByHtml(String html, String url) throws Exception {
        Document doc = Jsoup.parse(html, url);
        return getContentElementByDoc(doc).text();
    }

    /* Given a Jsoup Document, returns the structured news information. */
    public static News getNewsByDoc(Document doc) throws Exception {
        ContentExtractor ce = new ContentExtractor(doc);
        return ce.getNews();
    }

    /* Given HTML, returns the structured news information. */
    public static News getNewsByHtml(String html) throws Exception {
        Document doc = Jsoup.parse(html);
        return getNewsByDoc(doc);
    }

    /* Given HTML and its URL, returns the structured news information. */
    public static News getNewsByHtml(String html, String url) throws Exception {
        Document doc = Jsoup.parse(html, url);
        return getNewsByDoc(doc);
    }
}
package com.zhiwei.source_forward.content;
import org.jsoup.nodes.Element;
/**
 * Structured result of a news extraction: URL, title, time, and the content
 * both as text and as the source element.
 *
 * @author hu
 */
public class News {

    protected String url;
    protected String title;
    protected String content;
    protected String time;
    protected Element contentElement;

    /** @return the page address */
    public String getUrl() {
        return this.url;
    }

    /** @param url the page address */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the title */
    public void setTitle(String title) {
        this.title = title;
    }

    /**
     * Returns the content text. When no text has been set yet and a content
     * element is available, the element's text is computed once and kept.
     *
     * @return the content text, or {@code null} if neither text nor element is set
     */
    public String getContent() {
        boolean needsText = (this.content == null);
        if (needsText && this.contentElement != null) {
            this.content = this.contentElement.text();
        }
        return this.content;
    }

    /** @param content the content text */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the time string */
    public String getTime() {
        return this.time;
    }

    /** @param time the time string */
    public void setTime(String time) {
        this.time = time;
    }

    /** @return the element holding the content */
    public Element getContentElement() {
        return this.contentElement;
    }

    /** @param contentElement the element holding the content */
    public void setContentElement(Element contentElement) {
        this.contentElement = contentElement;
    }

    @Override
    public String toString() {
        StringBuilder out = new StringBuilder();
        out.append("URL:\n").append(url);
        out.append("\nTITLE:\n").append(title);
        out.append("\nTIME:\n").append(time);
        out.append("\nCONTENT:\n").append(getContent());
        out.append("\nCONTENT(SOURCE):\n").append(contentElement);
        return out.toString();
    }
}
package com.zhiwei.source_forward.crawler;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.zhiwei.source_forward.bean.ContentBean;
import com.zhiwei.source_forward.bean.ContentBean.Attribution;
import com.zhiwei.source_forward.util.ContentDataCallback;
import com.zhiwei.source_forward.util.MatchContent;
import com.zhiwei.source_forward.util.ProxyClientUtil;
import com.zhiwei.tools.httpclient.HttpBoot;
import com.zhiwei.tools.httpclient.HttpRequestBuilder;
import com.zhiwei.tools.httpclient.asyn.MultiThreadingCounter;
import okhttp3.Request;
import okhttp3.Response;
/**
 * Fetches pages asynchronously and passes the text matched from each page to a
 * {@link ContentDataCallback}.
 */
public class ContentCrawler {

    private static final Logger logger = LogManager.getLogger(ContentCrawler.class);

    /**
     * Submits the URLs for crawling and returns the counter that is increased
     * and reduced around each request.
     *
     * @param callback receiver of the results
     * @param urls     pages to fetch; {@code null} entries are skipped
     * @return the counter used for this batch
     * @throws Exception declared for callers; failures per URL are logged, not thrown
     */
    public MultiThreadingCounter submitTask(ContentDataCallback callback, String... urls) throws Exception {
        MultiThreadingCounter counter = new MultiThreadingCounter();
        start(counter, callback, urls);
        return counter;
    }

    /**
     * Starts one asynchronous request per non-null URL.
     *
     * @param counter  counter shared by the batch
     * @param callback receiver of the results
     * @param urls     pages to fetch
     */
    private void start(MultiThreadingCounter counter, ContentDataCallback callback, String... urls) {
        if (urls == null || urls.length == 0) {
            return;
        }
        for (String url : urls) {
            if (url == null) {
                continue;
            }
            try {
                counter.increase();
                search(counter, url, Attribution.of(url), callback);
            } catch (Exception e) {
                // The message has two placeholders: the URL and the error text.
                // The trailing throwable makes log4j2 print the stack trace.
                logger.error("关键词 {} 搜索创建出错: {}", url, e.getMessage(), e);
            } finally {
                counter.reduce();
            }
        }
    }

    /**
     * Issues the GET request for one URL and parses the response in the
     * completion listener.
     *
     * @param counter  counter shared by the batch
     * @param url      page to fetch
     * @param attr     attribute carried through to the callback (wraps the URL)
     * @param callback receiver of the result
     * @return {@code counter}
     */
    private MultiThreadingCounter search(MultiThreadingCounter counter, String url, Attribution attr,
            ContentDataCallback callback) {
        logger.info("当前处理 URL: {}", url);
        Request request = HttpRequestBuilder.newGetRequest(url, null);
        counter.increase();
        HttpBoot.asyncCall(request, ProxyClientUtil.getNATProxy(), false).addListeners(future -> {
            try {
                if (future.isSuccess()) {
                    Response response = future.result();
                    try {
                        parseHtml(response, attr, callback);
                    } catch (Exception e) {
                        logger.error("解析出错", e);
                    }
                } else {
                    logger.info("{} 搜索结果访问失败: {}", request.url().url(), future.cause().getMessage());
                }
            } finally {
                // Always balance the increase() above, even if logging the
                // failure itself throws (e.g. a null cause).
                counter.reduce();
            }
        });
        return counter;
    }

    /**
     * Reads the response body, matches the page content and delivers a
     * {@link ContentBean} to the callback. The bean's content is {@code null}
     * when the response is not successful or matching fails.
     *
     * @param response HTTP response; always closed before returning
     * @param attr     attribute wrapping the page URL
     * @param callback receiver of the result; when {@code null} only a warning is logged
     */
    private void parseHtml(Response response, Attribution attr,
            ContentDataCallback callback) {
        String content = null;
        try {
            if (response.isSuccessful()) {
                String html = response.body().string();
                content = MatchContent.matchContent(attr.get().toString(), html);
            }
        } catch (Exception e) {
            // Log the exception itself; fillInStackTrace() would replace the
            // original trace with this line.
            logger.info("网页链接失效", e);
        } finally {
            if (response != null) {
                response.close();
            }
        }
        ContentBean cb = new ContentBean(attr.get().toString(), content);
        if (callback == null) {
            logger.warn("DataCallback 对象为 null,无法保存数据");
        } else {
            callback.onData(cb, attr);
        }
    }
}
package com.zhiwei.source_forward.crawler;
import java.util.HashMap;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.source_forward.util.MatchContent;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 * WebMagic page processor that matches the article content of each fetched
 * page and stores it, together with the URL, under the result field
 * {@code "content"}.
 *
 * @author hero
 * @date 2018-06-30 09:54:02
 */
public class ContentPageProcessor implements PageProcessor {

    private static final Logger logger = LoggerFactory.getLogger(ContentPageProcessor.class);

    private Site site = Site.me().setCycleRetryTimes(3).setSleepTime(1500)
            .setTimeOut(10000)
            .setUserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0")
            .addHeader("Accept-Encoding", "deflate, br");

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Matches the page content unless the status code is 404. The stored
     * content is {@code null} for 404 pages and when matching throws.
     *
     * @param page the fetched page
     */
    @Override
    public void process(Page page) {
        Map<String, String> data = new HashMap<String, String>();
        String content = null;
        try {
            if (page.getStatusCode() != 404) {
                content = MatchContent.matchContent(page.getUrl().get(), page.getHtml().toString());
            }
        } catch (Exception e) {
            // Log the exception itself; fillInStackTrace() would replace the
            // original trace with this line. content is still null here.
            logger.info("网页链接失效", e);
        }
        data.put("url", page.getUrl().get());
        data.put("content", content);
        page.putField("content", data);
    }
}
package com.zhiwei.source_forward.crawler;
import java.util.List;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Node;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution;
import com.zhiwei.source_forward.util.MatchChannel;
import com.zhiwei.source_forward.util.MatchSource;
import com.zhiwei.source_forward.util.MediaSelfSourceDataCallBack;
import com.zhiwei.source_forward.util.ProxyClientUtil;
import com.zhiwei.tools.httpclient.HttpBoot;
import com.zhiwei.tools.httpclient.HttpRequestBuilder;
import com.zhiwei.tools.httpclient.asyn.MultiThreadingCounter;
import okhttp3.Request;
import okhttp3.Response;
/**
 * Fetches pages asynchronously and resolves the self-media source and channel
 * of each page, delivering the result to a {@link MediaSelfSourceDataCallBack}.
 *
 * @author byte-zbs
 * @Date 2018-08-21 15:54:03
 * @version 1.0.0
 */
public class MediaSelfSourceCrawler {

    private static final Logger logger = LogManager.getLogger(MediaSelfSourceCrawler.class);

    /**
     * Submits the URLs for crawling and returns the counter that is increased
     * and reduced around each request.
     *
     * @param callback receiver of the results
     * @param urls     pages to fetch; {@code null} entries are skipped
     * @return the counter used for this batch
     * @throws Exception declared for callers; failures per URL are logged, not thrown
     */
    public MultiThreadingCounter submitTask(MediaSelfSourceDataCallBack callback, String... urls) throws Exception {
        MultiThreadingCounter counter = new MultiThreadingCounter();
        start(counter, callback, urls);
        return counter;
    }

    /**
     * Starts one asynchronous request per non-null URL.
     *
     * @param counter  counter shared by the batch
     * @param callback receiver of the results
     * @param urls     pages to fetch
     */
    private void start(MultiThreadingCounter counter, MediaSelfSourceDataCallBack callback, String... urls) {
        if (urls == null || urls.length == 0) {
            return;
        }
        for (String url : urls) {
            if (url == null) {
                continue;
            }
            try {
                counter.increase();
                search(counter, url, Attribution.of(url), callback);
            } catch (Exception e) {
                // The message has two placeholders: the URL and the error text.
                // The trailing throwable makes log4j2 print the stack trace.
                logger.error("关键词 {} 搜索创建出错: {}", url, e.getMessage(), e);
            } finally {
                counter.reduce();
            }
        }
    }

    /**
     * Issues the GET request for one URL and parses the response in the
     * completion listener.
     *
     * @param counter  counter shared by the batch
     * @param url      page to fetch
     * @param attr     attribute carried through to the callback (wraps the URL)
     * @param callback receiver of the result
     * @return {@code counter}
     */
    private MultiThreadingCounter search(MultiThreadingCounter counter, String url, Attribution attr,
            MediaSelfSourceDataCallBack callback) {
        logger.info("当前处理 URL: {}", url);
        Request request = HttpRequestBuilder.newGetRequest(url, null);
        counter.increase();
        HttpBoot.asyncCall(request, ProxyClientUtil.getNATProxy(), false).addListeners(future -> {
            try {
                if (future.isSuccess()) {
                    Response response = future.result();
                    try {
                        parseHtml(response, attr, callback);
                    } catch (Exception e) {
                        logger.error("解析出错", e);
                    }
                } else {
                    logger.info("{} 搜索结果访问失败: {}", request.url().url(), future.cause().getMessage());
                }
            } finally {
                // Always balance the increase() above, even if logging the
                // failure itself throws (e.g. a null cause).
                counter.reduce();
            }
        });
        return counter;
    }

    /**
     * Resolves source and channel from the response and delivers a
     * {@link MediaSelfSourceBean} to the callback. The source is {@code null}
     * when nothing (or an empty string) was matched or when parsing throws.
     *
     * @param response HTTP response; always closed before returning
     * @param attr     attribute wrapping the page URL
     * @param callback receiver of the result; when {@code null} only a warning is logged
     */
    private void parseHtml(Response response, Attribution attr,
            MediaSelfSourceDataCallBack callback) {
        String source = null;
        String channel = null;
        try {
            if (response.isSuccessful()) {
                String html = response.body().string();
                source = MatchSource.matchMediaSelfSource(attr.get().toString(), html);
                if (source == null || source.equals("")) {
                    source = null;
                }
                // Try the URL first; fall back to the nodes of <head>.
                channel = MatchChannel.verifyChannel(attr.get().toString());
                if (channel == null) {
                    List<Node> nodeList = Jsoup.parse(html).head().childNodes();
                    channel = MatchChannel.matchChannel(nodeList);
                }
            }
        } catch (Exception e) {
            // Previously swallowed silently; the result is unchanged, the
            // failure is now visible in the log.
            logger.warn("自媒体来源解析出错", e);
            source = null;
        } finally {
            if (response != null) {
                response.close();
            }
        }
        logger.info(attr.get() + "=================" + source);
        MediaSelfSourceBean msfb = new MediaSelfSourceBean(attr.get().toString(), source, channel);
        if (callback == null) {
            logger.warn("DataCallback 对象为 null,无法保存数据");
        } else {
            callback.onData(msfb, attr);
        }
    }
}
package com.zhiwei.source_forward.crawler;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.nodes.Node;
import com.zhiwei.source_forward.util.MatchChannel;
import com.zhiwei.source_forward.util.MatchSource;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 * WebMagic page processor that resolves the self-media source and channel of
 * each fetched page and stores them, together with the URL, under the result
 * field {@code "mediaSelf"}.
 */
public class MediaSelfSourcePageProcessor implements PageProcessor {

    private Site site = Site.me().setCycleRetryTimes(3).setSleepTime(1500)
            .setTimeOut(10000)
            .setUserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0")
            .addHeader("Accept-Encoding", "gzip, deflate, br");

    @Override
    public Site getSite() {
        return this.site;
    }

    /**
     * Resolves source and channel unless the status code is 404. The source
     * ends up {@code null} when nothing (or an empty string) was matched or
     * when any step throws.
     *
     * @param page the fetched page
     */
    @Override
    public void process(Page page) {
        String mediaSelf = null;
        String channelName = null;
        try {
            if (page.getStatusCode() != 404) {
                mediaSelf = MatchSource.matchMediaSelfSource(page.getUrl().get(), page.getHtml().toString());
                if (mediaSelf != null && mediaSelf.equals("")) {
                    // An empty match counts as "no source".
                    mediaSelf = null;
                }
                // Try the URL first; fall back to the nodes of <head>.
                channelName = MatchChannel.verifyChannel(page.getUrl().get());
                if (channelName == null) {
                    List<Node> headNodes = page.getHtml().getDocument().head().childNodes();
                    channelName = MatchChannel.matchChannel(headNodes);
                }
            }
        } catch (Exception e) {
            mediaSelf = null;
        }
        System.out.println(page.getUrl().get() + "=================" + mediaSelf);

        Map<String, String> result = new HashMap<String, String>();
        result.put("url", page.getUrl().get());
        result.put("mediaself", mediaSelf);
        result.put("channel", channelName);
        page.putField("mediaSelf", result);
    }
}
package com.zhiwei.source_forward.crawler;
import java.util.List;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node;
import com.zhiwei.source_forward.bean.SourceForwardBean;
import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution;
import com.zhiwei.source_forward.util.MatchChannel;
import com.zhiwei.source_forward.util.MatchSource;
import com.zhiwei.source_forward.util.ProxyClientUtil;
import com.zhiwei.source_forward.util.SourceData;
import com.zhiwei.source_forward.util.SourceForwardDataCallBack;
import com.zhiwei.tools.httpclient.HttpBoot;
import com.zhiwei.tools.httpclient.HttpRequestBuilder;
import com.zhiwei.tools.httpclient.asyn.MultiThreadingCounter;
import okhttp3.Request;
import okhttp3.Response;
/**
 * Fetches pages asynchronously and resolves channel, root source and the
 * original/forward marker of each page, delivering the result to a
 * {@link SourceForwardDataCallBack}.
 */
public class SourceForwardCrawler {

    private static final Logger logger = LogManager.getLogger(SourceForwardCrawler.class);

    /** Source names obtained once from {@link SourceData#getSourceList()}. */
    private static List<String> sourceList = SourceData.getSourceList();

    /**
     * Submits the URLs for crawling and returns the counter that is increased
     * and reduced around each request.
     *
     * @param callback receiver of the results
     * @param urls     pages to fetch; {@code null} entries are skipped
     * @return the counter used for this batch
     * @throws Exception declared for callers; failures per URL are logged, not thrown
     */
    public MultiThreadingCounter submitTask(SourceForwardDataCallBack callback, String... urls) throws Exception {
        MultiThreadingCounter counter = new MultiThreadingCounter();
        start(counter, callback, urls);
        return counter;
    }

    /**
     * Starts one asynchronous request per non-null URL.
     *
     * @param counter  counter shared by the batch
     * @param callback receiver of the results
     * @param urls     pages to fetch
     */
    private void start(MultiThreadingCounter counter, SourceForwardDataCallBack callback, String... urls) {
        if (urls == null || urls.length == 0) {
            return;
        }
        for (String url : urls) {
            if (url == null) {
                continue;
            }
            try {
                counter.increase();
                search(counter, url, Attribution.of(url), callback);
            } catch (Exception e) {
                // The message has two placeholders: the URL and the error text.
                // The trailing throwable makes log4j2 print the stack trace.
                logger.error("关键词 {} 搜索创建出错: {}", url, e.getMessage(), e);
            } finally {
                counter.reduce();
            }
        }
    }

    /**
     * Issues the GET request for one URL and parses the response in the
     * completion listener.
     *
     * @param counter  counter shared by the batch
     * @param url      page to fetch
     * @param attr     attribute carried through to the callback (wraps the URL)
     * @param callback receiver of the result
     * @return {@code counter}
     */
    private MultiThreadingCounter search(MultiThreadingCounter counter, String url, Attribution attr,
            SourceForwardDataCallBack callback) {
        logger.info("当前处理 URL: {}", url);
        Request request = HttpRequestBuilder.newGetRequest(url, null);
        counter.increase();
        HttpBoot.asyncCall(request, ProxyClientUtil.getNATProxy(), false).addListeners(future -> {
            try {
                if (future.isSuccess()) {
                    Response response = future.result();
                    try {
                        parseHtml(response, attr, callback);
                    } catch (Exception e) {
                        logger.error("解析出错", e);
                    }
                } else {
                    logger.info("{} 搜索结果访问失败: {}", request.url().url(), future.cause().getMessage());
                }
            } finally {
                // Always balance the increase() above, even if logging the
                // failure itself throws (e.g. a null cause).
                counter.reduce();
            }
        });
        return counter;
    }

    /**
     * Resolves the fields of a {@link SourceForwardBean} from the response and
     * delivers it to the callback. For URLs containing
     * {@code mp.weixin.qq.com} only the original/forward marker is read; for
     * all other URLs channel and source are matched. On any exception the
     * source is {@code null} and the channel is reset to its default.
     *
     * @param response HTTP response; always closed before returning
     * @param attr     attribute wrapping the page URL
     * @param callback receiver of the result; when {@code null} only a warning is logged
     */
    private void parseHtml(Response response, Attribution attr,
            SourceForwardDataCallBack callback) {
        String source = null;
        String channel = "新闻";
        String isforward = "未知";
        try {
            if (response.isSuccessful()) {
                Document document = Jsoup.parse(response.body().string());
                if (attr.get().toString().contains("mp.weixin.qq.com")) {
                    isforward = document.select("div#meta_content").select("span#copyright_logo").text();
                    if (!"原创".equals(isforward)) {
                        isforward = "未知";
                    }
                } else {
                    // Try the URL first; fall back to the nodes of <head>.
                    channel = MatchChannel.verifyChannel(attr.get().toString());
                    if (channel == null) {
                        List<Node> nodeList = document.head().childNodes();
                        channel = MatchChannel.matchChannel(nodeList);
                    }
                    source = MatchSource.matchSource(attr.get().toString(), document.toString(), sourceList);
                }
            }
        } catch (Exception e) {
            // Previously swallowed silently; the result is unchanged, the
            // failure is now visible in the log.
            logger.warn("转发来源解析出错", e);
            source = null;
            channel = "新闻";
        } finally {
            if (response != null) {
                response.close();
            }
        }
        logger.info(attr.get().toString() + "=======" + channel + "=================" + source);
        SourceForwardBean sfb = new SourceForwardBean(attr.get().toString(), channel, source, isforward);
        if (callback == null) {
            logger.warn("DataCallback 对象为 null,无法保存数据");
        } else {
            callback.onData(sfb, attr);
        }
    }
}
package com.zhiwei.source_forward.crawler;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node;
import com.zhiwei.source_forward.util.MatchChannel;
import com.zhiwei.source_forward.util.MatchSource;
import com.zhiwei.source_forward.util.SourceData;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 * Page processor that derives the channel and root source of a crawled page and stores them
 * under the {@code sourceForward} field of the page.
 */
public class SourceForwardPageProcessor implements PageProcessor {
    private static List<String> sourceList = SourceData.getSourceList();

    private Site site = Site.me()
            .setCycleRetryTimes(3)
            .setSleepTime(1500)
            .setTimeOut(10000)
            .addHeader("User-Agent",
                    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0")
            .addHeader("Accept",
                    "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");

    /**
     * Fills the {@code sourceForward} result map for one page.
     *
     * <p>Pages with status 404 keep the defaults. URLs containing {@code mp.weixin.qq.com}
     * only get the {@code isforward} marker; all other URLs get channel and root source.
     * Any exception during extraction resets channel and source to their defaults.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        Map<String,String> fields = new HashMap<>();
        String rootSource = null;
        String channelName = "新闻";
        try {
            if (page.getStatusCode() != 404) {
                String pageUrl = page.getUrl().get();
                if (pageUrl.contains("mp.weixin.qq.com")) {
                    Document doc = page.getHtml().getDocument();
                    String mark = doc.select("div#meta_content").select("span#copyright_logo").text();
                    // Only the exact marker is kept; anything else is reported as unknown.
                    fields.put("isforward", "原创".equals(mark) ? mark : "未知");
                } else {
                    channelName = MatchChannel.verifyChannel(pageUrl);
                    if (channelName == null) {
                        List<Node> headNodes = page.getHtml().getDocument().head().childNodes();
                        channelName = MatchChannel.matchChannel(headNodes);
                    }
                    rootSource = MatchSource.matchSource(pageUrl, page.getHtml().toString(), sourceList);
                }
            }
        } catch (Exception e) {
            rootSource = null;
            channelName = "新闻";
        }
        String reportedUrl = page.getUrl().get();
        System.out.println(reportedUrl+"======="+channelName+"================="+rootSource);
        fields.put("url", reportedUrl);
        fields.put("channel", channelName);
        fields.put("root_source", rootSource);
        page.putField("sourceForward", fields);
    }

    /** @return the crawl settings (retries, sleep time, timeout, headers) used by this processor */
    @Override
    public Site getSite() {
        return site;
    }
}
package com.zhiwei.source_forward.crawler; package com.zhiwei.source_forward.crawler;
import java.util.HashMap;
import java.io.IOException;
import java.util.List; import java.util.List;
import java.util.Map;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node; import org.jsoup.nodes.Node;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
public class UrlLivePageProcessor implements PageProcessor{ import com.zhiwei.source_forward.bean.UrlLiveBean;
private static Logger logger = LoggerFactory.getLogger(UrlLivePageProcessor.class); import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution;
private Site site = Site.me().setCycleRetryTimes(3).setSleepTime(1500) import com.zhiwei.source_forward.util.ProxyClientUtil;
.setTimeOut(15000) import com.zhiwei.source_forward.util.UrlLiveDataCallback;
.addHeader("User-Agent", import com.zhiwei.tools.httpclient.HttpBoot;
"Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0") import com.zhiwei.tools.httpclient.HttpRequestBuilder;
.addHeader("Accept", import com.zhiwei.tools.httpclient.asyn.MultiThreadingCounter;
"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
@Override import okhttp3.Request;
public void process(Page page) { import okhttp3.Response;
/**
*
* @ClassName UrlLiveCrawler
* @Description 判断页面是否存在
* @author byte-zbs
* @Date 2018年8月20日 下午3:34:57
* @version 1.0.0
*/
public class UrlLiveCrawler {
private static final Logger logger = LogManager.getLogger(UrlLiveCrawler.class);
/**
 * Creates a new counter, starts a liveness check for each given URL and returns the counter.
 *
 * @param callback receiver of each result
 * @param urls     addresses to check
 * @return the counter associated with the started requests
 * @throws Exception declared for callers; nothing visible in this method throws a checked exception
 */
public MultiThreadingCounter submitTask(UrlLiveDataCallback callback,String... urls) throws Exception {
    final MultiThreadingCounter pending = new MultiThreadingCounter();
    this.start(pending, callback, urls);
    return pending;
}
/**
 * Dispatches one {@code search} per non-null URL.
 *
 * <p>Each dispatch is bracketed by {@code counter.increase()} / {@code counter.reduce()}.
 * A failure while creating one request is logged with the offending URL and the full
 * exception, and does not stop the remaining URLs from being processed.
 *
 * @param counter  counter shared with the asynchronous requests
 * @param callback receiver of each result; handed through to {@code search}
 * @param urls     addresses to check; null or empty means nothing to do
 */
private void start(MultiThreadingCounter counter,UrlLiveDataCallback callback, String... urls) {
    if (urls == null || urls.length == 0) {
        return;
    }
    for (String url : urls) {
        if (url == null) {
            continue;
        }
        try {
            counter.increase();
            search(counter, url, Attribution.of(url), callback);
        } catch (Exception e) {
            // Both placeholders are now filled (URL, message); the trailing throwable keeps the stack trace.
            logger.error("关键词 {} 搜索创建出错: {}", url, e.getMessage(), e);
        } finally {
            counter.reduce();
        }
    }
}
/**
 * Starts an asynchronous GET for {@code url} and, on success, passes the response to
 * {@code parseHtml}.
 *
 * <p>The counter is increased before the request is dispatched and reduced exactly once
 * when the listener finishes. The reduce sits in a {@code finally} block so that an
 * unexpected failure inside the listener cannot leave the counter permanently raised.
 *
 * @param counter  counter tracking outstanding requests; returned unchanged
 * @param url      address to fetch
 * @param attr     attribution handed through to {@code parseHtml}
 * @param callback receiver of the parsed result; handed through to {@code parseHtml}
 * @return the same {@code counter} instance
 */
private MultiThreadingCounter search(MultiThreadingCounter counter, String url,
Attribution attr, UrlLiveDataCallback callback) {
    logger.info("当前处理 URL: {}", url);
    Request request = HttpRequestBuilder.newGetRequest(url, null);
    counter.increase();
    HttpBoot.asyncCall(request, ProxyClientUtil.getNATProxy(), false).addListeners(future -> {
        try {
            if (future.isSuccess()) {
                Response response = future.result();
                try {
                    parseHtml(response, attr, callback);
                } catch (Exception e) {
                    logger.error("解析出错", e);
                }
            } else {
                // A failed future is not guaranteed (from what is visible here) to carry a cause.
                Throwable cause = future.cause();
                logger.info("{} 搜索结果访问失败: {}", request.url().url(),
                        cause == null ? null : cause.getMessage());
            }
        } finally {
            // Always balance the increase() above, whatever happened in the listener.
            counter.reduce();
        }
    });
    return counter;
}
private void parseHtml(Response response, Attribution attr,
UrlLiveDataCallback callback) {
/***验证网页是否能够连通*/ /***验证网页是否能够连通*/
boolean f = true; boolean f = true;
if(page!=null){ if(!response.isSuccessful()){
if(page.getStatusCode()==200){ try {
f = matchDel(page); f = matchDel(response.body().string(),attr.get().toString());
}else if(page.getStatusCode()==404){ } catch (IOException e) {
f = true; logger.info("数据判断出错 {}",e.getMessage());
}finally {
if(response != null) {
response.close();
}
}
}else{ }else{
f = false; f = false;
} }
UrlLiveBean ulb = new UrlLiveBean(attr.get().toString(), f);
if (callback == null) {
logger.warn("DataCallback 对象为 null,无法保存数据");
} else {
callback.onData(ulb, attr);
} }
Map<String,Object> data = new HashMap<String,Object>();
data.put("url", page.getUrl().get());
data.put("live", f);
page.putField("urlLive", data);
}
@Override
public Site getSite() {
return site;
} }
/*** /***
* @Title: matchDel * @Title: matchDel
* @author hero * @author hero
...@@ -53,59 +109,59 @@ public class UrlLivePageProcessor implements PageProcessor{ ...@@ -53,59 +109,59 @@ public class UrlLivePageProcessor implements PageProcessor{
* @param @return 设定文件 * @param @return 设定文件
* @return boolean 返回类型 * @return boolean 返回类型
*/ */
public boolean matchDel(Page page){ public boolean matchDel(String result,String url){
int step = 1; int step = 1;
Document doc = page.getHtml().getDocument(); Document doc = Jsoup.parse(result);
if(rulerHead(doc)){ if(rulerHead(doc)){
logger.info("{}检测规则:第{}步",page.getUrl(),step); logger.info("{}检测规则:第{}步",url,step);
return true; return true;
} }
step++; step++;
if (rulerYaoyan(doc)) if (rulerYaoyan(doc))
{ {
logger.info("{}检测规则:第{}步",page.getUrl(),step); logger.info("{}检测规则:第{}步",url,step);
return true; return true;
} }
step++; step++;
if (rulerWeigui(doc)) if (rulerWeigui(doc))
{ {
logger.info("{}检测规则:第{}步",page.getUrl(),step); logger.info("{}检测规则:第{}步",url,step);
return true; return true;
} }
step++; step++;
if (rulerTousu(doc)) if (rulerTousu(doc))
{ {
logger.info("{}检测规则:第{}步",page.getUrl(),step); logger.info("{}检测规则:第{}步",url,step);
return true; return true;
} }
step++; step++;
if (page.getUrl().get().contains("huanqiu.com")) if (url.contains("huanqiu.com"))
{ {
logger.info("{}检测规则:第{}步",page.getUrl(),step); logger.info("{}检测规则:第{}步",url,step);
return rulerHuanqiuWuxiao(doc); return rulerHuanqiuWuxiao(doc);
} }
step++;//7 step++;//7
if (rulerBucunzai(doc)) if (rulerBucunzai(doc))
{ {
logger.info("{}检测规则:第{}步",page.getUrl(),step); logger.info("{}检测规则:第{}步",url,step);
return true; return true;
} }
step++;//8 step++;//8
if (rulerKong(doc)) if (rulerKong(doc))
{ {
logger.info("{}检测规则:第{}步",page.getUrl(),step); logger.info("{}检测规则:第{}步",url,step);
return true; return true;
} }
step++;//9 step++;//9
if (rulerZhaoshang(doc)) if (rulerZhaoshang(doc))
{ {
logger.info("{}检测规则:第{}步",page.getUrl(),step); logger.info("{}检测规则:第{}步",url,step);
return true; return true;
} }
step++;//11 step++;//11
if (rulerYidian(doc)) if (rulerYidian(doc))
{ {
logger.info("{}检测规则:第{}步",page.getUrl(),step); logger.info("{}检测规则:第{}步",url,step);
return true; return true;
} }
return false; return false;
...@@ -302,6 +358,12 @@ public class UrlLivePageProcessor implements PageProcessor{ ...@@ -302,6 +358,12 @@ public class UrlLivePageProcessor implements PageProcessor{
return true; return true;
} }
} }
if (node.outerHtml().contains("meta")) {
String meta = node.toString();
if(meta.contains("公益404页面")) {
return true;
}
}
} }
} catch (Exception e) { } catch (Exception e) {
return false; return false;
......
package com.zhiwei.source_forward.downloader;
import java.io.IOException;
import java.net.UnknownHostException;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.conn.ConnectTimeoutException;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.downloader.AbstractDownloader;
import us.codecraft.webmagic.downloader.HttpClientGenerator;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.utils.HttpConstant;
import us.codecraft.webmagic.utils.UrlUtils;
import us.codecraft.webmagic.utils.WMCollections;
/**
 * Downloader that fetches pages with Apache HttpClient and turns the response into a
 * {@link Page}.
 *
 * <p>Connect timeouts and unknown hosts are reported as a synthetic page with status 404
 * instead of {@code null}, so that processors can still see the URL. One HTTP client is
 * cached per site domain.
 */
public class MyDownLoader extends AbstractDownloader{
    private Logger logger = LoggerFactory.getLogger(getClass());
    private final Map<String, CloseableHttpClient> httpClients = new HashMap<String, CloseableHttpClient>();
    private HttpClientGenerator httpClientGenerator = new HttpClientGenerator();

    /**
     * Returns the cached client for the site's domain, creating it on first use.
     *
     * @param site  site settings; when null a fresh, uncached client is returned
     * @param proxy proxy passed to the client generator; may be null
     * @return an HTTP client for the site
     */
    private CloseableHttpClient getHttpClient(Site site, Proxy proxy) {
        if (site == null) {
            return httpClientGenerator.getClient(null, proxy);
        }
        String domain = site.getDomain();
        // The cache is a plain HashMap, so every access (reads included) happens under the lock.
        synchronized (this) {
            CloseableHttpClient httpClient = httpClients.get(domain);
            if (httpClient == null) {
                httpClient = httpClientGenerator.getClient(site, proxy);
                httpClients.put(domain, httpClient);
            }
            return httpClient;
        }
    }

    /**
     * Downloads the page for {@code request}.
     *
     * @param request request to execute
     * @param task    task supplying the site settings; may be null, in which case only
     *                status 200 is accepted and no site-specific settings are applied
     * @return the downloaded page; a page with status 404 and no text on connect timeout or
     *         unknown host; {@code null} when the status code is not accepted or another I/O
     *         error occurs without cycle retries; otherwise the result of
     *         {@code addToCycleRetry}
     */
    @Override
    public Page download(Request request, Task task){
        Site site = null;
        if (task != null) {
            site = task.getSite();
        }
        Set<Integer> acceptStatCode;
        String charset = null;
        Map<String, String> headers = null;
        if (site != null) {
            acceptStatCode = site.getAcceptStatCode();
            charset = site.getCharset();
            headers = site.getHeaders();
        } else {
            acceptStatCode = WMCollections.newHashSet(200);
        }
        logger.info("downloading page {}", request.getUrl());
        CloseableHttpResponse httpResponse = null;
        int statusCode=0;
        try {
            HttpHost proxyHost = null;
            Proxy proxy = null; //TODO
            // Proxy settings only exist on a site; without one the request goes out directly.
            if (site != null) {
                if (site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) {
                    proxy = site.getHttpProxyFromPool();
                    proxyHost = proxy.getHttpHost();
                } else if(site.getHttpProxy()!= null){
                    proxyHost = site.getHttpProxy();
                }
            }
            // The proxy host (if any) is applied to the request configuration here.
            HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers, proxyHost);
            // The pooled proxy (if any) is handed to the client lookup.
            httpResponse = getHttpClient(site, proxy).execute(httpUriRequest);
            statusCode = httpResponse.getStatusLine().getStatusCode();
            request.putExtra(Request.STATUS_CODE, statusCode);
            if (statusAccept(acceptStatCode, statusCode)) {
                Page page = handleResponse(request, charset, httpResponse, task);
                onSuccess(request);
                return page;
            } else {
                logger.warn("get page {} error, status code {} ",request.getUrl(),statusCode);
                return null;
            }
        } catch (ConnectTimeoutException | UnknownHostException e) {
            // Unreachable hosts are reported as a 404 page rather than dropped.
            logger.warn("download page {} error", request.getUrl(), e);
            onError(request);
            return newNotFoundPage(request);
        } catch (IOException e) {
            logger.warn("download page {} error", request.getUrl(), e);
            if (site != null && site.getCycleRetryTimes() > 0) {
                return addToCycleRetry(request, site);
            }
            onError(request);
            return null;
        } finally {
            request.putExtra(Request.STATUS_CODE, statusCode);
            if (site != null && site.getHttpProxyPool()!=null && site.getHttpProxyPool().isEnable()) {
                site.returnHttpProxyToPool((HttpHost) request.getExtra(Request.PROXY), (Integer) request
                        .getExtra(Request.STATUS_CODE));
            }
            try {
                if (httpResponse != null) {
                    //ensure the connection is released back to pool
                    EntityUtils.consume(httpResponse.getEntity());
                }
            } catch (IOException e) {
                logger.warn("close response fail", e);
            }
        }
    }

    /** Builds the placeholder page (status 404, no text) returned for unreachable hosts. */
    private Page newNotFoundPage(Request request) {
        Page page = new Page();
        page.setStatusCode(404);
        page.setUrl(new PlainText(request.getUrl()));
        page.setRawText(null);
        return page;
    }

    /** Forwards the thread count to the client generator as its pool size. */
    @Override
    public void setThread(int thread) {
        httpClientGenerator.setPoolSize(thread);
    }

    /**
     * @param acceptStatCode accepted status codes
     * @param statusCode     status code of the response
     * @return true when {@code statusCode} is one of the accepted codes
     */
    protected boolean statusAccept(Set<Integer> acceptStatCode, int statusCode) {
        return acceptStatCode.contains(statusCode);
    }

    /**
     * Builds the HttpClient request: method, URI, headers, timeouts, cookie spec and proxy.
     *
     * @param request source request; receives the proxy as an extra when one is used
     * @param site    site providing the timeout; when null no timeouts are set explicitly
     * @param headers headers to add; may be null
     * @param proxy   proxy host; may be null
     * @return the request ready for execution
     */
    protected HttpUriRequest getHttpUriRequest(Request request, Site site, Map<String, String> headers,HttpHost proxy) {
        RequestBuilder requestBuilder = selectRequestMethod(request).setUri(request.getUrl());
        if (headers != null) {
            for (Map.Entry<String, String> headerEntry : headers.entrySet()) {
                requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue());
            }
        }
        @SuppressWarnings("deprecation")
        RequestConfig.Builder requestConfigBuilder = RequestConfig.custom()
                .setCookieSpec(CookieSpecs.BEST_MATCH);
        if (site != null) {
            requestConfigBuilder.setConnectionRequestTimeout(site.getTimeOut())
                    .setSocketTimeout(site.getTimeOut())
                    .setConnectTimeout(site.getTimeOut());
        }
        if (proxy !=null) {
            requestConfigBuilder.setProxy(proxy);
            request.putExtra(Request.PROXY, proxy);
        }
        requestBuilder.setConfig(requestConfigBuilder.build());
        return requestBuilder.build();
    }

    /**
     * Maps the request method name to a {@link RequestBuilder}; a null method means GET.
     * POST requests take their parameters from the {@code nameValuePair} extra.
     *
     * @throws IllegalArgumentException for a method other than GET, POST, HEAD, PUT, DELETE or TRACE
     */
    protected RequestBuilder selectRequestMethod(Request request) {
        String method = request.getMethod();
        if (method == null || method.equalsIgnoreCase(HttpConstant.Method.GET)) {
            //default get
            return RequestBuilder.get();
        } else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) {
            RequestBuilder requestBuilder = RequestBuilder.post();
            NameValuePair[] nameValuePair = (NameValuePair[]) request.getExtra("nameValuePair");
            if (nameValuePair != null && nameValuePair.length > 0) {
                requestBuilder.addParameters(nameValuePair);
            }
            return requestBuilder;
        } else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) {
            return RequestBuilder.head();
        } else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) {
            return RequestBuilder.put();
        } else if (method.equalsIgnoreCase(HttpConstant.Method.DELETE)) {
            return RequestBuilder.delete();
        } else if (method.equalsIgnoreCase(HttpConstant.Method.TRACE)) {
            return RequestBuilder.trace();
        }
        throw new IllegalArgumentException("Illegal HTTP Method " + method);
    }

    /**
     * Converts the HTTP response into a {@link Page} carrying text, URL, request and status.
     *
     * @param task not used by this implementation
     */
    protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
        String content = getContent(charset, httpResponse);
        Page page = new Page();
        page.setRawText(content);
        page.setUrl(new PlainText(request.getUrl()));
        page.setRequest(request);
        page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
        return page;
    }

    /**
     * Reads the response body as text.
     *
     * @param charset      charset to decode with; when null it is detected from the response
     *                     and the platform default is used if detection fails
     * @param httpResponse response to read
     * @return the decoded body, or an empty string when the response has no entity
     */
    protected String getContent(String charset, HttpResponse httpResponse) throws IOException {
        if (httpResponse.getEntity() == null) {
            // Body-less responses (e.g. to HEAD) have nothing to decode.
            return "";
        }
        if (charset == null) {
            byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
            String htmlCharset = getHtmlCharset(httpResponse, contentBytes);
            if (htmlCharset != null) {
                return new String(contentBytes, htmlCharset);
            } else {
                logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset());
                return new String(contentBytes, Charset.defaultCharset());
            }
        } else {
            return IOUtils.toString(httpResponse.getEntity().getContent(), charset);
        }
    }

    /**
     * Detects the page charset from the Content-Type header, then from {@code <meta>} tags.
     *
     * @param httpResponse response whose Content-Type header is inspected, if present
     * @param contentBytes raw body, decoded with the platform default charset to scan meta tags
     * @return the detected charset name, or whatever the header lookup yielded (possibly null)
     *         when no meta tag declares one
     */
    protected String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException {
        String charset = null;
        // charset
        // 1. encoding in http header Content-Type (the header may be absent)
        if (httpResponse.getEntity() != null && httpResponse.getEntity().getContentType() != null) {
            String value = httpResponse.getEntity().getContentType().getValue();
            charset = UrlUtils.getCharset(value);
            if (StringUtils.isNotBlank(charset)) {
                logger.debug("Auto get charset: {}", charset);
                return charset;
            }
        }
        // use default charset to decode first time
        Charset defaultCharset = Charset.defaultCharset();
        String content = new String(contentBytes, defaultCharset);
        // 2. charset in meta
        if (StringUtils.isNotEmpty(content)) {
            Document document = Jsoup.parse(content);
            Elements links = document.select("meta");
            for (Element link : links) {
                // 2.1 html4.01 <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
                String metaContent = link.attr("content");
                String metaCharset = link.attr("charset");
                int charsetAt = metaContent.indexOf("charset");
                if (charsetAt != -1) {
                    String[] parts = metaContent.substring(charsetAt).split("=");
                    // Skip a declaration without a value instead of failing on it.
                    if (parts.length > 1 && StringUtils.isNotBlank(parts[1])) {
                        charset = parts[1].trim();
                        break;
                    }
                }
                // 2.2 html5 <meta charset="UTF-8" />
                else if (StringUtils.isNotEmpty(metaCharset)) {
                    charset = metaCharset;
                    break;
                }
            }
        }
        logger.debug("Auto get charset: {}", charset);
        // 3. todo use tools as cpdetector for content decode
        return charset;
    }
}
package com.zhiwei.source_forward.pipeline;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
/**
 * Pipeline that collects the result maps produced by the page processors, one list per
 * result key ({@code content}, {@code mediaSelf}, {@code sourceForward}, {@code urlLive}).
 *
 * <p>{@link #process} is synchronized because the backing lists are plain
 * {@code ArrayList}s and a spider may run with several worker threads. The getters return
 * the live lists and are meant to be read once the crawl has finished.
 *
 * @author hero
 * @date 2018-06-30 09:54:27
 */
public class DataPipeline implements Pipeline {
    private List<Map<String, Object>> contentDataList = new ArrayList<>();
    private List<Map<String, Object>> mediaSelfDataList = new ArrayList<>();
    private List<Map<String, Object>> sourceForwardDataList = new ArrayList<>();
    private List<Map<String, Object>> urlLivedataList = new ArrayList<>();

    public DataPipeline() {
        super();
    }

    /**
     * Appends each result map present in {@code resultItems} to its matching list.
     *
     * @param resultItems results of one processed page
     * @param task        not used
     */
    @Override
    public synchronized void process(ResultItems resultItems, Task task) {
        Map<String, Object> contentData = resultItems.get("content");
        Map<String, Object> mediaSelfData = resultItems.get("mediaSelf");
        Map<String, Object> sourceForwardData = resultItems.get("sourceForward");
        Map<String, Object> urlLivedata = resultItems.get("urlLive");
        if (contentData != null) {
            contentDataList.add(contentData);
        }
        if (mediaSelfData != null) {
            mediaSelfDataList.add(mediaSelfData);
        }
        if (sourceForwardData != null) {
            sourceForwardDataList.add(sourceForwardData);
        }
        if (urlLivedata != null) {
            urlLivedataList.add(urlLivedata);
        }
    }

    /** @return the collected {@code content} maps */
    public List<Map<String, Object>> getContentDataList() {
        return contentDataList;
    }

    public void setContentDataList(List<Map<String, Object>> contentDataList) {
        this.contentDataList = contentDataList;
    }

    /** @return the collected {@code mediaSelf} maps */
    public List<Map<String, Object>> getMediaSelfDataList() {
        return mediaSelfDataList;
    }

    public void setMediaSelfDataList(List<Map<String, Object>> mediaSelfDataList) {
        this.mediaSelfDataList = mediaSelfDataList;
    }

    /** @return the collected {@code sourceForward} maps */
    public List<Map<String, Object>> getSourceForwardDataList() {
        return sourceForwardDataList;
    }

    public void setSourceForwardDataList(List<Map<String, Object>> sourceForwardDataList) {
        this.sourceForwardDataList = sourceForwardDataList;
    }

    /** @return the collected {@code urlLive} maps */
    public List<Map<String, Object>> getUrlLivedataList() {
        return urlLivedataList;
    }

    public void setUrlLivedataList(List<Map<String, Object>> urlLivedataList) {
        this.urlLivedataList = urlLivedataList;
    }
}
package com.zhiwei.source_forward.run; package com.zhiwei.source_forward.run;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List; import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import com.zhiwei.source_forward.crawler.ContentPageProcessor; import org.apache.logging.log4j.LogManager;
import com.zhiwei.source_forward.downloader.MyDownLoader; import org.apache.logging.log4j.Logger;
import com.zhiwei.source_forward.pipeline.DataPipeline;
import us.codecraft.webmagic.Spider; import com.zhiwei.source_forward.bean.ContentBean;
import com.zhiwei.source_forward.bean.ContentBean.Attribution;
import com.zhiwei.source_forward.crawler.ContentCrawler;
import com.zhiwei.source_forward.util.ContentDataCallback;
public class ContentMatch { public class ContentMatch {
/**
* @Title: getSourceForward
* @author hero private static Logger logger = LogManager.getLogger(ContentMatch.class);
* @Description: 验证文章是否转发
* @param @param dataMap public static List<ContentBean> getContentMatch(List<String> urlList){
* @param @return 设定文件 //启动获取链接来源
* @return Map<String,Map<String,Object>> 返回类型 List<ContentBean> dataList = ContentMatchCrawlerThread.getContentMatch(urlList);
*/ return dataList;
public static Map<String,Map<String,Object>> getContent(Map<String,Map<String,Object>> dataMap){
//启动验证来源程序
DataPipeline pipeline = new DataPipeline();
Spider spider = Spider.create(new ContentPageProcessor());
for(Entry<String,Map<String,Object>> entry : dataMap.entrySet()){
spider.addUrl(entry.getKey());
} }
spider.setDownloader(new MyDownLoader());
spider.addPipeline(pipeline); public static void main(String[] args) {
spider.thread(5).run(); List<String> urlList = new ArrayList<>();
urlList.add("http://sh.qihoo.com/pc/99493b3bf136d8e20?sign=360_e39369d1");
List<Map<String,Object>> contentList = pipeline.getContentDataList(); urlList.add("http://news.ctocio.com.cn/383/14543883.shtml");
for(Map<String,Object> contentMap : contentList){ urlList.add("http://www.jn001.com/news/2018-07/05/content_561091.htm");
String url = contentMap.get("url")+""; urlList.add("http://www.ca800.com/fFa8D/bOTUBC1QfF/40944.aspx");
//搜集原文 urlList.add("http://sh.qihoo.com/pc/988470164f6c5ca14?sign=360_e39369d1");
if(dataMap.containsKey(url)){ urlList.add("http://news.jstv.com/a/20180705/1530731642686.shtml?jsbcApp=1");
Map<String,Object> data = dataMap.get(url); urlList.add("https://tech.sina.cn/i/gn/2018-07-05/detail-ihexfcvi8155439.d.html?pos=18");
String content = contentMap.get("content")+""; urlList.add("http://sh.qihoo.com/pc/983b3d157f91af18b?sign=360_e39369d1");
data.put("content", content); urlList.add("http://china.rednet.cn/c/2018/07/05/4671927.htm");
dataMap.put(url, data); urlList.add("http://news.enorth.com.cn/system/2018/07/05/035782857.shtml");
urlList.add("https://www.toutiao.com/i6573922350037729796/");
urlList.add("http://news.cnhubei.com/xw/sh/201807/t4132048.shtml");
urlList.add("https://www.toutiao.com/a6573774143949373956/");
List<ContentBean> da = ContentMatch.getContentMatch(urlList);
for(ContentBean sfb : da) {
System.out.println(sfb.toString());
} }
} }
return dataMap;
static class ContentMatchCrawlerThread extends Thread{
private static List<ContentBean> getContentMatch(List<String> urlList){
try{
ContentCrawler crawler = new ContentCrawler();
List<ContentBean> list = Collections.synchronizedList(new ArrayList<ContentBean>());
ContentDataCallback callback = new ContentDataCallback() {
@Override
public void onData(ContentBean data, Attribution attr) {
list.add(data);
logger.info("列表大小:::{}",list.size());
} }
};
crawler.submitTask(callback,urlList.toArray(new String[urlList.size()])).await();
return list;
}catch (Exception e){
e.fillInStackTrace();
}
return null;
}
}
} }
package com.zhiwei.source_forward.run;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution;
import com.zhiwei.source_forward.crawler.MediaSelfSourceCrawler;
import com.zhiwei.source_forward.util.MediaSelfSourceDataCallBack;
/**
 * Entry point for resolving the self-media source of a list of article URLs.
 */
public class MediaSelfSource {
    private static Logger logger = LogManager.getLogger(MediaSelfSource.class);

    /**
     * Crawls the given URLs and returns one bean per result delivered by the crawler.
     *
     * @param urlList URLs to resolve
     * @return the collected beans; an empty list (never null) when the crawl fails
     */
    public static List<MediaSelfSourceBean> getMediaSelfSource(List<String> urlList) {
        return MediaSelfSourceCrawlerThread.getMediaSelfSource(urlList);
    }

    /** Manual smoke run against a single sample URL. */
    public static void main(String[] args) {
        List<String> urlList = new ArrayList<>();
        urlList.add("https://baijiahao.baidu.com/s?id=1606950814338460255&wfr=spider&for=pc&qq-pf-to=pcqq.c2c");
        List<MediaSelfSourceBean> da = MediaSelfSource.getMediaSelfSource(urlList);
        for(MediaSelfSourceBean mssb : da) {
            System.out.println(mssb.toString());
        }
    }

    static class MediaSelfSourceCrawlerThread extends Thread{
        /**
         * Submits the URLs to a new crawler, waits on the returned counter and gathers every
         * bean handed to the callback.
         *
         * @param urlList URLs to resolve
         * @return the gathered beans, or an empty list when submitting or waiting fails
         */
        private static List<MediaSelfSourceBean> getMediaSelfSource(List<String> urlList){
            try{
                MediaSelfSourceCrawler crawler = new MediaSelfSourceCrawler();
                // The callback may be invoked from other threads, hence the synchronized list.
                List<MediaSelfSourceBean> list = Collections.synchronizedList(new ArrayList<MediaSelfSourceBean>());
                MediaSelfSourceDataCallBack callback = new MediaSelfSourceDataCallBack() {
                    @Override
                    public void onData(MediaSelfSourceBean data, Attribution attr) {
                        list.add(data);
                        logger.info("列表大小:::{}",list.size());
                    }
                };
                crawler.submitTask(callback,urlList.toArray(new String[urlList.size()])).await();
                return list;
            }catch (Exception e){
                if (e instanceof InterruptedException) {
                    // Preserve the interrupt for whoever owns this thread.
                    Thread.currentThread().interrupt();
                }
                logger.error("自媒体来源抓取失败", e);
            }
            // Callers iterate the result directly, so hand back an empty list rather than null.
            return new ArrayList<>();
        }
    }
}
package com.zhiwei.source_forward.run; package com.zhiwei.source_forward.run;
import java.util.HashMap; import java.util.ArrayList;
import java.util.Collections;
import java.util.List; import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import com.zhiwei.source_forward.crawler.MediaSelfSourcePageProcessor; import org.apache.logging.log4j.LogManager;
import com.zhiwei.source_forward.crawler.SourceForwardPageProcessor; import org.apache.logging.log4j.Logger;
import com.zhiwei.source_forward.downloader.MyDownLoader;
import com.zhiwei.source_forward.pipeline.DataPipeline;
import us.codecraft.webmagic.Spider; import com.zhiwei.source_forward.bean.SourceForwardBean;
import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution;
import com.zhiwei.source_forward.crawler.SourceForwardCrawler;
import com.zhiwei.source_forward.util.SourceForwardDataCallBack;
/** /**
* @ClassName: SourceForward * @ClassName: SourceForward
...@@ -20,149 +20,57 @@ import us.codecraft.webmagic.Spider; ...@@ -20,149 +20,57 @@ import us.codecraft.webmagic.Spider;
*/ */
public class SourceForward { public class SourceForward {
/** private static Logger logger = LogManager.getLogger(SourceForward.class);
* @Title: getSourceForward
* @author hero
* @Description: 验证文章是否转发
* @param @param dataMap
* @param @return 设定文件
* @return Map<String,Map<String,Object>> 返回类型
*/
public static Map<String,Map<String,Object>> getSourceForward(Map<String,Map<String,Object>> dataMap){
//启动验证来源程序
DataPipeline pipeline = new DataPipeline();
Spider spider = Spider.create(new SourceForwardPageProcessor());
for(Entry<String,Map<String,Object>> entry : dataMap.entrySet()){
spider.addUrl(entry.getKey());
}
spider.setDownloader(new MyDownLoader());
spider.addPipeline(pipeline);
spider.thread(5).run();
List<Map<String,Object>> sourceForwardList = pipeline.getSourceForwardDataList(); public static List<SourceForwardBean> getSourceForward(List<String> urlList){
for(Map<String,Object> sourceMap : sourceForwardList){ //启动获取链接来源
String url = sourceMap.get("url")+""; List<SourceForwardBean> dataList = SourceForwardCrawlerThread.getSourceForward(urlList);
String root_source = sourceMap.get("root_source")!=null?sourceMap.get("root_source").toString():null; return dataList;
String isForwardWX = sourceMap.get("isforward")!=null?sourceMap.get("isforward").toString():null;
String channel = sourceMap.get("channel")+"";
//整合数据及验证转发原创
if(dataMap.containsKey(url)){
Map<String,Object> data = dataMap.get(url);
String source = data.get("来源")+"";
String isForward = "转发";
if(root_source == null){
isForward = "原创";
}else if(root_source.toUpperCase().trim().equals(source.toUpperCase().trim())){
isForward = "原创";
} }
if(url.contains("mp.weixin.qq.com")){ public static void main(String[] args) {
isForward = isForwardWX; List<String> urlList = new ArrayList<>();
}else{ urlList.add("http://sh.qihoo.com/pc/99493b3bf136d8e20?sign=360_e39369d1");
data.put("原来源", root_source); urlList.add("http://news.ctocio.com.cn/383/14543883.shtml");
data.put("频道", channel); urlList.add("http://www.jn001.com/news/2018-07/05/content_561091.htm");
urlList.add("http://www.ca800.com/fFa8D/bOTUBC1QfF/40944.aspx");
urlList.add("http://sh.qihoo.com/pc/988470164f6c5ca14?sign=360_e39369d1");
urlList.add("http://news.jstv.com/a/20180705/1530731642686.shtml?jsbcApp=1");
urlList.add("https://tech.sina.cn/i/gn/2018-07-05/detail-ihexfcvi8155439.d.html?pos=18");
urlList.add("http://sh.qihoo.com/pc/983b3d157f91af18b?sign=360_e39369d1");
urlList.add("http://china.rednet.cn/c/2018/07/05/4671927.htm");
urlList.add("http://news.enorth.com.cn/system/2018/07/05/035782857.shtml");
urlList.add("https://www.toutiao.com/i6573922350037729796/");
urlList.add("http://news.cnhubei.com/xw/sh/201807/t4132048.shtml");
urlList.add("https://www.toutiao.com/a6573774143949373956/");
List<SourceForwardBean> da = SourceForward.getSourceForward(urlList);
for(SourceForwardBean sfb : da) {
System.out.println(sfb.toString());
} }
data.put("是否转发", isForward);
dataMap.put(url, data);
}
}
return dataMap;
} }
static class SourceForwardCrawlerThread extends Thread{
private static List<SourceForwardBean> getSourceForward(List<String> urlList){
try{
SourceForwardCrawler crawler = new SourceForwardCrawler();
List<SourceForwardBean> list = Collections.synchronizedList(new ArrayList<SourceForwardBean>());
SourceForwardDataCallBack callback = new SourceForwardDataCallBack() {
@Override
/** public void onData(SourceForwardBean data, Attribution attr) {
* @Title: getMediaSelfSource list.add(data);
* @author hero logger.info("列表大小:::{}",list.size());
* @Description: 根据链接匹配自媒体号名称
* @param @param dataMap
* @param @return 设定文件
* @return Map<String,Map<String,Object>> 返回类型
*/
public static Map<String,Map<String,Object>> getMediaSelfSource(Map<String,Map<String,Object>> dataMap){
//启动验证来源程序
DataPipeline pipeline = new DataPipeline();
Spider spider = Spider.create(new MediaSelfSourcePageProcessor());
for(Entry<String,Map<String,Object>> entry : dataMap.entrySet()){
spider.addUrl(entry.getKey());
} }
spider.setDownloader(new MyDownLoader());
spider.addPipeline(pipeline);
spider.thread(5).run();
List<Map<String,Object>> sourceForwardList = pipeline.getMediaSelfDataList(); };
for(Map<String,Object> sourceMap : sourceForwardList){ crawler.submitTask(callback,urlList.toArray(new String[urlList.size()])).await();
String url = sourceMap.get("url")+""; return list;
//整合数据及验证转发原创 }catch (Exception e){
if(dataMap.containsKey(url)){ e.fillInStackTrace();
Map<String,Object> data = dataMap.get(url);
data.put("自媒体号", sourceMap.get("mediaself"));
data.put("频道", sourceMap.get("channel"));
dataMap.put(url, data);
}
}
return dataMap;
}
/**
* @Title: getMediaSelfSource
* @author hero
* @Description: 根据链接匹配自媒体账号
* @param @param urlList
* @param @return 设定文件
* @return Map<String,String> 返回类型
*/
public static Map<String,String> getMediaSelfSource(List<String> urlList){
//启动验证来源程序
Map<String,String> dataMap = new HashMap<String,String>();
DataPipeline pipeline = new DataPipeline();
Spider spider = Spider.create(new MediaSelfSourcePageProcessor());
for(String url : urlList){
spider.addUrl(url);
dataMap.put(url, null);
}
spider.setDownloader(new MyDownLoader());
spider.addPipeline(pipeline);
spider.thread(5).run();
List<Map<String,Object>> sourceForwardList = pipeline.getMediaSelfDataList();
for(Map<String,Object> sourceMap : sourceForwardList){
String url = sourceMap.get("url")+"";
//整合数据及验证转发原创
if(dataMap.containsKey(url)){
dataMap.put(url, sourceMap.get("mediaself").toString());
}
}
return dataMap;
}
/**
*
* @Title: getMediaSelfSource
* @author hero
* @Description: 根据链接匹配自媒体账号
* @param @param url
* @param @return 设定文件
* @return String 返回类型
*/
public static String getMediaSelfSource(String url){
//启动验证来源程序
DataPipeline pipeline = new DataPipeline();
Spider spider = Spider.create(new MediaSelfSourcePageProcessor());
spider.addUrl(url);
spider.setDownloader(new MyDownLoader());
spider.addPipeline(pipeline);
spider.thread(1).run();
List<Map<String,Object>> sourceForwardList = pipeline.getMediaSelfDataList();
for(Map<String,Object> sourceMap : sourceForwardList){
return sourceMap.get("mediaself").toString();
} }
return null; return null;
} }
}
} }
package com.zhiwei.source_forward.run; package com.zhiwei.source_forward.run;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Map.Entry; import java.util.Map.Entry;
import com.zhiwei.source_forward.crawler.UrlLivePageProcessor; import com.zhiwei.source_forward.bean.UrlLiveBean;
import com.zhiwei.source_forward.pipeline.DataPipeline; import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution;
import com.zhiwei.source_forward.crawler.UrlLiveCrawler;
import us.codecraft.webmagic.Spider; import com.zhiwei.source_forward.util.UrlLiveDataCallback;
/** /**
* @ClassName: URLLive * @ClassName: URLLive
...@@ -17,7 +19,6 @@ import us.codecraft.webmagic.Spider; ...@@ -17,7 +19,6 @@ import us.codecraft.webmagic.Spider;
*/ */
public class URLLive { public class URLLive {
/** /**
* @Title: verificationURLLive * @Title: verificationURLLive
* @author hero * @author hero
...@@ -27,35 +28,60 @@ public class URLLive { ...@@ -27,35 +28,60 @@ public class URLLive {
* @return Map<String,Map<String,Object>> 返回类型 * @return Map<String,Map<String,Object>> 返回类型
*/ */
public static Map<String,Map<String,Object>> verificationURLLive(Map<String,Map<String,Object>> dataMap){ public static Map<String,Map<String,Object>> verificationURLLive(Map<String,Map<String,Object>> dataMap){
List<String> urlList = new ArrayList<>();
//启动验证链接是否有效程序程序 //启动验证链接是否有效程序程序
DataPipeline pipeline = new DataPipeline();
Spider spider = Spider.create(new UrlLivePageProcessor());
for(Entry<String,Map<String,Object>> entry : dataMap.entrySet()){ for(Entry<String,Map<String,Object>> entry : dataMap.entrySet()){
spider.addUrl(entry.getKey()); urlList.add(entry.getKey());
} }
spider.addPipeline(pipeline);
spider.thread(5).run();
//验证数据是否已删除 //验证数据是否已删除
List<Map<String,Object>> dataList = pipeline.getUrlLivedataList(); List<UrlLiveBean> dataList = UrlLiveCrawlerThread.getUrlLiveCrawle(urlList);
for(Map<String,Object> data : dataList){ for(UrlLiveBean ub : dataList){
String url = data.get("url")+""; String url = ub.getUrl();
if(!url.contains("http")){ if(!url.contains("http")){
url = "http://"+url; url = "http://"+url;
} }
if(!url.contains("www")){ if(!url.contains("www")){
url = url.replace("://", "://www."); url = url.replace("://", "://www.");
} }
boolean live = (boolean)data.get("live"); boolean live = ub.isLive();
if(dataMap.containsKey(url)){ if(dataMap.containsKey(url)){
Map<String,Object> map = dataMap.get(url); Map<String,Object> map = dataMap.get(url);
map.put("是否删除", live); map.put("是否删除", live);
dataMap.put(url, map); dataMap.put(url, map);
} }
} }
return dataMap; return dataMap;
} }
/**
 * Checks whether each of the given links is still reachable.
 *
 * @param urlList links to verify
 * @return the result list produced by {@code UrlLiveCrawlerThread.getUrlLiveCrawle}
 */
public static List<UrlLiveBean> verificationURLLive(List<String> urlList){
    // Delegate straight to the crawler helper; no post-processing is applied here.
    return UrlLiveCrawlerThread.getUrlLiveCrawle(urlList);
}
/**
 * Helper that drives a {@code UrlLiveCrawler} over a batch of URLs and gathers
 * the beans it reports through a {@code UrlLiveDataCallback}.
 */
static class UrlLiveCrawlerThread extends Thread{
    /**
     * Submits all URLs to a new crawler, waits for the task to finish and returns
     * the collected results.
     *
     * <p>On failure the error is reported and whatever was collected so far is
     * returned; the result is never {@code null}, so callers can iterate it directly.
     *
     * @param urlList links to check
     * @return synchronized list of the beans delivered by the crawler (possibly empty)
     */
    private static List<UrlLiveBean> getUrlLiveCrawle(List<String> urlList){
        // Declared outside the try block so partial results survive a failure.
        final List<UrlLiveBean> list = Collections.synchronizedList(new ArrayList<UrlLiveBean>());
        try{
            UrlLiveCrawler crawler = new UrlLiveCrawler();
            UrlLiveDataCallback callback = new UrlLiveDataCallback() {
                @Override
                public void onData(UrlLiveBean data, Attribution attr) {
                    list.add(data);
                    System.out.println("列表大小:::"+list.size());
                }
            };
            crawler.submitTask(callback,urlList.toArray(new String[urlList.size()])).await();
        }catch (Exception e){
            if(e instanceof InterruptedException){
                // Keep the interrupt visible to the caller instead of swallowing it.
                Thread.currentThread().interrupt();
            }
            // The original called fillInStackTrace(), which reports nothing.
            e.printStackTrace();
        }
        return list;
    }
}
} }
package com.zhiwei.source_forward.spider;
import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.List;
import java.util.UUID;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;
import org.apache.commons.collections.CollectionUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.SpiderListener;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.pipeline.CollectorPipeline;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.pipeline.ResultItemsCollectorPipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.scheduler.Scheduler;
import us.codecraft.webmagic.thread.CountableThreadPool;
import us.codecraft.webmagic.utils.UrlUtils;
import us.codecraft.webmagic.utils.WMCollections;
public class MySpider implements Runnable, Task {
protected Downloader downloader;
protected List<Pipeline> pipelines = new ArrayList<Pipeline>();
protected PageProcessor pageProcessor;
protected List<Request> startRequests;
protected Site site;
protected String uuid;
protected Scheduler scheduler = new QueueScheduler();
protected Logger logger = LoggerFactory.getLogger(getClass());
protected CountableThreadPool threadPool;
protected ExecutorService executorService;
protected int threadNum = 1;
protected AtomicInteger stat = new AtomicInteger(STAT_INIT);
protected boolean exitWhenComplete = true;
protected final static int STAT_INIT = 0;
protected final static int STAT_RUNNING = 1;
protected final static int STAT_STOPPED = 2;
protected boolean spawnUrl = true;
protected boolean destroyWhenExit = true;
private ReentrantLock newUrlLock = new ReentrantLock();
private Condition newUrlCondition = newUrlLock.newCondition();
private List<SpiderListener> spiderListeners;
private final AtomicLong pageCount = new AtomicLong(0);
private Date startTime;
private int emptySleepTime = 30000;
/**
* create a spider with pageProcessor.
*
* @param pageProcessor pageProcessor
* @return new spider
* @see PageProcessor
*/
public static MySpider create(PageProcessor pageProcessor) {
return new MySpider(pageProcessor);
}
/**
* create a spider with pageProcessor.
*
* @param pageProcessor pageProcessor
*/
public MySpider(PageProcessor pageProcessor) {
this.pageProcessor = pageProcessor;
this.site = pageProcessor.getSite();
this.startRequests = pageProcessor.getSite().getStartRequests();
}
/**
* Set startUrls of Spider.<br>
* Prior to startUrls of Site.
*
* @param startUrls startUrls
* @return this
*/
public MySpider startUrls(List<String> startUrls) {
checkIfRunning();
this.startRequests = UrlUtils.convertToRequests(startUrls);
return this;
}
/**
* Set startUrls of Spider.<br>
* Prior to startUrls of Site.
*
* @param startRequests startRequests
* @return this
*/
public MySpider startRequest(List<Request> startRequests) {
checkIfRunning();
this.startRequests = startRequests;
return this;
}
/**
* Set an uuid for spider.<br>
* Default uuid is domain of site.<br>
*
* @param uuid uuid
* @return this
*/
public MySpider setUUID(String uuid) {
this.uuid = uuid;
return this;
}
/**
* set scheduler for Spider
*
* @param scheduler scheduler
* @return this
* @see #setScheduler(us.codecraft.webmagic.scheduler.Scheduler)
*/
@Deprecated
public MySpider scheduler(Scheduler scheduler) {
return setScheduler(scheduler);
}
/**
 * Replaces the scheduler used by this spider.
 *
 * <p>Requests still queued in the previous scheduler are polled out one by one
 * and pushed into the new scheduler.
 *
 * @param scheduler scheduler
 * @return this
 * @see Scheduler
 * @since 0.2.1
 */
public MySpider setScheduler(Scheduler scheduler) {
    checkIfRunning();
    final Scheduler previous = this.scheduler;
    this.scheduler = scheduler;
    if (previous == null) {
        return this;
    }
    // Drain the old queue into the new scheduler.
    for (Request pending = previous.poll(this); pending != null; pending = previous.poll(this)) {
        this.scheduler.push(pending, this);
    }
    return this;
}
/**
 * Adds a pipeline to this spider.
 *
 * @param pipeline pipeline
 * @return this
 * @see #addPipeline(us.codecraft.webmagic.pipeline.Pipeline)
 * @deprecated use {@link #addPipeline(Pipeline)} instead; this method only delegates to it
 */
@Deprecated
public MySpider pipeline(Pipeline pipeline) {
    return addPipeline(pipeline);
}
/**
* add a pipeline for Spider
*
* @param pipeline pipeline
* @return this
* @see Pipeline
* @since 0.2.1
*/
public MySpider addPipeline(Pipeline pipeline) {
checkIfRunning();
this.pipelines.add(pipeline);
return this;
}
/**
* set pipelines for Spider
*
* @param pipelines pipelines
* @return this
* @see Pipeline
* @since 0.4.1
*/
public MySpider setPipelines(List<Pipeline> pipelines) {
checkIfRunning();
this.pipelines = pipelines;
return this;
}
/**
* clear the pipelines set
*
* @return this
*/
public MySpider clearPipeline() {
pipelines = new ArrayList<Pipeline>();
return this;
}
/**
 * Sets the downloader of this spider.
 *
 * @param downloader downloader
 * @return this
 * @see #setDownloader(us.codecraft.webmagic.downloader.Downloader)
 * @deprecated use {@link #setDownloader(Downloader)} instead; this method only delegates to it
 */
@Deprecated
public MySpider downloader(Downloader downloader) {
    return setDownloader(downloader);
}
/**
* set the downloader of spider
*
* @param downloader downloader
* @return this
* @see Downloader
*/
public MySpider setDownloader(Downloader downloader) {
checkIfRunning();
this.downloader = downloader;
return this;
}
/**
 * Fills in default components and prepares the spider for a run.
 *
 * <p>Installs an {@code HttpClientDownloader} and a {@code ConsolePipeline} when none
 * were configured, (re)creates the worker pool if it is missing or shut down, moves
 * the start requests into the scheduler and records the start time.
 */
protected void initComponent() {
    if (downloader == null) {
        this.downloader = new HttpClientDownloader();
    }
    if (pipelines.isEmpty()) {
        pipelines.add(new ConsolePipeline());
    }
    downloader.setThread(threadNum);

    final boolean poolUnusable = threadPool == null || threadPool.isShutdown();
    if (poolUnusable) {
        // Prefer the caller-supplied executor while it is still alive.
        final boolean executorUsable = executorService != null && !executorService.isShutdown();
        threadPool = executorUsable
                ? new CountableThreadPool(threadNum, executorService)
                : new CountableThreadPool(threadNum);
    }

    if (startRequests != null) {
        for (Request seed : startRequests) {
            addRequest(seed);
        }
        startRequests.clear();
    }
    startTime = new Date();
}
/**
 * Main crawl loop.
 *
 * <p>Switches the spider to the running state (throwing if it is already running),
 * initializes its components, then polls the scheduler until the current thread is
 * interrupted or the state leaves {@code STAT_RUNNING}. Each polled request is handed
 * to the worker pool. When the loop ends the state becomes {@code STAT_STOPPED} and,
 * if {@code destroyWhenExit} is set, {@link #close()} is called.
 *
 * <p>NOTE(review): {@code processRequest} calls {@code onError} itself and returns
 * normally when the download yields no page, so {@code onSuccess} below is also
 * invoked for that request -- confirm whether listeners are meant to see both.
 */
@Override
public void run() {
checkRunningStat();
initComponent();
logger.info("Spider " + getUUID() + " started!");
while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) {
Request request = scheduler.poll(this);
if (request == null) {
// Queue is empty: finish only when no worker is still busy and the spider
// is configured to exit on completion.
if (threadPool.getThreadAlive() == 0 && exitWhenComplete) {
break;
}
// wait until new url added
waitNewUrl();
} else {
final Request requestFinal = request;
threadPool.execute(new Runnable() {
@Override
public void run() {
try {
processRequest(requestFinal);
onSuccess(requestFinal);
} catch (Exception e) {
onError(requestFinal);
logger.error("process request " + requestFinal + " error", e);
} finally {
// Count every attempt, then wake the polling loop, which may be
// waiting in waitNewUrl().
pageCount.incrementAndGet();
signalNewUrl();
}
}
});
}
}
stat.set(STAT_STOPPED);
// release some resources
if (destroyWhenExit) {
close();
}
}
/**
 * Notifies every registered listener that the request failed.
 *
 * @param request the request that failed
 */
protected void onError(Request request) {
    if (!CollectionUtils.isNotEmpty(spiderListeners)) {
        return;
    }
    for (SpiderListener listener : spiderListeners) {
        listener.onError(request);
    }
}
/**
 * Notifies every registered listener that the request was processed.
 *
 * @param request the request that completed
 */
protected void onSuccess(Request request) {
    if (!CollectionUtils.isNotEmpty(spiderListeners)) {
        return;
    }
    for (SpiderListener listener : spiderListeners) {
        listener.onSuccess(request);
    }
}
/**
 * Atomically moves the spider into {@code STAT_RUNNING}.
 *
 * @throws IllegalStateException if the spider is already running
 */
private void checkRunningStat() {
    int observed;
    do {
        observed = stat.get();
        if (observed == STAT_RUNNING) {
            throw new IllegalStateException("Spider is already running!");
        }
        // Retry if another thread changed the state between the read and the CAS.
    } while (!stat.compareAndSet(observed, STAT_RUNNING));
}
/**
 * Releases the resources held by this spider: closes the downloader, page processor,
 * scheduler and every pipeline that implements {@code Closeable}, then shuts down
 * the worker pool.
 *
 * <p>Safe to call before the spider has ever run: the worker pool is only created
 * by {@code initComponent()}, so it may still be {@code null} here.
 */
public void close() {
    destroyEach(downloader);
    destroyEach(pageProcessor);
    destroyEach(scheduler);
    for (Pipeline pipeline : pipelines) {
        destroyEach(pipeline);
    }
    if (threadPool != null) {
        threadPool.shutdown();
    }
}
/**
 * Closes the given component if it implements {@code Closeable}; other objects
 * (including {@code null}) are ignored.
 *
 * <p>A failure to close is logged through the class logger rather than printed to
 * stderr, and does not prevent the remaining components from being closed.
 *
 * @param object component to close
 */
private void destroyEach(Object object) {
    if (object instanceof Closeable) {
        try {
            ((Closeable) object).close();
        } catch (IOException e) {
            logger.warn("close " + object.getClass().getName() + " error", e);
        }
    }
}
/**
 * Processes the given urls one after another on the calling thread, without
 * going through the scheduler or the worker pool.
 *
 * @param urls urls to process
 */
public void test(String... urls) {
    initComponent();
    for (int i = 0; i < urls.length; i++) {
        processRequest(new Request(urls[i]));
    }
}
/**
 * Downloads one request and runs it through the page processor and the pipelines.
 *
 * <p>A missing page is reported through {@code onError}; a page flagged for cycle
 * retry only has its target requests re-queued. Each path ends with the
 * corresponding politeness sleep configured on the site.
 *
 * @param request the request to process
 */
protected void processRequest(Request request) {
    final Page downloaded = downloader.download(request, this);

    // Download failed outright.
    if (downloaded == null) {
        sleep(site.getSleepTime());
        onError(request);
        return;
    }

    // for cycle retry
    if (downloaded.isNeedCycleRetry()) {
        extractAndAddRequests(downloaded, true);
        sleep(site.getRetrySleepTime());
        return;
    }

    pageProcessor.process(downloaded);
    extractAndAddRequests(downloaded, spawnUrl);

    final boolean skipPipelines = downloaded.getResultItems().isSkip();
    if (!skipPipelines) {
        for (Pipeline sink : pipelines) {
            sink.process(downloaded.getResultItems(), this);
        }
    }

    //for proxy status management
    request.putExtra(Request.STATUS_CODE, downloaded.getStatusCode());
    sleep(site.getSleepTime());
}
/**
 * Pauses the current thread for the given number of milliseconds.
 *
 * <p>If the thread is interrupted while sleeping, the pause ends early and the
 * interrupt status is restored so that code further up the stack can still observe it.
 *
 * @param time pause length in milliseconds
 */
protected void sleep(int time) {
    try {
        Thread.sleep(time);
    } catch (InterruptedException e) {
        logger.warn("sleep interrupted", e);
        // Do not swallow the interrupt: re-assert it for the caller.
        Thread.currentThread().interrupt();
    }
}
/**
 * Queues the target requests discovered on a page.
 *
 * @param page     the processed page
 * @param spawnUrl when {@code false} nothing is queued
 */
protected void extractAndAddRequests(Page page, boolean spawnUrl) {
    if (!spawnUrl) {
        return;
    }
    if (!CollectionUtils.isNotEmpty(page.getTargetRequests())) {
        return;
    }
    for (Request discovered : page.getTargetRequests()) {
        addRequest(discovered);
    }
}
/**
 * Pushes a single request into the scheduler.
 *
 * <p>If the site has no domain yet, it is derived from the first request that
 * carries a URL.
 *
 * @param request the request to queue
 */
private void addRequest(Request request) {
    final boolean domainUnset = site.getDomain() == null;
    if (domainUnset && request != null && request.getUrl() != null) {
        final String derivedDomain = UrlUtils.getDomain(request.getUrl());
        site.setDomain(derivedDomain);
    }
    scheduler.push(request, this);
}
protected void checkIfRunning() {
if (stat.get() == STAT_RUNNING) {
throw new IllegalStateException("Spider is already running!");
}
}
public void runAsync() {
Thread thread = new Thread(this);
thread.setDaemon(false);
thread.start();
}
/**
* Add urls to crawl. <br>
*
* @param urls urls
* @return this
*/
public MySpider addUrl(String... urls) {
for (String url : urls) {
addRequest(new Request(url));
}
signalNewUrl();
return this;
}
/**
 * Downloads the given urls synchronously and returns what was collected.
 *
 * <p>For the duration of the call the spider neither follows discovered links nor
 * destroys its components on exit. Both flags are set back to {@code true} afterwards,
 * even when the run fails with an exception.
 *
 * <p>NOTE(review): the collector pipeline added here stays registered in
 * {@code pipelines} after the call, so repeated calls accumulate collectors --
 * confirm whether that is intended.
 *
 * @param urls urls
 * @param <T>  element type produced by the collector pipeline
 * @return list downloaded
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
public <T> List<T> getAll(Collection<String> urls) {
    destroyWhenExit = false;
    spawnUrl = false;
    try {
        // initComponent() treats startRequests as nullable, so do the same here.
        if (startRequests != null) {
            startRequests.clear();
        }
        for (Request request : UrlUtils.convertToRequests(urls)) {
            addRequest(request);
        }
        CollectorPipeline collectorPipeline = getCollectorPipeline();
        pipelines.add(collectorPipeline);
        run();
        return collectorPipeline.getCollected();
    } finally {
        // Restore the defaults on every exit path, not only on success.
        spawnUrl = true;
        destroyWhenExit = true;
    }
}
@SuppressWarnings("rawtypes")
protected CollectorPipeline getCollectorPipeline() {
return new ResultItemsCollectorPipeline();
}
/**
 * Downloads a single url synchronously.
 *
 * @param url the url to download
 * @param <T> element type produced by the collector pipeline
 * @return the first collected result, or {@code null} when nothing was collected
 */
public <T> T get(String url) {
    List<String> single = WMCollections.newArrayList(url);
    List<T> collected = getAll(single);
    final boolean hasResult = collected != null && collected.size() > 0;
    return hasResult ? collected.get(0) : null;
}
/**
* Add urls with information to crawl.<br>
*
* @param requests requests
* @return this
*/
public MySpider addRequest(Request... requests) {
for (Request request : requests) {
addRequest(request);
}
signalNewUrl();
return this;
}
/**
 * Blocks the polling loop until a new url is signalled or {@code emptySleepTime}
 * milliseconds have passed.
 *
 * <p>Returns immediately when no worker is alive and the spider should exit on
 * completion. If the wait is interrupted, the interrupt status is restored so the
 * {@code run()} loop, which checks it, can terminate.
 */
private void waitNewUrl() {
    newUrlLock.lock();
    try {
        //double check
        if (threadPool.getThreadAlive() == 0 && exitWhenComplete) {
            return;
        }
        newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS);
    } catch (InterruptedException e) {
        // The exception is passed as the throwable argument; the former "{}"
        // placeholder was never filled in.
        logger.warn("waitNewUrl - interrupted", e);
        Thread.currentThread().interrupt();
    } finally {
        newUrlLock.unlock();
    }
}
/**
 * Wakes up every thread waiting in {@code waitNewUrl()}.
 */
private void signalNewUrl() {
    // Acquire outside the try block: if lock() itself failed, the finally clause
    // must not try to unlock a lock this thread does not hold.
    newUrlLock.lock();
    try {
        newUrlCondition.signalAll();
    } finally {
        newUrlLock.unlock();
    }
}
public void start() {
runAsync();
}
/**
 * Requests the spider to stop by switching its state from running to stopped,
 * and logs whether the switch took place.
 */
public void stop() {
    final boolean switched = stat.compareAndSet(STAT_RUNNING, STAT_STOPPED);
    final String outcome = switched ? " stop success!" : " stop fail!";
    logger.info("Spider " + getUUID() + outcome);
}
/**
 * Sets the number of worker threads.
 *
 * <p>The value is validated before it is stored, so a rejected value leaves the
 * previous thread count in place.
 *
 * @param threadNum threadNum, must be positive
 * @return this
 * @throws IllegalStateException    if the spider is already running
 * @throws IllegalArgumentException if {@code threadNum} is zero or negative
 */
public MySpider thread(int threadNum) {
    checkIfRunning();
    if (threadNum <= 0) {
        throw new IllegalArgumentException("threadNum should be more than one!");
    }
    this.threadNum = threadNum;
    return this;
}
/**
 * Sets the number of worker threads and the executor service they run on.
 *
 * <p>The executor is stored so that {@code initComponent()} builds the worker pool
 * on top of it; previously the argument was silently dropped. The thread count is
 * validated before anything is stored.
 *
 * @param executorService executorService to run the spider
 * @param threadNum threadNum, must be positive
 * @return this
 * @throws IllegalStateException    if the spider is already running
 * @throws IllegalArgumentException if {@code threadNum} is zero or negative
 */
public MySpider thread(ExecutorService executorService, int threadNum) {
    checkIfRunning();
    if (threadNum <= 0) {
        throw new IllegalArgumentException("threadNum should be more than one!");
    }
    this.threadNum = threadNum;
    this.executorService = executorService;
    return this;
}
public boolean isExitWhenComplete() {
return exitWhenComplete;
}
/**
* Exit when complete. <br>
* True: exit when all url of the site is downloaded. <br>
* False: not exit until call stop() manually.<br>
*
* @param exitWhenComplete exitWhenComplete
* @return this
*/
public MySpider setExitWhenComplete(boolean exitWhenComplete) {
this.exitWhenComplete = exitWhenComplete;
return this;
}
public boolean isSpawnUrl() {
return spawnUrl;
}
/**
* Get page count downloaded by spider.
*
* @return total downloaded page count
* @since 0.4.1
*/
public long getPageCount() {
return pageCount.get();
}
/**
* Get running status by spider.
*
* @return running status
* @see Status
* @since 0.4.1
*/
public Status getStatus() {
return Status.fromValue(stat.get());
}
/**
 * Running state of a spider, paired with the integer code stored in the state field.
 */
public enum Status {
    Init(0), Running(1), Stopped(2);

    private int value;

    private Status(int value) {
        this.value = value;
    }

    int getValue() {
        return value;
    }

    /**
     * Resolves an integer code to its status.
     *
     * @param value integer code
     * @return the matching status, or {@code Init} when the code is unknown
     */
    public static Status fromValue(int value) {
        Status resolved = Init; //default value
        for (Status candidate : values()) {
            if (candidate.getValue() == value) {
                resolved = candidate;
                break;
            }
        }
        return resolved;
    }
}
/**
 * Reports how many worker threads are currently busy.
 *
 * @return thread count which is running, or 0 when no worker pool exists yet
 * @since 0.4.1
 */
public int getThreadAlive() {
    return threadPool == null ? 0 : threadPool.getThreadAlive();
}
/**
* Whether add urls extracted to download.<br>
* Add urls to download when it is true, and just download seed urls when it is false. <br>
* DO NOT set it unless you know what it means!
*
* @param spawnUrl spawnUrl
* @return this
* @since 0.4.0
*/
public MySpider setSpawnUrl(boolean spawnUrl) {
this.spawnUrl = spawnUrl;
return this;
}
@Override
public String getUUID() {
if (uuid != null) {
return uuid;
}
if (site != null) {
return site.getDomain();
}
uuid = UUID.randomUUID().toString();
return uuid;
}
public MySpider setExecutorService(ExecutorService executorService) {
checkIfRunning();
this.executorService = executorService;
return this;
}
@Override
public Site getSite() {
return site;
}
public List<SpiderListener> getSpiderListeners() {
return spiderListeners;
}
public MySpider setSpiderListeners(List<SpiderListener> spiderListeners) {
this.spiderListeners = spiderListeners;
return this;
}
public Date getStartTime() {
return startTime;
}
public Scheduler getScheduler() {
return scheduler;
}
/**
* Set wait time when no url is polled.<br><br>
*
* @param emptySleepTime In MILLISECONDS.
*/
public void setEmptySleepTime(int emptySleepTime) {
this.emptySleepTime = emptySleepTime;
}
}
package com.zhiwei.source_forward.util;
import com.zhiwei.source_forward.bean.ContentBean;
import com.zhiwei.source_forward.bean.ContentBean.Attribution;
/**
 * Callback through which a {@link ContentBean} result is handed over.
 */
public interface ContentDataCallback {
/**
 * Receives one result.
 *
 * @param data the delivered bean
 * @param attr attribution accompanying the bean -- NOTE(review): its exact meaning is
 *             defined by {@code ContentBean.Attribution}, which is not visible here
 */
void onData(ContentBean data, Attribution attr);
}
...@@ -5,8 +5,8 @@ import org.jsoup.nodes.Document; ...@@ -5,8 +5,8 @@ import org.jsoup.nodes.Document;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import cn.edu.hfut.dmic.contentextractor.ContentExtractor; import com.zhiwei.source_forward.content.ContentExtractor;
import cn.edu.hfut.dmic.contentextractor.News; import com.zhiwei.source_forward.content.News;
/** /**
* @ClassName: MatchChannel * @ClassName: MatchChannel
......
...@@ -5,8 +5,8 @@ import java.util.List; ...@@ -5,8 +5,8 @@ import java.util.List;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import cn.edu.hfut.dmic.contentextractor.ContentExtractor; import com.zhiwei.source_forward.content.ContentExtractor;
import cn.edu.hfut.dmic.contentextractor.News; import com.zhiwei.source_forward.content.News;
/** /**
* @ClassName: MatchSource * @ClassName: MatchSource
......
package com.zhiwei.source_forward.util;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean;
import com.zhiwei.source_forward.bean.MediaSelfSourceBean.Attribution;
/**
 * Callback through which a {@link MediaSelfSourceBean} result is handed over.
 */
public interface MediaSelfSourceDataCallBack {
/**
 * Receives one result.
 *
 * @param data the delivered bean
 * @param attr attribution accompanying the bean -- NOTE(review): its exact meaning is
 *             defined by {@code MediaSelfSourceBean.Attribution}, which is not visible here
 */
void onData(MediaSelfSourceBean data, Attribution attr);
}
package com.zhiwei.source_forward.util;
import java.net.Proxy;
import com.zhiwei.proxy.common.Definition.GroupType;
import com.zhiwei.proxy.core.ProxyClient;
import com.zhiwei.proxy.core.ProxyClientFactory;
/**
 * Provides a lazily created, process-wide {@code ProxyClient} and a shortcut for
 * obtaining a NAT proxy from it.
 */
public class ProxyClientUtil {

    /** Shared client; written once under the class lock, read without locking. */
    private static volatile ProxyClient client;

    /**
     * Obtains a NAT proxy from the shared client.
     *
     * @return the proxy returned by {@code ProxyClient.getNATProxy()}
     */
    public static Proxy getNATProxy(){
        ProxyClient shared = getClient();
        return shared.getNATProxy();
    }

    /**
     * Returns the shared client, building it on first use (double-checked locking
     * on the volatile field).
     *
     * @return the shared {@code ProxyClient}
     */
    public static ProxyClient getClient() {
        ProxyClient current = client;
        if (current != null) {
            return current;
        }
        synchronized (ProxyClientUtil.class) {
            current = client;
            if (current == null) {
                current = ProxyClientFactory.build("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER);
                client = current;
            }
            return current;
        }
    }
}
...@@ -9,7 +9,8 @@ import java.util.List; ...@@ -9,7 +9,8 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Map.Entry; import java.util.Map.Entry;
import com.zhiwei.zhiweiTools.order.TreatOrder; import com.zhiwei.tools.order.TreatOrder;
/** /**
* @ClassName: SourceData * @ClassName: SourceData
...@@ -82,7 +83,7 @@ public class SourceData { ...@@ -82,7 +83,7 @@ public class SourceData {
public static List<String> getSourceList(){ public static List<String> getSourceList(){
List<String> result = null; List<String> result = null;
if(sourceMap!=null && sourceMap.size()>0){ if(sourceMap!=null && sourceMap.size()>0){
result = new ArrayList<String>(); result = new ArrayList<>();
List<Entry<String,Integer>> dataList = TreatOrder.treatOrderByCountDesc(sourceMap); List<Entry<String,Integer>> dataList = TreatOrder.treatOrderByCountDesc(sourceMap);
for(Entry<String,Integer> entry : dataList){ for(Entry<String,Integer> entry : dataList){
result.add(entry.getKey()); result.add(entry.getKey());
......
package com.zhiwei.source_forward.util;
import com.zhiwei.source_forward.bean.SourceForwardBean;
import com.zhiwei.source_forward.bean.SourceForwardBean.Attribution;
/**
 * Callback through which a {@link SourceForwardBean} result is handed over.
 */
public interface SourceForwardDataCallBack {
/**
 * Invoked when a piece of data is delivered.
 *
 * @param data the delivered bean
 * @param attr attribution accompanying the bean -- NOTE(review): its exact meaning is
 *             defined by {@code SourceForwardBean.Attribution}, which is not visible here
 */
void onData(SourceForwardBean data, Attribution attr);
}
/**
 * @Title: UrlLiveDataCallback.java
 * @Package com.zhiwei.source_forward.util
 * @author 0xff
 * @date 2018年6月29日 下午4:44:38
 */
package com.zhiwei.source_forward.util;
import com.zhiwei.source_forward.bean.UrlLiveBean;
import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution;
/**
 * @ClassName: UrlLiveDataCallback
 * @Description: Callback interface for saving whether a link has been deleted
 * @author 0xff
 * @date 2018-06-29 16:44:38
 */
public interface UrlLiveDataCallback {
/**
 * Invoked when a piece of data is delivered.
 *
 * @param data the delivered bean
 * @param attr attribution accompanying the bean -- NOTE(review): its exact meaning is
 *             defined by {@code UrlLiveBean.Attribution}, which is not visible here
 */
void onData(UrlLiveBean data, Attribution attr);
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- log4j2 自身的日志级别 -->
<Configuration status="WARN">
<properties>
<property name="LOG_HOME">Log/</property>
<property name="LOG_FILE">crawler</property>
</properties>
<Appenders>
<!-- 定义日志输出地 -->
<Console name="Console" target="SYSTEM_OUT">
<PatternLayout pattern="%d{yyyy-MM-dd HH:mm:ss.SSS} %-5level %logger{36} - %msg%n" />
</Console>
<RollingRandomAccessFile name="LogFile"
fileName="${LOG_HOME}/${LOG_FILE}.log"
filePattern="${LOG_HOME}/$${date:yyyy-MM}/${LOG_FILE}-%d{yyyy-MM-dd}-%i.log">
<PatternLayout
pattern="%d{yyyy-MM-dd HH:mm:ss.SSS} %-5level %logger{36} - %msg%n" />
<Policies>
<TimeBasedTriggeringPolicy interval="1" />
<SizeBasedTriggeringPolicy size="20 MB" />
</Policies>
<DefaultRolloverStrategy max="20" />
</RollingRandomAccessFile>
</Appenders>
<Loggers>
<Root level="all">
<AppenderRef ref="Console" level="info" />
<AppenderRef ref="LogFile" level="info" />
</Root>
</Loggers>
</Configuration>
\ No newline at end of file
package com.zhiwei.source_forward.sourceforward.test; //package com.zhiwei.source_forward.sourceforward.test;
//
import java.util.ArrayList; //import java.util.ArrayList;
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
import java.util.Map.Entry; //import java.util.Map.Entry;
//
import org.junit.Test; //import org.junit.Test;
//
import com.zhiwei.source_forward.run.URLLive; //import com.zhiwei.source_forward.run.URLLive;
//
/** ///**
* @ClassName: URLLiveTest // * @ClassName: URLLiveTest
* @Description: 验证链接有效性 // * @Description: 验证链接有效性
* @author hero // * @author hero
* @date 2017年12月6日 下午1:30:26 // * @date 2017年12月6日 下午1:30:26
*/ // */
public class URLLiveTest { //public class URLLiveTest {
//
//
// @Test //// @Test
// public void urlLiveTest(){ //// public void urlLiveTest(){
// String path = "E://稿件汇总网媒数据//福莱网媒.xlsx"; //// String path = "E://稿件汇总网媒数据//福莱网媒.xlsx";
// PoiExcelUtil poi = PoiExcelUtil.getInstance(); //// PoiExcelUtil poi = PoiExcelUtil.getInstance();
// Map<String,Object> data = poi.importExcel(path, 0); //// Map<String,Object> data = poi.importExcel(path, 0);
// @SuppressWarnings("unchecked") //// @SuppressWarnings("unchecked")
// List<String> headList = (List<String>)data.get("head"); //// List<String> headList = (List<String>)data.get("head");
// headList.add("是否删除"); //// headList.add("是否删除");
// @SuppressWarnings("unchecked") //// @SuppressWarnings("unchecked")
// List<Map<String,Object>> dataList = (List<Map<String,Object>>)data.get("body"); //// List<Map<String,Object>> dataList = (List<Map<String,Object>>)data.get("body");
// Map<String,Map<String,Object>> dataMap = ReadMediaData.getUrlLive(dataList); //// Map<String,Map<String,Object>> dataMap = ReadMediaData.getUrlLive(dataList);
// dataMap = URLLive.verificationURLLive(dataMap); //// dataMap = URLLive.verificationURLLive(dataMap);
// ////
// List<Map<String,Object>> bodyList = new ArrayList<>(); //// List<Map<String,Object>> bodyList = new ArrayList<>();
// for(Entry<String,Map<String,Object>> dataEntry : dataMap.entrySet()){ //// for(Entry<String,Map<String,Object>> dataEntry : dataMap.entrySet()){
// bodyList.add(dataEntry.getValue()); //// bodyList.add(dataEntry.getValue());
// } //// }
// poi.exportExcel(path ,"匹配后数据", headList, bodyList); //// poi.exportExcel(path ,"匹配后数据", headList, bodyList);
// } //// }
//
//
} //}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment