Commit 2d115777 by zhiwei

添加根据链接采集全文

parent b66c4c38
...@@ -20,6 +20,7 @@ import java.util.HashMap; ...@@ -20,6 +20,7 @@ import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
...@@ -29,6 +30,7 @@ import com.zhiwei.crawler.core.HttpBoot; ...@@ -29,6 +30,7 @@ import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils; import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.tools.timeparse.TimeParse; import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.ZhiWeiTools;
import com.zhiwei.toutiao.bean.Signature; import com.zhiwei.toutiao.bean.Signature;
import com.zhiwei.toutiao.bean.TouTiaoArticle; import com.zhiwei.toutiao.bean.TouTiaoArticle;
import com.zhiwei.toutiao.util.Tools; import com.zhiwei.toutiao.util.Tools;
...@@ -502,6 +504,30 @@ public class TouTiaoArticleParse { ...@@ -502,6 +504,30 @@ public class TouTiaoArticleParse {
return map; return map;
} }
/**
 * Downloads the page behind a link and extracts the article's full text.
 *
 * <p>The text is expected to be embedded in the page as a single-quoted
 * value introduced by {@code " content: '"} and terminated by {@code "',"}.
 * The extracted value is passed through {@code ZhiWeiTools.delHTMLTag}
 * before being returned (presumably to strip HTML markup; confirm against
 * that helper).
 *
 * @param url   link of the article page to download
 * @param proxy proxy handed to {@code downloadHtml} for the request
 * @return the processed article text, or {@code null} when the downloaded
 *         body is blank or does not contain the content marker
 */
public static String getContent(String url,Proxy proxy) {
    final String startMarker = " content: '";
    final String endMarker = "',";

    String htmlBody = downloadHtml(url, proxy, null);
    if (StringUtils.isBlank(htmlBody)) {
        return null;
    }

    // Search for the exact marker that is used for extraction. Checking only
    // for "content:" and then indexing split(...)[1] threw
    // ArrayIndexOutOfBoundsException on pages that mention "content:" in any
    // other form.
    int markerIndex = htmlBody.indexOf(startMarker);
    if (markerIndex < 0) {
        return null;
    }

    int start = markerIndex + startMarker.length();
    int end = htmlBody.indexOf(endMarker, start);
    // Without a terminator, fall back to everything after the marker.
    String content = end < 0 ? htmlBody.substring(start) : htmlBody.substring(start, end);
    return ZhiWeiTools.delHTMLTag(content);
}
private static String downloadHtml(String url, Proxy proxy, Map<String,String> headMap) { private static String downloadHtml(String url, Proxy proxy, Map<String,String> headMap) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment