Commit b6fe1572 by yangchen

微信正文获取修改

parent 87e9aaf3
...@@ -60,7 +60,7 @@ public class UrlLiveCrawler { ...@@ -60,7 +60,7 @@ public class UrlLiveCrawler {
logger.info("当前处理 URL: {}", url); logger.info("当前处理 URL: {}", url);
Request request = HttpRequestBuilder.newGetRequest(url, null); Request request = HttpRequestBuilder.newGetRequest(url, null);
counter.increase(); counter.increase();
HttpBoot.asyncCall(request, ProxyClientUtil.getNATProxy(), false).addListeners(future -> { HttpBoot.asyncCall(request, ProxyClientUtil.getNATProxy(), false,false).addListeners(future -> {
if (future.isSuccess()) { if (future.isSuccess()) {
Response response = future.result(); Response response = future.result();
try { try {
......
...@@ -55,6 +55,15 @@ public class ContentMatch { ...@@ -55,6 +55,15 @@ public class ContentMatch {
return dataList; return dataList;
} }
/**
 * Manual entry point: runs {@code getContentMatch} for a single hard-coded
 * mp.weixin.qq.com article URL and prints each returned bean to standard output.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args) {
    // Sample article link used for the manual run.
    String sampleUrl = "https://mp.weixin.qq.com/s?src=11&timestamp=1535449515&ver=1088&signature=9kByOydse2KaausR0FP5HoQpSeSXs097LR-akxhJxfCV*onfJuoWkznZ8UEk5OfFox4aVzDqx0n0xwbtTm6KUzPpNz2desfNiQ4Uevp4LaTSyoH3OKysG2qxy2jisojb&new=1";

    List<String> urls = new ArrayList<>();
    urls.add(sampleUrl);

    // Print every extracted bean, one per line.
    List<ContentBean> beans = getContentMatch(urls);
    beans.forEach(bean -> System.out.println(bean.toString()));
}
static class ContentMatchCrawlerThread extends Thread{ static class ContentMatchCrawlerThread extends Thread{
private static List<ContentBean> getContentMatch(List<String> urlList){ private static List<ContentBean> getContentMatch(List<String> urlList){
......
...@@ -29,8 +29,12 @@ public class MatchContent { ...@@ -29,8 +29,12 @@ public class MatchContent {
public static String matchContent(String url,String html) { public static String matchContent(String url,String html) {
String content = null; String content = null;
try { try {
Document document = Jsoup.parse(html); Document document = Jsoup.parse(html);
content = mathchContent(html, document); if(url.contains("weixin.qq.com")) {
content = matchContentWeixin(document);
}else {
content = mathchContent(html, document);
}
return content; return content;
} catch (Exception e) { } catch (Exception e) {
logger.debug("获取全文失败",e.fillInStackTrace()); logger.debug("获取全文失败",e.fillInStackTrace());
...@@ -39,8 +43,18 @@ public class MatchContent { ...@@ -39,8 +43,18 @@ public class MatchContent {
return content; return content;
} }
/** /**
*
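* @Description Extracts the article text from a WeChat (weixin) page
* @param document parsed Jsoup document of the page HTML
* @return combined text of the elements matching div.rich_media_content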
*/
private static String matchContentWeixin(Document document) {
    // CSS selector used to locate the WeChat article text container.
    final String bodySelector = "div.rich_media_content";
    // Only the text of the matched elements is returned; markup is dropped.
    return document.select(bodySelector).text();
}
/**
* @Title: mathchContent * @Title: mathchContent
* @author hero * @author hero
* @Description: 匹配正文数据 * @Description: 匹配正文数据
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment