Commit aa2a108b by yangchen

头条获取修改

parent b6fe1572
......@@ -12,6 +12,7 @@ import com.zhiwei.tools.httpclient.HttpBoot;
import com.zhiwei.tools.httpclient.HttpRequestBuilder;
import com.zhiwei.tools.httpclient.asyn.MultiThreadingCounter;
import okhttp3.Headers;
import okhttp3.Request;
import okhttp3.Response;
......
......@@ -55,14 +55,14 @@ public class ContentMatch {
return dataList;
}
/**
 * Manual smoke test: passes a single WeChat article URL to
 * {@code getContentMatch} and prints every returned bean to standard output.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    List<String> targets = new ArrayList<>();
    targets.add("https://mp.weixin.qq.com/s?src=11&timestamp=1535449515&ver=1088&signature=9kByOydse2KaausR0FP5HoQpSeSXs097LR-akxhJxfCV*onfJuoWkznZ8UEk5OfFox4aVzDqx0n0xwbtTm6KUzPpNz2desfNiQ4Uevp4LaTSyoH3OKysG2qxy2jisojb&new=1");

    List<ContentBean> beans = getContentMatch(targets);

    // Print one line per extracted bean, in the order returned.
    for (int i = 0; i < beans.size(); i++) {
        System.out.println(beans.get(i).toString());
    }
}
// public static void main(String[] args) {
// List<String> urlList = new ArrayList<>();
// urlList.add("http://www.toutiao.com/a6571343464292680196/");
// List<ContentBean> l = getContentMatch(urlList);
// for(ContentBean cb : l) {
// System.out.println(cb.getContent());
// }
// }
static class ContentMatchCrawlerThread extends Thread{
......
package com.zhiwei.source_forward.util;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
......@@ -7,6 +10,7 @@ import org.slf4j.LoggerFactory;
import com.zhiwei.source_forward.content.ContentExtractor;
import com.zhiwei.source_forward.content.News;
import com.zhiwei.tools.tools.ZhiWeiTools;
/**
* @ClassName: MatchChannel
......@@ -32,10 +36,12 @@ public class MatchContent {
Document document = Jsoup.parse(html);
if(url.contains("weixin.qq.com")) {
content = matchContentWeixin(document);
}else if(url.contains("toutiao.com")) {
content = matchContentToutiao(html);
}else {
content = mathchContent(html, document);
}
return content;
return ZhiWeiTools.delHTMLTag(content);
} catch (Exception e) {
logger.debug("获取全文失败",e.fillInStackTrace());
content = null;
......@@ -45,6 +51,21 @@ public class MatchContent {
/**
 * Matches the text between a {@code content:} key and the first following
 * {@code ',} sequence. Compiled once and reused across calls.
 */
private static final Pattern TOUTIAO_CONTENT_PATTERN = Pattern.compile("content:(.*?)',");

/**
 * Extracts the article body from the source of a Toutiao page.
 *
 * <p>Looks for the first occurrence of {@code content:} in {@code html} and
 * returns everything between it and the nearest following {@code ',}
 * sequence. The captured text is returned verbatim (no trimming or
 * unescaping), so it may still start with whitespace or an opening quote.
 *
 * @param html page source to search; may be {@code null}
 * @return the captured text of the first match, or {@code null} when
 *         {@code html} is {@code null} or contains no match
 */
private static String matchContentToutiao(String html) {
    if (html == null) {
        return null;
    }
    Matcher matcher = TOUTIAO_CONTENT_PATTERN.matcher(html);
    // Only the first occurrence is wanted, so a single find() is enough.
    return matcher.find() ? matcher.group(1) : null;
}
/**
*
* @Description 微信文本获取
* @param html
* @return
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment