Commit aa2a108b by yangchen

头条获取修改

parent b6fe1572
...@@ -12,6 +12,7 @@ import com.zhiwei.tools.httpclient.HttpBoot; ...@@ -12,6 +12,7 @@ import com.zhiwei.tools.httpclient.HttpBoot;
import com.zhiwei.tools.httpclient.HttpRequestBuilder; import com.zhiwei.tools.httpclient.HttpRequestBuilder;
import com.zhiwei.tools.httpclient.asyn.MultiThreadingCounter; import com.zhiwei.tools.httpclient.asyn.MultiThreadingCounter;
import okhttp3.Headers;
import okhttp3.Request; import okhttp3.Request;
import okhttp3.Response; import okhttp3.Response;
......
...@@ -55,14 +55,14 @@ public class ContentMatch { ...@@ -55,14 +55,14 @@ public class ContentMatch {
return dataList; return dataList;
} }
public static void main(String[] args) { // public static void main(String[] args) {
List<String> urlList = new ArrayList<>(); // List<String> urlList = new ArrayList<>();
urlList.add("https://mp.weixin.qq.com/s?src=11&timestamp=1535449515&ver=1088&signature=9kByOydse2KaausR0FP5HoQpSeSXs097LR-akxhJxfCV*onfJuoWkznZ8UEk5OfFox4aVzDqx0n0xwbtTm6KUzPpNz2desfNiQ4Uevp4LaTSyoH3OKysG2qxy2jisojb&new=1"); // urlList.add("http://www.toutiao.com/a6571343464292680196/");
List<ContentBean> l = getContentMatch(urlList); // List<ContentBean> l = getContentMatch(urlList);
for(ContentBean cb : l) { // for(ContentBean cb : l) {
System.out.println(cb.toString()); // System.out.println(cb.getContent());
} // }
} // }
static class ContentMatchCrawlerThread extends Thread{ static class ContentMatchCrawlerThread extends Thread{
......
package com.zhiwei.source_forward.util; package com.zhiwei.source_forward.util;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.slf4j.Logger; import org.slf4j.Logger;
...@@ -7,6 +10,7 @@ import org.slf4j.LoggerFactory; ...@@ -7,6 +10,7 @@ import org.slf4j.LoggerFactory;
import com.zhiwei.source_forward.content.ContentExtractor; import com.zhiwei.source_forward.content.ContentExtractor;
import com.zhiwei.source_forward.content.News; import com.zhiwei.source_forward.content.News;
import com.zhiwei.tools.tools.ZhiWeiTools;
/** /**
* @ClassName: MatchChannel * @ClassName: MatchChannel
...@@ -32,10 +36,12 @@ public class MatchContent { ...@@ -32,10 +36,12 @@ public class MatchContent {
Document document = Jsoup.parse(html); Document document = Jsoup.parse(html);
if(url.contains("weixin.qq.com")) { if(url.contains("weixin.qq.com")) {
content = matchContentWeixin(document); content = matchContentWeixin(document);
}else if(url.contains("toutiao.com")) {
content = matchContentToutiao(html);
}else { }else {
content = mathchContent(html, document); content = mathchContent(html, document);
} }
return content; return ZhiWeiTools.delHTMLTag(content);
} catch (Exception e) { } catch (Exception e) {
logger.debug("获取全文失败",e.fillInStackTrace()); logger.debug("获取全文失败",e.fillInStackTrace());
content = null; content = null;
...@@ -45,6 +51,21 @@ public class MatchContent { ...@@ -45,6 +51,21 @@ public class MatchContent {
/** /**
* *
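* @Description Extracts the article body from the source of a Toutiao page.
* @param html raw HTML source of the Toutiao page
* @return the text found between "content:" and the first following "',", or null when the page contains no such fragment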
*/
private static String matchContentToutiao(String html) {
    // Purpose: pull the article body out of the raw source of a Toutiao page.
    // The body is looked up as a "content: ..." fragment in the page source
    // rather than in the parsed DOM.
    // Returns the captured text of the first match, or null when html is
    // null or contains no match.

    // Nothing to scan: report "no content" the same way a failed match does,
    // instead of letting Pattern.matcher(null) throw a NullPointerException.
    if (html == null) {
        return null;
    }
    Matcher matcher = TOUTIAO_CONTENT_PATTERN.matcher(html);
    // Only the first occurrence is of interest, so a single find() is enough.
    if (matcher.find()) {
        // Group 1 is everything after "content:" up to (not including) the
        // first "'," - it still carries any leading whitespace and the
        // opening quote exactly as they appear in the page.
        return matcher.group(1);
    }
    return null;
}

/**
 * Compiled once and shared: {@link Pattern} instances are immutable and
 * thread-safe, so there is no need to recompile the expression per call.
 * The lazy group stops at the first {@code ',} that follows
 * {@code content:}; {@code .} does not cross line terminators.
 */
private static final Pattern TOUTIAO_CONTENT_PATTERN = Pattern.compile("content:(.*?)',");
/**
*
* @Description 微信文本获取 * @Description 微信文本获取
* @param html * @param html
* @return * @return
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment