Commit 3ddb6288 by zhiwei

添加根据链接采集全文

parent 2c702467
...@@ -40,6 +40,12 @@ public class WechatAricle { ...@@ -40,6 +40,12 @@ public class WechatAricle {
private String isFirst; //是否为头条文章 private String isFirst; //是否为头条文章
private String biz; // unique identifier of the WeChat official account
private String wxId; // id of the WeChat official account
private String user_name;// initial (original) id of the WeChat official account
public String getIsFirst() { public String getIsFirst() {
return isFirst; return isFirst;
} }
...@@ -103,6 +109,24 @@ public class WechatAricle { ...@@ -103,6 +109,24 @@ public class WechatAricle {
} }
/**
 * Returns the unique identifier of the WeChat official account.
 *
 * @return the current value of the {@code biz} field
 */
public String getBiz() {
    return this.biz;
}
/**
 * Returns the id of the WeChat official account.
 *
 * @return the current value of the {@code wxId} field
 */
public String getWxId() {
    return this.wxId;
}
/**
 * Returns the initial (original) id of the WeChat official account.
 *
 * @return the current value of the {@code user_name} field
 */
public String getUser_name() {
    return this.user_name;
}
/**
 * Stores the unique identifier of the WeChat official account.
 *
 * @param biz the value to assign to the {@code biz} field
 */
public void setBiz(String biz) {
    this.biz = biz;
}
/**
 * Stores the id of the WeChat official account.
 *
 * @param wxId the value to assign to the {@code wxId} field
 */
public void setWxId(String wxId) {
    this.wxId = wxId;
}
/**
 * Stores the initial (original) id of the WeChat official account.
 *
 * @param user_name the value to assign to the {@code user_name} field
 */
public void setUser_name(String user_name) {
    this.user_name = user_name;
}
public WechatAricle(){} public WechatAricle(){}
public WechatAricle(String id,String title,String source,String content public WechatAricle(String id,String title,String source,String content
......
...@@ -231,7 +231,7 @@ public class WechatAritcleSearch { ...@@ -231,7 +231,7 @@ public class WechatAritcleSearch {
title = ZhiWeiTools.SBC2DBC(title); title = ZhiWeiTools.SBC2DBC(title);
content = ZhiWeiTools.SBC2DBC(content); content = ZhiWeiTools.SBC2DBC(content);
wechat = new WechatAricle(link, title, source, content, date, readNum, 0, openid, "unknow"); wechat = new WechatAricle(link, title, source, content, date, readNum, 0, openid, "unknow");
wechat = getContentAndSource(link, proxyHolder, wechat); wechat = getWechatAricleInfo(link, proxyHolder, wechat);
result.add(wechat); result.add(wechat);
} catch (Exception e) { } catch (Exception e) {
logger.debug("解析数据出现错误:{}", e.getMessage()); logger.debug("解析数据出现错误:{}", e.getMessage());
...@@ -267,37 +267,54 @@ public class WechatAritcleSearch { ...@@ -267,37 +267,54 @@ public class WechatAritcleSearch {
* @return * @return
* @throws IOException * @throws IOException
*/ */
private static WechatAricle getContentAndSource(String url, ProxyHolder proxy,WechatAricle wechatAricle){ private static WechatAricle getWechatAricleInfo(String url, ProxyHolder proxy,WechatAricle wechatAricle){
try { try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy.getProxy()).body().string(); String contentHtml = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy.getProxy()).body().string();
if(htmlBody!=null){ String content = null;
Document document = Jsoup.parse(htmlBody); String time = null;
String content = null; String source = null;
String source = null; String biz = null;
String text = null; String title = null;
if(htmlBody.contains("js_article")){ String user_name = null;
String wxId = null;
if(contentHtml!=null){
Document document = Jsoup.parse(contentHtml);
title = document.select("title").text();
wxId = document.select("p.profile_meta").get(0).select("span.profile_meta_value").text();
if(contentHtml.contains("js_article")){
content = document.select("div#js_article").text(); content = document.select("div#js_article").text();
}else if(htmlBody.contains("js_share_content")){ }else if(contentHtml.contains("js_share_content")){
content = document.select("div#js_share_content").text(); content = document.select("div#js_share_content").text();
} }
if(htmlBody.contains("content_tpl")){ if(contentHtml.contains("content_tpl")){
text = document.select("script#content_tpl").html(); String text = document.select("script#content_tpl").html();
text = Jsoup.parse(text).text(); content = Jsoup.parse(text).text();
} }
content = content+text;
if(htmlBody.contains("js_name")){ if(contentHtml.contains("d.nick_name = ")){
source = document.select("a#js_name").text().trim(); time = contentHtml.split("d.ct = \"")[1].split("\";")[0];
}else if(htmlBody.contains("account_nickname")){ source = contentHtml.split("d.nick_name = \"")[1].split("\";")[0];
source = document.select("div.account_nickname").text().trim(); biz = contentHtml.split("d.biz = \"")[1].split("\"")[0];
} user_name = contentHtml.split("d.user_name = \"")[1].split("\"")[0];
// System.out.println(source+"=========="+content); }else if(contentHtml.contains("var nickname = ")){
if(content!=null && content.length()>50){ time = contentHtml.split("var ct = \"")[1].split("\";")[0];
wechatAricle.setContent(content); source = contentHtml.split("var nickname = \"")[1].split("\";")[0];
} biz = contentHtml.split("var appuin = \"\"||\"")[1].split("\"")[0];
if(source!=null && content.length()>0){ user_name = contentHtml.split("var user_name = \"")[1].split("\"")[0];
wechatAricle.setSource(source);
} }
} }
if(wechatAricle == null) {
wechatAricle = new WechatAricle();
wechatAricle.setTitle(title);
wechatAricle.setTime(new Date(Long.valueOf(time)*1000));
wechatAricle.setSource(source);
}
wechatAricle.setBiz(biz);
wechatAricle.setContent(content);
wechatAricle.setWxId(wxId);
wechatAricle.setUser_name(user_name);
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.printStackTrace();
return wechatAricle; return wechatAricle;
...@@ -385,7 +402,7 @@ public class WechatAritcleSearch { ...@@ -385,7 +402,7 @@ public class WechatAritcleSearch {
title = ZhiWeiTools.SBC2DBC(title); title = ZhiWeiTools.SBC2DBC(title);
content = ZhiWeiTools.SBC2DBC(content); content = ZhiWeiTools.SBC2DBC(content);
wechat = new WechatAricle(link, title, source, content, date, readNum, 0, openid, "unknow"); wechat = new WechatAricle(link, title, source, content, date, readNum, 0, openid, "unknow");
wechat = getContentAndSource(link, proxyHolder, wechat); wechat = getWechatAricleInfo(link, proxyHolder, wechat);
result.add(wechat); result.add(wechat);
} catch (Exception e) { } catch (Exception e) {
logger.debug("解析数据出现错误:{}", e.getMessage()); logger.debug("解析数据出现错误:{}", e.getMessage());
...@@ -432,7 +449,6 @@ public class WechatAritcleSearch { ...@@ -432,7 +449,6 @@ public class WechatAritcleSearch {
try { try {
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), proxyHolder).body().string(); htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), proxyHolder).body().string();
System.out.println(htmlBody);
if (htmlBody != null) { if (htmlBody != null) {
JSONObject json = JSONObject.parseObject(htmlBody); JSONObject json = JSONObject.parseObject(htmlBody);
openId = json.getString("openid"); openId = json.getString("openid");
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment