Commit 3ddb6288 by zhiwei

添加根据链接采集全文

parent 2c702467
......@@ -40,6 +40,12 @@ public class WechatAricle {
private String isFirst; // whether this is the headline (top) article
private String biz; // unique identifier of the WeChat official account
private String wxId; // WeChat official account id
private String user_name;// initial id of the WeChat official account
/**
 * Returns the current value of the {@code isFirst} field.
 *
 * @return the stored {@code isFirst} string, exactly as held by this object
 */
public String getIsFirst() {
    return this.isFirst;
}
......@@ -103,6 +109,24 @@ public class WechatAricle {
}
/**
 * Returns the current value of the {@code biz} field.
 *
 * @return the stored {@code biz} string, exactly as held by this object
 */
public String getBiz() {
    return this.biz;
}
/**
 * Returns the current value of the {@code wxId} field.
 *
 * @return the stored {@code wxId} string, exactly as held by this object
 */
public String getWxId() {
    return this.wxId;
}
/**
 * Returns the current value of the {@code user_name} field.
 *
 * @return the stored {@code user_name} string, exactly as held by this object
 */
public String getUser_name() {
    return this.user_name;
}
/**
 * Stores the given value in the {@code biz} field.
 *
 * @param biz value to store; assigned as-is, with no validation
 */
public void setBiz(String biz) {
this.biz = biz;
}
/**
 * Stores the given value in the {@code wxId} field.
 *
 * @param wxId value to store; assigned as-is, with no validation
 */
public void setWxId(String wxId) {
this.wxId = wxId;
}
/**
 * Stores the given value in the {@code user_name} field.
 *
 * @param user_name value to store; assigned as-is, with no validation
 */
public void setUser_name(String user_name) {
this.user_name = user_name;
}
// No-argument constructor with an empty body; it assigns no fields itself.
public WechatAricle(){}
public WechatAricle(String id,String title,String source,String content
......
......@@ -231,7 +231,7 @@ public class WechatAritcleSearch {
title = ZhiWeiTools.SBC2DBC(title);
content = ZhiWeiTools.SBC2DBC(content);
wechat = new WechatAricle(link, title, source, content, date, readNum, 0, openid, "unknow");
wechat = getContentAndSource(link, proxyHolder, wechat);
wechat = getWechatAricleInfo(link, proxyHolder, wechat);
result.add(wechat);
} catch (Exception e) {
logger.debug("解析数据出现错误:{}", e.getMessage());
......@@ -267,37 +267,54 @@ public class WechatAritcleSearch {
* @return
* @throws IOException
*/
private static WechatAricle getContentAndSource(String url, ProxyHolder proxy,WechatAricle wechatAricle){
private static WechatAricle getWechatAricleInfo(String url, ProxyHolder proxy,WechatAricle wechatAricle){
try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy.getProxy()).body().string();
if(htmlBody!=null){
Document document = Jsoup.parse(htmlBody);
String content = null;
String source = null;
String text = null;
if(htmlBody.contains("js_article")){
String contentHtml = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy.getProxy()).body().string();
String content = null;
String time = null;
String source = null;
String biz = null;
String title = null;
String user_name = null;
String wxId = null;
if(contentHtml!=null){
Document document = Jsoup.parse(contentHtml);
title = document.select("title").text();
wxId = document.select("p.profile_meta").get(0).select("span.profile_meta_value").text();
if(contentHtml.contains("js_article")){
content = document.select("div#js_article").text();
}else if(htmlBody.contains("js_share_content")){
}else if(contentHtml.contains("js_share_content")){
content = document.select("div#js_share_content").text();
}
if(htmlBody.contains("content_tpl")){
text = document.select("script#content_tpl").html();
text = Jsoup.parse(text).text();
if(contentHtml.contains("content_tpl")){
String text = document.select("script#content_tpl").html();
content = Jsoup.parse(text).text();
}
content = content+text;
if(htmlBody.contains("js_name")){
source = document.select("a#js_name").text().trim();
}else if(htmlBody.contains("account_nickname")){
source = document.select("div.account_nickname").text().trim();
}
// System.out.println(source+"=========="+content);
if(content!=null && content.length()>50){
wechatAricle.setContent(content);
}
if(source!=null && content.length()>0){
wechatAricle.setSource(source);
if(contentHtml.contains("d.nick_name = ")){
time = contentHtml.split("d.ct = \"")[1].split("\";")[0];
source = contentHtml.split("d.nick_name = \"")[1].split("\";")[0];
biz = contentHtml.split("d.biz = \"")[1].split("\"")[0];
user_name = contentHtml.split("d.user_name = \"")[1].split("\"")[0];
}else if(contentHtml.contains("var nickname = ")){
time = contentHtml.split("var ct = \"")[1].split("\";")[0];
source = contentHtml.split("var nickname = \"")[1].split("\";")[0];
biz = contentHtml.split("var appuin = \"\"||\"")[1].split("\"")[0];
user_name = contentHtml.split("var user_name = \"")[1].split("\"")[0];
}
}
if(wechatAricle == null) {
wechatAricle = new WechatAricle();
wechatAricle.setTitle(title);
wechatAricle.setTime(new Date(Long.valueOf(time)*1000));
wechatAricle.setSource(source);
}
wechatAricle.setBiz(biz);
wechatAricle.setContent(content);
wechatAricle.setWxId(wxId);
wechatAricle.setUser_name(user_name);
} catch (Exception e) {
e.printStackTrace();
return wechatAricle;
......@@ -385,7 +402,7 @@ public class WechatAritcleSearch {
title = ZhiWeiTools.SBC2DBC(title);
content = ZhiWeiTools.SBC2DBC(content);
wechat = new WechatAricle(link, title, source, content, date, readNum, 0, openid, "unknow");
wechat = getContentAndSource(link, proxyHolder, wechat);
wechat = getWechatAricleInfo(link, proxyHolder, wechat);
result.add(wechat);
} catch (Exception e) {
logger.debug("解析数据出现错误:{}", e.getMessage());
......@@ -432,7 +449,6 @@ public class WechatAritcleSearch {
try {
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), proxyHolder).body().string();
System.out.println(htmlBody);
if (htmlBody != null) {
JSONObject json = JSONObject.parseObject(htmlBody);
openId = json.getString("openid");
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment