Commit f6290b0f by zhiwei

修改微头条采集内容解析

parent f97d6fe2
...@@ -80,8 +80,7 @@ public class TouTiaoAccountParse { ...@@ -80,8 +80,7 @@ public class TouTiaoAccountParse {
String htmlBody = null; String htmlBody = null;
htmlBody = HttpClientTemplateOK.get(url, proxy, headerMap); htmlBody = HttpClientTemplateOK.get(url, proxy, headerMap);
if(htmlBody != null){ if(htmlBody != null){
tta = parseAccountByUserId(htmlBody, user_id); tta = parseAccountByUserId(htmlBody, user_id, proxy);
} }
} catch (Exception e) { } catch (Exception e) {
e.fillInStackTrace(); e.fillInStackTrace();
...@@ -193,7 +192,6 @@ public class TouTiaoAccountParse { ...@@ -193,7 +192,6 @@ public class TouTiaoAccountParse {
for (int i = 0; i < jsonArray.size(); i++) { for (int i = 0; i < jsonArray.size(); i++) {
try { try {
JSONObject data = jsonArray.getJSONObject(i); JSONObject data = jsonArray.getJSONObject(i);
System.out.println(data.toString());
if(data.getLong("id") == null) { if(data.getLong("id") == null) {
continue; continue;
} }
...@@ -214,7 +212,6 @@ public class TouTiaoAccountParse { ...@@ -214,7 +212,6 @@ public class TouTiaoAccountParse {
follow_count = data.getInteger("follow_count"); follow_count = data.getInteger("follow_count");
} }
String img_url = "https:"+data.getString("avatar_url"); String img_url = "https:"+data.getString("avatar_url");
System.out.println(data.getString("create_time"));
Date create_time = null; Date create_time = null;
if(data.getString("create_time") != null) { if(data.getString("create_time") != null) {
create_time = new Date(Long.valueOf(data.getString("create_time"))*1000); create_time = new Date(Long.valueOf(data.getString("create_time"))*1000);
...@@ -254,9 +251,9 @@ public class TouTiaoAccountParse { ...@@ -254,9 +251,9 @@ public class TouTiaoAccountParse {
* @param @return 设定文件 * @param @return 设定文件
* @return TouTiaoAccount 返回类型 * @return TouTiaoAccount 返回类型
*/ */
private static TouTiaoAccount parseAccountByUserId(String htmlBody,String user_id) { private static TouTiaoAccount parseAccountByUserId(String htmlBody,String user_id,Proxy proxy) {
try { try {
TouTiaoAccount touTiaoAccount = new TouTiaoAccount(); TouTiaoAccount touTiaoAccount = new TouTiaoAccount();;
if(htmlBody.contains("var header={")){ if(htmlBody.contains("var header={")){
String name = htmlBody.split("var header")[1].split("name:'")[1].split("',")[0]; String name = htmlBody.split("var header")[1].split("name:'")[1].split("',")[0];
String img_url = "https:"+htmlBody.split("avtar_img:'")[1].split("',")[0]; String img_url = "https:"+htmlBody.split("avtar_img:'")[1].split("',")[0];
...@@ -268,8 +265,9 @@ public class TouTiaoAccountParse { ...@@ -268,8 +265,9 @@ public class TouTiaoAccountParse {
touTiaoAccount.setFollow_count(fensi); touTiaoAccount.setFollow_count(fensi);
} }
touTiaoAccount.setId(user_id); touTiaoAccount.setId(user_id);
touTiaoAccount.setUser_id(Long.valueOf(user_id));
touTiaoAccount.setImg_url(img_url); touTiaoAccount.setImg_url(img_url);
touTiaoAccount.setName(name); touTiaoAccount.setName(name);
touTiaoAccount.setUser_type(type); touTiaoAccount.setUser_type(type);
return touTiaoAccount; return touTiaoAccount;
} }
......
...@@ -151,7 +151,6 @@ public class TouTiaoArticleParse { ...@@ -151,7 +151,6 @@ public class TouTiaoArticleParse {
} }
Map<String, String> headerMap = Tools.getTouTiaoHeader(); Map<String, String> headerMap = Tools.getTouTiaoHeader();
headerMap.put("Referer", "https://www.toutiao.com/c/user/" + user_id + "/"); headerMap.put("Referer", "https://www.toutiao.com/c/user/" + user_id + "/");
String htmlBody = HttpClientTemplateOK.get(url, proxy, headerMap); String htmlBody = HttpClientTemplateOK.get(url, proxy, headerMap);
if (htmlBody != null) { if (htmlBody != null) {
Map<String, Object> dataMap = parseHtmlByMicroAccount(htmlBody, endDate); Map<String, Object> dataMap = parseHtmlByMicroAccount(htmlBody, endDate);
...@@ -175,6 +174,7 @@ public class TouTiaoArticleParse { ...@@ -175,6 +174,7 @@ public class TouTiaoArticleParse {
* @param @return 设定文件 * @param @return 设定文件
* @return Map<String,Object> 返回类型 * @return Map<String,Object> 返回类型
*/ */
@SuppressWarnings("unlikely-arg-type")
private static Map<String, Object> parseHtmlByMicroAccount(String htmlBody, Date endDate) { private static Map<String, Object> parseHtmlByMicroAccount(String htmlBody, Date endDate) {
Map<String, Object> map = new HashMap<String, Object>(); Map<String, Object> map = new HashMap<String, Object>();
...@@ -201,7 +201,7 @@ public class TouTiaoArticleParse { ...@@ -201,7 +201,7 @@ public class TouTiaoArticleParse {
date = new Date(max_behot_time*1000); date = new Date(max_behot_time*1000);
href = "https://www.toutiao.com/a" + data.getString("thread_id"); href = "https://www.toutiao.com/a" + data.getString("thread_id");
source = data.getJSONObject("ugc_user").getString("name"); source = data.getJSONObject("ugc_user").getString("name");
content = data.getString("rich_content"); content = data.getString("content");
readNum = data.getInteger("read_count")+""; readNum = data.getInteger("read_count")+"";
commentNum = data.getInteger("comment_count")+""; commentNum = data.getInteger("comment_count")+"";
user_id = data.getJSONObject("ugc_user").getString("user_id"); user_id = data.getJSONObject("ugc_user").getString("user_id");
......
//package com.zhiwei.toutiao.test; package com.zhiwei.toutiao.test;
//
//import java.util.List; import java.util.List;
//import java.util.Map; import java.util.Map;
//
//import com.zhiwei.toutiao.bean.TouTiaoArticle; import com.zhiwei.toutiao.bean.TouTiaoArticle;
//import com.zhiwei.toutiao.parse.TouTiaoChannelParse; import com.zhiwei.toutiao.parse.TouTiaoChannelParse;
//import com.zhiwei.toutiao.util.Tools; import com.zhiwei.toutiao.util.Tools;
//
///** /**
// * @ClassName: TouTiaoChannelExample * @ClassName: TouTiaoChannelExample
// * @Description: TODO(头条频道解析测试) * @Description: TODO(头条频道解析测试)
// * @author hero * @author hero
// * @date 2017年7月24日 下午5:10:52 * @date 2017年7月24日 下午5:10:52
// */ */
//public class TouTiaoChannelExample { public class TouTiaoChannelExample {
//
// public static void main(String[] args) { public static void main(String[] args) {
//
// long max_behot_time = 0; long max_behot_time = 0;
// for(int i= 0;i<3; i++){ for(int i= 0;i<3; i++){
// System.out.println("i=============="+i); System.out.println("i=============="+i);
// if( i==0 ){ if( i==0 ){
// max_behot_time = 0; max_behot_time = 0;
// } }
// String as = Tools.getAS().split("_")[0]; String as = Tools.getAS().split("_")[0];
// String cp = Tools.getAS().split("_")[1]; String cp = Tools.getAS().split("_")[1];
// String url = "http://www.toutiao.com/api/pc/feed/?category=news_tech&utm_source=toutiao" String url = "http://www.toutiao.com/api/pc/feed/?category=news_tech&utm_source=toutiao"
// + "&widen=1&max_behot_time="+max_behot_time+"&max_behot_time_tmp="+max_behot_time + "&widen=1&max_behot_time="+max_behot_time+"&max_behot_time_tmp="+max_behot_time
// +"&tadrequire=true&as=" +as +"&cp=" + cp; +"&tadrequire=true&as=" +as +"&cp=" + cp;
// System.out.println("url:" + url); System.out.println("url:" + url);
//
// Map<String,Object> result = TouTiaoChannelParse.touTiaoChannel(url, null); Map<String, Object> result;
// if(result!=null){ try {
// Long next = (Long)result.get("next"); result = TouTiaoChannelParse.touTiaoChannel(url, null);
// List<TouTiaoArticle> ttList = (List<TouTiaoArticle>)result.get("data"); if(result!=null){
// System.out.println("ttlist size is " + ttList.size()); Long next = (Long)result.get("next");
// for(TouTiaoArticle tt : ttList){ List<TouTiaoArticle> ttList = (List<TouTiaoArticle>)result.get("data");
// System.out.println(tt); System.out.println("ttlist size is " + ttList.size());
// } for(TouTiaoArticle tt : ttList){
// if(next != null){ System.out.println(tt);
// max_behot_time = next; }
// }else{ if(next != null){
// break; max_behot_time = next;
// } }else{
// } break;
// } }
// } }
// } catch (Exception e) {
//} e.printStackTrace();
}
}
}
}
///** /**
// * @Title: TouTiaoExample.java * @Title: TouTiaoExample.java
// * @Package com.zhiwei.toutiao.test * @Package com.zhiwei.toutiao.test
// * @Description: * @Description:
// * @author hero * @author hero
// * @date 2016年9月2日 上午11:48:51 * @date 2016年9月2日 上午11:48:51
// * @version V1.0 * @version V1.0
// */ */
///** /**
//* *
//*/ */
//package com.zhiwei.toutiao.test; package com.zhiwei.toutiao.test;
//
//import java.util.ArrayList; import java.util.ArrayList;
//import java.util.Date; import java.util.Date;
//import java.util.List; import java.util.List;
//import java.util.Map; import java.util.Map;
//
//import com.zhiwei.toutiao.bean.TouTiaoArticle; import com.zhiwei.toutiao.bean.TouTiaoArticle;
//import com.zhiwei.toutiao.parse.TouTiaoArticleParse; import com.zhiwei.toutiao.parse.TouTiaoArticleParse;
//import com.zhiwei.toutiao.util.Tools; import com.zhiwei.zhiweiTools.timeParse.TimeParse;
//import com.zhiwei.zhiweiTools.timeParse.TimeParse;
// /**
///** * @Description:
// * @Description: * @author hero
// * @author hero * @date 2016年9月2日 上午11:48:51
// * @date 2016年9月2日 上午11:48:51 */
// */ public class TouTiaoExample {
//public class TouTiaoExample {
// @SuppressWarnings("unchecked")
// @SuppressWarnings("unchecked") public static void main(String[] args) throws Exception {
// public static void main(String[] args) throws Exception { long a = System.currentTimeMillis();
// long a = System.currentTimeMillis(); List<String> urlList = new ArrayList<String>();
// List<String> urlList = new ArrayList<String>(); urlList.add("6859134443");
// urlList.add("6859134443");
// System.out.println(urlList.size());
// System.out.println(urlList.size());
// Date endTime = TimeParse.stringFormartDate("2018-04-01");
// Date endTime = TimeParse.stringFormartDate("2018-04-01");
// for (String url : urlList) {
// for (String url : urlList) { String mid = url;
// String mid = url; String max_behot_time = "0";
// String max_behot_time = "0"; while (true) {
// while (true) { Map<String, Object> dataMap = null;
// Map<String, Object> dataMap = null; dataMap = TouTiaoArticleParse.getTouTiaoList(mid, max_behot_time, endTime,null);
// dataMap = TouTiaoArticleParse.getTouTiaoList(mid, max_behot_time, endTime,null); if (dataMap != null) {
// if (dataMap != null) { List<TouTiaoArticle> ttlist = (List<TouTiaoArticle>) dataMap.get("data");
// List<TouTiaoArticle> ttlist = (List<TouTiaoArticle>) dataMap.get("data"); max_behot_time = (String) dataMap.get("max_behot_time");
// max_behot_time = (String) dataMap.get("max_behot_time"); System.out.println(max_behot_time + "=======" + ttlist.size());
// System.out.println(max_behot_time + "=======" + ttlist.size()); if (max_behot_time == null || ttlist.isEmpty()) {
// if (max_behot_time == null || ttlist.isEmpty()) { break;
// break; } else {
// } else { if (ttlist.size() > 0) {
// if (ttlist.size() > 0) { for (TouTiaoArticle tt : ttlist) {
// for (TouTiaoArticle tt : ttlist) { System.out.println(tt);
// System.out.println(tt); }
// } }
// } }
// } }
// } }
// } }
// } long b = System.currentTimeMillis();
// long b = System.currentTimeMillis(); System.out.println("一轮的采集时间为:" + (b - a) / 1000);
// System.out.println("一轮的采集时间为:" + (b - a) / 1000); }
// }
// }
//}
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment