Commit f6290b0f by zhiwei

修改微头条采集内容解析

parent f97d6fe2
......@@ -80,8 +80,7 @@ public class TouTiaoAccountParse {
String htmlBody = null;
htmlBody = HttpClientTemplateOK.get(url, proxy, headerMap);
if(htmlBody != null){
tta = parseAccountByUserId(htmlBody, user_id);
tta = parseAccountByUserId(htmlBody, user_id, proxy);
}
} catch (Exception e) {
e.fillInStackTrace();
......@@ -193,7 +192,6 @@ public class TouTiaoAccountParse {
for (int i = 0; i < jsonArray.size(); i++) {
try {
JSONObject data = jsonArray.getJSONObject(i);
System.out.println(data.toString());
if(data.getLong("id") == null) {
continue;
}
......@@ -214,7 +212,6 @@ public class TouTiaoAccountParse {
follow_count = data.getInteger("follow_count");
}
String img_url = "https:"+data.getString("avatar_url");
System.out.println(data.getString("create_time"));
Date create_time = null;
if(data.getString("create_time") != null) {
create_time = new Date(Long.valueOf(data.getString("create_time"))*1000);
......@@ -254,9 +251,9 @@ public class TouTiaoAccountParse {
* @param @return 设定文件
* @return TouTiaoAccount 返回类型
*/
private static TouTiaoAccount parseAccountByUserId(String htmlBody,String user_id) {
private static TouTiaoAccount parseAccountByUserId(String htmlBody,String user_id,Proxy proxy) {
try {
TouTiaoAccount touTiaoAccount = new TouTiaoAccount();
TouTiaoAccount touTiaoAccount = new TouTiaoAccount();;
if(htmlBody.contains("var header={")){
String name = htmlBody.split("var header")[1].split("name:'")[1].split("',")[0];
String img_url = "https:"+htmlBody.split("avtar_img:'")[1].split("',")[0];
......@@ -268,6 +265,7 @@ public class TouTiaoAccountParse {
touTiaoAccount.setFollow_count(fensi);
}
touTiaoAccount.setId(user_id);
touTiaoAccount.setUser_id(Long.valueOf(user_id));
touTiaoAccount.setImg_url(img_url);
touTiaoAccount.setName(name);
touTiaoAccount.setUser_type(type);
......
......@@ -151,7 +151,6 @@ public class TouTiaoArticleParse {
}
Map<String, String> headerMap = Tools.getTouTiaoHeader();
headerMap.put("Referer", "https://www.toutiao.com/c/user/" + user_id + "/");
String htmlBody = HttpClientTemplateOK.get(url, proxy, headerMap);
if (htmlBody != null) {
Map<String, Object> dataMap = parseHtmlByMicroAccount(htmlBody, endDate);
......@@ -175,6 +174,7 @@ public class TouTiaoArticleParse {
* @param @return 设定文件
* @return Map<String,Object> 返回类型
*/
@SuppressWarnings("unlikely-arg-type")
private static Map<String, Object> parseHtmlByMicroAccount(String htmlBody, Date endDate) {
Map<String, Object> map = new HashMap<String, Object>();
......@@ -201,7 +201,7 @@ public class TouTiaoArticleParse {
date = new Date(max_behot_time*1000);
href = "https://www.toutiao.com/a" + data.getString("thread_id");
source = data.getJSONObject("ugc_user").getString("name");
content = data.getString("rich_content");
content = data.getString("content");
readNum = data.getInteger("read_count")+"";
commentNum = data.getInteger("comment_count")+"";
user_id = data.getJSONObject("ugc_user").getString("user_id");
......
//package com.zhiwei.toutiao.test;
//
//import java.util.List;
//import java.util.Map;
//
//import com.zhiwei.toutiao.bean.TouTiaoArticle;
//import com.zhiwei.toutiao.parse.TouTiaoChannelParse;
//import com.zhiwei.toutiao.util.Tools;
//
///**
// * @ClassName: TouTiaoChannelExample
// * @Description: TODO(头条频道解析测试)
// * @author hero
// * @date 2017年7月24日 下午5:10:52
// */
//public class TouTiaoChannelExample {
//
// public static void main(String[] args) {
//
// long max_behot_time = 0;
// for(int i= 0;i<3; i++){
// System.out.println("i=============="+i);
// if( i==0 ){
// max_behot_time = 0;
// }
// String as = Tools.getAS().split("_")[0];
// String cp = Tools.getAS().split("_")[1];
// String url = "http://www.toutiao.com/api/pc/feed/?category=news_tech&utm_source=toutiao"
// + "&widen=1&max_behot_time="+max_behot_time+"&max_behot_time_tmp="+max_behot_time
// +"&tadrequire=true&as=" +as +"&cp=" + cp;
// System.out.println("url:" + url);
//
// Map<String,Object> result = TouTiaoChannelParse.touTiaoChannel(url, null);
// if(result!=null){
// Long next = (Long)result.get("next");
// List<TouTiaoArticle> ttList = (List<TouTiaoArticle>)result.get("data");
// System.out.println("ttlist size is " + ttList.size());
// for(TouTiaoArticle tt : ttList){
// System.out.println(tt);
// }
// if(next != null){
// max_behot_time = next;
// }else{
// break;
// }
// }
// }
// }
//
//}
package com.zhiwei.toutiao.test;
import java.util.List;
import java.util.Map;
import com.zhiwei.toutiao.bean.TouTiaoArticle;
import com.zhiwei.toutiao.parse.TouTiaoChannelParse;
import com.zhiwei.toutiao.util.Tools;
/**
 * Manual smoke test for Toutiao channel (feed) parsing.
 *
 * <p>Requests up to {@link #MAX_PAGES} consecutive pages of the {@code news_tech} feed via
 * {@code TouTiaoChannelParse.touTiaoChannel(url, null)} and prints every returned article
 * to standard output.
 *
 * @author hero
 * @date 2017-07-24 17:10:52
 */
public class TouTiaoChannelExample {

    /** Maximum number of feed pages requested in one run. */
    private static final int MAX_PAGES = 3;

    /**
     * Pages through the feed, using the {@code "next"} value of each result as the
     * {@code max_behot_time} cursor of the following request.
     *
     * @param args unused
     */
    @SuppressWarnings("unchecked")
    public static void main(String[] args) {
        // Paging cursor; 0 requests the first page.
        long max_behot_time = 0;
        for (int i = 0; i < MAX_PAGES; i++) {
            System.out.println("i==============" + i);

            // Take both tokens from ONE getAS() call. The original called getAS() twice and
            // used half of each result. NOTE(review): presumably getAS() derives the
            // "as_cp" pair from the current time, in which case two calls could yield
            // tokens that do not belong together - confirm against Tools.getAS().
            String[] signature = Tools.getAS().split("_");
            String as = signature[0];
            String cp = signature[1];

            String url = "http://www.toutiao.com/api/pc/feed/?category=news_tech&utm_source=toutiao"
                    + "&widen=1&max_behot_time=" + max_behot_time + "&max_behot_time_tmp=" + max_behot_time
                    + "&tadrequire=true&as=" + as + "&cp=" + cp;
            System.out.println("url:" + url);

            try {
                // The second argument is the proxy; null is passed here.
                Map<String, Object> result = TouTiaoChannelParse.touTiaoChannel(url, null);
                if (result == null) {
                    // Nothing parsed: the next iteration retries with the same cursor.
                    continue;
                }
                Long next = (Long) result.get("next");
                List<TouTiaoArticle> ttList = (List<TouTiaoArticle>) result.get("data");
                System.out.println("ttlist size is " + ttList.size());
                for (TouTiaoArticle tt : ttList) {
                    System.out.println(tt);
                }
                if (next == null) {
                    // No cursor for a following page: stop paging.
                    break;
                }
                max_behot_time = next;
            } catch (Exception e) {
                // Best-effort test driver: report the failure and move on to the next iteration.
                e.printStackTrace();
            }
        }
    }
}
///**
// * @Title: TouTiaoExample.java
// * @Package com.zhiwei.toutiao.test
// * @Description:
// * @author hero
// * @date 2016年9月2日 上午11:48:51
// * @version V1.0
// */
///**
//*
//*/
//package com.zhiwei.toutiao.test;
//
//import java.util.ArrayList;
//import java.util.Date;
//import java.util.List;
//import java.util.Map;
//
//import com.zhiwei.toutiao.bean.TouTiaoArticle;
//import com.zhiwei.toutiao.parse.TouTiaoArticleParse;
//import com.zhiwei.toutiao.util.Tools;
//import com.zhiwei.zhiweiTools.timeParse.TimeParse;
//
///**
// * @Description:
// * @author hero
// * @date 2016年9月2日 上午11:48:51
// */
//public class TouTiaoExample {
//
// @SuppressWarnings("unchecked")
// public static void main(String[] args) throws Exception {
// long a = System.currentTimeMillis();
// List<String> urlList = new ArrayList<String>();
// urlList.add("6859134443");
//
// System.out.println(urlList.size());
//
// Date endTime = TimeParse.stringFormartDate("2018-04-01");
//
// for (String url : urlList) {
// String mid = url;
// String max_behot_time = "0";
// while (true) {
// Map<String, Object> dataMap = null;
// dataMap = TouTiaoArticleParse.getTouTiaoList(mid, max_behot_time, endTime,null);
// if (dataMap != null) {
// List<TouTiaoArticle> ttlist = (List<TouTiaoArticle>) dataMap.get("data");
// max_behot_time = (String) dataMap.get("max_behot_time");
// System.out.println(max_behot_time + "=======" + ttlist.size());
// if (max_behot_time == null || ttlist.isEmpty()) {
// break;
// } else {
// if (ttlist.size() > 0) {
// for (TouTiaoArticle tt : ttlist) {
// System.out.println(tt);
// }
// }
// }
// }
// }
// }
// long b = System.currentTimeMillis();
// System.out.println("一轮的采集时间为:" + (b - a) / 1000);
// }
//
//}
/**
* @Title: TouTiaoExample.java
* @Package com.zhiwei.toutiao.test
* @Description:
* @author hero
* @date 2016年9月2日 上午11:48:51
* @version V1.0
*/
/**
*
*/
package com.zhiwei.toutiao.test;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;
import com.zhiwei.toutiao.bean.TouTiaoArticle;
import com.zhiwei.toutiao.parse.TouTiaoArticleParse;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
/**
 * Manual smoke test for paging through the article list of Toutiao accounts.
 *
 * <p>For every account id in a hard-coded list, repeatedly calls
 * {@code TouTiaoArticleParse.getTouTiaoList(...)} and prints each returned article,
 * then prints the total elapsed time in seconds.
 *
 * @author hero
 * @date 2016-09-02 11:48:51
 */
public class TouTiaoExample {

    /**
     * Runs the paging loop for every configured account id.
     *
     * @param args unused
     * @throws Exception propagated from date parsing or from the article parser
     */
    @SuppressWarnings("unchecked")
    public static void main(String[] args) throws Exception {
        long startMillis = System.currentTimeMillis();

        List<String> urlList = new ArrayList<String>();
        urlList.add("6859134443");
        System.out.println(urlList.size());

        // Passed to the parser as its end-date argument.
        Date endTime = TimeParse.stringFormartDate("2018-04-01");

        for (String mid : urlList) {
            // Paging cursor; "0" requests the first page.
            String max_behot_time = "0";
            while (true) {
                Map<String, Object> dataMap =
                        TouTiaoArticleParse.getTouTiaoList(mid, max_behot_time, endTime, null);
                if (dataMap == null) {
                    // The original kept looping here, re-sending the identical request forever
                    // with no delay. Without a result there is no new cursor, so stop paging.
                    break;
                }

                List<TouTiaoArticle> ttlist = (List<TouTiaoArticle>) dataMap.get("data");
                max_behot_time = (String) dataMap.get("max_behot_time");

                // Treat a missing "data" entry like an empty page instead of throwing an NPE.
                int pageSize = (ttlist == null) ? 0 : ttlist.size();
                System.out.println(max_behot_time + "=======" + pageSize);

                if (max_behot_time == null || pageSize == 0) {
                    // No further cursor or nothing returned: this account is done.
                    break;
                }
                for (TouTiaoArticle tt : ttlist) {
                    System.out.println(tt);
                }
            }
        }

        long endMillis = System.currentTimeMillis();
        System.out.println("一轮的采集时间为:" + (endMillis - startMillis) / 1000);
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment