Commit 95287efd by win 10

新增微博财经头条历史文章采集WeiBoCaiJing

parent 2d60a4f2
package com.zhiwei.parse;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.tools.timeparse.TimeParse;
import okhttp3.Response;
/**
 * Collector for the historical article list of a Weibo Caijing (cj.sina.com.cn) author.
 *
 * <p>Static utility class; it cannot be instantiated.
 */
public class WeiBoCaiJing {

    private static final Logger logger = LoggerFactory.getLogger(WeiBoCaiJing.class);

    /** Shared HTTP client, built with {@code retryTimes(2)}. */
    private static final HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();

    private WeiBoCaiJing() {
        throw new IllegalStateException("WeiBoCaiJing class");
    }

    /**
     * Pages through the {@code lists_by_author} endpoint, starting at page 1 with 20 items
     * per page, and collects every article whose publish time is not before {@code startTime}.
     *
     * <p>Paging stops as soon as one of the following happens:
     * <ul>
     *   <li>an article published before {@code startTime} is encountered
     *       (assumes the API returns articles newest-first; verify against the endpoint);</li>
     *   <li>the page counter reaches the {@code totalpage} value reported by the API;</li>
     *   <li>a request or parse failure occurs. The failure is logged and whatever has been
     *       collected so far is returned, instead of re-requesting the same page forever.</li>
     * </ul>
     *
     * <p>Each returned map holds the keys {@code title}, {@code source}, {@code url},
     * {@code time} ({@link Date}), {@code summary} and {@code read}.
     *
     * @param uid       author id, placed verbatim into the request URL
     * @param startTime lower bound of the publish time, parsed with
     *                  {@code TimeParse.stringFormartDate}
     * @param proxy     proxy holder handed to the HTTP client
     * @param cookie    currently not used by this method; kept so existing callers compile
     * @return the collected articles; never {@code null}, possibly empty or partial
     */
    public static List<Map<String, Object>> getWeiBoCaiJingHistory(String uid, String startTime, ProxyHolder proxy, String cookie) {
        List<Map<String, Object>> dataList = new ArrayList<>();

        // The start time does not change between pages, so parse it once up front.
        Date startDate;
        try {
            startDate = TimeParse.stringFormartDate(startTime);
        } catch (Exception e) {
            logger.error("微博财经头条起始时间解析失败 startTime={}", startTime, e);
            return dataList;
        }
        if (startDate == null) {
            logger.error("微博财经头条起始时间解析失败 startTime={}", startTime);
            return dataList;
        }

        int page = 1; // paging starts at the first page
        boolean nextPage = true;
        while (nextPage) {
            String jsonUrl = "https://cj.sina.com.cn/k/api/article/lists_by_author?uid=" + uid + "&page=" + page + "&count=20";
            try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(jsonUrl), proxy)) {
                String htmlBody = response.body().string();
                JSONObject payload = JSONObject.parseObject(htmlBody).getJSONObject("result").getJSONObject("data");
                JSONArray articles = payload.getJSONArray("lists");
                int totalPage = payload.getInteger("totalpage");
                String source = payload.getJSONObject("user_info").getString("name");

                boolean reachedStart = false;
                if (articles != null) {
                    for (int i = 0; i < articles.size(); i++) {
                        JSONObject article = articles.getJSONObject(i);
                        // create_time is multiplied by 1000, so it is presumably epoch seconds.
                        Date time = new Date(article.getLong("create_time") * 1000);
                        if (startDate.after(time)) {
                            // This article is older than the requested window: stop collecting.
                            reachedStart = true;
                            break;
                        }
                        dataList.add(buildRecord(article, source, time));
                    }
                }

                nextPage = !reachedStart && page < totalPage;
                if (nextPage) {
                    page++;
                }
            } catch (Exception e) {
                // The exception is passed as the trailing argument so SLF4J logs its stack trace.
                logger.error("获取微博财经头条数据失败 uid={} page={}", uid, page, e);
                // Stop paging; otherwise a persistent failure would retry this page endlessly.
                nextPage = false;
            }
        }
        return dataList;
    }

    /** Copies the fields of one article JSON object into the flat map handed back to callers. */
    private static Map<String, Object> buildRecord(JSONObject article, String source, Date time) {
        int read = article.getInteger("read_num");
        Map<String, Object> dataMap = new HashMap<>();
        dataMap.put("title", article.getString("title"));
        dataMap.put("source", source);
        dataMap.put("url", article.getString("url"));
        dataMap.put("time", time);
        dataMap.put("summary", article.getString("summary"));
        dataMap.put("read", read);
        return dataMap;
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment