Commit f9669513 by win 10
parents f9343985 27c7475b
......@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>articlenewscrawler</artifactId>
<version>0.2.2-SNAPSHOT</version>
<version>0.2.3-SNAPSHOT</version>
<name>articlenewscrawler</name>
<description>采集凤凰,一点资讯,搜狐历史文章和文章评论</description>
......
package com.zhiwei.parse;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import okhttp3.Response;
/**
 * Collects articles published by a BTime (北京时间) account.
 *
 * @author 0xff
 * @date 2019-12-03 11:06:29
 */
public class BTime {
    private static final Logger logger = LoggerFactory.getLogger(BTime.class);
    private static final HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(1).throwException(false).build();
    /** Number of consecutive failed page requests after which collection gives up. */
    private static final int MAX_CONSECUTIVE_ERRORS = 3;

    /**
     * Collects the article history of a BTime account, page by page.
     *
     * <p>Paging stops when the service returns no (or an empty) item list, when an item
     * older than {@code startTime} is met, or after {@value #MAX_CONSECUTIVE_ERRORS}
     * consecutive failures. Stopping at the first older item presumes the feed is ordered
     * newest first - TODO confirm against the API.
     *
     * @param uid       account id placed into the {@code uid} query parameter
     * @param startTime cutoff in epoch milliseconds (compared in seconds against the item's
     *                  {@code pdate}); {@code null} disables the cutoff
     * @return one map per article with the keys {@code url}, {@code title}, {@code content},
     *         {@code time} and {@code source}; never {@code null}
     */
    public static List<Map<String, Object>> getHistoryData(String uid, Long startTime) {
        List<Map<String, Object>> dataList = new ArrayList<>();
        Map<String, String> headers = new HashMap<>();
        boolean hasMore = true;
        int page = 1;
        int consecutiveErrors = 0;
        while (hasMore) {
            String url = "https://record.btime.com/getNews?tab=all&pageRow=20&uid=" + uid + "&refresh=" + page + "&target=v4&refresh_type=2&req_count=" + page + "&page=" + page;
            try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, headers), ProxyHolder.NAT_HEAVY_PROXY)) {
                String result = response.body().string();
                JSONObject json = JSONObject.parseObject(result);
                JSONArray jsonArray = json.getJSONObject("data").getJSONArray("data");
                // An empty page ends the feed as well; otherwise the loop would page forever.
                if (Objects.nonNull(jsonArray) && !jsonArray.isEmpty()) {
                    // Buffer the page so that a failure half-way through does not leave
                    // partial results behind that the retry would then add a second time.
                    List<Map<String, Object>> pageItems = new ArrayList<>();
                    for (int i = 0; i < jsonArray.size(); i++) {
                        JSONObject data = jsonArray.getJSONObject(i);
                        // NOTE(review): the cutoff reads "pdate" from the outer item while the
                        // stored time reads it from the nested "data" object - confirm that
                        // both fields exist and agree.
                        if (Objects.nonNull(startTime) && startTime / 1000 > data.getLongValue("pdate")) {
                            hasMore = false;
                            break;
                        }
                        JSONObject detail = data.getJSONObject("data");
                        Map<String, Object> map = new HashMap<>();
                        map.put("url", data.getString("url"));
                        map.put("title", detail.getString("title"));
                        map.put("content", stripTags(detail.getString("summary")));
                        map.put("time", new Date(detail.getLongValue("pdate") * 1000L));
                        map.put("source", detail.getString("source"));
                        pageItems.add(map);
                    }
                    dataList.addAll(pageItems);
                    logger.info("北京时间文章 采集到第{}页,一共采集到{}条",page,dataList.size());
                } else {
                    hasMore = false;
                }
                page++;
                consecutiveErrors = 0;
            } catch (Exception e) {
                logger.error("此轮错误解析",e);
                consecutiveErrors++;
                if (consecutiveErrors >= MAX_CONSECUTIVE_ERRORS) {
                    hasMore = false;
                }
            }
        }
        return dataList;
    }

    /** Removes HTML tags from {@code html}; a {@code null} input yields {@code null}. */
    private static String stripTags(String html) {
        return html == null ? null : html.replaceAll("<.*?>", "");
    }
}
......@@ -27,7 +27,7 @@ public class BiliBili {
List<Map<String,Object>> bodyList = new ArrayList<>();
try {
//
String url = "https://search.bilibili.com/all?keyword="+URLEncoder.encode(word, "utf-8")+"&single_column=1&order=stow&duration=0&tids_1=0";
String url = "https://search.bilibili.com/all?keyword="+URLEncoder.encode(word, "utf-8")+"&single_column=1&order=pubdate&duration=0&tids_1=0";
System.out.println(url);
Headers header = Headers.of("cookie",cookie,"Referer","https://www.bilibili.com/","Host","search.bilibili.com");
String result = HttpClient.executeHttpRequestGet(url, ProxyHolder.NAT_HEAVY_PROXY, header);
......
package com.zhiwei.parse;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.tools.timeparse.TimeParse;
import okhttp3.Response;
/**
 * Collects articles published by a Huxiu (虎嗅) member.
 *
 * @author 0xff
 * @date 2019-12-03 15:00:40
 */
public class Huxiu {
    private static final Logger logger = LoggerFactory.getLogger(Huxiu.class);
    private static final HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(1).throwException(false).build();
    /** Attempts made per list page before the whole collection is abandoned. */
    private static final int PAGE_ATTEMPTS = 5;
    /** Attempts made per HTTP request in {@link #getDocument(String, String)}. */
    private static final int REQUEST_ATTEMPTS = 4;
    /** Hard upper bound on the number of list pages visited. */
    private static final int MAX_PAGE = 500;

    /**
     * Walks the member's article list pages and collects one entry per article.
     *
     * <p>Collection stops when the page is flagged as the last one, when an article older
     * than {@code endTime} is met, when the page number exceeds {@value #MAX_PAGE}, when a
     * page contains no article elements, or when a page still fails after
     * {@value #PAGE_ATTEMPTS} attempts.
     *
     * @param uid     member id used in the list URL
     * @param endTime oldest publication time to collect, in a format understood by
     *                {@code TimeParse.stringFormartDate}
     * @param cookie  value sent as the {@code cookie} request header
     * @return one map per article with the keys {@code title}, {@code url}, {@code time},
     *         {@code uid}, {@code uname} and {@code source}; never {@code null}
     */
    public static List<Map<String, Object>> getHuXiuData(String uid, String endTime, String cookie) {
        List<Map<String, Object>> bodyList = new ArrayList<>();
        int page = 1;
        boolean next = true;
        while (next) {
            boolean pageDone = false;
            for (int i = 0; i < PAGE_ATTEMPTS && !pageDone; i++) {
                try {
                    String url = "https://www.huxiu.com/member/" + uid + "/article/" + page + ".html";
                    logger.info("重试次数:{},页数:{},地址:{}", i, page, url);
                    Document document = getDocument(url, cookie);
                    // Per the original author: an aria-label containing "Next" marks the last page.
                    // NOTE(review): the items of such a page are not collected - confirm this is intended.
                    String haveNext = document.select("a").attr("aria-label");
                    String uname = document.select("div.user-name").text();
                    Elements elements = document.select("div.message-box > div.mod-b.mod-art");
                    boolean lastPage = haveNext.contains("Next") || page > MAX_PAGE;
                    // A page without articles means there is nothing further to read; without
                    // this the page counter would keep growing forever.
                    boolean stop = elements.isEmpty();
                    // Buffer the page so a failure half-way through is not duplicated by the retry.
                    List<Map<String, Object>> pageItems = new ArrayList<>();
                    for (Element e : elements) {
                        String title = e.select("div.mob-ctt > h3 > a").text();
                        // Time shown in the list view.
                        String artTime = e.select("div.mob-author > span.time").text();
                        Date time = TimeParse.stringFormartDate(artTime);
                        String artUrl = "https://www.huxiu.com" + e.select("div.mob-ctt > h3 > a").attr("href");
                        // Stop at the last page or at the first article older than endTime.
                        if (lastPage || time.getTime() < TimeParse.stringFormartDate(endTime).getTime()) {
                            stop = true;
                            break;
                        }
                        Map<String, Object> map = new HashMap<>();
                        map.put("title", title);
                        map.put("url", artUrl);
                        map.put("time", time);
                        map.put("uid", uid);
                        map.put("uname", uname);
                        map.put("source", "虎嗅");
                        pageItems.add(map);
                    }
                    bodyList.addAll(pageItems);
                    if (stop) {
                        next = false;
                    }
                    page++;
                    pageDone = true;
                } catch (Exception e) {
                    logger.error("解析数据失败", e);
                }
            }
            if (!pageDone) {
                // Every attempt failed: give up instead of retrying the same page forever.
                next = false;
            }
        }
        return bodyList;
    }

    /**
     * Downloads {@code url} and parses the response body as HTML.
     *
     * @param url    page address
     * @param cookie value sent as the {@code cookie} request header
     * @return the parsed document, or {@code null} when all {@value #REQUEST_ATTEMPTS}
     *         attempts failed
     */
    public static Document getDocument(String url, String cookie) {
        Map<String, Object> headMap = new HashMap<>();
        headMap.put("cookie", cookie);
        for (int i = 0; i < REQUEST_ATTEMPTS; i++) {
            try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, headMap), ProxyHolder.NAT_HEAVY_PROXY)) {
                String htmlBody = response.body().string();
                return Jsoup.parse(htmlBody);
            } catch (Exception e) {
                logger.error("页面连接失败", e);
            }
        }
        return null;
    }
}
package com.zhiwei.parse;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import okhttp3.Response;
/**
 * Collects articles published by a Kuai Zixun (快资讯, look.360.cn) account.
 *
 * @author 0xff
 * @date 2019-12-03 14:13:32
 */
public class KuaiData {
    private static final Logger logger = LoggerFactory.getLogger(KuaiData.class);
    private static final HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(1).throwException(false).build();
    /** Number of consecutive failed page requests after which collection gives up. */
    private static final int MAX_CONSECUTIVE_ERRORS = 3;

    /**
     * Collects the article history of an account, page by page.
     *
     * <p>Paging stops when the service returns no (or an empty) {@code res} list, when an
     * item older than {@code startTime} is met, or after {@value #MAX_CONSECUTIVE_ERRORS}
     * consecutive failures. Stopping at the first older item presumes the feed is ordered
     * newest first - TODO confirm against the API.
     *
     * @param uid       value placed into the {@code u} query parameter
     * @param gid       value placed into the {@code gzh} query parameter
     * @param startTime cutoff in epoch milliseconds (compared in seconds against the item's
     *                  {@code p} field); {@code null} disables the cutoff
     * @return one map per article with the keys {@code url}, {@code title}, {@code time}
     *         and {@code source}; never {@code null}
     */
    public static List<Map<String, Object>> getArticleHistory(String uid, String gid, Long startTime) {
        List<Map<String, Object>> dataList = new ArrayList<>();
        Map<String, String> headers = new HashMap<>();
        boolean hasMore = true;
        int page = 1;
        int consecutiveErrors = 0;
        while (hasMore) {
            String url = "https://m.look.360.cn/api/getgzh?f=json&pg=" + page + "&scheme=https&callback=&sign=look&u=" + uid + "&n=30&sqid=&gzh=" + gid;
            try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, headers), ProxyHolder.NAT_HEAVY_PROXY)) {
                String result = response.body().string();
                JSONObject json = JSONObject.parseObject(result);
                JSONArray jsonArray = json.getJSONArray("res");
                if (Objects.nonNull(jsonArray) && !jsonArray.isEmpty()) {
                    // Buffer the page so that a failure half-way through does not leave
                    // partial results behind that the retry would then add a second time.
                    List<Map<String, Object>> pageItems = new ArrayList<>();
                    for (int i = 0; i < jsonArray.size(); i++) {
                        JSONObject data = jsonArray.getJSONObject(i);
                        if (Objects.nonNull(startTime) && startTime / 1000 > data.getLongValue("p")) {
                            hasMore = false;
                            break;
                        }
                        // An item without the "zmt" object yields a null source instead of
                        // failing the whole page.
                        JSONObject publisher = data.getJSONObject("zmt");
                        Map<String, Object> map = new HashMap<>();
                        map.put("url", data.getString("pcurl"));
                        map.put("title", data.getString("t"));
                        map.put("time", new Date(data.getLongValue("p") * 1000L));
                        map.put("source", publisher == null ? null : publisher.getString("name"));
                        pageItems.add(map);
                    }
                    dataList.addAll(pageItems);
                    logger.info("快资讯文章 采集到第{}页,一共采集到{}条",page,dataList.size());
                } else {
                    hasMore = false;
                }
                consecutiveErrors = 0;
                page++;
            } catch (Exception e) {
                logger.error("此轮错误解析",e);
                consecutiveErrors++;
                if (consecutiveErrors >= MAX_CONSECUTIVE_ERRORS) {
                    hasMore = false;
                }
            }
        }
        return dataList;
    }
}
package com.zhiwei.parse;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import org.apache.commons.lang3.time.FastDateFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import okhttp3.Response;
/**
 * Collects articles published by a ZAKER channel.
 *
 * @author 0xff
 * @date 2019-12-03 11:29:20
 */
public class MyZaker {
    // Was created with BTime.class, which attributed every log line to the wrong class.
    private static final Logger logger = LoggerFactory.getLogger(MyZaker.class);
    private static final HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(1).throwException(false).build();
    private static final FastDateFormat fdf = FastDateFormat.getInstance("yyyy-MM-dd HH:mm:ss");
    /** Number of consecutive failed page requests after which collection gives up. */
    private static final int MAX_CONSECUTIVE_ERRORS = 5;

    /**
     * Collects the article history of a ZAKER channel, 20 items per request.
     *
     * <p>Paging stops when the service returns no (or an empty) item list, when an item
     * older than {@code startTime} is met, or after {@value #MAX_CONSECUTIVE_ERRORS}
     * consecutive failures. Stopping at the first older item presumes the feed is ordered
     * newest first - TODO confirm against the API.
     *
     * @param uid       value placed into the {@code app_id} query parameter
     * @param startTime cutoff in epoch milliseconds, compared against the article's
     *                  {@code date} field parsed as {@code yyyy-MM-dd HH:mm:ss};
     *                  {@code null} disables the cutoff
     * @return one map per article with the keys {@code url}, {@code title}, {@code content},
     *         {@code time} and {@code source}; never {@code null}
     */
    public static List<Map<String, Object>> getHistoryData(String uid, Long startTime) {
        List<Map<String, Object>> dataList = new ArrayList<>();
        Map<String, String> headers = new HashMap<>();
        boolean hasMore = true;
        int page = 0;
        int consecutiveErrors = 0;
        while (hasMore) {
            String url = "http://iphone.myzaker.com/zaker/flock.php?app_id=" + uid + "&since_date=&nt=1&otimestamp=1&next_id=&_appid=&start=" + page * 20 + "&p_num=1&_version=8.5";
            try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, headers), ProxyHolder.NAT_HEAVY_PROXY)) {
                String result = response.body().string();
                JSONObject json = JSONObject.parseObject(result);
                JSONArray jsonArray = json.getJSONObject("data").getJSONArray("list");
                // An empty page ends the feed as well; otherwise the loop would page forever.
                if (Objects.nonNull(jsonArray) && !jsonArray.isEmpty()) {
                    // Buffer the page so that a failure half-way through does not leave
                    // partial results behind that the retry would then add a second time.
                    List<Map<String, Object>> pageItems = new ArrayList<>();
                    for (int i = 0; i < jsonArray.size(); i++) {
                        JSONObject data = jsonArray.getJSONObject(i);
                        JSONObject article = data.getJSONObject("item_open_info").getJSONObject("article");
                        // Parsed once and reused for both the cutoff test and the stored value.
                        java.util.Date publishedAt = fdf.parse(article.getString("date"));
                        if (Objects.nonNull(startTime) && startTime > publishedAt.getTime()) {
                            hasMore = false;
                            break;
                        }
                        Map<String, Object> map = new HashMap<>();
                        map.put("url", "http://www.myzaker.com/article/" + data.getString("pk"));
                        map.put("title", data.getString("title"));
                        map.put("content", stripTags(article.getString("content_desc")));
                        map.put("time", publishedAt);
                        map.put("source", article.getString("auther_name"));
                        pageItems.add(map);
                    }
                    dataList.addAll(pageItems);
                    logger.info("zaker文章 采集到第{}页,一共采集到{}条",page,dataList.size());
                } else {
                    hasMore = false;
                }
                page++;
                consecutiveErrors = 0;
            } catch (Exception e) {
                logger.error("此轮错误解析",e);
                consecutiveErrors++;
                if (consecutiveErrors >= MAX_CONSECUTIVE_ERRORS) {
                    hasMore = false;
                }
            }
        }
        return dataList;
    }

    /** Removes HTML tags from {@code html}; a {@code null} input yields {@code null}. */
    private static String stripTags(String html) {
        return html == null ? null : html.replaceAll("<.*?>", "");
    }
}
......@@ -8,6 +8,7 @@ import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
......@@ -15,6 +16,7 @@ import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.parse.analysis.XueqiuKeyWordAnalysis;
import com.zhiwei.tools.timeparse.TimeParse;
......@@ -104,6 +106,7 @@ public class Xueqiu {
* @Description 雪球历史文章采集
* @return
*/
@Deprecated
public static List<Map<String,Object>> getXueqiuAccountData(String userId,String cookie,Proxy proxy) {
Map<String,Object> headers = new HashMap<>();
headers.put("cookie", cookie);
......@@ -153,4 +156,103 @@ public class Xueqiu {
return bodyList;
}
/**
 * Collects the timeline posts of a Xueqiu (雪球) user, page by page.
 *
 * <p>Collection stops at the first post older than {@code endTime}, when the current page
 * reaches the {@code maxPage} reported by the service, or when a page still fails after
 * four attempts.
 *
 * @param uid     user id placed into the {@code user_id} query parameter
 * @param endTime oldest publication time to collect, in a format understood by
 *                {@code TimeParse.stringFormartDate}
 * @param cookie  value sent as the {@code Cookie} request header
 * @return one map per post with the keys {@code screenName}, {@code uid}, {@code time},
 *         {@code source}, {@code description}, {@code retweetCount}, {@code replyCount},
 *         {@code likeCount}, {@code targetLink} and {@code pt}; never {@code null}
 */
public static List<Map<String, Object>> getData(String uid, String endTime, String cookie) {
    List<Map<String, Object>> resultList = new ArrayList<>();
    int page = 1;
    boolean next = true;
    while (next) {
        boolean pageDone = false;
        for (int j = 0; j < 4 && !pageDone; j++) {
            try {
                String url = "https://xueqiu.com/v4/statuses/user_timeline.json?page=" + page + "&user_id=" + uid + "&type=0";
                logger.info("重试次数:{},第{}页,JSON地址为:{}", j, page, url);
                JSONObject json = getJson(url, cookie);
                JSONArray jsonArray = json.getJSONArray("statuses");
                // Buffer the page so a failure half-way through is not duplicated by the retry.
                List<Map<String, Object>> pageItems = new ArrayList<>();
                boolean reachedCutoff = false;
                if (Objects.nonNull(jsonArray)) {
                    // Parsed once per page rather than once per post.
                    Date endDate = TimeParse.stringFormartDate(endTime);
                    for (int i = 0; i < jsonArray.size(); i++) {
                        JSONObject ob = jsonArray.getJSONObject(i);
                        Date date = TimeParse.stringFormartDate(ob.getString("timeBefore"));
                        // Only posts inside the requested time window are collected.
                        if (date.getTime() < endDate.getTime()) {
                            reachedCutoff = true;
                            break;
                        }
                        String screenName = ob.getJSONObject("user").getString("screen_name");
                        String source = ob.getString("source");
                        String description = ob.getString("description").replaceAll("<.*?>", "");
                        int retweetCount = ob.getInteger("retweet_count");
                        int replyCount = ob.getInteger("reply_count");
                        int likeCount = ob.getInteger("like_count");
                        String targetLink = "https://xueqiu.com" + ob.getString("target");
                        Map<String, Object> map = new HashMap<>();
                        map.put("screenName", screenName); // author name
                        map.put("uid", uid); // requested user id
                        map.put("time", date); // publication time
                        map.put("source", source); // client/platform the post was sent from
                        map.put("description", description); // post text with HTML tags removed
                        map.put("retweetCount", retweetCount); // number of reposts
                        map.put("replyCount", replyCount); // number of comments
                        map.put("likeCount", likeCount); // number of likes
                        map.put("targetLink", targetLink); // link to the post
                        map.put("pt", "雪球"); // platform name
                        pageItems.add(map);
                    }
                }
                boolean morePages = false;
                if (!reachedCutoff) {
                    Integer maxPage = json.getInteger("maxPage");
                    if (maxPage == null) {
                        // Treated as a failed request so that the page is retried.
                        throw new IllegalStateException("response has no maxPage: " + url);
                    }
                    morePages = page < maxPage;
                }
                resultList.addAll(pageItems);
                if (morePages) {
                    page++;
                } else {
                    next = false;
                }
                pageDone = true;
            } catch (Exception e) {
                logger.error("解析JSON出错 ", e);
            }
        }
        if (!pageDone) {
            // Every attempt failed: give up instead of retrying the same page forever.
            next = false;
        }
    }
    return resultList;
}
/**
 * Downloads {@code url} and parses the response body as a JSON object.
 *
 * <p>The request is attempted up to five times and the first successfully parsed
 * response is returned. Previously the loop never exited on success, so every call
 * issued five requests, and failures were silently discarded.
 *
 * @param url    address to request
 * @param cookie value sent as the {@code Cookie} request header
 * @return the parsed response, or an empty {@code JSONObject} when every attempt failed
 */
public static JSONObject getJson(String url, String cookie) {
    HashMap<String, Object> headMap = new HashMap<>();
    headMap.put("Cookie", cookie);
    for (int i = 0; i < 5; i++) {
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, headMap), ProxyHolder.NAT_HEAVY_PROXY)) {
            String htmlBody = response.body().string();
            JSONObject parsed = JSONObject.parseObject(htmlBody);
            if (parsed != null) {
                return parsed;
            }
        } catch (Exception e) {
            logger.error("页面连接失败", e);
        }
    }
    return new JSONObject();
}
}
package com.zhiwei.parse;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import okhttp3.Response;
/**
 * Collects articles published by a Zhihu (知乎) member.
 *
 * @author 0xff
 * @date 2019-12-03 10:20:17
 */
public class Zhihu {
    private static final Logger logger = LoggerFactory.getLogger(Zhihu.class);
    private static final HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(1).throwException(false).build();
    /** Number of consecutive failed page requests after which collection gives up. */
    private static final int MAX_CONSECUTIVE_ERRORS = 3;

    /**
     * Collects the article history of a Zhihu member, 20 items per request.
     *
     * <p>Paging stops when the response reports {@code paging.is_end}, when more items have
     * been collected than {@code paging.totals}, when the {@code data} list is missing or
     * empty, when an item older than {@code startTime} is met, or after
     * {@value #MAX_CONSECUTIVE_ERRORS} consecutive failures.
     *
     * @param uid       member identifier used in the API path
     * @param startTime cutoff in epoch milliseconds (compared in seconds against the item's
     *                  {@code created} field); {@code null} disables the cutoff
     * @return one map per article with the keys {@code url}, {@code title}, {@code content},
     *         {@code time} and {@code source}; never {@code null}
     */
    public static List<Map<String, Object>> getArticleHistory(String uid, Long startTime) {
        List<Map<String, Object>> dataList = new ArrayList<>();
        Map<String, String> headers = new HashMap<>();
        boolean hasMore = true;
        int page = 0;
        int consecutiveErrors = 0;
        while (hasMore) {
            String url = "https://www.zhihu.com/api/v4/members/" + uid + "/articles?include=data%5B*%5D.comment_count%2Csuggest_edit%2Cis_normal%2Cthumbnail_extra_info%2Cthumbnail%2Ccan_comment%2Ccomment_permission%2Cadmin_closed_comment%2Ccontent%2Cvoteup_count%2Ccreated%2Cupdated%2Cupvoted_followees%2Cvoting%2Creview_info%2Cis_labeled%2Clabel_info%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=" + page * 20 + "&limit=20&sort_by=created";
            try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url, headers), ProxyHolder.NAT_HEAVY_PROXY)) {
                String result = response.body().string();
                JSONObject json = JSONObject.parseObject(result);
                JSONObject paging = json.getJSONObject("paging");
                // Evaluated before this page's items are added, as before.
                boolean reachedEnd = paging.getBooleanValue("is_end") || dataList.size() > paging.getIntValue("totals");
                JSONArray jsonArray = json.getJSONArray("data");
                if (jsonArray == null || jsonArray.isEmpty()) {
                    // Nothing on this page: treat it as the end rather than paging on.
                    reachedEnd = true;
                } else {
                    // Buffer the page so that a failure half-way through does not leave
                    // partial results behind that the retry would then add a second time.
                    List<Map<String, Object>> pageItems = new ArrayList<>();
                    for (int i = 0; i < jsonArray.size(); i++) {
                        JSONObject data = jsonArray.getJSONObject(i);
                        if (Objects.nonNull(startTime) && startTime / 1000 > data.getLongValue("created")) {
                            reachedEnd = true;
                            break;
                        }
                        JSONObject author = data.getJSONObject("author");
                        Map<String, Object> map = new HashMap<>();
                        map.put("url", data.getString("url"));
                        map.put("title", data.getString("title"));
                        map.put("content", stripTags(data.getString("content")));
                        map.put("time", new Date(data.getLongValue("created") * 1000L));
                        map.put("source", author == null ? null : author.getString("name"));
                        pageItems.add(map);
                    }
                    dataList.addAll(pageItems);
                }
                logger.info("知乎文章 采集到第{}页,一共采集到{}条",page,dataList.size());
                // Applied only once the page was handled completely, so that a failing
                // final page is still retried.
                if (reachedEnd) {
                    hasMore = false;
                }
                consecutiveErrors = 0;
                page++;
            } catch (Exception e) {
                logger.error("此轮错误解析",e);
                consecutiveErrors++;
                if (consecutiveErrors >= MAX_CONSECUTIVE_ERRORS) {
                    hasMore = false;
                }
            }
        }
        return dataList;
    }

    /** Removes HTML tags from {@code html}; a {@code null} input yields {@code null}. */
    private static String stripTags(String html) {
        return html == null ? null : html.replaceAll("<.*?>", "");
    }
}
package com.zhiwei.hsitory;
import java.util.List;
import java.util.Map;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.parse.BTime;
/** Manual example: initialises the proxy factory, then collects the history of one BTime account. */
public class BTimeHistoryExample {
    /** Address string handed to {@code ProxyFactory.init}. */
    private static final String REGISTRY = "zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181";
    /** BTime account whose articles are collected. */
    private static final String ACCOUNT_ID = "1608238";

    public static void main(String[] args) {
        ProxyFactory.init(REGISTRY, "local", GroupType.PROVIDER, 10000002L);
        // A start time of 0 collects the whole history.
        List<Map<String, Object>> articles = BTime.getHistoryData(ACCOUNT_ID, 0L);
        // articles.forEach(System.out::println);
    }
}
package com.zhiwei.hsitory;
import java.util.List;
import java.util.Map;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.parse.KuaiData;
/** Manual example: initialises the proxy factory, then collects the history of one Kuai Zixun account. */
public class KuaiDataHistoryExample {
    /** Address string handed to {@code ProxyFactory.init}. */
    private static final String REGISTRY = "zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181";
    /** First argument ({@code uid}) of {@code KuaiData.getArticleHistory}. */
    private static final String USER_ID = "5c19954ccb14fabc153971e3f924bf36";
    /** Second argument ({@code gid}) of {@code KuaiData.getArticleHistory}. */
    private static final String CHANNEL_ID = "2686798288";

    public static void main(String[] args) {
        ProxyFactory.init(REGISTRY, "local", GroupType.PROVIDER, 10000002L);
        // A start time of 0 collects the whole history.
        List<Map<String, Object>> articles = KuaiData.getArticleHistory(USER_ID, CHANNEL_ID, 0L);
        // articles.forEach(System.out::println);
    }
}
package com.zhiwei.hsitory;
import java.util.List;
import java.util.Map;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.parse.BTime;
import com.zhiwei.parse.MyZaker;
/** Manual example: initialises the proxy factory, then collects the history of one ZAKER channel. */
public class ZakerHistoryExample {
    /** Address string handed to {@code ProxyFactory.init}. */
    private static final String REGISTRY = "zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181";
    /** ZAKER channel whose articles are collected. */
    private static final String CHANNEL_ID = "13584";

    public static void main(String[] args) {
        ProxyFactory.init(REGISTRY, "local", GroupType.PROVIDER, 10000002L);
        // A start time of 0 collects the whole history.
        List<Map<String, Object>> articles = MyZaker.getHistoryData(CHANNEL_ID, 0L);
        // articles.forEach(System.out::println);
    }
}
package com.zhiwei.hsitory;
import java.util.List;
import java.util.Map;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.parse.Zhihu;
/** Manual example: initialises the proxy factory, then collects the article history of one Zhihu member. */
public class ZhihuArticleHistoryExample {
    /** Address string handed to {@code ProxyFactory.init}. */
    private static final String REGISTRY = "zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181";
    /** Zhihu member whose articles are collected. */
    private static final String MEMBER_ID = "da-bai-xin-wen-27";

    public static void main(String[] args) {
        ProxyFactory.init(REGISTRY, "local", GroupType.PROVIDER, 10000002L);
        // A start time of 0 collects the whole history.
        List<Map<String, Object>> articles = Zhihu.getArticleHistory(MEMBER_ID, 0L);
        // articles.forEach(System.out::println);
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment