Commit 982e420e by leiliangliang

增加知乎采集循环次数

parent 7151cb11
......@@ -12,7 +12,6 @@ import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
......@@ -27,22 +26,22 @@ import org.jsoup.select.Elements;
import static java.util.Objects.nonNull;
/**
* Crawler for the Zhihu hot-search list.
*
* @ClassName: ZhihuHotSearchCrawler
* @author hero
* @date 2017-09-15 10:54:31
*/
@Log4j2
public class ZhihuHotSearchCrawler {
// Shared HTTP client; built with retryTimes(3).
private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).build();
// Alternative configuration with an explicit SSL provider, currently disabled:
//private static HttpBoot httpBoot = HttpBoot.newBuilder().sslProvider(SslProvider.CONSCRYPT).retryTimes(3).build();
/**
* @Title: getZhihuHotList
* @author hero
* @Description: 知乎热搜采集程序
* @return void 返回类型
*/
// Shared HTTP client; built with retryTimes(3).
private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).build();
// Alternative configuration with an explicit SSL provider, currently disabled:
//private static HttpBoot httpBoot = HttpBoot.newBuilder().sslProvider(SslProvider.CONSCRYPT).retryTimes(3).build();
/**
* @Title: getZhihuHotList
* @author hero
* @Description: 知乎热搜采集程序
* @return void 返回类型
*/
// public static List<HotSearchList> getZhihuHotList(){
// List<HotSearchList> list = null;
// String url = "https://www.zhihu.com/api/v4/search/top_search";
......@@ -81,121 +80,127 @@ public class ZhihuHotSearchCrawler {
// }
/**
 * Fetches the Zhihu mobile hot list and converts every entry into a {@code HotSearchList}.
 *
 * <p>The request is attempted up to six times (x = 0..5). An attempt is abandoned and retried
 * when the call reports a cause or when the response body is missing or does not contain the
 * marker text "author".
 *
 * @param date value handed to every {@code HotSearchList} that is created
 * @return the entries that were parsed; empty when no attempt produced any
 * @author hero
 */
public static List<HotSearchList> getMobileZhihuHotList(Date date) {
List<HotSearchList> list = new ArrayList<>();
String url = "https://api.zhihu.com/topstory/hot-list?limit=50&reverse_order=0";
Map<String, String> headerMap = HeaderTool.getCommonHead();
headerMap.put("Host", "api.zhihu.com");
headerMap.put("Referer", url);
headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36");
headerMap.put("X-UDID", "AFAC3hv3vgyPTt9ZmNmqTm0yv_8NKY3S3z8=");
headerMap.put("authorization", "oauth c3cef7c66a1843f8b3a9e6a1e3160e20");
String htmlBody = null;
Request request = RequestUtils.wrapGet(url, headerMap);
for (int x = 0; x <= 5; x++) {
Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
if (response.hasCause()) {
Throwable cause = response.cause();
log.debug("获取知乎热搜时出现问题:{}", cause);
continue;
}
htmlBody = response.bodyString();
// Null-safe guard: this check runs outside the try block, so a null body would
// otherwise escape the method as a NullPointerException instead of being retried.
if (htmlBody == null || !htmlBody.contains("author")) {
continue;
}
try {
// Always true after the guard above; the condition is kept so the block structure is unchanged.
if (htmlBody != null && htmlBody.contains("author")) {
JSONObject topSearch = JSONObject.parseObject(htmlBody);
JSONArray dataJson = topSearch.getJSONArray("data");
// Discard entries left behind by an earlier attempt that failed part-way through,
// so a retry cannot append a second copy of the same ranks.
list.clear();
String link = null;
String displayQuery = null;
Long hotCount = null;
String hotText = null;
for (int i = 0; i < dataJson.size(); i++) {
JSONObject data = dataJson.getJSONObject(i).getJSONObject("target");
displayQuery = data.getString("title");
link = "https://www.zhihu.com/question/" + data.getLongValue("id");
hotText = dataJson.getJSONObject(i).getString("detail_text");
/**
* Fetches the Zhihu mobile hot list and converts every entry into a {@code HotSearchList}.
*
* <p>A single request is made; when the call reports a cause the empty list is returned
* immediately.
*
* @param date value handed to every {@code HotSearchList} that is created
* @return the entries that were parsed; empty when the request fails or nothing is parsed
* @author hero
*/
public static List<HotSearchList> getMobileZhihuHotList(Date date){
List<HotSearchList> list = new ArrayList<>();
String url = "https://api.zhihu.com/topstory/hot-list?limit=50&reverse_order=0";
Map<String,String> headerMap = HeaderTool.getCommonHead();
headerMap.put("Host", "api.zhihu.com");
headerMap.put("Referer", url);
headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36");
headerMap.put("X-UDID", "AFAC3hv3vgyPTt9ZmNmqTm0yv_8NKY3S3z8=");
// NOTE(review): credential-like header value is hard-coded in source — confirm this is intended.
headerMap.put("authorization", "oauth c3cef7c66a1843f8b3a9e6a1e3160e20");
String htmlBody = null;
Request request = RequestUtils.wrapGet(url, headerMap);
Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
if (response.hasCause()){
// The call failed: log the cause and give up with the empty list.
Throwable cause = response.cause();
log.debug("获取知乎热搜时出现问题:{}",cause);
return list;
}else {
htmlBody = response.bodyString();
}
try {
// Only bodies containing the marker text "author" are parsed.
if (htmlBody != null && htmlBody.contains("author")) {
JSONObject topSearch = JSONObject.parseObject(htmlBody);
JSONArray dataJson = topSearch.getJSONArray("data");
String link = null;
String displayQuery = null;
Long hotCount = null;
String hotText = null;
for (int i = 0; i < dataJson.size(); i++) {
// Title and id come from the entry's "target" object; the heat text from the entry itself.
JSONObject data = dataJson.getJSONObject(i).getJSONObject("target");
displayQuery = data.getString("title");
link = "https://www.zhihu.com/question/" + data.getLongValue("id");
hotText = dataJson.getJSONObject(i).getString("detail_text");
// Convert the heat text into a number: "万" scales by 10,000, "亿" by 100,000,000,
// anything else is parsed as a plain integer.
// Reset first so a failed parse cannot reuse the value computed for the previous entry.
hotCount = null;
try {
if (hotText.contains("万")) {
hotText = hotText.replaceAll("万.*", "").trim();
hotCount = (long) (Double.parseDouble(hotText) * 10000);
} else if (hotText.contains("亿")) {
hotText = hotText.replaceAll("亿.*", "").trim();
hotCount = (long) (Double.parseDouble(hotText) * 100000000);
} else {
// Long.getLong(String) looks up a system property with that name;
// Long.valueOf parses the text itself.
hotCount = Long.valueOf(hotText.trim());
}
} catch (Exception e) {
// Best effort: the entry is still recorded, with a null heat value.
log.warn("知乎热搜热度解析异常: {}", hotText, e);
}
// Tag, view count and follower count come from the question page.
org.bson.Document doc = getTag(link);
String tog = nonNull(doc.get("tag")) ? doc.getString("tag") : null;
Long view = nonNull(doc.get("view")) ? Long.valueOf(doc.get("view").toString()) : null;
Long fans = nonNull(doc.get("fans")) ? Long.valueOf(doc.get("fans").toString()) : null;
// Rank is the 1-based position in the response array.
HotSearchList zhihu = new HotSearchList(link, displayQuery, hotCount, i + 1, HotSearchType.知乎热搜.name(), date);
zhihu.setFans(fans);
zhihu.setView(view);
zhihu.setTag(tog);
list.add(zhihu);
}
// Every entry of this response was processed: stop retrying.
return list;
}
} catch (Exception e) {
log.info("知乎热搜解析异常", e);
}
}
return list;
}
// Convert the heat text into a number: "万" scales by 10,000, "亿" by 100,000,000,
// anything else is parsed as a plain integer.
// Reset first so a failed parse cannot reuse the value computed for the previous entry.
hotCount = null;
try {
if (hotText.contains("万")) {
hotText = hotText.replaceAll("万.*", "").trim();
hotCount = (long) (Double.parseDouble(hotText) * 10000);
} else if (hotText.contains("亿")) {
hotText = hotText.replaceAll("亿.*", "").trim();
hotCount = (long) (Double.parseDouble(hotText) * 100000000);
} else {
// Long.getLong(String) looks up a system property with that name;
// Long.valueOf parses the text itself.
hotCount = Long.valueOf(hotText.trim());
}
} catch (Exception e) {
// Best effort: the entry is still recorded, with a null heat value.
log.warn("知乎热搜热度解析异常: {}", hotText, e);
}
// Tag, view count and follower count come from the question page.
org.bson.Document doc = getTag(link);
String tog = nonNull(doc.get("tag")) ? doc.getString("tag") : null;
Long view = nonNull(doc.get("view")) ? Long.valueOf(doc.get("view").toString()) : null;
Long fans = nonNull(doc.get("fans")) ? Long.valueOf(doc.get("fans").toString()) : null;
// Rank is the 1-based position in the response array.
HotSearchList zhihu = new HotSearchList(link, displayQuery, hotCount, i + 1, HotSearchType.知乎热搜.name(),date);
zhihu.setFans(fans);
zhihu.setView(view);
zhihu.setTag(tog);
list.add(zhihu);
}
}
} catch (Exception e) {
log.info("知乎热搜解析异常",e);
}
return list;
}
/**
 * Requests the desktop question page for one hot-list entry and extracts its tags,
 * follower count and view count.
 *
 * <p>The returned document always holds the keys {@code tag}, {@code view} and {@code fans}.
 * A key keeps the value {@code null} when the request fails, when the page does not contain
 * the marker text "QuestionHeader", or when its value is missing or cannot be parsed.
 *
 * @param url address of the question page
 * @return document with the keys tag / view / fans; never {@code null}
 */
private static org.bson.Document getTag(String url) {
    org.bson.Document doc = new org.bson.Document();
    doc.put("tag", null);
    // view count
    doc.put("view", null);
    // follower count
    doc.put("fans", null);
    // Named "headers" so the local no longer shadows the java.util.Map type name.
    Map<String, String> headers = HeaderTool.getCommonHead();
    headers.put("cookie", "_xsrf=7NFWM5qBcOutfs8MaW7bhQQH65t3Xia4");
    Request request = RequestUtils.wrapGet(url, headers);
    try {
        Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
        if (response.hasCause()) {
            log.error("单条知乎热搜数据页面连接失败", response.cause());
            return doc;
        }
        String htmlBody = response.bodyString();
        if (htmlBody == null || !htmlBody.contains("QuestionHeader")) {
            return doc;
        }
        Document document = Jsoup.parse(htmlBody);
        // Tags are joined as "`name;" per tag element.
        StringBuilder label = new StringBuilder();
        for (Element element : document.select("div.Tag")) {
            label.append('`').append(element.select("div.Popover").text()).append(';');
        }
        doc.put("tag", label.toString().trim());
        String strong = document.select("div.NumberBoard-itemInner").select("strong").text();
        String[] count = strong.split(" ");
        // First number: followers. Second number: views. A missing number leaves the key null
        // instead of relying on an exception to skip it.
        if (count.length > 0 && !count[0].trim().isEmpty()) {
            doc.put("fans", Long.valueOf(count[0].replace(",", "").trim()));
        }
        if (count.length > 1 && !count[1].trim().isEmpty()) {
            doc.put("view", Long.valueOf(count[1].replace(",", "").trim()));
        }
    } catch (Exception e) {
        // Malformed numbers or unexpected markup: keep whatever was extracted so far.
        log.info("知乎热搜标签解析异常", e);
    }
    return doc;
}
/**
 * Requests the desktop question page for one hot-list entry and extracts its tags,
 * follower count and view count.
 *
 * <p>The returned document always holds the keys {@code tag}, {@code view} and {@code fans}.
 * A key keeps the value {@code null} when the request fails, when the page does not contain
 * the marker text "QuestionHeader", or when its value is missing or cannot be parsed.
 *
 * @param url address of the question page
 * @return document with the keys tag / view / fans; never {@code null}
 */
private static org.bson.Document getTag(String url) {
    org.bson.Document doc = new org.bson.Document();
    doc.put("tag", null);
    // view count
    doc.put("view", null);
    // follower count
    doc.put("fans", null);
    // Named "headers" so the local no longer shadows the java.util.Map type name.
    Map<String, String> headers = HeaderTool.getCommonHead();
    headers.put("cookie", "_xsrf=7NFWM5qBcOutfs8MaW7bhQQH65t3Xia4");
    Request request = RequestUtils.wrapGet(url, headers);
    try {
        Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
        if (response.hasCause()) {
            log.error("单条知乎热搜数据页面连接失败", response.cause());
            return doc;
        }
        String htmlBody = response.bodyString();
        if (htmlBody == null || !htmlBody.contains("QuestionHeader")) {
            return doc;
        }
        Document document = Jsoup.parse(htmlBody);
        // Tags are joined as "`name;" per tag element.
        StringBuilder label = new StringBuilder();
        for (Element element : document.select("div.Tag")) {
            label.append('`').append(element.select("div.Popover").text()).append(';');
        }
        doc.put("tag", label.toString().trim());
        String strong = document.select("div.NumberBoard-itemInner").select("strong").text();
        String[] count = strong.split(" ");
        // First number: followers. Second number: views. A missing number leaves the key null
        // instead of relying on an exception to skip it.
        if (count.length > 0 && !count[0].trim().isEmpty()) {
            doc.put("fans", Long.valueOf(count[0].replace(",", "").trim()));
        }
        if (count.length > 1 && !count[1].trim().isEmpty()) {
            doc.put("view", Long.valueOf(count[1].replace(",", "").trim()));
        }
    } catch (Exception e) {
        // Malformed numbers or unexpected markup: keep whatever was extracted so far.
        log.info("知乎热搜标签解析异常", e);
    }
    return doc;
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment