Commit 982e420e by leiliangliang

增加知乎采集循环次数

parent 7151cb11
...@@ -12,7 +12,6 @@ import lombok.extern.log4j.Log4j2; ...@@ -12,7 +12,6 @@ import lombok.extern.log4j.Log4j2;
import okhttp3.Request; import okhttp3.Request;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
...@@ -27,22 +26,22 @@ import org.jsoup.select.Elements; ...@@ -27,22 +26,22 @@ import org.jsoup.select.Elements;
import static java.util.Objects.nonNull; import static java.util.Objects.nonNull;
/** /**
* @author hero
* @ClassName: ZhihuHotCrawler * @ClassName: ZhihuHotCrawler
* @Description: 知乎热搜采集程序 * @Description: 知乎热搜采集程序
* @author hero
* @date 2017年9月15日 上午10:54:31 * @date 2017年9月15日 上午10:54:31
*/ */
@Log4j2 @Log4j2
public class ZhihuHotSearchCrawler { public class ZhihuHotSearchCrawler {
private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).build(); private static HttpBoot httpBoot = HttpBoot.newBuilder().retryTimes(3).build();
//private static HttpBoot httpBoot = HttpBoot.newBuilder().sslProvider(SslProvider.CONSCRYPT).retryTimes(3).build(); //private static HttpBoot httpBoot = HttpBoot.newBuilder().sslProvider(SslProvider.CONSCRYPT).retryTimes(3).build();
/** /**
* @Title: getZhihuHotList * @Title: getZhihuHotList
* @author hero * @author hero
* @Description: 知乎热搜采集程序 * @Description: 知乎热搜采集程序
* @return void 返回类型 * @return void 返回类型
*/ */
// public static List<HotSearchList> getZhihuHotList(){ // public static List<HotSearchList> getZhihuHotList(){
// List<HotSearchList> list = null; // List<HotSearchList> list = null;
// String url = "https://www.zhihu.com/api/v4/search/top_search"; // String url = "https://www.zhihu.com/api/v4/search/top_search";
...@@ -81,121 +80,127 @@ public class ZhihuHotSearchCrawler { ...@@ -81,121 +80,127 @@ public class ZhihuHotSearchCrawler {
// } // }
/**
* @param @return 设定文件
* @return List<ZhihuHotSearch> 返回类型
* @Title: getMobileZhihuHotList
* @author hero
* @Description: 移動端知乎熱搜榜
*/
public static List<HotSearchList> getMobileZhihuHotList(Date date) {
List<HotSearchList> list = new ArrayList<>();
String url = "https://api.zhihu.com/topstory/hot-list?limit=50&reverse_order=0";
Map<String, String> headerMap = HeaderTool.getCommonHead();
headerMap.put("Host", "api.zhihu.com");
headerMap.put("Referer", url);
headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36");
headerMap.put("X-UDID", "AFAC3hv3vgyPTt9ZmNmqTm0yv_8NKY3S3z8=");
headerMap.put("authorization", "oauth c3cef7c66a1843f8b3a9e6a1e3160e20");
String htmlBody = null;
Request request = RequestUtils.wrapGet(url, headerMap);
for (int x = 0; x <= 5; x++) {
Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
if (response.hasCause()) {
Throwable cause = response.cause();
log.debug("获取知乎热搜时出现问题:{}", cause);
continue;
} else {
htmlBody = response.bodyString();
if (!htmlBody.contains("author")) {
continue;
}
}
try {
if (htmlBody != null && htmlBody.contains("author")) {
JSONObject topSearch = JSONObject.parseObject(htmlBody);
JSONArray dataJson = topSearch.getJSONArray("data");
String link = null;
String displayQuery = null;
Long hotCount = null;
String hotText = null;
for (int i = 0; i < dataJson.size(); i++) {
JSONObject data = dataJson.getJSONObject(i).getJSONObject("target");
displayQuery = data.getString("title");
link = "https://www.zhihu.com/question/" + data.getLongValue("id");
hotText = dataJson.getJSONObject(i).getString("detail_text");
/** //计算热度
* @Title: getMobileZhihuHotList try {
* @author hero if (hotText.contains("万")) {
* @Description: 移動端知乎熱搜榜 hotText = hotText.replaceAll("万.*", "").trim();
* @param @return 设定文件 hotCount = (long) (Double.parseDouble(hotText) * 10000);
* @return List<ZhihuHotSearch> 返回类型 } else if (hotText.contains("亿")) {
*/ hotText = hotText.replaceAll("亿.*", "").trim();
public static List<HotSearchList> getMobileZhihuHotList(Date date){ hotCount = (long) (Double.parseDouble(hotText) * 100000000);
List<HotSearchList> list = new ArrayList<>(); } else {
String url = "https://api.zhihu.com/topstory/hot-list?limit=50&reverse_order=0"; hotCount = Long.getLong(hotText);
Map<String,String> headerMap = HeaderTool.getCommonHead(); }
headerMap.put("Host", "api.zhihu.com"); } catch (Exception e) {
headerMap.put("Referer", url); e.printStackTrace();
headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"); }
headerMap.put("X-UDID", "AFAC3hv3vgyPTt9ZmNmqTm0yv_8NKY3S3z8="); org.bson.Document doc = getTag(link);
headerMap.put("authorization", "oauth c3cef7c66a1843f8b3a9e6a1e3160e20"); String tog = nonNull(doc.get("tag")) ? doc.getString("tag") : null;
String htmlBody = null; Long view = nonNull(doc.get("view")) ? Long.valueOf(doc.get("view").toString()) : null;
Request request = RequestUtils.wrapGet(url, headerMap); Long fans = nonNull(doc.get("fans")) ? Long.valueOf(doc.get("fans").toString()) : null;
Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY); HotSearchList zhihu = new HotSearchList(link, displayQuery, hotCount, i + 1, HotSearchType.知乎热搜.name(), date);
if (response.hasCause()){ zhihu.setFans(fans);
Throwable cause = response.cause(); zhihu.setView(view);
log.debug("获取知乎热搜时出现问题:{}",cause); zhihu.setTag(tog);
return list; list.add(zhihu);
}else { }
htmlBody = response.bodyString(); return list;
} }
try { } catch (Exception e) {
if (htmlBody != null && htmlBody.contains("author")) { log.info("知乎热搜解析异常", e);
JSONObject topSearch = JSONObject.parseObject(htmlBody); }
JSONArray dataJson = topSearch.getJSONArray("data"); }
String link = null; return list;
String displayQuery = null; }
Long hotCount = null;
String hotText = null;
for (int i = 0; i < dataJson.size(); i++) {
JSONObject data = dataJson.getJSONObject(i).getJSONObject("target");
displayQuery = data.getString("title");
link = "https://www.zhihu.com/question/" + data.getLongValue("id");
hotText = dataJson.getJSONObject(i).getString("detail_text");
//计算热度
try {
if (hotText.contains("万")) {
hotText = hotText.replaceAll("万.*", "").trim();
hotCount = (long) (Double.parseDouble(hotText) * 10000);
} else if (hotText.contains("亿")) {
hotText = hotText.replaceAll("亿.*", "").trim();
hotCount = (long) (Double.parseDouble(hotText) * 100000000);
} else {
hotCount = Long.getLong(hotText);
}
} catch (Exception e) {
e.printStackTrace();
}
org.bson.Document doc = getTag(link);
String tog = nonNull(doc.get("tag")) ? doc.getString("tag") : null;
Long view = nonNull(doc.get("view")) ? Long.valueOf(doc.get("view").toString()) : null;
Long fans = nonNull(doc.get("fans")) ? Long.valueOf(doc.get("fans").toString()) : null;
HotSearchList zhihu = new HotSearchList(link, displayQuery, hotCount, i + 1, HotSearchType.知乎热搜.name(),date);
zhihu.setFans(fans);
zhihu.setView(view);
zhihu.setTag(tog);
list.add(zhihu);
}
}
} catch (Exception e) {
log.info("知乎热搜解析异常",e);
}
return list;
}
//访问pc端 获取标签及浏览量关注数 //访问pc端 获取标签及浏览量关注数
private static org.bson.Document getTag(String url) { private static org.bson.Document getTag(String url) {
org.bson.Document doc = new org.bson.Document(); org.bson.Document doc = new org.bson.Document();
doc.put("tag",null); doc.put("tag", null);
//浏览量 //浏览量
doc.put("view",null); doc.put("view", null);
//粉丝 //粉丝
doc.put("fans",null); doc.put("fans", null);
Map<String,String> Map = HeaderTool.getCommonHead(); Map<String, String> Map = HeaderTool.getCommonHead();
Map.put("cookie", "_xsrf=7NFWM5qBcOutfs8MaW7bhQQH65t3Xia4"); Map.put("cookie", "_xsrf=7NFWM5qBcOutfs8MaW7bhQQH65t3Xia4");
Request request = RequestUtils.wrapGet(url,Map); Request request = RequestUtils.wrapGet(url, Map);
try { try {
Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY); Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
if (response.hasCause()){ if (response.hasCause()) {
Throwable cause = response.cause(); Throwable cause = response.cause();
log.error("单条知乎热搜数据页面连接失败",cause); log.error("单条知乎热搜数据页面连接失败", cause);
return doc; return doc;
}else { } else {
String htmlBody = response.bodyString(); String htmlBody = response.bodyString();
if (htmlBody != null && htmlBody.contains("QuestionHeader")) { if (htmlBody != null && htmlBody.contains("QuestionHeader")) {
Document document = Jsoup.parse(htmlBody); Document document = Jsoup.parse(htmlBody);
//获取标签 //获取标签
String label=""; String label = "";
Elements select = document.select("div.Tag"); Elements select = document.select("div.Tag");
for (Element element : select) { for (Element element : select) {
String text = "`"+element.select("div.Popover").text()+";"; String text = "`" + element.select("div.Popover").text() + ";";
label=label+text; label = label + text;
} }
doc.put("tag",label.trim()); doc.put("tag", label.trim());
String strong = document.select("div.NumberBoard-itemInner").select("strong").text(); String strong = document.select("div.NumberBoard-itemInner").select("strong").text();
String[] count = strong.split(" "); String[] count = strong.split(" ");
//获取关注数 //获取关注数
doc.put("fans",Long.valueOf(count[0].replaceAll(",","").trim())); doc.put("fans", Long.valueOf(count[0].replaceAll(",", "").trim()));
//获取浏览量 //获取浏览量
doc.put("view",Long.valueOf(count[1].replaceAll(",","").trim())); doc.put("view", Long.valueOf(count[1].replaceAll(",", "").trim()));
return doc; return doc;
}else { } else {
return doc; return doc;
} }
} }
} catch (Exception e) { } catch (Exception e) {
log.info("知乎热搜标签解析异常",e); log.info("知乎热搜标签解析异常", e);
} }
return doc; return doc;
} }
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment