Commit e800df88 by chenweitao

Merge branch 'working' into 'master'

增加知乎采集循环次数

See merge request !191
parents 1ea78f29 982e420e
......@@ -12,7 +12,6 @@ import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
......@@ -27,9 +26,9 @@ import org.jsoup.select.Elements;
import static java.util.Objects.nonNull;
/**
* @author hero
* @ClassName: ZhihuHotCrawler
* @Description: 知乎热搜采集程序
* @author hero
* @date 2017年9月15日 上午10:54:31
*/
@Log4j2
......@@ -81,18 +80,17 @@ public class ZhihuHotSearchCrawler {
// }
/**
* @param @return 设定文件
* @return List<ZhihuHotSearch> 返回类型
* @Title: getMobileZhihuHotList
* @author hero
* @Description: 移動端知乎熱搜榜
* @param @return 设定文件
* @return List<ZhihuHotSearch> 返回类型
*/
public static List<HotSearchList> getMobileZhihuHotList(Date date){
public static List<HotSearchList> getMobileZhihuHotList(Date date) {
List<HotSearchList> list = new ArrayList<>();
String url = "https://api.zhihu.com/topstory/hot-list?limit=50&reverse_order=0";
Map<String,String> headerMap = HeaderTool.getCommonHead();
Map<String, String> headerMap = HeaderTool.getCommonHead();
headerMap.put("Host", "api.zhihu.com");
headerMap.put("Referer", url);
headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36");
......@@ -100,13 +98,17 @@ public class ZhihuHotSearchCrawler {
headerMap.put("authorization", "oauth c3cef7c66a1843f8b3a9e6a1e3160e20");
String htmlBody = null;
Request request = RequestUtils.wrapGet(url, headerMap);
for (int x = 0; x <= 5; x++) {
Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
if (response.hasCause()){
if (response.hasCause()) {
Throwable cause = response.cause();
log.debug("获取知乎热搜时出现问题:{}",cause);
return list;
}else {
log.debug("获取知乎热搜时出现问题:{}", cause);
continue;
} else {
htmlBody = response.bodyString();
if (!htmlBody.contains("author")) {
continue;
}
}
try {
if (htmlBody != null && htmlBody.contains("author")) {
......@@ -140,61 +142,64 @@ public class ZhihuHotSearchCrawler {
String tog = nonNull(doc.get("tag")) ? doc.getString("tag") : null;
Long view = nonNull(doc.get("view")) ? Long.valueOf(doc.get("view").toString()) : null;
Long fans = nonNull(doc.get("fans")) ? Long.valueOf(doc.get("fans").toString()) : null;
HotSearchList zhihu = new HotSearchList(link, displayQuery, hotCount, i + 1, HotSearchType.知乎热搜.name(),date);
HotSearchList zhihu = new HotSearchList(link, displayQuery, hotCount, i + 1, HotSearchType.知乎热搜.name(), date);
zhihu.setFans(fans);
zhihu.setView(view);
zhihu.setTag(tog);
list.add(zhihu);
}
return list;
}
} catch (Exception e) {
log.info("知乎热搜解析异常",e);
log.info("知乎热搜解析异常", e);
}
}
return list;
}
/**
 * Visits the PC (desktop) page of a single hot-list entry and extracts its tags,
 * view count and follower count.
 *
 * NOTE(review): this listing is a rendered diff with the +/- markers removed, so several
 * statements below appear twice (old formatting first, reformatted version second).
 * Each pair differs only in whitespace.
 *
 * @param url address of the page to request
 * @return a document with the keys "tag", "view" and "fans". All three start as null and are
 *         only filled in when the response body contains "QuestionHeader"; if the request
 *         fails the nulls are returned, and if parsing throws part-way the values set so far
 *         are kept.
 */
private static org.bson.Document getTag(String url) {
org.bson.Document doc = new org.bson.Document();
// Pre-populate every key with null so callers always find the three keys.
doc.put("tag",null);
doc.put("tag", null);
// view count
doc.put("view",null);
doc.put("view", null);
// followers
doc.put("fans",null);
Map<String,String> Map = HeaderTool.getCommonHead();
doc.put("fans", null);
Map<String, String> Map = HeaderTool.getCommonHead();
// A fixed _xsrf cookie value is sent with the request.
Map.put("cookie", "_xsrf=7NFWM5qBcOutfs8MaW7bhQQH65t3Xia4");
Request request = RequestUtils.wrapGet(url,Map);
Request request = RequestUtils.wrapGet(url, Map);
try {
Response response = httpBoot.syncCall(request, ProxySupplier.NAT_HEAVY_PROXY);
if (response.hasCause()){
if (response.hasCause()) {
// The call reported a failure cause: log it and return the null-valued document.
Throwable cause = response.cause();
log.error("单条知乎热搜数据页面连接失败",cause);
log.error("单条知乎热搜数据页面连接失败", cause);
return doc;
}else {
} else {
String htmlBody = response.bodyString();
// Only parse bodies that contain the "QuestionHeader" marker.
if (htmlBody != null && htmlBody.contains("QuestionHeader")) {
Document document = Jsoup.parse(htmlBody);
// collect the tags
String label="";
String label = "";
Elements select = document.select("div.Tag");
for (Element element : select) {
// Each tag is appended as a backtick, the tag text, then a semicolon.
String text = "`"+element.select("div.Popover").text()+";";
label=label+text;
String text = "`" + element.select("div.Popover").text() + ";";
label = label + text;
}
doc.put("tag",label.trim());
doc.put("tag", label.trim());
// Combined text of the <strong> elements inside div.NumberBoard-itemInner, split on spaces.
// NOTE(review): assumes at least two space-separated numbers; otherwise count[1] throws
// and the catch block below logs it.
String strong = document.select("div.NumberBoard-itemInner").select("strong").text();
String[] count = strong.split(" ");
// follower count (commas are stripped before parsing)
doc.put("fans",Long.valueOf(count[0].replaceAll(",","").trim()));
doc.put("fans", Long.valueOf(count[0].replaceAll(",", "").trim()));
// view count (commas are stripped before parsing)
doc.put("view",Long.valueOf(count[1].replaceAll(",","").trim()));
doc.put("view", Long.valueOf(count[1].replaceAll(",", "").trim()));
return doc;
}else {
} else {
return doc;
}
}
} catch (Exception e) {
// Any exception is logged and the document is returned in whatever state it reached.
log.info("知乎热搜标签解析异常",e);
log.info("知乎热搜标签解析异常", e);
}
return doc;
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment