Commit de854d36 by leiliangliang

重启知乎标签采集程序

parent f23f4c31
......@@ -132,14 +132,14 @@ public class ZhihuHotSearchCrawler {
} catch (Exception e) {
e.printStackTrace();
}
//org.bson.Document doc = getTag(link);
// String tog = nonNull(doc.get("tag")) ? doc.getString("tag") : null;
// Long view = nonNull(doc.get("view")) ? Long.valueOf(doc.get("view").toString()) : null;
// Long fans = nonNull(doc.get("fans")) ? Long.valueOf(doc.get("fans").toString()) : null;
org.bson.Document doc = getTag(link);
String tog = nonNull(doc.get("tag")) ? doc.getString("tag") : null;
Long view = nonNull(doc.get("view")) ? Long.valueOf(doc.get("view").toString()) : null;
Long fans = nonNull(doc.get("fans")) ? Long.valueOf(doc.get("fans").toString()) : null;
HotSearchList zhihu = new HotSearchList(link, displayQuery, hotCount, i + 1, HotSearchType.知乎热搜.name(),date);
// zhihu.setFans(fans);
// zhihu.setView(view);
// zhihu.setTag(tog);
zhihu.setFans(fans);
zhihu.setView(view);
zhihu.setTag(tog);
list.add(zhihu);
}
}
......@@ -165,8 +165,12 @@ public class ZhihuHotSearchCrawler {
if (htmlBody != null && htmlBody.contains("QuestionHeader")) {
Document document = Jsoup.parse(htmlBody);
//获取标签
String content = "`"+document.select("div.Tag").text()+";";
String label = content.replaceAll(" ", ";`");
String label="";
Elements select = document.select("div.Tag");
for (Element element : select) {
String text = "`"+element.select("div.Popover").text()+";";
label=label+text;
}
doc.put("tag",label.trim());
String strong = document.select("div.NumberBoard-itemInner").select("strong").text();
String[] count = strong.split(" ");
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment