Commit a8eb686a by 马黎滨

Merge branch 'mlbWork' into 'master'

Mlb work

See merge request !1
parents fa3b5b7e 88b59f64
......@@ -7,5 +7,6 @@ public enum HotSearchType {
抖音热搜,
搜狗微信热搜,
微博话题,
今日头条热搜
今日头条热搜,
知乎热搜榜单
}
......@@ -91,6 +91,9 @@ public class BaiDuHotSearchCrawler {
} else if (!element.select("td.last").select("span.icon-rise").isEmpty()) {
hot = element.select("td.last").select("span.icon-rise").text();
}
else if (!element.select("td.last").select("span.icon-fair").isEmpty()) {
hot = element.select("td.last").select("span.icon-fair").text();
}
int count = 0;
// 判断hot是否为空
if (StringUtils.isNotBlank(hot)) {
......
package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.tools.ZhiWeiTools;
import lombok.extern.log4j.Log4j2;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.w3c.dom.Element;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
@Log4j2
public class ZhihuTopicSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
public static List<HotSearchList> getZhihuTopicSearch(){
List<HotSearchList> list = new ArrayList<>();
String url = "https://www.zhihu.com/topsearch";
JSONObject jsonObject = null;
try {
for(int t=0 ;t<3 && jsonObject== null;t++)
{
// ZhiWeiTools.sleep(10000L);
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url),
ProxyHolder.NAT_HEAVY_PROXY).body().string();
// log.info("页面内容获取:{}",htmlBody);
Document document = Jsoup.parse(htmlBody);
String html = document.getElementsByTag("script").select("#js-initialData").html();
jsonObject = JSONObject.parseObject(html);
}
if(jsonObject != null) {
JSONArray dataJson = jsonObject.getJSONObject("initialState").getJSONObject("topsearch").getJSONArray("data");
for (int i = 0; i < dataJson.size(); i++) {
Integer rank = i + 1;
JSONObject data = dataJson.getJSONObject(i);
String name = data.getString("queryDisplay");
String realQuery = data.getString("realQuery");
String zhihuUrl = "https://www.zhihu.com/search?q=" + realQuery + "&utm_content=search_hot&type=content";
HotSearchList hotSearchList = new HotSearchList(zhihuUrl, name, null, rank, HotSearchType.知乎热搜榜单.name());
list.add(hotSearchList);
}
return list;
}else{
log.error("知乎热搜榜单页面获取异常,404");
log.error(jsonObject);
}
} catch (IOException e) {
log.error("知乎热搜获取异常", e);
}
return Collections.emptyList();
}
}
......@@ -51,6 +51,6 @@ public class HotSearchRun {
new WeiboSuperTopicRun().start();
new WeiboTopicRun().start();
new ToutiaoHotSearchRun().start();
new ZhihuTopSearchRun().start();
}
}
package com.zhiwei.searchhotcrawler.timer;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.crawler.ZhihuTopicSearchCrawler;
import com.zhiwei.searchhotcrawler.dao.HotSearchCacheDAO;
import com.zhiwei.searchhotcrawler.dao.HotSearchListDAO;
import com.zhiwei.tools.tools.ZhiWeiTools;
import lombok.extern.log4j.Log4j2;
import org.bson.Document;
import java.util.Date;
import java.util.List;
import java.util.concurrent.TimeUnit;
@Log4j2
public class ZhihuTopSearchRun extends Thread {
@Override
public void run() {
boolean f = true;
while(f) {
try {
getHotList();
TimeUnit.MINUTES.sleep(1);
} catch (Exception e) {
e.fillInStackTrace();
ZhiWeiTools.sleep(60*60*1000);
}
ZhiWeiTools.sleep(50);
}
}
public void getHotList(){
HotSearchListDAO hotSearchDAO = new HotSearchListDAO();
HotSearchCacheDAO hotSearchCacheDAO = new HotSearchCacheDAO();
log.info("知乎热搜采集开始...,当前线程名字:{}", Thread.currentThread().getName());
List<HotSearchList> list = ZhihuTopicSearchCrawler.getZhihuTopicSearch();
log.info("{}, 知乎热搜此轮采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
List<Document> data = hotSearchCacheDAO.addData(list);
hotSearchDAO.addHotSearchList(data);
log.info("知乎热搜话题采集结束........");
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment