Commit 64992203 by chenweitao

Merge branch 'working' into 'master'

百度热搜的更新

See merge request !96
parents c4c91711 d544547c
...@@ -53,6 +53,9 @@ public class HotSearchCacheDAO { ...@@ -53,6 +53,9 @@ public class HotSearchCacheDAO {
if("虎嗅热文推荐".equals(hotSearch.getType())){ if("虎嗅热文推荐".equals(hotSearch.getType())){
document.put("comment_count", hotSearch.getCommentCount()); document.put("comment_count", hotSearch.getCommentCount());
} }
if("百度热搜".equals(hotSearch.getType())){
document.put("topic_lead", hotSearch.getTopicLead());
}
if("腾讯较真榜".equals(hotSearch.getType())){ if("腾讯较真榜".equals(hotSearch.getType())){
document.put("topic_result",hotSearch.getTopicResult()); document.put("topic_result",hotSearch.getTopicResult());
...@@ -64,6 +67,9 @@ public class HotSearchCacheDAO { ...@@ -64,6 +67,9 @@ public class HotSearchCacheDAO {
document.put("pictureUrl",hotSearch.getPictureUrl()); document.put("pictureUrl",hotSearch.getPictureUrl());
} }
addAndUpdateData(document); addAndUpdateData(document);
if("百度热搜".equals(hotSearch.getType())){
document.remove("topic_lead");
}
dataes.add(document); dataes.add(document);
}); });
return dataes; return dataes;
......
package hotSaerchTest; package hotSaerchTest;
import com.alibaba.fastjson.JSONObject;
import com.mongodb.client.MongoCollection;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyFactory; import com.zhiwei.crawler.core.proxy.ProxyFactory;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.proxy.config.SimpleConfig; import com.zhiwei.proxy.config.SimpleConfig;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.config.DBConfig;
import com.zhiwei.searchhotcrawler.config.ProxyConfig; import com.zhiwei.searchhotcrawler.config.ProxyConfig;
import com.zhiwei.searchhotcrawler.crawler.BaiDuHotSearchCrawler;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
import com.zhiwei.searchhotcrawler.test.KuaiShouHotSearchCrawlerTest; import com.zhiwei.searchhotcrawler.test.KuaiShouHotSearchCrawlerTest;
import com.zhiwei.searchhotcrawler.test.TaoBaoHotSearchCrawlerTest;
import com.zhiwei.searchhotcrawler.util.TipsUtils;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import org.bson.Document;
import org.junit.Test; import org.junit.Test;
import org.junit.runner.RunWith; import org.junit.runner.RunWith;
import org.springframework.test.context.ContextConfiguration; import org.springframework.test.context.ContextConfiguration;
import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;
import java.io.IOException;
import java.util.Date; import java.util.Date;
import java.util.List; import java.util.List;
import static com.ibm.icu.util.LocalePriorityList.add;
import static java.util.Objects.nonNull;
/** /**
* @author ll * @author ll
* @date 2021/6/10 6:30 * @date 2021/6/10 6:30
...@@ -24,6 +41,7 @@ import java.util.List; ...@@ -24,6 +41,7 @@ import java.util.List;
{"classpath:applicationContext.xml"}) {"classpath:applicationContext.xml"})
public class HotSearchTest { public class HotSearchTest {
/** Shared HTTP client for the tests in this class, built with {@code retryTimes(3)}. Never reassigned, hence final. */
private static final HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/** /**
* 测试快手热榜采集 * 测试快手热榜采集
...@@ -40,4 +58,106 @@ public class HotSearchTest { ...@@ -40,4 +58,106 @@ public class HotSearchTest {
System.out.println(hotSearchLists.size()); System.out.println(hotSearchLists.size());
} }
/**
 * Fetches a single Weibo topic page through the mobile container API, copies the topic lead,
 * read/discussion counts, picture URL and host text into a {@link Document}, hands that
 * document to {@link #ad(Document)} and prints it.
 *
 * <p>Every level of the response JSON is treated as optional: a missing or null
 * {@code data}, {@code cardlistInfo}, {@code cardlist_head_cards}, {@code head_data},
 * {@code midtext} or {@code downtext} causes the corresponding fields to be skipped instead
 * of raising a {@link NullPointerException} or {@link IndexOutOfBoundsException}.
 * As before, {@code ad} is still invoked when nothing could be extracted.
 */
@Test
public void WeiBoUpdate() {
    SimpleConfig simpleConfig = SimpleConfig.builder().registry(ProxyConfig.registry)
            .group(ProxyConfig.group).appId(10000013).appName("hotsearch").build();
    ProxyFactory.init(simpleConfig);
    Document document = new Document();
    // Alternative query kept for manual runs (container id 100103):
    //String url = "https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D1%26t%3D10%26q%3D%23我国新冠疫苗接种剂次超9亿%23";
    String url = "https://m.weibo.cn/api/container/getIndex?containerid=231522type%3D1%26q%3D%23可口可乐回应C罗拒绝与可乐同框%23";
    String htmlBody = null;
    Request request = RequestUtils.wrapGet(url);
    try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
        // OkHttp declares body() as nullable, so guard before reading it.
        if (response != null && response.body() != null) {
            htmlBody = response.body().string();
        }
    } catch (IOException e) {
        log.error("解析微博热搜详情页面时出现连接失败", e);
    }
    if (htmlBody != null && htmlBody.contains("data")) {
        // The substring check above does not prove that the JSON has a "data" object,
        // so each nesting level is null-checked on the way down.
        JSONObject root = JSONObject.parseObject(htmlBody);
        JSONObject data = root == null ? null : root.getJSONObject("data");
        JSONObject json = data == null ? null : data.getJSONObject("cardlistInfo");
        if (json != null) {
            String topicLead = json.getString("desc");
            if (topicLead != null && !topicLead.isEmpty()) {
                document.put("topicLead", topicLead);
            }
            // JSONArray is a List, which lets us test for null/empty without a new import.
            List<?> headCards = json.getJSONArray("cardlist_head_cards");
            if (headCards != null && !headCards.isEmpty()) {
                JSONObject readJson = json.getJSONArray("cardlist_head_cards").getJSONObject(0);
                JSONObject headData = readJson == null ? null : readJson.getJSONObject("head_data");
                if (headData != null) {
                    String midText = headData.getString("midtext");
                    if (midText != null) {
                        // Read count: drop the "阅读" label and everything from "讨论" onwards.
                        String read = midText.replace("阅读", "").replaceAll("讨论.*", "").trim();
                        // Discussion count: the text between "讨论" and "详情".
                        String discussCount = midText.replaceAll(".*讨论", "").replaceAll("详情.*", "").trim();
                        document.put("readCount", TipsUtils.getHotCount(read));
                        document.put("discussCount", TipsUtils.getHotCount(discussCount));
                    }
                    document.put("pictureUrl", headData.getString("portrait_url"));
                    String downtext = headData.getString("downtext");
                    if (downtext != null && !downtext.isEmpty()) {
                        // Strip the "主持人:" (host) label, keeping only the name.
                        document.put("downtext", downtext.replace("主持人:", ""));
                    }
                }
            }
        }
    }
    ad(document);
    System.out.println(document);
}
/**
 * Normalises the fields extracted from a Weibo topic page and inserts the document into the
 * collection named by {@code DBConfig.searchCacheCollName} in database {@code DBConfig.dbName}.
 *
 * <p>{@code readCount} and {@code discussCount} are each converted to {@link Long}
 * independently (a null value stays null), so a document that carries only one of them is
 * still normalised. The string fields are re-read through {@link Document#getString(Object)},
 * which throws {@link ClassCastException} if a value is not a {@link String}.
 *
 * @param nowDoc document to normalise in place and insert
 * @throws NumberFormatException if a present count is not a plain decimal integer
 */
private void ad(Document nowDoc) {
    MongoCollection<Document> collection = MongoDBTemplate.getCollection(DBConfig.dbName, DBConfig.searchCacheCollName);
    for (String countKey : new String[] {"readCount", "discussCount"}) {
        if (nowDoc.containsKey(countKey)) {
            Object raw = nowDoc.get(countKey);
            nowDoc.put(countKey, raw != null ? Long.valueOf(raw.toString()) : null);
        }
    }
    for (String textKey : new String[] {"topicLead", "pictureUrl", "downtext"}) {
        if (nowDoc.containsKey(textKey)) {
            // Value is unchanged; getString only enforces that it is a String.
            nowDoc.put(textKey, nowDoc.getString(textKey));
        }
    }
    collection.insertOne(nowDoc);
}
/**
 * Runs the Taobao hot-search crawler once and prints the returned list followed by its size.
 */
@Test
public void taoBaoTestCrawler() {
    // Initialise ProxyFactory with the "hotsearch" application settings before crawling.
    ProxyFactory.init(
            SimpleConfig.builder()
                    .registry(ProxyConfig.registry)
                    .group(ProxyConfig.group)
                    .appId(10000013)
                    .appName("hotsearch")
                    .build());
    Date crawlTime = new Date();
    List<HotSearchList> crawled = TaoBaoHotSearchCrawlerTest.taoBaoHotSearch(crawlTime);
    System.out.println(crawled);
    System.out.println(crawled.size());
}
/**
 * Runs the Baidu hot-search crawler once and prints the returned list followed by its size.
 */
@Test
public void baiDuTestCrawler() {
    // Initialise ProxyFactory with the "hotsearch" application settings before crawling.
    ProxyFactory.init(
            SimpleConfig.builder()
                    .registry(ProxyConfig.registry)
                    .group(ProxyConfig.group)
                    .appId(10000013)
                    .appName("hotsearch")
                    .build());
    Date crawlTime = new Date();
    List<HotSearchList> crawled = BaiDuHotSearchCrawler.baiduHotSearch(crawlTime);
    System.out.println(crawled);
    System.out.println(crawled.size());
}
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment