Commit b21d2070 by chenweitao

测试类及项目结构文件、爬虫核心包升级、热搜基础项目构造器更新

parent a2bf4e4f
......@@ -113,6 +113,12 @@
<artifactId>jedis</artifactId>
<version>2.8.1</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
<scope>test</scope>
</dependency>
</dependencies>
......
<?xml version="1.0" encoding="UTF-8"?>
<module org.jetbrains.idea.maven.project.MavenProjectsManager.isMavenModule="true" type="JAVA_MODULE" version="4">
<component name="FacetManager">
<facet type="Spring" name="Spring">
<configuration />
</facet>
</component>
<component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_8">
<output url="file://$MODULE_DIR$/target/classes" />
<output-test url="file://$MODULE_DIR$/target/test-classes" />
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/src/main/java" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/main/resources" type="java-resource" />
<sourceFolder url="file://$MODULE_DIR$/src/test/java" isTestSource="true" />
<excludeFolder url="file://$MODULE_DIR$/target" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
<orderEntry type="library" name="Maven: org.mongodb:mongo-java-driver:3.12.2" level="project" />
<orderEntry type="library" name="Maven: com.zhiwei:sendmail:0.0.1-SNAPSHOT" level="project" />
<orderEntry type="library" name="Maven: javax.mail:mail:1.4.7" level="project" />
<orderEntry type="library" name="Maven: javax.activation:activation:1.1" level="project" />
<orderEntry type="library" name="Maven: com.zhiwei.tools:zhiwei-tools:0.1.6-SNAPSHOT" level="project" />
<orderEntry type="library" name="Maven: com.alibaba:fastjson:1.2.58" level="project" />
<orderEntry type="library" name="Maven: de.ruedigermoeller:fst:2.57" level="project" />
<orderEntry type="library" name="Maven: com.fasterxml.jackson.core:jackson-core:2.8.8" level="project" />
<orderEntry type="library" name="Maven: org.javassist:javassist:3.21.0-GA" level="project" />
<orderEntry type="library" name="Maven: org.objenesis:objenesis:2.5.1" level="project" />
<orderEntry type="library" name="Maven: org.apache.commons:commons-lang3:3.8.1" level="project" />
<orderEntry type="library" name="Maven: com.zhiwei.crawler:crawler-core:0.6.7.4-SNAPSHOT" level="project" />
<orderEntry type="library" name="Maven: com.squareup.okhttp3:okhttp:3.14.9" level="project" />
<orderEntry type="library" name="Maven: com.squareup.okio:okio:1.17.2" level="project" />
<orderEntry type="library" name="Maven: org.jsoup:jsoup:1.13.1" level="project" />
<orderEntry type="library" name="Maven: cn.wanghaomiao:JsoupXpath:2.3.2" level="project" />
<orderEntry type="library" name="Maven: org.antlr:antlr4-runtime:4.7" level="project" />
<orderEntry type="library" name="Maven: org.apache.commons:commons-compress:1.20" level="project" />
<orderEntry type="library" name="Maven: org.brotli:dec:0.1.2" level="project" />
<orderEntry type="library" name="Maven: com.ibm.icu:icu4j:67.1" level="project" />
<orderEntry type="library" name="Maven: com.google.guava:guava:29.0-jre" level="project" />
<orderEntry type="library" name="Maven: com.google.guava:failureaccess:1.0.1" level="project" />
<orderEntry type="library" name="Maven: com.google.guava:listenablefuture:9999.0-empty-to-avoid-conflict-with-guava" level="project" />
<orderEntry type="library" name="Maven: com.google.code.findbugs:jsr305:3.0.2" level="project" />
<orderEntry type="library" name="Maven: org.checkerframework:checker-qual:2.11.1" level="project" />
<orderEntry type="library" name="Maven: com.google.errorprone:error_prone_annotations:2.3.4" level="project" />
<orderEntry type="library" name="Maven: com.google.j2objc:j2objc-annotations:1.3" level="project" />
<orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-core:2.13.3" level="project" />
<orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-api:2.13.3" level="project" />
<orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-1.2-api:2.13.3" level="project" />
<orderEntry type="library" name="Maven: org.slf4j:slf4j-log4j12:1.8.0-beta4" level="project" />
<orderEntry type="library" name="Maven: org.slf4j:slf4j-api:1.8.0-beta4" level="project" />
<orderEntry type="library" name="Maven: log4j:log4j:1.2.17" level="project" />
<orderEntry type="library" name="Maven: com.zhiwei.async:task-boot:0.0.3-SNAPSHOT" level="project" />
<orderEntry type="library" name="Maven: com.zhiwei.crawler:proxy-client:1.0.5-SNAPSHOT" level="project" />
<orderEntry type="library" name="Maven: org.apache.dubbo:dubbo:2.7.4.1" level="project" />
<orderEntry type="library" name="Maven: io.netty:netty-all:4.1.25.Final" level="project" />
<orderEntry type="library" name="Maven: com.google.code.gson:gson:2.8.5" level="project" />
<orderEntry type="library" name="Maven: org.apache.curator:curator-recipes:2.12.0" level="project" />
<orderEntry type="library" name="Maven: org.apache.curator:curator-framework:2.12.0" level="project" />
<orderEntry type="library" name="Maven: org.apache.curator:curator-client:2.12.0" level="project" />
<orderEntry type="library" name="Maven: org.apache.zookeeper:zookeeper:3.4.8" level="project" />
<orderEntry type="library" name="Maven: jline:jline:0.9.94" level="project" />
<orderEntry type="library" name="Maven: io.netty:netty:3.7.0.Final" level="project" />
<orderEntry type="library" name="Maven: com.kohlschutter.boilerpipe:boilerpipe-extractor:0.0.1-SNAPSHOT" level="project" />
<orderEntry type="library" name="Maven: org.projectlombok:lombok:1.18.8" level="project" />
<orderEntry type="library" name="Maven: org.springframework:spring-aop:4.2.2.RELEASE" level="project" />
<orderEntry type="library" name="Maven: aopalliance:aopalliance:1.0" level="project" />
<orderEntry type="library" name="Maven: org.springframework:spring-beans:4.2.2.RELEASE" level="project" />
<orderEntry type="library" name="Maven: org.springframework:spring-core:4.2.2.RELEASE" level="project" />
<orderEntry type="library" name="Maven: commons-logging:commons-logging:1.2" level="project" />
<orderEntry type="library" name="Maven: org.springframework:spring-test:4.2.2.RELEASE" level="project" />
<orderEntry type="library" name="Maven: org.springframework:spring-context:4.2.2.RELEASE" level="project" />
<orderEntry type="library" name="Maven: org.springframework:spring-expression:4.2.2.RELEASE" level="project" />
<orderEntry type="library" name="Maven: org.springframework:spring-context-support:4.2.2.RELEASE" level="project" />
<orderEntry type="library" name="Maven: org.springframework:spring-web:4.2.2.RELEASE" level="project" />
<orderEntry type="library" name="Maven: org.springframework:spring-tx:4.2.2.RELEASE" level="project" />
<orderEntry type="library" name="Maven: redis.clients:jedis:2.8.1" level="project" />
<orderEntry type="library" name="Maven: org.apache.commons:commons-pool2:2.4.2" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: junit:junit:4.12" level="project" />
<orderEntry type="library" scope="TEST" name="Maven: org.hamcrest:hamcrest-core:1.3" level="project" />
</component>
</module>
\ No newline at end of file
......@@ -102,7 +102,7 @@ public class HotSearchList implements Serializable{
public HotSearchList(){}
public HotSearchList(String url, String name, Integer count,Boolean hot,Integer rank,String type,String icon,Date date){
this.id = name + "_" + new Date().getTime() + "_" + type;
this.id = name + "_" + System.currentTimeMillis() + "_" + type;
this.url = url;
this.name = name;
this.count = count;
......@@ -116,7 +116,7 @@ public class HotSearchList implements Serializable{
public HotSearchList(String url, String name, Integer count,Integer rank,String type,Date date){
this.id = name + "_" + new Date().getTime()+ "_" + type;
this.id = name + "_" + System.currentTimeMillis()+ "_" + type;
this.url = url;
this.name = name;
this.count = count;
......@@ -129,7 +129,7 @@ public class HotSearchList implements Serializable{
public HotSearchList(String url, String name, Integer count,Integer rank,String type, Integer commentCount, String topicLead,Date date){
this.id = name + "_" + new Date().getTime()+ "_" + type;
this.id = name + "_" + System.currentTimeMillis()+ "_" + type;
this.url = url;
this.name = name;
this.count = count;
......@@ -143,7 +143,7 @@ public class HotSearchList implements Serializable{
}
public HotSearchList(String url, String name, Integer count, Boolean hot,Integer rank, String type, Date date, String icon, String topicResult){
this.id = name + "_" + new Date().getTime() + "_" + type;
this.id = name + "_" + System.currentTimeMillis() + "_" + type;
this.url = url;
this.name = name;
this.hot = hot;
......@@ -157,7 +157,7 @@ public class HotSearchList implements Serializable{
}
public HotSearchList(String url, String name, String topicLead, Integer count, Boolean hot, Date time, Integer rank, String type, Integer view, Integer barrage, String pictureUrl) {
this.id = name + "_" + new Date().getTime()+ "_" + type;
this.id = name + "_" + System.currentTimeMillis()+ "_" + type;
this.url = url;
this.name = name;
this.topicLead = topicLead;
......
package com.zhiwei.searchhotcrawler.run;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.core.proxy.ProxyFactory;
import com.zhiwei.proxy.config.SimpleConfig;
import com.zhiwei.searchhotcrawler.cache.CacheListener;
import com.zhiwei.searchhotcrawler.config.ProxyConfig;
......
package com.zhiwei.searchhotcrawler.test;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.mongodb.client.MongoDatabase;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyFactory;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.proxy.config.SimpleConfig;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.config.DBConfig;
import com.zhiwei.searchhotcrawler.config.ProxyConfig;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBLocalTemplate;
import com.zhiwei.searchhotcrawler.util.HttpClientUtils;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@Log4j2
public class Job51Test {
public static void main(String[] args) {
// ApplicationContext context = new ClassPathXmlApplicationContext("applicationContext.xml");
SimpleConfig simpleConfig = SimpleConfig.builder().registry(ProxyConfig.registry)
.group(ProxyConfig.group).appId(10000013).appName("hotsearch").build();
ProxyFactory.init(simpleConfig);
HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
// MongoDatabase mongoDBLocal = MongoDBLocalTemplate.getDB(DBConfig.dbName);
List<HotSearchList> list = new ArrayList<>();
String url = "https://search.51job.com/list/080300,000000,0000,00,9,99,java,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=";
Map<String,Object> header = new HashMap<>();
header.put("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
header.put("Accept-Encoding","gzip, deflate, br");
header.put("Accept-Language","zh-CN,zh;q=0.9");
header.put("Cache-Control","max-age=0");
header.put("Connection","keep-alive");
header.put("Cookie","guid=1925f996c7ae446cdf1f579f113bff6e; _ujz=MTg3NDg4MTM4MA%3D%3D; ps=needv%3D0; slife=lowbrowser%3Dnot%26%7C%26lastlogindate%3D20210318%26%7C%26securetime%3DBztcaVQzWTsEZlJrWmJdPwQ2Ajw%253D; track=registertype%3D1; 51job=cuid%3D187488138%26%7C%26cusername%3Dphone_15757871020_202103189219%26%7C%26cpassword%3D%26%7C%26cname%3D%25B3%25C2%25EC%25BF%25CC%25CE%26%7C%26cemail%3D15757871020%2540163.com%26%7C%26cemailstatus%3D0%26%7C%26cnickname%3D%26%7C%26ccry%3D.0b4qUteozwmg%26%7C%26cconfirmkey%3D%25241%2524UXfAYBHG%2524Hni.5zaFu5kr7BN.eVcOU%252F%26%7C%26cautologin%3D1%26%7C%26cenglish%3D0%26%7C%26sex%3D0%26%7C%26cnamekey%3D%25241%2524CN04lL8j%2524kCHAFcf4TNh%252F2odmIqujW1%26%7C%26to%3D8019a57bb26817913b5f3c2080ba5792605354bf%26%7C%26; nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D; search=jobarea%7E%60080300%7C%21ord_field%7E%600%7C%21recentSearch0%7E%60080300%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FAjava%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch1%7E%60080300%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21");
header.put("Host","search.51job.com");
header.put("Referer","https://search.51job.com/list/080300,000000,0000,00,9,99,%2B,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=");
header.put("sec-ch-ua","\"Google Chrome\";v=\"89\", \"Chromium\";v=\"89\", \";Not A Brand\";v=\"99\"");
header.put("Sec-Fetch-Dest","document");
header.put("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36");
// header.put("","");
JSONObject jsonObject = null;
String htmlBody = null;
Request request = RequestUtils.wrapGet(url,header);
for (int t = 0; t < 1 && jsonObject == null; t++) {
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
} catch (IOException e) {
log.error("知乎热搜页面连接异常", e);
}
if (htmlBody != null) {
Document document = Jsoup.parse(htmlBody);
log.info("document:{}",document);
log.info("======================");
String html = document.getElementsByClass("j_joblist").first().html();
log.info("html:{}",html);
jsonObject = JSONObject.parseObject(html);
if (jsonObject != null) {
// JSONArray dataJson = jsonObject.getJSONObject("initialState").getJSONObject("topsearch").getJSONArray("data");
// for (int i = 0; i < dataJson.size(); i++) {
// Integer rank = i + 1;
// JSONObject data = dataJson.getJSONObject(i);
// String name = data.getString("queryDisplay");
// String realQuery = data.getString("realQuery");
// String zhihuUrl = "https://www.zhihu.com/search?q=" + realQuery + "&utm_content=search_hot&type=content";
//
// }
}
} else {
log.error("临时爬取出问题");
}
}
}
}
#redis.host=127.0.0.1
#redis.port=6379
#redis.host=115.236.59.91
#redis.port=7382
#redis.password=
# Redis connection used previously (disabled)
#redis.host = 192.168.0.39
#redis.port = 7382
#redis.database = 3
# Redis connection currently in use
redis.host = 192.168.0.39
redis.port = 6379
redis.database = 1
#maxIdle
redis.maxIdle=20
#minIdle
......
/**
* ***************************************************
* Copyright (C), NingBo ZhiWeiReach info. Co., Ltd. *
*****************************************************
* 类的详细说明
*
* @author 东临碣石
* @Date 2016年1月16日
* @version 1.00
*/
import org.junit.runner.RunWith;
import org.springframework.test.context.ContextConfiguration;
import org.springframework.test.context.junit4.AbstractJUnit4SpringContextTests;
import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;
/**
 * Parent class for Spring-based tests; it loads the base application context configuration.
 *
 * @date 2016-01-16 11:40:14
 */
@RunWith(SpringJUnit4ClassRunner.class)
@ContextConfiguration(locations = "classpath:applicationContext.xml")
public abstract class ObjectTest extends AbstractJUnit4SpringContextTests {
}
package weiboTest;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.searchhotcrawler.config.RedisConfig;
import com.zhiwei.searchhotcrawler.util.TipsUtils;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.test.context.ContextConfiguration;
import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;
import java.io.IOException;
import java.util.*;
/**
* @author cwt
* @date 2021/5/26 10:35
*/
@Log4j2
@RunWith(SpringJUnit4ClassRunner.class)
@ContextConfiguration(locations =
{ "classpath:applicationContext.xml" })
public class WeiboHotSearchTest{
/** HTTP client shared by all requests in this test class, built with {@code retryTimes(3)}; never reassigned. */
private static final HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/**
 * Parses a captured Weibo post fragment with Jsoup and prints its plain text to standard output.
 * The fragment is reproduced verbatim, including its first tag, which lacks the opening '&lt;'.
 */
@Test
public void test() {
    String fragment = "a href=\"https://m.weibo.cn/search?containerid=231522type%3D1%26t%3D10%26q%3D%23%E9%82%93%E4%BC%A6%E8%AE%B2%E6%88%8F%E4%B8%93%E4%B8%9A%23&extparam=%23%E9%82%93%E4%BC%A6%E8%AE%B2%E6%88%8F%E4%B8%93%E4%B8%9A%23&luicode=10000011&lfid=100103type%3D1%26t%3D10%26q%3D%23%E9%82%93%E4%BC%A6%E8%AE%B2%E6%88%8F%E4%B8%93%E4%B8%9A%23\" data-hide=\"\"><span class=\"surl-text\">#邓伦讲戏专业#</span></a><a href=\"https://m.weibo.cn/search?containerid=231522type%3D1%26t%3D10%26q%3D%23%E6%9E%81%E9%99%90%E6%8C%91%E6%88%98%23&luicode=10000011&lfid=100103type%3D1%26t%3D10%26q%3D%23%E9%82%93%E4%BC%A6%E8%AE%B2%E6%88%8F%E4%B8%93%E4%B8%9A%23\" data-hide=\"\"><span class=\"surl-text\">#极限挑战#</span></a> <a href='/n/邓伦'>@邓伦</a> 和<a href='/n/景甜'>@景甜</a> 改编《甄嬛传》剧本,伦伦认真讲戏的样子让人瞬间穿越到拍摄现场。看来戏瘾上身的邓伦还过了一把导演的瘾,这专业的模样要不要考虑跨界当当导演呀~<span class=\"url-icon\"><img alt=[哈哈] src=\"https://h5.sinaimg.cn/m/emoticon/icon/default/d_haha-0ec05e6dad.png\" style=\"width:1em; height:1em;\" /></span><a data-url=\"http://t.cn/A6VJPN9w\" href=\"https://video.weibo.com/show?fid=1034:4640837901156490\" data-hide=\"\"><span class='url-icon'><img style='width: 1rem;height: 1rem' src='https://h5.sinaimg.cn/upload/2015/09/25/3/timeline_card_small_video_default.png'></span><span class=\"surl-text\">东方卫视极限挑战的微博视频</span></a>";
    Document parsed = Jsoup.parse(fragment);
    String plainText = parsed.text();
    System.out.println(plainText);
}
/**
 * Runs the mobile hot-search crawl once and logs what came back, so a run can be inspected.
 * The original iterated over the result with an empty loop body and reported nothing.
 */
@Test
public void testHotWeibo() {
    Date date = new Date();
    List<HotSearchList> hotSearchLists = weiboHotSearchByPhone(date);
    log.info("weiboHotSearchByPhone returned {} entries", hotSearchLists.size());
    for (HotSearchList hotSearchList : hotSearchLists) {
        log.info("{}", hotSearchList);
    }
}
/**
 * Refreshes the lead text, read count and discussion count of one Weibo hot-search record.
 *
 * <p>The request URL is the m.weibo.cn container API plus the first query parameter of the
 * record's {@code url} field. The request is attempted up to six times. The first response that
 * contains a {@code data} object is used to fill {@code topicLead}, {@code readCount},
 * {@code discussCount}, {@code pictureUrl} and {@code downtext} where the response provides them.
 *
 * @param document hot-search record holding {@code name} and {@code url}; updated in place
 * @return the same {@code document} after a response with a {@code data} object was processed,
 *         or {@code null} when {@code url} is missing or no attempt produced usable data
 */
public static org.bson.Document weiboUpdate(org.bson.Document document) {
    log.info("更新微博热搜{}导语阅读量和讨论量",document.getString("name"));
    String sourceUrl = document.getString("url");
    if (sourceUrl == null) {
        return null;
    }
    // Keep only the first query parameter. A URL without '&' used to make substring() throw;
    // in that case the parameter now runs to the end of the string.
    int queryStart = sourceUrl.indexOf('?') + 1;
    int queryEnd = sourceUrl.indexOf('&', queryStart);
    if (queryEnd < 0) {
        queryEnd = sourceUrl.length();
    }
    String url = "https://m.weibo.cn/api/container/getIndex?" + sourceUrl.substring(queryStart, queryEnd);
    Request request = RequestUtils.wrapGet(url);
    for (int count = 0; count <= 5; count++) {
        String htmlBody = null;
        try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
            htmlBody = response.body().string();
        } catch (IOException e) {
            log.error("解析微博热搜详情页面时出现连接失败", e);
        }
        if (htmlBody == null || !htmlBody.contains("data")) {
            continue;
        }
        JSONObject dataJson;
        try {
            JSONObject root = JSONObject.parseObject(htmlBody);
            dataJson = root == null ? null : root.getJSONObject("data");
        } catch (RuntimeException e) {
            // A body that merely contains the text "data" need not be valid JSON; retry instead of aborting.
            log.error("解析微博热搜详情页面时出现解析错误", e);
            continue;
        }
        if (dataJson == null) {
            continue;
        }
        JSONObject cardlistInfoJson = dataJson.getJSONObject("cardlistInfo");
        if (cardlistInfoJson != null) {
            copyCardlistInfo(document, cardlistInfoJson);
        }
        // TODO: parse dataJson "cards" (hot posts, people); the original only had an empty loop here.
        return document;
    }
    return null;
}

/** Copies lead text, counters, picture URL and host text from {@code cardlistInfo} into the record. */
private static void copyCardlistInfo(org.bson.Document document, JSONObject cardlistInfoJson) {
    String topicLead = cardlistInfoJson.getString("desc");
    if (topicLead != null && !topicLead.isEmpty()) {
        document.put("topicLead", topicLead);
    }
    JSONArray headCards = cardlistInfoJson.getJSONArray("cardlist_head_cards");
    if (headCards == null || headCards.isEmpty()) {
        return;
    }
    JSONObject readJson = headCards.getJSONObject(0);
    JSONObject headData = readJson == null ? null : readJson.getJSONObject("head_data");
    if (headData == null) {
        return;
    }
    String midText = headData.getString("midtext");
    if (midText != null) {
        // The regexes split midtext around the markers "阅读", "讨论" and "详情".
        String read = midText.replaceAll("阅读", "").replaceAll("讨论.*", "").trim();
        String discussCount = midText.replaceAll(".*讨论", "").replaceAll("详情.*", "").trim();
        document.put("readCount", TipsUtils.getHotCount(read));
        document.put("discussCount", TipsUtils.getHotCount(discussCount));
    }
    document.put("pictureUrl", headData.getString("portrait_url"));
    String downtext = headData.getString("downtext");
    if (downtext != null && !downtext.isEmpty()) {
        document.put("downtext", downtext.replaceAll("主持人:", ""));
    }
}
/**
 * Stub: ignores its argument and always returns {@code null}.
 * NOTE(review): presumably intended to parse a nested Weibo card; confirm before relying on it.
 *
 * @param readJson JSON object to analyse (currently unused)
 * @return always {@code null}
 */
public JSONObject analysisWeiboSon(JSONObject readJson) {
    return null;
}
/**
 * Collects the Weibo hot-search list from the mobile (m.weibo.cn) container API.
 *
 * <p>The request is attempted up to six times; the first attempt whose response parses
 * successfully ends the method. Only the {@code card_group} of the first card is read. Ranks
 * start at 0 when the first entry has a {@code pic} key and at 1 otherwise.
 * NOTE(review): {@code pic} presumably marks a pinned entry; confirm.
 *
 * @author hero
 * @param date timestamp passed to every created {@link HotSearchList}
 * @return the parsed entries, or an empty list when every attempt failed
 */
public static List<HotSearchList> weiboHotSearchByPhone(Date date) {
    String url = "https://m.weibo.cn/api/container/getIndex?containerid=106003type%3D25%26t%3D3%26disable_hot%3D1%26filter_type%3Drealtimehot&title=%E5%BE%AE%E5%8D%9A%E7%83%AD%E6%90%9C&extparam=pos%3D0_0%26mi_cid%3D100103%26cate%3D10103%26filter_type%3Drealtimehot%26c_type%3D30&luicode=10000011&lfid=231583";
    Map<String,String> headerMap = new HashMap<>();
    headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36");
    Request request = RequestUtils.wrapGet(url, headerMap);
    for (int count = 0; count <= 5; count++) {
        // Reset per attempt so a failed request never re-parses the previous attempt's body.
        String htmlBody = null;
        try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
            htmlBody = response.body().string();
        } catch (IOException e) {
            log.error("解析微博时热搜时出现连接失败",e);
        }
        if (StringUtils.isBlank(htmlBody) || !htmlBody.contains("cards")) {
            log.info("解析微博时热搜时出现解析错误,页面结构有问题");
            continue;
        }
        JSONArray cards;
        try {
            cards = JSONObject.parseObject(htmlBody).getJSONObject("data").getJSONArray("cards");
        } catch (Exception e) {
            log.error("解析微博时热搜时出现解析错误,数据不是json结构", e);
            continue;
        }
        try {
            JSONObject card = cards.getJSONObject(0);
            JSONArray cardGroup = card.getJSONArray("card_group");
            // Check before touching element 0. The original read element 0 first, so this
            // diagnostic branch could never run. The attempt is still retried, as before.
            if (Objects.isNull(cardGroup) || cardGroup.isEmpty()) {
                log.info("card 数据结构为:{}", card);
                continue;
            }
            JSONObject topCard = cardGroup.getJSONObject(0);
            int rank = topCard.containsKey("pic") ? 0 : 1;
            boolean hot = true;
            List<HotSearchList> result = new ArrayList<HotSearchList>();
            for (int j = 0; j < cardGroup.size(); j++) {
                JSONObject cardInfo = cardGroup.getJSONObject(j);
                String name = cardInfo.getString("desc");
                int hotCount = cardInfo.getIntValue("desc_extr");
                String icon = cardInfo.getString("icon");
                if (StringUtils.isNotBlank(icon)) {
                    icon = extractIconName(icon);
                }
                String id = cardInfo.getString("scheme");
                result.add(new HotSearchList(id, name, hotCount, hot, rank, HotSearchType.微博热搜.name(), icon, date));
                rank++;
            }
            return result;
        } catch (Exception e) {
            log.error("解析微博时热搜时出现解析错误", e);
        }
    }
    return Collections.emptyList();
}

/**
 * Returns the part of {@code icon} between its first '_' and the following literal ".png".
 * An icon without '_' is returned unchanged rather than failing the whole crawl, and ".png" is
 * matched literally (the original used it as a regex, where '.' matches any character).
 */
private static String extractIconName(String icon) {
    String[] segments = icon.split("_");
    if (segments.length < 2) {
        return icon;
    }
    String segment = segments[1];
    int suffixStart = segment.indexOf(".png");
    return suffixStart >= 0 ? segment.substring(0, suffixStart) : segment;
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment