Commit 355bdd18 by 马黎滨

Merge branch 'mlbWork' into 'master'

今日头条采集修改

See merge request !2
parents a8eb686a 6941070e
...@@ -3,12 +3,15 @@ package com.zhiwei.searchhotcrawler.crawler; ...@@ -3,12 +3,15 @@ package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils; import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.HotSearchList; import com.zhiwei.searchhotcrawler.bean.HotSearchList;
import com.zhiwei.searchhotcrawler.bean.HotSearchType; import com.zhiwei.searchhotcrawler.bean.HotSearchType;
import com.zhiwei.tools.tools.URLCodeUtil; import com.zhiwei.tools.tools.URLCodeUtil;
import lombok.extern.log4j.Log4j2; import lombok.extern.log4j.Log4j2;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.IOException; import java.io.IOException;
import java.util.*; import java.util.*;
...@@ -32,9 +35,20 @@ public class ToutiaoHotSearchCrawler { ...@@ -32,9 +35,20 @@ public class ToutiaoHotSearchCrawler {
* @return void 返回类型 * @return void 返回类型
*/ */
public static List<HotSearchList> toutiaoHotSearchByPhone(){ public static List<HotSearchList> toutiaoHotSearchByPhone(){
String origin = "hot_board";
String jsUrl = "https://s3.pstatp.com/toutiao/feoffline/hot_list/resource/hot_list/js/index.45f50250.chunk.js";
try {
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(jsUrl)).body().string();
if(htmlBody.contains("origin")){
String s = htmlBody.substring(htmlBody.indexOf("origin:")+"origin:".length());
origin = s.substring(1,s.indexOf("}")-1);
}
} catch (IOException e) {
log.error("获取今日头条实时热搜头部信息标识失败",e);
}
for(int count =0; count<=5; count++){ for(int count =0; count<=5; count++){
String url = "https://ib.snssdk.com/api/suggest_words/?business_id=10017"; String url = "https://i.snssdk.com/hot-event/hot-board/?origin="+origin;
Map<String,String> headerMap = new HashMap<>(); Map<String,String> headerMap = new HashMap<>();
headerMap.put("User-Agent", "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1"); headerMap.put("User-Agent", "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1");
headerMap.put("referer","https://ib.snssdk.com/rogue/aladdin_landingpage/template/aladdin_landingpage/hot_words.html?isBrowser=true&traffic_source="); headerMap.put("referer","https://ib.snssdk.com/rogue/aladdin_landingpage/template/aladdin_landingpage/hot_words.html?isBrowser=true&traffic_source=");
...@@ -42,17 +56,17 @@ public class ToutiaoHotSearchCrawler { ...@@ -42,17 +56,17 @@ public class ToutiaoHotSearchCrawler {
try { try {
List<HotSearchList> result = new ArrayList<HotSearchList>(); List<HotSearchList> result = new ArrayList<HotSearchList>();
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap)).body().string(); htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap)).body().string();
if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("words")){ if(StringUtils.isNotBlank(htmlBody) && htmlBody.contains("data")){
try { try {
JSONArray words = JSONObject.parseObject(htmlBody).getJSONArray("data").getJSONObject(0).getJSONArray("words"); JSONArray words = JSONObject.parseObject(htmlBody).getJSONArray("data");
int rank = 1; int rank = 1;
for(int i=0;i<words.size();i++){ for(int i=0;i<words.size();i++){
try { try {
JSONObject word = words.getJSONObject(i); JSONObject word = words.getJSONObject(i);
String name = word.getString("word"); String name = word.getString("Title");
String link = "https://ib.snssdk.com/search/?keyword="+ URLCodeUtil.getURLEncode(name, "utf-8") +"&pd=synthesis&source=trending_list&traffic_source="; String link = "https://ib.snssdk.com/search/?keyword="+ URLCodeUtil.getURLEncode(name, "utf-8") +"&pd=synthesis&source=trending_list&traffic_source=";
Integer hotCount = word.getJSONObject("params").getInteger("fake_click_cnt"); Integer hotCount = word.getInteger("HotValue");
Integer wordsType = word.getInteger("words_type"); String wordsType = word.getString("Label");
String icon = getIcon(wordsType); String icon = getIcon(wordsType);
HotSearchList hotSearch = new HotSearchList(link, name, hotCount, true, rank, HotSearchType.今日头条热搜.name(), icon); HotSearchList hotSearch = new HotSearchList(link, name, hotCount, true, rank, HotSearchType.今日头条热搜.name(), icon);
...@@ -83,17 +97,17 @@ public class ToutiaoHotSearchCrawler { ...@@ -83,17 +97,17 @@ public class ToutiaoHotSearchCrawler {
* @param wordsType * @param wordsType
* @return * @return
*/ */
private static String getIcon(Integer wordsType){ private static String getIcon(String wordsType){
String icon = "无"; String icon = "无";
if(Objects.nonNull(wordsType)){ if(Objects.nonNull(wordsType)){
switch (wordsType){ switch (wordsType){
case 1: case "new":
icon = "新"; icon = "新";
break; break;
case 2: case "hot":
icon = "热"; icon = "热";
break; break;
case 3: case "explode":
icon = "爆"; icon = "爆";
break; break;
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment