Commit 007cfcb4 by leiliangliang

微博搜索框数据采集功能上线

parent ee4922e1
package com.zhiwei.searchhotcrawler.bean;
/**
* @ClassName: WeiBoSearch
* @Description: Weibo search-box entry (one suggested search term)
* @author ll
* @date 2021-05-27 15:26:11
*/
import lombok.Data;
import lombok.ToString;
import java.io.Serializable;
import java.util.Date;
/**
 * A single Weibo search-box record.
 *
 * <p>Accessors, {@code equals}/{@code hashCode} and {@code toString} are generated by Lombok.
 */
@Data
@ToString
public class WeiBoSearch {

    /** Suffix appended to the name when the primary key is derived in the constructor. */
    private static final String ID_SUFFIX = "_大家正在搜";

    /** Primary key; derived as {@code name + "_大家正在搜"} by the full constructor. */
    private String id;

    /** Topic title. */
    private String name;

    /** Category label of the record. */
    private String type;

    /** Raw {@code ext} value supplied by the caller. */
    private String ext;

    /** Raw {@code word} value supplied by the caller. */
    private String word;

    /** Timestamp associated with the record. */
    private Date time;

    /** Creates an empty record; every field stays {@code null}. */
    public WeiBoSearch() {
    }

    /**
     * Creates a fully populated record and derives its primary key from the name.
     *
     * @param name topic title, also used as the prefix of the id
     * @param ext  raw {@code ext} value
     * @param word raw {@code word} value
     * @param type category label
     * @param time timestamp for the record
     */
    public WeiBoSearch(String name, String ext, String word, String type, Date time) {
        this.name = name;
        this.type = type;
        this.ext = ext;
        this.word = word;
        this.time = time;
        this.id = name + ID_SUFFIX;
    }
}
...@@ -21,6 +21,7 @@ public class DBConfig { ...@@ -21,6 +21,7 @@ public class DBConfig {
collWechatUserName = conf.getProperty("collWechatUserName"); collWechatUserName = conf.getProperty("collWechatUserName");
weiBoMassageCollName = conf.getProperty("weiBoMassageCollName"); weiBoMassageCollName = conf.getProperty("weiBoMassageCollName");
weiBoUserCollName = conf.getProperty("weiBoUserCollName"); weiBoUserCollName = conf.getProperty("weiBoUserCollName");
weiBoSearchCollName = conf.getProperty("weiBoSearchCollName");
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.printStackTrace();
...@@ -37,4 +38,5 @@ public class DBConfig { ...@@ -37,4 +38,5 @@ public class DBConfig {
public static String collWechatUserName; public static String collWechatUserName;
public static String weiBoMassageCollName; public static String weiBoMassageCollName;
public static String weiBoUserCollName; public static String weiBoUserCollName;
public static String weiBoSearchCollName;
} }
package com.zhiwei.searchhotcrawler.crawler;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.proxy.ProxyHolder;
import com.zhiwei.crawler.core.utils.RequestUtils;
import com.zhiwei.searchhotcrawler.bean.WeiBoSearch;
import com.zhiwei.searchhotcrawler.dao.WeiBoSearchDao;
import lombok.extern.log4j.Log4j2;
import okhttp3.Request;
import okhttp3.Response;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Objects;
/**
* @author: ll
* @ClassName: weiBoSearchCrawlerTest
* @Description: 移动端微博搜索框数据采集
* @date: 2021年11月12日 上午11:35:31
* @Title: weiBoSearchCrawler
*/
@Log4j2
public class weiBoSearchCrawler {
private static HttpBoot httpBoot = new HttpBoot.Builder().throwException(false).retryTimes(3).connectTimeout(Duration.ofSeconds(60)).build();
static WeiBoSearchDao weiBoSearchDao = new WeiBoSearchDao();
public static void weiBoSearch(Date date){
String url = "https://api.weibo.cn/2/guest/cardlist?networktype=wifi&image_type=heif&launchid=10000365--x&uicode=10000512&ul_hid=dfa73128-2705-4483-bda9-063cd789e44e&ul_sid=cef2538c-9b16-486e-b49f-db9c387b8384&moduleID=708&checktoken=ea8044f2cc7f0a44a9ad159526fd7186&wb_version=5293&refresh_type=0&c=android&s=0b69e4f6&ft=0&ua=Xiaomi-Redmi%208__weibo__11.11.1__android__android9&wm=20005_0002&aid=01AxK-lU0KWwz9mhMfL6gTb37upA4rmFhPxq5T1hrsYVnDdks.&did=0f607264fc6318a92b9e13c65db7cd3cbce74dcd&fid=231278_plaza&uid=2004639399897&v_f=2&v_p=89&from=10BB195010&gsid=_2AkMW0UMLf8NhqwFRmPwTz2LhZYR_ww_EieKgjbLQJRM3HRl-wT_nqksFtRV6PfAyN6rPTMzBcJo_-h6X0zli7DSuUqw-&imsi=&lang=zh_CN&page=1&skin=default&count=20&oldwm=20005_0002&sflag=1&containerid=231289type%3D1&ignore_inturrpted_error=true&no_location_permission=1&android_id=0febc80e083662a7&client_key=c2f5393732c75e52b85b1da27a8e20ae&need_new_pop=1&ul_ctime=1636683060289&need_head_cards=0&cum=53EC532B";
String htmlBody = null;
Request request = RequestUtils.wrapGet(url);
for (int count = 0; count <= 5; count++) {
try (Response response = httpBoot.syncCall(request, ProxyHolder.NAT_HEAVY_PROXY)) {
htmlBody = response.body().string();
} catch (Exception e) {
log.error("解析微博搜索时出现解析错误,页面结构有问题", e);
}
if (htmlBody != null && htmlBody.contains("hotwords")) {
int num = ansysData(htmlBody, date);
if(num>0){
break;
}
} else {
log.info("解析微博" +
"搜索时出现解析错误,页面结构有问题");
continue;
}
}
}
//解析页面数据
private static int ansysData(String htmlBody, Date date) {
//使用静态WeiBoSearchDao,防止频繁连数据库
if (Objects.isNull(weiBoSearchDao)) {
weiBoSearchDao = new WeiBoSearchDao();
}
List<WeiBoSearch> list = new ArrayList<>();
try {
//解析htmlBody
JSONObject object = JSONObject.parseObject(htmlBody);
//类型
String type="大家正在搜";
//获取json数组
JSONArray cards = object.getJSONArray("hotwords");
for (int i = 0; i < cards.size(); i++) {
//获取单条数据
JSONObject card = cards.getJSONObject(i);
String ext = card.getString("ext");
String word = card.getString("word");
//获取标题
String name = card.getString("note");
WeiBoSearch weiBoSearch = new WeiBoSearch(name, ext, word, type, date);
list.add(weiBoSearch);
}
} catch (Exception e) {
log.error("解析微博搜索时出现解析错误,数据不是json结构",e);
}
log.info("{}, 此轮微博搜索采集到的数据量为:{}", new Date(), Integer.valueOf(list != null ? list.size() : 0));
//数据传给dao
weiBoSearchDao.addWeiBoUser(list);
return list.size();
}
}
package com.zhiwei.searchhotcrawler.dao;
import com.mongodb.MongoWriteException;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
import com.zhiwei.searchhotcrawler.bean.WeiBoSearch;
import com.zhiwei.searchhotcrawler.config.DBConfig;
import com.zhiwei.searchhotcrawler.dbtemplate.MongoDBTemplate;
import lombok.extern.log4j.Log4j2;
import org.bson.Document;
import java.util.Date;
import java.util.List;
import static java.util.Objects.nonNull;
@Log4j2
public class WeiBoSearchDao {
public static MongoDatabase mongoDatabase = MongoDBTemplate.getDB(DBConfig.dbName);
public static MongoCollection mongoCollection;
public WeiBoSearchDao() {
String collName = DBConfig.weiBoSearchCollName;
mongoCollection = mongoDatabase.getCollection(collName);
//给数据表创建索引
MongoDBTemplate.createIndex(DBConfig.dbName, collName);
}
/**
* 添加数据入库
* @param weiBoSearch
*/
public void addWeiBoUser(List<WeiBoSearch> weiBoSearch){
for (WeiBoSearch search : weiBoSearch) {
try {
//获取时间
Date time = search.getTime();
String ext = search.getExt();
//获取标题
String name = search.getName();
//获取类型
String type = search.getType();
String word = search.getWord();
//获取id
String id = search.getId();
Document query = new Document("_id", id);
Document nowDoc = (Document) mongoCollection.find(query).first();
//去库里查询数据如果无则添加,有则更新lastTime
if (nonNull(nowDoc)){
nowDoc.put("lastTime", time);
mongoCollection.replaceOne(query, nowDoc);
}else {
nowDoc = new Document();
nowDoc.put("_id",id);
nowDoc.put("name",name);
nowDoc.put("type",type);
nowDoc.put("time",time);
nowDoc.put("word",word);
nowDoc.put("ext",ext);
nowDoc.put("lastTime",time);
//插入数据
mongoCollection.insertOne(nowDoc);
}
}catch (MongoWriteException e1){
log.error("数据写入时出错,数据为【{}】:", search,e1);
}catch (Exception e) {
log.error("数据存储时出错,数据为【{}】:", search,e);
}
}
}
}
...@@ -600,4 +600,15 @@ public class GatherTimer { ...@@ -600,4 +600,15 @@ public class GatherTimer {
TipsUtils.addHotList(HotSearchType.微博要闻榜.name(), WeiboNewsList); TipsUtils.addHotList(HotSearchType.微博要闻榜.name(), WeiboNewsList);
logger.info("微博要闻榜采集结束..."); logger.info("微博要闻榜采集结束...");
} }
/**
 * Scheduled Weibo search-box collection; the cron expression fires at second 0 of every minute
 * and the job runs on the "myScheduler" executor.
 */
@Async("myScheduler")
@Scheduled(cron = "0 * * * * ? ")
public void crawlerWeiBoSearch() {
    logger.info("微博搜索采集开始........");
    // The run timestamp is normalised by DateUtils before it is passed to the crawler.
    weiBoSearchCrawler.weiBoSearch(DateUtils.getMillSecondTime(new Date()));
    logger.info("微博搜索采集结束........");
}
} }
package com.zhiwei.searchhotcrawler.util;
import lombok.extern.log4j.Log4j2;
import org.springframework.core.io.ClassPathResource;
import org.springframework.core.io.Resource;
import javax.script.Invocable;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import java.io.*;
import java.util.Objects;
import java.util.stream.Collectors;
@Log4j2
public class TaoBaoUtils {
public static String parsJSFunction(String sign) {
//脚本的执行结果
String scriptResult = "";
//1.得到脚本引擎
//ScriptEngine engine = new ScriptEngineManager().getEngineByName("JavaScript");
ScriptEngine engine = new ScriptEngineManager().getEngineByExtension("js");
try {
InputStream in = null;
try {
in = TaoBaoUtils.class.getClassLoader().getResourceAsStream("taobao.js");
//如果js存在文件里
//Resource aesJs = new ClassPathResource("taobao.js",TaoBaoUtils.class.getClassLoader());
//InputStream inputStream = aesJs.getInputStream();
InputStreamReader inputStreamReader = new InputStreamReader(in);
BufferedReader bufferedReader = new BufferedReader(inputStreamReader);
if (Objects.isNull(engine)){
log.info("engine为空");
}
if(Objects.isNull(bufferedReader)){
log.info("bufferedReader为空");
}
engine.eval(bufferedReader);
} finally {
in.close();
}
// engine.eval(new BufferedReader(new InputStreamReader(inputStream))
// .lines().parallel().collect(Collectors.joining(System.lineSeparator())));
//3.将引擎转换为Invocable,这样才可以掉用js的方法
Invocable invocable = (Invocable) engine;
//4.使用 invocable.invokeFunction掉用js脚本里的方法,第一個参数为方法名,后面的参数为被调用的js方法的入参
scriptResult = (String) invocable.invokeFunction("h", sign);
} catch (Exception e) {
log.error("Error executing script: ", e);
}
return scriptResult;
}
public static String parsJSFunction1(String sign) {
//脚本的执行结果
String scriptResult = "";
//1.得到脚本引擎
ScriptEngine engine = new ScriptEngineManager().getEngineByName("nashorn");
//ScriptEngine engine = new ScriptEngineManager().getEngineByExtension("js");
try {
InputStream in = null;
try {
in = TaoBaoUtils.class.getClassLoader().getResourceAsStream("taobao.js");
//如果js存在文件里
//Resource aesJs = new ClassPathResource("taobao.js",TaoBaoUtils.class.getClassLoader());
//InputStream inputStream = aesJs.getInputStream();
InputStreamReader inputStreamReader = new InputStreamReader(in);
BufferedReader bufferedReader = new BufferedReader(inputStreamReader);
if (Objects.isNull(engine)){
log.info("engine1为空");
}
if(Objects.isNull(bufferedReader)){
log.info("bufferedReader1为空");
}
engine.eval(bufferedReader);
} finally {
in.close();
}
// engine.eval(new BufferedReader(new InputStreamReader(inputStream))
// .lines().parallel().collect(Collectors.joining(System.lineSeparator())));
//3.将引擎转换为Invocable,这样才可以掉用js的方法
Invocable invocable = (Invocable) engine;
//4.使用 invocable.invokeFunction掉用js脚本里的方法,第一個参数为方法名,后面的参数为被调用的js方法的入参
scriptResult = (String) invocable.invokeFunction("h", sign);
} catch (Exception e) {
log.error("Error executing script1: ", e);
}
return scriptResult;
}
public static String parsJSFunction2(String sign) {
//脚本的执行结果
String scriptResult = "";
//1.得到脚本引擎
//ScriptEngine engine = new ScriptEngineManager().getEngineByName("JavaScript");
ScriptEngine engine = new ScriptEngineManager().getEngineByMimeType("text/javascript");
try {
InputStream in = null;
try {
in = TaoBaoUtils.class.getClassLoader().getResourceAsStream("taobao.js");
//如果js存在文件里
//Resource aesJs = new ClassPathResource("taobao.js",TaoBaoUtils.class.getClassLoader());
//InputStream inputStream = aesJs.getInputStream();
InputStreamReader inputStreamReader = new InputStreamReader(in);
BufferedReader bufferedReader = new BufferedReader(inputStreamReader);
if (Objects.isNull(engine)){
log.info("engine2为空");
}
if(Objects.isNull(bufferedReader)){
log.info("bufferedReader2为空");
}
engine.eval(bufferedReader);
} finally {
in.close();
}
// engine.eval(new BufferedReader(new InputStreamReader(inputStream))
// .lines().parallel().collect(Collectors.joining(System.lineSeparator())));
//3.将引擎转换为Invocable,这样才可以掉用js的方法
Invocable invocable = (Invocable) engine;
//4.使用 invocable.invokeFunction掉用js脚本里的方法,第一個参数为方法名,后面的参数为被调用的js方法的入参
scriptResult = (String) invocable.invokeFunction("h", sign);
} catch (Exception e) {
log.error("Error executing script2: ", e);
}
return scriptResult;
}
}
...@@ -20,4 +20,5 @@ topicCollName=topic_list ...@@ -20,4 +20,5 @@ topicCollName=topic_list
collWechatUserName=wechat_user collWechatUserName=wechat_user
weiBoMassageCollName=weibo_massage weiBoMassageCollName=weibo_massage
weiBoUserCollName=weibo_user weiBoUserCollName=weibo_user
weiBoSearchCollName=weibo_search
# #
// Computes a 128-bit digest of the string `a` and returns it as 32 lowercase hex characters.
// The initial state, per-step constants and rotation amounts below are those of MD5 (RFC 1321).
// The input is first converted to UTF-8 bytes by n(), then padded into 32-bit words by l().
function h(a) {
// Rotate the 32-bit value `a` left by `b` bits.
function b(a, b) {
return a << b | a >>> 32 - b
}
// Add two 32-bit values; bits 31 and 30 are handled separately so the sum wraps at 32 bits.
function c(a, b) {
var c, d, e, f, g;
return e = 2147483648 & a,
f = 2147483648 & b,
c = 1073741824 & a,
d = 1073741824 & b,
g = (1073741823 & a) + (1073741823 & b),
c & d ? 2147483648 ^ g ^ e ^ f : c | d ? 1073741824 & g ? 3221225472 ^ g ^ e ^ f : 1073741824 ^ g ^ e ^ f : g ^ e ^ f
}
// Round 1 mixing function.
function d(a, b, c) {
return a & b | ~a & c
}
// Round 2 mixing function.
function e(a, b, c) {
return a & c | b & ~c
}
// Round 3 mixing function.
function f(a, b, c) {
return a ^ b ^ c
}
// Round 4 mixing function.
function g(a, b, c) {
return b ^ (a | ~c)
}
// One round-1 step: a = e + rotl(a + d(e, f, g) + word h + constant j, i).
// Note: this inner `h` shadows the outer function name inside the outer body.
function h(a, e, f, g, h, i, j) {
return a = c(a, c(c(d(e, f, g), h), j)),
c(b(a, i), e)
}
// One round-2 step (same shape as above, using mixing function e).
function i(a, d, f, g, h, i, j) {
return a = c(a, c(c(e(d, f, g), h), j)),
c(b(a, i), d)
}
// One round-3 step (using mixing function f).
function j(a, d, e, g, h, i, j) {
return a = c(a, c(c(f(d, e, g), h), j)),
c(b(a, i), d)
}
// One round-4 step (using mixing function g).
function k(a, d, e, f, h, i, j) {
return a = c(a, c(c(g(d, e, f), h), j)),
c(b(a, i), d)
}
// Packs the characters of `a` (one byte each, little-endian within a word) into an array of
// 32-bit words, appends the 0x80 marker byte and stores the bit length in the last two words.
// The array length is a multiple of 16 words.
function l(a) {
for (var b, c = a.length, d = c + 8, e = (d - d % 64) / 64, f = 16 * (e + 1), g = new Array(f - 1), h = 0, i = 0; c > i;)
b = (i - i % 4) / 4,
h = i % 4 * 8,
g[b] = g[b] | a.charCodeAt(i) << h,
i++;
return b = (i - i % 4) / 4,
h = i % 4 * 8,
g[b] = g[b] | 128 << h,
g[f - 2] = c << 3,
g[f - 1] = c >>> 29,
g
}
// Formats a 32-bit word as 8 hex characters, lowest byte first, each byte zero-padded.
function m(a) {
var b, c, d = "", e = "";
for (c = 0; 3 >= c; c++)
b = a >>> 8 * c & 255,
e = "0" + b.toString(16),
d += e.substr(e.length - 2, 2);
return d
}
// Normalises "\r\n" to "\n" and encodes the string as UTF-8, one byte per output character.
// Code units below 128 become one byte, below 2048 two bytes, everything else three bytes.
function n(a) {
a = a.replace(/\r\n/g, "\n");
for (var b = "", c = 0; c < a.length; c++) {
var d = a.charCodeAt(c);
128 > d ? b += String.fromCharCode(d) : d > 127 && 2048 > d ? (b += String.fromCharCode(d >> 6 | 192),
b += String.fromCharCode(63 & d | 128)) : (b += String.fromCharCode(d >> 12 | 224),
b += String.fromCharCode(d >> 6 & 63 | 128),
b += String.fromCharCode(63 & d | 128))
}
return b
}
// t, u, v, w hold the running state; p, q, r, s keep the state from the start of each block.
// y..B, C..F, G..J and K..N are the rotation amounts for rounds 1 to 4 respectively.
var o, p, q, r, s, t, u, v, w, x = [], y = 7, z = 12, A = 17, B = 22, C = 5, D = 9, E = 14, F = 20, G = 4,
H = 11, I = 16, J = 23, K = 6, L = 10, M = 15, N = 21;
// Encode and pad the input, set the initial state, then process one 16-word block per iteration.
for (a = n(a),
x = l(a),
t = 1732584193,
u = 4023233417,
v = 2562383102,
w = 271733878,
o = 0; o < x.length; o += 16)
p = t,
q = u,
r = v,
s = w,
// Round 1: 16 steps over words 0..15 in order.
t = h(t, u, v, w, x[o + 0], y, 3614090360),
w = h(w, t, u, v, x[o + 1], z, 3905402710),
v = h(v, w, t, u, x[o + 2], A, 606105819),
u = h(u, v, w, t, x[o + 3], B, 3250441966),
t = h(t, u, v, w, x[o + 4], y, 4118548399),
w = h(w, t, u, v, x[o + 5], z, 1200080426),
v = h(v, w, t, u, x[o + 6], A, 2821735955),
u = h(u, v, w, t, x[o + 7], B, 4249261313),
t = h(t, u, v, w, x[o + 8], y, 1770035416),
w = h(w, t, u, v, x[o + 9], z, 2336552879),
v = h(v, w, t, u, x[o + 10], A, 4294925233),
u = h(u, v, w, t, x[o + 11], B, 2304563134),
t = h(t, u, v, w, x[o + 12], y, 1804603682),
w = h(w, t, u, v, x[o + 13], z, 4254626195),
v = h(v, w, t, u, x[o + 14], A, 2792965006),
u = h(u, v, w, t, x[o + 15], B, 1236535329),
// Round 2: 16 steps, word index advances by 5 (mod 16) starting at 1.
t = i(t, u, v, w, x[o + 1], C, 4129170786),
w = i(w, t, u, v, x[o + 6], D, 3225465664),
v = i(v, w, t, u, x[o + 11], E, 643717713),
u = i(u, v, w, t, x[o + 0], F, 3921069994),
t = i(t, u, v, w, x[o + 5], C, 3593408605),
w = i(w, t, u, v, x[o + 10], D, 38016083),
v = i(v, w, t, u, x[o + 15], E, 3634488961),
u = i(u, v, w, t, x[o + 4], F, 3889429448),
t = i(t, u, v, w, x[o + 9], C, 568446438),
w = i(w, t, u, v, x[o + 14], D, 3275163606),
v = i(v, w, t, u, x[o + 3], E, 4107603335),
u = i(u, v, w, t, x[o + 8], F, 1163531501),
t = i(t, u, v, w, x[o + 13], C, 2850285829),
w = i(w, t, u, v, x[o + 2], D, 4243563512),
v = i(v, w, t, u, x[o + 7], E, 1735328473),
u = i(u, v, w, t, x[o + 12], F, 2368359562),
// Round 3: 16 steps, word index advances by 3 (mod 16) starting at 5.
t = j(t, u, v, w, x[o + 5], G, 4294588738),
w = j(w, t, u, v, x[o + 8], H, 2272392833),
v = j(v, w, t, u, x[o + 11], I, 1839030562),
u = j(u, v, w, t, x[o + 14], J, 4259657740),
t = j(t, u, v, w, x[o + 1], G, 2763975236),
w = j(w, t, u, v, x[o + 4], H, 1272893353),
v = j(v, w, t, u, x[o + 7], I, 4139469664),
u = j(u, v, w, t, x[o + 10], J, 3200236656),
t = j(t, u, v, w, x[o + 13], G, 681279174),
w = j(w, t, u, v, x[o + 0], H, 3936430074),
v = j(v, w, t, u, x[o + 3], I, 3572445317),
u = j(u, v, w, t, x[o + 6], J, 76029189),
t = j(t, u, v, w, x[o + 9], G, 3654602809),
w = j(w, t, u, v, x[o + 12], H, 3873151461),
v = j(v, w, t, u, x[o + 15], I, 530742520),
u = j(u, v, w, t, x[o + 2], J, 3299628645),
// Round 4: 16 steps, word index advances by 7 (mod 16) starting at 0.
t = k(t, u, v, w, x[o + 0], K, 4096336452),
w = k(w, t, u, v, x[o + 7], L, 1126891415),
v = k(v, w, t, u, x[o + 14], M, 2878612391),
u = k(u, v, w, t, x[o + 5], N, 4237533241),
t = k(t, u, v, w, x[o + 12], K, 1700485571),
w = k(w, t, u, v, x[o + 3], L, 2399980690),
v = k(v, w, t, u, x[o + 10], M, 4293915773),
u = k(u, v, w, t, x[o + 1], N, 2240044497),
t = k(t, u, v, w, x[o + 8], K, 1873313359),
w = k(w, t, u, v, x[o + 15], L, 4264355552),
v = k(v, w, t, u, x[o + 6], M, 2734768916),
u = k(u, v, w, t, x[o + 13], N, 1309151649),
t = k(t, u, v, w, x[o + 4], K, 4149444226),
w = k(w, t, u, v, x[o + 11], L, 3174756917),
v = k(v, w, t, u, x[o + 2], M, 718787259),
u = k(u, v, w, t, x[o + 9], N, 3951481745),
// Add the state saved at the start of the block back into the running state.
t = c(t, p),
u = c(u, q),
v = c(v, r),
w = c(w, s);
// Concatenate the four state words as hex to form the digest string.
var O = m(t) + m(u) + m(v) + m(w);
return O.toLowerCase()
}
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment