Commit 05561321 by [zhangzhiwei]

去重中间件版本的新包,并修改了http采集方式

parent cd252c2d
......@@ -3,44 +3,17 @@
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>weibohotcrawler</artifactId>
<version>0.0.1-SNAPSHOT</version>
<version>0.0.2-SNAPSHOT</version>
<name>weibohotcrawler</name>
<description>微博热搜1小时榜单,社会、热点采集程序</description>
<dependencies>
<dependency>
<groupId>com.zhiwei.middleware</groupId>
<artifactId>rsid-client</artifactId>
<version>0.0.2-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>javax.mail</groupId>
<artifactId>mail</artifactId>
<version>1.4.7</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>3.8.1</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpcore</artifactId>
<version>4.4.6</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.3</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.8.3</version>
</dependency>
<dependency>
<groupId>com.zhiwei</groupId>
......@@ -48,10 +21,16 @@
<version>0.0.5-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.zhiwei</groupId>
<artifactId>zhiweiTools</artifactId>
<version>0.0.6-SNAPSHOT</version>
<groupId>com.zhiwei.middleware</groupId>
<artifactId>cleaner-unified-urlfilter</artifactId>
<version>1.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.0.5-SNAPSHOT</version>
</dependency>
</dependencies>
<!-- 打包管理 -->
<build>
......
......@@ -17,9 +17,9 @@ public class DataCrawlerStart{
}
/**
 * Registers the WeiboBangdanData, WeiboHotData and WeiboSocietyData tasks on {@code scheduled}
 * with {@code scheduleWithFixedDelay}; all delays are given in milliseconds.
 */
public void start() {
// NOTE(review): this hunk shows six schedule calls. Presumably the first three
// (15 s / 20 s / 19 s delay between runs) are the pre-commit lines and the last three
// (3 min delay between runs) are their replacements; confirm against the repository.
scheduled.scheduleWithFixedDelay(new WeiboBangdanData(), 2000, 15*1000, TimeUnit.MILLISECONDS);
scheduled.scheduleWithFixedDelay(new WeiboHotData(), 1000, 20*1000, TimeUnit.MILLISECONDS);
scheduled.scheduleWithFixedDelay(new WeiboSocietyData(), 3000, 19*1000, TimeUnit.MILLISECONDS);
// Initial delays stay staggered (2 s, 1 s, 3 s); the delay between runs is 3 minutes for every task.
scheduled.scheduleWithFixedDelay(new WeiboBangdanData(), 2000, 3*60*1000, TimeUnit.MILLISECONDS);
scheduled.scheduleWithFixedDelay(new WeiboHotData(), 1000, 3*60*1000, TimeUnit.MILLISECONDS);
scheduled.scheduleWithFixedDelay(new WeiboSocietyData(), 3000, 3*60*1000, TimeUnit.MILLISECONDS);
}
......
......@@ -6,12 +6,11 @@ import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.tools.tools.ZhiWeiTools;
import com.zhiwei.weibobusiness.weibo4j.model.Status;
import com.zhiwei.weibocrawler.crawler.getdata.WeiboCrawlerAnalysis;
import com.zhiwei.weibocrawler.rsidClient.DataQueue;
import com.zhiwei.weibocrawler.rsidClient.UpdateQueue;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
/***
*
* @ClassName DataUpdate
......@@ -56,6 +55,7 @@ public class DataUpdate implements Runnable{
}
i = 1;
}else {
logger.error("目前数据量不足50,目前队列中得数据量为:::{}", DataQueue.linkQueue.size());
i++;
}
} catch (Exception e) {
......
......@@ -5,7 +5,7 @@ import java.util.Date;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
import com.zhiwei.tools.tools.ZhiWeiTools;
/**
*
* @ClassName WeiboBangdanData
......@@ -28,7 +28,7 @@ public class WeiboBangdanData implements Runnable{
weiboCrawlerAnalysis.getWeiboHotMid(url);
ZhiWeiTools.sleep(12000);
} catch (Exception e) {
logger.error("出错====榜单的出错了",e.getMessage());
logger.error("出错====榜单的出错了",e);
e.printStackTrace();
ZhiWeiTools.sleep(20);
continue;
......
package com.zhiwei.weibocrawler.crawler.getdata;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
......@@ -14,6 +11,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.tools.tools.ZhiWeiTools;
import com.zhiwei.weibobusiness.business.SearchBusiness;
import com.zhiwei.weibobusiness.weibo4j.model.Status;
import com.zhiwei.weibobusiness.weibo4j.model.StatusWapper;
......@@ -21,7 +19,7 @@ import com.zhiwei.weibobusiness.weibo4j.model.WeiboException;
import com.zhiwei.weibocrawler.httpclient.HttpClientDemo;
import com.zhiwei.weibocrawler.rsidClient.DataQueue;
import com.zhiwei.weibocrawler.rsidClient.RsidClientDAO;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
/**
*
* @ClassName WeiboCrawlerAnalysis
......@@ -31,102 +29,110 @@ import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
* @version 1.0.0
*/
public class WeiboCrawlerAnalysis {
private static Logger logger = LoggerFactory.getLogger(WeiboCrawlerAnalysis.class);
/**
 * Fetches Weibo statuses for the given mids through {@link SearchBusiness}.
 *
 * <p>The mids are sent as comma-separated batches of at most 49 entries. A batch that
 * fails is logged and skipped, so it can no longer corrupt the following batch, and a
 * list whose size is an exact multiple of 49 no longer triggers an out-of-range
 * {@code substring} on an empty remainder.
 *
 * @param midsList mids to look up
 * @param businessToken token used to create the {@link SearchBusiness} client
 * @return the collected statuses, or {@code null} when {@code midsList} is null/empty
 *         or when the final batch fails (kept for callers that test for {@code null})
 */
public static List<Status> getWeiboData(List<String> midsList, String businessToken) {
    SearchBusiness searchBusiness = new SearchBusiness(businessToken);
    if (midsList == null || midsList.isEmpty()) {
        return null;
    }
    // The original flushed once the counter exceeded 48, i.e. every 49 mids.
    final int batchSize = 49;
    final int total = midsList.size();
    List<Status> statuses = new ArrayList<Status>();
    for (int from = 0; from < total; from += batchSize) {
        int to = Math.min(from + batchSize, total);
        StringBuilder joined = new StringBuilder();
        for (String mid : midsList.subList(from, to)) {
            if (joined.length() > 0) {
                joined.append(',');
            }
            joined.append(mid);
        }
        String mids = joined.toString();
        try {
            StatusWapper statusWapper = searchBusiness.showStatusBusniess(mids);
            statuses.addAll(statusWapper.getStatuses());
        } catch (WeiboException e) {
            logger.error("数据更新出错部分mids=========" + mids);
            // Pass the exception itself so the message and stack trace reach the log.
            logger.error("数据出错", e);
            if (to == total) {
                // Preserve the original contract: a failure of the last batch yields null.
                return null;
            }
        }
    }
    return statuses;
}
/**
 * Downloads the page at {@code url} and hands the response body to
 * {@code getWeiboData(String)} for parsing.
 *
 * <p>Failures are logged through the class logger instead of being printed to stderr,
 * and an empty download is skipped rather than passed on to the parser.
 *
 * @param url address of the list page to download
 */
public void getWeiboHotMid(String url) {
    try {
        String result = HttpClientDemo.executeHttpRequestGet(url);
        if (result == null || result.isEmpty()) {
            // Nothing was downloaded; parsing would only produce a misleading parse error.
            logger.warn("下载结果为空,跳过解析,url:{}", url);
            return;
        }
        getWeiboData(result);
    } catch (Exception e) {
        logger.error("获取微博mid出错,url:{}", url, e);
    }
}
/**
 * Parses a downloaded response and queues the mids found in it.
 *
 * <p>The response is read as JSON, its {@code data} field is parsed as HTML, and every
 * {@code div[action-type=feed_list_item]} under {@code div.UG_contents ul.clearfix} is
 * inspected. A mid longer than 16 characters is cut down to its last 16 characters.
 * The mid is offered to {@link DataQueue} when {@code RsidClientDAO.isWeiboExit}
 * returns {@code true}.
 *
 * @param result raw response body
 */
private void getWeiboData(String result) {
    try {
        JSONObject json = (JSONObject) JSONObject.parse(result);
        String s = json.getString("data");
        Document document = Jsoup.parse(s);
        Elements elements = document.select("div.UG_contents").select("ul.clearfix")
                .select("div[action-type=feed_list_item]");
        for (Element element : elements) {
            try {
                String mid = element.attr("mid");
                if (mid.isEmpty()) {
                    // No mid attribute on this element; nothing to check or queue.
                    continue;
                }
                if (mid.length() > 16) {
                    mid = mid.substring(mid.length() - 16);
                }
                if (RsidClientDAO.isWeiboExit(mid)) {
                    DataQueue.offer(mid);
                }
            } catch (Exception e) {
                // Log the exception itself; the old call passed only its message, which was dropped.
                logger.error("数据解析出错", e);
                ZhiWeiTools.sleep(200);
            }
        }
    } catch (Exception e) {
        logger.error("数据解析出错", e);
        ZhiWeiTools.sleep(200);
    }
}
private static Logger logger = LoggerFactory.getLogger(WeiboCrawlerAnalysis.class);
/**
 * Fetches Weibo statuses for the given mids through {@link SearchBusiness}.
 *
 * <p>The mids are sent as comma-separated batches of at most 49 entries. A batch that
 * fails is logged and skipped, so it can no longer corrupt the following batch, and a
 * list whose size is an exact multiple of 49 no longer triggers an out-of-range
 * {@code substring} on an empty remainder.
 *
 * @param midsList mids to look up
 * @param businessToken token used to create the {@link SearchBusiness} client
 * @return the collected statuses, or {@code null} when {@code midsList} is null/empty
 *         or when the final batch fails (kept for callers that test for {@code null})
 */
public static List<Status> getWeiboData(List<String> midsList, String businessToken) {
    SearchBusiness searchBusiness = new SearchBusiness(businessToken);
    if (midsList == null || midsList.isEmpty()) {
        return null;
    }
    // The original flushed once the counter exceeded 48, i.e. every 49 mids.
    final int batchSize = 49;
    final int total = midsList.size();
    List<Status> statuses = new ArrayList<Status>();
    for (int from = 0; from < total; from += batchSize) {
        int to = Math.min(from + batchSize, total);
        StringBuilder joined = new StringBuilder();
        for (String mid : midsList.subList(from, to)) {
            if (joined.length() > 0) {
                joined.append(',');
            }
            joined.append(mid);
        }
        String mids = joined.toString();
        try {
            StatusWapper statusWapper = searchBusiness.showStatusBusniess(mids);
            statuses.addAll(statusWapper.getStatuses());
        } catch (WeiboException e) {
            logger.error("数据更新出错部分mids=========" + mids);
            // Pass the exception itself so the message and stack trace reach the log.
            logger.error("数据出错", e);
            if (to == total) {
                // Preserve the original contract: a failure of the last batch yields null.
                return null;
            }
        }
    }
    return statuses;
}
/**
 * Downloads the page at {@code url} and hands the response body to
 * {@code getWeiboData(String)} for parsing.
 *
 * <p>Failures are logged through the class logger instead of being printed to stderr,
 * and an empty download is skipped rather than passed on to the parser.
 *
 * @param url address of the list page to download
 */
public void getWeiboHotMid(String url) {
    try {
        String result = HttpClientDemo.executeHttpRequestGet(url);
        if (result == null || result.isEmpty()) {
            // Nothing was downloaded; parsing would only produce a misleading parse error.
            logger.warn("下载结果为空,跳过解析,url:{}", url);
            return;
        }
        getWeiboData(result);
    } catch (Exception e) {
        logger.error("获取微博mid出错,url:{}", url, e);
    }
}
/**
 * Parses a downloaded response and queues the mids that the filter does not know yet.
 *
 * <p>The response is read as JSON, its {@code data} field is parsed as HTML, and every
 * {@code div[action-type=feed_list_item]} under {@code div.UG_contents ul.clearfix} is
 * inspected. A mid longer than 16 characters is cut down to its last 16 characters.
 * Duplicates within one response are dropped, then each remaining mid is offered to
 * {@link DataQueue} when {@code RsidClientDAO.isWeiboExit} returns {@code false}.
 *
 * @param result raw response body
 */
private void getWeiboData(String result) {
    try {
        JSONObject json = (JSONObject) JSONObject.parse(result);
        String s = json.getString("data");
        Document document = Jsoup.parse(s);
        Elements elements = document.select("div.UG_contents").select("ul.clearfix")
                .select("div[action-type=feed_list_item]");
        // Diagnostic output goes through the logger rather than stdout.
        logger.debug("elements size is {}", elements.size());
        List<String> midsList = new ArrayList<String>();
        for (Element element : elements) {
            try {
                String mid = element.attr("mid");
                if (mid.isEmpty()) {
                    // No mid attribute on this element; nothing to check or queue.
                    continue;
                }
                if (mid.length() > 16) {
                    mid = mid.substring(mid.length() - 16);
                }
                if (!midsList.contains(mid)) {
                    midsList.add(mid);
                }
            } catch (Exception e) {
                // Log the exception itself; the old call passed only its message, which was dropped.
                logger.error("数据解析出错", e);
                ZhiWeiTools.sleep(200);
            }
        }
        for (String mid : midsList) {
            if (!RsidClientDAO.isWeiboExit(mid)) {
                DataQueue.offer(mid);
            }
        }
    } catch (Exception e) {
        logger.error("数据解析出错", e);
        ZhiWeiTools.sleep(200);
    }
}
}
......@@ -5,7 +5,7 @@ import java.util.Date;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
import com.zhiwei.tools.tools.ZhiWeiTools;
/**
*
* @ClassName WeiboHotData
......
......@@ -5,7 +5,7 @@ import java.util.Date;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
import com.zhiwei.tools.tools.ZhiWeiTools;
/**
*
* @ClassName WeiboSocietyData
......
package com.zhiwei.weibocrawler.httpclient;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK;
import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
public class HttpClientDemo {
private static Logger logger = LoggerFactory.getLogger(HttpClientDemo.class);
// public static String executeHttpRequestGet(String url) throws IOException {
// String result = null;
// Map<String, String> headerMap = new HashMap<String, String>();
// headerMap.put("User-Agent",
// "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36");
// headerMap.put("Accept","*/*");
// headerMap.put("Accept-Encoding", "gzip, deflate, br");
// headerMap.put("Accept-Language", "zh-CN,zh;q=0.9");
// headerMap.put("Connection", "keep-alive");
// headerMap.put("Content-Type", "application/x-www-form-urlencoded");
// headerMap.put("Host", "weibo.com");
// CloseableHttpClient httpClient = null;
// for(int j = 1;j <= 3;j++) {
// try {
// HttpGet httpGet = new HttpGet(url);
// RequestConfig requestConfig = RequestConfig.custom()
// .setSocketTimeout(8000).setConnectTimeout(8000).build();
// httpClient = HttpClients.custom()
// .setDefaultRequestConfig(requestConfig).build();
// if (headerMap != null) {
// for (Entry<String, String> header : headerMap.entrySet()) {
// httpGet.setHeader(header.getKey(), header.getValue());
// }
// }
// result = EntityUtils
// .toString(httpClient.execute(httpGet).getEntity());
// return result;
// }catch (Exception e) {
// e.printStackTrace();
// continue;
// }finally {
// if (httpClient != null) {
// httpClient.close();
// }
// }
// }
// return result;
//
// }
public static String executeHttpRequestGet(String url) throws IOException {
public static String executeHttpRequestGet(String url) {
String result = null;
Map<String, String> headerMap = new HashMap<String, String>();
headerMap.put("User-Agent",
......@@ -72,11 +23,17 @@ public class HttpClientDemo {
headerMap.put("Content-Type", "application/x-www-form-urlencoded");
headerMap.put("Host", "weibo.com");
try {
result = HttpClientTemplateOK.get(url, null, headerMap);
System.out.println("开始下载");
// Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url, headerMap));
// result = response.body().string();
result = HttpClientTemplateOK.get(url, null, headerMap);
System.out.println("下载结束");
return result;
} catch (Exception e) {
logger.error("httpClient 获取数据出现问题:{}", e.getMessage());
logger.info("httpClient 获取数据出现问题:{}", e.getMessage());
e.printStackTrace();
}
return result;
}
......
......@@ -3,9 +3,11 @@ package com.zhiwei.weibocrawler.rsidClient;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.rsid.core.RsidClient;
import com.zhiwei.middleware.cleaner.ptenum.PTENUM;
import com.zhiwei.middleware.cleaner.urlfilter.UnifiedUrlFilterClient;
import com.zhiwei.middleware.filter.config.Definition;
import com.zhiwei.tools.tools.ZhiWeiTools;
import com.zhiwei.weibocrawler.config.Config;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
/**
* @ClassName RsidClientDAO
......@@ -16,17 +18,22 @@ import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
*/
public class RsidClientDAO {
// private static final String rsidUrl = "zookeeper://192.168.0.203:2181"; //中间件zookkeeper地址,服务器地址
//
// private static final String rsidGroup = "rsidserver"; //中间件分组
//
// private static final String redisWeiboKey = "weibo"; //去重的分组
private static RsidClient client = RsidClient.build(Config.rsidUrl, Config.rsidGroup);
private static UnifiedUrlFilterClient client;
private static Logger logger = LoggerFactory.getLogger(RsidClientDAO.class);
// Creates the shared filter client once, when the class is initialized. A static
// initializer runs a single time with `client` still unset, so the former null checks
// and the synchronized block around them had no effect and are removed.
// NOTE(review): if the connection fails, `client` stays null and later calls on it will
// throw; confirm that callers are expected to tolerate that.
static {
    try {
        client = UnifiedUrlFilterClient.getClient(Config.rsidUrl, Config.rsidGroup, Definition.GroupType.PROVIDER);
    } catch (Exception e) {
        // Pass the exception as the throwable argument; the old "{}" placeholder was never filled.
        logger.error("链接清洗中间件时出现错误", e);
    }
}
/**
* @Description 验证微博是否重复
* @param mid
......@@ -36,7 +43,9 @@ public class RsidClientDAO {
//循环3次避免连接超时引起的验证失效
for(int i=0; i<3; i++){
try {
return client.addFilterUrl(mid, false, Config.redisWeiboKey);
boolean f = client.contains(mid, PTENUM.COMMON);
System.out.println(mid+"==========="+f);
return f;
} catch (Exception e) {
logger.error("判断此条微博消息是否存在出现问题",e.fillInStackTrace());
ZhiWeiTools.sleep(200);
......@@ -46,6 +55,4 @@ public class RsidClientDAO {
return false;
}
}
......@@ -3,14 +3,14 @@ package weibotest;
import java.util.Map;
import com.zhiwei.tools.tools.ZhiWeiTools;
import com.zhiwei.weibocrawler.crawler.GetData;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
public class HotWeiboTest {
public static void main(String[] args) {
//开启采集
String token = "2.00HUuC3C3_jZ8E0c00a67ab8xbOHqB";
String token = "2.00HUuC3C3_jZ8E36c5026e390AzIOP";
GetData.start(token);
// //获取数据
while(true){
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment