Commit 05561321 by [zhangzhiwei]

去重中间件版本的新包,并修改了http采集方式

parent cd252c2d
...@@ -3,44 +3,17 @@ ...@@ -3,44 +3,17 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId> <groupId>com.zhiwei</groupId>
<artifactId>weibohotcrawler</artifactId> <artifactId>weibohotcrawler</artifactId>
<version>0.0.1-SNAPSHOT</version> <version>0.0.2-SNAPSHOT</version>
<name>weibohotcrawler</name> <name>weibohotcrawler</name>
<description>微博热搜1小时榜单,社会、热点采集程序</description> <description>微博热搜1小时榜单,社会、热点采集程序</description>
<dependencies> <dependencies>
<dependency>
<groupId>com.zhiwei.middleware</groupId>
<artifactId>rsid-client</artifactId>
<version>0.0.2-SNAPSHOT</version>
</dependency>
<dependency> <dependency>
<groupId>javax.mail</groupId> <groupId>javax.mail</groupId>
<artifactId>mail</artifactId> <artifactId>mail</artifactId>
<version>1.4.7</version> <version>1.4.7</version>
</dependency> </dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>3.8.1</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpcore</artifactId>
<version>4.4.6</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.3</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.8.3</version>
</dependency>
<dependency> <dependency>
<groupId>com.zhiwei</groupId> <groupId>com.zhiwei</groupId>
...@@ -48,10 +21,16 @@ ...@@ -48,10 +21,16 @@
<version>0.0.5-SNAPSHOT</version> <version>0.0.5-SNAPSHOT</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.zhiwei</groupId> <groupId>com.zhiwei.middleware</groupId>
<artifactId>zhiweiTools</artifactId> <artifactId>cleaner-unified-urlfilter</artifactId>
<version>0.0.6-SNAPSHOT</version> <version>1.0-SNAPSHOT</version>
</dependency> </dependency>
<dependency>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.0.5-SNAPSHOT</version>
</dependency>
</dependencies> </dependencies>
<!-- 打包管理 --> <!-- 打包管理 -->
<build> <build>
......
...@@ -17,9 +17,9 @@ public class DataCrawlerStart{ ...@@ -17,9 +17,9 @@ public class DataCrawlerStart{
} }
public void start() { public void start() {
scheduled.scheduleWithFixedDelay(new WeiboBangdanData(), 2000, 15*1000, TimeUnit.MILLISECONDS); scheduled.scheduleWithFixedDelay(new WeiboBangdanData(), 2000, 3*60*1000, TimeUnit.MILLISECONDS);
scheduled.scheduleWithFixedDelay(new WeiboHotData(), 1000, 20*1000, TimeUnit.MILLISECONDS); scheduled.scheduleWithFixedDelay(new WeiboHotData(), 1000, 3*60*1000, TimeUnit.MILLISECONDS);
scheduled.scheduleWithFixedDelay(new WeiboSocietyData(), 3000, 19*1000, TimeUnit.MILLISECONDS); scheduled.scheduleWithFixedDelay(new WeiboSocietyData(), 3000, 3*60*1000, TimeUnit.MILLISECONDS);
} }
......
...@@ -6,12 +6,11 @@ import java.util.List; ...@@ -6,12 +6,11 @@ import java.util.List;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.zhiwei.tools.tools.ZhiWeiTools;
import com.zhiwei.weibobusiness.weibo4j.model.Status; import com.zhiwei.weibobusiness.weibo4j.model.Status;
import com.zhiwei.weibocrawler.crawler.getdata.WeiboCrawlerAnalysis; import com.zhiwei.weibocrawler.crawler.getdata.WeiboCrawlerAnalysis;
import com.zhiwei.weibocrawler.rsidClient.DataQueue; import com.zhiwei.weibocrawler.rsidClient.DataQueue;
import com.zhiwei.weibocrawler.rsidClient.UpdateQueue; import com.zhiwei.weibocrawler.rsidClient.UpdateQueue;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
/*** /***
* *
* @ClassName DataUpdate * @ClassName DataUpdate
...@@ -56,6 +55,7 @@ public class DataUpdate implements Runnable{ ...@@ -56,6 +55,7 @@ public class DataUpdate implements Runnable{
} }
i = 1; i = 1;
}else { }else {
logger.error("目前数据量不足50,目前队列中得数据量为:::{}", DataQueue.linkQueue.size());
i++; i++;
} }
} catch (Exception e) { } catch (Exception e) {
......
...@@ -5,7 +5,7 @@ import java.util.Date; ...@@ -5,7 +5,7 @@ import java.util.Date;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
/** /**
* *
* @ClassName WeiboBangdanData * @ClassName WeiboBangdanData
...@@ -28,7 +28,7 @@ public class WeiboBangdanData implements Runnable{ ...@@ -28,7 +28,7 @@ public class WeiboBangdanData implements Runnable{
weiboCrawlerAnalysis.getWeiboHotMid(url); weiboCrawlerAnalysis.getWeiboHotMid(url);
ZhiWeiTools.sleep(12000); ZhiWeiTools.sleep(12000);
} catch (Exception e) { } catch (Exception e) {
logger.error("出错====榜单的出错了",e.getMessage()); logger.error("出错====榜单的出错了",e);
e.printStackTrace(); e.printStackTrace();
ZhiWeiTools.sleep(20); ZhiWeiTools.sleep(20);
continue; continue;
......
package com.zhiwei.weibocrawler.crawler.getdata; package com.zhiwei.weibocrawler.crawler.getdata;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Date;
import java.util.List; import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
...@@ -14,6 +11,7 @@ import org.slf4j.Logger; ...@@ -14,6 +11,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.tools.tools.ZhiWeiTools;
import com.zhiwei.weibobusiness.business.SearchBusiness; import com.zhiwei.weibobusiness.business.SearchBusiness;
import com.zhiwei.weibobusiness.weibo4j.model.Status; import com.zhiwei.weibobusiness.weibo4j.model.Status;
import com.zhiwei.weibobusiness.weibo4j.model.StatusWapper; import com.zhiwei.weibobusiness.weibo4j.model.StatusWapper;
...@@ -21,7 +19,7 @@ import com.zhiwei.weibobusiness.weibo4j.model.WeiboException; ...@@ -21,7 +19,7 @@ import com.zhiwei.weibobusiness.weibo4j.model.WeiboException;
import com.zhiwei.weibocrawler.httpclient.HttpClientDemo; import com.zhiwei.weibocrawler.httpclient.HttpClientDemo;
import com.zhiwei.weibocrawler.rsidClient.DataQueue; import com.zhiwei.weibocrawler.rsidClient.DataQueue;
import com.zhiwei.weibocrawler.rsidClient.RsidClientDAO; import com.zhiwei.weibocrawler.rsidClient.RsidClientDAO;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
/** /**
* *
* @ClassName WeiboCrawlerAnalysis * @ClassName WeiboCrawlerAnalysis
...@@ -33,7 +31,6 @@ import com.zhiwei.zhiweiTools.tools.ZhiWeiTools; ...@@ -33,7 +31,6 @@ import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
public class WeiboCrawlerAnalysis { public class WeiboCrawlerAnalysis {
private static Logger logger = LoggerFactory.getLogger(WeiboCrawlerAnalysis.class); private static Logger logger = LoggerFactory.getLogger(WeiboCrawlerAnalysis.class);
/** /**
* *
* @Description (mid获取微博数据) * @Description (mid获取微博数据)
...@@ -41,38 +38,38 @@ public class WeiboCrawlerAnalysis { ...@@ -41,38 +38,38 @@ public class WeiboCrawlerAnalysis {
* @param businessToken * @param businessToken
* @return * @return
*/ */
public static List<Status> getWeiboData(List<String> midsList,String businessToken) { public static List<Status> getWeiboData(List<String> midsList, String businessToken) {
SearchBusiness searchBusiness = new SearchBusiness(businessToken); SearchBusiness searchBusiness = new SearchBusiness(businessToken);
if(midsList.size() < 1){ if (midsList.size() < 1) {
return null; return null;
} }
List<Status> statuses = new ArrayList<Status>(); List<Status> statuses = new ArrayList<Status>();
String mids = ""; String mids = "";
int i = 0; int i = 0;
for(String mid : midsList) { for (String mid : midsList) {
mids = mids + mid + ","; mids = mids + mid + ",";
i++; i++;
if(i > 48) { if (i > 48) {
try { try {
mids = mids.substring(0,mids.length()-1); mids = mids.substring(0, mids.length() - 1);
StatusWapper statusWapper = searchBusiness.showStatusBusniess(mids); StatusWapper statusWapper = searchBusiness.showStatusBusniess(mids);
statuses.addAll(statusWapper.getStatuses()); statuses.addAll(statusWapper.getStatuses());
i = 0; i = 0;
mids = ""; mids = "";
}catch (WeiboException e) { } catch (WeiboException e) {
logger.error("数据更新出错部分mids========="+mids); logger.error("数据更新出错部分mids=========" + mids);
e.printStackTrace(); e.printStackTrace();
continue; continue;
} }
} }
} }
try { try {
mids = mids.substring(0,mids.length()-1); mids = mids.substring(0, mids.length() - 1);
StatusWapper statusWapper = searchBusiness.showStatusBusniess(mids); StatusWapper statusWapper = searchBusiness.showStatusBusniess(mids);
statuses.addAll(statusWapper.getStatuses()); statuses.addAll(statusWapper.getStatuses());
}catch (WeiboException e) { } catch (WeiboException e) {
logger.error("数据更新出错部分mids========="+mids); logger.error("数据更新出错部分mids=========" + mids);
logger.error("数据出错",e.getMessage()); logger.error("数据出错", e.getMessage());
e.printStackTrace(); e.printStackTrace();
return null; return null;
} }
...@@ -104,26 +101,35 @@ public class WeiboCrawlerAnalysis { ...@@ -104,26 +101,35 @@ public class WeiboCrawlerAnalysis {
JSONObject json = (JSONObject) JSONObject.parse(result); JSONObject json = (JSONObject) JSONObject.parse(result);
String s = json.getString("data"); String s = json.getString("data");
Document document = Jsoup.parse(s); Document document = Jsoup.parse(s);
Elements elements = document.select("div.UG_contents").select("ul.clearfix").select("div[action-type=feed_list_item]"); Elements elements = document.select("div.UG_contents").select("ul.clearfix")
for(Element element : elements) { .select("div[action-type=feed_list_item]");
System.out.println("elements size is " + elements.size());
List<String> midsList = new ArrayList<String>();
for (Element element : elements) {
try { try {
String mid = element.attr("mid"); String mid = element.attr("mid");
if(mid.length() > 16) { if (mid.length() > 16) {
mid = mid.substring(mid.length()-16, mid.length()); mid = mid.substring(mid.length() - 16, mid.length());
} }
if(RsidClientDAO.isWeiboExit(mid)) { if(!midsList.contains(mid)){
DataQueue.offer(mid); midsList.add(mid);
} }
} catch (Exception e) { } catch (Exception e) {
logger.error("数据解析出错",e.getMessage()); logger.error("数据解析出错", e.getMessage());
ZhiWeiTools.sleep(200); ZhiWeiTools.sleep(200);
e.printStackTrace(); e.printStackTrace();
continue; continue;
} }
}
if(!midsList.isEmpty()){
for(String mid : midsList){
if (!RsidClientDAO.isWeiboExit(mid)) {
DataQueue.offer(mid);
}
}
} }
} catch (Exception e) { } catch (Exception e) {
logger.error("数据解析出错",e.getMessage()); logger.error("数据解析出错", e.getMessage());
ZhiWeiTools.sleep(200); ZhiWeiTools.sleep(200);
e.printStackTrace(); e.printStackTrace();
} }
......
...@@ -5,7 +5,7 @@ import java.util.Date; ...@@ -5,7 +5,7 @@ import java.util.Date;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
/** /**
* *
* @ClassName WeiboHotData * @ClassName WeiboHotData
......
...@@ -5,7 +5,7 @@ import java.util.Date; ...@@ -5,7 +5,7 @@ import java.util.Date;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
/** /**
* *
* @ClassName WeiboSocietyData * @ClassName WeiboSocietyData
......
package com.zhiwei.weibocrawler.httpclient; package com.zhiwei.weibocrawler.httpclient;
import java.io.IOException;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;
import java.util.Map.Entry;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK; import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
public class HttpClientDemo { public class HttpClientDemo {
private static Logger logger = LoggerFactory.getLogger(HttpClientDemo.class); private static Logger logger = LoggerFactory.getLogger(HttpClientDemo.class);
// public static String executeHttpRequestGet(String url) throws IOException { public static String executeHttpRequestGet(String url) {
// String result = null;
// Map<String, String> headerMap = new HashMap<String, String>();
// headerMap.put("User-Agent",
// "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36");
// headerMap.put("Accept","*/*");
// headerMap.put("Accept-Encoding", "gzip, deflate, br");
// headerMap.put("Accept-Language", "zh-CN,zh;q=0.9");
// headerMap.put("Connection", "keep-alive");
// headerMap.put("Content-Type", "application/x-www-form-urlencoded");
// headerMap.put("Host", "weibo.com");
// CloseableHttpClient httpClient = null;
// for(int j = 1;j <= 3;j++) {
// try {
// HttpGet httpGet = new HttpGet(url);
// RequestConfig requestConfig = RequestConfig.custom()
// .setSocketTimeout(8000).setConnectTimeout(8000).build();
// httpClient = HttpClients.custom()
// .setDefaultRequestConfig(requestConfig).build();
// if (headerMap != null) {
// for (Entry<String, String> header : headerMap.entrySet()) {
// httpGet.setHeader(header.getKey(), header.getValue());
// }
// }
// result = EntityUtils
// .toString(httpClient.execute(httpGet).getEntity());
// return result;
// }catch (Exception e) {
// e.printStackTrace();
// continue;
// }finally {
// if (httpClient != null) {
// httpClient.close();
// }
// }
// }
// return result;
//
// }
public static String executeHttpRequestGet(String url) throws IOException {
String result = null; String result = null;
Map<String, String> headerMap = new HashMap<String, String>(); Map<String, String> headerMap = new HashMap<String, String>();
headerMap.put("User-Agent", headerMap.put("User-Agent",
...@@ -72,11 +23,17 @@ public class HttpClientDemo { ...@@ -72,11 +23,17 @@ public class HttpClientDemo {
headerMap.put("Content-Type", "application/x-www-form-urlencoded"); headerMap.put("Content-Type", "application/x-www-form-urlencoded");
headerMap.put("Host", "weibo.com"); headerMap.put("Host", "weibo.com");
try { try {
System.out.println("开始下载");
// Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url, headerMap));
// result = response.body().string();
result = HttpClientTemplateOK.get(url, null, headerMap); result = HttpClientTemplateOK.get(url, null, headerMap);
System.out.println("下载结束");
return result;
} catch (Exception e) { } catch (Exception e) {
logger.error("httpClient 获取数据出现问题:{}", e.getMessage()); logger.info("httpClient 获取数据出现问题:{}", e.getMessage());
e.printStackTrace();
} }
return result; return result;
} }
......
...@@ -3,9 +3,11 @@ package com.zhiwei.weibocrawler.rsidClient; ...@@ -3,9 +3,11 @@ package com.zhiwei.weibocrawler.rsidClient;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.zhiwei.rsid.core.RsidClient; import com.zhiwei.middleware.cleaner.ptenum.PTENUM;
import com.zhiwei.middleware.cleaner.urlfilter.UnifiedUrlFilterClient;
import com.zhiwei.middleware.filter.config.Definition;
import com.zhiwei.tools.tools.ZhiWeiTools;
import com.zhiwei.weibocrawler.config.Config; import com.zhiwei.weibocrawler.config.Config;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
/** /**
* @ClassName RsidClientDAO * @ClassName RsidClientDAO
...@@ -16,16 +18,21 @@ import com.zhiwei.zhiweiTools.tools.ZhiWeiTools; ...@@ -16,16 +18,21 @@ import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
*/ */
public class RsidClientDAO { public class RsidClientDAO {
// private static final String rsidUrl = "zookeeper://192.168.0.203:2181"; //中间件zookkeeper地址,服务器地址 private static UnifiedUrlFilterClient client;
//
// private static final String rsidGroup = "rsidserver"; //中间件分组
//
// private static final String redisWeiboKey = "weibo"; //去重的分组
private static RsidClient client = RsidClient.build(Config.rsidUrl, Config.rsidGroup);
private static Logger logger = LoggerFactory.getLogger(RsidClientDAO.class); private static Logger logger = LoggerFactory.getLogger(RsidClientDAO.class);
/*
 * Connects to the URL-filter (de-duplication) middleware when this class is
 * initialised, using the address and group taken from Config.
 *
 * A static initializer is executed once by the JVM under the class
 * initialisation lock, and the client field has no initializer of its own,
 * so no null check or explicit synchronisation is needed here.
 */
static {
    try {
        client = UnifiedUrlFilterClient.getClient(Config.rsidUrl, Config.rsidGroup, Definition.GroupType.PROVIDER);
    } catch (Exception e) {
        // Fill the placeholder with the message and pass the exception itself as
        // the trailing argument so the logger records the full stack trace.
        logger.error("链接清洗中间件时出现错误,错误为:::{}", e.getMessage(), e);
        // NOTE(review): client stays null after a failed connection, so later
        // calls on it will throw; confirm whether start-up should fail instead.
    }
}
/** /**
* @Description 验证微博是否重复 * @Description 验证微博是否重复
...@@ -36,7 +43,9 @@ public class RsidClientDAO { ...@@ -36,7 +43,9 @@ public class RsidClientDAO {
//循环3次避免连接超时引起的验证失效 //循环3次避免连接超时引起的验证失效
for(int i=0; i<3; i++){ for(int i=0; i<3; i++){
try { try {
return client.addFilterUrl(mid, false, Config.redisWeiboKey); boolean f = client.contains(mid, PTENUM.COMMON);
System.out.println(mid+"==========="+f);
return f;
} catch (Exception e) { } catch (Exception e) {
logger.error("判断此条微博消息是否存在出现问题",e.fillInStackTrace()); logger.error("判断此条微博消息是否存在出现问题",e.fillInStackTrace());
ZhiWeiTools.sleep(200); ZhiWeiTools.sleep(200);
...@@ -46,6 +55,4 @@ public class RsidClientDAO { ...@@ -46,6 +55,4 @@ public class RsidClientDAO {
return false; return false;
} }
} }
...@@ -3,14 +3,14 @@ package weibotest; ...@@ -3,14 +3,14 @@ package weibotest;
import java.util.Map; import java.util.Map;
import com.zhiwei.tools.tools.ZhiWeiTools;
import com.zhiwei.weibocrawler.crawler.GetData; import com.zhiwei.weibocrawler.crawler.GetData;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
public class HotWeiboTest { public class HotWeiboTest {
public static void main(String[] args) { public static void main(String[] args) {
//开启采集 //开启采集
String token = "2.00HUuC3C3_jZ8E0c00a67ab8xbOHqB"; String token = "2.00HUuC3C3_jZ8E36c5026e390AzIOP";
GetData.start(token); GetData.start(token);
// //获取数据 // //获取数据
while(true){ while(true){
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment