Commit 05561321 by [zhangzhiwei]

去重中间件版本的新包,并修改了http采集方式

parent cd252c2d
...@@ -3,44 +3,17 @@ ...@@ -3,44 +3,17 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId> <groupId>com.zhiwei</groupId>
<artifactId>weibohotcrawler</artifactId> <artifactId>weibohotcrawler</artifactId>
<version>0.0.1-SNAPSHOT</version> <version>0.0.2-SNAPSHOT</version>
<name>weibohotcrawler</name> <name>weibohotcrawler</name>
<description>微博热搜1小时榜单,社会、热点采集程序</description> <description>微博热搜1小时榜单,社会、热点采集程序</description>
<dependencies> <dependencies>
<dependency>
<groupId>com.zhiwei.middleware</groupId>
<artifactId>rsid-client</artifactId>
<version>0.0.2-SNAPSHOT</version>
</dependency>
<dependency> <dependency>
<groupId>javax.mail</groupId> <groupId>javax.mail</groupId>
<artifactId>mail</artifactId> <artifactId>mail</artifactId>
<version>1.4.7</version> <version>1.4.7</version>
</dependency> </dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>3.8.1</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpcore</artifactId>
<version>4.4.6</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.3</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.8.3</version>
</dependency>
<dependency> <dependency>
<groupId>com.zhiwei</groupId> <groupId>com.zhiwei</groupId>
...@@ -48,10 +21,16 @@ ...@@ -48,10 +21,16 @@
<version>0.0.5-SNAPSHOT</version> <version>0.0.5-SNAPSHOT</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.zhiwei</groupId> <groupId>com.zhiwei.middleware</groupId>
<artifactId>zhiweiTools</artifactId> <artifactId>cleaner-unified-urlfilter</artifactId>
<version>0.0.6-SNAPSHOT</version> <version>1.0-SNAPSHOT</version>
</dependency> </dependency>
<dependency>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.0.5-SNAPSHOT</version>
</dependency>
</dependencies> </dependencies>
<!-- 打包管理 --> <!-- 打包管理 -->
<build> <build>
......
...@@ -17,9 +17,9 @@ public class DataCrawlerStart{ ...@@ -17,9 +17,9 @@ public class DataCrawlerStart{
} }
public void start() { public void start() {
scheduled.scheduleWithFixedDelay(new WeiboBangdanData(), 2000, 15*1000, TimeUnit.MILLISECONDS); scheduled.scheduleWithFixedDelay(new WeiboBangdanData(), 2000, 3*60*1000, TimeUnit.MILLISECONDS);
scheduled.scheduleWithFixedDelay(new WeiboHotData(), 1000, 20*1000, TimeUnit.MILLISECONDS); scheduled.scheduleWithFixedDelay(new WeiboHotData(), 1000, 3*60*1000, TimeUnit.MILLISECONDS);
scheduled.scheduleWithFixedDelay(new WeiboSocietyData(), 3000, 19*1000, TimeUnit.MILLISECONDS); scheduled.scheduleWithFixedDelay(new WeiboSocietyData(), 3000, 3*60*1000, TimeUnit.MILLISECONDS);
} }
......
...@@ -6,12 +6,11 @@ import java.util.List; ...@@ -6,12 +6,11 @@ import java.util.List;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.zhiwei.tools.tools.ZhiWeiTools;
import com.zhiwei.weibobusiness.weibo4j.model.Status; import com.zhiwei.weibobusiness.weibo4j.model.Status;
import com.zhiwei.weibocrawler.crawler.getdata.WeiboCrawlerAnalysis; import com.zhiwei.weibocrawler.crawler.getdata.WeiboCrawlerAnalysis;
import com.zhiwei.weibocrawler.rsidClient.DataQueue; import com.zhiwei.weibocrawler.rsidClient.DataQueue;
import com.zhiwei.weibocrawler.rsidClient.UpdateQueue; import com.zhiwei.weibocrawler.rsidClient.UpdateQueue;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
/*** /***
* *
* @ClassName DataUpdate * @ClassName DataUpdate
...@@ -56,6 +55,7 @@ public class DataUpdate implements Runnable{ ...@@ -56,6 +55,7 @@ public class DataUpdate implements Runnable{
} }
i = 1; i = 1;
}else { }else {
logger.error("目前数据量不足50,目前队列中得数据量为:::{}", DataQueue.linkQueue.size());
i++; i++;
} }
} catch (Exception e) { } catch (Exception e) {
......
...@@ -5,7 +5,7 @@ import java.util.Date; ...@@ -5,7 +5,7 @@ import java.util.Date;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
/** /**
* *
* @ClassName WeiboBangdanData * @ClassName WeiboBangdanData
...@@ -28,7 +28,7 @@ public class WeiboBangdanData implements Runnable{ ...@@ -28,7 +28,7 @@ public class WeiboBangdanData implements Runnable{
weiboCrawlerAnalysis.getWeiboHotMid(url); weiboCrawlerAnalysis.getWeiboHotMid(url);
ZhiWeiTools.sleep(12000); ZhiWeiTools.sleep(12000);
} catch (Exception e) { } catch (Exception e) {
logger.error("出错====榜单的出错了",e.getMessage()); logger.error("出错====榜单的出错了",e);
e.printStackTrace(); e.printStackTrace();
ZhiWeiTools.sleep(20); ZhiWeiTools.sleep(20);
continue; continue;
......
package com.zhiwei.weibocrawler.crawler.getdata; package com.zhiwei.weibocrawler.crawler.getdata;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Date;
import java.util.List; import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
...@@ -14,6 +11,7 @@ import org.slf4j.Logger; ...@@ -14,6 +11,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.tools.tools.ZhiWeiTools;
import com.zhiwei.weibobusiness.business.SearchBusiness; import com.zhiwei.weibobusiness.business.SearchBusiness;
import com.zhiwei.weibobusiness.weibo4j.model.Status; import com.zhiwei.weibobusiness.weibo4j.model.Status;
import com.zhiwei.weibobusiness.weibo4j.model.StatusWapper; import com.zhiwei.weibobusiness.weibo4j.model.StatusWapper;
...@@ -21,7 +19,7 @@ import com.zhiwei.weibobusiness.weibo4j.model.WeiboException; ...@@ -21,7 +19,7 @@ import com.zhiwei.weibobusiness.weibo4j.model.WeiboException;
import com.zhiwei.weibocrawler.httpclient.HttpClientDemo; import com.zhiwei.weibocrawler.httpclient.HttpClientDemo;
import com.zhiwei.weibocrawler.rsidClient.DataQueue; import com.zhiwei.weibocrawler.rsidClient.DataQueue;
import com.zhiwei.weibocrawler.rsidClient.RsidClientDAO; import com.zhiwei.weibocrawler.rsidClient.RsidClientDAO;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
/** /**
* *
* @ClassName WeiboCrawlerAnalysis * @ClassName WeiboCrawlerAnalysis
...@@ -31,102 +29,110 @@ import com.zhiwei.zhiweiTools.tools.ZhiWeiTools; ...@@ -31,102 +29,110 @@ import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
* @version 1.0.0 * @version 1.0.0
*/ */
public class WeiboCrawlerAnalysis { public class WeiboCrawlerAnalysis {
private static Logger logger = LoggerFactory.getLogger(WeiboCrawlerAnalysis.class); private static Logger logger = LoggerFactory.getLogger(WeiboCrawlerAnalysis.class);
/**
/** *
* * @Description (mid获取微博数据)
* @Description (mid获取微博数据) * @param midsList
* @param midsList * @param businessToken
* @param businessToken * @return
* @return */
*/ public static List<Status> getWeiboData(List<String> midsList, String businessToken) {
public static List<Status> getWeiboData(List<String> midsList,String businessToken) { SearchBusiness searchBusiness = new SearchBusiness(businessToken);
SearchBusiness searchBusiness = new SearchBusiness(businessToken); if (midsList.size() < 1) {
if(midsList.size() < 1){ return null;
return null; }
} List<Status> statuses = new ArrayList<Status>();
List<Status> statuses = new ArrayList<Status>(); String mids = "";
String mids = ""; int i = 0;
int i = 0; for (String mid : midsList) {
for(String mid : midsList) { mids = mids + mid + ",";
mids = mids + mid + ","; i++;
i++; if (i > 48) {
if(i > 48) { try {
try { mids = mids.substring(0, mids.length() - 1);
mids = mids.substring(0,mids.length()-1); StatusWapper statusWapper = searchBusiness.showStatusBusniess(mids);
StatusWapper statusWapper = searchBusiness.showStatusBusniess(mids); statuses.addAll(statusWapper.getStatuses());
statuses.addAll(statusWapper.getStatuses()); i = 0;
i = 0; mids = "";
mids = ""; } catch (WeiboException e) {
}catch (WeiboException e) { logger.error("数据更新出错部分mids=========" + mids);
logger.error("数据更新出错部分mids========="+mids); e.printStackTrace();
e.printStackTrace(); continue;
continue; }
} }
} }
} try {
try { mids = mids.substring(0, mids.length() - 1);
mids = mids.substring(0,mids.length()-1); StatusWapper statusWapper = searchBusiness.showStatusBusniess(mids);
StatusWapper statusWapper = searchBusiness.showStatusBusniess(mids); statuses.addAll(statusWapper.getStatuses());
statuses.addAll(statusWapper.getStatuses()); } catch (WeiboException e) {
}catch (WeiboException e) { logger.error("数据更新出错部分mids=========" + mids);
logger.error("数据更新出错部分mids========="+mids); logger.error("数据出错", e.getMessage());
logger.error("数据出错",e.getMessage()); e.printStackTrace();
e.printStackTrace(); return null;
return null; }
} return statuses;
return statuses; }
}
/**
/** *
* * @Description (获取微博数据mid集合)
* @Description (获取微博数据mid集合) * @return
* @return */
*/ public void getWeiboHotMid(String url) {
public void getWeiboHotMid(String url) { try {
try { String result = HttpClientDemo.executeHttpRequestGet(url);
String result = HttpClientDemo.executeHttpRequestGet(url); getWeiboData(result);
getWeiboData(result); } catch (Exception e) {
} catch (Exception e) { e.printStackTrace();
e.printStackTrace(); }
} }
}
/**
/** *
* * @Description (解析)
* @Description (解析) * @param result
* @param result * @return
* @return */
*/ private void getWeiboData(String result) {
private void getWeiboData(String result) { try {
try { JSONObject json = (JSONObject) JSONObject.parse(result);
JSONObject json = (JSONObject) JSONObject.parse(result); String s = json.getString("data");
String s = json.getString("data"); Document document = Jsoup.parse(s);
Document document = Jsoup.parse(s); Elements elements = document.select("div.UG_contents").select("ul.clearfix")
Elements elements = document.select("div.UG_contents").select("ul.clearfix").select("div[action-type=feed_list_item]"); .select("div[action-type=feed_list_item]");
for(Element element : elements) { System.out.println("elements size is " + elements.size());
try { List<String> midsList = new ArrayList<String>();
String mid = element.attr("mid"); for (Element element : elements) {
if(mid.length() > 16) { try {
mid = mid.substring(mid.length()-16, mid.length()); String mid = element.attr("mid");
} if (mid.length() > 16) {
if(RsidClientDAO.isWeiboExit(mid)) { mid = mid.substring(mid.length() - 16, mid.length());
DataQueue.offer(mid); }
} if(!midsList.contains(mid)){
} catch (Exception e) { midsList.add(mid);
logger.error("数据解析出错",e.getMessage()); }
ZhiWeiTools.sleep(200); } catch (Exception e) {
e.printStackTrace(); logger.error("数据解析出错", e.getMessage());
continue; ZhiWeiTools.sleep(200);
} e.printStackTrace();
continue;
} }
} catch (Exception e) { }
logger.error("数据解析出错",e.getMessage()); if(!midsList.isEmpty()){
ZhiWeiTools.sleep(200); for(String mid : midsList){
e.printStackTrace(); if (!RsidClientDAO.isWeiboExit(mid)) {
} DataQueue.offer(mid);
} }
}
}
} catch (Exception e) {
logger.error("数据解析出错", e.getMessage());
ZhiWeiTools.sleep(200);
e.printStackTrace();
}
}
} }
...@@ -5,7 +5,7 @@ import java.util.Date; ...@@ -5,7 +5,7 @@ import java.util.Date;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
/** /**
* *
* @ClassName WeiboHotData * @ClassName WeiboHotData
......
...@@ -5,7 +5,7 @@ import java.util.Date; ...@@ -5,7 +5,7 @@ import java.util.Date;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
/** /**
* *
* @ClassName WeiboSocietyData * @ClassName WeiboSocietyData
......
package com.zhiwei.weibocrawler.httpclient; package com.zhiwei.weibocrawler.httpclient;
import java.io.IOException;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;
import java.util.Map.Entry;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK; import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
public class HttpClientDemo { public class HttpClientDemo {
private static Logger logger = LoggerFactory.getLogger(HttpClientDemo.class); private static Logger logger = LoggerFactory.getLogger(HttpClientDemo.class);
// public static String executeHttpRequestGet(String url) throws IOException { public static String executeHttpRequestGet(String url) {
// String result = null;
// Map<String, String> headerMap = new HashMap<String, String>();
// headerMap.put("User-Agent",
// "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36");
// headerMap.put("Accept","*/*");
// headerMap.put("Accept-Encoding", "gzip, deflate, br");
// headerMap.put("Accept-Language", "zh-CN,zh;q=0.9");
// headerMap.put("Connection", "keep-alive");
// headerMap.put("Content-Type", "application/x-www-form-urlencoded");
// headerMap.put("Host", "weibo.com");
// CloseableHttpClient httpClient = null;
// for(int j = 1;j <= 3;j++) {
// try {
// HttpGet httpGet = new HttpGet(url);
// RequestConfig requestConfig = RequestConfig.custom()
// .setSocketTimeout(8000).setConnectTimeout(8000).build();
// httpClient = HttpClients.custom()
// .setDefaultRequestConfig(requestConfig).build();
// if (headerMap != null) {
// for (Entry<String, String> header : headerMap.entrySet()) {
// httpGet.setHeader(header.getKey(), header.getValue());
// }
// }
// result = EntityUtils
// .toString(httpClient.execute(httpGet).getEntity());
// return result;
// }catch (Exception e) {
// e.printStackTrace();
// continue;
// }finally {
// if (httpClient != null) {
// httpClient.close();
// }
// }
// }
// return result;
//
// }
public static String executeHttpRequestGet(String url) throws IOException {
String result = null; String result = null;
Map<String, String> headerMap = new HashMap<String, String>(); Map<String, String> headerMap = new HashMap<String, String>();
headerMap.put("User-Agent", headerMap.put("User-Agent",
...@@ -72,11 +23,17 @@ public class HttpClientDemo { ...@@ -72,11 +23,17 @@ public class HttpClientDemo {
headerMap.put("Content-Type", "application/x-www-form-urlencoded"); headerMap.put("Content-Type", "application/x-www-form-urlencoded");
headerMap.put("Host", "weibo.com"); headerMap.put("Host", "weibo.com");
try { try {
result = HttpClientTemplateOK.get(url, null, headerMap); System.out.println("开始下载");
// Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url, headerMap));
// result = response.body().string();
result = HttpClientTemplateOK.get(url, null, headerMap);
System.out.println("下载结束");
return result;
} catch (Exception e) { } catch (Exception e) {
logger.error("httpClient 获取数据出现问题:{}", e.getMessage()); logger.info("httpClient 获取数据出现问题:{}", e.getMessage());
e.printStackTrace();
} }
return result; return result;
} }
......
...@@ -3,9 +3,11 @@ package com.zhiwei.weibocrawler.rsidClient; ...@@ -3,9 +3,11 @@ package com.zhiwei.weibocrawler.rsidClient;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.zhiwei.rsid.core.RsidClient; import com.zhiwei.middleware.cleaner.ptenum.PTENUM;
import com.zhiwei.middleware.cleaner.urlfilter.UnifiedUrlFilterClient;
import com.zhiwei.middleware.filter.config.Definition;
import com.zhiwei.tools.tools.ZhiWeiTools;
import com.zhiwei.weibocrawler.config.Config; import com.zhiwei.weibocrawler.config.Config;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
/** /**
* @ClassName RsidClientDAO * @ClassName RsidClientDAO
...@@ -16,17 +18,22 @@ import com.zhiwei.zhiweiTools.tools.ZhiWeiTools; ...@@ -16,17 +18,22 @@ import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
*/ */
public class RsidClientDAO { public class RsidClientDAO {
// private static final String rsidUrl = "zookeeper://192.168.0.203:2181"; //中间件zookkeeper地址,服务器地址 private static UnifiedUrlFilterClient client;
//
// private static final String rsidGroup = "rsidserver"; //中间件分组
//
// private static final String redisWeiboKey = "weibo"; //去重的分组
private static RsidClient client = RsidClient.build(Config.rsidUrl, Config.rsidGroup);
private static Logger logger = LoggerFactory.getLogger(RsidClientDAO.class); private static Logger logger = LoggerFactory.getLogger(RsidClientDAO.class);
static{
if(client==null){
synchronized (RsidClientDAO.class) {
if(client==null) {
try {
client = UnifiedUrlFilterClient.getClient(Config.rsidUrl, Config.rsidGroup, Definition.GroupType.PROVIDER);
} catch (Exception e) {
logger.error("链接清洗中间件时出现错误,错误为:::{}", e);
}
}
}
}
}
/** /**
* @Description 验证微博是否重复 * @Description 验证微博是否重复
* @param mid * @param mid
...@@ -36,7 +43,9 @@ public class RsidClientDAO { ...@@ -36,7 +43,9 @@ public class RsidClientDAO {
//循环3次避免连接超时引起的验证失效 //循环3次避免连接超时引起的验证失效
for(int i=0; i<3; i++){ for(int i=0; i<3; i++){
try { try {
return client.addFilterUrl(mid, false, Config.redisWeiboKey); boolean f = client.contains(mid, PTENUM.COMMON);
System.out.println(mid+"==========="+f);
return f;
} catch (Exception e) { } catch (Exception e) {
logger.error("判断此条微博消息是否存在出现问题",e.fillInStackTrace()); logger.error("判断此条微博消息是否存在出现问题",e.fillInStackTrace());
ZhiWeiTools.sleep(200); ZhiWeiTools.sleep(200);
...@@ -46,6 +55,4 @@ public class RsidClientDAO { ...@@ -46,6 +55,4 @@ public class RsidClientDAO {
return false; return false;
} }
} }
...@@ -3,14 +3,14 @@ package weibotest; ...@@ -3,14 +3,14 @@ package weibotest;
import java.util.Map; import java.util.Map;
import com.zhiwei.tools.tools.ZhiWeiTools;
import com.zhiwei.weibocrawler.crawler.GetData; import com.zhiwei.weibocrawler.crawler.GetData;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
public class HotWeiboTest { public class HotWeiboTest {
public static void main(String[] args) { public static void main(String[] args) {
//开启采集 //开启采集
String token = "2.00HUuC3C3_jZ8E0c00a67ab8xbOHqB"; String token = "2.00HUuC3C3_jZ8E36c5026e390AzIOP";
GetData.start(token); GetData.start(token);
// //获取数据 // //获取数据
while(true){ while(true){
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment