Commit 7ad96e77 by zhiwei

添加限制返回总页数功能

parent 9d752049
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId> <groupId>com.zhiwei</groupId>
<artifactId>wechat</artifactId> <artifactId>wechat</artifactId>
<version>1.1.6-SNAPSHOT</version> <version>1.1.7-SNAPSHOT</version>
<description> <description>
知微微信采集程序,包含 知微微信采集程序,包含
1.微信历史文章采集 1.微信历史文章采集
......
...@@ -38,36 +38,33 @@ public class WechatAritcleSearch { ...@@ -38,36 +38,33 @@ public class WechatAritcleSearch {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build(); private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
/** /**
* * 根据关键词在搜狗微信搜索微信文章,不包含全文
* @Title: wechatKeywordSearch * @Title: wechatKeywordSearch
* @Description: TODO(根据关键词在搜狗微信搜索微信文章,不包含全文) * @param
* @param @param
* word 关键词 * word 关键词
* @param @param * @param
* tsn 采集时间范围:1(1天内);2(一周内);3(一月内);4(一年内); * tsn 采集时间范围:1(1天内);2(一周内);3(一月内);4(一年内);
* 5(某一时间段内与startTime和endTime配合使用) * 5(某一时间段内与startTime和endTime配合使用)
* @param @param * @param
* startTime 开始时间 格式为yyyy-MM-dd * startTime 开始时间 格式为yyyy-MM-dd
* @param @param * @param
* endTime 结束时间 格式为yyyy-MM-dd * endTime 结束时间 格式为yyyy-MM-dd
* @param @param * @param
* cookie 用户登录后的cookie(不登录最多10页) * cookie 用户登录后的cookie(不登录最多10页)
* @param @return * @param
* @param @throws * pages 需要限制返回页数的总页数(如返回前20页则传21),如没有限制页数则传null
* @throws
* ZhiWeiException * ZhiWeiException
* @param @throws
* UnsupportedEncodingException 设定文件
* @return List<Wechat> 返回类型 * @return List<Wechat> 返回类型
*/ */
public static List<WechatAricle> wechatKeywordSearch(String word, int tsn, String cookie,String startTime, String endTime, public static List<WechatAricle> wechatKeywordSearch(String word, int tsn, String cookie,String startTime, String endTime,
Proxy proxy) throws Exception, UnsupportedEncodingException { Proxy proxy, Integer pages) throws Exception{
List<WechatAricle> result = new ArrayList<WechatAricle>(); List<WechatAricle> result = new ArrayList<>();
Map<String, String> headerMap = HeaderTool.getCommonHead(); Map<String, String> headerMap = HeaderTool.getCommonHead();
headerMap.put("Host", "weixin.sogou.com"); headerMap.put("Host", "weixin.sogou.com");
if(StringUtils.isNotBlank(cookie)) { if(StringUtils.isNotBlank(cookie)) {
headerMap.put("cookie", cookie); headerMap.put("cookie", cookie);
} }
boolean f = true; boolean f = true;
int page = 1; int page = 1;
while (f) { while (f) {
...@@ -77,6 +74,7 @@ public class WechatAritcleSearch { ...@@ -77,6 +74,7 @@ public class WechatAritcleSearch {
if (tsn == 5) { if (tsn == 5) {
url = url + "&ft=" + startTime + "&et=" + endTime + "&wxid=&usip=&interation=&from=tool"; url = url + "&ft=" + startTime + "&et=" + endTime + "&wxid=&usip=&interation=&from=tool";
} }
System.out.println(url);
headerMap.put("Referer", url); headerMap.put("Referer", url);
// 获取数据 // 获取数据
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy, false).body().string(); String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy, false).body().string();
...@@ -143,6 +141,9 @@ public class WechatAritcleSearch { ...@@ -143,6 +141,9 @@ public class WechatAritcleSearch {
logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", htmlBody); logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", htmlBody);
} }
// ZhiWeiTools.sleep(100); // ZhiWeiTools.sleep(100);
if(pages!=null && pages==page) {
break;
}
} }
return result; return result;
} }
......
package com.zhiwei.wechat.example; //package com.zhiwei.wechat.example;
//
import java.io.UnsupportedEncodingException; //import java.io.UnsupportedEncodingException;
import java.net.UnknownHostException; //import java.net.UnknownHostException;
import java.util.ArrayList; //import java.util.ArrayList;
import java.util.List; //import java.util.List;
//
import org.slf4j.Logger; //import org.slf4j.Logger;
import org.slf4j.LoggerFactory; //import org.slf4j.LoggerFactory;
//
import com.zhiwei.common.config.GroupType; //import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory; //import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder; //import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.wechat.entity.WechatAricle; //import com.zhiwei.wechat.entity.WechatAricle;
import com.zhiwei.wechat.search.WechatAritcleSearch; //import com.zhiwei.wechat.search.WechatAritcleSearch;
import com.zhiwei.wechat.util.Tools; //
///**
/** // * @ClassName: WechatSearchExample
* @ClassName: WechatSearchExample // * @Description: TODO(根据关键词等采集数据)
* @Description: TODO(根据关键词等采集数据) // * @author hero
* @author hero // * @date 2016年12月16日 上午9:15:42
* @date 2016年12月16日 上午9:15:42 // */
*/ //public class WechatSearchExample{
public class WechatSearchExample{ //
// private Logger logger = LoggerFactory.getLogger(WechatSearchExample.class);
private Logger logger = LoggerFactory.getLogger(WechatSearchExample.class); // private static final String registry = "zookeeper://192.168.0.36:2181";
private static final String registry = "zookeeper://192.168.0.36:2181"; // private static final String group = "local";
private static final String group = "local"; //
// public static void main(String[] args) {
public static void main(String[] args) { // ProxyFactory.init(registry, group, GroupType.PROVIDER);
ProxyFactory.init(registry, group, GroupType.PROVIDER);
try {
WechatSearchExample.wechatSearchExample();
} catch (UnknownHostException e) {
e.printStackTrace();
}
}
public static void wechatSearchExample() throws UnknownHostException
{
List<String> wordList = new ArrayList<String>();
wordList.add("京东");
for(String word : wordList)
{
try {
List<WechatAricle> list = WechatAritcleSearch.wechatKeywordSearch(word, 5, null,"2019-04-08", "2019-04-08", ProxyHolder.SOUGOU_INNER_PROXY.getProxy());
System.out.println("======"+list.size());
for(WechatAricle wechat : list){
System.out.println(wechat.getTitle());
}
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
} catch (Exception e) {
e.printStackTrace();
}
// for(String wxId : wechatIds)
// {
// try { // try {
// logger.info("需要采集的wxId:::{}", wxId); // WechatSearchExample.wechatSearchExample();
// } catch (UnknownHostException e) {
// e.printStackTrace();
// }
// }
//
// //
// public static void wechatSearchExample() throws UnknownHostException
// {
// List<String> wordList = new ArrayList<String>();
// wordList.add("京东");
// for(String word : wordList)
// {
// try {
// List<WechatAricle> list = WechatAritcleSearch.wechatKeywordSearch(word, 5, null,"2019-07-24", "2019-07-24", ProxyHolder.SOUGOU_INNER_PROXY.getProxy(), 21);
// System.out.println("======"+list.size());
// for(WechatAricle wechat : list){
//// System.out.println(wechat.getTitle());
// }
// } catch (UnsupportedEncodingException e) { // } catch (UnsupportedEncodingException e) {
// e.printStackTrace(); // e.printStackTrace();
// } catch (Exception e) { // } catch (Exception e) {
// e.printStackTrace(); // e.printStackTrace();
// } // }
//// for(String wxId : wechatIds)
//// {
//// try {
//// logger.info("需要采集的wxId:::{}", wxId);
////
//// } catch (UnsupportedEncodingException e) {
//// e.printStackTrace();
//// } catch (Exception e) {
//// e.printStackTrace();
//// }
//// }
// }
// } // }
} //
} //
//}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment