Commit 7ad96e77 by zhiwei

添加限制返回总页数功能

parent 9d752049
......@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>wechat</artifactId>
<version>1.1.6-SNAPSHOT</version>
<version>1.1.7-SNAPSHOT</version>
<description>
知微微信采集程序,包含
1.微信历史文章采集
......
......@@ -38,36 +38,33 @@ public class WechatAritcleSearch {
private static HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(3).build();
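/**
 * Searches Sogou WeChat for articles matching a keyword; the results do not include the full article text.
 *
 * @param word
 *            the search keyword
 * @param tsn
 *            time range to collect: 1 (within one day); 2 (within one week); 3 (within one month);
 *            4 (within one year); 5 (a specific period, used together with startTime and endTime)
 * @param cookie
 *            cookie of a logged-in user (without login at most 10 pages are available)
 * @param startTime
 *            start date, formatted as yyyy-MM-dd
 * @param endTime
 *            end date, formatted as yyyy-MM-dd
 * @param proxy
 *            proxy passed to the HTTP client when fetching result pages
 * @param pages
 *            total-page value used to limit how many pages are returned (e.g. pass 21 to return
 *            the first 20 pages); pass null when no limit is wanted
 * @return the list of articles found
 * @throws Exception
 *             if the search fails; earlier revisions of this comment name ZhiWeiException specifically
 */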
public static List<WechatAricle> wechatKeywordSearch(String word, int tsn, String cookie,String startTime, String endTime,
Proxy proxy) throws Exception, UnsupportedEncodingException {
List<WechatAricle> result = new ArrayList<WechatAricle>();
Proxy proxy, Integer pages) throws Exception{
List<WechatAricle> result = new ArrayList<>();
Map<String, String> headerMap = HeaderTool.getCommonHead();
headerMap.put("Host", "weixin.sogou.com");
if(StringUtils.isNotBlank(cookie)) {
headerMap.put("cookie", cookie);
}
boolean f = true;
int page = 1;
while (f) {
......@@ -77,6 +74,7 @@ public class WechatAritcleSearch {
if (tsn == 5) {
url = url + "&ft=" + startTime + "&et=" + endTime + "&wxid=&usip=&interation=&from=tool";
}
System.out.println(url);
headerMap.put("Referer", url);
// 获取数据
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy, false).body().string();
......@@ -143,6 +141,9 @@ public class WechatAritcleSearch {
logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", htmlBody);
}
// ZhiWeiTools.sleep(100);
if(pages!=null && pages==page) {
break;
}
}
return result;
}
......
package com.zhiwei.wechat.example;
import java.io.UnsupportedEncodingException;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.wechat.entity.WechatAricle;
import com.zhiwei.wechat.search.WechatAritcleSearch;
import com.zhiwei.wechat.util.Tools;
/**
 * Example program: collects WeChat articles by keyword through
 * {@link WechatAritcleSearch#wechatKeywordSearch} and prints the hit count and each title.
 *
 * @ClassName: WechatSearchExample
 * @Description: collects data by keyword and similar criteria
 * @author hero
 * @date 2016-12-16 09:15:42
 */
public class WechatSearchExample{

    // Static so the static entry points below can log through it.
    private static final Logger logger = LoggerFactory.getLogger(WechatSearchExample.class);
    private static final String registry = "zookeeper://192.168.0.36:2181";
    private static final String group = "local";

    /**
     * Value for the {@code pages} argument of wechatKeywordSearch meaning "no page limit".
     * Per that method's documentation, a caller wanting only the first 20 pages would pass 21 instead.
     */
    private static final Integer NO_PAGE_LIMIT = null;

    /**
     * Initialises the proxy factory and runs the keyword-search example.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        ProxyFactory.init(registry, group, GroupType.PROVIDER);
        try {
            WechatSearchExample.wechatSearchExample();
        } catch (UnknownHostException e) {
            logger.error("Keyword search example aborted", e);
        }
    }

    /**
     * Runs a keyword search for each sample word over a fixed date range (tsn = 5) and prints
     * the number of articles found followed by every article title.
     *
     * @throws UnknownHostException kept in the signature for existing callers that catch it
     */
    public static void wechatSearchExample() throws UnknownHostException {
        List<String> wordList = new ArrayList<>();
        wordList.add("京东");
        for (String word : wordList) {
            try {
                // The search method now takes a trailing page-limit argument; pass "no limit"
                // so the example keeps collecting every available page as before.
                List<WechatAricle> list = WechatAritcleSearch.wechatKeywordSearch(
                        word, 5, null, "2019-04-08", "2019-04-08",
                        ProxyHolder.SOUGOU_INNER_PROXY.getProxy(), NO_PAGE_LIMIT);
                System.out.println("======" + list.size());
                for (WechatAricle wechat : list) {
                    System.out.println(wechat.getTitle());
                }
            } catch (Exception e) {
                // One failing keyword must not stop the remaining ones; record it and continue.
                logger.error("Keyword search failed for word: {}", word, e);
            }
        }
    }
}
//// for(String wxId : wechatIds)
//// {
//// try {
//// logger.info("需要采集的wxId:::{}", wxId);
////
//// } catch (UnsupportedEncodingException e) {
//// e.printStackTrace();
//// } catch (Exception e) {
//// e.printStackTrace();
//// }
//// }
// }
// }
//
//
//}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment