Commit 3ea331c1 by [zhangzhiwei]

添加搜狗微信根据账号+关键词采集数据

parent 60e4b279
...@@ -122,7 +122,7 @@ public class WeChatReadAndLike { ...@@ -122,7 +122,7 @@ public class WeChatReadAndLike {
time = time.split(" ")[0]; time = time.split(" ")[0];
} }
String openid = WechatAritcleSearch.getOpenId(wxId); String openid = WechatAritcleSearch.getOpenId(wxId,null);
logger.info("openid is {}", openid); logger.info("openid is {}", openid);
try { try {
......
...@@ -21,10 +21,9 @@ import com.zhiwei.crawler.core.HttpBoot; ...@@ -21,10 +21,9 @@ import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils; import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder; import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.tools.httpclient.HeaderTool; import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.httpclient.HttpClientTemplateOK; import com.zhiwei.tools.tools.URLCodeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
import com.zhiwei.wechat.entity.WechatAricle; import com.zhiwei.wechat.entity.WechatAricle;
import com.zhiwei.wechat.util.Tools;
/** /**
* @ClassName: WechatAritcleSearch * @ClassName: WechatAritcleSearch
...@@ -60,13 +59,10 @@ public class WechatAritcleSearch { ...@@ -60,13 +59,10 @@ public class WechatAritcleSearch {
* @return List<Wechat> 返回类型 * @return List<Wechat> 返回类型
*/ */
public static List<WechatAricle> wechatKeywordSearch(String word, int tsn, String startTime, String endTime, public static List<WechatAricle> wechatKeywordSearch(String word, int tsn, String startTime, String endTime,
String cookie, Proxy proxy) throws Exception, UnsupportedEncodingException { Proxy proxy) throws Exception, UnsupportedEncodingException {
List<WechatAricle> result = new ArrayList<WechatAricle>(); List<WechatAricle> result = new ArrayList<WechatAricle>();
Map<String, String> headerMap = HeaderTool.getCommonHead(); Map<String, String> headerMap = HeaderTool.getCommonHead();
headerMap.put("Host", "weixin.sogou.com"); headerMap.put("Host", "weixin.sogou.com");
if (cookie != null) {
headerMap.put("Cookie", cookie);
}
boolean f = true; boolean f = true;
int page = 1; int page = 1;
...@@ -174,13 +170,10 @@ public class WechatAritcleSearch { ...@@ -174,13 +170,10 @@ public class WechatAritcleSearch {
* @return List<Wechat> 返回类型 * @return List<Wechat> 返回类型
*/ */
public static List<WechatAricle> wechatKeywordSearch(String word, int tsn, String startTime, String endTime, public static List<WechatAricle> wechatKeywordSearch(String word, int tsn, String startTime, String endTime,
String cookie, Proxy proxy,ProxyHolder proxyHolder) throws Exception, UnsupportedEncodingException { Proxy proxy,ProxyHolder proxyHolder) throws Exception, UnsupportedEncodingException {
List<WechatAricle> result = new ArrayList<WechatAricle>(); List<WechatAricle> result = new ArrayList<WechatAricle>();
Map<String, String> headerMap = HeaderTool.getCommonHead(); Map<String, String> headerMap = HeaderTool.getCommonHead();
headerMap.put("Host", "weixin.sogou.com"); headerMap.put("Host", "weixin.sogou.com");
if (cookie != null) {
headerMap.put("Cookie", cookie);
}
boolean f = true; boolean f = true;
int page = 1; int page = 1;
...@@ -272,7 +265,6 @@ public class WechatAritcleSearch { ...@@ -272,7 +265,6 @@ public class WechatAritcleSearch {
*/ */
private static WechatAricle getContentAndSource(String url, ProxyHolder proxy,WechatAricle wechatAricle){ private static WechatAricle getContentAndSource(String url, ProxyHolder proxy,WechatAricle wechatAricle){
try { try {
// String htmlBody = HttpClientTemplateOK.get(url, proxy, null);
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy.getProxy()).body().string(); String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy.getProxy()).body().string();
if(htmlBody!=null){ if(htmlBody!=null){
Document document = Jsoup.parse(htmlBody); Document document = Jsoup.parse(htmlBody);
...@@ -309,11 +301,115 @@ public class WechatAritcleSearch { ...@@ -309,11 +301,115 @@ public class WechatAritcleSearch {
return wechatAricle; return wechatAricle;
} }
/**
* 根据关键词采集指定时间+账号的数据
* @param word
* @param idOrName
* @param tsn
* @param startTime
* @param endTime
* @param proxy
* @param proxyHolder
* @return
* @throws Exception
* @throws UnsupportedEncodingException
*/
public static List<WechatAricle> wechatKeywordSearchByAccount(String word, String idOrName, String startTime, String endTime,
ProxyHolder proxyHolder) throws Exception, UnsupportedEncodingException {
List<WechatAricle> result = new ArrayList<WechatAricle>();
Map<String, String> headerMap = HeaderTool.getCommonHead();
headerMap.put("Host", "weixin.sogou.com");
if(idOrName==null || idOrName.equals("")){
throw new IllegalArgumentException("要检索的昵称或id不能为空");
}
String openId = getOpenId(idOrName, proxyHolder);
boolean f = false;
if(openId!=null){
f = true;
}
int page = 1;
// public static void main(String[] args) { while (f) {
// String url = "https://mp.weixin.qq.com/s?src=11&timestamp=1540521001&ver=1205&signature=12dtyhMA3Xi7lzUhGUFyEpJmWPlnaLAwDVXMUi-tcFXHJbIYDKuLm76sdQUAZxkEjyGby22amJ4AnxIM4oS0ivtAS6ibs4F3OO8-jwoFLk4Pd6d8AhZdj94Z1gQdhdIQ&new=1"; String url = "https://weixin.sogou.com/weixin?type=2&ie=utf8&query=" + URLEncoder.encode(word, "UTF-8")
// getContentAndSource(url, null, null); + "&tsn=5&ft=" + startTime + "&et=" + endTime +"&interation=&page=" + page+"&wxid="+ openId
// } +"&usip=" + URLEncoder.encode(idOrName, "UTF-8");
headerMap.put("Referer", url);
System.out.println(url);
// 获取数据
String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxyHolder, true).body().string();
// 解析数据
if (htmlBody != null) {
try {
// 解析数据
Document document = Jsoup.parse(htmlBody);
Elements elements = document.select("div.news-box").select("ul.news-list").select("li");
String title = null;
String link = null;
String content = null;
String source = null;
String openid = null;
String putDate = null;
Date date = null;
WechatAricle wechat = null;
for (Element element : elements) {
try {
title = element.select("div.txt-box").select("h3").text();
link = element.select("div.txt-box").select("h3 >a").attr("href");
content = "";
if (element.select("p.txt-info").isEmpty()) {
content = element.select("p.txt-info").text();
} else {
content = element.select("div.txt-box").select("p.txt-info").text();
}
// System.out.println("content======================"+content);
source = element.select("div.txt-box").select("div.s-p").select("a").text();
openid = element.select("div.txt-box").select("div.s-p").select("a").attr("i");
putDate = element.select("div.txt-box").select("div.s-p").attr("t");
date = new Date(Long.valueOf(putDate) * 1000);
int readNum = 0;
try {
readNum = Integer.valueOf(element.select("div.txt-box").select("div.s-p")
.select("span.s1").text().trim());
} catch (Exception e) {
readNum = 0;
}
title = ZhiWeiTools.SBC2DBC(title);
content = ZhiWeiTools.SBC2DBC(content);
wechat = new WechatAricle(link, title, source, content, date, readNum, 0, openid, "unknow");
wechat = getContentAndSource(link, proxyHolder, wechat);
result.add(wechat);
} catch (Exception e) {
logger.debug("解析数据出现错误:{}", e.getMessage());
continue;
}
}
// 解析最大可寻页码
String pageNext = document.select("[id=pagebar_container]>a").text();
if (pageNext.contains("下一页")) {
page++;
} else {
f = false;
}
// logger.info("数据总页数为:{}", page);
} catch (Exception e) {
logger.debug("获取数据出现问题:{}", e.getMessage());
return null;
}
} else {
logger.info("根据关键词获取微信文章失败,返回的数据结果集: {}", htmlBody);
}
// ZhiWeiTools.sleep(100);
}
return result;
}
/** /**
* @Title: getOpenId * @Title: getOpenId
...@@ -324,20 +420,19 @@ public class WechatAritcleSearch { ...@@ -324,20 +420,19 @@ public class WechatAritcleSearch {
* 设定文件 * 设定文件
* @return String 返回类型 * @return String 返回类型
*/ */
public static String getOpenId(String wxId) { public static String getOpenId(String idOrName, ProxyHolder proxyHolder) {
String openId = null; String openId = null;
String url = "http://weixin.sogou.com/weixin?zhnss=1&type=1&ie=utf8&query=" + wxId; String url = "https://weixin.sogou.com/weixin?zhnss=1&type=1&ie=utf8&query=" + URLCodeUtil.getURLEncode(idOrName, "utf-8");
Map<String, String> headerMap = Tools.getWechatHeader();
String htmlBody; String htmlBody;
try { try {
htmlBody = HttpClientTemplateOK.get(url, null, headerMap); htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url), proxyHolder,true).body().string();
System.out.println(htmlBody);
if (htmlBody != null) { if (htmlBody != null) {
JSONObject json = JSONObject.parseObject(htmlBody); JSONObject json = JSONObject.parseObject(htmlBody);
openId = json.getString("openid"); openId = json.getString("openid");
} }
} catch (Exception e) { } catch (Exception e) {
openId = null; openId = null;
e.printStackTrace();
} }
return openId; return openId;
......
...@@ -24,7 +24,7 @@ public class Tools { ...@@ -24,7 +24,7 @@ public class Tools {
headerMap.put("Connection", "keep-alive"); headerMap.put("Connection", "keep-alive");
headerMap.put("Upgrade-Insecure-Requests", "1"); headerMap.put("Upgrade-Insecure-Requests", "1");
headerMap.put("Host", "mp.weixin.qq.com"); headerMap.put("Host", "mp.weixin.qq.com");
headerMap.put("Origin", "http://mp.weixin.qq.com"); headerMap.put("Origin", "https://mp.weixin.qq.com");
headerMap.put("User-Agent", headerMap.put("User-Agent",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 MicroMessenger/6.5.2.501 NetType/WIFI WindowsWechat QBCore/3.43.691.400 QQBrowser/9.0.2524.400"); "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 MicroMessenger/6.5.2.501 NetType/WIFI WindowsWechat QBCore/3.43.691.400 QQBrowser/9.0.2524.400");
return headerMap; return headerMap;
......
...@@ -8,6 +8,9 @@ import java.util.List; ...@@ -8,6 +8,9 @@ import java.util.List;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.wechat.entity.WechatAricle; import com.zhiwei.wechat.entity.WechatAricle;
import com.zhiwei.wechat.search.WechatAritcleSearch; import com.zhiwei.wechat.search.WechatAritcleSearch;
import com.zhiwei.wechat.util.Tools; import com.zhiwei.wechat.util.Tools;
...@@ -21,9 +24,11 @@ import com.zhiwei.wechat.util.Tools; ...@@ -21,9 +24,11 @@ import com.zhiwei.wechat.util.Tools;
public class WechatSearchExample{ public class WechatSearchExample{
private Logger logger = LoggerFactory.getLogger(WechatSearchExample.class); private Logger logger = LoggerFactory.getLogger(WechatSearchExample.class);
private static final String registry = "zookeeper://192.168.0.36:2181";
private static final String group = "local";
public static void main(String[] args) { public static void main(String[] args) {
ProxyFactory.init(registry, group, GroupType.PROVIDER);
try { try {
WechatSearchExample.wechatSearchExample(); WechatSearchExample.wechatSearchExample();
} catch (UnknownHostException e) { } catch (UnknownHostException e) {
...@@ -35,12 +40,13 @@ public class WechatSearchExample{ ...@@ -35,12 +40,13 @@ public class WechatSearchExample{
public static void wechatSearchExample() throws UnknownHostException public static void wechatSearchExample() throws UnknownHostException
{ {
List<String> wordList = new ArrayList<String>(); List<String> wordList = new ArrayList<String>();
wordList.add("QQ 涉密邮件 间谍"); wordList.add("工业互联网");
String idOrName = "吴晓波频道";
for(String word : wordList) for(String word : wordList)
{ {
try { try {
List<WechatAricle> list = WechatAritcleSearch.wechatKeywordSearch(word, 5, "2018-11-29", "2018-11-29",null,null); List<WechatAricle> list = WechatAritcleSearch.wechatKeywordSearchByAccount(word, idOrName, "2017-12-01", "2018-12-01", ProxyHolder.SOUGOU_INNER_PROXY);
System.out.println("======"+list.size()); System.out.println("======"+list.size());
for(WechatAricle wechat : list){ for(WechatAricle wechat : list){
System.out.println(wechat.getTitle()); System.out.println(wechat.getTitle());
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment