Commit 84056e85 by 俞宁

对搜狗微信网进行数据的爬取

parent 9b7d8114
package com.zhiwei.httpclient;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.HashMap;
import java.util.Map;
/**
 * Reads the {@code d} attribute of every result entry on a Sogou WeChat search
 * page. GetWebData uses these values as keys into the asynchronously loaded
 * JSON payload.
 */
public class GetKey {
    /**
     * Downloads search-result page {@code i} and maps the {@code d} attribute of
     * each {@code li} under {@code ul.news-list2} to its zero-based position in
     * the list.
     *
     * @param i page number inserted into the search URL
     * @return map from {@code d} attribute value to list position
     * @throws Exception if the page cannot be fetched
     */
    public static HashMap<String,Integer> getkeyUtils(int i) throws Exception {
        String pageUrl = "https://weixin.sogou.com/weixin?query=%E5%BE%AE%E4%BF%A1&_sug_type_=&s_from=input&_sug_=n&type=&page="+i+"&ie=utf8";
        // Fetch the page and narrow it down to the result entries.
        Elements entries = Jsoup.connect(pageUrl).get()
                .select("ul.news-list2")
                .select("li");
        HashMap<String, Integer> positions = new HashMap<String, Integer>();
        for (int idx = 0; idx < entries.size(); idx++) {
            // The "d" attribute carries the value we are after.
            positions.put(entries.get(idx).attr("d"), idx);
        }
        return positions;
    }
}
package com.zhiwei.httpclient;
import net.sf.json.JSONArray;
import net.sf.json.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import java.net.URL;
public class GetSignature {
    /** Name of the JavaScript variable whose value is extracted from the page. */
    private static final String ANTI_URL_VARIABLE = "account_anti_url";

    /**
     * Downloads the page at {@code url}, evaluates each inline script that
     * mentions {@code account_anti_url}, and returns the value that variable
     * holds afterwards. When several scripts match, the value from the last
     * one wins.
     *
     * <p>NOTE(review): this evaluates script text downloaded from a remote site
     * inside the JVM's script engine. Confirm that this is acceptable for the
     * environment the crawler runs in.
     *
     * @param url address of the page to inspect
     * @return the value of {@code account_anti_url}, or {@code null} when no
     *         script on the page mentions it
     * @throws IllegalStateException if the JVM provides no "javascript" script engine
     * @throws Exception if the page cannot be fetched or a script fails to evaluate
     */
    public static Object getsignatgure(String url) throws Exception {
        Object antiUrl = null;
        // Fetch and parse the page, waiting at most 3000 ms.
        Document doc = Jsoup.parse(new URL(url), 3000);
        for (Element scriptTag : doc.getElementsByTag("script")) {
            if (!scriptTag.toString().contains(ANTI_URL_VARIABLE)) {
                continue;
            }
            // The marker may appear only in the tag's attributes; such a tag has
            // no body to evaluate and childNode(0) would throw.
            if (scriptTag.childNodeSize() == 0) {
                continue;
            }
            // Only the script body is evaluated, not the surrounding tag.
            String scriptBody = scriptTag.childNode(0).toString();
            ScriptEngine engine = new ScriptEngineManager().getEngineByName("javascript");
            if (engine == null) {
                // getEngineByName returns null when no matching engine is installed.
                throw new IllegalStateException(
                        "No \"javascript\" script engine is available in this JVM");
            }
            engine.eval(scriptBody);
            antiUrl = engine.get(ANTI_URL_VARIABLE);
        }
        return antiUrl;
    }
}
package com.zhiwei.httpclient;
import com.alibaba.fastjson.JSONObject;
import org.seimicrawler.xpath.JXDocument;
import org.seimicrawler.xpath.JXNode;
import java.io.IOException;
import java.text.ParseException;
import java.util.*;
import static java.lang.Thread.sleep;
/**
 * Entry point of the crawler: walks every Sogou WeChat search-result page,
 * prints the fields of each listed account, and looks up the monthly article
 * count from the asynchronously loaded JSON.
 */
public class GetWebData {
    /** Number of the search-result page being processed; {@code main} increments it after each page. */
    public static int x =1;
    /** Count of list items handled so far on the current page; {@code main} resets it after each page. */
    public static int y =0;
    /** Zero-based position of the item whose monthly article count is being looked up. */
    public static int i = 0;

    private static final String SITE_ROOT = "https://weixin.sogou.com";
    /** Pause after each processed item, in milliseconds. */
    private static final long ITEM_PAUSE_MILLIS = 3000L;

    /**
     * Determines the number of result pages, then fetches and prints each one.
     *
     * @param args unused
     * @throws Exception if the page count cannot be determined or a page fails to load
     */
    public static void main(String[] args) throws ParseException, IOException,Exception {
        int totalpages = IfExistNextPage.getTotalpages();
        for (int page = 1; page <= totalpages; page++) {
            String content = StockUtils.getHtmlByUrl(searchUrl(page));
            parseHtmlByXpath(content);
            x++;
            y = 0;
        }
    }

    /** Builds the search-result URL for the given page number. */
    private static String searchUrl(int page) {
        return SITE_ROOT
                + "/weixin?query=%E5%BE%AE%E4%BF%A1&_sug_type_=&s_from=input&_sug_=n&type=1&page="
                + page + "&ie=utf8";
    }

    /**
     * Prints the fields of every account entry found in one result page and,
     * for each entry, the monthly article count. Pauses after each entry.
     *
     * @param content HTML of a search-result page; {@code null} (which
     *                StockUtils.getHtmlByUrl returns on a failed or non-200
     *                request) causes the page to be skipped
     * @throws ParseException declared for compatibility with existing callers
     * @throws IOException declared for compatibility with existing callers
     */
    public static void parseHtmlByXpath(String content) throws ParseException,IOException{
        if (content == null) {
            System.out.println("页面获取失败,跳过本页");
            i = 0;
            return;
        }
        JXDocument jxDocument = JXDocument.create(content);
        // Each <li> under the news box is one account entry.
        List<JXNode> jxNodeList = jxDocument.selN("//div[@class='news-box']/ul/li");
        for (JXNode jxNode : jxNodeList) {
            printAccount(jxNode);
            y = y + 1;
            printMonthlyArticleCount();
            pauseBetweenItems();
        }
        i = 0;
    }

    /** Prints title, link, WeChat id and the three optional detail rows of one entry. */
    private static void printAccount(JXNode jxNode) {
        String title = jxNode.selOne("//div[@class='txt-box']/p[@class='tit']/a").asElement().text();
        System.out.println("标题:"+title);
        String url = jxNode.selOne("//div[@class='gzh-box2']/div[@class='img-box']/a").asElement().attr("href");
        System.out.println("链接:"+url);
        String vxid = jxNode.selOne("//div[@class='txt-box']/p[@class='info']/label").asElement().text();
        System.out.println("微信号:"+vxid);
        printOptionalText(jxNode, "//dl[1]/dd", "功能介绍:", "没有功能介绍");
        printOptionalText(jxNode, "//dl[2]/dd", "微信认证:", "没有微信认证");
        printOptionalText(jxNode, "//dl[3]/dd/a", "最近文章:", "没有最近文章。");
    }

    /** Prints {@code label + text} of the node matched by {@code xpath}, or {@code absentMessage} if nothing matches. */
    private static void printOptionalText(JXNode item, String xpath, String label, String absentMessage) {
        // Evaluate the XPath once and reuse the result.
        JXNode match = item.selOne(xpath);
        if (match == null) {
            System.out.println(absentMessage);
        } else {
            System.out.println(label + match.asElement().text());
        }
    }

    /**
     * Resolves the signature URL of the current page, downloads the JSON it
     * points to and prints the monthly article count of the current item.
     * Failures are reported and do not stop the crawl.
     */
    private static void printMonthlyArticleCount() {
        try {
            Object signature = GetSignature.getsignatgure(searchUrl(x));
            if (signature == null) {
                // Without a signature the request would go to "...comnull".
                System.out.println("未找到signature,无法获取月发文数量");
                return;
            }
            String ybcontent = StockUtils.getHtmlByUrl(SITE_ROOT + signature);
            parseHtmlByXpath2(ybcontent);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Sleeps between items; restores the interrupt flag if interrupted. */
    private static void pauseBetweenItems() {
        try {
            Thread.sleep(ITEM_PAUSE_MILLIS);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Prints the monthly article count of the item at position {@code y - 1} of
     * page {@code x}, read from the {@code msg} object of the given JSON text.
     * The stored value is comma-separated; only the part before the first
     * comma is printed.
     *
     * @param content JSON text returned by the asynchronous endpoint
     * @throws IOException if {@code content} is {@code null} or is not a JSON
     *                     object containing {@code msg}
     * @throws Exception if the key page cannot be fetched or the JSON cannot be parsed
     */
    public static void parseHtmlByXpath2(String content) throws IOException,Exception {
        i = y - 1;
        if (content == null) {
            throw new IOException("No response body for the monthly article count request");
        }
        // Keys of the asynchronous payload, mapped to the item's position on the page.
        HashMap<String,Integer> c = GetKey.getkeyUtils(x);
        Object parsed = JSONObject.parse(content);
        if (!(parsed instanceof JSONObject)) {
            throw new IOException("Monthly article count response is not a JSON object");
        }
        JSONObject msg = ((JSONObject) parsed).getJSONObject("msg");
        if (msg == null) {
            throw new IOException("Monthly article count response has no \"msg\" object");
        }
        List<String> keys = getAllKey(c, i);
        Object monthly = keys.isEmpty() ? null : msg.get(keys.get(0));
        if (monthly == null) {
            System.out.println("没有发布文章");
            return;
        }
        String value = monthly.toString();
        int comma = value.indexOf(',');
        String count = comma >= 0 ? value.substring(0, comma) : value;
        System.out.println("月发布" + count + "篇文章");
    }

    /**
     * Returns every key of {@code hm} whose mapped value equals {@code value}.
     *
     * @param hm map to search
     * @param value value to look for
     * @return the matching keys, in the map's iteration order; empty if none match
     */
    public static <K> ArrayList<K> getAllKey(HashMap<K, ?> hm, Integer value){
        ArrayList<K> list = new ArrayList<K>();
        for (K key : hm.keySet()) {
            Object mapped = hm.get(key);
            if (mapped != null && mapped.equals(value)) {
                list.add(key);
            }
        }
        return list;
    }
}
package com.zhiwei.httpclient;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import static java.lang.Thread.getDefaultUncaughtExceptionHandler;
import static java.lang.Thread.sleep;
public class IfExistNextPage {
    private static final String SEARCH_BASE = "https://weixin.sogou.com/weixin";
    /** Value of the {@code uigs} attribute that marks the "next page" link. */
    private static final String NEXT_PAGE_MARKER = "page_next";

    /**
     * Counts the search-result pages by following the pager's last link until
     * that link is no longer the "next page" link.
     *
     * <p>A page without any pager links ends the walk. Previously such a page,
     * when it was not the first one, caused the same URL to be requested
     * forever.
     *
     * @return number of pages fetched
     * @throws Exception if a page cannot be fetched
     */
    public static int getTotalpages() throws Exception {
        int pageCount = 0;
        // Query string of the first page; replaced by the pager's last href on each round.
        String nexturl = "?query=%E5%BE%AE%E4%BF%A1&_sug_type_=&s_from=input&_sug_=n&type=1&page=1&ie=utf8";
        boolean hasNext = true;
        while (hasNext) {
            Document document = Jsoup.connect(SEARCH_BASE + nexturl).get();
            Elements links = document.select("div.p-fy").select("a");
            String lastHref = "";
            String lastUigs = "";
            for (Element link : links) {
                lastHref = link.attr("href");
                lastUigs = link.attr("uigs");
                // One-second pause per pager link, as before.
                pause();
            }
            pageCount++;
            // Continue only if the last pager link is "next page" and has a target.
            hasNext = NEXT_PAGE_MARKER.equals(lastUigs) && !lastHref.isEmpty();
            nexturl = lastHref;
        }
        return pageCount;
    }

    /** Sleeps for one second; restores the interrupt flag if interrupted. */
    private static void pause() {
        try {
            Thread.sleep(1000);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Returns every key of {@code hm} whose mapped value equals {@code value}.
     *
     * @param hm map to search
     * @param value value to look for
     * @return the matching keys, in the map's iteration order; empty if none match
     */
    public static <K> ArrayList<K> getAllKey(HashMap<K, ?> hm, Integer value){
        ArrayList<K> list = new ArrayList<K>();
        for (K key : hm.keySet()) {
            Object mapped = hm.get(key);
            if (mapped != null && mapped.equals(value)) {
                list.add(key);
            }
        }
        return list;
    }
}
package com.zhiwei.httpclient;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import java.io.IOException;
public class StockUtils {
    /**
     * Issues a GET request for {@code url} with browser-like headers and
     * returns the response body decoded as UTF-8.
     *
     * <p>NOTE(review): the Referer and cookie headers are each added twice via
     * addHeader, so presumably both values are sent. Confirm which pair is
     * intended. The cookie values are session data hard-coded in source.
     *
     * @param url address to fetch
     * @return the response body, or {@code null} when the status is not 200,
     *         the response has no entity, or the request throws
     * @throws IOException if closing the HTTP client fails
     */
    public static String getHtmlByUrl(String url) throws IOException{
        CloseableHttpClient client = HttpClients.createDefault();
        HttpGet request = new HttpGet(url);
        // Headers that make the request look like it comes from a browser.
        request.addHeader("User-Agent","Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36");
        request.addHeader("Referer","https://weixin.sogou.com/weixin?type=1&s_from=input&query=%E5%BE%AE%E4%BF%A1&ie=utf8&_sug_=y&_sug_type_=&w=01019900&sut=3104&sst0=1598233518739&lkt=1%2C1598233518638%2C1598233518638");
        request.addHeader("cookie","ssuid=7721315200; IPLOC=CN3302; SUID=FD3CB33C3118960A000000005E6EC315; SUV=1589806411593000; pgv_pvi=8877030400; sw_uuid=2356830386; wuid=AAGybl+xLwAAAAqLMX9P4QUAGwY=; CXID=2A332FD98F354B28467C13BEE5751651; LCLKINT=1823; LSTMV=522%2C185; ABTEST=0|1597799749|v1; weixinIndexVisited=1; JSESSIONID=aaaXXkIyD8HIAcrOi3Yox; PHPSESSID=nv30eatifl87ngsfpb2bldcg56; SNUID=B6E081A7C2C76F58857D3FBAC250DF69; seccodeRight=success; successCount=1|Mon, 24 Aug 2020 07:14:34 GMT; refresh=1");
        request.addHeader("Referer","https://weixin.sogou.com/websearch/weixin/pc/anti_account.jsp?t=1598235364118&signature=zEH6t4vtzmgi-lZOgXZ*wnxyO0xCCmI*98uRvLGbuvdOYmdXdNTo0m9UnsAIjFTJswHTHB-*ndspBdCd-6Wcq9DEKhmxAGRLKPE5SE7EJaSzCG4E4uotravrUcaPDoakBuCXin0uTIVmHkN6zhISeLHZQ6c5yaxgjbhTmqab7Gojgm5t6vZ*II-L*Mnj3E5WpD9YtUFSMBlnDNIrph7IKqSMUFrqlohvdt05KiICwetWRjmLUBXxfZK4FfeoALORnYDQQy8ZXvgVlg9M5tftR5HpBPU-or4rNgtH2yMkpVnQ9HV3B5M98cUcStRyaN016eD6DHgUu8ysQKm0dLfvIS0fIqlxF4YKLRL6vCaFeF8GNVUo2rSPZBcl1MQmyjRHxTXLhs4seZ2CLiTPUiNzBaoTiSCuEtlf6pSozZSnx0Y=");
        request.addHeader("cookie","ssuid=7721315200; IPLOC=CN3302; SUID=FD3CB33C3118960A000000005E6EC315; SUV=1589806411593000; pgv_pvi=8877030400; sw_uuid=2356830386; wuid=AAGybl+xLwAAAAqLMX9P4QUAGwY=; CXID=2A332FD98F354B28467C13BEE5751651; LCLKINT=1823; LSTMV=522%2C185; ABTEST=0|1597799749|v1; weixinIndexVisited=1; SNUID=742341650105AC94BDD0F52601C16B72; JSESSIONID=aaaXXkIyD8HIAcrOi3Yox");

        String body = null;
        try {
            HttpResponse response = client.execute(request);
            boolean ok = response.getStatusLine().getStatusCode() == HttpStatus.SC_OK;
            if (ok) {
                // Decode the payload only when the response carries one.
                HttpEntity payload = response.getEntity();
                if (payload != null) {
                    body = EntityUtils.toString(payload, "UTF-8");
                }
            }
        } catch (Exception e) {
            System.out.println("访问【"+url+"】出现了异常!");
            e.printStackTrace();
        } finally {
            // Release the connection.
            client.close();
        }
        return body;
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment