Commit 247e637d by yangchen

douban topic crawler

parent 36eb5887
package com.zhiwei.parse;
import java.io.IOException;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.poi.xwpf.usermodel.BodyElementType;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static java.util.Objects.nonNull;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.parse.analysis.DoubanCommentAnalysis;
import com.zhiwei.tools.tools.URLCodeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools;
/**
 * Crawlers for Douban group search results and Douban topic comments.
 *
 * <p>Both entry points page through the site with plain GET requests, pause
 * between requests, and stop at the first request/parsing error, returning
 * whatever was collected up to that point.
 */
public class Douban {
    private static final Logger logger = LoggerFactory.getLogger(Douban.class);
    private static final String HOST = "www.douban.com";
    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36";
    /** Offset step used for the {@code start} parameter of the group search. */
    private static final int TOPIC_PAGE_SIZE = 20;
    /** Offset step used for the {@code start} parameter of a topic's comment pages. */
    private static final int COMMENT_PAGE_SIZE = 100;
    /** Pause between two consecutive requests, in milliseconds. */
    private static final int REQUEST_INTERVAL_MILLIS = 1500;
    /** Text that follows the number in the reply-count cell of a search result row. */
    private static final String REPLY_SUFFIX = "回应";
    private static final DoubanCommentAnalysis doubanCommentAnalysis = new DoubanCommentAnalysis();
    private static final HttpBoot httpBoot = new HttpBoot();

    /**
     * Crawls Douban group search results for a keyword, newest first.
     *
     * <p>Each returned map has the keys {@code _id} (topic link), {@code title},
     * {@code group}, {@code time} and {@code reply_count}. Paging stops as soon as
     * a page contributes no row whose time is greater than or equal to
     * {@code stime} (string comparison), or when a request fails.
     *
     * @param word   search keyword
     * @param proxy  proxy passed through to the HTTP client
     * @param cookie value sent as the {@code Cookie} header
     * @param stime  lower bound for the row time, compared as a string
     * @return the collected rows, possibly empty
     */
    public static List<Map<String,Object>> doubanTopicGetByWord(String word,Proxy proxy,String cookie,String stime) {
        Map<String,String> headerMap = buildHeaders(cookie);
        List<Map<String,Object>> bodyList = new ArrayList<>();
        int page = 0;
        boolean more = true;
        while (more) {
            int sizeBeforePage = bodyList.size();
            String url = "https://www.douban.com/group/search?q=" + URLCodeUtil.getURLEncode(word, "utf-8")
                    + "&start=" + page * TOPIC_PAGE_SIZE + "&cat=1013&sort=time";
            headerMap.put("Referer", url);
            try {
                String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string();
                if (htmlBody == null) {
                    // Nothing to parse; stop instead of requesting further pages forever.
                    more = false;
                } else {
                    Elements rows = Jsoup.parse(htmlBody).select("div.topics").select("tr.pl");
                    for (Element row : rows) {
                        String time = row.select("td.td-time").attr("title");
                        if (time.compareTo(stime) < 0) {
                            continue;
                        }
                        Elements subjectLink = row.select("td.td-subject").select("a");
                        Elements cells = row.select("td");
                        Map<String,Object> map = new HashMap<>();
                        map.put("_id", subjectLink.attr("href"));
                        map.put("title", subjectLink.text());
                        // The group name is read from the fourth cell; tolerate shorter rows.
                        map.put("group", cells.size() > 3 ? cells.get(3).text() : "");
                        map.put("time", time);
                        map.put("reply_count", parseReplyCount(row.select("td.td-reply").select("span").text()));
                        bodyList.add(map);
                    }
                    if (sizeBeforePage == bodyList.size()) {
                        more = false;
                    }
                    logger.info("采集到 第 {} 页 ,一共采集到 {} 条 是否有下一页 {}",page,bodyList.size(),more);
                }
                ZhiWeiTools.sleep(REQUEST_INTERVAL_MILLIS);
                page++;
            } catch (Exception e) {
                more = false;
                // Pass the exception as the trailing argument without a placeholder
                // so the logger records it as the throwable.
                logger.error("豆瓣 topic 采集出错", e);
            }
        }
        return bodyList;
    }

    /**
     * Crawls the comments of a Douban topic.
     *
     * <p>Any {@code #fragment} is removed from {@code url}, then pages are requested
     * with {@code ?start=0, 100, 200, ...} and parsed by
     * {@link DoubanCommentAnalysis#getData(String)}. Paging continues only while a
     * page yields between 96 and 104 entries; any other count, or an error, ends
     * the crawl.
     *
     * @param url    topic URL
     * @param proxy  proxy passed through to the HTTP client
     * @param cookie value sent as the {@code Cookie} header
     * @return the collected comments, possibly empty
     */
    public static List<Map<String,Object>> getDoubanComment(String url,Proxy proxy,String cookie) {
        List<Map<String,Object>> dataList = new ArrayList<>();
        if (url == null) {
            return dataList;
        }
        int fragmentStart = url.indexOf('#');
        if (fragmentStart >= 0) {
            url = url.substring(0, fragmentStart);
        }
        Map<String,String> headerMap = buildHeaders(cookie);
        boolean more = true;
        int page = 0;
        while (more) {
            try {
                String result = httpBoot.syncCall(
                        RequestUtils.wrapGet(url + "?start=" + page * COMMENT_PAGE_SIZE, headerMap), proxy).body().string();
                int sizeBeforePage = dataList.size();
                dataList.addAll(doubanCommentAnalysis.getData(result));
                page++;
                int fetched = dataList.size() - sizeBeforePage;
                // Only a roughly full page (96..104 entries) is taken as a sign that more pages follow.
                if (fetched <= 95 || fetched >= 105) {
                    more = false;
                }
                ZhiWeiTools.sleep(REQUEST_INTERVAL_MILLIS);
                logger.info("评论采集到 第 {} 页 ,一共采集到 {} 条数据 ,more : {}",page,dataList.size(),more);
            } catch (Exception e) {
                logger.error("Exception", e);
                more = false;
            }
        }
        return dataList;
    }

    /** Builds the request headers shared by both crawlers. */
    private static Map<String,String> buildHeaders(String cookie) {
        Map<String,String> headerMap = new HashMap<>();
        headerMap.put("Host", HOST);
        headerMap.put("User-Agent", USER_AGENT);
        headerMap.put("Cookie", cookie);
        return headerMap;
    }

    /** Extracts the number preceding the reply suffix; returns 0 when there is no parsable number. */
    private static int parseReplyCount(String text) {
        int suffixStart = text.indexOf(REPLY_SUFFIX);
        String digits = (suffixStart >= 0 ? text.substring(0, suffixStart) : text).trim();
        if (digits.isEmpty()) {
            return 0;
        }
        try {
            return Integer.parseInt(digits);
        } catch (NumberFormatException e) {
            return 0;
        }
    }
}
package com.zhiwei.parse.analysis;
import static java.util.Objects.nonNull;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.math.NumberUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Parses the HTML of a Douban topic page into a list of comment records.
 */
public class DoubanCommentAnalysis {
    private static final Logger logger = LoggerFactory.getLogger(DoubanCommentAnalysis.class);

    /**
     * Extracts one record per {@code li} element found under {@code #comments}.
     *
     * <p>Each record has the keys {@code source}, {@code time}, {@code content},
     * {@code id} (the {@code data-cid} attribute) and {@code like}. The like count
     * is looked up in the raw page text under the key {@code "c" + id}.
     *
     * @param result raw HTML of the page
     * @return the parsed records; an empty list when {@code result} is
     *         {@code null} or parsing fails
     */
    public List<Map<String,Object>> getData(String result) {
        if (result == null) {
            return Collections.emptyList();
        }
        try {
            List<Map<String,Object>> bodyList = new ArrayList<>();
            Document doc = Jsoup.parse(result);
            Elements elements = doc.select("#comments").select("li");
            for (Element element : elements) {
                Map<String,Object> map = new HashMap<>();
                String source = element.select("div.reply-doc.content > div.bg-img-green > h4 > a").text();
                String time = element.select("div.reply-doc.content > div.bg-img-green > h4 > span").text();
                String content = element.select("div.reply-doc.content > p").text();
                String id = element.select("li").attr("data-cid");
                map.put("source", source);
                map.put("time", time);
                map.put("content", content);
                map.put("id", id);
                // Without a comment id there is no key to look up.
                map.put("like", id.isEmpty() ? 0 : getLikeNum(result, "c" + id));
                bodyList.add(map);
            }
            return bodyList;
        } catch (Exception e) {
            // Pass the exception as the trailing argument without a placeholder
            // so the logger records it as the throwable.
            logger.error("解析错误", e);
        }
        return Collections.emptyList();
    }

    /**
     * Finds the integer that follows {@code id":} in the page text.
     *
     * @param result raw page text
     * @param id     key to look for, without the closing quote
     * @return the first parsable count, or 0 when none is found
     */
    private int getLikeNum(String result,String id) {
        // Quote the id so it is matched literally, and capture the complete run of
        // digits after the colon rather than a fixed two characters.
        Matcher matcher = Pattern.compile(Pattern.quote(id) + "\":\\s*(\\d+)").matcher(result);
        while (matcher.find()) {
            try {
                return Integer.parseInt(matcher.group(1));
            } catch (NumberFormatException e) {
                // Value does not fit in an int; try the next occurrence.
            }
        }
        return 0;
    }
}
package com.zhiwei.Comment;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.testng.annotations.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Douban;
/**
 * Manual integration test: crawls the comments of one Douban topic and
 * exports them to an Excel workbook.
 */
public class DoubanCommentTest {
    /** Fetches the comments of a fixed topic and writes them to a fixed local path. */
    @Test
    public void f() {
        // Column order of the exported sheet.
        List<String> columns = new ArrayList<>();
        for (String column : new String[] {"source", "time", "like", "content", "id"}) {
            columns.add(column);
        }

        String topicUrl = "https://www.douban.com/group/topic/72528866/";
        // NOTE(review): this is a literal session cookie committed to source; consider loading it from configuration.
        String sessionCookie = "bid=rymxzs5aojg; ps=y; ll=\"118173\"; __utmc=30149280; dbcl2=\"188038058:9IHyVcSobVc\"; ck=_RvF; ap_v=0,6.0; push_noty_num=0; push_doumail_num=0; __utmv=30149280.18803; douban-fav-remind=1; __yadk_uid=qLflXyj3R14ro9e0cLoZOQlJoMGVN32j; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1543562805%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DPk19bxnhsVWwfnrcnwT0PquON7D1JpLsbfSu9rRowalyi4pOeM3rMHKFaJo9jJF7%26wd%3D%26eqid%3De7f262650001ef98000000045c00e64f%22%5D; _pk_ses.100001.8cb4=*; douban-profile-remind=1; loc-last-index-location-id=\"118173\"; _vwo_uuid_v2=D85F60C118B0AF465035D9CC7BBFDA7A6|4bf255e1e3a2e9aeede3708192f5f1bc; __utma=30149280.824403997.1543559458.1543562809.1543564973.3; __utmz=30149280.1543564973.3.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _pk_id.100001.8cb4=6828fef49f6bcf34.1543559455.2.1543566557.1543559542.; __utmb=30149280.70.5.1543566539352";

        PoiExcelUtil exporter = PoiExcelUtil.getInstance();
        List<Map<String,Object>> comments = Douban.getDoubanComment(topicUrl, null, sessionCookie);
        exporter.exportExcel("D://crawlerdata//自媒体/douban评论采集-2.xlsx", "asd", columns, comments);
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment