Commit 47654569 by yangchen

增加简书用户采集 和 修改百度知道 关键词采集死循环 问题

parent 8e2e2cc2
...@@ -30,6 +30,7 @@ public class BaiduZhidaoCrawlerParse { ...@@ -30,6 +30,7 @@ public class BaiduZhidaoCrawlerParse {
public static List<Map<String,Object>> getData(String word,ProxyHolder proxy) { public static List<Map<String,Object>> getData(String word,ProxyHolder proxy) {
try { try {
List<Map<String,Object>> dataList = new ArrayList<>(); List<Map<String,Object>> dataList = new ArrayList<>();
List<String> urlList = new ArrayList<>();
int i = 0; int i = 0;
int count = -1; int count = -1;
while(true) { while(true) {
...@@ -42,12 +43,16 @@ public class BaiduZhidaoCrawlerParse { ...@@ -42,12 +43,16 @@ public class BaiduZhidaoCrawlerParse {
for(Element element : elements) { for(Element element : elements) {
Map<String,Object> map = new HashMap<>(); Map<String,Object> map = new HashMap<>();
String ur = element.select("a.ti").attr("href").split("\\?")[0]; String ur = element.select("a.ti").attr("href").split("\\?")[0];
if(urlList.contains(ur)) {
continue;
}
String title = element.select("a.ti").text(); String title = element.select("a.ti").text();
String content = element.select("dd.answer").text(); String content = element.select("dd.answer").text();
String time = element.select("dd.dd.explain.f-light > span:nth-child(1)").text(); String time = element.select("dd.dd.explain.f-light > span:nth-child(1)").text();
String source = element.select("dd.dd.explain.f-light > span:nth-child(2) > a").text(); String source = element.select("dd.dd.explain.f-light > span:nth-child(2) > a").text();
String answerCount = element.select("dd.dd.explain.f-light > span:nth-child(3) > a").text(); String answerCount = element.select("dd.dd.explain.f-light > span:nth-child(3) > a").text();
String like = element.select("dd.dd.explain.f-light > span:nth-child(4)").text(); String like = element.select("dd.dd.explain.f-light > span:nth-child(4)").text();
urlList.add(ur);
map.put("url", ur); map.put("url", ur);
map.put("title", title); map.put("title", title);
map.put("content", content); map.put("content", content);
...@@ -56,7 +61,6 @@ public class BaiduZhidaoCrawlerParse { ...@@ -56,7 +61,6 @@ public class BaiduZhidaoCrawlerParse {
map.put("answerCount", answerCount); map.put("answerCount", answerCount);
map.put("like", like); map.put("like", like);
map.put("word", word); map.put("word", word);
System.out.println(map.toString());
dataList.add(map); dataList.add(map);
} }
if(dataList.size() - count < 8) { if(dataList.size() - count < 8) {
......
package com.zhiwei.media_data_crawler.crawler;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.media_data_crawler.entity.JianshuUser;
import com.zhiwei.tools.tools.URLCodeUtil;
import okhttp3.Response;
/**
 * Crawler for Jianshu (jianshu.com) data.
 *
 * @author byte-zbs
 * @Date 2019-03-23 11:12:07
 * @version 1.0.0
 */
public class JianshuCrawler {

    private static final Logger logger = LogManager.getLogger(JianshuCrawler.class);
    private static final HttpBoot httpBoot = new HttpBoot(false, 2);

    /** Give up after this many consecutive failed attempts (rate limit or exception). */
    private static final int MAX_CONSECUTIVE_FAILURES = 5;

    /** Base pause between retries; multiplied by the number of consecutive failures so far. */
    private static final long RETRY_BACKOFF_MILLIS = 1000L;

    /**
     * Searches Jianshu users by keyword and collects every result page.
     *
     * <p>Paging stops when the response carries no {@code entries} array, when the
     * collected total falls behind the page counter (the original
     * {@code page * 10 > size + 30} heuristic), or after
     * {@link #MAX_CONSECUTIVE_FAILURES} consecutive failed attempts. Whatever was
     * collected up to that point is returned; this method does not throw on
     * request or parse errors.
     *
     * @param word   search keyword
     * @param cookie value sent as the {@code cookie} request header
     * @return the users collected so far; never {@code null}, possibly empty
     */
    public static List<JianshuUser> getUserList(String word, String cookie) {
        List<JianshuUser> jsList = new ArrayList<>();

        // The headers do not change between pages, so build them once.
        Map<String, Object> headers = new HashMap<>();
        headers.put("cookie", cookie);
        headers.put("origin", "https://www.jianshu.com");
        headers.put("accept", "application/json");
        headers.put("user-agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36");

        int page = 1;
        int failures = 0;
        while (true) {
            // NOTE(review): the keyword goes through getURLDecode, not an encode call;
            // kept as in the original - confirm this is what the HTTP layer expects.
            String url = "https://www.jianshu.com/search/do?q=" + URLCodeUtil.getURLDecode(word, "utf-8")
                    + "&type=user&page=" + page + "&order_by=default";
            try (Response response = httpBoot.syncCall(RequestUtils.wrapPost(url, headers, null), ProxyHolder.NAT_PROXY)) {
                String result = response.body().string();
                logger.debug("jianshu user search page {} response: {}", page, result);

                if (result.contains("搜索过于频繁")) {
                    // Rate limited: retry the same page, but back off and never spin forever.
                    failures++;
                    if (failures >= MAX_CONSECUTIVE_FAILURES || !pauseBeforeRetry(failures)) {
                        break;
                    }
                    continue;
                }

                JSONObject json = JSONObject.parseObject(result);
                JSONArray jsonArray = json.getJSONArray("entries");
                if (Objects.isNull(jsonArray)) {
                    break;
                }
                failures = 0;

                for (int i = 0; i < jsonArray.size(); i++) {
                    jsList.add(toUser(jsonArray.getJSONObject(i)));
                }
                logger.info("{} 页 一共采集到 {} 关键词 {}", page, jsList.size(), word);

                page++;
                if (page * 10 > jsList.size() + 30) {
                    break;
                }
            } catch (Exception e) {
                // Pass the exception as the trailing argument without a placeholder so
                // the logger records the stack trace.
                logger.error("简书用户采集出错", e);
                failures++;
                if (failures >= MAX_CONSECUTIVE_FAILURES || !pauseBeforeRetry(failures)) {
                    break;
                }
            }
        }

        if (failures >= MAX_CONSECUTIVE_FAILURES) {
            logger.warn("jianshu user search for {} stopped at page {} after {} consecutive failures",
                    word, page, failures);
        }
        return jsList;
    }

    /** Maps one element of the {@code entries} array onto a {@link JianshuUser}. */
    private static JianshuUser toUser(JSONObject data) {
        JianshuUser jsu = new JianshuUser();
        jsu.setId(data.getString("id"));
        jsu.setName(data.getString("nickname"));
        jsu.setUrl("https://www.jianshu.com/u/" + data.getString("slug"));
        jsu.setImgUrl(data.getString("avatar_url"));
        // getIntValue yields 0 for an absent field instead of a null Integer that
        // would throw when unboxed into the int setters.
        jsu.setFensi(data.getIntValue("followers_count"));
        jsu.setGuangzhu(data.getIntValue("following_users_count"));
        // total_wordage is the word count and total_likes_count the like count;
        // they were previously stored in articles and zishu respectively.
        jsu.setZishu(data.getIntValue("total_wordage"));
        jsu.setLike(data.getIntValue("total_likes_count"));
        // NOTE(review): no article-count field is read here, so articles stays 0 -
        // confirm which key of the search response carries it.
        return jsu;
    }

    /**
     * Sleeps before the next retry, longer after each consecutive failure.
     *
     * @return {@code false} if the thread was interrupted and the crawl should stop
     */
    private static boolean pauseBeforeRetry(int attempt) {
        try {
            Thread.sleep(RETRY_BACKOFF_MILLIS * attempt);
            return true;
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            return false;
        }
    }

    public static void main(String[] args) {}
}
package com.zhiwei.media_data_crawler.entity;
/**
 * Plain data holder for one Jianshu user record.
 */
public class JianshuUser {

    private String id;
    private String name;
    private String url;
    private String imgUrl;
    private int fensi;    // followers
    private int guangzhu; // following
    private int articles; // article count
    private int zishu;    // number of words written
    private int like;     // likes received

    // ---- identity ----

    public String getId() {
        return this.id;
    }

    public void setId(String id) {
        this.id = id;
    }

    public String getName() {
        return this.name;
    }

    public void setName(String name) {
        this.name = name;
    }

    // ---- links ----

    public String getUrl() {
        return this.url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getImgUrl() {
        return this.imgUrl;
    }

    public void setImgUrl(String imgUrl) {
        this.imgUrl = imgUrl;
    }

    // ---- counters ----

    public int getFensi() {
        return this.fensi;
    }

    public void setFensi(int fensi) {
        this.fensi = fensi;
    }

    public int getGuangzhu() {
        return this.guangzhu;
    }

    public void setGuangzhu(int guangzhu) {
        this.guangzhu = guangzhu;
    }

    public int getArticles() {
        return this.articles;
    }

    public void setArticles(int articles) {
        this.articles = articles;
    }

    public int getZishu() {
        return this.zishu;
    }

    public void setZishu(int zishu) {
        this.zishu = zishu;
    }

    public int getLike() {
        return this.like;
    }

    public void setLike(int like) {
        this.like = like;
    }

    /** Renders every field as {@code JianshuUser [key=value, ...]} in declaration order. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("JianshuUser [");
        text.append("id=").append(this.id);
        text.append(", name=").append(this.name);
        text.append(", url=").append(this.url);
        text.append(", imgUrl=").append(this.imgUrl);
        text.append(", fensi=").append(this.fensi);
        text.append(", guangzhu=").append(this.guangzhu);
        text.append(", articles=").append(this.articles);
        text.append(", zishu=").append(this.zishu);
        text.append(", like=").append(this.like);
        return text.append("]").toString();
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment