Commit c4e67a9e by yangchen

知乎 增加用户采集和 链接更新问题时间

parent bb73a9c6
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
<description>网媒数据抓取,包含百度新闻、搜狗新闻、360新闻、知乎回答列表等</description> <description>网媒数据抓取,包含百度新闻、搜狗新闻、360新闻、知乎回答列表等</description>
<!-- 打包管理 --> <!-- 打包管理 -->
<build> <build>
<plugins> <plugins>
......
package com.zhiwei.media_data_crawler.crawler; package com.zhiwei.media_data_crawler.crawler;
import java.net.Proxy; import java.net.Proxy;
import java.net.URLEncoder;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Date; import java.util.Date;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Objects;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
...@@ -16,6 +18,8 @@ import com.zhiwei.crawler.core.HttpBoot; ...@@ -16,6 +18,8 @@ import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils; import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.media_data_crawler.data.DataCrawler; import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.ZhiHuData; import com.zhiwei.media_data_crawler.entity.ZhiHuData;
import com.zhiwei.media_data_crawler.entity.ZhihuAuthor;
import com.zhiwei.media_data_crawler.entity.ZhihuQuestionData;
import com.zhiwei.tools.httpclient.HeaderTool; import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.timeparse.TimeParse; import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.URLCodeUtil; import com.zhiwei.tools.tools.URLCodeUtil;
...@@ -70,6 +74,159 @@ public class ZhihuCrawlerParse { ...@@ -70,6 +74,159 @@ public class ZhihuCrawlerParse {
return list; return list;
} }
/**
 * Fetches a Zhihu question's title, creation time and URL from the v4 questions API.
 *
 * <p>The {@code created} field of the response is multiplied by 1000 (i.e. treated as epoch
 * seconds) and formatted as {@code yyyy-MM-dd HH:mm:ss}.
 *
 * @param id    question id appended to {@code https://www.zhihu.com/api/v4/questions/}
 * @param proxy proxy handed to the HTTP call
 * @return the question data, or {@code null} when the id is blank, the request or parsing
 *         fails, or the response has no {@code created} field
 */
public static ZhihuQuestionData getQuestionData(String id, Proxy proxy) {
    if (id == null || id.trim().isEmpty()) {
        // Nothing to query; avoid requesting ".../questions/null".
        logger.warn("zhihu question id is blank, request skipped");
        return null;
    }
    try (Response response = httpBoot.syncCall(RequestUtils.wrapGet("https://www.zhihu.com/api/v4/questions/" + id), proxy)) {
        String result = response.body().string();
        JSONObject json = JSONObject.parseObject(result);
        if (json == null) {
            logger.warn("zhihu question {} returned an empty body", id);
            return null;
        }
        // Read as Long so a missing field is detected explicitly instead of failing on unboxing.
        Long created = json.getLong("created");
        if (created == null) {
            logger.warn("zhihu question {} has no 'created' field", id);
            return null;
        }
        String ur = json.getString("url");
        String title = json.getString("title");
        String time = TimeParse.dateFormartString(new Date(created * 1000L), "yyyy-MM-dd HH:mm:ss");
        return new ZhihuQuestionData(title, time, ur);
    } catch (Exception e) {
        // The id fills the placeholder; the trailing exception is logged with its stack trace.
        logger.error(" 知乎 问题获取出错 {} ", id, e);
    }
    return null;
}
/**
 * Collects a Zhihu user's profile from the user's profile page.
 *
 * <p>The page is expected to embed a JSON document in the {@code js-initialData} script tag.
 * The user entry is looked up under {@code initialState.entities.users} using the fifth
 * {@code /}-separated segment of {@code url} as the key.
 *
 * <p>Missing numeric counters are stored as {@code 0} and a missing description as
 * {@code null}, so one absent field no longer discards the whole profile.
 *
 * @param url   profile link; its fifth {@code /}-separated segment must be the user key
 * @param proxy proxy handed to the HTTP call
 * @return the populated author, or {@code null} when the user entry is absent or the request
 *         or parsing fails
 */
public static ZhihuAuthor getZhihuUser(String url, Proxy proxy) {
    try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)) {
        String result = response.body().string();
        String jsondata = result.split("js-initialData\" type=\"text/json\"\\>")[1]
                .split("\\</script")[0];
        JSONObject json = JSONObject.parseObject(jsondata);
        String userKey = url.split("/")[4];
        JSONObject user = json.getJSONObject("initialState").getJSONObject("entities")
                .getJSONObject("users").getJSONObject(userKey);
        if (Objects.isNull(user)) {
            return null;
        }

        ZhihuAuthor za = new ZhihuAuthor();
        za.setName(user.getString("name"));
        // getIntValue avoids unboxing a null Integer into the int setters.
        za.setThank(user.getIntValue("thankedCount"));
        za.setCollection(user.getIntValue("favoritedCount"));
        za.setFensi(user.getIntValue("followerCount"));
        String description = user.getString("description");
        // Strip HTML tags from the description when one is present.
        za.setDescript(description == null ? null : description.replaceAll("<.*?>", ""));
        za.setGuanzhu(user.getIntValue("followingCount"));
        za.setTags(user.getString("headline"));
        za.setIsauthentication(user.getBooleanValue("isOrg"));
        za.setAuthentication(user.getString("orgName"));
        za.setLike(user.getIntValue("voteupCount"));
        za.setEdit(user.getIntValue("logsCount"));

        // Only the first badge / location entry is recorded.
        JSONArray badges = user.getJSONArray("badge");
        if (badges != null && !badges.isEmpty()) {
            za.setAuthdescription(badges.getJSONObject(0).getString("description"));
        }
        JSONArray locations = user.getJSONArray("locations");
        if (locations != null && !locations.isEmpty()) {
            za.setLocations(locations.getJSONObject(0).getString("name"));
        }
        JSONObject business = user.getJSONObject("business");
        if (business != null) {
            za.setBusiness(business.getString("name"));
        }
        za.setUrl(url);
        return za;
    } catch (Exception e) {
        // Use the class logger (as the sibling methods do) instead of printStackTrace().
        logger.error(" 知乎 用户采集出错 {} ", url, e);
    }
    return null;
}
// public static void main(String[] args) {
// String url = "68781862";
// ZhihuQuestionData zqd = ZhihuCrawlerParse.getQuestionData(url, null);
// System.out.println(zqd.toString());
// }
/**
 * Searches Zhihu for a keyword and collects the matching articles and question/answer hits.
 *
 * <p>The search page is loaded first to extract {@code search_hash_id}. The {@code search_v3}
 * API is then paged (offset starts at 5 and advances by 10, with a 3 second pause between
 * pages) until a page adds no new item. Items without {@code created_time} are skipped, and
 * an item that fails to parse is logged and skipped.
 *
 * @param word   keyword to search for; URL-encoded as UTF-8
 * @param proxy  proxy handed to every HTTP call
 * @param num    not read by this method. NOTE(review): the earlier doc described it as a page
 *               count (10 items per page) — confirm whether paging should be capped by it
 * @param cookie value sent as the {@code Cookie} request header
 * @return the items collected so far; a request or parsing failure is logged and ends the
 *         collection, so the list may be empty or partial
 */
public List<ZhiHuData> getUrlByKey(String word, Proxy proxy, int num, String cookie) {
    List<ZhiHuData> da = new ArrayList<>();
    Map<String, Object> header = new HashMap<>();
    header.put("Cookie", cookie);
    // NOTE(review): hard-coded authorization value; consider moving it to configuration.
    header.put("authorization", "oauth c3cef7c66a1843f8b3a9e6a1e3160e20");
    try {
        String encodedWord = URLEncoder.encode(word, "utf-8");
        String searchPage;
        // Close every response once its body has been read.
        try (Response response = httpBoot.syncCall(
                RequestUtils.wrapGet("https://www.zhihu.com/search?type=content&q=" + encodedWord, header), proxy)) {
            searchPage = response.body().string();
        }
        String searchHashId = searchPage.split("search_hash_id=")[1].split("&")[0];
        int n = 5;
        while (true) {
            String u = "https://www.zhihu.com/api/v4/search_v3?t=general&q=" + encodedWord
                    + "&correction=1&offset=" + n + "&limit=20&search_hash_id=" + searchHashId;
            logger.info(" 采集链接 u = {} ", u);
            String result;
            try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(u, header), proxy)) {
                result = response.body().string();
            }
            JSONObject json = JSONObject.parseObject(result);
            JSONArray jsonArry = json.getJSONArray("data");
            int c = da.size();
            for (int i = 0; i < jsonArry.size(); i++) {
                JSONObject data = jsonArry.getJSONObject(i);
                try {
                    JSONObject object = data.getJSONObject("object");
                    Long createdTime = object.getLong("created_time");
                    if (createdTime == null) {
                        // Entries without a creation time are not collected.
                        continue;
                    }
                    ZhiHuData zhihuData = new ZhiHuData();
                    String type = object.getString("type");
                    zhihuData.setType(type);
                    zhihuData.setComment_count(object.getInteger("comment_count"));
                    zhihuData.setTime(TimeParse.dateFormartString(new Date(createdTime * 1000L), "yyyy-MM-dd HH:mm:ss"));
                    zhihuData.setSource(object.getJSONObject("author").getString("name"));
                    if ("article".equals(type)) {
                        zhihuData.setTitle(object.getString("title").replaceAll("<.*?>", ""));
                        zhihuData.setAttitudes_count(object.getInteger("voteup_count"));
                        zhihuData.setContent(object.getString("content").replaceAll("<.*?>", ""));
                        // Turn the API article address into the public column address.
                        zhihuData.setUrl(object.getString("url").replace("https://api.zhihu.com/articles/", "https://zhuanlan.zhihu.com/p/"));
                    } else {
                        JSONObject highlight = data.getJSONObject("highlight");
                        zhihuData.setTitle(highlight.getString("title").replaceAll("<.*?>", ""));
                        zhihuData.setContent(highlight.getString("description").replaceAll("<.*?>", ""));
                        // Prefer the nested question's id; fall back to the object's own id.
                        JSONObject question = object.getJSONObject("question");
                        if (question != null) {
                            zhihuData.setUrl("https://www.zhihu.com/question/" + question.getString("id"));
                        } else {
                            zhihuData.setUrl("https://www.zhihu.com/question/" + object.getString("id"));
                        }
                    }
                    logger.debug("{}", zhihuData);
                    da.add(zhihuData);
                } catch (Exception e) {
                    logger.error("解析数据出错 {}", e);
                }
            }
            if (c == da.size()) {
                // The page contributed nothing new: stop paging.
                break;
            }
            n += 10;
            Thread.sleep(3000);
        }
    } catch (InterruptedException e) {
        // Keep the interrupt visible to the caller's thread management.
        Thread.currentThread().interrupt();
        logger.error(" 获取数据出错 {} ", e);
    } catch (Exception e) {
        logger.error(" 获取数据出错 {} ", e);
    }
    return da;
}
/** /**
* @param word * @param word
......
...@@ -387,6 +387,20 @@ public class DataCrawler { ...@@ -387,6 +387,20 @@ public class DataCrawler {
} }
} }
/**
 * Collects a Zhihu user's profile from the given profile link.
 *
 * <p>Delegates directly to {@code ZhihuCrawlerParse.getZhihuUser(url, proxy)}; anything the
 * delegate throws propagates to the caller unchanged.
 *
 * @param url   profile link handed to the parser
 * @param proxy proxy handed to the parser
 * @return whatever the parser returns for this link
 */
public static ZhihuAuthor getZhihuUser(String url, Proxy proxy) {
    final ZhihuAuthor author = ZhihuCrawlerParse.getZhihuUser(url, proxy);
    return author;
}
} }
...@@ -65,7 +65,7 @@ public class ZhiHuData implements Serializable{ ...@@ -65,7 +65,7 @@ public class ZhiHuData implements Serializable{
this.word = word; this.word = word;
} }
private String word; //采集关键词 private String word; //采集关键词
public String getWord() { public String getWord() {
return word; return word;
...@@ -162,4 +162,5 @@ private String word; //采集关键词 ...@@ -162,4 +162,5 @@ private String word; //采集关键词
public void setFollower_count(Integer follower_count) { public void setFollower_count(Integer follower_count) {
this.follower_count = follower_count; this.follower_count = follower_count;
} }
} }
package com.zhiwei.media_data_crawler.entity;
public class ZhihuAuthor {
private String url;
private String name;
private String descript;
private String tags;
private int like; //赞同数
private int edit; //编辑数
private int guanzhu; //关注数
private int fensi; //粉丝数
private int thank; //感谢数
private int collection; //收藏数
private boolean isauthentication; //是否认证
private String authentication; //认证公司
private String authdescription; //认证描述
private String locations; //居住信息
private String business; //所在行业
public String getBusiness() {
return business;
}
public void setBusiness(String business) {
this.business = business;
}
public String getLocations() {
return locations;
}
public void setLocations(String locations) {
this.locations = locations;
}
public String getAuthdescription() {
return authdescription;
}
public void setAuthdescription(String authdescription) {
this.authdescription = authdescription;
}
public String getUrl() {
return url;
}
public void setUrl(String url) {
this.url = url;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public String getDescript() {
return descript;
}
public void setDescript(String descript) {
this.descript = descript;
}
public String getTags() {
return tags;
}
public void setTags(String tags) {
this.tags = tags;
}
public int getLike() {
return like;
}
public void setLike(int like) {
this.like = like;
}
public int getEdit() {
return edit;
}
public void setEdit(int edit) {
this.edit = edit;
}
public int getGuanzhu() {
return guanzhu;
}
public void setGuanzhu(int guanzhu) {
this.guanzhu = guanzhu;
}
public int getFensi() {
return fensi;
}
public void setFensi(int fensi) {
this.fensi = fensi;
}
public int getThank() {
return thank;
}
public void setThank(int thank) {
this.thank = thank;
}
public int getCollection() {
return collection;
}
public void setCollection(int collection) {
this.collection = collection;
}
public boolean isIsauthentication() {
return isauthentication;
}
public void setIsauthentication(boolean isauthentication) {
this.isauthentication = isauthentication;
}
public String getAuthentication() {
return authentication;
}
public void setAuthentication(String authentication) {
this.authentication = authentication;
}
@Override
public String toString() {
return "ZhihuAuthor [url=" + url + ", name=" + name + ", descript="
+ descript + ", tags=" + tags + ", like=" + like + ", edit="
+ edit + ", guanzhu=" + guanzhu + ", fensi=" + fensi
+ ", thank=" + thank + ", collection=" + collection
+ ", isauthentication=" + isauthentication + ", authentication="
+ authentication + ", authdescription=" + authdescription
+ ", locations=" + locations + ", business=" + business + "]";
}
}
package com.zhiwei.media_data_crawler.entity;
/**
 * Mutable bean holding a Zhihu question's title, time string and URL.
 */
public class ZhihuQuestionData {

    private String title;
    private String time;
    private String url;

    /** Creates an instance with every field unset ({@code null}). */
    public ZhihuQuestionData() {
    }

    /**
     * Creates an instance with all three fields set.
     *
     * @param title question title
     * @param time  time string stored as given
     * @param url   question URL
     */
    public ZhihuQuestionData(String title, String time, String url) {
        this.title = title;
        this.time = time;
        this.url = url;
    }

    public String getTitle() {
        return this.title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getTime() {
        return this.time;
    }

    public void setTime(String time) {
        this.time = time;
    }

    public String getUrl() {
        return this.url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    /**
     * Renders the fields as {@code ZhihuQuestionData [title=..., time=..., url=...]}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("ZhihuQuestionData [title=");
        text.append(this.title);
        text.append(", time=").append(this.time);
        text.append(", url=").append(this.url);
        text.append("]");
        return text.toString();
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment