Commit 9ef31c31 by [zhangzhiwei]

因修改采集核心包版本,修改相应的方法

parent 9205aa61
<?xml version="1.0" encoding="UTF-8"?>
<!-- IDE project settings: compiler configuration holding the annotation-processing profile. -->
<project version="4">
<component name="CompilerConfiguration">
<annotationProcessing>
<!-- The profile is enabled and applies to the single module listed at the end of the profile. -->
<profile name="Maven default annotation processors profile" enabled="true">
<!-- Directory names for sources generated from main code and from test code. -->
<sourceOutputDir name="target/generated-sources/annotations" />
<sourceTestOutputDir name="target/generated-test-sources/test-annotations" />
<!-- NOTE(review): presumably makes the two directories above relative to the module content root; confirm against the IDE documentation. -->
<outputRelativeToContentRoot value="true" />
<module name="media_data_crawler" />
</profile>
</annotationProcessing>
</component>
</project>
\ No newline at end of file
<!-- Declares a project dictionary named "bewiler hk"; the element is empty, so no entries are defined here. -->
<!-- NOTE(review): the name looks like a local user account; confirm this per-user file is meant to be under version control. -->
<component name="ProjectDictionaryState">
<dictionary name="bewiler hk" />
</component>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<!-- IDE project settings: file encodings. -->
<project version="4">
<!-- UTF-8 is the default charset for properties files. -->
<component name="Encoding" defaultCharsetForPropertiesFiles="UTF-8">
<!-- The entry with url "PROJECT" is also set to UTF-8. -->
<file url="PROJECT" charset="UTF-8" />
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<!-- IDE project settings: external storage flag, Kotlin/JS compiler options, Maven project list and project JDK. -->
<project version="4">
<component name="ExternalStorageConfigurationManager" enabled="true" />
<!-- Both Kotlin/JS source-map options are declared without a value. -->
<component name="Kotlin2JsCompilerArguments">
<option name="sourceMapEmbedSources" />
<option name="sourceMapPrefix" />
</component>
<!-- The only Maven project file registered is the pom.xml in the project directory. -->
<component name="MavenProjectsManager">
<option name="originalFiles">
<list>
<option value="$PROJECT_DIR$/pom.xml" />
</list>
</option>
</component>
<!-- Project JDK is the Java SDK named "1.8" at language level JDK_1_8; the output URL points at the "out" directory under the project directory. -->
<component name="ProjectRootManager" version="2" languageLevel="JDK_1_8" project-jdk-name="1.8" project-jdk-type="JavaSDK">
<output url="file://$PROJECT_DIR$/out" />
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<!-- IDE project settings: UI Designer component palette. -->
<!-- Each item names a component class, its palette icon, three boolean flags (removable, auto-create-binding, can-attach-label) and the default layout constraints. -->
<project version="4">
<component name="Palette2">
<group name="Swing">
<!-- The two spacer items are designer classes (com.intellij.uiDesigner), not javax.swing components. -->
<item class="com.intellij.uiDesigner.HSpacer" tooltip-text="Horizontal Spacer" icon="/com/intellij/uiDesigner/icons/hspacer.png" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="1" hsize-policy="6" anchor="0" fill="1" />
</item>
<item class="com.intellij.uiDesigner.VSpacer" tooltip-text="Vertical Spacer" icon="/com/intellij/uiDesigner/icons/vspacer.png" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="6" hsize-policy="1" anchor="0" fill="2" />
</item>
<!-- Containers. -->
<item class="javax.swing.JPanel" icon="/com/intellij/uiDesigner/icons/panel.png" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="3" hsize-policy="3" anchor="0" fill="3" />
</item>
<item class="javax.swing.JScrollPane" icon="/com/intellij/uiDesigner/icons/scrollPane.png" removable="false" auto-create-binding="false" can-attach-label="true">
<default-constraints vsize-policy="7" hsize-policy="7" anchor="0" fill="3" />
</item>
<!-- Buttons and label: each presets its "text" property through initial-values. -->
<item class="javax.swing.JButton" icon="/com/intellij/uiDesigner/icons/button.png" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="3" anchor="0" fill="1" />
<initial-values>
<property name="text" value="Button" />
</initial-values>
</item>
<item class="javax.swing.JRadioButton" icon="/com/intellij/uiDesigner/icons/radioButton.png" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="3" anchor="8" fill="0" />
<initial-values>
<property name="text" value="RadioButton" />
</initial-values>
</item>
<item class="javax.swing.JCheckBox" icon="/com/intellij/uiDesigner/icons/checkBox.png" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="3" anchor="8" fill="0" />
<initial-values>
<property name="text" value="CheckBox" />
</initial-values>
</item>
<item class="javax.swing.JLabel" icon="/com/intellij/uiDesigner/icons/label.png" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="0" anchor="8" fill="0" />
<initial-values>
<property name="text" value="Label" />
</initial-values>
</item>
<!-- Single-line text inputs: preferred width 150, height -1. -->
<item class="javax.swing.JTextField" icon="/com/intellij/uiDesigner/icons/textField.png" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1">
<preferred-size width="150" height="-1" />
</default-constraints>
</item>
<item class="javax.swing.JPasswordField" icon="/com/intellij/uiDesigner/icons/passwordField.png" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1">
<preferred-size width="150" height="-1" />
</default-constraints>
</item>
<item class="javax.swing.JFormattedTextField" icon="/com/intellij/uiDesigner/icons/formattedTextField.png" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1">
<preferred-size width="150" height="-1" />
</default-constraints>
</item>
<!-- Multi-line text components: preferred size 150 x 50. -->
<item class="javax.swing.JTextArea" icon="/com/intellij/uiDesigner/icons/textArea.png" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3">
<preferred-size width="150" height="50" />
</default-constraints>
</item>
<item class="javax.swing.JTextPane" icon="/com/intellij/uiDesigner/icons/textPane.png" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3">
<preferred-size width="150" height="50" />
</default-constraints>
</item>
<item class="javax.swing.JEditorPane" icon="/com/intellij/uiDesigner/icons/editorPane.png" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3">
<preferred-size width="150" height="50" />
</default-constraints>
</item>
<!-- Selection and data views. -->
<item class="javax.swing.JComboBox" icon="/com/intellij/uiDesigner/icons/comboBox.png" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="0" hsize-policy="2" anchor="8" fill="1" />
</item>
<item class="javax.swing.JTable" icon="/com/intellij/uiDesigner/icons/table.png" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3">
<preferred-size width="150" height="50" />
</default-constraints>
</item>
<item class="javax.swing.JList" icon="/com/intellij/uiDesigner/icons/list.png" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="6" hsize-policy="2" anchor="0" fill="3">
<preferred-size width="150" height="50" />
</default-constraints>
</item>
<item class="javax.swing.JTree" icon="/com/intellij/uiDesigner/icons/tree.png" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3">
<preferred-size width="150" height="50" />
</default-constraints>
</item>
<!-- Tabbed and split panes: preferred size 200 x 200. -->
<item class="javax.swing.JTabbedPane" icon="/com/intellij/uiDesigner/icons/tabbedPane.png" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="3" hsize-policy="3" anchor="0" fill="3">
<preferred-size width="200" height="200" />
</default-constraints>
</item>
<item class="javax.swing.JSplitPane" icon="/com/intellij/uiDesigner/icons/splitPane.png" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="3" hsize-policy="3" anchor="0" fill="3">
<preferred-size width="200" height="200" />
</default-constraints>
</item>
<!-- Remaining widgets: spinner, slider, separator, progress bar, tool bar and scroll bar. -->
<item class="javax.swing.JSpinner" icon="/com/intellij/uiDesigner/icons/spinner.png" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1" />
</item>
<item class="javax.swing.JSlider" icon="/com/intellij/uiDesigner/icons/slider.png" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1" />
</item>
<item class="javax.swing.JSeparator" icon="/com/intellij/uiDesigner/icons/separator.png" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3" />
</item>
<item class="javax.swing.JProgressBar" icon="/com/intellij/uiDesigner/icons/progressbar.png" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="0" fill="1" />
</item>
<item class="javax.swing.JToolBar" icon="/com/intellij/uiDesigner/icons/toolbar.png" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="0" fill="1">
<preferred-size width="-1" height="20" />
</default-constraints>
</item>
<item class="javax.swing.JToolBar$Separator" icon="/com/intellij/uiDesigner/icons/toolbarSeparator.png" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="0" anchor="0" fill="1" />
</item>
<item class="javax.swing.JScrollBar" icon="/com/intellij/uiDesigner/icons/scrollbar.png" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="6" hsize-policy="0" anchor="0" fill="2" />
</item>
</group>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<!-- IDE project settings: version-control directory mappings. -->
<project version="4">
<component name="VcsDirectoryMappings">
<!-- A single mapping binds the empty directory path to Git. -->
<mapping directory="" vcs="Git" />
</component>
</project>
\ No newline at end of file
......@@ -65,7 +65,7 @@
<dependency>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.0.5-SNAPSHOT</version>
<version>0.0.8-SNAPSHOT</version>
</dependency>
</dependencies>
</project>
\ No newline at end of file
......@@ -28,6 +28,7 @@ import java.util.regex.Pattern;
public class BaiduNewsCrawlerParse {
private static Logger logger = LogManager.getLogger(BaiduNewsCrawlerParse.class);
private static HttpBoot httpBoot = new HttpBoot();
private static final String pt = "百度新闻";
/**
......@@ -200,7 +201,7 @@ public class BaiduNewsCrawlerParse {
// 下载数据页面
for (int i = 1; i <= 3; i++) {
try {
Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap));
Response response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap));
return response.body().string();
} catch (Exception e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace());
......@@ -246,7 +247,7 @@ public class BaiduNewsCrawlerParse {
// 下载数据页面
for (int i = 1; i <= 3; i++) {
try {
Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
Response response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
return response.body().string();
} catch (Exception e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace());
......@@ -270,7 +271,7 @@ public class BaiduNewsCrawlerParse {
// 下载数据页面
for (int i = 1; i <= 3; i++) {
try {
Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
Response response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
return response.body().string();
} catch (Exception e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace());
......
......@@ -24,7 +24,7 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class BaiduTiebaCrawlerParse {
private static HttpBoot httpBoot = new HttpBoot();
private static Logger logger = LogManager.getLogger(BaiduTiebaCrawlerParse.class);
/**
* @Title: getBaiduTiebaData
......@@ -217,7 +217,7 @@ public class BaiduTiebaCrawlerParse {
// 下载数据页面
for (int i = 1; i <= 3; i++) {
try {
Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
Response response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
return response.body().toString();
} catch (Exception e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace());
......@@ -255,7 +255,7 @@ public class BaiduTiebaCrawlerParse {
// 下载数据页面
for (int i = 1; i <= 3; i++) {
try {
Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
Response response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
return response.body().string();
} catch (Exception e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace());
......
......@@ -24,6 +24,7 @@ import java.util.Map;
public class DoubanCrawlerParse {
private static Logger logger = LogManager.getLogger(BaiduTiebaCrawlerParse.class);
private static HttpBoot httpBoot = new HttpBoot();
/**
*
* @Title: getDoubanData
......@@ -93,7 +94,7 @@ public class DoubanCrawlerParse {
// 下载数据页面
for (int i = 1; i <= 3; i++) {
try {
Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
Response response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
return response.body().string();
} catch (Exception e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace());
......@@ -115,7 +116,7 @@ public class DoubanCrawlerParse {
// 下载数据页面
for (int i = 1; i <= 3; i++) {
try {
Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
Response response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
return response.body().toString();
} catch (Exception e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace());
......
......@@ -28,6 +28,7 @@ import java.util.Map;
public class SoCrawlerParse {
private static Logger logger = LogManager.getLogger(SoCrawlerParse.class);
private static HttpBoot httpBoot = new HttpBoot();
private static final String pt = "360网页";
/**
......@@ -103,7 +104,7 @@ public class SoCrawlerParse {
// 下载数据页面
for (int i = 1; i <= 3; i++) {
try {
Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
Response response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
return response.body().string();
} catch (Exception e) {
logger.error("获取360新闻数据时出现问题,问题为:{}", e.fillInStackTrace());
......@@ -283,7 +284,7 @@ public class SoCrawlerParse {
String url = null;
if(link != null) {
try {
Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url),proxy,false);
Response response = httpBoot.syncCall(RequestUtils.wrapGet(url),proxy,false);
String htmlBody = response.body().toString();
if(htmlBody!=null) {
url = htmlBody.split("window.location.replace\\(\"")[1].split("\"\\)")[0];
......
......@@ -25,6 +25,7 @@ import java.util.Map;
public class SoNewsCrawlerParse {
private static Logger logger = LogManager.getLogger(SoNewsCrawlerParse.class);
private static HttpBoot httpBoot = new HttpBoot();
private static final String pt = "360新闻";
/**
......@@ -135,7 +136,7 @@ public class SoNewsCrawlerParse {
// 下载数据页面
for (int i = 1; i <= 3; i++) {
try {
Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
Response response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
return response.body().string();
} catch (Exception e) {
logger.error("获取360新闻数据时出现问题,问题为:{}", e.fillInStackTrace());
......
......@@ -26,6 +26,7 @@ import java.util.Map;
public class SougouNewsCrawlerParse {
private static Logger logger = LogManager.getLogger(BaiduTiebaCrawlerParse.class);
private static HttpBoot httpBoot = new HttpBoot();
private static final String pt = "搜狗新闻";
......@@ -129,7 +130,7 @@ public class SougouNewsCrawlerParse {
//下载数据页面
for(int i = 1; i<=3; i++){
try {
Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
Response response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
return response.body().string();
} catch (Exception e) {
logger.error("获取搜狗新闻数据时出现问题,问题为:{}", e.fillInStackTrace());
......@@ -153,7 +154,7 @@ public class SougouNewsCrawlerParse {
//下载数据页面
for(int i = 1; i<=3; i++){
try {
Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
Response response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
return response.body().string();
} catch (Exception e) {
logger.error("获取搜狗新闻数据时出现问题,问题为:{}", e.fillInStackTrace());
......
......@@ -25,6 +25,7 @@ import java.util.regex.Pattern;
public class SougouZhihuCrawlerParse{
private static Logger logger = LogManager.getLogger(SougouZhihuCrawlerParse.class);
private static HttpBoot httpBoot = new HttpBoot();
private static final String pt = "搜狗知乎";
......@@ -96,7 +97,7 @@ public class SougouZhihuCrawlerParse{
//下载数据页面
for(int i = 1; i<=3; i++){
try {
Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
Response response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
return response.body().string();
} catch (Exception e) {
logger.error("获取搜狗新闻数据时出现问题,问题为:{}", e.fillInStackTrace());
......@@ -123,7 +124,7 @@ public class SougouZhihuCrawlerParse{
//下载数据页面
for(int i = 1; i<=3; i++){
try {
Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
Response response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
return response.body().string();
} catch (Exception e) {
logger.error("获取搜狗新闻数据时出现问题,问题为:{}", e.fillInStackTrace());
......
......@@ -22,6 +22,7 @@ import java.util.*;
public class TianYaCrawlerParse {
private static Logger logger = LogManager.getLogger(TianYaCrawlerParse.class);
private static HttpBoot httpBoot = new HttpBoot();
private static final String pt = "天涯论坛";
/**
* @Title: getBaiduTiebaData
......@@ -86,7 +87,7 @@ public class TianYaCrawlerParse {
// 下载数据页面
for (int i = 1; i <= 3; i++) {
try {
Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
Response response = httpBoot.syncCall(RequestUtils.wrapGet(url,headerMap),proxy,false);
return response.body().string();
} catch (Exception e) {
logger.error("获取数据时出现问题,问题为:{}", e.fillInStackTrace());
......
package com.zhiwei.media_data_crawler.crawler;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.media_data_crawler.entity.ZhihuAnswer;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import java.net.Proxy;
import java.util.*;
/**
 * Crawler for the answers posted under a Zhihu question.
 *
 * <p>The question page is downloaded once to read its "number board" (the first two
 * {@code strong.NumberBoard-itemValue} elements). The answers are then paged through the
 * {@code /api/v4/questions/{id}/answers} endpoint, {@value #PAGE_SIZE} answers per request.
 */
public class ZhihuAnwserCrawlerParse {

    /** Answers requested per API call; also the step that turns a page index into an offset. */
    private static final int PAGE_SIZE = 20;

    /** Pause between two consecutive page requests, in milliseconds (single-threaded throttling). */
    private static final int PAGE_INTERVAL_MS = 8000;

    /** Key of the answer list inside the per-page result map. */
    private static final String KEY_DATA = "data";

    /** Key of the "another page may exist" flag inside the per-page result map. */
    private static final String KEY_MORE = "more";

    /**
     * Collects the answers of a question that were created after {@code endDate}, page by page.
     *
     * <p>Paging stops when a page reports that no further page exists, when a page contains no
     * answer newer than {@code endDate}, or when fetching/parsing a page fails. In the failure
     * case the answers gathered so far are returned (best effort) instead of raising.
     *
     * @param url     link of the Zhihu question page
     * @param endDate only answers created strictly after this instant are kept
     * @param proxy   proxy used for every request; may be {@code null}
     * @return the collected answers, possibly empty
     * @throws Exception when the link is not a question link or the question page cannot be downloaded
     */
    public static List<ZhihuAnswer> getAnswerList(String url, Date endDate, Proxy proxy) throws Exception {
        List<ZhihuAnswer> answerList = new ArrayList<>();
        String questionId = getQuestionId(url);
        String bord = getNumberBoard(url, proxy);
        boolean more = true;
        int page = 0;
        while (more) {
            try {
                Map<String, Object> dataMap = analsis(questionId, endDate, page, bord, proxy);
                @SuppressWarnings("unchecked")
                List<ZhihuAnswer> list = dataMap == null ? null : (List<ZhihuAnswer>) dataMap.get(KEY_DATA);
                if (list == null || list.isEmpty()) {
                    // Nothing newer than endDate on this page: stop paging.
                    more = false;
                } else {
                    answerList.addAll(list);
                    more = Boolean.TRUE.equals(dataMap.get(KEY_MORE));
                }
                if (more) {
                    // Throttle only when another request will actually follow.
                    ZhiWeiTools.sleep(PAGE_INTERVAL_MS);
                    page++;
                }
            } catch (Exception e) {
                // Deliberate best effort: keep the pages fetched so far and stop.
                more = false;
            }
        }
        return answerList;
    }

    /**
     * Reads the question's follower count and view count from the question page.
     *
     * @param url   link of the Zhihu question page
     * @param proxy proxy used for the request; may be {@code null}
     * @return {@code "<followers>,<views>"}; both default to {@code "0"} when fewer than two
     *         number-board elements are found
     * @throws Exception when the page cannot be downloaded
     */
    private static String getNumberBoard(String url, Proxy proxy) throws Exception {
        Document document = Jsoup.parse(download(url, proxy));
        Elements views = document.select("strong.NumberBoard-itemValue");
        String fllow = "0";
        String view = "0";
        if (views.size() >= 2) {
            fllow = views.get(0).attr("title");
            view = views.get(1).attr("title");
        }
        return fllow + "," + view;
    }

    /**
     * Fetches and parses a single page of answers.
     *
     * @param url     link of the Zhihu question page
     * @param page    zero-based page index
     * @param endDate only answers created strictly after this instant are kept
     * @param proxy   proxy used for the requests; may be {@code null}
     * @return a map with {@code "data"} (a {@code List<ZhihuAnswer>}) and {@code "more"} (a {@code Boolean})
     * @throws Exception when the link is invalid, a download fails or the response cannot be parsed
     */
    public static Map<String, Object> getAnswerList(String url, int page, Date endDate, Proxy proxy) throws Exception {
        String questionId = getQuestionId(url);
        String bord = getNumberBoard(url, proxy);
        return analsis(questionId, endDate, page, bord, proxy);
    }

    /**
     * Downloads one API page and converts the answers newer than {@code endDate}.
     *
     * @param questionId id of the question
     * @param endDate    only answers created strictly after this instant are kept
     * @param page       zero-based page index
     * @param bord       {@code "<followers>,<views>"} as produced by {@link #getNumberBoard}
     * @param proxy      proxy used for the request; may be {@code null}
     * @return a map with {@code "data"} (the converted answers) and {@code "more"} (whether the
     *         reported total extends beyond this page)
     * @throws Exception when the download fails or the response lacks the expected structure
     */
    private static Map<String, Object> analsis(String questionId, Date endDate, int page, String bord, Proxy proxy) throws Exception {
        String urlNext = getUrl(questionId, page);
        JSONObject dataJson = JSONObject.parseObject(download(urlNext, proxy));
        JSONObject paging = dataJson == null ? null : dataJson.getJSONObject("paging");
        JSONArray jsonArray = dataJson == null ? null : dataJson.getJSONArray("data");
        if (paging == null || jsonArray == null) {
            throw new Exception("知乎接口返回的数据格式异常: " + urlNext);
        }

        // The board values are the same for every answer: parse them once, not per answer.
        String[] boardParts = bord == null ? new String[0] : bord.split(",");
        Integer guanzhu_count = parseCount(boardParts, 0);
        Integer bord_count = parseCount(boardParts, 1);

        String from_url = "https://www.zhihu.com/question/" + questionId;
        List<ZhihuAnswer> answerList = new ArrayList<>();
        for (int i = 0; i < jsonArray.size(); i++) {
            JSONObject answerJson = jsonArray.getJSONObject(i);
            // created_time is multiplied by 1000 to obtain the millisecond value Date expects.
            Date time = new Date(answerJson.getLong("created_time") * 1000);
            if (!time.after(endDate)) {
                continue;
            }
            JSONObject authorJson = answerJson.getJSONObject("author");
            // NOTE(review): public Zhihu answer links usually read ".../answer/{id}" (singular);
            // confirm that "/answers/" is what downstream consumers expect before changing it.
            String link = from_url + "/answers/" + answerJson.getString("id");
            String author = authorJson.getString("name");
            String authorUrl = "https://www.zhihu.com/people/" + authorJson.getString("url_token");
            String content = ZhiWeiTools.delHTMLTag(answerJson.getString("content"));
            String title = answerJson.getJSONObject("question").getString("title");
            Integer voteup_count = answerJson.getInteger("voteup_count");
            Integer comment_count = answerJson.getInteger("comment_count");
            answerList.add(new ZhihuAnswer(link, from_url, title, time, author, authorUrl, content,
                    voteup_count, comment_count, guanzhu_count, bord_count));
        }

        // This request covered offsets [page*PAGE_SIZE, (page+1)*PAGE_SIZE); another page exists
        // only when the reported total reaches past that window. Without a total, a full page is
        // taken as the hint that more may follow.
        Integer count = paging.getInteger("totals");
        boolean more = count != null
                ? (page + 1) * PAGE_SIZE < count
                : jsonArray.size() >= PAGE_SIZE;

        Map<String, Object> resultMap = new HashMap<>();
        resultMap.put(KEY_DATA, answerList);
        resultMap.put(KEY_MORE, more);
        return resultMap;
    }

    /**
     * Leniently reads one numeric field of the number board.
     *
     * @param parts the comma-separated board fields
     * @param index position of the wanted field
     * @return the parsed value, or 0 when the field is missing or not a number
     */
    private static Integer parseCount(String[] parts, int index) {
        if (index >= parts.length) {
            return 0;
        }
        try {
            return Integer.valueOf(parts[index].trim());
        } catch (NumberFormatException e) {
            return 0;
        }
    }

    /**
     * Performs a GET request and returns the response body as text.
     *
     * @param url   address to fetch
     * @param proxy proxy used for the request; may be {@code null}
     * @return the response body
     * @throws Exception when the request fails or the body cannot be read
     */
    private static String download(String url, Proxy proxy) throws Exception {
        // try-with-resources closes the response once the body has been read.
        try (Response response = HttpBoot.syncCall(RequestUtils.wrapGet(url), proxy)) {
            return response.body().string();
        }
    }

    /**
     * Extracts the question id from a question link.
     *
     * <p>The id is the path segment following the first {@code "question/"}; anything from the
     * next {@code '/'}, {@code '?'} or {@code '#'} onwards is dropped.
     *
     * @param url link of the Zhihu question page
     * @return the question id
     * @throws Exception when the link is {@code null} or contains no non-empty id after {@code "question/"}
     */
    private static String getQuestionId(String url) throws Exception {
        final String marker = "question/";
        int start = url == null ? -1 : url.indexOf(marker);
        if (start >= 0) {
            String id = url.substring(start + marker.length()).split("[/?#]")[0];
            if (!id.isEmpty()) {
                return id;
            }
        }
        throw new Exception("链接不符合要求,不是正常的知乎问题链接");
    }

    /**
     * Builds the API address of one answer page.
     *
     * @param questionId id of the question
     * @param page       zero-based page index; the offset sent is {@code page * PAGE_SIZE}
     * @return the request URL, sorted by creation time
     */
    private static String getUrl(String questionId, int page) {
        return "https://www.zhihu.com/api/v4/questions/" + questionId + "/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2" +
                "Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit" +
                "%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2" +
                "Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp" +
                "%3Bdata%5B*%5D.mark_infos%5B*%5D.url%3Bdata%5B*%5D.author.follower_count%2Cbadge%5B*%5D.topics&offset=" + page * PAGE_SIZE + "&limit=" + PAGE_SIZE + "&sort_by=created";
    }

    /**
     * Manual smoke run against a fixed question and cut-off date.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String url = "https://www.zhihu.com/question/288128510";
        Date endDate = TimeParse.stringFormartDate("2018-09-20 08:00:00");
        try {
            getAnswerList(url, endDate, null);
        } catch (Exception e) {
            // fillInStackTrace() only resets the trace and prints nothing; report the failure.
            e.printStackTrace();
        }
    }
}
package com.zhiwei.media_data_crawler.crawler;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.media_data_crawler.entity.ZhihuAnswer;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import java.net.Proxy;
import java.util.*;
/**
 * Crawler for the answers posted under a Zhihu question.
 *
 * <p>The question page is downloaded once to read its "number board" (the first two
 * {@code strong.NumberBoard-itemValue} elements). The answers are then paged through the
 * {@code /api/v4/questions/{id}/answers} endpoint, {@value #PAGE_SIZE} answers per request.
 */
public class ZhihuAnwserCrawlerParse {

    /** Answers requested per API call; also the step that turns a page index into an offset. */
    private static final int PAGE_SIZE = 20;

    /** Pause between two consecutive page requests, in milliseconds (single-threaded throttling). */
    private static final int PAGE_INTERVAL_MS = 8000;

    /** Key of the answer list inside the per-page result map. */
    private static final String KEY_DATA = "data";

    /** Key of the "another page may exist" flag inside the per-page result map. */
    private static final String KEY_MORE = "more";

    /** HTTP client instance shared by every request issued from this class. */
    private static HttpBoot httpBoot = new HttpBoot();

    /**
     * Collects the answers of a question that were created after {@code endDate}, page by page.
     *
     * <p>Paging stops when a page reports that no further page exists, when a page contains no
     * answer newer than {@code endDate}, or when fetching/parsing a page fails. In the failure
     * case the answers gathered so far are returned (best effort) instead of raising.
     *
     * @param url     link of the Zhihu question page
     * @param endDate only answers created strictly after this instant are kept
     * @param proxy   proxy used for every request; may be {@code null}
     * @return the collected answers, possibly empty
     * @throws Exception when the link is not a question link or the question page cannot be downloaded
     */
    public static List<ZhihuAnswer> getAnswerList(String url, Date endDate, Proxy proxy) throws Exception {
        List<ZhihuAnswer> answerList = new ArrayList<>();
        String questionId = getQuestionId(url);
        String bord = getNumberBoard(url, proxy);
        boolean more = true;
        int page = 0;
        while (more) {
            try {
                Map<String, Object> dataMap = analsis(questionId, endDate, page, bord, proxy);
                @SuppressWarnings("unchecked")
                List<ZhihuAnswer> list = dataMap == null ? null : (List<ZhihuAnswer>) dataMap.get(KEY_DATA);
                if (list == null || list.isEmpty()) {
                    // Nothing newer than endDate on this page: stop paging.
                    more = false;
                } else {
                    answerList.addAll(list);
                    more = Boolean.TRUE.equals(dataMap.get(KEY_MORE));
                }
                if (more) {
                    // Throttle only when another request will actually follow.
                    ZhiWeiTools.sleep(PAGE_INTERVAL_MS);
                    page++;
                }
            } catch (Exception e) {
                // Deliberate best effort: keep the pages fetched so far and stop.
                more = false;
            }
        }
        return answerList;
    }

    /**
     * Reads the question's follower count and view count from the question page.
     *
     * @param url   link of the Zhihu question page
     * @param proxy proxy used for the request; may be {@code null}
     * @return {@code "<followers>,<views>"}; both default to {@code "0"} when fewer than two
     *         number-board elements are found
     * @throws Exception when the page cannot be downloaded
     */
    private static String getNumberBoard(String url, Proxy proxy) throws Exception {
        Document document = Jsoup.parse(download(url, proxy));
        Elements views = document.select("strong.NumberBoard-itemValue");
        String fllow = "0";
        String view = "0";
        if (views.size() >= 2) {
            fllow = views.get(0).attr("title");
            view = views.get(1).attr("title");
        }
        return fllow + "," + view;
    }

    /**
     * Fetches and parses a single page of answers.
     *
     * @param url     link of the Zhihu question page
     * @param page    zero-based page index
     * @param endDate only answers created strictly after this instant are kept
     * @param proxy   proxy used for the requests; may be {@code null}
     * @return a map with {@code "data"} (a {@code List<ZhihuAnswer>}) and {@code "more"} (a {@code Boolean})
     * @throws Exception when the link is invalid, a download fails or the response cannot be parsed
     */
    public static Map<String, Object> getAnswerList(String url, int page, Date endDate, Proxy proxy) throws Exception {
        String questionId = getQuestionId(url);
        String bord = getNumberBoard(url, proxy);
        return analsis(questionId, endDate, page, bord, proxy);
    }

    /**
     * Downloads one API page and converts the answers newer than {@code endDate}.
     *
     * @param questionId id of the question
     * @param endDate    only answers created strictly after this instant are kept
     * @param page       zero-based page index
     * @param bord       {@code "<followers>,<views>"} as produced by {@link #getNumberBoard}
     * @param proxy      proxy used for the request; may be {@code null}
     * @return a map with {@code "data"} (the converted answers) and {@code "more"} (whether the
     *         reported total extends beyond this page)
     * @throws Exception when the download fails or the response lacks the expected structure
     */
    private static Map<String, Object> analsis(String questionId, Date endDate, int page, String bord, Proxy proxy) throws Exception {
        String urlNext = getUrl(questionId, page);
        JSONObject dataJson = JSONObject.parseObject(download(urlNext, proxy));
        JSONObject paging = dataJson == null ? null : dataJson.getJSONObject("paging");
        JSONArray jsonArray = dataJson == null ? null : dataJson.getJSONArray("data");
        if (paging == null || jsonArray == null) {
            throw new Exception("知乎接口返回的数据格式异常: " + urlNext);
        }

        // The board values are the same for every answer: parse them once, not per answer.
        String[] boardParts = bord == null ? new String[0] : bord.split(",");
        Integer guanzhu_count = parseCount(boardParts, 0);
        Integer bord_count = parseCount(boardParts, 1);

        String from_url = "https://www.zhihu.com/question/" + questionId;
        List<ZhihuAnswer> answerList = new ArrayList<>();
        for (int i = 0; i < jsonArray.size(); i++) {
            JSONObject answerJson = jsonArray.getJSONObject(i);
            // created_time is multiplied by 1000 to obtain the millisecond value Date expects.
            Date time = new Date(answerJson.getLong("created_time") * 1000);
            if (!time.after(endDate)) {
                continue;
            }
            JSONObject authorJson = answerJson.getJSONObject("author");
            // NOTE(review): public Zhihu answer links usually read ".../answer/{id}" (singular);
            // confirm that "/answers/" is what downstream consumers expect before changing it.
            String link = from_url + "/answers/" + answerJson.getString("id");
            String author = authorJson.getString("name");
            String authorUrl = "https://www.zhihu.com/people/" + authorJson.getString("url_token");
            String content = ZhiWeiTools.delHTMLTag(answerJson.getString("content"));
            String title = answerJson.getJSONObject("question").getString("title");
            Integer voteup_count = answerJson.getInteger("voteup_count");
            Integer comment_count = answerJson.getInteger("comment_count");
            answerList.add(new ZhihuAnswer(link, from_url, title, time, author, authorUrl, content,
                    voteup_count, comment_count, guanzhu_count, bord_count));
        }

        // This request covered offsets [page*PAGE_SIZE, (page+1)*PAGE_SIZE); another page exists
        // only when the reported total reaches past that window. Without a total, a full page is
        // taken as the hint that more may follow.
        Integer count = paging.getInteger("totals");
        boolean more = count != null
                ? (page + 1) * PAGE_SIZE < count
                : jsonArray.size() >= PAGE_SIZE;

        Map<String, Object> resultMap = new HashMap<>();
        resultMap.put(KEY_DATA, answerList);
        resultMap.put(KEY_MORE, more);
        return resultMap;
    }

    /**
     * Leniently reads one numeric field of the number board.
     *
     * @param parts the comma-separated board fields
     * @param index position of the wanted field
     * @return the parsed value, or 0 when the field is missing or not a number
     */
    private static Integer parseCount(String[] parts, int index) {
        if (index >= parts.length) {
            return 0;
        }
        try {
            return Integer.valueOf(parts[index].trim());
        } catch (NumberFormatException e) {
            return 0;
        }
    }

    /**
     * Performs a GET request and returns the response body as text.
     *
     * @param url   address to fetch
     * @param proxy proxy used for the request; may be {@code null}
     * @return the response body
     * @throws Exception when the request fails or the body cannot be read
     */
    private static String download(String url, Proxy proxy) throws Exception {
        // try-with-resources closes the response once the body has been read.
        try (Response response = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy)) {
            return response.body().string();
        }
    }

    /**
     * Extracts the question id from a question link.
     *
     * <p>The id is the path segment following the first {@code "question/"}; anything from the
     * next {@code '/'}, {@code '?'} or {@code '#'} onwards is dropped.
     *
     * @param url link of the Zhihu question page
     * @return the question id
     * @throws Exception when the link is {@code null} or contains no non-empty id after {@code "question/"}
     */
    private static String getQuestionId(String url) throws Exception {
        final String marker = "question/";
        int start = url == null ? -1 : url.indexOf(marker);
        if (start >= 0) {
            String id = url.substring(start + marker.length()).split("[/?#]")[0];
            if (!id.isEmpty()) {
                return id;
            }
        }
        throw new Exception("链接不符合要求,不是正常的知乎问题链接");
    }

    /**
     * Builds the API address of one answer page.
     *
     * @param questionId id of the question
     * @param page       zero-based page index; the offset sent is {@code page * PAGE_SIZE}
     * @return the request URL, sorted by creation time
     */
    private static String getUrl(String questionId, int page) {
        return "https://www.zhihu.com/api/v4/questions/" + questionId + "/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2" +
                "Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit" +
                "%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2" +
                "Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp" +
                "%3Bdata%5B*%5D.mark_infos%5B*%5D.url%3Bdata%5B*%5D.author.follower_count%2Cbadge%5B*%5D.topics&offset=" + page * PAGE_SIZE + "&limit=" + PAGE_SIZE + "&sort_by=created";
    }

    /**
     * Manual smoke run against a fixed question and cut-off date.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String url = "https://www.zhihu.com/question/288128510";
        Date endDate = TimeParse.stringFormartDate("2018-09-20 08:00:00");
        try {
            getAnswerList(url, endDate, null);
        } catch (Exception e) {
            // fillInStackTrace() only resets the trace and prints nothing; report the failure.
            e.printStackTrace();
        }
    }
}
package com.zhiwei.media_data_crawler.data;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;
import com.zhiwei.media_data_crawler.crawler.*;
import com.zhiwei.media_data_crawler.entity.*;
import com.zhiwei.tools.tools.ZhiWeiTools;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
public class DataCrawler {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment