Commit 5988fd8b by cwy

知乎采集 代理 使用方式修改 升级版本

parent aacd8761
...@@ -16,8 +16,10 @@ import org.slf4j.LoggerFactory; ...@@ -16,8 +16,10 @@ import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils; import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.media_data_crawler.entity.ZhihuAnswer; import com.zhiwei.media_data_crawler.entity.ZhihuAnswer;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.ZhiWeiTools; import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response; import okhttp3.Response;
...@@ -57,7 +59,7 @@ public class ZhihuAnwserCrawlerParse { ...@@ -57,7 +59,7 @@ public class ZhihuAnwserCrawlerParse {
* @return * @return
* @throws Exception * @throws Exception
*/ */
public static List<ZhihuAnswer> getAnswerList(String url, Date endDate, Proxy proxy) throws Exception{ public static List<ZhihuAnswer> getAnswerList(String url, Date endDate, ProxyHolder proxy) throws Exception{
try{ try{
List<ZhihuAnswer> answerList = new ArrayList<>(); List<ZhihuAnswer> answerList = new ArrayList<>();
String questionId = getQuestionId(url); String questionId = getQuestionId(url);
...@@ -100,7 +102,7 @@ public class ZhihuAnwserCrawlerParse { ...@@ -100,7 +102,7 @@ public class ZhihuAnwserCrawlerParse {
* @return * @return
* @throws Exception * @throws Exception
*/ */
private static String getNumberBoard(String url, Proxy proxy) throws Exception{ private static String getNumberBoard(String url, ProxyHolder proxy) throws Exception{
try{ try{
String body = download(url, proxy); String body = download(url, proxy);
Document document = Jsoup.parse(body); Document document = Jsoup.parse(body);
...@@ -130,7 +132,7 @@ public class ZhihuAnwserCrawlerParse { ...@@ -130,7 +132,7 @@ public class ZhihuAnwserCrawlerParse {
* @return * @return
* @throws Exception * @throws Exception
*/ */
public static Map<String,Object> getAnswerList(String url, int page, Date endDate, Proxy proxy) throws Exception{ public static Map<String,Object> getAnswerList(String url, int page, Date endDate, ProxyHolder proxy) throws Exception{
try{ try{
String questionId = getQuestionId(url); String questionId = getQuestionId(url);
String bord = getNumberBoard(url, proxy); String bord = getNumberBoard(url, proxy);
...@@ -150,7 +152,7 @@ public class ZhihuAnwserCrawlerParse { ...@@ -150,7 +152,7 @@ public class ZhihuAnwserCrawlerParse {
* @return * @return
* @throws Exception * @throws Exception
*/ */
private static Map<String,Object> analsis(String questionId, Date endDate, int page, String bord, Proxy proxy) throws Exception{ private static Map<String,Object> analsis(String questionId, Date endDate, int page, String bord, ProxyHolder proxy) throws Exception{
try{ try{
boolean more = true; boolean more = true;
List<ZhihuAnswer> answerList = new ArrayList<>(); List<ZhihuAnswer> answerList = new ArrayList<>();
...@@ -199,7 +201,7 @@ public class ZhihuAnwserCrawlerParse { ...@@ -199,7 +201,7 @@ public class ZhihuAnwserCrawlerParse {
* @return * @return
* @throws Exception * @throws Exception
*/ */
private static String download(String url, Proxy proxy) throws Exception{ private static String download(String url, ProxyHolder proxy) throws Exception{
try(Response response = httpBoot.syncCall(RequestUtils.wrapGet(url),proxy)){ try(Response response = httpBoot.syncCall(RequestUtils.wrapGet(url),proxy)){
return response.body().string(); return response.body().string();
}catch (Exception e){ }catch (Exception e){
...@@ -243,14 +245,15 @@ public class ZhihuAnwserCrawlerParse { ...@@ -243,14 +245,15 @@ public class ZhihuAnwserCrawlerParse {
public static void main(String[] args){ public static void main(String[] args){
// String url = "https://www.zhihu.com/question/288128510"; String url = "https://www.zhihu.com/question/67992264";
// Date endDate = TimeParse.stringFormartDate("2018-09-20 08:00:00"); Date endDate = TimeParse.stringFormartDate("2010-09-20 08:00:00");
// try{ try{
// getAnswerList(url,endDate, null); getAnswerList(url,endDate, null);
// }catch (Exception e){ }catch (Exception e){
// e.fillInStackTrace(); e.fillInStackTrace();
// } }
getAnswerCount("https://www.zhihu.com/question/41539825", null); int count = getAnswerCount("https://www.zhihu.com/question/67992264", null);
System.out.println(count);
} }
......
...@@ -391,7 +391,7 @@ public class DataCrawler { ...@@ -391,7 +391,7 @@ public class DataCrawler {
* @return * @return
* @throws Exception * @throws Exception
*/ */
public static List<ZhihuAnswer> getAnswerList(String url, Date endDate, Proxy proxy) throws Exception{ public static List<ZhihuAnswer> getAnswerList(String url, Date endDate, ProxyHolder proxy) throws Exception{
try{ try{
return ZhihuAnwserCrawlerParse.getAnswerList(url,endDate, proxy); return ZhihuAnwserCrawlerParse.getAnswerList(url,endDate, proxy);
}catch (Exception e){ }catch (Exception e){
...@@ -408,7 +408,7 @@ public class DataCrawler { ...@@ -408,7 +408,7 @@ public class DataCrawler {
* @return * @return
* @throws Exception * @throws Exception
*/ */
public static Map<String,Object> getAnswerList(String url, int page, Date endDate, Proxy proxy) throws Exception{ public static Map<String,Object> getAnswerList(String url, int page, Date endDate, ProxyHolder proxy) throws Exception{
try{ try{
return ZhihuAnwserCrawlerParse.getAnswerList(url,page,endDate, proxy); return ZhihuAnwserCrawlerParse.getAnswerList(url,page,endDate, proxy);
}catch (Exception e){ }catch (Exception e){
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment