Commit ee3aa8bd by [zhangzhiwei]

修改并更新今日头条采集程序

parent 2191591c
......@@ -6,12 +6,13 @@ import java.util.Date;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.tools.tools.URLCodeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools;
import com.zhiwei.toutiao.bean.TouTiaoAccount;
......@@ -27,7 +28,7 @@ public class TouTiaoAccountParse {
private TouTiaoAccountParse() {}
private static Map<String, String> headerMap;
private static Logger logger = LoggerFactory.getLogger(TouTiaoAccountParse.class);
private static Logger logger = LogManager.getLogger(TouTiaoAccountParse.class);
/**
* @Title: getTouTiaoAccountInfo
......@@ -44,13 +45,13 @@ public class TouTiaoAccountParse {
TouTiaoAccount tta = null;
try {
String htmlBody = null;
htmlBody = HttpClientTemplateOK.get(url, proxy, headerMap);
htmlBody = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), proxy).body().string();
if(htmlBody != null ){
tta = parseHtmlByAccount(htmlBody, name, proxy);
if(tta == null){
url = "https://www.toutiao.com/search_content/?offset=0&format=json&keyword="+URLCodeUtil.getURLEncode(name, "utf-8")+"&autoload=true&count=20&cur_tab=4&from=media";
headerMap.put("Referer","https://www.toutiao.com/search/?keyword="+URLCodeUtil.getURLEncode(name, "utf-8"));
htmlBody = HttpClientTemplateOK.get(url, proxy, headerMap);
htmlBody = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), proxy).body().string();
if(htmlBody != null){
tta = parseHtmlByAccount(htmlBody, name, proxy);
}
......@@ -58,7 +59,7 @@ public class TouTiaoAccountParse {
}else {
url = "https://www.toutiao.com/search_content/?offset=0&format=json&keyword="+URLCodeUtil.getURLEncode(name, "utf-8")+"&autoload=true&count=20&cur_tab=4&from=media";
headerMap.put("Referer","https://www.toutiao.com/search/?keyword="+URLCodeUtil.getURLEncode(name, "utf-8"));
htmlBody = HttpClientTemplateOK.get(url, proxy, headerMap);
htmlBody = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), proxy).body().string();
if(htmlBody != null){
tta = parseHtmlByAccount(htmlBody, name, proxy);
}
......@@ -79,7 +80,7 @@ public class TouTiaoAccountParse {
TouTiaoAccount tta = null;
try {
String htmlBody = null;
htmlBody = HttpClientTemplateOK.get(url, proxy, headerMap);
htmlBody = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), proxy).body().string();
if(htmlBody != null){
tta = parseAccountByUserId(htmlBody, user_id, proxy);
}
......@@ -112,7 +113,7 @@ public class TouTiaoAccountParse {
headerMap = Tools.getTouTiaoHeader();
try {
String htmlBody = null;
htmlBody = HttpClientTemplateOK.get(url, proxy, headerMap);
htmlBody = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), proxy).body().string();
if(htmlBody != null){
JSONObject json = JSONObject.parseObject(htmlBody);
list.addAll(parseHtmlByWord(json, proxy));
......@@ -152,7 +153,7 @@ public class TouTiaoAccountParse {
headerMap.put("Host", "is.snssdk.com");
try {
String htmlBody = null;
htmlBody = HttpClientTemplateOK.get(url, proxy, headerMap);
htmlBody = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), proxy).body().string();
if(htmlBody != null && htmlBody.contains("name")){
JSONObject json = JSONObject.parseObject(htmlBody);
more = json.getJSONObject("data").getBooleanValue("has_more");
......
......@@ -19,12 +19,13 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.toutiao.bean.TouTiaoArticle;
import com.zhiwei.toutiao.util.Tools;
......@@ -36,7 +37,7 @@ import com.zhiwei.toutiao.util.Tools;
*/
public class TouTiaoArticleParse {
private TouTiaoArticleParse() {}
private static Logger logger = LoggerFactory.getLogger(TouTiaoArticleParse.class);
private static Logger logger = LogManager.getLogger(TouTiaoArticleParse.class);
/***
* 获取头条数据
......@@ -59,7 +60,7 @@ public class TouTiaoArticleParse {
headerMap.put("Referer", url);
String htmlBody = null;
try {
htmlBody = HttpClientTemplateOK.get(url, proxy, headerMap);
htmlBody = HttpBoot.syncCall(RequestUtils.wrapGet(url, headerMap), proxy).body().string();
if(htmlBody != null && htmlBody.contains("behot_time")){
Map<String, Object> ttList = parseHtmlByAccount(htmlBody, endData);
if(ttList!=null && ttList.size()>0){
......@@ -154,7 +155,7 @@ public class TouTiaoArticleParse {
Map<String, String> headerMap = Tools.getTouTiaoHeader();
headerMap.put("Referer", "https://www.toutiao.com/c/user/" + user_id + "/");
try {
String htmlBody = HttpClientTemplateOK.get(url, proxy, headerMap);
String htmlBody = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), proxy).body().string();
if (htmlBody != null) {
Map<String, Object> dataMap = parseHtmlByMicroAccount(htmlBody, endDate);
if(dataMap!=null && dataMap.size()>0){
......@@ -182,9 +183,7 @@ public class TouTiaoArticleParse {
* @param @return 设定文件
* @return Map<String,Object> 返回类型
*/
@SuppressWarnings("unlikely-arg-type")
private static Map<String, Object> parseHtmlByMicroAccount(String htmlBody, Date endDate) {
Map<String, Object> map = new HashMap<String, Object>();
Long max_behot_time = null;
List<TouTiaoArticle> dataList = new ArrayList<TouTiaoArticle>();
......
......@@ -7,8 +7,8 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONException;
......@@ -26,7 +26,7 @@ import com.zhiwei.toutiao.util.Tools;
*/
public class TouTiaoChannelParse {
private static Map<String, String> headerMap;
private static Logger logger = LoggerFactory.getLogger(TouTiaoChannelParse.class);
private static Logger logger = LogManager.getLogger(TouTiaoChannelParse.class);
/**
* @Title: touTiaoChannel
......
......@@ -8,8 +8,8 @@ import java.util.Date;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
......@@ -26,7 +26,7 @@ import com.zhiwei.toutiao.util.Tools;
*/
public class TouTiaoCommentParse {
private static Logger logger = LoggerFactory.getLogger(TouTiaoCommentParse.class);
private static Logger logger = LogManager.getLogger(TouTiaoCommentParse.class);
/**
......
......@@ -20,10 +20,10 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
......@@ -40,7 +40,7 @@ import com.zhiwei.toutiao.util.Tools;
public class TouTiaoParse {
private Map<String, String> headerMap ;
private Logger logger = LoggerFactory.getLogger(TouTiaoCommentParse.class);
// Logger for this class. It is keyed on TouTiaoParse (not TouTiaoCommentParse, as
// previously copy-pasted) so that log output is attributed to the correct class.
private static Logger logger = LogManager.getLogger(TouTiaoParse.class);
/***
* 获取头条数据
......
......@@ -7,10 +7,10 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
......@@ -28,7 +28,7 @@ import com.zhiwei.toutiao.util.Tools;
public class TouTiaoQuestionAnswerParse {
private static Map<String, String> headerMap ;
private static Logger logger = LoggerFactory.getLogger(TouTiaoQuestionAnswerParse.class);
private static Logger logger = LogManager.getLogger(TouTiaoQuestionAnswerParse.class);
public static Map<String,Object> getAnserList(String questionId,int page,int req_type,Proxy proxy){
......
......@@ -6,8 +6,8 @@ import java.util.Date;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
......@@ -25,7 +25,7 @@ import com.zhiwei.toutiao.util.Tools;
public class TouTiaoQuestionParse {
private static Map<String, String> headerMap;
private static Logger logger = LoggerFactory.getLogger(TouTiaoQuestionParse.class);
private static Logger logger = LogManager.getLogger(TouTiaoQuestionParse.class);
/**
* @Title: getSearchTouTiaoQuestion
......
......@@ -7,8 +7,8 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONException;
......@@ -27,7 +27,7 @@ import com.zhiwei.toutiao.util.Tools;
public class TouTiaoSearchParse {
private static Map<String, String> headerMap;
private static Logger logger = LoggerFactory.getLogger(TouTiaoSearchParse.class);
private static Logger logger = LogManager.getLogger(TouTiaoSearchParse.class);
/**
* @Title: touTiaoSearchByWord
......
package com.zhiwei.wangyi.parse;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.ZhiWeiTools;
import com.zhiwei.toutiao.util.Tools;
import com.zhiwei.wangyi.bean.WangYiNews;
public class WangyiNewParse {
private static Logger logger = LoggerFactory.getLogger(WangyiNewParse.class);
private static Logger logger = LogManager.getLogger(WangyiNewParse.class);
private static boolean finish = true;
/**
* @Title: getWYHistory
......@@ -27,7 +27,7 @@ public class WangyiNewParse {
* @return List<WangYiNews> 返回类型
* @throws Exception
*/
public static List<WangYiNews> getWYHistory(String tid,Date endTime) throws Exception
public static List<WangYiNews> getWYHistory(String tid,Date endTime,Proxy proxy) throws Exception
{
List<WangYiNews> list = new ArrayList<WangYiNews>();
Map<String,String> headerMap = Tools.getWangYiHeader();
......@@ -38,7 +38,7 @@ public class WangyiNewParse {
{
String url = "http://c.m.163.com/nc/subscribe/list/"+tid+"/all/"+page*20+"-20.html";
System.out.println(url);
String htmlBody = HttpClientTemplateOK.get(url, null,headerMap);
String htmlBody = HttpBoot.syncCall(RequestUtils.wrapGet(url,headerMap), proxy).body().string();
if(htmlBody!=null)
{
List<WangYiNews> wyList = analysis(htmlBody,endTime);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment