Commit ec916427 by yangchen

头条关键词 采集修改

parent 5ade0fda
......@@ -43,7 +43,7 @@ public class TouTiaoArticleParse {
}
private static Logger logger = LogManager.getLogger(TouTiaoArticleParse.class);
private static HttpBoot httpBoot = new HttpBoot();
private static HttpBoot httpBoot = new HttpBoot(true);
/***
* 获取头条数据
......@@ -169,7 +169,7 @@ public class TouTiaoArticleParse {
String url = "https://www.toutiao.com/c/user/article/?page_type=1&user_id=" + user_id + "&max_behot_time="
+ max_behot_time + "&count=20&as=" + as + "&cp=" + cp + "&_signature=" + _signature;
logger.info("当前采集的历史文章链接:::{}", url);
Map<String, String> headerMap = new HashMap<String, String>();
Map<String, String> headerMap = new HashMap<>();
headerMap.put("user-agent",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36");
headerMap.put("referer", "https://www.toutiao.com/c/user/" + user_id + "/");
......
package com.zhiwei.toutiao.parse;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
......@@ -13,10 +12,11 @@ import org.apache.logging.log4j.Logger;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONException;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.tools.httpclient.HttpClientTemplateOK;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.toutiao.bean.TouTiaoArticle;
import com.zhiwei.toutiao.util.Tools;
/**
* @ClassName: TouTiaoSearch
......@@ -26,24 +26,23 @@ import com.zhiwei.toutiao.util.Tools;
*/
public class TouTiaoSearchParse {
private static Map<String, String> headerMap;
private static Logger logger = LogManager.getLogger(TouTiaoSearchParse.class);
private static HttpBoot httpBoot = new HttpBoot();
/**
* @Title: touTiaoSearchByWord
* @author hero
* @Description: TODO(根据关键词采集今日头条数据)
* @Description: (根据关键词采集今日头条数据)
* @param @param url
* @param @return 设定文件
* @return List<TouTiaoArticle> 返回类型
* @throws Exception
*/
public static Map<String,Object> touTiaoSearchByWord(String url,Proxy proxy ) throws Exception{
headerMap = Tools.getTouTiaoSearchHeader();
headerMap.put("referer", url);
public static Map<String,Object> touTiaoSearchByWord(String url,ProxyHolder proxy) throws Exception{
String htmlBody = null;
try {
htmlBody = HttpClientTemplateOK.get(url, proxy, headerMap);
htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url),proxy).body().string();
if(htmlBody != null){
Map<String,Object> dataMap = parseHtmlBySearch(htmlBody);
if(dataMap!=null && dataMap.size()>0){
......@@ -60,7 +59,7 @@ public class TouTiaoSearchParse {
/**
* @Title: parseHtmlBySearch
* @author hero
* @Description: TODO(解析文本)
* @Description: (解析文本)
* @param @param htmlBody
* @param @return 设定文件
* @return List<TouTiaoArticle> 返回类型
......@@ -72,8 +71,8 @@ public class TouTiaoSearchParse {
int has_more = jsonObject.getIntValue("has_more");
if(null!=dataList && dataList.size()>0){
Map<String,Object> result = new HashMap<String,Object>();
List<TouTiaoArticle> ttList = new ArrayList<TouTiaoArticle>();
Map<String,Object> result = new HashMap<>();
List<TouTiaoArticle> ttList = new ArrayList<>();
for (int i = 0; i < dataList.size(); i++) {
JSONObject jso = dataList.getJSONObject(i);
try {
......@@ -93,8 +92,7 @@ public class TouTiaoSearchParse {
TouTiaoArticle tt = new TouTiaoArticle(url, title, user_id,source, date, content, comment_count, "-1", "-1", "-1","今日头条",null);
ttList.add(tt);
} catch (JSONException e) {
logger.debug("解析数据出现问题", e.fillInStackTrace());
continue;
logger.debug("解析数据出现问题 {}", e);
}
}
result.put("data", ttList);
......
package com.zhiwei.toutiao.test;
import java.util.List;
import java.util.Map;
import com.zhiwei.toutiao.bean.Signature;
import com.zhiwei.toutiao.bean.TouTiaoArticle;
import com.zhiwei.toutiao.parse.TouTiaoChannelParse;
import com.zhiwei.toutiao.util.Tools;
/**
 * Manual console example for Toutiao channel parsing.
 *
 * <p>Requests up to three pages of the {@code news_tech} channel feed through
 * {@code TouTiaoChannelParse.touTiaoChannel}, prints every article returned, and
 * feeds the {@code "next"} value of each result back in as the
 * {@code max_behot_time} cursor of the following request.
 *
 * @author hero
 * @date 2017-07-24 17:10:52
 */
public class TouTiaoChannelExample {

    /**
     * Runs the paging example and prints progress to standard output.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        // Paging cursor sent as max_behot_time; starts at 0 for the first request.
        long max_behot_time = 0;
        for (int i = 0; i < 3; i++) {
            System.out.println("i==============" + i);

            // A fresh Signature is created per request to obtain the "as"/"cp" query values.
            Signature signature = new Signature();
            String as = signature.getAs();
            String cp = signature.getCp();
            String url = "http://www.toutiao.com/api/pc/feed/?category=news_tech&utm_source=toutiao"
                    + "&widen=1&max_behot_time=" + max_behot_time + "&max_behot_time_tmp=" + max_behot_time
                    + "&tadrequire=true&as=" + as + "&cp=" + cp;
            System.out.println("url:" + url);

            try {
                Map<String, Object> result = TouTiaoChannelParse.touTiaoChannel(url, null);
                if (result != null) {
                    Long next = (Long) result.get("next");

                    // The map is typed Map<String, Object>, so this cast cannot be checked at
                    // compile time. NOTE(review): assumes the parser stores a
                    // List<TouTiaoArticle> under "data" - confirm against TouTiaoChannelParse.
                    @SuppressWarnings("unchecked")
                    List<TouTiaoArticle> ttList = (List<TouTiaoArticle>) result.get("data");

                    // A missing "data" entry is reported as size 0 instead of throwing a
                    // NullPointerException, which previously skipped the cursor update below
                    // and made the next iteration request the same page again.
                    int size = (ttList == null) ? 0 : ttList.size();
                    System.out.println("ttlist size is " + size);
                    if (ttList != null) {
                        for (TouTiaoArticle tt : ttList) {
                            System.out.println(tt);
                        }
                    }

                    // Without a "next" cursor there is nothing further to request.
                    if (next == null) {
                        break;
                    }
                    max_behot_time = next;
                }
            } catch (Exception e) {
                // Console example: report the failure and carry on with the next iteration.
                e.printStackTrace();
            }
        }
    }
}
//package com.zhiwei.toutiao.test;
//
//import java.util.List;
//import java.util.Map;
//
//import com.zhiwei.toutiao.bean.Signature;
//import com.zhiwei.toutiao.bean.TouTiaoArticle;
//import com.zhiwei.toutiao.parse.TouTiaoChannelParse;
//import com.zhiwei.toutiao.util.Tools;
//
///**
// * @ClassName: TouTiaoChannelExample
// * @Description: TODO(头条频道解析测试)
// * @author hero
// * @date 2017年7月24日 下午5:10:52
// */
//public class TouTiaoChannelExample {
//
// public static void main(String[] args) {
//
// long max_behot_time = 0;
// for(int i= 0;i<3; i++){
// System.out.println("i=============="+i);
// if( i==0 ){
// max_behot_time = 0;
// }
// Signature signature = new Signature();
// String as = signature.getAs();
// String cp = signature.getCp();
// String url = "http://www.toutiao.com/api/pc/feed/?category=news_tech&utm_source=toutiao"
// + "&widen=1&max_behot_time="+max_behot_time+"&max_behot_time_tmp="+max_behot_time
// +"&tadrequire=true&as=" +as +"&cp=" + cp;
// System.out.println("url:" + url);
//
// Map<String, Object> result;
// try {
// result = TouTiaoChannelParse.touTiaoChannel(url, null);
// if(result!=null){
// Long next = (Long)result.get("next");
// List<TouTiaoArticle> ttList = (List<TouTiaoArticle>)result.get("data");
// System.out.println("ttlist size is " + ttList.size());
// for(TouTiaoArticle tt : ttList){
// System.out.println(tt);
// }
// if(next != null){
// max_behot_time = next;
// }else{
// break;
// }
// }
// } catch (Exception e) {
// e.printStackTrace();
// }
// }
// }
//
//}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment