Commit ec916427 by yangchen

头条关键词 采集修改

parent 5ade0fda
...@@ -43,7 +43,7 @@ public class TouTiaoArticleParse { ...@@ -43,7 +43,7 @@ public class TouTiaoArticleParse {
} }
private static Logger logger = LogManager.getLogger(TouTiaoArticleParse.class); private static Logger logger = LogManager.getLogger(TouTiaoArticleParse.class);
private static HttpBoot httpBoot = new HttpBoot(); private static HttpBoot httpBoot = new HttpBoot(true);
/*** /***
* 获取头条数据 * 获取头条数据
...@@ -169,7 +169,7 @@ public class TouTiaoArticleParse { ...@@ -169,7 +169,7 @@ public class TouTiaoArticleParse {
String url = "https://www.toutiao.com/c/user/article/?page_type=1&user_id=" + user_id + "&max_behot_time=" String url = "https://www.toutiao.com/c/user/article/?page_type=1&user_id=" + user_id + "&max_behot_time="
+ max_behot_time + "&count=20&as=" + as + "&cp=" + cp + "&_signature=" + _signature; + max_behot_time + "&count=20&as=" + as + "&cp=" + cp + "&_signature=" + _signature;
logger.info("当前采集的历史文章链接:::{}", url); logger.info("当前采集的历史文章链接:::{}", url);
Map<String, String> headerMap = new HashMap<String, String>(); Map<String, String> headerMap = new HashMap<>();
headerMap.put("user-agent", headerMap.put("user-agent",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"); "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36");
headerMap.put("referer", "https://www.toutiao.com/c/user/" + user_id + "/"); headerMap.put("referer", "https://www.toutiao.com/c/user/" + user_id + "/");
......
package com.zhiwei.toutiao.parse; package com.zhiwei.toutiao.parse;
import java.net.Proxy;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Date; import java.util.Date;
import java.util.HashMap; import java.util.HashMap;
...@@ -13,10 +12,11 @@ import org.apache.logging.log4j.Logger; ...@@ -13,10 +12,11 @@ import org.apache.logging.log4j.Logger;
import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONException; import com.alibaba.fastjson.JSONException;
import com.alibaba.fastjson.JSONObject; import com.alibaba.fastjson.JSONObject;
import com.zhiwei.tools.httpclient.HttpClientTemplateOK; import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.tools.timeparse.TimeParse; import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.toutiao.bean.TouTiaoArticle; import com.zhiwei.toutiao.bean.TouTiaoArticle;
import com.zhiwei.toutiao.util.Tools;
/** /**
* @ClassName: TouTiaoSearch * @ClassName: TouTiaoSearch
...@@ -26,24 +26,23 @@ import com.zhiwei.toutiao.util.Tools; ...@@ -26,24 +26,23 @@ import com.zhiwei.toutiao.util.Tools;
*/ */
public class TouTiaoSearchParse { public class TouTiaoSearchParse {
private static Map<String, String> headerMap;
private static Logger logger = LogManager.getLogger(TouTiaoSearchParse.class); private static Logger logger = LogManager.getLogger(TouTiaoSearchParse.class);
private static HttpBoot httpBoot = new HttpBoot();
/** /**
* @Title: touTiaoSearchByWord * @Title: touTiaoSearchByWord
* @author hero * @author hero
* @Description: TODO(根据关键词采集今日头条数据) * @Description: (根据关键词采集今日头条数据)
* @param @param url * @param @param url
* @param @return 设定文件 * @param @return 设定文件
* @return List<TouTiaoArticle> 返回类型 * @return List<TouTiaoArticle> 返回类型
* @throws Exception * @throws Exception
*/ */
public static Map<String,Object> touTiaoSearchByWord(String url,Proxy proxy ) throws Exception{ public static Map<String,Object> touTiaoSearchByWord(String url,ProxyHolder proxy) throws Exception{
headerMap = Tools.getTouTiaoSearchHeader();
headerMap.put("referer", url);
String htmlBody = null; String htmlBody = null;
try { try {
htmlBody = HttpClientTemplateOK.get(url, proxy, headerMap); htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url),proxy).body().string();
if(htmlBody != null){ if(htmlBody != null){
Map<String,Object> dataMap = parseHtmlBySearch(htmlBody); Map<String,Object> dataMap = parseHtmlBySearch(htmlBody);
if(dataMap!=null && dataMap.size()>0){ if(dataMap!=null && dataMap.size()>0){
...@@ -60,7 +59,7 @@ public class TouTiaoSearchParse { ...@@ -60,7 +59,7 @@ public class TouTiaoSearchParse {
/** /**
* @Title: parseHtmlBySearch * @Title: parseHtmlBySearch
* @author hero * @author hero
* @Description: TODO(解析文本) * @Description: (解析文本)
* @param @param htmlBody * @param @param htmlBody
* @param @return 设定文件 * @param @return 设定文件
* @return List<TouTiaoArticle> 返回类型 * @return List<TouTiaoArticle> 返回类型
...@@ -72,8 +71,8 @@ public class TouTiaoSearchParse { ...@@ -72,8 +71,8 @@ public class TouTiaoSearchParse {
int has_more = jsonObject.getIntValue("has_more"); int has_more = jsonObject.getIntValue("has_more");
if(null!=dataList && dataList.size()>0){ if(null!=dataList && dataList.size()>0){
Map<String,Object> result = new HashMap<String,Object>(); Map<String,Object> result = new HashMap<>();
List<TouTiaoArticle> ttList = new ArrayList<TouTiaoArticle>(); List<TouTiaoArticle> ttList = new ArrayList<>();
for (int i = 0; i < dataList.size(); i++) { for (int i = 0; i < dataList.size(); i++) {
JSONObject jso = dataList.getJSONObject(i); JSONObject jso = dataList.getJSONObject(i);
try { try {
...@@ -93,8 +92,7 @@ public class TouTiaoSearchParse { ...@@ -93,8 +92,7 @@ public class TouTiaoSearchParse {
TouTiaoArticle tt = new TouTiaoArticle(url, title, user_id,source, date, content, comment_count, "-1", "-1", "-1","今日头条",null); TouTiaoArticle tt = new TouTiaoArticle(url, title, user_id,source, date, content, comment_count, "-1", "-1", "-1","今日头条",null);
ttList.add(tt); ttList.add(tt);
} catch (JSONException e) { } catch (JSONException e) {
logger.debug("解析数据出现问题", e.fillInStackTrace()); logger.debug("解析数据出现问题 {}", e);
continue;
} }
} }
result.put("data", ttList); result.put("data", ttList);
......
package com.zhiwei.toutiao.test; //package com.zhiwei.toutiao.test;
//
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
//
import com.zhiwei.toutiao.bean.Signature; //import com.zhiwei.toutiao.bean.Signature;
import com.zhiwei.toutiao.bean.TouTiaoArticle; //import com.zhiwei.toutiao.bean.TouTiaoArticle;
import com.zhiwei.toutiao.parse.TouTiaoChannelParse; //import com.zhiwei.toutiao.parse.TouTiaoChannelParse;
import com.zhiwei.toutiao.util.Tools; //import com.zhiwei.toutiao.util.Tools;
//
/**
 * Manual console example for Toutiao channel parsing.
 *
 * <p>Requests up to {@link #PAGE_COUNT} consecutive pages of the {@code news_tech} feed
 * through {@code TouTiaoChannelParse.touTiaoChannel}, prints every parsed article, and
 * feeds the {@code "next"} value of each result back in as the {@code max_behot_time}
 * cursor of the following request.
 *
 * @author hero
 * @date 2017-07-24 17:10:52
 */
public class TouTiaoChannelExample {

    /** Number of feed pages requested in one run. */
    private static final int PAGE_COUNT = 3;

    /**
     * Runs the example: builds the feed URL for each page, fetches it without a proxy
     * ({@code null} is passed as the second argument) and prints the outcome.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Paging cursor; 0 requests the first page.
        long maxBehotTime = 0;

        for (int i = 0; i < PAGE_COUNT; i++) {
            System.out.println("i==============" + i);

            // A fresh Signature is built for every request, as in the original loop.
            Signature signature = new Signature();
            String as = signature.getAs();
            String cp = signature.getCp();
            String url = "http://www.toutiao.com/api/pc/feed/?category=news_tech&utm_source=toutiao"
                    + "&widen=1&max_behot_time=" + maxBehotTime + "&max_behot_time_tmp=" + maxBehotTime
                    + "&tadrequire=true&as=" + as + "&cp=" + cp;
            System.out.println("url:" + url);

            try {
                Map<String, Object> result = TouTiaoChannelParse.touTiaoChannel(url, null);
                if (result == null) {
                    // No result for this page: try again with the same cursor on the next pass.
                    continue;
                }

                // Inspect the value before use instead of an unchecked cast, so a missing
                // "data" entry no longer aborts the iteration with a NullPointerException
                // before the cursor can advance.
                Object data = result.get("data");
                if (data instanceof List) {
                    List<?> articles = (List<?>) data;
                    System.out.println("ttlist size is " + articles.size());
                    for (Object article : articles) {
                        System.out.println(article);
                    }
                } else {
                    System.out.println("ttlist size is 0");
                }

                // Accept any numeric cursor; NOTE(review): the parser presumably stores a
                // Long under "next" - confirm against TouTiaoChannelParse.
                Object next = result.get("next");
                if (!(next instanceof Number)) {
                    // No further page available.
                    break;
                }
                maxBehotTime = ((Number) next).longValue();
            } catch (Exception e) {
                // Console example: report the failure and carry on with the next pass.
                e.printStackTrace();
            }
        }
    }

}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment