Commit 87a16473 by yangchen

提交部分解析

parent 9234d24c
package com.zhiwei.parse;
import static java.util.Objects.nonNull;
import java.net.Proxy;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.tools.tools.ZhiWeiTools;
/**
 * Fetches keyword search results from travel315.people.com.cn.
 *
 * <p>The search endpoint returns JSON whose {@code comps} array lists the hits; for every hit the
 * body text is downloaded from a second endpoint keyed by {@code c_id}.
 */
public class Travel315 {
    private static final Logger logger = LoggerFactory.getLogger(Travel315.class);
    private static final HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();

    /**
     * Searches for {@code word} and returns one map per hit.
     *
     * <p>Each map carries the keys {@code title}, {@code url}, {@code content}, {@code time} and
     * {@code source}. {@code content} is {@code null} when the body could not be fetched or parsed.
     *
     * @param word  search keyword; it is URL-encoded as UTF-8 before being sent
     * @param proxy proxy handed to every HTTP call
     * @return the collected hits (possibly empty); an empty list when the request or parsing fails
     */
    public static List<Map<String,Object>> getData(String word, Proxy proxy) {
        try {
            List<Map<String,Object>> bodyList = new ArrayList<>();
            String url = "http://travel315.people.com.cn/interface/select/data_srch.php?key="+URLEncoder.encode(word, "UTF-8");
            String result = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy).body().string();
            JSONObject json = JSONObject.parseObject(result);
            if (json == null) {
                // Handle an unparseable/blank response explicitly instead of via a swallowed NPE.
                logger.warn("travel315 search returned no JSON object, word={}", word);
                return Collections.emptyList();
            }
            JSONArray jsonArray = json.getJSONArray("comps");
            if (nonNull(jsonArray)) {
                for (int i = 0; i < jsonArray.size(); i++) {
                    // Pause before each per-item request (one content fetch per hit follows).
                    ZhiWeiTools.sleep(1000);
                    JSONObject data = jsonArray.getJSONObject(i);
                    String cid = data.getString("c_id");
                    Map<String,Object> map = new HashMap<>();
                    map.put("title", data.getString("c_title"));
                    map.put("url", "http://travel315.people.com.cn/iframe/shw_xml.php?c_id="+cid);
                    map.put("content", getContent(cid, proxy));
                    map.put("time", data.getString("c_create_time"));
                    map.put("source", data.getString("u_nick_name"));
                    // Use the logger rather than stdout for per-record tracing.
                    logger.debug("travel315 record: {}", map);
                    bodyList.add(map);
                }
            }
            return bodyList;
        } catch (Exception e) {
            // Previously swallowed silently; callers still get an empty list, but the cause is recorded.
            logger.error("travel315 search failed, word={}", word, e);
        }
        return Collections.emptyList();
    }

    /**
     * Downloads the body of one hit and strips markup from it.
     *
     * <p>The text after the first {@code <c_context>} tag up to the following
     * {@code </c_context>} tag (or to the end of the response when the closing tag is absent) is
     * taken, then everything matching {@code <.*?>} and every {@code &nbsp;} is removed.
     *
     * @param cid   value of the hit's {@code c_id} field
     * @param proxy proxy handed to the HTTP call
     * @return the cleaned text, or {@code null} when the request fails or no opening tag is found
     */
    private static String getContent(String cid, Proxy proxy) {
        final String openTag = "<c_context>";
        final String closeTag = "</c_context>";
        try {
            String result = httpBoot.syncCall(RequestUtils.wrapGet("http://travel315.people.com.cn/interface/select/data_show_xml.php?c_id="+cid), proxy).body().string();
            int start = result.indexOf(openTag);
            if (start < 0) {
                // Explicit check replaces relying on ArrayIndexOutOfBoundsException from split(...)[1].
                logger.warn("travel315 content has no {} element, c_id={}", openTag, cid);
                return null;
            }
            start += openTag.length();
            int end = result.indexOf(closeTag, start);
            String content = end < 0 ? result.substring(start) : result.substring(start, end);
            content = content.replaceAll("<.*?>", "").replaceAll("&nbsp;", "");
            return content;
        } catch (Exception e) {
            // Throwable goes last with no placeholder so SLF4J logs the stack trace.
            logger.error("travel315 content fetch failed, c_id={}", cid, e);
        }
        return null;
    }
}
package com.zhiwei.parse;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.parse.analysis.Ts21cnAnalysis;
import com.zhiwei.tools.tools.ZhiWeiTools;
/**
 * Pages through keyword search results of ts.21cn.com and collects the parsed records.
 *
 * <p>Parsing of each response page is delegated to {@link Ts21cnAnalysis}.
 */
public class Ts21cn {
    private static final Logger logger = LoggerFactory.getLogger(Ts21cn.class);
    private static final Ts21cnAnalysis ts21cnAnalysis = new Ts21cnAnalysis();
    private static final HttpBoot httpBoot = new HttpBoot.Builder().retryTimes(2).build();

    /** Total number of failed iterations tolerated before paging stops (not reset on success). */
    private static final int MAX_FAILURES = 3;

    /**
     * Collects search results for {@code word}, page by page, starting at page 1.
     *
     * <p>Paging stops when a page yields no records, when more records than the response's
     * {@code count} value have already been collected, or after {@value #MAX_FAILURES} failed
     * iterations in total. A failed iteration retries the same page.
     *
     * @param word    search keyword; it is URL-encoded as UTF-8 before being sent
     * @param proxy   proxy holder handed to every HTTP call
     * @param endTime passed through unchanged to {@code Ts21cnAnalysis.getdata}
     *                (presumably a cut-off time for records; confirm against that class)
     * @return every record collected before paging stopped; never {@code null}
     */
    public static List<Map<String,Object>> getdata(String word, ProxyHolder proxy, String endTime) {
        List<Map<String,Object>> dataList = new ArrayList<>();

        // The URL prefix does not change between pages, so build it once. A failure here cannot
        // be fixed by retrying, so return immediately instead of spending the retry budget on it.
        final String url;
        try {
            url = "http://ts.21cn.com/front/api/search/searchPostList.do?title="+URLEncoder.encode(word, "utf-8")+"&listType=1&pageNo=";
        } catch (Exception e) {
            logger.error("ts.21cn search URL could not be built, word={}", word, e);
            return dataList;
        }

        int page = 1;
        int failures = 0;
        while (failures < MAX_FAILURES) {
            try {
                ZhiWeiTools.sleep(100);
                String result = httpBoot.syncCall(RequestUtils.wrapGet(url+page), proxy).body().string();
                JSONObject json = JSONObject.parseObject(result);
                Integer count = json == null ? null : json.getInteger("count");
                if (count == null) {
                    // Fail with a descriptive error instead of an NPE from auto-unboxing;
                    // it is handled by the catch below like any other failed iteration.
                    throw new IllegalStateException("response has no \"count\" value, page=" + page);
                }
                List<Map<String,Object>> bodyList = ts21cnAnalysis.getdata(result, endTime);
                if (bodyList.isEmpty() || dataList.size() > count) {
                    break;
                }
                dataList.addAll(bodyList);
                logger.info("聚投诉 采集第 {} 页, 一共采集到 {} 条",page,dataList.size());
                page++;
            } catch (Exception e) {
                failures++;
                // Throwable goes last with no placeholder so SLF4J logs the stack trace.
                logger.error("ts.21cn page fetch failed, page={}, failures={}", page, failures, e);
            }
        }
        return dataList;
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment