Commit 36eb5887 by yangchen

投诉网站提交

parent 67a6c8f2
package com.zhiwei.parse;
import java.net.Proxy;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.parse.analysis.GftaiAnalysis;
import com.zhiwei.tools.tools.ZhiWeiTools;
/**
 * Keyword crawler for the search pages of www.gftai.com.
 */
public class Gftai {
    private static final Logger logger = LoggerFactory.getLogger(Gftai.class);
    private static GftaiAnalysis gftaiAnalysis = new GftaiAnalysis();
    private static HttpBoot httpBoot = new HttpBoot();

    /**
     * Walks the paginated search results for {@code word} and gathers every parsed entry.
     *
     * <p>Paging stops as soon as a page yields fewer than five parsed entries, or when the
     * third exception is caught. A failure does not advance the page number, so the same
     * page is requested again on the next iteration.
     *
     * @param word  search keyword; it is URL-encoded as UTF-8 before being put in the query
     * @param proxy proxy handed to the HTTP client for every request
     * @return all entries collected so far (possibly empty, never {@code null})
     */
    public static List<Map<String,Object>> getData(String word, Proxy proxy) {
        List<Map<String,Object>> collected = new ArrayList<>();
        int pageNo = 1;
        int attempts = 1;
        boolean keepGoing = true;
        while (keepGoing) {
            try {
                String encodedWord = URLEncoder.encode(word, "UTF-8");
                String pageUrl = "http://www.gftai.com/gftso?t=xyts&kd=" + encodedWord + "&sid=24&rn=10&pn=" + pageNo;
                String html = httpBoot.syncCall(RequestUtils.wrapGet(pageUrl), proxy).body().string();
                List<Map<String,Object>> pageItems = gftaiAnalysis.getData(html);
                collected.addAll(pageItems);
                logger.info("采集第 {} 页 ,一共采集到 {} 条", pageNo, collected.size());
                if (pageItems.size() < 5) {
                    // A short page is treated as the last page.
                    keepGoing = false;
                } else {
                    pageNo++;
                    // Pause between page requests (argument presumably in milliseconds).
                    ZhiWeiTools.sleep(2500);
                }
            } catch (Exception e) {
                logger.error("Exception {}", e);
                attempts++;
                // The counter starts at 1, so the loop ends on the third failure.
                keepGoing = attempts <= 3;
            }
        }
        return collected;
    }
}
package com.zhiwei.parse;
import java.net.Proxy;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.parse.analysis.KuaiTousuAnalysis;
import com.zhiwei.tools.tools.ZhiWeiTools;
/**
 * Keyword crawler for the complaint list pages of ts.gd.sina.com.cn.
 */
public class KuaiTousu {
    private static final Logger logger = LoggerFactory.getLogger(KuaiTousu.class);
    private static KuaiTousuAnalysis kuaiTousuAnalysis = new KuaiTousuAnalysis();
    private static HttpBoot httpBoot = new HttpBoot();

    /**
     * Walks the paginated complaint list for {@code word} and gathers every parsed entry.
     *
     * <p>Paging stops when a page yields no entries, or when the third exception is caught.
     * A failure does not advance the page number, so the same page is requested again.
     *
     * @param word  search keyword; it is URL-encoded as UTF-8 before being put in the path
     * @param proxy proxy handed to the HTTP client for every request
     * @return all entries collected so far (possibly empty, never {@code null})
     */
    public static List<Map<String,Object>> getData(String word, Proxy proxy) {
        int page = 1;
        int count = 1;
        List<Map<String,Object>> bodyList = new ArrayList<>();
        while (true) {
            try {
                String url = "http://ts.gd.sina.com.cn/list/latestv1/key/" + URLEncoder.encode(word, "UTF-8") + "/p/" + page + ".html";
                String result = httpBoot.syncCall(RequestUtils.wrapGet(url), proxy).body().string();
                List<Map<String,Object>> dataList = kuaiTousuAnalysis.getData(result);
                bodyList.addAll(dataList);
                logger.info("采集到第{}页,一共 采集到 {}", page, bodyList.size());
                if (dataList.isEmpty()) {
                    break;
                }
                page++;
                // Pause between page requests (argument presumably in milliseconds).
                ZhiWeiTools.sleep(2000);
            } catch (Exception e) {
                logger.error("Exception {} ", e);
                count++;
                // The counter starts at 1, so the loop ends on the third failure.
                if (count > 3) {
                    break;
                }
            }
        }
        // Return what was actually collected; the previous version always returned an
        // empty list here, discarding every crawled entry.
        return bodyList;
    }
}
package com.zhiwei.parse;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.Proxy;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
import com.zhiwei.parse.analysis.SinaTousuAnalysis;
import com.zhiwei.tools.tools.ZhiWeiTools;
/**
 * Keyword crawler for the search API of tousu.sina.com.cn.
 */
public class SinaTousu {
    private static final Logger logger = LoggerFactory.getLogger(SinaTousu.class);
    private static SinaTousuAnalysis sinaTousuAnalysis = new SinaTousuAnalysis();
    private static HttpBoot httpBoot = new HttpBoot();

    /**
     * Pages through the search API for {@code word} and gathers every entry the parser keeps.
     *
     * <p>Paging stops when the parser returns no entries for a page, or after three caught
     * encoding/IO failures. A failure does not advance the page number.
     *
     * @param word  search keyword; it is URL-encoded as utf-8 before being put in the query
     * @param proxy proxy handed to the HTTP client for every request
     * @param time  cutoff string forwarded unchanged to the parser
     * @return all entries collected so far (possibly empty, never {@code null})
     */
    public static List<Map<String,Object>> getSinaTousuData(String word, Proxy proxy, String time) {
        List<Map<String,Object>> collected = new ArrayList<>();
        int pageNo = 1;
        int failures = 1;
        // The counter starts at 1, so the loop ends once three failures have been recorded.
        while (failures <= 3) {
            try {
                String baseUrl = "https://tousu.sina.com.cn/api/index/s?keywords=" + URLEncoder.encode(word, "utf-8") + "&page_size=100&page=";
                String json = httpBoot.syncCall(RequestUtils.wrapGet(baseUrl + pageNo), proxy).body().string();
                List<Map<String,Object>> pageItems = sinaTousuAnalysis.getData(json, time);
                if (pageItems.isEmpty()) {
                    break;
                }
                collected.addAll(pageItems);
                logger.info("黑猫投诉 关键词采集 第{}页 ,一共采集到数据 {} ", pageNo, collected.size());
                pageNo++;
                // Pause between page requests (argument presumably in milliseconds).
                ZhiWeiTools.sleep(3000);
            } catch (UnsupportedEncodingException e) {
                failures++;
                logger.error("UnsupportedEncodingException {}", e);
            } catch (IOException e) {
                failures++;
                logger.error("IOException {}", e);
            }
        }
        return collected;
    }
}
package com.zhiwei.parse.analysis;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import static java.util.Objects.nonNull;
import java.util.ArrayList;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Parses one search-result page of www.gftai.com into plain maps.
 */
public class GftaiAnalysis {
    private static final Logger logger = LoggerFactory.getLogger(GftaiAnalysis.class);

    /**
     * Extracts one entry per {@code ul} found directly under {@code div.se_container_left}.
     *
     * <p>Each entry carries the keys {@code title}, {@code url}, {@code content},
     * {@code time} and {@code source}. An entry whose time text cannot be split is kept
     * with an empty {@code time} instead of aborting the whole page.
     *
     * @param result raw HTML of the result page
     * @return the parsed entries; an empty list when parsing throws
     */
    public List<Map<String,Object>> getData(String result) {
        try {
            List<Map<String,Object>> dataList = new ArrayList<>();
            Document doc = Jsoup.parse(result);
            // select() yields an empty collection when nothing matches, so no null check is needed.
            for (Element element : doc.select("div.se_container_left > ul")) {
                Elements link = element.select("li>a");
                Map<String,Object> map = new HashMap<>();
                map.put("title", link.text());
                map.put("url", link.attr("href"));
                map.put("content", element.select("li.se_result_con").text());
                map.put("time", extractTime(element.select("p.search_quick").text()));
                map.put("source", "国富泰信用");
                dataList.add(map);
            }
            return dataList;
        } catch (Exception e) {
            logger.error("Exception {}", e);
        }
        return Collections.emptyList();
    }

    /**
     * Returns the trimmed second segment of {@code quickInfo} when split on a literal "...",
     * or an empty string when there is no such segment.
     */
    private static String extractTime(String quickInfo) {
        String[] parts = quickInfo.split("\\.\\.\\.");
        // split() drops trailing empty segments, so text without anything after "..."
        // produces a single-element array; indexing [1] blindly used to throw here.
        return parts.length > 1 ? parts[1].trim() : "";
    }
}
package com.zhiwei.parse.analysis;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import static java.util.Objects.nonNull;
import java.util.ArrayList;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Parses one complaint list page of ts.gd.sina.com.cn into plain maps.
 */
public class KuaiTousuAnalysis {
    private static final Logger logger = LoggerFactory.getLogger(KuaiTousuAnalysis.class);

    /**
     * Extracts one entry per {@code div.ts-list-item.haspic} found directly under
     * {@code div.ts-list}.
     *
     * @param result raw HTML of the list page
     * @return the parsed entries, each with the keys {@code title}, {@code time},
     *         {@code content}, {@code url} and {@code source}; an empty list when parsing throws
     */
    public List<Map<String,Object>> getData(String result) {
        try {
            Elements items = Jsoup.parse(result).select("div.ts-list > div.ts-list-item.haspic");
            if (!nonNull(items)) {
                return Collections.emptyList();
            }
            List<Map<String,Object>> parsed = new ArrayList<>();
            for (Element item : items) {
                parsed.add(toEntry(item));
            }
            return parsed;
        } catch (Exception e) {
            logger.error("Exception {} ", e);
        }
        return Collections.emptyList();
    }

    /** Builds the map for a single list item. */
    private static Map<String,Object> toEntry(Element item) {
        // Title text and link target come from the same anchor.
        Elements anchor = item.select("div.ts-list-item-title.clearfix > div.title.fl > a");
        Map<String,Object> entry = new HashMap<>();
        entry.put("title", anchor.text());
        entry.put("time", item.select("div.ts-list-item-date-author > span.date").text());
        entry.put("content", item.select("div.ts-list-item-txt").text());
        entry.put("url", anchor.attr("href"));
        entry.put("source", item.select("div.ts-list-item-date-author > span.author").text());
        return entry;
    }
}
package com.zhiwei.parse.analysis;
import static java.util.Objects.nonNull;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.tools.timeparse.TimeParse;
/**
 * Parses one JSON response of the tousu.sina.com.cn search API into plain maps.
 */
public class SinaTousuAnalysis {
    private static final Logger logger = LoggerFactory.getLogger(SinaTousuAnalysis.class);

    /**
     * Reads {@code result.data.lists} and keeps the entries whose formatted creation time
     * compares greater than {@code time}.
     *
     * <p>The creation time is formatted as {@code yyyy-MM-dd HH:mm:ss} from the entry's
     * {@code main.timestamp} multiplied by 1000, and compared to {@code time} as a string.
     *
     * <p>NOTE(review): a {@code null} {@code time} skips every entry, so the method then
     * returns an empty list. Confirm this is intended rather than meaning "no cutoff".
     *
     * @param result raw JSON response body
     * @param time   cutoff string, compared lexicographically against the formatted time
     * @return the kept entries, each with the keys {@code title}, {@code url},
     *         {@code content}, {@code time} and {@code source}; an empty list when the
     *         {@code lists} array is absent or parsing throws
     */
    public List<Map<String,Object>> getData(String result, String time) {
        try {
            JSONArray lists = JSONObject.parseObject(result)
                    .getJSONObject("result")
                    .getJSONObject("data")
                    .getJSONArray("lists");
            if (!nonNull(lists)) {
                return Collections.emptyList();
            }
            List<Map<String,Object>> kept = new ArrayList<>();
            for (int idx = 0; idx < lists.size(); idx++) {
                JSONObject item = lists.getJSONObject(idx);
                JSONObject main = item.getJSONObject("main");
                Date created = new Date(main.getLong("timestamp") * 1000L);
                String ctime = TimeParse.dateFormartString(created, "yyyy-MM-dd HH:mm:ss");
                boolean afterCutoff = time != null && ctime.compareTo(time) > 0;
                if (afterCutoff) {
                    Map<String,Object> entry = new HashMap<>();
                    // The regex strips anything that looks like an HTML tag.
                    entry.put("title", main.getString("title").replaceAll("<.*?>", ""));
                    entry.put("url", "https:" + main.getString("url"));
                    entry.put("content", main.getString("summary").replaceAll("<.*?>", ""));
                    entry.put("time", ctime);
                    entry.put("source", item.getJSONObject("author").getString("title"));
                    kept.add(entry);
                }
            }
            return kept;
        } catch (Exception e) {
            logger.error("用户错误信息 {} ", e);
        }
        return Collections.emptyList();
    }
}
package com.zhiwei.Comment;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Fenghuang;
import com.zhiwei.tools.tools.ZhiWeiTools;
/**
 * Manual example: reads article URLs from an Excel sheet, crawls the comments of each
 * article and writes the collected comments back to the same Excel file.
 */
public class FenghuangCommentExample {

    /**
     * Crawls comments for every row's {@code url} value. URLs that yield no comments are
     * printed at the end; a row that throws is reported and skipped without the pause.
     */
    @Test
    public void fenghuangCommentTest() {
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        Map<String,Object> sheet = poi.importExcel("D://crawlerdata//自媒体/凤凰评论采集.xlsx", 0);
        List<Map<String,Object>> rows = (List<Map<String,Object>>) sheet.get("body");
        List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
        List<String> emptyUrls = new ArrayList<String>();

        for (Map<String,Object> row : rows) {
            String url = "";
            try {
                url = String.valueOf(row.get("url"));
                System.out.println(url);
                List<Map<String,Object>> comments = Fenghuang.getFenghuangCommentData2(url, null);
                if (comments == null || comments.isEmpty()) {
                    emptyUrls.add(url);
                }
                if (comments != null) {
                    for (Map<String,Object> comment : comments) {
                        // Remember which article each comment came from.
                        comment.put("from_url", url);
                        bodyList.add(comment);
                    }
                }
            } catch (Exception e) {
                System.out.println(url);
                e.printStackTrace();
                continue;
            }
            ZhiWeiTools.sleep(1000);
        }

        List<String> headList = new ArrayList<String>();
        String[] columns = {"source", "content", "id", "like", "from", "time", "from_url"};
        for (String column : columns) {
            headList.add(column);
        }

        for (String emptyUrl : emptyUrls) {
            System.out.println(emptyUrl);
        }
        poi.exportExcel("D://crawlerdata//自媒体/凤凰评论采集.xlsx", "评论采集", headList, bodyList);
    }
}
package com.zhiwei.keyword;
package com.zhiwei.Comment;
import org.testng.annotations.Test;
import com.zhiwei.parse.QicheHome;
public class QicheKeyWord {
public class QicheComment {
@Test
public void f() {
String articleid = "922761";
......
......@@ -8,17 +8,30 @@ import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.QQKB;
import com.zhiwei.tools.tools.ZhiWeiTools;
public class QQKBCommentExample {
//天天快报与腾讯新闻都可用 不用cookie
@Test
public void qqkbCommentTest() {
String url = "http://op.inews.qq.com/m/20180424A0309700?refer=100000355&chl_code=auto&h=0";
String url = "https://kuaibao.qq.com/s/20181122A11WQB00";
//https://kuaibao.qq.com/s/20180423A1PI7400?refer=kb_news
// https://kuaibao.qq.com/s/20180423A0L60800?refer=kb_news
List<Map<String,Object>> dataList = QQKB.getQQKBCommentData(url,null);
PoiExcelUtil poi = PoiExcelUtil.getInstance();
Map<String,Object> map = poi.importExcel("D://crawlerdata//自媒体/快报评论采集.xlsx", 0);
List<Map<String,Object>> list = (List<Map<String,Object>>)map.get("body");
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
for(Map<String,Object> m : list) {
String u = m.get("地址").toString();
System.out.println(u);
ZhiWeiTools.sleep(2000);
List<Map<String,Object>> dataList = QQKB.getQQKBCommentData(u,null);
if(dataList!= null) {
bodyList.addAll(dataList);
}
}
List<String> headList = new ArrayList<String>();
headList.add("reply_id"); //id
headList.add("like"); //点赞数
......@@ -26,9 +39,8 @@ public class QQKBCommentExample {
headList.add("reply_num"); //回复数
headList.add("time"); //时间
headList.add("content"); //内容
System.out.println(dataList.size());
PoiExcelUtil poi = PoiExcelUtil.getInstance();
poi.exportExcel("D:\\crawlerdata\\快报评论采集-2.xlsx", "sada", headList, dataList);
System.out.println(bodyList.size());
poi.exportExcel("D:\\crawlerdata\\自媒体\\快报评论采集-zhj.xlsx", "sada", headList, bodyList);
}
......
......@@ -9,6 +9,7 @@ import org.junit.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Fenghuang;
import com.zhiwei.parse.Souhu;
import com.zhiwei.tools.tools.ZhiWeiTools;
public class SouhuCommentExample {
......@@ -16,7 +17,7 @@ public class SouhuCommentExample {
public void souhuCommentTest() {
PoiExcelUtil poi = PoiExcelUtil.getInstance();
Map<String,Object> map = poi.importExcel("D://crawlerdata/搜狐评论采集.xlsx", 0);
Map<String,Object> map = poi.importExcel("D://crawlerdata//自媒体//搜狐评论采集.xlsx", 0);
List<Map<String,Object>> list = (List<Map<String,Object>>)map.get("body");
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
List<String> urlList = new ArrayList<String>();
......@@ -24,11 +25,12 @@ public class SouhuCommentExample {
String url = "";
try {
url = map1.get("url")+"";
System.out.println(url);
List<Map<String,Object>> dataList = Souhu.getSouhuCommentData(url,null);
if(dataList.size() <= 0) {
urlList.add(url);
}
ZhiWeiTools.sleep(2000);
if(dataList != null) {
bodyList.addAll(dataList);
}
......@@ -50,7 +52,7 @@ public class SouhuCommentExample {
for(String s : urlList) {
System.out.println(s);
}
poi.exportExcel("D://crawlerdata/搜狐评论采集.xlsx", "搜狐评论", headList, bodyList);
poi.exportExcel("D://crawlerdata//自媒体//搜狐评论采集.xlsx", "搜狐评论", headList, bodyList);
}
......
package com.zhiwei.keyword;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.testng.annotations.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Gftai;
/**
 * Manual example: crawls www.gftai.com for a fixed keyword list and exports the result to Excel.
 */
public class GftaiTest {

    /** Runs {@code Gftai.getData} once per keyword and writes all entries to one sheet. */
    @Test
    public void f() {
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        List<Map<String,Object>> bodyList = new ArrayList<>();
        // Keywords are kept in one pipe-separated string.
        for (String keyword : "民宿|短租|住宿|途家|爱彼迎|小猪短租|榛果民宿|Airbnb".split("\\|")) {
            bodyList.addAll(Gftai.getData(keyword, null));
            System.out.println(keyword + " --------- " + bodyList.size());
        }

        List<String> headList = new ArrayList<>();
        for (String column : new String[] {"title", "time", "content", "source", "url"}) {
            headList.add(column);
        }
        poi.exportExcel("D:\\crawlerdata\\自媒体\\投诉\\国富泰信用.xlsx", "数据", headList, bodyList);
    }
}
package com.zhiwei.keyword;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.testng.annotations.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Gftai;
import com.zhiwei.parse.KuaiTousu;
/**
 * Manual example: crawls ts.gd.sina.com.cn for a fixed keyword list and exports the result to Excel.
 */
public class KuaiTousuTest {

    /** Runs {@code KuaiTousu.getData} once per keyword and writes all entries to one sheet. */
    @Test
    public void f() {
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        List<Map<String,Object>> bodyList = new ArrayList<>();
        // Keywords are kept in one pipe-separated string.
        for (String keyword : "民宿|短租|住宿|途家|爱彼迎|小猪短租|榛果民宿|Airbnb".split("\\|")) {
            bodyList.addAll(KuaiTousu.getData(keyword, null));
            System.out.println(keyword + " --------- " + bodyList.size());
        }

        List<String> headList = new ArrayList<>();
        for (String column : new String[] {"title", "time", "content", "source", "url"}) {
            headList.add(column);
        }
        poi.exportExcel("D:\\crawlerdata\\自媒体\\投诉\\新浪广东快投诉.xlsx", "数据", headList, bodyList);
    }
}
package com.zhiwei.keyword;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.testng.annotations.Test;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.KuaiTousu;
import com.zhiwei.parse.SinaTousu;
/**
 * Manual example: crawls tousu.sina.com.cn for a fixed keyword list and exports the result to Excel.
 */
public class SinaTousuTest {

    /**
     * Runs {@code SinaTousu.getSinaTousuData} once per keyword with the cutoff
     * {@code 2018-01-01 00:00:00} and writes all entries to one sheet.
     */
    @Test
    public void getSinaTousuData() {
        PoiExcelUtil poi = PoiExcelUtil.getInstance();
        List<Map<String,Object>> bodyList = new ArrayList<>();
        // Keywords are kept in one pipe-separated string.
        for (String keyword : "民宿|短租|住宿|途家|爱彼迎|小猪短租|榛果民宿|Airbnb".split("\\|")) {
            bodyList.addAll(SinaTousu.getSinaTousuData(keyword, null, "2018-01-01 00:00:00"));
            System.out.println(keyword + " --------- " + bodyList.size());
        }

        List<String> headList = new ArrayList<>();
        for (String column : new String[] {"title", "time", "content", "source", "url"}) {
            headList.add(column);
        }
        poi.exportExcel("D:\\crawlerdata\\自媒体\\投诉\\黑猫投诉.xlsx", "数据", headList, bodyList);
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment