Commit 7fb5554a by yangchen

豆瓣评论点赞数 解析错误修改

parent 247e637d
...@@ -87,7 +87,7 @@ public class Douban { ...@@ -87,7 +87,7 @@ public class Douban {
logger.info("采集到 第 {} 页 ,一共采集到 {} 条 是否有下一页 {}",page,bodyList.size(),more); logger.info("采集到 第 {} 页 ,一共采集到 {} 条 是否有下一页 {}",page,bodyList.size(),more);
} }
} }
ZhiWeiTools.sleep(1500); ZhiWeiTools.sleep(3000);
page++; page++;
} catch (Exception e) { } catch (Exception e) {
...@@ -127,7 +127,7 @@ public class Douban { ...@@ -127,7 +127,7 @@ public class Douban {
if(dataList.size() - count <= 95 || dataList.size() - count >= 105) { if(dataList.size() - count <= 95 || dataList.size() - count >= 105) {
more = false; more = false;
} }
ZhiWeiTools.sleep(1500); ZhiWeiTools.sleep(3000);
logger.info("评论采集到 第 {} 页 ,一共采集到 {} 条数据 ,more : {}",page,dataList.size(),more); logger.info("评论采集到 第 {} 页 ,一共采集到 {} 条数据 ,more : {}",page,dataList.size(),more);
} catch (Exception e) { } catch (Exception e) {
logger.error("Exception {}",e); logger.error("Exception {}",e);
......
...@@ -10,6 +10,8 @@ import java.util.Map; ...@@ -10,6 +10,8 @@ import java.util.Map;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import javax.swing.plaf.synth.SynthSpinnerUI;
import org.apache.commons.lang3.math.NumberUtils; import org.apache.commons.lang3.math.NumberUtils;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
...@@ -40,6 +42,7 @@ public class DoubanCommentAnalysis { ...@@ -40,6 +42,7 @@ public class DoubanCommentAnalysis {
map.put("content", content); map.put("content", content);
map.put("id", id); map.put("id", id);
map.put("like", getLikeNum(result, "c"+id)); map.put("like", getLikeNum(result, "c"+id));
System.out.println(map.toString());
bodyList.add(map); bodyList.add(map);
} }
} }
...@@ -52,7 +55,7 @@ public class DoubanCommentAnalysis { ...@@ -52,7 +55,7 @@ public class DoubanCommentAnalysis {
} }
private int getLikeNum(String result,String id) { private int getLikeNum(String result,String id) {
Matcher matcher = Pattern.compile(id+"\":[\\D\\d][0,5]").matcher(result); Matcher matcher = Pattern.compile(id+"\":[\\d]{0,5}").matcher(result);
while(matcher.find()) { while(matcher.find()) {
String ret = matcher.group(0); String ret = matcher.group(0);
ret = ret.split(":")[1].split(",")[0]; ret = ret.split(":")[1].split(",")[0];
......
package com.zhiwei.Comment; //package com.zhiwei.Comment;
//
import org.testng.annotations.Test; //import org.testng.annotations.Test;
//
import com.zhiwei.parse.Aika; //import com.zhiwei.parse.Aika;
import com.zhiwei.tools.timeparse.TimeExtraction; //import com.zhiwei.tools.timeparse.TimeExtraction;
import com.zhiwei.tools.timeparse.TimeParse; //import com.zhiwei.tools.timeparse.TimeParse;
//
public class AikaComment { //public class AikaComment {
@Test // @Test
public void f() { // public void f() {
String url = "http://newcar.xcar.com.cn/201809/news_2021765_1.html"; // String url = "http://newcar.xcar.com.cn/201809/news_2021765_1.html";
//
Aika.getAikaComment(url, null); // Aika.getAikaComment(url, null);
//
// System.out.println(TimeExtraction.parseFormatTime("09月12日", "MM dd")); //// System.out.println(TimeExtraction.parseFormatTime("09月12日", "MM dd"));
//
} // }
} //}
...@@ -12,8 +12,8 @@ import com.zhiwei.parse.Douban; ...@@ -12,8 +12,8 @@ import com.zhiwei.parse.Douban;
public class DoubanCommentTest { public class DoubanCommentTest {
@Test @Test
public void f() { public void f() {
String url = "https://www.douban.com/group/topic/72528866/"; String url = "https://www.douban.com/group/topic/128726395/";
String cookie = "bid=rymxzs5aojg; ps=y; ll=\"118173\"; __utmc=30149280; dbcl2=\"188038058:9IHyVcSobVc\"; ck=_RvF; ap_v=0,6.0; push_noty_num=0; push_doumail_num=0; __utmv=30149280.18803; douban-fav-remind=1; __yadk_uid=qLflXyj3R14ro9e0cLoZOQlJoMGVN32j; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1543562805%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DPk19bxnhsVWwfnrcnwT0PquON7D1JpLsbfSu9rRowalyi4pOeM3rMHKFaJo9jJF7%26wd%3D%26eqid%3De7f262650001ef98000000045c00e64f%22%5D; _pk_ses.100001.8cb4=*; douban-profile-remind=1; loc-last-index-location-id=\"118173\"; _vwo_uuid_v2=D85F60C118B0AF465035D9CC7BBFDA7A6|4bf255e1e3a2e9aeede3708192f5f1bc; __utma=30149280.824403997.1543559458.1543562809.1543564973.3; __utmz=30149280.1543564973.3.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _pk_id.100001.8cb4=6828fef49f6bcf34.1543559455.2.1543566557.1543559542.; __utmb=30149280.70.5.1543566539352"; String cookie = "bid=rymxzs5aojg; ps=y; ll=\"118173\"; __utmc=30149280; dbcl2=\"188038058:9IHyVcSobVc\"; ck=_RvF; push_noty_num=0; push_doumail_num=0; __utmv=30149280.18803; douban-fav-remind=1; __yadk_uid=qLflXyj3R14ro9e0cLoZOQlJoMGVN32j; douban-profile-remind=1; _vwo_uuid_v2=D85F60C118B0AF465035D9CC7BBFDA7A6|4bf255e1e3a2e9aeede3708192f5f1bc; __utmz=30149280.1543564973.3.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; ap_v=0,6.0; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1543823236%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DPk19bxnhsVWwfnrcnwT0PquON7D1JpLsbfSu9rRowalyi4pOeM3rMHKFaJo9jJF7%26wd%3D%26eqid%3De7f262650001ef98000000045c00e64f%22%5D; _pk_id.100001.8cb4=6828fef49f6bcf34.1543559455.5.1543823236.1543820463.; _pk_ses.100001.8cb4=*; __utma=30149280.824403997.1543559458.1543818802.1543823236.6; __utmt=1; __utmb=30149280.5.7.1543823236";
PoiExcelUtil poi = PoiExcelUtil.getInstance(); PoiExcelUtil poi = PoiExcelUtil.getInstance();
List<Map<String,Object>> bodyList = Douban.getDoubanComment(url, null, cookie); List<Map<String,Object>> bodyList = Douban.getDoubanComment(url, null, cookie);
......
package com.zhiwei.Comment; //package com.zhiwei.Comment;
//
import java.util.ArrayList; //import java.util.ArrayList;
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
//
import org.junit.Test; //import org.junit.Test;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil; //import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Fenghuang; //import com.zhiwei.parse.Fenghuang;
import com.zhiwei.tools.tools.ZhiWeiTools; //import com.zhiwei.tools.tools.ZhiWeiTools;
//
public class FenghuangCommentExample { //public class FenghuangCommentExample {
//
@Test // @Test
public void fenghuangCommentTest() { // public void fenghuangCommentTest() {
PoiExcelUtil poi = PoiExcelUtil.getInstance(); // PoiExcelUtil poi = PoiExcelUtil.getInstance();
//
Map<String,Object> map = poi.importExcel("D://crawlerdata//自媒体/凤凰评论采集.xlsx", 0); // Map<String,Object> map = poi.importExcel("D://crawlerdata//自媒体/凤凰评论采集.xlsx", 0);
List<Map<String,Object>> list = (List<Map<String,Object>>)map.get("body"); // List<Map<String,Object>> list = (List<Map<String,Object>>)map.get("body");
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>(); // List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
List<String> urlList = new ArrayList<String>(); // List<String> urlList = new ArrayList<String>();
for(Map<String,Object> map1 : list) { // for(Map<String,Object> map1 : list) {
String url = ""; // String url = "";
try { // try {
url = map1.get("url")+""; // url = map1.get("url")+"";
System.out.println(url); // System.out.println(url);
List<Map<String,Object>> dataList = Fenghuang.getFenghuangCommentData2(url,null); // List<Map<String,Object>> dataList = Fenghuang.getFenghuangCommentData2(url,null);
if(dataList == null || dataList.size() <= 0) { // if(dataList == null || dataList.size() <= 0) {
urlList.add(url); // urlList.add(url);
} // }
if(dataList != null) { // if(dataList != null) {
for(Map<String,Object> m : dataList) { // for(Map<String,Object> m : dataList) {
m.put("from_url", url); // m.put("from_url", url);
bodyList.add(m); // bodyList.add(m);
} // }
} // }
} catch (Exception e) { // } catch (Exception e) {
System.out.println(url); // System.out.println(url);
e.printStackTrace(); // e.printStackTrace();
continue; // continue;
} // }
ZhiWeiTools.sleep(1000); // ZhiWeiTools.sleep(1000);
} // }
List<String> headList = new ArrayList<String>(); // List<String> headList = new ArrayList<String>();
headList.add("source"); // headList.add("source");
headList.add("content"); // headList.add("content");
headList.add("id"); // headList.add("id");
headList.add("like"); // headList.add("like");
headList.add("from"); // headList.add("from");
headList.add("time"); // headList.add("time");
headList.add("from_url"); // headList.add("from_url");
for(String s : urlList) { // for(String s : urlList) {
System.out.println(s); // System.out.println(s);
} // }
poi.exportExcel("D://crawlerdata//自媒体/凤凰评论采集.xlsx", "评论采集", headList, bodyList); // poi.exportExcel("D://crawlerdata//自媒体/凤凰评论采集.xlsx", "评论采集", headList, bodyList);
//
} // }
//
//
} //}
package com.zhiwei.Comment; //package com.zhiwei.Comment;
//
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
//
import org.testng.annotations.Test; //import org.testng.annotations.Test;
//
import com.zhiwei.parse.Pcauto; //import com.zhiwei.parse.Pcauto;
//
public class PcautoComment { //public class PcautoComment {
@Test // @Test
public void f() { // public void f() {
String url = "https://www.pcauto.com.cn/nation/1352/13523485.html"; // String url = "https://www.pcauto.com.cn/nation/1352/13523485.html";
//
List<Map<String,Object>> data = Pcauto.getPcAutoComment(url, null); // List<Map<String,Object>> data = Pcauto.getPcAutoComment(url, null);
System.out.println(data.size()); // System.out.println(data.size());
} // }
} //}
package com.zhiwei.Comment; //package com.zhiwei.Comment;
//
import org.testng.annotations.Test; //import org.testng.annotations.Test;
//
import com.zhiwei.parse.QicheHome; //import com.zhiwei.parse.QicheHome;
//
public class QicheComment { //public class QicheComment {
@Test // @Test
public void f() { // public void f() {
String articleid = "922761"; // String articleid = "922761";
//
QicheHome.getQiCheComment(articleid, null); // QicheHome.getQiCheComment(articleid, null);
//
} // }
} //}
package com.zhiwei.Comment; //package com.zhiwei.Comment;
//
import org.testng.annotations.Test; //import org.testng.annotations.Test;
//
import com.zhiwei.parse.SinaKeji; //import com.zhiwei.parse.SinaKeji;
//
public class SinaKejiComment { //public class SinaKejiComment {
@Test // @Test
public void f() { // public void f() {
String url = "https://tech.sina.com.cn/it/2018-07-17/doc-ihfkffam0632649.shtml"; // String url = "https://tech.sina.com.cn/it/2018-07-17/doc-ihfkffam0632649.shtml";
//
SinaKeji.getSinaKejiComment(url, null); // SinaKeji.getSinaKejiComment(url, null);
//
} // }
} //}
package com.zhiwei.Comment; //package com.zhiwei.Comment;
//
import java.util.ArrayList; //import java.util.ArrayList;
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
//
import org.testng.annotations.Test; //import org.testng.annotations.Test;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil; //import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.TechTx; //import com.zhiwei.parse.TechTx;
//
public class TechTxComment { //public class TechTxComment {
@Test // @Test
public void f() { // public void f() {
String url = "http://tech.qq.com/a/20170629/005621.htm"; // String url = "http://tech.qq.com/a/20170629/005621.htm";
//
List<Map<String,Object>> bodyList = TechTx.getTechTxComment(url, null); // List<Map<String,Object>> bodyList = TechTx.getTechTxComment(url, null);
PoiExcelUtil poi = PoiExcelUtil.getInstance(); // PoiExcelUtil poi = PoiExcelUtil.getInstance();
List<String> headList = new ArrayList<>(); // List<String> headList = new ArrayList<>();
headList.add("time"); // headList.add("time");
headList.add("content"); // headList.add("content");
headList.add("source"); // headList.add("source");
headList.add("like"); // headList.add("like");
headList.add("userId"); // headList.add("userId");
headList.add("id"); // headList.add("id");
poi.exportExcel("D://crawlerdata//自媒体/腾讯科技评论采集.xlsx", "ces", headList, bodyList); // poi.exportExcel("D://crawlerdata//自媒体/腾讯科技评论采集.xlsx", "ces", headList, bodyList);
System.out.println(bodyList.size()); // System.out.println(bodyList.size());
} // }
} //}
package com.zhiwei;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.HashMap;
import org.testng.annotations.Test;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.core.RequestUtils;
/**
 * Manual smoke test for {@link HttpBoot}.
 *
 * <p>Sends one request built by {@link RequestUtils#wrapGet} through
 * {@link HttpBoot#syncCall} and prints the response body to standard output.
 * The test talks to a live remote endpoint, so it needs network access.
 */
public class TestHttpBoot {

    /**
     * Fetches the hard-coded URL and prints the response body.
     *
     * <p>An {@link IOException} raised while performing the call or reading the
     * body fails the test instead of being printed and ignored; otherwise the
     * test would pass even though nothing was fetched.
     */
    @Test
    public void f() {
        HttpBoot httpBoot = new HttpBoot();
        String url = "https://www.toutiao.com/c/user/following/?user_id=1034006366&count=20&_signature=wp5wPBAVmXlosTC8Fobui8KecC";
        try {
            String result = httpBoot.syncCall(RequestUtils.wrapGet(url)).body().string();
            System.out.println(result);
        } catch (IOException e) {
            // Keep the original exception as the cause so the test report shows the real failure.
            throw new AssertionError("Request failed for " + url, e);
        }
    }
}
//package com.zhiwei.keyword;
//
//import org.testng.annotations.Test;
//
//import com.zhiwei.parse.Douban;
//
//public class DoubanTopicTest {
// @Test
// public void f() {
// String word = "唐嫣";
// String cookie = "bid=rymxzs5aojg; ps=y; ll=\"118173\"; __utmc=30149280; __utmz=30149280.1543559458.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); dbcl2=\"188038058:9IHyVcSobVc\"; ck=_RvF; ap_v=0,6.0; push_noty_num=0; push_doumail_num=0; __utmv=30149280.18803; douban-fav-remind=1; __yadk_uid=qLflXyj3R14ro9e0cLoZOQlJoMGVN32j; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1543562805%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DPk19bxnhsVWwfnrcnwT0PquON7D1JpLsbfSu9rRowalyi4pOeM3rMHKFaJo9jJF7%26wd%3D%26eqid%3De7f262650001ef98000000045c00e64f%22%5D; _pk_ses.100001.8cb4=*; douban-profile-remind=1; __utma=30149280.824403997.1543559458.1543559458.1543562809.2; __utmt=1; _pk_id.100001.8cb4=6828fef49f6bcf34.1543559455.2.1543564606.1543559542.; __utmb=30149280.227.9.1543564257221";
// String time = "2018-11-27 15:47:41";
//
// Douban.doubanTopicGetByWord(word, null, cookie,time);
//
// }
//}
package com.zhiwei.keyword; //package com.zhiwei.keyword;
//
import java.util.ArrayList; //import java.util.ArrayList;
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
//
import org.testng.annotations.Test; //import org.testng.annotations.Test;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil; //import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Gftai; //import com.zhiwei.parse.Gftai;
//
public class GftaiTest { //public class GftaiTest {
@Test // @Test
public void f() { // public void f() {
PoiExcelUtil poi = PoiExcelUtil.getInstance(); // PoiExcelUtil poi = PoiExcelUtil.getInstance();
String words = "民宿|短租|住宿|途家|爱彼迎|小猪短租|榛果民宿|Airbnb"; // String words = "民宿|短租|住宿|途家|爱彼迎|小猪短租|榛果民宿|Airbnb";
String[] ws = words.split("\\|"); // String[] ws = words.split("\\|");
List<Map<String,Object>> bodyList = new ArrayList<>(); // List<Map<String,Object>> bodyList = new ArrayList<>();
for(String word : ws) { // for(String word : ws) {
List<Map<String,Object>> list = Gftai.getData(word, null); // List<Map<String,Object>> list = Gftai.getData(word, null);
bodyList.addAll(list); // bodyList.addAll(list);
System.out.println(word + " --------- " + bodyList.size()); // System.out.println(word + " --------- " + bodyList.size());
} // }
List<String> headList = new ArrayList<>(); // List<String> headList = new ArrayList<>();
headList.add("title"); // headList.add("title");
headList.add("time"); // headList.add("time");
headList.add("content"); // headList.add("content");
headList.add("source"); // headList.add("source");
headList.add("url"); // headList.add("url");
//
poi.exportExcel("D:\\crawlerdata\\自媒体\\投诉\\国富泰信用.xlsx", "数据", headList, bodyList); // poi.exportExcel("D:\\crawlerdata\\自媒体\\投诉\\国富泰信用.xlsx", "数据", headList, bodyList);
} // }
} //}
package com.zhiwei.keyword; //package com.zhiwei.keyword;
//
import java.util.ArrayList; //import java.util.ArrayList;
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
//
import org.testng.annotations.Test; //import org.testng.annotations.Test;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil; //import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Gftai; //import com.zhiwei.parse.Gftai;
import com.zhiwei.parse.KuaiTousu; //import com.zhiwei.parse.KuaiTousu;
//
public class KuaiTousuTest { //public class KuaiTousuTest {
@Test // @Test
public void f() { // public void f() {
PoiExcelUtil poi = PoiExcelUtil.getInstance(); // PoiExcelUtil poi = PoiExcelUtil.getInstance();
String words = "民宿|短租|住宿|途家|爱彼迎|小猪短租|榛果民宿|Airbnb"; // String words = "民宿|短租|住宿|途家|爱彼迎|小猪短租|榛果民宿|Airbnb";
String[] ws = words.split("\\|"); // String[] ws = words.split("\\|");
List<Map<String,Object>> bodyList = new ArrayList<>(); // List<Map<String,Object>> bodyList = new ArrayList<>();
for(String word : ws) { // for(String word : ws) {
List<Map<String,Object>> list = KuaiTousu.getData(word, null); // List<Map<String,Object>> list = KuaiTousu.getData(word, null);
bodyList.addAll(list); // bodyList.addAll(list);
System.out.println(word + " --------- " + bodyList.size()); // System.out.println(word + " --------- " + bodyList.size());
} // }
List<String> headList = new ArrayList<>(); // List<String> headList = new ArrayList<>();
headList.add("title"); // headList.add("title");
headList.add("time"); // headList.add("time");
headList.add("content"); // headList.add("content");
headList.add("source"); // headList.add("source");
headList.add("url"); // headList.add("url");
//
poi.exportExcel("D:\\crawlerdata\\自媒体\\投诉\\新浪广东快投诉.xlsx", "数据", headList, bodyList); // poi.exportExcel("D:\\crawlerdata\\自媒体\\投诉\\新浪广东快投诉.xlsx", "数据", headList, bodyList);
//
//
//
//
} // }
} //}
package com.zhiwei.keyword; //package com.zhiwei.keyword;
//
import java.util.ArrayList; //import java.util.ArrayList;
import java.util.HashMap; //import java.util.HashMap;
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
//
import org.testng.annotations.Test; //import org.testng.annotations.Test;
//
import com.zhiwei.bean.HistortyBean; //import com.zhiwei.bean.HistortyBean;
import com.zhiwei.excelpoi.excel.PoiExcelUtil; //import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.QQKandian; //import com.zhiwei.parse.QQKandian;
import com.zhiwei.tools.tools.ZhiWeiTools; //import com.zhiwei.tools.tools.ZhiWeiTools;
//
public class QQKandianKeyWordExample { //public class QQKandianKeyWordExample {
@Test // @Test
public void f() { // public void f() {
String word = "今日头条 算法|今日头条 侵权|今日头条 起诉|字节跳动|张一鸣|抖音 涉黄|抖音 未成年|抖音"; // String word = "今日头条 算法|今日头条 侵权|今日头条 起诉|字节跳动|张一鸣|抖音 涉黄|抖音 未成年|抖音";
String[] words = word.split("\\|"); // String[] words = word.split("\\|");
//
PoiExcelUtil poi = PoiExcelUtil.getInstance(); // PoiExcelUtil poi = PoiExcelUtil.getInstance();
QQKandian qqKandian = new QQKandian(); // QQKandian qqKandian = new QQKandian();
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>(); // List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
for(String w : words) { // for(String w : words) {
System.out.println(w); // System.out.println(w);
List<HistortyBean> dataList = qqKandian.getDataByword(w, null); // List<HistortyBean> dataList = qqKandian.getDataByword(w, null);
System.out.println(w + " ---- " + dataList.size()); // System.out.println(w + " ---- " + dataList.size());
for(HistortyBean h : dataList) { // for(HistortyBean h : dataList) {
Map<String, Object> map = new HashMap<String,Object>(); // Map<String, Object> map = new HashMap<String,Object>();
map.put("标题", h.getTitle()); // map.put("标题", h.getTitle());
map.put("时间", h.getTime()); // map.put("时间", h.getTime());
map.put("来源", h.getSource()); // map.put("来源", h.getSource());
map.put("正文", h.getContent()); // map.put("正文", h.getContent());
map.put("链接", h.getUrl()); // map.put("链接", h.getUrl());
bodyList.add(map); // bodyList.add(map);
} // }
ZhiWeiTools.sleep(3000); // ZhiWeiTools.sleep(3000);
} // }
List<String> headList = new ArrayList<String>(); // List<String> headList = new ArrayList<String>();
headList.add("标题"); // headList.add("标题");
headList.add("来源"); // headList.add("来源");
headList.add("链接"); // headList.add("链接");
headList.add("正文"); // headList.add("正文");
headList.add("时间"); // headList.add("时间");
poi.exportExcel("D:\\crawlerdata\\自媒体\\qq看点-今日头条 算法.xlsx", "马化腾", headList, bodyList); // poi.exportExcel("D:\\crawlerdata\\自媒体\\qq看点-今日头条 算法.xlsx", "马化腾", headList, bodyList);
} // }
//
} //}
package com.zhiwei.keyword; //package com.zhiwei.keyword;
//
import java.util.ArrayList; //import java.util.ArrayList;
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
//
import org.testng.annotations.Test; //import org.testng.annotations.Test;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil; //import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.KuaiTousu; //import com.zhiwei.parse.KuaiTousu;
import com.zhiwei.parse.SinaTousu; //import com.zhiwei.parse.SinaTousu;
//
public class SinaTousuTest { //public class SinaTousuTest {
//
@Test // @Test
public void getSinaTousuData() { // public void getSinaTousuData() {
PoiExcelUtil poi = PoiExcelUtil.getInstance(); // PoiExcelUtil poi = PoiExcelUtil.getInstance();
String words = "民宿|短租|住宿|途家|爱彼迎|小猪短租|榛果民宿|Airbnb"; // String words = "民宿|短租|住宿|途家|爱彼迎|小猪短租|榛果民宿|Airbnb";
String[] ws = words.split("\\|"); // String[] ws = words.split("\\|");
List<Map<String,Object>> bodyList = new ArrayList<>(); // List<Map<String,Object>> bodyList = new ArrayList<>();
for(String word : ws) { // for(String word : ws) {
List<Map<String,Object>> list = SinaTousu.getSinaTousuData(word, null, "2018-01-01 00:00:00"); // List<Map<String,Object>> list = SinaTousu.getSinaTousuData(word, null, "2018-01-01 00:00:00");
bodyList.addAll(list); // bodyList.addAll(list);
System.out.println(word + " --------- " + bodyList.size()); // System.out.println(word + " --------- " + bodyList.size());
} // }
List<String> headList = new ArrayList<>(); // List<String> headList = new ArrayList<>();
headList.add("title"); // headList.add("title");
headList.add("time"); // headList.add("time");
headList.add("content"); // headList.add("content");
headList.add("source"); // headList.add("source");
headList.add("url"); // headList.add("url");
//
poi.exportExcel("D:\\crawlerdata\\自媒体\\投诉\\黑猫投诉.xlsx", "数据", headList, bodyList); // poi.exportExcel("D:\\crawlerdata\\自媒体\\投诉\\黑猫投诉.xlsx", "数据", headList, bodyList);
//
//
//
} // }
} //}
package com.zhiwei.keyword; //package com.zhiwei.keyword;
//
import java.util.ArrayList; //import java.util.ArrayList;
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
//
import com.zhiwei.excelpoi.excel.PoiExcelUtil; //import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Toutiao; //import com.zhiwei.parse.Toutiao;
import com.zhiwei.tools.tools.ZhiWeiTools; //import com.zhiwei.tools.tools.ZhiWeiTools;
//
public class ToutiaoKeyWordExample { //public class ToutiaoKeyWordExample {
//
public static void main(String[] args) { // public static void main(String[] args) {
PoiExcelUtil poi = PoiExcelUtil.getInstance(); // PoiExcelUtil poi = PoiExcelUtil.getInstance();
String path = "D:\\crawlerdata\\关键词.xlsx"; // String path = "D:\\crawlerdata\\关键词.xlsx";
Map<String,Object> map = poi.importExcel(path, 0); // Map<String,Object> map = poi.importExcel(path, 0);
List<Map<String,Object>> list = (List<Map<String, Object>>) map.get("body"); // List<Map<String,Object>> list = (List<Map<String, Object>>) map.get("body");
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>(); // List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
String startTime = "2018-06-28 00:00:00"; // String startTime = "2018-06-28 00:00:00";
String endTime = "2018-06-28 23:59:59"; // String endTime = "2018-06-28 23:59:59";
String devoid = "54381805805"; // String devoid = "54381805805";
for(Map<String,Object> map1 : list) { // for(Map<String,Object> map1 : list) {
String word = map1.get("关键词")+""; // String word = map1.get("关键词")+"";
List<Map<String,Object>> dataList = Toutiao.getKeyWordData(word, null,devoid); // List<Map<String,Object>> dataList = Toutiao.getKeyWordData(word, null,devoid);
if(dataList != null) { // if(dataList != null) {
for(Map<String,Object> m : dataList) { // for(Map<String,Object> m : dataList) {
String time = m.get("time")+""; // String time = m.get("time")+"";
System.out.println(time); // System.out.println(time);
m.put("word", word); // m.put("word", word);
String ma = m.get("title") + "--" + m.get("content"); // String ma = m.get("title") + "--" + m.get("content");
if(time.compareTo(startTime) > -1 && time.compareTo(endTime) < 1) { // if(time.compareTo(startTime) > -1 && time.compareTo(endTime) < 1) {
System.out.println(1); // System.out.println(1);
if(ma.contains(word)) { // if(ma.contains(word)) {
bodyList.add(m); // bodyList.add(m);
} // }
} // }
} // }
} // }
ZhiWeiTools.sleep(2000); // ZhiWeiTools.sleep(2000);
} // }
//
List<String> headList = new ArrayList<String>(); // List<String> headList = new ArrayList<String>();
headList.add("title"); // headList.add("title");
headList.add("time"); // headList.add("time");
headList.add("content"); // headList.add("content");
headList.add("source"); // headList.add("source");
headList.add("url"); // headList.add("url");
headList.add("word"); // headList.add("word");
//
poi.exportExcel(path, "雅培", headList, bodyList); // poi.exportExcel(path, "雅培", headList, bodyList);
//
} // }
//
} //}
package com.zhiwei.keyword; //package com.zhiwei.keyword;
//
import java.util.ArrayList; //import java.util.ArrayList;
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
//
import org.testng.annotations.Test; //import org.testng.annotations.Test;
//
import com.zhiwei.common.config.GroupType; //import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory; //import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.excelpoi.excel.PoiExcelUtil; //import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.parse.Xueqiu; //import com.zhiwei.parse.Xueqiu;
//
public class XueqiuKeyWord { //public class XueqiuKeyWord {
@Test // @Test
public void f() { // public void f() {
// ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER); //// ProxyFactory.init("zookeeper://192.168.0.36:2181","local",GroupType.PROVIDER);
String word = "腾讯 股价|腾讯 市值|腾讯 估值|腾讯 投资|腾讯 财报"; // String word = "腾讯 股价|腾讯 市值|腾讯 估值|腾讯 投资|腾讯 财报";
String endTime = "2018-01-01 00:00:00"; // String endTime = "2018-01-01 00:00:00";
String cookie = "_ga=GA1.2.590296572.1538275545; device_id=812aac42a8874c32ca97893a7b270684; s=ff12lgxbt1; __utma=1.590296572.1538275545.1538280279.1538280279.1; __utmz=1.1538280279.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); aliyungf_tc=AQAAAPtocWy01ggA6tbncwYt4OIDsaiG; xq_a_token=088c6ad5e275496d7c91b8b5b2ecb929bee15772; xq_a_token.sig=NcpAm7FRsAVoMGRMOLrLRveBT7U; xq_r_token=08131eb9a4f33b43b2340fd782f3776c9823d74a; xq_r_token.sig=AZ7au6ICJtTgQJVPybkOJs_Fr54; Hm_lvt_1db88642e346389874251b5a1eded6e3=1539599526,1539913171,1539913178,1541157360; _gid=GA1.2.1817290343.1541157360; u=721541157360383; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1541212289"; // String cookie = "_ga=GA1.2.590296572.1538275545; device_id=812aac42a8874c32ca97893a7b270684; s=ff12lgxbt1; __utma=1.590296572.1538275545.1538280279.1538280279.1; __utmz=1.1538280279.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); aliyungf_tc=AQAAAPtocWy01ggA6tbncwYt4OIDsaiG; xq_a_token=088c6ad5e275496d7c91b8b5b2ecb929bee15772; xq_a_token.sig=NcpAm7FRsAVoMGRMOLrLRveBT7U; xq_r_token=08131eb9a4f33b43b2340fd782f3776c9823d74a; xq_r_token.sig=AZ7au6ICJtTgQJVPybkOJs_Fr54; Hm_lvt_1db88642e346389874251b5a1eded6e3=1539599526,1539913171,1539913178,1541157360; _gid=GA1.2.1817290343.1541157360; u=721541157360383; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1541212289";
//
//
//
String[] words = word.split("\\|"); // String[] words = word.split("\\|");
//
PoiExcelUtil poi = PoiExcelUtil.getInstance(); // PoiExcelUtil poi = PoiExcelUtil.getInstance();
List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>(); // List<Map<String,Object>> bodyList = new ArrayList<Map<String,Object>>();
for(String w : words) { // for(String w : words) {
System.out.println(w); // System.out.println(w);
//
List<Map<String, Object>> dataList = Xueqiu.getData(w, endTime, null, cookie); // List<Map<String, Object>> dataList = Xueqiu.getData(w, endTime, null, cookie);
System.out.println(w + " ---- " + dataList.size()); // System.out.println(w + " ---- " + dataList.size());
bodyList.addAll(dataList); // bodyList.addAll(dataList);
} // }
List<String> headList = new ArrayList<String>(); // List<String> headList = new ArrayList<String>();
headList.add("title"); // headList.add("title");
headList.add("time"); // headList.add("time");
headList.add("content"); // headList.add("content");
headList.add("uper"); // headList.add("uper");
headList.add("url"); // headList.add("url");
headList.add("likeCount"); // headList.add("likeCount");
headList.add("replyCount"); // headList.add("replyCount");
poi.exportExcel("D:\\crawlerdata\\自媒体\\雪球网页采集-腾讯-relevance.xlsx", "马化腾", headList, bodyList); // poi.exportExcel("D:\\crawlerdata\\自媒体\\雪球网页采集-腾讯-relevance.xlsx", "马化腾", headList, bodyList);
//
} // }
} //}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment