Commit 5bb9510d by win 10

解决冲突

parent a56fa9e1
......@@ -2,7 +2,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>media_data_crawler</artifactId>
<version>0.1.2-SNAPSHOT</version>
<version>0.1.3-SNAPSHOT</version>
<name>media_data_crawler</name>
<description>网媒数据抓取,包含百度新闻、搜狗新闻、360新闻、知乎回答列表等</description>
......@@ -16,9 +16,23 @@
<dependency>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId>
<version>0.5.2-RELEASE</version>
<version>0.6.1.0-SNAPSHOT</version>
<scope>provided</scope>
</dependency>
<!-- excel导出 -->
<dependency>
<groupId>com.zhiwei</groupId>
<artifactId>excelpoi</artifactId>
<version>0.0.3-SNAPSHOT</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>easyexcel</artifactId>
<version>2.0.0-beta3</version>
<scope>provided</scope>
</dependency>
</dependencies>
<!-- 打包管理 -->
......
......@@ -16,8 +16,11 @@ import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.ZhihuAnswer;
import com.zhiwei.tools.timeparse.TimeParse;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Response;
......@@ -49,6 +52,50 @@ public class ZhihuAnwserCrawlerParse {
return -1;
}
/**
 * Collects the answers of a Zhihu question by requesting answer pages one at a time.
 *
 * <p>Despite its name, this method returns the collected answers, not a picture count.
 * Anything from "/answer" onward is stripped from {@code url} first. Pages are then
 * requested starting at page 0 through {@code DataCrawler.getAnswerList}, passing the
 * date parsed from "2000-01-01" as the end-date argument and
 * {@code ProxyHolder.NAT_HEAVY_PROXY} as the proxy.
 *
 * <p>A page that throws or comes back empty is requested again; the loop gives up after
 * 10 consecutive unsuccessful attempts. It also stops as soon as a page does not report
 * {@code more == true}. A missing or non-Boolean "more" entry is treated as "no more
 * pages" so that a malformed response can never cause the same page to be added twice
 * or the loop to spin forever.
 *
 * @param url question URL, or an answer URL belonging to the question
 * @return the answers collected so far; never {@code null}, possibly empty when the
 *         crawl failed (failures are logged, not thrown)
 */
public static List<ZhihuAnswer> getPictureCount(String url) {
    List<ZhihuAnswer> answerList = new ArrayList<>();
    logger.info("知乎回答采集开始:{}",url);
    try {
        if (url.contains("/answer")) {
            url = url.split("/answer")[0];
        }
        // Loop-invariant: parse the fixed end date once instead of on every page request.
        Date endDate = TimeParse.stringFormartDate("2000-01-01");
        // Index of the page to request next; only advanced after a non-empty page.
        int page = 0;
        // Consecutive unsuccessful attempts, counted from 1 and reset on every success.
        int attempt = 1;
        while (attempt <= 10) {
            try {
                Map<String, Object> dataMap = DataCrawler.getAnswerList(url, page, endDate, ProxyHolder.NAT_HEAVY_PROXY);
                // The "data" entry is filled with a List<ZhihuAnswer> by the page parser.
                @SuppressWarnings("unchecked")
                List<ZhihuAnswer> list = (List<ZhihuAnswer>) dataMap.get("data");
                if (list != null && !list.isEmpty()) {
                    logger.info("知乎回答采集链接:{} 页数 {} ,此页总数 {}",url,page,list.size());
                    answerList.addAll(list);
                    attempt = 1;
                    page++;
                } else {
                    // Empty page: retry the same page index.
                    attempt++;
                }
                // Null-safe read: unboxing a missing value here used to throw after the
                // page had already been added, which re-fetched and duplicated that page.
                if (!Boolean.TRUE.equals(dataMap.get("more"))) {
                    break;
                }
            } catch (Exception e) {
                // Trailing Throwable argument makes SLF4J log the stack trace.
                logger.error("知乎回答采集异常:{} 页数 {}", url, page, e);
                attempt++;
            }
        }
    } catch (Exception e) {
        // Previously swallowed via a bare e.toString(); keep best-effort semantics but log it.
        logger.error("知乎回答采集失败:{}", url, e);
    }
    logger.info("知乎回答采集结束:{}",url);
    return answerList;
}
/**
* 知乎回答采集
* @param url
......@@ -57,7 +104,7 @@ public class ZhihuAnwserCrawlerParse {
* @return
* @throws Exception
*/
public static List<ZhihuAnswer> getAnswerList(String url, Date endDate, Proxy proxy) throws Exception{
public static List<ZhihuAnswer> getAnswerList(String url, Date endDate, ProxyHolder proxy) throws Exception{
try{
List<ZhihuAnswer> answerList = new ArrayList<>();
String questionId = getQuestionId(url);
......@@ -80,7 +127,7 @@ public class ZhihuAnwserCrawlerParse {
more = false;
}
//单线程采集避免被封休眠8s
ZhiWeiTools.sleep(8000);
// ZhiWeiTools.sleep(3000);
page++;
}catch (Exception e){
more = false;
......@@ -92,7 +139,6 @@ public class ZhihuAnwserCrawlerParse {
}
}
/**
* 获取问题的关注者和浏览量
* @param url
......@@ -100,7 +146,7 @@ public class ZhihuAnwserCrawlerParse {
* @return
* @throws Exception
*/
private static String getNumberBoard(String url, Proxy proxy) throws Exception{
private static String getNumberBoard(String url, ProxyHolder proxy) throws Exception{
try{
String body = download(url, proxy);
Document document = Jsoup.parse(body);
......@@ -117,10 +163,6 @@ public class ZhihuAnwserCrawlerParse {
}
}
/**
* 获取单页数据
* @param url
......@@ -130,17 +172,16 @@ public class ZhihuAnwserCrawlerParse {
* @return
* @throws Exception
*/
public static Map<String,Object> getAnswerList(String url, int page, Date endDate, Proxy proxy) throws Exception{
public static Map<String,Object> getAnswerList(String url, int page, Date endDate, ProxyHolder proxy) throws Exception{
try{
String questionId = getQuestionId(url);
String bord = getNumberBoard(url, proxy);
return analsis(questionId,endDate,page,bord ,proxy);
return analsis(questionId,endDate,page,bord ,proxy);
}catch (Exception e){
throw e;
}
}
/**
* 解析数据
* @param questionId
......@@ -150,7 +191,7 @@ public class ZhihuAnwserCrawlerParse {
* @return
* @throws Exception
*/
private static Map<String,Object> analsis(String questionId, Date endDate, int page, String bord, Proxy proxy) throws Exception{
private static Map<String,Object> analsis(String questionId, Date endDate, int page, String bord, ProxyHolder proxy) throws Exception{
try{
boolean more = true;
List<ZhihuAnswer> answerList = new ArrayList<>();
......@@ -160,23 +201,29 @@ public class ZhihuAnwserCrawlerParse {
Integer count = dataJson.getJSONObject("paging").getInteger("totals");
JSONArray jsonArray = dataJson.getJSONArray("data");
String from_url = "https://www.zhihu.com/question/" + questionId;
String fromUrl = "https://www.zhihu.com/question/" + questionId;
Integer sort = page*20 + 1;
for(int i=0; i<jsonArray.size(); i++){
JSONObject answerJson = jsonArray.getJSONObject(i);
Date time = new Date(answerJson.getLong("created_time")*1000);
if(time.after(endDate)){
String answerId = answerJson.getString("id");
String link = from_url+"/answer/" + answerId;
String link = fromUrl+"/answer/" + answerId;
System.out.println("正在处理 === " + link);
String author = answerJson.getJSONObject("author").getString("name");
String authorUrl = "https://www.zhihu.com/people/"+answerJson.getJSONObject("author").getString("url_token");
String content = ZhiWeiTools.delHTMLTag(answerJson.getString("content"));
String[] imgContent = answerJson.getString("content").split("<img");
Integer imgCount = (imgContent.length-1)/2;
String title = answerJson.getJSONObject("question").getString("title");
Integer voteup_count = answerJson.getInteger("voteup_count");
Integer comment_count = answerJson.getInteger("comment_count");
Integer guanzhu_count = Integer.valueOf(bord.split(",")[0]);
Integer bord_count = Integer.valueOf(bord.split(",")[1]);
ZhihuAnswer zhihuAnswer = new ZhihuAnswer(link, from_url, title, time, author, authorUrl, content,voteup_count ,comment_count, guanzhu_count, bord_count);
Integer voteupCount = answerJson.getInteger("voteup_count");
Integer commentCount = answerJson.getInteger("comment_count");
Integer guanzhuCount = Integer.valueOf(bord.split(",")[0]);
Integer bordCount = Integer.valueOf(bord.split(",")[1]);
ZhihuAnswer zhihuAnswer = new ZhihuAnswer(link, fromUrl, title, time, author, authorUrl, content,voteupCount ,commentCount, guanzhuCount, bordCount, imgCount, sort);
answerList.add(zhihuAnswer);
System.out.println(imgCount + " ---- " + sort);
sort ++;
}
}
if(count<page*20){
......@@ -191,7 +238,6 @@ public class ZhihuAnwserCrawlerParse {
}
}
/**
* 根据链接获取数据
* @param url
......@@ -199,7 +245,7 @@ public class ZhihuAnwserCrawlerParse {
* @return
* @throws Exception
*/
private static String download(String url, Proxy proxy) throws Exception{
private static String download(String url, ProxyHolder proxy) throws Exception{
try(Response response = httpBoot.syncCall(RequestUtils.wrapGet(url),proxy)){
return response.body().string();
}catch (Exception e){
......@@ -207,7 +253,6 @@ public class ZhihuAnwserCrawlerParse {
}
}
/**
* 根据链接获取问题id
* @param url
......@@ -237,11 +282,9 @@ public class ZhihuAnwserCrawlerParse {
"Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit" +
"%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2" +
"Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp" +
"%3Bdata%5B*%5D.mark_infos%5B*%5D.url%3Bdata%5B*%5D.author.follower_count%2Cbadge%5B*%5D.topics&offset="+page*20+"&limit=20&sort_by=created";
"%3Bdata%5B*%5D.mark_infos%5B*%5D.url%3Bdata%5B*%5D.author.follower_count%2Cbadge%5B*%5D.topics&offset="+page*20+"&limit=20";
}
public static void main(String[] args){
// String url = "https://www.zhihu.com/question/288128510";
// Date endDate = TimeParse.stringFormartDate("2018-09-20 08:00:00");
......@@ -253,7 +296,4 @@ public class ZhihuAnwserCrawlerParse {
getAnswerCount("https://www.zhihu.com/question/41539825", null);
}
}
......@@ -35,7 +35,33 @@ public class DataCrawler {
try {
return BaiduInforCrawlerParse.getBaiduInforData(word,endTime);
} catch (Exception e) {
e.printStackTrace();
e.toString();
return Collections.emptyList();
}
}
/**
 * Fetches Baidu information/news data by delegating to
 * {@code BaiduInforCrawlerParse.getBaiduInforDataManyWord(word, endTime, saveWord)}.
 *
 * <p>The original comment describes this as full-text matching of Baidu news data by
 * keyword and time. Any exception thrown by the delegate is caught here and an empty
 * list is returned instead, so callers cannot distinguish "no results" from "crawl failed".
 *
 * @param word     passed through to the parser; presumably the search keyword(s) — TODO confirm
 * @param endTime  passed through to the parser; presumably the time bound of the search — TODO confirm format
 * @param saveWord passed through to the parser; presumably the keyword recorded on the
 *                 returned items — TODO confirm against BaiduInforCrawlerParse
 * @return the list produced by the parser, or {@code Collections.emptyList()} when the
 *         parser throws
 */
public static List<NewsData> getBaiduInforDataManyWord(String word,String endTime, String saveWord) {
try {
return BaiduInforCrawlerParse.getBaiduInforDataManyWord(word,endTime,saveWord);
} catch (Exception e) {
// NOTE(review): e.toString() discards its result, so the failure is silently
// swallowed; consider logging the exception here.
e.toString();
return Collections.emptyList();
}
}
......@@ -62,8 +88,8 @@ public class DataCrawler {
try {
return BaiduNewsCrawlerParse.getBaiduNewsData(word, startTime, endTime, proxy);
} catch (Exception e) {
e.printStackTrace();
return null;
e.toString();
return Collections.emptyList();
}
}
......@@ -106,7 +132,7 @@ public class DataCrawler {
try {
return BaiduNewsCrawlerParse.getBaiduNewsCount(word, startTime, endTime, proxy,cookie);
} catch (Exception e) {
e.printStackTrace();
e.toString();
return -1;
}
}
......@@ -132,8 +158,8 @@ public class DataCrawler {
try {
return BaiduNewsCrawlerParse.getBaiduNewsDataByTitle(word, startTime, endTime, proxy);
} catch (Exception e) {
e.printStackTrace();
return null;
e.toString();
return Collections.emptyList();
}
}
......@@ -154,8 +180,8 @@ public class DataCrawler {
try {
return SoNewsCrawlerParse.getSoNewsData(word, proxy);
} catch (Exception e) {
e.printStackTrace();
return null;
e.toString();
return Collections.emptyList();
}
}
......@@ -176,8 +202,8 @@ public class DataCrawler {
try {
return SoNewsCrawlerParse.getSoNewsDataByTitle(word, proxy);
} catch (Exception e) {
e.printStackTrace();
return null;
e.toString();
return Collections.emptyList();
}
}
......@@ -199,8 +225,8 @@ public class DataCrawler {
System.out.println("开始采集sogou");
return SougouNewsCrawlerParse.getSougouNewsData(word, proxy);
} catch (Exception e) {
e.printStackTrace();
return null;
e.toString();
return Collections.emptyList();
}
}
......@@ -221,8 +247,8 @@ public class DataCrawler {
try {
return SougouNewsCrawlerParse.getSougouNewsDataByTitle(word, proxy);
} catch (Exception e) {
e.printStackTrace();
return null;
e.toString();
return Collections.emptyList();
}
}
......@@ -242,8 +268,8 @@ public class DataCrawler {
try {
return SougouZhihuCrawlerParse.getSougouZhihuData(word, proxy);
} catch (Exception e) {
e.printStackTrace();
return null;
e.toString();
return Collections.emptyList();
}
}
......@@ -259,14 +285,14 @@ public class DataCrawler {
* 设定文件
* @return List<TiebaData> 返回类型
*/
public static List<TiebaData> getBaiduTiebaData(String word, Proxy proxy) {
public static List<TiebaData> getBaiduTiebaDataSortByTime(String word, Proxy proxy, String startTime) {
try {
return BaiduTiebaCrawlerParse.getBaiduTiebaData(word, proxy, null);
return BaiduTiebaCrawlerParse.getBaiduTiebaData(word, proxy, null, startTime);
} catch (Exception e) {
return Collections.emptyList();
}
}
/**
* @Title: getBaiduTiebaData
* @author hero
......@@ -291,8 +317,8 @@ public class DataCrawler {
try {
return BaiduTiebaCrawlerParse.getBaiduTiebaAnswerDataByUrl(url, proxy);
} catch (Exception e) {
e.printStackTrace();
return null;
e.toString();
return Collections.emptyList();
}
}
......@@ -308,10 +334,10 @@ public class DataCrawler {
*/
public static List<TiebaData> getBaiduTiebaData(String word, Proxy proxy, String tiebaName) {
try {
return BaiduTiebaCrawlerParse.getBaiduTiebaData(word, proxy, tiebaName);
return BaiduTiebaCrawlerParse.getBaiduTiebaData(word, proxy, tiebaName, null);
} catch (Exception e) {
e.printStackTrace();
return null;
e.toString();
return Collections.emptyList();
}
}
......@@ -353,8 +379,8 @@ public class DataCrawler {
try {
return DoubanCrawlerParse.getDoubanData(word, type, proxy);
} catch (Exception e) {
e.printStackTrace();
return null;
e.toString();
return Collections.emptyList();
}
}
......@@ -374,8 +400,8 @@ public class DataCrawler {
try {
return SoCrawlerParse.getSoData(word, site, time, proxy);
} catch (Exception e) {
e.printStackTrace();
return null;
e.toString();
return Collections.emptyList();
}
}
......@@ -387,10 +413,11 @@ public class DataCrawler {
* @return
* @throws Exception
*/
public static List<ZhihuAnswer> getAnswerList(String url, Date endDate, Proxy proxy) throws Exception{
public static List<ZhihuAnswer> getAnswerList(String url, Date endDate, ProxyHolder proxy) throws Exception{
try{
return ZhihuAnwserCrawlerParse.getAnswerList(url,endDate, proxy);
}catch (Exception e){
e.toString();
throw e;
}
}
......@@ -404,10 +431,11 @@ public class DataCrawler {
* @return
* @throws Exception
*/
public static Map<String,Object> getAnswerList(String url, int page, Date endDate, Proxy proxy) throws Exception{
public static Map<String,Object> getAnswerList(String url, int page, Date endDate, ProxyHolder proxy) throws Exception{
try{
return ZhihuAnwserCrawlerParse.getAnswerList(url,page,endDate, proxy);
}catch (Exception e){
e.toString();
throw e;
}
}
......@@ -428,6 +456,7 @@ public class DataCrawler {
try{
return ZhihuCrawlerParse.getZhihuData(word, timeLimit, proxy, endDate);
}catch (Exception e){
e.toString();
throw e;
}
}
......@@ -443,6 +472,7 @@ public class DataCrawler {
try{
return ZhihuCrawlerParse.getZhihuUser(url, proxy);
}catch (Exception e){
e.toString();
throw e;
}
}
......@@ -458,6 +488,7 @@ public class DataCrawler {
try{
return ZhihuUserAnswerCrawlerParse.getData(userId, proxy);
}catch (Exception e){
e.toString();
throw e;
}
}
......
......@@ -5,12 +5,12 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.media_data_crawler.crawler.WordsReadFile;
import com.zhiwei.media_data_crawler.data.DataCrawler;
import com.zhiwei.media_data_crawler.entity.LunTanData;
import com.zhiwei.proxy.config.SimpleConfig;
/**
* 天涯论坛数据获取
......@@ -25,7 +25,12 @@ public class GetTiayaDataTest {
String startTime = "2019-01-01 00:00:00"; //开始时间
String endTime = "2019-11-08 23:59:59"; //结束时间
ProxyFactory.init("zookeeper://192.168.0.36:2181", "local", GroupType.PROVIDER, 10000008);
//代理地址
String address = "zookeeper://192.168.0.11:2181?backup=192.168.0.30:2181,192.168.0.35:2181";
String appName = "xumaioxin";
long appId = 10000008L;
ProxyFactory.init(SimpleConfig.builder().registry(address).appName(appName).appId(appId).group("local").build());
List<String> wordList = WordsReadFile.getWords(wordFilePath);
List<LunTanData> list = new ArrayList<>();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment