Commit 306c37e1 by chenweitao

修复了些不规范代码导致的bug

parent a402132e
...@@ -173,21 +173,26 @@ public class JsoupHtml { ...@@ -173,21 +173,26 @@ public class JsoupHtml {
str = json.getString("html"); str = json.getString("html");
doc = Jsoup.parse(str); doc = Jsoup.parse(str);
Elements a = doc.getElementsByTag("a"); Elements a = doc.getElementsByClass("page S_txt1");
int num = 0; int num = 0;
for (Element e : a) if ("page".equals(a.last().attr("bpfilter"))) {
{ num = Integer.parseInt(a.last().text());
if ("page".equals(e.attr("bpfilter")) && "page S_txt1".equals(e.attr("class")))
{
if (Integer.parseInt(e.text()) > num)
{
num = Integer.parseInt(e.text());
}
}
} }
// for (Element e : a)
// {
// if ("page".equals(e.attr("bpfilter")) && "page S_txt1".equals(e.attr("class")))
// {
// if (Integer.parseInt(e.text()) > num)
// {
// num = Integer.parseInt(e.text());
// }
// }
// }
return num; return num;
} }
catch (Exception e) { catch (Exception e) {
e.printStackTrace();
return 0; return 0;
} }
......
...@@ -36,7 +36,7 @@ public class crawlerQuartz { ...@@ -36,7 +36,7 @@ public class crawlerQuartz {
long start = System.currentTimeMillis(); long start = System.currentTimeMillis();
String cookie = "SINAGLOBAL=1413878352487.6208.1509610656233; UM_distinctid=16053b396fea93-0a1b0a92fd2d7-5f19311c-1fa400-16053b396ffa9d; un=15757871020; UOR=,www.weibo.com,www.baidu.com; login_sid_t=c3935e41ed072cfa20cbd1462a51e1b6; cross_origin_proto=SSL; _s_tentry=passport.weibo.com; Apache=3630121226743.0815.1520321567340; ULV=1520321568035:15:2:1:3630121226743.0815.1520321567340:1519887954422; SSOLoginState=1520321599; SCF=AuCAEA8HE7llKecGgDC9mXez57Y8TrXZB-bbR3xw1Rg_-hZyItSqXebc8xU1toDVlo8zXAxor6wCq9fcmjOxtlI.; SUHB=0FQrE4a4pmNdGS; ALF=1551948727; SUB=_2AkMt_rd6f8NxqwJRmP0RzWPrbI90wg3EieKbokahJRMxHRl-yT83qlc5tRB6Bn6ZlSJDljRT4MPLb3O5AT8RtbcROuaE; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WWDQpkiHsu8zrdep6H.3aI_; YF-Page-G0=aabeaa17d9557111c805fb15a9959531"; String cookie = "YF-Page-G0=b98b45d9bba85e843a07e69c0880151a; SUB=_2AkMt9Wyxf8NxqwJRmP0RzWPrbI90wg3EieKbqZ1qJRMxHRl-yj83qhwMtRB6BnVCXs0VTavYQWuC4hgT0djGew9Twhmm; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WWDQpkiHsu8zrdep6H.3aI_";
serice.crawlerData(cookie); serice.crawlerData(cookie);
long end = System.currentTimeMillis(); long end = System.currentTimeMillis();
log.info(time + "次运行耗时:" + (end - start) + "\t毫秒"); log.info(time + "次运行耗时:" + (end - start) + "\t毫秒");
......
...@@ -15,6 +15,7 @@ import java.util.ArrayList; ...@@ -15,6 +15,7 @@ import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Map.Entry;
import java.util.Random; import java.util.Random;
import javax.annotation.Resource; import javax.annotation.Resource;
...@@ -49,26 +50,31 @@ public class MongoSerivce { ...@@ -49,26 +50,31 @@ public class MongoSerivce {
public void crawlerData(String cookie) { public void crawlerData(String cookie) {
Map<String, String> map = groupSet(); Map<String, String> map = groupSet();
for (String domain : map.keySet()) { for (Entry<String, String> en : map.entrySet()) {
String domainId = map.get(domain); log.info("【{}】页开始爬取...............", en.getKey());
log.info("【{}】页开始爬取...............", domain); int sum = parseAndInsert(en.getKey(), en.getValue(), cookie);
int sum = parseAndInsert(domain, domainId, cookie); log.info("【{}】页所有数据爬取结束...............", en.getKey());
log.info("【{}】页所有数据爬取结束...............", domain); tagDao.updateByState(en.getKey(), 2);
tagDao.updateByState(domain, 2); log.info("【{}】所有页数据存储成功,共计【{}】条数据", en.getKey(), sum);
log.info("【{}】所有页数据存储成功,共计【{}】条数据", domain, sum);
} }
log.info("所有页面爬取结束,程序结束"); tagDao.findAll().forEach(a -> tagDao.updateByState(a.getDomain(), 0));
log.info("所有页面爬取结束,程序结束,重置所有主标签状态");
} }
private int getPageNum(String domainId, String cookie) { private int getPageNum(String domainId, String cookie) {
int index = 0;
while (true) { while (true) {
String page = crawler.getPage(domainId, cookie);
crawler.sleep(3000L); crawler.sleep(3000L);
String page = crawler.getPage(domainId, cookie);
int num = jsoupHtml.parsePage(page); int num = jsoupHtml.parsePage(page);
if (num != 0) { if (num != 0) {
return num; return num;
} }
if (++index > 10) {
log.error("【{}】未获取到页码");
return 0;
}
} }
} }
...@@ -158,6 +164,6 @@ public class MongoSerivce { ...@@ -158,6 +164,6 @@ public class MongoSerivce {
tagDao.updateByState(key, 0); tagDao.updateByState(key, 0);
} }
domainDao.createColl(); // domainDao.createColl();
} }
} }
...@@ -23,19 +23,16 @@ import com.zhiweidata.weiboDomain.service.MongoSerivce; ...@@ -23,19 +23,16 @@ import com.zhiweidata.weiboDomain.service.MongoSerivce;
* @date 2018年2月23日 下午3:09:33 * @date 2018年2月23日 下午3:09:33
*/ */
public class Start { public class Start {
// private static ApplicationContext ctx = new ClassPathXmlApplicationContext("applicationContext.xml"); private static ApplicationContext ctx = new ClassPathXmlApplicationContext("applicationContext.xml");
// private static MongoSerivce serice = ctx.getBean(MongoSerivce.class); private static MongoSerivce serice = ctx.getBean(MongoSerivce.class);
public static void main(String[] args) { public static void main(String[] args) {
ApplicationContext ctx = new ClassPathXmlApplicationContext("applicationContext.xml");
System.out.println("微博热门榜单采集开始..."); System.out.println("微博热门榜单采集开始...");
//程序主体切换至com.zhiweidata.weiboDomain.quartz定时器 //程序主体切换至com.zhiweidata.weiboDomain.quartz定时器
// String cookie = "YF-Page-G0=b98b45d9bba85e843a07e69c0880151a; SUB=_2AkMt9Wyxf8NxqwJRmP0RzWPrbI90wg3EieKbqZ1qJRMxHRl-yj83qhwMtRB6BnVCXs0VTavYQWuC4hgT0djGew9Twhmm; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WWDQpkiHsu8zrdep6H.3aI_";
// 初始化程序状态,在再次爬取时调用
// String cookie = "SINAGLOBAL=1413878352487.6208.1509610656233; UM_distinctid=16053b396fea93-0a1b0a92fd2d7-5f19311c-1fa400-16053b396ffa9d; un=15757871020; UOR=,www.weibo.com,www.baidu.com; login_sid_t=c3935e41ed072cfa20cbd1462a51e1b6; cross_origin_proto=SSL; _s_tentry=passport.weibo.com; Apache=3630121226743.0815.1520321567340; ULV=1520321568035:15:2:1:3630121226743.0815.1520321567340:1519887954422; SSOLoginState=1520321599; SCF=AuCAEA8HE7llKecGgDC9mXez57Y8TrXZB-bbR3xw1Rg_-hZyItSqXebc8xU1toDVlo8zXAxor6wCq9fcmjOxtlI.; SUHB=0FQrE4a4pmNdGS; ALF=1551948727; SUB=_2AkMt_rd6f8NxqwJRmP0RzWPrbI90wg3EieKbokahJRMxHRl-yT83qlc5tRB6Bn6ZlSJDljRT4MPLb3O5AT8RtbcROuaE; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WWDQpkiHsu8zrdep6H.3aI_; YF-Page-G0=aabeaa17d9557111c805fb15a9959531";
// 初始化程序状态,在再次爬取时调用
// 断点续传时,注释掉 // 断点续传时,注释掉
// serice.initTag(); // serice.initTag();
// serice.crawlerData(cookie); // serice.crawlerData(cookie);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment