Commit 306c37e1 by chenweitao

修复了些不规范代码导致的bug

parent a402132e
......@@ -173,21 +173,26 @@ public class JsoupHtml {
str = json.getString("html");
doc = Jsoup.parse(str);
Elements a = doc.getElementsByTag("a");
Elements a = doc.getElementsByClass("page S_txt1");
int num = 0;
for (Element e : a)
{
if ("page".equals(e.attr("bpfilter")) && "page S_txt1".equals(e.attr("class")))
{
if (Integer.parseInt(e.text()) > num)
{
num = Integer.parseInt(e.text());
}
}
if ("page".equals(a.last().attr("bpfilter"))) {
num = Integer.parseInt(a.last().text());
}
// for (Element e : a)
// {
// if ("page".equals(e.attr("bpfilter")) && "page S_txt1".equals(e.attr("class")))
// {
// if (Integer.parseInt(e.text()) > num)
// {
// num = Integer.parseInt(e.text());
// }
// }
// }
return num;
}
catch (Exception e) {
e.printStackTrace();
return 0;
}
......
......@@ -36,7 +36,7 @@ public class crawlerQuartz {
long start = System.currentTimeMillis();
String cookie = "SINAGLOBAL=1413878352487.6208.1509610656233; UM_distinctid=16053b396fea93-0a1b0a92fd2d7-5f19311c-1fa400-16053b396ffa9d; un=15757871020; UOR=,www.weibo.com,www.baidu.com; login_sid_t=c3935e41ed072cfa20cbd1462a51e1b6; cross_origin_proto=SSL; _s_tentry=passport.weibo.com; Apache=3630121226743.0815.1520321567340; ULV=1520321568035:15:2:1:3630121226743.0815.1520321567340:1519887954422; SSOLoginState=1520321599; SCF=AuCAEA8HE7llKecGgDC9mXez57Y8TrXZB-bbR3xw1Rg_-hZyItSqXebc8xU1toDVlo8zXAxor6wCq9fcmjOxtlI.; SUHB=0FQrE4a4pmNdGS; ALF=1551948727; SUB=_2AkMt_rd6f8NxqwJRmP0RzWPrbI90wg3EieKbokahJRMxHRl-yT83qlc5tRB6Bn6ZlSJDljRT4MPLb3O5AT8RtbcROuaE; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WWDQpkiHsu8zrdep6H.3aI_; YF-Page-G0=aabeaa17d9557111c805fb15a9959531";
String cookie = "YF-Page-G0=b98b45d9bba85e843a07e69c0880151a; SUB=_2AkMt9Wyxf8NxqwJRmP0RzWPrbI90wg3EieKbqZ1qJRMxHRl-yj83qhwMtRB6BnVCXs0VTavYQWuC4hgT0djGew9Twhmm; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WWDQpkiHsu8zrdep6H.3aI_";
serice.crawlerData(cookie);
long end = System.currentTimeMillis();
log.info(time + "次运行耗时:" + (end - start) + "\t毫秒");
......
......@@ -15,6 +15,7 @@ import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Random;
import javax.annotation.Resource;
......@@ -49,26 +50,31 @@ public class MongoSerivce {
public void crawlerData(String cookie) {
Map<String, String> map = groupSet();
for (String domain : map.keySet()) {
String domainId = map.get(domain);
log.info("【{}】页开始爬取...............", domain);
int sum = parseAndInsert(domain, domainId, cookie);
log.info("【{}】页所有数据爬取结束...............", domain);
tagDao.updateByState(domain, 2);
log.info("【{}】所有页数据存储成功,共计【{}】条数据", domain, sum);
for (Entry<String, String> en : map.entrySet()) {
log.info("【{}】页开始爬取...............", en.getKey());
int sum = parseAndInsert(en.getKey(), en.getValue(), cookie);
log.info("【{}】页所有数据爬取结束...............", en.getKey());
tagDao.updateByState(en.getKey(), 2);
log.info("【{}】所有页数据存储成功,共计【{}】条数据", en.getKey(), sum);
}
log.info("所有页面爬取结束,程序结束");
tagDao.findAll().forEach(a -> tagDao.updateByState(a.getDomain(), 0));
log.info("所有页面爬取结束,程序结束,重置所有主标签状态");
}
/**
 * Fetches the listing page for a domain and extracts its page count, retrying on failure.
 *
 * <p>Each attempt first calls {@code crawler.sleep(3000L)}, then requests the page with
 * {@code crawler.getPage} and hands the response to {@code jsoupHtml.parsePage}. A parsed
 * value of {@code 0} is treated as "no page count found" and triggers another attempt.
 * After the 11th consecutive failure the method logs an error and gives up.
 *
 * @param domainId identifier of the domain whose page count is requested
 * @param cookie   cookie string forwarded unchanged to {@code crawler.getPage}
 * @return the first non-zero page count parsed, or {@code 0} if every attempt failed
 */
private int getPageNum(String domainId, String cookie) {
    int index = 0;
    while (true) {
        // Pause before every request, including the first one.
        // NOTE(review): the 3000L argument is presumably milliseconds - confirm against crawler.sleep.
        crawler.sleep(3000L);
        String page = crawler.getPage(domainId, cookie);
        int num = jsoupHtml.parsePage(page);
        if (num != 0) {
            return num;
        }
        if (++index > 10) {
            // Pass domainId so the {} placeholder is actually filled in the log message.
            log.error("【{}】未获取到页码", domainId);
            return 0;
        }
    }
}
......@@ -158,6 +164,6 @@ public class MongoSerivce {
tagDao.updateByState(key, 0);
}
domainDao.createColl();
// domainDao.createColl();
}
}
......@@ -23,19 +23,16 @@ import com.zhiweidata.weiboDomain.service.MongoSerivce;
* @date 2018年2月23日 下午3:09:33
*/
public class Start {
// private static ApplicationContext ctx = new ClassPathXmlApplicationContext("applicationContext.xml");
// private static MongoSerivce serice = ctx.getBean(MongoSerivce.class);
private static ApplicationContext ctx = new ClassPathXmlApplicationContext("applicationContext.xml");
private static MongoSerivce serice = ctx.getBean(MongoSerivce.class);
public static void main(String[] args) {
ApplicationContext ctx = new ClassPathXmlApplicationContext("applicationContext.xml");
System.out.println("微博热门榜单采集开始...");
//程序主体切换至com.zhiweidata.weiboDomain.quartz定时器
// String cookie = "SINAGLOBAL=1413878352487.6208.1509610656233; UM_distinctid=16053b396fea93-0a1b0a92fd2d7-5f19311c-1fa400-16053b396ffa9d; un=15757871020; UOR=,www.weibo.com,www.baidu.com; login_sid_t=c3935e41ed072cfa20cbd1462a51e1b6; cross_origin_proto=SSL; _s_tentry=passport.weibo.com; Apache=3630121226743.0815.1520321567340; ULV=1520321568035:15:2:1:3630121226743.0815.1520321567340:1519887954422; SSOLoginState=1520321599; SCF=AuCAEA8HE7llKecGgDC9mXez57Y8TrXZB-bbR3xw1Rg_-hZyItSqXebc8xU1toDVlo8zXAxor6wCq9fcmjOxtlI.; SUHB=0FQrE4a4pmNdGS; ALF=1551948727; SUB=_2AkMt_rd6f8NxqwJRmP0RzWPrbI90wg3EieKbokahJRMxHRl-yT83qlc5tRB6Bn6ZlSJDljRT4MPLb3O5AT8RtbcROuaE; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WWDQpkiHsu8zrdep6H.3aI_; YF-Page-G0=aabeaa17d9557111c805fb15a9959531";
// 初始化程序状态,在再次爬取时调用
// String cookie = "YF-Page-G0=b98b45d9bba85e843a07e69c0880151a; SUB=_2AkMt9Wyxf8NxqwJRmP0RzWPrbI90wg3EieKbqZ1qJRMxHRl-yj83qhwMtRB6BnVCXs0VTavYQWuC4hgT0djGew9Twhmm; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WWDQpkiHsu8zrdep6H.3aI_";
// 初始化程序状态,在再次爬取时调用
// 断点续传时,注释掉
// serice.initTag();
// serice.crawlerData(cookie);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment