Commit 8fbf7592 by chenweitao

检查性维护

parent 306c37e1
...@@ -65,16 +65,20 @@ public class MongoSerivce { ...@@ -65,16 +65,20 @@ public class MongoSerivce {
private int getPageNum(String domainId, String cookie) { private int getPageNum(String domainId, String cookie) {
int index = 0; int index = 0;
while (true) { while (true) {
crawler.sleep(3000L); if (index > 10) {
log.error("【{}】未获取到页码",domainId);
return 0;
}
String page = crawler.getPage(domainId, cookie); String page = crawler.getPage(domainId, cookie);
if (page == null||"".equals(page)) {
log.info("获取页码失败,tag【{}】,重试【{}】",domainId, ++index);
continue;
}
int num = jsoupHtml.parsePage(page); int num = jsoupHtml.parsePage(page);
if (num != 0) { if (num != 0) {
return num; return num;
} }
if (++index > 10) { crawler.sleep(5000);
log.error("【{}】未获取到页码");
return 0;
}
} }
} }
......
...@@ -31,7 +31,7 @@ public class Start { ...@@ -31,7 +31,7 @@ public class Start {
System.out.println("微博热门榜单采集开始..."); System.out.println("微博热门榜单采集开始...");
//程序主体切换至com.zhiweidata.weiboDomain.quartz定时器 //程序主体切换至com.zhiweidata.weiboDomain.quartz定时器
// String cookie = "YF-Page-G0=b98b45d9bba85e843a07e69c0880151a; SUB=_2AkMt9Wyxf8NxqwJRmP0RzWPrbI90wg3EieKbqZ1qJRMxHRl-yj83qhwMtRB6BnVCXs0VTavYQWuC4hgT0djGew9Twhmm; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WWDQpkiHsu8zrdep6H.3aI_"; String cookie = "YF-Page-G0=b98b45d9bba85e843a07e69c0880151a; SUB=_2AkMt9Wyxf8NxqwJRmP0RzWPrbI90wg3EieKbqZ1qJRMxHRl-yj83qhwMtRB6BnVCXs0VTavYQWuC4hgT0djGew9Twhmm; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WWDQpkiHsu8zrdep6H.3aI_";
// 初始化程序状态,在再次爬取时调用 // 初始化程序状态,在再次爬取时调用
// 断点续传时,注释掉 // 断点续传时,注释掉
// serice.initTag(); // serice.initTag();
......
package com.zhiweidata.weiboDomain.utils;
/**
 * Static helpers that translate raw Weibo codes (verification type, gender,
 * original/repost flag) into the Chinese display labels used by this project.
 *
 * <p>Created 2015-05-26.
 *
 * @author 志伟
 */
public class TreatVtype {

    /**
     * Converts a numeric verification-type code into its display label.
     *
     * @param vtype numeric verification-type code
     * @return the label for the code, or an empty string when the code is not
     *         one of the known values
     */
    public static String changeVtype(int vtype) {
        switch (vtype) {
            case -2:
                return "未知";
            case -1:
                return "普通用户";
            case 0:
                return "名人";
            case 1:
                return "政府";
            case 2:
                return "企业";
            case 3:
                return "媒体";
            case 4:
                return "校园";
            case 5:
                return "网站";
            case 6:
                return "应用";
            case 7:
                return "团体";
            case 10:
                return "微博女郎";
            case 200:
            case 220:
                // Both codes map to the same label.
                return "达人";
            default:
                return "";
        }
    }

    /**
     * Converts a one-letter verification-type code into its display label.
     *
     * @param vtype one of {@code "n"}, {@code "b"}, {@code "y"}, {@code "d"};
     *              may be {@code null}
     * @return the label for the code, or an empty string when the code is
     *         {@code null} or not recognised
     */
    public static String changeVtype(String vtype) {
        // Constant-first equals keeps a null argument from throwing.
        if ("n".equals(vtype)) {
            return "普通用户";
        }
        if ("b".equals(vtype)) {
            return "蓝V";
        }
        if ("y".equals(vtype)) {
            return "黄V";
        }
        if ("d".equals(vtype)) {
            return "达人";
        }
        return "";
    }

    /**
     * Converts a one-letter gender code into its display label.
     *
     * @param sex {@code "f"}, {@code "m"} or {@code "n"}; may be {@code null}
     * @return "女" for {@code "f"}, "男" for {@code "m"}, "未知" for
     *         {@code "n"} or {@code null}; any other value is returned
     *         unchanged
     */
    public static String changeSex(String sex) {
        // A missing gender is reported with the same label as the explicit
        // "n" code instead of raising a NullPointerException.
        if (sex == null || "n".equals(sex)) {
            return "未知";
        }
        if ("f".equals(sex)) {
            return "女";
        }
        if ("m".equals(sex)) {
            return "男";
        }
        return sex;
    }

    /**
     * Converts the original/repost flag of a post into its display label.
     *
     * @param isforword 0 for an original post, 1 for a repost
     * @return "原创" for 0, "转发" for 1, otherwise an empty string
     */
    public static String changeIsForword(int isforword) {
        if (isforword == 0) {
            return "原创";
        }
        if (isforword == 1) {
            return "转发";
        }
        return "";
    }
}
#mongo.serverMongoIp=127.0.0.1 #mongo.serverMongoIp=127.0.0.1
mongo.serverMongoIp=115.236.59.91 mongo.serverMongoIp=202.107.192.94
mongo.dbName=weiboDomain mongo.dbName=weiboDomain
#mongo.ef.username=rsync
#mongo.ef.pwd=rsync1q2w3e4r
#mongo.ef.defaultDB=admin
mongo.ef.username=cwtno
mongo.ef.pwd=cwtno1q2w3e4r
mongo.ef.defaultDB=admin
...@@ -40,7 +40,7 @@ ...@@ -40,7 +40,7 @@
<property name="typeMapper" ref="defaultMongoTypeMapper" /> <property name="typeMapper" ref="defaultMongoTypeMapper" />
</bean> </bean>
<!-- 配置数据库相关配置 --> <!-- 配置数据库相关配置 -->
<mongo:mongo-client id="Mongo" host="${mongo.serverMongoIp}" port="27017"/> <mongo:mongo-client id="Mongo" host="${mongo.serverMongoIp}" port="30000" credentials="${mongo.ef.username}:${mongo.ef.pwd}@${mongo.ef.defaultDB}"/>
<mongo:db-factory id="Factory" dbname="${mongo.dbName}" <mongo:db-factory id="Factory" dbname="${mongo.dbName}"
mongo-ref="Mongo" /> mongo-ref="Mongo" />
<mongo:template id="template" converter-ref="mappingMongoConverter" <mongo:template id="template" converter-ref="mappingMongoConverter"
......
package com.zhiweidata.weiboDomain.crawler;
import javax.annotation.Resource;
import org.junit.Test;
import com.zhiweidata.weiboDomain.ObjectTest;
import com.zhiweidata.weiboDomain.dao.DomainDao;
import com.zhiweidata.weiboDomain.dao.TagDao;
/**
 * Manual checks for the Weibo domain crawler; results are printed to standard
 * output rather than asserted.
 */
public class CrawlerTest extends ObjectTest {

    @Resource
    DomainDao domainDao;

    @Resource
    TagDao tagDao;

    /**
     * For every stored tag, fetches its page through the crawler and prints
     * either the parsed page count plus the URL, or the URL alone when the
     * crawler returned no page.
     */
    @Test
    public void test2() {
        final WeiboDomainCrawler crawler = new WeiboDomainCrawler();
        final JsoupHtml parser = JsoupHtml.getInstance();
        // NOTE(review): this looks like a captured login session; confirm it
        // is safe to keep in version control.
        final String sessionCookie = "SINAGLOBAL=1523184237048.3672.1521084294046; UOR=,,www.baidu.com; login_sid_t=e1701f6447d260c7bdb34906162c80e8; cross_origin_proto=SSL; _s_tentry=passport.weibo.com; Apache=6087081085858.952.1545809452672; ULV=1545809452676:23:3:1:6087081085858.952.1545809452672:1544841771587; YF-Page-G0=1ffbef18656bf02c17e45a764e3d5336; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WhJTOikRJv51mSslx-nU-WA5JpX5K2hUgL.Foe0eK571h-f1K-2dJLoIEXLxKBLB.eL1-2LxK.LBKeL1KnLxKML1-2L1hBLxKqL1h.LB-zLxKqL122L122t; SSOLoginState=1547099803; ALF=1578635821; SCF=AuCAEA8HE7llKecGgDC9mXez57Y8TrXZB-bbR3xw1Rg_LDOkUG0Qj0SGSDLEvIMQSNKl6aF3mXb0ucWr77cjGH4.; SUB=_2A25xMq7_DeRhGeVN6lIR-CvJwjmIHXVSSYc3rDV8PUNbmtBeLRX2kW9NTHXVKUPFRv4an3QvDUPEjSX37t9jiiWn; SUHB=05i6gOaQs8TYWb; un=15757871020; wvr=6; wb_view_log_3310085595=1920*10801";

        tagDao.findAll().forEach(tag -> {
            // The URL is only printed for reference; the request itself is
            // made by the crawler from the domain id and cookie.
            StringBuilder link = new StringBuilder("https://d.weibo.com/");
            link.append(tag.getDomainId())
                    .append("?pids=Pl_Core_F4RightUserList__4")
                    .append("&page=")
                    .append(1)
                    .append("&ajaxpagelet=1&__ref=/")
                    .append(tag.getDomainId())
                    .append("&_t=FM_")
                    .append(System.currentTimeMillis())
                    // Appended as text after the timestamp, not added to it.
                    .append(100);

            String html = crawler.getPage(tag.getDomainId(), sessionCookie);
            if (html == null) {
                System.out.println("失效页码:" + link);
                return;
            }
            System.out.println("页码:" + parser.parsePage(html));
            System.out.println(link);
        });
    }

    /** Smoke test that only prints a marker line. */
    @Test
    public void test() {
        System.out.println("testRunning");
    }
}
package com.zhiweidata.weiboDomain.dao; package com.zhiweidata.weiboDomain.dao;
import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
import java.util.List;
import java.util.Map; import java.util.Map;
import javax.annotation.Resource; import javax.annotation.Resource;
...@@ -11,9 +13,14 @@ import org.jsoup.nodes.Element; ...@@ -11,9 +13,14 @@ import org.jsoup.nodes.Element;
import org.jsoup.select.Elements; import org.jsoup.select.Elements;
import org.junit.Test; import org.junit.Test;
import com.mongodb.BasicDBObject;
import com.mongodb.DBObject;
import com.zhiweidata.weiboDomain.ObjectTest; import com.zhiweidata.weiboDomain.ObjectTest;
import com.zhiweidata.weiboDomain.crawler.WeiboDomainCrawler; import com.zhiweidata.weiboDomain.crawler.WeiboDomainCrawler;
import com.zhiweidata.weiboDomain.entity.DomainTag; import com.zhiweidata.weiboDomain.entity.DomainTag;
import com.zhiweidata.weiboDomain.entity.WeiboDomain;
import com.zhiweidata.weiboDomain.excel.DBOExp;
import com.zhiweidata.weiboDomain.utils.TreatVtype;
import net.sf.json.JSONObject; import net.sf.json.JSONObject;
...@@ -25,7 +32,7 @@ public class DomainDaoTest extends ObjectTest { ...@@ -25,7 +32,7 @@ public class DomainDaoTest extends ObjectTest {
@Resource @Resource
TagDao tagDao; TagDao tagDao;
// @Test // @Test
public void insertTagTest() { public void insertTagTest() {
DomainTag domainTag = new DomainTag(); DomainTag domainTag = new DomainTag();
String id = "24H热门"; String id = "24H热门";
...@@ -36,6 +43,43 @@ public class DomainDaoTest extends ObjectTest { ...@@ -36,6 +43,43 @@ public class DomainDaoTest extends ObjectTest {
tagDao.insert(domainTag); tagDao.insert(domainTag);
} }
/**
 * Exports the users of every stored tag's domain to one .xls file per domain
 * under a fixed local directory. Domains with no users produce no file.
 */
@Test
public void exTest() {
    final String exportDir = "E:\\出的数据\\微博栏目数据\\";
    for (DomainTag tag : tagDao.findAll()) {
        List<DBObject> rows = new ArrayList<>();
        for (WeiboDomain user : domainDao.findByDomain(tag.getDomain())) {
            // Keys are inserted in the same order as before.
            DBObject row = new BasicDBObject();
            row.put("栏目", user.getDomain());
            row.put("标签", user.getTag());
            row.put("uid", user.getUid());
            row.put("用户昵称", user.getName());
            row.put("性别", TreatVtype.changeSex(user.getGender()));
            row.put("地域", user.getLocation());
            row.put("简介", user.getDescription());
            row.put("粉丝数", user.getFollowers_count());
            row.put("关注数", user.getFriends_count());
            row.put("微博数", user.getStatuses_count());
            row.put("认证类别", user.getvType());
            rows.add(row);
        }
        if (rows.isEmpty()) {
            // Nothing to export for this domain.
            continue;
        }
        DBOExp exporter = new DBOExp();
        exporter.putRun(rows, exportDir + tag.getDomain() + "用户.xls", tag.getDomain());
    }
}
/** /**
* *
* @Title: insertTagFirstTest * @Title: insertTagFirstTest
...@@ -46,7 +90,7 @@ public class DomainDaoTest extends ObjectTest { ...@@ -46,7 +90,7 @@ public class DomainDaoTest extends ObjectTest {
* 陈炜涛 * 陈炜涛
* @date: 2018年3月13日 下午5:53:34 * @date: 2018年3月13日 下午5:53:34
*/ */
// @Test // @Test
public void insertTagFirstTest() { public void insertTagFirstTest() {
WeiboDomainCrawler crawler = new WeiboDomainCrawler(); WeiboDomainCrawler crawler = new WeiboDomainCrawler();
String url = "https://d.weibo.com/1087030002_2975_1003_0#"; String url = "https://d.weibo.com/1087030002_2975_1003_0#";
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment