Commit f245927d by chenweitao

大量重构后正常自动化采集版本

parent 158abbbc
......@@ -15,8 +15,6 @@ import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.List;
import javax.print.Doc;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
......@@ -73,21 +71,25 @@ public class JsoupHtml {
Element S_txt1 = div.getElementsByClass("S_txt1").get(0);
uid = S_txt1.attr("usercard").split("&")[0].replaceAll("id=", "");
weiboDomainGroup.setUid(uid);
weiboDomainGroup.setUrl(S_txt1.attr("href"));
weiboDomainGroup.setUrl("https:"+S_txt1.attr("href"));
weiboDomainGroup.setName(S_txt1.attr("title"));
Elements i = div.getElementsByTag("i");
if (i.size()>0) {
weiboDomainGroup.setvType(i.get(0).attr("title"));;
}
for (Element ele : i)
{
if (ele.attr("class").equals("W_icon icon_member"))
{
weiboDomainGroup.setVip(true);
}
// if (ele.attr("class").equals("W_icon icon_approve"))
// {
// weiboDomainGroup.setVType(ele.attr("title"));;
// }
if (ele.attr("class").equals("W_icon icon_male"))
{
weiboDomainGroup.setGender("m");
}
else
else if(ele.attr("class").equals("W_icon icon_female"))
{
weiboDomainGroup.setGender("f");
}
......@@ -112,8 +114,13 @@ public class JsoupHtml {
}
if (div.attr("class").equals("info_relation"))
{
String tag = div.text().split(":")[1];
weiboDomainGroup.setTag(tag);
try {
String tag = div.text().split(":")[1];
weiboDomainGroup.setTag(tag);
} catch (Exception e) {
weiboDomainGroup.setTag(div.text());
e.printStackTrace();
}
}
}
weiboDomainGroup.setDomain(domain);
......
......@@ -88,7 +88,8 @@ public class WeiboDomainCrawler {
private String get(HttpGet httpGet, Predicate<String> predicate) {
boolean flag = true;
while(flag)
int i = 3;
while(flag&&i-- >0)
{
try
{
......@@ -107,6 +108,7 @@ public class WeiboDomainCrawler {
}
catch (Exception e)
{
e.getMessage();
sleep(3000L);
}
}
......
......@@ -34,4 +34,6 @@ public interface DomainDao{
public void createColl();
public String bestNewCollName();
public void insert(WeiboDomain weiboDomain);
}
......@@ -27,4 +27,6 @@ public interface TagDao {
public void updateByState(String domain,Integer state);
public void insert(DomainTag domainTag);
}
......@@ -88,4 +88,9 @@ public class DomainDaoImpl implements DomainDao{
String collName = "weiboDomain"+time;
mongoTemplate.createCollection(collName);
}
/**
 * Inserts a single {@link WeiboDomain} record by handing it to {@code mongoTemplate.insert}.
 *
 * @param weiboDomain the record to store
 */
@Override
public void insert(WeiboDomain weiboDomain) {
mongoTemplate.insert(weiboDomain);
}
}
......@@ -54,4 +54,9 @@ public class TagDaoImpl implements TagDao{
return mongoTemplate.find(query, DomainTag.class);
}
/**
 * Inserts a single {@link DomainTag} record by handing it to {@code mongoTemplate.insert}.
 *
 * @param domainTag the tag record to store
 */
@Override
public void insert(DomainTag domainTag) {
mongoTemplate.insert(domainTag);
}
}
......@@ -14,15 +14,12 @@ import java.util.List;
import org.springframework.data.mongodb.core.mapping.Document;
import lombok.Data;
/**
* @ClassName: DomainTag
* @Description: TODO(这里用一句话描述这个类的作用)
* @author xuyimeng
* @date 2018年2月23日 下午3:23:31
*/
@Data
@Document(collection = "domainTag")
public class DomainTag {
private String _id;
......@@ -30,4 +27,35 @@ public class DomainTag {
private String domain;
private String domainId;
private Integer state;
public String get_id() {
return _id;
}
public void set_id(String _id) {
this._id = _id;
}
public List<String> getTags() {
return tags;
}
public void setTags(List<String> tags) {
this.tags = tags;
}
public String getDomain() {
return domain;
}
public void setDomain(String domain) {
this.domain = domain;
}
public String getDomainId() {
return domainId;
}
public void setDomainId(String domainId) {
this.domainId = domainId;
}
public Integer getState() {
return state;
}
public void setState(Integer state) {
this.state = state;
}
}
......@@ -12,8 +12,7 @@ package com.zhiweidata.weiboDomain.entity;
import org.springframework.data.annotation.Id;
import org.springframework.data.mongodb.core.index.Indexed;
import lombok.Data;
import org.springframework.data.mongodb.core.mapping.Document;
/**
* @ClassName: entity
......@@ -21,7 +20,7 @@ import lombok.Data;
* @author xuyimeng
* @date 2018年2月23日 下午2:37:27
*/
@Data
@Document(collection = "weiboDomain")
public class WeiboDomain {
@Id
private String id;
......@@ -38,6 +37,91 @@ public class WeiboDomain {
private String followers_count;
private Integer friends_count;
private Integer statuses_count;
private boolean isVip;
private String vType;
private String updateTime;
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public String getUid() {
return uid;
}
public void setUid(String uid) {
this.uid = uid;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public String getUrl() {
return url;
}
public void setUrl(String url) {
this.url = url;
}
public String getGender() {
return gender;
}
public void setGender(String gender) {
this.gender = gender;
}
public String getLocation() {
return location;
}
public void setLocation(String location) {
this.location = location;
}
public String getDescription() {
return description;
}
public void setDescription(String description) {
this.description = description;
}
public String getDomain() {
return domain;
}
public void setDomain(String domain) {
this.domain = domain;
}
public String getTag() {
return tag;
}
public void setTag(String tag) {
this.tag = tag;
}
public String getFollowers_count() {
return followers_count;
}
public void setFollowers_count(String followers_count) {
this.followers_count = followers_count;
}
public Integer getFriends_count() {
return friends_count;
}
public void setFriends_count(Integer friends_count) {
this.friends_count = friends_count;
}
public Integer getStatuses_count() {
return statuses_count;
}
public void setStatuses_count(Integer statuses_count) {
this.statuses_count = statuses_count;
}
public String getvType() {
return vType;
}
public void setvType(String vType) {
this.vType = vType;
}
public String getUpdateTime() {
return updateTime;
}
public void setUpdateTime(String updateTime) {
this.updateTime = updateTime;
}
}
\ No newline at end of file
......@@ -10,6 +10,9 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.jxlzw.report.model.HLink;
import jxl.Cell;
......@@ -28,7 +31,6 @@ import jxl.write.WritableSheet;
import jxl.write.WritableWorkbook;
import jxl.write.WriteException;
import jxl.write.biff.RowsExceededException;
import lombok.extern.slf4j.Slf4j;
/**
......@@ -39,9 +41,10 @@ import lombok.extern.slf4j.Slf4j;
* @author Administrator
* @date 2015年11月20日 下午4:52:02
*/
@Slf4j
public class SimpeExcelReport
{
private static Logger log = LoggerFactory.getLogger(SimpeExcelReport.class);
private List<Map<String, Object>> bodyList;
private List<String> headList;
......
package com.zhiweidata.weiboDomain.quartz;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import javax.annotation.Resource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import com.zhiweidata.weiboDomain.service.MongoSerivce;
/**
 * Spring-scheduled entry point that triggers one full crawl per day through {@link MongoSerivce}.
 */
@Component
public class crawlerQuartz {
    private static final Logger log = LoggerFactory.getLogger(crawlerQuartz.class);

    /** Formatter for the run label written to the log; thread-safe, so it is built once. */
    private static final DateTimeFormatter RUN_LABEL_FORMAT = DateTimeFormatter.ofPattern("yyyy-MM-dd HH");

    /**
     * Cookie header sent with every crawl request.
     * NOTE(review): this is a hard-coded login session value committed to source control; it should
     * be moved to external configuration and rotated.
     */
    private static final String COOKIE = "SINAGLOBAL=1413878352487.6208.1509610656233; UM_distinctid=16053b396fea93-0a1b0a92fd2d7-5f19311c-1fa400-16053b396ffa9d; un=15757871020; UOR=,www.weibo.com,www.baidu.com; login_sid_t=c3935e41ed072cfa20cbd1462a51e1b6; cross_origin_proto=SSL; _s_tentry=passport.weibo.com; Apache=3630121226743.0815.1520321567340; ULV=1520321568035:15:2:1:3630121226743.0815.1520321567340:1519887954422; SSOLoginState=1520321599; SCF=AuCAEA8HE7llKecGgDC9mXez57Y8TrXZB-bbR3xw1Rg_-hZyItSqXebc8xU1toDVlo8zXAxor6wCq9fcmjOxtlI.; SUHB=0FQrE4a4pmNdGS; ALF=1551948727; SUB=_2AkMt_rd6f8NxqwJRmP0RzWPrbI90wg3EieKbokahJRMxHRl-yT83qlc5tRB6Bn6ZlSJDljRT4MPLb3O5AT8RtbcROuaE; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WWDQpkiHsu8zrdep6H.3aI_; YF-Page-G0=aabeaa17d9557111c805fb15a9959531";

    @Resource
    MongoSerivce serice;

    /**
     * Runs the crawl once a day at 20:00 (cron {@code 0 0 20 * * *}) and logs how long it took.
     *
     * <p>The elapsed time is logged in a {@code finally} block so a failing crawl still leaves a
     * timing record; the exception itself is rethrown unchanged.
     */
    @Scheduled(cron = "0 0 20 * * *")
    public void execute() {
        String time = LocalDateTime.now().format(RUN_LABEL_FORMAT);
        long start = System.currentTimeMillis();
        try {
            serice.crawlerData(COOKIE);
        } finally {
            long elapsed = System.currentTimeMillis() - start;
            // Same rendered text as before, but built by SLF4J instead of string concatenation.
            log.info("{}次运行耗时:{}\t毫秒", time, elapsed);
        }
    }
}
......@@ -5,18 +5,22 @@
* @author xuyimeng
* @date 2018年2月23日 下午4:55:00
* @version V1.0
*/ /**
*
*/
/**
*
*/
package com.zhiweidata.weiboDomain.service;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import javax.annotation.Resource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Service;
import com.zhiweidata.weiboDomain.crawler.JsoupHtml;
......@@ -26,130 +30,134 @@ import com.zhiweidata.weiboDomain.dao.TagDao;
import com.zhiweidata.weiboDomain.entity.DomainTag;
import com.zhiweidata.weiboDomain.entity.WeiboDomain;
import lombok.extern.slf4j.Slf4j;
/**
* @ClassName: Serivce
* @Description: TODO(这里用一句话描述这个类的作用)
* @author xuyimeng
* @date 2018年2月23日 下午4:55:00
/**
* @ClassName: Serivce
* @Description: TODO(这里用一句话描述这个类的作用)
* @author xuyimeng
* @date 2018年2月23日 下午4:55:00
*/
@Slf4j
@Service
public class MongoSerivce {
@Resource
TagDao tagDao;
@Resource
DomainDao domainDao;
private static Logger log = LoggerFactory.getLogger(MongoSerivce.class);
WeiboDomainCrawler crawler = new WeiboDomainCrawler();
JsoupHtml jsoupHtml = JsoupHtml.getInstance();
public void crawlerData(String cookie) {
Map<String, String> map = groupSet();
for (String domain : map.keySet())
{
for (String domain : map.keySet()) {
String domainId = map.get(domain);
log.info("【{}】页开始爬取...............",domain);
List<WeiboDomain> list = parse(domain, domainId, cookie);
log.info("【{}】页所有数据爬取结束...............",domain);
domainDao.insert(list);
log.info("【{}】页开始爬取...............", domain);
int sum = parseAndInsert(domain, domainId, cookie);
log.info("【{}】页所有数据爬取结束...............", domain);
tagDao.updateByState(domain, 2);
log.info("【{}】所有页数据存储成功,共计【{}】条数据",domain,list.size());
log.info("【{}】所有页数据存储成功,共计【{}】条数据", domain, sum);
}
log.info("所有页面爬取结束,程序结束");
}
private int getPageNum(String domainId,String cookie) {
while (true)
{
private int getPageNum(String domainId, String cookie) {
while (true) {
String page = crawler.getPage(domainId, cookie);
crawler.sleep(3000L);
int num = jsoupHtml.parsePage(page);
if (num != 0)
{
if (num != 0) {
return num;
}
}
}
private List<WeiboDomain> parse(String domain,String domainId,String cookie) {
private List<WeiboDomain> parse(String domain, String domainId, String cookie) {
List<WeiboDomain> result = new ArrayList<>();
int num = getPageNum(domainId, cookie);
int i = 1;
while (i<300)
{
String url = "https://d.weibo.com/"+domainId+"?pids=Pl_Core_F4RightUserList__4"
+ "&page="+i+"&ajaxpagelet=1&__ref=/"+domainId;
String html = crawler.getHtml(url, cookie);
if (html == null)
{
if ((result.size()/10)+2 < num)
{
continue;
}
else {
break;
}
log.info("【{}】:共【{}】页", domain, num);
Random rand = new Random();
for (int i = 1; i < num + 1; i++) {
String url = "https://d.weibo.com/" + domainId + "?pids=Pl_Core_F4RightUserList__4" + "&page=" + i
+ "&ajaxpagelet=1&__ref=/" + domainId + "&_t=FM_" + System.currentTimeMillis()
+ (rand.nextInt(89) + 10);
String html = crawler.getHtml(url, cookie);
System.out.println(html);
if (html != null) {
List<WeiboDomain> list = jsoupHtml.parseData(html, domain);
result.addAll(list);
log.info("【{}】:第【{}】页爬取成功", domain, i);
} else {
log.info("【{}】:第【{}】页爬取失败", domain, i);
}
List<WeiboDomain> list = jsoupHtml.parseData(html,domain);
result.addAll(list);
log.info("【{}】:第【{}】页爬取成功",domain,i);
i++;
crawler.sleep(3000L);
crawler.sleep(6000L);
}
return result;
}
private Map<String,String> groupSet(){
/**
 * Fetches every list page of one domain and stores each page's records right after parsing it.
 *
 * <p>A page whose HTML comes back {@code null} is logged as failed and skipped. The crawler
 * sleeps 6 seconds after every page, whether or not it succeeded.
 *
 * @param domain   domain label, used for parsing and log output
 * @param domainId domain identifier embedded in the request URL
 * @param cookie   cookie header value passed to the crawler
 * @return total number of records handed to {@code domainDao.insert}
 */
private int parseAndInsert(String domain, String domainId, String cookie) {
    final int pageCount = getPageNum(domainId, cookie);
    log.info("【{}】:共【{}】页", domain, pageCount);
    final Random random = new Random();
    int stored = 0;
    for (int page = 1; page <= pageCount; page++) {
        // The "_t" parameter is the current millisecond timestamp followed by a random number in 10..98.
        StringBuilder url = new StringBuilder("https://d.weibo.com/")
                .append(domainId)
                .append("?pids=Pl_Core_F4RightUserList__4&page=")
                .append(page)
                .append("&ajaxpagelet=1&__ref=/")
                .append(domainId)
                .append("&_t=FM_")
                .append(System.currentTimeMillis())
                .append(random.nextInt(89) + 10);
        String html = crawler.getHtml(url.toString(), cookie);
        if (html == null) {
            log.info("【{}】:第【{}】页爬取失败", domain, page);
        } else {
            List<WeiboDomain> records = jsoupHtml.parseData(html, domain);
            domainDao.insert(records);
            stored += records.size();
            log.info("【{}】:第【{}】页爬取成功,存储【{}】条数据", domain, page, records.size());
        }
        crawler.sleep(6000L);
    }
    return stored;
}
private Map<String, String> groupSet() {
Map<String, String> result = new HashMap<>();
List<DomainTag> list = tagDao.findByState(1);
for (DomainTag domainTag : list)
{
for (DomainTag domainTag : list) {
String key = domainTag.getDomain();
String value = domainTag.getDomainId();
result.put(key, value);
}
for (String key : result.keySet())
{
for (String key : result.keySet()) {
tagDao.updateByState(key, 0);
}
return result;
}
/**
* @Title: initTag
* @Description: TODO(初始化所有tag状态,并创建新的表,在再次爬取时调用)
* void 返回类型
* @Title: initTag
* @Description: TODO(初始化所有tag状态,并创建新的表,在再次爬取时调用) void 返回类型
*/
public void initTag() {
Map<String, String> result = new HashMap<>();
List<DomainTag> list = tagDao.findAll();
for (DomainTag domainTag : list)
{
for (DomainTag domainTag : list) {
String key = domainTag.getDomain();
String value = domainTag.getDomainId();
result.put(key, value);
}
for (String key : result.keySet())
{
for (String key : result.keySet()) {
tagDao.updateByState(key, 0);
}
domainDao.createColl();
}
}
......@@ -15,6 +15,7 @@ import org.springframework.context.support.ClassPathXmlApplicationContext;
import com.zhiweidata.weiboDomain.service.MongoSerivce;
/**
* @ClassName: Main
* @Description: TODO(这里用一句话描述这个类的作用)
......@@ -22,14 +23,20 @@ import com.zhiweidata.weiboDomain.service.MongoSerivce;
* @date 2018年2月23日 下午3:09:33
*/
public class Start {
private static ApplicationContext ctx = new ClassPathXmlApplicationContext("spring-context.xml");
private static MongoSerivce serice = ctx.getBean(MongoSerivce.class);
// private static ApplicationContext ctx = new ClassPathXmlApplicationContext("applicationContext.xml");
// private static MongoSerivce serice = ctx.getBean(MongoSerivce.class);
public static void main(String[] args) {
String cookie = "login_sid_t=2da8770fb84cdb5be026bbfcd76ef1e6; cross_origin_proto=SSL; _s_tentry=passport.weibo.com; Apache=873655794108.0503.1519525903336; SINAGLOBAL=873655794108.0503.1519525903336; ULV=1519525903344:1:1:1:873655794108.0503.1519525903336:; SSOLoginState=1519525975; SCF=AqU8lfV6ROhTkYEEmVi2ROhtdMxlB0mT3EF2ABKenC3OfC3SeK3YfvZYWFJY8ytsaFhYcc1vO5hvhLwolzBW5ps.; SUB=_2A253llAIDeRhGeNH6VoY9C7Mzz-IHXVU4sbArDV8PUNbmtBeLUnSkW9NStghaGFgK4WPoq15L2ikM_srwT7hNvkI; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W5eochNrdf3XKPD1VaPcG3T5JpX5K2hUgL.Fo-4eon4Sh57She2dJLoIEQLxK-LBKBLBo2LxKBLBo.L12zLxK-L1-BLBKqLxKML1hBLBoqEeh2ceh-t; SUHB=0mxUFkR8aaPo5m; ALF=1551061975; un=18395807152; wvr=6; YF-Page-G0=416186e6974c7d5349e42861f3303251";
ApplicationContext ctx = new ClassPathXmlApplicationContext("applicationContext.xml");
System.out.println("微博热门榜单采集开始...");
MongoSerivce serice = ctx.getBean(MongoSerivce.class);
serice.equals("");
//程序主体切换至com.zhiweidata.weiboDomain.quartz定时器
// String cookie = "SINAGLOBAL=1413878352487.6208.1509610656233; UM_distinctid=16053b396fea93-0a1b0a92fd2d7-5f19311c-1fa400-16053b396ffa9d; un=15757871020; UOR=,www.weibo.com,www.baidu.com; login_sid_t=c3935e41ed072cfa20cbd1462a51e1b6; cross_origin_proto=SSL; _s_tentry=passport.weibo.com; Apache=3630121226743.0815.1520321567340; ULV=1520321568035:15:2:1:3630121226743.0815.1520321567340:1519887954422; SSOLoginState=1520321599; SCF=AuCAEA8HE7llKecGgDC9mXez57Y8TrXZB-bbR3xw1Rg_-hZyItSqXebc8xU1toDVlo8zXAxor6wCq9fcmjOxtlI.; SUHB=0FQrE4a4pmNdGS; ALF=1551948727; SUB=_2AkMt_rd6f8NxqwJRmP0RzWPrbI90wg3EieKbokahJRMxHRl-yT83qlc5tRB6Bn6ZlSJDljRT4MPLb3O5AT8RtbcROuaE; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WWDQpkiHsu8zrdep6H.3aI_; YF-Page-G0=aabeaa17d9557111c805fb15a9959531";
// 初始化程序状态,在再次爬取时调用
// 断点续传时,注释掉
// serice.initTag();
serice.crawlerData(cookie);
// serice.crawlerData(cookie);
}
}
<?xml version="1.0" encoding="UTF-8"?>
<beans xmlns="http://www.springframework.org/schema/beans"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:context="http://www.springframework.org/schema/context"
xmlns:aop="http://www.springframework.org/schema/aop" xmlns:tx="http://www.springframework.org/schema/tx"
xmlns:mvc="http://www.springframework.org/schema/mvc" xmlns:task="http://www.springframework.org/schema/task"
xsi:schemaLocation="http://www.springframework.org/schema/context
http://www.springframework.org/schema/context/spring-context-4.2.xsd
http://www.springframework.org/schema/beans
http://www.springframework.org/schema/beans/spring-beans-4.2.xsd
http://www.springframework.org/schema/tx
http://www.springframework.org/schema/tx/spring-tx-4.2.xsd
http://www.springframework.org/schema/aop
http://www.springframework.org/schema/aop/spring-aop-4.2.xsd
http://www.springframework.org/schema/task
http://www.springframework.org/schema/task/spring-task.xsd">
<!-- 开启注解处理器 -->
<context:annotation-config />
<!-- 开启aspectj自动代理,可以使用@aspectj注解 -->
<aop:aspectj-autoproxy expose-proxy="true" />
<!-- 基于注解方式的定时器 -->
<task:annotation-driven/>
<!-- 开启组件自动扫描,扫描路径由base-package属性指定 -->
<context:component-scan base-package="com.zhiweidata.weiboDomain" />
<!-- 配置文件导入 -->
<context:property-placeholder location="classpath*:*.properties" />
<!-- 导入Mongo配置文件 -->
<import resource="mongoContext.xml" />
</beans>
\ No newline at end of file
log4j.rootLogger=info,stdout,R
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.logger.org.apache.commons.httpclient=info
log4j.logger.httpclient.wire.content=info
log4j.logger.httpclient.wire.header=info
# Console pattern: elapsed milliseconds, level, timestamp and message (no caller location; the file appender below adds %l).
log4j.appender.stdout.layout.ConversionPattern=%-4r %-5p [%d{yyyy-MM-dd HH:mm:ss}] %m%n
log4j.appender.R=org.apache.log4j.RollingFileAppender
log4j.appender.R.File=./log/weiboDomain.log
log4j.appender.R.MaxFileSize= 2MB
# Keep up to three backup files
log4j.appender.R.MaxBackupIndex=3
log4j.appender.R.layout=org.apache.log4j.PatternLayout
log4j.appender.R.layout.ConversionPattern=%l %-5p [%d{yyyy-MM-dd HH\:mm\:ss}] %m%n
\ No newline at end of file
#mongo.serverMongoIp=127.0.0.1
mongo.serverMongoIp=115.236.59.91
mongo.dbName=weiboDomain
<?xml version="1.0" encoding="UTF-8"?>
<beans xmlns="http://www.springframework.org/schema/beans"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:context="http://www.springframework.org/schema/context"
xmlns:tx="http://www.springframework.org/schema/tx"
xmlns:mongo="http://www.springframework.org/schema/data/mongo"
xsi:schemaLocation="http://www.springframework.org/schema/context
http://www.springframework.org/schema/context/spring-context.xsd
http://www.springframework.org/schema/beans
http://www.springframework.org/schema/beans/spring-beans.xsd
http://www.springframework.org/schema/tx
http://www.springframework.org/schema/tx/spring-tx.xsd
http://www.springframework.org/schema/data/mongo
http://www.springframework.org/schema/data/mongo/spring-mongo.xsd
">
<!-- 默认配置文件 -->
<bean id="mappingContext"
class="org.springframework.data.mongodb.core.mapping.MongoMappingContext" />
<!-- 默认Mongodb类型映射 -->
<bean id="defaultMongoTypeMapper"
class="org.springframework.data.mongodb.core.convert.DefaultMongoTypeMapper">
<constructor-arg name="typeKey">
<!-- 这里设置为空,可以把 spring data mongodb 多余保存的_class字段去掉 -->
<null />
</constructor-arg>
</bean>
<!-- 新版事件库 -->
<!-- 配置过滤规则 -->
<bean id="resolver"
class="org.springframework.data.mongodb.core.convert.DefaultDbRefResolver">
<constructor-arg name="mongoDbFactory" ref="Factory" />
</bean>
<bean id="mappingMongoConverter" class="org.springframework.data.mongodb.core.convert.MappingMongoConverter">
<constructor-arg name="dbRefResolver" ref="resolver" />
<constructor-arg name="mappingContext" ref="mappingContext" />
<!-- map中.的替换 -->
<property name="mapKeyDotReplacement" value="${replaceKey}" />
<!-- 存储数据的时候不保存类型 -->
<property name="typeMapper" ref="defaultMongoTypeMapper" />
</bean>
<!-- 配置数据库相关配置 -->
<mongo:mongo-client id="Mongo" host="${mongo.serverMongoIp}" port="27017"/>
<mongo:db-factory id="Factory" dbname="${mongo.dbName}"
mongo-ref="Mongo" />
<mongo:template id="template" converter-ref="mappingMongoConverter"
db-factory-ref="Factory" />
</beans>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<beans xmlns="http://www.springframework.org/schema/beans"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:context="http://www.springframework.org/schema/context"
xmlns:aop="http://www.springframework.org/schema/aop"
xmlns:mvc="http://www.springframework.org/schema/mvc"
xsi:schemaLocation="http://www.springframework.org/schema/context
http://www.springframework.org/schema/context/spring-context-4.2.xsd
http://www.springframework.org/schema/beans
http://www.springframework.org/schema/beans/spring-beans-4.2.xsd
http://www.springframework.org/schema/aop http://www.springframework.org/schema/aop/spring-aop-4.2.xsd
http://www.springframework.org/schema/mvc http://www.springframework.org/schema/mvc/spring-mvc-4.2.xsd ">
<context:component-scan base-package="com.zhiweidata.weiboDomain"></context:component-scan>
<mvc:annotation-driven />
<context:annotation-config />
<bean
class="org.springframework.web.servlet.mvc.method.annotation.RequestMappingHandlerMapping" />
<!-- HandlerAdapter -->
<bean
class="org.springframework.web.servlet.mvc.method.annotation.RequestMappingHandlerAdapter" />
<!-- <mvc:interceptors>
使用bean定义一个Interceptor,直接定义在mvc:interceptors根下面的Interceptor将拦截所有的请求
<bean class="com.zhiwei.manager.interceptor.SessionInterceptor" />
<mvc:interceptor>
<mvc:mapping path="/*" />
定义在mvc:interceptor下面的表示是对特定的请求才进行拦截的
<bean class="com.host.app.web.interceptor.LoginInterceptor" />
</mvc:interceptor>
</mvc:interceptors> -->
<!-- ViewResolver -->
<bean
class="org.springframework.web.servlet.view.InternalResourceViewResolver">
<property name="viewClass"
value="org.springframework.web.servlet.view.JstlView" />
<property name="prefix" value="/html/" />
<property name="suffix" value=".html" />
</bean>
<!-- 支持上传文件 -->
<!-- <bean id="multipartResolver" class="org.springframework.web.multipart.commons.CommonsMultipartResolver">
<property name="maxUploadSize" value ="1024" />
<property name="resolveLazily" value="true"/>
<property name="defaultEncoding" value = "UTF-8" />
</bean> -->
</beans>
\ No newline at end of file
/**
* ***************************************************
* Copyright (C), NingBo ZhiWeiReach info. Co., Ltd. *
*****************************************************
* 类的详细说明
*
* @author 东临碣石
* @Date 2016年1月16日
* @version 1.00
*/
package com.zhiweidata.weiboDomain;
import org.junit.runner.RunWith;
import org.springframework.test.context.ContextConfiguration;
import org.springframework.test.context.junit4.AbstractJUnit4SpringContextTests;
import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;
/**
 * @Description: Parent class for Spring tests; loads the base configuration file
 *               ({@code classpath:applicationContext.xml}).
 * @ClassName: ObjectTest
 * @author 落花流水
 * @date 2016-01-16 11:40:14
 */
@RunWith(SpringJUnit4ClassRunner.class)
@ContextConfiguration(locations =
{ "classpath:applicationContext.xml" })
public abstract class ObjectTest extends AbstractJUnit4SpringContextTests
{
}
package com.zhiweidata.weiboDomain.dao;
import java.util.HashMap;
import java.util.Map;
import javax.annotation.Resource;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;
import com.zhiweidata.weiboDomain.ObjectTest;
import com.zhiweidata.weiboDomain.crawler.WeiboDomainCrawler;
import com.zhiweidata.weiboDomain.entity.DomainTag;
import net.sf.json.JSONObject;
/**
 * Spring-context tests and one-off seeding helpers for the tag/domain DAOs.
 *
 * <p>The two {@code insert*} methods write to the database (and one hits the network), so their
 * {@code @Test} annotations are deliberately left commented out; only {@link #test()} runs.
 */
public class DomainDaoTest extends ObjectTest {
    @Resource
    DomainDao domainDao;
    @Resource
    TagDao tagDao;

    /** Inserts a single hand-written tag record. Disabled by default; enable manually to seed. */
    // @Test
    public void insertTagTest() {
        DomainTag domainTag = new DomainTag();
        String id = "24H热门";
        domainTag.set_id(id);
        domainTag.setDomain(id);
        domainTag.setDomainId("1087030002_2982");
        domainTag.setState(0);
        tagDao.insert(domainTag);
    }

    /**
     * Fetches the domain index page, extracts every tag entry and inserts each one with state 0.
     * Disabled by default; enable manually to seed the tag collection.
     *
     * @author 陈炜涛
     * @date 2018-03-13 17:53:34
     */
    // @Test
    public void insertTagFirstTest() {
        WeiboDomainCrawler crawler = new WeiboDomainCrawler();
        String url = "https://d.weibo.com/1087030002_2975_1003_0#";
        // NOTE(review): hard-coded login session cookie; should come from external configuration.
        String html = crawler.getHtml(url,
                "SINAGLOBAL=1413878352487.6208.1509610656233; UM_distinctid=16053b396fea93-0a1b0a92fd2d7-5f19311c-1fa400-16053b396ffa9d; un=15757871020; UOR=,www.weibo.com,www.baidu.com; login_sid_t=c3935e41ed072cfa20cbd1462a51e1b6; cross_origin_proto=SSL; _s_tentry=passport.weibo.com; Apache=3630121226743.0815.1520321567340; ULV=1520321568035:15:2:1:3630121226743.0815.1520321567340:1519887954422; SSOLoginState=1520321599; SCF=AuCAEA8HE7llKecGgDC9mXez57Y8TrXZB-bbR3xw1Rg_-hZyItSqXebc8xU1toDVlo8zXAxor6wCq9fcmjOxtlI.; SUHB=0FQrE4a4pmNdGS; ALF=1551948727; SUB=_2AkMt_rd6f8NxqwJRmP0RzWPrbI90wg3EieKbokahJRMxHRl-yT83qlc5tRB6Bn6ZlSJDljRT4MPLb3O5AT8RtbcROuaE; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WWDQpkiHsu8zrdep6H.3aI_; YF-Page-G0=aabeaa17d9557111c805fb15a9959531");
        Document doc = Jsoup.parse(html);
        // The tag list is delivered inside a script block; index 23 is the position this code reads.
        Elements scripts = doc.getElementsByTag("script");
        String str = scripts.get(23).childNode(0).toString();
        str = getHtml(str);
        // Parse the HTML fragment carried by the script payload.
        doc = Jsoup.parse(str);
        Element box = doc.getElementsByClass("subitem_box S_line1").get(0);
        System.out.println(box);
        Elements es = box.getElementsByClass("item");
        for (Element element : es) {
            String id = element.getElementsByClass("item_title S_txt1").text();
            DomainTag domainTag = new DomainTag();
            domainTag.set_id(id);
            domainTag.setDomain(id);
            domainTag.setDomainId(
                    element.getElementsByTag("a").get(0).attr("href").replace("//d.weibo.com/", "").replace("#", ""));
            domainTag.setState(0);
            tagDao.insert(domainTag);
        }
    }

    /**
     * Returns a map holding a single empty-string entry.
     *
     * @return a new mutable map containing {@code "" -> ""}
     */
    public Map<String, String> getTag() {
        Map<String, String> tagMap = new HashMap<>();
        tagMap.put("", "");
        return tagMap;
    }

    /**
     * Extracts the {@code html} field from a script body of the form {@code FM.view({...})}.
     *
     * <p>Only the {@code FM.view(} wrapper and the final closing parenthesis are removed.
     * Parentheses inside the JSON payload are preserved, so embedded HTML text is not corrupted.
     * If the wrapper is not found, the input is parsed as-is.
     *
     * @param str raw script content
     * @return the value of the {@code html} key of the parsed JSON object
     */
    private String getHtml(String str) {
        final String wrapper = "FM.view(";
        int wrapperAt = str.indexOf(wrapper);
        String payload = str;
        if (wrapperAt >= 0) {
            int open = wrapperAt + wrapper.length();
            int close = str.lastIndexOf(')');
            if (close >= open) {
                payload = str.substring(open, close);
            }
        }
        JSONObject json = JSONObject.fromObject(payload);
        return json.getString("html");
    }

    /** Smoke test confirming that the Spring test context starts. */
    @Test
    public void test() {
        System.out.println("testRunning");
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment