Commit 158abbbc by win7

sss

parents
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiweidata.weiboDomain</groupId>
<artifactId>weiboDomain</artifactId>
<version>0.0.1-SNAPSHOT</version>
<dependencies>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
</dependency>
<dependency>
<groupId>org.springframework.data</groupId>
<artifactId>spring-data-mongodb</artifactId>
<version>1.10.10.RELEASE</version>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-test</artifactId>
<version>4.3.14.RELEASE</version>
</dependency>
<dependency>
<groupId>org.mongodb</groupId>
<artifactId>mongo-java-driver</artifactId>
<version>3.2.2</version>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.16.20</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.1</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.5</version>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.5</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.7</version>
</dependency>
<dependency>
<groupId>net.sf.json-lib</groupId>
<artifactId>json-lib</artifactId>
<version>2.4</version>
<classifier>jdk15</classifier>
</dependency>
<dependency>
<groupId>com.zhiwei</groupId>
<artifactId>jxlzw</artifactId>
<version>0.0.2-SNAPSHOT</version>
</dependency>
</dependencies>
</project>
\ No newline at end of file
/**
* @Title: httpclientInstance.java
* @Package com.zhiweidata.weiboDomain.crawler
* @Description: TODO(用一句话描述该文件做什么)
* @author xuyimeng
* @date 2018年2月23日 下午1:54:32
* @version V1.0
*/ /**
*
*/
package com.zhiweidata.weiboDomain.crawler;
import java.util.List;
import javax.net.ssl.SSLContext;
import org.apache.http.HttpHost;
import org.apache.http.client.CookieStore;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.cookie.Cookie;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.ssl.SSLContexts;
/**
 * Factory for pooled {@link CloseableHttpClient} instances used by the crawler.
 *
 * <p>Every call to {@code generateClient} builds a new client backed by its own
 * connection pool and tagged with a User-Agent picked at random from a fixed list.
 */
public class HttpclientInstance {

    /** Candidate User-Agent strings; one is chosen at random for each generated client. */
    private static final String[] USER_AGENTS = {
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
        "NokiaX2-02/2.0 (11.79) Profile/MIDP-2.1 Configuration/CLDC-1.1 Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2;.NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2) UCBrowser8.4.0.159/70/352",
        "Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Version/3.1 Safari/525.13",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B137 Safari/601.1",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.9) Gecko/20100315 Firefox/3.5.9",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
        "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.96 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"};

    /**
     * Builds a pooled client that connects directly (no proxy).
     *
     * @param cookieStore default cookie store handed to the client builder
     * @return a newly built client
     */
    public static CloseableHttpClient generateClient(CookieStore cookieStore) {
        return generateClient(null, cookieStore);
    }

    /**
     * Builds a pooled client, optionally routed through a proxy.
     *
     * @param httpHost    proxy to set on the default request config; the single-argument
     *                    overload passes {@code null} here
     * @param cookieStore default cookie store handed to the client builder
     * @return a newly built client with its own pool (200 connections total, 20 per route)
     */
    public static CloseableHttpClient generateClient(HttpHost httpHost, CookieStore cookieStore) {
        SSLConnectionSocketFactory httpsFactory =
                new SSLConnectionSocketFactory(SSLContexts.createSystemDefault());

        RegistryBuilder<ConnectionSocketFactory> schemes = RegistryBuilder.<ConnectionSocketFactory>create();
        schemes.register("http", PlainConnectionSocketFactory.INSTANCE);
        schemes.register("https", httpsFactory);

        // Connection pool shared by all requests issued through the returned client.
        PoolingHttpClientConnectionManager pool = new PoolingHttpClientConnectionManager(schemes.build());
        pool.setMaxTotal(200);
        pool.setDefaultMaxPerRoute(20);

        RequestConfig defaults = RequestConfig.custom().setProxy(httpHost).build();

        return HttpClients.custom()
                .setUserAgent(randomUserAgent())
                .setConnectionManager(pool)
                .setDefaultRequestConfig(defaults)
                .setDefaultCookieStore(cookieStore)
                .build();
    }

    /**
     * Picks one User-Agent string at random.
     *
     * @return one of the predefined User-Agent values
     */
    public static String randomUserAgent() {
        int pick = (int) (Math.random() * USER_AGENTS.length);
        return USER_AGENTS[pick];
    }
}
/**
* @Title: JsoupHtml.java
* @Package com.zhiweidata.weiboDomain.crawler
* @Description: TODO(用一句话描述该文件做什么)
* @author xuyimeng
* @date 2018年2月23日 下午3:16:06
* @version V1.0
*/ /**
*
*/
package com.zhiweidata.weiboDomain.crawler;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.List;
import javax.print.Doc;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.zhiweidata.weiboDomain.entity.WeiboDomain;
import net.sf.json.JSONObject;
/**
 * Parses Weibo domain-ranking pages.
 *
 * <p>The pages deliver their markup as a JSON argument of an {@code FM.view(...)} call inside a
 * {@code <script>} tag; the methods here unwrap that JSON, read its {@code html} member and then
 * walk the embedded markup with jsoup.
 */
public class JsoupHtml {

    /** Format used for {@code WeiboDomain.updateTime}. */
    private static final DateTimeFormatter UPDATE_TIME_FORMAT =
            DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");

    private static JsoupHtml jsoupHtml = new JsoupHtml();

    private JsoupHtml() {}

    /**
     * @return the shared parser instance
     */
    public static JsoupHtml getInstance() {
        return jsoupHtml;
    }

    /**
     * Converts one listing page into user records.
     *
     * @param html   raw page; its first {@code <script>} tag must hold the
     *               {@code parent.FM.view(...)} payload (an exception is thrown otherwise)
     * @param domain domain label stored on every record and used as prefix of the record id
     * @return one {@link WeiboDomain} per {@code <dd class="mod_info S_line1">} element
     */
    public List<WeiboDomain> parseData(String html, String domain) {
        List<WeiboDomain> result = new ArrayList<>();
        Document doc = Jsoup.parse(html);
        // Unwrap the markup that the page injects through the first script tag.
        Elements script = doc.getElementsByTag("script");
        String str = script.get(0).childNode(0).toString();
        str = getHtml(str);
        // Parse the embedded markup.
        doc = Jsoup.parse(str);
        Elements user = doc.getElementsByTag("dd");
        for (Element element : user) {
            if (!element.attr("class").equals("mod_info S_line1")) {
                continue;
            }
            WeiboDomain weiboDomainGroup = new WeiboDomain();
            String uid = "";
            for (Element div : element.getElementsByTag("div")) {
                String divClass = div.attr("class");
                if (divClass.equals("info_name W_fb W_f14")) {
                    Element nameLink = div.getElementsByClass("S_txt1").get(0);
                    // usercard looks like "id=<uid>&..."; keep only the uid.
                    uid = nameLink.attr("usercard").split("&")[0].replaceAll("id=", "");
                    weiboDomainGroup.setUid(uid);
                    weiboDomainGroup.setUrl(nameLink.attr("href"));
                    weiboDomainGroup.setName(nameLink.attr("title"));
                    applyIcons(weiboDomainGroup, div.getElementsByTag("i"));
                }
                if (divClass.equals("info_connect")) {
                    Elements em = div.getElementsByTag("em");
                    weiboDomainGroup.setFriends_count(Integer.parseInt(em.get(0).text()));
                    // Follower count is kept as text exactly as shown on the page.
                    weiboDomainGroup.setFollowers_count(em.get(1).text());
                    weiboDomainGroup.setStatuses_count(Integer.parseInt(em.get(2).text()));
                }
                if (divClass.equals("info_add")) {
                    weiboDomainGroup.setLocation(div.getElementsByTag("span").get(0).text());
                }
                if (divClass.equals("info_intro")) {
                    weiboDomainGroup.setDescription(div.getElementsByTag("span").get(0).text());
                }
                if (divClass.equals("info_relation")) {
                    weiboDomainGroup.setTag(div.text().split(":")[1]);
                }
            }
            weiboDomainGroup.setDomain(domain);
            weiboDomainGroup.setUpdateTime(LocalDateTime.now().format(UPDATE_TIME_FORMAT));
            weiboDomainGroup.setId(domain + "_" + uid);
            result.add(weiboDomainGroup);
        }
        return result;
    }

    /**
     * Derives the VIP flag and gender from the icon elements next to the user name.
     *
     * <p>Gender is "m" when any icon is the male icon and "f" otherwise. The decision is made
     * once after scanning all icons, so a later non-gender icon (for example the member icon)
     * can no longer overwrite an already detected "m". When there are no icons at all the
     * gender is left untouched.
     */
    private void applyIcons(WeiboDomain target, Elements icons) {
        boolean male = false;
        for (Element icon : icons) {
            String iconClass = icon.attr("class");
            if (iconClass.equals("W_icon icon_member")) {
                target.setVip(true);
            }
            if (iconClass.equals("W_icon icon_male")) {
                male = true;
            }
        }
        if (!icons.isEmpty()) {
            target.setGender(male ? "m" : "f");
        }
    }

    /**
     * Extracts the {@code html} member from a {@code parent.FM.view({...})} script body.
     *
     * <p>NOTE(review): every ")" in the payload is removed, including those inside the embedded
     * markup, so parentheses in user-visible text are lost — confirm whether that is acceptable.
     *
     * @param str script body
     * @return value of the JSON member {@code html}
     */
    private String getHtml(String str) {
        str = str.replaceAll("parent.FM.view\\(", "").replaceAll("\\)", "");
        JSONObject json = JSONObject.fromObject(str);
        return json.getString("html");
    }

    /**
     * Reads the highest page number from the pager of a listing page.
     *
     * @param page raw page
     * @return the largest page number found among the pager links, or 0 when the page cannot be
     *         parsed for any reason (best effort: all exceptions are mapped to 0)
     */
    public int parsePage(String page) {
        try {
            Document doc = Jsoup.parse(page);
            String str = "";
            for (Element s : doc.getElementsByTag("script")) {
                // Script tags without inline content (e.g. external src) have no child node;
                // skip them instead of failing the whole parse.
                if (s.childNodeSize() == 0) {
                    continue;
                }
                String body = s.childNode(0).toString();
                if (body.contains("content.signInPeople.index")) {
                    str = body;
                }
            }
            str = str.replaceAll("FM.view\\(", "").replaceAll("\\)", "");
            JSONObject json = JSONObject.fromObject(str);
            doc = Jsoup.parse(json.getString("html"));
            int num = 0;
            for (Element e : doc.getElementsByTag("a")) {
                if ("page".equals(e.attr("bpfilter")) && "page S_txt1".equals(e.attr("class"))) {
                    num = Math.max(num, Integer.parseInt(e.text()));
                }
            }
            return num;
        } catch (Exception e) {
            return 0;
        }
    }
}
/**
* @Title: WeiboDomainCrawler.java
* @Package com.zhiweidata.weiboDomain.crawler
* @Description: TODO(用一句话描述该文件做什么)
* @author xuyimeng
* @date 2018年2月23日 下午1:59:46
* @version V1.0
*/ /**
*
*/
package com.zhiweidata.weiboDomain.crawler;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.function.Predicate;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHeaders;
import org.apache.http.HttpHost;
import org.apache.http.HttpStatus;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.GzipDecompressingEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.entity.ContentType;
import org.apache.http.impl.client.CloseableHttpClient;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Downloads Weibo domain-ranking pages through a shared pooled HTTP client.
 */
public class WeiboDomainCrawler {

    /** Pause between retries, in milliseconds. */
    private static final long RETRY_DELAY_MILLIS = 3000L;

    private static CloseableHttpClient client = HttpclientInstance.generateClient(null);

    /** Proxy applied to every request; never assigned in this class, so it stays {@code null}. */
    private HttpHost httpHost;

    /**
     * Fetches the page at {@code url}.
     *
     * @param url    absolute URL to request
     * @param cookie value sent in the {@code Cookie} header
     * @return page content, or {@code null} when the content was rejected by the listing check
     */
    public String getHtml(String url, String cookie) {
        return get(createHttpGet(url, cookie));
    }

    /**
     * Fetches the landing page of a domain ({@code https://d.weibo.com/<domainId>}).
     *
     * @param domainId path segment identifying the domain
     * @param cookie   value sent in the {@code Cookie} header
     * @return page content, or {@code null} when the content was rejected by the listing check
     */
    public String getPage(String domainId, String cookie) {
        return get(createHttpGet("https://d.weibo.com/" + domainId, cookie));
    }

    /**
     * Executes the request and rejects responses that are empty, show the "empty" placeholder
     * or contain no follow items.
     */
    private String get(HttpGet httpGet) {
        Predicate<String> predicate = s -> (s == null || "".equals(s))
                || s.contains("empty_con clearfix") || !s.contains("follow_item S_line2");
        return get(httpGet, predicate);
    }

    /**
     * Executes the request until a usable answer is obtained.
     *
     * <ul>
     * <li>content accepted (predicate is false): returned immediately;</li>
     * <li>content rejected with status 200: gives up and returns {@code null};</li>
     * <li>content rejected with another status, or any exception: waits and retries.</li>
     * </ul>
     * The loop also ends with {@code null} once the calling thread has been interrupted.
     *
     * @param predicate returns {@code true} for content that must be rejected
     */
    private String get(HttpGet httpGet, Predicate<String> predicate) {
        while (!Thread.currentThread().isInterrupted()) {
            // try-with-resources guarantees the pooled connection is released on every path.
            try (CloseableHttpResponse response = client.execute(httpGet)) {
                String responseContent = getResponseContent(response.getEntity());
                if (!predicate.test(responseContent)) {
                    return responseContent;
                }
                if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
                    return null;
                }
            } catch (Exception e) {
                // Best effort: fall through to the delay below and try again.
            }
            // Back off before retrying instead of hammering the server in a tight loop.
            sleep(RETRY_DELAY_MILLIS);
        }
        return null;
    }

    /**
     * Reads the entity body as text.
     *
     * <p>The charset is taken from the Content-Type header; when absent or unparsable the body
     * is decoded with the platform default charset and searched for a {@code <meta>} charset
     * declaration, which is then used for the final decoding.
     *
     * @param httpEntity response entity, may be {@code null}
     * @return decoded body, or {@code null} when {@code httpEntity} is {@code null}
     * @throws IOException if the body cannot be read
     */
    public static String getResponseContent(final HttpEntity httpEntity) throws IOException {
        if (httpEntity == null) {
            return null;
        }
        byte[] body;
        try (InputStream in = openContent(httpEntity)) {
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            IOUtils.copy(in, baos);
            body = baos.toByteArray();
        }
        // Lenient lookup: a malformed or unsupported charset in the header yields null
        // instead of an exception.
        ContentType contentType = ContentType.getLenient(httpEntity);
        Charset charset = contentType != null ? contentType.getCharset() : null;
        if (charset == null) {
            String content = new String(body, Charset.defaultCharset());
            charset = getHtmlCharset(content);
            if (charset == null) {
                return content;
            }
        }
        return new String(body, charset);
    }

    /** Opens the entity stream, unwrapping gzip when the entity is still marked as compressed. */
    private static InputStream openContent(HttpEntity httpEntity) throws IOException {
        Header encoding = httpEntity.getContentEncoding();
        if (encoding != null && "gzip".equalsIgnoreCase(encoding.getValue())) {
            return new GzipDecompressingEntity(httpEntity).getContent();
        }
        return httpEntity.getContent();
    }

    /**
     * Looks for a charset declaration in the {@code <meta>} tags of a page.
     *
     * @param html page markup, may be {@code null} or empty
     * @return the first declared charset that this JVM supports, or {@code null} when none is
     *         declared or every declared name is invalid or unsupported
     */
    public static Charset getHtmlCharset(final String html) {
        if (StringUtils.isEmpty(html)) {
            return null;
        }
        Document document = Jsoup.parse(html);
        for (Element meta : document.select("meta")) {
            String metaContent = meta.attr("content");
            String metaCharset = meta.attr("charset");
            String declared = null;
            int marker = metaContent.indexOf("charset=");
            if (marker >= 0) {
                declared = metaContent.substring(marker + "charset=".length());
            } else if (!StringUtils.isEmpty(metaCharset)) {
                declared = metaCharset;
            }
            Charset resolved = lookupCharset(declared);
            if (resolved != null) {
                return resolved;
            }
        }
        return null;
    }

    /**
     * Resolves a declared charset name, tolerating surrounding whitespace, quotes and trailing
     * parameters; returns {@code null} instead of throwing for unusable names.
     */
    private static Charset lookupCharset(String declared) {
        if (declared == null) {
            return null;
        }
        String name = declared.trim().replace("\"", "").replace("'", "");
        for (int i = 0; i < name.length(); i++) {
            char ch = name.charAt(i);
            if (ch == ';' || ch == ',' || Character.isWhitespace(ch)) {
                name = name.substring(0, i);
                break;
            }
        }
        if (name.isEmpty()) {
            return null;
        }
        try {
            return Charset.forName(name);
        } catch (IllegalArgumentException e) {
            // Covers both illegal and unsupported charset names.
            return null;
        }
    }

    /**
     * Builds the GET request with browser-like headers.
     */
    private HttpGet createHttpGet(String url, String cookie) {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(getRequestConfig());
        httpGet.setHeader(HttpHeaders.USER_AGENT, "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36");
        httpGet.addHeader(HttpHeaders.ACCEPT, "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
        // Only advertise encodings this client can decode; "br" (Brotli) is deliberately not
        // offered because the response would otherwise be unreadable.
        httpGet.addHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate");
        httpGet.addHeader(HttpHeaders.ACCEPT_LANGUAGE, "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7");
        httpGet.addHeader(HttpHeaders.CONNECTION, "keep-alive");
        httpGet.addHeader("Cookie", cookie);
        httpGet.addHeader(HttpHeaders.HOST, "d.weibo.com");
        return httpGet;
    }

    /**
     * @return request config with 3 second socket, connect and connection-request timeouts and
     *         the configured proxy
     */
    private RequestConfig getRequestConfig() {
        return RequestConfig.custom().setSocketTimeout(3000).setConnectTimeout(3000).setConnectionRequestTimeout(3000)
                .setProxy(httpHost).build();
    }

    /**
     * Sleeps for the given time; when interrupted, restores the thread's interrupt flag so
     * callers can observe it.
     *
     * @param time pause in milliseconds
     */
    public void sleep(long time) {
        try {
            Thread.sleep(time);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }
}
/**
* @Title: mongoDao.java
* @Package com.zhiweidata.weiboDomain.dao
* @Description: TODO(用一句话描述该文件做什么)
* @author xuyimeng
* @date 2018年2月23日 下午2:34:36
* @version V1.0
*/ /**
*
*/
package com.zhiweidata.weiboDomain.dao;
import java.util.List;
import com.zhiweidata.weiboDomain.entity.WeiboDomain;
/**
 * Data-access contract for {@link WeiboDomain} records.
 */
public interface DomainDao {

    /** Finds the records whose {@code uid} equals the given value. */
    List<WeiboDomain> findByUid(String uid);

    /** Finds the records whose {@code domain} equals the given value. */
    List<WeiboDomain> findByDomain(String domain);

    /** Returns every stored record. */
    List<WeiboDomain> findAll();

    /** Stores all records of the given list. */
    void insert(List<WeiboDomain> list);

    /** Creates the collection that subsequent operations work on. */
    void createColl();

    /** Returns the name of the most recent collection. */
    String bestNewCollName();
}
/**
* @Title: TagDao.java
* @Package com.zhiweidata.weiboDomain.dao
* @Description: TODO(用一句话描述该文件做什么)
* @author xuyimeng
* @date 2018年2月23日 下午5:17:58
* @version V1.0
*/ /**
*
*/
package com.zhiweidata.weiboDomain.dao;
import java.util.List;
import com.zhiweidata.weiboDomain.entity.DomainTag;
/**
 * Data-access contract for {@link DomainTag} records.
 */
public interface TagDao {

    /** Returns every stored tag record. */
    List<DomainTag> findAll();

    /** Returns the tag records selected by the given state value. */
    List<DomainTag> findByState(Integer state);

    /** Sets {@code state} on the tag records of the given domain. */
    void updateByState(String domain, Integer state);
}
/**
* @Title: MongoDaoImpl.java
* @Package dao.impl
* @Description: TODO(用一句话描述该文件做什么)
* @author xuyimeng
* @date 2018年2月23日 下午2:57:24
* @version V1.0
*/ /**
*
*/
package com.zhiweidata.weiboDomain.dao.impl;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.util.List;
import java.util.Set;
import javax.annotation.Resource;
import org.springframework.data.mongodb.core.MongoTemplate;
import org.springframework.data.mongodb.core.query.Criteria;
import org.springframework.data.mongodb.core.query.Query;
import org.springframework.stereotype.Repository;
import com.zhiweidata.weiboDomain.dao.DomainDao;
import com.zhiweidata.weiboDomain.entity.WeiboDomain;
/**
 * MongoDB implementation of {@link DomainDao}.
 *
 * <p>Records live in dated collections named {@code weiboDomain<yyyyMMdd>}; reads and writes
 * always go to the collection returned by {@link #bestNewCollName()}.
 */
@Repository
public class DomainDaoImpl implements DomainDao {

    /** Name fragment shared by all dated collections. */
    private static final String COLLECTION_PREFIX = "weiboDomain";

    /** Date suffix of a collection name. */
    private static final DateTimeFormatter COLLECTION_DATE = DateTimeFormatter.ofPattern("yyyyMMdd");

    @Resource
    MongoTemplate mongoTemplate;

    @Override
    public List<WeiboDomain> findByUid(String uid) {
        String collName = bestNewCollName();
        Query query = new Query().addCriteria(Criteria.where("uid").is(uid));
        return mongoTemplate.find(query, WeiboDomain.class, collName);
    }

    @Override
    public List<WeiboDomain> findByDomain(String domain) {
        String collName = bestNewCollName();
        Query query = new Query().addCriteria(Criteria.where("domain").is(domain));
        return mongoTemplate.find(query, WeiboDomain.class, collName);
    }

    @Override
    public List<WeiboDomain> findAll() {
        String collName = bestNewCollName();
        return mongoTemplate.findAll(WeiboDomain.class, collName);
    }

    /**
     * Saves every record into the newest collection. The collection name is resolved once,
     * before the first save.
     */
    @Override
    public void insert(List<WeiboDomain> list) {
        String collName = bestNewCollName();
        for (WeiboDomain weiboDomain : list) {
            mongoTemplate.save(weiboDomain, collName);
        }
    }

    /**
     * Picks, among the collections whose name contains {@code weiboDomain}, the one that sorts
     * last lexicographically (the latest date for the {@code yyyyMMdd} suffix).
     *
     * @return the collection name, or an empty string when no such collection exists
     */
    @Override
    public String bestNewCollName() {
        Set<String> names = mongoTemplate.getCollectionNames();
        String result = "";
        for (String name : names) {
            if (name.contains(COLLECTION_PREFIX) && name.compareTo(result) > 0) {
                result = name;
            }
        }
        return result;
    }

    /**
     * Creates today's collection. Calling it again on the same day is a no-op instead of
     * failing because the collection already exists.
     */
    @Override
    public void createColl() {
        String collName = COLLECTION_PREFIX + LocalDate.now().format(COLLECTION_DATE);
        if (!mongoTemplate.collectionExists(collName)) {
            mongoTemplate.createCollection(collName);
        }
    }
}
/**
* @Title: TagDaoImpl.java
* @Package com.zhiweidata.weiboDomain.dao.impl
* @Description: TODO(用一句话描述该文件做什么)
* @author xuyimeng
* @date 2018年2月23日 下午5:21:35
* @version V1.0
*/ /**
*
*/
package com.zhiweidata.weiboDomain.dao.impl;
import java.util.List;
import javax.annotation.Resource;
import org.springframework.data.mongodb.core.MongoTemplate;
import org.springframework.data.mongodb.core.query.Criteria;
import org.springframework.data.mongodb.core.query.Query;
import org.springframework.data.mongodb.core.query.Update;
import org.springframework.stereotype.Repository;
import com.zhiweidata.weiboDomain.dao.TagDao;
import com.zhiweidata.weiboDomain.entity.DomainTag;
/**
 * MongoDB implementation of {@link TagDao}, working on the collection mapped to
 * {@link DomainTag}.
 */
@Repository
public class TagDaoImpl implements TagDao {

    @Resource
    private MongoTemplate mongoTemplate;

    /** Returns every tag record. */
    @Override
    public List<DomainTag> findAll() {
        return mongoTemplate.findAll(DomainTag.class);
    }

    /** Sets {@code state} on all tag records whose {@code domain} equals the given value. */
    @Override
    public void updateByState(String domain, Integer state) {
        Query byDomain = new Query().addCriteria(Criteria.where("domain").is(domain));
        Update newState = new Update().set("state", state);
        mongoTemplate.updateMulti(byDomain, newState, DomainTag.class);
    }

    /**
     * Returns the tag records whose {@code state} is less than or equal to the given value
     * (note: not only those that are equal).
     */
    @Override
    public List<DomainTag> findByState(Integer state) {
        Criteria upToState = Criteria.where("state").lte(state);
        return mongoTemplate.find(new Query().addCriteria(upToState), DomainTag.class);
    }
}
/**
* @Title: DomainTag.java
* @Package com.zhiweidata.weiboDomain.entity
* @Description: TODO(用一句话描述该文件做什么)
* @author xuyimeng
* @date 2018年2月23日 下午3:23:31
* @version V1.0
*/ /**
*
*/
package com.zhiweidata.weiboDomain.entity;
import java.util.List;
import org.springframework.data.mongodb.core.mapping.Document;
import lombok.Data;
/**
 * Tag record of a Weibo domain, mapped to the MongoDB collection {@code domainTag}.
 * Accessors, equals/hashCode and toString are generated by Lombok's {@code @Data}.
 */
@Data
@Document(collection = "domainTag")
public class DomainTag {
// Document identifier (presumably the MongoDB _id — TODO confirm the mapping).
private String _id;
// Tags attached to the domain.
private List<String> tags;
// Domain label; TagDaoImpl.updateByState selects records by this field.
private String domain;
// Domain identifier (presumably the path segment used in d.weibo.com URLs — TODO confirm).
private String domainId;
// Processing state; TagDaoImpl.findByState selects records with state <= the requested value.
private Integer state;
}
/**
* @Title: entity.java
* @Package com.zhiweidata.weiboDomain.entity
* @Description: TODO(用一句话描述该文件做什么)
* @author xuyimeng
* @date 2018年2月23日 下午2:37:27
* @version V1.0
*/ /**
*
*/
package com.zhiweidata.weiboDomain.entity;
import org.springframework.data.annotation.Id;
import org.springframework.data.mongodb.core.index.Indexed;
import lombok.Data;
/**
 * One Weibo user as listed on a domain-ranking page.
 * Accessors, equals/hashCode and toString are generated by Lombok's {@code @Data}.
 */
@Data
public class WeiboDomain {
// Record id; JsoupHtml.parseData sets it to domain + "_" + uid.
@Id
private String id;
// Weibo user id, taken from the "usercard" attribute of the name link.
@Indexed
private String uid;
// Display name (title attribute of the name link).
private String name;
// Profile link (href attribute of the name link).
private String url;
// "m" or "f", derived from the icons next to the name.
private String gender;
private String location;
private String description;
// Domain label the user was listed under.
@Indexed
private String domain;
private String tag;
// Kept as text exactly as shown on the page (the other counters are parsed as integers).
private String followers_count;
private Integer friends_count;
private Integer statuses_count;
// VIP flag; the parser sets it through setVip(true).
private boolean isVip;
// Time of the crawl, formatted as "yyyy-MM-dd HH:mm:ss" by JsoupHtml.parseData.
private String updateTime;
}
\ No newline at end of file
package com.zhiweidata.weiboDomain.excel;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.mongodb.DBObject;
public class DBOExp
{
    /**
     * Exports a list of {@link DBObject}s to an Excel file.
     *
     * <p>When the file does not exist yet it is created with a single sheet; otherwise a new
     * sheet is added to the existing file. Column names are the keys of the first object.
     * A {@code null} or empty list is ignored (nothing is created or modified), because there
     * would be no header to derive.
     *
     * @param listChai  objects to export
     * @param fliename  path of the Excel file
     * @param sheetName name of the sheet to create
     */
    public void putRun(List<DBObject> listChai, String fliename,
            String sheetName)
    {
        if (listChai == null || listChai.isEmpty())
        {
            return;
        }
        SimpeExcelReport simpe = SimpeExcelReport.getInstance();
        File excelFile = new File(fliename);
        List<String> head = headList(listChai.get(0));
        List<Map<String, Object>> dataList = bodyList(listChai);

        if (excelFile.exists())
        {
            // The report rewrites the file itself; no output stream is needed here.
            simpe.addSheetInExcelWithFile(head, dataList, excelFile, sheetName);
            return;
        }
        // try-with-resources closes the stream on every path, including failures while
        // the workbook is being written.
        try (OutputStream osOutputStream = new FileOutputStream(excelFile, true))
        {
            simpe.createExcelWithStream(head, dataList, osOutputStream, sheetName, excelFile);
        }
        catch (IOException e)
        {
            // Also covers FileNotFoundException when the file cannot be created; the export
            // is skipped instead of continuing with a null stream.
            e.printStackTrace();
        }
    }

    /**
     * Derives the column names of the export.
     *
     * @param dbo object whose keys become the column names
     * @return the keys of {@code dbo} in the order of its key set
     */
    public static List<String> headList(DBObject dbo)
    {
        return new ArrayList<String>(dbo.keySet());
    }

    /**
     * Converts the objects into row maps keyed by column name.
     *
     * <p>The columns are the keys of the first object; every row gets an entry for each of
     * them, with {@code null} where an object lacks the key.
     *
     * @param lists objects to convert
     * @return one map per object; empty when {@code lists} is {@code null} or empty
     */
    public List<Map<String, Object>> bodyList(List<DBObject> lists)
    {
        List<Map<String, Object>> dataList = new ArrayList<Map<String, Object>>();
        if (lists == null || lists.isEmpty())
        {
            return dataList;
        }
        List<String> keys = new ArrayList<String>(lists.get(0).keySet());
        for (DBObject dbo : lists)
        {
            Map<String, Object> beanMap = new HashMap<String, Object>();
            for (String key : keys)
            {
                beanMap.put(key, dbo.get(key));
            }
            dataList.add(beanMap);
        }
        return dataList;
    }
}
package com.zhiweidata.weiboDomain.excel;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.zhiwei.jxlzw.report.model.HLink;
import jxl.Cell;
import jxl.Sheet;
import jxl.Workbook;
import jxl.format.Border;
import jxl.format.BorderLineStyle;
import jxl.format.Colour;
import jxl.format.UnderlineStyle;
import jxl.read.biff.BiffException;
import jxl.write.Label;
import jxl.write.WritableCellFormat;
import jxl.write.WritableFont;
import jxl.write.WritableHyperlink;
import jxl.write.WritableSheet;
import jxl.write.WritableWorkbook;
import jxl.write.WriteException;
import jxl.write.biff.RowsExceededException;
import lombok.extern.slf4j.Slf4j;
/**
* 简单的 Excel报表
*
* @ClassName: SimpeExcelReport
* @Description: TODO(这里用一句话描述这个类的作用)
* @author Administrator
* @date 2015年11月20日 下午4:52:02
*/
@Slf4j
public class SimpeExcelReport
{
// Rows of the sheet being written; set by createExcelWithStream/addSheetInExcelWithFile
// and reset to null when they finish.
private List<Map<String, Object>> bodyList;
// Column names of the sheet being written; same lifecycle as bodyList.
private List<String> headList;
// Cell formats; not assigned in the visible part of this class
// (presumably initialised by formatting() — TODO confirm).
private WritableCellFormat format;
private WritableCellFormat formatColor;
private WritableCellFormat headformat;
// Workbook currently being written.
private WritableWorkbook writeWorkBook;
// Target stream handed to createExcelWithStream.
private OutputStream os;
// Workbook currently being read (also the source when adding a sheet to an existing file).
private Workbook readWordBook;
// Sheet created most recently by buildSheet; mergeCell operates on it.
private WritableSheet sheet = null;
// Names of the columns whose equal consecutive cells should be merged; optional.
private List<String> mergeList;
/**
 * Creates a report helper. Despite the name, every call returns a new object; nothing is
 * cached or shared.
 *
 * @return a fresh {@code SimpeExcelReport}
 */
public static SimpeExcelReport getInstance()
{
    SimpeExcelReport report = new SimpeExcelReport();
    return report;
}
/**
 * Reads an Excel file.
 *
 * <p>With a sheet name the result is that sheet's data: a map with the keys {@code "head"}
 * (list of column names) and {@code "body"} (list of row maps keyed by column name).
 * Without a sheet name (null or empty) the result maps every sheet name to such a data map.
 *
 * @param excelFiel file to read
 * @param sheetName sheet to read, or null/empty for all sheets
 * @return the data as described above
 */
public Map<String, Object> readExcel(File excelFiel, String sheetName)
{
    readWordBook = getExcelFile(excelFiel);
    boolean wholeWorkbook = sheetName == null || "".equals(sheetName);
    Map<String, Object> data = wholeWorkbook ? getExcelAllData() : getExcelBySheet(sheetName);
    closeAllObject();
    log.info("文件读取成功");
    return data;
}
/**
 * Builds the cell format for title cells: centred horizontally and vertically on a light
 * blue background. Failures while applying the styling are printed and the (possibly
 * partially styled) format is still returned.
 *
 * @param headFont font of the title cells
 * @return the title format
 */
public WritableCellFormat getTitleFormat(WritableFont headFont)
{
    WritableCellFormat titleFormat = new WritableCellFormat(headFont);
    try
    {
        // Horizontal, then vertical centring.
        titleFormat.setAlignment(jxl.format.Alignment.CENTRE);
        titleFormat.setVerticalAlignment(jxl.format.VerticalAlignment.CENTRE);
        titleFormat.setBackground(Colour.LIGHT_BLUE);
    }
    catch (WriteException styleFailure)
    {
        styleFailure.printStackTrace();
    }
    return titleFormat;
}
/**
 * Reads every sheet of the workbook currently opened for reading.
 *
 * @return map from sheet name to that sheet's data map (see {@code getSheetData})
 */
private Map<String, Object> getExcelAllData()
{
    Sheet[] allSheets = readWordBook.getSheets();
    Map<String, Object> bySheetName = new HashMap<String, Object>();
    for (Sheet current : allSheets)
    {
        bySheetName.put(current.getName(), getSheetData(current));
    }
    return bySheetName;
}
/**
 * Reads one sheet of the workbook currently opened for reading.
 *
 * @param sheetName name of the sheet
 * @return the sheet's data map (see {@code getSheetData})
 */
private Map<String, Object> getExcelBySheet(String sheetName)
{
    return getSheetData(readWordBook.getSheet(sheetName));
}
/**
 * Converts a sheet into a data map with two entries: {@code "head"}, the contents of the
 * first row as column names, and {@code "body"}, one map per following row keyed by column
 * name. Cells missing from a short row are stored as {@code null}.
 *
 * <p>A {@code null} sheet or a sheet without rows yields empty head and body lists instead
 * of an exception.
 *
 * @param sheet sheet to read, may be {@code null}
 * @return the data map described above
 */
private Map<String, Object> getSheetData(Sheet sheet)
{
    List<String> headList = new ArrayList<String>();
    List<Map<String, Object>> bodyList = new ArrayList<Map<String, Object>>();
    Map<String, Object> sheetMap = new HashMap<String, Object>();
    sheetMap.put("head", headList);
    sheetMap.put("body", bodyList);
    if (sheet == null || sheet.getRows() == 0)
    {
        return sheetMap;
    }
    // Header row.
    for (Cell headCell : sheet.getRow(0))
    {
        headList.add(headCell.getContents());
    }
    // Data rows.
    int rowCount = sheet.getRows();
    for (int r = 1; r < rowCount; r++)
    {
        Cell[] row = sheet.getRow(r);
        Map<String, Object> bodyData = new HashMap<String, Object>();
        for (int c = 0; c < headList.size(); c++)
        {
            // Rows may be shorter than the header; check bounds instead of relying on an
            // exception to detect the missing cell.
            boolean present = row != null && c < row.length && row[c] != null;
            bodyData.put(headList.get(c), present ? row[c].getContents() : null);
        }
        bodyList.add(bodyData);
    }
    return sheetMap;
}
/**
 * Creates a new workbook on the given stream and writes the data into it, starting at
 * sheet position 0.
 *
 * @param headList  column names
 * @param bodyList  rows, each a map from column name to value
 * @param os        stream the workbook is written to
 * @param sheetName sheet name; "Sheet" is used when null or empty
 * @param excelFiel not used by this method
 */
public synchronized void createExcelWithStream(List<String> headList, List<Map<String, Object>> bodyList,
        OutputStream os, String sheetName, File excelFiel)
{
    this.os = os;
    this.headList = headList;
    this.bodyList = bodyList;

    getWriteWorkBookWithStream();
    String targetName = (null == sheetName || "".equals(sheetName)) ? "Sheet" : sheetName;
    buildSheet(0, targetName);
    try
    {
        os.flush();
    }
    catch (IOException flushFailure)
    {
        flushFailure.printStackTrace();
    }
    writeWorkBookWriter();
    closeAllObject();

    // Drop the references to the caller's data once the file is written.
    this.headList = null;
    this.bodyList = null;
    log.info("文件创建成功");
}
/**
 * Returns the names of the columns whose equal consecutive cells are merged when a sheet
 * is built.
 *
 * @return the merge column names; {@code null} when none were set
 */
public List<String> getMergeList()
{
return mergeList;
}
/**
 * Sets the names of the columns whose equal consecutive cells are merged when a sheet
 * is built.
 *
 * @param mergeList the merge column names; {@code null} or empty disables merging
 */
public void setMergeList(List<String> mergeList)
{
this.mergeList = mergeList;
}
/**
 * Adds a new sheet with the given data to an existing Excel file.
 *
 * @param headList  column names
 * @param bodyList  rows, each a map from column name to value
 * @param excelFile existing file that receives the new sheet
 * @param sheetName sheet name; "Sheet" is used when null or empty, and "副本" is appended
 *                  when a sheet of that name already exists
 */
public synchronized void addSheetInExcelWithFile(List<String> headList, List<Map<String, Object>> bodyList,
        File excelFile, String sheetName)
{
    this.headList = headList;
    this.bodyList = bodyList;

    getWorkBookWithFile(excelFile);
    int position = getSheetIndex();
    String targetName;
    if (null == sheetName || "".equals(sheetName))
    {
        targetName = "Sheet";
    }
    else
    {
        targetName = existsName(sheetName);
    }
    buildSheet(position, targetName);
    writeWorkBookWriter();
    closeAllObject();
    log.info("文件创建成功");

    // Drop the references to the caller's data once the file is written.
    this.headList = null;
    this.bodyList = null;
}
/**
 * Writes the current rows into the workbook being written.
 *
 * <p>Fewer than 50000 rows go into a single sheet named {@code sheetName}. Otherwise the
 * rows are split into consecutive chunks of at most 50000, each written to its own sheet
 * named {@code sheetName(<position>)}. Afterwards the configured merge columns are merged
 * on the sheet created last.
 *
 * @param index     position of the (first) new sheet in the workbook
 * @param sheetName base name of the new sheet(s)
 * @return number of sheets in the workbook after the call
 */
public int buildSheet(int index, String sheetName)
{
    final int limit = 50000;
    int size = bodyList.size();
    if (size < limit)
    {
        sheet = writeWorkBook.createSheet(sheetName, index);
        fileInToSheet(sheet, bodyList, headList);
    }
    else
    {
        int count = size % limit == 0 ? size / limit : size / limit + 1;
        for (int i = 0; i < count; i++)
        {
            sheet = writeWorkBook.createSheet(sheetName + "(" + (i + index) + ")", i + index);
            int fromIndex = i * limit;
            // subList's end index is exclusive, so the last chunk must end at size
            // (not size - 1) or the final row would be dropped.
            int toIndex = Math.min(fromIndex + limit, size);
            fileInToSheet(sheet, bodyList.subList(fromIndex, toIndex), headList);
        }
    }
    for (Integer mergeColNum : getmergeColNumS())
    {
        mergeCell(mergeColNum);
    }
    return writeWorkBook.getSheetNames().length;
}
/**
 * Translates the configured merge column names into column indexes.
 *
 * <p>Names that have no index in the header map are skipped, so the result never contains
 * {@code null} (callers unbox the elements).
 *
 * @return the column indexes to merge; empty when no merge columns are configured
 */
private List<Integer> getmergeColNumS()
{
    List<Integer> columnIndexes = new ArrayList<Integer>();
    if (null == mergeList || mergeList.isEmpty())
    {
        return columnIndexes;
    }
    for (String merge : mergeList)
    {
        Integer columnIndex = getHeadListMap().get(merge);
        if (columnIndex != null)
        {
            columnIndexes.add(columnIndex);
        }
    }
    return columnIndexes;
}
/**
 * Merges, within one column of the current sheet, every run of consecutive cells that have
 * equal contents. Failures reported by the sheet are printed and end the merging.
 *
 * @param colNum zero-based column index
 */
private void mergeCell(int colNum)
{
    try
    {
        for (Map.Entry<Integer, Integer> run : getMergeCellsList(colNum).entrySet())
        {
            // key = first row of the run, value = last row of the run
            sheet.mergeCells(colNum, run.getKey(), colNum, run.getValue());
        }
    }
    catch (RowsExceededException tooManyRows)
    {
        tooManyRows.printStackTrace();
    }
    catch (WriteException mergeFailure)
    {
        mergeFailure.printStackTrace();
    }
}
/**
 * Finds the runs of consecutive rows whose cells in the given column have equal
 * contents.
 *
 * @param colNum zero-based column index to scan
 * @return map from the first row of each run (length >= 2) to its last row
 */
private Map<Integer, Integer> getMergeCellsList(Integer colNum)
{
    Map<Integer, Integer> map = new HashMap<Integer, Integer>();
    int rows = sheet.getRows();
    int startRow = 0;
    while (startRow < rows)
    {
        String contents = sheet.getCell(colNum, startRow).getContents();
        int endRow = startRow;
        // Only look at rows that exist: comparing against the row after the last
        // one would extend a trailing run of empty cells beyond the sheet.
        while (endRow + 1 < rows && contents.equals(sheet.getCell(colNum, endRow + 1).getContents()))
        {
            endRow++;
        }
        if (endRow > startRow)
        {
            map.put(startRow, endRow);
        }
        startRow = endRow + 1;
    }
    return map;
}
/**
 * Creates the writable workbook on top of the {@code os} stream and, when that
 * succeeds, prepares the cell formats. I/O failures are printed and swallowed.
 */
private void getWriteWorkBookWithStream()
{
    try
    {
        // Creates the xls workbook backed by the output stream.
        writeWorkBook = Workbook.createWorkbook(os);
        if (null == writeWorkBook)
        {
            return;
        }
        formatting();
    }
    catch (IOException ioFailure)
    {
        // FileNotFoundException is an IOException and was handled the same way.
        ioFailure.printStackTrace();
    }
}
/**
 * Opens {@code excelFile} for reading, creates a writable workbook over the same
 * file initialised from the read copy, and prepares the cell formats. On an I/O
 * failure the file is scheduled for deletion at JVM exit and the error is printed.
 */
private void getWorkBookWithFile(File excelFile)
{
    try
    {
        readWordBook = getExcelFile(excelFile);
        writeWorkBook = Workbook.createWorkbook(excelFile, readWordBook);
        if (null == writeWorkBook)
        {
            return;
        }
        formatting();
    }
    catch (IOException ioFailure)
    {
        // Covers FileNotFoundException as well; both were handled identically.
        excelFile.deleteOnExit();
        ioFailure.printStackTrace();
    }
}
/**
 * Opens an existing Excel file as a read-only workbook.
 *
 * @param excelFile the file to open
 * @return the workbook, or {@code null} when the file could not be read or parsed
 */
private Workbook getExcelFile(File excelFile)
{
    Workbook opened = null;
    try
    {
        opened = Workbook.getWorkbook(excelFile);
    }
    catch (BiffException parseFailure)
    {
        parseFailure.printStackTrace();
    }
    catch (IOException ioFailure)
    {
        ioFailure.printStackTrace();
    }
    return opened;
}
/**
 * Computes the position at which a newly added sheet is created: the number of
 * sheets in the workbook that was read, plus one.
 *
 * @Title: getSheetIndex
 * @return position passed to the sheet-creation call
 */
private int getSheetIndex()
{
    return readWordBook.getSheets().length + 1;
}
/**
 * Makes a sheet name unique with respect to the sheets of the workbook that was
 * read, by appending "副本" for as long as the candidate collides with an
 * existing sheet name.
 *
 * @param sheetName requested sheet name
 * @return the requested name, or a suffixed variant that no existing sheet uses;
 *         the name is returned unchanged when no workbook has been read
 */
private String existsName(String sheetName)
{
    if (readWordBook == null)
    {
        return sheetName;
    }
    String[] names = readWordBook.getSheetNames();
    boolean collided = true;
    // A single pass is not enough: the suffixed name may equal a sheet that was
    // already passed over earlier in the array, so repeat until a pass is clean.
    while (collided)
    {
        collided = false;
        for (int i = 0; i < names.length; i++)
        {
            if (sheetName.equals(names[i]))
            {
                sheetName += "副本";
                collided = true;
            }
        }
    }
    return sheetName;
}
/**
 * Fills one sheet: writes the header row first, then the data rows.
 *
 * @Title: fileInSheet
 * @param sheet sheet to write into
 * @param list data rows, one map per row
 * @param headList header titles written to the first row; note that the data
 *            cells are looked up through the {@code headList} field in
 *            {@code builderCell}, not through this parameter
 */
private void fileInToSheet(WritableSheet sheet, List<Map<String, Object>> list, List<String> headList)
{
builderHeader(sheet, headList);
parserBean(sheet, list);
}
/**
 * Writes the header row (row 0) of a sheet, one label per title, using the
 * header cell format. A failure on one cell is printed and the remaining
 * titles are still written.
 *
 * @Title: builderHeader
 * @param sheet sheet to write into
 * @param list header titles in column order
 */
private void builderHeader(WritableSheet sheet, List<String> list)
{
    int column = 0;
    for (String title : list)
    {
        try
        {
            sheet.addCell(new Label(column, 0, title, headformat));
        }
        catch (RowsExceededException rowsExceeded)
        {
            rowsExceeded.printStackTrace();
        }
        catch (WriteException writeFailure)
        {
            writeFailure.printStackTrace();
        }
        column++;
    }
}
/**
 * Writes every data row of {@code list} into the sheet. Row 0 holds the
 * header, so the first data row goes to row 1.
 */
private void parserBean(WritableSheet sheet, List<Map<String, Object>> list)
{
    int row = 1;
    for (Map<String, Object> rowData : list)
    {
        builderCell(sheet, rowData, row);
        row++;
    }
}
/**
 * Writes one data row. For every title in the {@code headList} field the value is
 * looked up in {@code obj} (missing values become an empty string) and written as
 * a number cell (Integer, Double, Long), a hyperlink (HLink) or a text label.
 * Text labels on even rows use {@code formatColor}; all other cells use
 * {@code format}. A write failure aborts the rest of the row and is printed.
 *
 * @param sheet sheet to write into
 * @param obj row data keyed by header title
 * @param row zero-based row index to write
 */
private void builderCell(WritableSheet sheet, Map<String, Object> obj, int row)
{
    try
    {
        List<String> head = headList;
        for (int col = 0; col < head.size(); col++)
        {
            Object found = obj.get(head.get(col));
            Object o = (null != found) ? found : "";
            try
            {
                if (o instanceof Integer)
                {
                    sheet.addCell(new jxl.write.Number(col, row, Integer.valueOf(o.toString()), format));
                    continue;
                }
                if (o instanceof Double)
                {
                    sheet.addCell(new jxl.write.Number(col, row, Double.valueOf(o.toString()), format));
                    continue;
                }
                if (o instanceof Long)
                {
                    sheet.addCell(new jxl.write.Number(col, row, Long.valueOf(o.toString()), format));
                    continue;
                }
                if (o instanceof HLink)
                {
                    HLink l = (HLink)o;
                    if (null != l.getUrl())
                    {
                        WritableHyperlink link = new WritableHyperlink(col, row, l.getUrl());
                        link.setDescription(l.getDescription());
                        sheet.addHyperlink(link);
                    }
                    else
                    {
                        // No target URL: fall back to a plain label flagged as broken.
                        sheet.addCell(new Label(col, row, "超链接出错:_" + l.getDescription(), format));
                    }
                    continue;
                }
                // Plain text: even rows get the shaded format.
                WritableCellFormat textFormat = (row % 2 == 0) ? formatColor : format;
                sheet.addCell(new Label(col, row, o.toString(), textFormat));
            }
            catch (NumberFormatException e)
            {
                log.error("第几列:{}\t列名:{}\t数据:" + o, col, headList.get(col));
            }
        }
    }
    catch (RowsExceededException rowsExceeded)
    {
        rowsExceeded.printStackTrace();
    }
    catch (WriteException writeFailure)
    {
        writeFailure.printStackTrace();
    }
}
/**
 * Creates the three cell formats used when writing (regular cells, shaded cells
 * and header cells) for the current writable workbook and stores them in the
 * corresponding fields.
 *
 * @Title: formatting
 */
private void formatting()
{
format = getCellFormat(writeWorkBook);
formatColor = getCellSimpleFormat(writeWorkBook);
headformat = getHeaderFormat(writeWorkBook);
}
/**
 * Closes the writable workbook, the output stream and the read workbook, in that
 * order. Each resource is closed independently, so a failure while closing one
 * of them no longer prevents the others from being released.
 */
public void closeAllObject()
{
    if (writeWorkBook != null)
    {
        try
        {
            writeWorkBook.close();
        }
        catch (WriteException e)
        {
            log.error("failed to close the writable workbook", e);
        }
        catch (IOException e)
        {
            log.error("failed to close the writable workbook", e);
        }
    }
    if (os != null)
    {
        try
        {
            os.close();
        }
        catch (IOException e)
        {
            log.error("failed to close the output stream", e);
        }
    }
    if (readWordBook != null)
    {
        readWordBook.close();
    }
}
/**
 * Builds the header cell format: bold white 10pt Times font, centred both ways,
 * thin border in {@code Colour.BLUE2} and a {@code Colour.GRAY_50} background.
 * The workbook palette entry for GRAY_50 is set to RGB(166, 166, 166) first.
 * Any exception while configuring the format is printed and the partially
 * configured format is returned.
 *
 * @param wb workbook whose palette is adjusted
 * @return the header cell format
 */
public WritableCellFormat getHeaderFormat(WritableWorkbook wb)
{
    WritableCellFormat headerFormat = new WritableCellFormat(
            new WritableFont(WritableFont.TIMES, 10, WritableFont.BOLD, false,UnderlineStyle.NO_UNDERLINE,Colour.WHITE));
    try
    {
        wb.setColourRGB(Colour.GRAY_50, 166, 166, 166);
        // centre horizontally
        headerFormat.setAlignment(jxl.format.Alignment.CENTRE);
        // centre vertically
        headerFormat.setVerticalAlignment(jxl.format.VerticalAlignment.CENTRE);
        // thin border on all sides
        headerFormat.setBorder(Border.ALL, BorderLineStyle.THIN, Colour.BLUE2);
        headerFormat.setBackground(Colour.GRAY_50);
    }
    catch (Exception failure)
    {
        failure.printStackTrace();
    }
    return headerFormat;
}
/**
 * Builds the shaded cell format: 9pt non-bold "微软雅黑" font, centred both ways,
 * thin border in {@code Colour.BLUE2} and a {@code Colour.GRAY_80} background.
 * The workbook palette entry for GRAY_80 is set to RGB(242, 242, 242) first.
 * Any exception while configuring the format is printed and the partially
 * configured format is returned.
 *
 * @param wb workbook whose palette is adjusted
 * @return the shaded cell format
 */
public WritableCellFormat getCellSimpleFormat(WritableWorkbook wb)
{
    WritableCellFormat shadedFormat = new WritableCellFormat(
            new WritableFont(WritableFont.createFont("微软雅黑"), 9, WritableFont.NO_BOLD, false));
    try
    {
        // redefine the palette colour used for the background
        wb.setColourRGB(Colour.GRAY_80, 242, 242, 242);
        // centre horizontally
        shadedFormat.setAlignment(jxl.format.Alignment.CENTRE);
        // centre vertically
        shadedFormat.setVerticalAlignment(jxl.format.VerticalAlignment.CENTRE);
        // thin border on all sides
        shadedFormat.setBorder(Border.ALL, BorderLineStyle.THIN, Colour.BLUE2);
        shadedFormat.setBackground(Colour.GRAY_80);
    }
    catch (Exception failure)
    {
        failure.printStackTrace();
    }
    return shadedFormat;
}
/**
 * Builds the regular cell format: 9pt non-bold "微软雅黑" font, centred both ways,
 * with a thin border in {@code Colour.BLUE2}. The workbook palette entry for
 * BLUE2 is set to RGB(0, 176, 240) first. Any exception while configuring the
 * format is printed and the partially configured format is returned.
 *
 * @param wb workbook whose palette is adjusted
 * @return the regular cell format
 */
public WritableCellFormat getCellFormat(WritableWorkbook wb)
{
    WritableCellFormat plainFormat = new WritableCellFormat(
            new WritableFont(WritableFont.createFont("微软雅黑"), 9, WritableFont.NO_BOLD, false));
    try
    {
        // redefine the palette colour used for the border
        wb.setColourRGB(Colour.BLUE2, 0, 176, 240);
        // centre horizontally
        plainFormat.setAlignment(jxl.format.Alignment.CENTRE);
        // centre vertically
        plainFormat.setVerticalAlignment(jxl.format.VerticalAlignment.CENTRE);
        // thin border on all sides
        plainFormat.setBorder(Border.ALL, BorderLineStyle.THIN, Colour.BLUE2);
    }
    catch (Exception failure)
    {
        failure.printStackTrace();
    }
    return plainFormat;
}
/**
 * Returns the cell format stored in the {@code format} field, which the row
 * writer applies to number cells, hyperlink fallbacks and odd-row text cells.
 *
 * @return the format
 */
public WritableCellFormat getFormat()
{
return format;
}
/**
 * Returns the cell format stored in the {@code headformat} field, which is
 * applied to the header row.
 *
 * @return the headformat
 */
public WritableCellFormat getHeadformat()
{
return headformat;
}
/**
 * Replaces the cell format stored in the {@code format} field.
 *
 * @param format the format to set
 */
public void setFormat(WritableCellFormat format)
{
this.format = format;
}
/**
 * Replaces the cell format stored in the {@code formatColor} field, which the
 * row writer applies to text cells on even rows.
 *
 * @param formatColor the shaded format to set
 */
public void setFormatColor(WritableCellFormat formatColor)
{
this.formatColor = formatColor;
}
/**
 * Replaces the cell format stored in the {@code headformat} field, which is
 * applied to the header row.
 *
 * @param headformat the headformat to set
 */
public void setHeadformat(WritableCellFormat headformat)
{
this.headformat = headformat;
}
/**
 * Writes the content of the writable workbook to its output. An I/O failure is
 * logged instead of being silently discarded; it is still not propagated, so
 * callers keep their existing control flow.
 */
private void writeWorkBookWriter()
{
    try
    {
        writeWorkBook.write();
    }
    catch (IOException e)
    {
        log.error("failed to write the workbook", e);
    }
}
/**
 * Builds a lookup from header title to its zero-based column index, based on the
 * {@code headList} field. When a title occurs more than once the last index wins.
 *
 * @Title: getHeadListMap
 * @return header title -> column index; empty when there is no header list
 */
private Map<String, Integer> getHeadListMap()
{
    Map<String, Integer> indexByTitle = new HashMap<String, Integer>();
    if (null == headList)
    {
        return indexByTitle;
    }
    for (int col = 0; col < headList.size(); col++)
    {
        indexByTitle.put(headList.get(col), col);
    }
    return indexByTitle;
}
}
/**
 * @Title: MongoSerivce.java
 * @Package com.zhiweidata.weiboDomain.service
 * @Description: Service that crawls Weibo domain pages and stores the results through the DAOs.
 * @author xuyimeng
 * @date 2018-02-23 16:55:00
 * @version V1.0
 */
package com.zhiweidata.weiboDomain.service;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.annotation.Resource;
import org.springframework.stereotype.Service;
import com.zhiweidata.weiboDomain.crawler.JsoupHtml;
import com.zhiweidata.weiboDomain.crawler.WeiboDomainCrawler;
import com.zhiweidata.weiboDomain.dao.DomainDao;
import com.zhiweidata.weiboDomain.dao.TagDao;
import com.zhiweidata.weiboDomain.entity.DomainTag;
import com.zhiweidata.weiboDomain.entity.WeiboDomain;
import lombok.extern.slf4j.Slf4j;
/**
 * Crawls the user lists of Weibo domain pages and stores them through the DAOs.
 * For every tag selected by {@code tagDao.findByState(1)} the pages are fetched
 * with {@link WeiboDomainCrawler}, parsed with {@link JsoupHtml}, inserted via
 * {@link DomainDao} and the tag is then updated with state 2.
 *
 * @ClassName: MongoSerivce
 * @author xuyimeng
 * @date 2018-02-23 16:55:00
 */
@Slf4j
@Service
public class MongoSerivce {
    /** Pause between two requests, in milliseconds. */
    private static final long REQUEST_INTERVAL_MILLIS = 3000L;
    /** Exclusive upper bound of the page counter while crawling one domain. */
    private static final int PAGE_LIMIT = 300;

    @Resource
    TagDao tagDao;
    @Resource
    DomainDao domainDao;
    WeiboDomainCrawler crawler = new WeiboDomainCrawler();
    JsoupHtml jsoupHtml = JsoupHtml.getInstance();

    /**
     * Crawls every selected domain, stores its records and updates the tag state.
     *
     * @param cookie cookie header value sent with every request
     */
    public void crawlerData(String cookie) {
        Map<String, String> map = groupSet();
        for (Map.Entry<String, String> entry : map.entrySet()) {
            String domain = entry.getKey();
            String domainId = entry.getValue();
            log.info("【{}】页开始爬取...............",domain);
            List<WeiboDomain> list = parse(domain, domainId, cookie);
            log.info("【{}】页所有数据爬取结束...............",domain);
            domainDao.insert(list);
            tagDao.updateByState(domain, 2);
            log.info("【{}】所有页数据存储成功,共计【{}】条数据",domain,list.size());
        }
        log.info("所有页面爬取结束,程序结束");
    }

    /**
     * Fetches the domain page until a non-zero page count can be parsed from it,
     * pausing after every request.
     */
    private int getPageNum(String domainId,String cookie) {
        while (true) {
            String page = crawler.getPage(domainId, cookie);
            crawler.sleep(REQUEST_INTERVAL_MILLIS);
            int num = jsoupHtml.parsePage(page);
            if (num != 0) {
                return num;
            }
            // Make the otherwise invisible retry loop observable in the logs.
            log.warn("page count of [{}] could not be parsed, retrying", domainId);
        }
    }

    /**
     * Crawls the user-list pages of one domain, starting at page 1.
     * A page that cannot be fetched is retried while the collected records still
     * look short of the expected page count; otherwise crawling stops.
     */
    private List<WeiboDomain> parse(String domain,String domainId,String cookie) {
        List<WeiboDomain> result = new ArrayList<>();
        int num = getPageNum(domainId, cookie);
        int i = 1;
        while (i < PAGE_LIMIT) {
            String url = "https://d.weibo.com/"+domainId+"?pids=Pl_Core_F4RightUserList__4"
                    + "&page="+i+"&ajaxpagelet=1&__ref=/"+domainId;
            String html = crawler.getHtml(url, cookie);
            if (html == null) {
                // presumably 10 records per page - TODO confirm against the parser
                if ((result.size()/10)+2 < num) {
                    // Pause before retrying; retrying immediately would hammer the
                    // server in a tight loop for as long as the request keeps failing.
                    log.warn("page {} of [{}] returned nothing, retrying", i, domain);
                    crawler.sleep(REQUEST_INTERVAL_MILLIS);
                    continue;
                }
                break;
            }
            List<WeiboDomain> list = jsoupHtml.parseData(html,domain);
            result.addAll(list);
            log.info("【{}】:第【{}】页爬取成功",domain,i);
            i++;
            crawler.sleep(REQUEST_INTERVAL_MILLIS);
        }
        return result;
    }

    /**
     * Loads the tags found with state 1, updates each of their domains with
     * state 0 and returns them as a domain -> domainId map.
     */
    private Map<String,String> groupSet(){
        Map<String, String> result = toDomainIdMap(tagDao.findByState(1));
        resetState(result);
        return result;
    }

    /**
     * Updates every tag with state 0 and creates a new collection; meant to be
     * called before crawling everything again.
     *
     * @Title: initTag
     */
    public void initTag() {
        resetState(toDomainIdMap(tagDao.findAll()));
        domainDao.createColl();
    }

    /** Collects the tags into a domain -> domainId map (later duplicates win). */
    private Map<String, String> toDomainIdMap(List<DomainTag> tags) {
        Map<String, String> result = new HashMap<>();
        for (DomainTag domainTag : tags) {
            result.put(domainTag.getDomain(), domainTag.getDomainId());
        }
        return result;
    }

    /** Calls {@code tagDao.updateByState(domain, 0)} once per distinct domain. */
    private void resetState(Map<String, String> domains) {
        for (String domain : domains.keySet()) {
            tagDao.updateByState(domain, 0);
        }
    }
}
/**
 * @Title: Start.java
 * @Package com.zhiweidata.weiboDomain.start
 * @Description: Command-line entry point that starts the crawl.
 * @author xuyimeng
 * @date 2018-02-23 15:09:33
 * @version V1.0
 */
package com.zhiweidata.weiboDomain.start;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;
import com.zhiweidata.weiboDomain.service.MongoSerivce;
/**
 * Command-line entry point: loads the Spring context from
 * {@code spring-context.xml} and runs the crawl with a Weibo session cookie.
 * The cookie is taken from the first program argument, else from the
 * {@code WEIBO_COOKIE} environment variable, else from the built-in default.
 *
 * @ClassName: Start
 * @author xuyimeng
 * @date 2018-02-23 15:09:33
 */
public class Start {
    /**
     * Built-in cookie, kept so that running without arguments behaves as before.
     * NOTE(review): this is a session credential committed to source control;
     * it should be rotated and supplied externally instead.
     */
    private static final String DEFAULT_COOKIE = "login_sid_t=2da8770fb84cdb5be026bbfcd76ef1e6; cross_origin_proto=SSL; _s_tentry=passport.weibo.com; Apache=873655794108.0503.1519525903336; SINAGLOBAL=873655794108.0503.1519525903336; ULV=1519525903344:1:1:1:873655794108.0503.1519525903336:; SSOLoginState=1519525975; SCF=AqU8lfV6ROhTkYEEmVi2ROhtdMxlB0mT3EF2ABKenC3OfC3SeK3YfvZYWFJY8ytsaFhYcc1vO5hvhLwolzBW5ps.; SUB=_2A253llAIDeRhGeNH6VoY9C7Mzz-IHXVU4sbArDV8PUNbmtBeLUnSkW9NStghaGFgK4WPoq15L2ikM_srwT7hNvkI; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W5eochNrdf3XKPD1VaPcG3T5JpX5K2hUgL.Fo-4eon4Sh57She2dJLoIEQLxK-LBKBLBo2LxKBLBo.L12zLxK-L1-BLBKqLxKML1hBLBoqEeh2ceh-t; SUHB=0mxUFkR8aaPo5m; ALF=1551061975; un=18395807152; wvr=6; YF-Page-G0=416186e6974c7d5349e42861f3303251";
    /** Environment variable consulted when no cookie argument is given. */
    private static final String COOKIE_ENV = "WEIBO_COOKIE";

    private static ApplicationContext ctx = new ClassPathXmlApplicationContext("spring-context.xml");
    private static MongoSerivce serice = ctx.getBean(MongoSerivce.class);

    /**
     * Runs the crawl.
     *
     * @param args optional; {@code args[0]} overrides the cookie
     */
    public static void main(String[] args) {
        String cookie = resolveCookie(args);
        // Resets the tag states before crawling everything again.
        // Leave it commented out when resuming an interrupted crawl.
        // serice.initTag();
        serice.crawlerData(cookie);
    }

    /** Picks the cookie: first argument, then environment variable, then default. */
    private static String resolveCookie(String[] args) {
        if (args != null && args.length > 0 && args[0] != null && !args[0].trim().isEmpty()) {
            return args[0];
        }
        String fromEnv = System.getenv(COOKIE_ENV);
        if (fromEnv != null && !fromEnv.trim().isEmpty()) {
            return fromEnv;
        }
        return DEFAULT_COOKIE;
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment