Commit 158abbbc by win7

sss

parents
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Maven build descriptor for the weiboDomain crawler (Weibo ranking pages -> MongoDB). -->
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiweidata.weiboDomain</groupId>
<artifactId>weiboDomain</artifactId>
<version>0.0.1-SNAPSHOT</version>
<dependencies>
<!-- JUnit 4 test framework. NOTE(review): declared without a test scope; confirm whether that is intended. -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
</dependency>
<!-- Spring Data MongoDB: MongoTemplate, Query/Criteria/Update and the mapping annotations used by the DAO and entity classes. -->
<dependency>
<groupId>org.springframework.data</groupId>
<artifactId>spring-data-mongodb</artifactId>
<version>1.10.10.RELEASE</version>
</dependency>
<!-- Spring test support. NOTE(review): also declared without a test scope. -->
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-test</artifactId>
<version>4.3.14.RELEASE</version>
</dependency>
<!-- MongoDB Java driver (com.mongodb.DBObject is used by the Excel export helper). -->
<dependency>
<groupId>org.mongodb</groupId>
<artifactId>mongo-java-driver</artifactId>
<version>3.2.2</version>
</dependency>
<!-- Lombok: @Data on the entities and @Slf4j on the service. -->
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.16.20</version>
</dependency>
<!-- jsoup: HTML parsing of the crawled pages. -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.1</version>
</dependency>
<!-- Apache HttpClient: HTTP transport for the crawler. -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.5</version>
</dependency>
<!-- Commons IO: IOUtils for reading response bodies. -->
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.5</version>
</dependency>
<!-- Commons Lang 3: StringUtils. -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.7</version>
</dependency>
<!-- json-lib (jdk15 classifier): JSONObject used to unwrap the FM.view(...) payloads. -->
<dependency>
<groupId>net.sf.json-lib</groupId>
<artifactId>json-lib</artifactId>
<version>2.4</version>
<classifier>jdk15</classifier>
</dependency>
<!-- In-house artifact. NOTE(review): presumably provides the Excel reporting classes used by DBOExp; confirm. -->
<dependency>
<groupId>com.zhiwei</groupId>
<artifactId>jxlzw</artifactId>
<version>0.0.2-SNAPSHOT</version>
</dependency>
</dependencies>
</project>
\ No newline at end of file
/**
* @Title: HttpclientInstance.java
* @Package com.zhiweidata.weiboDomain.crawler
* @Description: Builds pooled Apache HttpClient instances for the crawler.
* @author xuyimeng
* @date 2018年2月23日 下午1:54:32
* @version V1.0
*/ /**
*
*/
package com.zhiweidata.weiboDomain.crawler;
import java.util.List;
import javax.net.ssl.SSLContext;
import org.apache.http.HttpHost;
import org.apache.http.client.CookieStore;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.cookie.Cookie;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.ssl.SSLContexts;
/**
* @ClassName: httpclientInstance
* @Description: TODO(http连接管理,生成http对象)
* @author xuyimeng
* @date 2018年2月23日 下午1:54:32
*/
public class HttpclientInstance {
//设置userAgent池
private static final String[] userAgent = {
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
"NokiaX2-02/2.0 (11.79) Profile/MIDP-2.1 Configuration/CLDC-1.1 Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2;.NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2) UCBrowser8.4.0.159/70/352",
"Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Version/3.1 Safari/525.13",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6",
"Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B137 Safari/601.1",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.9) Gecko/20100315 Firefox/3.5.9",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
"Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.96 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"};
/**
* @Title: generateClient
* @Description: TODO(用连接池生成httpclient)
* @return
* CloseableHttpClient 返回类型
*/
public static CloseableHttpClient generateClient(CookieStore cookieStore) {
return generateClient(null, cookieStore);
}
/**
* @Title: generateClient
* @Description: TODO(增加代理)
* @param httpHost
* @return
* CloseableHttpClient 返回类型
*/
public static CloseableHttpClient generateClient(HttpHost httpHost,CookieStore cookieStore) {
SSLContext sslcontext = SSLContexts.createSystemDefault();
Registry<ConnectionSocketFactory> socketFactoryRegistry = RegistryBuilder.<ConnectionSocketFactory>create()
.register("http", PlainConnectionSocketFactory.INSTANCE)
.register("https", new SSLConnectionSocketFactory(sslcontext)).build();
// http连接池管理,服务于多个执行进程的连接请求
PoolingHttpClientConnectionManager connectionManager = new PoolingHttpClientConnectionManager(
socketFactoryRegistry);
connectionManager.setMaxTotal(200);
connectionManager.setDefaultMaxPerRoute(20);
RequestConfig requestConfig = RequestConfig.custom().setProxy(httpHost).build();
HttpClientBuilder httpClientBuilder = HttpClients.custom().setUserAgent(randomUserAgent())
.setConnectionManager(connectionManager).setDefaultRequestConfig(requestConfig).setDefaultCookieStore(cookieStore);
return httpClientBuilder.build();
}
/**
* @Title: randomUserAgent
* @Description: TODO(随机取一个ua)
* @return
* String 返回类型
*/
public static String randomUserAgent() {
return userAgent[(int) (Math.random() * userAgent.length)];
}
}
/**
* @Title: JsoupHtml.java
* @Package com.zhiweidata.weiboDomain.crawler
* @Description: Parses crawled Weibo ranking pages into WeiboDomain records.
* @author xuyimeng
* @date 2018年2月23日 下午3:16:06
* @version V1.0
*/ /**
*
*/
package com.zhiweidata.weiboDomain.crawler;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.List;
import javax.print.Doc;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.zhiweidata.weiboDomain.entity.WeiboDomain;
import net.sf.json.JSONObject;
/**
 * Parses the HTML returned by the Weibo ranking pages.
 *
 * <p>The pages do not contain the user list directly: it is delivered as a
 * JSON object passed to a {@code FM.view(...)} call inside a {@code <script>}
 * tag, with the real markup stored under the {@code "html"} key. This class
 * unwraps that payload and then reads it with jsoup.
 *
 * @author xuyimeng
 * @date 2018-02-23 15:16:06
 */
public class JsoupHtml {

    /** Layout of the {@code updateTime} value written on every parsed record. */
    private static final DateTimeFormatter UPDATE_TIME_FORMAT =
            DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");

    private static JsoupHtml jsoupHtml = new JsoupHtml();

    private JsoupHtml() {}

    /**
     * @return the shared parser instance
     */
    public static JsoupHtml getInstance() {
        return jsoupHtml;
    }

    /**
     * Converts one user-list page into {@link WeiboDomain} records.
     *
     * @param html   raw page; its first {@code <script>} must hold the
     *               {@code parent.FM.view({...})} payload
     * @param domain domain label stored on every record and used as id prefix
     * @return the parsed records; empty when the page has no usable script payload
     */
    public List<WeiboDomain> parseData(String html, String domain) {
        List<WeiboDomain> result = new ArrayList<>();
        Document doc = Jsoup.parse(html);
        Elements scripts = doc.getElementsByTag("script");
        // A page without a (non-empty) first script has nothing to unwrap.
        if (scripts.isEmpty() || scripts.get(0).childNodeSize() == 0) {
            return result;
        }
        String payload = getHtml(scripts.get(0).childNode(0).toString());

        Document fragment = Jsoup.parse(payload);
        for (Element dd : fragment.getElementsByTag("dd")) {
            if ("mod_info S_line1".equals(dd.attr("class"))) {
                result.add(parseUser(dd, domain));
            }
        }
        return result;
    }

    /** Builds one record from a {@code <dd class="mod_info S_line1">} element. */
    private WeiboDomain parseUser(Element dd, String domain) {
        WeiboDomain user = new WeiboDomain();
        String uid = "";
        for (Element div : dd.getElementsByTag("div")) {
            switch (div.attr("class")) {
                case "info_name W_fb W_f14":
                    uid = fillIdentity(div, user);
                    break;
                case "info_connect": {
                    Elements em = div.getElementsByTag("em");
                    user.setFriends_count(Integer.parseInt(em.get(0).text()));
                    user.setFollowers_count(em.get(1).text());
                    user.setStatuses_count(Integer.parseInt(em.get(2).text()));
                    break;
                }
                case "info_add":
                    user.setLocation(div.getElementsByTag("span").get(0).text());
                    break;
                case "info_intro":
                    user.setDescription(div.getElementsByTag("span").get(0).text());
                    break;
                case "info_relation": {
                    // Text is "<label>:<tag>"; leave the tag unset when there is no separator.
                    String[] parts = div.text().split(":");
                    if (parts.length > 1) {
                        user.setTag(parts[1]);
                    }
                    break;
                }
                default:
                    break;
            }
        }
        user.setDomain(domain);
        user.setUpdateTime(LocalDateTime.now().format(UPDATE_TIME_FORMAT));
        user.setId(domain + "_" + uid);
        return user;
    }

    /**
     * Reads uid, url, name, VIP flag and gender from the name block.
     *
     * @return the uid extracted from the {@code usercard} attribute
     */
    private String fillIdentity(Element div, WeiboDomain user) {
        Element link = div.getElementsByClass("S_txt1").get(0);
        String uid = link.attr("usercard").split("&")[0].replaceAll("id=", "");
        user.setUid(uid);
        user.setUrl(link.attr("href"));
        user.setName(link.attr("title"));

        Elements icons = div.getElementsByTag("i");
        boolean male = false;
        for (Element icon : icons) {
            String iconClass = icon.attr("class");
            if ("W_icon icon_member".equals(iconClass)) {
                user.setVip(true);
            }
            if ("W_icon icon_male".equals(iconClass)) {
                male = true;
            }
        }
        // Decide the gender once, after all icons were seen. The previous
        // per-icon else-branch reset a male user to "f" whenever another icon
        // (e.g. the member badge) followed the male icon.
        if (!icons.isEmpty()) {
            user.setGender(male ? "m" : "f");
        }
        return uid;
    }

    /**
     * Unwraps a {@code parent.FM.view({...})} script body and returns the
     * markup stored under its {@code "html"} key.
     *
     * @param str script body
     * @return the embedded HTML
     */
    private String getHtml(String str) {
        JSONObject json = JSONObject.fromObject(extractJson(str));
        return json.getString("html");
    }

    /**
     * Cuts the JSON object literal out of a {@code FM.view(...)} call.
     *
     * <p>Only the text between the first '{' and the last '}' is kept, so
     * parentheses that occur inside the embedded HTML survive. The input is
     * returned unchanged when it holds no object literal.
     */
    private static String extractJson(String call) {
        int start = call.indexOf('{');
        int end = call.lastIndexOf('}');
        if (start < 0 || end < start) {
            return call;
        }
        return call.substring(start, end + 1);
    }

    /**
     * Reads the highest page number from the pagination links of a ranking page.
     *
     * @param page raw HTML of the ranking landing page
     * @return the largest page number found, or 0 when the page cannot be
     *         parsed or shows no pagination links
     */
    public int parsePage(String page) {
        try {
            Document doc = Jsoup.parse(page);
            // Find the script carrying the user-list pagelet; the last match wins.
            String str = "";
            for (Element script : doc.getElementsByTag("script")) {
                // External or empty <script> tags have no child node to inspect.
                if (script.childNodeSize() == 0) {
                    continue;
                }
                String body = script.childNode(0).toString();
                if (body.contains("content.signInPeople.index")) {
                    str = body;
                }
            }
            JSONObject json = JSONObject.fromObject(extractJson(str));
            Document fragment = Jsoup.parse(json.getString("html"));

            int num = 0;
            for (Element link : fragment.getElementsByTag("a")) {
                boolean pageLink = "page".equals(link.attr("bpfilter"))
                        && "page S_txt1".equals(link.attr("class"));
                if (!pageLink) {
                    continue;
                }
                try {
                    num = Math.max(num, Integer.parseInt(link.text()));
                } catch (NumberFormatException ignored) {
                    // A non-numeric link label must not discard the numbers already found.
                }
            }
            return num;
        } catch (Exception e) {
            // Callers treat 0 as "page count unavailable" and retry.
            return 0;
        }
    }
}
/**
* @Title: WeiboDomainCrawler.java
* @Package com.zhiweidata.weiboDomain.crawler
* @Description: HTTP crawler that downloads Weibo ranking pages from d.weibo.com.
* @author xuyimeng
* @date 2018年2月23日 下午1:59:46
* @version V1.0
*/ /**
*
*/
package com.zhiweidata.weiboDomain.crawler;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.function.Predicate;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHeaders;
import org.apache.http.HttpHost;
import org.apache.http.HttpStatus;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.GzipDecompressingEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.entity.ContentType;
import org.apache.http.impl.client.CloseableHttpClient;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Downloads Weibo ranking pages over HTTP and returns their body as text.
 *
 * @author xuyimeng
 * @date 2018-02-23 13:59:46
 */
public class WeiboDomainCrawler {

    /** Pause between two attempts of the same request, in milliseconds. */
    private static final long RETRY_DELAY_MILLIS = 3000L;

    private static CloseableHttpClient client = HttpclientInstance.generateClient(null);

    /** Proxy applied to every request; never assigned in this class, so it stays {@code null}. */
    private HttpHost httpHost;

    /**
     * Fetches the given URL.
     *
     * @param url    address to request
     * @param cookie value sent as the {@code Cookie} header
     * @return the page body, or {@code null} when the server answered 200 with
     *         content that fails the user-list check
     */
    public String getHtml(String url, String cookie) {
        return get(createHttpGet(url, cookie));
    }

    /**
     * Fetches the landing page of one ranking domain.
     *
     * @param domainId path segment appended to {@code https://d.weibo.com/}
     * @param cookie   value sent as the {@code Cookie} header
     * @return the page body, or {@code null} (see {@link #getHtml(String, String)})
     */
    public String getPage(String domainId, String cookie) {
        String url = "https://d.weibo.com/" + domainId;
        return get(createHttpGet(url, cookie));
    }

    /**
     * Executes the request with the default rejection rule: a body is rejected
     * when it is empty, contains the "empty result" marker, or lacks the
     * user-list item marker.
     */
    private String get(HttpGet httpGet) {
        Predicate<String> rejected = s -> (s == null || "".equals(s))
                || s.contains("empty_con clearfix") || !s.contains("follow_item S_line2");
        return get(httpGet, rejected);
    }

    /**
     * Executes the request until it yields an accepted body or a 200 response.
     *
     * @param httpGet   request to execute (re-used across attempts)
     * @param predicate returns {@code true} for bodies that must be rejected
     * @return the accepted body, or {@code null} when a 200 response carried a
     *         rejected body
     */
    private String get(HttpGet httpGet, Predicate<String> predicate) {
        while (true) {
            // try-with-resources hands the connection back to the pool on every
            // path; without it each call leaked one pooled connection.
            try (CloseableHttpResponse response = client.execute(httpGet)) {
                String responseContent = getResponseContent(response.getEntity());
                if (!predicate.test(responseContent)) {
                    return responseContent;
                }
                if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
                    return null;
                }
            } catch (Exception e) {
                // Network and decoding failures are retried after the pause below.
            }
            // Back off before every retry, including non-200 answers, so a
            // failing server is not hit in a tight loop.
            sleep(RETRY_DELAY_MILLIS);
        }
    }

    /**
     * Reads the entity body as text.
     *
     * <p>The charset comes from the Content-Type header when present,
     * otherwise from a {@code <meta>} tag of the page, otherwise the platform
     * default is used.
     *
     * @param httpEntity response entity, may be {@code null}
     * @return the decoded body, or {@code null} when there is no entity
     * @throws IOException when the body cannot be read
     */
    public static String getResponseContent(final HttpEntity httpEntity) throws IOException {
        if (httpEntity == null) {
            return null;
        }
        Header encoding = httpEntity.getContentEncoding();
        // Decompress first when the entity is still marked as gzip-encoded.
        boolean gzipped = null != encoding && "gzip".equalsIgnoreCase(encoding.getValue());
        HttpEntity source = gzipped ? new GzipDecompressingEntity(httpEntity) : httpEntity;

        byte[] body;
        try (InputStream in = source.getContent()) {
            body = IOUtils.toByteArray(in);
        }

        Charset charset = null;
        ContentType contentType = ContentType.get(httpEntity);
        if (contentType != null) {
            charset = contentType.getCharset();
        }
        if (charset == null) {
            // No charset in the headers: decode with the platform default and
            // look for a <meta> declaration inside the page.
            String content = new String(body, Charset.defaultCharset());
            charset = getHtmlCharset(content);
            if (charset == null) {
                return content;
            }
        }
        return new String(body, charset);
    }

    /**
     * Looks for a charset declaration in the {@code <meta>} tags of a page.
     *
     * @param html page text, may be {@code null} or empty
     * @return the first declared charset that the JVM supports, or
     *         {@code null} when none is declared or usable
     */
    public static Charset getHtmlCharset(final String html) {
        if (StringUtils.isEmpty(html)) {
            return null;
        }
        final String marker = "charset=";
        Document document = Jsoup.parse(html);
        for (Element meta : document.select("meta")) {
            String metaContent = meta.attr("content");
            String metaCharset = meta.attr("charset");
            String name;
            if (metaContent.contains(marker)) {
                // e.g. content="text/html; charset=utf-8" -> "utf-8"
                name = metaContent.substring(metaContent.indexOf(marker) + marker.length());
                int end = name.indexOf(';');
                if (end >= 0) {
                    name = name.substring(0, end);
                }
            } else if (!StringUtils.isEmpty(metaCharset)) {
                name = metaCharset;
            } else {
                continue;
            }
            try {
                return Charset.forName(name.trim());
            } catch (IllegalArgumentException e) {
                // Illegal or unsupported charset name: try the next <meta> tag
                // instead of failing the whole download.
            }
        }
        return null;
    }

    /**
     * Creates the GET request with browser-like headers and the session cookie.
     */
    private HttpGet createHttpGet(String url, String cookie) {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(getRequestConfig());
        httpGet.setHeader(HttpHeaders.USER_AGENT, "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36");
        httpGet.addHeader(HttpHeaders.ACCEPT, "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
        // "br" is deliberately not advertised: HttpClient 4.5 has no Brotli
        // decoder, so a Brotli-compressed answer would reach the parser as
        // raw bytes.
        httpGet.addHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate");
        httpGet.addHeader(HttpHeaders.ACCEPT_LANGUAGE, "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7");
        httpGet.addHeader(HttpHeaders.CONNECTION, "keep-alive");
        httpGet.addHeader("Cookie", cookie);
        httpGet.addHeader(HttpHeaders.HOST, "d.weibo.com");
        return httpGet;
    }

    /**
     * Builds the per-request configuration: 3 second socket, connect and
     * connection-request timeouts, plus the proxy held in {@link #httpHost}.
     *
     * @return the request configuration
     */
    private RequestConfig getRequestConfig() {
        return RequestConfig.custom().setSocketTimeout(3000).setConnectTimeout(3000).setConnectionRequestTimeout(3000)
                .setProxy(httpHost).build();
    }

    /**
     * Pauses the current thread.
     *
     * <p>An interrupt only ends the pause early; the interrupt flag is not
     * restored, because the retry loops in this project do not check it and
     * would otherwise spin without any delay.
     *
     * @param time pause length in milliseconds
     */
    public void sleep(long time) {
        try {
            Thread.sleep(time);
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }
}
/**
* @Title: DomainDao.java
* @Package com.zhiweidata.weiboDomain.dao
* @Description: Data-access contract for crawled WeiboDomain records.
* @author xuyimeng
* @date 2018年2月23日 下午2:34:36
* @version V1.0
*/ /**
*
*/
package com.zhiweidata.weiboDomain.dao;
import java.util.List;
import com.zhiweidata.weiboDomain.entity.WeiboDomain;
/**
 * Data-access contract for crawled {@link WeiboDomain} records.
 *
 * @author xuyimeng
 * @date 2018-02-23 14:34:36
 */
public interface DomainDao {

    /**
     * @param uid Weibo user id to match
     * @return the records whose {@code uid} equals the argument
     */
    List<WeiboDomain> findByUid(String uid);

    /**
     * @param domain domain label to match
     * @return the records whose {@code domain} equals the argument
     */
    List<WeiboDomain> findByDomain(String domain);

    /**
     * @return every stored record
     */
    List<WeiboDomain> findAll();

    /**
     * Stores the given records.
     *
     * @param list records to store
     */
    void insert(List<WeiboDomain> list);

    /** Creates the collection that subsequent crawl results are written to. */
    void createColl();

    /**
     * @return the name of the collection currently used for reads and writes
     */
    String bestNewCollName();
}
/**
* @Title: TagDao.java
* @Package com.zhiweidata.weiboDomain.dao
* @Description: Data-access contract for DomainTag crawl-state documents.
* @author xuyimeng
* @date 2018年2月23日 下午5:17:58
* @version V1.0
*/ /**
*
*/
package com.zhiweidata.weiboDomain.dao;
import java.util.List;
import com.zhiweidata.weiboDomain.entity.DomainTag;
/**
 * Data-access contract for {@link DomainTag} documents, which list the
 * domains to crawl together with their crawl state.
 *
 * @author xuyimeng
 * @date 2018-02-23 17:17:58
 */
public interface TagDao {

    /**
     * @return every stored tag document
     */
    List<DomainTag> findAll();

    /**
     * Selects tag documents by crawl state.
     *
     * @param state state value used as the selection criterion
     * @return the matching tag documents
     */
    List<DomainTag> findByState(Integer state);

    /**
     * Sets the crawl state of the tag documents of one domain.
     *
     * @param domain domain label identifying the documents to update
     * @param state  new state value
     */
    void updateByState(String domain, Integer state);
}
/**
* @Title: DomainDaoImpl.java
* @Package com.zhiweidata.weiboDomain.dao.impl
* @Description: MongoTemplate-backed implementation of DomainDao.
* @author xuyimeng
* @date 2018年2月23日 下午2:57:24
* @version V1.0
*/ /**
*
*/
package com.zhiweidata.weiboDomain.dao.impl;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.util.List;
import java.util.Set;
import javax.annotation.Resource;
import org.springframework.data.mongodb.core.MongoTemplate;
import org.springframework.data.mongodb.core.query.Criteria;
import org.springframework.data.mongodb.core.query.Query;
import org.springframework.stereotype.Repository;
import com.zhiweidata.weiboDomain.dao.DomainDao;
import com.zhiweidata.weiboDomain.entity.WeiboDomain;
/**
 * {@link DomainDao} implementation on top of Spring's {@link MongoTemplate}.
 *
 * <p>Records live in dated collections named {@code weiboDomain<yyyyMMdd>}.
 * All reads and writes go to the collection returned by
 * {@link #bestNewCollName()}.
 *
 * @author xuyimeng
 * @date 2018-02-23 14:57:24
 */
@Repository
public class DomainDaoImpl implements DomainDao {

    /** Name fragment shared by all result collections. */
    private static final String COLLECTION_PREFIX = "weiboDomain";

    @Resource
    MongoTemplate mongoTemplate;

    /**
     * @param uid Weibo user id to match
     * @return records of the newest collection whose {@code uid} equals the argument
     */
    @Override
    public List<WeiboDomain> findByUid(String uid) {
        String collName = bestNewCollName();
        Query query = new Query().addCriteria(Criteria.where("uid").is(uid));
        return mongoTemplate.find(query, WeiboDomain.class, collName);
    }

    /**
     * @param domain domain label to match
     * @return records of the newest collection whose {@code domain} equals the argument
     */
    @Override
    public List<WeiboDomain> findByDomain(String domain) {
        String collName = bestNewCollName();
        Query query = new Query().addCriteria(Criteria.where("domain").is(domain));
        return mongoTemplate.find(query, WeiboDomain.class, collName);
    }

    /**
     * @return every record of the newest collection
     */
    @Override
    public List<WeiboDomain> findAll() {
        String collName = bestNewCollName();
        return mongoTemplate.findAll(WeiboDomain.class, collName);
    }

    /**
     * Saves the records into the newest collection. {@code save} replaces a
     * stored document with the same id, so re-crawled users are overwritten.
     *
     * @param list records to store; {@code null} or empty is a no-op
     */
    @Override
    public void insert(List<WeiboDomain> list) {
        if (list == null || list.isEmpty()) {
            return;
        }
        String collName = bestNewCollName();
        for (WeiboDomain weiboDomain : list) {
            mongoTemplate.save(weiboDomain, collName);
        }
    }

    /**
     * Finds the newest result collection: among all collection names that
     * contain {@code weiboDomain}, the lexicographically greatest one.
     *
     * @return that name, or an empty string when no such collection exists
     */
    @Override
    public String bestNewCollName() {
        Set<String> names = mongoTemplate.getCollectionNames();
        String result = "";
        for (String name : names) {
            if (name.contains(COLLECTION_PREFIX) && name.compareTo(result) > 0) {
                result = name;
            }
        }
        return result;
    }

    /**
     * Creates today's collection ({@code weiboDomain<yyyyMMdd>}).
     *
     * <p>Does nothing when it already exists, so initialising twice on the
     * same day no longer fails with a "collection already exists" error.
     */
    @Override
    public void createColl() {
        String time = LocalDate.now().format(DateTimeFormatter.ofPattern("yyyyMMdd"));
        String collName = COLLECTION_PREFIX + time;
        if (!mongoTemplate.collectionExists(collName)) {
            mongoTemplate.createCollection(collName);
        }
    }
}
/**
* @Title: TagDaoImpl.java
* @Package com.zhiweidata.weiboDomain.dao.impl
* @Description: MongoTemplate-backed implementation of TagDao.
* @author xuyimeng
* @date 2018年2月23日 下午5:21:35
* @version V1.0
*/ /**
*
*/
package com.zhiweidata.weiboDomain.dao.impl;
import java.util.List;
import javax.annotation.Resource;
import org.springframework.data.mongodb.core.MongoTemplate;
import org.springframework.data.mongodb.core.query.Criteria;
import org.springframework.data.mongodb.core.query.Query;
import org.springframework.data.mongodb.core.query.Update;
import org.springframework.stereotype.Repository;
import com.zhiweidata.weiboDomain.dao.TagDao;
import com.zhiweidata.weiboDomain.entity.DomainTag;
/**
 * {@link TagDao} implementation on top of Spring's {@link MongoTemplate}.
 *
 * @author xuyimeng
 * @date 2018-02-23 17:21:35
 */
@Repository
public class TagDaoImpl implements TagDao {

    @Resource
    private MongoTemplate mongoTemplate;

    /**
     * @return every {@link DomainTag} document
     */
    @Override
    public List<DomainTag> findAll() {
        List<DomainTag> allTags = mongoTemplate.findAll(DomainTag.class);
        return allTags;
    }

    /**
     * Sets {@code state} on all tag documents whose {@code domain} matches.
     *
     * @param domain domain label to match
     * @param state  new state value
     */
    @Override
    public void updateByState(String domain, Integer state) {
        Criteria sameDomain = Criteria.where("domain").is(domain);
        Update newState = Update.update("state", state);
        mongoTemplate.updateMulti(Query.query(sameDomain), newState, DomainTag.class);
    }

    /**
     * Selects the tag documents whose {@code state} is less than or equal to
     * the given value (not only an exact match).
     *
     * @param state upper bound, inclusive
     * @return the matching tag documents
     */
    @Override
    public List<DomainTag> findByState(Integer state) {
        Criteria upToState = Criteria.where("state").lte(state);
        return mongoTemplate.find(Query.query(upToState), DomainTag.class);
    }
}
/**
* @Title: DomainTag.java
* @Package com.zhiweidata.weiboDomain.entity
* @Description: MongoDB document describing one Weibo domain to crawl and its crawl state.
* @author xuyimeng
* @date 2018年2月23日 下午3:23:31
* @version V1.0
*/ /**
*
*/
package com.zhiweidata.weiboDomain.entity;
import java.util.List;
import org.springframework.data.mongodb.core.mapping.Document;
import lombok.Data;
/**
 * One entry of the {@code domainTag} collection: a Weibo ranking domain to
 * crawl, together with its crawl state. Accessors are generated by Lombok.
 *
 * @author xuyimeng
 * @date 2018-02-23 15:23:31
 */
@Data
@Document(collection = "domainTag")
public class DomainTag {
// Document identifier. NOTE(review): presumably mapped to MongoDB's _id; confirm.
private String _id;
// Tag strings attached to the domain; not read by the crawler code in this project view.
private List<String> tags;
// Domain label; used as the lookup key when the state is updated and copied onto crawled records.
private String domain;
// Path segment of the ranking page, appended to https://d.weibo.com/ by the crawler.
private String domainId;
// Crawl state: the service resets it to 0 before crawling and writes 2 after a domain
// was stored; domains with state <= 1 are selected for crawling.
private Integer state;
}
/**
* @Title: WeiboDomain.java
* @Package com.zhiweidata.weiboDomain.entity
* @Description: MongoDB document holding one Weibo user crawled from a domain ranking page.
* @author xuyimeng
* @date 2018年2月23日 下午2:37:27
* @version V1.0
*/ /**
*
*/
package com.zhiweidata.weiboDomain.entity;
import org.springframework.data.annotation.Id;
import org.springframework.data.mongodb.core.index.Indexed;
import lombok.Data;
/**
 * One Weibo user as listed on a domain ranking page. Accessors are generated
 * by Lombok; instances are filled by the page parser and stored via the DAO.
 *
 * @author xuyimeng
 * @date 2018-02-23 14:37:27
 */
@Data
public class WeiboDomain {
// Document id; the parser sets it to "<domain>_<uid>".
@Id
private String id;
// Weibo user id, taken from the "usercard" attribute of the name link.
@Indexed
private String uid;
// Display name ("title" attribute of the name link).
private String name;
// Profile link ("href" attribute of the name link).
private String url;
// "m" or "f", derived from the gender icon.
private String gender;
// Text of the location block.
private String location;
// Text of the introduction block.
private String description;
// Label of the ranking domain the user was found under.
@Indexed
private String domain;
// Text after the ':' in the relation block.
private String tag;
// Follower count kept as the raw page text (not parsed to a number).
private String followers_count;
// Number of accounts the user follows.
private Integer friends_count;
// Number of posts.
private Integer statuses_count;
// True when the member icon is present. Lombok generates isVip()/setVip(boolean) for this field.
private boolean isVip;
// Time the record was parsed, formatted as yyyy-MM-dd HH:mm:ss.
private String updateTime;
}
\ No newline at end of file
package com.zhiweidata.weiboDomain.excel;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.mongodb.DBObject;
/**
 * Exports a list of MongoDB {@link DBObject}s to an Excel file through
 * {@code SimpeExcelReport}.
 */
public class DBOExp
{
    /**
     * Writes the objects as one sheet of an Excel file.
     *
     * <p>A new workbook is created when the file does not exist yet;
     * otherwise the sheet is added to the existing file. The column names are
     * the keys of the first object.
     *
     * @author 陈炜涛
     * @param listChai  objects to export; {@code null} or empty is a no-op
     * @param fliename  path of the Excel file
     * @param sheetName name of the sheet to write
     */
    public void putRun(List<DBObject> listChai, String fliename,
            String sheetName)
    {
        // Nothing to export: return before the file is created or touched.
        if (listChai == null || listChai.isEmpty())
        {
            return;
        }
        SimpeExcelReport simpe = SimpeExcelReport.getInstance();
        File excelFile = new File(fliename);
        // Must be checked before the stream is opened, because opening creates the file.
        boolean existedBefore = excelFile.exists();

        List<String> heads = headList(listChai.get(0));
        List<Map<String, Object>> dataList = new ArrayList<Map<String, Object>>(bodyList(listChai));

        // try-with-resources closes the stream even when the exporter throws,
        // and avoids the null dereference that followed a failed open.
        try (OutputStream osOutputStream = new FileOutputStream(excelFile, true))
        {
            if (!existedBefore)
            {
                simpe.createExcelWithStream(heads, dataList,
                        osOutputStream, sheetName, excelFile);
            }
            else
            {
                simpe.addSheetInExcelWithFile(heads, dataList,
                        new File(fliename), sheetName);
            }
        }
        catch (IOException e)
        {
            // Covers both a failed open and a failed close.
            e.printStackTrace();
        }
    }

    /**
     * Builds the column names of the sheet.
     *
     * @param dbo object whose keys become the column names
     * @return the keys of {@code dbo} as a list
     */
    public static List<String> headList(DBObject dbo)
    {
        return new ArrayList<String>(dbo.keySet());
    }

    /**
     * Converts the objects into row maps (column name to value).
     *
     * <p>Only the keys of the first object are exported; an object lacking
     * one of them contributes a {@code null} value for that column.
     * NOTE(review): the original comment said the exporter cannot handle
     * null values, yet no null check is done here; confirm whether nulls
     * need replacing.
     *
     * @param lists objects to convert
     * @return one map per object; empty when {@code lists} is {@code null} or empty
     */
    public List<Map<String, Object>> bodyList(List<DBObject> lists)
    {
        List<Map<String, Object>> dataList = new ArrayList<Map<String, Object>>();
        if (lists == null || lists.isEmpty())
        {
            return dataList;
        }
        List<String> keys = new ArrayList<String>(lists.get(0).keySet());
        for (DBObject dbo : lists)
        {
            Map<String, Object> beanMap = new HashMap<String, Object>();
            for (String key : keys)
            {
                beanMap.put(key, dbo.get(key));
            }
            dataList.add(beanMap);
        }
        return dataList;
    }
}
/**
* @Title: MongoSerivce.java
* @Package com.zhiweidata.weiboDomain.service
* @Description: Service that crawls every pending domain and stores the parsed users in MongoDB.
* @author xuyimeng
* @date 2018年2月23日 下午4:55:00
* @version V1.0
*/ /**
*
*/
package com.zhiweidata.weiboDomain.service;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.annotation.Resource;
import org.springframework.stereotype.Service;
import com.zhiweidata.weiboDomain.crawler.JsoupHtml;
import com.zhiweidata.weiboDomain.crawler.WeiboDomainCrawler;
import com.zhiweidata.weiboDomain.dao.DomainDao;
import com.zhiweidata.weiboDomain.dao.TagDao;
import com.zhiweidata.weiboDomain.entity.DomainTag;
import com.zhiweidata.weiboDomain.entity.WeiboDomain;
import lombok.extern.slf4j.Slf4j;
/**
 * Orchestrates the crawl: selects the pending domains, downloads and parses
 * their user-list pages, stores the result and records the progress on the
 * domain tags.
 *
 * @author xuyimeng
 * @date 2018-02-23 16:55:00
 */
@Slf4j
@Service
public class MongoSerivce {

    /** Pause between two requests to the site, in milliseconds. */
    private static final long REQUEST_INTERVAL_MILLIS = 3000L;

    /** Exclusive upper bound for the page index requested per domain. */
    private static final int PAGE_LIMIT = 300;

    @Resource
    TagDao tagDao;
    @Resource
    DomainDao domainDao;
    WeiboDomainCrawler crawler = new WeiboDomainCrawler();
    JsoupHtml jsoupHtml = JsoupHtml.getInstance();

    /**
     * Crawls every pending domain and stores the parsed users.
     *
     * <p>After a domain is stored its tag state is set to 2, so an
     * interrupted run can be resumed without repeating finished domains.
     *
     * @param cookie Weibo session cookie sent with every request
     */
    public void crawlerData(String cookie) {
        for (Map.Entry<String, String> entry : groupSet().entrySet())
        {
            String domain = entry.getKey();
            String domainId = entry.getValue();
            log.info("【{}】页开始爬取...............",domain);
            List<WeiboDomain> list = parse(domain, domainId, cookie);
            log.info("【{}】页所有数据爬取结束...............",domain);
            domainDao.insert(list);
            tagDao.updateByState(domain, 2);
            log.info("【{}】所有页数据存储成功,共计【{}】条数据",domain,list.size());
        }
        log.info("所有页面爬取结束,程序结束");
    }

    /**
     * Reads the page count of a domain, retrying every few seconds until the
     * landing page yields a non-zero value.
     */
    private int getPageNum(String domainId, String cookie) {
        while (true)
        {
            String page = crawler.getPage(domainId, cookie);
            crawler.sleep(REQUEST_INTERVAL_MILLIS);
            int num = jsoupHtml.parsePage(page);
            if (num != 0)
            {
                return num;
            }
        }
    }

    /**
     * Downloads and parses the user-list pages of one domain.
     *
     * <p>A page that comes back empty is retried while fewer users were
     * collected than the page count suggests (about ten per page, with a
     * margin of two pages); otherwise the domain is considered finished.
     */
    private List<WeiboDomain> parse(String domain, String domainId, String cookie) {
        List<WeiboDomain> result = new ArrayList<>();
        int num = getPageNum(domainId, cookie);
        int i = 1;
        while (i < PAGE_LIMIT)
        {
            String url = "https://d.weibo.com/"+domainId+"?pids=Pl_Core_F4RightUserList__4"
                    + "&page="+i+"&ajaxpagelet=1&__ref=/"+domainId;
            String html = crawler.getHtml(url, cookie);
            if (html == null)
            {
                if ((result.size()/10)+2 < num)
                {
                    // Wait before asking for the same page again; retrying
                    // immediately hammered the server in a tight loop.
                    crawler.sleep(REQUEST_INTERVAL_MILLIS);
                    continue;
                }
                break;
            }
            List<WeiboDomain> list = jsoupHtml.parseData(html,domain);
            result.addAll(list);
            log.info("【{}】:第【{}】页爬取成功",domain,i);
            i++;
            crawler.sleep(REQUEST_INTERVAL_MILLIS);
        }
        return result;
    }

    /**
     * Selects the domains still to crawl (tag state up to 1) and resets
     * their state to 0.
     *
     * @return domain label mapped to domain id
     */
    private Map<String,String> groupSet(){
        Map<String, String> result = toDomainMap(tagDao.findByState(1));
        resetStates(result);
        return result;
    }

    /**
     * Resets the state of every tag to 0 and creates a new result collection.
     * Call this before starting a fresh crawl, not when resuming one.
     */
    public void initTag() {
        resetStates(toDomainMap(tagDao.findAll()));
        domainDao.createColl();
    }

    /** Maps domain label to domain id; a later tag overrides an earlier one with the same label. */
    private Map<String, String> toDomainMap(List<DomainTag> tags) {
        Map<String, String> result = new HashMap<>();
        for (DomainTag domainTag : tags)
        {
            result.put(domainTag.getDomain(), domainTag.getDomainId());
        }
        return result;
    }

    /** Sets the tag state of every domain in the map back to 0. */
    private void resetStates(Map<String, String> domains) {
        for (String domain : domains.keySet())
        {
            tagDao.updateByState(domain, 0);
        }
    }
}
/**
* @Title: Start.java
* @Package com.zhiweidata.weiboDomain.start
* @Description: Command-line entry point that boots the Spring context and runs the crawl.
* @author xuyimeng
* @date 2018年2月23日 下午3:09:33
* @version V1.0
*/ /**
*
*/
package com.zhiweidata.weiboDomain.start;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;
import com.zhiweidata.weiboDomain.service.MongoSerivce;
/**
 * Command-line entry point: boots the Spring context from
 * {@code spring-context.xml} and runs the crawl.
 *
 * <p>The Weibo session cookie is taken from the first program argument, else
 * from the {@code WEIBO_COOKIE} environment variable, else from the built-in
 * default.
 *
 * @author xuyimeng
 * @date 2018-02-23 15:09:33
 */
public class Start {

    /** Environment variable consulted when no cookie is passed as an argument. */
    private static final String COOKIE_ENV = "WEIBO_COOKIE";

    /**
     * Fallback cookie, kept so existing invocations without arguments behave as before.
     * NOTE(review): this is a session credential committed to source control;
     * it should be rotated and removed once callers supply the cookie externally.
     */
    private static final String DEFAULT_COOKIE = "login_sid_t=2da8770fb84cdb5be026bbfcd76ef1e6; cross_origin_proto=SSL; _s_tentry=passport.weibo.com; Apache=873655794108.0503.1519525903336; SINAGLOBAL=873655794108.0503.1519525903336; ULV=1519525903344:1:1:1:873655794108.0503.1519525903336:; SSOLoginState=1519525975; SCF=AqU8lfV6ROhTkYEEmVi2ROhtdMxlB0mT3EF2ABKenC3OfC3SeK3YfvZYWFJY8ytsaFhYcc1vO5hvhLwolzBW5ps.; SUB=_2A253llAIDeRhGeNH6VoY9C7Mzz-IHXVU4sbArDV8PUNbmtBeLUnSkW9NStghaGFgK4WPoq15L2ikM_srwT7hNvkI; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W5eochNrdf3XKPD1VaPcG3T5JpX5K2hUgL.Fo-4eon4Sh57She2dJLoIEQLxK-LBKBLBo2LxKBLBo.L12zLxK-L1-BLBKqLxKML1hBLBoqEeh2ceh-t; SUHB=0mxUFkR8aaPo5m; ALF=1551061975; un=18395807152; wvr=6; YF-Page-G0=416186e6974c7d5349e42861f3303251";

    /**
     * Runs one crawl.
     *
     * @param args optional; {@code args[0]} is the Weibo session cookie
     */
    public static void main(String[] args) {
        String cookie = resolveCookie(args);
        // Closing the context releases the MongoDB connections when the crawl ends.
        try (ClassPathXmlApplicationContext ctx = new ClassPathXmlApplicationContext("spring-context.xml")) {
            MongoSerivce serice = ctx.getBean(MongoSerivce.class);
            // Resets all tag states and creates a new collection. Enable for a
            // fresh crawl; keep it commented out when resuming an interrupted one.
            // serice.initTag();
            serice.crawlerData(cookie);
        }
    }

    /** Picks the cookie: first argument, then environment variable, then built-in default. */
    private static String resolveCookie(String[] args) {
        if (args != null && args.length > 0 && args[0] != null && !args[0].trim().isEmpty()) {
            return args[0];
        }
        String fromEnv = System.getenv(COOKIE_ENV);
        if (fromEnv != null && !fromEnv.trim().isEmpty()) {
            return fromEnv;
        }
        return DEFAULT_COOKIE;
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment