Commit 6632f6e7 by zhiwei

搜报网数据采集程序

parents
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>soubao-crawlerNew</artifactId>
<version>0.0.1-SNAPSHOT</version>
<name>搜报网采集</name>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
</properties>
<developers>
<developer>
<id>Bewilder</id>
<name>zhiwei zhang</name>
<email>zhangzhiwei@zhiweidata.com</email>
</developer>
</developers>
<dependencies>
<!-- https://mvnrepository.com/artifact/com.squareup.okhttp3/okhttp -->
<dependency>
<groupId>com.squareup.okhttp3</groupId>
<artifactId>okhttp</artifactId>
<version>3.10.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.logging.log4j/log4j-core -->
<!-- 2.11.0 is affected by CVE-2021-44228 (Log4Shell); this crawler logs text scraped from
the web, so the dependency is pinned to 2.12.4, the patched release line that still runs on Java 7. -->
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-core</artifactId>
<version>2.12.4</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.logging.log4j/log4j-api -->
<!-- Keep log4j-api on the same version as log4j-core. -->
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-api</artifactId>
<version>2.12.4</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
<dependency>
<groupId>org.mongodb</groupId>
<artifactId>mongo-java-driver</artifactId>
<version>3.6.3</version>
</dependency>
<dependency>
<groupId>com.zhiwei</groupId>
<artifactId>mongosave-service-client</artifactId>
<version>1.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.zhiwei.middleware</groupId>
<artifactId>rsid-client</artifactId>
<version>0.0.2-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.zhiwei</groupId>
<artifactId>sendmail</artifactId>
<version>0.0.1-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.zhiwei.middleware</groupId>
<artifactId>proxy-client</artifactId>
<version>0.0.1-RELEASE</version>
</dependency>
<dependency>
<groupId>com.zhiwei</groupId>
<artifactId>zhiweiTools</artifactId>
<version>0.0.6-SNAPSHOT</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>2.4.2</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
<transformers>
<transformer
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>com.zhiwei.crawler.run.MainRun</mainClass>
</transformer>
</transformers>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<artifactId>maven-source-plugin</artifactId>
<version>2.4</version>
<configuration>
<attach>true</attach>
</configuration>
<executions>
<execution>
<phase>compile</phase>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
<!-- Fixes garbled Chinese characters in the console output of "mvn test" -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.19.1</version>
<configuration>
<forkMode>once</forkMode>
<argLine>-Dfile.encoding=UTF-8</argLine>
<skipTests>true</skipTests>
</configuration>
</plugin>
</plugins>
</build>
</project>
\ No newline at end of file
package com.zhiwei.crawler.config;
import java.io.InputStream;
import java.util.Properties;
/**
 * Process-wide settings, loaded once at class initialization from {@code db.properties}
 * found through the thread context class loader.
 *
 * <p>Loading is best-effort: if the file is missing or unreadable the problem is reported on
 * standard error and the fields keep their default values ({@code null} / {@code 0}).
 */
public class Config {
    public static String mongoIp;
    public static int mongoPort;
    public static String userName;
    public static String userPwd;
    public static String authDB;
    public static String savedbName;
    public static String saveCollName;
    public static String crawlerdbName;
    public static String crawlerCollName;
    public static String redisKey;
    public static String rsidUrl;
    public static String rsidGroup;
    // The three fields below are never assigned by this class.
    public static String filePath;
    public static int saveCount;
    public static int queueCount;

    static {
        Properties conf = new Properties();
        // try-with-resources closes the stream even when load() fails.
        try (InputStream is = Thread.currentThread().getContextClassLoader()
                .getResourceAsStream("db.properties")) {
            if (is == null) {
                System.err.println("Config: db.properties was not found on the classpath; "
                        + "all settings keep their default values");
            } else {
                conf.load(is);
                mongoIp = conf.getProperty("mongoIp");
                // A missing or malformed port must not prevent the remaining settings from loading.
                mongoPort = parsePort(conf.getProperty("mongoPort"));
                userName = conf.getProperty("db.username");
                // NOTE(review): the key is spelled "db.paasword"; it is kept as-is because it must
                // match the existing properties file.
                userPwd = conf.getProperty("db.paasword");
                authDB = conf.getProperty("db.certifiedDB");
                savedbName = conf.getProperty("savedbName");
                saveCollName = conf.getProperty("saveCollName");
                crawlerdbName = conf.getProperty("crawlerdbName");
                crawlerCollName = conf.getProperty("crawlerCollName");
                redisKey = conf.getProperty("redisKey");
                rsidUrl = conf.getProperty("rsid.zookeeper.url");
                rsidGroup = conf.getProperty("rsid.zookeeper.group");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Parses the configured MongoDB port.
     *
     * @param raw the raw property value, possibly {@code null}
     * @return the parsed port, or {@code 0} when the value is absent or not a number
     */
    private static int parsePort(String raw) {
        if (raw == null) {
            System.err.println("Config: property mongoPort is missing");
            return 0;
        }
        try {
            return Integer.parseInt(raw.trim());
        } catch (NumberFormatException e) {
            System.err.println("Config: property mongoPort is not a valid integer: " + raw);
            return 0;
        }
    }
}
package com.zhiwei.crawler.dao;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import com.mongodb.DBObject;
import com.zhiwei.core.MongoSaveService;
import com.zhiwei.crawler.config.Config;
import com.zhiwei.crawler.util.ReadFileTools;
/**
 * Stores crawled media documents through a shared {@link MongoSaveService} instance.
 *
 * @author hero
 */
public class MediaDataDAO {

    /**
     * Settings handed to the save service, parsed from {@code config.json} (production).
     * The original source notes {@code configTest.json} as the file to use for testing.
     */
    private static JSONObject json = JSON.parseObject(ReadFileTools.getFileContent("config.json", "utf-8"));

    /** Save service built from the JSON settings and the target database/collection in {@link Config}. */
    private static final MongoSaveService service = MongoSaveService.getMongoSaveService(
            json,
            Config.mongoIp,
            Config.savedbName,
            Config.saveCollName,
            Config.redisKey);

    /**
     * Stores a single document.
     *
     * <p>Failures are not propagated; the stack trace is printed and the method returns normally.
     *
     * @param doc the document to insert
     */
    public static void addMediaData(DBObject doc) {
        service.setRsidName("rsid");
        try {
            service.insert(doc);
        } catch (Exception insertFailure) {
            insertFailure.printStackTrace();
        }
    }
}
package com.zhiwei.crawler.dao;
import java.util.List;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.mongodb.BasicDBObject;
import com.mongodb.DBCursor;
import com.mongodb.DBObject;
import com.zhiwei.crawler.config.Config;
import com.zhiwei.crawler.dbtemplate.MongoDBTemplate;
/**
 * Reads crawl keywords from the collection named by {@link Config#crawlerdbName} /
 * {@link Config#crawlerCollName}.
 *
 * <p>Every query method returns {@code null} when the query fails, so callers must null-check.
 */
public class WordsDao extends MongoDBTemplate {
    private static Logger logger = LoggerFactory.getLogger(WordsDao.class);

    public WordsDao() {
        super.setDbName(Config.crawlerdbName);
        super.setCollName(Config.crawlerCollName);
    }

    /**
     * Returns the keywords of the company "美赞臣".
     *
     * <p>NOTE(review): despite the name, this method has always filtered on that single company.
     *
     * @return a queue of keywords, or {@code null} if the query failed
     * @author hero
     */
    public BlockingQueue<String> getAllWordList() {
        return getWordList("美赞臣");
    }

    /**
     * Returns the keywords of one company.
     *
     * @param company value matched against the {@code company} field
     * @return a queue of keywords, or {@code null} if the query failed
     * @author hero
     */
    public BlockingQueue<String> getWordList(String company) {
        return queryWords(new BasicDBObject("company", company));
    }

    /**
     * Returns the keywords of several companies.
     *
     * @param companyList values matched against the {@code company} field with {@code $in}
     * @return a queue of keywords, or {@code null} if the query failed
     * @author hero
     */
    public BlockingQueue<String> getWordList(List<String> companyList) {
        return queryWords(new BasicDBObject("company", new BasicDBObject("$in", companyList)));
    }

    /**
     * Runs {@code query} on the read collection and collects the {@code word} field of each hit.
     *
     * @param query the filter to apply
     * @return a queue of keywords, or {@code null} if the query failed
     */
    private BlockingQueue<String> queryWords(DBObject query) {
        // The cursor is closed when the block exits, on success and on failure.
        try (DBCursor cursor = this.getReadColl().find(query)) {
            BlockingQueue<String> words = new LinkedBlockingQueue<String>();
            while (cursor.hasNext()) {
                DBObject doc = cursor.next();
                Object word = doc.get("word");
                if (word == null) {
                    // One malformed document must not discard every other keyword.
                    logger.warn("关键词文档缺少 word 字段,已跳过:{}", doc);
                    continue;
                }
                words.add(word.toString());
            }
            return words;
        } catch (Exception e) {
            // Logged at error level with the throwable so the stack trace is not lost.
            logger.error("查询关键词出现问题:{}", e.getMessage(), e);
            return null;
        }
    }
}
package com.zhiwei.crawler.dbtemplate;
import java.util.Arrays;
import com.mongodb.DB;
import com.mongodb.DBCollection;
import com.mongodb.Mongo;
import com.mongodb.MongoClient;
import com.mongodb.MongoCredential;
import com.mongodb.MongoException;
import com.mongodb.ServerAddress;
import com.zhiwei.crawler.config.Config;
/**
 * Base class for DAOs: holds two process-wide Mongo clients (one used for reads, one for
 * writes) plus the database and collection name of the concrete DAO.
 *
 * @author LihuaTang
 */
public class MongoDBTemplate
{
    /** Client shared by all instances for reads; created by the first constructor call. */
    protected static Mongo reader;
    /** Client shared by all instances for writes; created by the first constructor call. */
    protected static Mongo writer;
    protected String collName;
    protected String dbName;

    /**
     * Creates the shared clients if they do not exist yet, using the address and credentials
     * from {@link Config}. A {@link MongoException} during creation is printed, not rethrown.
     */
    public MongoDBTemplate() {
        final char[] password = Config.userPwd.toCharArray();
        final MongoCredential credential =
                MongoCredential.createCredential(Config.userName, Config.authDB, password);
        final ServerAddress address = new ServerAddress(Config.mongoIp, Config.mongoPort);
        try {
            if (reader == null) {
                reader = newClient(address, credential);
            }
            if (writer == null) {
                writer = newClient(address, credential);
            }
        } catch (MongoException e) {
            e.printStackTrace();
        }
    }

    /** Builds an authenticated client for the given server. */
    private static Mongo newClient(ServerAddress address, MongoCredential credential) {
        return new MongoClient(address, Arrays.asList(credential));
    }

    /** @return the configured collection, obtained through the read client */
    @SuppressWarnings("deprecation")
    public DBCollection getReadColl() {
        final DB database = getReader().getDB(dbName);
        return database.getCollection(collName);
    }

    protected Mongo getReader() {
        return reader;
    }

    /** @return the configured collection, obtained through the write client */
    @SuppressWarnings("deprecation")
    public DBCollection getWriteColl() {
        final DB database = getWriter().getDB(dbName);
        return database.getCollection(collName);
    }

    protected Mongo getWriter() {
        return writer;
    }

    protected void setCollName(final String collName) {
        this.collName = collName;
    }

    protected void setDbName(final String dbName) {
        this.dbName = dbName;
    }

    /** Replaces the read client shared by all instances. */
    protected void setReader(final Mongo reader) {
        MongoDBTemplate.reader = reader;
    }

    /** Replaces the write client shared by all instances. */
    protected void setWriter(final Mongo writer) {
        MongoDBTemplate.writer = writer;
    }

    /** Intentionally empty entry point, kept from the original source. */
    public static void main(String[] args) {
    }
}
package com.zhiwei.crawler.dbtemplate;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.zhiwei.crawler.config.Config;
import com.zhiwei.rsid.core.RsidClient;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
/**
 * Thin wrapper around the shared {@link RsidClient}, described in the original source as
 * providing de-duplication and rsid lookup.
 *
 * @author Bewilder Z
 */
public class RsidClientTemplate {
    private static Logger logger = LoggerFactory.getLogger(RsidClientTemplate.class);
    private static RsidClient client = RsidClient.build(Config.rsidUrl, Config.rsidGroup);

    /** Total number of attempts made before giving up. */
    private static final int MAX_ATTEMPTS = 3;
    /** Pause between two attempts, in milliseconds. */
    private static final int RETRY_DELAY_MILLIS = 500;

    /**
     * Calls {@code RsidClient.addFilterUrl(id, false, Config.redisKey)}, retrying on failure.
     *
     * @param id the identifier to register with the filter
     * @return the client's answer from the first attempt that does not throw, or {@code false}
     *         when all attempts throw
     * @author hero
     */
    public static boolean addFilterUrl(String id) {
        for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
            try {
                return client.addFilterUrl(id, false, Config.redisKey);
            } catch (Exception e) {
                // Pass the exception itself: fillInStackTrace() would overwrite the real
                // failure location with this catch block.
                logger.error("判断此条网媒消息是否存在出现问题", e);
                if (attempt < MAX_ATTEMPTS) {
                    // No point in pausing after the final attempt.
                    ZhiWeiTools.sleep(RETRY_DELAY_MILLIS);
                }
            }
        }
        return false;
    }
}
/**
* @Title: HttpClientBuilder.java
* @Package com.zhiwei.crawler.download
* @author 0xff
* @date 2018年6月15日 下午12:00:37
*/
package com.zhiwei.crawler.download;
import java.io.IOException;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import okhttp3.Cookie;
import okhttp3.CookieJar;
import okhttp3.HttpUrl;
import okhttp3.Interceptor;
import okhttp3.OkHttpClient;
import okhttp3.OkHttpClient.Builder;
import okhttp3.Request;
import okhttp3.Response;
/**
 * Factory for the OkHttp clients used by the crawler.
 *
 * <p>Every client is derived from one shared base client configured with a 5 second read
 * timeout, retry on connection failure, the trust-all TLS settings of {@link TrustAllSSL} and
 * an interceptor that re-issues requests whose response is not successful.
 *
 * @author 0xff
 */
public class HttpClientBuilder {

    /** Maximum number of extra attempts made for a single unsuccessful request. */
    private static final int MAX_RETRY = 3;

    private static final OkHttpClient BASE_HTTP_CLIENT = new OkHttpClient.Builder()
            .readTimeout(5, TimeUnit.SECONDS)
            .retryOnConnectionFailure(true)
            .sslSocketFactory(TrustAllSSL.SSLSocketFactory(), TrustAllSSL.trustManager())
            .hostnameVerifier(TrustAllSSL.hostnameVerifier())
            .addInterceptor(new RetryInterceptor())
            .build();

    /**
     * Creates a plain client derived from the shared base client.
     *
     * @return OkHttpClient
     */
    public static OkHttpClient newInstance() {
        return BASE_HTTP_CLIENT.newBuilder().build();
    }

    /**
     * Creates a client with an in-memory cookie jar and, optionally, a proxy.
     *
     * @param proxy proxy to route requests through; {@code null} leaves the proxy unset
     * @return OkHttpClient
     */
    public static OkHttpClient newInstanceWithCookieJar(Proxy proxy) {
        Builder builder = BASE_HTTP_CLIENT.newBuilder();
        if (proxy != null) {
            builder.proxy(proxy);
        }
        return builder.cookieJar(new CookieJar() {
            // NOTE(review): cookies are stored and looked up by the complete HttpUrl, so they
            // are only replayed on requests to the identical URL; confirm whether host-level
            // matching was intended before changing it.
            private Map<HttpUrl, List<Cookie>> store = new ConcurrentHashMap<HttpUrl, List<Cookie>>();

            @Override
            public void saveFromResponse(HttpUrl url, List<Cookie> cookies) {
                store.put(url, cookies);
            }

            @Override
            public List<Cookie> loadForRequest(HttpUrl url) {
                List<Cookie> cookies = store.get(url);
                return cookies == null ? new ArrayList<Cookie>() : cookies;
            }
        }).build();
    }

    /**
     * Re-issues a request up to {@link #MAX_RETRY} times while the response is unsuccessful.
     *
     * <p>The attempt counter is local to each call. The interceptor instance is shared by every
     * client derived from the base client, so keeping the counter in a field would exhaust the
     * retry budget after the first few failures of the whole process and would race between
     * threads.
     */
    private static final class RetryInterceptor implements Interceptor {
        @Override
        public Response intercept(Chain chain) throws IOException {
            Request request = chain.request();
            Response response = chain.proceed(request);
            for (int retry = 0; !response.isSuccessful() && retry < MAX_RETRY; retry++) {
                // The previous response must be closed before proceeding again.
                response.close();
                response = chain.proceed(request);
            }
            return response;
        }
    }
}
/**
* @Title: HttpRequestBuilder.java
* @Package com.zhiwei.crawler.download
* @author 0xff
* @date 2018年6月19日 下午4:55:20
*/
package com.zhiwei.crawler.download;
import java.util.Random;
import okhttp3.Headers;
import okhttp3.Request;
import okhttp3.Request.Builder;
import okhttp3.RequestBody;
/**
 * Factory for OkHttp {@link Request} objects that carry a set of browser-like default headers.
 *
 * @author 0xff
 */
public class HttpRequestBuilder {

    /** Shared generator used to pick a User-Agent; {@link Random} is safe for concurrent use. */
    private static final Random RANDOM = new Random();

    /**
     * Returns a request builder pre-populated with the default headers: a randomly chosen
     * User-Agent, Accept-Language, Cache-Control, Connection and Pragma.
     *
     * @return Builder
     */
    private static Builder builder() {
        Builder builder = new Request.Builder();
        // nextInt(bound) is always within [0, bound). Negating a raw nextInt() result stays
        // negative for Integer.MIN_VALUE and would yield a negative array index.
        int index = RANDOM.nextInt(USER_AGENTS.length);
        builder.addHeader("User-Agent", USER_AGENTS[index]);
        builder.addHeader("Accept-Language", "zh-CN,zh;q=0.9");
        builder.addHeader("Cache-Control", "no-cache");
        builder.addHeader("Connection", "keep-alive");
        builder.addHeader("Pragma", "no-cache");
        return builder;
    }

    /**
     * Copies caller headers onto the builder. A caller header replaces any default header of the
     * same name, while multiple caller values for one name are all kept.
     *
     * @param builder the builder to modify
     * @param headers caller headers, may be {@code null}
     */
    private static void applyHeaders(Builder builder, Headers headers) {
        if (headers == null) {
            return;
        }
        for (String name : headers.names()) {
            builder.removeHeader(name);
            for (String value : headers.values(name)) {
                builder.addHeader(name, value);
            }
        }
    }

    /**
     * Creates a GET request.
     *
     * @param url request URL
     * @param headers extra headers, overriding defaults of the same name; may be {@code null}
     * @return Request
     */
    public static Request newGetRequest(String url, Headers headers) {
        Builder builder = builder().url(url);
        applyHeaders(builder, headers);
        return builder.build();
    }

    /**
     * Creates a POST request. When {@code body} is {@code null} no method is set on the builder.
     *
     * <p>Headers are merged exactly as in {@link #newGetRequest(String, Headers)}, so the default
     * headers are kept unless the caller overrides them.
     *
     * @param url request URL
     * @param headers extra headers, overriding defaults of the same name; may be {@code null}
     * @param body POST body, may be {@code null}
     * @return Request
     */
    public static Request newPostRequest(String url, Headers headers, RequestBody body) {
        Builder builder = builder().url(url);
        applyHeaders(builder, headers);
        if (body != null) {
            builder.post(body);
        }
        return builder.build();
    }

    /**
     * Built-in User-Agent strings, one of which is chosen at random per request.
     */
    private static final String[] USER_AGENTS = new String[] {
        // Mac Chrome
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
        // Mac Safari
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.1 Safari/605.1.15",
        // Mac Firefox
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0",
        // Win10 Chrome
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36",
        // Win10 Edge
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134",
        // Win8.1 Chrome
        "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
        // Win8.1 Firefox
        "Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0",
        // Win7 Chrome
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
        // Win7 Firefox (the original comment said IE, but the string is a Firefox UA)
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0"
    };
}
/**
* @Title: TrustAllSSL.java
* @Package com.zhiwei.crawler.download
* @author 0xff
* @date 2018年6月19日 下午2:41:29
*/
package com.zhiwei.crawler.download;
import java.security.SecureRandom;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSession;
import javax.net.ssl.SSLSocketFactory;
import javax.net.ssl.X509TrustManager;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
/**
 * TLS helpers that accept every certificate and every host name.
 *
 * <p>NOTE(review): this disables certificate and host name validation for every client that
 * uses it; the original description states it trusts self-signed, expired and revoked
 * certificates alike.
 *
 * @author 0xff
 */
public class TrustAllSSL {

    private static final Logger logger = LogManager.getLogger(TrustAllSSL.class);

    /** Lazily created by {@link #SSLSocketFactory()}. */
    private static SSLSocketFactory factory;
    private static X509TrustManager manager = new TrustAllManager();
    private static HostnameVerifier verifier = new TrustAllHostnameVerifier();

    /**
     * Returns the shared socket factory, creating it on first use.
     *
     * <p>If the factory cannot be created the error is logged and the JVM exits with status -1.
     *
     * @return SSLSocketFactory
     */
    public static synchronized SSLSocketFactory SSLSocketFactory() {
        if (factory != null) {
            return factory;
        }
        try {
            SSLContext tlsContext = SSLContext.getInstance("TLS");
            X509TrustManager[] trustManagers = new X509TrustManager[] {manager};
            tlsContext.init(null, trustManagers, new SecureRandom());
            factory = tlsContext.getSocketFactory();
        } catch (Exception e) {
            logger.error("SSLSocketFactory 创建失败", e);
            System.exit(-1);
        }
        return factory;
    }

    /**
     * Returns the shared trust-all trust manager.
     *
     * @return X509TrustManager
     */
    public static X509TrustManager trustManager() {
        return manager;
    }

    /**
     * Returns the shared accept-all host name verifier.
     *
     * @return HostnameVerifier
     */
    public static HostnameVerifier hostnameVerifier() {
        return verifier;
    }

    /**
     * Trust manager whose checks never reject a certificate chain.
     *
     * @author 0xff
     */
    private static class TrustAllManager implements X509TrustManager {
        @Override
        public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {
            // accepts everything
        }

        @Override
        public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {
            // accepts everything
        }

        @Override
        public X509Certificate[] getAcceptedIssuers() {
            return new X509Certificate[0];
        }
    }

    /**
     * Host name verifier that approves every host.
     *
     * @author 0xff
     */
    private static class TrustAllHostnameVerifier implements HostnameVerifier {
        @Override
        public boolean verify(String hostname, SSLSession session) {
            return true;
        }
    }
}
package com.zhiwei.crawler.run;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
/**
 * Program entry point: schedules {@link SoubaoCrawlerRun} on a single-threaded scheduler.
 */
public class MainRun {

    private ScheduledExecutorService scheduExec;

    public MainRun() {
        scheduExec = Executors.newScheduledThreadPool(1);
    }

    /**
     * Schedules the crawl at a fixed rate: first run after one second, then once per hour.
     */
    public void showTimer() {
        final long initialDelayMillis = 1000;
        final long periodMillis = 60 * 60 * 1000;
        Runnable crawl = new SoubaoCrawlerRun();
        scheduExec.scheduleAtFixedRate(crawl, initialDelayMillis, periodMillis, TimeUnit.MILLISECONDS);
    }

    public static void main(String[] args) {
        MainRun launcher = new MainRun();
        launcher.showTimer();
    }
}
package com.zhiwei.crawler.run;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.zhiwei.crawler.dao.WordsDao;
import com.zhiwei.crawler.soubao.SouBaoCrawlerThread;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
/**
 * One crawl round: loads the keyword queue and lets worker tasks drain it.
 *
 * @author hero
 */
public class SoubaoCrawlerRun implements Runnable {
    private static final Logger logger = LogManager.getLogger(SoubaoCrawlerRun.class);
    private WordsDao wordsDao = new WordsDao();

    /**
     * Runs one crawl round and never lets an exception escape. A periodic task that throws is
     * silently dropped by the scheduled executor, which would stop all later rounds.
     */
    @Override
    public void run() {
        try {
            getNewsData();
        } catch (RuntimeException e) {
            logger.error("本轮采集异常终止", e);
        }
    }

    /**
     * Submits the worker tasks, waits until all of them have finished and logs the elapsed time.
     *
     * @author hero
     */
    private void getNewsData() {
        logger.info("采集开始.....");
        long start = System.currentTimeMillis();
        // NOTE(review): five tasks are submitted to a pool of two threads, so at most two run
        // concurrently; kept as in the original.
        int thread = 5;
        BlockingQueue<String> wordesQueue = wordsDao.getAllWordList();
        ExecutorService service = Executors.newFixedThreadPool(2);
        for (int i = 0; i < thread; i++) {
            service.execute(new SouBaoCrawlerThread(wordesQueue));
        }
        // No further tasks are accepted; the submitted ones keep running.
        service.shutdown();
        try {
            // Block until every worker is done instead of polling isTerminated().
            service.awaitTermination(Long.MAX_VALUE, java.util.concurrent.TimeUnit.NANOSECONDS);
            long end = System.currentTimeMillis();
            logger.info("一轮采集所用的时间为:{}", (end - start));
        } catch (InterruptedException e) {
            // Restore the interrupt flag and ask the workers to stop.
            Thread.currentThread().interrupt();
            service.shutdownNow();
            logger.warn("等待采集线程结束时被中断", e);
        }
    }
}
/**
* @Title: Crawler.java
* @Package com.zhiwei.crawler.soubao
* @author 0xff
* @date 2018年6月28日 上午9:49:32
*/
package com.zhiwei.crawler.soubao;
import java.io.IOException;
import java.net.Proxy;
import java.net.URLEncoder;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.zhiwei.crawler.download.HttpClientBuilder;
import com.zhiwei.crawler.download.HttpRequestBuilder;
import com.zhiwei.crawler.util.TreatData;
import okhttp3.FormBody;
import okhttp3.Headers;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;
/**
* @ClassName: Crawler
* @Description: 搜报网爬虫
* @author 0xff
* @date 2018年6月28日 上午9:49:32
*/
public class Crawler {
private static final Logger logger = LogManager.getLogger(Crawler.class);
/**
 * Crawls the search results for one keyword over the last {@code days} days.
 *
 * <p>Errors raised while searching are logged and not rethrown; an invalid {@code days} value
 * or a failure to encode the keyword is thrown to the caller.
 *
 * @param days number of days to look back from today; must not be negative
 * @param keyword the search keyword
 * @param proxy proxy handed to the HTTP client factory
 * @throws IllegalArgumentException if {@code days} is negative
 */
public static void start(int days, String keyword, Proxy proxy) throws Exception {
    if (days < 0) {
        throw new IllegalArgumentException("搜索天数不能小于 0");
    }
    SimpleDateFormat dayFormat = new SimpleDateFormat("yyyy-MM-dd");
    Calendar calendar = Calendar.getInstance();
    // The end date is today; the start date is obtained by stepping the calendar back.
    String endDate = dayFormat.format(calendar.getTime());
    calendar.add(Calendar.DAY_OF_YEAR, -days);
    String startDate = dayFormat.format(calendar.getTime());
    String url = "http://www.soubao.net/search/searchList.aspx?timesel=custom&checkNum="
            + "&startdate=" + startDate
            + "&enddate=" + endDate
            + "&keyword=" + URLEncoder.encode(keyword, "UTF-8");
    try {
        logger.info("关键词 {} 搜索链接 {}", keyword, url);
        search(url, keyword, startDate, endDate, proxy);
    } catch (Exception searchFailure) {
        logger.error("关键词 {} 采集出错", keyword, searchFailure);
    }
}
/**
 * Loads the search page with a GET request and then walks the result pages by posting the
 * ASP.NET pager form, handing every page to {@link #parse}.
 *
 * <p>The loop starts with a single page; the real page count is read from the first result
 * page and the loop bound is corrected accordingly.
 *
 * @param url search URL used for the initial GET
 * @param keyword the search keyword
 * @param startDate start date as formatted by the caller
 * @param endDate end date as formatted by the caller
 * @param proxy proxy handed to the HTTP client factory
 * @throws IllegalStateException if a required hidden form field is missing or the page count
 *         cannot be read
 */
private static void search(String url, String keyword, String startDate, String endDate, Proxy proxy) throws Exception {
    int count = 0;
    OkHttpClient client = HttpClientBuilder.newInstanceWithCookieJar(proxy);
    Map<String, String> map = new HashMap<String, String>();
    map.put("Referer", "http://www.soubao.net/search/searchList.aspx");
    map.put("Cookie", DevKit.buildSoubaoCookie());
    map.put("Host", "www.soubao.net");
    map.put("Origin", "http://www.soubao.net");
    map.put("Content-Type", "application/x-www-form-urlencoded");
    Request request = HttpRequestBuilder.newGetRequest(url, Headers.of(map));
    Response response = client.newCall(request).execute();
    String body = response.body().string();
    logger.info("关键词 {} 搜索成功", keyword);
    Document html = Jsoup.parse(body);
    boolean needRepair = true;
    int page = 1;
    // Paging is forced through the pager postback; "page" is corrected after the first page.
    for (int i = 1; i <= page; i++) {
        FormBody formBody = new FormBody.Builder()
                .add("__VIEWSTATE", hiddenValue(html, "__VIEWSTATE"))
                .add("__VIEWSTATEGENERATOR", hiddenValue(html, "__VIEWSTATEGENERATOR"))
                .add("__EVENTTARGET", "AspNetPager1").add("__EVENTARGUMENT", String.valueOf(i))
                .add("__EVENTVALIDATION", hiddenValue(html, "__EVENTVALIDATION"))
                .add("HidTimeSelect", hiddenValue(html, "HidTimeSelect"))
                .add("HiddenMsg", hiddenValue(html, "HiddenMsg")).add("txtKeyword", keyword)
                .add("checkNum", "").add("timesel", "on").add("txtStartDate", startDate).add("txtEndDate", endDate)
                .build();
        request = HttpRequestBuilder.newPostRequest("http://www.soubao.net/search/searchList.aspx",
                request.headers(), formBody);
        response = client.newCall(request).execute();
        body = response.body().string();
        if (!body.contains("rptRetList_ctl01_HLinkBT")) {
            page = 0;
            logger.info("关键词 {} 无数据,退出搜索", keyword);
            break;
        }
        html = Jsoup.parse(body);
        if (needRepair) {
            // Correct the loop bound with the page count shown on the first result page.
            try {
                page = Integer.parseInt(html.getElementById("LbKeyword").select("span").get(2).text().replaceAll(".*?/|页", ""));
                logger.info("关键词 {} 搜索结果页数: {}", keyword, page);
                needRepair = false;
            } catch (Exception e) {
                // Keep the cause so the log shows why the page count could not be read.
                throw new IllegalStateException("关键词 " + keyword + " 获取搜索结果页数失败", e);
            }
        }
        count++;
        logger.info("关键词 {} 翻页页数: {} 访问成功", keyword, i);
        parse(client, request.headers(), html);
        TimeUnit.SECONDS.sleep(2);
    }
    // NOTE(review): "count" is incremented once per visited page, although the message labels
    // it as a record count.
    logger.info("关键词 {} 爬取完毕,总页数: {},数据条数: {}", keyword, page, count);
}

/**
 * Reads the {@code value} attribute of the element with the given id.
 *
 * @param html the parsed page
 * @param id the element id of a hidden form field
 * @return the attribute value
 * @throws IllegalStateException if the page has no element with that id
 */
private static String hiddenValue(Document html, String id) {
    Element field = html.getElementById(id);
    if (field == null) {
        throw new IllegalStateException("搜索页面缺少表单字段 " + id);
    }
    return field.attr("value");
}
/**
 * Extracts every result entry ({@code ul.newList li}) from a result page, resolves its real
 * article URL and passes the collected fields to {@code TreatData.treatDataAccount}.
 *
 * <p>A failure on one entry is logged and does not stop the remaining entries.
 *
 * @param client HTTP client used to resolve the real article URL
 * @param headers request headers, forwarded to {@link #matchRealUrl}
 * @param html the parsed result page
 * @author hero
 */
private static void parse(OkHttpClient client, Headers headers, Document html) {
    try {
        Elements elements = html.select("ul.newList").select("li");
        for (Element element : elements) {
            try {
                Elements titleLink = element.select("h2").select("a");
                Elements newsInfo = element.select("p.newsInfo");
                String link = "http://www.soubao.net" + titleLink.attr("href");
                String realUrl = matchRealUrl(client, headers, link);
                if (realUrl != null) {
                    Map<String, Object> dataMap = new HashMap<String, Object>();
                    dataMap.put("title", titleLink.text());
                    dataMap.put("content", element.select("p.newCon").text());
                    dataMap.put("source", newsInfo.select("em.paperName").select("span").text());
                    dataMap.put("time", newsInfo.select("em.postDate").select("span").text());
                    dataMap.put("_id", realUrl);
                    TreatData.treatDataAccount(dataMap);
                } else {
                    logger.info("链接为:{},真实地址解析出现错误", link);
                }
            } catch (Exception e) {
                // Log the exception itself; fillInStackTrace() would replace the real failure
                // location with this line.
                logger.debug("解析数据结构出现问题::", e);
            }
        }
    } catch (Exception e) {
        logger.info("页面正文提取出错", e);
    }
}
/**
 * Fetches a soubao.net redirect page and extracts the target of its
 * {@code window.location='...'} script.
 *
 * <p>The method waits 500 ms before each request. Literal {@code /./} path segments in the
 * extracted URL are collapsed to {@code /}.
 *
 * <p>NOTE(review): the original code also ran the URL through a validation regex, but it
 * returned the URL whether or not the regex matched. That no-op check (whose nested
 * quantifiers risk catastrophic backtracking) has been removed; add a real validation here if
 * rejecting malformed URLs is desired.
 *
 * @param client HTTP client used for the request
 * @param headers unused; kept for interface compatibility
 * @param url the soubao.net link to resolve
 * @return the extracted URL, or {@code null} if the page has no redirect script or the
 *         request fails
 */
public static String matchRealUrl(OkHttpClient client, Headers headers, String url) {
    try {
        TimeUnit.MILLISECONDS.sleep(500);
        Request request = HttpRequestBuilder.newGetRequest(url, null);
        Response response = client.newCall(request).execute();
        String html = response.body().string();
        if (html == null || !html.contains("window.location='")) {
            return null;
        }
        String realUrl = html.split("window.location='")[1].split("'</script>")[0];
        // Literal replacement: as a regex, "/./" would also match any single-character path
        // segment such as "/a/" and corrupt the URL.
        return realUrl.replace("/./", "/");
    } catch (InterruptedException e) {
        // Preserve the interrupt for the owning thread.
        Thread.currentThread().interrupt();
        logger.warn("链接 {} 真实地址获取被中断", url, e);
        return null;
    } catch (Exception e) {
        logger.error("链接 {} 真实地址获取失败", url, e);
        return null;
    }
}
}
/**
* @Title: Defination.java
* @Package com.zhiwei.crawler.soubao
* @author 0xff
* @date 2018年6月28日 上午10:05:25
*/
package com.zhiwei.crawler.soubao;
/**
* Placeholder for shared definitions; the class currently declares no members.
*
* @ClassName: Defination
* @Description: definitions
* @author 0xff
* @date 2018-06-28 10:05:25
*/
public class Defination {
}
/**
* @Title: DevKit.java
* @Package com.zhiwei.crawler.soubao
* @author 0xff
* @date 2018年6月28日 上午9:51:02
*/
package com.zhiwei.crawler.soubao;
import java.util.Random;
/**
 * Helper methods for the soubao.net crawler.
 *
 * @author 0xff
 */
public class DevKit {

    /**
     * One shared generator. Creating a new {@code Random} seeded with {@code System.nanoTime()}
     * for every draw can repeat seeds, and therefore values, when the timer is coarse.
     */
    private static final Random RANDOM = new Random();

    /**
     * Builds a Cookie header value for soubao.net consisting of a {@code CNZZDATA1260939784}
     * entry (random number plus the current Unix time in seconds) and a random 24-character
     * {@code ASP.NET_SessionId}.
     *
     * @return String
     */
    public static String buildSoubaoCookie() {
        StringBuilder sb = new StringBuilder();
        sb.append("CNZZDATA1260939784=");
        sb.append(randomInt(10000000));
        long timestamp = System.currentTimeMillis() / 1000;
        sb.append('-').append(timestamp).append("-null%7C").append(timestamp).append("; ; ");
        sb.append("ASP.NET_SessionId=").append(randomString(24));
        return sb.toString();
    }

    /**
     * Returns a random number in {@code [0, round)}.
     *
     * @param round exclusive upper bound; must be positive
     * @return int
     */
    private static int randomInt(int round) {
        // nextInt(bound) never returns a negative value, so no sign correction is needed.
        return RANDOM.nextInt(round);
    }

    /**
     * Returns a random string of digits and lower-case letters.
     *
     * @param length number of characters to generate
     * @return String
     */
    private static String randomString(int length) {
        StringBuilder sb = new StringBuilder(length);
        for (int i = 0; i < length; i++) {
            if (randomInt(2) == 0) {
                sb.append((char) ('0' + randomInt(10)));
            } else {
                sb.append((char) ('a' + randomInt(26)));
            }
        }
        return sb.toString();
    }
}
package com.zhiwei.crawler.soubao;
import java.net.Proxy;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.concurrent.BlockingQueue;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.zhiwei.crawler.run.SoubaoCrawlerRun;
import com.zhiwei.crawler.util.ProxyClientUtil;
import com.zhiwei.zhiweiTools.tools.ZhiWeiTools;
/**
 * Worker that takes keywords from a shared queue and crawls soubao.net for each of them until
 * the queue is empty.
 *
 * @author hero
 */
public class SouBaoCrawlerThread extends Thread {
    private static final Logger logger = LogManager.getLogger(SouBaoCrawlerThread.class);
    private BlockingQueue<String> wordsQueue;

    /**
     * @param wordsQueue queue of keywords shared between workers; may be {@code null}, in which
     *        case {@link #run()} returns immediately
     */
    public SouBaoCrawlerThread(BlockingQueue<String> wordsQueue) {
        this.wordsQueue = wordsQueue;
    }

    /**
     * Drains the queue, crawling one day of results per keyword.
     *
     * <p>An exception escaping the crawl stops this worker, as before, but is now logged.
     */
    @Override
    public void run() {
        if (wordsQueue == null) {
            return;
        }
        String word;
        // poll() returns null on an empty queue. Checking size() and then calling take() can
        // block forever when another worker removes the last keyword in between.
        while ((word = wordsQueue.poll()) != null) {
            try {
                Proxy proxy = ProxyClientUtil.getProxy();
                logger.info("开始采集:::{}搜报网关键词", word);
                long start = System.currentTimeMillis();
                Crawler.start(1, word, proxy);
                long end = System.currentTimeMillis();
                logger.info("采集:::{}搜报网关键词结束,采集所用时间为:{}", word, (end - start));
            } catch (Exception e) {
                logger.error("关键词 {} 采集异常,当前线程停止采集", word, e);
                break;
            } finally {
                ZhiWeiTools.sleep(50);
            }
        }
    }
}
package com.zhiwei.crawler.util;
import java.net.Proxy;
import com.zhiwei.proxy.core.ProxyClient;
import com.zhiwei.proxy.core.ProxyClientFactory;
import com.zhiwei.rsid.common.Definition.GroupType;
/**
 * Provides proxies obtained from a shared {@link ProxyClient}.
 */
public class ProxyClientUtil {

    // Alternative settings, left commented out in the original source:
    // registry "zookeeper://202.107.192.94:2181", group "local"

    /** Registry address handed to the proxy client factory. */
    private static String registry = "zookeeper://192.168.0.203:2181";

    /** Group name handed to the proxy client factory. */
    private static String group = "hangzhou";

    private static ProxyClient client = ProxyClientFactory.build(registry, group, GroupType.PROVIDER);

    /**
     * @return the proxy returned by {@code ProxyClient.getNATProxy()}
     */
    public static Proxy getProxy() {
        Proxy natProxy = client.getNATProxy();
        return natProxy;
    }
}
/**
* Copyright © 2017宁波知微瑞驰信息科技有限公司. All rights reserved.
* @Title: Tools.java
* @Package com.zhiwei.demo
* @author 0xFF
* @date 2017年8月2日 下午5:45:34
*/
package com.zhiwei.crawler.util;
import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
/**
 * Utility for reading text resources from the classpath.
 *
 * @author 0xFF
 */
public class ReadFileTools {

    /**
     * Reads a classpath resource completely and decodes it with the given charset.
     *
     * <p>Failures are reported on standard error instead of being dropped silently; the return
     * value is still {@code null} in that case, so existing callers are unaffected.
     *
     * @param path resource path, resolved through this class's class loader
     * @param encode name of the charset used to decode the bytes
     * @return the resource content, or {@code null} if the resource is missing or cannot be
     *         read or decoded
     */
    public static String getFileContent(String path, String encode) {
        try {
            java.io.InputStream raw = ReadFileTools.class.getClassLoader().getResourceAsStream(path);
            if (raw == null) {
                System.err.println("ReadFileTools: resource not found on the classpath: " + path);
                return null;
            }
            try (BufferedInputStream bis = new BufferedInputStream(raw);
                    ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
                byte[] buf = new byte[1024];
                int count;
                while ((count = bis.read(buf)) != -1) {
                    baos.write(buf, 0, count);
                }
                return new String(baos.toByteArray(), encode);
            }
        } catch (Exception e) {
            System.err.println("ReadFileTools: failed to read resource " + path
                    + " with charset " + encode + ": " + e);
            return null;
        }
    }
}
package com.zhiwei.crawler.util;
import com.zhiwei.sendmail.SendMail;
import com.zhiwei.sendmail.bean.MailInfo;
/**
 * Sends notification e-mails for the crawler through the project's {@code SendMail} helper.
 */
public class SendMailTVCloud {

    /**
     * Sends an HTML mail with a fixed subject to one recipient.
     *
     * <p>NOTE(review): the SMTP account and its password are hard-coded below; they should be
     * moved to external configuration and the password rotated.
     *
     * @param mailContent the mail body
     * @param email the recipient address
     * @return the result reported by {@code SendMail.sendMailByHtml}
     */
    public static boolean sendMail(String mailContent, String email) {
        // Arguments in order: server host, server port, sender, recipient, account name,
        // account password, validate flag, subject, body; the final argument is null.
        MailInfo mailInfo = new MailInfo(
                "smtp.ym.163.com",
                "25",
                "zhangzhiwei@zhiweidata.com",
                email,
                "zhangzhiwei@zhiweidata.com",
                "olp1437z..",
                true,
                "监测系统天脉云采集程序",
                mailContent,
                null);
        return SendMail.sendMailByHtml(mailInfo);
    }
}
package com.zhiwei.crawler.util;
import java.util.Date;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.mongodb.BasicDBObject;
import com.mongodb.DBObject;
import com.zhiwei.crawler.dao.MediaDataDAO;
import com.zhiwei.crawler.dbtemplate.RsidClientTemplate;
import com.zhiwei.zhiweiTools.timeParse.TimeParse;
/**
 * Filters crawled records and stores the recent ones that have not been seen before.
 *
 * @author Bewilder Z
 * @date 2017-05-23 10:34:04
 */
public class TreatData {
    private static final Logger logger = LoggerFactory.getLogger(TreatData.class);

    /**
     * Validates one crawled record and, when it qualifies, saves it through
     * {@code MediaDataDAO.addMediaData}.
     *
     * <p>A record is saved only when all of the following hold:
     * <ul>
     *   <li>{@code _id} and {@code source} are present and not the empty string,
     *       and {@code time} is present;</li>
     *   <li>the parsed {@code time} is later than 24 hours before now;</li>
     *   <li>{@code RsidClientTemplate.addFilterUrl} returns {@code true} for the
     *       {@code _id} (a {@code false} result is logged as a duplicate).</li>
     * </ul>
     * Records failing any check are skipped without an exception.
     *
     * @param dataMap record fields; the keys read are {@code _id}, {@code source},
     *                {@code time}, {@code title} and {@code content}.
     *                A {@code null} map is ignored.
     */
    public static void treatDataAccount(Map<String,Object> dataMap) {
        if (dataMap == null) {
            return;
        }
        Object id = dataMap.get("_id");
        Object source = dataMap.get("source");
        Object time = dataMap.get("time");
        // All three fields are required. The previous condition OR-ed the time
        // check with the others, letting incomplete records through to a
        // NullPointerException further down.
        if (id == null || id.equals("") || source == null || source.equals("") || time == null) {
            return;
        }

        // Only records newer than one day are kept.
        Date cutoff = new Date(System.currentTimeMillis() - 24L * 60 * 60 * 1000);
        Date date = TimeParse.stringFormartDate(time.toString());
        // NOTE(review): assumes the parser may return null for unparseable input — confirm.
        if (date == null || !date.after(cutoff)) {
            return;
        }

        logger.info("去重的链接为:{}", id);
        String url = id.toString();
        if (!RsidClientTemplate.addFilterUrl(url)) {
            logger.info("搜报网地址为:{},此条数据重复", id);
            return;
        }

        // Strip HTML tags from the content itself. Previously replaceAll was
        // applied to the "" literal only (method calls bind tighter than +),
        // so tags were never removed. Missing content is stored as "".
        Object rawContent = dataMap.get("content");
        String content = rawContent == null ? "" : rawContent.toString().replaceAll("<[^>]*>", "");

        DBObject doc = new BasicDBObject();
        doc.put("_id", url);
        doc.put("url", url);
        doc.put("title", dataMap.get("title"));
        doc.put("time", date);
        doc.put("source", source);
        doc.put("content", content);
        doc.put("pt", "平媒");
        doc.put("type", "平媒");
        doc.put("savetime", System.currentTimeMillis());
        MediaDataDAO.addMediaData(doc);
    }
}
{
"registry": "zookeeper://192.168.0.203:2181",
"group": "mongosaveservice",
"shardings":
{
"enforce": false,
"settings": [
{
"collection": "video",
"method": "month",
"field": "time"
}
]
}
}
{
"registry": "zookeeper://192.168.0.234:2181",
"group": "mongosaveservice",
"shardings":
{
"enforce": false,
"settings": [
{
"collection": "video",
"method": "month",
"field": "time"
}
]
}
}
#####################生产环境#################################
mongoIp=192.168.0.101
mongoPort=27017
db.username=zzwno
db.paasword=zzwno1q2w3e4r
db.certifiedDB=admin
##save data dbInfo
savedbName=mediaspider
saveCollName=net_media
##crawler word dbInfo
crawlerdbName=qbjcPhoenix
crawlerCollName=qbjc_crawlerword
rsid.zookeeper.url = zookeeper://192.168.0.203:2181;
rsid.zookeeper.group=rsidservernew
redisKey=media
#####################测试环境#################################
#mongoIp=192.168.0.233
#mongoPort=27017
#db.username=zzwno
#db.paasword=zzwno1q2w3e4r
#db.certifiedDB=admin
###save data dbInfo
#savedbName=mediaspider
#saveCollName=net_media
###crawler word dbInfo
#crawlerdbName=qbjcPhoenix
#crawlerCollName=qbjc_crawlerword
#rsid.zookeeper.url = zookeeper://192.168.0.234:2181;
#rsid.zookeeper.group=rsidservernew
#redisKey=media
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<!-- log4j2 自身的日志级别 -->
<Configuration status="WARN">
<Appenders>
<!-- 定义日志输出地 -->
<Console name="Console" target="SYSTEM_OUT">
<PatternLayout pattern="%d{yyyy-MM-dd HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n" />
</Console>
</Appenders>
<Loggers>
<Root level="info">
<AppenderRef ref="Console" />
</Root>
<!-- 所有的 logger 均继承 Root
当 additivity 为 true 时, 父子 logger 均会打印
当 additivity 为 false 时, 仅子 logger 会打印 -->
<Logger name = "mylog" level="error" additivity="false">
<AppenderRef ref="Console" />
</Logger>
</Loggers>
</Configuration>
\ No newline at end of file
registry=zookeeper://192.168.0.203:2181
group=hangzhou
minCount=20
maxCount=40
########################################################
#registry=zookeeper://192.168.0.36:2181
#group=testGroup
#minCount=10
#maxCount=20
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment