Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
W
weibohotcrawler
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
chenweiyang
weibohotcrawler
Commits
cd252c2d
Commit
cd252c2d
authored
Dec 09, 2017
by
yangchen
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
时间一天内的判断修改
parent
fdbe7b0b
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
66 additions
and
34 deletions
+66
-34
src/main/java/com/zhiwei/weibocrawler/crawler/DataCrawlerStart.java
+1
-1
src/main/java/com/zhiwei/weibocrawler/crawler/DataUpdate.java
+9
-2
src/main/java/com/zhiwei/weibocrawler/crawler/getdata/WeiboCrawlerAnalysis.java
+4
-3
src/main/java/com/zhiwei/weibocrawler/httpclient/HttpClientDemo.java
+52
-26
src/main/java/com/zhiwei/weibocrawler/rsidClient/DataQueue.java
+0
-1
src/main/java/com/zhiwei/weibocrawler/rsidClient/UpdateQueue.java
+0
-1
No files found.
src/main/java/com/zhiwei/weibocrawler/crawler/DataCrawlerStart.java
View file @
cd252c2d
...
@@ -13,7 +13,7 @@ public class DataCrawlerStart{
...
@@ -13,7 +13,7 @@ public class DataCrawlerStart{
private
ScheduledExecutorService
scheduled
;
private
ScheduledExecutorService
scheduled
;
public
DataCrawlerStart
()
{
public
DataCrawlerStart
()
{
this
.
scheduled
=
Executors
.
newScheduledThreadPool
(
4
);
this
.
scheduled
=
Executors
.
newScheduledThreadPool
(
3
);
}
}
public
void
start
()
{
public
void
start
()
{
...
...
src/main/java/com/zhiwei/weibocrawler/crawler/DataUpdate.java
View file @
cd252c2d
package
com
.
zhiwei
.
weibocrawler
.
crawler
;
package
com
.
zhiwei
.
weibocrawler
.
crawler
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.List
;
import
org.slf4j.Logger
;
import
org.slf4j.Logger
;
...
@@ -9,6 +10,7 @@ import com.zhiwei.weibobusiness.weibo4j.model.Status;
...
@@ -9,6 +10,7 @@ import com.zhiwei.weibobusiness.weibo4j.model.Status;
import
com.zhiwei.weibocrawler.crawler.getdata.WeiboCrawlerAnalysis
;
import
com.zhiwei.weibocrawler.crawler.getdata.WeiboCrawlerAnalysis
;
import
com.zhiwei.weibocrawler.rsidClient.DataQueue
;
import
com.zhiwei.weibocrawler.rsidClient.DataQueue
;
import
com.zhiwei.weibocrawler.rsidClient.UpdateQueue
;
import
com.zhiwei.weibocrawler.rsidClient.UpdateQueue
;
import
com.zhiwei.zhiweiTools.timeParse.TimeParse
;
import
com.zhiwei.zhiweiTools.tools.ZhiWeiTools
;
import
com.zhiwei.zhiweiTools.tools.ZhiWeiTools
;
/***
/***
*
*
...
@@ -39,13 +41,18 @@ public class DataUpdate implements Runnable{
...
@@ -39,13 +41,18 @@ public class DataUpdate implements Runnable{
while
(
true
)
{
while
(
true
)
{
try
{
try
{
ZhiWeiTools
.
sleep
(
1000
);
ZhiWeiTools
.
sleep
(
1000
);
if
(
i
>
60
||
DataQueue
.
linkQueue
.
size
()
>=
50
)
{
if
(
i
>
60
0
||
DataQueue
.
linkQueue
.
size
()
>=
50
)
{
logger
.
info
(
"此次更新更新队列中数据量"
+
DataQueue
.
linkQueue
.
size
());
logger
.
info
(
"此次更新更新队列中数据量"
+
DataQueue
.
linkQueue
.
size
());
List
<
String
>
midList
=
DataQueue
.
get
(
48
);
List
<
String
>
midList
=
DataQueue
.
get
(
48
);
if
(
midList
!=
null
&&
midList
.
size
()
>
0
)
{
if
(
midList
!=
null
&&
midList
.
size
()
>
0
)
{
List
<
Status
>
list
=
WeiboCrawlerAnalysis
.
getWeiboData
(
midList
,
token
);
List
<
Status
>
list
=
WeiboCrawlerAnalysis
.
getWeiboData
(
midList
,
token
);
logger
.
info
(
"更新数据量"
+
list
.
size
());
logger
.
info
(
"更新数据量"
+
list
.
size
());
UpdateQueue
.
add
(
list
);
for
(
Status
status
:
list
)
{
Date
date
=
new
Date
(
new
Date
().
getTime
()-
24
*
60
*
60
*
1000
);
if
(
status
.
getCreatedAt
().
after
(
date
)){
UpdateQueue
.
add
(
status
);
}
}
}
}
i
=
1
;
i
=
1
;
}
else
{
}
else
{
...
...
src/main/java/com/zhiwei/weibocrawler/crawler/getdata/WeiboCrawlerAnalysis.java
View file @
cd252c2d
package
com
.
zhiwei
.
weibocrawler
.
crawler
.
getdata
;
package
com
.
zhiwei
.
weibocrawler
.
crawler
.
getdata
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.Date
;
import
java.util.List
;
import
java.util.List
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
import
org.jsoup.Jsoup
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Document
;
...
@@ -16,9 +19,9 @@ import com.zhiwei.weibobusiness.weibo4j.model.Status;
...
@@ -16,9 +19,9 @@ import com.zhiwei.weibobusiness.weibo4j.model.Status;
import
com.zhiwei.weibobusiness.weibo4j.model.StatusWapper
;
import
com.zhiwei.weibobusiness.weibo4j.model.StatusWapper
;
import
com.zhiwei.weibobusiness.weibo4j.model.WeiboException
;
import
com.zhiwei.weibobusiness.weibo4j.model.WeiboException
;
import
com.zhiwei.weibocrawler.httpclient.HttpClientDemo
;
import
com.zhiwei.weibocrawler.httpclient.HttpClientDemo
;
import
com.zhiwei.weibocrawler.rsidClient.DataQueue
;
import
com.zhiwei.weibocrawler.rsidClient.RsidClientDAO
;
import
com.zhiwei.weibocrawler.rsidClient.RsidClientDAO
;
import
com.zhiwei.zhiweiTools.tools.ZhiWeiTools
;
import
com.zhiwei.zhiweiTools.tools.ZhiWeiTools
;
import
com.zhiwei.weibocrawler.rsidClient.DataQueue
;
/**
/**
*
*
* @ClassName WeiboCrawlerAnalysis
* @ClassName WeiboCrawlerAnalysis
...
@@ -52,7 +55,6 @@ public class WeiboCrawlerAnalysis {
...
@@ -52,7 +55,6 @@ public class WeiboCrawlerAnalysis {
if
(
i
>
48
)
{
if
(
i
>
48
)
{
try
{
try
{
mids
=
mids
.
substring
(
0
,
mids
.
length
()-
1
);
mids
=
mids
.
substring
(
0
,
mids
.
length
()-
1
);
System
.
out
.
println
(
mids
);
StatusWapper
statusWapper
=
searchBusiness
.
showStatusBusniess
(
mids
);
StatusWapper
statusWapper
=
searchBusiness
.
showStatusBusniess
(
mids
);
statuses
.
addAll
(
statusWapper
.
getStatuses
());
statuses
.
addAll
(
statusWapper
.
getStatuses
());
i
=
0
;
i
=
0
;
...
@@ -127,5 +129,4 @@ public class WeiboCrawlerAnalysis {
...
@@ -127,5 +129,4 @@ public class WeiboCrawlerAnalysis {
}
}
}
}
}
}
src/main/java/com/zhiwei/weibocrawler/httpclient/HttpClientDemo.java
View file @
cd252c2d
...
@@ -10,49 +10,75 @@ import org.apache.http.client.methods.HttpGet;
...
@@ -10,49 +10,75 @@ import org.apache.http.client.methods.HttpGet;
import
org.apache.http.impl.client.CloseableHttpClient
;
import
org.apache.http.impl.client.CloseableHttpClient
;
import
org.apache.http.impl.client.HttpClients
;
import
org.apache.http.impl.client.HttpClients
;
import
org.apache.http.util.EntityUtils
;
import
org.apache.http.util.EntityUtils
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
com.zhiwei.zhiweiTools.httpClient.HttpClientTemplateOK
;
public
class
HttpClientDemo
{
public
class
HttpClientDemo
{
private
static
Logger
logger
=
LoggerFactory
.
getLogger
(
HttpClientDemo
.
class
);
// public static String executeHttpRequestGet(String url) throws IOException {
// String result = null;
// Map<String, String> headerMap = new HashMap<String, String>();
// headerMap.put("User-Agent",
// "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36");
// headerMap.put("Accept","*/*");
// headerMap.put("Accept-Encoding", "gzip, deflate, br");
// headerMap.put("Accept-Language", "zh-CN,zh;q=0.9");
// headerMap.put("Connection", "keep-alive");
// headerMap.put("Content-Type", "application/x-www-form-urlencoded");
// headerMap.put("Host", "weibo.com");
// CloseableHttpClient httpClient = null;
// for(int j = 1;j <= 3;j++) {
// try {
// HttpGet httpGet = new HttpGet(url);
// RequestConfig requestConfig = RequestConfig.custom()
// .setSocketTimeout(8000).setConnectTimeout(8000).build();
// httpClient = HttpClients.custom()
// .setDefaultRequestConfig(requestConfig).build();
// if (headerMap != null) {
// for (Entry<String, String> header : headerMap.entrySet()) {
// httpGet.setHeader(header.getKey(), header.getValue());
// }
// }
// result = EntityUtils
// .toString(httpClient.execute(httpGet).getEntity());
// return result;
// }catch (Exception e) {
// e.printStackTrace();
// continue;
// }finally {
// if (httpClient != null) {
// httpClient.close();
// }
// }
// }
// return result;
//
// }
public
static
String
executeHttpRequestGet
(
String
url
)
throws
IOException
{
public
static
String
executeHttpRequestGet
(
String
url
)
throws
IOException
{
String
result
=
null
;
String
result
=
null
;
Map
<
String
,
String
>
headerMap
=
new
HashMap
<
String
,
String
>();
Map
<
String
,
String
>
headerMap
=
new
HashMap
<
String
,
String
>();
headerMap
.
put
(
"User-Agent"
,
headerMap
.
put
(
"User-Agent"
,
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"
);
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"
);
headerMap
.
put
(
"Accept"
,
"*/*"
);
headerMap
.
put
(
"Accept"
,
"*/*"
);
headerMap
.
put
(
"Accept-Encoding"
,
"gzip, deflate, br"
);
headerMap
.
put
(
"Accept-Language"
,
"zh-CN,zh;q=0.9"
);
headerMap
.
put
(
"Accept-Language"
,
"zh-CN,zh;q=0.9"
);
headerMap
.
put
(
"Connection"
,
"keep-alive"
);
headerMap
.
put
(
"Connection"
,
"keep-alive"
);
headerMap
.
put
(
"Content-Type"
,
"application/x-www-form-urlencoded"
);
headerMap
.
put
(
"Content-Type"
,
"application/x-www-form-urlencoded"
);
headerMap
.
put
(
"Host"
,
"weibo.com"
);
headerMap
.
put
(
"Host"
,
"weibo.com"
);
CloseableHttpClient
httpClient
=
null
;
for
(
int
j
=
1
;
j
<=
3
;
j
++)
{
try
{
try
{
HttpGet
httpGet
=
new
HttpGet
(
url
);
result
=
HttpClientTemplateOK
.
get
(
url
,
null
,
headerMap
);
RequestConfig
requestConfig
=
RequestConfig
.
custom
()
}
catch
(
Exception
e
)
{
.
setSocketTimeout
(
8000
).
setConnectTimeout
(
8000
).
build
();
logger
.
error
(
"httpClient 获取数据出现问题:{}"
,
e
.
getMessage
());
httpClient
=
HttpClients
.
custom
()
.
setDefaultRequestConfig
(
requestConfig
).
build
();
if
(
headerMap
!=
null
)
{
for
(
Entry
<
String
,
String
>
header
:
headerMap
.
entrySet
())
{
httpGet
.
setHeader
(
header
.
getKey
(),
header
.
getValue
());
}
}
result
=
EntityUtils
.
toString
(
httpClient
.
execute
(
httpGet
).
getEntity
());
return
result
;
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
continue
;
}
finally
{
if
(
httpClient
!=
null
)
{
httpClient
.
close
();
}
}
}
}
return
result
;
return
result
;
}
}
}
}
src/main/java/com/zhiwei/weibocrawler/rsidClient/DataQueue.java
View file @
cd252c2d
...
@@ -17,7 +17,6 @@ public class DataQueue {
...
@@ -17,7 +17,6 @@ public class DataQueue {
public
static
ListQueue
<
String
>
linkQueue
=
new
ListQueue
<
String
>();
//已去重数据队列
public
static
ListQueue
<
String
>
linkQueue
=
new
ListQueue
<
String
>();
//已去重数据队列
public
static
void
offer
(
String
mid
)
{
public
static
void
offer
(
String
mid
)
{
System
.
out
.
println
(
"更新队列中的数据大小===="
+
linkQueue
.
size
());
linkQueue
.
offer
(
mid
);
linkQueue
.
offer
(
mid
);
}
}
...
...
src/main/java/com/zhiwei/weibocrawler/rsidClient/UpdateQueue.java
View file @
cd252c2d
...
@@ -23,7 +23,6 @@ public class UpdateQueue {
...
@@ -23,7 +23,6 @@ public class UpdateQueue {
* @param mid
* @param mid
*/
*/
public
static
void
add
(
Status
status
){
public
static
void
add
(
Status
status
){
System
.
out
.
println
(
"更新后队列中的数据大小===="
+
linkQueue
.
size
());
linkQueue
.
offer
(
status
);
linkQueue
.
offer
(
status
);
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment