Commit dad70819 by cwy

自媒体获取修改

parent 391fcd6c
......@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>com.zhiwei</groupId>
<artifactId>source-forward</artifactId>
<version>0.2.3-SNAPSHOT</version>
<version>0.2.4-SNAPSHOT</version>
<name>source-forward</name>
<description>验证网媒的转发关系及链接的有效性(转发验证微信及自媒体匹配率不高)</description>
......
......@@ -101,7 +101,7 @@ public class MediaSelfSourceCrawler {
try {
if (Objects.isNull(ex)) {
try {
parseHtml(rs.body().string(), attr, callback);
parseHtml(rs.body().string(), attr, callback, rs.request().url().uri().toString());
} catch (Exception e) {
logger.error("解析出错", e);
}
......@@ -151,12 +151,12 @@ public class MediaSelfSourceCrawler {
* @param callback
*/
private void parseHtml(String result, Attribution attr,
MediaSelfSourceDataCallBack callback) {
MediaSelfSourceDataCallBack callback, String eUrl) {
String source = null;
String channel = null;
String url = attr.get().toString();
try {
source = MatchSource.matchMediaSelfSource(url,result);
source = MatchSource.matchMediaSelfSource(url + eUrl,result);
logger.info(url+"=======" + source);
channel = MatchChannel.verifyChannel(url);
if(channel==null){
......
......@@ -3,9 +3,12 @@ package com.zhiwei.source_forward.crawler;
import static java.util.Objects.nonNull;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
......@@ -20,7 +23,6 @@ import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.source_forward.bean.UrlLiveBean;
import com.zhiwei.source_forward.bean.UrlLiveBean.Attribution;
import com.zhiwei.source_forward.util.UrlLiveDataCallback;
import com.zhiwei.tools.httpclient.HeaderTool;
import com.zhiwei.tools.tools.ZhiWeiTools;
import okhttp3.Request;
......@@ -50,7 +52,7 @@ public class UrlLiveCrawler {
counter.add();
if (nonNull(url)) {
try {
ZhiWeiTools.sleep(10);
// ZhiWeiTools.sleep(3000);
search(counter, url, Attribution.of(url, 1), callback);
} catch (Exception e) {
logger.error("搜索创建出错:", e);
......@@ -63,13 +65,23 @@ public class UrlLiveCrawler {
private GroupSync search(GroupSync counter, String url,
Attribution attr, UrlLiveDataCallback callback) {
System.out.println(url);
// System.out.println(url);
url = dealUrl(url);
logger.info("当前处理 URL: {}", url);
// Map<String,String> headers = new HashMap<>();
Map<String,String> headers = HeaderTool.getCommonHead();
if(url.contains("www.toutiao.com")){
Map<String,String> headers = new HashMap<>();
if(url.contains("toutiao.com")){
headers.put("referer", url);
headers.put("cookie", "csrftoken=6d0e5967684dbb57cea14dc12858d263; tt_webid=6763913092738418180; tt_webid=6763913092738418180; WEATHER_CITY=%E5%8C%97%E4%BA%AC; __utmz=24953151.1576744435.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _ga=GA1.2.1570444666.1576744435; ttcid=d5fa0cec2bca426cbbddeb27a8bc72f629; UM_distinctid=170f5ad3a5b43e-04a6930812e3c5-f313f6d-240000-170f5ad3a5c40b; s_v_web_id=verify_k9wn4wvx_J8Tm9B3v_4KQj_4pYw_B3C5_Bz00jljwk2Ik; SLARDAR_WEB_ID=770be4fe-dd54-4701-a4ed-396b861a237d; __utmc=24953151; __utma=24953151.1570444666.1576744435.1589341084.1589355043.4; CNZZDATA1259612802=2091325281-1587691681-%7C1589354688; __ac_nonce=05ec2023000312916dbf0; __ac_signature=YYVItAAgEBDxesof46KjamGESaAAD9LCPu9LY3i693yRwgjuLokObvXcXAHluuEslefdgz60kyPRc1WnihwB4acMsJgn1wYE8IuqB3toZpnIZRexNBULILeZxouOJAtnxO6; __tasessionId=402dor9vo1589772849201; tt_scid=yP.oipZ1w-SChWahT4a7rhJ2gsjG-rJO.4UkyTROzer4MBRJ4bAv7POpDKAcZwzc497f");
headers.put("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
headers.put("accept-encoding", "gzip, deflate, br");
headers.put("accept-language", "zh-CN,zh;q=0.9");
headers.put("cache-control", "no-cache");
headers.put("sec-fetch-dest", "document");
headers.put("sec-fetch-mode", "navigate");
headers.put("sec-fetch-site", "same-origin");
headers.put("sec-fetch-user", "?1");
headers.put("upgrade-insecure-requests", "1");
headers.put("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36");
}else if(url.contains("zhihu.com")) {
url = treatZhihuUrl(url);
}
......@@ -77,7 +89,7 @@ public class UrlLiveCrawler {
Request request = RequestUtils.wrapGet(url, headers);
if(Objects.nonNull(request)) {
counter.add();
httpBoot.asyncCall(RequestUtils.wrapGet(url), ProxyHolder.NAT_HEAVY_PROXY).whenComplete((rs,ex) -> {
httpBoot.asyncCall(request, ProxyHolder.NAT_HEAVY_PROXY).whenComplete((rs,ex) -> {
try {
if (Objects.isNull(ex)) {
if(rs.isSuccessful()) {
......@@ -200,6 +212,12 @@ public class UrlLiveCrawler {
if(Objects.isNull(title) || title.isEmpty()) {
title = doc.select("h2").text();
}
// 获取title
Matcher ma5 = Pattern.compile("var msg_title = \'(.*)\'")
.matcher(result);
if (ma5.find()) {
title = ma5.group(1).replaceAll(" ", " ").trim();
}
}else if(url.contains("kuaibao")){
title = doc.select("p.title").text().replaceAll(" ", "");
}else if(url.contains("chinadaily.com.cn")){
......@@ -224,8 +242,11 @@ public class UrlLiveCrawler {
title = "网页已删除";
}else if(url.contains("zhihu.com")) {
JSONObject resultJson = JSONObject.parseObject(result);
title = resultJson.getString("title")!=null?resultJson.getString("title"):resultJson.getString("message");
if(url.contains("/answer/")) {
title = resultJson.getJSONObject("question").getString("title");
}else if(url.contains("/question/") && !url.contains("/answer/") || url.contains("/p/")) {
title = resultJson.getString("title");
}
}else if(url.contains("360kuai.com") && result.contains("您访问的文章走失了")) {
title = String.valueOf("404");
}else if(result.contains("文章没有找到哦") && url.contains("yidianzixun.com")) {
......@@ -304,10 +325,4 @@ public class UrlLiveCrawler {
return url;
}
}
......@@ -32,7 +32,8 @@ public class MediaSelfSource {
ProxyInit.initProxy();
List<String> urlList = new ArrayList<>();
urlList.add("https://www.dcdapp.com/article/6819085953756299789");
urlList.add("http://iphone.myzaker.com/l.php?l=5ec0d951b15ec0157b6b4e46");
List<MediaSelfSourceBean> u = MediaSelfSource.getMediaSelfSource(urlList);
for(MediaSelfSourceBean b : u) {
System.out.println(b.toString());
......
......@@ -72,12 +72,13 @@ public class URLLive {
public static void main(String[] args) {
ProxyInit.initProxy();
List<String> urlList = new ArrayList<>();
urlList.add("http://a.mp.uc.cn/article.html?uc_param_str=frdnsnpfvecpntnwprdssskt#!wm_aid=038b8207b444418c845f43e4d2d3a754");
urlList.add("http://www.toutiao.com/a1665677841741827");
// urlList.add("https://mp.weixin.qq.com/s?__biz=MzA3NjgyNTU5Nw==&mid=2247486586&idx=2&sn=419218b3c831b17d2b9bd9a5281ea842&scene=6#wechat_redirect");
List<UrlLiveBean> u = URLLive.verificationURLLive(urlList);
for(UrlLiveBean b : u) {
System.out.println(b.toString());
}
}
}
static class UrlLiveCrawlerThread extends Thread{
......
......@@ -432,8 +432,11 @@ public class MatchSource {
source = "快资讯-" + source;
}
}else if(url.contains("cj.sina.com.cn") || url.contains("finance.sina.cn") ||
url.contains("tech.sina.cn") || url.contains("news.sina.cn")){
url.contains("tech.sina.cn") || url.contains("news.sina.cn") || url.contains("k.sina.cn")){
source = document.select("h2.weibo_user").text();
if(Objects.isNull(source) || source.length() < 1) {
source = document.select("#top_bar > div > div.date-source > a").text();
}
if((Objects.isNull(source) || source.length() < 1) && html.contains("<meta name=\"mediaid\"")){
//新浪科技头条号
source = html.split("<meta name=\"mediaid\" content=\"")[1].split("\"")[0].trim();
......@@ -453,6 +456,16 @@ public class MatchSource {
if(source!=null && source.length()>1){
source = "新浪-" + source;
}
}else if(url.contains("finance.ifeng.com")){
source = html.split("weMediaName\":\"")[1].split("\",")[0];
if(source!=null && source.length()>1){
source = "大风号-" + source;
}
}else if(url.contains("ihouse.ifeng.com")){
source = document.select("body > section.article > span > a").text();
if(source!=null && source.length()>1){
source = "大风号-" + source;
}
}else if(url.contains("k.sina.cn")){
source = document.select("h2.weibo_user").text();
if(source!=null && source.length()>1){
......@@ -635,19 +648,6 @@ public class MatchSource {
if(StringUtils.isNotBlank(source)){
source = "创业邦-" + source;
}
}else if(url.contains("36kr.com")){
source = document.select("div.info-header-text > a.author-name").text();
if(StringUtils.isNotBlank(source)){
return "36氪-" + source;
}
source = document.select("h4.author-name").text();
if(StringUtils.isNotBlank(source)){
return "36氪-" + source;
}
source = document.select("span.author-nickname").text();
if(StringUtils.isNotBlank(source)){
return "36氪-" + source;
}
}else if(url.contains("lianxianjia.com")){
source = document.select("span.author-name").text();
if(StringUtils.isNotBlank(source)){
......@@ -760,6 +760,20 @@ public class MatchSource {
source = "推酷-" + source;
}
}
if(url.contains("36kr.com")){
source = document.select("div.info-header-text > a.author-name").text();
if(StringUtils.isNotBlank(source)){
return "36氪-" + source;
}
source = document.select("h4.author-name").text();
if(StringUtils.isNotBlank(source)){
return "36氪-" + source;
}
source = document.select("span.author-nickname").text();
if(StringUtils.isNotBlank(source)){
return "36氪-" + source;
}
}
return source;
} catch (Exception e) {
e.printStackTrace();
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment