Commit 8d7380c6 by zhiwei

自媒体号采集

parents
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Maven build descriptor for the self-media account discovery crawlers. -->
<modelVersion>4.0.0</modelVersion>
<!-- Project coordinates. -->
<groupId>com.zhiwei.account</groupId>
<artifactId>discover-mediaself-account</artifactId>
<version>0.0.1-SNAPSHOT</version>
<!-- The description below reads: "query the matching account data by keyword". -->
<description>根据关键词查询相应的账号数据</description>
<!--
NOTE(review): the crawler classes also import fastjson, commons-lang3 and jsoup,
none of which is declared here; presumably they arrive transitively through the
artifacts below. Confirm, and declare them explicitly if so.
-->
<dependencies>
<!-- Presumably provides the com.zhiwei.crawler.* classes (HttpBoot, ProxyFactory, RequestUtils); verify. -->
<dependency>
<groupId>com.zhiwei.crawler</groupId>
<artifactId>crawler-core</artifactId>
<version>0.3.0-RELEASE</version>
</dependency>
<!-- Presumably provides com.zhiwei.tools.tools.* (URLCodeUtil, ZhiWeiTools); verify. -->
<dependency>
<groupId>com.zhiwei.tools</groupId>
<artifactId>zhiwei-tools</artifactId>
<version>0.1.2-SNAPSHOT</version>
</dependency>
<!-- Presumably provides com.zhiwei.excelpoi.excel.PoiExcelUtil used for the Excel export; verify. -->
<dependency>
<groupId>com.zhiwei</groupId>
<artifactId>excelpoi</artifactId>
<version>0.0.3-SNAPSHOT</version>
</dependency>
</dependencies>
</project>
\ No newline at end of file
package com.zhiwei.account.crawler;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.tools.tools.URLCodeUtil;
/**
 * Collects Sohu media accounts that match a keyword, using the Sohu search endpoint.
 *
 * @author qq859
 */
public class SohuAccount {

    private static HttpBoot httpBoot = new HttpBoot();
    private static final String registry = "zookeeper://192.168.0.36:2181";
    private static final String group = "local";

    /** Number of results requested per page; also the step of the {@code from} offset. */
    private static final int PAGE_SIZE = 50;

    /**
     * Manual entry point: initialises the proxy factory and runs one sample keyword search.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        ProxyFactory.init(registry, group, GroupType.PROVIDER);
        String word = "京东";
        try {
            SohuAccount.getSohuAccountByWord(word);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Fetches Sohu account information for a keyword, page by page.
     *
     * <p>Paging stops when the response is blank, does not mention {@code userName},
     * carries no {@code data.media} entries, or reports {@code esEnd == true}.
     *
     * @param word the search keyword; it is URL-encoded as UTF-8 before being sent
     * @return one map per account with the keys {@code link}, {@code id}, {@code mail},
     *         {@code name}, {@code description}, {@code avatorUrl}, {@code totalReadNum}
     *         and {@code newsCount}; never {@code null}, possibly empty
     * @throws IOException if an HTTP call or reading its body fails
     */
    public static List<Map<String,Object>> getSohuAccountByWord(String word) throws IOException{
        List<Map<String,Object>> dataList = new ArrayList<>();
        int page = 0;
        while (true) {
            String url = "http://search.sohu.com/search/meta?keyword="+URLCodeUtil.getURLEncode(word, "utf-8")+"&spm-pre=smpc.csrpage.0.0.15522844808206u49PLo&from="+page*PAGE_SIZE+"&size="+PAGE_SIZE+"&searchType=media&queryType=edit";
            System.out.println("page============"+page);
            String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url),ProxyHolder.NAT_PROXY).body().string();
            if (StringUtils.isBlank(htmlBody) || !htmlBody.contains("userName")) {
                break;
            }
            // The substring check above does not guarantee the expected structure,
            // so every level is null-checked instead of dereferenced blindly.
            JSONObject root = JSONObject.parseObject(htmlBody);
            JSONObject dataJson = root == null ? null : root.getJSONObject("data");
            JSONArray mediaArray = dataJson == null ? null : dataJson.getJSONArray("media");
            if (mediaArray == null || mediaArray.isEmpty()) {
                break;
            }
            for (int i = 0; i < mediaArray.size(); i++) {
                JSONObject json = mediaArray.getJSONObject(i);
                if (json == null) {
                    continue;
                }
                dataList.add(toAccountMap(json));
            }
            if (dataJson.getBooleanValue("esEnd")) {
                break;
            }
            page++;
        }
        return dataList;
    }

    /** Copies the fields of one {@code media} entry into the output map and prints it. */
    private static Map<String,Object> toAccountMap(JSONObject json) {
        Map<String,Object> dataMap = new HashMap<String,Object>();
        dataMap.put("link", json.getString("weiboUrl"));
        dataMap.put("id", json.getInteger("id"));
        dataMap.put("mail", json.getString("passport"));
        dataMap.put("name", json.getString("userName"));
        dataMap.put("description", json.getString("description"));
        dataMap.put("avatorUrl", json.getString("avatorUrl"));
        // scoreMap may be absent for an entry; store null counters instead of failing.
        JSONObject scoreMap = json.getJSONObject("scoreMap");
        dataMap.put("totalReadNum", scoreMap == null ? null : scoreMap.getInteger("totalPv"));
        dataMap.put("newsCount", scoreMap == null ? null : scoreMap.getInteger("newsCount"));
        System.out.println(dataMap);
        return dataMap;
    }
}
package com.zhiwei.account.crawler;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.tools.tools.URLCodeUtil;
import com.zhiwei.tools.tools.ZhiWeiTools;
/**
 * Collects UC "大鱼号" (Dayu) accounts that match a keyword.
 *
 * @author qq859
 */
public class UCAccount {

    private static HttpBoot httpBoot = new HttpBoot();
    private static final String registry = "zookeeper://192.168.0.36:2181";
    private static final String group = "local";

    /**
     * Manual entry point: initialises the proxy factory and runs one sample keyword search.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        ProxyFactory.init(registry, group, GroupType.PROVIDER);
        String word = "京东";
        try {
            UCAccount.getUCAccountByWord(word);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Fetches Dayu account information for a keyword, page by page (pages start at 1).
     *
     * <p>Each response carries an HTML fragment in {@code data.feed_html}; every element
     * with class {@code cell-wrapper} in it becomes one account. Paging stops when the
     * response is blank, lacks {@code feed_html}, or yields no such element.
     *
     * @param word the search keyword; it is URL-encoded as UTF-8 before being sent
     * @return one map per account with the keys {@code link}, {@code name},
     *         {@code avatorUrl}, {@code summary} and {@code follow_count};
     *         never {@code null}, possibly empty
     * @throws IOException if an HTTP call or reading its body fails
     */
    public static List<Map<String,Object>> getUCAccountByWord(String word) throws IOException{
        List<Map<String,Object>> dataList = new ArrayList<>();
        int page = 1;
        while (true) {
            String url = "https://m.sm.cn/api/rest?method=Subscribe.feed&q="+URLCodeUtil.getURLEncode(word, "utf-8")+"&format=json&by=submit&snum=0&page="+page;
            System.out.println("page============"+page);
            String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url),ProxyHolder.NAT_PROXY).body().string();
            if (StringUtils.isBlank(htmlBody) || !htmlBody.contains("feed_html")) {
                break;
            }
            // The substring check does not guarantee the structure: guard against a
            // missing "data" object or a null "feed_html" (Jsoup.parse rejects null).
            JSONObject root = JSONObject.parseObject(htmlBody);
            JSONObject dataJson = root == null ? null : root.getJSONObject("data");
            String dataText = dataJson == null ? null : dataJson.getString("feed_html");
            if (dataText == null) {
                break;
            }
            Document document = Jsoup.parse(dataText);
            Elements elements = document.getElementsByClass("cell-wrapper");
            if (elements.isEmpty()) {
                break;
            }
            for (Element element : elements) {
                dataList.add(toAccountMap(element));
            }
            page++;
        }
        return dataList;
    }

    /** Extracts the account fields from one {@code cell-wrapper} element and prints them. */
    private static Map<String,Object> toAccountMap(Element element) {
        String name = element.select("div.info>p.title").text().replace("大鱼号", "");
        String description = element.select("div.info>p.summary").text();
        String avatorUrl = element.select("div.img").attr("data-image");
        String link = element.select("a.cell").attr("href");
        String followCount = element.select("div.info>div.icons>span").text().replace("人关注", "");

        Map<String,Object> dataMap = new HashMap<String,Object>();
        dataMap.put("link", link);
        dataMap.put("name", name);
        dataMap.put("avatorUrl", avatorUrl);
        dataMap.put("summary", ZhiWeiTools.delHTMLTag(description));
        dataMap.put("follow_count", followCount);
        System.out.println(dataMap);
        return dataMap;
    }
}
package com.zhiwei.account.crawler;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import javax.script.Invocable;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import org.apache.commons.lang3.StringUtils;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zhiwei.common.config.GroupType;
import com.zhiwei.crawler.core.HttpBoot;
import com.zhiwei.crawler.proxy.ProxyFactory;
import com.zhiwei.crawler.proxy.ProxyHolder;
import com.zhiwei.crawler.utils.RequestUtils;
import com.zhiwei.excelpoi.excel.PoiExcelUtil;
import com.zhiwei.tools.tools.URLCodeUtil;
/**
 * Collects Yidian Zixun (一点资讯) accounts, either by keyword search or by walking
 * the channel categories of the public media list page.
 *
 * @author qq859
 */
public class YiDianZiXunAccount {

    private static HttpBoot httpBoot = new HttpBoot();
    private static final String registry = "zookeeper://192.168.0.36:2181";
    private static final String group = "local";

    /** Text that precedes the JSON blob embedded in the media list page. */
    private static final String DOCINFO_START = "window.yidian.docinfo = ";
    /** Text that terminates the embedded JSON blob. */
    private static final String DOCINFO_END = "</script>";

    /**
     * Manual entry point: crawls all accounts reachable from the media list page and
     * exports them to an Excel file.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        ProxyFactory.init(registry, group, GroupType.PROVIDER);
        try {
            List<Map<String,Object>> bodyList = YiDianZiXunAccount.getYiDianZiXunAccountByType();
            System.out.println("bodyList size is :" + bodyList.size());
            List<String> headList = new ArrayList<>();
            headList.add("media_id");
            headList.add("media_name");
            headList.add("avatorUrl");
            headList.add("userid");
            headList.add("postcount");
            headList.add("bookcount");
            headList.add("media_domain");
            headList.add("authentication");
            headList.add("summary");
            PoiExcelUtil poi = PoiExcelUtil.getInstance();
            String path = "D://一点号账号信息.xlsx";
            poi.exportExcel(path ,"账号信息", headList, bodyList);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Searches channels by keyword and fetches the account details of every hit.
     *
     * @param word the search keyword; it is URL-encoded as UTF-8 before being sent
     * @return one map per account whose details could be fetched; never {@code null},
     *         empty when the response is blank or has no {@code channels}
     * @throws IOException if the search request or reading its body fails
     */
    public static List<Map<String,Object>> getYiDianZiXunAccountByWord(String word) throws IOException{
        List<Map<String,Object>> dataList = new ArrayList<>();
        String url = "http://www.yidianzixun.com/home/q/search_channel?word="+URLCodeUtil.getURLEncode(word, "utf-8")+"&&appid=web_yidian";
        String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url),ProxyHolder.NAT_PROXY).body().string();
        if (StringUtils.isBlank(htmlBody) || !htmlBody.contains("channels")) {
            return dataList;
        }
        JSONObject root = JSONObject.parseObject(htmlBody);
        collectAccounts(root == null ? null : root.getJSONArray("channels"), dataList);
        return dataList;
    }

    /**
     * Walks every category of the media list page and fetches the account details of
     * each channel listed under it.
     *
     * @return one map per account whose details could be fetched; never {@code null},
     *         empty when the page is blank or the embedded JSON blob cannot be located
     * @throws IOException if the page request or reading its body fails
     */
    public static List<Map<String,Object>> getYiDianZiXunAccountByType() throws IOException{
        List<Map<String,Object>> dataList = new ArrayList<>();
        String url = "http://www.yidianzixun.com/medialist";
        String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url),ProxyHolder.NAT_PROXY).body().string();
        if (StringUtils.isBlank(htmlBody) || !htmlBody.contains("channels")) {
            return dataList;
        }
        // Locate the embedded JSON with plain-text search. String.split would treat
        // the marker as a regex and throw when the marker is missing.
        int start = htmlBody.indexOf(DOCINFO_START);
        if (start < 0) {
            return dataList;
        }
        start += DOCINFO_START.length();
        int end = htmlBody.indexOf(DOCINFO_END, start);
        String docInfo = end < 0 ? htmlBody.substring(start) : htmlBody.substring(start, end);

        JSONObject root = JSONObject.parseObject(docInfo);
        JSONArray categories = root == null ? null : root.getJSONArray("categories");
        if (categories == null) {
            return dataList;
        }
        for (int i = 0; i < categories.size(); i++) {
            JSONObject category = categories.getJSONObject(i);
            collectAccounts(category == null ? null : category.getJSONArray("channels"), dataList);
        }
        return dataList;
    }

    /**
     * Fetches the account details for every channel entry that has an {@code id} and
     * appends the non-empty results to {@code dataList}.
     */
    private static void collectAccounts(JSONArray channels, List<Map<String,Object>> dataList) {
        if (channels == null) {
            return;
        }
        for (int i = 0; i < channels.size(); i++) {
            JSONObject json = channels.getJSONObject(i);
            if (json == null || !json.containsKey("id")) {
                continue;
            }
            Map<String,Object> dataMap = getYidianAccountInfo(json.getString("id"));
            System.out.println(dataMap);
            // An empty map means the account could not be fetched; leave it out of the
            // result (presumably it would otherwise export as a blank row; verify).
            if (!dataMap.isEmpty()) {
                dataList.add(dataMap);
            }
        }
    }

    /**
     * Fetches the account details of one channel.
     *
     * @param id the channel id
     * @return a map with the keys {@code bookcount}, {@code media_id}, {@code media_name},
     *         {@code avatorUrl}, {@code summary}, {@code authentication},
     *         {@code media_domain}, {@code postcount} and {@code userid}; an empty map
     *         when the request fails or the response lacks {@code channel_media}
     */
    private static Map<String,Object> getYidianAccountInfo(String id){
        Map<String,Object> dataMap = new HashMap<String,Object>();
        String spt = getSpt(id, 0, 10);
        if (spt == null) {
            // Without a signed path there is nothing valid to request.
            return dataMap;
        }
        String url = "http://www.yidianzixun.com"+spt+"&appid=web_yidian";
        Map<String,Object> headMap = new HashMap<String,Object>();
        headMap.put("Referer", "http://www.yidianzixun.com/channel/"+id);
        try {
            String htmlBody = httpBoot.syncCall(RequestUtils.wrapGet(url,headMap),ProxyHolder.NAT_PROXY).body().string();
            if (StringUtils.isBlank(htmlBody) || !htmlBody.contains("channel_media")) {
                return dataMap;
            }
            JSONObject dataJson = JSONObject.parseObject(htmlBody);
            JSONObject channelMedia = dataJson == null ? null : dataJson.getJSONObject("channel_media");
            if (channelMedia == null) {
                return dataMap;
            }
            dataMap.put("bookcount", parseFollowCount(dataJson.getString("bookcount")));
            dataMap.put("media_id", dataJson.getString("channel_id"));
            dataMap.put("media_name", dataJson.getString("channel_name"));
            dataMap.put("avatorUrl", dataJson.getString("channel_image"));
            dataMap.put("summary", dataJson.getString("channel_summary"));
            dataMap.put("authentication", channelMedia.getString("authentication"));
            dataMap.put("media_domain", channelMedia.getString("media_domain"));
            // Stored as boxed values so a missing field becomes null rather than an
            // unboxing NullPointerException that aborts the whole crawl.
            dataMap.put("postcount", channelMedia.getInteger("postcount"));
            dataMap.put("userid", channelMedia.getLong("userid"));
        } catch (IOException e) {
            e.printStackTrace();
        }
        return dataMap;
    }

    /**
     * Converts a subscriber label such as {@code "1.2万人订阅"} or {@code "345人订阅"} into a
     * number. {@code 万} multiplies by 10000; the product is rounded so that floating
     * point error cannot drop one subscriber.
     *
     * @return the subscriber count, or 0 when the label is missing or not numeric
     */
    private static int parseFollowCount(String bookcount) {
        if (StringUtils.isBlank(bookcount)) {
            return 0;
        }
        String text = bookcount.replace("人订阅", "").trim();
        try {
            if (text.contains("万")) {
                return (int) Math.round(Double.parseDouble(text.replace("万", "")) * 10000);
            }
            return (int) Double.parseDouble(text);
        } catch (NumberFormatException e) {
            return 0;
        }
    }

    /**
     * Builds the request path for a channel's news list, including its {@code _spt}
     * signature parameter.
     *
     * <p>This is a pure-Java port of the site's JavaScript routine: the text
     * {@code "sptoken" + channel_id + cstart + cend} has every UTF-16 code unit XOR-ed
     * with 10 and is then encoded the way JavaScript's {@code encodeURIComponent} does.
     * Doing it in Java removes the dependency on a "javascript" script engine, which
     * newer JDKs no longer bundle.
     *
     * @param channel_id the channel id
     * @param cstart index of the first item requested
     * @param cend index after the last item requested
     * @return the path and query string including {@code _spt}, or {@code null} if
     *         UTF-8 encoding is unavailable
     */
    private static String getSpt(String channel_id ,int cstart, int cend) {
        // cend is used in the URL as well, so the URL and the signed token always agree.
        String n = "/home/q/news_list_for_channel?channel_id="+channel_id+"&cstart="+cstart+"&cend="+cend+"&infinite=true&refresh=1&__from__=pc&multi=5";
        String plain = "sptoken" + channel_id + cstart + cend;
        StringBuilder masked = new StringBuilder(plain.length());
        for (int i = 0; i < plain.length(); i++) {
            masked.append((char) (plain.charAt(i) ^ 10));
        }
        try {
            // URLEncoder differs from encodeURIComponent in a handful of characters:
            // it writes space as '+' and escapes ! ' ( ) ~, which JavaScript leaves alone.
            String encoded = java.net.URLEncoder.encode(masked.toString(), "UTF-8")
                    .replace("+", "%20")
                    .replace("%21", "!")
                    .replace("%27", "'")
                    .replace("%28", "(")
                    .replace("%29", ")")
                    .replace("%7E", "~");
            // n always contains '?', so the signature is appended with '&'.
            return n + "&_spt=" + encoded;
        } catch (java.io.UnsupportedEncodingException e) {
            e.printStackTrace();
            return null;
        }
    }
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- Log level for log4j2's own internal status messages -->
<Configuration status="WARN">
<Appenders>
<!-- Define where log output is written -->
<Console name="Console" target="SYSTEM_OUT">
<PatternLayout pattern="%d{yyyy-MM-dd HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n" />
</Console>
</Appenders>
<Loggers>
<Root level="info">
<AppenderRef ref="Console" />
</Root>
<!-- Every logger inherits from Root.
When additivity is true, both the parent and the child logger print.
When additivity is false, only the child logger prints. -->
<Logger name = "mylog" level="error" additivity="false">
<AppenderRef ref="Console" />
</Logger>
</Loggers>
</Configuration>
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment