本文主要說明Redis中哨兵Sentinel的設計與實現(xiàn)。
建議閱讀:
1、Sentinel的理論部分見:Redis之Sentinel
I、上帝視角
1、Sentinel也是Redis服務器,只是與普通服務器職責不同,其負責監(jiān)視Redis服務器,以提高服務器集群的可靠性。Sentinel與普通服務器共用一套框架(網絡框架,底層數(shù)據(jù)結構,訂閱與發(fā)布機制),但又有其獨立的運行代碼。
為維護Sentinel系統(tǒng)的正常運行,我們先來看Redis為Sentinel維護了怎樣的數(shù)據(jù)結構:
/* Main state. */
/* Global Sentinel state. */
/* src/sentinel.c, struct sentinelState */
struct sentinelState {
// Current epoch, used during failover leader election.
uint64_t current_epoch; /* Current epoch. */
// All masters monitored by this Sentinel.
// Key: master name; value: pointer to a sentinelRedisInstance.
dict *masters; /* Dictionary of master sentinelRedisInstances.
Key is the instance name, value is the
sentinelRedisInstance structure pointer. */
// Non-zero if we are currently in TILT mode.
int tilt; /* Are we in TILT mode? */
// Number of user scripts currently running.
int running_scripts; /* Number of scripts in execution right now. */
// Time at which TILT mode was entered.
mstime_t tilt_start_time; /* When TILT started. */
// Last time the time handler ran.
mstime_t previous_time; /* Last time we ran the time handler. */
// FIFO queue of user scripts waiting to be executed.
list *scripts_queue; /* Queue of user scripts to execute. */
} sentinel;
2、從主函數(shù)main中可以看到服務器是如何向Sentinel轉化的:
/* src/redis.c/main (excerpt) */
int main(int argc, char **argv) {
// Seed the PRNG used by rand().
srand(time(NULL)^getpid());
gettimeofday(&tv,NULL);
dictSetHashFunctionSeed(tv.tv_sec^tv.tv_usec^getpid());
// Decide from the command line whether to start in Sentinel mode.
server.sentinel_mode = checkForSentinelMode(argc,argv);
// Initialize the default configuration (fills the redisServer struct).
initServerConfig();
// Configure the server as a Sentinel (differs from a plain Redis server).
/* We need to init sentinel right now as parsing the configuration file
 * in sentinel mode will have the effect of populating the sentinel
 * data structures with master nodes to monitor. */
if (server.sentinel_mode) {
// initSentinelConfig() only sets the Sentinel-specific port.
initSentinelConfig();
initSentinel();
}
......
// Plain Redis server mode.
if (!server.sentinel_mode) {
......
// Sentinel mode.
} else {
// Sanity-check that Sentinel mode is properly configured.
sentinelIsRunning();
}
......
// Enter the event loop.
aeMain(server.el);
// Tear down the event loop.
aeDeleteEventLoop(server.el);
return 0;
}
II、Sentinel的初始化
1、在上面的程序中,可以看出,如果檢查到需要使用Sentinel模式時,會調用initSentinel函數(shù)對Sentinel服務器進行特有的初始化:
/* Perform the Sentinel mode initialization. */
// Initialize the server in Sentinel mode.
/* src/sentinel.c/initSentinel */
void initSentinel(void) {
int j;
/* Remove usual Redis commands from the command table, then just add
 * the SENTINEL command. */
// Empty the normal-mode Redis command table.
dictEmpty(server.commands,NULL);
// Register the commands available in Sentinel mode.
for (j = 0; j < sizeof(sentinelcmds)/sizeof(sentinelcmds[0]); j++) {
int retval;
struct redisCommand *cmd = sentinelcmds+j;
retval = dictAdd(server.commands, sdsnew(cmd->name), cmd);
redisAssert(retval == DICT_OK);
}
/* Initialize various data structures. */
/* Initialize the global Sentinel state. */
// Start at epoch 0.
sentinel.current_epoch = 0;
// Dictionary of monitored masters.
sentinel.masters = dictCreate(&instancesDictType,NULL);
// TILT-mode related fields.
sentinel.tilt = 0;
sentinel.tilt_start_time = 0;
sentinel.previous_time = mstime();
// Script-execution related fields.
sentinel.running_scripts = 0;
sentinel.scripts_queue = listCreate();
}
2、為了能讓Sentinel自動管理Redis服務器,在serverCorn函數(shù)中添加了一個定時程序:
/* src/redis.c/serverCron (excerpt) */
int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
......
run_with_period(100) {
// sentinelTimer() is the Sentinel main entry point, run every 100 ms.
if (server.sentinel_mode) sentinelTimer();
}
}
III、Sentinel主函數(shù):sentinelTimer
sentinelTimer所做的工作包括:監(jiān)視普通Redis服務器,執(zhí)行故障轉移,執(zhí)行腳本命令。
// Main entry point for Sentinel mode, called from redis.c/serverCron.
/* src/sentinel.c/sentinelTimer */
void sentinelTimer(void) {
// Record the time of this call and decide whether
// to enter TILT mode.
sentinelCheckTiltCondition();
// Perform the periodic operations:
// PING instances, parse INFO replies from masters and slaves,
// exchange hello messages with other Sentinels monitoring
// the same masters, and run failover steps.
sentinelHandleDictOfRedisInstances(sentinel.masters);
// Run pending user scripts.
sentinelRunPendingScripts();
// Reap finished scripts and retry the ones that failed.
sentinelCollectTerminatedScripts();
// Kill scripts that have been running for too long.
sentinelKillTimedoutScripts();
/* We continuously change the frequency of the Redis "timer interrupt"
 * in order to desynchronize every Sentinel from every other.
 * This non-determinism avoids that Sentinels started at the same time
 * exactly continue to stay synchronized asking to be voted at the
 * same time again and again (resulting in nobody likely winning the
 * election because of split brain voting). */
server.hz = REDIS_DEFAULT_HZ + rand() % REDIS_DEFAULT_HZ;
}
IV、Sentinel與Redis服務器的連接
1、每個Sentinel都可以與多個Redis服務器連接,其為每個Redis服務器都維護了一個struct sentinelRedisInstance:
// Sentinel creates one sentinelRedisInstance per monitored Redis instance
// (a master, a slave, or another Sentinel).
typedef struct sentinelRedisInstance {
......
/* Master specific. */
// Other Sentinels monitoring the same master.
dict *sentinels; /* Other sentinels monitoring the same master. */
// Slaves of this master.
dict *slaves; /* Slaves for this master instance. */
......
// If this instance is a slave, this points to its master.
struct sentinelRedisInstance *master; /* Master instance if it's slave. */
......
} sentinelRedisInstance;
可見,Sentinel可監(jiān)視的實例可以是主服務器,從服務器,或者其他Sentinel,下圖表示了一個完整的sentinel.masters結構:

2、Sentinel要想對某個Redis服務器進行監(jiān)視,則首先要做的就是先對Redis服務器進行連接,在連接之前需要完成配置工作(如IP,port)
假如需要對一個Redis服務器進行監(jiān)視,則需要在配置文件中寫入:
sentinel monitor <master-name> <ip> <redis-port> <quorum>
上述命令中quorum參數(shù)是Sentinel用來判斷Redis服務器是否下線的參數(shù),對以上命令的解析與配置是通過調用函數(shù)sentinelHandleConfiguration完成的:
// Parse and apply one directive of the Sentinel configuration file.
/* src/sentinel.c/sentinelHandleConfiguration (excerpt) */
char *sentinelHandleConfiguration(char **argv, int argc) {
sentinelRedisInstance *ri;
if (!strcasecmp(argv[0],"monitor") && argc == 5) {
/* monitor <name> <host> <port> <quorum> */
int quorum = atoi(argv[4]);
// quorum must be at least 1
if (quorum <= 0) return "Quorum must be 1 or greater.";
// Register the master; on failure map errno to an error string.
if (createSentinelRedisInstance(argv[1],SRI_MASTER,argv[2],
atoi(argv[3]),quorum,NULL) == NULL)
{
switch(errno) {
case EBUSY: return "Duplicated master name.";
case ENOENT: return "Can't resolve master instance hostname.";
case EINVAL: return "Invalid port number";
}
}
......
}
sentinelHandleConfiguration主要調用了createSentinelRedisInstance函數(shù),這個函數(shù)的工作就是初始化sentinelRedisInstance結構體。
/* ========================== sentinelRedisInstance ========================= */
/* Create a redis instance, the following fields must be populated by the
* caller if needed:
*
* 創(chuàng)建一個 Redis 實例,在有需要時,以下兩個域需要由調用者填充:
*
* runid: set to NULL but will be populated once INFO output is received.
* 設置為 NULL ,并在接收到 INFO 命令的回復時設置
*
* info_refresh: is set to 0 to mean that we never received INFO so far.
* 如果這個值為 0 ,那么表示我們未收到過 INFO 信息。
*
* If SRI_MASTER is set into initial flags the instance is added to
* sentinel.masters table.
*
* 如果 flags 參數(shù)為 SRI_MASTER ,
* 那么這個實例會被添加到 sentinel.masters 表。
*
* if SRI_SLAVE or SRI_SENTINEL is set then 'master' must be not NULL and the
* instance is added into master->slaves or master->sentinels table.
*
* 如果 flags 為 SRI_SLAVE 或者 SRI_SENTINEL ,
* 那么 master 參數(shù)不能為 NULL ,
* SRI_SLAVE 類型的實例會被添加到 master->slaves 表中,
* 而 SRI_SENTINEL 類型的實例則會被添加到 master->sentinels 表中。
*
* If the instance is a slave or sentinel, the name parameter is ignored and
* is created automatically as hostname:port.
*
* 如果實例是從服務器或者 sentinel ,那么 name 參數(shù)會被自動忽略,
* 實例的名字會被自動設置為 hostname:port 。
*
* The function fails if hostname can't be resolved or port is out of range.
* When this happens NULL is returned and errno is set accordingly to the
* createSentinelAddr() function.
*
* 當 hostname 不能被解釋,或者超出范圍時,函數(shù)將失敗。
* 函數(shù)將返回 NULL ,并設置 errno 變量,
* 具體的出錯值請參考 createSentinelAddr() 函數(shù)。
*
* The function may also fail and return NULL with errno set to EBUSY if
* a master or slave with the same name already exists.
*
* 當相同名字的主服務器或者從服務器已經存在時,函數(shù)返回 NULL ,
* 并將 errno 設為 EBUSY 。
*/
sentinelRedisInstance *createSentinelRedisInstance(char *name, int flags, char *hostname, int port, int quorum, sentinelRedisInstance *master) {
    sentinelRedisInstance *ri;
    sentinelAddr *addr;
    dict *table = NULL;
    char slavename[128], *sdsname;

    redisAssert(flags & (SRI_MASTER|SRI_SLAVE|SRI_SENTINEL));
    redisAssert((flags & SRI_MASTER) || master != NULL);

    /* Check address validity. */
    // Resolve hostname/port into a sentinelAddr; on failure errno is
    // already set by createSentinelAddr().
    addr = createSentinelAddr(hostname,port);
    if (addr == NULL) return NULL;

    /* For slaves and sentinel we use ip:port as name. */
    // Slaves and Sentinels are named "host:port" ("[host]:port" for IPv6).
    if (flags & (SRI_SLAVE|SRI_SENTINEL)) {
        snprintf(slavename,sizeof(slavename),
            strchr(hostname,':') ? "[%s]:%d" : "%s:%d",
            hostname,port);
        name = slavename;
    }

    /* Make sure the entry is not duplicated. This may happen when the same
     * name for a master is used multiple times inside the configuration or
     * if we try to add multiple times a slave or sentinel with same ip/port
     * to a master. */
    // Pick the destination table: masters go into sentinel.masters, while
    // slaves and Sentinels go into the tables of their master.
    if (flags & SRI_MASTER) table = sentinel.masters;
    else if (flags & SRI_SLAVE) table = master->slaves;
    else if (flags & SRI_SENTINEL) table = master->sentinels;
    sdsname = sdsnew(name);
    if (dictFind(table,sdsname)) {
        // Duplicated instance: report EBUSY to the caller.
        sdsfree(sdsname);
        errno = EBUSY;
        return NULL;
    }

    /* Create the instance object. */
    ri = zmalloc(sizeof(*ri));
    /* Note that all the instances are started in the disconnected state,
     * the event loop will take care of connecting them. */
    ri->flags = flags | SRI_DISCONNECTED;
    ri->name = sdsname;
    ri->runid = NULL;
    ri->config_epoch = 0;
    ri->addr = addr;
    ri->cc = NULL;
    ri->pc = NULL;
    ri->pending_commands = 0;
    ri->cc_conn_time = 0;
    ri->pc_conn_time = 0;
    ri->pc_last_activity = 0;
    /* We set the last_ping_time to "now" even if we actually don't have yet
     * a connection with the node, nor we sent a ping.
     * This is useful to detect a timeout in case we'll not be able to connect
     * with the node at all. */
    ri->last_ping_time = mstime();
    ri->last_avail_time = mstime();
    ri->last_pong_time = mstime();
    ri->last_pub_time = mstime();
    ri->last_hello_time = mstime();
    ri->last_master_down_reply_time = mstime();
    ri->s_down_since_time = 0;
    ri->o_down_since_time = 0;
    // Slaves inherit down-after-milliseconds from their master.
    ri->down_after_period = master ? master->down_after_period :
                            SENTINEL_DEFAULT_DOWN_AFTER;
    ri->master_link_down_time = 0;
    ri->auth_pass = NULL;
    ri->slave_priority = SENTINEL_DEFAULT_SLAVE_PRIORITY;
    ri->slave_reconf_sent_time = 0;
    ri->slave_master_host = NULL;
    ri->slave_master_port = 0;
    ri->slave_master_link_status = SENTINEL_MASTER_LINK_STATUS_DOWN;
    ri->slave_repl_offset = 0;
    ri->sentinels = dictCreate(&instancesDictType,NULL);
    ri->quorum = quorum;
    ri->parallel_syncs = SENTINEL_DEFAULT_PARALLEL_SYNCS;
    ri->master = master;
    ri->slaves = dictCreate(&instancesDictType,NULL);
    ri->info_refresh = 0;

    /* Failover state. */
    ri->leader = NULL;
    ri->leader_epoch = 0;
    ri->failover_epoch = 0;
    ri->failover_state = SENTINEL_FAILOVER_STATE_NONE;
    ri->failover_state_change_time = 0;
    ri->failover_start_time = 0;
    ri->failover_timeout = SENTINEL_DEFAULT_FAILOVER_TIMEOUT;
    ri->failover_delay_logged = 0;
    ri->promoted_slave = NULL;
    ri->notification_script = NULL;
    ri->client_reconfig_script = NULL;

    /* Role */
    ri->role_reported = ri->flags & (SRI_MASTER|SRI_SLAVE);
    ri->role_reported_time = mstime();
    ri->slave_conf_change_time = mstime();

    /* Add into the right table. */
    // FIX: in the transcription this call had been fused onto a comment
    // line (effectively commenting it out); it must execute, otherwise the
    // new instance is never registered in its table.
    dictAdd(table, ri->name, ri);

    // Return the newly created instance.
    return ri;
}
3、在這里Sentinel并沒有馬上去連接Redis服務器,而只是將sentinelRedisInstance.flags狀態(tài)標記為了SRI_DISCONNECTED,真正的連接工作其實在定時程序中,因為無論是主從服務器之間的連接,還是Sentinel與Redis服務器之間的連接,要想保持其連接狀態(tài),就需要定期檢查,所以就直接將連接放到了定時程序中統(tǒng)一處理。
調用過程如下:
sentinelTimer()->sentinelHandleDictOfRedisInstances()->sentinelHandleRedisInstance()->sentinelReconnectInstance()
sentinelReconnectInstance()函數(shù)的作用就是連接標記為SRI_DISCONNECTED的服務器,其對Redis發(fā)起了兩種連接:
· 普通連接:用于向主服務器發(fā)布Sentinel的命令,并接收回復(這里Sentinel是主服務器的客戶端)。
· 訂閱與發(fā)布專用連接:用于訂閱主服務器的__sentinel__:hello頻道。這是因為Redis的發(fā)布與訂閱功能中,被發(fā)布的信息不會保存在Redis服務器里面,因此,為了不丟失__sentinel__:hello頻道的任何信息,Sentinel專門用一個連接來接收。
/* Create the async connections for the specified instance if the instance
 * is disconnected. Note that the SRI_DISCONNECTED flag is set even if just
 * one of the two links (commands and pub/sub) is missing. */
// Create the async links to an instance flagged as disconnected.
/* src/sentinel.c/sentinelReconnectInstance */
void sentinelReconnectInstance(sentinelRedisInstance *ri) {
// Nothing to do if the instance is not flagged as disconnected.
if (!(ri->flags & SRI_DISCONNECTED)) return;
/* Commands connection. */
// Command link: created for every instance type
// (masters, slaves and other Sentinels).
if (ri->cc == NULL) {
// Connect to the instance.
ri->cc = redisAsyncConnect(ri->addr->ip,ri->addr->port);
// Connection error.
if (ri->cc->err) {
sentinelEvent(REDIS_DEBUG,"-cmd-link-reconnection",ri,"%@ #%s",
ri->cc->errstr);
sentinelKillLink(ri,ri->cc);
// Connection established.
} else {
// Record link properties.
ri->cc_conn_time = mstime();
ri->cc->data = ri;
redisAeAttach(server.el,ri->cc);
// Register the connect callback.
redisAsyncSetConnectCallback(ri->cc,
sentinelLinkEstablishedCallback);
// Register the disconnect callback.
redisAsyncSetDisconnectCallback(ri->cc,
sentinelDisconnectCallback);
// Send AUTH if a password is configured.
sentinelSendAuthIfNeeded(ri,ri->cc);
sentinelSetClientName(ri,ri->cc,"cmd");
/* Send a PING ASAP when reconnecting. */
sentinelSendPing(ri);
}
}
/* Pub / Sub */
// Pub/Sub link: created only for masters and slaves.
if ((ri->flags & (SRI_MASTER|SRI_SLAVE)) && ri->pc == NULL) {
// Connect to the instance.
ri->pc = redisAsyncConnect(ri->addr->ip,ri->addr->port);
// Connection error.
if (ri->pc->err) {
sentinelEvent(REDIS_DEBUG,"-pubsub-link-reconnection",ri,"%@ #%s",
ri->pc->errstr);
sentinelKillLink(ri,ri->pc);
// Connection established.
} else {
int retval;
// Record link properties.
ri->pc_conn_time = mstime();
ri->pc->data = ri;
redisAeAttach(server.el,ri->pc);
// Register the connect callback.
redisAsyncSetConnectCallback(ri->pc,
sentinelLinkEstablishedCallback);
// Register the disconnect callback.
redisAsyncSetDisconnectCallback(ri->pc,
sentinelDisconnectCallback);
// Send AUTH if a password is configured.
sentinelSendAuthIfNeeded(ri,ri->pc);
// Name this client "pubsub".
sentinelSetClientName(ri,ri->pc,"pubsub");
/* Now we subscribe to the Sentinels "Hello" channel. */
// Issue SUBSCRIBE __sentinel__:hello.
retval = redisAsyncCommand(ri->pc,
sentinelReceiveHelloMessages, NULL, "SUBSCRIBE %s",
SENTINEL_HELLO_CHANNEL);
// Subscription failed: drop the link and retry later.
if (retval != REDIS_OK) {
/* If we can't subscribe, the Pub/Sub connection is useless
 * and we can simply disconnect it and try again. */
sentinelKillLink(ri,ri->pc);
return;
}
}
}
/* Clear the DISCONNECTED flags only if we have both the connections
 * (or just the commands connection if this is a sentinel instance). */
// Masters/slaves need both cc and pc before the flag is cleared;
// Sentinel instances only need the command link.
if (ri->cc && (ri->flags & SRI_SENTINEL || ri->pc))
ri->flags &= ~SRI_DISCONNECTED;
}
4、上述代碼中可以看出,Sentinel對主從服務器需要維護兩個連接,而對其他Sentinel只需要維護命令連接,這是因為訂閱連接的作用其實是為了自動發(fā)現(xiàn):
一個Sentinel可以通過分析接收到的訂閱頻道信息來獲知其他Sentinel的存在,并通過發(fā)送頻道信息來讓其他Sentinel知道自己的存在(將信息發(fā)送給主從服務器,主從服務器發(fā)布信息,使得所有監(jiān)視服務器的Sentinel獲知信息),所以用戶在使用Sentinel的時候不需要提供各個Sentinel的地址信息,監(jiān)視同一個服務器的多個Sentinel可以自動發(fā)現(xiàn)對方,只需要維護一個命令連接進行通信就足夠了。
V、HELLO
1、從上面的sentinelReconnectInstance中可以看出,Sentinel初始化訂閱連接的時候進行了兩個操作,一是向服務器發(fā)送了訂閱__sentinel__:hello頻道的SUBSCRIBE命令,二是注冊了回調函數(shù)sentinelReceiveHelloMessages,這個函數(shù)的功能就是處理訂閱頻道的返回值,從而完成自動發(fā)現(xiàn)。
2、在定時程序中sentinelTimer()->sentinelHandleDictOfRedisInstances()->sentinelHandleRedisInstance()->sentinelSendPeriodicCommands()中,Sentinel會向服務器的hello頻道發(fā)布數(shù)據(jù),其中由sentinelSendHello函數(shù)實現(xiàn):
/* src/sentinel.c/sentinelSendHello */
/* Send an "Hello" message via Pub/Sub to the specified 'ri' Redis
 * instance in order to broadcast the current configuration for this
 * master, and to advertise the existence of this Sentinel at the same time.
 *
 * The message has the following format:
 *
 * sentinel_ip,sentinel_port,sentinel_runid,current_epoch,
 * master_name,master_ip,master_port,master_config_epoch.
 *
 * Returns REDIS_OK if the PUBLISH was queued correctly, otherwise
 * REDIS_ERR is returned.
 */
int sentinelSendHello(sentinelRedisInstance *ri) {
char ip[REDIS_IP_STR_LEN];
char payload[REDIS_IP_STR_LEN+1024];
int retval;
// If 'ri' is a master use it directly; if it is a slave,
// use the information of its master instead.
sentinelRedisInstance *master = (ri->flags & SRI_MASTER) ? ri : ri->master;
// Address of the current master.
sentinelAddr *master_addr = sentinelGetCurrentMasterAddress(master);
/* Try to obtain our own IP address. */
// Local address of the command-link socket.
if (anetSockName(ri->cc->c.fd,ip,sizeof(ip),NULL) == -1) return REDIS_ERR;
if (ri->flags & SRI_DISCONNECTED) return REDIS_ERR;
/* Format and send the Hello message. */
// Build the payload.
snprintf(payload,sizeof(payload),
"%s,%d,%s,%llu," /* Info about this sentinel. */
"%s,%s,%d,%llu", /* Info about current master. */
ip, server.port, server.runid,
(unsigned long long) sentinel.current_epoch,
/* --- */
master->name,master_addr->ip,master_addr->port,
(unsigned long long) master->config_epoch);
// Publish it on the hello channel.
retval = redisAsyncCommand(ri->cc,
sentinelPublishReplyCallback, NULL, "PUBLISH %s %s",
SENTINEL_HELLO_CHANNEL,payload);
if (retval != REDIS_OK) return REDIS_ERR;
ri->pending_commands++;
return REDIS_OK;
}
2、當Redis收到來自Sentinel的發(fā)布信息時,就會向所有訂閱hello頻道的Sentinel發(fā)布數(shù)據(jù),于是剛才所注冊的回調函數(shù)sentinelReceiveHelloMessages就被調用,其主要做了兩方面的工作:
· 發(fā)現(xiàn)了其他監(jiān)視此服務器的Sentinel;
· 更新配置信息;
VI、INFO
1、Sentinel會以十秒一次的頻率首先向所監(jiān)視的主機發(fā)送INFO命令:
其調用過程如下:
sentinelTimer()->sentinelHandleDictOfRedisInstances()->sentinelHandleRedisInstance()->sentinelSendPeriodicCommands()
這其中,Sentinel同樣做了兩件事,一個是發(fā)送了INFO命令,另一個是注冊了sentinelInfoReplyCallback()回調函數(shù)。
當INFO命令返回時,收到了來自服務器的回復(包括主機的相關信息,以及主機所連接的從服務器),回調函數(shù)被調用,主要是完成對服務器回復信息的處理(這其中包括,主從復制信息,存儲的鍵值對數(shù)量,Sentinel判斷是否下線等),并根據(jù)獲取到所的從服務器信息實現(xiàn)對從服務器的監(jiān)視。這也是Sentinel自動發(fā)現(xiàn)的部分。
VII、心跳檢測
1、心跳檢測是判斷兩臺機器是否連接正常的常用手段。接收方在收到心跳包之后,會更新收到心跳的時間;如果在某個時間點檢測到心跳包很久沒有收到(超時),則說明網絡狀況不好,或對方很忙,這也為接下來的行動提供指導,如延遲所需要進行的后續(xù)操作,直到心跳檢測恢復正常。
VIII、在線狀態(tài)監(jiān)測
1、Sentinel根據(jù)主觀判斷與客觀判斷來完成在線狀態(tài)監(jiān)測:
主觀下線:是根據(jù)Sentinel自己觀測某個服務器的信息;
客觀下線:是通過綜合所有監(jiān)測某服務器的Sentinel的信息;
這同樣是通過心跳檢測發(fā)送PING實現(xiàn)的。
2、主觀下線判斷
/* src/sentinel.c/sentinelCheckSubjectivelyDown */
/* Is this instance down from our point of view? */
// Check whether the instance is subjectively down (from this Sentinel's view).
void sentinelCheckSubjectivelyDown(sentinelRedisInstance *ri) {
mstime_t elapsed = 0;
if (ri->last_ping_time)
elapsed = mstime() - ri->last_ping_time;
/* Check if we are in need for a reconnection of one of the
 * links, because we are detecting low activity.
 *
 * If a link shows low activity, drop it so that it gets reconnected.
 *
 * 1) Check if the command link seems connected, was connected not less
 * than SENTINEL_MIN_LINK_RECONNECT_PERIOD, but still we have a
 * pending ping for more than half the timeout. */
// Consider dropping the command (cc) link.
if (ri->cc &&
(mstime() - ri->cc_conn_time) > SENTINEL_MIN_LINK_RECONNECT_PERIOD &&
ri->last_ping_time != 0 && /* There is a pending ping... */
/* The pending ping is delayed, and we did not received
 * error replies as well. */
(mstime() - ri->last_ping_time) > (ri->down_after_period/2) &&
(mstime() - ri->last_pong_time) > (ri->down_after_period/2))
{
sentinelKillLink(ri,ri->cc);
}
/* 2) Check if the pubsub link seems connected, was connected not less
 * than SENTINEL_MIN_LINK_RECONNECT_PERIOD, but still we have no
 * activity in the Pub/Sub channel for more than
 * SENTINEL_PUBLISH_PERIOD * 3.
 */
// Consider dropping the Pub/Sub (pc) link.
if (ri->pc &&
(mstime() - ri->pc_conn_time) > SENTINEL_MIN_LINK_RECONNECT_PERIOD &&
(mstime() - ri->pc_last_activity) > (SENTINEL_PUBLISH_PERIOD*3))
{
sentinelKillLink(ri,ri->pc);
}
/* Update the SDOWN flag. We believe the instance is SDOWN if:
 *
 * 1) It is not replying.
 * 2) We believe it is a master, it reports to be a slave for enough time
 * to meet the down_after_period, plus enough time to get two times
 * INFO report from the instance.
 */
if (elapsed > ri->down_after_period ||
(ri->flags & SRI_MASTER &&
ri->role_reported == SRI_SLAVE &&
mstime() - ri->role_reported_time >
(ri->down_after_period+SENTINEL_INFO_PERIOD*2)))
{
/* Is subjectively down */
if ((ri->flags & SRI_S_DOWN) == 0) {
// Emit the +sdown event.
sentinelEvent(REDIS_WARNING,"+sdown",ri,"%@");
// Record when SDOWN started.
ri->s_down_since_time = mstime();
// Turn the SDOWN flag on.
ri->flags |= SRI_S_DOWN;
}
} else {
// Clear a previously set SDOWN state, if any.
/* Is subjectively up */
if (ri->flags & SRI_S_DOWN) {
// Emit the -sdown event.
sentinelEvent(REDIS_WARNING,"-sdown",ri,"%@");
// Clear the related flags.
ri->flags &= ~(SRI_S_DOWN|SRI_SCRIPT_KILL_SENT);
}
}
}
3、客觀下線判斷
/* src/sentinel.c/sentinelCheckObjectivelyDown */
/* Is this instance down according to the configured quorum?
 *
 * Note that ODOWN is a weak quorum, it only means that enough Sentinels
 * reported in a given time range that the instance was not reachable.
 *
 * However messages can be delayed so there are no strong guarantees about
 * N instances agreeing at the same time about the down state.
 */
void sentinelCheckObjectivelyDown(sentinelRedisInstance *master) {
dictIterator *di;
dictEntry *de;
int quorum = 0, odown = 0;
// Only if this Sentinel already considers the master subjectively down
// do we check whether enough other Sentinels agree.
if (master->flags & SRI_S_DOWN) {
/* Is down for enough sentinels? */
// Count the votes, starting with our own.
quorum = 1; /* the current sentinel. */
/* Count all the other sentinels. */
// Count every other Sentinel that reported the master as down.
di = dictGetIterator(master->sentinels);
while((de = dictNext(di)) != NULL) {
sentinelRedisInstance *ri = dictGetVal(de);
// This Sentinel also considers the master down.
if (ri->flags & SRI_MASTER_DOWN) quorum++;
}
dictReleaseIterator(di);
// Enough votes: the master is objectively down.
if (quorum >= master->quorum) odown = 1;
}
/* Set the flag accordingly to the outcome. */
if (odown) {
// Master is ODOWN.
if ((master->flags & SRI_O_DOWN) == 0) {
// Emit the +odown event.
sentinelEvent(REDIS_WARNING,"+odown",master,"%@ #quorum %d/%d",
quorum, master->quorum);
// Turn the ODOWN flag on.
master->flags |= SRI_O_DOWN;
// Record when ODOWN started.
master->o_down_since_time = mstime();
}
} else {
// Not ODOWN (anymore).
if (master->flags & SRI_O_DOWN) {
// The master was ODOWN before: clear the state.
// Emit the -odown event.
sentinelEvent(REDIS_WARNING,"-odown",master,"%@");
// Clear the ODOWN flag.
master->flags &= ~SRI_O_DOWN;
}
}
}
IX、故障修復
1、一般在Redis服務器集群中,只有主機同時肩負著讀請求和寫請求兩個功能,而從機只負責讀請求(從機的寫是通過主從復制中主機的命令傳播完成的)。所以當主機出現(xiàn)宕機時需要進行故障修復。
同樣是來源于sentinelTimer()定時函數(shù):
sentinelTimer()->sentinelHandleDictOfRedisInstances()->sentinelHandleRedisInstance()->sentinelStartFailoverIfNeeded() & sentinelFailoverStateMachine()
sentinelStartFailoverIfNeeded()函數(shù)在判斷主機客觀下線之后,決定是否執(zhí)行故障轉移操作,sentinelFailoverStateMachine()函數(shù)開始執(zhí)行故障轉移操作:
/* src/sentinel.c/sentinelFailoverStateMachine */
// Failover state machine: dispatch on the current failover state.
void sentinelFailoverStateMachine(sentinelRedisInstance *ri) {
redisAssert(ri->flags & SRI_MASTER);
// Nothing to do unless a failover is in progress.
if (!(ri->flags & SRI_FAILOVER_IN_PROGRESS)) return;
switch(ri->failover_state) {
case SENTINEL_FAILOVER_STATE_WAIT_START:
sentinelFailoverWaitStart(ri);
break;
case SENTINEL_FAILOVER_STATE_SELECT_SLAVE:
sentinelFailoverSelectSlave(ri);
break;
case SENTINEL_FAILOVER_STATE_SEND_SLAVEOF_NOONE:
sentinelFailoverSendSlaveOfNoOne(ri);
break;
case SENTINEL_FAILOVER_STATE_WAIT_PROMOTION:
sentinelFailoverWaitPromotion(ri);
break;
case SENTINEL_FAILOVER_STATE_RECONF_SLAVES:
sentinelFailoverReconfNextSlave(ri);
break;
}
}
上面的case是Sentinel故障轉移中的六種狀態(tài):

sentinelFailoverStateMachine就是根據(jù)這些狀態(tài)判斷故障轉移進行到了哪一步從而執(zhí)行相應的函數(shù),下面我們分別看這六個狀態(tài)對應需要完成的工作是什么。
9.1 WAIT_START
1、當一個主服務器被判斷為客觀下線時,監(jiān)視這個主服務器的各個Sentinel會進行協(xié)商,選舉出一個領頭Sentinel,并由領頭Sentinel對主服務器進行故障轉移操作。
此狀態(tài)下調用函數(shù)sentinelFailoverWaitStart所進行的工作主要是判斷自己是否為領頭Sentinel:
// Prepare to perform the failover.
/* src/sentinel.c/sentinelFailoverWaitStart */
void sentinelFailoverWaitStart(sentinelRedisInstance *ri) {
char *leader;
int isleader;
/* Check if we are the leader for the failover epoch. */
// Get the leader Sentinel for this failover epoch.
leader = sentinelGetLeader(ri, ri->failover_epoch);
// Are we the leader?
isleader = leader && strcasecmp(leader,server.runid) == 0;
sdsfree(leader);
/* If I'm not the leader, and it is not a forced failover via
 * SENTINEL FAILOVER, then I can't continue with the failover. */
// Not the leader and not a forced failover: stand by.
if (!isleader && !(ri->flags & SRI_FORCE_FAILOVER)) {
int election_timeout = SENTINEL_ELECTION_TIMEOUT;
/* The election timeout is the MIN between SENTINEL_ELECTION_TIMEOUT
 * and the configured failover timeout. */
if (election_timeout > ri->failover_timeout)
election_timeout = ri->failover_timeout;
/* Abort the failover if I'm not the leader after some time. */
// Election window elapsed without being elected: abort.
if (mstime() - ri->failover_start_time > election_timeout) {
sentinelEvent(REDIS_WARNING,"-failover-abort-not-elected",ri,"%@");
// Abort the failover.
sentinelAbortFailover(ri);
}
return;
}
// We are the leader: start the actual failover.
sentinelEvent(REDIS_WARNING,"+elected-leader",ri,"%@");
// Move to the slave-selection state.
ri->failover_state = SENTINEL_FAILOVER_STATE_SELECT_SLAVE;
ri->failover_state_change_time = mstime();
sentinelEvent(REDIS_WARNING,"+failover-state-select-slave",ri,"%@");
}
如果是領頭Sentinel則將狀態(tài)更新為SELECT_SLAVE。
9.2 SELECT_SLAVE
這個狀態(tài)即為選取從服務器作為新的主服務器:
// Select a suitable slave to promote to master.
void sentinelFailoverSelectSlave(sentinelRedisInstance *ri) {
// Pick the best candidate among the old master's slaves.
sentinelRedisInstance *slave = sentinelSelectSlave(ri);
/* We don't handle the timeout in this state as the function aborts
 * the failover or go forward in the next state. */
// No suitable slave: abort the failover.
if (slave == NULL) {
// No promotable slave available.
sentinelEvent(REDIS_WARNING,"-failover-abort-no-good-slave",ri,"%@");
// Abort the failover.
sentinelAbortFailover(ri);
} else {
// A new master was selected.
// Emit the event.
sentinelEvent(REDIS_WARNING,"+selected-slave",slave,"%@");
// Mark the slave as promoted.
slave->flags |= SRI_PROMOTED;
// Remember which slave was chosen.
ri->promoted_slave = slave;
// Advance the failover state.
ri->failover_state = SENTINEL_FAILOVER_STATE_SEND_SLAVEOF_NOONE;
// Record the state-change time.
ri->failover_state_change_time = mstime();
// Emit the event.
sentinelEvent(REDIS_NOTICE,"+failover-state-send-slaveof-noone",
slave, "%@");
}
}
此時狀態(tài)更新為SLAVEOF_NOONE。
9.3 SLAVEOF_NOONE
此狀態(tài)的工作是向選出來的新的主服務器發(fā)送SLAVEOF no one命令,使其成為真正的主服務器:
// Send SLAVEOF NO ONE to the selected slave
// to promote it to a real master.
/* src/sentinel.c/sentinelFailoverSendSlaveOfNoOne */
void sentinelFailoverSendSlaveOfNoOne(sentinelRedisInstance *ri) {
int retval;
/* We can't send the command to the promoted slave if it is now
 * disconnected. Retry again and again with this state until the timeout
 * is reached, then abort the failover. */
// If the promoted slave is disconnected, keep retrying until the
// failover timeout, then give up. (Rare: disconnected slaves are not
// selected, so this only happens if the link dropped between selection
// and the SLAVEOF NO ONE being sent.)
if (ri->promoted_slave->flags & SRI_DISCONNECTED) {
// Timeout reached: stop retrying.
if (mstime() - ri->failover_state_change_time > ri->failover_timeout) {
sentinelEvent(REDIS_WARNING,"-failover-abort-slave-timeout",ri,"%@");
sentinelAbortFailover(ri);
}
return;
}
/* Send SLAVEOF NO ONE command to turn the slave into a master.
 *
 * We actually register a generic callback for this command as we don't
 * really care about the reply. We check if it worked indirectly observing
 * if INFO returns a different role (master instead of slave).
 */
retval = sentinelSendSlaveOf(ri->promoted_slave,NULL,0);
if (retval != REDIS_OK) return;
sentinelEvent(REDIS_NOTICE, "+failover-state-wait-promotion",
ri->promoted_slave,"%@");
// Advance the state: now wait for the selected
// slave to actually turn into a master.
ri->failover_state = SENTINEL_FAILOVER_STATE_WAIT_PROMOTION;
// Record the state-change time.
ri->failover_state_change_time = mstime();
}
9.4 WAIT_PROMOTION
負責檢查時限,調用函數(shù)sentinelFailoverWaitPromotion只做了超時判斷,如果超時則停止故障修復:
/* We actually wait for promotion indirectly checking with INFO when the
 * slave turns into a master. */
// The role switch is detected by parsing the promoted slave's INFO reply;
// this function only enforces the failover timeout.
/* src/sentinel.c/sentinelFailoverWaitPromotion */
void sentinelFailoverWaitPromotion(sentinelRedisInstance *ri) {
/* Just handle the timeout. Switching to the next state is handled
 * by the function parsing the INFO command of the promoted slave. */
if (mstime() - ri->failover_state_change_time > ri->failover_timeout) {
sentinelEvent(REDIS_WARNING,"-failover-abort-slave-timeout",ri,"%@");
sentinelAbortFailover(ri);
}
}
9.5 RECONF_SLAVE
主要做的是向其他候選從服務器發(fā)送slaveof promote_slave,使其成為他們的主機:
/* Send SLAVE OF <new master address> to all the remaining slaves that
 * still don't appear to have the configuration updated. */
// Reconfigure the remaining slaves to replicate from the new master.
void sentinelFailoverReconfNextSlave(sentinelRedisInstance *master) {
dictIterator *di;
dictEntry *de;
int in_progress = 0;
// Count how many slaves are currently being reconfigured.
di = dictGetIterator(master->slaves);
while((de = dictNext(di)) != NULL) {
sentinelRedisInstance *slave = dictGetVal(de);
// SLAVEOF already sent, or sync in progress.
if (slave->flags & (SRI_RECONF_SENT|SRI_RECONF_INPROG))
in_progress++;
}
dictReleaseIterator(di);
// While fewer than parallel-syncs slaves are syncing,
// keep picking further slaves to reconfigure.
di = dictGetIterator(master->slaves);
while(in_progress < master->parallel_syncs &&
(de = dictNext(di)) != NULL)
{
sentinelRedisInstance *slave = dictGetVal(de);
int retval;
/* Skip the promoted slave, and already configured slaves. */
if (slave->flags & (SRI_PROMOTED|SRI_RECONF_DONE)) continue;
/* If too much time elapsed without the slave moving forward to
 * the next state, consider it reconfigured even if it is not.
 * Sentinels will detect the slave as misconfigured and fix its
 * configuration later. */
if ((slave->flags & SRI_RECONF_SENT) &&
(mstime() - slave->slave_reconf_sent_time) >
SENTINEL_SLAVE_RECONF_TIMEOUT)
{
// Reconfiguration timed out: emit the event...
sentinelEvent(REDIS_NOTICE,"-slave-reconf-sent-timeout",slave,"%@");
// ...and treat the slave as reconfigured anyway.
slave->flags &= ~SRI_RECONF_SENT;
slave->flags |= SRI_RECONF_DONE;
}
/* Nothing to do for instances that are disconnected or already
 * in RECONF_SENT state. */
// Skip slaves that already got SLAVEOF, are syncing, or are down.
if (slave->flags & (SRI_DISCONNECTED|SRI_RECONF_SENT|SRI_RECONF_INPROG))
continue;
/* Send SLAVEOF <new master>. */
// Point the slave at the new master.
retval = sentinelSendSlaveOf(slave,
master->promoted_slave->addr->ip,
master->promoted_slave->addr->port);
if (retval == REDIS_OK) {
// Mark the SLAVEOF command as sent.
slave->flags |= SRI_RECONF_SENT;
// Record when it was sent.
slave->slave_reconf_sent_time = mstime();
sentinelEvent(REDIS_NOTICE,"+slave-reconf-sent",slave,"%@");
// One more slave is syncing now.
in_progress++;
}
}
dictReleaseIterator(di);
/* Check if all the slaves are reconfigured and handle timeout. */
// End the failover once every slave has been reconfigured.
sentinelFailoverDetectEnd(master);
}
9.6 UPDATE_CONFIG
故障轉移結束后,將進入這一狀態(tài),會調用sentinelFailoverSwitchToPromotedSlave函數(shù),將之前已下線的master移出masters表,并由新的主服務器代替:
/* This function is called when the slave is in
 * SENTINEL_FAILOVER_STATE_UPDATE_CONFIG state. In this state we need
 * to remove it from the master table and add the promoted slave instead. */
// Called once the failover is complete: the failed master is replaced
// in the masters table by the promoted slave.
void sentinelFailoverSwitchToPromotedSlave(sentinelRedisInstance *master) {
// The instance that will act as the new master.
sentinelRedisInstance *ref = master->promoted_slave ?
master->promoted_slave : master;
// Emit the +switch-master event.
sentinelEvent(REDIS_WARNING,"+switch-master",master,"%s %s %d %s %d",
// Old master info.
master->name, master->addr->ip, master->addr->port,
// New master info.
ref->addr->ip, ref->addr->port);
// Replace the old master's address with the new master's one.
sentinelResetMasterAndChangeAddress(master,ref->addr->ip,ref->addr->port);
}
至此,故障轉移操作完成。
【參考】
[1] 《Redis設計與實現(xiàn)》
[2] 《Redis源碼日志》