From 294dd94a40bbc499502ec987afa278ee060d4e78 Mon Sep 17 00:00:00 2001 From: Jacob Murphy Date: Wed, 15 Jan 2025 12:04:36 +0000 Subject: [PATCH 01/18] Initial commit of AOF based atomic slot migration Signed-off-by: Jacob Murphy --- src/aof.c | 26 +- src/blocked.c | 2 +- src/cluster.c | 70 ++ src/cluster.h | 8 + src/cluster_legacy.c | 259 ++++- src/cluster_legacy.h | 42 +- src/commands.def | 20 + src/commands/cluster-migrate.json | 18 + src/config.c | 23 +- src/db.c | 21 +- src/evict.c | 2 +- src/expire.c | 2 +- src/io_threads.c | 2 +- src/kvstore.c | 52 +- src/kvstore.h | 4 + src/lazyfree.c | 23 + src/module.c | 28 +- src/networking.c | 34 +- src/rdb.c | 28 +- src/rdb.h | 2 +- src/replication.c | 1485 +++++++++++++++++------------ src/script.c | 12 +- src/server.c | 72 +- src/server.h | 125 ++- 24 files changed, 1583 insertions(+), 777 deletions(-) create mode 100644 src/commands/cluster-migrate.json diff --git a/src/aof.c b/src/aof.c index 024cdb2771..5c2691c1ba 100644 --- a/src/aof.c +++ b/src/aof.c @@ -2190,14 +2190,27 @@ static int rewriteFunctions(rio *aof) { return 0; } -int rewriteAppendOnlyFileRio(rio *aof) { +int shouldFilterSlot(int slot, void * slot_ranges) { + if (slot_ranges == NULL) return 0; + list *ranges = (list *)slot_ranges; + listIter li; + listNode *ln; + listRewind(ranges, &li); + while ((ln = listNext(&li))) { + slotRange *range = (slotRange *) ln->value; + if (slot >= range->start && slot <= range->end) return 0; + } + return 1; +} + +int rewriteAppendOnlyFileRio(rio *aof, list *slot_ranges) { int j; long key_count = 0; long long updated_time = 0; kvstoreIterator *kvs_it = NULL; /* Record timestamp at the beginning of rewriting AOF. */ - if (server.aof_timestamp_enabled) { + if (server.aof_timestamp_enabled && slot_ranges == NULL) { sds ts = genAofTimestampAnnotationIfNeeded(1); if (rioWrite(aof, ts, sdslen(ts)) == 0) { sdsfree(ts); @@ -2217,7 +2230,11 @@ int rewriteAppendOnlyFileRio(rio *aof) { if (rioWrite(aof, selectcmd, sizeof(selectcmd) - 1) == 0) goto werr; if (rioWriteBulkLongLong(aof, j) == 0) goto werr; - kvs_it = kvstoreIteratorInit(db->keys); + if (slot_ranges == NULL) { + kvs_it = kvstoreIteratorInit(db->keys); + } else { + kvs_it = kvstoreFilteredIteratorInit(db->keys, &shouldFilterSlot, slot_ranges); + } /* Iterate this DB writing every entry */ void *next; while (kvstoreIteratorNext(kvs_it, &next)) { @@ -2280,6 +2297,7 @@ int rewriteAppendOnlyFileRio(rio *aof) { updated_time = now; } } + serverLog(LL_NOTICE, "AOF rewrite: %s, key_count: %ld", keystr, key_count); /* Delay before next key if required (for testing) */ if (server.rdb_key_save_delay) debugDelay(server.rdb_key_save_delay); @@ -2330,7 +2348,7 @@ int rewriteAppendOnlyFile(char *filename) { goto werr; } } else { - if (rewriteAppendOnlyFileRio(&aof) == C_ERR) goto werr; + if (rewriteAppendOnlyFileRio(&aof, NULL) == C_ERR) goto werr; } /* Make sure data will not remain on the OS's output buffers */ diff --git a/src/blocked.c b/src/blocked.c index d2d6a5d314..d1a6ff9c6b 100644 --- a/src/blocked.c +++ b/src/blocked.c @@ -101,7 +101,7 @@ void freeClientBlockingState(client *c) { * and will be processed when the client is unblocked. 
*/ void blockClient(client *c, int btype) { /* Primary client should never be blocked unless pause or module */ - serverAssert(!(c->flag.primary && btype != BLOCKED_MODULE && btype != BLOCKED_POSTPONE)); + serverAssert(!(c->flag.replication_source && btype != BLOCKED_MODULE && btype != BLOCKED_POSTPONE)); initClientBlockingState(c); diff --git a/src/cluster.c b/src/cluster.c index 309279e0be..8050cd869d 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -815,6 +815,76 @@ unsigned int countKeysInSlot(unsigned int slot) { return kvstoreHashtableSize(server.db->keys, slot); } +unsigned int dropKeysInSlotRanges(list *slot_ranges, int async) { + unsigned int result = 0; + listIter li; + listNode *ln; + listRewind(slot_ranges, &li); + while ((ln = listNext(&li))) { + slotRange *slot_range = (slotRange *) listNodeValue(ln); + for (int i = slot_range->start; i <= slot_range->end; i++) { + result += dropKeysInSlot(i, async); + } + } + return result; +} + +unsigned int dropKeysInSlot(unsigned int hashslot, int async) { + unsigned int result = kvstoreHashtableSize(server.db->keys, hashslot); + if (async) { + emptyHashtableAsync(server.db, hashslot); + } else { + kvstoreEmptyHashtable(server.db->keys, hashslot, NULL); + kvstoreEmptyHashtable(server.db->expires, hashslot, NULL); + } + return result; +} + + + +void slotRangesToBitmap(list *slot_ranges, unsigned char *bitmap_out) { + listIter li; + listNode *ln; + listRewind(slot_ranges, &li); + while ((ln = listNext(&li))) { + slotRange *range = (slotRange *) listNodeValue(ln); + for (int i = range->start; i <= range->end; i++) { + bitmapSetBit(bitmap_out, i); + } + } +} + +void bitmapToSlotRanges(unsigned char *bitmap, list **slot_ranges_out) { + *slot_ranges_out = listCreate(); + int range_start = -1; + for (int i = 0; i <= CLUSTER_SLOTS; i++) { + if (i != CLUSTER_SLOTS && bitmapTestBit(bitmap, i)) { + if (range_start == -1) { + range_start = i; + } + } else if (range_start != -1) { + slotRange *range = zmalloc(sizeof(slotRange)); + range->start = range_start; + range->end = i - 1; + range_start = -1; + serverLog(LL_NOTICE, "Got another range: %d-%d", range->start, range->end); + listAddNodeTail(*slot_ranges_out, range); + } + } +} + +void freeSlotRanges(list *slot_ranges) { + listIter li; + listNode *ln; + listRewind(slot_ranges, &li); + while ((ln = listNext(&li))) { + slotRange *range = (slotRange *)ln->value; + zfree(range); + listDelNode(slot_ranges, ln); + } + listRelease(slot_ranges); +} + void clusterCommandHelp(client *c) { const char *help[] = { "COUNTKEYSINSLOT ", diff --git a/src/cluster.h b/src/cluster.h index 142f2d70b3..fd994d1ce7 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -116,6 +116,14 @@ client *createCachedResponseClient(int resp); void deleteCachedResponseClient(client *recording_client); void clearCachedClusterSlotsResponse(void); unsigned int countKeysInSlot(unsigned int hashslot); +unsigned int dropKeysInSlotRanges(list *slot_ranges, int async); +unsigned int dropKeysInSlot(unsigned int hashslot, int async); +void slotRangesToBitmap(list *slot_ranges, unsigned char *bitmap_out); +void bitmapToSlotRanges(unsigned char *bitmap, list **slot_ranges_out); +void freeSlotRanges(list *slot_ranges); +int bitmapTestBit(unsigned char *bitmap, int pos); +void bitmapSetBit(unsigned char *bitmap, int pos); +void bitmapClearBit(unsigned char *bitmap, int pos); int getSlotOrReply(client *c, robj *o); /* functions with shared implementations */ diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 5c4bb65aae..15a5ee3b7d 100644 --- 
a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -72,9 +72,6 @@ int clusterNodeSetSlotBit(clusterNode *n, int slot);
static void clusterSetPrimary(clusterNode *n, int closeSlots, int full_sync_required);
void clusterHandleReplicaFailover(void);
void clusterHandleReplicaMigration(int max_replicas);
-int bitmapTestBit(unsigned char *bitmap, int pos);
-void bitmapSetBit(unsigned char *bitmap, int pos);
-void bitmapClearBit(unsigned char *bitmap, int pos);
void clusterDoBeforeSleep(int flags);
void clusterSendUpdate(clusterLink *link, clusterNode *node);
void resetManualFailover(void);
@@ -86,6 +83,8 @@ sds representSlotInfo(sds ci, uint16_t *slot_info_pairs, int slot_info_pairs_cou
void clusterFreeNodesSlotsInfo(clusterNode *n);
uint64_t clusterGetMaxEpoch(void);
int clusterBumpConfigEpochWithoutConsensus(void);
+slotMigration *clusterGetCurrentSlotMigration(void);
+void clusterSendMigrateSlotStart(clusterNode *node, list *slot_ranges);
void moduleCallClusterReceivers(const char *sender_id,
uint64_t module_id,
uint8_t type,
@@ -1134,6 +1133,7 @@ void clusterInit(void) {
    server.cluster->failover_auth_epoch = 0;
    server.cluster->cant_failover_reason = CLUSTER_CANT_FAILOVER_NONE;
    server.cluster->lastVoteEpoch = 0;
+   server.cluster->slot_migrations = listCreate();

    /* Initialize stats */
    for (int i = 0; i < CLUSTERMSG_TYPE_COUNT; i++) {
@@ -1456,7 +1456,7 @@ void clusterAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {

    /* If the server is starting up, don't accept cluster connections:
     * UPDATE messages may interact with the database content. */
-   if (server.primary_host == NULL && server.loading) return;
+   if (server.primary == NULL && server.loading) return;

    while (max--) {
        cfd = anetTcpAccept(server.neterr, fd, cip, sizeof(cip), &cport);
@@ -2570,6 +2570,12 @@ void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoc
                migrated_our_slots++;
            }

+           /* Was this slot mine, and was it in a paused state for slot
+            * migration? If so, clear the manual failover state. */
+           if (server.cluster->slots[j] == myself && server.cluster->mf_end && server.cluster->mf_replica == sender) {
+               resetManualFailover();
+           }
+
            /* If the sender who claims this slot is not in the same shard,
             * it must be a result of deliberate operator actions. Therefore,
             * we should honor it and clear the outstanding migrating_slots_to
@@ -3245,6 +3251,20 @@ int clusterProcessPacket(clusterLink *link) {
                      "primary manual failover: %lld",
                      server.cluster->mf_primary_offset);
        }
+       /* If we are importing a slot and the slot owner sent its offset
+        * while already paused, populate the migration state. */
+       slotMigration *curr_migration = clusterGetCurrentSlotMigration();
+       if (hdr->mflags[0] & CLUSTERMSG_FLAG0_PAUSED && curr_migration != NULL &&
+           curr_migration->state == SLOT_MIGRATION_WAITING_FOR_OFFSET &&
+           curr_migration->source_node == sender) {
+           curr_migration->pause_primary_offset = sender->repl_offset;
+           curr_migration->state = SLOT_MIGRATION_SYNCING_TO_OFFSET;
+           clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_SLOTMIGRATION);
+           serverLog(LL_NOTICE,
+                     "Received replication offset from paused owner for "
+                     "slot import: %lld",
+                     curr_migration->pause_primary_offset);
+       }
    }

    /* Initial processing of PING and MEET requests replying with a PONG. 
*/
@@ -3699,6 +3719,26 @@ int clusterProcessPacket(clusterLink *link) {
        uint8_t type = hdr->data.module.msg.type;
        unsigned char *payload = hdr->data.module.msg.bulk_data;
        moduleCallClusterReceivers(sender->name, module_id, type, payload, len);
+   } else if (type == CLUSTERMSG_TYPE_MIGRATE_SLOT_START) {
+       /* This message is acceptable only if I'm a primary and I own all of the requested slots */
+       if (!sender) return 1;
+       for (int i = 0; i < CLUSTER_SLOTS; i++) {
+           if (bitmapTestBit(hdr->data.slot_migration.msg.slot_bitmap, i) && server.cluster->slots[i] != myself) return 1;
+       }
+       /* Initialize the slot migration state accordingly */
+       resetManualFailover();
+       server.cluster->mf_end = now + CLUSTER_MF_TIMEOUT;
+       server.cluster->mf_replica = sender;
+       /* TODO(murphyjacob4) pause subset of slots */
+       pauseActions(PAUSE_DURING_FAILOVER, now + (CLUSTER_MF_TIMEOUT * CLUSTER_MF_PAUSE_MULT),
+                    PAUSE_ACTIONS_CLIENT_WRITE_SET);
+       serverLog(LL_NOTICE, "Slot migration requested by node %.40s (%s).", sender->name, sender->human_nodename);
+       /* We need to send a ping message to the importing node right away: since we
+        * have just paused client writes, the ping will carry the
+        * CLUSTERMSG_FLAG0_PAUSED flag along with our replication offset. The
+        * importing node uses that offset to know when its replication of the
+        * migrating slots has caught up and the migration can be finalized. */
+       clusterSendPing(link, CLUSTERMSG_TYPE_PING);
    } else {
        serverLog(LL_WARNING, "Received unknown packet type: %d", type);
    }
@@ -4395,6 +4435,128 @@ void clusterPropagatePublish(robj *channel, robj *message, int sharded) {
    clusterMsgSendBlockDecrRefCount(msgblock_light);
}

+/* -----------------------------------------------------------------------------
+ * Slot Migration functions
+ * -------------------------------------------------------------------------- */
+
+/* Gets the current slot migration from the head of the queue. */
+slotMigration *clusterGetCurrentSlotMigration(void) {
+   if (listLength(server.cluster->slot_migrations) == 0) return NULL;
+   return (slotMigration *) listFirst(server.cluster->slot_migrations)->value;
+}
+
+void clusterSendMigrateSlotStart(clusterNode *node, list *slot_ranges) {
+   if (!node->link) return;
+
+   uint32_t msglen = sizeof(clusterMsg) - sizeof(union clusterMsgData) + sizeof(clusterMsgSlotMigration);
+   clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_MIGRATE_SLOT_START, msglen);
+   clusterMsg *hdr = getMessageFromSendBlock(msgblock);
+   slotRangesToBitmap(slot_ranges, hdr->data.slot_migration.msg.slot_bitmap);
+   clusterSendMessage(node->link, msgblock);
+   clusterMsgSendBlockDecrRefCount(msgblock);
+}
+
+/* This is the main state machine for the slot migration workflow. Slot
+ * migration is driven by the new owner of the slot. This function will do as
+ * much work as possible synchronously, processing the enqueued slot migrations
+ * and only returning once we are waiting on some IO. 
*/ +void clusterProceedWithSlotMigration(void) { + server.cluster->todo_before_sleep &= ~CLUSTER_TODO_HANDLE_SLOTMIGRATION; + + while (clusterGetCurrentSlotMigration() != NULL) { + listNode *curr_node = listFirst(server.cluster->slot_migrations); + slotMigration *curr_migration = (slotMigration *) curr_node->value; + if (curr_migration->state != SLOT_MIGRATION_QUEUED && curr_migration->end_time < mstime()) { + serverLog(LL_WARNING, + "Timed out for slot migration from source node %.40s", curr_migration->source_node->name); + curr_migration->state = SLOT_MIGRATION_FAILED; + } + if (curr_migration->state > SLOT_MIGRATION_PAUSE_OWNER && curr_migration->state < SLOT_MIGRATION_FAILED && curr_migration->pause_end < mstime() && curr_migration->vote_retry_time < mstime()) { + /* If the owner ever unpauses, we have to move back in the state machine and retry. */ + serverLog(LL_WARNING, "Reinitiating pause on the node owning the slot range..."); + curr_migration->state = SLOT_MIGRATION_PAUSE_OWNER; + curr_migration->pause_end = mstime() + CLUSTER_MF_TIMEOUT; + } + switch(curr_migration->state) { + case SLOT_MIGRATION_QUEUED: + /* Start the migration */ + serverLog(LL_NOTICE, "Starting sync from migration source node %.40s", curr_migration->source_node->name); + curr_migration->end_time = mstime() + CLUSTER_SLOT_MIGRATION_TIMEOUT; + curr_migration->link = createReplicationLink(curr_migration->source_node->ip, getNodeDefaultReplicationPort(curr_migration->source_node), curr_migration->slot_ranges); + if (connectReplicationLink(curr_migration->link) == C_ERR) { + serverLog(LL_WARNING, + "Failed to begin sync from migration source node %.40s", curr_migration->source_node->name); + curr_migration->state = SLOT_MIGRATION_FAILED; + continue; + } + curr_migration->state = SLOT_MIGRATION_SYNCING; + continue; + case SLOT_MIGRATION_SYNCING: + /* replicationCron should manage retrying connection, but there could be scenarios where we hit an irrecoverable error. */ + if (curr_migration->link->state == REPL_STATE_NONE || curr_migration->link->state == REPL_STATE_CANCELLED) { + serverLog(LL_WARNING, "Sync failed from migration node %.40s", curr_migration->source_node->name); + curr_migration->state = SLOT_MIGRATION_FAILED; + continue; + } + if (curr_migration->link->state == REPL_STATE_CONNECTED) { + curr_migration->state = SLOT_MIGRATION_PAUSE_OWNER; + continue; + } + /* If we are in another state, nothing to do right now. */ + return; + case SLOT_MIGRATION_PAUSE_OWNER: + serverLog(LL_NOTICE, "Replication link to slot owner %.40s has been established. Pausing source node and waiting to continue", curr_migration->source_node->name); + clusterSendMigrateSlotStart(curr_migration->source_node, curr_migration->slot_ranges); + curr_migration->pause_primary_offset = -1; + curr_migration->pause_end = mstime() + CLUSTER_MF_TIMEOUT; + curr_migration->state = SLOT_MIGRATION_WAITING_FOR_OFFSET; + continue; + case SLOT_MIGRATION_WAITING_FOR_OFFSET: + /* Nothing to do, need to wait for cluster message to come in. 
*/ + return; + case SLOT_MIGRATION_SYNCING_TO_OFFSET: + if (curr_migration->link->client->repl_data->reploff >= curr_migration->pause_primary_offset) { + serverLog(LL_NOTICE, "Replication of slot range has caught up to paused slot owner, slot migration can start."); + curr_migration->state = SLOT_MIGRATION_FINISH; + continue; + } + /* Need to wait for the sync to progress further */ + return; + case SLOT_MIGRATION_FINISH: + serverLog(LL_NOTICE, "Setting myself to owner of migrating slots and broadcasting"); + listIter li; + listNode *ln; + listRewind(curr_migration->slot_ranges, &li); + while ((ln = listNext(&li))) { + slotRange *range = (slotRange *) ln->value; + for (int i = range->start; i <= range->end; i++) { + clusterDelSlot(i); + clusterAddSlot(myself, i); + } + } + clusterUpdateState(); + clusterSaveConfigOrDie(1); + if (clusterBumpConfigEpochWithoutConsensus() == C_OK) { + serverLog(LL_NOTICE, "ConfigEpoch updated after importing slots"); + } + clusterBroadcastPong(CLUSTER_BROADCAST_ALL); + listDelNode(server.cluster->slot_migrations, curr_node); + freeReplicationLink(curr_migration->link); + zfree(curr_migration); + continue; + case SLOT_MIGRATION_FAILED: + /* Delete the migration from the queue and proceed to the next migration */ + listDelNode(server.cluster->slot_migrations, curr_node); + freeReplicationLink(curr_migration->link); + dropKeysInSlotRanges(curr_migration->slot_ranges, server.repl_replica_lazy_flush); + freeSlotRanges(curr_migration->slot_ranges); + zfree(curr_migration); + continue; + } + } +} + + /* ----------------------------------------------------------------------------- * REPLICA node specific functions * -------------------------------------------------------------------------- */ @@ -4739,8 +4901,8 @@ void clusterHandleReplicaFailover(void) { /* Set data_age to the number of milliseconds we are disconnected from * the primary. */ - if (server.repl_state == REPL_STATE_CONNECTED) { - data_age = (mstime_t)(server.unixtime - server.primary->last_interaction) * 1000; + if (server.primary && server.primary->state == REPL_STATE_CONNECTED) { + data_age = (mstime_t)(server.unixtime - server.primary->client->last_interaction) * 1000; } else { data_age = (mstime_t)(server.unixtime - server.repl_down_since) * 1000; } @@ -5332,7 +5494,7 @@ void clusterCron(void) { /* If we are a replica node but the replication is still turned off, * enable it if we know the address of our primary and it appears to * be up. */ - if (nodeIsReplica(myself) && server.primary_host == NULL && myself->replicaof && nodeHasAddr(myself->replicaof)) { + if (nodeIsReplica(myself) && server.primary == NULL && myself->replicaof && nodeHasAddr(myself->replicaof)) { replicationSetPrimary(myself->replicaof->ip, getNodeDefaultReplicationPort(myself->replicaof), 0); } @@ -5353,6 +5515,8 @@ void clusterCron(void) { } if (update_state || server.cluster->state == CLUSTER_FAIL) clusterUpdateState(); + + clusterProceedWithSlotMigration(); } /* This function is called before the event handler returns to sleep for @@ -5378,6 +5542,9 @@ void clusterBeforeSleep(void) { /* Handle failover, this is needed when it is likely that there is already * the quorum from primaries in order to react fast. */ clusterHandleReplicaFailover(); + } else if (flags & CLUSTER_TODO_HANDLE_SLOTMIGRATION) { + /* Continue with slot migration (e.g. if import offset is updated) */ + clusterProceedWithSlotMigration(); } /* Update the cluster state. 
*/ @@ -6528,13 +6695,13 @@ int clusterParseSetSlotCommand(client *c, int *slot_out, clusterNode **node_out, int optarg_pos = 0; /* Allow primaries to replicate "CLUSTER SETSLOT" */ - if (!c->flag.primary && nodeIsReplica(myself)) { + if (!c->flag.replication_source && nodeIsReplica(myself)) { addReplyError(c, "Please use SETSLOT only with masters."); return 0; } /* If 'myself' is a replica, 'c' must be the primary client. */ - serverAssert(!nodeIsReplica(myself) || c == server.primary); + serverAssert(!nodeIsReplica(myself) || (server.primary && c == server.primary->client)); if ((slot = getSlotOrReply(c, c->argv[2])) == -1) return 0; @@ -7108,6 +7275,78 @@ int clusterCommandSpecial(client *c) { } else if (!strcasecmp(c->argv[1]->ptr, "links") && c->argc == 2) { /* CLUSTER LINKS */ addReplyClusterLinksDescription(c); + } else if (!strcasecmp(c->argv[1]->ptr, "migrate")) { + /* CLUSTER MIGRATE SLOTSRANGE [ ] */ + if (nodeIsReplica(myself)) { + addReplyError(c, "Only primaries can migrate slots"); + return 1; + } + if (c->argc < 5 || strcasecmp(c->argv[2]->ptr, "slotsrange")) { + addReplyError(c, "Migrate command requires at least one "); + return 1; + } + unsigned char requested_slots[CLUSTER_SLOTS/8]; + memset(requested_slots, 0, sizeof(requested_slots)); + int i; + clusterNode * curr_owner = NULL; + for (i = 3; i + 1 < c->argc; i+=2) { + if (i > 3 && getLongLongFromObject(c->argv[i], NULL) != C_OK) { + /* If we find a non-integer in the args and we have already + * parsed >=1 slot range, we assume it is the next token. */ + break; + } + int start = getSlotOrReply(c, c->argv[i]); + if (start < 0) { + return 1; + } + int end = getSlotOrReply(c, c->argv[i + 1]); + if (end < 0) { + return 1; + } + if (end < start) { + addReplyErrorFormat(c, "Invalid SLOTSRANGE, start slot %d is greater than end slot %d", start, end); + return 1; + } + for (int j = start; j <= end; j++) { + if (bitmapTestBit(requested_slots, j)) { + addReplyError(c, "Invalid SLOTSRANGE, slot ranges overlap"); + return 1; + } + if (curr_owner == NULL) { + curr_owner = server.cluster->slots[j]; + } else { + if (curr_owner != server.cluster->slots[j]) { + addReplyError(c, "Invalid SLOTSRANGE, slot ranges are not all owned by the same shard"); + return 1; + } + } + if (curr_owner == myself) { + addReplyErrorFormat(c, "I'm already the owner of hash slot %u", j); + return 1; + } + if (nodeFailed(curr_owner)) { + addReplyErrorFormat(c, "Primary is currently failing for slot %u. Please try again once there is a healthy primary", j); + return 1; + } + bitmapSetBit(requested_slots, j); + } + } + + slotMigration *to_enqueue = (slotMigration *) zmalloc(sizeof(slotMigration)); + bitmapToSlotRanges(requested_slots, &to_enqueue->slot_ranges); + to_enqueue->source_node = curr_owner; + to_enqueue->state = SLOT_MIGRATION_QUEUED; + to_enqueue->end_time = 0; /* Will be set once started. */ + to_enqueue->link = NULL; + to_enqueue->pause_end = 0; + to_enqueue->pause_primary_offset = -1; + to_enqueue->vote_end_time = 0; + to_enqueue->vote_retry_time = 0; + to_enqueue->vote_epoch = 0; + to_enqueue->auth_count = 0; + listAddNodeTail(server.cluster->slot_migrations, to_enqueue); + clusterProceedWithSlotMigration(); + addReply(c, shared.ok); } else { return 0; } @@ -7150,6 +7389,8 @@ const char **clusterCommandExtendedHelp(void) { "LINKS", " Return information about all network links between this node and its peers.", " Output format is an array where each array element is a map containing attributes of a link", + "MIGRATE SLOTSRANGE [ ...] 
SHARD ", + " Initiate server driven slot migration of all slot ranges to the designated shard.", NULL}; return help; diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index 226842c5dc..dc157af78b 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -10,6 +10,7 @@ #define CLUSTER_MF_TIMEOUT 5000 /* Milliseconds to do a manual failover. */ #define CLUSTER_MF_PAUSE_MULT 2 /* Primary pause manual failover mult. */ #define CLUSTER_REPLICA_MIGRATION_DELAY 5000 /* Delay for replica migration. */ +#define CLUSTER_SLOT_MIGRATION_TIMEOUT 30000 /* Milliseconds to do a slot migration. */ /* Reasons why a replica is not able to failover. */ #define CLUSTER_CANT_FAILOVER_NONE 0 @@ -26,6 +27,7 @@ #define CLUSTER_TODO_FSYNC_CONFIG (1 << 3) #define CLUSTER_TODO_HANDLE_MANUALFAILOVER (1 << 4) #define CLUSTER_TODO_BROADCAST_ALL (1 << 5) +#define CLUSTER_TODO_HANDLE_SLOTMIGRATION (1 << 6) /* clusterLink encapsulates everything needed to talk with a remote node. */ typedef struct clusterLink { @@ -95,7 +97,9 @@ typedef struct clusterNodeFailReport { #define CLUSTERMSG_TYPE_MFSTART 8 /* Pause clients for manual failover */ #define CLUSTERMSG_TYPE_MODULE 9 /* Module cluster API message. */ #define CLUSTERMSG_TYPE_PUBLISHSHARD 10 /* Pub/Sub Publish shard propagation */ -#define CLUSTERMSG_TYPE_COUNT 11 /* Total number of message types. */ +#define CLUSTERMSG_TYPE_MIGRATE_SLOT_START 11 /* Pause clients for slot migration */ +#define CLUSTERMSG_TYPE_COUNT 12 /* Total number of message types. */ + #define CLUSTERMSG_LIGHT 0x8000 /* Modifier bit for message types that support light header */ @@ -142,6 +146,10 @@ typedef struct { unsigned char bulk_data[3]; /* 3 bytes just as placeholder. */ } clusterMsgModule; +typedef struct { + unsigned char slot_bitmap[CLUSTER_SLOTS / 8]; /* Slots bitmap. */ +} clusterMsgSlotMigration; + /* The cluster supports optional extension messages that can be sent * along with ping/pong/meet messages to give additional info in a * consistent manner. */ @@ -228,6 +236,12 @@ union clusterMsgData { struct { clusterMsgModule msg; } module; + + /* SLOT_MIGRATION */ + struct { + clusterMsgSlotMigration msg; + } slot_migration; + }; #define CLUSTER_PROTO_VER 1 /* Cluster bus protocol version. */ @@ -362,6 +376,31 @@ struct _clusterNode { Update with updateAndCountChangedNodeHealth(). */ }; +typedef enum slotMigrationState { + SLOT_MIGRATION_QUEUED, /* Queued behind some other slot migration. */ + SLOT_MIGRATION_SYNCING, /* Syncing contents from current owner. */ + SLOT_MIGRATION_PAUSE_OWNER, + SLOT_MIGRATION_WAITING_FOR_OFFSET, + SLOT_MIGRATION_SYNCING_TO_OFFSET, + SLOT_MIGRATION_FINISH, + SLOT_MIGRATION_FAILED, +} slotMigrationState; + +typedef struct slotMigration { + list *slot_ranges; + slotMigrationState state; + clusterNode *source_node; + mstime_t end_time; /* Slot migration time limit (ms unixtime). + If not yet in progress (e.g. queued), will be zero. */ + replicationLink *link; + mstime_t pause_end; + long long pause_primary_offset; + mstime_t vote_end_time; + mstime_t vote_retry_time; + uint64_t vote_epoch; + int auth_count; +} slotMigration; + /* Struct used for storing slot statistics. */ typedef struct slotStat { uint64_t cpu_usec; @@ -420,6 +459,7 @@ struct clusterState { unsigned char owner_not_claiming_slot[CLUSTER_SLOTS / 8]; /* Struct used for storing slot statistics, for all slots owned by the current shard. */ slotStat slot_stats[CLUSTER_SLOTS]; + list *slot_migrations; /* Queue of ongoing slot migrations. 
*/ }; #endif // CLUSTER_LEGACY_H diff --git a/src/commands.def b/src/commands.def index c5d766e3f8..0e54094821 100644 --- a/src/commands.def +++ b/src/commands.def @@ -685,6 +685,25 @@ struct COMMAND_ARG CLUSTER_MEET_Args[] = { {MAKE_ARG("cluster-bus-port",ARG_TYPE_INTEGER,-1,NULL,NULL,"4.0.0",CMD_ARG_OPTIONAL,0,NULL)}, }; +/********** CLUSTER MIGRATE ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* CLUSTER MIGRATE history */ +#define CLUSTER_MIGRATE_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* CLUSTER MIGRATE tips */ +const char *CLUSTER_MIGRATE_Tips[] = { +"nondeterministic_output", +}; +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* CLUSTER MIGRATE key specs */ +#define CLUSTER_MIGRATE_Keyspecs NULL +#endif + /********** CLUSTER MYID ********************/ #ifndef SKIP_CMD_HISTORY_TABLE @@ -1020,6 +1039,7 @@ struct COMMAND_STRUCT CLUSTER_Subcommands[] = { {MAKE_CMD("keyslot","Returns the hash slot for a key.","O(N) where N is the number of bytes in the key","3.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_KEYSLOT_History,0,CLUSTER_KEYSLOT_Tips,0,clusterCommand,3,CMD_STALE,0,CLUSTER_KEYSLOT_Keyspecs,0,NULL,1),.args=CLUSTER_KEYSLOT_Args}, {MAKE_CMD("links","Returns a list of all TCP links to and from peer nodes.","O(N) where N is the total number of Cluster nodes","7.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_LINKS_History,0,CLUSTER_LINKS_Tips,1,clusterCommand,2,CMD_STALE,0,CLUSTER_LINKS_Keyspecs,0,NULL,0)}, {MAKE_CMD("meet","Forces a node to handshake with another node.","O(1)","3.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_MEET_History,1,CLUSTER_MEET_Tips,0,clusterCommand,-4,CMD_NO_ASYNC_LOADING|CMD_ADMIN|CMD_STALE,0,CLUSTER_MEET_Keyspecs,0,NULL,3),.args=CLUSTER_MEET_Args}, +{MAKE_CMD("migrate","Initiates server driven hash slot migration, importing the given slot to this shard.","O(N) where N is the total number of hash slot arguments","8.1.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_MIGRATE_History,0,CLUSTER_MIGRATE_Tips,1,clusterCommand,-2,CMD_ADMIN|CMD_STALE,0,CLUSTER_MIGRATE_Keyspecs,0,NULL,0)}, {MAKE_CMD("myid","Returns the ID of a node.","O(1)","3.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_MYID_History,0,CLUSTER_MYID_Tips,0,clusterCommand,2,CMD_LOADING|CMD_STALE,0,CLUSTER_MYID_Keyspecs,0,NULL,0)}, {MAKE_CMD("myshardid","Returns the shard ID of a node.","O(1)","7.2.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_MYSHARDID_History,0,CLUSTER_MYSHARDID_Tips,1,clusterCommand,2,CMD_LOADING|CMD_STALE,0,CLUSTER_MYSHARDID_Keyspecs,0,NULL,0)}, {MAKE_CMD("nodes","Returns the cluster configuration for a node.","O(N) where N is the total number of Cluster nodes","3.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_NODES_History,0,CLUSTER_NODES_Tips,1,clusterCommand,2,CMD_LOADING|CMD_STALE,0,CLUSTER_NODES_Keyspecs,0,NULL,0)}, diff --git a/src/commands/cluster-migrate.json b/src/commands/cluster-migrate.json new file mode 100644 index 0000000000..719e827fa4 --- /dev/null +++ b/src/commands/cluster-migrate.json @@ -0,0 +1,18 @@ +{ + "MIGRATE": { + "summary": "Initiates server driven hash slot migration, importing the given slot to this shard.", + "complexity": "O(N) where N is the total number of hash slot arguments", + "group": "cluster", + "since": "8.1.0", + "arity": -2, + "container": "CLUSTER", + "function": "clusterCommand", + "command_flags": [ + "ADMIN", + "STALE" + ], + "command_tips": [ + "NONDETERMINISTIC_OUTPUT" 
+ ] + } +} diff --git a/src/config.c b/src/config.c index 5b90ebbd60..512b35f210 100644 --- a/src/config.c +++ b/src/config.c @@ -596,7 +596,7 @@ void loadServerConfigFromString(char *config) { } /* Sanity checks. */ - if (server.cluster_enabled && server.primary_host) { + if (server.cluster_enabled && server.primary) { err = "replicaof directive not allowed in cluster mode"; goto loaderr; } @@ -1451,11 +1451,11 @@ void rewriteConfigReplicaOfOption(standardConfig *config, const char *name, stru /* If this is a primary, we want all the replicaof config options * in the file to be removed. Note that if this is a cluster instance * we don't want a replicaof directive inside valkey.conf. */ - if (server.cluster_enabled || server.primary_host == NULL) { + if (server.cluster_enabled || server.primary == NULL) { rewriteConfigMarkAsProcessed(state, name); return; } - line = sdscatprintf(sdsempty(), "%s %s %d", name, server.primary_host, server.primary_port); + line = sdscatprintf(sdsempty(), "%s %s %d", name, server.primary->host, server.primary->port); rewriteConfigRewriteLine(state, name, line, 1); } @@ -3000,19 +3000,20 @@ static int setConfigReplicaOfOption(standardConfig *config, sds *argv, int argc, return 0; } - sdsfree(server.primary_host); - server.primary_host = NULL; + freeReplicationLink(server.primary); + server.primary = NULL; + if (!strcasecmp(argv[0], "no") && !strcasecmp(argv[1], "one")) { return 1; } char *ptr; - server.primary_port = strtol(argv[1], &ptr, 10); - if (server.primary_port < 0 || server.primary_port > 65535 || *ptr != '\0') { + int port = strtol(argv[1], &ptr, 10); + if (port < 0 || port > 65535 || *ptr != '\0') { *err = "Invalid primary port"; return 0; } - server.primary_host = sdsnew(argv[0]); - server.repl_state = REPL_STATE_CONNECT; + server.primary = createReplicationLink(argv[0], port, NULL); + server.primary->state = REPL_STATE_CONNECT; return 1; } @@ -3024,8 +3025,8 @@ static sds getConfigBindOption(standardConfig *config) { static sds getConfigReplicaOfOption(standardConfig *config) { UNUSED(config); char buf[256]; - if (server.primary_host) - snprintf(buf, sizeof(buf), "%s %d", server.primary_host, server.primary_port); + if (server.primary) + snprintf(buf, sizeof(buf), "%s %d", server.primary->host, server.primary->port); else buf[0] = '\0'; return sdsnew(buf); diff --git a/src/db.c b/src/db.c index 94074bf668..05b395728a 100644 --- a/src/db.c +++ b/src/db.c @@ -110,7 +110,7 @@ robj *lookupKey(serverDb *db, robj *key, int flags) { * It's possible that the WRITE flag is set even during a readonly * command, since the command may trigger events that cause modules to * perform additional writes. */ - int is_ro_replica = server.primary_host && server.repl_replica_ro; + int is_ro_replica = server.primary && server.repl_replica_ro; int expire_flags = 0; if (flags & LOOKUP_WRITE && !is_ro_replica) expire_flags |= EXPIRE_FORCE_DELETE_EXPIRED; if (flags & LOOKUP_NOEXPIRE) expire_flags |= EXPIRE_AVOID_DELETE_EXPIRED; @@ -258,7 +258,7 @@ int getKeySlot(sds key) { * so we must always recompute the slot for commands coming from the primary. 
*/ if (server.current_client && server.current_client->slot >= 0 && server.current_client->flag.executing_command && - !server.current_client->flag.primary) { + !server.current_client->flag.replication_source) { debugServerAssertWithInfo(server.current_client, NULL, (int)keyHashSlot(key, (int)sdslen(key)) == server.current_client->slot); return server.current_client->slot; @@ -267,7 +267,7 @@ int getKeySlot(sds key) { /* For the case of replicated commands from primary, getNodeByQuery() never gets called, * and thus c->slot never gets populated. That said, if this command ends up accessing a key, * we are able to backfill c->slot here, where the key's hash calculation is made. */ - if (server.current_client && server.current_client->flag.primary) { + if (server.current_client && server.current_client->flag.replication_source) { server.current_client->slot = slot; } return slot; @@ -432,6 +432,7 @@ void setKey(client *c, serverDb *db, robj *key, robj **valref, int flags) { * If there are no keys, NULL is returned. * * The function makes sure to return keys not already expired. */ +// TODO murphyjacob4 need to exclude the loading slots from this robj *dbRandomKey(serverDb *db) { int maxtries = 100; int allvolatile = kvstoreSize(db->keys) == kvstoreSize(db->expires); @@ -445,7 +446,7 @@ robj *dbRandomKey(serverDb *db) { sds key = objectGetKey(valkey); robj *keyobj = createStringObject(key, sdslen(key)); if (objectIsExpired(valkey)) { - if (allvolatile && (server.primary_host || server.import_mode) && --maxtries == 0) { + if (allvolatile && (server.primary || server.import_mode) && --maxtries == 0) { /* If the DB is composed only of keys with an expire set, * it could happen that all the keys are already logically * expired in the replica, so the function cannot stop because @@ -1800,8 +1801,8 @@ robj *setExpire(client *c, serverDb *db, robj *key, long long when) { serverAssert(added); } - int writable_replica = server.primary_host && server.repl_replica_ro == 0; - if (c && writable_replica && !c->flag.primary) rememberReplicaKeyWithExpire(db, key); + int writable_replica = server.primary && server.repl_replica_ro == 0; + if (c && writable_replica && !c->flag.replication_source) rememberReplicaKeyWithExpire(db, key); return val; } @@ -1906,7 +1907,7 @@ static int objectIsExpired(robj *val) { /* Don't expire anything while loading. It will be done later. */ if (server.loading) return 0; if (!timestampIsExpired(objectGetExpire(val))) return 0; - if (server.primary_host == NULL && server.import_mode) { + if (server.primary == NULL && server.import_mode) { if (server.current_client && server.current_client->flag.import_source) return 0; } return 1; @@ -1924,7 +1925,7 @@ static int keyIsExpiredWithDictIndex(serverDb *db, robj *key, int dict_index) { if (!keyIsExpiredWithDictIndexImpl(db, key, dict_index)) return 0; /* See expireIfNeededWithDictIndex for more details. */ - if (server.primary_host == NULL && server.import_mode) { + if (server.primary == NULL && server.import_mode) { if (server.current_client && server.current_client->flag.import_source) return 0; } return 1; @@ -1958,8 +1959,8 @@ static keyStatus expireIfNeededWithDictIndex(serverDb *db, robj *key, robj *val, * * When replicating commands from the primary, keys are never considered * expired. 
*/ - if (server.primary_host != NULL) { - if (server.current_client && (server.current_client->flag.primary)) return KEY_VALID; + if (server.primary != NULL) { + if (server.current_client && (server.current_client->flag.replication_source)) return KEY_VALID; if (!(flags & EXPIRE_FORCE_DELETE_EXPIRED)) return KEY_EXPIRED; } else if (server.import_mode) { /* If we are running in the import mode on a primary, instead of diff --git a/src/evict.c b/src/evict.c index d4bfade4fc..f91f2b76f7 100644 --- a/src/evict.c +++ b/src/evict.c @@ -466,7 +466,7 @@ static int isSafeToPerformEvictions(void) { /* By default replicas should ignore maxmemory * and just be primaries exact copies. */ - if (server.primary_host && server.repl_replica_ignore_maxmemory) return 0; + if (server.primary && server.repl_replica_ignore_maxmemory) return 0; /* If 'evict' action is paused, for whatever reason, then return false */ if (isPausedActionsWithUpdate(PAUSE_ACTION_EVICT)) return 0; diff --git a/src/expire.c b/src/expire.c index e4c3b0ec96..29dcd82c83 100644 --- a/src/expire.c +++ b/src/expire.c @@ -524,7 +524,7 @@ int checkAlreadyExpired(long long when) { * * If the server is a primary and in the import mode, we also add the already * expired key and wait for an explicit DEL from the import source. */ - return (when <= commandTimeSnapshot() && !server.loading && !server.primary_host && !server.import_mode); + return (when <= commandTimeSnapshot() && !server.loading && !server.primary && !server.import_mode); } #define EXPIRE_NX (1 << 0) diff --git a/src/io_threads.c b/src/io_threads.c index 66ef4948b6..260d7007be 100644 --- a/src/io_threads.c +++ b/src/io_threads.c @@ -345,7 +345,7 @@ int trySendReadToIOThreads(client *c) { c->cur_tid = tid; c->read_flags = canParseCommand(c) ? 0 : READ_FLAGS_DONT_PARSE; c->read_flags |= authRequired(c) ? READ_FLAGS_AUTH_REQUIRED : 0; - c->read_flags |= c->flag.primary ? READ_FLAGS_PRIMARY : 0; + c->read_flags |= c->flag.replication_source ? 
READ_FLAGS_PRIMARY : 0; c->io_read_state = CLIENT_PENDING_IO; connSetPostponeUpdateState(c->conn, 1); diff --git a/src/kvstore.c b/src/kvstore.c index d6db4d3fe1..ef4b90af73 100644 --- a/src/kvstore.c +++ b/src/kvstore.c @@ -74,6 +74,8 @@ struct _kvstoreIterator { kvstore *kvs; long long didx; long long next_didx; + kvstoreIteratorFilter *filter; + void *filter_privdata; hashtableIterator di; }; @@ -300,12 +302,7 @@ kvstore *kvstoreCreate(hashtableType *type, int num_hashtables_bits, int flags) void kvstoreEmpty(kvstore *kvs, void(callback)(hashtable *)) { for (int didx = 0; didx < kvs->num_hashtables; didx++) { - hashtable *ht = kvstoreGetHashtable(kvs, didx); - if (!ht) continue; - kvstoreHashtableMetadata *metadata = (kvstoreHashtableMetadata *)hashtableMetadata(ht); - if (metadata->rehashing_node) metadata->rehashing_node = NULL; - hashtableEmpty(ht, callback); - freeHashtableIfNeeded(kvs, didx); + kvstoreEmptyHashtable(kvs, didx, callback); } listEmpty(kvs->rehashing); @@ -318,6 +315,28 @@ void kvstoreEmpty(kvstore *kvs, void(callback)(hashtable *)) { kvs->overhead_hashtable_rehashing = 0; } +void kvstoreEmptyHashtable(kvstore *kvs, int didx, void(callback)(hashtable *)) { + hashtable *ht = kvstoreGetHashtable(kvs, didx); + if (!ht) return; + kvstoreHashtableMetadata *metadata = (kvstoreHashtableMetadata *)hashtableMetadata(ht); + if (metadata->rehashing_node) metadata->rehashing_node = NULL; + hashtableEmpty(ht, callback); + freeHashtableIfNeeded(kvs, didx); +} + +hashtable *kvstoreUnlinkHashtable(kvstore *kvs, int didx) { + hashtable *oldht = kvstoreGetHashtable(kvs, didx); + if (!oldht) return NULL; + + /* Pause rehashing on the to be unlinked node. */ + kvstoreHashtableMetadata *oldmetadata = (kvstoreHashtableMetadata *)hashtableMetadata(oldht); + if (oldmetadata->rehashing_node) oldmetadata->rehashing_node = NULL; + + kvs->hashtables[didx] = NULL; + kvs->allocated_hashtables--; + return oldht; +} + void kvstoreRelease(kvstore *kvs) { for (int didx = 0; didx < kvs->num_hashtables; didx++) { hashtable *ht = kvstoreGetHashtable(kvs, didx); @@ -581,6 +600,20 @@ kvstoreIterator *kvstoreIteratorInit(kvstore *kvs) { kvs_it->kvs = kvs; kvs_it->didx = -1; kvs_it->next_didx = kvstoreGetFirstNonEmptyHashtableIndex(kvs_it->kvs); /* Finds first non-empty hashtable index. 
*/
+   kvs_it->filter = NULL;
+   kvs_it->filter_privdata = NULL;
    hashtableInitSafeIterator(&kvs_it->di, NULL);
    return kvs_it;
}

+/* Returns kvstore iterator that filters out hash tables based on the predicate. */
+kvstoreIterator *kvstoreFilteredIteratorInit(kvstore *kvs, kvstoreIteratorFilter *filter, void *privdata) {
+   kvstoreIterator *kvs_it = zmalloc(sizeof(*kvs_it));
+   kvs_it->kvs = kvs;
+   kvs_it->didx = -1;
+   kvs_it->next_didx = kvstoreGetFirstNonEmptyHashtableIndex(kvs_it->kvs);
+   kvs_it->filter = filter;
+   kvs_it->filter_privdata = privdata;
    hashtableInitSafeIterator(&kvs_it->di, NULL);
    return kvs_it;
}
@@ -607,8 +640,11 @@ static hashtable *kvstoreIteratorNextHashtable(kvstoreIterator *kvs_it) {
        freeHashtableIfNeeded(kvs_it->kvs, kvs_it->didx);
    }

-   kvs_it->didx = kvs_it->next_didx;
-   kvs_it->next_didx = kvstoreGetNextNonEmptyHashtableIndex(kvs_it->kvs, kvs_it->didx);
+   do {
+       kvs_it->didx = kvs_it->next_didx;
+       if (kvs_it->didx == -1) return NULL;
+       kvs_it->next_didx = kvstoreGetNextNonEmptyHashtableIndex(kvs_it->kvs, kvs_it->didx);
+   } while (kvs_it->filter && kvs_it->filter(kvs_it->didx, kvs_it->filter_privdata));
    return kvs_it->kvs->hashtables[kvs_it->didx];
}
diff --git a/src/kvstore.h b/src/kvstore.h
index 1a8c74a6b9..668b0ae23e 100644
--- a/src/kvstore.h
+++ b/src/kvstore.h
@@ -10,11 +10,14 @@ typedef struct _kvstoreHashtableIterator kvstoreHashtableIterator;
typedef int(kvstoreScanShouldSkipHashtable)(hashtable *d);
typedef int(kvstoreExpandShouldSkipHashtableIndex)(int didx);
+typedef int(kvstoreIteratorFilter)(int didx, void *privdata);

#define KVSTORE_ALLOCATE_HASHTABLES_ON_DEMAND (1 << 0)
#define KVSTORE_FREE_EMPTY_HASHTABLES (1 << 1)

kvstore *kvstoreCreate(hashtableType *type, int num_hashtables_bits, int flags);
void kvstoreEmpty(kvstore *kvs, void(callback)(hashtable *));
+void kvstoreEmptyHashtable(kvstore *kvs, int didx, void(callback)(hashtable *));
+hashtable *kvstoreUnlinkHashtable(kvstore *kvs, int didx);
void kvstoreRelease(kvstore *kvs);
unsigned long long kvstoreSize(kvstore *kvs);
unsigned long kvstoreBuckets(kvstore *kvs);
@@ -44,6 +47,7 @@ size_t kvstoreHashtableMetadataSize(void);

/* kvstore iterator specific functions */
kvstoreIterator *kvstoreIteratorInit(kvstore *kvs);
+kvstoreIterator *kvstoreFilteredIteratorInit(kvstore *kvs, kvstoreIteratorFilter *filter, void *privdata);
void kvstoreIteratorRelease(kvstoreIterator *kvs_it);
int kvstoreIteratorGetCurrentHashtableIndex(kvstoreIterator *kvs_it);
int kvstoreIteratorNext(kvstoreIterator *kvs_it, void **next);
diff --git a/src/lazyfree.c b/src/lazyfree.c
index 3b061ccd84..8cd04eed37 100644
--- a/src/lazyfree.c
+++ b/src/lazyfree.c
@@ -32,6 +32,18 @@ void lazyfreeFreeDatabase(void *args[]) {
    atomic_fetch_add_explicit(&lazyfreed_objects, numkeys, memory_order_relaxed);
}

+/* Release a hashtable from the lazyfree thread. */
+void lazyfreeFreeHashtable(void *args[]) {
+   hashtable *ht1 = args[0];
+   hashtable *ht2 = args[1];
+
+   size_t numkeys = hashtableSize(ht1);
+   hashtableRelease(ht1);
+   if (ht2) hashtableRelease(ht2);
+   atomic_fetch_sub_explicit(&lazyfree_objects, numkeys, memory_order_relaxed);
+   atomic_fetch_add_explicit(&lazyfreed_objects, numkeys, memory_order_relaxed);
+}
+
/* Release the key tracking table. */
void lazyFreeTrackingTable(void *args[]) {
    rax *rt = args[0];
@@ -199,6 +211,17 @@ void emptyDbAsync(serverDb *db) {
    bioCreateLazyFreeJob(lazyfreeFreeDatabase, 2, oldkeys, oldexpires);
}

+/* Empty a hashtable asynchronously. 
*/ +void emptyHashtableAsync(serverDb *db, int didx) { + hashtable *oldkeys = kvstoreUnlinkHashtable(db->keys, didx); + hashtable *oldexpires = kvstoreUnlinkHashtable(db->expires, didx); + if (!oldkeys) { + return; + } + atomic_fetch_add_explicit(&lazyfree_objects, hashtableSize(oldkeys), memory_order_relaxed); + bioCreateLazyFreeJob(lazyfreeFreeHashtable, 2, oldkeys, oldexpires); +} + /* Free the key tracking table. * If the table is huge enough, free it in async way. */ void freeTrackingRadixTreeAsync(rax *tracking) { diff --git a/src/module.c b/src/module.c index fa60335837..40a5c8de20 100644 --- a/src/module.c +++ b/src/module.c @@ -3757,9 +3757,9 @@ int modulePopulateReplicationInfoStructure(void *ri, int structver) { ValkeyModuleReplicationInfoV1 *ri1 = ri; memset(ri1, 0, sizeof(*ri1)); ri1->version = structver; - ri1->primary = server.primary_host == NULL; - ri1->primary_host = server.primary_host ? server.primary_host : ""; - ri1->primary_port = server.primary_port; + ri1->primary = server.primary == NULL; + ri1->primary_host = server.primary ? server.primary->host : ""; + ri1->primary_port = server.primary ? server.primary->port : 0; ri1->replid1 = server.replid; ri1->replid2 = server.replid2; ri1->repl1_offset = server.primary_repl_offset; @@ -3948,7 +3948,7 @@ int VM_GetContextFlags(ValkeyModuleCtx *ctx) { if (ctx->client) { if (ctx->client->flag.deny_blocking) flags |= VALKEYMODULE_CTX_FLAGS_DENY_BLOCKING; /* Module command received from PRIMARY, is replicated. */ - if (ctx->client->flag.primary) flags |= VALKEYMODULE_CTX_FLAGS_REPLICATED; + if (ctx->client->flag.replication_source) flags |= VALKEYMODULE_CTX_FLAGS_REPLICATED; if (ctx->client->resp == 3) { flags |= VALKEYMODULE_CTX_FLAGS_RESP3; } @@ -3973,7 +3973,7 @@ int VM_GetContextFlags(ValkeyModuleCtx *ctx) { flags |= VALKEYMODULE_CTX_FLAGS_LOADING; /* Maxmemory and eviction policy */ - if (server.maxmemory > 0 && (!server.primary_host || !server.repl_replica_ignore_maxmemory)) { + if (server.maxmemory > 0 && (!server.primary || !server.repl_replica_ignore_maxmemory)) { flags |= VALKEYMODULE_CTX_FLAGS_MAXMEMORY; if (server.maxmemory_policy != MAXMEMORY_NO_EVICTION) flags |= VALKEYMODULE_CTX_FLAGS_EVICT; @@ -3984,22 +3984,22 @@ int VM_GetContextFlags(ValkeyModuleCtx *ctx) { if (server.saveparamslen > 0) flags |= VALKEYMODULE_CTX_FLAGS_RDB; /* Replication flags */ - if (server.primary_host == NULL) { + if (server.primary == NULL) { flags |= VALKEYMODULE_CTX_FLAGS_PRIMARY; } else { flags |= VALKEYMODULE_CTX_FLAGS_REPLICA; if (server.repl_replica_ro) flags |= VALKEYMODULE_CTX_FLAGS_READONLY; /* Replica state flags. */ - if (server.repl_state == REPL_STATE_CONNECT || server.repl_state == REPL_STATE_CONNECTING) { + if (server.primary->state == REPL_STATE_CONNECT || server.primary->state == REPL_STATE_CONNECTING) { flags |= VALKEYMODULE_CTX_FLAGS_REPLICA_IS_CONNECTING; - } else if (server.repl_state == REPL_STATE_TRANSFER) { + } else if (server.primary->state == REPL_STATE_TRANSFER) { flags |= VALKEYMODULE_CTX_FLAGS_REPLICA_IS_TRANSFERRING; - } else if (server.repl_state == REPL_STATE_CONNECTED) { + } else if (server.primary->state == REPL_STATE_CONNECTED) { flags |= VALKEYMODULE_CTX_FLAGS_REPLICA_IS_ONLINE; } - if (server.repl_state != REPL_STATE_CONNECTED) flags |= VALKEYMODULE_CTX_FLAGS_REPLICA_IS_STALE; + if (server.primary->state != REPL_STATE_CONNECTED) flags |= VALKEYMODULE_CTX_FLAGS_REPLICA_IS_STALE; } /* OOM flag. 
*/ @@ -6462,7 +6462,7 @@ ValkeyModuleCallReply *VM_Call(ValkeyModuleCtx *ctx, const char *cmdname, const goto cleanup; } - if (server.primary_host && server.repl_replica_ro && !obey_client) { + if (server.primary && server.repl_replica_ro && !obey_client) { errno = ESPIPE; if (error_as_call_replies) { sds msg = sdsdup(shared.roreplicaerr->ptr); @@ -6472,7 +6472,7 @@ ValkeyModuleCallReply *VM_Call(ValkeyModuleCtx *ctx, const char *cmdname, const } } - if (server.primary_host && server.repl_state != REPL_STATE_CONNECTED && server.repl_serve_stale_data == 0 && + if (server.primary && server.primary->state != REPL_STATE_CONNECTED && server.repl_serve_stale_data == 0 && !(cmd_flags & CMD_STALE)) { errno = ESPIPE; if (error_as_call_replies) { @@ -8782,7 +8782,7 @@ int VM_AddPostNotificationJob(ValkeyModuleCtx *ctx, ValkeyModulePostNotificationJobFunc callback, void *privdata, void (*free_privdata)(void *)) { - if (server.loading || (server.primary_host && server.repl_replica_ro)) { + if (server.loading || (server.primary && server.repl_replica_ro)) { return VALKEYMODULE_ERR; } ValkeyModulePostExecUnitJob *job = zmalloc(sizeof(*job)); @@ -13059,7 +13059,7 @@ int VM_RdbLoad(ValkeyModuleCtx *ctx, ValkeyModuleRdbStream *stream, int flags) { } /* Not allowed on replicas. */ - if (server.primary_host != NULL) { + if (server.primary != NULL) { errno = ENOTSUP; return VALKEYMODULE_ERR; } diff --git a/src/networking.c b/src/networking.c index 48e397e6f4..b9712d877a 100644 --- a/src/networking.c +++ b/src/networking.c @@ -290,7 +290,7 @@ int prepareClientToWrite(client *c) { /* Primaries don't receive replies, unless CLIENT_PRIMARY_FORCE_REPLY flag * is set. */ - if (c->flag.primary && !c->flag.primary_force_reply) return C_ERR; + if (c->flag.replication_source && !c->flag.primary_force_reply) return C_ERR; /* Skip the fake client, such as the fake client for AOF loading. * But CLIENT_ID_CACHED_RESPONSE is allowed since it is a fake client @@ -1599,7 +1599,7 @@ void clearClientConnectionState(client *c) { c->flag.replica = 0; } - serverAssert(!(c->flag.replica || c->flag.primary)); + serverAssert(!(c->flag.replica || c->flag.replication_source)); if (c->flag.tracking) disableTracking(c); selectDb(c, 0); @@ -1668,7 +1668,7 @@ void freeClient(client *c) { * * Note that before doing this we make sure that the client is not in * some unexpected state, by checking its flags. */ - if (server.primary && c->flag.primary) { + if (server.primary && server.primary->client == c) { serverLog(LL_NOTICE, "Connection with primary lost."); if (!c->flag.dont_cache_primary && !(c->flag.protocol_error || c->flag.blocked)) { c->flag.close_asap = 0; @@ -1818,7 +1818,7 @@ void beforeNextClient(client *c) { * blocked client as well */ /* Trim the query buffer to the current position. */ - if (c->flag.primary) { + if (c->flag.replication_source) { /* If the client is a primary, trim the querybuf to repl_applied, * since primary client is very special, its querybuf not only * used to parse command, but also proxy to sub-replicas. @@ -2148,7 +2148,7 @@ int postWriteToClient(client *c) { * as an interaction, since we always send REPLCONF ACK commands * that take some time to just fill the socket output buffer. * We just rely on data / pings received for timeout detection. 
*/ - if (!c->flag.primary) c->last_interaction = server.unixtime; + if (!c->flag.replication_source) c->last_interaction = server.unixtime; } if (!clientHasPendingReplies(c)) { c->sentlen = 0; @@ -2236,7 +2236,7 @@ int handleReadResult(client *c) { c->last_interaction = server.unixtime; c->net_input_bytes += c->nread; - if (c->flag.primary) { + if (c->flag.replication_source) { c->repl_data->read_reploff += c->nread; server.stat_net_repl_input_bytes += c->nread; } else { @@ -2642,7 +2642,7 @@ void processInlineBuffer(client *c) { * CLIENT_PROTOCOL_ERROR. */ #define PROTO_DUMP_LEN 128 static void setProtocolError(const char *errstr, client *c) { - if (server.verbosity <= LL_VERBOSE || c->flag.primary) { + if (server.verbosity <= LL_VERBOSE || c->flag.replication_source) { sds client = catClientInfoString(sdsempty(), c, server.hide_user_data_from_log); /* Sample some protocol to given an idea about what was inside. */ @@ -2664,7 +2664,7 @@ static void setProtocolError(const char *errstr, client *c) { } /* Log all the client and protocol info. */ - int loglevel = (c->flag.primary) ? LL_WARNING : LL_VERBOSE; + int loglevel = (c->flag.replication_source) ? LL_WARNING : LL_VERBOSE; serverLog(loglevel, "Protocol error (%s) from client: %s. %s", errstr, client, buf); sdsfree(client); } @@ -2895,7 +2895,7 @@ void commandProcessed(client *c) { if (!c->repl_data) return; long long prev_offset = c->repl_data->reploff; - if (c->flag.primary && !c->flag.multi) { + if (c->flag.replication_source && !c->flag.multi) { /* Update the applied replication offset of our primary. */ c->repl_data->reploff = c->repl_data->read_reploff - sdslen(c->querybuf) + c->qb_pos; } @@ -2906,7 +2906,7 @@ void commandProcessed(client *c) { * applied to the primary state: this quantity, and its corresponding * part of the replication stream, will be propagated to the * sub-replicas and to the replication backlog. */ - if (c->flag.primary) { + if (c->flag.replication_source) { long long applied = c->repl_data->reploff - prev_offset; if (applied) { replicationFeedStreamFromPrimaryStream(c->querybuf + c->repl_data->repl_applied, applied); @@ -3014,7 +3014,7 @@ int canParseCommand(client *c) { * condition on the replica. We want just to accumulate the replication * stream (instead of replying -BUSY like we do with other clients) and * later resume the processing. */ - if (isInsideYieldingLongCommand() && c->flag.primary) return 0; + if (isInsideYieldingLongCommand() && c->flag.replication_source) return 0; /* CLIENT_CLOSE_AFTER_REPLY closes the connection once the reply is * written to the client. Make sure to not let the reply grow after @@ -3033,7 +3033,7 @@ int processInputBuffer(client *c) { break; } - c->read_flags = c->flag.primary ? READ_FLAGS_PRIMARY : 0; + c->read_flags = c->flag.replication_source ? READ_FLAGS_PRIMARY : 0; c->read_flags |= authRequired(c) ? READ_FLAGS_AUTH_REQUIRED : 0; parseCommand(c); @@ -3097,7 +3097,7 @@ void readToQueryBuf(client *c) { /* Primary client needs expand the readlen when meet BIG_ARG(see #9100), * but doesn't need align to the next arg, we can read more data. 
*/ - if (c->flag.primary && readlen < PROTO_IOBUF_LEN) readlen = PROTO_IOBUF_LEN; + if (c->flag.replication_source && readlen < PROTO_IOBUF_LEN) readlen = PROTO_IOBUF_LEN; } if (c->querybuf == NULL) { @@ -3240,7 +3240,7 @@ sds catClientInfoString(sds s, client *client, int hide_user_data) { *p++ = 'S'; } - if (client->flag.primary) *p++ = 'M'; + if (client->flag.replication_source) *p++ = 'M'; if (client->flag.pubsub) *p++ = 'P'; if (client->flag.multi) *p++ = 'x'; if (client->flag.blocked) *p++ = 'b'; @@ -3458,7 +3458,7 @@ void resetCommand(client *c) { flags.replica = 0; } - if (flags.replica || flags.primary || flags.module) { + if (flags.replica || flags.replication_source || flags.module) { addReplyError(c, "can only reset normal client connections"); return; } @@ -4132,7 +4132,7 @@ void helloCommand(client *c) { if (!server.sentinel_mode) { addReplyBulkCString(c, "role"); - addReplyBulkCString(c, server.primary_host ? "replica" : "master"); + addReplyBulkCString(c, server.primary ? "replica" : "master"); } addReplyBulkCString(c, "modules"); @@ -4363,7 +4363,7 @@ size_t getClientMemoryUsage(client *c, size_t *output_buffer_mem_usage) { * CLIENT_TYPE_PRIMARY -> The client representing our replication primary. */ int getClientType(client *c) { - if (c->flag.primary) return CLIENT_TYPE_PRIMARY; + if (c->flag.replication_source) return CLIENT_TYPE_PRIMARY; /* Even though MONITOR clients are marked as replicas, we * want the expose them as normal clients. */ if (c->flag.replica && !c->flag.monitor) return CLIENT_TYPE_REPLICA; diff --git a/src/rdb.c b/src/rdb.c index 0bb5d7d45d..57fae239ad 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -1869,7 +1869,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { if (server.sanitize_dump_payload == SANITIZE_DUMP_CLIENTS) { /* Skip sanitization when loading (an RDB), or getting a RESTORE command * from either the primary or a client using an ACL user with the skip-sanitize-payload flag. */ - int skip = server.loading || (server.current_client && (server.current_client->flag.primary)); + int skip = server.loading || (server.current_client && (server.current_client->flag.replication_source)); if (!skip && server.current_client && server.current_client->user) skip = !!(server.current_client->user->flags & USER_FLAG_SANITIZE_PAYLOAD_SKIP); deep_integrity_validation = !skip; @@ -2934,12 +2934,12 @@ void rdbLoadProgressCallback(rio *r, const void *buf, size_t len) { if (server.loading_process_events_interval_bytes && (r->processed_bytes + len) / server.loading_process_events_interval_bytes > r->processed_bytes / server.loading_process_events_interval_bytes) { - if (server.primary_host && server.repl_state == REPL_STATE_TRANSFER) replicationSendNewlineToPrimary(); + replicationSendNewlineToConnectedLinks(); loadingAbsProgress(r->processed_bytes); processEventsWhileBlocked(); processModuleLoadingProgressEvent(0); } - if (server.repl_state == REPL_STATE_TRANSFER && rioCheckType(r) == RIO_TYPE_CONN) { + if (server.primary && server.primary->state == REPL_STATE_TRANSFER && rioCheckType(r) == RIO_TYPE_CONN) { server.stat_net_repl_input_bytes += len; } } @@ -3526,12 +3526,13 @@ void killRDBChild(void) { /* Spawn an RDB child that writes the RDB to the sockets of the replicas * that are currently in REPLICA_STATE_WAIT_BGSAVE_START state. 
*/
-int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi) {
+int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, list *slot_ranges) {
 listNode *ln;
 listIter li;
 pid_t childpid;
 int pipefds[2], rdb_pipe_write = 0, safe_to_exit_pipe = 0;
 int dual_channel = (req & REPLICA_REQ_RDB_CHANNEL);
+ int aof = (req & REPLICA_REQ_AOF_FORMAT);
 if (hasActiveChildProcess()) return C_ERR;
 serverAssert(server.rdb_pipe_read == -1 && server.rdb_child_exit_pipe == -1);
@@ -3560,7 +3561,7 @@ int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi) {
 server.rdb_child_exit_pipe = pipefds[1]; /* write end */
 }
 /* Collect the connections of the replicas we want to transfer
- * the RDB to, which are i WAIT_BGSAVE_START state. */
+ * the RDB to, which are in WAIT_BGSAVE_START state. */
 int connsnum = 0;
 connection **conns = zmalloc(sizeof(connection *) * listLength(server.replicas));
 server.rdb_pipe_conns = NULL;
@@ -3576,6 +3577,8 @@
 if (replica->repl_data->repl_state == REPLICA_STATE_WAIT_BGSAVE_START) {
 /* Check replica has the exact requirements */
 if (replica->repl_data->replica_req != req) continue;
+ /* No attempt to coalesce slot ranges, just use equality */
+ if (replica->repl_data->slot_ranges != slot_ranges) continue;
 conns[connsnum++] = replica->conn;
 if (dual_channel) {
@@ -3615,7 +3618,16 @@
 }
 serverSetCpuAffinity(server.bgsave_cpulist);
- retval = rdbSaveRioWithEOFMark(req, &rdb, NULL, rsi);
+ if (aof) {
+ serverLog(LL_NOTICE, "Background AOF transfer started by pid %ld", (long)getpid());
+ retval = rewriteAppendOnlyFileRio(&rdb, slot_ranges);
+ rioWrite(&rdb, "*3\r\n", 4);
+ rioWriteBulkString(&rdb, "REPLCONF", 8);
+ rioWriteBulkString(&rdb, "SYNC-PAYLOAD-END", 16);
+ rioWriteBulkLongLong(&rdb, rsi->repl_stream_db);
+ } else {
+ retval = rdbSaveRioWithEOFMark(req, &rdb, NULL, rsi);
+ }
 if (retval == C_OK && rioFlush(&rdb) == 0) retval = C_ERR;
 if (retval == C_OK) {
@@ -3778,7 +3790,7 @@ rdbSaveInfo *rdbPopulateSaveInfo(rdbSaveInfo *rsi) {
 * connects to us, the NULL repl_backlog will trigger a full
 * synchronization, at the same time we will use a new replid and clear
 * replid2. */
- if (!server.primary_host && server.repl_backlog) {
+ if (!server.primary && server.repl_backlog) {
 /* Note that when server.replicas_eldb is -1, it means that this primary
 * didn't apply any write commands after a full synchronization.
 * So we can let repl_stream_db be 0, this allows a restarted replica
@@ -3791,7 +3803,7 @@
 /* If the instance is a replica we need a connected primary
 * in order to fetch the currently selected DB.
*/ if (server.primary) { - rsi->repl_stream_db = server.primary->db->id; + rsi->repl_stream_db = server.primary->client->db->id; return rsi; } diff --git a/src/rdb.h b/src/rdb.h index 7342a926b5..440620e5bb 100644 --- a/src/rdb.h +++ b/src/rdb.h @@ -152,7 +152,7 @@ int rdbSaveObjectType(rio *rdb, robj *o); int rdbLoadObjectType(rio *rdb); int rdbLoad(char *filename, rdbSaveInfo *rsi, int rdbflags); int rdbSaveBackground(int req, char *filename, rdbSaveInfo *rsi, int rdbflags); -int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi); +int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, list *slotRanges); void rdbRemoveTempFile(pid_t childpid, int from_signal); int rdbSaveToFile(const char *filename); int rdbSave(int req, char *filename, rdbSaveInfo *rsi, int rdbflags); diff --git a/src/replication.c b/src/replication.c index 9913d64d65..cecfad5ee5 100644 --- a/src/replication.c +++ b/src/replication.c @@ -47,16 +47,17 @@ #include void replicationDiscardCachedPrimary(void); -void replicationResurrectCachedPrimary(connection *conn); -void replicationResurrectProvisionalPrimary(void); -void replicationSendAck(void); +void replicationResurrectCachedPrimary(replicationLink *link); +void replicationResurrectProvisionalSource(replicationLink *link); +void replicationSendAck(replicationLink *link); int replicaPutOnline(client *replica); void replicaStartCommandStream(client *replica); -int cancelReplicationHandshake(int reconnect); -void replicationSteadyStateInit(void); +int cancelReplicationHandshake(replicationLink *link, int reconnect); +void replicationSteadyStateInit(replicationLink *link); void dualChannelSetupMainConnForPsync(connection *conn); -void dualChannelSyncHandleRdbLoadCompletion(void); -static void dualChannelFullSyncWithPrimary(connection *conn); +int dualChannelSyncHandleRdbLoadCompletion(replicationLink *link); +static void dualChannelFullSyncWithReplicationSource(connection *conn); +void replicationFinishSyncPayload(connection *conn, replicationLink *link, int db); /* We take a global flag to remember if this instance generated an RDB * because of replication, so that we can remove the RDB file in case @@ -537,7 +538,7 @@ void replicationFeedReplicas(int dictid, robj **argv, int argc) { * propagate *identical* replication stream. In this way this replica can * advertise the same replication ID as the primary (since it shares the * primary replication history and has the same backlog and offsets). */ - if (server.primary_host != NULL) return; + if (server.primary != NULL) return; /* If there aren't replicas, and there is no backlog buffer to populate, * we can return ASAP. */ @@ -952,7 +953,7 @@ int primaryTryPartialResynchronization(client *c, long long psync_offset) { * started. * * Returns C_OK on success or C_ERR otherwise. */ -int startBgsaveForReplication(int mincapa, int req) { +int startBgsaveForReplication(int mincapa, int req, list *slot_ranges) { int retval; int socket_target = 0; listIter li; @@ -965,9 +966,10 @@ int startBgsaveForReplication(int mincapa, int req) { /* `SYNC` should have failed with error if we don't support socket and require a filter, assert this here */ serverAssert(socket_target || !(req & REPLICA_REQ_RDB_MASK)); - serverLog(LL_NOTICE, "Starting BGSAVE for SYNC with target: %s using: %s", + serverLog(LL_NOTICE, "Starting BGSAVE for SYNC with target: %s using: %s with format %s", socket_target ? "replicas sockets" : "disk", - (req & REPLICA_REQ_RDB_CHANNEL) ? "dual-channel" : "normal sync"); + (req & REPLICA_REQ_RDB_CHANNEL) ? 
"dual-channel" : "normal sync", + (req & REPLICA_REQ_AOF_FORMAT) ? "AOF" : "RDB"); rdbSaveInfo rsi, *rsiptr; rsiptr = rdbPopulateSaveInfo(&rsi); @@ -975,7 +977,7 @@ int startBgsaveForReplication(int mincapa, int req) { * otherwise replica will miss repl-stream-db. */ if (rsiptr) { if (socket_target) - retval = rdbSaveToReplicasSockets(req, rsiptr); + retval = rdbSaveToReplicasSockets(req, rsiptr, slot_ranges); else { /* Keep the page cache since it'll get used soon */ retval = rdbSaveBackground(req, server.rdb_filename, rsiptr, RDBFLAGS_REPLICATION | RDBFLAGS_KEEP_CACHE); @@ -1046,7 +1048,7 @@ void syncCommand(client *c) { * become a primary if so. */ if (c->argc > 3 && !strcasecmp(c->argv[0]->ptr, "psync") && !strcasecmp(c->argv[3]->ptr, "failover")) { serverLog(LL_NOTICE, "Failover request received for replid %s.", (unsigned char *)c->argv[1]->ptr); - if (!server.primary_host) { + if (server.primary == NULL) { addReplyError(c, "PSYNC FAILOVER can't be sent to a master."); return; } @@ -1074,7 +1076,7 @@ void syncCommand(client *c) { /* Refuse SYNC requests if we are a replica but the link with our primary * is not ok... */ - if (server.primary_host && server.repl_state != REPL_STATE_CONNECTED) { + if (server.primary && server.primary->state != REPL_STATE_CONNECTED) { addReplyError(c, "-NOMASTERLINK Can't SYNC while not connected with my master"); return; } @@ -1096,6 +1098,12 @@ void syncCommand(client *c) { return; } + /* Fail sync if it is asking for AOF format and a slot is not set via REPLCONF already. */ + if (c->repl_data->replica_req & REPLICA_REQ_AOF_FORMAT && c->repl_data->slot_ranges == NULL) { + addReplyError(c, "AOF format is only supported for single slot SYNC"); + return; + } + serverLog(LL_NOTICE, "Replica %s asks for synchronization", replicationGetReplicaName(c)); /* Try a partial resynchronization if this is a PSYNC command. @@ -1171,8 +1179,11 @@ void syncCommand(client *c) { server.replid, server.replid2); } + /* For slot level replication, we make no attempt to coallesce BGSAVEs */ + int require_dedicated = c->repl_data->slot_ranges != NULL; + /* CASE 1: BGSAVE is in progress, with disk target. */ - if (server.child_type == CHILD_TYPE_RDB && server.rdb_child_type == RDB_CHILD_TYPE_DISK) { + if (!require_dedicated && server.child_type == CHILD_TYPE_RDB && server.rdb_child_type == RDB_CHILD_TYPE_DISK) { /* Ok a background save is in progress. Let's check if it is a good * one for replication, i.e. if there is another replica that is * registering differences since the server forked to save. */ @@ -1205,32 +1216,35 @@ void syncCommand(client *c) { * register differences. */ serverLog(LL_NOTICE, "Can't attach the replica to the current BGSAVE. Waiting for next BGSAVE for SYNC"); } + } - /* CASE 2: BGSAVE is in progress, with socket target. */ - } else if (server.child_type == CHILD_TYPE_RDB && server.rdb_child_type == RDB_CHILD_TYPE_SOCKET) { + /* CASE 2: BGSAVE is in progress, with socket target. */ + if (server.child_type == CHILD_TYPE_RDB && server.rdb_child_type == RDB_CHILD_TYPE_SOCKET) { /* There is an RDB child process but it is writing directly to * children sockets. We need to wait for the next BGSAVE * in order to synchronize. */ serverLog(LL_NOTICE, "Current BGSAVE has socket target. Waiting for next BGSAVE for SYNC"); + return; + } - /* CASE 3: There is no BGSAVE is in progress. 
*/
- } else {
- if (server.repl_diskless_sync && (c->repl_data->replica_capa & REPLICA_CAPA_EOF) && server.repl_diskless_sync_delay) {
- /* Diskless replication RDB child is created inside
- * replicationCron() since we want to delay its start a
- * few seconds to wait for more replicas to arrive. */
- serverLog(LL_NOTICE, "Delay next BGSAVE for diskless SYNC");
- } else {
- /* We don't have a BGSAVE in progress, let's start one. Diskless
- * or disk-based mode is determined by replica's capacity. */
- if (!hasActiveChildProcess()) {
- startBgsaveForReplication(c->repl_data->replica_capa, c->repl_data->replica_req);
- } else {
- serverLog(LL_NOTICE, "No BGSAVE in progress, but another BG operation is active. "
- "BGSAVE for replication delayed");
- }
- }
+ /* CASE 3: There is no BGSAVE in progress, but we need to delay. */
+ if (!require_dedicated && server.repl_diskless_sync && (c->repl_data->replica_capa & REPLICA_CAPA_EOF) && server.repl_diskless_sync_delay) {
+ /* Diskless replication RDB child is created inside
+ * replicationCron() since we want to delay its start a
+ * few seconds to wait for more replicas to arrive. */
+ serverLog(LL_NOTICE, "Delay next BGSAVE for diskless SYNC");
+ return;
+ }
+
+ /* CASE 4: We don't have a BGSAVE in progress, but there is an existing child process. */
+ if (hasActiveChildProcess()) {
+ serverLog(LL_NOTICE, "No BGSAVE in progress, but another BG operation is active. "
+ "BGSAVE for replication delayed");
+ return;
 }
+
+ /* CASE 5: We are good to start a BGSAVE. Diskless or disk-based mode is determined by replica's capacity. */
+ startBgsaveForReplication(c->repl_data->replica_capa, c->repl_data->replica_req, c->repl_data->slot_ranges);
 return;
 }
@@ -1294,8 +1308,13 @@ void freeClientReplicationData(client *c) {
 moduleFireServerEvent(VALKEYMODULE_EVENT_REPLICA_CHANGE, VALKEYMODULE_SUBEVENT_REPLICA_CHANGE_OFFLINE, NULL);
 }
- if (c->flag.primary) replicationHandlePrimaryDisconnection();
+ if (c->flag.replication_source) {
+ replicationHandleSourceDisconnection(c->repl_data->link);
+ }
 sdsfree(c->repl_data->replica_addr);
+ if (c->repl_data->slot_ranges) {
+ freeSlotRanges(c->repl_data->slot_ranges);
+ }
 zfree(c->repl_data);
 c->repl_data = NULL;
 }
@@ -1420,7 +1439,7 @@ void replconfCommand(client *c) {
 } else if (!strcasecmp(c->argv[j]->ptr, "getack")) {
 /* REPLCONF GETACK is used in order to request an ACK ASAP
 * to the replica. */
- if (server.primary_host && server.primary) replicationSendAck();
+ if (server.primary && server.primary->client) replicationSendAck(server.primary);
 return;
 } else if (!strcasecmp(c->argv[j]->ptr, "rdb-only")) {
 /* REPLCONF RDB-ONLY is used to identify the client only wants
@@ -1491,6 +1510,45 @@
 return;
 }
 c->repl_data->associated_rdb_client_id = (uint64_t)client_id;
+ } else if (!strcasecmp(c->argv[j]->ptr, "slot-bitmap")) {
+ /* REPLCONF slot-bitmap is used to filter the replication stream to a given set of slots.
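+ * The payload is a raw bulk string of CLUSTER_SLOTS / 8 (2048) bytes, one bit per
+ * slot. As a minimal sketch of the exchange driven by the migration target
+ * (illustrative only; slot 42 stands in for whatever slots are being moved):
+ *
+ *     unsigned char bitmap[CLUSTER_SLOTS / 8] = {0};
+ *     bitmapSetBit(bitmap, 42);
+ *     REPLCONF slot-bitmap <bitmap>    (the 2048-byte bulk string above)
+ *     SYNC                             (the snapshot then arrives in AOF format)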
*/
+ if (!server.cluster_enabled) {
+ addReplyError(c, "Cannot replicate a slot when cluster mode is disabled");
+ return;
+ }
+ if (c->repl_data->slot_ranges != NULL) {
+ addReplyError(c, "Slot bitmap already set");
+ return;
+ }
+ if (stringObjectLen(c->argv[j + 1]) != CLUSTER_SLOTS / 8) {
+ addReplyError(c, "Invalid slot bitmap length");
+ return;
+ }
+ list *slot_ranges;
+ bitmapToSlotRanges(c->argv[j + 1]->ptr, &slot_ranges);
+ for (int slot = 0; slot < CLUSTER_SLOTS; slot++) {
+ if (bitmapTestBit(c->argv[j + 1]->ptr, slot) && server.cluster->slots[slot] != server.cluster->myself) {
+ addReplyErrorFormat(c, "I cannot replicate slot %d since I do not own it", slot);
+ freeSlotRanges(slot_ranges);
+ return;
+ }
+ }
+ c->repl_data->slot_ranges = slot_ranges;
+
+ /* For now, we only support AOF for slot transfer. */
+ c->repl_data->replica_req |= REPLICA_REQ_AOF_FORMAT;
+ } else if (!strcasecmp(c->argv[j]->ptr, "sync-payload-end")) {
+ /* REPLCONF sync-payload-end is used to inform the replica
+ * that the primary has finished sending the sync snapshot, and
+ * that it is free to begin processing the replication backlog.
+ *
+ * dbnum specifies which db to stream the backlog into. */
+ int db_num = 0;
+ if (getIntFromObjectOrReply(c, c->argv[j + 1], &db_num, NULL) != C_OK || db_num < 0 || db_num >= server.dbnum) {
+ addReplyError(c, "Unable to parse DB number");
+ return;
+ }
+ serverLog(LL_NOTICE, "Got sync-payload-end for db %d", db_num);
+
+ replicationFinishSyncPayload(c->conn, c->repl_data->link, db_num);
 } else {
 addReplyErrorFormat(c, "Unrecognized REPLCONF option: %s", (char *)c->argv[j]->ptr);
 return;
@@ -1932,13 +1990,30 @@ void shiftReplicationId(void) {
 /* ----------------------------------- REPLICA -------------------------------- */
+char *replicationGetNameForLogs(replicationLink *link) {
+ if (link == server.primary)
+ return "PRIMARY";
+ if (link->slot_ranges != NULL)
+ return "SLOT OWNER";
+ return "OTHER REPLICATION SOURCE";
+}
+
 /* Returns 1 if the given replication state is a handshake state,
 * 0 otherwise. */
-int replicaIsInHandshakeState(void) {
- return server.repl_state >= REPL_STATE_RECEIVE_PING_REPLY && server.repl_state <= REPL_STATE_RECEIVE_PSYNC_REPLY;
+int replicaIsInHandshakeState(replicationLink *link) {
+ return link->state >= REPL_STATE_RECEIVE_PING_REPLY && link->state <= REPL_STATE_RECEIVE_PSYNC_REPLY;
+}
+
+void replicationSendNewlineOnLink(replicationLink *link) {
+ static time_t newline_sent;
+ if (time(NULL) != newline_sent) {
+ newline_sent = time(NULL);
+ /* Pinging back in this stage is best-effort. */
+ if (link->transfer_s) connWrite(link->transfer_s, "\n", 1);
+ }
 }
-/* Avoid the primary to detect the replica is timing out while loading the
+/* Prevent the primary from detecting replicas as timing out while loading the
 * RDB file in initial synchronization. We send a single newline character
 * that is valid protocol but is guaranteed to either be sent entirely or
 * not, since the byte is indivisible.
@@ -1946,12 +2021,15 @@ int replicaIsInHandshakeState(void) {
 * The function is called in two contexts: while we flush the current
 * data with emptyData(), and while we load the new data received as an
 * RDB file from the primary. */
-void replicationSendNewlineToPrimary(void) {
- static time_t newline_sent;
- if (time(NULL) != newline_sent) {
- newline_sent = time(NULL);
- /* Pinging back in this stage is best-effort.
*/ - if (server.repl_transfer_s) connWrite(server.repl_transfer_s, "\n", 1); +void replicationSendNewlineToConnectedLinks() { + listIter li; + listNode *ln; + listRewind(server.replication_links, &li); + while ((ln = listNext(&li))) { + replicationLink *link = (replicationLink *)ln->value; + if (link->state >= REPL_STATE_CONNECTING && link->state <= REPL_STATE_CANCELLED) { + replicationSendNewlineOnLink(link); + } } } @@ -1960,15 +2038,17 @@ void replicationSendNewlineToPrimary(void) { * after loading succeeded or failed. */ void replicationEmptyDbCallback(hashtable *d) { UNUSED(d); - if (server.repl_state == REPL_STATE_TRANSFER) replicationSendNewlineToPrimary(); + replicationSendNewlineToConnectedLinks(); } /* Once we have a link with the primary and the synchronization was * performed, this function materializes the primary client we store * at server.primary, starting from the specified file descriptor. */ -void replicationCreatePrimaryClientWithHandler(connection *conn, int dbid, ConnectionCallbackFunc handler) { - server.primary = createClient(conn); - if (conn) connSetReadHandler(server.primary->conn, handler); +client *createReplicationLinkClientWithHandler(replicationLink *link, connection *conn, int dbid, ConnectionCallbackFunc handler) { + client *c = createClient(conn); + if (conn) { + connSetReadHandler(conn, handler); + } /** * Important note: @@ -1981,27 +2061,33 @@ void replicationCreatePrimaryClientWithHandler(connection *conn, int dbid, Conne * to pass the execution to a background thread and unblock after the * execution is done. This is the reason why we allow blocking the replication * connection. */ - server.primary->flag.primary = 1; - server.primary->flag.authenticated = 1; - - /* Allocate a private query buffer for the primary client instead of using the shared query buffer. - * This is done because the primary's query buffer data needs to be preserved for my sub-replicas to use. */ - server.primary->querybuf = sdsempty(); - initClientReplicationData(server.primary); - server.primary->repl_data->reploff = server.primary_initial_offset; - server.primary->repl_data->read_reploff = server.primary->repl_data->reploff; - server.primary->user = NULL; /* This client can do everything. */ - memcpy(server.primary->repl_data->replid, server.primary_replid, sizeof(server.primary_replid)); + c->flag.replication_source = 1; + c->flag.authenticated = 1; + + + /* Allocate a private query buffer for the replication link client instead of using the shared query buffer. + * This is done because the replication link's query buffer data needs to be preserved for my sub-replicas to use. */ + c->querybuf = sdsempty(); + initClientReplicationData(c); + c->repl_data->reploff = link->initial_offset; + c->repl_data->read_reploff = c->repl_data->reploff; + c->user = NULL; /* This client can do everything. */ + c->repl_data->link = link; + memcpy(c->repl_data->replid, link->replid, sizeof(link->replid)); + /* If primary offset is set to -1, this primary is old and is not * PSYNC capable, so we flag it accordingly. 
*/ - if (server.primary->repl_data->reploff == -1) server.primary->flag.pre_psync = 1; - if (dbid != -1) selectDb(server.primary, dbid); + if (c->repl_data->reploff == -1) c->flag.pre_psync = 1; + if (dbid != -1) selectDb(c, dbid); + c->repl_data->slot_ranges = link->slot_ranges; + + return c; } /* Wrapper for replicationCreatePrimaryClientWithHandler, init primary connection handler * with ordinary client connection handler */ -void replicationCreatePrimaryClient(connection *conn, int dbid) { - replicationCreatePrimaryClientWithHandler(conn, dbid, readQueryFromClient); +client *createReplicationLinkClient(replicationLink *link, connection *conn, int dbid) { + return createReplicationLinkClientWithHandler(link, conn, dbid, readQueryFromClient); } /* This function will try to re-enable the AOF file after the @@ -2078,13 +2164,75 @@ void disklessLoadDiscardFunctionsLibCtx(functionsLibCtx *temp_functions_lib_ctx) void replicationAttachToNewPrimary(void) { /* Replica starts to apply data from new primary, we must discard the cached * primary structure. */ - serverAssert(server.primary == NULL); + serverAssert(server.primary == NULL || server.primary->client == NULL); replicationDiscardCachedPrimary(); disconnectReplicas(); /* Force our replicas to resync with us as well. */ freeReplicationBacklog(); /* Don't allow our chained replicas to PSYNC. */ } +void replicationFinishSyncPayload(connection *conn, replicationLink *link, int db) { + /* Final setup of the connected replica <- primary link */ + int link_closed = 0; + if (link->rdb_channel_state == REPL_DUAL_CHANNEL_RDB_LOAD) { + if (dualChannelSyncHandleRdbLoadCompletion(link) == C_ERR) { + /* This may happen if, while loading the backlog, our primary is unset */ + serverLog(LL_NOTICE, "%s <-> MYSELF sync: Failed to finalize dual channel load", replicationGetNameForLogs(link)); + freeReplicationLink(link); + link_closed = 1; + } + } else { + /* Client could have been previously created for AOF load. */ + if (!link->client) { + link->client = createReplicationLinkClient(link, link->transfer_s, db); + link->transfer_s = NULL; + } + link->state = REPL_STATE_CONNECTED; + /* Send the initial ACK immediately to put this replica in online state. */ + replicationSendAck(link); + } + + if (!link_closed && link == server.primary) { + server.repl_down_since = 0; + + /* Fire the primary link modules event. */ + moduleFireServerEvent(VALKEYMODULE_EVENT_PRIMARY_LINK_CHANGE, VALKEYMODULE_SUBEVENT_PRIMARY_LINK_UP, NULL); + if (link->state == REPL_STATE_CONNECTED) { + /* After a full resynchronization we use the replication ID and + * offset of the primary. The secondary ID / offset are cleared since + * we are starting a new history. */ + memcpy(server.replid, link->client->repl_data->replid, sizeof(server.replid)); + server.primary_repl_offset = link->client->repl_data->reploff; + } + clearReplicationId2(); + + /* Let's create the replication backlog if needed. Replicas need to + * accumulate the backlog regardless of the fact they have sub-replicas + * or not, in order to behave correctly if they are promoted to + * primaries after a failover. */ + if (server.repl_backlog == NULL) createReplicationBacklog(); + serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Finished with success"); + + if (server.supervised_mode == SUPERVISED_SYSTEMD) { + serverCommunicateSystemd("STATUS=PRIMARY <-> REPLICA sync: Finished with success. Ready to accept connections " + "in read-write mode.\n"); + } + } + + /* Restart the AOF subsystem now that we finished the sync. 
This + * will trigger an AOF rewrite, and when done will start appending + * to the new file. */ + if (server.aof_enabled && server.aof_state != AOF_OFF) restartAOFAfterSYNC(); + + /* In case of dual channel replication sync we want to close the RDB connection + * once the connection is established */ + if (!link_closed && conn == link->rdb_transfer_s) { + connClose(conn); + link->rdb_transfer_s = NULL; + } + return; +} + /* Asynchronously read the SYNC payload we receive from a primary */ #define REPL_MAX_WRITTEN_BEFORE_FSYNC (1024 * 1024 * 8) /* 8 MB */ void readSyncBulkPayload(connection *conn) { @@ -2096,6 +2244,11 @@ void readSyncBulkPayload(connection *conn) { int empty_db_flags = server.repl_replica_lazy_flush ? EMPTYDB_ASYNC : EMPTYDB_NO_FLAGS; off_t left; + replicationLink *link = (replicationLink *)connGetPrivateData(conn); + + /* RDB bulk load will only be used if we are sending all slots. */ + serverAssert(link->slot_ranges == NULL); + /* Static vars used to hold the EOF mark, and the last bytes received * from the server: when they match, we reached the end of the transfer. */ static char eofmark[RDB_EOF_MARK_SIZE]; @@ -2104,10 +2257,10 @@ void readSyncBulkPayload(connection *conn) { /* If repl_transfer_size == -1 we still have to read the bulk length * from the primary reply. */ - if (server.repl_transfer_size == -1) { + if (link->transfer_size == -1) { nread = connSyncReadLine(conn, buf, 1024, server.repl_syncio_timeout * 1000); if (nread == -1) { - serverLog(LL_WARNING, "I/O error reading bulk count from PRIMARY: %s", connGetLastError(conn)); + serverLog(LL_WARNING, "I/O error reading bulk count from %s: %s", replicationGetNameForLogs(link), connGetLastError(conn)); goto error; } else { /* nread here is returned by connSyncReadLine(), which calls syncReadLine() and @@ -2116,18 +2269,19 @@ void readSyncBulkPayload(connection *conn) { } if (buf[0] == '-') { - serverLog(LL_WARNING, "PRIMARY aborted replication with an error: %s", buf + 1); + serverLog(LL_WARNING, "%s aborted replication with an error: %s", replicationGetNameForLogs(link), buf + 1); goto error; } else if (buf[0] == '\0') { /* At this stage just a newline works as a PING in order to take * the connection live. So we refresh our last interaction * timestamp. */ - server.repl_transfer_lastio = server.unixtime; + link->transfer_lastio = server.unixtime; return; } else if (buf[0] != '$') { serverLog(LL_WARNING, - "Bad protocol from PRIMARY, the first byte is not '$' (we received '%s'), are you sure the host " + "Bad protocol from %s, the first byte is not '$' (we received '%s'), are you sure the host " "and port are right?", + replicationGetNameForLogs(link), buf); goto error; } @@ -2148,14 +2302,14 @@ void readSyncBulkPayload(connection *conn) { memset(lastbytes, 0, RDB_EOF_MARK_SIZE); /* Set any repl_transfer_size to avoid entering this code path * at the next call. */ - server.repl_transfer_size = 0; - serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: receiving streamed RDB from primary with EOF %s", + link->transfer_size = 0; + serverLog(LL_NOTICE, "%s <-> REPLICA sync: receiving streamed RDB from primary with EOF %s", replicationGetNameForLogs(link), use_diskless_load ? "to parser" : "to disk"); } else { usemark = 0; - server.repl_transfer_size = strtol(buf + 1, NULL, 10); - serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: receiving %lld bytes from primary %s", - (long long)server.repl_transfer_size, use_diskless_load ? 
"to parser" : "to disk"); + link->transfer_size = strtol(buf + 1, NULL, 10); + serverLog(LL_NOTICE, "%s <-> REPLICA sync: receiving %lld bytes from primary %s", replicationGetNameForLogs(link), + (long long)link->transfer_size, use_diskless_load ? "to parser" : "to disk"); } return; } @@ -2166,7 +2320,7 @@ void readSyncBulkPayload(connection *conn) { if (usemark) { readlen = sizeof(buf); } else { - left = server.repl_transfer_size - server.repl_transfer_read; + left = link->transfer_size - link->transfer_read; readlen = (left < (signed)sizeof(buf)) ? left : (signed)sizeof(buf); } @@ -2176,7 +2330,7 @@ void readSyncBulkPayload(connection *conn) { /* equivalent to EAGAIN */ return; } - serverLog(LL_WARNING, "I/O error trying to sync with PRIMARY: %s", + serverLog(LL_WARNING, "I/O error trying to sync with %s: %s", replicationGetNameForLogs(link), (nread == -1) ? connGetLastError(conn) : "connection lost"); goto error; } @@ -2202,19 +2356,20 @@ void readSyncBulkPayload(connection *conn) { /* Update the last I/O time for the replication transfer (used in * order to detect timeouts during replication), and write what we * got from the socket to the dump file on disk. */ - server.repl_transfer_lastio = server.unixtime; - if ((nwritten = write(server.repl_transfer_fd, buf, nread)) != nread) { + link->transfer_lastio = server.unixtime; + if ((nwritten = write(link->transfer_fd, buf, nread)) != nread) { serverLog(LL_WARNING, "Write error or short write writing to the DB dump file " - "needed for PRIMARY <-> REPLICA synchronization: %s", + "needed for %s <-> REPLICA synchronization: %s", + replicationGetNameForLogs(link), (nwritten == -1) ? strerror(errno) : "short write"); goto error; } - server.repl_transfer_read += nread; + link->transfer_read += nread; /* Delete the last 40 bytes from the file if we reached EOF. */ if (usemark && eof_reached) { - if (ftruncate(server.repl_transfer_fd, server.repl_transfer_read - RDB_EOF_MARK_SIZE) == -1) { + if (ftruncate(link->transfer_fd, link->transfer_read - RDB_EOF_MARK_SIZE) == -1) { serverLog(LL_WARNING, "Error truncating the RDB file received from the primary " "for SYNC: %s", @@ -2226,15 +2381,15 @@ void readSyncBulkPayload(connection *conn) { /* Sync data on disk from time to time, otherwise at the end of the * transfer we may suffer a big delay as the memory buffers are copied * into the actual disk. 
*/ - if (server.repl_transfer_read >= server.repl_transfer_last_fsync_off + REPL_MAX_WRITTEN_BEFORE_FSYNC) { - off_t sync_size = server.repl_transfer_read - server.repl_transfer_last_fsync_off; - rdb_fsync_range(server.repl_transfer_fd, server.repl_transfer_last_fsync_off, sync_size); - server.repl_transfer_last_fsync_off += sync_size; + if (link->transfer_read >= link->transfer_last_fsync_off + REPL_MAX_WRITTEN_BEFORE_FSYNC) { + off_t sync_size = link->transfer_read - link->transfer_last_fsync_off; + rdb_fsync_range(link->transfer_fd, link->transfer_last_fsync_off, sync_size); + link->transfer_last_fsync_off += sync_size; } /* Check if the transfer is now complete */ if (!usemark) { - if (server.repl_transfer_read == server.repl_transfer_size) eof_reached = 1; + if (link->transfer_read == link->transfer_size) eof_reached = 1; } /* If the transfer is yet not complete, we need to read more, so @@ -2297,7 +2452,7 @@ void readSyncBulkPayload(connection *conn) { * It is enabled only on SWAPDB diskless replication when primary replication ID hasn't changed, * because in that state the old content of the db represents a different point in time of the same * data set we're currently receiving from the primary. */ - if (memcmp(server.replid, server.primary_replid, CONFIG_RUN_ID_SIZE) == 0) { + if (memcmp(server.replid, link->replid, CONFIG_RUN_ID_SIZE) == 0) { asyncLoading = 1; } dbarray = diskless_load_tempDb; @@ -2308,29 +2463,34 @@ void readSyncBulkPayload(connection *conn) { replicationAttachToNewPrimary(); /* Even though we are on-empty-db and the database is empty, we still call emptyData. */ - serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Flushing old data"); + serverLog(LL_NOTICE, "%s <-> REPLICA sync: Flushing old data", replicationGetNameForLogs(link)); emptyData(-1, empty_db_flags, replicationEmptyDbCallback); dbarray = server.db; functions_lib_ctx = functionsLibCtxGetCurrent(); } - rioInitWithConn(&rdb, conn, server.repl_transfer_size); + rioInitWithConn(&rdb, conn, link->transfer_size); /* Put the socket in blocking mode to simplify RDB transfer. * We'll restore it when the RDB is received. */ connBlock(conn); connRecvTimeout(conn, server.repl_timeout * 1000); - serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Loading DB in memory"); - startLoading(server.repl_transfer_size, RDBFLAGS_REPLICATION, asyncLoading); + serverLog(LL_NOTICE, "%s <-> REPLICA sync: Loading DB in memory", replicationGetNameForLogs(link)); + startLoading(link->transfer_size, RDBFLAGS_REPLICATION, asyncLoading); + + /* Before loading, ensure that the link won't be freed, even if + * REPLICAOF NO ONE is called in background event processing. */ + link->protected = 1; int loadingFailed = 0; rdbLoadingCtx loadingCtx = {.dbarray = dbarray, .functions_lib_ctx = functions_lib_ctx}; if (rdbLoadRioWithLoadingCtxScopedRdb(&rdb, RDBFLAGS_REPLICATION, &rsi, &loadingCtx) != C_OK) { /* RDB loading failed. */ - serverLog(LL_WARNING, "Failed trying to load the PRIMARY synchronization DB " - "from socket, check server logs."); + serverLog(LL_WARNING, "Failed trying to load the %s synchronization DB " + "from socket, check server logs.", + replicationGetNameForLogs(link)); loadingFailed = 1; } else if (usemark) { /* Verify the end mark is correct. 
*/ @@ -2340,6 +2500,14 @@ void readSyncBulkPayload(connection *conn) { } } + /* After loading, check if replication was cancelled */ + link->protected = 0; + if (link->state == REPL_STATE_CANCELLED) { + /* Link was freed during RDB load */ + serverLog(LL_NOTICE, "%s <-> REPLICA sync: Link to primary closed during diskless RDB load", replicationGetNameForLogs(link)); + loadingFailed = 1; + } + if (loadingFailed) { stopLoading(0); rioFreeConn(&rdb, NULL); @@ -2351,10 +2519,10 @@ void readSyncBulkPayload(connection *conn) { disklessLoadDiscardTempDb(diskless_load_tempDb); disklessLoadDiscardFunctionsLibCtx(temp_functions_lib_ctx); - serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Discarding temporary DB in background"); + serverLog(LL_NOTICE, "%s <-> REPLICA sync: Discarding temporary DB in background", replicationGetNameForLogs(link)); } else { /* Remove the half-loaded data in case we started with an empty replica. */ - serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Discarding the half-loaded data"); + serverLog(LL_NOTICE, "%s <-> REPLICA sync: Discarding the half-loaded data", replicationGetNameForLogs(link)); emptyData(-1, empty_db_flags, replicationEmptyDbCallback); } @@ -2371,7 +2539,7 @@ void readSyncBulkPayload(connection *conn) { * primary structure and force resync of sub-replicas. */ replicationAttachToNewPrimary(); - serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Swapping active DB with loaded DB"); + serverLog(LL_NOTICE, "%s <-> REPLICA sync: Swapping active DB with loaded DB", replicationGetNameForLogs(link)); swapMainDbWithTempDb(diskless_load_tempDb); /* swap existing functions ctx with the temporary one */ @@ -2382,7 +2550,7 @@ void readSyncBulkPayload(connection *conn) { /* Delete the old db as it's useless now. */ disklessLoadDiscardTempDb(diskless_load_tempDb); - serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Discarding old DB in background"); + serverLog(LL_NOTICE, "%s <-> REPLICA sync: Discarding old DB in background", replicationGetNameForLogs(link)); } /* Inform about db change, as replication was diskless and didn't cause a save. */ @@ -2398,20 +2566,22 @@ void readSyncBulkPayload(connection *conn) { } else { /* Make sure the new file (also used for persistence) is fully synced * (not covered by earlier calls to rdb_fsync_range). */ - if (fsync(server.repl_transfer_fd) == -1) { + if (fsync(link->transfer_fd) == -1) { serverLog(LL_WARNING, "Failed trying to sync the temp DB to disk in " - "PRIMARY <-> REPLICA synchronization: %s", + "%s <-> REPLICA synchronization: %s", + replicationGetNameForLogs(link), strerror(errno)); goto error; } /* Rename rdb like renaming rewrite aof asynchronously. 
*/ int old_rdb_fd = open(server.rdb_filename, O_RDONLY | O_NONBLOCK); - if (rename(server.repl_transfer_tmpfile, server.rdb_filename) == -1) { + if (rename(link->transfer_tmpfile, server.rdb_filename) == -1) { serverLog(LL_WARNING, "Failed trying to rename the temp DB into %s in " - "PRIMARY <-> REPLICA synchronization: %s", + "%s <-> REPLICA synchronization: %s", + replicationGetNameForLogs(link), server.rdb_filename, strerror(errno)); if (old_rdb_fd != -1) close(old_rdb_fd); goto error; @@ -2423,7 +2593,8 @@ void readSyncBulkPayload(connection *conn) { if (fsyncFileDir(server.rdb_filename) == -1) { serverLog(LL_WARNING, "Failed trying to sync DB directory %s in " - "PRIMARY <-> REPLICA synchronization: %s", + "%s <-> REPLICA synchronization: %s", + replicationGetNameForLogs(link), server.rdb_filename, strerror(errno)); goto error; } @@ -2435,13 +2606,14 @@ void readSyncBulkPayload(connection *conn) { /* Empty the databases only after the RDB file is ok, that is, before the RDB file * is actually loaded, in case we encounter an error and drop the replication stream * and leave an empty database. */ - serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Flushing old data"); + serverLog(LL_NOTICE, "%s <-> REPLICA sync: Flushing old data", replicationGetNameForLogs(link)); emptyData(-1, empty_db_flags, replicationEmptyDbCallback); - serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Loading DB in memory"); + serverLog(LL_NOTICE, "%s <-> REPLICA sync: Loading DB in memory", replicationGetNameForLogs(link)); if (rdbLoad(server.rdb_filename, &rsi, RDBFLAGS_REPLICATION) != RDB_OK) { - serverLog(LL_WARNING, "Failed trying to load the PRIMARY synchronization " - "DB from disk, check server logs."); + serverLog(LL_WARNING, "Failed trying to load the %s synchronization " + "DB from disk, check server logs.", + replicationGetNameForLogs(link)); if (server.rdb_del_sync_files && allPersistenceDisabled()) { serverLog(LL_NOTICE, "Removing the RDB file obtained from " "the primary. This replica has persistence " @@ -2450,7 +2622,7 @@ void readSyncBulkPayload(connection *conn) { } /* If disk-based RDB loading fails, remove the half-loaded dataset. */ - serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Discarding the half-loaded data"); + serverLog(LL_NOTICE, "%s<-> REPLICA sync: Discarding the half-loaded data", replicationGetNameForLogs(link)); emptyData(-1, empty_db_flags, replicationEmptyDbCallback); /* Note that there's no point in restarting the AOF on sync failure, @@ -2466,61 +2638,17 @@ void readSyncBulkPayload(connection *conn) { bg_unlink(server.rdb_filename); } - zfree(server.repl_transfer_tmpfile); - close(server.repl_transfer_fd); - server.repl_transfer_fd = -1; - server.repl_transfer_tmpfile = NULL; - } - - /* Final setup of the connected replica <- primary link */ - if (conn == server.repl_rdb_transfer_s) { - dualChannelSyncHandleRdbLoadCompletion(); - } else { - replicationCreatePrimaryClient(server.repl_transfer_s, rsi.repl_stream_db); - server.repl_state = REPL_STATE_CONNECTED; - server.repl_down_since = 0; - /* Send the initial ACK immediately to put this replica in online state. */ - replicationSendAck(); + zfree(link->transfer_tmpfile); + close(link->transfer_fd); + link->transfer_fd = -1; + link->transfer_tmpfile = NULL; } - /* Fire the primary link modules event. 
*/ - moduleFireServerEvent(VALKEYMODULE_EVENT_PRIMARY_LINK_CHANGE, VALKEYMODULE_SUBEVENT_PRIMARY_LINK_UP, NULL); - if (server.repl_state == REPL_STATE_CONNECTED) { - /* After a full resynchronization we use the replication ID and - * offset of the primary. The secondary ID / offset are cleared since - * we are starting a new history. */ - memcpy(server.replid, server.primary->repl_data->replid, sizeof(server.replid)); - server.primary_repl_offset = server.primary->repl_data->reploff; - } - clearReplicationId2(); - - /* Let's create the replication backlog if needed. Replicas need to - * accumulate the backlog regardless of the fact they have sub-replicas - * or not, in order to behave correctly if they are promoted to - * primaries after a failover. */ - if (server.repl_backlog == NULL) createReplicationBacklog(); - serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Finished with success"); - - if (server.supervised_mode == SUPERVISED_SYSTEMD) { - serverCommunicateSystemd("STATUS=PRIMARY <-> REPLICA sync: Finished with success. Ready to accept connections " - "in read-write mode.\n"); - } - - /* Restart the AOF subsystem now that we finished the sync. This - * will trigger an AOF rewrite, and when done will start appending - * to the new file. */ - if (server.aof_enabled) restartAOFAfterSYNC(); - - /* In case of dual channel replication sync we want to close the RDB connection - * once the connection is established */ - if (conn == server.repl_rdb_transfer_s) { - connClose(conn); - server.repl_rdb_transfer_s = NULL; - } + replicationFinishSyncPayload(conn, link, rsi.repl_stream_db); return; error: - cancelReplicationHandshake(1); + if (link) cancelReplicationHandshake(link, 1); return; } @@ -2531,7 +2659,8 @@ char *receiveSynchronousResponse(connection *conn) { serverLog(LL_WARNING, "Failed to read response from the server: %s", connGetLastError(conn)); return NULL; } - server.repl_transfer_lastio = server.unixtime; + replicationLink *link = (replicationLink *)connGetPrivateData(conn); + link->transfer_lastio = server.unixtime; return sdsnew(buf); } @@ -2628,35 +2757,34 @@ sds getReplicaPortString(void) { /* Replication: Replica side. * Free replica's local replication buffer */ -void freePendingReplDataBuf(void) { - listRelease(server.pending_repl_data.blocks); - server.pending_repl_data.blocks = NULL; - server.pending_repl_data.len = 0; +void freePendingReplDataBuf(replicationLink *link) { + listRelease(link->pending_repl_data.blocks); + link->pending_repl_data.blocks = NULL; + link->pending_repl_data.len = 0; } /* Replication: Replica side. * Upon dual-channel sync failure, close rdb-connection, reset repl-state, reset * provisional primary struct, and free local replication buffer. 
*/ -void replicationAbortDualChannelSyncTransfer(void) { - serverAssert(server.repl_rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE); +void replicationAbortDualChannelSyncTransfer(replicationLink *link) { + serverAssert(link->rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE); dualChannelServerLog(LL_NOTICE, "Aborting dual channel sync"); - if (server.repl_rdb_transfer_s) { - connClose(server.repl_rdb_transfer_s); - server.repl_rdb_transfer_s = NULL; - } - zfree(server.repl_transfer_tmpfile); - server.repl_transfer_tmpfile = NULL; - if (server.repl_transfer_fd != -1) { - close(server.repl_transfer_fd); - server.repl_transfer_fd = -1; - } - server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_STATE_NONE; - server.repl_provisional_primary.read_reploff = 0; - server.repl_provisional_primary.reploff = 0; - server.repl_provisional_primary.conn = NULL; - server.repl_provisional_primary.dbid = -1; - server.rdb_client_id = -1; - freePendingReplDataBuf(); + if (link->rdb_transfer_s) { + connClose(link->rdb_transfer_s); + link->rdb_transfer_s = NULL; + } + zfree(link->transfer_tmpfile); + link->transfer_tmpfile = NULL; + if (link->transfer_fd != -1) { + close(link->transfer_fd); + link->transfer_fd = -1; + } + link->rdb_channel_state = REPL_DUAL_CHANNEL_STATE_NONE; + link->provisional_source_state.read_reploff = 0; + link->provisional_source_state.reploff = 0; + link->provisional_source_state.dbid = -1; + link->rdb_client_id = -1; + freePendingReplDataBuf(link); return; } @@ -2678,7 +2806,7 @@ int sendCurrentOffsetToReplica(client *replica) { return C_OK; } -static int dualChannelReplHandleHandshake(connection *conn, sds *err) { +static int dualChannelReplHandleHandshake(replicationLink *link, sds *err) { dualChannelServerLog(LL_DEBUG, "Received first reply from primary using rdb connection."); /* AUTH with the primary if required. 
*/ if (server.primary_auth) { @@ -2693,7 +2821,7 @@ static int dualChannelReplHandleHandshake(connection *conn, sds *err) { args[argc] = server.primary_auth; lens[argc] = sdslen(server.primary_auth); argc++; - *err = sendCommandArgv(conn, argc, args, lens); + *err = sendCommandArgv(link->transfer_s, argc, args, lens); if (*err) { dualChannelServerLog(LL_WARNING, "Sending command to primary in dual channel replication handshake: %s", *err); return C_ERR; @@ -2701,7 +2829,7 @@ static int dualChannelReplHandleHandshake(connection *conn, sds *err) { } /* Send replica listening port to primary for clarification */ sds portstr = getReplicaPortString(); - *err = sendCommand(conn, "REPLCONF", "capa", "eof", "rdb-only", "1", "rdb-channel", "1", "listening-port", portstr, + *err = sendCommand(link->transfer_s, "REPLCONF", "capa", "eof", "rdb-only", "1", "rdb-channel", "1", "listening-port", portstr, NULL); sdsfree(portstr); if (*err) { @@ -2709,17 +2837,17 @@ static int dualChannelReplHandleHandshake(connection *conn, sds *err) { return C_ERR; } - if (connSetReadHandler(conn, dualChannelFullSyncWithPrimary) == C_ERR) { + if (connSetReadHandler(link->transfer_s, dualChannelFullSyncWithReplicationSource) == C_ERR) { char conninfo[CONN_INFO_LEN]; dualChannelServerLog(LL_WARNING, "Can't create readable event for SYNC: %s (%s)", strerror(errno), - connGetInfo(conn, conninfo, sizeof(conninfo))); + connGetInfo(link->transfer_s, conninfo, sizeof(conninfo))); return C_ERR; } return C_OK; } -static int dualChannelReplHandleAuthReply(connection *conn, sds *err) { - *err = receiveSynchronousResponse(conn); +static int dualChannelReplHandleAuthReply(replicationLink *link, sds *err) { + *err = receiveSynchronousResponse(link->transfer_s); if (*err == NULL) { dualChannelServerLog(LL_WARNING, "Primary did not respond to auth command during SYNC handshake"); return C_ERR; @@ -2728,12 +2856,11 @@ static int dualChannelReplHandleAuthReply(connection *conn, sds *err) { dualChannelServerLog(LL_WARNING, "Unable to AUTH to Primary: %s", *err); return C_ERR; } - server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_REPLCONF_REPLY; return C_OK; } -static int dualChannelReplHandleReplconfReply(connection *conn, sds *err) { - *err = receiveSynchronousResponse(conn); +static int dualChannelReplHandleReplconfReply(replicationLink *link, sds *err) { + *err = receiveSynchronousResponse(link->transfer_s); if (*err == NULL) { dualChannelServerLog(LL_WARNING, "Primary did not respond to replconf command during SYNC handshake"); return C_ERR; @@ -2744,16 +2871,20 @@ static int dualChannelReplHandleReplconfReply(connection *conn, sds *err) { *err); return C_ERR; } - if (connSyncWrite(conn, "SYNC\r\n", 6, server.repl_syncio_timeout * 1000) == -1) { - dualChannelServerLog(LL_WARNING, "I/O error writing to Primary: %s", connGetLastError(conn)); + if (connSyncWrite(link->transfer_s, "SYNC\r\n", 6, server.repl_syncio_timeout * 1000) == -1) { + dualChannelServerLog(LL_WARNING, "I/O error writing to Primary: %s", connGetLastError(link->transfer_s)); return C_ERR; } return C_OK; } -static int dualChannelReplHandleEndOffsetResponse(connection *conn, sds *err) { +int replicationUseAOFFormatSnapshot(replicationLink *link) { + return link->slot_ranges != NULL; +} + +static int dualChannelReplHandleEndOffsetResponse(replicationLink *link, sds *err) { uint64_t rdb_client_id; - *err = receiveSynchronousResponse(conn); + *err = receiveSynchronousResponse(link->transfer_s); if (*err == NULL) { return C_ERR; } @@ -2771,30 +2902,34 @@ static int 
dualChannelReplHandleEndOffsetResponse(connection *conn, sds *err) { dualChannelServerLog(LL_WARNING, "Received unexpected $ENDOFF response: %s", *err); return C_ERR; } - server.rdb_client_id = rdb_client_id; - server.primary_initial_offset = reploffset; + link->rdb_client_id = rdb_client_id; + link->initial_offset = reploffset; /* Initiate repl_provisional_primary to act as this replica temp primary until RDB is loaded */ - server.repl_provisional_primary.conn = server.repl_transfer_s; - memcpy(server.repl_provisional_primary.replid, primary_replid, sizeof(server.repl_provisional_primary.replid)); - server.repl_provisional_primary.reploff = reploffset; - server.repl_provisional_primary.read_reploff = reploffset; - server.repl_provisional_primary.dbid = dbid; + memcpy(link->provisional_source_state.replid, primary_replid, CONFIG_RUN_ID_SIZE + 1); + link->provisional_source_state.reploff = reploffset; + link->provisional_source_state.read_reploff = reploffset; + link->provisional_source_state.dbid = dbid; /* Now that we have the snapshot end-offset, we can ask for psync from that offset. Prepare the * main connection accordingly.*/ - server.repl_transfer_s->state = CONN_STATE_CONNECTED; - server.repl_state = REPL_STATE_SEND_HANDSHAKE; - serverAssert(connSetReadHandler(server.repl_transfer_s, dualChannelSetupMainConnForPsync) != C_ERR); - dualChannelSetupMainConnForPsync(server.repl_transfer_s); + link->transfer_s->state = CONN_STATE_CONNECTED; + link->state = REPL_STATE_SEND_HANDSHAKE; + serverAssert(connSetReadHandler(link->transfer_s, dualChannelSetupMainConnForPsync) != C_ERR); + dualChannelSetupMainConnForPsync(link->transfer_s); - /* As the next block we will receive using this connection is the rdb, we need to prepare + /* As the next block we will receive using this connection is the snapshot, we need to prepare * the connection accordingly */ - serverAssert(connSetReadHandler(server.repl_rdb_transfer_s, readSyncBulkPayload) != C_ERR); - server.repl_transfer_size = -1; - server.repl_transfer_read = 0; - server.repl_transfer_last_fsync_off = 0; - server.repl_transfer_lastio = server.unixtime; + if (replicationUseAOFFormatSnapshot(link)) { + link->client = createReplicationLinkClientWithHandler(link, link->rdb_transfer_s, -1, readQueryFromClient); + link->rdb_transfer_s = NULL; + } else { + serverAssert(connSetReadHandler(link->rdb_transfer_s, readSyncBulkPayload) != C_ERR); + } + link->transfer_size = -1; + link->transfer_read = 0; + link->transfer_last_fsync_off = 0; + link->transfer_lastio = server.unixtime; return C_OK; } @@ -2802,15 +2937,15 @@ static int dualChannelReplHandleEndOffsetResponse(connection *conn, sds *err) { /* Replication: Replica side. * This connection handler is used to initialize the RDB connection (dual-channel-replication). * Once a replica with dual-channel-replication enabled, denied from PSYNC with its primary, - * dualChannelFullSyncWithPrimary begins its role. The connection handler prepares server.repl_rdb_transfer_s + * dualChannelFullSyncWithReplicationSource begins its role. The connection handler prepares server.repl_rdb_transfer_s * for a rdb stream, and server.repl_transfer_s for incremental replication data stream. 
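+ *
+ * On this connection the handler below walks, in order:
+ *   REPL_DUAL_CHANNEL_SEND_HANDSHAKE         send AUTH (if configured) and
+ *                                            REPLCONF capa/rdb-only/rdb-channel/listening-port
+ *   REPL_DUAL_CHANNEL_RECEIVE_AUTH_REPLY     read the AUTH reply
+ *   REPL_DUAL_CHANNEL_RECEIVE_REPLCONF_REPLY read the REPLCONF reply, then write SYNC
+ *   REPL_DUAL_CHANNEL_RECEIVE_ENDOFF         parse $ENDOFF and prepare the main
+ *                                            connection for PSYNC
+ *   REPL_DUAL_CHANNEL_RDB_LOAD               the snapshot then streams on this connection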
*/ -static void dualChannelFullSyncWithPrimary(connection *conn) { +static void dualChannelFullSyncWithReplicationSource(connection *conn) { char *err = NULL; int ret = 0; - serverAssert(conn == server.repl_rdb_transfer_s); + replicationLink *link = (replicationLink *)connGetPrivateData(conn); /* If this event fired after the user turned the instance into a primary * with REPLICAOF NO ONE we must just return ASAP. */ - if (server.repl_state == REPL_STATE_NONE) { + if (link->state == REPL_STATE_NONE) { goto error; } /* Check for errors in the socket: after a non blocking connect() we @@ -2820,30 +2955,30 @@ static void dualChannelFullSyncWithPrimary(connection *conn) { connGetLastError(conn)); goto error; } - switch (server.repl_rdb_channel_state) { + switch (link->rdb_channel_state) { case REPL_DUAL_CHANNEL_SEND_HANDSHAKE: - ret = dualChannelReplHandleHandshake(conn, &err); - if (ret == C_OK) server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_AUTH_REPLY; + ret = dualChannelReplHandleHandshake(link, &err); + if (ret == C_OK) link->rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_AUTH_REPLY; break; case REPL_DUAL_CHANNEL_RECEIVE_AUTH_REPLY: if (server.primary_auth) { - ret = dualChannelReplHandleAuthReply(conn, &err); - if (ret == C_OK) server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_REPLCONF_REPLY; + ret = dualChannelReplHandleAuthReply(link, &err); + if (ret == C_OK) link->rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_REPLCONF_REPLY; /* Wait for next bulk before trying to read replconf reply. */ break; } - server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_REPLCONF_REPLY; + link->rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_REPLCONF_REPLY; /* fall through */ case REPL_DUAL_CHANNEL_RECEIVE_REPLCONF_REPLY: - ret = dualChannelReplHandleReplconfReply(conn, &err); - if (ret == C_OK) server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_ENDOFF; + ret = dualChannelReplHandleReplconfReply(link, &err); + if (ret == C_OK) link->rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_ENDOFF; break; case REPL_DUAL_CHANNEL_RECEIVE_ENDOFF: - ret = dualChannelReplHandleEndOffsetResponse(conn, &err); - if (ret == C_OK) server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_RDB_LOAD; + ret = dualChannelReplHandleEndOffsetResponse(link, &err); + if (ret == C_OK) link->rdb_channel_state = REPL_DUAL_CHANNEL_RDB_LOAD; break; default: - serverPanic("Unexpected dual replication state: %d", server.repl_rdb_channel_state); + serverPanic("Unexpected dual replication state: %d", link->rdb_channel_state); } if (ret == C_ERR) goto error; sdsfree(err); @@ -2854,29 +2989,29 @@ static void dualChannelFullSyncWithPrimary(connection *conn) { serverLog(LL_WARNING, "Dual channel sync failed with error %s", err); sdsfree(err); } - if (server.repl_transfer_s) { - connClose(server.repl_transfer_s); - server.repl_transfer_s = NULL; + if (link->transfer_s) { + connClose(link->transfer_s); + link->transfer_s = NULL; } - if (server.repl_rdb_transfer_s) { - connClose(server.repl_rdb_transfer_s); - server.repl_rdb_transfer_s = NULL; + if (link->rdb_transfer_s) { + connClose(link->rdb_transfer_s); + link->rdb_transfer_s = NULL; } - if (server.repl_transfer_fd != -1) close(server.repl_transfer_fd); - server.repl_transfer_fd = -1; - server.repl_state = REPL_STATE_CONNECT; - replicationAbortDualChannelSyncTransfer(); + if (link->transfer_fd != -1) close(link->transfer_fd); + link->transfer_fd = -1; + link->state = REPL_STATE_CONNECT; + replicationAbortDualChannelSyncTransfer(link); } /* Replication: Replica side. 
* Initialize server.pending_repl_data infrastructure, we will allocate the buffer * itself once we need it */ -void replDataBufInit(void) { - serverAssert(server.pending_repl_data.blocks == NULL); - server.pending_repl_data.len = 0; - server.pending_repl_data.peak = 0; - server.pending_repl_data.blocks = listCreate(); - server.pending_repl_data.blocks->free = zfree; +void replDataBufInit(replicationLink *link) { + serverAssert(link->pending_repl_data.blocks == NULL); + link->pending_repl_data.len = 0; + link->pending_repl_data.peak = 0; + link->pending_repl_data.blocks = listCreate(); + link->pending_repl_data.blocks->free = zfree; } /* Replication: Replica side. @@ -2887,7 +3022,7 @@ void replStreamProgressCallback(size_t offset, int readlen, time_t *last_progres ((offset + readlen) / server.loading_process_events_interval_bytes > offset / server.loading_process_events_interval_bytes) && (now - *last_progress_callback > server.loading_process_events_interval_ms)) { - replicationSendNewlineToPrimary(); + replicationSendNewlineToConnectedLinks(); processEventsWhileBlocked(); *last_progress_callback = now; } @@ -2902,14 +3037,16 @@ typedef struct replDataBufBlock { /* Replication: Replica side. * Reads replication data from primary into specified repl buffer block */ -int readIntoReplDataBlock(connection *conn, replDataBufBlock *data_block, size_t read) { - int nread = connRead(conn, data_block->buf + data_block->used, read); +int readIntoReplDataBlock(replicationLink *link, replDataBufBlock *data_block, size_t read) { + int nread = connRead(link->transfer_s, data_block->buf + data_block->used, read); if (nread <= 0) { - if (nread == 0 || connGetState(conn) != CONN_STATE_CONNECTED) { + if (nread == 0 || connGetState(link->transfer_s) != CONN_STATE_CONNECTED) { dualChannelServerLog(LL_WARNING, "Provisional primary closed connection"); - /* Signal ongoing RDB load to terminate gracefully */ - if (server.loading_rio) rioCloseASAP(server.loading_rio); - cancelReplicationHandshake(1); + if (link->rdb_channel_state == REPL_DUAL_CHANNEL_RDB_LOAD) { + /* Signal ongoing RDB load to terminate gracefully */ + if (server.loading_rio) rioCloseASAP(server.loading_rio); + } + cancelReplicationHandshake(link, 1); } return C_ERR; } @@ -2924,8 +3061,10 @@ void bufferReplData(connection *conn) { size_t readlen = PROTO_IOBUF_LEN; int remaining_bytes = 0; + replicationLink *link = (replicationLink *)connGetPrivateData(conn); + while (readlen > 0) { - listNode *ln = listLast(server.pending_repl_data.blocks); + listNode *ln = listLast(link->pending_repl_data.blocks); replDataBufBlock *tail = ln ? listNodeValue(ln) : NULL; /* Append to tail string when possible */ @@ -2933,11 +3072,11 @@ void bufferReplData(connection *conn) { size_t avail = tail->size - tail->used; remaining_bytes = min(readlen, avail); readlen -= remaining_bytes; - remaining_bytes = readIntoReplDataBlock(conn, tail, remaining_bytes); + remaining_bytes = readIntoReplDataBlock(link, tail, remaining_bytes); } if (readlen && remaining_bytes == 0) { if (server.client_obuf_limits[CLIENT_TYPE_REPLICA].hard_limit_bytes && - server.pending_repl_data.len > server.client_obuf_limits[CLIENT_TYPE_REPLICA].hard_limit_bytes) { + link->pending_repl_data.len > server.client_obuf_limits[CLIENT_TYPE_REPLICA].hard_limit_bytes) { dualChannelServerLog(LL_NOTICE, "Replication buffer limit reached, stopping buffering."); /* Stop accumulating primary commands. 
*/ connSetReadHandler(conn, NULL); @@ -2952,15 +3091,15 @@ void bufferReplData(connection *conn) { tail = zmalloc_usable(size + sizeof(replDataBufBlock), &usable_size); tail->size = usable_size - sizeof(replDataBufBlock); tail->used = 0; - listAddNodeTail(server.pending_repl_data.blocks, tail); - server.pending_repl_data.len += tail->size; + listAddNodeTail(link->pending_repl_data.blocks, tail); + link->pending_repl_data.len += tail->size; /* Update buffer's peak */ - if (server.pending_repl_data.peak < server.pending_repl_data.len) - server.pending_repl_data.peak = server.pending_repl_data.len; + if (link->pending_repl_data.peak < link->pending_repl_data.len) + link->pending_repl_data.peak = link->pending_repl_data.len; remaining_bytes = min(readlen, tail->size); readlen -= remaining_bytes; - remaining_bytes = readIntoReplDataBlock(conn, tail, remaining_bytes); + remaining_bytes = readIntoReplDataBlock(link, tail, remaining_bytes); } if (remaining_bytes > 0) { /* Stop reading in case we read less than we anticipated */ @@ -2974,29 +3113,34 @@ void bufferReplData(connection *conn) { /* Replication: Replica side. * Streams accumulated replication data into the database while freeing read nodes */ -int streamReplDataBufToDb(client *c) { - serverAssert(c->flag.primary); +int streamReplDataBufToDb(replicationLink *link) { + serverAssert(link->client->flag.replication_source); blockingOperationStarts(); size_t used, offset = 0; listNode *cur = NULL; time_t last_progress_callback = mstime(); - while (server.pending_repl_data.blocks && (cur = listFirst(server.pending_repl_data.blocks))) { + + /* Before loading, protect our link from being destructed. */ + link->protected = 1; + + while (link->pending_repl_data.blocks && (cur = listFirst(link->pending_repl_data.blocks))) { /* Read and process repl data block */ replDataBufBlock *o = listNodeValue(cur); used = o->used; - c->querybuf = sdscatlen(c->querybuf, o->buf, used); - c->repl_data->read_reploff += used; - processInputBuffer(c); - server.pending_repl_data.len -= used; + link->client->querybuf = sdscatlen(link->client->querybuf, o->buf, used); + link->client->repl_data->read_reploff += used; + processInputBuffer(link->client); + link->pending_repl_data.len -= used; offset += used; - listDelNode(server.pending_repl_data.blocks, cur); + listDelNode(link->pending_repl_data.blocks, cur); replStreamProgressCallback(offset, used, &last_progress_callback); } + link->protected = 0; blockingOperationEnds(); - if (!server.pending_repl_data.blocks) { + + if (link->state == REPL_STATE_CANCELLED) { /* If we encounter a `replicaof` command during the replStreamProgressCallback, - * pending_repl_data.blocks will be NULL, and we should return an error and - * abort the current sync session. */ + * we should return an error and abort the current sync session. */ return C_ERR; } return C_OK; @@ -3005,65 +3149,64 @@ int streamReplDataBufToDb(client *c) { /* Replication: Replica side. * After done loading the snapshot using the rdb-channel prepare this replica for steady state by * initializing the primary client, amd stream local incremental buffer into memory. 
*/ -void dualChannelSyncSuccess(void) { - server.primary_initial_offset = server.repl_provisional_primary.reploff; - replicationResurrectProvisionalPrimary(); +int dualChannelSyncSuccess(replicationLink *link) { + link->initial_offset = link->provisional_source_state.reploff; + replicationResurrectProvisionalSource(link); /* Wait for the accumulated buffer to be processed before reading any more replication updates */ - if (server.pending_repl_data.blocks && streamReplDataBufToDb(server.primary) == C_ERR) { + if (link->pending_repl_data.blocks && streamReplDataBufToDb(link) == C_ERR) { /* Sync session aborted during repl data streaming. */ dualChannelServerLog(LL_WARNING, "Failed to stream local replication buffer into memory"); /* Verify sync is still in progress */ - if (server.repl_rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) { - replicationAbortDualChannelSyncTransfer(); - replicationUnsetPrimary(); + if (link->rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) { + replicationAbortDualChannelSyncTransfer(link); } - return; + return C_ERR; } - freePendingReplDataBuf(); + freePendingReplDataBuf(link); dualChannelServerLog(LL_NOTICE, "Successfully streamed replication data into memory"); /* We can resume reading from the primary connection once the local replication buffer has been loaded. */ - replicationSteadyStateInit(); - replicationSendAck(); /* Send ACK to notify primary that replica is synced */ - server.rdb_client_id = -1; - server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_STATE_NONE; + replicationSteadyStateInit(link); + replicationSendAck(link); /* Send ACK to notify primary that replica is synced */ + link->rdb_client_id = -1; + link->rdb_channel_state = REPL_DUAL_CHANNEL_STATE_NONE; + return C_OK; } /* Replication: Replica side. * Main channel successfully established psync with primary. Check whether the rdb channel * has completed its part and act accordingly. */ -int dualChannelSyncHandlePsync(void) { - serverAssert(server.repl_state == REPL_STATE_RECEIVE_PSYNC_REPLY); - if (server.repl_rdb_channel_state < REPL_DUAL_CHANNEL_RDB_LOADED) { +int dualChannelSyncHandlePsync(replicationLink *link) { + serverAssert(link->state == REPL_STATE_RECEIVE_PSYNC_REPLY); + if (link->rdb_channel_state < REPL_DUAL_CHANNEL_RDB_LOADED) { /* RDB is still loading */ - if (connSetReadHandler(server.repl_provisional_primary.conn, bufferReplData) == C_ERR) { + if (connSetReadHandler(link->transfer_s, bufferReplData) == C_ERR) { dualChannelServerLog(LL_WARNING, "Error while setting readable handler: %s", strerror(errno)); - cancelReplicationHandshake(1); + cancelReplicationHandshake(link, 1); return C_ERR; } - replDataBufInit(); + replDataBufInit(link); return C_OK; } - serverAssert(server.repl_rdb_channel_state == REPL_DUAL_CHANNEL_RDB_LOADED); + serverAssert(link->rdb_channel_state == REPL_DUAL_CHANNEL_RDB_LOADED); /* RDB is loaded */ dualChannelServerLog(LL_DEBUG, "Psync established after rdb load"); - dualChannelSyncSuccess(); + dualChannelSyncSuccess(link); return C_OK; } /* Replication: Replica side. * RDB channel done loading the RDB. Check whether the main channel has completed its part * and act accordingly. 
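 * Two orderings are possible: if the main channel has not reached
 * REPL_STATE_TRANSFER yet, we only mark the link as REPL_DUAL_CHANNEL_RDB_LOADED
 * and let dualChannelSyncHandlePsync() finish the job later; otherwise the RDB
 * side arrived last and the sync is completed here.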
*/ -void dualChannelSyncHandleRdbLoadCompletion(void) { - serverAssert(server.repl_rdb_channel_state == REPL_DUAL_CHANNEL_RDB_LOAD); - if (server.repl_state < REPL_STATE_TRANSFER) { +int dualChannelSyncHandleRdbLoadCompletion(replicationLink *link) { + serverAssert(link->rdb_channel_state == REPL_DUAL_CHANNEL_RDB_LOAD); + if (link->state < REPL_STATE_TRANSFER) { /* Main psync channel hasn't been established yet */ - server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_RDB_LOADED; - return; + link->rdb_channel_state = REPL_DUAL_CHANNEL_RDB_LOADED; + return C_OK; } - serverAssert(server.repl_state == REPL_STATE_TRANSFER); - connSetReadHandler(server.repl_transfer_s, NULL); - dualChannelSyncSuccess(); - return; + serverAssert(link->state == REPL_STATE_TRANSFER); + connSetReadHandler(link->transfer_s, NULL); + return dualChannelSyncSuccess(link); } /* Try a partial resynchronization with the primary if we are about to reconnect. @@ -3121,8 +3264,8 @@ void dualChannelSyncHandleRdbLoadCompletion(void) { #define PSYNC_NOT_SUPPORTED 4 #define PSYNC_TRY_LATER 5 #define PSYNC_FULLRESYNC_DUAL_CHANNEL 6 -int replicaTryPartialResynchronization(connection *conn, int read_reply) { - char *psync_replid; +int replicaTryPartialResynchronization(replicationLink *link, int read_reply) { + char *psync_replid = NULL; char psync_offset[32]; sds reply; @@ -3133,21 +3276,25 @@ int replicaTryPartialResynchronization(connection *conn, int read_reply) { * a FULL resync using the PSYNC command we'll set the offset at the * right value, so that this information will be propagated to the * client structure representing the primary into server.primary. */ - server.primary_initial_offset = -1; + link->initial_offset = -1; - if (server.repl_rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) { + if (link->rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) { /* While in dual channel replication, we should use our prepared repl id and offset. 
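 * For illustration: with a provisional replid R and a provisional reploff of
 * 1000, the request sent on the main channel is
 *
 *   PSYNC R 1001
 *
 * i.e. the requested offset is the provisional reploff plus one.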
*/ - psync_replid = server.repl_provisional_primary.replid; - snprintf(psync_offset, sizeof(psync_offset), "%lld", server.repl_provisional_primary.reploff + 1); + psync_replid = link->provisional_source_state.replid; + snprintf(psync_offset, sizeof(psync_offset), "%lld", link->provisional_source_state.reploff + 1); dualChannelServerLog(LL_NOTICE, "Trying a partial resynchronization using main channel (request %s:%s).", psync_replid, psync_offset); + } else if (link != server.primary) { + serverLog(LL_NOTICE, "Partial resynchronization not attempted (not primary replication)"); } else if (server.cached_primary) { psync_replid = server.cached_primary->repl_data->replid; snprintf(psync_offset, sizeof(psync_offset), "%lld", server.cached_primary->repl_data->reploff + 1); serverLog(LL_NOTICE, "Trying a partial resynchronization (request %s:%s).", psync_replid, psync_offset); } else { serverLog(LL_NOTICE, "Partial resynchronization not possible (no cached primary)"); + } + if (!psync_replid) { psync_replid = "?"; memcpy(psync_offset, "-1", 3); } @@ -3155,26 +3302,26 @@ int replicaTryPartialResynchronization(connection *conn, int read_reply) { /* Issue the PSYNC command, if this is a primary with a failover in * progress then send the failover argument to the replica to cause it * to become a primary */ - if (server.failover_state == FAILOVER_IN_PROGRESS) { - reply = sendCommand(conn, "PSYNC", psync_replid, psync_offset, "FAILOVER", NULL); + if (server.failover_state == FAILOVER_IN_PROGRESS && link == server.primary) { + reply = sendCommand(link->transfer_s, "PSYNC", psync_replid, psync_offset, "FAILOVER", NULL); } else { - reply = sendCommand(conn, "PSYNC", psync_replid, psync_offset, NULL); + reply = sendCommand(link->transfer_s, "PSYNC", psync_replid, psync_offset, NULL); } if (reply != NULL) { - serverLog(LL_WARNING, "Unable to send PSYNC to primary: %s", reply); + serverLog(LL_WARNING, "Unable to send PSYNC to source: %s", reply); sdsfree(reply); - connSetReadHandler(conn, NULL); + connSetReadHandler(link->transfer_s, NULL); return PSYNC_WRITE_ERROR; } return PSYNC_WAIT_REPLY; } /* Reading half */ - reply = receiveSynchronousResponse(conn); + reply = receiveSynchronousResponse(link->transfer_s); /* Primary did not reply to PSYNC */ if (reply == NULL) { - connSetReadHandler(conn, NULL); + connSetReadHandler(link->transfer_s, NULL); serverLog(LL_WARNING, "Primary did not reply to PSYNC, will try later"); return PSYNC_TRY_LATER; } @@ -3186,7 +3333,7 @@ int replicaTryPartialResynchronization(connection *conn, int read_reply) { return PSYNC_WAIT_REPLY; } - connSetReadHandler(conn, NULL); + connSetReadHandler(link->transfer_s, NULL); if (!strncmp(reply, "+FULLRESYNC", 11)) { char *replid = NULL, *offset = NULL; @@ -3205,24 +3352,31 @@ int replicaTryPartialResynchronization(connection *conn, int read_reply) { * reply means that the primary supports PSYNC, but the reply * format seems wrong. To stay safe we blank the primary * replid to make sure next PSYNCs will fail. 
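 * For reference, a well formed reply is expected to look like:
 *
 *   +FULLRESYNC <40 hex character replid> <offset>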
*/ - memset(server.primary_replid, 0, CONFIG_RUN_ID_SIZE + 1); + memset(link->replid, 0, CONFIG_RUN_ID_SIZE + 1); } else { - memcpy(server.primary_replid, replid, offset - replid - 1); - server.primary_replid[CONFIG_RUN_ID_SIZE] = '\0'; - server.primary_initial_offset = strtoll(offset, NULL, 10); - serverLog(LL_NOTICE, "Full resync from primary: %s:%lld", server.primary_replid, - server.primary_initial_offset); + memcpy(link->replid, replid, offset - replid - 1); + link->replid[CONFIG_RUN_ID_SIZE] = '\0'; + link->initial_offset = strtoll(offset, NULL, 10); + serverLog(LL_NOTICE, "Full resync from primary: %s:%lld", link->replid, + link->initial_offset); } sdsfree(reply); return PSYNC_FULLRESYNC; } if (!strncmp(reply, "+CONTINUE", 9)) { - if (server.repl_rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) { - /* During dual channel sync sesseion, primary struct is already initialized. */ + if (link->rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) { + /* During dual channel sync session, primary struct is already initialized. */ sdsfree(reply); return PSYNC_CONTINUE; } + if (link != server.primary) { + /* Continuing from a cached primary should only happen when we are syncing for primary replication. */ + sdsfree(reply); + serverLog(LL_WARNING, "Received +CONTINUE response to PSYNC when not doing replication and not performing dual channel sync. Failing PSYNC."); + return PSYNC_NOT_SUPPORTED; + } + /* Partial resync was accepted. */ serverLog(LL_NOTICE, "Successful partial resynchronization with primary."); @@ -3259,7 +3413,7 @@ int replicaTryPartialResynchronization(connection *conn, int read_reply) { /* Setup the replication to continue. */ sdsfree(reply); - replicationResurrectCachedPrimary(conn); + replicationResurrectCachedPrimary(link); /* If this instance was restarted and we read the metadata to * PSYNC from the persistence file, our replication backlog could @@ -3320,16 +3474,16 @@ sds getTryPsyncString(int result) { } } -int dualChannelReplMainConnSendHandshake(connection *conn, sds *err) { +int dualChannelReplMainConnSendHandshake(replicationLink *link, sds *err) { char llstr[LONG_STR_SIZE]; - ull2string(llstr, sizeof(llstr), server.rdb_client_id); - *err = sendCommand(conn, "REPLCONF", "set-rdb-client-id", llstr, NULL); + ull2string(llstr, sizeof(llstr), link->rdb_client_id); + *err = sendCommand(link->transfer_s, "REPLCONF", "set-rdb-client-id", llstr, NULL); if (*err) return C_ERR; return C_OK; } -int dualChannelReplMainConnRecvCapaReply(connection *conn, sds *err) { - *err = receiveSynchronousResponse(conn); +int dualChannelReplMainConnRecvCapaReply(replicationLink *link, sds *err) { + *err = receiveSynchronousResponse(link->transfer_s); if (*err == NULL) return C_ERR; if ((*err)[0] == '-') { dualChannelServerLog(LL_NOTICE, "Primary does not understand REPLCONF identify: %s", *err); @@ -3338,28 +3492,28 @@ int dualChannelReplMainConnRecvCapaReply(connection *conn, sds *err) { return C_OK; } -int dualChannelReplMainConnSendPsync(connection *conn, sds *err) { +int dualChannelReplMainConnSendPsync(replicationLink *link, sds *err) { if (server.debug_pause_after_fork) debugPauseProcess(); - if (replicaTryPartialResynchronization(conn, 0) == PSYNC_WRITE_ERROR) { + if (replicaTryPartialResynchronization(link, 0) == PSYNC_WRITE_ERROR) { dualChannelServerLog(LL_WARNING, "Aborting dual channel sync. 
Write error."); - *err = sdsnew(connGetLastError(conn)); + *err = sdsnew(connGetLastError(link->transfer_s)); return C_ERR; } return C_OK; } -int dualChannelReplMainConnRecvPsyncReply(connection *conn, sds *err) { - int psync_result = replicaTryPartialResynchronization(conn, 1); +int dualChannelReplMainConnRecvPsyncReply(replicationLink *link, sds *err) { + int psync_result = replicaTryPartialResynchronization(link, 1); if (psync_result == PSYNC_WAIT_REPLY) return C_OK; /* Try again later... */ if (psync_result == PSYNC_CONTINUE) { dualChannelServerLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Primary accepted a Partial Resynchronization%s", - server.repl_rdb_transfer_s != NULL ? ", RDB load in background." : "."); + link->rdb_transfer_s != NULL ? ", RDB load in background." : "."); if (server.supervised_mode == SUPERVISED_SYSTEMD) { serverCommunicateSystemd("STATUS=PRIMARY <-> REPLICA sync: Partial Resynchronization accepted. Ready to " "accept connections in read-write mode.\n"); } - dualChannelSyncHandlePsync(); + dualChannelSyncHandlePsync(link); return C_OK; } *err = getTryPsyncString(psync_result); @@ -3373,37 +3527,39 @@ void dualChannelSetupMainConnForPsync(connection *conn) { char *err = NULL; int ret; - switch (server.repl_state) { + replicationLink *link = (replicationLink *)connGetPrivateData(conn); + + switch (link->state) { case REPL_STATE_SEND_HANDSHAKE: - ret = dualChannelReplMainConnSendHandshake(conn, &err); - if (ret == C_OK) server.repl_state = REPL_STATE_RECEIVE_CAPA_REPLY; + ret = dualChannelReplMainConnSendHandshake(link, &err); + if (ret == C_OK) link->state = REPL_STATE_RECEIVE_CAPA_REPLY; break; case REPL_STATE_RECEIVE_CAPA_REPLY: - ret = dualChannelReplMainConnRecvCapaReply(conn, &err); + ret = dualChannelReplMainConnRecvCapaReply(link, &err); if (ret == C_ERR) { break; } - if (ret == C_OK) server.repl_state = REPL_STATE_SEND_PSYNC; + if (ret == C_OK) link->state = REPL_STATE_SEND_PSYNC; sdsfree(err); err = NULL; /* fall through */ case REPL_STATE_SEND_PSYNC: - ret = dualChannelReplMainConnSendPsync(conn, &err); - if (ret == C_OK) server.repl_state = REPL_STATE_RECEIVE_PSYNC_REPLY; + ret = dualChannelReplMainConnSendPsync(link, &err); + if (ret == C_OK) link->state = REPL_STATE_RECEIVE_PSYNC_REPLY; break; case REPL_STATE_RECEIVE_PSYNC_REPLY: - ret = dualChannelReplMainConnRecvPsyncReply(conn, &err); - if (ret == C_OK && server.repl_rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) - server.repl_state = REPL_STATE_TRANSFER; - /* In case the RDB is already loaded, the repl_state will be set during establishPrimaryConnection. */ + ret = dualChannelReplMainConnRecvPsyncReply(link, &err); + if (ret == C_OK && link->rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) + link->state = REPL_STATE_TRANSFER; + /* In case the RDB is already loaded, the repl_state will be set during establishSourceConnection. */ break; default: - serverPanic("Unexpected replication state: %d", server.repl_state); + serverPanic("Unexpected replication state: %d", link->state); } if (ret == C_ERR) { dualChannelServerLog(LL_WARNING, "Aborting dual channel sync. Main channel psync result %d %s", ret, err ? err : ""); - cancelReplicationHandshake(1); + cancelReplicationHandshake(link, 1); } sdsfree(err); } @@ -3486,16 +3642,11 @@ void dualChannelSetupMainConnForPsync(connection *conn) { */ /* This handler fires when the non blocking connect was able to * establish a connection with the primary. 
*/ -void syncWithPrimary(connection *conn) { +void syncWithSource(connection *conn) { char tmpfile[256], *err = NULL; int psync_result; - /* If this event fired after the user turned the instance into a primary - * with REPLICAOF NO ONE we must just return ASAP. */ - if (server.repl_state == REPL_STATE_NONE) { - connClose(conn); - return; - } + replicationLink *link = (replicationLink *)connGetPrivateData(conn); /* Check for errors in the socket: after a non blocking connect() we * may find that the socket is in error state. */ @@ -3505,13 +3656,13 @@ void syncWithPrimary(connection *conn) { } /* Send a PING to check the primary is able to reply without errors. */ - if (server.repl_state == REPL_STATE_CONNECTING) { + if (link->state == REPL_STATE_CONNECTING) { serverLog(LL_NOTICE, "Non blocking connect for SYNC fired the event."); /* Delete the writable event so that the readable event remains * registered and we can wait for the PONG reply. */ - connSetReadHandler(conn, syncWithPrimary); + connSetReadHandler(conn, syncWithSource); connSetWriteHandler(conn, NULL); - server.repl_state = REPL_STATE_RECEIVE_PING_REPLY; + link->state = REPL_STATE_RECEIVE_PING_REPLY; /* Send the PING, don't check for errors at all, we have the timeout * that will take care about this. */ err = sendCommand(conn, "PING", NULL); @@ -3520,7 +3671,7 @@ void syncWithPrimary(connection *conn) { } /* Receive the PONG command. */ - if (server.repl_state == REPL_STATE_RECEIVE_PING_REPLY) { + if (link->state == REPL_STATE_RECEIVE_PING_REPLY) { err = receiveSynchronousResponse(conn); /* The primary did not reply */ @@ -3541,10 +3692,10 @@ void syncWithPrimary(connection *conn) { } sdsfree(err); err = NULL; - server.repl_state = REPL_STATE_SEND_HANDSHAKE; + link->state = REPL_STATE_SEND_HANDSHAKE; } - if (server.repl_state == REPL_STATE_SEND_HANDSHAKE) { + if (link->state == REPL_STATE_SEND_HANDSHAKE) { /* AUTH with the primary if required. */ if (server.primary_auth) { char *args[3] = {"AUTH", NULL, NULL}; @@ -3579,6 +3730,19 @@ void syncWithPrimary(connection *conn) { if (err) goto write_error; } + /* Set the slot number, so that the primary only provides us with the appropriate slot dictionary. */ + if (link->slot_ranges != NULL) { + char *argv[3] = {"REPLCONF", "slot-bitmap", NULL}; + size_t lens[3] = {8, 11, 0}; + unsigned char slot_bitmap[CLUSTER_SLOTS/8 + 1] = {0}; + slotRangesToBitmap(link->slot_ranges, slot_bitmap); + slot_bitmap[CLUSTER_SLOTS/8] = '\0'; + argv[2] = (char *)slot_bitmap; + lens[2] = CLUSTER_SLOTS/8; + err = sendCommandArgv(conn, 3, argv, lens); + if (err) goto write_error; + } + /* Inform the primary of our (replica) capabilities. * * EOF: supports EOF-style RDB transfer for diskless replication. @@ -3594,30 +3758,30 @@ void syncWithPrimary(connection *conn) { err = sendCommand(conn, "REPLCONF", "version", VALKEY_VERSION, NULL); if (err) goto write_error; - server.repl_state = REPL_STATE_RECEIVE_AUTH_REPLY; + link->state = REPL_STATE_RECEIVE_AUTH_REPLY; return; } - if (server.repl_state == REPL_STATE_RECEIVE_AUTH_REPLY && !server.primary_auth) - server.repl_state = REPL_STATE_RECEIVE_PORT_REPLY; + if (link->state == REPL_STATE_RECEIVE_AUTH_REPLY && !server.primary_auth) + link->state = REPL_STATE_RECEIVE_PORT_REPLY; /* Receive AUTH reply. 
*/ - if (server.repl_state == REPL_STATE_RECEIVE_AUTH_REPLY) { + if (link->state == REPL_STATE_RECEIVE_AUTH_REPLY) { err = receiveSynchronousResponse(conn); if (err == NULL) goto no_response_error; if (err[0] == '-') { - serverLog(LL_WARNING, "Unable to AUTH to PRIMARY: %s", err); + serverLog(LL_WARNING, "Unable to AUTH to %s: %s", replicationGetNameForLogs(link), err); sdsfree(err); goto error; } sdsfree(err); err = NULL; - server.repl_state = REPL_STATE_RECEIVE_PORT_REPLY; + link->state = REPL_STATE_RECEIVE_PORT_REPLY; return; } /* Receive REPLCONF listening-port reply. */ - if (server.repl_state == REPL_STATE_RECEIVE_PORT_REPLY) { + if (link->state == REPL_STATE_RECEIVE_PORT_REPLY) { err = receiveSynchronousResponse(conn); if (err == NULL) goto no_response_error; /* Ignore the error if any, not all the Redis OSS versions support @@ -3629,15 +3793,15 @@ void syncWithPrimary(connection *conn) { err); } sdsfree(err); - server.repl_state = REPL_STATE_RECEIVE_IP_REPLY; + link->state = REPL_STATE_RECEIVE_IP_REPLY; return; } - if (server.repl_state == REPL_STATE_RECEIVE_IP_REPLY && !server.replica_announce_ip) - server.repl_state = REPL_STATE_RECEIVE_CAPA_REPLY; + if (link->state == REPL_STATE_RECEIVE_IP_REPLY && !server.replica_announce_ip) + link->state = REPL_STATE_RECEIVE_SLOT_REPLY; /* Receive REPLCONF ip-address reply. */ - if (server.repl_state == REPL_STATE_RECEIVE_IP_REPLY) { + if (link->state == REPL_STATE_RECEIVE_IP_REPLY) { err = receiveSynchronousResponse(conn); if (err == NULL) goto no_response_error; /* Ignore the error if any, not all the Redis OSS versions support @@ -3649,42 +3813,59 @@ void syncWithPrimary(connection *conn) { err); } sdsfree(err); - server.repl_state = REPL_STATE_RECEIVE_CAPA_REPLY; + link->state = REPL_STATE_RECEIVE_SLOT_REPLY; + return; + } + + if (link->state == REPL_STATE_RECEIVE_SLOT_REPLY && link->slot_ranges == NULL) + link->state = REPL_STATE_RECEIVE_CAPA_REPLY; + + if (link->state == REPL_STATE_RECEIVE_SLOT_REPLY) { + err = receiveSynchronousResponse(conn); + if (err == NULL) goto no_response_error; + /* If we sent the slot number, we need it to be properly acked, or we can't do slot migration. */ + if (err[0] == '-') { + serverLog(LL_WARNING, "Source does not understand REPLCONF slot-num. Cannot continue with slot-level sync: %s", err); + sdsfree(err); + goto error; + } + sdsfree(err); + link->state = REPL_STATE_RECEIVE_CAPA_REPLY; return; } /* Receive CAPA reply. */ - if (server.repl_state == REPL_STATE_RECEIVE_CAPA_REPLY) { + if (link->state == REPL_STATE_RECEIVE_CAPA_REPLY) { err = receiveSynchronousResponse(conn); if (err == NULL) goto no_response_error; /* Ignore the error if any, not all the Redis OSS versions support * REPLCONF capa. */ if (err[0] == '-') { serverLog(LL_NOTICE, - "(Non critical) Primary does not understand " + "(Non critical) Source does not understand " "REPLCONF capa: %s", err); } sdsfree(err); err = NULL; - server.repl_state = REPL_STATE_RECEIVE_VERSION_REPLY; + link->state = REPL_STATE_RECEIVE_VERSION_REPLY; return; } /* Receive VERSION reply. */ - if (server.repl_state == REPL_STATE_RECEIVE_VERSION_REPLY) { + if (link->state == REPL_STATE_RECEIVE_VERSION_REPLY) { err = receiveSynchronousResponse(conn); if (err == NULL) goto no_response_error; /* Ignore the error if any. Valkey >= 8 supports REPLCONF VERSION. 
*/ if (err[0] == '-') { serverLog(LL_NOTICE, - "(Non critical) Primary does not understand " + "(Non critical) Source does not understand " "REPLCONF VERSION: %s", err); } sdsfree(err); err = NULL; - server.repl_state = REPL_STATE_SEND_PSYNC; + link->state = REPL_STATE_SEND_PSYNC; } /* Try a partial resynchronization. If we don't have a cached primary @@ -3692,32 +3873,32 @@ void syncWithPrimary(connection *conn) { * to start a full resynchronization so that we get the primary replid * and the global offset, to try a partial resync at the next * reconnection attempt. */ - if (server.repl_state == REPL_STATE_SEND_PSYNC) { - if (replicaTryPartialResynchronization(conn, 0) == PSYNC_WRITE_ERROR) { + if (link->state == REPL_STATE_SEND_PSYNC) { + if (replicaTryPartialResynchronization(link, 0) == PSYNC_WRITE_ERROR) { err = sdsnew("Write error sending the PSYNC command."); abortFailover("Write error to failover target"); goto write_error; } - server.repl_state = REPL_STATE_RECEIVE_PSYNC_REPLY; + link->state = REPL_STATE_RECEIVE_PSYNC_REPLY; return; } /* If reached this point, we should be in REPL_STATE_RECEIVE_PSYNC_REPLY. */ - if (server.repl_state != REPL_STATE_RECEIVE_PSYNC_REPLY) { + if (link->state != REPL_STATE_RECEIVE_PSYNC_REPLY) { serverLog(LL_WARNING, - "syncWithPrimary(): state machine error, " + "syncWithSource(): state machine error, " "state should be RECEIVE_PSYNC but is %d", - server.repl_state); + link->state); goto error; } - psync_result = replicaTryPartialResynchronization(conn, 1); + psync_result = replicaTryPartialResynchronization(link, 1); if (psync_result == PSYNC_WAIT_REPLY) return; /* Try again later... */ /* Check the status of the planned failover. We expect PSYNC_CONTINUE, * but there is nothing technically wrong with a full resync which * could happen in edge cases. */ - if (server.failover_state == FAILOVER_IN_PROGRESS) { + if (server.failover_state == FAILOVER_IN_PROGRESS && link == server.primary) { if (psync_result == PSYNC_CONTINUE || psync_result == PSYNC_FULLRESYNC) { clearFailoverState(); } else { @@ -3750,13 +3931,13 @@ void syncWithPrimary(connection *conn) { if (psync_result == PSYNC_NOT_SUPPORTED) { serverLog(LL_NOTICE, "Retrying with SYNC..."); if (connSyncWrite(conn, "SYNC\r\n", 6, server.repl_syncio_timeout * 1000) == -1) { - serverLog(LL_WARNING, "I/O error writing to PRIMARY: %s", connGetLastError(conn)); + serverLog(LL_WARNING, "I/O error writing to %s: %s", replicationGetNameForLogs(link), connGetLastError(conn)); goto error; } } /* Prepare a suitable temp file for bulk transfer */ - if (!useDisklessLoad()) { + if (!useDisklessLoad() && link->slot_ranges == NULL) { int dfd = -1, maxtries = 5; while (maxtries--) { snprintf(tmpfile, 256, "temp-%d.%ld.rdb", (int)server.unixtime, (long int)getpid()); @@ -3769,24 +3950,30 @@ void syncWithPrimary(connection *conn) { errno = saved_errno; } if (dfd == -1) { - serverLog(LL_WARNING, "Opening the temp file needed for PRIMARY <-> REPLICA synchronization: %s", + serverLog(LL_WARNING, "Opening the temp file needed for %s <-> REPLICA synchronization: %s", replicationGetNameForLogs(link), strerror(errno)); goto error; } - server.repl_transfer_tmpfile = zstrdup(tmpfile); - server.repl_transfer_fd = dfd; + link->transfer_tmpfile = zstrdup(tmpfile); + link->transfer_fd = dfd; + } + + /* We are going to need to do a full resync. 
If we are accepting a single + * slot - make sure we have a clean slate to load it into.*/ + if (link->slot_ranges != NULL) { + dropKeysInSlotRanges(link->slot_ranges, 1); } /* Using dual-channel-replication, the primary responded +DUALCHANNELSYNC. We need to * initialize the RDB channel. */ if (psync_result == PSYNC_FULLRESYNC_DUAL_CHANNEL) { /* Create RDB connection */ - server.repl_rdb_transfer_s = connCreate(connTypeOfReplication()); - if (connConnect(server.repl_rdb_transfer_s, server.primary_host, server.primary_port, server.bind_source_addr, - dualChannelFullSyncWithPrimary) == C_ERR) { - serverLog(LL_WARNING, "Unable to connect to Primary: %s", connGetLastError(server.repl_transfer_s)); - connClose(server.repl_rdb_transfer_s); - server.repl_rdb_transfer_s = NULL; + link->rdb_transfer_s = connCreate(connTypeOfReplication()); + if (connConnect(link->rdb_transfer_s, link->host, link->port, server.bind_source_addr, + dualChannelFullSyncWithReplicationSource) == C_ERR) { + serverLog(LL_WARNING, "Unable to connect to source: %s", connGetLastError(link->transfer_s)); + connClose(link->rdb_transfer_s); + link->rdb_transfer_s = NULL; goto error; } if (connSetReadHandler(conn, NULL) == C_ERR) { @@ -3795,22 +3982,27 @@ void syncWithPrimary(connection *conn) { connGetInfo(conn, conninfo, sizeof(conninfo))); goto error; } - server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_SEND_HANDSHAKE; + link->rdb_channel_state = REPL_DUAL_CHANNEL_SEND_HANDSHAKE; return; } - /* Setup the non blocking download of the bulk file. */ - if (connSetReadHandler(conn, readSyncBulkPayload) == C_ERR) { - char conninfo[CONN_INFO_LEN]; - serverLog(LL_WARNING, "Can't create readable event for SYNC: %s (%s)", strerror(errno), - connGetInfo(conn, conninfo, sizeof(conninfo))); - goto error; + if (replicationUseAOFFormatSnapshot(link)) { + link->client = createReplicationLinkClientWithHandler(link, conn, -1, readQueryFromClient); + link->transfer_s = NULL; + } else { + /* Setup the non blocking download of the bulk file. 
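 * (This is the classic RDB-file path; in the AOF-format snapshot branch above
 * the link instead attaches a client with readQueryFromClient, so the snapshot
 * is consumed as an ordinary command stream.)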
*/ + if (connSetReadHandler(conn, readSyncBulkPayload) == C_ERR) { + char conninfo[CONN_INFO_LEN]; + serverLog(LL_WARNING, "Can't create readable event for SYNC: %s (%s)", strerror(errno), + connGetInfo(conn, conninfo, sizeof(conninfo))); + goto error; + } } - server.repl_state = REPL_STATE_TRANSFER; - server.repl_transfer_size = -1; - server.repl_transfer_read = 0; - server.repl_transfer_last_fsync_off = 0; - server.repl_transfer_lastio = server.unixtime; + link->state = REPL_STATE_TRANSFER; + link->transfer_size = -1; + link->transfer_read = 0; + link->transfer_last_fsync_off = 0; + link->transfer_lastio = server.unixtime; return; no_response_error: /* Handle receiveSynchronousResponse() error when primary has no reply */ @@ -3819,16 +4011,16 @@ void syncWithPrimary(connection *conn) { error: connClose(conn); - server.repl_transfer_s = NULL; - if (server.repl_rdb_transfer_s) { - connClose(server.repl_rdb_transfer_s); - server.repl_rdb_transfer_s = NULL; - } - if (server.repl_transfer_fd != -1) close(server.repl_transfer_fd); - if (server.repl_transfer_tmpfile) zfree(server.repl_transfer_tmpfile); - server.repl_transfer_tmpfile = NULL; - server.repl_transfer_fd = -1; - server.repl_state = REPL_STATE_CONNECT; + link->transfer_s = NULL; + if (link->rdb_transfer_s) { + connClose(link->rdb_transfer_s); + link->rdb_transfer_s = NULL; + } + if (link->transfer_fd != -1) close(link->transfer_fd); + if (link->transfer_tmpfile) zfree(link->transfer_tmpfile); + link->transfer_tmpfile = NULL; + link->transfer_fd = -1; + link->state = REPL_STATE_CONNECT; return; write_error: /* Handle sendCommand() errors. */ @@ -3837,20 +4029,108 @@ void syncWithPrimary(connection *conn) { goto error; } -int connectWithPrimary(void) { - server.repl_transfer_s = connCreate(connTypeOfReplication()); - if (connConnect(server.repl_transfer_s, server.primary_host, server.primary_port, server.bind_source_addr, - syncWithPrimary) == C_ERR) { - serverLog(LL_WARNING, "Unable to connect to PRIMARY: %s", connGetLastError(server.repl_transfer_s)); - connClose(server.repl_transfer_s); - server.repl_transfer_s = NULL; - return C_ERR; +replicationLink *createReplicationLink(char *host, int port, list *slot_ranges) { + replicationLink *result = (replicationLink *)zmalloc(sizeof(replicationLink)); + result->protected = 0; + result->state = REPL_STATE_NONE; + result->rdb_channel_state = REPL_DUAL_CHANNEL_STATE_NONE; + result->slot_ranges = slot_ranges; + result->client = NULL; + result->host = sdsnew(host); + result->port = port; + result->transfer_s = NULL; + result->rdb_transfer_s = NULL; + result->rdb_client_id = -1; + result->replid[0] = '\0'; + result->initial_offset = -1; + result->transfer_size = 0; + result->transfer_read = 0; + result->transfer_last_fsync_off = 0; + result->transfer_fd = -1; + result->transfer_tmpfile = NULL; + result->transfer_lastio = 0; + result->provisional_source_state.replid[0] = '\0'; + result->provisional_source_state.reploff = -1; + result->provisional_source_state.read_reploff = -1; + result->provisional_source_state.dbid = -1; + result->pending_repl_data.blocks = NULL; + result->pending_repl_data.len = 0; + result->pending_repl_data.peak = 0; + listAddNodeTail(server.replication_links, result); + return result; +} + + +int freeReplicationLink(replicationLink *link) { + if (!link) return 0; + + /* Free primary_host before any calls to freeClient since it calls + * replicationHandleSourceDisconnection which can trigger a re-connect + * directly from within that call. 
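 * With link->host set to NULL, replicationHandleSourceDisconnection() will not
 * attempt an automatic reconnect for this link (it only reconnects while the
 * host is non-NULL, see below).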
*/ + sdsfree(link->host); + link->host = NULL; + + cancelReplicationHandshake(link, 0); + if (link->client) { + freeClient(link->client); + link->client = NULL; } + if (link->transfer_s) { + connClose(link->transfer_s); + link->transfer_s = NULL; + } + if (link->rdb_transfer_s) { + connClose(link->rdb_transfer_s); + link->rdb_transfer_s = NULL; + } + if (link->transfer_tmpfile) { + zfree(link->transfer_tmpfile); + link->transfer_tmpfile = NULL; + } + if (link->transfer_fd != -1) { + close(link->transfer_fd); + link->transfer_fd = -1; + } + freePendingReplDataBuf(link); + + /* Unlink this replication link from the server list */ + listIter li; + listNode *ln; + listRewind(server.replication_links, &li); + while ((ln = listNext(&li))) { + replicationLink *elem = (replicationLink *)ln->value; + if (elem == link) { + listDelNode(server.replication_links, ln); + break; + } + } + + /* Keep the link intact if it is protected, but mark it as such */ + if (link->protected) { + link->state = REPL_STATE_CANCELLED; + return 0; + } + zfree(link); + return 1; +} + +int connectReplicationLink(replicationLink *link) { + if (!link) + return C_ERR; + + link->transfer_s = connCreate(connTypeOfReplication()); + connSetPrivateData(link->transfer_s, link); + if (connConnect(link->transfer_s, link->host, link->port, server.bind_source_addr, syncWithSource) == C_ERR) { + serverLog(LL_WARNING, "Unable to connect to %s: %s", replicationGetNameForLogs(link), connGetLastError(link->transfer_s)); + connClose(link->transfer_s); + link->transfer_s = NULL; + return C_ERR; + } - server.repl_transfer_lastio = server.unixtime; - server.repl_state = REPL_STATE_CONNECTING; - serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync started"); + link->transfer_lastio = server.unixtime; + link->state = REPL_STATE_CONNECTING; + serverLog(LL_NOTICE, "%s <-> REPLICA sync started", replicationGetNameForLogs(link)); return C_OK; } @@ -3858,23 +4138,27 @@ int connectWithPrimary(void) { * in progress to undo it. * Never call this function directly, use cancelReplicationHandshake() instead. */ -void undoConnectWithPrimary(void) { - connClose(server.repl_transfer_s); - server.repl_transfer_s = NULL; +void undoConnectWithSource(replicationLink *link) { + if (link->client) { + freeClient(link->client); + } else if (link->transfer_s) { + connClose(link->transfer_s); + link->transfer_s = NULL; + } } /* Abort the async download of the bulk dataset while SYNC-ing with primary. * Never call this function directly, use cancelReplicationHandshake() instead. */ -void replicationAbortSyncTransfer(void) { - serverAssert(server.repl_state == REPL_STATE_TRANSFER); - undoConnectWithPrimary(); - if (server.repl_transfer_fd != -1) { - close(server.repl_transfer_fd); - bg_unlink(server.repl_transfer_tmpfile); - zfree(server.repl_transfer_tmpfile); - server.repl_transfer_tmpfile = NULL; - server.repl_transfer_fd = -1; +void replicationAbortSyncTransfer(replicationLink *link) { + serverAssert(link->state == REPL_STATE_TRANSFER); + undoConnectWithSource(link); + if (link->transfer_fd != -1) { + close(link->transfer_fd); + bg_unlink(link->transfer_tmpfile); + zfree(link->transfer_tmpfile); + link->transfer_tmpfile = NULL; + link->transfer_fd = -1; } } @@ -3883,19 +4167,22 @@ void replicationAbortSyncTransfer(void) { * the initial bulk transfer. * * If there was a replication handshake in progress 1 is returned and - * the replication state (server.repl_state) set to REPL_STATE_CONNECT. + * the replication state (link->state) set to REPL_STATE_CONNECT. 
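 * With this patch the link may instead remain in REPL_STATE_CONNECTING when
 * aborting the transfer has already triggered a reconnect (see the note below).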
* * Otherwise zero is returned and no operation is performed at all. */ -int cancelReplicationHandshake(int reconnect) { - if (server.repl_rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) { - replicationAbortDualChannelSyncTransfer(); - } - if (server.repl_state == REPL_STATE_TRANSFER) { - replicationAbortSyncTransfer(); - server.repl_state = REPL_STATE_CONNECT; - } else if (server.repl_state == REPL_STATE_CONNECTING || replicaIsInHandshakeState()) { - undoConnectWithPrimary(); - server.repl_state = REPL_STATE_CONNECT; +int cancelReplicationHandshake(replicationLink *link, int reconnect) { + if (link->rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) { + replicationAbortDualChannelSyncTransfer(link); + } + if (link->state == REPL_STATE_TRANSFER) { + replicationAbortSyncTransfer(link); + /* Note that disconnection may already trigger reconnect */ + if (link->state == REPL_STATE_CONNECTING) + return 1; + link->state = REPL_STATE_CONNECT; + } else if (link->state == REPL_STATE_CONNECTING || replicaIsInHandshakeState(link)) { + undoConnectWithSource(link); + link->state = REPL_STATE_CONNECT; } else { return 0; } @@ -3904,34 +4191,32 @@ int cancelReplicationHandshake(int reconnect) { /* try to re-connect without waiting for replicationCron, this is needed * for the "diskless loading short read" test. */ - serverLog(LL_NOTICE, "Reconnecting to PRIMARY %s:%d after failure", server.primary_host, server.primary_port); - connectWithPrimary(); + serverLog(LL_NOTICE, "Reconnecting to replication source %s:%d after failure", link->host, link->port); + connectReplicationLink(link); return 1; } /* Set replication to the specified primary address and port. */ void replicationSetPrimary(char *ip, int port, int full_sync_required) { - int was_primary = server.primary_host == NULL; + int was_primary = server.primary == NULL; + int was_connected = server.primary->state == REPL_STATE_CONNECTED; - sdsfree(server.primary_host); - server.primary_host = NULL; if (server.primary) { /* When joining 'myself' to a new primary, set the dont_cache_primary flag * if a full sync is required. This happens when 'myself' was previously * part of a different shard from the new primary. Since 'myself' does not * have the replication history of the shard it is joining, clearing the * cached primary is necessary to ensure proper replication behavior. */ - server.primary->flag.dont_cache_primary = full_sync_required; - freeClient(server.primary); + server.primary->client->flag.dont_cache_primary = full_sync_required; + freeReplicationLink(server.primary); } disconnectAllBlockedClients(); /* Clients blocked in primary, now replica. */ /* Setting primary_host only after the call to freeClient since it calls - * replicationHandlePrimaryDisconnection which can trigger a re-connect + * replicationHandleSourceDisconnection which can trigger a re-connect * directly from within that call. */ - server.primary_host = sdsnew(ip); - server.primary_port = port; + server.primary = createReplicationLink(ip, port, NULL); /* Update oom_score_adj */ setOOMScoreAdj(-1); @@ -3942,8 +4227,6 @@ void replicationSetPrimary(char *ip, int port, int full_sync_required) { * primary, or finishing transferring RDB and preparing loading DB on full * sync with new primary. */ - cancelReplicationHandshake(0); - /* Before destroying our primary state, create a cached primary using * our own parameters, to later PSYNC with the new primary. 
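 * (In this patch replicationCachePrimaryUsingMyself() does this by building a
 * throw-away replicationLink just to synthesize the cached client; see below.)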
*/ if (was_primary && !full_sync_required) { @@ -3956,31 +4239,26 @@ void replicationSetPrimary(char *ip, int port, int full_sync_required) { NULL); /* Fire the primary link modules event. */ - if (server.repl_state == REPL_STATE_CONNECTED) + if (was_connected) moduleFireServerEvent(VALKEYMODULE_EVENT_PRIMARY_LINK_CHANGE, VALKEYMODULE_SUBEVENT_PRIMARY_LINK_DOWN, NULL); - server.repl_state = REPL_STATE_CONNECT; /* Allow trying dual-channel-replication with the new primary. If new primary doesn't * support dual-channel-replication, we will set to 0 afterwards. */ - serverLog(LL_NOTICE, "Connecting to PRIMARY %s:%d", server.primary_host, server.primary_port); - connectWithPrimary(); + serverLog(LL_NOTICE, "Connecting to PRIMARY %s:%d", server.primary->host, server.primary->port); + connectReplicationLink(server.primary); } /* Cancel replication, setting the instance as a primary itself. */ void replicationUnsetPrimary(void) { - if (server.primary_host == NULL) return; /* Nothing to do. */ + if (server.primary == NULL) return; /* Nothing to do. */ /* Fire the primary link modules event. */ - if (server.repl_state == REPL_STATE_CONNECTED) + if (server.primary->state == REPL_STATE_CONNECTED) moduleFireServerEvent(VALKEYMODULE_EVENT_PRIMARY_LINK_CHANGE, VALKEYMODULE_SUBEVENT_PRIMARY_LINK_DOWN, NULL); - /* Clear primary_host first, since the freeClient calls - * replicationHandlePrimaryDisconnection which can attempt to re-connect. */ - sdsfree(server.primary_host); - server.primary_host = NULL; - if (server.primary) freeClient(server.primary); + freeReplicationLink(server.primary); replicationDiscardCachedPrimary(); - cancelReplicationHandshake(0); + /* When a replica is turned into a primary, the current replication ID * (that was inherited from the primary at synchronization time) is * used as secondary ID up to the current offset, and a new replication @@ -3991,7 +4269,6 @@ void replicationUnsetPrimary(void) { * the replicas will be able to partially resync with us, so it will be * a very fast reconnection. */ disconnectReplicas(); - server.repl_state = REPL_STATE_NONE; /* We need to make sure the new primary will start the replication stream * with a SELECT statement. This is forced after a full resync, but @@ -4022,23 +4299,26 @@ void replicationUnsetPrimary(void) { /* This function is called when the replica lose the connection with the * primary into an unexpected way. */ -void replicationHandlePrimaryDisconnection(void) { - /* Fire the primary link modules event. */ - if (server.repl_state == REPL_STATE_CONNECTED) - moduleFireServerEvent(VALKEYMODULE_EVENT_PRIMARY_LINK_CHANGE, VALKEYMODULE_SUBEVENT_PRIMARY_LINK_DOWN, NULL); +void replicationHandleSourceDisconnection(replicationLink *link) { + if (link == server.primary) { + if (link->state == REPL_STATE_CONNECTED && link == server.primary) { + moduleFireServerEvent(VALKEYMODULE_EVENT_PRIMARY_LINK_CHANGE, VALKEYMODULE_SUBEVENT_PRIMARY_LINK_DOWN, NULL); + } + server.repl_down_since = server.unixtime; - server.primary = NULL; - server.repl_state = REPL_STATE_CONNECT; - server.repl_down_since = server.unixtime; - /* We lost connection with our primary, don't disconnect replicas yet, - * maybe we'll be able to PSYNC with our primary later. We'll disconnect - * the replicas only if we'll have to do a full resync with our primary. */ + /* We lost connection with our primary, don't disconnect replicas yet, + * maybe we'll be able to PSYNC with our primary later. 
We'll disconnect + * the replicas only if we'll have to do a full resync with our primary. */ + } + + link->client = NULL; + link->state = REPL_STATE_CONNECT; /* Try to re-connect immediately rather than wait for replicationCron * waiting 1 second may risk backlog being recycled. */ - if (server.primary_host) { - serverLog(LL_NOTICE, "Reconnecting to PRIMARY %s:%d", server.primary_host, server.primary_port); - connectWithPrimary(); + if (link->host) { + serverLog(LL_NOTICE, "Reconnecting to replication source %s:%d", link->host, link->port); + connectReplicationLink(link); } } @@ -4058,7 +4338,7 @@ void replicaofCommand(client *c) { /* The special host/port combination "NO" "ONE" turns the instance * into a primary. Otherwise the new primary address is set. */ if (!strcasecmp(c->argv[1]->ptr, "no") && !strcasecmp(c->argv[2]->ptr, "one")) { - if (server.primary_host) { + if (server.primary) { replicationUnsetPrimary(); sds client = catClientInfoShortString(sdsempty(), c, server.hide_user_data_from_log); serverLog(LL_NOTICE, "PRIMARY MODE enabled (user request from '%s')", client); @@ -4078,7 +4358,7 @@ void replicaofCommand(client *c) { if (getRangeLongFromObjectOrReply(c, c->argv[2], 0, 65535, &port, "Invalid master port") != C_OK) return; /* Check if we are already attached to the specified primary */ - if (server.primary_host && !strcasecmp(server.primary_host, c->argv[1]->ptr) && server.primary_port == port) { + if (server.primary && !strcasecmp(server.primary->host, c->argv[1]->ptr) && server.primary->port == port) { serverLog(LL_NOTICE, "REPLICAOF would result into synchronization " "with the primary we are already connected " "with. No operation performed."); @@ -4090,8 +4370,8 @@ void replicaofCommand(client *c) { * we can continue. */ replicationSetPrimary(c->argv[1]->ptr, port, 0); sds client = catClientInfoShortString(sdsempty(), c, server.hide_user_data_from_log); - serverLog(LL_NOTICE, "REPLICAOF %s:%d enabled (user request from '%s')", server.primary_host, - server.primary_port, client); + serverLog(LL_NOTICE, "REPLICAOF %s:%d enabled (user request from '%s')", server.primary->host, + server.primary->port, client); sdsfree(client); } addReply(c, shared.ok); @@ -4106,7 +4386,7 @@ void roleCommand(client *c) { return; } - if (server.primary_host == NULL) { + if (server.primary == NULL) { listIter li; listNode *ln; void *mbcount; @@ -4138,12 +4418,12 @@ void roleCommand(client *c) { addReplyArrayLen(c, 5); addReplyBulkCBuffer(c, "slave", 5); - addReplyBulkCString(c, server.primary_host); - addReplyLongLong(c, server.primary_port); - if (replicaIsInHandshakeState()) { + addReplyBulkCString(c, server.primary->host); + addReplyLongLong(c, server.primary->port); + if (replicaIsInHandshakeState(server.primary)) { replica_state = "handshake"; } else { - switch (server.repl_state) { + switch (server.primary->state) { case REPL_STATE_NONE: replica_state = "none"; break; case REPL_STATE_CONNECT: replica_state = "connect"; break; case REPL_STATE_CONNECTING: replica_state = "connecting"; break; @@ -4153,16 +4433,15 @@ void roleCommand(client *c) { } } addReplyBulkCString(c, replica_state); - addReplyLongLong(c, server.primary ? server.primary->repl_data->reploff : -1); + addReplyLongLong(c, server.primary->client ? server.primary->client->repl_data->reploff : -1); } } /* Send a REPLCONF ACK command to the primary to inform it about the current * processed offset. If we are not connected with a primary, the command has * no effects. 
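 *
 * For illustration, the command sent on the wire has the form:
 *
 *   REPLCONF ACK <processed-offset> [FACK <fsynced-offset>]
 *
 * where the FACK pair is only appended when an fsynced AOF offset is
 * available (server.fsynced_reploff != -1).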
*/ -void replicationSendAck(void) { - client *c = server.primary; - +void replicationSendAck(replicationLink *link) { + client *c = link->client; if (c != NULL) { int send_fack = server.fsynced_reploff != -1; c->flag.primary_force_reply = 1; @@ -4203,7 +4482,7 @@ void replicationSendAck(void) { * handshake in order to reactivate the cached primary. */ void replicationCachePrimary(client *c) { - serverAssert(server.primary != NULL && server.cached_primary == NULL); + serverAssert(server.primary != NULL && server.primary->client != NULL && server.cached_primary == NULL); serverLog(LL_NOTICE, "Caching the disconnected primary state."); /* Wait for IO operations to be done before proceeding */ @@ -4215,10 +4494,10 @@ void replicationCachePrimary(client *c) { * we want to discard the non processed query buffers and non processed * offsets, including pending transactions, already populated arguments, * pending outputs to the primary. */ - sdsclear(server.primary->querybuf); - server.primary->qb_pos = 0; - server.primary->repl_data->repl_applied = 0; - server.primary->repl_data->read_reploff = server.primary->repl_data->reploff; + sdsclear(c->querybuf); + c->qb_pos = 0; + c->repl_data->repl_applied = 0; + c->repl_data->read_reploff = c->repl_data->reploff; if (c->flag.multi) discardTransaction(c); listEmpty(c->reply); c->sentlen = 0; @@ -4227,9 +4506,9 @@ void replicationCachePrimary(client *c) { resetClient(c); resetClientIOState(c); - /* Save the primary. Server.primary will be set to null later by - * replicationHandlePrimaryDisconnection(). */ - server.cached_primary = server.primary; + /* Save the primary. Server.primary->client will be set to null later by + * replicationHandleSourceDisconnection(). */ + server.cached_primary = c; /* Invalidate the Peer ID cache. */ if (c->peerid) { @@ -4244,8 +4523,8 @@ void replicationCachePrimary(client *c) { /* Caching the primary happens instead of the actual freeClient() call, * so make sure to adjust the replication state. This function will - * also set server.primary to NULL. */ - replicationHandlePrimaryDisconnection(); + * also set server.primary->client to NULL. */ + replicationHandleSourceDisconnection(server.primary); } /* This function is called when a primary is turned into a replica, in order to @@ -4261,24 +4540,27 @@ void replicationCachePrimaryUsingMyself(void) { serverLog(LL_NOTICE, "Before turning into a replica, using my own primary parameters " "to synthesize a cached primary: I may be able to synchronize with " "the new primary with just a partial transfer."); + /* Create a temporary link for the purpose of creating a client. */ + replicationLink *temp_link = createReplicationLink(NULL, 0, NULL); /* This will be used to populate the field server.primary->repl_data->reploff * by replicationCreatePrimaryClient(). We'll later set the created * primary as server.cached_primary, so the replica will use such * offset for PSYNC. */ - server.primary_initial_offset = server.primary_repl_offset; + temp_link->initial_offset = server.primary_repl_offset; /* The primary client we create can be set to any DBID, because * the new primary will start its replication stream with SELECT. */ - replicationCreatePrimaryClient(NULL, -1); + createReplicationLinkClient(temp_link, NULL, -1); /* Use our own ID / offset. */ - memcpy(server.primary->repl_data->replid, server.replid, sizeof(server.replid)); + memcpy(temp_link->client->repl_data->replid, server.replid, sizeof(server.replid)); /* Set as cached primary. 
*/ - unlinkClient(server.primary); - server.cached_primary = server.primary; - server.primary = NULL; + unlinkClient(temp_link->client); + server.cached_primary = temp_link->client; + temp_link->client = NULL; + freeReplicationLink(temp_link); } /* Free a cached primary, called when there are no longer the conditions for @@ -4287,7 +4569,7 @@ void replicationDiscardCachedPrimary(void) { if (server.cached_primary == NULL) return; serverLog(LL_NOTICE, "Discarding previously cached primary state."); - server.cached_primary->flag.primary = 0; + server.cached_primary->flag.replication_source = 0; freeClient(server.cached_primary); server.cached_primary = NULL; } @@ -4295,17 +4577,19 @@ void replicationDiscardCachedPrimary(void) { /* Replication: Replica side. * This method performs the necessary steps to establish a connection with the primary server. * It sets private data, updates flags, and fires an event to notify modules about the primary link change. */ -void establishPrimaryConnection(void) { - connSetPrivateData(server.primary->conn, server.primary); - server.primary->flag.close_after_reply = 0; - server.primary->flag.close_asap = 0; - server.primary->flag.authenticated = 1; - server.primary->last_interaction = server.unixtime; - server.repl_state = REPL_STATE_CONNECTED; - server.repl_down_since = 0; +void establishSourceConnection(replicationLink *link) { + connSetPrivateData(link->client->conn, link->client); + link->client->flag.close_after_reply = 0; + link->client->flag.close_asap = 0; + link->client->flag.authenticated = 1; + link->client->last_interaction = server.unixtime; + link->state = REPL_STATE_CONNECTED; + if (link == server.primary) { + server.repl_down_since = 0; - /* Fire the primary link modules event. */ - moduleFireServerEvent(VALKEYMODULE_EVENT_PRIMARY_LINK_CHANGE, VALKEYMODULE_SUBEVENT_PRIMARY_LINK_UP, NULL); + /* Fire the primary link modules event. */ + moduleFireServerEvent(VALKEYMODULE_EVENT_PRIMARY_LINK_CHANGE, VALKEYMODULE_SUBEVENT_PRIMARY_LINK_UP, NULL); + } } /* Replication: Replica side. @@ -4315,34 +4599,38 @@ void establishPrimaryConnection(void) { * This function is called when successfully setup a partial resynchronization * so the stream of data that we'll receive will start from where this * primary left. */ -void replicationResurrectCachedPrimary(connection *conn) { - server.primary = server.cached_primary; +void replicationResurrectCachedPrimary(replicationLink *link) { + serverAssert(link == server.primary); + link->client = server.cached_primary; server.cached_primary = NULL; - server.primary->conn = conn; - establishPrimaryConnection(); + /* The client takes ownership of the connection now. */ + link->client->conn = link->transfer_s; + link->transfer_s = NULL; + + establishSourceConnection(link); /* Re-add to the list of clients. */ - linkClient(server.primary); - replicationSteadyStateInit(); + linkClient(link->client); + replicationSteadyStateInit(link); } /* Replication: Replica side. * Prepare replica to steady state. * prerequisite: server.primary is already initialized and linked in client list. */ -void replicationSteadyStateInit(void) { - if (connSetReadHandler(server.primary->conn, readQueryFromClient)) { +void replicationSteadyStateInit(replicationLink *link) { + if (connSetReadHandler(link->client->conn, readQueryFromClient)) { serverLog(LL_WARNING, "Error resurrecting the cached primary, impossible to add the readable handler: %s", strerror(errno)); - freeClientAsync(server.primary); /* Close ASAP. 
*/ + freeClientAsync(link->client); /* Close ASAP. */ } /* We may also need to install the write handler as well if there is * pending data in the write buffers. */ - if (clientHasPendingReplies(server.primary)) { - if (connSetWriteHandler(server.primary->conn, sendReplyToClient)) { + if (clientHasPendingReplies(link->client)) { + if (connSetWriteHandler(link->client->conn, sendReplyToClient)) { serverLog(LL_WARNING, "Error resurrecting the cached primary, impossible to add the writable handler: %s", strerror(errno)); - freeClientAsync(server.primary); /* Close ASAP. */ + freeClientAsync(link->client); /* Close ASAP. */ } } } @@ -4350,16 +4638,19 @@ void replicationSteadyStateInit(void) { /* Replication: Replica side. * Turn the provisional primary into the current primary. * This function is called after dual channel sync is finished successfully. */ -void replicationResurrectProvisionalPrimary(void) { - /* Create a primary client, but do not initialize the read handler yet, as this replica still has a local buffer to +void replicationResurrectProvisionalSource(replicationLink *link) { + /* Create a client, but do not initialize the read handler yet, as this replica still has a local buffer to * drain. */ - replicationCreatePrimaryClientWithHandler(server.repl_transfer_s, server.repl_provisional_primary.dbid, NULL); - memcpy(server.primary->repl_data->replid, server.repl_provisional_primary.replid, sizeof(server.repl_provisional_primary.replid)); - server.primary->repl_data->reploff = server.repl_provisional_primary.reploff; - server.primary->repl_data->read_reploff = server.repl_provisional_primary.read_reploff; - server.primary_repl_offset = server.primary->repl_data->reploff; - memcpy(server.replid, server.primary->repl_data->replid, sizeof(server.primary->repl_data->replid)); - establishPrimaryConnection(); + createReplicationLinkClientWithHandler(link, link->transfer_s, link->provisional_source_state.dbid, NULL); + link->transfer_s = NULL; /* link->client now takes ownership of this connection */ + memcpy(link->client->repl_data->replid, link->provisional_source_state.replid, sizeof(link->provisional_source_state.replid)); + link->client->repl_data->reploff = link->provisional_source_state.reploff; + link->client->repl_data->read_reploff = link->provisional_source_state.read_reploff; + if (link == server.primary) { + server.primary_repl_offset = link->client->repl_data->reploff; + memcpy(server.replid, link->client->repl_data->replid, sizeof(link->client->repl_data->replid)); + } + establishSourceConnection(link); } /* ------------------------- MIN-REPLICAS-TO-WRITE --------------------------- */ @@ -4386,7 +4677,7 @@ void refreshGoodReplicasCount(void) { /* return true if status of good replicas is OK. otherwise false */ int checkGoodReplicasStatus(void) { - return server.primary_host || /* not a primary status should be OK */ + return server.primary || /* not a primary status should be OK */ !server.repl_min_replicas_max_lag || /* Min replica max lag not configured */ !server.repl_min_replicas_to_write || /* Min replica to write not configured */ server.repl_good_replicas_count >= server.repl_min_replicas_to_write; /* check if we have enough replicas */ @@ -4479,7 +4770,7 @@ void waitCommand(client *c) { long numreplicas, ackreplicas; long long offset = getClientWriteOffset(c); - if (server.primary_host) { + if (server.primary) { addReplyError( c, "WAIT cannot be used with replica instances. 
Please also note that if a replica is configured to be " "writable (which is not the default) writes to replicas are just local and are not propagated."); @@ -4517,7 +4808,7 @@ void waitaofCommand(client *c) { if (getPositiveLongFromObjectOrReply(c, c->argv[2], &numreplicas, NULL) != C_OK) return; if (getTimeoutFromObjectOrReply(c, c->argv[3], &timeout, UNIT_MILLISECONDS) != C_OK) return; - if (server.primary_host) { + if (server.primary) { addReplyError(c, "WAITAOF cannot be used with replica instances. Please also note that writes to replicas are " "just local and are not propagated."); return; @@ -4638,9 +4929,9 @@ void processClientsWaitingReplicas(void) { long long replicationGetReplicaOffset(void) { long long offset = 0; - if (server.primary_host != NULL) { - if (server.primary) { - offset = server.primary->repl_data->reploff; + if (server.primary != NULL) { + if (server.primary->client) { + offset = server.primary->client->repl_data->reploff; } else if (server.cached_primary) { offset = server.cached_primary->repl_data->reploff; } @@ -4664,44 +4955,48 @@ void replicationCron(void) { updateFailoverStatus(); /* Non blocking connection timeout? */ - if (server.primary_host && (server.repl_state == REPL_STATE_CONNECTING || replicaIsInHandshakeState()) && - (time(NULL) - server.repl_transfer_lastio) > server.repl_timeout) { - serverLog(LL_WARNING, "Timeout connecting to the PRIMARY..."); - cancelReplicationHandshake(1); - } + listNode *ln; + listIter li; + listRewind(server.replication_links, &li); + while ((ln = listNext(&li))) { + replicationLink *link = (replicationLink *)ln->value; + if ((link->state == REPL_STATE_CONNECTING || replicaIsInHandshakeState(link)) && + (time(NULL) - link->transfer_lastio) > server.repl_timeout) { + serverLog(LL_WARNING, "Timeout connecting to %s...", replicationGetNameForLogs(link)); + cancelReplicationHandshake(link, 1); + } - /* Bulk transfer I/O timeout? */ - if (server.primary_host && server.repl_state == REPL_STATE_TRANSFER && - (time(NULL) - server.repl_transfer_lastio) > server.repl_timeout) { - serverLog(LL_WARNING, "Timeout receiving bulk data from PRIMARY... If the problem persists try to set the " - "'repl-timeout' parameter in valkey.conf to a larger value."); - cancelReplicationHandshake(1); - } + /* Bulk transfer I/O timeout? */ + if (link && link->state == REPL_STATE_TRANSFER && + (time(NULL) - link->transfer_lastio) > server.repl_timeout) { + serverLog(LL_WARNING, "Timeout receiving bulk data from %s... If the problem persists try to set the " + "'repl-timeout' parameter in valkey.conf to a larger value.", replicationGetNameForLogs(link)); + cancelReplicationHandshake(link, 1); + } - /* Timed out primary when we are an already connected replica? */ - if (server.primary_host && server.repl_state == REPL_STATE_CONNECTED && - (time(NULL) - server.primary->last_interaction) > server.repl_timeout) { - serverLog(LL_WARNING, "PRIMARY timeout: no data nor PING received..."); - freeClient(server.primary); - } + /* Timed out primary when we are an already connected replica? 
*/ + if (link && link->state == REPL_STATE_CONNECTED && + (time(NULL) - link->client->last_interaction) > server.repl_timeout) { + serverLog(LL_WARNING, "%s timeout: no data nor PING received...", replicationGetNameForLogs(link)); + freeClient(link->client); /* free client will attempt reconnect */ + } - /* Check if we should connect to a PRIMARY */ - if (server.repl_state == REPL_STATE_CONNECT) { - serverLog(LL_NOTICE, "Connecting to PRIMARY %s:%d", server.primary_host, server.primary_port); - connectWithPrimary(); - } + /* Check if we should connect to a replication source */ + if (link && link->state == REPL_STATE_CONNECT) { + serverLog(LL_NOTICE, "Connecting to %s %s:%d", replicationGetNameForLogs(link), link->host, link->port); + connectReplicationLink(link); + } - /* Send ACK to primary from time to time. - * Note that we do not send periodic acks to primary that don't - * support PSYNC and replication offsets. */ - if (server.primary_host && server.primary && !(server.primary->flag.pre_psync)) replicationSendAck(); + /* Send ACK to replication sources from time to time. + * Note that we do not send periodic acks to replication sources that don't + * support PSYNC and replication offsets. */ + if (link && link->client && !(link->client->flag.pre_psync)) replicationSendAck(link); + } /* If we have attached replicas, PING them from time to time. * So replicas can implement an explicit timeout to primaries, and will * be able to detect a link disconnection even if the TCP connection * will not actually go down. */ - listIter li; - listNode *ln; robj *ping_argv[1]; /* First, send PING according to ping_replica_period. */ @@ -4788,7 +5083,7 @@ void replicationCron(void) { * backlog, in order to reply to PSYNC queries if they are turned into * primaries after a failover. */ if (listLength(server.replicas) == 0 && server.repl_backlog_time_limit && server.repl_backlog && - server.primary_host == NULL) { + server.primary == NULL) { time_t idle = server.unixtime - server.repl_no_replicas_since; if (idle > server.repl_backlog_time_limit) { @@ -4838,7 +5133,7 @@ void replicationCron(void) { replication_cron_loops++; /* Incremented with frequency 1 HZ. */ } -int shouldStartChildReplication(int *mincapa_out, int *req_out) { +int shouldStartChildReplication(int *mincapa_out, int *req_out, list **slot_ranges_out) { /* We should start a BGSAVE good for replication if we have replicas in * WAIT_BGSAVE_START state. 
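
/* Standalone sketch of the per-link cron pattern used above, with placeholder
 * types and states (LINK_CONNECTING/LINK_TRANSFER/LINK_CONNECTED are stand-ins,
 * not the server's enums): every replication link is checked independently for
 * handshake, bulk-transfer and steady-state timeouts against the same repl
 * timeout value. */
#include <stdio.h>
#include <time.h>

typedef enum { LINK_CONNECTING, LINK_TRANSFER, LINK_CONNECTED } linkState;

typedef struct demoLink {
    const char *name;   /* e.g. "PRIMARY" or "SLOT OWNER" */
    linkState state;
    time_t last_io;     /* unix time of the latest read on this link */
} demoLink;

static void checkLinkTimeouts(demoLink *links, int nlinks, time_t now, time_t timeout) {
    for (int i = 0; i < nlinks; i++) {
        demoLink *l = &links[i];
        if (now - l->last_io <= timeout) continue;
        switch (l->state) {
        case LINK_CONNECTING: printf("%s: handshake timed out, cancel and retry\n", l->name); break;
        case LINK_TRANSFER:   printf("%s: bulk transfer timed out, cancel sync\n", l->name); break;
        case LINK_CONNECTED:  printf("%s: no data nor PING received, drop link\n", l->name); break;
        }
    }
}

int main(void) {
    demoLink links[] = {
        {"PRIMARY", LINK_CONNECTED, time(NULL) - 120},
        {"SLOT OWNER", LINK_TRANSFER, time(NULL)},
    };
    checkLinkTimeouts(links, 2, time(NULL), 60);
    return 0;
}
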
* @@ -4850,6 +5145,7 @@ int shouldStartChildReplication(int *mincapa_out, int *req_out) { int replicas_waiting = 0; int mincapa; int req; + list *slot_ranges; int first = 1; listNode *ln; listIter li; @@ -4861,6 +5157,7 @@ int shouldStartChildReplication(int *mincapa_out, int *req_out) { if (first) { /* Get first replica's requirements */ req = replica->repl_data->replica_req; + slot_ranges = replica->repl_data->slot_ranges; } else if (req != replica->repl_data->replica_req) { /* Skip replicas that don't match */ continue; @@ -4879,6 +5176,7 @@ int shouldStartChildReplication(int *mincapa_out, int *req_out) { max_idle >= server.repl_diskless_sync_delay)) { if (mincapa_out) *mincapa_out = mincapa; if (req_out) *req_out = req; + if (slot_ranges_out) *slot_ranges_out = slot_ranges; return 1; } } @@ -4889,12 +5187,13 @@ int shouldStartChildReplication(int *mincapa_out, int *req_out) { void replicationStartPendingFork(void) { int mincapa = -1; int req = -1; + list *slot_ranges = NULL; - if (shouldStartChildReplication(&mincapa, &req)) { + if (shouldStartChildReplication(&mincapa, &req, &slot_ranges)) { /* Start the BGSAVE. The called function may start a * BGSAVE with socket target or disk target depending on the * configuration and replicas capabilities and requirements. */ - startBgsaveForReplication(mincapa, req); + startBgsaveForReplication(mincapa, req, slot_ranges); } } @@ -5033,7 +5332,7 @@ void failoverCommand(client *c) { return; } - if (server.primary_host) { + if (server.primary) { addReplyError(c, "FAILOVER is not valid when server is a replica."); return; } diff --git a/src/script.c b/src/script.c index a8e5b18eb9..a43de5c7af 100644 --- a/src/script.c +++ b/src/script.c @@ -51,7 +51,7 @@ static void exitScriptTimedoutMode(scriptRunCtx *run_ctx) { run_ctx->flags &= ~SCRIPT_TIMEDOUT; blockingOperationEnds(); /* if we are a replica and we have an active primary, set it for continue processing */ - if (server.primary_host && server.primary) queueClientForReprocessing(server.primary); + if (server.primary && server.primary->client) queueClientForReprocessing(server.primary->client); } static void enterScriptTimedoutMode(scriptRunCtx *run_ctx) { @@ -137,7 +137,7 @@ int scriptPrepareForRun(scriptRunCtx *run_ctx, int client_allow_oom = !!(caller->flag.allow_oom); int running_stale = - server.primary_host && server.repl_state != REPL_STATE_CONNECTED && server.repl_serve_stale_data == 0; + server.primary && server.primary->state != REPL_STATE_CONNECTED && server.repl_serve_stale_data == 0; int obey_client = mustObeyClient(caller); if (!(script_flags & SCRIPT_FLAG_EVAL_COMPAT_MODE)) { @@ -158,7 +158,7 @@ int scriptPrepareForRun(scriptRunCtx *run_ctx, * 1. we are not a readonly replica * 2. no disk error detected * 3. command is not `fcall_ro`/`eval[sha]_ro` */ - if (server.primary_host && server.repl_replica_ro && !obey_client) { + if (server.primary && server.repl_replica_ro && !obey_client) { addReplyError(caller, "-READONLY Can not run script with write flag on readonly replica"); return C_ERR; } @@ -375,7 +375,7 @@ static int scriptVerifyWriteCommandAllow(scriptRunCtx *run_ctx, char **err) { * of this script. 
*/ int deny_write_type = writeCommandsDeniedByDiskError(); - if (server.primary_host && server.repl_replica_ro && !mustObeyClient(run_ctx->original_client)) { + if (server.primary && server.repl_replica_ro && !mustObeyClient(run_ctx->original_client)) { *err = sdsdup(shared.roreplicaerr->ptr); return C_ERR; } @@ -501,12 +501,12 @@ int scriptSetRepl(scriptRunCtx *run_ctx, int repl) { } static int scriptVerifyAllowStale(client *c, sds *err) { - if (!server.primary_host) { + if (!server.primary) { /* Not a replica, stale is irrelevant */ return C_OK; } - if (server.repl_state == REPL_STATE_CONNECTED) { + if (server.primary->state == REPL_STATE_CONNECTED) { /* Connected to replica, stale is irrelevant */ return C_OK; } diff --git a/src/server.c b/src/server.c index 8255b57e25..697ce48013 100644 --- a/src/server.c +++ b/src/server.c @@ -221,7 +221,7 @@ void serverLogRaw(int level, const char *msg) { } else if (pid != server.pid) { role_index = 1; /* RDB / AOF writing child. */ } else { - role_index = (server.primary_host ? 2 : 3); /* Replica or Primary. */ + role_index = (server.primary ? 2 : 3); /* Replica or Primary. */ } switch (server.log_format) { case LOG_FORMAT_LOGFMT: @@ -900,7 +900,7 @@ int clientsCronResizeQueryBuffer(client *c) { if (idletime > 2) { /* 1) Query is idle for a long time. */ size_t remaining = sdslen(c->querybuf) - c->qb_pos; - if (!c->flag.primary && !remaining) { + if (!c->flag.replication_source && !remaining) { /* If the client is not a primary and no data is pending, * The client can safely use the shared query buffer in the next read - free the client's querybuf. */ sdsfree(c->querybuf); @@ -2223,21 +2223,12 @@ void initServerConfig(void) { appendServerSaveParams(60, 10000); /* save after 1 minute and 10000 changes */ /* Replication related */ - server.primary_host = NULL; - server.primary_port = 6379; server.primary = NULL; server.cached_primary = NULL; - server.primary_initial_offset = -1; - server.repl_state = REPL_STATE_NONE; - server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_STATE_NONE; - server.repl_transfer_tmpfile = NULL; - server.repl_transfer_fd = -1; - server.repl_transfer_s = NULL; server.repl_syncio_timeout = CONFIG_REPL_SYNCIO_TIMEOUT; server.repl_down_since = 0; /* Never connected, repl is down since EVER. */ server.primary_repl_offset = 0; server.fsynced_reploff_pending = 0; - server.rdb_client_id = -1; server.loading_process_events_interval_ms = LOADING_PROCESS_EVENTS_INTERVAL_DEFAULT; server.loading_rio = NULL; @@ -2348,7 +2339,7 @@ int restartServer(client *c, int flags, mstime_t delay) { * depending on current role. */ int setOOMScoreAdj(int process_class) { - if (process_class == -1) process_class = (server.primary_host ? CONFIG_OOM_REPLICA : CONFIG_OOM_PRIMARY); + if (process_class == -1) process_class = (server.primary ? CONFIG_OOM_REPLICA : CONFIG_OOM_PRIMARY); serverAssert(process_class >= 0 && process_class < CONFIG_OOM_COUNT); @@ -2760,6 +2751,7 @@ void initServer(void) { server.reply_buffer_peak_reset_time = REPLY_BUFFER_DEFAULT_PEAK_RESET_TIME; server.reply_buffer_resizing_enabled = 1; server.client_mem_usage_buckets = NULL; + server.replication_links = listCreate(); resetReplicationBuffer(); /* Make sure the locale is set on startup based on the config file. */ @@ -3359,7 +3351,7 @@ struct serverCommand *lookupCommandOrOriginal(robj **argv, int argc) { /* Commands arriving from the primary client or AOF client, should never be rejected. 
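
/* Small sketch of the stale-data gate that both the scripting and command paths
 * above apply after the switch to a link object; the boolean parameters are
 * illustrative stand-ins for server.primary, server.primary->state and
 * repl-serve-stale-data rather than the real server fields. */
#include <stdbool.h>

#define DEMO_STATE_CONNECTED 1

static bool replicaIsStale(bool has_primary_link, int link_state, bool serve_stale_data) {
    /* A replica whose primary link is down or still syncing only serves
     * data-plane reads when replica-serve-stale-data is enabled. */
    return has_primary_link && link_state != DEMO_STATE_CONNECTED && !serve_stale_data;
}
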
*/ int mustObeyClient(client *c) { - return c->id == CLIENT_ID_AOF || c->flag.primary; + return c->id == CLIENT_ID_AOF || c->flag.replication_source; } static int shouldPropagate(int target) { @@ -3369,7 +3361,7 @@ static int shouldPropagate(int target) { if (server.aof_state != AOF_OFF) return 1; } if (target & PROPAGATE_REPL) { - if (server.primary_host == NULL && (server.repl_backlog || listLength(server.replicas) != 0)) return 1; + if (server.primary == NULL && (server.repl_backlog || listLength(server.replicas) != 0)) return 1; } return 0; @@ -4111,7 +4103,7 @@ int processCommand(client *c) { } } - if (!server.cluster_enabled && c->capa & CLIENT_CAPA_REDIRECT && server.primary_host && !obey_client && + if (!server.cluster_enabled && c->capa & CLIENT_CAPA_REDIRECT && server.primary && !obey_client && (is_write_command || (is_read_command && !c->flag.readonly))) { if (server.failover_state == FAILOVER_IN_PROGRESS) { /* During the FAILOVER process, when conditions are met (such as @@ -4142,7 +4134,7 @@ int processCommand(client *c) { } c->duration = 0; c->cmd->rejected_calls++; - addReplyErrorSds(c, sdscatprintf(sdsempty(), "-REDIRECT %s:%d", server.primary_host, server.primary_port)); + addReplyErrorSds(c, sdscatprintf(sdsempty(), "-REDIRECT %s:%d", server.primary->host, server.primary->port)); } return C_OK; } @@ -4227,7 +4219,7 @@ int processCommand(client *c) { /* Don't accept write commands if this is a read only replica. But * accept write commands if this is our primary. */ - if (server.primary_host && server.repl_replica_ro && !obey_client && is_write_command) { + if (server.primary && server.repl_replica_ro && !obey_client && is_write_command) { rejectCommand(c, shared.roreplicaerr); return C_OK; } @@ -4248,7 +4240,7 @@ int processCommand(client *c) { /* Only allow commands with flag "t", such as INFO, REPLICAOF and so on, * when replica-serve-stale-data is no and we are a replica with a broken * link with primary. */ - if (server.primary_host && server.repl_state != REPL_STATE_CONNECTED && server.repl_serve_stale_data == 0 && + if (server.primary && server.primary->state != REPL_STATE_CONNECTED && server.repl_serve_stale_data == 0 && is_denystale_command) { rejectCommand(c, shared.primarydownerr); return C_OK; @@ -5972,14 +5964,14 @@ sds genValkeyInfoString(dict *section_dict, int all_sections, int everything) { info = sdscatprintf(info, "# Replication\r\n" "role:%s\r\n", - server.primary_host == NULL ? "master" : "slave"); - if (server.primary_host) { + server.primary == NULL ? "master" : "slave"); + if (server.primary) { long long replica_repl_offset = 1; long long replica_read_repl_offset = 1; - if (server.primary) { - replica_repl_offset = server.primary->repl_data->reploff; - replica_read_repl_offset = server.primary->repl_data->read_reploff; + if (server.primary->client) { + replica_repl_offset = server.primary->client->repl_data->reploff; + replica_read_repl_offset = server.primary->client->repl_data->read_reploff; } else if (server.cached_primary) { replica_repl_offset = server.cached_primary->repl_data->reploff; replica_read_repl_offset = server.cached_primary->repl_data->read_reploff; @@ -5988,32 +5980,32 @@ sds genValkeyInfoString(dict *section_dict, int all_sections, int everything) { info = sdscatprintf( info, FMTARGS( - "master_host:%s\r\n", server.primary_host, - "master_port:%d\r\n", server.primary_port, - "master_link_status:%s\r\n", (server.repl_state == REPL_STATE_CONNECTED) ? "up" : "down", - "master_last_io_seconds_ago:%d\r\n", server.primary ? 
((int)(server.unixtime - server.primary->last_interaction)) : -1, - "master_sync_in_progress:%d\r\n", server.repl_state == REPL_STATE_TRANSFER, + "master_host:%s\r\n", server.primary->host, + "master_port:%d\r\n", server.primary->port, + "master_link_status:%s\r\n", (server.primary->state == REPL_STATE_CONNECTED) ? "up" : "down", + "master_last_io_seconds_ago:%d\r\n", server.primary->client ? ((int)(server.unixtime - server.primary->client->last_interaction)) : -1, + "master_sync_in_progress:%d\r\n", server.primary->state == REPL_STATE_TRANSFER, "slave_read_repl_offset:%lld\r\n", replica_read_repl_offset, "slave_repl_offset:%lld\r\n", replica_repl_offset, - "replicas_repl_buffer_size:%zu\r\n", server.pending_repl_data.len, - "replicas_repl_buffer_peak:%zu\r\n", server.pending_repl_data.peak)); + "replicas_repl_buffer_size:%zu\r\n", server.primary->pending_repl_data.len, + "replicas_repl_buffer_peak:%zu\r\n", server.primary->pending_repl_data.peak)); - if (server.repl_state == REPL_STATE_TRANSFER) { + if (server.primary->state == REPL_STATE_TRANSFER) { double perc = 0; - if (server.repl_transfer_size) { - perc = ((double)server.repl_transfer_read / server.repl_transfer_size) * 100; + if (server.primary->transfer_size) { + perc = ((double)server.primary->transfer_read / server.primary->transfer_size) * 100; } info = sdscatprintf( info, FMTARGS( - "master_sync_total_bytes:%lld\r\n", (long long)server.repl_transfer_size, - "master_sync_read_bytes:%lld\r\n", (long long)server.repl_transfer_read, - "master_sync_left_bytes:%lld\r\n", (long long)(server.repl_transfer_size - server.repl_transfer_read), + "master_sync_total_bytes:%lld\r\n", (long long)server.primary->transfer_size, + "master_sync_read_bytes:%lld\r\n", (long long)server.primary->transfer_read, + "master_sync_left_bytes:%lld\r\n", (long long)(server.primary->transfer_size - server.primary->transfer_read), "master_sync_perc:%.2f\r\n", perc, - "master_sync_last_io_seconds_ago:%d\r\n", (int)(server.unixtime - server.repl_transfer_lastio))); + "master_sync_last_io_seconds_ago:%d\r\n", (int)(server.unixtime - server.primary->transfer_lastio))); } - if (server.repl_state != REPL_STATE_CONNECTED) { + if (server.primary->state != REPL_STATE_CONNECTED) { info = sdscatprintf(info, "master_link_down_since_seconds:%jd\r\n", server.repl_down_since ? 
(intmax_t)(server.unixtime - server.repl_down_since) : -1); } @@ -6848,7 +6840,7 @@ int serverIsSupervised(int mode) { } int iAmPrimary(void) { - return ((!server.cluster_enabled && server.primary_host == NULL) || + return ((!server.cluster_enabled && server.primary == NULL) || (server.cluster_enabled && clusterNodeIsPrimary(getMyClusterNode()))); } @@ -7131,7 +7123,7 @@ __attribute__((weak)) int main(int argc, char **argv) { } if (server.supervised_mode == SUPERVISED_SYSTEMD) { - if (!server.primary_host) { + if (!server.primary) { serverCommunicateSystemd("STATUS=Ready to accept connections\n"); } else { serverCommunicateSystemd( diff --git a/src/server.h b/src/server.h index d186d16c73..1bd78f57f6 100644 --- a/src/server.h +++ b/src/server.h @@ -394,6 +394,7 @@ typedef enum { REPL_STATE_RECEIVE_AUTH_REPLY, /* Wait for AUTH reply */ REPL_STATE_RECEIVE_PORT_REPLY, /* Wait for REPLCONF reply */ REPL_STATE_RECEIVE_IP_REPLY, /* Wait for REPLCONF reply */ + REPL_STATE_RECEIVE_SLOT_REPLY, /* Wait for REPLCONF reply */ REPL_STATE_RECEIVE_CAPA_REPLY, /* Wait for REPLCONF reply */ REPL_STATE_RECEIVE_VERSION_REPLY, /* Wait for REPLCONF reply */ REPL_STATE_SEND_PSYNC, /* Send PSYNC */ @@ -401,6 +402,7 @@ typedef enum { /* --- End of handshake states --- */ REPL_STATE_TRANSFER, /* Receiving .rdb from primary */ REPL_STATE_CONNECTED, /* Connected to primary */ + REPL_STATE_CANCELLED, /* Replication was cancelled, and this link is pending deletion. */ } repl_state; /* Replica rdb-channel replication state. Used in server.repl_rdb_channel_state for @@ -446,6 +448,7 @@ typedef enum { #define REPLICA_REQ_RDB_EXCLUDE_DATA (1 << 0) /* Exclude data from RDB */ #define REPLICA_REQ_RDB_EXCLUDE_FUNCTIONS (1 << 1) /* Exclude functions from RDB */ #define REPLICA_REQ_RDB_CHANNEL (1 << 2) /* Use dual-channel-replication */ +#define REPLICA_REQ_AOF_FORMAT (1 << 3) /* Use AOF-based replication format*/ /* Mask of all bits in the replica requirements bitfield that represent non-standard (filtered) RDB requirements */ #define REPLICA_REQ_RDB_MASK (REPLICA_REQ_RDB_EXCLUDE_DATA | REPLICA_REQ_RDB_EXCLUDE_FUNCTIONS) @@ -1011,7 +1014,7 @@ typedef enum { } clientIOState; typedef struct ClientFlags { - uint64_t primary : 1; /* This client is a primary */ + uint64_t replication_source : 1; /* This client is a replication source (i.e. primary or slot migration source) */ uint64_t replica : 1; /* This client is a replica */ uint64_t monitor : 1; /* This client is a replica monitor, see MONITOR */ uint64_t multi : 1; /* This client is in a MULTI context */ @@ -1103,6 +1106,7 @@ typedef struct ClientPubSubData { context of client side caching. */ } ClientPubSubData; +typedef struct replicationLink replicationLink; typedef struct ClientReplicationData { int repl_state; /* Replication state if this is a replica. */ int repl_start_cmd_stream_on_ack; /* Install replica write handler on first ACK. */ @@ -1133,6 +1137,8 @@ typedef struct ClientReplicationData { see the definition of replBufBlock. */ size_t ref_block_pos; /* Access position of referenced buffer block, i.e. the next offset to send. */ + list *slot_ranges; /* The slot range this replica is replicating for. */ + replicationLink *link; /* The replication link owning this. */ } ClientReplicationData; typedef struct ClientModuleData { @@ -1414,7 +1420,7 @@ typedef enum { * top-level primary. */ typedef struct rdbSaveInfo { /* Used saving and loading. */ - int repl_stream_db; /* DB to select in server.primary client. 
*/ + int repl_stream_db; /* DB to select in server.primary->client. */ /* Used only loading. */ int repl_id_is_set; /* True if repl_id field is set. */ @@ -1536,6 +1542,43 @@ typedef enum childInfoType { CHILD_INFO_TYPE_MODULE_COW_SIZE } childInfoType; +typedef struct slotRange { + int start; + int end; +} slotRange; + +typedef struct replicationLink { + int protected; /* Used to protect link from destruction during background loading. */ + int state; /* State of the sync operation overall. */ + int rdb_channel_state; + client *client; + client *snapshot_load_client; /* client used for full sync when AOF format is used. */ + sds host; + int port; + connection *transfer_s; /* Replica -> Primary SYNC connection */ + connection *rdb_transfer_s; /* Primary FULL SYNC connection (RDB download) */ + uint64_t rdb_client_id; /* Rdb client id as it defined at primary side */ + /* The following two fields is where we store primary PSYNC replid/offset + * while the PSYNC is in progress. At the end we'll copy the fields into + * the server->primary client structure. */ + char replid[CONFIG_RUN_ID_SIZE + 1]; /* Primary PSYNC runid. */ + long long initial_offset; /* Primary PSYNC offset. */ + off_t transfer_size; /* Size of RDB to read from primary during sync. */ + off_t transfer_read; /* Amount of RDB read from primary during sync. */ + off_t transfer_last_fsync_off; /* Offset when we fsync-ed last time. */ + int transfer_fd; /* Replica -> Primary SYNC temp file descriptor */ + char *transfer_tmpfile; /* Replica-> Primary SYNC temp file name */ + time_t transfer_lastio; /* Unix time of the latest read, for timeout */ + struct { + char replid[CONFIG_RUN_ID_SIZE + 1]; + long long reploff; + long long read_reploff; + int dbid; + } provisional_source_state; /* Information about the provisional state (after RDB) for the source node, stored during dual channel sync. */ + replDataBuf pending_repl_data; /* Replication data buffer for dual-channel-replication */ + list *slot_ranges; /* Slot range used for slot import. */ +} replicationLink; + struct valkeyServer { /* General */ pid_t pid; /* Main process pid. */ @@ -1896,7 +1939,6 @@ struct valkeyServer { int repl_ping_replica_period; /* Primary pings the replica every N seconds */ replBacklog *repl_backlog; /* Replication backlog for partial syncs */ long long repl_backlog_size; /* Backlog circular buffer size */ - replDataBuf pending_repl_data; /* Replication data buffer for dual-channel-replication */ time_t repl_backlog_time_limit; /* Time without replicas after the backlog gets released. */ time_t repl_no_replicas_since; /* We have no replicas since that time. @@ -1920,52 +1962,28 @@ struct valkeyServer { list *repl_buffer_blocks; /* Replication buffers blocks list * (serving replica clients and repl backlog) */ /* Replication (replica) */ - char *primary_user; /* AUTH with this user and primary_auth with primary */ - sds primary_auth; /* AUTH with this password with primary */ - char *primary_host; /* Hostname of primary */ - int primary_port; /* Port of primary */ - int repl_timeout; /* Timeout after N seconds of primary idle */ - client *primary; /* Client that is primary for this replica */ - uint64_t rdb_client_id; /* Rdb client id as it defined at primary side */ - struct { - connection *conn; - char replid[CONFIG_RUN_ID_SIZE + 1]; - long long reploff; - long long read_reploff; - int dbid; - } repl_provisional_primary; - client *cached_primary; /* Cached primary to be reused for PSYNC. 
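
/* Illustrative sketch of how a replication link object of the shape declared
 * above might be created and tracked; it uses a plain singly-linked list and a
 * reduced field set instead of the server's client, connection and sds
 * machinery, so names here are local stand-ins rather than the patch's API. */
#include <stdlib.h>
#include <string.h>

typedef struct demoReplLink {
    char host[64];                /* source address (fixed-size for the sketch) */
    int port;
    int state;                    /* starts out as "connect pending" */
    struct demoReplLink *next;    /* next entry in the server-wide link list */
} demoReplLink;

static demoReplLink *demo_links; /* head of the list walked by the replication cron */

static demoReplLink *demoCreateReplicationLink(const char *host, int port) {
    demoReplLink *link = calloc(1, sizeof(*link));
    if (!link) return NULL;
    strncpy(link->host, host, sizeof(link->host) - 1);
    link->port = port;
    link->next = demo_links;      /* register so the cron can drive its state machine */
    demo_links = link;
    return link;
}
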
*/ - rio *loading_rio; /* Pointer to the rio object currently used for loading data. */ - int repl_syncio_timeout; /* Timeout for synchronous I/O calls */ - int repl_state; /* Replication status if the instance is a replica */ - int repl_rdb_channel_state; /* State of the replica's rdb channel during dual-channel-replication */ - off_t repl_transfer_size; /* Size of RDB to read from primary during sync. */ - off_t repl_transfer_read; /* Amount of RDB read from primary during sync. */ - off_t repl_transfer_last_fsync_off; /* Offset when we fsync-ed last time. */ - connection *repl_transfer_s; /* Replica -> Primary SYNC connection */ - connection *repl_rdb_transfer_s; /* Primary FULL SYNC connection (RDB download) */ - int repl_transfer_fd; /* Replica -> Primary SYNC temp file descriptor */ - char *repl_transfer_tmpfile; /* Replica-> Primary SYNC temp file name */ - time_t repl_transfer_lastio; /* Unix time of the latest read, for timeout */ - int repl_serve_stale_data; /* Serve stale data when link is down? */ - int repl_replica_ro; /* Replica is read only? */ - int repl_replica_ignore_maxmemory; /* If true replicas do not evict. */ - time_t repl_down_since; /* Unix time at which link with primary went down */ - int repl_disable_tcp_nodelay; /* Disable TCP_NODELAY after SYNC? */ - int replica_priority; /* Reported in INFO and used by Sentinel. */ - int replica_announced; /* If true, replica is announced by Sentinel */ - int replica_announce_port; /* Give the primary this listening port. */ - char *replica_announce_ip; /* Give the primary this ip address. */ - int propagation_error_behavior; /* Configures the behavior of the replica - * when it receives an error on the replication stream */ - int repl_ignore_disk_write_error; /* Configures whether replicas panic when unable to - * persist writes to AOF. */ - /* The following two fields is where we store primary PSYNC replid/offset - * while the PSYNC is in progress. At the end we'll copy the fields into - * the server->primary client structure. */ - char primary_replid[CONFIG_RUN_ID_SIZE + 1]; /* Primary PSYNC runid. */ - long long primary_initial_offset; /* Primary PSYNC offset. */ - int repl_replica_lazy_flush; /* Lazy FLUSHALL before loading DB? */ + char *primary_user; /* AUTH with this user and primary_auth with primary */ + sds primary_auth; /* AUTH with this password with primary */ + int repl_timeout; /* Timeout after N seconds of primary idle */ + replicationLink *primary; /* Replication link for the primary. */ + list *replication_links; /* List of all current replication links. */ + client *cached_primary; /* Cached primary to be reused for PSYNC. */ + int repl_syncio_timeout; /* Timeout for synchronous I/O calls */ + int repl_serve_stale_data; /* Serve stale data when link is down? */ + int repl_replica_ro; /* Replica is read only? */ + int repl_replica_ignore_maxmemory; /* If true replicas do not evict. */ + time_t repl_down_since; /* Unix time at which link with primary went down */ + int repl_disable_tcp_nodelay; /* Disable TCP_NODELAY after SYNC? */ + int replica_priority; /* Reported in INFO and used by Sentinel. */ + int replica_announced; /* If true, replica is announced by Sentinel */ + int replica_announce_port; /* Give the primary this listening port. */ + char *replica_announce_ip; /* Give the primary this ip address. 
*/ + int propagation_error_behavior; /* Configures the behavior of the replica + * when it receives an error on the replication stream */ + int repl_ignore_disk_write_error; /* Configures whether replicas panic when unable to + * persist writes to AOF. */ + int repl_replica_lazy_flush; /* Lazy FLUSHALL before loading DB? */ + rio *loading_rio; /* Pointer to the rio object currently used for loading data. */ /* Import Mode */ int import_mode; /* If true, server is in import mode and forbid expiration and eviction. */ /* Synchronous replication. */ @@ -2745,6 +2763,9 @@ void ioThreadWriteToClient(void *data); int canParseCommand(client *c); int processIOThreadsReadDone(void); int processIOThreadsWriteDone(void); +replicationLink *createReplicationLink(char *host, int port, list *slot_ranges); +int connectReplicationLink(replicationLink *link); +int freeReplicationLink(replicationLink *link); /* logreqres.c - logging of requests and responses */ void reqresReset(client *c, int free_buf); @@ -2898,7 +2919,7 @@ void replicationFeedMonitors(client *c, list *monitors, int dictid, robj **argv, void updateReplicasWaitingBgsave(int bgsaveerr, int type); void replicationCron(void); void replicationStartPendingFork(void); -void replicationHandlePrimaryDisconnection(void); +void replicationHandleSourceDisconnection(replicationLink *link); void replicationCachePrimary(client *c); void resizeReplicationBacklog(void); void replicationSetPrimary(char *ip, int port, int full_sync_required); @@ -2909,7 +2930,7 @@ void processClientsWaitingReplicas(void); void unblockClientWaitingReplicas(client *c); int replicationCountAcksByOffset(long long offset); int replicationCountAOFAcksByOffset(long long offset); -void replicationSendNewlineToPrimary(void); +void replicationSendNewlineToConnectedLinks(void); long long replicationGetReplicaOffset(void); char *replicationGetReplicaName(client *c); long long getPsyncInitialOffset(void); @@ -2974,6 +2995,7 @@ void aofOpenIfNeededOnServerStart(void); void aofManifestFree(aofManifest *am); int aofDelHistoryFiles(void); int aofRewriteLimited(void); +int rewriteAppendOnlyFileRio(rio *aof, list *slot_ranges); /* Child info */ void openChildInfoPipe(void); @@ -3429,6 +3451,7 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor); int parseScanCursorOrReply(client *c, robj *o, unsigned long long *cursor); int dbAsyncDelete(serverDb *db, robj *key); void emptyDbAsync(serverDb *db); +void emptyHashtableAsync(serverDb *db, int didx); size_t lazyfreeGetPendingObjectsCount(void); size_t lazyfreeGetFreedObjectsCount(void); void lazyfreeResetStats(void); From ac26e2099cb32b8465bfc666d8aabafff3bdf703 Mon Sep 17 00:00:00 2001 From: Jacob Murphy Date: Wed, 15 Jan 2025 20:59:49 +0000 Subject: [PATCH 02/18] Use slot bitmap everywhere Signed-off-by: Jacob Murphy --- src/aof.c | 24 +++++++---------- src/cluster.c | 55 +++----------------------------------- src/cluster.h | 10 +++---- src/cluster_legacy.c | 34 ++++++++++++++---------- src/cluster_legacy.h | 2 +- src/rdb.c | 9 ++++--- src/rdb.h | 2 +- src/replication.c | 63 ++++++++++++++++++++------------------------ src/server.h | 20 +++++++------- 9 files changed, 82 insertions(+), 137 deletions(-) diff --git a/src/aof.c b/src/aof.c index 5c2691c1ba..3e9bc9d323 100644 --- a/src/aof.c +++ b/src/aof.c @@ -32,6 +32,7 @@ #include "rio.h" #include "functions.h" #include "module.h" +#include "cluster.h" #include #include @@ -2190,27 +2191,20 @@ static int rewriteFunctions(rio *aof) { return 0; } -int shouldFilterSlot(int slot, 
void * slot_ranges) { - if (slot_ranges == NULL) return 0; - list *ranges = (list *)slot_ranges; - listIter li; - listNode *ln; - listRewind(ranges, &li); - while ((ln = listNext(&li))) { - slotRange *range = (slotRange *) ln->value; - if (slot >= range->start && slot <= range->end) return 0; - } - return 1; +int shouldFilterSlot(int slot, void * privdata) { + if (privdata == NULL) return 0; + unsigned char *slot_bitmap = (unsigned char *)privdata; + return !bitmapTestBit(slot_bitmap, slot); } -int rewriteAppendOnlyFileRio(rio *aof, list *slot_ranges) { +int rewriteAppendOnlyFileRio(rio *aof, unsigned char *slot_bitmap) { int j; long key_count = 0; long long updated_time = 0; kvstoreIterator *kvs_it = NULL; /* Record timestamp at the beginning of rewriting AOF. */ - if (server.aof_timestamp_enabled && slot_ranges == NULL) { + if (server.aof_timestamp_enabled && isSlotBitmapAllSlots(slot_bitmap)) { sds ts = genAofTimestampAnnotationIfNeeded(1); if (rioWrite(aof, ts, sdslen(ts)) == 0) { sdsfree(ts); @@ -2230,10 +2224,10 @@ int rewriteAppendOnlyFileRio(rio *aof, list *slot_ranges) { if (rioWrite(aof, selectcmd, sizeof(selectcmd) - 1) == 0) goto werr; if (rioWriteBulkLongLong(aof, j) == 0) goto werr; - if (slot_ranges == NULL) { + if (isSlotBitmapAllSlots(slot_bitmap)) { kvs_it = kvstoreIteratorInit(db->keys); } else { - kvs_it = kvstoreFilteredIteratorInit(db->keys, &shouldFilterSlot, slot_ranges); + kvs_it = kvstoreFilteredIteratorInit(db->keys, &shouldFilterSlot, slot_bitmap); } /* Iterate this DB writing every entry */ void *next; diff --git a/src/cluster.c b/src/cluster.c index 8050cd869d..2e88ff8ba2 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -815,14 +815,10 @@ unsigned int countKeysInSlot(unsigned int slot) { return kvstoreHashtableSize(server.db->keys, slot); } -unsigned int dropKeysInSlotRanges(list *slot_ranges, int async) { +unsigned int dropKeysInSlotBitmap(unsigned char *slot_bitmap, int async) { unsigned int result = 0; - listIter li; - listNode *ln; - listRewind(slot_ranges, &li); - while ((ln = listNext(&li))) { - slotRange *slot_range = (slotRange *) listNodeValue(ln); - for (int i = slot_range->start; i <= slot_range->end; i++) { + for (int i = 0; i < CLUSTER_SLOTS; i++) { + if (bitmapTestBit(slot_bitmap, i)) { result += dropKeysInSlot(i, async); } } @@ -840,51 +836,6 @@ unsigned int dropKeysInSlot(unsigned int hashslot, int async) { return result; } - - -void slotRangesToBitmap(list *slot_ranges, unsigned char *bitmap_out) { - listIter li; - listNode *ln; - listRewind(slot_ranges, &li); - while ((ln = listNext(&li))) { - slotRange *range = (slotRange *) listNodeValue(ln); - for (int i = range->start; i <= range->end; i++) { - bitmapSetBit(bitmap_out, i); - } - } -} - -void bitmapToSlotRanges(unsigned char *bitmap, list **slot_ranges_out) { - *slot_ranges_out = listCreate(); - int range_start = -1; - for (int i = 0; i <= CLUSTER_SLOTS; i++) { - if (i != CLUSTER_SLOTS && bitmapTestBit(bitmap, i)) { - if (range_start == -1) { - range_start = i; - } - } else if (range_start != -1) { - slotRange *range = zmalloc(sizeof(slotRange)); - range->start = range_start; - range->end = i - 1; - range_start = -1; - serverLog(LL_NOTICE, "Got another range: %d-%d", range->start, range->end); - listAddNodeTail(*slot_ranges_out, range); - } - } -} - -void freeSlotRanges(list *slot_ranges) { - listIter li; - listNode *ln; - listRewind(slot_ranges, &li); - while ((ln = listNext(&li))) { - slotRange *range = (slotRange *)ln->value; - zfree(range); - listDelNode(slot_ranges, ln); - } - 
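
/* Self-contained sketch of the slot-filter predicate idea used above: the
 * filtered iterator asks a callback whether to skip a slot, and the callback
 * answers from a bitmap passed as private data. demoBitmapTestBit is a local
 * helper mirroring the bit layout used in the patch, and NUM_SLOTS stands in
 * for CLUSTER_SLOTS. */
#include <stdio.h>
#include <string.h>

#define NUM_SLOTS 16384

static int demoBitmapTestBit(const unsigned char *bitmap, int pos) {
    return (bitmap[pos / 8] & (1 << (pos & 7))) != 0;
}

/* Return 1 when the slot should be skipped, 0 when it should be emitted. */
static int demoShouldFilterSlot(int slot, void *privdata) {
    if (privdata == NULL) return 0;                 /* no filter: keep every slot */
    return !demoBitmapTestBit((unsigned char *)privdata, slot);
}

int main(void) {
    unsigned char bitmap[NUM_SLOTS / 8];
    memset(bitmap, 0, sizeof(bitmap));
    bitmap[0] = 0x03;                               /* keep slots 0 and 1 only */
    int kept = 0;
    for (int slot = 0; slot < NUM_SLOTS; slot++)
        if (!demoShouldFilterSlot(slot, bitmap)) kept++;
    printf("kept %d slots\n", kept);                /* prints: kept 2 slots */
    return 0;
}
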
listRelease(slot_ranges); -} - void clusterCommandHelp(client *c) { const char *help[] = { "COUNTKEYSINSLOT ", diff --git a/src/cluster.h b/src/cluster.h index fd994d1ce7..e6610a8074 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -5,8 +5,6 @@ * Cluster exported API. *----------------------------------------------------------------------------*/ -#define CLUSTER_SLOT_MASK_BITS 14 /* Number of bits used for slot id. */ -#define CLUSTER_SLOTS (1 << CLUSTER_SLOT_MASK_BITS) /* Total number of slots in cluster mode, which is 16384. */ #define CLUSTER_SLOT_MASK ((unsigned long long)(CLUSTER_SLOTS - 1)) /* Bit mask for slot id stored in LSB. */ #define CLUSTER_OK 0 /* Everything looks ok */ #define CLUSTER_FAIL 1 /* The cluster can't work */ @@ -116,14 +114,14 @@ client *createCachedResponseClient(int resp); void deleteCachedResponseClient(client *recording_client); void clearCachedClusterSlotsResponse(void); unsigned int countKeysInSlot(unsigned int hashslot); -unsigned int dropKeysInSlotRanges(list *slot_ranges, int async); +unsigned int dropKeysInSlotBitmap(unsigned char *slot_bitmap, int async); unsigned int dropKeysInSlot(unsigned int hashslot, int async); -void slotRangesToBitmap(list *slot_ranges, unsigned char *bitmap_out); -void bitmapToSlotRanges(unsigned char *bitmap, list **slot_ranges_out); -void freeSlotRanges(list *slot_ranges); +void bitmapToSlotRanges(unsigned char *bitmap, char **slot_bitmap_out); int bitmapTestBit(unsigned char *bitmap, int pos); void bitmapSetBit(unsigned char *bitmap, int pos); void bitmapClearBit(unsigned char *bitmap, int pos); +void bitmapSetAllBits(unsigned char *bitmap, int len); +int isSlotBitmapAllSlots(unsigned char *bitmap); int getSlotOrReply(client *c, robj *o); /* functions with shared implementations */ diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 15a5ee3b7d..95e6e600fe 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -84,7 +84,7 @@ void clusterFreeNodesSlotsInfo(clusterNode *n); uint64_t clusterGetMaxEpoch(void); int clusterBumpConfigEpochWithoutConsensus(void); slotMigration *clusterGetCurrentSlotMigration(void); -void clusterSendMigrateSlotStart(clusterNode *node, list *slot_ranges); +void clusterSendMigrateSlotStart(clusterNode *node, unsigned char *slot_bitmap); void moduleCallClusterReceivers(const char *sender_id, uint64_t module_id, uint8_t type, @@ -4445,13 +4445,13 @@ slotMigration *clusterGetCurrentSlotMigration(void) { return (slotMigration *) listFirst(server.cluster->slot_migrations)->value; } -void clusterSendMigrateSlotStart(clusterNode *node, list *slot_ranges) { +void clusterSendMigrateSlotStart(clusterNode *node, unsigned char *slot_bitmap) { if (!node->link) return; uint32_t msglen = sizeof(clusterMsg) - sizeof(union clusterMsgData) + sizeof(clusterMsgSlotMigration); clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_MIGRATE_SLOT_START, msglen); clusterMsg *hdr = getMessageFromSendBlock(msgblock); - slotRangesToBitmap(slot_ranges, hdr->data.slot_migration.msg.slot_bitmap); + memcpy(hdr->data.slot_migration.msg.slot_bitmap, slot_bitmap, sizeof(hdr->data.slot_migration.msg.slot_bitmap)); clusterSendMessage(node->link, msgblock); clusterMsgSendBlockDecrRefCount(msgblock); } @@ -4482,7 +4482,7 @@ void clusterProceedWithSlotMigration(void) { /* Start the migration */ serverLog(LL_NOTICE, "Starting sync from migration source node %.40s", curr_migration->source_node->name); curr_migration->end_time = mstime() + CLUSTER_SLOT_MIGRATION_TIMEOUT; - curr_migration->link = 
createReplicationLink(curr_migration->source_node->ip, getNodeDefaultReplicationPort(curr_migration->source_node), curr_migration->slot_ranges); + curr_migration->link = createReplicationLink(curr_migration->source_node->ip, getNodeDefaultReplicationPort(curr_migration->source_node), curr_migration->slot_bitmap); if (connectReplicationLink(curr_migration->link) == C_ERR) { serverLog(LL_WARNING, "Failed to begin sync from migration source node %.40s", curr_migration->source_node->name); @@ -4506,7 +4506,7 @@ void clusterProceedWithSlotMigration(void) { return; case SLOT_MIGRATION_PAUSE_OWNER: serverLog(LL_NOTICE, "Replication link to slot owner %.40s has been established. Pausing source node and waiting to continue", curr_migration->source_node->name); - clusterSendMigrateSlotStart(curr_migration->source_node, curr_migration->slot_ranges); + clusterSendMigrateSlotStart(curr_migration->source_node, curr_migration->slot_bitmap); curr_migration->pause_primary_offset = -1; curr_migration->pause_end = mstime() + CLUSTER_MF_TIMEOUT; curr_migration->state = SLOT_MIGRATION_WAITING_FOR_OFFSET; @@ -4524,12 +4524,8 @@ void clusterProceedWithSlotMigration(void) { return; case SLOT_MIGRATION_FINISH: serverLog(LL_NOTICE, "Setting myself to owner of migrating slots and broadcasting"); - listIter li; - listNode *ln; - listRewind(curr_migration->slot_ranges, &li); - while ((ln = listNext(&li))) { - slotRange *range = (slotRange *) ln->value; - for (int i = range->start; i <= range->end; i++) { + for (int i = 0; i < CLUSTER_SLOTS; i++) { + if (bitmapTestBit(curr_migration->slot_bitmap, i)) { clusterDelSlot(i); clusterAddSlot(myself, i); } @@ -4548,8 +4544,7 @@ void clusterProceedWithSlotMigration(void) { /* Delete the migration from the queue and proceed to the next migration */ listDelNode(server.cluster->slot_migrations, curr_node); freeReplicationLink(curr_migration->link); - dropKeysInSlotRanges(curr_migration->slot_ranges, server.repl_replica_lazy_flush); - freeSlotRanges(curr_migration->slot_ranges); + dropKeysInSlotBitmap(curr_migration->slot_bitmap, server.repl_replica_lazy_flush); zfree(curr_migration); continue; } @@ -5597,6 +5592,17 @@ void bitmapClearBit(unsigned char *bitmap, int pos) { bitmap[byte] &= ~(1 << bit); } +void bitmapSetAllBits(unsigned char *bitmap, int len) { + memset(bitmap, 0xff, len); +} + +/* Return if the slot bitmap contains all slots */ +int isSlotBitmapAllSlots(unsigned char *bitmap) { + unsigned char all_slot_bitmap[CLUSTER_SLOTS / 8]; + bitmapSetAllBits(all_slot_bitmap, sizeof(all_slot_bitmap)); + return memcmp(bitmap, all_slot_bitmap, sizeof(all_slot_bitmap)) == 0; +} + /* Return non-zero if there is at least one primary with replicas in the cluster. * Otherwise zero is returned. Used by clusterNodeSetSlotBit() to set the * MIGRATE_TO flag the when a primary gets the first slot. */ @@ -7333,7 +7339,7 @@ int clusterCommandSpecial(client *c) { } slotMigration *to_enqueue = (slotMigration *) zmalloc(sizeof(slotMigration)); - bitmapToSlotRanges(requested_slots, &to_enqueue->slot_ranges); + memcpy(to_enqueue->slot_bitmap, requested_slots, sizeof(requested_slots)); to_enqueue->source_node = curr_owner; to_enqueue->state = SLOT_MIGRATION_QUEUED; to_enqueue->end_time = 0; /* Will be set once started. 
*/ diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index dc157af78b..9a5add854d 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -387,7 +387,7 @@ typedef enum slotMigrationState { } slotMigrationState; typedef struct slotMigration { - list *slot_ranges; + unsigned char slot_bitmap[CLUSTER_SLOTS/8]; slotMigrationState state; clusterNode *source_node; mstime_t end_time; /* Slot migration time limit (ms unixtime). diff --git a/src/rdb.c b/src/rdb.c index 57fae239ad..33fb2c274c 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -38,6 +38,7 @@ #include "bio.h" #include "zmalloc.h" #include "module.h" +#include "cluster.h" #include #include @@ -3526,7 +3527,7 @@ void killRDBChild(void) { /* Spawn an RDB child that writes the RDB to the sockets of the replicas * that are currently in REPLICA_STATE_WAIT_BGSAVE_START state. */ -int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, list *slot_ranges) { +int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, unsigned char *slot_bitmap) { listNode *ln; listIter li; pid_t childpid; @@ -3577,8 +3578,8 @@ int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, list *slot_ranges) { if (replica->repl_data->repl_state == REPLICA_STATE_WAIT_BGSAVE_START) { /* Check replica has the exact requirements */ if (replica->repl_data->replica_req != req) continue; - /* No attempt to coallesce slot ranges, just use equality */ - if (replica->repl_data->slot_ranges != slot_ranges) continue; + /* Check matching slot bitmaps. */ + if (memcmp(replica->repl_data->slot_bitmap, slot_bitmap, CLUSTER_SLOTS/8) != 0) continue; conns[connsnum++] = replica->conn; if (dual_channel) { @@ -3620,7 +3621,7 @@ int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, list *slot_ranges) { if (aof) { serverLog(LL_NOTICE, "Background AOF transfer started by pid %ld", (long)getpid()); - retval = rewriteAppendOnlyFileRio(&rdb, slot_ranges); + retval = rewriteAppendOnlyFileRio(&rdb, slot_bitmap); rioWrite(&rdb, "*3\r\n", 4); rioWriteBulkString(&rdb, "REPLCONF", 8); rioWriteBulkString(&rdb, "SYNC-PAYLOAD-END", 17); diff --git a/src/rdb.h b/src/rdb.h index 440620e5bb..5225933dd6 100644 --- a/src/rdb.h +++ b/src/rdb.h @@ -152,7 +152,7 @@ int rdbSaveObjectType(rio *rdb, robj *o); int rdbLoadObjectType(rio *rdb); int rdbLoad(char *filename, rdbSaveInfo *rsi, int rdbflags); int rdbSaveBackground(int req, char *filename, rdbSaveInfo *rsi, int rdbflags); -int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, list *slotRanges); +int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, unsigned char *slot_bitmap); void rdbRemoveTempFile(pid_t childpid, int from_signal); int rdbSaveToFile(const char *filename); int rdbSave(int req, char *filename, rdbSaveInfo *rsi, int rdbflags); diff --git a/src/replication.c b/src/replication.c index cecfad5ee5..5119eeb408 100644 --- a/src/replication.c +++ b/src/replication.c @@ -953,7 +953,7 @@ int primaryTryPartialResynchronization(client *c, long long psync_offset) { * started. * * Returns C_OK on success or C_ERR otherwise. */ -int startBgsaveForReplication(int mincapa, int req, list *slot_ranges) { +int startBgsaveForReplication(int mincapa, int req, unsigned char *slot_bitmap) { int retval; int socket_target = 0; listIter li; @@ -977,7 +977,7 @@ int startBgsaveForReplication(int mincapa, int req, list *slot_ranges) { * otherwise replica will miss repl-stream-db. 
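
/* Sketch of the "exact match" grouping applied above when attaching waiting
 * replicas to a child save: a replica only joins an in-flight save when both
 * its requirement flags and its slot bitmap are byte-for-byte identical. The
 * struct and helper are illustrative stand-ins for the real client fields. */
#include <string.h>

#define NUM_SLOTS 16384

typedef struct demoWaitingReplica {
    int req_flags;                              /* REPLICA_REQ_* style bitmask */
    unsigned char slot_bitmap[NUM_SLOTS / 8];
} demoWaitingReplica;

static int demoReplicaMatchesSave(const demoWaitingReplica *r, int save_req,
                                  const unsigned char *save_bitmap) {
    if (r->req_flags != save_req) return 0;
    /* No attempt to coalesce overlapping bitmaps: equality only. */
    return memcmp(r->slot_bitmap, save_bitmap, NUM_SLOTS / 8) == 0;
}
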
*/ if (rsiptr) { if (socket_target) - retval = rdbSaveToReplicasSockets(req, rsiptr, slot_ranges); + retval = rdbSaveToReplicasSockets(req, rsiptr, slot_bitmap); else { /* Keep the page cache since it'll get used soon */ retval = rdbSaveBackground(req, server.rdb_filename, rsiptr, RDBFLAGS_REPLICATION | RDBFLAGS_KEEP_CACHE); @@ -1099,7 +1099,7 @@ void syncCommand(client *c) { } /* Fail sync if it is asking for AOF format and a slot is not set via REPLCONF already. */ - if (c->repl_data->replica_req & REPLICA_REQ_AOF_FORMAT && c->repl_data->slot_ranges == NULL) { + if (c->repl_data->replica_req & REPLICA_REQ_AOF_FORMAT && isSlotBitmapAllSlots(c->repl_data->slot_bitmap)) { addReplyError(c, "AOF format is only supported for single slot SYNC"); return; } @@ -1180,7 +1180,7 @@ void syncCommand(client *c) { } /* For slot level replication, we make no attempt to coallesce BGSAVEs */ - int require_dedicated = c->repl_data->slot_ranges != NULL; + int require_dedicated = !isSlotBitmapAllSlots(c->repl_data->slot_bitmap); /* CASE 1: BGSAVE is in progress, with disk target. */ if (!require_dedicated && server.child_type == CHILD_TYPE_RDB && server.rdb_child_type == RDB_CHILD_TYPE_DISK) { @@ -1244,7 +1244,7 @@ void syncCommand(client *c) { } /* CASE 5: We are good to start a BGSAVE. Diskless or disk-based mode is determined by replica's capacity. */ - startBgsaveForReplication(c->repl_data->replica_capa, c->repl_data->replica_req, c->repl_data->slot_ranges); + startBgsaveForReplication(c->repl_data->replica_capa, c->repl_data->replica_req, c->repl_data->slot_bitmap); return; } @@ -1268,6 +1268,7 @@ int anyOtherReplicaWaitRdb(client *except_me) { void initClientReplicationData(client *c) { if (c->repl_data) return; c->repl_data = (ClientReplicationData *)zcalloc(sizeof(ClientReplicationData)); + bitmapSetAllBits(c->repl_data->slot_bitmap, sizeof(c->repl_data->slot_bitmap)); } void freeClientReplicationData(client *c) { @@ -1312,9 +1313,6 @@ void freeClientReplicationData(client *c) { replicationHandleSourceDisconnection(c->repl_data->link); } sdsfree(c->repl_data->replica_addr); - if (c->repl_data->slot_ranges) { - freeSlotRanges(c->repl_data->slot_ranges); - } zfree(c->repl_data); c->repl_data = NULL; } @@ -1515,23 +1513,20 @@ void replconfCommand(client *c) { if (!server.cluster_enabled) { addReplyError(c, "Cannot replicate a slot when cluster mode is disabled"); } - if (c->repl_data->slot_ranges != NULL) { + if (!isSlotBitmapAllSlots(c->repl_data->slot_bitmap)) { addReplyError(c, "Slot bitmap already set"); } if (stringObjectLen(c->argv[j + 1]) != CLUSTER_SLOTS / 8) { addReplyError(c, "Invalid slot bitmap length"); return; } - list *slot_ranges; - bitmapToSlotRanges(c->argv[j + 1]->ptr, &slot_ranges); for (int slot = 0; slot <= CLUSTER_SLOTS; slot++) { if (bitmapTestBit(c->argv[j + 1]->ptr, slot) && server.cluster->slots[slot] != server.cluster->myself) { addReplyErrorFormat(c, "I cannot replicate slot %d since I do not own it", slot); - freeSlotRanges(slot_ranges); return; } } - c->repl_data->slot_ranges = slot_ranges; + memcpy(c->repl_data->slot_bitmap, c->argv[j + 1]->ptr, CLUSTER_SLOTS / 8); /* For now, we only support AOF for slot transfer. 
*/ c->repl_data->replica_req |= REPLICA_REQ_AOF_FORMAT; @@ -1993,7 +1988,7 @@ void shiftReplicationId(void) { char *replicationGetNameForLogs(replicationLink *link) { if (link == server.primary) return "PRIMARY"; - if (link->slot_ranges != NULL) + if (!isSlotBitmapAllSlots(link->slot_bitmap)) return "SLOT OWNER"; return "OTHER REPLICATION SOURCE"; } @@ -2079,7 +2074,7 @@ client *createReplicationLinkClientWithHandler(replicationLink *link, connection * PSYNC capable, so we flag it accordingly. */ if (c->repl_data->reploff == -1) c->flag.pre_psync = 1; if (dbid != -1) selectDb(c, dbid); - c->repl_data->slot_ranges = link->slot_ranges; + memcpy(c->repl_data->slot_bitmap, link->slot_bitmap, sizeof(c->repl_data->slot_bitmap)); return c; } @@ -2247,7 +2242,7 @@ void readSyncBulkPayload(connection *conn) { replicationLink *link = (replicationLink *)connGetPrivateData(conn); /* RDB bulk load will only be used if we are sending all slots. */ - serverAssert(link->slot_ranges == NULL); + serverAssert(isSlotBitmapAllSlots(link->slot_bitmap)); /* Static vars used to hold the EOF mark, and the last bytes received * from the server: when they match, we reached the end of the transfer. */ @@ -2879,7 +2874,7 @@ static int dualChannelReplHandleReplconfReply(replicationLink *link, sds *err) { } int replicationUseAOFFormatSnapshot(replicationLink *link) { - return link->slot_ranges != NULL; + return !isSlotBitmapAllSlots(link->slot_bitmap); } static int dualChannelReplHandleEndOffsetResponse(replicationLink *link, sds *err) { @@ -3731,13 +3726,10 @@ void syncWithSource(connection *conn) { } /* Set the slot number, so that the primary only provides us with the appropriate slot dictionary. */ - if (link->slot_ranges != NULL) { + if (!isSlotBitmapAllSlots(link->slot_bitmap)) { char *argv[3] = {"REPLCONF", "slot-bitmap", NULL}; size_t lens[3] = {8, 11, 0}; - unsigned char slot_bitmap[CLUSTER_SLOTS/8 + 1] = {0}; - slotRangesToBitmap(link->slot_ranges, slot_bitmap); - slot_bitmap[CLUSTER_SLOTS/8] = '\0'; - argv[2] = (char *)slot_bitmap; + argv[2] = (char *)link->slot_bitmap; lens[2] = CLUSTER_SLOTS/8; err = sendCommandArgv(conn, 3, argv, lens); if (err) goto write_error; @@ -3817,7 +3809,7 @@ void syncWithSource(connection *conn) { return; } - if (link->state == REPL_STATE_RECEIVE_SLOT_REPLY && link->slot_ranges == NULL) + if (link->state == REPL_STATE_RECEIVE_SLOT_REPLY && isSlotBitmapAllSlots(link->slot_bitmap)) link->state = REPL_STATE_RECEIVE_CAPA_REPLY; if (link->state == REPL_STATE_RECEIVE_SLOT_REPLY) { @@ -3937,7 +3929,7 @@ void syncWithSource(connection *conn) { } /* Prepare a suitable temp file for bulk transfer */ - if (!useDisklessLoad() && link->slot_ranges == NULL) { + if (!useDisklessLoad() && isSlotBitmapAllSlots(link->slot_bitmap)) { int dfd = -1, maxtries = 5; while (maxtries--) { snprintf(tmpfile, 256, "temp-%d.%ld.rdb", (int)server.unixtime, (long int)getpid()); @@ -3960,8 +3952,8 @@ void syncWithSource(connection *conn) { /* We are going to need to do a full resync. If we are accepting a single * slot - make sure we have a clean slate to load it into.*/ - if (link->slot_ranges != NULL) { - dropKeysInSlotRanges(link->slot_ranges, 1); + if (!isSlotBitmapAllSlots(link->slot_bitmap)) { + dropKeysInSlotBitmap(link->slot_bitmap, 1); } /* Using dual-channel-replication, the primary responded +DUALCHANNELSYNC. 
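
/* Standalone sketch of the REPLCONF slot-bitmap validation steps shown above,
 * with a trivial ownership table standing in for the real cluster state: the
 * bitmap must have the exact expected length, and may only name slots the
 * receiving node currently owns. */
#include <stddef.h>

#define NUM_SLOTS 16384

static int demoBitmapTestBit(const unsigned char *bitmap, int pos) {
    return (bitmap[pos / 8] & (1 << (pos & 7))) != 0;
}

/* owned[i] is non-zero when this node owns slot i. Returns 1 on success,
 * 0 on failure with an error name left in *err. */
static int demoValidateSlotBitmap(const unsigned char *bitmap, size_t len,
                                  const unsigned char *owned, const char **err) {
    if (len != NUM_SLOTS / 8) {
        *err = "Invalid slot bitmap length";
        return 0;
    }
    for (int slot = 0; slot < NUM_SLOTS; slot++) {
        if (demoBitmapTestBit(bitmap, slot) && !owned[slot]) {
            *err = "cannot replicate a slot this node does not own";
            return 0;
        }
    }
    return 1;
}
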
We need to @@ -4029,12 +4021,12 @@ void syncWithSource(connection *conn) { goto error; } -replicationLink *createReplicationLink(char *host, int port, list *slot_ranges) { +replicationLink *createReplicationLink(char *host, int port, unsigned char *slot_bitmap) { replicationLink *result = (replicationLink *)zmalloc(sizeof(replicationLink)); result->protected = 0; result->state = REPL_STATE_NONE; result->rdb_channel_state = REPL_DUAL_CHANNEL_STATE_NONE; - result->slot_ranges = slot_ranges; + memcpy(result->slot_bitmap, slot_bitmap, sizeof(result->slot_bitmap)); result->client = NULL; result->host = sdsnew(host); result->port = port; @@ -5133,7 +5125,7 @@ void replicationCron(void) { replication_cron_loops++; /* Incremented with frequency 1 HZ. */ } -int shouldStartChildReplication(int *mincapa_out, int *req_out, list **slot_ranges_out) { +int shouldStartChildReplication(int *mincapa_out, int *req_out, unsigned char *slot_bitmap_out) { /* We should start a BGSAVE good for replication if we have replicas in * WAIT_BGSAVE_START state. * @@ -5145,7 +5137,7 @@ int shouldStartChildReplication(int *mincapa_out, int *req_out, list **slot_rang int replicas_waiting = 0; int mincapa; int req; - list *slot_ranges; + unsigned char slot_bitmap[CLUSTER_SLOTS/8]; int first = 1; listNode *ln; listIter li; @@ -5157,7 +5149,7 @@ int shouldStartChildReplication(int *mincapa_out, int *req_out, list **slot_rang if (first) { /* Get first replica's requirements */ req = replica->repl_data->replica_req; - slot_ranges = replica->repl_data->slot_ranges; + memcpy(slot_bitmap, replica->repl_data->slot_bitmap, sizeof(slot_bitmap)); } else if (req != replica->repl_data->replica_req) { /* Skip replicas that don't match */ continue; @@ -5176,7 +5168,7 @@ int shouldStartChildReplication(int *mincapa_out, int *req_out, list **slot_rang max_idle >= server.repl_diskless_sync_delay)) { if (mincapa_out) *mincapa_out = mincapa; if (req_out) *req_out = req; - if (slot_ranges_out) *slot_ranges_out = slot_ranges; + if (slot_bitmap_out) memcpy(slot_bitmap_out, slot_bitmap, sizeof(slot_bitmap)); return 1; } } @@ -5187,13 +5179,14 @@ int shouldStartChildReplication(int *mincapa_out, int *req_out, list **slot_rang void replicationStartPendingFork(void) { int mincapa = -1; int req = -1; - list *slot_ranges = NULL; + unsigned char slot_bitmap[CLUSTER_SLOTS/8]; + bitmapSetAllBits(slot_bitmap, sizeof(slot_bitmap)); - if (shouldStartChildReplication(&mincapa, &req, &slot_ranges)) { + if (shouldStartChildReplication(&mincapa, &req, slot_bitmap)) { /* Start the BGSAVE. The called function may start a * BGSAVE with socket target or disk target depending on the * configuration and replicas capabilities and requirements. */ - startBgsaveForReplication(mincapa, req, slot_ranges); + startBgsaveForReplication(mincapa, req, slot_bitmap); } } diff --git a/src/server.h b/src/server.h index 1bd78f57f6..5d7b8db461 100644 --- a/src/server.h +++ b/src/server.h @@ -153,6 +153,8 @@ struct hdr_histogram; #else #define CONFIG_ACTIVE_DEFRAG_DEFAULT 1 #endif +#define CLUSTER_SLOT_MASK_BITS 14 /* Number of bits used for slot id. */ +#define CLUSTER_SLOTS (1 << CLUSTER_SLOT_MASK_BITS) /* Total number of slots in cluster mode, which is 16384. */ /* Bucket sizes for client eviction pools. Each bucket stores clients with * memory usage of up to twice the size of the bucket below it. 
*/ @@ -1133,12 +1135,12 @@ typedef struct ClientReplicationData { short replica_req; /* Replica requirements: REPLICA_REQ_* */ uint64_t associated_rdb_client_id; /* The client id of this replica's rdb connection */ time_t rdb_client_disconnect_time; /* Time of the first freeClient call on this client. Used for delaying free. */ - listNode *ref_repl_buf_node; /* Referenced node of replication buffer blocks, - see the definition of replBufBlock. */ - size_t ref_block_pos; /* Access position of referenced buffer block, - i.e. the next offset to send. */ - list *slot_ranges; /* The slot range this replica is replicating for. */ - replicationLink *link; /* The replication link owning this. */ + listNode *ref_repl_buf_node; /* Referenced node of replication buffer blocks, + see the definition of replBufBlock. */ + size_t ref_block_pos; /* Access position of referenced buffer block, + i.e. the next offset to send. */ + unsigned char slot_bitmap[CLUSTER_SLOTS/8]; /* The slot range this replica is replicating for. */ + replicationLink *link; /* The replication link owning this. */ } ClientReplicationData; typedef struct ClientModuleData { @@ -1576,7 +1578,7 @@ typedef struct replicationLink { int dbid; } provisional_source_state; /* Information about the provisional state (after RDB) for the source node, stored during dual channel sync. */ replDataBuf pending_repl_data; /* Replication data buffer for dual-channel-replication */ - list *slot_ranges; /* Slot range used for slot import. */ + unsigned char slot_bitmap[CLUSTER_SLOTS/8]; /* Slot range used for slot import. */ } replicationLink; struct valkeyServer { @@ -2763,7 +2765,7 @@ void ioThreadWriteToClient(void *data); int canParseCommand(client *c); int processIOThreadsReadDone(void); int processIOThreadsWriteDone(void); -replicationLink *createReplicationLink(char *host, int port, list *slot_ranges); +replicationLink *createReplicationLink(char *host, int port, unsigned char *slot_bitmap); int connectReplicationLink(replicationLink *link); int freeReplicationLink(replicationLink *link); @@ -2995,7 +2997,7 @@ void aofOpenIfNeededOnServerStart(void); void aofManifestFree(aofManifest *am); int aofDelHistoryFiles(void); int aofRewriteLimited(void); -int rewriteAppendOnlyFileRio(rio *aof, list *slot_ranges); +int rewriteAppendOnlyFileRio(rio *aof, unsigned char *slot_bitmap); /* Child info */ void openChildInfoPipe(void); From 6c83496c45db364d3bf4f3e20ac69cea7a334cad Mon Sep 17 00:00:00 2001 From: Jacob Murphy Date: Wed, 15 Jan 2025 22:45:19 +0000 Subject: [PATCH 03/18] Iterative improvements to get dual channel working Signed-off-by: Jacob Murphy --- src/aof.c | 2 +- src/cluster.c | 2 +- src/cluster.h | 7 ++-- src/cluster_legacy.c | 25 +++++++----- src/cluster_legacy.h | 4 +- src/rdb.c | 4 +- src/rdb.h | 2 +- src/replication.c | 95 ++++++++++++++++++++++++++++++++++---------- src/server.h | 18 +++++---- 9 files changed, 109 insertions(+), 50 deletions(-) diff --git a/src/aof.c b/src/aof.c index 3e9bc9d323..06f04760ca 100644 --- a/src/aof.c +++ b/src/aof.c @@ -2197,7 +2197,7 @@ int shouldFilterSlot(int slot, void * privdata) { return !bitmapTestBit(slot_bitmap, slot); } -int rewriteAppendOnlyFileRio(rio *aof, unsigned char *slot_bitmap) { +int rewriteAppendOnlyFileRio(rio *aof, slotBitmap slot_bitmap) { int j; long key_count = 0; long long updated_time = 0; diff --git a/src/cluster.c b/src/cluster.c index 2e88ff8ba2..d7e7be52af 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -815,7 +815,7 @@ unsigned int countKeysInSlot(unsigned int slot) { return 
kvstoreHashtableSize(server.db->keys, slot); } -unsigned int dropKeysInSlotBitmap(unsigned char *slot_bitmap, int async) { +unsigned int dropKeysInSlotBitmap(slotBitmap slot_bitmap, int async) { unsigned int result = 0; for (int i = 0; i < CLUSTER_SLOTS; i++) { if (bitmapTestBit(slot_bitmap, i)) { diff --git a/src/cluster.h b/src/cluster.h index e6610a8074..9b050d0b70 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -114,14 +114,15 @@ client *createCachedResponseClient(int resp); void deleteCachedResponseClient(client *recording_client); void clearCachedClusterSlotsResponse(void); unsigned int countKeysInSlot(unsigned int hashslot); -unsigned int dropKeysInSlotBitmap(unsigned char *slot_bitmap, int async); +unsigned int dropKeysInSlotBitmap(slotBitmap slot_bitmap, int async); unsigned int dropKeysInSlot(unsigned int hashslot, int async); -void bitmapToSlotRanges(unsigned char *bitmap, char **slot_bitmap_out); +void bitmapToSlotRanges(unsigned char *bitmap, slotBitmap slot_bitmap_out); int bitmapTestBit(unsigned char *bitmap, int pos); void bitmapSetBit(unsigned char *bitmap, int pos); void bitmapClearBit(unsigned char *bitmap, int pos); void bitmapSetAllBits(unsigned char *bitmap, int len); -int isSlotBitmapAllSlots(unsigned char *bitmap); +void slotBitmapSetAll(slotBitmap bitmap); +int isSlotBitmapAllSlots(slotBitmap bitmap); int getSlotOrReply(client *c, robj *o); /* functions with shared implementations */ diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 95e6e600fe..0e07057856 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -84,7 +84,7 @@ void clusterFreeNodesSlotsInfo(clusterNode *n); uint64_t clusterGetMaxEpoch(void); int clusterBumpConfigEpochWithoutConsensus(void); slotMigration *clusterGetCurrentSlotMigration(void); -void clusterSendMigrateSlotStart(clusterNode *node, unsigned char *slot_bitmap); +void clusterSendMigrateSlotStart(clusterNode *node, slotBitmap slot_bitmap); void moduleCallClusterReceivers(const char *sender_id, uint64_t module_id, uint8_t type, @@ -4445,13 +4445,13 @@ slotMigration *clusterGetCurrentSlotMigration(void) { return (slotMigration *) listFirst(server.cluster->slot_migrations)->value; } -void clusterSendMigrateSlotStart(clusterNode *node, unsigned char *slot_bitmap) { +void clusterSendMigrateSlotStart(clusterNode *node, slotBitmap slot_bitmap) { if (!node->link) return; uint32_t msglen = sizeof(clusterMsg) - sizeof(union clusterMsgData) + sizeof(clusterMsgSlotMigration); clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_MIGRATE_SLOT_START, msglen); clusterMsg *hdr = getMessageFromSendBlock(msgblock); - memcpy(hdr->data.slot_migration.msg.slot_bitmap, slot_bitmap, sizeof(hdr->data.slot_migration.msg.slot_bitmap)); + memcpy(hdr->data.slot_migration.msg.slot_bitmap, slot_bitmap, sizeof(slotBitmap)); clusterSendMessage(node->link, msgblock); clusterMsgSendBlockDecrRefCount(msgblock); } @@ -5592,15 +5592,20 @@ void bitmapClearBit(unsigned char *bitmap, int pos) { bitmap[byte] &= ~(1 << bit); } -void bitmapSetAllBits(unsigned char *bitmap, int len) { - memset(bitmap, 0xff, len); +void slotBitmapSetAll(slotBitmap bitmap) { + memset(bitmap, 0xff, sizeof(slotBitmap)); +} + +int slotBitmapCompare(slotBitmap bitmap, slotBitmap otherbitmap) { + return memcmp(bitmap, otherbitmap, sizeof(slotBitmap)); } /* Return if the slot bitmap contains all slots */ -int isSlotBitmapAllSlots(unsigned char *bitmap) { - unsigned char all_slot_bitmap[CLUSTER_SLOTS / 8]; - bitmapSetAllBits(all_slot_bitmap, sizeof(all_slot_bitmap)); 
- return memcmp(bitmap, all_slot_bitmap, sizeof(all_slot_bitmap)) == 0; +int isSlotBitmapAllSlots(slotBitmap bitmap) { + if (!bitmap) return 1; + slotBitmap all_slot_bitmap; + slotBitmapSetAll(all_slot_bitmap); + return slotBitmapCompare(bitmap, all_slot_bitmap) == 0; } /* Return non-zero if there is at least one primary with replicas in the cluster. @@ -7339,7 +7344,7 @@ int clusterCommandSpecial(client *c) { } slotMigration *to_enqueue = (slotMigration *) zmalloc(sizeof(slotMigration)); - memcpy(to_enqueue->slot_bitmap, requested_slots, sizeof(requested_slots)); + memcpy(to_enqueue->slot_bitmap, requested_slots, sizeof(slotBitmap)); to_enqueue->source_node = curr_owner; to_enqueue->state = SLOT_MIGRATION_QUEUED; to_enqueue->end_time = 0; /* Will be set once started. */ diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index 9a5add854d..1b83c1b2f5 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -147,7 +147,7 @@ typedef struct { } clusterMsgModule; typedef struct { - unsigned char slot_bitmap[CLUSTER_SLOTS / 8]; /* Slots bitmap. */ + slotBitmap slot_bitmap; } clusterMsgSlotMigration; /* The cluster supports optional extension messages that can be sent @@ -387,7 +387,7 @@ typedef enum slotMigrationState { } slotMigrationState; typedef struct slotMigration { - unsigned char slot_bitmap[CLUSTER_SLOTS/8]; + slotBitmap slot_bitmap; slotMigrationState state; clusterNode *source_node; mstime_t end_time; /* Slot migration time limit (ms unixtime). diff --git a/src/rdb.c b/src/rdb.c index 33fb2c274c..4a3a7e1c8e 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -3527,7 +3527,7 @@ void killRDBChild(void) { /* Spawn an RDB child that writes the RDB to the sockets of the replicas * that are currently in REPLICA_STATE_WAIT_BGSAVE_START state. */ -int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, unsigned char *slot_bitmap) { +int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, slotBitmap slot_bitmap) { listNode *ln; listIter li; pid_t childpid; @@ -3579,7 +3579,7 @@ int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, unsigned char *slot_bitm /* Check replica has the exact requirements */ if (replica->repl_data->replica_req != req) continue; /* Check matching slot bitmaps. */ - if (memcmp(replica->repl_data->slot_bitmap, slot_bitmap, CLUSTER_SLOTS/8) != 0) continue; + if (memcmp(replica->repl_data->slot_bitmap, slot_bitmap, sizeof(slotBitmap)) != 0) continue; conns[connsnum++] = replica->conn; if (dual_channel) { diff --git a/src/rdb.h b/src/rdb.h index 5225933dd6..734ae7ba72 100644 --- a/src/rdb.h +++ b/src/rdb.h @@ -152,7 +152,7 @@ int rdbSaveObjectType(rio *rdb, robj *o); int rdbLoadObjectType(rio *rdb); int rdbLoad(char *filename, rdbSaveInfo *rsi, int rdbflags); int rdbSaveBackground(int req, char *filename, rdbSaveInfo *rsi, int rdbflags); -int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, unsigned char *slot_bitmap); +int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, slotBitmap slot_bitmap); void rdbRemoveTempFile(pid_t childpid, int from_signal); int rdbSaveToFile(const char *filename); int rdbSave(int req, char *filename, rdbSaveInfo *rsi, int rdbflags); diff --git a/src/replication.c b/src/replication.c index 5119eeb408..c50cebecfd 100644 --- a/src/replication.c +++ b/src/replication.c @@ -953,7 +953,7 @@ int primaryTryPartialResynchronization(client *c, long long psync_offset) { * started. * * Returns C_OK on success or C_ERR otherwise. 
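Standardizing on sizeof(slotBitmap) rather than sizeof of a variable is deliberate: a slotBitmap passed as a function parameter decays to unsigned char *, so sizeof on the parameter silently yields the pointer size instead of 2048. A short illustration of the pitfall the explicit form avoids (sketch, not part of the patch):

    void copySlots(slotBitmap dst, slotBitmap src) {
        /* sizeof(src) here is sizeof(unsigned char *), typically 8, because
         * array parameters decay to pointers; sizeof(slotBitmap) is always
         * CLUSTER_SLOTS / 8, which is what we actually want to copy. */
        memcpy(dst, src, sizeof(slotBitmap));
    }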
*/ -int startBgsaveForReplication(int mincapa, int req, unsigned char *slot_bitmap) { +int startBgsaveForReplication(int mincapa, int req, slotBitmap slot_bitmap) { int retval; int socket_target = 0; listIter li; @@ -1268,7 +1268,7 @@ int anyOtherReplicaWaitRdb(client *except_me) { void initClientReplicationData(client *c) { if (c->repl_data) return; c->repl_data = (ClientReplicationData *)zcalloc(sizeof(ClientReplicationData)); - bitmapSetAllBits(c->repl_data->slot_bitmap, sizeof(c->repl_data->slot_bitmap)); + slotBitmapSetAll(c->repl_data->slot_bitmap); } void freeClientReplicationData(client *c) { @@ -2074,7 +2074,7 @@ client *createReplicationLinkClientWithHandler(replicationLink *link, connection * PSYNC capable, so we flag it accordingly. */ if (c->repl_data->reploff == -1) c->flag.pre_psync = 1; if (dbid != -1) selectDb(c, dbid); - memcpy(c->repl_data->slot_bitmap, link->slot_bitmap, sizeof(c->repl_data->slot_bitmap)); + memcpy(c->repl_data->slot_bitmap, link->slot_bitmap, sizeof(slotBitmap)); return c; } @@ -2816,7 +2816,7 @@ static int dualChannelReplHandleHandshake(replicationLink *link, sds *err) { args[argc] = server.primary_auth; lens[argc] = sdslen(server.primary_auth); argc++; - *err = sendCommandArgv(link->transfer_s, argc, args, lens); + *err = sendCommandArgv(link->rdb_transfer_s, argc, args, lens); if (*err) { dualChannelServerLog(LL_WARNING, "Sending command to primary in dual channel replication handshake: %s", *err); return C_ERR; @@ -2824,7 +2824,7 @@ static int dualChannelReplHandleHandshake(replicationLink *link, sds *err) { } /* Send replica listening port to primary for clarification */ sds portstr = getReplicaPortString(); - *err = sendCommand(link->transfer_s, "REPLCONF", "capa", "eof", "rdb-only", "1", "rdb-channel", "1", "listening-port", portstr, + *err = sendCommand(link->rdb_transfer_s, "REPLCONF", "capa", "eof", "rdb-only", "1", "rdb-channel", "1", "listening-port", portstr, NULL); sdsfree(portstr); if (*err) { @@ -2832,7 +2832,20 @@ static int dualChannelReplHandleHandshake(replicationLink *link, sds *err) { return C_ERR; } - if (connSetReadHandler(link->transfer_s, dualChannelFullSyncWithReplicationSource) == C_ERR) { + /* Send slot bitmap, if it is needed */ + if (!isSlotBitmapAllSlots(link->slot_bitmap)) { + char *args[] = {"REPLCONF", "slot-bitmap", NULL}; + size_t lens[] = {8, 11, 0}; + args[2] = (char *) link->slot_bitmap; + lens[2] = sizeof(slotBitmap); + *err = sendCommandArgv(link->rdb_transfer_s, 3, args, lens); + if (*err) { + dualChannelServerLog(LL_WARNING, "Sending REPLCONF slot-bitmap command to primary in dual channel replication handshake: %s", *err); + return C_ERR; + } + } + + if (connSetReadHandler(link->rdb_transfer_s, dualChannelFullSyncWithReplicationSource) == C_ERR) { char conninfo[CONN_INFO_LEN]; dualChannelServerLog(LL_WARNING, "Can't create readable event for SYNC: %s (%s)", strerror(errno), connGetInfo(link->transfer_s, conninfo, sizeof(conninfo))); @@ -2842,7 +2855,7 @@ static int dualChannelReplHandleHandshake(replicationLink *link, sds *err) { } static int dualChannelReplHandleAuthReply(replicationLink *link, sds *err) { - *err = receiveSynchronousResponse(link->transfer_s); + *err = receiveSynchronousResponse(link->rdb_transfer_s); if (*err == NULL) { dualChannelServerLog(LL_WARNING, "Primary did not respond to auth command during SYNC handshake"); return C_ERR; @@ -2855,7 +2868,7 @@ static int dualChannelReplHandleAuthReply(replicationLink *link, sds *err) { } static int dualChannelReplHandleReplconfReply(replicationLink 
*link, sds *err) { - *err = receiveSynchronousResponse(link->transfer_s); + *err = receiveSynchronousResponse(link->rdb_transfer_s); if (*err == NULL) { dualChannelServerLog(LL_WARNING, "Primary did not respond to replconf command during SYNC handshake"); return C_ERR; @@ -2866,8 +2879,24 @@ static int dualChannelReplHandleReplconfReply(replicationLink *link, sds *err) { *err); return C_ERR; } - if (connSyncWrite(link->transfer_s, "SYNC\r\n", 6, server.repl_syncio_timeout * 1000) == -1) { - dualChannelServerLog(LL_WARNING, "I/O error writing to Primary: %s", connGetLastError(link->transfer_s)); + + /* Recieve slot bitmap response as well. */ + if (!isSlotBitmapAllSlots(link->slot_bitmap)) { + *err = receiveSynchronousResponse(link->rdb_transfer_s); + if (*err == NULL) { + dualChannelServerLog(LL_WARNING, "Primary did not respond to replconf slot-bitmap command during SYNC handshake"); + return C_ERR; + } + + if (*err[0] == '-') { + dualChannelServerLog(LL_NOTICE, "Server does not support sync with slot-bitmap, dual channel sync approach cannot be used: %s", + *err); + return C_ERR; + } + } + + if (connSyncWrite(link->rdb_transfer_s, "SYNC\r\n", 6, server.repl_syncio_timeout * 1000) == -1) { + dualChannelServerLog(LL_WARNING, "I/O error writing to Primary: %s", connGetLastError(link->rdb_transfer_s)); return C_ERR; } return C_OK; @@ -2879,7 +2908,7 @@ int replicationUseAOFFormatSnapshot(replicationLink *link) { static int dualChannelReplHandleEndOffsetResponse(replicationLink *link, sds *err) { uint64_t rdb_client_id; - *err = receiveSynchronousResponse(link->transfer_s); + *err = receiveSynchronousResponse(link->rdb_transfer_s); if (*err == NULL) { return C_ERR; } @@ -2996,6 +3025,10 @@ static void dualChannelFullSyncWithReplicationSource(connection *conn) { link->transfer_fd = -1; link->state = REPL_STATE_CONNECT; replicationAbortDualChannelSyncTransfer(link); + if (link->client) { + freeClient(link->client); + link->client = NULL; + } } /* Replication: Replica side. @@ -3730,7 +3763,7 @@ void syncWithSource(connection *conn) { char *argv[3] = {"REPLCONF", "slot-bitmap", NULL}; size_t lens[3] = {8, 11, 0}; argv[2] = (char *)link->slot_bitmap; - lens[2] = CLUSTER_SLOTS/8; + lens[2] = sizeof(slotBitmap); err = sendCommandArgv(conn, 3, argv, lens); if (err) goto write_error; } @@ -3950,8 +3983,10 @@ void syncWithSource(connection *conn) { link->transfer_fd = dfd; } - /* We are going to need to do a full resync. If we are accepting a single - * slot - make sure we have a clean slate to load it into.*/ + /* We are going to need to do a full resync. If we are accepting a + * slot subset - make sure we have a clean state to load it into. This may + * happen in cases where a previous replication attempt failed and is being + * retried. 
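Because the slot bitmap is raw binary (2048 bytes, not NUL-terminated text), both handshake paths send it with sendCommandArgv and explicit argument lengths rather than the printf-style sendCommand. Illustratively, the request that ends up on the wire is an ordinary RESP command whose last bulk argument is the bitmap (byte count shown for the default 16384 slots; the trace below is illustrative only):

    *3\r\n
    $8\r\nREPLCONF\r\n
    $11\r\nslot-bitmap\r\n
    $2048\r\n<2048 bitmap bytes>\r\n

The source is expected to acknowledge with a simple status reply; any error reply is treated as "this primary does not support slot-bitmap sync" and the attempt is aborted.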
*/ if (!isSlotBitmapAllSlots(link->slot_bitmap)) { dropKeysInSlotBitmap(link->slot_bitmap, 1); } @@ -3961,6 +3996,7 @@ void syncWithSource(connection *conn) { if (psync_result == PSYNC_FULLRESYNC_DUAL_CHANNEL) { /* Create RDB connection */ link->rdb_transfer_s = connCreate(connTypeOfReplication()); + connSetPrivateData(link->rdb_transfer_s, link); if (connConnect(link->rdb_transfer_s, link->host, link->port, server.bind_source_addr, dualChannelFullSyncWithReplicationSource) == C_ERR) { serverLog(LL_WARNING, "Unable to connect to source: %s", connGetLastError(link->transfer_s)); @@ -4013,6 +4049,10 @@ void syncWithSource(connection *conn) { link->transfer_tmpfile = NULL; link->transfer_fd = -1; link->state = REPL_STATE_CONNECT; + if (link->client) { + freeClient(link->client); + link->client = NULL; + } return; write_error: /* Handle sendCommand() errors. */ @@ -4021,12 +4061,12 @@ void syncWithSource(connection *conn) { goto error; } -replicationLink *createReplicationLink(char *host, int port, unsigned char *slot_bitmap) { +replicationLink *createReplicationLink(char *host, int port, slotBitmap slot_bitmap) { replicationLink *result = (replicationLink *)zmalloc(sizeof(replicationLink)); result->protected = 0; result->state = REPL_STATE_NONE; result->rdb_channel_state = REPL_DUAL_CHANNEL_STATE_NONE; - memcpy(result->slot_bitmap, slot_bitmap, sizeof(result->slot_bitmap)); + memcpy(result->slot_bitmap, slot_bitmap, sizeof(slotBitmap)); result->client = NULL; result->host = sdsnew(host); result->port = port; @@ -4306,6 +4346,17 @@ void replicationHandleSourceDisconnection(replicationLink *link) { link->client = NULL; link->state = REPL_STATE_CONNECT; + if (link->rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) { + /* Our client was closed in the middle of dual channel (e.g, we were + * loading AOF as a client). Ensure that the other dual channel + * connections are cleaned up. */ + if (link->transfer_s) { + connClose(link->transfer_s); + link->transfer_s = NULL; + } + replicationAbortDualChannelSyncTransfer(link); + } + /* Try to re-connect immediately rather than wait for replicationCron * waiting 1 second may risk backlog being recycled. */ if (link->host) { @@ -5125,7 +5176,7 @@ void replicationCron(void) { replication_cron_loops++; /* Incremented with frequency 1 HZ. */ } -int shouldStartChildReplication(int *mincapa_out, int *req_out, unsigned char *slot_bitmap_out) { +int shouldStartChildReplication(int *mincapa_out, int *req_out, slotBitmap slot_bitmap_out) { /* We should start a BGSAVE good for replication if we have replicas in * WAIT_BGSAVE_START state. 
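With the new signature, a caller describes the replication scope entirely through the bitmap; a link whose bitmap covers every slot behaves like classic full replication, while a partial bitmap triggers the slot-scoped drop-and-resync path above. A hypothetical caller (host, port and the chosen slots are example values, not part of the patch) would look like:

    slotBitmap slots;
    memset(slots, 0, sizeof(slotBitmap));
    bitmapSetBit(slots, 100);   /* replicate only slots 100-101 */
    bitmapSetBit(slots, 101);

    replicationLink *link = createReplicationLink("10.0.0.1", 6379, slots);
    if (connectReplicationLink(link) == C_ERR) {
        freeReplicationLink(link);
    }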
* @@ -5137,7 +5188,7 @@ int shouldStartChildReplication(int *mincapa_out, int *req_out, unsigned char *s int replicas_waiting = 0; int mincapa; int req; - unsigned char slot_bitmap[CLUSTER_SLOTS/8]; + slotBitmap slot_bitmap; int first = 1; listNode *ln; listIter li; @@ -5149,7 +5200,7 @@ int shouldStartChildReplication(int *mincapa_out, int *req_out, unsigned char *s if (first) { /* Get first replica's requirements */ req = replica->repl_data->replica_req; - memcpy(slot_bitmap, replica->repl_data->slot_bitmap, sizeof(slot_bitmap)); + memcpy(slot_bitmap, replica->repl_data->slot_bitmap, sizeof(slotBitmap)); } else if (req != replica->repl_data->replica_req) { /* Skip replicas that don't match */ continue; @@ -5168,7 +5219,7 @@ int shouldStartChildReplication(int *mincapa_out, int *req_out, unsigned char *s max_idle >= server.repl_diskless_sync_delay)) { if (mincapa_out) *mincapa_out = mincapa; if (req_out) *req_out = req; - if (slot_bitmap_out) memcpy(slot_bitmap_out, slot_bitmap, sizeof(slot_bitmap)); + if (slot_bitmap_out) memcpy(slot_bitmap_out, slot_bitmap, sizeof(slotBitmap)); return 1; } } @@ -5179,8 +5230,8 @@ int shouldStartChildReplication(int *mincapa_out, int *req_out, unsigned char *s void replicationStartPendingFork(void) { int mincapa = -1; int req = -1; - unsigned char slot_bitmap[CLUSTER_SLOTS/8]; - bitmapSetAllBits(slot_bitmap, sizeof(slot_bitmap)); + slotBitmap slot_bitmap; + slotBitmapSetAll(slot_bitmap); if (shouldStartChildReplication(&mincapa, &req, slot_bitmap)) { /* Start the BGSAVE. The called function may start a diff --git a/src/server.h b/src/server.h index 5d7b8db461..34c8e9ba41 100644 --- a/src/server.h +++ b/src/server.h @@ -1108,6 +1108,8 @@ typedef struct ClientPubSubData { context of client side caching. */ } ClientPubSubData; +typedef unsigned char slotBitmap[CLUSTER_SLOTS / 8]; + typedef struct replicationLink replicationLink; typedef struct ClientReplicationData { int repl_state; /* Replication state if this is a replica. */ @@ -1135,12 +1137,12 @@ typedef struct ClientReplicationData { short replica_req; /* Replica requirements: REPLICA_REQ_* */ uint64_t associated_rdb_client_id; /* The client id of this replica's rdb connection */ time_t rdb_client_disconnect_time; /* Time of the first freeClient call on this client. Used for delaying free. */ - listNode *ref_repl_buf_node; /* Referenced node of replication buffer blocks, - see the definition of replBufBlock. */ - size_t ref_block_pos; /* Access position of referenced buffer block, - i.e. the next offset to send. */ - unsigned char slot_bitmap[CLUSTER_SLOTS/8]; /* The slot range this replica is replicating for. */ - replicationLink *link; /* The replication link owning this. */ + listNode *ref_repl_buf_node; /* Referenced node of replication buffer blocks, + see the definition of replBufBlock. */ + size_t ref_block_pos; /* Access position of referenced buffer block, + i.e. the next offset to send. */ + slotBitmap slot_bitmap; /* The slot range this replica is replicating for. */ + replicationLink *link; /* The replication link owning this. 
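A forked snapshot child can only serve replicas that want the same output, so the first replica found in WAIT_BGSAVE_START seeds the requirements (the REPLICA_REQ_* flags and now also the slot bitmap) and any waiting replica that differs is simply left for a later child. A condensed sketch of the matching applied between shouldStartChildReplication and rdbSaveToReplicasSockets (not a literal excerpt from either function):

    /* 'first' is the replica whose requirements seeded this snapshot child. */
    static int canShareSnapshotChild(client *first, client *replica) {
        if (replica->repl_data->replica_req != first->repl_data->replica_req) return 0;
        return memcmp(replica->repl_data->slot_bitmap,
                      first->repl_data->slot_bitmap, sizeof(slotBitmap)) == 0;
    }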
*/ } ClientReplicationData; typedef struct ClientModuleData { @@ -2765,7 +2767,7 @@ void ioThreadWriteToClient(void *data); int canParseCommand(client *c); int processIOThreadsReadDone(void); int processIOThreadsWriteDone(void); -replicationLink *createReplicationLink(char *host, int port, unsigned char *slot_bitmap); +replicationLink *createReplicationLink(char *host, int port, slotBitmap slot_bitmap); int connectReplicationLink(replicationLink *link); int freeReplicationLink(replicationLink *link); @@ -2997,7 +2999,7 @@ void aofOpenIfNeededOnServerStart(void); void aofManifestFree(aofManifest *am); int aofDelHistoryFiles(void); int aofRewriteLimited(void); -int rewriteAppendOnlyFileRio(rio *aof, unsigned char *slot_bitmap); +int rewriteAppendOnlyFileRio(rio *aof, slotBitmap slot_bitmap); /* Child info */ void openChildInfoPipe(void); From 50878f5784c940c6a48ca51ac37e81f3f0ba6970 Mon Sep 17 00:00:00 2001 From: Jacob Murphy Date: Fri, 17 Jan 2025 22:39:46 +0000 Subject: [PATCH 04/18] Refactor code to reduce touch points in replication.c Signed-off-by: Jacob Murphy --- src/aof.c | 4 +- src/blocked.c | 2 +- src/cluster.h | 5 +- src/cluster_legacy.c | 164 ++-- src/cluster_legacy.h | 16 +- src/config.c | 23 +- src/db.c | 20 +- src/evict.c | 2 +- src/expire.c | 2 +- src/io_threads.c | 2 +- src/module.c | 29 +- src/networking.c | 86 ++- src/object.c | 3 +- src/rdb.c | 14 +- src/replication.c | 1715 ++++++++++++++++++------------------------ src/script.c | 12 +- src/server.c | 70 +- src/server.h | 150 ++-- src/valkeymodule.h | 4 +- 19 files changed, 1071 insertions(+), 1252 deletions(-) diff --git a/src/aof.c b/src/aof.c index 06f04760ca..dbebc92e63 100644 --- a/src/aof.c +++ b/src/aof.c @@ -2204,7 +2204,7 @@ int rewriteAppendOnlyFileRio(rio *aof, slotBitmap slot_bitmap) { kvstoreIterator *kvs_it = NULL; /* Record timestamp at the beginning of rewriting AOF. */ - if (server.aof_timestamp_enabled && isSlotBitmapAllSlots(slot_bitmap)) { + if (server.aof_timestamp_enabled && !slot_bitmap) { sds ts = genAofTimestampAnnotationIfNeeded(1); if (rioWrite(aof, ts, sdslen(ts)) == 0) { sdsfree(ts); @@ -2224,7 +2224,7 @@ int rewriteAppendOnlyFileRio(rio *aof, slotBitmap slot_bitmap) { if (rioWrite(aof, selectcmd, sizeof(selectcmd) - 1) == 0) goto werr; if (rioWriteBulkLongLong(aof, j) == 0) goto werr; - if (isSlotBitmapAllSlots(slot_bitmap)) { + if (!slot_bitmap) { kvs_it = kvstoreIteratorInit(db->keys); } else { kvs_it = kvstoreFilteredIteratorInit(db->keys, &shouldFilterSlot, slot_bitmap); diff --git a/src/blocked.c b/src/blocked.c index d1a6ff9c6b..70da7877ad 100644 --- a/src/blocked.c +++ b/src/blocked.c @@ -100,7 +100,7 @@ void freeClientBlockingState(client *c) { * flag is set client query buffer is not longer processed, but accumulated, * and will be processed when the client is unblocked. 
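In the follow-up patch the partial AOF rewrite switches from a slot-range list to the bitmap, and the timestamp annotation and iterator selection now only test whether a bitmap was supplied at all. The filter callback handed to kvstoreFilteredIteratorInit is not part of the hunks shown here; presumably it reduces to a single bit test along these lines (sketch, reusing the name from the earlier list-based version):

    /* Return non-zero to skip slots that are not selected for this rewrite. */
    int shouldFilterSlot(int slot, void *privdata) {
        unsigned char *slot_bitmap = privdata;
        return !bitmapTestBit(slot_bitmap, slot);
    }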
*/ void blockClient(client *c, int btype) { - /* Primary client should never be blocked unless pause or module */ + /* Replication clients should never be blocked unless pause or module */ serverAssert(!(c->flag.replication_source && btype != BLOCKED_MODULE && btype != BLOCKED_POSTPONE)); initClientBlockingState(c); diff --git a/src/cluster.h b/src/cluster.h index 9b050d0b70..74889422b4 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -121,9 +121,10 @@ int bitmapTestBit(unsigned char *bitmap, int pos); void bitmapSetBit(unsigned char *bitmap, int pos); void bitmapClearBit(unsigned char *bitmap, int pos); void bitmapSetAllBits(unsigned char *bitmap, int len); -void slotBitmapSetAll(slotBitmap bitmap); -int isSlotBitmapAllSlots(slotBitmap bitmap); +int slotBitmapCompare(slotBitmap bitmap, slotBitmap other); +int isSlotBitmapEmpty(slotBitmap bitmap); int getSlotOrReply(client *c, robj *o); +void clusterSlotMigrationDoneSyncing(long long initial_offset); /* functions with shared implementations */ int clusterNodeIsMyself(clusterNode *n); diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 0e07057856..d174124f40 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -122,6 +122,7 @@ int verifyClusterNodeId(const char *name, int length); sds clusterEncodeOpenSlotsAuxField(int rdbflags); int clusterDecodeOpenSlotsAuxField(int rdbflags, sds s); static int nodeExceedsHandshakeTimeout(clusterNode *node, mstime_t now); +void clusterProceedWithSlotMigration(void); /* Only primaries that own slots have voting rights. * Returns 1 if the node has voting rights, otherwise returns 0. */ @@ -1456,7 +1457,7 @@ void clusterAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) { /* If the server is starting up, don't accept cluster connections: * UPDATE messages may interact with the database content. */ - if (server.primary == NULL && server.loading) return; + if (server.primary_host == NULL && server.loading) return; while (max--) { cfd = anetTcpAccept(server.neterr, fd, cip, sizeof(cip), &cport); @@ -4439,6 +4440,29 @@ void clusterPropagatePublish(robj *channel, robj *message, int sharded) { * Slot Migration functions * -------------------------------------------------------------------------- */ +slotMigration *clusterCreateSlotMigration(clusterNode *source, slotBitmap slots) { + slotMigration *result = (slotMigration *) zmalloc(sizeof(slotMigration)); + memcpy(result->slot_bitmap, slots, sizeof(slotBitmap)); + result->source_node = source; + result->state = SLOT_MIGRATION_QUEUED; + result->end_time = 0; /* Will be set once started. */ + result->replication_connection = NULL; + result->replication_client = NULL; + result->replication_handshake_state = REPL_STATE_NONE; + result->pause_end = 0; + result->pause_primary_offset = -1; + return result; +} + +void clusterFreeSlotMigration(slotMigration *migration) { + if (migration->replication_client) { + freeClient(migration->replication_client); + } else if (migration->replication_connection) { + connClose(migration->replication_connection); + } + zfree(migration); +} + /* Gets the current slot migration from the head of the queue. 
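clusterCreateSlotMigration/clusterFreeSlotMigration centralize the lifecycle of a migration descriptor: before the SYNC a migration only owns a bare connection, afterwards it owns a client (which in turn owns the connection), and the free path handles both cases. The CLUSTER MIGRATE handler further down in this patch consumes the constructor roughly as:

    slotMigration *m = clusterCreateSlotMigration(curr_owner, requested_slots);
    listAddNodeTail(server.cluster->slot_migrations, m);
    clusterProceedWithSlotMigration();   /* kick the state machine right away */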
*/ slotMigration *clusterGetCurrentSlotMigration(void) { if (listLength(server.cluster->slot_migrations) == 0) return NULL; @@ -4456,6 +4480,23 @@ void clusterSendMigrateSlotStart(clusterNode *node, slotBitmap slot_bitmap) { clusterMsgSendBlockDecrRefCount(msgblock); } +void clusterImportHandler(connection *conn) { + UNUSED(conn); + /* This is called if there is an event on the current migrations + * connection. If that is the case, we can just continue with our + * state machine.*/ + clusterProceedWithSlotMigration(); +} + +void clusterSlotMigrationDoneSyncing(long long initial_offset) { + slotMigration *migration = clusterGetCurrentSlotMigration(); + serverAssert(migration != NULL && migration->state == SLOT_MIGRATION_RECEIVE_SYNC); + migration->state = SLOT_MIGRATION_PAUSE_OWNER; + migration->replication_client->repl_data->reploff = initial_offset; + migration->replication_client->repl_data->read_reploff = initial_offset; + clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_SLOTMIGRATION); +} + /* This is the main state machine for the slot migration workflow. Slot * migration is driven by the new owner of the slot. This function will do as * much work as possible synchronously, processing the enqueued slot migrations @@ -4471,7 +4512,7 @@ void clusterProceedWithSlotMigration(void) { "Timed out for slot migration from source node %.40s", curr_migration->source_node->name); curr_migration->state = SLOT_MIGRATION_FAILED; } - if (curr_migration->state > SLOT_MIGRATION_PAUSE_OWNER && curr_migration->state < SLOT_MIGRATION_FAILED && curr_migration->pause_end < mstime() && curr_migration->vote_retry_time < mstime()) { + if (curr_migration->state > SLOT_MIGRATION_PAUSE_OWNER && curr_migration->state < SLOT_MIGRATION_FAILED && curr_migration->pause_end < mstime()) { /* If the owner ever unpauses, we have to move back in the state machine and retry. */ serverLog(LL_WARNING, "Reinitiating pause on the node owning the slot range..."); curr_migration->state = SLOT_MIGRATION_PAUSE_OWNER; @@ -4480,43 +4521,87 @@ void clusterProceedWithSlotMigration(void) { switch(curr_migration->state) { case SLOT_MIGRATION_QUEUED: /* Start the migration */ - serverLog(LL_NOTICE, "Starting sync from migration source node %.40s", curr_migration->source_node->name); + serverLog(LL_NOTICE, "Starting replication of slots from migration source node %.40s", curr_migration->source_node->name); curr_migration->end_time = mstime() + CLUSTER_SLOT_MIGRATION_TIMEOUT; - curr_migration->link = createReplicationLink(curr_migration->source_node->ip, getNodeDefaultReplicationPort(curr_migration->source_node), curr_migration->slot_bitmap); - if (connectReplicationLink(curr_migration->link) == C_ERR) { + curr_migration->replication_connection = connCreate(server.tls_replication ? 
connectionTypeTls() : connectionTypeTcp()); + if (connConnect(curr_migration->replication_connection, curr_migration->source_node->ip, getNodeDefaultReplicationPort(curr_migration->source_node), server.bind_source_addr, clusterImportHandler) == C_ERR) { serverLog(LL_WARNING, - "Failed to begin sync from migration source node %.40s", curr_migration->source_node->name); + "Failed to connect to migration source node %.40s", curr_migration->source_node->name); curr_migration->state = SLOT_MIGRATION_FAILED; continue; } - curr_migration->state = SLOT_MIGRATION_SYNCING; + curr_migration->replication_handshake_state = REPL_STATE_CONNECTING; + curr_migration->state = SLOT_MIGRATION_CONNECTING; continue; - case SLOT_MIGRATION_SYNCING: - /* replicationCron should manage retrying connection, but there could be scenarios where we hit an irrecoverable error. */ - if (curr_migration->link->state == REPL_STATE_NONE || curr_migration->link->state == REPL_STATE_CANCELLED) { - serverLog(LL_WARNING, "Sync failed from migration node %.40s", curr_migration->source_node->name); + case SLOT_MIGRATION_CONNECTING: + if (curr_migration->replication_connection->state == CONN_STATE_CONNECTED) { + curr_migration->state = SLOT_MIGRATION_REPL_HANDSHAKE; + continue; + } + /* Nothing to do, waiting for connection to be established. */ + return; + case SLOT_MIGRATION_REPL_HANDSHAKE: + curr_migration->replication_handshake_state = replicationProceedWithHandshake(curr_migration->replication_connection, curr_migration->replication_handshake_state, curr_migration->slot_bitmap); + if (curr_migration->replication_handshake_state == REPL_STATE_ERROR) { + serverLog(LL_WARNING, "Handshake failed from migration node %.40s", curr_migration->source_node->name); curr_migration->state = SLOT_MIGRATION_FAILED; continue; } - if (curr_migration->link->state == REPL_STATE_CONNECTED) { - curr_migration->state = SLOT_MIGRATION_PAUSE_OWNER; + if (curr_migration->replication_handshake_state == REPL_STATE_SEND_PSYNC) { + curr_migration->state = SLOT_MIGRATION_SEND_SYNC; continue; } - /* If we are in another state, nothing to do right now. */ return; + case SLOT_MIGRATION_SEND_SYNC: + /* Ensure we have a clean state for the SYNC. */ + dropKeysInSlotBitmap(curr_migration->slot_bitmap, 1); + + /* We are done with our handshake phase. We can proceed straight to doing our SYNC. + * Note that we are skipping PSYNC. PSYNC will always result in full resync for a + * slot migration anyways. + * + * In the future, we can do a PSYNC phase to incorporate dual channel. */ + serverLog(LL_NOTICE, "Starting SYNC for slot migration from migration source node %.40s", curr_migration->source_node->name); + if (connSyncWrite(curr_migration->replication_connection, "SYNC\r\n", 6, server.repl_syncio_timeout * 1000) == -1) { + serverLog(LL_WARNING, "I/O error writing to slot migration source: %s", connGetLastError(curr_migration->replication_connection)); + curr_migration->state = SLOT_MIGRATION_FAILED; + continue; + } + client *c = createClient(curr_migration->replication_connection); + curr_migration->replication_client = c; + c->flag.replication_source = 1; + c->flag.slot_migration_source = 1; + c->flag.authenticated = 1; + c->user = NULL; /* This client can do everything. */ + initClientReplicationData(c); /* We use this to track offset. */ + c->querybuf = sdsempty(); /* Similar to primary, we use a dedicated query buf. */ + + /* Our result will be received in AOF format, so we can pipe it + * straight to readQueryFromClient. 
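The target deliberately issues a plain SYNC rather than PSYNC: a slot subset can never be served from the replication backlog, so a full snapshot is unavoidable, and because the source produces that snapshot in AOF format the reply is just a stream of ordinary RESP commands that the normal command parser can apply. Illustratively (framing details such as any length or EOF markers are handled elsewhere in the series and omitted here; key and value are example data):

    target -> source:  SYNC\r\n
    source -> target:  *2\r\n$6\r\nSELECT\r\n$1\r\n0\r\n
                       *3\r\n$3\r\nSET\r\n$7\r\nkey{42}\r\n$3\r\nval\r\n
                       ... every key in the migrating slots, then the live write stream ...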
*/ + connSetReadHandler(c->conn, readQueryFromClient); + curr_migration->state = SLOT_MIGRATION_RECEIVE_SYNC; + continue; + case SLOT_MIGRATION_RECEIVE_SYNC: + return; /* Nothing to do */ case SLOT_MIGRATION_PAUSE_OWNER: - serverLog(LL_NOTICE, "Replication link to slot owner %.40s has been established. Pausing source node and waiting to continue", curr_migration->source_node->name); + /* Send an ACK to put the connection into streaming state. */ + replicationSendAck(curr_migration->replication_client); + + serverLog(LL_NOTICE, "Replication sync to slot owner %.40s has been performed. Current replication offset: %lld. Pausing source node and waiting to continue", curr_migration->source_node->name, curr_migration->replication_client->repl_data->reploff); clusterSendMigrateSlotStart(curr_migration->source_node, curr_migration->slot_bitmap); curr_migration->pause_primary_offset = -1; curr_migration->pause_end = mstime() + CLUSTER_MF_TIMEOUT; curr_migration->state = SLOT_MIGRATION_WAITING_FOR_OFFSET; continue; case SLOT_MIGRATION_WAITING_FOR_OFFSET: - /* Nothing to do, need to wait for cluster message to come in. */ + /* Send REPLCONF ACK from time to time */ + replicationSendAck(curr_migration->replication_client); return; case SLOT_MIGRATION_SYNCING_TO_OFFSET: - if (curr_migration->link->client->repl_data->reploff >= curr_migration->pause_primary_offset) { - serverLog(LL_NOTICE, "Replication of slot range has caught up to paused slot owner, slot migration can start."); + /* Send REPLCONF ACK from time to time */ + replicationSendAck(curr_migration->replication_client); + if (curr_migration->replication_client->repl_data->reploff >= curr_migration->pause_primary_offset) { + serverLog(LL_NOTICE, "Replication of slot range has caught up to paused slot owner (%lld), slot migration can start.", curr_migration->pause_primary_offset); curr_migration->state = SLOT_MIGRATION_FINISH; continue; } @@ -4535,17 +4620,15 @@ void clusterProceedWithSlotMigration(void) { if (clusterBumpConfigEpochWithoutConsensus() == C_OK) { serverLog(LL_NOTICE, "ConfigEpoch updated after importing slots"); } + clusterFreeSlotMigration(curr_migration); clusterBroadcastPong(CLUSTER_BROADCAST_ALL); listDelNode(server.cluster->slot_migrations, curr_node); - freeReplicationLink(curr_migration->link); - zfree(curr_migration); continue; case SLOT_MIGRATION_FAILED: /* Delete the migration from the queue and proceed to the next migration */ listDelNode(server.cluster->slot_migrations, curr_node); - freeReplicationLink(curr_migration->link); dropKeysInSlotBitmap(curr_migration->slot_bitmap, server.repl_replica_lazy_flush); - zfree(curr_migration); + clusterFreeSlotMigration(curr_migration); continue; } } @@ -4896,8 +4979,8 @@ void clusterHandleReplicaFailover(void) { /* Set data_age to the number of milliseconds we are disconnected from * the primary. */ - if (server.primary && server.primary->state == REPL_STATE_CONNECTED) { - data_age = (mstime_t)(server.unixtime - server.primary->client->last_interaction) * 1000; + if (server.repl_state == REPL_STATE_CONNECTED) { + data_age = (mstime_t)(server.unixtime - server.primary->last_interaction) * 1000; } else { data_age = (mstime_t)(server.unixtime - server.repl_down_since) * 1000; } @@ -5489,7 +5572,7 @@ void clusterCron(void) { /* If we are a replica node but the replication is still turned off, * enable it if we know the address of our primary and it appears to * be up. 
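The repeated replicationSendAck calls matter for two reasons: the first ACK is what moves this connection into the online/streaming replica state on the source, and the subsequent ones keep reporting how much of the stream the target has applied while it waits to reach the pause offset. Conceptually the ACK is the same REPLCONF ACK used by normal replication; a sketch of the payload (the real implementation lives in replication.c and must also set the force-reply flag, since replies to replication-source clients are normally suppressed):

    addReplyArrayLen(c, 3);
    addReplyBulkCString(c, "REPLCONF");
    addReplyBulkCString(c, "ACK");
    addReplyBulkLongLong(c, c->repl_data->reploff);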
*/ - if (nodeIsReplica(myself) && server.primary == NULL && myself->replicaof && nodeHasAddr(myself->replicaof)) { + if (nodeIsReplica(myself) && server.primary_host == NULL && myself->replicaof && nodeHasAddr(myself->replicaof)) { replicationSetPrimary(myself->replicaof->ip, getNodeDefaultReplicationPort(myself->replicaof), 0); } @@ -5592,20 +5675,14 @@ void bitmapClearBit(unsigned char *bitmap, int pos) { bitmap[byte] &= ~(1 << bit); } -void slotBitmapSetAll(slotBitmap bitmap) { - memset(bitmap, 0xff, sizeof(slotBitmap)); -} - int slotBitmapCompare(slotBitmap bitmap, slotBitmap otherbitmap) { return memcmp(bitmap, otherbitmap, sizeof(slotBitmap)); } -/* Return if the slot bitmap contains all slots */ -int isSlotBitmapAllSlots(slotBitmap bitmap) { - if (!bitmap) return 1; - slotBitmap all_slot_bitmap; - slotBitmapSetAll(all_slot_bitmap); - return slotBitmapCompare(bitmap, all_slot_bitmap) == 0; +int isSlotBitmapEmpty(slotBitmap bitmap) { + slotBitmap empty; + memset(empty, 0, sizeof(slotBitmap)); + return slotBitmapCompare(bitmap, empty) == 0; } /* Return non-zero if there is at least one primary with replicas in the cluster. @@ -6706,13 +6783,13 @@ int clusterParseSetSlotCommand(client *c, int *slot_out, clusterNode **node_out, int optarg_pos = 0; /* Allow primaries to replicate "CLUSTER SETSLOT" */ - if (!c->flag.replication_source && nodeIsReplica(myself)) { + if (!c->flag.primary && nodeIsReplica(myself)) { addReplyError(c, "Please use SETSLOT only with masters."); return 0; } /* If 'myself' is a replica, 'c' must be the primary client. */ - serverAssert(!nodeIsReplica(myself) || (server.primary && c == server.primary->client)); + serverAssert(!nodeIsReplica(myself) || c == server.primary); if ((slot = getSlotOrReply(c, c->argv[2])) == -1) return 0; @@ -7343,18 +7420,7 @@ int clusterCommandSpecial(client *c) { } } - slotMigration *to_enqueue = (slotMigration *) zmalloc(sizeof(slotMigration)); - memcpy(to_enqueue->slot_bitmap, requested_slots, sizeof(slotBitmap)); - to_enqueue->source_node = curr_owner; - to_enqueue->state = SLOT_MIGRATION_QUEUED; - to_enqueue->end_time = 0; /* Will be set once started. */ - to_enqueue->link = NULL; - to_enqueue->pause_end = 0; - to_enqueue->pause_primary_offset = -1; - to_enqueue->vote_end_time = 0; - to_enqueue->vote_retry_time = 0; - to_enqueue->vote_epoch = 0; - to_enqueue->auth_count = 0; + slotMigration * to_enqueue = clusterCreateSlotMigration(curr_owner, requested_slots); listAddNodeTail(server.cluster->slot_migrations, to_enqueue); clusterProceedWithSlotMigration(); addReply(c, shared.ok); diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index 1b83c1b2f5..f9c6f5e5b8 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -377,8 +377,12 @@ struct _clusterNode { }; typedef enum slotMigrationState { - SLOT_MIGRATION_QUEUED, /* Queued behind some other slot migration. */ - SLOT_MIGRATION_SYNCING, /* Syncing contents from current owner. */ + SLOT_MIGRATION_QUEUED, + SLOT_MIGRATION_CONNECTING, + SLOT_MIGRATION_REPL_HANDSHAKE, /* The handshake has it's own state machine, + * see replicationProceedWithHandshake */ + SLOT_MIGRATION_SEND_SYNC, + SLOT_MIGRATION_RECEIVE_SYNC, SLOT_MIGRATION_PAUSE_OWNER, SLOT_MIGRATION_WAITING_FOR_OFFSET, SLOT_MIGRATION_SYNCING_TO_OFFSET, @@ -392,13 +396,11 @@ typedef struct slotMigration { clusterNode *source_node; mstime_t end_time; /* Slot migration time limit (ms unixtime). If not yet in progress (e.g. queued), will be zero. 
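One property of slotMigrationState worth keeping in mind when extending it: the state machine's timeout and re-pause guards compare states relationally, so the enumerators must stay declared in workflow order and new states have to be inserted at the matching point rather than appended at the end. A sketch of the guard this ordering supports:

    /* Equivalent to the check in clusterProceedWithSlotMigration(): true only
     * for the post-pause states (waiting for offset, syncing to offset, finish).
     * Relies on the enum being declared in workflow order. */
    static int slotMigrationOwnerPaused(const slotMigration *m) {
        return m->state > SLOT_MIGRATION_PAUSE_OWNER && m->state < SLOT_MIGRATION_FAILED;
    }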
*/ - replicationLink *link; + connection *replication_connection; /* Connection for replication. */ + client *replication_client; /* Client for replication */ + int replication_handshake_state; mstime_t pause_end; long long pause_primary_offset; - mstime_t vote_end_time; - mstime_t vote_retry_time; - uint64_t vote_epoch; - int auth_count; } slotMigration; /* Struct used for storing slot statistics. */ diff --git a/src/config.c b/src/config.c index 512b35f210..5b90ebbd60 100644 --- a/src/config.c +++ b/src/config.c @@ -596,7 +596,7 @@ void loadServerConfigFromString(char *config) { } /* Sanity checks. */ - if (server.cluster_enabled && server.primary) { + if (server.cluster_enabled && server.primary_host) { err = "replicaof directive not allowed in cluster mode"; goto loaderr; } @@ -1451,11 +1451,11 @@ void rewriteConfigReplicaOfOption(standardConfig *config, const char *name, stru /* If this is a primary, we want all the replicaof config options * in the file to be removed. Note that if this is a cluster instance * we don't want a replicaof directive inside valkey.conf. */ - if (server.cluster_enabled || server.primary == NULL) { + if (server.cluster_enabled || server.primary_host == NULL) { rewriteConfigMarkAsProcessed(state, name); return; } - line = sdscatprintf(sdsempty(), "%s %s %d", name, server.primary->host, server.primary->port); + line = sdscatprintf(sdsempty(), "%s %s %d", name, server.primary_host, server.primary_port); rewriteConfigRewriteLine(state, name, line, 1); } @@ -3000,20 +3000,19 @@ static int setConfigReplicaOfOption(standardConfig *config, sds *argv, int argc, return 0; } - freeReplicationLink(server.primary); - server.primary = NULL; - + sdsfree(server.primary_host); + server.primary_host = NULL; if (!strcasecmp(argv[0], "no") && !strcasecmp(argv[1], "one")) { return 1; } char *ptr; - int port = strtol(argv[1], &ptr, 10); - if (port < 0 || port > 65535 || *ptr != '\0') { + server.primary_port = strtol(argv[1], &ptr, 10); + if (server.primary_port < 0 || server.primary_port > 65535 || *ptr != '\0') { *err = "Invalid primary port"; return 0; } - server.primary = createReplicationLink(argv[0], port, NULL); - server.primary->state = REPL_STATE_CONNECT; + server.primary_host = sdsnew(argv[0]); + server.repl_state = REPL_STATE_CONNECT; return 1; } @@ -3025,8 +3024,8 @@ static sds getConfigBindOption(standardConfig *config) { static sds getConfigReplicaOfOption(standardConfig *config) { UNUSED(config); char buf[256]; - if (server.primary) - snprintf(buf, sizeof(buf), "%s %d", server.primary->host, server.primary->port); + if (server.primary_host) + snprintf(buf, sizeof(buf), "%s %d", server.primary_host, server.primary_port); else buf[0] = '\0'; return sdsnew(buf); diff --git a/src/db.c b/src/db.c index 05b395728a..134dc6e9dd 100644 --- a/src/db.c +++ b/src/db.c @@ -110,7 +110,7 @@ robj *lookupKey(serverDb *db, robj *key, int flags) { * It's possible that the WRITE flag is set even during a readonly * command, since the command may trigger events that cause modules to * perform additional writes. */ - int is_ro_replica = server.primary && server.repl_replica_ro; + int is_ro_replica = server.primary_host && server.repl_replica_ro; int expire_flags = 0; if (flags & LOOKUP_WRITE && !is_ro_replica) expire_flags |= EXPIRE_FORCE_DELETE_EXPIRED; if (flags & LOOKUP_NOEXPIRE) expire_flags |= EXPIRE_AVOID_DELETE_EXPIRED; @@ -258,7 +258,7 @@ int getKeySlot(sds key) { * so we must always recompute the slot for commands coming from the primary. 
*/ if (server.current_client && server.current_client->slot >= 0 && server.current_client->flag.executing_command && - !server.current_client->flag.replication_source) { + !server.current_client->flag.primary) { debugServerAssertWithInfo(server.current_client, NULL, (int)keyHashSlot(key, (int)sdslen(key)) == server.current_client->slot); return server.current_client->slot; @@ -267,7 +267,7 @@ int getKeySlot(sds key) { /* For the case of replicated commands from primary, getNodeByQuery() never gets called, * and thus c->slot never gets populated. That said, if this command ends up accessing a key, * we are able to backfill c->slot here, where the key's hash calculation is made. */ - if (server.current_client && server.current_client->flag.replication_source) { + if (server.current_client && server.current_client->flag.primary) { server.current_client->slot = slot; } return slot; @@ -446,7 +446,7 @@ robj *dbRandomKey(serverDb *db) { sds key = objectGetKey(valkey); robj *keyobj = createStringObject(key, sdslen(key)); if (objectIsExpired(valkey)) { - if (allvolatile && (server.primary || server.import_mode) && --maxtries == 0) { + if (allvolatile && (server.primary_host || server.import_mode) && --maxtries == 0) { /* If the DB is composed only of keys with an expire set, * it could happen that all the keys are already logically * expired in the replica, so the function cannot stop because @@ -1801,8 +1801,8 @@ robj *setExpire(client *c, serverDb *db, robj *key, long long when) { serverAssert(added); } - int writable_replica = server.primary && server.repl_replica_ro == 0; - if (c && writable_replica && !c->flag.replication_source) rememberReplicaKeyWithExpire(db, key); + int writable_replica = server.primary_host && server.repl_replica_ro == 0; + if (c && writable_replica && !c->flag.primary) rememberReplicaKeyWithExpire(db, key); return val; } @@ -1907,7 +1907,7 @@ static int objectIsExpired(robj *val) { /* Don't expire anything while loading. It will be done later. */ if (server.loading) return 0; if (!timestampIsExpired(objectGetExpire(val))) return 0; - if (server.primary == NULL && server.import_mode) { + if (server.primary_host == NULL && server.import_mode) { if (server.current_client && server.current_client->flag.import_source) return 0; } return 1; @@ -1925,7 +1925,7 @@ static int keyIsExpiredWithDictIndex(serverDb *db, robj *key, int dict_index) { if (!keyIsExpiredWithDictIndexImpl(db, key, dict_index)) return 0; /* See expireIfNeededWithDictIndex for more details. */ - if (server.primary == NULL && server.import_mode) { + if (server.primary_host == NULL && server.import_mode) { if (server.current_client && server.current_client->flag.import_source) return 0; } return 1; @@ -1959,8 +1959,8 @@ static keyStatus expireIfNeededWithDictIndex(serverDb *db, robj *key, robj *val, * * When replicating commands from the primary, keys are never considered * expired. 
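The getKeySlot() assertion above compares the cached client slot against keyHashSlot; for orientation, that is the standard cluster hashing, restated here as a sketch (the real function lives in cluster.c): CRC16 of the key, or of the first non-empty {...} hash tag if one exists, masked to the 16384-slot space.

    unsigned int keyHashSlot(char *key, int keylen) {
        int s, e;
        for (s = 0; s < keylen; s++)
            if (key[s] == '{') break;
        if (s == keylen) return crc16(key, keylen) & 0x3FFF;               /* no '{' */
        for (e = s + 1; e < keylen; e++)
            if (key[e] == '}') break;
        if (e == keylen || e == s + 1) return crc16(key, keylen) & 0x3FFF; /* no tag */
        return crc16(key + s + 1, e - s - 1) & 0x3FFF;                     /* hash only the tag */
    }

For example, "user:{1000}:profile" and "user:{1000}:session" hash only the substring "1000" and therefore always land in the same slot.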
*/ - if (server.primary != NULL) { - if (server.current_client && (server.current_client->flag.replication_source)) return KEY_VALID; + if (server.primary_host != NULL) { + if (server.current_client && (server.current_client->flag.primary)) return KEY_VALID; if (!(flags & EXPIRE_FORCE_DELETE_EXPIRED)) return KEY_EXPIRED; } else if (server.import_mode) { /* If we are running in the import mode on a primary, instead of diff --git a/src/evict.c b/src/evict.c index f91f2b76f7..d4bfade4fc 100644 --- a/src/evict.c +++ b/src/evict.c @@ -466,7 +466,7 @@ static int isSafeToPerformEvictions(void) { /* By default replicas should ignore maxmemory * and just be primaries exact copies. */ - if (server.primary && server.repl_replica_ignore_maxmemory) return 0; + if (server.primary_host && server.repl_replica_ignore_maxmemory) return 0; /* If 'evict' action is paused, for whatever reason, then return false */ if (isPausedActionsWithUpdate(PAUSE_ACTION_EVICT)) return 0; diff --git a/src/expire.c b/src/expire.c index 29dcd82c83..e4c3b0ec96 100644 --- a/src/expire.c +++ b/src/expire.c @@ -524,7 +524,7 @@ int checkAlreadyExpired(long long when) { * * If the server is a primary and in the import mode, we also add the already * expired key and wait for an explicit DEL from the import source. */ - return (when <= commandTimeSnapshot() && !server.loading && !server.primary && !server.import_mode); + return (when <= commandTimeSnapshot() && !server.loading && !server.primary_host && !server.import_mode); } #define EXPIRE_NX (1 << 0) diff --git a/src/io_threads.c b/src/io_threads.c index 260d7007be..715251a06a 100644 --- a/src/io_threads.c +++ b/src/io_threads.c @@ -345,7 +345,7 @@ int trySendReadToIOThreads(client *c) { c->cur_tid = tid; c->read_flags = canParseCommand(c) ? 0 : READ_FLAGS_DONT_PARSE; c->read_flags |= authRequired(c) ? READ_FLAGS_AUTH_REQUIRED : 0; - c->read_flags |= c->flag.replication_source ? READ_FLAGS_PRIMARY : 0; + c->read_flags |= c->flag.replication_source ? READ_FLAGS_REPLICATION_SOURCE : 0; c->io_read_state = CLIENT_PENDING_IO; connSetPostponeUpdateState(c->conn, 1); diff --git a/src/module.c b/src/module.c index 40a5c8de20..01c9962e90 100644 --- a/src/module.c +++ b/src/module.c @@ -3757,9 +3757,9 @@ int modulePopulateReplicationInfoStructure(void *ri, int structver) { ValkeyModuleReplicationInfoV1 *ri1 = ri; memset(ri1, 0, sizeof(*ri1)); ri1->version = structver; - ri1->primary = server.primary == NULL; - ri1->primary_host = server.primary ? server.primary->host : ""; - ri1->primary_port = server.primary ? server.primary->port : 0; + ri1->primary = server.primary_host == NULL; + ri1->primary_host = server.primary_host ? server.primary_host : ""; + ri1->primary_port = server.primary_port; ri1->replid1 = server.replid; ri1->replid2 = server.replid2; ri1->repl1_offset = server.primary_repl_offset; @@ -3948,7 +3948,8 @@ int VM_GetContextFlags(ValkeyModuleCtx *ctx) { if (ctx->client) { if (ctx->client->flag.deny_blocking) flags |= VALKEYMODULE_CTX_FLAGS_DENY_BLOCKING; /* Module command received from PRIMARY, is replicated. 
*/ - if (ctx->client->flag.replication_source) flags |= VALKEYMODULE_CTX_FLAGS_REPLICATED; + if (ctx->client->flag.primary) flags |= VALKEYMODULE_CTX_FLAGS_REPLICATED; + if (ctx->client->flag.slot_migration_source) flags |= VALKEYMODULE_CTX_FLAGS_IMPORTING_SLOT; if (ctx->client->resp == 3) { flags |= VALKEYMODULE_CTX_FLAGS_RESP3; } @@ -3973,7 +3974,7 @@ int VM_GetContextFlags(ValkeyModuleCtx *ctx) { flags |= VALKEYMODULE_CTX_FLAGS_LOADING; /* Maxmemory and eviction policy */ - if (server.maxmemory > 0 && (!server.primary || !server.repl_replica_ignore_maxmemory)) { + if (server.maxmemory > 0 && (!server.primary_host || !server.repl_replica_ignore_maxmemory)) { flags |= VALKEYMODULE_CTX_FLAGS_MAXMEMORY; if (server.maxmemory_policy != MAXMEMORY_NO_EVICTION) flags |= VALKEYMODULE_CTX_FLAGS_EVICT; @@ -3984,22 +3985,22 @@ int VM_GetContextFlags(ValkeyModuleCtx *ctx) { if (server.saveparamslen > 0) flags |= VALKEYMODULE_CTX_FLAGS_RDB; /* Replication flags */ - if (server.primary == NULL) { + if (server.primary_host == NULL) { flags |= VALKEYMODULE_CTX_FLAGS_PRIMARY; } else { flags |= VALKEYMODULE_CTX_FLAGS_REPLICA; if (server.repl_replica_ro) flags |= VALKEYMODULE_CTX_FLAGS_READONLY; /* Replica state flags. */ - if (server.primary->state == REPL_STATE_CONNECT || server.primary->state == REPL_STATE_CONNECTING) { + if (server.repl_state == REPL_STATE_CONNECT || server.repl_state == REPL_STATE_CONNECTING) { flags |= VALKEYMODULE_CTX_FLAGS_REPLICA_IS_CONNECTING; - } else if (server.primary->state == REPL_STATE_TRANSFER) { + } else if (server.repl_state == REPL_STATE_TRANSFER) { flags |= VALKEYMODULE_CTX_FLAGS_REPLICA_IS_TRANSFERRING; - } else if (server.primary->state == REPL_STATE_CONNECTED) { + } else if (server.repl_state == REPL_STATE_CONNECTED) { flags |= VALKEYMODULE_CTX_FLAGS_REPLICA_IS_ONLINE; } - if (server.primary->state != REPL_STATE_CONNECTED) flags |= VALKEYMODULE_CTX_FLAGS_REPLICA_IS_STALE; + if (server.repl_state != REPL_STATE_CONNECTED) flags |= VALKEYMODULE_CTX_FLAGS_REPLICA_IS_STALE; } /* OOM flag. */ @@ -6462,7 +6463,7 @@ ValkeyModuleCallReply *VM_Call(ValkeyModuleCtx *ctx, const char *cmdname, const goto cleanup; } - if (server.primary && server.repl_replica_ro && !obey_client) { + if (server.primary_host && server.repl_replica_ro && !obey_client) { errno = ESPIPE; if (error_as_call_replies) { sds msg = sdsdup(shared.roreplicaerr->ptr); @@ -6472,7 +6473,7 @@ ValkeyModuleCallReply *VM_Call(ValkeyModuleCtx *ctx, const char *cmdname, const } } - if (server.primary && server.primary->state != REPL_STATE_CONNECTED && server.repl_serve_stale_data == 0 && + if (server.primary_host && server.repl_state != REPL_STATE_CONNECTED && server.repl_serve_stale_data == 0 && !(cmd_flags & CMD_STALE)) { errno = ESPIPE; if (error_as_call_replies) { @@ -8782,7 +8783,7 @@ int VM_AddPostNotificationJob(ValkeyModuleCtx *ctx, ValkeyModulePostNotificationJobFunc callback, void *privdata, void (*free_privdata)(void *)) { - if (server.loading || (server.primary && server.repl_replica_ro)) { + if (server.loading || (server.primary_host && server.repl_replica_ro)) { return VALKEYMODULE_ERR; } ValkeyModulePostExecUnitJob *job = zmalloc(sizeof(*job)); @@ -13059,7 +13060,7 @@ int VM_RdbLoad(ValkeyModuleCtx *ctx, ValkeyModuleRdbStream *stream, int flags) { } /* Not allowed on replicas. 
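The new VALKEYMODULE_CTX_FLAGS_IMPORTING_SLOT context flag lets modules distinguish commands that arrive over a slot-migration link from ordinary client traffic or primary replication. A hypothetical module command handler (everything except the module API calls and the flag name is made up for illustration) might check it like this:

    int MyWrite_Command(ValkeyModuleCtx *ctx, ValkeyModuleString **argv, int argc) {
        VALKEYMODULE_NOT_USED(argv);
        VALKEYMODULE_NOT_USED(argc);
        int flags = ValkeyModule_GetContextFlags(ctx);
        if (flags & VALKEYMODULE_CTX_FLAGS_IMPORTING_SLOT) {
            /* This write is part of an in-flight slot import; e.g. skip side
             * effects that should only fire for organic client writes. */
        }
        return ValkeyModule_ReplyWithSimpleString(ctx, "OK");
    }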
*/ - if (server.primary != NULL) { + if (server.primary_host != NULL) { errno = ENOTSUP; return VALKEYMODULE_ERR; } diff --git a/src/networking.c b/src/networking.c index b9712d877a..5c31ac4562 100644 --- a/src/networking.c +++ b/src/networking.c @@ -288,9 +288,9 @@ int prepareClientToWrite(client *c) { * CLIENT_PUSHING handling: disables the reply silencing flags. */ if ((c->flag.reply_off || c->flag.reply_skip) && !c->flag.pushing) return C_ERR; - /* Primaries don't receive replies, unless CLIENT_PRIMARY_FORCE_REPLY flag + /* Replication sources don't receive replies, unless force reply flag * is set. */ - if (c->flag.replication_source && !c->flag.primary_force_reply) return C_ERR; + if ((c->flag.replication_source) && !c->flag.replication_force_reply) return C_ERR; /* Skip the fake client, such as the fake client for AOF loading. * But CLIENT_ID_CACHED_RESPONSE is allowed since it is a fake client @@ -581,7 +581,7 @@ void afterErrorReply(client *c, const char *s, size_t len, int flags) { * the commands sent by the primary. However it is useful to log such events since * they are rare and may hint at errors in a script or a bug in the server. */ int ctype = getClientType(c); - if (ctype == CLIENT_TYPE_PRIMARY || ctype == CLIENT_TYPE_REPLICA || c->id == CLIENT_ID_AOF) { + if (ctype == CLIENT_TYPE_PRIMARY || ctype == CLIENT_TYPE_REPLICA || c->id == CLIENT_ID_AOF || ctype == CLIENT_TYPE_SLOT_MIGRATION) { char *to, *from; if (c->id == CLIENT_ID_AOF) { @@ -590,9 +590,12 @@ void afterErrorReply(client *c, const char *s, size_t len, int flags) { } else if (ctype == CLIENT_TYPE_PRIMARY) { to = "primary"; from = "replica"; - } else { + } else if (ctype == CLIENT_TYPE_REPLICA) { to = "replica"; from = "primary"; + } else { + to = "slot-migration-source"; + from = "slot-migration-target"; } if (len > 4096) len = 4096; @@ -1668,7 +1671,7 @@ void freeClient(client *c) { * * Note that before doing this we make sure that the client is not in * some unexpected state, by checking its flags. */ - if (server.primary && server.primary->client == c) { + if (server.primary && c->flag.primary) { serverLog(LL_NOTICE, "Connection with primary lost."); if (!c->flag.dont_cache_primary && !(c->flag.protocol_error || c->flag.blocked)) { c->flag.close_asap = 0; @@ -1819,14 +1822,14 @@ void beforeNextClient(client *c) { /* Trim the query buffer to the current position. */ if (c->flag.replication_source) { - /* If the client is a primary, trim the querybuf to repl_applied, - * since primary client is very special, its querybuf not only + /* If the client is a replication source, trim the querybuf to repl_applied, + * since replication clients are very special, its querybuf not only * used to parse command, but also proxy to sub-replicas. * * Here are some scenarios we cannot trim to qb_pos: - * 1. we don't receive complete command from primary - * 2. primary client blocked cause of client pause - * 3. io threads operate read, primary client flagged with CLIENT_PENDING_COMMAND + * 1. we don't receive complete command from replication + * 2. replication client blocked cause of client pause + * 3. 
io threads operate read, replication client flagged with CLIENT_PENDING_COMMAND * * In these scenarios, qb_pos points to the part of the current command * or the beginning of next command, and the current command is not applied yet, @@ -2144,7 +2147,7 @@ int postWriteToClient(client *c) { } if (c->nwritten > 0) { c->net_output_bytes += c->nwritten; - /* For clients representing primaries we don't count sending data + /* For clients representing replication sources we don't count sending data * as an interaction, since we always send REPLCONF ACK commands * that take some time to just fill the socket output buffer. * We just rely on data / pings received for timeout detection. */ @@ -2238,7 +2241,11 @@ int handleReadResult(client *c) { c->net_input_bytes += c->nread; if (c->flag.replication_source) { c->repl_data->read_reploff += c->nread; - server.stat_net_repl_input_bytes += c->nread; + if (c->flag.primary) { + server.stat_net_repl_input_bytes += c->nread; + } else if (c->flag.slot_migration_source) { + server.stat_net_slot_migration_input_bytes += c->nread; + } } else { server.stat_net_input_bytes += c->nread; } @@ -2281,7 +2288,7 @@ void handleParseError(client *c) { } else if (flags & READ_FLAGS_ERROR_UNBALANCED_QUOTES) { addReplyError(c, "Protocol error: unbalanced quotes in request"); setProtocolError("unbalanced quotes in inline request", c); - } else if (flags & READ_FLAGS_ERROR_UNEXPECTED_INLINE_FROM_PRIMARY) { + } else if (flags & READ_FLAGS_ERROR_UNEXPECTED_INLINE_FROM_REPLICATION_SOURCE) { serverLog(LL_WARNING, "WARNING: Receiving inline protocol from primary, primary stream corruption? Closing the " "primary connection and discarding the cached primary."); setProtocolError("Master using the inline protocol. Desync?", c); @@ -2295,7 +2302,7 @@ int isParsingError(client *c) { READ_FLAGS_ERROR_INVALID_MULTIBULK_LEN | READ_FLAGS_ERROR_UNAUTHENTICATED_MULTIBULK_LEN | READ_FLAGS_ERROR_UNAUTHENTICATED_BULK_LEN | READ_FLAGS_ERROR_MBULK_INVALID_BULK_LEN | READ_FLAGS_ERROR_BIG_BULK_COUNT | READ_FLAGS_ERROR_MBULK_UNEXPECTED_CHARACTER | - READ_FLAGS_ERROR_UNEXPECTED_INLINE_FROM_PRIMARY | READ_FLAGS_ERROR_UNBALANCED_QUOTES); + READ_FLAGS_ERROR_UNEXPECTED_INLINE_FROM_REPLICATION_SOURCE | READ_FLAGS_ERROR_UNBALANCED_QUOTES); } /* This function is called after the query-buffer was parsed. @@ -2556,7 +2563,7 @@ void processInlineBuffer(client *c) { int argc, j, linefeed_chars = 1; sds *argv, aux; size_t querylen; - int is_primary = c->read_flags & READ_FLAGS_PRIMARY; + int is_replication_source = c->read_flags & READ_FLAGS_REPLICATION_SOURCE; /* Search for end of line */ newline = strchr(c->querybuf + c->qb_pos, '\n'); @@ -2593,9 +2600,9 @@ void processInlineBuffer(client *c) { * * However there is an exception: primaries may send us just a newline * to keep the connection active. 
*/ - if (querylen != 0 && is_primary) { + if (querylen != 0 && is_replication_source) { sdsfreesplitres(argv, argc); - c->read_flags |= READ_FLAGS_ERROR_UNEXPECTED_INLINE_FROM_PRIMARY; + c->read_flags |= READ_FLAGS_ERROR_UNEXPECTED_INLINE_FROM_REPLICATION_SOURCE; return; } @@ -2683,7 +2690,7 @@ void processMultibulkBuffer(client *c) { char *newline = NULL; int ok; long long ll; - int is_primary = c->read_flags & READ_FLAGS_PRIMARY; + int is_replication_source = c->read_flags & READ_FLAGS_REPLICATION_SOURCE; int auth_required = c->read_flags & READ_FLAGS_AUTH_REQUIRED; if (c->multibulklen == 0) { @@ -2787,7 +2794,7 @@ void processMultibulkBuffer(client *c) { size_t bulklen_slen = newline - (c->querybuf + c->qb_pos + 1); ok = string2ll(c->querybuf + c->qb_pos + 1, bulklen_slen, &ll); - if (!ok || ll < 0 || (!(is_primary) && ll > server.proto_max_bulk_len)) { + if (!ok || ll < 0 || (!(is_replication_source) && ll > server.proto_max_bulk_len)) { c->read_flags |= READ_FLAGS_ERROR_MBULK_INVALID_BULK_LEN; return; } else if (ll > 16384 && auth_required) { @@ -2796,7 +2803,7 @@ void processMultibulkBuffer(client *c) { } c->qb_pos = newline - c->querybuf + 2; - if (!(is_primary) && ll >= PROTO_MBULK_BIG_ARG) { + if (!(is_replication_source) && ll >= PROTO_MBULK_BIG_ARG) { /* When the client is not a primary client (because primary * client's querybuf can only be trimmed after data applied * and sent to replicas). @@ -2845,7 +2852,7 @@ void processMultibulkBuffer(client *c) { /* Optimization: if a non-primary client's buffer contains JUST our bulk element * instead of creating a new object by *copying* the sds we * just use the current sds string. */ - if (!is_primary && c->qb_pos == 0 && c->bulklen >= PROTO_MBULK_BIG_ARG && + if (!is_replication_source && c->qb_pos == 0 && c->bulklen >= PROTO_MBULK_BIG_ARG && sdslen(c->querybuf) == (size_t)(c->bulklen + 2)) { c->argv[c->argc++] = createObject(OBJ_STRING, c->querybuf); c->argv_len_sum += c->bulklen; @@ -2895,15 +2902,15 @@ void commandProcessed(client *c) { if (!c->repl_data) return; long long prev_offset = c->repl_data->reploff; - if (c->flag.replication_source && !c->flag.multi) { - /* Update the applied replication offset of our primary. */ + if (!c->flag.multi && c->flag.replication_source) { + /* Update the applied replication offset of our source. */ c->repl_data->reploff = c->repl_data->read_reploff - sdslen(c->querybuf) + c->qb_pos; } - /* If the client is a primary we need to compute the difference + /* If the client is a replication source we need to compute the difference * between the applied offset before and after processing the buffer, * to understand how much of the replication stream was actually - * applied to the primary state: this quantity, and its corresponding + * applied to the replication state: this quantity, and its corresponding * part of the replication stream, will be propagated to the * sub-replicas and to the replication backlog. */ if (c->flag.replication_source) { @@ -3010,7 +3017,7 @@ int canParseCommand(client *c) { * commands to execute in c->argv. */ if (c->flag.pending_command) return 0; - /* Don't process input from the primary while there is a busy script + /* Don't process input from replication while there is a busy script * condition on the replica. We want just to accumulate the replication * stream (instead of replying -BUSY like we do with other clients) and * later resume the processing. */ @@ -3033,7 +3040,7 @@ int processInputBuffer(client *c) { break; } - c->read_flags = c->flag.replication_source ? 
READ_FLAGS_PRIMARY : 0; + c->read_flags = c->flag.replication_source ? READ_FLAGS_REPLICATION_SOURCE : 0; c->read_flags |= authRequired(c) ? READ_FLAGS_AUTH_REQUIRED : 0; parseCommand(c); @@ -3076,7 +3083,7 @@ void readToQueryBuf(client *c) { /* If the replica RDB client is marked as closed ASAP, do not try to read from it */ if (c->flag.close_asap) return; - int is_primary = c->read_flags & READ_FLAGS_PRIMARY; + int is_replication_source = c->read_flags & READ_FLAGS_REPLICATION_SOURCE; readlen = PROTO_IOBUF_LEN; qblen = c->querybuf ? sdslen(c->querybuf) : 0; @@ -3110,7 +3117,7 @@ void readToQueryBuf(client *c) { * Although we have ensured that c->querybuf will not be expanded in the current * thread_shared_qb, we still add this check for code robustness. */ int use_thread_shared_qb = (c->querybuf == thread_shared_qb) ? 1 : 0; - if (!is_primary && // primary client's querybuf can grow greedy. + if (!is_replication_source && /* replication client's querybuf can grow greedy. */ (big_arg || sdsalloc(c->querybuf) < PROTO_IOBUF_LEN)) { /* When reading a BIG_ARG we won't be reading more than that one arg * into the query buffer, so we don't need to pre-allocate more than we @@ -3137,7 +3144,7 @@ void readToQueryBuf(client *c) { sdsIncrLen(c->querybuf, c->nread); qblen = sdslen(c->querybuf); if (c->querybuf_peak < qblen) c->querybuf_peak = qblen; - if (!is_primary) { + if (!is_replication_source) { /* The commands cached in the MULTI/EXEC queue have not been executed yet, * so they are also considered a part of the query buffer in a broader sense. * @@ -3240,7 +3247,7 @@ sds catClientInfoString(sds s, client *client, int hide_user_data) { *p++ = 'S'; } - if (client->flag.replication_source) *p++ = 'M'; + if (client->flag.primary) *p++ = 'M'; if (client->flag.pubsub) *p++ = 'P'; if (client->flag.multi) *p++ = 'x'; if (client->flag.blocked) *p++ = 'b'; @@ -4132,7 +4139,7 @@ void helloCommand(client *c) { if (!server.sentinel_mode) { addReplyBulkCString(c, "role"); - addReplyBulkCString(c, server.primary ? "replica" : "master"); + addReplyBulkCString(c, server.primary_host ? "replica" : "master"); } addReplyBulkCString(c, "modules"); @@ -4361,13 +4368,15 @@ size_t getClientMemoryUsage(client *c, size_t *output_buffer_mem_usage) { * CLIENT_TYPE_REPLICA -> replica * CLIENT_TYPE_PUBSUB -> Client subscribed to Pub/Sub channels * CLIENT_TYPE_PRIMARY -> The client representing our replication primary. + * CLIENT_TYPE_SLOT_MIGRATION -> The client representing a slot migration. */ int getClientType(client *c) { - if (c->flag.replication_source) return CLIENT_TYPE_PRIMARY; + if (c->flag.primary) return CLIENT_TYPE_PRIMARY; /* Even though MONITOR clients are marked as replicas, we * want the expose them as normal clients. 
*/ if (c->flag.replica && !c->flag.monitor) return CLIENT_TYPE_REPLICA; if (c->flag.pubsub) return CLIENT_TYPE_PUBSUB; + if (c->flag.slot_migration_source) return CLIENT_TYPE_SLOT_MIGRATION; return CLIENT_TYPE_NORMAL; } @@ -4382,6 +4391,8 @@ int getClientTypeByName(char *name) { return CLIENT_TYPE_PUBSUB; else if (!strcasecmp(name, "master") || !strcasecmp(name, "primary")) return CLIENT_TYPE_PRIMARY; + else if (!strcasecmp(name, "slot-migration")) + return CLIENT_TYPE_SLOT_MIGRATION; else return -1; } @@ -4392,6 +4403,7 @@ char *getClientTypeName(int class) { case CLIENT_TYPE_REPLICA: return "slave"; case CLIENT_TYPE_PUBSUB: return "pubsub"; case CLIENT_TYPE_PRIMARY: return "master"; + case CLIENT_TYPE_SLOT_MIGRATION: return "slot-migration"; default: return NULL; } } @@ -4407,9 +4419,9 @@ int checkClientOutputBufferLimits(client *c) { unsigned long used_mem = getClientOutputBufferMemoryUsage(c); class = getClientType(c); - /* For the purpose of output buffer limiting, primaries are handled - * like normal clients. */ - if (class == CLIENT_TYPE_PRIMARY) class = CLIENT_TYPE_NORMAL; + /* For the purpose of output buffer limiting, primaries and slot migrations + * are handled like normal clients. */ + if (class == CLIENT_TYPE_PRIMARY || class == CLIENT_TYPE_SLOT_MIGRATION) class = CLIENT_TYPE_NORMAL; /* Note that it doesn't make sense to set the replica clients output buffer * limit lower than the repl-backlog-size config (partial sync will succeed @@ -4892,7 +4904,7 @@ void ioThreadReadQueryFromClient(void *data) { done: /* Only trim query buffer for non-primary clients * Primary client's buffer is handled by main thread using repl_applied position */ - if (!(c->read_flags & READ_FLAGS_PRIMARY)) { + if (!(c->read_flags & READ_FLAGS_REPLICATION_SOURCE)) { trimClientQueryBuffer(c); } atomic_thread_fence(memory_order_release); diff --git a/src/object.c b/src/object.c index b8200dd815..a9c701964a 100644 --- a/src/object.c +++ b/src/object.c @@ -1337,7 +1337,8 @@ struct serverMemOverhead *getMemoryOverheadData(void) { * updateClientMemoryUsage(). */ mh->clients_normal = server.stat_clients_type_memory[CLIENT_TYPE_PRIMARY] + server.stat_clients_type_memory[CLIENT_TYPE_PUBSUB] + - server.stat_clients_type_memory[CLIENT_TYPE_NORMAL]; + server.stat_clients_type_memory[CLIENT_TYPE_NORMAL] + + server.stat_clients_type_memory[CLIENT_TYPE_SLOT_MIGRATION]; mem_total += mh->clients_normal; mh->cluster_links = server.stat_cluster_links_memory; diff --git a/src/rdb.c b/src/rdb.c index 4a3a7e1c8e..7bb9edf31f 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -1869,7 +1869,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { int deep_integrity_validation = server.sanitize_dump_payload == SANITIZE_DUMP_YES; if (server.sanitize_dump_payload == SANITIZE_DUMP_CLIENTS) { /* Skip sanitization when loading (an RDB), or getting a RESTORE command - * from either the primary or a client using an ACL user with the skip-sanitize-payload flag. */ + * from either a replication source or a client using an ACL user with the skip-sanitize-payload flag. 
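Note on the new client class above: because CLIENT_TYPE_SLOT_MIGRATION is wired into getClientTypeByName() and getClientTypeName(), a slot-migration link should become addressable by its type name wherever those lookups are used. The CLIENT command plumbing is not part of this patch, so the following is an assumed usage example rather than something the diff demonstrates:

    CLIENT LIST TYPE slot-migration
    CLIENT KILL TYPE slot-migration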
*/ int skip = server.loading || (server.current_client && (server.current_client->flag.replication_source)); if (!skip && server.current_client && server.current_client->user) skip = !!(server.current_client->user->flags & USER_FLAG_SANITIZE_PAYLOAD_SKIP); @@ -2935,12 +2935,12 @@ void rdbLoadProgressCallback(rio *r, const void *buf, size_t len) { if (server.loading_process_events_interval_bytes && (r->processed_bytes + len) / server.loading_process_events_interval_bytes > r->processed_bytes / server.loading_process_events_interval_bytes) { - replicationSendNewlineToConnectedLinks(); + if (server.primary_host && server.repl_state == REPL_STATE_TRANSFER) replicationSendNewlineToPrimary(); loadingAbsProgress(r->processed_bytes); processEventsWhileBlocked(); processModuleLoadingProgressEvent(0); } - if (server.primary && server.primary->state == REPL_STATE_TRANSFER && rioCheckType(r) == RIO_TYPE_CONN) { + if (server.repl_state == REPL_STATE_TRANSFER && rioCheckType(r) == RIO_TYPE_CONN) { server.stat_net_repl_input_bytes += len; } } @@ -3624,8 +3624,8 @@ int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, slotBitmap slot_bitmap) retval = rewriteAppendOnlyFileRio(&rdb, slot_bitmap); rioWrite(&rdb, "*3\r\n", 4); rioWriteBulkString(&rdb, "REPLCONF", 8); - rioWriteBulkString(&rdb, "SYNC-PAYLOAD-END", 17); - rioWriteBulkLongLong(&rdb, rsi->repl_stream_db); + rioWriteBulkString(&rdb, "AOF-PAYLOAD-END", 15); + rioWriteBulkLongLong(&rdb, server.primary_repl_offset); } else { retval = rdbSaveRioWithEOFMark(req, &rdb, NULL, rsi); } @@ -3791,7 +3791,7 @@ rdbSaveInfo *rdbPopulateSaveInfo(rdbSaveInfo *rsi) { * connects to us, the NULL repl_backlog will trigger a full * synchronization, at the same time we will use a new replid and clear * replid2. */ - if (!server.primary && server.repl_backlog) { + if (!server.primary_host && server.repl_backlog) { /* Note that when server.replicas_eldb is -1, it means that this primary * didn't apply any write commands after a full synchronization. * So we can let repl_stream_db be 0, this allows a restarted replica @@ -3804,7 +3804,7 @@ rdbSaveInfo *rdbPopulateSaveInfo(rdbSaveInfo *rsi) { /* If the instance is a replica we need a connected primary * in order to fetch the currently selected DB. 
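For reference, the trailer that rdbSaveToReplicasSockets() now appends to an AOF-formatted payload is a plain RESP array, so the very end of the stream looks like this on the wire (the offset 1024 and its bulk length are illustrative values, not taken from the patch):

    *3\r\n
    $8\r\nREPLCONF\r\n
    $15\r\nAOF-PAYLOAD-END\r\n
    $4\r\n1024\r\n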
*/ if (server.primary) { - rsi->repl_stream_db = server.primary->client->db->id; + rsi->repl_stream_db = server.primary->db->id; return rsi; } diff --git a/src/replication.c b/src/replication.c index c50cebecfd..90a9e90e24 100644 --- a/src/replication.c +++ b/src/replication.c @@ -47,17 +47,15 @@ #include void replicationDiscardCachedPrimary(void); -void replicationResurrectCachedPrimary(replicationLink *link); -void replicationResurrectProvisionalSource(replicationLink *link); -void replicationSendAck(replicationLink *link); +void replicationResurrectCachedPrimary(connection *conn); +void replicationResurrectProvisionalPrimary(void); int replicaPutOnline(client *replica); void replicaStartCommandStream(client *replica); -int cancelReplicationHandshake(replicationLink *link, int reconnect); -void replicationSteadyStateInit(replicationLink *link); +int cancelReplicationHandshake(int reconnect); +void replicationSteadyStateInit(void); void dualChannelSetupMainConnForPsync(connection *conn); -int dualChannelSyncHandleRdbLoadCompletion(replicationLink *link); -static void dualChannelFullSyncWithReplicationSource(connection *conn); -void replicationFinishSyncPayload(connection *conn, replicationLink *link, int db); +void dualChannelSyncHandleRdbLoadCompletion(void); +static void dualChannelFullSyncWithPrimary(connection *conn); /* We take a global flag to remember if this instance generated an RDB * because of replication, so that we can remove the RDB file in case @@ -538,7 +536,7 @@ void replicationFeedReplicas(int dictid, robj **argv, int argc) { * propagate *identical* replication stream. In this way this replica can * advertise the same replication ID as the primary (since it shares the * primary replication history and has the same backlog and offsets). */ - if (server.primary != NULL) return; + if (server.primary_host != NULL) return; /* If there aren't replicas, and there is no backlog buffer to populate, * we can return ASAP. */ @@ -962,11 +960,11 @@ int startBgsaveForReplication(int mincapa, int req, slotBitmap slot_bitmap) { /* We use a socket target if replica can handle the EOF marker and we're configured to do diskless syncs. * Note that in case we're creating a "filtered" RDB (functions-only, for example) we also force socket replication * to avoid overwriting the snapshot RDB file with filtered data. */ - socket_target = (server.repl_diskless_sync || req & REPLICA_REQ_RDB_MASK) && (mincapa & REPLICA_CAPA_EOF); + socket_target = (server.repl_diskless_sync || req & REPLICA_REQ_RDB_MASK) && (mincapa & REPLICA_CAPA_EOF || req & REPLICA_REQ_AOF_FORMAT); /* `SYNC` should have failed with error if we don't support socket and require a filter, assert this here */ serverAssert(socket_target || !(req & REPLICA_REQ_RDB_MASK)); - serverLog(LL_NOTICE, "Starting BGSAVE for SYNC with target: %s using: %s with format %s", + serverLog(LL_NOTICE, "Starting BGSAVE for SYNC with target: %s using: %s with format: %s", socket_target ? "replicas sockets" : "disk", (req & REPLICA_REQ_RDB_CHANNEL) ? "dual-channel" : "normal sync", (req & REPLICA_REQ_AOF_FORMAT) ? "AOF" : "RDB"); @@ -1048,7 +1046,7 @@ void syncCommand(client *c) { * become a primary if so. 
*/ if (c->argc > 3 && !strcasecmp(c->argv[0]->ptr, "psync") && !strcasecmp(c->argv[3]->ptr, "failover")) { serverLog(LL_NOTICE, "Failover request received for replid %s.", (unsigned char *)c->argv[1]->ptr); - if (server.primary == NULL) { + if (!server.primary_host) { addReplyError(c, "PSYNC FAILOVER can't be sent to a master."); return; } @@ -1076,7 +1074,7 @@ void syncCommand(client *c) { /* Refuse SYNC requests if we are a replica but the link with our primary * is not ok... */ - if (server.primary && server.primary->state != REPL_STATE_CONNECTED) { + if (server.primary_host && server.repl_state != REPL_STATE_CONNECTED) { addReplyError(c, "-NOMASTERLINK Can't SYNC while not connected with my master"); return; } @@ -1098,12 +1096,6 @@ void syncCommand(client *c) { return; } - /* Fail sync if it is asking for AOF format and a slot is not set via REPLCONF already. */ - if (c->repl_data->replica_req & REPLICA_REQ_AOF_FORMAT && isSlotBitmapAllSlots(c->repl_data->slot_bitmap)) { - addReplyError(c, "AOF format is only supported for single slot SYNC"); - return; - } - serverLog(LL_NOTICE, "Replica %s asks for synchronization", replicationGetReplicaName(c)); /* Try a partial resynchronization if this is a PSYNC command. @@ -1179,11 +1171,8 @@ void syncCommand(client *c) { server.replid, server.replid2); } - /* For slot level replication, we make no attempt to coallesce BGSAVEs */ - int require_dedicated = !isSlotBitmapAllSlots(c->repl_data->slot_bitmap); - /* CASE 1: BGSAVE is in progress, with disk target. */ - if (!require_dedicated && server.child_type == CHILD_TYPE_RDB && server.rdb_child_type == RDB_CHILD_TYPE_DISK) { + if (server.child_type == CHILD_TYPE_RDB && server.rdb_child_type == RDB_CHILD_TYPE_DISK) { /* Ok a background save is in progress. Let's check if it is a good * one for replication, i.e. if there is another replica that is * registering differences since the server forked to save. */ @@ -1204,7 +1193,7 @@ void syncCommand(client *c) { * capabilities of the replica that triggered the current BGSAVE * and its exact requirements. */ if (ln && ((c->repl_data->replica_capa & replica->repl_data->replica_capa) == replica->repl_data->replica_capa) && - c->repl_data->replica_req == replica->repl_data->replica_req) { + c->repl_data->replica_req == replica->repl_data->replica_req && isSlotBitmapEmpty(c->repl_data->slot_bitmap)) { /* Perfect, the server is already registering differences for * another replica. Set the right state, and copy the buffer. * We don't copy buffer if clients don't want. */ @@ -1216,35 +1205,32 @@ void syncCommand(client *c) { * register differences. */ serverLog(LL_NOTICE, "Can't attach the replica to the current BGSAVE. Waiting for next BGSAVE for SYNC"); } - } - /* CASE 2: BGSAVE is in progress, with socket target. */ - if (server.child_type == CHILD_TYPE_RDB && server.rdb_child_type == RDB_CHILD_TYPE_SOCKET) { + /* CASE 2: BGSAVE is in progress, with socket target. */ + } else if (server.child_type == CHILD_TYPE_RDB && server.rdb_child_type == RDB_CHILD_TYPE_SOCKET) { /* There is an RDB child process but it is writing directly to * children sockets. We need to wait for the next BGSAVE * in order to synchronize. */ serverLog(LL_NOTICE, "Current BGSAVE has socket target. Waiting for next BGSAVE for SYNC"); - return; - } - /* CASE 3: There is no BGSAVE is in progress, but we need to delay. 
*/ - if (!require_dedicated && server.repl_diskless_sync && (c->repl_data->replica_capa & REPLICA_CAPA_EOF) && server.repl_diskless_sync_delay) { - /* Diskless replication RDB child is created inside - * replicationCron() since we want to delay its start a - * few seconds to wait for more replicas to arrive. */ - serverLog(LL_NOTICE, "Delay next BGSAVE for diskless SYNC"); - return; - } - - /* CASE 4: We don't have a BGSAVE in progress, but there is an existing child process. */ - if (hasActiveChildProcess()) { - serverLog(LL_NOTICE, "No BGSAVE in progress, but another BG operation is active. " - "BGSAVE for replication delayed"); - return; + /* CASE 3: There is no BGSAVE is in progress. */ + } else { + if (server.repl_diskless_sync && (c->repl_data->replica_capa & REPLICA_CAPA_EOF) && server.repl_diskless_sync_delay && isSlotBitmapEmpty(c->repl_data->slot_bitmap)) { + /* Diskless replication RDB child is created inside + * replicationCron() since we want to delay its start a + * few seconds to wait for more replicas to arrive. */ + serverLog(LL_NOTICE, "Delay next BGSAVE for diskless SYNC"); + } else { + /* We don't have a BGSAVE in progress, let's start one. Diskless + * or disk-based mode is determined by replica's capacity. */ + if (!hasActiveChildProcess()) { + startBgsaveForReplication(c->repl_data->replica_capa, c->repl_data->replica_req, c->repl_data->slot_bitmap); + } else { + serverLog(LL_NOTICE, "No BGSAVE in progress, but another BG operation is active. " + "BGSAVE for replication delayed"); + } + } } - - /* CASE 5: We are good to start a BGSAVE. Diskless or disk-based mode is determined by replica's capacity. */ - startBgsaveForReplication(c->repl_data->replica_capa, c->repl_data->replica_req, c->repl_data->slot_bitmap); return; } @@ -1268,7 +1254,7 @@ int anyOtherReplicaWaitRdb(client *except_me) { void initClientReplicationData(client *c) { if (c->repl_data) return; c->repl_data = (ClientReplicationData *)zcalloc(sizeof(ClientReplicationData)); - slotBitmapSetAll(c->repl_data->slot_bitmap); + memset(c->repl_data->slot_bitmap, 0, sizeof(c->repl_data->slot_bitmap)); } void freeClientReplicationData(client *c) { @@ -1309,9 +1295,7 @@ void freeClientReplicationData(client *c) { moduleFireServerEvent(VALKEYMODULE_EVENT_REPLICA_CHANGE, VALKEYMODULE_SUBEVENT_REPLICA_CHANGE_OFFLINE, NULL); } - if (c->flag.replication_source) { - replicationHandleSourceDisconnection(c->repl_data->link); - } + if (c->flag.primary) replicationHandlePrimaryDisconnection(); sdsfree(c->repl_data->replica_addr); zfree(c->repl_data); c->repl_data = NULL; @@ -1437,7 +1421,7 @@ void replconfCommand(client *c) { } else if (!strcasecmp(c->argv[j]->ptr, "getack")) { /* REPLCONF GETACK is used in order to request an ACK ASAP * to the replica. 
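A note on the slot_bitmap default changed above: initClientReplicationData() now zeroes the bitmap, so an all-zero bitmap means "no slot filter" (an ordinary replica), which is why the BGSAVE-reuse and diskless-delay branches now require isSlotBitmapEmpty(). The helper itself is not shown in this patch; a minimal sketch of what such checks could look like, assuming slotBitmap is a CLUSTER_SLOTS/8-byte array (names suffixed with Sketch are illustrative, not the patch's implementation):

    #include <string.h>

    #define CLUSTER_SLOTS 16384
    typedef unsigned char slotBitmap[CLUSTER_SLOTS / 8];

    /* True when no slot bit is set, i.e. the client asked for a regular
     * (unfiltered) sync rather than a per-slot one. */
    static int isSlotBitmapEmptySketch(const slotBitmap bitmap) {
        static const slotBitmap zeroes; /* zero-initialized */
        return memcmp(bitmap, zeroes, sizeof(slotBitmap)) == 0;
    }

    /* True when every slot bit is set. */
    static int isSlotBitmapAllSlotsSketch(const slotBitmap bitmap) {
        for (size_t i = 0; i < sizeof(slotBitmap); i++)
            if (bitmap[i] != 0xff) return 0;
        return 1;
    }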
*/ - if (server.primary && server.primary->client) replicationSendAck(server.primary); + if (server.primary_host && server.primary) replicationSendAck(server.primary); return; } else if (!strcasecmp(c->argv[j]->ptr, "rdb-only")) { /* REPLCONF RDB-ONLY is used to identify the client only wants @@ -1513,10 +1497,7 @@ void replconfCommand(client *c) { if (!server.cluster_enabled) { addReplyError(c, "Cannot replicate a slot when cluster mode is disabled"); } - if (!isSlotBitmapAllSlots(c->repl_data->slot_bitmap)) { - addReplyError(c, "Slot bitmap already set"); - } - if (stringObjectLen(c->argv[j + 1]) != CLUSTER_SLOTS / 8) { + if (stringObjectLen(c->argv[j + 1]) != sizeof(slotBitmap)) { addReplyError(c, "Invalid slot bitmap length"); return; } @@ -1526,24 +1507,26 @@ void replconfCommand(client *c) { return; } } - memcpy(c->repl_data->slot_bitmap, c->argv[j + 1]->ptr, CLUSTER_SLOTS / 8); + memcpy(c->repl_data->slot_bitmap, c->argv[j + 1]->ptr, sizeof(slotBitmap)); /* For now, we only support AOF for slot transfer. */ c->repl_data->replica_req |= REPLICA_REQ_AOF_FORMAT; - } else if (!strcasecmp(c->argv[j]->ptr, "sync-payload-end")) { - /* REPLCONF sync-payload-end is used to inform the replica - * that the primary has finished sending the sync snapshot, and - * that it is free to begin processing the replication backlog. - * - * dbnum specifies which db to stream the backlog into. */ - int db_num = 0; - if (getIntFromObjectOrReply(c, c->argv[j + 1], &db_num, NULL) != C_OK || db_num < 0 || db_num >= server.dbnum) { - addReplyError(c, "Unable to parse DB number"); + } else if (!strcasecmp(c->argv[j]->ptr, "aof-payload-end")) { + /* REPLCONF aof-payload-end is used to inform the target + * that the replication source has finished sending the AOF formatted + * sync snapshot, and that it is free to begin processing the + * replication backlog. */ + long long initial_offset = 0; + if (getLongLongFromObjectOrReply(c, c->argv[j + 1], &initial_offset, NULL) != C_OK) { return; } - serverLog(LL_NOTICE, "Got sync-payload-end for db %d", db_num); - - replicationFinishSyncPayload(c->conn, c->repl_data->link, db_num); + if (c->flag.slot_migration_source) { + clusterSlotMigrationDoneSyncing(initial_offset); + return; + } + /* Right now, we only support this for slot migration. */ + addReplyErrorFormat(c, "AOF sync is not in progress."); + return; } else { addReplyErrorFormat(c, "Unrecognized REPLCONF option: %s", (char *)c->argv[j]->ptr); return; @@ -1985,30 +1968,13 @@ void shiftReplicationId(void) { /* ----------------------------------- REPLICA -------------------------------- */ -char *replicationGetNameForLogs(replicationLink *link) { - if (link == server.primary) - return "PRIMARY"; - if (!isSlotBitmapAllSlots(link->slot_bitmap)) - return "SLOT OWNER"; - return "OTHER REPLICATION SOURCE"; -} - /* Returns 1 if the given replication state is a handshake state, * 0 otherwise. */ -int replicaIsInHandshakeState(replicationLink *link) { - return link->state >= REPL_STATE_RECEIVE_PING_REPLY && link->state <= REPL_STATE_RECEIVE_PSYNC_REPLY; +int replicaIsInHandshakeState(void) { + return server.repl_state >= REPL_STATE_RECEIVE_PING_REPLY && server.repl_state <= REPL_STATE_RECEIVE_PSYNC_REPLY; } -void replicationSendNewlineOnLink(replicationLink *link) { - static time_t newline_sent; - if (time(NULL) != newline_sent) { - newline_sent = time(NULL); - /* Pinging back in this stage is best-effort. 
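Putting the two REPLCONF additions together, the slot-migration handshake sketched by this patch looks roughly as follows; the ordering around SYNC is inferred from the rdb.c and syncCommand changes rather than spelled out in one place:

    target -> source   REPLCONF slot-bitmap <CLUSTER_SLOTS/8-byte bitmap>   (forces the AOF-format payload)
    target -> source   SYNC
    source -> target   AOF-formatted stream covering the requested slots
    source -> target   REPLCONF AOF-PAYLOAD-END <repl-offset>               (in-band, terminates the snapshot)
    target             clusterSlotMigrationDoneSyncing(<repl-offset>), then consumes the live replication stream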
*/ - if (link->transfer_s) connWrite(link->transfer_s, "\n", 1); - } -} - -/* Avoid the primary to detect replicas as timing out while loading the +/* Avoid the primary to detect the replica is timing out while loading the * RDB file in initial synchronization. We send a single newline character * that is valid protocol but is guaranteed to either be sent entirely or * not, since the byte is indivisible. @@ -2016,15 +1982,12 @@ void replicationSendNewlineOnLink(replicationLink *link) { * The function is called in two contexts: while we flush the current * data with emptyData(), and while we load the new data received as an * RDB file from the primary. */ -void replicationSendNewlineToConnectedLinks() { - listIter li; - listNode *ln; - listRewind(server.replication_links, &li); - while ((ln = listNext(&li))) { - replicationLink *link = (replicationLink *)ln->value; - if (link->state >= REPL_STATE_CONNECTING && link->state <= REPL_STATE_CANCELLED) { - replicationSendNewlineOnLink(link); - } +void replicationSendNewlineToPrimary(void) { + static time_t newline_sent; + if (time(NULL) != newline_sent) { + newline_sent = time(NULL); + /* Pinging back in this stage is best-effort. */ + if (server.repl_transfer_s) connWrite(server.repl_transfer_s, "\n", 1); } } @@ -2033,17 +1996,15 @@ void replicationSendNewlineToConnectedLinks() { * after loading succeeded or failed. */ void replicationEmptyDbCallback(hashtable *d) { UNUSED(d); - replicationSendNewlineToConnectedLinks(); + if (server.repl_state == REPL_STATE_TRANSFER) replicationSendNewlineToPrimary(); } /* Once we have a link with the primary and the synchronization was * performed, this function materializes the primary client we store * at server.primary, starting from the specified file descriptor. */ -client *createReplicationLinkClientWithHandler(replicationLink *link, connection *conn, int dbid, ConnectionCallbackFunc handler) { - client *c = createClient(conn); - if (conn) { - connSetReadHandler(conn, handler); - } +void replicationCreatePrimaryClientWithHandler(connection *conn, int dbid, ConnectionCallbackFunc handler) { + server.primary = createClient(conn); + if (conn) connSetReadHandler(server.primary->conn, handler); /** * Important note: @@ -2056,33 +2017,28 @@ client *createReplicationLinkClientWithHandler(replicationLink *link, connection * to pass the execution to a background thread and unblock after the * execution is done. This is the reason why we allow blocking the replication * connection. */ - c->flag.replication_source = 1; - c->flag.authenticated = 1; - - - /* Allocate a private query buffer for the replication link client instead of using the shared query buffer. - * This is done because the replication link's query buffer data needs to be preserved for my sub-replicas to use. */ - c->querybuf = sdsempty(); - initClientReplicationData(c); - c->repl_data->reploff = link->initial_offset; - c->repl_data->read_reploff = c->repl_data->reploff; - c->user = NULL; /* This client can do everything. */ - c->repl_data->link = link; - memcpy(c->repl_data->replid, link->replid, sizeof(link->replid)); - + server.primary->flag.primary = 1; + server.primary->flag.authenticated = 1; + server.primary->flag.replication_source = 1; + + /* Allocate a private query buffer for the primary client instead of using the shared query buffer. + * This is done because the primary's query buffer data needs to be preserved for my sub-replicas to use. 
*/ + server.primary->querybuf = sdsempty(); + initClientReplicationData(server.primary); + server.primary->repl_data->reploff = server.primary_initial_offset; + server.primary->repl_data->read_reploff = server.primary->repl_data->reploff; + server.primary->user = NULL; /* This client can do everything. */ + memcpy(server.primary->repl_data->replid, server.primary_replid, sizeof(server.primary_replid)); /* If primary offset is set to -1, this primary is old and is not * PSYNC capable, so we flag it accordingly. */ - if (c->repl_data->reploff == -1) c->flag.pre_psync = 1; - if (dbid != -1) selectDb(c, dbid); - memcpy(c->repl_data->slot_bitmap, link->slot_bitmap, sizeof(slotBitmap)); - - return c; + if (server.primary->repl_data->reploff == -1) server.primary->flag.pre_psync = 1; + if (dbid != -1) selectDb(server.primary, dbid); } /* Wrapper for replicationCreatePrimaryClientWithHandler, init primary connection handler * with ordinary client connection handler */ -client *createReplicationLinkClient(replicationLink *link, connection *conn, int dbid) { - return createReplicationLinkClientWithHandler(link, conn, dbid, readQueryFromClient); +void replicationCreatePrimaryClient(connection *conn, int dbid) { + replicationCreatePrimaryClientWithHandler(conn, dbid, readQueryFromClient); } /* This function will try to re-enable the AOF file after the @@ -2159,75 +2115,13 @@ void disklessLoadDiscardFunctionsLibCtx(functionsLibCtx *temp_functions_lib_ctx) void replicationAttachToNewPrimary(void) { /* Replica starts to apply data from new primary, we must discard the cached * primary structure. */ - serverAssert(server.primary == NULL || server.primary->client == NULL); + serverAssert(server.primary == NULL); replicationDiscardCachedPrimary(); disconnectReplicas(); /* Force our replicas to resync with us as well. */ freeReplicationBacklog(); /* Don't allow our chained replicas to PSYNC. */ } -void replicationFinishSyncPayload(connection *conn, replicationLink *link, int db) { - /* Final setup of the connected replica <- primary link */ - int link_closed = 0; - if (link->rdb_channel_state == REPL_DUAL_CHANNEL_RDB_LOAD) { - if (dualChannelSyncHandleRdbLoadCompletion(link) == C_ERR) { - /* This may happen if, while loading the backlog, our primary is unset */ - serverLog(LL_NOTICE, "%s <-> MYSELF sync: Failed to finalize dual channel load", replicationGetNameForLogs(link)); - freeReplicationLink(link); - link_closed = 1; - } - } else { - /* Client could have been previously created for AOF load. */ - if (!link->client) { - link->client = createReplicationLinkClient(link, link->transfer_s, db); - link->transfer_s = NULL; - } - link->state = REPL_STATE_CONNECTED; - /* Send the initial ACK immediately to put this replica in online state. */ - replicationSendAck(link); - } - - if (!link_closed && link == server.primary) { - server.repl_down_since = 0; - - /* Fire the primary link modules event. */ - moduleFireServerEvent(VALKEYMODULE_EVENT_PRIMARY_LINK_CHANGE, VALKEYMODULE_SUBEVENT_PRIMARY_LINK_UP, NULL); - if (link->state == REPL_STATE_CONNECTED) { - /* After a full resynchronization we use the replication ID and - * offset of the primary. The secondary ID / offset are cleared since - * we are starting a new history. */ - memcpy(server.replid, link->client->repl_data->replid, sizeof(server.replid)); - server.primary_repl_offset = link->client->repl_data->reploff; - } - clearReplicationId2(); - - /* Let's create the replication backlog if needed. 
Replicas need to - * accumulate the backlog regardless of the fact they have sub-replicas - * or not, in order to behave correctly if they are promoted to - * primaries after a failover. */ - if (server.repl_backlog == NULL) createReplicationBacklog(); - serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Finished with success"); - - if (server.supervised_mode == SUPERVISED_SYSTEMD) { - serverCommunicateSystemd("STATUS=PRIMARY <-> REPLICA sync: Finished with success. Ready to accept connections " - "in read-write mode.\n"); - } - } - - /* Restart the AOF subsystem now that we finished the sync. This - * will trigger an AOF rewrite, and when done will start appending - * to the new file. */ - if (server.aof_enabled && server.aof_state != AOF_OFF) restartAOFAfterSYNC(); - - /* In case of dual channel replication sync we want to close the RDB connection - * once the connection is established */ - if (!link_closed && conn == link->rdb_transfer_s) { - connClose(conn); - link->rdb_transfer_s = NULL; - } - return; -} - /* Asynchronously read the SYNC payload we receive from a primary */ #define REPL_MAX_WRITTEN_BEFORE_FSYNC (1024 * 1024 * 8) /* 8 MB */ void readSyncBulkPayload(connection *conn) { @@ -2239,11 +2133,6 @@ void readSyncBulkPayload(connection *conn) { int empty_db_flags = server.repl_replica_lazy_flush ? EMPTYDB_ASYNC : EMPTYDB_NO_FLAGS; off_t left; - replicationLink *link = (replicationLink *)connGetPrivateData(conn); - - /* RDB bulk load will only be used if we are sending all slots. */ - serverAssert(isSlotBitmapAllSlots(link->slot_bitmap)); - /* Static vars used to hold the EOF mark, and the last bytes received * from the server: when they match, we reached the end of the transfer. */ static char eofmark[RDB_EOF_MARK_SIZE]; @@ -2252,10 +2141,10 @@ void readSyncBulkPayload(connection *conn) { /* If repl_transfer_size == -1 we still have to read the bulk length * from the primary reply. */ - if (link->transfer_size == -1) { + if (server.repl_transfer_size == -1) { nread = connSyncReadLine(conn, buf, 1024, server.repl_syncio_timeout * 1000); if (nread == -1) { - serverLog(LL_WARNING, "I/O error reading bulk count from %s: %s", replicationGetNameForLogs(link), connGetLastError(conn)); + serverLog(LL_WARNING, "I/O error reading bulk count from PRIMARY: %s", connGetLastError(conn)); goto error; } else { /* nread here is returned by connSyncReadLine(), which calls syncReadLine() and @@ -2264,19 +2153,18 @@ void readSyncBulkPayload(connection *conn) { } if (buf[0] == '-') { - serverLog(LL_WARNING, "%s aborted replication with an error: %s", replicationGetNameForLogs(link), buf + 1); + serverLog(LL_WARNING, "PRIMARY aborted replication with an error: %s", buf + 1); goto error; } else if (buf[0] == '\0') { /* At this stage just a newline works as a PING in order to take * the connection live. So we refresh our last interaction * timestamp. */ - link->transfer_lastio = server.unixtime; + server.repl_transfer_lastio = server.unixtime; return; } else if (buf[0] != '$') { serverLog(LL_WARNING, - "Bad protocol from %s, the first byte is not '$' (we received '%s'), are you sure the host " + "Bad protocol from PRIMARY, the first byte is not '$' (we received '%s'), are you sure the host " "and port are right?", - replicationGetNameForLogs(link), buf); goto error; } @@ -2297,14 +2185,14 @@ void readSyncBulkPayload(connection *conn) { memset(lastbytes, 0, RDB_EOF_MARK_SIZE); /* Set any repl_transfer_size to avoid entering this code path * at the next call. 
*/ - link->transfer_size = 0; - serverLog(LL_NOTICE, "%s <-> REPLICA sync: receiving streamed RDB from primary with EOF %s", replicationGetNameForLogs(link), + server.repl_transfer_size = 0; + serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: receiving streamed RDB from primary with EOF %s", use_diskless_load ? "to parser" : "to disk"); } else { usemark = 0; - link->transfer_size = strtol(buf + 1, NULL, 10); - serverLog(LL_NOTICE, "%s <-> REPLICA sync: receiving %lld bytes from primary %s", replicationGetNameForLogs(link), - (long long)link->transfer_size, use_diskless_load ? "to parser" : "to disk"); + server.repl_transfer_size = strtol(buf + 1, NULL, 10); + serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: receiving %lld bytes from primary %s", + (long long)server.repl_transfer_size, use_diskless_load ? "to parser" : "to disk"); } return; } @@ -2315,7 +2203,7 @@ void readSyncBulkPayload(connection *conn) { if (usemark) { readlen = sizeof(buf); } else { - left = link->transfer_size - link->transfer_read; + left = server.repl_transfer_size - server.repl_transfer_read; readlen = (left < (signed)sizeof(buf)) ? left : (signed)sizeof(buf); } @@ -2325,7 +2213,7 @@ void readSyncBulkPayload(connection *conn) { /* equivalent to EAGAIN */ return; } - serverLog(LL_WARNING, "I/O error trying to sync with %s: %s", replicationGetNameForLogs(link), + serverLog(LL_WARNING, "I/O error trying to sync with PRIMARY: %s", (nread == -1) ? connGetLastError(conn) : "connection lost"); goto error; } @@ -2351,20 +2239,19 @@ void readSyncBulkPayload(connection *conn) { /* Update the last I/O time for the replication transfer (used in * order to detect timeouts during replication), and write what we * got from the socket to the dump file on disk. */ - link->transfer_lastio = server.unixtime; - if ((nwritten = write(link->transfer_fd, buf, nread)) != nread) { + server.repl_transfer_lastio = server.unixtime; + if ((nwritten = write(server.repl_transfer_fd, buf, nread)) != nread) { serverLog(LL_WARNING, "Write error or short write writing to the DB dump file " - "needed for %s <-> REPLICA synchronization: %s", - replicationGetNameForLogs(link), + "needed for PRIMARY <-> REPLICA synchronization: %s", (nwritten == -1) ? strerror(errno) : "short write"); goto error; } - link->transfer_read += nread; + server.repl_transfer_read += nread; /* Delete the last 40 bytes from the file if we reached EOF. */ if (usemark && eof_reached) { - if (ftruncate(link->transfer_fd, link->transfer_read - RDB_EOF_MARK_SIZE) == -1) { + if (ftruncate(server.repl_transfer_fd, server.repl_transfer_read - RDB_EOF_MARK_SIZE) == -1) { serverLog(LL_WARNING, "Error truncating the RDB file received from the primary " "for SYNC: %s", @@ -2376,15 +2263,15 @@ void readSyncBulkPayload(connection *conn) { /* Sync data on disk from time to time, otherwise at the end of the * transfer we may suffer a big delay as the memory buffers are copied * into the actual disk. 
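The usemark path above relies on the streamed (length-less) transfer convention: the primary announces a 40-byte delimiter in the preamble instead of a payload size (the preamble parsing sits outside this hunk), then repeats the same delimiter to terminate the stream, which is why the replica keeps the last RDB_EOF_MARK_SIZE bytes it has seen. A simplified sketch of that rolling comparison, under the same assumptions:

    #include <string.h>

    #define RDB_EOF_MARK_SIZE 40

    /* Shift the previously seen tail, append the new bytes, and check whether
     * the last RDB_EOF_MARK_SIZE bytes now match the announced delimiter. */
    static int eofMarkReached(const char *eofmark, char *lastbytes,
                              const char *buf, size_t nread) {
        if (nread >= RDB_EOF_MARK_SIZE) {
            memcpy(lastbytes, buf + nread - RDB_EOF_MARK_SIZE, RDB_EOF_MARK_SIZE);
        } else {
            size_t rem = RDB_EOF_MARK_SIZE - nread;
            memmove(lastbytes, lastbytes + nread, rem);
            memcpy(lastbytes + rem, buf, nread);
        }
        return memcmp(lastbytes, eofmark, RDB_EOF_MARK_SIZE) == 0;
    }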
*/ - if (link->transfer_read >= link->transfer_last_fsync_off + REPL_MAX_WRITTEN_BEFORE_FSYNC) { - off_t sync_size = link->transfer_read - link->transfer_last_fsync_off; - rdb_fsync_range(link->transfer_fd, link->transfer_last_fsync_off, sync_size); - link->transfer_last_fsync_off += sync_size; + if (server.repl_transfer_read >= server.repl_transfer_last_fsync_off + REPL_MAX_WRITTEN_BEFORE_FSYNC) { + off_t sync_size = server.repl_transfer_read - server.repl_transfer_last_fsync_off; + rdb_fsync_range(server.repl_transfer_fd, server.repl_transfer_last_fsync_off, sync_size); + server.repl_transfer_last_fsync_off += sync_size; } /* Check if the transfer is now complete */ if (!usemark) { - if (link->transfer_read == link->transfer_size) eof_reached = 1; + if (server.repl_transfer_read == server.repl_transfer_size) eof_reached = 1; } /* If the transfer is yet not complete, we need to read more, so @@ -2447,7 +2334,7 @@ void readSyncBulkPayload(connection *conn) { * It is enabled only on SWAPDB diskless replication when primary replication ID hasn't changed, * because in that state the old content of the db represents a different point in time of the same * data set we're currently receiving from the primary. */ - if (memcmp(server.replid, link->replid, CONFIG_RUN_ID_SIZE) == 0) { + if (memcmp(server.replid, server.primary_replid, CONFIG_RUN_ID_SIZE) == 0) { asyncLoading = 1; } dbarray = diskless_load_tempDb; @@ -2458,34 +2345,29 @@ void readSyncBulkPayload(connection *conn) { replicationAttachToNewPrimary(); /* Even though we are on-empty-db and the database is empty, we still call emptyData. */ - serverLog(LL_NOTICE, "%s <-> REPLICA sync: Flushing old data", replicationGetNameForLogs(link)); + serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Flushing old data"); emptyData(-1, empty_db_flags, replicationEmptyDbCallback); dbarray = server.db; functions_lib_ctx = functionsLibCtxGetCurrent(); } - rioInitWithConn(&rdb, conn, link->transfer_size); + rioInitWithConn(&rdb, conn, server.repl_transfer_size); /* Put the socket in blocking mode to simplify RDB transfer. * We'll restore it when the RDB is received. */ connBlock(conn); connRecvTimeout(conn, server.repl_timeout * 1000); - serverLog(LL_NOTICE, "%s <-> REPLICA sync: Loading DB in memory", replicationGetNameForLogs(link)); - startLoading(link->transfer_size, RDBFLAGS_REPLICATION, asyncLoading); - - /* Before loading, ensure that the link won't be freed, even if - * REPLICAOF NO ONE is called in background event processing. */ - link->protected = 1; + serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Loading DB in memory"); + startLoading(server.repl_transfer_size, RDBFLAGS_REPLICATION, asyncLoading); int loadingFailed = 0; rdbLoadingCtx loadingCtx = {.dbarray = dbarray, .functions_lib_ctx = functions_lib_ctx}; if (rdbLoadRioWithLoadingCtxScopedRdb(&rdb, RDBFLAGS_REPLICATION, &rsi, &loadingCtx) != C_OK) { /* RDB loading failed. */ - serverLog(LL_WARNING, "Failed trying to load the %s synchronization DB " - "from socket, check server logs.", - replicationGetNameForLogs(link)); + serverLog(LL_WARNING, "Failed trying to load the PRIMARY synchronization DB " + "from socket, check server logs."); loadingFailed = 1; } else if (usemark) { /* Verify the end mark is correct. 
*/ @@ -2495,14 +2377,6 @@ void readSyncBulkPayload(connection *conn) { } } - /* After loading, check if replication was cancelled */ - link->protected = 0; - if (link->state == REPL_STATE_CANCELLED) { - /* Link was freed during RDB load */ - serverLog(LL_NOTICE, "%s <-> REPLICA sync: Link to primary closed during diskless RDB load", replicationGetNameForLogs(link)); - loadingFailed = 1; - } - if (loadingFailed) { stopLoading(0); rioFreeConn(&rdb, NULL); @@ -2514,10 +2388,10 @@ void readSyncBulkPayload(connection *conn) { disklessLoadDiscardTempDb(diskless_load_tempDb); disklessLoadDiscardFunctionsLibCtx(temp_functions_lib_ctx); - serverLog(LL_NOTICE, "%s <-> REPLICA sync: Discarding temporary DB in background", replicationGetNameForLogs(link)); + serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Discarding temporary DB in background"); } else { /* Remove the half-loaded data in case we started with an empty replica. */ - serverLog(LL_NOTICE, "%s <-> REPLICA sync: Discarding the half-loaded data", replicationGetNameForLogs(link)); + serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Discarding the half-loaded data"); emptyData(-1, empty_db_flags, replicationEmptyDbCallback); } @@ -2534,7 +2408,7 @@ void readSyncBulkPayload(connection *conn) { * primary structure and force resync of sub-replicas. */ replicationAttachToNewPrimary(); - serverLog(LL_NOTICE, "%s <-> REPLICA sync: Swapping active DB with loaded DB", replicationGetNameForLogs(link)); + serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Swapping active DB with loaded DB"); swapMainDbWithTempDb(diskless_load_tempDb); /* swap existing functions ctx with the temporary one */ @@ -2545,7 +2419,7 @@ void readSyncBulkPayload(connection *conn) { /* Delete the old db as it's useless now. */ disklessLoadDiscardTempDb(diskless_load_tempDb); - serverLog(LL_NOTICE, "%s <-> REPLICA sync: Discarding old DB in background", replicationGetNameForLogs(link)); + serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Discarding old DB in background"); } /* Inform about db change, as replication was diskless and didn't cause a save. */ @@ -2561,22 +2435,20 @@ void readSyncBulkPayload(connection *conn) { } else { /* Make sure the new file (also used for persistence) is fully synced * (not covered by earlier calls to rdb_fsync_range). */ - if (fsync(link->transfer_fd) == -1) { + if (fsync(server.repl_transfer_fd) == -1) { serverLog(LL_WARNING, "Failed trying to sync the temp DB to disk in " - "%s <-> REPLICA synchronization: %s", - replicationGetNameForLogs(link), + "PRIMARY <-> REPLICA synchronization: %s", strerror(errno)); goto error; } /* Rename rdb like renaming rewrite aof asynchronously. 
*/ int old_rdb_fd = open(server.rdb_filename, O_RDONLY | O_NONBLOCK); - if (rename(link->transfer_tmpfile, server.rdb_filename) == -1) { + if (rename(server.repl_transfer_tmpfile, server.rdb_filename) == -1) { serverLog(LL_WARNING, "Failed trying to rename the temp DB into %s in " - "%s <-> REPLICA synchronization: %s", - replicationGetNameForLogs(link), + "PRIMARY <-> REPLICA synchronization: %s", server.rdb_filename, strerror(errno)); if (old_rdb_fd != -1) close(old_rdb_fd); goto error; @@ -2588,8 +2460,7 @@ void readSyncBulkPayload(connection *conn) { if (fsyncFileDir(server.rdb_filename) == -1) { serverLog(LL_WARNING, "Failed trying to sync DB directory %s in " - "%s <-> REPLICA synchronization: %s", - replicationGetNameForLogs(link), + "PRIMARY <-> REPLICA synchronization: %s", server.rdb_filename, strerror(errno)); goto error; } @@ -2601,14 +2472,13 @@ void readSyncBulkPayload(connection *conn) { /* Empty the databases only after the RDB file is ok, that is, before the RDB file * is actually loaded, in case we encounter an error and drop the replication stream * and leave an empty database. */ - serverLog(LL_NOTICE, "%s <-> REPLICA sync: Flushing old data", replicationGetNameForLogs(link)); + serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Flushing old data"); emptyData(-1, empty_db_flags, replicationEmptyDbCallback); - serverLog(LL_NOTICE, "%s <-> REPLICA sync: Loading DB in memory", replicationGetNameForLogs(link)); + serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Loading DB in memory"); if (rdbLoad(server.rdb_filename, &rsi, RDBFLAGS_REPLICATION) != RDB_OK) { - serverLog(LL_WARNING, "Failed trying to load the %s synchronization " - "DB from disk, check server logs.", - replicationGetNameForLogs(link)); + serverLog(LL_WARNING, "Failed trying to load the PRIMARY synchronization " + "DB from disk, check server logs."); if (server.rdb_del_sync_files && allPersistenceDisabled()) { serverLog(LL_NOTICE, "Removing the RDB file obtained from " "the primary. This replica has persistence " @@ -2617,7 +2487,7 @@ void readSyncBulkPayload(connection *conn) { } /* If disk-based RDB loading fails, remove the half-loaded dataset. */ - serverLog(LL_NOTICE, "%s<-> REPLICA sync: Discarding the half-loaded data", replicationGetNameForLogs(link)); + serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Discarding the half-loaded data"); emptyData(-1, empty_db_flags, replicationEmptyDbCallback); /* Note that there's no point in restarting the AOF on sync failure, @@ -2633,17 +2503,61 @@ void readSyncBulkPayload(connection *conn) { bg_unlink(server.rdb_filename); } - zfree(link->transfer_tmpfile); - close(link->transfer_fd); - link->transfer_fd = -1; - link->transfer_tmpfile = NULL; + zfree(server.repl_transfer_tmpfile); + close(server.repl_transfer_fd); + server.repl_transfer_fd = -1; + server.repl_transfer_tmpfile = NULL; + } + + /* Final setup of the connected replica <- primary link */ + if (conn == server.repl_rdb_transfer_s) { + dualChannelSyncHandleRdbLoadCompletion(); + } else { + replicationCreatePrimaryClient(server.repl_transfer_s, rsi.repl_stream_db); + server.repl_state = REPL_STATE_CONNECTED; + server.repl_down_since = 0; + /* Send the initial ACK immediately to put this replica in online state. */ + replicationSendAck(server.primary); + } + + /* Fire the primary link modules event. 
*/ + moduleFireServerEvent(VALKEYMODULE_EVENT_PRIMARY_LINK_CHANGE, VALKEYMODULE_SUBEVENT_PRIMARY_LINK_UP, NULL); + if (server.repl_state == REPL_STATE_CONNECTED) { + /* After a full resynchronization we use the replication ID and + * offset of the primary. The secondary ID / offset are cleared since + * we are starting a new history. */ + memcpy(server.replid, server.primary->repl_data->replid, sizeof(server.replid)); + server.primary_repl_offset = server.primary->repl_data->reploff; + } + clearReplicationId2(); + + /* Let's create the replication backlog if needed. Replicas need to + * accumulate the backlog regardless of the fact they have sub-replicas + * or not, in order to behave correctly if they are promoted to + * primaries after a failover. */ + if (server.repl_backlog == NULL) createReplicationBacklog(); + serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Finished with success"); + + if (server.supervised_mode == SUPERVISED_SYSTEMD) { + serverCommunicateSystemd("STATUS=PRIMARY <-> REPLICA sync: Finished with success. Ready to accept connections " + "in read-write mode.\n"); } - replicationFinishSyncPayload(conn, link, rsi.repl_stream_db); + /* Restart the AOF subsystem now that we finished the sync. This + * will trigger an AOF rewrite, and when done will start appending + * to the new file. */ + if (server.aof_enabled) restartAOFAfterSYNC(); + + /* In case of dual channel replication sync we want to close the RDB connection + * once the connection is established */ + if (conn == server.repl_rdb_transfer_s) { + connClose(conn); + server.repl_rdb_transfer_s = NULL; + } return; error: - if (link) cancelReplicationHandshake(link, 1); + cancelReplicationHandshake(1); return; } @@ -2654,8 +2568,7 @@ char *receiveSynchronousResponse(connection *conn) { serverLog(LL_WARNING, "Failed to read response from the server: %s", connGetLastError(conn)); return NULL; } - replicationLink *link = (replicationLink *)connGetPrivateData(conn); - link->transfer_lastio = server.unixtime; + server.repl_transfer_lastio = server.unixtime; return sdsnew(buf); } @@ -2752,34 +2665,35 @@ sds getReplicaPortString(void) { /* Replication: Replica side. * Free replica's local replication buffer */ -void freePendingReplDataBuf(replicationLink *link) { - listRelease(link->pending_repl_data.blocks); - link->pending_repl_data.blocks = NULL; - link->pending_repl_data.len = 0; +void freePendingReplDataBuf(void) { + listRelease(server.pending_repl_data.blocks); + server.pending_repl_data.blocks = NULL; + server.pending_repl_data.len = 0; } /* Replication: Replica side. * Upon dual-channel sync failure, close rdb-connection, reset repl-state, reset * provisional primary struct, and free local replication buffer. 
*/ -void replicationAbortDualChannelSyncTransfer(replicationLink *link) { - serverAssert(link->rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE); +void replicationAbortDualChannelSyncTransfer(void) { + serverAssert(server.repl_rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE); dualChannelServerLog(LL_NOTICE, "Aborting dual channel sync"); - if (link->rdb_transfer_s) { - connClose(link->rdb_transfer_s); - link->rdb_transfer_s = NULL; - } - zfree(link->transfer_tmpfile); - link->transfer_tmpfile = NULL; - if (link->transfer_fd != -1) { - close(link->transfer_fd); - link->transfer_fd = -1; - } - link->rdb_channel_state = REPL_DUAL_CHANNEL_STATE_NONE; - link->provisional_source_state.read_reploff = 0; - link->provisional_source_state.reploff = 0; - link->provisional_source_state.dbid = -1; - link->rdb_client_id = -1; - freePendingReplDataBuf(link); + if (server.repl_rdb_transfer_s) { + connClose(server.repl_rdb_transfer_s); + server.repl_rdb_transfer_s = NULL; + } + zfree(server.repl_transfer_tmpfile); + server.repl_transfer_tmpfile = NULL; + if (server.repl_transfer_fd != -1) { + close(server.repl_transfer_fd); + server.repl_transfer_fd = -1; + } + server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_STATE_NONE; + server.repl_provisional_primary.read_reploff = 0; + server.repl_provisional_primary.reploff = 0; + server.repl_provisional_primary.conn = NULL; + server.repl_provisional_primary.dbid = -1; + server.rdb_client_id = -1; + freePendingReplDataBuf(); return; } @@ -2801,7 +2715,7 @@ int sendCurrentOffsetToReplica(client *replica) { return C_OK; } -static int dualChannelReplHandleHandshake(replicationLink *link, sds *err) { +static int dualChannelReplHandleHandshake(connection *conn, sds *err) { dualChannelServerLog(LL_DEBUG, "Received first reply from primary using rdb connection."); /* AUTH with the primary if required. 
*/ if (server.primary_auth) { @@ -2816,7 +2730,7 @@ static int dualChannelReplHandleHandshake(replicationLink *link, sds *err) { args[argc] = server.primary_auth; lens[argc] = sdslen(server.primary_auth); argc++; - *err = sendCommandArgv(link->rdb_transfer_s, argc, args, lens); + *err = sendCommandArgv(conn, argc, args, lens); if (*err) { dualChannelServerLog(LL_WARNING, "Sending command to primary in dual channel replication handshake: %s", *err); return C_ERR; @@ -2824,7 +2738,7 @@ static int dualChannelReplHandleHandshake(replicationLink *link, sds *err) { } /* Send replica listening port to primary for clarification */ sds portstr = getReplicaPortString(); - *err = sendCommand(link->rdb_transfer_s, "REPLCONF", "capa", "eof", "rdb-only", "1", "rdb-channel", "1", "listening-port", portstr, + *err = sendCommand(conn, "REPLCONF", "capa", "eof", "rdb-only", "1", "rdb-channel", "1", "listening-port", portstr, NULL); sdsfree(portstr); if (*err) { @@ -2832,30 +2746,17 @@ static int dualChannelReplHandleHandshake(replicationLink *link, sds *err) { return C_ERR; } - /* Send slot bitmap, if it is needed */ - if (!isSlotBitmapAllSlots(link->slot_bitmap)) { - char *args[] = {"REPLCONF", "slot-bitmap", NULL}; - size_t lens[] = {8, 11, 0}; - args[2] = (char *) link->slot_bitmap; - lens[2] = sizeof(slotBitmap); - *err = sendCommandArgv(link->rdb_transfer_s, 3, args, lens); - if (*err) { - dualChannelServerLog(LL_WARNING, "Sending REPLCONF slot-bitmap command to primary in dual channel replication handshake: %s", *err); - return C_ERR; - } - } - - if (connSetReadHandler(link->rdb_transfer_s, dualChannelFullSyncWithReplicationSource) == C_ERR) { + if (connSetReadHandler(conn, dualChannelFullSyncWithPrimary) == C_ERR) { char conninfo[CONN_INFO_LEN]; dualChannelServerLog(LL_WARNING, "Can't create readable event for SYNC: %s (%s)", strerror(errno), - connGetInfo(link->transfer_s, conninfo, sizeof(conninfo))); + connGetInfo(conn, conninfo, sizeof(conninfo))); return C_ERR; } return C_OK; } -static int dualChannelReplHandleAuthReply(replicationLink *link, sds *err) { - *err = receiveSynchronousResponse(link->rdb_transfer_s); +static int dualChannelReplHandleAuthReply(connection *conn, sds *err) { + *err = receiveSynchronousResponse(conn); if (*err == NULL) { dualChannelServerLog(LL_WARNING, "Primary did not respond to auth command during SYNC handshake"); return C_ERR; @@ -2864,11 +2765,12 @@ static int dualChannelReplHandleAuthReply(replicationLink *link, sds *err) { dualChannelServerLog(LL_WARNING, "Unable to AUTH to Primary: %s", *err); return C_ERR; } + server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_REPLCONF_REPLY; return C_OK; } -static int dualChannelReplHandleReplconfReply(replicationLink *link, sds *err) { - *err = receiveSynchronousResponse(link->rdb_transfer_s); +static int dualChannelReplHandleReplconfReply(connection *conn, sds *err) { + *err = receiveSynchronousResponse(conn); if (*err == NULL) { dualChannelServerLog(LL_WARNING, "Primary did not respond to replconf command during SYNC handshake"); return C_ERR; @@ -2879,36 +2781,16 @@ static int dualChannelReplHandleReplconfReply(replicationLink *link, sds *err) { *err); return C_ERR; } - - /* Recieve slot bitmap response as well. 
*/ - if (!isSlotBitmapAllSlots(link->slot_bitmap)) { - *err = receiveSynchronousResponse(link->rdb_transfer_s); - if (*err == NULL) { - dualChannelServerLog(LL_WARNING, "Primary did not respond to replconf slot-bitmap command during SYNC handshake"); - return C_ERR; - } - - if (*err[0] == '-') { - dualChannelServerLog(LL_NOTICE, "Server does not support sync with slot-bitmap, dual channel sync approach cannot be used: %s", - *err); - return C_ERR; - } - } - - if (connSyncWrite(link->rdb_transfer_s, "SYNC\r\n", 6, server.repl_syncio_timeout * 1000) == -1) { - dualChannelServerLog(LL_WARNING, "I/O error writing to Primary: %s", connGetLastError(link->rdb_transfer_s)); + if (connSyncWrite(conn, "SYNC\r\n", 6, server.repl_syncio_timeout * 1000) == -1) { + dualChannelServerLog(LL_WARNING, "I/O error writing to Primary: %s", connGetLastError(conn)); return C_ERR; } return C_OK; } -int replicationUseAOFFormatSnapshot(replicationLink *link) { - return !isSlotBitmapAllSlots(link->slot_bitmap); -} - -static int dualChannelReplHandleEndOffsetResponse(replicationLink *link, sds *err) { +static int dualChannelReplHandleEndOffsetResponse(connection *conn, sds *err) { uint64_t rdb_client_id; - *err = receiveSynchronousResponse(link->rdb_transfer_s); + *err = receiveSynchronousResponse(conn); if (*err == NULL) { return C_ERR; } @@ -2926,34 +2808,30 @@ static int dualChannelReplHandleEndOffsetResponse(replicationLink *link, sds *er dualChannelServerLog(LL_WARNING, "Received unexpected $ENDOFF response: %s", *err); return C_ERR; } - link->rdb_client_id = rdb_client_id; - link->initial_offset = reploffset; + server.rdb_client_id = rdb_client_id; + server.primary_initial_offset = reploffset; /* Initiate repl_provisional_primary to act as this replica temp primary until RDB is loaded */ - memcpy(link->provisional_source_state.replid, primary_replid, CONFIG_RUN_ID_SIZE + 1); - link->provisional_source_state.reploff = reploffset; - link->provisional_source_state.read_reploff = reploffset; - link->provisional_source_state.dbid = dbid; + server.repl_provisional_primary.conn = server.repl_transfer_s; + memcpy(server.repl_provisional_primary.replid, primary_replid, sizeof(server.repl_provisional_primary.replid)); + server.repl_provisional_primary.reploff = reploffset; + server.repl_provisional_primary.read_reploff = reploffset; + server.repl_provisional_primary.dbid = dbid; /* Now that we have the snapshot end-offset, we can ask for psync from that offset. 
Prepare the * main connection accordingly.*/ - link->transfer_s->state = CONN_STATE_CONNECTED; - link->state = REPL_STATE_SEND_HANDSHAKE; - serverAssert(connSetReadHandler(link->transfer_s, dualChannelSetupMainConnForPsync) != C_ERR); - dualChannelSetupMainConnForPsync(link->transfer_s); + server.repl_transfer_s->state = CONN_STATE_CONNECTED; + server.repl_state = REPL_STATE_SEND_HANDSHAKE; + serverAssert(connSetReadHandler(server.repl_transfer_s, dualChannelSetupMainConnForPsync) != C_ERR); + dualChannelSetupMainConnForPsync(server.repl_transfer_s); - /* As the next block we will receive using this connection is the snapshot, we need to prepare + /* As the next block we will receive using this connection is the rdb, we need to prepare * the connection accordingly */ - if (replicationUseAOFFormatSnapshot(link)) { - link->client = createReplicationLinkClientWithHandler(link, link->rdb_transfer_s, -1, readQueryFromClient); - link->rdb_transfer_s = NULL; - } else { - serverAssert(connSetReadHandler(link->rdb_transfer_s, readSyncBulkPayload) != C_ERR); - } - link->transfer_size = -1; - link->transfer_read = 0; - link->transfer_last_fsync_off = 0; - link->transfer_lastio = server.unixtime; + serverAssert(connSetReadHandler(server.repl_rdb_transfer_s, readSyncBulkPayload) != C_ERR); + server.repl_transfer_size = -1; + server.repl_transfer_read = 0; + server.repl_transfer_last_fsync_off = 0; + server.repl_transfer_lastio = server.unixtime; return C_OK; } @@ -2961,15 +2839,15 @@ static int dualChannelReplHandleEndOffsetResponse(replicationLink *link, sds *er /* Replication: Replica side. * This connection handler is used to initialize the RDB connection (dual-channel-replication). * Once a replica with dual-channel-replication enabled, denied from PSYNC with its primary, - * dualChannelFullSyncWithReplicationSource begins its role. The connection handler prepares server.repl_rdb_transfer_s + * dualChannelFullSyncWithPrimary begins its role. The connection handler prepares server.repl_rdb_transfer_s * for a rdb stream, and server.repl_transfer_s for incremental replication data stream. */ -static void dualChannelFullSyncWithReplicationSource(connection *conn) { +static void dualChannelFullSyncWithPrimary(connection *conn) { char *err = NULL; int ret = 0; - replicationLink *link = (replicationLink *)connGetPrivateData(conn); + serverAssert(conn == server.repl_rdb_transfer_s); /* If this event fired after the user turned the instance into a primary * with REPLICAOF NO ONE we must just return ASAP. 
*/ - if (link->state == REPL_STATE_NONE) { + if (server.repl_state == REPL_STATE_NONE) { goto error; } /* Check for errors in the socket: after a non blocking connect() we @@ -2979,30 +2857,30 @@ static void dualChannelFullSyncWithReplicationSource(connection *conn) { connGetLastError(conn)); goto error; } - switch (link->rdb_channel_state) { + switch (server.repl_rdb_channel_state) { case REPL_DUAL_CHANNEL_SEND_HANDSHAKE: - ret = dualChannelReplHandleHandshake(link, &err); - if (ret == C_OK) link->rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_AUTH_REPLY; + ret = dualChannelReplHandleHandshake(conn, &err); + if (ret == C_OK) server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_AUTH_REPLY; break; case REPL_DUAL_CHANNEL_RECEIVE_AUTH_REPLY: if (server.primary_auth) { - ret = dualChannelReplHandleAuthReply(link, &err); - if (ret == C_OK) link->rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_REPLCONF_REPLY; + ret = dualChannelReplHandleAuthReply(conn, &err); + if (ret == C_OK) server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_REPLCONF_REPLY; /* Wait for next bulk before trying to read replconf reply. */ break; } - link->rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_REPLCONF_REPLY; + server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_REPLCONF_REPLY; /* fall through */ case REPL_DUAL_CHANNEL_RECEIVE_REPLCONF_REPLY: - ret = dualChannelReplHandleReplconfReply(link, &err); - if (ret == C_OK) link->rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_ENDOFF; + ret = dualChannelReplHandleReplconfReply(conn, &err); + if (ret == C_OK) server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_ENDOFF; break; case REPL_DUAL_CHANNEL_RECEIVE_ENDOFF: - ret = dualChannelReplHandleEndOffsetResponse(link, &err); - if (ret == C_OK) link->rdb_channel_state = REPL_DUAL_CHANNEL_RDB_LOAD; + ret = dualChannelReplHandleEndOffsetResponse(conn, &err); + if (ret == C_OK) server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_RDB_LOAD; break; default: - serverPanic("Unexpected dual replication state: %d", link->rdb_channel_state); + serverPanic("Unexpected dual replication state: %d", server.repl_rdb_channel_state); } if (ret == C_ERR) goto error; sdsfree(err); @@ -3013,33 +2891,29 @@ static void dualChannelFullSyncWithReplicationSource(connection *conn) { serverLog(LL_WARNING, "Dual channel sync failed with error %s", err); sdsfree(err); } - if (link->transfer_s) { - connClose(link->transfer_s); - link->transfer_s = NULL; + if (server.repl_transfer_s) { + connClose(server.repl_transfer_s); + server.repl_transfer_s = NULL; } - if (link->rdb_transfer_s) { - connClose(link->rdb_transfer_s); - link->rdb_transfer_s = NULL; - } - if (link->transfer_fd != -1) close(link->transfer_fd); - link->transfer_fd = -1; - link->state = REPL_STATE_CONNECT; - replicationAbortDualChannelSyncTransfer(link); - if (link->client) { - freeClient(link->client); - link->client = NULL; + if (server.repl_rdb_transfer_s) { + connClose(server.repl_rdb_transfer_s); + server.repl_rdb_transfer_s = NULL; } + if (server.repl_transfer_fd != -1) close(server.repl_transfer_fd); + server.repl_transfer_fd = -1; + server.repl_state = REPL_STATE_CONNECT; + replicationAbortDualChannelSyncTransfer(); } /* Replication: Replica side. 
* Initialize server.pending_repl_data infrastructure, we will allocate the buffer * itself once we need it */ -void replDataBufInit(replicationLink *link) { - serverAssert(link->pending_repl_data.blocks == NULL); - link->pending_repl_data.len = 0; - link->pending_repl_data.peak = 0; - link->pending_repl_data.blocks = listCreate(); - link->pending_repl_data.blocks->free = zfree; +void replDataBufInit(void) { + serverAssert(server.pending_repl_data.blocks == NULL); + server.pending_repl_data.len = 0; + server.pending_repl_data.peak = 0; + server.pending_repl_data.blocks = listCreate(); + server.pending_repl_data.blocks->free = zfree; } /* Replication: Replica side. @@ -3050,7 +2924,7 @@ void replStreamProgressCallback(size_t offset, int readlen, time_t *last_progres ((offset + readlen) / server.loading_process_events_interval_bytes > offset / server.loading_process_events_interval_bytes) && (now - *last_progress_callback > server.loading_process_events_interval_ms)) { - replicationSendNewlineToConnectedLinks(); + replicationSendNewlineToPrimary(); processEventsWhileBlocked(); *last_progress_callback = now; } @@ -3065,16 +2939,14 @@ typedef struct replDataBufBlock { /* Replication: Replica side. * Reads replication data from primary into specified repl buffer block */ -int readIntoReplDataBlock(replicationLink *link, replDataBufBlock *data_block, size_t read) { - int nread = connRead(link->transfer_s, data_block->buf + data_block->used, read); +int readIntoReplDataBlock(connection *conn, replDataBufBlock *data_block, size_t read) { + int nread = connRead(conn, data_block->buf + data_block->used, read); if (nread <= 0) { - if (nread == 0 || connGetState(link->transfer_s) != CONN_STATE_CONNECTED) { + if (nread == 0 || connGetState(conn) != CONN_STATE_CONNECTED) { dualChannelServerLog(LL_WARNING, "Provisional primary closed connection"); - if (link->rdb_channel_state == REPL_DUAL_CHANNEL_RDB_LOAD) { - /* Signal ongoing RDB load to terminate gracefully */ - if (server.loading_rio) rioCloseASAP(server.loading_rio); - } - cancelReplicationHandshake(link, 1); + /* Signal ongoing RDB load to terminate gracefully */ + if (server.loading_rio) rioCloseASAP(server.loading_rio); + cancelReplicationHandshake(1); } return C_ERR; } @@ -3089,10 +2961,8 @@ void bufferReplData(connection *conn) { size_t readlen = PROTO_IOBUF_LEN; int remaining_bytes = 0; - replicationLink *link = (replicationLink *)connGetPrivateData(conn); - while (readlen > 0) { - listNode *ln = listLast(link->pending_repl_data.blocks); + listNode *ln = listLast(server.pending_repl_data.blocks); replDataBufBlock *tail = ln ? listNodeValue(ln) : NULL; /* Append to tail string when possible */ @@ -3100,11 +2970,11 @@ void bufferReplData(connection *conn) { size_t avail = tail->size - tail->used; remaining_bytes = min(readlen, avail); readlen -= remaining_bytes; - remaining_bytes = readIntoReplDataBlock(link, tail, remaining_bytes); + remaining_bytes = readIntoReplDataBlock(conn, tail, remaining_bytes); } if (readlen && remaining_bytes == 0) { if (server.client_obuf_limits[CLIENT_TYPE_REPLICA].hard_limit_bytes && - link->pending_repl_data.len > server.client_obuf_limits[CLIENT_TYPE_REPLICA].hard_limit_bytes) { + server.pending_repl_data.len > server.client_obuf_limits[CLIENT_TYPE_REPLICA].hard_limit_bytes) { dualChannelServerLog(LL_NOTICE, "Replication buffer limit reached, stopping buffering."); /* Stop accumulating primary commands. 
*/ connSetReadHandler(conn, NULL); @@ -3119,15 +2989,15 @@ void bufferReplData(connection *conn) { tail = zmalloc_usable(size + sizeof(replDataBufBlock), &usable_size); tail->size = usable_size - sizeof(replDataBufBlock); tail->used = 0; - listAddNodeTail(link->pending_repl_data.blocks, tail); - link->pending_repl_data.len += tail->size; + listAddNodeTail(server.pending_repl_data.blocks, tail); + server.pending_repl_data.len += tail->size; /* Update buffer's peak */ - if (link->pending_repl_data.peak < link->pending_repl_data.len) - link->pending_repl_data.peak = link->pending_repl_data.len; + if (server.pending_repl_data.peak < server.pending_repl_data.len) + server.pending_repl_data.peak = server.pending_repl_data.len; remaining_bytes = min(readlen, tail->size); readlen -= remaining_bytes; - remaining_bytes = readIntoReplDataBlock(link, tail, remaining_bytes); + remaining_bytes = readIntoReplDataBlock(conn, tail, remaining_bytes); } if (remaining_bytes > 0) { /* Stop reading in case we read less than we anticipated */ @@ -3141,34 +3011,29 @@ void bufferReplData(connection *conn) { /* Replication: Replica side. * Streams accumulated replication data into the database while freeing read nodes */ -int streamReplDataBufToDb(replicationLink *link) { - serverAssert(link->client->flag.replication_source); +int streamReplDataBufToDb(client *c) { + serverAssert(c->flag.primary); blockingOperationStarts(); size_t used, offset = 0; listNode *cur = NULL; time_t last_progress_callback = mstime(); - - /* Before loading, protect our link from being destructed. */ - link->protected = 1; - - while (link->pending_repl_data.blocks && (cur = listFirst(link->pending_repl_data.blocks))) { + while (server.pending_repl_data.blocks && (cur = listFirst(server.pending_repl_data.blocks))) { /* Read and process repl data block */ replDataBufBlock *o = listNodeValue(cur); used = o->used; - link->client->querybuf = sdscatlen(link->client->querybuf, o->buf, used); - link->client->repl_data->read_reploff += used; - processInputBuffer(link->client); - link->pending_repl_data.len -= used; + c->querybuf = sdscatlen(c->querybuf, o->buf, used); + c->repl_data->read_reploff += used; + processInputBuffer(c); + server.pending_repl_data.len -= used; offset += used; - listDelNode(link->pending_repl_data.blocks, cur); + listDelNode(server.pending_repl_data.blocks, cur); replStreamProgressCallback(offset, used, &last_progress_callback); } - link->protected = 0; blockingOperationEnds(); - - if (link->state == REPL_STATE_CANCELLED) { + if (!server.pending_repl_data.blocks) { /* If we encounter a `replicaof` command during the replStreamProgressCallback, - * we should return an error and abort the current sync session. */ + * pending_repl_data.blocks will be NULL, and we should return an error and + * abort the current sync session. */ return C_ERR; } return C_OK; @@ -3177,64 +3042,65 @@ int streamReplDataBufToDb(replicationLink *link) { /* Replication: Replica side. * After done loading the snapshot using the rdb-channel prepare this replica for steady state by * initializing the primary client, amd stream local incremental buffer into memory. 
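For orientation, the two hunks above implement an accumulate-then-drain pattern on the replica: while the RDB is still loading, incremental replication traffic is appended to a list of fixed-capacity blocks, and once loading finishes the blocks are fed to the command stream in order and freed. The sketch below is a minimal, self-contained model of that pattern; the struct, sizes, and function names are simplified stand-ins for illustration, not the server's actual types.

/* Illustrative stand-in for the pending replication buffer: append bytes to
 * the tail block, allocating a new block when the tail is full, then drain
 * the blocks in arrival order. Not the server's real list or allocator. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct block {
    struct block *next;
    size_t size, used;
    char buf[];                       /* flexible array member, like replDataBufBlock */
} block;

typedef struct { block *head, *tail; size_t len, peak; } pendingBuf;

static void bufferAppend(pendingBuf *p, const char *data, size_t n, size_t blocksize) {
    while (n > 0) {
        block *t = p->tail;
        if (t == NULL || t->used == t->size) {
            size_t cap = n > blocksize ? n : blocksize;
            t = malloc(sizeof(*t) + cap);
            t->next = NULL; t->size = cap; t->used = 0;
            if (p->tail) p->tail->next = t; else p->head = t;
            p->tail = t;
            p->len += cap;                        /* track reserved capacity, as the hunk does */
            if (p->len > p->peak) p->peak = p->len;
        }
        size_t take = n < t->size - t->used ? n : t->size - t->used;
        memcpy(t->buf + t->used, data, take);
        t->used += take; data += take; n -= take;
    }
}

static void bufferDrain(pendingBuf *p) {
    while (p->head) {                             /* stand-in for feeding processInputBuffer() */
        block *b = p->head;
        fwrite(b->buf, 1, b->used, stdout);
        p->head = b->next; p->len -= b->size; free(b);
    }
    p->tail = NULL;
}

int main(void) {
    pendingBuf p = {0};
    bufferAppend(&p, "SET k v\r\n", 9, 16);
    bufferAppend(&p, "DEL k\r\n", 7, 16);
    bufferDrain(&p);
    printf("peak bytes reserved: %zu\n", p.peak);
    return 0;
}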
*/ -int dualChannelSyncSuccess(replicationLink *link) { - link->initial_offset = link->provisional_source_state.reploff; - replicationResurrectProvisionalSource(link); +void dualChannelSyncSuccess(void) { + server.primary_initial_offset = server.repl_provisional_primary.reploff; + replicationResurrectProvisionalPrimary(); /* Wait for the accumulated buffer to be processed before reading any more replication updates */ - if (link->pending_repl_data.blocks && streamReplDataBufToDb(link) == C_ERR) { + if (server.pending_repl_data.blocks && streamReplDataBufToDb(server.primary) == C_ERR) { /* Sync session aborted during repl data streaming. */ dualChannelServerLog(LL_WARNING, "Failed to stream local replication buffer into memory"); /* Verify sync is still in progress */ - if (link->rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) { - replicationAbortDualChannelSyncTransfer(link); + if (server.repl_rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) { + replicationAbortDualChannelSyncTransfer(); + replicationUnsetPrimary(); } - return C_ERR; + return; } - freePendingReplDataBuf(link); + freePendingReplDataBuf(); dualChannelServerLog(LL_NOTICE, "Successfully streamed replication data into memory"); /* We can resume reading from the primary connection once the local replication buffer has been loaded. */ - replicationSteadyStateInit(link); - replicationSendAck(link); /* Send ACK to notify primary that replica is synced */ - link->rdb_client_id = -1; - link->rdb_channel_state = REPL_DUAL_CHANNEL_STATE_NONE; - return C_OK; + replicationSteadyStateInit(); + replicationSendAck(server.primary); /* Send ACK to notify primary that replica is synced */ + server.rdb_client_id = -1; + server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_STATE_NONE; } /* Replication: Replica side. * Main channel successfully established psync with primary. Check whether the rdb channel * has completed its part and act accordingly. */ -int dualChannelSyncHandlePsync(replicationLink *link) { - serverAssert(link->state == REPL_STATE_RECEIVE_PSYNC_REPLY); - if (link->rdb_channel_state < REPL_DUAL_CHANNEL_RDB_LOADED) { +int dualChannelSyncHandlePsync(void) { + serverAssert(server.repl_state == REPL_STATE_RECEIVE_PSYNC_REPLY); + if (server.repl_rdb_channel_state < REPL_DUAL_CHANNEL_RDB_LOADED) { /* RDB is still loading */ - if (connSetReadHandler(link->transfer_s, bufferReplData) == C_ERR) { + if (connSetReadHandler(server.repl_provisional_primary.conn, bufferReplData) == C_ERR) { dualChannelServerLog(LL_WARNING, "Error while setting readable handler: %s", strerror(errno)); - cancelReplicationHandshake(link, 1); + cancelReplicationHandshake(1); return C_ERR; } - replDataBufInit(link); + replDataBufInit(); return C_OK; } - serverAssert(link->rdb_channel_state == REPL_DUAL_CHANNEL_RDB_LOADED); + serverAssert(server.repl_rdb_channel_state == REPL_DUAL_CHANNEL_RDB_LOADED); /* RDB is loaded */ dualChannelServerLog(LL_DEBUG, "Psync established after rdb load"); - dualChannelSyncSuccess(link); + dualChannelSyncSuccess(); return C_OK; } /* Replication: Replica side. * RDB channel done loading the RDB. Check whether the main channel has completed its part * and act accordingly. 
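Taken together, dualChannelSyncHandlePsync above and the RDB-load completion handler that follows form a small rendezvous: whichever channel finishes last is the one that triggers dualChannelSyncSuccess. A toy model of that coordination is below; the flag and function names are invented for illustration and are not part of this patch.

/* Toy rendezvous between the main (PSYNC) channel and the RDB channel:
 * whichever side completes second performs the final step. Names invented. */
#include <stdio.h>

typedef struct { int psync_established; int rdb_loaded; } dualSync;

static void maybeComplete(dualSync *s) {
    if (s->psync_established && s->rdb_loaded)
        printf("both channels done: stream buffered commands, then ACK the primary\n");
}

static void onPsyncEstablished(dualSync *s) { s->psync_established = 1; maybeComplete(s); }
static void onRdbLoaded(dualSync *s)        { s->rdb_loaded = 1;        maybeComplete(s); }

int main(void) {
    dualSync s = {0, 0};
    onRdbLoaded(&s);         /* RDB channel finished first... */
    onPsyncEstablished(&s);  /* ...so the main channel's PSYNC completes the sync */
    return 0;
}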
*/ -int dualChannelSyncHandleRdbLoadCompletion(replicationLink *link) { - serverAssert(link->rdb_channel_state == REPL_DUAL_CHANNEL_RDB_LOAD); - if (link->state < REPL_STATE_TRANSFER) { +void dualChannelSyncHandleRdbLoadCompletion(void) { + serverAssert(server.repl_rdb_channel_state == REPL_DUAL_CHANNEL_RDB_LOAD); + if (server.repl_state < REPL_STATE_TRANSFER) { /* Main psync channel hasn't been established yet */ - link->rdb_channel_state = REPL_DUAL_CHANNEL_RDB_LOADED; - return C_OK; + server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_RDB_LOADED; + return; } - serverAssert(link->state == REPL_STATE_TRANSFER); - connSetReadHandler(link->transfer_s, NULL); - return dualChannelSyncSuccess(link); + serverAssert(server.repl_state == REPL_STATE_TRANSFER); + connSetReadHandler(server.repl_transfer_s, NULL); + dualChannelSyncSuccess(); + return; } /* Try a partial resynchronization with the primary if we are about to reconnect. @@ -3292,8 +3158,8 @@ int dualChannelSyncHandleRdbLoadCompletion(replicationLink *link) { #define PSYNC_NOT_SUPPORTED 4 #define PSYNC_TRY_LATER 5 #define PSYNC_FULLRESYNC_DUAL_CHANNEL 6 -int replicaTryPartialResynchronization(replicationLink *link, int read_reply) { - char *psync_replid = NULL; +int replicaTryPartialResynchronization(connection *conn, int read_reply) { + char *psync_replid; char psync_offset[32]; sds reply; @@ -3304,25 +3170,21 @@ int replicaTryPartialResynchronization(replicationLink *link, int read_reply) { * a FULL resync using the PSYNC command we'll set the offset at the * right value, so that this information will be propagated to the * client structure representing the primary into server.primary. */ - link->initial_offset = -1; + server.primary_initial_offset = -1; - if (link->rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) { + if (server.repl_rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) { /* While in dual channel replication, we should use our prepared repl id and offset. 
*/ - psync_replid = link->provisional_source_state.replid; - snprintf(psync_offset, sizeof(psync_offset), "%lld", link->provisional_source_state.reploff + 1); + psync_replid = server.repl_provisional_primary.replid; + snprintf(psync_offset, sizeof(psync_offset), "%lld", server.repl_provisional_primary.reploff + 1); dualChannelServerLog(LL_NOTICE, "Trying a partial resynchronization using main channel (request %s:%s).", psync_replid, psync_offset); - } else if (link != server.primary) { - serverLog(LL_NOTICE, "Partial resynchronization not attempted (not primary replication)"); } else if (server.cached_primary) { psync_replid = server.cached_primary->repl_data->replid; snprintf(psync_offset, sizeof(psync_offset), "%lld", server.cached_primary->repl_data->reploff + 1); serverLog(LL_NOTICE, "Trying a partial resynchronization (request %s:%s).", psync_replid, psync_offset); } else { serverLog(LL_NOTICE, "Partial resynchronization not possible (no cached primary)"); - } - if (!psync_replid) { psync_replid = "?"; memcpy(psync_offset, "-1", 3); } @@ -3330,26 +3192,26 @@ int replicaTryPartialResynchronization(replicationLink *link, int read_reply) { /* Issue the PSYNC command, if this is a primary with a failover in * progress then send the failover argument to the replica to cause it * to become a primary */ - if (server.failover_state == FAILOVER_IN_PROGRESS && link == server.primary) { - reply = sendCommand(link->transfer_s, "PSYNC", psync_replid, psync_offset, "FAILOVER", NULL); + if (server.failover_state == FAILOVER_IN_PROGRESS) { + reply = sendCommand(conn, "PSYNC", psync_replid, psync_offset, "FAILOVER", NULL); } else { - reply = sendCommand(link->transfer_s, "PSYNC", psync_replid, psync_offset, NULL); + reply = sendCommand(conn, "PSYNC", psync_replid, psync_offset, NULL); } if (reply != NULL) { - serverLog(LL_WARNING, "Unable to send PSYNC to source: %s", reply); + serverLog(LL_WARNING, "Unable to send PSYNC to primary: %s", reply); sdsfree(reply); - connSetReadHandler(link->transfer_s, NULL); + connSetReadHandler(conn, NULL); return PSYNC_WRITE_ERROR; } return PSYNC_WAIT_REPLY; } /* Reading half */ - reply = receiveSynchronousResponse(link->transfer_s); + reply = receiveSynchronousResponse(conn); /* Primary did not reply to PSYNC */ if (reply == NULL) { - connSetReadHandler(link->transfer_s, NULL); + connSetReadHandler(conn, NULL); serverLog(LL_WARNING, "Primary did not reply to PSYNC, will try later"); return PSYNC_TRY_LATER; } @@ -3361,7 +3223,7 @@ int replicaTryPartialResynchronization(replicationLink *link, int read_reply) { return PSYNC_WAIT_REPLY; } - connSetReadHandler(link->transfer_s, NULL); + connSetReadHandler(conn, NULL); if (!strncmp(reply, "+FULLRESYNC", 11)) { char *replid = NULL, *offset = NULL; @@ -3380,31 +3242,24 @@ int replicaTryPartialResynchronization(replicationLink *link, int read_reply) { * reply means that the primary supports PSYNC, but the reply * format seems wrong. To stay safe we blank the primary * replid to make sure next PSYNCs will fail. 
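The branch above expects a reply of the form "+FULLRESYNC <replid> <offset>" and deliberately blanks the stored replid when the format looks wrong. A standalone sketch of that parse follows; the 40-character run-id length mirrors CONFIG_RUN_ID_SIZE but is stated here as an assumption, and the helper name is made up.

/* Illustrative parser for a "+FULLRESYNC <replid> <offset>" reply.
 * RUN_ID_SIZE mirrors CONFIG_RUN_ID_SIZE (assumed 40 here). */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define RUN_ID_SIZE 40

static int parseFullresync(const char *reply, char *replid_out, long long *offset_out) {
    if (strncmp(reply, "+FULLRESYNC", 11) != 0) return -1;
    const char *id = strchr(reply, ' ');
    const char *off = id ? strchr(id + 1, ' ') : NULL;
    /* Malformed reply: refuse it rather than trust a partial run id. */
    if (!id || !off || (size_t)(off - (id + 1)) != RUN_ID_SIZE) return -1;
    memcpy(replid_out, id + 1, RUN_ID_SIZE);
    replid_out[RUN_ID_SIZE] = '\0';
    *offset_out = strtoll(off + 1, NULL, 10);
    return 0;
}

int main(void) {
    char id[RUN_ID_SIZE + 1];
    long long off;
    if (parseFullresync("+FULLRESYNC 0123456789012345678901234567890123456789 12345", id, &off) == 0)
        printf("full resync from %s at offset %lld\n", id, off);
    return 0;
}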
*/ - memset(link->replid, 0, CONFIG_RUN_ID_SIZE + 1); + memset(server.primary_replid, 0, CONFIG_RUN_ID_SIZE + 1); } else { - memcpy(link->replid, replid, offset - replid - 1); - link->replid[CONFIG_RUN_ID_SIZE] = '\0'; - link->initial_offset = strtoll(offset, NULL, 10); - serverLog(LL_NOTICE, "Full resync from primary: %s:%lld", link->replid, - link->initial_offset); + memcpy(server.primary_replid, replid, offset - replid - 1); + server.primary_replid[CONFIG_RUN_ID_SIZE] = '\0'; + server.primary_initial_offset = strtoll(offset, NULL, 10); + serverLog(LL_NOTICE, "Full resync from primary: %s:%lld", server.primary_replid, + server.primary_initial_offset); } sdsfree(reply); return PSYNC_FULLRESYNC; } if (!strncmp(reply, "+CONTINUE", 9)) { - if (link->rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) { - /* During dual channel sync session, primary struct is already initialized. */ + if (server.repl_rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) { + /* During dual channel sync session, primary struct is already initialized. */ sdsfree(reply); return PSYNC_CONTINUE; } - if (link != server.primary) { - /* Continuing from a cached primary should only happen when we are syncing for primary replication. */ - sdsfree(reply); - serverLog(LL_WARNING, "Received +CONTINUE response to PSYNC when not doing replication and not performing dual channel sync. Failing PSYNC."); - return PSYNC_NOT_SUPPORTED; - } - /* Partial resync was accepted. */ serverLog(LL_NOTICE, "Successful partial resynchronization with primary."); @@ -3441,7 +3296,7 @@ int replicaTryPartialResynchronization(replicationLink *link, int read_reply) { /* Setup the replication to continue. */ sdsfree(reply); - replicationResurrectCachedPrimary(link); + replicationResurrectCachedPrimary(conn); /* If this instance was restarted and we read the metadata to * PSYNC from the persistence file, our replication backlog could
Write error."); - *err = sdsnew(connGetLastError(link->transfer_s)); + *err = sdsnew(connGetLastError(conn)); return C_ERR; } return C_OK; } -int dualChannelReplMainConnRecvPsyncReply(replicationLink *link, sds *err) { - int psync_result = replicaTryPartialResynchronization(link, 1); +int dualChannelReplMainConnRecvPsyncReply(connection *conn, sds *err) { + int psync_result = replicaTryPartialResynchronization(conn, 1); if (psync_result == PSYNC_WAIT_REPLY) return C_OK; /* Try again later... */ if (psync_result == PSYNC_CONTINUE) { dualChannelServerLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Primary accepted a Partial Resynchronization%s", - link->rdb_transfer_s != NULL ? ", RDB load in background." : "."); + server.repl_rdb_transfer_s != NULL ? ", RDB load in background." : "."); if (server.supervised_mode == SUPERVISED_SYSTEMD) { serverCommunicateSystemd("STATUS=PRIMARY <-> REPLICA sync: Partial Resynchronization accepted. Ready to " "accept connections in read-write mode.\n"); } - dualChannelSyncHandlePsync(link); + dualChannelSyncHandlePsync(); return C_OK; } *err = getTryPsyncString(psync_result); @@ -3555,126 +3410,43 @@ void dualChannelSetupMainConnForPsync(connection *conn) { char *err = NULL; int ret; - replicationLink *link = (replicationLink *)connGetPrivateData(conn); - - switch (link->state) { + switch (server.repl_state) { case REPL_STATE_SEND_HANDSHAKE: - ret = dualChannelReplMainConnSendHandshake(link, &err); - if (ret == C_OK) link->state = REPL_STATE_RECEIVE_CAPA_REPLY; + ret = dualChannelReplMainConnSendHandshake(conn, &err); + if (ret == C_OK) server.repl_state = REPL_STATE_RECEIVE_CAPA_REPLY; break; case REPL_STATE_RECEIVE_CAPA_REPLY: - ret = dualChannelReplMainConnRecvCapaReply(link, &err); + ret = dualChannelReplMainConnRecvCapaReply(conn, &err); if (ret == C_ERR) { break; } - if (ret == C_OK) link->state = REPL_STATE_SEND_PSYNC; + if (ret == C_OK) server.repl_state = REPL_STATE_SEND_PSYNC; sdsfree(err); err = NULL; /* fall through */ case REPL_STATE_SEND_PSYNC: - ret = dualChannelReplMainConnSendPsync(link, &err); - if (ret == C_OK) link->state = REPL_STATE_RECEIVE_PSYNC_REPLY; + ret = dualChannelReplMainConnSendPsync(conn, &err); + if (ret == C_OK) server.repl_state = REPL_STATE_RECEIVE_PSYNC_REPLY; break; case REPL_STATE_RECEIVE_PSYNC_REPLY: - ret = dualChannelReplMainConnRecvPsyncReply(link, &err); - if (ret == C_OK && link->rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) - link->state = REPL_STATE_TRANSFER; - /* In case the RDB is already loaded, the repl_state will be set during establishSourceConnection. */ + ret = dualChannelReplMainConnRecvPsyncReply(conn, &err); + if (ret == C_OK && server.repl_rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) + server.repl_state = REPL_STATE_TRANSFER; + /* In case the RDB is already loaded, the repl_state will be set during establishPrimaryConnection. */ break; default: - serverPanic("Unexpected replication state: %d", link->state); + serverPanic("Unexpected replication state: %d", server.repl_state); } if (ret == C_ERR) { dualChannelServerLog(LL_WARNING, "Aborting dual channel sync. Main channel psync result %d %s", ret, err ? err : ""); - cancelReplicationHandshake(link, 1); + cancelReplicationHandshake(1); } sdsfree(err); } -/* - * Dual channel for full sync - * - * * Motivation * - * - Reduce primary memory load. We do that by moving the COB tracking to the replica side. This also decrease - * the chance for COB overruns. 
Note that primary's input buffer limits at the replica side are less restricted - * then primary's COB as the replica plays less critical part in the replication group. While increasing the - * primary's COB may end up with primary reaching swap and clients suffering, at replica side we're more at - * ease with it. Larger COB means better chance to sync successfully. - * - Reduce primary main process CPU load. By opening a new, dedicated channel for the RDB transfer, child - * processes can have direct access to the new channel. Due to TLS connection restrictions, this was not - * possible using one main channel. We eliminate the need for the child process to use the primary's - * child-proc -> main-proc pipeline, thus freeing up the main process to process clients queries. - * - * * High level interface design * - * - Dual channel sync begins when the replica sends a REPLCONF capa dual-channel to the primary during initial - * handshake. This allows the replica to verify whether the primary supports dual-channel-replication and, if - * so, state that this is the replica's main channel, which is not used for snapshot transfer. - * - When replica lacks sufficient data for PSYNC, the primary will send +DUALCHANNELSYNC response instead - * of RDB data. As a next step, the replica creates a new channel (rdb-channel) and configures it against - * the primary with the appropriate capabilities and requirements. The replica then requests a sync - * using the RDB channel. - * - Prior to forking, the primary sends the replica the snapshot's end repl-offset, and attaches the replica - * to the replication backlog to keep repl data until the replica requests psync. The replica uses the main - * channel to request a PSYNC starting at the snapshot end offset. - * - The primary main threads sends incremental changes via the main channel, while the bgsave process - * sends the RDB directly to the replica via the rdb-channel. As for the replica, the incremental - * changes are stored on a local buffer, while the RDB is loaded into memory. - * - Once the replica completes loading the rdb, it drops the rdb channel and streams the accumulated incremental - * changes into memory. Repl steady state continues normally. 
- * - * * Replica state machine * - * ┌───────────────────┐ Dual channel sync - * │RECEIVE_PING_REPLY │ ┌──────────────────────────────────────────────────────────────┐ - * └────────┬──────────┘ │ RDB channel states Main channel state │ - * │+PONG │ ┌────────────────────────────┐ ┌───────────────────┐ │ - * ┌────────▼──────────┐ ┌─┼─────►DUAL_CHANNEL_SEND_HANDSHAKE │ ┌─►SEND_HANDSHAKE │ │ - * │SEND_HANDSHAKE │ │ │ └────┬───────────────────────┘ │ └──┬────────────────┘ │ - * └────────┬──────────┘ │ │ │ │ │REPLCONF set-rdb-client-id - * │ │ │ ┌───────▼───────────────────────┐ │ ┌──▼────────────────┐ │ - * ┌────────▼──────────┐ │ │ │DUAL_CHANNEL_RECEIVE_AUTH_REPLY│ │ │RECEIVE_CAPA_REPLY │ │ - * │RECEIVE_AUTH_REPLY │ │ │ └───────┬───────────────────────┘ │ └──┬────────────────┘ │ - * └────────┬──────────┘ │ │ │+OK │ │+OK │ - * │+OK │ │ ┌───────▼───────────────────────┐ │ ┌──▼────────────────┐ │ - * ┌────────▼──────────┐ │ │ │DUAL_CHANNEL_RECEIVE_REPLCONF_REPLY│SEND_PSYNC │ │ - * │RECEIVE_PORT_REPLY │ │ │ └───────┬───────────────────────┘ │ └──┬────────────────┘ │ - * └────────┬──────────┘ │ │ │+OK │ │PSYNC use snapshot │ - * │+OK │ │ ┌───────▼───────────────────┐ │ │end-offset provided │ - * ┌────────▼──────────┐ │ │ │DUAL_CHANNEL_RECEIVE_ENDOFF│ │ │by the primary │ - * │RECEIVE_IP_REPLY │ │ │ └───────┬───────────────────┘ │ ┌──▼────────────────┐ │ - * └────────┬──────────┘ │ │ │$ENDOFF │ │RECEIVE_PSYNC_REPLY│ │ - * │+OK │ │ ├─────────────────────────┘ └──┬────────────────┘ │ - * ┌────────▼──────────┐ │ │ │ │+CONTINUE │ - * │RECEIVE_CAPA_REPLY │ │ │ ┌───────▼───────────────┐ ┌──▼────────────────┐ │ - * └────────┬──────────┘ │ │ │DUAL_CHANNEL_RDB_LOAD │ │TRANSFER │ │ - * │+OK │ │ └───────┬───────────────┘ └─────┬─────────────┘ │ - * ┌────────▼─────────────┐ │ │ │Done loading │ │ - * │RECEIVE_VERSION_REPLY │ │ │ ┌───────▼───────────────┐ │ │ - * └────────┬─────────────┘ │ │ │DUAL_CHANNEL_RDB_LOADED│ │ │ - * │+OK │ │ └───────┬───────────────┘ │ │ - * ┌────────▼───┐ │ │ │ │ │ - * │SEND_PSYNC │ │ │ │Replica loads local replication │ │ - * └─┬──────────┘ │ │ │buffer into memory │ │ - * │PSYNC (use cached-primary)│ │ └─────────┬───────────────────────┘ │ - * ┌─▼─────────────────┐ │ │ │ │ - * │RECEIVE_PSYNC_REPLY│ │ └────────────────────┼─────────────────────────────────────────┘ - * └────────┬─┬────────┘ │ │ - * +CONTINUE│ │+DUALCHANNELSYNC │ │ - * │ │ └─────────────────┘ │ - * │ │+FULLRESYNC │ - * │ ┌─▼─────────────────┐ ┌────▼──────────────┐ - * │ │TRANSFER ├───────────────────►CONNECTED │ - * │ └───────────────────┘ └────▲──────────────┘ - * │ │ - * └─────────────────────────────────────────────────┘ - */ -/* This handler fires when the non blocking connect was able to - * establish a connection with the primary. */ -void syncWithSource(connection *conn) { - char tmpfile[256], *err = NULL; - int psync_result; - - replicationLink *link = (replicationLink *)connGetPrivateData(conn); +int replicationProceedWithHandshake(connection *conn, int curr_state, slotBitmap slot_bitmap) { + char *err = NULL; /* Check for errors in the socket: after a non blocking connect() we * may find that the socket is in error state. */ @@ -3684,22 +3456,16 @@ void syncWithSource(connection *conn) { } /* Send a PING to check the primary is able to reply without errors. 
*/ - if (link->state == REPL_STATE_CONNECTING) { + if (curr_state == REPL_STATE_CONNECTING) { serverLog(LL_NOTICE, "Non blocking connect for SYNC fired the event."); - /* Delete the writable event so that the readable event remains - * registered and we can wait for the PONG reply. */ - connSetReadHandler(conn, syncWithSource); - connSetWriteHandler(conn, NULL); - link->state = REPL_STATE_RECEIVE_PING_REPLY; /* Send the PING, don't check for errors at all, we have the timeout * that will take care about this. */ err = sendCommand(conn, "PING", NULL); if (err) goto write_error; - return; + return REPL_STATE_RECEIVE_PING_REPLY; } - - /* Receive the PONG command. */ - if (link->state == REPL_STATE_RECEIVE_PING_REPLY) { + /* Receive the PONG command. */ + if (curr_state == REPL_STATE_RECEIVE_PING_REPLY) { err = receiveSynchronousResponse(conn); /* The primary did not reply */ @@ -3720,10 +3486,10 @@ void syncWithSource(connection *conn) { } sdsfree(err); err = NULL; - link->state = REPL_STATE_SEND_HANDSHAKE; + curr_state = REPL_STATE_SEND_HANDSHAKE; } - if (link->state == REPL_STATE_SEND_HANDSHAKE) { + if (curr_state == REPL_STATE_SEND_HANDSHAKE) { /* AUTH with the primary if required. */ if (server.primary_auth) { char *args[3] = {"AUTH", NULL, NULL}; @@ -3758,11 +3524,11 @@ void syncWithSource(connection *conn) { if (err) goto write_error; } - /* Set the slot number, so that the primary only provides us with the appropriate slot dictionary. */ - if (!isSlotBitmapAllSlots(link->slot_bitmap)) { + /* Set the slot bitmap, so that the primary only provides us with the appropriate slot dictionary. */ + if (slot_bitmap != NULL && !isSlotBitmapEmpty(slot_bitmap)) { char *argv[3] = {"REPLCONF", "slot-bitmap", NULL}; size_t lens[3] = {8, 11, 0}; - argv[2] = (char *)link->slot_bitmap; + argv[2] = (char *)slot_bitmap; lens[2] = sizeof(slotBitmap); err = sendCommandArgv(conn, 3, argv, lens); if (err) goto write_error; @@ -3783,30 +3549,28 @@ void syncWithSource(connection *conn) { err = sendCommand(conn, "REPLCONF", "version", VALKEY_VERSION, NULL); if (err) goto write_error; - link->state = REPL_STATE_RECEIVE_AUTH_REPLY; - return; + return REPL_STATE_RECEIVE_AUTH_REPLY; } - if (link->state == REPL_STATE_RECEIVE_AUTH_REPLY && !server.primary_auth) - link->state = REPL_STATE_RECEIVE_PORT_REPLY; + if (curr_state == REPL_STATE_RECEIVE_AUTH_REPLY && !server.primary_auth) + curr_state = REPL_STATE_RECEIVE_PORT_REPLY; /* Receive AUTH reply. */ - if (link->state == REPL_STATE_RECEIVE_AUTH_REPLY) { + if (curr_state == REPL_STATE_RECEIVE_AUTH_REPLY) { err = receiveSynchronousResponse(conn); if (err == NULL) goto no_response_error; if (err[0] == '-') { - serverLog(LL_WARNING, "Unable to AUTH to %s: %s", replicationGetNameForLogs(link), err); + serverLog(LL_WARNING, "Unable to AUTH to PRIMARY: %s", err); sdsfree(err); goto error; } sdsfree(err); err = NULL; - link->state = REPL_STATE_RECEIVE_PORT_REPLY; - return; + return REPL_STATE_RECEIVE_PORT_REPLY; } /* Receive REPLCONF listening-port reply. 
*/ - if (link->state == REPL_STATE_RECEIVE_PORT_REPLY) { + if (curr_state == REPL_STATE_RECEIVE_PORT_REPLY) { err = receiveSynchronousResponse(conn); if (err == NULL) goto no_response_error; /* Ignore the error if any, not all the Redis OSS versions support @@ -3818,15 +3582,14 @@ void syncWithSource(connection *conn) { err); } sdsfree(err); - link->state = REPL_STATE_RECEIVE_IP_REPLY; - return; + return REPL_STATE_RECEIVE_IP_REPLY; } - if (link->state == REPL_STATE_RECEIVE_IP_REPLY && !server.replica_announce_ip) - link->state = REPL_STATE_RECEIVE_SLOT_REPLY; + if (curr_state == REPL_STATE_RECEIVE_IP_REPLY && !server.replica_announce_ip) + curr_state = REPL_STATE_RECEIVE_SLOT_REPLY; /* Receive REPLCONF ip-address reply. */ - if (link->state == REPL_STATE_RECEIVE_IP_REPLY) { + if (curr_state == REPL_STATE_RECEIVE_IP_REPLY) { err = receiveSynchronousResponse(conn); if (err == NULL) goto no_response_error; /* Ignore the error if any, not all the Redis OSS versions support @@ -3838,59 +3601,173 @@ void syncWithSource(connection *conn) { err); } sdsfree(err); - link->state = REPL_STATE_RECEIVE_SLOT_REPLY; - return; + return REPL_STATE_RECEIVE_SLOT_REPLY; } - if (link->state == REPL_STATE_RECEIVE_SLOT_REPLY && isSlotBitmapAllSlots(link->slot_bitmap)) - link->state = REPL_STATE_RECEIVE_CAPA_REPLY; + if (curr_state == REPL_STATE_RECEIVE_SLOT_REPLY && (slot_bitmap == NULL || isSlotBitmapEmpty(slot_bitmap))) + curr_state = REPL_STATE_RECEIVE_CAPA_REPLY; - if (link->state == REPL_STATE_RECEIVE_SLOT_REPLY) { + if (curr_state == REPL_STATE_RECEIVE_SLOT_REPLY) { err = receiveSynchronousResponse(conn); if (err == NULL) goto no_response_error; - /* If we sent the slot number, we need it to be properly acked, or we can't do slot migration. */ + /* If we sent the slot bitmap, we need it to be properly acked, or we can't do slot migration. */ if (err[0] == '-') { serverLog(LL_WARNING, "Source does not understand REPLCONF slot-num. Cannot continue with slot-level sync: %s", err); sdsfree(err); goto error; } sdsfree(err); - link->state = REPL_STATE_RECEIVE_CAPA_REPLY; - return; + return REPL_STATE_RECEIVE_CAPA_REPLY; } + /* Receive CAPA reply. */ - if (link->state == REPL_STATE_RECEIVE_CAPA_REPLY) { + if (curr_state == REPL_STATE_RECEIVE_CAPA_REPLY) { err = receiveSynchronousResponse(conn); if (err == NULL) goto no_response_error; /* Ignore the error if any, not all the Redis OSS versions support * REPLCONF capa. */ if (err[0] == '-') { serverLog(LL_NOTICE, - "(Non critical) Source does not understand " + "(Non critical) Primary does not understand " "REPLCONF capa: %s", err); } sdsfree(err); err = NULL; - link->state = REPL_STATE_RECEIVE_VERSION_REPLY; - return; + return REPL_STATE_RECEIVE_VERSION_REPLY; } /* Receive VERSION reply. */ - if (link->state == REPL_STATE_RECEIVE_VERSION_REPLY) { + if (curr_state == REPL_STATE_RECEIVE_VERSION_REPLY) { err = receiveSynchronousResponse(conn); if (err == NULL) goto no_response_error; /* Ignore the error if any. Valkey >= 8 supports REPLCONF VERSION. 
*/ if (err[0] == '-') { serverLog(LL_NOTICE, - "(Non critical) Source does not understand " + "(Non critical) Primary does not understand " "REPLCONF VERSION: %s", err); } sdsfree(err); err = NULL; - link->state = REPL_STATE_SEND_PSYNC; + return REPL_STATE_SEND_PSYNC; + } + + +no_response_error: /* Handle receiveSynchronousResponse() error when primary has no reply */ + serverLog(LL_WARNING, "Primary did not respond to command during SYNC handshake"); + /* Fall through to regular error handling */ + +error: + return REPL_STATE_ERROR; + +write_error: /* Handle sendCommand() errors. */ + serverLog(LL_WARNING, "Sending command to primary in replication handshake: %s", err); + sdsfree(err); + goto error; +} + +/* + * Dual channel for full sync + * + * * Motivation * + * - Reduce primary memory load. We do that by moving the COB tracking to the replica side. This also decrease + * the chance for COB overruns. Note that primary's input buffer limits at the replica side are less restricted + * then primary's COB as the replica plays less critical part in the replication group. While increasing the + * primary's COB may end up with primary reaching swap and clients suffering, at replica side we're more at + * ease with it. Larger COB means better chance to sync successfully. + * - Reduce primary main process CPU load. By opening a new, dedicated channel for the RDB transfer, child + * processes can have direct access to the new channel. Due to TLS connection restrictions, this was not + * possible using one main channel. We eliminate the need for the child process to use the primary's + * child-proc -> main-proc pipeline, thus freeing up the main process to process clients queries. + * + * * High level interface design * + * - Dual channel sync begins when the replica sends a REPLCONF capa dual-channel to the primary during initial + * handshake. This allows the replica to verify whether the primary supports dual-channel-replication and, if + * so, state that this is the replica's main channel, which is not used for snapshot transfer. + * - When replica lacks sufficient data for PSYNC, the primary will send +DUALCHANNELSYNC response instead + * of RDB data. As a next step, the replica creates a new channel (rdb-channel) and configures it against + * the primary with the appropriate capabilities and requirements. The replica then requests a sync + * using the RDB channel. + * - Prior to forking, the primary sends the replica the snapshot's end repl-offset, and attaches the replica + * to the replication backlog to keep repl data until the replica requests psync. The replica uses the main + * channel to request a PSYNC starting at the snapshot end offset. + * - The primary main threads sends incremental changes via the main channel, while the bgsave process + * sends the RDB directly to the replica via the rdb-channel. As for the replica, the incremental + * changes are stored on a local buffer, while the RDB is loaded into memory. + * - Once the replica completes loading the rdb, it drops the rdb channel and streams the accumulated incremental + * changes into memory. Repl steady state continues normally. 
+ * + * * Replica state machine * + * ┌───────────────────┐ Dual channel sync + * │RECEIVE_PING_REPLY │ ┌──────────────────────────────────────────────────────────────┐ + * └────────┬──────────┘ │ RDB channel states Main channel state │ + * │+PONG │ ┌────────────────────────────┐ ┌───────────────────┐ │ + * ┌────────▼──────────┐ ┌─┼─────►DUAL_CHANNEL_SEND_HANDSHAKE │ ┌─►SEND_HANDSHAKE │ │ + * │SEND_HANDSHAKE │ │ │ └────┬───────────────────────┘ │ └──┬────────────────┘ │ + * └────────┬──────────┘ │ │ │ │ │REPLCONF set-rdb-client-id + * │ │ │ ┌───────▼───────────────────────┐ │ ┌──▼────────────────┐ │ + * ┌────────▼──────────┐ │ │ │DUAL_CHANNEL_RECEIVE_AUTH_REPLY│ │ │RECEIVE_CAPA_REPLY │ │ + * │RECEIVE_AUTH_REPLY │ │ │ └───────┬───────────────────────┘ │ └──┬────────────────┘ │ + * └────────┬──────────┘ │ │ │+OK │ │+OK │ + * │+OK │ │ ┌───────▼───────────────────────┐ │ ┌──▼────────────────┐ │ + * ┌────────▼──────────┐ │ │ │DUAL_CHANNEL_RECEIVE_REPLCONF_REPLY│SEND_PSYNC │ │ + * │RECEIVE_PORT_REPLY │ │ │ └───────┬───────────────────────┘ │ └──┬────────────────┘ │ + * └────────┬──────────┘ │ │ │+OK │ │PSYNC use snapshot │ + * │+OK │ │ ┌───────▼───────────────────┐ │ │end-offset provided │ + * ┌────────▼──────────┐ │ │ │DUAL_CHANNEL_RECEIVE_ENDOFF│ │ │by the primary │ + * │RECEIVE_IP_REPLY │ │ │ └───────┬───────────────────┘ │ ┌──▼────────────────┐ │ + * └────────┬──────────┘ │ │ │$ENDOFF │ │RECEIVE_PSYNC_REPLY│ │ + * │+OK │ │ ├─────────────────────────┘ └──┬────────────────┘ │ + * ┌────────▼──────────┐ │ │ │ │+CONTINUE │ + * │RECEIVE_CAPA_REPLY │ │ │ ┌───────▼───────────────┐ ┌──▼────────────────┐ │ + * └────────┬──────────┘ │ │ │DUAL_CHANNEL_RDB_LOAD │ │TRANSFER │ │ + * │+OK │ │ └───────┬───────────────┘ └─────┬─────────────┘ │ + * ┌────────▼─────────────┐ │ │ │Done loading │ │ + * │RECEIVE_VERSION_REPLY │ │ │ ┌───────▼───────────────┐ │ │ + * └────────┬─────────────┘ │ │ │DUAL_CHANNEL_RDB_LOADED│ │ │ + * │+OK │ │ └───────┬───────────────┘ │ │ + * ┌────────▼───┐ │ │ │ │ │ + * │SEND_PSYNC │ │ │ │Replica loads local replication │ │ + * └─┬──────────┘ │ │ │buffer into memory │ │ + * │PSYNC (use cached-primary)│ │ └─────────┬───────────────────────┘ │ + * ┌─▼─────────────────┐ │ │ │ │ + * │RECEIVE_PSYNC_REPLY│ │ └────────────────────┼─────────────────────────────────────────┘ + * └────────┬─┬────────┘ │ │ + * +CONTINUE│ │+DUALCHANNELSYNC │ │ + * │ │ └─────────────────┘ │ + * │ │+FULLRESYNC │ + * │ ┌─▼─────────────────┐ ┌────▼──────────────┐ + * │ │TRANSFER ├───────────────────►CONNECTED │ + * │ └───────────────────┘ └────▲──────────────┘ + * │ │ + * └─────────────────────────────────────────────────┘ + */ +/* This handler fires when the non blocking connect was able to + * establish a connection with the primary. */ +void syncWithPrimary(connection *conn) { + char tmpfile[256], *err = NULL; + int psync_result; + + /* If this event fired after the user turned the instance into a primary + * with REPLICAOF NO ONE we must just return ASAP. */ + if (server.repl_state == REPL_STATE_NONE) { + connClose(conn); + return; + } + + if (server.repl_state < REPL_STATE_SEND_PSYNC) { + server.repl_state = replicationProceedWithHandshake(conn, server.repl_state, NULL); + + if (server.repl_state == REPL_STATE_RECEIVE_PING_REPLY) { + /* Delete the writable event so that the readable event remains + * registered and we can wait for the PONG reply. 
*/ + connSetReadHandler(conn, syncWithPrimary); + connSetWriteHandler(conn, NULL); + } else if (server.repl_state == REPL_STATE_ERROR) { + goto error; + } } /* Try a partial resynchronization. If we don't have a cached primary @@ -3898,32 +3775,32 @@ void syncWithSource(connection *conn) { * to start a full resynchronization so that we get the primary replid * and the global offset, to try a partial resync at the next * reconnection attempt. */ - if (link->state == REPL_STATE_SEND_PSYNC) { - if (replicaTryPartialResynchronization(link, 0) == PSYNC_WRITE_ERROR) { + if (server.repl_state == REPL_STATE_SEND_PSYNC) { + if (replicaTryPartialResynchronization(conn, 0) == PSYNC_WRITE_ERROR) { err = sdsnew("Write error sending the PSYNC command."); abortFailover("Write error to failover target"); goto write_error; } - link->state = REPL_STATE_RECEIVE_PSYNC_REPLY; + server.repl_state = REPL_STATE_RECEIVE_PSYNC_REPLY; return; } /* If reached this point, we should be in REPL_STATE_RECEIVE_PSYNC_REPLY. */ - if (link->state != REPL_STATE_RECEIVE_PSYNC_REPLY) { + if (server.repl_state != REPL_STATE_RECEIVE_PSYNC_REPLY) { serverLog(LL_WARNING, - "syncWithSource(): state machine error, " + "syncWithPrimary(): state machine error, " "state should be RECEIVE_PSYNC but is %d", - link->state); + server.repl_state); goto error; } - psync_result = replicaTryPartialResynchronization(link, 1); + psync_result = replicaTryPartialResynchronization(conn, 1); if (psync_result == PSYNC_WAIT_REPLY) return; /* Try again later... */ /* Check the status of the planned failover. We expect PSYNC_CONTINUE, * but there is nothing technically wrong with a full resync which * could happen in edge cases. */ - if (server.failover_state == FAILOVER_IN_PROGRESS && link == server.primary) { + if (server.failover_state == FAILOVER_IN_PROGRESS) { if (psync_result == PSYNC_CONTINUE || psync_result == PSYNC_FULLRESYNC) { clearFailoverState(); } else { @@ -3956,13 +3833,13 @@ void syncWithSource(connection *conn) { if (psync_result == PSYNC_NOT_SUPPORTED) { serverLog(LL_NOTICE, "Retrying with SYNC..."); if (connSyncWrite(conn, "SYNC\r\n", 6, server.repl_syncio_timeout * 1000) == -1) { - serverLog(LL_WARNING, "I/O error writing to %s: %s", replicationGetNameForLogs(link), connGetLastError(conn)); + serverLog(LL_WARNING, "I/O error writing to PRIMARY: %s", connGetLastError(conn)); goto error; } } /* Prepare a suitable temp file for bulk transfer */ - if (!useDisklessLoad() && isSlotBitmapAllSlots(link->slot_bitmap)) { + if (!useDisklessLoad()) { int dfd = -1, maxtries = 5; while (maxtries--) { snprintf(tmpfile, 256, "temp-%d.%ld.rdb", (int)server.unixtime, (long int)getpid()); @@ -3975,33 +3852,24 @@ void syncWithSource(connection *conn) { errno = saved_errno; } if (dfd == -1) { - serverLog(LL_WARNING, "Opening the temp file needed for %s <-> REPLICA synchronization: %s", replicationGetNameForLogs(link), + serverLog(LL_WARNING, "Opening the temp file needed for PRIMARY <-> REPLICA synchronization: %s", strerror(errno)); goto error; } - link->transfer_tmpfile = zstrdup(tmpfile); - link->transfer_fd = dfd; - } - - /* We are going to need to do a full resync. If we are accepting a - * slot subset - make sure we have a clean state to load it into. This may - * happen in cases where a previous replication attempt failed and is being - * retried. 
*/ - if (!isSlotBitmapAllSlots(link->slot_bitmap)) { - dropKeysInSlotBitmap(link->slot_bitmap, 1); + server.repl_transfer_tmpfile = zstrdup(tmpfile); + server.repl_transfer_fd = dfd; } /* Using dual-channel-replication, the primary responded +DUALCHANNELSYNC. We need to * initialize the RDB channel. */ if (psync_result == PSYNC_FULLRESYNC_DUAL_CHANNEL) { /* Create RDB connection */ - link->rdb_transfer_s = connCreate(connTypeOfReplication()); - connSetPrivateData(link->rdb_transfer_s, link); - if (connConnect(link->rdb_transfer_s, link->host, link->port, server.bind_source_addr, - dualChannelFullSyncWithReplicationSource) == C_ERR) { - serverLog(LL_WARNING, "Unable to connect to source: %s", connGetLastError(link->transfer_s)); - connClose(link->rdb_transfer_s); - link->rdb_transfer_s = NULL; + server.repl_rdb_transfer_s = connCreate(connTypeOfReplication()); + if (connConnect(server.repl_rdb_transfer_s, server.primary_host, server.primary_port, server.bind_source_addr, + dualChannelFullSyncWithPrimary) == C_ERR) { + serverLog(LL_WARNING, "Unable to connect to Primary: %s", connGetLastError(server.repl_transfer_s)); + connClose(server.repl_rdb_transfer_s); + server.repl_rdb_transfer_s = NULL; goto error; } if (connSetReadHandler(conn, NULL) == C_ERR) { @@ -4010,50 +3878,36 @@ void syncWithSource(connection *conn) { connGetInfo(conn, conninfo, sizeof(conninfo))); goto error; } - link->rdb_channel_state = REPL_DUAL_CHANNEL_SEND_HANDSHAKE; + server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_SEND_HANDSHAKE; return; } - if (replicationUseAOFFormatSnapshot(link)) { - link->client = createReplicationLinkClientWithHandler(link, conn, -1, readQueryFromClient); - link->transfer_s = NULL; - } else { - /* Setup the non blocking download of the bulk file. */ - if (connSetReadHandler(conn, readSyncBulkPayload) == C_ERR) { - char conninfo[CONN_INFO_LEN]; - serverLog(LL_WARNING, "Can't create readable event for SYNC: %s (%s)", strerror(errno), - connGetInfo(conn, conninfo, sizeof(conninfo))); - goto error; - } + /* Setup the non blocking download of the bulk file. 
*/ + if (connSetReadHandler(conn, readSyncBulkPayload) == C_ERR) { + char conninfo[CONN_INFO_LEN]; + serverLog(LL_WARNING, "Can't create readable event for SYNC: %s (%s)", strerror(errno), + connGetInfo(conn, conninfo, sizeof(conninfo))); + goto error; } - link->state = REPL_STATE_TRANSFER; - link->transfer_size = -1; - link->transfer_read = 0; - link->transfer_last_fsync_off = 0; - link->transfer_lastio = server.unixtime; + server.repl_state = REPL_STATE_TRANSFER; + server.repl_transfer_size = -1; + server.repl_transfer_read = 0; + server.repl_transfer_last_fsync_off = 0; + server.repl_transfer_lastio = server.unixtime; return; -no_response_error: /* Handle receiveSynchronousResponse() error when primary has no reply */ - serverLog(LL_WARNING, "Primary did not respond to command during SYNC handshake"); - /* Fall through to regular error handling */ - error: connClose(conn); - link->transfer_s = NULL; - if (link->rdb_transfer_s) { - connClose(link->rdb_transfer_s); - link->rdb_transfer_s = NULL; - } - if (link->transfer_fd != -1) close(link->transfer_fd); - if (link->transfer_tmpfile) zfree(link->transfer_tmpfile); - link->transfer_tmpfile = NULL; - link->transfer_fd = -1; - link->state = REPL_STATE_CONNECT; - if (link->client) { - freeClient(link->client); - link->client = NULL; + server.repl_transfer_s = NULL; + if (server.repl_rdb_transfer_s) { + connClose(server.repl_rdb_transfer_s); + server.repl_rdb_transfer_s = NULL; } - return; + if (server.repl_transfer_fd != -1) close(server.repl_transfer_fd); + if (server.repl_transfer_tmpfile) zfree(server.repl_transfer_tmpfile); + server.repl_transfer_tmpfile = NULL; + server.repl_transfer_fd = -1; + server.repl_state = REPL_STATE_CONNECT; write_error: /* Handle sendCommand() errors. */ serverLog(LL_WARNING, "Sending command to primary in replication handshake: %s", err); @@ -4061,108 +3915,20 @@ void syncWithSource(connection *conn) { goto error; } -replicationLink *createReplicationLink(char *host, int port, slotBitmap slot_bitmap) { - replicationLink *result = (replicationLink *)zmalloc(sizeof(replicationLink)); - result->protected = 0; - result->state = REPL_STATE_NONE; - result->rdb_channel_state = REPL_DUAL_CHANNEL_STATE_NONE; - memcpy(result->slot_bitmap, slot_bitmap, sizeof(slotBitmap)); - result->client = NULL; - result->host = sdsnew(host); - result->port = port; - result->transfer_s = NULL; - result->rdb_transfer_s = NULL; - result->rdb_client_id = -1; - result->replid[0] = '\0'; - result->initial_offset = -1; - result->transfer_size = 0; - result->transfer_read = 0; - result->transfer_last_fsync_off = 0; - result->transfer_fd = -1; - result->transfer_tmpfile = NULL; - result->transfer_lastio = 0; - result->provisional_source_state.replid[0] = '\0'; - result->provisional_source_state.reploff = -1; - result->provisional_source_state.read_reploff = -1; - result->provisional_source_state.dbid = -1; - result->pending_repl_data.blocks = NULL; - result->pending_repl_data.len = 0; - result->pending_repl_data.peak = 0; - listAddNodeTail(server.replication_links, result); - return result; -} - - -int freeReplicationLink(replicationLink *link) { - if (!link) return 0; - - /* Free primary_host before any calls to freeClient since it calls - * replicationHandleSourceDisconnection which can trigger a re-connect - * directly from within that call. 
*/ - sdsfree(link->host); - link->host = NULL; - - cancelReplicationHandshake(link, 0); - if (link->client) { - freeClient(link->client); - link->client = NULL; - } - - if (link->transfer_s) { - connClose(link->transfer_s); - link->transfer_s = NULL; - } - if (link->rdb_transfer_s) { - connClose(link->rdb_transfer_s); - link->rdb_transfer_s = NULL; - } - if (link->transfer_tmpfile) { - zfree(link->transfer_tmpfile); - link->transfer_tmpfile = NULL; - } - if (link->transfer_fd != -1) { - close(link->transfer_fd); - link->transfer_fd = -1; - } - freePendingReplDataBuf(link); - - /* Unlink this replication link from the server list */ - listIter li; - listNode *ln; - listRewind(server.replication_links, &li); - while ((ln = listNext(&li))) { - replicationLink *elem = (replicationLink *)ln->value; - if (elem == link) { - listDelNode(server.replication_links, ln); - break; - } - } - - /* Keep the link intact if it is protected, but mark it as such */ - if (link->protected) { - link->state = REPL_STATE_CANCELLED; - return 0; - } - zfree(link); - return 1; -} - -int connectReplicationLink(replicationLink *link) { - if (!link) - return C_ERR; - - link->transfer_s = connCreate(connTypeOfReplication()); - connSetPrivateData(link->transfer_s, link); - if (connConnect(link->transfer_s, link->host, link->port, server.bind_source_addr, syncWithSource) == C_ERR) { - serverLog(LL_WARNING, "Unable to connect to %s: %s", replicationGetNameForLogs(link), connGetLastError(link->transfer_s)); - connClose(link->transfer_s); - link->transfer_s = NULL; +int connectWithPrimary(void) { + server.repl_transfer_s = connCreate(connTypeOfReplication()); + if (connConnect(server.repl_transfer_s, server.primary_host, server.primary_port, server.bind_source_addr, + syncWithPrimary) == C_ERR) { + serverLog(LL_WARNING, "Unable to connect to PRIMARY: %s", connGetLastError(server.repl_transfer_s)); + connClose(server.repl_transfer_s); + server.repl_transfer_s = NULL; return C_ERR; } - link->transfer_lastio = server.unixtime; - link->state = REPL_STATE_CONNECTING; - serverLog(LL_NOTICE, "%s <-> REPLICA sync started", replicationGetNameForLogs(link)); + + server.repl_transfer_lastio = server.unixtime; + server.repl_state = REPL_STATE_CONNECTING; + serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync started"); return C_OK; } @@ -4170,27 +3936,23 @@ int connectReplicationLink(replicationLink *link) { * in progress to undo it. * Never call this function directly, use cancelReplicationHandshake() instead. */ -void undoConnectWithSource(replicationLink *link) { - if (link->client) { - freeClient(link->client); - } else if (link->transfer_s) { - connClose(link->transfer_s); - link->transfer_s = NULL; - } +void undoConnectWithPrimary(void) { + connClose(server.repl_transfer_s); + server.repl_transfer_s = NULL; } /* Abort the async download of the bulk dataset while SYNC-ing with primary. * Never call this function directly, use cancelReplicationHandshake() instead. 
*/ -void replicationAbortSyncTransfer(replicationLink *link) { - serverAssert(link->state == REPL_STATE_TRANSFER); - undoConnectWithSource(link); - if (link->transfer_fd != -1) { - close(link->transfer_fd); - bg_unlink(link->transfer_tmpfile); - zfree(link->transfer_tmpfile); - link->transfer_tmpfile = NULL; - link->transfer_fd = -1; +void replicationAbortSyncTransfer(void) { + serverAssert(server.repl_state == REPL_STATE_TRANSFER); + undoConnectWithPrimary(); + if (server.repl_transfer_fd != -1) { + close(server.repl_transfer_fd); + bg_unlink(server.repl_transfer_tmpfile); + zfree(server.repl_transfer_tmpfile); + server.repl_transfer_tmpfile = NULL; + server.repl_transfer_fd = -1; } } @@ -4199,22 +3961,19 @@ void replicationAbortSyncTransfer(replicationLink *link) { * the initial bulk transfer. * * If there was a replication handshake in progress 1 is returned and - * the replication state (link->state) set to REPL_STATE_CONNECT. + * the replication state (server.repl_state) set to REPL_STATE_CONNECT. * * Otherwise zero is returned and no operation is performed at all. */ -int cancelReplicationHandshake(replicationLink *link, int reconnect) { - if (link->rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) { - replicationAbortDualChannelSyncTransfer(link); - } - if (link->state == REPL_STATE_TRANSFER) { - replicationAbortSyncTransfer(link); - /* Note that disconnection may already trigger reconnect */ - if (link->state == REPL_STATE_CONNECTING) - return 1; - link->state = REPL_STATE_CONNECT; - } else if (link->state == REPL_STATE_CONNECTING || replicaIsInHandshakeState(link)) { - undoConnectWithSource(link); - link->state = REPL_STATE_CONNECT; +int cancelReplicationHandshake(int reconnect) { + if (server.repl_rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) { + replicationAbortDualChannelSyncTransfer(); + } + if (server.repl_state == REPL_STATE_TRANSFER) { + replicationAbortSyncTransfer(); + server.repl_state = REPL_STATE_CONNECT; + } else if (server.repl_state == REPL_STATE_CONNECTING || replicaIsInHandshakeState()) { + undoConnectWithPrimary(); + server.repl_state = REPL_STATE_CONNECT; } else { return 0; } @@ -4223,32 +3982,34 @@ int cancelReplicationHandshake(replicationLink *link, int reconnect) { /* try to re-connect without waiting for replicationCron, this is needed * for the "diskless loading short read" test. */ - serverLog(LL_NOTICE, "Reconnecting to replication source %s:%d after failure", link->host, link->port); - connectReplicationLink(link); + serverLog(LL_NOTICE, "Reconnecting to PRIMARY %s:%d after failure", server.primary_host, server.primary_port); + connectWithPrimary(); return 1; } /* Set replication to the specified primary address and port. */ void replicationSetPrimary(char *ip, int port, int full_sync_required) { - int was_primary = server.primary == NULL; - int was_connected = server.primary->state == REPL_STATE_CONNECTED; + int was_primary = server.primary_host == NULL; + sdsfree(server.primary_host); + server.primary_host = NULL; if (server.primary) { /* When joining 'myself' to a new primary, set the dont_cache_primary flag * if a full sync is required. This happens when 'myself' was previously * part of a different shard from the new primary. Since 'myself' does not * have the replication history of the shard it is joining, clearing the * cached primary is necessary to ensure proper replication behavior. 
*/ - server.primary->client->flag.dont_cache_primary = full_sync_required; - freeReplicationLink(server.primary); + server.primary->flag.dont_cache_primary = full_sync_required; + freeClient(server.primary); } disconnectAllBlockedClients(); /* Clients blocked in primary, now replica. */ /* Setting primary_host only after the call to freeClient since it calls - * replicationHandleSourceDisconnection which can trigger a re-connect + * replicationHandlePrimaryDisconnection which can trigger a re-connect * directly from within that call. */ - server.primary = createReplicationLink(ip, port, NULL); + server.primary_host = sdsnew(ip); + server.primary_port = port; /* Update oom_score_adj */ setOOMScoreAdj(-1); @@ -4259,6 +4020,8 @@ void replicationSetPrimary(char *ip, int port, int full_sync_required) { * primary, or finishing transferring RDB and preparing loading DB on full * sync with new primary. */ + cancelReplicationHandshake(0); + /* Before destroying our primary state, create a cached primary using * our own parameters, to later PSYNC with the new primary. */ if (was_primary && !full_sync_required) { @@ -4271,26 +4034,31 @@ void replicationSetPrimary(char *ip, int port, int full_sync_required) { NULL); /* Fire the primary link modules event. */ - if (was_connected) + if (server.repl_state == REPL_STATE_CONNECTED) moduleFireServerEvent(VALKEYMODULE_EVENT_PRIMARY_LINK_CHANGE, VALKEYMODULE_SUBEVENT_PRIMARY_LINK_DOWN, NULL); + server.repl_state = REPL_STATE_CONNECT; /* Allow trying dual-channel-replication with the new primary. If new primary doesn't * support dual-channel-replication, we will set to 0 afterwards. */ - serverLog(LL_NOTICE, "Connecting to PRIMARY %s:%d", server.primary->host, server.primary->port); - connectReplicationLink(server.primary); + serverLog(LL_NOTICE, "Connecting to PRIMARY %s:%d", server.primary_host, server.primary_port); + connectWithPrimary(); } /* Cancel replication, setting the instance as a primary itself. */ void replicationUnsetPrimary(void) { - if (server.primary == NULL) return; /* Nothing to do. */ + if (server.primary_host == NULL) return; /* Nothing to do. */ /* Fire the primary link modules event. */ - if (server.primary->state == REPL_STATE_CONNECTED) + if (server.repl_state == REPL_STATE_CONNECTED) moduleFireServerEvent(VALKEYMODULE_EVENT_PRIMARY_LINK_CHANGE, VALKEYMODULE_SUBEVENT_PRIMARY_LINK_DOWN, NULL); - freeReplicationLink(server.primary); + /* Clear primary_host first, since the freeClient calls + * replicationHandlePrimaryDisconnection which can attempt to re-connect. */ + sdsfree(server.primary_host); + server.primary_host = NULL; + if (server.primary) freeClient(server.primary); replicationDiscardCachedPrimary(); - + cancelReplicationHandshake(0); /* When a replica is turned into a primary, the current replication ID * (that was inherited from the primary at synchronization time) is * used as secondary ID up to the current offset, and a new replication @@ -4301,6 +4069,7 @@ void replicationUnsetPrimary(void) { * the replicas will be able to partially resync with us, so it will be * a very fast reconnection. */ disconnectReplicas(); + server.repl_state = REPL_STATE_NONE; /* We need to make sure the new primary will start the replication stream * with a SELECT statement. This is forced after a full resync, but @@ -4331,37 +4100,23 @@ void replicationUnsetPrimary(void) { /* This function is called when the replica lose the connection with the * primary into an unexpected way. 
*/ -void replicationHandleSourceDisconnection(replicationLink *link) { - if (link == server.primary) { - if (link->state == REPL_STATE_CONNECTED && link == server.primary) { - moduleFireServerEvent(VALKEYMODULE_EVENT_PRIMARY_LINK_CHANGE, VALKEYMODULE_SUBEVENT_PRIMARY_LINK_DOWN, NULL); - } - server.repl_down_since = server.unixtime; - - /* We lost connection with our primary, don't disconnect replicas yet, - * maybe we'll be able to PSYNC with our primary later. We'll disconnect - * the replicas only if we'll have to do a full resync with our primary. */ - } +void replicationHandlePrimaryDisconnection(void) { + /* Fire the primary link modules event. */ + if (server.repl_state == REPL_STATE_CONNECTED) + moduleFireServerEvent(VALKEYMODULE_EVENT_PRIMARY_LINK_CHANGE, VALKEYMODULE_SUBEVENT_PRIMARY_LINK_DOWN, NULL); - link->client = NULL; - link->state = REPL_STATE_CONNECT; - - if (link->rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) { - /* Our client was closed in the middle of dual channel (e.g, we were - * loading AOF as a client). Ensure that the other dual channel - * connections are cleaned up. */ - if (link->transfer_s) { - connClose(link->transfer_s); - link->transfer_s = NULL; - } - replicationAbortDualChannelSyncTransfer(link); - } + server.primary = NULL; + server.repl_state = REPL_STATE_CONNECT; + server.repl_down_since = server.unixtime; + /* We lost connection with our primary, don't disconnect replicas yet, + * maybe we'll be able to PSYNC with our primary later. We'll disconnect + * the replicas only if we'll have to do a full resync with our primary. */ /* Try to re-connect immediately rather than wait for replicationCron * waiting 1 second may risk backlog being recycled. */ - if (link->host) { - serverLog(LL_NOTICE, "Reconnecting to replication source %s:%d", link->host, link->port); - connectReplicationLink(link); + if (server.primary_host) { + serverLog(LL_NOTICE, "Reconnecting to PRIMARY %s:%d", server.primary_host, server.primary_port); + connectWithPrimary(); } } @@ -4381,7 +4136,7 @@ void replicaofCommand(client *c) { /* The special host/port combination "NO" "ONE" turns the instance * into a primary. Otherwise the new primary address is set. */ if (!strcasecmp(c->argv[1]->ptr, "no") && !strcasecmp(c->argv[2]->ptr, "one")) { - if (server.primary) { + if (server.primary_host) { replicationUnsetPrimary(); sds client = catClientInfoShortString(sdsempty(), c, server.hide_user_data_from_log); serverLog(LL_NOTICE, "PRIMARY MODE enabled (user request from '%s')", client); @@ -4401,7 +4156,7 @@ void replicaofCommand(client *c) { if (getRangeLongFromObjectOrReply(c, c->argv[2], 0, 65535, &port, "Invalid master port") != C_OK) return; /* Check if we are already attached to the specified primary */ - if (server.primary && !strcasecmp(server.primary->host, c->argv[1]->ptr) && server.primary->port == port) { + if (server.primary_host && !strcasecmp(server.primary_host, c->argv[1]->ptr) && server.primary_port == port) { serverLog(LL_NOTICE, "REPLICAOF would result into synchronization " "with the primary we are already connected " "with. No operation performed."); @@ -4413,8 +4168,8 @@ void replicaofCommand(client *c) { * we can continue. 
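 *
 * For reference, the user-facing forms handled above (the address is a
 * placeholder):
 *
 *     REPLICAOF 10.0.0.5 6379    -> replicationSetPrimary(...), start syncing
 *     REPLICAOF NO ONE           -> replicationUnsetPrimary(), act as primary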
*/ replicationSetPrimary(c->argv[1]->ptr, port, 0); sds client = catClientInfoShortString(sdsempty(), c, server.hide_user_data_from_log); - serverLog(LL_NOTICE, "REPLICAOF %s:%d enabled (user request from '%s')", server.primary->host, - server.primary->port, client); + serverLog(LL_NOTICE, "REPLICAOF %s:%d enabled (user request from '%s')", server.primary_host, + server.primary_port, client); sdsfree(client); } addReply(c, shared.ok); @@ -4429,7 +4184,7 @@ void roleCommand(client *c) { return; } - if (server.primary == NULL) { + if (server.primary_host == NULL) { listIter li; listNode *ln; void *mbcount; @@ -4461,12 +4216,12 @@ void roleCommand(client *c) { addReplyArrayLen(c, 5); addReplyBulkCBuffer(c, "slave", 5); - addReplyBulkCString(c, server.primary->host); - addReplyLongLong(c, server.primary->port); - if (replicaIsInHandshakeState(server.primary)) { + addReplyBulkCString(c, server.primary_host); + addReplyLongLong(c, server.primary_port); + if (replicaIsInHandshakeState()) { replica_state = "handshake"; } else { - switch (server.primary->state) { + switch (server.repl_state) { case REPL_STATE_NONE: replica_state = "none"; break; case REPL_STATE_CONNECT: replica_state = "connect"; break; case REPL_STATE_CONNECTING: replica_state = "connecting"; break; @@ -4476,18 +4231,17 @@ void roleCommand(client *c) { } } addReplyBulkCString(c, replica_state); - addReplyLongLong(c, server.primary->client ? server.primary->client->repl_data->reploff : -1); + addReplyLongLong(c, server.primary ? server.primary->repl_data->reploff : -1); } } /* Send a REPLCONF ACK command to the primary to inform it about the current * processed offset. If we are not connected with a primary, the command has * no effects. */ -void replicationSendAck(replicationLink *link) { - client *c = link->client; +void replicationSendAck(client *c) { if (c != NULL) { int send_fack = server.fsynced_reploff != -1; - c->flag.primary_force_reply = 1; + c->flag.replication_force_reply = 1; addReplyArrayLen(c, send_fack ? 5 : 3); addReplyBulkCString(c, "REPLCONF"); addReplyBulkCString(c, "ACK"); @@ -4496,7 +4250,7 @@ void replicationSendAck(replicationLink *link) { addReplyBulkCString(c, "FACK"); addReplyBulkLongLong(c, server.fsynced_reploff); } - c->flag.primary_force_reply = 0; + c->flag.replication_force_reply = 0; /* Accumulation from above replies must be reset back to 0 manually, * as this subroutine does not invoke resetClient(). */ @@ -4525,7 +4279,7 @@ void replicationSendAck(replicationLink *link) { * handshake in order to reactivate the cached primary. */ void replicationCachePrimary(client *c) { - serverAssert(server.primary != NULL && server.primary->client != NULL && server.cached_primary == NULL); + serverAssert(server.primary != NULL && server.cached_primary == NULL); serverLog(LL_NOTICE, "Caching the disconnected primary state."); /* Wait for IO operations to be done before proceeding */ @@ -4537,10 +4291,10 @@ void replicationCachePrimary(client *c) { * we want to discard the non processed query buffers and non processed * offsets, including pending transactions, already populated arguments, * pending outputs to the primary. 
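 *
 * Why the offsets must be exact (editor's sketch): the cached primary's
 * replid and reploff are what this replica will offer in its next partial
 * resync attempt, roughly:
 *
 *     PSYNC <cached replid> <cached reploff + 1>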
*/ - sdsclear(c->querybuf); - c->qb_pos = 0; - c->repl_data->repl_applied = 0; - c->repl_data->read_reploff = c->repl_data->reploff; + sdsclear(server.primary->querybuf); + server.primary->qb_pos = 0; + server.primary->repl_data->repl_applied = 0; + server.primary->repl_data->read_reploff = server.primary->repl_data->reploff; if (c->flag.multi) discardTransaction(c); listEmpty(c->reply); c->sentlen = 0; @@ -4549,9 +4303,9 @@ void replicationCachePrimary(client *c) { resetClient(c); resetClientIOState(c); - /* Save the primary. Server.primary->client will be set to null later by - * replicationHandleSourceDisconnection(). */ - server.cached_primary = c; + /* Save the primary. Server.primary will be set to null later by + * replicationHandlePrimaryDisconnection(). */ + server.cached_primary = server.primary; /* Invalidate the Peer ID cache. */ if (c->peerid) { @@ -4566,8 +4320,8 @@ void replicationCachePrimary(client *c) { /* Caching the primary happens instead of the actual freeClient() call, * so make sure to adjust the replication state. This function will - * also set server.primary->client to NULL. */ - replicationHandleSourceDisconnection(server.primary); + * also set server.primary to NULL. */ + replicationHandlePrimaryDisconnection(); } /* This function is called when a primary is turned into a replica, in order to @@ -4583,27 +4337,24 @@ void replicationCachePrimaryUsingMyself(void) { serverLog(LL_NOTICE, "Before turning into a replica, using my own primary parameters " "to synthesize a cached primary: I may be able to synchronize with " "the new primary with just a partial transfer."); - /* Create a temporary link for the purpose of creating a client. */ - replicationLink *temp_link = createReplicationLink(NULL, 0, NULL); /* This will be used to populate the field server.primary->repl_data->reploff * by replicationCreatePrimaryClient(). We'll later set the created * primary as server.cached_primary, so the replica will use such * offset for PSYNC. */ - temp_link->initial_offset = server.primary_repl_offset; + server.primary_initial_offset = server.primary_repl_offset; /* The primary client we create can be set to any DBID, because * the new primary will start its replication stream with SELECT. */ - createReplicationLinkClient(temp_link, NULL, -1); + replicationCreatePrimaryClient(NULL, -1); /* Use our own ID / offset. */ - memcpy(temp_link->client->repl_data->replid, server.replid, sizeof(server.replid)); + memcpy(server.primary->repl_data->replid, server.replid, sizeof(server.replid)); /* Set as cached primary. */ - unlinkClient(temp_link->client); - server.cached_primary = temp_link->client; - temp_link->client = NULL; - freeReplicationLink(temp_link); + unlinkClient(server.primary); + server.cached_primary = server.primary; + server.primary = NULL; } /* Free a cached primary, called when there are no longer the conditions for @@ -4612,7 +4363,7 @@ void replicationDiscardCachedPrimary(void) { if (server.cached_primary == NULL) return; serverLog(LL_NOTICE, "Discarding previously cached primary state."); - server.cached_primary->flag.replication_source = 0; + server.cached_primary->flag.primary = 0; freeClient(server.cached_primary); server.cached_primary = NULL; } @@ -4620,19 +4371,17 @@ void replicationDiscardCachedPrimary(void) { /* Replication: Replica side. * This method performs the necessary steps to establish a connection with the primary server. * It sets private data, updates flags, and fires an event to notify modules about the primary link change. 
*/ -void establishSourceConnection(replicationLink *link) { - connSetPrivateData(link->client->conn, link->client); - link->client->flag.close_after_reply = 0; - link->client->flag.close_asap = 0; - link->client->flag.authenticated = 1; - link->client->last_interaction = server.unixtime; - link->state = REPL_STATE_CONNECTED; - if (link == server.primary) { - server.repl_down_since = 0; +void establishPrimaryConnection(void) { + connSetPrivateData(server.primary->conn, server.primary); + server.primary->flag.close_after_reply = 0; + server.primary->flag.close_asap = 0; + server.primary->flag.authenticated = 1; + server.primary->last_interaction = server.unixtime; + server.repl_state = REPL_STATE_CONNECTED; + server.repl_down_since = 0; - /* Fire the primary link modules event. */ - moduleFireServerEvent(VALKEYMODULE_EVENT_PRIMARY_LINK_CHANGE, VALKEYMODULE_SUBEVENT_PRIMARY_LINK_UP, NULL); - } + /* Fire the primary link modules event. */ + moduleFireServerEvent(VALKEYMODULE_EVENT_PRIMARY_LINK_CHANGE, VALKEYMODULE_SUBEVENT_PRIMARY_LINK_UP, NULL); } /* Replication: Replica side. @@ -4642,38 +4391,34 @@ void establishSourceConnection(replicationLink *link) { * This function is called when successfully setup a partial resynchronization * so the stream of data that we'll receive will start from where this * primary left. */ -void replicationResurrectCachedPrimary(replicationLink *link) { - serverAssert(link == server.primary); - link->client = server.cached_primary; +void replicationResurrectCachedPrimary(connection *conn) { + server.primary = server.cached_primary; server.cached_primary = NULL; + server.primary->conn = conn; - /* The client takes ownership of the connection now. */ - link->client->conn = link->transfer_s; - link->transfer_s = NULL; - - establishSourceConnection(link); + establishPrimaryConnection(); /* Re-add to the list of clients. */ - linkClient(link->client); - replicationSteadyStateInit(link); + linkClient(server.primary); + replicationSteadyStateInit(); } /* Replication: Replica side. * Prepare replica to steady state. * prerequisite: server.primary is already initialized and linked in client list. */ -void replicationSteadyStateInit(replicationLink *link) { - if (connSetReadHandler(link->client->conn, readQueryFromClient)) { +void replicationSteadyStateInit(void) { + if (connSetReadHandler(server.primary->conn, readQueryFromClient)) { serverLog(LL_WARNING, "Error resurrecting the cached primary, impossible to add the readable handler: %s", strerror(errno)); - freeClientAsync(link->client); /* Close ASAP. */ + freeClientAsync(server.primary); /* Close ASAP. */ } /* We may also need to install the write handler as well if there is * pending data in the write buffers. */ - if (clientHasPendingReplies(link->client)) { - if (connSetWriteHandler(link->client->conn, sendReplyToClient)) { + if (clientHasPendingReplies(server.primary)) { + if (connSetWriteHandler(server.primary->conn, sendReplyToClient)) { serverLog(LL_WARNING, "Error resurrecting the cached primary, impossible to add the writable handler: %s", strerror(errno)); - freeClientAsync(link->client); /* Close ASAP. */ + freeClientAsync(server.primary); /* Close ASAP. */ } } } @@ -4681,19 +4426,16 @@ void replicationSteadyStateInit(replicationLink *link) { /* Replication: Replica side. * Turn the provisional primary into the current primary. * This function is called after dual channel sync is finished successfully. 
*/ -void replicationResurrectProvisionalSource(replicationLink *link) { - /* Create a client, but do not initialize the read handler yet, as this replica still has a local buffer to +void replicationResurrectProvisionalPrimary(void) { + /* Create a primary client, but do not initialize the read handler yet, as this replica still has a local buffer to * drain. */ - createReplicationLinkClientWithHandler(link, link->transfer_s, link->provisional_source_state.dbid, NULL); - link->transfer_s = NULL; /* link->client now takes ownership of this connection */ - memcpy(link->client->repl_data->replid, link->provisional_source_state.replid, sizeof(link->provisional_source_state.replid)); - link->client->repl_data->reploff = link->provisional_source_state.reploff; - link->client->repl_data->read_reploff = link->provisional_source_state.read_reploff; - if (link == server.primary) { - server.primary_repl_offset = link->client->repl_data->reploff; - memcpy(server.replid, link->client->repl_data->replid, sizeof(link->client->repl_data->replid)); - } - establishSourceConnection(link); + replicationCreatePrimaryClientWithHandler(server.repl_transfer_s, server.repl_provisional_primary.dbid, NULL); + memcpy(server.primary->repl_data->replid, server.repl_provisional_primary.replid, sizeof(server.repl_provisional_primary.replid)); + server.primary->repl_data->reploff = server.repl_provisional_primary.reploff; + server.primary->repl_data->read_reploff = server.repl_provisional_primary.read_reploff; + server.primary_repl_offset = server.primary->repl_data->reploff; + memcpy(server.replid, server.primary->repl_data->replid, sizeof(server.primary->repl_data->replid)); + establishPrimaryConnection(); } /* ------------------------- MIN-REPLICAS-TO-WRITE --------------------------- */ @@ -4720,7 +4462,7 @@ void refreshGoodReplicasCount(void) { /* return true if status of good replicas is OK. otherwise false */ int checkGoodReplicasStatus(void) { - return server.primary || /* not a primary status should be OK */ + return server.primary_host || /* not a primary status should be OK */ !server.repl_min_replicas_max_lag || /* Min replica max lag not configured */ !server.repl_min_replicas_to_write || /* Min replica to write not configured */ server.repl_good_replicas_count >= server.repl_min_replicas_to_write; /* check if we have enough replicas */ @@ -4813,7 +4555,7 @@ void waitCommand(client *c) { long numreplicas, ackreplicas; long long offset = getClientWriteOffset(c); - if (server.primary) { + if (server.primary_host) { addReplyError( c, "WAIT cannot be used with replica instances. Please also note that if a replica is configured to be " "writable (which is not the default) writes to replicas are just local and are not propagated."); @@ -4851,7 +4593,7 @@ void waitaofCommand(client *c) { if (getPositiveLongFromObjectOrReply(c, c->argv[2], &numreplicas, NULL) != C_OK) return; if (getTimeoutFromObjectOrReply(c, c->argv[3], &timeout, UNIT_MILLISECONDS) != C_OK) return; - if (server.primary) { + if (server.primary_host) { addReplyError(c, "WAITAOF cannot be used with replica instances. 
Please also note that writes to replicas are " "just local and are not propagated."); return; @@ -4972,9 +4714,9 @@ void processClientsWaitingReplicas(void) { long long replicationGetReplicaOffset(void) { long long offset = 0; - if (server.primary != NULL) { - if (server.primary->client) { - offset = server.primary->client->repl_data->reploff; + if (server.primary_host != NULL) { + if (server.primary) { + offset = server.primary->repl_data->reploff; } else if (server.cached_primary) { offset = server.cached_primary->repl_data->reploff; } @@ -4998,48 +4740,44 @@ void replicationCron(void) { updateFailoverStatus(); /* Non blocking connection timeout? */ - listNode *ln; - listIter li; - listRewind(server.replication_links, &li); - while ((ln = listNext(&li))) { - replicationLink *link = (replicationLink *)ln->value; - if ((link->state == REPL_STATE_CONNECTING || replicaIsInHandshakeState(link)) && - (time(NULL) - link->transfer_lastio) > server.repl_timeout) { - serverLog(LL_WARNING, "Timeout connecting to %s...", replicationGetNameForLogs(link)); - cancelReplicationHandshake(link, 1); - } - - /* Bulk transfer I/O timeout? */ - if (link && link->state == REPL_STATE_TRANSFER && - (time(NULL) - link->transfer_lastio) > server.repl_timeout) { - serverLog(LL_WARNING, "Timeout receiving bulk data from %s... If the problem persists try to set the " - "'repl-timeout' parameter in valkey.conf to a larger value.", replicationGetNameForLogs(link)); - cancelReplicationHandshake(link, 1); - } + if (server.primary_host && (server.repl_state == REPL_STATE_CONNECTING || replicaIsInHandshakeState()) && + (time(NULL) - server.repl_transfer_lastio) > server.repl_timeout) { + serverLog(LL_WARNING, "Timeout connecting to the PRIMARY..."); + cancelReplicationHandshake(1); + } - /* Timed out primary when we are an already connected replica? */ - if (link && link->state == REPL_STATE_CONNECTED && - (time(NULL) - link->client->last_interaction) > server.repl_timeout) { - serverLog(LL_WARNING, "%s timeout: no data nor PING received...", replicationGetNameForLogs(link)); - freeClient(link->client); /* free client will attempt reconnect */ - } + /* Bulk transfer I/O timeout? */ + if (server.primary_host && server.repl_state == REPL_STATE_TRANSFER && + (time(NULL) - server.repl_transfer_lastio) > server.repl_timeout) { + serverLog(LL_WARNING, "Timeout receiving bulk data from PRIMARY... If the problem persists try to set the " + "'repl-timeout' parameter in valkey.conf to a larger value."); + cancelReplicationHandshake(1); + } - /* Check if we should connect to a replication source */ - if (link && link->state == REPL_STATE_CONNECT) { - serverLog(LL_NOTICE, "Connecting to %s %s:%d", replicationGetNameForLogs(link), link->host, link->port); - connectReplicationLink(link); - } + /* Timed out primary when we are an already connected replica? */ + if (server.primary_host && server.repl_state == REPL_STATE_CONNECTED && + (time(NULL) - server.primary->last_interaction) > server.repl_timeout) { + serverLog(LL_WARNING, "PRIMARY timeout: no data nor PING received..."); + freeClient(server.primary); + } - /* Send ACK to replication sources from time to time. - * Note that we do not send periodic acks to replication sources that don't - * support PSYNC and replication offsets. 
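 *
 * (Illustrative note.) These periodic ACKs feed the primary's replica lag
 * accounting, so configuration such as the following in valkey.conf
 * (example values) relies on them being sent from this cron:
 *
 *     min-replicas-to-write 1
 *     min-replicas-max-lag 10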
*/ - if (link && link->client && !(link->client->flag.pre_psync)) replicationSendAck(link); + /* Check if we should connect to a PRIMARY */ + if (server.repl_state == REPL_STATE_CONNECT) { + serverLog(LL_NOTICE, "Connecting to PRIMARY %s:%d", server.primary_host, server.primary_port); + connectWithPrimary(); } + /* Send ACK to primary from time to time. + * Note that we do not send periodic acks to primary that don't + * support PSYNC and replication offsets. */ + if (server.primary_host && server.primary && !(server.primary->flag.pre_psync)) replicationSendAck(server.primary); + /* If we have attached replicas, PING them from time to time. * So replicas can implement an explicit timeout to primaries, and will * be able to detect a link disconnection even if the TCP connection * will not actually go down. */ + listIter li; + listNode *ln; robj *ping_argv[1]; /* First, send PING according to ping_replica_period. */ @@ -5126,7 +4864,7 @@ void replicationCron(void) { * backlog, in order to reply to PSYNC queries if they are turned into * primaries after a failover. */ if (listLength(server.replicas) == 0 && server.repl_backlog_time_limit && server.repl_backlog && - server.primary == NULL) { + server.primary_host == NULL) { time_t idle = server.unixtime - server.repl_no_replicas_since; if (idle > server.repl_backlog_time_limit) { @@ -5201,7 +4939,7 @@ int shouldStartChildReplication(int *mincapa_out, int *req_out, slotBitmap slot_ /* Get first replica's requirements */ req = replica->repl_data->replica_req; memcpy(slot_bitmap, replica->repl_data->slot_bitmap, sizeof(slotBitmap)); - } else if (req != replica->repl_data->replica_req) { + } else if (req != replica->repl_data->replica_req || slotBitmapCompare(slot_bitmap, replica->repl_data->slot_bitmap) != 0) { /* Skip replicas that don't match */ continue; } @@ -5231,7 +4969,6 @@ void replicationStartPendingFork(void) { int mincapa = -1; int req = -1; slotBitmap slot_bitmap; - slotBitmapSetAll(slot_bitmap); if (shouldStartChildReplication(&mincapa, &req, slot_bitmap)) { /* Start the BGSAVE. The called function may start a @@ -5376,7 +5113,7 @@ void failoverCommand(client *c) { return; } - if (server.primary) { + if (server.primary_host) { addReplyError(c, "FAILOVER is not valid when server is a replica."); return; } diff --git a/src/script.c b/src/script.c index a43de5c7af..a8e5b18eb9 100644 --- a/src/script.c +++ b/src/script.c @@ -51,7 +51,7 @@ static void exitScriptTimedoutMode(scriptRunCtx *run_ctx) { run_ctx->flags &= ~SCRIPT_TIMEDOUT; blockingOperationEnds(); /* if we are a replica and we have an active primary, set it for continue processing */ - if (server.primary && server.primary->client) queueClientForReprocessing(server.primary->client); + if (server.primary_host && server.primary) queueClientForReprocessing(server.primary); } static void enterScriptTimedoutMode(scriptRunCtx *run_ctx) { @@ -137,7 +137,7 @@ int scriptPrepareForRun(scriptRunCtx *run_ctx, int client_allow_oom = !!(caller->flag.allow_oom); int running_stale = - server.primary && server.primary->state != REPL_STATE_CONNECTED && server.repl_serve_stale_data == 0; + server.primary_host && server.repl_state != REPL_STATE_CONNECTED && server.repl_serve_stale_data == 0; int obey_client = mustObeyClient(caller); if (!(script_flags & SCRIPT_FLAG_EVAL_COMPAT_MODE)) { @@ -158,7 +158,7 @@ int scriptPrepareForRun(scriptRunCtx *run_ctx, * 1. we are not a readonly replica * 2. no disk error detected * 3. 
command is not `fcall_ro`/`eval[sha]_ro` */ - if (server.primary && server.repl_replica_ro && !obey_client) { + if (server.primary_host && server.repl_replica_ro && !obey_client) { addReplyError(caller, "-READONLY Can not run script with write flag on readonly replica"); return C_ERR; } @@ -375,7 +375,7 @@ static int scriptVerifyWriteCommandAllow(scriptRunCtx *run_ctx, char **err) { * of this script. */ int deny_write_type = writeCommandsDeniedByDiskError(); - if (server.primary && server.repl_replica_ro && !mustObeyClient(run_ctx->original_client)) { + if (server.primary_host && server.repl_replica_ro && !mustObeyClient(run_ctx->original_client)) { *err = sdsdup(shared.roreplicaerr->ptr); return C_ERR; } @@ -501,12 +501,12 @@ int scriptSetRepl(scriptRunCtx *run_ctx, int repl) { } static int scriptVerifyAllowStale(client *c, sds *err) { - if (!server.primary) { + if (!server.primary_host) { /* Not a replica, stale is irrelevant */ return C_OK; } - if (server.primary->state == REPL_STATE_CONNECTED) { + if (server.repl_state == REPL_STATE_CONNECTED) { /* Connected to replica, stale is irrelevant */ return C_OK; } diff --git a/src/server.c b/src/server.c index 697ce48013..ea77cc1312 100644 --- a/src/server.c +++ b/src/server.c @@ -221,7 +221,7 @@ void serverLogRaw(int level, const char *msg) { } else if (pid != server.pid) { role_index = 1; /* RDB / AOF writing child. */ } else { - role_index = (server.primary ? 2 : 3); /* Replica or Primary. */ + role_index = (server.primary_host ? 2 : 3); /* Replica or Primary. */ } switch (server.log_format) { case LOG_FORMAT_LOGFMT: @@ -901,7 +901,7 @@ int clientsCronResizeQueryBuffer(client *c) { /* 1) Query is idle for a long time. */ size_t remaining = sdslen(c->querybuf) - c->qb_pos; if (!c->flag.replication_source && !remaining) { - /* If the client is not a primary and no data is pending, + /* If the client is not for replication and no data is pending, * The client can safely use the shared query buffer in the next read - free the client's querybuf. */ sdsfree(c->querybuf); /* By setting the querybuf to NULL, the client will use the shared query buffer in the next read. @@ -2223,12 +2223,21 @@ void initServerConfig(void) { appendServerSaveParams(60, 10000); /* save after 1 minute and 10000 changes */ /* Replication related */ + server.primary_host = NULL; + server.primary_port = 6379; server.primary = NULL; server.cached_primary = NULL; + server.primary_initial_offset = -1; + server.repl_state = REPL_STATE_NONE; + server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_STATE_NONE; + server.repl_transfer_tmpfile = NULL; + server.repl_transfer_fd = -1; + server.repl_transfer_s = NULL; server.repl_syncio_timeout = CONFIG_REPL_SYNCIO_TIMEOUT; server.repl_down_since = 0; /* Never connected, repl is down since EVER. */ server.primary_repl_offset = 0; server.fsynced_reploff_pending = 0; + server.rdb_client_id = -1; server.loading_process_events_interval_ms = LOADING_PROCESS_EVENTS_INTERVAL_DEFAULT; server.loading_rio = NULL; @@ -2339,7 +2348,7 @@ int restartServer(client *c, int flags, mstime_t delay) { * depending on current role. */ int setOOMScoreAdj(int process_class) { - if (process_class == -1) process_class = (server.primary ? CONFIG_OOM_REPLICA : CONFIG_OOM_PRIMARY); + if (process_class == -1) process_class = (server.primary_host ? 
CONFIG_OOM_REPLICA : CONFIG_OOM_PRIMARY); serverAssert(process_class >= 0 && process_class < CONFIG_OOM_COUNT); @@ -2751,7 +2760,6 @@ void initServer(void) { server.reply_buffer_peak_reset_time = REPLY_BUFFER_DEFAULT_PEAK_RESET_TIME; server.reply_buffer_resizing_enabled = 1; server.client_mem_usage_buckets = NULL; - server.replication_links = listCreate(); resetReplicationBuffer(); /* Make sure the locale is set on startup based on the config file. */ @@ -3349,7 +3357,7 @@ struct serverCommand *lookupCommandOrOriginal(robj **argv, int argc) { return cmd; } -/* Commands arriving from the primary client or AOF client, should never be rejected. */ +/* Commands arriving from a replication source or AOF client, should never be rejected. */ int mustObeyClient(client *c) { return c->id == CLIENT_ID_AOF || c->flag.replication_source; } @@ -4103,7 +4111,7 @@ int processCommand(client *c) { } } - if (!server.cluster_enabled && c->capa & CLIENT_CAPA_REDIRECT && server.primary && !obey_client && + if (!server.cluster_enabled && c->capa & CLIENT_CAPA_REDIRECT && server.primary_host && !obey_client && (is_write_command || (is_read_command && !c->flag.readonly))) { if (server.failover_state == FAILOVER_IN_PROGRESS) { /* During the FAILOVER process, when conditions are met (such as @@ -4134,7 +4142,7 @@ int processCommand(client *c) { } c->duration = 0; c->cmd->rejected_calls++; - addReplyErrorSds(c, sdscatprintf(sdsempty(), "-REDIRECT %s:%d", server.primary->host, server.primary->port)); + addReplyErrorSds(c, sdscatprintf(sdsempty(), "-REDIRECT %s:%d", server.primary_host, server.primary_port)); } return C_OK; } @@ -4219,7 +4227,7 @@ int processCommand(client *c) { /* Don't accept write commands if this is a read only replica. But * accept write commands if this is our primary. */ - if (server.primary && server.repl_replica_ro && !obey_client && is_write_command) { + if (server.primary_host && server.repl_replica_ro && !obey_client && is_write_command) { rejectCommand(c, shared.roreplicaerr); return C_OK; } @@ -4240,7 +4248,7 @@ int processCommand(client *c) { /* Only allow commands with flag "t", such as INFO, REPLICAOF and so on, * when replica-serve-stale-data is no and we are a replica with a broken * link with primary. */ - if (server.primary && server.primary->state != REPL_STATE_CONNECTED && server.repl_serve_stale_data == 0 && + if (server.primary_host && server.repl_state != REPL_STATE_CONNECTED && server.repl_serve_stale_data == 0 && is_denystale_command) { rejectCommand(c, shared.primarydownerr); return C_OK; @@ -5964,14 +5972,14 @@ sds genValkeyInfoString(dict *section_dict, int all_sections, int everything) { info = sdscatprintf(info, "# Replication\r\n" "role:%s\r\n", - server.primary == NULL ? "master" : "slave"); - if (server.primary) { + server.primary_host == NULL ? 
"master" : "slave"); + if (server.primary_host) { long long replica_repl_offset = 1; long long replica_read_repl_offset = 1; - if (server.primary->client) { - replica_repl_offset = server.primary->client->repl_data->reploff; - replica_read_repl_offset = server.primary->client->repl_data->read_reploff; + if (server.primary) { + replica_repl_offset = server.primary->repl_data->reploff; + replica_read_repl_offset = server.primary->repl_data->read_reploff; } else if (server.cached_primary) { replica_repl_offset = server.cached_primary->repl_data->reploff; replica_read_repl_offset = server.cached_primary->repl_data->read_reploff; @@ -5980,32 +5988,32 @@ sds genValkeyInfoString(dict *section_dict, int all_sections, int everything) { info = sdscatprintf( info, FMTARGS( - "master_host:%s\r\n", server.primary->host, - "master_port:%d\r\n", server.primary->port, - "master_link_status:%s\r\n", (server.primary->state == REPL_STATE_CONNECTED) ? "up" : "down", - "master_last_io_seconds_ago:%d\r\n", server.primary->client ? ((int)(server.unixtime - server.primary->client->last_interaction)) : -1, - "master_sync_in_progress:%d\r\n", server.primary->state == REPL_STATE_TRANSFER, + "master_host:%s\r\n", server.primary_host, + "master_port:%d\r\n", server.primary_port, + "master_link_status:%s\r\n", (server.repl_state == REPL_STATE_CONNECTED) ? "up" : "down", + "master_last_io_seconds_ago:%d\r\n", server.primary ? ((int)(server.unixtime - server.primary->last_interaction)) : -1, + "master_sync_in_progress:%d\r\n", server.repl_state == REPL_STATE_TRANSFER, "slave_read_repl_offset:%lld\r\n", replica_read_repl_offset, "slave_repl_offset:%lld\r\n", replica_repl_offset, - "replicas_repl_buffer_size:%zu\r\n", server.primary->pending_repl_data.len, - "replicas_repl_buffer_peak:%zu\r\n", server.primary->pending_repl_data.peak)); + "replicas_repl_buffer_size:%zu\r\n", server.pending_repl_data.len, + "replicas_repl_buffer_peak:%zu\r\n", server.pending_repl_data.peak)); - if (server.primary->state == REPL_STATE_TRANSFER) { + if (server.repl_state == REPL_STATE_TRANSFER) { double perc = 0; - if (server.primary->transfer_size) { - perc = ((double)server.primary->transfer_read / server.primary->transfer_size) * 100; + if (server.repl_transfer_size) { + perc = ((double)server.repl_transfer_read / server.repl_transfer_size) * 100; } info = sdscatprintf( info, FMTARGS( - "master_sync_total_bytes:%lld\r\n", (long long)server.primary->transfer_size, - "master_sync_read_bytes:%lld\r\n", (long long)server.primary->transfer_read, - "master_sync_left_bytes:%lld\r\n", (long long)(server.primary->transfer_size - server.primary->transfer_read), + "master_sync_total_bytes:%lld\r\n", (long long)server.repl_transfer_size, + "master_sync_read_bytes:%lld\r\n", (long long)server.repl_transfer_read, + "master_sync_left_bytes:%lld\r\n", (long long)(server.repl_transfer_size - server.repl_transfer_read), "master_sync_perc:%.2f\r\n", perc, - "master_sync_last_io_seconds_ago:%d\r\n", (int)(server.unixtime - server.primary->transfer_lastio))); + "master_sync_last_io_seconds_ago:%d\r\n", (int)(server.unixtime - server.repl_transfer_lastio))); } - if (server.primary->state != REPL_STATE_CONNECTED) { + if (server.repl_state != REPL_STATE_CONNECTED) { info = sdscatprintf(info, "master_link_down_since_seconds:%jd\r\n", server.repl_down_since ? 
(intmax_t)(server.unixtime - server.repl_down_since) : -1); } @@ -6840,7 +6848,7 @@ int serverIsSupervised(int mode) { } int iAmPrimary(void) { - return ((!server.cluster_enabled && server.primary == NULL) || + return ((!server.cluster_enabled && server.primary_host == NULL) || (server.cluster_enabled && clusterNodeIsPrimary(getMyClusterNode()))); } @@ -7123,7 +7131,7 @@ __attribute__((weak)) int main(int argc, char **argv) { } if (server.supervised_mode == SUPERVISED_SYSTEMD) { - if (!server.primary) { + if (!server.primary_host) { serverCommunicateSystemd("STATUS=Ready to accept connections\n"); } else { serverCommunicateSystemd( diff --git a/src/server.h b/src/server.h index 34c8e9ba41..e1a8a1d503 100644 --- a/src/server.h +++ b/src/server.h @@ -153,8 +153,6 @@ struct hdr_histogram; #else #define CONFIG_ACTIVE_DEFRAG_DEFAULT 1 #endif -#define CLUSTER_SLOT_MASK_BITS 14 /* Number of bits used for slot id. */ -#define CLUSTER_SLOTS (1 << CLUSTER_SLOT_MASK_BITS) /* Total number of slots in cluster mode, which is 16384. */ /* Bucket sizes for client eviction pools. Each bucket stores clients with * memory usage of up to twice the size of the bucket below it. */ @@ -375,11 +373,12 @@ typedef enum blocking_type { /* Client classes for client limits, currently used only for * the max-client-output-buffer limit implementation. */ -#define CLIENT_TYPE_NORMAL 0 /* Normal req-reply clients + MONITORs */ -#define CLIENT_TYPE_REPLICA 1 /* Replicas. */ -#define CLIENT_TYPE_PUBSUB 2 /* Clients subscribed to PubSub channels. */ -#define CLIENT_TYPE_PRIMARY 3 /* Primary. */ -#define CLIENT_TYPE_COUNT 4 /* Total number of client types. */ +#define CLIENT_TYPE_NORMAL 0 /* Normal req-reply clients + MONITORs */ +#define CLIENT_TYPE_REPLICA 1 /* Replicas. */ +#define CLIENT_TYPE_PUBSUB 2 /* Clients subscribed to PubSub channels. */ +#define CLIENT_TYPE_PRIMARY 3 /* Primary. */ +#define CLIENT_TYPE_SLOT_MIGRATION 4 /* Slot migration client. */ +#define CLIENT_TYPE_COUNT 5 /* Total number of client types. */ #define CLIENT_TYPE_OBUF_COUNT 3 /* Number of clients to expose to output \ buffer configuration. Just the first \ three: normal, replica, pubsub. */ @@ -388,6 +387,7 @@ typedef enum blocking_type { * what to do next. */ typedef enum { REPL_STATE_NONE = 0, /* No active replication */ + REPL_STATE_ERROR, /* Error in replication. */ REPL_STATE_CONNECT, /* Must connect to primary */ REPL_STATE_CONNECTING, /* Connecting to primary */ /* --- Handshake states, must be ordered --- */ @@ -404,7 +404,6 @@ typedef enum { /* --- End of handshake states --- */ REPL_STATE_TRANSFER, /* Receiving .rdb from primary */ REPL_STATE_CONNECTED, /* Connected to primary */ - REPL_STATE_CANCELLED, /* Replication was cancelled, and this link is pending deletion. */ } repl_state; /* Replica rdb-channel replication state. Used in server.repl_rdb_channel_state for @@ -1016,7 +1015,7 @@ typedef enum { } clientIOState; typedef struct ClientFlags { - uint64_t replication_source : 1; /* This client is a replication source (i.e. 
primary or slot migration source) */ + uint64_t primary : 1; /* This client is a primary */ uint64_t replica : 1; /* This client is a replica */ uint64_t monitor : 1; /* This client is a replica monitor, see MONITOR */ uint64_t multi : 1; /* This client is in a MULTI context */ @@ -1029,7 +1028,7 @@ typedef struct ClientFlags { uint64_t close_asap : 1; /* Close this client ASAP */ uint64_t unix_socket : 1; /* Client connected via Unix domain socket */ uint64_t dirty_exec : 1; /* EXEC will fail for errors while queueing */ - uint64_t primary_force_reply : 1; /* Queue replies even if is primary */ + uint64_t replication_force_reply : 1; /* Queue replies even if is primary */ uint64_t force_aof : 1; /* Force AOF propagation of current cmd. */ uint64_t force_repl : 1; /* Force replication of current cmd. */ uint64_t pre_psync : 1; /* Instance don't understand PSYNC. */ @@ -1092,7 +1091,9 @@ typedef struct ClientFlags { * flag, we won't cache the primary in freeClient. */ uint64_t fake : 1; /* This is a fake client without a real connection. */ uint64_t import_source : 1; /* This client is importing data to server and can visit expired key. */ - uint64_t reserved : 4; /* Reserved for future use */ + uint64_t replication_source : 1; /* This client is a replication source (i.e. primary or slot migration). */ + uint64_t slot_migration_source : 1; /* This client is a slot migration source. */ + uint64_t reserved : 3; /* Reserved for future use */ } ClientFlags; typedef struct ClientPubSubData { @@ -1108,9 +1109,11 @@ typedef struct ClientPubSubData { context of client side caching. */ } ClientPubSubData; +#define CLUSTER_SLOT_MASK_BITS 14 /* Number of bits used for slot id. */ +#define CLUSTER_SLOTS (1 << CLUSTER_SLOT_MASK_BITS) /* Total number of slots in cluster mode, which is 16384. */ + typedef unsigned char slotBitmap[CLUSTER_SLOTS / 8]; -typedef struct replicationLink replicationLink; typedef struct ClientReplicationData { int repl_state; /* Replication state if this is a replica. */ int repl_start_cmd_stream_on_ack; /* Install replica write handler on first ACK. */ @@ -1142,7 +1145,6 @@ typedef struct ClientReplicationData { size_t ref_block_pos; /* Access position of referenced buffer block, i.e. the next offset to send. */ slotBitmap slot_bitmap; /* The slot range this replica is replicating for. */ - replicationLink *link; /* The replication link owning this. */ } ClientReplicationData; typedef struct ClientModuleData { @@ -1424,7 +1426,7 @@ typedef enum { * top-level primary. */ typedef struct rdbSaveInfo { /* Used saving and loading. */ - int repl_stream_db; /* DB to select in server.primary->client. */ + int repl_stream_db; /* DB to select in server.primary client. */ /* Used only loading. */ int repl_id_is_set; /* True if repl_id field is set. */ @@ -1546,43 +1548,6 @@ typedef enum childInfoType { CHILD_INFO_TYPE_MODULE_COW_SIZE } childInfoType; -typedef struct slotRange { - int start; - int end; -} slotRange; - -typedef struct replicationLink { - int protected; /* Used to protect link from destruction during background loading. */ - int state; /* State of the sync operation overall. */ - int rdb_channel_state; - client *client; - client *snapshot_load_client; /* client used for full sync when AOF format is used. 
*/ - sds host; - int port; - connection *transfer_s; /* Replica -> Primary SYNC connection */ - connection *rdb_transfer_s; /* Primary FULL SYNC connection (RDB download) */ - uint64_t rdb_client_id; /* Rdb client id as it defined at primary side */ - /* The following two fields is where we store primary PSYNC replid/offset - * while the PSYNC is in progress. At the end we'll copy the fields into - * the server->primary client structure. */ - char replid[CONFIG_RUN_ID_SIZE + 1]; /* Primary PSYNC runid. */ - long long initial_offset; /* Primary PSYNC offset. */ - off_t transfer_size; /* Size of RDB to read from primary during sync. */ - off_t transfer_read; /* Amount of RDB read from primary during sync. */ - off_t transfer_last_fsync_off; /* Offset when we fsync-ed last time. */ - int transfer_fd; /* Replica -> Primary SYNC temp file descriptor */ - char *transfer_tmpfile; /* Replica-> Primary SYNC temp file name */ - time_t transfer_lastio; /* Unix time of the latest read, for timeout */ - struct { - char replid[CONFIG_RUN_ID_SIZE + 1]; - long long reploff; - long long read_reploff; - int dbid; - } provisional_source_state; /* Information about the provisional state (after RDB) for the source node, stored during dual channel sync. */ - replDataBuf pending_repl_data; /* Replication data buffer for dual-channel-replication */ - unsigned char slot_bitmap[CLUSTER_SLOTS/8]; /* Slot range used for slot import. */ -} replicationLink; - struct valkeyServer { /* General */ pid_t pid; /* Main process pid. */ @@ -1743,6 +1708,7 @@ struct valkeyServer { long long stat_net_input_bytes; /* Bytes read from network. */ long long stat_net_output_bytes; /* Bytes written to network. */ long long stat_net_repl_input_bytes; /* Bytes read during replication, added to stat_net_input_bytes in 'info'. */ + long long stat_net_slot_migration_input_bytes; /* Bytes read during replication, added to stat_net_input_bytes in 'info'. */ /* Bytes written during replication, added to stat_net_output_bytes in 'info'. */ long long stat_net_repl_output_bytes; size_t stat_current_cow_peak; /* Peak size of copy on write bytes. */ @@ -1943,6 +1909,7 @@ struct valkeyServer { int repl_ping_replica_period; /* Primary pings the replica every N seconds */ replBacklog *repl_backlog; /* Replication backlog for partial syncs */ long long repl_backlog_size; /* Backlog circular buffer size */ + replDataBuf pending_repl_data; /* Replication data buffer for dual-channel-replication */ time_t repl_backlog_time_limit; /* Time without replicas after the backlog gets released. */ time_t repl_no_replicas_since; /* We have no replicas since that time. @@ -1966,28 +1933,52 @@ struct valkeyServer { list *repl_buffer_blocks; /* Replication buffers blocks list * (serving replica clients and repl backlog) */ /* Replication (replica) */ - char *primary_user; /* AUTH with this user and primary_auth with primary */ - sds primary_auth; /* AUTH with this password with primary */ - int repl_timeout; /* Timeout after N seconds of primary idle */ - replicationLink *primary; /* Replication link for the primary. */ - list *replication_links; /* List of all current replication links. */ - client *cached_primary; /* Cached primary to be reused for PSYNC. */ - int repl_syncio_timeout; /* Timeout for synchronous I/O calls */ - int repl_serve_stale_data; /* Serve stale data when link is down? */ - int repl_replica_ro; /* Replica is read only? */ - int repl_replica_ignore_maxmemory; /* If true replicas do not evict. 
*/ - time_t repl_down_since; /* Unix time at which link with primary went down */ - int repl_disable_tcp_nodelay; /* Disable TCP_NODELAY after SYNC? */ - int replica_priority; /* Reported in INFO and used by Sentinel. */ - int replica_announced; /* If true, replica is announced by Sentinel */ - int replica_announce_port; /* Give the primary this listening port. */ - char *replica_announce_ip; /* Give the primary this ip address. */ - int propagation_error_behavior; /* Configures the behavior of the replica - * when it receives an error on the replication stream */ - int repl_ignore_disk_write_error; /* Configures whether replicas panic when unable to - * persist writes to AOF. */ - int repl_replica_lazy_flush; /* Lazy FLUSHALL before loading DB? */ - rio *loading_rio; /* Pointer to the rio object currently used for loading data. */ + char *primary_user; /* AUTH with this user and primary_auth with primary */ + sds primary_auth; /* AUTH with this password with primary */ + char *primary_host; /* Hostname of primary */ + int primary_port; /* Port of primary */ + int repl_timeout; /* Timeout after N seconds of primary idle */ + client *primary; /* Client that is primary for this replica */ + uint64_t rdb_client_id; /* Rdb client id as it defined at primary side */ + struct { + connection *conn; + char replid[CONFIG_RUN_ID_SIZE + 1]; + long long reploff; + long long read_reploff; + int dbid; + } repl_provisional_primary; + client *cached_primary; /* Cached primary to be reused for PSYNC. */ + rio *loading_rio; /* Pointer to the rio object currently used for loading data. */ + int repl_syncio_timeout; /* Timeout for synchronous I/O calls */ + int repl_state; /* Replication status if the instance is a replica */ + int repl_rdb_channel_state; /* State of the replica's rdb channel during dual-channel-replication */ + off_t repl_transfer_size; /* Size of RDB to read from primary during sync. */ + off_t repl_transfer_read; /* Amount of RDB read from primary during sync. */ + off_t repl_transfer_last_fsync_off; /* Offset when we fsync-ed last time. */ + connection *repl_transfer_s; /* Replica -> Primary SYNC connection */ + connection *repl_rdb_transfer_s; /* Primary FULL SYNC connection (RDB download) */ + int repl_transfer_fd; /* Replica -> Primary SYNC temp file descriptor */ + char *repl_transfer_tmpfile; /* Replica-> Primary SYNC temp file name */ + time_t repl_transfer_lastio; /* Unix time of the latest read, for timeout */ + int repl_serve_stale_data; /* Serve stale data when link is down? */ + int repl_replica_ro; /* Replica is read only? */ + int repl_replica_ignore_maxmemory; /* If true replicas do not evict. */ + time_t repl_down_since; /* Unix time at which link with primary went down */ + int repl_disable_tcp_nodelay; /* Disable TCP_NODELAY after SYNC? */ + int replica_priority; /* Reported in INFO and used by Sentinel. */ + int replica_announced; /* If true, replica is announced by Sentinel */ + int replica_announce_port; /* Give the primary this listening port. */ + char *replica_announce_ip; /* Give the primary this ip address. */ + int propagation_error_behavior; /* Configures the behavior of the replica + * when it receives an error on the replication stream */ + int repl_ignore_disk_write_error; /* Configures whether replicas panic when unable to + * persist writes to AOF. */ + /* The following two fields is where we store primary PSYNC replid/offset + * while the PSYNC is in progress. At the end we'll copy the fields into + * the server->primary client structure. 
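 *
 * For illustration, these are filled from the full-resync reply sent by the
 * primary during the handshake (replid and offset below are placeholders):
 *
 *     +FULLRESYNC <40-char replid> <offset>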
*/ + char primary_replid[CONFIG_RUN_ID_SIZE + 1]; /* Primary PSYNC runid. */ + long long primary_initial_offset; /* Primary PSYNC offset. */ + int repl_replica_lazy_flush; /* Lazy FLUSHALL before loading DB? */ /* Import Mode */ int import_mode; /* If true, server is in import mode and forbid expiration and eviction. */ /* Synchronous replication. */ @@ -2625,12 +2616,12 @@ void dictVanillaFree(void *val); #define READ_FLAGS_ERROR_BIG_BULK_COUNT (1 << 6) #define READ_FLAGS_ERROR_MBULK_UNEXPECTED_CHARACTER (1 << 7) #define READ_FLAGS_ERROR_MBULK_INVALID_BULK_LEN (1 << 8) -#define READ_FLAGS_ERROR_UNEXPECTED_INLINE_FROM_PRIMARY (1 << 9) +#define READ_FLAGS_ERROR_UNEXPECTED_INLINE_FROM_REPLICATION_SOURCE (1 << 9) #define READ_FLAGS_ERROR_UNBALANCED_QUOTES (1 << 10) #define READ_FLAGS_INLINE_ZERO_QUERY_LEN (1 << 11) #define READ_FLAGS_PARSING_NEGATIVE_MBULK_LEN (1 << 12) #define READ_FLAGS_PARSING_COMPLETED (1 << 13) -#define READ_FLAGS_PRIMARY (1 << 14) +#define READ_FLAGS_REPLICATION_SOURCE (1 << 14) #define READ_FLAGS_DONT_PARSE (1 << 15) #define READ_FLAGS_AUTH_REQUIRED (1 << 16) @@ -2767,9 +2758,6 @@ void ioThreadWriteToClient(void *data); int canParseCommand(client *c); int processIOThreadsReadDone(void); int processIOThreadsWriteDone(void); -replicationLink *createReplicationLink(char *host, int port, slotBitmap slot_bitmap); -int connectReplicationLink(replicationLink *link); -int freeReplicationLink(replicationLink *link); /* logreqres.c - logging of requests and responses */ void reqresReset(client *c, int free_buf); @@ -2923,7 +2911,7 @@ void replicationFeedMonitors(client *c, list *monitors, int dictid, robj **argv, void updateReplicasWaitingBgsave(int bgsaveerr, int type); void replicationCron(void); void replicationStartPendingFork(void); -void replicationHandleSourceDisconnection(replicationLink *link); +void replicationHandlePrimaryDisconnection(void); void replicationCachePrimary(client *c); void resizeReplicationBacklog(void); void replicationSetPrimary(char *ip, int port, int full_sync_required); @@ -2934,7 +2922,7 @@ void processClientsWaitingReplicas(void); void unblockClientWaitingReplicas(client *c); int replicationCountAcksByOffset(long long offset); int replicationCountAOFAcksByOffset(long long offset); -void replicationSendNewlineToConnectedLinks(void); +void replicationSendNewlineToPrimary(void); long long replicationGetReplicaOffset(void); char *replicationGetReplicaName(client *c); long long getPsyncInitialOffset(void); @@ -2960,6 +2948,8 @@ int sendCurrentOffsetToReplica(client *replica); void addRdbReplicaToPsyncWait(client *replica); void initClientReplicationData(client *c); void freeClientReplicationData(client *c); +void replicationSendAck(client *c); +int replicationProceedWithHandshake(connection *conn, int curr_state, slotBitmap slot_bitmap); /* Generic persistence functions */ void startLoadingFile(size_t size, char *filename, int rdbflags); diff --git a/src/valkeymodule.h b/src/valkeymodule.h index 1d99d2ff7a..8a2090fcca 100644 --- a/src/valkeymodule.h +++ b/src/valkeymodule.h @@ -221,11 +221,13 @@ typedef struct ValkeyModuleStreamID { #define VALKEYMODULE_CTX_FLAGS_ASYNC_LOADING (1 << 23) /* Valkey is starting. */ #define VALKEYMODULE_CTX_FLAGS_SERVER_STARTUP (1 << 24) +/* The command was sent via slot migration link. */ +#define VALKEYMODULE_CTX_FLAGS_IMPORTING_SLOT (1 << 25) /* Next context flag, must be updated when adding new flags above! This flag should not be used directly by the module. * Use ValkeyModule_GetContextFlagsAll instead. 
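 *
 * A module could test the new flag roughly like this (sketch; assumes the
 * usual context-flags accessor):
 *
 *     int flags = ValkeyModule_GetContextFlags(ctx);
 *     if (flags & VALKEYMODULE_CTX_FLAGS_IMPORTING_SLOT) { ... }  // arrived via a slot migration link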
*/ -#define _VALKEYMODULE_CTX_FLAGS_NEXT (1 << 25) +#define _VALKEYMODULE_CTX_FLAGS_NEXT (1 << 26) /* Keyspace changes notification classes. Every class is associated with a * character for configuration purposes. From db1829590359d65955ddf4bca5c1f213649f8aa7 Mon Sep 17 00:00:00 2001 From: Jacob Murphy Date: Sat, 18 Jan 2025 00:49:05 +0000 Subject: [PATCH 05/18] Get tests passing Signed-off-by: Jacob Murphy --- src/aof.c | 4 ++-- src/cluster_legacy.c | 38 ++++++++++++++++++++++---------------- src/cluster_legacy.h | 7 +++++-- src/replication.c | 3 +++ 4 files changed, 32 insertions(+), 20 deletions(-) diff --git a/src/aof.c b/src/aof.c index dbebc92e63..ebdf6f1b71 100644 --- a/src/aof.c +++ b/src/aof.c @@ -2204,7 +2204,7 @@ int rewriteAppendOnlyFileRio(rio *aof, slotBitmap slot_bitmap) { kvstoreIterator *kvs_it = NULL; /* Record timestamp at the beginning of rewriting AOF. */ - if (server.aof_timestamp_enabled && !slot_bitmap) { + if (server.aof_timestamp_enabled && (slot_bitmap == NULL || isSlotBitmapEmpty(slot_bitmap))) { sds ts = genAofTimestampAnnotationIfNeeded(1); if (rioWrite(aof, ts, sdslen(ts)) == 0) { sdsfree(ts); @@ -2224,7 +2224,7 @@ int rewriteAppendOnlyFileRio(rio *aof, slotBitmap slot_bitmap) { if (rioWrite(aof, selectcmd, sizeof(selectcmd) - 1) == 0) goto werr; if (rioWriteBulkLongLong(aof, j) == 0) goto werr; - if (!slot_bitmap) { + if (slot_bitmap == NULL || isSlotBitmapEmpty(slot_bitmap)) { kvs_it = kvstoreIteratorInit(db->keys); } else { kvs_it = kvstoreFilteredIteratorInit(db->keys, &shouldFilterSlot, slot_bitmap); diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index d174124f40..636a3ea930 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -2503,7 +2503,7 @@ void clusterSetNodeAsPrimary(clusterNode *n) { * The 'sender' is the node for which we received a configuration update. * Sometimes it is not actually the "Sender" of the information, like in the * case we receive the info via an UPDATE packet. */ -void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoch, unsigned char *slots) { +void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoch, slotBitmap slots) { int j; clusterNode *cur_primary = NULL, *new_primary = NULL; /* The dirty slots list is a list of slots for which we lose the ownership @@ -2572,9 +2572,13 @@ void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoc } /* Was this slot mine and it was in a paused state for slot - * migration? If so, clear the manual failover state. */ - if (server.cluster->slots[j] == myself && server.cluster->mf_end && server.cluster->mf_replica == sender) { - resetManualFailover(); + * migration? If so, mark the move as done. */ + if (server.cluster->slots[j] == myself && server.cluster->mf_end && server.cluster->mf_slots_target == sender) { + bitmapClearBit(server.cluster->mf_slots, j); + if (isSlotBitmapEmpty(server.cluster->mf_slots)) { + serverLog(LL_NOTICE, "Slot migration to node %s (%s) has finished. Unpausing myself.", server.cluster->mf_slots_target->name, server.cluster->mf_slots_target->human_nodename); + resetManualFailover(); + } } /* If the sender who claims this slot is not in the same shard, @@ -3252,7 +3256,7 @@ int clusterProcessPacket(clusterLink *link) { "primary manual failover: %lld", server.cluster->mf_primary_offset); } - /* If we are a importing a slot and the slot owner sent its offset + /* If we are importing a slot and the slot owner sent its offset * while already paused, populate the migration state. 
*/ slotMigration * curr_migration = clusterGetCurrentSlotMigration(); if (hdr->mflags[0] & CLUSTERMSG_FLAG0_PAUSED && curr_migration != NULL && @@ -3729,11 +3733,12 @@ int clusterProcessPacket(clusterLink *link) { /* Initialize the slot migration state accordingly */ resetManualFailover(); server.cluster->mf_end = now + CLUSTER_MF_TIMEOUT; - server.cluster->mf_replica = sender; + server.cluster->mf_slots_target = sender; + memcpy(server.cluster->mf_slots, hdr->data.slot_migration.msg.slot_bitmap, sizeof(slotBitmap)); /* TODO(murphyjacob4) pause subset of slots */ pauseActions(PAUSE_DURING_FAILOVER, now + (CLUSTER_MF_TIMEOUT * CLUSTER_MF_PAUSE_MULT), PAUSE_ACTIONS_CLIENT_WRITE_SET); - serverLog(LL_NOTICE, "Slot migration requested by node %.40s (%s).", sender->name, sender->human_nodename); + serverLog(LL_NOTICE, "Slot migration requested by node %.40s (%s). Pausing myself to allow slot takeover.", sender->name, sender->human_nodename); /* We need to send a ping message to the replica, as it would carry * `server.cluster->mf_primary_offset`, which means the primary paused clients * at offset `server.cluster->mf_primary_offset`, so that the replica would @@ -5281,7 +5286,7 @@ void manualFailoverCanStart(void) { * The function can be used both to initialize the manual failover state at * startup or to abort a manual failover in progress. */ void resetManualFailover(void) { - if (server.cluster->mf_replica) { + if (server.cluster->mf_replica || server.cluster->mf_slots_target) { /* We were a primary failing over, so we paused clients and related actions. * Regardless of the outcome we unpause now to allow traffic again. */ unpauseActions(PAUSE_DURING_FAILOVER); @@ -5290,6 +5295,8 @@ void resetManualFailover(void) { server.cluster->mf_can_start = 0; server.cluster->mf_replica = NULL; server.cluster->mf_primary_offset = -1; + memset(server.cluster->mf_slots, 0, sizeof(server.cluster->mf_slots)); + server.cluster->mf_slots_target = NULL; } /* If a manual failover timed out, abort it. */ @@ -7370,19 +7377,18 @@ int clusterCommandSpecial(client *c) { return 1; } if (c->argc < 5 || strcasecmp(c->argv[2]->ptr, "slotsrange")) { - addReplyError(c, "Migrate command requires at least one "); + addReplyError(c, "Migrate command requires at least one slot range"); + return 1; + } + if (c->argc % 2 == 0) { + addReplyError(c, "Invalid SLOTSRANGE, missing end slot"); return 1; } - unsigned char requested_slots[CLUSTER_SLOTS/8]; - memset(requested_slots, 0, sizeof(requested_slots)); + slotBitmap requested_slots; + memset(requested_slots, 0, sizeof(slotBitmap)); int i; clusterNode * curr_owner = NULL; for (i = 3; i + 1 < c->argc; i+=2) { - if (i > 3 && getLongLongFromObject(c->argv[i], NULL) != C_OK) { - /* If we find a non-integer in the args and we have already - * parsed >=1 slot range, we assume it is the next token. */ - break; - } int start = getSlotOrReply(c, c->argv[i]); if (start < 0) { return 1; diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index f9c6f5e5b8..0a97ca37ad 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -340,7 +340,7 @@ struct _clusterNode { char shard_id[CLUSTER_NAMELEN]; /* shard id, hex string, sha1-size */ int flags; /* CLUSTER_NODE_... */ uint64_t configEpoch; /* Last configEpoch observed for this node */ - unsigned char slots[CLUSTER_SLOTS / 8]; /* slots handled by this node */ + slotBitmap slots; /* slots handled by this node */ uint16_t *slot_info_pairs; /* Slots info represented as (start/end) pair (consecutive index). 
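 *
 * Example encoding (made-up ranges): a node serving slots 0-5460 and
 * 10000-10010 would store
 *
 *     slot_info_pairs = {0, 5460, 10000, 10010}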
*/ int slot_info_pairs_count; /* Used number of slots in slot_info_pairs */ int numslots; /* Number of slots handled by this node */ @@ -441,6 +441,9 @@ struct clusterState { or -1 if still not received. */ int mf_can_start; /* If non-zero signal that the manual failover can start requesting primary vote. */ + /* Manual failover state for slot migration */ + slotBitmap mf_slots; /* Slots in migration. */ + clusterNode *mf_slots_target; /* The following fields are used by primaries to take state on elections. */ uint64_t lastVoteEpoch; /* Epoch of the last vote granted. */ int todo_before_sleep; /* Things to do in clusterBeforeSleep(). */ @@ -458,7 +461,7 @@ struct clusterState { * the ownership transfer. Set the bit corresponding to the slot when a node * stops claiming the slot. This prevents spreading incorrect information (that * source still owns the slot) using UPDATE messages. */ - unsigned char owner_not_claiming_slot[CLUSTER_SLOTS / 8]; + slotBitmap owner_not_claiming_slot; /* Struct used for storing slot statistics, for all slots owned by the current shard. */ slotStat slot_stats[CLUSTER_SLOTS]; list *slot_migrations; /* Queue of ongoing slot migrations. */ diff --git a/src/replication.c b/src/replication.c index 90a9e90e24..9abf5cead3 100644 --- a/src/replication.c +++ b/src/replication.c @@ -3768,6 +3768,8 @@ void syncWithPrimary(connection *conn) { } else if (server.repl_state == REPL_STATE_ERROR) { goto error; } + if (server.repl_state != REPL_STATE_SEND_PSYNC) + return; } /* Try a partial resynchronization. If we don't have a cached primary @@ -3908,6 +3910,7 @@ void syncWithPrimary(connection *conn) { server.repl_transfer_tmpfile = NULL; server.repl_transfer_fd = -1; server.repl_state = REPL_STATE_CONNECT; + return; write_error: /* Handle sendCommand() errors. 
*/ serverLog(LL_WARNING, "Sending command to primary in replication handshake: %s", err); From 6e8bdb5b5b3687ee8df65ec8b039a612d51007f0 Mon Sep 17 00:00:00 2001 From: Jacob Murphy Date: Sun, 19 Jan 2025 13:12:18 +0000 Subject: [PATCH 06/18] Refactor to use dedicated sync mechanisms Signed-off-by: Jacob Murphy --- src/aof.c | 8 +- src/blocked.c | 2 +- src/cluster.c | 14 +- src/cluster.h | 5 +- src/cluster_legacy.c | 585 +++++++++++++++++++--------- src/cluster_legacy.h | 65 ++-- src/commands.def | 18 + src/commands/cluster-syncslots.json | 14 + src/db.c | 4 +- src/io_threads.c | 2 +- src/kvstore.c | 18 +- src/kvstore.h | 4 +- src/networking.c | 58 +-- src/rdb.c | 150 +++---- src/rdb.h | 2 +- src/replication.c | 372 +++++++----------- src/server.c | 30 +- src/server.h | 38 +- tests/unit/slot-migration.tcl | 22 ++ 19 files changed, 836 insertions(+), 575 deletions(-) create mode 100644 src/commands/cluster-syncslots.json create mode 100644 tests/unit/slot-migration.tcl diff --git a/src/aof.c b/src/aof.c index ebdf6f1b71..0cd64820c8 100644 --- a/src/aof.c +++ b/src/aof.c @@ -2191,10 +2191,10 @@ static int rewriteFunctions(rio *aof) { return 0; } -int shouldFilterSlot(int slot, void * privdata) { - if (privdata == NULL) return 0; +int slotFilterPredicate(int slot, void * privdata) { + if (privdata == NULL) return 1; unsigned char *slot_bitmap = (unsigned char *)privdata; - return !bitmapTestBit(slot_bitmap, slot); + return bitmapTestBit(slot_bitmap, slot); } int rewriteAppendOnlyFileRio(rio *aof, slotBitmap slot_bitmap) { @@ -2227,7 +2227,7 @@ int rewriteAppendOnlyFileRio(rio *aof, slotBitmap slot_bitmap) { if (slot_bitmap == NULL || isSlotBitmapEmpty(slot_bitmap)) { kvs_it = kvstoreIteratorInit(db->keys); } else { - kvs_it = kvstoreFilteredIteratorInit(db->keys, &shouldFilterSlot, slot_bitmap); + kvs_it = kvstoreFilteredIteratorInit(db->keys, &slotFilterPredicate, slot_bitmap); } /* Iterate this DB writing every entry */ void *next; diff --git a/src/blocked.c b/src/blocked.c index 70da7877ad..9bdab5be8e 100644 --- a/src/blocked.c +++ b/src/blocked.c @@ -101,7 +101,7 @@ void freeClientBlockingState(client *c) { * and will be processed when the client is unblocked. */ void blockClient(client *c, int btype) { /* Replication clients should never be blocked unless pause or module */ - serverAssert(!(c->flag.replication_source && btype != BLOCKED_MODULE && btype != BLOCKED_POSTPONE)); + serverAssert(!(c->flag.replicated && btype != BLOCKED_MODULE && btype != BLOCKED_POSTPONE)); initClientBlockingState(c); diff --git a/src/cluster.c b/src/cluster.c index d7e7be52af..508eddefc6 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -1023,7 +1023,7 @@ getNodeByQuery(client *c, struct serverCommand *cmd, robj **argv, int argc, int /* We handle all the cases as if they were EXEC commands, so we have * a common code path for everything */ - if (cmd->proc == execCommand) { + if (c && cmd->proc == execCommand) { /* If CLIENT_MULTI flag is not set EXEC is just going to return an * error. */ if (!c->flag.multi) return myself; @@ -1040,11 +1040,11 @@ getNodeByQuery(client *c, struct serverCommand *cmd, robj **argv, int argc, int mc.cmd = cmd; } - uint64_t cmd_flags = getCommandFlags(c); + uint64_t cmd_flags = c ? getCommandFlags(c) : cmd->flags; /* Only valid for sharded pubsub as regular pubsub can operate on any node and bypasses this layer. 
*/ int pubsubshard_included = - (cmd_flags & CMD_PUBSUB) || (c->cmd->proc == execCommand && (c->mstate->cmd_flags & CMD_PUBSUB)); + (cmd_flags & CMD_PUBSUB) || (c && c->cmd->proc == execCommand && (c->mstate->cmd_flags & CMD_PUBSUB)); /* Check that all the keys are in the same hash slot, and obtain this * slot and the node associated. */ @@ -1089,7 +1089,7 @@ getNodeByQuery(client *c, struct serverCommand *cmd, robj **argv, int argc, int * can safely serve the request, otherwise we return a TRYAGAIN * error). To do so we set the importing/migrating state and * increment a counter for every missing key. */ - if (clusterNodeIsPrimary(myself) || c->flag.readonly) { + if (clusterNodeIsPrimary(myself) || (c && c->flag.readonly)) { if (n == clusterNodeGetPrimary(myself) && getMigratingSlotDest(slot) != NULL) { migrating_slot = 1; } else if (getImportingSlotSource(slot) != NULL) { @@ -1184,7 +1184,7 @@ getNodeByQuery(client *c, struct serverCommand *cmd, robj **argv, int argc, int * request as "ASKING", we can serve the request. However if the request * involves multiple keys and we don't have them all, the only option is * to send a TRYAGAIN error. */ - if (importing_slot && (c->flag.asking || cmd_flags & CMD_ASKING)) { + if (importing_slot && (c && (c->flag.asking || cmd_flags & CMD_ASKING))) { if (multiple_keys && missing_keys) { if (error_code) *error_code = CLUSTER_REDIR_UNSTABLE; return NULL; @@ -1197,8 +1197,8 @@ getNodeByQuery(client *c, struct serverCommand *cmd, robj **argv, int argc, int * node is a replica and the request is about a hash slot our primary * is serving, we can reply without redirection. */ int is_write_command = - (cmd_flags & CMD_WRITE) || (c->cmd->proc == execCommand && (c->mstate->cmd_flags & CMD_WRITE)); - if ((c->flag.readonly || pubsubshard_included) && !is_write_command && clusterNodeIsReplica(myself) && + (cmd_flags & CMD_WRITE) || (c && c->cmd->proc == execCommand && (c->mstate->cmd_flags & CMD_WRITE)); + if (((c && c->flag.readonly) || pubsubshard_included) && !is_write_command && clusterNodeIsReplica(myself) && clusterNodeGetPrimary(myself) == n) { return myself; } diff --git a/src/cluster.h b/src/cluster.h index 74889422b4..5192bc405e 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -124,7 +124,10 @@ void bitmapSetAllBits(unsigned char *bitmap, int len); int slotBitmapCompare(slotBitmap bitmap, slotBitmap other); int isSlotBitmapEmpty(slotBitmap bitmap); int getSlotOrReply(client *c, robj *o); -void clusterSlotMigrationDoneSyncing(long long initial_offset); +void clusterSlotImportDoneSyncing(long long initial_offset); +void clusterSlotMigrationHandleClientClose(client *c); +void clusterFeedSlotMigration(int dbid, robj **argv, int argc); +int clusterShouldWriteToSlotMigrationTarget(void); /* functions with shared implementations */ int clusterNodeIsMyself(clusterNode *n); diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 636a3ea930..fa0da913b2 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -83,8 +83,8 @@ sds representSlotInfo(sds ci, uint16_t *slot_info_pairs, int slot_info_pairs_cou void clusterFreeNodesSlotsInfo(clusterNode *n); uint64_t clusterGetMaxEpoch(void); int clusterBumpConfigEpochWithoutConsensus(void); -slotMigration *clusterGetCurrentSlotMigration(void); -void clusterSendMigrateSlotStart(clusterNode *node, slotBitmap slot_bitmap); +slotImport *clusterGetCurrentSlotImport(void); +slotExport *clusterGetCurrentSlotExport(void); void moduleCallClusterReceivers(const char *sender_id, uint64_t module_id, uint8_t type, @@ 
-122,7 +122,7 @@ int verifyClusterNodeId(const char *name, int length); sds clusterEncodeOpenSlotsAuxField(int rdbflags); int clusterDecodeOpenSlotsAuxField(int rdbflags, sds s); static int nodeExceedsHandshakeTimeout(clusterNode *node, mstime_t now); -void clusterProceedWithSlotMigration(void); +void clusterProceedWithSlotImport(void); /* Only primaries that own slots have voting rights. * Returns 1 if the node has voting rights, otherwise returns 0. */ @@ -1134,7 +1134,8 @@ void clusterInit(void) { server.cluster->failover_auth_epoch = 0; server.cluster->cant_failover_reason = CLUSTER_CANT_FAILOVER_NONE; server.cluster->lastVoteEpoch = 0; - server.cluster->slot_migrations = listCreate(); + server.cluster->slot_import_jobs = listCreate(); + server.cluster->slot_export_jobs = listCreate(); /* Initialize stats */ for (int i = 0; i < CLUSTERMSG_TYPE_COUNT; i++) { @@ -2573,11 +2574,14 @@ void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoc /* Was this slot mine and it was in a paused state for slot * migration? If so, mark the move as done. */ - if (server.cluster->slots[j] == myself && server.cluster->mf_end && server.cluster->mf_slots_target == sender) { - bitmapClearBit(server.cluster->mf_slots, j); - if (isSlotBitmapEmpty(server.cluster->mf_slots)) { - serverLog(LL_NOTICE, "Slot migration to node %s (%s) has finished. Unpausing myself.", server.cluster->mf_slots_target->name, server.cluster->mf_slots_target->human_nodename); - resetManualFailover(); + slotExport * curr_export = clusterGetCurrentSlotExport(); + if (server.cluster->slots[j] == myself && curr_export && bitmapTestBit(curr_export->slot_bitmap, j)) { + bitmapClearBit(curr_export->slot_bitmap, j); + if (isSlotBitmapEmpty(curr_export->slot_bitmap)) { + serverLog(LL_NOTICE, "Slot migration has finished. Unpausing myself."); + unpauseActions(PAUSE_DURING_SLOT_MIGRATION); + curr_export->state = SLOT_EXPORT_FINISH; + clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_SLOTMIGRATION); } } @@ -3256,20 +3260,6 @@ int clusterProcessPacket(clusterLink *link) { "primary manual failover: %lld", server.cluster->mf_primary_offset); } - /* If we are importing a slot and the slot owner sent its offset - * while already paused, populate the migration state. */ - slotMigration * curr_migration = clusterGetCurrentSlotMigration(); - if (hdr->mflags[0] & CLUSTERMSG_FLAG0_PAUSED && curr_migration != NULL && - curr_migration->state == SLOT_MIGRATION_WAITING_FOR_OFFSET && - curr_migration->source_node == sender) { - curr_migration->pause_primary_offset = sender->repl_offset; - curr_migration->state = SLOT_MIGRATION_SYNCING_TO_OFFSET; - clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_SLOTMIGRATION); - serverLog(LL_NOTICE, - "Received replication offset from paused owner for " - "slot import: %lld", - curr_migration->pause_primary_offset); - } } /* Initial processing of PING and MEET requests replying with a PONG. 
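On the source side, completion is detected from the cluster configuration itself rather than from an explicit acknowledgement: each time the new owner is seen claiming one of the exported slots, that slot's bit is cleared, and an empty bitmap means the whole export is done and the pause can be lifted. A condensed sketch of that check, assuming the bitmap helpers behave as their names suggest:

    #include <stddef.h>

    /* Sketch: called for each slot j the sender now claims; returns 1 when the
     * last exported slot has been handed over and the export can finish. */
    static int sketchSlotHandedOver(unsigned char *export_bitmap, size_t bitmap_len, int j) {
        export_bitmap[j / 8] &= ~(1 << (j & 7));   /* bitmapClearBit(j) */
        for (size_t i = 0; i < bitmap_len; i++) {  /* isSlotBitmapEmpty()  */
            if (export_bitmap[i] != 0) return 0;
        }
        return 1;
    }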
*/ @@ -3724,27 +3714,6 @@ int clusterProcessPacket(clusterLink *link) { uint8_t type = hdr->data.module.msg.type; unsigned char *payload = hdr->data.module.msg.bulk_data; moduleCallClusterReceivers(sender->name, module_id, type, payload, len); - } else if (type == CLUSTERMSG_TYPE_MIGRATE_SLOT_START) { - /* This message is acceptable only if I'm a primary and I own the slot */ - if (!sender) return 1; - for (int i = 0; i <= CLUSTER_SLOTS; i++) { - if (bitmapTestBit(hdr->data.slot_migration.msg.slot_bitmap, i) && server.cluster->slots[i] != myself) return 1; - } - /* Initialize the slot migration state accordingly */ - resetManualFailover(); - server.cluster->mf_end = now + CLUSTER_MF_TIMEOUT; - server.cluster->mf_slots_target = sender; - memcpy(server.cluster->mf_slots, hdr->data.slot_migration.msg.slot_bitmap, sizeof(slotBitmap)); - /* TODO(murphyjacob4) pause subset of slots */ - pauseActions(PAUSE_DURING_FAILOVER, now + (CLUSTER_MF_TIMEOUT * CLUSTER_MF_PAUSE_MULT), - PAUSE_ACTIONS_CLIENT_WRITE_SET); - serverLog(LL_NOTICE, "Slot migration requested by node %.40s (%s). Pausing myself to allow slot takeover.", sender->name, sender->human_nodename); - /* We need to send a ping message to the replica, as it would carry - * `server.cluster->mf_primary_offset`, which means the primary paused clients - * at offset `server.cluster->mf_primary_offset`, so that the replica would - * know that it is safe to set its `server.cluster->mf_can_start` to 1 so as - * to complete failover as quickly as possible. */ - clusterSendPing(link, CLUSTERMSG_TYPE_PING); } else { serverLog(LL_WARNING, "Received unknown packet type: %d", type); } @@ -4445,177 +4414,253 @@ void clusterPropagatePublish(robj *channel, robj *message, int sharded) { * Slot Migration functions * -------------------------------------------------------------------------- */ -slotMigration *clusterCreateSlotMigration(clusterNode *source, slotBitmap slots) { - slotMigration *result = (slotMigration *) zmalloc(sizeof(slotMigration)); +slotImport *clusterCreateSlotImportJob(clusterNode *source, slotBitmap slots) { + slotImport *result = (slotImport *) zcalloc(sizeof(slotImport)); memcpy(result->slot_bitmap, slots, sizeof(slotBitmap)); result->source_node = source; - result->state = SLOT_MIGRATION_QUEUED; - result->end_time = 0; /* Will be set once started. */ - result->replication_connection = NULL; - result->replication_client = NULL; - result->replication_handshake_state = REPL_STATE_NONE; + result->state = SLOT_IMPORT_QUEUED; + result->paused_at_offset = -1; + return result; +} + +slotExport *clusterCreateSlotExportJob(client *c, slotBitmap slots) { + slotExport *result = (slotExport *) zmalloc(sizeof(slotExport)); + memcpy(result->slot_bitmap, slots, sizeof(slotBitmap)); + result->state = SLOT_EXPORT_QUEUED; result->pause_end = 0; - result->pause_primary_offset = -1; + result->client = c; return result; } -void clusterFreeSlotMigration(slotMigration *migration) { - if (migration->replication_client) { - freeClient(migration->replication_client); - } else if (migration->replication_connection) { - connClose(migration->replication_connection); +void clusterFreeSlotImportJob(slotImport *slot_import) { + if (slot_import->client) { + freeClient(slot_import->client); + } else if (slot_import->conn) { + connClose(slot_import->conn); } - zfree(migration); + zfree(slot_import); } -/* Gets the current slot migration from the head of the queue. 
*/ -slotMigration *clusterGetCurrentSlotMigration(void) { - if (listLength(server.cluster->slot_migrations) == 0) return NULL; - return (slotMigration *) listFirst(server.cluster->slot_migrations)->value; +void clusterFreeSlotExportJob(slotExport *slot_export) { + if (slot_export->client) { + freeClient(slot_export->client); + } + zfree(slot_export); } -void clusterSendMigrateSlotStart(clusterNode *node, slotBitmap slot_bitmap) { - if (!node->link) return; +slotImport *clusterGetCurrentSlotImport(void) { + if (listLength(server.cluster->slot_import_jobs) == 0) return NULL; + return (slotImport *) listFirst(server.cluster->slot_import_jobs)->value; +} - uint32_t msglen = sizeof(clusterMsg) - sizeof(union clusterMsgData) + sizeof(clusterMsgSlotMigration); - clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_MIGRATE_SLOT_START, msglen); - clusterMsg *hdr = getMessageFromSendBlock(msgblock); - memcpy(hdr->data.slot_migration.msg.slot_bitmap, slot_bitmap, sizeof(slotBitmap)); - clusterSendMessage(node->link, msgblock); - clusterMsgSendBlockDecrRefCount(msgblock); +slotExport *clusterGetCurrentSlotExport(void) { + if (listLength(server.cluster->slot_export_jobs) == 0) return NULL; + return (slotExport *) listFirst(server.cluster->slot_export_jobs)->value; } -void clusterImportHandler(connection *conn) { - UNUSED(conn); - /* This is called if there is an event on the current migrations - * connection. If that is the case, we can just continue with our - * state machine.*/ - clusterProceedWithSlotMigration(); +void clusterFeedSlotMigration(int dbid, robj **argv, int argc) { + UNUSED(dbid); + int i, slot, error_code; + slotExport *curr_export = clusterGetCurrentSlotExport(); + if (curr_export == NULL || curr_export->state < SLOT_EXPORT_SNAPSHOTTING) { + return; + } + + /* Check the slot this command belongs to. Note that it is not a guarantee + * that the slot of the replicated command is the same as the slot of the + * executed command, for example in the case of module VM_Replicate APIs. + * Because of this case, we need to recomplete the slot lookup completely + * at this time. */ + struct serverCommand *cmd = lookupCommand(argv, argc); + getNodeByQuery(server.current_client, cmd, argv, argc, &slot, &error_code); + if (error_code != CLUSTER_REDIR_NONE || slot == -1) { + /* This shouldn't happen - but is possible if a module does something + * like VM_Replicate a cross-slot command. In that case, we don't have + * a clear way to proceed, so it makes sense to give up. 
*/ + return; + } + if (!bitmapTestBit(curr_export->slot_bitmap, slot)) return; + + unsigned long long prev_pending = curr_export->client->reply_bytes; + addReplyArrayLen(curr_export->client, argc); + for (i = 0; i < argc; i++) { + addReply(curr_export->client, argv[i]); + } + curr_export->syncslot_offset += curr_export->client->reply_bytes - prev_pending; } -void clusterSlotMigrationDoneSyncing(long long initial_offset) { - slotMigration *migration = clusterGetCurrentSlotMigration(); - serverAssert(migration != NULL && migration->state == SLOT_MIGRATION_RECEIVE_SYNC); - migration->state = SLOT_MIGRATION_PAUSE_OWNER; - migration->replication_client->repl_data->reploff = initial_offset; - migration->replication_client->repl_data->read_reploff = initial_offset; - clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_SLOTMIGRATION); +int clusterShouldWriteToSlotMigrationTarget() { + slotExport *curr_export = clusterGetCurrentSlotExport(); + if (curr_export->state != SLOT_EXPORT_PAUSED) { + return 0; + } + return 1; } -/* This is the main state machine for the slot migration workflow. Slot - * migration is driven by the new owner of the slot. This function will do as - * much work as possible synchronously, processing the enqueued slot migrations - * and only returning once we are waiting on some IO. */ -void clusterProceedWithSlotMigration(void) { - server.cluster->todo_before_sleep &= ~CLUSTER_TODO_HANDLE_SLOTMIGRATION; +void clusterSlotMigrationHandleClientClose(client *c) { + if (c->flag.slot_migration_source) { + serverLog(LL_NOTICE, "Connection with slot migration source lost."); + slotImport *import = clusterGetCurrentSlotImport(); + if (import == NULL || import->client != c) return; + import->client = NULL; + clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_SLOTMIGRATION); + } else if (c->flag.slot_migration_target) { + serverLog(LL_NOTICE, "Connection with slot export target lost."); + slotExport *export = clusterGetCurrentSlotExport(); + if (export == NULL || export->client != c) return; + export->client = NULL; + clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_SLOTMIGRATION); + } +} - while (clusterGetCurrentSlotMigration() != NULL) { - listNode *curr_node = listFirst(server.cluster->slot_migrations); - slotMigration *curr_migration = (slotMigration *) curr_node->value; - if (curr_migration->state != SLOT_MIGRATION_QUEUED && curr_migration->end_time < mstime()) { +void clusterImportHandler(connection *conn) { + UNUSED(conn); + clusterProceedWithSlotImport(); +} + +void clusterProceedWithSlotImport(void) { + char *err; + while (clusterGetCurrentSlotImport() != NULL) { + listNode *curr_node = listFirst(server.cluster->slot_import_jobs); + slotImport *curr_import = (slotImport *) curr_node->value; + if (curr_import->state != SLOT_IMPORT_QUEUED && curr_import->end_time < mstime()) { serverLog(LL_WARNING, - "Timed out for slot migration from source node %.40s", curr_migration->source_node->name); - curr_migration->state = SLOT_MIGRATION_FAILED; + "Timed out for slot import from source node %.40s", curr_import->source_node->name); + curr_import->state = SLOT_IMPORT_FAILED; } - if (curr_migration->state > SLOT_MIGRATION_PAUSE_OWNER && curr_migration->state < SLOT_MIGRATION_FAILED && curr_migration->pause_end < mstime()) { + if (curr_import->state > SLOT_IMPORT_PAUSE_OWNER && curr_import->state != SLOT_IMPORT_FAILED && curr_import->pause_end < mstime()) { /* If the owner ever unpauses, we have to move back in the state machine and retry. 
*/ serverLog(LL_WARNING, "Reinitiating pause on the node owning the slot range..."); - curr_migration->state = SLOT_MIGRATION_PAUSE_OWNER; - curr_migration->pause_end = mstime() + CLUSTER_MF_TIMEOUT; + curr_import->state = SLOT_IMPORT_PAUSE_OWNER; + curr_import->pause_end = mstime() + CLUSTER_MF_TIMEOUT; } - switch(curr_migration->state) { - case SLOT_MIGRATION_QUEUED: + if (curr_import->state > SLOT_IMPORT_CONNECTING && curr_import->client == NULL) { + serverLog(LL_WARNING, "Client for slot import from source node %.40s has been closed", curr_import->source_node->name); + curr_import->state = SLOT_IMPORT_FAILED; + } + switch(curr_import->state) { + case SLOT_IMPORT_QUEUED: /* Start the migration */ - serverLog(LL_NOTICE, "Starting replication of slots from migration source node %.40s", curr_migration->source_node->name); - curr_migration->end_time = mstime() + CLUSTER_SLOT_MIGRATION_TIMEOUT; - curr_migration->replication_connection = connCreate(server.tls_replication ? connectionTypeTls() : connectionTypeTcp()); - if (connConnect(curr_migration->replication_connection, curr_migration->source_node->ip, getNodeDefaultReplicationPort(curr_migration->source_node), server.bind_source_addr, clusterImportHandler) == C_ERR) { + serverLog(LL_NOTICE, "Starting slot import from source node %.40s", curr_import->source_node->name); + curr_import->end_time = mstime() + CLUSTER_SLOT_IMPORT_TIMEOUT; + curr_import->conn = connCreate(server.tls_replication ? connectionTypeTls() : connectionTypeTcp()); + if (connConnect(curr_import->conn, curr_import->source_node->ip, getNodeDefaultReplicationPort(curr_import->source_node), server.bind_source_addr, clusterImportHandler) == C_ERR) { serverLog(LL_WARNING, - "Failed to connect to migration source node %.40s", curr_migration->source_node->name); - curr_migration->state = SLOT_MIGRATION_FAILED; + "Failed to connect to slot import source node %.40s", curr_import->source_node->name); + curr_import->state = SLOT_IMPORT_FAILED; continue; } - curr_migration->replication_handshake_state = REPL_STATE_CONNECTING; - curr_migration->state = SLOT_MIGRATION_CONNECTING; + curr_import->state = SLOT_IMPORT_CONNECTING; continue; - case SLOT_MIGRATION_CONNECTING: - if (curr_migration->replication_connection->state == CONN_STATE_CONNECTED) { - curr_migration->state = SLOT_MIGRATION_REPL_HANDSHAKE; + case SLOT_IMPORT_CONNECTING: + if (curr_import->conn->state == CONN_STATE_CONNECTING) { + /* Nothing to do, waiting for connection to be established. */ + return; + } else if (curr_import->conn->state != CONN_STATE_CONNECTED) { + serverLog(LL_WARNING, "Failed during connection to slot import source node %.40s: %s", curr_import->source_node->name, connGetLastError(curr_import->conn)); + curr_import->state = SLOT_IMPORT_FAILED; continue; } - /* Nothing to do, waiting for connection to be established. 
*/ - return; - case SLOT_MIGRATION_REPL_HANDSHAKE: - curr_migration->replication_handshake_state = replicationProceedWithHandshake(curr_migration->replication_connection, curr_migration->replication_handshake_state, curr_migration->slot_bitmap); - if (curr_migration->replication_handshake_state == REPL_STATE_ERROR) { - serverLog(LL_WARNING, "Handshake failed from migration node %.40s", curr_migration->source_node->name); - curr_migration->state = SLOT_MIGRATION_FAILED; + serverLog(LL_NOTICE, "Connected to slot import source node %.40s", curr_import->source_node->name); + connSetReadHandler(curr_import->conn, NULL); + client *c = createClient(curr_import->conn); + curr_import->client = c; + c->flag.replicated = 1; + c->flag.slot_migration_source = 1; + c->flag.authenticated = 1; + c->user = NULL; /* This client can do everything. */ + c->querybuf = sdsempty(); /* Similar to primary, we use a dedicated query buf. */ + initClientReplicationData(c); /* Used to track reploff */ + + curr_import->state = SLOT_IMPORT_SEND_AUTH; + continue; + case SLOT_IMPORT_SEND_AUTH: + if (!server.primary_auth) { + curr_import->state = SLOT_IMPORT_SEND_SYNCSLOTS; continue; } - if (curr_migration->replication_handshake_state == REPL_STATE_SEND_PSYNC) { - curr_migration->state = SLOT_MIGRATION_SEND_SYNC; + char *auth_args[3] = {"AUTH", NULL, NULL}; + size_t auth_lens[3] = {4, 0, 0}; + int argc = 1; + if (server.primary_user) { + auth_args[argc] = server.primary_user; + auth_lens[argc] = strlen(server.primary_user); + argc++; + } + auth_args[argc] = server.primary_auth; + auth_lens[argc] = sdslen(server.primary_auth); + argc++; + err = sendCommandArgv(curr_import->conn, argc, auth_args, auth_lens); + if (err) { + serverLog(LL_WARNING, "Failed to write AUTH to slot migration source: %s", err); + sdsfree(err); + curr_import->state = SLOT_IMPORT_FAILED; continue; } - return; - case SLOT_MIGRATION_SEND_SYNC: + curr_import->state = SLOT_IMPORT_RECEIVE_AUTH; + continue; + case SLOT_IMPORT_RECEIVE_AUTH: + err = receiveSynchronousResponse(curr_import->conn); + if (err == NULL) { + serverLog(LL_WARNING, "Slot migration source did not respond to AUTH command"); + } + if (err[0] == '-') { + serverLog(LL_WARNING, "Unable to AUTH to slot migration source: %s", err); + sdsfree(err); + } + sdsfree(err); + err = NULL; + curr_import->state = SLOT_IMPORT_SEND_SYNCSLOTS; + continue; + case SLOT_IMPORT_SEND_SYNCSLOTS: /* Ensure we have a clean state for the SYNC. */ - dropKeysInSlotBitmap(curr_migration->slot_bitmap, 1); - - /* We are done with our handshake phase. We can proceed straight to doing our SYNC. - * Note that we are skipping PSYNC. PSYNC will always result in full resync for a - * slot migration anyways. - * - * In the future, we can do a PSYNC phase to incorporate dual channel. 
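Because the import link is an ordinary client connection rather than a replication handshake, the target authenticates explicitly before issuing SYNCSLOTS. The AUTH request written by sendCommandArgv is plain RESP; the byte sequences below are a sketch with placeholder credentials, not values from the patch:

    /* Sketch of the AUTH requests on the import connection; "default" and
     * "secret" are placeholder credentials for illustration only. */
    static const char auth_without_user[] =
        "*2\r\n$4\r\nAUTH\r\n$6\r\nsecret\r\n";
    static const char auth_with_user[] =
        "*3\r\n$4\r\nAUTH\r\n$7\r\ndefault\r\n$6\r\nsecret\r\n";

The CLUSTER SYNCSLOTS START request that follows is built the same way, except that its final argument is the raw slotBitmap, so its bulk length is sizeof(slotBitmap) (2048 bytes) rather than a text length.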
*/ - serverLog(LL_NOTICE, "Starting SYNC for slot migration from migration source node %.40s", curr_migration->source_node->name); - if (connSyncWrite(curr_migration->replication_connection, "SYNC\r\n", 6, server.repl_syncio_timeout * 1000) == -1) { - serverLog(LL_WARNING, "I/O error writing to slot migration source: %s", connGetLastError(curr_migration->replication_connection)); - curr_migration->state = SLOT_MIGRATION_FAILED; + dropKeysInSlotBitmap(curr_import->slot_bitmap, 1); + + serverLog(LL_NOTICE, "Sending CLUSTER SYNCSLOTS START request to source node %.40s", curr_import->source_node->name); + char *syncslots_args[4] = {"CLUSTER", "SYNCSLOTS", "START", (char *)curr_import->slot_bitmap}; + size_t syncslots_lens[4] = {7, 9, 5, sizeof(slotBitmap)}; + err = sendCommandArgv(curr_import->conn, 4, syncslots_args, syncslots_lens); + if (err) { + serverLog(LL_WARNING, "Failed to write SYNCSLOTS START to slot migration source: %s", err); + sdsfree(err); + curr_import->state = SLOT_IMPORT_FAILED; continue; } - client *c = createClient(curr_migration->replication_connection); - curr_migration->replication_client = c; - c->flag.replication_source = 1; - c->flag.slot_migration_source = 1; - c->flag.authenticated = 1; - c->user = NULL; /* This client can do everything. */ - initClientReplicationData(c); /* We use this to track offset. */ - c->querybuf = sdsempty(); /* Similar to primary, we use a dedicated query buf. */ /* Our result will be received in AOF format, so we can pipe it * straight to readQueryFromClient. */ - connSetReadHandler(c->conn, readQueryFromClient); - curr_migration->state = SLOT_MIGRATION_RECEIVE_SYNC; - continue; - case SLOT_MIGRATION_RECEIVE_SYNC: - return; /* Nothing to do */ - case SLOT_MIGRATION_PAUSE_OWNER: - /* Send an ACK to put the connection into streaming state. */ - replicationSendAck(curr_migration->replication_client); - - serverLog(LL_NOTICE, "Replication sync to slot owner %.40s has been performed. Current replication offset: %lld. Pausing source node and waiting to continue", curr_migration->source_node->name, curr_migration->replication_client->repl_data->reploff); - clusterSendMigrateSlotStart(curr_migration->source_node, curr_migration->slot_bitmap); - curr_migration->pause_primary_offset = -1; - curr_migration->pause_end = mstime() + CLUSTER_MF_TIMEOUT; - curr_migration->state = SLOT_MIGRATION_WAITING_FOR_OFFSET; + connSetReadHandler(curr_import->client->conn, readQueryFromClient); + curr_import->state = SLOT_IMPORT_RECEIVE_SYNCSLOTS; + case SLOT_IMPORT_RECEIVE_SYNCSLOTS: + /* Nothing to do in this state. Waiting for CLUSTER SYNCSLOTS END to be processed. */ + return; + case SLOT_IMPORT_PAUSE_OWNER: + curr_import->client->flag.replication_force_reply = 1; + addReplyArrayLen(curr_import->client, 3); + addReplyBulkCBuffer(curr_import->client, "CLUSTER", 7); + addReplyBulkCBuffer(curr_import->client, "SYNCSLOTS", 9); + addReplyBulkCBuffer(curr_import->client, "PAUSE", 5); + curr_import->client->flag.replication_force_reply = 0; + + serverLog(LL_NOTICE, "SYNCSLOTS from slot migration source %.40s (%s) has been performed. 
Pausing source node and waiting to continue", curr_import->source_node->name, curr_import->source_node->human_nodename); + curr_import->paused_at_offset = -1; + curr_import->pause_end = mstime() + CLUSTER_MF_TIMEOUT; + curr_import->state = SLOT_IMPORT_WAITING_FOR_OFFSET; continue; - case SLOT_MIGRATION_WAITING_FOR_OFFSET: - /* Send REPLCONF ACK from time to time */ - replicationSendAck(curr_migration->replication_client); + case SLOT_IMPORT_WAITING_FOR_OFFSET: return; - case SLOT_MIGRATION_SYNCING_TO_OFFSET: - /* Send REPLCONF ACK from time to time */ - replicationSendAck(curr_migration->replication_client); - if (curr_migration->replication_client->repl_data->reploff >= curr_migration->pause_primary_offset) { - serverLog(LL_NOTICE, "Replication of slot range has caught up to paused slot owner (%lld), slot migration can start.", curr_migration->pause_primary_offset); - curr_migration->state = SLOT_MIGRATION_FINISH; + case SLOT_IMPORT_SYNCING_TO_OFFSET: + if (curr_import->client->repl_data->reploff >= curr_import->paused_at_offset) { + serverLog(LL_NOTICE, "SYNCSLOTS of slot range has caught up to paused slot owner (%lld), slot migration can start.", curr_import->paused_at_offset); + curr_import->state = SLOT_IMPORT_FINISH; continue; } /* Need to wait for the sync to progress further */ return; - case SLOT_MIGRATION_FINISH: + case SLOT_IMPORT_FINISH: serverLog(LL_NOTICE, "Setting myself to owner of migrating slots and broadcasting"); for (int i = 0; i < CLUSTER_SLOTS; i++) { - if (bitmapTestBit(curr_migration->slot_bitmap, i)) { + if (bitmapTestBit(curr_import->slot_bitmap, i)) { clusterDelSlot(i); clusterAddSlot(myself, i); } @@ -4625,20 +4670,101 @@ void clusterProceedWithSlotMigration(void) { if (clusterBumpConfigEpochWithoutConsensus() == C_OK) { serverLog(LL_NOTICE, "ConfigEpoch updated after importing slots"); } - clusterFreeSlotMigration(curr_migration); + clusterFreeSlotImportJob(curr_import); clusterBroadcastPong(CLUSTER_BROADCAST_ALL); - listDelNode(server.cluster->slot_migrations, curr_node); + listDelNode(server.cluster->slot_import_jobs, curr_node); continue; - case SLOT_MIGRATION_FAILED: - /* Delete the migration from the queue and proceed to the next migration */ - listDelNode(server.cluster->slot_migrations, curr_node); - dropKeysInSlotBitmap(curr_migration->slot_bitmap, server.repl_replica_lazy_flush); - clusterFreeSlotMigration(curr_migration); + case SLOT_IMPORT_REPLICA_TRACKING: + /* As a replica, we will simply apply the primaries updates + * from the slot migration source. However, if we are ever + * promoted to primary, we need to fail the migration to + * prevent leaked keys in the importing slots. 
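The cut-over decision is purely offset based: once the bytes applied from the export stream reach the offset the source reported in PAUSEDAT, every write that landed before the pause has been replayed, and the target may claim the slots. A one-function sketch of that condition, using the two counters tracked above:

    /* Sketch: applied_offset is the import client's applied offset from the
     * export stream, paused_at_offset is the value from CLUSTER SYNCSLOTS
     * PAUSEDAT (or -1 if not yet received). Returns 1 when it is safe to take
     * slot ownership. */
    static int sketchSlotImportCaughtUp(long long applied_offset, long long paused_at_offset) {
        if (paused_at_offset < 0) return 0;
        return applied_offset >= paused_at_offset;
    }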
*/ + if (clusterNodeIsPrimary(myself)) { + serverLog(LL_WARNING, "Promoted to primary during slot migration, failing the ongoing migration"); + curr_import->state = SLOT_IMPORT_FAILED; + continue; + } + return; + case SLOT_IMPORT_FAILED: + listDelNode(server.cluster->slot_import_jobs, curr_node); + dropKeysInSlotBitmap(curr_import->slot_bitmap, server.repl_replica_lazy_flush); + clusterFreeSlotImportJob(curr_import); continue; } } } +int childSnapshotForSyncSlot(int req, rio *rdb, void *privdata) { + UNUSED(req); + int retval = rewriteAppendOnlyFileRio(rdb, (unsigned char *) privdata); + rioWrite(rdb, "*3\r\n", 4); + rioWriteBulkString(rdb, "CLUSTER", 7); + rioWriteBulkString(rdb, "SYNCSLOTS", 9); + rioWriteBulkString(rdb, "END", 3); + return retval; +} + +void clusterProceedWithSlotExport(void) { + while (clusterGetCurrentSlotExport() != NULL) { + listNode *curr_node = listFirst(server.cluster->slot_export_jobs); + slotExport *curr_export = (slotExport *) curr_node->value; + if (curr_export->client == NULL) { + serverLog(LL_WARNING, "Client for slot export has been closed"); + curr_export->state = SLOT_EXPORT_FAILED; + } + switch(curr_export->state) { + case SLOT_EXPORT_QUEUED: + if (hasActiveChildProcess()) { + /* We need to wait for the child to die, then we can + * proceed. */ + return; + } + connection ** conns = zmalloc(sizeof(connection*)); + *conns = curr_export->client->conn; + if (saveSnapshotToConnectionSockets(conns, 1, 1, 0, childSnapshotForSyncSlot, curr_export->slot_bitmap) != C_OK) { + serverLog(LL_WARNING, "Failed to start slot export to target"); + curr_export->state = SLOT_EXPORT_FAILED; + continue; + } + curr_export->state = SLOT_EXPORT_SNAPSHOTTING; + continue; + case SLOT_EXPORT_SNAPSHOTTING: + /* During this time, we are waiting for SYNCSLOTS PAUSE to + * start flushing the accumulated backlog. */ + return; + case SLOT_EXPORT_PAUSE_AND_REPLY: + addReplyArrayLen(curr_export->client, 4); + addReplyBulkCBuffer(curr_export->client, "CLUSTER", 7); + addReplyBulkCBuffer(curr_export->client, "SYNCSLOTS", 9); + addReplyBulkCBuffer(curr_export->client, "PAUSEDAT", 8); + addReplyLongLong(curr_export->client, curr_export->syncslot_offset); + + curr_export->pause_end = mstime() + (CLUSTER_MF_TIMEOUT * CLUSTER_MF_PAUSE_MULT); + pauseActions(PAUSE_DURING_SLOT_MIGRATION, curr_export->pause_end, PAUSE_ACTIONS_CLIENT_WRITE_SET); + + curr_export->state = SLOT_EXPORT_PAUSED; + continue; + case SLOT_EXPORT_PAUSED: + /* */ + case SLOT_EXPORT_FINISH: + case SLOT_EXPORT_FAILED: + listDelNode(server.cluster->slot_export_jobs, curr_node); + clusterFreeSlotExportJob(curr_export); + continue; + } + } +} + + +/* This is the main state machine for the slot migration workflow. Slot + * migration is mostly driven by the new owner of the slot (target node). These + * functions will do as much work as possible synchronously, processing the + * enqueued slot migrations and only returning once we are waiting on some IO. */ +void clusterProceedWithSlotMigration(void) { + server.cluster->todo_before_sleep &= ~CLUSTER_TODO_HANDLE_SLOTMIGRATION; + clusterProceedWithSlotImport(); + clusterProceedWithSlotExport(); +} /* ----------------------------------------------------------------------------- * REPLICA node specific functions @@ -5286,7 +5412,7 @@ void manualFailoverCanStart(void) { * The function can be used both to initialize the manual failover state at * startup or to abort a manual failover in progress. 
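childSnapshotForSyncSlot above ends the AOF-formatted snapshot with a literal CLUSTER SYNCSLOTS END command, so the target detects end-of-snapshot through its normal command parser. Assuming rioWriteBulkString emits a standard RESP bulk ($<len>\r\n<payload>\r\n), the trailer appended after the filtered AOF payload is:

    /* Trailer written by the snapshot child after the filtered AOF payload;
     * parsing this command moves the import job to SLOT_IMPORT_PAUSE_OWNER. */
    static const char syncslots_end_trailer[] =
        "*3\r\n"
        "$7\r\nCLUSTER\r\n"
        "$9\r\nSYNCSLOTS\r\n"
        "$3\r\nEND\r\n";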
*/ void resetManualFailover(void) { - if (server.cluster->mf_replica || server.cluster->mf_slots_target) { + if (server.cluster->mf_replica) { /* We were a primary failing over, so we paused clients and related actions. * Regardless of the outcome we unpause now to allow traffic again. */ unpauseActions(PAUSE_DURING_FAILOVER); @@ -5295,8 +5421,6 @@ void resetManualFailover(void) { server.cluster->mf_can_start = 0; server.cluster->mf_replica = NULL; server.cluster->mf_primary_offset = -1; - memset(server.cluster->mf_slots, 0, sizeof(server.cluster->mf_slots)); - server.cluster->mf_slots_target = NULL; } /* If a manual failover timed out, abort it. */ @@ -6592,6 +6716,7 @@ void removeChannelsInSlot(unsigned int slot) { /* Remove all the keys in the specified hash slot. * The number of removed items is returned. */ +// TODO(murphyjacob4) - can we just use this? unsigned int delKeysInSlot(unsigned int hashslot) { if (!countKeysInSlot(hashslot)) return 0; @@ -7426,10 +7551,106 @@ int clusterCommandSpecial(client *c) { } } - slotMigration * to_enqueue = clusterCreateSlotMigration(curr_owner, requested_slots); - listAddNodeTail(server.cluster->slot_migrations, to_enqueue); - clusterProceedWithSlotMigration(); + slotImport * to_enqueue = clusterCreateSlotImportJob(curr_owner, requested_slots); + listAddNodeTail(server.cluster->slot_import_jobs, to_enqueue); + clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_SLOTMIGRATION); addReply(c, shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr, "syncslots")) { + if (c->argc < 3) { + addReplyError(c, "SYNCSLOTS command requires either START or END to be provided."); + return 1; + } + if (!strcasecmp(c->argv[2]->ptr, "start")) { + /* CLUSTER SYNCSLOTS START */ + if (c->argc != 4) { + addReplyError(c, "CLUSTER SYNCSLOTS START command requires exactly one argument"); + return 1; + } + if (sdslen(c->argv[3]->ptr) != sizeof(slotBitmap)) { + addReplyError(c, "Invalid slot bitmap length"); + return 1; + } + c->flag.slot_migration_target = 1; + initClientReplicationData(c); + slotExport *job = clusterCreateSlotExportJob(c, c->argv[2]->ptr); + listAddNodeTail(server.cluster->slot_export_jobs, job); + clusterProceedWithSlotMigration(); + } else if (!strcasecmp(c->argv[2]->ptr, "inform")) { + /* CLUSTER SYNCSLOTS INFORM */ + if (c->argc != 4) { + addReplyError(c, "CLUSTER SYNCSLOTS INFORM command requires exactly one argument"); + return 1; + } + slotImport * to_enqueue = clusterCreateSlotImportJob(NULL, c->argv[2]->ptr); + to_enqueue->state = SLOT_IMPORT_REPLICA_TRACKING; + } else if (!strcasecmp(c->argv[2]->ptr, "end")) { + /* CLUSTER SYNCSLOTS END */ + if (c->argc != 3) { + addReplyError(c, "CLUSTER SYNCSLOTS END does not expect any arguments."); + return 1; + } + slotImport *curr_import = clusterGetCurrentSlotImport(); + if (!curr_import || (curr_import->state != SLOT_IMPORT_RECEIVE_SYNCSLOTS && curr_import->state != SLOT_IMPORT_REPLICA_TRACKING)) { + addReplyError(c, "No ongoing CLUSTER SYNCSLOTS to end."); + return 1; + } + if (curr_import->state != SLOT_IMPORT_REPLICA_TRACKING) { + /* Replicas will also receive this command through the replication + * stream, but it is not actionable. 
*/ + return 1; + } + if (curr_import->client != c) { + addReplyError(c, "This client is not the one that initiated the ongoing CLUSTER SYNCSLOTS."); + } + curr_import->state = SLOT_IMPORT_PAUSE_OWNER; + clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_SLOTMIGRATION); + addReply(c, shared.ok); + } else if (!strcasecmp(c->argv[2]->ptr, "pause")) { + /* CLUSTER SYNCSLOTS PAUSE */ + if (c->argc != 3) { + addReplyError(c, "CLUSTER SYNCSLOTS PAUSE does not expect any arguments."); + return 1; + } + slotExport *slot_export = clusterGetCurrentSlotExport(); + if (!slot_export) { + addReplyError(c, "No ongoing CLUSTER SYNCSLOTS to pause."); + return 1; + } + if (slot_export->state == SLOT_EXPORT_PAUSED) { + serverLog(LL_NOTICE, "Pause retriggered by target during slot migration."); + } else if (slot_export->state != SLOT_EXPORT_SNAPSHOTTING) { + addReplyError(c, "SYNCSLOTS is not in the correct state for this command."); + return 1; + } else { + /* First pause. We want to flush the output buffer that was not allowed to + * flush during the snapshot. */ + putClientInPendingWriteQueue(slot_export->client); + } + + slot_export->state = SLOT_EXPORT_PAUSE_AND_REPLY; + clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_SLOTMIGRATION); + } else if (!strcasecmp(c->argv[2]->ptr, "pausedat")) { + /* CLUSTER SYNCSLOTS PAUSEDAT */ + if (c->argc != 4) { + addReplyError(c, "CLUSTER SYNCSLOTS PAUSEDAT command requires exactly one argument."); + return 1; + } + slotImport *slot_import = clusterGetCurrentSlotImport(); + if (!slot_import || slot_import->state != SLOT_IMPORT_WAITING_FOR_OFFSET) { + addReplyError(c, "No CLUSTER SYNCSLOTS is waiting for a PAUSEDAT response."); + return 1; + } + long long offset; + if (getLongLongFromObject(c->argv[3]->ptr, &offset) != C_OK) { + addReplyError(c, "Failed to parse PAUSEDAT offset."); + return 1; + } + slot_import->paused_at_offset = offset; + slot_import->state = SLOT_IMPORT_SYNCING_TO_OFFSET; + clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_SLOTMIGRATION); + } else { + addReplyError(c, "Unknown subcommand for CLUSTER SYNCSLOTS."); + } } else { return 0; } @@ -7474,6 +7695,12 @@ const char **clusterCommandExtendedHelp(void) { " Output format is an array where each array element is a map containing attributes of a link", "MIGRATE SLOTSRANGE [ ...] SHARD ", " Initiate server driven slot migration of all slot ranges to the designated shard.", + "SYNCSLOTS [START |END|INFORM |PAUSE|PAUSEDAT]", + " Internal command. SYNCSLOTS START initiates send of an AOF formatted snapshot containing the", + " provided slot bitmap. SYNCSLOTS END terminates the AOF formatted snapshot, and after this", + " SYNCSLOTS PAUSE signals for this node to be paused and for a continuous stream of commands" + " for the slots to be replicated. SYNCSLOTS PAUSEDAT will be replied with the offset of remaining" + " commands. SYNCSLOTS INFORM is used to inform replicas that the operation is occurring.", NULL}; return help; diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index 0a97ca37ad..9eda033bda 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -10,7 +10,7 @@ #define CLUSTER_MF_TIMEOUT 5000 /* Milliseconds to do a manual failover. */ #define CLUSTER_MF_PAUSE_MULT 2 /* Primary pause manual failover mult. */ #define CLUSTER_REPLICA_MIGRATION_DELAY 5000 /* Delay for replica migration. */ -#define CLUSTER_SLOT_MIGRATION_TIMEOUT 30000 /* Milliseconds to do a slot migration. */ +#define CLUSTER_SLOT_IMPORT_TIMEOUT 30000 /* Milliseconds to do a slot migration. */ /* Reasons why a replica is not able to failover. 
*/ #define CLUSTER_CANT_FAILOVER_NONE 0 @@ -376,32 +376,50 @@ struct _clusterNode { Update with updateAndCountChangedNodeHealth(). */ }; -typedef enum slotMigrationState { - SLOT_MIGRATION_QUEUED, - SLOT_MIGRATION_CONNECTING, - SLOT_MIGRATION_REPL_HANDSHAKE, /* The handshake has it's own state machine, - * see replicationProceedWithHandshake */ - SLOT_MIGRATION_SEND_SYNC, - SLOT_MIGRATION_RECEIVE_SYNC, - SLOT_MIGRATION_PAUSE_OWNER, - SLOT_MIGRATION_WAITING_FOR_OFFSET, - SLOT_MIGRATION_SYNCING_TO_OFFSET, - SLOT_MIGRATION_FINISH, - SLOT_MIGRATION_FAILED, -} slotMigrationState; - -typedef struct slotMigration { +typedef enum slotImportState { + SLOT_IMPORT_QUEUED, + SLOT_IMPORT_REPLICA_TRACKING, /* Replicas track the slot import as well */ + SLOT_IMPORT_CONNECTING, + SLOT_IMPORT_SEND_AUTH, + SLOT_IMPORT_RECEIVE_AUTH, + SLOT_IMPORT_SEND_SYNCSLOTS, + SLOT_IMPORT_RECEIVE_SYNCSLOTS, + SLOT_IMPORT_PAUSE_OWNER, + SLOT_IMPORT_WAITING_FOR_OFFSET, + SLOT_IMPORT_SYNCING_TO_OFFSET, + SLOT_IMPORT_FINISH, + SLOT_IMPORT_FAILED, +} slotImportState; + +typedef struct slotImport { slotBitmap slot_bitmap; - slotMigrationState state; + slotImportState state; clusterNode *source_node; mstime_t end_time; /* Slot migration time limit (ms unixtime). If not yet in progress (e.g. queued), will be zero. */ - connection *replication_connection; /* Connection for replication. */ - client *replication_client; /* Client for replication */ - int replication_handshake_state; + connection *conn; + client *client; mstime_t pause_end; - long long pause_primary_offset; -} slotMigration; + long long syncslots_offset; + long long paused_at_offset; +} slotImport; + +typedef enum slotExportState { + SLOT_EXPORT_QUEUED, + SLOT_EXPORT_SNAPSHOTTING, + SLOT_EXPORT_PAUSE_AND_REPLY, + SLOT_EXPORT_PAUSED, + SLOT_EXPORT_FINISH, + SLOT_EXPORT_FAILED, +} slotExportState; + +typedef struct slotExport { + slotBitmap slot_bitmap; + slotExportState state; + client *client; /* Client for replication */ + unsigned long long syncslot_offset; + mstime_t pause_end; +} slotExport; /* Struct used for storing slot statistics. */ typedef struct slotStat { @@ -464,7 +482,8 @@ struct clusterState { slotBitmap owner_not_claiming_slot; /* Struct used for storing slot statistics, for all slots owned by the current shard. */ slotStat slot_stats[CLUSTER_SLOTS]; - list *slot_migrations; /* Queue of ongoing slot migrations. */ + list *slot_import_jobs; /* Queue of ongoing slot imports (we are the target). */ + list *slot_export_jobs; /* Queue of ongoing slot exports (we are the source). 
*/ }; #endif // CLUSTER_LEGACY_H diff --git a/src/commands.def b/src/commands.def index 0e54094821..f0a1183e5a 100644 --- a/src/commands.def +++ b/src/commands.def @@ -1021,6 +1021,23 @@ const char *CLUSTER_SLOTS_Tips[] = { #define CLUSTER_SLOTS_Keyspecs NULL #endif +/********** CLUSTER SYNCSLOTS ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* CLUSTER SYNCSLOTS history */ +#define CLUSTER_SYNCSLOTS_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* CLUSTER SYNCSLOTS tips */ +#define CLUSTER_SYNCSLOTS_Tips NULL +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* CLUSTER SYNCSLOTS key specs */ +#define CLUSTER_SYNCSLOTS_Keyspecs NULL +#endif + /* CLUSTER command table */ struct COMMAND_STRUCT CLUSTER_Subcommands[] = { {MAKE_CMD("addslots","Assigns new hash slots to a node.","O(N) where N is the total number of hash slot arguments","3.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_ADDSLOTS_History,0,CLUSTER_ADDSLOTS_Tips,0,clusterCommand,-3,CMD_NO_ASYNC_LOADING|CMD_ADMIN|CMD_STALE,0,CLUSTER_ADDSLOTS_Keyspecs,0,NULL,1),.args=CLUSTER_ADDSLOTS_Args}, @@ -1053,6 +1070,7 @@ struct COMMAND_STRUCT CLUSTER_Subcommands[] = { {MAKE_CMD("slaves","Lists the replica nodes of a primary node.","O(N) where N is the number of replicas.","3.0.0",CMD_DOC_DEPRECATED,"`CLUSTER REPLICAS`","5.0.0","cluster",COMMAND_GROUP_CLUSTER,CLUSTER_SLAVES_History,0,CLUSTER_SLAVES_Tips,1,clusterCommand,3,CMD_ADMIN|CMD_STALE,0,CLUSTER_SLAVES_Keyspecs,0,NULL,1),.args=CLUSTER_SLAVES_Args}, {MAKE_CMD("slot-stats","Return an array of slot usage statistics for slots assigned to the current node.","O(N) where N is the total number of slots based on arguments. O(N*log(N)) with ORDERBY subcommand.","8.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_SLOT_STATS_History,0,CLUSTER_SLOT_STATS_Tips,2,clusterSlotStatsCommand,-4,CMD_STALE|CMD_LOADING,0,CLUSTER_SLOT_STATS_Keyspecs,0,NULL,1),.args=CLUSTER_SLOT_STATS_Args}, {MAKE_CMD("slots","Returns the mapping of cluster slots to nodes.","O(N) where N is the total number of Cluster nodes","3.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_SLOTS_History,2,CLUSTER_SLOTS_Tips,1,clusterCommand,2,CMD_LOADING|CMD_STALE,0,CLUSTER_SLOTS_Keyspecs,0,NULL,0)}, +{MAKE_CMD("syncslots","An internal command used in slot migration.",NULL,"8.1.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_SYNCSLOTS_History,0,CLUSTER_SYNCSLOTS_Tips,0,clusterCommand,-2,CMD_ADMIN|CMD_STALE,0,CLUSTER_SYNCSLOTS_Keyspecs,0,NULL,0)}, {0} }; diff --git a/src/commands/cluster-syncslots.json b/src/commands/cluster-syncslots.json new file mode 100644 index 0000000000..2d23903ac4 --- /dev/null +++ b/src/commands/cluster-syncslots.json @@ -0,0 +1,14 @@ +{ + "SYNCSLOTS": { + "summary": "An internal command used in slot migration.", + "group": "cluster", + "since": "8.1.0", + "arity": -2, + "container": "CLUSTER", + "function": "clusterCommand", + "command_flags": [ + "ADMIN", + "STALE" + ] + } +} diff --git a/src/db.c b/src/db.c index 134dc6e9dd..905f5c6120 100644 --- a/src/db.c +++ b/src/db.c @@ -258,7 +258,7 @@ int getKeySlot(sds key) { * so we must always recompute the slot for commands coming from the primary. 
*/ if (server.current_client && server.current_client->slot >= 0 && server.current_client->flag.executing_command && - !server.current_client->flag.primary) { + !server.current_client->flag.replicated) { debugServerAssertWithInfo(server.current_client, NULL, (int)keyHashSlot(key, (int)sdslen(key)) == server.current_client->slot); return server.current_client->slot; @@ -267,7 +267,7 @@ int getKeySlot(sds key) { /* For the case of replicated commands from primary, getNodeByQuery() never gets called, * and thus c->slot never gets populated. That said, if this command ends up accessing a key, * we are able to backfill c->slot here, where the key's hash calculation is made. */ - if (server.current_client && server.current_client->flag.primary) { + if (server.current_client && server.current_client->flag.replicated) { server.current_client->slot = slot; } return slot; diff --git a/src/io_threads.c b/src/io_threads.c index 715251a06a..93fe56fdb9 100644 --- a/src/io_threads.c +++ b/src/io_threads.c @@ -345,7 +345,7 @@ int trySendReadToIOThreads(client *c) { c->cur_tid = tid; c->read_flags = canParseCommand(c) ? 0 : READ_FLAGS_DONT_PARSE; c->read_flags |= authRequired(c) ? READ_FLAGS_AUTH_REQUIRED : 0; - c->read_flags |= c->flag.replication_source ? READ_FLAGS_REPLICATION_SOURCE : 0; + c->read_flags |= c->flag.replicated ? READ_FLAGS_REPLICATED : 0; c->io_read_state = CLIENT_PENDING_IO; connSetPostponeUpdateState(c->conn, 1); diff --git a/src/kvstore.c b/src/kvstore.c index ef4b90af73..f1ed085c43 100644 --- a/src/kvstore.c +++ b/src/kvstore.c @@ -74,7 +74,7 @@ struct _kvstoreIterator { kvstore *kvs; long long didx; long long next_didx; - kvstoreIteratorFilter *filter; + kvstoreIteratorPredicate *predicate; void *filter_privdata; hashtableIterator di; }; @@ -600,19 +600,22 @@ kvstoreIterator *kvstoreIteratorInit(kvstore *kvs) { kvs_it->kvs = kvs; kvs_it->didx = -1; kvs_it->next_didx = kvstoreGetFirstNonEmptyHashtableIndex(kvs_it->kvs); /* Finds first non-empty hashtable index. 
*/ - kvs_it->filter = NULL; + kvs_it->predicate = NULL; kvs_it->filter_privdata = NULL; hashtableInitSafeIterator(&kvs_it->di, NULL); return kvs_it; } /* Returns kvstore iterator that filters out hash tables based on the predicate.*/ -kvstoreIterator *kvstoreFilteredIteratorInit(kvstore *kvs, kvstoreIteratorFilter *filter, void *privdata) { +kvstoreIterator *kvstoreFilteredIteratorInit(kvstore *kvs, kvstoreIteratorPredicate *predicate, void *privdata) { kvstoreIterator *kvs_it = zmalloc(sizeof(*kvs_it)); kvs_it->kvs = kvs; kvs_it->didx = -1; kvs_it->next_didx = kvstoreGetFirstNonEmptyHashtableIndex(kvs_it->kvs); - kvs_it->filter = filter; + while (kvs_it->next_didx != -1 && predicate && !predicate(kvs_it->next_didx, privdata)) { + kvs_it->next_didx = kvstoreGetNextNonEmptyHashtableIndex(kvs_it->kvs, kvs_it->next_didx); + } + kvs_it->predicate = predicate; kvs_it->filter_privdata = privdata; hashtableInitSafeIterator(&kvs_it->di, NULL); return kvs_it; @@ -640,11 +643,10 @@ static hashtable *kvstoreIteratorNextHashtable(kvstoreIterator *kvs_it) { freeHashtableIfNeeded(kvs_it->kvs, kvs_it->didx); } + kvs_it->didx = kvs_it->next_didx; do { - kvs_it->didx = kvs_it->next_didx; - if (kvs_it->didx == -1) return NULL; - kvs_it->next_didx = kvstoreGetNextNonEmptyHashtableIndex(kvs_it->kvs, kvs_it->didx); - } while (kvs_it->filter && kvs_it->filter(kvs_it->didx, kvs_it->filter_privdata)); + kvs_it->next_didx = kvstoreGetNextNonEmptyHashtableIndex(kvs_it->kvs, kvs_it->next_didx); + } while (kvs_it->next_didx != -1 && kvs_it->predicate && !kvs_it->predicate(kvs_it->didx, kvs_it->filter_privdata)); return kvs_it->kvs->hashtables[kvs_it->didx]; } diff --git a/src/kvstore.h b/src/kvstore.h index 668b0ae23e..a79caf23aa 100644 --- a/src/kvstore.h +++ b/src/kvstore.h @@ -10,7 +10,7 @@ typedef struct _kvstoreHashtableIterator kvstoreHashtableIterator; typedef int(kvstoreScanShouldSkipHashtable)(hashtable *d); typedef int(kvstoreExpandShouldSkipHashtableIndex)(int didx); -typedef int(kvstoreIteratorFilter)(int didx, void *privdata); +typedef int(kvstoreIteratorPredicate)(int didx, void *privdata); #define KVSTORE_ALLOCATE_HASHTABLES_ON_DEMAND (1 << 0) #define KVSTORE_FREE_EMPTY_HASHTABLES (1 << 1) @@ -47,7 +47,7 @@ size_t kvstoreHashtableMetadataSize(void); /* kvstore iterator specific functions */ kvstoreIterator *kvstoreIteratorInit(kvstore *kvs); -kvstoreIterator *kvstoreFilteredIteratorInit(kvstore *kvs, kvstoreIteratorFilter *filter, void *privdata); +kvstoreIterator *kvstoreFilteredIteratorInit(kvstore *kvs, kvstoreIteratorPredicate *filter, void *privdata); void kvstoreIteratorRelease(kvstoreIterator *kvs_it); int kvstoreIteratorGetCurrentHashtableIndex(kvstoreIterator *kvs_it); int kvstoreIteratorNext(kvstoreIterator *kvs_it, void **next); diff --git a/src/networking.c b/src/networking.c index 5c31ac4562..c2828d384a 100644 --- a/src/networking.c +++ b/src/networking.c @@ -290,7 +290,9 @@ int prepareClientToWrite(client *c) { /* Replication sources don't receive replies, unless force reply flag * is set. */ - if ((c->flag.replication_source) && !c->flag.replication_force_reply) return C_ERR; + if ((c->flag.replicated) && !c->flag.replication_force_reply) return C_ERR; + + if ((c->flag.slot_migration_target && !clusterShouldWriteToSlotMigrationTarget())) return C_ERR; /* Skip the fake client, such as the fake client for AOF loading. 
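With the rename to kvstoreIteratorPredicate the callback convention is also flipped: the predicate now returns nonzero for hashtable indexes that should be visited, and both the filtered init and the next-table step skip indexes it rejects. A small standalone sketch of iteration under such a predicate (a toy walker, not the kvstore code itself):

    #include <stdio.h>

    typedef int (iterPredicate)(int didx, void *privdata);

    /* Visit only the indexes the predicate accepts, mirroring how the filtered
     * kvstore iterator skips hashtables whose slot is outside the bitmap. */
    static void visitFiltered(int nindexes, iterPredicate *pred, void *privdata) {
        for (int didx = 0; didx < nindexes; didx++) {
            if (pred && !pred(didx, privdata)) continue; /* nonzero == keep */
            printf("visiting index %d\n", didx);
        }
    }

    /* Example predicate: keep only even indexes. */
    static int keepEven(int didx, void *privdata) {
        (void)privdata;
        return (didx % 2) == 0;
    }

    int main(void) {
        visitFiltered(8, keepEven, NULL);
        return 0;
    }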
* But CLIENT_ID_CACHED_RESPONSE is allowed since it is a fake client @@ -1602,7 +1604,7 @@ void clearClientConnectionState(client *c) { c->flag.replica = 0; } - serverAssert(!(c->flag.replica || c->flag.replication_source)); + serverAssert(!(c->flag.replica || c->flag.replicated)); if (c->flag.tracking) disableTracking(c); selectDb(c, 0); @@ -1681,6 +1683,10 @@ void freeClient(client *c) { } } + if (c->flag.slot_migration_source || c->flag.slot_migration_target) { + clusterSlotMigrationHandleClientClose(c); + } + /* Log link disconnection with replica */ if (getClientType(c) == CLIENT_TYPE_REPLICA) { if (c->flag.repl_rdb_channel) @@ -1821,7 +1827,7 @@ void beforeNextClient(client *c) { * blocked client as well */ /* Trim the query buffer to the current position. */ - if (c->flag.replication_source) { + if (c->flag.replicated) { /* If the client is a replication source, trim the querybuf to repl_applied, * since replication clients are very special, its querybuf not only * used to parse command, but also proxy to sub-replicas. @@ -2135,7 +2141,11 @@ int postWriteToClient(client *c) { if (getClientType(c) != CLIENT_TYPE_REPLICA) { _postWriteToClient(c); } else { - server.stat_net_repl_output_bytes += c->nwritten > 0 ? c->nwritten : 0; + if (c->flag.slot_migration_target) { + server.stat_net_slot_migration_output_bytes += c->nwritten > 0 ? c->nwritten : 0; + } else { + server.stat_net_repl_output_bytes += c->nwritten > 0 ? c->nwritten : 0; + } } if (c->write_flags & WRITE_FLAGS_WRITE_ERROR) { @@ -2151,7 +2161,7 @@ int postWriteToClient(client *c) { * as an interaction, since we always send REPLCONF ACK commands * that take some time to just fill the socket output buffer. * We just rely on data / pings received for timeout detection. */ - if (!c->flag.replication_source) c->last_interaction = server.unixtime; + if (!c->flag.replicated) c->last_interaction = server.unixtime; } if (!clientHasPendingReplies(c)) { c->sentlen = 0; @@ -2239,7 +2249,7 @@ int handleReadResult(client *c) { c->last_interaction = server.unixtime; c->net_input_bytes += c->nread; - if (c->flag.replication_source) { + if (c->flag.replicated) { c->repl_data->read_reploff += c->nread; if (c->flag.primary) { server.stat_net_repl_input_bytes += c->nread; @@ -2563,7 +2573,7 @@ void processInlineBuffer(client *c) { int argc, j, linefeed_chars = 1; sds *argv, aux; size_t querylen; - int is_replication_source = c->read_flags & READ_FLAGS_REPLICATION_SOURCE; + int is_replicated = c->read_flags & READ_FLAGS_REPLICATED; /* Search for end of line */ newline = strchr(c->querybuf + c->qb_pos, '\n'); @@ -2600,7 +2610,7 @@ void processInlineBuffer(client *c) { * * However there is an exception: primaries may send us just a newline * to keep the connection active. */ - if (querylen != 0 && is_replication_source) { + if (querylen != 0 && is_replicated) { sdsfreesplitres(argv, argc); c->read_flags |= READ_FLAGS_ERROR_UNEXPECTED_INLINE_FROM_REPLICATION_SOURCE; return; @@ -2649,7 +2659,7 @@ void processInlineBuffer(client *c) { * CLIENT_PROTOCOL_ERROR. */ #define PROTO_DUMP_LEN 128 static void setProtocolError(const char *errstr, client *c) { - if (server.verbosity <= LL_VERBOSE || c->flag.replication_source) { + if (server.verbosity <= LL_VERBOSE || c->flag.replicated) { sds client = catClientInfoString(sdsempty(), c, server.hide_user_data_from_log); /* Sample some protocol to given an idea about what was inside. */ @@ -2671,7 +2681,7 @@ static void setProtocolError(const char *errstr, client *c) { } /* Log all the client and protocol info. 
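prepareClientToWrite (earlier in this hunk) now also gates ordinary replies to the slot-migration-target client, so nothing is flushed on that connection while the snapshot child is still writing; the buffered stream is only released once the export reaches its paused state. A hedged sketch of such a gate, where the NULL check for the no-export case is an assumption of the sketch rather than a claim about the patch:

    typedef enum { EXPORT_SNAPSHOTTING, EXPORT_PAUSE_AND_REPLY, EXPORT_PAUSED } sketchExportState;

    /* Sketch: allow writes to the export target only once an export job exists
     * and has reached the paused (streaming) state. */
    static int sketchShouldWriteToTarget(const sketchExportState *curr_export_state) {
        if (curr_export_state == NULL) return 0;
        return *curr_export_state == EXPORT_PAUSED;
    }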
*/ - int loglevel = (c->flag.replication_source) ? LL_WARNING : LL_VERBOSE; + int loglevel = (c->flag.replicated) ? LL_WARNING : LL_VERBOSE; serverLog(loglevel, "Protocol error (%s) from client: %s. %s", errstr, client, buf); sdsfree(client); } @@ -2690,7 +2700,7 @@ void processMultibulkBuffer(client *c) { char *newline = NULL; int ok; long long ll; - int is_replication_source = c->read_flags & READ_FLAGS_REPLICATION_SOURCE; + int is_replicated = c->read_flags & READ_FLAGS_REPLICATED; int auth_required = c->read_flags & READ_FLAGS_AUTH_REQUIRED; if (c->multibulklen == 0) { @@ -2794,7 +2804,7 @@ void processMultibulkBuffer(client *c) { size_t bulklen_slen = newline - (c->querybuf + c->qb_pos + 1); ok = string2ll(c->querybuf + c->qb_pos + 1, bulklen_slen, &ll); - if (!ok || ll < 0 || (!(is_replication_source) && ll > server.proto_max_bulk_len)) { + if (!ok || ll < 0 || (!(is_replicated) && ll > server.proto_max_bulk_len)) { c->read_flags |= READ_FLAGS_ERROR_MBULK_INVALID_BULK_LEN; return; } else if (ll > 16384 && auth_required) { @@ -2803,7 +2813,7 @@ void processMultibulkBuffer(client *c) { } c->qb_pos = newline - c->querybuf + 2; - if (!(is_replication_source) && ll >= PROTO_MBULK_BIG_ARG) { + if (!(is_replicated) && ll >= PROTO_MBULK_BIG_ARG) { /* When the client is not a primary client (because primary * client's querybuf can only be trimmed after data applied * and sent to replicas). @@ -2852,7 +2862,7 @@ void processMultibulkBuffer(client *c) { /* Optimization: if a non-primary client's buffer contains JUST our bulk element * instead of creating a new object by *copying* the sds we * just use the current sds string. */ - if (!is_replication_source && c->qb_pos == 0 && c->bulklen >= PROTO_MBULK_BIG_ARG && + if (!is_replicated && c->qb_pos == 0 && c->bulklen >= PROTO_MBULK_BIG_ARG && sdslen(c->querybuf) == (size_t)(c->bulklen + 2)) { c->argv[c->argc++] = createObject(OBJ_STRING, c->querybuf); c->argv_len_sum += c->bulklen; @@ -2902,7 +2912,7 @@ void commandProcessed(client *c) { if (!c->repl_data) return; long long prev_offset = c->repl_data->reploff; - if (!c->flag.multi && c->flag.replication_source) { + if (!c->flag.multi && c->flag.replicated) { /* Update the applied replication offset of our source. */ c->repl_data->reploff = c->repl_data->read_reploff - sdslen(c->querybuf) + c->qb_pos; } @@ -2913,7 +2923,7 @@ void commandProcessed(client *c) { * applied to the replication state: this quantity, and its corresponding * part of the replication stream, will be propagated to the * sub-replicas and to the replication backlog. */ - if (c->flag.replication_source) { + if (c->flag.replicated) { long long applied = c->repl_data->reploff - prev_offset; if (applied) { replicationFeedStreamFromPrimaryStream(c->querybuf + c->repl_data->repl_applied, applied); @@ -3021,7 +3031,7 @@ int canParseCommand(client *c) { * condition on the replica. We want just to accumulate the replication * stream (instead of replying -BUSY like we do with other clients) and * later resume the processing. */ - if (isInsideYieldingLongCommand() && c->flag.replication_source) return 0; + if (isInsideYieldingLongCommand() && c->flag.replicated) return 0; /* CLIENT_CLOSE_AFTER_REPLY closes the connection once the reply is * written to the client. Make sure to not let the reply grow after @@ -3040,7 +3050,7 @@ int processInputBuffer(client *c) { break; } - c->read_flags = c->flag.replication_source ? READ_FLAGS_REPLICATION_SOURCE : 0; + c->read_flags = c->flag.replicated ? 
READ_FLAGS_REPLICATED : 0; c->read_flags |= authRequired(c) ? READ_FLAGS_AUTH_REQUIRED : 0; parseCommand(c); @@ -3083,7 +3093,7 @@ void readToQueryBuf(client *c) { /* If the replica RDB client is marked as closed ASAP, do not try to read from it */ if (c->flag.close_asap) return; - int is_replication_source = c->read_flags & READ_FLAGS_REPLICATION_SOURCE; + int is_replicated = c->read_flags & READ_FLAGS_REPLICATED; readlen = PROTO_IOBUF_LEN; qblen = c->querybuf ? sdslen(c->querybuf) : 0; @@ -3104,7 +3114,7 @@ void readToQueryBuf(client *c) { /* Primary client needs expand the readlen when meet BIG_ARG(see #9100), * but doesn't need align to the next arg, we can read more data. */ - if (c->flag.replication_source && readlen < PROTO_IOBUF_LEN) readlen = PROTO_IOBUF_LEN; + if (c->flag.replicated && readlen < PROTO_IOBUF_LEN) readlen = PROTO_IOBUF_LEN; } if (c->querybuf == NULL) { @@ -3117,7 +3127,7 @@ void readToQueryBuf(client *c) { * Although we have ensured that c->querybuf will not be expanded in the current * thread_shared_qb, we still add this check for code robustness. */ int use_thread_shared_qb = (c->querybuf == thread_shared_qb) ? 1 : 0; - if (!is_replication_source && /* replication client's querybuf can grow greedy. */ + if (!is_replicated && /* replication client's querybuf can grow greedy. */ (big_arg || sdsalloc(c->querybuf) < PROTO_IOBUF_LEN)) { /* When reading a BIG_ARG we won't be reading more than that one arg * into the query buffer, so we don't need to pre-allocate more than we @@ -3144,7 +3154,7 @@ void readToQueryBuf(client *c) { sdsIncrLen(c->querybuf, c->nread); qblen = sdslen(c->querybuf); if (c->querybuf_peak < qblen) c->querybuf_peak = qblen; - if (!is_replication_source) { + if (!is_replicated) { /* The commands cached in the MULTI/EXEC queue have not been executed yet, * so they are also considered a part of the query buffer in a broader sense. * @@ -3465,7 +3475,7 @@ void resetCommand(client *c) { flags.replica = 0; } - if (flags.replica || flags.replication_source || flags.module) { + if (flags.replica || flags.replicated || flags.module) { addReplyError(c, "can only reset normal client connections"); return; } @@ -4904,7 +4914,7 @@ void ioThreadReadQueryFromClient(void *data) { done: /* Only trim query buffer for non-primary clients * Primary client's buffer is handled by main thread using repl_applied position */ - if (!(c->read_flags & READ_FLAGS_REPLICATION_SOURCE)) { + if (!(c->read_flags & READ_FLAGS_REPLICATED)) { trimClientQueryBuffer(c); } atomic_thread_fence(memory_order_release); diff --git a/src/rdb.c b/src/rdb.c index 7bb9edf31f..ba5d219452 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -1870,7 +1870,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { if (server.sanitize_dump_payload == SANITIZE_DUMP_CLIENTS) { /* Skip sanitization when loading (an RDB), or getting a RESTORE command * from either a replication source or a client using an ACL user with the skip-sanitize-payload flag. 
*/ - int skip = server.loading || (server.current_client && (server.current_client->flag.replication_source)); + int skip = server.loading || (server.current_client && (server.current_client->flag.replicated)); if (!skip && server.current_client && server.current_client->user) skip = !!(server.current_client->user->flags & USER_FLAG_SANITIZE_PAYLOAD_SKIP); deep_integrity_validation = !skip; @@ -3525,16 +3525,14 @@ void killRDBChild(void) { * - rdbRemoveTempFile */ } -/* Spawn an RDB child that writes the RDB to the sockets of the replicas - * that are currently in REPLICA_STATE_WAIT_BGSAVE_START state. */ -int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, slotBitmap slot_bitmap) { - listNode *ln; - listIter li; +/* Save snapshot to the provided connections, spawning a child process and + * running the provided function. + * + * Connections array provided will be freed after the save is completed, and + * should not be freed by the caller. */ +int saveSnapshotToConnectionSockets(connection **conns, int connsnum, int use_pipe, int req, ChildSnapshotFunc snapshot_func, void *privdata) { pid_t childpid; int pipefds[2], rdb_pipe_write = 0, safe_to_exit_pipe = 0; - int dual_channel = (req & REPLICA_REQ_RDB_CHANNEL); - int aof = (req & REPLICA_REQ_AOF_FORMAT); - if (hasActiveChildProcess()) return C_ERR; serverAssert(server.rdb_pipe_read == -1 && server.rdb_child_exit_pipe == -1); @@ -3542,7 +3540,7 @@ int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, slotBitmap slot_bitmap) * drained the pipe. */ if (server.rdb_pipe_conns) return C_ERR; - if (!dual_channel) { + if (use_pipe) { /* Before to fork, create a pipe that is used to transfer the rdb bytes to * the parent, we can't let it write directly to the sockets, since in case * of TLS we must let the parent handle a continuous TLS state when the @@ -3561,49 +3559,20 @@ int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, slotBitmap slot_bitmap) safe_to_exit_pipe = pipefds[0]; /* read end */ server.rdb_child_exit_pipe = pipefds[1]; /* write end */ } - /* Collect the connections of the replicas we want to transfer - * the RDB to, which are in WAIT_BGSAVE_START state. */ - int connsnum = 0; - connection **conns = zmalloc(sizeof(connection *) * listLength(server.replicas)); server.rdb_pipe_conns = NULL; - if (!dual_channel) { + if (use_pipe) { server.rdb_pipe_conns = conns; server.rdb_pipe_numconns = 0; server.rdb_pipe_numconns_writing = 0; + } else { + server.rdb_pipe_numconns = connsnum; } - /* Filter replica connections pending full sync (ie. in WAIT_BGSAVE_START state). */ - listRewind(server.replicas, &li); - while ((ln = listNext(&li))) { - client *replica = ln->value; - if (replica->repl_data->repl_state == REPLICA_STATE_WAIT_BGSAVE_START) { - /* Check replica has the exact requirements */ - if (replica->repl_data->replica_req != req) continue; - /* Check matching slot bitmaps. */ - if (memcmp(replica->repl_data->slot_bitmap, slot_bitmap, sizeof(slotBitmap)) != 0) continue; - - conns[connsnum++] = replica->conn; - if (dual_channel) { - connSendTimeout(replica->conn, server.repl_timeout * 1000); - /* This replica uses diskless dual channel sync, hence we need - * to inform it with the save end offset.*/ - sendCurrentOffsetToReplica(replica); - /* Make sure repl traffic is appended to the replication backlog */ - addRdbReplicaToPsyncWait(replica); - /* Put the socket in blocking mode to simplify RDB transfer. 
*/ - connBlock(replica->conn); - } else { - server.rdb_pipe_numconns++; - } - replicationSetupReplicaForFullResync(replica, getPsyncInitialOffset()); - } - } - /* Create the child process. */ if ((childpid = serverFork(CHILD_TYPE_RDB)) == 0) { /* Child */ int retval, dummy; rio rdb; - if (dual_channel) { + if (!use_pipe) { rioInitWithConnset(&rdb, conns, connsnum); } else { rioInitWithFd(&rdb, rdb_pipe_write); @@ -3611,7 +3580,7 @@ int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, slotBitmap slot_bitmap) /* Close the reading part, so that if the parent crashes, the child will * get a write error and exit. */ - if (!dual_channel) close(server.rdb_pipe_read); + if (use_pipe) close(server.rdb_pipe_read); if (strstr(server.exec_argv[0], "redis-server") != NULL) { serverSetProcTitle("redis-rdb-to-slaves"); } else { @@ -3619,22 +3588,13 @@ int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, slotBitmap slot_bitmap) } serverSetCpuAffinity(server.bgsave_cpulist); - if (aof) { - serverLog(LL_NOTICE, "Background AOF transfer started by pid %ld", (long)getpid()); - retval = rewriteAppendOnlyFileRio(&rdb, slot_bitmap); - rioWrite(&rdb, "*3\r\n", 4); - rioWriteBulkString(&rdb, "REPLCONF", 8); - rioWriteBulkString(&rdb, "AOF-PAYLOAD-END", 15); - rioWriteBulkLongLong(&rdb, server.primary_repl_offset); - } else { - retval = rdbSaveRioWithEOFMark(req, &rdb, NULL, rsi); - } + retval = snapshot_func(req, &rdb, privdata); if (retval == C_OK && rioFlush(&rdb) == 0) retval = C_ERR; if (retval == C_OK) { sendChildCowInfo(CHILD_INFO_TYPE_RDB_COW_SIZE, "RDB"); } - if (dual_channel) { + if (!use_pipe) { rioFreeConnset(&rdb); } else { rioFreeFd(&rdb); @@ -3645,7 +3605,7 @@ int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, slotBitmap slot_bitmap) zfree(conns); /* hold exit until the parent tells us it's safe. we're not expecting * to read anything, just get the error when the pipe is closed. */ - if (!dual_channel) dummy = read(safe_to_exit_pipe, pipefds, 1); + if (use_pipe) dummy = read(safe_to_exit_pipe, pipefds, 1); UNUSED(dummy); exitFromChild((retval == C_OK) ? 0 : 1); } else { @@ -3653,23 +3613,13 @@ int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, slotBitmap slot_bitmap) if (childpid == -1) { serverLog(LL_WARNING, "Can't save in background: fork: %s", strerror(errno)); - /* Undo the state change. The caller will perform cleanup on - * all the replicas in BGSAVE_START state, but an early call to - * replicationSetupReplicaForFullResync() turned it into BGSAVE_END */ - listRewind(server.replicas, &li); - while ((ln = listNext(&li))) { - client *replica = ln->value; - if (replica->repl_data->repl_state == REPLICA_STATE_WAIT_BGSAVE_END) { - replica->repl_data->repl_state = REPLICA_STATE_WAIT_BGSAVE_START; - } - } - if (!dual_channel) { + if (use_pipe) { close(rdb_pipe_write); close(server.rdb_pipe_read); close(server.rdb_child_exit_pipe); } zfree(conns); - if (dual_channel) { + if (!use_pipe) { closeChildInfoPipe(); } else { server.rdb_pipe_conns = NULL; @@ -3678,10 +3628,10 @@ int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, slotBitmap slot_bitmap) } } else { serverLog(LL_NOTICE, "Background RDB transfer started by pid %ld to %s", (long)childpid, - dual_channel ? "direct socket to replica" : "pipe through parent process"); + !use_pipe ? 
"direct socket to replica" : "pipe through parent process"); server.rdb_save_time_start = time(NULL); server.rdb_child_type = RDB_CHILD_TYPE_SOCKET; - if (dual_channel) { + if (!use_pipe) { /* For dual channel sync, the main process no longer requires these RDB connections. */ zfree(conns); } else { @@ -3692,12 +3642,70 @@ int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, slotBitmap slot_bitmap) } } } - if (!dual_channel) close(safe_to_exit_pipe); + if (use_pipe) close(safe_to_exit_pipe); return (childpid == -1) ? C_ERR : C_OK; } return C_OK; /* Unreached. */ } +int childSnapshotUsingRDB(int req, rio *rdb, void *privdata) { + return rdbSaveRioWithEOFMark(req, rdb, NULL, (rdbSaveInfo *)privdata); +} + +/* Spawn an RDB child that writes the RDB to the sockets of the replicas + * that are currently in REPLICA_STATE_WAIT_BGSAVE_START state. */ +int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi) { + listNode *ln; + listIter li; + int dual_channel = (req & REPLICA_REQ_RDB_CHANNEL); + + /* Collect the connections of the replicas we want to transfer + * the RDB to, which are i WAIT_BGSAVE_START state. */ + int connsnum = 0; + connection **conns = zmalloc(sizeof(connection *) * listLength(server.replicas)); + + /* Filter replica connections pending full sync (ie. in WAIT_BGSAVE_START state). */ + listRewind(server.replicas, &li); + while ((ln = listNext(&li))) { + client *replica = ln->value; + if (replica->repl_data->repl_state == REPLICA_STATE_WAIT_BGSAVE_START) { + /* Check replica has the exact requirements */ + if (replica->repl_data->replica_req != req) continue; + + conns[connsnum++] = replica->conn; + if (dual_channel) { + connSendTimeout(replica->conn, server.repl_timeout * 1000); + /* This replica uses diskless dual channel sync, hence we need + * to inform it with the save end offset.*/ + sendCurrentOffsetToReplica(replica); + /* Make sure repl traffic is appended to the replication backlog */ + addRdbReplicaToPsyncWait(replica); + /* Put the socket in blocking mode to simplify RDB transfer. */ + connBlock(replica->conn); + } + replicationSetupReplicaForFullResync(replica, getPsyncInitialOffset()); + } + } + + int retval = saveSnapshotToConnectionSockets(conns, connsnum, !dual_channel, req, childSnapshotUsingRDB, (void *) rsi); + + if (retval != C_OK) { + serverLog(LL_WARNING, "Can't save in background: fork: %s", strerror(errno)); + + /* Undo the state change. 
The caller will perform cleanup on + * all the replicas in BGSAVE_START state, but an early call to + * replicationSetupReplicaForFullResync() turned it into BGSAVE_END */ + listRewind(server.replicas, &li); + while ((ln = listNext(&li))) { + client *replica = ln->value; + if (replica->repl_data->repl_state == REPLICA_STATE_WAIT_BGSAVE_END) { + replica->repl_data->repl_state = REPLICA_STATE_WAIT_BGSAVE_START; + } + } + } + return retval; +} + void saveCommand(client *c) { if (server.child_type == CHILD_TYPE_RDB) { addReplyError(c, "Background save already in progress"); diff --git a/src/rdb.h b/src/rdb.h index 734ae7ba72..7342a926b5 100644 --- a/src/rdb.h +++ b/src/rdb.h @@ -152,7 +152,7 @@ int rdbSaveObjectType(rio *rdb, robj *o); int rdbLoadObjectType(rio *rdb); int rdbLoad(char *filename, rdbSaveInfo *rsi, int rdbflags); int rdbSaveBackground(int req, char *filename, rdbSaveInfo *rsi, int rdbflags); -int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, slotBitmap slot_bitmap); +int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi); void rdbRemoveTempFile(pid_t childpid, int from_signal); int rdbSaveToFile(const char *filename); int rdbSave(int req, char *filename, rdbSaveInfo *rsi, int rdbflags); diff --git a/src/replication.c b/src/replication.c index 9abf5cead3..bcb9e0a756 100644 --- a/src/replication.c +++ b/src/replication.c @@ -49,6 +49,7 @@ void replicationDiscardCachedPrimary(void); void replicationResurrectCachedPrimary(connection *conn); void replicationResurrectProvisionalPrimary(void); +void replicationSendAck(void); int replicaPutOnline(client *replica); void replicaStartCommandStream(client *replica); int cancelReplicationHandshake(int reconnect); @@ -951,7 +952,7 @@ int primaryTryPartialResynchronization(client *c, long long psync_offset) { * started. * * Returns C_OK on success or C_ERR otherwise. */ -int startBgsaveForReplication(int mincapa, int req, slotBitmap slot_bitmap) { +int startBgsaveForReplication(int mincapa, int req) { int retval; int socket_target = 0; listIter li; @@ -960,14 +961,13 @@ int startBgsaveForReplication(int mincapa, int req, slotBitmap slot_bitmap) { /* We use a socket target if replica can handle the EOF marker and we're configured to do diskless syncs. * Note that in case we're creating a "filtered" RDB (functions-only, for example) we also force socket replication * to avoid overwriting the snapshot RDB file with filtered data. */ - socket_target = (server.repl_diskless_sync || req & REPLICA_REQ_RDB_MASK) && (mincapa & REPLICA_CAPA_EOF || req & REPLICA_REQ_AOF_FORMAT); + socket_target = (server.repl_diskless_sync || req & REPLICA_REQ_RDB_MASK) && (mincapa & REPLICA_CAPA_EOF); /* `SYNC` should have failed with error if we don't support socket and require a filter, assert this here */ serverAssert(socket_target || !(req & REPLICA_REQ_RDB_MASK)); - serverLog(LL_NOTICE, "Starting BGSAVE for SYNC with target: %s using: %s with format: %s", + serverLog(LL_NOTICE, "Starting BGSAVE for SYNC with target: %s using: %s", socket_target ? "replicas sockets" : "disk", - (req & REPLICA_REQ_RDB_CHANNEL) ? "dual-channel" : "normal sync", - (req & REPLICA_REQ_AOF_FORMAT) ? "AOF" : "RDB"); + (req & REPLICA_REQ_RDB_CHANNEL) ? "dual-channel" : "normal sync"); rdbSaveInfo rsi, *rsiptr; rsiptr = rdbPopulateSaveInfo(&rsi); @@ -975,7 +975,7 @@ int startBgsaveForReplication(int mincapa, int req, slotBitmap slot_bitmap) { * otherwise replica will miss repl-stream-db. 
*/ if (rsiptr) { if (socket_target) - retval = rdbSaveToReplicasSockets(req, rsiptr, slot_bitmap); + retval = rdbSaveToReplicasSockets(req, rsiptr); else { /* Keep the page cache since it'll get used soon */ retval = rdbSaveBackground(req, server.rdb_filename, rsiptr, RDBFLAGS_REPLICATION | RDBFLAGS_KEEP_CACHE); @@ -1193,7 +1193,7 @@ void syncCommand(client *c) { * capabilities of the replica that triggered the current BGSAVE * and its exact requirements. */ if (ln && ((c->repl_data->replica_capa & replica->repl_data->replica_capa) == replica->repl_data->replica_capa) && - c->repl_data->replica_req == replica->repl_data->replica_req && isSlotBitmapEmpty(c->repl_data->slot_bitmap)) { + c->repl_data->replica_req == replica->repl_data->replica_req) { /* Perfect, the server is already registering differences for * another replica. Set the right state, and copy the buffer. * We don't copy buffer if clients don't want. */ @@ -1215,7 +1215,7 @@ void syncCommand(client *c) { /* CASE 3: There is no BGSAVE is in progress. */ } else { - if (server.repl_diskless_sync && (c->repl_data->replica_capa & REPLICA_CAPA_EOF) && server.repl_diskless_sync_delay && isSlotBitmapEmpty(c->repl_data->slot_bitmap)) { + if (server.repl_diskless_sync && (c->repl_data->replica_capa & REPLICA_CAPA_EOF) && server.repl_diskless_sync_delay) { /* Diskless replication RDB child is created inside * replicationCron() since we want to delay its start a * few seconds to wait for more replicas to arrive. */ @@ -1224,7 +1224,7 @@ void syncCommand(client *c) { /* We don't have a BGSAVE in progress, let's start one. Diskless * or disk-based mode is determined by replica's capacity. */ if (!hasActiveChildProcess()) { - startBgsaveForReplication(c->repl_data->replica_capa, c->repl_data->replica_req, c->repl_data->slot_bitmap); + startBgsaveForReplication(c->repl_data->replica_capa, c->repl_data->replica_req); } else { serverLog(LL_NOTICE, "No BGSAVE in progress, but another BG operation is active. " "BGSAVE for replication delayed"); @@ -1254,7 +1254,6 @@ int anyOtherReplicaWaitRdb(client *except_me) { void initClientReplicationData(client *c) { if (c->repl_data) return; c->repl_data = (ClientReplicationData *)zcalloc(sizeof(ClientReplicationData)); - memset(c->repl_data->slot_bitmap, 0, sizeof(c->repl_data->slot_bitmap)); } void freeClientReplicationData(client *c) { @@ -1421,7 +1420,7 @@ void replconfCommand(client *c) { } else if (!strcasecmp(c->argv[j]->ptr, "getack")) { /* REPLCONF GETACK is used in order to request an ACK ASAP * to the replica. */ - if (server.primary_host && server.primary) replicationSendAck(server.primary); + if (server.primary_host && server.primary) replicationSendAck(); return; } else if (!strcasecmp(c->argv[j]->ptr, "rdb-only")) { /* REPLCONF RDB-ONLY is used to identify the client only wants @@ -1492,41 +1491,6 @@ void replconfCommand(client *c) { return; } c->repl_data->associated_rdb_client_id = (uint64_t)client_id; - } else if (!strcasecmp(c->argv[j]->ptr, "slot-bitmap")) { - /* REPLCONF slot-bitmap is used to filter the replication stream to just a set number of slots. 
*/ - if (!server.cluster_enabled) { - addReplyError(c, "Cannot replicate a slot when cluster mode is disabled"); - } - if (stringObjectLen(c->argv[j + 1]) != sizeof(slotBitmap)) { - addReplyError(c, "Invalid slot bitmap length"); - return; - } - for (int slot = 0; slot <= CLUSTER_SLOTS; slot++) { - if (bitmapTestBit(c->argv[j + 1]->ptr, slot) && server.cluster->slots[slot] != server.cluster->myself) { - addReplyErrorFormat(c, "I cannot replicate slot %d since I do not own it", slot); - return; - } - } - memcpy(c->repl_data->slot_bitmap, c->argv[j + 1]->ptr, sizeof(slotBitmap)); - - /* For now, we only support AOF for slot transfer. */ - c->repl_data->replica_req |= REPLICA_REQ_AOF_FORMAT; - } else if (!strcasecmp(c->argv[j]->ptr, "aof-payload-end")) { - /* REPLCONF aof-payload-end is used to inform the target - * that the replication source has finished sending the AOF formatted - * sync snapshot, and that it is free to begin processing the - * replication backlog. */ - long long initial_offset = 0; - if (getLongLongFromObjectOrReply(c, c->argv[j + 1], &initial_offset, NULL) != C_OK) { - return; - } - if (c->flag.slot_migration_source) { - clusterSlotMigrationDoneSyncing(initial_offset); - return; - } - /* Right now, we only support this for slot migration. */ - addReplyErrorFormat(c, "AOF sync is not in progress."); - return; } else { addReplyErrorFormat(c, "Unrecognized REPLCONF option: %s", (char *)c->argv[j]->ptr); return; @@ -2019,7 +1983,7 @@ void replicationCreatePrimaryClientWithHandler(connection *conn, int dbid, Conne * connection. */ server.primary->flag.primary = 1; server.primary->flag.authenticated = 1; - server.primary->flag.replication_source = 1; + server.primary->flag.replicated = 1; /* Allocate a private query buffer for the primary client instead of using the shared query buffer. * This is done because the primary's query buffer data needs to be preserved for my sub-replicas to use. */ @@ -2517,7 +2481,7 @@ void readSyncBulkPayload(connection *conn) { server.repl_state = REPL_STATE_CONNECTED; server.repl_down_since = 0; /* Send the initial ACK immediately to put this replica in online state. */ - replicationSendAck(server.primary); + replicationSendAck(); } /* Fire the primary link modules event. */ @@ -3060,7 +3024,7 @@ void dualChannelSyncSuccess(void) { dualChannelServerLog(LL_NOTICE, "Successfully streamed replication data into memory"); /* We can resume reading from the primary connection once the local replication buffer has been loaded. */ replicationSteadyStateInit(); - replicationSendAck(server.primary); /* Send ACK to notify primary that replica is synced */ + replicationSendAck(); /* Send ACK to notify primary that replica is synced */ server.rdb_client_id = -1; server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_STATE_NONE; } @@ -3445,8 +3409,94 @@ void dualChannelSetupMainConnForPsync(connection *conn) { sdsfree(err); } -int replicationProceedWithHandshake(connection *conn, int curr_state, slotBitmap slot_bitmap) { - char *err = NULL; +/* + * Dual channel for full sync + * + * * Motivation * + * - Reduce primary memory load. We do that by moving the COB tracking to the replica side. This also decrease + * the chance for COB overruns. Note that primary's input buffer limits at the replica side are less restricted + * then primary's COB as the replica plays less critical part in the replication group. While increasing the + * primary's COB may end up with primary reaching swap and clients suffering, at replica side we're more at + * ease with it. 
Larger COB means better chance to sync successfully. + * - Reduce primary main process CPU load. By opening a new, dedicated channel for the RDB transfer, child + * processes can have direct access to the new channel. Due to TLS connection restrictions, this was not + * possible using one main channel. We eliminate the need for the child process to use the primary's + * child-proc -> main-proc pipeline, thus freeing up the main process to process clients queries. + * + * * High level interface design * + * - Dual channel sync begins when the replica sends a REPLCONF capa dual-channel to the primary during initial + * handshake. This allows the replica to verify whether the primary supports dual-channel-replication and, if + * so, state that this is the replica's main channel, which is not used for snapshot transfer. + * - When replica lacks sufficient data for PSYNC, the primary will send +DUALCHANNELSYNC response instead + * of RDB data. As a next step, the replica creates a new channel (rdb-channel) and configures it against + * the primary with the appropriate capabilities and requirements. The replica then requests a sync + * using the RDB channel. + * - Prior to forking, the primary sends the replica the snapshot's end repl-offset, and attaches the replica + * to the replication backlog to keep repl data until the replica requests psync. The replica uses the main + * channel to request a PSYNC starting at the snapshot end offset. + * - The primary main threads sends incremental changes via the main channel, while the bgsave process + * sends the RDB directly to the replica via the rdb-channel. As for the replica, the incremental + * changes are stored on a local buffer, while the RDB is loaded into memory. + * - Once the replica completes loading the rdb, it drops the rdb channel and streams the accumulated incremental + * changes into memory. Repl steady state continues normally. 
+ * + * * Replica state machine * + * ┌───────────────────┐ Dual channel sync + * │RECEIVE_PING_REPLY │ ┌──────────────────────────────────────────────────────────────┐ + * └────────┬──────────┘ │ RDB channel states Main channel state │ + * │+PONG │ ┌────────────────────────────┐ ┌───────────────────┐ │ + * ┌────────▼──────────┐ ┌─┼─────►DUAL_CHANNEL_SEND_HANDSHAKE │ ┌─►SEND_HANDSHAKE │ │ + * │SEND_HANDSHAKE │ │ │ └────┬───────────────────────┘ │ └──┬────────────────┘ │ + * └────────┬──────────┘ │ │ │ │ │REPLCONF set-rdb-client-id + * │ │ │ ┌───────▼───────────────────────┐ │ ┌──▼────────────────┐ │ + * ┌────────▼──────────┐ │ │ │DUAL_CHANNEL_RECEIVE_AUTH_REPLY│ │ │RECEIVE_CAPA_REPLY │ │ + * │RECEIVE_AUTH_REPLY │ │ │ └───────┬───────────────────────┘ │ └──┬────────────────┘ │ + * └────────┬──────────┘ │ │ │+OK │ │+OK │ + * │+OK │ │ ┌───────▼───────────────────────┐ │ ┌──▼────────────────┐ │ + * ┌────────▼──────────┐ │ │ │DUAL_CHANNEL_RECEIVE_REPLCONF_REPLY│SEND_PSYNC │ │ + * │RECEIVE_PORT_REPLY │ │ │ └───────┬───────────────────────┘ │ └──┬────────────────┘ │ + * └────────┬──────────┘ │ │ │+OK │ │PSYNC use snapshot │ + * │+OK │ │ ┌───────▼───────────────────┐ │ │end-offset provided │ + * ┌────────▼──────────┐ │ │ │DUAL_CHANNEL_RECEIVE_ENDOFF│ │ │by the primary │ + * │RECEIVE_IP_REPLY │ │ │ └───────┬───────────────────┘ │ ┌──▼────────────────┐ │ + * └────────┬──────────┘ │ │ │$ENDOFF │ │RECEIVE_PSYNC_REPLY│ │ + * │+OK │ │ ├─────────────────────────┘ └──┬────────────────┘ │ + * ┌────────▼──────────┐ │ │ │ │+CONTINUE │ + * │RECEIVE_CAPA_REPLY │ │ │ ┌───────▼───────────────┐ ┌──▼────────────────┐ │ + * └────────┬──────────┘ │ │ │DUAL_CHANNEL_RDB_LOAD │ │TRANSFER │ │ + * │+OK │ │ └───────┬───────────────┘ └─────┬─────────────┘ │ + * ┌────────▼─────────────┐ │ │ │Done loading │ │ + * │RECEIVE_VERSION_REPLY │ │ │ ┌───────▼───────────────┐ │ │ + * └────────┬─────────────┘ │ │ │DUAL_CHANNEL_RDB_LOADED│ │ │ + * │+OK │ │ └───────┬───────────────┘ │ │ + * ┌────────▼───┐ │ │ │ │ │ + * │SEND_PSYNC │ │ │ │Replica loads local replication │ │ + * └─┬──────────┘ │ │ │buffer into memory │ │ + * │PSYNC (use cached-primary)│ │ └─────────┬───────────────────────┘ │ + * ┌─▼─────────────────┐ │ │ │ │ + * │RECEIVE_PSYNC_REPLY│ │ └────────────────────┼─────────────────────────────────────────┘ + * └────────┬─┬────────┘ │ │ + * +CONTINUE│ │+DUALCHANNELSYNC │ │ + * │ │ └─────────────────┘ │ + * │ │+FULLRESYNC │ + * │ ┌─▼─────────────────┐ ┌────▼──────────────┐ + * │ │TRANSFER ├───────────────────►CONNECTED │ + * │ └───────────────────┘ └────▲──────────────┘ + * │ │ + * └─────────────────────────────────────────────────┘ + */ +/* This handler fires when the non blocking connect was able to + * establish a connection with the primary. */ +void syncWithPrimary(connection *conn) { + char tmpfile[256], *err = NULL; + int psync_result; + + /* If this event fired after the user turned the instance into a primary + * with REPLICAOF NO ONE we must just return ASAP. */ + if (server.repl_state == REPL_STATE_NONE) { + connClose(conn); + return; + } /* Check for errors in the socket: after a non blocking connect() we * may find that the socket is in error state. */ @@ -3456,16 +3506,22 @@ int replicationProceedWithHandshake(connection *conn, int curr_state, slotBitmap } /* Send a PING to check the primary is able to reply without errors. 
*/ - if (curr_state == REPL_STATE_CONNECTING) { + if (server.repl_state == REPL_STATE_CONNECTING) { serverLog(LL_NOTICE, "Non blocking connect for SYNC fired the event."); + /* Delete the writable event so that the readable event remains + * registered and we can wait for the PONG reply. */ + connSetReadHandler(conn, syncWithPrimary); + connSetWriteHandler(conn, NULL); + server.repl_state = REPL_STATE_RECEIVE_PING_REPLY; /* Send the PING, don't check for errors at all, we have the timeout * that will take care about this. */ err = sendCommand(conn, "PING", NULL); if (err) goto write_error; - return REPL_STATE_RECEIVE_PING_REPLY; + return; } - /* Receive the PONG command. */ - if (curr_state == REPL_STATE_RECEIVE_PING_REPLY) { + + /* Receive the PONG command. */ + if (server.repl_state == REPL_STATE_RECEIVE_PING_REPLY) { err = receiveSynchronousResponse(conn); /* The primary did not reply */ @@ -3486,10 +3542,10 @@ int replicationProceedWithHandshake(connection *conn, int curr_state, slotBitmap } sdsfree(err); err = NULL; - curr_state = REPL_STATE_SEND_HANDSHAKE; + server.repl_state = REPL_STATE_SEND_HANDSHAKE; } - if (curr_state == REPL_STATE_SEND_HANDSHAKE) { + if (server.repl_state == REPL_STATE_SEND_HANDSHAKE) { /* AUTH with the primary if required. */ if (server.primary_auth) { char *args[3] = {"AUTH", NULL, NULL}; @@ -3524,16 +3580,6 @@ int replicationProceedWithHandshake(connection *conn, int curr_state, slotBitmap if (err) goto write_error; } - /* Set the slot bitmap, so that the primary only provides us with the appropriate slot dictionary. */ - if (slot_bitmap != NULL && !isSlotBitmapEmpty(slot_bitmap)) { - char *argv[3] = {"REPLCONF", "slot-bitmap", NULL}; - size_t lens[3] = {8, 11, 0}; - argv[2] = (char *)slot_bitmap; - lens[2] = sizeof(slotBitmap); - err = sendCommandArgv(conn, 3, argv, lens); - if (err) goto write_error; - } - /* Inform the primary of our (replica) capabilities. * * EOF: supports EOF-style RDB transfer for diskless replication. @@ -3549,14 +3595,15 @@ int replicationProceedWithHandshake(connection *conn, int curr_state, slotBitmap err = sendCommand(conn, "REPLCONF", "version", VALKEY_VERSION, NULL); if (err) goto write_error; - return REPL_STATE_RECEIVE_AUTH_REPLY; + server.repl_state = REPL_STATE_RECEIVE_AUTH_REPLY; + return; } - if (curr_state == REPL_STATE_RECEIVE_AUTH_REPLY && !server.primary_auth) - curr_state = REPL_STATE_RECEIVE_PORT_REPLY; + if (server.repl_state == REPL_STATE_RECEIVE_AUTH_REPLY && !server.primary_auth) + server.repl_state = REPL_STATE_RECEIVE_PORT_REPLY; /* Receive AUTH reply. */ - if (curr_state == REPL_STATE_RECEIVE_AUTH_REPLY) { + if (server.repl_state == REPL_STATE_RECEIVE_AUTH_REPLY) { err = receiveSynchronousResponse(conn); if (err == NULL) goto no_response_error; if (err[0] == '-') { @@ -3566,11 +3613,12 @@ int replicationProceedWithHandshake(connection *conn, int curr_state, slotBitmap } sdsfree(err); err = NULL; - return REPL_STATE_RECEIVE_PORT_REPLY; + server.repl_state = REPL_STATE_RECEIVE_PORT_REPLY; + return; } /* Receive REPLCONF listening-port reply. 
*/ - if (curr_state == REPL_STATE_RECEIVE_PORT_REPLY) { + if (server.repl_state == REPL_STATE_RECEIVE_PORT_REPLY) { err = receiveSynchronousResponse(conn); if (err == NULL) goto no_response_error; /* Ignore the error if any, not all the Redis OSS versions support @@ -3582,14 +3630,15 @@ int replicationProceedWithHandshake(connection *conn, int curr_state, slotBitmap err); } sdsfree(err); - return REPL_STATE_RECEIVE_IP_REPLY; + server.repl_state = REPL_STATE_RECEIVE_IP_REPLY; + return; } - if (curr_state == REPL_STATE_RECEIVE_IP_REPLY && !server.replica_announce_ip) - curr_state = REPL_STATE_RECEIVE_SLOT_REPLY; + if (server.repl_state == REPL_STATE_RECEIVE_IP_REPLY && !server.replica_announce_ip) + server.repl_state = REPL_STATE_RECEIVE_CAPA_REPLY; /* Receive REPLCONF ip-address reply. */ - if (curr_state == REPL_STATE_RECEIVE_IP_REPLY) { + if (server.repl_state == REPL_STATE_RECEIVE_IP_REPLY) { err = receiveSynchronousResponse(conn); if (err == NULL) goto no_response_error; /* Ignore the error if any, not all the Redis OSS versions support @@ -3601,28 +3650,12 @@ int replicationProceedWithHandshake(connection *conn, int curr_state, slotBitmap err); } sdsfree(err); - return REPL_STATE_RECEIVE_SLOT_REPLY; - } - - if (curr_state == REPL_STATE_RECEIVE_SLOT_REPLY && (slot_bitmap == NULL || isSlotBitmapEmpty(slot_bitmap))) - curr_state = REPL_STATE_RECEIVE_CAPA_REPLY; - - if (curr_state == REPL_STATE_RECEIVE_SLOT_REPLY) { - err = receiveSynchronousResponse(conn); - if (err == NULL) goto no_response_error; - /* If we sent the slot bitmap, we need it to be properly acked, or we can't do slot migration. */ - if (err[0] == '-') { - serverLog(LL_WARNING, "Source does not understand REPLCONF slot-num. Cannot continue with slot-level sync: %s", err); - sdsfree(err); - goto error; - } - sdsfree(err); - return REPL_STATE_RECEIVE_CAPA_REPLY; + server.repl_state = REPL_STATE_RECEIVE_CAPA_REPLY; + return; } - /* Receive CAPA reply. */ - if (curr_state == REPL_STATE_RECEIVE_CAPA_REPLY) { + if (server.repl_state == REPL_STATE_RECEIVE_CAPA_REPLY) { err = receiveSynchronousResponse(conn); if (err == NULL) goto no_response_error; /* Ignore the error if any, not all the Redis OSS versions support @@ -3635,11 +3668,12 @@ int replicationProceedWithHandshake(connection *conn, int curr_state, slotBitmap } sdsfree(err); err = NULL; - return REPL_STATE_RECEIVE_VERSION_REPLY; + server.repl_state = REPL_STATE_RECEIVE_VERSION_REPLY; + return; } /* Receive VERSION reply. */ - if (curr_state == REPL_STATE_RECEIVE_VERSION_REPLY) { + if (server.repl_state == REPL_STATE_RECEIVE_VERSION_REPLY) { err = receiveSynchronousResponse(conn); if (err == NULL) goto no_response_error; /* Ignore the error if any. Valkey >= 8 supports REPLCONF VERSION. */ @@ -3651,125 +3685,7 @@ int replicationProceedWithHandshake(connection *conn, int curr_state, slotBitmap } sdsfree(err); err = NULL; - return REPL_STATE_SEND_PSYNC; - } - - -no_response_error: /* Handle receiveSynchronousResponse() error when primary has no reply */ - serverLog(LL_WARNING, "Primary did not respond to command during SYNC handshake"); - /* Fall through to regular error handling */ - -error: - return REPL_STATE_ERROR; - -write_error: /* Handle sendCommand() errors. */ - serverLog(LL_WARNING, "Sending command to primary in replication handshake: %s", err); - sdsfree(err); - goto error; -} - -/* - * Dual channel for full sync - * - * * Motivation * - * - Reduce primary memory load. We do that by moving the COB tracking to the replica side. 
This also decrease - * the chance for COB overruns. Note that primary's input buffer limits at the replica side are less restricted - * then primary's COB as the replica plays less critical part in the replication group. While increasing the - * primary's COB may end up with primary reaching swap and clients suffering, at replica side we're more at - * ease with it. Larger COB means better chance to sync successfully. - * - Reduce primary main process CPU load. By opening a new, dedicated channel for the RDB transfer, child - * processes can have direct access to the new channel. Due to TLS connection restrictions, this was not - * possible using one main channel. We eliminate the need for the child process to use the primary's - * child-proc -> main-proc pipeline, thus freeing up the main process to process clients queries. - * - * * High level interface design * - * - Dual channel sync begins when the replica sends a REPLCONF capa dual-channel to the primary during initial - * handshake. This allows the replica to verify whether the primary supports dual-channel-replication and, if - * so, state that this is the replica's main channel, which is not used for snapshot transfer. - * - When replica lacks sufficient data for PSYNC, the primary will send +DUALCHANNELSYNC response instead - * of RDB data. As a next step, the replica creates a new channel (rdb-channel) and configures it against - * the primary with the appropriate capabilities and requirements. The replica then requests a sync - * using the RDB channel. - * - Prior to forking, the primary sends the replica the snapshot's end repl-offset, and attaches the replica - * to the replication backlog to keep repl data until the replica requests psync. The replica uses the main - * channel to request a PSYNC starting at the snapshot end offset. - * - The primary main threads sends incremental changes via the main channel, while the bgsave process - * sends the RDB directly to the replica via the rdb-channel. As for the replica, the incremental - * changes are stored on a local buffer, while the RDB is loaded into memory. - * - Once the replica completes loading the rdb, it drops the rdb channel and streams the accumulated incremental - * changes into memory. Repl steady state continues normally. 
- * - * * Replica state machine * - * ┌───────────────────┐ Dual channel sync - * │RECEIVE_PING_REPLY │ ┌──────────────────────────────────────────────────────────────┐ - * └────────┬──────────┘ │ RDB channel states Main channel state │ - * │+PONG │ ┌────────────────────────────┐ ┌───────────────────┐ │ - * ┌────────▼──────────┐ ┌─┼─────►DUAL_CHANNEL_SEND_HANDSHAKE │ ┌─►SEND_HANDSHAKE │ │ - * │SEND_HANDSHAKE │ │ │ └────┬───────────────────────┘ │ └──┬────────────────┘ │ - * └────────┬──────────┘ │ │ │ │ │REPLCONF set-rdb-client-id - * │ │ │ ┌───────▼───────────────────────┐ │ ┌──▼────────────────┐ │ - * ┌────────▼──────────┐ │ │ │DUAL_CHANNEL_RECEIVE_AUTH_REPLY│ │ │RECEIVE_CAPA_REPLY │ │ - * │RECEIVE_AUTH_REPLY │ │ │ └───────┬───────────────────────┘ │ └──┬────────────────┘ │ - * └────────┬──────────┘ │ │ │+OK │ │+OK │ - * │+OK │ │ ┌───────▼───────────────────────┐ │ ┌──▼────────────────┐ │ - * ┌────────▼──────────┐ │ │ │DUAL_CHANNEL_RECEIVE_REPLCONF_REPLY│SEND_PSYNC │ │ - * │RECEIVE_PORT_REPLY │ │ │ └───────┬───────────────────────┘ │ └──┬────────────────┘ │ - * └────────┬──────────┘ │ │ │+OK │ │PSYNC use snapshot │ - * │+OK │ │ ┌───────▼───────────────────┐ │ │end-offset provided │ - * ┌────────▼──────────┐ │ │ │DUAL_CHANNEL_RECEIVE_ENDOFF│ │ │by the primary │ - * │RECEIVE_IP_REPLY │ │ │ └───────┬───────────────────┘ │ ┌──▼────────────────┐ │ - * └────────┬──────────┘ │ │ │$ENDOFF │ │RECEIVE_PSYNC_REPLY│ │ - * │+OK │ │ ├─────────────────────────┘ └──┬────────────────┘ │ - * ┌────────▼──────────┐ │ │ │ │+CONTINUE │ - * │RECEIVE_CAPA_REPLY │ │ │ ┌───────▼───────────────┐ ┌──▼────────────────┐ │ - * └────────┬──────────┘ │ │ │DUAL_CHANNEL_RDB_LOAD │ │TRANSFER │ │ - * │+OK │ │ └───────┬───────────────┘ └─────┬─────────────┘ │ - * ┌────────▼─────────────┐ │ │ │Done loading │ │ - * │RECEIVE_VERSION_REPLY │ │ │ ┌───────▼───────────────┐ │ │ - * └────────┬─────────────┘ │ │ │DUAL_CHANNEL_RDB_LOADED│ │ │ - * │+OK │ │ └───────┬───────────────┘ │ │ - * ┌────────▼───┐ │ │ │ │ │ - * │SEND_PSYNC │ │ │ │Replica loads local replication │ │ - * └─┬──────────┘ │ │ │buffer into memory │ │ - * │PSYNC (use cached-primary)│ │ └─────────┬───────────────────────┘ │ - * ┌─▼─────────────────┐ │ │ │ │ - * │RECEIVE_PSYNC_REPLY│ │ └────────────────────┼─────────────────────────────────────────┘ - * └────────┬─┬────────┘ │ │ - * +CONTINUE│ │+DUALCHANNELSYNC │ │ - * │ │ └─────────────────┘ │ - * │ │+FULLRESYNC │ - * │ ┌─▼─────────────────┐ ┌────▼──────────────┐ - * │ │TRANSFER ├───────────────────►CONNECTED │ - * │ └───────────────────┘ └────▲──────────────┘ - * │ │ - * └─────────────────────────────────────────────────┘ - */ -/* This handler fires when the non blocking connect was able to - * establish a connection with the primary. */ -void syncWithPrimary(connection *conn) { - char tmpfile[256], *err = NULL; - int psync_result; - - /* If this event fired after the user turned the instance into a primary - * with REPLICAOF NO ONE we must just return ASAP. */ - if (server.repl_state == REPL_STATE_NONE) { - connClose(conn); - return; - } - - if (server.repl_state < REPL_STATE_SEND_PSYNC) { - server.repl_state = replicationProceedWithHandshake(conn, server.repl_state, NULL); - - if (server.repl_state == REPL_STATE_RECEIVE_PING_REPLY) { - /* Delete the writable event so that the readable event remains - * registered and we can wait for the PONG reply. 
*/ - connSetReadHandler(conn, syncWithPrimary); - connSetWriteHandler(conn, NULL); - } else if (server.repl_state == REPL_STATE_ERROR) { - goto error; - } - if (server.repl_state != REPL_STATE_SEND_PSYNC) - return; + server.repl_state = REPL_STATE_SEND_PSYNC; } /* Try a partial resynchronization. If we don't have a cached primary @@ -3898,6 +3814,10 @@ void syncWithPrimary(connection *conn) { server.repl_transfer_lastio = server.unixtime; return; +no_response_error: /* Handle receiveSynchronousResponse() error when primary has no reply */ + serverLog(LL_WARNING, "Primary did not respond to command during SYNC handshake"); + /* Fall through to regular error handling */ + error: connClose(conn); server.repl_transfer_s = NULL; @@ -4241,7 +4161,9 @@ void roleCommand(client *c) { /* Send a REPLCONF ACK command to the primary to inform it about the current * processed offset. If we are not connected with a primary, the command has * no effects. */ -void replicationSendAck(client *c) { +void replicationSendAck(void) { + client *c = server.primary; + if (c != NULL) { int send_fack = server.fsynced_reploff != -1; c->flag.replication_force_reply = 1; @@ -4773,7 +4695,7 @@ void replicationCron(void) { /* Send ACK to primary from time to time. * Note that we do not send periodic acks to primary that don't * support PSYNC and replication offsets. */ - if (server.primary_host && server.primary && !(server.primary->flag.pre_psync)) replicationSendAck(server.primary); + if (server.primary_host && server.primary && !(server.primary->flag.pre_psync)) replicationSendAck(); /* If we have attached replicas, PING them from time to time. * So replicas can implement an explicit timeout to primaries, and will @@ -4917,7 +4839,7 @@ void replicationCron(void) { replication_cron_loops++; /* Incremented with frequency 1 HZ. */ } -int shouldStartChildReplication(int *mincapa_out, int *req_out, slotBitmap slot_bitmap_out) { +int shouldStartChildReplication(int *mincapa_out, int *req_out) { /* We should start a BGSAVE good for replication if we have replicas in * WAIT_BGSAVE_START state. * @@ -4929,7 +4851,6 @@ int shouldStartChildReplication(int *mincapa_out, int *req_out, slotBitmap slot_ int replicas_waiting = 0; int mincapa; int req; - slotBitmap slot_bitmap; int first = 1; listNode *ln; listIter li; @@ -4941,8 +4862,7 @@ int shouldStartChildReplication(int *mincapa_out, int *req_out, slotBitmap slot_ if (first) { /* Get first replica's requirements */ req = replica->repl_data->replica_req; - memcpy(slot_bitmap, replica->repl_data->slot_bitmap, sizeof(slotBitmap)); - } else if (req != replica->repl_data->replica_req || slotBitmapCompare(slot_bitmap, replica->repl_data->slot_bitmap) != 0) { + } else if (req != replica->repl_data->replica_req) { /* Skip replicas that don't match */ continue; } @@ -4960,7 +4880,6 @@ int shouldStartChildReplication(int *mincapa_out, int *req_out, slotBitmap slot_ max_idle >= server.repl_diskless_sync_delay)) { if (mincapa_out) *mincapa_out = mincapa; if (req_out) *req_out = req; - if (slot_bitmap_out) memcpy(slot_bitmap_out, slot_bitmap, sizeof(slotBitmap)); return 1; } } @@ -4971,13 +4890,12 @@ int shouldStartChildReplication(int *mincapa_out, int *req_out, slotBitmap slot_ void replicationStartPendingFork(void) { int mincapa = -1; int req = -1; - slotBitmap slot_bitmap; - if (shouldStartChildReplication(&mincapa, &req, slot_bitmap)) { + if (shouldStartChildReplication(&mincapa, &req)) { /* Start the BGSAVE. 
The called function may start a * BGSAVE with socket target or disk target depending on the * configuration and replicas capabilities and requirements. */ - startBgsaveForReplication(mincapa, req, slot_bitmap); + startBgsaveForReplication(mincapa, req); } } diff --git a/src/server.c b/src/server.c index ea77cc1312..50b93b2943 100644 --- a/src/server.c +++ b/src/server.c @@ -900,7 +900,7 @@ int clientsCronResizeQueryBuffer(client *c) { if (idletime > 2) { /* 1) Query is idle for a long time. */ size_t remaining = sdslen(c->querybuf) - c->qb_pos; - if (!c->flag.replication_source && !remaining) { + if (!c->flag.replicated && !remaining) { /* If the client is not for replication and no data is pending, * The client can safely use the shared query buffer in the next read - free the client's querybuf. */ sdsfree(c->querybuf); @@ -1451,7 +1451,7 @@ long long serverCron(struct aeEventLoop *eventLoop, long long id, void *clientDa monotime current_time = getMonotonicUs(); long long factor = 1000000; // us trackInstantaneousMetric(STATS_METRIC_COMMAND, server.stat_numcommands, current_time, factor); - trackInstantaneousMetric(STATS_METRIC_NET_INPUT, server.stat_net_input_bytes + server.stat_net_repl_input_bytes, + trackInstantaneousMetric(STATS_METRIC_NET_INPUT, server.stat_net_input_bytes + server.stat_net_repl_input_bytes + server.stat_net_slot_migration_input_bytes, current_time, factor); trackInstantaneousMetric(STATS_METRIC_NET_OUTPUT, server.stat_net_output_bytes + server.stat_net_repl_output_bytes, current_time, @@ -1464,6 +1464,8 @@ long long serverCron(struct aeEventLoop *eventLoop, long long id, void *clientDa factor); trackInstantaneousMetric(STATS_METRIC_EL_DURATION, server.duration_stats[EL_DURATION_TYPE_EL].sum, server.duration_stats[EL_DURATION_TYPE_EL].cnt, 1); + trackInstantaneousMetric(STATS_METRIC_NET_INPUT_SLOT_MIGRATION, server.stat_net_slot_migration_input_bytes, + current_time, factor); } /* We have just LRU_BITS bits per object for LRU information. @@ -2684,6 +2686,7 @@ void resetServerStats(void) { server.stat_net_input_bytes = 0; server.stat_net_output_bytes = 0; server.stat_net_repl_input_bytes = 0; + server.stat_net_slot_migration_input_bytes = 0; server.stat_net_repl_output_bytes = 0; server.stat_unexpected_error_replies = 0; server.stat_total_error_replies = 0; @@ -3359,7 +3362,7 @@ struct serverCommand *lookupCommandOrOriginal(robj **argv, int argc) { /* Commands arriving from a replication source or AOF client, should never be rejected. 
*/ int mustObeyClient(client *c) { - return c->id == CLIENT_ID_AOF || c->flag.replication_source; + return c->id == CLIENT_ID_AOF || c->flag.replicated; } static int shouldPropagate(int target) { @@ -3369,7 +3372,7 @@ static int shouldPropagate(int target) { if (server.aof_state != AOF_OFF) return 1; } if (target & PROPAGATE_REPL) { - if (server.primary == NULL && (server.repl_backlog || listLength(server.replicas) != 0)) return 1; + if (server.primary_host == NULL && (server.repl_backlog || listLength(server.replicas) != 0)) return 1; } return 0; @@ -3418,7 +3421,12 @@ static void propagateNow(int dbid, robj **argv, int argc, int target) { server.server_del_keys_in_slot); if (server.aof_state != AOF_OFF && target & PROPAGATE_AOF) feedAppendOnlyFile(dbid, argv, argc); - if (target & PROPAGATE_REPL) replicationFeedReplicas(dbid, argv, argc); + if (target & PROPAGATE_REPL) { + replicationFeedReplicas(dbid, argv, argc); + if (server.cluster_enabled) { + clusterFeedSlotMigration(dbid, argv, argc); + } + } } /* Used inside commands to schedule the propagation of additional commands @@ -4297,7 +4305,7 @@ int processCommand(client *c) { /* If the server is paused, block the client until * the pause has ended. Replicas are never paused. */ - if (!c->flag.replica && ((isPausedActions(PAUSE_ACTION_CLIENT_ALL)) || + if (!c->flag.replica && !c->flag.slot_migration_target && ((isPausedActions(PAUSE_ACTION_CLIENT_ALL)) || ((isPausedActions(PAUSE_ACTION_CLIENT_WRITE)) && is_may_replicate_command))) { blockPostponeClient(c); return C_OK; @@ -5903,8 +5911,8 @@ sds genValkeyInfoString(dict *section_dict, int all_sections, int everything) { "total_connections_received:%lld\r\n", server.stat_numconnections, "total_commands_processed:%lld\r\n", server.stat_numcommands, "instantaneous_ops_per_sec:%lld\r\n", getInstantaneousMetric(STATS_METRIC_COMMAND), - "total_net_input_bytes:%lld\r\n", server.stat_net_input_bytes + server.stat_net_repl_input_bytes, - "total_net_output_bytes:%lld\r\n", server.stat_net_output_bytes + server.stat_net_repl_output_bytes, + "total_net_input_bytes:%lld\r\n", server.stat_net_input_bytes + server.stat_net_repl_input_bytes + server.stat_net_slot_migration_input_bytes, + "total_net_output_bytes:%lld\r\n", server.stat_net_output_bytes + server.stat_net_repl_output_bytes + server.stat_net_slot_migration_output_bytes, "total_net_repl_input_bytes:%lld\r\n", server.stat_net_repl_input_bytes, "total_net_repl_output_bytes:%lld\r\n", server.stat_net_repl_output_bytes, "instantaneous_input_kbps:%.2f\r\n", (float)getInstantaneousMetric(STATS_METRIC_NET_INPUT) / 1024, @@ -5962,7 +5970,11 @@ sds genValkeyInfoString(dict *section_dict, int all_sections, int everything) { "eventloop_duration_sum:%llu\r\n", server.duration_stats[EL_DURATION_TYPE_EL].sum, "eventloop_duration_cmd_sum:%llu\r\n", server.duration_stats[EL_DURATION_TYPE_CMD].sum, "instantaneous_eventloop_cycles_per_sec:%llu\r\n", getInstantaneousMetric(STATS_METRIC_EL_CYCLE), - "instantaneous_eventloop_duration_usec:%llu\r\n", getInstantaneousMetric(STATS_METRIC_EL_DURATION))); + "instantaneous_eventloop_duration_usec:%llu\r\n", getInstantaneousMetric(STATS_METRIC_EL_DURATION), + "total_net_slot_migration_input_bytes:%lld\r\n", server.stat_net_slot_migration_input_bytes, + "total_net_slot_migration_output_bytes:%lld\r\n", server.stat_net_slot_migration_output_bytes, + "instantaneous_slot_migration_input_kbps:%.2f\r\n", (float)getInstantaneousMetric(STATS_METRIC_NET_INPUT_SLOT_MIGRATION) / 1024, + 
"instantaneous_slot_migration_output_kbps:%.2f\r\n", (float)getInstantaneousMetric(STATS_METRIC_NET_OUTPUT_SLOT_MIGRATION) / 1024)); info = genValkeyInfoStringACLStats(info); } diff --git a/src/server.h b/src/server.h index e1a8a1d503..32d71bafd2 100644 --- a/src/server.h +++ b/src/server.h @@ -182,15 +182,17 @@ struct hdr_histogram; #define RIO_CONNSET_WRITE_MAX_CHUNK_SIZE 16384 /* Instantaneous metrics tracking. */ -#define STATS_METRIC_SAMPLES 16 /* Number of samples per metric. */ -#define STATS_METRIC_COMMAND 0 /* Number of commands executed. */ -#define STATS_METRIC_NET_INPUT 1 /* Bytes read to network. */ -#define STATS_METRIC_NET_OUTPUT 2 /* Bytes written to network. */ -#define STATS_METRIC_NET_INPUT_REPLICATION 3 /* Bytes read to network during replication. */ -#define STATS_METRIC_NET_OUTPUT_REPLICATION 4 /* Bytes written to network during replication. */ -#define STATS_METRIC_EL_CYCLE 5 /* Number of eventloop cycled. */ -#define STATS_METRIC_EL_DURATION 6 /* Eventloop duration. */ -#define STATS_METRIC_COUNT 7 +#define STATS_METRIC_SAMPLES 16 /* Number of samples per metric. */ +#define STATS_METRIC_COMMAND 0 /* Number of commands executed. */ +#define STATS_METRIC_NET_INPUT 1 /* Bytes read to network. */ +#define STATS_METRIC_NET_OUTPUT 2 /* Bytes written to network. */ +#define STATS_METRIC_NET_INPUT_REPLICATION 3 /* Bytes read to network during replication. */ +#define STATS_METRIC_NET_OUTPUT_REPLICATION 4 /* Bytes written to network during replication. */ +#define STATS_METRIC_EL_CYCLE 5 /* Number of eventloop cycled. */ +#define STATS_METRIC_EL_DURATION 6 /* Eventloop duration. */ +#define STATS_METRIC_NET_INPUT_SLOT_MIGRATION 7 /* Bytes read to network during slot migration. */ +#define STATS_METRIC_NET_OUTPUT_SLOT_MIGRATION 7 /* Bytes written to network during slot migration. */ +#define STATS_METRIC_COUNT 8 /* Protocol and I/O related defines */ #define PROTO_IOBUF_LEN (1024 * 16) /* Generic I/O buffer size */ @@ -599,6 +601,7 @@ typedef enum { PAUSE_BY_CLIENT_COMMAND = 0, PAUSE_DURING_SHUTDOWN, PAUSE_DURING_FAILOVER, + PAUSE_DURING_SLOT_MIGRATION, NUM_PAUSE_PURPOSES /* This value is the number of purposes above. */ } pause_purpose; @@ -1091,8 +1094,9 @@ typedef struct ClientFlags { * flag, we won't cache the primary in freeClient. */ uint64_t fake : 1; /* This is a fake client without a real connection. */ uint64_t import_source : 1; /* This client is importing data to server and can visit expired key. */ - uint64_t replication_source : 1; /* This client is a replication source (i.e. primary or slot migration). */ + uint64_t replicated : 1; /* This client is a replication source (i.e. primary or slot migration). */ uint64_t slot_migration_source : 1; /* This client is a slot migration source. */ + uint64_t slot_migration_target : 1; /* This client is a slot migration target. */ uint64_t reserved : 3; /* Reserved for future use */ } ClientFlags; @@ -1144,7 +1148,6 @@ typedef struct ClientReplicationData { see the definition of replBufBlock. */ size_t ref_block_pos; /* Access position of referenced buffer block, i.e. the next offset to send. */ - slotBitmap slot_bitmap; /* The slot range this replica is replicating for. 
*/ } ClientReplicationData; typedef struct ClientModuleData { @@ -1540,6 +1543,7 @@ typedef struct { #define CHILD_TYPE_AOF 2 #define CHILD_TYPE_LDB 3 #define CHILD_TYPE_MODULE 4 +#define CHILD_TYPE_SYNCSLOTS 5 typedef enum childInfoType { CHILD_INFO_TYPE_CURRENT_INFO, @@ -1708,9 +1712,10 @@ struct valkeyServer { long long stat_net_input_bytes; /* Bytes read from network. */ long long stat_net_output_bytes; /* Bytes written to network. */ long long stat_net_repl_input_bytes; /* Bytes read during replication, added to stat_net_input_bytes in 'info'. */ - long long stat_net_slot_migration_input_bytes; /* Bytes read during replication, added to stat_net_input_bytes in 'info'. */ /* Bytes written during replication, added to stat_net_output_bytes in 'info'. */ long long stat_net_repl_output_bytes; + long long stat_net_slot_migration_input_bytes; /* Bytes read during replication, added to stat_net_input_bytes in 'info'. */ + long long stat_net_slot_migration_output_bytes; /* Bytes read during replication, added to stat_net_input_bytes in 'info'. */ size_t stat_current_cow_peak; /* Peak size of copy on write bytes. */ size_t stat_current_cow_bytes; /* Copy on write bytes while child is active. */ monotime stat_current_cow_updated; /* Last update time of stat_current_cow_bytes */ @@ -2621,7 +2626,7 @@ void dictVanillaFree(void *val); #define READ_FLAGS_INLINE_ZERO_QUERY_LEN (1 << 11) #define READ_FLAGS_PARSING_NEGATIVE_MBULK_LEN (1 << 12) #define READ_FLAGS_PARSING_COMPLETED (1 << 13) -#define READ_FLAGS_REPLICATION_SOURCE (1 << 14) +#define READ_FLAGS_REPLICATED (1 << 14) #define READ_FLAGS_DONT_PARSE (1 << 15) #define READ_FLAGS_AUTH_REQUIRED (1 << 16) @@ -2948,8 +2953,8 @@ int sendCurrentOffsetToReplica(client *replica); void addRdbReplicaToPsyncWait(client *replica); void initClientReplicationData(client *c); void freeClientReplicationData(client *c); -void replicationSendAck(client *c); -int replicationProceedWithHandshake(connection *conn, int curr_state, slotBitmap slot_bitmap); +char *sendCommandArgv(connection *conn, int argc, char **argv, size_t *argv_lens); +char *receiveSynchronousResponse(connection *conn); /* Generic persistence functions */ void startLoadingFile(size_t size, char *filename, int rdbflags); @@ -2961,6 +2966,8 @@ void updateLoadingFileName(char *filename); void startSaving(int rdbflags); void stopSaving(int success); int allPersistenceDisabled(void); +typedef int(*ChildSnapshotFunc)(int req, rio *rdb, void *privdata); +int saveSnapshotToConnectionSockets(connection **conns, int connsnum, int use_pipe, int req, ChildSnapshotFunc snapshot_func, void *privdata); #define DISK_ERROR_TYPE_AOF 1 /* Don't accept writes: AOF errors. */ #define DISK_ERROR_TYPE_RDB 2 /* Don't accept writes: RDB errors. 
*/ @@ -3684,6 +3691,7 @@ void sdiffCommand(client *c); void sdiffstoreCommand(client *c); void sscanCommand(client *c); void syncCommand(client *c); +void syncSlotsCommand(client *c); void flushdbCommand(client *c); void flushallCommand(client *c); void sortCommand(client *c); diff --git a/tests/unit/slot-migration.tcl b/tests/unit/slot-migration.tcl new file mode 100644 index 0000000000..90adaf84e0 --- /dev/null +++ b/tests/unit/slot-migration.tcl @@ -0,0 +1,22 @@ + + +# TEST CASES +# ---- General ---- +# - Only migrating slots are synced +# - Changes in non-migrating slots are not sent to target +# - Parsing test +# - Slot must have available primary +# +# ---- Reslience ---- +# - Target gives up if primary is unavailable +# - Source unpauses itself if replica is unavailable +# - Client is closed by target during migration +# +# ---- Importing slot is not exposed ---- +# - KEYS command on importing node +# - RANDOMKEY on importing node +# +# ---- Replication +# - Replica receives updates through target primary +# - Time out results in replica dropping slots +# - Failover during migration cleans up slots \ No newline at end of file From 98de0a757698378572efc4e6d0b758974d61cdd9 Mon Sep 17 00:00:00 2001 From: Jacob Murphy Date: Mon, 20 Jan 2025 08:30:17 +0000 Subject: [PATCH 07/18] Bug fixes for SYNCSLOTS based implementation Signed-off-by: Jacob Murphy --- src/cluster.c | 21 ------ src/cluster.h | 2 - src/cluster_legacy.c | 127 ++++++++++++++++++---------------- src/cluster_legacy.h | 1 - src/kvstore.c | 29 ++------ src/kvstore.h | 1 - src/lazyfree.c | 23 ------ src/networking.c | 5 +- src/rdb.c | 4 +- src/server.h | 1 - tests/unit/slot-migration.tcl | 12 +++- 11 files changed, 85 insertions(+), 141 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index 508eddefc6..f650d979f7 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -815,27 +815,6 @@ unsigned int countKeysInSlot(unsigned int slot) { return kvstoreHashtableSize(server.db->keys, slot); } -unsigned int dropKeysInSlotBitmap(slotBitmap slot_bitmap, int async) { - unsigned int result = 0; - for (int i = 0; i < CLUSTER_SLOTS; i++) { - if (bitmapTestBit(slot_bitmap, i)) { - result += dropKeysInSlot(i, async); - } - } - return result; -} - -unsigned int dropKeysInSlot(unsigned int hashslot, int async) { - unsigned int result = kvstoreHashtableSize(server.db->keys, hashslot); - if (async) { - emptyHashtableAsync(server.db, hashslot); - } else { - kvstoreEmptyHashtable(server.db->keys, hashslot, NULL); - kvstoreEmptyHashtable(server.db->expires, hashslot, NULL); - } - return result; -} - void clusterCommandHelp(client *c) { const char *help[] = { "COUNTKEYSINSLOT ", diff --git a/src/cluster.h b/src/cluster.h index 5192bc405e..21d4469357 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -114,8 +114,6 @@ client *createCachedResponseClient(int resp); void deleteCachedResponseClient(client *recording_client); void clearCachedClusterSlotsResponse(void); unsigned int countKeysInSlot(unsigned int hashslot); -unsigned int dropKeysInSlotBitmap(slotBitmap slot_bitmap, int async); -unsigned int dropKeysInSlot(unsigned int hashslot, int async); void bitmapToSlotRanges(unsigned char *bitmap, slotBitmap slot_bitmap_out); int bitmapTestBit(unsigned char *bitmap, int pos); void bitmapSetBit(unsigned char *bitmap, int pos); diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index fa0da913b2..97b201a0a2 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -94,6 +94,7 @@ const char *clusterGetMessageTypeString(int type); void 
removeChannelsInSlot(unsigned int slot); unsigned int countChannelsInSlot(unsigned int hashslot); unsigned int delKeysInSlot(unsigned int hashslot); +unsigned int delKeysInSlotBitmap(slotBitmap bitmap); void clusterAddNodeToShard(const char *shard_id, clusterNode *node); list *clusterLookupNodeListByShardId(const char *shard_id); void clusterRemoveNodeFromShard(clusterNode *node); @@ -4424,10 +4425,10 @@ slotImport *clusterCreateSlotImportJob(clusterNode *source, slotBitmap slots) { } slotExport *clusterCreateSlotExportJob(client *c, slotBitmap slots) { - slotExport *result = (slotExport *) zmalloc(sizeof(slotExport)); + slotExport *result = (slotExport *) zcalloc(sizeof(slotExport)); memcpy(result->slot_bitmap, slots, sizeof(slotBitmap)); result->state = SLOT_EXPORT_QUEUED; - result->pause_end = 0; + result->pause_end = -1; result->client = c; return result; } @@ -4484,17 +4485,14 @@ void clusterFeedSlotMigration(int dbid, robj **argv, int argc) { unsigned long long prev_pending = curr_export->client->reply_bytes; addReplyArrayLen(curr_export->client, argc); for (i = 0; i < argc; i++) { - addReply(curr_export->client, argv[i]); + addReplyBulk(curr_export->client, argv[i]); } curr_export->syncslot_offset += curr_export->client->reply_bytes - prev_pending; } int clusterShouldWriteToSlotMigrationTarget() { slotExport *curr_export = clusterGetCurrentSlotExport(); - if (curr_export->state != SLOT_EXPORT_PAUSED) { - return 0; - } - return 1; + return curr_export && (curr_export->state == SLOT_EXPORT_PAUSE_AND_REPLY || curr_export->state == SLOT_EXPORT_PAUSED); } void clusterSlotMigrationHandleClientClose(client *c) { @@ -4570,7 +4568,7 @@ void clusterProceedWithSlotImport(void) { c->flag.authenticated = 1; c->user = NULL; /* This client can do everything. */ c->querybuf = sdsempty(); /* Similar to primary, we use a dedicated query buf. */ - initClientReplicationData(c); /* Used to track reploff */ + initClientReplicationData(c); curr_import->state = SLOT_IMPORT_SEND_AUTH; continue; @@ -4614,7 +4612,7 @@ void clusterProceedWithSlotImport(void) { continue; case SLOT_IMPORT_SEND_SYNCSLOTS: /* Ensure we have a clean state for the SYNC. */ - dropKeysInSlotBitmap(curr_import->slot_bitmap, 1); + delKeysInSlotBitmap(curr_import->slot_bitmap); serverLog(LL_NOTICE, "Sending CLUSTER SYNCSLOTS START request to source node %.40s", curr_import->source_node->name); char *syncslots_args[4] = {"CLUSTER", "SYNCSLOTS", "START", (char *)curr_import->slot_bitmap}; @@ -4632,7 +4630,7 @@ void clusterProceedWithSlotImport(void) { connSetReadHandler(curr_import->client->conn, readQueryFromClient); curr_import->state = SLOT_IMPORT_RECEIVE_SYNCSLOTS; case SLOT_IMPORT_RECEIVE_SYNCSLOTS: - /* Nothing to do in this state. Waiting for CLUSTER SYNCSLOTS END to be processed. */ + /* Nothing to do in this state. Waiting for CLUSTER SYNCSLOTS ENDSNAPSHOT to be processed. */ return; case SLOT_IMPORT_PAUSE_OWNER: curr_import->client->flag.replication_force_reply = 1; @@ -4674,20 +4672,9 @@ void clusterProceedWithSlotImport(void) { clusterBroadcastPong(CLUSTER_BROADCAST_ALL); listDelNode(server.cluster->slot_import_jobs, curr_node); continue; - case SLOT_IMPORT_REPLICA_TRACKING: - /* As a replica, we will simply apply the primaries updates - * from the slot migration source. However, if we are ever - * promoted to primary, we need to fail the migration to - * prevent leaked keys in the importing slots. 
*/ - if (clusterNodeIsPrimary(myself)) { - serverLog(LL_WARNING, "Promoted to primary during slot migration, failing the ongoing migration"); - curr_import->state = SLOT_IMPORT_FAILED; - continue; - } - return; case SLOT_IMPORT_FAILED: listDelNode(server.cluster->slot_import_jobs, curr_node); - dropKeysInSlotBitmap(curr_import->slot_bitmap, server.repl_replica_lazy_flush); + delKeysInSlotBitmap(curr_import->slot_bitmap); clusterFreeSlotImportJob(curr_import); continue; } @@ -4700,7 +4687,7 @@ int childSnapshotForSyncSlot(int req, rio *rdb, void *privdata) { rioWrite(rdb, "*3\r\n", 4); rioWriteBulkString(rdb, "CLUSTER", 7); rioWriteBulkString(rdb, "SYNCSLOTS", 9); - rioWriteBulkString(rdb, "END", 3); + rioWriteBulkString(rdb, "ENDSNAPSHOT", 11); return retval; } @@ -4721,6 +4708,7 @@ void clusterProceedWithSlotExport(void) { } connection ** conns = zmalloc(sizeof(connection*)); *conns = curr_export->client->conn; + serverLog(LL_NOTICE, "Initiating snapshot to conn with fd %d", curr_export->client->conn->fd); if (saveSnapshotToConnectionSockets(conns, 1, 1, 0, childSnapshotForSyncSlot, curr_export->slot_bitmap) != C_OK) { serverLog(LL_WARNING, "Failed to start slot export to target"); curr_export->state = SLOT_EXPORT_FAILED; @@ -4736,8 +4724,13 @@ void clusterProceedWithSlotExport(void) { addReplyArrayLen(curr_export->client, 4); addReplyBulkCBuffer(curr_export->client, "CLUSTER", 7); addReplyBulkCBuffer(curr_export->client, "SYNCSLOTS", 9); - addReplyBulkCBuffer(curr_export->client, "PAUSEDAT", 8); - addReplyLongLong(curr_export->client, curr_export->syncslot_offset); + addReplyBulkCBuffer(curr_export->client, "PAUSEOFFSET", 11); + addReplyBulkLongLong(curr_export->client, curr_export->syncslot_offset); + + /* Even though we just added replies, it's possible that, due to + * existing pending data, the client is not in the pending write + * queue. We enqueue it explicitly to work around this. */ + putClientInPendingWriteQueue(curr_export->client); curr_export->pause_end = mstime() + (CLUSTER_MF_TIMEOUT * CLUSTER_MF_PAUSE_MULT); pauseActions(PAUSE_DURING_SLOT_MIGRATION, curr_export->pause_end, PAUSE_ACTIONS_CLIENT_WRITE_SET); @@ -4745,7 +4738,18 @@ void clusterProceedWithSlotExport(void) { curr_export->state = SLOT_EXPORT_PAUSED; continue; case SLOT_EXPORT_PAUSED: - /* */ + /* While paused, we simply want to check if we should unpause. */ + if (curr_export->pause_end <= mstime()) { + /* Every CLUSTER_MF_TIMEOUT, the source node should + * re-attempt the pause. If we reach this point, it hasn't + * attempted the pause in that time, we can assume it is + * dead and fail the migration.*/ + serverLog(LL_WARNING, "During slot export, unpausing self and cancelling export due to timeout."); + unpauseActions(PAUSE_DURING_SLOT_MIGRATION); + curr_export->state = SLOT_EXPORT_FAILED; + continue; + } + return; case SLOT_EXPORT_FINISH: case SLOT_EXPORT_FAILED: listDelNode(server.cluster->slot_export_jobs, curr_node); @@ -6714,9 +6718,18 @@ void removeChannelsInSlot(unsigned int slot) { pubsubShardUnsubscribeAllChannelsInSlot(slot); } +unsigned int delKeysInSlotBitmap(slotBitmap bitmap) { + unsigned int res = 0; + for (int i = 0; i < CLUSTER_SLOTS; i++) { + if (bitmapTestBit(bitmap, i)) { + res += delKeysInSlot(i); + } + } + return res; +} + /* Remove all the keys in the specified hash slot. * The number of removed items is returned. */ -// TODO(murphyjacob4) - can we just use this? 
unsigned int delKeysInSlot(unsigned int hashslot) { if (!countKeysInSlot(hashslot)) return 0; @@ -7572,35 +7585,29 @@ int clusterCommandSpecial(client *c) { } c->flag.slot_migration_target = 1; initClientReplicationData(c); - slotExport *job = clusterCreateSlotExportJob(c, c->argv[2]->ptr); + slotExport *job = clusterCreateSlotExportJob(c, c->argv[3]->ptr); listAddNodeTail(server.cluster->slot_export_jobs, job); clusterProceedWithSlotMigration(); - } else if (!strcasecmp(c->argv[2]->ptr, "inform")) { - /* CLUSTER SYNCSLOTS INFORM */ - if (c->argc != 4) { - addReplyError(c, "CLUSTER SYNCSLOTS INFORM command requires exactly one argument"); - return 1; - } - slotImport * to_enqueue = clusterCreateSlotImportJob(NULL, c->argv[2]->ptr); - to_enqueue->state = SLOT_IMPORT_REPLICA_TRACKING; - } else if (!strcasecmp(c->argv[2]->ptr, "end")) { - /* CLUSTER SYNCSLOTS END */ + } else if (!strcasecmp(c->argv[2]->ptr, "endsnapshot")) { + /* CLUSTER SYNCSLOTS ENDSNAPSHOT */ if (c->argc != 3) { - addReplyError(c, "CLUSTER SYNCSLOTS END does not expect any arguments."); + addReplyError(c, "CLUSTER SYNCSLOTS ENDSNAPSHOT does not expect any arguments."); return 1; } - slotImport *curr_import = clusterGetCurrentSlotImport(); - if (!curr_import || (curr_import->state != SLOT_IMPORT_RECEIVE_SYNCSLOTS && curr_import->state != SLOT_IMPORT_REPLICA_TRACKING)) { - addReplyError(c, "No ongoing CLUSTER SYNCSLOTS to end."); + if (c->flag.primary) { + /* Due to the proxying nature of replication from the source + * node through the target node to the target node's replicas, + * this message should simply be ignored. */ return 1; } - if (curr_import->state != SLOT_IMPORT_REPLICA_TRACKING) { - /* Replicas will also receive this command through the replication - * stream, but it is not actionable. */ + slotImport *curr_import = clusterGetCurrentSlotImport(); + if (!curr_import || (curr_import->state != SLOT_IMPORT_RECEIVE_SYNCSLOTS)) { + addReplyError(c, "No ongoing snapshot to end."); return 1; } if (curr_import->client != c) { addReplyError(c, "This client is not the one that initiated the ongoing CLUSTER SYNCSLOTS."); + return 1; } curr_import->state = SLOT_IMPORT_PAUSE_OWNER; clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_SLOTMIGRATION); @@ -7621,28 +7628,25 @@ int clusterCommandSpecial(client *c) { } else if (slot_export->state != SLOT_EXPORT_SNAPSHOTTING) { addReplyError(c, "SYNCSLOTS is not in the correct state for this command."); return 1; - } else { - /* First pause. We want to flush the output buffer that was not allowed to - * flush during the snapshot. */ - putClientInPendingWriteQueue(slot_export->client); } + serverLog(LL_NOTICE, "Pause received by target during slot migration. 
Pausing and initiating stream of commands."); slot_export->state = SLOT_EXPORT_PAUSE_AND_REPLY; clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_SLOTMIGRATION); - } else if (!strcasecmp(c->argv[2]->ptr, "pausedat")) { - /* CLUSTER SYNCSLOTS PAUSEDAT */ + } else if (!strcasecmp(c->argv[2]->ptr, "pauseoffset")) { + /* CLUSTER SYNCSLOTS PAUSEOFFSET */ if (c->argc != 4) { - addReplyError(c, "CLUSTER SYNCSLOTS PAUSEDAT command requires exactly one argument."); + addReplyError(c, "CLUSTER SYNCSLOTS PAUSEOFFSET command requires exactly one argument."); return 1; } slotImport *slot_import = clusterGetCurrentSlotImport(); if (!slot_import || slot_import->state != SLOT_IMPORT_WAITING_FOR_OFFSET) { - addReplyError(c, "No CLUSTER SYNCSLOTS is waiting for a PAUSEDAT response."); + addReplyError(c, "No CLUSTER SYNCSLOTS is waiting for a PAUSEOFFSET response."); return 1; } long long offset; - if (getLongLongFromObject(c->argv[3]->ptr, &offset) != C_OK) { - addReplyError(c, "Failed to parse PAUSEDAT offset."); + if (getLongLongFromObject(c->argv[3], &offset) != C_OK) { + addReplyError(c, "Failed to parse PAUSEOFFSET offset."); return 1; } slot_import->paused_at_offset = offset; @@ -7693,14 +7697,14 @@ const char **clusterCommandExtendedHelp(void) { "LINKS", " Return information about all network links between this node and its peers.", " Output format is an array where each array element is a map containing attributes of a link", - "MIGRATE SLOTSRANGE [ ...] SHARD ", + "MIGRATE SLOTSRANGE [ ...]", " Initiate server driven slot migration of all slot ranges to the designated shard.", - "SYNCSLOTS [START |END|INFORM |PAUSE|PAUSEDAT]", + "SYNCSLOTS [START |ENDSNAPSHOT|PAUSE|PAUSEOFFSET ]", " Internal command. SYNCSLOTS START initiates send of an AOF formatted snapshot containing the", - " provided slot bitmap. SYNCSLOTS END terminates the AOF formatted snapshot, and after this", + " provided slot bitmap. SYNCSLOTS ENDSNAPSHOT terminates the AOF formatted snapshot, and after this", " SYNCSLOTS PAUSE signals for this node to be paused and for a continuous stream of commands" - " for the slots to be replicated. SYNCSLOTS PAUSEDAT will be replied with the offset of remaining" - " commands. SYNCSLOTS INFORM is used to inform replicas that the operation is occurring.", + " for the slots to be replicated. SYNCSLOTS PAUSEOFFSET will be replied with the offset of remaining" + " commands.", NULL}; return help; @@ -7759,6 +7763,9 @@ int clusterAllowFailoverCmd(client *c) { void clusterPromoteSelfToPrimary(void) { replicationUnsetPrimary(); + /* verifyClusterConfigWithData will delete keys in unowned slots. This + * could happen in the case of failover during a slot migration. 
*/ + serverAssert(verifyClusterConfigWithData() == C_OK); } int detectAndUpdateCachedNodeHealth(void) { diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index 9eda033bda..ee38b3eced 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -378,7 +378,6 @@ struct _clusterNode { typedef enum slotImportState { SLOT_IMPORT_QUEUED, - SLOT_IMPORT_REPLICA_TRACKING, /* Replicas track the slot import as well */ SLOT_IMPORT_CONNECTING, SLOT_IMPORT_SEND_AUTH, SLOT_IMPORT_RECEIVE_AUTH, diff --git a/src/kvstore.c b/src/kvstore.c index f1ed085c43..b84ec5e8df 100644 --- a/src/kvstore.c +++ b/src/kvstore.c @@ -302,7 +302,12 @@ kvstore *kvstoreCreate(hashtableType *type, int num_hashtables_bits, int flags) void kvstoreEmpty(kvstore *kvs, void(callback)(hashtable *)) { for (int didx = 0; didx < kvs->num_hashtables; didx++) { - kvstoreEmptyHashtable(kvs, didx, callback); + hashtable *ht = kvstoreGetHashtable(kvs, didx); + if (!ht) continue; + kvstoreHashtableMetadata *metadata = (kvstoreHashtableMetadata *)hashtableMetadata(ht); + if (metadata->rehashing_node) metadata->rehashing_node = NULL; + hashtableEmpty(ht, callback); + freeHashtableIfNeeded(kvs, didx); } listEmpty(kvs->rehashing); @@ -315,28 +320,6 @@ void kvstoreEmpty(kvstore *kvs, void(callback)(hashtable *)) { kvs->overhead_hashtable_rehashing = 0; } -void kvstoreEmptyHashtable(kvstore *kvs, int didx, void(callback)(hashtable *)) { - hashtable *ht = kvstoreGetHashtable(kvs, didx); - if (!ht) return; - kvstoreHashtableMetadata *metadata = (kvstoreHashtableMetadata *)hashtableMetadata(ht); - if (metadata->rehashing_node) metadata->rehashing_node = NULL; - hashtableEmpty(ht, callback); - freeHashtableIfNeeded(kvs, didx); -} - -hashtable *kvstoreUnlinkHashtable(kvstore *kvs, int didx) { - hashtable *oldht = kvstoreGetHashtable(kvs, didx); - if (!oldht) return NULL; - - /* Pause rehashing on the to be unlinked node. */ - kvstoreHashtableMetadata *oldmetadata = (kvstoreHashtableMetadata *)hashtableMetadata(oldht); - if (oldmetadata->rehashing_node) oldmetadata->rehashing_node = NULL; - - kvs->hashtables[didx] = NULL; - kvs->allocated_hashtables--; - return oldht; -} - void kvstoreRelease(kvstore *kvs) { for (int didx = 0; didx < kvs->num_hashtables; didx++) { hashtable *ht = kvstoreGetHashtable(kvs, didx); diff --git a/src/kvstore.h b/src/kvstore.h index a79caf23aa..bc3baba43a 100644 --- a/src/kvstore.h +++ b/src/kvstore.h @@ -17,7 +17,6 @@ typedef int(kvstoreIteratorPredicate)(int didx, void *privdata); kvstore *kvstoreCreate(hashtableType *type, int num_hashtables_bits, int flags); void kvstoreEmpty(kvstore *kvs, void(callback)(hashtable *)); void kvstoreEmptyHashtable(kvstore *kvs, int didx, void(callback)(hashtable *)); -hashtable *kvstoreUnlinkHashtable(kvstore *kvs, int didx); void kvstoreRelease(kvstore *kvs); unsigned long long kvstoreSize(kvstore *kvs); unsigned long kvstoreBuckets(kvstore *kvs); diff --git a/src/lazyfree.c b/src/lazyfree.c index 8cd04eed37..3b061ccd84 100644 --- a/src/lazyfree.c +++ b/src/lazyfree.c @@ -32,18 +32,6 @@ void lazyfreeFreeDatabase(void *args[]) { atomic_fetch_add_explicit(&lazyfreed_objects, numkeys, memory_order_relaxed); } -/* Release a hashtable from the lazyfree thread. 
*/ -void lazyfreeFreeHashtable(void *args[]) { - hashtable *ht1 = args[0]; - hashtable *ht2 = args[1]; - - size_t numkeys = hashtableSize(ht1); - hashtableRelease(ht1); - if (ht2) hashtableRelease(ht2); - atomic_fetch_sub_explicit(&lazyfree_objects, numkeys, memory_order_relaxed); - atomic_fetch_add_explicit(&lazyfreed_objects, numkeys, memory_order_relaxed); -} - /* Release the key tracking table. */ void lazyFreeTrackingTable(void *args[]) { rax *rt = args[0]; @@ -211,17 +199,6 @@ void emptyDbAsync(serverDb *db) { bioCreateLazyFreeJob(lazyfreeFreeDatabase, 2, oldkeys, oldexpires); } -/* Empty a hashtable asynchrounously. */ -void emptyHashtableAsync(serverDb *db, int didx) { - hashtable *oldkeys = kvstoreUnlinkHashtable(db->keys, didx); - hashtable *oldexpires = kvstoreUnlinkHashtable(db->expires, didx); - if (!oldkeys) { - return; - } - atomic_fetch_add_explicit(&lazyfree_objects, hashtableSize(oldkeys), memory_order_relaxed); - bioCreateLazyFreeJob(lazyfreeFreeHashtable, 2, oldkeys, oldexpires); -} - /* Free the key tracking table. * If the table is huge enough, free it in async way. */ void freeTrackingRadixTreeAsync(rax *tracking) { diff --git a/src/networking.c b/src/networking.c index c2828d384a..91a8001984 100644 --- a/src/networking.c +++ b/src/networking.c @@ -242,7 +242,8 @@ void putClientInPendingWriteQueue(client *c) { if (!c->flag.pending_write && (!c->repl_data || c->repl_data->repl_state == REPL_STATE_NONE || - (isReplicaReadyForReplData(c) && !c->repl_data->repl_start_cmd_stream_on_ack))) { + (isReplicaReadyForReplData(c) && !c->repl_data->repl_start_cmd_stream_on_ack)) && + (!c->flag.slot_migration_target || clusterShouldWriteToSlotMigrationTarget())) { /* Here instead of installing the write handler, we just flag the * client and put it into a list of clients that have something * to write to the socket. This way before re-entering the event @@ -292,8 +293,6 @@ int prepareClientToWrite(client *c) { * is set. */ if ((c->flag.replicated) && !c->flag.replication_force_reply) return C_ERR; - if ((c->flag.slot_migration_target && !clusterShouldWriteToSlotMigrationTarget())) return C_ERR; - /* Skip the fake client, such as the fake client for AOF loading. * But CLIENT_ID_CACHED_RESPONSE is allowed since it is a fake client * but has a connection to cache the response. */ diff --git a/src/rdb.c b/src/rdb.c index ba5d219452..e8d4bfae1a 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -3562,10 +3562,8 @@ int saveSnapshotToConnectionSockets(connection **conns, int connsnum, int use_pi server.rdb_pipe_conns = NULL; if (use_pipe) { server.rdb_pipe_conns = conns; - server.rdb_pipe_numconns = 0; - server.rdb_pipe_numconns_writing = 0; - } else { server.rdb_pipe_numconns = connsnum; + server.rdb_pipe_numconns_writing = 0; } /* Create the child process. 
*/ if ((childpid = serverFork(CHILD_TYPE_RDB)) == 0) { diff --git a/src/server.h b/src/server.h index 32d71bafd2..c0af0b6625 100644 --- a/src/server.h +++ b/src/server.h @@ -3452,7 +3452,6 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor); int parseScanCursorOrReply(client *c, robj *o, unsigned long long *cursor); int dbAsyncDelete(serverDb *db, robj *key); void emptyDbAsync(serverDb *db); -void emptyHashtableAsync(serverDb *db, int didx); size_t lazyfreeGetPendingObjectsCount(void); size_t lazyfreeGetFreedObjectsCount(void); void lazyfreeResetStats(void); diff --git a/tests/unit/slot-migration.tcl b/tests/unit/slot-migration.tcl index 90adaf84e0..96048dad60 100644 --- a/tests/unit/slot-migration.tcl +++ b/tests/unit/slot-migration.tcl @@ -7,16 +7,22 @@ # - Parsing test # - Slot must have available primary # -# ---- Reslience ---- +# ---- Error handling ---- # - Target gives up if primary is unavailable # - Source unpauses itself if replica is unavailable # - Client is closed by target during migration +# - Client is closed by source during migration # # ---- Importing slot is not exposed ---- # - KEYS command on importing node # - RANDOMKEY on importing node # -# ---- Replication +# ---- Replication ---- # - Replica receives updates through target primary # - Time out results in replica dropping slots -# - Failover during migration cleans up slots \ No newline at end of file +# - Failover during migration cleans up slots +# - Full sync with pending migration includes pending slots, is cleaned up if migration fails +# +# ---- Loading ---- +# - Partial slot migration is cleaned up after AOF load +# - Partial slot migration is cleaned up after RDB load \ No newline at end of file From 4784891a7651caf63cf02c687fc64922ff6d91d1 Mon Sep 17 00:00:00 2001 From: Jacob Murphy Date: Mon, 20 Jan 2025 09:00:31 +0000 Subject: [PATCH 08/18] Code cleanup Signed-off-by: Jacob Murphy --- src/cluster.h | 1 - src/cluster_legacy.c | 24 +++++++++++++++--------- src/cluster_legacy.h | 23 ++++------------------- src/kvstore.h | 1 - src/networking.c | 4 ++-- src/server.h | 13 ++++--------- 6 files changed, 25 insertions(+), 41 deletions(-) diff --git a/src/cluster.h b/src/cluster.h index 21d4469357..41b6263bd4 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -122,7 +122,6 @@ void bitmapSetAllBits(unsigned char *bitmap, int len); int slotBitmapCompare(slotBitmap bitmap, slotBitmap other); int isSlotBitmapEmpty(slotBitmap bitmap); int getSlotOrReply(client *c, robj *o); -void clusterSlotImportDoneSyncing(long long initial_offset); void clusterSlotMigrationHandleClientClose(client *c); void clusterFeedSlotMigration(int dbid, robj **argv, int argc); int clusterShouldWriteToSlotMigrationTarget(void); diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 97b201a0a2..8ca61e5b1a 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -4461,7 +4461,8 @@ slotExport *clusterGetCurrentSlotExport(void) { void clusterFeedSlotMigration(int dbid, robj **argv, int argc) { UNUSED(dbid); - int i, slot, error_code; + int i, error_code; + int slot = -1; slotExport *curr_export = clusterGetCurrentSlotExport(); if (curr_export == NULL || curr_export->state < SLOT_EXPORT_SNAPSHOTTING) { return; @@ -4475,9 +4476,14 @@ void clusterFeedSlotMigration(int dbid, robj **argv, int argc) { struct serverCommand *cmd = lookupCommand(argv, argc); getNodeByQuery(server.current_client, cmd, argv, argc, &slot, &error_code); if (error_code != CLUSTER_REDIR_NONE || slot == -1) { - /* This shouldn't happen - but 
is possible if a module does something - * like VM_Replicate a cross-slot command. In that case, we don't have - * a clear way to proceed, so it makes sense to give up. */ + /* A couple cases where this could happen: + * - The replicated command is a command without a slot. + * - The replicated command is written by VM_Replicate module APIs + * and is a cross-slot command, or a slot that is not owned by + * this node. + * + * In any case, our best solution is to not replicate this to the + * target node. */ return; } if (!bitmapTestBit(curr_export->slot_bitmap, slot)) return; @@ -7508,14 +7514,14 @@ int clusterCommandSpecial(client *c) { } else if (!strcasecmp(c->argv[1]->ptr, "links") && c->argc == 2) { /* CLUSTER LINKS */ addReplyClusterLinksDescription(c); - } else if (!strcasecmp(c->argv[1]->ptr, "migrate")) { - /* CLUSTER MIGRATE SLOTSRANGE [ ] */ + } else if (!strcasecmp(c->argv[1]->ptr, "import")) { + /* CLUSTER IMPORT SLOTSRANGE [ ] */ if (nodeIsReplica(myself)) { - addReplyError(c, "Only primaries can migrate slots"); + addReplyError(c, "Only primaries can import slots"); return 1; } if (c->argc < 5 || strcasecmp(c->argv[2]->ptr, "slotsrange")) { - addReplyError(c, "Migrate command requires at least one slot range"); + addReplyError(c, "CLUSTER IMPORT command requires at least one slot range"); return 1; } if (c->argc % 2 == 0) { @@ -7570,7 +7576,7 @@ int clusterCommandSpecial(client *c) { addReply(c, shared.ok); } else if (!strcasecmp(c->argv[1]->ptr, "syncslots")) { if (c->argc < 3) { - addReplyError(c, "SYNCSLOTS command requires either START or END to be provided."); + addReplyError(c, "SYNCSLOTS command requires a subcommand to be provided."); return 1; } if (!strcasecmp(c->argv[2]->ptr, "start")) { diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index ee38b3eced..eb9ecc5bb1 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -10,7 +10,7 @@ #define CLUSTER_MF_TIMEOUT 5000 /* Milliseconds to do a manual failover. */ #define CLUSTER_MF_PAUSE_MULT 2 /* Primary pause manual failover mult. */ #define CLUSTER_REPLICA_MIGRATION_DELAY 5000 /* Delay for replica migration. */ -#define CLUSTER_SLOT_IMPORT_TIMEOUT 30000 /* Milliseconds to do a slot migration. */ +#define CLUSTER_SLOT_IMPORT_TIMEOUT 30000 /* Milliseconds to do a slot import. */ /* Reasons why a replica is not able to failover. */ #define CLUSTER_CANT_FAILOVER_NONE 0 @@ -97,9 +97,7 @@ typedef struct clusterNodeFailReport { #define CLUSTERMSG_TYPE_MFSTART 8 /* Pause clients for manual failover */ #define CLUSTERMSG_TYPE_MODULE 9 /* Module cluster API message. */ #define CLUSTERMSG_TYPE_PUBLISHSHARD 10 /* Pub/Sub Publish shard propagation */ -#define CLUSTERMSG_TYPE_MIGRATE_SLOT_START 11 /* Pause clients for slot migration */ -#define CLUSTERMSG_TYPE_COUNT 12 /* Total number of message types. */ - +#define CLUSTERMSG_TYPE_COUNT 11 /* Total number of message types. */ #define CLUSTERMSG_LIGHT 0x8000 /* Modifier bit for message types that support light header */ @@ -136,7 +134,7 @@ typedef struct { typedef struct { uint64_t configEpoch; /* Config epoch of the specified instance. */ char nodename[CLUSTER_NAMELEN]; /* Name of the slots owner. */ - unsigned char slots[CLUSTER_SLOTS / 8]; /* Slots bitmap. */ + slotBitmap slots; /* Slots bitmap. */ } clusterMsgDataUpdate; typedef struct { @@ -146,10 +144,6 @@ typedef struct { unsigned char bulk_data[3]; /* 3 bytes just as placeholder. 
*/ } clusterMsgModule; -typedef struct { - slotBitmap slot_bitmap; -} clusterMsgSlotMigration; - /* The cluster supports optional extension messages that can be sent * along with ping/pong/meet messages to give additional info in a * consistent manner. */ @@ -236,12 +230,6 @@ union clusterMsgData { struct { clusterMsgModule msg; } module; - - /* SLOT_MIGRATION */ - struct { - clusterMsgSlotMigration msg; - } slot_migration; - }; #define CLUSTER_PROTO_VER 1 /* Cluster bus protocol version. */ @@ -260,7 +248,7 @@ typedef struct { uint64_t offset; /* Primary replication offset if node is a primary or processed replication offset if node is a replica. */ char sender[CLUSTER_NAMELEN]; /* Name of the sender node */ - unsigned char myslots[CLUSTER_SLOTS / 8]; + slotBitmap myslots; char replicaof[CLUSTER_NAMELEN]; char myip[NET_IP_STR_LEN]; /* Sender IP, if not all zeroed. */ uint16_t extensions; /* Number of extensions sent along with this packet. */ @@ -458,9 +446,6 @@ struct clusterState { or -1 if still not received. */ int mf_can_start; /* If non-zero signal that the manual failover can start requesting primary vote. */ - /* Manual failover state for slot migration */ - slotBitmap mf_slots; /* Slots in migration. */ - clusterNode *mf_slots_target; /* The following fields are used by primaries to take state on elections. */ uint64_t lastVoteEpoch; /* Epoch of the last vote granted. */ int todo_before_sleep; /* Things to do in clusterBeforeSleep(). */ diff --git a/src/kvstore.h b/src/kvstore.h index bc3baba43a..fee8d71dbd 100644 --- a/src/kvstore.h +++ b/src/kvstore.h @@ -16,7 +16,6 @@ typedef int(kvstoreIteratorPredicate)(int didx, void *privdata); #define KVSTORE_FREE_EMPTY_HASHTABLES (1 << 1) kvstore *kvstoreCreate(hashtableType *type, int num_hashtables_bits, int flags); void kvstoreEmpty(kvstore *kvs, void(callback)(hashtable *)); -void kvstoreEmptyHashtable(kvstore *kvs, int didx, void(callback)(hashtable *)); void kvstoreRelease(kvstore *kvs); unsigned long long kvstoreSize(kvstore *kvs); unsigned long kvstoreBuckets(kvstore *kvs); diff --git a/src/networking.c b/src/networking.c index 91a8001984..cad0e86de0 100644 --- a/src/networking.c +++ b/src/networking.c @@ -2156,11 +2156,11 @@ int postWriteToClient(client *c) { } if (c->nwritten > 0) { c->net_output_bytes += c->nwritten; - /* For clients representing replication sources we don't count sending data + /* For clients representing primaries we don't count sending data * as an interaction, since we always send REPLCONF ACK commands * that take some time to just fill the socket output buffer. * We just rely on data / pings received for timeout detection. */ - if (!c->flag.replicated) c->last_interaction = server.unixtime; + if (!c->flag.primary) c->last_interaction = server.unixtime; } if (!clientHasPendingReplies(c)) { c->sentlen = 0; diff --git a/src/server.h b/src/server.h index c0af0b6625..1344db1de9 100644 --- a/src/server.h +++ b/src/server.h @@ -389,7 +389,6 @@ typedef enum blocking_type { * what to do next. */ typedef enum { REPL_STATE_NONE = 0, /* No active replication */ - REPL_STATE_ERROR, /* Error in replication. 
*/ REPL_STATE_CONNECT, /* Must connect to primary */ REPL_STATE_CONNECTING, /* Connecting to primary */ /* --- Handshake states, must be ordered --- */ @@ -398,7 +397,6 @@ typedef enum { REPL_STATE_RECEIVE_AUTH_REPLY, /* Wait for AUTH reply */ REPL_STATE_RECEIVE_PORT_REPLY, /* Wait for REPLCONF reply */ REPL_STATE_RECEIVE_IP_REPLY, /* Wait for REPLCONF reply */ - REPL_STATE_RECEIVE_SLOT_REPLY, /* Wait for REPLCONF reply */ REPL_STATE_RECEIVE_CAPA_REPLY, /* Wait for REPLCONF reply */ REPL_STATE_RECEIVE_VERSION_REPLY, /* Wait for REPLCONF reply */ REPL_STATE_SEND_PSYNC, /* Send PSYNC */ @@ -451,7 +449,6 @@ typedef enum { #define REPLICA_REQ_RDB_EXCLUDE_DATA (1 << 0) /* Exclude data from RDB */ #define REPLICA_REQ_RDB_EXCLUDE_FUNCTIONS (1 << 1) /* Exclude functions from RDB */ #define REPLICA_REQ_RDB_CHANNEL (1 << 2) /* Use dual-channel-replication */ -#define REPLICA_REQ_AOF_FORMAT (1 << 3) /* Use AOF-based replication format*/ /* Mask of all bits in the replica requirements bitfield that represent non-standard (filtered) RDB requirements */ #define REPLICA_REQ_RDB_MASK (REPLICA_REQ_RDB_EXCLUDE_DATA | REPLICA_REQ_RDB_EXCLUDE_FUNCTIONS) @@ -1094,10 +1091,10 @@ typedef struct ClientFlags { * flag, we won't cache the primary in freeClient. */ uint64_t fake : 1; /* This is a fake client without a real connection. */ uint64_t import_source : 1; /* This client is importing data to server and can visit expired key. */ - uint64_t replicated : 1; /* This client is a replication source (i.e. primary or slot migration). */ + uint64_t replicated : 1; /* This client is a replication source (i.e. primary or slot migration). */ uint64_t slot_migration_source : 1; /* This client is a slot migration source. */ uint64_t slot_migration_target : 1; /* This client is a slot migration target. */ - uint64_t reserved : 3; /* Reserved for future use */ + uint64_t reserved : 1; /* Reserved for future use */ } ClientFlags; typedef struct ClientPubSubData { @@ -1543,7 +1540,6 @@ typedef struct { #define CHILD_TYPE_AOF 2 #define CHILD_TYPE_LDB 3 #define CHILD_TYPE_MODULE 4 -#define CHILD_TYPE_SYNCSLOTS 5 typedef enum childInfoType { CHILD_INFO_TYPE_CURRENT_INFO, @@ -1714,8 +1710,8 @@ struct valkeyServer { long long stat_net_repl_input_bytes; /* Bytes read during replication, added to stat_net_input_bytes in 'info'. */ /* Bytes written during replication, added to stat_net_output_bytes in 'info'. */ long long stat_net_repl_output_bytes; - long long stat_net_slot_migration_input_bytes; /* Bytes read during replication, added to stat_net_input_bytes in 'info'. */ - long long stat_net_slot_migration_output_bytes; /* Bytes read during replication, added to stat_net_input_bytes in 'info'. */ + long long stat_net_slot_migration_input_bytes; /* Bytes read during slot migration, added to stat_net_input_bytes in 'info'. */ + long long stat_net_slot_migration_output_bytes; /* Bytes written during slot migration, added to stat_net_output_bytes in 'info'. */ size_t stat_current_cow_peak; /* Peak size of copy on write bytes. */ size_t stat_current_cow_bytes; /* Copy on write bytes while child is active. 
*/ monotime stat_current_cow_updated; /* Last update time of stat_current_cow_bytes */ @@ -3690,7 +3686,6 @@ void sdiffCommand(client *c); void sdiffstoreCommand(client *c); void sscanCommand(client *c); void syncCommand(client *c); -void syncSlotsCommand(client *c); void flushdbCommand(client *c); void flushallCommand(client *c); void sortCommand(client *c); From b52d77980f72726d0f886a6f7b02c78ab2041861 Mon Sep 17 00:00:00 2001 From: Jacob Murphy Date: Mon, 20 Jan 2025 09:01:53 +0000 Subject: [PATCH 09/18] Cleanup debug log Signed-off-by: Jacob Murphy --- src/aof.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/aof.c b/src/aof.c index 0cd64820c8..6ee7d99c0a 100644 --- a/src/aof.c +++ b/src/aof.c @@ -2291,7 +2291,6 @@ int rewriteAppendOnlyFileRio(rio *aof, slotBitmap slot_bitmap) { updated_time = now; } } - serverLog(LL_NOTICE, "AOF rewrite: %s, key_count: %ld", keystr, key_count); /* Delay before next key if required (for testing) */ if (server.rdb_key_save_delay) debugDelay(server.rdb_key_save_delay); From 3b5e555c8a03b0344eec9b1871144ffc435cc2ce Mon Sep 17 00:00:00 2001 From: Jacob Murphy Date: Mon, 20 Jan 2025 09:05:57 +0000 Subject: [PATCH 10/18] Rename CLUSTER MIGRATE to CLUSTER IMPORT Signed-off-by: Jacob Murphy --- src/commands.def | 40 +++++++++++++++---------------- src/commands/cluster-migrate.json | 2 +- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/src/commands.def b/src/commands.def index f0a1183e5a..fc910f96bd 100644 --- a/src/commands.def +++ b/src/commands.def @@ -599,6 +599,25 @@ struct COMMAND_ARG CLUSTER_GETKEYSINSLOT_Args[] = { #define CLUSTER_HELP_Keyspecs NULL #endif +/********** CLUSTER IMPORT ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* CLUSTER IMPORT history */ +#define CLUSTER_IMPORT_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* CLUSTER IMPORT tips */ +const char *CLUSTER_IMPORT_Tips[] = { +"nondeterministic_output", +}; +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* CLUSTER IMPORT key specs */ +#define CLUSTER_IMPORT_Keyspecs NULL +#endif + /********** CLUSTER INFO ********************/ #ifndef SKIP_CMD_HISTORY_TABLE @@ -685,25 +704,6 @@ struct COMMAND_ARG CLUSTER_MEET_Args[] = { {MAKE_ARG("cluster-bus-port",ARG_TYPE_INTEGER,-1,NULL,NULL,"4.0.0",CMD_ARG_OPTIONAL,0,NULL)}, }; -/********** CLUSTER MIGRATE ********************/ - -#ifndef SKIP_CMD_HISTORY_TABLE -/* CLUSTER MIGRATE history */ -#define CLUSTER_MIGRATE_History NULL -#endif - -#ifndef SKIP_CMD_TIPS_TABLE -/* CLUSTER MIGRATE tips */ -const char *CLUSTER_MIGRATE_Tips[] = { -"nondeterministic_output", -}; -#endif - -#ifndef SKIP_CMD_KEY_SPECS_TABLE -/* CLUSTER MIGRATE key specs */ -#define CLUSTER_MIGRATE_Keyspecs NULL -#endif - /********** CLUSTER MYID ********************/ #ifndef SKIP_CMD_HISTORY_TABLE @@ -1052,11 +1052,11 @@ struct COMMAND_STRUCT CLUSTER_Subcommands[] = { {MAKE_CMD("forget","Removes a node from the nodes table.","O(1)","3.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_FORGET_History,0,CLUSTER_FORGET_Tips,0,clusterCommand,3,CMD_NO_ASYNC_LOADING|CMD_ADMIN|CMD_STALE,0,CLUSTER_FORGET_Keyspecs,0,NULL,1),.args=CLUSTER_FORGET_Args}, {MAKE_CMD("getkeysinslot","Returns the key names in a hash slot.","O(N) where N is the number of requested keys","3.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_GETKEYSINSLOT_History,0,CLUSTER_GETKEYSINSLOT_Tips,1,clusterCommand,4,CMD_STALE,0,CLUSTER_GETKEYSINSLOT_Keyspecs,0,NULL,2),.args=CLUSTER_GETKEYSINSLOT_Args}, {MAKE_CMD("help","Returns helpful text about the 
different subcommands.","O(1)","5.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_HELP_History,0,CLUSTER_HELP_Tips,0,clusterCommand,2,CMD_LOADING|CMD_STALE,0,CLUSTER_HELP_Keyspecs,0,NULL,0)}, +{MAKE_CMD("import","Initiates server driven hash slot migration, importing the given slot to this shard.","O(N) where N is the total number of hash slot arguments","8.1.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_IMPORT_History,0,CLUSTER_IMPORT_Tips,1,clusterCommand,-2,CMD_ADMIN|CMD_STALE,0,CLUSTER_IMPORT_Keyspecs,0,NULL,0)}, {MAKE_CMD("info","Returns information about the state of a node.","O(1)","3.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_INFO_History,0,CLUSTER_INFO_Tips,1,clusterCommand,2,CMD_LOADING|CMD_STALE,0,CLUSTER_INFO_Keyspecs,0,NULL,0)}, {MAKE_CMD("keyslot","Returns the hash slot for a key.","O(N) where N is the number of bytes in the key","3.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_KEYSLOT_History,0,CLUSTER_KEYSLOT_Tips,0,clusterCommand,3,CMD_STALE,0,CLUSTER_KEYSLOT_Keyspecs,0,NULL,1),.args=CLUSTER_KEYSLOT_Args}, {MAKE_CMD("links","Returns a list of all TCP links to and from peer nodes.","O(N) where N is the total number of Cluster nodes","7.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_LINKS_History,0,CLUSTER_LINKS_Tips,1,clusterCommand,2,CMD_STALE,0,CLUSTER_LINKS_Keyspecs,0,NULL,0)}, {MAKE_CMD("meet","Forces a node to handshake with another node.","O(1)","3.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_MEET_History,1,CLUSTER_MEET_Tips,0,clusterCommand,-4,CMD_NO_ASYNC_LOADING|CMD_ADMIN|CMD_STALE,0,CLUSTER_MEET_Keyspecs,0,NULL,3),.args=CLUSTER_MEET_Args}, -{MAKE_CMD("migrate","Initiates server driven hash slot migration, importing the given slot to this shard.","O(N) where N is the total number of hash slot arguments","8.1.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_MIGRATE_History,0,CLUSTER_MIGRATE_Tips,1,clusterCommand,-2,CMD_ADMIN|CMD_STALE,0,CLUSTER_MIGRATE_Keyspecs,0,NULL,0)}, {MAKE_CMD("myid","Returns the ID of a node.","O(1)","3.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_MYID_History,0,CLUSTER_MYID_Tips,0,clusterCommand,2,CMD_LOADING|CMD_STALE,0,CLUSTER_MYID_Keyspecs,0,NULL,0)}, {MAKE_CMD("myshardid","Returns the shard ID of a node.","O(1)","7.2.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_MYSHARDID_History,0,CLUSTER_MYSHARDID_Tips,1,clusterCommand,2,CMD_LOADING|CMD_STALE,0,CLUSTER_MYSHARDID_Keyspecs,0,NULL,0)}, {MAKE_CMD("nodes","Returns the cluster configuration for a node.","O(N) where N is the total number of Cluster nodes","3.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_NODES_History,0,CLUSTER_NODES_Tips,1,clusterCommand,2,CMD_LOADING|CMD_STALE,0,CLUSTER_NODES_Keyspecs,0,NULL,0)}, diff --git a/src/commands/cluster-migrate.json b/src/commands/cluster-migrate.json index 719e827fa4..e7b34be508 100644 --- a/src/commands/cluster-migrate.json +++ b/src/commands/cluster-migrate.json @@ -1,5 +1,5 @@ { - "MIGRATE": { + "IMPORT": { "summary": "Initiates server driven hash slot migration, importing the given slot to this shard.", "complexity": "O(N) where N is the total number of hash slot arguments", "group": "cluster", From c59a7f7e8e3858699ac0fc809a085723741163d6 Mon Sep 17 00:00:00 2001 From: Jacob Murphy Date: Mon, 20 Jan 2025 09:42:49 +0000 Subject: [PATCH 11/18] Fix implicit fallthrough Signed-off-by: Jacob Murphy --- src/cluster_legacy.c | 1 + 1 file 
changed, 1 insertion(+) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 8ca61e5b1a..ddb25663a0 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -4635,6 +4635,7 @@ void clusterProceedWithSlotImport(void) { * straight to readQueryFromClient. */ connSetReadHandler(curr_import->client->conn, readQueryFromClient); curr_import->state = SLOT_IMPORT_RECEIVE_SYNCSLOTS; + continue; case SLOT_IMPORT_RECEIVE_SYNCSLOTS: /* Nothing to do in this state. Waiting for CLUSTER SYNCSLOTS ENDSNAPSHOT to be processed. */ return; From 586b22e904efaaffbae04c31b911d3e223a13ca6 Mon Sep 17 00:00:00 2001 From: Jacob Murphy Date: Mon, 20 Jan 2025 09:43:38 +0000 Subject: [PATCH 12/18] Fix mac build Signed-off-by: Jacob Murphy --- src/cluster_legacy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index ddb25663a0..f2e1990804 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -4496,7 +4496,7 @@ void clusterFeedSlotMigration(int dbid, robj **argv, int argc) { curr_export->syncslot_offset += curr_export->client->reply_bytes - prev_pending; } -int clusterShouldWriteToSlotMigrationTarget() { +int clusterShouldWriteToSlotMigrationTarget(void) { slotExport *curr_export = clusterGetCurrentSlotExport(); return curr_export && (curr_export->state == SLOT_EXPORT_PAUSE_AND_REPLY || curr_export->state == SLOT_EXPORT_PAUSED); } From 117bfa98ffeea4cb17e1f2ee6782251378e2c7de Mon Sep 17 00:00:00 2001 From: Jacob Murphy Date: Mon, 20 Jan 2025 10:10:16 +0000 Subject: [PATCH 13/18] Apply clang format Signed-off-by: Jacob Murphy --- src/aof.c | 2 +- src/cluster_legacy.c | 412 +++++++++++++++++++++---------------------- src/cluster_legacy.h | 6 +- src/rdb.c | 6 +- src/server.c | 3 +- src/server.h | 36 ++-- 6 files changed, 232 insertions(+), 233 deletions(-) diff --git a/src/aof.c b/src/aof.c index 6ee7d99c0a..8befd2d8a1 100644 --- a/src/aof.c +++ b/src/aof.c @@ -2191,7 +2191,7 @@ static int rewriteFunctions(rio *aof) { return 0; } -int slotFilterPredicate(int slot, void * privdata) { +int slotFilterPredicate(int slot, void *privdata) { if (privdata == NULL) return 1; unsigned char *slot_bitmap = (unsigned char *)privdata; return bitmapTestBit(slot_bitmap, slot); diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index f2e1990804..ae190fc83b 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -2575,7 +2575,7 @@ void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoc /* Was this slot mine and it was in a paused state for slot * migration? If so, mark the move as done. 
*/ - slotExport * curr_export = clusterGetCurrentSlotExport(); + slotExport *curr_export = clusterGetCurrentSlotExport(); if (server.cluster->slots[j] == myself && curr_export && bitmapTestBit(curr_export->slot_bitmap, j)) { bitmapClearBit(curr_export->slot_bitmap, j); if (isSlotBitmapEmpty(curr_export->slot_bitmap)) { @@ -4416,7 +4416,7 @@ void clusterPropagatePublish(robj *channel, robj *message, int sharded) { * -------------------------------------------------------------------------- */ slotImport *clusterCreateSlotImportJob(clusterNode *source, slotBitmap slots) { - slotImport *result = (slotImport *) zcalloc(sizeof(slotImport)); + slotImport *result = (slotImport *)zcalloc(sizeof(slotImport)); memcpy(result->slot_bitmap, slots, sizeof(slotBitmap)); result->source_node = source; result->state = SLOT_IMPORT_QUEUED; @@ -4425,7 +4425,7 @@ slotImport *clusterCreateSlotImportJob(clusterNode *source, slotBitmap slots) { } slotExport *clusterCreateSlotExportJob(client *c, slotBitmap slots) { - slotExport *result = (slotExport *) zcalloc(sizeof(slotExport)); + slotExport *result = (slotExport *)zcalloc(sizeof(slotExport)); memcpy(result->slot_bitmap, slots, sizeof(slotBitmap)); result->state = SLOT_EXPORT_QUEUED; result->pause_end = -1; @@ -4451,12 +4451,12 @@ void clusterFreeSlotExportJob(slotExport *slot_export) { slotImport *clusterGetCurrentSlotImport(void) { if (listLength(server.cluster->slot_import_jobs) == 0) return NULL; - return (slotImport *) listFirst(server.cluster->slot_import_jobs)->value; + return (slotImport *)listFirst(server.cluster->slot_import_jobs)->value; } slotExport *clusterGetCurrentSlotExport(void) { if (listLength(server.cluster->slot_export_jobs) == 0) return NULL; - return (slotExport *) listFirst(server.cluster->slot_export_jobs)->value; + return (slotExport *)listFirst(server.cluster->slot_export_jobs)->value; } void clusterFeedSlotMigration(int dbid, robj **argv, int argc) { @@ -4477,13 +4477,13 @@ void clusterFeedSlotMigration(int dbid, robj **argv, int argc) { getNodeByQuery(server.current_client, cmd, argv, argc, &slot, &error_code); if (error_code != CLUSTER_REDIR_NONE || slot == -1) { /* A couple cases where this could happen: - * - The replicated command is a command without a slot. - * - The replicated command is written by VM_Replicate module APIs - * and is a cross-slot command, or a slot that is not owned by - * this node. - * - * In any case, our best solution is to not replicate this to the - * target node. */ + * - The replicated command is a command without a slot. + * - The replicated command is written by VM_Replicate module APIs + * and is a cross-slot command, or a slot that is not owned by + * this node. + * + * In any case, our best solution is to not replicate this to the + * target node. 
*/ return; } if (!bitmapTestBit(curr_export->slot_bitmap, slot)) return; @@ -4526,10 +4526,10 @@ void clusterProceedWithSlotImport(void) { char *err; while (clusterGetCurrentSlotImport() != NULL) { listNode *curr_node = listFirst(server.cluster->slot_import_jobs); - slotImport *curr_import = (slotImport *) curr_node->value; + slotImport *curr_import = (slotImport *)curr_node->value; if (curr_import->state != SLOT_IMPORT_QUEUED && curr_import->end_time < mstime()) { serverLog(LL_WARNING, - "Timed out for slot import from source node %.40s", curr_import->source_node->name); + "Timed out for slot import from source node %.40s", curr_import->source_node->name); curr_import->state = SLOT_IMPORT_FAILED; } if (curr_import->state > SLOT_IMPORT_PAUSE_OWNER && curr_import->state != SLOT_IMPORT_FAILED && curr_import->pause_end < mstime()) { @@ -4542,155 +4542,155 @@ void clusterProceedWithSlotImport(void) { serverLog(LL_WARNING, "Client for slot import from source node %.40s has been closed", curr_import->source_node->name); curr_import->state = SLOT_IMPORT_FAILED; } - switch(curr_import->state) { - case SLOT_IMPORT_QUEUED: - /* Start the migration */ - serverLog(LL_NOTICE, "Starting slot import from source node %.40s", curr_import->source_node->name); - curr_import->end_time = mstime() + CLUSTER_SLOT_IMPORT_TIMEOUT; - curr_import->conn = connCreate(server.tls_replication ? connectionTypeTls() : connectionTypeTcp()); - if (connConnect(curr_import->conn, curr_import->source_node->ip, getNodeDefaultReplicationPort(curr_import->source_node), server.bind_source_addr, clusterImportHandler) == C_ERR) { - serverLog(LL_WARNING, - "Failed to connect to slot import source node %.40s", curr_import->source_node->name); - curr_import->state = SLOT_IMPORT_FAILED; - continue; - } - curr_import->state = SLOT_IMPORT_CONNECTING; + switch (curr_import->state) { + case SLOT_IMPORT_QUEUED: + /* Start the migration */ + serverLog(LL_NOTICE, "Starting slot import from source node %.40s", curr_import->source_node->name); + curr_import->end_time = mstime() + CLUSTER_SLOT_IMPORT_TIMEOUT; + curr_import->conn = connCreate(server.tls_replication ? connectionTypeTls() : connectionTypeTcp()); + if (connConnect(curr_import->conn, curr_import->source_node->ip, getNodeDefaultReplicationPort(curr_import->source_node), server.bind_source_addr, clusterImportHandler) == C_ERR) { + serverLog(LL_WARNING, + "Failed to connect to slot import source node %.40s", curr_import->source_node->name); + curr_import->state = SLOT_IMPORT_FAILED; continue; - case SLOT_IMPORT_CONNECTING: - if (curr_import->conn->state == CONN_STATE_CONNECTING) { - /* Nothing to do, waiting for connection to be established. */ - return; - } else if (curr_import->conn->state != CONN_STATE_CONNECTED) { - serverLog(LL_WARNING, "Failed during connection to slot import source node %.40s: %s", curr_import->source_node->name, connGetLastError(curr_import->conn)); - curr_import->state = SLOT_IMPORT_FAILED; - continue; - } - serverLog(LL_NOTICE, "Connected to slot import source node %.40s", curr_import->source_node->name); - connSetReadHandler(curr_import->conn, NULL); - client *c = createClient(curr_import->conn); - curr_import->client = c; - c->flag.replicated = 1; - c->flag.slot_migration_source = 1; - c->flag.authenticated = 1; - c->user = NULL; /* This client can do everything. */ - c->querybuf = sdsempty(); /* Similar to primary, we use a dedicated query buf. 
*/ - initClientReplicationData(c); - - curr_import->state = SLOT_IMPORT_SEND_AUTH; + } + curr_import->state = SLOT_IMPORT_CONNECTING; + continue; + case SLOT_IMPORT_CONNECTING: + if (curr_import->conn->state == CONN_STATE_CONNECTING) { + /* Nothing to do, waiting for connection to be established. */ + return; + } else if (curr_import->conn->state != CONN_STATE_CONNECTED) { + serverLog(LL_WARNING, "Failed during connection to slot import source node %.40s: %s", curr_import->source_node->name, connGetLastError(curr_import->conn)); + curr_import->state = SLOT_IMPORT_FAILED; continue; - case SLOT_IMPORT_SEND_AUTH: - if (!server.primary_auth) { - curr_import->state = SLOT_IMPORT_SEND_SYNCSLOTS; - continue; - } - char *auth_args[3] = {"AUTH", NULL, NULL}; - size_t auth_lens[3] = {4, 0, 0}; - int argc = 1; - if (server.primary_user) { - auth_args[argc] = server.primary_user; - auth_lens[argc] = strlen(server.primary_user); - argc++; - } - auth_args[argc] = server.primary_auth; - auth_lens[argc] = sdslen(server.primary_auth); + } + serverLog(LL_NOTICE, "Connected to slot import source node %.40s", curr_import->source_node->name); + connSetReadHandler(curr_import->conn, NULL); + client *c = createClient(curr_import->conn); + curr_import->client = c; + c->flag.replicated = 1; + c->flag.slot_migration_source = 1; + c->flag.authenticated = 1; + c->user = NULL; /* This client can do everything. */ + c->querybuf = sdsempty(); /* Similar to primary, we use a dedicated query buf. */ + initClientReplicationData(c); + + curr_import->state = SLOT_IMPORT_SEND_AUTH; + continue; + case SLOT_IMPORT_SEND_AUTH: + if (!server.primary_auth) { + curr_import->state = SLOT_IMPORT_SEND_SYNCSLOTS; + continue; + } + char *auth_args[3] = {"AUTH", NULL, NULL}; + size_t auth_lens[3] = {4, 0, 0}; + int argc = 1; + if (server.primary_user) { + auth_args[argc] = server.primary_user; + auth_lens[argc] = strlen(server.primary_user); argc++; - err = sendCommandArgv(curr_import->conn, argc, auth_args, auth_lens); - if (err) { - serverLog(LL_WARNING, "Failed to write AUTH to slot migration source: %s", err); - sdsfree(err); - curr_import->state = SLOT_IMPORT_FAILED; - continue; - } - curr_import->state = SLOT_IMPORT_RECEIVE_AUTH; + } + auth_args[argc] = server.primary_auth; + auth_lens[argc] = sdslen(server.primary_auth); + argc++; + err = sendCommandArgv(curr_import->conn, argc, auth_args, auth_lens); + if (err) { + serverLog(LL_WARNING, "Failed to write AUTH to slot migration source: %s", err); + sdsfree(err); + curr_import->state = SLOT_IMPORT_FAILED; continue; - case SLOT_IMPORT_RECEIVE_AUTH: - err = receiveSynchronousResponse(curr_import->conn); - if (err == NULL) { - serverLog(LL_WARNING, "Slot migration source did not respond to AUTH command"); - } - if (err[0] == '-') { - serverLog(LL_WARNING, "Unable to AUTH to slot migration source: %s", err); - sdsfree(err); - } + } + curr_import->state = SLOT_IMPORT_RECEIVE_AUTH; + continue; + case SLOT_IMPORT_RECEIVE_AUTH: + err = receiveSynchronousResponse(curr_import->conn); + if (err == NULL) { + serverLog(LL_WARNING, "Slot migration source did not respond to AUTH command"); + } + if (err[0] == '-') { + serverLog(LL_WARNING, "Unable to AUTH to slot migration source: %s", err); sdsfree(err); - err = NULL; - curr_import->state = SLOT_IMPORT_SEND_SYNCSLOTS; + } + sdsfree(err); + err = NULL; + curr_import->state = SLOT_IMPORT_SEND_SYNCSLOTS; + continue; + case SLOT_IMPORT_SEND_SYNCSLOTS: + /* Ensure we have a clean state for the SYNC. 
*/ + delKeysInSlotBitmap(curr_import->slot_bitmap); + + serverLog(LL_NOTICE, "Sending CLUSTER SYNCSLOTS START request to source node %.40s", curr_import->source_node->name); + char *syncslots_args[4] = {"CLUSTER", "SYNCSLOTS", "START", (char *)curr_import->slot_bitmap}; + size_t syncslots_lens[4] = {7, 9, 5, sizeof(slotBitmap)}; + err = sendCommandArgv(curr_import->conn, 4, syncslots_args, syncslots_lens); + if (err) { + serverLog(LL_WARNING, "Failed to write SYNCSLOTS START to slot migration source: %s", err); + sdsfree(err); + curr_import->state = SLOT_IMPORT_FAILED; continue; - case SLOT_IMPORT_SEND_SYNCSLOTS: - /* Ensure we have a clean state for the SYNC. */ - delKeysInSlotBitmap(curr_import->slot_bitmap); - - serverLog(LL_NOTICE, "Sending CLUSTER SYNCSLOTS START request to source node %.40s", curr_import->source_node->name); - char *syncslots_args[4] = {"CLUSTER", "SYNCSLOTS", "START", (char *)curr_import->slot_bitmap}; - size_t syncslots_lens[4] = {7, 9, 5, sizeof(slotBitmap)}; - err = sendCommandArgv(curr_import->conn, 4, syncslots_args, syncslots_lens); - if (err) { - serverLog(LL_WARNING, "Failed to write SYNCSLOTS START to slot migration source: %s", err); - sdsfree(err); - curr_import->state = SLOT_IMPORT_FAILED; - continue; - } + } - /* Our result will be received in AOF format, so we can pipe it - * straight to readQueryFromClient. */ - connSetReadHandler(curr_import->client->conn, readQueryFromClient); - curr_import->state = SLOT_IMPORT_RECEIVE_SYNCSLOTS; - continue; - case SLOT_IMPORT_RECEIVE_SYNCSLOTS: - /* Nothing to do in this state. Waiting for CLUSTER SYNCSLOTS ENDSNAPSHOT to be processed. */ - return; - case SLOT_IMPORT_PAUSE_OWNER: - curr_import->client->flag.replication_force_reply = 1; - addReplyArrayLen(curr_import->client, 3); - addReplyBulkCBuffer(curr_import->client, "CLUSTER", 7); - addReplyBulkCBuffer(curr_import->client, "SYNCSLOTS", 9); - addReplyBulkCBuffer(curr_import->client, "PAUSE", 5); - curr_import->client->flag.replication_force_reply = 0; - - serverLog(LL_NOTICE, "SYNCSLOTS from slot migration source %.40s (%s) has been performed. Pausing source node and waiting to continue", curr_import->source_node->name, curr_import->source_node->human_nodename); - curr_import->paused_at_offset = -1; - curr_import->pause_end = mstime() + CLUSTER_MF_TIMEOUT; - curr_import->state = SLOT_IMPORT_WAITING_FOR_OFFSET; + /* Our result will be received in AOF format, so we can pipe it + * straight to readQueryFromClient. */ + connSetReadHandler(curr_import->client->conn, readQueryFromClient); + curr_import->state = SLOT_IMPORT_RECEIVE_SYNCSLOTS; + continue; + case SLOT_IMPORT_RECEIVE_SYNCSLOTS: + /* Nothing to do in this state. Waiting for CLUSTER SYNCSLOTS ENDSNAPSHOT to be processed. */ + return; + case SLOT_IMPORT_PAUSE_OWNER: + curr_import->client->flag.replication_force_reply = 1; + addReplyArrayLen(curr_import->client, 3); + addReplyBulkCBuffer(curr_import->client, "CLUSTER", 7); + addReplyBulkCBuffer(curr_import->client, "SYNCSLOTS", 9); + addReplyBulkCBuffer(curr_import->client, "PAUSE", 5); + curr_import->client->flag.replication_force_reply = 0; + + serverLog(LL_NOTICE, "SYNCSLOTS from slot migration source %.40s (%s) has been performed. 
Pausing source node and waiting to continue", curr_import->source_node->name, curr_import->source_node->human_nodename); + curr_import->paused_at_offset = -1; + curr_import->pause_end = mstime() + CLUSTER_MF_TIMEOUT; + curr_import->state = SLOT_IMPORT_WAITING_FOR_OFFSET; + continue; + case SLOT_IMPORT_WAITING_FOR_OFFSET: + return; + case SLOT_IMPORT_SYNCING_TO_OFFSET: + if (curr_import->client->repl_data->reploff >= curr_import->paused_at_offset) { + serverLog(LL_NOTICE, "SYNCSLOTS of slot range has caught up to paused slot owner (%lld), slot migration can start.", curr_import->paused_at_offset); + curr_import->state = SLOT_IMPORT_FINISH; continue; - case SLOT_IMPORT_WAITING_FOR_OFFSET: - return; - case SLOT_IMPORT_SYNCING_TO_OFFSET: - if (curr_import->client->repl_data->reploff >= curr_import->paused_at_offset) { - serverLog(LL_NOTICE, "SYNCSLOTS of slot range has caught up to paused slot owner (%lld), slot migration can start.", curr_import->paused_at_offset); - curr_import->state = SLOT_IMPORT_FINISH; - continue; - } - /* Need to wait for the sync to progress further */ - return; - case SLOT_IMPORT_FINISH: - serverLog(LL_NOTICE, "Setting myself to owner of migrating slots and broadcasting"); - for (int i = 0; i < CLUSTER_SLOTS; i++) { - if (bitmapTestBit(curr_import->slot_bitmap, i)) { - clusterDelSlot(i); - clusterAddSlot(myself, i); - } - } - clusterUpdateState(); - clusterSaveConfigOrDie(1); - if (clusterBumpConfigEpochWithoutConsensus() == C_OK) { - serverLog(LL_NOTICE, "ConfigEpoch updated after importing slots"); + } + /* Need to wait for the sync to progress further */ + return; + case SLOT_IMPORT_FINISH: + serverLog(LL_NOTICE, "Setting myself to owner of migrating slots and broadcasting"); + for (int i = 0; i < CLUSTER_SLOTS; i++) { + if (bitmapTestBit(curr_import->slot_bitmap, i)) { + clusterDelSlot(i); + clusterAddSlot(myself, i); } - clusterFreeSlotImportJob(curr_import); - clusterBroadcastPong(CLUSTER_BROADCAST_ALL); - listDelNode(server.cluster->slot_import_jobs, curr_node); - continue; - case SLOT_IMPORT_FAILED: - listDelNode(server.cluster->slot_import_jobs, curr_node); - delKeysInSlotBitmap(curr_import->slot_bitmap); - clusterFreeSlotImportJob(curr_import); - continue; + } + clusterUpdateState(); + clusterSaveConfigOrDie(1); + if (clusterBumpConfigEpochWithoutConsensus() == C_OK) { + serverLog(LL_NOTICE, "ConfigEpoch updated after importing slots"); + } + clusterFreeSlotImportJob(curr_import); + clusterBroadcastPong(CLUSTER_BROADCAST_ALL); + listDelNode(server.cluster->slot_import_jobs, curr_node); + continue; + case SLOT_IMPORT_FAILED: + listDelNode(server.cluster->slot_import_jobs, curr_node); + delKeysInSlotBitmap(curr_import->slot_bitmap); + clusterFreeSlotImportJob(curr_import); + continue; } } } int childSnapshotForSyncSlot(int req, rio *rdb, void *privdata) { UNUSED(req); - int retval = rewriteAppendOnlyFileRio(rdb, (unsigned char *) privdata); + int retval = rewriteAppendOnlyFileRio(rdb, (unsigned char *)privdata); rioWrite(rdb, "*3\r\n", 4); rioWriteBulkString(rdb, "CLUSTER", 7); rioWriteBulkString(rdb, "SYNCSLOTS", 9); @@ -4701,67 +4701,67 @@ int childSnapshotForSyncSlot(int req, rio *rdb, void *privdata) { void clusterProceedWithSlotExport(void) { while (clusterGetCurrentSlotExport() != NULL) { listNode *curr_node = listFirst(server.cluster->slot_export_jobs); - slotExport *curr_export = (slotExport *) curr_node->value; + slotExport *curr_export = (slotExport *)curr_node->value; if (curr_export->client == NULL) { serverLog(LL_WARNING, "Client for slot export 
has been closed"); curr_export->state = SLOT_EXPORT_FAILED; } - switch(curr_export->state) { - case SLOT_EXPORT_QUEUED: - if (hasActiveChildProcess()) { - /* We need to wait for the child to die, then we can - * proceed. */ - return; - } - connection ** conns = zmalloc(sizeof(connection*)); - *conns = curr_export->client->conn; - serverLog(LL_NOTICE, "Initiating snapshot to conn with fd %d", curr_export->client->conn->fd); - if (saveSnapshotToConnectionSockets(conns, 1, 1, 0, childSnapshotForSyncSlot, curr_export->slot_bitmap) != C_OK) { - serverLog(LL_WARNING, "Failed to start slot export to target"); - curr_export->state = SLOT_EXPORT_FAILED; - continue; - } - curr_export->state = SLOT_EXPORT_SNAPSHOTTING; - continue; - case SLOT_EXPORT_SNAPSHOTTING: - /* During this time, we are waiting for SYNCSLOTS PAUSE to - * start flushing the accumulated backlog. */ + switch (curr_export->state) { + case SLOT_EXPORT_QUEUED: + if (hasActiveChildProcess()) { + /* We need to wait for the child to die, then we can + * proceed. */ return; - case SLOT_EXPORT_PAUSE_AND_REPLY: - addReplyArrayLen(curr_export->client, 4); - addReplyBulkCBuffer(curr_export->client, "CLUSTER", 7); - addReplyBulkCBuffer(curr_export->client, "SYNCSLOTS", 9); - addReplyBulkCBuffer(curr_export->client, "PAUSEOFFSET", 11); - addReplyBulkLongLong(curr_export->client, curr_export->syncslot_offset); - - /* Even though we just added replies, it's possible that, due to - * existing pending data, the client is not in the pending write - * queue. We enqueue it explicitly to work around this. */ - putClientInPendingWriteQueue(curr_export->client); - - curr_export->pause_end = mstime() + (CLUSTER_MF_TIMEOUT * CLUSTER_MF_PAUSE_MULT); - pauseActions(PAUSE_DURING_SLOT_MIGRATION, curr_export->pause_end, PAUSE_ACTIONS_CLIENT_WRITE_SET); - - curr_export->state = SLOT_EXPORT_PAUSED; + } + connection **conns = zmalloc(sizeof(connection *)); + *conns = curr_export->client->conn; + serverLog(LL_NOTICE, "Initiating snapshot to conn with fd %d", curr_export->client->conn->fd); + if (saveSnapshotToConnectionSockets(conns, 1, 1, 0, childSnapshotForSyncSlot, curr_export->slot_bitmap) != C_OK) { + serverLog(LL_WARNING, "Failed to start slot export to target"); + curr_export->state = SLOT_EXPORT_FAILED; continue; - case SLOT_EXPORT_PAUSED: - /* While paused, we simply want to check if we should unpause. */ - if (curr_export->pause_end <= mstime()) { - /* Every CLUSTER_MF_TIMEOUT, the source node should - * re-attempt the pause. If we reach this point, it hasn't - * attempted the pause in that time, we can assume it is - * dead and fail the migration.*/ - serverLog(LL_WARNING, "During slot export, unpausing self and cancelling export due to timeout."); - unpauseActions(PAUSE_DURING_SLOT_MIGRATION); - curr_export->state = SLOT_EXPORT_FAILED; - continue; - } - return; - case SLOT_EXPORT_FINISH: - case SLOT_EXPORT_FAILED: - listDelNode(server.cluster->slot_export_jobs, curr_node); - clusterFreeSlotExportJob(curr_export); + } + curr_export->state = SLOT_EXPORT_SNAPSHOTTING; + continue; + case SLOT_EXPORT_SNAPSHOTTING: + /* During this time, we are waiting for SYNCSLOTS PAUSE to + * start flushing the accumulated backlog. 
*/ + return; + case SLOT_EXPORT_PAUSE_AND_REPLY: + addReplyArrayLen(curr_export->client, 4); + addReplyBulkCBuffer(curr_export->client, "CLUSTER", 7); + addReplyBulkCBuffer(curr_export->client, "SYNCSLOTS", 9); + addReplyBulkCBuffer(curr_export->client, "PAUSEOFFSET", 11); + addReplyBulkLongLong(curr_export->client, curr_export->syncslot_offset); + + /* Even though we just added replies, it's possible that, due to + * existing pending data, the client is not in the pending write + * queue. We enqueue it explicitly to work around this. */ + putClientInPendingWriteQueue(curr_export->client); + + curr_export->pause_end = mstime() + (CLUSTER_MF_TIMEOUT * CLUSTER_MF_PAUSE_MULT); + pauseActions(PAUSE_DURING_SLOT_MIGRATION, curr_export->pause_end, PAUSE_ACTIONS_CLIENT_WRITE_SET); + + curr_export->state = SLOT_EXPORT_PAUSED; + continue; + case SLOT_EXPORT_PAUSED: + /* While paused, we simply want to check if we should unpause. */ + if (curr_export->pause_end <= mstime()) { + /* Every CLUSTER_MF_TIMEOUT, the source node should + * re-attempt the pause. If we reach this point, it hasn't + * attempted the pause in that time, we can assume it is + * dead and fail the migration.*/ + serverLog(LL_WARNING, "During slot export, unpausing self and cancelling export due to timeout."); + unpauseActions(PAUSE_DURING_SLOT_MIGRATION); + curr_export->state = SLOT_EXPORT_FAILED; continue; + } + return; + case SLOT_EXPORT_FINISH: + case SLOT_EXPORT_FAILED: + listDelNode(server.cluster->slot_export_jobs, curr_node); + clusterFreeSlotExportJob(curr_export); + continue; } } } @@ -7532,8 +7532,8 @@ int clusterCommandSpecial(client *c) { slotBitmap requested_slots; memset(requested_slots, 0, sizeof(slotBitmap)); int i; - clusterNode * curr_owner = NULL; - for (i = 3; i + 1 < c->argc; i+=2) { + clusterNode *curr_owner = NULL; + for (i = 3; i + 1 < c->argc; i += 2) { int start = getSlotOrReply(c, c->argv[i]); if (start < 0) { return 1; @@ -7571,7 +7571,7 @@ int clusterCommandSpecial(client *c) { } } - slotImport * to_enqueue = clusterCreateSlotImportJob(curr_owner, requested_slots); + slotImport *to_enqueue = clusterCreateSlotImportJob(curr_owner, requested_slots); listAddNodeTail(server.cluster->slot_import_jobs, to_enqueue); clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_SLOTMIGRATION); addReply(c, shared.ok); @@ -7583,7 +7583,7 @@ int clusterCommandSpecial(client *c) { if (!strcasecmp(c->argv[2]->ptr, "start")) { /* CLUSTER SYNCSLOTS START */ if (c->argc != 4) { - addReplyError(c, "CLUSTER SYNCSLOTS START command requires exactly one argument"); + addReplyError(c, "CLUSTER SYNCSLOTS START command requires exactly one argument"); return 1; } if (sdslen(c->argv[3]->ptr) != sizeof(slotBitmap)) { diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index eb9ecc5bb1..00e686997a 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -132,9 +132,9 @@ typedef struct { } clusterMsgDataPublish; typedef struct { - uint64_t configEpoch; /* Config epoch of the specified instance. */ - char nodename[CLUSTER_NAMELEN]; /* Name of the slots owner. */ - slotBitmap slots; /* Slots bitmap. */ + uint64_t configEpoch; /* Config epoch of the specified instance. */ + char nodename[CLUSTER_NAMELEN]; /* Name of the slots owner. */ + slotBitmap slots; /* Slots bitmap. 
*/ } clusterMsgDataUpdate; typedef struct { diff --git a/src/rdb.c b/src/rdb.c index e8d4bfae1a..36ae825670 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -3685,14 +3685,14 @@ int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi) { } } - int retval = saveSnapshotToConnectionSockets(conns, connsnum, !dual_channel, req, childSnapshotUsingRDB, (void *) rsi); + int retval = saveSnapshotToConnectionSockets(conns, connsnum, !dual_channel, req, childSnapshotUsingRDB, (void *)rsi); if (retval != C_OK) { serverLog(LL_WARNING, "Can't save in background: fork: %s", strerror(errno)); /* Undo the state change. The caller will perform cleanup on - * all the replicas in BGSAVE_START state, but an early call to - * replicationSetupReplicaForFullResync() turned it into BGSAVE_END */ + * all the replicas in BGSAVE_START state, but an early call to + * replicationSetupReplicaForFullResync() turned it into BGSAVE_END */ listRewind(server.replicas, &li); while ((ln = listNext(&li))) { client *replica = ln->value; diff --git a/src/server.c b/src/server.c index 50b93b2943..8654c89df0 100644 --- a/src/server.c +++ b/src/server.c @@ -4305,8 +4305,7 @@ int processCommand(client *c) { /* If the server is paused, block the client until * the pause has ended. Replicas are never paused. */ - if (!c->flag.replica && !c->flag.slot_migration_target && ((isPausedActions(PAUSE_ACTION_CLIENT_ALL)) || - ((isPausedActions(PAUSE_ACTION_CLIENT_WRITE)) && is_may_replicate_command))) { + if (!c->flag.replica && !c->flag.slot_migration_target && ((isPausedActions(PAUSE_ACTION_CLIENT_ALL)) || ((isPausedActions(PAUSE_ACTION_CLIENT_WRITE)) && is_may_replicate_command))) { blockPostponeClient(c); return C_OK; } diff --git a/src/server.h b/src/server.h index 1344db1de9..6269461ed5 100644 --- a/src/server.h +++ b/src/server.h @@ -182,15 +182,15 @@ struct hdr_histogram; #define RIO_CONNSET_WRITE_MAX_CHUNK_SIZE 16384 /* Instantaneous metrics tracking. */ -#define STATS_METRIC_SAMPLES 16 /* Number of samples per metric. */ -#define STATS_METRIC_COMMAND 0 /* Number of commands executed. */ -#define STATS_METRIC_NET_INPUT 1 /* Bytes read to network. */ -#define STATS_METRIC_NET_OUTPUT 2 /* Bytes written to network. */ -#define STATS_METRIC_NET_INPUT_REPLICATION 3 /* Bytes read to network during replication. */ -#define STATS_METRIC_NET_OUTPUT_REPLICATION 4 /* Bytes written to network during replication. */ -#define STATS_METRIC_EL_CYCLE 5 /* Number of eventloop cycled. */ -#define STATS_METRIC_EL_DURATION 6 /* Eventloop duration. */ -#define STATS_METRIC_NET_INPUT_SLOT_MIGRATION 7 /* Bytes read to network during slot migration. */ +#define STATS_METRIC_SAMPLES 16 /* Number of samples per metric. */ +#define STATS_METRIC_COMMAND 0 /* Number of commands executed. */ +#define STATS_METRIC_NET_INPUT 1 /* Bytes read to network. */ +#define STATS_METRIC_NET_OUTPUT 2 /* Bytes written to network. */ +#define STATS_METRIC_NET_INPUT_REPLICATION 3 /* Bytes read to network during replication. */ +#define STATS_METRIC_NET_OUTPUT_REPLICATION 4 /* Bytes written to network during replication. */ +#define STATS_METRIC_EL_CYCLE 5 /* Number of eventloop cycled. */ +#define STATS_METRIC_EL_DURATION 6 /* Eventloop duration. */ +#define STATS_METRIC_NET_INPUT_SLOT_MIGRATION 7 /* Bytes read to network during slot migration. */ #define STATS_METRIC_NET_OUTPUT_SLOT_MIGRATION 7 /* Bytes written to network during slot migration. 
*/ #define STATS_METRIC_COUNT 8 @@ -380,10 +380,10 @@ typedef enum blocking_type { #define CLIENT_TYPE_PUBSUB 2 /* Clients subscribed to PubSub channels. */ #define CLIENT_TYPE_PRIMARY 3 /* Primary. */ #define CLIENT_TYPE_SLOT_MIGRATION 4 /* Slot migration client. */ -#define CLIENT_TYPE_COUNT 5 /* Total number of client types. */ -#define CLIENT_TYPE_OBUF_COUNT 3 /* Number of clients to expose to output \ - buffer configuration. Just the first \ - three: normal, replica, pubsub. */ +#define CLIENT_TYPE_COUNT 5 /* Total number of client types. */ +#define CLIENT_TYPE_OBUF_COUNT 3 /* Number of clients to expose to output \ + buffer configuration. Just the first \ + three: normal, replica, pubsub. */ /* Replica replication state. Used in server.repl_state for replicas to remember * what to do next. */ @@ -1110,8 +1110,8 @@ typedef struct ClientPubSubData { context of client side caching. */ } ClientPubSubData; -#define CLUSTER_SLOT_MASK_BITS 14 /* Number of bits used for slot id. */ -#define CLUSTER_SLOTS (1 << CLUSTER_SLOT_MASK_BITS) /* Total number of slots in cluster mode, which is 16384. */ +#define CLUSTER_SLOT_MASK_BITS 14 /* Number of bits used for slot id. */ +#define CLUSTER_SLOTS (1 << CLUSTER_SLOT_MASK_BITS) /* Total number of slots in cluster mode, which is 16384. */ typedef unsigned char slotBitmap[CLUSTER_SLOTS / 8]; @@ -1710,8 +1710,8 @@ struct valkeyServer { long long stat_net_repl_input_bytes; /* Bytes read during replication, added to stat_net_input_bytes in 'info'. */ /* Bytes written during replication, added to stat_net_output_bytes in 'info'. */ long long stat_net_repl_output_bytes; - long long stat_net_slot_migration_input_bytes; /* Bytes read during slot migration, added to stat_net_input_bytes in 'info'. */ - long long stat_net_slot_migration_output_bytes; /* Bytes written during slot migration, added to stat_net_output_bytes in 'info'. */ + long long stat_net_slot_migration_input_bytes; /* Bytes read during slot migration, added to stat_net_input_bytes in 'info'. */ + long long stat_net_slot_migration_output_bytes; /* Bytes written during slot migration, added to stat_net_output_bytes in 'info'. */ size_t stat_current_cow_peak; /* Peak size of copy on write bytes. */ size_t stat_current_cow_bytes; /* Copy on write bytes while child is active. */ monotime stat_current_cow_updated; /* Last update time of stat_current_cow_bytes */ @@ -2962,7 +2962,7 @@ void updateLoadingFileName(char *filename); void startSaving(int rdbflags); void stopSaving(int success); int allPersistenceDisabled(void); -typedef int(*ChildSnapshotFunc)(int req, rio *rdb, void *privdata); +typedef int (*ChildSnapshotFunc)(int req, rio *rdb, void *privdata); int saveSnapshotToConnectionSockets(connection **conns, int connsnum, int use_pipe, int req, ChildSnapshotFunc snapshot_func, void *privdata); #define DISK_ERROR_TYPE_AOF 1 /* Don't accept writes: AOF errors. 
*/ From eb4c1a896d8ab4faae71c755ca7426cac49e53fd Mon Sep 17 00:00:00 2001 From: Jacob Murphy Date: Wed, 22 Jan 2025 00:45:36 +0000 Subject: [PATCH 14/18] Fix offset accounting and handle source primary failover Signed-off-by: Jacob Murphy --- src/cluster_legacy.c | 130 +++++++++++++++++++++++++++++++------------ src/cluster_legacy.h | 4 +- src/replication.c | 2 + 3 files changed, 99 insertions(+), 37 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index ae190fc83b..1357fdaf1d 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -2625,6 +2625,19 @@ void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoc } } + /* Handle the case that we are importing the slot via atomic + * slot migration and the ownership changes. */ + slotImport *curr_import = clusterGetCurrentSlotImport(); + if (curr_import != NULL && bitmapTestBit(curr_import->slot_bitmap, j) && curr_import->source_node != sender && curr_import->state != SLOT_IMPORT_FAILED) { + if (areInSameShard(sender, curr_import->source_node)) { + serverLog(LL_WARNING, "Failover occurred during slot migration from %.40s (%s). Cancelling the migration.", curr_import->source_node->name, curr_import->source_node->human_nodename); + } else { + serverLog(LL_WARNING, "Slot %d has been moved to a different shard than that of %.40s (%s). Cancelling the migration.", j, curr_import->source_node->name, curr_import->source_node->human_nodename); + } + curr_import->state = SLOT_IMPORT_FAILED; + clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_SLOTMIGRATION); + } + clusterDelSlot(j); clusterAddSlot(sender, j); bitmapClearBit(server.cluster->owner_not_claiming_slot, j); @@ -4428,7 +4441,6 @@ slotExport *clusterCreateSlotExportJob(client *c, slotBitmap slots) { slotExport *result = (slotExport *)zcalloc(sizeof(slotExport)); memcpy(result->slot_bitmap, slots, sizeof(slotBitmap)); result->state = SLOT_EXPORT_QUEUED; - result->pause_end = -1; result->client = c; return result; } @@ -4488,12 +4500,14 @@ void clusterFeedSlotMigration(int dbid, robj **argv, int argc) { } if (!bitmapTestBit(curr_export->slot_bitmap, slot)) return; - unsigned long long prev_pending = curr_export->client->reply_bytes; addReplyArrayLen(curr_export->client, argc); + /* '*' + argc + '\r\n' */ + curr_export->streamed_repl_offset += 1 + digits10(argc) + 2; for (i = 0; i < argc; i++) { addReplyBulk(curr_export->client, argv[i]); + /* '$' + len(argv[i]) + '\r\n' + argv[i] + '\r\n' */ + curr_export->streamed_repl_offset += 1 + digits10(stringObjectLen(argv[i])) + 2 + stringObjectLen(argv[i]) + 2; } - curr_export->syncslot_offset += curr_export->client->reply_bytes - prev_pending; } int clusterShouldWriteToSlotMigrationTarget(void) { @@ -4524,33 +4538,36 @@ void clusterImportHandler(connection *conn) { void clusterProceedWithSlotImport(void) { char *err; + mstime_t now; while (clusterGetCurrentSlotImport() != NULL) { + now = mstime(); listNode *curr_node = listFirst(server.cluster->slot_import_jobs); slotImport *curr_import = (slotImport *)curr_node->value; - if (curr_import->state != SLOT_IMPORT_QUEUED && curr_import->end_time < mstime()) { - serverLog(LL_WARNING, - "Timed out for slot import from source node %.40s", curr_import->source_node->name); - curr_import->state = SLOT_IMPORT_FAILED; - } - if (curr_import->state > SLOT_IMPORT_PAUSE_OWNER && curr_import->state != SLOT_IMPORT_FAILED && curr_import->pause_end < mstime()) { - /* If the owner ever unpauses, we have to move back in the state machine and retry. 
*/ - serverLog(LL_WARNING, "Reinitiating pause on the node owning the slot range..."); - curr_import->state = SLOT_IMPORT_PAUSE_OWNER; - curr_import->pause_end = mstime() + CLUSTER_MF_TIMEOUT; - } - if (curr_import->state > SLOT_IMPORT_CONNECTING && curr_import->client == NULL) { - serverLog(LL_WARNING, "Client for slot import from source node %.40s has been closed", curr_import->source_node->name); - curr_import->state = SLOT_IMPORT_FAILED; + if (curr_import->state != SLOT_IMPORT_FAILED) { + if (curr_import->end_time && curr_import->end_time < now) { + serverLog(LL_WARNING, + "Timed out for slot import from source node %.40s", curr_import->source_node->name); + curr_import->state = SLOT_IMPORT_FAILED; + } else if (curr_import->state > SLOT_IMPORT_CONNECTING && curr_import->client == NULL) { + serverLog(LL_WARNING, "Client for slot import from source node %.40s (%s) has been closed. Cancelling the migration.", curr_import->source_node->name, curr_import->source_node->human_nodename); + curr_import->state = SLOT_IMPORT_FAILED; + } else if (nodeIsReplica(curr_import->source_node)) { + serverLog(LL_WARNING, "Source node %.40s (%s) has been demoted to a replica. Cancelling the migration.", curr_import->source_node->name, curr_import->source_node->human_nodename); + curr_import->state = SLOT_IMPORT_FAILED; + } else if (curr_import->pause_end && curr_import->pause_end < now) { + /* If the owner ever unpauses, we have to move back in the state machine and retry. */ + serverLog(LL_WARNING, "Reinitiating pause on the node owning the slot range..."); + curr_import->state = SLOT_IMPORT_PAUSE_OWNER; + } } switch (curr_import->state) { case SLOT_IMPORT_QUEUED: - /* Start the migration */ serverLog(LL_NOTICE, "Starting slot import from source node %.40s", curr_import->source_node->name); - curr_import->end_time = mstime() + CLUSTER_SLOT_IMPORT_TIMEOUT; + curr_import->end_time = now + CLUSTER_SLOT_IMPORT_TIMEOUT; curr_import->conn = connCreate(server.tls_replication ? connectionTypeTls() : connectionTypeTcp()); if (connConnect(curr_import->conn, curr_import->source_node->ip, getNodeDefaultReplicationPort(curr_import->source_node), server.bind_source_addr, clusterImportHandler) == C_ERR) { serverLog(LL_WARNING, - "Failed to connect to slot import source node %.40s", curr_import->source_node->name); + "Failed to connect to slot import source node %.40s (%s). Cancelling the migration.", curr_import->source_node->name, curr_import->source_node->human_nodename); curr_import->state = SLOT_IMPORT_FAILED; continue; } @@ -4561,11 +4578,11 @@ void clusterProceedWithSlotImport(void) { /* Nothing to do, waiting for connection to be established. */ return; } else if (curr_import->conn->state != CONN_STATE_CONNECTED) { - serverLog(LL_WARNING, "Failed during connection to slot import source node %.40s: %s", curr_import->source_node->name, connGetLastError(curr_import->conn)); + serverLog(LL_WARNING, "Failed during connection to slot import source node %.40s (%s): %s. 
Cancelling the migration.", curr_import->source_node->name, curr_import->source_node->human_nodename, connGetLastError(curr_import->conn)); curr_import->state = SLOT_IMPORT_FAILED; continue; } - serverLog(LL_NOTICE, "Connected to slot import source node %.40s", curr_import->source_node->name); + serverLog(LL_NOTICE, "Connected to slot import source node %.40s (%s)", curr_import->source_node->name, curr_import->source_node->human_nodename); connSetReadHandler(curr_import->conn, NULL); client *c = createClient(curr_import->conn); curr_import->client = c; @@ -4596,7 +4613,7 @@ void clusterProceedWithSlotImport(void) { argc++; err = sendCommandArgv(curr_import->conn, argc, auth_args, auth_lens); if (err) { - serverLog(LL_WARNING, "Failed to write AUTH to slot migration source: %s", err); + serverLog(LL_WARNING, "Failed to write AUTH to slot migration source %.40s (%s): %s. Cancelling the migration.", curr_import->source_node->name, curr_import->source_node->human_nodename, err); sdsfree(err); curr_import->state = SLOT_IMPORT_FAILED; continue; @@ -4606,26 +4623,31 @@ void clusterProceedWithSlotImport(void) { case SLOT_IMPORT_RECEIVE_AUTH: err = receiveSynchronousResponse(curr_import->conn); if (err == NULL) { - serverLog(LL_WARNING, "Slot migration source did not respond to AUTH command"); + serverLog(LL_WARNING, "Slot migration source %.40s (%s) did not respond to AUTH command. Cancelling the migration.", curr_import->source_node->name, curr_import->source_node->human_nodename); + curr_import->state = SLOT_IMPORT_FAILED; + continue; } if (err[0] == '-') { - serverLog(LL_WARNING, "Unable to AUTH to slot migration source: %s", err); + serverLog(LL_WARNING, "Unable to AUTH to slot migration source %.40s (%s): %s", curr_import->source_node->name, curr_import->source_node->human_nodename, err); sdsfree(err); + curr_import->state = SLOT_IMPORT_FAILED; + continue; } sdsfree(err); err = NULL; + serverLog(LL_NOTICE, "Successfully authenticated to slot migration source %.40s (%s)", curr_import->source_node->name, curr_import->source_node->human_nodename); curr_import->state = SLOT_IMPORT_SEND_SYNCSLOTS; continue; case SLOT_IMPORT_SEND_SYNCSLOTS: /* Ensure we have a clean state for the SYNC. */ delKeysInSlotBitmap(curr_import->slot_bitmap); - serverLog(LL_NOTICE, "Sending CLUSTER SYNCSLOTS START request to source node %.40s", curr_import->source_node->name); + serverLog(LL_NOTICE, "Sending CLUSTER SYNCSLOTS START request to source node %.40s (%s).", curr_import->source_node->name, curr_import->source_node->human_nodename); char *syncslots_args[4] = {"CLUSTER", "SYNCSLOTS", "START", (char *)curr_import->slot_bitmap}; size_t syncslots_lens[4] = {7, 9, 5, sizeof(slotBitmap)}; err = sendCommandArgv(curr_import->conn, 4, syncslots_args, syncslots_lens); if (err) { - serverLog(LL_WARNING, "Failed to write SYNCSLOTS START to slot migration source: %s", err); + serverLog(LL_WARNING, "Failed to write SYNCSLOTS START to slot migration source %.40s (%s): %s", curr_import->source_node->name, curr_import->source_node->human_nodename, err); sdsfree(err); curr_import->state = SLOT_IMPORT_FAILED; continue; @@ -4647,23 +4669,23 @@ void clusterProceedWithSlotImport(void) { addReplyBulkCBuffer(curr_import->client, "PAUSE", 5); curr_import->client->flag.replication_force_reply = 0; - serverLog(LL_NOTICE, "SYNCSLOTS from slot migration source %.40s (%s) has been performed. 
Pausing source node and waiting to continue", curr_import->source_node->name, curr_import->source_node->human_nodename); + serverLog(LL_NOTICE, "SYNCSLOTS from slot migration source %.40s (%s) has been performed, received offset %lld. Pausing source node and waiting to continue", curr_import->source_node->name, curr_import->source_node->human_nodename, curr_import->client->repl_data->reploff); curr_import->paused_at_offset = -1; - curr_import->pause_end = mstime() + CLUSTER_MF_TIMEOUT; + curr_import->pause_end = now + CLUSTER_MF_TIMEOUT; curr_import->state = SLOT_IMPORT_WAITING_FOR_OFFSET; continue; case SLOT_IMPORT_WAITING_FOR_OFFSET: return; case SLOT_IMPORT_SYNCING_TO_OFFSET: if (curr_import->client->repl_data->reploff >= curr_import->paused_at_offset) { - serverLog(LL_NOTICE, "SYNCSLOTS of slot range has caught up to paused slot owner (%lld), slot migration can start.", curr_import->paused_at_offset); + serverLog(LL_NOTICE, "SYNCSLOTS of slot range has caught up to paused slot owner %.40s (%s): my offset %lld, source offset %lld, slot migration can start.", curr_import->source_node->name, curr_import->source_node->human_nodename, curr_import->client->repl_data->reploff, curr_import->paused_at_offset); curr_import->state = SLOT_IMPORT_FINISH; continue; } /* Need to wait for the sync to progress further */ return; case SLOT_IMPORT_FINISH: - serverLog(LL_NOTICE, "Setting myself to owner of migrating slots and broadcasting"); + serverLog(LL_NOTICE, "Setting myself to owner of migrating slots and broadcasting."); for (int i = 0; i < CLUSTER_SLOTS; i++) { if (bitmapTestBit(curr_import->slot_bitmap, i)) { clusterDelSlot(i); @@ -4673,7 +4695,7 @@ void clusterProceedWithSlotImport(void) { clusterUpdateState(); clusterSaveConfigOrDie(1); if (clusterBumpConfigEpochWithoutConsensus() == C_OK) { - serverLog(LL_NOTICE, "ConfigEpoch updated after importing slots"); + serverLog(LL_NOTICE, "Epoch bumped after importing slots. New epoch %ld", server.cluster->currentEpoch); } clusterFreeSlotImportJob(curr_import); clusterBroadcastPong(CLUSTER_BROADCAST_ALL); @@ -4699,7 +4721,9 @@ int childSnapshotForSyncSlot(int req, rio *rdb, void *privdata) { } void clusterProceedWithSlotExport(void) { + mstime_t now; while (clusterGetCurrentSlotExport() != NULL) { + now = mstime(); listNode *curr_node = listFirst(server.cluster->slot_export_jobs); slotExport *curr_export = (slotExport *)curr_node->value; if (curr_export->client == NULL) { @@ -4729,24 +4753,53 @@ void clusterProceedWithSlotExport(void) { return; case SLOT_EXPORT_PAUSE_AND_REPLY: addReplyArrayLen(curr_export->client, 4); + curr_export->streamed_repl_offset += 4; /* '*4\r\n' */ addReplyBulkCBuffer(curr_export->client, "CLUSTER", 7); + curr_export->streamed_repl_offset += 13; /* '$7\r\nCLUSTER\r\n' */ addReplyBulkCBuffer(curr_export->client, "SYNCSLOTS", 9); + curr_export->streamed_repl_offset += 15; /* '$9\r\nSYNCSLOTS\r\n' */ addReplyBulkCBuffer(curr_export->client, "PAUSEOFFSET", 11); - addReplyBulkLongLong(curr_export->client, curr_export->syncslot_offset); + curr_export->streamed_repl_offset += 18; /* '$11\r\nPAUSEOFFSET\r\n' */ + + /* We add the length of the offset reply to the offest itself. 
*/ + uint32_t offset_len = digits10(curr_export->streamed_repl_offset + curr_export->client->repl_data->repldbsize); + uint32_t offset_len_len = digits10(offset_len); + curr_export->streamed_repl_offset += 1 + offset_len_len + 2 + offset_len + 2; + uint32_t offset_len2 = digits10(curr_export->streamed_repl_offset + curr_export->client->repl_data->repldbsize); + if (offset_len2 > offset_len) { + /* Adding the offset will add at most one more digit, since + * its length will be <= 10 (uint32_t max) */ + serverAssert(offset_len2 == offset_len + 1); + curr_export->streamed_repl_offset++; + uint32_t offset_len_len2 = digits10(digits10(curr_export->streamed_repl_offset)); + if (offset_len_len2 > offset_len_len) { + /* If offset_len was really close to another digit, we + * have to handle that too. */ + serverAssert(offset_len_len2 == offset_len_len + 1); + curr_export->streamed_repl_offset++; + } + } + serverLog(LL_NOTICE, "At time of pause, slot migration AOF size: %lu, " + "slot migration streaming offset: %llu, total " + "offset: %llu", + curr_export->client->repl_data->repldbsize, + curr_export->streamed_repl_offset, + curr_export->streamed_repl_offset + curr_export->client->repl_data->repldbsize); + addReplyBulkLongLong(curr_export->client, curr_export->streamed_repl_offset + curr_export->client->repl_data->repldbsize); /* Even though we just added replies, it's possible that, due to * existing pending data, the client is not in the pending write * queue. We enqueue it explicitly to work around this. */ putClientInPendingWriteQueue(curr_export->client); - curr_export->pause_end = mstime() + (CLUSTER_MF_TIMEOUT * CLUSTER_MF_PAUSE_MULT); + curr_export->pause_end = now + (CLUSTER_MF_TIMEOUT * CLUSTER_MF_PAUSE_MULT); pauseActions(PAUSE_DURING_SLOT_MIGRATION, curr_export->pause_end, PAUSE_ACTIONS_CLIENT_WRITE_SET); curr_export->state = SLOT_EXPORT_PAUSED; continue; case SLOT_EXPORT_PAUSED: /* While paused, we simply want to check if we should unpause. */ - if (curr_export->pause_end <= mstime()) { + if (curr_export->pause_end <= now) { /* Every CLUSTER_MF_TIMEOUT, the source node should * re-attempt the pause. If we reach this point, it hasn't * attempted the pause in that time, we can assume it is * dead and fail the migration.*/ serverLog(LL_WARNING, "During slot export, unpausing self and cancelling export due to timeout."); unpauseActions(PAUSE_DURING_SLOT_MIGRATION); curr_export->state = SLOT_EXPORT_FAILED; continue; } return; case SLOT_EXPORT_FINISH: case SLOT_EXPORT_FAILED: listDelNode(server.cluster->slot_export_jobs, curr_node); clusterFreeSlotExportJob(curr_export); continue; } } } @@ -7646,6 +7699,12 @@ int clusterCommandSpecial(client *c) { addReplyError(c, "CLUSTER SYNCSLOTS PAUSEOFFSET command requires exactly one argument."); return 1; } + if (c->flag.primary) { + /* Due to the proxying nature of replication from the source + * node through the target node to the target node's replicas, + * this message should simply be ignored. */ + return 1; + } slotImport *slot_import = clusterGetCurrentSlotImport(); if (!slot_import || slot_import->state != SLOT_IMPORT_WAITING_FOR_OFFSET) { addReplyError(c, "No CLUSTER SYNCSLOTS is waiting for a PAUSEOFFSET response."); @@ -7656,6 +7715,7 @@ int clusterCommandSpecial(client *c) { addReplyError(c, "Failed to parse PAUSEOFFSET offset."); return 1; } + serverLog(LL_NOTICE, "Received paused offset for slot migration from %.40s (%s). 
My offset: %lld, source offset: %lld", slot_import->source_node->name, slot_import->source_node->human_nodename, slot_import->client->repl_data->reploff, offset); slot_import->paused_at_offset = offset; slot_import->state = SLOT_IMPORT_SYNCING_TO_OFFSET; clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_SLOTMIGRATION); diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index 00e686997a..471dbdb950 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -403,8 +403,8 @@ typedef enum slotExportState { typedef struct slotExport { slotBitmap slot_bitmap; slotExportState state; - client *client; /* Client for replication */ - unsigned long long syncslot_offset; + client *client; /* Client for replication */ + unsigned long long streamed_repl_offset; /* Offset for just the streamed part of the syncslots command.*/ mstime_t pause_end; } slotExport; diff --git a/src/replication.c b/src/replication.c index bcb9e0a756..c13a0edb3f 100644 --- a/src/replication.c +++ b/src/replication.c @@ -1700,6 +1700,7 @@ void rdbPipeWriteHandler(struct connection *conn) { } else { replica->repl_data->repldboff += nwritten; server.stat_net_repl_output_bytes += nwritten; + replica->repl_data->repldbsize += nwritten; if (replica->repl_data->repldboff < server.rdb_pipe_bufflen) { replica->repl_data->repl_last_partial_write = server.unixtime; return; /* more data to write.. */ @@ -1774,6 +1775,7 @@ void rdbPipeReadHandler(struct aeEventLoop *eventLoop, int fd, void *clientData, * of 'rdb_pipe_buff' sent rather than the offset of entire RDB. */ replica->repl_data->repldboff = nwritten; server.stat_net_repl_output_bytes += nwritten; + replica->repl_data->repldbsize += nwritten; } /* If we were unable to write all the data to one of the replicas, * setup write handler (and disable pipe read handler, below) */ From 4506fbe87e3f2023d23fbec0be8a041f06457e9d Mon Sep 17 00:00:00 2001 From: Jacob Murphy Date: Wed, 22 Jan 2025 00:55:50 +0000 Subject: [PATCH 15/18] Typo + mac build fix Signed-off-by: Jacob Murphy --- src/cluster_legacy.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 1357fdaf1d..024394659f 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -4761,7 +4761,7 @@ void clusterProceedWithSlotExport(void) { addReplyBulkCBuffer(curr_export->client, "PAUSEOFFSET", 11); curr_export->streamed_repl_offset += 18; /* '$11\r\nPAUSEOFFSET\r\n' */ - /* We add the length of the offset reply to the offest itself. */ + /* We add the length of the offset reply to the offset itself. 
*/ uint32_t offset_len = digits10(curr_export->streamed_repl_offset + curr_export->client->repl_data->repldbsize); uint32_t offset_len_len = digits10(offset_len); curr_export->streamed_repl_offset += 1 + offset_len_len + 2 + offset_len + 2; @@ -4779,10 +4779,8 @@ void clusterProceedWithSlotExport(void) { curr_export->streamed_repl_offset++; } } - serverLog(LL_NOTICE, "At time of pause, slot migration AOF size: %lu, " - "slot migration streaming offset: %llu, total " - "offset: %llu", - curr_export->client->repl_data->repldbsize, + serverLog(LL_NOTICE, "At time of pause slot migration streaming offset: %llu, total " + "offset (with AOF snapshot): %llu", curr_export->streamed_repl_offset, curr_export->streamed_repl_offset + curr_export->client->repl_data->repldbsize); addReplyBulkLongLong(curr_export->client, curr_export->streamed_repl_offset + curr_export->client->repl_data->repldbsize); From 3eec13c4e08e26f9cd76834c912a59cf80051c9a Mon Sep 17 00:00:00 2001 From: Jacob Murphy Date: Wed, 22 Jan 2025 00:57:41 +0000 Subject: [PATCH 16/18] Log line fix Signed-off-by: Jacob Murphy --- src/cluster_legacy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 024394659f..04a1359649 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -4695,7 +4695,7 @@ void clusterProceedWithSlotImport(void) { clusterUpdateState(); clusterSaveConfigOrDie(1); if (clusterBumpConfigEpochWithoutConsensus() == C_OK) { - serverLog(LL_NOTICE, "Epoch bumped after importing slots. New epoch %ld", server.cluster->currentEpoch); + serverLog(LL_NOTICE, "Epoch bumped after importing slots. New epoch %llu", server.cluster->currentEpoch); } clusterFreeSlotImportJob(curr_import); clusterBroadcastPong(CLUSTER_BROADCAST_ALL); From 9021fffae3a526f562be1de0f481269340b788b9 Mon Sep 17 00:00:00 2001 From: Jacob Murphy Date: Wed, 22 Jan 2025 01:02:38 +0000 Subject: [PATCH 17/18] Another log line fix Signed-off-by: Jacob Murphy --- src/cluster_legacy.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 04a1359649..45581a2280 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -4695,7 +4695,7 @@ void clusterProceedWithSlotImport(void) { clusterUpdateState(); clusterSaveConfigOrDie(1); if (clusterBumpConfigEpochWithoutConsensus() == C_OK) { - serverLog(LL_NOTICE, "Epoch bumped after importing slots. New epoch %llu", server.cluster->currentEpoch); + serverLog(LL_NOTICE, "Epoch bumped after importing slots. 
New epoch %llu", (unsigned long long) server.cluster->currentEpoch); } clusterFreeSlotImportJob(curr_import); clusterBroadcastPong(CLUSTER_BROADCAST_ALL); @@ -4779,8 +4779,10 @@ void clusterProceedWithSlotExport(void) { curr_export->streamed_repl_offset++; } } - serverLog(LL_NOTICE, "At time of pause slot migration streaming offset: %llu, total " + serverLog(LL_NOTICE, "At time of pause slot migration AOF snapshot size: %llu, " + "slot migration streaming offset: %llu, total " "offset (with AOF snapshot): %llu", + (unsigned long long) curr_export->client->repl_data->repldbsize, curr_export->streamed_repl_offset, curr_export->streamed_repl_offset + curr_export->client->repl_data->repldbsize); addReplyBulkLongLong(curr_export->client, curr_export->streamed_repl_offset + curr_export->client->repl_data->repldbsize); From f1d824f57205ea2f1dce8a1061c871c35d64b5f7 Mon Sep 17 00:00:00 2001 From: Jacob Murphy Date: Wed, 22 Jan 2025 01:05:59 +0000 Subject: [PATCH 18/18] Apply clang format Signed-off-by: Jacob Murphy --- src/cluster_legacy.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 45581a2280..04a4297b02 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -4695,7 +4695,7 @@ void clusterProceedWithSlotImport(void) { clusterUpdateState(); clusterSaveConfigOrDie(1); if (clusterBumpConfigEpochWithoutConsensus() == C_OK) { - serverLog(LL_NOTICE, "Epoch bumped after importing slots. New epoch %llu", (unsigned long long) server.cluster->currentEpoch); + serverLog(LL_NOTICE, "Epoch bumped after importing slots. New epoch %llu", (unsigned long long)server.cluster->currentEpoch); } clusterFreeSlotImportJob(curr_import); clusterBroadcastPong(CLUSTER_BROADCAST_ALL); @@ -4782,7 +4782,7 @@ void clusterProceedWithSlotExport(void) { serverLog(LL_NOTICE, "At time of pause slot migration AOF snapshot size: %llu, " "slot migration streaming offset: %llu, total " "offset (with AOF snapshot): %llu", - (unsigned long long) curr_export->client->repl_data->repldbsize, + (unsigned long long)curr_export->client->repl_data->repldbsize, curr_export->streamed_repl_offset, curr_export->streamed_repl_offset + curr_export->client->repl_data->repldbsize); addReplyBulkLongLong(curr_export->client, curr_export->streamed_repl_offset + curr_export->client->repl_data->repldbsize);
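
The PAUSEOFFSET bookkeeping in clusterProceedWithSlotExport() is a small fixed-point problem: the offset advertised to the import target must include the RESP bytes of the very reply that advertises it, and those bytes depend on the digit count of the advertised value. The standalone C sketch below illustrates the same calculation under stated assumptions: digits10() is re-implemented locally, the input values are made up, and it iterates to the fixed point instead of using the patch's closed-form digit adjustment.

#include <stdint.h>
#include <stdio.h>

/* Decimal digit count of v (same contract as the server's digits10()). */
static uint32_t digits10(uint64_t v) {
    uint32_t n = 1;
    while (v >= 10) { v /= 10; n++; }
    return n;
}

/* Encoded size of a RESP bulk string carrying `len` payload bytes:
 * '$' <len> CRLF <payload> CRLF. */
static uint64_t bulk_encoded_len(uint64_t len) {
    return 1 + digits10(len) + 2 + len + 2;
}

int main(void) {
    /* Made-up example inputs: bytes streamed after the snapshot so far,
     * and the size of the AOF-format snapshot itself. */
    uint64_t streamed = 9995;
    uint64_t snapshot = 123456;

    /* Fixed part of the reply: "*4\r\n" plus the CLUSTER, SYNCSLOTS and
     * PAUSEOFFSET bulks (4 + 13 + 15 + 18 bytes). */
    streamed += 4 + 13 + 15 + 18;

    /* The final bulk carries the decimal text of the total offset, and the
     * total offset in turn includes that bulk, so iterate to a fixed point.
     * The bulk size is bounded (the digit count of a 64-bit value is at most
     * 20), so this converges after one or two rounds. */
    uint64_t reply_bulk = 0;
    for (;;) {
        uint64_t total = streamed + reply_bulk + snapshot;
        uint64_t need = bulk_encoded_len(digits10(total));
        if (need == reply_bulk) break;
        reply_bulk = need;
    }

    uint64_t advertised = streamed + reply_bulk + snapshot;
    printf("advertise PAUSEOFFSET %llu (offset bulk itself is %llu bytes)\n",
           (unsigned long long)advertised, (unsigned long long)reply_bulk);
    return 0;
}

The closed form used in the patch reaches the same advertised total by bounding how much one extra reply can grow the digit count; the iterative form above just makes it easier to see why the accounting is self-consistent and terminates.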