From 294dd94a40bbc499502ec987afa278ee060d4e78 Mon Sep 17 00:00:00 2001 From: Jacob Murphy Date: Wed, 15 Jan 2025 12:04:36 +0000 Subject: [PATCH 01/18] Initial commit of AOF based atomic slot migration Signed-off-by: Jacob Murphy --- src/aof.c | 26 +- src/blocked.c | 2 +- src/cluster.c | 70 ++ src/cluster.h | 8 + src/cluster_legacy.c | 259 ++++- src/cluster_legacy.h | 42 +- src/commands.def | 20 + src/commands/cluster-migrate.json | 18 + src/config.c | 23 +- src/db.c | 21 +- src/evict.c | 2 +- src/expire.c | 2 +- src/io_threads.c | 2 +- src/kvstore.c | 52 +- src/kvstore.h | 4 + src/lazyfree.c | 23 + src/module.c | 28 +- src/networking.c | 34 +- src/rdb.c | 28 +- src/rdb.h | 2 +- src/replication.c | 1485 +++++++++++++++++------------ src/script.c | 12 +- src/server.c | 72 +- src/server.h | 125 ++- 24 files changed, 1583 insertions(+), 777 deletions(-) create mode 100644 src/commands/cluster-migrate.json diff --git a/src/aof.c b/src/aof.c index 024cdb2771..5c2691c1ba 100644 --- a/src/aof.c +++ b/src/aof.c @@ -2190,14 +2190,27 @@ static int rewriteFunctions(rio *aof) { return 0; } -int rewriteAppendOnlyFileRio(rio *aof) { +int shouldFilterSlot(int slot, void * slot_ranges) { + if (slot_ranges == NULL) return 0; + list *ranges = (list *)slot_ranges; + listIter li; + listNode *ln; + listRewind(ranges, &li); + while ((ln = listNext(&li))) { + slotRange *range = (slotRange *) ln->value; + if (slot >= range->start && slot <= range->end) return 0; + } + return 1; +} + +int rewriteAppendOnlyFileRio(rio *aof, list *slot_ranges) { int j; long key_count = 0; long long updated_time = 0; kvstoreIterator *kvs_it = NULL; /* Record timestamp at the beginning of rewriting AOF. */ - if (server.aof_timestamp_enabled) { + if (server.aof_timestamp_enabled && slot_ranges == NULL) { sds ts = genAofTimestampAnnotationIfNeeded(1); if (rioWrite(aof, ts, sdslen(ts)) == 0) { sdsfree(ts); @@ -2217,7 +2230,11 @@ int rewriteAppendOnlyFileRio(rio *aof) { if (rioWrite(aof, selectcmd, sizeof(selectcmd) - 1) == 0) goto werr; if (rioWriteBulkLongLong(aof, j) == 0) goto werr; - kvs_it = kvstoreIteratorInit(db->keys); + if (slot_ranges == NULL) { + kvs_it = kvstoreIteratorInit(db->keys); + } else { + kvs_it = kvstoreFilteredIteratorInit(db->keys, &shouldFilterSlot, slot_ranges); + } /* Iterate this DB writing every entry */ void *next; while (kvstoreIteratorNext(kvs_it, &next)) { @@ -2280,6 +2297,7 @@ int rewriteAppendOnlyFileRio(rio *aof) { updated_time = now; } } + serverLog(LL_NOTICE, "AOF rewrite: %s, key_count: %ld", keystr, key_count); /* Delay before next key if required (for testing) */ if (server.rdb_key_save_delay) debugDelay(server.rdb_key_save_delay); @@ -2330,7 +2348,7 @@ int rewriteAppendOnlyFile(char *filename) { goto werr; } } else { - if (rewriteAppendOnlyFileRio(&aof) == C_ERR) goto werr; + if (rewriteAppendOnlyFileRio(&aof, NULL) == C_ERR) goto werr; } /* Make sure data will not remain on the OS's output buffers */ diff --git a/src/blocked.c b/src/blocked.c index d2d6a5d314..d1a6ff9c6b 100644 --- a/src/blocked.c +++ b/src/blocked.c @@ -101,7 +101,7 @@ void freeClientBlockingState(client *c) { * and will be processed when the client is unblocked. 
*/ void blockClient(client *c, int btype) { /* Primary client should never be blocked unless pause or module */ - serverAssert(!(c->flag.primary && btype != BLOCKED_MODULE && btype != BLOCKED_POSTPONE)); + serverAssert(!(c->flag.replication_source && btype != BLOCKED_MODULE && btype != BLOCKED_POSTPONE)); initClientBlockingState(c); diff --git a/src/cluster.c b/src/cluster.c index 309279e0be..8050cd869d 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -815,6 +815,76 @@ unsigned int countKeysInSlot(unsigned int slot) { return kvstoreHashtableSize(server.db->keys, slot); } +unsigned int dropKeysInSlotRanges(list *slot_ranges, int async) { + unsigned int result = 0; + listIter li; + listNode *ln; + listRewind(slot_ranges, &li); + while ((ln = listNext(&li))) { + slotRange *slot_range = (slotRange *) listNodeValue(ln); + for (int i = slot_range->start; i <= slot_range->end; i++) { + result += dropKeysInSlot(i, async); + } + } + return result; +} + +unsigned int dropKeysInSlot(unsigned int hashslot, int async) { + unsigned int result = kvstoreHashtableSize(server.db->keys, hashslot); + if (async) { + emptyHashtableAsync(server.db, hashslot); + } else { + kvstoreEmptyHashtable(server.db->keys, hashslot, NULL); + kvstoreEmptyHashtable(server.db->expires, hashslot, NULL); + } + return result; +} + + + +void slotRangesToBitmap(list *slot_ranges, unsigned char *bitmap_out) { + listIter li; + listNode *ln; + listRewind(slot_ranges, &li); + while ((ln = listNext(&li))) { + slotRange *range = (slotRange *) listNodeValue(ln); + for (int i = range->start; i <= range->end; i++) { + bitmapSetBit(bitmap_out, i); + } + } +} + +void bitmapToSlotRanges(unsigned char *bitmap, list **slot_ranges_out) { + *slot_ranges_out = listCreate(); + int range_start = -1; + for (int i = 0; i <= CLUSTER_SLOTS; i++) { + if (i != CLUSTER_SLOTS && bitmapTestBit(bitmap, i)) { + if (range_start == -1) { + range_start = i; + } + } else if (range_start != -1) { + slotRange *range = zmalloc(sizeof(slotRange)); + range->start = range_start; + range->end = i - 1; + range_start = -1; + serverLog(LL_NOTICE, "Got another range: %d-%d", range->start, range->end); + listAddNodeTail(*slot_ranges_out, range); + } + } +} + +void freeSlotRanges(list *slot_ranges) { + listIter li; + listNode *ln; + listRewind(slot_ranges, &li); + while ((ln = listNext(&li))) { + slotRange *range = (slotRange *)ln->value; + zfree(range); + listDelNode(slot_ranges, ln); + } + listRelease(slot_ranges); +} + void clusterCommandHelp(client *c) { const char *help[] = { "COUNTKEYSINSLOT ", diff --git a/src/cluster.h b/src/cluster.h index 142f2d70b3..fd994d1ce7 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -116,6 +116,14 @@ client *createCachedResponseClient(int resp); void deleteCachedResponseClient(client *recording_client); void clearCachedClusterSlotsResponse(void); unsigned int countKeysInSlot(unsigned int hashslot); +unsigned int dropKeysInSlotRanges(list *slot_ranges, int async); +unsigned int dropKeysInSlot(unsigned int hashslot, int async); +void slotRangesToBitmap(list *slot_ranges, unsigned char *bitmap_out); +void bitmapToSlotRanges(unsigned char *bitmap, list **slot_ranges_out); +void freeSlotRanges(list *slot_ranges); +int bitmapTestBit(unsigned char *bitmap, int pos); +void bitmapSetBit(unsigned char *bitmap, int pos); +void bitmapClearBit(unsigned char *bitmap, int pos); int getSlotOrReply(client *c, robj *o); /* functions with shared implementations */ diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 5c4bb65aae..15a5ee3b7d 100644 --- 
a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -72,9 +72,6 @@ int clusterNodeSetSlotBit(clusterNode *n, int slot);
static void clusterSetPrimary(clusterNode *n, int closeSlots, int full_sync_required);
void clusterHandleReplicaFailover(void);
void clusterHandleReplicaMigration(int max_replicas);
-int bitmapTestBit(unsigned char *bitmap, int pos);
-void bitmapSetBit(unsigned char *bitmap, int pos);
-void bitmapClearBit(unsigned char *bitmap, int pos);
void clusterDoBeforeSleep(int flags);
void clusterSendUpdate(clusterLink *link, clusterNode *node);
void resetManualFailover(void);
@@ -86,6 +83,8 @@ sds representSlotInfo(sds ci, uint16_t *slot_info_pairs, int slot_info_pairs_cou
void clusterFreeNodesSlotsInfo(clusterNode *n);
uint64_t clusterGetMaxEpoch(void);
int clusterBumpConfigEpochWithoutConsensus(void);
+slotMigration *clusterGetCurrentSlotMigration(void);
+void clusterSendMigrateSlotStart(clusterNode *node, list *slot_ranges);
void moduleCallClusterReceivers(const char *sender_id,
uint64_t module_id,
uint8_t type,
@@ -1134,6 +1133,7 @@ void clusterInit(void) {
    server.cluster->failover_auth_epoch = 0;
    server.cluster->cant_failover_reason = CLUSTER_CANT_FAILOVER_NONE;
    server.cluster->lastVoteEpoch = 0;
+   server.cluster->slot_migrations = listCreate();

    /* Initialize stats */
    for (int i = 0; i < CLUSTERMSG_TYPE_COUNT; i++) {
@@ -1456,7 +1456,7 @@ void clusterAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {

    /* If the server is starting up, don't accept cluster connections:
     * UPDATE messages may interact with the database content. */
-   if (server.primary_host == NULL && server.loading) return;
+   if (server.primary == NULL && server.loading) return;

    while (max--) {
        cfd = anetTcpAccept(server.neterr, fd, cip, sizeof(cip), &cport);
@@ -2570,6 +2570,12 @@ void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoc
                migrated_our_slots++;
            }

+           /* Was this slot mine, and was it in a paused state for slot
+            * migration? If so, clear the manual failover state. */
+           if (server.cluster->slots[j] == myself && server.cluster->mf_end && server.cluster->mf_replica == sender) {
+               resetManualFailover();
+           }
+
            /* If the sender who claims this slot is not in the same shard,
             * it must be a result of deliberate operator actions. Therefore,
             * we should honor it and clear the outstanding migrating_slots_to
@@ -3245,6 +3251,20 @@ int clusterProcessPacket(clusterLink *link) {
                      "primary manual failover: %lld",
                      server.cluster->mf_primary_offset);
        }
+       /* If we are importing a slot and the slot owner sent its offset
+        * while already paused, populate the migration state. */
+       slotMigration *curr_migration = clusterGetCurrentSlotMigration();
+       if (hdr->mflags[0] & CLUSTERMSG_FLAG0_PAUSED && curr_migration != NULL &&
+           curr_migration->state == SLOT_MIGRATION_WAITING_FOR_OFFSET &&
+           curr_migration->source_node == sender) {
+           curr_migration->pause_primary_offset = sender->repl_offset;
+           curr_migration->state = SLOT_MIGRATION_SYNCING_TO_OFFSET;
+           clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_SLOTMIGRATION);
+           serverLog(LL_NOTICE,
+                     "Received replication offset from paused owner for "
+                     "slot import: %lld",
+                     curr_migration->pause_primary_offset);
+       }
    }

    /* Initial processing of PING and MEET requests replying with a PONG. 
*/
@@ -3699,6 +3719,26 @@ int clusterProcessPacket(clusterLink *link) {
        uint8_t type = hdr->data.module.msg.type;
        unsigned char *payload = hdr->data.module.msg.bulk_data;
        moduleCallClusterReceivers(sender->name, module_id, type, payload, len);
+   } else if (type == CLUSTERMSG_TYPE_MIGRATE_SLOT_START) {
+       /* This message is acceptable only if I'm a primary and I own all of the requested slots */
+       if (!sender) return 1;
+       for (int i = 0; i < CLUSTER_SLOTS; i++) {
+           if (bitmapTestBit(hdr->data.slot_migration.msg.slot_bitmap, i) && server.cluster->slots[i] != myself) return 1;
+       }
+       /* Initialize the slot migration state accordingly */
+       resetManualFailover();
+       server.cluster->mf_end = now + CLUSTER_MF_TIMEOUT;
+       server.cluster->mf_replica = sender;
+       /* TODO(murphyjacob4) pause subset of slots */
+       pauseActions(PAUSE_DURING_FAILOVER, now + (CLUSTER_MF_TIMEOUT * CLUSTER_MF_PAUSE_MULT),
+                    PAUSE_ACTIONS_CLIENT_WRITE_SET);
+       serverLog(LL_NOTICE, "Slot migration requested by node %.40s (%s).", sender->name, sender->human_nodename);
+       /* We need to send a ping message to the importing node right away: since we
+        * have just paused client writes, the ping will carry the
+        * CLUSTERMSG_FLAG0_PAUSED flag along with our replication offset. The
+        * importing node uses that offset to know when its replication of the
+        * migrating slots has caught up and the migration can be finalized. */
+       clusterSendPing(link, CLUSTERMSG_TYPE_PING);
    } else {
        serverLog(LL_WARNING, "Received unknown packet type: %d", type);
    }
@@ -4395,6 +4435,128 @@ void clusterPropagatePublish(robj *channel, robj *message, int sharded) {
    clusterMsgSendBlockDecrRefCount(msgblock_light);
}

+/* -----------------------------------------------------------------------------
+ * Slot Migration functions
+ * -------------------------------------------------------------------------- */
+
+/* Gets the current slot migration from the head of the queue. */
+slotMigration *clusterGetCurrentSlotMigration(void) {
+   if (listLength(server.cluster->slot_migrations) == 0) return NULL;
+   return (slotMigration *) listFirst(server.cluster->slot_migrations)->value;
+}
+
+void clusterSendMigrateSlotStart(clusterNode *node, list *slot_ranges) {
+   if (!node->link) return;
+
+   uint32_t msglen = sizeof(clusterMsg) - sizeof(union clusterMsgData) + sizeof(clusterMsgSlotMigration);
+   clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_MIGRATE_SLOT_START, msglen);
+   clusterMsg *hdr = getMessageFromSendBlock(msgblock);
+   slotRangesToBitmap(slot_ranges, hdr->data.slot_migration.msg.slot_bitmap);
+   clusterSendMessage(node->link, msgblock);
+   clusterMsgSendBlockDecrRefCount(msgblock);
+}
+
+/* This is the main state machine for the slot migration workflow. Slot
+ * migration is driven by the new owner of the slot. This function will do as
+ * much work as possible synchronously, processing the enqueued slot migrations
+ * and only returning once we are waiting on some IO. 
*/ +void clusterProceedWithSlotMigration(void) { + server.cluster->todo_before_sleep &= ~CLUSTER_TODO_HANDLE_SLOTMIGRATION; + + while (clusterGetCurrentSlotMigration() != NULL) { + listNode *curr_node = listFirst(server.cluster->slot_migrations); + slotMigration *curr_migration = (slotMigration *) curr_node->value; + if (curr_migration->state != SLOT_MIGRATION_QUEUED && curr_migration->end_time < mstime()) { + serverLog(LL_WARNING, + "Timed out for slot migration from source node %.40s", curr_migration->source_node->name); + curr_migration->state = SLOT_MIGRATION_FAILED; + } + if (curr_migration->state > SLOT_MIGRATION_PAUSE_OWNER && curr_migration->state < SLOT_MIGRATION_FAILED && curr_migration->pause_end < mstime() && curr_migration->vote_retry_time < mstime()) { + /* If the owner ever unpauses, we have to move back in the state machine and retry. */ + serverLog(LL_WARNING, "Reinitiating pause on the node owning the slot range..."); + curr_migration->state = SLOT_MIGRATION_PAUSE_OWNER; + curr_migration->pause_end = mstime() + CLUSTER_MF_TIMEOUT; + } + switch(curr_migration->state) { + case SLOT_MIGRATION_QUEUED: + /* Start the migration */ + serverLog(LL_NOTICE, "Starting sync from migration source node %.40s", curr_migration->source_node->name); + curr_migration->end_time = mstime() + CLUSTER_SLOT_MIGRATION_TIMEOUT; + curr_migration->link = createReplicationLink(curr_migration->source_node->ip, getNodeDefaultReplicationPort(curr_migration->source_node), curr_migration->slot_ranges); + if (connectReplicationLink(curr_migration->link) == C_ERR) { + serverLog(LL_WARNING, + "Failed to begin sync from migration source node %.40s", curr_migration->source_node->name); + curr_migration->state = SLOT_MIGRATION_FAILED; + continue; + } + curr_migration->state = SLOT_MIGRATION_SYNCING; + continue; + case SLOT_MIGRATION_SYNCING: + /* replicationCron should manage retrying connection, but there could be scenarios where we hit an irrecoverable error. */ + if (curr_migration->link->state == REPL_STATE_NONE || curr_migration->link->state == REPL_STATE_CANCELLED) { + serverLog(LL_WARNING, "Sync failed from migration node %.40s", curr_migration->source_node->name); + curr_migration->state = SLOT_MIGRATION_FAILED; + continue; + } + if (curr_migration->link->state == REPL_STATE_CONNECTED) { + curr_migration->state = SLOT_MIGRATION_PAUSE_OWNER; + continue; + } + /* If we are in another state, nothing to do right now. */ + return; + case SLOT_MIGRATION_PAUSE_OWNER: + serverLog(LL_NOTICE, "Replication link to slot owner %.40s has been established. Pausing source node and waiting to continue", curr_migration->source_node->name); + clusterSendMigrateSlotStart(curr_migration->source_node, curr_migration->slot_ranges); + curr_migration->pause_primary_offset = -1; + curr_migration->pause_end = mstime() + CLUSTER_MF_TIMEOUT; + curr_migration->state = SLOT_MIGRATION_WAITING_FOR_OFFSET; + continue; + case SLOT_MIGRATION_WAITING_FOR_OFFSET: + /* Nothing to do, need to wait for cluster message to come in. 
*/ + return; + case SLOT_MIGRATION_SYNCING_TO_OFFSET: + if (curr_migration->link->client->repl_data->reploff >= curr_migration->pause_primary_offset) { + serverLog(LL_NOTICE, "Replication of slot range has caught up to paused slot owner, slot migration can start."); + curr_migration->state = SLOT_MIGRATION_FINISH; + continue; + } + /* Need to wait for the sync to progress further */ + return; + case SLOT_MIGRATION_FINISH: + serverLog(LL_NOTICE, "Setting myself to owner of migrating slots and broadcasting"); + listIter li; + listNode *ln; + listRewind(curr_migration->slot_ranges, &li); + while ((ln = listNext(&li))) { + slotRange *range = (slotRange *) ln->value; + for (int i = range->start; i <= range->end; i++) { + clusterDelSlot(i); + clusterAddSlot(myself, i); + } + } + clusterUpdateState(); + clusterSaveConfigOrDie(1); + if (clusterBumpConfigEpochWithoutConsensus() == C_OK) { + serverLog(LL_NOTICE, "ConfigEpoch updated after importing slots"); + } + clusterBroadcastPong(CLUSTER_BROADCAST_ALL); + listDelNode(server.cluster->slot_migrations, curr_node); + freeReplicationLink(curr_migration->link); + zfree(curr_migration); + continue; + case SLOT_MIGRATION_FAILED: + /* Delete the migration from the queue and proceed to the next migration */ + listDelNode(server.cluster->slot_migrations, curr_node); + freeReplicationLink(curr_migration->link); + dropKeysInSlotRanges(curr_migration->slot_ranges, server.repl_replica_lazy_flush); + freeSlotRanges(curr_migration->slot_ranges); + zfree(curr_migration); + continue; + } + } +} + + /* ----------------------------------------------------------------------------- * REPLICA node specific functions * -------------------------------------------------------------------------- */ @@ -4739,8 +4901,8 @@ void clusterHandleReplicaFailover(void) { /* Set data_age to the number of milliseconds we are disconnected from * the primary. */ - if (server.repl_state == REPL_STATE_CONNECTED) { - data_age = (mstime_t)(server.unixtime - server.primary->last_interaction) * 1000; + if (server.primary && server.primary->state == REPL_STATE_CONNECTED) { + data_age = (mstime_t)(server.unixtime - server.primary->client->last_interaction) * 1000; } else { data_age = (mstime_t)(server.unixtime - server.repl_down_since) * 1000; } @@ -5332,7 +5494,7 @@ void clusterCron(void) { /* If we are a replica node but the replication is still turned off, * enable it if we know the address of our primary and it appears to * be up. */ - if (nodeIsReplica(myself) && server.primary_host == NULL && myself->replicaof && nodeHasAddr(myself->replicaof)) { + if (nodeIsReplica(myself) && server.primary == NULL && myself->replicaof && nodeHasAddr(myself->replicaof)) { replicationSetPrimary(myself->replicaof->ip, getNodeDefaultReplicationPort(myself->replicaof), 0); } @@ -5353,6 +5515,8 @@ void clusterCron(void) { } if (update_state || server.cluster->state == CLUSTER_FAIL) clusterUpdateState(); + + clusterProceedWithSlotMigration(); } /* This function is called before the event handler returns to sleep for @@ -5378,6 +5542,9 @@ void clusterBeforeSleep(void) { /* Handle failover, this is needed when it is likely that there is already * the quorum from primaries in order to react fast. */ clusterHandleReplicaFailover(); + } else if (flags & CLUSTER_TODO_HANDLE_SLOTMIGRATION) { + /* Continue with slot migration (e.g. if import offset is updated) */ + clusterProceedWithSlotMigration(); } /* Update the cluster state. 
*/ @@ -6528,13 +6695,13 @@ int clusterParseSetSlotCommand(client *c, int *slot_out, clusterNode **node_out, int optarg_pos = 0; /* Allow primaries to replicate "CLUSTER SETSLOT" */ - if (!c->flag.primary && nodeIsReplica(myself)) { + if (!c->flag.replication_source && nodeIsReplica(myself)) { addReplyError(c, "Please use SETSLOT only with masters."); return 0; } /* If 'myself' is a replica, 'c' must be the primary client. */ - serverAssert(!nodeIsReplica(myself) || c == server.primary); + serverAssert(!nodeIsReplica(myself) || (server.primary && c == server.primary->client)); if ((slot = getSlotOrReply(c, c->argv[2])) == -1) return 0; @@ -7108,6 +7275,78 @@ int clusterCommandSpecial(client *c) { } else if (!strcasecmp(c->argv[1]->ptr, "links") && c->argc == 2) { /* CLUSTER LINKS */ addReplyClusterLinksDescription(c); + } else if (!strcasecmp(c->argv[1]->ptr, "migrate")) { + /* CLUSTER MIGRATE SLOTSRANGE [ ] */ + if (nodeIsReplica(myself)) { + addReplyError(c, "Only primaries can migrate slots"); + return 1; + } + if (c->argc < 5 || strcasecmp(c->argv[2]->ptr, "slotsrange")) { + addReplyError(c, "Migrate command requires at least one "); + return 1; + } + unsigned char requested_slots[CLUSTER_SLOTS/8]; + memset(requested_slots, 0, sizeof(requested_slots)); + int i; + clusterNode * curr_owner = NULL; + for (i = 3; i + 1 < c->argc; i+=2) { + if (i > 3 && getLongLongFromObject(c->argv[i], NULL) != C_OK) { + /* If we find a non-integer in the args and we have already + * parsed >=1 slot range, we assume it is the next token. */ + break; + } + int start = getSlotOrReply(c, c->argv[i]); + if (start < 0) { + return 1; + } + int end = getSlotOrReply(c, c->argv[i + 1]); + if (end < 0) { + return 1; + } + if (end < start) { + addReplyErrorFormat(c, "Invalid SLOTSRANGE, start slot %d is greater than end slot %d", start, end); + return 1; + } + for (int j = start; j <= end; j++) { + if (bitmapTestBit(requested_slots, j)) { + addReplyError(c, "Invalid SLOTSRANGE, slot ranges overlap"); + return 1; + } + if (curr_owner == NULL) { + curr_owner = server.cluster->slots[j]; + } else { + if (curr_owner != server.cluster->slots[j]) { + addReplyError(c, "Invalid SLOTSRANGE, slot ranges are not all owned by the same shard"); + return 1; + } + } + if (curr_owner == myself) { + addReplyErrorFormat(c, "I'm already the owner of hash slot %u", j); + return 1; + } + if (nodeFailed(curr_owner)) { + addReplyErrorFormat(c, "Primary is currently failing for slot %u. Please try again once there is a healthy primary", j); + return 1; + } + bitmapSetBit(requested_slots, j); + } + } + + slotMigration *to_enqueue = (slotMigration *) zmalloc(sizeof(slotMigration)); + bitmapToSlotRanges(requested_slots, &to_enqueue->slot_ranges); + to_enqueue->source_node = curr_owner; + to_enqueue->state = SLOT_MIGRATION_QUEUED; + to_enqueue->end_time = 0; /* Will be set once started. */ + to_enqueue->link = NULL; + to_enqueue->pause_end = 0; + to_enqueue->pause_primary_offset = -1; + to_enqueue->vote_end_time = 0; + to_enqueue->vote_retry_time = 0; + to_enqueue->vote_epoch = 0; + to_enqueue->auth_count = 0; + listAddNodeTail(server.cluster->slot_migrations, to_enqueue); + clusterProceedWithSlotMigration(); + addReply(c, shared.ok); } else { return 0; } @@ -7150,6 +7389,8 @@ const char **clusterCommandExtendedHelp(void) { "LINKS", " Return information about all network links between this node and its peers.", " Output format is an array where each array element is a map containing attributes of a link", + "MIGRATE SLOTSRANGE [ ...] 
SHARD ", + " Initiate server driven slot migration of all slot ranges to the designated shard.", NULL}; return help; diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index 226842c5dc..dc157af78b 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -10,6 +10,7 @@ #define CLUSTER_MF_TIMEOUT 5000 /* Milliseconds to do a manual failover. */ #define CLUSTER_MF_PAUSE_MULT 2 /* Primary pause manual failover mult. */ #define CLUSTER_REPLICA_MIGRATION_DELAY 5000 /* Delay for replica migration. */ +#define CLUSTER_SLOT_MIGRATION_TIMEOUT 30000 /* Milliseconds to do a slot migration. */ /* Reasons why a replica is not able to failover. */ #define CLUSTER_CANT_FAILOVER_NONE 0 @@ -26,6 +27,7 @@ #define CLUSTER_TODO_FSYNC_CONFIG (1 << 3) #define CLUSTER_TODO_HANDLE_MANUALFAILOVER (1 << 4) #define CLUSTER_TODO_BROADCAST_ALL (1 << 5) +#define CLUSTER_TODO_HANDLE_SLOTMIGRATION (1 << 6) /* clusterLink encapsulates everything needed to talk with a remote node. */ typedef struct clusterLink { @@ -95,7 +97,9 @@ typedef struct clusterNodeFailReport { #define CLUSTERMSG_TYPE_MFSTART 8 /* Pause clients for manual failover */ #define CLUSTERMSG_TYPE_MODULE 9 /* Module cluster API message. */ #define CLUSTERMSG_TYPE_PUBLISHSHARD 10 /* Pub/Sub Publish shard propagation */ -#define CLUSTERMSG_TYPE_COUNT 11 /* Total number of message types. */ +#define CLUSTERMSG_TYPE_MIGRATE_SLOT_START 11 /* Pause clients for slot migration */ +#define CLUSTERMSG_TYPE_COUNT 12 /* Total number of message types. */ + #define CLUSTERMSG_LIGHT 0x8000 /* Modifier bit for message types that support light header */ @@ -142,6 +146,10 @@ typedef struct { unsigned char bulk_data[3]; /* 3 bytes just as placeholder. */ } clusterMsgModule; +typedef struct { + unsigned char slot_bitmap[CLUSTER_SLOTS / 8]; /* Slots bitmap. */ +} clusterMsgSlotMigration; + /* The cluster supports optional extension messages that can be sent * along with ping/pong/meet messages to give additional info in a * consistent manner. */ @@ -228,6 +236,12 @@ union clusterMsgData { struct { clusterMsgModule msg; } module; + + /* SLOT_MIGRATION */ + struct { + clusterMsgSlotMigration msg; + } slot_migration; + }; #define CLUSTER_PROTO_VER 1 /* Cluster bus protocol version. */ @@ -362,6 +376,31 @@ struct _clusterNode { Update with updateAndCountChangedNodeHealth(). */ }; +typedef enum slotMigrationState { + SLOT_MIGRATION_QUEUED, /* Queued behind some other slot migration. */ + SLOT_MIGRATION_SYNCING, /* Syncing contents from current owner. */ + SLOT_MIGRATION_PAUSE_OWNER, + SLOT_MIGRATION_WAITING_FOR_OFFSET, + SLOT_MIGRATION_SYNCING_TO_OFFSET, + SLOT_MIGRATION_FINISH, + SLOT_MIGRATION_FAILED, +} slotMigrationState; + +typedef struct slotMigration { + list *slot_ranges; + slotMigrationState state; + clusterNode *source_node; + mstime_t end_time; /* Slot migration time limit (ms unixtime). + If not yet in progress (e.g. queued), will be zero. */ + replicationLink *link; + mstime_t pause_end; + long long pause_primary_offset; + mstime_t vote_end_time; + mstime_t vote_retry_time; + uint64_t vote_epoch; + int auth_count; +} slotMigration; + /* Struct used for storing slot statistics. */ typedef struct slotStat { uint64_t cpu_usec; @@ -420,6 +459,7 @@ struct clusterState { unsigned char owner_not_claiming_slot[CLUSTER_SLOTS / 8]; /* Struct used for storing slot statistics, for all slots owned by the current shard. */ slotStat slot_stats[CLUSTER_SLOTS]; + list *slot_migrations; /* Queue of ongoing slot migrations. 
*/ }; #endif // CLUSTER_LEGACY_H diff --git a/src/commands.def b/src/commands.def index c5d766e3f8..0e54094821 100644 --- a/src/commands.def +++ b/src/commands.def @@ -685,6 +685,25 @@ struct COMMAND_ARG CLUSTER_MEET_Args[] = { {MAKE_ARG("cluster-bus-port",ARG_TYPE_INTEGER,-1,NULL,NULL,"4.0.0",CMD_ARG_OPTIONAL,0,NULL)}, }; +/********** CLUSTER MIGRATE ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* CLUSTER MIGRATE history */ +#define CLUSTER_MIGRATE_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* CLUSTER MIGRATE tips */ +const char *CLUSTER_MIGRATE_Tips[] = { +"nondeterministic_output", +}; +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* CLUSTER MIGRATE key specs */ +#define CLUSTER_MIGRATE_Keyspecs NULL +#endif + /********** CLUSTER MYID ********************/ #ifndef SKIP_CMD_HISTORY_TABLE @@ -1020,6 +1039,7 @@ struct COMMAND_STRUCT CLUSTER_Subcommands[] = { {MAKE_CMD("keyslot","Returns the hash slot for a key.","O(N) where N is the number of bytes in the key","3.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_KEYSLOT_History,0,CLUSTER_KEYSLOT_Tips,0,clusterCommand,3,CMD_STALE,0,CLUSTER_KEYSLOT_Keyspecs,0,NULL,1),.args=CLUSTER_KEYSLOT_Args}, {MAKE_CMD("links","Returns a list of all TCP links to and from peer nodes.","O(N) where N is the total number of Cluster nodes","7.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_LINKS_History,0,CLUSTER_LINKS_Tips,1,clusterCommand,2,CMD_STALE,0,CLUSTER_LINKS_Keyspecs,0,NULL,0)}, {MAKE_CMD("meet","Forces a node to handshake with another node.","O(1)","3.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_MEET_History,1,CLUSTER_MEET_Tips,0,clusterCommand,-4,CMD_NO_ASYNC_LOADING|CMD_ADMIN|CMD_STALE,0,CLUSTER_MEET_Keyspecs,0,NULL,3),.args=CLUSTER_MEET_Args}, +{MAKE_CMD("migrate","Initiates server driven hash slot migration, importing the given slot to this shard.","O(N) where N is the total number of hash slot arguments","8.1.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_MIGRATE_History,0,CLUSTER_MIGRATE_Tips,1,clusterCommand,-2,CMD_ADMIN|CMD_STALE,0,CLUSTER_MIGRATE_Keyspecs,0,NULL,0)}, {MAKE_CMD("myid","Returns the ID of a node.","O(1)","3.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_MYID_History,0,CLUSTER_MYID_Tips,0,clusterCommand,2,CMD_LOADING|CMD_STALE,0,CLUSTER_MYID_Keyspecs,0,NULL,0)}, {MAKE_CMD("myshardid","Returns the shard ID of a node.","O(1)","7.2.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_MYSHARDID_History,0,CLUSTER_MYSHARDID_Tips,1,clusterCommand,2,CMD_LOADING|CMD_STALE,0,CLUSTER_MYSHARDID_Keyspecs,0,NULL,0)}, {MAKE_CMD("nodes","Returns the cluster configuration for a node.","O(N) where N is the total number of Cluster nodes","3.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_NODES_History,0,CLUSTER_NODES_Tips,1,clusterCommand,2,CMD_LOADING|CMD_STALE,0,CLUSTER_NODES_Keyspecs,0,NULL,0)}, diff --git a/src/commands/cluster-migrate.json b/src/commands/cluster-migrate.json new file mode 100644 index 0000000000..719e827fa4 --- /dev/null +++ b/src/commands/cluster-migrate.json @@ -0,0 +1,18 @@ +{ + "MIGRATE": { + "summary": "Initiates server driven hash slot migration, importing the given slot to this shard.", + "complexity": "O(N) where N is the total number of hash slot arguments", + "group": "cluster", + "since": "8.1.0", + "arity": -2, + "container": "CLUSTER", + "function": "clusterCommand", + "command_flags": [ + "ADMIN", + "STALE" + ], + "command_tips": [ + "NONDETERMINISTIC_OUTPUT" 
+ ] + } +} diff --git a/src/config.c b/src/config.c index 5b90ebbd60..512b35f210 100644 --- a/src/config.c +++ b/src/config.c @@ -596,7 +596,7 @@ void loadServerConfigFromString(char *config) { } /* Sanity checks. */ - if (server.cluster_enabled && server.primary_host) { + if (server.cluster_enabled && server.primary) { err = "replicaof directive not allowed in cluster mode"; goto loaderr; } @@ -1451,11 +1451,11 @@ void rewriteConfigReplicaOfOption(standardConfig *config, const char *name, stru /* If this is a primary, we want all the replicaof config options * in the file to be removed. Note that if this is a cluster instance * we don't want a replicaof directive inside valkey.conf. */ - if (server.cluster_enabled || server.primary_host == NULL) { + if (server.cluster_enabled || server.primary == NULL) { rewriteConfigMarkAsProcessed(state, name); return; } - line = sdscatprintf(sdsempty(), "%s %s %d", name, server.primary_host, server.primary_port); + line = sdscatprintf(sdsempty(), "%s %s %d", name, server.primary->host, server.primary->port); rewriteConfigRewriteLine(state, name, line, 1); } @@ -3000,19 +3000,20 @@ static int setConfigReplicaOfOption(standardConfig *config, sds *argv, int argc, return 0; } - sdsfree(server.primary_host); - server.primary_host = NULL; + freeReplicationLink(server.primary); + server.primary = NULL; + if (!strcasecmp(argv[0], "no") && !strcasecmp(argv[1], "one")) { return 1; } char *ptr; - server.primary_port = strtol(argv[1], &ptr, 10); - if (server.primary_port < 0 || server.primary_port > 65535 || *ptr != '\0') { + int port = strtol(argv[1], &ptr, 10); + if (port < 0 || port > 65535 || *ptr != '\0') { *err = "Invalid primary port"; return 0; } - server.primary_host = sdsnew(argv[0]); - server.repl_state = REPL_STATE_CONNECT; + server.primary = createReplicationLink(argv[0], port, NULL); + server.primary->state = REPL_STATE_CONNECT; return 1; } @@ -3024,8 +3025,8 @@ static sds getConfigBindOption(standardConfig *config) { static sds getConfigReplicaOfOption(standardConfig *config) { UNUSED(config); char buf[256]; - if (server.primary_host) - snprintf(buf, sizeof(buf), "%s %d", server.primary_host, server.primary_port); + if (server.primary) + snprintf(buf, sizeof(buf), "%s %d", server.primary->host, server.primary->port); else buf[0] = '\0'; return sdsnew(buf); diff --git a/src/db.c b/src/db.c index 94074bf668..05b395728a 100644 --- a/src/db.c +++ b/src/db.c @@ -110,7 +110,7 @@ robj *lookupKey(serverDb *db, robj *key, int flags) { * It's possible that the WRITE flag is set even during a readonly * command, since the command may trigger events that cause modules to * perform additional writes. */ - int is_ro_replica = server.primary_host && server.repl_replica_ro; + int is_ro_replica = server.primary && server.repl_replica_ro; int expire_flags = 0; if (flags & LOOKUP_WRITE && !is_ro_replica) expire_flags |= EXPIRE_FORCE_DELETE_EXPIRED; if (flags & LOOKUP_NOEXPIRE) expire_flags |= EXPIRE_AVOID_DELETE_EXPIRED; @@ -258,7 +258,7 @@ int getKeySlot(sds key) { * so we must always recompute the slot for commands coming from the primary. 
*/ if (server.current_client && server.current_client->slot >= 0 && server.current_client->flag.executing_command && - !server.current_client->flag.primary) { + !server.current_client->flag.replication_source) { debugServerAssertWithInfo(server.current_client, NULL, (int)keyHashSlot(key, (int)sdslen(key)) == server.current_client->slot); return server.current_client->slot; @@ -267,7 +267,7 @@ int getKeySlot(sds key) { /* For the case of replicated commands from primary, getNodeByQuery() never gets called, * and thus c->slot never gets populated. That said, if this command ends up accessing a key, * we are able to backfill c->slot here, where the key's hash calculation is made. */ - if (server.current_client && server.current_client->flag.primary) { + if (server.current_client && server.current_client->flag.replication_source) { server.current_client->slot = slot; } return slot; @@ -432,6 +432,7 @@ void setKey(client *c, serverDb *db, robj *key, robj **valref, int flags) { * If there are no keys, NULL is returned. * * The function makes sure to return keys not already expired. */ +// TODO murphyjacob4 need to exclude the loading slots from this robj *dbRandomKey(serverDb *db) { int maxtries = 100; int allvolatile = kvstoreSize(db->keys) == kvstoreSize(db->expires); @@ -445,7 +446,7 @@ robj *dbRandomKey(serverDb *db) { sds key = objectGetKey(valkey); robj *keyobj = createStringObject(key, sdslen(key)); if (objectIsExpired(valkey)) { - if (allvolatile && (server.primary_host || server.import_mode) && --maxtries == 0) { + if (allvolatile && (server.primary || server.import_mode) && --maxtries == 0) { /* If the DB is composed only of keys with an expire set, * it could happen that all the keys are already logically * expired in the replica, so the function cannot stop because @@ -1800,8 +1801,8 @@ robj *setExpire(client *c, serverDb *db, robj *key, long long when) { serverAssert(added); } - int writable_replica = server.primary_host && server.repl_replica_ro == 0; - if (c && writable_replica && !c->flag.primary) rememberReplicaKeyWithExpire(db, key); + int writable_replica = server.primary && server.repl_replica_ro == 0; + if (c && writable_replica && !c->flag.replication_source) rememberReplicaKeyWithExpire(db, key); return val; } @@ -1906,7 +1907,7 @@ static int objectIsExpired(robj *val) { /* Don't expire anything while loading. It will be done later. */ if (server.loading) return 0; if (!timestampIsExpired(objectGetExpire(val))) return 0; - if (server.primary_host == NULL && server.import_mode) { + if (server.primary == NULL && server.import_mode) { if (server.current_client && server.current_client->flag.import_source) return 0; } return 1; @@ -1924,7 +1925,7 @@ static int keyIsExpiredWithDictIndex(serverDb *db, robj *key, int dict_index) { if (!keyIsExpiredWithDictIndexImpl(db, key, dict_index)) return 0; /* See expireIfNeededWithDictIndex for more details. */ - if (server.primary_host == NULL && server.import_mode) { + if (server.primary == NULL && server.import_mode) { if (server.current_client && server.current_client->flag.import_source) return 0; } return 1; @@ -1958,8 +1959,8 @@ static keyStatus expireIfNeededWithDictIndex(serverDb *db, robj *key, robj *val, * * When replicating commands from the primary, keys are never considered * expired. 
*/ - if (server.primary_host != NULL) { - if (server.current_client && (server.current_client->flag.primary)) return KEY_VALID; + if (server.primary != NULL) { + if (server.current_client && (server.current_client->flag.replication_source)) return KEY_VALID; if (!(flags & EXPIRE_FORCE_DELETE_EXPIRED)) return KEY_EXPIRED; } else if (server.import_mode) { /* If we are running in the import mode on a primary, instead of diff --git a/src/evict.c b/src/evict.c index d4bfade4fc..f91f2b76f7 100644 --- a/src/evict.c +++ b/src/evict.c @@ -466,7 +466,7 @@ static int isSafeToPerformEvictions(void) { /* By default replicas should ignore maxmemory * and just be primaries exact copies. */ - if (server.primary_host && server.repl_replica_ignore_maxmemory) return 0; + if (server.primary && server.repl_replica_ignore_maxmemory) return 0; /* If 'evict' action is paused, for whatever reason, then return false */ if (isPausedActionsWithUpdate(PAUSE_ACTION_EVICT)) return 0; diff --git a/src/expire.c b/src/expire.c index e4c3b0ec96..29dcd82c83 100644 --- a/src/expire.c +++ b/src/expire.c @@ -524,7 +524,7 @@ int checkAlreadyExpired(long long when) { * * If the server is a primary and in the import mode, we also add the already * expired key and wait for an explicit DEL from the import source. */ - return (when <= commandTimeSnapshot() && !server.loading && !server.primary_host && !server.import_mode); + return (when <= commandTimeSnapshot() && !server.loading && !server.primary && !server.import_mode); } #define EXPIRE_NX (1 << 0) diff --git a/src/io_threads.c b/src/io_threads.c index 66ef4948b6..260d7007be 100644 --- a/src/io_threads.c +++ b/src/io_threads.c @@ -345,7 +345,7 @@ int trySendReadToIOThreads(client *c) { c->cur_tid = tid; c->read_flags = canParseCommand(c) ? 0 : READ_FLAGS_DONT_PARSE; c->read_flags |= authRequired(c) ? READ_FLAGS_AUTH_REQUIRED : 0; - c->read_flags |= c->flag.primary ? READ_FLAGS_PRIMARY : 0; + c->read_flags |= c->flag.replication_source ? 
READ_FLAGS_PRIMARY : 0; c->io_read_state = CLIENT_PENDING_IO; connSetPostponeUpdateState(c->conn, 1); diff --git a/src/kvstore.c b/src/kvstore.c index d6db4d3fe1..ef4b90af73 100644 --- a/src/kvstore.c +++ b/src/kvstore.c @@ -74,6 +74,8 @@ struct _kvstoreIterator { kvstore *kvs; long long didx; long long next_didx; + kvstoreIteratorFilter *filter; + void *filter_privdata; hashtableIterator di; }; @@ -300,12 +302,7 @@ kvstore *kvstoreCreate(hashtableType *type, int num_hashtables_bits, int flags) void kvstoreEmpty(kvstore *kvs, void(callback)(hashtable *)) { for (int didx = 0; didx < kvs->num_hashtables; didx++) { - hashtable *ht = kvstoreGetHashtable(kvs, didx); - if (!ht) continue; - kvstoreHashtableMetadata *metadata = (kvstoreHashtableMetadata *)hashtableMetadata(ht); - if (metadata->rehashing_node) metadata->rehashing_node = NULL; - hashtableEmpty(ht, callback); - freeHashtableIfNeeded(kvs, didx); + kvstoreEmptyHashtable(kvs, didx, callback); } listEmpty(kvs->rehashing); @@ -318,6 +315,28 @@ void kvstoreEmpty(kvstore *kvs, void(callback)(hashtable *)) { kvs->overhead_hashtable_rehashing = 0; } +void kvstoreEmptyHashtable(kvstore *kvs, int didx, void(callback)(hashtable *)) { + hashtable *ht = kvstoreGetHashtable(kvs, didx); + if (!ht) return; + kvstoreHashtableMetadata *metadata = (kvstoreHashtableMetadata *)hashtableMetadata(ht); + if (metadata->rehashing_node) metadata->rehashing_node = NULL; + hashtableEmpty(ht, callback); + freeHashtableIfNeeded(kvs, didx); +} + +hashtable *kvstoreUnlinkHashtable(kvstore *kvs, int didx) { + hashtable *oldht = kvstoreGetHashtable(kvs, didx); + if (!oldht) return NULL; + + /* Pause rehashing on the to be unlinked node. */ + kvstoreHashtableMetadata *oldmetadata = (kvstoreHashtableMetadata *)hashtableMetadata(oldht); + if (oldmetadata->rehashing_node) oldmetadata->rehashing_node = NULL; + + kvs->hashtables[didx] = NULL; + kvs->allocated_hashtables--; + return oldht; +} + void kvstoreRelease(kvstore *kvs) { for (int didx = 0; didx < kvs->num_hashtables; didx++) { hashtable *ht = kvstoreGetHashtable(kvs, didx); @@ -581,6 +600,20 @@ kvstoreIterator *kvstoreIteratorInit(kvstore *kvs) { kvs_it->kvs = kvs; kvs_it->didx = -1; kvs_it->next_didx = kvstoreGetFirstNonEmptyHashtableIndex(kvs_it->kvs); /* Finds first non-empty hashtable index. 
*/
+   kvs_it->filter = NULL;
+   kvs_it->filter_privdata = NULL;
    hashtableInitSafeIterator(&kvs_it->di, NULL);
    return kvs_it;
}

+/* Returns kvstore iterator that filters out hash tables based on the predicate. */
+kvstoreIterator *kvstoreFilteredIteratorInit(kvstore *kvs, kvstoreIteratorFilter *filter, void *privdata) {
+   kvstoreIterator *kvs_it = zmalloc(sizeof(*kvs_it));
+   kvs_it->kvs = kvs;
+   kvs_it->didx = -1;
+   kvs_it->next_didx = kvstoreGetFirstNonEmptyHashtableIndex(kvs_it->kvs);
+   kvs_it->filter = filter;
+   kvs_it->filter_privdata = privdata;
    hashtableInitSafeIterator(&kvs_it->di, NULL);
    return kvs_it;
}
@@ -607,8 +640,11 @@ static hashtable *kvstoreIteratorNextHashtable(kvstoreIterator *kvs_it) {
        freeHashtableIfNeeded(kvs_it->kvs, kvs_it->didx);
    }

-   kvs_it->didx = kvs_it->next_didx;
-   kvs_it->next_didx = kvstoreGetNextNonEmptyHashtableIndex(kvs_it->kvs, kvs_it->didx);
+   do {
+       kvs_it->didx = kvs_it->next_didx;
+       if (kvs_it->didx == -1) return NULL;
+       kvs_it->next_didx = kvstoreGetNextNonEmptyHashtableIndex(kvs_it->kvs, kvs_it->didx);
+   } while (kvs_it->filter && kvs_it->filter(kvs_it->didx, kvs_it->filter_privdata));
    return kvs_it->kvs->hashtables[kvs_it->didx];
}
diff --git a/src/kvstore.h b/src/kvstore.h
index 1a8c74a6b9..668b0ae23e 100644
--- a/src/kvstore.h
+++ b/src/kvstore.h
@@ -10,11 +10,14 @@ typedef struct _kvstoreHashtableIterator kvstoreHashtableIterator;
typedef int(kvstoreScanShouldSkipHashtable)(hashtable *d);
typedef int(kvstoreExpandShouldSkipHashtableIndex)(int didx);
+typedef int(kvstoreIteratorFilter)(int didx, void *privdata);

#define KVSTORE_ALLOCATE_HASHTABLES_ON_DEMAND (1 << 0)
#define KVSTORE_FREE_EMPTY_HASHTABLES (1 << 1)

kvstore *kvstoreCreate(hashtableType *type, int num_hashtables_bits, int flags);
void kvstoreEmpty(kvstore *kvs, void(callback)(hashtable *));
+void kvstoreEmptyHashtable(kvstore *kvs, int didx, void(callback)(hashtable *));
+hashtable *kvstoreUnlinkHashtable(kvstore *kvs, int didx);
void kvstoreRelease(kvstore *kvs);
unsigned long long kvstoreSize(kvstore *kvs);
unsigned long kvstoreBuckets(kvstore *kvs);
@@ -44,6 +47,7 @@ size_t kvstoreHashtableMetadataSize(void);

/* kvstore iterator specific functions */
kvstoreIterator *kvstoreIteratorInit(kvstore *kvs);
+kvstoreIterator *kvstoreFilteredIteratorInit(kvstore *kvs, kvstoreIteratorFilter *filter, void *privdata);
void kvstoreIteratorRelease(kvstoreIterator *kvs_it);
int kvstoreIteratorGetCurrentHashtableIndex(kvstoreIterator *kvs_it);
int kvstoreIteratorNext(kvstoreIterator *kvs_it, void **next);
diff --git a/src/lazyfree.c b/src/lazyfree.c
index 3b061ccd84..8cd04eed37 100644
--- a/src/lazyfree.c
+++ b/src/lazyfree.c
@@ -32,6 +32,18 @@ void lazyfreeFreeDatabase(void *args[]) {
    atomic_fetch_add_explicit(&lazyfreed_objects, numkeys, memory_order_relaxed);
}

+/* Release a hashtable from the lazyfree thread. */
+void lazyfreeFreeHashtable(void *args[]) {
+   hashtable *ht1 = args[0];
+   hashtable *ht2 = args[1];
+
+   size_t numkeys = hashtableSize(ht1);
+   hashtableRelease(ht1);
+   if (ht2) hashtableRelease(ht2);
+   atomic_fetch_sub_explicit(&lazyfree_objects, numkeys, memory_order_relaxed);
+   atomic_fetch_add_explicit(&lazyfreed_objects, numkeys, memory_order_relaxed);
+}
+
/* Release the key tracking table. */
void lazyFreeTrackingTable(void *args[]) {
    rax *rt = args[0];
@@ -199,6 +211,17 @@ void emptyDbAsync(serverDb *db) {
    bioCreateLazyFreeJob(lazyfreeFreeDatabase, 2, oldkeys, oldexpires);
}

+/* Empty a hashtable asynchronously. 
*/ +void emptyHashtableAsync(serverDb *db, int didx) { + hashtable *oldkeys = kvstoreUnlinkHashtable(db->keys, didx); + hashtable *oldexpires = kvstoreUnlinkHashtable(db->expires, didx); + if (!oldkeys) { + return; + } + atomic_fetch_add_explicit(&lazyfree_objects, hashtableSize(oldkeys), memory_order_relaxed); + bioCreateLazyFreeJob(lazyfreeFreeHashtable, 2, oldkeys, oldexpires); +} + /* Free the key tracking table. * If the table is huge enough, free it in async way. */ void freeTrackingRadixTreeAsync(rax *tracking) { diff --git a/src/module.c b/src/module.c index fa60335837..40a5c8de20 100644 --- a/src/module.c +++ b/src/module.c @@ -3757,9 +3757,9 @@ int modulePopulateReplicationInfoStructure(void *ri, int structver) { ValkeyModuleReplicationInfoV1 *ri1 = ri; memset(ri1, 0, sizeof(*ri1)); ri1->version = structver; - ri1->primary = server.primary_host == NULL; - ri1->primary_host = server.primary_host ? server.primary_host : ""; - ri1->primary_port = server.primary_port; + ri1->primary = server.primary == NULL; + ri1->primary_host = server.primary ? server.primary->host : ""; + ri1->primary_port = server.primary ? server.primary->port : 0; ri1->replid1 = server.replid; ri1->replid2 = server.replid2; ri1->repl1_offset = server.primary_repl_offset; @@ -3948,7 +3948,7 @@ int VM_GetContextFlags(ValkeyModuleCtx *ctx) { if (ctx->client) { if (ctx->client->flag.deny_blocking) flags |= VALKEYMODULE_CTX_FLAGS_DENY_BLOCKING; /* Module command received from PRIMARY, is replicated. */ - if (ctx->client->flag.primary) flags |= VALKEYMODULE_CTX_FLAGS_REPLICATED; + if (ctx->client->flag.replication_source) flags |= VALKEYMODULE_CTX_FLAGS_REPLICATED; if (ctx->client->resp == 3) { flags |= VALKEYMODULE_CTX_FLAGS_RESP3; } @@ -3973,7 +3973,7 @@ int VM_GetContextFlags(ValkeyModuleCtx *ctx) { flags |= VALKEYMODULE_CTX_FLAGS_LOADING; /* Maxmemory and eviction policy */ - if (server.maxmemory > 0 && (!server.primary_host || !server.repl_replica_ignore_maxmemory)) { + if (server.maxmemory > 0 && (!server.primary || !server.repl_replica_ignore_maxmemory)) { flags |= VALKEYMODULE_CTX_FLAGS_MAXMEMORY; if (server.maxmemory_policy != MAXMEMORY_NO_EVICTION) flags |= VALKEYMODULE_CTX_FLAGS_EVICT; @@ -3984,22 +3984,22 @@ int VM_GetContextFlags(ValkeyModuleCtx *ctx) { if (server.saveparamslen > 0) flags |= VALKEYMODULE_CTX_FLAGS_RDB; /* Replication flags */ - if (server.primary_host == NULL) { + if (server.primary == NULL) { flags |= VALKEYMODULE_CTX_FLAGS_PRIMARY; } else { flags |= VALKEYMODULE_CTX_FLAGS_REPLICA; if (server.repl_replica_ro) flags |= VALKEYMODULE_CTX_FLAGS_READONLY; /* Replica state flags. */ - if (server.repl_state == REPL_STATE_CONNECT || server.repl_state == REPL_STATE_CONNECTING) { + if (server.primary->state == REPL_STATE_CONNECT || server.primary->state == REPL_STATE_CONNECTING) { flags |= VALKEYMODULE_CTX_FLAGS_REPLICA_IS_CONNECTING; - } else if (server.repl_state == REPL_STATE_TRANSFER) { + } else if (server.primary->state == REPL_STATE_TRANSFER) { flags |= VALKEYMODULE_CTX_FLAGS_REPLICA_IS_TRANSFERRING; - } else if (server.repl_state == REPL_STATE_CONNECTED) { + } else if (server.primary->state == REPL_STATE_CONNECTED) { flags |= VALKEYMODULE_CTX_FLAGS_REPLICA_IS_ONLINE; } - if (server.repl_state != REPL_STATE_CONNECTED) flags |= VALKEYMODULE_CTX_FLAGS_REPLICA_IS_STALE; + if (server.primary->state != REPL_STATE_CONNECTED) flags |= VALKEYMODULE_CTX_FLAGS_REPLICA_IS_STALE; } /* OOM flag. 
*/ @@ -6462,7 +6462,7 @@ ValkeyModuleCallReply *VM_Call(ValkeyModuleCtx *ctx, const char *cmdname, const goto cleanup; } - if (server.primary_host && server.repl_replica_ro && !obey_client) { + if (server.primary && server.repl_replica_ro && !obey_client) { errno = ESPIPE; if (error_as_call_replies) { sds msg = sdsdup(shared.roreplicaerr->ptr); @@ -6472,7 +6472,7 @@ ValkeyModuleCallReply *VM_Call(ValkeyModuleCtx *ctx, const char *cmdname, const } } - if (server.primary_host && server.repl_state != REPL_STATE_CONNECTED && server.repl_serve_stale_data == 0 && + if (server.primary && server.primary->state != REPL_STATE_CONNECTED && server.repl_serve_stale_data == 0 && !(cmd_flags & CMD_STALE)) { errno = ESPIPE; if (error_as_call_replies) { @@ -8782,7 +8782,7 @@ int VM_AddPostNotificationJob(ValkeyModuleCtx *ctx, ValkeyModulePostNotificationJobFunc callback, void *privdata, void (*free_privdata)(void *)) { - if (server.loading || (server.primary_host && server.repl_replica_ro)) { + if (server.loading || (server.primary && server.repl_replica_ro)) { return VALKEYMODULE_ERR; } ValkeyModulePostExecUnitJob *job = zmalloc(sizeof(*job)); @@ -13059,7 +13059,7 @@ int VM_RdbLoad(ValkeyModuleCtx *ctx, ValkeyModuleRdbStream *stream, int flags) { } /* Not allowed on replicas. */ - if (server.primary_host != NULL) { + if (server.primary != NULL) { errno = ENOTSUP; return VALKEYMODULE_ERR; } diff --git a/src/networking.c b/src/networking.c index 48e397e6f4..b9712d877a 100644 --- a/src/networking.c +++ b/src/networking.c @@ -290,7 +290,7 @@ int prepareClientToWrite(client *c) { /* Primaries don't receive replies, unless CLIENT_PRIMARY_FORCE_REPLY flag * is set. */ - if (c->flag.primary && !c->flag.primary_force_reply) return C_ERR; + if (c->flag.replication_source && !c->flag.primary_force_reply) return C_ERR; /* Skip the fake client, such as the fake client for AOF loading. * But CLIENT_ID_CACHED_RESPONSE is allowed since it is a fake client @@ -1599,7 +1599,7 @@ void clearClientConnectionState(client *c) { c->flag.replica = 0; } - serverAssert(!(c->flag.replica || c->flag.primary)); + serverAssert(!(c->flag.replica || c->flag.replication_source)); if (c->flag.tracking) disableTracking(c); selectDb(c, 0); @@ -1668,7 +1668,7 @@ void freeClient(client *c) { * * Note that before doing this we make sure that the client is not in * some unexpected state, by checking its flags. */ - if (server.primary && c->flag.primary) { + if (server.primary && server.primary->client == c) { serverLog(LL_NOTICE, "Connection with primary lost."); if (!c->flag.dont_cache_primary && !(c->flag.protocol_error || c->flag.blocked)) { c->flag.close_asap = 0; @@ -1818,7 +1818,7 @@ void beforeNextClient(client *c) { * blocked client as well */ /* Trim the query buffer to the current position. */ - if (c->flag.primary) { + if (c->flag.replication_source) { /* If the client is a primary, trim the querybuf to repl_applied, * since primary client is very special, its querybuf not only * used to parse command, but also proxy to sub-replicas. @@ -2148,7 +2148,7 @@ int postWriteToClient(client *c) { * as an interaction, since we always send REPLCONF ACK commands * that take some time to just fill the socket output buffer. * We just rely on data / pings received for timeout detection. 
*/ - if (!c->flag.primary) c->last_interaction = server.unixtime; + if (!c->flag.replication_source) c->last_interaction = server.unixtime; } if (!clientHasPendingReplies(c)) { c->sentlen = 0; @@ -2236,7 +2236,7 @@ int handleReadResult(client *c) { c->last_interaction = server.unixtime; c->net_input_bytes += c->nread; - if (c->flag.primary) { + if (c->flag.replication_source) { c->repl_data->read_reploff += c->nread; server.stat_net_repl_input_bytes += c->nread; } else { @@ -2642,7 +2642,7 @@ void processInlineBuffer(client *c) { * CLIENT_PROTOCOL_ERROR. */ #define PROTO_DUMP_LEN 128 static void setProtocolError(const char *errstr, client *c) { - if (server.verbosity <= LL_VERBOSE || c->flag.primary) { + if (server.verbosity <= LL_VERBOSE || c->flag.replication_source) { sds client = catClientInfoString(sdsempty(), c, server.hide_user_data_from_log); /* Sample some protocol to given an idea about what was inside. */ @@ -2664,7 +2664,7 @@ static void setProtocolError(const char *errstr, client *c) { } /* Log all the client and protocol info. */ - int loglevel = (c->flag.primary) ? LL_WARNING : LL_VERBOSE; + int loglevel = (c->flag.replication_source) ? LL_WARNING : LL_VERBOSE; serverLog(loglevel, "Protocol error (%s) from client: %s. %s", errstr, client, buf); sdsfree(client); } @@ -2895,7 +2895,7 @@ void commandProcessed(client *c) { if (!c->repl_data) return; long long prev_offset = c->repl_data->reploff; - if (c->flag.primary && !c->flag.multi) { + if (c->flag.replication_source && !c->flag.multi) { /* Update the applied replication offset of our primary. */ c->repl_data->reploff = c->repl_data->read_reploff - sdslen(c->querybuf) + c->qb_pos; } @@ -2906,7 +2906,7 @@ void commandProcessed(client *c) { * applied to the primary state: this quantity, and its corresponding * part of the replication stream, will be propagated to the * sub-replicas and to the replication backlog. */ - if (c->flag.primary) { + if (c->flag.replication_source) { long long applied = c->repl_data->reploff - prev_offset; if (applied) { replicationFeedStreamFromPrimaryStream(c->querybuf + c->repl_data->repl_applied, applied); @@ -3014,7 +3014,7 @@ int canParseCommand(client *c) { * condition on the replica. We want just to accumulate the replication * stream (instead of replying -BUSY like we do with other clients) and * later resume the processing. */ - if (isInsideYieldingLongCommand() && c->flag.primary) return 0; + if (isInsideYieldingLongCommand() && c->flag.replication_source) return 0; /* CLIENT_CLOSE_AFTER_REPLY closes the connection once the reply is * written to the client. Make sure to not let the reply grow after @@ -3033,7 +3033,7 @@ int processInputBuffer(client *c) { break; } - c->read_flags = c->flag.primary ? READ_FLAGS_PRIMARY : 0; + c->read_flags = c->flag.replication_source ? READ_FLAGS_PRIMARY : 0; c->read_flags |= authRequired(c) ? READ_FLAGS_AUTH_REQUIRED : 0; parseCommand(c); @@ -3097,7 +3097,7 @@ void readToQueryBuf(client *c) { /* Primary client needs expand the readlen when meet BIG_ARG(see #9100), * but doesn't need align to the next arg, we can read more data. 
*/ - if (c->flag.primary && readlen < PROTO_IOBUF_LEN) readlen = PROTO_IOBUF_LEN; + if (c->flag.replication_source && readlen < PROTO_IOBUF_LEN) readlen = PROTO_IOBUF_LEN; } if (c->querybuf == NULL) { @@ -3240,7 +3240,7 @@ sds catClientInfoString(sds s, client *client, int hide_user_data) { *p++ = 'S'; } - if (client->flag.primary) *p++ = 'M'; + if (client->flag.replication_source) *p++ = 'M'; if (client->flag.pubsub) *p++ = 'P'; if (client->flag.multi) *p++ = 'x'; if (client->flag.blocked) *p++ = 'b'; @@ -3458,7 +3458,7 @@ void resetCommand(client *c) { flags.replica = 0; } - if (flags.replica || flags.primary || flags.module) { + if (flags.replica || flags.replication_source || flags.module) { addReplyError(c, "can only reset normal client connections"); return; } @@ -4132,7 +4132,7 @@ void helloCommand(client *c) { if (!server.sentinel_mode) { addReplyBulkCString(c, "role"); - addReplyBulkCString(c, server.primary_host ? "replica" : "master"); + addReplyBulkCString(c, server.primary ? "replica" : "master"); } addReplyBulkCString(c, "modules"); @@ -4363,7 +4363,7 @@ size_t getClientMemoryUsage(client *c, size_t *output_buffer_mem_usage) { * CLIENT_TYPE_PRIMARY -> The client representing our replication primary. */ int getClientType(client *c) { - if (c->flag.primary) return CLIENT_TYPE_PRIMARY; + if (c->flag.replication_source) return CLIENT_TYPE_PRIMARY; /* Even though MONITOR clients are marked as replicas, we * want the expose them as normal clients. */ if (c->flag.replica && !c->flag.monitor) return CLIENT_TYPE_REPLICA; diff --git a/src/rdb.c b/src/rdb.c index 0bb5d7d45d..57fae239ad 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -1869,7 +1869,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { if (server.sanitize_dump_payload == SANITIZE_DUMP_CLIENTS) { /* Skip sanitization when loading (an RDB), or getting a RESTORE command * from either the primary or a client using an ACL user with the skip-sanitize-payload flag. */ - int skip = server.loading || (server.current_client && (server.current_client->flag.primary)); + int skip = server.loading || (server.current_client && (server.current_client->flag.replication_source)); if (!skip && server.current_client && server.current_client->user) skip = !!(server.current_client->user->flags & USER_FLAG_SANITIZE_PAYLOAD_SKIP); deep_integrity_validation = !skip; @@ -2934,12 +2934,12 @@ void rdbLoadProgressCallback(rio *r, const void *buf, size_t len) { if (server.loading_process_events_interval_bytes && (r->processed_bytes + len) / server.loading_process_events_interval_bytes > r->processed_bytes / server.loading_process_events_interval_bytes) { - if (server.primary_host && server.repl_state == REPL_STATE_TRANSFER) replicationSendNewlineToPrimary(); + replicationSendNewlineToConnectedLinks(); loadingAbsProgress(r->processed_bytes); processEventsWhileBlocked(); processModuleLoadingProgressEvent(0); } - if (server.repl_state == REPL_STATE_TRANSFER && rioCheckType(r) == RIO_TYPE_CONN) { + if (server.primary && server.primary->state == REPL_STATE_TRANSFER && rioCheckType(r) == RIO_TYPE_CONN) { server.stat_net_repl_input_bytes += len; } } @@ -3526,12 +3526,13 @@ void killRDBChild(void) { /* Spawn an RDB child that writes the RDB to the sockets of the replicas * that are currently in REPLICA_STATE_WAIT_BGSAVE_START state. 
*/
-int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi) {
+int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, list *slot_ranges) {
 listNode *ln;
 listIter li;
 pid_t childpid;
 int pipefds[2], rdb_pipe_write = 0, safe_to_exit_pipe = 0;
 int dual_channel = (req & REPLICA_REQ_RDB_CHANNEL);
+ int aof = (req & REPLICA_REQ_AOF_FORMAT);
 if (hasActiveChildProcess()) return C_ERR;
 serverAssert(server.rdb_pipe_read == -1 && server.rdb_child_exit_pipe == -1);
@@ -3560,7 +3561,7 @@ int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi) {
 server.rdb_child_exit_pipe = pipefds[1]; /* write end */
 }
 /* Collect the connections of the replicas we want to transfer
- * the RDB to, which are i WAIT_BGSAVE_START state. */
+ * the RDB to, which are in WAIT_BGSAVE_START state. */
 int connsnum = 0;
 connection **conns = zmalloc(sizeof(connection *) * listLength(server.replicas));
 server.rdb_pipe_conns = NULL;
@@ -3576,6 +3577,8 @@
 if (replica->repl_data->repl_state == REPLICA_STATE_WAIT_BGSAVE_START) {
 /* Check replica has the exact requirements */
 if (replica->repl_data->replica_req != req) continue;
+ /* No attempt to coalesce slot ranges, just use equality */
+ if (replica->repl_data->slot_ranges != slot_ranges) continue;
 conns[connsnum++] = replica->conn;
 if (dual_channel) {
@@ -3615,7 +3618,16 @@
 }
 serverSetCpuAffinity(server.bgsave_cpulist);
- retval = rdbSaveRioWithEOFMark(req, &rdb, NULL, rsi);
+ if (aof) {
+ serverLog(LL_NOTICE, "Background AOF transfer started by pid %ld", (long)getpid());
+ retval = rewriteAppendOnlyFileRio(&rdb, slot_ranges);
+ rioWrite(&rdb, "*3\r\n", 4);
+ rioWriteBulkString(&rdb, "REPLCONF", 8);
+ rioWriteBulkString(&rdb, "SYNC-PAYLOAD-END", 16);
+ rioWriteBulkLongLong(&rdb, rsi->repl_stream_db);
+ } else {
+ retval = rdbSaveRioWithEOFMark(req, &rdb, NULL, rsi);
+ }
 if (retval == C_OK && rioFlush(&rdb) == 0) retval = C_ERR;
 if (retval == C_OK) {
@@ -3778,7 +3790,7 @@ rdbSaveInfo *rdbPopulateSaveInfo(rdbSaveInfo *rsi) {
 * connects to us, the NULL repl_backlog will trigger a full
 * synchronization, at the same time we will use a new replid and clear
 * replid2. */
- if (!server.primary_host && server.repl_backlog) {
+ if (!server.primary && server.repl_backlog) {
 /* Note that when server.replicas_eldb is -1, it means that this primary
 * didn't apply any write commands after a full synchronization.
 * So we can let repl_stream_db be 0, this allows a restarted replica
@@ -3791,7 +3803,7 @@
 /* If the instance is a replica we need a connected primary
 * in order to fetch the currently selected DB.
*/ if (server.primary) { - rsi->repl_stream_db = server.primary->db->id; + rsi->repl_stream_db = server.primary->client->db->id; return rsi; } diff --git a/src/rdb.h b/src/rdb.h index 7342a926b5..440620e5bb 100644 --- a/src/rdb.h +++ b/src/rdb.h @@ -152,7 +152,7 @@ int rdbSaveObjectType(rio *rdb, robj *o); int rdbLoadObjectType(rio *rdb); int rdbLoad(char *filename, rdbSaveInfo *rsi, int rdbflags); int rdbSaveBackground(int req, char *filename, rdbSaveInfo *rsi, int rdbflags); -int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi); +int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, list *slotRanges); void rdbRemoveTempFile(pid_t childpid, int from_signal); int rdbSaveToFile(const char *filename); int rdbSave(int req, char *filename, rdbSaveInfo *rsi, int rdbflags); diff --git a/src/replication.c b/src/replication.c index 9913d64d65..cecfad5ee5 100644 --- a/src/replication.c +++ b/src/replication.c @@ -47,16 +47,17 @@ #include void replicationDiscardCachedPrimary(void); -void replicationResurrectCachedPrimary(connection *conn); -void replicationResurrectProvisionalPrimary(void); -void replicationSendAck(void); +void replicationResurrectCachedPrimary(replicationLink *link); +void replicationResurrectProvisionalSource(replicationLink *link); +void replicationSendAck(replicationLink *link); int replicaPutOnline(client *replica); void replicaStartCommandStream(client *replica); -int cancelReplicationHandshake(int reconnect); -void replicationSteadyStateInit(void); +int cancelReplicationHandshake(replicationLink *link, int reconnect); +void replicationSteadyStateInit(replicationLink *link); void dualChannelSetupMainConnForPsync(connection *conn); -void dualChannelSyncHandleRdbLoadCompletion(void); -static void dualChannelFullSyncWithPrimary(connection *conn); +int dualChannelSyncHandleRdbLoadCompletion(replicationLink *link); +static void dualChannelFullSyncWithReplicationSource(connection *conn); +void replicationFinishSyncPayload(connection *conn, replicationLink *link, int db); /* We take a global flag to remember if this instance generated an RDB * because of replication, so that we can remove the RDB file in case @@ -537,7 +538,7 @@ void replicationFeedReplicas(int dictid, robj **argv, int argc) { * propagate *identical* replication stream. In this way this replica can * advertise the same replication ID as the primary (since it shares the * primary replication history and has the same backlog and offsets). */ - if (server.primary_host != NULL) return; + if (server.primary != NULL) return; /* If there aren't replicas, and there is no backlog buffer to populate, * we can return ASAP. */ @@ -952,7 +953,7 @@ int primaryTryPartialResynchronization(client *c, long long psync_offset) { * started. * * Returns C_OK on success or C_ERR otherwise. */ -int startBgsaveForReplication(int mincapa, int req) { +int startBgsaveForReplication(int mincapa, int req, list *slot_ranges) { int retval; int socket_target = 0; listIter li; @@ -965,9 +966,10 @@ int startBgsaveForReplication(int mincapa, int req) { /* `SYNC` should have failed with error if we don't support socket and require a filter, assert this here */ serverAssert(socket_target || !(req & REPLICA_REQ_RDB_MASK)); - serverLog(LL_NOTICE, "Starting BGSAVE for SYNC with target: %s using: %s", + serverLog(LL_NOTICE, "Starting BGSAVE for SYNC with target: %s using: %s with format %s", socket_target ? "replicas sockets" : "disk", - (req & REPLICA_REQ_RDB_CHANNEL) ? "dual-channel" : "normal sync"); + (req & REPLICA_REQ_RDB_CHANNEL) ? 
"dual-channel" : "normal sync", + (req & REPLICA_REQ_AOF_FORMAT) ? "AOF" : "RDB"); rdbSaveInfo rsi, *rsiptr; rsiptr = rdbPopulateSaveInfo(&rsi); @@ -975,7 +977,7 @@ int startBgsaveForReplication(int mincapa, int req) { * otherwise replica will miss repl-stream-db. */ if (rsiptr) { if (socket_target) - retval = rdbSaveToReplicasSockets(req, rsiptr); + retval = rdbSaveToReplicasSockets(req, rsiptr, slot_ranges); else { /* Keep the page cache since it'll get used soon */ retval = rdbSaveBackground(req, server.rdb_filename, rsiptr, RDBFLAGS_REPLICATION | RDBFLAGS_KEEP_CACHE); @@ -1046,7 +1048,7 @@ void syncCommand(client *c) { * become a primary if so. */ if (c->argc > 3 && !strcasecmp(c->argv[0]->ptr, "psync") && !strcasecmp(c->argv[3]->ptr, "failover")) { serverLog(LL_NOTICE, "Failover request received for replid %s.", (unsigned char *)c->argv[1]->ptr); - if (!server.primary_host) { + if (server.primary == NULL) { addReplyError(c, "PSYNC FAILOVER can't be sent to a master."); return; } @@ -1074,7 +1076,7 @@ void syncCommand(client *c) { /* Refuse SYNC requests if we are a replica but the link with our primary * is not ok... */ - if (server.primary_host && server.repl_state != REPL_STATE_CONNECTED) { + if (server.primary && server.primary->state != REPL_STATE_CONNECTED) { addReplyError(c, "-NOMASTERLINK Can't SYNC while not connected with my master"); return; } @@ -1096,6 +1098,12 @@ void syncCommand(client *c) { return; } + /* Fail sync if it is asking for AOF format and a slot is not set via REPLCONF already. */ + if (c->repl_data->replica_req & REPLICA_REQ_AOF_FORMAT && c->repl_data->slot_ranges == NULL) { + addReplyError(c, "AOF format is only supported for single slot SYNC"); + return; + } + serverLog(LL_NOTICE, "Replica %s asks for synchronization", replicationGetReplicaName(c)); /* Try a partial resynchronization if this is a PSYNC command. @@ -1171,8 +1179,11 @@ void syncCommand(client *c) { server.replid, server.replid2); } + /* For slot level replication, we make no attempt to coallesce BGSAVEs */ + int require_dedicated = c->repl_data->slot_ranges != NULL; + /* CASE 1: BGSAVE is in progress, with disk target. */ - if (server.child_type == CHILD_TYPE_RDB && server.rdb_child_type == RDB_CHILD_TYPE_DISK) { + if (!require_dedicated && server.child_type == CHILD_TYPE_RDB && server.rdb_child_type == RDB_CHILD_TYPE_DISK) { /* Ok a background save is in progress. Let's check if it is a good * one for replication, i.e. if there is another replica that is * registering differences since the server forked to save. */ @@ -1205,32 +1216,35 @@ void syncCommand(client *c) { * register differences. */ serverLog(LL_NOTICE, "Can't attach the replica to the current BGSAVE. Waiting for next BGSAVE for SYNC"); } + } - /* CASE 2: BGSAVE is in progress, with socket target. */ - } else if (server.child_type == CHILD_TYPE_RDB && server.rdb_child_type == RDB_CHILD_TYPE_SOCKET) { + /* CASE 2: BGSAVE is in progress, with socket target. */ + if (server.child_type == CHILD_TYPE_RDB && server.rdb_child_type == RDB_CHILD_TYPE_SOCKET) { /* There is an RDB child process but it is writing directly to * children sockets. We need to wait for the next BGSAVE * in order to synchronize. */ serverLog(LL_NOTICE, "Current BGSAVE has socket target. Waiting for next BGSAVE for SYNC"); + return; + } - /* CASE 3: There is no BGSAVE is in progress. 
*/
- } else {
- if (server.repl_diskless_sync && (c->repl_data->replica_capa & REPLICA_CAPA_EOF) && server.repl_diskless_sync_delay) {
- /* Diskless replication RDB child is created inside
- * replicationCron() since we want to delay its start a
- * few seconds to wait for more replicas to arrive. */
- serverLog(LL_NOTICE, "Delay next BGSAVE for diskless SYNC");
- } else {
- /* We don't have a BGSAVE in progress, let's start one. Diskless
- * or disk-based mode is determined by replica's capacity. */
- if (!hasActiveChildProcess()) {
- startBgsaveForReplication(c->repl_data->replica_capa, c->repl_data->replica_req);
- } else {
- serverLog(LL_NOTICE, "No BGSAVE in progress, but another BG operation is active. "
- "BGSAVE for replication delayed");
- }
- }
+ /* CASE 3: There is no BGSAVE in progress, but we need to delay. */
+ if (!require_dedicated && server.repl_diskless_sync && (c->repl_data->replica_capa & REPLICA_CAPA_EOF) && server.repl_diskless_sync_delay) {
+ /* Diskless replication RDB child is created inside
+ * replicationCron() since we want to delay its start a
+ * few seconds to wait for more replicas to arrive. */
+ serverLog(LL_NOTICE, "Delay next BGSAVE for diskless SYNC");
+ return;
+ }
+
+ /* CASE 4: We don't have a BGSAVE in progress, but there is an existing child process. */
+ if (hasActiveChildProcess()) {
+ serverLog(LL_NOTICE, "No BGSAVE in progress, but another BG operation is active. "
+ "BGSAVE for replication delayed");
+ return;
 }
+
+ /* CASE 5: We are good to start a BGSAVE. Diskless or disk-based mode is determined by replica's capacity. */
+ startBgsaveForReplication(c->repl_data->replica_capa, c->repl_data->replica_req, c->repl_data->slot_ranges);
 return;
 }
@@ -1294,8 +1308,13 @@ void freeClientReplicationData(client *c) {
 moduleFireServerEvent(VALKEYMODULE_EVENT_REPLICA_CHANGE, VALKEYMODULE_SUBEVENT_REPLICA_CHANGE_OFFLINE, NULL);
 }
- if (c->flag.primary) replicationHandlePrimaryDisconnection();
+ if (c->flag.replication_source) {
+ replicationHandleSourceDisconnection(c->repl_data->link);
+ }
 sdsfree(c->repl_data->replica_addr);
+ if (c->repl_data->slot_ranges) {
+ freeSlotRanges(c->repl_data->slot_ranges);
+ }
 zfree(c->repl_data);
 c->repl_data = NULL;
 }
@@ -1420,7 +1439,7 @@ void replconfCommand(client *c) {
 } else if (!strcasecmp(c->argv[j]->ptr, "getack")) {
 /* REPLCONF GETACK is used in order to request an ACK ASAP
 * to the replica. */
- if (server.primary_host && server.primary) replicationSendAck();
+ if (server.primary && server.primary->client) replicationSendAck(server.primary);
 return;
 } else if (!strcasecmp(c->argv[j]->ptr, "rdb-only")) {
 /* REPLCONF RDB-ONLY is used to identify the client only wants
@@ -1491,6 +1510,45 @@
 return;
 }
 c->repl_data->associated_rdb_client_id = (uint64_t)client_id;
+ } else if (!strcasecmp(c->argv[j]->ptr, "slot-bitmap")) {
+ /* REPLCONF slot-bitmap is used to filter the replication stream to a given set of slots.
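+ * The payload is a raw bulk string of CLUSTER_SLOTS / 8 (2048) bytes, one bit per
+ * slot. As a minimal sketch of the exchange driven by the migration target
+ * (illustrative only; slot 42 stands in for whatever slots are being moved):
+ *
+ *     unsigned char bitmap[CLUSTER_SLOTS / 8] = {0};
+ *     bitmapSetBit(bitmap, 42);
+ *     REPLCONF slot-bitmap <bitmap>    (the 2048-byte bulk string above)
+ *     SYNC                             (the snapshot then arrives in AOF format)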
*/
+ if (!server.cluster_enabled) {
+ addReplyError(c, "Cannot replicate a slot when cluster mode is disabled");
+ return;
+ }
+ if (c->repl_data->slot_ranges != NULL) {
+ addReplyError(c, "Slot bitmap already set");
+ return;
+ }
+ if (stringObjectLen(c->argv[j + 1]) != CLUSTER_SLOTS / 8) {
+ addReplyError(c, "Invalid slot bitmap length");
+ return;
+ }
+ list *slot_ranges;
+ bitmapToSlotRanges(c->argv[j + 1]->ptr, &slot_ranges);
+ for (int slot = 0; slot < CLUSTER_SLOTS; slot++) {
+ if (bitmapTestBit(c->argv[j + 1]->ptr, slot) && server.cluster->slots[slot] != server.cluster->myself) {
+ addReplyErrorFormat(c, "I cannot replicate slot %d since I do not own it", slot);
+ freeSlotRanges(slot_ranges);
+ return;
+ }
+ }
+ c->repl_data->slot_ranges = slot_ranges;
+
+ /* For now, we only support AOF for slot transfer. */
+ c->repl_data->replica_req |= REPLICA_REQ_AOF_FORMAT;
+ } else if (!strcasecmp(c->argv[j]->ptr, "sync-payload-end")) {
+ /* REPLCONF sync-payload-end is used to inform the replica
+ * that the primary has finished sending the sync snapshot, and
+ * that it is free to begin processing the replication backlog.
+ *
+ * dbnum specifies which db to stream the backlog into. */
+ int db_num = 0;
+ if (getIntFromObjectOrReply(c, c->argv[j + 1], &db_num, NULL) != C_OK || db_num < 0 || db_num >= server.dbnum) {
+ addReplyError(c, "Unable to parse DB number");
+ return;
+ }
+ serverLog(LL_NOTICE, "Got sync-payload-end for db %d", db_num);
+
+ replicationFinishSyncPayload(c->conn, c->repl_data->link, db_num);
 } else {
 addReplyErrorFormat(c, "Unrecognized REPLCONF option: %s", (char *)c->argv[j]->ptr);
 return;
@@ -1932,13 +1990,30 @@ void shiftReplicationId(void) {
 /* ----------------------------------- REPLICA -------------------------------- */
+char *replicationGetNameForLogs(replicationLink *link) {
+ if (link == server.primary)
+ return "PRIMARY";
+ if (link->slot_ranges != NULL)
+ return "SLOT OWNER";
+ return "OTHER REPLICATION SOURCE";
+}
+
 /* Returns 1 if the given replication state is a handshake state,
 * 0 otherwise. */
-int replicaIsInHandshakeState(void) {
- return server.repl_state >= REPL_STATE_RECEIVE_PING_REPLY && server.repl_state <= REPL_STATE_RECEIVE_PSYNC_REPLY;
+int replicaIsInHandshakeState(replicationLink *link) {
+ return link->state >= REPL_STATE_RECEIVE_PING_REPLY && link->state <= REPL_STATE_RECEIVE_PSYNC_REPLY;
+}
+
+void replicationSendNewlineOnLink(replicationLink *link) {
+ static time_t newline_sent;
+ if (time(NULL) != newline_sent) {
+ newline_sent = time(NULL);
+ /* Pinging back in this stage is best-effort. */
+ if (link->transfer_s) connWrite(link->transfer_s, "\n", 1);
+ }
 }
-/* Avoid the primary to detect the replica is timing out while loading the
+/* Prevent the primary from detecting replicas as timing out while loading the
 * RDB file in initial synchronization. We send a single newline character
 * that is valid protocol but is guaranteed to either be sent entirely or
 * not, since the byte is indivisible.
@@ -1946,12 +2021,15 @@ int replicaIsInHandshakeState(void) {
 * The function is called in two contexts: while we flush the current
 * data with emptyData(), and while we load the new data received as an
 * RDB file from the primary. */
-void replicationSendNewlineToPrimary(void) {
- static time_t newline_sent;
- if (time(NULL) != newline_sent) {
- newline_sent = time(NULL);
- /* Pinging back in this stage is best-effort.
*/ - if (server.repl_transfer_s) connWrite(server.repl_transfer_s, "\n", 1); +void replicationSendNewlineToConnectedLinks() { + listIter li; + listNode *ln; + listRewind(server.replication_links, &li); + while ((ln = listNext(&li))) { + replicationLink *link = (replicationLink *)ln->value; + if (link->state >= REPL_STATE_CONNECTING && link->state <= REPL_STATE_CANCELLED) { + replicationSendNewlineOnLink(link); + } } } @@ -1960,15 +2038,17 @@ void replicationSendNewlineToPrimary(void) { * after loading succeeded or failed. */ void replicationEmptyDbCallback(hashtable *d) { UNUSED(d); - if (server.repl_state == REPL_STATE_TRANSFER) replicationSendNewlineToPrimary(); + replicationSendNewlineToConnectedLinks(); } /* Once we have a link with the primary and the synchronization was * performed, this function materializes the primary client we store * at server.primary, starting from the specified file descriptor. */ -void replicationCreatePrimaryClientWithHandler(connection *conn, int dbid, ConnectionCallbackFunc handler) { - server.primary = createClient(conn); - if (conn) connSetReadHandler(server.primary->conn, handler); +client *createReplicationLinkClientWithHandler(replicationLink *link, connection *conn, int dbid, ConnectionCallbackFunc handler) { + client *c = createClient(conn); + if (conn) { + connSetReadHandler(conn, handler); + } /** * Important note: @@ -1981,27 +2061,33 @@ void replicationCreatePrimaryClientWithHandler(connection *conn, int dbid, Conne * to pass the execution to a background thread and unblock after the * execution is done. This is the reason why we allow blocking the replication * connection. */ - server.primary->flag.primary = 1; - server.primary->flag.authenticated = 1; - - /* Allocate a private query buffer for the primary client instead of using the shared query buffer. - * This is done because the primary's query buffer data needs to be preserved for my sub-replicas to use. */ - server.primary->querybuf = sdsempty(); - initClientReplicationData(server.primary); - server.primary->repl_data->reploff = server.primary_initial_offset; - server.primary->repl_data->read_reploff = server.primary->repl_data->reploff; - server.primary->user = NULL; /* This client can do everything. */ - memcpy(server.primary->repl_data->replid, server.primary_replid, sizeof(server.primary_replid)); + c->flag.replication_source = 1; + c->flag.authenticated = 1; + + + /* Allocate a private query buffer for the replication link client instead of using the shared query buffer. + * This is done because the replication link's query buffer data needs to be preserved for my sub-replicas to use. */ + c->querybuf = sdsempty(); + initClientReplicationData(c); + c->repl_data->reploff = link->initial_offset; + c->repl_data->read_reploff = c->repl_data->reploff; + c->user = NULL; /* This client can do everything. */ + c->repl_data->link = link; + memcpy(c->repl_data->replid, link->replid, sizeof(link->replid)); + /* If primary offset is set to -1, this primary is old and is not * PSYNC capable, so we flag it accordingly. 
*/ - if (server.primary->repl_data->reploff == -1) server.primary->flag.pre_psync = 1; - if (dbid != -1) selectDb(server.primary, dbid); + if (c->repl_data->reploff == -1) c->flag.pre_psync = 1; + if (dbid != -1) selectDb(c, dbid); + c->repl_data->slot_ranges = link->slot_ranges; + + return c; } /* Wrapper for replicationCreatePrimaryClientWithHandler, init primary connection handler * with ordinary client connection handler */ -void replicationCreatePrimaryClient(connection *conn, int dbid) { - replicationCreatePrimaryClientWithHandler(conn, dbid, readQueryFromClient); +client *createReplicationLinkClient(replicationLink *link, connection *conn, int dbid) { + return createReplicationLinkClientWithHandler(link, conn, dbid, readQueryFromClient); } /* This function will try to re-enable the AOF file after the @@ -2078,13 +2164,75 @@ void disklessLoadDiscardFunctionsLibCtx(functionsLibCtx *temp_functions_lib_ctx) void replicationAttachToNewPrimary(void) { /* Replica starts to apply data from new primary, we must discard the cached * primary structure. */ - serverAssert(server.primary == NULL); + serverAssert(server.primary == NULL || server.primary->client == NULL); replicationDiscardCachedPrimary(); disconnectReplicas(); /* Force our replicas to resync with us as well. */ freeReplicationBacklog(); /* Don't allow our chained replicas to PSYNC. */ } +void replicationFinishSyncPayload(connection *conn, replicationLink *link, int db) { + /* Final setup of the connected replica <- primary link */ + int link_closed = 0; + if (link->rdb_channel_state == REPL_DUAL_CHANNEL_RDB_LOAD) { + if (dualChannelSyncHandleRdbLoadCompletion(link) == C_ERR) { + /* This may happen if, while loading the backlog, our primary is unset */ + serverLog(LL_NOTICE, "%s <-> MYSELF sync: Failed to finalize dual channel load", replicationGetNameForLogs(link)); + freeReplicationLink(link); + link_closed = 1; + } + } else { + /* Client could have been previously created for AOF load. */ + if (!link->client) { + link->client = createReplicationLinkClient(link, link->transfer_s, db); + link->transfer_s = NULL; + } + link->state = REPL_STATE_CONNECTED; + /* Send the initial ACK immediately to put this replica in online state. */ + replicationSendAck(link); + } + + if (!link_closed && link == server.primary) { + server.repl_down_since = 0; + + /* Fire the primary link modules event. */ + moduleFireServerEvent(VALKEYMODULE_EVENT_PRIMARY_LINK_CHANGE, VALKEYMODULE_SUBEVENT_PRIMARY_LINK_UP, NULL); + if (link->state == REPL_STATE_CONNECTED) { + /* After a full resynchronization we use the replication ID and + * offset of the primary. The secondary ID / offset are cleared since + * we are starting a new history. */ + memcpy(server.replid, link->client->repl_data->replid, sizeof(server.replid)); + server.primary_repl_offset = link->client->repl_data->reploff; + } + clearReplicationId2(); + + /* Let's create the replication backlog if needed. Replicas need to + * accumulate the backlog regardless of the fact they have sub-replicas + * or not, in order to behave correctly if they are promoted to + * primaries after a failover. */ + if (server.repl_backlog == NULL) createReplicationBacklog(); + serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Finished with success"); + + if (server.supervised_mode == SUPERVISED_SYSTEMD) { + serverCommunicateSystemd("STATUS=PRIMARY <-> REPLICA sync: Finished with success. Ready to accept connections " + "in read-write mode.\n"); + } + } + + /* Restart the AOF subsystem now that we finished the sync. 
This + * will trigger an AOF rewrite, and when done will start appending + * to the new file. */ + if (server.aof_enabled && server.aof_state != AOF_OFF) restartAOFAfterSYNC(); + + /* In case of dual channel replication sync we want to close the RDB connection + * once the connection is established */ + if (!link_closed && conn == link->rdb_transfer_s) { + connClose(conn); + link->rdb_transfer_s = NULL; + } + return; +} + /* Asynchronously read the SYNC payload we receive from a primary */ #define REPL_MAX_WRITTEN_BEFORE_FSYNC (1024 * 1024 * 8) /* 8 MB */ void readSyncBulkPayload(connection *conn) { @@ -2096,6 +2244,11 @@ void readSyncBulkPayload(connection *conn) { int empty_db_flags = server.repl_replica_lazy_flush ? EMPTYDB_ASYNC : EMPTYDB_NO_FLAGS; off_t left; + replicationLink *link = (replicationLink *)connGetPrivateData(conn); + + /* RDB bulk load will only be used if we are sending all slots. */ + serverAssert(link->slot_ranges == NULL); + /* Static vars used to hold the EOF mark, and the last bytes received * from the server: when they match, we reached the end of the transfer. */ static char eofmark[RDB_EOF_MARK_SIZE]; @@ -2104,10 +2257,10 @@ void readSyncBulkPayload(connection *conn) { /* If repl_transfer_size == -1 we still have to read the bulk length * from the primary reply. */ - if (server.repl_transfer_size == -1) { + if (link->transfer_size == -1) { nread = connSyncReadLine(conn, buf, 1024, server.repl_syncio_timeout * 1000); if (nread == -1) { - serverLog(LL_WARNING, "I/O error reading bulk count from PRIMARY: %s", connGetLastError(conn)); + serverLog(LL_WARNING, "I/O error reading bulk count from %s: %s", replicationGetNameForLogs(link), connGetLastError(conn)); goto error; } else { /* nread here is returned by connSyncReadLine(), which calls syncReadLine() and @@ -2116,18 +2269,19 @@ void readSyncBulkPayload(connection *conn) { } if (buf[0] == '-') { - serverLog(LL_WARNING, "PRIMARY aborted replication with an error: %s", buf + 1); + serverLog(LL_WARNING, "%s aborted replication with an error: %s", replicationGetNameForLogs(link), buf + 1); goto error; } else if (buf[0] == '\0') { /* At this stage just a newline works as a PING in order to take * the connection live. So we refresh our last interaction * timestamp. */ - server.repl_transfer_lastio = server.unixtime; + link->transfer_lastio = server.unixtime; return; } else if (buf[0] != '$') { serverLog(LL_WARNING, - "Bad protocol from PRIMARY, the first byte is not '$' (we received '%s'), are you sure the host " + "Bad protocol from %s, the first byte is not '$' (we received '%s'), are you sure the host " "and port are right?", + replicationGetNameForLogs(link), buf); goto error; } @@ -2148,14 +2302,14 @@ void readSyncBulkPayload(connection *conn) { memset(lastbytes, 0, RDB_EOF_MARK_SIZE); /* Set any repl_transfer_size to avoid entering this code path * at the next call. */ - server.repl_transfer_size = 0; - serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: receiving streamed RDB from primary with EOF %s", + link->transfer_size = 0; + serverLog(LL_NOTICE, "%s <-> REPLICA sync: receiving streamed RDB from primary with EOF %s", replicationGetNameForLogs(link), use_diskless_load ? "to parser" : "to disk"); } else { usemark = 0; - server.repl_transfer_size = strtol(buf + 1, NULL, 10); - serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: receiving %lld bytes from primary %s", - (long long)server.repl_transfer_size, use_diskless_load ? 
"to parser" : "to disk"); + link->transfer_size = strtol(buf + 1, NULL, 10); + serverLog(LL_NOTICE, "%s <-> REPLICA sync: receiving %lld bytes from primary %s", replicationGetNameForLogs(link), + (long long)link->transfer_size, use_diskless_load ? "to parser" : "to disk"); } return; } @@ -2166,7 +2320,7 @@ void readSyncBulkPayload(connection *conn) { if (usemark) { readlen = sizeof(buf); } else { - left = server.repl_transfer_size - server.repl_transfer_read; + left = link->transfer_size - link->transfer_read; readlen = (left < (signed)sizeof(buf)) ? left : (signed)sizeof(buf); } @@ -2176,7 +2330,7 @@ void readSyncBulkPayload(connection *conn) { /* equivalent to EAGAIN */ return; } - serverLog(LL_WARNING, "I/O error trying to sync with PRIMARY: %s", + serverLog(LL_WARNING, "I/O error trying to sync with %s: %s", replicationGetNameForLogs(link), (nread == -1) ? connGetLastError(conn) : "connection lost"); goto error; } @@ -2202,19 +2356,20 @@ void readSyncBulkPayload(connection *conn) { /* Update the last I/O time for the replication transfer (used in * order to detect timeouts during replication), and write what we * got from the socket to the dump file on disk. */ - server.repl_transfer_lastio = server.unixtime; - if ((nwritten = write(server.repl_transfer_fd, buf, nread)) != nread) { + link->transfer_lastio = server.unixtime; + if ((nwritten = write(link->transfer_fd, buf, nread)) != nread) { serverLog(LL_WARNING, "Write error or short write writing to the DB dump file " - "needed for PRIMARY <-> REPLICA synchronization: %s", + "needed for %s <-> REPLICA synchronization: %s", + replicationGetNameForLogs(link), (nwritten == -1) ? strerror(errno) : "short write"); goto error; } - server.repl_transfer_read += nread; + link->transfer_read += nread; /* Delete the last 40 bytes from the file if we reached EOF. */ if (usemark && eof_reached) { - if (ftruncate(server.repl_transfer_fd, server.repl_transfer_read - RDB_EOF_MARK_SIZE) == -1) { + if (ftruncate(link->transfer_fd, link->transfer_read - RDB_EOF_MARK_SIZE) == -1) { serverLog(LL_WARNING, "Error truncating the RDB file received from the primary " "for SYNC: %s", @@ -2226,15 +2381,15 @@ void readSyncBulkPayload(connection *conn) { /* Sync data on disk from time to time, otherwise at the end of the * transfer we may suffer a big delay as the memory buffers are copied * into the actual disk. 
*/ - if (server.repl_transfer_read >= server.repl_transfer_last_fsync_off + REPL_MAX_WRITTEN_BEFORE_FSYNC) { - off_t sync_size = server.repl_transfer_read - server.repl_transfer_last_fsync_off; - rdb_fsync_range(server.repl_transfer_fd, server.repl_transfer_last_fsync_off, sync_size); - server.repl_transfer_last_fsync_off += sync_size; + if (link->transfer_read >= link->transfer_last_fsync_off + REPL_MAX_WRITTEN_BEFORE_FSYNC) { + off_t sync_size = link->transfer_read - link->transfer_last_fsync_off; + rdb_fsync_range(link->transfer_fd, link->transfer_last_fsync_off, sync_size); + link->transfer_last_fsync_off += sync_size; } /* Check if the transfer is now complete */ if (!usemark) { - if (server.repl_transfer_read == server.repl_transfer_size) eof_reached = 1; + if (link->transfer_read == link->transfer_size) eof_reached = 1; } /* If the transfer is yet not complete, we need to read more, so @@ -2297,7 +2452,7 @@ void readSyncBulkPayload(connection *conn) { * It is enabled only on SWAPDB diskless replication when primary replication ID hasn't changed, * because in that state the old content of the db represents a different point in time of the same * data set we're currently receiving from the primary. */ - if (memcmp(server.replid, server.primary_replid, CONFIG_RUN_ID_SIZE) == 0) { + if (memcmp(server.replid, link->replid, CONFIG_RUN_ID_SIZE) == 0) { asyncLoading = 1; } dbarray = diskless_load_tempDb; @@ -2308,29 +2463,34 @@ void readSyncBulkPayload(connection *conn) { replicationAttachToNewPrimary(); /* Even though we are on-empty-db and the database is empty, we still call emptyData. */ - serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Flushing old data"); + serverLog(LL_NOTICE, "%s <-> REPLICA sync: Flushing old data", replicationGetNameForLogs(link)); emptyData(-1, empty_db_flags, replicationEmptyDbCallback); dbarray = server.db; functions_lib_ctx = functionsLibCtxGetCurrent(); } - rioInitWithConn(&rdb, conn, server.repl_transfer_size); + rioInitWithConn(&rdb, conn, link->transfer_size); /* Put the socket in blocking mode to simplify RDB transfer. * We'll restore it when the RDB is received. */ connBlock(conn); connRecvTimeout(conn, server.repl_timeout * 1000); - serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Loading DB in memory"); - startLoading(server.repl_transfer_size, RDBFLAGS_REPLICATION, asyncLoading); + serverLog(LL_NOTICE, "%s <-> REPLICA sync: Loading DB in memory", replicationGetNameForLogs(link)); + startLoading(link->transfer_size, RDBFLAGS_REPLICATION, asyncLoading); + + /* Before loading, ensure that the link won't be freed, even if + * REPLICAOF NO ONE is called in background event processing. */ + link->protected = 1; int loadingFailed = 0; rdbLoadingCtx loadingCtx = {.dbarray = dbarray, .functions_lib_ctx = functions_lib_ctx}; if (rdbLoadRioWithLoadingCtxScopedRdb(&rdb, RDBFLAGS_REPLICATION, &rsi, &loadingCtx) != C_OK) { /* RDB loading failed. */ - serverLog(LL_WARNING, "Failed trying to load the PRIMARY synchronization DB " - "from socket, check server logs."); + serverLog(LL_WARNING, "Failed trying to load the %s synchronization DB " + "from socket, check server logs.", + replicationGetNameForLogs(link)); loadingFailed = 1; } else if (usemark) { /* Verify the end mark is correct. 
*/ @@ -2340,6 +2500,14 @@ void readSyncBulkPayload(connection *conn) { } } + /* After loading, check if replication was cancelled */ + link->protected = 0; + if (link->state == REPL_STATE_CANCELLED) { + /* Link was freed during RDB load */ + serverLog(LL_NOTICE, "%s <-> REPLICA sync: Link to primary closed during diskless RDB load", replicationGetNameForLogs(link)); + loadingFailed = 1; + } + if (loadingFailed) { stopLoading(0); rioFreeConn(&rdb, NULL); @@ -2351,10 +2519,10 @@ void readSyncBulkPayload(connection *conn) { disklessLoadDiscardTempDb(diskless_load_tempDb); disklessLoadDiscardFunctionsLibCtx(temp_functions_lib_ctx); - serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Discarding temporary DB in background"); + serverLog(LL_NOTICE, "%s <-> REPLICA sync: Discarding temporary DB in background", replicationGetNameForLogs(link)); } else { /* Remove the half-loaded data in case we started with an empty replica. */ - serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Discarding the half-loaded data"); + serverLog(LL_NOTICE, "%s <-> REPLICA sync: Discarding the half-loaded data", replicationGetNameForLogs(link)); emptyData(-1, empty_db_flags, replicationEmptyDbCallback); } @@ -2371,7 +2539,7 @@ void readSyncBulkPayload(connection *conn) { * primary structure and force resync of sub-replicas. */ replicationAttachToNewPrimary(); - serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Swapping active DB with loaded DB"); + serverLog(LL_NOTICE, "%s <-> REPLICA sync: Swapping active DB with loaded DB", replicationGetNameForLogs(link)); swapMainDbWithTempDb(diskless_load_tempDb); /* swap existing functions ctx with the temporary one */ @@ -2382,7 +2550,7 @@ void readSyncBulkPayload(connection *conn) { /* Delete the old db as it's useless now. */ disklessLoadDiscardTempDb(diskless_load_tempDb); - serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Discarding old DB in background"); + serverLog(LL_NOTICE, "%s <-> REPLICA sync: Discarding old DB in background", replicationGetNameForLogs(link)); } /* Inform about db change, as replication was diskless and didn't cause a save. */ @@ -2398,20 +2566,22 @@ void readSyncBulkPayload(connection *conn) { } else { /* Make sure the new file (also used for persistence) is fully synced * (not covered by earlier calls to rdb_fsync_range). */ - if (fsync(server.repl_transfer_fd) == -1) { + if (fsync(link->transfer_fd) == -1) { serverLog(LL_WARNING, "Failed trying to sync the temp DB to disk in " - "PRIMARY <-> REPLICA synchronization: %s", + "%s <-> REPLICA synchronization: %s", + replicationGetNameForLogs(link), strerror(errno)); goto error; } /* Rename rdb like renaming rewrite aof asynchronously. 
*/ int old_rdb_fd = open(server.rdb_filename, O_RDONLY | O_NONBLOCK); - if (rename(server.repl_transfer_tmpfile, server.rdb_filename) == -1) { + if (rename(link->transfer_tmpfile, server.rdb_filename) == -1) { serverLog(LL_WARNING, "Failed trying to rename the temp DB into %s in " - "PRIMARY <-> REPLICA synchronization: %s", + "%s <-> REPLICA synchronization: %s", + replicationGetNameForLogs(link), server.rdb_filename, strerror(errno)); if (old_rdb_fd != -1) close(old_rdb_fd); goto error; @@ -2423,7 +2593,8 @@ void readSyncBulkPayload(connection *conn) { if (fsyncFileDir(server.rdb_filename) == -1) { serverLog(LL_WARNING, "Failed trying to sync DB directory %s in " - "PRIMARY <-> REPLICA synchronization: %s", + "%s <-> REPLICA synchronization: %s", + replicationGetNameForLogs(link), server.rdb_filename, strerror(errno)); goto error; } @@ -2435,13 +2606,14 @@ void readSyncBulkPayload(connection *conn) { /* Empty the databases only after the RDB file is ok, that is, before the RDB file * is actually loaded, in case we encounter an error and drop the replication stream * and leave an empty database. */ - serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Flushing old data"); + serverLog(LL_NOTICE, "%s <-> REPLICA sync: Flushing old data", replicationGetNameForLogs(link)); emptyData(-1, empty_db_flags, replicationEmptyDbCallback); - serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Loading DB in memory"); + serverLog(LL_NOTICE, "%s <-> REPLICA sync: Loading DB in memory", replicationGetNameForLogs(link)); if (rdbLoad(server.rdb_filename, &rsi, RDBFLAGS_REPLICATION) != RDB_OK) { - serverLog(LL_WARNING, "Failed trying to load the PRIMARY synchronization " - "DB from disk, check server logs."); + serverLog(LL_WARNING, "Failed trying to load the %s synchronization " + "DB from disk, check server logs.", + replicationGetNameForLogs(link)); if (server.rdb_del_sync_files && allPersistenceDisabled()) { serverLog(LL_NOTICE, "Removing the RDB file obtained from " "the primary. This replica has persistence " @@ -2450,7 +2622,7 @@ void readSyncBulkPayload(connection *conn) { } /* If disk-based RDB loading fails, remove the half-loaded dataset. */ - serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Discarding the half-loaded data"); + serverLog(LL_NOTICE, "%s<-> REPLICA sync: Discarding the half-loaded data", replicationGetNameForLogs(link)); emptyData(-1, empty_db_flags, replicationEmptyDbCallback); /* Note that there's no point in restarting the AOF on sync failure, @@ -2466,61 +2638,17 @@ void readSyncBulkPayload(connection *conn) { bg_unlink(server.rdb_filename); } - zfree(server.repl_transfer_tmpfile); - close(server.repl_transfer_fd); - server.repl_transfer_fd = -1; - server.repl_transfer_tmpfile = NULL; - } - - /* Final setup of the connected replica <- primary link */ - if (conn == server.repl_rdb_transfer_s) { - dualChannelSyncHandleRdbLoadCompletion(); - } else { - replicationCreatePrimaryClient(server.repl_transfer_s, rsi.repl_stream_db); - server.repl_state = REPL_STATE_CONNECTED; - server.repl_down_since = 0; - /* Send the initial ACK immediately to put this replica in online state. */ - replicationSendAck(); + zfree(link->transfer_tmpfile); + close(link->transfer_fd); + link->transfer_fd = -1; + link->transfer_tmpfile = NULL; } - /* Fire the primary link modules event. 
*/ - moduleFireServerEvent(VALKEYMODULE_EVENT_PRIMARY_LINK_CHANGE, VALKEYMODULE_SUBEVENT_PRIMARY_LINK_UP, NULL); - if (server.repl_state == REPL_STATE_CONNECTED) { - /* After a full resynchronization we use the replication ID and - * offset of the primary. The secondary ID / offset are cleared since - * we are starting a new history. */ - memcpy(server.replid, server.primary->repl_data->replid, sizeof(server.replid)); - server.primary_repl_offset = server.primary->repl_data->reploff; - } - clearReplicationId2(); - - /* Let's create the replication backlog if needed. Replicas need to - * accumulate the backlog regardless of the fact they have sub-replicas - * or not, in order to behave correctly if they are promoted to - * primaries after a failover. */ - if (server.repl_backlog == NULL) createReplicationBacklog(); - serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Finished with success"); - - if (server.supervised_mode == SUPERVISED_SYSTEMD) { - serverCommunicateSystemd("STATUS=PRIMARY <-> REPLICA sync: Finished with success. Ready to accept connections " - "in read-write mode.\n"); - } - - /* Restart the AOF subsystem now that we finished the sync. This - * will trigger an AOF rewrite, and when done will start appending - * to the new file. */ - if (server.aof_enabled) restartAOFAfterSYNC(); - - /* In case of dual channel replication sync we want to close the RDB connection - * once the connection is established */ - if (conn == server.repl_rdb_transfer_s) { - connClose(conn); - server.repl_rdb_transfer_s = NULL; - } + replicationFinishSyncPayload(conn, link, rsi.repl_stream_db); return; error: - cancelReplicationHandshake(1); + if (link) cancelReplicationHandshake(link, 1); return; } @@ -2531,7 +2659,8 @@ char *receiveSynchronousResponse(connection *conn) { serverLog(LL_WARNING, "Failed to read response from the server: %s", connGetLastError(conn)); return NULL; } - server.repl_transfer_lastio = server.unixtime; + replicationLink *link = (replicationLink *)connGetPrivateData(conn); + link->transfer_lastio = server.unixtime; return sdsnew(buf); } @@ -2628,35 +2757,34 @@ sds getReplicaPortString(void) { /* Replication: Replica side. * Free replica's local replication buffer */ -void freePendingReplDataBuf(void) { - listRelease(server.pending_repl_data.blocks); - server.pending_repl_data.blocks = NULL; - server.pending_repl_data.len = 0; +void freePendingReplDataBuf(replicationLink *link) { + listRelease(link->pending_repl_data.blocks); + link->pending_repl_data.blocks = NULL; + link->pending_repl_data.len = 0; } /* Replication: Replica side. * Upon dual-channel sync failure, close rdb-connection, reset repl-state, reset * provisional primary struct, and free local replication buffer. 
*/ -void replicationAbortDualChannelSyncTransfer(void) { - serverAssert(server.repl_rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE); +void replicationAbortDualChannelSyncTransfer(replicationLink *link) { + serverAssert(link->rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE); dualChannelServerLog(LL_NOTICE, "Aborting dual channel sync"); - if (server.repl_rdb_transfer_s) { - connClose(server.repl_rdb_transfer_s); - server.repl_rdb_transfer_s = NULL; - } - zfree(server.repl_transfer_tmpfile); - server.repl_transfer_tmpfile = NULL; - if (server.repl_transfer_fd != -1) { - close(server.repl_transfer_fd); - server.repl_transfer_fd = -1; - } - server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_STATE_NONE; - server.repl_provisional_primary.read_reploff = 0; - server.repl_provisional_primary.reploff = 0; - server.repl_provisional_primary.conn = NULL; - server.repl_provisional_primary.dbid = -1; - server.rdb_client_id = -1; - freePendingReplDataBuf(); + if (link->rdb_transfer_s) { + connClose(link->rdb_transfer_s); + link->rdb_transfer_s = NULL; + } + zfree(link->transfer_tmpfile); + link->transfer_tmpfile = NULL; + if (link->transfer_fd != -1) { + close(link->transfer_fd); + link->transfer_fd = -1; + } + link->rdb_channel_state = REPL_DUAL_CHANNEL_STATE_NONE; + link->provisional_source_state.read_reploff = 0; + link->provisional_source_state.reploff = 0; + link->provisional_source_state.dbid = -1; + link->rdb_client_id = -1; + freePendingReplDataBuf(link); return; } @@ -2678,7 +2806,7 @@ int sendCurrentOffsetToReplica(client *replica) { return C_OK; } -static int dualChannelReplHandleHandshake(connection *conn, sds *err) { +static int dualChannelReplHandleHandshake(replicationLink *link, sds *err) { dualChannelServerLog(LL_DEBUG, "Received first reply from primary using rdb connection."); /* AUTH with the primary if required. 
*/ if (server.primary_auth) { @@ -2693,7 +2821,7 @@ static int dualChannelReplHandleHandshake(connection *conn, sds *err) { args[argc] = server.primary_auth; lens[argc] = sdslen(server.primary_auth); argc++; - *err = sendCommandArgv(conn, argc, args, lens); + *err = sendCommandArgv(link->transfer_s, argc, args, lens); if (*err) { dualChannelServerLog(LL_WARNING, "Sending command to primary in dual channel replication handshake: %s", *err); return C_ERR; @@ -2701,7 +2829,7 @@ static int dualChannelReplHandleHandshake(connection *conn, sds *err) { } /* Send replica listening port to primary for clarification */ sds portstr = getReplicaPortString(); - *err = sendCommand(conn, "REPLCONF", "capa", "eof", "rdb-only", "1", "rdb-channel", "1", "listening-port", portstr, + *err = sendCommand(link->transfer_s, "REPLCONF", "capa", "eof", "rdb-only", "1", "rdb-channel", "1", "listening-port", portstr, NULL); sdsfree(portstr); if (*err) { @@ -2709,17 +2837,17 @@ static int dualChannelReplHandleHandshake(connection *conn, sds *err) { return C_ERR; } - if (connSetReadHandler(conn, dualChannelFullSyncWithPrimary) == C_ERR) { + if (connSetReadHandler(link->transfer_s, dualChannelFullSyncWithReplicationSource) == C_ERR) { char conninfo[CONN_INFO_LEN]; dualChannelServerLog(LL_WARNING, "Can't create readable event for SYNC: %s (%s)", strerror(errno), - connGetInfo(conn, conninfo, sizeof(conninfo))); + connGetInfo(link->transfer_s, conninfo, sizeof(conninfo))); return C_ERR; } return C_OK; } -static int dualChannelReplHandleAuthReply(connection *conn, sds *err) { - *err = receiveSynchronousResponse(conn); +static int dualChannelReplHandleAuthReply(replicationLink *link, sds *err) { + *err = receiveSynchronousResponse(link->transfer_s); if (*err == NULL) { dualChannelServerLog(LL_WARNING, "Primary did not respond to auth command during SYNC handshake"); return C_ERR; @@ -2728,12 +2856,11 @@ static int dualChannelReplHandleAuthReply(connection *conn, sds *err) { dualChannelServerLog(LL_WARNING, "Unable to AUTH to Primary: %s", *err); return C_ERR; } - server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_REPLCONF_REPLY; return C_OK; } -static int dualChannelReplHandleReplconfReply(connection *conn, sds *err) { - *err = receiveSynchronousResponse(conn); +static int dualChannelReplHandleReplconfReply(replicationLink *link, sds *err) { + *err = receiveSynchronousResponse(link->transfer_s); if (*err == NULL) { dualChannelServerLog(LL_WARNING, "Primary did not respond to replconf command during SYNC handshake"); return C_ERR; @@ -2744,16 +2871,20 @@ static int dualChannelReplHandleReplconfReply(connection *conn, sds *err) { *err); return C_ERR; } - if (connSyncWrite(conn, "SYNC\r\n", 6, server.repl_syncio_timeout * 1000) == -1) { - dualChannelServerLog(LL_WARNING, "I/O error writing to Primary: %s", connGetLastError(conn)); + if (connSyncWrite(link->transfer_s, "SYNC\r\n", 6, server.repl_syncio_timeout * 1000) == -1) { + dualChannelServerLog(LL_WARNING, "I/O error writing to Primary: %s", connGetLastError(link->transfer_s)); return C_ERR; } return C_OK; } -static int dualChannelReplHandleEndOffsetResponse(connection *conn, sds *err) { +int replicationUseAOFFormatSnapshot(replicationLink *link) { + return link->slot_ranges != NULL; +} + +static int dualChannelReplHandleEndOffsetResponse(replicationLink *link, sds *err) { uint64_t rdb_client_id; - *err = receiveSynchronousResponse(conn); + *err = receiveSynchronousResponse(link->transfer_s); if (*err == NULL) { return C_ERR; } @@ -2771,30 +2902,34 @@ static int 
dualChannelReplHandleEndOffsetResponse(connection *conn, sds *err) { dualChannelServerLog(LL_WARNING, "Received unexpected $ENDOFF response: %s", *err); return C_ERR; } - server.rdb_client_id = rdb_client_id; - server.primary_initial_offset = reploffset; + link->rdb_client_id = rdb_client_id; + link->initial_offset = reploffset; /* Initiate repl_provisional_primary to act as this replica temp primary until RDB is loaded */ - server.repl_provisional_primary.conn = server.repl_transfer_s; - memcpy(server.repl_provisional_primary.replid, primary_replid, sizeof(server.repl_provisional_primary.replid)); - server.repl_provisional_primary.reploff = reploffset; - server.repl_provisional_primary.read_reploff = reploffset; - server.repl_provisional_primary.dbid = dbid; + memcpy(link->provisional_source_state.replid, primary_replid, CONFIG_RUN_ID_SIZE + 1); + link->provisional_source_state.reploff = reploffset; + link->provisional_source_state.read_reploff = reploffset; + link->provisional_source_state.dbid = dbid; /* Now that we have the snapshot end-offset, we can ask for psync from that offset. Prepare the * main connection accordingly.*/ - server.repl_transfer_s->state = CONN_STATE_CONNECTED; - server.repl_state = REPL_STATE_SEND_HANDSHAKE; - serverAssert(connSetReadHandler(server.repl_transfer_s, dualChannelSetupMainConnForPsync) != C_ERR); - dualChannelSetupMainConnForPsync(server.repl_transfer_s); + link->transfer_s->state = CONN_STATE_CONNECTED; + link->state = REPL_STATE_SEND_HANDSHAKE; + serverAssert(connSetReadHandler(link->transfer_s, dualChannelSetupMainConnForPsync) != C_ERR); + dualChannelSetupMainConnForPsync(link->transfer_s); - /* As the next block we will receive using this connection is the rdb, we need to prepare + /* As the next block we will receive using this connection is the snapshot, we need to prepare * the connection accordingly */ - serverAssert(connSetReadHandler(server.repl_rdb_transfer_s, readSyncBulkPayload) != C_ERR); - server.repl_transfer_size = -1; - server.repl_transfer_read = 0; - server.repl_transfer_last_fsync_off = 0; - server.repl_transfer_lastio = server.unixtime; + if (replicationUseAOFFormatSnapshot(link)) { + link->client = createReplicationLinkClientWithHandler(link, link->rdb_transfer_s, -1, readQueryFromClient); + link->rdb_transfer_s = NULL; + } else { + serverAssert(connSetReadHandler(link->rdb_transfer_s, readSyncBulkPayload) != C_ERR); + } + link->transfer_size = -1; + link->transfer_read = 0; + link->transfer_last_fsync_off = 0; + link->transfer_lastio = server.unixtime; return C_OK; } @@ -2802,15 +2937,15 @@ static int dualChannelReplHandleEndOffsetResponse(connection *conn, sds *err) { /* Replication: Replica side. * This connection handler is used to initialize the RDB connection (dual-channel-replication). * Once a replica with dual-channel-replication enabled, denied from PSYNC with its primary, - * dualChannelFullSyncWithPrimary begins its role. The connection handler prepares server.repl_rdb_transfer_s + * dualChannelFullSyncWithReplicationSource begins its role. The connection handler prepares server.repl_rdb_transfer_s * for a rdb stream, and server.repl_transfer_s for incremental replication data stream. 
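+ *
+ * On this connection the handler below walks, in order:
+ *   REPL_DUAL_CHANNEL_SEND_HANDSHAKE         send AUTH (if configured) and
+ *                                            REPLCONF capa/rdb-only/rdb-channel/listening-port
+ *   REPL_DUAL_CHANNEL_RECEIVE_AUTH_REPLY     read the AUTH reply
+ *   REPL_DUAL_CHANNEL_RECEIVE_REPLCONF_REPLY read the REPLCONF reply, then write SYNC
+ *   REPL_DUAL_CHANNEL_RECEIVE_ENDOFF         parse $ENDOFF and prepare the main
+ *                                            connection for PSYNC
+ *   REPL_DUAL_CHANNEL_RDB_LOAD               the snapshot then streams on this connection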
*/ -static void dualChannelFullSyncWithPrimary(connection *conn) { +static void dualChannelFullSyncWithReplicationSource(connection *conn) { char *err = NULL; int ret = 0; - serverAssert(conn == server.repl_rdb_transfer_s); + replicationLink *link = (replicationLink *)connGetPrivateData(conn); /* If this event fired after the user turned the instance into a primary * with REPLICAOF NO ONE we must just return ASAP. */ - if (server.repl_state == REPL_STATE_NONE) { + if (link->state == REPL_STATE_NONE) { goto error; } /* Check for errors in the socket: after a non blocking connect() we @@ -2820,30 +2955,30 @@ static void dualChannelFullSyncWithPrimary(connection *conn) { connGetLastError(conn)); goto error; } - switch (server.repl_rdb_channel_state) { + switch (link->rdb_channel_state) { case REPL_DUAL_CHANNEL_SEND_HANDSHAKE: - ret = dualChannelReplHandleHandshake(conn, &err); - if (ret == C_OK) server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_AUTH_REPLY; + ret = dualChannelReplHandleHandshake(link, &err); + if (ret == C_OK) link->rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_AUTH_REPLY; break; case REPL_DUAL_CHANNEL_RECEIVE_AUTH_REPLY: if (server.primary_auth) { - ret = dualChannelReplHandleAuthReply(conn, &err); - if (ret == C_OK) server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_REPLCONF_REPLY; + ret = dualChannelReplHandleAuthReply(link, &err); + if (ret == C_OK) link->rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_REPLCONF_REPLY; /* Wait for next bulk before trying to read replconf reply. */ break; } - server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_REPLCONF_REPLY; + link->rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_REPLCONF_REPLY; /* fall through */ case REPL_DUAL_CHANNEL_RECEIVE_REPLCONF_REPLY: - ret = dualChannelReplHandleReplconfReply(conn, &err); - if (ret == C_OK) server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_ENDOFF; + ret = dualChannelReplHandleReplconfReply(link, &err); + if (ret == C_OK) link->rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_ENDOFF; break; case REPL_DUAL_CHANNEL_RECEIVE_ENDOFF: - ret = dualChannelReplHandleEndOffsetResponse(conn, &err); - if (ret == C_OK) server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_RDB_LOAD; + ret = dualChannelReplHandleEndOffsetResponse(link, &err); + if (ret == C_OK) link->rdb_channel_state = REPL_DUAL_CHANNEL_RDB_LOAD; break; default: - serverPanic("Unexpected dual replication state: %d", server.repl_rdb_channel_state); + serverPanic("Unexpected dual replication state: %d", link->rdb_channel_state); } if (ret == C_ERR) goto error; sdsfree(err); @@ -2854,29 +2989,29 @@ static void dualChannelFullSyncWithPrimary(connection *conn) { serverLog(LL_WARNING, "Dual channel sync failed with error %s", err); sdsfree(err); } - if (server.repl_transfer_s) { - connClose(server.repl_transfer_s); - server.repl_transfer_s = NULL; + if (link->transfer_s) { + connClose(link->transfer_s); + link->transfer_s = NULL; } - if (server.repl_rdb_transfer_s) { - connClose(server.repl_rdb_transfer_s); - server.repl_rdb_transfer_s = NULL; + if (link->rdb_transfer_s) { + connClose(link->rdb_transfer_s); + link->rdb_transfer_s = NULL; } - if (server.repl_transfer_fd != -1) close(server.repl_transfer_fd); - server.repl_transfer_fd = -1; - server.repl_state = REPL_STATE_CONNECT; - replicationAbortDualChannelSyncTransfer(); + if (link->transfer_fd != -1) close(link->transfer_fd); + link->transfer_fd = -1; + link->state = REPL_STATE_CONNECT; + replicationAbortDualChannelSyncTransfer(link); } /* Replication: Replica side. 
* Initialize server.pending_repl_data infrastructure, we will allocate the buffer * itself once we need it */ -void replDataBufInit(void) { - serverAssert(server.pending_repl_data.blocks == NULL); - server.pending_repl_data.len = 0; - server.pending_repl_data.peak = 0; - server.pending_repl_data.blocks = listCreate(); - server.pending_repl_data.blocks->free = zfree; +void replDataBufInit(replicationLink *link) { + serverAssert(link->pending_repl_data.blocks == NULL); + link->pending_repl_data.len = 0; + link->pending_repl_data.peak = 0; + link->pending_repl_data.blocks = listCreate(); + link->pending_repl_data.blocks->free = zfree; } /* Replication: Replica side. @@ -2887,7 +3022,7 @@ void replStreamProgressCallback(size_t offset, int readlen, time_t *last_progres ((offset + readlen) / server.loading_process_events_interval_bytes > offset / server.loading_process_events_interval_bytes) && (now - *last_progress_callback > server.loading_process_events_interval_ms)) { - replicationSendNewlineToPrimary(); + replicationSendNewlineToConnectedLinks(); processEventsWhileBlocked(); *last_progress_callback = now; } @@ -2902,14 +3037,16 @@ typedef struct replDataBufBlock { /* Replication: Replica side. * Reads replication data from primary into specified repl buffer block */ -int readIntoReplDataBlock(connection *conn, replDataBufBlock *data_block, size_t read) { - int nread = connRead(conn, data_block->buf + data_block->used, read); +int readIntoReplDataBlock(replicationLink *link, replDataBufBlock *data_block, size_t read) { + int nread = connRead(link->transfer_s, data_block->buf + data_block->used, read); if (nread <= 0) { - if (nread == 0 || connGetState(conn) != CONN_STATE_CONNECTED) { + if (nread == 0 || connGetState(link->transfer_s) != CONN_STATE_CONNECTED) { dualChannelServerLog(LL_WARNING, "Provisional primary closed connection"); - /* Signal ongoing RDB load to terminate gracefully */ - if (server.loading_rio) rioCloseASAP(server.loading_rio); - cancelReplicationHandshake(1); + if (link->rdb_channel_state == REPL_DUAL_CHANNEL_RDB_LOAD) { + /* Signal ongoing RDB load to terminate gracefully */ + if (server.loading_rio) rioCloseASAP(server.loading_rio); + } + cancelReplicationHandshake(link, 1); } return C_ERR; } @@ -2924,8 +3061,10 @@ void bufferReplData(connection *conn) { size_t readlen = PROTO_IOBUF_LEN; int remaining_bytes = 0; + replicationLink *link = (replicationLink *)connGetPrivateData(conn); + while (readlen > 0) { - listNode *ln = listLast(server.pending_repl_data.blocks); + listNode *ln = listLast(link->pending_repl_data.blocks); replDataBufBlock *tail = ln ? listNodeValue(ln) : NULL; /* Append to tail string when possible */ @@ -2933,11 +3072,11 @@ void bufferReplData(connection *conn) { size_t avail = tail->size - tail->used; remaining_bytes = min(readlen, avail); readlen -= remaining_bytes; - remaining_bytes = readIntoReplDataBlock(conn, tail, remaining_bytes); + remaining_bytes = readIntoReplDataBlock(link, tail, remaining_bytes); } if (readlen && remaining_bytes == 0) { if (server.client_obuf_limits[CLIENT_TYPE_REPLICA].hard_limit_bytes && - server.pending_repl_data.len > server.client_obuf_limits[CLIENT_TYPE_REPLICA].hard_limit_bytes) { + link->pending_repl_data.len > server.client_obuf_limits[CLIENT_TYPE_REPLICA].hard_limit_bytes) { dualChannelServerLog(LL_NOTICE, "Replication buffer limit reached, stopping buffering."); /* Stop accumulating primary commands. 
*/ connSetReadHandler(conn, NULL); @@ -2952,15 +3091,15 @@ void bufferReplData(connection *conn) { tail = zmalloc_usable(size + sizeof(replDataBufBlock), &usable_size); tail->size = usable_size - sizeof(replDataBufBlock); tail->used = 0; - listAddNodeTail(server.pending_repl_data.blocks, tail); - server.pending_repl_data.len += tail->size; + listAddNodeTail(link->pending_repl_data.blocks, tail); + link->pending_repl_data.len += tail->size; /* Update buffer's peak */ - if (server.pending_repl_data.peak < server.pending_repl_data.len) - server.pending_repl_data.peak = server.pending_repl_data.len; + if (link->pending_repl_data.peak < link->pending_repl_data.len) + link->pending_repl_data.peak = link->pending_repl_data.len; remaining_bytes = min(readlen, tail->size); readlen -= remaining_bytes; - remaining_bytes = readIntoReplDataBlock(conn, tail, remaining_bytes); + remaining_bytes = readIntoReplDataBlock(link, tail, remaining_bytes); } if (remaining_bytes > 0) { /* Stop reading in case we read less than we anticipated */ @@ -2974,29 +3113,34 @@ void bufferReplData(connection *conn) { /* Replication: Replica side. * Streams accumulated replication data into the database while freeing read nodes */ -int streamReplDataBufToDb(client *c) { - serverAssert(c->flag.primary); +int streamReplDataBufToDb(replicationLink *link) { + serverAssert(link->client->flag.replication_source); blockingOperationStarts(); size_t used, offset = 0; listNode *cur = NULL; time_t last_progress_callback = mstime(); - while (server.pending_repl_data.blocks && (cur = listFirst(server.pending_repl_data.blocks))) { + + /* Before loading, protect our link from being destructed. */ + link->protected = 1; + + while (link->pending_repl_data.blocks && (cur = listFirst(link->pending_repl_data.blocks))) { /* Read and process repl data block */ replDataBufBlock *o = listNodeValue(cur); used = o->used; - c->querybuf = sdscatlen(c->querybuf, o->buf, used); - c->repl_data->read_reploff += used; - processInputBuffer(c); - server.pending_repl_data.len -= used; + link->client->querybuf = sdscatlen(link->client->querybuf, o->buf, used); + link->client->repl_data->read_reploff += used; + processInputBuffer(link->client); + link->pending_repl_data.len -= used; offset += used; - listDelNode(server.pending_repl_data.blocks, cur); + listDelNode(link->pending_repl_data.blocks, cur); replStreamProgressCallback(offset, used, &last_progress_callback); } + link->protected = 0; blockingOperationEnds(); - if (!server.pending_repl_data.blocks) { + + if (link->state == REPL_STATE_CANCELLED) { /* If we encounter a `replicaof` command during the replStreamProgressCallback, - * pending_repl_data.blocks will be NULL, and we should return an error and - * abort the current sync session. */ + * we should return an error and abort the current sync session. */ return C_ERR; } return C_OK; @@ -3005,65 +3149,64 @@ int streamReplDataBufToDb(client *c) { /* Replication: Replica side. * After done loading the snapshot using the rdb-channel prepare this replica for steady state by * initializing the primary client, amd stream local incremental buffer into memory. 
*/ -void dualChannelSyncSuccess(void) { - server.primary_initial_offset = server.repl_provisional_primary.reploff; - replicationResurrectProvisionalPrimary(); +int dualChannelSyncSuccess(replicationLink *link) { + link->initial_offset = link->provisional_source_state.reploff; + replicationResurrectProvisionalSource(link); /* Wait for the accumulated buffer to be processed before reading any more replication updates */ - if (server.pending_repl_data.blocks && streamReplDataBufToDb(server.primary) == C_ERR) { + if (link->pending_repl_data.blocks && streamReplDataBufToDb(link) == C_ERR) { /* Sync session aborted during repl data streaming. */ dualChannelServerLog(LL_WARNING, "Failed to stream local replication buffer into memory"); /* Verify sync is still in progress */ - if (server.repl_rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) { - replicationAbortDualChannelSyncTransfer(); - replicationUnsetPrimary(); + if (link->rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) { + replicationAbortDualChannelSyncTransfer(link); } - return; + return C_ERR; } - freePendingReplDataBuf(); + freePendingReplDataBuf(link); dualChannelServerLog(LL_NOTICE, "Successfully streamed replication data into memory"); /* We can resume reading from the primary connection once the local replication buffer has been loaded. */ - replicationSteadyStateInit(); - replicationSendAck(); /* Send ACK to notify primary that replica is synced */ - server.rdb_client_id = -1; - server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_STATE_NONE; + replicationSteadyStateInit(link); + replicationSendAck(link); /* Send ACK to notify primary that replica is synced */ + link->rdb_client_id = -1; + link->rdb_channel_state = REPL_DUAL_CHANNEL_STATE_NONE; + return C_OK; } /* Replication: Replica side. * Main channel successfully established psync with primary. Check whether the rdb channel * has completed its part and act accordingly. */ -int dualChannelSyncHandlePsync(void) { - serverAssert(server.repl_state == REPL_STATE_RECEIVE_PSYNC_REPLY); - if (server.repl_rdb_channel_state < REPL_DUAL_CHANNEL_RDB_LOADED) { +int dualChannelSyncHandlePsync(replicationLink *link) { + serverAssert(link->state == REPL_STATE_RECEIVE_PSYNC_REPLY); + if (link->rdb_channel_state < REPL_DUAL_CHANNEL_RDB_LOADED) { /* RDB is still loading */ - if (connSetReadHandler(server.repl_provisional_primary.conn, bufferReplData) == C_ERR) { + if (connSetReadHandler(link->transfer_s, bufferReplData) == C_ERR) { dualChannelServerLog(LL_WARNING, "Error while setting readable handler: %s", strerror(errno)); - cancelReplicationHandshake(1); + cancelReplicationHandshake(link, 1); return C_ERR; } - replDataBufInit(); + replDataBufInit(link); return C_OK; } - serverAssert(server.repl_rdb_channel_state == REPL_DUAL_CHANNEL_RDB_LOADED); + serverAssert(link->rdb_channel_state == REPL_DUAL_CHANNEL_RDB_LOADED); /* RDB is loaded */ dualChannelServerLog(LL_DEBUG, "Psync established after rdb load"); - dualChannelSyncSuccess(); + dualChannelSyncSuccess(link); return C_OK; } /* Replication: Replica side. * RDB channel done loading the RDB. Check whether the main channel has completed its part * and act accordingly. 
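 * Two orderings are possible: if the main channel has not reached
 * REPL_STATE_TRANSFER yet, we only mark the link as REPL_DUAL_CHANNEL_RDB_LOADED
 * and let dualChannelSyncHandlePsync() finish the job later; otherwise the RDB
 * side arrived last and the sync is completed here.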
*/ -void dualChannelSyncHandleRdbLoadCompletion(void) { - serverAssert(server.repl_rdb_channel_state == REPL_DUAL_CHANNEL_RDB_LOAD); - if (server.repl_state < REPL_STATE_TRANSFER) { +int dualChannelSyncHandleRdbLoadCompletion(replicationLink *link) { + serverAssert(link->rdb_channel_state == REPL_DUAL_CHANNEL_RDB_LOAD); + if (link->state < REPL_STATE_TRANSFER) { /* Main psync channel hasn't been established yet */ - server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_RDB_LOADED; - return; + link->rdb_channel_state = REPL_DUAL_CHANNEL_RDB_LOADED; + return C_OK; } - serverAssert(server.repl_state == REPL_STATE_TRANSFER); - connSetReadHandler(server.repl_transfer_s, NULL); - dualChannelSyncSuccess(); - return; + serverAssert(link->state == REPL_STATE_TRANSFER); + connSetReadHandler(link->transfer_s, NULL); + return dualChannelSyncSuccess(link); } /* Try a partial resynchronization with the primary if we are about to reconnect. @@ -3121,8 +3264,8 @@ void dualChannelSyncHandleRdbLoadCompletion(void) { #define PSYNC_NOT_SUPPORTED 4 #define PSYNC_TRY_LATER 5 #define PSYNC_FULLRESYNC_DUAL_CHANNEL 6 -int replicaTryPartialResynchronization(connection *conn, int read_reply) { - char *psync_replid; +int replicaTryPartialResynchronization(replicationLink *link, int read_reply) { + char *psync_replid = NULL; char psync_offset[32]; sds reply; @@ -3133,21 +3276,25 @@ int replicaTryPartialResynchronization(connection *conn, int read_reply) { * a FULL resync using the PSYNC command we'll set the offset at the * right value, so that this information will be propagated to the * client structure representing the primary into server.primary. */ - server.primary_initial_offset = -1; + link->initial_offset = -1; - if (server.repl_rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) { + if (link->rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) { /* While in dual channel replication, we should use our prepared repl id and offset. 
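 * For illustration: with a provisional replid R and a provisional reploff of
 * 1000, the request sent on the main channel is
 *
 *   PSYNC R 1001
 *
 * i.e. the requested offset is the provisional reploff plus one.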
*/ - psync_replid = server.repl_provisional_primary.replid; - snprintf(psync_offset, sizeof(psync_offset), "%lld", server.repl_provisional_primary.reploff + 1); + psync_replid = link->provisional_source_state.replid; + snprintf(psync_offset, sizeof(psync_offset), "%lld", link->provisional_source_state.reploff + 1); dualChannelServerLog(LL_NOTICE, "Trying a partial resynchronization using main channel (request %s:%s).", psync_replid, psync_offset); + } else if (link != server.primary) { + serverLog(LL_NOTICE, "Partial resynchronization not attempted (not primary replication)"); } else if (server.cached_primary) { psync_replid = server.cached_primary->repl_data->replid; snprintf(psync_offset, sizeof(psync_offset), "%lld", server.cached_primary->repl_data->reploff + 1); serverLog(LL_NOTICE, "Trying a partial resynchronization (request %s:%s).", psync_replid, psync_offset); } else { serverLog(LL_NOTICE, "Partial resynchronization not possible (no cached primary)"); + } + if (!psync_replid) { psync_replid = "?"; memcpy(psync_offset, "-1", 3); } @@ -3155,26 +3302,26 @@ int replicaTryPartialResynchronization(connection *conn, int read_reply) { /* Issue the PSYNC command, if this is a primary with a failover in * progress then send the failover argument to the replica to cause it * to become a primary */ - if (server.failover_state == FAILOVER_IN_PROGRESS) { - reply = sendCommand(conn, "PSYNC", psync_replid, psync_offset, "FAILOVER", NULL); + if (server.failover_state == FAILOVER_IN_PROGRESS && link == server.primary) { + reply = sendCommand(link->transfer_s, "PSYNC", psync_replid, psync_offset, "FAILOVER", NULL); } else { - reply = sendCommand(conn, "PSYNC", psync_replid, psync_offset, NULL); + reply = sendCommand(link->transfer_s, "PSYNC", psync_replid, psync_offset, NULL); } if (reply != NULL) { - serverLog(LL_WARNING, "Unable to send PSYNC to primary: %s", reply); + serverLog(LL_WARNING, "Unable to send PSYNC to source: %s", reply); sdsfree(reply); - connSetReadHandler(conn, NULL); + connSetReadHandler(link->transfer_s, NULL); return PSYNC_WRITE_ERROR; } return PSYNC_WAIT_REPLY; } /* Reading half */ - reply = receiveSynchronousResponse(conn); + reply = receiveSynchronousResponse(link->transfer_s); /* Primary did not reply to PSYNC */ if (reply == NULL) { - connSetReadHandler(conn, NULL); + connSetReadHandler(link->transfer_s, NULL); serverLog(LL_WARNING, "Primary did not reply to PSYNC, will try later"); return PSYNC_TRY_LATER; } @@ -3186,7 +3333,7 @@ int replicaTryPartialResynchronization(connection *conn, int read_reply) { return PSYNC_WAIT_REPLY; } - connSetReadHandler(conn, NULL); + connSetReadHandler(link->transfer_s, NULL); if (!strncmp(reply, "+FULLRESYNC", 11)) { char *replid = NULL, *offset = NULL; @@ -3205,24 +3352,31 @@ int replicaTryPartialResynchronization(connection *conn, int read_reply) { * reply means that the primary supports PSYNC, but the reply * format seems wrong. To stay safe we blank the primary * replid to make sure next PSYNCs will fail. 
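 * For reference, a well formed reply is expected to look like:
 *
 *   +FULLRESYNC <40 hex character replid> <offset>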
*/ - memset(server.primary_replid, 0, CONFIG_RUN_ID_SIZE + 1); + memset(link->replid, 0, CONFIG_RUN_ID_SIZE + 1); } else { - memcpy(server.primary_replid, replid, offset - replid - 1); - server.primary_replid[CONFIG_RUN_ID_SIZE] = '\0'; - server.primary_initial_offset = strtoll(offset, NULL, 10); - serverLog(LL_NOTICE, "Full resync from primary: %s:%lld", server.primary_replid, - server.primary_initial_offset); + memcpy(link->replid, replid, offset - replid - 1); + link->replid[CONFIG_RUN_ID_SIZE] = '\0'; + link->initial_offset = strtoll(offset, NULL, 10); + serverLog(LL_NOTICE, "Full resync from primary: %s:%lld", link->replid, + link->initial_offset); } sdsfree(reply); return PSYNC_FULLRESYNC; } if (!strncmp(reply, "+CONTINUE", 9)) { - if (server.repl_rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) { - /* During dual channel sync sesseion, primary struct is already initialized. */ + if (link->rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) { + /* During dual channel sync session, primary struct is already initialized. */ sdsfree(reply); return PSYNC_CONTINUE; } + if (link != server.primary) { + /* Continuing from a cached primary should only happen when we are syncing for primary replication. */ + sdsfree(reply); + serverLog(LL_WARNING, "Received +CONTINUE response to PSYNC when not doing replication and not performing dual channel sync. Failing PSYNC."); + return PSYNC_NOT_SUPPORTED; + } + /* Partial resync was accepted. */ serverLog(LL_NOTICE, "Successful partial resynchronization with primary."); @@ -3259,7 +3413,7 @@ int replicaTryPartialResynchronization(connection *conn, int read_reply) { /* Setup the replication to continue. */ sdsfree(reply); - replicationResurrectCachedPrimary(conn); + replicationResurrectCachedPrimary(link); /* If this instance was restarted and we read the metadata to * PSYNC from the persistence file, our replication backlog could @@ -3320,16 +3474,16 @@ sds getTryPsyncString(int result) { } } -int dualChannelReplMainConnSendHandshake(connection *conn, sds *err) { +int dualChannelReplMainConnSendHandshake(replicationLink *link, sds *err) { char llstr[LONG_STR_SIZE]; - ull2string(llstr, sizeof(llstr), server.rdb_client_id); - *err = sendCommand(conn, "REPLCONF", "set-rdb-client-id", llstr, NULL); + ull2string(llstr, sizeof(llstr), link->rdb_client_id); + *err = sendCommand(link->transfer_s, "REPLCONF", "set-rdb-client-id", llstr, NULL); if (*err) return C_ERR; return C_OK; } -int dualChannelReplMainConnRecvCapaReply(connection *conn, sds *err) { - *err = receiveSynchronousResponse(conn); +int dualChannelReplMainConnRecvCapaReply(replicationLink *link, sds *err) { + *err = receiveSynchronousResponse(link->transfer_s); if (*err == NULL) return C_ERR; if ((*err)[0] == '-') { dualChannelServerLog(LL_NOTICE, "Primary does not understand REPLCONF identify: %s", *err); @@ -3338,28 +3492,28 @@ int dualChannelReplMainConnRecvCapaReply(connection *conn, sds *err) { return C_OK; } -int dualChannelReplMainConnSendPsync(connection *conn, sds *err) { +int dualChannelReplMainConnSendPsync(replicationLink *link, sds *err) { if (server.debug_pause_after_fork) debugPauseProcess(); - if (replicaTryPartialResynchronization(conn, 0) == PSYNC_WRITE_ERROR) { + if (replicaTryPartialResynchronization(link, 0) == PSYNC_WRITE_ERROR) { dualChannelServerLog(LL_WARNING, "Aborting dual channel sync. 
Write error."); - *err = sdsnew(connGetLastError(conn)); + *err = sdsnew(connGetLastError(link->transfer_s)); return C_ERR; } return C_OK; } -int dualChannelReplMainConnRecvPsyncReply(connection *conn, sds *err) { - int psync_result = replicaTryPartialResynchronization(conn, 1); +int dualChannelReplMainConnRecvPsyncReply(replicationLink *link, sds *err) { + int psync_result = replicaTryPartialResynchronization(link, 1); if (psync_result == PSYNC_WAIT_REPLY) return C_OK; /* Try again later... */ if (psync_result == PSYNC_CONTINUE) { dualChannelServerLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Primary accepted a Partial Resynchronization%s", - server.repl_rdb_transfer_s != NULL ? ", RDB load in background." : "."); + link->rdb_transfer_s != NULL ? ", RDB load in background." : "."); if (server.supervised_mode == SUPERVISED_SYSTEMD) { serverCommunicateSystemd("STATUS=PRIMARY <-> REPLICA sync: Partial Resynchronization accepted. Ready to " "accept connections in read-write mode.\n"); } - dualChannelSyncHandlePsync(); + dualChannelSyncHandlePsync(link); return C_OK; } *err = getTryPsyncString(psync_result); @@ -3373,37 +3527,39 @@ void dualChannelSetupMainConnForPsync(connection *conn) { char *err = NULL; int ret; - switch (server.repl_state) { + replicationLink *link = (replicationLink *)connGetPrivateData(conn); + + switch (link->state) { case REPL_STATE_SEND_HANDSHAKE: - ret = dualChannelReplMainConnSendHandshake(conn, &err); - if (ret == C_OK) server.repl_state = REPL_STATE_RECEIVE_CAPA_REPLY; + ret = dualChannelReplMainConnSendHandshake(link, &err); + if (ret == C_OK) link->state = REPL_STATE_RECEIVE_CAPA_REPLY; break; case REPL_STATE_RECEIVE_CAPA_REPLY: - ret = dualChannelReplMainConnRecvCapaReply(conn, &err); + ret = dualChannelReplMainConnRecvCapaReply(link, &err); if (ret == C_ERR) { break; } - if (ret == C_OK) server.repl_state = REPL_STATE_SEND_PSYNC; + if (ret == C_OK) link->state = REPL_STATE_SEND_PSYNC; sdsfree(err); err = NULL; /* fall through */ case REPL_STATE_SEND_PSYNC: - ret = dualChannelReplMainConnSendPsync(conn, &err); - if (ret == C_OK) server.repl_state = REPL_STATE_RECEIVE_PSYNC_REPLY; + ret = dualChannelReplMainConnSendPsync(link, &err); + if (ret == C_OK) link->state = REPL_STATE_RECEIVE_PSYNC_REPLY; break; case REPL_STATE_RECEIVE_PSYNC_REPLY: - ret = dualChannelReplMainConnRecvPsyncReply(conn, &err); - if (ret == C_OK && server.repl_rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) - server.repl_state = REPL_STATE_TRANSFER; - /* In case the RDB is already loaded, the repl_state will be set during establishPrimaryConnection. */ + ret = dualChannelReplMainConnRecvPsyncReply(link, &err); + if (ret == C_OK && link->rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) + link->state = REPL_STATE_TRANSFER; + /* In case the RDB is already loaded, the repl_state will be set during establishSourceConnection. */ break; default: - serverPanic("Unexpected replication state: %d", server.repl_state); + serverPanic("Unexpected replication state: %d", link->state); } if (ret == C_ERR) { dualChannelServerLog(LL_WARNING, "Aborting dual channel sync. Main channel psync result %d %s", ret, err ? err : ""); - cancelReplicationHandshake(1); + cancelReplicationHandshake(link, 1); } sdsfree(err); } @@ -3486,16 +3642,11 @@ void dualChannelSetupMainConnForPsync(connection *conn) { */ /* This handler fires when the non blocking connect was able to * establish a connection with the primary. 
*/ -void syncWithPrimary(connection *conn) { +void syncWithSource(connection *conn) { char tmpfile[256], *err = NULL; int psync_result; - /* If this event fired after the user turned the instance into a primary - * with REPLICAOF NO ONE we must just return ASAP. */ - if (server.repl_state == REPL_STATE_NONE) { - connClose(conn); - return; - } + replicationLink *link = (replicationLink *)connGetPrivateData(conn); /* Check for errors in the socket: after a non blocking connect() we * may find that the socket is in error state. */ @@ -3505,13 +3656,13 @@ void syncWithPrimary(connection *conn) { } /* Send a PING to check the primary is able to reply without errors. */ - if (server.repl_state == REPL_STATE_CONNECTING) { + if (link->state == REPL_STATE_CONNECTING) { serverLog(LL_NOTICE, "Non blocking connect for SYNC fired the event."); /* Delete the writable event so that the readable event remains * registered and we can wait for the PONG reply. */ - connSetReadHandler(conn, syncWithPrimary); + connSetReadHandler(conn, syncWithSource); connSetWriteHandler(conn, NULL); - server.repl_state = REPL_STATE_RECEIVE_PING_REPLY; + link->state = REPL_STATE_RECEIVE_PING_REPLY; /* Send the PING, don't check for errors at all, we have the timeout * that will take care about this. */ err = sendCommand(conn, "PING", NULL); @@ -3520,7 +3671,7 @@ void syncWithPrimary(connection *conn) { } /* Receive the PONG command. */ - if (server.repl_state == REPL_STATE_RECEIVE_PING_REPLY) { + if (link->state == REPL_STATE_RECEIVE_PING_REPLY) { err = receiveSynchronousResponse(conn); /* The primary did not reply */ @@ -3541,10 +3692,10 @@ void syncWithPrimary(connection *conn) { } sdsfree(err); err = NULL; - server.repl_state = REPL_STATE_SEND_HANDSHAKE; + link->state = REPL_STATE_SEND_HANDSHAKE; } - if (server.repl_state == REPL_STATE_SEND_HANDSHAKE) { + if (link->state == REPL_STATE_SEND_HANDSHAKE) { /* AUTH with the primary if required. */ if (server.primary_auth) { char *args[3] = {"AUTH", NULL, NULL}; @@ -3579,6 +3730,19 @@ void syncWithPrimary(connection *conn) { if (err) goto write_error; } + /* Set the slot number, so that the primary only provides us with the appropriate slot dictionary. */ + if (link->slot_ranges != NULL) { + char *argv[3] = {"REPLCONF", "slot-bitmap", NULL}; + size_t lens[3] = {8, 11, 0}; + unsigned char slot_bitmap[CLUSTER_SLOTS/8 + 1] = {0}; + slotRangesToBitmap(link->slot_ranges, slot_bitmap); + slot_bitmap[CLUSTER_SLOTS/8] = '\0'; + argv[2] = (char *)slot_bitmap; + lens[2] = CLUSTER_SLOTS/8; + err = sendCommandArgv(conn, 3, argv, lens); + if (err) goto write_error; + } + /* Inform the primary of our (replica) capabilities. * * EOF: supports EOF-style RDB transfer for diskless replication. @@ -3594,30 +3758,30 @@ void syncWithPrimary(connection *conn) { err = sendCommand(conn, "REPLCONF", "version", VALKEY_VERSION, NULL); if (err) goto write_error; - server.repl_state = REPL_STATE_RECEIVE_AUTH_REPLY; + link->state = REPL_STATE_RECEIVE_AUTH_REPLY; return; } - if (server.repl_state == REPL_STATE_RECEIVE_AUTH_REPLY && !server.primary_auth) - server.repl_state = REPL_STATE_RECEIVE_PORT_REPLY; + if (link->state == REPL_STATE_RECEIVE_AUTH_REPLY && !server.primary_auth) + link->state = REPL_STATE_RECEIVE_PORT_REPLY; /* Receive AUTH reply. 
*/ - if (server.repl_state == REPL_STATE_RECEIVE_AUTH_REPLY) { + if (link->state == REPL_STATE_RECEIVE_AUTH_REPLY) { err = receiveSynchronousResponse(conn); if (err == NULL) goto no_response_error; if (err[0] == '-') { - serverLog(LL_WARNING, "Unable to AUTH to PRIMARY: %s", err); + serverLog(LL_WARNING, "Unable to AUTH to %s: %s", replicationGetNameForLogs(link), err); sdsfree(err); goto error; } sdsfree(err); err = NULL; - server.repl_state = REPL_STATE_RECEIVE_PORT_REPLY; + link->state = REPL_STATE_RECEIVE_PORT_REPLY; return; } /* Receive REPLCONF listening-port reply. */ - if (server.repl_state == REPL_STATE_RECEIVE_PORT_REPLY) { + if (link->state == REPL_STATE_RECEIVE_PORT_REPLY) { err = receiveSynchronousResponse(conn); if (err == NULL) goto no_response_error; /* Ignore the error if any, not all the Redis OSS versions support @@ -3629,15 +3793,15 @@ void syncWithPrimary(connection *conn) { err); } sdsfree(err); - server.repl_state = REPL_STATE_RECEIVE_IP_REPLY; + link->state = REPL_STATE_RECEIVE_IP_REPLY; return; } - if (server.repl_state == REPL_STATE_RECEIVE_IP_REPLY && !server.replica_announce_ip) - server.repl_state = REPL_STATE_RECEIVE_CAPA_REPLY; + if (link->state == REPL_STATE_RECEIVE_IP_REPLY && !server.replica_announce_ip) + link->state = REPL_STATE_RECEIVE_SLOT_REPLY; /* Receive REPLCONF ip-address reply. */ - if (server.repl_state == REPL_STATE_RECEIVE_IP_REPLY) { + if (link->state == REPL_STATE_RECEIVE_IP_REPLY) { err = receiveSynchronousResponse(conn); if (err == NULL) goto no_response_error; /* Ignore the error if any, not all the Redis OSS versions support @@ -3649,42 +3813,59 @@ void syncWithPrimary(connection *conn) { err); } sdsfree(err); - server.repl_state = REPL_STATE_RECEIVE_CAPA_REPLY; + link->state = REPL_STATE_RECEIVE_SLOT_REPLY; + return; + } + + if (link->state == REPL_STATE_RECEIVE_SLOT_REPLY && link->slot_ranges == NULL) + link->state = REPL_STATE_RECEIVE_CAPA_REPLY; + + if (link->state == REPL_STATE_RECEIVE_SLOT_REPLY) { + err = receiveSynchronousResponse(conn); + if (err == NULL) goto no_response_error; + /* If we sent the slot number, we need it to be properly acked, or we can't do slot migration. */ + if (err[0] == '-') { + serverLog(LL_WARNING, "Source does not understand REPLCONF slot-num. Cannot continue with slot-level sync: %s", err); + sdsfree(err); + goto error; + } + sdsfree(err); + link->state = REPL_STATE_RECEIVE_CAPA_REPLY; return; } /* Receive CAPA reply. */ - if (server.repl_state == REPL_STATE_RECEIVE_CAPA_REPLY) { + if (link->state == REPL_STATE_RECEIVE_CAPA_REPLY) { err = receiveSynchronousResponse(conn); if (err == NULL) goto no_response_error; /* Ignore the error if any, not all the Redis OSS versions support * REPLCONF capa. */ if (err[0] == '-') { serverLog(LL_NOTICE, - "(Non critical) Primary does not understand " + "(Non critical) Source does not understand " "REPLCONF capa: %s", err); } sdsfree(err); err = NULL; - server.repl_state = REPL_STATE_RECEIVE_VERSION_REPLY; + link->state = REPL_STATE_RECEIVE_VERSION_REPLY; return; } /* Receive VERSION reply. */ - if (server.repl_state == REPL_STATE_RECEIVE_VERSION_REPLY) { + if (link->state == REPL_STATE_RECEIVE_VERSION_REPLY) { err = receiveSynchronousResponse(conn); if (err == NULL) goto no_response_error; /* Ignore the error if any. Valkey >= 8 supports REPLCONF VERSION. 
*/ if (err[0] == '-') { serverLog(LL_NOTICE, - "(Non critical) Primary does not understand " + "(Non critical) Source does not understand " "REPLCONF VERSION: %s", err); } sdsfree(err); err = NULL; - server.repl_state = REPL_STATE_SEND_PSYNC; + link->state = REPL_STATE_SEND_PSYNC; } /* Try a partial resynchronization. If we don't have a cached primary @@ -3692,32 +3873,32 @@ void syncWithPrimary(connection *conn) { * to start a full resynchronization so that we get the primary replid * and the global offset, to try a partial resync at the next * reconnection attempt. */ - if (server.repl_state == REPL_STATE_SEND_PSYNC) { - if (replicaTryPartialResynchronization(conn, 0) == PSYNC_WRITE_ERROR) { + if (link->state == REPL_STATE_SEND_PSYNC) { + if (replicaTryPartialResynchronization(link, 0) == PSYNC_WRITE_ERROR) { err = sdsnew("Write error sending the PSYNC command."); abortFailover("Write error to failover target"); goto write_error; } - server.repl_state = REPL_STATE_RECEIVE_PSYNC_REPLY; + link->state = REPL_STATE_RECEIVE_PSYNC_REPLY; return; } /* If reached this point, we should be in REPL_STATE_RECEIVE_PSYNC_REPLY. */ - if (server.repl_state != REPL_STATE_RECEIVE_PSYNC_REPLY) { + if (link->state != REPL_STATE_RECEIVE_PSYNC_REPLY) { serverLog(LL_WARNING, - "syncWithPrimary(): state machine error, " + "syncWithSource(): state machine error, " "state should be RECEIVE_PSYNC but is %d", - server.repl_state); + link->state); goto error; } - psync_result = replicaTryPartialResynchronization(conn, 1); + psync_result = replicaTryPartialResynchronization(link, 1); if (psync_result == PSYNC_WAIT_REPLY) return; /* Try again later... */ /* Check the status of the planned failover. We expect PSYNC_CONTINUE, * but there is nothing technically wrong with a full resync which * could happen in edge cases. */ - if (server.failover_state == FAILOVER_IN_PROGRESS) { + if (server.failover_state == FAILOVER_IN_PROGRESS && link == server.primary) { if (psync_result == PSYNC_CONTINUE || psync_result == PSYNC_FULLRESYNC) { clearFailoverState(); } else { @@ -3750,13 +3931,13 @@ void syncWithPrimary(connection *conn) { if (psync_result == PSYNC_NOT_SUPPORTED) { serverLog(LL_NOTICE, "Retrying with SYNC..."); if (connSyncWrite(conn, "SYNC\r\n", 6, server.repl_syncio_timeout * 1000) == -1) { - serverLog(LL_WARNING, "I/O error writing to PRIMARY: %s", connGetLastError(conn)); + serverLog(LL_WARNING, "I/O error writing to %s: %s", replicationGetNameForLogs(link), connGetLastError(conn)); goto error; } } /* Prepare a suitable temp file for bulk transfer */ - if (!useDisklessLoad()) { + if (!useDisklessLoad() && link->slot_ranges == NULL) { int dfd = -1, maxtries = 5; while (maxtries--) { snprintf(tmpfile, 256, "temp-%d.%ld.rdb", (int)server.unixtime, (long int)getpid()); @@ -3769,24 +3950,30 @@ void syncWithPrimary(connection *conn) { errno = saved_errno; } if (dfd == -1) { - serverLog(LL_WARNING, "Opening the temp file needed for PRIMARY <-> REPLICA synchronization: %s", + serverLog(LL_WARNING, "Opening the temp file needed for %s <-> REPLICA synchronization: %s", replicationGetNameForLogs(link), strerror(errno)); goto error; } - server.repl_transfer_tmpfile = zstrdup(tmpfile); - server.repl_transfer_fd = dfd; + link->transfer_tmpfile = zstrdup(tmpfile); + link->transfer_fd = dfd; + } + + /* We are going to need to do a full resync. 
If we are accepting a single + * slot - make sure we have a clean slate to load it into.*/ + if (link->slot_ranges != NULL) { + dropKeysInSlotRanges(link->slot_ranges, 1); } /* Using dual-channel-replication, the primary responded +DUALCHANNELSYNC. We need to * initialize the RDB channel. */ if (psync_result == PSYNC_FULLRESYNC_DUAL_CHANNEL) { /* Create RDB connection */ - server.repl_rdb_transfer_s = connCreate(connTypeOfReplication()); - if (connConnect(server.repl_rdb_transfer_s, server.primary_host, server.primary_port, server.bind_source_addr, - dualChannelFullSyncWithPrimary) == C_ERR) { - serverLog(LL_WARNING, "Unable to connect to Primary: %s", connGetLastError(server.repl_transfer_s)); - connClose(server.repl_rdb_transfer_s); - server.repl_rdb_transfer_s = NULL; + link->rdb_transfer_s = connCreate(connTypeOfReplication()); + if (connConnect(link->rdb_transfer_s, link->host, link->port, server.bind_source_addr, + dualChannelFullSyncWithReplicationSource) == C_ERR) { + serverLog(LL_WARNING, "Unable to connect to source: %s", connGetLastError(link->transfer_s)); + connClose(link->rdb_transfer_s); + link->rdb_transfer_s = NULL; goto error; } if (connSetReadHandler(conn, NULL) == C_ERR) { @@ -3795,22 +3982,27 @@ void syncWithPrimary(connection *conn) { connGetInfo(conn, conninfo, sizeof(conninfo))); goto error; } - server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_SEND_HANDSHAKE; + link->rdb_channel_state = REPL_DUAL_CHANNEL_SEND_HANDSHAKE; return; } - /* Setup the non blocking download of the bulk file. */ - if (connSetReadHandler(conn, readSyncBulkPayload) == C_ERR) { - char conninfo[CONN_INFO_LEN]; - serverLog(LL_WARNING, "Can't create readable event for SYNC: %s (%s)", strerror(errno), - connGetInfo(conn, conninfo, sizeof(conninfo))); - goto error; + if (replicationUseAOFFormatSnapshot(link)) { + link->client = createReplicationLinkClientWithHandler(link, conn, -1, readQueryFromClient); + link->transfer_s = NULL; + } else { + /* Setup the non blocking download of the bulk file. 
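 * (This is the classic RDB-file path; in the AOF-format snapshot branch above
 * the link instead attaches a client with readQueryFromClient, so the snapshot
 * is consumed as an ordinary command stream.)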
*/ + if (connSetReadHandler(conn, readSyncBulkPayload) == C_ERR) { + char conninfo[CONN_INFO_LEN]; + serverLog(LL_WARNING, "Can't create readable event for SYNC: %s (%s)", strerror(errno), + connGetInfo(conn, conninfo, sizeof(conninfo))); + goto error; + } } - server.repl_state = REPL_STATE_TRANSFER; - server.repl_transfer_size = -1; - server.repl_transfer_read = 0; - server.repl_transfer_last_fsync_off = 0; - server.repl_transfer_lastio = server.unixtime; + link->state = REPL_STATE_TRANSFER; + link->transfer_size = -1; + link->transfer_read = 0; + link->transfer_last_fsync_off = 0; + link->transfer_lastio = server.unixtime; return; no_response_error: /* Handle receiveSynchronousResponse() error when primary has no reply */ @@ -3819,16 +4011,16 @@ void syncWithPrimary(connection *conn) { error: connClose(conn); - server.repl_transfer_s = NULL; - if (server.repl_rdb_transfer_s) { - connClose(server.repl_rdb_transfer_s); - server.repl_rdb_transfer_s = NULL; - } - if (server.repl_transfer_fd != -1) close(server.repl_transfer_fd); - if (server.repl_transfer_tmpfile) zfree(server.repl_transfer_tmpfile); - server.repl_transfer_tmpfile = NULL; - server.repl_transfer_fd = -1; - server.repl_state = REPL_STATE_CONNECT; + link->transfer_s = NULL; + if (link->rdb_transfer_s) { + connClose(link->rdb_transfer_s); + link->rdb_transfer_s = NULL; + } + if (link->transfer_fd != -1) close(link->transfer_fd); + if (link->transfer_tmpfile) zfree(link->transfer_tmpfile); + link->transfer_tmpfile = NULL; + link->transfer_fd = -1; + link->state = REPL_STATE_CONNECT; return; write_error: /* Handle sendCommand() errors. */ @@ -3837,20 +4029,108 @@ void syncWithPrimary(connection *conn) { goto error; } -int connectWithPrimary(void) { - server.repl_transfer_s = connCreate(connTypeOfReplication()); - if (connConnect(server.repl_transfer_s, server.primary_host, server.primary_port, server.bind_source_addr, - syncWithPrimary) == C_ERR) { - serverLog(LL_WARNING, "Unable to connect to PRIMARY: %s", connGetLastError(server.repl_transfer_s)); - connClose(server.repl_transfer_s); - server.repl_transfer_s = NULL; - return C_ERR; +replicationLink *createReplicationLink(char *host, int port, list *slot_ranges) { + replicationLink *result = (replicationLink *)zmalloc(sizeof(replicationLink)); + result->protected = 0; + result->state = REPL_STATE_NONE; + result->rdb_channel_state = REPL_DUAL_CHANNEL_STATE_NONE; + result->slot_ranges = slot_ranges; + result->client = NULL; + result->host = sdsnew(host); + result->port = port; + result->transfer_s = NULL; + result->rdb_transfer_s = NULL; + result->rdb_client_id = -1; + result->replid[0] = '\0'; + result->initial_offset = -1; + result->transfer_size = 0; + result->transfer_read = 0; + result->transfer_last_fsync_off = 0; + result->transfer_fd = -1; + result->transfer_tmpfile = NULL; + result->transfer_lastio = 0; + result->provisional_source_state.replid[0] = '\0'; + result->provisional_source_state.reploff = -1; + result->provisional_source_state.read_reploff = -1; + result->provisional_source_state.dbid = -1; + result->pending_repl_data.blocks = NULL; + result->pending_repl_data.len = 0; + result->pending_repl_data.peak = 0; + listAddNodeTail(server.replication_links, result); + return result; +} + + +int freeReplicationLink(replicationLink *link) { + if (!link) return 0; + + /* Free primary_host before any calls to freeClient since it calls + * replicationHandleSourceDisconnection which can trigger a re-connect + * directly from within that call. 
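 * With link->host set to NULL, replicationHandleSourceDisconnection() will not
 * attempt an automatic reconnect for this link (it only reconnects while the
 * host is non-NULL, see below).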
*/ + sdsfree(link->host); + link->host = NULL; + + cancelReplicationHandshake(link, 0); + if (link->client) { + freeClient(link->client); + link->client = NULL; } + if (link->transfer_s) { + connClose(link->transfer_s); + link->transfer_s = NULL; + } + if (link->rdb_transfer_s) { + connClose(link->rdb_transfer_s); + link->rdb_transfer_s = NULL; + } + if (link->transfer_tmpfile) { + zfree(link->transfer_tmpfile); + link->transfer_tmpfile = NULL; + } + if (link->transfer_fd != -1) { + close(link->transfer_fd); + link->transfer_fd = -1; + } + freePendingReplDataBuf(link); + + /* Unlink this replication link from the server list */ + listIter li; + listNode *ln; + listRewind(server.replication_links, &li); + while ((ln = listNext(&li))) { + replicationLink *elem = (replicationLink *)ln->value; + if (elem == link) { + listDelNode(server.replication_links, ln); + break; + } + } + + /* Keep the link intact if it is protected, but mark it as such */ + if (link->protected) { + link->state = REPL_STATE_CANCELLED; + return 0; + } + zfree(link); + return 1; +} + +int connectReplicationLink(replicationLink *link) { + if (!link) + return C_ERR; + + link->transfer_s = connCreate(connTypeOfReplication()); + connSetPrivateData(link->transfer_s, link); + if (connConnect(link->transfer_s, link->host, link->port, server.bind_source_addr, syncWithSource) == C_ERR) { + serverLog(LL_WARNING, "Unable to connect to %s: %s", replicationGetNameForLogs(link), connGetLastError(link->transfer_s)); + connClose(link->transfer_s); + link->transfer_s = NULL; + return C_ERR; + } - server.repl_transfer_lastio = server.unixtime; - server.repl_state = REPL_STATE_CONNECTING; - serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync started"); + link->transfer_lastio = server.unixtime; + link->state = REPL_STATE_CONNECTING; + serverLog(LL_NOTICE, "%s <-> REPLICA sync started", replicationGetNameForLogs(link)); return C_OK; } @@ -3858,23 +4138,27 @@ int connectWithPrimary(void) { * in progress to undo it. * Never call this function directly, use cancelReplicationHandshake() instead. */ -void undoConnectWithPrimary(void) { - connClose(server.repl_transfer_s); - server.repl_transfer_s = NULL; +void undoConnectWithSource(replicationLink *link) { + if (link->client) { + freeClient(link->client); + } else if (link->transfer_s) { + connClose(link->transfer_s); + link->transfer_s = NULL; + } } /* Abort the async download of the bulk dataset while SYNC-ing with primary. * Never call this function directly, use cancelReplicationHandshake() instead. */ -void replicationAbortSyncTransfer(void) { - serverAssert(server.repl_state == REPL_STATE_TRANSFER); - undoConnectWithPrimary(); - if (server.repl_transfer_fd != -1) { - close(server.repl_transfer_fd); - bg_unlink(server.repl_transfer_tmpfile); - zfree(server.repl_transfer_tmpfile); - server.repl_transfer_tmpfile = NULL; - server.repl_transfer_fd = -1; +void replicationAbortSyncTransfer(replicationLink *link) { + serverAssert(link->state == REPL_STATE_TRANSFER); + undoConnectWithSource(link); + if (link->transfer_fd != -1) { + close(link->transfer_fd); + bg_unlink(link->transfer_tmpfile); + zfree(link->transfer_tmpfile); + link->transfer_tmpfile = NULL; + link->transfer_fd = -1; } } @@ -3883,19 +4167,22 @@ void replicationAbortSyncTransfer(void) { * the initial bulk transfer. * * If there was a replication handshake in progress 1 is returned and - * the replication state (server.repl_state) set to REPL_STATE_CONNECT. + * the replication state (link->state) set to REPL_STATE_CONNECT. 
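 * With this patch the link may instead remain in REPL_STATE_CONNECTING when
 * aborting the transfer has already triggered a reconnect (see the note below).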
* * Otherwise zero is returned and no operation is performed at all. */ -int cancelReplicationHandshake(int reconnect) { - if (server.repl_rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) { - replicationAbortDualChannelSyncTransfer(); - } - if (server.repl_state == REPL_STATE_TRANSFER) { - replicationAbortSyncTransfer(); - server.repl_state = REPL_STATE_CONNECT; - } else if (server.repl_state == REPL_STATE_CONNECTING || replicaIsInHandshakeState()) { - undoConnectWithPrimary(); - server.repl_state = REPL_STATE_CONNECT; +int cancelReplicationHandshake(replicationLink *link, int reconnect) { + if (link->rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) { + replicationAbortDualChannelSyncTransfer(link); + } + if (link->state == REPL_STATE_TRANSFER) { + replicationAbortSyncTransfer(link); + /* Note that disconnection may already trigger reconnect */ + if (link->state == REPL_STATE_CONNECTING) + return 1; + link->state = REPL_STATE_CONNECT; + } else if (link->state == REPL_STATE_CONNECTING || replicaIsInHandshakeState(link)) { + undoConnectWithSource(link); + link->state = REPL_STATE_CONNECT; } else { return 0; } @@ -3904,34 +4191,32 @@ int cancelReplicationHandshake(int reconnect) { /* try to re-connect without waiting for replicationCron, this is needed * for the "diskless loading short read" test. */ - serverLog(LL_NOTICE, "Reconnecting to PRIMARY %s:%d after failure", server.primary_host, server.primary_port); - connectWithPrimary(); + serverLog(LL_NOTICE, "Reconnecting to replication source %s:%d after failure", link->host, link->port); + connectReplicationLink(link); return 1; } /* Set replication to the specified primary address and port. */ void replicationSetPrimary(char *ip, int port, int full_sync_required) { - int was_primary = server.primary_host == NULL; + int was_primary = server.primary == NULL; + int was_connected = server.primary->state == REPL_STATE_CONNECTED; - sdsfree(server.primary_host); - server.primary_host = NULL; if (server.primary) { /* When joining 'myself' to a new primary, set the dont_cache_primary flag * if a full sync is required. This happens when 'myself' was previously * part of a different shard from the new primary. Since 'myself' does not * have the replication history of the shard it is joining, clearing the * cached primary is necessary to ensure proper replication behavior. */ - server.primary->flag.dont_cache_primary = full_sync_required; - freeClient(server.primary); + server.primary->client->flag.dont_cache_primary = full_sync_required; + freeReplicationLink(server.primary); } disconnectAllBlockedClients(); /* Clients blocked in primary, now replica. */ /* Setting primary_host only after the call to freeClient since it calls - * replicationHandlePrimaryDisconnection which can trigger a re-connect + * replicationHandleSourceDisconnection which can trigger a re-connect * directly from within that call. */ - server.primary_host = sdsnew(ip); - server.primary_port = port; + server.primary = createReplicationLink(ip, port, NULL); /* Update oom_score_adj */ setOOMScoreAdj(-1); @@ -3942,8 +4227,6 @@ void replicationSetPrimary(char *ip, int port, int full_sync_required) { * primary, or finishing transferring RDB and preparing loading DB on full * sync with new primary. */ - cancelReplicationHandshake(0); - /* Before destroying our primary state, create a cached primary using * our own parameters, to later PSYNC with the new primary. 
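 * (In this patch replicationCachePrimaryUsingMyself() does this by building a
 * throw-away replicationLink just to synthesize the cached client; see below.)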
*/ if (was_primary && !full_sync_required) { @@ -3956,31 +4239,26 @@ void replicationSetPrimary(char *ip, int port, int full_sync_required) { NULL); /* Fire the primary link modules event. */ - if (server.repl_state == REPL_STATE_CONNECTED) + if (was_connected) moduleFireServerEvent(VALKEYMODULE_EVENT_PRIMARY_LINK_CHANGE, VALKEYMODULE_SUBEVENT_PRIMARY_LINK_DOWN, NULL); - server.repl_state = REPL_STATE_CONNECT; /* Allow trying dual-channel-replication with the new primary. If new primary doesn't * support dual-channel-replication, we will set to 0 afterwards. */ - serverLog(LL_NOTICE, "Connecting to PRIMARY %s:%d", server.primary_host, server.primary_port); - connectWithPrimary(); + serverLog(LL_NOTICE, "Connecting to PRIMARY %s:%d", server.primary->host, server.primary->port); + connectReplicationLink(server.primary); } /* Cancel replication, setting the instance as a primary itself. */ void replicationUnsetPrimary(void) { - if (server.primary_host == NULL) return; /* Nothing to do. */ + if (server.primary == NULL) return; /* Nothing to do. */ /* Fire the primary link modules event. */ - if (server.repl_state == REPL_STATE_CONNECTED) + if (server.primary->state == REPL_STATE_CONNECTED) moduleFireServerEvent(VALKEYMODULE_EVENT_PRIMARY_LINK_CHANGE, VALKEYMODULE_SUBEVENT_PRIMARY_LINK_DOWN, NULL); - /* Clear primary_host first, since the freeClient calls - * replicationHandlePrimaryDisconnection which can attempt to re-connect. */ - sdsfree(server.primary_host); - server.primary_host = NULL; - if (server.primary) freeClient(server.primary); + freeReplicationLink(server.primary); replicationDiscardCachedPrimary(); - cancelReplicationHandshake(0); + /* When a replica is turned into a primary, the current replication ID * (that was inherited from the primary at synchronization time) is * used as secondary ID up to the current offset, and a new replication @@ -3991,7 +4269,6 @@ void replicationUnsetPrimary(void) { * the replicas will be able to partially resync with us, so it will be * a very fast reconnection. */ disconnectReplicas(); - server.repl_state = REPL_STATE_NONE; /* We need to make sure the new primary will start the replication stream * with a SELECT statement. This is forced after a full resync, but @@ -4022,23 +4299,26 @@ void replicationUnsetPrimary(void) { /* This function is called when the replica lose the connection with the * primary into an unexpected way. */ -void replicationHandlePrimaryDisconnection(void) { - /* Fire the primary link modules event. */ - if (server.repl_state == REPL_STATE_CONNECTED) - moduleFireServerEvent(VALKEYMODULE_EVENT_PRIMARY_LINK_CHANGE, VALKEYMODULE_SUBEVENT_PRIMARY_LINK_DOWN, NULL); +void replicationHandleSourceDisconnection(replicationLink *link) { + if (link == server.primary) { + if (link->state == REPL_STATE_CONNECTED && link == server.primary) { + moduleFireServerEvent(VALKEYMODULE_EVENT_PRIMARY_LINK_CHANGE, VALKEYMODULE_SUBEVENT_PRIMARY_LINK_DOWN, NULL); + } + server.repl_down_since = server.unixtime; - server.primary = NULL; - server.repl_state = REPL_STATE_CONNECT; - server.repl_down_since = server.unixtime; - /* We lost connection with our primary, don't disconnect replicas yet, - * maybe we'll be able to PSYNC with our primary later. We'll disconnect - * the replicas only if we'll have to do a full resync with our primary. */ + /* We lost connection with our primary, don't disconnect replicas yet, + * maybe we'll be able to PSYNC with our primary later. 
We'll disconnect + * the replicas only if we'll have to do a full resync with our primary. */ + } + + link->client = NULL; + link->state = REPL_STATE_CONNECT; /* Try to re-connect immediately rather than wait for replicationCron * waiting 1 second may risk backlog being recycled. */ - if (server.primary_host) { - serverLog(LL_NOTICE, "Reconnecting to PRIMARY %s:%d", server.primary_host, server.primary_port); - connectWithPrimary(); + if (link->host) { + serverLog(LL_NOTICE, "Reconnecting to replication source %s:%d", link->host, link->port); + connectReplicationLink(link); } } @@ -4058,7 +4338,7 @@ void replicaofCommand(client *c) { /* The special host/port combination "NO" "ONE" turns the instance * into a primary. Otherwise the new primary address is set. */ if (!strcasecmp(c->argv[1]->ptr, "no") && !strcasecmp(c->argv[2]->ptr, "one")) { - if (server.primary_host) { + if (server.primary) { replicationUnsetPrimary(); sds client = catClientInfoShortString(sdsempty(), c, server.hide_user_data_from_log); serverLog(LL_NOTICE, "PRIMARY MODE enabled (user request from '%s')", client); @@ -4078,7 +4358,7 @@ void replicaofCommand(client *c) { if (getRangeLongFromObjectOrReply(c, c->argv[2], 0, 65535, &port, "Invalid master port") != C_OK) return; /* Check if we are already attached to the specified primary */ - if (server.primary_host && !strcasecmp(server.primary_host, c->argv[1]->ptr) && server.primary_port == port) { + if (server.primary && !strcasecmp(server.primary->host, c->argv[1]->ptr) && server.primary->port == port) { serverLog(LL_NOTICE, "REPLICAOF would result into synchronization " "with the primary we are already connected " "with. No operation performed."); @@ -4090,8 +4370,8 @@ void replicaofCommand(client *c) { * we can continue. */ replicationSetPrimary(c->argv[1]->ptr, port, 0); sds client = catClientInfoShortString(sdsempty(), c, server.hide_user_data_from_log); - serverLog(LL_NOTICE, "REPLICAOF %s:%d enabled (user request from '%s')", server.primary_host, - server.primary_port, client); + serverLog(LL_NOTICE, "REPLICAOF %s:%d enabled (user request from '%s')", server.primary->host, + server.primary->port, client); sdsfree(client); } addReply(c, shared.ok); @@ -4106,7 +4386,7 @@ void roleCommand(client *c) { return; } - if (server.primary_host == NULL) { + if (server.primary == NULL) { listIter li; listNode *ln; void *mbcount; @@ -4138,12 +4418,12 @@ void roleCommand(client *c) { addReplyArrayLen(c, 5); addReplyBulkCBuffer(c, "slave", 5); - addReplyBulkCString(c, server.primary_host); - addReplyLongLong(c, server.primary_port); - if (replicaIsInHandshakeState()) { + addReplyBulkCString(c, server.primary->host); + addReplyLongLong(c, server.primary->port); + if (replicaIsInHandshakeState(server.primary)) { replica_state = "handshake"; } else { - switch (server.repl_state) { + switch (server.primary->state) { case REPL_STATE_NONE: replica_state = "none"; break; case REPL_STATE_CONNECT: replica_state = "connect"; break; case REPL_STATE_CONNECTING: replica_state = "connecting"; break; @@ -4153,16 +4433,15 @@ void roleCommand(client *c) { } } addReplyBulkCString(c, replica_state); - addReplyLongLong(c, server.primary ? server.primary->repl_data->reploff : -1); + addReplyLongLong(c, server.primary->client ? server.primary->client->repl_data->reploff : -1); } } /* Send a REPLCONF ACK command to the primary to inform it about the current * processed offset. If we are not connected with a primary, the command has * no effects. 
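 *
 * For illustration, the command sent on the wire has the form:
 *
 *   REPLCONF ACK <processed-offset> [FACK <fsynced-offset>]
 *
 * where the FACK pair is only appended when an fsynced AOF offset is
 * available (server.fsynced_reploff != -1).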
*/ -void replicationSendAck(void) { - client *c = server.primary; - +void replicationSendAck(replicationLink *link) { + client *c = link->client; if (c != NULL) { int send_fack = server.fsynced_reploff != -1; c->flag.primary_force_reply = 1; @@ -4203,7 +4482,7 @@ void replicationSendAck(void) { * handshake in order to reactivate the cached primary. */ void replicationCachePrimary(client *c) { - serverAssert(server.primary != NULL && server.cached_primary == NULL); + serverAssert(server.primary != NULL && server.primary->client != NULL && server.cached_primary == NULL); serverLog(LL_NOTICE, "Caching the disconnected primary state."); /* Wait for IO operations to be done before proceeding */ @@ -4215,10 +4494,10 @@ void replicationCachePrimary(client *c) { * we want to discard the non processed query buffers and non processed * offsets, including pending transactions, already populated arguments, * pending outputs to the primary. */ - sdsclear(server.primary->querybuf); - server.primary->qb_pos = 0; - server.primary->repl_data->repl_applied = 0; - server.primary->repl_data->read_reploff = server.primary->repl_data->reploff; + sdsclear(c->querybuf); + c->qb_pos = 0; + c->repl_data->repl_applied = 0; + c->repl_data->read_reploff = c->repl_data->reploff; if (c->flag.multi) discardTransaction(c); listEmpty(c->reply); c->sentlen = 0; @@ -4227,9 +4506,9 @@ void replicationCachePrimary(client *c) { resetClient(c); resetClientIOState(c); - /* Save the primary. Server.primary will be set to null later by - * replicationHandlePrimaryDisconnection(). */ - server.cached_primary = server.primary; + /* Save the primary. Server.primary->client will be set to null later by + * replicationHandleSourceDisconnection(). */ + server.cached_primary = c; /* Invalidate the Peer ID cache. */ if (c->peerid) { @@ -4244,8 +4523,8 @@ void replicationCachePrimary(client *c) { /* Caching the primary happens instead of the actual freeClient() call, * so make sure to adjust the replication state. This function will - * also set server.primary to NULL. */ - replicationHandlePrimaryDisconnection(); + * also set server.primary->client to NULL. */ + replicationHandleSourceDisconnection(server.primary); } /* This function is called when a primary is turned into a replica, in order to @@ -4261,24 +4540,27 @@ void replicationCachePrimaryUsingMyself(void) { serverLog(LL_NOTICE, "Before turning into a replica, using my own primary parameters " "to synthesize a cached primary: I may be able to synchronize with " "the new primary with just a partial transfer."); + /* Create a temporary link for the purpose of creating a client. */ + replicationLink *temp_link = createReplicationLink(NULL, 0, NULL); /* This will be used to populate the field server.primary->repl_data->reploff * by replicationCreatePrimaryClient(). We'll later set the created * primary as server.cached_primary, so the replica will use such * offset for PSYNC. */ - server.primary_initial_offset = server.primary_repl_offset; + temp_link->initial_offset = server.primary_repl_offset; /* The primary client we create can be set to any DBID, because * the new primary will start its replication stream with SELECT. */ - replicationCreatePrimaryClient(NULL, -1); + createReplicationLinkClient(temp_link, NULL, -1); /* Use our own ID / offset. */ - memcpy(server.primary->repl_data->replid, server.replid, sizeof(server.replid)); + memcpy(temp_link->client->repl_data->replid, server.replid, sizeof(server.replid)); /* Set as cached primary. 
*/ - unlinkClient(server.primary); - server.cached_primary = server.primary; - server.primary = NULL; + unlinkClient(temp_link->client); + server.cached_primary = temp_link->client; + temp_link->client = NULL; + freeReplicationLink(temp_link); } /* Free a cached primary, called when there are no longer the conditions for @@ -4287,7 +4569,7 @@ void replicationDiscardCachedPrimary(void) { if (server.cached_primary == NULL) return; serverLog(LL_NOTICE, "Discarding previously cached primary state."); - server.cached_primary->flag.primary = 0; + server.cached_primary->flag.replication_source = 0; freeClient(server.cached_primary); server.cached_primary = NULL; } @@ -4295,17 +4577,19 @@ void replicationDiscardCachedPrimary(void) { /* Replication: Replica side. * This method performs the necessary steps to establish a connection with the primary server. * It sets private data, updates flags, and fires an event to notify modules about the primary link change. */ -void establishPrimaryConnection(void) { - connSetPrivateData(server.primary->conn, server.primary); - server.primary->flag.close_after_reply = 0; - server.primary->flag.close_asap = 0; - server.primary->flag.authenticated = 1; - server.primary->last_interaction = server.unixtime; - server.repl_state = REPL_STATE_CONNECTED; - server.repl_down_since = 0; +void establishSourceConnection(replicationLink *link) { + connSetPrivateData(link->client->conn, link->client); + link->client->flag.close_after_reply = 0; + link->client->flag.close_asap = 0; + link->client->flag.authenticated = 1; + link->client->last_interaction = server.unixtime; + link->state = REPL_STATE_CONNECTED; + if (link == server.primary) { + server.repl_down_since = 0; - /* Fire the primary link modules event. */ - moduleFireServerEvent(VALKEYMODULE_EVENT_PRIMARY_LINK_CHANGE, VALKEYMODULE_SUBEVENT_PRIMARY_LINK_UP, NULL); + /* Fire the primary link modules event. */ + moduleFireServerEvent(VALKEYMODULE_EVENT_PRIMARY_LINK_CHANGE, VALKEYMODULE_SUBEVENT_PRIMARY_LINK_UP, NULL); + } } /* Replication: Replica side. @@ -4315,34 +4599,38 @@ void establishPrimaryConnection(void) { * This function is called when successfully setup a partial resynchronization * so the stream of data that we'll receive will start from where this * primary left. */ -void replicationResurrectCachedPrimary(connection *conn) { - server.primary = server.cached_primary; +void replicationResurrectCachedPrimary(replicationLink *link) { + serverAssert(link == server.primary); + link->client = server.cached_primary; server.cached_primary = NULL; - server.primary->conn = conn; - establishPrimaryConnection(); + /* The client takes ownership of the connection now. */ + link->client->conn = link->transfer_s; + link->transfer_s = NULL; + + establishSourceConnection(link); /* Re-add to the list of clients. */ - linkClient(server.primary); - replicationSteadyStateInit(); + linkClient(link->client); + replicationSteadyStateInit(link); } /* Replication: Replica side. * Prepare replica to steady state. * prerequisite: server.primary is already initialized and linked in client list. */ -void replicationSteadyStateInit(void) { - if (connSetReadHandler(server.primary->conn, readQueryFromClient)) { +void replicationSteadyStateInit(replicationLink *link) { + if (connSetReadHandler(link->client->conn, readQueryFromClient)) { serverLog(LL_WARNING, "Error resurrecting the cached primary, impossible to add the readable handler: %s", strerror(errno)); - freeClientAsync(server.primary); /* Close ASAP. 
*/ + freeClientAsync(link->client); /* Close ASAP. */ } /* We may also need to install the write handler as well if there is * pending data in the write buffers. */ - if (clientHasPendingReplies(server.primary)) { - if (connSetWriteHandler(server.primary->conn, sendReplyToClient)) { + if (clientHasPendingReplies(link->client)) { + if (connSetWriteHandler(link->client->conn, sendReplyToClient)) { serverLog(LL_WARNING, "Error resurrecting the cached primary, impossible to add the writable handler: %s", strerror(errno)); - freeClientAsync(server.primary); /* Close ASAP. */ + freeClientAsync(link->client); /* Close ASAP. */ } } } @@ -4350,16 +4638,19 @@ void replicationSteadyStateInit(void) { /* Replication: Replica side. * Turn the provisional primary into the current primary. * This function is called after dual channel sync is finished successfully. */ -void replicationResurrectProvisionalPrimary(void) { - /* Create a primary client, but do not initialize the read handler yet, as this replica still has a local buffer to +void replicationResurrectProvisionalSource(replicationLink *link) { + /* Create a client, but do not initialize the read handler yet, as this replica still has a local buffer to * drain. */ - replicationCreatePrimaryClientWithHandler(server.repl_transfer_s, server.repl_provisional_primary.dbid, NULL); - memcpy(server.primary->repl_data->replid, server.repl_provisional_primary.replid, sizeof(server.repl_provisional_primary.replid)); - server.primary->repl_data->reploff = server.repl_provisional_primary.reploff; - server.primary->repl_data->read_reploff = server.repl_provisional_primary.read_reploff; - server.primary_repl_offset = server.primary->repl_data->reploff; - memcpy(server.replid, server.primary->repl_data->replid, sizeof(server.primary->repl_data->replid)); - establishPrimaryConnection(); + createReplicationLinkClientWithHandler(link, link->transfer_s, link->provisional_source_state.dbid, NULL); + link->transfer_s = NULL; /* link->client now takes ownership of this connection */ + memcpy(link->client->repl_data->replid, link->provisional_source_state.replid, sizeof(link->provisional_source_state.replid)); + link->client->repl_data->reploff = link->provisional_source_state.reploff; + link->client->repl_data->read_reploff = link->provisional_source_state.read_reploff; + if (link == server.primary) { + server.primary_repl_offset = link->client->repl_data->reploff; + memcpy(server.replid, link->client->repl_data->replid, sizeof(link->client->repl_data->replid)); + } + establishSourceConnection(link); } /* ------------------------- MIN-REPLICAS-TO-WRITE --------------------------- */ @@ -4386,7 +4677,7 @@ void refreshGoodReplicasCount(void) { /* return true if status of good replicas is OK. otherwise false */ int checkGoodReplicasStatus(void) { - return server.primary_host || /* not a primary status should be OK */ + return server.primary || /* not a primary status should be OK */ !server.repl_min_replicas_max_lag || /* Min replica max lag not configured */ !server.repl_min_replicas_to_write || /* Min replica to write not configured */ server.repl_good_replicas_count >= server.repl_min_replicas_to_write; /* check if we have enough replicas */ @@ -4479,7 +4770,7 @@ void waitCommand(client *c) { long numreplicas, ackreplicas; long long offset = getClientWriteOffset(c); - if (server.primary_host) { + if (server.primary) { addReplyError( c, "WAIT cannot be used with replica instances. 
Please also note that if a replica is configured to be " "writable (which is not the default) writes to replicas are just local and are not propagated."); @@ -4517,7 +4808,7 @@ void waitaofCommand(client *c) { if (getPositiveLongFromObjectOrReply(c, c->argv[2], &numreplicas, NULL) != C_OK) return; if (getTimeoutFromObjectOrReply(c, c->argv[3], &timeout, UNIT_MILLISECONDS) != C_OK) return; - if (server.primary_host) { + if (server.primary) { addReplyError(c, "WAITAOF cannot be used with replica instances. Please also note that writes to replicas are " "just local and are not propagated."); return; @@ -4638,9 +4929,9 @@ void processClientsWaitingReplicas(void) { long long replicationGetReplicaOffset(void) { long long offset = 0; - if (server.primary_host != NULL) { - if (server.primary) { - offset = server.primary->repl_data->reploff; + if (server.primary != NULL) { + if (server.primary->client) { + offset = server.primary->client->repl_data->reploff; } else if (server.cached_primary) { offset = server.cached_primary->repl_data->reploff; } @@ -4664,44 +4955,48 @@ void replicationCron(void) { updateFailoverStatus(); /* Non blocking connection timeout? */ - if (server.primary_host && (server.repl_state == REPL_STATE_CONNECTING || replicaIsInHandshakeState()) && - (time(NULL) - server.repl_transfer_lastio) > server.repl_timeout) { - serverLog(LL_WARNING, "Timeout connecting to the PRIMARY..."); - cancelReplicationHandshake(1); - } + listNode *ln; + listIter li; + listRewind(server.replication_links, &li); + while ((ln = listNext(&li))) { + replicationLink *link = (replicationLink *)ln->value; + if ((link->state == REPL_STATE_CONNECTING || replicaIsInHandshakeState(link)) && + (time(NULL) - link->transfer_lastio) > server.repl_timeout) { + serverLog(LL_WARNING, "Timeout connecting to %s...", replicationGetNameForLogs(link)); + cancelReplicationHandshake(link, 1); + } - /* Bulk transfer I/O timeout? */ - if (server.primary_host && server.repl_state == REPL_STATE_TRANSFER && - (time(NULL) - server.repl_transfer_lastio) > server.repl_timeout) { - serverLog(LL_WARNING, "Timeout receiving bulk data from PRIMARY... If the problem persists try to set the " - "'repl-timeout' parameter in valkey.conf to a larger value."); - cancelReplicationHandshake(1); - } + /* Bulk transfer I/O timeout? */ + if (link && link->state == REPL_STATE_TRANSFER && + (time(NULL) - link->transfer_lastio) > server.repl_timeout) { + serverLog(LL_WARNING, "Timeout receiving bulk data from %s... If the problem persists try to set the " + "'repl-timeout' parameter in valkey.conf to a larger value.", replicationGetNameForLogs(link)); + cancelReplicationHandshake(link, 1); + } - /* Timed out primary when we are an already connected replica? */ - if (server.primary_host && server.repl_state == REPL_STATE_CONNECTED && - (time(NULL) - server.primary->last_interaction) > server.repl_timeout) { - serverLog(LL_WARNING, "PRIMARY timeout: no data nor PING received..."); - freeClient(server.primary); - } + /* Timed out primary when we are an already connected replica? 
*/ + if (link && link->state == REPL_STATE_CONNECTED && + (time(NULL) - link->client->last_interaction) > server.repl_timeout) { + serverLog(LL_WARNING, "%s timeout: no data nor PING received...", replicationGetNameForLogs(link)); + freeClient(link->client); /* free client will attempt reconnect */ + } - /* Check if we should connect to a PRIMARY */ - if (server.repl_state == REPL_STATE_CONNECT) { - serverLog(LL_NOTICE, "Connecting to PRIMARY %s:%d", server.primary_host, server.primary_port); - connectWithPrimary(); - } + /* Check if we should connect to a replication source */ + if (link && link->state == REPL_STATE_CONNECT) { + serverLog(LL_NOTICE, "Connecting to %s %s:%d", replicationGetNameForLogs(link), link->host, link->port); + connectReplicationLink(link); + } - /* Send ACK to primary from time to time. - * Note that we do not send periodic acks to primary that don't - * support PSYNC and replication offsets. */ - if (server.primary_host && server.primary && !(server.primary->flag.pre_psync)) replicationSendAck(); + /* Send ACK to replication sources from time to time. + * Note that we do not send periodic acks to replication sources that don't + * support PSYNC and replication offsets. */ + if (link && link->client && !(link->client->flag.pre_psync)) replicationSendAck(link); + } /* If we have attached replicas, PING them from time to time. * So replicas can implement an explicit timeout to primaries, and will * be able to detect a link disconnection even if the TCP connection * will not actually go down. */ - listIter li; - listNode *ln; robj *ping_argv[1]; /* First, send PING according to ping_replica_period. */ @@ -4788,7 +5083,7 @@ void replicationCron(void) { * backlog, in order to reply to PSYNC queries if they are turned into * primaries after a failover. */ if (listLength(server.replicas) == 0 && server.repl_backlog_time_limit && server.repl_backlog && - server.primary_host == NULL) { + server.primary == NULL) { time_t idle = server.unixtime - server.repl_no_replicas_since; if (idle > server.repl_backlog_time_limit) { @@ -4838,7 +5133,7 @@ void replicationCron(void) { replication_cron_loops++; /* Incremented with frequency 1 HZ. */ } -int shouldStartChildReplication(int *mincapa_out, int *req_out) { +int shouldStartChildReplication(int *mincapa_out, int *req_out, list **slot_ranges_out) { /* We should start a BGSAVE good for replication if we have replicas in * WAIT_BGSAVE_START state. 
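
/* Standalone sketch of the per-link cron pattern used above, with placeholder
 * types and states (LINK_CONNECTING/LINK_TRANSFER/LINK_CONNECTED are stand-ins,
 * not the server's enums): every replication link is checked independently for
 * handshake, bulk-transfer and steady-state timeouts against the same repl
 * timeout value. */
#include <stdio.h>
#include <time.h>

typedef enum { LINK_CONNECTING, LINK_TRANSFER, LINK_CONNECTED } linkState;

typedef struct demoLink {
    const char *name;   /* e.g. "PRIMARY" or "SLOT OWNER" */
    linkState state;
    time_t last_io;     /* unix time of the latest read on this link */
} demoLink;

static void checkLinkTimeouts(demoLink *links, int nlinks, time_t now, time_t timeout) {
    for (int i = 0; i < nlinks; i++) {
        demoLink *l = &links[i];
        if (now - l->last_io <= timeout) continue;
        switch (l->state) {
        case LINK_CONNECTING: printf("%s: handshake timed out, cancel and retry\n", l->name); break;
        case LINK_TRANSFER:   printf("%s: bulk transfer timed out, cancel sync\n", l->name); break;
        case LINK_CONNECTED:  printf("%s: no data nor PING received, drop link\n", l->name); break;
        }
    }
}

int main(void) {
    demoLink links[] = {
        {"PRIMARY", LINK_CONNECTED, time(NULL) - 120},
        {"SLOT OWNER", LINK_TRANSFER, time(NULL)},
    };
    checkLinkTimeouts(links, 2, time(NULL), 60);
    return 0;
}
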
* @@ -4850,6 +5145,7 @@ int shouldStartChildReplication(int *mincapa_out, int *req_out) { int replicas_waiting = 0; int mincapa; int req; + list *slot_ranges; int first = 1; listNode *ln; listIter li; @@ -4861,6 +5157,7 @@ int shouldStartChildReplication(int *mincapa_out, int *req_out) { if (first) { /* Get first replica's requirements */ req = replica->repl_data->replica_req; + slot_ranges = replica->repl_data->slot_ranges; } else if (req != replica->repl_data->replica_req) { /* Skip replicas that don't match */ continue; @@ -4879,6 +5176,7 @@ int shouldStartChildReplication(int *mincapa_out, int *req_out) { max_idle >= server.repl_diskless_sync_delay)) { if (mincapa_out) *mincapa_out = mincapa; if (req_out) *req_out = req; + if (slot_ranges_out) *slot_ranges_out = slot_ranges; return 1; } } @@ -4889,12 +5187,13 @@ int shouldStartChildReplication(int *mincapa_out, int *req_out) { void replicationStartPendingFork(void) { int mincapa = -1; int req = -1; + list *slot_ranges = NULL; - if (shouldStartChildReplication(&mincapa, &req)) { + if (shouldStartChildReplication(&mincapa, &req, &slot_ranges)) { /* Start the BGSAVE. The called function may start a * BGSAVE with socket target or disk target depending on the * configuration and replicas capabilities and requirements. */ - startBgsaveForReplication(mincapa, req); + startBgsaveForReplication(mincapa, req, slot_ranges); } } @@ -5033,7 +5332,7 @@ void failoverCommand(client *c) { return; } - if (server.primary_host) { + if (server.primary) { addReplyError(c, "FAILOVER is not valid when server is a replica."); return; } diff --git a/src/script.c b/src/script.c index a8e5b18eb9..a43de5c7af 100644 --- a/src/script.c +++ b/src/script.c @@ -51,7 +51,7 @@ static void exitScriptTimedoutMode(scriptRunCtx *run_ctx) { run_ctx->flags &= ~SCRIPT_TIMEDOUT; blockingOperationEnds(); /* if we are a replica and we have an active primary, set it for continue processing */ - if (server.primary_host && server.primary) queueClientForReprocessing(server.primary); + if (server.primary && server.primary->client) queueClientForReprocessing(server.primary->client); } static void enterScriptTimedoutMode(scriptRunCtx *run_ctx) { @@ -137,7 +137,7 @@ int scriptPrepareForRun(scriptRunCtx *run_ctx, int client_allow_oom = !!(caller->flag.allow_oom); int running_stale = - server.primary_host && server.repl_state != REPL_STATE_CONNECTED && server.repl_serve_stale_data == 0; + server.primary && server.primary->state != REPL_STATE_CONNECTED && server.repl_serve_stale_data == 0; int obey_client = mustObeyClient(caller); if (!(script_flags & SCRIPT_FLAG_EVAL_COMPAT_MODE)) { @@ -158,7 +158,7 @@ int scriptPrepareForRun(scriptRunCtx *run_ctx, * 1. we are not a readonly replica * 2. no disk error detected * 3. command is not `fcall_ro`/`eval[sha]_ro` */ - if (server.primary_host && server.repl_replica_ro && !obey_client) { + if (server.primary && server.repl_replica_ro && !obey_client) { addReplyError(caller, "-READONLY Can not run script with write flag on readonly replica"); return C_ERR; } @@ -375,7 +375,7 @@ static int scriptVerifyWriteCommandAllow(scriptRunCtx *run_ctx, char **err) { * of this script. 
*/ int deny_write_type = writeCommandsDeniedByDiskError(); - if (server.primary_host && server.repl_replica_ro && !mustObeyClient(run_ctx->original_client)) { + if (server.primary && server.repl_replica_ro && !mustObeyClient(run_ctx->original_client)) { *err = sdsdup(shared.roreplicaerr->ptr); return C_ERR; } @@ -501,12 +501,12 @@ int scriptSetRepl(scriptRunCtx *run_ctx, int repl) { } static int scriptVerifyAllowStale(client *c, sds *err) { - if (!server.primary_host) { + if (!server.primary) { /* Not a replica, stale is irrelevant */ return C_OK; } - if (server.repl_state == REPL_STATE_CONNECTED) { + if (server.primary->state == REPL_STATE_CONNECTED) { /* Connected to replica, stale is irrelevant */ return C_OK; } diff --git a/src/server.c b/src/server.c index 8255b57e25..697ce48013 100644 --- a/src/server.c +++ b/src/server.c @@ -221,7 +221,7 @@ void serverLogRaw(int level, const char *msg) { } else if (pid != server.pid) { role_index = 1; /* RDB / AOF writing child. */ } else { - role_index = (server.primary_host ? 2 : 3); /* Replica or Primary. */ + role_index = (server.primary ? 2 : 3); /* Replica or Primary. */ } switch (server.log_format) { case LOG_FORMAT_LOGFMT: @@ -900,7 +900,7 @@ int clientsCronResizeQueryBuffer(client *c) { if (idletime > 2) { /* 1) Query is idle for a long time. */ size_t remaining = sdslen(c->querybuf) - c->qb_pos; - if (!c->flag.primary && !remaining) { + if (!c->flag.replication_source && !remaining) { /* If the client is not a primary and no data is pending, * The client can safely use the shared query buffer in the next read - free the client's querybuf. */ sdsfree(c->querybuf); @@ -2223,21 +2223,12 @@ void initServerConfig(void) { appendServerSaveParams(60, 10000); /* save after 1 minute and 10000 changes */ /* Replication related */ - server.primary_host = NULL; - server.primary_port = 6379; server.primary = NULL; server.cached_primary = NULL; - server.primary_initial_offset = -1; - server.repl_state = REPL_STATE_NONE; - server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_STATE_NONE; - server.repl_transfer_tmpfile = NULL; - server.repl_transfer_fd = -1; - server.repl_transfer_s = NULL; server.repl_syncio_timeout = CONFIG_REPL_SYNCIO_TIMEOUT; server.repl_down_since = 0; /* Never connected, repl is down since EVER. */ server.primary_repl_offset = 0; server.fsynced_reploff_pending = 0; - server.rdb_client_id = -1; server.loading_process_events_interval_ms = LOADING_PROCESS_EVENTS_INTERVAL_DEFAULT; server.loading_rio = NULL; @@ -2348,7 +2339,7 @@ int restartServer(client *c, int flags, mstime_t delay) { * depending on current role. */ int setOOMScoreAdj(int process_class) { - if (process_class == -1) process_class = (server.primary_host ? CONFIG_OOM_REPLICA : CONFIG_OOM_PRIMARY); + if (process_class == -1) process_class = (server.primary ? CONFIG_OOM_REPLICA : CONFIG_OOM_PRIMARY); serverAssert(process_class >= 0 && process_class < CONFIG_OOM_COUNT); @@ -2760,6 +2751,7 @@ void initServer(void) { server.reply_buffer_peak_reset_time = REPLY_BUFFER_DEFAULT_PEAK_RESET_TIME; server.reply_buffer_resizing_enabled = 1; server.client_mem_usage_buckets = NULL; + server.replication_links = listCreate(); resetReplicationBuffer(); /* Make sure the locale is set on startup based on the config file. */ @@ -3359,7 +3351,7 @@ struct serverCommand *lookupCommandOrOriginal(robj **argv, int argc) { /* Commands arriving from the primary client or AOF client, should never be rejected. 
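
/* Small sketch of the stale-data gate that both the scripting and command paths
 * above apply after the switch to a link object; the boolean parameters are
 * illustrative stand-ins for server.primary, server.primary->state and
 * repl-serve-stale-data rather than the real server fields. */
#include <stdbool.h>

#define DEMO_STATE_CONNECTED 1

static bool replicaIsStale(bool has_primary_link, int link_state, bool serve_stale_data) {
    /* A replica whose primary link is down or still syncing only serves
     * data-plane reads when replica-serve-stale-data is enabled. */
    return has_primary_link && link_state != DEMO_STATE_CONNECTED && !serve_stale_data;
}
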
*/ int mustObeyClient(client *c) { - return c->id == CLIENT_ID_AOF || c->flag.primary; + return c->id == CLIENT_ID_AOF || c->flag.replication_source; } static int shouldPropagate(int target) { @@ -3369,7 +3361,7 @@ static int shouldPropagate(int target) { if (server.aof_state != AOF_OFF) return 1; } if (target & PROPAGATE_REPL) { - if (server.primary_host == NULL && (server.repl_backlog || listLength(server.replicas) != 0)) return 1; + if (server.primary == NULL && (server.repl_backlog || listLength(server.replicas) != 0)) return 1; } return 0; @@ -4111,7 +4103,7 @@ int processCommand(client *c) { } } - if (!server.cluster_enabled && c->capa & CLIENT_CAPA_REDIRECT && server.primary_host && !obey_client && + if (!server.cluster_enabled && c->capa & CLIENT_CAPA_REDIRECT && server.primary && !obey_client && (is_write_command || (is_read_command && !c->flag.readonly))) { if (server.failover_state == FAILOVER_IN_PROGRESS) { /* During the FAILOVER process, when conditions are met (such as @@ -4142,7 +4134,7 @@ int processCommand(client *c) { } c->duration = 0; c->cmd->rejected_calls++; - addReplyErrorSds(c, sdscatprintf(sdsempty(), "-REDIRECT %s:%d", server.primary_host, server.primary_port)); + addReplyErrorSds(c, sdscatprintf(sdsempty(), "-REDIRECT %s:%d", server.primary->host, server.primary->port)); } return C_OK; } @@ -4227,7 +4219,7 @@ int processCommand(client *c) { /* Don't accept write commands if this is a read only replica. But * accept write commands if this is our primary. */ - if (server.primary_host && server.repl_replica_ro && !obey_client && is_write_command) { + if (server.primary && server.repl_replica_ro && !obey_client && is_write_command) { rejectCommand(c, shared.roreplicaerr); return C_OK; } @@ -4248,7 +4240,7 @@ int processCommand(client *c) { /* Only allow commands with flag "t", such as INFO, REPLICAOF and so on, * when replica-serve-stale-data is no and we are a replica with a broken * link with primary. */ - if (server.primary_host && server.repl_state != REPL_STATE_CONNECTED && server.repl_serve_stale_data == 0 && + if (server.primary && server.primary->state != REPL_STATE_CONNECTED && server.repl_serve_stale_data == 0 && is_denystale_command) { rejectCommand(c, shared.primarydownerr); return C_OK; @@ -5972,14 +5964,14 @@ sds genValkeyInfoString(dict *section_dict, int all_sections, int everything) { info = sdscatprintf(info, "# Replication\r\n" "role:%s\r\n", - server.primary_host == NULL ? "master" : "slave"); - if (server.primary_host) { + server.primary == NULL ? "master" : "slave"); + if (server.primary) { long long replica_repl_offset = 1; long long replica_read_repl_offset = 1; - if (server.primary) { - replica_repl_offset = server.primary->repl_data->reploff; - replica_read_repl_offset = server.primary->repl_data->read_reploff; + if (server.primary->client) { + replica_repl_offset = server.primary->client->repl_data->reploff; + replica_read_repl_offset = server.primary->client->repl_data->read_reploff; } else if (server.cached_primary) { replica_repl_offset = server.cached_primary->repl_data->reploff; replica_read_repl_offset = server.cached_primary->repl_data->read_reploff; @@ -5988,32 +5980,32 @@ sds genValkeyInfoString(dict *section_dict, int all_sections, int everything) { info = sdscatprintf( info, FMTARGS( - "master_host:%s\r\n", server.primary_host, - "master_port:%d\r\n", server.primary_port, - "master_link_status:%s\r\n", (server.repl_state == REPL_STATE_CONNECTED) ? "up" : "down", - "master_last_io_seconds_ago:%d\r\n", server.primary ? 
((int)(server.unixtime - server.primary->last_interaction)) : -1, - "master_sync_in_progress:%d\r\n", server.repl_state == REPL_STATE_TRANSFER, + "master_host:%s\r\n", server.primary->host, + "master_port:%d\r\n", server.primary->port, + "master_link_status:%s\r\n", (server.primary->state == REPL_STATE_CONNECTED) ? "up" : "down", + "master_last_io_seconds_ago:%d\r\n", server.primary->client ? ((int)(server.unixtime - server.primary->client->last_interaction)) : -1, + "master_sync_in_progress:%d\r\n", server.primary->state == REPL_STATE_TRANSFER, "slave_read_repl_offset:%lld\r\n", replica_read_repl_offset, "slave_repl_offset:%lld\r\n", replica_repl_offset, - "replicas_repl_buffer_size:%zu\r\n", server.pending_repl_data.len, - "replicas_repl_buffer_peak:%zu\r\n", server.pending_repl_data.peak)); + "replicas_repl_buffer_size:%zu\r\n", server.primary->pending_repl_data.len, + "replicas_repl_buffer_peak:%zu\r\n", server.primary->pending_repl_data.peak)); - if (server.repl_state == REPL_STATE_TRANSFER) { + if (server.primary->state == REPL_STATE_TRANSFER) { double perc = 0; - if (server.repl_transfer_size) { - perc = ((double)server.repl_transfer_read / server.repl_transfer_size) * 100; + if (server.primary->transfer_size) { + perc = ((double)server.primary->transfer_read / server.primary->transfer_size) * 100; } info = sdscatprintf( info, FMTARGS( - "master_sync_total_bytes:%lld\r\n", (long long)server.repl_transfer_size, - "master_sync_read_bytes:%lld\r\n", (long long)server.repl_transfer_read, - "master_sync_left_bytes:%lld\r\n", (long long)(server.repl_transfer_size - server.repl_transfer_read), + "master_sync_total_bytes:%lld\r\n", (long long)server.primary->transfer_size, + "master_sync_read_bytes:%lld\r\n", (long long)server.primary->transfer_read, + "master_sync_left_bytes:%lld\r\n", (long long)(server.primary->transfer_size - server.primary->transfer_read), "master_sync_perc:%.2f\r\n", perc, - "master_sync_last_io_seconds_ago:%d\r\n", (int)(server.unixtime - server.repl_transfer_lastio))); + "master_sync_last_io_seconds_ago:%d\r\n", (int)(server.unixtime - server.primary->transfer_lastio))); } - if (server.repl_state != REPL_STATE_CONNECTED) { + if (server.primary->state != REPL_STATE_CONNECTED) { info = sdscatprintf(info, "master_link_down_since_seconds:%jd\r\n", server.repl_down_since ? 
(intmax_t)(server.unixtime - server.repl_down_since) : -1); } @@ -6848,7 +6840,7 @@ int serverIsSupervised(int mode) { } int iAmPrimary(void) { - return ((!server.cluster_enabled && server.primary_host == NULL) || + return ((!server.cluster_enabled && server.primary == NULL) || (server.cluster_enabled && clusterNodeIsPrimary(getMyClusterNode()))); } @@ -7131,7 +7123,7 @@ __attribute__((weak)) int main(int argc, char **argv) { } if (server.supervised_mode == SUPERVISED_SYSTEMD) { - if (!server.primary_host) { + if (!server.primary) { serverCommunicateSystemd("STATUS=Ready to accept connections\n"); } else { serverCommunicateSystemd( diff --git a/src/server.h b/src/server.h index d186d16c73..1bd78f57f6 100644 --- a/src/server.h +++ b/src/server.h @@ -394,6 +394,7 @@ typedef enum { REPL_STATE_RECEIVE_AUTH_REPLY, /* Wait for AUTH reply */ REPL_STATE_RECEIVE_PORT_REPLY, /* Wait for REPLCONF reply */ REPL_STATE_RECEIVE_IP_REPLY, /* Wait for REPLCONF reply */ + REPL_STATE_RECEIVE_SLOT_REPLY, /* Wait for REPLCONF reply */ REPL_STATE_RECEIVE_CAPA_REPLY, /* Wait for REPLCONF reply */ REPL_STATE_RECEIVE_VERSION_REPLY, /* Wait for REPLCONF reply */ REPL_STATE_SEND_PSYNC, /* Send PSYNC */ @@ -401,6 +402,7 @@ typedef enum { /* --- End of handshake states --- */ REPL_STATE_TRANSFER, /* Receiving .rdb from primary */ REPL_STATE_CONNECTED, /* Connected to primary */ + REPL_STATE_CANCELLED, /* Replication was cancelled, and this link is pending deletion. */ } repl_state; /* Replica rdb-channel replication state. Used in server.repl_rdb_channel_state for @@ -446,6 +448,7 @@ typedef enum { #define REPLICA_REQ_RDB_EXCLUDE_DATA (1 << 0) /* Exclude data from RDB */ #define REPLICA_REQ_RDB_EXCLUDE_FUNCTIONS (1 << 1) /* Exclude functions from RDB */ #define REPLICA_REQ_RDB_CHANNEL (1 << 2) /* Use dual-channel-replication */ +#define REPLICA_REQ_AOF_FORMAT (1 << 3) /* Use AOF-based replication format*/ /* Mask of all bits in the replica requirements bitfield that represent non-standard (filtered) RDB requirements */ #define REPLICA_REQ_RDB_MASK (REPLICA_REQ_RDB_EXCLUDE_DATA | REPLICA_REQ_RDB_EXCLUDE_FUNCTIONS) @@ -1011,7 +1014,7 @@ typedef enum { } clientIOState; typedef struct ClientFlags { - uint64_t primary : 1; /* This client is a primary */ + uint64_t replication_source : 1; /* This client is a replication source (i.e. primary or slot migration source) */ uint64_t replica : 1; /* This client is a replica */ uint64_t monitor : 1; /* This client is a replica monitor, see MONITOR */ uint64_t multi : 1; /* This client is in a MULTI context */ @@ -1103,6 +1106,7 @@ typedef struct ClientPubSubData { context of client side caching. */ } ClientPubSubData; +typedef struct replicationLink replicationLink; typedef struct ClientReplicationData { int repl_state; /* Replication state if this is a replica. */ int repl_start_cmd_stream_on_ack; /* Install replica write handler on first ACK. */ @@ -1133,6 +1137,8 @@ typedef struct ClientReplicationData { see the definition of replBufBlock. */ size_t ref_block_pos; /* Access position of referenced buffer block, i.e. the next offset to send. */ + list *slot_ranges; /* The slot range this replica is replicating for. */ + replicationLink *link; /* The replication link owning this. */ } ClientReplicationData; typedef struct ClientModuleData { @@ -1414,7 +1420,7 @@ typedef enum { * top-level primary. */ typedef struct rdbSaveInfo { /* Used saving and loading. */ - int repl_stream_db; /* DB to select in server.primary client. 
*/ + int repl_stream_db; /* DB to select in server.primary->client. */ /* Used only loading. */ int repl_id_is_set; /* True if repl_id field is set. */ @@ -1536,6 +1542,43 @@ typedef enum childInfoType { CHILD_INFO_TYPE_MODULE_COW_SIZE } childInfoType; +typedef struct slotRange { + int start; + int end; +} slotRange; + +typedef struct replicationLink { + int protected; /* Used to protect link from destruction during background loading. */ + int state; /* State of the sync operation overall. */ + int rdb_channel_state; + client *client; + client *snapshot_load_client; /* client used for full sync when AOF format is used. */ + sds host; + int port; + connection *transfer_s; /* Replica -> Primary SYNC connection */ + connection *rdb_transfer_s; /* Primary FULL SYNC connection (RDB download) */ + uint64_t rdb_client_id; /* Rdb client id as it defined at primary side */ + /* The following two fields is where we store primary PSYNC replid/offset + * while the PSYNC is in progress. At the end we'll copy the fields into + * the server->primary client structure. */ + char replid[CONFIG_RUN_ID_SIZE + 1]; /* Primary PSYNC runid. */ + long long initial_offset; /* Primary PSYNC offset. */ + off_t transfer_size; /* Size of RDB to read from primary during sync. */ + off_t transfer_read; /* Amount of RDB read from primary during sync. */ + off_t transfer_last_fsync_off; /* Offset when we fsync-ed last time. */ + int transfer_fd; /* Replica -> Primary SYNC temp file descriptor */ + char *transfer_tmpfile; /* Replica-> Primary SYNC temp file name */ + time_t transfer_lastio; /* Unix time of the latest read, for timeout */ + struct { + char replid[CONFIG_RUN_ID_SIZE + 1]; + long long reploff; + long long read_reploff; + int dbid; + } provisional_source_state; /* Information about the provisional state (after RDB) for the source node, stored during dual channel sync. */ + replDataBuf pending_repl_data; /* Replication data buffer for dual-channel-replication */ + list *slot_ranges; /* Slot range used for slot import. */ +} replicationLink; + struct valkeyServer { /* General */ pid_t pid; /* Main process pid. */ @@ -1896,7 +1939,6 @@ struct valkeyServer { int repl_ping_replica_period; /* Primary pings the replica every N seconds */ replBacklog *repl_backlog; /* Replication backlog for partial syncs */ long long repl_backlog_size; /* Backlog circular buffer size */ - replDataBuf pending_repl_data; /* Replication data buffer for dual-channel-replication */ time_t repl_backlog_time_limit; /* Time without replicas after the backlog gets released. */ time_t repl_no_replicas_since; /* We have no replicas since that time. @@ -1920,52 +1962,28 @@ struct valkeyServer { list *repl_buffer_blocks; /* Replication buffers blocks list * (serving replica clients and repl backlog) */ /* Replication (replica) */ - char *primary_user; /* AUTH with this user and primary_auth with primary */ - sds primary_auth; /* AUTH with this password with primary */ - char *primary_host; /* Hostname of primary */ - int primary_port; /* Port of primary */ - int repl_timeout; /* Timeout after N seconds of primary idle */ - client *primary; /* Client that is primary for this replica */ - uint64_t rdb_client_id; /* Rdb client id as it defined at primary side */ - struct { - connection *conn; - char replid[CONFIG_RUN_ID_SIZE + 1]; - long long reploff; - long long read_reploff; - int dbid; - } repl_provisional_primary; - client *cached_primary; /* Cached primary to be reused for PSYNC. 
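
/* Illustrative sketch of how a replication link object of the shape declared
 * above might be created and tracked; it uses a plain singly-linked list and a
 * reduced field set instead of the server's client, connection and sds
 * machinery, so names here are local stand-ins rather than the patch's API. */
#include <stdlib.h>
#include <string.h>

typedef struct demoReplLink {
    char host[64];                /* source address (fixed-size for the sketch) */
    int port;
    int state;                    /* starts out as "connect pending" */
    struct demoReplLink *next;    /* next entry in the server-wide link list */
} demoReplLink;

static demoReplLink *demo_links; /* head of the list walked by the replication cron */

static demoReplLink *demoCreateReplicationLink(const char *host, int port) {
    demoReplLink *link = calloc(1, sizeof(*link));
    if (!link) return NULL;
    strncpy(link->host, host, sizeof(link->host) - 1);
    link->port = port;
    link->next = demo_links;      /* register so the cron can drive its state machine */
    demo_links = link;
    return link;
}
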
*/ - rio *loading_rio; /* Pointer to the rio object currently used for loading data. */ - int repl_syncio_timeout; /* Timeout for synchronous I/O calls */ - int repl_state; /* Replication status if the instance is a replica */ - int repl_rdb_channel_state; /* State of the replica's rdb channel during dual-channel-replication */ - off_t repl_transfer_size; /* Size of RDB to read from primary during sync. */ - off_t repl_transfer_read; /* Amount of RDB read from primary during sync. */ - off_t repl_transfer_last_fsync_off; /* Offset when we fsync-ed last time. */ - connection *repl_transfer_s; /* Replica -> Primary SYNC connection */ - connection *repl_rdb_transfer_s; /* Primary FULL SYNC connection (RDB download) */ - int repl_transfer_fd; /* Replica -> Primary SYNC temp file descriptor */ - char *repl_transfer_tmpfile; /* Replica-> Primary SYNC temp file name */ - time_t repl_transfer_lastio; /* Unix time of the latest read, for timeout */ - int repl_serve_stale_data; /* Serve stale data when link is down? */ - int repl_replica_ro; /* Replica is read only? */ - int repl_replica_ignore_maxmemory; /* If true replicas do not evict. */ - time_t repl_down_since; /* Unix time at which link with primary went down */ - int repl_disable_tcp_nodelay; /* Disable TCP_NODELAY after SYNC? */ - int replica_priority; /* Reported in INFO and used by Sentinel. */ - int replica_announced; /* If true, replica is announced by Sentinel */ - int replica_announce_port; /* Give the primary this listening port. */ - char *replica_announce_ip; /* Give the primary this ip address. */ - int propagation_error_behavior; /* Configures the behavior of the replica - * when it receives an error on the replication stream */ - int repl_ignore_disk_write_error; /* Configures whether replicas panic when unable to - * persist writes to AOF. */ - /* The following two fields is where we store primary PSYNC replid/offset - * while the PSYNC is in progress. At the end we'll copy the fields into - * the server->primary client structure. */ - char primary_replid[CONFIG_RUN_ID_SIZE + 1]; /* Primary PSYNC runid. */ - long long primary_initial_offset; /* Primary PSYNC offset. */ - int repl_replica_lazy_flush; /* Lazy FLUSHALL before loading DB? */ + char *primary_user; /* AUTH with this user and primary_auth with primary */ + sds primary_auth; /* AUTH with this password with primary */ + int repl_timeout; /* Timeout after N seconds of primary idle */ + replicationLink *primary; /* Replication link for the primary. */ + list *replication_links; /* List of all current replication links. */ + client *cached_primary; /* Cached primary to be reused for PSYNC. */ + int repl_syncio_timeout; /* Timeout for synchronous I/O calls */ + int repl_serve_stale_data; /* Serve stale data when link is down? */ + int repl_replica_ro; /* Replica is read only? */ + int repl_replica_ignore_maxmemory; /* If true replicas do not evict. */ + time_t repl_down_since; /* Unix time at which link with primary went down */ + int repl_disable_tcp_nodelay; /* Disable TCP_NODELAY after SYNC? */ + int replica_priority; /* Reported in INFO and used by Sentinel. */ + int replica_announced; /* If true, replica is announced by Sentinel */ + int replica_announce_port; /* Give the primary this listening port. */ + char *replica_announce_ip; /* Give the primary this ip address. 
*/ + int propagation_error_behavior; /* Configures the behavior of the replica + * when it receives an error on the replication stream */ + int repl_ignore_disk_write_error; /* Configures whether replicas panic when unable to + * persist writes to AOF. */ + int repl_replica_lazy_flush; /* Lazy FLUSHALL before loading DB? */ + rio *loading_rio; /* Pointer to the rio object currently used for loading data. */ /* Import Mode */ int import_mode; /* If true, server is in import mode and forbid expiration and eviction. */ /* Synchronous replication. */ @@ -2745,6 +2763,9 @@ void ioThreadWriteToClient(void *data); int canParseCommand(client *c); int processIOThreadsReadDone(void); int processIOThreadsWriteDone(void); +replicationLink *createReplicationLink(char *host, int port, list *slot_ranges); +int connectReplicationLink(replicationLink *link); +int freeReplicationLink(replicationLink *link); /* logreqres.c - logging of requests and responses */ void reqresReset(client *c, int free_buf); @@ -2898,7 +2919,7 @@ void replicationFeedMonitors(client *c, list *monitors, int dictid, robj **argv, void updateReplicasWaitingBgsave(int bgsaveerr, int type); void replicationCron(void); void replicationStartPendingFork(void); -void replicationHandlePrimaryDisconnection(void); +void replicationHandleSourceDisconnection(replicationLink *link); void replicationCachePrimary(client *c); void resizeReplicationBacklog(void); void replicationSetPrimary(char *ip, int port, int full_sync_required); @@ -2909,7 +2930,7 @@ void processClientsWaitingReplicas(void); void unblockClientWaitingReplicas(client *c); int replicationCountAcksByOffset(long long offset); int replicationCountAOFAcksByOffset(long long offset); -void replicationSendNewlineToPrimary(void); +void replicationSendNewlineToConnectedLinks(void); long long replicationGetReplicaOffset(void); char *replicationGetReplicaName(client *c); long long getPsyncInitialOffset(void); @@ -2974,6 +2995,7 @@ void aofOpenIfNeededOnServerStart(void); void aofManifestFree(aofManifest *am); int aofDelHistoryFiles(void); int aofRewriteLimited(void); +int rewriteAppendOnlyFileRio(rio *aof, list *slot_ranges); /* Child info */ void openChildInfoPipe(void); @@ -3429,6 +3451,7 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor); int parseScanCursorOrReply(client *c, robj *o, unsigned long long *cursor); int dbAsyncDelete(serverDb *db, robj *key); void emptyDbAsync(serverDb *db); +void emptyHashtableAsync(serverDb *db, int didx); size_t lazyfreeGetPendingObjectsCount(void); size_t lazyfreeGetFreedObjectsCount(void); void lazyfreeResetStats(void); From ac26e2099cb32b8465bfc666d8aabafff3bdf703 Mon Sep 17 00:00:00 2001 From: Jacob Murphy Date: Wed, 15 Jan 2025 20:59:49 +0000 Subject: [PATCH 02/18] Use slot bitmap everywhere Signed-off-by: Jacob Murphy --- src/aof.c | 24 +++++++---------- src/cluster.c | 55 +++----------------------------------- src/cluster.h | 10 +++---- src/cluster_legacy.c | 34 ++++++++++++++---------- src/cluster_legacy.h | 2 +- src/rdb.c | 9 ++++--- src/rdb.h | 2 +- src/replication.c | 63 ++++++++++++++++++++------------------------ src/server.h | 20 +++++++------- 9 files changed, 82 insertions(+), 137 deletions(-) diff --git a/src/aof.c b/src/aof.c index 5c2691c1ba..3e9bc9d323 100644 --- a/src/aof.c +++ b/src/aof.c @@ -32,6 +32,7 @@ #include "rio.h" #include "functions.h" #include "module.h" +#include "cluster.h" #include #include @@ -2190,27 +2191,20 @@ static int rewriteFunctions(rio *aof) { return 0; } -int shouldFilterSlot(int slot, 
void * slot_ranges) { - if (slot_ranges == NULL) return 0; - list *ranges = (list *)slot_ranges; - listIter li; - listNode *ln; - listRewind(ranges, &li); - while ((ln = listNext(&li))) { - slotRange *range = (slotRange *) ln->value; - if (slot >= range->start && slot <= range->end) return 0; - } - return 1; +int shouldFilterSlot(int slot, void * privdata) { + if (privdata == NULL) return 0; + unsigned char *slot_bitmap = (unsigned char *)privdata; + return !bitmapTestBit(slot_bitmap, slot); } -int rewriteAppendOnlyFileRio(rio *aof, list *slot_ranges) { +int rewriteAppendOnlyFileRio(rio *aof, unsigned char *slot_bitmap) { int j; long key_count = 0; long long updated_time = 0; kvstoreIterator *kvs_it = NULL; /* Record timestamp at the beginning of rewriting AOF. */ - if (server.aof_timestamp_enabled && slot_ranges == NULL) { + if (server.aof_timestamp_enabled && isSlotBitmapAllSlots(slot_bitmap)) { sds ts = genAofTimestampAnnotationIfNeeded(1); if (rioWrite(aof, ts, sdslen(ts)) == 0) { sdsfree(ts); @@ -2230,10 +2224,10 @@ int rewriteAppendOnlyFileRio(rio *aof, list *slot_ranges) { if (rioWrite(aof, selectcmd, sizeof(selectcmd) - 1) == 0) goto werr; if (rioWriteBulkLongLong(aof, j) == 0) goto werr; - if (slot_ranges == NULL) { + if (isSlotBitmapAllSlots(slot_bitmap)) { kvs_it = kvstoreIteratorInit(db->keys); } else { - kvs_it = kvstoreFilteredIteratorInit(db->keys, &shouldFilterSlot, slot_ranges); + kvs_it = kvstoreFilteredIteratorInit(db->keys, &shouldFilterSlot, slot_bitmap); } /* Iterate this DB writing every entry */ void *next; diff --git a/src/cluster.c b/src/cluster.c index 8050cd869d..2e88ff8ba2 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -815,14 +815,10 @@ unsigned int countKeysInSlot(unsigned int slot) { return kvstoreHashtableSize(server.db->keys, slot); } -unsigned int dropKeysInSlotRanges(list *slot_ranges, int async) { +unsigned int dropKeysInSlotBitmap(unsigned char *slot_bitmap, int async) { unsigned int result = 0; - listIter li; - listNode *ln; - listRewind(slot_ranges, &li); - while ((ln = listNext(&li))) { - slotRange *slot_range = (slotRange *) listNodeValue(ln); - for (int i = slot_range->start; i <= slot_range->end; i++) { + for (int i = 0; i < CLUSTER_SLOTS; i++) { + if (bitmapTestBit(slot_bitmap, i)) { result += dropKeysInSlot(i, async); } } @@ -840,51 +836,6 @@ unsigned int dropKeysInSlot(unsigned int hashslot, int async) { return result; } - - -void slotRangesToBitmap(list *slot_ranges, unsigned char *bitmap_out) { - listIter li; - listNode *ln; - listRewind(slot_ranges, &li); - while ((ln = listNext(&li))) { - slotRange *range = (slotRange *) listNodeValue(ln); - for (int i = range->start; i <= range->end; i++) { - bitmapSetBit(bitmap_out, i); - } - } -} - -void bitmapToSlotRanges(unsigned char *bitmap, list **slot_ranges_out) { - *slot_ranges_out = listCreate(); - int range_start = -1; - for (int i = 0; i <= CLUSTER_SLOTS; i++) { - if (i != CLUSTER_SLOTS && bitmapTestBit(bitmap, i)) { - if (range_start == -1) { - range_start = i; - } - } else if (range_start != -1) { - slotRange *range = zmalloc(sizeof(slotRange)); - range->start = range_start; - range->end = i - 1; - range_start = -1; - serverLog(LL_NOTICE, "Got another range: %d-%d", range->start, range->end); - listAddNodeTail(*slot_ranges_out, range); - } - } -} - -void freeSlotRanges(list *slot_ranges) { - listIter li; - listNode *ln; - listRewind(slot_ranges, &li); - while ((ln = listNext(&li))) { - slotRange *range = (slotRange *)ln->value; - zfree(range); - listDelNode(slot_ranges, ln); - } - 
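
/* Self-contained sketch of the slot-filter predicate idea used above: the
 * filtered iterator asks a callback whether to skip a slot, and the callback
 * answers from a bitmap passed as private data. demoBitmapTestBit is a local
 * helper mirroring the bit layout used in the patch, and NUM_SLOTS stands in
 * for CLUSTER_SLOTS. */
#include <stdio.h>
#include <string.h>

#define NUM_SLOTS 16384

static int demoBitmapTestBit(const unsigned char *bitmap, int pos) {
    return (bitmap[pos / 8] & (1 << (pos & 7))) != 0;
}

/* Return 1 when the slot should be skipped, 0 when it should be emitted. */
static int demoShouldFilterSlot(int slot, void *privdata) {
    if (privdata == NULL) return 0;                 /* no filter: keep every slot */
    return !demoBitmapTestBit((unsigned char *)privdata, slot);
}

int main(void) {
    unsigned char bitmap[NUM_SLOTS / 8];
    memset(bitmap, 0, sizeof(bitmap));
    bitmap[0] = 0x03;                               /* keep slots 0 and 1 only */
    int kept = 0;
    for (int slot = 0; slot < NUM_SLOTS; slot++)
        if (!demoShouldFilterSlot(slot, bitmap)) kept++;
    printf("kept %d slots\n", kept);                /* prints: kept 2 slots */
    return 0;
}
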
listRelease(slot_ranges); -} - void clusterCommandHelp(client *c) { const char *help[] = { "COUNTKEYSINSLOT ", diff --git a/src/cluster.h b/src/cluster.h index fd994d1ce7..e6610a8074 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -5,8 +5,6 @@ * Cluster exported API. *----------------------------------------------------------------------------*/ -#define CLUSTER_SLOT_MASK_BITS 14 /* Number of bits used for slot id. */ -#define CLUSTER_SLOTS (1 << CLUSTER_SLOT_MASK_BITS) /* Total number of slots in cluster mode, which is 16384. */ #define CLUSTER_SLOT_MASK ((unsigned long long)(CLUSTER_SLOTS - 1)) /* Bit mask for slot id stored in LSB. */ #define CLUSTER_OK 0 /* Everything looks ok */ #define CLUSTER_FAIL 1 /* The cluster can't work */ @@ -116,14 +114,14 @@ client *createCachedResponseClient(int resp); void deleteCachedResponseClient(client *recording_client); void clearCachedClusterSlotsResponse(void); unsigned int countKeysInSlot(unsigned int hashslot); -unsigned int dropKeysInSlotRanges(list *slot_ranges, int async); +unsigned int dropKeysInSlotBitmap(unsigned char *slot_bitmap, int async); unsigned int dropKeysInSlot(unsigned int hashslot, int async); -void slotRangesToBitmap(list *slot_ranges, unsigned char *bitmap_out); -void bitmapToSlotRanges(unsigned char *bitmap, list **slot_ranges_out); -void freeSlotRanges(list *slot_ranges); +void bitmapToSlotRanges(unsigned char *bitmap, char **slot_bitmap_out); int bitmapTestBit(unsigned char *bitmap, int pos); void bitmapSetBit(unsigned char *bitmap, int pos); void bitmapClearBit(unsigned char *bitmap, int pos); +void bitmapSetAllBits(unsigned char *bitmap, int len); +int isSlotBitmapAllSlots(unsigned char *bitmap); int getSlotOrReply(client *c, robj *o); /* functions with shared implementations */ diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 15a5ee3b7d..95e6e600fe 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -84,7 +84,7 @@ void clusterFreeNodesSlotsInfo(clusterNode *n); uint64_t clusterGetMaxEpoch(void); int clusterBumpConfigEpochWithoutConsensus(void); slotMigration *clusterGetCurrentSlotMigration(void); -void clusterSendMigrateSlotStart(clusterNode *node, list *slot_ranges); +void clusterSendMigrateSlotStart(clusterNode *node, unsigned char *slot_bitmap); void moduleCallClusterReceivers(const char *sender_id, uint64_t module_id, uint8_t type, @@ -4445,13 +4445,13 @@ slotMigration *clusterGetCurrentSlotMigration(void) { return (slotMigration *) listFirst(server.cluster->slot_migrations)->value; } -void clusterSendMigrateSlotStart(clusterNode *node, list *slot_ranges) { +void clusterSendMigrateSlotStart(clusterNode *node, unsigned char *slot_bitmap) { if (!node->link) return; uint32_t msglen = sizeof(clusterMsg) - sizeof(union clusterMsgData) + sizeof(clusterMsgSlotMigration); clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_MIGRATE_SLOT_START, msglen); clusterMsg *hdr = getMessageFromSendBlock(msgblock); - slotRangesToBitmap(slot_ranges, hdr->data.slot_migration.msg.slot_bitmap); + memcpy(hdr->data.slot_migration.msg.slot_bitmap, slot_bitmap, sizeof(hdr->data.slot_migration.msg.slot_bitmap)); clusterSendMessage(node->link, msgblock); clusterMsgSendBlockDecrRefCount(msgblock); } @@ -4482,7 +4482,7 @@ void clusterProceedWithSlotMigration(void) { /* Start the migration */ serverLog(LL_NOTICE, "Starting sync from migration source node %.40s", curr_migration->source_node->name); curr_migration->end_time = mstime() + CLUSTER_SLOT_MIGRATION_TIMEOUT; - curr_migration->link = 
createReplicationLink(curr_migration->source_node->ip, getNodeDefaultReplicationPort(curr_migration->source_node), curr_migration->slot_ranges); + curr_migration->link = createReplicationLink(curr_migration->source_node->ip, getNodeDefaultReplicationPort(curr_migration->source_node), curr_migration->slot_bitmap); if (connectReplicationLink(curr_migration->link) == C_ERR) { serverLog(LL_WARNING, "Failed to begin sync from migration source node %.40s", curr_migration->source_node->name); @@ -4506,7 +4506,7 @@ void clusterProceedWithSlotMigration(void) { return; case SLOT_MIGRATION_PAUSE_OWNER: serverLog(LL_NOTICE, "Replication link to slot owner %.40s has been established. Pausing source node and waiting to continue", curr_migration->source_node->name); - clusterSendMigrateSlotStart(curr_migration->source_node, curr_migration->slot_ranges); + clusterSendMigrateSlotStart(curr_migration->source_node, curr_migration->slot_bitmap); curr_migration->pause_primary_offset = -1; curr_migration->pause_end = mstime() + CLUSTER_MF_TIMEOUT; curr_migration->state = SLOT_MIGRATION_WAITING_FOR_OFFSET; @@ -4524,12 +4524,8 @@ void clusterProceedWithSlotMigration(void) { return; case SLOT_MIGRATION_FINISH: serverLog(LL_NOTICE, "Setting myself to owner of migrating slots and broadcasting"); - listIter li; - listNode *ln; - listRewind(curr_migration->slot_ranges, &li); - while ((ln = listNext(&li))) { - slotRange *range = (slotRange *) ln->value; - for (int i = range->start; i <= range->end; i++) { + for (int i = 0; i < CLUSTER_SLOTS; i++) { + if (bitmapTestBit(curr_migration->slot_bitmap, i)) { clusterDelSlot(i); clusterAddSlot(myself, i); } @@ -4548,8 +4544,7 @@ void clusterProceedWithSlotMigration(void) { /* Delete the migration from the queue and proceed to the next migration */ listDelNode(server.cluster->slot_migrations, curr_node); freeReplicationLink(curr_migration->link); - dropKeysInSlotRanges(curr_migration->slot_ranges, server.repl_replica_lazy_flush); - freeSlotRanges(curr_migration->slot_ranges); + dropKeysInSlotBitmap(curr_migration->slot_bitmap, server.repl_replica_lazy_flush); zfree(curr_migration); continue; } @@ -5597,6 +5592,17 @@ void bitmapClearBit(unsigned char *bitmap, int pos) { bitmap[byte] &= ~(1 << bit); } +void bitmapSetAllBits(unsigned char *bitmap, int len) { + memset(bitmap, 0xff, len); +} + +/* Return if the slot bitmap contains all slots */ +int isSlotBitmapAllSlots(unsigned char *bitmap) { + unsigned char all_slot_bitmap[CLUSTER_SLOTS / 8]; + bitmapSetAllBits(all_slot_bitmap, sizeof(all_slot_bitmap)); + return memcmp(bitmap, all_slot_bitmap, sizeof(all_slot_bitmap)) == 0; +} + /* Return non-zero if there is at least one primary with replicas in the cluster. * Otherwise zero is returned. Used by clusterNodeSetSlotBit() to set the * MIGRATE_TO flag the when a primary gets the first slot. */ @@ -7333,7 +7339,7 @@ int clusterCommandSpecial(client *c) { } slotMigration *to_enqueue = (slotMigration *) zmalloc(sizeof(slotMigration)); - bitmapToSlotRanges(requested_slots, &to_enqueue->slot_ranges); + memcpy(to_enqueue->slot_bitmap, requested_slots, sizeof(requested_slots)); to_enqueue->source_node = curr_owner; to_enqueue->state = SLOT_MIGRATION_QUEUED; to_enqueue->end_time = 0; /* Will be set once started. 
*/ diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index dc157af78b..9a5add854d 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -387,7 +387,7 @@ typedef enum slotMigrationState { } slotMigrationState; typedef struct slotMigration { - list *slot_ranges; + unsigned char slot_bitmap[CLUSTER_SLOTS/8]; slotMigrationState state; clusterNode *source_node; mstime_t end_time; /* Slot migration time limit (ms unixtime). diff --git a/src/rdb.c b/src/rdb.c index 57fae239ad..33fb2c274c 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -38,6 +38,7 @@ #include "bio.h" #include "zmalloc.h" #include "module.h" +#include "cluster.h" #include #include @@ -3526,7 +3527,7 @@ void killRDBChild(void) { /* Spawn an RDB child that writes the RDB to the sockets of the replicas * that are currently in REPLICA_STATE_WAIT_BGSAVE_START state. */ -int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, list *slot_ranges) { +int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, unsigned char *slot_bitmap) { listNode *ln; listIter li; pid_t childpid; @@ -3577,8 +3578,8 @@ int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, list *slot_ranges) { if (replica->repl_data->repl_state == REPLICA_STATE_WAIT_BGSAVE_START) { /* Check replica has the exact requirements */ if (replica->repl_data->replica_req != req) continue; - /* No attempt to coallesce slot ranges, just use equality */ - if (replica->repl_data->slot_ranges != slot_ranges) continue; + /* Check matching slot bitmaps. */ + if (memcmp(replica->repl_data->slot_bitmap, slot_bitmap, CLUSTER_SLOTS/8) != 0) continue; conns[connsnum++] = replica->conn; if (dual_channel) { @@ -3620,7 +3621,7 @@ int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, list *slot_ranges) { if (aof) { serverLog(LL_NOTICE, "Background AOF transfer started by pid %ld", (long)getpid()); - retval = rewriteAppendOnlyFileRio(&rdb, slot_ranges); + retval = rewriteAppendOnlyFileRio(&rdb, slot_bitmap); rioWrite(&rdb, "*3\r\n", 4); rioWriteBulkString(&rdb, "REPLCONF", 8); rioWriteBulkString(&rdb, "SYNC-PAYLOAD-END", 17); diff --git a/src/rdb.h b/src/rdb.h index 440620e5bb..5225933dd6 100644 --- a/src/rdb.h +++ b/src/rdb.h @@ -152,7 +152,7 @@ int rdbSaveObjectType(rio *rdb, robj *o); int rdbLoadObjectType(rio *rdb); int rdbLoad(char *filename, rdbSaveInfo *rsi, int rdbflags); int rdbSaveBackground(int req, char *filename, rdbSaveInfo *rsi, int rdbflags); -int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, list *slotRanges); +int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, unsigned char *slot_bitmap); void rdbRemoveTempFile(pid_t childpid, int from_signal); int rdbSaveToFile(const char *filename); int rdbSave(int req, char *filename, rdbSaveInfo *rsi, int rdbflags); diff --git a/src/replication.c b/src/replication.c index cecfad5ee5..5119eeb408 100644 --- a/src/replication.c +++ b/src/replication.c @@ -953,7 +953,7 @@ int primaryTryPartialResynchronization(client *c, long long psync_offset) { * started. * * Returns C_OK on success or C_ERR otherwise. */ -int startBgsaveForReplication(int mincapa, int req, list *slot_ranges) { +int startBgsaveForReplication(int mincapa, int req, unsigned char *slot_bitmap) { int retval; int socket_target = 0; listIter li; @@ -977,7 +977,7 @@ int startBgsaveForReplication(int mincapa, int req, list *slot_ranges) { * otherwise replica will miss repl-stream-db. 
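
/* Sketch of the "exact match" grouping applied above when attaching waiting
 * replicas to a child save: a replica only joins an in-flight save when both
 * its requirement flags and its slot bitmap are byte-for-byte identical. The
 * struct and helper are illustrative stand-ins for the real client fields. */
#include <string.h>

#define NUM_SLOTS 16384

typedef struct demoWaitingReplica {
    int req_flags;                              /* REPLICA_REQ_* style bitmask */
    unsigned char slot_bitmap[NUM_SLOTS / 8];
} demoWaitingReplica;

static int demoReplicaMatchesSave(const demoWaitingReplica *r, int save_req,
                                  const unsigned char *save_bitmap) {
    if (r->req_flags != save_req) return 0;
    /* No attempt to coalesce overlapping bitmaps: equality only. */
    return memcmp(r->slot_bitmap, save_bitmap, NUM_SLOTS / 8) == 0;
}
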
*/ if (rsiptr) { if (socket_target) - retval = rdbSaveToReplicasSockets(req, rsiptr, slot_ranges); + retval = rdbSaveToReplicasSockets(req, rsiptr, slot_bitmap); else { /* Keep the page cache since it'll get used soon */ retval = rdbSaveBackground(req, server.rdb_filename, rsiptr, RDBFLAGS_REPLICATION | RDBFLAGS_KEEP_CACHE); @@ -1099,7 +1099,7 @@ void syncCommand(client *c) { } /* Fail sync if it is asking for AOF format and a slot is not set via REPLCONF already. */ - if (c->repl_data->replica_req & REPLICA_REQ_AOF_FORMAT && c->repl_data->slot_ranges == NULL) { + if (c->repl_data->replica_req & REPLICA_REQ_AOF_FORMAT && isSlotBitmapAllSlots(c->repl_data->slot_bitmap)) { addReplyError(c, "AOF format is only supported for single slot SYNC"); return; } @@ -1180,7 +1180,7 @@ void syncCommand(client *c) { } /* For slot level replication, we make no attempt to coallesce BGSAVEs */ - int require_dedicated = c->repl_data->slot_ranges != NULL; + int require_dedicated = !isSlotBitmapAllSlots(c->repl_data->slot_bitmap); /* CASE 1: BGSAVE is in progress, with disk target. */ if (!require_dedicated && server.child_type == CHILD_TYPE_RDB && server.rdb_child_type == RDB_CHILD_TYPE_DISK) { @@ -1244,7 +1244,7 @@ void syncCommand(client *c) { } /* CASE 5: We are good to start a BGSAVE. Diskless or disk-based mode is determined by replica's capacity. */ - startBgsaveForReplication(c->repl_data->replica_capa, c->repl_data->replica_req, c->repl_data->slot_ranges); + startBgsaveForReplication(c->repl_data->replica_capa, c->repl_data->replica_req, c->repl_data->slot_bitmap); return; } @@ -1268,6 +1268,7 @@ int anyOtherReplicaWaitRdb(client *except_me) { void initClientReplicationData(client *c) { if (c->repl_data) return; c->repl_data = (ClientReplicationData *)zcalloc(sizeof(ClientReplicationData)); + bitmapSetAllBits(c->repl_data->slot_bitmap, sizeof(c->repl_data->slot_bitmap)); } void freeClientReplicationData(client *c) { @@ -1312,9 +1313,6 @@ void freeClientReplicationData(client *c) { replicationHandleSourceDisconnection(c->repl_data->link); } sdsfree(c->repl_data->replica_addr); - if (c->repl_data->slot_ranges) { - freeSlotRanges(c->repl_data->slot_ranges); - } zfree(c->repl_data); c->repl_data = NULL; } @@ -1515,23 +1513,20 @@ void replconfCommand(client *c) { if (!server.cluster_enabled) { addReplyError(c, "Cannot replicate a slot when cluster mode is disabled"); } - if (c->repl_data->slot_ranges != NULL) { + if (!isSlotBitmapAllSlots(c->repl_data->slot_bitmap)) { addReplyError(c, "Slot bitmap already set"); } if (stringObjectLen(c->argv[j + 1]) != CLUSTER_SLOTS / 8) { addReplyError(c, "Invalid slot bitmap length"); return; } - list *slot_ranges; - bitmapToSlotRanges(c->argv[j + 1]->ptr, &slot_ranges); for (int slot = 0; slot <= CLUSTER_SLOTS; slot++) { if (bitmapTestBit(c->argv[j + 1]->ptr, slot) && server.cluster->slots[slot] != server.cluster->myself) { addReplyErrorFormat(c, "I cannot replicate slot %d since I do not own it", slot); - freeSlotRanges(slot_ranges); return; } } - c->repl_data->slot_ranges = slot_ranges; + memcpy(c->repl_data->slot_bitmap, c->argv[j + 1]->ptr, CLUSTER_SLOTS / 8); /* For now, we only support AOF for slot transfer. 
*/ c->repl_data->replica_req |= REPLICA_REQ_AOF_FORMAT; @@ -1993,7 +1988,7 @@ void shiftReplicationId(void) { char *replicationGetNameForLogs(replicationLink *link) { if (link == server.primary) return "PRIMARY"; - if (link->slot_ranges != NULL) + if (!isSlotBitmapAllSlots(link->slot_bitmap)) return "SLOT OWNER"; return "OTHER REPLICATION SOURCE"; } @@ -2079,7 +2074,7 @@ client *createReplicationLinkClientWithHandler(replicationLink *link, connection * PSYNC capable, so we flag it accordingly. */ if (c->repl_data->reploff == -1) c->flag.pre_psync = 1; if (dbid != -1) selectDb(c, dbid); - c->repl_data->slot_ranges = link->slot_ranges; + memcpy(c->repl_data->slot_bitmap, link->slot_bitmap, sizeof(c->repl_data->slot_bitmap)); return c; } @@ -2247,7 +2242,7 @@ void readSyncBulkPayload(connection *conn) { replicationLink *link = (replicationLink *)connGetPrivateData(conn); /* RDB bulk load will only be used if we are sending all slots. */ - serverAssert(link->slot_ranges == NULL); + serverAssert(isSlotBitmapAllSlots(link->slot_bitmap)); /* Static vars used to hold the EOF mark, and the last bytes received * from the server: when they match, we reached the end of the transfer. */ @@ -2879,7 +2874,7 @@ static int dualChannelReplHandleReplconfReply(replicationLink *link, sds *err) { } int replicationUseAOFFormatSnapshot(replicationLink *link) { - return link->slot_ranges != NULL; + return !isSlotBitmapAllSlots(link->slot_bitmap); } static int dualChannelReplHandleEndOffsetResponse(replicationLink *link, sds *err) { @@ -3731,13 +3726,10 @@ void syncWithSource(connection *conn) { } /* Set the slot number, so that the primary only provides us with the appropriate slot dictionary. */ - if (link->slot_ranges != NULL) { + if (!isSlotBitmapAllSlots(link->slot_bitmap)) { char *argv[3] = {"REPLCONF", "slot-bitmap", NULL}; size_t lens[3] = {8, 11, 0}; - unsigned char slot_bitmap[CLUSTER_SLOTS/8 + 1] = {0}; - slotRangesToBitmap(link->slot_ranges, slot_bitmap); - slot_bitmap[CLUSTER_SLOTS/8] = '\0'; - argv[2] = (char *)slot_bitmap; + argv[2] = (char *)link->slot_bitmap; lens[2] = CLUSTER_SLOTS/8; err = sendCommandArgv(conn, 3, argv, lens); if (err) goto write_error; @@ -3817,7 +3809,7 @@ void syncWithSource(connection *conn) { return; } - if (link->state == REPL_STATE_RECEIVE_SLOT_REPLY && link->slot_ranges == NULL) + if (link->state == REPL_STATE_RECEIVE_SLOT_REPLY && isSlotBitmapAllSlots(link->slot_bitmap)) link->state = REPL_STATE_RECEIVE_CAPA_REPLY; if (link->state == REPL_STATE_RECEIVE_SLOT_REPLY) { @@ -3937,7 +3929,7 @@ void syncWithSource(connection *conn) { } /* Prepare a suitable temp file for bulk transfer */ - if (!useDisklessLoad() && link->slot_ranges == NULL) { + if (!useDisklessLoad() && isSlotBitmapAllSlots(link->slot_bitmap)) { int dfd = -1, maxtries = 5; while (maxtries--) { snprintf(tmpfile, 256, "temp-%d.%ld.rdb", (int)server.unixtime, (long int)getpid()); @@ -3960,8 +3952,8 @@ void syncWithSource(connection *conn) { /* We are going to need to do a full resync. If we are accepting a single * slot - make sure we have a clean slate to load it into.*/ - if (link->slot_ranges != NULL) { - dropKeysInSlotRanges(link->slot_ranges, 1); + if (!isSlotBitmapAllSlots(link->slot_bitmap)) { + dropKeysInSlotBitmap(link->slot_bitmap, 1); } /* Using dual-channel-replication, the primary responded +DUALCHANNELSYNC. 
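
/* Standalone sketch of the REPLCONF slot-bitmap validation steps shown above,
 * with a trivial ownership table standing in for the real cluster state: the
 * bitmap must have the exact expected length, and may only name slots the
 * receiving node currently owns. */
#include <stddef.h>

#define NUM_SLOTS 16384

static int demoBitmapTestBit(const unsigned char *bitmap, int pos) {
    return (bitmap[pos / 8] & (1 << (pos & 7))) != 0;
}

/* owned[i] is non-zero when this node owns slot i. Returns 1 on success,
 * 0 on failure with an error name left in *err. */
static int demoValidateSlotBitmap(const unsigned char *bitmap, size_t len,
                                  const unsigned char *owned, const char **err) {
    if (len != NUM_SLOTS / 8) {
        *err = "Invalid slot bitmap length";
        return 0;
    }
    for (int slot = 0; slot < NUM_SLOTS; slot++) {
        if (demoBitmapTestBit(bitmap, slot) && !owned[slot]) {
            *err = "cannot replicate a slot this node does not own";
            return 0;
        }
    }
    return 1;
}
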
We need to @@ -4029,12 +4021,12 @@ void syncWithSource(connection *conn) { goto error; } -replicationLink *createReplicationLink(char *host, int port, list *slot_ranges) { +replicationLink *createReplicationLink(char *host, int port, unsigned char *slot_bitmap) { replicationLink *result = (replicationLink *)zmalloc(sizeof(replicationLink)); result->protected = 0; result->state = REPL_STATE_NONE; result->rdb_channel_state = REPL_DUAL_CHANNEL_STATE_NONE; - result->slot_ranges = slot_ranges; + memcpy(result->slot_bitmap, slot_bitmap, sizeof(result->slot_bitmap)); result->client = NULL; result->host = sdsnew(host); result->port = port; @@ -5133,7 +5125,7 @@ void replicationCron(void) { replication_cron_loops++; /* Incremented with frequency 1 HZ. */ } -int shouldStartChildReplication(int *mincapa_out, int *req_out, list **slot_ranges_out) { +int shouldStartChildReplication(int *mincapa_out, int *req_out, unsigned char *slot_bitmap_out) { /* We should start a BGSAVE good for replication if we have replicas in * WAIT_BGSAVE_START state. * @@ -5145,7 +5137,7 @@ int shouldStartChildReplication(int *mincapa_out, int *req_out, list **slot_rang int replicas_waiting = 0; int mincapa; int req; - list *slot_ranges; + unsigned char slot_bitmap[CLUSTER_SLOTS/8]; int first = 1; listNode *ln; listIter li; @@ -5157,7 +5149,7 @@ int shouldStartChildReplication(int *mincapa_out, int *req_out, list **slot_rang if (first) { /* Get first replica's requirements */ req = replica->repl_data->replica_req; - slot_ranges = replica->repl_data->slot_ranges; + memcpy(slot_bitmap, replica->repl_data->slot_bitmap, sizeof(slot_bitmap)); } else if (req != replica->repl_data->replica_req) { /* Skip replicas that don't match */ continue; @@ -5176,7 +5168,7 @@ int shouldStartChildReplication(int *mincapa_out, int *req_out, list **slot_rang max_idle >= server.repl_diskless_sync_delay)) { if (mincapa_out) *mincapa_out = mincapa; if (req_out) *req_out = req; - if (slot_ranges_out) *slot_ranges_out = slot_ranges; + if (slot_bitmap_out) memcpy(slot_bitmap_out, slot_bitmap, sizeof(slot_bitmap)); return 1; } } @@ -5187,13 +5179,14 @@ int shouldStartChildReplication(int *mincapa_out, int *req_out, list **slot_rang void replicationStartPendingFork(void) { int mincapa = -1; int req = -1; - list *slot_ranges = NULL; + unsigned char slot_bitmap[CLUSTER_SLOTS/8]; + bitmapSetAllBits(slot_bitmap, sizeof(slot_bitmap)); - if (shouldStartChildReplication(&mincapa, &req, &slot_ranges)) { + if (shouldStartChildReplication(&mincapa, &req, slot_bitmap)) { /* Start the BGSAVE. The called function may start a * BGSAVE with socket target or disk target depending on the * configuration and replicas capabilities and requirements. */ - startBgsaveForReplication(mincapa, req, slot_ranges); + startBgsaveForReplication(mincapa, req, slot_bitmap); } } diff --git a/src/server.h b/src/server.h index 1bd78f57f6..5d7b8db461 100644 --- a/src/server.h +++ b/src/server.h @@ -153,6 +153,8 @@ struct hdr_histogram; #else #define CONFIG_ACTIVE_DEFRAG_DEFAULT 1 #endif +#define CLUSTER_SLOT_MASK_BITS 14 /* Number of bits used for slot id. */ +#define CLUSTER_SLOTS (1 << CLUSTER_SLOT_MASK_BITS) /* Total number of slots in cluster mode, which is 16384. */ /* Bucket sizes for client eviction pools. Each bucket stores clients with * memory usage of up to twice the size of the bucket below it. 
*/ @@ -1133,12 +1135,12 @@ typedef struct ClientReplicationData { short replica_req; /* Replica requirements: REPLICA_REQ_* */ uint64_t associated_rdb_client_id; /* The client id of this replica's rdb connection */ time_t rdb_client_disconnect_time; /* Time of the first freeClient call on this client. Used for delaying free. */ - listNode *ref_repl_buf_node; /* Referenced node of replication buffer blocks, - see the definition of replBufBlock. */ - size_t ref_block_pos; /* Access position of referenced buffer block, - i.e. the next offset to send. */ - list *slot_ranges; /* The slot range this replica is replicating for. */ - replicationLink *link; /* The replication link owning this. */ + listNode *ref_repl_buf_node; /* Referenced node of replication buffer blocks, + see the definition of replBufBlock. */ + size_t ref_block_pos; /* Access position of referenced buffer block, + i.e. the next offset to send. */ + unsigned char slot_bitmap[CLUSTER_SLOTS/8]; /* The slot range this replica is replicating for. */ + replicationLink *link; /* The replication link owning this. */ } ClientReplicationData; typedef struct ClientModuleData { @@ -1576,7 +1578,7 @@ typedef struct replicationLink { int dbid; } provisional_source_state; /* Information about the provisional state (after RDB) for the source node, stored during dual channel sync. */ replDataBuf pending_repl_data; /* Replication data buffer for dual-channel-replication */ - list *slot_ranges; /* Slot range used for slot import. */ + unsigned char slot_bitmap[CLUSTER_SLOTS/8]; /* Slot range used for slot import. */ } replicationLink; struct valkeyServer { @@ -2763,7 +2765,7 @@ void ioThreadWriteToClient(void *data); int canParseCommand(client *c); int processIOThreadsReadDone(void); int processIOThreadsWriteDone(void); -replicationLink *createReplicationLink(char *host, int port, list *slot_ranges); +replicationLink *createReplicationLink(char *host, int port, unsigned char *slot_bitmap); int connectReplicationLink(replicationLink *link); int freeReplicationLink(replicationLink *link); @@ -2995,7 +2997,7 @@ void aofOpenIfNeededOnServerStart(void); void aofManifestFree(aofManifest *am); int aofDelHistoryFiles(void); int aofRewriteLimited(void); -int rewriteAppendOnlyFileRio(rio *aof, list *slot_ranges); +int rewriteAppendOnlyFileRio(rio *aof, unsigned char *slot_bitmap); /* Child info */ void openChildInfoPipe(void); From 6c83496c45db364d3bf4f3e20ac69cea7a334cad Mon Sep 17 00:00:00 2001 From: Jacob Murphy Date: Wed, 15 Jan 2025 22:45:19 +0000 Subject: [PATCH 03/18] Iterative improvements to get dual channel working Signed-off-by: Jacob Murphy --- src/aof.c | 2 +- src/cluster.c | 2 +- src/cluster.h | 7 ++-- src/cluster_legacy.c | 25 +++++++----- src/cluster_legacy.h | 4 +- src/rdb.c | 4 +- src/rdb.h | 2 +- src/replication.c | 95 ++++++++++++++++++++++++++++++++++---------- src/server.h | 18 +++++---- 9 files changed, 109 insertions(+), 50 deletions(-) diff --git a/src/aof.c b/src/aof.c index 3e9bc9d323..06f04760ca 100644 --- a/src/aof.c +++ b/src/aof.c @@ -2197,7 +2197,7 @@ int shouldFilterSlot(int slot, void * privdata) { return !bitmapTestBit(slot_bitmap, slot); } -int rewriteAppendOnlyFileRio(rio *aof, unsigned char *slot_bitmap) { +int rewriteAppendOnlyFileRio(rio *aof, slotBitmap slot_bitmap) { int j; long key_count = 0; long long updated_time = 0; diff --git a/src/cluster.c b/src/cluster.c index 2e88ff8ba2..d7e7be52af 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -815,7 +815,7 @@ unsigned int countKeysInSlot(unsigned int slot) { return 
kvstoreHashtableSize(server.db->keys, slot); } -unsigned int dropKeysInSlotBitmap(unsigned char *slot_bitmap, int async) { +unsigned int dropKeysInSlotBitmap(slotBitmap slot_bitmap, int async) { unsigned int result = 0; for (int i = 0; i < CLUSTER_SLOTS; i++) { if (bitmapTestBit(slot_bitmap, i)) { diff --git a/src/cluster.h b/src/cluster.h index e6610a8074..9b050d0b70 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -114,14 +114,15 @@ client *createCachedResponseClient(int resp); void deleteCachedResponseClient(client *recording_client); void clearCachedClusterSlotsResponse(void); unsigned int countKeysInSlot(unsigned int hashslot); -unsigned int dropKeysInSlotBitmap(unsigned char *slot_bitmap, int async); +unsigned int dropKeysInSlotBitmap(slotBitmap slot_bitmap, int async); unsigned int dropKeysInSlot(unsigned int hashslot, int async); -void bitmapToSlotRanges(unsigned char *bitmap, char **slot_bitmap_out); +void bitmapToSlotRanges(unsigned char *bitmap, slotBitmap slot_bitmap_out); int bitmapTestBit(unsigned char *bitmap, int pos); void bitmapSetBit(unsigned char *bitmap, int pos); void bitmapClearBit(unsigned char *bitmap, int pos); void bitmapSetAllBits(unsigned char *bitmap, int len); -int isSlotBitmapAllSlots(unsigned char *bitmap); +void slotBitmapSetAll(slotBitmap bitmap); +int isSlotBitmapAllSlots(slotBitmap bitmap); int getSlotOrReply(client *c, robj *o); /* functions with shared implementations */ diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 95e6e600fe..0e07057856 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -84,7 +84,7 @@ void clusterFreeNodesSlotsInfo(clusterNode *n); uint64_t clusterGetMaxEpoch(void); int clusterBumpConfigEpochWithoutConsensus(void); slotMigration *clusterGetCurrentSlotMigration(void); -void clusterSendMigrateSlotStart(clusterNode *node, unsigned char *slot_bitmap); +void clusterSendMigrateSlotStart(clusterNode *node, slotBitmap slot_bitmap); void moduleCallClusterReceivers(const char *sender_id, uint64_t module_id, uint8_t type, @@ -4445,13 +4445,13 @@ slotMigration *clusterGetCurrentSlotMigration(void) { return (slotMigration *) listFirst(server.cluster->slot_migrations)->value; } -void clusterSendMigrateSlotStart(clusterNode *node, unsigned char *slot_bitmap) { +void clusterSendMigrateSlotStart(clusterNode *node, slotBitmap slot_bitmap) { if (!node->link) return; uint32_t msglen = sizeof(clusterMsg) - sizeof(union clusterMsgData) + sizeof(clusterMsgSlotMigration); clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_MIGRATE_SLOT_START, msglen); clusterMsg *hdr = getMessageFromSendBlock(msgblock); - memcpy(hdr->data.slot_migration.msg.slot_bitmap, slot_bitmap, sizeof(hdr->data.slot_migration.msg.slot_bitmap)); + memcpy(hdr->data.slot_migration.msg.slot_bitmap, slot_bitmap, sizeof(slotBitmap)); clusterSendMessage(node->link, msgblock); clusterMsgSendBlockDecrRefCount(msgblock); } @@ -5592,15 +5592,20 @@ void bitmapClearBit(unsigned char *bitmap, int pos) { bitmap[byte] &= ~(1 << bit); } -void bitmapSetAllBits(unsigned char *bitmap, int len) { - memset(bitmap, 0xff, len); +void slotBitmapSetAll(slotBitmap bitmap) { + memset(bitmap, 0xff, sizeof(slotBitmap)); +} + +int slotBitmapCompare(slotBitmap bitmap, slotBitmap otherbitmap) { + return memcmp(bitmap, otherbitmap, sizeof(slotBitmap)); } /* Return if the slot bitmap contains all slots */ -int isSlotBitmapAllSlots(unsigned char *bitmap) { - unsigned char all_slot_bitmap[CLUSTER_SLOTS / 8]; - bitmapSetAllBits(all_slot_bitmap, sizeof(all_slot_bitmap)); 
- return memcmp(bitmap, all_slot_bitmap, sizeof(all_slot_bitmap)) == 0; +int isSlotBitmapAllSlots(slotBitmap bitmap) { + if (!bitmap) return 1; + slotBitmap all_slot_bitmap; + slotBitmapSetAll(all_slot_bitmap); + return slotBitmapCompare(bitmap, all_slot_bitmap) == 0; } /* Return non-zero if there is at least one primary with replicas in the cluster. @@ -7339,7 +7344,7 @@ int clusterCommandSpecial(client *c) { } slotMigration *to_enqueue = (slotMigration *) zmalloc(sizeof(slotMigration)); - memcpy(to_enqueue->slot_bitmap, requested_slots, sizeof(requested_slots)); + memcpy(to_enqueue->slot_bitmap, requested_slots, sizeof(slotBitmap)); to_enqueue->source_node = curr_owner; to_enqueue->state = SLOT_MIGRATION_QUEUED; to_enqueue->end_time = 0; /* Will be set once started. */ diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index 9a5add854d..1b83c1b2f5 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -147,7 +147,7 @@ typedef struct { } clusterMsgModule; typedef struct { - unsigned char slot_bitmap[CLUSTER_SLOTS / 8]; /* Slots bitmap. */ + slotBitmap slot_bitmap; } clusterMsgSlotMigration; /* The cluster supports optional extension messages that can be sent @@ -387,7 +387,7 @@ typedef enum slotMigrationState { } slotMigrationState; typedef struct slotMigration { - unsigned char slot_bitmap[CLUSTER_SLOTS/8]; + slotBitmap slot_bitmap; slotMigrationState state; clusterNode *source_node; mstime_t end_time; /* Slot migration time limit (ms unixtime). diff --git a/src/rdb.c b/src/rdb.c index 33fb2c274c..4a3a7e1c8e 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -3527,7 +3527,7 @@ void killRDBChild(void) { /* Spawn an RDB child that writes the RDB to the sockets of the replicas * that are currently in REPLICA_STATE_WAIT_BGSAVE_START state. */ -int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, unsigned char *slot_bitmap) { +int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, slotBitmap slot_bitmap) { listNode *ln; listIter li; pid_t childpid; @@ -3579,7 +3579,7 @@ int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, unsigned char *slot_bitm /* Check replica has the exact requirements */ if (replica->repl_data->replica_req != req) continue; /* Check matching slot bitmaps. */ - if (memcmp(replica->repl_data->slot_bitmap, slot_bitmap, CLUSTER_SLOTS/8) != 0) continue; + if (memcmp(replica->repl_data->slot_bitmap, slot_bitmap, sizeof(slotBitmap)) != 0) continue; conns[connsnum++] = replica->conn; if (dual_channel) { diff --git a/src/rdb.h b/src/rdb.h index 5225933dd6..734ae7ba72 100644 --- a/src/rdb.h +++ b/src/rdb.h @@ -152,7 +152,7 @@ int rdbSaveObjectType(rio *rdb, robj *o); int rdbLoadObjectType(rio *rdb); int rdbLoad(char *filename, rdbSaveInfo *rsi, int rdbflags); int rdbSaveBackground(int req, char *filename, rdbSaveInfo *rsi, int rdbflags); -int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, unsigned char *slot_bitmap); +int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, slotBitmap slot_bitmap); void rdbRemoveTempFile(pid_t childpid, int from_signal); int rdbSaveToFile(const char *filename); int rdbSave(int req, char *filename, rdbSaveInfo *rsi, int rdbflags); diff --git a/src/replication.c b/src/replication.c index 5119eeb408..c50cebecfd 100644 --- a/src/replication.c +++ b/src/replication.c @@ -953,7 +953,7 @@ int primaryTryPartialResynchronization(client *c, long long psync_offset) { * started. * * Returns C_OK on success or C_ERR otherwise. 
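Standardizing on sizeof(slotBitmap) rather than sizeof of a variable is deliberate: a slotBitmap passed as a function parameter decays to unsigned char *, so sizeof on the parameter silently yields the pointer size instead of 2048. A short illustration of the pitfall the explicit form avoids (sketch, not part of the patch):

    void copySlots(slotBitmap dst, slotBitmap src) {
        /* sizeof(src) here is sizeof(unsigned char *), typically 8, because
         * array parameters decay to pointers; sizeof(slotBitmap) is always
         * CLUSTER_SLOTS / 8, which is what we actually want to copy. */
        memcpy(dst, src, sizeof(slotBitmap));
    }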
*/ -int startBgsaveForReplication(int mincapa, int req, unsigned char *slot_bitmap) { +int startBgsaveForReplication(int mincapa, int req, slotBitmap slot_bitmap) { int retval; int socket_target = 0; listIter li; @@ -1268,7 +1268,7 @@ int anyOtherReplicaWaitRdb(client *except_me) { void initClientReplicationData(client *c) { if (c->repl_data) return; c->repl_data = (ClientReplicationData *)zcalloc(sizeof(ClientReplicationData)); - bitmapSetAllBits(c->repl_data->slot_bitmap, sizeof(c->repl_data->slot_bitmap)); + slotBitmapSetAll(c->repl_data->slot_bitmap); } void freeClientReplicationData(client *c) { @@ -2074,7 +2074,7 @@ client *createReplicationLinkClientWithHandler(replicationLink *link, connection * PSYNC capable, so we flag it accordingly. */ if (c->repl_data->reploff == -1) c->flag.pre_psync = 1; if (dbid != -1) selectDb(c, dbid); - memcpy(c->repl_data->slot_bitmap, link->slot_bitmap, sizeof(c->repl_data->slot_bitmap)); + memcpy(c->repl_data->slot_bitmap, link->slot_bitmap, sizeof(slotBitmap)); return c; } @@ -2816,7 +2816,7 @@ static int dualChannelReplHandleHandshake(replicationLink *link, sds *err) { args[argc] = server.primary_auth; lens[argc] = sdslen(server.primary_auth); argc++; - *err = sendCommandArgv(link->transfer_s, argc, args, lens); + *err = sendCommandArgv(link->rdb_transfer_s, argc, args, lens); if (*err) { dualChannelServerLog(LL_WARNING, "Sending command to primary in dual channel replication handshake: %s", *err); return C_ERR; @@ -2824,7 +2824,7 @@ static int dualChannelReplHandleHandshake(replicationLink *link, sds *err) { } /* Send replica listening port to primary for clarification */ sds portstr = getReplicaPortString(); - *err = sendCommand(link->transfer_s, "REPLCONF", "capa", "eof", "rdb-only", "1", "rdb-channel", "1", "listening-port", portstr, + *err = sendCommand(link->rdb_transfer_s, "REPLCONF", "capa", "eof", "rdb-only", "1", "rdb-channel", "1", "listening-port", portstr, NULL); sdsfree(portstr); if (*err) { @@ -2832,7 +2832,20 @@ static int dualChannelReplHandleHandshake(replicationLink *link, sds *err) { return C_ERR; } - if (connSetReadHandler(link->transfer_s, dualChannelFullSyncWithReplicationSource) == C_ERR) { + /* Send slot bitmap, if it is needed */ + if (!isSlotBitmapAllSlots(link->slot_bitmap)) { + char *args[] = {"REPLCONF", "slot-bitmap", NULL}; + size_t lens[] = {8, 11, 0}; + args[2] = (char *) link->slot_bitmap; + lens[2] = sizeof(slotBitmap); + *err = sendCommandArgv(link->rdb_transfer_s, 3, args, lens); + if (*err) { + dualChannelServerLog(LL_WARNING, "Sending REPLCONF slot-bitmap command to primary in dual channel replication handshake: %s", *err); + return C_ERR; + } + } + + if (connSetReadHandler(link->rdb_transfer_s, dualChannelFullSyncWithReplicationSource) == C_ERR) { char conninfo[CONN_INFO_LEN]; dualChannelServerLog(LL_WARNING, "Can't create readable event for SYNC: %s (%s)", strerror(errno), connGetInfo(link->transfer_s, conninfo, sizeof(conninfo))); @@ -2842,7 +2855,7 @@ static int dualChannelReplHandleHandshake(replicationLink *link, sds *err) { } static int dualChannelReplHandleAuthReply(replicationLink *link, sds *err) { - *err = receiveSynchronousResponse(link->transfer_s); + *err = receiveSynchronousResponse(link->rdb_transfer_s); if (*err == NULL) { dualChannelServerLog(LL_WARNING, "Primary did not respond to auth command during SYNC handshake"); return C_ERR; @@ -2855,7 +2868,7 @@ static int dualChannelReplHandleAuthReply(replicationLink *link, sds *err) { } static int dualChannelReplHandleReplconfReply(replicationLink 
*link, sds *err) { - *err = receiveSynchronousResponse(link->transfer_s); + *err = receiveSynchronousResponse(link->rdb_transfer_s); if (*err == NULL) { dualChannelServerLog(LL_WARNING, "Primary did not respond to replconf command during SYNC handshake"); return C_ERR; @@ -2866,8 +2879,24 @@ static int dualChannelReplHandleReplconfReply(replicationLink *link, sds *err) { *err); return C_ERR; } - if (connSyncWrite(link->transfer_s, "SYNC\r\n", 6, server.repl_syncio_timeout * 1000) == -1) { - dualChannelServerLog(LL_WARNING, "I/O error writing to Primary: %s", connGetLastError(link->transfer_s)); + + /* Recieve slot bitmap response as well. */ + if (!isSlotBitmapAllSlots(link->slot_bitmap)) { + *err = receiveSynchronousResponse(link->rdb_transfer_s); + if (*err == NULL) { + dualChannelServerLog(LL_WARNING, "Primary did not respond to replconf slot-bitmap command during SYNC handshake"); + return C_ERR; + } + + if (*err[0] == '-') { + dualChannelServerLog(LL_NOTICE, "Server does not support sync with slot-bitmap, dual channel sync approach cannot be used: %s", + *err); + return C_ERR; + } + } + + if (connSyncWrite(link->rdb_transfer_s, "SYNC\r\n", 6, server.repl_syncio_timeout * 1000) == -1) { + dualChannelServerLog(LL_WARNING, "I/O error writing to Primary: %s", connGetLastError(link->rdb_transfer_s)); return C_ERR; } return C_OK; @@ -2879,7 +2908,7 @@ int replicationUseAOFFormatSnapshot(replicationLink *link) { static int dualChannelReplHandleEndOffsetResponse(replicationLink *link, sds *err) { uint64_t rdb_client_id; - *err = receiveSynchronousResponse(link->transfer_s); + *err = receiveSynchronousResponse(link->rdb_transfer_s); if (*err == NULL) { return C_ERR; } @@ -2996,6 +3025,10 @@ static void dualChannelFullSyncWithReplicationSource(connection *conn) { link->transfer_fd = -1; link->state = REPL_STATE_CONNECT; replicationAbortDualChannelSyncTransfer(link); + if (link->client) { + freeClient(link->client); + link->client = NULL; + } } /* Replication: Replica side. @@ -3730,7 +3763,7 @@ void syncWithSource(connection *conn) { char *argv[3] = {"REPLCONF", "slot-bitmap", NULL}; size_t lens[3] = {8, 11, 0}; argv[2] = (char *)link->slot_bitmap; - lens[2] = CLUSTER_SLOTS/8; + lens[2] = sizeof(slotBitmap); err = sendCommandArgv(conn, 3, argv, lens); if (err) goto write_error; } @@ -3950,8 +3983,10 @@ void syncWithSource(connection *conn) { link->transfer_fd = dfd; } - /* We are going to need to do a full resync. If we are accepting a single - * slot - make sure we have a clean slate to load it into.*/ + /* We are going to need to do a full resync. If we are accepting a + * slot subset - make sure we have a clean state to load it into. This may + * happen in cases where a previous replication attempt failed and is being + * retried. 
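Because the slot bitmap is raw binary (2048 bytes, not NUL-terminated text), both handshake paths send it with sendCommandArgv and explicit argument lengths rather than the printf-style sendCommand. Illustratively, the request that ends up on the wire is an ordinary RESP command whose last bulk argument is the bitmap (byte count shown for the default 16384 slots; the trace below is illustrative only):

    *3\r\n
    $8\r\nREPLCONF\r\n
    $11\r\nslot-bitmap\r\n
    $2048\r\n<2048 bitmap bytes>\r\n

The source is expected to acknowledge with a simple status reply; any error reply is treated as "this primary does not support slot-bitmap sync" and the attempt is aborted.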
*/ if (!isSlotBitmapAllSlots(link->slot_bitmap)) { dropKeysInSlotBitmap(link->slot_bitmap, 1); } @@ -3961,6 +3996,7 @@ void syncWithSource(connection *conn) { if (psync_result == PSYNC_FULLRESYNC_DUAL_CHANNEL) { /* Create RDB connection */ link->rdb_transfer_s = connCreate(connTypeOfReplication()); + connSetPrivateData(link->rdb_transfer_s, link); if (connConnect(link->rdb_transfer_s, link->host, link->port, server.bind_source_addr, dualChannelFullSyncWithReplicationSource) == C_ERR) { serverLog(LL_WARNING, "Unable to connect to source: %s", connGetLastError(link->transfer_s)); @@ -4013,6 +4049,10 @@ void syncWithSource(connection *conn) { link->transfer_tmpfile = NULL; link->transfer_fd = -1; link->state = REPL_STATE_CONNECT; + if (link->client) { + freeClient(link->client); + link->client = NULL; + } return; write_error: /* Handle sendCommand() errors. */ @@ -4021,12 +4061,12 @@ void syncWithSource(connection *conn) { goto error; } -replicationLink *createReplicationLink(char *host, int port, unsigned char *slot_bitmap) { +replicationLink *createReplicationLink(char *host, int port, slotBitmap slot_bitmap) { replicationLink *result = (replicationLink *)zmalloc(sizeof(replicationLink)); result->protected = 0; result->state = REPL_STATE_NONE; result->rdb_channel_state = REPL_DUAL_CHANNEL_STATE_NONE; - memcpy(result->slot_bitmap, slot_bitmap, sizeof(result->slot_bitmap)); + memcpy(result->slot_bitmap, slot_bitmap, sizeof(slotBitmap)); result->client = NULL; result->host = sdsnew(host); result->port = port; @@ -4306,6 +4346,17 @@ void replicationHandleSourceDisconnection(replicationLink *link) { link->client = NULL; link->state = REPL_STATE_CONNECT; + if (link->rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) { + /* Our client was closed in the middle of dual channel (e.g, we were + * loading AOF as a client). Ensure that the other dual channel + * connections are cleaned up. */ + if (link->transfer_s) { + connClose(link->transfer_s); + link->transfer_s = NULL; + } + replicationAbortDualChannelSyncTransfer(link); + } + /* Try to re-connect immediately rather than wait for replicationCron * waiting 1 second may risk backlog being recycled. */ if (link->host) { @@ -5125,7 +5176,7 @@ void replicationCron(void) { replication_cron_loops++; /* Incremented with frequency 1 HZ. */ } -int shouldStartChildReplication(int *mincapa_out, int *req_out, unsigned char *slot_bitmap_out) { +int shouldStartChildReplication(int *mincapa_out, int *req_out, slotBitmap slot_bitmap_out) { /* We should start a BGSAVE good for replication if we have replicas in * WAIT_BGSAVE_START state. 
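With the new signature, a caller describes the replication scope entirely through the bitmap; a link whose bitmap covers every slot behaves like classic full replication, while a partial bitmap triggers the slot-scoped drop-and-resync path above. A hypothetical caller (host, port and the chosen slots are example values, not part of the patch) would look like:

    slotBitmap slots;
    memset(slots, 0, sizeof(slotBitmap));
    bitmapSetBit(slots, 100);   /* replicate only slots 100-101 */
    bitmapSetBit(slots, 101);

    replicationLink *link = createReplicationLink("10.0.0.1", 6379, slots);
    if (connectReplicationLink(link) == C_ERR) {
        freeReplicationLink(link);
    }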
* @@ -5137,7 +5188,7 @@ int shouldStartChildReplication(int *mincapa_out, int *req_out, unsigned char *s int replicas_waiting = 0; int mincapa; int req; - unsigned char slot_bitmap[CLUSTER_SLOTS/8]; + slotBitmap slot_bitmap; int first = 1; listNode *ln; listIter li; @@ -5149,7 +5200,7 @@ int shouldStartChildReplication(int *mincapa_out, int *req_out, unsigned char *s if (first) { /* Get first replica's requirements */ req = replica->repl_data->replica_req; - memcpy(slot_bitmap, replica->repl_data->slot_bitmap, sizeof(slot_bitmap)); + memcpy(slot_bitmap, replica->repl_data->slot_bitmap, sizeof(slotBitmap)); } else if (req != replica->repl_data->replica_req) { /* Skip replicas that don't match */ continue; @@ -5168,7 +5219,7 @@ int shouldStartChildReplication(int *mincapa_out, int *req_out, unsigned char *s max_idle >= server.repl_diskless_sync_delay)) { if (mincapa_out) *mincapa_out = mincapa; if (req_out) *req_out = req; - if (slot_bitmap_out) memcpy(slot_bitmap_out, slot_bitmap, sizeof(slot_bitmap)); + if (slot_bitmap_out) memcpy(slot_bitmap_out, slot_bitmap, sizeof(slotBitmap)); return 1; } } @@ -5179,8 +5230,8 @@ int shouldStartChildReplication(int *mincapa_out, int *req_out, unsigned char *s void replicationStartPendingFork(void) { int mincapa = -1; int req = -1; - unsigned char slot_bitmap[CLUSTER_SLOTS/8]; - bitmapSetAllBits(slot_bitmap, sizeof(slot_bitmap)); + slotBitmap slot_bitmap; + slotBitmapSetAll(slot_bitmap); if (shouldStartChildReplication(&mincapa, &req, slot_bitmap)) { /* Start the BGSAVE. The called function may start a diff --git a/src/server.h b/src/server.h index 5d7b8db461..34c8e9ba41 100644 --- a/src/server.h +++ b/src/server.h @@ -1108,6 +1108,8 @@ typedef struct ClientPubSubData { context of client side caching. */ } ClientPubSubData; +typedef unsigned char slotBitmap[CLUSTER_SLOTS / 8]; + typedef struct replicationLink replicationLink; typedef struct ClientReplicationData { int repl_state; /* Replication state if this is a replica. */ @@ -1135,12 +1137,12 @@ typedef struct ClientReplicationData { short replica_req; /* Replica requirements: REPLICA_REQ_* */ uint64_t associated_rdb_client_id; /* The client id of this replica's rdb connection */ time_t rdb_client_disconnect_time; /* Time of the first freeClient call on this client. Used for delaying free. */ - listNode *ref_repl_buf_node; /* Referenced node of replication buffer blocks, - see the definition of replBufBlock. */ - size_t ref_block_pos; /* Access position of referenced buffer block, - i.e. the next offset to send. */ - unsigned char slot_bitmap[CLUSTER_SLOTS/8]; /* The slot range this replica is replicating for. */ - replicationLink *link; /* The replication link owning this. */ + listNode *ref_repl_buf_node; /* Referenced node of replication buffer blocks, + see the definition of replBufBlock. */ + size_t ref_block_pos; /* Access position of referenced buffer block, + i.e. the next offset to send. */ + slotBitmap slot_bitmap; /* The slot range this replica is replicating for. */ + replicationLink *link; /* The replication link owning this. 
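A forked snapshot child can only serve replicas that want the same output, so the first replica found in WAIT_BGSAVE_START seeds the requirements (the REPLICA_REQ_* flags and now also the slot bitmap) and any waiting replica that differs is simply left for a later child. A condensed sketch of the matching applied between shouldStartChildReplication and rdbSaveToReplicasSockets (not a literal excerpt from either function):

    /* 'first' is the replica whose requirements seeded this snapshot child. */
    static int canShareSnapshotChild(client *first, client *replica) {
        if (replica->repl_data->replica_req != first->repl_data->replica_req) return 0;
        return memcmp(replica->repl_data->slot_bitmap,
                      first->repl_data->slot_bitmap, sizeof(slotBitmap)) == 0;
    }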
*/ } ClientReplicationData; typedef struct ClientModuleData { @@ -2765,7 +2767,7 @@ void ioThreadWriteToClient(void *data); int canParseCommand(client *c); int processIOThreadsReadDone(void); int processIOThreadsWriteDone(void); -replicationLink *createReplicationLink(char *host, int port, unsigned char *slot_bitmap); +replicationLink *createReplicationLink(char *host, int port, slotBitmap slot_bitmap); int connectReplicationLink(replicationLink *link); int freeReplicationLink(replicationLink *link); @@ -2997,7 +2999,7 @@ void aofOpenIfNeededOnServerStart(void); void aofManifestFree(aofManifest *am); int aofDelHistoryFiles(void); int aofRewriteLimited(void); -int rewriteAppendOnlyFileRio(rio *aof, unsigned char *slot_bitmap); +int rewriteAppendOnlyFileRio(rio *aof, slotBitmap slot_bitmap); /* Child info */ void openChildInfoPipe(void); From 50878f5784c940c6a48ca51ac37e81f3f0ba6970 Mon Sep 17 00:00:00 2001 From: Jacob Murphy Date: Fri, 17 Jan 2025 22:39:46 +0000 Subject: [PATCH 04/18] Refactor code to reduce touch points in replication.c Signed-off-by: Jacob Murphy --- src/aof.c | 4 +- src/blocked.c | 2 +- src/cluster.h | 5 +- src/cluster_legacy.c | 164 ++-- src/cluster_legacy.h | 16 +- src/config.c | 23 +- src/db.c | 20 +- src/evict.c | 2 +- src/expire.c | 2 +- src/io_threads.c | 2 +- src/module.c | 29 +- src/networking.c | 86 ++- src/object.c | 3 +- src/rdb.c | 14 +- src/replication.c | 1715 ++++++++++++++++++------------------------ src/script.c | 12 +- src/server.c | 70 +- src/server.h | 150 ++-- src/valkeymodule.h | 4 +- 19 files changed, 1071 insertions(+), 1252 deletions(-) diff --git a/src/aof.c b/src/aof.c index 06f04760ca..dbebc92e63 100644 --- a/src/aof.c +++ b/src/aof.c @@ -2204,7 +2204,7 @@ int rewriteAppendOnlyFileRio(rio *aof, slotBitmap slot_bitmap) { kvstoreIterator *kvs_it = NULL; /* Record timestamp at the beginning of rewriting AOF. */ - if (server.aof_timestamp_enabled && isSlotBitmapAllSlots(slot_bitmap)) { + if (server.aof_timestamp_enabled && !slot_bitmap) { sds ts = genAofTimestampAnnotationIfNeeded(1); if (rioWrite(aof, ts, sdslen(ts)) == 0) { sdsfree(ts); @@ -2224,7 +2224,7 @@ int rewriteAppendOnlyFileRio(rio *aof, slotBitmap slot_bitmap) { if (rioWrite(aof, selectcmd, sizeof(selectcmd) - 1) == 0) goto werr; if (rioWriteBulkLongLong(aof, j) == 0) goto werr; - if (isSlotBitmapAllSlots(slot_bitmap)) { + if (!slot_bitmap) { kvs_it = kvstoreIteratorInit(db->keys); } else { kvs_it = kvstoreFilteredIteratorInit(db->keys, &shouldFilterSlot, slot_bitmap); diff --git a/src/blocked.c b/src/blocked.c index d1a6ff9c6b..70da7877ad 100644 --- a/src/blocked.c +++ b/src/blocked.c @@ -100,7 +100,7 @@ void freeClientBlockingState(client *c) { * flag is set client query buffer is not longer processed, but accumulated, * and will be processed when the client is unblocked. 
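In the follow-up patch the partial AOF rewrite switches from a slot-range list to the bitmap, and the timestamp annotation and iterator selection now only test whether a bitmap was supplied at all. The filter callback handed to kvstoreFilteredIteratorInit is not part of the hunks shown here; presumably it reduces to a single bit test along these lines (sketch, reusing the name from the earlier list-based version):

    /* Return non-zero to skip slots that are not selected for this rewrite. */
    int shouldFilterSlot(int slot, void *privdata) {
        unsigned char *slot_bitmap = privdata;
        return !bitmapTestBit(slot_bitmap, slot);
    }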
*/ void blockClient(client *c, int btype) { - /* Primary client should never be blocked unless pause or module */ + /* Replication clients should never be blocked unless pause or module */ serverAssert(!(c->flag.replication_source && btype != BLOCKED_MODULE && btype != BLOCKED_POSTPONE)); initClientBlockingState(c); diff --git a/src/cluster.h b/src/cluster.h index 9b050d0b70..74889422b4 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -121,9 +121,10 @@ int bitmapTestBit(unsigned char *bitmap, int pos); void bitmapSetBit(unsigned char *bitmap, int pos); void bitmapClearBit(unsigned char *bitmap, int pos); void bitmapSetAllBits(unsigned char *bitmap, int len); -void slotBitmapSetAll(slotBitmap bitmap); -int isSlotBitmapAllSlots(slotBitmap bitmap); +int slotBitmapCompare(slotBitmap bitmap, slotBitmap other); +int isSlotBitmapEmpty(slotBitmap bitmap); int getSlotOrReply(client *c, robj *o); +void clusterSlotMigrationDoneSyncing(long long initial_offset); /* functions with shared implementations */ int clusterNodeIsMyself(clusterNode *n); diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 0e07057856..d174124f40 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -122,6 +122,7 @@ int verifyClusterNodeId(const char *name, int length); sds clusterEncodeOpenSlotsAuxField(int rdbflags); int clusterDecodeOpenSlotsAuxField(int rdbflags, sds s); static int nodeExceedsHandshakeTimeout(clusterNode *node, mstime_t now); +void clusterProceedWithSlotMigration(void); /* Only primaries that own slots have voting rights. * Returns 1 if the node has voting rights, otherwise returns 0. */ @@ -1456,7 +1457,7 @@ void clusterAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) { /* If the server is starting up, don't accept cluster connections: * UPDATE messages may interact with the database content. */ - if (server.primary == NULL && server.loading) return; + if (server.primary_host == NULL && server.loading) return; while (max--) { cfd = anetTcpAccept(server.neterr, fd, cip, sizeof(cip), &cport); @@ -4439,6 +4440,29 @@ void clusterPropagatePublish(robj *channel, robj *message, int sharded) { * Slot Migration functions * -------------------------------------------------------------------------- */ +slotMigration *clusterCreateSlotMigration(clusterNode *source, slotBitmap slots) { + slotMigration *result = (slotMigration *) zmalloc(sizeof(slotMigration)); + memcpy(result->slot_bitmap, slots, sizeof(slotBitmap)); + result->source_node = source; + result->state = SLOT_MIGRATION_QUEUED; + result->end_time = 0; /* Will be set once started. */ + result->replication_connection = NULL; + result->replication_client = NULL; + result->replication_handshake_state = REPL_STATE_NONE; + result->pause_end = 0; + result->pause_primary_offset = -1; + return result; +} + +void clusterFreeSlotMigration(slotMigration *migration) { + if (migration->replication_client) { + freeClient(migration->replication_client); + } else if (migration->replication_connection) { + connClose(migration->replication_connection); + } + zfree(migration); +} + /* Gets the current slot migration from the head of the queue. 
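clusterCreateSlotMigration/clusterFreeSlotMigration centralize the lifecycle of a migration descriptor: before the SYNC a migration only owns a bare connection, afterwards it owns a client (which in turn owns the connection), and the free path handles both cases. The CLUSTER MIGRATE handler further down in this patch consumes the constructor roughly as:

    slotMigration *m = clusterCreateSlotMigration(curr_owner, requested_slots);
    listAddNodeTail(server.cluster->slot_migrations, m);
    clusterProceedWithSlotMigration();   /* kick the state machine right away */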
*/ slotMigration *clusterGetCurrentSlotMigration(void) { if (listLength(server.cluster->slot_migrations) == 0) return NULL; @@ -4456,6 +4480,23 @@ void clusterSendMigrateSlotStart(clusterNode *node, slotBitmap slot_bitmap) { clusterMsgSendBlockDecrRefCount(msgblock); } +void clusterImportHandler(connection *conn) { + UNUSED(conn); + /* This is called if there is an event on the current migrations + * connection. If that is the case, we can just continue with our + * state machine.*/ + clusterProceedWithSlotMigration(); +} + +void clusterSlotMigrationDoneSyncing(long long initial_offset) { + slotMigration *migration = clusterGetCurrentSlotMigration(); + serverAssert(migration != NULL && migration->state == SLOT_MIGRATION_RECEIVE_SYNC); + migration->state = SLOT_MIGRATION_PAUSE_OWNER; + migration->replication_client->repl_data->reploff = initial_offset; + migration->replication_client->repl_data->read_reploff = initial_offset; + clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_SLOTMIGRATION); +} + /* This is the main state machine for the slot migration workflow. Slot * migration is driven by the new owner of the slot. This function will do as * much work as possible synchronously, processing the enqueued slot migrations @@ -4471,7 +4512,7 @@ void clusterProceedWithSlotMigration(void) { "Timed out for slot migration from source node %.40s", curr_migration->source_node->name); curr_migration->state = SLOT_MIGRATION_FAILED; } - if (curr_migration->state > SLOT_MIGRATION_PAUSE_OWNER && curr_migration->state < SLOT_MIGRATION_FAILED && curr_migration->pause_end < mstime() && curr_migration->vote_retry_time < mstime()) { + if (curr_migration->state > SLOT_MIGRATION_PAUSE_OWNER && curr_migration->state < SLOT_MIGRATION_FAILED && curr_migration->pause_end < mstime()) { /* If the owner ever unpauses, we have to move back in the state machine and retry. */ serverLog(LL_WARNING, "Reinitiating pause on the node owning the slot range..."); curr_migration->state = SLOT_MIGRATION_PAUSE_OWNER; @@ -4480,43 +4521,87 @@ void clusterProceedWithSlotMigration(void) { switch(curr_migration->state) { case SLOT_MIGRATION_QUEUED: /* Start the migration */ - serverLog(LL_NOTICE, "Starting sync from migration source node %.40s", curr_migration->source_node->name); + serverLog(LL_NOTICE, "Starting replication of slots from migration source node %.40s", curr_migration->source_node->name); curr_migration->end_time = mstime() + CLUSTER_SLOT_MIGRATION_TIMEOUT; - curr_migration->link = createReplicationLink(curr_migration->source_node->ip, getNodeDefaultReplicationPort(curr_migration->source_node), curr_migration->slot_bitmap); - if (connectReplicationLink(curr_migration->link) == C_ERR) { + curr_migration->replication_connection = connCreate(server.tls_replication ? 
connectionTypeTls() : connectionTypeTcp()); + if (connConnect(curr_migration->replication_connection, curr_migration->source_node->ip, getNodeDefaultReplicationPort(curr_migration->source_node), server.bind_source_addr, clusterImportHandler) == C_ERR) { serverLog(LL_WARNING, - "Failed to begin sync from migration source node %.40s", curr_migration->source_node->name); + "Failed to connect to migration source node %.40s", curr_migration->source_node->name); curr_migration->state = SLOT_MIGRATION_FAILED; continue; } - curr_migration->state = SLOT_MIGRATION_SYNCING; + curr_migration->replication_handshake_state = REPL_STATE_CONNECTING; + curr_migration->state = SLOT_MIGRATION_CONNECTING; continue; - case SLOT_MIGRATION_SYNCING: - /* replicationCron should manage retrying connection, but there could be scenarios where we hit an irrecoverable error. */ - if (curr_migration->link->state == REPL_STATE_NONE || curr_migration->link->state == REPL_STATE_CANCELLED) { - serverLog(LL_WARNING, "Sync failed from migration node %.40s", curr_migration->source_node->name); + case SLOT_MIGRATION_CONNECTING: + if (curr_migration->replication_connection->state == CONN_STATE_CONNECTED) { + curr_migration->state = SLOT_MIGRATION_REPL_HANDSHAKE; + continue; + } + /* Nothing to do, waiting for connection to be established. */ + return; + case SLOT_MIGRATION_REPL_HANDSHAKE: + curr_migration->replication_handshake_state = replicationProceedWithHandshake(curr_migration->replication_connection, curr_migration->replication_handshake_state, curr_migration->slot_bitmap); + if (curr_migration->replication_handshake_state == REPL_STATE_ERROR) { + serverLog(LL_WARNING, "Handshake failed from migration node %.40s", curr_migration->source_node->name); curr_migration->state = SLOT_MIGRATION_FAILED; continue; } - if (curr_migration->link->state == REPL_STATE_CONNECTED) { - curr_migration->state = SLOT_MIGRATION_PAUSE_OWNER; + if (curr_migration->replication_handshake_state == REPL_STATE_SEND_PSYNC) { + curr_migration->state = SLOT_MIGRATION_SEND_SYNC; continue; } - /* If we are in another state, nothing to do right now. */ return; + case SLOT_MIGRATION_SEND_SYNC: + /* Ensure we have a clean state for the SYNC. */ + dropKeysInSlotBitmap(curr_migration->slot_bitmap, 1); + + /* We are done with our handshake phase. We can proceed straight to doing our SYNC. + * Note that we are skipping PSYNC. PSYNC will always result in full resync for a + * slot migration anyways. + * + * In the future, we can do a PSYNC phase to incorporate dual channel. */ + serverLog(LL_NOTICE, "Starting SYNC for slot migration from migration source node %.40s", curr_migration->source_node->name); + if (connSyncWrite(curr_migration->replication_connection, "SYNC\r\n", 6, server.repl_syncio_timeout * 1000) == -1) { + serverLog(LL_WARNING, "I/O error writing to slot migration source: %s", connGetLastError(curr_migration->replication_connection)); + curr_migration->state = SLOT_MIGRATION_FAILED; + continue; + } + client *c = createClient(curr_migration->replication_connection); + curr_migration->replication_client = c; + c->flag.replication_source = 1; + c->flag.slot_migration_source = 1; + c->flag.authenticated = 1; + c->user = NULL; /* This client can do everything. */ + initClientReplicationData(c); /* We use this to track offset. */ + c->querybuf = sdsempty(); /* Similar to primary, we use a dedicated query buf. */ + + /* Our result will be received in AOF format, so we can pipe it + * straight to readQueryFromClient. 
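The target deliberately issues a plain SYNC rather than PSYNC: a slot subset can never be served from the replication backlog, so a full snapshot is unavoidable, and because the source produces that snapshot in AOF format the reply is just a stream of ordinary RESP commands that the normal command parser can apply. Illustratively (framing details such as any length or EOF markers are handled elsewhere in the series and omitted here; key and value are example data):

    target -> source:  SYNC\r\n
    source -> target:  *2\r\n$6\r\nSELECT\r\n$1\r\n0\r\n
                       *3\r\n$3\r\nSET\r\n$7\r\nkey{42}\r\n$3\r\nval\r\n
                       ... every key in the migrating slots, then the live write stream ...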
*/ + connSetReadHandler(c->conn, readQueryFromClient); + curr_migration->state = SLOT_MIGRATION_RECEIVE_SYNC; + continue; + case SLOT_MIGRATION_RECEIVE_SYNC: + return; /* Nothing to do */ case SLOT_MIGRATION_PAUSE_OWNER: - serverLog(LL_NOTICE, "Replication link to slot owner %.40s has been established. Pausing source node and waiting to continue", curr_migration->source_node->name); + /* Send an ACK to put the connection into streaming state. */ + replicationSendAck(curr_migration->replication_client); + + serverLog(LL_NOTICE, "Replication sync to slot owner %.40s has been performed. Current replication offset: %lld. Pausing source node and waiting to continue", curr_migration->source_node->name, curr_migration->replication_client->repl_data->reploff); clusterSendMigrateSlotStart(curr_migration->source_node, curr_migration->slot_bitmap); curr_migration->pause_primary_offset = -1; curr_migration->pause_end = mstime() + CLUSTER_MF_TIMEOUT; curr_migration->state = SLOT_MIGRATION_WAITING_FOR_OFFSET; continue; case SLOT_MIGRATION_WAITING_FOR_OFFSET: - /* Nothing to do, need to wait for cluster message to come in. */ + /* Send REPLCONF ACK from time to time */ + replicationSendAck(curr_migration->replication_client); return; case SLOT_MIGRATION_SYNCING_TO_OFFSET: - if (curr_migration->link->client->repl_data->reploff >= curr_migration->pause_primary_offset) { - serverLog(LL_NOTICE, "Replication of slot range has caught up to paused slot owner, slot migration can start."); + /* Send REPLCONF ACK from time to time */ + replicationSendAck(curr_migration->replication_client); + if (curr_migration->replication_client->repl_data->reploff >= curr_migration->pause_primary_offset) { + serverLog(LL_NOTICE, "Replication of slot range has caught up to paused slot owner (%lld), slot migration can start.", curr_migration->pause_primary_offset); curr_migration->state = SLOT_MIGRATION_FINISH; continue; } @@ -4535,17 +4620,15 @@ void clusterProceedWithSlotMigration(void) { if (clusterBumpConfigEpochWithoutConsensus() == C_OK) { serverLog(LL_NOTICE, "ConfigEpoch updated after importing slots"); } + clusterFreeSlotMigration(curr_migration); clusterBroadcastPong(CLUSTER_BROADCAST_ALL); listDelNode(server.cluster->slot_migrations, curr_node); - freeReplicationLink(curr_migration->link); - zfree(curr_migration); continue; case SLOT_MIGRATION_FAILED: /* Delete the migration from the queue and proceed to the next migration */ listDelNode(server.cluster->slot_migrations, curr_node); - freeReplicationLink(curr_migration->link); dropKeysInSlotBitmap(curr_migration->slot_bitmap, server.repl_replica_lazy_flush); - zfree(curr_migration); + clusterFreeSlotMigration(curr_migration); continue; } } @@ -4896,8 +4979,8 @@ void clusterHandleReplicaFailover(void) { /* Set data_age to the number of milliseconds we are disconnected from * the primary. */ - if (server.primary && server.primary->state == REPL_STATE_CONNECTED) { - data_age = (mstime_t)(server.unixtime - server.primary->client->last_interaction) * 1000; + if (server.repl_state == REPL_STATE_CONNECTED) { + data_age = (mstime_t)(server.unixtime - server.primary->last_interaction) * 1000; } else { data_age = (mstime_t)(server.unixtime - server.repl_down_since) * 1000; } @@ -5489,7 +5572,7 @@ void clusterCron(void) { /* If we are a replica node but the replication is still turned off, * enable it if we know the address of our primary and it appears to * be up. 
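The repeated replicationSendAck calls matter for two reasons: the first ACK is what moves this connection into the online/streaming replica state on the source, and the subsequent ones keep reporting how much of the stream the target has applied while it waits to reach the pause offset. Conceptually the ACK is the same REPLCONF ACK used by normal replication; a sketch of the payload (the real implementation lives in replication.c and must also set the force-reply flag, since replies to replication-source clients are normally suppressed):

    addReplyArrayLen(c, 3);
    addReplyBulkCString(c, "REPLCONF");
    addReplyBulkCString(c, "ACK");
    addReplyBulkLongLong(c, c->repl_data->reploff);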
*/ - if (nodeIsReplica(myself) && server.primary == NULL && myself->replicaof && nodeHasAddr(myself->replicaof)) { + if (nodeIsReplica(myself) && server.primary_host == NULL && myself->replicaof && nodeHasAddr(myself->replicaof)) { replicationSetPrimary(myself->replicaof->ip, getNodeDefaultReplicationPort(myself->replicaof), 0); } @@ -5592,20 +5675,14 @@ void bitmapClearBit(unsigned char *bitmap, int pos) { bitmap[byte] &= ~(1 << bit); } -void slotBitmapSetAll(slotBitmap bitmap) { - memset(bitmap, 0xff, sizeof(slotBitmap)); -} - int slotBitmapCompare(slotBitmap bitmap, slotBitmap otherbitmap) { return memcmp(bitmap, otherbitmap, sizeof(slotBitmap)); } -/* Return if the slot bitmap contains all slots */ -int isSlotBitmapAllSlots(slotBitmap bitmap) { - if (!bitmap) return 1; - slotBitmap all_slot_bitmap; - slotBitmapSetAll(all_slot_bitmap); - return slotBitmapCompare(bitmap, all_slot_bitmap) == 0; +int isSlotBitmapEmpty(slotBitmap bitmap) { + slotBitmap empty; + memset(empty, 0, sizeof(slotBitmap)); + return slotBitmapCompare(bitmap, empty) == 0; } /* Return non-zero if there is at least one primary with replicas in the cluster. @@ -6706,13 +6783,13 @@ int clusterParseSetSlotCommand(client *c, int *slot_out, clusterNode **node_out, int optarg_pos = 0; /* Allow primaries to replicate "CLUSTER SETSLOT" */ - if (!c->flag.replication_source && nodeIsReplica(myself)) { + if (!c->flag.primary && nodeIsReplica(myself)) { addReplyError(c, "Please use SETSLOT only with masters."); return 0; } /* If 'myself' is a replica, 'c' must be the primary client. */ - serverAssert(!nodeIsReplica(myself) || (server.primary && c == server.primary->client)); + serverAssert(!nodeIsReplica(myself) || c == server.primary); if ((slot = getSlotOrReply(c, c->argv[2])) == -1) return 0; @@ -7343,18 +7420,7 @@ int clusterCommandSpecial(client *c) { } } - slotMigration *to_enqueue = (slotMigration *) zmalloc(sizeof(slotMigration)); - memcpy(to_enqueue->slot_bitmap, requested_slots, sizeof(slotBitmap)); - to_enqueue->source_node = curr_owner; - to_enqueue->state = SLOT_MIGRATION_QUEUED; - to_enqueue->end_time = 0; /* Will be set once started. */ - to_enqueue->link = NULL; - to_enqueue->pause_end = 0; - to_enqueue->pause_primary_offset = -1; - to_enqueue->vote_end_time = 0; - to_enqueue->vote_retry_time = 0; - to_enqueue->vote_epoch = 0; - to_enqueue->auth_count = 0; + slotMigration * to_enqueue = clusterCreateSlotMigration(curr_owner, requested_slots); listAddNodeTail(server.cluster->slot_migrations, to_enqueue); clusterProceedWithSlotMigration(); addReply(c, shared.ok); diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index 1b83c1b2f5..f9c6f5e5b8 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -377,8 +377,12 @@ struct _clusterNode { }; typedef enum slotMigrationState { - SLOT_MIGRATION_QUEUED, /* Queued behind some other slot migration. */ - SLOT_MIGRATION_SYNCING, /* Syncing contents from current owner. */ + SLOT_MIGRATION_QUEUED, + SLOT_MIGRATION_CONNECTING, + SLOT_MIGRATION_REPL_HANDSHAKE, /* The handshake has it's own state machine, + * see replicationProceedWithHandshake */ + SLOT_MIGRATION_SEND_SYNC, + SLOT_MIGRATION_RECEIVE_SYNC, SLOT_MIGRATION_PAUSE_OWNER, SLOT_MIGRATION_WAITING_FOR_OFFSET, SLOT_MIGRATION_SYNCING_TO_OFFSET, @@ -392,13 +396,11 @@ typedef struct slotMigration { clusterNode *source_node; mstime_t end_time; /* Slot migration time limit (ms unixtime). If not yet in progress (e.g. queued), will be zero. 
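One property of slotMigrationState worth keeping in mind when extending it: the state machine's timeout and re-pause guards compare states relationally, so the enumerators must stay declared in workflow order and new states have to be inserted at the matching point rather than appended at the end. A sketch of the guard this ordering supports:

    /* Equivalent to the check in clusterProceedWithSlotMigration(): true only
     * for the post-pause states (waiting for offset, syncing to offset, finish).
     * Relies on the enum being declared in workflow order. */
    static int slotMigrationOwnerPaused(const slotMigration *m) {
        return m->state > SLOT_MIGRATION_PAUSE_OWNER && m->state < SLOT_MIGRATION_FAILED;
    }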
*/ - replicationLink *link; + connection *replication_connection; /* Connection for replication. */ + client *replication_client; /* Client for replication */ + int replication_handshake_state; mstime_t pause_end; long long pause_primary_offset; - mstime_t vote_end_time; - mstime_t vote_retry_time; - uint64_t vote_epoch; - int auth_count; } slotMigration; /* Struct used for storing slot statistics. */ diff --git a/src/config.c b/src/config.c index 512b35f210..5b90ebbd60 100644 --- a/src/config.c +++ b/src/config.c @@ -596,7 +596,7 @@ void loadServerConfigFromString(char *config) { } /* Sanity checks. */ - if (server.cluster_enabled && server.primary) { + if (server.cluster_enabled && server.primary_host) { err = "replicaof directive not allowed in cluster mode"; goto loaderr; } @@ -1451,11 +1451,11 @@ void rewriteConfigReplicaOfOption(standardConfig *config, const char *name, stru /* If this is a primary, we want all the replicaof config options * in the file to be removed. Note that if this is a cluster instance * we don't want a replicaof directive inside valkey.conf. */ - if (server.cluster_enabled || server.primary == NULL) { + if (server.cluster_enabled || server.primary_host == NULL) { rewriteConfigMarkAsProcessed(state, name); return; } - line = sdscatprintf(sdsempty(), "%s %s %d", name, server.primary->host, server.primary->port); + line = sdscatprintf(sdsempty(), "%s %s %d", name, server.primary_host, server.primary_port); rewriteConfigRewriteLine(state, name, line, 1); } @@ -3000,20 +3000,19 @@ static int setConfigReplicaOfOption(standardConfig *config, sds *argv, int argc, return 0; } - freeReplicationLink(server.primary); - server.primary = NULL; - + sdsfree(server.primary_host); + server.primary_host = NULL; if (!strcasecmp(argv[0], "no") && !strcasecmp(argv[1], "one")) { return 1; } char *ptr; - int port = strtol(argv[1], &ptr, 10); - if (port < 0 || port > 65535 || *ptr != '\0') { + server.primary_port = strtol(argv[1], &ptr, 10); + if (server.primary_port < 0 || server.primary_port > 65535 || *ptr != '\0') { *err = "Invalid primary port"; return 0; } - server.primary = createReplicationLink(argv[0], port, NULL); - server.primary->state = REPL_STATE_CONNECT; + server.primary_host = sdsnew(argv[0]); + server.repl_state = REPL_STATE_CONNECT; return 1; } @@ -3025,8 +3024,8 @@ static sds getConfigBindOption(standardConfig *config) { static sds getConfigReplicaOfOption(standardConfig *config) { UNUSED(config); char buf[256]; - if (server.primary) - snprintf(buf, sizeof(buf), "%s %d", server.primary->host, server.primary->port); + if (server.primary_host) + snprintf(buf, sizeof(buf), "%s %d", server.primary_host, server.primary_port); else buf[0] = '\0'; return sdsnew(buf); diff --git a/src/db.c b/src/db.c index 05b395728a..134dc6e9dd 100644 --- a/src/db.c +++ b/src/db.c @@ -110,7 +110,7 @@ robj *lookupKey(serverDb *db, robj *key, int flags) { * It's possible that the WRITE flag is set even during a readonly * command, since the command may trigger events that cause modules to * perform additional writes. */ - int is_ro_replica = server.primary && server.repl_replica_ro; + int is_ro_replica = server.primary_host && server.repl_replica_ro; int expire_flags = 0; if (flags & LOOKUP_WRITE && !is_ro_replica) expire_flags |= EXPIRE_FORCE_DELETE_EXPIRED; if (flags & LOOKUP_NOEXPIRE) expire_flags |= EXPIRE_AVOID_DELETE_EXPIRED; @@ -258,7 +258,7 @@ int getKeySlot(sds key) { * so we must always recompute the slot for commands coming from the primary. 
*/ if (server.current_client && server.current_client->slot >= 0 && server.current_client->flag.executing_command && - !server.current_client->flag.replication_source) { + !server.current_client->flag.primary) { debugServerAssertWithInfo(server.current_client, NULL, (int)keyHashSlot(key, (int)sdslen(key)) == server.current_client->slot); return server.current_client->slot; @@ -267,7 +267,7 @@ int getKeySlot(sds key) { /* For the case of replicated commands from primary, getNodeByQuery() never gets called, * and thus c->slot never gets populated. That said, if this command ends up accessing a key, * we are able to backfill c->slot here, where the key's hash calculation is made. */ - if (server.current_client && server.current_client->flag.replication_source) { + if (server.current_client && server.current_client->flag.primary) { server.current_client->slot = slot; } return slot; @@ -446,7 +446,7 @@ robj *dbRandomKey(serverDb *db) { sds key = objectGetKey(valkey); robj *keyobj = createStringObject(key, sdslen(key)); if (objectIsExpired(valkey)) { - if (allvolatile && (server.primary || server.import_mode) && --maxtries == 0) { + if (allvolatile && (server.primary_host || server.import_mode) && --maxtries == 0) { /* If the DB is composed only of keys with an expire set, * it could happen that all the keys are already logically * expired in the replica, so the function cannot stop because @@ -1801,8 +1801,8 @@ robj *setExpire(client *c, serverDb *db, robj *key, long long when) { serverAssert(added); } - int writable_replica = server.primary && server.repl_replica_ro == 0; - if (c && writable_replica && !c->flag.replication_source) rememberReplicaKeyWithExpire(db, key); + int writable_replica = server.primary_host && server.repl_replica_ro == 0; + if (c && writable_replica && !c->flag.primary) rememberReplicaKeyWithExpire(db, key); return val; } @@ -1907,7 +1907,7 @@ static int objectIsExpired(robj *val) { /* Don't expire anything while loading. It will be done later. */ if (server.loading) return 0; if (!timestampIsExpired(objectGetExpire(val))) return 0; - if (server.primary == NULL && server.import_mode) { + if (server.primary_host == NULL && server.import_mode) { if (server.current_client && server.current_client->flag.import_source) return 0; } return 1; @@ -1925,7 +1925,7 @@ static int keyIsExpiredWithDictIndex(serverDb *db, robj *key, int dict_index) { if (!keyIsExpiredWithDictIndexImpl(db, key, dict_index)) return 0; /* See expireIfNeededWithDictIndex for more details. */ - if (server.primary == NULL && server.import_mode) { + if (server.primary_host == NULL && server.import_mode) { if (server.current_client && server.current_client->flag.import_source) return 0; } return 1; @@ -1959,8 +1959,8 @@ static keyStatus expireIfNeededWithDictIndex(serverDb *db, robj *key, robj *val, * * When replicating commands from the primary, keys are never considered * expired. 
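The getKeySlot() assertion above compares the cached client slot against keyHashSlot; for orientation, that is the standard cluster hashing, restated here as a sketch (the real function lives in cluster.c): CRC16 of the key, or of the first non-empty {...} hash tag if one exists, masked to the 16384-slot space.

    unsigned int keyHashSlot(char *key, int keylen) {
        int s, e;
        for (s = 0; s < keylen; s++)
            if (key[s] == '{') break;
        if (s == keylen) return crc16(key, keylen) & 0x3FFF;               /* no '{' */
        for (e = s + 1; e < keylen; e++)
            if (key[e] == '}') break;
        if (e == keylen || e == s + 1) return crc16(key, keylen) & 0x3FFF; /* no tag */
        return crc16(key + s + 1, e - s - 1) & 0x3FFF;                     /* hash only the tag */
    }

For example, "user:{1000}:profile" and "user:{1000}:session" hash only the substring "1000" and therefore always land in the same slot.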
*/ - if (server.primary != NULL) { - if (server.current_client && (server.current_client->flag.replication_source)) return KEY_VALID; + if (server.primary_host != NULL) { + if (server.current_client && (server.current_client->flag.primary)) return KEY_VALID; if (!(flags & EXPIRE_FORCE_DELETE_EXPIRED)) return KEY_EXPIRED; } else if (server.import_mode) { /* If we are running in the import mode on a primary, instead of diff --git a/src/evict.c b/src/evict.c index f91f2b76f7..d4bfade4fc 100644 --- a/src/evict.c +++ b/src/evict.c @@ -466,7 +466,7 @@ static int isSafeToPerformEvictions(void) { /* By default replicas should ignore maxmemory * and just be primaries exact copies. */ - if (server.primary && server.repl_replica_ignore_maxmemory) return 0; + if (server.primary_host && server.repl_replica_ignore_maxmemory) return 0; /* If 'evict' action is paused, for whatever reason, then return false */ if (isPausedActionsWithUpdate(PAUSE_ACTION_EVICT)) return 0; diff --git a/src/expire.c b/src/expire.c index 29dcd82c83..e4c3b0ec96 100644 --- a/src/expire.c +++ b/src/expire.c @@ -524,7 +524,7 @@ int checkAlreadyExpired(long long when) { * * If the server is a primary and in the import mode, we also add the already * expired key and wait for an explicit DEL from the import source. */ - return (when <= commandTimeSnapshot() && !server.loading && !server.primary && !server.import_mode); + return (when <= commandTimeSnapshot() && !server.loading && !server.primary_host && !server.import_mode); } #define EXPIRE_NX (1 << 0) diff --git a/src/io_threads.c b/src/io_threads.c index 260d7007be..715251a06a 100644 --- a/src/io_threads.c +++ b/src/io_threads.c @@ -345,7 +345,7 @@ int trySendReadToIOThreads(client *c) { c->cur_tid = tid; c->read_flags = canParseCommand(c) ? 0 : READ_FLAGS_DONT_PARSE; c->read_flags |= authRequired(c) ? READ_FLAGS_AUTH_REQUIRED : 0; - c->read_flags |= c->flag.replication_source ? READ_FLAGS_PRIMARY : 0; + c->read_flags |= c->flag.replication_source ? READ_FLAGS_REPLICATION_SOURCE : 0; c->io_read_state = CLIENT_PENDING_IO; connSetPostponeUpdateState(c->conn, 1); diff --git a/src/module.c b/src/module.c index 40a5c8de20..01c9962e90 100644 --- a/src/module.c +++ b/src/module.c @@ -3757,9 +3757,9 @@ int modulePopulateReplicationInfoStructure(void *ri, int structver) { ValkeyModuleReplicationInfoV1 *ri1 = ri; memset(ri1, 0, sizeof(*ri1)); ri1->version = structver; - ri1->primary = server.primary == NULL; - ri1->primary_host = server.primary ? server.primary->host : ""; - ri1->primary_port = server.primary ? server.primary->port : 0; + ri1->primary = server.primary_host == NULL; + ri1->primary_host = server.primary_host ? server.primary_host : ""; + ri1->primary_port = server.primary_port; ri1->replid1 = server.replid; ri1->replid2 = server.replid2; ri1->repl1_offset = server.primary_repl_offset; @@ -3948,7 +3948,8 @@ int VM_GetContextFlags(ValkeyModuleCtx *ctx) { if (ctx->client) { if (ctx->client->flag.deny_blocking) flags |= VALKEYMODULE_CTX_FLAGS_DENY_BLOCKING; /* Module command received from PRIMARY, is replicated. 
*/ - if (ctx->client->flag.replication_source) flags |= VALKEYMODULE_CTX_FLAGS_REPLICATED; + if (ctx->client->flag.primary) flags |= VALKEYMODULE_CTX_FLAGS_REPLICATED; + if (ctx->client->flag.slot_migration_source) flags |= VALKEYMODULE_CTX_FLAGS_IMPORTING_SLOT; if (ctx->client->resp == 3) { flags |= VALKEYMODULE_CTX_FLAGS_RESP3; } @@ -3973,7 +3974,7 @@ int VM_GetContextFlags(ValkeyModuleCtx *ctx) { flags |= VALKEYMODULE_CTX_FLAGS_LOADING; /* Maxmemory and eviction policy */ - if (server.maxmemory > 0 && (!server.primary || !server.repl_replica_ignore_maxmemory)) { + if (server.maxmemory > 0 && (!server.primary_host || !server.repl_replica_ignore_maxmemory)) { flags |= VALKEYMODULE_CTX_FLAGS_MAXMEMORY; if (server.maxmemory_policy != MAXMEMORY_NO_EVICTION) flags |= VALKEYMODULE_CTX_FLAGS_EVICT; @@ -3984,22 +3985,22 @@ int VM_GetContextFlags(ValkeyModuleCtx *ctx) { if (server.saveparamslen > 0) flags |= VALKEYMODULE_CTX_FLAGS_RDB; /* Replication flags */ - if (server.primary == NULL) { + if (server.primary_host == NULL) { flags |= VALKEYMODULE_CTX_FLAGS_PRIMARY; } else { flags |= VALKEYMODULE_CTX_FLAGS_REPLICA; if (server.repl_replica_ro) flags |= VALKEYMODULE_CTX_FLAGS_READONLY; /* Replica state flags. */ - if (server.primary->state == REPL_STATE_CONNECT || server.primary->state == REPL_STATE_CONNECTING) { + if (server.repl_state == REPL_STATE_CONNECT || server.repl_state == REPL_STATE_CONNECTING) { flags |= VALKEYMODULE_CTX_FLAGS_REPLICA_IS_CONNECTING; - } else if (server.primary->state == REPL_STATE_TRANSFER) { + } else if (server.repl_state == REPL_STATE_TRANSFER) { flags |= VALKEYMODULE_CTX_FLAGS_REPLICA_IS_TRANSFERRING; - } else if (server.primary->state == REPL_STATE_CONNECTED) { + } else if (server.repl_state == REPL_STATE_CONNECTED) { flags |= VALKEYMODULE_CTX_FLAGS_REPLICA_IS_ONLINE; } - if (server.primary->state != REPL_STATE_CONNECTED) flags |= VALKEYMODULE_CTX_FLAGS_REPLICA_IS_STALE; + if (server.repl_state != REPL_STATE_CONNECTED) flags |= VALKEYMODULE_CTX_FLAGS_REPLICA_IS_STALE; } /* OOM flag. */ @@ -6462,7 +6463,7 @@ ValkeyModuleCallReply *VM_Call(ValkeyModuleCtx *ctx, const char *cmdname, const goto cleanup; } - if (server.primary && server.repl_replica_ro && !obey_client) { + if (server.primary_host && server.repl_replica_ro && !obey_client) { errno = ESPIPE; if (error_as_call_replies) { sds msg = sdsdup(shared.roreplicaerr->ptr); @@ -6472,7 +6473,7 @@ ValkeyModuleCallReply *VM_Call(ValkeyModuleCtx *ctx, const char *cmdname, const } } - if (server.primary && server.primary->state != REPL_STATE_CONNECTED && server.repl_serve_stale_data == 0 && + if (server.primary_host && server.repl_state != REPL_STATE_CONNECTED && server.repl_serve_stale_data == 0 && !(cmd_flags & CMD_STALE)) { errno = ESPIPE; if (error_as_call_replies) { @@ -8782,7 +8783,7 @@ int VM_AddPostNotificationJob(ValkeyModuleCtx *ctx, ValkeyModulePostNotificationJobFunc callback, void *privdata, void (*free_privdata)(void *)) { - if (server.loading || (server.primary && server.repl_replica_ro)) { + if (server.loading || (server.primary_host && server.repl_replica_ro)) { return VALKEYMODULE_ERR; } ValkeyModulePostExecUnitJob *job = zmalloc(sizeof(*job)); @@ -13059,7 +13060,7 @@ int VM_RdbLoad(ValkeyModuleCtx *ctx, ValkeyModuleRdbStream *stream, int flags) { } /* Not allowed on replicas. 
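The new VALKEYMODULE_CTX_FLAGS_IMPORTING_SLOT context flag lets modules distinguish commands that arrive over a slot-migration link from ordinary client traffic or primary replication. A hypothetical module command handler (everything except the module API calls and the flag name is made up for illustration) might check it like this:

    int MyWrite_Command(ValkeyModuleCtx *ctx, ValkeyModuleString **argv, int argc) {
        VALKEYMODULE_NOT_USED(argv);
        VALKEYMODULE_NOT_USED(argc);
        int flags = ValkeyModule_GetContextFlags(ctx);
        if (flags & VALKEYMODULE_CTX_FLAGS_IMPORTING_SLOT) {
            /* This write is part of an in-flight slot import; e.g. skip side
             * effects that should only fire for organic client writes. */
        }
        return ValkeyModule_ReplyWithSimpleString(ctx, "OK");
    }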
*/ - if (server.primary != NULL) { + if (server.primary_host != NULL) { errno = ENOTSUP; return VALKEYMODULE_ERR; } diff --git a/src/networking.c b/src/networking.c index b9712d877a..5c31ac4562 100644 --- a/src/networking.c +++ b/src/networking.c @@ -288,9 +288,9 @@ int prepareClientToWrite(client *c) { * CLIENT_PUSHING handling: disables the reply silencing flags. */ if ((c->flag.reply_off || c->flag.reply_skip) && !c->flag.pushing) return C_ERR; - /* Primaries don't receive replies, unless CLIENT_PRIMARY_FORCE_REPLY flag + /* Replication sources don't receive replies, unless force reply flag * is set. */ - if (c->flag.replication_source && !c->flag.primary_force_reply) return C_ERR; + if ((c->flag.replication_source) && !c->flag.replication_force_reply) return C_ERR; /* Skip the fake client, such as the fake client for AOF loading. * But CLIENT_ID_CACHED_RESPONSE is allowed since it is a fake client @@ -581,7 +581,7 @@ void afterErrorReply(client *c, const char *s, size_t len, int flags) { * the commands sent by the primary. However it is useful to log such events since * they are rare and may hint at errors in a script or a bug in the server. */ int ctype = getClientType(c); - if (ctype == CLIENT_TYPE_PRIMARY || ctype == CLIENT_TYPE_REPLICA || c->id == CLIENT_ID_AOF) { + if (ctype == CLIENT_TYPE_PRIMARY || ctype == CLIENT_TYPE_REPLICA || c->id == CLIENT_ID_AOF || ctype == CLIENT_TYPE_SLOT_MIGRATION) { char *to, *from; if (c->id == CLIENT_ID_AOF) { @@ -590,9 +590,12 @@ void afterErrorReply(client *c, const char *s, size_t len, int flags) { } else if (ctype == CLIENT_TYPE_PRIMARY) { to = "primary"; from = "replica"; - } else { + } else if (ctype == CLIENT_TYPE_REPLICA) { to = "replica"; from = "primary"; + } else { + to = "slot-migration-source"; + from = "slot-migration-target"; } if (len > 4096) len = 4096; @@ -1668,7 +1671,7 @@ void freeClient(client *c) { * * Note that before doing this we make sure that the client is not in * some unexpected state, by checking its flags. */ - if (server.primary && server.primary->client == c) { + if (server.primary && c->flag.primary) { serverLog(LL_NOTICE, "Connection with primary lost."); if (!c->flag.dont_cache_primary && !(c->flag.protocol_error || c->flag.blocked)) { c->flag.close_asap = 0; @@ -1819,14 +1822,14 @@ void beforeNextClient(client *c) { /* Trim the query buffer to the current position. */ if (c->flag.replication_source) { - /* If the client is a primary, trim the querybuf to repl_applied, - * since primary client is very special, its querybuf not only + /* If the client is a replication source, trim the querybuf to repl_applied, + * since replication clients are very special, its querybuf not only * used to parse command, but also proxy to sub-replicas. * * Here are some scenarios we cannot trim to qb_pos: - * 1. we don't receive complete command from primary - * 2. primary client blocked cause of client pause - * 3. io threads operate read, primary client flagged with CLIENT_PENDING_COMMAND + * 1. we don't receive complete command from replication + * 2. replication client blocked cause of client pause + * 3. 
io threads operate read, replication client flagged with CLIENT_PENDING_COMMAND * * In these scenarios, qb_pos points to the part of the current command * or the beginning of next command, and the current command is not applied yet, @@ -2144,7 +2147,7 @@ int postWriteToClient(client *c) { } if (c->nwritten > 0) { c->net_output_bytes += c->nwritten; - /* For clients representing primaries we don't count sending data + /* For clients representing replication sources we don't count sending data * as an interaction, since we always send REPLCONF ACK commands * that take some time to just fill the socket output buffer. * We just rely on data / pings received for timeout detection. */ @@ -2238,7 +2241,11 @@ int handleReadResult(client *c) { c->net_input_bytes += c->nread; if (c->flag.replication_source) { c->repl_data->read_reploff += c->nread; - server.stat_net_repl_input_bytes += c->nread; + if (c->flag.primary) { + server.stat_net_repl_input_bytes += c->nread; + } else if (c->flag.slot_migration_source) { + server.stat_net_slot_migration_input_bytes += c->nread; + } } else { server.stat_net_input_bytes += c->nread; } @@ -2281,7 +2288,7 @@ void handleParseError(client *c) { } else if (flags & READ_FLAGS_ERROR_UNBALANCED_QUOTES) { addReplyError(c, "Protocol error: unbalanced quotes in request"); setProtocolError("unbalanced quotes in inline request", c); - } else if (flags & READ_FLAGS_ERROR_UNEXPECTED_INLINE_FROM_PRIMARY) { + } else if (flags & READ_FLAGS_ERROR_UNEXPECTED_INLINE_FROM_REPLICATION_SOURCE) { serverLog(LL_WARNING, "WARNING: Receiving inline protocol from primary, primary stream corruption? Closing the " "primary connection and discarding the cached primary."); setProtocolError("Master using the inline protocol. Desync?", c); @@ -2295,7 +2302,7 @@ int isParsingError(client *c) { READ_FLAGS_ERROR_INVALID_MULTIBULK_LEN | READ_FLAGS_ERROR_UNAUTHENTICATED_MULTIBULK_LEN | READ_FLAGS_ERROR_UNAUTHENTICATED_BULK_LEN | READ_FLAGS_ERROR_MBULK_INVALID_BULK_LEN | READ_FLAGS_ERROR_BIG_BULK_COUNT | READ_FLAGS_ERROR_MBULK_UNEXPECTED_CHARACTER | - READ_FLAGS_ERROR_UNEXPECTED_INLINE_FROM_PRIMARY | READ_FLAGS_ERROR_UNBALANCED_QUOTES); + READ_FLAGS_ERROR_UNEXPECTED_INLINE_FROM_REPLICATION_SOURCE | READ_FLAGS_ERROR_UNBALANCED_QUOTES); } /* This function is called after the query-buffer was parsed. @@ -2556,7 +2563,7 @@ void processInlineBuffer(client *c) { int argc, j, linefeed_chars = 1; sds *argv, aux; size_t querylen; - int is_primary = c->read_flags & READ_FLAGS_PRIMARY; + int is_replication_source = c->read_flags & READ_FLAGS_REPLICATION_SOURCE; /* Search for end of line */ newline = strchr(c->querybuf + c->qb_pos, '\n'); @@ -2593,9 +2600,9 @@ void processInlineBuffer(client *c) { * * However there is an exception: primaries may send us just a newline * to keep the connection active. 
*/ - if (querylen != 0 && is_primary) { + if (querylen != 0 && is_replication_source) { sdsfreesplitres(argv, argc); - c->read_flags |= READ_FLAGS_ERROR_UNEXPECTED_INLINE_FROM_PRIMARY; + c->read_flags |= READ_FLAGS_ERROR_UNEXPECTED_INLINE_FROM_REPLICATION_SOURCE; return; } @@ -2683,7 +2690,7 @@ void processMultibulkBuffer(client *c) { char *newline = NULL; int ok; long long ll; - int is_primary = c->read_flags & READ_FLAGS_PRIMARY; + int is_replication_source = c->read_flags & READ_FLAGS_REPLICATION_SOURCE; int auth_required = c->read_flags & READ_FLAGS_AUTH_REQUIRED; if (c->multibulklen == 0) { @@ -2787,7 +2794,7 @@ void processMultibulkBuffer(client *c) { size_t bulklen_slen = newline - (c->querybuf + c->qb_pos + 1); ok = string2ll(c->querybuf + c->qb_pos + 1, bulklen_slen, &ll); - if (!ok || ll < 0 || (!(is_primary) && ll > server.proto_max_bulk_len)) { + if (!ok || ll < 0 || (!(is_replication_source) && ll > server.proto_max_bulk_len)) { c->read_flags |= READ_FLAGS_ERROR_MBULK_INVALID_BULK_LEN; return; } else if (ll > 16384 && auth_required) { @@ -2796,7 +2803,7 @@ void processMultibulkBuffer(client *c) { } c->qb_pos = newline - c->querybuf + 2; - if (!(is_primary) && ll >= PROTO_MBULK_BIG_ARG) { + if (!(is_replication_source) && ll >= PROTO_MBULK_BIG_ARG) { /* When the client is not a primary client (because primary * client's querybuf can only be trimmed after data applied * and sent to replicas). @@ -2845,7 +2852,7 @@ void processMultibulkBuffer(client *c) { /* Optimization: if a non-primary client's buffer contains JUST our bulk element * instead of creating a new object by *copying* the sds we * just use the current sds string. */ - if (!is_primary && c->qb_pos == 0 && c->bulklen >= PROTO_MBULK_BIG_ARG && + if (!is_replication_source && c->qb_pos == 0 && c->bulklen >= PROTO_MBULK_BIG_ARG && sdslen(c->querybuf) == (size_t)(c->bulklen + 2)) { c->argv[c->argc++] = createObject(OBJ_STRING, c->querybuf); c->argv_len_sum += c->bulklen; @@ -2895,15 +2902,15 @@ void commandProcessed(client *c) { if (!c->repl_data) return; long long prev_offset = c->repl_data->reploff; - if (c->flag.replication_source && !c->flag.multi) { - /* Update the applied replication offset of our primary. */ + if (!c->flag.multi && c->flag.replication_source) { + /* Update the applied replication offset of our source. */ c->repl_data->reploff = c->repl_data->read_reploff - sdslen(c->querybuf) + c->qb_pos; } - /* If the client is a primary we need to compute the difference + /* If the client is a replication source we need to compute the difference * between the applied offset before and after processing the buffer, * to understand how much of the replication stream was actually - * applied to the primary state: this quantity, and its corresponding + * applied to the replication state: this quantity, and its corresponding * part of the replication stream, will be propagated to the * sub-replicas and to the replication backlog. */ if (c->flag.replication_source) { @@ -3010,7 +3017,7 @@ int canParseCommand(client *c) { * commands to execute in c->argv. */ if (c->flag.pending_command) return 0; - /* Don't process input from the primary while there is a busy script + /* Don't process input from replication while there is a busy script * condition on the replica. We want just to accumulate the replication * stream (instead of replying -BUSY like we do with other clients) and * later resume the processing. */ @@ -3033,7 +3040,7 @@ int processInputBuffer(client *c) { break; } - c->read_flags = c->flag.replication_source ? 
READ_FLAGS_PRIMARY : 0; + c->read_flags = c->flag.replication_source ? READ_FLAGS_REPLICATION_SOURCE : 0; c->read_flags |= authRequired(c) ? READ_FLAGS_AUTH_REQUIRED : 0; parseCommand(c); @@ -3076,7 +3083,7 @@ void readToQueryBuf(client *c) { /* If the replica RDB client is marked as closed ASAP, do not try to read from it */ if (c->flag.close_asap) return; - int is_primary = c->read_flags & READ_FLAGS_PRIMARY; + int is_replication_source = c->read_flags & READ_FLAGS_REPLICATION_SOURCE; readlen = PROTO_IOBUF_LEN; qblen = c->querybuf ? sdslen(c->querybuf) : 0; @@ -3110,7 +3117,7 @@ void readToQueryBuf(client *c) { * Although we have ensured that c->querybuf will not be expanded in the current * thread_shared_qb, we still add this check for code robustness. */ int use_thread_shared_qb = (c->querybuf == thread_shared_qb) ? 1 : 0; - if (!is_primary && // primary client's querybuf can grow greedy. + if (!is_replication_source && /* replication client's querybuf can grow greedy. */ (big_arg || sdsalloc(c->querybuf) < PROTO_IOBUF_LEN)) { /* When reading a BIG_ARG we won't be reading more than that one arg * into the query buffer, so we don't need to pre-allocate more than we @@ -3137,7 +3144,7 @@ void readToQueryBuf(client *c) { sdsIncrLen(c->querybuf, c->nread); qblen = sdslen(c->querybuf); if (c->querybuf_peak < qblen) c->querybuf_peak = qblen; - if (!is_primary) { + if (!is_replication_source) { /* The commands cached in the MULTI/EXEC queue have not been executed yet, * so they are also considered a part of the query buffer in a broader sense. * @@ -3240,7 +3247,7 @@ sds catClientInfoString(sds s, client *client, int hide_user_data) { *p++ = 'S'; } - if (client->flag.replication_source) *p++ = 'M'; + if (client->flag.primary) *p++ = 'M'; if (client->flag.pubsub) *p++ = 'P'; if (client->flag.multi) *p++ = 'x'; if (client->flag.blocked) *p++ = 'b'; @@ -4132,7 +4139,7 @@ void helloCommand(client *c) { if (!server.sentinel_mode) { addReplyBulkCString(c, "role"); - addReplyBulkCString(c, server.primary ? "replica" : "master"); + addReplyBulkCString(c, server.primary_host ? "replica" : "master"); } addReplyBulkCString(c, "modules"); @@ -4361,13 +4368,15 @@ size_t getClientMemoryUsage(client *c, size_t *output_buffer_mem_usage) { * CLIENT_TYPE_REPLICA -> replica * CLIENT_TYPE_PUBSUB -> Client subscribed to Pub/Sub channels * CLIENT_TYPE_PRIMARY -> The client representing our replication primary. + * CLIENT_TYPE_SLOT_MIGRATION -> The client representing a slot migration. */ int getClientType(client *c) { - if (c->flag.replication_source) return CLIENT_TYPE_PRIMARY; + if (c->flag.primary) return CLIENT_TYPE_PRIMARY; /* Even though MONITOR clients are marked as replicas, we * want the expose them as normal clients. 
*/ if (c->flag.replica && !c->flag.monitor) return CLIENT_TYPE_REPLICA; if (c->flag.pubsub) return CLIENT_TYPE_PUBSUB; + if (c->flag.slot_migration_source) return CLIENT_TYPE_SLOT_MIGRATION; return CLIENT_TYPE_NORMAL; } @@ -4382,6 +4391,8 @@ int getClientTypeByName(char *name) { return CLIENT_TYPE_PUBSUB; else if (!strcasecmp(name, "master") || !strcasecmp(name, "primary")) return CLIENT_TYPE_PRIMARY; + else if (!strcasecmp(name, "slot-migration")) + return CLIENT_TYPE_SLOT_MIGRATION; else return -1; } @@ -4392,6 +4403,7 @@ char *getClientTypeName(int class) { case CLIENT_TYPE_REPLICA: return "slave"; case CLIENT_TYPE_PUBSUB: return "pubsub"; case CLIENT_TYPE_PRIMARY: return "master"; + case CLIENT_TYPE_SLOT_MIGRATION: return "slot-migration"; default: return NULL; } } @@ -4407,9 +4419,9 @@ int checkClientOutputBufferLimits(client *c) { unsigned long used_mem = getClientOutputBufferMemoryUsage(c); class = getClientType(c); - /* For the purpose of output buffer limiting, primaries are handled - * like normal clients. */ - if (class == CLIENT_TYPE_PRIMARY) class = CLIENT_TYPE_NORMAL; + /* For the purpose of output buffer limiting, primaries and slot migrations + * are handled like normal clients. */ + if (class == CLIENT_TYPE_PRIMARY || class == CLIENT_TYPE_SLOT_MIGRATION) class = CLIENT_TYPE_NORMAL; /* Note that it doesn't make sense to set the replica clients output buffer * limit lower than the repl-backlog-size config (partial sync will succeed @@ -4892,7 +4904,7 @@ void ioThreadReadQueryFromClient(void *data) { done: /* Only trim query buffer for non-primary clients * Primary client's buffer is handled by main thread using repl_applied position */ - if (!(c->read_flags & READ_FLAGS_PRIMARY)) { + if (!(c->read_flags & READ_FLAGS_REPLICATION_SOURCE)) { trimClientQueryBuffer(c); } atomic_thread_fence(memory_order_release); diff --git a/src/object.c b/src/object.c index b8200dd815..a9c701964a 100644 --- a/src/object.c +++ b/src/object.c @@ -1337,7 +1337,8 @@ struct serverMemOverhead *getMemoryOverheadData(void) { * updateClientMemoryUsage(). */ mh->clients_normal = server.stat_clients_type_memory[CLIENT_TYPE_PRIMARY] + server.stat_clients_type_memory[CLIENT_TYPE_PUBSUB] + - server.stat_clients_type_memory[CLIENT_TYPE_NORMAL]; + server.stat_clients_type_memory[CLIENT_TYPE_NORMAL] + + server.stat_clients_type_memory[CLIENT_TYPE_SLOT_MIGRATION]; mem_total += mh->clients_normal; mh->cluster_links = server.stat_cluster_links_memory; diff --git a/src/rdb.c b/src/rdb.c index 4a3a7e1c8e..7bb9edf31f 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -1869,7 +1869,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { int deep_integrity_validation = server.sanitize_dump_payload == SANITIZE_DUMP_YES; if (server.sanitize_dump_payload == SANITIZE_DUMP_CLIENTS) { /* Skip sanitization when loading (an RDB), or getting a RESTORE command - * from either the primary or a client using an ACL user with the skip-sanitize-payload flag. */ + * from either a replication source or a client using an ACL user with the skip-sanitize-payload flag. 
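Note on the new client class above: because CLIENT_TYPE_SLOT_MIGRATION is wired into getClientTypeByName() and getClientTypeName(), a slot-migration link should become addressable by its type name wherever those lookups are used. The CLIENT command plumbing is not part of this patch, so the following is an assumed usage example rather than something the diff demonstrates:

    CLIENT LIST TYPE slot-migration
    CLIENT KILL TYPE slot-migration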
*/ int skip = server.loading || (server.current_client && (server.current_client->flag.replication_source)); if (!skip && server.current_client && server.current_client->user) skip = !!(server.current_client->user->flags & USER_FLAG_SANITIZE_PAYLOAD_SKIP); @@ -2935,12 +2935,12 @@ void rdbLoadProgressCallback(rio *r, const void *buf, size_t len) { if (server.loading_process_events_interval_bytes && (r->processed_bytes + len) / server.loading_process_events_interval_bytes > r->processed_bytes / server.loading_process_events_interval_bytes) { - replicationSendNewlineToConnectedLinks(); + if (server.primary_host && server.repl_state == REPL_STATE_TRANSFER) replicationSendNewlineToPrimary(); loadingAbsProgress(r->processed_bytes); processEventsWhileBlocked(); processModuleLoadingProgressEvent(0); } - if (server.primary && server.primary->state == REPL_STATE_TRANSFER && rioCheckType(r) == RIO_TYPE_CONN) { + if (server.repl_state == REPL_STATE_TRANSFER && rioCheckType(r) == RIO_TYPE_CONN) { server.stat_net_repl_input_bytes += len; } } @@ -3624,8 +3624,8 @@ int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, slotBitmap slot_bitmap) retval = rewriteAppendOnlyFileRio(&rdb, slot_bitmap); rioWrite(&rdb, "*3\r\n", 4); rioWriteBulkString(&rdb, "REPLCONF", 8); - rioWriteBulkString(&rdb, "SYNC-PAYLOAD-END", 17); - rioWriteBulkLongLong(&rdb, rsi->repl_stream_db); + rioWriteBulkString(&rdb, "AOF-PAYLOAD-END", 15); + rioWriteBulkLongLong(&rdb, server.primary_repl_offset); } else { retval = rdbSaveRioWithEOFMark(req, &rdb, NULL, rsi); } @@ -3791,7 +3791,7 @@ rdbSaveInfo *rdbPopulateSaveInfo(rdbSaveInfo *rsi) { * connects to us, the NULL repl_backlog will trigger a full * synchronization, at the same time we will use a new replid and clear * replid2. */ - if (!server.primary && server.repl_backlog) { + if (!server.primary_host && server.repl_backlog) { /* Note that when server.replicas_eldb is -1, it means that this primary * didn't apply any write commands after a full synchronization. * So we can let repl_stream_db be 0, this allows a restarted replica @@ -3804,7 +3804,7 @@ rdbSaveInfo *rdbPopulateSaveInfo(rdbSaveInfo *rsi) { /* If the instance is a replica we need a connected primary * in order to fetch the currently selected DB. 
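For reference, the trailer that rdbSaveToReplicasSockets() now appends to an AOF-formatted payload is a plain RESP array, so the very end of the stream looks like this on the wire (the offset 1024 and its bulk length are illustrative values, not taken from the patch):

    *3\r\n
    $8\r\nREPLCONF\r\n
    $15\r\nAOF-PAYLOAD-END\r\n
    $4\r\n1024\r\n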
*/ if (server.primary) { - rsi->repl_stream_db = server.primary->client->db->id; + rsi->repl_stream_db = server.primary->db->id; return rsi; } diff --git a/src/replication.c b/src/replication.c index c50cebecfd..90a9e90e24 100644 --- a/src/replication.c +++ b/src/replication.c @@ -47,17 +47,15 @@ #include void replicationDiscardCachedPrimary(void); -void replicationResurrectCachedPrimary(replicationLink *link); -void replicationResurrectProvisionalSource(replicationLink *link); -void replicationSendAck(replicationLink *link); +void replicationResurrectCachedPrimary(connection *conn); +void replicationResurrectProvisionalPrimary(void); int replicaPutOnline(client *replica); void replicaStartCommandStream(client *replica); -int cancelReplicationHandshake(replicationLink *link, int reconnect); -void replicationSteadyStateInit(replicationLink *link); +int cancelReplicationHandshake(int reconnect); +void replicationSteadyStateInit(void); void dualChannelSetupMainConnForPsync(connection *conn); -int dualChannelSyncHandleRdbLoadCompletion(replicationLink *link); -static void dualChannelFullSyncWithReplicationSource(connection *conn); -void replicationFinishSyncPayload(connection *conn, replicationLink *link, int db); +void dualChannelSyncHandleRdbLoadCompletion(void); +static void dualChannelFullSyncWithPrimary(connection *conn); /* We take a global flag to remember if this instance generated an RDB * because of replication, so that we can remove the RDB file in case @@ -538,7 +536,7 @@ void replicationFeedReplicas(int dictid, robj **argv, int argc) { * propagate *identical* replication stream. In this way this replica can * advertise the same replication ID as the primary (since it shares the * primary replication history and has the same backlog and offsets). */ - if (server.primary != NULL) return; + if (server.primary_host != NULL) return; /* If there aren't replicas, and there is no backlog buffer to populate, * we can return ASAP. */ @@ -962,11 +960,11 @@ int startBgsaveForReplication(int mincapa, int req, slotBitmap slot_bitmap) { /* We use a socket target if replica can handle the EOF marker and we're configured to do diskless syncs. * Note that in case we're creating a "filtered" RDB (functions-only, for example) we also force socket replication * to avoid overwriting the snapshot RDB file with filtered data. */ - socket_target = (server.repl_diskless_sync || req & REPLICA_REQ_RDB_MASK) && (mincapa & REPLICA_CAPA_EOF); + socket_target = (server.repl_diskless_sync || req & REPLICA_REQ_RDB_MASK) && (mincapa & REPLICA_CAPA_EOF || req & REPLICA_REQ_AOF_FORMAT); /* `SYNC` should have failed with error if we don't support socket and require a filter, assert this here */ serverAssert(socket_target || !(req & REPLICA_REQ_RDB_MASK)); - serverLog(LL_NOTICE, "Starting BGSAVE for SYNC with target: %s using: %s with format %s", + serverLog(LL_NOTICE, "Starting BGSAVE for SYNC with target: %s using: %s with format: %s", socket_target ? "replicas sockets" : "disk", (req & REPLICA_REQ_RDB_CHANNEL) ? "dual-channel" : "normal sync", (req & REPLICA_REQ_AOF_FORMAT) ? "AOF" : "RDB"); @@ -1048,7 +1046,7 @@ void syncCommand(client *c) { * become a primary if so. 
*/ if (c->argc > 3 && !strcasecmp(c->argv[0]->ptr, "psync") && !strcasecmp(c->argv[3]->ptr, "failover")) { serverLog(LL_NOTICE, "Failover request received for replid %s.", (unsigned char *)c->argv[1]->ptr); - if (server.primary == NULL) { + if (!server.primary_host) { addReplyError(c, "PSYNC FAILOVER can't be sent to a master."); return; } @@ -1076,7 +1074,7 @@ void syncCommand(client *c) { /* Refuse SYNC requests if we are a replica but the link with our primary * is not ok... */ - if (server.primary && server.primary->state != REPL_STATE_CONNECTED) { + if (server.primary_host && server.repl_state != REPL_STATE_CONNECTED) { addReplyError(c, "-NOMASTERLINK Can't SYNC while not connected with my master"); return; } @@ -1098,12 +1096,6 @@ void syncCommand(client *c) { return; } - /* Fail sync if it is asking for AOF format and a slot is not set via REPLCONF already. */ - if (c->repl_data->replica_req & REPLICA_REQ_AOF_FORMAT && isSlotBitmapAllSlots(c->repl_data->slot_bitmap)) { - addReplyError(c, "AOF format is only supported for single slot SYNC"); - return; - } - serverLog(LL_NOTICE, "Replica %s asks for synchronization", replicationGetReplicaName(c)); /* Try a partial resynchronization if this is a PSYNC command. @@ -1179,11 +1171,8 @@ void syncCommand(client *c) { server.replid, server.replid2); } - /* For slot level replication, we make no attempt to coallesce BGSAVEs */ - int require_dedicated = !isSlotBitmapAllSlots(c->repl_data->slot_bitmap); - /* CASE 1: BGSAVE is in progress, with disk target. */ - if (!require_dedicated && server.child_type == CHILD_TYPE_RDB && server.rdb_child_type == RDB_CHILD_TYPE_DISK) { + if (server.child_type == CHILD_TYPE_RDB && server.rdb_child_type == RDB_CHILD_TYPE_DISK) { /* Ok a background save is in progress. Let's check if it is a good * one for replication, i.e. if there is another replica that is * registering differences since the server forked to save. */ @@ -1204,7 +1193,7 @@ void syncCommand(client *c) { * capabilities of the replica that triggered the current BGSAVE * and its exact requirements. */ if (ln && ((c->repl_data->replica_capa & replica->repl_data->replica_capa) == replica->repl_data->replica_capa) && - c->repl_data->replica_req == replica->repl_data->replica_req) { + c->repl_data->replica_req == replica->repl_data->replica_req && isSlotBitmapEmpty(c->repl_data->slot_bitmap)) { /* Perfect, the server is already registering differences for * another replica. Set the right state, and copy the buffer. * We don't copy buffer if clients don't want. */ @@ -1216,35 +1205,32 @@ void syncCommand(client *c) { * register differences. */ serverLog(LL_NOTICE, "Can't attach the replica to the current BGSAVE. Waiting for next BGSAVE for SYNC"); } - } - /* CASE 2: BGSAVE is in progress, with socket target. */ - if (server.child_type == CHILD_TYPE_RDB && server.rdb_child_type == RDB_CHILD_TYPE_SOCKET) { + /* CASE 2: BGSAVE is in progress, with socket target. */ + } else if (server.child_type == CHILD_TYPE_RDB && server.rdb_child_type == RDB_CHILD_TYPE_SOCKET) { /* There is an RDB child process but it is writing directly to * children sockets. We need to wait for the next BGSAVE * in order to synchronize. */ serverLog(LL_NOTICE, "Current BGSAVE has socket target. Waiting for next BGSAVE for SYNC"); - return; - } - /* CASE 3: There is no BGSAVE is in progress, but we need to delay. 
*/ - if (!require_dedicated && server.repl_diskless_sync && (c->repl_data->replica_capa & REPLICA_CAPA_EOF) && server.repl_diskless_sync_delay) { - /* Diskless replication RDB child is created inside - * replicationCron() since we want to delay its start a - * few seconds to wait for more replicas to arrive. */ - serverLog(LL_NOTICE, "Delay next BGSAVE for diskless SYNC"); - return; - } - - /* CASE 4: We don't have a BGSAVE in progress, but there is an existing child process. */ - if (hasActiveChildProcess()) { - serverLog(LL_NOTICE, "No BGSAVE in progress, but another BG operation is active. " - "BGSAVE for replication delayed"); - return; + /* CASE 3: There is no BGSAVE is in progress. */ + } else { + if (server.repl_diskless_sync && (c->repl_data->replica_capa & REPLICA_CAPA_EOF) && server.repl_diskless_sync_delay && isSlotBitmapEmpty(c->repl_data->slot_bitmap)) { + /* Diskless replication RDB child is created inside + * replicationCron() since we want to delay its start a + * few seconds to wait for more replicas to arrive. */ + serverLog(LL_NOTICE, "Delay next BGSAVE for diskless SYNC"); + } else { + /* We don't have a BGSAVE in progress, let's start one. Diskless + * or disk-based mode is determined by replica's capacity. */ + if (!hasActiveChildProcess()) { + startBgsaveForReplication(c->repl_data->replica_capa, c->repl_data->replica_req, c->repl_data->slot_bitmap); + } else { + serverLog(LL_NOTICE, "No BGSAVE in progress, but another BG operation is active. " + "BGSAVE for replication delayed"); + } + } } - - /* CASE 5: We are good to start a BGSAVE. Diskless or disk-based mode is determined by replica's capacity. */ - startBgsaveForReplication(c->repl_data->replica_capa, c->repl_data->replica_req, c->repl_data->slot_bitmap); return; } @@ -1268,7 +1254,7 @@ int anyOtherReplicaWaitRdb(client *except_me) { void initClientReplicationData(client *c) { if (c->repl_data) return; c->repl_data = (ClientReplicationData *)zcalloc(sizeof(ClientReplicationData)); - slotBitmapSetAll(c->repl_data->slot_bitmap); + memset(c->repl_data->slot_bitmap, 0, sizeof(c->repl_data->slot_bitmap)); } void freeClientReplicationData(client *c) { @@ -1309,9 +1295,7 @@ void freeClientReplicationData(client *c) { moduleFireServerEvent(VALKEYMODULE_EVENT_REPLICA_CHANGE, VALKEYMODULE_SUBEVENT_REPLICA_CHANGE_OFFLINE, NULL); } - if (c->flag.replication_source) { - replicationHandleSourceDisconnection(c->repl_data->link); - } + if (c->flag.primary) replicationHandlePrimaryDisconnection(); sdsfree(c->repl_data->replica_addr); zfree(c->repl_data); c->repl_data = NULL; @@ -1437,7 +1421,7 @@ void replconfCommand(client *c) { } else if (!strcasecmp(c->argv[j]->ptr, "getack")) { /* REPLCONF GETACK is used in order to request an ACK ASAP * to the replica. 
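A note on the slot_bitmap default changed above: initClientReplicationData() now zeroes the bitmap, so an all-zero bitmap means "no slot filter" (an ordinary replica), which is why the BGSAVE-reuse and diskless-delay branches now require isSlotBitmapEmpty(). The helper itself is not shown in this patch; a minimal sketch of what such checks could look like, assuming slotBitmap is a CLUSTER_SLOTS/8-byte array (names suffixed with Sketch are illustrative, not the patch's implementation):

    #include <string.h>

    #define CLUSTER_SLOTS 16384
    typedef unsigned char slotBitmap[CLUSTER_SLOTS / 8];

    /* True when no slot bit is set, i.e. the client asked for a regular
     * (unfiltered) sync rather than a per-slot one. */
    static int isSlotBitmapEmptySketch(const slotBitmap bitmap) {
        static const slotBitmap zeroes; /* zero-initialized */
        return memcmp(bitmap, zeroes, sizeof(slotBitmap)) == 0;
    }

    /* True when every slot bit is set. */
    static int isSlotBitmapAllSlotsSketch(const slotBitmap bitmap) {
        for (size_t i = 0; i < sizeof(slotBitmap); i++)
            if (bitmap[i] != 0xff) return 0;
        return 1;
    }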
*/ - if (server.primary && server.primary->client) replicationSendAck(server.primary); + if (server.primary_host && server.primary) replicationSendAck(server.primary); return; } else if (!strcasecmp(c->argv[j]->ptr, "rdb-only")) { /* REPLCONF RDB-ONLY is used to identify the client only wants @@ -1513,10 +1497,7 @@ void replconfCommand(client *c) { if (!server.cluster_enabled) { addReplyError(c, "Cannot replicate a slot when cluster mode is disabled"); } - if (!isSlotBitmapAllSlots(c->repl_data->slot_bitmap)) { - addReplyError(c, "Slot bitmap already set"); - } - if (stringObjectLen(c->argv[j + 1]) != CLUSTER_SLOTS / 8) { + if (stringObjectLen(c->argv[j + 1]) != sizeof(slotBitmap)) { addReplyError(c, "Invalid slot bitmap length"); return; } @@ -1526,24 +1507,26 @@ void replconfCommand(client *c) { return; } } - memcpy(c->repl_data->slot_bitmap, c->argv[j + 1]->ptr, CLUSTER_SLOTS / 8); + memcpy(c->repl_data->slot_bitmap, c->argv[j + 1]->ptr, sizeof(slotBitmap)); /* For now, we only support AOF for slot transfer. */ c->repl_data->replica_req |= REPLICA_REQ_AOF_FORMAT; - } else if (!strcasecmp(c->argv[j]->ptr, "sync-payload-end")) { - /* REPLCONF sync-payload-end is used to inform the replica - * that the primary has finished sending the sync snapshot, and - * that it is free to begin processing the replication backlog. - * - * dbnum specifies which db to stream the backlog into. */ - int db_num = 0; - if (getIntFromObjectOrReply(c, c->argv[j + 1], &db_num, NULL) != C_OK || db_num < 0 || db_num >= server.dbnum) { - addReplyError(c, "Unable to parse DB number"); + } else if (!strcasecmp(c->argv[j]->ptr, "aof-payload-end")) { + /* REPLCONF aof-payload-end is used to inform the target + * that the replication source has finished sending the AOF formatted + * sync snapshot, and that it is free to begin processing the + * replication backlog. */ + long long initial_offset = 0; + if (getLongLongFromObjectOrReply(c, c->argv[j + 1], &initial_offset, NULL) != C_OK) { return; } - serverLog(LL_NOTICE, "Got sync-payload-end for db %d", db_num); - - replicationFinishSyncPayload(c->conn, c->repl_data->link, db_num); + if (c->flag.slot_migration_source) { + clusterSlotMigrationDoneSyncing(initial_offset); + return; + } + /* Right now, we only support this for slot migration. */ + addReplyErrorFormat(c, "AOF sync is not in progress."); + return; } else { addReplyErrorFormat(c, "Unrecognized REPLCONF option: %s", (char *)c->argv[j]->ptr); return; @@ -1985,30 +1968,13 @@ void shiftReplicationId(void) { /* ----------------------------------- REPLICA -------------------------------- */ -char *replicationGetNameForLogs(replicationLink *link) { - if (link == server.primary) - return "PRIMARY"; - if (!isSlotBitmapAllSlots(link->slot_bitmap)) - return "SLOT OWNER"; - return "OTHER REPLICATION SOURCE"; -} - /* Returns 1 if the given replication state is a handshake state, * 0 otherwise. */ -int replicaIsInHandshakeState(replicationLink *link) { - return link->state >= REPL_STATE_RECEIVE_PING_REPLY && link->state <= REPL_STATE_RECEIVE_PSYNC_REPLY; +int replicaIsInHandshakeState(void) { + return server.repl_state >= REPL_STATE_RECEIVE_PING_REPLY && server.repl_state <= REPL_STATE_RECEIVE_PSYNC_REPLY; } -void replicationSendNewlineOnLink(replicationLink *link) { - static time_t newline_sent; - if (time(NULL) != newline_sent) { - newline_sent = time(NULL); - /* Pinging back in this stage is best-effort. 
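Putting the two REPLCONF additions together, the slot-migration handshake sketched by this patch looks roughly as follows; the ordering around SYNC is inferred from the rdb.c and syncCommand changes rather than spelled out in one place:

    target -> source   REPLCONF slot-bitmap <CLUSTER_SLOTS/8-byte bitmap>   (forces the AOF-format payload)
    target -> source   SYNC
    source -> target   AOF-formatted stream covering the requested slots
    source -> target   REPLCONF AOF-PAYLOAD-END <repl-offset>               (in-band, terminates the snapshot)
    target             clusterSlotMigrationDoneSyncing(<repl-offset>), then consumes the live replication stream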
*/ - if (link->transfer_s) connWrite(link->transfer_s, "\n", 1); - } -} - -/* Avoid the primary to detect replicas as timing out while loading the +/* Avoid the primary to detect the replica is timing out while loading the * RDB file in initial synchronization. We send a single newline character * that is valid protocol but is guaranteed to either be sent entirely or * not, since the byte is indivisible. @@ -2016,15 +1982,12 @@ void replicationSendNewlineOnLink(replicationLink *link) { * The function is called in two contexts: while we flush the current * data with emptyData(), and while we load the new data received as an * RDB file from the primary. */ -void replicationSendNewlineToConnectedLinks() { - listIter li; - listNode *ln; - listRewind(server.replication_links, &li); - while ((ln = listNext(&li))) { - replicationLink *link = (replicationLink *)ln->value; - if (link->state >= REPL_STATE_CONNECTING && link->state <= REPL_STATE_CANCELLED) { - replicationSendNewlineOnLink(link); - } +void replicationSendNewlineToPrimary(void) { + static time_t newline_sent; + if (time(NULL) != newline_sent) { + newline_sent = time(NULL); + /* Pinging back in this stage is best-effort. */ + if (server.repl_transfer_s) connWrite(server.repl_transfer_s, "\n", 1); } } @@ -2033,17 +1996,15 @@ void replicationSendNewlineToConnectedLinks() { * after loading succeeded or failed. */ void replicationEmptyDbCallback(hashtable *d) { UNUSED(d); - replicationSendNewlineToConnectedLinks(); + if (server.repl_state == REPL_STATE_TRANSFER) replicationSendNewlineToPrimary(); } /* Once we have a link with the primary and the synchronization was * performed, this function materializes the primary client we store * at server.primary, starting from the specified file descriptor. */ -client *createReplicationLinkClientWithHandler(replicationLink *link, connection *conn, int dbid, ConnectionCallbackFunc handler) { - client *c = createClient(conn); - if (conn) { - connSetReadHandler(conn, handler); - } +void replicationCreatePrimaryClientWithHandler(connection *conn, int dbid, ConnectionCallbackFunc handler) { + server.primary = createClient(conn); + if (conn) connSetReadHandler(server.primary->conn, handler); /** * Important note: @@ -2056,33 +2017,28 @@ client *createReplicationLinkClientWithHandler(replicationLink *link, connection * to pass the execution to a background thread and unblock after the * execution is done. This is the reason why we allow blocking the replication * connection. */ - c->flag.replication_source = 1; - c->flag.authenticated = 1; - - - /* Allocate a private query buffer for the replication link client instead of using the shared query buffer. - * This is done because the replication link's query buffer data needs to be preserved for my sub-replicas to use. */ - c->querybuf = sdsempty(); - initClientReplicationData(c); - c->repl_data->reploff = link->initial_offset; - c->repl_data->read_reploff = c->repl_data->reploff; - c->user = NULL; /* This client can do everything. */ - c->repl_data->link = link; - memcpy(c->repl_data->replid, link->replid, sizeof(link->replid)); - + server.primary->flag.primary = 1; + server.primary->flag.authenticated = 1; + server.primary->flag.replication_source = 1; + + /* Allocate a private query buffer for the primary client instead of using the shared query buffer. + * This is done because the primary's query buffer data needs to be preserved for my sub-replicas to use. 
*/ + server.primary->querybuf = sdsempty(); + initClientReplicationData(server.primary); + server.primary->repl_data->reploff = server.primary_initial_offset; + server.primary->repl_data->read_reploff = server.primary->repl_data->reploff; + server.primary->user = NULL; /* This client can do everything. */ + memcpy(server.primary->repl_data->replid, server.primary_replid, sizeof(server.primary_replid)); /* If primary offset is set to -1, this primary is old and is not * PSYNC capable, so we flag it accordingly. */ - if (c->repl_data->reploff == -1) c->flag.pre_psync = 1; - if (dbid != -1) selectDb(c, dbid); - memcpy(c->repl_data->slot_bitmap, link->slot_bitmap, sizeof(slotBitmap)); - - return c; + if (server.primary->repl_data->reploff == -1) server.primary->flag.pre_psync = 1; + if (dbid != -1) selectDb(server.primary, dbid); } /* Wrapper for replicationCreatePrimaryClientWithHandler, init primary connection handler * with ordinary client connection handler */ -client *createReplicationLinkClient(replicationLink *link, connection *conn, int dbid) { - return createReplicationLinkClientWithHandler(link, conn, dbid, readQueryFromClient); +void replicationCreatePrimaryClient(connection *conn, int dbid) { + replicationCreatePrimaryClientWithHandler(conn, dbid, readQueryFromClient); } /* This function will try to re-enable the AOF file after the @@ -2159,75 +2115,13 @@ void disklessLoadDiscardFunctionsLibCtx(functionsLibCtx *temp_functions_lib_ctx) void replicationAttachToNewPrimary(void) { /* Replica starts to apply data from new primary, we must discard the cached * primary structure. */ - serverAssert(server.primary == NULL || server.primary->client == NULL); + serverAssert(server.primary == NULL); replicationDiscardCachedPrimary(); disconnectReplicas(); /* Force our replicas to resync with us as well. */ freeReplicationBacklog(); /* Don't allow our chained replicas to PSYNC. */ } -void replicationFinishSyncPayload(connection *conn, replicationLink *link, int db) { - /* Final setup of the connected replica <- primary link */ - int link_closed = 0; - if (link->rdb_channel_state == REPL_DUAL_CHANNEL_RDB_LOAD) { - if (dualChannelSyncHandleRdbLoadCompletion(link) == C_ERR) { - /* This may happen if, while loading the backlog, our primary is unset */ - serverLog(LL_NOTICE, "%s <-> MYSELF sync: Failed to finalize dual channel load", replicationGetNameForLogs(link)); - freeReplicationLink(link); - link_closed = 1; - } - } else { - /* Client could have been previously created for AOF load. */ - if (!link->client) { - link->client = createReplicationLinkClient(link, link->transfer_s, db); - link->transfer_s = NULL; - } - link->state = REPL_STATE_CONNECTED; - /* Send the initial ACK immediately to put this replica in online state. */ - replicationSendAck(link); - } - - if (!link_closed && link == server.primary) { - server.repl_down_since = 0; - - /* Fire the primary link modules event. */ - moduleFireServerEvent(VALKEYMODULE_EVENT_PRIMARY_LINK_CHANGE, VALKEYMODULE_SUBEVENT_PRIMARY_LINK_UP, NULL); - if (link->state == REPL_STATE_CONNECTED) { - /* After a full resynchronization we use the replication ID and - * offset of the primary. The secondary ID / offset are cleared since - * we are starting a new history. */ - memcpy(server.replid, link->client->repl_data->replid, sizeof(server.replid)); - server.primary_repl_offset = link->client->repl_data->reploff; - } - clearReplicationId2(); - - /* Let's create the replication backlog if needed. 
Replicas need to - * accumulate the backlog regardless of the fact they have sub-replicas - * or not, in order to behave correctly if they are promoted to - * primaries after a failover. */ - if (server.repl_backlog == NULL) createReplicationBacklog(); - serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Finished with success"); - - if (server.supervised_mode == SUPERVISED_SYSTEMD) { - serverCommunicateSystemd("STATUS=PRIMARY <-> REPLICA sync: Finished with success. Ready to accept connections " - "in read-write mode.\n"); - } - } - - /* Restart the AOF subsystem now that we finished the sync. This - * will trigger an AOF rewrite, and when done will start appending - * to the new file. */ - if (server.aof_enabled && server.aof_state != AOF_OFF) restartAOFAfterSYNC(); - - /* In case of dual channel replication sync we want to close the RDB connection - * once the connection is established */ - if (!link_closed && conn == link->rdb_transfer_s) { - connClose(conn); - link->rdb_transfer_s = NULL; - } - return; -} - /* Asynchronously read the SYNC payload we receive from a primary */ #define REPL_MAX_WRITTEN_BEFORE_FSYNC (1024 * 1024 * 8) /* 8 MB */ void readSyncBulkPayload(connection *conn) { @@ -2239,11 +2133,6 @@ void readSyncBulkPayload(connection *conn) { int empty_db_flags = server.repl_replica_lazy_flush ? EMPTYDB_ASYNC : EMPTYDB_NO_FLAGS; off_t left; - replicationLink *link = (replicationLink *)connGetPrivateData(conn); - - /* RDB bulk load will only be used if we are sending all slots. */ - serverAssert(isSlotBitmapAllSlots(link->slot_bitmap)); - /* Static vars used to hold the EOF mark, and the last bytes received * from the server: when they match, we reached the end of the transfer. */ static char eofmark[RDB_EOF_MARK_SIZE]; @@ -2252,10 +2141,10 @@ void readSyncBulkPayload(connection *conn) { /* If repl_transfer_size == -1 we still have to read the bulk length * from the primary reply. */ - if (link->transfer_size == -1) { + if (server.repl_transfer_size == -1) { nread = connSyncReadLine(conn, buf, 1024, server.repl_syncio_timeout * 1000); if (nread == -1) { - serverLog(LL_WARNING, "I/O error reading bulk count from %s: %s", replicationGetNameForLogs(link), connGetLastError(conn)); + serverLog(LL_WARNING, "I/O error reading bulk count from PRIMARY: %s", connGetLastError(conn)); goto error; } else { /* nread here is returned by connSyncReadLine(), which calls syncReadLine() and @@ -2264,19 +2153,18 @@ void readSyncBulkPayload(connection *conn) { } if (buf[0] == '-') { - serverLog(LL_WARNING, "%s aborted replication with an error: %s", replicationGetNameForLogs(link), buf + 1); + serverLog(LL_WARNING, "PRIMARY aborted replication with an error: %s", buf + 1); goto error; } else if (buf[0] == '\0') { /* At this stage just a newline works as a PING in order to take * the connection live. So we refresh our last interaction * timestamp. */ - link->transfer_lastio = server.unixtime; + server.repl_transfer_lastio = server.unixtime; return; } else if (buf[0] != '$') { serverLog(LL_WARNING, - "Bad protocol from %s, the first byte is not '$' (we received '%s'), are you sure the host " + "Bad protocol from PRIMARY, the first byte is not '$' (we received '%s'), are you sure the host " "and port are right?", - replicationGetNameForLogs(link), buf); goto error; } @@ -2297,14 +2185,14 @@ void readSyncBulkPayload(connection *conn) { memset(lastbytes, 0, RDB_EOF_MARK_SIZE); /* Set any repl_transfer_size to avoid entering this code path * at the next call. 
*/ - link->transfer_size = 0; - serverLog(LL_NOTICE, "%s <-> REPLICA sync: receiving streamed RDB from primary with EOF %s", replicationGetNameForLogs(link), + server.repl_transfer_size = 0; + serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: receiving streamed RDB from primary with EOF %s", use_diskless_load ? "to parser" : "to disk"); } else { usemark = 0; - link->transfer_size = strtol(buf + 1, NULL, 10); - serverLog(LL_NOTICE, "%s <-> REPLICA sync: receiving %lld bytes from primary %s", replicationGetNameForLogs(link), - (long long)link->transfer_size, use_diskless_load ? "to parser" : "to disk"); + server.repl_transfer_size = strtol(buf + 1, NULL, 10); + serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: receiving %lld bytes from primary %s", + (long long)server.repl_transfer_size, use_diskless_load ? "to parser" : "to disk"); } return; } @@ -2315,7 +2203,7 @@ void readSyncBulkPayload(connection *conn) { if (usemark) { readlen = sizeof(buf); } else { - left = link->transfer_size - link->transfer_read; + left = server.repl_transfer_size - server.repl_transfer_read; readlen = (left < (signed)sizeof(buf)) ? left : (signed)sizeof(buf); } @@ -2325,7 +2213,7 @@ void readSyncBulkPayload(connection *conn) { /* equivalent to EAGAIN */ return; } - serverLog(LL_WARNING, "I/O error trying to sync with %s: %s", replicationGetNameForLogs(link), + serverLog(LL_WARNING, "I/O error trying to sync with PRIMARY: %s", (nread == -1) ? connGetLastError(conn) : "connection lost"); goto error; } @@ -2351,20 +2239,19 @@ void readSyncBulkPayload(connection *conn) { /* Update the last I/O time for the replication transfer (used in * order to detect timeouts during replication), and write what we * got from the socket to the dump file on disk. */ - link->transfer_lastio = server.unixtime; - if ((nwritten = write(link->transfer_fd, buf, nread)) != nread) { + server.repl_transfer_lastio = server.unixtime; + if ((nwritten = write(server.repl_transfer_fd, buf, nread)) != nread) { serverLog(LL_WARNING, "Write error or short write writing to the DB dump file " - "needed for %s <-> REPLICA synchronization: %s", - replicationGetNameForLogs(link), + "needed for PRIMARY <-> REPLICA synchronization: %s", (nwritten == -1) ? strerror(errno) : "short write"); goto error; } - link->transfer_read += nread; + server.repl_transfer_read += nread; /* Delete the last 40 bytes from the file if we reached EOF. */ if (usemark && eof_reached) { - if (ftruncate(link->transfer_fd, link->transfer_read - RDB_EOF_MARK_SIZE) == -1) { + if (ftruncate(server.repl_transfer_fd, server.repl_transfer_read - RDB_EOF_MARK_SIZE) == -1) { serverLog(LL_WARNING, "Error truncating the RDB file received from the primary " "for SYNC: %s", @@ -2376,15 +2263,15 @@ void readSyncBulkPayload(connection *conn) { /* Sync data on disk from time to time, otherwise at the end of the * transfer we may suffer a big delay as the memory buffers are copied * into the actual disk. 
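The usemark path above relies on the streamed (length-less) transfer convention: the primary announces a 40-byte delimiter in the preamble instead of a payload size (the preamble parsing sits outside this hunk), then repeats the same delimiter to terminate the stream, which is why the replica keeps the last RDB_EOF_MARK_SIZE bytes it has seen. A simplified sketch of that rolling comparison, under the same assumptions:

    #include <string.h>

    #define RDB_EOF_MARK_SIZE 40

    /* Shift the previously seen tail, append the new bytes, and check whether
     * the last RDB_EOF_MARK_SIZE bytes now match the announced delimiter. */
    static int eofMarkReached(const char *eofmark, char *lastbytes,
                              const char *buf, size_t nread) {
        if (nread >= RDB_EOF_MARK_SIZE) {
            memcpy(lastbytes, buf + nread - RDB_EOF_MARK_SIZE, RDB_EOF_MARK_SIZE);
        } else {
            size_t rem = RDB_EOF_MARK_SIZE - nread;
            memmove(lastbytes, lastbytes + nread, rem);
            memcpy(lastbytes + rem, buf, nread);
        }
        return memcmp(lastbytes, eofmark, RDB_EOF_MARK_SIZE) == 0;
    }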
*/ - if (link->transfer_read >= link->transfer_last_fsync_off + REPL_MAX_WRITTEN_BEFORE_FSYNC) { - off_t sync_size = link->transfer_read - link->transfer_last_fsync_off; - rdb_fsync_range(link->transfer_fd, link->transfer_last_fsync_off, sync_size); - link->transfer_last_fsync_off += sync_size; + if (server.repl_transfer_read >= server.repl_transfer_last_fsync_off + REPL_MAX_WRITTEN_BEFORE_FSYNC) { + off_t sync_size = server.repl_transfer_read - server.repl_transfer_last_fsync_off; + rdb_fsync_range(server.repl_transfer_fd, server.repl_transfer_last_fsync_off, sync_size); + server.repl_transfer_last_fsync_off += sync_size; } /* Check if the transfer is now complete */ if (!usemark) { - if (link->transfer_read == link->transfer_size) eof_reached = 1; + if (server.repl_transfer_read == server.repl_transfer_size) eof_reached = 1; } /* If the transfer is yet not complete, we need to read more, so @@ -2447,7 +2334,7 @@ void readSyncBulkPayload(connection *conn) { * It is enabled only on SWAPDB diskless replication when primary replication ID hasn't changed, * because in that state the old content of the db represents a different point in time of the same * data set we're currently receiving from the primary. */ - if (memcmp(server.replid, link->replid, CONFIG_RUN_ID_SIZE) == 0) { + if (memcmp(server.replid, server.primary_replid, CONFIG_RUN_ID_SIZE) == 0) { asyncLoading = 1; } dbarray = diskless_load_tempDb; @@ -2458,34 +2345,29 @@ void readSyncBulkPayload(connection *conn) { replicationAttachToNewPrimary(); /* Even though we are on-empty-db and the database is empty, we still call emptyData. */ - serverLog(LL_NOTICE, "%s <-> REPLICA sync: Flushing old data", replicationGetNameForLogs(link)); + serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Flushing old data"); emptyData(-1, empty_db_flags, replicationEmptyDbCallback); dbarray = server.db; functions_lib_ctx = functionsLibCtxGetCurrent(); } - rioInitWithConn(&rdb, conn, link->transfer_size); + rioInitWithConn(&rdb, conn, server.repl_transfer_size); /* Put the socket in blocking mode to simplify RDB transfer. * We'll restore it when the RDB is received. */ connBlock(conn); connRecvTimeout(conn, server.repl_timeout * 1000); - serverLog(LL_NOTICE, "%s <-> REPLICA sync: Loading DB in memory", replicationGetNameForLogs(link)); - startLoading(link->transfer_size, RDBFLAGS_REPLICATION, asyncLoading); - - /* Before loading, ensure that the link won't be freed, even if - * REPLICAOF NO ONE is called in background event processing. */ - link->protected = 1; + serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Loading DB in memory"); + startLoading(server.repl_transfer_size, RDBFLAGS_REPLICATION, asyncLoading); int loadingFailed = 0; rdbLoadingCtx loadingCtx = {.dbarray = dbarray, .functions_lib_ctx = functions_lib_ctx}; if (rdbLoadRioWithLoadingCtxScopedRdb(&rdb, RDBFLAGS_REPLICATION, &rsi, &loadingCtx) != C_OK) { /* RDB loading failed. */ - serverLog(LL_WARNING, "Failed trying to load the %s synchronization DB " - "from socket, check server logs.", - replicationGetNameForLogs(link)); + serverLog(LL_WARNING, "Failed trying to load the PRIMARY synchronization DB " + "from socket, check server logs."); loadingFailed = 1; } else if (usemark) { /* Verify the end mark is correct. 
*/ @@ -2495,14 +2377,6 @@ void readSyncBulkPayload(connection *conn) { } } - /* After loading, check if replication was cancelled */ - link->protected = 0; - if (link->state == REPL_STATE_CANCELLED) { - /* Link was freed during RDB load */ - serverLog(LL_NOTICE, "%s <-> REPLICA sync: Link to primary closed during diskless RDB load", replicationGetNameForLogs(link)); - loadingFailed = 1; - } - if (loadingFailed) { stopLoading(0); rioFreeConn(&rdb, NULL); @@ -2514,10 +2388,10 @@ void readSyncBulkPayload(connection *conn) { disklessLoadDiscardTempDb(diskless_load_tempDb); disklessLoadDiscardFunctionsLibCtx(temp_functions_lib_ctx); - serverLog(LL_NOTICE, "%s <-> REPLICA sync: Discarding temporary DB in background", replicationGetNameForLogs(link)); + serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Discarding temporary DB in background"); } else { /* Remove the half-loaded data in case we started with an empty replica. */ - serverLog(LL_NOTICE, "%s <-> REPLICA sync: Discarding the half-loaded data", replicationGetNameForLogs(link)); + serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Discarding the half-loaded data"); emptyData(-1, empty_db_flags, replicationEmptyDbCallback); } @@ -2534,7 +2408,7 @@ void readSyncBulkPayload(connection *conn) { * primary structure and force resync of sub-replicas. */ replicationAttachToNewPrimary(); - serverLog(LL_NOTICE, "%s <-> REPLICA sync: Swapping active DB with loaded DB", replicationGetNameForLogs(link)); + serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Swapping active DB with loaded DB"); swapMainDbWithTempDb(diskless_load_tempDb); /* swap existing functions ctx with the temporary one */ @@ -2545,7 +2419,7 @@ void readSyncBulkPayload(connection *conn) { /* Delete the old db as it's useless now. */ disklessLoadDiscardTempDb(diskless_load_tempDb); - serverLog(LL_NOTICE, "%s <-> REPLICA sync: Discarding old DB in background", replicationGetNameForLogs(link)); + serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Discarding old DB in background"); } /* Inform about db change, as replication was diskless and didn't cause a save. */ @@ -2561,22 +2435,20 @@ void readSyncBulkPayload(connection *conn) { } else { /* Make sure the new file (also used for persistence) is fully synced * (not covered by earlier calls to rdb_fsync_range). */ - if (fsync(link->transfer_fd) == -1) { + if (fsync(server.repl_transfer_fd) == -1) { serverLog(LL_WARNING, "Failed trying to sync the temp DB to disk in " - "%s <-> REPLICA synchronization: %s", - replicationGetNameForLogs(link), + "PRIMARY <-> REPLICA synchronization: %s", strerror(errno)); goto error; } /* Rename rdb like renaming rewrite aof asynchronously. 
*/ int old_rdb_fd = open(server.rdb_filename, O_RDONLY | O_NONBLOCK); - if (rename(link->transfer_tmpfile, server.rdb_filename) == -1) { + if (rename(server.repl_transfer_tmpfile, server.rdb_filename) == -1) { serverLog(LL_WARNING, "Failed trying to rename the temp DB into %s in " - "%s <-> REPLICA synchronization: %s", - replicationGetNameForLogs(link), + "PRIMARY <-> REPLICA synchronization: %s", server.rdb_filename, strerror(errno)); if (old_rdb_fd != -1) close(old_rdb_fd); goto error; @@ -2588,8 +2460,7 @@ void readSyncBulkPayload(connection *conn) { if (fsyncFileDir(server.rdb_filename) == -1) { serverLog(LL_WARNING, "Failed trying to sync DB directory %s in " - "%s <-> REPLICA synchronization: %s", - replicationGetNameForLogs(link), + "PRIMARY <-> REPLICA synchronization: %s", server.rdb_filename, strerror(errno)); goto error; } @@ -2601,14 +2472,13 @@ void readSyncBulkPayload(connection *conn) { /* Empty the databases only after the RDB file is ok, that is, before the RDB file * is actually loaded, in case we encounter an error and drop the replication stream * and leave an empty database. */ - serverLog(LL_NOTICE, "%s <-> REPLICA sync: Flushing old data", replicationGetNameForLogs(link)); + serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Flushing old data"); emptyData(-1, empty_db_flags, replicationEmptyDbCallback); - serverLog(LL_NOTICE, "%s <-> REPLICA sync: Loading DB in memory", replicationGetNameForLogs(link)); + serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Loading DB in memory"); if (rdbLoad(server.rdb_filename, &rsi, RDBFLAGS_REPLICATION) != RDB_OK) { - serverLog(LL_WARNING, "Failed trying to load the %s synchronization " - "DB from disk, check server logs.", - replicationGetNameForLogs(link)); + serverLog(LL_WARNING, "Failed trying to load the PRIMARY synchronization " + "DB from disk, check server logs."); if (server.rdb_del_sync_files && allPersistenceDisabled()) { serverLog(LL_NOTICE, "Removing the RDB file obtained from " "the primary. This replica has persistence " @@ -2617,7 +2487,7 @@ void readSyncBulkPayload(connection *conn) { } /* If disk-based RDB loading fails, remove the half-loaded dataset. */ - serverLog(LL_NOTICE, "%s<-> REPLICA sync: Discarding the half-loaded data", replicationGetNameForLogs(link)); + serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Discarding the half-loaded data"); emptyData(-1, empty_db_flags, replicationEmptyDbCallback); /* Note that there's no point in restarting the AOF on sync failure, @@ -2633,17 +2503,61 @@ void readSyncBulkPayload(connection *conn) { bg_unlink(server.rdb_filename); } - zfree(link->transfer_tmpfile); - close(link->transfer_fd); - link->transfer_fd = -1; - link->transfer_tmpfile = NULL; + zfree(server.repl_transfer_tmpfile); + close(server.repl_transfer_fd); + server.repl_transfer_fd = -1; + server.repl_transfer_tmpfile = NULL; + } + + /* Final setup of the connected replica <- primary link */ + if (conn == server.repl_rdb_transfer_s) { + dualChannelSyncHandleRdbLoadCompletion(); + } else { + replicationCreatePrimaryClient(server.repl_transfer_s, rsi.repl_stream_db); + server.repl_state = REPL_STATE_CONNECTED; + server.repl_down_since = 0; + /* Send the initial ACK immediately to put this replica in online state. */ + replicationSendAck(server.primary); + } + + /* Fire the primary link modules event. 
*/ + moduleFireServerEvent(VALKEYMODULE_EVENT_PRIMARY_LINK_CHANGE, VALKEYMODULE_SUBEVENT_PRIMARY_LINK_UP, NULL); + if (server.repl_state == REPL_STATE_CONNECTED) { + /* After a full resynchronization we use the replication ID and + * offset of the primary. The secondary ID / offset are cleared since + * we are starting a new history. */ + memcpy(server.replid, server.primary->repl_data->replid, sizeof(server.replid)); + server.primary_repl_offset = server.primary->repl_data->reploff; + } + clearReplicationId2(); + + /* Let's create the replication backlog if needed. Replicas need to + * accumulate the backlog regardless of the fact they have sub-replicas + * or not, in order to behave correctly if they are promoted to + * primaries after a failover. */ + if (server.repl_backlog == NULL) createReplicationBacklog(); + serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Finished with success"); + + if (server.supervised_mode == SUPERVISED_SYSTEMD) { + serverCommunicateSystemd("STATUS=PRIMARY <-> REPLICA sync: Finished with success. Ready to accept connections " + "in read-write mode.\n"); } - replicationFinishSyncPayload(conn, link, rsi.repl_stream_db); + /* Restart the AOF subsystem now that we finished the sync. This + * will trigger an AOF rewrite, and when done will start appending + * to the new file. */ + if (server.aof_enabled) restartAOFAfterSYNC(); + + /* In case of dual channel replication sync we want to close the RDB connection + * once the connection is established */ + if (conn == server.repl_rdb_transfer_s) { + connClose(conn); + server.repl_rdb_transfer_s = NULL; + } return; error: - if (link) cancelReplicationHandshake(link, 1); + cancelReplicationHandshake(1); return; } @@ -2654,8 +2568,7 @@ char *receiveSynchronousResponse(connection *conn) { serverLog(LL_WARNING, "Failed to read response from the server: %s", connGetLastError(conn)); return NULL; } - replicationLink *link = (replicationLink *)connGetPrivateData(conn); - link->transfer_lastio = server.unixtime; + server.repl_transfer_lastio = server.unixtime; return sdsnew(buf); } @@ -2752,34 +2665,35 @@ sds getReplicaPortString(void) { /* Replication: Replica side. * Free replica's local replication buffer */ -void freePendingReplDataBuf(replicationLink *link) { - listRelease(link->pending_repl_data.blocks); - link->pending_repl_data.blocks = NULL; - link->pending_repl_data.len = 0; +void freePendingReplDataBuf(void) { + listRelease(server.pending_repl_data.blocks); + server.pending_repl_data.blocks = NULL; + server.pending_repl_data.len = 0; } /* Replication: Replica side. * Upon dual-channel sync failure, close rdb-connection, reset repl-state, reset * provisional primary struct, and free local replication buffer. 
*/ -void replicationAbortDualChannelSyncTransfer(replicationLink *link) { - serverAssert(link->rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE); +void replicationAbortDualChannelSyncTransfer(void) { + serverAssert(server.repl_rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE); dualChannelServerLog(LL_NOTICE, "Aborting dual channel sync"); - if (link->rdb_transfer_s) { - connClose(link->rdb_transfer_s); - link->rdb_transfer_s = NULL; - } - zfree(link->transfer_tmpfile); - link->transfer_tmpfile = NULL; - if (link->transfer_fd != -1) { - close(link->transfer_fd); - link->transfer_fd = -1; - } - link->rdb_channel_state = REPL_DUAL_CHANNEL_STATE_NONE; - link->provisional_source_state.read_reploff = 0; - link->provisional_source_state.reploff = 0; - link->provisional_source_state.dbid = -1; - link->rdb_client_id = -1; - freePendingReplDataBuf(link); + if (server.repl_rdb_transfer_s) { + connClose(server.repl_rdb_transfer_s); + server.repl_rdb_transfer_s = NULL; + } + zfree(server.repl_transfer_tmpfile); + server.repl_transfer_tmpfile = NULL; + if (server.repl_transfer_fd != -1) { + close(server.repl_transfer_fd); + server.repl_transfer_fd = -1; + } + server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_STATE_NONE; + server.repl_provisional_primary.read_reploff = 0; + server.repl_provisional_primary.reploff = 0; + server.repl_provisional_primary.conn = NULL; + server.repl_provisional_primary.dbid = -1; + server.rdb_client_id = -1; + freePendingReplDataBuf(); return; } @@ -2801,7 +2715,7 @@ int sendCurrentOffsetToReplica(client *replica) { return C_OK; } -static int dualChannelReplHandleHandshake(replicationLink *link, sds *err) { +static int dualChannelReplHandleHandshake(connection *conn, sds *err) { dualChannelServerLog(LL_DEBUG, "Received first reply from primary using rdb connection."); /* AUTH with the primary if required. 
*/ if (server.primary_auth) { @@ -2816,7 +2730,7 @@ static int dualChannelReplHandleHandshake(replicationLink *link, sds *err) { args[argc] = server.primary_auth; lens[argc] = sdslen(server.primary_auth); argc++; - *err = sendCommandArgv(link->rdb_transfer_s, argc, args, lens); + *err = sendCommandArgv(conn, argc, args, lens); if (*err) { dualChannelServerLog(LL_WARNING, "Sending command to primary in dual channel replication handshake: %s", *err); return C_ERR; @@ -2824,7 +2738,7 @@ static int dualChannelReplHandleHandshake(replicationLink *link, sds *err) { } /* Send replica listening port to primary for clarification */ sds portstr = getReplicaPortString(); - *err = sendCommand(link->rdb_transfer_s, "REPLCONF", "capa", "eof", "rdb-only", "1", "rdb-channel", "1", "listening-port", portstr, + *err = sendCommand(conn, "REPLCONF", "capa", "eof", "rdb-only", "1", "rdb-channel", "1", "listening-port", portstr, NULL); sdsfree(portstr); if (*err) { @@ -2832,30 +2746,17 @@ static int dualChannelReplHandleHandshake(replicationLink *link, sds *err) { return C_ERR; } - /* Send slot bitmap, if it is needed */ - if (!isSlotBitmapAllSlots(link->slot_bitmap)) { - char *args[] = {"REPLCONF", "slot-bitmap", NULL}; - size_t lens[] = {8, 11, 0}; - args[2] = (char *) link->slot_bitmap; - lens[2] = sizeof(slotBitmap); - *err = sendCommandArgv(link->rdb_transfer_s, 3, args, lens); - if (*err) { - dualChannelServerLog(LL_WARNING, "Sending REPLCONF slot-bitmap command to primary in dual channel replication handshake: %s", *err); - return C_ERR; - } - } - - if (connSetReadHandler(link->rdb_transfer_s, dualChannelFullSyncWithReplicationSource) == C_ERR) { + if (connSetReadHandler(conn, dualChannelFullSyncWithPrimary) == C_ERR) { char conninfo[CONN_INFO_LEN]; dualChannelServerLog(LL_WARNING, "Can't create readable event for SYNC: %s (%s)", strerror(errno), - connGetInfo(link->transfer_s, conninfo, sizeof(conninfo))); + connGetInfo(conn, conninfo, sizeof(conninfo))); return C_ERR; } return C_OK; } -static int dualChannelReplHandleAuthReply(replicationLink *link, sds *err) { - *err = receiveSynchronousResponse(link->rdb_transfer_s); +static int dualChannelReplHandleAuthReply(connection *conn, sds *err) { + *err = receiveSynchronousResponse(conn); if (*err == NULL) { dualChannelServerLog(LL_WARNING, "Primary did not respond to auth command during SYNC handshake"); return C_ERR; @@ -2864,11 +2765,12 @@ static int dualChannelReplHandleAuthReply(replicationLink *link, sds *err) { dualChannelServerLog(LL_WARNING, "Unable to AUTH to Primary: %s", *err); return C_ERR; } + server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_REPLCONF_REPLY; return C_OK; } -static int dualChannelReplHandleReplconfReply(replicationLink *link, sds *err) { - *err = receiveSynchronousResponse(link->rdb_transfer_s); +static int dualChannelReplHandleReplconfReply(connection *conn, sds *err) { + *err = receiveSynchronousResponse(conn); if (*err == NULL) { dualChannelServerLog(LL_WARNING, "Primary did not respond to replconf command during SYNC handshake"); return C_ERR; @@ -2879,36 +2781,16 @@ static int dualChannelReplHandleReplconfReply(replicationLink *link, sds *err) { *err); return C_ERR; } - - /* Recieve slot bitmap response as well. 
*/ - if (!isSlotBitmapAllSlots(link->slot_bitmap)) { - *err = receiveSynchronousResponse(link->rdb_transfer_s); - if (*err == NULL) { - dualChannelServerLog(LL_WARNING, "Primary did not respond to replconf slot-bitmap command during SYNC handshake"); - return C_ERR; - } - - if (*err[0] == '-') { - dualChannelServerLog(LL_NOTICE, "Server does not support sync with slot-bitmap, dual channel sync approach cannot be used: %s", - *err); - return C_ERR; - } - } - - if (connSyncWrite(link->rdb_transfer_s, "SYNC\r\n", 6, server.repl_syncio_timeout * 1000) == -1) { - dualChannelServerLog(LL_WARNING, "I/O error writing to Primary: %s", connGetLastError(link->rdb_transfer_s)); + if (connSyncWrite(conn, "SYNC\r\n", 6, server.repl_syncio_timeout * 1000) == -1) { + dualChannelServerLog(LL_WARNING, "I/O error writing to Primary: %s", connGetLastError(conn)); return C_ERR; } return C_OK; } -int replicationUseAOFFormatSnapshot(replicationLink *link) { - return !isSlotBitmapAllSlots(link->slot_bitmap); -} - -static int dualChannelReplHandleEndOffsetResponse(replicationLink *link, sds *err) { +static int dualChannelReplHandleEndOffsetResponse(connection *conn, sds *err) { uint64_t rdb_client_id; - *err = receiveSynchronousResponse(link->rdb_transfer_s); + *err = receiveSynchronousResponse(conn); if (*err == NULL) { return C_ERR; } @@ -2926,34 +2808,30 @@ static int dualChannelReplHandleEndOffsetResponse(replicationLink *link, sds *er dualChannelServerLog(LL_WARNING, "Received unexpected $ENDOFF response: %s", *err); return C_ERR; } - link->rdb_client_id = rdb_client_id; - link->initial_offset = reploffset; + server.rdb_client_id = rdb_client_id; + server.primary_initial_offset = reploffset; /* Initiate repl_provisional_primary to act as this replica temp primary until RDB is loaded */ - memcpy(link->provisional_source_state.replid, primary_replid, CONFIG_RUN_ID_SIZE + 1); - link->provisional_source_state.reploff = reploffset; - link->provisional_source_state.read_reploff = reploffset; - link->provisional_source_state.dbid = dbid; + server.repl_provisional_primary.conn = server.repl_transfer_s; + memcpy(server.repl_provisional_primary.replid, primary_replid, sizeof(server.repl_provisional_primary.replid)); + server.repl_provisional_primary.reploff = reploffset; + server.repl_provisional_primary.read_reploff = reploffset; + server.repl_provisional_primary.dbid = dbid; /* Now that we have the snapshot end-offset, we can ask for psync from that offset. 
Prepare the * main connection accordingly.*/ - link->transfer_s->state = CONN_STATE_CONNECTED; - link->state = REPL_STATE_SEND_HANDSHAKE; - serverAssert(connSetReadHandler(link->transfer_s, dualChannelSetupMainConnForPsync) != C_ERR); - dualChannelSetupMainConnForPsync(link->transfer_s); + server.repl_transfer_s->state = CONN_STATE_CONNECTED; + server.repl_state = REPL_STATE_SEND_HANDSHAKE; + serverAssert(connSetReadHandler(server.repl_transfer_s, dualChannelSetupMainConnForPsync) != C_ERR); + dualChannelSetupMainConnForPsync(server.repl_transfer_s); - /* As the next block we will receive using this connection is the snapshot, we need to prepare + /* As the next block we will receive using this connection is the rdb, we need to prepare * the connection accordingly */ - if (replicationUseAOFFormatSnapshot(link)) { - link->client = createReplicationLinkClientWithHandler(link, link->rdb_transfer_s, -1, readQueryFromClient); - link->rdb_transfer_s = NULL; - } else { - serverAssert(connSetReadHandler(link->rdb_transfer_s, readSyncBulkPayload) != C_ERR); - } - link->transfer_size = -1; - link->transfer_read = 0; - link->transfer_last_fsync_off = 0; - link->transfer_lastio = server.unixtime; + serverAssert(connSetReadHandler(server.repl_rdb_transfer_s, readSyncBulkPayload) != C_ERR); + server.repl_transfer_size = -1; + server.repl_transfer_read = 0; + server.repl_transfer_last_fsync_off = 0; + server.repl_transfer_lastio = server.unixtime; return C_OK; } @@ -2961,15 +2839,15 @@ static int dualChannelReplHandleEndOffsetResponse(replicationLink *link, sds *er /* Replication: Replica side. * This connection handler is used to initialize the RDB connection (dual-channel-replication). * Once a replica with dual-channel-replication enabled, denied from PSYNC with its primary, - * dualChannelFullSyncWithReplicationSource begins its role. The connection handler prepares server.repl_rdb_transfer_s + * dualChannelFullSyncWithPrimary begins its role. The connection handler prepares server.repl_rdb_transfer_s * for a rdb stream, and server.repl_transfer_s for incremental replication data stream. */ -static void dualChannelFullSyncWithReplicationSource(connection *conn) { +static void dualChannelFullSyncWithPrimary(connection *conn) { char *err = NULL; int ret = 0; - replicationLink *link = (replicationLink *)connGetPrivateData(conn); + serverAssert(conn == server.repl_rdb_transfer_s); /* If this event fired after the user turned the instance into a primary * with REPLICAOF NO ONE we must just return ASAP. 
*/ - if (link->state == REPL_STATE_NONE) { + if (server.repl_state == REPL_STATE_NONE) { goto error; } /* Check for errors in the socket: after a non blocking connect() we @@ -2979,30 +2857,30 @@ static void dualChannelFullSyncWithReplicationSource(connection *conn) { connGetLastError(conn)); goto error; } - switch (link->rdb_channel_state) { + switch (server.repl_rdb_channel_state) { case REPL_DUAL_CHANNEL_SEND_HANDSHAKE: - ret = dualChannelReplHandleHandshake(link, &err); - if (ret == C_OK) link->rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_AUTH_REPLY; + ret = dualChannelReplHandleHandshake(conn, &err); + if (ret == C_OK) server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_AUTH_REPLY; break; case REPL_DUAL_CHANNEL_RECEIVE_AUTH_REPLY: if (server.primary_auth) { - ret = dualChannelReplHandleAuthReply(link, &err); - if (ret == C_OK) link->rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_REPLCONF_REPLY; + ret = dualChannelReplHandleAuthReply(conn, &err); + if (ret == C_OK) server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_REPLCONF_REPLY; /* Wait for next bulk before trying to read replconf reply. */ break; } - link->rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_REPLCONF_REPLY; + server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_REPLCONF_REPLY; /* fall through */ case REPL_DUAL_CHANNEL_RECEIVE_REPLCONF_REPLY: - ret = dualChannelReplHandleReplconfReply(link, &err); - if (ret == C_OK) link->rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_ENDOFF; + ret = dualChannelReplHandleReplconfReply(conn, &err); + if (ret == C_OK) server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_ENDOFF; break; case REPL_DUAL_CHANNEL_RECEIVE_ENDOFF: - ret = dualChannelReplHandleEndOffsetResponse(link, &err); - if (ret == C_OK) link->rdb_channel_state = REPL_DUAL_CHANNEL_RDB_LOAD; + ret = dualChannelReplHandleEndOffsetResponse(conn, &err); + if (ret == C_OK) server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_RDB_LOAD; break; default: - serverPanic("Unexpected dual replication state: %d", link->rdb_channel_state); + serverPanic("Unexpected dual replication state: %d", server.repl_rdb_channel_state); } if (ret == C_ERR) goto error; sdsfree(err); @@ -3013,33 +2891,29 @@ static void dualChannelFullSyncWithReplicationSource(connection *conn) { serverLog(LL_WARNING, "Dual channel sync failed with error %s", err); sdsfree(err); } - if (link->transfer_s) { - connClose(link->transfer_s); - link->transfer_s = NULL; + if (server.repl_transfer_s) { + connClose(server.repl_transfer_s); + server.repl_transfer_s = NULL; } - if (link->rdb_transfer_s) { - connClose(link->rdb_transfer_s); - link->rdb_transfer_s = NULL; - } - if (link->transfer_fd != -1) close(link->transfer_fd); - link->transfer_fd = -1; - link->state = REPL_STATE_CONNECT; - replicationAbortDualChannelSyncTransfer(link); - if (link->client) { - freeClient(link->client); - link->client = NULL; + if (server.repl_rdb_transfer_s) { + connClose(server.repl_rdb_transfer_s); + server.repl_rdb_transfer_s = NULL; } + if (server.repl_transfer_fd != -1) close(server.repl_transfer_fd); + server.repl_transfer_fd = -1; + server.repl_state = REPL_STATE_CONNECT; + replicationAbortDualChannelSyncTransfer(); } /* Replication: Replica side. 
* Initialize server.pending_repl_data infrastructure, we will allocate the buffer * itself once we need it */ -void replDataBufInit(replicationLink *link) { - serverAssert(link->pending_repl_data.blocks == NULL); - link->pending_repl_data.len = 0; - link->pending_repl_data.peak = 0; - link->pending_repl_data.blocks = listCreate(); - link->pending_repl_data.blocks->free = zfree; +void replDataBufInit(void) { + serverAssert(server.pending_repl_data.blocks == NULL); + server.pending_repl_data.len = 0; + server.pending_repl_data.peak = 0; + server.pending_repl_data.blocks = listCreate(); + server.pending_repl_data.blocks->free = zfree; } /* Replication: Replica side. @@ -3050,7 +2924,7 @@ void replStreamProgressCallback(size_t offset, int readlen, time_t *last_progres ((offset + readlen) / server.loading_process_events_interval_bytes > offset / server.loading_process_events_interval_bytes) && (now - *last_progress_callback > server.loading_process_events_interval_ms)) { - replicationSendNewlineToConnectedLinks(); + replicationSendNewlineToPrimary(); processEventsWhileBlocked(); *last_progress_callback = now; } @@ -3065,16 +2939,14 @@ typedef struct replDataBufBlock { /* Replication: Replica side. * Reads replication data from primary into specified repl buffer block */ -int readIntoReplDataBlock(replicationLink *link, replDataBufBlock *data_block, size_t read) { - int nread = connRead(link->transfer_s, data_block->buf + data_block->used, read); +int readIntoReplDataBlock(connection *conn, replDataBufBlock *data_block, size_t read) { + int nread = connRead(conn, data_block->buf + data_block->used, read); if (nread <= 0) { - if (nread == 0 || connGetState(link->transfer_s) != CONN_STATE_CONNECTED) { + if (nread == 0 || connGetState(conn) != CONN_STATE_CONNECTED) { dualChannelServerLog(LL_WARNING, "Provisional primary closed connection"); - if (link->rdb_channel_state == REPL_DUAL_CHANNEL_RDB_LOAD) { - /* Signal ongoing RDB load to terminate gracefully */ - if (server.loading_rio) rioCloseASAP(server.loading_rio); - } - cancelReplicationHandshake(link, 1); + /* Signal ongoing RDB load to terminate gracefully */ + if (server.loading_rio) rioCloseASAP(server.loading_rio); + cancelReplicationHandshake(1); } return C_ERR; } @@ -3089,10 +2961,8 @@ void bufferReplData(connection *conn) { size_t readlen = PROTO_IOBUF_LEN; int remaining_bytes = 0; - replicationLink *link = (replicationLink *)connGetPrivateData(conn); - while (readlen > 0) { - listNode *ln = listLast(link->pending_repl_data.blocks); + listNode *ln = listLast(server.pending_repl_data.blocks); replDataBufBlock *tail = ln ? listNodeValue(ln) : NULL; /* Append to tail string when possible */ @@ -3100,11 +2970,11 @@ void bufferReplData(connection *conn) { size_t avail = tail->size - tail->used; remaining_bytes = min(readlen, avail); readlen -= remaining_bytes; - remaining_bytes = readIntoReplDataBlock(link, tail, remaining_bytes); + remaining_bytes = readIntoReplDataBlock(conn, tail, remaining_bytes); } if (readlen && remaining_bytes == 0) { if (server.client_obuf_limits[CLIENT_TYPE_REPLICA].hard_limit_bytes && - link->pending_repl_data.len > server.client_obuf_limits[CLIENT_TYPE_REPLICA].hard_limit_bytes) { + server.pending_repl_data.len > server.client_obuf_limits[CLIENT_TYPE_REPLICA].hard_limit_bytes) { dualChannelServerLog(LL_NOTICE, "Replication buffer limit reached, stopping buffering."); /* Stop accumulating primary commands. 
*/ connSetReadHandler(conn, NULL); @@ -3119,15 +2989,15 @@ void bufferReplData(connection *conn) { tail = zmalloc_usable(size + sizeof(replDataBufBlock), &usable_size); tail->size = usable_size - sizeof(replDataBufBlock); tail->used = 0; - listAddNodeTail(link->pending_repl_data.blocks, tail); - link->pending_repl_data.len += tail->size; + listAddNodeTail(server.pending_repl_data.blocks, tail); + server.pending_repl_data.len += tail->size; /* Update buffer's peak */ - if (link->pending_repl_data.peak < link->pending_repl_data.len) - link->pending_repl_data.peak = link->pending_repl_data.len; + if (server.pending_repl_data.peak < server.pending_repl_data.len) + server.pending_repl_data.peak = server.pending_repl_data.len; remaining_bytes = min(readlen, tail->size); readlen -= remaining_bytes; - remaining_bytes = readIntoReplDataBlock(link, tail, remaining_bytes); + remaining_bytes = readIntoReplDataBlock(conn, tail, remaining_bytes); } if (remaining_bytes > 0) { /* Stop reading in case we read less than we anticipated */ @@ -3141,34 +3011,29 @@ void bufferReplData(connection *conn) { /* Replication: Replica side. * Streams accumulated replication data into the database while freeing read nodes */ -int streamReplDataBufToDb(replicationLink *link) { - serverAssert(link->client->flag.replication_source); +int streamReplDataBufToDb(client *c) { + serverAssert(c->flag.primary); blockingOperationStarts(); size_t used, offset = 0; listNode *cur = NULL; time_t last_progress_callback = mstime(); - - /* Before loading, protect our link from being destructed. */ - link->protected = 1; - - while (link->pending_repl_data.blocks && (cur = listFirst(link->pending_repl_data.blocks))) { + while (server.pending_repl_data.blocks && (cur = listFirst(server.pending_repl_data.blocks))) { /* Read and process repl data block */ replDataBufBlock *o = listNodeValue(cur); used = o->used; - link->client->querybuf = sdscatlen(link->client->querybuf, o->buf, used); - link->client->repl_data->read_reploff += used; - processInputBuffer(link->client); - link->pending_repl_data.len -= used; + c->querybuf = sdscatlen(c->querybuf, o->buf, used); + c->repl_data->read_reploff += used; + processInputBuffer(c); + server.pending_repl_data.len -= used; offset += used; - listDelNode(link->pending_repl_data.blocks, cur); + listDelNode(server.pending_repl_data.blocks, cur); replStreamProgressCallback(offset, used, &last_progress_callback); } - link->protected = 0; blockingOperationEnds(); - - if (link->state == REPL_STATE_CANCELLED) { + if (!server.pending_repl_data.blocks) { /* If we encounter a `replicaof` command during the replStreamProgressCallback, - * we should return an error and abort the current sync session. */ + * pending_repl_data.blocks will be NULL, and we should return an error and + * abort the current sync session. */ return C_ERR; } return C_OK; @@ -3177,64 +3042,65 @@ int streamReplDataBufToDb(replicationLink *link) { /* Replication: Replica side. * After done loading the snapshot using the rdb-channel prepare this replica for steady state by * initializing the primary client, amd stream local incremental buffer into memory. 
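For orientation, the two hunks above implement an accumulate-then-drain pattern on the replica: while the RDB is still loading, incremental replication traffic is appended to a list of fixed-capacity blocks, and once loading finishes the blocks are fed to the command stream in order and freed. The sketch below is a minimal, self-contained model of that pattern; the struct, sizes, and function names are simplified stand-ins for illustration, not the server's actual types.

/* Illustrative stand-in for the pending replication buffer: append bytes to
 * the tail block, allocating a new block when the tail is full, then drain
 * the blocks in arrival order. Not the server's real list or allocator. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct block {
    struct block *next;
    size_t size, used;
    char buf[];                       /* flexible array member, like replDataBufBlock */
} block;

typedef struct { block *head, *tail; size_t len, peak; } pendingBuf;

static void bufferAppend(pendingBuf *p, const char *data, size_t n, size_t blocksize) {
    while (n > 0) {
        block *t = p->tail;
        if (t == NULL || t->used == t->size) {
            size_t cap = n > blocksize ? n : blocksize;
            t = malloc(sizeof(*t) + cap);
            t->next = NULL; t->size = cap; t->used = 0;
            if (p->tail) p->tail->next = t; else p->head = t;
            p->tail = t;
            p->len += cap;                        /* track reserved capacity, as the hunk does */
            if (p->len > p->peak) p->peak = p->len;
        }
        size_t take = n < t->size - t->used ? n : t->size - t->used;
        memcpy(t->buf + t->used, data, take);
        t->used += take; data += take; n -= take;
    }
}

static void bufferDrain(pendingBuf *p) {
    while (p->head) {                             /* stand-in for feeding processInputBuffer() */
        block *b = p->head;
        fwrite(b->buf, 1, b->used, stdout);
        p->head = b->next; p->len -= b->size; free(b);
    }
    p->tail = NULL;
}

int main(void) {
    pendingBuf p = {0};
    bufferAppend(&p, "SET k v\r\n", 9, 16);
    bufferAppend(&p, "DEL k\r\n", 7, 16);
    bufferDrain(&p);
    printf("peak bytes reserved: %zu\n", p.peak);
    return 0;
}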
*/ -int dualChannelSyncSuccess(replicationLink *link) { - link->initial_offset = link->provisional_source_state.reploff; - replicationResurrectProvisionalSource(link); +void dualChannelSyncSuccess(void) { + server.primary_initial_offset = server.repl_provisional_primary.reploff; + replicationResurrectProvisionalPrimary(); /* Wait for the accumulated buffer to be processed before reading any more replication updates */ - if (link->pending_repl_data.blocks && streamReplDataBufToDb(link) == C_ERR) { + if (server.pending_repl_data.blocks && streamReplDataBufToDb(server.primary) == C_ERR) { /* Sync session aborted during repl data streaming. */ dualChannelServerLog(LL_WARNING, "Failed to stream local replication buffer into memory"); /* Verify sync is still in progress */ - if (link->rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) { - replicationAbortDualChannelSyncTransfer(link); + if (server.repl_rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) { + replicationAbortDualChannelSyncTransfer(); + replicationUnsetPrimary(); } - return C_ERR; + return; } - freePendingReplDataBuf(link); + freePendingReplDataBuf(); dualChannelServerLog(LL_NOTICE, "Successfully streamed replication data into memory"); /* We can resume reading from the primary connection once the local replication buffer has been loaded. */ - replicationSteadyStateInit(link); - replicationSendAck(link); /* Send ACK to notify primary that replica is synced */ - link->rdb_client_id = -1; - link->rdb_channel_state = REPL_DUAL_CHANNEL_STATE_NONE; - return C_OK; + replicationSteadyStateInit(); + replicationSendAck(server.primary); /* Send ACK to notify primary that replica is synced */ + server.rdb_client_id = -1; + server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_STATE_NONE; } /* Replication: Replica side. * Main channel successfully established psync with primary. Check whether the rdb channel * has completed its part and act accordingly. */ -int dualChannelSyncHandlePsync(replicationLink *link) { - serverAssert(link->state == REPL_STATE_RECEIVE_PSYNC_REPLY); - if (link->rdb_channel_state < REPL_DUAL_CHANNEL_RDB_LOADED) { +int dualChannelSyncHandlePsync(void) { + serverAssert(server.repl_state == REPL_STATE_RECEIVE_PSYNC_REPLY); + if (server.repl_rdb_channel_state < REPL_DUAL_CHANNEL_RDB_LOADED) { /* RDB is still loading */ - if (connSetReadHandler(link->transfer_s, bufferReplData) == C_ERR) { + if (connSetReadHandler(server.repl_provisional_primary.conn, bufferReplData) == C_ERR) { dualChannelServerLog(LL_WARNING, "Error while setting readable handler: %s", strerror(errno)); - cancelReplicationHandshake(link, 1); + cancelReplicationHandshake(1); return C_ERR; } - replDataBufInit(link); + replDataBufInit(); return C_OK; } - serverAssert(link->rdb_channel_state == REPL_DUAL_CHANNEL_RDB_LOADED); + serverAssert(server.repl_rdb_channel_state == REPL_DUAL_CHANNEL_RDB_LOADED); /* RDB is loaded */ dualChannelServerLog(LL_DEBUG, "Psync established after rdb load"); - dualChannelSyncSuccess(link); + dualChannelSyncSuccess(); return C_OK; } /* Replication: Replica side. * RDB channel done loading the RDB. Check whether the main channel has completed its part * and act accordingly. 
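Taken together, dualChannelSyncHandlePsync above and the RDB-load completion handler that follows form a small rendezvous: whichever channel finishes last is the one that triggers dualChannelSyncSuccess. A toy model of that coordination is below; the flag and function names are invented for illustration and are not part of this patch.

/* Toy rendezvous between the main (PSYNC) channel and the RDB channel:
 * whichever side completes second performs the final step. Names invented. */
#include <stdio.h>

typedef struct { int psync_established; int rdb_loaded; } dualSync;

static void maybeComplete(dualSync *s) {
    if (s->psync_established && s->rdb_loaded)
        printf("both channels done: stream buffered commands, then ACK the primary\n");
}

static void onPsyncEstablished(dualSync *s) { s->psync_established = 1; maybeComplete(s); }
static void onRdbLoaded(dualSync *s)        { s->rdb_loaded = 1;        maybeComplete(s); }

int main(void) {
    dualSync s = {0, 0};
    onRdbLoaded(&s);         /* RDB channel finished first... */
    onPsyncEstablished(&s);  /* ...so the main channel's PSYNC completes the sync */
    return 0;
}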
*/ -int dualChannelSyncHandleRdbLoadCompletion(replicationLink *link) { - serverAssert(link->rdb_channel_state == REPL_DUAL_CHANNEL_RDB_LOAD); - if (link->state < REPL_STATE_TRANSFER) { +void dualChannelSyncHandleRdbLoadCompletion(void) { + serverAssert(server.repl_rdb_channel_state == REPL_DUAL_CHANNEL_RDB_LOAD); + if (server.repl_state < REPL_STATE_TRANSFER) { /* Main psync channel hasn't been established yet */ - link->rdb_channel_state = REPL_DUAL_CHANNEL_RDB_LOADED; - return C_OK; + server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_RDB_LOADED; + return; } - serverAssert(link->state == REPL_STATE_TRANSFER); - connSetReadHandler(link->transfer_s, NULL); - return dualChannelSyncSuccess(link); + serverAssert(server.repl_state == REPL_STATE_TRANSFER); + connSetReadHandler(server.repl_transfer_s, NULL); + dualChannelSyncSuccess(); + return; } /* Try a partial resynchronization with the primary if we are about to reconnect. @@ -3292,8 +3158,8 @@ int dualChannelSyncHandleRdbLoadCompletion(replicationLink *link) { #define PSYNC_NOT_SUPPORTED 4 #define PSYNC_TRY_LATER 5 #define PSYNC_FULLRESYNC_DUAL_CHANNEL 6 -int replicaTryPartialResynchronization(replicationLink *link, int read_reply) { - char *psync_replid = NULL; +int replicaTryPartialResynchronization(connection *conn, int read_reply) { + char *psync_replid; char psync_offset[32]; sds reply; @@ -3304,25 +3170,21 @@ int replicaTryPartialResynchronization(replicationLink *link, int read_reply) { * a FULL resync using the PSYNC command we'll set the offset at the * right value, so that this information will be propagated to the * client structure representing the primary into server.primary. */ - link->initial_offset = -1; + server.primary_initial_offset = -1; - if (link->rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) { + if (server.repl_rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) { /* While in dual channel replication, we should use our prepared repl id and offset. 
*/ - psync_replid = link->provisional_source_state.replid; - snprintf(psync_offset, sizeof(psync_offset), "%lld", link->provisional_source_state.reploff + 1); + psync_replid = server.repl_provisional_primary.replid; + snprintf(psync_offset, sizeof(psync_offset), "%lld", server.repl_provisional_primary.reploff + 1); dualChannelServerLog(LL_NOTICE, "Trying a partial resynchronization using main channel (request %s:%s).", psync_replid, psync_offset); - } else if (link != server.primary) { - serverLog(LL_NOTICE, "Partial resynchronization not attempted (not primary replication)"); } else if (server.cached_primary) { psync_replid = server.cached_primary->repl_data->replid; snprintf(psync_offset, sizeof(psync_offset), "%lld", server.cached_primary->repl_data->reploff + 1); serverLog(LL_NOTICE, "Trying a partial resynchronization (request %s:%s).", psync_replid, psync_offset); } else { serverLog(LL_NOTICE, "Partial resynchronization not possible (no cached primary)"); - } - if (!psync_replid) { psync_replid = "?"; memcpy(psync_offset, "-1", 3); } @@ -3330,26 +3192,26 @@ int replicaTryPartialResynchronization(replicationLink *link, int read_reply) { /* Issue the PSYNC command, if this is a primary with a failover in * progress then send the failover argument to the replica to cause it * to become a primary */ - if (server.failover_state == FAILOVER_IN_PROGRESS && link == server.primary) { - reply = sendCommand(link->transfer_s, "PSYNC", psync_replid, psync_offset, "FAILOVER", NULL); + if (server.failover_state == FAILOVER_IN_PROGRESS) { + reply = sendCommand(conn, "PSYNC", psync_replid, psync_offset, "FAILOVER", NULL); } else { - reply = sendCommand(link->transfer_s, "PSYNC", psync_replid, psync_offset, NULL); + reply = sendCommand(conn, "PSYNC", psync_replid, psync_offset, NULL); } if (reply != NULL) { - serverLog(LL_WARNING, "Unable to send PSYNC to source: %s", reply); + serverLog(LL_WARNING, "Unable to send PSYNC to primary: %s", reply); sdsfree(reply); - connSetReadHandler(link->transfer_s, NULL); + connSetReadHandler(conn, NULL); return PSYNC_WRITE_ERROR; } return PSYNC_WAIT_REPLY; } /* Reading half */ - reply = receiveSynchronousResponse(link->transfer_s); + reply = receiveSynchronousResponse(conn); /* Primary did not reply to PSYNC */ if (reply == NULL) { - connSetReadHandler(link->transfer_s, NULL); + connSetReadHandler(conn, NULL); serverLog(LL_WARNING, "Primary did not reply to PSYNC, will try later"); return PSYNC_TRY_LATER; } @@ -3361,7 +3223,7 @@ int replicaTryPartialResynchronization(replicationLink *link, int read_reply) { return PSYNC_WAIT_REPLY; } - connSetReadHandler(link->transfer_s, NULL); + connSetReadHandler(conn, NULL); if (!strncmp(reply, "+FULLRESYNC", 11)) { char *replid = NULL, *offset = NULL; @@ -3380,31 +3242,24 @@ int replicaTryPartialResynchronization(replicationLink *link, int read_reply) { * reply means that the primary supports PSYNC, but the reply * format seems wrong. To stay safe we blank the primary * replid to make sure next PSYNCs will fail. 
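The branch above expects a reply of the form "+FULLRESYNC <replid> <offset>" and deliberately blanks the stored replid when the format looks wrong. A standalone sketch of that parse follows; the 40-character run-id length mirrors CONFIG_RUN_ID_SIZE but is stated here as an assumption, and the helper name is made up.

/* Illustrative parser for a "+FULLRESYNC <replid> <offset>" reply.
 * RUN_ID_SIZE mirrors CONFIG_RUN_ID_SIZE (assumed 40 here). */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define RUN_ID_SIZE 40

static int parseFullresync(const char *reply, char *replid_out, long long *offset_out) {
    if (strncmp(reply, "+FULLRESYNC", 11) != 0) return -1;
    const char *id = strchr(reply, ' ');
    const char *off = id ? strchr(id + 1, ' ') : NULL;
    /* Malformed reply: refuse it rather than trust a partial run id. */
    if (!id || !off || (size_t)(off - (id + 1)) != RUN_ID_SIZE) return -1;
    memcpy(replid_out, id + 1, RUN_ID_SIZE);
    replid_out[RUN_ID_SIZE] = '\0';
    *offset_out = strtoll(off + 1, NULL, 10);
    return 0;
}

int main(void) {
    char id[RUN_ID_SIZE + 1];
    long long off;
    if (parseFullresync("+FULLRESYNC 0123456789012345678901234567890123456789 12345", id, &off) == 0)
        printf("full resync from %s at offset %lld\n", id, off);
    return 0;
}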
*/ - memset(link->replid, 0, CONFIG_RUN_ID_SIZE + 1); + memset(server.primary_replid, 0, CONFIG_RUN_ID_SIZE + 1); } else { - memcpy(link->replid, replid, offset - replid - 1); - link->replid[CONFIG_RUN_ID_SIZE] = '\0'; - link->initial_offset = strtoll(offset, NULL, 10); - serverLog(LL_NOTICE, "Full resync from primary: %s:%lld", link->replid, - link->initial_offset); + memcpy(server.primary_replid, replid, offset - replid - 1); + server.primary_replid[CONFIG_RUN_ID_SIZE] = '\0'; + server.primary_initial_offset = strtoll(offset, NULL, 10); + serverLog(LL_NOTICE, "Full resync from primary: %s:%lld", server.primary_replid, + server.primary_initial_offset); } sdsfree(reply); return PSYNC_FULLRESYNC; } if (!strncmp(reply, "+CONTINUE", 9)) { - if (link->rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) { - /* During dual channel sync session, primary struct is already initialized. */ + if (server.repl_rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) { + /* During dual channel sync session, primary struct is already initialized. */ sdsfree(reply); return PSYNC_CONTINUE; } - if (link != server.primary) { - /* Continuing from a cached primary should only happen when we are syncing for primary replication. */ - sdsfree(reply); - serverLog(LL_WARNING, "Received +CONTINUE response to PSYNC when not doing replication and not performing dual channel sync. Failing PSYNC."); - return PSYNC_NOT_SUPPORTED; - } - /* Partial resync was accepted. */ serverLog(LL_NOTICE, "Successful partial resynchronization with primary."); @@ -3441,7 +3296,7 @@ int replicaTryPartialResynchronization(replicationLink *link, int read_reply) { /* Setup the replication to continue. */ sdsfree(reply); - replicationResurrectCachedPrimary(link); + replicationResurrectCachedPrimary(conn); /* If this instance was restarted and we read the metadata to * PSYNC from the persistence file, our replication backlog could
Write error."); - *err = sdsnew(connGetLastError(link->transfer_s)); + *err = sdsnew(connGetLastError(conn)); return C_ERR; } return C_OK; } -int dualChannelReplMainConnRecvPsyncReply(replicationLink *link, sds *err) { - int psync_result = replicaTryPartialResynchronization(link, 1); +int dualChannelReplMainConnRecvPsyncReply(connection *conn, sds *err) { + int psync_result = replicaTryPartialResynchronization(conn, 1); if (psync_result == PSYNC_WAIT_REPLY) return C_OK; /* Try again later... */ if (psync_result == PSYNC_CONTINUE) { dualChannelServerLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Primary accepted a Partial Resynchronization%s", - link->rdb_transfer_s != NULL ? ", RDB load in background." : "."); + server.repl_rdb_transfer_s != NULL ? ", RDB load in background." : "."); if (server.supervised_mode == SUPERVISED_SYSTEMD) { serverCommunicateSystemd("STATUS=PRIMARY <-> REPLICA sync: Partial Resynchronization accepted. Ready to " "accept connections in read-write mode.\n"); } - dualChannelSyncHandlePsync(link); + dualChannelSyncHandlePsync(); return C_OK; } *err = getTryPsyncString(psync_result); @@ -3555,126 +3410,43 @@ void dualChannelSetupMainConnForPsync(connection *conn) { char *err = NULL; int ret; - replicationLink *link = (replicationLink *)connGetPrivateData(conn); - - switch (link->state) { + switch (server.repl_state) { case REPL_STATE_SEND_HANDSHAKE: - ret = dualChannelReplMainConnSendHandshake(link, &err); - if (ret == C_OK) link->state = REPL_STATE_RECEIVE_CAPA_REPLY; + ret = dualChannelReplMainConnSendHandshake(conn, &err); + if (ret == C_OK) server.repl_state = REPL_STATE_RECEIVE_CAPA_REPLY; break; case REPL_STATE_RECEIVE_CAPA_REPLY: - ret = dualChannelReplMainConnRecvCapaReply(link, &err); + ret = dualChannelReplMainConnRecvCapaReply(conn, &err); if (ret == C_ERR) { break; } - if (ret == C_OK) link->state = REPL_STATE_SEND_PSYNC; + if (ret == C_OK) server.repl_state = REPL_STATE_SEND_PSYNC; sdsfree(err); err = NULL; /* fall through */ case REPL_STATE_SEND_PSYNC: - ret = dualChannelReplMainConnSendPsync(link, &err); - if (ret == C_OK) link->state = REPL_STATE_RECEIVE_PSYNC_REPLY; + ret = dualChannelReplMainConnSendPsync(conn, &err); + if (ret == C_OK) server.repl_state = REPL_STATE_RECEIVE_PSYNC_REPLY; break; case REPL_STATE_RECEIVE_PSYNC_REPLY: - ret = dualChannelReplMainConnRecvPsyncReply(link, &err); - if (ret == C_OK && link->rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) - link->state = REPL_STATE_TRANSFER; - /* In case the RDB is already loaded, the repl_state will be set during establishSourceConnection. */ + ret = dualChannelReplMainConnRecvPsyncReply(conn, &err); + if (ret == C_OK && server.repl_rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) + server.repl_state = REPL_STATE_TRANSFER; + /* In case the RDB is already loaded, the repl_state will be set during establishPrimaryConnection. */ break; default: - serverPanic("Unexpected replication state: %d", link->state); + serverPanic("Unexpected replication state: %d", server.repl_state); } if (ret == C_ERR) { dualChannelServerLog(LL_WARNING, "Aborting dual channel sync. Main channel psync result %d %s", ret, err ? err : ""); - cancelReplicationHandshake(link, 1); + cancelReplicationHandshake(1); } sdsfree(err); } -/* - * Dual channel for full sync - * - * * Motivation * - * - Reduce primary memory load. We do that by moving the COB tracking to the replica side. This also decrease - * the chance for COB overruns. 
Note that primary's input buffer limits at the replica side are less restricted - * then primary's COB as the replica plays less critical part in the replication group. While increasing the - * primary's COB may end up with primary reaching swap and clients suffering, at replica side we're more at - * ease with it. Larger COB means better chance to sync successfully. - * - Reduce primary main process CPU load. By opening a new, dedicated channel for the RDB transfer, child - * processes can have direct access to the new channel. Due to TLS connection restrictions, this was not - * possible using one main channel. We eliminate the need for the child process to use the primary's - * child-proc -> main-proc pipeline, thus freeing up the main process to process clients queries. - * - * * High level interface design * - * - Dual channel sync begins when the replica sends a REPLCONF capa dual-channel to the primary during initial - * handshake. This allows the replica to verify whether the primary supports dual-channel-replication and, if - * so, state that this is the replica's main channel, which is not used for snapshot transfer. - * - When replica lacks sufficient data for PSYNC, the primary will send +DUALCHANNELSYNC response instead - * of RDB data. As a next step, the replica creates a new channel (rdb-channel) and configures it against - * the primary with the appropriate capabilities and requirements. The replica then requests a sync - * using the RDB channel. - * - Prior to forking, the primary sends the replica the snapshot's end repl-offset, and attaches the replica - * to the replication backlog to keep repl data until the replica requests psync. The replica uses the main - * channel to request a PSYNC starting at the snapshot end offset. - * - The primary main threads sends incremental changes via the main channel, while the bgsave process - * sends the RDB directly to the replica via the rdb-channel. As for the replica, the incremental - * changes are stored on a local buffer, while the RDB is loaded into memory. - * - Once the replica completes loading the rdb, it drops the rdb channel and streams the accumulated incremental - * changes into memory. Repl steady state continues normally. 
- * - * * Replica state machine * - * ┌───────────────────┐ Dual channel sync - * │RECEIVE_PING_REPLY │ ┌──────────────────────────────────────────────────────────────┐ - * └────────┬──────────┘ │ RDB channel states Main channel state │ - * │+PONG │ ┌────────────────────────────┐ ┌───────────────────┐ │ - * ┌────────▼──────────┐ ┌─┼─────►DUAL_CHANNEL_SEND_HANDSHAKE │ ┌─►SEND_HANDSHAKE │ │ - * │SEND_HANDSHAKE │ │ │ └────┬───────────────────────┘ │ └──┬────────────────┘ │ - * └────────┬──────────┘ │ │ │ │ │REPLCONF set-rdb-client-id - * │ │ │ ┌───────▼───────────────────────┐ │ ┌──▼────────────────┐ │ - * ┌────────▼──────────┐ │ │ │DUAL_CHANNEL_RECEIVE_AUTH_REPLY│ │ │RECEIVE_CAPA_REPLY │ │ - * │RECEIVE_AUTH_REPLY │ │ │ └───────┬───────────────────────┘ │ └──┬────────────────┘ │ - * └────────┬──────────┘ │ │ │+OK │ │+OK │ - * │+OK │ │ ┌───────▼───────────────────────┐ │ ┌──▼────────────────┐ │ - * ┌────────▼──────────┐ │ │ │DUAL_CHANNEL_RECEIVE_REPLCONF_REPLY│SEND_PSYNC │ │ - * │RECEIVE_PORT_REPLY │ │ │ └───────┬───────────────────────┘ │ └──┬────────────────┘ │ - * └────────┬──────────┘ │ │ │+OK │ │PSYNC use snapshot │ - * │+OK │ │ ┌───────▼───────────────────┐ │ │end-offset provided │ - * ┌────────▼──────────┐ │ │ │DUAL_CHANNEL_RECEIVE_ENDOFF│ │ │by the primary │ - * │RECEIVE_IP_REPLY │ │ │ └───────┬───────────────────┘ │ ┌──▼────────────────┐ │ - * └────────┬──────────┘ │ │ │$ENDOFF │ │RECEIVE_PSYNC_REPLY│ │ - * │+OK │ │ ├─────────────────────────┘ └──┬────────────────┘ │ - * ┌────────▼──────────┐ │ │ │ │+CONTINUE │ - * │RECEIVE_CAPA_REPLY │ │ │ ┌───────▼───────────────┐ ┌──▼────────────────┐ │ - * └────────┬──────────┘ │ │ │DUAL_CHANNEL_RDB_LOAD │ │TRANSFER │ │ - * │+OK │ │ └───────┬───────────────┘ └─────┬─────────────┘ │ - * ┌────────▼─────────────┐ │ │ │Done loading │ │ - * │RECEIVE_VERSION_REPLY │ │ │ ┌───────▼───────────────┐ │ │ - * └────────┬─────────────┘ │ │ │DUAL_CHANNEL_RDB_LOADED│ │ │ - * │+OK │ │ └───────┬───────────────┘ │ │ - * ┌────────▼───┐ │ │ │ │ │ - * │SEND_PSYNC │ │ │ │Replica loads local replication │ │ - * └─┬──────────┘ │ │ │buffer into memory │ │ - * │PSYNC (use cached-primary)│ │ └─────────┬───────────────────────┘ │ - * ┌─▼─────────────────┐ │ │ │ │ - * │RECEIVE_PSYNC_REPLY│ │ └────────────────────┼─────────────────────────────────────────┘ - * └────────┬─┬────────┘ │ │ - * +CONTINUE│ │+DUALCHANNELSYNC │ │ - * │ │ └─────────────────┘ │ - * │ │+FULLRESYNC │ - * │ ┌─▼─────────────────┐ ┌────▼──────────────┐ - * │ │TRANSFER ├───────────────────►CONNECTED │ - * │ └───────────────────┘ └────▲──────────────┘ - * │ │ - * └─────────────────────────────────────────────────┘ - */ -/* This handler fires when the non blocking connect was able to - * establish a connection with the primary. */ -void syncWithSource(connection *conn) { - char tmpfile[256], *err = NULL; - int psync_result; - - replicationLink *link = (replicationLink *)connGetPrivateData(conn); +int replicationProceedWithHandshake(connection *conn, int curr_state, slotBitmap slot_bitmap) { + char *err = NULL; /* Check for errors in the socket: after a non blocking connect() we * may find that the socket is in error state. */ @@ -3684,22 +3456,16 @@ void syncWithSource(connection *conn) { } /* Send a PING to check the primary is able to reply without errors. 
*/ - if (link->state == REPL_STATE_CONNECTING) { + if (curr_state == REPL_STATE_CONNECTING) { serverLog(LL_NOTICE, "Non blocking connect for SYNC fired the event."); - /* Delete the writable event so that the readable event remains - * registered and we can wait for the PONG reply. */ - connSetReadHandler(conn, syncWithSource); - connSetWriteHandler(conn, NULL); - link->state = REPL_STATE_RECEIVE_PING_REPLY; /* Send the PING, don't check for errors at all, we have the timeout * that will take care about this. */ err = sendCommand(conn, "PING", NULL); if (err) goto write_error; - return; + return REPL_STATE_RECEIVE_PING_REPLY; } - - /* Receive the PONG command. */ - if (link->state == REPL_STATE_RECEIVE_PING_REPLY) { + /* Receive the PONG command. */ + if (curr_state == REPL_STATE_RECEIVE_PING_REPLY) { err = receiveSynchronousResponse(conn); /* The primary did not reply */ @@ -3720,10 +3486,10 @@ void syncWithSource(connection *conn) { } sdsfree(err); err = NULL; - link->state = REPL_STATE_SEND_HANDSHAKE; + curr_state = REPL_STATE_SEND_HANDSHAKE; } - if (link->state == REPL_STATE_SEND_HANDSHAKE) { + if (curr_state == REPL_STATE_SEND_HANDSHAKE) { /* AUTH with the primary if required. */ if (server.primary_auth) { char *args[3] = {"AUTH", NULL, NULL}; @@ -3758,11 +3524,11 @@ void syncWithSource(connection *conn) { if (err) goto write_error; } - /* Set the slot number, so that the primary only provides us with the appropriate slot dictionary. */ - if (!isSlotBitmapAllSlots(link->slot_bitmap)) { + /* Set the slot bitmap, so that the primary only provides us with the appropriate slot dictionary. */ + if (slot_bitmap != NULL && !isSlotBitmapEmpty(slot_bitmap)) { char *argv[3] = {"REPLCONF", "slot-bitmap", NULL}; size_t lens[3] = {8, 11, 0}; - argv[2] = (char *)link->slot_bitmap; + argv[2] = (char *)slot_bitmap; lens[2] = sizeof(slotBitmap); err = sendCommandArgv(conn, 3, argv, lens); if (err) goto write_error; @@ -3783,30 +3549,28 @@ void syncWithSource(connection *conn) { err = sendCommand(conn, "REPLCONF", "version", VALKEY_VERSION, NULL); if (err) goto write_error; - link->state = REPL_STATE_RECEIVE_AUTH_REPLY; - return; + return REPL_STATE_RECEIVE_AUTH_REPLY; } - if (link->state == REPL_STATE_RECEIVE_AUTH_REPLY && !server.primary_auth) - link->state = REPL_STATE_RECEIVE_PORT_REPLY; + if (curr_state == REPL_STATE_RECEIVE_AUTH_REPLY && !server.primary_auth) + curr_state = REPL_STATE_RECEIVE_PORT_REPLY; /* Receive AUTH reply. */ - if (link->state == REPL_STATE_RECEIVE_AUTH_REPLY) { + if (curr_state == REPL_STATE_RECEIVE_AUTH_REPLY) { err = receiveSynchronousResponse(conn); if (err == NULL) goto no_response_error; if (err[0] == '-') { - serverLog(LL_WARNING, "Unable to AUTH to %s: %s", replicationGetNameForLogs(link), err); + serverLog(LL_WARNING, "Unable to AUTH to PRIMARY: %s", err); sdsfree(err); goto error; } sdsfree(err); err = NULL; - link->state = REPL_STATE_RECEIVE_PORT_REPLY; - return; + return REPL_STATE_RECEIVE_PORT_REPLY; } /* Receive REPLCONF listening-port reply. 
*/ - if (link->state == REPL_STATE_RECEIVE_PORT_REPLY) { + if (curr_state == REPL_STATE_RECEIVE_PORT_REPLY) { err = receiveSynchronousResponse(conn); if (err == NULL) goto no_response_error; /* Ignore the error if any, not all the Redis OSS versions support @@ -3818,15 +3582,14 @@ void syncWithSource(connection *conn) { err); } sdsfree(err); - link->state = REPL_STATE_RECEIVE_IP_REPLY; - return; + return REPL_STATE_RECEIVE_IP_REPLY; } - if (link->state == REPL_STATE_RECEIVE_IP_REPLY && !server.replica_announce_ip) - link->state = REPL_STATE_RECEIVE_SLOT_REPLY; + if (curr_state == REPL_STATE_RECEIVE_IP_REPLY && !server.replica_announce_ip) + curr_state = REPL_STATE_RECEIVE_SLOT_REPLY; /* Receive REPLCONF ip-address reply. */ - if (link->state == REPL_STATE_RECEIVE_IP_REPLY) { + if (curr_state == REPL_STATE_RECEIVE_IP_REPLY) { err = receiveSynchronousResponse(conn); if (err == NULL) goto no_response_error; /* Ignore the error if any, not all the Redis OSS versions support @@ -3838,59 +3601,173 @@ void syncWithSource(connection *conn) { err); } sdsfree(err); - link->state = REPL_STATE_RECEIVE_SLOT_REPLY; - return; + return REPL_STATE_RECEIVE_SLOT_REPLY; } - if (link->state == REPL_STATE_RECEIVE_SLOT_REPLY && isSlotBitmapAllSlots(link->slot_bitmap)) - link->state = REPL_STATE_RECEIVE_CAPA_REPLY; + if (curr_state == REPL_STATE_RECEIVE_SLOT_REPLY && (slot_bitmap == NULL || isSlotBitmapEmpty(slot_bitmap))) + curr_state = REPL_STATE_RECEIVE_CAPA_REPLY; - if (link->state == REPL_STATE_RECEIVE_SLOT_REPLY) { + if (curr_state == REPL_STATE_RECEIVE_SLOT_REPLY) { err = receiveSynchronousResponse(conn); if (err == NULL) goto no_response_error; - /* If we sent the slot number, we need it to be properly acked, or we can't do slot migration. */ + /* If we sent the slot bitmap, we need it to be properly acked, or we can't do slot migration. */ if (err[0] == '-') { serverLog(LL_WARNING, "Source does not understand REPLCONF slot-num. Cannot continue with slot-level sync: %s", err); sdsfree(err); goto error; } sdsfree(err); - link->state = REPL_STATE_RECEIVE_CAPA_REPLY; - return; + return REPL_STATE_RECEIVE_CAPA_REPLY; } + /* Receive CAPA reply. */ - if (link->state == REPL_STATE_RECEIVE_CAPA_REPLY) { + if (curr_state == REPL_STATE_RECEIVE_CAPA_REPLY) { err = receiveSynchronousResponse(conn); if (err == NULL) goto no_response_error; /* Ignore the error if any, not all the Redis OSS versions support * REPLCONF capa. */ if (err[0] == '-') { serverLog(LL_NOTICE, - "(Non critical) Source does not understand " + "(Non critical) Primary does not understand " "REPLCONF capa: %s", err); } sdsfree(err); err = NULL; - link->state = REPL_STATE_RECEIVE_VERSION_REPLY; - return; + return REPL_STATE_RECEIVE_VERSION_REPLY; } /* Receive VERSION reply. */ - if (link->state == REPL_STATE_RECEIVE_VERSION_REPLY) { + if (curr_state == REPL_STATE_RECEIVE_VERSION_REPLY) { err = receiveSynchronousResponse(conn); if (err == NULL) goto no_response_error; /* Ignore the error if any. Valkey >= 8 supports REPLCONF VERSION. 
*/ if (err[0] == '-') { serverLog(LL_NOTICE, - "(Non critical) Source does not understand " + "(Non critical) Primary does not understand " "REPLCONF VERSION: %s", err); } sdsfree(err); err = NULL; - link->state = REPL_STATE_SEND_PSYNC; + return REPL_STATE_SEND_PSYNC; + } + + +no_response_error: /* Handle receiveSynchronousResponse() error when primary has no reply */ + serverLog(LL_WARNING, "Primary did not respond to command during SYNC handshake"); + /* Fall through to regular error handling */ + +error: + return REPL_STATE_ERROR; + +write_error: /* Handle sendCommand() errors. */ + serverLog(LL_WARNING, "Sending command to primary in replication handshake: %s", err); + sdsfree(err); + goto error; +} + +/* + * Dual channel for full sync + * + * * Motivation * + * - Reduce primary memory load. We do that by moving the COB tracking to the replica side. This also decrease + * the chance for COB overruns. Note that primary's input buffer limits at the replica side are less restricted + * then primary's COB as the replica plays less critical part in the replication group. While increasing the + * primary's COB may end up with primary reaching swap and clients suffering, at replica side we're more at + * ease with it. Larger COB means better chance to sync successfully. + * - Reduce primary main process CPU load. By opening a new, dedicated channel for the RDB transfer, child + * processes can have direct access to the new channel. Due to TLS connection restrictions, this was not + * possible using one main channel. We eliminate the need for the child process to use the primary's + * child-proc -> main-proc pipeline, thus freeing up the main process to process clients queries. + * + * * High level interface design * + * - Dual channel sync begins when the replica sends a REPLCONF capa dual-channel to the primary during initial + * handshake. This allows the replica to verify whether the primary supports dual-channel-replication and, if + * so, state that this is the replica's main channel, which is not used for snapshot transfer. + * - When replica lacks sufficient data for PSYNC, the primary will send +DUALCHANNELSYNC response instead + * of RDB data. As a next step, the replica creates a new channel (rdb-channel) and configures it against + * the primary with the appropriate capabilities and requirements. The replica then requests a sync + * using the RDB channel. + * - Prior to forking, the primary sends the replica the snapshot's end repl-offset, and attaches the replica + * to the replication backlog to keep repl data until the replica requests psync. The replica uses the main + * channel to request a PSYNC starting at the snapshot end offset. + * - The primary main threads sends incremental changes via the main channel, while the bgsave process + * sends the RDB directly to the replica via the rdb-channel. As for the replica, the incremental + * changes are stored on a local buffer, while the RDB is loaded into memory. + * - Once the replica completes loading the rdb, it drops the rdb channel and streams the accumulated incremental + * changes into memory. Repl steady state continues normally. 
+ * + * * Replica state machine * + * ┌───────────────────┐ Dual channel sync + * │RECEIVE_PING_REPLY │ ┌──────────────────────────────────────────────────────────────┐ + * └────────┬──────────┘ │ RDB channel states Main channel state │ + * │+PONG │ ┌────────────────────────────┐ ┌───────────────────┐ │ + * ┌────────▼──────────┐ ┌─┼─────►DUAL_CHANNEL_SEND_HANDSHAKE │ ┌─►SEND_HANDSHAKE │ │ + * │SEND_HANDSHAKE │ │ │ └────┬───────────────────────┘ │ └──┬────────────────┘ │ + * └────────┬──────────┘ │ │ │ │ │REPLCONF set-rdb-client-id + * │ │ │ ┌───────▼───────────────────────┐ │ ┌──▼────────────────┐ │ + * ┌────────▼──────────┐ │ │ │DUAL_CHANNEL_RECEIVE_AUTH_REPLY│ │ │RECEIVE_CAPA_REPLY │ │ + * │RECEIVE_AUTH_REPLY │ │ │ └───────┬───────────────────────┘ │ └──┬────────────────┘ │ + * └────────┬──────────┘ │ │ │+OK │ │+OK │ + * │+OK │ │ ┌───────▼───────────────────────┐ │ ┌──▼────────────────┐ │ + * ┌────────▼──────────┐ │ │ │DUAL_CHANNEL_RECEIVE_REPLCONF_REPLY│SEND_PSYNC │ │ + * │RECEIVE_PORT_REPLY │ │ │ └───────┬───────────────────────┘ │ └──┬────────────────┘ │ + * └────────┬──────────┘ │ │ │+OK │ │PSYNC use snapshot │ + * │+OK │ │ ┌───────▼───────────────────┐ │ │end-offset provided │ + * ┌────────▼──────────┐ │ │ │DUAL_CHANNEL_RECEIVE_ENDOFF│ │ │by the primary │ + * │RECEIVE_IP_REPLY │ │ │ └───────┬───────────────────┘ │ ┌──▼────────────────┐ │ + * └────────┬──────────┘ │ │ │$ENDOFF │ │RECEIVE_PSYNC_REPLY│ │ + * │+OK │ │ ├─────────────────────────┘ └──┬────────────────┘ │ + * ┌────────▼──────────┐ │ │ │ │+CONTINUE │ + * │RECEIVE_CAPA_REPLY │ │ │ ┌───────▼───────────────┐ ┌──▼────────────────┐ │ + * └────────┬──────────┘ │ │ │DUAL_CHANNEL_RDB_LOAD │ │TRANSFER │ │ + * │+OK │ │ └───────┬───────────────┘ └─────┬─────────────┘ │ + * ┌────────▼─────────────┐ │ │ │Done loading │ │ + * │RECEIVE_VERSION_REPLY │ │ │ ┌───────▼───────────────┐ │ │ + * └────────┬─────────────┘ │ │ │DUAL_CHANNEL_RDB_LOADED│ │ │ + * │+OK │ │ └───────┬───────────────┘ │ │ + * ┌────────▼───┐ │ │ │ │ │ + * │SEND_PSYNC │ │ │ │Replica loads local replication │ │ + * └─┬──────────┘ │ │ │buffer into memory │ │ + * │PSYNC (use cached-primary)│ │ └─────────┬───────────────────────┘ │ + * ┌─▼─────────────────┐ │ │ │ │ + * │RECEIVE_PSYNC_REPLY│ │ └────────────────────┼─────────────────────────────────────────┘ + * └────────┬─┬────────┘ │ │ + * +CONTINUE│ │+DUALCHANNELSYNC │ │ + * │ │ └─────────────────┘ │ + * │ │+FULLRESYNC │ + * │ ┌─▼─────────────────┐ ┌────▼──────────────┐ + * │ │TRANSFER ├───────────────────►CONNECTED │ + * │ └───────────────────┘ └────▲──────────────┘ + * │ │ + * └─────────────────────────────────────────────────┘ + */ +/* This handler fires when the non blocking connect was able to + * establish a connection with the primary. */ +void syncWithPrimary(connection *conn) { + char tmpfile[256], *err = NULL; + int psync_result; + + /* If this event fired after the user turned the instance into a primary + * with REPLICAOF NO ONE we must just return ASAP. */ + if (server.repl_state == REPL_STATE_NONE) { + connClose(conn); + return; + } + + if (server.repl_state < REPL_STATE_SEND_PSYNC) { + server.repl_state = replicationProceedWithHandshake(conn, server.repl_state, NULL); + + if (server.repl_state == REPL_STATE_RECEIVE_PING_REPLY) { + /* Delete the writable event so that the readable event remains + * registered and we can wait for the PONG reply. 
*/ + connSetReadHandler(conn, syncWithPrimary); + connSetWriteHandler(conn, NULL); + } else if (server.repl_state == REPL_STATE_ERROR) { + goto error; + } } /* Try a partial resynchronization. If we don't have a cached primary @@ -3898,32 +3775,32 @@ void syncWithSource(connection *conn) { * to start a full resynchronization so that we get the primary replid * and the global offset, to try a partial resync at the next * reconnection attempt. */ - if (link->state == REPL_STATE_SEND_PSYNC) { - if (replicaTryPartialResynchronization(link, 0) == PSYNC_WRITE_ERROR) { + if (server.repl_state == REPL_STATE_SEND_PSYNC) { + if (replicaTryPartialResynchronization(conn, 0) == PSYNC_WRITE_ERROR) { err = sdsnew("Write error sending the PSYNC command."); abortFailover("Write error to failover target"); goto write_error; } - link->state = REPL_STATE_RECEIVE_PSYNC_REPLY; + server.repl_state = REPL_STATE_RECEIVE_PSYNC_REPLY; return; } /* If reached this point, we should be in REPL_STATE_RECEIVE_PSYNC_REPLY. */ - if (link->state != REPL_STATE_RECEIVE_PSYNC_REPLY) { + if (server.repl_state != REPL_STATE_RECEIVE_PSYNC_REPLY) { serverLog(LL_WARNING, - "syncWithSource(): state machine error, " + "syncWithPrimary(): state machine error, " "state should be RECEIVE_PSYNC but is %d", - link->state); + server.repl_state); goto error; } - psync_result = replicaTryPartialResynchronization(link, 1); + psync_result = replicaTryPartialResynchronization(conn, 1); if (psync_result == PSYNC_WAIT_REPLY) return; /* Try again later... */ /* Check the status of the planned failover. We expect PSYNC_CONTINUE, * but there is nothing technically wrong with a full resync which * could happen in edge cases. */ - if (server.failover_state == FAILOVER_IN_PROGRESS && link == server.primary) { + if (server.failover_state == FAILOVER_IN_PROGRESS) { if (psync_result == PSYNC_CONTINUE || psync_result == PSYNC_FULLRESYNC) { clearFailoverState(); } else { @@ -3956,13 +3833,13 @@ void syncWithSource(connection *conn) { if (psync_result == PSYNC_NOT_SUPPORTED) { serverLog(LL_NOTICE, "Retrying with SYNC..."); if (connSyncWrite(conn, "SYNC\r\n", 6, server.repl_syncio_timeout * 1000) == -1) { - serverLog(LL_WARNING, "I/O error writing to %s: %s", replicationGetNameForLogs(link), connGetLastError(conn)); + serverLog(LL_WARNING, "I/O error writing to PRIMARY: %s", connGetLastError(conn)); goto error; } } /* Prepare a suitable temp file for bulk transfer */ - if (!useDisklessLoad() && isSlotBitmapAllSlots(link->slot_bitmap)) { + if (!useDisklessLoad()) { int dfd = -1, maxtries = 5; while (maxtries--) { snprintf(tmpfile, 256, "temp-%d.%ld.rdb", (int)server.unixtime, (long int)getpid()); @@ -3975,33 +3852,24 @@ void syncWithSource(connection *conn) { errno = saved_errno; } if (dfd == -1) { - serverLog(LL_WARNING, "Opening the temp file needed for %s <-> REPLICA synchronization: %s", replicationGetNameForLogs(link), + serverLog(LL_WARNING, "Opening the temp file needed for PRIMARY <-> REPLICA synchronization: %s", strerror(errno)); goto error; } - link->transfer_tmpfile = zstrdup(tmpfile); - link->transfer_fd = dfd; - } - - /* We are going to need to do a full resync. If we are accepting a - * slot subset - make sure we have a clean state to load it into. This may - * happen in cases where a previous replication attempt failed and is being - * retried. 
*/ - if (!isSlotBitmapAllSlots(link->slot_bitmap)) { - dropKeysInSlotBitmap(link->slot_bitmap, 1); + server.repl_transfer_tmpfile = zstrdup(tmpfile); + server.repl_transfer_fd = dfd; } /* Using dual-channel-replication, the primary responded +DUALCHANNELSYNC. We need to * initialize the RDB channel. */ if (psync_result == PSYNC_FULLRESYNC_DUAL_CHANNEL) { /* Create RDB connection */ - link->rdb_transfer_s = connCreate(connTypeOfReplication()); - connSetPrivateData(link->rdb_transfer_s, link); - if (connConnect(link->rdb_transfer_s, link->host, link->port, server.bind_source_addr, - dualChannelFullSyncWithReplicationSource) == C_ERR) { - serverLog(LL_WARNING, "Unable to connect to source: %s", connGetLastError(link->transfer_s)); - connClose(link->rdb_transfer_s); - link->rdb_transfer_s = NULL; + server.repl_rdb_transfer_s = connCreate(connTypeOfReplication()); + if (connConnect(server.repl_rdb_transfer_s, server.primary_host, server.primary_port, server.bind_source_addr, + dualChannelFullSyncWithPrimary) == C_ERR) { + serverLog(LL_WARNING, "Unable to connect to Primary: %s", connGetLastError(server.repl_transfer_s)); + connClose(server.repl_rdb_transfer_s); + server.repl_rdb_transfer_s = NULL; goto error; } if (connSetReadHandler(conn, NULL) == C_ERR) { @@ -4010,50 +3878,36 @@ void syncWithSource(connection *conn) { connGetInfo(conn, conninfo, sizeof(conninfo))); goto error; } - link->rdb_channel_state = REPL_DUAL_CHANNEL_SEND_HANDSHAKE; + server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_SEND_HANDSHAKE; return; } - if (replicationUseAOFFormatSnapshot(link)) { - link->client = createReplicationLinkClientWithHandler(link, conn, -1, readQueryFromClient); - link->transfer_s = NULL; - } else { - /* Setup the non blocking download of the bulk file. */ - if (connSetReadHandler(conn, readSyncBulkPayload) == C_ERR) { - char conninfo[CONN_INFO_LEN]; - serverLog(LL_WARNING, "Can't create readable event for SYNC: %s (%s)", strerror(errno), - connGetInfo(conn, conninfo, sizeof(conninfo))); - goto error; - } + /* Setup the non blocking download of the bulk file. 
*/ + if (connSetReadHandler(conn, readSyncBulkPayload) == C_ERR) { + char conninfo[CONN_INFO_LEN]; + serverLog(LL_WARNING, "Can't create readable event for SYNC: %s (%s)", strerror(errno), + connGetInfo(conn, conninfo, sizeof(conninfo))); + goto error; } - link->state = REPL_STATE_TRANSFER; - link->transfer_size = -1; - link->transfer_read = 0; - link->transfer_last_fsync_off = 0; - link->transfer_lastio = server.unixtime; + server.repl_state = REPL_STATE_TRANSFER; + server.repl_transfer_size = -1; + server.repl_transfer_read = 0; + server.repl_transfer_last_fsync_off = 0; + server.repl_transfer_lastio = server.unixtime; return; -no_response_error: /* Handle receiveSynchronousResponse() error when primary has no reply */ - serverLog(LL_WARNING, "Primary did not respond to command during SYNC handshake"); - /* Fall through to regular error handling */ - error: connClose(conn); - link->transfer_s = NULL; - if (link->rdb_transfer_s) { - connClose(link->rdb_transfer_s); - link->rdb_transfer_s = NULL; - } - if (link->transfer_fd != -1) close(link->transfer_fd); - if (link->transfer_tmpfile) zfree(link->transfer_tmpfile); - link->transfer_tmpfile = NULL; - link->transfer_fd = -1; - link->state = REPL_STATE_CONNECT; - if (link->client) { - freeClient(link->client); - link->client = NULL; + server.repl_transfer_s = NULL; + if (server.repl_rdb_transfer_s) { + connClose(server.repl_rdb_transfer_s); + server.repl_rdb_transfer_s = NULL; } - return; + if (server.repl_transfer_fd != -1) close(server.repl_transfer_fd); + if (server.repl_transfer_tmpfile) zfree(server.repl_transfer_tmpfile); + server.repl_transfer_tmpfile = NULL; + server.repl_transfer_fd = -1; + server.repl_state = REPL_STATE_CONNECT; write_error: /* Handle sendCommand() errors. */ serverLog(LL_WARNING, "Sending command to primary in replication handshake: %s", err); @@ -4061,108 +3915,20 @@ void syncWithSource(connection *conn) { goto error; } -replicationLink *createReplicationLink(char *host, int port, slotBitmap slot_bitmap) { - replicationLink *result = (replicationLink *)zmalloc(sizeof(replicationLink)); - result->protected = 0; - result->state = REPL_STATE_NONE; - result->rdb_channel_state = REPL_DUAL_CHANNEL_STATE_NONE; - memcpy(result->slot_bitmap, slot_bitmap, sizeof(slotBitmap)); - result->client = NULL; - result->host = sdsnew(host); - result->port = port; - result->transfer_s = NULL; - result->rdb_transfer_s = NULL; - result->rdb_client_id = -1; - result->replid[0] = '\0'; - result->initial_offset = -1; - result->transfer_size = 0; - result->transfer_read = 0; - result->transfer_last_fsync_off = 0; - result->transfer_fd = -1; - result->transfer_tmpfile = NULL; - result->transfer_lastio = 0; - result->provisional_source_state.replid[0] = '\0'; - result->provisional_source_state.reploff = -1; - result->provisional_source_state.read_reploff = -1; - result->provisional_source_state.dbid = -1; - result->pending_repl_data.blocks = NULL; - result->pending_repl_data.len = 0; - result->pending_repl_data.peak = 0; - listAddNodeTail(server.replication_links, result); - return result; -} - - -int freeReplicationLink(replicationLink *link) { - if (!link) return 0; - - /* Free primary_host before any calls to freeClient since it calls - * replicationHandleSourceDisconnection which can trigger a re-connect - * directly from within that call. 
*/ - sdsfree(link->host); - link->host = NULL; - - cancelReplicationHandshake(link, 0); - if (link->client) { - freeClient(link->client); - link->client = NULL; - } - - if (link->transfer_s) { - connClose(link->transfer_s); - link->transfer_s = NULL; - } - if (link->rdb_transfer_s) { - connClose(link->rdb_transfer_s); - link->rdb_transfer_s = NULL; - } - if (link->transfer_tmpfile) { - zfree(link->transfer_tmpfile); - link->transfer_tmpfile = NULL; - } - if (link->transfer_fd != -1) { - close(link->transfer_fd); - link->transfer_fd = -1; - } - freePendingReplDataBuf(link); - - /* Unlink this replication link from the server list */ - listIter li; - listNode *ln; - listRewind(server.replication_links, &li); - while ((ln = listNext(&li))) { - replicationLink *elem = (replicationLink *)ln->value; - if (elem == link) { - listDelNode(server.replication_links, ln); - break; - } - } - - /* Keep the link intact if it is protected, but mark it as such */ - if (link->protected) { - link->state = REPL_STATE_CANCELLED; - return 0; - } - zfree(link); - return 1; -} - -int connectReplicationLink(replicationLink *link) { - if (!link) - return C_ERR; - - link->transfer_s = connCreate(connTypeOfReplication()); - connSetPrivateData(link->transfer_s, link); - if (connConnect(link->transfer_s, link->host, link->port, server.bind_source_addr, syncWithSource) == C_ERR) { - serverLog(LL_WARNING, "Unable to connect to %s: %s", replicationGetNameForLogs(link), connGetLastError(link->transfer_s)); - connClose(link->transfer_s); - link->transfer_s = NULL; +int connectWithPrimary(void) { + server.repl_transfer_s = connCreate(connTypeOfReplication()); + if (connConnect(server.repl_transfer_s, server.primary_host, server.primary_port, server.bind_source_addr, + syncWithPrimary) == C_ERR) { + serverLog(LL_WARNING, "Unable to connect to PRIMARY: %s", connGetLastError(server.repl_transfer_s)); + connClose(server.repl_transfer_s); + server.repl_transfer_s = NULL; return C_ERR; } - link->transfer_lastio = server.unixtime; - link->state = REPL_STATE_CONNECTING; - serverLog(LL_NOTICE, "%s <-> REPLICA sync started", replicationGetNameForLogs(link)); + + server.repl_transfer_lastio = server.unixtime; + server.repl_state = REPL_STATE_CONNECTING; + serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync started"); return C_OK; } @@ -4170,27 +3936,23 @@ int connectReplicationLink(replicationLink *link) { * in progress to undo it. * Never call this function directly, use cancelReplicationHandshake() instead. */ -void undoConnectWithSource(replicationLink *link) { - if (link->client) { - freeClient(link->client); - } else if (link->transfer_s) { - connClose(link->transfer_s); - link->transfer_s = NULL; - } +void undoConnectWithPrimary(void) { + connClose(server.repl_transfer_s); + server.repl_transfer_s = NULL; } /* Abort the async download of the bulk dataset while SYNC-ing with primary. * Never call this function directly, use cancelReplicationHandshake() instead. 
*/ -void replicationAbortSyncTransfer(replicationLink *link) { - serverAssert(link->state == REPL_STATE_TRANSFER); - undoConnectWithSource(link); - if (link->transfer_fd != -1) { - close(link->transfer_fd); - bg_unlink(link->transfer_tmpfile); - zfree(link->transfer_tmpfile); - link->transfer_tmpfile = NULL; - link->transfer_fd = -1; +void replicationAbortSyncTransfer(void) { + serverAssert(server.repl_state == REPL_STATE_TRANSFER); + undoConnectWithPrimary(); + if (server.repl_transfer_fd != -1) { + close(server.repl_transfer_fd); + bg_unlink(server.repl_transfer_tmpfile); + zfree(server.repl_transfer_tmpfile); + server.repl_transfer_tmpfile = NULL; + server.repl_transfer_fd = -1; } } @@ -4199,22 +3961,19 @@ void replicationAbortSyncTransfer(replicationLink *link) { * the initial bulk transfer. * * If there was a replication handshake in progress 1 is returned and - * the replication state (link->state) set to REPL_STATE_CONNECT. + * the replication state (server.repl_state) set to REPL_STATE_CONNECT. * * Otherwise zero is returned and no operation is performed at all. */ -int cancelReplicationHandshake(replicationLink *link, int reconnect) { - if (link->rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) { - replicationAbortDualChannelSyncTransfer(link); - } - if (link->state == REPL_STATE_TRANSFER) { - replicationAbortSyncTransfer(link); - /* Note that disconnection may already trigger reconnect */ - if (link->state == REPL_STATE_CONNECTING) - return 1; - link->state = REPL_STATE_CONNECT; - } else if (link->state == REPL_STATE_CONNECTING || replicaIsInHandshakeState(link)) { - undoConnectWithSource(link); - link->state = REPL_STATE_CONNECT; +int cancelReplicationHandshake(int reconnect) { + if (server.repl_rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) { + replicationAbortDualChannelSyncTransfer(); + } + if (server.repl_state == REPL_STATE_TRANSFER) { + replicationAbortSyncTransfer(); + server.repl_state = REPL_STATE_CONNECT; + } else if (server.repl_state == REPL_STATE_CONNECTING || replicaIsInHandshakeState()) { + undoConnectWithPrimary(); + server.repl_state = REPL_STATE_CONNECT; } else { return 0; } @@ -4223,32 +3982,34 @@ int cancelReplicationHandshake(replicationLink *link, int reconnect) { /* try to re-connect without waiting for replicationCron, this is needed * for the "diskless loading short read" test. */ - serverLog(LL_NOTICE, "Reconnecting to replication source %s:%d after failure", link->host, link->port); - connectReplicationLink(link); + serverLog(LL_NOTICE, "Reconnecting to PRIMARY %s:%d after failure", server.primary_host, server.primary_port); + connectWithPrimary(); return 1; } /* Set replication to the specified primary address and port. */ void replicationSetPrimary(char *ip, int port, int full_sync_required) { - int was_primary = server.primary == NULL; - int was_connected = server.primary->state == REPL_STATE_CONNECTED; + int was_primary = server.primary_host == NULL; + sdsfree(server.primary_host); + server.primary_host = NULL; if (server.primary) { /* When joining 'myself' to a new primary, set the dont_cache_primary flag * if a full sync is required. This happens when 'myself' was previously * part of a different shard from the new primary. Since 'myself' does not * have the replication history of the shard it is joining, clearing the * cached primary is necessary to ensure proper replication behavior. 
*/ - server.primary->client->flag.dont_cache_primary = full_sync_required; - freeReplicationLink(server.primary); + server.primary->flag.dont_cache_primary = full_sync_required; + freeClient(server.primary); } disconnectAllBlockedClients(); /* Clients blocked in primary, now replica. */ /* Setting primary_host only after the call to freeClient since it calls - * replicationHandleSourceDisconnection which can trigger a re-connect + * replicationHandlePrimaryDisconnection which can trigger a re-connect * directly from within that call. */ - server.primary = createReplicationLink(ip, port, NULL); + server.primary_host = sdsnew(ip); + server.primary_port = port; /* Update oom_score_adj */ setOOMScoreAdj(-1); @@ -4259,6 +4020,8 @@ void replicationSetPrimary(char *ip, int port, int full_sync_required) { * primary, or finishing transferring RDB and preparing loading DB on full * sync with new primary. */ + cancelReplicationHandshake(0); + /* Before destroying our primary state, create a cached primary using * our own parameters, to later PSYNC with the new primary. */ if (was_primary && !full_sync_required) { @@ -4271,26 +4034,31 @@ void replicationSetPrimary(char *ip, int port, int full_sync_required) { NULL); /* Fire the primary link modules event. */ - if (was_connected) + if (server.repl_state == REPL_STATE_CONNECTED) moduleFireServerEvent(VALKEYMODULE_EVENT_PRIMARY_LINK_CHANGE, VALKEYMODULE_SUBEVENT_PRIMARY_LINK_DOWN, NULL); + server.repl_state = REPL_STATE_CONNECT; /* Allow trying dual-channel-replication with the new primary. If new primary doesn't * support dual-channel-replication, we will set to 0 afterwards. */ - serverLog(LL_NOTICE, "Connecting to PRIMARY %s:%d", server.primary->host, server.primary->port); - connectReplicationLink(server.primary); + serverLog(LL_NOTICE, "Connecting to PRIMARY %s:%d", server.primary_host, server.primary_port); + connectWithPrimary(); } /* Cancel replication, setting the instance as a primary itself. */ void replicationUnsetPrimary(void) { - if (server.primary == NULL) return; /* Nothing to do. */ + if (server.primary_host == NULL) return; /* Nothing to do. */ /* Fire the primary link modules event. */ - if (server.primary->state == REPL_STATE_CONNECTED) + if (server.repl_state == REPL_STATE_CONNECTED) moduleFireServerEvent(VALKEYMODULE_EVENT_PRIMARY_LINK_CHANGE, VALKEYMODULE_SUBEVENT_PRIMARY_LINK_DOWN, NULL); - freeReplicationLink(server.primary); + /* Clear primary_host first, since the freeClient calls + * replicationHandlePrimaryDisconnection which can attempt to re-connect. */ + sdsfree(server.primary_host); + server.primary_host = NULL; + if (server.primary) freeClient(server.primary); replicationDiscardCachedPrimary(); - + cancelReplicationHandshake(0); /* When a replica is turned into a primary, the current replication ID * (that was inherited from the primary at synchronization time) is * used as secondary ID up to the current offset, and a new replication @@ -4301,6 +4069,7 @@ void replicationUnsetPrimary(void) { * the replicas will be able to partially resync with us, so it will be * a very fast reconnection. */ disconnectReplicas(); + server.repl_state = REPL_STATE_NONE; /* We need to make sure the new primary will start the replication stream * with a SELECT statement. This is forced after a full resync, but @@ -4331,37 +4100,23 @@ void replicationUnsetPrimary(void) { /* This function is called when the replica lose the connection with the * primary into an unexpected way. 
*/ -void replicationHandleSourceDisconnection(replicationLink *link) { - if (link == server.primary) { - if (link->state == REPL_STATE_CONNECTED && link == server.primary) { - moduleFireServerEvent(VALKEYMODULE_EVENT_PRIMARY_LINK_CHANGE, VALKEYMODULE_SUBEVENT_PRIMARY_LINK_DOWN, NULL); - } - server.repl_down_since = server.unixtime; - - /* We lost connection with our primary, don't disconnect replicas yet, - * maybe we'll be able to PSYNC with our primary later. We'll disconnect - * the replicas only if we'll have to do a full resync with our primary. */ - } +void replicationHandlePrimaryDisconnection(void) { + /* Fire the primary link modules event. */ + if (server.repl_state == REPL_STATE_CONNECTED) + moduleFireServerEvent(VALKEYMODULE_EVENT_PRIMARY_LINK_CHANGE, VALKEYMODULE_SUBEVENT_PRIMARY_LINK_DOWN, NULL); - link->client = NULL; - link->state = REPL_STATE_CONNECT; - - if (link->rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) { - /* Our client was closed in the middle of dual channel (e.g, we were - * loading AOF as a client). Ensure that the other dual channel - * connections are cleaned up. */ - if (link->transfer_s) { - connClose(link->transfer_s); - link->transfer_s = NULL; - } - replicationAbortDualChannelSyncTransfer(link); - } + server.primary = NULL; + server.repl_state = REPL_STATE_CONNECT; + server.repl_down_since = server.unixtime; + /* We lost connection with our primary, don't disconnect replicas yet, + * maybe we'll be able to PSYNC with our primary later. We'll disconnect + * the replicas only if we'll have to do a full resync with our primary. */ /* Try to re-connect immediately rather than wait for replicationCron * waiting 1 second may risk backlog being recycled. */ - if (link->host) { - serverLog(LL_NOTICE, "Reconnecting to replication source %s:%d", link->host, link->port); - connectReplicationLink(link); + if (server.primary_host) { + serverLog(LL_NOTICE, "Reconnecting to PRIMARY %s:%d", server.primary_host, server.primary_port); + connectWithPrimary(); } } @@ -4381,7 +4136,7 @@ void replicaofCommand(client *c) { /* The special host/port combination "NO" "ONE" turns the instance * into a primary. Otherwise the new primary address is set. */ if (!strcasecmp(c->argv[1]->ptr, "no") && !strcasecmp(c->argv[2]->ptr, "one")) { - if (server.primary) { + if (server.primary_host) { replicationUnsetPrimary(); sds client = catClientInfoShortString(sdsempty(), c, server.hide_user_data_from_log); serverLog(LL_NOTICE, "PRIMARY MODE enabled (user request from '%s')", client); @@ -4401,7 +4156,7 @@ void replicaofCommand(client *c) { if (getRangeLongFromObjectOrReply(c, c->argv[2], 0, 65535, &port, "Invalid master port") != C_OK) return; /* Check if we are already attached to the specified primary */ - if (server.primary && !strcasecmp(server.primary->host, c->argv[1]->ptr) && server.primary->port == port) { + if (server.primary_host && !strcasecmp(server.primary_host, c->argv[1]->ptr) && server.primary_port == port) { serverLog(LL_NOTICE, "REPLICAOF would result into synchronization " "with the primary we are already connected " "with. No operation performed."); @@ -4413,8 +4168,8 @@ void replicaofCommand(client *c) { * we can continue. 
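 *
 * For reference, the user-facing forms handled above (the address is a
 * placeholder):
 *
 *     REPLICAOF 10.0.0.5 6379    -> replicationSetPrimary(...), start syncing
 *     REPLICAOF NO ONE           -> replicationUnsetPrimary(), act as primary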
*/ replicationSetPrimary(c->argv[1]->ptr, port, 0); sds client = catClientInfoShortString(sdsempty(), c, server.hide_user_data_from_log); - serverLog(LL_NOTICE, "REPLICAOF %s:%d enabled (user request from '%s')", server.primary->host, - server.primary->port, client); + serverLog(LL_NOTICE, "REPLICAOF %s:%d enabled (user request from '%s')", server.primary_host, + server.primary_port, client); sdsfree(client); } addReply(c, shared.ok); @@ -4429,7 +4184,7 @@ void roleCommand(client *c) { return; } - if (server.primary == NULL) { + if (server.primary_host == NULL) { listIter li; listNode *ln; void *mbcount; @@ -4461,12 +4216,12 @@ void roleCommand(client *c) { addReplyArrayLen(c, 5); addReplyBulkCBuffer(c, "slave", 5); - addReplyBulkCString(c, server.primary->host); - addReplyLongLong(c, server.primary->port); - if (replicaIsInHandshakeState(server.primary)) { + addReplyBulkCString(c, server.primary_host); + addReplyLongLong(c, server.primary_port); + if (replicaIsInHandshakeState()) { replica_state = "handshake"; } else { - switch (server.primary->state) { + switch (server.repl_state) { case REPL_STATE_NONE: replica_state = "none"; break; case REPL_STATE_CONNECT: replica_state = "connect"; break; case REPL_STATE_CONNECTING: replica_state = "connecting"; break; @@ -4476,18 +4231,17 @@ void roleCommand(client *c) { } } addReplyBulkCString(c, replica_state); - addReplyLongLong(c, server.primary->client ? server.primary->client->repl_data->reploff : -1); + addReplyLongLong(c, server.primary ? server.primary->repl_data->reploff : -1); } } /* Send a REPLCONF ACK command to the primary to inform it about the current * processed offset. If we are not connected with a primary, the command has * no effects. */ -void replicationSendAck(replicationLink *link) { - client *c = link->client; +void replicationSendAck(client *c) { if (c != NULL) { int send_fack = server.fsynced_reploff != -1; - c->flag.primary_force_reply = 1; + c->flag.replication_force_reply = 1; addReplyArrayLen(c, send_fack ? 5 : 3); addReplyBulkCString(c, "REPLCONF"); addReplyBulkCString(c, "ACK"); @@ -4496,7 +4250,7 @@ void replicationSendAck(replicationLink *link) { addReplyBulkCString(c, "FACK"); addReplyBulkLongLong(c, server.fsynced_reploff); } - c->flag.primary_force_reply = 0; + c->flag.replication_force_reply = 0; /* Accumulation from above replies must be reset back to 0 manually, * as this subroutine does not invoke resetClient(). */ @@ -4525,7 +4279,7 @@ void replicationSendAck(replicationLink *link) { * handshake in order to reactivate the cached primary. */ void replicationCachePrimary(client *c) { - serverAssert(server.primary != NULL && server.primary->client != NULL && server.cached_primary == NULL); + serverAssert(server.primary != NULL && server.cached_primary == NULL); serverLog(LL_NOTICE, "Caching the disconnected primary state."); /* Wait for IO operations to be done before proceeding */ @@ -4537,10 +4291,10 @@ void replicationCachePrimary(client *c) { * we want to discard the non processed query buffers and non processed * offsets, including pending transactions, already populated arguments, * pending outputs to the primary. 
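 *
 * Why the offsets must be exact (editor's sketch): the cached primary's
 * replid and reploff are what this replica will offer in its next partial
 * resync attempt, roughly:
 *
 *     PSYNC <cached replid> <cached reploff + 1>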
*/ - sdsclear(c->querybuf); - c->qb_pos = 0; - c->repl_data->repl_applied = 0; - c->repl_data->read_reploff = c->repl_data->reploff; + sdsclear(server.primary->querybuf); + server.primary->qb_pos = 0; + server.primary->repl_data->repl_applied = 0; + server.primary->repl_data->read_reploff = server.primary->repl_data->reploff; if (c->flag.multi) discardTransaction(c); listEmpty(c->reply); c->sentlen = 0; @@ -4549,9 +4303,9 @@ void replicationCachePrimary(client *c) { resetClient(c); resetClientIOState(c); - /* Save the primary. Server.primary->client will be set to null later by - * replicationHandleSourceDisconnection(). */ - server.cached_primary = c; + /* Save the primary. Server.primary will be set to null later by + * replicationHandlePrimaryDisconnection(). */ + server.cached_primary = server.primary; /* Invalidate the Peer ID cache. */ if (c->peerid) { @@ -4566,8 +4320,8 @@ void replicationCachePrimary(client *c) { /* Caching the primary happens instead of the actual freeClient() call, * so make sure to adjust the replication state. This function will - * also set server.primary->client to NULL. */ - replicationHandleSourceDisconnection(server.primary); + * also set server.primary to NULL. */ + replicationHandlePrimaryDisconnection(); } /* This function is called when a primary is turned into a replica, in order to @@ -4583,27 +4337,24 @@ void replicationCachePrimaryUsingMyself(void) { serverLog(LL_NOTICE, "Before turning into a replica, using my own primary parameters " "to synthesize a cached primary: I may be able to synchronize with " "the new primary with just a partial transfer."); - /* Create a temporary link for the purpose of creating a client. */ - replicationLink *temp_link = createReplicationLink(NULL, 0, NULL); /* This will be used to populate the field server.primary->repl_data->reploff * by replicationCreatePrimaryClient(). We'll later set the created * primary as server.cached_primary, so the replica will use such * offset for PSYNC. */ - temp_link->initial_offset = server.primary_repl_offset; + server.primary_initial_offset = server.primary_repl_offset; /* The primary client we create can be set to any DBID, because * the new primary will start its replication stream with SELECT. */ - createReplicationLinkClient(temp_link, NULL, -1); + replicationCreatePrimaryClient(NULL, -1); /* Use our own ID / offset. */ - memcpy(temp_link->client->repl_data->replid, server.replid, sizeof(server.replid)); + memcpy(server.primary->repl_data->replid, server.replid, sizeof(server.replid)); /* Set as cached primary. */ - unlinkClient(temp_link->client); - server.cached_primary = temp_link->client; - temp_link->client = NULL; - freeReplicationLink(temp_link); + unlinkClient(server.primary); + server.cached_primary = server.primary; + server.primary = NULL; } /* Free a cached primary, called when there are no longer the conditions for @@ -4612,7 +4363,7 @@ void replicationDiscardCachedPrimary(void) { if (server.cached_primary == NULL) return; serverLog(LL_NOTICE, "Discarding previously cached primary state."); - server.cached_primary->flag.replication_source = 0; + server.cached_primary->flag.primary = 0; freeClient(server.cached_primary); server.cached_primary = NULL; } @@ -4620,19 +4371,17 @@ void replicationDiscardCachedPrimary(void) { /* Replication: Replica side. * This method performs the necessary steps to establish a connection with the primary server. * It sets private data, updates flags, and fires an event to notify modules about the primary link change. 
*/ -void establishSourceConnection(replicationLink *link) { - connSetPrivateData(link->client->conn, link->client); - link->client->flag.close_after_reply = 0; - link->client->flag.close_asap = 0; - link->client->flag.authenticated = 1; - link->client->last_interaction = server.unixtime; - link->state = REPL_STATE_CONNECTED; - if (link == server.primary) { - server.repl_down_since = 0; +void establishPrimaryConnection(void) { + connSetPrivateData(server.primary->conn, server.primary); + server.primary->flag.close_after_reply = 0; + server.primary->flag.close_asap = 0; + server.primary->flag.authenticated = 1; + server.primary->last_interaction = server.unixtime; + server.repl_state = REPL_STATE_CONNECTED; + server.repl_down_since = 0; - /* Fire the primary link modules event. */ - moduleFireServerEvent(VALKEYMODULE_EVENT_PRIMARY_LINK_CHANGE, VALKEYMODULE_SUBEVENT_PRIMARY_LINK_UP, NULL); - } + /* Fire the primary link modules event. */ + moduleFireServerEvent(VALKEYMODULE_EVENT_PRIMARY_LINK_CHANGE, VALKEYMODULE_SUBEVENT_PRIMARY_LINK_UP, NULL); } /* Replication: Replica side. @@ -4642,38 +4391,34 @@ void establishSourceConnection(replicationLink *link) { * This function is called when successfully setup a partial resynchronization * so the stream of data that we'll receive will start from where this * primary left. */ -void replicationResurrectCachedPrimary(replicationLink *link) { - serverAssert(link == server.primary); - link->client = server.cached_primary; +void replicationResurrectCachedPrimary(connection *conn) { + server.primary = server.cached_primary; server.cached_primary = NULL; + server.primary->conn = conn; - /* The client takes ownership of the connection now. */ - link->client->conn = link->transfer_s; - link->transfer_s = NULL; - - establishSourceConnection(link); + establishPrimaryConnection(); /* Re-add to the list of clients. */ - linkClient(link->client); - replicationSteadyStateInit(link); + linkClient(server.primary); + replicationSteadyStateInit(); } /* Replication: Replica side. * Prepare replica to steady state. * prerequisite: server.primary is already initialized and linked in client list. */ -void replicationSteadyStateInit(replicationLink *link) { - if (connSetReadHandler(link->client->conn, readQueryFromClient)) { +void replicationSteadyStateInit(void) { + if (connSetReadHandler(server.primary->conn, readQueryFromClient)) { serverLog(LL_WARNING, "Error resurrecting the cached primary, impossible to add the readable handler: %s", strerror(errno)); - freeClientAsync(link->client); /* Close ASAP. */ + freeClientAsync(server.primary); /* Close ASAP. */ } /* We may also need to install the write handler as well if there is * pending data in the write buffers. */ - if (clientHasPendingReplies(link->client)) { - if (connSetWriteHandler(link->client->conn, sendReplyToClient)) { + if (clientHasPendingReplies(server.primary)) { + if (connSetWriteHandler(server.primary->conn, sendReplyToClient)) { serverLog(LL_WARNING, "Error resurrecting the cached primary, impossible to add the writable handler: %s", strerror(errno)); - freeClientAsync(link->client); /* Close ASAP. */ + freeClientAsync(server.primary); /* Close ASAP. */ } } } @@ -4681,19 +4426,16 @@ void replicationSteadyStateInit(replicationLink *link) { /* Replication: Replica side. * Turn the provisional primary into the current primary. * This function is called after dual channel sync is finished successfully. 
*/ -void replicationResurrectProvisionalSource(replicationLink *link) { - /* Create a client, but do not initialize the read handler yet, as this replica still has a local buffer to +void replicationResurrectProvisionalPrimary(void) { + /* Create a primary client, but do not initialize the read handler yet, as this replica still has a local buffer to * drain. */ - createReplicationLinkClientWithHandler(link, link->transfer_s, link->provisional_source_state.dbid, NULL); - link->transfer_s = NULL; /* link->client now takes ownership of this connection */ - memcpy(link->client->repl_data->replid, link->provisional_source_state.replid, sizeof(link->provisional_source_state.replid)); - link->client->repl_data->reploff = link->provisional_source_state.reploff; - link->client->repl_data->read_reploff = link->provisional_source_state.read_reploff; - if (link == server.primary) { - server.primary_repl_offset = link->client->repl_data->reploff; - memcpy(server.replid, link->client->repl_data->replid, sizeof(link->client->repl_data->replid)); - } - establishSourceConnection(link); + replicationCreatePrimaryClientWithHandler(server.repl_transfer_s, server.repl_provisional_primary.dbid, NULL); + memcpy(server.primary->repl_data->replid, server.repl_provisional_primary.replid, sizeof(server.repl_provisional_primary.replid)); + server.primary->repl_data->reploff = server.repl_provisional_primary.reploff; + server.primary->repl_data->read_reploff = server.repl_provisional_primary.read_reploff; + server.primary_repl_offset = server.primary->repl_data->reploff; + memcpy(server.replid, server.primary->repl_data->replid, sizeof(server.primary->repl_data->replid)); + establishPrimaryConnection(); } /* ------------------------- MIN-REPLICAS-TO-WRITE --------------------------- */ @@ -4720,7 +4462,7 @@ void refreshGoodReplicasCount(void) { /* return true if status of good replicas is OK. otherwise false */ int checkGoodReplicasStatus(void) { - return server.primary || /* not a primary status should be OK */ + return server.primary_host || /* not a primary status should be OK */ !server.repl_min_replicas_max_lag || /* Min replica max lag not configured */ !server.repl_min_replicas_to_write || /* Min replica to write not configured */ server.repl_good_replicas_count >= server.repl_min_replicas_to_write; /* check if we have enough replicas */ @@ -4813,7 +4555,7 @@ void waitCommand(client *c) { long numreplicas, ackreplicas; long long offset = getClientWriteOffset(c); - if (server.primary) { + if (server.primary_host) { addReplyError( c, "WAIT cannot be used with replica instances. Please also note that if a replica is configured to be " "writable (which is not the default) writes to replicas are just local and are not propagated."); @@ -4851,7 +4593,7 @@ void waitaofCommand(client *c) { if (getPositiveLongFromObjectOrReply(c, c->argv[2], &numreplicas, NULL) != C_OK) return; if (getTimeoutFromObjectOrReply(c, c->argv[3], &timeout, UNIT_MILLISECONDS) != C_OK) return; - if (server.primary) { + if (server.primary_host) { addReplyError(c, "WAITAOF cannot be used with replica instances. 
Please also note that writes to replicas are " "just local and are not propagated."); return; @@ -4972,9 +4714,9 @@ void processClientsWaitingReplicas(void) { long long replicationGetReplicaOffset(void) { long long offset = 0; - if (server.primary != NULL) { - if (server.primary->client) { - offset = server.primary->client->repl_data->reploff; + if (server.primary_host != NULL) { + if (server.primary) { + offset = server.primary->repl_data->reploff; } else if (server.cached_primary) { offset = server.cached_primary->repl_data->reploff; } @@ -4998,48 +4740,44 @@ void replicationCron(void) { updateFailoverStatus(); /* Non blocking connection timeout? */ - listNode *ln; - listIter li; - listRewind(server.replication_links, &li); - while ((ln = listNext(&li))) { - replicationLink *link = (replicationLink *)ln->value; - if ((link->state == REPL_STATE_CONNECTING || replicaIsInHandshakeState(link)) && - (time(NULL) - link->transfer_lastio) > server.repl_timeout) { - serverLog(LL_WARNING, "Timeout connecting to %s...", replicationGetNameForLogs(link)); - cancelReplicationHandshake(link, 1); - } - - /* Bulk transfer I/O timeout? */ - if (link && link->state == REPL_STATE_TRANSFER && - (time(NULL) - link->transfer_lastio) > server.repl_timeout) { - serverLog(LL_WARNING, "Timeout receiving bulk data from %s... If the problem persists try to set the " - "'repl-timeout' parameter in valkey.conf to a larger value.", replicationGetNameForLogs(link)); - cancelReplicationHandshake(link, 1); - } + if (server.primary_host && (server.repl_state == REPL_STATE_CONNECTING || replicaIsInHandshakeState()) && + (time(NULL) - server.repl_transfer_lastio) > server.repl_timeout) { + serverLog(LL_WARNING, "Timeout connecting to the PRIMARY..."); + cancelReplicationHandshake(1); + } - /* Timed out primary when we are an already connected replica? */ - if (link && link->state == REPL_STATE_CONNECTED && - (time(NULL) - link->client->last_interaction) > server.repl_timeout) { - serverLog(LL_WARNING, "%s timeout: no data nor PING received...", replicationGetNameForLogs(link)); - freeClient(link->client); /* free client will attempt reconnect */ - } + /* Bulk transfer I/O timeout? */ + if (server.primary_host && server.repl_state == REPL_STATE_TRANSFER && + (time(NULL) - server.repl_transfer_lastio) > server.repl_timeout) { + serverLog(LL_WARNING, "Timeout receiving bulk data from PRIMARY... If the problem persists try to set the " + "'repl-timeout' parameter in valkey.conf to a larger value."); + cancelReplicationHandshake(1); + } - /* Check if we should connect to a replication source */ - if (link && link->state == REPL_STATE_CONNECT) { - serverLog(LL_NOTICE, "Connecting to %s %s:%d", replicationGetNameForLogs(link), link->host, link->port); - connectReplicationLink(link); - } + /* Timed out primary when we are an already connected replica? */ + if (server.primary_host && server.repl_state == REPL_STATE_CONNECTED && + (time(NULL) - server.primary->last_interaction) > server.repl_timeout) { + serverLog(LL_WARNING, "PRIMARY timeout: no data nor PING received..."); + freeClient(server.primary); + } - /* Send ACK to replication sources from time to time. - * Note that we do not send periodic acks to replication sources that don't - * support PSYNC and replication offsets. 
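 *
 * (Illustrative note.) These periodic ACKs feed the primary's replica lag
 * accounting, so configuration such as the following in valkey.conf
 * (example values) relies on them being sent from this cron:
 *
 *     min-replicas-to-write 1
 *     min-replicas-max-lag 10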
*/ - if (link && link->client && !(link->client->flag.pre_psync)) replicationSendAck(link); + /* Check if we should connect to a PRIMARY */ + if (server.repl_state == REPL_STATE_CONNECT) { + serverLog(LL_NOTICE, "Connecting to PRIMARY %s:%d", server.primary_host, server.primary_port); + connectWithPrimary(); } + /* Send ACK to primary from time to time. + * Note that we do not send periodic acks to primary that don't + * support PSYNC and replication offsets. */ + if (server.primary_host && server.primary && !(server.primary->flag.pre_psync)) replicationSendAck(server.primary); + /* If we have attached replicas, PING them from time to time. * So replicas can implement an explicit timeout to primaries, and will * be able to detect a link disconnection even if the TCP connection * will not actually go down. */ + listIter li; + listNode *ln; robj *ping_argv[1]; /* First, send PING according to ping_replica_period. */ @@ -5126,7 +4864,7 @@ void replicationCron(void) { * backlog, in order to reply to PSYNC queries if they are turned into * primaries after a failover. */ if (listLength(server.replicas) == 0 && server.repl_backlog_time_limit && server.repl_backlog && - server.primary == NULL) { + server.primary_host == NULL) { time_t idle = server.unixtime - server.repl_no_replicas_since; if (idle > server.repl_backlog_time_limit) { @@ -5201,7 +4939,7 @@ int shouldStartChildReplication(int *mincapa_out, int *req_out, slotBitmap slot_ /* Get first replica's requirements */ req = replica->repl_data->replica_req; memcpy(slot_bitmap, replica->repl_data->slot_bitmap, sizeof(slotBitmap)); - } else if (req != replica->repl_data->replica_req) { + } else if (req != replica->repl_data->replica_req || slotBitmapCompare(slot_bitmap, replica->repl_data->slot_bitmap) != 0) { /* Skip replicas that don't match */ continue; } @@ -5231,7 +4969,6 @@ void replicationStartPendingFork(void) { int mincapa = -1; int req = -1; slotBitmap slot_bitmap; - slotBitmapSetAll(slot_bitmap); if (shouldStartChildReplication(&mincapa, &req, slot_bitmap)) { /* Start the BGSAVE. The called function may start a @@ -5376,7 +5113,7 @@ void failoverCommand(client *c) { return; } - if (server.primary) { + if (server.primary_host) { addReplyError(c, "FAILOVER is not valid when server is a replica."); return; } diff --git a/src/script.c b/src/script.c index a43de5c7af..a8e5b18eb9 100644 --- a/src/script.c +++ b/src/script.c @@ -51,7 +51,7 @@ static void exitScriptTimedoutMode(scriptRunCtx *run_ctx) { run_ctx->flags &= ~SCRIPT_TIMEDOUT; blockingOperationEnds(); /* if we are a replica and we have an active primary, set it for continue processing */ - if (server.primary && server.primary->client) queueClientForReprocessing(server.primary->client); + if (server.primary_host && server.primary) queueClientForReprocessing(server.primary); } static void enterScriptTimedoutMode(scriptRunCtx *run_ctx) { @@ -137,7 +137,7 @@ int scriptPrepareForRun(scriptRunCtx *run_ctx, int client_allow_oom = !!(caller->flag.allow_oom); int running_stale = - server.primary && server.primary->state != REPL_STATE_CONNECTED && server.repl_serve_stale_data == 0; + server.primary_host && server.repl_state != REPL_STATE_CONNECTED && server.repl_serve_stale_data == 0; int obey_client = mustObeyClient(caller); if (!(script_flags & SCRIPT_FLAG_EVAL_COMPAT_MODE)) { @@ -158,7 +158,7 @@ int scriptPrepareForRun(scriptRunCtx *run_ctx, * 1. we are not a readonly replica * 2. no disk error detected * 3. 
command is not `fcall_ro`/`eval[sha]_ro` */ - if (server.primary && server.repl_replica_ro && !obey_client) { + if (server.primary_host && server.repl_replica_ro && !obey_client) { addReplyError(caller, "-READONLY Can not run script with write flag on readonly replica"); return C_ERR; } @@ -375,7 +375,7 @@ static int scriptVerifyWriteCommandAllow(scriptRunCtx *run_ctx, char **err) { * of this script. */ int deny_write_type = writeCommandsDeniedByDiskError(); - if (server.primary && server.repl_replica_ro && !mustObeyClient(run_ctx->original_client)) { + if (server.primary_host && server.repl_replica_ro && !mustObeyClient(run_ctx->original_client)) { *err = sdsdup(shared.roreplicaerr->ptr); return C_ERR; } @@ -501,12 +501,12 @@ int scriptSetRepl(scriptRunCtx *run_ctx, int repl) { } static int scriptVerifyAllowStale(client *c, sds *err) { - if (!server.primary) { + if (!server.primary_host) { /* Not a replica, stale is irrelevant */ return C_OK; } - if (server.primary->state == REPL_STATE_CONNECTED) { + if (server.repl_state == REPL_STATE_CONNECTED) { /* Connected to replica, stale is irrelevant */ return C_OK; } diff --git a/src/server.c b/src/server.c index 697ce48013..ea77cc1312 100644 --- a/src/server.c +++ b/src/server.c @@ -221,7 +221,7 @@ void serverLogRaw(int level, const char *msg) { } else if (pid != server.pid) { role_index = 1; /* RDB / AOF writing child. */ } else { - role_index = (server.primary ? 2 : 3); /* Replica or Primary. */ + role_index = (server.primary_host ? 2 : 3); /* Replica or Primary. */ } switch (server.log_format) { case LOG_FORMAT_LOGFMT: @@ -901,7 +901,7 @@ int clientsCronResizeQueryBuffer(client *c) { /* 1) Query is idle for a long time. */ size_t remaining = sdslen(c->querybuf) - c->qb_pos; if (!c->flag.replication_source && !remaining) { - /* If the client is not a primary and no data is pending, + /* If the client is not for replication and no data is pending, * The client can safely use the shared query buffer in the next read - free the client's querybuf. */ sdsfree(c->querybuf); /* By setting the querybuf to NULL, the client will use the shared query buffer in the next read. @@ -2223,12 +2223,21 @@ void initServerConfig(void) { appendServerSaveParams(60, 10000); /* save after 1 minute and 10000 changes */ /* Replication related */ + server.primary_host = NULL; + server.primary_port = 6379; server.primary = NULL; server.cached_primary = NULL; + server.primary_initial_offset = -1; + server.repl_state = REPL_STATE_NONE; + server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_STATE_NONE; + server.repl_transfer_tmpfile = NULL; + server.repl_transfer_fd = -1; + server.repl_transfer_s = NULL; server.repl_syncio_timeout = CONFIG_REPL_SYNCIO_TIMEOUT; server.repl_down_since = 0; /* Never connected, repl is down since EVER. */ server.primary_repl_offset = 0; server.fsynced_reploff_pending = 0; + server.rdb_client_id = -1; server.loading_process_events_interval_ms = LOADING_PROCESS_EVENTS_INTERVAL_DEFAULT; server.loading_rio = NULL; @@ -2339,7 +2348,7 @@ int restartServer(client *c, int flags, mstime_t delay) { * depending on current role. */ int setOOMScoreAdj(int process_class) { - if (process_class == -1) process_class = (server.primary ? CONFIG_OOM_REPLICA : CONFIG_OOM_PRIMARY); + if (process_class == -1) process_class = (server.primary_host ? 
CONFIG_OOM_REPLICA : CONFIG_OOM_PRIMARY); serverAssert(process_class >= 0 && process_class < CONFIG_OOM_COUNT); @@ -2751,7 +2760,6 @@ void initServer(void) { server.reply_buffer_peak_reset_time = REPLY_BUFFER_DEFAULT_PEAK_RESET_TIME; server.reply_buffer_resizing_enabled = 1; server.client_mem_usage_buckets = NULL; - server.replication_links = listCreate(); resetReplicationBuffer(); /* Make sure the locale is set on startup based on the config file. */ @@ -3349,7 +3357,7 @@ struct serverCommand *lookupCommandOrOriginal(robj **argv, int argc) { return cmd; } -/* Commands arriving from the primary client or AOF client, should never be rejected. */ +/* Commands arriving from a replication source or AOF client, should never be rejected. */ int mustObeyClient(client *c) { return c->id == CLIENT_ID_AOF || c->flag.replication_source; } @@ -4103,7 +4111,7 @@ int processCommand(client *c) { } } - if (!server.cluster_enabled && c->capa & CLIENT_CAPA_REDIRECT && server.primary && !obey_client && + if (!server.cluster_enabled && c->capa & CLIENT_CAPA_REDIRECT && server.primary_host && !obey_client && (is_write_command || (is_read_command && !c->flag.readonly))) { if (server.failover_state == FAILOVER_IN_PROGRESS) { /* During the FAILOVER process, when conditions are met (such as @@ -4134,7 +4142,7 @@ int processCommand(client *c) { } c->duration = 0; c->cmd->rejected_calls++; - addReplyErrorSds(c, sdscatprintf(sdsempty(), "-REDIRECT %s:%d", server.primary->host, server.primary->port)); + addReplyErrorSds(c, sdscatprintf(sdsempty(), "-REDIRECT %s:%d", server.primary_host, server.primary_port)); } return C_OK; } @@ -4219,7 +4227,7 @@ int processCommand(client *c) { /* Don't accept write commands if this is a read only replica. But * accept write commands if this is our primary. */ - if (server.primary && server.repl_replica_ro && !obey_client && is_write_command) { + if (server.primary_host && server.repl_replica_ro && !obey_client && is_write_command) { rejectCommand(c, shared.roreplicaerr); return C_OK; } @@ -4240,7 +4248,7 @@ int processCommand(client *c) { /* Only allow commands with flag "t", such as INFO, REPLICAOF and so on, * when replica-serve-stale-data is no and we are a replica with a broken * link with primary. */ - if (server.primary && server.primary->state != REPL_STATE_CONNECTED && server.repl_serve_stale_data == 0 && + if (server.primary_host && server.repl_state != REPL_STATE_CONNECTED && server.repl_serve_stale_data == 0 && is_denystale_command) { rejectCommand(c, shared.primarydownerr); return C_OK; @@ -5964,14 +5972,14 @@ sds genValkeyInfoString(dict *section_dict, int all_sections, int everything) { info = sdscatprintf(info, "# Replication\r\n" "role:%s\r\n", - server.primary == NULL ? "master" : "slave"); - if (server.primary) { + server.primary_host == NULL ? 
"master" : "slave"); + if (server.primary_host) { long long replica_repl_offset = 1; long long replica_read_repl_offset = 1; - if (server.primary->client) { - replica_repl_offset = server.primary->client->repl_data->reploff; - replica_read_repl_offset = server.primary->client->repl_data->read_reploff; + if (server.primary) { + replica_repl_offset = server.primary->repl_data->reploff; + replica_read_repl_offset = server.primary->repl_data->read_reploff; } else if (server.cached_primary) { replica_repl_offset = server.cached_primary->repl_data->reploff; replica_read_repl_offset = server.cached_primary->repl_data->read_reploff; @@ -5980,32 +5988,32 @@ sds genValkeyInfoString(dict *section_dict, int all_sections, int everything) { info = sdscatprintf( info, FMTARGS( - "master_host:%s\r\n", server.primary->host, - "master_port:%d\r\n", server.primary->port, - "master_link_status:%s\r\n", (server.primary->state == REPL_STATE_CONNECTED) ? "up" : "down", - "master_last_io_seconds_ago:%d\r\n", server.primary->client ? ((int)(server.unixtime - server.primary->client->last_interaction)) : -1, - "master_sync_in_progress:%d\r\n", server.primary->state == REPL_STATE_TRANSFER, + "master_host:%s\r\n", server.primary_host, + "master_port:%d\r\n", server.primary_port, + "master_link_status:%s\r\n", (server.repl_state == REPL_STATE_CONNECTED) ? "up" : "down", + "master_last_io_seconds_ago:%d\r\n", server.primary ? ((int)(server.unixtime - server.primary->last_interaction)) : -1, + "master_sync_in_progress:%d\r\n", server.repl_state == REPL_STATE_TRANSFER, "slave_read_repl_offset:%lld\r\n", replica_read_repl_offset, "slave_repl_offset:%lld\r\n", replica_repl_offset, - "replicas_repl_buffer_size:%zu\r\n", server.primary->pending_repl_data.len, - "replicas_repl_buffer_peak:%zu\r\n", server.primary->pending_repl_data.peak)); + "replicas_repl_buffer_size:%zu\r\n", server.pending_repl_data.len, + "replicas_repl_buffer_peak:%zu\r\n", server.pending_repl_data.peak)); - if (server.primary->state == REPL_STATE_TRANSFER) { + if (server.repl_state == REPL_STATE_TRANSFER) { double perc = 0; - if (server.primary->transfer_size) { - perc = ((double)server.primary->transfer_read / server.primary->transfer_size) * 100; + if (server.repl_transfer_size) { + perc = ((double)server.repl_transfer_read / server.repl_transfer_size) * 100; } info = sdscatprintf( info, FMTARGS( - "master_sync_total_bytes:%lld\r\n", (long long)server.primary->transfer_size, - "master_sync_read_bytes:%lld\r\n", (long long)server.primary->transfer_read, - "master_sync_left_bytes:%lld\r\n", (long long)(server.primary->transfer_size - server.primary->transfer_read), + "master_sync_total_bytes:%lld\r\n", (long long)server.repl_transfer_size, + "master_sync_read_bytes:%lld\r\n", (long long)server.repl_transfer_read, + "master_sync_left_bytes:%lld\r\n", (long long)(server.repl_transfer_size - server.repl_transfer_read), "master_sync_perc:%.2f\r\n", perc, - "master_sync_last_io_seconds_ago:%d\r\n", (int)(server.unixtime - server.primary->transfer_lastio))); + "master_sync_last_io_seconds_ago:%d\r\n", (int)(server.unixtime - server.repl_transfer_lastio))); } - if (server.primary->state != REPL_STATE_CONNECTED) { + if (server.repl_state != REPL_STATE_CONNECTED) { info = sdscatprintf(info, "master_link_down_since_seconds:%jd\r\n", server.repl_down_since ? 
(intmax_t)(server.unixtime - server.repl_down_since) : -1); } @@ -6840,7 +6848,7 @@ int serverIsSupervised(int mode) { } int iAmPrimary(void) { - return ((!server.cluster_enabled && server.primary == NULL) || + return ((!server.cluster_enabled && server.primary_host == NULL) || (server.cluster_enabled && clusterNodeIsPrimary(getMyClusterNode()))); } @@ -7123,7 +7131,7 @@ __attribute__((weak)) int main(int argc, char **argv) { } if (server.supervised_mode == SUPERVISED_SYSTEMD) { - if (!server.primary) { + if (!server.primary_host) { serverCommunicateSystemd("STATUS=Ready to accept connections\n"); } else { serverCommunicateSystemd( diff --git a/src/server.h b/src/server.h index 34c8e9ba41..e1a8a1d503 100644 --- a/src/server.h +++ b/src/server.h @@ -153,8 +153,6 @@ struct hdr_histogram; #else #define CONFIG_ACTIVE_DEFRAG_DEFAULT 1 #endif -#define CLUSTER_SLOT_MASK_BITS 14 /* Number of bits used for slot id. */ -#define CLUSTER_SLOTS (1 << CLUSTER_SLOT_MASK_BITS) /* Total number of slots in cluster mode, which is 16384. */ /* Bucket sizes for client eviction pools. Each bucket stores clients with * memory usage of up to twice the size of the bucket below it. */ @@ -375,11 +373,12 @@ typedef enum blocking_type { /* Client classes for client limits, currently used only for * the max-client-output-buffer limit implementation. */ -#define CLIENT_TYPE_NORMAL 0 /* Normal req-reply clients + MONITORs */ -#define CLIENT_TYPE_REPLICA 1 /* Replicas. */ -#define CLIENT_TYPE_PUBSUB 2 /* Clients subscribed to PubSub channels. */ -#define CLIENT_TYPE_PRIMARY 3 /* Primary. */ -#define CLIENT_TYPE_COUNT 4 /* Total number of client types. */ +#define CLIENT_TYPE_NORMAL 0 /* Normal req-reply clients + MONITORs */ +#define CLIENT_TYPE_REPLICA 1 /* Replicas. */ +#define CLIENT_TYPE_PUBSUB 2 /* Clients subscribed to PubSub channels. */ +#define CLIENT_TYPE_PRIMARY 3 /* Primary. */ +#define CLIENT_TYPE_SLOT_MIGRATION 4 /* Slot migration client. */ +#define CLIENT_TYPE_COUNT 5 /* Total number of client types. */ #define CLIENT_TYPE_OBUF_COUNT 3 /* Number of clients to expose to output \ buffer configuration. Just the first \ three: normal, replica, pubsub. */ @@ -388,6 +387,7 @@ typedef enum blocking_type { * what to do next. */ typedef enum { REPL_STATE_NONE = 0, /* No active replication */ + REPL_STATE_ERROR, /* Error in replication. */ REPL_STATE_CONNECT, /* Must connect to primary */ REPL_STATE_CONNECTING, /* Connecting to primary */ /* --- Handshake states, must be ordered --- */ @@ -404,7 +404,6 @@ typedef enum { /* --- End of handshake states --- */ REPL_STATE_TRANSFER, /* Receiving .rdb from primary */ REPL_STATE_CONNECTED, /* Connected to primary */ - REPL_STATE_CANCELLED, /* Replication was cancelled, and this link is pending deletion. */ } repl_state; /* Replica rdb-channel replication state. Used in server.repl_rdb_channel_state for @@ -1016,7 +1015,7 @@ typedef enum { } clientIOState; typedef struct ClientFlags { - uint64_t replication_source : 1; /* This client is a replication source (i.e. 
primary or slot migration source) */ + uint64_t primary : 1; /* This client is a primary */ uint64_t replica : 1; /* This client is a replica */ uint64_t monitor : 1; /* This client is a replica monitor, see MONITOR */ uint64_t multi : 1; /* This client is in a MULTI context */ @@ -1029,7 +1028,7 @@ typedef struct ClientFlags { uint64_t close_asap : 1; /* Close this client ASAP */ uint64_t unix_socket : 1; /* Client connected via Unix domain socket */ uint64_t dirty_exec : 1; /* EXEC will fail for errors while queueing */ - uint64_t primary_force_reply : 1; /* Queue replies even if is primary */ + uint64_t replication_force_reply : 1; /* Queue replies even if is primary */ uint64_t force_aof : 1; /* Force AOF propagation of current cmd. */ uint64_t force_repl : 1; /* Force replication of current cmd. */ uint64_t pre_psync : 1; /* Instance don't understand PSYNC. */ @@ -1092,7 +1091,9 @@ typedef struct ClientFlags { * flag, we won't cache the primary in freeClient. */ uint64_t fake : 1; /* This is a fake client without a real connection. */ uint64_t import_source : 1; /* This client is importing data to server and can visit expired key. */ - uint64_t reserved : 4; /* Reserved for future use */ + uint64_t replication_source : 1; /* This client is a replication source (i.e. primary or slot migration). */ + uint64_t slot_migration_source : 1; /* This client is a slot migration source. */ + uint64_t reserved : 3; /* Reserved for future use */ } ClientFlags; typedef struct ClientPubSubData { @@ -1108,9 +1109,11 @@ typedef struct ClientPubSubData { context of client side caching. */ } ClientPubSubData; +#define CLUSTER_SLOT_MASK_BITS 14 /* Number of bits used for slot id. */ +#define CLUSTER_SLOTS (1 << CLUSTER_SLOT_MASK_BITS) /* Total number of slots in cluster mode, which is 16384. */ + typedef unsigned char slotBitmap[CLUSTER_SLOTS / 8]; -typedef struct replicationLink replicationLink; typedef struct ClientReplicationData { int repl_state; /* Replication state if this is a replica. */ int repl_start_cmd_stream_on_ack; /* Install replica write handler on first ACK. */ @@ -1142,7 +1145,6 @@ typedef struct ClientReplicationData { size_t ref_block_pos; /* Access position of referenced buffer block, i.e. the next offset to send. */ slotBitmap slot_bitmap; /* The slot range this replica is replicating for. */ - replicationLink *link; /* The replication link owning this. */ } ClientReplicationData; typedef struct ClientModuleData { @@ -1424,7 +1426,7 @@ typedef enum { * top-level primary. */ typedef struct rdbSaveInfo { /* Used saving and loading. */ - int repl_stream_db; /* DB to select in server.primary->client. */ + int repl_stream_db; /* DB to select in server.primary client. */ /* Used only loading. */ int repl_id_is_set; /* True if repl_id field is set. */ @@ -1546,43 +1548,6 @@ typedef enum childInfoType { CHILD_INFO_TYPE_MODULE_COW_SIZE } childInfoType; -typedef struct slotRange { - int start; - int end; -} slotRange; - -typedef struct replicationLink { - int protected; /* Used to protect link from destruction during background loading. */ - int state; /* State of the sync operation overall. */ - int rdb_channel_state; - client *client; - client *snapshot_load_client; /* client used for full sync when AOF format is used. 
*/ - sds host; - int port; - connection *transfer_s; /* Replica -> Primary SYNC connection */ - connection *rdb_transfer_s; /* Primary FULL SYNC connection (RDB download) */ - uint64_t rdb_client_id; /* Rdb client id as it defined at primary side */ - /* The following two fields is where we store primary PSYNC replid/offset - * while the PSYNC is in progress. At the end we'll copy the fields into - * the server->primary client structure. */ - char replid[CONFIG_RUN_ID_SIZE + 1]; /* Primary PSYNC runid. */ - long long initial_offset; /* Primary PSYNC offset. */ - off_t transfer_size; /* Size of RDB to read from primary during sync. */ - off_t transfer_read; /* Amount of RDB read from primary during sync. */ - off_t transfer_last_fsync_off; /* Offset when we fsync-ed last time. */ - int transfer_fd; /* Replica -> Primary SYNC temp file descriptor */ - char *transfer_tmpfile; /* Replica-> Primary SYNC temp file name */ - time_t transfer_lastio; /* Unix time of the latest read, for timeout */ - struct { - char replid[CONFIG_RUN_ID_SIZE + 1]; - long long reploff; - long long read_reploff; - int dbid; - } provisional_source_state; /* Information about the provisional state (after RDB) for the source node, stored during dual channel sync. */ - replDataBuf pending_repl_data; /* Replication data buffer for dual-channel-replication */ - unsigned char slot_bitmap[CLUSTER_SLOTS/8]; /* Slot range used for slot import. */ -} replicationLink; - struct valkeyServer { /* General */ pid_t pid; /* Main process pid. */ @@ -1743,6 +1708,7 @@ struct valkeyServer { long long stat_net_input_bytes; /* Bytes read from network. */ long long stat_net_output_bytes; /* Bytes written to network. */ long long stat_net_repl_input_bytes; /* Bytes read during replication, added to stat_net_input_bytes in 'info'. */ + long long stat_net_slot_migration_input_bytes; /* Bytes read during replication, added to stat_net_input_bytes in 'info'. */ /* Bytes written during replication, added to stat_net_output_bytes in 'info'. */ long long stat_net_repl_output_bytes; size_t stat_current_cow_peak; /* Peak size of copy on write bytes. */ @@ -1943,6 +1909,7 @@ struct valkeyServer { int repl_ping_replica_period; /* Primary pings the replica every N seconds */ replBacklog *repl_backlog; /* Replication backlog for partial syncs */ long long repl_backlog_size; /* Backlog circular buffer size */ + replDataBuf pending_repl_data; /* Replication data buffer for dual-channel-replication */ time_t repl_backlog_time_limit; /* Time without replicas after the backlog gets released. */ time_t repl_no_replicas_since; /* We have no replicas since that time. @@ -1966,28 +1933,52 @@ struct valkeyServer { list *repl_buffer_blocks; /* Replication buffers blocks list * (serving replica clients and repl backlog) */ /* Replication (replica) */ - char *primary_user; /* AUTH with this user and primary_auth with primary */ - sds primary_auth; /* AUTH with this password with primary */ - int repl_timeout; /* Timeout after N seconds of primary idle */ - replicationLink *primary; /* Replication link for the primary. */ - list *replication_links; /* List of all current replication links. */ - client *cached_primary; /* Cached primary to be reused for PSYNC. */ - int repl_syncio_timeout; /* Timeout for synchronous I/O calls */ - int repl_serve_stale_data; /* Serve stale data when link is down? */ - int repl_replica_ro; /* Replica is read only? */ - int repl_replica_ignore_maxmemory; /* If true replicas do not evict. 
*/ - time_t repl_down_since; /* Unix time at which link with primary went down */ - int repl_disable_tcp_nodelay; /* Disable TCP_NODELAY after SYNC? */ - int replica_priority; /* Reported in INFO and used by Sentinel. */ - int replica_announced; /* If true, replica is announced by Sentinel */ - int replica_announce_port; /* Give the primary this listening port. */ - char *replica_announce_ip; /* Give the primary this ip address. */ - int propagation_error_behavior; /* Configures the behavior of the replica - * when it receives an error on the replication stream */ - int repl_ignore_disk_write_error; /* Configures whether replicas panic when unable to - * persist writes to AOF. */ - int repl_replica_lazy_flush; /* Lazy FLUSHALL before loading DB? */ - rio *loading_rio; /* Pointer to the rio object currently used for loading data. */ + char *primary_user; /* AUTH with this user and primary_auth with primary */ + sds primary_auth; /* AUTH with this password with primary */ + char *primary_host; /* Hostname of primary */ + int primary_port; /* Port of primary */ + int repl_timeout; /* Timeout after N seconds of primary idle */ + client *primary; /* Client that is primary for this replica */ + uint64_t rdb_client_id; /* Rdb client id as it defined at primary side */ + struct { + connection *conn; + char replid[CONFIG_RUN_ID_SIZE + 1]; + long long reploff; + long long read_reploff; + int dbid; + } repl_provisional_primary; + client *cached_primary; /* Cached primary to be reused for PSYNC. */ + rio *loading_rio; /* Pointer to the rio object currently used for loading data. */ + int repl_syncio_timeout; /* Timeout for synchronous I/O calls */ + int repl_state; /* Replication status if the instance is a replica */ + int repl_rdb_channel_state; /* State of the replica's rdb channel during dual-channel-replication */ + off_t repl_transfer_size; /* Size of RDB to read from primary during sync. */ + off_t repl_transfer_read; /* Amount of RDB read from primary during sync. */ + off_t repl_transfer_last_fsync_off; /* Offset when we fsync-ed last time. */ + connection *repl_transfer_s; /* Replica -> Primary SYNC connection */ + connection *repl_rdb_transfer_s; /* Primary FULL SYNC connection (RDB download) */ + int repl_transfer_fd; /* Replica -> Primary SYNC temp file descriptor */ + char *repl_transfer_tmpfile; /* Replica-> Primary SYNC temp file name */ + time_t repl_transfer_lastio; /* Unix time of the latest read, for timeout */ + int repl_serve_stale_data; /* Serve stale data when link is down? */ + int repl_replica_ro; /* Replica is read only? */ + int repl_replica_ignore_maxmemory; /* If true replicas do not evict. */ + time_t repl_down_since; /* Unix time at which link with primary went down */ + int repl_disable_tcp_nodelay; /* Disable TCP_NODELAY after SYNC? */ + int replica_priority; /* Reported in INFO and used by Sentinel. */ + int replica_announced; /* If true, replica is announced by Sentinel */ + int replica_announce_port; /* Give the primary this listening port. */ + char *replica_announce_ip; /* Give the primary this ip address. */ + int propagation_error_behavior; /* Configures the behavior of the replica + * when it receives an error on the replication stream */ + int repl_ignore_disk_write_error; /* Configures whether replicas panic when unable to + * persist writes to AOF. */ + /* The following two fields is where we store primary PSYNC replid/offset + * while the PSYNC is in progress. At the end we'll copy the fields into + * the server->primary client structure. 
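 *
 * For illustration, these are filled from the full-resync reply sent by the
 * primary during the handshake (replid and offset below are placeholders):
 *
 *     +FULLRESYNC <40-char replid> <offset>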
*/ + char primary_replid[CONFIG_RUN_ID_SIZE + 1]; /* Primary PSYNC runid. */ + long long primary_initial_offset; /* Primary PSYNC offset. */ + int repl_replica_lazy_flush; /* Lazy FLUSHALL before loading DB? */ /* Import Mode */ int import_mode; /* If true, server is in import mode and forbid expiration and eviction. */ /* Synchronous replication. */ @@ -2625,12 +2616,12 @@ void dictVanillaFree(void *val); #define READ_FLAGS_ERROR_BIG_BULK_COUNT (1 << 6) #define READ_FLAGS_ERROR_MBULK_UNEXPECTED_CHARACTER (1 << 7) #define READ_FLAGS_ERROR_MBULK_INVALID_BULK_LEN (1 << 8) -#define READ_FLAGS_ERROR_UNEXPECTED_INLINE_FROM_PRIMARY (1 << 9) +#define READ_FLAGS_ERROR_UNEXPECTED_INLINE_FROM_REPLICATION_SOURCE (1 << 9) #define READ_FLAGS_ERROR_UNBALANCED_QUOTES (1 << 10) #define READ_FLAGS_INLINE_ZERO_QUERY_LEN (1 << 11) #define READ_FLAGS_PARSING_NEGATIVE_MBULK_LEN (1 << 12) #define READ_FLAGS_PARSING_COMPLETED (1 << 13) -#define READ_FLAGS_PRIMARY (1 << 14) +#define READ_FLAGS_REPLICATION_SOURCE (1 << 14) #define READ_FLAGS_DONT_PARSE (1 << 15) #define READ_FLAGS_AUTH_REQUIRED (1 << 16) @@ -2767,9 +2758,6 @@ void ioThreadWriteToClient(void *data); int canParseCommand(client *c); int processIOThreadsReadDone(void); int processIOThreadsWriteDone(void); -replicationLink *createReplicationLink(char *host, int port, slotBitmap slot_bitmap); -int connectReplicationLink(replicationLink *link); -int freeReplicationLink(replicationLink *link); /* logreqres.c - logging of requests and responses */ void reqresReset(client *c, int free_buf); @@ -2923,7 +2911,7 @@ void replicationFeedMonitors(client *c, list *monitors, int dictid, robj **argv, void updateReplicasWaitingBgsave(int bgsaveerr, int type); void replicationCron(void); void replicationStartPendingFork(void); -void replicationHandleSourceDisconnection(replicationLink *link); +void replicationHandlePrimaryDisconnection(void); void replicationCachePrimary(client *c); void resizeReplicationBacklog(void); void replicationSetPrimary(char *ip, int port, int full_sync_required); @@ -2934,7 +2922,7 @@ void processClientsWaitingReplicas(void); void unblockClientWaitingReplicas(client *c); int replicationCountAcksByOffset(long long offset); int replicationCountAOFAcksByOffset(long long offset); -void replicationSendNewlineToConnectedLinks(void); +void replicationSendNewlineToPrimary(void); long long replicationGetReplicaOffset(void); char *replicationGetReplicaName(client *c); long long getPsyncInitialOffset(void); @@ -2960,6 +2948,8 @@ int sendCurrentOffsetToReplica(client *replica); void addRdbReplicaToPsyncWait(client *replica); void initClientReplicationData(client *c); void freeClientReplicationData(client *c); +void replicationSendAck(client *c); +int replicationProceedWithHandshake(connection *conn, int curr_state, slotBitmap slot_bitmap); /* Generic persistence functions */ void startLoadingFile(size_t size, char *filename, int rdbflags); diff --git a/src/valkeymodule.h b/src/valkeymodule.h index 1d99d2ff7a..8a2090fcca 100644 --- a/src/valkeymodule.h +++ b/src/valkeymodule.h @@ -221,11 +221,13 @@ typedef struct ValkeyModuleStreamID { #define VALKEYMODULE_CTX_FLAGS_ASYNC_LOADING (1 << 23) /* Valkey is starting. */ #define VALKEYMODULE_CTX_FLAGS_SERVER_STARTUP (1 << 24) +/* The command was sent via slot migration link. */ +#define VALKEYMODULE_CTX_FLAGS_IMPORTING_SLOT (1 << 25) /* Next context flag, must be updated when adding new flags above! This flag should not be used directly by the module. * Use ValkeyModule_GetContextFlagsAll instead. 
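 *
 * A module could test the new flag roughly like this (sketch; assumes the
 * usual context-flags accessor):
 *
 *     int flags = ValkeyModule_GetContextFlags(ctx);
 *     if (flags & VALKEYMODULE_CTX_FLAGS_IMPORTING_SLOT) { ... }  // arrived via a slot migration link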
*/ -#define _VALKEYMODULE_CTX_FLAGS_NEXT (1 << 25) +#define _VALKEYMODULE_CTX_FLAGS_NEXT (1 << 26) /* Keyspace changes notification classes. Every class is associated with a * character for configuration purposes. From db1829590359d65955ddf4bca5c1f213649f8aa7 Mon Sep 17 00:00:00 2001 From: Jacob Murphy Date: Sat, 18 Jan 2025 00:49:05 +0000 Subject: [PATCH 05/18] Get tests passing Signed-off-by: Jacob Murphy --- src/aof.c | 4 ++-- src/cluster_legacy.c | 38 ++++++++++++++++++++++---------------- src/cluster_legacy.h | 7 +++++-- src/replication.c | 3 +++ 4 files changed, 32 insertions(+), 20 deletions(-) diff --git a/src/aof.c b/src/aof.c index dbebc92e63..ebdf6f1b71 100644 --- a/src/aof.c +++ b/src/aof.c @@ -2204,7 +2204,7 @@ int rewriteAppendOnlyFileRio(rio *aof, slotBitmap slot_bitmap) { kvstoreIterator *kvs_it = NULL; /* Record timestamp at the beginning of rewriting AOF. */ - if (server.aof_timestamp_enabled && !slot_bitmap) { + if (server.aof_timestamp_enabled && (slot_bitmap == NULL || isSlotBitmapEmpty(slot_bitmap))) { sds ts = genAofTimestampAnnotationIfNeeded(1); if (rioWrite(aof, ts, sdslen(ts)) == 0) { sdsfree(ts); @@ -2224,7 +2224,7 @@ int rewriteAppendOnlyFileRio(rio *aof, slotBitmap slot_bitmap) { if (rioWrite(aof, selectcmd, sizeof(selectcmd) - 1) == 0) goto werr; if (rioWriteBulkLongLong(aof, j) == 0) goto werr; - if (!slot_bitmap) { + if (slot_bitmap == NULL || isSlotBitmapEmpty(slot_bitmap)) { kvs_it = kvstoreIteratorInit(db->keys); } else { kvs_it = kvstoreFilteredIteratorInit(db->keys, &shouldFilterSlot, slot_bitmap); diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index d174124f40..636a3ea930 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -2503,7 +2503,7 @@ void clusterSetNodeAsPrimary(clusterNode *n) { * The 'sender' is the node for which we received a configuration update. * Sometimes it is not actually the "Sender" of the information, like in the * case we receive the info via an UPDATE packet. */ -void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoch, unsigned char *slots) { +void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoch, slotBitmap slots) { int j; clusterNode *cur_primary = NULL, *new_primary = NULL; /* The dirty slots list is a list of slots for which we lose the ownership @@ -2572,9 +2572,13 @@ void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoc } /* Was this slot mine and it was in a paused state for slot - * migration? If so, clear the manual failover state. */ - if (server.cluster->slots[j] == myself && server.cluster->mf_end && server.cluster->mf_replica == sender) { - resetManualFailover(); + * migration? If so, mark the move as done. */ + if (server.cluster->slots[j] == myself && server.cluster->mf_end && server.cluster->mf_slots_target == sender) { + bitmapClearBit(server.cluster->mf_slots, j); + if (isSlotBitmapEmpty(server.cluster->mf_slots)) { + serverLog(LL_NOTICE, "Slot migration to node %s (%s) has finished. Unpausing myself.", server.cluster->mf_slots_target->name, server.cluster->mf_slots_target->human_nodename); + resetManualFailover(); + } } /* If the sender who claims this slot is not in the same shard, @@ -3252,7 +3256,7 @@ int clusterProcessPacket(clusterLink *link) { "primary manual failover: %lld", server.cluster->mf_primary_offset); } - /* If we are a importing a slot and the slot owner sent its offset + /* If we are importing a slot and the slot owner sent its offset * while already paused, populate the migration state. 
*/ slotMigration * curr_migration = clusterGetCurrentSlotMigration(); if (hdr->mflags[0] & CLUSTERMSG_FLAG0_PAUSED && curr_migration != NULL && @@ -3729,11 +3733,12 @@ int clusterProcessPacket(clusterLink *link) { /* Initialize the slot migration state accordingly */ resetManualFailover(); server.cluster->mf_end = now + CLUSTER_MF_TIMEOUT; - server.cluster->mf_replica = sender; + server.cluster->mf_slots_target = sender; + memcpy(server.cluster->mf_slots, hdr->data.slot_migration.msg.slot_bitmap, sizeof(slotBitmap)); /* TODO(murphyjacob4) pause subset of slots */ pauseActions(PAUSE_DURING_FAILOVER, now + (CLUSTER_MF_TIMEOUT * CLUSTER_MF_PAUSE_MULT), PAUSE_ACTIONS_CLIENT_WRITE_SET); - serverLog(LL_NOTICE, "Slot migration requested by node %.40s (%s).", sender->name, sender->human_nodename); + serverLog(LL_NOTICE, "Slot migration requested by node %.40s (%s). Pausing myself to allow slot takeover.", sender->name, sender->human_nodename); /* We need to send a ping message to the replica, as it would carry * `server.cluster->mf_primary_offset`, which means the primary paused clients * at offset `server.cluster->mf_primary_offset`, so that the replica would @@ -5281,7 +5286,7 @@ void manualFailoverCanStart(void) { * The function can be used both to initialize the manual failover state at * startup or to abort a manual failover in progress. */ void resetManualFailover(void) { - if (server.cluster->mf_replica) { + if (server.cluster->mf_replica || server.cluster->mf_slots_target) { /* We were a primary failing over, so we paused clients and related actions. * Regardless of the outcome we unpause now to allow traffic again. */ unpauseActions(PAUSE_DURING_FAILOVER); @@ -5290,6 +5295,8 @@ void resetManualFailover(void) { server.cluster->mf_can_start = 0; server.cluster->mf_replica = NULL; server.cluster->mf_primary_offset = -1; + memset(server.cluster->mf_slots, 0, sizeof(server.cluster->mf_slots)); + server.cluster->mf_slots_target = NULL; } /* If a manual failover timed out, abort it. */ @@ -7370,19 +7377,18 @@ int clusterCommandSpecial(client *c) { return 1; } if (c->argc < 5 || strcasecmp(c->argv[2]->ptr, "slotsrange")) { - addReplyError(c, "Migrate command requires at least one "); + addReplyError(c, "Migrate command requires at least one slot range"); + return 1; + } + if (c->argc % 2 == 0) { + addReplyError(c, "Invalid SLOTSRANGE, missing end slot"); return 1; } - unsigned char requested_slots[CLUSTER_SLOTS/8]; - memset(requested_slots, 0, sizeof(requested_slots)); + slotBitmap requested_slots; + memset(requested_slots, 0, sizeof(slotBitmap)); int i; clusterNode * curr_owner = NULL; for (i = 3; i + 1 < c->argc; i+=2) { - if (i > 3 && getLongLongFromObject(c->argv[i], NULL) != C_OK) { - /* If we find a non-integer in the args and we have already - * parsed >=1 slot range, we assume it is the next token. */ - break; - } int start = getSlotOrReply(c, c->argv[i]); if (start < 0) { return 1; diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index f9c6f5e5b8..0a97ca37ad 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -340,7 +340,7 @@ struct _clusterNode { char shard_id[CLUSTER_NAMELEN]; /* shard id, hex string, sha1-size */ int flags; /* CLUSTER_NODE_... */ uint64_t configEpoch; /* Last configEpoch observed for this node */ - unsigned char slots[CLUSTER_SLOTS / 8]; /* slots handled by this node */ + slotBitmap slots; /* slots handled by this node */ uint16_t *slot_info_pairs; /* Slots info represented as (start/end) pair (consecutive index). 
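 *
 * Example encoding (made-up ranges): a node serving slots 0-5460 and
 * 10000-10010 would store
 *
 *     slot_info_pairs = {0, 5460, 10000, 10010}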
*/ int slot_info_pairs_count; /* Used number of slots in slot_info_pairs */ int numslots; /* Number of slots handled by this node */ @@ -441,6 +441,9 @@ struct clusterState { or -1 if still not received. */ int mf_can_start; /* If non-zero signal that the manual failover can start requesting primary vote. */ + /* Manual failover state for slot migration */ + slotBitmap mf_slots; /* Slots in migration. */ + clusterNode *mf_slots_target; /* The following fields are used by primaries to take state on elections. */ uint64_t lastVoteEpoch; /* Epoch of the last vote granted. */ int todo_before_sleep; /* Things to do in clusterBeforeSleep(). */ @@ -458,7 +461,7 @@ struct clusterState { * the ownership transfer. Set the bit corresponding to the slot when a node * stops claiming the slot. This prevents spreading incorrect information (that * source still owns the slot) using UPDATE messages. */ - unsigned char owner_not_claiming_slot[CLUSTER_SLOTS / 8]; + slotBitmap owner_not_claiming_slot; /* Struct used for storing slot statistics, for all slots owned by the current shard. */ slotStat slot_stats[CLUSTER_SLOTS]; list *slot_migrations; /* Queue of ongoing slot migrations. */ diff --git a/src/replication.c b/src/replication.c index 90a9e90e24..9abf5cead3 100644 --- a/src/replication.c +++ b/src/replication.c @@ -3768,6 +3768,8 @@ void syncWithPrimary(connection *conn) { } else if (server.repl_state == REPL_STATE_ERROR) { goto error; } + if (server.repl_state != REPL_STATE_SEND_PSYNC) + return; } /* Try a partial resynchronization. If we don't have a cached primary @@ -3908,6 +3910,7 @@ void syncWithPrimary(connection *conn) { server.repl_transfer_tmpfile = NULL; server.repl_transfer_fd = -1; server.repl_state = REPL_STATE_CONNECT; + return; write_error: /* Handle sendCommand() errors. 
*/ serverLog(LL_WARNING, "Sending command to primary in replication handshake: %s", err); From 6e8bdb5b5b3687ee8df65ec8b039a612d51007f0 Mon Sep 17 00:00:00 2001 From: Jacob Murphy Date: Sun, 19 Jan 2025 13:12:18 +0000 Subject: [PATCH 06/18] Refactor to use dedicated sync mechanisms Signed-off-by: Jacob Murphy --- src/aof.c | 8 +- src/blocked.c | 2 +- src/cluster.c | 14 +- src/cluster.h | 5 +- src/cluster_legacy.c | 585 +++++++++++++++++++--------- src/cluster_legacy.h | 65 ++-- src/commands.def | 18 + src/commands/cluster-syncslots.json | 14 + src/db.c | 4 +- src/io_threads.c | 2 +- src/kvstore.c | 18 +- src/kvstore.h | 4 +- src/networking.c | 58 +-- src/rdb.c | 150 +++---- src/rdb.h | 2 +- src/replication.c | 372 +++++++----------- src/server.c | 30 +- src/server.h | 38 +- tests/unit/slot-migration.tcl | 22 ++ 19 files changed, 836 insertions(+), 575 deletions(-) create mode 100644 src/commands/cluster-syncslots.json create mode 100644 tests/unit/slot-migration.tcl diff --git a/src/aof.c b/src/aof.c index ebdf6f1b71..0cd64820c8 100644 --- a/src/aof.c +++ b/src/aof.c @@ -2191,10 +2191,10 @@ static int rewriteFunctions(rio *aof) { return 0; } -int shouldFilterSlot(int slot, void * privdata) { - if (privdata == NULL) return 0; +int slotFilterPredicate(int slot, void * privdata) { + if (privdata == NULL) return 1; unsigned char *slot_bitmap = (unsigned char *)privdata; - return !bitmapTestBit(slot_bitmap, slot); + return bitmapTestBit(slot_bitmap, slot); } int rewriteAppendOnlyFileRio(rio *aof, slotBitmap slot_bitmap) { @@ -2227,7 +2227,7 @@ int rewriteAppendOnlyFileRio(rio *aof, slotBitmap slot_bitmap) { if (slot_bitmap == NULL || isSlotBitmapEmpty(slot_bitmap)) { kvs_it = kvstoreIteratorInit(db->keys); } else { - kvs_it = kvstoreFilteredIteratorInit(db->keys, &shouldFilterSlot, slot_bitmap); + kvs_it = kvstoreFilteredIteratorInit(db->keys, &slotFilterPredicate, slot_bitmap); } /* Iterate this DB writing every entry */ void *next; diff --git a/src/blocked.c b/src/blocked.c index 70da7877ad..9bdab5be8e 100644 --- a/src/blocked.c +++ b/src/blocked.c @@ -101,7 +101,7 @@ void freeClientBlockingState(client *c) { * and will be processed when the client is unblocked. */ void blockClient(client *c, int btype) { /* Replication clients should never be blocked unless pause or module */ - serverAssert(!(c->flag.replication_source && btype != BLOCKED_MODULE && btype != BLOCKED_POSTPONE)); + serverAssert(!(c->flag.replicated && btype != BLOCKED_MODULE && btype != BLOCKED_POSTPONE)); initClientBlockingState(c); diff --git a/src/cluster.c b/src/cluster.c index d7e7be52af..508eddefc6 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -1023,7 +1023,7 @@ getNodeByQuery(client *c, struct serverCommand *cmd, robj **argv, int argc, int /* We handle all the cases as if they were EXEC commands, so we have * a common code path for everything */ - if (cmd->proc == execCommand) { + if (c && cmd->proc == execCommand) { /* If CLIENT_MULTI flag is not set EXEC is just going to return an * error. */ if (!c->flag.multi) return myself; @@ -1040,11 +1040,11 @@ getNodeByQuery(client *c, struct serverCommand *cmd, robj **argv, int argc, int mc.cmd = cmd; } - uint64_t cmd_flags = getCommandFlags(c); + uint64_t cmd_flags = c ? getCommandFlags(c) : cmd->flags; /* Only valid for sharded pubsub as regular pubsub can operate on any node and bypasses this layer. 
*/ int pubsubshard_included = - (cmd_flags & CMD_PUBSUB) || (c->cmd->proc == execCommand && (c->mstate->cmd_flags & CMD_PUBSUB)); + (cmd_flags & CMD_PUBSUB) || (c && c->cmd->proc == execCommand && (c->mstate->cmd_flags & CMD_PUBSUB)); /* Check that all the keys are in the same hash slot, and obtain this * slot and the node associated. */ @@ -1089,7 +1089,7 @@ getNodeByQuery(client *c, struct serverCommand *cmd, robj **argv, int argc, int * can safely serve the request, otherwise we return a TRYAGAIN * error). To do so we set the importing/migrating state and * increment a counter for every missing key. */ - if (clusterNodeIsPrimary(myself) || c->flag.readonly) { + if (clusterNodeIsPrimary(myself) || (c && c->flag.readonly)) { if (n == clusterNodeGetPrimary(myself) && getMigratingSlotDest(slot) != NULL) { migrating_slot = 1; } else if (getImportingSlotSource(slot) != NULL) { @@ -1184,7 +1184,7 @@ getNodeByQuery(client *c, struct serverCommand *cmd, robj **argv, int argc, int * request as "ASKING", we can serve the request. However if the request * involves multiple keys and we don't have them all, the only option is * to send a TRYAGAIN error. */ - if (importing_slot && (c->flag.asking || cmd_flags & CMD_ASKING)) { + if (importing_slot && (c && (c->flag.asking || cmd_flags & CMD_ASKING))) { if (multiple_keys && missing_keys) { if (error_code) *error_code = CLUSTER_REDIR_UNSTABLE; return NULL; @@ -1197,8 +1197,8 @@ getNodeByQuery(client *c, struct serverCommand *cmd, robj **argv, int argc, int * node is a replica and the request is about a hash slot our primary * is serving, we can reply without redirection. */ int is_write_command = - (cmd_flags & CMD_WRITE) || (c->cmd->proc == execCommand && (c->mstate->cmd_flags & CMD_WRITE)); - if ((c->flag.readonly || pubsubshard_included) && !is_write_command && clusterNodeIsReplica(myself) && + (cmd_flags & CMD_WRITE) || (c && c->cmd->proc == execCommand && (c->mstate->cmd_flags & CMD_WRITE)); + if (((c && c->flag.readonly) || pubsubshard_included) && !is_write_command && clusterNodeIsReplica(myself) && clusterNodeGetPrimary(myself) == n) { return myself; } diff --git a/src/cluster.h b/src/cluster.h index 74889422b4..5192bc405e 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -124,7 +124,10 @@ void bitmapSetAllBits(unsigned char *bitmap, int len); int slotBitmapCompare(slotBitmap bitmap, slotBitmap other); int isSlotBitmapEmpty(slotBitmap bitmap); int getSlotOrReply(client *c, robj *o); -void clusterSlotMigrationDoneSyncing(long long initial_offset); +void clusterSlotImportDoneSyncing(long long initial_offset); +void clusterSlotMigrationHandleClientClose(client *c); +void clusterFeedSlotMigration(int dbid, robj **argv, int argc); +int clusterShouldWriteToSlotMigrationTarget(void); /* functions with shared implementations */ int clusterNodeIsMyself(clusterNode *n); diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 636a3ea930..fa0da913b2 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -83,8 +83,8 @@ sds representSlotInfo(sds ci, uint16_t *slot_info_pairs, int slot_info_pairs_cou void clusterFreeNodesSlotsInfo(clusterNode *n); uint64_t clusterGetMaxEpoch(void); int clusterBumpConfigEpochWithoutConsensus(void); -slotMigration *clusterGetCurrentSlotMigration(void); -void clusterSendMigrateSlotStart(clusterNode *node, slotBitmap slot_bitmap); +slotImport *clusterGetCurrentSlotImport(void); +slotExport *clusterGetCurrentSlotExport(void); void moduleCallClusterReceivers(const char *sender_id, uint64_t module_id, uint8_t type, @@ 
-122,7 +122,7 @@ int verifyClusterNodeId(const char *name, int length); sds clusterEncodeOpenSlotsAuxField(int rdbflags); int clusterDecodeOpenSlotsAuxField(int rdbflags, sds s); static int nodeExceedsHandshakeTimeout(clusterNode *node, mstime_t now); -void clusterProceedWithSlotMigration(void); +void clusterProceedWithSlotImport(void); /* Only primaries that own slots have voting rights. * Returns 1 if the node has voting rights, otherwise returns 0. */ @@ -1134,7 +1134,8 @@ void clusterInit(void) { server.cluster->failover_auth_epoch = 0; server.cluster->cant_failover_reason = CLUSTER_CANT_FAILOVER_NONE; server.cluster->lastVoteEpoch = 0; - server.cluster->slot_migrations = listCreate(); + server.cluster->slot_import_jobs = listCreate(); + server.cluster->slot_export_jobs = listCreate(); /* Initialize stats */ for (int i = 0; i < CLUSTERMSG_TYPE_COUNT; i++) { @@ -2573,11 +2574,14 @@ void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoc /* Was this slot mine and it was in a paused state for slot * migration? If so, mark the move as done. */ - if (server.cluster->slots[j] == myself && server.cluster->mf_end && server.cluster->mf_slots_target == sender) { - bitmapClearBit(server.cluster->mf_slots, j); - if (isSlotBitmapEmpty(server.cluster->mf_slots)) { - serverLog(LL_NOTICE, "Slot migration to node %s (%s) has finished. Unpausing myself.", server.cluster->mf_slots_target->name, server.cluster->mf_slots_target->human_nodename); - resetManualFailover(); + slotExport * curr_export = clusterGetCurrentSlotExport(); + if (server.cluster->slots[j] == myself && curr_export && bitmapTestBit(curr_export->slot_bitmap, j)) { + bitmapClearBit(curr_export->slot_bitmap, j); + if (isSlotBitmapEmpty(curr_export->slot_bitmap)) { + serverLog(LL_NOTICE, "Slot migration has finished. Unpausing myself."); + unpauseActions(PAUSE_DURING_SLOT_MIGRATION); + curr_export->state = SLOT_EXPORT_FINISH; + clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_SLOTMIGRATION); } } @@ -3256,20 +3260,6 @@ int clusterProcessPacket(clusterLink *link) { "primary manual failover: %lld", server.cluster->mf_primary_offset); } - /* If we are importing a slot and the slot owner sent its offset - * while already paused, populate the migration state. */ - slotMigration * curr_migration = clusterGetCurrentSlotMigration(); - if (hdr->mflags[0] & CLUSTERMSG_FLAG0_PAUSED && curr_migration != NULL && - curr_migration->state == SLOT_MIGRATION_WAITING_FOR_OFFSET && - curr_migration->source_node == sender) { - curr_migration->pause_primary_offset = sender->repl_offset; - curr_migration->state = SLOT_MIGRATION_SYNCING_TO_OFFSET; - clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_SLOTMIGRATION); - serverLog(LL_NOTICE, - "Received replication offset from paused owner for " - "slot import: %lld", - curr_migration->pause_primary_offset); - } } /* Initial processing of PING and MEET requests replying with a PONG. 
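On the source side, completion is detected from the cluster configuration itself rather than from an explicit acknowledgement: each time the new owner is seen claiming one of the exported slots, that slot's bit is cleared, and an empty bitmap means the whole export is done and the pause can be lifted. A condensed sketch of that check, assuming the bitmap helpers behave as their names suggest:

    #include <stddef.h>

    /* Sketch: called for each slot j the sender now claims; returns 1 when the
     * last exported slot has been handed over and the export can finish. */
    static int sketchSlotHandedOver(unsigned char *export_bitmap, size_t bitmap_len, int j) {
        export_bitmap[j / 8] &= ~(1 << (j & 7));   /* bitmapClearBit(j) */
        for (size_t i = 0; i < bitmap_len; i++) {  /* isSlotBitmapEmpty()  */
            if (export_bitmap[i] != 0) return 0;
        }
        return 1;
    }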
*/ @@ -3724,27 +3714,6 @@ int clusterProcessPacket(clusterLink *link) { uint8_t type = hdr->data.module.msg.type; unsigned char *payload = hdr->data.module.msg.bulk_data; moduleCallClusterReceivers(sender->name, module_id, type, payload, len); - } else if (type == CLUSTERMSG_TYPE_MIGRATE_SLOT_START) { - /* This message is acceptable only if I'm a primary and I own the slot */ - if (!sender) return 1; - for (int i = 0; i <= CLUSTER_SLOTS; i++) { - if (bitmapTestBit(hdr->data.slot_migration.msg.slot_bitmap, i) && server.cluster->slots[i] != myself) return 1; - } - /* Initialize the slot migration state accordingly */ - resetManualFailover(); - server.cluster->mf_end = now + CLUSTER_MF_TIMEOUT; - server.cluster->mf_slots_target = sender; - memcpy(server.cluster->mf_slots, hdr->data.slot_migration.msg.slot_bitmap, sizeof(slotBitmap)); - /* TODO(murphyjacob4) pause subset of slots */ - pauseActions(PAUSE_DURING_FAILOVER, now + (CLUSTER_MF_TIMEOUT * CLUSTER_MF_PAUSE_MULT), - PAUSE_ACTIONS_CLIENT_WRITE_SET); - serverLog(LL_NOTICE, "Slot migration requested by node %.40s (%s). Pausing myself to allow slot takeover.", sender->name, sender->human_nodename); - /* We need to send a ping message to the replica, as it would carry - * `server.cluster->mf_primary_offset`, which means the primary paused clients - * at offset `server.cluster->mf_primary_offset`, so that the replica would - * know that it is safe to set its `server.cluster->mf_can_start` to 1 so as - * to complete failover as quickly as possible. */ - clusterSendPing(link, CLUSTERMSG_TYPE_PING); } else { serverLog(LL_WARNING, "Received unknown packet type: %d", type); } @@ -4445,177 +4414,253 @@ void clusterPropagatePublish(robj *channel, robj *message, int sharded) { * Slot Migration functions * -------------------------------------------------------------------------- */ -slotMigration *clusterCreateSlotMigration(clusterNode *source, slotBitmap slots) { - slotMigration *result = (slotMigration *) zmalloc(sizeof(slotMigration)); +slotImport *clusterCreateSlotImportJob(clusterNode *source, slotBitmap slots) { + slotImport *result = (slotImport *) zcalloc(sizeof(slotImport)); memcpy(result->slot_bitmap, slots, sizeof(slotBitmap)); result->source_node = source; - result->state = SLOT_MIGRATION_QUEUED; - result->end_time = 0; /* Will be set once started. */ - result->replication_connection = NULL; - result->replication_client = NULL; - result->replication_handshake_state = REPL_STATE_NONE; + result->state = SLOT_IMPORT_QUEUED; + result->paused_at_offset = -1; + return result; +} + +slotExport *clusterCreateSlotExportJob(client *c, slotBitmap slots) { + slotExport *result = (slotExport *) zmalloc(sizeof(slotExport)); + memcpy(result->slot_bitmap, slots, sizeof(slotBitmap)); + result->state = SLOT_EXPORT_QUEUED; result->pause_end = 0; - result->pause_primary_offset = -1; + result->client = c; return result; } -void clusterFreeSlotMigration(slotMigration *migration) { - if (migration->replication_client) { - freeClient(migration->replication_client); - } else if (migration->replication_connection) { - connClose(migration->replication_connection); +void clusterFreeSlotImportJob(slotImport *slot_import) { + if (slot_import->client) { + freeClient(slot_import->client); + } else if (slot_import->conn) { + connClose(slot_import->conn); } - zfree(migration); + zfree(slot_import); } -/* Gets the current slot migration from the head of the queue. 
*/ -slotMigration *clusterGetCurrentSlotMigration(void) { - if (listLength(server.cluster->slot_migrations) == 0) return NULL; - return (slotMigration *) listFirst(server.cluster->slot_migrations)->value; +void clusterFreeSlotExportJob(slotExport *slot_export) { + if (slot_export->client) { + freeClient(slot_export->client); + } + zfree(slot_export); } -void clusterSendMigrateSlotStart(clusterNode *node, slotBitmap slot_bitmap) { - if (!node->link) return; +slotImport *clusterGetCurrentSlotImport(void) { + if (listLength(server.cluster->slot_import_jobs) == 0) return NULL; + return (slotImport *) listFirst(server.cluster->slot_import_jobs)->value; +} - uint32_t msglen = sizeof(clusterMsg) - sizeof(union clusterMsgData) + sizeof(clusterMsgSlotMigration); - clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_MIGRATE_SLOT_START, msglen); - clusterMsg *hdr = getMessageFromSendBlock(msgblock); - memcpy(hdr->data.slot_migration.msg.slot_bitmap, slot_bitmap, sizeof(slotBitmap)); - clusterSendMessage(node->link, msgblock); - clusterMsgSendBlockDecrRefCount(msgblock); +slotExport *clusterGetCurrentSlotExport(void) { + if (listLength(server.cluster->slot_export_jobs) == 0) return NULL; + return (slotExport *) listFirst(server.cluster->slot_export_jobs)->value; } -void clusterImportHandler(connection *conn) { - UNUSED(conn); - /* This is called if there is an event on the current migrations - * connection. If that is the case, we can just continue with our - * state machine.*/ - clusterProceedWithSlotMigration(); +void clusterFeedSlotMigration(int dbid, robj **argv, int argc) { + UNUSED(dbid); + int i, slot, error_code; + slotExport *curr_export = clusterGetCurrentSlotExport(); + if (curr_export == NULL || curr_export->state < SLOT_EXPORT_SNAPSHOTTING) { + return; + } + + /* Check the slot this command belongs to. Note that it is not a guarantee + * that the slot of the replicated command is the same as the slot of the + * executed command, for example in the case of module VM_Replicate APIs. + * Because of this case, we need to recomplete the slot lookup completely + * at this time. */ + struct serverCommand *cmd = lookupCommand(argv, argc); + getNodeByQuery(server.current_client, cmd, argv, argc, &slot, &error_code); + if (error_code != CLUSTER_REDIR_NONE || slot == -1) { + /* This shouldn't happen - but is possible if a module does something + * like VM_Replicate a cross-slot command. In that case, we don't have + * a clear way to proceed, so it makes sense to give up. 
*/ + return; + } + if (!bitmapTestBit(curr_export->slot_bitmap, slot)) return; + + unsigned long long prev_pending = curr_export->client->reply_bytes; + addReplyArrayLen(curr_export->client, argc); + for (i = 0; i < argc; i++) { + addReply(curr_export->client, argv[i]); + } + curr_export->syncslot_offset += curr_export->client->reply_bytes - prev_pending; } -void clusterSlotMigrationDoneSyncing(long long initial_offset) { - slotMigration *migration = clusterGetCurrentSlotMigration(); - serverAssert(migration != NULL && migration->state == SLOT_MIGRATION_RECEIVE_SYNC); - migration->state = SLOT_MIGRATION_PAUSE_OWNER; - migration->replication_client->repl_data->reploff = initial_offset; - migration->replication_client->repl_data->read_reploff = initial_offset; - clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_SLOTMIGRATION); +int clusterShouldWriteToSlotMigrationTarget() { + slotExport *curr_export = clusterGetCurrentSlotExport(); + if (curr_export->state != SLOT_EXPORT_PAUSED) { + return 0; + } + return 1; } -/* This is the main state machine for the slot migration workflow. Slot - * migration is driven by the new owner of the slot. This function will do as - * much work as possible synchronously, processing the enqueued slot migrations - * and only returning once we are waiting on some IO. */ -void clusterProceedWithSlotMigration(void) { - server.cluster->todo_before_sleep &= ~CLUSTER_TODO_HANDLE_SLOTMIGRATION; +void clusterSlotMigrationHandleClientClose(client *c) { + if (c->flag.slot_migration_source) { + serverLog(LL_NOTICE, "Connection with slot migration source lost."); + slotImport *import = clusterGetCurrentSlotImport(); + if (import == NULL || import->client != c) return; + import->client = NULL; + clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_SLOTMIGRATION); + } else if (c->flag.slot_migration_target) { + serverLog(LL_NOTICE, "Connection with slot export target lost."); + slotExport *export = clusterGetCurrentSlotExport(); + if (export == NULL || export->client != c) return; + export->client = NULL; + clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_SLOTMIGRATION); + } +} - while (clusterGetCurrentSlotMigration() != NULL) { - listNode *curr_node = listFirst(server.cluster->slot_migrations); - slotMigration *curr_migration = (slotMigration *) curr_node->value; - if (curr_migration->state != SLOT_MIGRATION_QUEUED && curr_migration->end_time < mstime()) { +void clusterImportHandler(connection *conn) { + UNUSED(conn); + clusterProceedWithSlotImport(); +} + +void clusterProceedWithSlotImport(void) { + char *err; + while (clusterGetCurrentSlotImport() != NULL) { + listNode *curr_node = listFirst(server.cluster->slot_import_jobs); + slotImport *curr_import = (slotImport *) curr_node->value; + if (curr_import->state != SLOT_IMPORT_QUEUED && curr_import->end_time < mstime()) { serverLog(LL_WARNING, - "Timed out for slot migration from source node %.40s", curr_migration->source_node->name); - curr_migration->state = SLOT_MIGRATION_FAILED; + "Timed out for slot import from source node %.40s", curr_import->source_node->name); + curr_import->state = SLOT_IMPORT_FAILED; } - if (curr_migration->state > SLOT_MIGRATION_PAUSE_OWNER && curr_migration->state < SLOT_MIGRATION_FAILED && curr_migration->pause_end < mstime()) { + if (curr_import->state > SLOT_IMPORT_PAUSE_OWNER && curr_import->state != SLOT_IMPORT_FAILED && curr_import->pause_end < mstime()) { /* If the owner ever unpauses, we have to move back in the state machine and retry. 
*/ serverLog(LL_WARNING, "Reinitiating pause on the node owning the slot range..."); - curr_migration->state = SLOT_MIGRATION_PAUSE_OWNER; - curr_migration->pause_end = mstime() + CLUSTER_MF_TIMEOUT; + curr_import->state = SLOT_IMPORT_PAUSE_OWNER; + curr_import->pause_end = mstime() + CLUSTER_MF_TIMEOUT; } - switch(curr_migration->state) { - case SLOT_MIGRATION_QUEUED: + if (curr_import->state > SLOT_IMPORT_CONNECTING && curr_import->client == NULL) { + serverLog(LL_WARNING, "Client for slot import from source node %.40s has been closed", curr_import->source_node->name); + curr_import->state = SLOT_IMPORT_FAILED; + } + switch(curr_import->state) { + case SLOT_IMPORT_QUEUED: /* Start the migration */ - serverLog(LL_NOTICE, "Starting replication of slots from migration source node %.40s", curr_migration->source_node->name); - curr_migration->end_time = mstime() + CLUSTER_SLOT_MIGRATION_TIMEOUT; - curr_migration->replication_connection = connCreate(server.tls_replication ? connectionTypeTls() : connectionTypeTcp()); - if (connConnect(curr_migration->replication_connection, curr_migration->source_node->ip, getNodeDefaultReplicationPort(curr_migration->source_node), server.bind_source_addr, clusterImportHandler) == C_ERR) { + serverLog(LL_NOTICE, "Starting slot import from source node %.40s", curr_import->source_node->name); + curr_import->end_time = mstime() + CLUSTER_SLOT_IMPORT_TIMEOUT; + curr_import->conn = connCreate(server.tls_replication ? connectionTypeTls() : connectionTypeTcp()); + if (connConnect(curr_import->conn, curr_import->source_node->ip, getNodeDefaultReplicationPort(curr_import->source_node), server.bind_source_addr, clusterImportHandler) == C_ERR) { serverLog(LL_WARNING, - "Failed to connect to migration source node %.40s", curr_migration->source_node->name); - curr_migration->state = SLOT_MIGRATION_FAILED; + "Failed to connect to slot import source node %.40s", curr_import->source_node->name); + curr_import->state = SLOT_IMPORT_FAILED; continue; } - curr_migration->replication_handshake_state = REPL_STATE_CONNECTING; - curr_migration->state = SLOT_MIGRATION_CONNECTING; + curr_import->state = SLOT_IMPORT_CONNECTING; continue; - case SLOT_MIGRATION_CONNECTING: - if (curr_migration->replication_connection->state == CONN_STATE_CONNECTED) { - curr_migration->state = SLOT_MIGRATION_REPL_HANDSHAKE; + case SLOT_IMPORT_CONNECTING: + if (curr_import->conn->state == CONN_STATE_CONNECTING) { + /* Nothing to do, waiting for connection to be established. */ + return; + } else if (curr_import->conn->state != CONN_STATE_CONNECTED) { + serverLog(LL_WARNING, "Failed during connection to slot import source node %.40s: %s", curr_import->source_node->name, connGetLastError(curr_import->conn)); + curr_import->state = SLOT_IMPORT_FAILED; continue; } - /* Nothing to do, waiting for connection to be established. 
*/ - return; - case SLOT_MIGRATION_REPL_HANDSHAKE: - curr_migration->replication_handshake_state = replicationProceedWithHandshake(curr_migration->replication_connection, curr_migration->replication_handshake_state, curr_migration->slot_bitmap); - if (curr_migration->replication_handshake_state == REPL_STATE_ERROR) { - serverLog(LL_WARNING, "Handshake failed from migration node %.40s", curr_migration->source_node->name); - curr_migration->state = SLOT_MIGRATION_FAILED; + serverLog(LL_NOTICE, "Connected to slot import source node %.40s", curr_import->source_node->name); + connSetReadHandler(curr_import->conn, NULL); + client *c = createClient(curr_import->conn); + curr_import->client = c; + c->flag.replicated = 1; + c->flag.slot_migration_source = 1; + c->flag.authenticated = 1; + c->user = NULL; /* This client can do everything. */ + c->querybuf = sdsempty(); /* Similar to primary, we use a dedicated query buf. */ + initClientReplicationData(c); /* Used to track reploff */ + + curr_import->state = SLOT_IMPORT_SEND_AUTH; + continue; + case SLOT_IMPORT_SEND_AUTH: + if (!server.primary_auth) { + curr_import->state = SLOT_IMPORT_SEND_SYNCSLOTS; continue; } - if (curr_migration->replication_handshake_state == REPL_STATE_SEND_PSYNC) { - curr_migration->state = SLOT_MIGRATION_SEND_SYNC; + char *auth_args[3] = {"AUTH", NULL, NULL}; + size_t auth_lens[3] = {4, 0, 0}; + int argc = 1; + if (server.primary_user) { + auth_args[argc] = server.primary_user; + auth_lens[argc] = strlen(server.primary_user); + argc++; + } + auth_args[argc] = server.primary_auth; + auth_lens[argc] = sdslen(server.primary_auth); + argc++; + err = sendCommandArgv(curr_import->conn, argc, auth_args, auth_lens); + if (err) { + serverLog(LL_WARNING, "Failed to write AUTH to slot migration source: %s", err); + sdsfree(err); + curr_import->state = SLOT_IMPORT_FAILED; continue; } - return; - case SLOT_MIGRATION_SEND_SYNC: + curr_import->state = SLOT_IMPORT_RECEIVE_AUTH; + continue; + case SLOT_IMPORT_RECEIVE_AUTH: + err = receiveSynchronousResponse(curr_import->conn); + if (err == NULL) { + serverLog(LL_WARNING, "Slot migration source did not respond to AUTH command"); + } + if (err[0] == '-') { + serverLog(LL_WARNING, "Unable to AUTH to slot migration source: %s", err); + sdsfree(err); + } + sdsfree(err); + err = NULL; + curr_import->state = SLOT_IMPORT_SEND_SYNCSLOTS; + continue; + case SLOT_IMPORT_SEND_SYNCSLOTS: /* Ensure we have a clean state for the SYNC. */ - dropKeysInSlotBitmap(curr_migration->slot_bitmap, 1); - - /* We are done with our handshake phase. We can proceed straight to doing our SYNC. - * Note that we are skipping PSYNC. PSYNC will always result in full resync for a - * slot migration anyways. - * - * In the future, we can do a PSYNC phase to incorporate dual channel. 
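Because the import link is an ordinary client connection rather than a replication handshake, the target authenticates explicitly before issuing SYNCSLOTS. The AUTH request written by sendCommandArgv is plain RESP; the byte sequences below are a sketch with placeholder credentials, not values from the patch:

    /* Sketch of the AUTH requests on the import connection; "default" and
     * "secret" are placeholder credentials for illustration only. */
    static const char auth_without_user[] =
        "*2\r\n$4\r\nAUTH\r\n$6\r\nsecret\r\n";
    static const char auth_with_user[] =
        "*3\r\n$4\r\nAUTH\r\n$7\r\ndefault\r\n$6\r\nsecret\r\n";

The CLUSTER SYNCSLOTS START request that follows is built the same way, except that its final argument is the raw slotBitmap, so its bulk length is sizeof(slotBitmap) (2048 bytes) rather than a text length.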
*/ - serverLog(LL_NOTICE, "Starting SYNC for slot migration from migration source node %.40s", curr_migration->source_node->name); - if (connSyncWrite(curr_migration->replication_connection, "SYNC\r\n", 6, server.repl_syncio_timeout * 1000) == -1) { - serverLog(LL_WARNING, "I/O error writing to slot migration source: %s", connGetLastError(curr_migration->replication_connection)); - curr_migration->state = SLOT_MIGRATION_FAILED; + dropKeysInSlotBitmap(curr_import->slot_bitmap, 1); + + serverLog(LL_NOTICE, "Sending CLUSTER SYNCSLOTS START request to source node %.40s", curr_import->source_node->name); + char *syncslots_args[4] = {"CLUSTER", "SYNCSLOTS", "START", (char *)curr_import->slot_bitmap}; + size_t syncslots_lens[4] = {7, 9, 5, sizeof(slotBitmap)}; + err = sendCommandArgv(curr_import->conn, 4, syncslots_args, syncslots_lens); + if (err) { + serverLog(LL_WARNING, "Failed to write SYNCSLOTS START to slot migration source: %s", err); + sdsfree(err); + curr_import->state = SLOT_IMPORT_FAILED; continue; } - client *c = createClient(curr_migration->replication_connection); - curr_migration->replication_client = c; - c->flag.replication_source = 1; - c->flag.slot_migration_source = 1; - c->flag.authenticated = 1; - c->user = NULL; /* This client can do everything. */ - initClientReplicationData(c); /* We use this to track offset. */ - c->querybuf = sdsempty(); /* Similar to primary, we use a dedicated query buf. */ /* Our result will be received in AOF format, so we can pipe it * straight to readQueryFromClient. */ - connSetReadHandler(c->conn, readQueryFromClient); - curr_migration->state = SLOT_MIGRATION_RECEIVE_SYNC; - continue; - case SLOT_MIGRATION_RECEIVE_SYNC: - return; /* Nothing to do */ - case SLOT_MIGRATION_PAUSE_OWNER: - /* Send an ACK to put the connection into streaming state. */ - replicationSendAck(curr_migration->replication_client); - - serverLog(LL_NOTICE, "Replication sync to slot owner %.40s has been performed. Current replication offset: %lld. Pausing source node and waiting to continue", curr_migration->source_node->name, curr_migration->replication_client->repl_data->reploff); - clusterSendMigrateSlotStart(curr_migration->source_node, curr_migration->slot_bitmap); - curr_migration->pause_primary_offset = -1; - curr_migration->pause_end = mstime() + CLUSTER_MF_TIMEOUT; - curr_migration->state = SLOT_MIGRATION_WAITING_FOR_OFFSET; + connSetReadHandler(curr_import->client->conn, readQueryFromClient); + curr_import->state = SLOT_IMPORT_RECEIVE_SYNCSLOTS; + case SLOT_IMPORT_RECEIVE_SYNCSLOTS: + /* Nothing to do in this state. Waiting for CLUSTER SYNCSLOTS END to be processed. */ + return; + case SLOT_IMPORT_PAUSE_OWNER: + curr_import->client->flag.replication_force_reply = 1; + addReplyArrayLen(curr_import->client, 3); + addReplyBulkCBuffer(curr_import->client, "CLUSTER", 7); + addReplyBulkCBuffer(curr_import->client, "SYNCSLOTS", 9); + addReplyBulkCBuffer(curr_import->client, "PAUSE", 5); + curr_import->client->flag.replication_force_reply = 0; + + serverLog(LL_NOTICE, "SYNCSLOTS from slot migration source %.40s (%s) has been performed. 
Pausing source node and waiting to continue", curr_import->source_node->name, curr_import->source_node->human_nodename); + curr_import->paused_at_offset = -1; + curr_import->pause_end = mstime() + CLUSTER_MF_TIMEOUT; + curr_import->state = SLOT_IMPORT_WAITING_FOR_OFFSET; continue; - case SLOT_MIGRATION_WAITING_FOR_OFFSET: - /* Send REPLCONF ACK from time to time */ - replicationSendAck(curr_migration->replication_client); + case SLOT_IMPORT_WAITING_FOR_OFFSET: return; - case SLOT_MIGRATION_SYNCING_TO_OFFSET: - /* Send REPLCONF ACK from time to time */ - replicationSendAck(curr_migration->replication_client); - if (curr_migration->replication_client->repl_data->reploff >= curr_migration->pause_primary_offset) { - serverLog(LL_NOTICE, "Replication of slot range has caught up to paused slot owner (%lld), slot migration can start.", curr_migration->pause_primary_offset); - curr_migration->state = SLOT_MIGRATION_FINISH; + case SLOT_IMPORT_SYNCING_TO_OFFSET: + if (curr_import->client->repl_data->reploff >= curr_import->paused_at_offset) { + serverLog(LL_NOTICE, "SYNCSLOTS of slot range has caught up to paused slot owner (%lld), slot migration can start.", curr_import->paused_at_offset); + curr_import->state = SLOT_IMPORT_FINISH; continue; } /* Need to wait for the sync to progress further */ return; - case SLOT_MIGRATION_FINISH: + case SLOT_IMPORT_FINISH: serverLog(LL_NOTICE, "Setting myself to owner of migrating slots and broadcasting"); for (int i = 0; i < CLUSTER_SLOTS; i++) { - if (bitmapTestBit(curr_migration->slot_bitmap, i)) { + if (bitmapTestBit(curr_import->slot_bitmap, i)) { clusterDelSlot(i); clusterAddSlot(myself, i); } @@ -4625,20 +4670,101 @@ void clusterProceedWithSlotMigration(void) { if (clusterBumpConfigEpochWithoutConsensus() == C_OK) { serverLog(LL_NOTICE, "ConfigEpoch updated after importing slots"); } - clusterFreeSlotMigration(curr_migration); + clusterFreeSlotImportJob(curr_import); clusterBroadcastPong(CLUSTER_BROADCAST_ALL); - listDelNode(server.cluster->slot_migrations, curr_node); + listDelNode(server.cluster->slot_import_jobs, curr_node); continue; - case SLOT_MIGRATION_FAILED: - /* Delete the migration from the queue and proceed to the next migration */ - listDelNode(server.cluster->slot_migrations, curr_node); - dropKeysInSlotBitmap(curr_migration->slot_bitmap, server.repl_replica_lazy_flush); - clusterFreeSlotMigration(curr_migration); + case SLOT_IMPORT_REPLICA_TRACKING: + /* As a replica, we will simply apply the primaries updates + * from the slot migration source. However, if we are ever + * promoted to primary, we need to fail the migration to + * prevent leaked keys in the importing slots. 
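The cut-over decision is purely offset based: once the bytes applied from the export stream reach the offset the source reported in PAUSEDAT, every write that landed before the pause has been replayed, and the target may claim the slots. A one-function sketch of that condition, using the two counters tracked above:

    /* Sketch: applied_offset is the import client's applied offset from the
     * export stream, paused_at_offset is the value from CLUSTER SYNCSLOTS
     * PAUSEDAT (or -1 if not yet received). Returns 1 when it is safe to take
     * slot ownership. */
    static int sketchSlotImportCaughtUp(long long applied_offset, long long paused_at_offset) {
        if (paused_at_offset < 0) return 0;
        return applied_offset >= paused_at_offset;
    }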
*/ + if (clusterNodeIsPrimary(myself)) { + serverLog(LL_WARNING, "Promoted to primary during slot migration, failing the ongoing migration"); + curr_import->state = SLOT_IMPORT_FAILED; + continue; + } + return; + case SLOT_IMPORT_FAILED: + listDelNode(server.cluster->slot_import_jobs, curr_node); + dropKeysInSlotBitmap(curr_import->slot_bitmap, server.repl_replica_lazy_flush); + clusterFreeSlotImportJob(curr_import); continue; } } } +int childSnapshotForSyncSlot(int req, rio *rdb, void *privdata) { + UNUSED(req); + int retval = rewriteAppendOnlyFileRio(rdb, (unsigned char *) privdata); + rioWrite(rdb, "*3\r\n", 4); + rioWriteBulkString(rdb, "CLUSTER", 7); + rioWriteBulkString(rdb, "SYNCSLOTS", 9); + rioWriteBulkString(rdb, "END", 3); + return retval; +} + +void clusterProceedWithSlotExport(void) { + while (clusterGetCurrentSlotExport() != NULL) { + listNode *curr_node = listFirst(server.cluster->slot_export_jobs); + slotExport *curr_export = (slotExport *) curr_node->value; + if (curr_export->client == NULL) { + serverLog(LL_WARNING, "Client for slot export has been closed"); + curr_export->state = SLOT_EXPORT_FAILED; + } + switch(curr_export->state) { + case SLOT_EXPORT_QUEUED: + if (hasActiveChildProcess()) { + /* We need to wait for the child to die, then we can + * proceed. */ + return; + } + connection ** conns = zmalloc(sizeof(connection*)); + *conns = curr_export->client->conn; + if (saveSnapshotToConnectionSockets(conns, 1, 1, 0, childSnapshotForSyncSlot, curr_export->slot_bitmap) != C_OK) { + serverLog(LL_WARNING, "Failed to start slot export to target"); + curr_export->state = SLOT_EXPORT_FAILED; + continue; + } + curr_export->state = SLOT_EXPORT_SNAPSHOTTING; + continue; + case SLOT_EXPORT_SNAPSHOTTING: + /* During this time, we are waiting for SYNCSLOTS PAUSE to + * start flushing the accumulated backlog. */ + return; + case SLOT_EXPORT_PAUSE_AND_REPLY: + addReplyArrayLen(curr_export->client, 4); + addReplyBulkCBuffer(curr_export->client, "CLUSTER", 7); + addReplyBulkCBuffer(curr_export->client, "SYNCSLOTS", 9); + addReplyBulkCBuffer(curr_export->client, "PAUSEDAT", 8); + addReplyLongLong(curr_export->client, curr_export->syncslot_offset); + + curr_export->pause_end = mstime() + (CLUSTER_MF_TIMEOUT * CLUSTER_MF_PAUSE_MULT); + pauseActions(PAUSE_DURING_SLOT_MIGRATION, curr_export->pause_end, PAUSE_ACTIONS_CLIENT_WRITE_SET); + + curr_export->state = SLOT_EXPORT_PAUSED; + continue; + case SLOT_EXPORT_PAUSED: + /* */ + case SLOT_EXPORT_FINISH: + case SLOT_EXPORT_FAILED: + listDelNode(server.cluster->slot_export_jobs, curr_node); + clusterFreeSlotExportJob(curr_export); + continue; + } + } +} + + +/* This is the main state machine for the slot migration workflow. Slot + * migration is mostly driven by the new owner of the slot (target node). These + * functions will do as much work as possible synchronously, processing the + * enqueued slot migrations and only returning once we are waiting on some IO. */ +void clusterProceedWithSlotMigration(void) { + server.cluster->todo_before_sleep &= ~CLUSTER_TODO_HANDLE_SLOTMIGRATION; + clusterProceedWithSlotImport(); + clusterProceedWithSlotExport(); +} /* ----------------------------------------------------------------------------- * REPLICA node specific functions @@ -5286,7 +5412,7 @@ void manualFailoverCanStart(void) { * The function can be used both to initialize the manual failover state at * startup or to abort a manual failover in progress. 
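childSnapshotForSyncSlot above ends the AOF-formatted snapshot with a literal CLUSTER SYNCSLOTS END command, so the target detects end-of-snapshot through its normal command parser. Assuming rioWriteBulkString emits a standard RESP bulk ($<len>\r\n<payload>\r\n), the trailer appended after the filtered AOF payload is:

    /* Trailer written by the snapshot child after the filtered AOF payload;
     * parsing this command moves the import job to SLOT_IMPORT_PAUSE_OWNER. */
    static const char syncslots_end_trailer[] =
        "*3\r\n"
        "$7\r\nCLUSTER\r\n"
        "$9\r\nSYNCSLOTS\r\n"
        "$3\r\nEND\r\n";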
*/ void resetManualFailover(void) { - if (server.cluster->mf_replica || server.cluster->mf_slots_target) { + if (server.cluster->mf_replica) { /* We were a primary failing over, so we paused clients and related actions. * Regardless of the outcome we unpause now to allow traffic again. */ unpauseActions(PAUSE_DURING_FAILOVER); @@ -5295,8 +5421,6 @@ void resetManualFailover(void) { server.cluster->mf_can_start = 0; server.cluster->mf_replica = NULL; server.cluster->mf_primary_offset = -1; - memset(server.cluster->mf_slots, 0, sizeof(server.cluster->mf_slots)); - server.cluster->mf_slots_target = NULL; } /* If a manual failover timed out, abort it. */ @@ -6592,6 +6716,7 @@ void removeChannelsInSlot(unsigned int slot) { /* Remove all the keys in the specified hash slot. * The number of removed items is returned. */ +// TODO(murphyjacob4) - can we just use this? unsigned int delKeysInSlot(unsigned int hashslot) { if (!countKeysInSlot(hashslot)) return 0; @@ -7426,10 +7551,106 @@ int clusterCommandSpecial(client *c) { } } - slotMigration * to_enqueue = clusterCreateSlotMigration(curr_owner, requested_slots); - listAddNodeTail(server.cluster->slot_migrations, to_enqueue); - clusterProceedWithSlotMigration(); + slotImport * to_enqueue = clusterCreateSlotImportJob(curr_owner, requested_slots); + listAddNodeTail(server.cluster->slot_import_jobs, to_enqueue); + clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_SLOTMIGRATION); addReply(c, shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr, "syncslots")) { + if (c->argc < 3) { + addReplyError(c, "SYNCSLOTS command requires either START or END to be provided."); + return 1; + } + if (!strcasecmp(c->argv[2]->ptr, "start")) { + /* CLUSTER SYNCSLOTS START */ + if (c->argc != 4) { + addReplyError(c, "CLUSTER SYNCSLOTS START command requires exactly one argument"); + return 1; + } + if (sdslen(c->argv[3]->ptr) != sizeof(slotBitmap)) { + addReplyError(c, "Invalid slot bitmap length"); + return 1; + } + c->flag.slot_migration_target = 1; + initClientReplicationData(c); + slotExport *job = clusterCreateSlotExportJob(c, c->argv[2]->ptr); + listAddNodeTail(server.cluster->slot_export_jobs, job); + clusterProceedWithSlotMigration(); + } else if (!strcasecmp(c->argv[2]->ptr, "inform")) { + /* CLUSTER SYNCSLOTS INFORM */ + if (c->argc != 4) { + addReplyError(c, "CLUSTER SYNCSLOTS INFORM command requires exactly one argument"); + return 1; + } + slotImport * to_enqueue = clusterCreateSlotImportJob(NULL, c->argv[2]->ptr); + to_enqueue->state = SLOT_IMPORT_REPLICA_TRACKING; + } else if (!strcasecmp(c->argv[2]->ptr, "end")) { + /* CLUSTER SYNCSLOTS END */ + if (c->argc != 3) { + addReplyError(c, "CLUSTER SYNCSLOTS END does not expect any arguments."); + return 1; + } + slotImport *curr_import = clusterGetCurrentSlotImport(); + if (!curr_import || (curr_import->state != SLOT_IMPORT_RECEIVE_SYNCSLOTS && curr_import->state != SLOT_IMPORT_REPLICA_TRACKING)) { + addReplyError(c, "No ongoing CLUSTER SYNCSLOTS to end."); + return 1; + } + if (curr_import->state != SLOT_IMPORT_REPLICA_TRACKING) { + /* Replicas will also receive this command through the replication + * stream, but it is not actionable. 
*/ + return 1; + } + if (curr_import->client != c) { + addReplyError(c, "This client is not the one that initiated the ongoing CLUSTER SYNCSLOTS."); + } + curr_import->state = SLOT_IMPORT_PAUSE_OWNER; + clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_SLOTMIGRATION); + addReply(c, shared.ok); + } else if (!strcasecmp(c->argv[2]->ptr, "pause")) { + /* CLUSTER SYNCSLOTS PAUSE */ + if (c->argc != 3) { + addReplyError(c, "CLUSTER SYNCSLOTS PAUSE does not expect any arguments."); + return 1; + } + slotExport *slot_export = clusterGetCurrentSlotExport(); + if (!slot_export) { + addReplyError(c, "No ongoing CLUSTER SYNCSLOTS to pause."); + return 1; + } + if (slot_export->state == SLOT_EXPORT_PAUSED) { + serverLog(LL_NOTICE, "Pause retriggered by target during slot migration."); + } else if (slot_export->state != SLOT_EXPORT_SNAPSHOTTING) { + addReplyError(c, "SYNCSLOTS is not in the correct state for this command."); + return 1; + } else { + /* First pause. We want to flush the output buffer that was not allowed to + * flush during the snapshot. */ + putClientInPendingWriteQueue(slot_export->client); + } + + slot_export->state = SLOT_EXPORT_PAUSE_AND_REPLY; + clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_SLOTMIGRATION); + } else if (!strcasecmp(c->argv[2]->ptr, "pausedat")) { + /* CLUSTER SYNCSLOTS PAUSEDAT */ + if (c->argc != 4) { + addReplyError(c, "CLUSTER SYNCSLOTS PAUSEDAT command requires exactly one argument."); + return 1; + } + slotImport *slot_import = clusterGetCurrentSlotImport(); + if (!slot_import || slot_import->state != SLOT_IMPORT_WAITING_FOR_OFFSET) { + addReplyError(c, "No CLUSTER SYNCSLOTS is waiting for a PAUSEDAT response."); + return 1; + } + long long offset; + if (getLongLongFromObject(c->argv[3]->ptr, &offset) != C_OK) { + addReplyError(c, "Failed to parse PAUSEDAT offset."); + return 1; + } + slot_import->paused_at_offset = offset; + slot_import->state = SLOT_IMPORT_SYNCING_TO_OFFSET; + clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_SLOTMIGRATION); + } else { + addReplyError(c, "Unknown subcommand for CLUSTER SYNCSLOTS."); + } } else { return 0; } @@ -7474,6 +7695,12 @@ const char **clusterCommandExtendedHelp(void) { " Output format is an array where each array element is a map containing attributes of a link", "MIGRATE SLOTSRANGE [ ...] SHARD ", " Initiate server driven slot migration of all slot ranges to the designated shard.", + "SYNCSLOTS [START |END|INFORM |PAUSE|PAUSEDAT]", + " Internal command. SYNCSLOTS START initiates send of an AOF formatted snapshot containing the", + " provided slot bitmap. SYNCSLOTS END terminates the AOF formatted snapshot, and after this", + " SYNCSLOTS PAUSE signals for this node to be paused and for a continuous stream of commands" + " for the slots to be replicated. SYNCSLOTS PAUSEDAT will be replied with the offset of remaining" + " commands. SYNCSLOTS INFORM is used to inform replicas that the operation is occurring.", NULL}; return help; diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index 0a97ca37ad..9eda033bda 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -10,7 +10,7 @@ #define CLUSTER_MF_TIMEOUT 5000 /* Milliseconds to do a manual failover. */ #define CLUSTER_MF_PAUSE_MULT 2 /* Primary pause manual failover mult. */ #define CLUSTER_REPLICA_MIGRATION_DELAY 5000 /* Delay for replica migration. */ -#define CLUSTER_SLOT_MIGRATION_TIMEOUT 30000 /* Milliseconds to do a slot migration. */ +#define CLUSTER_SLOT_IMPORT_TIMEOUT 30000 /* Milliseconds to do a slot migration. */ /* Reasons why a replica is not able to failover. 
*/ #define CLUSTER_CANT_FAILOVER_NONE 0 @@ -376,32 +376,50 @@ struct _clusterNode { Update with updateAndCountChangedNodeHealth(). */ }; -typedef enum slotMigrationState { - SLOT_MIGRATION_QUEUED, - SLOT_MIGRATION_CONNECTING, - SLOT_MIGRATION_REPL_HANDSHAKE, /* The handshake has it's own state machine, - * see replicationProceedWithHandshake */ - SLOT_MIGRATION_SEND_SYNC, - SLOT_MIGRATION_RECEIVE_SYNC, - SLOT_MIGRATION_PAUSE_OWNER, - SLOT_MIGRATION_WAITING_FOR_OFFSET, - SLOT_MIGRATION_SYNCING_TO_OFFSET, - SLOT_MIGRATION_FINISH, - SLOT_MIGRATION_FAILED, -} slotMigrationState; - -typedef struct slotMigration { +typedef enum slotImportState { + SLOT_IMPORT_QUEUED, + SLOT_IMPORT_REPLICA_TRACKING, /* Replicas track the slot import as well */ + SLOT_IMPORT_CONNECTING, + SLOT_IMPORT_SEND_AUTH, + SLOT_IMPORT_RECEIVE_AUTH, + SLOT_IMPORT_SEND_SYNCSLOTS, + SLOT_IMPORT_RECEIVE_SYNCSLOTS, + SLOT_IMPORT_PAUSE_OWNER, + SLOT_IMPORT_WAITING_FOR_OFFSET, + SLOT_IMPORT_SYNCING_TO_OFFSET, + SLOT_IMPORT_FINISH, + SLOT_IMPORT_FAILED, +} slotImportState; + +typedef struct slotImport { slotBitmap slot_bitmap; - slotMigrationState state; + slotImportState state; clusterNode *source_node; mstime_t end_time; /* Slot migration time limit (ms unixtime). If not yet in progress (e.g. queued), will be zero. */ - connection *replication_connection; /* Connection for replication. */ - client *replication_client; /* Client for replication */ - int replication_handshake_state; + connection *conn; + client *client; mstime_t pause_end; - long long pause_primary_offset; -} slotMigration; + long long syncslots_offset; + long long paused_at_offset; +} slotImport; + +typedef enum slotExportState { + SLOT_EXPORT_QUEUED, + SLOT_EXPORT_SNAPSHOTTING, + SLOT_EXPORT_PAUSE_AND_REPLY, + SLOT_EXPORT_PAUSED, + SLOT_EXPORT_FINISH, + SLOT_EXPORT_FAILED, +} slotExportState; + +typedef struct slotExport { + slotBitmap slot_bitmap; + slotExportState state; + client *client; /* Client for replication */ + unsigned long long syncslot_offset; + mstime_t pause_end; +} slotExport; /* Struct used for storing slot statistics. */ typedef struct slotStat { @@ -464,7 +482,8 @@ struct clusterState { slotBitmap owner_not_claiming_slot; /* Struct used for storing slot statistics, for all slots owned by the current shard. */ slotStat slot_stats[CLUSTER_SLOTS]; - list *slot_migrations; /* Queue of ongoing slot migrations. */ + list *slot_import_jobs; /* Queue of ongoing slot imports (we are the target). */ + list *slot_export_jobs; /* Queue of ongoing slot exports (we are the source). 
*/ }; #endif // CLUSTER_LEGACY_H diff --git a/src/commands.def b/src/commands.def index 0e54094821..f0a1183e5a 100644 --- a/src/commands.def +++ b/src/commands.def @@ -1021,6 +1021,23 @@ const char *CLUSTER_SLOTS_Tips[] = { #define CLUSTER_SLOTS_Keyspecs NULL #endif +/********** CLUSTER SYNCSLOTS ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* CLUSTER SYNCSLOTS history */ +#define CLUSTER_SYNCSLOTS_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* CLUSTER SYNCSLOTS tips */ +#define CLUSTER_SYNCSLOTS_Tips NULL +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* CLUSTER SYNCSLOTS key specs */ +#define CLUSTER_SYNCSLOTS_Keyspecs NULL +#endif + /* CLUSTER command table */ struct COMMAND_STRUCT CLUSTER_Subcommands[] = { {MAKE_CMD("addslots","Assigns new hash slots to a node.","O(N) where N is the total number of hash slot arguments","3.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_ADDSLOTS_History,0,CLUSTER_ADDSLOTS_Tips,0,clusterCommand,-3,CMD_NO_ASYNC_LOADING|CMD_ADMIN|CMD_STALE,0,CLUSTER_ADDSLOTS_Keyspecs,0,NULL,1),.args=CLUSTER_ADDSLOTS_Args}, @@ -1053,6 +1070,7 @@ struct COMMAND_STRUCT CLUSTER_Subcommands[] = { {MAKE_CMD("slaves","Lists the replica nodes of a primary node.","O(N) where N is the number of replicas.","3.0.0",CMD_DOC_DEPRECATED,"`CLUSTER REPLICAS`","5.0.0","cluster",COMMAND_GROUP_CLUSTER,CLUSTER_SLAVES_History,0,CLUSTER_SLAVES_Tips,1,clusterCommand,3,CMD_ADMIN|CMD_STALE,0,CLUSTER_SLAVES_Keyspecs,0,NULL,1),.args=CLUSTER_SLAVES_Args}, {MAKE_CMD("slot-stats","Return an array of slot usage statistics for slots assigned to the current node.","O(N) where N is the total number of slots based on arguments. O(N*log(N)) with ORDERBY subcommand.","8.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_SLOT_STATS_History,0,CLUSTER_SLOT_STATS_Tips,2,clusterSlotStatsCommand,-4,CMD_STALE|CMD_LOADING,0,CLUSTER_SLOT_STATS_Keyspecs,0,NULL,1),.args=CLUSTER_SLOT_STATS_Args}, {MAKE_CMD("slots","Returns the mapping of cluster slots to nodes.","O(N) where N is the total number of Cluster nodes","3.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_SLOTS_History,2,CLUSTER_SLOTS_Tips,1,clusterCommand,2,CMD_LOADING|CMD_STALE,0,CLUSTER_SLOTS_Keyspecs,0,NULL,0)}, +{MAKE_CMD("syncslots","An internal command used in slot migration.",NULL,"8.1.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_SYNCSLOTS_History,0,CLUSTER_SYNCSLOTS_Tips,0,clusterCommand,-2,CMD_ADMIN|CMD_STALE,0,CLUSTER_SYNCSLOTS_Keyspecs,0,NULL,0)}, {0} }; diff --git a/src/commands/cluster-syncslots.json b/src/commands/cluster-syncslots.json new file mode 100644 index 0000000000..2d23903ac4 --- /dev/null +++ b/src/commands/cluster-syncslots.json @@ -0,0 +1,14 @@ +{ + "SYNCSLOTS": { + "summary": "An internal command used in slot migration.", + "group": "cluster", + "since": "8.1.0", + "arity": -2, + "container": "CLUSTER", + "function": "clusterCommand", + "command_flags": [ + "ADMIN", + "STALE" + ] + } +} diff --git a/src/db.c b/src/db.c index 134dc6e9dd..905f5c6120 100644 --- a/src/db.c +++ b/src/db.c @@ -258,7 +258,7 @@ int getKeySlot(sds key) { * so we must always recompute the slot for commands coming from the primary. 
*/ if (server.current_client && server.current_client->slot >= 0 && server.current_client->flag.executing_command && - !server.current_client->flag.primary) { + !server.current_client->flag.replicated) { debugServerAssertWithInfo(server.current_client, NULL, (int)keyHashSlot(key, (int)sdslen(key)) == server.current_client->slot); return server.current_client->slot; @@ -267,7 +267,7 @@ int getKeySlot(sds key) { /* For the case of replicated commands from primary, getNodeByQuery() never gets called, * and thus c->slot never gets populated. That said, if this command ends up accessing a key, * we are able to backfill c->slot here, where the key's hash calculation is made. */ - if (server.current_client && server.current_client->flag.primary) { + if (server.current_client && server.current_client->flag.replicated) { server.current_client->slot = slot; } return slot; diff --git a/src/io_threads.c b/src/io_threads.c index 715251a06a..93fe56fdb9 100644 --- a/src/io_threads.c +++ b/src/io_threads.c @@ -345,7 +345,7 @@ int trySendReadToIOThreads(client *c) { c->cur_tid = tid; c->read_flags = canParseCommand(c) ? 0 : READ_FLAGS_DONT_PARSE; c->read_flags |= authRequired(c) ? READ_FLAGS_AUTH_REQUIRED : 0; - c->read_flags |= c->flag.replication_source ? READ_FLAGS_REPLICATION_SOURCE : 0; + c->read_flags |= c->flag.replicated ? READ_FLAGS_REPLICATED : 0; c->io_read_state = CLIENT_PENDING_IO; connSetPostponeUpdateState(c->conn, 1); diff --git a/src/kvstore.c b/src/kvstore.c index ef4b90af73..f1ed085c43 100644 --- a/src/kvstore.c +++ b/src/kvstore.c @@ -74,7 +74,7 @@ struct _kvstoreIterator { kvstore *kvs; long long didx; long long next_didx; - kvstoreIteratorFilter *filter; + kvstoreIteratorPredicate *predicate; void *filter_privdata; hashtableIterator di; }; @@ -600,19 +600,22 @@ kvstoreIterator *kvstoreIteratorInit(kvstore *kvs) { kvs_it->kvs = kvs; kvs_it->didx = -1; kvs_it->next_didx = kvstoreGetFirstNonEmptyHashtableIndex(kvs_it->kvs); /* Finds first non-empty hashtable index. 
*/ - kvs_it->filter = NULL; + kvs_it->predicate = NULL; kvs_it->filter_privdata = NULL; hashtableInitSafeIterator(&kvs_it->di, NULL); return kvs_it; } /* Returns kvstore iterator that filters out hash tables based on the predicate.*/ -kvstoreIterator *kvstoreFilteredIteratorInit(kvstore *kvs, kvstoreIteratorFilter *filter, void *privdata) { +kvstoreIterator *kvstoreFilteredIteratorInit(kvstore *kvs, kvstoreIteratorPredicate *predicate, void *privdata) { kvstoreIterator *kvs_it = zmalloc(sizeof(*kvs_it)); kvs_it->kvs = kvs; kvs_it->didx = -1; kvs_it->next_didx = kvstoreGetFirstNonEmptyHashtableIndex(kvs_it->kvs); - kvs_it->filter = filter; + while (kvs_it->next_didx != -1 && predicate && !predicate(kvs_it->next_didx, privdata)) { + kvs_it->next_didx = kvstoreGetNextNonEmptyHashtableIndex(kvs_it->kvs, kvs_it->next_didx); + } + kvs_it->predicate = predicate; kvs_it->filter_privdata = privdata; hashtableInitSafeIterator(&kvs_it->di, NULL); return kvs_it; @@ -640,11 +643,10 @@ static hashtable *kvstoreIteratorNextHashtable(kvstoreIterator *kvs_it) { freeHashtableIfNeeded(kvs_it->kvs, kvs_it->didx); } + kvs_it->didx = kvs_it->next_didx; do { - kvs_it->didx = kvs_it->next_didx; - if (kvs_it->didx == -1) return NULL; - kvs_it->next_didx = kvstoreGetNextNonEmptyHashtableIndex(kvs_it->kvs, kvs_it->didx); - } while (kvs_it->filter && kvs_it->filter(kvs_it->didx, kvs_it->filter_privdata)); + kvs_it->next_didx = kvstoreGetNextNonEmptyHashtableIndex(kvs_it->kvs, kvs_it->next_didx); + } while (kvs_it->next_didx != -1 && kvs_it->predicate && !kvs_it->predicate(kvs_it->didx, kvs_it->filter_privdata)); return kvs_it->kvs->hashtables[kvs_it->didx]; } diff --git a/src/kvstore.h b/src/kvstore.h index 668b0ae23e..a79caf23aa 100644 --- a/src/kvstore.h +++ b/src/kvstore.h @@ -10,7 +10,7 @@ typedef struct _kvstoreHashtableIterator kvstoreHashtableIterator; typedef int(kvstoreScanShouldSkipHashtable)(hashtable *d); typedef int(kvstoreExpandShouldSkipHashtableIndex)(int didx); -typedef int(kvstoreIteratorFilter)(int didx, void *privdata); +typedef int(kvstoreIteratorPredicate)(int didx, void *privdata); #define KVSTORE_ALLOCATE_HASHTABLES_ON_DEMAND (1 << 0) #define KVSTORE_FREE_EMPTY_HASHTABLES (1 << 1) @@ -47,7 +47,7 @@ size_t kvstoreHashtableMetadataSize(void); /* kvstore iterator specific functions */ kvstoreIterator *kvstoreIteratorInit(kvstore *kvs); -kvstoreIterator *kvstoreFilteredIteratorInit(kvstore *kvs, kvstoreIteratorFilter *filter, void *privdata); +kvstoreIterator *kvstoreFilteredIteratorInit(kvstore *kvs, kvstoreIteratorPredicate *filter, void *privdata); void kvstoreIteratorRelease(kvstoreIterator *kvs_it); int kvstoreIteratorGetCurrentHashtableIndex(kvstoreIterator *kvs_it); int kvstoreIteratorNext(kvstoreIterator *kvs_it, void **next); diff --git a/src/networking.c b/src/networking.c index 5c31ac4562..c2828d384a 100644 --- a/src/networking.c +++ b/src/networking.c @@ -290,7 +290,9 @@ int prepareClientToWrite(client *c) { /* Replication sources don't receive replies, unless force reply flag * is set. */ - if ((c->flag.replication_source) && !c->flag.replication_force_reply) return C_ERR; + if ((c->flag.replicated) && !c->flag.replication_force_reply) return C_ERR; + + if ((c->flag.slot_migration_target && !clusterShouldWriteToSlotMigrationTarget())) return C_ERR; /* Skip the fake client, such as the fake client for AOF loading. 
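With the rename to kvstoreIteratorPredicate the callback convention is also flipped: the predicate now returns nonzero for hashtable indexes that should be visited, and both the filtered init and the next-table step skip indexes it rejects. A small standalone sketch of iteration under such a predicate (a toy walker, not the kvstore code itself):

    #include <stdio.h>

    typedef int (iterPredicate)(int didx, void *privdata);

    /* Visit only the indexes the predicate accepts, mirroring how the filtered
     * kvstore iterator skips hashtables whose slot is outside the bitmap. */
    static void visitFiltered(int nindexes, iterPredicate *pred, void *privdata) {
        for (int didx = 0; didx < nindexes; didx++) {
            if (pred && !pred(didx, privdata)) continue; /* nonzero == keep */
            printf("visiting index %d\n", didx);
        }
    }

    /* Example predicate: keep only even indexes. */
    static int keepEven(int didx, void *privdata) {
        (void)privdata;
        return (didx % 2) == 0;
    }

    int main(void) {
        visitFiltered(8, keepEven, NULL);
        return 0;
    }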
* But CLIENT_ID_CACHED_RESPONSE is allowed since it is a fake client @@ -1602,7 +1604,7 @@ void clearClientConnectionState(client *c) { c->flag.replica = 0; } - serverAssert(!(c->flag.replica || c->flag.replication_source)); + serverAssert(!(c->flag.replica || c->flag.replicated)); if (c->flag.tracking) disableTracking(c); selectDb(c, 0); @@ -1681,6 +1683,10 @@ void freeClient(client *c) { } } + if (c->flag.slot_migration_source || c->flag.slot_migration_target) { + clusterSlotMigrationHandleClientClose(c); + } + /* Log link disconnection with replica */ if (getClientType(c) == CLIENT_TYPE_REPLICA) { if (c->flag.repl_rdb_channel) @@ -1821,7 +1827,7 @@ void beforeNextClient(client *c) { * blocked client as well */ /* Trim the query buffer to the current position. */ - if (c->flag.replication_source) { + if (c->flag.replicated) { /* If the client is a replication source, trim the querybuf to repl_applied, * since replication clients are very special, its querybuf not only * used to parse command, but also proxy to sub-replicas. @@ -2135,7 +2141,11 @@ int postWriteToClient(client *c) { if (getClientType(c) != CLIENT_TYPE_REPLICA) { _postWriteToClient(c); } else { - server.stat_net_repl_output_bytes += c->nwritten > 0 ? c->nwritten : 0; + if (c->flag.slot_migration_target) { + server.stat_net_slot_migration_output_bytes += c->nwritten > 0 ? c->nwritten : 0; + } else { + server.stat_net_repl_output_bytes += c->nwritten > 0 ? c->nwritten : 0; + } } if (c->write_flags & WRITE_FLAGS_WRITE_ERROR) { @@ -2151,7 +2161,7 @@ int postWriteToClient(client *c) { * as an interaction, since we always send REPLCONF ACK commands * that take some time to just fill the socket output buffer. * We just rely on data / pings received for timeout detection. */ - if (!c->flag.replication_source) c->last_interaction = server.unixtime; + if (!c->flag.replicated) c->last_interaction = server.unixtime; } if (!clientHasPendingReplies(c)) { c->sentlen = 0; @@ -2239,7 +2249,7 @@ int handleReadResult(client *c) { c->last_interaction = server.unixtime; c->net_input_bytes += c->nread; - if (c->flag.replication_source) { + if (c->flag.replicated) { c->repl_data->read_reploff += c->nread; if (c->flag.primary) { server.stat_net_repl_input_bytes += c->nread; @@ -2563,7 +2573,7 @@ void processInlineBuffer(client *c) { int argc, j, linefeed_chars = 1; sds *argv, aux; size_t querylen; - int is_replication_source = c->read_flags & READ_FLAGS_REPLICATION_SOURCE; + int is_replicated = c->read_flags & READ_FLAGS_REPLICATED; /* Search for end of line */ newline = strchr(c->querybuf + c->qb_pos, '\n'); @@ -2600,7 +2610,7 @@ void processInlineBuffer(client *c) { * * However there is an exception: primaries may send us just a newline * to keep the connection active. */ - if (querylen != 0 && is_replication_source) { + if (querylen != 0 && is_replicated) { sdsfreesplitres(argv, argc); c->read_flags |= READ_FLAGS_ERROR_UNEXPECTED_INLINE_FROM_REPLICATION_SOURCE; return; @@ -2649,7 +2659,7 @@ void processInlineBuffer(client *c) { * CLIENT_PROTOCOL_ERROR. */ #define PROTO_DUMP_LEN 128 static void setProtocolError(const char *errstr, client *c) { - if (server.verbosity <= LL_VERBOSE || c->flag.replication_source) { + if (server.verbosity <= LL_VERBOSE || c->flag.replicated) { sds client = catClientInfoString(sdsempty(), c, server.hide_user_data_from_log); /* Sample some protocol to given an idea about what was inside. */ @@ -2671,7 +2681,7 @@ static void setProtocolError(const char *errstr, client *c) { } /* Log all the client and protocol info. 
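prepareClientToWrite (earlier in this hunk) now also gates ordinary replies to the slot-migration-target client, so nothing is flushed on that connection while the snapshot child is still writing; the buffered stream is only released once the export reaches its paused state. A hedged sketch of such a gate, where the NULL check for the no-export case is an assumption of the sketch rather than a claim about the patch:

    typedef enum { EXPORT_SNAPSHOTTING, EXPORT_PAUSE_AND_REPLY, EXPORT_PAUSED } sketchExportState;

    /* Sketch: allow writes to the export target only once an export job exists
     * and has reached the paused (streaming) state. */
    static int sketchShouldWriteToTarget(const sketchExportState *curr_export_state) {
        if (curr_export_state == NULL) return 0;
        return *curr_export_state == EXPORT_PAUSED;
    }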
*/ - int loglevel = (c->flag.replication_source) ? LL_WARNING : LL_VERBOSE; + int loglevel = (c->flag.replicated) ? LL_WARNING : LL_VERBOSE; serverLog(loglevel, "Protocol error (%s) from client: %s. %s", errstr, client, buf); sdsfree(client); } @@ -2690,7 +2700,7 @@ void processMultibulkBuffer(client *c) { char *newline = NULL; int ok; long long ll; - int is_replication_source = c->read_flags & READ_FLAGS_REPLICATION_SOURCE; + int is_replicated = c->read_flags & READ_FLAGS_REPLICATED; int auth_required = c->read_flags & READ_FLAGS_AUTH_REQUIRED; if (c->multibulklen == 0) { @@ -2794,7 +2804,7 @@ void processMultibulkBuffer(client *c) { size_t bulklen_slen = newline - (c->querybuf + c->qb_pos + 1); ok = string2ll(c->querybuf + c->qb_pos + 1, bulklen_slen, &ll); - if (!ok || ll < 0 || (!(is_replication_source) && ll > server.proto_max_bulk_len)) { + if (!ok || ll < 0 || (!(is_replicated) && ll > server.proto_max_bulk_len)) { c->read_flags |= READ_FLAGS_ERROR_MBULK_INVALID_BULK_LEN; return; } else if (ll > 16384 && auth_required) { @@ -2803,7 +2813,7 @@ void processMultibulkBuffer(client *c) { } c->qb_pos = newline - c->querybuf + 2; - if (!(is_replication_source) && ll >= PROTO_MBULK_BIG_ARG) { + if (!(is_replicated) && ll >= PROTO_MBULK_BIG_ARG) { /* When the client is not a primary client (because primary * client's querybuf can only be trimmed after data applied * and sent to replicas). @@ -2852,7 +2862,7 @@ void processMultibulkBuffer(client *c) { /* Optimization: if a non-primary client's buffer contains JUST our bulk element * instead of creating a new object by *copying* the sds we * just use the current sds string. */ - if (!is_replication_source && c->qb_pos == 0 && c->bulklen >= PROTO_MBULK_BIG_ARG && + if (!is_replicated && c->qb_pos == 0 && c->bulklen >= PROTO_MBULK_BIG_ARG && sdslen(c->querybuf) == (size_t)(c->bulklen + 2)) { c->argv[c->argc++] = createObject(OBJ_STRING, c->querybuf); c->argv_len_sum += c->bulklen; @@ -2902,7 +2912,7 @@ void commandProcessed(client *c) { if (!c->repl_data) return; long long prev_offset = c->repl_data->reploff; - if (!c->flag.multi && c->flag.replication_source) { + if (!c->flag.multi && c->flag.replicated) { /* Update the applied replication offset of our source. */ c->repl_data->reploff = c->repl_data->read_reploff - sdslen(c->querybuf) + c->qb_pos; } @@ -2913,7 +2923,7 @@ void commandProcessed(client *c) { * applied to the replication state: this quantity, and its corresponding * part of the replication stream, will be propagated to the * sub-replicas and to the replication backlog. */ - if (c->flag.replication_source) { + if (c->flag.replicated) { long long applied = c->repl_data->reploff - prev_offset; if (applied) { replicationFeedStreamFromPrimaryStream(c->querybuf + c->repl_data->repl_applied, applied); @@ -3021,7 +3031,7 @@ int canParseCommand(client *c) { * condition on the replica. We want just to accumulate the replication * stream (instead of replying -BUSY like we do with other clients) and * later resume the processing. */ - if (isInsideYieldingLongCommand() && c->flag.replication_source) return 0; + if (isInsideYieldingLongCommand() && c->flag.replicated) return 0; /* CLIENT_CLOSE_AFTER_REPLY closes the connection once the reply is * written to the client. Make sure to not let the reply grow after @@ -3040,7 +3050,7 @@ int processInputBuffer(client *c) { break; } - c->read_flags = c->flag.replication_source ? READ_FLAGS_REPLICATION_SOURCE : 0; + c->read_flags = c->flag.replicated ? 
READ_FLAGS_REPLICATED : 0; c->read_flags |= authRequired(c) ? READ_FLAGS_AUTH_REQUIRED : 0; parseCommand(c); @@ -3083,7 +3093,7 @@ void readToQueryBuf(client *c) { /* If the replica RDB client is marked as closed ASAP, do not try to read from it */ if (c->flag.close_asap) return; - int is_replication_source = c->read_flags & READ_FLAGS_REPLICATION_SOURCE; + int is_replicated = c->read_flags & READ_FLAGS_REPLICATED; readlen = PROTO_IOBUF_LEN; qblen = c->querybuf ? sdslen(c->querybuf) : 0; @@ -3104,7 +3114,7 @@ void readToQueryBuf(client *c) { /* Primary client needs expand the readlen when meet BIG_ARG(see #9100), * but doesn't need align to the next arg, we can read more data. */ - if (c->flag.replication_source && readlen < PROTO_IOBUF_LEN) readlen = PROTO_IOBUF_LEN; + if (c->flag.replicated && readlen < PROTO_IOBUF_LEN) readlen = PROTO_IOBUF_LEN; } if (c->querybuf == NULL) { @@ -3117,7 +3127,7 @@ void readToQueryBuf(client *c) { * Although we have ensured that c->querybuf will not be expanded in the current * thread_shared_qb, we still add this check for code robustness. */ int use_thread_shared_qb = (c->querybuf == thread_shared_qb) ? 1 : 0; - if (!is_replication_source && /* replication client's querybuf can grow greedy. */ + if (!is_replicated && /* replication client's querybuf can grow greedy. */ (big_arg || sdsalloc(c->querybuf) < PROTO_IOBUF_LEN)) { /* When reading a BIG_ARG we won't be reading more than that one arg * into the query buffer, so we don't need to pre-allocate more than we @@ -3144,7 +3154,7 @@ void readToQueryBuf(client *c) { sdsIncrLen(c->querybuf, c->nread); qblen = sdslen(c->querybuf); if (c->querybuf_peak < qblen) c->querybuf_peak = qblen; - if (!is_replication_source) { + if (!is_replicated) { /* The commands cached in the MULTI/EXEC queue have not been executed yet, * so they are also considered a part of the query buffer in a broader sense. * @@ -3465,7 +3475,7 @@ void resetCommand(client *c) { flags.replica = 0; } - if (flags.replica || flags.replication_source || flags.module) { + if (flags.replica || flags.replicated || flags.module) { addReplyError(c, "can only reset normal client connections"); return; } @@ -4904,7 +4914,7 @@ void ioThreadReadQueryFromClient(void *data) { done: /* Only trim query buffer for non-primary clients * Primary client's buffer is handled by main thread using repl_applied position */ - if (!(c->read_flags & READ_FLAGS_REPLICATION_SOURCE)) { + if (!(c->read_flags & READ_FLAGS_REPLICATED)) { trimClientQueryBuffer(c); } atomic_thread_fence(memory_order_release); diff --git a/src/rdb.c b/src/rdb.c index 7bb9edf31f..ba5d219452 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -1870,7 +1870,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { if (server.sanitize_dump_payload == SANITIZE_DUMP_CLIENTS) { /* Skip sanitization when loading (an RDB), or getting a RESTORE command * from either a replication source or a client using an ACL user with the skip-sanitize-payload flag. 
*/ - int skip = server.loading || (server.current_client && (server.current_client->flag.replication_source)); + int skip = server.loading || (server.current_client && (server.current_client->flag.replicated)); if (!skip && server.current_client && server.current_client->user) skip = !!(server.current_client->user->flags & USER_FLAG_SANITIZE_PAYLOAD_SKIP); deep_integrity_validation = !skip; @@ -3525,16 +3525,14 @@ void killRDBChild(void) { * - rdbRemoveTempFile */ } -/* Spawn an RDB child that writes the RDB to the sockets of the replicas - * that are currently in REPLICA_STATE_WAIT_BGSAVE_START state. */ -int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, slotBitmap slot_bitmap) { - listNode *ln; - listIter li; +/* Save snapshot to the provided connections, spawning a child process and + * running the provided function. + * + * Connections array provided will be freed after the save is completed, and + * should not be freed by the caller. */ +int saveSnapshotToConnectionSockets(connection **conns, int connsnum, int use_pipe, int req, ChildSnapshotFunc snapshot_func, void *privdata) { pid_t childpid; int pipefds[2], rdb_pipe_write = 0, safe_to_exit_pipe = 0; - int dual_channel = (req & REPLICA_REQ_RDB_CHANNEL); - int aof = (req & REPLICA_REQ_AOF_FORMAT); - if (hasActiveChildProcess()) return C_ERR; serverAssert(server.rdb_pipe_read == -1 && server.rdb_child_exit_pipe == -1); @@ -3542,7 +3540,7 @@ int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, slotBitmap slot_bitmap) * drained the pipe. */ if (server.rdb_pipe_conns) return C_ERR; - if (!dual_channel) { + if (use_pipe) { /* Before to fork, create a pipe that is used to transfer the rdb bytes to * the parent, we can't let it write directly to the sockets, since in case * of TLS we must let the parent handle a continuous TLS state when the @@ -3561,49 +3559,20 @@ int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, slotBitmap slot_bitmap) safe_to_exit_pipe = pipefds[0]; /* read end */ server.rdb_child_exit_pipe = pipefds[1]; /* write end */ } - /* Collect the connections of the replicas we want to transfer - * the RDB to, which are in WAIT_BGSAVE_START state. */ - int connsnum = 0; - connection **conns = zmalloc(sizeof(connection *) * listLength(server.replicas)); server.rdb_pipe_conns = NULL; - if (!dual_channel) { + if (use_pipe) { server.rdb_pipe_conns = conns; server.rdb_pipe_numconns = 0; server.rdb_pipe_numconns_writing = 0; + } else { + server.rdb_pipe_numconns = connsnum; } - /* Filter replica connections pending full sync (ie. in WAIT_BGSAVE_START state). */ - listRewind(server.replicas, &li); - while ((ln = listNext(&li))) { - client *replica = ln->value; - if (replica->repl_data->repl_state == REPLICA_STATE_WAIT_BGSAVE_START) { - /* Check replica has the exact requirements */ - if (replica->repl_data->replica_req != req) continue; - /* Check matching slot bitmaps. */ - if (memcmp(replica->repl_data->slot_bitmap, slot_bitmap, sizeof(slotBitmap)) != 0) continue; - - conns[connsnum++] = replica->conn; - if (dual_channel) { - connSendTimeout(replica->conn, server.repl_timeout * 1000); - /* This replica uses diskless dual channel sync, hence we need - * to inform it with the save end offset.*/ - sendCurrentOffsetToReplica(replica); - /* Make sure repl traffic is appended to the replication backlog */ - addRdbReplicaToPsyncWait(replica); - /* Put the socket in blocking mode to simplify RDB transfer. 
*/ - connBlock(replica->conn); - } else { - server.rdb_pipe_numconns++; - } - replicationSetupReplicaForFullResync(replica, getPsyncInitialOffset()); - } - } - /* Create the child process. */ if ((childpid = serverFork(CHILD_TYPE_RDB)) == 0) { /* Child */ int retval, dummy; rio rdb; - if (dual_channel) { + if (!use_pipe) { rioInitWithConnset(&rdb, conns, connsnum); } else { rioInitWithFd(&rdb, rdb_pipe_write); @@ -3611,7 +3580,7 @@ int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, slotBitmap slot_bitmap) /* Close the reading part, so that if the parent crashes, the child will * get a write error and exit. */ - if (!dual_channel) close(server.rdb_pipe_read); + if (use_pipe) close(server.rdb_pipe_read); if (strstr(server.exec_argv[0], "redis-server") != NULL) { serverSetProcTitle("redis-rdb-to-slaves"); } else { @@ -3619,22 +3588,13 @@ int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, slotBitmap slot_bitmap) } serverSetCpuAffinity(server.bgsave_cpulist); - if (aof) { - serverLog(LL_NOTICE, "Background AOF transfer started by pid %ld", (long)getpid()); - retval = rewriteAppendOnlyFileRio(&rdb, slot_bitmap); - rioWrite(&rdb, "*3\r\n", 4); - rioWriteBulkString(&rdb, "REPLCONF", 8); - rioWriteBulkString(&rdb, "AOF-PAYLOAD-END", 15); - rioWriteBulkLongLong(&rdb, server.primary_repl_offset); - } else { - retval = rdbSaveRioWithEOFMark(req, &rdb, NULL, rsi); - } + retval = snapshot_func(req, &rdb, privdata); if (retval == C_OK && rioFlush(&rdb) == 0) retval = C_ERR; if (retval == C_OK) { sendChildCowInfo(CHILD_INFO_TYPE_RDB_COW_SIZE, "RDB"); } - if (dual_channel) { + if (!use_pipe) { rioFreeConnset(&rdb); } else { rioFreeFd(&rdb); @@ -3645,7 +3605,7 @@ int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, slotBitmap slot_bitmap) zfree(conns); /* hold exit until the parent tells us it's safe. we're not expecting * to read anything, just get the error when the pipe is closed. */ - if (!dual_channel) dummy = read(safe_to_exit_pipe, pipefds, 1); + if (use_pipe) dummy = read(safe_to_exit_pipe, pipefds, 1); UNUSED(dummy); exitFromChild((retval == C_OK) ? 0 : 1); } else { @@ -3653,23 +3613,13 @@ int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, slotBitmap slot_bitmap) if (childpid == -1) { serverLog(LL_WARNING, "Can't save in background: fork: %s", strerror(errno)); - /* Undo the state change. The caller will perform cleanup on - * all the replicas in BGSAVE_START state, but an early call to - * replicationSetupReplicaForFullResync() turned it into BGSAVE_END */ - listRewind(server.replicas, &li); - while ((ln = listNext(&li))) { - client *replica = ln->value; - if (replica->repl_data->repl_state == REPLICA_STATE_WAIT_BGSAVE_END) { - replica->repl_data->repl_state = REPLICA_STATE_WAIT_BGSAVE_START; - } - } - if (!dual_channel) { + if (use_pipe) { close(rdb_pipe_write); close(server.rdb_pipe_read); close(server.rdb_child_exit_pipe); } zfree(conns); - if (dual_channel) { + if (!use_pipe) { closeChildInfoPipe(); } else { server.rdb_pipe_conns = NULL; @@ -3678,10 +3628,10 @@ int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, slotBitmap slot_bitmap) } } else { serverLog(LL_NOTICE, "Background RDB transfer started by pid %ld to %s", (long)childpid, - dual_channel ? "direct socket to replica" : "pipe through parent process"); + !use_pipe ? 
"direct socket to replica" : "pipe through parent process"); server.rdb_save_time_start = time(NULL); server.rdb_child_type = RDB_CHILD_TYPE_SOCKET; - if (dual_channel) { + if (!use_pipe) { /* For dual channel sync, the main process no longer requires these RDB connections. */ zfree(conns); } else { @@ -3692,12 +3642,70 @@ int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, slotBitmap slot_bitmap) } } } - if (!dual_channel) close(safe_to_exit_pipe); + if (use_pipe) close(safe_to_exit_pipe); return (childpid == -1) ? C_ERR : C_OK; } return C_OK; /* Unreached. */ } +int childSnapshotUsingRDB(int req, rio *rdb, void *privdata) { + return rdbSaveRioWithEOFMark(req, rdb, NULL, (rdbSaveInfo *)privdata); +} + +/* Spawn an RDB child that writes the RDB to the sockets of the replicas + * that are currently in REPLICA_STATE_WAIT_BGSAVE_START state. */ +int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi) { + listNode *ln; + listIter li; + int dual_channel = (req & REPLICA_REQ_RDB_CHANNEL); + + /* Collect the connections of the replicas we want to transfer + * the RDB to, which are i WAIT_BGSAVE_START state. */ + int connsnum = 0; + connection **conns = zmalloc(sizeof(connection *) * listLength(server.replicas)); + + /* Filter replica connections pending full sync (ie. in WAIT_BGSAVE_START state). */ + listRewind(server.replicas, &li); + while ((ln = listNext(&li))) { + client *replica = ln->value; + if (replica->repl_data->repl_state == REPLICA_STATE_WAIT_BGSAVE_START) { + /* Check replica has the exact requirements */ + if (replica->repl_data->replica_req != req) continue; + + conns[connsnum++] = replica->conn; + if (dual_channel) { + connSendTimeout(replica->conn, server.repl_timeout * 1000); + /* This replica uses diskless dual channel sync, hence we need + * to inform it with the save end offset.*/ + sendCurrentOffsetToReplica(replica); + /* Make sure repl traffic is appended to the replication backlog */ + addRdbReplicaToPsyncWait(replica); + /* Put the socket in blocking mode to simplify RDB transfer. */ + connBlock(replica->conn); + } + replicationSetupReplicaForFullResync(replica, getPsyncInitialOffset()); + } + } + + int retval = saveSnapshotToConnectionSockets(conns, connsnum, !dual_channel, req, childSnapshotUsingRDB, (void *) rsi); + + if (retval != C_OK) { + serverLog(LL_WARNING, "Can't save in background: fork: %s", strerror(errno)); + + /* Undo the state change. 
The caller will perform cleanup on + * all the replicas in BGSAVE_START state, but an early call to + * replicationSetupReplicaForFullResync() turned it into BGSAVE_END */ + listRewind(server.replicas, &li); + while ((ln = listNext(&li))) { + client *replica = ln->value; + if (replica->repl_data->repl_state == REPLICA_STATE_WAIT_BGSAVE_END) { + replica->repl_data->repl_state = REPLICA_STATE_WAIT_BGSAVE_START; + } + } + } + return retval; +} + void saveCommand(client *c) { if (server.child_type == CHILD_TYPE_RDB) { addReplyError(c, "Background save already in progress"); diff --git a/src/rdb.h b/src/rdb.h index 734ae7ba72..7342a926b5 100644 --- a/src/rdb.h +++ b/src/rdb.h @@ -152,7 +152,7 @@ int rdbSaveObjectType(rio *rdb, robj *o); int rdbLoadObjectType(rio *rdb); int rdbLoad(char *filename, rdbSaveInfo *rsi, int rdbflags); int rdbSaveBackground(int req, char *filename, rdbSaveInfo *rsi, int rdbflags); -int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi, slotBitmap slot_bitmap); +int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi); void rdbRemoveTempFile(pid_t childpid, int from_signal); int rdbSaveToFile(const char *filename); int rdbSave(int req, char *filename, rdbSaveInfo *rsi, int rdbflags); diff --git a/src/replication.c b/src/replication.c index 9abf5cead3..bcb9e0a756 100644 --- a/src/replication.c +++ b/src/replication.c @@ -49,6 +49,7 @@ void replicationDiscardCachedPrimary(void); void replicationResurrectCachedPrimary(connection *conn); void replicationResurrectProvisionalPrimary(void); +void replicationSendAck(void); int replicaPutOnline(client *replica); void replicaStartCommandStream(client *replica); int cancelReplicationHandshake(int reconnect); @@ -951,7 +952,7 @@ int primaryTryPartialResynchronization(client *c, long long psync_offset) { * started. * * Returns C_OK on success or C_ERR otherwise. */ -int startBgsaveForReplication(int mincapa, int req, slotBitmap slot_bitmap) { +int startBgsaveForReplication(int mincapa, int req) { int retval; int socket_target = 0; listIter li; @@ -960,14 +961,13 @@ int startBgsaveForReplication(int mincapa, int req, slotBitmap slot_bitmap) { /* We use a socket target if replica can handle the EOF marker and we're configured to do diskless syncs. * Note that in case we're creating a "filtered" RDB (functions-only, for example) we also force socket replication * to avoid overwriting the snapshot RDB file with filtered data. */ - socket_target = (server.repl_diskless_sync || req & REPLICA_REQ_RDB_MASK) && (mincapa & REPLICA_CAPA_EOF || req & REPLICA_REQ_AOF_FORMAT); + socket_target = (server.repl_diskless_sync || req & REPLICA_REQ_RDB_MASK) && (mincapa & REPLICA_CAPA_EOF); /* `SYNC` should have failed with error if we don't support socket and require a filter, assert this here */ serverAssert(socket_target || !(req & REPLICA_REQ_RDB_MASK)); - serverLog(LL_NOTICE, "Starting BGSAVE for SYNC with target: %s using: %s with format: %s", + serverLog(LL_NOTICE, "Starting BGSAVE for SYNC with target: %s using: %s", socket_target ? "replicas sockets" : "disk", - (req & REPLICA_REQ_RDB_CHANNEL) ? "dual-channel" : "normal sync", - (req & REPLICA_REQ_AOF_FORMAT) ? "AOF" : "RDB"); + (req & REPLICA_REQ_RDB_CHANNEL) ? "dual-channel" : "normal sync"); rdbSaveInfo rsi, *rsiptr; rsiptr = rdbPopulateSaveInfo(&rsi); @@ -975,7 +975,7 @@ int startBgsaveForReplication(int mincapa, int req, slotBitmap slot_bitmap) { * otherwise replica will miss repl-stream-db. 
*/ if (rsiptr) { if (socket_target) - retval = rdbSaveToReplicasSockets(req, rsiptr, slot_bitmap); + retval = rdbSaveToReplicasSockets(req, rsiptr); else { /* Keep the page cache since it'll get used soon */ retval = rdbSaveBackground(req, server.rdb_filename, rsiptr, RDBFLAGS_REPLICATION | RDBFLAGS_KEEP_CACHE); @@ -1193,7 +1193,7 @@ void syncCommand(client *c) { * capabilities of the replica that triggered the current BGSAVE * and its exact requirements. */ if (ln && ((c->repl_data->replica_capa & replica->repl_data->replica_capa) == replica->repl_data->replica_capa) && - c->repl_data->replica_req == replica->repl_data->replica_req && isSlotBitmapEmpty(c->repl_data->slot_bitmap)) { + c->repl_data->replica_req == replica->repl_data->replica_req) { /* Perfect, the server is already registering differences for * another replica. Set the right state, and copy the buffer. * We don't copy buffer if clients don't want. */ @@ -1215,7 +1215,7 @@ void syncCommand(client *c) { /* CASE 3: There is no BGSAVE is in progress. */ } else { - if (server.repl_diskless_sync && (c->repl_data->replica_capa & REPLICA_CAPA_EOF) && server.repl_diskless_sync_delay && isSlotBitmapEmpty(c->repl_data->slot_bitmap)) { + if (server.repl_diskless_sync && (c->repl_data->replica_capa & REPLICA_CAPA_EOF) && server.repl_diskless_sync_delay) { /* Diskless replication RDB child is created inside * replicationCron() since we want to delay its start a * few seconds to wait for more replicas to arrive. */ @@ -1224,7 +1224,7 @@ void syncCommand(client *c) { /* We don't have a BGSAVE in progress, let's start one. Diskless * or disk-based mode is determined by replica's capacity. */ if (!hasActiveChildProcess()) { - startBgsaveForReplication(c->repl_data->replica_capa, c->repl_data->replica_req, c->repl_data->slot_bitmap); + startBgsaveForReplication(c->repl_data->replica_capa, c->repl_data->replica_req); } else { serverLog(LL_NOTICE, "No BGSAVE in progress, but another BG operation is active. " "BGSAVE for replication delayed"); @@ -1254,7 +1254,6 @@ int anyOtherReplicaWaitRdb(client *except_me) { void initClientReplicationData(client *c) { if (c->repl_data) return; c->repl_data = (ClientReplicationData *)zcalloc(sizeof(ClientReplicationData)); - memset(c->repl_data->slot_bitmap, 0, sizeof(c->repl_data->slot_bitmap)); } void freeClientReplicationData(client *c) { @@ -1421,7 +1420,7 @@ void replconfCommand(client *c) { } else if (!strcasecmp(c->argv[j]->ptr, "getack")) { /* REPLCONF GETACK is used in order to request an ACK ASAP * to the replica. */ - if (server.primary_host && server.primary) replicationSendAck(server.primary); + if (server.primary_host && server.primary) replicationSendAck(); return; } else if (!strcasecmp(c->argv[j]->ptr, "rdb-only")) { /* REPLCONF RDB-ONLY is used to identify the client only wants @@ -1492,41 +1491,6 @@ void replconfCommand(client *c) { return; } c->repl_data->associated_rdb_client_id = (uint64_t)client_id; - } else if (!strcasecmp(c->argv[j]->ptr, "slot-bitmap")) { - /* REPLCONF slot-bitmap is used to filter the replication stream to just a set number of slots. 
*/ - if (!server.cluster_enabled) { - addReplyError(c, "Cannot replicate a slot when cluster mode is disabled"); - } - if (stringObjectLen(c->argv[j + 1]) != sizeof(slotBitmap)) { - addReplyError(c, "Invalid slot bitmap length"); - return; - } - for (int slot = 0; slot <= CLUSTER_SLOTS; slot++) { - if (bitmapTestBit(c->argv[j + 1]->ptr, slot) && server.cluster->slots[slot] != server.cluster->myself) { - addReplyErrorFormat(c, "I cannot replicate slot %d since I do not own it", slot); - return; - } - } - memcpy(c->repl_data->slot_bitmap, c->argv[j + 1]->ptr, sizeof(slotBitmap)); - - /* For now, we only support AOF for slot transfer. */ - c->repl_data->replica_req |= REPLICA_REQ_AOF_FORMAT; - } else if (!strcasecmp(c->argv[j]->ptr, "aof-payload-end")) { - /* REPLCONF aof-payload-end is used to inform the target - * that the replication source has finished sending the AOF formatted - * sync snapshot, and that it is free to begin processing the - * replication backlog. */ - long long initial_offset = 0; - if (getLongLongFromObjectOrReply(c, c->argv[j + 1], &initial_offset, NULL) != C_OK) { - return; - } - if (c->flag.slot_migration_source) { - clusterSlotMigrationDoneSyncing(initial_offset); - return; - } - /* Right now, we only support this for slot migration. */ - addReplyErrorFormat(c, "AOF sync is not in progress."); - return; } else { addReplyErrorFormat(c, "Unrecognized REPLCONF option: %s", (char *)c->argv[j]->ptr); return; @@ -2019,7 +1983,7 @@ void replicationCreatePrimaryClientWithHandler(connection *conn, int dbid, Conne * connection. */ server.primary->flag.primary = 1; server.primary->flag.authenticated = 1; - server.primary->flag.replication_source = 1; + server.primary->flag.replicated = 1; /* Allocate a private query buffer for the primary client instead of using the shared query buffer. * This is done because the primary's query buffer data needs to be preserved for my sub-replicas to use. */ @@ -2517,7 +2481,7 @@ void readSyncBulkPayload(connection *conn) { server.repl_state = REPL_STATE_CONNECTED; server.repl_down_since = 0; /* Send the initial ACK immediately to put this replica in online state. */ - replicationSendAck(server.primary); + replicationSendAck(); } /* Fire the primary link modules event. */ @@ -3060,7 +3024,7 @@ void dualChannelSyncSuccess(void) { dualChannelServerLog(LL_NOTICE, "Successfully streamed replication data into memory"); /* We can resume reading from the primary connection once the local replication buffer has been loaded. */ replicationSteadyStateInit(); - replicationSendAck(server.primary); /* Send ACK to notify primary that replica is synced */ + replicationSendAck(); /* Send ACK to notify primary that replica is synced */ server.rdb_client_id = -1; server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_STATE_NONE; } @@ -3445,8 +3409,94 @@ void dualChannelSetupMainConnForPsync(connection *conn) { sdsfree(err); } -int replicationProceedWithHandshake(connection *conn, int curr_state, slotBitmap slot_bitmap) { - char *err = NULL; +/* + * Dual channel for full sync + * + * * Motivation * + * - Reduce primary memory load. We do that by moving the COB tracking to the replica side. This also decrease + * the chance for COB overruns. Note that primary's input buffer limits at the replica side are less restricted + * then primary's COB as the replica plays less critical part in the replication group. While increasing the + * primary's COB may end up with primary reaching swap and clients suffering, at replica side we're more at + * ease with it. 
Larger COB means better chance to sync successfully. + * - Reduce primary main process CPU load. By opening a new, dedicated channel for the RDB transfer, child + * processes can have direct access to the new channel. Due to TLS connection restrictions, this was not + * possible using one main channel. We eliminate the need for the child process to use the primary's + * child-proc -> main-proc pipeline, thus freeing up the main process to process clients queries. + * + * * High level interface design * + * - Dual channel sync begins when the replica sends a REPLCONF capa dual-channel to the primary during initial + * handshake. This allows the replica to verify whether the primary supports dual-channel-replication and, if + * so, state that this is the replica's main channel, which is not used for snapshot transfer. + * - When replica lacks sufficient data for PSYNC, the primary will send +DUALCHANNELSYNC response instead + * of RDB data. As a next step, the replica creates a new channel (rdb-channel) and configures it against + * the primary with the appropriate capabilities and requirements. The replica then requests a sync + * using the RDB channel. + * - Prior to forking, the primary sends the replica the snapshot's end repl-offset, and attaches the replica + * to the replication backlog to keep repl data until the replica requests psync. The replica uses the main + * channel to request a PSYNC starting at the snapshot end offset. + * - The primary main threads sends incremental changes via the main channel, while the bgsave process + * sends the RDB directly to the replica via the rdb-channel. As for the replica, the incremental + * changes are stored on a local buffer, while the RDB is loaded into memory. + * - Once the replica completes loading the rdb, it drops the rdb channel and streams the accumulated incremental + * changes into memory. Repl steady state continues normally. 
+ * + * * Replica state machine * + * ┌───────────────────┐ Dual channel sync + * │RECEIVE_PING_REPLY │ ┌──────────────────────────────────────────────────────────────┐ + * └────────┬──────────┘ │ RDB channel states Main channel state │ + * │+PONG │ ┌────────────────────────────┐ ┌───────────────────┐ │ + * ┌────────▼──────────┐ ┌─┼─────►DUAL_CHANNEL_SEND_HANDSHAKE │ ┌─►SEND_HANDSHAKE │ │ + * │SEND_HANDSHAKE │ │ │ └────┬───────────────────────┘ │ └──┬────────────────┘ │ + * └────────┬──────────┘ │ │ │ │ │REPLCONF set-rdb-client-id + * │ │ │ ┌───────▼───────────────────────┐ │ ┌──▼────────────────┐ │ + * ┌────────▼──────────┐ │ │ │DUAL_CHANNEL_RECEIVE_AUTH_REPLY│ │ │RECEIVE_CAPA_REPLY │ │ + * │RECEIVE_AUTH_REPLY │ │ │ └───────┬───────────────────────┘ │ └──┬────────────────┘ │ + * └────────┬──────────┘ │ │ │+OK │ │+OK │ + * │+OK │ │ ┌───────▼───────────────────────┐ │ ┌──▼────────────────┐ │ + * ┌────────▼──────────┐ │ │ │DUAL_CHANNEL_RECEIVE_REPLCONF_REPLY│SEND_PSYNC │ │ + * │RECEIVE_PORT_REPLY │ │ │ └───────┬───────────────────────┘ │ └──┬────────────────┘ │ + * └────────┬──────────┘ │ │ │+OK │ │PSYNC use snapshot │ + * │+OK │ │ ┌───────▼───────────────────┐ │ │end-offset provided │ + * ┌────────▼──────────┐ │ │ │DUAL_CHANNEL_RECEIVE_ENDOFF│ │ │by the primary │ + * │RECEIVE_IP_REPLY │ │ │ └───────┬───────────────────┘ │ ┌──▼────────────────┐ │ + * └────────┬──────────┘ │ │ │$ENDOFF │ │RECEIVE_PSYNC_REPLY│ │ + * │+OK │ │ ├─────────────────────────┘ └──┬────────────────┘ │ + * ┌────────▼──────────┐ │ │ │ │+CONTINUE │ + * │RECEIVE_CAPA_REPLY │ │ │ ┌───────▼───────────────┐ ┌──▼────────────────┐ │ + * └────────┬──────────┘ │ │ │DUAL_CHANNEL_RDB_LOAD │ │TRANSFER │ │ + * │+OK │ │ └───────┬───────────────┘ └─────┬─────────────┘ │ + * ┌────────▼─────────────┐ │ │ │Done loading │ │ + * │RECEIVE_VERSION_REPLY │ │ │ ┌───────▼───────────────┐ │ │ + * └────────┬─────────────┘ │ │ │DUAL_CHANNEL_RDB_LOADED│ │ │ + * │+OK │ │ └───────┬───────────────┘ │ │ + * ┌────────▼───┐ │ │ │ │ │ + * │SEND_PSYNC │ │ │ │Replica loads local replication │ │ + * └─┬──────────┘ │ │ │buffer into memory │ │ + * │PSYNC (use cached-primary)│ │ └─────────┬───────────────────────┘ │ + * ┌─▼─────────────────┐ │ │ │ │ + * │RECEIVE_PSYNC_REPLY│ │ └────────────────────┼─────────────────────────────────────────┘ + * └────────┬─┬────────┘ │ │ + * +CONTINUE│ │+DUALCHANNELSYNC │ │ + * │ │ └─────────────────┘ │ + * │ │+FULLRESYNC │ + * │ ┌─▼─────────────────┐ ┌────▼──────────────┐ + * │ │TRANSFER ├───────────────────►CONNECTED │ + * │ └───────────────────┘ └────▲──────────────┘ + * │ │ + * └─────────────────────────────────────────────────┘ + */ +/* This handler fires when the non blocking connect was able to + * establish a connection with the primary. */ +void syncWithPrimary(connection *conn) { + char tmpfile[256], *err = NULL; + int psync_result; + + /* If this event fired after the user turned the instance into a primary + * with REPLICAOF NO ONE we must just return ASAP. */ + if (server.repl_state == REPL_STATE_NONE) { + connClose(conn); + return; + } /* Check for errors in the socket: after a non blocking connect() we * may find that the socket is in error state. */ @@ -3456,16 +3506,22 @@ int replicationProceedWithHandshake(connection *conn, int curr_state, slotBitmap } /* Send a PING to check the primary is able to reply without errors. 
*/ - if (curr_state == REPL_STATE_CONNECTING) { + if (server.repl_state == REPL_STATE_CONNECTING) { serverLog(LL_NOTICE, "Non blocking connect for SYNC fired the event."); + /* Delete the writable event so that the readable event remains + * registered and we can wait for the PONG reply. */ + connSetReadHandler(conn, syncWithPrimary); + connSetWriteHandler(conn, NULL); + server.repl_state = REPL_STATE_RECEIVE_PING_REPLY; /* Send the PING, don't check for errors at all, we have the timeout * that will take care about this. */ err = sendCommand(conn, "PING", NULL); if (err) goto write_error; - return REPL_STATE_RECEIVE_PING_REPLY; + return; } - /* Receive the PONG command. */ - if (curr_state == REPL_STATE_RECEIVE_PING_REPLY) { + + /* Receive the PONG command. */ + if (server.repl_state == REPL_STATE_RECEIVE_PING_REPLY) { err = receiveSynchronousResponse(conn); /* The primary did not reply */ @@ -3486,10 +3542,10 @@ int replicationProceedWithHandshake(connection *conn, int curr_state, slotBitmap } sdsfree(err); err = NULL; - curr_state = REPL_STATE_SEND_HANDSHAKE; + server.repl_state = REPL_STATE_SEND_HANDSHAKE; } - if (curr_state == REPL_STATE_SEND_HANDSHAKE) { + if (server.repl_state == REPL_STATE_SEND_HANDSHAKE) { /* AUTH with the primary if required. */ if (server.primary_auth) { char *args[3] = {"AUTH", NULL, NULL}; @@ -3524,16 +3580,6 @@ int replicationProceedWithHandshake(connection *conn, int curr_state, slotBitmap if (err) goto write_error; } - /* Set the slot bitmap, so that the primary only provides us with the appropriate slot dictionary. */ - if (slot_bitmap != NULL && !isSlotBitmapEmpty(slot_bitmap)) { - char *argv[3] = {"REPLCONF", "slot-bitmap", NULL}; - size_t lens[3] = {8, 11, 0}; - argv[2] = (char *)slot_bitmap; - lens[2] = sizeof(slotBitmap); - err = sendCommandArgv(conn, 3, argv, lens); - if (err) goto write_error; - } - /* Inform the primary of our (replica) capabilities. * * EOF: supports EOF-style RDB transfer for diskless replication. @@ -3549,14 +3595,15 @@ int replicationProceedWithHandshake(connection *conn, int curr_state, slotBitmap err = sendCommand(conn, "REPLCONF", "version", VALKEY_VERSION, NULL); if (err) goto write_error; - return REPL_STATE_RECEIVE_AUTH_REPLY; + server.repl_state = REPL_STATE_RECEIVE_AUTH_REPLY; + return; } - if (curr_state == REPL_STATE_RECEIVE_AUTH_REPLY && !server.primary_auth) - curr_state = REPL_STATE_RECEIVE_PORT_REPLY; + if (server.repl_state == REPL_STATE_RECEIVE_AUTH_REPLY && !server.primary_auth) + server.repl_state = REPL_STATE_RECEIVE_PORT_REPLY; /* Receive AUTH reply. */ - if (curr_state == REPL_STATE_RECEIVE_AUTH_REPLY) { + if (server.repl_state == REPL_STATE_RECEIVE_AUTH_REPLY) { err = receiveSynchronousResponse(conn); if (err == NULL) goto no_response_error; if (err[0] == '-') { @@ -3566,11 +3613,12 @@ int replicationProceedWithHandshake(connection *conn, int curr_state, slotBitmap } sdsfree(err); err = NULL; - return REPL_STATE_RECEIVE_PORT_REPLY; + server.repl_state = REPL_STATE_RECEIVE_PORT_REPLY; + return; } /* Receive REPLCONF listening-port reply. 
*/ - if (curr_state == REPL_STATE_RECEIVE_PORT_REPLY) { + if (server.repl_state == REPL_STATE_RECEIVE_PORT_REPLY) { err = receiveSynchronousResponse(conn); if (err == NULL) goto no_response_error; /* Ignore the error if any, not all the Redis OSS versions support @@ -3582,14 +3630,15 @@ int replicationProceedWithHandshake(connection *conn, int curr_state, slotBitmap err); } sdsfree(err); - return REPL_STATE_RECEIVE_IP_REPLY; + server.repl_state = REPL_STATE_RECEIVE_IP_REPLY; + return; } - if (curr_state == REPL_STATE_RECEIVE_IP_REPLY && !server.replica_announce_ip) - curr_state = REPL_STATE_RECEIVE_SLOT_REPLY; + if (server.repl_state == REPL_STATE_RECEIVE_IP_REPLY && !server.replica_announce_ip) + server.repl_state = REPL_STATE_RECEIVE_CAPA_REPLY; /* Receive REPLCONF ip-address reply. */ - if (curr_state == REPL_STATE_RECEIVE_IP_REPLY) { + if (server.repl_state == REPL_STATE_RECEIVE_IP_REPLY) { err = receiveSynchronousResponse(conn); if (err == NULL) goto no_response_error; /* Ignore the error if any, not all the Redis OSS versions support @@ -3601,28 +3650,12 @@ int replicationProceedWithHandshake(connection *conn, int curr_state, slotBitmap err); } sdsfree(err); - return REPL_STATE_RECEIVE_SLOT_REPLY; - } - - if (curr_state == REPL_STATE_RECEIVE_SLOT_REPLY && (slot_bitmap == NULL || isSlotBitmapEmpty(slot_bitmap))) - curr_state = REPL_STATE_RECEIVE_CAPA_REPLY; - - if (curr_state == REPL_STATE_RECEIVE_SLOT_REPLY) { - err = receiveSynchronousResponse(conn); - if (err == NULL) goto no_response_error; - /* If we sent the slot bitmap, we need it to be properly acked, or we can't do slot migration. */ - if (err[0] == '-') { - serverLog(LL_WARNING, "Source does not understand REPLCONF slot-num. Cannot continue with slot-level sync: %s", err); - sdsfree(err); - goto error; - } - sdsfree(err); - return REPL_STATE_RECEIVE_CAPA_REPLY; + server.repl_state = REPL_STATE_RECEIVE_CAPA_REPLY; + return; } - /* Receive CAPA reply. */ - if (curr_state == REPL_STATE_RECEIVE_CAPA_REPLY) { + if (server.repl_state == REPL_STATE_RECEIVE_CAPA_REPLY) { err = receiveSynchronousResponse(conn); if (err == NULL) goto no_response_error; /* Ignore the error if any, not all the Redis OSS versions support @@ -3635,11 +3668,12 @@ int replicationProceedWithHandshake(connection *conn, int curr_state, slotBitmap } sdsfree(err); err = NULL; - return REPL_STATE_RECEIVE_VERSION_REPLY; + server.repl_state = REPL_STATE_RECEIVE_VERSION_REPLY; + return; } /* Receive VERSION reply. */ - if (curr_state == REPL_STATE_RECEIVE_VERSION_REPLY) { + if (server.repl_state == REPL_STATE_RECEIVE_VERSION_REPLY) { err = receiveSynchronousResponse(conn); if (err == NULL) goto no_response_error; /* Ignore the error if any. Valkey >= 8 supports REPLCONF VERSION. */ @@ -3651,125 +3685,7 @@ int replicationProceedWithHandshake(connection *conn, int curr_state, slotBitmap } sdsfree(err); err = NULL; - return REPL_STATE_SEND_PSYNC; - } - - -no_response_error: /* Handle receiveSynchronousResponse() error when primary has no reply */ - serverLog(LL_WARNING, "Primary did not respond to command during SYNC handshake"); - /* Fall through to regular error handling */ - -error: - return REPL_STATE_ERROR; - -write_error: /* Handle sendCommand() errors. */ - serverLog(LL_WARNING, "Sending command to primary in replication handshake: %s", err); - sdsfree(err); - goto error; -} - -/* - * Dual channel for full sync - * - * * Motivation * - * - Reduce primary memory load. We do that by moving the COB tracking to the replica side. 
This also decrease - * the chance for COB overruns. Note that primary's input buffer limits at the replica side are less restricted - * then primary's COB as the replica plays less critical part in the replication group. While increasing the - * primary's COB may end up with primary reaching swap and clients suffering, at replica side we're more at - * ease with it. Larger COB means better chance to sync successfully. - * - Reduce primary main process CPU load. By opening a new, dedicated channel for the RDB transfer, child - * processes can have direct access to the new channel. Due to TLS connection restrictions, this was not - * possible using one main channel. We eliminate the need for the child process to use the primary's - * child-proc -> main-proc pipeline, thus freeing up the main process to process clients queries. - * - * * High level interface design * - * - Dual channel sync begins when the replica sends a REPLCONF capa dual-channel to the primary during initial - * handshake. This allows the replica to verify whether the primary supports dual-channel-replication and, if - * so, state that this is the replica's main channel, which is not used for snapshot transfer. - * - When replica lacks sufficient data for PSYNC, the primary will send +DUALCHANNELSYNC response instead - * of RDB data. As a next step, the replica creates a new channel (rdb-channel) and configures it against - * the primary with the appropriate capabilities and requirements. The replica then requests a sync - * using the RDB channel. - * - Prior to forking, the primary sends the replica the snapshot's end repl-offset, and attaches the replica - * to the replication backlog to keep repl data until the replica requests psync. The replica uses the main - * channel to request a PSYNC starting at the snapshot end offset. - * - The primary main threads sends incremental changes via the main channel, while the bgsave process - * sends the RDB directly to the replica via the rdb-channel. As for the replica, the incremental - * changes are stored on a local buffer, while the RDB is loaded into memory. - * - Once the replica completes loading the rdb, it drops the rdb channel and streams the accumulated incremental - * changes into memory. Repl steady state continues normally. 
- * - * * Replica state machine * - * ┌───────────────────┐ Dual channel sync - * │RECEIVE_PING_REPLY │ ┌──────────────────────────────────────────────────────────────┐ - * └────────┬──────────┘ │ RDB channel states Main channel state │ - * │+PONG │ ┌────────────────────────────┐ ┌───────────────────┐ │ - * ┌────────▼──────────┐ ┌─┼─────►DUAL_CHANNEL_SEND_HANDSHAKE │ ┌─►SEND_HANDSHAKE │ │ - * │SEND_HANDSHAKE │ │ │ └────┬───────────────────────┘ │ └──┬────────────────┘ │ - * └────────┬──────────┘ │ │ │ │ │REPLCONF set-rdb-client-id - * │ │ │ ┌───────▼───────────────────────┐ │ ┌──▼────────────────┐ │ - * ┌────────▼──────────┐ │ │ │DUAL_CHANNEL_RECEIVE_AUTH_REPLY│ │ │RECEIVE_CAPA_REPLY │ │ - * │RECEIVE_AUTH_REPLY │ │ │ └───────┬───────────────────────┘ │ └──┬────────────────┘ │ - * └────────┬──────────┘ │ │ │+OK │ │+OK │ - * │+OK │ │ ┌───────▼───────────────────────┐ │ ┌──▼────────────────┐ │ - * ┌────────▼──────────┐ │ │ │DUAL_CHANNEL_RECEIVE_REPLCONF_REPLY│SEND_PSYNC │ │ - * │RECEIVE_PORT_REPLY │ │ │ └───────┬───────────────────────┘ │ └──┬────────────────┘ │ - * └────────┬──────────┘ │ │ │+OK │ │PSYNC use snapshot │ - * │+OK │ │ ┌───────▼───────────────────┐ │ │end-offset provided │ - * ┌────────▼──────────┐ │ │ │DUAL_CHANNEL_RECEIVE_ENDOFF│ │ │by the primary │ - * │RECEIVE_IP_REPLY │ │ │ └───────┬───────────────────┘ │ ┌──▼────────────────┐ │ - * └────────┬──────────┘ │ │ │$ENDOFF │ │RECEIVE_PSYNC_REPLY│ │ - * │+OK │ │ ├─────────────────────────┘ └──┬────────────────┘ │ - * ┌────────▼──────────┐ │ │ │ │+CONTINUE │ - * │RECEIVE_CAPA_REPLY │ │ │ ┌───────▼───────────────┐ ┌──▼────────────────┐ │ - * └────────┬──────────┘ │ │ │DUAL_CHANNEL_RDB_LOAD │ │TRANSFER │ │ - * │+OK │ │ └───────┬───────────────┘ └─────┬─────────────┘ │ - * ┌────────▼─────────────┐ │ │ │Done loading │ │ - * │RECEIVE_VERSION_REPLY │ │ │ ┌───────▼───────────────┐ │ │ - * └────────┬─────────────┘ │ │ │DUAL_CHANNEL_RDB_LOADED│ │ │ - * │+OK │ │ └───────┬───────────────┘ │ │ - * ┌────────▼───┐ │ │ │ │ │ - * │SEND_PSYNC │ │ │ │Replica loads local replication │ │ - * └─┬──────────┘ │ │ │buffer into memory │ │ - * │PSYNC (use cached-primary)│ │ └─────────┬───────────────────────┘ │ - * ┌─▼─────────────────┐ │ │ │ │ - * │RECEIVE_PSYNC_REPLY│ │ └────────────────────┼─────────────────────────────────────────┘ - * └────────┬─┬────────┘ │ │ - * +CONTINUE│ │+DUALCHANNELSYNC │ │ - * │ │ └─────────────────┘ │ - * │ │+FULLRESYNC │ - * │ ┌─▼─────────────────┐ ┌────▼──────────────┐ - * │ │TRANSFER ├───────────────────►CONNECTED │ - * │ └───────────────────┘ └────▲──────────────┘ - * │ │ - * └─────────────────────────────────────────────────┘ - */ -/* This handler fires when the non blocking connect was able to - * establish a connection with the primary. */ -void syncWithPrimary(connection *conn) { - char tmpfile[256], *err = NULL; - int psync_result; - - /* If this event fired after the user turned the instance into a primary - * with REPLICAOF NO ONE we must just return ASAP. */ - if (server.repl_state == REPL_STATE_NONE) { - connClose(conn); - return; - } - - if (server.repl_state < REPL_STATE_SEND_PSYNC) { - server.repl_state = replicationProceedWithHandshake(conn, server.repl_state, NULL); - - if (server.repl_state == REPL_STATE_RECEIVE_PING_REPLY) { - /* Delete the writable event so that the readable event remains - * registered and we can wait for the PONG reply. 
*/ - connSetReadHandler(conn, syncWithPrimary); - connSetWriteHandler(conn, NULL); - } else if (server.repl_state == REPL_STATE_ERROR) { - goto error; - } - if (server.repl_state != REPL_STATE_SEND_PSYNC) - return; + server.repl_state = REPL_STATE_SEND_PSYNC; } /* Try a partial resynchronization. If we don't have a cached primary @@ -3898,6 +3814,10 @@ void syncWithPrimary(connection *conn) { server.repl_transfer_lastio = server.unixtime; return; +no_response_error: /* Handle receiveSynchronousResponse() error when primary has no reply */ + serverLog(LL_WARNING, "Primary did not respond to command during SYNC handshake"); + /* Fall through to regular error handling */ + error: connClose(conn); server.repl_transfer_s = NULL; @@ -4241,7 +4161,9 @@ void roleCommand(client *c) { /* Send a REPLCONF ACK command to the primary to inform it about the current * processed offset. If we are not connected with a primary, the command has * no effects. */ -void replicationSendAck(client *c) { +void replicationSendAck(void) { + client *c = server.primary; + if (c != NULL) { int send_fack = server.fsynced_reploff != -1; c->flag.replication_force_reply = 1; @@ -4773,7 +4695,7 @@ void replicationCron(void) { /* Send ACK to primary from time to time. * Note that we do not send periodic acks to primary that don't * support PSYNC and replication offsets. */ - if (server.primary_host && server.primary && !(server.primary->flag.pre_psync)) replicationSendAck(server.primary); + if (server.primary_host && server.primary && !(server.primary->flag.pre_psync)) replicationSendAck(); /* If we have attached replicas, PING them from time to time. * So replicas can implement an explicit timeout to primaries, and will @@ -4917,7 +4839,7 @@ void replicationCron(void) { replication_cron_loops++; /* Incremented with frequency 1 HZ. */ } -int shouldStartChildReplication(int *mincapa_out, int *req_out, slotBitmap slot_bitmap_out) { +int shouldStartChildReplication(int *mincapa_out, int *req_out) { /* We should start a BGSAVE good for replication if we have replicas in * WAIT_BGSAVE_START state. * @@ -4929,7 +4851,6 @@ int shouldStartChildReplication(int *mincapa_out, int *req_out, slotBitmap slot_ int replicas_waiting = 0; int mincapa; int req; - slotBitmap slot_bitmap; int first = 1; listNode *ln; listIter li; @@ -4941,8 +4862,7 @@ int shouldStartChildReplication(int *mincapa_out, int *req_out, slotBitmap slot_ if (first) { /* Get first replica's requirements */ req = replica->repl_data->replica_req; - memcpy(slot_bitmap, replica->repl_data->slot_bitmap, sizeof(slotBitmap)); - } else if (req != replica->repl_data->replica_req || slotBitmapCompare(slot_bitmap, replica->repl_data->slot_bitmap) != 0) { + } else if (req != replica->repl_data->replica_req) { /* Skip replicas that don't match */ continue; } @@ -4960,7 +4880,6 @@ int shouldStartChildReplication(int *mincapa_out, int *req_out, slotBitmap slot_ max_idle >= server.repl_diskless_sync_delay)) { if (mincapa_out) *mincapa_out = mincapa; if (req_out) *req_out = req; - if (slot_bitmap_out) memcpy(slot_bitmap_out, slot_bitmap, sizeof(slotBitmap)); return 1; } } @@ -4971,13 +4890,12 @@ int shouldStartChildReplication(int *mincapa_out, int *req_out, slotBitmap slot_ void replicationStartPendingFork(void) { int mincapa = -1; int req = -1; - slotBitmap slot_bitmap; - if (shouldStartChildReplication(&mincapa, &req, slot_bitmap)) { + if (shouldStartChildReplication(&mincapa, &req)) { /* Start the BGSAVE. 
The called function may start a * BGSAVE with socket target or disk target depending on the * configuration and replicas capabilities and requirements. */ - startBgsaveForReplication(mincapa, req, slot_bitmap); + startBgsaveForReplication(mincapa, req); } } diff --git a/src/server.c b/src/server.c index ea77cc1312..50b93b2943 100644 --- a/src/server.c +++ b/src/server.c @@ -900,7 +900,7 @@ int clientsCronResizeQueryBuffer(client *c) { if (idletime > 2) { /* 1) Query is idle for a long time. */ size_t remaining = sdslen(c->querybuf) - c->qb_pos; - if (!c->flag.replication_source && !remaining) { + if (!c->flag.replicated && !remaining) { /* If the client is not for replication and no data is pending, * The client can safely use the shared query buffer in the next read - free the client's querybuf. */ sdsfree(c->querybuf); @@ -1451,7 +1451,7 @@ long long serverCron(struct aeEventLoop *eventLoop, long long id, void *clientDa monotime current_time = getMonotonicUs(); long long factor = 1000000; // us trackInstantaneousMetric(STATS_METRIC_COMMAND, server.stat_numcommands, current_time, factor); - trackInstantaneousMetric(STATS_METRIC_NET_INPUT, server.stat_net_input_bytes + server.stat_net_repl_input_bytes, + trackInstantaneousMetric(STATS_METRIC_NET_INPUT, server.stat_net_input_bytes + server.stat_net_repl_input_bytes + server.stat_net_slot_migration_input_bytes, current_time, factor); trackInstantaneousMetric(STATS_METRIC_NET_OUTPUT, server.stat_net_output_bytes + server.stat_net_repl_output_bytes, current_time, @@ -1464,6 +1464,8 @@ long long serverCron(struct aeEventLoop *eventLoop, long long id, void *clientDa factor); trackInstantaneousMetric(STATS_METRIC_EL_DURATION, server.duration_stats[EL_DURATION_TYPE_EL].sum, server.duration_stats[EL_DURATION_TYPE_EL].cnt, 1); + trackInstantaneousMetric(STATS_METRIC_NET_INPUT_SLOT_MIGRATION, server.stat_net_slot_migration_input_bytes, + current_time, factor); } /* We have just LRU_BITS bits per object for LRU information. @@ -2684,6 +2686,7 @@ void resetServerStats(void) { server.stat_net_input_bytes = 0; server.stat_net_output_bytes = 0; server.stat_net_repl_input_bytes = 0; + server.stat_net_slot_migration_input_bytes = 0; server.stat_net_repl_output_bytes = 0; server.stat_unexpected_error_replies = 0; server.stat_total_error_replies = 0; @@ -3359,7 +3362,7 @@ struct serverCommand *lookupCommandOrOriginal(robj **argv, int argc) { /* Commands arriving from a replication source or AOF client, should never be rejected. 
*/ int mustObeyClient(client *c) { - return c->id == CLIENT_ID_AOF || c->flag.replication_source; + return c->id == CLIENT_ID_AOF || c->flag.replicated; } static int shouldPropagate(int target) { @@ -3369,7 +3372,7 @@ static int shouldPropagate(int target) { if (server.aof_state != AOF_OFF) return 1; } if (target & PROPAGATE_REPL) { - if (server.primary == NULL && (server.repl_backlog || listLength(server.replicas) != 0)) return 1; + if (server.primary_host == NULL && (server.repl_backlog || listLength(server.replicas) != 0)) return 1; } return 0; @@ -3418,7 +3421,12 @@ static void propagateNow(int dbid, robj **argv, int argc, int target) { server.server_del_keys_in_slot); if (server.aof_state != AOF_OFF && target & PROPAGATE_AOF) feedAppendOnlyFile(dbid, argv, argc); - if (target & PROPAGATE_REPL) replicationFeedReplicas(dbid, argv, argc); + if (target & PROPAGATE_REPL) { + replicationFeedReplicas(dbid, argv, argc); + if (server.cluster_enabled) { + clusterFeedSlotMigration(dbid, argv, argc); + } + } } /* Used inside commands to schedule the propagation of additional commands @@ -4297,7 +4305,7 @@ int processCommand(client *c) { /* If the server is paused, block the client until * the pause has ended. Replicas are never paused. */ - if (!c->flag.replica && ((isPausedActions(PAUSE_ACTION_CLIENT_ALL)) || + if (!c->flag.replica && !c->flag.slot_migration_target && ((isPausedActions(PAUSE_ACTION_CLIENT_ALL)) || ((isPausedActions(PAUSE_ACTION_CLIENT_WRITE)) && is_may_replicate_command))) { blockPostponeClient(c); return C_OK; @@ -5903,8 +5911,8 @@ sds genValkeyInfoString(dict *section_dict, int all_sections, int everything) { "total_connections_received:%lld\r\n", server.stat_numconnections, "total_commands_processed:%lld\r\n", server.stat_numcommands, "instantaneous_ops_per_sec:%lld\r\n", getInstantaneousMetric(STATS_METRIC_COMMAND), - "total_net_input_bytes:%lld\r\n", server.stat_net_input_bytes + server.stat_net_repl_input_bytes, - "total_net_output_bytes:%lld\r\n", server.stat_net_output_bytes + server.stat_net_repl_output_bytes, + "total_net_input_bytes:%lld\r\n", server.stat_net_input_bytes + server.stat_net_repl_input_bytes + server.stat_net_slot_migration_input_bytes, + "total_net_output_bytes:%lld\r\n", server.stat_net_output_bytes + server.stat_net_repl_output_bytes + server.stat_net_slot_migration_output_bytes, "total_net_repl_input_bytes:%lld\r\n", server.stat_net_repl_input_bytes, "total_net_repl_output_bytes:%lld\r\n", server.stat_net_repl_output_bytes, "instantaneous_input_kbps:%.2f\r\n", (float)getInstantaneousMetric(STATS_METRIC_NET_INPUT) / 1024, @@ -5962,7 +5970,11 @@ sds genValkeyInfoString(dict *section_dict, int all_sections, int everything) { "eventloop_duration_sum:%llu\r\n", server.duration_stats[EL_DURATION_TYPE_EL].sum, "eventloop_duration_cmd_sum:%llu\r\n", server.duration_stats[EL_DURATION_TYPE_CMD].sum, "instantaneous_eventloop_cycles_per_sec:%llu\r\n", getInstantaneousMetric(STATS_METRIC_EL_CYCLE), - "instantaneous_eventloop_duration_usec:%llu\r\n", getInstantaneousMetric(STATS_METRIC_EL_DURATION))); + "instantaneous_eventloop_duration_usec:%llu\r\n", getInstantaneousMetric(STATS_METRIC_EL_DURATION), + "total_net_slot_migration_input_bytes:%lld\r\n", server.stat_net_slot_migration_input_bytes, + "total_net_slot_migration_output_bytes:%lld\r\n", server.stat_net_slot_migration_output_bytes, + "instantaneous_slot_migration_input_kbps:%.2f\r\n", (float)getInstantaneousMetric(STATS_METRIC_NET_INPUT_SLOT_MIGRATION) / 1024, + 
"instantaneous_slot_migration_output_kbps:%.2f\r\n", (float)getInstantaneousMetric(STATS_METRIC_NET_OUTPUT_SLOT_MIGRATION) / 1024)); info = genValkeyInfoStringACLStats(info); } diff --git a/src/server.h b/src/server.h index e1a8a1d503..32d71bafd2 100644 --- a/src/server.h +++ b/src/server.h @@ -182,15 +182,17 @@ struct hdr_histogram; #define RIO_CONNSET_WRITE_MAX_CHUNK_SIZE 16384 /* Instantaneous metrics tracking. */ -#define STATS_METRIC_SAMPLES 16 /* Number of samples per metric. */ -#define STATS_METRIC_COMMAND 0 /* Number of commands executed. */ -#define STATS_METRIC_NET_INPUT 1 /* Bytes read to network. */ -#define STATS_METRIC_NET_OUTPUT 2 /* Bytes written to network. */ -#define STATS_METRIC_NET_INPUT_REPLICATION 3 /* Bytes read to network during replication. */ -#define STATS_METRIC_NET_OUTPUT_REPLICATION 4 /* Bytes written to network during replication. */ -#define STATS_METRIC_EL_CYCLE 5 /* Number of eventloop cycled. */ -#define STATS_METRIC_EL_DURATION 6 /* Eventloop duration. */ -#define STATS_METRIC_COUNT 7 +#define STATS_METRIC_SAMPLES 16 /* Number of samples per metric. */ +#define STATS_METRIC_COMMAND 0 /* Number of commands executed. */ +#define STATS_METRIC_NET_INPUT 1 /* Bytes read to network. */ +#define STATS_METRIC_NET_OUTPUT 2 /* Bytes written to network. */ +#define STATS_METRIC_NET_INPUT_REPLICATION 3 /* Bytes read to network during replication. */ +#define STATS_METRIC_NET_OUTPUT_REPLICATION 4 /* Bytes written to network during replication. */ +#define STATS_METRIC_EL_CYCLE 5 /* Number of eventloop cycled. */ +#define STATS_METRIC_EL_DURATION 6 /* Eventloop duration. */ +#define STATS_METRIC_NET_INPUT_SLOT_MIGRATION 7 /* Bytes read to network during slot migration. */ +#define STATS_METRIC_NET_OUTPUT_SLOT_MIGRATION 7 /* Bytes written to network during slot migration. */ +#define STATS_METRIC_COUNT 8 /* Protocol and I/O related defines */ #define PROTO_IOBUF_LEN (1024 * 16) /* Generic I/O buffer size */ @@ -599,6 +601,7 @@ typedef enum { PAUSE_BY_CLIENT_COMMAND = 0, PAUSE_DURING_SHUTDOWN, PAUSE_DURING_FAILOVER, + PAUSE_DURING_SLOT_MIGRATION, NUM_PAUSE_PURPOSES /* This value is the number of purposes above. */ } pause_purpose; @@ -1091,8 +1094,9 @@ typedef struct ClientFlags { * flag, we won't cache the primary in freeClient. */ uint64_t fake : 1; /* This is a fake client without a real connection. */ uint64_t import_source : 1; /* This client is importing data to server and can visit expired key. */ - uint64_t replication_source : 1; /* This client is a replication source (i.e. primary or slot migration). */ + uint64_t replicated : 1; /* This client is a replication source (i.e. primary or slot migration). */ uint64_t slot_migration_source : 1; /* This client is a slot migration source. */ + uint64_t slot_migration_target : 1; /* This client is a slot migration target. */ uint64_t reserved : 3; /* Reserved for future use */ } ClientFlags; @@ -1144,7 +1148,6 @@ typedef struct ClientReplicationData { see the definition of replBufBlock. */ size_t ref_block_pos; /* Access position of referenced buffer block, i.e. the next offset to send. */ - slotBitmap slot_bitmap; /* The slot range this replica is replicating for. 
*/ } ClientReplicationData; typedef struct ClientModuleData { @@ -1540,6 +1543,7 @@ typedef struct { #define CHILD_TYPE_AOF 2 #define CHILD_TYPE_LDB 3 #define CHILD_TYPE_MODULE 4 +#define CHILD_TYPE_SYNCSLOTS 5 typedef enum childInfoType { CHILD_INFO_TYPE_CURRENT_INFO, @@ -1708,9 +1712,10 @@ struct valkeyServer { long long stat_net_input_bytes; /* Bytes read from network. */ long long stat_net_output_bytes; /* Bytes written to network. */ long long stat_net_repl_input_bytes; /* Bytes read during replication, added to stat_net_input_bytes in 'info'. */ - long long stat_net_slot_migration_input_bytes; /* Bytes read during replication, added to stat_net_input_bytes in 'info'. */ /* Bytes written during replication, added to stat_net_output_bytes in 'info'. */ long long stat_net_repl_output_bytes; + long long stat_net_slot_migration_input_bytes; /* Bytes read during replication, added to stat_net_input_bytes in 'info'. */ + long long stat_net_slot_migration_output_bytes; /* Bytes read during replication, added to stat_net_input_bytes in 'info'. */ size_t stat_current_cow_peak; /* Peak size of copy on write bytes. */ size_t stat_current_cow_bytes; /* Copy on write bytes while child is active. */ monotime stat_current_cow_updated; /* Last update time of stat_current_cow_bytes */ @@ -2621,7 +2626,7 @@ void dictVanillaFree(void *val); #define READ_FLAGS_INLINE_ZERO_QUERY_LEN (1 << 11) #define READ_FLAGS_PARSING_NEGATIVE_MBULK_LEN (1 << 12) #define READ_FLAGS_PARSING_COMPLETED (1 << 13) -#define READ_FLAGS_REPLICATION_SOURCE (1 << 14) +#define READ_FLAGS_REPLICATED (1 << 14) #define READ_FLAGS_DONT_PARSE (1 << 15) #define READ_FLAGS_AUTH_REQUIRED (1 << 16) @@ -2948,8 +2953,8 @@ int sendCurrentOffsetToReplica(client *replica); void addRdbReplicaToPsyncWait(client *replica); void initClientReplicationData(client *c); void freeClientReplicationData(client *c); -void replicationSendAck(client *c); -int replicationProceedWithHandshake(connection *conn, int curr_state, slotBitmap slot_bitmap); +char *sendCommandArgv(connection *conn, int argc, char **argv, size_t *argv_lens); +char *receiveSynchronousResponse(connection *conn); /* Generic persistence functions */ void startLoadingFile(size_t size, char *filename, int rdbflags); @@ -2961,6 +2966,8 @@ void updateLoadingFileName(char *filename); void startSaving(int rdbflags); void stopSaving(int success); int allPersistenceDisabled(void); +typedef int(*ChildSnapshotFunc)(int req, rio *rdb, void *privdata); +int saveSnapshotToConnectionSockets(connection **conns, int connsnum, int use_pipe, int req, ChildSnapshotFunc snapshot_func, void *privdata); #define DISK_ERROR_TYPE_AOF 1 /* Don't accept writes: AOF errors. */ #define DISK_ERROR_TYPE_RDB 2 /* Don't accept writes: RDB errors. 
*/ @@ -3684,6 +3691,7 @@ void sdiffCommand(client *c); void sdiffstoreCommand(client *c); void sscanCommand(client *c); void syncCommand(client *c); +void syncSlotsCommand(client *c); void flushdbCommand(client *c); void flushallCommand(client *c); void sortCommand(client *c); diff --git a/tests/unit/slot-migration.tcl b/tests/unit/slot-migration.tcl new file mode 100644 index 0000000000..90adaf84e0 --- /dev/null +++ b/tests/unit/slot-migration.tcl @@ -0,0 +1,22 @@ + + +# TEST CASES +# ---- General ---- +# - Only migrating slots are synced +# - Changes in non-migrating slots are not sent to target +# - Parsing test +# - Slot must have available primary +# +# ---- Reslience ---- +# - Target gives up if primary is unavailable +# - Source unpauses itself if replica is unavailable +# - Client is closed by target during migration +# +# ---- Importing slot is not exposed ---- +# - KEYS command on importing node +# - RANDOMKEY on importing node +# +# ---- Replication +# - Replica receives updates through target primary +# - Time out results in replica dropping slots +# - Failover during migration cleans up slots \ No newline at end of file From 98de0a757698378572efc4e6d0b758974d61cdd9 Mon Sep 17 00:00:00 2001 From: Jacob Murphy Date: Mon, 20 Jan 2025 08:30:17 +0000 Subject: [PATCH 07/18] Bug fixes for SYNCSLOTS based implementation Signed-off-by: Jacob Murphy --- src/cluster.c | 21 ------ src/cluster.h | 2 - src/cluster_legacy.c | 127 ++++++++++++++++++---------------- src/cluster_legacy.h | 1 - src/kvstore.c | 29 ++------ src/kvstore.h | 1 - src/lazyfree.c | 23 ------ src/networking.c | 5 +- src/rdb.c | 4 +- src/server.h | 1 - tests/unit/slot-migration.tcl | 12 +++- 11 files changed, 85 insertions(+), 141 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index 508eddefc6..f650d979f7 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -815,27 +815,6 @@ unsigned int countKeysInSlot(unsigned int slot) { return kvstoreHashtableSize(server.db->keys, slot); } -unsigned int dropKeysInSlotBitmap(slotBitmap slot_bitmap, int async) { - unsigned int result = 0; - for (int i = 0; i < CLUSTER_SLOTS; i++) { - if (bitmapTestBit(slot_bitmap, i)) { - result += dropKeysInSlot(i, async); - } - } - return result; -} - -unsigned int dropKeysInSlot(unsigned int hashslot, int async) { - unsigned int result = kvstoreHashtableSize(server.db->keys, hashslot); - if (async) { - emptyHashtableAsync(server.db, hashslot); - } else { - kvstoreEmptyHashtable(server.db->keys, hashslot, NULL); - kvstoreEmptyHashtable(server.db->expires, hashslot, NULL); - } - return result; -} - void clusterCommandHelp(client *c) { const char *help[] = { "COUNTKEYSINSLOT ", diff --git a/src/cluster.h b/src/cluster.h index 5192bc405e..21d4469357 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -114,8 +114,6 @@ client *createCachedResponseClient(int resp); void deleteCachedResponseClient(client *recording_client); void clearCachedClusterSlotsResponse(void); unsigned int countKeysInSlot(unsigned int hashslot); -unsigned int dropKeysInSlotBitmap(slotBitmap slot_bitmap, int async); -unsigned int dropKeysInSlot(unsigned int hashslot, int async); void bitmapToSlotRanges(unsigned char *bitmap, slotBitmap slot_bitmap_out); int bitmapTestBit(unsigned char *bitmap, int pos); void bitmapSetBit(unsigned char *bitmap, int pos); diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index fa0da913b2..97b201a0a2 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -94,6 +94,7 @@ const char *clusterGetMessageTypeString(int type); void 
removeChannelsInSlot(unsigned int slot); unsigned int countChannelsInSlot(unsigned int hashslot); unsigned int delKeysInSlot(unsigned int hashslot); +unsigned int delKeysInSlotBitmap(slotBitmap bitmap); void clusterAddNodeToShard(const char *shard_id, clusterNode *node); list *clusterLookupNodeListByShardId(const char *shard_id); void clusterRemoveNodeFromShard(clusterNode *node); @@ -4424,10 +4425,10 @@ slotImport *clusterCreateSlotImportJob(clusterNode *source, slotBitmap slots) { } slotExport *clusterCreateSlotExportJob(client *c, slotBitmap slots) { - slotExport *result = (slotExport *) zmalloc(sizeof(slotExport)); + slotExport *result = (slotExport *) zcalloc(sizeof(slotExport)); memcpy(result->slot_bitmap, slots, sizeof(slotBitmap)); result->state = SLOT_EXPORT_QUEUED; - result->pause_end = 0; + result->pause_end = -1; result->client = c; return result; } @@ -4484,17 +4485,14 @@ void clusterFeedSlotMigration(int dbid, robj **argv, int argc) { unsigned long long prev_pending = curr_export->client->reply_bytes; addReplyArrayLen(curr_export->client, argc); for (i = 0; i < argc; i++) { - addReply(curr_export->client, argv[i]); + addReplyBulk(curr_export->client, argv[i]); } curr_export->syncslot_offset += curr_export->client->reply_bytes - prev_pending; } int clusterShouldWriteToSlotMigrationTarget() { slotExport *curr_export = clusterGetCurrentSlotExport(); - if (curr_export->state != SLOT_EXPORT_PAUSED) { - return 0; - } - return 1; + return curr_export && (curr_export->state == SLOT_EXPORT_PAUSE_AND_REPLY || curr_export->state == SLOT_EXPORT_PAUSED); } void clusterSlotMigrationHandleClientClose(client *c) { @@ -4570,7 +4568,7 @@ void clusterProceedWithSlotImport(void) { c->flag.authenticated = 1; c->user = NULL; /* This client can do everything. */ c->querybuf = sdsempty(); /* Similar to primary, we use a dedicated query buf. */ - initClientReplicationData(c); /* Used to track reploff */ + initClientReplicationData(c); curr_import->state = SLOT_IMPORT_SEND_AUTH; continue; @@ -4614,7 +4612,7 @@ void clusterProceedWithSlotImport(void) { continue; case SLOT_IMPORT_SEND_SYNCSLOTS: /* Ensure we have a clean state for the SYNC. */ - dropKeysInSlotBitmap(curr_import->slot_bitmap, 1); + delKeysInSlotBitmap(curr_import->slot_bitmap); serverLog(LL_NOTICE, "Sending CLUSTER SYNCSLOTS START request to source node %.40s", curr_import->source_node->name); char *syncslots_args[4] = {"CLUSTER", "SYNCSLOTS", "START", (char *)curr_import->slot_bitmap}; @@ -4632,7 +4630,7 @@ void clusterProceedWithSlotImport(void) { connSetReadHandler(curr_import->client->conn, readQueryFromClient); curr_import->state = SLOT_IMPORT_RECEIVE_SYNCSLOTS; case SLOT_IMPORT_RECEIVE_SYNCSLOTS: - /* Nothing to do in this state. Waiting for CLUSTER SYNCSLOTS END to be processed. */ + /* Nothing to do in this state. Waiting for CLUSTER SYNCSLOTS ENDSNAPSHOT to be processed. */ return; case SLOT_IMPORT_PAUSE_OWNER: curr_import->client->flag.replication_force_reply = 1; @@ -4674,20 +4672,9 @@ void clusterProceedWithSlotImport(void) { clusterBroadcastPong(CLUSTER_BROADCAST_ALL); listDelNode(server.cluster->slot_import_jobs, curr_node); continue; - case SLOT_IMPORT_REPLICA_TRACKING: - /* As a replica, we will simply apply the primaries updates - * from the slot migration source. However, if we are ever - * promoted to primary, we need to fail the migration to - * prevent leaked keys in the importing slots. 
*/ - if (clusterNodeIsPrimary(myself)) { - serverLog(LL_WARNING, "Promoted to primary during slot migration, failing the ongoing migration"); - curr_import->state = SLOT_IMPORT_FAILED; - continue; - } - return; case SLOT_IMPORT_FAILED: listDelNode(server.cluster->slot_import_jobs, curr_node); - dropKeysInSlotBitmap(curr_import->slot_bitmap, server.repl_replica_lazy_flush); + delKeysInSlotBitmap(curr_import->slot_bitmap); clusterFreeSlotImportJob(curr_import); continue; } @@ -4700,7 +4687,7 @@ int childSnapshotForSyncSlot(int req, rio *rdb, void *privdata) { rioWrite(rdb, "*3\r\n", 4); rioWriteBulkString(rdb, "CLUSTER", 7); rioWriteBulkString(rdb, "SYNCSLOTS", 9); - rioWriteBulkString(rdb, "END", 3); + rioWriteBulkString(rdb, "ENDSNAPSHOT", 11); return retval; } @@ -4721,6 +4708,7 @@ void clusterProceedWithSlotExport(void) { } connection ** conns = zmalloc(sizeof(connection*)); *conns = curr_export->client->conn; + serverLog(LL_NOTICE, "Initiating snapshot to conn with fd %d", curr_export->client->conn->fd); if (saveSnapshotToConnectionSockets(conns, 1, 1, 0, childSnapshotForSyncSlot, curr_export->slot_bitmap) != C_OK) { serverLog(LL_WARNING, "Failed to start slot export to target"); curr_export->state = SLOT_EXPORT_FAILED; @@ -4736,8 +4724,13 @@ void clusterProceedWithSlotExport(void) { addReplyArrayLen(curr_export->client, 4); addReplyBulkCBuffer(curr_export->client, "CLUSTER", 7); addReplyBulkCBuffer(curr_export->client, "SYNCSLOTS", 9); - addReplyBulkCBuffer(curr_export->client, "PAUSEDAT", 8); - addReplyLongLong(curr_export->client, curr_export->syncslot_offset); + addReplyBulkCBuffer(curr_export->client, "PAUSEOFFSET", 11); + addReplyBulkLongLong(curr_export->client, curr_export->syncslot_offset); + + /* Even though we just added replies, it's possible that, due to + * existing pending data, the client is not in the pending write + * queue. We enqueue it explicitly to work around this. */ + putClientInPendingWriteQueue(curr_export->client); curr_export->pause_end = mstime() + (CLUSTER_MF_TIMEOUT * CLUSTER_MF_PAUSE_MULT); pauseActions(PAUSE_DURING_SLOT_MIGRATION, curr_export->pause_end, PAUSE_ACTIONS_CLIENT_WRITE_SET); @@ -4745,7 +4738,18 @@ void clusterProceedWithSlotExport(void) { curr_export->state = SLOT_EXPORT_PAUSED; continue; case SLOT_EXPORT_PAUSED: - /* */ + /* While paused, we simply want to check if we should unpause. */ + if (curr_export->pause_end <= mstime()) { + /* Every CLUSTER_MF_TIMEOUT, the source node should + * re-attempt the pause. If we reach this point, it hasn't + * attempted the pause in that time, we can assume it is + * dead and fail the migration.*/ + serverLog(LL_WARNING, "During slot export, unpausing self and cancelling export due to timeout."); + unpauseActions(PAUSE_DURING_SLOT_MIGRATION); + curr_export->state = SLOT_EXPORT_FAILED; + continue; + } + return; case SLOT_EXPORT_FINISH: case SLOT_EXPORT_FAILED: listDelNode(server.cluster->slot_export_jobs, curr_node); @@ -6714,9 +6718,18 @@ void removeChannelsInSlot(unsigned int slot) { pubsubShardUnsubscribeAllChannelsInSlot(slot); } +unsigned int delKeysInSlotBitmap(slotBitmap bitmap) { + unsigned int res = 0; + for (int i = 0; i < CLUSTER_SLOTS; i++) { + if (bitmapTestBit(bitmap, i)) { + res += delKeysInSlot(i); + } + } + return res; +} + /* Remove all the keys in the specified hash slot. * The number of removed items is returned. */ -// TODO(murphyjacob4) - can we just use this? 
unsigned int delKeysInSlot(unsigned int hashslot) { if (!countKeysInSlot(hashslot)) return 0; @@ -7572,35 +7585,29 @@ int clusterCommandSpecial(client *c) { } c->flag.slot_migration_target = 1; initClientReplicationData(c); - slotExport *job = clusterCreateSlotExportJob(c, c->argv[2]->ptr); + slotExport *job = clusterCreateSlotExportJob(c, c->argv[3]->ptr); listAddNodeTail(server.cluster->slot_export_jobs, job); clusterProceedWithSlotMigration(); - } else if (!strcasecmp(c->argv[2]->ptr, "inform")) { - /* CLUSTER SYNCSLOTS INFORM */ - if (c->argc != 4) { - addReplyError(c, "CLUSTER SYNCSLOTS INFORM command requires exactly one argument"); - return 1; - } - slotImport * to_enqueue = clusterCreateSlotImportJob(NULL, c->argv[2]->ptr); - to_enqueue->state = SLOT_IMPORT_REPLICA_TRACKING; - } else if (!strcasecmp(c->argv[2]->ptr, "end")) { - /* CLUSTER SYNCSLOTS END */ + } else if (!strcasecmp(c->argv[2]->ptr, "endsnapshot")) { + /* CLUSTER SYNCSLOTS ENDSNAPSHOT */ if (c->argc != 3) { - addReplyError(c, "CLUSTER SYNCSLOTS END does not expect any arguments."); + addReplyError(c, "CLUSTER SYNCSLOTS ENDSNAPSHOT does not expect any arguments."); return 1; } - slotImport *curr_import = clusterGetCurrentSlotImport(); - if (!curr_import || (curr_import->state != SLOT_IMPORT_RECEIVE_SYNCSLOTS && curr_import->state != SLOT_IMPORT_REPLICA_TRACKING)) { - addReplyError(c, "No ongoing CLUSTER SYNCSLOTS to end."); + if (c->flag.primary) { + /* Due to the proxying nature of replication from the source + * node through the target node to the target node's replicas, + * this message should simply be ignored. */ return 1; } - if (curr_import->state != SLOT_IMPORT_REPLICA_TRACKING) { - /* Replicas will also receive this command through the replication - * stream, but it is not actionable. */ + slotImport *curr_import = clusterGetCurrentSlotImport(); + if (!curr_import || (curr_import->state != SLOT_IMPORT_RECEIVE_SYNCSLOTS)) { + addReplyError(c, "No ongoing snapshot to end."); return 1; } if (curr_import->client != c) { addReplyError(c, "This client is not the one that initiated the ongoing CLUSTER SYNCSLOTS."); + return 1; } curr_import->state = SLOT_IMPORT_PAUSE_OWNER; clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_SLOTMIGRATION); @@ -7621,28 +7628,25 @@ int clusterCommandSpecial(client *c) { } else if (slot_export->state != SLOT_EXPORT_SNAPSHOTTING) { addReplyError(c, "SYNCSLOTS is not in the correct state for this command."); return 1; - } else { - /* First pause. We want to flush the output buffer that was not allowed to - * flush during the snapshot. */ - putClientInPendingWriteQueue(slot_export->client); } + serverLog(LL_NOTICE, "Pause received by target during slot migration. 
Pausing and initiating stream of commands."); slot_export->state = SLOT_EXPORT_PAUSE_AND_REPLY; clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_SLOTMIGRATION); - } else if (!strcasecmp(c->argv[2]->ptr, "pausedat")) { - /* CLUSTER SYNCSLOTS PAUSEDAT */ + } else if (!strcasecmp(c->argv[2]->ptr, "pauseoffset")) { + /* CLUSTER SYNCSLOTS PAUSEOFFSET */ if (c->argc != 4) { - addReplyError(c, "CLUSTER SYNCSLOTS PAUSEDAT command requires exactly one argument."); + addReplyError(c, "CLUSTER SYNCSLOTS PAUSEOFFSET command requires exactly one argument."); return 1; } slotImport *slot_import = clusterGetCurrentSlotImport(); if (!slot_import || slot_import->state != SLOT_IMPORT_WAITING_FOR_OFFSET) { - addReplyError(c, "No CLUSTER SYNCSLOTS is waiting for a PAUSEDAT response."); + addReplyError(c, "No CLUSTER SYNCSLOTS is waiting for a PAUSEOFFSET response."); return 1; } long long offset; - if (getLongLongFromObject(c->argv[3]->ptr, &offset) != C_OK) { - addReplyError(c, "Failed to parse PAUSEDAT offset."); + if (getLongLongFromObject(c->argv[3], &offset) != C_OK) { + addReplyError(c, "Failed to parse PAUSEOFFSET offset."); return 1; } slot_import->paused_at_offset = offset; @@ -7693,14 +7697,14 @@ const char **clusterCommandExtendedHelp(void) { "LINKS", " Return information about all network links between this node and its peers.", " Output format is an array where each array element is a map containing attributes of a link", - "MIGRATE SLOTSRANGE [ ...] SHARD ", + "MIGRATE SLOTSRANGE [ ...]", " Initiate server driven slot migration of all slot ranges to the designated shard.", - "SYNCSLOTS [START |END|INFORM |PAUSE|PAUSEDAT]", + "SYNCSLOTS [START |ENDSNAPSHOT|PAUSE|PAUSEOFFSET ]", " Internal command. SYNCSLOTS START initiates send of an AOF formatted snapshot containing the", - " provided slot bitmap. SYNCSLOTS END terminates the AOF formatted snapshot, and after this", + " provided slot bitmap. SYNCSLOTS ENDSNAPSHOT terminates the AOF formatted snapshot, and after this", " SYNCSLOTS PAUSE signals for this node to be paused and for a continuous stream of commands" - " for the slots to be replicated. SYNCSLOTS PAUSEDAT will be replied with the offset of remaining" - " commands. SYNCSLOTS INFORM is used to inform replicas that the operation is occurring.", + " for the slots to be replicated. SYNCSLOTS PAUSEOFFSET will be replied with the offset of remaining" + " commands.", NULL}; return help; @@ -7759,6 +7763,9 @@ int clusterAllowFailoverCmd(client *c) { void clusterPromoteSelfToPrimary(void) { replicationUnsetPrimary(); + /* verifyClusterConfigWithData will delete keys in unowned slots. This + * could happen in the case of failover during a slot migration. 
*/ + serverAssert(verifyClusterConfigWithData() == C_OK); } int detectAndUpdateCachedNodeHealth(void) { diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index 9eda033bda..ee38b3eced 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -378,7 +378,6 @@ struct _clusterNode { typedef enum slotImportState { SLOT_IMPORT_QUEUED, - SLOT_IMPORT_REPLICA_TRACKING, /* Replicas track the slot import as well */ SLOT_IMPORT_CONNECTING, SLOT_IMPORT_SEND_AUTH, SLOT_IMPORT_RECEIVE_AUTH, diff --git a/src/kvstore.c b/src/kvstore.c index f1ed085c43..b84ec5e8df 100644 --- a/src/kvstore.c +++ b/src/kvstore.c @@ -302,7 +302,12 @@ kvstore *kvstoreCreate(hashtableType *type, int num_hashtables_bits, int flags) void kvstoreEmpty(kvstore *kvs, void(callback)(hashtable *)) { for (int didx = 0; didx < kvs->num_hashtables; didx++) { - kvstoreEmptyHashtable(kvs, didx, callback); + hashtable *ht = kvstoreGetHashtable(kvs, didx); + if (!ht) continue; + kvstoreHashtableMetadata *metadata = (kvstoreHashtableMetadata *)hashtableMetadata(ht); + if (metadata->rehashing_node) metadata->rehashing_node = NULL; + hashtableEmpty(ht, callback); + freeHashtableIfNeeded(kvs, didx); } listEmpty(kvs->rehashing); @@ -315,28 +320,6 @@ void kvstoreEmpty(kvstore *kvs, void(callback)(hashtable *)) { kvs->overhead_hashtable_rehashing = 0; } -void kvstoreEmptyHashtable(kvstore *kvs, int didx, void(callback)(hashtable *)) { - hashtable *ht = kvstoreGetHashtable(kvs, didx); - if (!ht) return; - kvstoreHashtableMetadata *metadata = (kvstoreHashtableMetadata *)hashtableMetadata(ht); - if (metadata->rehashing_node) metadata->rehashing_node = NULL; - hashtableEmpty(ht, callback); - freeHashtableIfNeeded(kvs, didx); -} - -hashtable *kvstoreUnlinkHashtable(kvstore *kvs, int didx) { - hashtable *oldht = kvstoreGetHashtable(kvs, didx); - if (!oldht) return NULL; - - /* Pause rehashing on the to be unlinked node. */ - kvstoreHashtableMetadata *oldmetadata = (kvstoreHashtableMetadata *)hashtableMetadata(oldht); - if (oldmetadata->rehashing_node) oldmetadata->rehashing_node = NULL; - - kvs->hashtables[didx] = NULL; - kvs->allocated_hashtables--; - return oldht; -} - void kvstoreRelease(kvstore *kvs) { for (int didx = 0; didx < kvs->num_hashtables; didx++) { hashtable *ht = kvstoreGetHashtable(kvs, didx); diff --git a/src/kvstore.h b/src/kvstore.h index a79caf23aa..bc3baba43a 100644 --- a/src/kvstore.h +++ b/src/kvstore.h @@ -17,7 +17,6 @@ typedef int(kvstoreIteratorPredicate)(int didx, void *privdata); kvstore *kvstoreCreate(hashtableType *type, int num_hashtables_bits, int flags); void kvstoreEmpty(kvstore *kvs, void(callback)(hashtable *)); void kvstoreEmptyHashtable(kvstore *kvs, int didx, void(callback)(hashtable *)); -hashtable *kvstoreUnlinkHashtable(kvstore *kvs, int didx); void kvstoreRelease(kvstore *kvs); unsigned long long kvstoreSize(kvstore *kvs); unsigned long kvstoreBuckets(kvstore *kvs); diff --git a/src/lazyfree.c b/src/lazyfree.c index 8cd04eed37..3b061ccd84 100644 --- a/src/lazyfree.c +++ b/src/lazyfree.c @@ -32,18 +32,6 @@ void lazyfreeFreeDatabase(void *args[]) { atomic_fetch_add_explicit(&lazyfreed_objects, numkeys, memory_order_relaxed); } -/* Release a hashtable from the lazyfree thread. 
*/ -void lazyfreeFreeHashtable(void *args[]) { - hashtable *ht1 = args[0]; - hashtable *ht2 = args[1]; - - size_t numkeys = hashtableSize(ht1); - hashtableRelease(ht1); - if (ht2) hashtableRelease(ht2); - atomic_fetch_sub_explicit(&lazyfree_objects, numkeys, memory_order_relaxed); - atomic_fetch_add_explicit(&lazyfreed_objects, numkeys, memory_order_relaxed); -} - /* Release the key tracking table. */ void lazyFreeTrackingTable(void *args[]) { rax *rt = args[0]; @@ -211,17 +199,6 @@ void emptyDbAsync(serverDb *db) { bioCreateLazyFreeJob(lazyfreeFreeDatabase, 2, oldkeys, oldexpires); } -/* Empty a hashtable asynchrounously. */ -void emptyHashtableAsync(serverDb *db, int didx) { - hashtable *oldkeys = kvstoreUnlinkHashtable(db->keys, didx); - hashtable *oldexpires = kvstoreUnlinkHashtable(db->expires, didx); - if (!oldkeys) { - return; - } - atomic_fetch_add_explicit(&lazyfree_objects, hashtableSize(oldkeys), memory_order_relaxed); - bioCreateLazyFreeJob(lazyfreeFreeHashtable, 2, oldkeys, oldexpires); -} - /* Free the key tracking table. * If the table is huge enough, free it in async way. */ void freeTrackingRadixTreeAsync(rax *tracking) { diff --git a/src/networking.c b/src/networking.c index c2828d384a..91a8001984 100644 --- a/src/networking.c +++ b/src/networking.c @@ -242,7 +242,8 @@ void putClientInPendingWriteQueue(client *c) { if (!c->flag.pending_write && (!c->repl_data || c->repl_data->repl_state == REPL_STATE_NONE || - (isReplicaReadyForReplData(c) && !c->repl_data->repl_start_cmd_stream_on_ack))) { + (isReplicaReadyForReplData(c) && !c->repl_data->repl_start_cmd_stream_on_ack)) && + (!c->flag.slot_migration_target || clusterShouldWriteToSlotMigrationTarget())) { /* Here instead of installing the write handler, we just flag the * client and put it into a list of clients that have something * to write to the socket. This way before re-entering the event @@ -292,8 +293,6 @@ int prepareClientToWrite(client *c) { * is set. */ if ((c->flag.replicated) && !c->flag.replication_force_reply) return C_ERR; - if ((c->flag.slot_migration_target && !clusterShouldWriteToSlotMigrationTarget())) return C_ERR; - /* Skip the fake client, such as the fake client for AOF loading. * But CLIENT_ID_CACHED_RESPONSE is allowed since it is a fake client * but has a connection to cache the response. */ diff --git a/src/rdb.c b/src/rdb.c index ba5d219452..e8d4bfae1a 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -3562,10 +3562,8 @@ int saveSnapshotToConnectionSockets(connection **conns, int connsnum, int use_pi server.rdb_pipe_conns = NULL; if (use_pipe) { server.rdb_pipe_conns = conns; - server.rdb_pipe_numconns = 0; - server.rdb_pipe_numconns_writing = 0; - } else { server.rdb_pipe_numconns = connsnum; + server.rdb_pipe_numconns_writing = 0; } /* Create the child process. 
*/ if ((childpid = serverFork(CHILD_TYPE_RDB)) == 0) { diff --git a/src/server.h b/src/server.h index 32d71bafd2..c0af0b6625 100644 --- a/src/server.h +++ b/src/server.h @@ -3452,7 +3452,6 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor); int parseScanCursorOrReply(client *c, robj *o, unsigned long long *cursor); int dbAsyncDelete(serverDb *db, robj *key); void emptyDbAsync(serverDb *db); -void emptyHashtableAsync(serverDb *db, int didx); size_t lazyfreeGetPendingObjectsCount(void); size_t lazyfreeGetFreedObjectsCount(void); void lazyfreeResetStats(void); diff --git a/tests/unit/slot-migration.tcl b/tests/unit/slot-migration.tcl index 90adaf84e0..96048dad60 100644 --- a/tests/unit/slot-migration.tcl +++ b/tests/unit/slot-migration.tcl @@ -7,16 +7,22 @@ # - Parsing test # - Slot must have available primary # -# ---- Reslience ---- +# ---- Error handling ---- # - Target gives up if primary is unavailable # - Source unpauses itself if replica is unavailable # - Client is closed by target during migration +# - Client is closed by source during migration # # ---- Importing slot is not exposed ---- # - KEYS command on importing node # - RANDOMKEY on importing node # -# ---- Replication +# ---- Replication ---- # - Replica receives updates through target primary # - Time out results in replica dropping slots -# - Failover during migration cleans up slots \ No newline at end of file +# - Failover during migration cleans up slots +# - Full sync with pending migration includes pending slots, is cleaned up if migration fails +# +# ---- Loading ---- +# - Partial slot migration is cleaned up after AOF load +# - Partial slot migration is cleaned up after RDB load \ No newline at end of file From 4784891a7651caf63cf02c687fc64922ff6d91d1 Mon Sep 17 00:00:00 2001 From: Jacob Murphy Date: Mon, 20 Jan 2025 09:00:31 +0000 Subject: [PATCH 08/18] Code cleanup Signed-off-by: Jacob Murphy --- src/cluster.h | 1 - src/cluster_legacy.c | 24 +++++++++++++++--------- src/cluster_legacy.h | 23 ++++------------------- src/kvstore.h | 1 - src/networking.c | 4 ++-- src/server.h | 13 ++++--------- 6 files changed, 25 insertions(+), 41 deletions(-) diff --git a/src/cluster.h b/src/cluster.h index 21d4469357..41b6263bd4 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -122,7 +122,6 @@ void bitmapSetAllBits(unsigned char *bitmap, int len); int slotBitmapCompare(slotBitmap bitmap, slotBitmap other); int isSlotBitmapEmpty(slotBitmap bitmap); int getSlotOrReply(client *c, robj *o); -void clusterSlotImportDoneSyncing(long long initial_offset); void clusterSlotMigrationHandleClientClose(client *c); void clusterFeedSlotMigration(int dbid, robj **argv, int argc); int clusterShouldWriteToSlotMigrationTarget(void); diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 97b201a0a2..8ca61e5b1a 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -4461,7 +4461,8 @@ slotExport *clusterGetCurrentSlotExport(void) { void clusterFeedSlotMigration(int dbid, robj **argv, int argc) { UNUSED(dbid); - int i, slot, error_code; + int i, error_code; + int slot = -1; slotExport *curr_export = clusterGetCurrentSlotExport(); if (curr_export == NULL || curr_export->state < SLOT_EXPORT_SNAPSHOTTING) { return; @@ -4475,9 +4476,14 @@ void clusterFeedSlotMigration(int dbid, robj **argv, int argc) { struct serverCommand *cmd = lookupCommand(argv, argc); getNodeByQuery(server.current_client, cmd, argv, argc, &slot, &error_code); if (error_code != CLUSTER_REDIR_NONE || slot == -1) { - /* This shouldn't happen - but 
is possible if a module does something - * like VM_Replicate a cross-slot command. In that case, we don't have - * a clear way to proceed, so it makes sense to give up. */ + /* A couple cases where this could happen: + * - The replicated command is a command without a slot. + * - The replicated command is written by VM_Replicate module APIs + * and is a cross-slot command, or a slot that is not owned by + * this node. + * + * In any case, our best solution is to not replicate this to the + * target node. */ return; } if (!bitmapTestBit(curr_export->slot_bitmap, slot)) return; @@ -7508,14 +7514,14 @@ int clusterCommandSpecial(client *c) { } else if (!strcasecmp(c->argv[1]->ptr, "links") && c->argc == 2) { /* CLUSTER LINKS */ addReplyClusterLinksDescription(c); - } else if (!strcasecmp(c->argv[1]->ptr, "migrate")) { - /* CLUSTER MIGRATE SLOTSRANGE [ ] */ + } else if (!strcasecmp(c->argv[1]->ptr, "import")) { + /* CLUSTER IMPORT SLOTSRANGE [ ] */ if (nodeIsReplica(myself)) { - addReplyError(c, "Only primaries can migrate slots"); + addReplyError(c, "Only primaries can import slots"); return 1; } if (c->argc < 5 || strcasecmp(c->argv[2]->ptr, "slotsrange")) { - addReplyError(c, "Migrate command requires at least one slot range"); + addReplyError(c, "CLUSTER IMPORT command requires at least one slot range"); return 1; } if (c->argc % 2 == 0) { @@ -7570,7 +7576,7 @@ int clusterCommandSpecial(client *c) { addReply(c, shared.ok); } else if (!strcasecmp(c->argv[1]->ptr, "syncslots")) { if (c->argc < 3) { - addReplyError(c, "SYNCSLOTS command requires either START or END to be provided."); + addReplyError(c, "SYNCSLOTS command requires a subcommand to be provided."); return 1; } if (!strcasecmp(c->argv[2]->ptr, "start")) { diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index ee38b3eced..eb9ecc5bb1 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -10,7 +10,7 @@ #define CLUSTER_MF_TIMEOUT 5000 /* Milliseconds to do a manual failover. */ #define CLUSTER_MF_PAUSE_MULT 2 /* Primary pause manual failover mult. */ #define CLUSTER_REPLICA_MIGRATION_DELAY 5000 /* Delay for replica migration. */ -#define CLUSTER_SLOT_IMPORT_TIMEOUT 30000 /* Milliseconds to do a slot migration. */ +#define CLUSTER_SLOT_IMPORT_TIMEOUT 30000 /* Milliseconds to do a slot import. */ /* Reasons why a replica is not able to failover. */ #define CLUSTER_CANT_FAILOVER_NONE 0 @@ -97,9 +97,7 @@ typedef struct clusterNodeFailReport { #define CLUSTERMSG_TYPE_MFSTART 8 /* Pause clients for manual failover */ #define CLUSTERMSG_TYPE_MODULE 9 /* Module cluster API message. */ #define CLUSTERMSG_TYPE_PUBLISHSHARD 10 /* Pub/Sub Publish shard propagation */ -#define CLUSTERMSG_TYPE_MIGRATE_SLOT_START 11 /* Pause clients for slot migration */ -#define CLUSTERMSG_TYPE_COUNT 12 /* Total number of message types. */ - +#define CLUSTERMSG_TYPE_COUNT 11 /* Total number of message types. */ #define CLUSTERMSG_LIGHT 0x8000 /* Modifier bit for message types that support light header */ @@ -136,7 +134,7 @@ typedef struct { typedef struct { uint64_t configEpoch; /* Config epoch of the specified instance. */ char nodename[CLUSTER_NAMELEN]; /* Name of the slots owner. */ - unsigned char slots[CLUSTER_SLOTS / 8]; /* Slots bitmap. */ + slotBitmap slots; /* Slots bitmap. */ } clusterMsgDataUpdate; typedef struct { @@ -146,10 +144,6 @@ typedef struct { unsigned char bulk_data[3]; /* 3 bytes just as placeholder. 
*/ } clusterMsgModule; -typedef struct { - slotBitmap slot_bitmap; -} clusterMsgSlotMigration; - /* The cluster supports optional extension messages that can be sent * along with ping/pong/meet messages to give additional info in a * consistent manner. */ @@ -236,12 +230,6 @@ union clusterMsgData { struct { clusterMsgModule msg; } module; - - /* SLOT_MIGRATION */ - struct { - clusterMsgSlotMigration msg; - } slot_migration; - }; #define CLUSTER_PROTO_VER 1 /* Cluster bus protocol version. */ @@ -260,7 +248,7 @@ typedef struct { uint64_t offset; /* Primary replication offset if node is a primary or processed replication offset if node is a replica. */ char sender[CLUSTER_NAMELEN]; /* Name of the sender node */ - unsigned char myslots[CLUSTER_SLOTS / 8]; + slotBitmap myslots; char replicaof[CLUSTER_NAMELEN]; char myip[NET_IP_STR_LEN]; /* Sender IP, if not all zeroed. */ uint16_t extensions; /* Number of extensions sent along with this packet. */ @@ -458,9 +446,6 @@ struct clusterState { or -1 if still not received. */ int mf_can_start; /* If non-zero signal that the manual failover can start requesting primary vote. */ - /* Manual failover state for slot migration */ - slotBitmap mf_slots; /* Slots in migration. */ - clusterNode *mf_slots_target; /* The following fields are used by primaries to take state on elections. */ uint64_t lastVoteEpoch; /* Epoch of the last vote granted. */ int todo_before_sleep; /* Things to do in clusterBeforeSleep(). */ diff --git a/src/kvstore.h b/src/kvstore.h index bc3baba43a..fee8d71dbd 100644 --- a/src/kvstore.h +++ b/src/kvstore.h @@ -16,7 +16,6 @@ typedef int(kvstoreIteratorPredicate)(int didx, void *privdata); #define KVSTORE_FREE_EMPTY_HASHTABLES (1 << 1) kvstore *kvstoreCreate(hashtableType *type, int num_hashtables_bits, int flags); void kvstoreEmpty(kvstore *kvs, void(callback)(hashtable *)); -void kvstoreEmptyHashtable(kvstore *kvs, int didx, void(callback)(hashtable *)); void kvstoreRelease(kvstore *kvs); unsigned long long kvstoreSize(kvstore *kvs); unsigned long kvstoreBuckets(kvstore *kvs); diff --git a/src/networking.c b/src/networking.c index 91a8001984..cad0e86de0 100644 --- a/src/networking.c +++ b/src/networking.c @@ -2156,11 +2156,11 @@ int postWriteToClient(client *c) { } if (c->nwritten > 0) { c->net_output_bytes += c->nwritten; - /* For clients representing replication sources we don't count sending data + /* For clients representing primaries we don't count sending data * as an interaction, since we always send REPLCONF ACK commands * that take some time to just fill the socket output buffer. * We just rely on data / pings received for timeout detection. */ - if (!c->flag.replicated) c->last_interaction = server.unixtime; + if (!c->flag.primary) c->last_interaction = server.unixtime; } if (!clientHasPendingReplies(c)) { c->sentlen = 0; diff --git a/src/server.h b/src/server.h index c0af0b6625..1344db1de9 100644 --- a/src/server.h +++ b/src/server.h @@ -389,7 +389,6 @@ typedef enum blocking_type { * what to do next. */ typedef enum { REPL_STATE_NONE = 0, /* No active replication */ - REPL_STATE_ERROR, /* Error in replication. 
*/ REPL_STATE_CONNECT, /* Must connect to primary */ REPL_STATE_CONNECTING, /* Connecting to primary */ /* --- Handshake states, must be ordered --- */ @@ -398,7 +397,6 @@ typedef enum { REPL_STATE_RECEIVE_AUTH_REPLY, /* Wait for AUTH reply */ REPL_STATE_RECEIVE_PORT_REPLY, /* Wait for REPLCONF reply */ REPL_STATE_RECEIVE_IP_REPLY, /* Wait for REPLCONF reply */ - REPL_STATE_RECEIVE_SLOT_REPLY, /* Wait for REPLCONF reply */ REPL_STATE_RECEIVE_CAPA_REPLY, /* Wait for REPLCONF reply */ REPL_STATE_RECEIVE_VERSION_REPLY, /* Wait for REPLCONF reply */ REPL_STATE_SEND_PSYNC, /* Send PSYNC */ @@ -451,7 +449,6 @@ typedef enum { #define REPLICA_REQ_RDB_EXCLUDE_DATA (1 << 0) /* Exclude data from RDB */ #define REPLICA_REQ_RDB_EXCLUDE_FUNCTIONS (1 << 1) /* Exclude functions from RDB */ #define REPLICA_REQ_RDB_CHANNEL (1 << 2) /* Use dual-channel-replication */ -#define REPLICA_REQ_AOF_FORMAT (1 << 3) /* Use AOF-based replication format*/ /* Mask of all bits in the replica requirements bitfield that represent non-standard (filtered) RDB requirements */ #define REPLICA_REQ_RDB_MASK (REPLICA_REQ_RDB_EXCLUDE_DATA | REPLICA_REQ_RDB_EXCLUDE_FUNCTIONS) @@ -1094,10 +1091,10 @@ typedef struct ClientFlags { * flag, we won't cache the primary in freeClient. */ uint64_t fake : 1; /* This is a fake client without a real connection. */ uint64_t import_source : 1; /* This client is importing data to server and can visit expired key. */ - uint64_t replicated : 1; /* This client is a replication source (i.e. primary or slot migration). */ + uint64_t replicated : 1; /* This client is a replication source (i.e. primary or slot migration). */ uint64_t slot_migration_source : 1; /* This client is a slot migration source. */ uint64_t slot_migration_target : 1; /* This client is a slot migration target. */ - uint64_t reserved : 3; /* Reserved for future use */ + uint64_t reserved : 1; /* Reserved for future use */ } ClientFlags; typedef struct ClientPubSubData { @@ -1543,7 +1540,6 @@ typedef struct { #define CHILD_TYPE_AOF 2 #define CHILD_TYPE_LDB 3 #define CHILD_TYPE_MODULE 4 -#define CHILD_TYPE_SYNCSLOTS 5 typedef enum childInfoType { CHILD_INFO_TYPE_CURRENT_INFO, @@ -1714,8 +1710,8 @@ struct valkeyServer { long long stat_net_repl_input_bytes; /* Bytes read during replication, added to stat_net_input_bytes in 'info'. */ /* Bytes written during replication, added to stat_net_output_bytes in 'info'. */ long long stat_net_repl_output_bytes; - long long stat_net_slot_migration_input_bytes; /* Bytes read during replication, added to stat_net_input_bytes in 'info'. */ - long long stat_net_slot_migration_output_bytes; /* Bytes read during replication, added to stat_net_input_bytes in 'info'. */ + long long stat_net_slot_migration_input_bytes; /* Bytes read during slot migration, added to stat_net_input_bytes in 'info'. */ + long long stat_net_slot_migration_output_bytes; /* Bytes written during slot migration, added to stat_net_output_bytes in 'info'. */ size_t stat_current_cow_peak; /* Peak size of copy on write bytes. */ size_t stat_current_cow_bytes; /* Copy on write bytes while child is active. 
*/ monotime stat_current_cow_updated; /* Last update time of stat_current_cow_bytes */ @@ -3690,7 +3686,6 @@ void sdiffCommand(client *c); void sdiffstoreCommand(client *c); void sscanCommand(client *c); void syncCommand(client *c); -void syncSlotsCommand(client *c); void flushdbCommand(client *c); void flushallCommand(client *c); void sortCommand(client *c); From b52d77980f72726d0f886a6f7b02c78ab2041861 Mon Sep 17 00:00:00 2001 From: Jacob Murphy Date: Mon, 20 Jan 2025 09:01:53 +0000 Subject: [PATCH 09/18] Cleanup debug log Signed-off-by: Jacob Murphy --- src/aof.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/aof.c b/src/aof.c index 0cd64820c8..6ee7d99c0a 100644 --- a/src/aof.c +++ b/src/aof.c @@ -2291,7 +2291,6 @@ int rewriteAppendOnlyFileRio(rio *aof, slotBitmap slot_bitmap) { updated_time = now; } } - serverLog(LL_NOTICE, "AOF rewrite: %s, key_count: %ld", keystr, key_count); /* Delay before next key if required (for testing) */ if (server.rdb_key_save_delay) debugDelay(server.rdb_key_save_delay); From 3b5e555c8a03b0344eec9b1871144ffc435cc2ce Mon Sep 17 00:00:00 2001 From: Jacob Murphy Date: Mon, 20 Jan 2025 09:05:57 +0000 Subject: [PATCH 10/18] Rename CLUSTER MIGRATE to CLUSTER IMPORT Signed-off-by: Jacob Murphy --- src/commands.def | 40 +++++++++++++++---------------- src/commands/cluster-migrate.json | 2 +- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/src/commands.def b/src/commands.def index f0a1183e5a..fc910f96bd 100644 --- a/src/commands.def +++ b/src/commands.def @@ -599,6 +599,25 @@ struct COMMAND_ARG CLUSTER_GETKEYSINSLOT_Args[] = { #define CLUSTER_HELP_Keyspecs NULL #endif +/********** CLUSTER IMPORT ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* CLUSTER IMPORT history */ +#define CLUSTER_IMPORT_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* CLUSTER IMPORT tips */ +const char *CLUSTER_IMPORT_Tips[] = { +"nondeterministic_output", +}; +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* CLUSTER IMPORT key specs */ +#define CLUSTER_IMPORT_Keyspecs NULL +#endif + /********** CLUSTER INFO ********************/ #ifndef SKIP_CMD_HISTORY_TABLE @@ -685,25 +704,6 @@ struct COMMAND_ARG CLUSTER_MEET_Args[] = { {MAKE_ARG("cluster-bus-port",ARG_TYPE_INTEGER,-1,NULL,NULL,"4.0.0",CMD_ARG_OPTIONAL,0,NULL)}, }; -/********** CLUSTER MIGRATE ********************/ - -#ifndef SKIP_CMD_HISTORY_TABLE -/* CLUSTER MIGRATE history */ -#define CLUSTER_MIGRATE_History NULL -#endif - -#ifndef SKIP_CMD_TIPS_TABLE -/* CLUSTER MIGRATE tips */ -const char *CLUSTER_MIGRATE_Tips[] = { -"nondeterministic_output", -}; -#endif - -#ifndef SKIP_CMD_KEY_SPECS_TABLE -/* CLUSTER MIGRATE key specs */ -#define CLUSTER_MIGRATE_Keyspecs NULL -#endif - /********** CLUSTER MYID ********************/ #ifndef SKIP_CMD_HISTORY_TABLE @@ -1052,11 +1052,11 @@ struct COMMAND_STRUCT CLUSTER_Subcommands[] = { {MAKE_CMD("forget","Removes a node from the nodes table.","O(1)","3.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_FORGET_History,0,CLUSTER_FORGET_Tips,0,clusterCommand,3,CMD_NO_ASYNC_LOADING|CMD_ADMIN|CMD_STALE,0,CLUSTER_FORGET_Keyspecs,0,NULL,1),.args=CLUSTER_FORGET_Args}, {MAKE_CMD("getkeysinslot","Returns the key names in a hash slot.","O(N) where N is the number of requested keys","3.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_GETKEYSINSLOT_History,0,CLUSTER_GETKEYSINSLOT_Tips,1,clusterCommand,4,CMD_STALE,0,CLUSTER_GETKEYSINSLOT_Keyspecs,0,NULL,2),.args=CLUSTER_GETKEYSINSLOT_Args}, {MAKE_CMD("help","Returns helpful text about the 
different subcommands.","O(1)","5.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_HELP_History,0,CLUSTER_HELP_Tips,0,clusterCommand,2,CMD_LOADING|CMD_STALE,0,CLUSTER_HELP_Keyspecs,0,NULL,0)}, +{MAKE_CMD("import","Initiates server driven hash slot migration, importing the given slot to this shard.","O(N) where N is the total number of hash slot arguments","8.1.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_IMPORT_History,0,CLUSTER_IMPORT_Tips,1,clusterCommand,-2,CMD_ADMIN|CMD_STALE,0,CLUSTER_IMPORT_Keyspecs,0,NULL,0)}, {MAKE_CMD("info","Returns information about the state of a node.","O(1)","3.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_INFO_History,0,CLUSTER_INFO_Tips,1,clusterCommand,2,CMD_LOADING|CMD_STALE,0,CLUSTER_INFO_Keyspecs,0,NULL,0)}, {MAKE_CMD("keyslot","Returns the hash slot for a key.","O(N) where N is the number of bytes in the key","3.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_KEYSLOT_History,0,CLUSTER_KEYSLOT_Tips,0,clusterCommand,3,CMD_STALE,0,CLUSTER_KEYSLOT_Keyspecs,0,NULL,1),.args=CLUSTER_KEYSLOT_Args}, {MAKE_CMD("links","Returns a list of all TCP links to and from peer nodes.","O(N) where N is the total number of Cluster nodes","7.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_LINKS_History,0,CLUSTER_LINKS_Tips,1,clusterCommand,2,CMD_STALE,0,CLUSTER_LINKS_Keyspecs,0,NULL,0)}, {MAKE_CMD("meet","Forces a node to handshake with another node.","O(1)","3.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_MEET_History,1,CLUSTER_MEET_Tips,0,clusterCommand,-4,CMD_NO_ASYNC_LOADING|CMD_ADMIN|CMD_STALE,0,CLUSTER_MEET_Keyspecs,0,NULL,3),.args=CLUSTER_MEET_Args}, -{MAKE_CMD("migrate","Initiates server driven hash slot migration, importing the given slot to this shard.","O(N) where N is the total number of hash slot arguments","8.1.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_MIGRATE_History,0,CLUSTER_MIGRATE_Tips,1,clusterCommand,-2,CMD_ADMIN|CMD_STALE,0,CLUSTER_MIGRATE_Keyspecs,0,NULL,0)}, {MAKE_CMD("myid","Returns the ID of a node.","O(1)","3.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_MYID_History,0,CLUSTER_MYID_Tips,0,clusterCommand,2,CMD_LOADING|CMD_STALE,0,CLUSTER_MYID_Keyspecs,0,NULL,0)}, {MAKE_CMD("myshardid","Returns the shard ID of a node.","O(1)","7.2.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_MYSHARDID_History,0,CLUSTER_MYSHARDID_Tips,1,clusterCommand,2,CMD_LOADING|CMD_STALE,0,CLUSTER_MYSHARDID_Keyspecs,0,NULL,0)}, {MAKE_CMD("nodes","Returns the cluster configuration for a node.","O(N) where N is the total number of Cluster nodes","3.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_NODES_History,0,CLUSTER_NODES_Tips,1,clusterCommand,2,CMD_LOADING|CMD_STALE,0,CLUSTER_NODES_Keyspecs,0,NULL,0)}, diff --git a/src/commands/cluster-migrate.json b/src/commands/cluster-migrate.json index 719e827fa4..e7b34be508 100644 --- a/src/commands/cluster-migrate.json +++ b/src/commands/cluster-migrate.json @@ -1,5 +1,5 @@ { - "MIGRATE": { + "IMPORT": { "summary": "Initiates server driven hash slot migration, importing the given slot to this shard.", "complexity": "O(N) where N is the total number of hash slot arguments", "group": "cluster", From c59a7f7e8e3858699ac0fc809a085723741163d6 Mon Sep 17 00:00:00 2001 From: Jacob Murphy Date: Mon, 20 Jan 2025 09:42:49 +0000 Subject: [PATCH 11/18] Fix implicit fallthrough Signed-off-by: Jacob Murphy --- src/cluster_legacy.c | 1 + 1 file 
changed, 1 insertion(+) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 8ca61e5b1a..ddb25663a0 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -4635,6 +4635,7 @@ void clusterProceedWithSlotImport(void) { * straight to readQueryFromClient. */ connSetReadHandler(curr_import->client->conn, readQueryFromClient); curr_import->state = SLOT_IMPORT_RECEIVE_SYNCSLOTS; + continue; case SLOT_IMPORT_RECEIVE_SYNCSLOTS: /* Nothing to do in this state. Waiting for CLUSTER SYNCSLOTS ENDSNAPSHOT to be processed. */ return; From 586b22e904efaaffbae04c31b911d3e223a13ca6 Mon Sep 17 00:00:00 2001 From: Jacob Murphy Date: Mon, 20 Jan 2025 09:43:38 +0000 Subject: [PATCH 12/18] Fix mac build Signed-off-by: Jacob Murphy --- src/cluster_legacy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index ddb25663a0..f2e1990804 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -4496,7 +4496,7 @@ void clusterFeedSlotMigration(int dbid, robj **argv, int argc) { curr_export->syncslot_offset += curr_export->client->reply_bytes - prev_pending; } -int clusterShouldWriteToSlotMigrationTarget() { +int clusterShouldWriteToSlotMigrationTarget(void) { slotExport *curr_export = clusterGetCurrentSlotExport(); return curr_export && (curr_export->state == SLOT_EXPORT_PAUSE_AND_REPLY || curr_export->state == SLOT_EXPORT_PAUSED); } From 117bfa98ffeea4cb17e1f2ee6782251378e2c7de Mon Sep 17 00:00:00 2001 From: Jacob Murphy Date: Mon, 20 Jan 2025 10:10:16 +0000 Subject: [PATCH 13/18] Apply clang format Signed-off-by: Jacob Murphy --- src/aof.c | 2 +- src/cluster_legacy.c | 412 +++++++++++++++++++++---------------------- src/cluster_legacy.h | 6 +- src/rdb.c | 6 +- src/server.c | 3 +- src/server.h | 36 ++-- 6 files changed, 232 insertions(+), 233 deletions(-) diff --git a/src/aof.c b/src/aof.c index 6ee7d99c0a..8befd2d8a1 100644 --- a/src/aof.c +++ b/src/aof.c @@ -2191,7 +2191,7 @@ static int rewriteFunctions(rio *aof) { return 0; } -int slotFilterPredicate(int slot, void * privdata) { +int slotFilterPredicate(int slot, void *privdata) { if (privdata == NULL) return 1; unsigned char *slot_bitmap = (unsigned char *)privdata; return bitmapTestBit(slot_bitmap, slot); diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index f2e1990804..ae190fc83b 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -2575,7 +2575,7 @@ void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoc /* Was this slot mine and it was in a paused state for slot * migration? If so, mark the move as done. 
*/ - slotExport * curr_export = clusterGetCurrentSlotExport(); + slotExport *curr_export = clusterGetCurrentSlotExport(); if (server.cluster->slots[j] == myself && curr_export && bitmapTestBit(curr_export->slot_bitmap, j)) { bitmapClearBit(curr_export->slot_bitmap, j); if (isSlotBitmapEmpty(curr_export->slot_bitmap)) { @@ -4416,7 +4416,7 @@ void clusterPropagatePublish(robj *channel, robj *message, int sharded) { * -------------------------------------------------------------------------- */ slotImport *clusterCreateSlotImportJob(clusterNode *source, slotBitmap slots) { - slotImport *result = (slotImport *) zcalloc(sizeof(slotImport)); + slotImport *result = (slotImport *)zcalloc(sizeof(slotImport)); memcpy(result->slot_bitmap, slots, sizeof(slotBitmap)); result->source_node = source; result->state = SLOT_IMPORT_QUEUED; @@ -4425,7 +4425,7 @@ slotImport *clusterCreateSlotImportJob(clusterNode *source, slotBitmap slots) { } slotExport *clusterCreateSlotExportJob(client *c, slotBitmap slots) { - slotExport *result = (slotExport *) zcalloc(sizeof(slotExport)); + slotExport *result = (slotExport *)zcalloc(sizeof(slotExport)); memcpy(result->slot_bitmap, slots, sizeof(slotBitmap)); result->state = SLOT_EXPORT_QUEUED; result->pause_end = -1; @@ -4451,12 +4451,12 @@ void clusterFreeSlotExportJob(slotExport *slot_export) { slotImport *clusterGetCurrentSlotImport(void) { if (listLength(server.cluster->slot_import_jobs) == 0) return NULL; - return (slotImport *) listFirst(server.cluster->slot_import_jobs)->value; + return (slotImport *)listFirst(server.cluster->slot_import_jobs)->value; } slotExport *clusterGetCurrentSlotExport(void) { if (listLength(server.cluster->slot_export_jobs) == 0) return NULL; - return (slotExport *) listFirst(server.cluster->slot_export_jobs)->value; + return (slotExport *)listFirst(server.cluster->slot_export_jobs)->value; } void clusterFeedSlotMigration(int dbid, robj **argv, int argc) { @@ -4477,13 +4477,13 @@ void clusterFeedSlotMigration(int dbid, robj **argv, int argc) { getNodeByQuery(server.current_client, cmd, argv, argc, &slot, &error_code); if (error_code != CLUSTER_REDIR_NONE || slot == -1) { /* A couple cases where this could happen: - * - The replicated command is a command without a slot. - * - The replicated command is written by VM_Replicate module APIs - * and is a cross-slot command, or a slot that is not owned by - * this node. - * - * In any case, our best solution is to not replicate this to the - * target node. */ + * - The replicated command is a command without a slot. + * - The replicated command is written by VM_Replicate module APIs + * and is a cross-slot command, or a slot that is not owned by + * this node. + * + * In any case, our best solution is to not replicate this to the + * target node. 
*/ return; } if (!bitmapTestBit(curr_export->slot_bitmap, slot)) return; @@ -4526,10 +4526,10 @@ void clusterProceedWithSlotImport(void) { char *err; while (clusterGetCurrentSlotImport() != NULL) { listNode *curr_node = listFirst(server.cluster->slot_import_jobs); - slotImport *curr_import = (slotImport *) curr_node->value; + slotImport *curr_import = (slotImport *)curr_node->value; if (curr_import->state != SLOT_IMPORT_QUEUED && curr_import->end_time < mstime()) { serverLog(LL_WARNING, - "Timed out for slot import from source node %.40s", curr_import->source_node->name); + "Timed out for slot import from source node %.40s", curr_import->source_node->name); curr_import->state = SLOT_IMPORT_FAILED; } if (curr_import->state > SLOT_IMPORT_PAUSE_OWNER && curr_import->state != SLOT_IMPORT_FAILED && curr_import->pause_end < mstime()) { @@ -4542,155 +4542,155 @@ void clusterProceedWithSlotImport(void) { serverLog(LL_WARNING, "Client for slot import from source node %.40s has been closed", curr_import->source_node->name); curr_import->state = SLOT_IMPORT_FAILED; } - switch(curr_import->state) { - case SLOT_IMPORT_QUEUED: - /* Start the migration */ - serverLog(LL_NOTICE, "Starting slot import from source node %.40s", curr_import->source_node->name); - curr_import->end_time = mstime() + CLUSTER_SLOT_IMPORT_TIMEOUT; - curr_import->conn = connCreate(server.tls_replication ? connectionTypeTls() : connectionTypeTcp()); - if (connConnect(curr_import->conn, curr_import->source_node->ip, getNodeDefaultReplicationPort(curr_import->source_node), server.bind_source_addr, clusterImportHandler) == C_ERR) { - serverLog(LL_WARNING, - "Failed to connect to slot import source node %.40s", curr_import->source_node->name); - curr_import->state = SLOT_IMPORT_FAILED; - continue; - } - curr_import->state = SLOT_IMPORT_CONNECTING; + switch (curr_import->state) { + case SLOT_IMPORT_QUEUED: + /* Start the migration */ + serverLog(LL_NOTICE, "Starting slot import from source node %.40s", curr_import->source_node->name); + curr_import->end_time = mstime() + CLUSTER_SLOT_IMPORT_TIMEOUT; + curr_import->conn = connCreate(server.tls_replication ? connectionTypeTls() : connectionTypeTcp()); + if (connConnect(curr_import->conn, curr_import->source_node->ip, getNodeDefaultReplicationPort(curr_import->source_node), server.bind_source_addr, clusterImportHandler) == C_ERR) { + serverLog(LL_WARNING, + "Failed to connect to slot import source node %.40s", curr_import->source_node->name); + curr_import->state = SLOT_IMPORT_FAILED; continue; - case SLOT_IMPORT_CONNECTING: - if (curr_import->conn->state == CONN_STATE_CONNECTING) { - /* Nothing to do, waiting for connection to be established. */ - return; - } else if (curr_import->conn->state != CONN_STATE_CONNECTED) { - serverLog(LL_WARNING, "Failed during connection to slot import source node %.40s: %s", curr_import->source_node->name, connGetLastError(curr_import->conn)); - curr_import->state = SLOT_IMPORT_FAILED; - continue; - } - serverLog(LL_NOTICE, "Connected to slot import source node %.40s", curr_import->source_node->name); - connSetReadHandler(curr_import->conn, NULL); - client *c = createClient(curr_import->conn); - curr_import->client = c; - c->flag.replicated = 1; - c->flag.slot_migration_source = 1; - c->flag.authenticated = 1; - c->user = NULL; /* This client can do everything. */ - c->querybuf = sdsempty(); /* Similar to primary, we use a dedicated query buf. 
*/ - initClientReplicationData(c); - - curr_import->state = SLOT_IMPORT_SEND_AUTH; + } + curr_import->state = SLOT_IMPORT_CONNECTING; + continue; + case SLOT_IMPORT_CONNECTING: + if (curr_import->conn->state == CONN_STATE_CONNECTING) { + /* Nothing to do, waiting for connection to be established. */ + return; + } else if (curr_import->conn->state != CONN_STATE_CONNECTED) { + serverLog(LL_WARNING, "Failed during connection to slot import source node %.40s: %s", curr_import->source_node->name, connGetLastError(curr_import->conn)); + curr_import->state = SLOT_IMPORT_FAILED; continue; - case SLOT_IMPORT_SEND_AUTH: - if (!server.primary_auth) { - curr_import->state = SLOT_IMPORT_SEND_SYNCSLOTS; - continue; - } - char *auth_args[3] = {"AUTH", NULL, NULL}; - size_t auth_lens[3] = {4, 0, 0}; - int argc = 1; - if (server.primary_user) { - auth_args[argc] = server.primary_user; - auth_lens[argc] = strlen(server.primary_user); - argc++; - } - auth_args[argc] = server.primary_auth; - auth_lens[argc] = sdslen(server.primary_auth); + } + serverLog(LL_NOTICE, "Connected to slot import source node %.40s", curr_import->source_node->name); + connSetReadHandler(curr_import->conn, NULL); + client *c = createClient(curr_import->conn); + curr_import->client = c; + c->flag.replicated = 1; + c->flag.slot_migration_source = 1; + c->flag.authenticated = 1; + c->user = NULL; /* This client can do everything. */ + c->querybuf = sdsempty(); /* Similar to primary, we use a dedicated query buf. */ + initClientReplicationData(c); + + curr_import->state = SLOT_IMPORT_SEND_AUTH; + continue; + case SLOT_IMPORT_SEND_AUTH: + if (!server.primary_auth) { + curr_import->state = SLOT_IMPORT_SEND_SYNCSLOTS; + continue; + } + char *auth_args[3] = {"AUTH", NULL, NULL}; + size_t auth_lens[3] = {4, 0, 0}; + int argc = 1; + if (server.primary_user) { + auth_args[argc] = server.primary_user; + auth_lens[argc] = strlen(server.primary_user); argc++; - err = sendCommandArgv(curr_import->conn, argc, auth_args, auth_lens); - if (err) { - serverLog(LL_WARNING, "Failed to write AUTH to slot migration source: %s", err); - sdsfree(err); - curr_import->state = SLOT_IMPORT_FAILED; - continue; - } - curr_import->state = SLOT_IMPORT_RECEIVE_AUTH; + } + auth_args[argc] = server.primary_auth; + auth_lens[argc] = sdslen(server.primary_auth); + argc++; + err = sendCommandArgv(curr_import->conn, argc, auth_args, auth_lens); + if (err) { + serverLog(LL_WARNING, "Failed to write AUTH to slot migration source: %s", err); + sdsfree(err); + curr_import->state = SLOT_IMPORT_FAILED; continue; - case SLOT_IMPORT_RECEIVE_AUTH: - err = receiveSynchronousResponse(curr_import->conn); - if (err == NULL) { - serverLog(LL_WARNING, "Slot migration source did not respond to AUTH command"); - } - if (err[0] == '-') { - serverLog(LL_WARNING, "Unable to AUTH to slot migration source: %s", err); - sdsfree(err); - } + } + curr_import->state = SLOT_IMPORT_RECEIVE_AUTH; + continue; + case SLOT_IMPORT_RECEIVE_AUTH: + err = receiveSynchronousResponse(curr_import->conn); + if (err == NULL) { + serverLog(LL_WARNING, "Slot migration source did not respond to AUTH command"); + } + if (err[0] == '-') { + serverLog(LL_WARNING, "Unable to AUTH to slot migration source: %s", err); sdsfree(err); - err = NULL; - curr_import->state = SLOT_IMPORT_SEND_SYNCSLOTS; + } + sdsfree(err); + err = NULL; + curr_import->state = SLOT_IMPORT_SEND_SYNCSLOTS; + continue; + case SLOT_IMPORT_SEND_SYNCSLOTS: + /* Ensure we have a clean state for the SYNC. 
*/ + delKeysInSlotBitmap(curr_import->slot_bitmap); + + serverLog(LL_NOTICE, "Sending CLUSTER SYNCSLOTS START request to source node %.40s", curr_import->source_node->name); + char *syncslots_args[4] = {"CLUSTER", "SYNCSLOTS", "START", (char *)curr_import->slot_bitmap}; + size_t syncslots_lens[4] = {7, 9, 5, sizeof(slotBitmap)}; + err = sendCommandArgv(curr_import->conn, 4, syncslots_args, syncslots_lens); + if (err) { + serverLog(LL_WARNING, "Failed to write SYNCSLOTS START to slot migration source: %s", err); + sdsfree(err); + curr_import->state = SLOT_IMPORT_FAILED; continue; - case SLOT_IMPORT_SEND_SYNCSLOTS: - /* Ensure we have a clean state for the SYNC. */ - delKeysInSlotBitmap(curr_import->slot_bitmap); - - serverLog(LL_NOTICE, "Sending CLUSTER SYNCSLOTS START request to source node %.40s", curr_import->source_node->name); - char *syncslots_args[4] = {"CLUSTER", "SYNCSLOTS", "START", (char *)curr_import->slot_bitmap}; - size_t syncslots_lens[4] = {7, 9, 5, sizeof(slotBitmap)}; - err = sendCommandArgv(curr_import->conn, 4, syncslots_args, syncslots_lens); - if (err) { - serverLog(LL_WARNING, "Failed to write SYNCSLOTS START to slot migration source: %s", err); - sdsfree(err); - curr_import->state = SLOT_IMPORT_FAILED; - continue; - } + } - /* Our result will be received in AOF format, so we can pipe it - * straight to readQueryFromClient. */ - connSetReadHandler(curr_import->client->conn, readQueryFromClient); - curr_import->state = SLOT_IMPORT_RECEIVE_SYNCSLOTS; - continue; - case SLOT_IMPORT_RECEIVE_SYNCSLOTS: - /* Nothing to do in this state. Waiting for CLUSTER SYNCSLOTS ENDSNAPSHOT to be processed. */ - return; - case SLOT_IMPORT_PAUSE_OWNER: - curr_import->client->flag.replication_force_reply = 1; - addReplyArrayLen(curr_import->client, 3); - addReplyBulkCBuffer(curr_import->client, "CLUSTER", 7); - addReplyBulkCBuffer(curr_import->client, "SYNCSLOTS", 9); - addReplyBulkCBuffer(curr_import->client, "PAUSE", 5); - curr_import->client->flag.replication_force_reply = 0; - - serverLog(LL_NOTICE, "SYNCSLOTS from slot migration source %.40s (%s) has been performed. Pausing source node and waiting to continue", curr_import->source_node->name, curr_import->source_node->human_nodename); - curr_import->paused_at_offset = -1; - curr_import->pause_end = mstime() + CLUSTER_MF_TIMEOUT; - curr_import->state = SLOT_IMPORT_WAITING_FOR_OFFSET; + /* Our result will be received in AOF format, so we can pipe it + * straight to readQueryFromClient. */ + connSetReadHandler(curr_import->client->conn, readQueryFromClient); + curr_import->state = SLOT_IMPORT_RECEIVE_SYNCSLOTS; + continue; + case SLOT_IMPORT_RECEIVE_SYNCSLOTS: + /* Nothing to do in this state. Waiting for CLUSTER SYNCSLOTS ENDSNAPSHOT to be processed. */ + return; + case SLOT_IMPORT_PAUSE_OWNER: + curr_import->client->flag.replication_force_reply = 1; + addReplyArrayLen(curr_import->client, 3); + addReplyBulkCBuffer(curr_import->client, "CLUSTER", 7); + addReplyBulkCBuffer(curr_import->client, "SYNCSLOTS", 9); + addReplyBulkCBuffer(curr_import->client, "PAUSE", 5); + curr_import->client->flag.replication_force_reply = 0; + + serverLog(LL_NOTICE, "SYNCSLOTS from slot migration source %.40s (%s) has been performed. 
Pausing source node and waiting to continue", curr_import->source_node->name, curr_import->source_node->human_nodename); + curr_import->paused_at_offset = -1; + curr_import->pause_end = mstime() + CLUSTER_MF_TIMEOUT; + curr_import->state = SLOT_IMPORT_WAITING_FOR_OFFSET; + continue; + case SLOT_IMPORT_WAITING_FOR_OFFSET: + return; + case SLOT_IMPORT_SYNCING_TO_OFFSET: + if (curr_import->client->repl_data->reploff >= curr_import->paused_at_offset) { + serverLog(LL_NOTICE, "SYNCSLOTS of slot range has caught up to paused slot owner (%lld), slot migration can start.", curr_import->paused_at_offset); + curr_import->state = SLOT_IMPORT_FINISH; continue; - case SLOT_IMPORT_WAITING_FOR_OFFSET: - return; - case SLOT_IMPORT_SYNCING_TO_OFFSET: - if (curr_import->client->repl_data->reploff >= curr_import->paused_at_offset) { - serverLog(LL_NOTICE, "SYNCSLOTS of slot range has caught up to paused slot owner (%lld), slot migration can start.", curr_import->paused_at_offset); - curr_import->state = SLOT_IMPORT_FINISH; - continue; - } - /* Need to wait for the sync to progress further */ - return; - case SLOT_IMPORT_FINISH: - serverLog(LL_NOTICE, "Setting myself to owner of migrating slots and broadcasting"); - for (int i = 0; i < CLUSTER_SLOTS; i++) { - if (bitmapTestBit(curr_import->slot_bitmap, i)) { - clusterDelSlot(i); - clusterAddSlot(myself, i); - } - } - clusterUpdateState(); - clusterSaveConfigOrDie(1); - if (clusterBumpConfigEpochWithoutConsensus() == C_OK) { - serverLog(LL_NOTICE, "ConfigEpoch updated after importing slots"); + } + /* Need to wait for the sync to progress further */ + return; + case SLOT_IMPORT_FINISH: + serverLog(LL_NOTICE, "Setting myself to owner of migrating slots and broadcasting"); + for (int i = 0; i < CLUSTER_SLOTS; i++) { + if (bitmapTestBit(curr_import->slot_bitmap, i)) { + clusterDelSlot(i); + clusterAddSlot(myself, i); } - clusterFreeSlotImportJob(curr_import); - clusterBroadcastPong(CLUSTER_BROADCAST_ALL); - listDelNode(server.cluster->slot_import_jobs, curr_node); - continue; - case SLOT_IMPORT_FAILED: - listDelNode(server.cluster->slot_import_jobs, curr_node); - delKeysInSlotBitmap(curr_import->slot_bitmap); - clusterFreeSlotImportJob(curr_import); - continue; + } + clusterUpdateState(); + clusterSaveConfigOrDie(1); + if (clusterBumpConfigEpochWithoutConsensus() == C_OK) { + serverLog(LL_NOTICE, "ConfigEpoch updated after importing slots"); + } + clusterFreeSlotImportJob(curr_import); + clusterBroadcastPong(CLUSTER_BROADCAST_ALL); + listDelNode(server.cluster->slot_import_jobs, curr_node); + continue; + case SLOT_IMPORT_FAILED: + listDelNode(server.cluster->slot_import_jobs, curr_node); + delKeysInSlotBitmap(curr_import->slot_bitmap); + clusterFreeSlotImportJob(curr_import); + continue; } } } int childSnapshotForSyncSlot(int req, rio *rdb, void *privdata) { UNUSED(req); - int retval = rewriteAppendOnlyFileRio(rdb, (unsigned char *) privdata); + int retval = rewriteAppendOnlyFileRio(rdb, (unsigned char *)privdata); rioWrite(rdb, "*3\r\n", 4); rioWriteBulkString(rdb, "CLUSTER", 7); rioWriteBulkString(rdb, "SYNCSLOTS", 9); @@ -4701,67 +4701,67 @@ int childSnapshotForSyncSlot(int req, rio *rdb, void *privdata) { void clusterProceedWithSlotExport(void) { while (clusterGetCurrentSlotExport() != NULL) { listNode *curr_node = listFirst(server.cluster->slot_export_jobs); - slotExport *curr_export = (slotExport *) curr_node->value; + slotExport *curr_export = (slotExport *)curr_node->value; if (curr_export->client == NULL) { serverLog(LL_WARNING, "Client for slot export 
has been closed"); curr_export->state = SLOT_EXPORT_FAILED; } - switch(curr_export->state) { - case SLOT_EXPORT_QUEUED: - if (hasActiveChildProcess()) { - /* We need to wait for the child to die, then we can - * proceed. */ - return; - } - connection ** conns = zmalloc(sizeof(connection*)); - *conns = curr_export->client->conn; - serverLog(LL_NOTICE, "Initiating snapshot to conn with fd %d", curr_export->client->conn->fd); - if (saveSnapshotToConnectionSockets(conns, 1, 1, 0, childSnapshotForSyncSlot, curr_export->slot_bitmap) != C_OK) { - serverLog(LL_WARNING, "Failed to start slot export to target"); - curr_export->state = SLOT_EXPORT_FAILED; - continue; - } - curr_export->state = SLOT_EXPORT_SNAPSHOTTING; - continue; - case SLOT_EXPORT_SNAPSHOTTING: - /* During this time, we are waiting for SYNCSLOTS PAUSE to - * start flushing the accumulated backlog. */ + switch (curr_export->state) { + case SLOT_EXPORT_QUEUED: + if (hasActiveChildProcess()) { + /* We need to wait for the child to die, then we can + * proceed. */ return; - case SLOT_EXPORT_PAUSE_AND_REPLY: - addReplyArrayLen(curr_export->client, 4); - addReplyBulkCBuffer(curr_export->client, "CLUSTER", 7); - addReplyBulkCBuffer(curr_export->client, "SYNCSLOTS", 9); - addReplyBulkCBuffer(curr_export->client, "PAUSEOFFSET", 11); - addReplyBulkLongLong(curr_export->client, curr_export->syncslot_offset); - - /* Even though we just added replies, it's possible that, due to - * existing pending data, the client is not in the pending write - * queue. We enqueue it explicitly to work around this. */ - putClientInPendingWriteQueue(curr_export->client); - - curr_export->pause_end = mstime() + (CLUSTER_MF_TIMEOUT * CLUSTER_MF_PAUSE_MULT); - pauseActions(PAUSE_DURING_SLOT_MIGRATION, curr_export->pause_end, PAUSE_ACTIONS_CLIENT_WRITE_SET); - - curr_export->state = SLOT_EXPORT_PAUSED; + } + connection **conns = zmalloc(sizeof(connection *)); + *conns = curr_export->client->conn; + serverLog(LL_NOTICE, "Initiating snapshot to conn with fd %d", curr_export->client->conn->fd); + if (saveSnapshotToConnectionSockets(conns, 1, 1, 0, childSnapshotForSyncSlot, curr_export->slot_bitmap) != C_OK) { + serverLog(LL_WARNING, "Failed to start slot export to target"); + curr_export->state = SLOT_EXPORT_FAILED; continue; - case SLOT_EXPORT_PAUSED: - /* While paused, we simply want to check if we should unpause. */ - if (curr_export->pause_end <= mstime()) { - /* Every CLUSTER_MF_TIMEOUT, the source node should - * re-attempt the pause. If we reach this point, it hasn't - * attempted the pause in that time, we can assume it is - * dead and fail the migration.*/ - serverLog(LL_WARNING, "During slot export, unpausing self and cancelling export due to timeout."); - unpauseActions(PAUSE_DURING_SLOT_MIGRATION); - curr_export->state = SLOT_EXPORT_FAILED; - continue; - } - return; - case SLOT_EXPORT_FINISH: - case SLOT_EXPORT_FAILED: - listDelNode(server.cluster->slot_export_jobs, curr_node); - clusterFreeSlotExportJob(curr_export); + } + curr_export->state = SLOT_EXPORT_SNAPSHOTTING; + continue; + case SLOT_EXPORT_SNAPSHOTTING: + /* During this time, we are waiting for SYNCSLOTS PAUSE to + * start flushing the accumulated backlog. 
*/ + return; + case SLOT_EXPORT_PAUSE_AND_REPLY: + addReplyArrayLen(curr_export->client, 4); + addReplyBulkCBuffer(curr_export->client, "CLUSTER", 7); + addReplyBulkCBuffer(curr_export->client, "SYNCSLOTS", 9); + addReplyBulkCBuffer(curr_export->client, "PAUSEOFFSET", 11); + addReplyBulkLongLong(curr_export->client, curr_export->syncslot_offset); + + /* Even though we just added replies, it's possible that, due to + * existing pending data, the client is not in the pending write + * queue. We enqueue it explicitly to work around this. */ + putClientInPendingWriteQueue(curr_export->client); + + curr_export->pause_end = mstime() + (CLUSTER_MF_TIMEOUT * CLUSTER_MF_PAUSE_MULT); + pauseActions(PAUSE_DURING_SLOT_MIGRATION, curr_export->pause_end, PAUSE_ACTIONS_CLIENT_WRITE_SET); + + curr_export->state = SLOT_EXPORT_PAUSED; + continue; + case SLOT_EXPORT_PAUSED: + /* While paused, we simply want to check if we should unpause. */ + if (curr_export->pause_end <= mstime()) { + /* Every CLUSTER_MF_TIMEOUT, the source node should + * re-attempt the pause. If we reach this point, it hasn't + * attempted the pause in that time, we can assume it is + * dead and fail the migration.*/ + serverLog(LL_WARNING, "During slot export, unpausing self and cancelling export due to timeout."); + unpauseActions(PAUSE_DURING_SLOT_MIGRATION); + curr_export->state = SLOT_EXPORT_FAILED; continue; + } + return; + case SLOT_EXPORT_FINISH: + case SLOT_EXPORT_FAILED: + listDelNode(server.cluster->slot_export_jobs, curr_node); + clusterFreeSlotExportJob(curr_export); + continue; } } } @@ -7532,8 +7532,8 @@ int clusterCommandSpecial(client *c) { slotBitmap requested_slots; memset(requested_slots, 0, sizeof(slotBitmap)); int i; - clusterNode * curr_owner = NULL; - for (i = 3; i + 1 < c->argc; i+=2) { + clusterNode *curr_owner = NULL; + for (i = 3; i + 1 < c->argc; i += 2) { int start = getSlotOrReply(c, c->argv[i]); if (start < 0) { return 1; @@ -7571,7 +7571,7 @@ int clusterCommandSpecial(client *c) { } } - slotImport * to_enqueue = clusterCreateSlotImportJob(curr_owner, requested_slots); + slotImport *to_enqueue = clusterCreateSlotImportJob(curr_owner, requested_slots); listAddNodeTail(server.cluster->slot_import_jobs, to_enqueue); clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_SLOTMIGRATION); addReply(c, shared.ok); @@ -7583,7 +7583,7 @@ int clusterCommandSpecial(client *c) { if (!strcasecmp(c->argv[2]->ptr, "start")) { /* CLUSTER SYNCSLOTS START */ if (c->argc != 4) { - addReplyError(c, "CLUSTER SYNCSLOTS START command requires exactly one argument"); + addReplyError(c, "CLUSTER SYNCSLOTS START command requires exactly one argument"); return 1; } if (sdslen(c->argv[3]->ptr) != sizeof(slotBitmap)) { diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index eb9ecc5bb1..00e686997a 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -132,9 +132,9 @@ typedef struct { } clusterMsgDataPublish; typedef struct { - uint64_t configEpoch; /* Config epoch of the specified instance. */ - char nodename[CLUSTER_NAMELEN]; /* Name of the slots owner. */ - slotBitmap slots; /* Slots bitmap. */ + uint64_t configEpoch; /* Config epoch of the specified instance. */ + char nodename[CLUSTER_NAMELEN]; /* Name of the slots owner. */ + slotBitmap slots; /* Slots bitmap. 
*/ } clusterMsgDataUpdate; typedef struct { diff --git a/src/rdb.c b/src/rdb.c index e8d4bfae1a..36ae825670 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -3685,14 +3685,14 @@ int rdbSaveToReplicasSockets(int req, rdbSaveInfo *rsi) { } } - int retval = saveSnapshotToConnectionSockets(conns, connsnum, !dual_channel, req, childSnapshotUsingRDB, (void *) rsi); + int retval = saveSnapshotToConnectionSockets(conns, connsnum, !dual_channel, req, childSnapshotUsingRDB, (void *)rsi); if (retval != C_OK) { serverLog(LL_WARNING, "Can't save in background: fork: %s", strerror(errno)); /* Undo the state change. The caller will perform cleanup on - * all the replicas in BGSAVE_START state, but an early call to - * replicationSetupReplicaForFullResync() turned it into BGSAVE_END */ + * all the replicas in BGSAVE_START state, but an early call to + * replicationSetupReplicaForFullResync() turned it into BGSAVE_END */ listRewind(server.replicas, &li); while ((ln = listNext(&li))) { client *replica = ln->value; diff --git a/src/server.c b/src/server.c index 50b93b2943..8654c89df0 100644 --- a/src/server.c +++ b/src/server.c @@ -4305,8 +4305,7 @@ int processCommand(client *c) { /* If the server is paused, block the client until * the pause has ended. Replicas are never paused. */ - if (!c->flag.replica && !c->flag.slot_migration_target && ((isPausedActions(PAUSE_ACTION_CLIENT_ALL)) || - ((isPausedActions(PAUSE_ACTION_CLIENT_WRITE)) && is_may_replicate_command))) { + if (!c->flag.replica && !c->flag.slot_migration_target && ((isPausedActions(PAUSE_ACTION_CLIENT_ALL)) || ((isPausedActions(PAUSE_ACTION_CLIENT_WRITE)) && is_may_replicate_command))) { blockPostponeClient(c); return C_OK; } diff --git a/src/server.h b/src/server.h index 1344db1de9..6269461ed5 100644 --- a/src/server.h +++ b/src/server.h @@ -182,15 +182,15 @@ struct hdr_histogram; #define RIO_CONNSET_WRITE_MAX_CHUNK_SIZE 16384 /* Instantaneous metrics tracking. */ -#define STATS_METRIC_SAMPLES 16 /* Number of samples per metric. */ -#define STATS_METRIC_COMMAND 0 /* Number of commands executed. */ -#define STATS_METRIC_NET_INPUT 1 /* Bytes read to network. */ -#define STATS_METRIC_NET_OUTPUT 2 /* Bytes written to network. */ -#define STATS_METRIC_NET_INPUT_REPLICATION 3 /* Bytes read to network during replication. */ -#define STATS_METRIC_NET_OUTPUT_REPLICATION 4 /* Bytes written to network during replication. */ -#define STATS_METRIC_EL_CYCLE 5 /* Number of eventloop cycled. */ -#define STATS_METRIC_EL_DURATION 6 /* Eventloop duration. */ -#define STATS_METRIC_NET_INPUT_SLOT_MIGRATION 7 /* Bytes read to network during slot migration. */ +#define STATS_METRIC_SAMPLES 16 /* Number of samples per metric. */ +#define STATS_METRIC_COMMAND 0 /* Number of commands executed. */ +#define STATS_METRIC_NET_INPUT 1 /* Bytes read to network. */ +#define STATS_METRIC_NET_OUTPUT 2 /* Bytes written to network. */ +#define STATS_METRIC_NET_INPUT_REPLICATION 3 /* Bytes read to network during replication. */ +#define STATS_METRIC_NET_OUTPUT_REPLICATION 4 /* Bytes written to network during replication. */ +#define STATS_METRIC_EL_CYCLE 5 /* Number of eventloop cycled. */ +#define STATS_METRIC_EL_DURATION 6 /* Eventloop duration. */ +#define STATS_METRIC_NET_INPUT_SLOT_MIGRATION 7 /* Bytes read to network during slot migration. */ #define STATS_METRIC_NET_OUTPUT_SLOT_MIGRATION 7 /* Bytes written to network during slot migration. 
*/ #define STATS_METRIC_COUNT 8 @@ -380,10 +380,10 @@ typedef enum blocking_type { #define CLIENT_TYPE_PUBSUB 2 /* Clients subscribed to PubSub channels. */ #define CLIENT_TYPE_PRIMARY 3 /* Primary. */ #define CLIENT_TYPE_SLOT_MIGRATION 4 /* Slot migration client. */ -#define CLIENT_TYPE_COUNT 5 /* Total number of client types. */ -#define CLIENT_TYPE_OBUF_COUNT 3 /* Number of clients to expose to output \ - buffer configuration. Just the first \ - three: normal, replica, pubsub. */ +#define CLIENT_TYPE_COUNT 5 /* Total number of client types. */ +#define CLIENT_TYPE_OBUF_COUNT 3 /* Number of clients to expose to output \ + buffer configuration. Just the first \ + three: normal, replica, pubsub. */ /* Replica replication state. Used in server.repl_state for replicas to remember * what to do next. */ @@ -1110,8 +1110,8 @@ typedef struct ClientPubSubData { context of client side caching. */ } ClientPubSubData; -#define CLUSTER_SLOT_MASK_BITS 14 /* Number of bits used for slot id. */ -#define CLUSTER_SLOTS (1 << CLUSTER_SLOT_MASK_BITS) /* Total number of slots in cluster mode, which is 16384. */ +#define CLUSTER_SLOT_MASK_BITS 14 /* Number of bits used for slot id. */ +#define CLUSTER_SLOTS (1 << CLUSTER_SLOT_MASK_BITS) /* Total number of slots in cluster mode, which is 16384. */ typedef unsigned char slotBitmap[CLUSTER_SLOTS / 8]; @@ -1710,8 +1710,8 @@ struct valkeyServer { long long stat_net_repl_input_bytes; /* Bytes read during replication, added to stat_net_input_bytes in 'info'. */ /* Bytes written during replication, added to stat_net_output_bytes in 'info'. */ long long stat_net_repl_output_bytes; - long long stat_net_slot_migration_input_bytes; /* Bytes read during slot migration, added to stat_net_input_bytes in 'info'. */ - long long stat_net_slot_migration_output_bytes; /* Bytes written during slot migration, added to stat_net_output_bytes in 'info'. */ + long long stat_net_slot_migration_input_bytes; /* Bytes read during slot migration, added to stat_net_input_bytes in 'info'. */ + long long stat_net_slot_migration_output_bytes; /* Bytes written during slot migration, added to stat_net_output_bytes in 'info'. */ size_t stat_current_cow_peak; /* Peak size of copy on write bytes. */ size_t stat_current_cow_bytes; /* Copy on write bytes while child is active. */ monotime stat_current_cow_updated; /* Last update time of stat_current_cow_bytes */ @@ -2962,7 +2962,7 @@ void updateLoadingFileName(char *filename); void startSaving(int rdbflags); void stopSaving(int success); int allPersistenceDisabled(void); -typedef int(*ChildSnapshotFunc)(int req, rio *rdb, void *privdata); +typedef int (*ChildSnapshotFunc)(int req, rio *rdb, void *privdata); int saveSnapshotToConnectionSockets(connection **conns, int connsnum, int use_pipe, int req, ChildSnapshotFunc snapshot_func, void *privdata); #define DISK_ERROR_TYPE_AOF 1 /* Don't accept writes: AOF errors. 
*/ From eb4c1a896d8ab4faae71c755ca7426cac49e53fd Mon Sep 17 00:00:00 2001 From: Jacob Murphy Date: Wed, 22 Jan 2025 00:45:36 +0000 Subject: [PATCH 14/18] Fix offset accounting and handle source primary failover Signed-off-by: Jacob Murphy --- src/cluster_legacy.c | 130 +++++++++++++++++++++++++++++++------------ src/cluster_legacy.h | 4 +- src/replication.c | 2 + 3 files changed, 99 insertions(+), 37 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index ae190fc83b..1357fdaf1d 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -2625,6 +2625,19 @@ void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoc } } + /* Handle the case that we are importing the slot via atomic + * slot migration and the ownership changes. */ + slotImport *curr_import = clusterGetCurrentSlotImport(); + if (curr_import != NULL && bitmapTestBit(curr_import->slot_bitmap, j) && curr_import->source_node != sender && curr_import->state != SLOT_IMPORT_FAILED) { + if (areInSameShard(sender, curr_import->source_node)) { + serverLog(LL_WARNING, "Failover occurred during slot migration from %.40s (%s). Cancelling the migration.", curr_import->source_node->name, curr_import->source_node->human_nodename); + } else { + serverLog(LL_WARNING, "Slot %d has been moved to a different shard than that of %.40s (%s). Cancelling the migration.", j, curr_import->source_node->name, curr_import->source_node->human_nodename); + } + curr_import->state = SLOT_IMPORT_FAILED; + clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_SLOTMIGRATION); + } + clusterDelSlot(j); clusterAddSlot(sender, j); bitmapClearBit(server.cluster->owner_not_claiming_slot, j); @@ -4428,7 +4441,6 @@ slotExport *clusterCreateSlotExportJob(client *c, slotBitmap slots) { slotExport *result = (slotExport *)zcalloc(sizeof(slotExport)); memcpy(result->slot_bitmap, slots, sizeof(slotBitmap)); result->state = SLOT_EXPORT_QUEUED; - result->pause_end = -1; result->client = c; return result; } @@ -4488,12 +4500,14 @@ void clusterFeedSlotMigration(int dbid, robj **argv, int argc) { } if (!bitmapTestBit(curr_export->slot_bitmap, slot)) return; - unsigned long long prev_pending = curr_export->client->reply_bytes; addReplyArrayLen(curr_export->client, argc); + /* '*' + argc + '\r\n' */ + curr_export->streamed_repl_offset += 1 + digits10(argc) + 2; for (i = 0; i < argc; i++) { addReplyBulk(curr_export->client, argv[i]); + /* '$' + len(argv[i]) + '\r\n' + argv[i] + '\r\n' */ + curr_export->streamed_repl_offset += 1 + digits10(stringObjectLen(argv[i])) + 2 + stringObjectLen(argv[i]) + 2; } - curr_export->syncslot_offset += curr_export->client->reply_bytes - prev_pending; } int clusterShouldWriteToSlotMigrationTarget(void) { @@ -4524,33 +4538,36 @@ void clusterImportHandler(connection *conn) { void clusterProceedWithSlotImport(void) { char *err; + mstime_t now; while (clusterGetCurrentSlotImport() != NULL) { + now = mstime(); listNode *curr_node = listFirst(server.cluster->slot_import_jobs); slotImport *curr_import = (slotImport *)curr_node->value; - if (curr_import->state != SLOT_IMPORT_QUEUED && curr_import->end_time < mstime()) { - serverLog(LL_WARNING, - "Timed out for slot import from source node %.40s", curr_import->source_node->name); - curr_import->state = SLOT_IMPORT_FAILED; - } - if (curr_import->state > SLOT_IMPORT_PAUSE_OWNER && curr_import->state != SLOT_IMPORT_FAILED && curr_import->pause_end < mstime()) { - /* If the owner ever unpauses, we have to move back in the state machine and retry. 
*/ - serverLog(LL_WARNING, "Reinitiating pause on the node owning the slot range..."); - curr_import->state = SLOT_IMPORT_PAUSE_OWNER; - curr_import->pause_end = mstime() + CLUSTER_MF_TIMEOUT; - } - if (curr_import->state > SLOT_IMPORT_CONNECTING && curr_import->client == NULL) { - serverLog(LL_WARNING, "Client for slot import from source node %.40s has been closed", curr_import->source_node->name); - curr_import->state = SLOT_IMPORT_FAILED; + if (curr_import->state != SLOT_IMPORT_FAILED) { + if (curr_import->end_time && curr_import->end_time < now) { + serverLog(LL_WARNING, + "Timed out for slot import from source node %.40s", curr_import->source_node->name); + curr_import->state = SLOT_IMPORT_FAILED; + } else if (curr_import->state > SLOT_IMPORT_CONNECTING && curr_import->client == NULL) { + serverLog(LL_WARNING, "Client for slot import from source node %.40s (%s) has been closed. Cancelling the migration.", curr_import->source_node->name, curr_import->source_node->human_nodename); + curr_import->state = SLOT_IMPORT_FAILED; + } else if (nodeIsReplica(curr_import->source_node)) { + serverLog(LL_WARNING, "Source node %.40s (%s) has been demoted to a replica. Cancelling the migration.", curr_import->source_node->name, curr_import->source_node->human_nodename); + curr_import->state = SLOT_IMPORT_FAILED; + } else if (curr_import->pause_end && curr_import->pause_end < now) { + /* If the owner ever unpauses, we have to move back in the state machine and retry. */ + serverLog(LL_WARNING, "Reinitiating pause on the node owning the slot range..."); + curr_import->state = SLOT_IMPORT_PAUSE_OWNER; + } } switch (curr_import->state) { case SLOT_IMPORT_QUEUED: - /* Start the migration */ serverLog(LL_NOTICE, "Starting slot import from source node %.40s", curr_import->source_node->name); - curr_import->end_time = mstime() + CLUSTER_SLOT_IMPORT_TIMEOUT; + curr_import->end_time = now + CLUSTER_SLOT_IMPORT_TIMEOUT; curr_import->conn = connCreate(server.tls_replication ? connectionTypeTls() : connectionTypeTcp()); if (connConnect(curr_import->conn, curr_import->source_node->ip, getNodeDefaultReplicationPort(curr_import->source_node), server.bind_source_addr, clusterImportHandler) == C_ERR) { serverLog(LL_WARNING, - "Failed to connect to slot import source node %.40s", curr_import->source_node->name); + "Failed to connect to slot import source node %.40s (%s). Cancelling the migration.", curr_import->source_node->name, curr_import->source_node->human_nodename); curr_import->state = SLOT_IMPORT_FAILED; continue; } @@ -4561,11 +4578,11 @@ void clusterProceedWithSlotImport(void) { /* Nothing to do, waiting for connection to be established. */ return; } else if (curr_import->conn->state != CONN_STATE_CONNECTED) { - serverLog(LL_WARNING, "Failed during connection to slot import source node %.40s: %s", curr_import->source_node->name, connGetLastError(curr_import->conn)); + serverLog(LL_WARNING, "Failed during connection to slot import source node %.40s (%s): %s. 
Cancelling the migration.", curr_import->source_node->name, curr_import->source_node->human_nodename, connGetLastError(curr_import->conn)); curr_import->state = SLOT_IMPORT_FAILED; continue; } - serverLog(LL_NOTICE, "Connected to slot import source node %.40s", curr_import->source_node->name); + serverLog(LL_NOTICE, "Connected to slot import source node %.40s (%s)", curr_import->source_node->name, curr_import->source_node->human_nodename); connSetReadHandler(curr_import->conn, NULL); client *c = createClient(curr_import->conn); curr_import->client = c; @@ -4596,7 +4613,7 @@ void clusterProceedWithSlotImport(void) { argc++; err = sendCommandArgv(curr_import->conn, argc, auth_args, auth_lens); if (err) { - serverLog(LL_WARNING, "Failed to write AUTH to slot migration source: %s", err); + serverLog(LL_WARNING, "Failed to write AUTH to slot migration source %.40s (%s): %s. Cancelling the migration.", curr_import->source_node->name, curr_import->source_node->human_nodename, err); sdsfree(err); curr_import->state = SLOT_IMPORT_FAILED; continue; @@ -4606,26 +4623,31 @@ void clusterProceedWithSlotImport(void) { case SLOT_IMPORT_RECEIVE_AUTH: err = receiveSynchronousResponse(curr_import->conn); if (err == NULL) { - serverLog(LL_WARNING, "Slot migration source did not respond to AUTH command"); + serverLog(LL_WARNING, "Slot migration source %.40s (%s) did not respond to AUTH command. Cancelling the migration.", curr_import->source_node->name, curr_import->source_node->human_nodename); + curr_import->state = SLOT_IMPORT_FAILED; + continue; } if (err[0] == '-') { - serverLog(LL_WARNING, "Unable to AUTH to slot migration source: %s", err); + serverLog(LL_WARNING, "Unable to AUTH to slot migration source %.40s (%s): %s", curr_import->source_node->name, curr_import->source_node->human_nodename, err); sdsfree(err); + curr_import->state = SLOT_IMPORT_FAILED; + continue; } sdsfree(err); err = NULL; + serverLog(LL_NOTICE, "Successfully authenticated to slot migration source %.40s (%s)", curr_import->source_node->name, curr_import->source_node->human_nodename); curr_import->state = SLOT_IMPORT_SEND_SYNCSLOTS; continue; case SLOT_IMPORT_SEND_SYNCSLOTS: /* Ensure we have a clean state for the SYNC. */ delKeysInSlotBitmap(curr_import->slot_bitmap); - serverLog(LL_NOTICE, "Sending CLUSTER SYNCSLOTS START request to source node %.40s", curr_import->source_node->name); + serverLog(LL_NOTICE, "Sending CLUSTER SYNCSLOTS START request to source node %.40s (%s).", curr_import->source_node->name, curr_import->source_node->human_nodename); char *syncslots_args[4] = {"CLUSTER", "SYNCSLOTS", "START", (char *)curr_import->slot_bitmap}; size_t syncslots_lens[4] = {7, 9, 5, sizeof(slotBitmap)}; err = sendCommandArgv(curr_import->conn, 4, syncslots_args, syncslots_lens); if (err) { - serverLog(LL_WARNING, "Failed to write SYNCSLOTS START to slot migration source: %s", err); + serverLog(LL_WARNING, "Failed to write SYNCSLOTS START to slot migration source %.40s (%s): %s", curr_import->source_node->name, curr_import->source_node->human_nodename, err); sdsfree(err); curr_import->state = SLOT_IMPORT_FAILED; continue; @@ -4647,23 +4669,23 @@ void clusterProceedWithSlotImport(void) { addReplyBulkCBuffer(curr_import->client, "PAUSE", 5); curr_import->client->flag.replication_force_reply = 0; - serverLog(LL_NOTICE, "SYNCSLOTS from slot migration source %.40s (%s) has been performed. 
Pausing source node and waiting to continue", curr_import->source_node->name, curr_import->source_node->human_nodename); + serverLog(LL_NOTICE, "SYNCSLOTS from slot migration source %.40s (%s) has been performed, received offset %lld. Pausing source node and waiting to continue", curr_import->source_node->name, curr_import->source_node->human_nodename, curr_import->client->repl_data->reploff); curr_import->paused_at_offset = -1; - curr_import->pause_end = mstime() + CLUSTER_MF_TIMEOUT; + curr_import->pause_end = now + CLUSTER_MF_TIMEOUT; curr_import->state = SLOT_IMPORT_WAITING_FOR_OFFSET; continue; case SLOT_IMPORT_WAITING_FOR_OFFSET: return; case SLOT_IMPORT_SYNCING_TO_OFFSET: if (curr_import->client->repl_data->reploff >= curr_import->paused_at_offset) { - serverLog(LL_NOTICE, "SYNCSLOTS of slot range has caught up to paused slot owner (%lld), slot migration can start.", curr_import->paused_at_offset); + serverLog(LL_NOTICE, "SYNCSLOTS of slot range has caught up to paused slot owner %.40s (%s): my offset %lld, source offset %lld, slot migration can start.", curr_import->source_node->name, curr_import->source_node->human_nodename, curr_import->client->repl_data->reploff, curr_import->paused_at_offset); curr_import->state = SLOT_IMPORT_FINISH; continue; } /* Need to wait for the sync to progress further */ return; case SLOT_IMPORT_FINISH: - serverLog(LL_NOTICE, "Setting myself to owner of migrating slots and broadcasting"); + serverLog(LL_NOTICE, "Setting myself to owner of migrating slots and broadcasting."); for (int i = 0; i < CLUSTER_SLOTS; i++) { if (bitmapTestBit(curr_import->slot_bitmap, i)) { clusterDelSlot(i); @@ -4673,7 +4695,7 @@ void clusterProceedWithSlotImport(void) { clusterUpdateState(); clusterSaveConfigOrDie(1); if (clusterBumpConfigEpochWithoutConsensus() == C_OK) { - serverLog(LL_NOTICE, "ConfigEpoch updated after importing slots"); + serverLog(LL_NOTICE, "Epoch bumped after importing slots. New epoch %ld", server.cluster->currentEpoch); } clusterFreeSlotImportJob(curr_import); clusterBroadcastPong(CLUSTER_BROADCAST_ALL); @@ -4699,7 +4721,9 @@ int childSnapshotForSyncSlot(int req, rio *rdb, void *privdata) { } void clusterProceedWithSlotExport(void) { + mstime_t now; while (clusterGetCurrentSlotExport() != NULL) { + now = mstime(); listNode *curr_node = listFirst(server.cluster->slot_export_jobs); slotExport *curr_export = (slotExport *)curr_node->value; if (curr_export->client == NULL) { @@ -4729,24 +4753,53 @@ void clusterProceedWithSlotExport(void) { return; case SLOT_EXPORT_PAUSE_AND_REPLY: addReplyArrayLen(curr_export->client, 4); + curr_export->streamed_repl_offset += 4; /* '*4\r\n' */ addReplyBulkCBuffer(curr_export->client, "CLUSTER", 7); + curr_export->streamed_repl_offset += 13; /* '$7\r\nCLUSTER\r\n' */ addReplyBulkCBuffer(curr_export->client, "SYNCSLOTS", 9); + curr_export->streamed_repl_offset += 15; /* '$9\r\nSYNCSLOTS\r\n' */ addReplyBulkCBuffer(curr_export->client, "PAUSEOFFSET", 11); - addReplyBulkLongLong(curr_export->client, curr_export->syncslot_offset); + curr_export->streamed_repl_offset += 18; /* '$11\r\nPAUSEOFFSET\r\n' */ + + /* We add the length of the offset reply to the offest itself. 
*/ + uint32_t offset_len = digits10(curr_export->streamed_repl_offset + curr_export->client->repl_data->repldbsize); + uint32_t offset_len_len = digits10(offset_len); + curr_export->streamed_repl_offset += 1 + offset_len_len + 2 + offset_len + 2; + uint32_t offset_len2 = digits10(curr_export->streamed_repl_offset + curr_export->client->repl_data->repldbsize); + if (offset_len2 > offset_len) { + /* Adding the offset will add at most one more digit, since + * its length will be <= 10 (uint32_t max) */ + serverAssert(offset_len2 == offset_len + 1); + curr_export->streamed_repl_offset++; + uint32_t offset_len_len2 = digits10(digits10(curr_export->streamed_repl_offset)); + if (offset_len_len2 > offset_len_len) { + /* If offset_len was really close to another digit, we + * have to handle that too. */ + serverAssert(offset_len_len2 == offset_len_len + 1); + curr_export->streamed_repl_offset++; + } + } + serverLog(LL_NOTICE, "At time of pause, slot migration AOF size: %lu, " + "slot migration streaming offset: %llu, total " + "offset: %llu", + curr_export->client->repl_data->repldbsize, + curr_export->streamed_repl_offset, + curr_export->streamed_repl_offset + curr_export->client->repl_data->repldbsize); + addReplyBulkLongLong(curr_export->client, curr_export->streamed_repl_offset + curr_export->client->repl_data->repldbsize); /* Even though we just added replies, it's possible that, due to * existing pending data, the client is not in the pending write * queue. We enqueue it explicitly to work around this. */ putClientInPendingWriteQueue(curr_export->client); - curr_export->pause_end = mstime() + (CLUSTER_MF_TIMEOUT * CLUSTER_MF_PAUSE_MULT); + curr_export->pause_end = now + (CLUSTER_MF_TIMEOUT * CLUSTER_MF_PAUSE_MULT); pauseActions(PAUSE_DURING_SLOT_MIGRATION, curr_export->pause_end, PAUSE_ACTIONS_CLIENT_WRITE_SET); curr_export->state = SLOT_EXPORT_PAUSED; continue; case SLOT_EXPORT_PAUSED: /* While paused, we simply want to check if we should unpause. */ - if (curr_export->pause_end <= mstime()) { + if (curr_export->pause_end <= now) { /* Every CLUSTER_MF_TIMEOUT, the source node should * re-attempt the pause. If we reach this point, it hasn't * attempted the pause in that time, we can assume it is * dead and fail the migration.*/ serverLog(LL_WARNING, "During slot export, unpausing self and cancelling export due to timeout."); unpauseActions(PAUSE_DURING_SLOT_MIGRATION); curr_export->state = SLOT_EXPORT_FAILED; continue; } return; case SLOT_EXPORT_FINISH: case SLOT_EXPORT_FAILED: listDelNode(server.cluster->slot_export_jobs, curr_node); clusterFreeSlotExportJob(curr_export); continue; } } } @@ -7646,6 +7699,12 @@ int clusterCommandSpecial(client *c) { addReplyError(c, "CLUSTER SYNCSLOTS PAUSEOFFSET command requires exactly one argument."); return 1; } + if (c->flag.primary) { + /* Due to the proxying nature of replication from the source + * node through the target node to the target node's replicas, + * this message should simply be ignored. */ + return 1; + } slotImport *slot_import = clusterGetCurrentSlotImport(); if (!slot_import || slot_import->state != SLOT_IMPORT_WAITING_FOR_OFFSET) { addReplyError(c, "No CLUSTER SYNCSLOTS is waiting for a PAUSEOFFSET response."); @@ -7656,6 +7715,7 @@ int clusterCommandSpecial(client *c) { addReplyError(c, "Failed to parse PAUSEOFFSET offset."); return 1; } + serverLog(LL_NOTICE, "Received paused offset for slot migration from %.40s (%s). 
My offset: %lld, source offset: %lld", slot_import->source_node->name, slot_import->source_node->human_nodename, slot_import->client->repl_data->reploff, offset); slot_import->paused_at_offset = offset; slot_import->state = SLOT_IMPORT_SYNCING_TO_OFFSET; clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_SLOTMIGRATION); diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index 00e686997a..471dbdb950 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -403,8 +403,8 @@ typedef enum slotExportState { typedef struct slotExport { slotBitmap slot_bitmap; slotExportState state; - client *client; /* Client for replication */ - unsigned long long syncslot_offset; + client *client; /* Client for replication */ + unsigned long long streamed_repl_offset; /* Offset for just the streamed part of the syncslots command.*/ mstime_t pause_end; } slotExport; diff --git a/src/replication.c b/src/replication.c index bcb9e0a756..c13a0edb3f 100644 --- a/src/replication.c +++ b/src/replication.c @@ -1700,6 +1700,7 @@ void rdbPipeWriteHandler(struct connection *conn) { } else { replica->repl_data->repldboff += nwritten; server.stat_net_repl_output_bytes += nwritten; + replica->repl_data->repldbsize += nwritten; if (replica->repl_data->repldboff < server.rdb_pipe_bufflen) { replica->repl_data->repl_last_partial_write = server.unixtime; return; /* more data to write.. */ @@ -1774,6 +1775,7 @@ void rdbPipeReadHandler(struct aeEventLoop *eventLoop, int fd, void *clientData, * of 'rdb_pipe_buff' sent rather than the offset of entire RDB. */ replica->repl_data->repldboff = nwritten; server.stat_net_repl_output_bytes += nwritten; + replica->repl_data->repldbsize += nwritten; } /* If we were unable to write all the data to one of the replicas, * setup write handler (and disable pipe read handler, below) */ From 4506fbe87e3f2023d23fbec0be8a041f06457e9d Mon Sep 17 00:00:00 2001 From: Jacob Murphy Date: Wed, 22 Jan 2025 00:55:50 +0000 Subject: [PATCH 15/18] Typo + mac build fix Signed-off-by: Jacob Murphy --- src/cluster_legacy.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 1357fdaf1d..024394659f 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -4761,7 +4761,7 @@ void clusterProceedWithSlotExport(void) { addReplyBulkCBuffer(curr_export->client, "PAUSEOFFSET", 11); curr_export->streamed_repl_offset += 18; /* '$11\r\nPAUSEOFFSET\r\n' */ - /* We add the length of the offset reply to the offest itself. */ + /* We add the length of the offset reply to the offset itself. 
*/ uint32_t offset_len = digits10(curr_export->streamed_repl_offset + curr_export->client->repl_data->repldbsize); uint32_t offset_len_len = digits10(offset_len); curr_export->streamed_repl_offset += 1 + offset_len_len + 2 + offset_len + 2; @@ -4779,10 +4779,8 @@ void clusterProceedWithSlotExport(void) { curr_export->streamed_repl_offset++; } } - serverLog(LL_NOTICE, "At time of pause, slot migration AOF size: %lu, " - "slot migration streaming offset: %llu, total " - "offset: %llu", - curr_export->client->repl_data->repldbsize, + serverLog(LL_NOTICE, "At time of pause slot migration streaming offset: %llu, total " + "offset (with AOF snapshot): %llu", curr_export->streamed_repl_offset, curr_export->streamed_repl_offset + curr_export->client->repl_data->repldbsize); addReplyBulkLongLong(curr_export->client, curr_export->streamed_repl_offset + curr_export->client->repl_data->repldbsize); From 3eec13c4e08e26f9cd76834c912a59cf80051c9a Mon Sep 17 00:00:00 2001 From: Jacob Murphy Date: Wed, 22 Jan 2025 00:57:41 +0000 Subject: [PATCH 16/18] Log line fix Signed-off-by: Jacob Murphy --- src/cluster_legacy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 024394659f..04a1359649 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -4695,7 +4695,7 @@ void clusterProceedWithSlotImport(void) { clusterUpdateState(); clusterSaveConfigOrDie(1); if (clusterBumpConfigEpochWithoutConsensus() == C_OK) { - serverLog(LL_NOTICE, "Epoch bumped after importing slots. New epoch %ld", server.cluster->currentEpoch); + serverLog(LL_NOTICE, "Epoch bumped after importing slots. New epoch %llu", server.cluster->currentEpoch); } clusterFreeSlotImportJob(curr_import); clusterBroadcastPong(CLUSTER_BROADCAST_ALL); From 9021fffae3a526f562be1de0f481269340b788b9 Mon Sep 17 00:00:00 2001 From: Jacob Murphy Date: Wed, 22 Jan 2025 01:02:38 +0000 Subject: [PATCH 17/18] Another log line fix Signed-off-by: Jacob Murphy --- src/cluster_legacy.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 04a1359649..45581a2280 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -4695,7 +4695,7 @@ void clusterProceedWithSlotImport(void) { clusterUpdateState(); clusterSaveConfigOrDie(1); if (clusterBumpConfigEpochWithoutConsensus() == C_OK) { - serverLog(LL_NOTICE, "Epoch bumped after importing slots. New epoch %llu", server.cluster->currentEpoch); + serverLog(LL_NOTICE, "Epoch bumped after importing slots. 
New epoch %llu", (unsigned long long) server.cluster->currentEpoch); } clusterFreeSlotImportJob(curr_import); clusterBroadcastPong(CLUSTER_BROADCAST_ALL); @@ -4779,8 +4779,10 @@ void clusterProceedWithSlotExport(void) { curr_export->streamed_repl_offset++; } } - serverLog(LL_NOTICE, "At time of pause slot migration streaming offset: %llu, total " + serverLog(LL_NOTICE, "At time of pause slot migration AOF snapshot size: %llu, " + "slot migration streaming offset: %llu, total " "offset (with AOF snapshot): %llu", + (unsigned long long) curr_export->client->repl_data->repldbsize, curr_export->streamed_repl_offset, curr_export->streamed_repl_offset + curr_export->client->repl_data->repldbsize); addReplyBulkLongLong(curr_export->client, curr_export->streamed_repl_offset + curr_export->client->repl_data->repldbsize); From f1d824f57205ea2f1dce8a1061c871c35d64b5f7 Mon Sep 17 00:00:00 2001 From: Jacob Murphy Date: Wed, 22 Jan 2025 01:05:59 +0000 Subject: [PATCH 18/18] Apply clang format Signed-off-by: Jacob Murphy --- src/cluster_legacy.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 45581a2280..04a4297b02 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -4695,7 +4695,7 @@ void clusterProceedWithSlotImport(void) { clusterUpdateState(); clusterSaveConfigOrDie(1); if (clusterBumpConfigEpochWithoutConsensus() == C_OK) { - serverLog(LL_NOTICE, "Epoch bumped after importing slots. New epoch %llu", (unsigned long long) server.cluster->currentEpoch); + serverLog(LL_NOTICE, "Epoch bumped after importing slots. New epoch %llu", (unsigned long long)server.cluster->currentEpoch); } clusterFreeSlotImportJob(curr_import); clusterBroadcastPong(CLUSTER_BROADCAST_ALL); @@ -4782,7 +4782,7 @@ void clusterProceedWithSlotExport(void) { serverLog(LL_NOTICE, "At time of pause slot migration AOF snapshot size: %llu, " "slot migration streaming offset: %llu, total " "offset (with AOF snapshot): %llu", - (unsigned long long) curr_export->client->repl_data->repldbsize, + (unsigned long long)curr_export->client->repl_data->repldbsize, curr_export->streamed_repl_offset, curr_export->streamed_repl_offset + curr_export->client->repl_data->repldbsize); addReplyBulkLongLong(curr_export->client, curr_export->streamed_repl_offset + curr_export->client->repl_data->repldbsize);
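
The PAUSEOFFSET bookkeeping in clusterProceedWithSlotExport() is a small fixed-point problem: the offset advertised to the import target must include the RESP bytes of the very reply that advertises it, and those bytes depend on the digit count of the advertised value. The standalone C sketch below illustrates the same calculation under stated assumptions: digits10() is re-implemented locally, the input values are made up, and it iterates to the fixed point instead of using the patch's closed-form digit adjustment.

#include <stdint.h>
#include <stdio.h>

/* Decimal digit count of v (same contract as the server's digits10()). */
static uint32_t digits10(uint64_t v) {
    uint32_t n = 1;
    while (v >= 10) { v /= 10; n++; }
    return n;
}

/* Encoded size of a RESP bulk string carrying `len` payload bytes:
 * '$' <len> CRLF <payload> CRLF. */
static uint64_t bulk_encoded_len(uint64_t len) {
    return 1 + digits10(len) + 2 + len + 2;
}

int main(void) {
    /* Made-up example inputs: bytes streamed after the snapshot so far,
     * and the size of the AOF-format snapshot itself. */
    uint64_t streamed = 9995;
    uint64_t snapshot = 123456;

    /* Fixed part of the reply: "*4\r\n" plus the CLUSTER, SYNCSLOTS and
     * PAUSEOFFSET bulks (4 + 13 + 15 + 18 bytes). */
    streamed += 4 + 13 + 15 + 18;

    /* The final bulk carries the decimal text of the total offset, and the
     * total offset in turn includes that bulk, so iterate to a fixed point.
     * The bulk size is bounded (the digit count of a 64-bit value is at most
     * 20), so this converges after one or two rounds. */
    uint64_t reply_bulk = 0;
    for (;;) {
        uint64_t total = streamed + reply_bulk + snapshot;
        uint64_t need = bulk_encoded_len(digits10(total));
        if (need == reply_bulk) break;
        reply_bulk = need;
    }

    uint64_t advertised = streamed + reply_bulk + snapshot;
    printf("advertise PAUSEOFFSET %llu (offset bulk itself is %llu bytes)\n",
           (unsigned long long)advertised, (unsigned long long)reply_bulk);
    return 0;
}

The closed form used in the patch reaches the same advertised total by bounding how much one extra reply can grow the digit count; the iterative form above just makes it easier to see why the accounting is self-consistent and terminates.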