From c6f89d2e0c3e1135bdca9360a191e2a31fc8f3fc Mon Sep 17 00:00:00 2001 From: Xuezhao Liu Date: Sun, 12 Jan 2025 11:09:24 +0000 Subject: [PATCH] DAOS-14010 rebuild: maintain global stable epoch through IV Maintain global stable epoch through IV, it is the globally minimal stable epoch of all engines' all targets for the container. Required-githooks: true Signed-off-by: Xuezhao Liu --- src/container/container_iv.c | 89 ++++---- src/container/srv_container.c | 338 ++++++++++++++++++------------- src/container/srv_internal.h | 53 ++--- src/container/srv_target.c | 310 ++++++++++++++++++++-------- src/include/daos_srv/container.h | 22 +- src/include/daos_srv/iv.h | 10 +- src/include/daos_srv/pool.h | 3 +- src/object/srv_ec_aggregate.c | 53 +---- src/pool/srv_pool.c | 18 +- src/pool/srv_target.c | 14 +- src/vos/vos_container.c | 11 +- 11 files changed, 555 insertions(+), 366 deletions(-) diff --git a/src/container/container_iv.c b/src/container/container_iv.c index 08f3445529a..03a3375ed21 100644 --- a/src/container/container_iv.c +++ b/src/container/container_iv.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2019-2023 Intel Corporation. + * (C) Copyright 2019-2025 Intel Corporation. 
* * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -191,8 +191,9 @@ cont_iv_ent_copy(struct ds_iv_entry *entry, struct cont_iv_key *key, cip_acl.dal_ace[src->iv_prop.cip_acl.dal_len]); memcpy(&dst->iv_prop, &src->iv_prop, size); break; - case IV_CONT_AGG_EPOCH_BOUNDRY: - dst->iv_agg_eph.eph = src->iv_agg_eph.eph; + case IV_CONT_TRACK_EPOCH: + dst->iv_track_eph.ite_ec_agg_eph = src->iv_track_eph.ite_ec_agg_eph; + dst->iv_track_eph.ite_stable_eph = src->iv_track_eph.ite_stable_eph; break; default: rc = -DER_INVAL; @@ -551,9 +552,9 @@ cont_iv_ent_fetch(struct ds_iv_entry *entry, struct ds_iv_key *key, return rc; } -/* Update the EC agg epoch all servers to the leader */ +/* Update the track epoch all servers to the leader */ static int -cont_iv_ent_agg_eph_update(struct ds_iv_entry *entry, struct ds_iv_key *key, +cont_iv_ent_track_eph_update(struct ds_iv_entry *entry, struct ds_iv_key *key, d_sg_list_t *src) { struct cont_iv_key *civ_key = key2priv(key); @@ -568,25 +569,25 @@ cont_iv_ent_agg_eph_update(struct ds_iv_entry *entry, struct ds_iv_key *key, if (rank != entry->ns->iv_master_rank) return -DER_IVCB_FORWARD; - rc = ds_cont_leader_update_agg_eph(entry->ns->iv_pool_uuid, - civ_key->cont_uuid, - civ_ent->iv_agg_eph.rank, - civ_ent->iv_agg_eph.eph); + rc = ds_cont_leader_update_track_eph(entry->ns->iv_pool_uuid, civ_key->cont_uuid, + civ_ent->iv_track_eph.ite_rank, + civ_ent->iv_track_eph.ite_ec_agg_eph, + civ_ent->iv_track_eph.ite_stable_eph); return rc; } -/* Each server refresh the VOS aggregation epoch gotten from the leader */ +/* Each server refresh the track epoch gotten from the leader */ static int -cont_iv_ent_agg_eph_refresh(struct ds_iv_entry *entry, struct ds_iv_key *key, +cont_iv_ent_track_eph_refresh(struct ds_iv_entry *entry, struct ds_iv_key *key, d_sg_list_t *src) { struct cont_iv_entry *civ_ent = src->sg_iovs[0].iov_buf; struct cont_iv_key *civ_key = key2priv(key); int rc; - rc = ds_cont_tgt_refresh_agg_eph(entry->ns->iv_pool_uuid, - 
civ_key->cont_uuid, - civ_ent->iv_agg_eph.eph); + rc = ds_cont_tgt_refresh_track_eph(entry->ns->iv_pool_uuid, civ_key->cont_uuid, + civ_ent->iv_track_eph.ite_ec_agg_eph, + civ_ent->iv_track_eph.ite_stable_eph); return rc; } @@ -638,13 +639,12 @@ cont_iv_ent_update(struct ds_iv_entry *entry, struct ds_iv_key *key, if (rc) D_GOTO(out, rc); } else if (entry->iv_class->iv_class_id == - IV_CONT_AGG_EPOCH_REPORT) { - rc = cont_iv_ent_agg_eph_update(entry, key, src); + IV_CONT_TRACK_EPOCH_REPORT) { + rc = cont_iv_ent_track_eph_update(entry, key, src); if (rc) D_GOTO(out, rc); - } else if (entry->iv_class->iv_class_id == - IV_CONT_AGG_EPOCH_BOUNDRY) { - rc = cont_iv_ent_agg_eph_refresh(entry, key, src); + } else if (entry->iv_class->iv_class_id == IV_CONT_TRACK_EPOCH) { + rc = cont_iv_ent_track_eph_refresh(entry, key, src); if (rc) D_GOTO(out, rc); } @@ -1068,20 +1068,20 @@ cont_iv_hdl_fetch(uuid_t cont_hdl_uuid, uuid_t pool_uuid, return rc; } -int -cont_iv_ec_agg_eph_update_internal(void *ns, uuid_t cont_uuid, - daos_epoch_t eph, unsigned int shortcut, - unsigned int sync_mode, - uint32_t op) +static int +cont_iv_track_eph_update_internal(void *ns, uuid_t cont_uuid, daos_epoch_t ec_agg_eph, + daos_epoch_t stable_eph, unsigned int shortcut, + unsigned int sync_mode, uint32_t op) { struct cont_iv_entry iv_entry = { 0 }; int rc; /* Only happens on xstream 0 */ D_ASSERT(dss_get_module_info()->dmi_xs_id == 0); - iv_entry.iv_agg_eph.eph = eph; + iv_entry.iv_track_eph.ite_ec_agg_eph = ec_agg_eph; + iv_entry.iv_track_eph.ite_stable_eph = stable_eph; uuid_copy(iv_entry.cont_uuid, cont_uuid); - rc = crt_group_rank(NULL, &iv_entry.iv_agg_eph.rank); + rc = crt_group_rank(NULL, &iv_entry.iv_track_eph.ite_rank); if (rc) { D_ERROR(DF_UUID" op %d, crt_group_rank failed "DF_RC"\n", DP_UUID(cont_uuid), op, DP_RC(rc)); @@ -1097,20 +1097,22 @@ cont_iv_ec_agg_eph_update_internal(void *ns, uuid_t cont_uuid, } int -cont_iv_ec_agg_eph_update(void *ns, uuid_t cont_uuid, daos_epoch_t eph) 
+cont_iv_track_eph_update(void *ns, uuid_t cont_uuid, daos_epoch_t ec_agg_eph, + daos_epoch_t stable_eph) { - return cont_iv_ec_agg_eph_update_internal(ns, cont_uuid, eph, - CRT_IV_SHORTCUT_TO_ROOT, - CRT_IV_SYNC_NONE, - IV_CONT_AGG_EPOCH_REPORT); + return cont_iv_track_eph_update_internal(ns, cont_uuid, ec_agg_eph, stable_eph, + CRT_IV_SHORTCUT_TO_ROOT, + CRT_IV_SYNC_NONE, + IV_CONT_TRACK_EPOCH_REPORT); } int -cont_iv_ec_agg_eph_refresh(void *ns, uuid_t cont_uuid, daos_epoch_t eph) +cont_iv_track_eph_refresh(void *ns, uuid_t cont_uuid, daos_epoch_t ec_agg_eph, + daos_epoch_t stable_eph) { - return cont_iv_ec_agg_eph_update_internal(ns, cont_uuid, eph, - 0, CRT_IV_SYNC_LAZY, - IV_CONT_AGG_EPOCH_BOUNDRY); + return cont_iv_track_eph_update_internal(ns, cont_uuid, ec_agg_eph, stable_eph, + 0, CRT_IV_SYNC_LAZY, + IV_CONT_TRACK_EPOCH); } int @@ -1123,14 +1125,14 @@ ds_cont_fetch_ec_agg_boundary(void *ns, uuid_t cont_uuid) /* Only happens on xstream 0 */ D_ASSERT(dss_get_module_info()->dmi_xs_id == 0); uuid_copy(iv_entry.cont_uuid, cont_uuid); - rc = crt_group_rank(NULL, &iv_entry.iv_agg_eph.rank); + rc = crt_group_rank(NULL, &iv_entry.iv_track_eph.ite_rank); if (rc) { D_ERROR(DF_UUID" crt_group_rank failed "DF_RC"\n", DP_UUID(cont_uuid), DP_RC(rc)); return rc; } - rc = cont_iv_fetch(ns, IV_CONT_AGG_EPOCH_BOUNDRY, cont_uuid, &iv_entry, + rc = cont_iv_fetch(ns, IV_CONT_TRACK_EPOCH, cont_uuid, &iv_entry, sizeof(struct cont_iv_entry), sizeof(struct cont_iv_entry), true); if (rc) @@ -1200,11 +1202,11 @@ cont_iv_entry_delete(void *ns, uuid_t pool_uuid, uuid_t cont_uuid) if (rc != 0) D_DEBUG(DB_MD, "delete prop "DF_UUID"\n", DP_UUID(cont_uuid)); - rc = cont_iv_invalidate(ns, IV_CONT_AGG_EPOCH_REPORT, cont_uuid, CRT_IV_SYNC_NONE); + rc = cont_iv_invalidate(ns, IV_CONT_TRACK_EPOCH_REPORT, cont_uuid, CRT_IV_SYNC_NONE); if (rc != 0) D_DEBUG(DB_MD, "delete agg epoch report "DF_UUID"\n", DP_UUID(cont_uuid)); - rc = cont_iv_invalidate(ns, IV_CONT_AGG_EPOCH_BOUNDRY, cont_uuid, 
CRT_IV_SYNC_NONE); + rc = cont_iv_invalidate(ns, IV_CONT_TRACK_EPOCH, cont_uuid, CRT_IV_SYNC_NONE); if (rc != 0) D_DEBUG(DB_MD, "delete agg epoch boundary "DF_UUID"\n", DP_UUID(cont_uuid)); @@ -1665,8 +1667,8 @@ ds_cont_iv_fini(void) ds_iv_class_unregister(IV_CONT_SNAP); ds_iv_class_unregister(IV_CONT_CAPA); ds_iv_class_unregister(IV_CONT_PROP); - ds_iv_class_unregister(IV_CONT_AGG_EPOCH_REPORT); - ds_iv_class_unregister(IV_CONT_AGG_EPOCH_BOUNDRY); + ds_iv_class_unregister(IV_CONT_TRACK_EPOCH_REPORT); + ds_iv_class_unregister(IV_CONT_TRACK_EPOCH); return 0; } @@ -1687,13 +1689,12 @@ ds_cont_iv_init(void) if (rc) D_GOTO(out, rc); - rc = ds_iv_class_register(IV_CONT_AGG_EPOCH_REPORT, &iv_cache_ops, + rc = ds_iv_class_register(IV_CONT_TRACK_EPOCH_REPORT, &iv_cache_ops, &cont_iv_ops); if (rc) D_GOTO(out, rc); - rc = ds_iv_class_register(IV_CONT_AGG_EPOCH_BOUNDRY, &iv_cache_ops, - &cont_iv_ops); + rc = ds_iv_class_register(IV_CONT_TRACK_EPOCH, &iv_cache_ops, &cont_iv_ops); if (rc) D_GOTO(out, rc); out: diff --git a/src/container/srv_container.c b/src/container/srv_container.c index 91d87d9b978..765d2813671 100644 --- a/src/container/srv_container.c +++ b/src/container/srv_container.c @@ -1,5 +1,6 @@ /* - * (C) Copyright 2016-2024 Intel Corporation. + * (C) Copyright 2016-2025 Intel Corporation. 
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -21,6 +22,7 @@ #include #include #include +#include #include #include "rpc.h" #include "srv_internal.h" @@ -185,8 +187,8 @@ ds_cont_svc_fini(struct cont_svc **svcp) *svcp = NULL; } -static int cont_svc_ec_agg_leader_start(struct cont_svc *svc); -static void cont_svc_ec_agg_leader_stop(struct cont_svc *svc); +static int cont_svc_eph_track_leader_start(struct cont_svc *svc); +static void cont_svc_eph_track_leader_stop(struct cont_svc *svc); int ds_cont_svc_step_up(struct cont_svc *svc) @@ -202,7 +204,7 @@ ds_cont_svc_step_up(struct cont_svc *svc) } D_ASSERT(svc->cs_pool != NULL); - rc = cont_svc_ec_agg_leader_start(svc); + rc = cont_svc_eph_track_leader_start(svc); if (rc != 0) D_ERROR(DF_UUID": start ec agg leader failed: "DF_RC"\n", DP_UUID(svc->cs_pool_uuid), DP_RC(rc)); @@ -213,7 +215,7 @@ ds_cont_svc_step_up(struct cont_svc *svc) void ds_cont_svc_step_down(struct cont_svc *svc) { - cont_svc_ec_agg_leader_stop(svc); + cont_svc_eph_track_leader_stop(svc); D_ASSERT(svc->cs_pool != NULL); ds_pool_put(svc->cs_pool); svc->cs_pool = NULL; @@ -1518,7 +1520,7 @@ evict_hdls(struct rdb_tx *tx, struct cont *cont, bool force, struct ds_pool_hdl } static void -cont_ec_agg_delete(struct cont_svc *svc, uuid_t cont_uuid); +cont_track_eph_leader_delete(struct cont_svc *svc, uuid_t cont_uuid); static int cont_destroy(struct rdb_tx *tx, struct ds_pool_hdl *pool_hdl, struct cont *cont, crt_rpc_t *rpc, @@ -1571,7 +1573,7 @@ cont_destroy(struct rdb_tx *tx, struct ds_pool_hdl *pool_hdl, struct cont *cont, if (rc != 0) goto out_prop; - cont_ec_agg_delete(cont->c_svc, cont->c_uuid); + cont_track_eph_leader_delete(cont->c_svc, cont->c_uuid); if (pool_hdl->sph_global_ver >= DAOS_POOL_GLOBAL_VERSION_WITH_OIT_OID_KVS) { need_destroy_oid_oit_kvs = true; @@ -1647,32 +1649,32 @@ cont_destroy(struct rdb_tx *tx, struct ds_pool_hdl *pool_hdl, struct cont *cont, return rc; } 
-struct cont_ec_agg * -cont_ec_agg_lookup(struct cont_svc *cont_svc, uuid_t cont_uuid) +struct cont_track_eph_leader * +cont_track_eph_leader_lookup(struct cont_svc *cont_svc, uuid_t cont_uuid) { - struct cont_ec_agg *ec_agg; + struct cont_track_eph_leader *eph_ldr; - d_list_for_each_entry(ec_agg, &cont_svc->cs_ec_agg_list, ea_list) { - if (ec_agg->ea_deleted) + d_list_for_each_entry(eph_ldr, &cont_svc->cs_cont_ephs_leader_list, ea_list) { + if (eph_ldr->ea_deleted) continue; - if (uuid_compare(ec_agg->ea_cont_uuid, cont_uuid) == 0) - return ec_agg; + if (uuid_compare(eph_ldr->ea_cont_uuid, cont_uuid) == 0) + return eph_ldr; } return NULL; } static int -cont_ec_agg_alloc(struct cont_svc *cont_svc, uuid_t cont_uuid, - struct cont_ec_agg **ec_aggp) +cont_track_eph_leader_alloc(struct cont_svc *cont_svc, uuid_t cont_uuid, + struct cont_track_eph_leader **leader_p) { - struct cont_ec_agg *ec_agg = NULL; - struct pool_domain *doms; - int rank_nr; - int rc = 0; + struct cont_track_eph_leader *eph_ldr = NULL; + struct pool_domain *doms; + int rank_nr; + int rc = 0; int i; - D_ALLOC_PTR(ec_agg); - if (ec_agg == NULL) + D_ALLOC_PTR(eph_ldr); + if (eph_ldr == NULL) return -DER_NOMEM; D_ASSERT(cont_svc->cs_pool->sp_map != NULL); @@ -1680,42 +1682,43 @@ cont_ec_agg_alloc(struct cont_svc *cont_svc, uuid_t cont_uuid, if (rank_nr < 0) D_GOTO(out, rc = rank_nr); - D_ALLOC_ARRAY(ec_agg->ea_server_ephs, rank_nr); - if (ec_agg->ea_server_ephs == NULL) + D_ALLOC_ARRAY(eph_ldr->ea_server_ephs, rank_nr); + if (eph_ldr->ea_server_ephs == NULL) D_GOTO(out, rc = -DER_NOMEM); - uuid_copy(ec_agg->ea_cont_uuid, cont_uuid); - ec_agg->ea_servers_num = rank_nr; - ec_agg->ea_current_eph = 0; + uuid_copy(eph_ldr->ea_cont_uuid, cont_uuid); + eph_ldr->ea_servers_num = rank_nr; + eph_ldr->ea_current_ec_agg_eph = 0; for (i = 0; i < rank_nr; i++) { - ec_agg->ea_server_ephs[i].rank = doms[i].do_comp.co_rank; - ec_agg->ea_server_ephs[i].eph = 0; + eph_ldr->ea_server_ephs[i].re_rank = 
doms[i].do_comp.co_rank; + eph_ldr->ea_server_ephs[i].re_ec_agg_eph = 0; + eph_ldr->ea_server_ephs[i].re_stable_eph = 0; } - d_list_add(&ec_agg->ea_list, &cont_svc->cs_ec_agg_list); - *ec_aggp = ec_agg; + d_list_add(&eph_ldr->ea_list, &cont_svc->cs_cont_ephs_leader_list); + *leader_p = eph_ldr; out: if (rc) { - if (ec_agg) - D_FREE(ec_agg->ea_server_ephs); - D_FREE(ec_agg); + if (eph_ldr) + D_FREE(eph_ldr->ea_server_ephs); + D_FREE(eph_ldr); } return rc; } static void -cont_ec_agg_delete(struct cont_svc *svc, uuid_t cont_uuid) +cont_track_eph_leader_delete(struct cont_svc *svc, uuid_t cont_uuid) { - struct cont_ec_agg *ec_agg; + struct cont_track_eph_leader *eph_ldr; - ec_agg = cont_ec_agg_lookup(svc, cont_uuid); - if (ec_agg == NULL) + eph_ldr = cont_track_eph_leader_lookup(svc, cont_uuid); + if (eph_ldr == NULL) return; - /* Set ea_deleted flag to destroy it inside cont_agg_eph_leader_ult() + /* Set ea_deleted flag to destroy it inside cont_track_eph_leader_ult() * to avoid list iteration broken. */ - ec_agg->ea_deleted = 1; + eph_ldr->ea_deleted = 1; } /** @@ -1723,52 +1726,54 @@ cont_ec_agg_delete(struct cont_svc *svc, uuid_t cont_uuid) * will be called by IV update on the leader. 
*/ int -ds_cont_leader_update_agg_eph(uuid_t pool_uuid, uuid_t cont_uuid, - d_rank_t rank, daos_epoch_t eph) +ds_cont_leader_update_track_eph(uuid_t pool_uuid, uuid_t cont_uuid, d_rank_t rank, + daos_epoch_t ec_agg_eph, daos_epoch_t stable_eph) { - struct cont_svc *svc; - struct cont_ec_agg *ec_agg; - int rc; - bool retried = false; - int i; + struct cont_svc *svc; + struct cont_track_eph_leader *eph_ldr; + int rc; + bool retried = false; + int i; - rc = cont_svc_lookup_leader(pool_uuid, 0 /* id */, &svc, - NULL /* hint */); + rc = cont_svc_lookup_leader(pool_uuid, 0 /* id */, &svc, NULL /* hint */); if (rc != 0) return rc; retry: - ec_agg = cont_ec_agg_lookup(svc, cont_uuid); - if (ec_agg == NULL) { - rc = cont_ec_agg_alloc(svc, cont_uuid, &ec_agg); + eph_ldr = cont_track_eph_leader_lookup(svc, cont_uuid); + if (eph_ldr == NULL) { + rc = cont_track_eph_leader_alloc(svc, cont_uuid, &eph_ldr); if (rc) D_GOTO(out_put, rc); } - for (i = 0; i < ec_agg->ea_servers_num; i++) { - if (ec_agg->ea_server_ephs[i].rank == rank) { - if (ec_agg->ea_server_ephs[i].eph < eph) - ec_agg->ea_server_ephs[i].eph = eph; + for (i = 0; i < eph_ldr->ea_servers_num; i++) { + if (eph_ldr->ea_server_ephs[i].re_rank == rank) { + if (eph_ldr->ea_server_ephs[i].re_ec_agg_eph < ec_agg_eph) + eph_ldr->ea_server_ephs[i].re_ec_agg_eph = ec_agg_eph; + if (eph_ldr->ea_server_ephs[i].re_stable_eph < stable_eph) + eph_ldr->ea_server_ephs[i].re_stable_eph = stable_eph; break; } } - if (i == ec_agg->ea_servers_num) { + if (i == eph_ldr->ea_servers_num) { if (!retried) { - D_DEBUG(DB_MD, "rank %u eph "DF_X64" retry for" - DF_CONT"\n", rank, eph, + D_DEBUG(DB_MD, "rank %u ec_agg_eph "DF_X64", stable_eph "DF_X64 + " retry for"DF_CONT"\n", rank, ec_agg_eph, stable_eph, DP_CONT(pool_uuid, cont_uuid)); retried = true; - ec_agg->ea_deleted = 1; + eph_ldr->ea_deleted = 1; goto retry; } else { - D_WARN("rank %u eph "DF_X64" does not exist for " - DF_CONT"\n", rank, eph, + D_WARN("rank %u ec_agg_eph "DF_X64", 
stable_eph "DF_X64 + " does not exist for "DF_CONT"\n", rank, ec_agg_eph, stable_eph, DP_CONT(pool_uuid, cont_uuid)); } } else { - D_DEBUG(DB_MD, DF_CONT" update eph rank %u eph "DF_X64"\n", - DP_CONT(pool_uuid, cont_uuid), rank, eph); + D_DEBUG(DB_MD, DF_CONT" update eph rank %u ec_agg_eph "DF_X64 + ", stable_eph "DF_X64".\n", DP_CONT(pool_uuid, cont_uuid), + rank, ec_agg_eph, stable_eph); } out_put: @@ -1776,67 +1781,116 @@ ds_cont_leader_update_agg_eph(uuid_t pool_uuid, uuid_t cont_uuid, return 0; } -struct refresh_vos_agg_eph_arg { - uuid_t pool_uuid; - uuid_t cont_uuid; - daos_epoch_t min_eph; +#define EPH_ARG_TGT_INLINE (64) +struct refresh_track_eph_arg { + uuid_t pool_uuid; + uuid_t cont_uuid; + daos_epoch_t min_ec_agg_eph; + daos_epoch_t min_stable_eph; + uint8_t *tgt_status; + uint8_t tgt_status_inline[EPH_ARG_TGT_INLINE]; }; -int -cont_refresh_vos_agg_eph_one(void *data) +static int +cont_refresh_track_eph_one(void *data) { - struct refresh_vos_agg_eph_arg *arg = data; - struct ds_cont_child *cont_child; - int rc; + struct refresh_track_eph_arg *arg = data; + struct ds_cont_child *cont_child; + unsigned int idx = dss_get_module_info()->dmi_tgt_id; + int rc; rc = ds_cont_child_lookup(arg->pool_uuid, arg->cont_uuid, &cont_child); if (rc) return rc; - D_DEBUG(DB_MD, DF_CONT": %s agg boundary eph "DF_X64"->"DF_X64"\n", + D_DEBUG(DB_MD, DF_CONT": %s ec agg boundary eph "DF_X64"->"DF_X64", " + ": %s stable eph "DF_X64"->"DF_X64"\n", DP_CONT(arg->pool_uuid, arg->cont_uuid), - cont_child->sc_ec_agg_eph_boundary < arg->min_eph ? "update" : "ignore", - cont_child->sc_ec_agg_eph_boundary, arg->min_eph); - - if (cont_child->sc_ec_agg_eph_boundary < arg->min_eph) - cont_child->sc_ec_agg_eph_boundary = arg->min_eph; + cont_child->sc_ec_agg_eph_boundary < arg->min_ec_agg_eph ? "update" : "ignore", + cont_child->sc_ec_agg_eph_boundary, arg->min_ec_agg_eph, + cont_child->sc_stable_epoch < arg->min_stable_eph ? 
"update" : "ignore", + cont_child->sc_stable_epoch, arg->min_stable_eph); + + if (cont_child->sc_ec_agg_eph_boundary < arg->min_ec_agg_eph) + cont_child->sc_ec_agg_eph_boundary = arg->min_ec_agg_eph; + + /* Only should update local stable epoch if the target is in UPIN status */ + if (cont_child->sc_stable_epoch < arg->min_stable_eph && + (arg->tgt_status[idx] & PO_COMP_ST_UPIN)) { + rc = vos_cont_set_global_stable_epoch(cont_child->sc_hdl, arg->min_stable_eph); + if (rc == 0) + cont_child->sc_stable_epoch = arg->min_stable_eph; + else + rc = 0; + } ds_cont_child_put(cont_child); return rc; } int -ds_cont_tgt_refresh_agg_eph(uuid_t pool_uuid, uuid_t cont_uuid, - daos_epoch_t eph) +ds_cont_tgt_refresh_track_eph(uuid_t pool_uuid, uuid_t cont_uuid, + daos_epoch_t ec_agg_eph, daos_epoch_t stable_eph) { - struct refresh_vos_agg_eph_arg arg; - int rc; + struct ds_pool *pool; + struct pool_target *tgts; + struct refresh_track_eph_arg arg; + d_rank_t rank; + int i, rc; uuid_copy(arg.pool_uuid, pool_uuid); uuid_copy(arg.cont_uuid, cont_uuid); - arg.min_eph = eph; + arg.min_ec_agg_eph = ec_agg_eph; + arg.min_stable_eph = stable_eph; + if (likely(dss_tgt_nr <= EPH_ARG_TGT_INLINE)) { + arg.tgt_status = arg.tgt_status_inline; + } else { + D_ALLOC_ARRAY(arg.tgt_status, dss_tgt_nr); + if (arg.tgt_status == NULL) { + ds_pool_put(pool); + return -DER_NOMEM; + } + } + + rc = ds_pool_lookup(pool_uuid, &pool); + if (rc != 0) { + D_DEBUG(DB_MD, DF_UUID" lookup pool failed: %d\n", + DP_UUID(pool_uuid), rc); + goto out; + } + rank = dss_self_rank(); + rc = pool_map_find_target_by_rank_idx(pool->sp_map, rank, -1, &tgts); + D_ASSERT(rc == dss_tgt_nr); + for (i = 0; i < dss_tgt_nr; i++) { + arg.tgt_status[i] = tgts[i].ta_comp.co_status; + } + ds_pool_put(pool); rc = ds_pool_task_collective(pool_uuid, PO_COMP_ST_NEW | PO_COMP_ST_DOWN | - PO_COMP_ST_DOWNOUT, cont_refresh_vos_agg_eph_one, + PO_COMP_ST_DOWNOUT, cont_refresh_track_eph_one, &arg, DSS_ULT_FL_PERIODIC); + +out: + if (arg.tgt_status 
!= arg.tgt_status_inline) + D_FREE(arg.tgt_status); return rc; } -#define EC_AGG_EPH_INTV (10ULL * 1000) /* seconds interval to check*/ +#define TRACK_EPH_INTV (5ULL * 1000) /* seconds interval to check*/ static void -cont_agg_eph_leader_ult(void *arg) +cont_track_eph_leader_ult(void *arg) { - struct cont_svc *svc = arg; - struct ds_pool *pool = svc->cs_pool; - struct cont_ec_agg *ec_agg; - struct cont_ec_agg *tmp; - uint64_t cur_eph, new_eph; - int rc = 0; + struct cont_svc *svc = arg; + struct ds_pool *pool = svc->cs_pool; + struct cont_track_eph_leader *eph_ldr; + struct cont_track_eph_leader *tmp; + uint64_t cur_eph, new_eph; + int rc = 0; - if (svc->cs_ec_leader_ephs_req == NULL) + if (svc->cs_cont_ephs_leader_req == NULL) goto out; - while (!dss_ult_exiting(svc->cs_ec_leader_ephs_req)) { + while (!dss_ult_exiting(svc->cs_cont_ephs_leader_req)) { d_rank_list_t fail_ranks = { 0 }; if (pool->sp_rebuilding) { @@ -1853,59 +1907,64 @@ cont_agg_eph_leader_ult(void *arg) goto yield; } - d_list_for_each_entry_safe(ec_agg, tmp, &svc->cs_ec_agg_list, ea_list) { - daos_epoch_t min_eph = DAOS_EPOCH_MAX; + d_list_for_each_entry_safe(eph_ldr, tmp, &svc->cs_cont_ephs_leader_list, ea_list) { + daos_epoch_t min_ec_agg_eph = DAOS_EPOCH_MAX; + daos_epoch_t min_stable_eph = DAOS_EPOCH_MAX; int i; - if (ec_agg->ea_deleted) { - d_list_del(&ec_agg->ea_list); - D_FREE(ec_agg->ea_server_ephs); - D_FREE(ec_agg); + if (eph_ldr->ea_deleted) { + d_list_del(&eph_ldr->ea_list); + D_FREE(eph_ldr->ea_server_ephs); + D_FREE(eph_ldr); continue; } - for (i = 0; i < ec_agg->ea_servers_num; i++) { - d_rank_t rank = ec_agg->ea_server_ephs[i].rank; + for (i = 0; i < eph_ldr->ea_servers_num; i++) { + d_rank_t rank = eph_ldr->ea_server_ephs[i].re_rank; if (d_rank_in_rank_list(&fail_ranks, rank)) { D_DEBUG(DB_MD, DF_CONT" skip %u\n", DP_CONT(svc->cs_pool_uuid, - ec_agg->ea_cont_uuid), + eph_ldr->ea_cont_uuid), rank); continue; } - if (ec_agg->ea_server_ephs[i].eph < min_eph) - min_eph = 
ec_agg->ea_server_ephs[i].eph; + if (eph_ldr->ea_server_ephs[i].re_ec_agg_eph < min_ec_agg_eph) + min_ec_agg_eph = eph_ldr->ea_server_ephs[i].re_ec_agg_eph; + if (eph_ldr->ea_server_ephs[i].re_stable_eph < min_stable_eph) + min_stable_eph = eph_ldr->ea_server_ephs[i].re_stable_eph; } - if (min_eph == ec_agg->ea_current_eph) + if (min_ec_agg_eph == eph_ldr->ea_current_ec_agg_eph && + min_stable_eph == eph_ldr->ea_current_stable_eph) continue; /** * NB: during extending or reintegration, the new * server might cause the minimum epoch is less than - * ea_current_eph. + * ea_current_ec_agg_eph. */ - D_DEBUG(DB_MD, DF_CONT" minimum "DF_U64" current "DF_X64"\n", - DP_CONT(svc->cs_pool_uuid, ec_agg->ea_cont_uuid), - min_eph, ec_agg->ea_current_eph); - - cur_eph = d_hlc2sec(ec_agg->ea_current_eph); - new_eph = d_hlc2sec(min_eph); + D_DEBUG(DB_MD, DF_CONT" min_ec_agg_eph "DF_X64" current "DF_X64 + ", min_stable_eph "DF_X64" current "DF_X64".\n", + DP_CONT(svc->cs_pool_uuid, eph_ldr->ea_cont_uuid), + min_ec_agg_eph, eph_ldr->ea_current_ec_agg_eph, + min_stable_eph, eph_ldr->ea_current_stable_eph); + + cur_eph = d_hlc2sec(eph_ldr->ea_current_ec_agg_eph); + new_eph = d_hlc2sec(min_ec_agg_eph); if (cur_eph && new_eph > cur_eph && (new_eph - cur_eph) >= 600) D_WARN(DF_CONT": Sluggish EC boundary reporting. 
" "cur:"DF_U64" new:"DF_U64" gap:"DF_U64"\n", - DP_CONT(svc->cs_pool_uuid, ec_agg->ea_cont_uuid), + DP_CONT(svc->cs_pool_uuid, eph_ldr->ea_cont_uuid), cur_eph, new_eph, new_eph - cur_eph); - rc = cont_iv_ec_agg_eph_refresh(pool->sp_iv_ns, - ec_agg->ea_cont_uuid, - min_eph); + rc = cont_iv_track_eph_refresh(pool->sp_iv_ns, eph_ldr->ea_cont_uuid, + min_ec_agg_eph, min_stable_eph); if (rc) { DL_CDEBUG(rc == -DER_NONEXIST, DLOG_INFO, DLOG_ERR, rc, DF_CONT ": refresh failed", - DP_CONT(svc->cs_pool_uuid, ec_agg->ea_cont_uuid)); + DP_CONT(svc->cs_pool_uuid, eph_ldr->ea_cont_uuid)); /* If there are network error or pool map inconsistency, * let's skip the following eph sync, which will fail @@ -1919,45 +1978,46 @@ cont_agg_eph_leader_ult(void *arg) continue; } - ec_agg->ea_current_eph = min_eph; + eph_ldr->ea_current_ec_agg_eph = min_ec_agg_eph; + eph_ldr->ea_current_stable_eph = min_stable_eph; if (pool->sp_rebuilding) break; } map_ranks_fini(&fail_ranks); - if (dss_ult_exiting(svc->cs_ec_leader_ephs_req)) + if (dss_ult_exiting(svc->cs_cont_ephs_leader_req)) break; yield: - sched_req_sleep(svc->cs_ec_leader_ephs_req, EC_AGG_EPH_INTV); + sched_req_sleep(svc->cs_cont_ephs_leader_req, TRACK_EPH_INTV); } out: D_DEBUG(DB_MD, DF_UUID": stop eph ult: rc %d\n", DP_UUID(svc->cs_pool_uuid), rc); - d_list_for_each_entry_safe(ec_agg, tmp, &svc->cs_ec_agg_list, ea_list) { - d_list_del(&ec_agg->ea_list); - D_FREE(ec_agg->ea_server_ephs); - D_FREE(ec_agg); + d_list_for_each_entry_safe(eph_ldr, tmp, &svc->cs_cont_ephs_leader_list, ea_list) { + d_list_del(&eph_ldr->ea_list); + D_FREE(eph_ldr->ea_server_ephs); + D_FREE(eph_ldr); } } static int -cont_svc_ec_agg_leader_start(struct cont_svc *svc) +cont_svc_eph_track_leader_start(struct cont_svc *svc) { struct sched_req_attr attr; uuid_t anonym_uuid; - D_INIT_LIST_HEAD(&svc->cs_ec_agg_list); + D_INIT_LIST_HEAD(&svc->cs_cont_ephs_leader_list); if (unlikely(ec_agg_disabled)) return 0; - D_ASSERT(svc->cs_ec_leader_ephs_req == NULL); + 
D_ASSERT(svc->cs_cont_ephs_leader_req == NULL); uuid_clear(anonym_uuid); sched_req_attr_init(&attr, SCHED_REQ_ANONYM, &anonym_uuid); - svc->cs_ec_leader_ephs_req = sched_create_ult(&attr, cont_agg_eph_leader_ult, svc, 0); - if (svc->cs_ec_leader_ephs_req == NULL) { + svc->cs_cont_ephs_leader_req = sched_create_ult(&attr, cont_track_eph_leader_ult, svc, 0); + if (svc->cs_cont_ephs_leader_req == NULL) { D_ERROR(DF_UUID" Failed to create EC leader eph ULT.\n", DP_UUID(svc->cs_pool_uuid)); return -DER_NOMEM; @@ -1967,20 +2027,20 @@ cont_svc_ec_agg_leader_start(struct cont_svc *svc) } static void -cont_svc_ec_agg_leader_stop(struct cont_svc *svc) +cont_svc_eph_track_leader_stop(struct cont_svc *svc) { D_DEBUG(DB_MD, DF_UUID" wait for ec agg leader stop\n", DP_UUID(svc->cs_pool_uuid)); - if (svc->cs_ec_leader_ephs_req == NULL) + if (svc->cs_cont_ephs_leader_req == NULL) return; D_DEBUG(DB_MD, DF_UUID" Stopping EC query ULT\n", DP_UUID(svc->cs_pool_uuid)); - sched_req_wait(svc->cs_ec_leader_ephs_req, true); - sched_req_put(svc->cs_ec_leader_ephs_req); - svc->cs_ec_leader_ephs_req = NULL; + sched_req_wait(svc->cs_cont_ephs_leader_req, true); + sched_req_put(svc->cs_cont_ephs_leader_req); + svc->cs_cont_ephs_leader_req = NULL; } int @@ -5612,9 +5672,9 @@ ds_cont_op_handler(crt_rpc_t *rpc, int cont_proto_ver) prop = cqo->cqo_prop; } else if ((opc == CONT_OPEN) || (opc == CONT_OPEN_BYLABEL)) { - struct cont_open_out *coo = crt_reply_get(rpc); + struct cont_open_out *cout = crt_reply_get(rpc); - prop = coo->coo_prop; + prop = cout->coo_prop; } out->co_rc = rc; @@ -6009,7 +6069,7 @@ ds_cont_destroy_orphan(struct cont_svc *svc, uuid_t uuid) rc = cont_destroy_bcast(dss_get_module_info()->dmi_ctx, svc, uuid); if (rc == 0) - cont_ec_agg_delete(svc, uuid); + cont_track_eph_leader_delete(svc, uuid); out: D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO, diff --git a/src/container/srv_internal.h b/src/container/srv_internal.h index 81527044b7f..8ebaef5fd65 100644 --- 
a/src/container/srv_internal.h +++ b/src/container/srv_internal.h @@ -1,5 +1,5 @@ /* - * (C) Copyright 2016-2024 Intel Corporation. + * (C) Copyright 2016-2025 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -57,16 +57,18 @@ dsm_tls_get() extern bool ec_agg_disabled; -struct ec_eph { - d_rank_t rank; - daos_epoch_t eph; +struct rank_eph { + d_rank_t re_rank; + daos_epoch_t re_ec_agg_eph; + daos_epoch_t re_stable_eph; }; -/* container EC aggregation epoch control descriptor, which is only on leader */ -struct cont_ec_agg { +/* container EC aggregation epoch and stable epoch control descriptor, which is only on leader */ +struct cont_track_eph_leader { uuid_t ea_cont_uuid; - daos_epoch_t ea_current_eph; - struct ec_eph *ea_server_ephs; + daos_epoch_t ea_current_ec_agg_eph; + daos_epoch_t ea_current_stable_eph; + struct rank_eph *ea_server_ephs; d_list_t ea_list; int ea_servers_num; uint32_t ea_deleted:1; @@ -88,9 +90,9 @@ struct cont_svc { rdb_path_t cs_hdls; /* container handle KVS */ struct ds_pool *cs_pool; - /* Manage the EC aggregation epoch */ - struct sched_request *cs_ec_leader_ephs_req; - d_list_t cs_ec_agg_list; /* link cont_ec_agg */ + /* Manage the EC aggregation epoch and stable epoch */ + struct sched_request *cs_cont_ephs_leader_req; + d_list_t cs_cont_ephs_leader_list; /* link cont_track_eph_leader */ }; /* Container descriptor */ @@ -157,18 +159,19 @@ struct cont_iv_prop { struct daos_acl cip_acl; }; -struct cont_iv_agg_eph { - daos_epoch_t eph; - d_rank_t rank; +struct cont_iv_track_eph { + daos_epoch_t ite_ec_agg_eph; + daos_epoch_t ite_stable_eph; + d_rank_t ite_rank; }; struct cont_iv_entry { uuid_t cont_uuid; union { - struct cont_iv_snapshot iv_snap; - struct cont_iv_capa iv_capa; - struct cont_iv_prop iv_prop; - struct cont_iv_agg_eph iv_agg_eph; + struct cont_iv_snapshot iv_snap; + struct cont_iv_capa iv_capa; + struct cont_iv_prop iv_prop; + struct cont_iv_track_eph iv_track_eph; }; }; @@ -209,8 +212,8 @@ int struct 
container_hdl *hdl, crt_rpc_t *rpc, int cont_proto_ver); int ds_cont_get_prop(uuid_t pool_uuid, uuid_t cont_uuid, daos_prop_t **prop_out); -int ds_cont_leader_update_agg_eph(uuid_t pool_uuid, uuid_t cont_uuid, - d_rank_t rank, daos_epoch_t eph); +int ds_cont_leader_update_track_eph(uuid_t pool_uuid, uuid_t cont_uuid, + d_rank_t rank, daos_epoch_t agg_eph, daos_epoch_t stable_eph); /* srv_epoch.c */ int @@ -269,8 +272,8 @@ int ds_cont_tgt_snapshots_update(uuid_t pool_uuid, uuid_t cont_uuid, uint64_t *snapshots, int snap_count); int ds_cont_tgt_snapshots_refresh(uuid_t pool_uuid, uuid_t cont_uuid); int ds_cont_tgt_close(uuid_t pool_uuid, uuid_t cont_hdl_uuid); -int ds_cont_tgt_refresh_agg_eph(uuid_t pool_uuid, uuid_t cont_uuid, - daos_epoch_t eph); +int ds_cont_tgt_refresh_track_eph(uuid_t pool_uuid, uuid_t cont_uuid, + daos_epoch_t ec_agg_eph, daos_epoch_t stable_eph); int ds_cont_tgt_prop_update(uuid_t pool_uuid, uuid_t cont_uuid, daos_prop_t *prop); /* oid_iv.c */ @@ -294,8 +297,10 @@ int cont_iv_prop_update(void *ns, uuid_t cont_uuid, daos_prop_t *prop, bool sync int cont_iv_snapshots_refresh(void *ns, uuid_t cont_uuid); int cont_iv_snapshots_update(void *ns, uuid_t cont_uuid, uint64_t *snapshots, int snap_count); -int cont_iv_ec_agg_eph_update(void *ns, uuid_t cont_uuid, daos_epoch_t eph); -int cont_iv_ec_agg_eph_refresh(void *ns, uuid_t cont_uuid, daos_epoch_t eph); +int cont_iv_track_eph_update(void *ns, uuid_t cont_uuid, daos_epoch_t ec_agg_eph, + daos_epoch_t stable_eph); +int cont_iv_track_eph_refresh(void *ns, uuid_t cont_uuid, daos_epoch_t ec_agg_eph, + daos_epoch_t stable_eph); int cont_iv_entry_delete(void *ns, uuid_t pool_uuid, uuid_t cont_uuid); /* srv_metrics.c*/ diff --git a/src/container/srv_target.c b/src/container/srv_target.c index 71d13f637bd..a4e2cad02a0 100644 --- a/src/container/srv_target.c +++ b/src/container/srv_target.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2016-2024 Intel Corporation. + * (C) Copyright 2016-2025 Intel Corporation. 
* * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -35,6 +35,9 @@ #include #include +static int cont_tgt_track_eph_init(struct ds_cont_child *cont_child); +static void cont_tgt_track_eph_fini(struct ds_cont_child *cont); + /* Per VOS container aggregation ULT ***************************************/ static inline struct sched_request * @@ -471,6 +474,12 @@ cont_aggregate_interval(struct ds_cont_child *cont, cont_aggregate_cb_t cb, */ uint64_t msecs = 2000; + /* Reuse the vos aggregation ULT to periodically query the stable epoch, + * ds_cont_track_eph_query_ult() will read it and report through IV. + */ + if (param->ap_vos_agg && cont->sc_query_stable_eph != NULL) + *cont->sc_query_stable_eph = vos_cont_get_local_stable_epoch(cont->sc_hdl); + if (!cont_aggregate_runnable(cont, req, param->ap_vos_agg)) goto next; @@ -886,6 +895,8 @@ cont_child_stop(struct ds_cont_child *cont_child) dtx_cont_deregister(cont_child); D_ASSERT(cont_child->sc_dtx_registered == 0); + cont_tgt_track_eph_fini(cont_child); + /* cont_stop_agg() may yield */ cont_stop_agg(cont_child); ds_cont_child_put(cont_child); @@ -957,12 +968,19 @@ cont_child_start(struct ds_pool_child *pool_child, const uuid_t co_uuid, rc = -DER_SHUTDOWN; } else if (!cont_child_started(cont_child)) { if (!ds_pool_restricted(pool_child->spc_pool, false)) { - rc = cont_start_agg(cont_child); + rc = cont_tgt_track_eph_init(cont_child); if (rc != 0) goto out; + rc = cont_start_agg(cont_child); + if (rc != 0) { + cont_tgt_track_eph_fini(cont_child); + goto out; + } + rc = dtx_cont_register(cont_child); if (rc != 0) { + cont_tgt_track_eph_fini(cont_child); cont_stop_agg(cont_child); goto out; } @@ -2378,131 +2396,229 @@ ds_cont_oid_alloc_handler(crt_rpc_t *rpc) crt_reply_send(rpc); } -/* Track each container EC aggregation Epoch under ds_pool */ -struct cont_ec_eph { - uuid_t ce_cont_uuid; - d_list_t ce_list; - daos_epoch_t *ce_ephs; - daos_epoch_t ce_last_eph; - uint32_t ce_ephs_cnt; - int ce_ref; +/* Track each container 
EC aggregation Epoch and stable epoch under ds_pool */ +struct cont_track_eph { + uuid_t cte_cont_uuid; + d_list_t cte_list; + /* each target's stable epoch */ + daos_epoch_t *cte_stable_ephs; + /* each target's EC aggregation epoch */ + daos_epoch_t *cte_ec_agg_ephs; + /* last reported (through IV) stable epoch */ + daos_epoch_t cte_last_stable_epoch; + /* last reported (through IV) EC aggregation epoch */ + daos_epoch_t cte_last_ec_agg_epoch; + /* number of tracked epochs (dss_tgt_nr) */ + uint32_t cte_ephs_cnt; + int cte_ref; }; -/* list for the eph for the pool */ -struct cont_eph_list { - uuid_t ce_pool_uuid; - d_list_t ce_list; -}; - -static struct cont_ec_eph * -cont_ec_eph_lookup(d_list_t *ec_list, uuid_t cont_uuid) +static struct cont_track_eph * +cont_track_eph_lookup(d_list_t *ec_list, uuid_t cont_uuid) { - struct cont_ec_eph *found = NULL; + struct cont_track_eph *found = NULL; - d_list_for_each_entry(found, ec_list, ce_list) { - if (found->ce_ref == 0) + d_list_for_each_entry(found, ec_list, cte_list) { + if (found->cte_ref == 0) continue; - if (uuid_compare(found->ce_cont_uuid, cont_uuid) == 0) + if (uuid_compare(found->cte_cont_uuid, cont_uuid) == 0) return found; } return NULL; } -static struct cont_ec_eph * -cont_ec_eph_alloc(d_list_t *ec_list, uuid_t cont_uuid) +static struct cont_track_eph * +cont_track_eph_alloc(d_list_t *ec_list, uuid_t cont_uuid) { - struct cont_ec_eph *new_ec; + struct cont_track_eph *new_ec; D_ALLOC_PTR(new_ec); if (new_ec == NULL) return NULL; - uuid_copy(new_ec->ce_cont_uuid, cont_uuid); - D_ALLOC_ARRAY(new_ec->ce_ephs, dss_tgt_nr); - if (new_ec->ce_ephs == NULL) { + uuid_copy(new_ec->cte_cont_uuid, cont_uuid); + D_ALLOC_ARRAY(new_ec->cte_stable_ephs, dss_tgt_nr); + if (new_ec->cte_stable_ephs == NULL) { + D_FREE(new_ec); + return NULL; + } + D_ALLOC_ARRAY(new_ec->cte_ec_agg_ephs, dss_tgt_nr); + if (new_ec->cte_ec_agg_ephs == NULL) { + D_FREE(new_ec->cte_stable_ephs); D_FREE(new_ec); return NULL; } - new_ec->ce_ephs_cnt = 
dss_tgt_nr; - d_list_add(&new_ec->ce_list, ec_list); - new_ec->ce_ref = 0; + new_ec->cte_ephs_cnt = dss_tgt_nr; + d_list_add(&new_ec->cte_list, ec_list); + new_ec->cte_ref = 0; return new_ec; } -int -ds_cont_ec_eph_insert(struct ds_pool *pool, uuid_t cont_uuid, int tgt_idx, - uint64_t **epoch_p) +static int +cont_track_eph_insert(struct ds_pool *pool, uuid_t cont_uuid, int tgt_idx, + uint64_t **ec_agg_epoch_p, uint64_t **stable_epoch_p) { - struct cont_ec_eph *new_eph; + struct cont_track_eph *new_eph; int rc = 0; D_ASSERT(dss_get_module_info()->dmi_xs_id == 0); - new_eph = cont_ec_eph_lookup(&pool->sp_ec_ephs_list, cont_uuid); + new_eph = cont_track_eph_lookup(&pool->sp_ec_ephs_list, cont_uuid); if (new_eph == NULL) { - new_eph = cont_ec_eph_alloc(&pool->sp_ec_ephs_list, cont_uuid); + new_eph = cont_track_eph_alloc(&pool->sp_ec_ephs_list, cont_uuid); if (new_eph == NULL) D_GOTO(out, rc = -DER_NOMEM); } - new_eph->ce_ref++; + new_eph->cte_ref++; D_DEBUG(DB_MD, DF_UUID "add %d tgt to epoch query list %d\n", - DP_UUID(cont_uuid), tgt_idx, new_eph->ce_ref); - D_ASSERT(tgt_idx < new_eph->ce_ephs_cnt); - new_eph->ce_ephs[tgt_idx] = 0; - *epoch_p = &new_eph->ce_ephs[tgt_idx]; + DP_UUID(cont_uuid), tgt_idx, new_eph->cte_ref); + D_ASSERT(tgt_idx < new_eph->cte_ephs_cnt); + new_eph->cte_ec_agg_ephs[tgt_idx] = 0; + new_eph->cte_stable_ephs[tgt_idx] = 0; + *ec_agg_epoch_p = &new_eph->cte_ec_agg_ephs[tgt_idx]; + *stable_epoch_p = &new_eph->cte_stable_ephs[tgt_idx]; out: return rc; } -int -ds_cont_ec_eph_delete(struct ds_pool *pool, uuid_t cont_uuid, int tgt_idx) +static void +cont_track_eph_delete(struct ds_pool *pool, uuid_t cont_uuid, int tgt_idx) { - struct cont_ec_eph *ec_eph; + struct cont_track_eph *ec_eph; D_ASSERT(dss_get_module_info()->dmi_xs_id == 0); - ec_eph = cont_ec_eph_lookup(&pool->sp_ec_ephs_list, cont_uuid); + ec_eph = cont_track_eph_lookup(&pool->sp_ec_ephs_list, cont_uuid); if (ec_eph == NULL) - return 0; + return; - D_ASSERT(tgt_idx < ec_eph->ce_ephs_cnt); 
- D_ASSERT(ec_eph->ce_ref > 0); - ec_eph->ce_ref--; + D_ASSERT(tgt_idx < ec_eph->cte_ephs_cnt); + D_ASSERT(ec_eph->cte_ref > 0); + ec_eph->cte_ref--; D_DEBUG(DB_MD, DF_UUID "delete %d tgt ref %d.\n", - DP_UUID(cont_uuid), tgt_idx, ec_eph->ce_ref); - return 0; + DP_UUID(cont_uuid), tgt_idx, ec_eph->cte_ref); + return; } static void -cont_ec_eph_destroy(struct cont_ec_eph *ec_eph) +cont_track_eph_destroy(struct cont_track_eph *ec_eph) { - D_ASSERT(ec_eph->ce_ref == 0); - d_list_del(&ec_eph->ce_list); - D_FREE(ec_eph->ce_ephs); + D_ASSERT(ec_eph->cte_ref == 0); + d_list_del(&ec_eph->cte_list); + D_FREE(ec_eph->cte_stable_ephs); + D_FREE(ec_eph->cte_ec_agg_ephs); D_FREE(ec_eph); } void -ds_cont_ec_eph_free(struct ds_pool *pool) +ds_cont_track_eph_free(struct ds_pool *pool) +{ + struct cont_track_eph *ec_eph, *tmp; + + d_list_for_each_entry_safe(ec_eph, tmp, &pool->sp_ec_ephs_list, cte_list) + cont_track_eph_destroy(ec_eph); +} + +struct track_eph_ult_arg { + struct ds_pool *pool; + uuid_t cont_uuid; + uint32_t tgt_idx; + daos_epoch_t *ec_agg_eph; + daos_epoch_t *stable_eph; +}; + +static int +cont_track_eph_fini_ult(void *data) +{ + struct track_eph_ult_arg *arg = data; + + cont_track_eph_delete(arg->pool, arg->cont_uuid, arg->tgt_idx); + return 0; +} + +static void +cont_tgt_track_eph_fini(struct ds_cont_child *cont_child) +{ + struct track_eph_ult_arg arg; + + if (cont_child->sc_query_ec_agg_eph == NULL) + return; + D_ASSERT(cont_child->sc_query_stable_eph != NULL); + + arg.pool = cont_child->sc_pool->spc_pool; + uuid_copy(arg.cont_uuid, cont_child->sc_uuid); + arg.tgt_idx = dss_get_module_info()->dmi_tgt_id; + dss_ult_execute(cont_track_eph_fini_ult, &arg, NULL, NULL, DSS_XS_SYS, 0, 0); + + cont_child->sc_query_ec_agg_eph = NULL; + cont_child->sc_query_stable_eph = NULL; +} + +static int +cont_track_eph_init_ult(void *data) { - struct cont_ec_eph *ec_eph, *tmp; + struct track_eph_ult_arg *arg = data; + int rc; - d_list_for_each_entry_safe(ec_eph, tmp, 
&pool->sp_ec_ephs_list, ce_list) - cont_ec_eph_destroy(ec_eph); + rc = cont_track_eph_insert(arg->pool, arg->cont_uuid, arg->tgt_idx, &arg->ec_agg_eph, + &arg->stable_eph); + return rc; +} + +static void +cont_tgt_track_eph_init_ult(void *data) +{ + struct ds_cont_child *cont_child = data; + struct track_eph_ult_arg arg; + int rc; + + arg.pool = cont_child->sc_pool->spc_pool; + uuid_copy(arg.cont_uuid, cont_child->sc_uuid); + arg.tgt_idx = dss_get_module_info()->dmi_tgt_id; + rc = dss_ult_execute(cont_track_eph_init_ult, &arg, NULL, NULL, DSS_XS_SYS, + 0, 0); + if (rc) { + DL_ERROR(rc, DF_CONT " init track eph failed.\n", + DP_CONT(cont_child->sc_pool->spc_uuid, cont_child->sc_uuid)); + ds_cont_child_put(cont_child); + return; + } + + D_DEBUG(DB_MD, DF_UUID " update init track %u\n", + DP_UUID(cont_child->sc_uuid), arg.tgt_idx); + cont_child->sc_query_ec_agg_eph = arg.ec_agg_eph; + cont_child->sc_query_stable_eph = arg.stable_eph; + + ds_cont_child_put(cont_child); +} + +static int +cont_tgt_track_eph_init(struct ds_cont_child *cont_child) +{ + int rc; + + ds_cont_child_get(cont_child); + + rc = dss_ult_create(cont_tgt_track_eph_init_ult, cont_child, DSS_XS_SELF, + 0, 0, NULL); + if (rc != 0) + ds_cont_child_put(cont_child); + + return rc; } /** * This ULT is actually per pool to collect all container EC aggregation * epoch, then report to the container service leader. 
*/ -#define EC_TGT_AGG_INTV (10ULL * 1000) /* seconds interval to check*/ +#define EC_TGT_EPH_QUERY_INTV (5ULL * 1000) /* check interval: 5 seconds, in milliseconds */ void -ds_cont_tgt_ec_eph_query_ult(void *data) +ds_cont_track_eph_query_ult(void *data) { struct ds_pool *pool = data; - struct cont_ec_eph *ec_eph; - struct cont_ec_eph *tmp; + struct cont_track_eph *ec_eph; + struct cont_track_eph *tmp; int rc; D_DEBUG(DB_MD, DF_UUID" start tgt ec query eph ULT\n", @@ -2525,19 +2641,22 @@ ds_cont_tgt_ec_eph_query_ult(void *data) goto yield; } - d_list_for_each_entry_safe(ec_eph, tmp, &pool->sp_ec_ephs_list, ce_list) { - daos_epoch_t min_eph = DAOS_EPOCH_MAX; + d_list_for_each_entry_safe(ec_eph, tmp, &pool->sp_ec_ephs_list, cte_list) { + daos_epoch_t min_ec_agg_eph; + daos_epoch_t min_stable_eph; int i; if (dss_ult_exiting(pool->sp_ec_ephs_req)) break; - if (ec_eph->ce_ref == 0) { - cont_ec_eph_destroy(ec_eph); + if (ec_eph->cte_ref == 0) { + cont_track_eph_destroy(ec_eph); continue; } - for (i = 0; i < ec_eph->ce_ephs_cnt; i++) { + min_ec_agg_eph = DAOS_EPOCH_MAX; + min_stable_eph = DAOS_EPOCH_MAX; + for (i = 0; i < ec_eph->cte_ephs_cnt; i++) { bool is_failed_tgts = false; int j; @@ -2548,39 +2667,54 @@ ds_cont_tgt_ec_eph_query_ult(void *data) } } - if (!is_failed_tgts) - min_eph = min(min_eph, ec_eph->ce_ephs[i]); + if (!is_failed_tgts) { + min_ec_agg_eph = min(min_ec_agg_eph, + ec_eph->cte_ec_agg_ephs[i]); + min_stable_eph = min(min_stable_eph, + ec_eph->cte_stable_ephs[i]); + } } - if (min_eph == 0 || min_eph == DAOS_EPOCH_MAX || - min_eph <= ec_eph->ce_last_eph) { - if (min_eph > 0 && min_eph < ec_eph->ce_last_eph) - D_ERROR("ignore for now "DF_X64" < "DF_X64 - " "DF_UUID"\n", min_eph, ec_eph->ce_last_eph, - DP_UUID(ec_eph->ce_cont_uuid)); + if (min_ec_agg_eph == 0 || min_ec_agg_eph == DAOS_EPOCH_MAX || + min_stable_eph == 0 || min_stable_eph == DAOS_EPOCH_MAX || + (min_ec_agg_eph <= ec_eph->cte_last_ec_agg_epoch && + min_stable_eph <= ec_eph->cte_last_stable_epoch)) { + if 
(min_ec_agg_eph > 0 && min_stable_eph > 0 && + min_ec_agg_eph <= ec_eph->cte_last_ec_agg_epoch && + min_stable_eph <= ec_eph->cte_last_stable_epoch) + D_ERROR("ignore for now "DF_X64" <= "DF_X64 + ", "DF_X64" <= "DF_X64 ", "DF_UUID"\n", + min_ec_agg_eph, ec_eph->cte_last_ec_agg_epoch, + min_stable_eph, ec_eph->cte_last_stable_epoch, + DP_UUID(ec_eph->cte_cont_uuid)); else D_DEBUG(DB_MD, "Skip eph "DF_X64"/"DF_X64 - " "DF_UUID"\n", min_eph, ec_eph->ce_last_eph, - DP_UUID(ec_eph->ce_cont_uuid)); + ", "DF_X64"/"DF_X64", "DF_UUID"\n", + min_ec_agg_eph, ec_eph->cte_last_ec_agg_epoch, + min_stable_eph, ec_eph->cte_last_stable_epoch, + DP_UUID(ec_eph->cte_cont_uuid)); continue; } - D_DEBUG(DB_MD, "Update eph "DF_X64" "DF_UUID"\n", - min_eph, DP_UUID(ec_eph->ce_cont_uuid)); - rc = cont_iv_ec_agg_eph_update(pool->sp_iv_ns, ec_eph->ce_cont_uuid, - min_eph); - if (rc == 0) - ec_eph->ce_last_eph = min_eph; - else + D_DEBUG(DB_MD, "Update ec_agg_eph "DF_X64", stable_eph "DF_X64", " + DF_UUID"\n", min_ec_agg_eph, min_stable_eph, + DP_UUID(ec_eph->cte_cont_uuid)); + rc = cont_iv_track_eph_update(pool->sp_iv_ns, ec_eph->cte_cont_uuid, + min_ec_agg_eph, min_stable_eph); + if (rc == 0) { + ec_eph->cte_last_ec_agg_epoch = min_ec_agg_eph; + ec_eph->cte_last_stable_epoch = min_stable_eph; + } else { D_INFO(DF_CONT": Update min epoch: %d\n", - DP_CONT(pool->sp_uuid, ec_eph->ce_cont_uuid), rc); + DP_CONT(pool->sp_uuid, ec_eph->cte_cont_uuid), rc); + } } D_FREE(failed_tgts); yield: if (dss_ult_exiting(pool->sp_ec_ephs_req)) break; - sched_req_sleep(pool->sp_ec_ephs_req, EC_TGT_AGG_INTV); + sched_req_sleep(pool->sp_ec_ephs_req, EC_TGT_EPH_QUERY_INTV); } out: D_INFO(DF_UUID" stop tgt ec query eph ULT\n", DP_UUID(pool->sp_uuid)); diff --git a/src/include/daos_srv/container.h b/src/include/daos_srv/container.h index 14953a1972a..21bbf21a948 100644 --- a/src/include/daos_srv/container.h +++ b/src/include/daos_srv/container.h @@ -1,5 +1,5 @@ /* - * (C) Copyright 2015-2024 Intel Corporation. 
+ * (C) Copyright 2015-2025 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -111,6 +111,12 @@ struct ds_cont_child { /* Last timestamp when EC aggregation reports -DER_INPROGRESS. */ uint64_t sc_ec_agg_busy_ts; + + /* The global minimum stable epoch. All data at lower epochs should have been globally + * stable (committed or aborted). Used as the start epoch for incremental reintegration. + */ + uint64_t sc_stable_epoch; + /* The global minimum EC aggregation epoch, which will be upper * limit for VOS aggregation, i.e. EC object VOS aggregation can * not cross this limit. For simplification purpose, all objects @@ -119,10 +125,11 @@ struct ds_cont_child { uint64_t sc_ec_agg_eph_boundary; /* The current EC aggregate epoch for this xstream */ uint64_t sc_ec_agg_eph; - /* Used by cont_ec_eph_query_ult to query the minimum EC agg epoch from all - * local VOS. + /* Used by ds_cont_track_eph_query_ult to query the minimum ec_agg_eph and stable_eph + * from all local VOS. */ - uint64_t *sc_ec_query_agg_eph; + uint64_t *sc_query_ec_agg_eph; + uint64_t *sc_query_stable_eph; /** * Timestamp of last EC update, which is used by aggregation to check * if it needs to do EC aggregate. 
@@ -270,11 +277,8 @@ int dsc_cont_close(daos_handle_t poh, daos_handle_t coh); struct daos_csummer *dsc_cont2csummer(daos_handle_t coh); int dsc_cont_get_props(daos_handle_t coh, struct cont_props *props); -void ds_cont_tgt_ec_eph_query_ult(void *data); -int ds_cont_ec_eph_insert(struct ds_pool *pool, uuid_t cont_uuid, int tgt_idx, - uint64_t **epoch_p); -int ds_cont_ec_eph_delete(struct ds_pool *pool, uuid_t cont_uuid, int tgt_idx); -void ds_cont_ec_eph_free(struct ds_pool *pool); +void ds_cont_track_eph_query_ult(void *data); +void ds_cont_track_eph_free(struct ds_pool *pool); void ds_cont_ec_timestamp_update(struct ds_cont_child *cont); diff --git a/src/include/daos_srv/iv.h b/src/include/daos_srv/iv.h index b453fa121ab..d86c666871d 100644 --- a/src/include/daos_srv/iv.h +++ b/src/include/daos_srv/iv.h @@ -1,5 +1,5 @@ /** - * (C) Copyright 2017-2024 Intel Corporation. + * (C) Copyright 2017-2025 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -287,14 +287,14 @@ enum iv_key { /* Container properties */ IV_CONT_PROP, IV_POOL_HDL, - /* Each server report its own EC aggregation epoch to the container + /* Each server reports its own EC aggregation epoch and stable epoch to the container * service leader */ - IV_CONT_AGG_EPOCH_REPORT, - /* leader sync the minimum epoch(VOS aggregate epoch boundary) to all + IV_CONT_TRACK_EPOCH_REPORT, + /* Leader syncs the minimum epochs (VOS aggregate epoch boundary and stable epoch) to all * other servers */ - IV_CONT_AGG_EPOCH_BOUNDRY, + IV_CONT_TRACK_EPOCH, IV_CHK, }; diff --git a/src/include/daos_srv/pool.h b/src/include/daos_srv/pool.h index abb539fa74b..3a24a5009da 100644 --- a/src/include/daos_srv/pool.h +++ b/src/include/daos_srv/pool.h @@ -1,5 +1,5 @@ /* - * (C) Copyright 2016-2024 Intel Corporation. + * (C) Copyright 2016-2025 Intel Corporation. 
* * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -380,6 +380,7 @@ int dsc_pool_svc_check_evict(uuid_t pool_uuid, d_rank_list_t *ranks, uint64_t de uuid_t *handles, size_t n_handles, uint32_t destroy, uint32_t force, char *machine, uint32_t *count); +int ds_pool_target_status(struct ds_pool *pool, uint32_t id); int ds_pool_target_status_check(struct ds_pool *pool, uint32_t id, uint8_t matched_status, struct pool_target **p_tgt); int ds_pool_mark_connectable(struct ds_pool_svc *ds_svc); diff --git a/src/object/srv_ec_aggregate.c b/src/object/srv_ec_aggregate.c index 3891a812ef2..357dfbf664b 100644 --- a/src/object/srv_ec_aggregate.c +++ b/src/object/srv_ec_aggregate.c @@ -1,5 +1,6 @@ /** - * (C) Copyright 2020-2024 Intel Corporation. + * (C) Copyright 2020-2025 Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -2533,31 +2534,18 @@ agg_iterate_pre_cb(daos_handle_t ih, vos_iter_entry_t *entry, return rc; } -struct ec_agg_ult_arg { - struct ec_agg_param *param; - daos_epoch_t *ec_query_p; - uint32_t tgt_idx; -}; - /* Captures the IV values need for pool and container open. Runs in * system xstream. 
*/ static int ec_agg_init_ult(void *arg) { - struct ec_agg_ult_arg *ult_arg = arg; - struct ec_agg_param *agg_param = ult_arg->param; + struct ec_agg_param *agg_param = arg; struct ds_pool *pool = agg_param->ap_pool_info.api_pool; struct daos_prop_entry *entry = NULL; daos_prop_t *prop = NULL; int rc; - rc = ds_cont_ec_eph_insert(agg_param->ap_pool_info.api_pool, - agg_param->ap_pool_info.api_cont_uuid, - ult_arg->tgt_idx, &ult_arg->ec_query_p); - if (rc) - D_GOTO(out, rc); - rc = ds_pool_iv_srv_hdl_fetch(pool, &agg_param->ap_pool_info.api_poh_uuid, &agg_param->ap_pool_info.api_coh_uuid); if (rc) @@ -2590,32 +2578,10 @@ ec_agg_init_ult(void *arg) return rc; } -static int -ec_agg_fini_ult(void *arg) -{ - struct ec_agg_ult_arg *ult_arg = arg; - struct ec_agg_param *agg_param = ult_arg->param; - int rc; - - rc = ds_cont_ec_eph_delete(agg_param->ap_pool_info.api_pool, - agg_param->ap_pool_info.api_cont_uuid, - ult_arg->tgt_idx); - D_ASSERT(rc == 0); - return 0; -} - static void ec_agg_param_fini(struct ds_cont_child *cont, struct ec_agg_param *agg_param) { struct ec_agg_entry *agg_entry = &agg_param->ap_agg_entry; - struct ec_agg_ult_arg arg; - - arg.param = agg_param; - arg.tgt_idx = dss_get_module_info()->dmi_tgt_id; - if (cont->sc_ec_query_agg_eph) { - dss_ult_execute(ec_agg_fini_ult, &arg, NULL, NULL, DSS_XS_SYS, 0, 0); - cont->sc_ec_query_agg_eph = NULL; - } if (daos_handle_is_valid(agg_param->ap_pool_info.api_cont_hdl)) dsc_cont_close(agg_param->ap_pool_info.api_pool_hdl, @@ -2637,7 +2603,6 @@ ec_agg_param_init(struct ds_cont_child *cont, struct agg_param *param) { struct ec_agg_param *agg_param = param->ap_data; struct ec_agg_pool_info *info = &agg_param->ap_pool_info; - struct ec_agg_ult_arg arg = { 0 }; int rc; D_ASSERT(agg_param->ap_initialized == 0); @@ -2651,11 +2616,7 @@ ec_agg_param_init(struct ds_cont_child *cont, struct agg_param *param) agg_param->ap_credits_max = EC_AGG_ITERATION_MAX; 
D_INIT_LIST_HEAD(&agg_param->ap_agg_entry.ae_cur_stripe.as_dextents); - arg.param = agg_param; - arg.tgt_idx = dss_get_module_info()->dmi_tgt_id; - rc = dss_ult_execute(ec_agg_init_ult, &arg, NULL, NULL, DSS_XS_SYS, 0, 0); - if (arg.ec_query_p != NULL) - cont->sc_ec_query_agg_eph = arg.ec_query_p; + rc = dss_ult_execute(ec_agg_init_ult, agg_param, NULL, NULL, DSS_XS_SYS, 0, 0); if (rc != 0) D_GOTO(out, rc); @@ -2814,10 +2775,10 @@ cont_ec_aggregate_cb(struct ds_cont_child *cont, daos_epoch_range_t *epr, if (rc == 0) { cont->sc_ec_agg_eph = max(cont->sc_ec_agg_eph, epr->epr_hi); - if (!cont->sc_stopping && cont->sc_ec_query_agg_eph) { + if (!cont->sc_stopping && cont->sc_query_ec_agg_eph) { uint64_t orig, cur; - orig = d_hlc2sec(*cont->sc_ec_query_agg_eph); + orig = d_hlc2sec(*cont->sc_query_ec_agg_eph); cur = d_hlc2sec(cont->sc_ec_agg_eph); if (orig && cur > orig && (cur - orig) >= 600) D_WARN(DF_CONT" Sluggish EC boundary bumping: " @@ -2825,7 +2786,7 @@ cont_ec_aggregate_cb(struct ds_cont_child *cont, daos_epoch_range_t *epr, DP_CONT(cont->sc_pool_uuid, cont->sc_uuid), orig, cur, cur - orig); - *cont->sc_ec_query_agg_eph = min(ec_agg_param->ap_min_unagg_eph, + *cont->sc_query_ec_agg_eph = min(ec_agg_param->ap_min_unagg_eph, cont->sc_ec_agg_eph); } } diff --git a/src/pool/srv_pool.c b/src/pool/srv_pool.c index 06a5f29e866..cf96675f98c 100644 --- a/src/pool/srv_pool.c +++ b/src/pool/srv_pool.c @@ -1,5 +1,5 @@ /* - * (C) Copyright 2016-2024 Intel Corporation. + * (C) Copyright 2016-2025 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -8467,6 +8467,22 @@ is_pool_from_srv(uuid_t pool_uuid, uuid_t poh_uuid) return rc ? true : false; } +/* Query the target(by id)'s status */ +int +ds_pool_target_status(struct ds_pool *pool, uint32_t id) +{ + struct pool_target *target; + int rc; + + ABT_rwlock_rdlock(pool->sp_lock); + rc = pool_map_find_target(pool->sp_map, id, &target); + ABT_rwlock_unlock(pool->sp_lock); + if (rc <= 0) + return rc == 0 ? 
-DER_NONEXIST : rc; + + return (int)target->ta_comp.co_status; +} + /* Check if the target(by id) matched the status */ int ds_pool_target_status_check(struct ds_pool *pool, uint32_t id, uint8_t matched_status, diff --git a/src/pool/srv_target.c b/src/pool/srv_target.c index 8d1cc7d6aa4..3ea050e4814 100644 --- a/src/pool/srv_target.c +++ b/src/pool/srv_target.c @@ -1,5 +1,5 @@ /* - * (C) Copyright 2016-2024 Intel Corporation. + * (C) Copyright 2016-2025 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -926,7 +926,7 @@ pool_free_ref(struct daos_llink *llink) D_ASSERT(d_list_empty(&pool->sp_hdls)); - ds_cont_ec_eph_free(pool); + ds_cont_track_eph_free(pool); pl_map_disconnect(pool->sp_uuid); if (pool->sp_map != NULL) @@ -1090,13 +1090,13 @@ pool_fetch_hdls_ult(void *data) } static void -tgt_ec_eph_query_ult(void *data) +tgt_track_eph_query_ult(void *data) { - ds_cont_tgt_ec_eph_query_ult(data); + ds_cont_track_eph_query_ult(data); } static int -ds_pool_start_ec_eph_query_ult(struct ds_pool *pool) +ds_pool_start_track_eph_query_ult(struct ds_pool *pool) { struct sched_req_attr attr; uuid_t anonym_uuid; @@ -1107,7 +1107,7 @@ ds_pool_start_ec_eph_query_ult(struct ds_pool *pool) D_ASSERT(pool->sp_ec_ephs_req == NULL); uuid_clear(anonym_uuid); sched_req_attr_init(&attr, SCHED_REQ_ANONYM, &anonym_uuid); - pool->sp_ec_ephs_req = sched_create_ult(&attr, tgt_ec_eph_query_ult, pool, + pool->sp_ec_ephs_req = sched_create_ult(&attr, tgt_track_eph_query_ult, pool, DSS_DEEP_STACK_SZ); if (pool->sp_ec_ephs_req == NULL) { D_ERROR(DF_UUID": failed create ec eph equery ult.\n", @@ -1240,7 +1240,7 @@ ds_pool_start(uuid_t uuid, bool aft_chk, bool immutable) } if (!ds_pool_restricted(pool, false)) { - rc = ds_pool_start_ec_eph_query_ult(pool); + rc = ds_pool_start_track_eph_query_ult(pool); if (rc != 0) { D_ERROR(DF_UUID": failed to start ec eph query ult: "DF_RC"\n", DP_UUID(uuid), DP_RC(rc)); diff --git a/src/vos/vos_container.c b/src/vos/vos_container.c index 
373a3765022..94afcb0ea53 100644 --- a/src/vos/vos_container.c +++ b/src/vos/vos_container.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2016-2024 Intel Corporation. + * (C) Copyright 2016-2025 Intel Corporation. * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent @@ -473,7 +473,7 @@ vos_cont_open(daos_handle_t poh, uuid_t co_uuid, daos_handle_t *coh) * current container reopen (such as for engine restart without related pool service * down), but related RPC was not forwarded to current engine in time. After current * engine re-opening the container (shard), it will reject such old modification and - * ask related DTX leader to restart the transaction. It only may affect inflight IO + * ask related DTX leader to restart the transaction. It only may affect in-flight IO * during re-opening container without restarting pool service. * * With the assignment, we also do not need to consider former EC/VOS aggregation up @@ -934,6 +934,13 @@ vos_cont_get_globla_stable_epoch(daos_handle_t coh) cont = vos_hdl2cont(coh); D_ASSERT(cont != NULL); + if (cont->vc_pool->vp_pool_df->pd_version < VOS_POOL_DF_2_8) { + D_DEBUG(DB_MD, DF_CONT" return 0 stable epoch for lower pool version %d\n", + DP_CONT(cont->vc_pool->vp_pool_df->pd_id, cont->vc_id), + cont->vc_pool->vp_pool_df->pd_version); + return 0; + } + cont_ext = umem_off2ptr(vos_cont2umm(cont), cont->vc_cont_df->cd_ext); if (cont_ext != NULL) epoch = cont_ext->ced_global_stable_epoch;