From 7c878973e2dcf6476a2bcff5eaebb45afb683235 Mon Sep 17 00:00:00 2001 From: steviez Date: Fri, 1 Mar 2024 12:30:50 -0600 Subject: [PATCH] Cleanup ReplayStage loop timing struct (#35361) - Track loop_count in the struct - Rename ReplayTiming ==> ReplayLoopTiming - Make all metrics consistent to end with "_elapsed_us" --- core/src/replay_stage.rs | 244 +++++++++++++++++++++------------------ 1 file changed, 130 insertions(+), 114 deletions(-) diff --git a/core/src/replay_stage.rs b/core/src/replay_stage.rs index a80a04d47c1573..46014e3f7912de 100644 --- a/core/src/replay_stage.rs +++ b/core/src/replay_stage.rs @@ -297,86 +297,97 @@ pub struct ReplayStageConfig { pub replay_slots_concurrently: bool, } +/// Timing information for the ReplayStage main processing loop #[derive(Default)] -pub struct ReplayTiming { - last_print: u64, - collect_frozen_banks_elapsed: u64, - compute_bank_stats_elapsed: u64, - select_vote_and_reset_forks_elapsed: u64, - start_leader_elapsed: u64, - reset_bank_elapsed: u64, - voting_elapsed: u64, +struct ReplayLoopTiming { + last_submit: u64, + loop_count: u64, + collect_frozen_banks_elapsed_us: u64, + compute_bank_stats_elapsed_us: u64, + select_vote_and_reset_forks_elapsed_us: u64, + start_leader_elapsed_us: u64, + reset_bank_elapsed_us: u64, + voting_elapsed_us: u64, generate_vote_us: u64, update_commitment_cache_us: u64, - select_forks_elapsed: u64, - compute_slot_stats_elapsed: u64, - generate_new_bank_forks_elapsed: u64, - replay_active_banks_elapsed: u64, - wait_receive_elapsed: u64, - heaviest_fork_failures_elapsed: u64, + select_forks_elapsed_us: u64, + compute_slot_stats_elapsed_us: u64, + generate_new_bank_forks_elapsed_us: u64, + replay_active_banks_elapsed_us: u64, + wait_receive_elapsed_us: u64, + heaviest_fork_failures_elapsed_us: u64, bank_count: u64, - process_ancestor_hashes_duplicate_slots_elapsed: u64, - process_duplicate_confirmed_slots_elapsed: u64, - process_duplicate_slots_elapsed: u64, - process_unfrozen_gossip_verified_vote_hashes_elapsed: u64, - process_popular_pruned_forks_elapsed: u64, - repair_correct_slots_elapsed: u64, - retransmit_not_propagated_elapsed: u64, + process_ancestor_hashes_duplicate_slots_elapsed_us: u64, + process_duplicate_confirmed_slots_elapsed_us: u64, + process_duplicate_slots_elapsed_us: u64, + process_unfrozen_gossip_verified_vote_hashes_elapsed_us: u64, + process_popular_pruned_forks_elapsed_us: u64, + repair_correct_slots_elapsed_us: u64, + retransmit_not_propagated_elapsed_us: u64, generate_new_bank_forks_read_lock_us: u64, generate_new_bank_forks_get_slots_since_us: u64, generate_new_bank_forks_loop_us: u64, generate_new_bank_forks_write_lock_us: u64, - replay_blockstore_us: u64, //< When processing forks concurrently, only captures the longest fork + // When processing multiple forks concurrently, only captures the longest fork + replay_blockstore_us: u64, } -impl ReplayTiming { +impl ReplayLoopTiming { #[allow(clippy::too_many_arguments)] fn update( &mut self, - collect_frozen_banks_elapsed: u64, - compute_bank_stats_elapsed: u64, - select_vote_and_reset_forks_elapsed: u64, - start_leader_elapsed: u64, - reset_bank_elapsed: u64, - voting_elapsed: u64, - select_forks_elapsed: u64, - compute_slot_stats_elapsed: u64, - generate_new_bank_forks_elapsed: u64, - replay_active_banks_elapsed: u64, - wait_receive_elapsed: u64, - heaviest_fork_failures_elapsed: u64, + collect_frozen_banks_elapsed_us: u64, + compute_bank_stats_elapsed_us: u64, + select_vote_and_reset_forks_elapsed_us: u64, + start_leader_elapsed_us: u64, + reset_bank_elapsed_us: u64, + voting_elapsed_us: u64, + select_forks_elapsed_us: u64, + compute_slot_stats_elapsed_us: u64, + generate_new_bank_forks_elapsed_us: u64, + replay_active_banks_elapsed_us: u64, + wait_receive_elapsed_us: u64, + heaviest_fork_failures_elapsed_us: u64, bank_count: u64, - process_ancestor_hashes_duplicate_slots_elapsed: u64, - process_duplicate_confirmed_slots_elapsed: u64, - process_unfrozen_gossip_verified_vote_hashes_elapsed: u64, - process_popular_pruned_forks_elapsed: u64, - process_duplicate_slots_elapsed: u64, - repair_correct_slots_elapsed: u64, - retransmit_not_propagated_elapsed: u64, + process_ancestor_hashes_duplicate_slots_elapsed_us: u64, + process_duplicate_confirmed_slots_elapsed_us: u64, + process_unfrozen_gossip_verified_vote_hashes_elapsed_us: u64, + process_popular_pruned_forks_elapsed_us: u64, + process_duplicate_slots_elapsed_us: u64, + repair_correct_slots_elapsed_us: u64, + retransmit_not_propagated_elapsed_us: u64, ) { - self.collect_frozen_banks_elapsed += collect_frozen_banks_elapsed; - self.compute_bank_stats_elapsed += compute_bank_stats_elapsed; - self.select_vote_and_reset_forks_elapsed += select_vote_and_reset_forks_elapsed; - self.start_leader_elapsed += start_leader_elapsed; - self.reset_bank_elapsed += reset_bank_elapsed; - self.voting_elapsed += voting_elapsed; - self.select_forks_elapsed += select_forks_elapsed; - self.compute_slot_stats_elapsed += compute_slot_stats_elapsed; - self.generate_new_bank_forks_elapsed += generate_new_bank_forks_elapsed; - self.replay_active_banks_elapsed += replay_active_banks_elapsed; - self.wait_receive_elapsed += wait_receive_elapsed; - self.heaviest_fork_failures_elapsed += heaviest_fork_failures_elapsed; + self.loop_count += 1; + self.collect_frozen_banks_elapsed_us += collect_frozen_banks_elapsed_us; + self.compute_bank_stats_elapsed_us += compute_bank_stats_elapsed_us; + self.select_vote_and_reset_forks_elapsed_us += select_vote_and_reset_forks_elapsed_us; + self.start_leader_elapsed_us += start_leader_elapsed_us; + self.reset_bank_elapsed_us += reset_bank_elapsed_us; + self.voting_elapsed_us += voting_elapsed_us; + self.select_forks_elapsed_us += select_forks_elapsed_us; + self.compute_slot_stats_elapsed_us += compute_slot_stats_elapsed_us; + self.generate_new_bank_forks_elapsed_us += generate_new_bank_forks_elapsed_us; + self.replay_active_banks_elapsed_us += replay_active_banks_elapsed_us; + self.wait_receive_elapsed_us += wait_receive_elapsed_us; + self.heaviest_fork_failures_elapsed_us += heaviest_fork_failures_elapsed_us; self.bank_count += bank_count; - self.process_ancestor_hashes_duplicate_slots_elapsed += - process_ancestor_hashes_duplicate_slots_elapsed; - self.process_duplicate_confirmed_slots_elapsed += process_duplicate_confirmed_slots_elapsed; - self.process_unfrozen_gossip_verified_vote_hashes_elapsed += - process_unfrozen_gossip_verified_vote_hashes_elapsed; - self.process_popular_pruned_forks_elapsed += process_popular_pruned_forks_elapsed; - self.process_duplicate_slots_elapsed += process_duplicate_slots_elapsed; - self.repair_correct_slots_elapsed += repair_correct_slots_elapsed; - self.retransmit_not_propagated_elapsed += retransmit_not_propagated_elapsed; + self.process_ancestor_hashes_duplicate_slots_elapsed_us += + process_ancestor_hashes_duplicate_slots_elapsed_us; + self.process_duplicate_confirmed_slots_elapsed_us += + process_duplicate_confirmed_slots_elapsed_us; + self.process_unfrozen_gossip_verified_vote_hashes_elapsed_us += + process_unfrozen_gossip_verified_vote_hashes_elapsed_us; + self.process_popular_pruned_forks_elapsed_us += process_popular_pruned_forks_elapsed_us; + self.process_duplicate_slots_elapsed_us += process_duplicate_slots_elapsed_us; + self.repair_correct_slots_elapsed_us += repair_correct_slots_elapsed_us; + self.retransmit_not_propagated_elapsed_us += retransmit_not_propagated_elapsed_us; + + self.maybe_submit(); + } + + fn maybe_submit(&mut self) { let now = timestamp(); - let elapsed_ms = now - self.last_print; + let elapsed_ms = now - self.last_submit; + if elapsed_ms > 1000 { datapoint_info!( "replay-loop-voting-stats", @@ -389,93 +400,98 @@ impl ReplayTiming { ); datapoint_info!( "replay-loop-timing-stats", + ("loop_count", self.loop_count as i64, i64), ("total_elapsed_us", elapsed_ms * 1000, i64), ( - "collect_frozen_banks_elapsed", - self.collect_frozen_banks_elapsed as i64, + "collect_frozen_banks_elapsed_us", + self.collect_frozen_banks_elapsed_us as i64, + i64 + ), + ( + "compute_bank_stats_elapsed_us", + self.compute_bank_stats_elapsed_us as i64, i64 ), ( - "compute_bank_stats_elapsed", - self.compute_bank_stats_elapsed as i64, + "select_vote_and_reset_forks_elapsed_us", + self.select_vote_and_reset_forks_elapsed_us as i64, i64 ), ( - "select_vote_and_reset_forks_elapsed", - self.select_vote_and_reset_forks_elapsed as i64, + "start_leader_elapsed_us", + self.start_leader_elapsed_us as i64, i64 ), ( - "start_leader_elapsed", - self.start_leader_elapsed as i64, + "reset_bank_elapsed_us", + self.reset_bank_elapsed_us as i64, i64 ), - ("reset_bank_elapsed", self.reset_bank_elapsed as i64, i64), - ("voting_elapsed", self.voting_elapsed as i64, i64), + ("voting_elapsed_us", self.voting_elapsed_us as i64, i64), ( - "select_forks_elapsed", - self.select_forks_elapsed as i64, + "select_forks_elapsed_us", + self.select_forks_elapsed_us as i64, i64 ), ( - "compute_slot_stats_elapsed", - self.compute_slot_stats_elapsed as i64, + "compute_slot_stats_elapsed_us", + self.compute_slot_stats_elapsed_us as i64, i64 ), ( - "generate_new_bank_forks_elapsed", - self.generate_new_bank_forks_elapsed as i64, + "generate_new_bank_forks_elapsed_us", + self.generate_new_bank_forks_elapsed_us as i64, i64 ), ( - "replay_active_banks_elapsed", - self.replay_active_banks_elapsed as i64, + "replay_active_banks_elapsed_us", + self.replay_active_banks_elapsed_us as i64, i64 ), ( - "process_ancestor_hashes_duplicate_slots_elapsed", - self.process_ancestor_hashes_duplicate_slots_elapsed as i64, + "process_ancestor_hashes_duplicate_slots_elapsed_us", + self.process_ancestor_hashes_duplicate_slots_elapsed_us as i64, i64 ), ( - "process_duplicate_confirmed_slots_elapsed", - self.process_duplicate_confirmed_slots_elapsed as i64, + "process_duplicate_confirmed_slots_elapsed_us", + self.process_duplicate_confirmed_slots_elapsed_us as i64, i64 ), ( - "process_unfrozen_gossip_verified_vote_hashes_elapsed", - self.process_unfrozen_gossip_verified_vote_hashes_elapsed as i64, + "process_unfrozen_gossip_verified_vote_hashes_elapsed_us", + self.process_unfrozen_gossip_verified_vote_hashes_elapsed_us as i64, i64 ), ( - "process_popular_pruned_forks_elapsed", - self.process_popular_pruned_forks_elapsed as i64, + "process_popular_pruned_forks_elapsed_us", + self.process_popular_pruned_forks_elapsed_us as i64, i64 ), ( - "wait_receive_elapsed", - self.wait_receive_elapsed as i64, + "wait_receive_elapsed_us", + self.wait_receive_elapsed_us as i64, i64 ), ( - "heaviest_fork_failures_elapsed", - self.heaviest_fork_failures_elapsed as i64, + "heaviest_fork_failures_elapsed_us", + self.heaviest_fork_failures_elapsed_us as i64, i64 ), ("bank_count", self.bank_count as i64, i64), ( - "process_duplicate_slots_elapsed", - self.process_duplicate_slots_elapsed as i64, + "process_duplicate_slots_elapsed_us", + self.process_duplicate_slots_elapsed_us as i64, i64 ), ( - "repair_correct_slots_elapsed", - self.repair_correct_slots_elapsed as i64, + "repair_correct_slots_elapsed_us", + self.repair_correct_slots_elapsed_us as i64, i64 ), ( - "retransmit_not_propagated_elapsed", - self.retransmit_not_propagated_elapsed as i64, + "retransmit_not_propagated_elapsed_us", + self.retransmit_not_propagated_elapsed_us as i64, i64 ), ( @@ -504,8 +520,8 @@ impl ReplayTiming { i64 ), ); - *self = ReplayTiming::default(); - self.last_print = now; + *self = ReplayLoopTiming::default(); + self.last_submit = now; } } } @@ -615,7 +631,7 @@ impl ReplayStage { let mut last_reset = Hash::default(); let mut partition_info = PartitionInfo::new(); let mut skipped_slots_info = SkippedSlotsInfo::default(); - let mut replay_timing = ReplayTiming::default(); + let mut replay_timing = ReplayLoopTiming::default(); let mut duplicate_slots_tracker = DuplicateSlotsTracker::default(); let mut duplicate_confirmed_slots: DuplicateConfirmedSlots = DuplicateConfirmedSlots::default(); @@ -2272,7 +2288,7 @@ impl ReplayStage { unfrozen_gossip_verified_vote_hashes: &mut UnfrozenGossipVerifiedVoteHashes, vote_signatures: &mut Vec, has_new_vote_been_rooted: &mut bool, - replay_timing: &mut ReplayTiming, + replay_timing: &mut ReplayLoopTiming, voting_sender: &Sender, epoch_slots_frozen_slots: &mut EpochSlotsFrozenSlots, drop_bank_sender: &Sender>>, @@ -2604,7 +2620,7 @@ impl ReplayStage { switch_fork_decision: &SwitchForkDecision, vote_signatures: &mut Vec, has_new_vote_been_rooted: bool, - replay_timing: &mut ReplayTiming, + replay_timing: &mut ReplayLoopTiming, voting_sender: &Sender, wait_to_vote_slot: Option, ) { @@ -2697,7 +2713,7 @@ impl ReplayStage { entry_notification_sender: Option<&EntryNotifierSender>, verify_recyclers: &VerifyRecyclers, replay_vote_sender: &ReplayVoteSender, - replay_timing: &mut ReplayTiming, + replay_timing: &mut ReplayLoopTiming, log_messages_bytes_limit: Option, active_bank_slots: &[Slot], prioritization_fee_cache: &PrioritizationFeeCache, @@ -2810,7 +2826,7 @@ impl ReplayStage { entry_notification_sender: Option<&EntryNotifierSender>, verify_recyclers: &VerifyRecyclers, replay_vote_sender: &ReplayVoteSender, - replay_timing: &mut ReplayTiming, + replay_timing: &mut ReplayLoopTiming, log_messages_bytes_limit: Option, bank_slot: Slot, prioritization_fee_cache: &PrioritizationFeeCache, @@ -3157,7 +3173,7 @@ impl ReplayStage { duplicate_slots_to_repair: &mut DuplicateSlotsToRepair, ancestor_hashes_replay_update_sender: &AncestorHashesReplayUpdateSender, block_metadata_notifier: Option, - replay_timing: &mut ReplayTiming, + replay_timing: &mut ReplayLoopTiming, log_messages_bytes_limit: Option, replay_slots_concurrently: bool, prioritization_fee_cache: &PrioritizationFeeCache, @@ -4140,7 +4156,7 @@ impl ReplayStage { leader_schedule_cache: &Arc, rpc_subscriptions: &Arc, progress: &mut ProgressMap, - replay_timing: &mut ReplayTiming, + replay_timing: &mut ReplayLoopTiming, ) { // Find the next slot that chains to the old slot let mut generate_new_bank_forks_read_lock = @@ -4530,7 +4546,7 @@ pub(crate) mod tests { .unwrap() .get(NUM_CONSECUTIVE_LEADER_SLOTS) .is_none()); - let mut replay_timing = ReplayTiming::default(); + let mut replay_timing = ReplayLoopTiming::default(); ReplayStage::generate_new_bank_forks( &blockstore, &bank_forks, @@ -6351,7 +6367,7 @@ pub(crate) mod tests { .. } = vote_simulator; - let mut replay_timing = ReplayTiming::default(); + let mut replay_timing = ReplayLoopTiming::default(); // Create bank 7 and insert to blockstore and bank forks let root_bank = bank_forks.read().unwrap().root_bank(); @@ -7543,7 +7559,7 @@ pub(crate) mod tests { &SwitchForkDecision::SameFork, &mut voted_signatures, has_new_vote_been_rooted, - &mut ReplayTiming::default(), + &mut ReplayLoopTiming::default(), &voting_sender, None, ); @@ -7618,7 +7634,7 @@ pub(crate) mod tests { &SwitchForkDecision::SameFork, &mut voted_signatures, has_new_vote_been_rooted, - &mut ReplayTiming::default(), + &mut ReplayLoopTiming::default(), &voting_sender, None, ); @@ -7819,7 +7835,7 @@ pub(crate) mod tests { &SwitchForkDecision::SameFork, voted_signatures, has_new_vote_been_rooted, - &mut ReplayTiming::default(), + &mut ReplayLoopTiming::default(), voting_sender, None, );