diff --git a/src/cpp/include/openvino/genai/perf_metrics.hpp b/src/cpp/include/openvino/genai/perf_metrics.hpp
index f256a123de..0960bbb4fd 100644
--- a/src/cpp/include/openvino/genai/perf_metrics.hpp
+++ b/src/cpp/include/openvino/genai/perf_metrics.hpp
@@ -23,8 +23,10 @@ using MicroSeconds = std::chrono::duration<float, std::ratio<1, 1000000>>;
  * @param detokenization_durations Durations for the detokenization process in microseconds.
  * @param m_times_to_first_token Times to the first token for each call in microseconds.
  * @param m_new_token_times Time points for each new token generated.
+ * @param m_token_infer_durations Inference time for each token in microseconds.
  * @param m_batch_sizes Batch sizes for each generate call.
  * @param m_durations Total durations for each generate call in microseconds.
+ * @param m_inference_durations Total inference duration for each generate call in microseconds.
  * @param num_generated_tokens Total number of tokens generated.
  * @param num_input_tokens Total number of tokens in the input prompt.
  */
@@ -35,8 +37,10 @@ struct OPENVINO_GENAI_EXPORTS RawPerfMetrics {
 
     std::vector<MicroSeconds> m_times_to_first_token;
     std::vector<TimePoint> m_new_token_times;
+    std::vector<MicroSeconds> m_token_infer_durations;
     std::vector<size_t> m_batch_sizes;
     std::vector<MicroSeconds> m_durations;
+    std::vector<MicroSeconds> m_inference_durations;
 };
 
 /**
@@ -96,13 +100,15 @@ struct OPENVINO_GENAI_EXPORTS MeanStdPair {
  */
 struct OPENVINO_GENAI_EXPORTS PerfMetrics {
     float load_time;        // Load time in ms.
-    MeanStdPair ttft;       // Time to the first token (in ms) (TTTFT).
+    MeanStdPair ttft;       // Time to the first token (in ms) (TTFT).
     MeanStdPair tpot;       // Time (in ms) per output token (TPOT).
+    MeanStdPair ipot;       // Inference time (in ms) per output token.
     MeanStdPair throughput; // Tokens per second.
 
     MeanStdPair generate_duration;
-    MeanStdPair tokenization_duration = {-1, -1};
-    MeanStdPair detokenization_duration = {-1. -1};
+    MeanStdPair inference_duration;
+    MeanStdPair tokenization_duration = {-1.0f, -1.0f};
+    MeanStdPair detokenization_duration = {-1.0f, -1.0f};
 
     size_t num_generated_tokens;
     size_t num_input_tokens;
@@ -112,8 +118,10 @@ struct OPENVINO_GENAI_EXPORTS PerfMetrics {
     size_t get_num_input_tokens();
     MeanStdPair get_ttft();       // Time to the first token (in ms) (TTFT).
     MeanStdPair get_tpot();       // Time (in ms) per output token (TPOT).
+    MeanStdPair get_ipot();       // Inference time (in ms) per output token.
     MeanStdPair get_throughput(); // Tokens per second.
 
+    MeanStdPair get_inference_duration();       // in ms
     MeanStdPair get_generate_duration();        // in ms
     MeanStdPair get_tokenization_duration();    // in ms
     MeanStdPair get_detokenization_duration();  // in ms
@@ -133,7 +141,7 @@ struct OPENVINO_GENAI_EXPORTS PerfMetrics {
     /**
      * @brief convert duration to microseconds
      *
-     * @param duration duration in
+     * @param duration steady clock duration
      */
     static float get_microsec(std::chrono::steady_clock::duration duration);
     PerfMetrics operator+(const PerfMetrics& metrics) const;
diff --git a/src/cpp/src/greedy_decoding.cpp b/src/cpp/src/greedy_decoding.cpp
index 8dc56b4ba8..95a1843645 100644
--- a/src/cpp/src/greedy_decoding.cpp
+++ b/src/cpp/src/greedy_decoding.cpp
@@ -24,7 +24,11 @@ EncodedResults greedy_decoding(
 
     // Initialize results and performance metrics.
     EncodedResults results;
     auto& raw_perf_counters = results.perf_metrics.raw_metrics;
-
+    raw_perf_counters.m_new_token_times.reserve(max_new_tokens);
+    raw_perf_counters.m_batch_sizes.reserve(max_new_tokens);
+    raw_perf_counters.m_token_infer_durations.reserve(max_new_tokens);
+    raw_perf_counters.m_inference_durations = {{ MicroSeconds(0.0f) }};
+
     results.scores.resize(running_batch_size);
     results.tokens.resize(running_batch_size);
     std::fill(results.scores.begin(), results.scores.end(), 0);
@@ -38,8 +42,13 @@ EncodedResults greedy_decoding(
     auto beam_data = m_model_runner.get_tensor("beam_idx").data<int32_t>();
     std::iota(beam_data, beam_data + running_batch_size, 0);
 
+    const auto infer_start = std::chrono::steady_clock::now();
     m_model_runner.infer();
+    const auto infer_ms = PerfMetrics::get_microsec(std::chrono::steady_clock::now() - infer_start);
+    raw_perf_counters.m_inference_durations[0] = MicroSeconds(infer_ms);
+    raw_perf_counters.m_token_infer_durations.emplace_back(infer_ms);
     auto logits = m_model_runner.get_tensor("logits");
+
     ov::Shape logits_shape = logits.get_shape();
     size_t seq_len = logits_shape[1], vocab_size = logits_shape[2];
     m_model_runner.get_tensor("input_ids").set_shape({running_batch_size, 1});
@@ -71,8 +80,14 @@ EncodedResults greedy_decoding(
         utils::update_position_ids(m_model_runner.get_tensor("position_ids"), m_model_runner.get_tensor("attention_mask"));
         m_model_runner.set_tensor("attention_mask", utils::extend_attention(m_model_runner.get_tensor("attention_mask")));
 
+        const auto infer_start = std::chrono::steady_clock::now();
         m_model_runner.infer();
+        const auto infer_ms = PerfMetrics::get_microsec(std::chrono::steady_clock::now() - infer_start);
+        raw_perf_counters.m_inference_durations[0] += MicroSeconds(infer_ms);
+        raw_perf_counters.m_token_infer_durations.emplace_back(infer_ms);
+
         auto logits = m_model_runner.get_tensor("logits");
+        ov::Shape logits_shape = logits.get_shape();
         size_t seq_len = logits_shape[1], vocab_size = logits_shape[2];
diff --git a/src/cpp/src/multinomial_decoding.cpp b/src/cpp/src/multinomial_decoding.cpp
index b00c62aed7..7e298b5598 100644
--- a/src/cpp/src/multinomial_decoding.cpp
+++ b/src/cpp/src/multinomial_decoding.cpp
@@ -156,15 +156,20 @@ ov::genai::EncodedResults multinominal_decoding(ov::InferRequest& m_model_runner
                                                 std::shared_ptr<ov::genai::StreamerBase> streamer,
                                                 std::optional<ov::Tensor> position_ids) {
     ov::Shape prompts_shape = input_ids.get_shape();
-    size_t batch_size = prompts_shape[0];
+    const size_t batch_size = prompts_shape[0];
     OPENVINO_ASSERT(batch_size == 1, "Only batch size = 1 supported for multinomial decoding");
-    size_t prompt_len = prompts_shape[1];
+    const size_t prompt_len = prompts_shape[1];
+    const size_t max_new_tokens = config.get_max_new_tokens(prompt_len);
 
     // Initialize results and performance metrics.
     EncodedResults results;
     auto& raw_perf_counters = results.perf_metrics.raw_metrics;
+    raw_perf_counters.m_new_token_times.reserve(max_new_tokens);
+    raw_perf_counters.m_batch_sizes.reserve(max_new_tokens);
+    raw_perf_counters.m_token_infer_durations.reserve(max_new_tokens);
+    raw_perf_counters.m_inference_durations = {{ MicroSeconds(0.0f) }};
 
     results.scores.resize(batch_size, 0);
     results.tokens.resize(batch_size);
@@ -180,8 +185,13 @@ ov::genai::EncodedResults multinominal_decoding(ov::InferRequest& m_model_runner
     m_model_runner.get_tensor("beam_idx").set_shape({batch_size});
     m_model_runner.get_tensor("beam_idx").data<int32_t>()[0] = 0;
 
+    const auto infer_start = std::chrono::steady_clock::now();
     m_model_runner.infer();
-    raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now());
+    const auto infer_end = std::chrono::steady_clock::now();
+    const auto infer_ms = PerfMetrics::get_microsec(infer_end - infer_start);
+    raw_perf_counters.m_inference_durations[0] += MicroSeconds(infer_ms);
+    raw_perf_counters.m_token_infer_durations.emplace_back(infer_ms);
+    raw_perf_counters.m_new_token_times.emplace_back(infer_end);
     raw_perf_counters.m_batch_sizes.emplace_back(batch_size);
 
     auto logits_tensor = m_model_runner.get_tensor("logits");
@@ -213,8 +223,6 @@ ov::genai::EncodedResults multinominal_decoding(ov::InferRequest& m_model_runner
 
     m_model_runner.get_tensor("input_ids").set_shape({batch_size, 1});
 
-    size_t max_new_tokens = config.get_max_new_tokens(prompt_len);
-
     for (size_t i = 0; i < max_new_tokens - 1; i++) {
         if (position_ids.has_value()) {
             ov::genai::utils::update_position_ids(m_model_runner.get_tensor("position_ids"),
@@ -225,8 +233,13 @@ ov::genai::EncodedResults multinominal_decoding(ov::InferRequest& m_model_runner
 
         m_model_runner.get_tensor("input_ids").data<int64_t>()[0] = out_token.id;
 
+        const auto infer_start = std::chrono::steady_clock::now();
         m_model_runner.infer();
-        raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now());
+        const auto infer_end = std::chrono::steady_clock::now();
+        const auto infer_ms = PerfMetrics::get_microsec(infer_end - infer_start);
+        raw_perf_counters.m_inference_durations[0] += MicroSeconds(infer_ms);
+        raw_perf_counters.m_token_infer_durations.emplace_back(infer_ms);
+        raw_perf_counters.m_new_token_times.emplace_back(infer_end);
         raw_perf_counters.m_batch_sizes.emplace_back(batch_size);
 
         logits = m_model_runner.get_tensor("logits").data<float>();
diff --git a/src/cpp/src/perf_metrics.cpp b/src/cpp/src/perf_metrics.cpp
index 9ed0233232..678900a9d9 100644
--- a/src/cpp/src/perf_metrics.cpp
+++ b/src/cpp/src/perf_metrics.cpp
@@ -11,7 +11,7 @@ namespace {
 
 ov::genai::MeanStdPair calc_mean_and_std(const std::vector<ov::genai::MicroSeconds>& durations) {
     if (durations.size() == 0) {
-        return {-1, -1};
+        return {-1.0f, -1.0f};
     }
     // Accepts time durations in microseconds and returns standard deviation and mean in milliseconds.
     float mean = std::accumulate(durations.begin(), durations.end(), 0.0f,
@@ -59,6 +59,11 @@ MeanStdPair PerfMetrics::get_tpot() {
     return tpot;
 }
 
+MeanStdPair PerfMetrics::get_ipot() {
+    evaluate_statistics();
+    return ipot;
+}
+
 MeanStdPair PerfMetrics::get_throughput() {
     evaluate_statistics();
     return throughput;
@@ -79,6 +84,11 @@ MeanStdPair PerfMetrics::get_detokenization_duration() {
     return detokenization_duration;
 }
 
+MeanStdPair PerfMetrics::get_inference_duration() {
+    evaluate_statistics();
+    return inference_duration;
+}
+
 float PerfMetrics::get_microsec(std::chrono::steady_clock::duration duration) {
     return std::chrono::duration_cast<std::chrono::microseconds>(duration).count();
 }
@@ -96,7 +106,7 @@ void PerfMetrics::evaluate_statistics(std::optional<TimePoint> start_time) {
         auto ttft = tok_times[0] - start_time_val;
         raw_metrics.m_times_to_first_token = std::vector<MicroSeconds>();
-        raw_metrics.m_times_to_first_token.emplace_back(ttft);
+        raw_metrics.m_times_to_first_token.emplace_back(ttft / batch_sizes[0]);
 
         num_generated_tokens = 0;
         for (size_t i = 0; i < tok_times.size(); ++i) {
             raw_metrics.m_durations[i] = tok_times[i] - start_time_val;
@@ -110,12 +120,14 @@ void PerfMetrics::evaluate_statistics(std::optional<TimePoint> start_time) {
 
     // calc_mean_and_std will convert microsecond to milliseconds.
     tpot = calc_mean_and_std(raw_metrics.m_durations);
+    ipot = calc_mean_and_std(raw_metrics.m_token_infer_durations);
     ttft = calc_mean_and_std(raw_metrics.m_times_to_first_token);
 
     generate_duration = calc_mean_and_std(raw_metrics.generate_durations);
     tokenization_duration = calc_mean_and_std(raw_metrics.tokenization_durations);
-    detokenization_duration = calc_mean_and_std(raw_metrics.detokenization_durations);
-
+    detokenization_duration = calc_mean_and_std(raw_metrics.detokenization_durations);
+    inference_duration = calc_mean_and_std(raw_metrics.m_inference_durations);
+
     // tokens per second
     throughput = {1000.0f / tpot.mean, (tpot.std * 1000.0f) / (tpot.mean * tpot.mean)};
     m_evaluated = true;
@@ -151,9 +163,8 @@ PerfMetrics PerfMetrics::operator+(const PerfMetrics& right) const {
     new_detok_durations.insert(new_detok_durations.end(), right_detok_durations.begin(), right_detok_durations.end());
     new_gen_durations.insert(new_gen_durations.end(), right_gen_durations.begin(), right_gen_durations.end());
 
-    res.num_generated_tokens = num_generated_tokens + right.num_generated_tokens;
-    res.num_input_tokens = num_generated_tokens + right.num_input_tokens;
-    res.load_time = load_time;
+    res.num_generated_tokens += right.num_generated_tokens;
+    res.num_input_tokens += right.num_input_tokens;
    res.m_evaluated = false;
    return res;
 }
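
The instrumentation wrapped around every `m_model_runner.infer()` call above is one repeated pattern: take a `steady_clock` timestamp, run inference, then fold the elapsed time into both the per-token vector (`m_token_infer_durations`) and the running per-call total (`m_inference_durations[0]`). A minimal self-contained sketch of that pattern; `InferStats` and `timed_infer` are illustrative names, not part of the patch:

```cpp
#include <chrono>
#include <utility>
#include <vector>

// Same float-microseconds alias the header declares.
using MicroSeconds = std::chrono::duration<float, std::ratio<1, 1000000>>;

struct InferStats {
    std::vector<MicroSeconds> per_token;  // mirrors raw_metrics.m_token_infer_durations
    MicroSeconds total{0.0f};             // mirrors raw_metrics.m_inference_durations[0]
};

// Times a single inference call and records its duration twice:
// once as a per-token sample, once into the per-generate() running total.
template <typename InferFn>
void timed_infer(InferStats& stats, InferFn&& infer) {
    const auto start = std::chrono::steady_clock::now();
    std::forward<InferFn>(infer)();  // stands in for m_model_runner.infer()
    const auto elapsed = std::chrono::duration_cast<MicroSeconds>(
        std::chrono::steady_clock::now() - start);
    stats.per_token.emplace_back(elapsed);
    stats.total += elapsed;
}
```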
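The `throughput` assignment at the end of `evaluate_statistics` encodes first-order error propagation. With TPOT measured in ms/token, throughput in tokens/s is `1000 / tpot.mean`, and its standard deviation follows from the derivative of that mapping (a sketch of the algebra, not text from the patch):

```latex
T = \frac{1000}{t}, \qquad
\sigma_T \approx \left|\frac{dT}{dt}\right| \sigma_t
        = \frac{1000\,\sigma_t}{t^{2}},
\quad\text{where } t = \mathrm{tpot.mean},\ \sigma_t = \mathrm{tpot.std},
```

which is exactly `(tpot.std * 1000.0f) / (tpot.mean * tpot.mean)` in the code.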
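From the caller's side, the new fields surface through `get_ipot()` and `get_inference_duration()` next to the existing getters. A hedged usage sketch, assuming the usual `LLMPipeline` flow from the samples; the model directory and prompt are placeholders:

```cpp
#include <iostream>
#include "openvino/genai/llm_pipeline.hpp"

int main() {
    // Placeholder model directory; any OpenVINO GenAI LLM export works here.
    ov::genai::LLMPipeline pipe("TinyLlama-1.1B-Chat-v1.0", "CPU");

    auto result = pipe.generate("The Sun is yellow because", ov::genai::max_new_tokens(64));
    auto metrics = result.perf_metrics;

    // Each getter lazily runs evaluate_statistics(); values are in ms.
    std::cout << "TTFT: " << metrics.get_ttft().mean << " ms\n";
    std::cout << "TPOT: " << metrics.get_tpot().mean << " ms/token\n";
    std::cout << "IPOT: " << metrics.get_ipot().mean << " ms/token\n";                        // new
    std::cout << "Inference duration: " << metrics.get_inference_duration().mean << " ms\n";  // new
    std::cout << "Generate duration:  " << metrics.get_generate_duration().mean << " ms\n";
}
```

Comparing `get_ipot()` against `get_tpot()` shows how much of each token's latency is spent inside `infer()` versus sampling, tensor bookkeeping, and streaming.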