[perf metrics] Added infer timing stats and bug fixes (#843)
- Collect inference-only time statistics (IPOT: inference time per output token) and compute both total and per-token values
 - Bug fixes
 - Typo fixes
ialbrecht authored Sep 9, 2024
1 parent a722f8d commit 38eed7d
Showing 4 changed files with 65 additions and 18 deletions.
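For orientation before the per-file diffs: the new statistics surface through two `PerfMetrics` getters introduced below, `get_ipot()` and `get_inference_duration()`. A minimal usage sketch, assuming the usual `LLMPipeline` flow from the GenAI samples (the model directory is a placeholder):

```cpp
#include "openvino/genai/llm_pipeline.hpp"
#include <iostream>

int main() {
    // Placeholder model directory and device.
    ov::genai::LLMPipeline pipe("TinyLlama-1.1B-Chat-v1.0", "CPU");

    ov::genai::GenerationConfig config;
    config.max_new_tokens = 64;
    auto result = pipe.generate("Why is the sky blue?", config);

    auto metrics = result.perf_metrics;
    // New in this commit: inference-only timings.
    std::cout << "IPOT mean (ms): " << metrics.get_ipot().mean << '\n'
              << "Inference duration mean (ms): " << metrics.get_inference_duration().mean << '\n'
              // Pre-existing wall-clock counterpart, for comparison.
              << "TPOT mean (ms): " << metrics.get_tpot().mean << '\n';
    return 0;
}
```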
16 changes: 12 additions & 4 deletions src/cpp/include/openvino/genai/perf_metrics.hpp
@@ -23,8 +23,10 @@ using MicroSeconds = std::chrono::duration<float, std::ratio<1, 1000000>>;
* @param detokenization_durations Durations for the detokenization process in microseconds.
* @param m_times_to_first_token Times to the first token for each call in microseconds.
* @param m_new_token_times Time points for each new token generated.
* @param m_token_infer_durations Inference time for each token in microseconds.
* @param m_batch_sizes Batch sizes for each generate call.
* @param m_durations Total durations for each generate call in microseconds.
* @param m_inference_durations Total inference duration for each generate call in microseconds.
* @param num_generated_tokens Total number of tokens generated.
* @param num_input_tokens Total number of tokens in the input prompt.
*/
@@ -35,8 +37,10 @@ struct OPENVINO_GENAI_EXPORTS RawPerfMetrics {

std::vector<MicroSeconds> m_times_to_first_token;
std::vector<TimePoint> m_new_token_times;
std::vector<MicroSeconds> m_token_infer_durations;
std::vector<size_t> m_batch_sizes;
std::vector<MicroSeconds> m_durations;
std::vector<MicroSeconds> m_inference_durations;
};

/**
@@ -96,13 +100,15 @@ struct OPENVINO_GENAI_EXPORTS MeanStdPair {
*/
struct OPENVINO_GENAI_EXPORTS PerfMetrics {
float load_time; // Load time in ms.
MeanStdPair ttft; // Time to the first token (in ms) (TTTFT).
MeanStdPair ttft; // Time to the first token (in ms) (TTFT).
MeanStdPair tpot; // Time (in ms) per output token (TPOT).
MeanStdPair ipot; // Inference time (in ms) per output token.
MeanStdPair throughput; // Tokens per second.

MeanStdPair generate_duration;
MeanStdPair tokenization_duration = {-1, -1};
MeanStdPair detokenization_duration = {-1. -1};
MeanStdPair inference_duration;
MeanStdPair tokenization_duration = {-1.0f, -1.0f};
MeanStdPair detokenization_duration = {-1.0f, -1.0f};

size_t num_generated_tokens;
size_t num_input_tokens;
@@ -112,8 +118,10 @@ struct OPENVINO_GENAI_EXPORTS PerfMetrics {
size_t get_num_input_tokens();
MeanStdPair get_ttft(); // Time to the first token (in ms) (TTFT).
MeanStdPair get_tpot(); // Time (in ms) per output token (TPOT).
MeanStdPair get_ipot(); // Inference time (in ms) per output token.
MeanStdPair get_throughput(); // Tokens per second.

MeanStdPair get_inference_duration(); // in ms
MeanStdPair get_generate_duration(); // in ms
MeanStdPair get_tokenization_duration(); // in ms
MeanStdPair get_detokenization_duration(); // in ms
@@ -133,7 +141,7 @@ struct OPENVINO_GENAI_EXPORTS PerfMetrics {
/**
* @brief convert duration to microseconds
*
* @param duration duration in
* @param duration steady clock duration
*/
static float get_microsec(std::chrono::steady_clock::duration duration);
PerfMetrics operator+(const PerfMetrics& metrics) const;
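To make the distinction above concrete: `tpot` measures wall-clock time per output token, which includes sampling, tensor bookkeeping, and any streamer callback, while the new `ipot` counts only the time spent inside `infer()`. Their difference is a rough per-token overhead estimate — a sketch, not part of the API:

```cpp
#include "openvino/genai/perf_metrics.hpp"

// Rough per-token overhead: wall-clock time per token (TPOT) minus
// infer()-only time (IPOT). Assumes `metrics` comes from a completed
// generate() call; both getters trigger statistics evaluation as needed.
float per_token_overhead_ms(ov::genai::PerfMetrics& metrics) {
    return metrics.get_tpot().mean - metrics.get_ipot().mean;
}
```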
17 changes: 16 additions & 1 deletion src/cpp/src/greedy_decoding.cpp
@@ -24,7 +24,11 @@ EncodedResults greedy_decoding(
// Initialize results and performance metrics.
EncodedResults results;
auto& raw_perf_counters = results.perf_metrics.raw_metrics;

raw_perf_counters.m_new_token_times.reserve(max_new_tokens);
raw_perf_counters.m_batch_sizes.reserve(max_new_tokens);
raw_perf_counters.m_token_infer_durations.reserve(max_new_tokens);
raw_perf_counters.m_inference_durations = {{ MicroSeconds(0.0f) }};

results.scores.resize(running_batch_size);
results.tokens.resize(running_batch_size);
std::fill(results.scores.begin(), results.scores.end(), 0);
@@ -38,8 +42,13 @@ EncodedResults greedy_decoding(
auto beam_data = m_model_runner.get_tensor("beam_idx").data<int32_t>();
std::iota(beam_data, beam_data + running_batch_size, 0);

const auto infer_start = std::chrono::steady_clock::now();
m_model_runner.infer();
const auto infer_ms = PerfMetrics::get_microsec(std::chrono::steady_clock::now() - infer_start);
raw_perf_counters.m_inference_durations[0] = MicroSeconds(infer_ms);
raw_perf_counters.m_token_infer_durations.emplace_back(infer_ms);
auto logits = m_model_runner.get_tensor("logits");

ov::Shape logits_shape = logits.get_shape();
size_t seq_len = logits_shape[1], vocab_size = logits_shape[2];
m_model_runner.get_tensor("input_ids").set_shape({running_batch_size, 1});
@@ -71,8 +80,14 @@ EncodedResults greedy_decoding(
utils::update_position_ids(m_model_runner.get_tensor("position_ids"), m_model_runner.get_tensor("attention_mask"));
m_model_runner.set_tensor("attention_mask", utils::extend_attention(m_model_runner.get_tensor("attention_mask")));

const auto infer_start = std::chrono::steady_clock::now();
m_model_runner.infer();
const auto infer_ms = PerfMetrics::get_microsec(std::chrono::steady_clock::now() - infer_start);
raw_perf_counters.m_inference_durations[0] += MicroSeconds(infer_ms);
raw_perf_counters.m_token_infer_durations.emplace_back(infer_ms);

auto logits = m_model_runner.get_tensor("logits");

ov::Shape logits_shape = logits.get_shape();
size_t seq_len = logits_shape[1], vocab_size = logits_shape[2];

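The instrumentation added above follows one pattern: take a `steady_clock` timestamp around each `infer()`, accumulate the delta into `m_inference_durations[0]`, and append it to `m_token_infer_durations`. The multinomial decoder below repeats it. A standalone sketch of the same stopwatch idea, with an illustrative helper that is not part of the codebase:

```cpp
#include <chrono>
#include <vector>

using MicroSeconds = std::chrono::duration<float, std::ratio<1, 1000000>>;

struct InferTimings {
    MicroSeconds total{0.0f};             // plays the role of m_inference_durations[0]
    std::vector<MicroSeconds> per_token;  // plays the role of m_token_infer_durations
};

// Time one infer-like callable and record both the running total and the
// per-token sample, mirroring the pattern used in the decoding loops.
template <typename Infer>
void timed_infer(Infer&& infer, InferTimings& timings) {
    const auto start = std::chrono::steady_clock::now();
    infer();
    const float infer_us = std::chrono::duration_cast<std::chrono::microseconds>(
                               std::chrono::steady_clock::now() - start)
                               .count();
    timings.total += MicroSeconds(infer_us);
    timings.per_token.emplace_back(infer_us);
}
```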
25 changes: 19 additions & 6 deletions src/cpp/src/multinomial_decoding.cpp
@@ -156,15 +156,20 @@ ov::genai::EncodedResults multinominal_decoding(ov::InferRequest& m_model_runner
std::shared_ptr<ov::genai::StreamerBase> streamer,
std::optional<ov::Tensor> position_ids) {
ov::Shape prompts_shape = input_ids.get_shape();
size_t batch_size = prompts_shape[0];
const size_t batch_size = prompts_shape[0];

OPENVINO_ASSERT(batch_size == 1, "Only batch size = 1 supported for multinomial decoding");

size_t prompt_len = prompts_shape[1];
const size_t prompt_len = prompts_shape[1];
const size_t max_new_tokens = config.get_max_new_tokens(prompt_len);

// Initialize results and performance metrics.
EncodedResults results;
auto& raw_perf_counters = results.perf_metrics.raw_metrics;
raw_perf_counters.m_new_token_times.reserve(max_new_tokens);
raw_perf_counters.m_batch_sizes.reserve(max_new_tokens);
raw_perf_counters.m_token_infer_durations.reserve(max_new_tokens);
raw_perf_counters.m_inference_durations = {{ MicroSeconds(0.0f) }};
results.scores.resize(batch_size, 0);
results.tokens.resize(batch_size);

@@ -180,8 +185,13 @@ ov::genai::EncodedResults multinominal_decoding(ov::InferRequest& m_model_runner
m_model_runner.get_tensor("beam_idx").set_shape({batch_size});
m_model_runner.get_tensor("beam_idx").data<int32_t>()[0] = 0;

const auto infer_start = std::chrono::steady_clock::now();
m_model_runner.infer();
raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now());
const auto infer_end = std::chrono::steady_clock::now();
const auto infer_ms = PerfMetrics::get_microsec(infer_end - infer_start);
raw_perf_counters.m_inference_durations[0] += MicroSeconds(infer_ms);
raw_perf_counters.m_token_infer_durations.emplace_back(infer_ms);
raw_perf_counters.m_new_token_times.emplace_back(infer_end);
raw_perf_counters.m_batch_sizes.emplace_back(batch_size);

auto logits_tensor = m_model_runner.get_tensor("logits");
@@ -213,8 +223,6 @@ ov::genai::EncodedResults multinominal_decoding(ov::InferRequest& m_model_runner

m_model_runner.get_tensor("input_ids").set_shape({batch_size, 1});

size_t max_new_tokens = config.get_max_new_tokens(prompt_len);

for (size_t i = 0; i < max_new_tokens - 1; i++) {
if (position_ids.has_value()) {
ov::genai::utils::update_position_ids(m_model_runner.get_tensor("position_ids"),
@@ -225,8 +233,13 @@ ov::genai::EncodedResults multinominal_decoding(ov::InferRequest& m_model_runner

m_model_runner.get_tensor("input_ids").data<int64_t>()[0] = out_token.id;

const auto infer_start = std::chrono::steady_clock::now();
m_model_runner.infer();
raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now());
const auto infer_end = std::chrono::steady_clock::now();
const auto infer_ms = PerfMetrics::get_microsec(infer_end - infer_start);
raw_perf_counters.m_inference_durations[0] += MicroSeconds(infer_ms);
raw_perf_counters.m_token_infer_durations.emplace_back(infer_ms);
raw_perf_counters.m_new_token_times.emplace_back(infer_end);
raw_perf_counters.m_batch_sizes.emplace_back(batch_size);

logits = m_model_runner.get_tensor("logits").data<float>();
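Note the shape of the raw data both decoders now produce: `m_token_infer_durations` gets one entry per `infer()` call, while `m_inference_durations` holds a single accumulated total per generate call, so `ipot` averages over tokens and `inference_duration` averages over calls. A toy illustration with made-up numbers:

```cpp
#include <iostream>
#include <numeric>
#include <vector>

int main() {
    // Made-up per-infer() durations in microseconds from one generate call:
    // a prefill step followed by two decode steps.
    std::vector<float> token_infer_us = {9000.0f, 4000.0f, 4100.0f};
    const float total_us =
        std::accumulate(token_infer_us.begin(), token_infer_us.end(), 0.0f);

    // ipot averages the per-token entries; inference_duration uses the
    // accumulated per-call total. Both are reported in milliseconds.
    std::cout << "ipot mean (ms): "
              << total_us / token_infer_us.size() / 1000.0f << '\n'
              << "inference_duration (ms): " << total_us / 1000.0f << '\n';
    return 0;
}
```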
25 changes: 18 additions & 7 deletions src/cpp/src/perf_metrics.cpp
@@ -11,7 +11,7 @@ namespace {

ov::genai::MeanStdPair calc_mean_and_std(const std::vector<ov::genai::MicroSeconds>& durations) {
if (durations.size() == 0) {
return {-1, -1};
return {-1.0f, -1.0f};
}
// Accepts time durations in microseconds and returns standard deviation and mean in milliseconds.
float mean = std::accumulate(durations.begin(), durations.end(), 0.0f,
@@ -59,6 +59,11 @@ MeanStdPair PerfMetrics::get_tpot() {
return tpot;
}

MeanStdPair PerfMetrics::get_ipot() {
evaluate_statistics();
return ipot;
}

MeanStdPair PerfMetrics::get_throughput() {
evaluate_statistics();
return throughput;
@@ -79,6 +84,11 @@ MeanStdPair PerfMetrics::get_detokenization_duration() {
return detokenization_duration;
}

MeanStdPair PerfMetrics::get_inference_duration() {
evaluate_statistics();
return inference_duration;
}

float PerfMetrics::get_microsec(std::chrono::steady_clock::duration duration) {
return std::chrono::duration_cast<std::chrono::microseconds>(duration).count();
}
@@ -96,7 +106,7 @@ void PerfMetrics::evaluate_statistics(std::optional<TimePoint> start_time) {

auto ttft = tok_times[0] - start_time_val;
raw_metrics.m_times_to_first_token = std::vector<MicroSeconds>();
raw_metrics.m_times_to_first_token.emplace_back(ttft);
raw_metrics.m_times_to_first_token.emplace_back(ttft / batch_sizes[0]);
num_generated_tokens = 0;
for (size_t i = 0; i < tok_times.size(); ++i) {
raw_metrics.m_durations[i] = tok_times[i] - start_time_val;
@@ -110,12 +120,14 @@

// calc_mean_and_std will convert microsecond to milliseconds.
tpot = calc_mean_and_std(raw_metrics.m_durations);
ipot = calc_mean_and_std(raw_metrics.m_token_infer_durations);
ttft = calc_mean_and_std(raw_metrics.m_times_to_first_token);

generate_duration = calc_mean_and_std(raw_metrics.generate_durations);
tokenization_duration = calc_mean_and_std(raw_metrics.tokenization_durations);
detokenization_duration = calc_mean_and_std(raw_metrics.detokenization_durations);

detokenization_duration = calc_mean_and_std(raw_metrics.detokenization_durations);
inference_duration = calc_mean_and_std(raw_metrics.m_inference_durations);

// tokens per second
throughput = {1000.0f / tpot.mean, (tpot.std * 1000.0f) / (tpot.mean * tpot.mean)};
m_evaluated = true;
@@ -151,9 +163,8 @@ PerfMetrics PerfMetrics::operator+(const PerfMetrics& right) const {
new_detok_durations.insert(new_detok_durations.end(), right_detok_durations.begin(), right_detok_durations.end());
new_gen_durations.insert(new_gen_durations.end(), right_gen_durations.begin(), right_gen_durations.end());

res.num_generated_tokens = num_generated_tokens + right.num_generated_tokens;
res.num_input_tokens = num_generated_tokens + right.num_input_tokens;
res.load_time = load_time;
res.num_generated_tokens += right.num_generated_tokens;
res.num_input_tokens += right.num_input_tokens;
res.m_evaluated = false;
return res;
}
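A note on the `throughput` line above: the standard-deviation term is first-order error propagation, not a typo. With TPOT mean t (in ms per token) and deviation σ_t, throughput in tokens per second is f(t) = 1000/t, so

```latex
\sigma_f \approx \left| \frac{df}{dt} \right| \sigma_t = \frac{1000\,\sigma_t}{t^2}
```

which is exactly the `(tpot.std * 1000.0f) / (tpot.mean * tpot.mean)` term.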