[perf metrics] Added infer timing stats and bug fixes (#843)
- Collect inference-only time statistics (IPOT: inference time per output token) and compute both total and per-token values
 - Bug fixes
 - Typo fixes
ialbrecht authored Sep 9, 2024
1 parent a722f8d commit 38eed7d
Showing 4 changed files with 65 additions and 18 deletions.
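For orientation before the per-file diffs: the new statistics surface through two `PerfMetrics` getters introduced below, `get_ipot()` and `get_inference_duration()`. A minimal usage sketch, assuming the usual `LLMPipeline` flow from the GenAI samples (the model directory is a placeholder):

```cpp
#include "openvino/genai/llm_pipeline.hpp"
#include <iostream>

int main() {
    // Placeholder model directory and device.
    ov::genai::LLMPipeline pipe("TinyLlama-1.1B-Chat-v1.0", "CPU");

    ov::genai::GenerationConfig config;
    config.max_new_tokens = 64;
    auto result = pipe.generate("Why is the sky blue?", config);

    auto metrics = result.perf_metrics;
    // New in this commit: inference-only timings.
    std::cout << "IPOT mean (ms): " << metrics.get_ipot().mean << '\n'
              << "Inference duration mean (ms): " << metrics.get_inference_duration().mean << '\n'
              // Pre-existing wall-clock counterpart, for comparison.
              << "TPOT mean (ms): " << metrics.get_tpot().mean << '\n';
    return 0;
}
```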
16 changes: 12 additions & 4 deletions src/cpp/include/openvino/genai/perf_metrics.hpp
@@ -23,8 +23,10 @@ using MicroSeconds = std::chrono::duration<float, std::ratio<1, 1000000>>;
* @param detokenization_durations Durations for the detokenization process in microseconds.
* @param m_times_to_first_token Times to the first token for each call in microseconds.
* @param m_new_token_times Time points for each new token generated.
* @param m_token_infer_durations Inference time for each token in microseconds.
* @param m_batch_sizes Batch sizes for each generate call.
* @param m_durations Total durations for each generate call in microseconds.
* @param m_inference_durations Total inference duration for each generate call in microseconds.
* @param num_generated_tokens Total number of tokens generated.
* @param num_input_tokens Total number of tokens in the input prompt.
*/
@@ -35,8 +37,10 @@ struct OPENVINO_GENAI_EXPORTS RawPerfMetrics {

std::vector<MicroSeconds> m_times_to_first_token;
std::vector<TimePoint> m_new_token_times;
std::vector<MicroSeconds> m_token_infer_durations;
std::vector<size_t> m_batch_sizes;
std::vector<MicroSeconds> m_durations;
std::vector<MicroSeconds> m_inference_durations;
};

/**
@@ -96,13 +100,15 @@ struct OPENVINO_GENAI_EXPORTS MeanStdPair {
*/
struct OPENVINO_GENAI_EXPORTS PerfMetrics {
float load_time; // Load time in ms.
MeanStdPair ttft; // Time to the first token (in ms) (TTTFT).
MeanStdPair ttft; // Time to the first token (in ms) (TTFT).
MeanStdPair tpot; // Time (in ms) per output token (TPOT).
MeanStdPair ipot; // Inference time (in ms) per output token.
MeanStdPair throughput; // Tokens per second.

MeanStdPair generate_duration;
MeanStdPair tokenization_duration = {-1, -1};
MeanStdPair detokenization_duration = {-1. -1};
MeanStdPair inference_duration;
MeanStdPair tokenization_duration = {-1.0f, -1.0f};
MeanStdPair detokenization_duration = {-1.0f, -1.0f};

size_t num_generated_tokens;
size_t num_input_tokens;
@@ -112,8 +118,10 @@ struct OPENVINO_GENAI_EXPORTS PerfMetrics {
size_t get_num_input_tokens();
MeanStdPair get_ttft(); // Time to the first token (in ms) (TTFT).
MeanStdPair get_tpot(); // Time (in ms) per output token (TPOT).
MeanStdPair get_ipot(); // Inference time (in ms) per output token.
MeanStdPair get_throughput(); // Tokens per second.

MeanStdPair get_inference_duration(); // in ms
MeanStdPair get_generate_duration(); // in ms
MeanStdPair get_tokenization_duration(); // in ms
MeanStdPair get_detokenization_duration(); // in ms
@@ -133,7 +141,7 @@ struct OPENVINO_GENAI_EXPORTS PerfMetrics {
/**
* @brief convert duration to microseconds
*
* @param duration duration in
* @param duration steady clock duration
*/
static float get_microsec(std::chrono::steady_clock::duration duration);
PerfMetrics operator+(const PerfMetrics& metrics) const;
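To make the distinction above concrete: `tpot` measures wall-clock time per output token, which includes sampling, tensor bookkeeping, and any streamer callback, while the new `ipot` counts only the time spent inside `infer()`. Their difference is a rough per-token overhead estimate — a sketch, not part of the API:

```cpp
#include "openvino/genai/perf_metrics.hpp"

// Rough per-token overhead: wall-clock time per token (TPOT) minus
// infer()-only time (IPOT). Assumes `metrics` comes from a completed
// generate() call; both getters trigger statistics evaluation as needed.
float per_token_overhead_ms(ov::genai::PerfMetrics& metrics) {
    return metrics.get_tpot().mean - metrics.get_ipot().mean;
}
```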
17 changes: 16 additions & 1 deletion src/cpp/src/greedy_decoding.cpp
@@ -24,7 +24,11 @@ EncodedResults greedy_decoding(
// Initialize results and performance metrics.
EncodedResults results;
auto& raw_perf_counters = results.perf_metrics.raw_metrics;

raw_perf_counters.m_new_token_times.reserve(max_new_tokens);
raw_perf_counters.m_batch_sizes.reserve(max_new_tokens);
raw_perf_counters.m_token_infer_durations.reserve(max_new_tokens);
raw_perf_counters.m_inference_durations = {{ MicroSeconds(0.0f) }};

results.scores.resize(running_batch_size);
results.tokens.resize(running_batch_size);
std::fill(results.scores.begin(), results.scores.end(), 0);
@@ -38,8 +42,13 @@ EncodedResults greedy_decoding(
auto beam_data = m_model_runner.get_tensor("beam_idx").data<int32_t>();
std::iota(beam_data, beam_data + running_batch_size, 0);

const auto infer_start = std::chrono::steady_clock::now();
m_model_runner.infer();
const auto infer_ms = PerfMetrics::get_microsec(std::chrono::steady_clock::now() - infer_start);
raw_perf_counters.m_inference_durations[0] = MicroSeconds(infer_ms);
raw_perf_counters.m_token_infer_durations.emplace_back(infer_ms);
auto logits = m_model_runner.get_tensor("logits");

ov::Shape logits_shape = logits.get_shape();
size_t seq_len = logits_shape[1], vocab_size = logits_shape[2];
m_model_runner.get_tensor("input_ids").set_shape({running_batch_size, 1});
@@ -71,8 +80,14 @@ EncodedResults greedy_decoding(
utils::update_position_ids(m_model_runner.get_tensor("position_ids"), m_model_runner.get_tensor("attention_mask"));
m_model_runner.set_tensor("attention_mask", utils::extend_attention(m_model_runner.get_tensor("attention_mask")));

const auto infer_start = std::chrono::steady_clock::now();
m_model_runner.infer();
const auto infer_ms = PerfMetrics::get_microsec(std::chrono::steady_clock::now() - infer_start);
raw_perf_counters.m_inference_durations[0] += MicroSeconds(infer_ms);
raw_perf_counters.m_token_infer_durations.emplace_back(infer_ms);

auto logits = m_model_runner.get_tensor("logits");

ov::Shape logits_shape = logits.get_shape();
size_t seq_len = logits_shape[1], vocab_size = logits_shape[2];

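The instrumentation added above follows one pattern: take a `steady_clock` timestamp around each `infer()`, accumulate the delta into `m_inference_durations[0]`, and append it to `m_token_infer_durations`. The multinomial decoder below repeats it. A standalone sketch of the same stopwatch idea, with an illustrative helper that is not part of the codebase:

```cpp
#include <chrono>
#include <vector>

using MicroSeconds = std::chrono::duration<float, std::ratio<1, 1000000>>;

struct InferTimings {
    MicroSeconds total{0.0f};             // plays the role of m_inference_durations[0]
    std::vector<MicroSeconds> per_token;  // plays the role of m_token_infer_durations
};

// Time one infer-like callable and record both the running total and the
// per-token sample, mirroring the pattern used in the decoding loops.
template <typename Infer>
void timed_infer(Infer&& infer, InferTimings& timings) {
    const auto start = std::chrono::steady_clock::now();
    infer();
    const float infer_us = std::chrono::duration_cast<std::chrono::microseconds>(
                               std::chrono::steady_clock::now() - start)
                               .count();
    timings.total += MicroSeconds(infer_us);
    timings.per_token.emplace_back(infer_us);
}
```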
25 changes: 19 additions & 6 deletions src/cpp/src/multinomial_decoding.cpp
@@ -156,15 +156,20 @@ ov::genai::EncodedResults multinominal_decoding(ov::InferRequest& m_model_runner
std::shared_ptr<ov::genai::StreamerBase> streamer,
std::optional<ov::Tensor> position_ids) {
ov::Shape prompts_shape = input_ids.get_shape();
size_t batch_size = prompts_shape[0];
const size_t batch_size = prompts_shape[0];

OPENVINO_ASSERT(batch_size == 1, "Only batch size = 1 supported for multinomial decoding");

size_t prompt_len = prompts_shape[1];
const size_t prompt_len = prompts_shape[1];
const size_t max_new_tokens = config.get_max_new_tokens(prompt_len);

// Initialize results and performance metrics.
EncodedResults results;
auto& raw_perf_counters = results.perf_metrics.raw_metrics;
raw_perf_counters.m_new_token_times.reserve(max_new_tokens);
raw_perf_counters.m_batch_sizes.reserve(max_new_tokens);
raw_perf_counters.m_token_infer_durations.reserve(max_new_tokens);
raw_perf_counters.m_inference_durations = {{ MicroSeconds(0.0f) }};
results.scores.resize(batch_size, 0);
results.tokens.resize(batch_size);

@@ -180,8 +185,13 @@ ov::genai::EncodedResults multinominal_decoding(ov::InferRequest& m_model_runner
m_model_runner.get_tensor("beam_idx").set_shape({batch_size});
m_model_runner.get_tensor("beam_idx").data<int32_t>()[0] = 0;

const auto infer_start = std::chrono::steady_clock::now();
m_model_runner.infer();
raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now());
const auto infer_end = std::chrono::steady_clock::now();
const auto infer_ms = PerfMetrics::get_microsec(infer_end - infer_start);
raw_perf_counters.m_inference_durations[0] += MicroSeconds(infer_ms);
raw_perf_counters.m_token_infer_durations.emplace_back(infer_ms);
raw_perf_counters.m_new_token_times.emplace_back(infer_end);
raw_perf_counters.m_batch_sizes.emplace_back(batch_size);

auto logits_tensor = m_model_runner.get_tensor("logits");
@@ -213,8 +223,6 @@ ov::genai::EncodedResults multinominal_decoding(ov::InferRequest& m_model_runner

m_model_runner.get_tensor("input_ids").set_shape({batch_size, 1});

size_t max_new_tokens = config.get_max_new_tokens(prompt_len);

for (size_t i = 0; i < max_new_tokens - 1; i++) {
if (position_ids.has_value()) {
ov::genai::utils::update_position_ids(m_model_runner.get_tensor("position_ids"),
@@ -225,8 +233,13 @@ ov::genai::EncodedResults multinominal_decoding(ov::InferRequest& m_model_runner

m_model_runner.get_tensor("input_ids").data<int64_t>()[0] = out_token.id;

const auto infer_start = std::chrono::steady_clock::now();
m_model_runner.infer();
raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now());
const auto infer_end = std::chrono::steady_clock::now();
const auto infer_ms = PerfMetrics::get_microsec(infer_end - infer_start);
raw_perf_counters.m_inference_durations[0] += MicroSeconds(infer_ms);
raw_perf_counters.m_token_infer_durations.emplace_back(infer_ms);
raw_perf_counters.m_new_token_times.emplace_back(infer_end);
raw_perf_counters.m_batch_sizes.emplace_back(batch_size);

logits = m_model_runner.get_tensor("logits").data<float>();
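Note the shape of the raw data both decoders now produce: `m_token_infer_durations` gets one entry per `infer()` call, while `m_inference_durations` holds a single accumulated total per generate call, so `ipot` averages over tokens and `inference_duration` averages over calls. A toy illustration with made-up numbers:

```cpp
#include <iostream>
#include <numeric>
#include <vector>

int main() {
    // Made-up per-infer() durations in microseconds from one generate call:
    // a prefill step followed by two decode steps.
    std::vector<float> token_infer_us = {9000.0f, 4000.0f, 4100.0f};
    const float total_us =
        std::accumulate(token_infer_us.begin(), token_infer_us.end(), 0.0f);

    // ipot averages the per-token entries; inference_duration uses the
    // accumulated per-call total. Both are reported in milliseconds.
    std::cout << "ipot mean (ms): "
              << total_us / token_infer_us.size() / 1000.0f << '\n'
              << "inference_duration (ms): " << total_us / 1000.0f << '\n';
    return 0;
}
```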
25 changes: 18 additions & 7 deletions src/cpp/src/perf_metrics.cpp
@@ -11,7 +11,7 @@ namespace {

ov::genai::MeanStdPair calc_mean_and_std(const std::vector<ov::genai::MicroSeconds>& durations) {
if (durations.size() == 0) {
return {-1, -1};
return {-1.0f, -1.0f};
}
// Accepts time durations in microseconds and returns standard deviation and mean in milliseconds.
float mean = std::accumulate(durations.begin(), durations.end(), 0.0f,
@@ -59,6 +59,11 @@ MeanStdPair PerfMetrics::get_tpot() {
return tpot;
}

MeanStdPair PerfMetrics::get_ipot() {
evaluate_statistics();
return ipot;
}

MeanStdPair PerfMetrics::get_throughput() {
evaluate_statistics();
return throughput;
@@ -79,6 +84,11 @@ MeanStdPair PerfMetrics::get_detokenization_duration() {
return detokenization_duration;
}

MeanStdPair PerfMetrics::get_inference_duration() {
evaluate_statistics();
return inference_duration;
}

float PerfMetrics::get_microsec(std::chrono::steady_clock::duration duration) {
return std::chrono::duration_cast<std::chrono::microseconds>(duration).count();
}
@@ -96,7 +106,7 @@ void PerfMetrics::evaluate_statistics(std::optional<TimePoint> start_time) {

auto ttft = tok_times[0] - start_time_val;
raw_metrics.m_times_to_first_token = std::vector<MicroSeconds>();
raw_metrics.m_times_to_first_token.emplace_back(ttft);
raw_metrics.m_times_to_first_token.emplace_back(ttft / batch_sizes[0]);
num_generated_tokens = 0;
for (size_t i = 0; i < tok_times.size(); ++i) {
raw_metrics.m_durations[i] = tok_times[i] - start_time_val;
@@ -110,12 +120,14 @@

// calc_mean_and_std will convert microsecond to milliseconds.
tpot = calc_mean_and_std(raw_metrics.m_durations);
ipot = calc_mean_and_std(raw_metrics.m_token_infer_durations);
ttft = calc_mean_and_std(raw_metrics.m_times_to_first_token);

generate_duration = calc_mean_and_std(raw_metrics.generate_durations);
tokenization_duration = calc_mean_and_std(raw_metrics.tokenization_durations);
detokenization_duration = calc_mean_and_std(raw_metrics.detokenization_durations);

detokenization_duration = calc_mean_and_std(raw_metrics.detokenization_durations);
inference_duration = calc_mean_and_std(raw_metrics.m_inference_durations);

// tokens per second
throughput = {1000.0f / tpot.mean, (tpot.std * 1000.0f) / (tpot.mean * tpot.mean)};
m_evaluated = true;
@@ -151,9 +163,8 @@ PerfMetrics PerfMetrics::operator+(const PerfMetrics& right) const {
new_detok_durations.insert(new_detok_durations.end(), right_detok_durations.begin(), right_detok_durations.end());
new_gen_durations.insert(new_gen_durations.end(), right_gen_durations.begin(), right_gen_durations.end());

res.num_generated_tokens = num_generated_tokens + right.num_generated_tokens;
res.num_input_tokens = num_generated_tokens + right.num_input_tokens;
res.load_time = load_time;
res.num_generated_tokens += right.num_generated_tokens;
res.num_input_tokens += right.num_input_tokens;
res.m_evaluated = false;
return res;
}
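A note on the `throughput` line above: the standard-deviation term is first-order error propagation, not a typo. With TPOT mean t (in ms per token) and deviation σ_t, throughput in tokens per second is f(t) = 1000/t, so

```latex
\sigma_f \approx \left| \frac{df}{dt} \right| \sigma_t = \frac{1000\,\sigma_t}{t^2}
```

which is exactly the `(tpot.std * 1000.0f) / (tpot.mean * tpot.mean)` term.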