From 2f778f3e7aacfe3a2eafed5966494d9d13bd9c2c Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Wed, 31 Jul 2024 14:33:48 +0200 Subject: [PATCH] Add perf metric docstrings (#713) Docstring for generation time metrics Ticket: CVS-132859 --- .../include/openvino/genai/perf_metrics.hpp | 64 ++++++++++++-- src/python/py_generate_pipeline.cpp | 84 ++++++++++++++++++- 2 files changed, 140 insertions(+), 8 deletions(-) diff --git a/src/cpp/include/openvino/genai/perf_metrics.hpp b/src/cpp/include/openvino/genai/perf_metrics.hpp index ddb9ff581f..ad53d8d941 100644 --- a/src/cpp/include/openvino/genai/perf_metrics.hpp +++ b/src/cpp/include/openvino/genai/perf_metrics.hpp @@ -16,8 +16,18 @@ using TimePoint = std::chrono::steady_clock::time_point; using MicroSeconds = std::chrono::duration>; /** -* @brief Structure with raw performance metrics for each generation before any statistics calculated. -*/ + * @brief Structure with raw performance metrics for each generation before any statistics are calculated. + * + * @param generate_durations Durations for each generate call in microseconds. + * @param tokenization_durations Durations for the tokenization process in microseconds. + * @param detokenization_durations Durations for the detokenization process in microseconds. + * @param m_times_to_first_token Times to the first token for each call in microseconds. + * @param m_new_token_times Time points for each new token generated. + * @param m_batch_sizes Batch sizes for each generate call. + * @param m_durations Total durations for each generate call in microseconds. + * @param num_generated_tokens Total number of tokens generated. + * @param num_input_tokens Total number of tokens in the input prompt. + */ struct OPENVINO_GENAI_EXPORTS RawPerfMetrics { std::vector generate_durations; std::vector tokenization_durations; @@ -41,10 +51,52 @@ struct OPENVINO_GENAI_EXPORTS MeanStdPair { }; /** -* @brief Structure to store performance metric for each generation. -* -* @param -*/ + * @brief Holds performance metrics for each generate call. + * + * PerfMetrics holds fields with mean and standard deviations for the following metrics: + * - Time To the First Token (TTFT), ms + * - Time per Output Token (TPOT), ms/token + * - Generate total duration, ms + * - Tokenization duration, ms + * - Detokenization duration, ms + * - Throughput, tokens/s + * + * Additional fields include: + * - Load time, ms + * - Number of generated tokens + * - Number of tokens in the input prompt + * + * Preverable way to access values is via get functions. Getters calculate mean and std values from raw_metrics are return pairs. + * If mean and std were already calcualted getters return cached values. + * @param get_load_time Returns the load time in milliseconds. + * @param get_num_generated_tokens Returns the number of generated tokens. + * @param get_num_input_tokens Returns the number of tokens in the input prompt. + * @param get_ttft Returns the mean and standard deviation of TTFT. + * @param get_tpot Returns the mean and standard deviation of TPOT. + * @param get_throughput Returns the mean and standard deviation of throughput. + * @param get_generate_duration Returns the mean and standard deviation of generate duration. + * @param get_tokenization_duration Returns the mean and standard deviation of tokenization duration. + * @param get_detokenization_duration Returns the mean and standard deviation of detokenization duration. + * @param get_microsec Converts a duration to microseconds. + * @param m_evaluated Flag indicating if raw metrics were evaluated. + * If false, current mean/std TTFT, TPOT, etc. are not actual and evaluate_statistics() should recalculate them. + * @param evaluate_statistics Calculates mean and standard deviation values from raw_metrics. + * Optional start_time can be provided to update durations. + * @param operator+ Adds two PerfMetrics objects. + * @param operator+= Adds and assigns the right-hand PerfMetrics to the current object. + * @param raw_metrics A structure of RawPerfMetrics type that holds raw metrics. + * @param load_time Load time in milliseconds. + * + * Cached mean and standard deviations. + * @param ttft Mean and standard deviation of Time to the First Token (TTFT) in milliseconds. + * @param tpot Mean and standard deviation of Time per Output Token (TPOT) in milliseconds per token. + * @param throughput Mean and standard deviation of tokens per second. + * @param generate_duration Mean and standard deviation of the total duration of generate calls in milliseconds. + * @param tokenization_duration Mean and standard deviation of the tokenization duration in milliseconds. + * @param detokenization_duration Mean and standard deviation of the detokenization duration in milliseconds. + * @param num_generated_tokens Number of generated tokens. + * @param num_input_tokens Number of tokens in the input prompt. + */ struct OPENVINO_GENAI_EXPORTS PerfMetrics { float load_time; // Load time in ms. MeanStdPair ttft; // Time to the first token (in ms) (TTTFT). diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index 031c8fb97b..9518e1ece4 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -102,6 +102,86 @@ auto generation_config_docstring = R"( repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty. )"; +auto raw_perf_metrics_docstring = R"( + Structure with raw performance metrics for each generation before any statistics are calculated. + + :param generate_durations: Durations for each generate call in microseconds. + :type generate_durations: List[MicroSeconds] + + :param tokenization_durations: Durations for the tokenization process in microseconds. + :type tokenization_durations: List[MicroSeconds] + + :param detokenization_durations: Durations for the detokenization process in microseconds. + :type detokenization_durations: List[MicroSeconds] + + :param m_times_to_first_token: Times to the first token for each call in microseconds. + :type m_times_to_first_token: List[MicroSeconds] + + :param m_new_token_times: Time points for each new token generated. + :type m_new_token_times: List[TimePoint] + + :param m_batch_sizes: Batch sizes for each generate call. + :type m_batch_sizes: List[int] + + :param m_durations: Total durations for each generate call in microseconds. + :type m_durations: List[MicroSeconds] + + :param num_generated_tokens: Total number of tokens generated. + :type num_generated_tokens: int + + :param num_input_tokens: Total number of tokens in the input prompt. + :type num_input_tokens: int +)"; + +auto perf_metrics_docstring = R"( + Holds performance metrics for each generate call. + + PerfMetrics holds fields with mean and standard deviations for the following metrics: + - Time To the First Token (TTFT), ms + - Time per Output Token (TPOT), ms/token + - Generate total duration, ms + - Tokenization duration, ms + - Detokenization duration, ms + - Throughput, tokens/s + + Additional fields include: + - Load time, ms + - Number of generated tokens + - Number of tokens in the input prompt + + Preferable way to access values is via get functions. Getters calculate mean and std values from raw_metrics and return pairs. + If mean and std were already calculated, getters return cached values. + + :param get_load_time: Returns the load time in milliseconds. + :type get_load_time: float + + :param get_num_generated_tokens: Returns the number of generated tokens. + :type get_num_generated_tokens: int + + :param get_num_input_tokens: Returns the number of tokens in the input prompt. + :type get_num_input_tokens: int + + :param get_ttft: Returns the mean and standard deviation of TTFT. + :type get_ttft: MeanStdPair + + :param get_tpot: Returns the mean and standard deviation of TPOT. + :type get_tpot: MeanStdPair + + :param get_throughput: Returns the mean and standard deviation of throughput. + :type get_throughput: MeanStdPair + + :param get_generate_duration: Returns the mean and standard deviation of generate duration. + :type get_generate_duration: MeanStdPair + + :param get_tokenization_duration: Returns the mean and standard deviation of tokenization duration. + :type get_tokenization_duration: MeanStdPair + + :param get_detokenization_duration: Returns the mean and standard deviation of detokenization duration. + :type get_detokenization_duration: MeanStdPair + + :param raw_metrics: A structure of RawPerfMetrics type that holds raw metrics. + :type raw_metrics: RawPerfMetrics +)"; OptionalGenerationConfig update_config_from_kwargs(const OptionalGenerationConfig& config, const py::kwargs& kwargs) { if(!config.has_value() && kwargs.empty()) @@ -580,7 +660,7 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def_readonly("perf_metrics", &DecodedResults::perf_metrics) .def("__str__", &DecodedResults::operator std::string); - py::class_(m, "RawPerfMetrics") + py::class_(m, "RawPerfMetrics", raw_perf_metrics_docstring) .def(py::init<>()) .def_readonly("generate_durations", &RawPerfMetrics::generate_durations) .def_property_readonly("tokenization_durations", [](const RawPerfMetrics &rw) { @@ -604,7 +684,7 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def_readonly("mean", &MeanStdPair::mean) .def_readonly("std", &MeanStdPair::std); - py::class_(m, "PerfMetrics") + py::class_(m, "PerfMetrics", perf_metrics_docstring) .def(py::init<>()) .def("get_generate_duration", &PerfMetrics::get_generate_duration) .def("get_tokenization_duration", &PerfMetrics::get_tokenization_duration)