From 2d9faf54878e988ef0537aa225dad910b1f285c8 Mon Sep 17 00:00:00 2001 From: Andrew Mueller <89394290+AndyM1098@users.noreply.github.com> Date: Fri, 22 Dec 2023 15:50:19 -0500 Subject: [PATCH] Updated percentile.py to allow user defined percentiles, and separating calculated values into separate columns (#103) * Changed percentiles.py to take in custom values, and split calculated values into separate columns * Updated doc string for function * Updated percentile.py to pass black and flake8 * Updating percentile unit testing * Updated formatting for unit testing * code cleanup * percentiles: add test for percentiles=None --------- Co-authored-by: Stephanie Brink --- thicket/stats/percentiles.py | 103 +++++++++++++++++++++++++---------- thicket/tests/test_stats.py | 90 +++++++++++++++++++++++++++--- 2 files changed, 157 insertions(+), 36 deletions(-) diff --git a/thicket/stats/percentiles.py b/thicket/stats/percentiles.py index 40aa16e4..c678a228 100644 --- a/thicket/stats/percentiles.py +++ b/thicket/stats/percentiles.py @@ -8,11 +8,13 @@ from ..utils import verify_thicket_structures -def percentiles(thicket, columns=None): +def percentiles(thicket, columns=None, percentiles=[0.25, 0.50, 0.75]): """Calculate the q-th percentile for each node in the performance data table. Designed to take in a thicket, and append one or more columns to the aggregated - statistics table for the q-th percentile calculation for each node. + statistics table for the q-th percentile calculation for each node. Each percentile + calculation is a separate column in the statistics table, where the column will + have the format: {columnName}_percentiles_{percentile}. The 25th percentile is the lower quartile, and is the value at which 25% of the answers lie below that value. @@ -28,7 +30,20 @@ def percentiles(thicket, columns=None): columns (list): List of hardware/timing metrics to perform percentile calculation on. 
Note if using a columnar joined thicket a list of tuples must be passed in with the format (column index, column name). + percentiles (list): List of percentile values that is desired to be calculated + for each column in columns. If no list is specified, the default values, + [0.25, 0.50, 0.75] are used for calculations """ + if not percentiles: + percentiles = [0.25, 0.50, 0.75] + + # Enforce that percentiles are in range of [0.0, 1.0] + for percentile in percentiles: + if percentile < 0.0 or percentile > 1.0: + raise ValueError( + "Percentile {} is out of range of [0.0, 1.0]".format(percentile) + ) + if columns is None: raise ValueError( "To see a list of valid columns, run 'Thicket.performance_cols'." @@ -36,40 +51,70 @@ def percentiles(thicket, columns=None): verify_thicket_structures(thicket.dataframe, index=["node"], columns=columns) + # select numeric columns within thicket (.quantiles) will not work without this step + numerics = ["int16", "int32", "int64", "float16", "float32", "float64"] + # thicket object without columnar index if thicket.dataframe.columns.nlevels == 1: - # select numeric columns within thicket (.quantiles) will not work without this step - numerics = ["int16", "int32", "int64", "float16", "float32", "float64"] - df_num = thicket.dataframe.select_dtypes(include=numerics) - df = df_num.reset_index().groupby("node").quantile([0.25, 0.50, 0.75]) + df_num = thicket.dataframe.select_dtypes(include=numerics)[columns] + df = df_num.reset_index().groupby("node").quantile(percentiles) for column in columns: - percentiles = [] + calculated_percentiles = [] for node in pd.unique(df.reset_index()["node"].tolist()): - percentiles.append(list(df.loc[node][column])) - thicket.statsframe.dataframe[column + "_percentiles"] = percentiles - # check to see if exclusive metric - if column in thicket.exc_metrics: - thicket.statsframe.exc_metrics.append(column + "_percentiles") - # check to see if inclusive metric - else: - 
thicket.statsframe.inc_metrics.append(column + "_percentiles") + calculated_percentiles.append(list(df.loc[node][column])) + + for index, percentile in enumerate(percentiles): + column_to_append = column + "_percentiles_" + str(int(percentile * 100)) + thicket.statsframe.dataframe[column_to_append] = [ + x[index] for x in calculated_percentiles + ] + + # check to see if exclusive metric and that the metric is not already in the metrics list + if ( + column in thicket.exc_metrics + and column_to_append not in thicket.statsframe.exc_metrics + ): + thicket.statsframe.exc_metrics.append(column_to_append) + # check inclusive metrics + elif ( + column in thicket.inc_metrics + and column_to_append not in thicket.statsframe.inc_metrics + ): + thicket.statsframe.inc_metrics.append(column_to_append) + # columnar joined thicket object else: - numerics = ["int16", "int32", "int64", "float16", "float32", "float64"] - df_num = thicket.dataframe.select_dtypes(include=numerics) - df = df_num.reset_index(level=1).groupby("node").quantile([0.25, 0.50, 0.75]) - percentiles = [] - for idx, column in columns: - percentiles = [] + df_num = thicket.dataframe.select_dtypes(include=numerics)[columns] + df = df_num.reset_index(level=1).groupby("node").quantile(percentiles) + for idx_level, column in columns: + calculated_percentiles = [] + + # Get all the calculated values into a list for each node for node in pd.unique(df.reset_index()["node"].tolist()): - percentiles.append(list(df.loc[node][(idx, column)])) - thicket.statsframe.dataframe[(idx, column + "_percentiles")] = percentiles - # check to see if exclusive metric - if (idx, column) in thicket.exc_metrics: - thicket.statsframe.exc_metrics.append((idx, column + "_percentiles")) - # check to see if inclusive metric - else: - thicket.statsframe.inc_metrics.append((idx, column + "_percentiles")) + calculated_percentiles.append(list(df.loc[node][(idx_level, column)])) + + # Go through each of the percentiles, and make them it's own 
column + for index, percentile in enumerate(percentiles): + column_to_append = ( + idx_level, + "{}_percentiles_{}".format(column, str(int(percentile * 100))), + ) + thicket.statsframe.dataframe[column_to_append] = [ + x[index] for x in calculated_percentiles + ] + + # check to see if exclusive metric + if ( + (idx_level, column) in thicket.exc_metrics + and column_to_append not in thicket.statsframe.exc_metrics + ): + thicket.statsframe.exc_metrics.append(column_to_append) + # check to see if inclusive metric + elif ( + (idx_level, column) in thicket.inc_metrics + and column_to_append not in thicket.statsframe.inc_metrics + ): + thicket.statsframe.inc_metrics.append(column_to_append) # sort columns in index thicket.statsframe.dataframe = thicket.statsframe.dataframe.sort_index(axis=1) diff --git a/thicket/tests/test_stats.py b/thicket/tests/test_stats.py index 5d68f17e..b8e63316 100644 --- a/thicket/tests/test_stats.py +++ b/thicket/tests/test_stats.py @@ -206,14 +206,50 @@ def test_percentiles(example_cali): th.percentiles(th_ens, columns=["Min time/rank"]) - assert "Min time/rank_percentiles" in th_ens.statsframe.dataframe.columns - assert len(th_ens.statsframe.dataframe["Min time/rank_percentiles"][0]) == 3 + assert "Min time/rank_percentiles_25" in th_ens.statsframe.dataframe.columns + assert "Min time/rank_percentiles_50" in th_ens.statsframe.dataframe.columns + assert "Min time/rank_percentiles_75" in th_ens.statsframe.dataframe.columns assert ( - "Min time/rank_percentiles" + "Min time/rank_percentiles_25" + in th_ens.statsframe.exc_metrics + th_ens.statsframe.inc_metrics + ) + assert ( + "Min time/rank_percentiles_50" + in th_ens.statsframe.exc_metrics + th_ens.statsframe.inc_metrics + ) + assert ( + "Min time/rank_percentiles_75" + in th_ens.statsframe.exc_metrics + th_ens.statsframe.inc_metrics + ) + assert "Min time/rank_percentiles_25" in th_ens.statsframe.show_metric_columns() + assert "Min time/rank_percentiles_50" in 
th_ens.statsframe.show_metric_columns() + assert "Min time/rank_percentiles_75" in th_ens.statsframe.show_metric_columns() + + +def test_percentiles_none(example_cali): + th_ens = th.Thicket.from_caliperreader(example_cali) + + th.percentiles(th_ens, columns=["Min time/rank"], percentiles=None) + + assert "Min time/rank_percentiles_25" in th_ens.statsframe.dataframe.columns + assert "Min time/rank_percentiles_50" in th_ens.statsframe.dataframe.columns + assert "Min time/rank_percentiles_75" in th_ens.statsframe.dataframe.columns + + +def test_percentiles_single_value(example_cali): + th_ens = th.Thicket.from_caliperreader(example_cali) + + th.percentiles(th_ens, columns=["Min time/rank"], percentiles=[0.3]) + + assert "Min time/rank_percentiles_30" in th_ens.statsframe.dataframe.columns + + assert ( + "Min time/rank_percentiles_30" in th_ens.statsframe.exc_metrics + th_ens.statsframe.inc_metrics ) - assert "Min time/rank_percentiles" in th_ens.statsframe.show_metric_columns() + + assert "Min time/rank_percentiles_30" in th_ens.statsframe.show_metric_columns() def test_percentiles_columnar_join(thicket_axis_columns): @@ -229,15 +265,55 @@ def test_percentiles_columnar_join(thicket_axis_columns): assert ( idx, - "Min time/rank_percentiles", + "Min time/rank_percentiles_25", + ) in combined_th.statsframe.dataframe.columns + assert ( + idx, + "Min time/rank_percentiles_50", + ) in combined_th.statsframe.dataframe.columns + assert ( + idx, + "Min time/rank_percentiles_75", + ) in combined_th.statsframe.dataframe.columns + assert ( + idx, + "Min time/rank_percentiles_25", + ) in combined_th.statsframe.exc_metrics + combined_th.statsframe.inc_metrics + assert ( + idx, + "Min time/rank_percentiles_50", + ) in combined_th.statsframe.exc_metrics + combined_th.statsframe.inc_metrics + assert ( + idx, + "Min time/rank_percentiles_75", + ) in combined_th.statsframe.exc_metrics + combined_th.statsframe.inc_metrics + + assert ( + idx, + "Min time/rank_percentiles_25", + ) in 
combined_th.statsframe.show_metric_columns() + assert ( + idx, + "Min time/rank_percentiles_50", + ) in combined_th.statsframe.show_metric_columns() + assert ( + idx, + "Min time/rank_percentiles_75", + ) in combined_th.statsframe.show_metric_columns() + + th.percentiles(combined_th, columns=[(idx, "Min time/rank")], percentiles=[0.4]) + + assert ( + idx, + "Min time/rank_percentiles_40", ) in combined_th.statsframe.dataframe.columns assert ( idx, - "Min time/rank_percentiles", + "Min time/rank_percentiles_40", ) in combined_th.statsframe.exc_metrics + combined_th.statsframe.inc_metrics assert ( idx, - "Min time/rank_percentiles", + "Min time/rank_percentiles_40", ) in combined_th.statsframe.show_metric_columns()