Skip to content

Commit

Permalink
Updated percentile.py to allow user defined percentiles, and separati…
Browse files Browse the repository at this point in the history
…ng calculated values into separate columns (#103)

* Changed percentiles.py to take in custom values and split calculated values into separate columns

* Updated doc string for function

* Updated percentile.py to pass black and flake8

* Updating percentile unit testing

* Updated formatting for unit testing

* code cleanup

* percentiles: add test for percentiles=None

---------

Co-authored-by: Stephanie Brink <[email protected]>
  • Loading branch information
AndyM1098 and slabasan authored Dec 22, 2023
1 parent dce8f44 commit 2d9faf5
Show file tree
Hide file tree
Showing 2 changed files with 157 additions and 36 deletions.
103 changes: 74 additions & 29 deletions thicket/stats/percentiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,13 @@
from ..utils import verify_thicket_structures


def percentiles(thicket, columns=None):
def percentiles(thicket, columns=None, percentiles=[0.25, 0.50, 0.75]):
"""Calculate the q-th percentile for each node in the performance data table.
Designed to take in a thicket, and append one or more columns to the aggregated
statistics table for the q-th percentile calculation for each node.
statistics table for the q-th percentile calculation for each node. Each percentile
calculation is a separate column in the statistics table, where the column will
have the format: {columnName}_percentiles_{percentile}.
The 25th percentile is the lower quartile, and is the value at which 25% of the
answers lie below that value.
Expand All @@ -28,48 +30,91 @@ def percentiles(thicket, columns=None):
columns (list): List of hardware/timing metrics to perform percentile
calculation on. Note if using a columnar joined thicket a list of tuples
must be passed in with the format (column index, column name).
percentiles (list): List of percentile values that is desired to be calculated
for each column in columns. If no list is specified, the default values,
[0.25, 0.50, 0.75] are used for calculations
"""
if not percentiles:
percentiles = [0.25, 0.50, 0.75]

# Enforce that percentiles are in range of [0.0, 1.0]
for percentile in percentiles:
if percentile < 0.0 or percentile > 1.0:
raise ValueError(
"Percentile {} is out of range of [0.0, 1.0]".format(percentile)
)

if columns is None:
raise ValueError(
"To see a list of valid columns, run 'Thicket.performance_cols'."
)

verify_thicket_structures(thicket.dataframe, index=["node"], columns=columns)

# select numeric columns within thicket (.quantiles) will not work without this step
numerics = ["int16", "int32", "int64", "float16", "float32", "float64"]

# thicket object without columnar index
if thicket.dataframe.columns.nlevels == 1:
# select numeric columns within thicket (.quantiles) will not work without this step
numerics = ["int16", "int32", "int64", "float16", "float32", "float64"]
df_num = thicket.dataframe.select_dtypes(include=numerics)
df = df_num.reset_index().groupby("node").quantile([0.25, 0.50, 0.75])
df_num = thicket.dataframe.select_dtypes(include=numerics)[columns]
df = df_num.reset_index().groupby("node").quantile(percentiles)
for column in columns:
percentiles = []
calculated_percentiles = []
for node in pd.unique(df.reset_index()["node"].tolist()):
percentiles.append(list(df.loc[node][column]))
thicket.statsframe.dataframe[column + "_percentiles"] = percentiles
# check to see if exclusive metric
if column in thicket.exc_metrics:
thicket.statsframe.exc_metrics.append(column + "_percentiles")
# check to see if inclusive metric
else:
thicket.statsframe.inc_metrics.append(column + "_percentiles")
calculated_percentiles.append(list(df.loc[node][column]))

for index, percentile in enumerate(percentiles):
column_to_append = column + "_percentiles_" + str(int(percentile * 100))
thicket.statsframe.dataframe[column_to_append] = [
x[index] for x in calculated_percentiles
]

# check to see if exclusive metric and that the metric is not already in the metrics list
if (
column in thicket.exc_metrics
and column_to_append not in thicket.statsframe.exc_metrics
):
thicket.statsframe.exc_metrics.append(column_to_append)
# check inclusive metrics
elif (
column in thicket.inc_metrics
and column_to_append not in thicket.statsframe.inc_metrics
):
thicket.statsframe.inc_metrics.append(column_to_append)

# columnar joined thicket object
else:
numerics = ["int16", "int32", "int64", "float16", "float32", "float64"]
df_num = thicket.dataframe.select_dtypes(include=numerics)
df = df_num.reset_index(level=1).groupby("node").quantile([0.25, 0.50, 0.75])
percentiles = []
for idx, column in columns:
percentiles = []
df_num = thicket.dataframe.select_dtypes(include=numerics)[columns]
df = df_num.reset_index(level=1).groupby("node").quantile(percentiles)
for idx_level, column in columns:
calculated_percentiles = []

# Get all the calculated values into a list for each node
for node in pd.unique(df.reset_index()["node"].tolist()):
percentiles.append(list(df.loc[node][(idx, column)]))
thicket.statsframe.dataframe[(idx, column + "_percentiles")] = percentiles
# check to see if exclusive metric
if (idx, column) in thicket.exc_metrics:
thicket.statsframe.exc_metrics.append((idx, column + "_percentiles"))
# check to see if inclusive metric
else:
thicket.statsframe.inc_metrics.append((idx, column + "_percentiles"))
calculated_percentiles.append(list(df.loc[node][(idx_level, column)]))

# Go through each of the percentiles, and give each one its own column
for index, percentile in enumerate(percentiles):
column_to_append = (
idx_level,
"{}_percentiles_{}".format(column, str(int(percentile * 100))),
)
thicket.statsframe.dataframe[column_to_append] = [
x[index] for x in calculated_percentiles
]

# check to see if exclusive metric
if (
(idx_level, column) in thicket.exc_metrics
and column_to_append not in thicket.statsframe.exc_metrics
):
thicket.statsframe.exc_metrics.append(column_to_append)
# check to see if inclusive metric
elif (
(idx_level, column) in thicket.inc_metrics
and column_to_append not in thicket.statsframe.inc_metrics
):
thicket.statsframe.inc_metrics.append(column_to_append)

# sort columns in index
thicket.statsframe.dataframe = thicket.statsframe.dataframe.sort_index(axis=1)
90 changes: 83 additions & 7 deletions thicket/tests/test_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,14 +206,50 @@ def test_percentiles(example_cali):

th.percentiles(th_ens, columns=["Min time/rank"])

assert "Min time/rank_percentiles" in th_ens.statsframe.dataframe.columns
assert len(th_ens.statsframe.dataframe["Min time/rank_percentiles"][0]) == 3
assert "Min time/rank_percentiles_25" in th_ens.statsframe.dataframe.columns
assert "Min time/rank_percentiles_50" in th_ens.statsframe.dataframe.columns
assert "Min time/rank_percentiles_75" in th_ens.statsframe.dataframe.columns

assert (
"Min time/rank_percentiles"
"Min time/rank_percentiles_25"
in th_ens.statsframe.exc_metrics + th_ens.statsframe.inc_metrics
)
assert (
"Min time/rank_percentiles_50"
in th_ens.statsframe.exc_metrics + th_ens.statsframe.inc_metrics
)
assert (
"Min time/rank_percentiles_75"
in th_ens.statsframe.exc_metrics + th_ens.statsframe.inc_metrics
)
assert "Min time/rank_percentiles_25" in th_ens.statsframe.show_metric_columns()
assert "Min time/rank_percentiles_50" in th_ens.statsframe.show_metric_columns()
assert "Min time/rank_percentiles_75" in th_ens.statsframe.show_metric_columns()


def test_percentiles_none(example_cali):
th_ens = th.Thicket.from_caliperreader(example_cali)

th.percentiles(th_ens, columns=["Min time/rank"], percentiles=None)

assert "Min time/rank_percentiles_25" in th_ens.statsframe.dataframe.columns
assert "Min time/rank_percentiles_50" in th_ens.statsframe.dataframe.columns
assert "Min time/rank_percentiles_75" in th_ens.statsframe.dataframe.columns


def test_percentiles_single_value(example_cali):
th_ens = th.Thicket.from_caliperreader(example_cali)

th.percentiles(th_ens, columns=["Min time/rank"], percentiles=[0.3])

assert "Min time/rank_percentiles_30" in th_ens.statsframe.dataframe.columns

assert (
"Min time/rank_percentiles_30"
in th_ens.statsframe.exc_metrics + th_ens.statsframe.inc_metrics
)
assert "Min time/rank_percentiles" in th_ens.statsframe.show_metric_columns()

assert "Min time/rank_percentiles_30" in th_ens.statsframe.show_metric_columns()


def test_percentiles_columnar_join(thicket_axis_columns):
Expand All @@ -229,15 +265,55 @@ def test_percentiles_columnar_join(thicket_axis_columns):

assert (
idx,
"Min time/rank_percentiles",
"Min time/rank_percentiles_25",
) in combined_th.statsframe.dataframe.columns
assert (
idx,
"Min time/rank_percentiles_50",
) in combined_th.statsframe.dataframe.columns
assert (
idx,
"Min time/rank_percentiles_75",
) in combined_th.statsframe.dataframe.columns
assert (
idx,
"Min time/rank_percentiles_25",
) in combined_th.statsframe.exc_metrics + combined_th.statsframe.inc_metrics
assert (
idx,
"Min time/rank_percentiles_50",
) in combined_th.statsframe.exc_metrics + combined_th.statsframe.inc_metrics
assert (
idx,
"Min time/rank_percentiles_75",
) in combined_th.statsframe.exc_metrics + combined_th.statsframe.inc_metrics

assert (
idx,
"Min time/rank_percentiles_25",
) in combined_th.statsframe.show_metric_columns()
assert (
idx,
"Min time/rank_percentiles_50",
) in combined_th.statsframe.show_metric_columns()
assert (
idx,
"Min time/rank_percentiles_75",
) in combined_th.statsframe.show_metric_columns()

th.percentiles(combined_th, columns=[(idx, "Min time/rank")], percentiles=[0.4])

assert (
idx,
"Min time/rank_percentiles_40",
) in combined_th.statsframe.dataframe.columns
assert (
idx,
"Min time/rank_percentiles",
"Min time/rank_percentiles_40",
) in combined_th.statsframe.exc_metrics + combined_th.statsframe.inc_metrics
assert (
idx,
"Min time/rank_percentiles",
"Min time/rank_percentiles_40",
) in combined_th.statsframe.show_metric_columns()


Expand Down

0 comments on commit 2d9faf5

Please sign in to comment.