diff --git a/thicket/stats/scoring.py b/thicket/stats/scoring.py index cee9a40f..e104b2fa 100644 --- a/thicket/stats/scoring.py +++ b/thicket/stats/scoring.py @@ -6,59 +6,66 @@ from .std import std import math -def _scoring_1(means, stds, num_nodes): +def _scoring_1(means_1, means_2, stds_1, stds_2, num_nodes): results = [] for i in range(num_nodes): - result = (means[0][1] - means[1][i]) * ( (stds[0][i] - stds[1][i]) / (np.abs(means[0][1] - means[1][i]))) + result = None + try: + result = (means_1[i] - means_2[i]) * ((stds_1[i] - stds_2[i]) / (np.abs(means_1[i] - means_2[i]))) + except RuntimeWarning: + print("Score 1 means's: ", means_1[i], means_2[i], i) + result = np.nan results.append(result) - + return results -def _scoring_2(means, stds, num_nodes): +def _scoring_2(means_1, means_2, stds_1, stds_2, num_nodes): results = [] for i in range(num_nodes): - result = (means[0][1] - means[1][i]) + (stds[0][i] / means[0][i]) - (stds[1][i] / means[1][i]) + result = (means_1[i] - means_2[i]) + (stds_1[i] / means_1[i]) - (stds_2[i] / means_2[i]) results.append(result) return results -def _scoring_3(means, stds, num_nodes): +def _scoring_3(means_1, means_2, stds_1, stds_2, num_nodes): results = [] for i in range(num_nodes): result = None try: - result = 0.25 * np.log(0.25 * ( (stds[0][i] ** 2 /stds[1][i] ** 2 ) + (stds[1][i] ** 2 /stds[0][i] ** 2))) +\ - 0.25 * ( (means[0][i] - means[1][i]) **2 / (stds[0][i] ** 2 + stds[1][i] ** 2) ) + result = 0.25 * np.log(0.25 * ( (stds_1[i] ** 2 / stds_2[i] ** 2 ) + (stds_2[i] ** 2 /stds_1[i] ** 2) + 2) ) +\ + 0.25 * ( (means_1[i] - means_2[i]) **2 / (stds_1[i] ** 2 + stds_2[i] ** 2) ) except ZeroDivisionError: - print("Score 3 std's: ", stds[0][i], stds[1][i], i) + print("Score 3 std's: ", stds_1[i], stds_2[i], i) result = np.nan results.append(result) return results -def _scoring_4(means, stds, num_nodes): +def _scoring_4(means_1, means_2, stds_1, stds_2, num_nodes): results = [] for i in range(num_nodes): result = None try: - result = 1 - math.sqrt((2 * stds[0][i] * stds[1][i]) / (stds[0][i] ** 2 + stds[1][i] ** 2)) *\ - math.exp(-0.25 * ( (means[0][i] - means[1][i])**2) / (stds[0][i] ** 2 + stds[1][i] ** 2)) + result = 1 - math.sqrt((2 * stds_1[i] * stds_2[i]) / (stds_1[i] ** 2 + stds_2[i] ** 2)) *\ + math.exp(-0.25 * ( (means_1[i] - means_2[i])**2) / (stds_1[i] ** 2 + stds_2[i] ** 2)) except ZeroDivisionError: - print("Score 4 std's: ", stds[0][i], stds[1][i], i) + print("Score 4 std's: ", stds_1[i], stds_2[i], i) result = np.nan results.append(result) return results -def score(thicket, columns, scoring_function): +# Implement warning for user that NAN's were put in stats frame, and why + +def score(thicket, columns, output_column_name, scoring_function): if isinstance(columns, list) is False: raise ValueError( @@ -72,24 +79,16 @@ def score(thicket, columns, scoring_function): "Columns listed in columns must be a tuple!" ) - if thicket.dataframe.columns.nlevels != 2: + if thicket.dataframe.columns.nlevels == 1: raise ValueError( "Thicket passed in must be a columnar joined thicket" ) - # Right now I have only two columns going in since we only have two compilers, - # But going forward I'm not sure we need this check. Or we can at least check if columns is at least 1 if len(columns) != 2: raise ValueError( "Must specify two columns" ) - # Scoring across targets must have the same column - if columns[0][1] != columns[1][1]: - raise ValueError( - "Columns to score must be the same column!" - ) - num_nodes = len(thicket.dataframe.index.get_level_values(0).unique()) if num_nodes < 2: @@ -99,54 +98,40 @@ def score(thicket, columns, scoring_function): verify_thicket_structures(thicket.dataframe, columns) - # Note: Right now we are dealing with two columns because I am making the assumption - # That the scoring will only be applied to two targets (clang vs. intel). We need to discuss - # how scoring should work if there are three targets, ie, introducing a third target like gcc or - # something like that. - - means = [[], []] - stds = [[], []] - # Calculate means and stds, adds both onto statsframe mean(thicket, columns) std(thicket, columns) - """ - This is where I would put in the logic to group targets if thicket object has more than two targets. - example: - If thicket has Clang, Intel, and gcc - The scoring would be applied to all three as such: - Score (Clang, Intel), (Clang, gcc), (Intel, gcc) - - All three would be appended to the statsframe + # Grab means and stds calculated from above + means_target1 = thicket.statsframe.dataframe[(columns[0][0], "{}_mean".format(columns[0][1]))].to_list() + means_target2 = thicket.statsframe.dataframe[(columns[1][0], "{}_mean".format(columns[1][1]))].to_list() + stds_target1 = thicket.statsframe.dataframe[(columns[0][0], "{}_std".format(columns[0][1]))].to_list() + stds_target2 = thicket.statsframe.dataframe[(columns[1][0], "{}_std".format(columns[1][1]))].to_list() - Need discussion on this! - """ - - means[0] = thicket.statsframe.dataframe[(columns[0][0], "{}_mean".format(columns[0][1]))].to_list() - means[1] = thicket.statsframe.dataframe[(columns[1][0], "{}_mean".format(columns[1][1]))].to_list() - stds[0] = thicket.statsframe.dataframe[(columns[0][0], "{}_std".format(columns[0][1]))].to_list() - stds[1] = thicket.statsframe.dataframe[(columns[1][0], "{}_std".format(columns[1][1]))].to_list() - - #This is where we call the scoring function that the user specified! - resulting_scores = scoring_function(means, stds, num_nodes) + # Call the scoring function that the user specified + resulting_scores = scoring_function(means_target1, means_target2, stds_target1, stds_target2, num_nodes) - # Statsframe column naming: - # (Scoring, columnName_target1_target2_scoringFunctionName) - stats_frame_column_name = ("Scoring", "{}_{}_{}_{}".format(columns[0][1], columns[0][0], columns[1][0], scoring_function.__name__)) + # User can specify a column name for the statsframe, otherwise default it to: + # "target1_column1_target2_column2_scoreFunctionName" + stats_frame_column_name = None + + if output_column_name == None: + stats_frame_column_name = ("Scoring", "{}_{}_{}_{}_{}".format(columns[0][0], columns[0][1], columns[1][0], columns[1][1], scoring_function.__name__)) + else: + stats_frame_column_name = output_column_name thicket.statsframe.dataframe[stats_frame_column_name] = resulting_scores return -def scoring_1(thicket, columns): - score(thicket, columns, _scoring_1) +def scoring_1(thicket, columns, output_column_name = None): + score(thicket, columns, output_column_name, _scoring_1) -def scoring_2(thicket, columns): - score(thicket, columns, _scoring_2) +def scoring_2(thicket, columns, output_column_name = None): + score(thicket, columns, output_column_name, _scoring_2) -def scoring_3(thicket, columns): - score(thicket, columns, _scoring_3) +def scoring_3(thicket, columns, output_column_name = None): + score(thicket, columns, output_column_name, _scoring_3) -def scoring_4(thicket, columns): - score(thicket, columns, _scoring_4) +def scoring_4(thicket, columns, output_column_name = None): + score(thicket, columns, output_column_name, _scoring_4)