Most up-to-date scoring.py file (hand-off)

TauferLab · Mar 6, 2024 · 22df8ef · 22df8ef
1 parent 1d027ff
commit 22df8ef
Showing 1 changed file with 45 additions and 60 deletions.
diff --git a/thicket/stats/scoring.py b/thicket/stats/scoring.py
@@ -6,59 +6,66 @@
 from .std import std
 import math
 
-def _scoring_1(means, stds, num_nodes):
+def _scoring_1(means_1, means_2, stds_1, stds_2, num_nodes):
 
     results = []
 
     for i in range(num_nodes):
-        result = (means[0][1] - means[1][i])  * ( (stds[0][i] - stds[1][i]) / (np.abs(means[0][1] - means[1][i])))
+        result = None
+        try:
+            result = (means_1[i] - means_2[i]) * ((stds_1[i] - stds_2[i]) / (np.abs(means_1[i] - means_2[i])))
+        except RuntimeWarning:
+            print("Score 1 means's: ", means_1[i], means_2[i], i)
+            result = np.nan
         results.append(result)
-    
+
     return results
 
-def _scoring_2(means, stds, num_nodes):
+def _scoring_2(means_1, means_2, stds_1, stds_2, num_nodes):
 
     results = []
 
     for i in range(num_nodes):
-        result = (means[0][1] - means[1][i]) + (stds[0][i] / means[0][i])  - (stds[1][i] / means[1][i])
+        result = (means_1[i] - means_2[i]) + (stds_1[i] / means_1[i])  - (stds_2[i] / means_2[i])
         results.append(result)
 
     return results
 
-def _scoring_3(means, stds, num_nodes):
+def _scoring_3(means_1, means_2, stds_1, stds_2, num_nodes):
 
     results = []
 
     for i in range(num_nodes):
         result = None
         try:
-            result = 0.25 * np.log(0.25 * ( (stds[0][i] ** 2 /stds[1][i] ** 2 ) + (stds[1][i] ** 2 /stds[0][i] ** 2))) +\
-                            0.25 * ( (means[0][i] - means[1][i]) **2 / (stds[0][i] ** 2 + stds[1][i] ** 2) )
+            result = 0.25 * np.log(0.25 * ( (stds_1[i] ** 2 / stds_2[i] ** 2 ) + (stds_2[i] ** 2 /stds_1[i] ** 2) + 2) ) +\
+                            0.25 * ( (means_1[i] - means_2[i]) **2 / (stds_1[i] ** 2 + stds_2[i] ** 2) )
         except ZeroDivisionError:
-            print("Score 3 std's: ", stds[0][i], stds[1][i], i)
+            print("Score 3 std's: ", stds_1[i], stds_2[i], i)
             result = np.nan
         results.append(result)
 
     return results
 
-def _scoring_4(means, stds, num_nodes):
+def _scoring_4(means_1, means_2, stds_1, stds_2, num_nodes):
 
     results = []
 
     for i in range(num_nodes):
         result = None
         try:
-            result = 1 - math.sqrt((2 * stds[0][i] * stds[1][i]) / (stds[0][i] ** 2 + stds[1][i] ** 2)) *\
-                math.exp(-0.25 * ( (means[0][i] - means[1][i])**2) / (stds[0][i] ** 2 + stds[1][i] ** 2))
+            result = 1 - math.sqrt((2 * stds_1[i] * stds_2[i]) / (stds_1[i] ** 2 + stds_2[i] ** 2)) *\
+                math.exp(-0.25 * ( (means_1[i] - means_2[i])**2) / (stds_1[i] ** 2 + stds_2[i] ** 2))
         except ZeroDivisionError:
-            print("Score 4 std's: ", stds[0][i], stds[1][i], i)
+            print("Score 4 std's: ", stds_1[i], stds_2[i], i)
             result = np.nan
         results.append(result)
 
     return results
 
-def score(thicket, columns, scoring_function):
+# TODO: Warn the user when NaNs are written to the statsframe, and explain why.
+
+def score(thicket, columns, output_column_name, scoring_function):
 
     if isinstance(columns, list) is False:
         raise ValueError(
@@ -72,24 +79,16 @@ def score(thicket, columns, scoring_function):
                 "Columns listed in columns must be a tuple!"
             )
 
-    if thicket.dataframe.columns.nlevels != 2:
+    if thicket.dataframe.columns.nlevels == 1:
         raise ValueError(
                 "Thicket passed in must be a columnar joined thicket"
             )
 
-    # Right now I have only two columns going in since we only have two compilers,
-    # But going forward I'm not sure we need this check. Or we can at least check if columns is at least 1
     if len(columns) != 2:
         raise ValueError(
             "Must specify two columns"
         )
 
-    # Scoring across targets must have the same column
-    if columns[0][1] != columns[1][1]:
-        raise ValueError(
-            "Columns to score must be the same column!"
-        )
-
     num_nodes = len(thicket.dataframe.index.get_level_values(0).unique())
 
     if num_nodes < 2:
@@ -99,54 +98,40 @@ def score(thicket, columns, scoring_function):
 
     verify_thicket_structures(thicket.dataframe, columns)
 
-    # Note: Right now we are dealing with two columns because I am making the assumption
-    # That the scoring will only be applied to two targets (clang vs. intel). We need to discuss
-    # how scoring should work if there are three targets, ie, introducing a third target like gcc or
-    # something like that. 
-
-    means = [[], []]
-    stds = [[], []]
-
     # Calculate means and stds, adds both onto statsframe
     mean(thicket, columns)
     std(thicket, columns)
 
-    """
-        This is where I would put in the logic to group targets if thicket object has more than two targets.
-        example:
-            If thicket has Clang, Intel, and gcc
-            The scoring would be applied to all three as such:
-            Score (Clang, Intel), (Clang, gcc), (Intel, gcc)
-
-            All three would be appended to the statsframe
+    # Grab means and stds calculated from above
+    means_target1 = thicket.statsframe.dataframe[(columns[0][0], "{}_mean".format(columns[0][1]))].to_list()
+    means_target2 = thicket.statsframe.dataframe[(columns[1][0], "{}_mean".format(columns[1][1]))].to_list()
+    stds_target1 = thicket.statsframe.dataframe[(columns[0][0], "{}_std".format(columns[0][1]))].to_list()
+    stds_target2 = thicket.statsframe.dataframe[(columns[1][0], "{}_std".format(columns[1][1]))].to_list()
 
-            Need discussion on this!
-    """
-
-    means[0] = thicket.statsframe.dataframe[(columns[0][0], "{}_mean".format(columns[0][1]))].to_list()
-    means[1] = thicket.statsframe.dataframe[(columns[1][0], "{}_mean".format(columns[1][1]))].to_list()
-    stds[0] = thicket.statsframe.dataframe[(columns[0][0], "{}_std".format(columns[0][1]))].to_list()
-    stds[1] = thicket.statsframe.dataframe[(columns[1][0], "{}_std".format(columns[1][1]))].to_list()
-
-    #This is where we call the scoring function that the user specified!
-    resulting_scores = scoring_function(means, stds, num_nodes)
+    # Call the scoring function that the user specified
+    resulting_scores = scoring_function(means_target1, means_target2, stds_target1, stds_target2, num_nodes)
 
-    # Statsframe column naming:
-    #   (Scoring, columnName_target1_target2_scoringFunctionName)
-    stats_frame_column_name = ("Scoring", "{}_{}_{}_{}".format(columns[0][1], columns[0][0], columns[1][0], scoring_function.__name__))
+    # User can specify a column name for the statsframe, otherwise default it to:
+    #   "target1_column1_target2_column2_scoreFunctionName"
+    stats_frame_column_name = None
+
+    if output_column_name == None:
+        stats_frame_column_name = ("Scoring", "{}_{}_{}_{}_{}".format(columns[0][0], columns[0][1], columns[1][0], columns[1][1], scoring_function.__name__))
+    else:
+        stats_frame_column_name = output_column_name
 
     thicket.statsframe.dataframe[stats_frame_column_name] = resulting_scores
 
     return
 
-def scoring_1(thicket, columns):
-    score(thicket, columns, _scoring_1)
+def scoring_1(thicket, columns, output_column_name = None):
+    score(thicket, columns, output_column_name, _scoring_1)
 
-def scoring_2(thicket, columns):
-    score(thicket, columns, _scoring_2)
+def scoring_2(thicket, columns, output_column_name = None):
+    score(thicket, columns, output_column_name,  _scoring_2)
 
-def scoring_3(thicket, columns):
-    score(thicket, columns, _scoring_3)
+def scoring_3(thicket, columns, output_column_name = None):
+    score(thicket, columns, output_column_name, _scoring_3)
 
-def scoring_4(thicket, columns):
-    score(thicket, columns, _scoring_4)
+def scoring_4(thicket, columns, output_column_name = None):
+    score(thicket, columns, output_column_name, _scoring_4)