Skip to content

Commit

Permalink
Most up to date scoring.py file, hand off
Browse files Browse the repository at this point in the history
  • Loading branch information
AndyM1098 authored and Yejashi committed Mar 6, 2024
1 parent 1d027ff commit 22df8ef
Showing 1 changed file with 45 additions and 60 deletions.
105 changes: 45 additions & 60 deletions thicket/stats/scoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,59 +6,66 @@
from .std import std
import math

def _scoring_1(means_1, means_2, stds_1, stds_2, num_nodes):
    """Score each node by the std difference, signed by the mean difference.

    For node ``i`` the score is
    ``(m1 - m2) * ((s1 - s2) / abs(m1 - m2))`` where ``m*``/``s*`` are the
    i-th entries of the mean and std sequences.

    Args:
        means_1: Indexable sequence of means for the first target.
        means_2: Indexable sequence of means for the second target.
        stds_1: Indexable sequence of standard deviations for the first target.
        stds_2: Indexable sequence of standard deviations for the second target.
        num_nodes: Number of leading entries to score.

    Returns:
        list: ``num_nodes`` scores. An entry is ``np.nan`` when the two means
        are equal, because the normalising denominator is then zero.
    """
    results = []

    for i in range(num_nodes):
        mean_diff = means_1[i] - means_2[i]
        if mean_diff == 0:
            # Guard explicitly: dividing by np.abs(0) does not raise, NumPy
            # only emits a RuntimeWarning, so an ``except`` clause would never
            # run and the diagnostic below would never be printed.
            print("Score 1 means's: ", means_1[i], means_2[i], i)
            result = np.nan
        else:
            result = mean_diff * ((stds_1[i] - stds_2[i]) / np.abs(mean_diff))
        results.append(result)

    return results

def _scoring_2(means_1, means_2, stds_1, stds_2, num_nodes):
    """Score each node by mean difference plus the difference of std/mean ratios.

    For node ``i`` the score is ``(m1 - m2) + (s1 / m1) - (s2 / m2)`` where
    ``m*``/``s*`` are the i-th entries of the mean and std sequences.

    Args:
        means_1: Indexable sequence of means for the first target.
        means_2: Indexable sequence of means for the second target.
        stds_1: Indexable sequence of standard deviations for the first target.
        stds_2: Indexable sequence of standard deviations for the second target.
        num_nodes: Number of leading entries to score.

    Returns:
        list: ``num_nodes`` scores. An entry is ``np.nan`` when either mean is
        zero, since the corresponding std/mean ratio is undefined.
    """
    results = []

    for i in range(num_nodes):
        if means_1[i] == 0 or means_2[i] == 0:
            # A zero mean would otherwise raise ZeroDivisionError and abort
            # the whole scoring pass; record NaN for this node instead, as the
            # other scoring functions in this module do.
            print("Score 2 means's: ", means_1[i], means_2[i], i)
            result = np.nan
        else:
            result = (
                (means_1[i] - means_2[i])
                + (stds_1[i] / means_1[i])
                - (stds_2[i] / means_2[i])
            )
        results.append(result)

    return results

def _scoring_3(means_1, means_2, stds_1, stds_2, num_nodes):
    """Score each node with a log-variance-ratio term plus a mean-separation term.

    For node ``i`` with variances ``v1 = s1 ** 2`` and ``v2 = s2 ** 2`` the
    score is::

        0.25 * log(0.25 * (v1 / v2 + v2 / v1 + 2))
        + 0.25 * ((m1 - m2) ** 2 / (v1 + v2))

    This expression has the form of the Bhattacharyya distance between two
    univariate normal distributions.

    Args:
        means_1: Indexable sequence of means for the first target.
        means_2: Indexable sequence of means for the second target.
        stds_1: Indexable sequence of standard deviations for the first target.
        stds_2: Indexable sequence of standard deviations for the second target.
        num_nodes: Number of leading entries to score.

    Returns:
        list: ``num_nodes`` scores. An entry is ``np.nan`` when either
        variance is zero, because the variance ratios are then undefined.
    """
    results = []

    for i in range(num_nodes):
        var_1 = stds_1[i] ** 2
        var_2 = stds_2[i] ** 2
        if var_1 == 0 or var_2 == 0:
            # Test the variances directly rather than catching
            # ZeroDivisionError: NumPy scalars never raise it (they yield inf
            # with a RuntimeWarning), so the outcome would otherwise depend on
            # whether the inputs hold Python floats or NumPy floats.
            print("Score 3 std's: ", stds_1[i], stds_2[i], i)
            result = np.nan
        else:
            spread_term = 0.25 * np.log(0.25 * (var_1 / var_2 + var_2 / var_1 + 2))
            separation_term = 0.25 * (
                (means_1[i] - means_2[i]) ** 2 / (var_1 + var_2)
            )
            result = spread_term + separation_term
        results.append(result)

    return results

def _scoring_4(means_1, means_2, stds_1, stds_2, num_nodes):
    """Score each node as one minus an overlap coefficient of the two targets.

    For node ``i`` with ``v = s1 ** 2 + s2 ** 2`` the score is::

        1 - sqrt((2 * s1 * s2) / v) * exp(-0.25 * (m1 - m2) ** 2 / v)

    This expression has the form of the squared Hellinger distance between
    two univariate normal distributions.

    Args:
        means_1: Indexable sequence of means for the first target.
        means_2: Indexable sequence of means for the second target.
        stds_1: Indexable sequence of standard deviations for the first target.
        stds_2: Indexable sequence of standard deviations for the second target.
        num_nodes: Number of leading entries to score.

    Returns:
        list: ``num_nodes`` scores. An entry is ``np.nan`` when both standard
        deviations are zero (the summed variance is zero).
    """
    results = []

    for i in range(num_nodes):
        # The summed variance is the denominator of both factors; compute once.
        var_sum = stds_1[i] ** 2 + stds_2[i] ** 2
        if var_sum == 0:
            # Test explicitly rather than catching ZeroDivisionError: NumPy
            # scalars never raise it, so the diagnostic would be skipped and a
            # RuntimeWarning emitted instead when inputs are NumPy floats.
            print("Score 4 std's: ", stds_1[i], stds_2[i], i)
            result = np.nan
        else:
            scale = math.sqrt((2 * stds_1[i] * stds_2[i]) / var_sum)
            decay = math.exp(-0.25 * ((means_1[i] - means_2[i]) ** 2) / var_sum)
            result = 1 - scale * decay
        results.append(result)

    return results

def score(thicket, columns, scoring_function):
# TODO: warn the user when NaNs are written to the statsframe, and explain why.

def score(thicket, columns, output_column_name, scoring_function):

if isinstance(columns, list) is False:
raise ValueError(
Expand All @@ -72,24 +79,16 @@ def score(thicket, columns, scoring_function):
"Columns listed in columns must be a tuple!"
)

if thicket.dataframe.columns.nlevels != 2:
if thicket.dataframe.columns.nlevels == 1:
raise ValueError(
"Thicket passed in must be a columnar joined thicket"
)

# Right now I have only two columns going in since we only have two compilers,
# But going forward I'm not sure we need this check. Or we can at least check if columns is at least 1
if len(columns) != 2:
raise ValueError(
"Must specify two columns"
)

# Scoring across targets must have the same column
if columns[0][1] != columns[1][1]:
raise ValueError(
"Columns to score must be the same column!"
)

num_nodes = len(thicket.dataframe.index.get_level_values(0).unique())

if num_nodes < 2:
Expand All @@ -99,54 +98,40 @@ def score(thicket, columns, scoring_function):

verify_thicket_structures(thicket.dataframe, columns)

# Note: Right now we are dealing with two columns because I am making the assumption
# That the scoring will only be applied to two targets (clang vs. intel). We need to discuss
# how scoring should work if there are three targets, ie, introducing a third target like gcc or
# something like that.

means = [[], []]
stds = [[], []]

# Calculate means and stds, adds both onto statsframe
mean(thicket, columns)
std(thicket, columns)

"""
This is where I would put in the logic to group targets if thicket object has more than two targets.
example:
If thicket has Clang, Intel, and gcc
The scoring would be applied to all three as such:
Score (Clang, Intel), (Clang, gcc), (Intel, gcc)
All three would be appended to the statsframe
# Grab means and stds calculated from above
means_target1 = thicket.statsframe.dataframe[(columns[0][0], "{}_mean".format(columns[0][1]))].to_list()
means_target2 = thicket.statsframe.dataframe[(columns[1][0], "{}_mean".format(columns[1][1]))].to_list()
stds_target1 = thicket.statsframe.dataframe[(columns[0][0], "{}_std".format(columns[0][1]))].to_list()
stds_target2 = thicket.statsframe.dataframe[(columns[1][0], "{}_std".format(columns[1][1]))].to_list()

Need discussion on this!
"""

means[0] = thicket.statsframe.dataframe[(columns[0][0], "{}_mean".format(columns[0][1]))].to_list()
means[1] = thicket.statsframe.dataframe[(columns[1][0], "{}_mean".format(columns[1][1]))].to_list()
stds[0] = thicket.statsframe.dataframe[(columns[0][0], "{}_std".format(columns[0][1]))].to_list()
stds[1] = thicket.statsframe.dataframe[(columns[1][0], "{}_std".format(columns[1][1]))].to_list()

#This is where we call the scoring function that the user specified!
resulting_scores = scoring_function(means, stds, num_nodes)
# Call the scoring function that the user specified
resulting_scores = scoring_function(means_target1, means_target2, stds_target1, stds_target2, num_nodes)

# Statsframe column naming:
# (Scoring, columnName_target1_target2_scoringFunctionName)
stats_frame_column_name = ("Scoring", "{}_{}_{}_{}".format(columns[0][1], columns[0][0], columns[1][0], scoring_function.__name__))
# User can specify a column name for the statsframe, otherwise default it to:
# "target1_column1_target2_column2_scoreFunctionName"
stats_frame_column_name = None

if output_column_name == None:
stats_frame_column_name = ("Scoring", "{}_{}_{}_{}_{}".format(columns[0][0], columns[0][1], columns[1][0], columns[1][1], scoring_function.__name__))
else:
stats_frame_column_name = output_column_name

thicket.statsframe.dataframe[stats_frame_column_name] = resulting_scores

return

def scoring_1(thicket, columns, output_column_name=None):
    """Run ``score`` on ``thicket`` with ``_scoring_1`` as the scoring function.

    Args:
        thicket: Thicket object forwarded unchanged to ``score``.
        columns: Column selection forwarded unchanged to ``score``.
        output_column_name: Optional name for the resulting statsframe
            column; ``None`` lets ``score`` pick its default name.
    """
    score(
        thicket,
        columns,
        output_column_name=output_column_name,
        scoring_function=_scoring_1,
    )

def scoring_2(thicket, columns, output_column_name=None):
    """Run ``score`` on ``thicket`` with ``_scoring_2`` as the scoring function.

    Args:
        thicket: Thicket object forwarded unchanged to ``score``.
        columns: Column selection forwarded unchanged to ``score``.
        output_column_name: Optional name for the resulting statsframe
            column; ``None`` lets ``score`` pick its default name.
    """
    score(
        thicket,
        columns,
        output_column_name=output_column_name,
        scoring_function=_scoring_2,
    )

def scoring_3(thicket, columns, output_column_name=None):
    """Run ``score`` on ``thicket`` with ``_scoring_3`` as the scoring function.

    Args:
        thicket: Thicket object forwarded unchanged to ``score``.
        columns: Column selection forwarded unchanged to ``score``.
        output_column_name: Optional name for the resulting statsframe
            column; ``None`` lets ``score`` pick its default name.
    """
    score(
        thicket,
        columns,
        output_column_name=output_column_name,
        scoring_function=_scoring_3,
    )

def scoring_4(thicket, columns, output_column_name=None):
    """Run ``score`` on ``thicket`` with ``_scoring_4`` as the scoring function.

    Args:
        thicket: Thicket object forwarded unchanged to ``score``.
        columns: Column selection forwarded unchanged to ``score``.
        output_column_name: Optional name for the resulting statsframe
            column; ``None`` lets ``score`` pick its default name.
    """
    score(
        thicket,
        columns,
        output_column_name=output_column_name,
        scoring_function=_scoring_4,
    )

0 comments on commit 22df8ef

Please sign in to comment.