Merge branch 'rl4sem' into develop
Conflicts:
	src/sensai/data/io_data.py
	src/sensai/evaluation/eval_stats/eval_stats_base.py
opcode81 committed Nov 29, 2024
2 parents 9c5530c + 97c2b37 commit 382248c
Showing 26 changed files with 1,049 additions and 163 deletions.
6 changes: 6 additions & 0 deletions src/sensai/catboost.py
@@ -44,6 +44,9 @@ def _update_model_args(self, inputs: pd.DataFrame, outputs: pd.DataFrame):
self.log.info(f"Updating model parameters with {args}")
self.modelArgs.update(args)

def is_sample_weight_supported(self) -> bool:
return True


# noinspection DuplicatedCode
class CatBoostVectorClassificationModel(AbstractSkLearnVectorClassificationModel):
@@ -79,3 +82,6 @@ def _update_model_args(self, inputs: pd.DataFrame, outputs: pd.DataFrame):
args = {"cat_features": col_indices}
self.log.info(f"Updating model parameters with {args}")
self.modelArgs.update(args)

def is_sample_weight_supported(self) -> bool:
return True
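
The two `is_sample_weight_supported` overrides plug into the sample-weight plumbing that this merge threads through the model hierarchy: the base class can check the flag and warn when weights are passed to a model that ignores them (see `_warn_sample_weights_unsupported` in the ensemble_base.py diff below). A minimal standalone sketch of that pattern, as an illustration only — the actual sensAI base-class code is not part of this diff:

```python
import logging
from abc import ABC, abstractmethod
from typing import Optional

import pandas as pd

log = logging.getLogger(__name__)


class WeightAwareModel(ABC):
    """Illustrative stand-in for the base-class behavior implied by this diff."""

    def is_sample_weight_supported(self) -> bool:
        return False  # subclasses that forward weights (e.g. the CatBoost models above) return True

    def fit(self, x: pd.DataFrame, y: pd.DataFrame, weights: Optional[pd.Series] = None):
        # drop weights with a warning if the concrete model cannot use them
        if weights is not None and not self.is_sample_weight_supported():
            log.warning(f"{self.__class__.__name__} does not support sample weights; ignoring them")
            weights = None
        self._fit(x, y, weights)

    @abstractmethod
    def _fit(self, x: pd.DataFrame, y: pd.DataFrame, weights: Optional[pd.Series] = None):
        pass
```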
116 changes: 109 additions & 7 deletions src/sensai/data/io_data.py
@@ -1,13 +1,15 @@
import logging
import math
import random
from abc import ABC, abstractmethod
from typing import Tuple, Sequence, TypeVar, Generic
from typing import Tuple, Sequence, TypeVar, Generic, Optional, Union

import numpy as np
import pandas as pd
import scipy.stats
from sklearn.model_selection import StratifiedShuffleSplit

from ..util.pickle import setstate
from ..util.string import ToStringMixin

log = logging.getLogger(__name__)
@@ -57,8 +59,14 @@ class InputOutputData(BaseInputOutputData[pd.DataFrame], ToStringMixin):
"""
Holds input and output data for learning problems
"""
def __init__(self, inputs: pd.DataFrame, outputs: pd.DataFrame):
def __init__(self, inputs: pd.DataFrame, outputs: pd.DataFrame, weights: Optional[Union[pd.Series, "DataPointWeighting"]] = None):
super().__init__(inputs, outputs)
if isinstance(weights, DataPointWeighting):
weights = weights.compute_weights(inputs, outputs)
self.weights = weights

def __setstate__(self, state):
setstate(InputOutputData, self, state, new_optional_properties=["weights"])

def _tostring_object_info(self) -> str:
return f"N={len(self.inputs)}, numInputColumns={len(self.inputs.columns)}, numOutputColumns={len(self.outputs.columns)}"
@@ -74,15 +82,35 @@ def from_data_frame(cls, df: pd.DataFrame, *output_columns: str) -> "InputOutput
outputs = df[list(output_columns)]
return cls(inputs, outputs)

def to_data_frame(self, add_weights: bool = False, weights_col_name: str = "weights") -> pd.DataFrame:
"""
:param add_weights: whether to add the weights as a column (provided that weights are present)
:param weights_col_name: the column name to use for weights if `add_weights` is True
:return: a data frame containing both the inputs and outputs (and optionally the weights)
"""
df = pd.concat([self.inputs, self.outputs], axis=1)
if add_weights and self.weights is not None:
df[weights_col_name] = self.weights
return df

def to_df(self, add_weights: bool = False, weights_col_name: str = "weights") -> pd.DataFrame:
return self.to_data_frame(add_weights=add_weights, weights_col_name=weights_col_name)

def filter_indices(self, indices: Sequence[int]) -> __qualname__:
inputs = self.inputs.iloc[indices]
outputs = self.outputs.iloc[indices]
return InputOutputData(inputs, outputs)
weights = None
if self.weights is not None:
weights = self.weights.iloc[indices]
return InputOutputData(inputs, outputs, weights)

def filter_index(self, index_elements: Sequence[any]) -> __qualname__:
inputs = self.inputs.loc[index_elements]
outputs = self.outputs.loc[index_elements]
return InputOutputData(inputs, outputs)
weights = None
if self.weights is not None:
weights = self.weights
return InputOutputData(inputs, outputs, weights)

@property
def input_dim(self):
@@ -103,8 +131,8 @@ def compute_input_output_correlation(self):
correlations[outputCol][inputCol] = pcc
return correlations

def to_df(self) -> pd.DataFrame:
return pd.concat((self.inputs, self.outputs), axis=1)
def apply_weighting(self, weighting: "DataPointWeighting"):
self.weights = weighting.compute_weights(self.inputs, self.outputs)


TInputOutputData = TypeVar("TInputOutputData", bound=BaseInputOutputData)
@@ -267,4 +295,78 @@ def compute_split_indices(self, df: pd.DataFrame, fractional_size_of_first_set:
first_set_indices.append(i)
else:
second_set_indices.append(i)
return first_set_indices, second_set_indices
return first_set_indices, second_set_indices


class DataPointWeighting(ABC):
@abstractmethod
def compute_weights(self, x: pd.DataFrame, y: pd.DataFrame) -> pd.Series:
pass


class DataPointWeightingRegressionTargetIntervalTotalWeight(DataPointWeighting):
"""
Based on relative weights specified for intervals of the regression target,
will weight individual data point weights such that the sum of weights of data points within each interval
satisfies the user-specified relative weight, while ensuring that the total weight of all data points
is still equal to the number of data points.
For example, if one specifies `interval_weights` as [(0.5, 1), (inf, 2)], then the data points with target values
up to 0.5 will get 1/3 of the weight and the remaining data points will get 2/3 of the weight.
So if there are 100 data points and 50 of them are in the first interval (up to 0.5), then these 50 data points
will each get weight 1/3*100/50=2/3 and the remaining 50 data points will each get weight 2/3*100/50=4/3.
The sum of all weights is the number of data points, i.e. 100.
Example:
>>> targets = [0.1, 0.2, 0.5, 0.7, 0.8, 0.6]
>>> x = pd.DataFrame({"foo": np.zeros(len(targets))})
>>> y = pd.DataFrame({"target": targets})
>>> weighting = DataPointWeightingRegressionTargetIntervalTotalWeight([(0.5, 1), (1.0, 2)])
>>> weights = weighting.compute_weights(x, y)
>>> assert(np.isclose(weights.sum(), len(y)))
>>> weights.tolist()
[0.6666666666666666,
0.6666666666666666,
0.6666666666666666,
1.3333333333333333,
1.3333333333333333,
1.3333333333333333]
"""
def __init__(self, intervals_weights: Sequence[Tuple[float, float]]):
"""
:param intervals_weights: a sequence of tuples (upper_bound, rel_total_weight) where upper_bound is the upper bound
of the interval, `(lower_bound, upper_bound]`; `lower_bound` is the upper bound of the preceding interval
or -inf for the first interval. `rel_total_weight` specifies the relative weight of all data points within
the interval.
"""
a = -math.inf
sum_rel_weights = sum(t[1] for t in intervals_weights)
self.intervals = []
for b, rel_weight in intervals_weights:
self.intervals.append(self.Interval(a, b, rel_weight / sum_rel_weights))
a = b

class Interval:
def __init__(self, a: float, b: float, weight_fraction: float):
self.a = a
self.b = b
self.weight_fraction = weight_fraction

def contains(self, x: float):
return self.a < x <= self.b

def compute_weights(self, x: pd.DataFrame, y: pd.DataFrame) -> pd.Series:
assert len(y.columns) == 1, f"Only a single regression target is supported {self.__class__.__name__}"
targets = y.iloc[:, 0]
n = len(x)
weights = np.zeros(n)
num_weighted = 0
for interval in self.intervals:
mask = np.array([interval.contains(x) for x in targets])
subset_size = mask.sum()
num_weighted += subset_size
weights[mask] = interval.weight_fraction * n / subset_size
if num_weighted != n:
raise Exception("Not all data points were weighted. Most likely, the intervals do not cover the entire range of targets")
return pd.Series(weights, index=x.index)
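
Taken together with the new `weights` parameter of `InputOutputData`, the weighting class can be used in two equivalent ways: passed at construction time (where `__init__` resolves it to a `pd.Series`) or attached afterwards via `apply_weighting`. A short usage sketch based only on the code added above; the import path mirrors the file location and may differ from the package's public exports:

```python
import numpy as np
import pandas as pd

from sensai.data.io_data import (
    DataPointWeightingRegressionTargetIntervalTotalWeight,
    InputOutputData,
)

targets = [0.1, 0.2, 0.5, 0.7, 0.8, 0.6]
x = pd.DataFrame({"foo": np.zeros(len(targets))})
y = pd.DataFrame({"target": targets})
weighting = DataPointWeightingRegressionTargetIntervalTotalWeight([(0.5, 1), (1.0, 2)])

# Variant 1: pass the weighting scheme directly at construction time
io_data = InputOutputData(x, y, weights=weighting)

# Variant 2: attach weights to existing data after the fact
io_data2 = InputOutputData(x, y)
io_data2.apply_weighting(weighting)

# weights sum to the number of data points and survive subsetting
assert np.isclose(io_data.weights.sum(), len(y))
subset = io_data.filter_indices([0, 1, 2])
print(subset.to_df(add_weights=True))
```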
6 changes: 4 additions & 2 deletions src/sensai/ensemble/ensemble_base.py
@@ -1,6 +1,6 @@
from abc import ABC, abstractmethod
from concurrent.futures.process import ProcessPoolExecutor
from typing import Sequence, List
from typing import Sequence, List, Optional
from inspect import currentframe, getframeinfo

import pandas as pd
@@ -20,7 +20,9 @@ def __init__(self, models: Sequence[VectorModel], num_processes=1):
self.models = list(models)
super().__init__(check_input_columns=False)

def _fit(self, x: pd.DataFrame, y: pd.DataFrame):
def _fit(self, x: pd.DataFrame, y: pd.DataFrame, weights: Optional[pd.Series] = None):
self._warn_sample_weights_unsupported(False, weights)

if self.num_processes == 1 or len(self.models) == 1:
for model in self.models:
model.fit(x, y)
30 changes: 26 additions & 4 deletions src/sensai/evaluation/eval_stats/eval_stats_base.py
@@ -5,6 +5,7 @@
import pandas as pd
from matplotlib import pyplot as plt

from ...util.pickle import setstate
from ...util.plot import ScatterPlot, HistogramPlot, Plot, HeatMapPlot
from ...util.string import ToStringMixin, dict_string
from ...vector_model import VectorModel
@@ -18,6 +19,7 @@
TMetric = TypeVar("TMetric", bound="Metric")
TVectorModel = TypeVar("TVectorModel", bound=VectorModel)

Array = Union[np.ndarray, pd.Series, list]
PredictionArray = Union[np.ndarray, pd.Series, pd.DataFrame, list]


@@ -242,23 +244,29 @@ class PredictionEvalStats(EvalStats[TMetric], ABC):
and computes corresponding metrics
"""
def __init__(self, y_predicted: Optional[PredictionArray], y_true: Optional[PredictionArray],
metrics: List[TMetric], additional_metrics: List[TMetric] = None):
metrics: List[TMetric], additional_metrics: List[TMetric] = None,
weights: Optional[Array] = None):
"""
:param y_predicted: sequence of predicted values, or, in case of multi-dimensional predictions, either a data frame with
one column per dimension or a nested sequence of values
:param y_true: sequence of ground truth labels of same shape as y_predicted
:param metrics: list of metrics to be computed on the provided data
:param additional_metrics: the metrics to additionally compute. This should only be provided if metrics is None
:param weights: weights for each data point contained in `y_predicted` and `y_true`
"""
self.y_true = []
self.y_predicted = []
self.weights: Optional[List[float]] = None
self.y_true_multidim = None
self.y_predicted_multidim = None
if y_predicted is not None:
self.add_all(y_predicted, y_true)
self.add_all(y_predicted=y_predicted, y_true=y_true, weights=weights)
super().__init__(metrics, additional_metrics=additional_metrics)

def add(self, y_predicted, y_true) -> None:
def __setstate__(self, state):
return setstate(PredictionEvalStats, self, state, new_optional_properties=["weights"])

def add(self, y_predicted, y_true, weight: Optional[float] = None) -> None:
"""
Adds a single pair of values to the evaluation
@@ -267,12 +275,17 @@ def add(self, y_predicted, y_true) -> None:
"""
self.y_true.append(y_true)
self.y_predicted.append(y_predicted)
if weight is not None:
if self.weights is None:
self.weights = []
self.weights.append(weight)

def add_all(self, y_predicted: PredictionArray, y_true: PredictionArray) -> None:
def add_all(self, y_predicted: PredictionArray, y_true: PredictionArray, weights: Optional[Array] = None) -> None:
"""
:param y_predicted: sequence of predicted values, or, in case of multi-dimensional predictions, either a data frame with
one column per dimension or a nested sequence of values
:param y_true: sequence of ground truth labels of same shape as y_predicted
:param weights: optional weights of data points
"""
def is_sequence(x):
return isinstance(x, pd.Series) or isinstance(x, list) or isinstance(x, np.ndarray)
@@ -313,6 +326,12 @@ def is_sequence(x):
else:
raise Exception(f"Unhandled data types: {type(y_predicted)}, {type(y_true)}")

if weights is not None:
if self.weights is None:
self.weights = []
assert len(weights) == len(self.y_predicted) - len(self.weights), "Length of weights does not match"
self.weights.extend(weights)

def _tostring_object_info(self) -> str:
return f"{super()._tostring_object_info()}, N={len(self.y_predicted)}"

@@ -336,3 +355,6 @@ def create_figure(self, eval_stats: TEvalStats, subtitle: str) -> Optional[plt.F
:return: the figure or None if this plot is not applicable/cannot be created
"""
pass

def is_applicable(self, eval_stats: TEvalStats) -> bool:
return True
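
The collected `weights` list lines up index-for-index with `y_true` and `y_predicted`, so metric implementations can consume it for weighted aggregation. This diff does not show a concrete weighted metric; the following is only an illustration of what one over the accumulated arrays could look like:

```python
from typing import Optional, Sequence

import numpy as np


def weighted_mae(y_true: Sequence[float], y_predicted: Sequence[float],
        weights: Optional[Sequence[float]] = None) -> float:
    """Mean absolute error, falling back to uniform weights when none were collected."""
    y_true = np.asarray(y_true, dtype=float)
    y_predicted = np.asarray(y_predicted, dtype=float)
    abs_errors = np.abs(y_true - y_predicted)
    if weights is None:
        return float(abs_errors.mean())
    weights = np.asarray(weights, dtype=float)
    return float(np.sum(weights * abs_errors) / np.sum(weights))


# e.g. with the interval weighting from io_data.py, points in the heavier
# interval contribute proportionally more to the error:
print(weighted_mae([0.1, 0.8], [0.2, 0.6], weights=[2 / 3, 4 / 3]))
```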