Merge branch 'rl4sem' into develop
Conflicts:
	src/sensai/data/io_data.py
	src/sensai/evaluation/eval_stats/eval_stats_base.py
opcode81 committed Nov 29, 2024
2 parents 9c5530c + 97c2b37 commit 382248c
Showing 26 changed files with 1,049 additions and 163 deletions.
6 changes: 6 additions & 0 deletions src/sensai/catboost.py
@@ -44,6 +44,9 @@ def _update_model_args(self, inputs: pd.DataFrame, outputs: pd.DataFrame):
self.log.info(f"Updating model parameters with {args}")
self.modelArgs.update(args)

def is_sample_weight_supported(self) -> bool:
return True


# noinspection DuplicatedCode
class CatBoostVectorClassificationModel(AbstractSkLearnVectorClassificationModel):
@@ -79,3 +82,6 @@ def _update_model_args(self, inputs: pd.DataFrame, outputs: pd.DataFrame):
args = {"cat_features": col_indices}
self.log.info(f"Updating model parameters with {args}")
self.modelArgs.update(args)

def is_sample_weight_supported(self) -> bool:
return True
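
The two `is_sample_weight_supported` overrides plug into the sample-weight plumbing that this merge threads through the model hierarchy: the base class can check the flag and warn when weights are passed to a model that ignores them (see `_warn_sample_weights_unsupported` in the ensemble_base.py diff below). A minimal standalone sketch of that pattern, as an illustration only — the actual sensAI base-class code is not part of this diff:

```python
import logging
from abc import ABC, abstractmethod
from typing import Optional

import pandas as pd

log = logging.getLogger(__name__)


class WeightAwareModel(ABC):
    """Illustrative stand-in for the base-class behavior implied by this diff."""

    def is_sample_weight_supported(self) -> bool:
        return False  # subclasses that forward weights (e.g. the CatBoost models above) return True

    def fit(self, x: pd.DataFrame, y: pd.DataFrame, weights: Optional[pd.Series] = None):
        # drop weights with a warning if the concrete model cannot use them
        if weights is not None and not self.is_sample_weight_supported():
            log.warning(f"{self.__class__.__name__} does not support sample weights; ignoring them")
            weights = None
        self._fit(x, y, weights)

    @abstractmethod
    def _fit(self, x: pd.DataFrame, y: pd.DataFrame, weights: Optional[pd.Series] = None):
        pass
```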
116 changes: 109 additions & 7 deletions src/sensai/data/io_data.py
@@ -1,13 +1,15 @@
import logging
import math
import random
from abc import ABC, abstractmethod
from typing import Tuple, Sequence, TypeVar, Generic
from typing import Tuple, Sequence, TypeVar, Generic, Optional, Union

import numpy as np
import pandas as pd
import scipy.stats
from sklearn.model_selection import StratifiedShuffleSplit

from ..util.pickle import setstate
from ..util.string import ToStringMixin

log = logging.getLogger(__name__)
@@ -57,8 +59,14 @@ class InputOutputData(BaseInputOutputData[pd.DataFrame], ToStringMixin):
"""
Holds input and output data for learning problems
"""
def __init__(self, inputs: pd.DataFrame, outputs: pd.DataFrame):
def __init__(self, inputs: pd.DataFrame, outputs: pd.DataFrame, weights: Optional[Union[pd.Series, "DataPointWeighting"]] = None):
super().__init__(inputs, outputs)
if isinstance(weights, DataPointWeighting):
weights = weights.compute_weights(inputs, outputs)
self.weights = weights

def __setstate__(self, state):
setstate(InputOutputData, self, state, new_optional_properties=["weights"])

def _tostring_object_info(self) -> str:
return f"N={len(self.inputs)}, numInputColumns={len(self.inputs.columns)}, numOutputColumns={len(self.outputs.columns)}"
@@ -74,15 +82,35 @@ def from_data_frame(cls, df: pd.DataFrame, *output_columns: str) -> "InputOutput
outputs = df[list(output_columns)]
return cls(inputs, outputs)

def to_data_frame(self, add_weights: bool = False, weights_col_name: str = "weights") -> pd.DataFrame:
"""
:param add_weights: whether to add the weights as a column (provided that weights are present)
:param weights_col_name: the column name to use for weights if `add_weights` is True
:return: a data frame containing both the inputs and outputs (and optionally the weights)
"""
df = pd.concat([self.inputs, self.outputs], axis=1)
if add_weights and self.weights is not None:
df[weights_col_name] = self.weights
return df

def to_df(self, add_weights: bool = False, weights_col_name: str = "weights") -> pd.DataFrame:
return self.to_data_frame(add_weights=add_weights, weights_col_name=weights_col_name)

def filter_indices(self, indices: Sequence[int]) -> __qualname__:
inputs = self.inputs.iloc[indices]
outputs = self.outputs.iloc[indices]
return InputOutputData(inputs, outputs)
weights = None
if self.weights is not None:
weights = self.weights.iloc[indices]
return InputOutputData(inputs, outputs, weights)

def filter_index(self, index_elements: Sequence[any]) -> __qualname__:
inputs = self.inputs.loc[index_elements]
outputs = self.outputs.loc[index_elements]
return InputOutputData(inputs, outputs)
weights = None
if self.weights is not None:
weights = self.weights
return InputOutputData(inputs, outputs, weights)

@property
def input_dim(self):
@@ -103,8 +131,8 @@ def compute_input_output_correlation(self):
correlations[outputCol][inputCol] = pcc
return correlations

def to_df(self) -> pd.DataFrame:
return pd.concat((self.inputs, self.outputs), axis=1)
def apply_weighting(self, weighting: "DataPointWeighting"):
self.weights = weighting.compute_weights(self.inputs, self.outputs)


TInputOutputData = TypeVar("TInputOutputData", bound=BaseInputOutputData)
@@ -267,4 +295,78 @@ def compute_split_indices(self, df: pd.DataFrame, fractional_size_of_first_set:
first_set_indices.append(i)
else:
second_set_indices.append(i)
return first_set_indices, second_set_indices
return first_set_indices, second_set_indices


class DataPointWeighting(ABC):
@abstractmethod
def compute_weights(self, x: pd.DataFrame, y: pd.DataFrame) -> pd.Series:
pass


class DataPointWeightingRegressionTargetIntervalTotalWeight(DataPointWeighting):
"""
Based on relative weights specified for intervals of the regression target,
will weight individual data point weights such that the sum of weights of data points within each interval
satisfies the user-specified relative weight, while ensuring that the total weight of all data points
is still equal to the number of data points.
For example, if one specifies `interval_weights` as [(0.5, 1), (inf, 2)], then the data points with target values
up to 0.5 will get 1/3 of the weight and the remaining data points will get 2/3 of the weight.
So if there are 100 data points and 50 of them are in the first interval (up to 0.5), then these 50 data points
will each get weight 1/3*100/50=2/3 and the remaining 50 data points will each get weight 2/3*100/50=4/3.
The sum of all weights is the number of data points, i.e. 100.
Example:
>>> targets = [0.1, 0.2, 0.5, 0.7, 0.8, 0.6]
>>> x = pd.DataFrame({"foo": np.zeros(len(targets))})
>>> y = pd.DataFrame({"target": targets})
>>> weighting = DataPointWeightingRegressionTargetIntervalTotalWeight([(0.5, 1), (1.0, 2)])
>>> weights = weighting.compute_weights(x, y)
>>> assert(np.isclose(weights.sum(), len(y)))
>>> weights.tolist()
[0.6666666666666666,
0.6666666666666666,
0.6666666666666666,
1.3333333333333333,
1.3333333333333333,
1.3333333333333333]
"""
def __init__(self, intervals_weights: Sequence[Tuple[float, float]]):
"""
:param intervals_weights: a sequence of tuples (upper_bound, rel_total_weight) where upper_bound is the upper bound
of the interval, `(lower_bound, upper_bound]`; `lower_bound` is the upper bound of the preceding interval
or -inf for the first interval. `rel_total_weight` specifies the relative weight of all data points within
the interval.
"""
a = -math.inf
sum_rel_weights = sum(t[1] for t in intervals_weights)
self.intervals = []
for b, rel_weight in intervals_weights:
self.intervals.append(self.Interval(a, b, rel_weight / sum_rel_weights))
a = b

class Interval:
def __init__(self, a: float, b: float, weight_fraction: float):
self.a = a
self.b = b
self.weight_fraction = weight_fraction

def contains(self, x: float):
return self.a < x <= self.b

def compute_weights(self, x: pd.DataFrame, y: pd.DataFrame) -> pd.Series:
assert len(y.columns) == 1, f"Only a single regression target is supported {self.__class__.__name__}"
targets = y.iloc[:, 0]
n = len(x)
weights = np.zeros(n)
num_weighted = 0
for interval in self.intervals:
mask = np.array([interval.contains(x) for x in targets])
subset_size = mask.sum()
num_weighted += subset_size
weights[mask] = interval.weight_fraction * n / subset_size
if num_weighted != n:
raise Exception("Not all data points were weighted. Most likely, the intervals do not cover the entire range of targets")
return pd.Series(weights, index=x.index)
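
Taken together with the new `weights` parameter of `InputOutputData`, the weighting class can be used in two equivalent ways: passed at construction time (where `__init__` resolves it to a `pd.Series`) or attached afterwards via `apply_weighting`. A short usage sketch based only on the code added above; the import path mirrors the file location and may differ from the package's public exports:

```python
import numpy as np
import pandas as pd

from sensai.data.io_data import (
    DataPointWeightingRegressionTargetIntervalTotalWeight,
    InputOutputData,
)

targets = [0.1, 0.2, 0.5, 0.7, 0.8, 0.6]
x = pd.DataFrame({"foo": np.zeros(len(targets))})
y = pd.DataFrame({"target": targets})
weighting = DataPointWeightingRegressionTargetIntervalTotalWeight([(0.5, 1), (1.0, 2)])

# Variant 1: pass the weighting scheme directly at construction time
io_data = InputOutputData(x, y, weights=weighting)

# Variant 2: attach weights to existing data after the fact
io_data2 = InputOutputData(x, y)
io_data2.apply_weighting(weighting)

# weights sum to the number of data points and survive subsetting
assert np.isclose(io_data.weights.sum(), len(y))
subset = io_data.filter_indices([0, 1, 2])
print(subset.to_df(add_weights=True))
```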
6 changes: 4 additions & 2 deletions src/sensai/ensemble/ensemble_base.py
@@ -1,6 +1,6 @@
from abc import ABC, abstractmethod
from concurrent.futures.process import ProcessPoolExecutor
from typing import Sequence, List
from typing import Sequence, List, Optional
from inspect import currentframe, getframeinfo

import pandas as pd
@@ -20,7 +20,9 @@ def __init__(self, models: Sequence[VectorModel], num_processes=1):
self.models = list(models)
super().__init__(check_input_columns=False)

def _fit(self, x: pd.DataFrame, y: pd.DataFrame):
def _fit(self, x: pd.DataFrame, y: pd.DataFrame, weights: Optional[pd.Series] = None):
self._warn_sample_weights_unsupported(False, weights)

if self.num_processes == 1 or len(self.models) == 1:
for model in self.models:
model.fit(x, y)
30 changes: 26 additions & 4 deletions src/sensai/evaluation/eval_stats/eval_stats_base.py
@@ -5,6 +5,7 @@
import pandas as pd
from matplotlib import pyplot as plt

from ...util.pickle import setstate
from ...util.plot import ScatterPlot, HistogramPlot, Plot, HeatMapPlot
from ...util.string import ToStringMixin, dict_string
from ...vector_model import VectorModel
@@ -18,6 +19,7 @@
TMetric = TypeVar("TMetric", bound="Metric")
TVectorModel = TypeVar("TVectorModel", bound=VectorModel)

Array = Union[np.ndarray, pd.Series, list]
PredictionArray = Union[np.ndarray, pd.Series, pd.DataFrame, list]


@@ -242,23 +244,29 @@ class PredictionEvalStats(EvalStats[TMetric], ABC):
and computes corresponding metrics
"""
def __init__(self, y_predicted: Optional[PredictionArray], y_true: Optional[PredictionArray],
metrics: List[TMetric], additional_metrics: List[TMetric] = None):
metrics: List[TMetric], additional_metrics: List[TMetric] = None,
weights: Optional[Array] = None):
"""
:param y_predicted: sequence of predicted values, or, in case of multi-dimensional predictions, either a data frame with
one column per dimension or a nested sequence of values
:param y_true: sequence of ground truth labels of same shape as y_predicted
:param metrics: list of metrics to be computed on the provided data
:param additional_metrics: the metrics to additionally compute. This should only be provided if metrics is None
:param weights: weights for each data point contained in `y_predicted` and `y_true`
"""
self.y_true = []
self.y_predicted = []
self.weights: Optional[List[float]] = None
self.y_true_multidim = None
self.y_predicted_multidim = None
if y_predicted is not None:
self.add_all(y_predicted, y_true)
self.add_all(y_predicted=y_predicted, y_true=y_true, weights=weights)
super().__init__(metrics, additional_metrics=additional_metrics)

def add(self, y_predicted, y_true) -> None:
def __setstate__(self, state):
return setstate(PredictionEvalStats, self, state, new_optional_properties=["weights"])

def add(self, y_predicted, y_true, weight: Optional[float] = None) -> None:
"""
Adds a single pair of values to the evaluation
@@ -267,12 +275,17 @@ def add(self, y_predicted, y_true) -> None:
"""
self.y_true.append(y_true)
self.y_predicted.append(y_predicted)
if weight is not None:
if self.weights is None:
self.weights = []
self.weights.append(weight)

def add_all(self, y_predicted: PredictionArray, y_true: PredictionArray) -> None:
def add_all(self, y_predicted: PredictionArray, y_true: PredictionArray, weights: Optional[Array] = None) -> None:
"""
:param y_predicted: sequence of predicted values, or, in case of multi-dimensional predictions, either a data frame with
one column per dimension or a nested sequence of values
:param y_true: sequence of ground truth labels of same shape as y_predicted
:param weights: optional weights of data points
"""
def is_sequence(x):
return isinstance(x, pd.Series) or isinstance(x, list) or isinstance(x, np.ndarray)
@@ -313,6 +326,12 @@ def is_sequence(x):
else:
raise Exception(f"Unhandled data types: {type(y_predicted)}, {type(y_true)}")

if weights is not None:
if self.weights is None:
self.weights = []
assert len(weights) == len(self.y_predicted) - len(self.weights), "Length of weights does not match"
self.weights.extend(weights)

def _tostring_object_info(self) -> str:
return f"{super()._tostring_object_info()}, N={len(self.y_predicted)}"

@@ -336,3 +355,6 @@ def create_figure(self, eval_stats: TEvalStats, subtitle: str) -> Optional[plt.F
:return: the figure or None if this plot is not applicable/cannot be created
"""
pass

def is_applicable(self, eval_stats: TEvalStats) -> bool:
return True
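
The collected `weights` list lines up index-for-index with `y_true` and `y_predicted`, so metric implementations can consume it for weighted aggregation. This diff does not show a concrete weighted metric; the following is only an illustration of what one over the accumulated arrays could look like:

```python
from typing import Optional, Sequence

import numpy as np


def weighted_mae(y_true: Sequence[float], y_predicted: Sequence[float],
        weights: Optional[Sequence[float]] = None) -> float:
    """Mean absolute error, falling back to uniform weights when none were collected."""
    y_true = np.asarray(y_true, dtype=float)
    y_predicted = np.asarray(y_predicted, dtype=float)
    abs_errors = np.abs(y_true - y_predicted)
    if weights is None:
        return float(abs_errors.mean())
    weights = np.asarray(weights, dtype=float)
    return float(np.sum(weights * abs_errors) / np.sum(weights))


# e.g. with the interval weighting from io_data.py, points in the heavier
# interval contribute proportionally more to the error:
print(weighted_mae([0.1, 0.8], [0.2, 0.6], weights=[2 / 3, 4 / 3]))
```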