Sync rl4sem

commit 69c684a7b949255e5f81ce3fcf2fd2d15a4753a8 Author: Michael Panchenko <[email protected]> Date: Tue Jul 2 18:55:30 2024 +0200 setstate: extended renamed_properties to also handle the case of new values src/sensai/util/pickle.py commit 48248b9e8b225e62f0a94b0137d8960299d40af6 Author: Dominik Jain <[email protected]> Date: Fri Jun 28 14:20:16 2024 +0200 Add sensai.util.cache.LRUCache src/sensai/util/cache.py commit 78da9d014c52a2710d6bd56e12c81c0546d3119a Author: Dominik Jain <[email protected]> Date: Mon Apr 22 13:07:53 2024 +0200 Add util methods for path creation: create_path, create_dir_path, create_file_path src/sensai/util/io.py commit 2b48fe8f9eeb8dc7da4861ee47ba8359d24867c9 Author: Dominik Jain <[email protected]> Date: Fri Apr 19 15:06:19 2024 +0200 Add AverageSeriesLinePlot, changing the interface of the draw argument of the Plot constructor src/sensai/util/plot.py commit 2401a6a56943742162bb5f25907e275aac532400 Author: Dominik Jain <[email protected]> Date: Fri Apr 19 15:05:14 2024 +0200 util.pandas: Add SeriesInterpolation abstraction with implementations SeriesInterpolationLinearIndex and SeriesInterpolationRepeatPreceding src/sensai/util/pandas.py commit 52f529829a17860a65d2397e2fc3ebe43a5ea1d0 Author: Dominik Jain <[email protected]> Date: Fri Apr 19 12:50:07 2024 +0200 XGBGradientBoostedVectorRegressionModel: Fix early stopping enabled logic, establish backward compatibility src/sensai/xgboost.py commit ec2f6b293c5d2ff72ae459a403efa0adc9db6ff3 Author: Dominik Jain <[email protected]> Date: Thu Apr 18 22:33:53 2024 +0200 XGBGradientBoostedVectorRegressionModel: Add early stopping support, refactoring the interface of _fit_sklearn src/sensai/sklearn/sklearn_base.py src/sensai/xgboost.py commit 35912619ac053fcdfb23396d87be6c923f84e83e Author: Dominik Jain <[email protected]> Date: Thu Apr 18 21:41:49 2024 +0200 Support sample weights in AbstractSkLearnMultiDimVectorRegressionModel src/sensai/sklearn/sklearn_base.py commit 
a4f860f8d0cd26698b8074e3ed74dbc1beabdc98 Author: Dominik Jain <[email protected]> Date: Tue Apr 16 17:02:47 2024 +0200 ResultWriter.path: replace forbidden character '>' (also in combination '>=') with gt (gte) src/sensai/util/io.py commit 266be6b874b77b52eeabe4142e6b03b56bb028f8 Author: Dominik Jain <[email protected]> Date: Tue Apr 16 12:46:59 2024 +0200 Clean imports src/sensai/evaluation/eval_stats/eval_stats_regression.py commit b3431e6768e1b3dc27af66407d1dd0a7308f9aa7 Author: Dominik Jain <[email protected]> Date: Tue Apr 16 12:38:45 2024 +0200 RegressionMetric: Change compute_value from a class method to an instance method Add RegressionMetricFromBinaryClassificationMetric for the computation of binary classification metrics by converting regressor predictions and targets to binary class labels src/sensai/evaluation/eval_stats/eval_stats_regression.py commit 9b81a0d9dd84ea26c2f410020c9ecb5026e8704f Author: Dominik Jain <[email protected]> Date: Tue Apr 16 12:02:07 2024 +0200 RegressionEvalStats.plot_heatmap_ground_truth_predictions: Allow to specify Axes src/sensai/evaluation/eval_stats/eval_stats_regression.py commit b19a7c20da0e32eebe064f7246004c3487a79ee9 Author: Dominik Jain <[email protected]> Date: Fri Apr 12 13:55:26 2024 +0200 query_data_frame: Relax the object column filtering, retaining supported types such as str src/sensai/util/pandas.py commit 41c6b1a8aa0878ced0e2338ba7d0a28b1d3af7f6 Author: Dominik Jain <[email protected]> Date: Fri Apr 12 11:42:17 2024 +0200 Add util.pandas.query_data_frame (and apply in ResultSet) src/sensai/evaluation/result_set.py src/sensai/util/pandas.py commit be1025f58c8bdba40b64b7bc803c104acd146c08 Author: Dominik Jain <[email protected]> Date: Thu Apr 11 13:57:32 2024 +0200 ScatterPlot: Add option add_diagonal src/sensai/util/plot.py commit 0480d557088959e0a56dfe9142778653a5c06e6a Author: Dominik Jain <[email protected]> Date: Thu Apr 11 09:32:00 2024 +0200 Add util.helper.contains_any src/sensai/util/helper.py commit 
054caca69acea35a59776ded16c7e25d0d95a705 Author: Dominik Jain <[email protected]> Date: Thu Apr 11 09:31:25 2024 +0200 Add PersistableObject as a superclass for persisted objects, making sure that __setstate__ is called regardless of the presence of attributes src/sensai/util/pickle.py commit f4e0b3f3bc27b6e9a8eb1f1665c21a20a8a2405c Author: Dominik Jain <[email protected]> Date: Wed Apr 10 11:37:16 2024 +0200 Introduce queryable ResultSet to support interactive querying and analysis of prediction results with specialisation RegressionResultSet for regression VectorRegressionModelEvaluationData: * Support creation of corresponding RegressionResultSet via new method `create_result_set` * Add method `to_data_frame` to support the result set creation src/sensai/evaluation/evaluator.py src/sensai/evaluation/result_set.py commit aab162d23428b5aa2e0d91f320b9ec60ece6eb3a Author: Dominik Jain <[email protected]> Date: Wed Apr 10 11:34:12 2024 +0200 Add helper function get_predicted_var_name src/sensai/vector_model.py commit 27f8403accf50879500ef501b6d0b8abc1e5d059 Author: Dominik Jain <[email protected]> Date: Mon Apr 8 13:32:53 2024 +0200 util.logging: Allow to control 'append' mode in add_file_logger and FileLoggerContext src/sensai/util/logging.py commit a26139ef875fa82186aa942b7bf4be75430ff420 Author: Dominik Jain <[email protected]> Date: Tue Apr 2 12:02:38 2024 +0200 InputOutputData: Add method to_data_frame src/sensai/data.py commit 933a0da650daee50b82ae802bfd337a301063f23 Author: Dominik Jain <[email protected]> Date: Thu Mar 28 16:05:49 2024 +0100 Add support for regression heat map plot with weighted data points src/sensai/evaluation/eval_stats/eval_stats_base.py src/sensai/evaluation/eval_stats/eval_stats_regression.py src/sensai/evaluation/eval_util.py commit dc7a0383391ea5cb14ec29ce0aa036811ed1f8d7 Author: Dominik Jain <[email protected]> Date: Thu Mar 28 13:36:59 2024 +0100 Support weighting of data points in regression evaluation, adding support to
RegressionEvalStats and all applicable RegressionMetrics, switching to sklearn-based metric computations under the hood (as they already support weighting). Remove unnecessary intermediate class VectorModelFittableBase, which was not correctly maintained anyway. src/sensai/evaluation/eval_stats/eval_stats_base.py src/sensai/evaluation/eval_stats/eval_stats_regression.py src/sensai/evaluation/evaluator.py src/sensai/vector_model.py commit a8411013dda8e8f4723e389be6a1ce15d6208595 Author: Dominik Jain <[email protected]> Date: Thu Mar 28 12:42:46 2024 +0100 Add abstraction DataPointWeighting and first specialisation DataPointWeightingRegressionTargetIntervalTotalWeight src/sensai/data.py commit d232f75d9f4e8aeb818d4a9d6738853683e7f8a0 Author: Dominik Jain <[email protected]> Date: Thu Mar 28 11:35:47 2024 +0100 Add option to provide sample weights for training in VectorModel, adjusting all subclasses accordingly. Models that do not support weighting will log a warning if weights are specified. src/sensai/catboost.py src/sensai/data.py src/sensai/ensemble/ensemble_base.py src/sensai/lightgbm.py src/sensai/naive_bayes.py src/sensai/nearest_neighbors.py src/sensai/sklearn/sklearn_base.py src/sensai/sklearn/sklearn_classification.py src/sensai/sklearn/sklearn_regression.py src/sensai/tensor_model.py src/sensai/tensorflow/tf_base.py src/sensai/torch/torch_base.py src/sensai/util/helper.py src/sensai/vector_model.py src/sensai/xgboost.py
opcode81 · Nov 29, 2024 · 97c2b37 · 97c2b37
1 parent bc9e829
commit 97c2b37
Show file tree

Hide file tree

Showing 26 changed files with 1,047 additions and 161 deletions.
diff --git a/src/sensai/catboost.py b/src/sensai/catboost.py
@@ -44,6 +44,9 @@ def _update_model_args(self, inputs: pd.DataFrame, outputs: pd.DataFrame):
             self.log.info(f"Updating model parameters with {args}")
             self.modelArgs.update(args)
 
+    def is_sample_weight_supported(self) -> bool:
+        """
+        :return: True, declaring that sample weights are supported by this model
+        """
+        return True
+
 
 # noinspection DuplicatedCode
 class CatBoostVectorClassificationModel(AbstractSkLearnVectorClassificationModel):
@@ -79,3 +82,6 @@ def _update_model_args(self, inputs: pd.DataFrame, outputs: pd.DataFrame):
             args = {"cat_features": col_indices}
             self.log.info(f"Updating model parameters with {args}")
             self.modelArgs.update(args)
+
+    def is_sample_weight_supported(self) -> bool:
+        """
+        :return: True, declaring that sample weights are supported by this model
+        """
+        return True
diff --git a/src/sensai/data.py b/src/sensai/data.py
@@ -1,13 +1,15 @@
 import logging
+import math
 import random
 from abc import ABC, abstractmethod
-from typing import Tuple, Sequence, TypeVar, Generic
+from typing import Tuple, Sequence, TypeVar, Generic, Optional, Union
 
 import numpy as np
 import pandas as pd
 import scipy.stats
 from sklearn.model_selection import StratifiedShuffleSplit
 
+from .util.pickle import setstate
 from .util.string import ToStringMixin
 
 log = logging.getLogger(__name__)
@@ -57,8 +59,14 @@ class InputOutputData(BaseInputOutputData[pd.DataFrame], ToStringMixin):
     """
     Holds input and output data for learning problems
     """
-    def __init__(self, inputs: pd.DataFrame, outputs: pd.DataFrame):
+    def __init__(self, inputs: pd.DataFrame, outputs: pd.DataFrame, weights: Optional[Union[pd.Series, "DataPointWeighting"]] = None):
         super().__init__(inputs, outputs)
+        if isinstance(weights, DataPointWeighting):
+            weights = weights.compute_weights(inputs, outputs)
+        self.weights = weights
+
+    def __setstate__(self, state):
+        # Restores the state via the `setstate` helper, declaring `weights` as a new optional property.
+        # NOTE(review): presumably this makes instances that were persisted before `weights` was introduced
+        # load with weights=None - confirm against util.pickle.setstate
+        setstate(InputOutputData, self, state, new_optional_properties=["weights"])
 
     def _tostring_object_info(self) -> str:
         return f"N={len(self.inputs)}, numInputColumns={len(self.inputs.columns)}, numOutputColumns={len(self.outputs.columns)}"
@@ -74,15 +82,32 @@ def from_data_frame(cls, df: pd.DataFrame, *output_columns: str) -> "InputOutput
         outputs = df[list(output_columns)]
         return cls(inputs, outputs)
 
+    def to_data_frame(self, add_weights: bool = False, weights_col_name: str = "weights") -> pd.DataFrame:
+        """
+        :param add_weights: whether to add the weights as a column (provided that weights are present)
+        :param weights_col_name: the column name to use for weights if `add_weights` is True
+        :return: a data frame containing both the inputs and outputs (and optionally the weights)
+        """
+        df = pd.concat([self.inputs, self.outputs], axis=1)
+        if add_weights and self.weights is not None:
+            df[weights_col_name] = self.weights
+        return df
+
     def filter_indices(self, indices: Sequence[int]) -> __qualname__:
         inputs = self.inputs.iloc[indices]
         outputs = self.outputs.iloc[indices]
-        return InputOutputData(inputs, outputs)
+        weights = None
+        if self.weights is not None:
+            weights = self.weights.iloc[indices]
+        return InputOutputData(inputs, outputs, weights)
 
     def filter_index(self, index_elements: Sequence[any]) -> __qualname__:
         inputs = self.inputs.loc[index_elements]
         outputs = self.outputs.loc[index_elements]
-        return InputOutputData(inputs, outputs)
+        weights = None
+        if self.weights is not None:
+            weights = self.weights.loc[index_elements]  # restrict weights to the same index elements as inputs/outputs
+        return InputOutputData(inputs, outputs, weights)
 
     @property
     def input_dim(self):
@@ -103,6 +128,9 @@ def compute_input_output_correlation(self):
                 correlations[outputCol][inputCol] = pcc
         return correlations
 
+    def apply_weighting(self, weighting: "DataPointWeighting"):
+        """
+        Computes data point weights from this object's inputs and outputs using the given weighting
+        and stores them in `weights`, replacing any weights previously set.
+
+        :param weighting: the weighting with which to compute the weights
+        """
+        self.weights = weighting.compute_weights(self.inputs, self.outputs)
+
 
 TInputOutputData = TypeVar("TInputOutputData", bound=BaseInputOutputData)
 
@@ -264,4 +292,78 @@ def compute_split_indices(self, df: pd.DataFrame, fractional_size_of_first_set:
                 first_set_indices.append(i)
             else:
                 second_set_indices.append(i)
-        return first_set_indices, second_set_indices
+        return first_set_indices, second_set_indices
+
+
+class DataPointWeighting(ABC):
+    """
+    Base class for strategies which compute weights for data points from inputs and outputs
+    """
+    @abstractmethod
+    def compute_weights(self, x: pd.DataFrame, y: pd.DataFrame) -> pd.Series:
+        """
+        :param x: the inputs
+        :param y: the outputs
+        :return: the series of weights computed for the given data
+        """
+        pass
+
+
+class DataPointWeightingRegressionTargetIntervalTotalWeight(DataPointWeighting):
+    """
+    Based on relative weights specified for intervals of the regression target,
+    will weight individual data point weights such that the sum of weights of data points within each interval
+    satisfies the user-specified relative weight, while ensuring that the total weight of all data points
+    is still equal to the number of data points.
+
+    For example, if one specifies `intervals_weights` as [(0.5, 1), (inf, 2)], then the data points with target values
+    up to 0.5 will get 1/3 of the weight and the remaining data points will get 2/3 of the weight.
+    So if there are 100 data points and 50 of them are in the first interval (up to 0.5), then these 50 data points
+    will each get weight 1/3*100/50=2/3 and the remaining 50 data points will each get weight 2/3*100/50=4/3.
+    The sum of all weights is the number of data points, i.e. 100.
+
+    Intervals which contain no data points are disregarded: the relative weights of the remaining intervals
+    are renormalised, such that the total weight is still the number of data points.
+
+    Example:
+
+    >>> targets = [0.1, 0.2, 0.5, 0.7, 0.8, 0.6]
+    >>> x = pd.DataFrame({"foo": np.zeros(len(targets))})
+    >>> y = pd.DataFrame({"target": targets})
+    >>> weighting = DataPointWeightingRegressionTargetIntervalTotalWeight([(0.5, 1), (1.0, 2)])
+    >>> weights = weighting.compute_weights(x, y)
+    >>> assert(np.isclose(weights.sum(), len(y)))
+    >>> weights.tolist()
+    [0.6666666666666666,
+     0.6666666666666666,
+     0.6666666666666666,
+     1.3333333333333333,
+     1.3333333333333333,
+     1.3333333333333333]
+    """
+    def __init__(self, intervals_weights: Sequence[Tuple[float, float]]):
+        """
+        :param intervals_weights: a sequence of tuples (upper_bound, rel_total_weight) where upper_bound is the upper bound
+            of the interval, `(lower_bound, upper_bound]`; `lower_bound` is the upper bound of the preceding interval
+            or -inf for the first interval. `rel_total_weight` specifies the relative weight of all data points within
+            the interval.
+        """
+        a = -math.inf
+        sum_rel_weights = sum(t[1] for t in intervals_weights)
+        self.intervals = []
+        for b, rel_weight in intervals_weights:
+            self.intervals.append(self.Interval(a, b, rel_weight / sum_rel_weights))
+            a = b
+
+    class Interval:
+        """
+        Represents the half-open interval (a, b] along with the fraction of the total weight assigned to it
+        """
+        def __init__(self, a: float, b: float, weight_fraction: float):
+            self.a = a
+            self.b = b
+            self.weight_fraction = weight_fraction
+
+        def contains(self, x: float):
+            """
+            :param x: the value to check
+            :return: whether `x` is greater than the lower bound and no greater than the upper bound
+            """
+            return self.a < x <= self.b
+
+    def compute_weights(self, x: pd.DataFrame, y: pd.DataFrame) -> pd.Series:
+        """
+        :param x: the inputs (only the number of rows and the index are used)
+        :param y: the outputs, which must comprise exactly one column (the regression target)
+        :return: a series of weights with the same index as `x`
+        :raises Exception: if not all data points are covered by the intervals
+        """
+        assert len(y.columns) == 1, f"Only a single regression target is supported {self.__class__.__name__}"
+        targets = y.iloc[:, 0]
+        n = len(x)
+        # dtype=bool ensures that the masks are valid boolean indexers even if there are no data points at all
+        masks = [np.array([interval.contains(t) for t in targets], dtype=bool) for interval in self.intervals]
+        if sum(int(mask.sum()) for mask in masks) != n:
+            raise Exception("Not all data points were weighted. Most likely, the intervals do not cover the entire range of targets")
+        # Intervals without any data points cannot receive their share of the weight (and would cause a division
+        # by zero), so they are skipped and the fractions of the remaining intervals are renormalised
+        non_empty = [(interval, mask) for interval, mask in zip(self.intervals, masks) if mask.any()]
+        total_fraction = sum(interval.weight_fraction for interval, _ in non_empty)
+        scale = n / total_fraction if total_fraction != 0 else 0.0
+        weights = np.zeros(n)
+        for interval, mask in non_empty:
+            weights[mask] = interval.weight_fraction * scale / mask.sum()
+        return pd.Series(weights, index=x.index)
diff --git a/src/sensai/ensemble/ensemble_base.py b/src/sensai/ensemble/ensemble_base.py
@@ -1,6 +1,6 @@
 from abc import ABC, abstractmethod
 from concurrent.futures.process import ProcessPoolExecutor
-from typing import Sequence, List
+from typing import Sequence, List, Optional
 from inspect import currentframe, getframeinfo
 
 import pandas as pd
@@ -20,7 +20,9 @@ def __init__(self, models: Sequence[VectorModel], num_processes=1):
         self.models = list(models)
         super().__init__(check_input_columns=False)
 
-    def _fit(self, x: pd.DataFrame, y: pd.DataFrame):
+    def _fit(self, x: pd.DataFrame, y: pd.DataFrame, weights: Optional[pd.Series] = None):
+        self._warn_sample_weights_unsupported(False, weights)
+
         if self.num_processes == 1 or len(self.models) == 1:
             for model in self.models:
                 model.fit(x, y)

diff --git a/src/sensai/evaluation/eval_stats/eval_stats_base.py b/src/sensai/evaluation/eval_stats/eval_stats_base.py
@@ -5,6 +5,7 @@
 import pandas as pd
 from matplotlib import pyplot as plt
 
+from ...util.pickle import setstate
 from ...util.plot import ScatterPlot, HistogramPlot, Plot, HeatMapPlot
 from ...util.string import ToStringMixin, dict_string
 from ...vector_model import VectorModel
@@ -18,6 +19,7 @@
 TMetric = TypeVar("TMetric", bound="Metric")
 TVectorModel = TypeVar("TVectorModel", bound=VectorModel)
 
+Array = Union[np.ndarray, pd.Series, list]
 PredictionArray = Union[np.ndarray, pd.Series, pd.DataFrame, list]
 
 
@@ -242,23 +244,29 @@ class PredictionEvalStats(EvalStats[TMetric], ABC):
     and computes corresponding metrics
     """
     def __init__(self, y_predicted: Optional[PredictionArray], y_true: Optional[PredictionArray],
-                 metrics: List[TMetric], additional_metrics: List[TMetric] = None):
+            metrics: List[TMetric], additional_metrics: List[TMetric] = None,
+            weights: Optional[Array] = None):
         """
         :param y_predicted: sequence of predicted values, or, in case of multi-dimensional predictions, either a data frame with
             one column per dimension or a nested sequence of values
         :param y_true: sequence of ground truth labels of same shape as y_predicted
         :param metrics: list of metrics to be computed on the provided data
         :param additional_metrics: the metrics to additionally compute. This should only be provided if metrics is None
+        :param weights: weights for each data point contained in `y_predicted` and `y_true`
         """
         self.y_true = []
         self.y_predicted = []
+        self.weights: Optional[List[float]] = None
         self.y_true_multidim = None
         self.y_predicted_multidim = None
         if y_predicted is not None:
-            self.add_all(y_predicted, y_true)
+            self.add_all(y_predicted=y_predicted, y_true=y_true, weights=weights)
         super().__init__(metrics, additional_metrics=additional_metrics)
 
-    def add(self, y_predicted, y_true):
+    def __setstate__(self, state):
+        # Restores the state via the `setstate` helper, declaring `weights` as a new optional property.
+        # NOTE(review): presumably this makes instances that were persisted before `weights` was introduced
+        # load with weights=None - confirm against util.pickle.setstate
+        return setstate(PredictionEvalStats, self, state, new_optional_properties=["weights"])
+
+    def add(self, y_predicted, y_true, weight: Optional[float] = None):
         """
         Adds a single pair of values to the evaluation
         Parameters:
@@ -267,12 +275,17 @@ def add(self, y_predicted, y_true):
         """
         self.y_true.append(y_true)
         self.y_predicted.append(y_predicted)
+        if weight is not None:
+            if self.weights is None:
+                self.weights = []
+            self.weights.append(weight)
 
-    def add_all(self, y_predicted: PredictionArray, y_true: PredictionArray):
+    def add_all(self, y_predicted: PredictionArray, y_true: PredictionArray, weights: Optional[Array] = None):
         """
         :param y_predicted: sequence of predicted values, or, in case of multi-dimensional predictions, either a data frame with
             one column per dimension or a nested sequence of values
         :param y_true: sequence of ground truth labels of same shape as y_predicted
+        :param weights: optional weights of data points
         """
         def is_sequence(x):
             return isinstance(x, pd.Series) or isinstance(x, list) or isinstance(x, np.ndarray)
@@ -313,6 +326,12 @@ def is_sequence(x):
         else:
             raise Exception(f"Unhandled data types: {type(y_predicted)}, {type(y_true)}")
 
+        if weights is not None:
+            if self.weights is None:
+                self.weights = []
+            assert len(weights) == len(self.y_predicted) - len(self.weights), "Length of weights does not match"
+            self.weights.extend(weights)
+
     def _tostring_object_info(self) -> str:
         return f"{super()._tostring_object_info()}, N={len(self.y_predicted)}"
 
@@ -336,3 +355,6 @@ def create_figure(self, eval_stats: TEvalStats, subtitle: str) -> Optional[plt.F
         :return: the figure or None if this plot is not applicable/cannot be created
         """
         pass
+
+    def is_applicable(self, eval_stats: TEvalStats) -> bool:
+        """
+        :param eval_stats: the evaluation stats for which applicability is queried
+        :return: True in this default implementation.
+            NOTE(review): presumably meant to be overridden by plots that can only be created for some
+            evaluation stats - confirm against the callers
+        """
+        return True