From 6513247b7163e3ae87356cfd9ad6c4a2e72bc19a Mon Sep 17 00:00:00 2001 From: brimoor Date: Mon, 16 Dec 2024 09:48:06 -0500 Subject: [PATCH] adding support for custom evaluation metrics --- fiftyone/utils/eval/activitynet.py | 13 ++- fiftyone/utils/eval/base.py | 112 +++++++++++++++++++++++++- fiftyone/utils/eval/classification.py | 60 +++++++++++--- fiftyone/utils/eval/coco.py | 13 ++- fiftyone/utils/eval/detection.py | 44 ++++++++-- fiftyone/utils/eval/openimages.py | 13 ++- fiftyone/utils/eval/regression.py | 60 ++++++++++++-- fiftyone/utils/eval/segmentation.py | 68 +++++++++++++--- 8 files changed, 346 insertions(+), 37 deletions(-) diff --git a/fiftyone/utils/eval/activitynet.py b/fiftyone/utils/eval/activitynet.py index 7a337522b8..6e739ef9c0 100644 --- a/fiftyone/utils/eval/activitynet.py +++ b/fiftyone/utils/eval/activitynet.py @@ -40,6 +40,8 @@ class ActivityNetEvaluationConfig(DetectionEvaluationConfig): that mAP and PR curves can be generated iou_threshs (None): a list of IoU thresholds to use when computing mAP and PR curves. Only applicable when ``compute_mAP`` is True + custom_metrics (None): an optional list of custom metrics to compute + or dict mapping metric names to kwargs dicts """ def __init__( @@ -50,10 +52,16 @@ def __init__( classwise=None, compute_mAP=False, iou_threshs=None, + custom_metrics=None, **kwargs, ): super().__init__( - pred_field, gt_field, iou=iou, classwise=classwise, **kwargs + pred_field, + gt_field, + iou=iou, + classwise=classwise, + custom_metrics=custom_metrics, + **kwargs, ) if compute_mAP and iou_threshs is None: @@ -323,6 +331,7 @@ class ActivityNetDetectionResults(DetectionResults): ``num_iou_threshs x num_classes x num_recall`` missing (None): a missing label string. Any unmatched segments are given this label for evaluation purposes + custom_metrics (None): an optional dict of custom metrics backend (None): a :class:`ActivityNetEvaluation` backend """ @@ -339,6 +348,7 @@ def __init__( classes, thresholds=None, missing=None, + custom_metrics=None, backend=None, ): super().__init__( @@ -348,6 +358,7 @@ def __init__( matches, classes=classes, missing=missing, + custom_metrics=custom_metrics, backend=backend, ) diff --git a/fiftyone/utils/eval/base.py b/fiftyone/utils/eval/base.py index 5fdcf9ba38..8471184c70 100644 --- a/fiftyone/utils/eval/base.py +++ b/fiftyone/utils/eval/base.py @@ -6,17 +6,124 @@ | """ import itertools +import logging import numpy as np import sklearn.metrics as skm import fiftyone.core.evaluation as foe import fiftyone.core.plots as fop +import fiftyone.core.utils as fou + +foo = fou.lazy_import("fiftyone.operators") + + +logger = logging.getLogger(__name__) + + +class BaseEvaluationMethodConfig(foe.EvaluationMethodConfig): + """Base class for configuring evaluation methods. + + Args: + **kwargs: any leftover keyword arguments after subclasses have done + their parsing + """ + + pass + + +class BaseEvaluationMethod(foe.EvaluationMethod): + """Base class for evaluation methods. 
+ + Args: + config: an :class:`BaseEvaluationMethodConfig` + """ + + def _get_custom_metrics(self): + if not self.config.custom_metrics: + return {} + + if isinstance(self.config.custom_metrics, list): + return {m: None for m in self.config.custom_metrics} + + return self.config.custom_metrics + + def compute_custom_metrics(self, samples, eval_key, results): + results.custom_metrics = {} + + for metric, kwargs in self._get_custom_metrics().items(): + try: + operator = foo.get_operator(metric) + value = operator.compute( + samples, eval_key, results, **kwargs or {} + ) + if value is not None: + results.custom_metrics[operator.name] = value + except Exception as e: + logger.warning( + "Failed to compute metric '%s': Reason: %s", + operator.uri, + e, + ) + + def get_custom_metric_fields(self, samples, eval_key): + fields = [] + + for metric in self._get_custom_metrics().keys(): + try: + operator = foo.get_operator(metric) + fields.extend(operator.get_fields(samples, eval_key)) + except Exception as e: + logger.warning( + "Failed to get fields for metric '%s': Reason: %s", + operator.uri, + e, + ) + + return fields + + def rename_custom_metrics(self, samples, eval_key, new_eval_key): + for metric in self._get_custom_metrics().keys(): + try: + operator = foo.get_operator(metric) + operator.rename(samples, eval_key, new_eval_key) + except Exception as e: + logger.warning( + "Failed to rename fields for metric '%s': Reason: %s", + operator.uri, + e, + ) + + def cleanup_custom_metrics(self, samples, eval_key): + for metric in self._get_custom_metrics().keys(): + try: + operator = foo.get_operator(metric) + operator.cleanup(samples, eval_key) + except Exception as e: + logger.warning( + "Failed to cleanup metric '%s': Reason: %s", + operator.uri, + e, + ) class BaseEvaluationResults(foe.EvaluationResults): """Base class for evaluation results. + Args: + samples: the :class:`fiftyone.core.collections.SampleCollection` used + config: the :class:`BaseEvaluationMethodConfig` used + eval_key: the evaluation key + backend (None): an :class:`EvaluationMethod` backend + """ + + pass + + +class BaseClassificationResults(BaseEvaluationResults): + """Base class for evaluation results that expose classification metrics + like P/R/F1 and confusion matrices. + Args: samples: the :class:`fiftyone.core.collections.SampleCollection` used config: the :class:`fiftyone.core.evaluation.EvaluationMethodConfig` @@ -32,8 +139,7 @@ class BaseEvaluationResults(foe.EvaluationResults): observed ground truth/predicted labels are used missing (None): a missing label string. 
Any None-valued labels are given this label for evaluation purposes - samples (None): the :class:`fiftyone.core.collections.SampleCollection` - for which the results were computed + custom_metrics (None): an optional dict of custom metrics backend (None): a :class:`fiftyone.core.evaluation.EvaluationMethod` backend """ @@ -51,6 +157,7 @@ def __init__( ypred_ids=None, classes=None, missing=None, + custom_metrics=None, backend=None, ): super().__init__(samples, config, eval_key, backend=backend) @@ -72,6 +179,7 @@ def __init__( ) self.classes = np.asarray(classes) self.missing = missing + self.custom_metrics = custom_metrics def report(self, classes=None): """Generates a classification report for the results via diff --git a/fiftyone/utils/eval/classification.py b/fiftyone/utils/eval/classification.py index 6df82a5798..65ae55a92b 100644 --- a/fiftyone/utils/eval/classification.py +++ b/fiftyone/utils/eval/classification.py @@ -16,7 +16,6 @@ import eta.core.utils as etau import fiftyone as fo -import fiftyone.core.evaluation as foe from fiftyone.core.expressions import ViewField as F import fiftyone.core.fields as fof import fiftyone.core.labels as fol @@ -24,7 +23,11 @@ import fiftyone.core.utils as fou import fiftyone.core.validation as fov -from .base import BaseEvaluationResults +from .base import ( + BaseEvaluationMethod, + BaseEvaluationMethodConfig, + BaseClassificationResults, +) def evaluate_classifications( @@ -35,6 +38,7 @@ def evaluate_classifications( classes=None, missing=None, method=None, + custom_metrics=None, progress=None, **kwargs, ): @@ -82,6 +86,8 @@ def evaluate_classifications( supported values are ``fo.evaluation_config.classification_backends.keys()`` and the default is ``fo.evaluation_config.default_classification_backend`` + custom_metrics (None): an optional list of custom metrics to compute + or dict mapping metric names to kwargs dicts progress (None): whether to render a progress bar (True/False), use the default value ``fiftyone.config.show_progress_bars`` (None), or a progress callback function to invoke instead @@ -96,7 +102,13 @@ def evaluate_classifications( samples, (pred_field, gt_field), fol.Classification, same_type=True ) - config = _parse_config(pred_field, gt_field, method, **kwargs) + config = _parse_config( + pred_field, + gt_field, + method, + custom_metrics=custom_metrics, + **kwargs, + ) eval_method = config.build() eval_method.ensure_requirements() @@ -110,12 +122,13 @@ def evaluate_classifications( missing=missing, progress=progress, ) + eval_method.compute_custom_metrics(samples, eval_key, results) eval_method.save_run_results(samples, eval_key, results) return results -class ClassificationEvaluationConfig(foe.EvaluationMethodConfig): +class ClassificationEvaluationConfig(BaseEvaluationMethodConfig): """Base class for configuring :class:`ClassificationEvaluation` instances. 
@@ -124,19 +137,22 @@ class ClassificationEvaluationConfig(foe.EvaluationMethodConfig): :class:`fiftyone.core.labels.Classification` instances gt_field: the name of the field containing the ground truth :class:`fiftyone.core.labels.Classification` instances + custom_metrics (None): an optional list of custom metrics to compute + or dict mapping metric names to kwargs dicts """ - def __init__(self, pred_field, gt_field, **kwargs): + def __init__(self, pred_field, gt_field, custom_metrics=None, **kwargs): super().__init__(**kwargs) self.pred_field = pred_field self.gt_field = gt_field + self.custom_metrics = custom_metrics @property def type(self): return "classification" -class ClassificationEvaluation(foe.EvaluationMethod): +class ClassificationEvaluation(BaseEvaluationMethod): """Base class for classification evaluation methods. Args: @@ -187,6 +203,8 @@ def get_fields(self, samples, eval_key): if is_frame_field: fields.append(samples._FRAMES_PREFIX + eval_key) + fields.extend(self.get_custom_metric_fields(samples, eval_key)) + return fields def rename(self, samples, eval_key, new_eval_key): @@ -208,6 +226,8 @@ def rename(self, samples, eval_key, new_eval_key): fields = dict(zip(in_frame_fields, out_frame_fields)) dataset.rename_frame_fields(fields) + self.rename_custom_metrics(samples, eval_key, new_eval_key) + def cleanup(self, samples, eval_key): dataset = samples._dataset is_frame_field = samples._is_frame_field(self.config.gt_field) @@ -217,6 +237,8 @@ def cleanup(self, samples, eval_key): if is_frame_field: dataset.delete_frame_field(eval_key, error_level=1) + self.cleanup_custom_metrics(samples, eval_key) + def _validate_run(self, samples, eval_key, existing_info): self._validate_fields_match(eval_key, "pred_field", existing_info) self._validate_fields_match(eval_key, "gt_field", existing_info) @@ -230,6 +252,8 @@ class SimpleEvaluationConfig(ClassificationEvaluationConfig): :class:`fiftyone.core.labels.Classification` instances gt_field: the name of the field containing the ground truth :class:`fiftyone.core.labels.Classification` instances + custom_metrics (None): an optional list of custom metrics to compute + or dict mapping metric names to kwargs dicts """ @property @@ -337,10 +361,16 @@ class TopKEvaluationConfig(ClassificationEvaluationConfig): gt_field: the name of the field containing the ground truth :class:`fiftyone.core.labels.Classification` instances k (5): the top-k value to use when assessing accuracy + custom_metrics (None): an optional list of custom metrics to compute + or dict mapping metric names to kwargs dicts """ - def __init__(self, pred_field, gt_field, k=5, **kwargs): - super().__init__(pred_field, gt_field, **kwargs) + def __init__( + self, pred_field, gt_field, k=5, custom_metrics=None, **kwargs + ): + super().__init__( + pred_field, gt_field, custom_metrics=custom_metrics, **kwargs + ) self.k = k @property @@ -535,6 +565,8 @@ class BinaryEvaluationConfig(ClassificationEvaluationConfig): pred_field: the name of the field containing the predicted :class:`fiftyone.core.labels.Classification` instances gt_field: the name of the field containing the ground truth + custom_metrics (None): an optional list of custom metrics to compute + or dict mapping metric names to kwargs dicts """ @property @@ -678,7 +710,7 @@ def evaluate_samples( return results -class ClassificationResults(BaseEvaluationResults): +class ClassificationResults(BaseClassificationResults): """Class that stores the results of a classification evaluation. 
Args: @@ -695,6 +727,7 @@ class ClassificationResults(BaseEvaluationResults): observed ground truth/predicted labels are used missing (None): a missing label string. Any None-valued labels are given this label for evaluation purposes + custom_metrics (None): an optional dict of custom metrics backend (None): a :class:`ClassificationEvaluation` backend """ @@ -718,6 +751,7 @@ class BinaryClassificationResults(ClassificationResults): weights (None): an optional list of sample weights ytrue_ids (None): a list of IDs for the ground truth labels ypred_ids (None): a list of IDs for the predicted labels + custom_metrics (None): an optional dict of custom metrics backend (None): a :class:`ClassificationEvaluation` backend """ @@ -733,6 +767,7 @@ def __init__( weights=None, ytrue_ids=None, ypred_ids=None, + custom_metrics=None, backend=None, ): super().__init__( @@ -747,6 +782,7 @@ def __init__( ypred_ids=ypred_ids, classes=classes, missing=classes[0], + custom_metrics=custom_metrics, backend=backend, ) @@ -866,6 +902,7 @@ def _from_dict(cls, d, samples, config, eval_key, **kwargs): weights = d.get("weights", None) ytrue_ids = d.get("ytrue_ids", None) ypred_ids = d.get("ypred_ids", None) + custom_metrics = d.get("custom_metrics", None) return cls( samples, config, @@ -877,6 +914,7 @@ def _from_dict(cls, d, samples, config, eval_key, **kwargs): weights=weights, ytrue_ids=ytrue_ids, ypred_ids=ypred_ids, + custom_metrics=custom_metrics, **kwargs, ) @@ -885,6 +923,10 @@ def _parse_config(pred_field, gt_field, method, **kwargs): if method is None: method = fo.evaluation_config.default_classification_backend + custom_metrics = kwargs.get("custom_metrics", None) + if etau.is_str(custom_metrics): + kwargs["custom_metrics"] = [custom_metrics] + if inspect.isclass(method): return method(pred_field, gt_field, **kwargs) diff --git a/fiftyone/utils/eval/coco.py b/fiftyone/utils/eval/coco.py index 3de1702051..47483a3966 100644 --- a/fiftyone/utils/eval/coco.py +++ b/fiftyone/utils/eval/coco.py @@ -66,6 +66,8 @@ class COCOEvaluationConfig(DetectionEvaluationConfig): If ``error_level > 0``, any calculation that raises a geometric error will default to an IoU of 0 + custom_metrics (None): an optional list of custom metrics to compute + or dict mapping metric names to kwargs dicts """ def __init__( @@ -82,10 +84,16 @@ def __init__( iou_threshs=None, max_preds=None, error_level=1, + custom_metrics=None, **kwargs, ): super().__init__( - pred_field, gt_field, iou=iou, classwise=classwise, **kwargs + pred_field, + gt_field, + iou=iou, + classwise=classwise, + custom_metrics=custom_metrics, + **kwargs, ) if compute_mAP and iou_threshs is None: @@ -262,6 +270,7 @@ class COCODetectionResults(DetectionResults): ``num_iou_threshs x num_classes x num_recall`` missing (None): a missing label string. 
Any unmatched objects are given this label for evaluation purposes + custom_metrics (None): an optional dict of custom metrics backend (None): a :class:`COCOEvaluation` backend """ @@ -278,6 +287,7 @@ def __init__( recall_sweep=None, thresholds=None, missing=None, + custom_metrics=None, backend=None, ): super().__init__( @@ -287,6 +297,7 @@ def __init__( matches, classes=classes, missing=missing, + custom_metrics=custom_metrics, backend=backend, ) diff --git a/fiftyone/utils/eval/detection.py b/fiftyone/utils/eval/detection.py index f5a7173578..0fedacfb39 100644 --- a/fiftyone/utils/eval/detection.py +++ b/fiftyone/utils/eval/detection.py @@ -15,13 +15,16 @@ import eta.core.utils as etau import fiftyone as fo -import fiftyone.core.evaluation as foe import fiftyone.core.fields as fof import fiftyone.core.labels as fol import fiftyone.core.utils as fou import fiftyone.core.validation as fov -from .base import BaseEvaluationResults +from .base import ( + BaseEvaluationMethod, + BaseEvaluationMethodConfig, + BaseClassificationResults, +) logger = logging.getLogger(__name__) @@ -40,6 +43,7 @@ def evaluate_detections( use_boxes=False, classwise=True, dynamic=True, + custom_metrics=None, progress=None, **kwargs, ): @@ -132,6 +136,8 @@ def evaluate_detections( label (True) or allow matches between classes (False) dynamic (True): whether to declare the dynamic object-level attributes that are populated on the dataset's schema + custom_metrics (None): an optional list of custom metrics to compute + or dict mapping metric names to kwargs dicts progress (None): whether to render a progress bar (True/False), use the default value ``fiftyone.config.show_progress_bars`` (None), or a progress callback function to invoke instead @@ -164,6 +170,7 @@ def evaluate_detections( is_temporal, iou=iou, classwise=classwise, + custom_metrics=custom_metrics, **kwargs, ) @@ -223,12 +230,13 @@ def evaluate_detections( missing=missing, progress=progress, ) + eval_method.compute_custom_metrics(samples, eval_key, results) eval_method.save_run_results(samples, eval_key, results) return results -class DetectionEvaluationConfig(foe.EvaluationMethodConfig): +class DetectionEvaluationConfig(BaseEvaluationMethodConfig): """Base class for configuring :class:`DetectionEvaluation` instances. Args: @@ -243,16 +251,25 @@ class DetectionEvaluationConfig(foe.EvaluationMethodConfig): iou (None): the IoU threshold to use to determine matches classwise (None): whether to only match objects with the same class label (True) or allow matches between classes (False) + custom_metrics (None): an optional list of custom metrics to compute + or dict mapping metric names to kwargs dicts """ def __init__( - self, pred_field, gt_field, iou=None, classwise=None, **kwargs + self, + pred_field, + gt_field, + iou=None, + classwise=None, + custom_metrics=None, + **kwargs, ): super().__init__(**kwargs) self.pred_field = pred_field self.gt_field = gt_field self.iou = iou self.classwise = classwise + self.custom_metrics = custom_metrics @property def type(self): @@ -270,7 +287,7 @@ def requires_additional_fields(self): return False -class DetectionEvaluation(foe.EvaluationMethod): +class DetectionEvaluation(BaseEvaluationMethod): """Base class for detection evaluation methods. 
Args: @@ -442,6 +459,8 @@ def get_fields(self, samples, eval_key): ["%s_tp" % prefix, "%s_fp" % prefix, "%s_fn" % prefix] ) + fields.extend(self.get_custom_metric_fields(samples, eval_key)) + return fields def rename(self, samples, eval_key, new_eval_key): @@ -463,6 +482,8 @@ def rename(self, samples, eval_key, new_eval_key): fields = dict(zip(in_frame_fields, out_frame_fields)) dataset.rename_frame_fields(fields) + self.rename_custom_metrics(samples, eval_key, new_eval_key) + def cleanup(self, samples, eval_key): dataset = samples._dataset @@ -507,12 +528,14 @@ def cleanup(self, samples, eval_key): else: dataset.delete_sample_fields(fields, error_level=1) + self.cleanup_custom_metrics(samples, eval_key) + def _validate_run(self, samples, eval_key, existing_info): self._validate_fields_match(eval_key, "pred_field", existing_info) self._validate_fields_match(eval_key, "gt_field", existing_info) -class DetectionResults(BaseEvaluationResults): +class DetectionResults(BaseClassificationResults): """Class that stores the results of a detection evaluation. Args: @@ -527,6 +550,7 @@ class DetectionResults(BaseEvaluationResults): observed ground truth/predicted labels are used missing (None): a missing label string. Any unmatched objects are given this label for evaluation purposes + custom_metrics (None): an optional dict of custom metrics backend (None): a :class:`DetectionEvaluation` backend """ @@ -538,6 +562,7 @@ def __init__( matches, classes=None, missing=None, + custom_metrics=None, backend=None, ): if matches: @@ -563,6 +588,7 @@ def __init__( ypred_ids=ypred_ids, classes=classes, missing=missing, + custom_metrics=custom_metrics, backend=backend, ) @@ -588,6 +614,7 @@ def _from_dict(cls, d, samples, config, eval_key, **kwargs): classes = d.get("classes", None) missing = d.get("missing", None) + custom_metrics = d.get("custom_metrics", None) matches = list(zip(ytrue, ypred, ious, confs, ytrue_ids, ypred_ids)) @@ -598,6 +625,7 @@ def _from_dict(cls, d, samples, config, eval_key, **kwargs): matches, classes=classes, missing=missing, + custom_metrics=custom_metrics, **kwargs, ) @@ -610,6 +638,10 @@ def _parse_config(pred_field, gt_field, method, is_temporal, **kwargs): else: method = fo.evaluation_config.default_detection_backend + custom_metrics = kwargs.get("custom_metrics", None) + if etau.is_str(custom_metrics): + kwargs["custom_metrics"] = [custom_metrics] + if inspect.isclass(method): return method(pred_field, gt_field, **kwargs) diff --git a/fiftyone/utils/eval/openimages.py b/fiftyone/utils/eval/openimages.py index ffd4d65e20..6a7addefaf 100644 --- a/fiftyone/utils/eval/openimages.py +++ b/fiftyone/utils/eval/openimages.py @@ -68,6 +68,8 @@ class OpenImagesEvaluationConfig(DetectionEvaluationConfig): labels according to the provided ``hierarchy`` expand_pred_hierarchy (False): whether to expand predicted objects and labels according to the provided ``hierarchy`` + custom_metrics (None): an optional list of custom metrics to compute + or dict mapping metric names to kwargs dicts """ def __init__( @@ -87,10 +89,16 @@ def __init__( neg_label_field=None, expand_gt_hierarchy=True, expand_pred_hierarchy=False, + custom_metrics=None, **kwargs ): super().__init__( - pred_field, gt_field, iou=iou, classwise=classwise, **kwargs + pred_field, + gt_field, + iou=iou, + classwise=classwise, + custom_metrics=custom_metrics, + **kwargs, ) self.iscrowd = iscrowd @@ -286,6 +294,7 @@ class OpenImagesDetectionResults(DetectionResults): thresholds (None): an optional dict of per-class decision thresholds 
missing (None): a missing label string. Any unmatched objects are given this label for evaluation purposes + custom_metrics (None): an optional dict of custom metrics backend (None): a :class:`OpenImagesEvaluation` backend """ @@ -300,6 +309,7 @@ def __init__( classes, thresholds=None, missing=None, + custom_metrics=None, backend=None, ): super().__init__( @@ -309,6 +319,7 @@ def __init__( matches, classes=classes, missing=missing, + custom_metrics=custom_metrics, backend=backend, ) diff --git a/fiftyone/utils/eval/regression.py b/fiftyone/utils/eval/regression.py index 09663ab6af..0db937179e 100644 --- a/fiftyone/utils/eval/regression.py +++ b/fiftyone/utils/eval/regression.py @@ -18,13 +18,18 @@ import eta.core.utils as etau import fiftyone as fo -import fiftyone.core.evaluation as foe import fiftyone.core.fields as fof import fiftyone.core.labels as fol import fiftyone.core.plots as fop import fiftyone.core.utils as fou import fiftyone.core.validation as fov +from .base import ( + BaseEvaluationMethodConfig, + BaseEvaluationMethod, + BaseEvaluationResults, +) + logger = logging.getLogger(__name__) @@ -36,6 +41,7 @@ def evaluate_regressions( eval_key=None, missing=None, method=None, + custom_metrics=None, progress=None, **kwargs, ): @@ -75,6 +81,8 @@ def evaluate_regressions( supported values are ``fo.evaluation_config.regression_backends.keys()`` and the default is ``fo.evaluation_config.default_regression_backend`` + custom_metrics (None): an optional list of custom metrics to compute + or dict mapping metric names to kwargs dicts progress (None): whether to render a progress bar (True/False), use the default value ``fiftyone.config.show_progress_bars`` (None), or a progress callback function to invoke instead @@ -89,7 +97,13 @@ def evaluate_regressions( samples, (pred_field, gt_field), fol.Regression, same_type=True ) - config = _parse_config(pred_field, gt_field, method, **kwargs) + config = _parse_config( + pred_field, + gt_field, + method, + custom_metrics=custom_metrics, + **kwargs, + ) eval_method = config.build() eval_method.ensure_requirements() @@ -99,12 +113,13 @@ def evaluate_regressions( results = eval_method.evaluate_samples( samples, eval_key=eval_key, missing=missing, progress=progress ) + eval_method.compute_custom_metrics(samples, eval_key, results) eval_method.save_run_results(samples, eval_key, results) return results -class RegressionEvaluationConfig(foe.EvaluationMethodConfig): +class RegressionEvaluationConfig(BaseEvaluationMethodConfig): """Base class for configuring :class:`RegressionEvaluation` instances. Args: @@ -112,19 +127,22 @@ class RegressionEvaluationConfig(foe.EvaluationMethodConfig): :class:`fiftyone.core.labels.Regression` instances gt_field ("ground_truth"): the name of the field containing the ground truth :class:`fiftyone.core.labels.Regression` instances + custom_metrics (None): an optional list of custom metrics to compute + or dict mapping metric names to kwargs dicts """ - def __init__(self, pred_field, gt_field, **kwargs): + def __init__(self, pred_field, gt_field, custom_metrics=None, **kwargs): super().__init__(**kwargs) self.pred_field = pred_field self.gt_field = gt_field + self.custom_metrics = custom_metrics @property def type(self): return "regression" -class RegressionEvaluation(foe.EvaluationMethod): +class RegressionEvaluation(BaseEvaluationMethod): """Base class for regression evaluation methods. 
Args: @@ -181,6 +199,8 @@ def get_fields(self, samples, eval_key): prefix = samples._FRAMES_PREFIX + eval_key fields.append(prefix) + fields.extend(self.get_custom_metric_fields(samples, eval_key)) + return fields def rename(self, samples, eval_key, new_eval_key): @@ -202,6 +222,8 @@ def rename(self, samples, eval_key, new_eval_key): fields = dict(zip(in_frame_fields, out_frame_fields)) dataset.rename_frame_fields(fields) + self.rename_custom_metrics(samples, eval_key, new_eval_key) + def cleanup(self, samples, eval_key): dataset = samples._dataset @@ -211,6 +233,8 @@ def cleanup(self, samples, eval_key): if dataset._is_frame_field(self.config.gt_field): dataset.delete_frame_fields(fields, error_level=1) + self.cleanup_custom_metrics(samples, eval_key) + def _validate_run(self, samples, eval_key, existing_info): self._validate_fields_match(eval_key, "pred_field", existing_info) self._validate_fields_match(eval_key, "gt_field", existing_info) @@ -228,10 +252,21 @@ class SimpleEvaluationConfig(RegressionEvaluationConfig): sample/frame-level error data. Supported values are ``("squared_error", "absolute_error")`` or any function that accepts two scalar arguments ``(ypred, ytrue)`` + custom_metrics (None): an optional list of custom metrics to compute + or dict mapping metric names to kwargs dicts """ - def __init__(self, pred_field, gt_field, metric="squared_error", **kwargs): - super().__init__(pred_field, gt_field, **kwargs) + def __init__( + self, + pred_field, + gt_field, + metric="squared_error", + custom_metrics=None, + **kwargs, + ): + super().__init__( + pred_field, gt_field, custom_metrics=custom_metrics, **kwargs + ) self._metric = metric @property @@ -344,7 +379,7 @@ def compute_error(yp, yt): return results -class RegressionResults(foe.EvaluationResults): +class RegressionResults(BaseEvaluationResults): """Class that stores the results of a regression evaluation. Args: @@ -361,6 +396,7 @@ class RegressionResults(foe.EvaluationResults): regressions missing (None): a missing value. Any None-valued regressions are given this value for results purposes + custom_metrics (None): an optional dict of custom metrics backend (None): a :class:`RegressionEvaluation` backend """ @@ -374,6 +410,7 @@ def __init__( confs=None, ids=None, missing=None, + custom_metrics=None, backend=None, ): super().__init__(samples, config, eval_key, backend=backend) @@ -387,6 +424,7 @@ def __init__( self.confs = confs self.ids = ids self.missing = missing + self.custom_metrics = custom_metrics def metrics(self, weights=None): """Computes various popular regression metrics for the results. 
@@ -517,6 +555,7 @@ def _from_dict(cls, d, samples, config, eval_key, **kwargs): confs = d.get("confs", None) ids = d.get("ids", None) missing = d.get("missing", None) + custom_metrics = d.get("custom_metrics", None) return cls( samples, config, @@ -526,6 +565,7 @@ def _from_dict(cls, d, samples, config, eval_key, **kwargs): confs=confs, ids=ids, missing=missing, + custom_metrics=custom_metrics, **kwargs, ) @@ -534,6 +574,10 @@ def _parse_config(pred_field, gt_field, method, **kwargs): if method is None: method = fo.evaluation_config.default_regression_backend + custom_metrics = kwargs.get("custom_metrics", None) + if etau.is_str(custom_metrics): + kwargs["custom_metrics"] = [custom_metrics] + if inspect.isclass(method): return method(pred_field, gt_field, **kwargs) diff --git a/fiftyone/utils/eval/segmentation.py b/fiftyone/utils/eval/segmentation.py index 6e15be95c1..2bd5c5770f 100644 --- a/fiftyone/utils/eval/segmentation.py +++ b/fiftyone/utils/eval/segmentation.py @@ -17,13 +17,16 @@ import eta.core.utils as etau import fiftyone as fo -import fiftyone.core.evaluation as foe import fiftyone.core.fields as fof import fiftyone.core.labels as fol import fiftyone.core.utils as fou import fiftyone.core.validation as fov -from .base import BaseEvaluationResults +from .base import ( + BaseEvaluationMethod, + BaseEvaluationMethodConfig, + BaseClassificationResults, +) logger = logging.getLogger(__name__) @@ -36,6 +39,7 @@ def evaluate_segmentations( eval_key=None, mask_targets=None, method=None, + custom_metrics=None, progress=None, **kwargs, ): @@ -87,6 +91,8 @@ def evaluate_segmentations( supported values are ``fo.evaluation_config.segmentation_backends.keys()`` and the default is ``fo.evaluation_config.default_segmentation_backend`` + custom_metrics (None): an optional list of custom metrics to compute + or dict mapping metric names to kwargs dicts progress (None): whether to render a progress bar (True/False), use the default value ``fiftyone.config.show_progress_bars`` (None), or a progress callback function to invoke instead @@ -101,7 +107,13 @@ def evaluate_segmentations( samples, (pred_field, gt_field), fol.Segmentation, same_type=True ) - config = _parse_config(pred_field, gt_field, method, **kwargs) + config = _parse_config( + pred_field, + gt_field, + method, + custom_metrics=custom_metrics, + **kwargs, + ) eval_method = config.build() eval_method.ensure_requirements() @@ -114,12 +126,13 @@ def evaluate_segmentations( mask_targets=mask_targets, progress=progress, ) + eval_method.compute_custom_metrics(samples, eval_key, results) eval_method.save_run_results(samples, eval_key, results) return results -class SegmentationEvaluationConfig(foe.EvaluationMethodConfig): +class SegmentationEvaluationConfig(BaseEvaluationMethodConfig): """Base class for configuring :class:`SegmentationEvaluation` instances. 
Args: @@ -129,20 +142,30 @@ class SegmentationEvaluationConfig(foe.EvaluationMethodConfig): :class:`fiftyone.core.labels.Segmentation` instances compute_dice (False): whether to compute the Dice coefficient for each sample + custom_metrics (None): an optional list of custom metrics to compute + or dict mapping metric names to kwargs dicts """ - def __init__(self, pred_field, gt_field, compute_dice=False, **kwargs): + def __init__( + self, + pred_field, + gt_field, + compute_dice=False, + custom_metrics=None, + **kwargs, + ): super().__init__(**kwargs) self.pred_field = pred_field self.gt_field = gt_field self.compute_dice = compute_dice + self.custom_metrics = custom_metrics @property def type(self): return "segmentation" -class SegmentationEvaluation(foe.EvaluationMethod): +class SegmentationEvaluation(BaseEvaluationMethod): """Base class for segmentation evaluation methods. Args: @@ -232,6 +255,8 @@ def get_fields(self, samples, eval_key): if self.config.compute_dice: fields.append("%s_dice" % prefix) + fields.extend(self.get_custom_metric_fields(samples, eval_key)) + return fields def rename(self, samples, eval_key, new_eval_key): @@ -253,6 +278,8 @@ def rename(self, samples, eval_key, new_eval_key): fields = dict(zip(in_frame_fields, out_frame_fields)) dataset.rename_frame_fields(fields) + self.rename_custom_metrics(samples, eval_key, new_eval_key) + def cleanup(self, samples, eval_key): dataset = samples._dataset processing_frames = samples._is_frame_field(self.config.gt_field) @@ -271,6 +298,8 @@ def cleanup(self, samples, eval_key): if processing_frames: dataset.delete_frame_fields(fields, error_level=1) + self.cleanup_custom_metrics(samples, eval_key) + def _validate_run(self, samples, eval_key, existing_info): self._validate_fields_match(eval_key, "pred_field", existing_info) self._validate_fields_match(eval_key, "gt_field", existing_info) @@ -292,6 +321,8 @@ class SimpleEvaluationConfig(SegmentationEvaluationConfig): default, the entire masks are evaluated average ("micro"): the averaging strategy to use when populating precision and recall numbers on each sample + custom_metrics (None): an optional list of custom metrics to compute + or dict mapping metric names to kwargs dicts """ def __init__( @@ -301,10 +332,15 @@ def __init__( compute_dice=False, bandwidth=None, average="micro", + custom_metrics=None, **kwargs, ): super().__init__( - pred_field, gt_field, compute_dice=compute_dice, **kwargs + pred_field, + gt_field, + compute_dice=compute_dice, + custom_metrics=custom_metrics, + **kwargs, ) self.bandwidth = bandwidth self.average = average @@ -429,7 +465,7 @@ def evaluate_samples( ) -class SegmentationResults(BaseEvaluationResults): +class SegmentationResults(BaseClassificationResults): """Class that stores the results of a segmentation evaluation. 
Args: @@ -439,6 +475,7 @@ class SegmentationResults(BaseEvaluationResults): pixel_confusion_matrix: a pixel value confusion matrix classes: a list of class labels corresponding to the confusion matrix missing (None): a missing (background) class + custom_metrics (None): an optional dict of custom metrics backend (None): a :class:`SegmentationEvaluation` backend """ @@ -450,6 +487,7 @@ def __init__( pixel_confusion_matrix, classes, missing=None, + custom_metrics=None, backend=None, ): pixel_confusion_matrix = np.asarray(pixel_confusion_matrix) @@ -466,13 +504,20 @@ def __init__( weights=weights, classes=classes, missing=missing, + custom_metrics=custom_metrics, backend=backend, ) self.pixel_confusion_matrix = pixel_confusion_matrix def attributes(self): - return ["cls", "pixel_confusion_matrix", "classes", "missing"] + return [ + "cls", + "pixel_confusion_matrix", + "classes", + "missing", + "custom_metrics", + ] def dice_score(self): """Computes the Dice score across all samples in the evaluation. @@ -491,6 +536,7 @@ def _from_dict(cls, d, samples, config, eval_key, **kwargs): d["pixel_confusion_matrix"], d["classes"], missing=d.get("missing", None), + custom_metrics=d.get("custom_metrics", None), **kwargs, ) @@ -515,6 +561,10 @@ def _parse_config(pred_field, gt_field, method, **kwargs): if method is None: method = fo.evaluation_config.default_segmentation_backend + custom_metrics = kwargs.get("custom_metrics", None) + if etau.is_str(custom_metrics): + kwargs["custom_metrics"] = [custom_metrics] + if inspect.isclass(method): return method(pred_field, gt_field, **kwargs)
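
Usage sketch: a minimal example of the ``custom_metrics`` option that this patch threads through the evaluation configs, assuming FiftyOne with this patch applied and a metric operator registered under the hypothetical URI ``@my-org/my-plugin/my_metric`` that implements the ``compute()``/``get_fields()``/``rename()``/``cleanup()`` interface consumed by ``BaseEvaluationMethod`` in ``fiftyone/utils/eval/base.py``::

    import fiftyone.zoo as foz

    # any dataset with ground truth and predicted detections works;
    # the zoo quickstart dataset is used here purely for illustration
    dataset = foz.load_zoo_dataset("quickstart")

    # custom metrics may be passed as a list of operator URIs...
    results = dataset.evaluate_detections(
        "predictions",
        gt_field="ground_truth",
        eval_key="eval",
        custom_metrics=["@my-org/my-plugin/my_metric"],  # hypothetical URI
    )

    # ...or as a dict mapping operator URIs to kwargs dicts for each metric
    results = dataset.evaluate_detections(
        "predictions",
        gt_field="ground_truth",
        eval_key="eval_custom",
        custom_metrics={
            "@my-org/my-plugin/my_metric": {"beta": 2.0},  # hypothetical kwargs
        },
    )

    # values returned by each metric's compute() are stored on the results
    print(results.custom_metrics)

The same keyword is accepted by ``evaluate_classifications()``, ``evaluate_regressions()``, and ``evaluate_segmentations()``, and each backend's ``_parse_config()`` normalizes a bare string to a one-element list.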