From 6513247b7163e3ae87356cfd9ad6c4a2e72bc19a Mon Sep 17 00:00:00 2001 From: brimoor Date: Mon, 16 Dec 2024 09:48:06 -0500 Subject: [PATCH] adding support for custom evaluation metrics --- fiftyone/utils/eval/activitynet.py | 13 ++- fiftyone/utils/eval/base.py | 112 +++++++++++++++++++++++++- fiftyone/utils/eval/classification.py | 60 +++++++++++--- fiftyone/utils/eval/coco.py | 13 ++- fiftyone/utils/eval/detection.py | 44 ++++++++-- fiftyone/utils/eval/openimages.py | 13 ++- fiftyone/utils/eval/regression.py | 60 ++++++++++++-- fiftyone/utils/eval/segmentation.py | 68 +++++++++++++--- 8 files changed, 346 insertions(+), 37 deletions(-) diff --git a/fiftyone/utils/eval/activitynet.py b/fiftyone/utils/eval/activitynet.py index 7a337522b8..6e739ef9c0 100644 --- a/fiftyone/utils/eval/activitynet.py +++ b/fiftyone/utils/eval/activitynet.py @@ -40,6 +40,8 @@ class ActivityNetEvaluationConfig(DetectionEvaluationConfig): that mAP and PR curves can be generated iou_threshs (None): a list of IoU thresholds to use when computing mAP and PR curves. Only applicable when ``compute_mAP`` is True + custom_metrics (None): an optional list of custom metrics to compute + or dict mapping metric names to kwargs dicts """ def __init__( @@ -50,10 +52,16 @@ def __init__( classwise=None, compute_mAP=False, iou_threshs=None, + custom_metrics=None, **kwargs, ): super().__init__( - pred_field, gt_field, iou=iou, classwise=classwise, **kwargs + pred_field, + gt_field, + iou=iou, + classwise=classwise, + custom_metrics=custom_metrics, + **kwargs, ) if compute_mAP and iou_threshs is None: @@ -323,6 +331,7 @@ class ActivityNetDetectionResults(DetectionResults): ``num_iou_threshs x num_classes x num_recall`` missing (None): a missing label string. Any unmatched segments are given this label for evaluation purposes + custom_metrics (None): an optional dict of custom metrics backend (None): a :class:`ActivityNetEvaluation` backend """ @@ -339,6 +348,7 @@ def __init__( classes, thresholds=None, missing=None, + custom_metrics=None, backend=None, ): super().__init__( @@ -348,6 +358,7 @@ def __init__( matches, classes=classes, missing=missing, + custom_metrics=custom_metrics, backend=backend, ) diff --git a/fiftyone/utils/eval/base.py b/fiftyone/utils/eval/base.py index 5fdcf9ba38..8471184c70 100644 --- a/fiftyone/utils/eval/base.py +++ b/fiftyone/utils/eval/base.py @@ -6,17 +6,124 @@ | """ import itertools +import logging import numpy as np import sklearn.metrics as skm import fiftyone.core.evaluation as foe import fiftyone.core.plots as fop +import fiftyone.core.utils as fou + +foo = fou.lazy_import("fiftyone.operators") + + +logger = logging.getLogger(__name__) + + +class BaseEvaluationMethodConfig(foe.EvaluationMethodConfig): + """Base class for configuring evaluation methods. + + Args: + **kwargs: any leftover keyword arguments after subclasses have done + their parsing + """ + + pass + + +class BaseEvaluationMethod(foe.EvaluationMethod): + """Base class for evaluation methods. 
+ + Args: + config: an :class:`BaseEvaluationMethodConfig` + """ + + def _get_custom_metrics(self): + if not self.config.custom_metrics: + return {} + + if isinstance(self.config.custom_metrics, list): + return {m: None for m in self.config.custom_metrics} + + return self.config.custom_metrics + + def compute_custom_metrics(self, samples, eval_key, results): + results.custom_metrics = {} + + for metric, kwargs in self._get_custom_metrics().items(): + try: + operator = foo.get_operator(metric) + value = operator.compute( + samples, eval_key, results, **kwargs or {} + ) + if value is not None: + results.custom_metrics[operator.name] = value + except Exception as e: + logger.warning( + "Failed to compute metric '%s': Reason: %s", + operator.uri, + e, + ) + + def get_custom_metric_fields(self, samples, eval_key): + fields = [] + + for metric in self._get_custom_metrics().keys(): + try: + operator = foo.get_operator(metric) + fields.extend(operator.get_fields(samples, eval_key)) + except Exception as e: + logger.warning( + "Failed to get fields for metric '%s': Reason: %s", + operator.uri, + e, + ) + + return fields + + def rename_custom_metrics(self, samples, eval_key, new_eval_key): + for metric in self._get_custom_metrics().keys(): + try: + operator = foo.get_operator(metric) + operator.rename(samples, eval_key, new_eval_key) + except Exception as e: + logger.warning( + "Failed to rename fields for metric '%s': Reason: %s", + operator.uri, + e, + ) + + def cleanup_custom_metrics(self, samples, eval_key): + for metric in self._get_custom_metrics().keys(): + try: + operator = foo.get_operator(metric) + operator.cleanup(samples, eval_key) + except Exception as e: + logger.warning( + "Failed to cleanup metric '%s': Reason: %s", + operator.uri, + e, + ) class BaseEvaluationResults(foe.EvaluationResults): """Base class for evaluation results. + Args: + samples: the :class:`fiftyone.core.collections.SampleCollection` used + config: the :class:`BaseEvaluationMethodConfig` used + eval_key: the evaluation key + backend (None): an :class:`EvaluationMethod` backend + """ + + pass + + +class BaseClassificationResults(BaseEvaluationResults): + """Base class for evaluation results that expose classification metrics + like P/R/F1 and confusion matrices. + Args: samples: the :class:`fiftyone.core.collections.SampleCollection` used config: the :class:`fiftyone.core.evaluation.EvaluationMethodConfig` @@ -32,8 +139,7 @@ class BaseEvaluationResults(foe.EvaluationResults): observed ground truth/predicted labels are used missing (None): a missing label string. 
Any None-valued labels are given this label for evaluation purposes - samples (None): the :class:`fiftyone.core.collections.SampleCollection` - for which the results were computed + custom_metrics (None): an optional dict of custom metrics backend (None): a :class:`fiftyone.core.evaluation.EvaluationMethod` backend """ @@ -51,6 +157,7 @@ def __init__( ypred_ids=None, classes=None, missing=None, + custom_metrics=None, backend=None, ): super().__init__(samples, config, eval_key, backend=backend) @@ -72,6 +179,7 @@ def __init__( ) self.classes = np.asarray(classes) self.missing = missing + self.custom_metrics = custom_metrics def report(self, classes=None): """Generates a classification report for the results via diff --git a/fiftyone/utils/eval/classification.py b/fiftyone/utils/eval/classification.py index 6df82a5798..65ae55a92b 100644 --- a/fiftyone/utils/eval/classification.py +++ b/fiftyone/utils/eval/classification.py @@ -16,7 +16,6 @@ import eta.core.utils as etau import fiftyone as fo -import fiftyone.core.evaluation as foe from fiftyone.core.expressions import ViewField as F import fiftyone.core.fields as fof import fiftyone.core.labels as fol @@ -24,7 +23,11 @@ import fiftyone.core.utils as fou import fiftyone.core.validation as fov -from .base import BaseEvaluationResults +from .base import ( + BaseEvaluationMethod, + BaseEvaluationMethodConfig, + BaseClassificationResults, +) def evaluate_classifications( @@ -35,6 +38,7 @@ def evaluate_classifications( classes=None, missing=None, method=None, + custom_metrics=None, progress=None, **kwargs, ): @@ -82,6 +86,8 @@ def evaluate_classifications( supported values are ``fo.evaluation_config.classification_backends.keys()`` and the default is ``fo.evaluation_config.default_classification_backend`` + custom_metrics (None): an optional list of custom metrics to compute + or dict mapping metric names to kwargs dicts progress (None): whether to render a progress bar (True/False), use the default value ``fiftyone.config.show_progress_bars`` (None), or a progress callback function to invoke instead @@ -96,7 +102,13 @@ def evaluate_classifications( samples, (pred_field, gt_field), fol.Classification, same_type=True ) - config = _parse_config(pred_field, gt_field, method, **kwargs) + config = _parse_config( + pred_field, + gt_field, + method, + custom_metrics=custom_metrics, + **kwargs, + ) eval_method = config.build() eval_method.ensure_requirements() @@ -110,12 +122,13 @@ def evaluate_classifications( missing=missing, progress=progress, ) + eval_method.compute_custom_metrics(samples, eval_key, results) eval_method.save_run_results(samples, eval_key, results) return results -class ClassificationEvaluationConfig(foe.EvaluationMethodConfig): +class ClassificationEvaluationConfig(BaseEvaluationMethodConfig): """Base class for configuring :class:`ClassificationEvaluation` instances. 
@@ -124,19 +137,22 @@ class ClassificationEvaluationConfig(foe.EvaluationMethodConfig): :class:`fiftyone.core.labels.Classification` instances gt_field: the name of the field containing the ground truth :class:`fiftyone.core.labels.Classification` instances + custom_metrics (None): an optional list of custom metrics to compute + or dict mapping metric names to kwargs dicts """ - def __init__(self, pred_field, gt_field, **kwargs): + def __init__(self, pred_field, gt_field, custom_metrics=None, **kwargs): super().__init__(**kwargs) self.pred_field = pred_field self.gt_field = gt_field + self.custom_metrics = custom_metrics @property def type(self): return "classification" -class ClassificationEvaluation(foe.EvaluationMethod): +class ClassificationEvaluation(BaseEvaluationMethod): """Base class for classification evaluation methods. Args: @@ -187,6 +203,8 @@ def get_fields(self, samples, eval_key): if is_frame_field: fields.append(samples._FRAMES_PREFIX + eval_key) + fields.extend(self.get_custom_metric_fields(samples, eval_key)) + return fields def rename(self, samples, eval_key, new_eval_key): @@ -208,6 +226,8 @@ def rename(self, samples, eval_key, new_eval_key): fields = dict(zip(in_frame_fields, out_frame_fields)) dataset.rename_frame_fields(fields) + self.rename_custom_metrics(samples, eval_key, new_eval_key) + def cleanup(self, samples, eval_key): dataset = samples._dataset is_frame_field = samples._is_frame_field(self.config.gt_field) @@ -217,6 +237,8 @@ def cleanup(self, samples, eval_key): if is_frame_field: dataset.delete_frame_field(eval_key, error_level=1) + self.cleanup_custom_metrics(samples, eval_key) + def _validate_run(self, samples, eval_key, existing_info): self._validate_fields_match(eval_key, "pred_field", existing_info) self._validate_fields_match(eval_key, "gt_field", existing_info) @@ -230,6 +252,8 @@ class SimpleEvaluationConfig(ClassificationEvaluationConfig): :class:`fiftyone.core.labels.Classification` instances gt_field: the name of the field containing the ground truth :class:`fiftyone.core.labels.Classification` instances + custom_metrics (None): an optional list of custom metrics to compute + or dict mapping metric names to kwargs dicts """ @property @@ -337,10 +361,16 @@ class TopKEvaluationConfig(ClassificationEvaluationConfig): gt_field: the name of the field containing the ground truth :class:`fiftyone.core.labels.Classification` instances k (5): the top-k value to use when assessing accuracy + custom_metrics (None): an optional list of custom metrics to compute + or dict mapping metric names to kwargs dicts """ - def __init__(self, pred_field, gt_field, k=5, **kwargs): - super().__init__(pred_field, gt_field, **kwargs) + def __init__( + self, pred_field, gt_field, k=5, custom_metrics=None, **kwargs + ): + super().__init__( + pred_field, gt_field, custom_metrics=custom_metrics, **kwargs + ) self.k = k @property @@ -535,6 +565,8 @@ class BinaryEvaluationConfig(ClassificationEvaluationConfig): pred_field: the name of the field containing the predicted :class:`fiftyone.core.labels.Classification` instances gt_field: the name of the field containing the ground truth + custom_metrics (None): an optional list of custom metrics to compute + or dict mapping metric names to kwargs dicts """ @property @@ -678,7 +710,7 @@ def evaluate_samples( return results -class ClassificationResults(BaseEvaluationResults): +class ClassificationResults(BaseClassificationResults): """Class that stores the results of a classification evaluation. 
Args: @@ -695,6 +727,7 @@ class ClassificationResults(BaseEvaluationResults): observed ground truth/predicted labels are used missing (None): a missing label string. Any None-valued labels are given this label for evaluation purposes + custom_metrics (None): an optional dict of custom metrics backend (None): a :class:`ClassificationEvaluation` backend """ @@ -718,6 +751,7 @@ class BinaryClassificationResults(ClassificationResults): weights (None): an optional list of sample weights ytrue_ids (None): a list of IDs for the ground truth labels ypred_ids (None): a list of IDs for the predicted labels + custom_metrics (None): an optional dict of custom metrics backend (None): a :class:`ClassificationEvaluation` backend """ @@ -733,6 +767,7 @@ def __init__( weights=None, ytrue_ids=None, ypred_ids=None, + custom_metrics=None, backend=None, ): super().__init__( @@ -747,6 +782,7 @@ def __init__( ypred_ids=ypred_ids, classes=classes, missing=classes[0], + custom_metrics=custom_metrics, backend=backend, ) @@ -866,6 +902,7 @@ def _from_dict(cls, d, samples, config, eval_key, **kwargs): weights = d.get("weights", None) ytrue_ids = d.get("ytrue_ids", None) ypred_ids = d.get("ypred_ids", None) + custom_metrics = d.get("custom_metrics", None) return cls( samples, config, @@ -877,6 +914,7 @@ def _from_dict(cls, d, samples, config, eval_key, **kwargs): weights=weights, ytrue_ids=ytrue_ids, ypred_ids=ypred_ids, + custom_metrics=custom_metrics, **kwargs, ) @@ -885,6 +923,10 @@ def _parse_config(pred_field, gt_field, method, **kwargs): if method is None: method = fo.evaluation_config.default_classification_backend + custom_metrics = kwargs.get("custom_metrics", None) + if etau.is_str(custom_metrics): + kwargs["custom_metrics"] = [custom_metrics] + if inspect.isclass(method): return method(pred_field, gt_field, **kwargs) diff --git a/fiftyone/utils/eval/coco.py b/fiftyone/utils/eval/coco.py index 3de1702051..47483a3966 100644 --- a/fiftyone/utils/eval/coco.py +++ b/fiftyone/utils/eval/coco.py @@ -66,6 +66,8 @@ class COCOEvaluationConfig(DetectionEvaluationConfig): If ``error_level > 0``, any calculation that raises a geometric error will default to an IoU of 0 + custom_metrics (None): an optional list of custom metrics to compute + or dict mapping metric names to kwargs dicts """ def __init__( @@ -82,10 +84,16 @@ def __init__( iou_threshs=None, max_preds=None, error_level=1, + custom_metrics=None, **kwargs, ): super().__init__( - pred_field, gt_field, iou=iou, classwise=classwise, **kwargs + pred_field, + gt_field, + iou=iou, + classwise=classwise, + custom_metrics=custom_metrics, + **kwargs, ) if compute_mAP and iou_threshs is None: @@ -262,6 +270,7 @@ class COCODetectionResults(DetectionResults): ``num_iou_threshs x num_classes x num_recall`` missing (None): a missing label string. 
Any unmatched objects are given this label for evaluation purposes + custom_metrics (None): an optional dict of custom metrics backend (None): a :class:`COCOEvaluation` backend """ @@ -278,6 +287,7 @@ def __init__( recall_sweep=None, thresholds=None, missing=None, + custom_metrics=None, backend=None, ): super().__init__( @@ -287,6 +297,7 @@ def __init__( matches, classes=classes, missing=missing, + custom_metrics=custom_metrics, backend=backend, ) diff --git a/fiftyone/utils/eval/detection.py b/fiftyone/utils/eval/detection.py index f5a7173578..0fedacfb39 100644 --- a/fiftyone/utils/eval/detection.py +++ b/fiftyone/utils/eval/detection.py @@ -15,13 +15,16 @@ import eta.core.utils as etau import fiftyone as fo -import fiftyone.core.evaluation as foe import fiftyone.core.fields as fof import fiftyone.core.labels as fol import fiftyone.core.utils as fou import fiftyone.core.validation as fov -from .base import BaseEvaluationResults +from .base import ( + BaseEvaluationMethod, + BaseEvaluationMethodConfig, + BaseClassificationResults, +) logger = logging.getLogger(__name__) @@ -40,6 +43,7 @@ def evaluate_detections( use_boxes=False, classwise=True, dynamic=True, + custom_metrics=None, progress=None, **kwargs, ): @@ -132,6 +136,8 @@ def evaluate_detections( label (True) or allow matches between classes (False) dynamic (True): whether to declare the dynamic object-level attributes that are populated on the dataset's schema + custom_metrics (None): an optional list of custom metrics to compute + or dict mapping metric names to kwargs dicts progress (None): whether to render a progress bar (True/False), use the default value ``fiftyone.config.show_progress_bars`` (None), or a progress callback function to invoke instead @@ -164,6 +170,7 @@ def evaluate_detections( is_temporal, iou=iou, classwise=classwise, + custom_metrics=custom_metrics, **kwargs, ) @@ -223,12 +230,13 @@ def evaluate_detections( missing=missing, progress=progress, ) + eval_method.compute_custom_metrics(samples, eval_key, results) eval_method.save_run_results(samples, eval_key, results) return results -class DetectionEvaluationConfig(foe.EvaluationMethodConfig): +class DetectionEvaluationConfig(BaseEvaluationMethodConfig): """Base class for configuring :class:`DetectionEvaluation` instances. Args: @@ -243,16 +251,25 @@ class DetectionEvaluationConfig(foe.EvaluationMethodConfig): iou (None): the IoU threshold to use to determine matches classwise (None): whether to only match objects with the same class label (True) or allow matches between classes (False) + custom_metrics (None): an optional list of custom metrics to compute + or dict mapping metric names to kwargs dicts """ def __init__( - self, pred_field, gt_field, iou=None, classwise=None, **kwargs + self, + pred_field, + gt_field, + iou=None, + classwise=None, + custom_metrics=None, + **kwargs, ): super().__init__(**kwargs) self.pred_field = pred_field self.gt_field = gt_field self.iou = iou self.classwise = classwise + self.custom_metrics = custom_metrics @property def type(self): @@ -270,7 +287,7 @@ def requires_additional_fields(self): return False -class DetectionEvaluation(foe.EvaluationMethod): +class DetectionEvaluation(BaseEvaluationMethod): """Base class for detection evaluation methods. 
Args: @@ -442,6 +459,8 @@ def get_fields(self, samples, eval_key): ["%s_tp" % prefix, "%s_fp" % prefix, "%s_fn" % prefix] ) + fields.extend(self.get_custom_metric_fields(samples, eval_key)) + return fields def rename(self, samples, eval_key, new_eval_key): @@ -463,6 +482,8 @@ def rename(self, samples, eval_key, new_eval_key): fields = dict(zip(in_frame_fields, out_frame_fields)) dataset.rename_frame_fields(fields) + self.rename_custom_metrics(samples, eval_key, new_eval_key) + def cleanup(self, samples, eval_key): dataset = samples._dataset @@ -507,12 +528,14 @@ def cleanup(self, samples, eval_key): else: dataset.delete_sample_fields(fields, error_level=1) + self.cleanup_custom_metrics(samples, eval_key) + def _validate_run(self, samples, eval_key, existing_info): self._validate_fields_match(eval_key, "pred_field", existing_info) self._validate_fields_match(eval_key, "gt_field", existing_info) -class DetectionResults(BaseEvaluationResults): +class DetectionResults(BaseClassificationResults): """Class that stores the results of a detection evaluation. Args: @@ -527,6 +550,7 @@ class DetectionResults(BaseEvaluationResults): observed ground truth/predicted labels are used missing (None): a missing label string. Any unmatched objects are given this label for evaluation purposes + custom_metrics (None): an optional dict of custom metrics backend (None): a :class:`DetectionEvaluation` backend """ @@ -538,6 +562,7 @@ def __init__( matches, classes=None, missing=None, + custom_metrics=None, backend=None, ): if matches: @@ -563,6 +588,7 @@ def __init__( ypred_ids=ypred_ids, classes=classes, missing=missing, + custom_metrics=custom_metrics, backend=backend, ) @@ -588,6 +614,7 @@ def _from_dict(cls, d, samples, config, eval_key, **kwargs): classes = d.get("classes", None) missing = d.get("missing", None) + custom_metrics = d.get("custom_metrics", None) matches = list(zip(ytrue, ypred, ious, confs, ytrue_ids, ypred_ids)) @@ -598,6 +625,7 @@ def _from_dict(cls, d, samples, config, eval_key, **kwargs): matches, classes=classes, missing=missing, + custom_metrics=custom_metrics, **kwargs, ) @@ -610,6 +638,10 @@ def _parse_config(pred_field, gt_field, method, is_temporal, **kwargs): else: method = fo.evaluation_config.default_detection_backend + custom_metrics = kwargs.get("custom_metrics", None) + if etau.is_str(custom_metrics): + kwargs["custom_metrics"] = [custom_metrics] + if inspect.isclass(method): return method(pred_field, gt_field, **kwargs) diff --git a/fiftyone/utils/eval/openimages.py b/fiftyone/utils/eval/openimages.py index ffd4d65e20..6a7addefaf 100644 --- a/fiftyone/utils/eval/openimages.py +++ b/fiftyone/utils/eval/openimages.py @@ -68,6 +68,8 @@ class OpenImagesEvaluationConfig(DetectionEvaluationConfig): labels according to the provided ``hierarchy`` expand_pred_hierarchy (False): whether to expand predicted objects and labels according to the provided ``hierarchy`` + custom_metrics (None): an optional list of custom metrics to compute + or dict mapping metric names to kwargs dicts """ def __init__( @@ -87,10 +89,16 @@ def __init__( neg_label_field=None, expand_gt_hierarchy=True, expand_pred_hierarchy=False, + custom_metrics=None, **kwargs ): super().__init__( - pred_field, gt_field, iou=iou, classwise=classwise, **kwargs + pred_field, + gt_field, + iou=iou, + classwise=classwise, + custom_metrics=custom_metrics, + **kwargs, ) self.iscrowd = iscrowd @@ -286,6 +294,7 @@ class OpenImagesDetectionResults(DetectionResults): thresholds (None): an optional dict of per-class decision thresholds 
missing (None): a missing label string. Any unmatched objects are given this label for evaluation purposes + custom_metrics (None): an optional dict of custom metrics backend (None): a :class:`OpenImagesEvaluation` backend """ @@ -300,6 +309,7 @@ def __init__( classes, thresholds=None, missing=None, + custom_metrics=None, backend=None, ): super().__init__( @@ -309,6 +319,7 @@ def __init__( matches, classes=classes, missing=missing, + custom_metrics=custom_metrics, backend=backend, ) diff --git a/fiftyone/utils/eval/regression.py b/fiftyone/utils/eval/regression.py index 09663ab6af..0db937179e 100644 --- a/fiftyone/utils/eval/regression.py +++ b/fiftyone/utils/eval/regression.py @@ -18,13 +18,18 @@ import eta.core.utils as etau import fiftyone as fo -import fiftyone.core.evaluation as foe import fiftyone.core.fields as fof import fiftyone.core.labels as fol import fiftyone.core.plots as fop import fiftyone.core.utils as fou import fiftyone.core.validation as fov +from .base import ( + BaseEvaluationMethodConfig, + BaseEvaluationMethod, + BaseEvaluationResults, +) + logger = logging.getLogger(__name__) @@ -36,6 +41,7 @@ def evaluate_regressions( eval_key=None, missing=None, method=None, + custom_metrics=None, progress=None, **kwargs, ): @@ -75,6 +81,8 @@ def evaluate_regressions( supported values are ``fo.evaluation_config.regression_backends.keys()`` and the default is ``fo.evaluation_config.default_regression_backend`` + custom_metrics (None): an optional list of custom metrics to compute + or dict mapping metric names to kwargs dicts progress (None): whether to render a progress bar (True/False), use the default value ``fiftyone.config.show_progress_bars`` (None), or a progress callback function to invoke instead @@ -89,7 +97,13 @@ def evaluate_regressions( samples, (pred_field, gt_field), fol.Regression, same_type=True ) - config = _parse_config(pred_field, gt_field, method, **kwargs) + config = _parse_config( + pred_field, + gt_field, + method, + custom_metrics=custom_metrics, + **kwargs, + ) eval_method = config.build() eval_method.ensure_requirements() @@ -99,12 +113,13 @@ def evaluate_regressions( results = eval_method.evaluate_samples( samples, eval_key=eval_key, missing=missing, progress=progress ) + eval_method.compute_custom_metrics(samples, eval_key, results) eval_method.save_run_results(samples, eval_key, results) return results -class RegressionEvaluationConfig(foe.EvaluationMethodConfig): +class RegressionEvaluationConfig(BaseEvaluationMethodConfig): """Base class for configuring :class:`RegressionEvaluation` instances. Args: @@ -112,19 +127,22 @@ class RegressionEvaluationConfig(foe.EvaluationMethodConfig): :class:`fiftyone.core.labels.Regression` instances gt_field ("ground_truth"): the name of the field containing the ground truth :class:`fiftyone.core.labels.Regression` instances + custom_metrics (None): an optional list of custom metrics to compute + or dict mapping metric names to kwargs dicts """ - def __init__(self, pred_field, gt_field, **kwargs): + def __init__(self, pred_field, gt_field, custom_metrics=None, **kwargs): super().__init__(**kwargs) self.pred_field = pred_field self.gt_field = gt_field + self.custom_metrics = custom_metrics @property def type(self): return "regression" -class RegressionEvaluation(foe.EvaluationMethod): +class RegressionEvaluation(BaseEvaluationMethod): """Base class for regression evaluation methods. 
Args: @@ -181,6 +199,8 @@ def get_fields(self, samples, eval_key): prefix = samples._FRAMES_PREFIX + eval_key fields.append(prefix) + fields.extend(self.get_custom_metric_fields(samples, eval_key)) + return fields def rename(self, samples, eval_key, new_eval_key): @@ -202,6 +222,8 @@ def rename(self, samples, eval_key, new_eval_key): fields = dict(zip(in_frame_fields, out_frame_fields)) dataset.rename_frame_fields(fields) + self.rename_custom_metrics(samples, eval_key, new_eval_key) + def cleanup(self, samples, eval_key): dataset = samples._dataset @@ -211,6 +233,8 @@ def cleanup(self, samples, eval_key): if dataset._is_frame_field(self.config.gt_field): dataset.delete_frame_fields(fields, error_level=1) + self.cleanup_custom_metrics(samples, eval_key) + def _validate_run(self, samples, eval_key, existing_info): self._validate_fields_match(eval_key, "pred_field", existing_info) self._validate_fields_match(eval_key, "gt_field", existing_info) @@ -228,10 +252,21 @@ class SimpleEvaluationConfig(RegressionEvaluationConfig): sample/frame-level error data. Supported values are ``("squared_error", "absolute_error")`` or any function that accepts two scalar arguments ``(ypred, ytrue)`` + custom_metrics (None): an optional list of custom metrics to compute + or dict mapping metric names to kwargs dicts """ - def __init__(self, pred_field, gt_field, metric="squared_error", **kwargs): - super().__init__(pred_field, gt_field, **kwargs) + def __init__( + self, + pred_field, + gt_field, + metric="squared_error", + custom_metrics=None, + **kwargs, + ): + super().__init__( + pred_field, gt_field, custom_metrics=custom_metrics, **kwargs + ) self._metric = metric @property @@ -344,7 +379,7 @@ def compute_error(yp, yt): return results -class RegressionResults(foe.EvaluationResults): +class RegressionResults(BaseEvaluationResults): """Class that stores the results of a regression evaluation. Args: @@ -361,6 +396,7 @@ class RegressionResults(foe.EvaluationResults): regressions missing (None): a missing value. Any None-valued regressions are given this value for results purposes + custom_metrics (None): an optional dict of custom metrics backend (None): a :class:`RegressionEvaluation` backend """ @@ -374,6 +410,7 @@ def __init__( confs=None, ids=None, missing=None, + custom_metrics=None, backend=None, ): super().__init__(samples, config, eval_key, backend=backend) @@ -387,6 +424,7 @@ def __init__( self.confs = confs self.ids = ids self.missing = missing + self.custom_metrics = custom_metrics def metrics(self, weights=None): """Computes various popular regression metrics for the results. 
@@ -517,6 +555,7 @@ def _from_dict(cls, d, samples, config, eval_key, **kwargs): confs = d.get("confs", None) ids = d.get("ids", None) missing = d.get("missing", None) + custom_metrics = d.get("custom_metrics", None) return cls( samples, config, @@ -526,6 +565,7 @@ def _from_dict(cls, d, samples, config, eval_key, **kwargs): confs=confs, ids=ids, missing=missing, + custom_metrics=custom_metrics, **kwargs, ) @@ -534,6 +574,10 @@ def _parse_config(pred_field, gt_field, method, **kwargs): if method is None: method = fo.evaluation_config.default_regression_backend + custom_metrics = kwargs.get("custom_metrics", None) + if etau.is_str(custom_metrics): + kwargs["custom_metrics"] = [custom_metrics] + if inspect.isclass(method): return method(pred_field, gt_field, **kwargs) diff --git a/fiftyone/utils/eval/segmentation.py b/fiftyone/utils/eval/segmentation.py index 6e15be95c1..2bd5c5770f 100644 --- a/fiftyone/utils/eval/segmentation.py +++ b/fiftyone/utils/eval/segmentation.py @@ -17,13 +17,16 @@ import eta.core.utils as etau import fiftyone as fo -import fiftyone.core.evaluation as foe import fiftyone.core.fields as fof import fiftyone.core.labels as fol import fiftyone.core.utils as fou import fiftyone.core.validation as fov -from .base import BaseEvaluationResults +from .base import ( + BaseEvaluationMethod, + BaseEvaluationMethodConfig, + BaseClassificationResults, +) logger = logging.getLogger(__name__) @@ -36,6 +39,7 @@ def evaluate_segmentations( eval_key=None, mask_targets=None, method=None, + custom_metrics=None, progress=None, **kwargs, ): @@ -87,6 +91,8 @@ def evaluate_segmentations( supported values are ``fo.evaluation_config.segmentation_backends.keys()`` and the default is ``fo.evaluation_config.default_segmentation_backend`` + custom_metrics (None): an optional list of custom metrics to compute + or dict mapping metric names to kwargs dicts progress (None): whether to render a progress bar (True/False), use the default value ``fiftyone.config.show_progress_bars`` (None), or a progress callback function to invoke instead @@ -101,7 +107,13 @@ def evaluate_segmentations( samples, (pred_field, gt_field), fol.Segmentation, same_type=True ) - config = _parse_config(pred_field, gt_field, method, **kwargs) + config = _parse_config( + pred_field, + gt_field, + method, + custom_metrics=custom_metrics, + **kwargs, + ) eval_method = config.build() eval_method.ensure_requirements() @@ -114,12 +126,13 @@ def evaluate_segmentations( mask_targets=mask_targets, progress=progress, ) + eval_method.compute_custom_metrics(samples, eval_key, results) eval_method.save_run_results(samples, eval_key, results) return results -class SegmentationEvaluationConfig(foe.EvaluationMethodConfig): +class SegmentationEvaluationConfig(BaseEvaluationMethodConfig): """Base class for configuring :class:`SegmentationEvaluation` instances. 
Args: @@ -129,20 +142,30 @@ class SegmentationEvaluationConfig(foe.EvaluationMethodConfig): :class:`fiftyone.core.labels.Segmentation` instances compute_dice (False): whether to compute the Dice coefficient for each sample + custom_metrics (None): an optional list of custom metrics to compute + or dict mapping metric names to kwargs dicts """ - def __init__(self, pred_field, gt_field, compute_dice=False, **kwargs): + def __init__( + self, + pred_field, + gt_field, + compute_dice=False, + custom_metrics=None, + **kwargs, + ): super().__init__(**kwargs) self.pred_field = pred_field self.gt_field = gt_field self.compute_dice = compute_dice + self.custom_metrics = custom_metrics @property def type(self): return "segmentation" -class SegmentationEvaluation(foe.EvaluationMethod): +class SegmentationEvaluation(BaseEvaluationMethod): """Base class for segmentation evaluation methods. Args: @@ -232,6 +255,8 @@ def get_fields(self, samples, eval_key): if self.config.compute_dice: fields.append("%s_dice" % prefix) + fields.extend(self.get_custom_metric_fields(samples, eval_key)) + return fields def rename(self, samples, eval_key, new_eval_key): @@ -253,6 +278,8 @@ def rename(self, samples, eval_key, new_eval_key): fields = dict(zip(in_frame_fields, out_frame_fields)) dataset.rename_frame_fields(fields) + self.rename_custom_metrics(samples, eval_key, new_eval_key) + def cleanup(self, samples, eval_key): dataset = samples._dataset processing_frames = samples._is_frame_field(self.config.gt_field) @@ -271,6 +298,8 @@ def cleanup(self, samples, eval_key): if processing_frames: dataset.delete_frame_fields(fields, error_level=1) + self.cleanup_custom_metrics(samples, eval_key) + def _validate_run(self, samples, eval_key, existing_info): self._validate_fields_match(eval_key, "pred_field", existing_info) self._validate_fields_match(eval_key, "gt_field", existing_info) @@ -292,6 +321,8 @@ class SimpleEvaluationConfig(SegmentationEvaluationConfig): default, the entire masks are evaluated average ("micro"): the averaging strategy to use when populating precision and recall numbers on each sample + custom_metrics (None): an optional list of custom metrics to compute + or dict mapping metric names to kwargs dicts """ def __init__( @@ -301,10 +332,15 @@ def __init__( compute_dice=False, bandwidth=None, average="micro", + custom_metrics=None, **kwargs, ): super().__init__( - pred_field, gt_field, compute_dice=compute_dice, **kwargs + pred_field, + gt_field, + compute_dice=compute_dice, + custom_metrics=custom_metrics, + **kwargs, ) self.bandwidth = bandwidth self.average = average @@ -429,7 +465,7 @@ def evaluate_samples( ) -class SegmentationResults(BaseEvaluationResults): +class SegmentationResults(BaseClassificationResults): """Class that stores the results of a segmentation evaluation. 
Args: @@ -439,6 +475,7 @@ class SegmentationResults(BaseEvaluationResults): pixel_confusion_matrix: a pixel value confusion matrix classes: a list of class labels corresponding to the confusion matrix missing (None): a missing (background) class + custom_metrics (None): an optional dict of custom metrics backend (None): a :class:`SegmentationEvaluation` backend """ @@ -450,6 +487,7 @@ def __init__( pixel_confusion_matrix, classes, missing=None, + custom_metrics=None, backend=None, ): pixel_confusion_matrix = np.asarray(pixel_confusion_matrix) @@ -466,13 +504,20 @@ def __init__( weights=weights, classes=classes, missing=missing, + custom_metrics=custom_metrics, backend=backend, ) self.pixel_confusion_matrix = pixel_confusion_matrix def attributes(self): - return ["cls", "pixel_confusion_matrix", "classes", "missing"] + return [ + "cls", + "pixel_confusion_matrix", + "classes", + "missing", + "custom_metrics", + ] def dice_score(self): """Computes the Dice score across all samples in the evaluation. @@ -491,6 +536,7 @@ def _from_dict(cls, d, samples, config, eval_key, **kwargs): d["pixel_confusion_matrix"], d["classes"], missing=d.get("missing", None), + custom_metrics=d.get("custom_metrics", None), **kwargs, ) @@ -515,6 +561,10 @@ def _parse_config(pred_field, gt_field, method, **kwargs): if method is None: method = fo.evaluation_config.default_segmentation_backend + custom_metrics = kwargs.get("custom_metrics", None) + if etau.is_str(custom_metrics): + kwargs["custom_metrics"] = [custom_metrics] + if inspect.isclass(method): return method(pred_field, gt_field, **kwargs)
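
Usage sketch: a minimal example of the ``custom_metrics`` option that this patch threads through the evaluation configs, assuming FiftyOne with this patch applied and a metric operator registered under the hypothetical URI ``@my-org/my-plugin/my_metric`` that implements the ``compute()``/``get_fields()``/``rename()``/``cleanup()`` interface consumed by ``BaseEvaluationMethod`` in ``fiftyone/utils/eval/base.py``::

    import fiftyone.zoo as foz

    # any dataset with ground truth and predicted detections works;
    # the zoo quickstart dataset is used here purely for illustration
    dataset = foz.load_zoo_dataset("quickstart")

    # custom metrics may be passed as a list of operator URIs...
    results = dataset.evaluate_detections(
        "predictions",
        gt_field="ground_truth",
        eval_key="eval",
        custom_metrics=["@my-org/my-plugin/my_metric"],  # hypothetical URI
    )

    # ...or as a dict mapping operator URIs to kwargs dicts for each metric
    results = dataset.evaluate_detections(
        "predictions",
        gt_field="ground_truth",
        eval_key="eval_custom",
        custom_metrics={
            "@my-org/my-plugin/my_metric": {"beta": 2.0},  # hypothetical kwargs
        },
    )

    # values returned by each metric's compute() are stored on the results
    print(results.custom_metrics)

The same keyword is accepted by ``evaluate_classifications()``, ``evaluate_regressions()``, and ``evaluate_segmentations()``, and each backend's ``_parse_config()`` normalizes a bare string to a one-element list.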