
Dawid-Sebastiani Score #75

Draft · wants to merge 1 commit into main
2 changes: 2 additions & 0 deletions scoringrules/__init__.py
@@ -43,6 +43,7 @@
    twenergy_score,
    vrenergy_score,
)
from scoringrules._dawid_sebastiani import dawid_sebastiani_score
from scoringrules._error_spread import error_spread_score
from scoringrules._interval import interval_score, weighted_interval_score
from scoringrules._logs import (
@@ -113,6 +114,7 @@
"crps_quantile",
"crps_t",
"crps_uniform",
"dawid_sebastiani_score",
"owcrps_ensemble",
"twcrps_ensemble",
"vrcrps_ensemble",
49 changes: 49 additions & 0 deletions scoringrules/_dawid_sebastiani.py
@@ -0,0 +1,49 @@
import typing as tp

from scoringrules.core import dss
from scoringrules.core.utils import multivariate_array_check

if tp.TYPE_CHECKING:
    from scoringrules.core.typing import Array, Backend


def dawid_sebastiani_score(
    observations: "Array",
    forecasts: "Array",
    /,
    m_axis: int = -2,
    v_axis: int = -1,
    *,
    backend: "Backend" = None,
) -> "Array":
    r"""Compute the Dawid-Sebastiani score for a finite multivariate ensemble.

    The score is defined as

    $$\mathrm{DSS}(F, \mathbf{y}) = \log \det \Sigma_F + (\mathbf{y} - \mu_F)^{\top} \Sigma_F^{-1} (\mathbf{y} - \mu_F),$$

    where $\mu_F$ and $\Sigma_F$ are the mean vector and covariance matrix of the
    ensemble forecast $F$, and $\mathbf{y}$ is the observation.

    Parameters
    ----------
    observations: Array
        The observed values, where the variables dimension is by default the last axis.
    forecasts: Array
        The predicted forecast ensemble, where the ensemble dimension is by default
        represented by the second-to-last axis and the variables dimension by the last axis.
    m_axis: int
        The axis corresponding to the ensemble dimension. Defaults to -2.
    v_axis: int
        The axis corresponding to the variables dimension. Defaults to -1.
    backend: str
        The name of the backend used for computations. Defaults to 'numba' if available, else 'numpy'.

    Returns
    -------
    ds_score: Array
        The computed Dawid-Sebastiani score.
    """
    observations, forecasts = multivariate_array_check(
        observations, forecasts, m_axis, v_axis, backend=backend
    )

    if backend == "numba":
        return dss._dss_gufunc(observations, forecasts)

    return dss._dss(observations, forecasts, backend=backend)
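
A minimal usage sketch (not part of the diff; shapes and values are arbitrary), assuming the default axis convention:

import numpy as np
import scoringrules as sr

obs = np.random.randn(5, 3)      # 5 observations of D=3 variables
fct = np.random.randn(5, 10, 3)  # matching ensembles with M=10 members each

scores = sr.dawid_sebastiani_score(obs, fct)
print(scores.shape)  # (5,)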
14 changes: 14 additions & 0 deletions scoringrules/backend/numpy.py
Owner:
I suppose this would look more or less the same for the other backends?

Contributor Author:
Pretty much, I guess. I don't have much experience with tf/torch though, so I'm not sure yet about the exact implementation.

Collaborator:
The det and inv functions should be easy/similar to define in the other backends.

For cov:

  • jax should be almost exactly the same as numpy.
  • torch would need "transpose" to be replaced with "permute".
  • tensorflow also uses "transpose" but does not have a "cov" function, so this would need to be adapted.

Although, as mentioned in a separate comment, the transpose function currently assumes we only have 3 dimensions.
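
For illustration, a hypothetical sketch of cov for the torch backend (not part of this diff), following the points above; note that torch's two-argument Tensor.transpose(dim0, dim1) swaps a single pair of axes, so it can play the role of numpy's multi-axis transpose/permute here:

import torch

class TorchBackendSketch:
    # Hypothetical: mirrors the NumpyBackend.cov method below for torch tensors.
    def cov(self, x: torch.Tensor) -> torch.Tensor:
        # Assumes, as in numpy.py, that the ensemble axis is the second-to-last
        # one and that rows along that axis are ensemble members.
        centered = x - x.mean(dim=-2, keepdim=True)
        return (centered.transpose(-2, -1) @ centered) / (x.shape[-2] - 1)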

@@ -224,6 +224,20 @@ def where(self, condition: "NDArray", x1: "NDArray", x2: "NDArray") -> "NDArray"
    def size(self, x: "NDArray") -> int:
        return x.size

    def inv(self, x: "NDArray") -> "NDArray":
        return np.linalg.inv(x)

    def cov(self, x: "NDArray") -> "NDArray":
        # TODO: This deserves some axis parameter probably?
        if len(x.shape) == 2:
            # rows are ensemble members, columns are variables
            return np.cov(x, rowvar=False)
        else:
            centered = x - x.mean(axis=-2, keepdims=True)
            return (centered.transpose(0, 2, 1) @ centered) / (x.shape[-2] - 1)
Collaborator:
centered.transpose(0, 2, 1) assumes the array is 3D. Is there a way to generalise this to calculate the covariance matrix for higher dimensions?

Contributor Author (@simon-hirsch, Dec 25, 2024):
Merry Christmas :)
If we assume that the square matrix always sits in the last two dimensions (as numpy does for batched operations such as np.linalg.inv()), something like centered.swapaxes(-1, -2) should do the trick. Is that convention fine for you @sallen12?
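
A minimal sketch of that generalisation (hypothetical, not part of this diff), assuming the ensemble axis is always second-to-last:

import numpy as np
from numpy.typing import NDArray

class NumpyBackendSketch:
    # Hypothetical generalisation of NumpyBackend.cov.
    def cov(self, x: NDArray) -> NDArray:
        # swapaxes(-1, -2) only touches the last two axes, so this works for
        # any number of leading batch dimensions (numpy's batched-linalg convention).
        centered = x - x.mean(axis=-2, keepdims=True)
        return (centered.swapaxes(-1, -2) @ centered) / (x.shape[-2] - 1)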


    def det(self, x: "NDArray") -> "NDArray":
        return np.linalg.det(x)


class NumbaBackend(NumpyBackend):
"""Numba backend."""
10 changes: 10 additions & 0 deletions scoringrules/core/dss/__init__.py
@@ -0,0 +1,10 @@
try:
    from ._gufuncs import (
        _dss_gufunc,
    )
except ImportError:
    _dss_gufunc = None

from ._score import _ds_score as _dss

__all__ = ["_dss_gufunc", "_dss"]
19 changes: 19 additions & 0 deletions scoringrules/core/dss/_gufuncs.py
Owner:
I'd be curious to see the performance benefit here compared to plain numpy. Usually numba doesn't optimize much when using a lot of numpy operations.

Contributor Author:
Yes, and the DSS is also far less computationally intensive than e.g. the ES or VS, so even with a good relative speed improvement the user might not feel it.

@@ -0,0 +1,19 @@
import numpy as np
from numba import guvectorize


@guvectorize(
    [
        "void(float32[:], float32[:,:], float32[:])",
        "void(float64[:], float64[:,:], float64[:])",
    ],
    "(d),(m,d)->()",
)
def _dss_gufunc(obs: np.ndarray, fct: np.ndarray, out: np.ndarray):
    M = fct.shape[0]
    bias = obs - (np.sum(fct, axis=0) / M)
    cov = np.cov(fct, rowvar=False).astype(bias.dtype)
    prec = np.linalg.inv(cov).astype(bias.dtype)
    log_det = np.log(np.linalg.det(cov))
    bias_precision = bias.T @ prec @ bias
    out[0] = log_det + bias_precision
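
To quantify the performance question raised above, a rough benchmark sketch (not part of the diff): it assumes numba is installed and the numpy backend is active, and timings will vary with machine, batch size, M, and D.

import numpy as np
from timeit import timeit

from scoringrules.core.dss import _dss, _dss_gufunc

obs = np.random.randn(1000, 5)      # (batch, D)
fct = np.random.randn(1000, 20, 5)  # (batch, M, D)

_dss_gufunc(obs, fct)  # warm-up call so numba compilation is not timed
print("numpy:", timeit(lambda: _dss(obs, fct), number=50))
print("numba:", timeit(lambda: _dss_gufunc(obs, fct), number=50))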
21 changes: 21 additions & 0 deletions scoringrules/core/dss/_score.py
@@ -0,0 +1,21 @@
import typing as tp

from scoringrules.backend import backends

if tp.TYPE_CHECKING:
    from scoringrules.core.typing import Array, Backend


def _ds_score(
    obs: "Array",  # (... D)
    fct: "Array",  # (... M D)
    backend: "Backend" = None,
) -> "Array":
    """Compute the Dawid-Sebastiani score for a multivariate finite ensemble."""
    B = backends.active if backend is None else backends[backend]
    cov = B.cov(fct)
    precision = B.inv(cov)
    log_det = B.log(B.det(cov))
    bias = obs - B.mean(fct, axis=-2)
    bias_precision = B.squeeze(bias[:, None, :] @ precision @ bias[:, :, None])
    return log_det + bias_precision
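
As a sanity check (a sketch, not part of the diff; assumes the numpy backend is active), _ds_score can be compared against a direct per-sample evaluation of the formula:

import numpy as np
from scoringrules.core.dss import _dss

def dss_reference(obs, fct):
    # DSS = log det(cov) + bias' @ inv(cov) @ bias for a single sample
    bias = obs - fct.mean(axis=0)
    cov = np.cov(fct, rowvar=False)
    return np.log(np.linalg.det(cov)) + bias @ np.linalg.inv(cov) @ bias

obs = np.random.randn(4, 3)      # (batch, D)
fct = np.random.randn(4, 10, 3)  # (batch, M, D)

expected = np.array([dss_reference(o, f) for o, f in zip(obs, fct)])
np.testing.assert_allclose(_dss(obs, fct), expected)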