Skip to content

Commit

Permalink
Merge remote-tracking branch 'dask/main' into 734-pca-skip-centering
Browse files Browse the repository at this point in the history
  • Loading branch information
hristog committed Apr 15, 2021
2 parents 4c82add + db2e7d5 commit d762673
Show file tree
Hide file tree
Showing 7 changed files with 178 additions and 25 deletions.
39 changes: 31 additions & 8 deletions README.rst
Original file line number Diff line number Diff line change
@@ -1,12 +1,35 @@
dask-ml
Dask-ML
=======

``dask-ml`` is a library for distributed and parallel machine learning using `dask`_.
See the `documentation`_ for more.
|Build Status| |Azure Pipelines| |Coverage| |Doc Status| |Gitter| |Version Status| |NumFOCUS|

.. image:: https://dev.azure.com/dask-dev/dask/_apis/build/status/dask.dask-ml?branchName=main
:target: https://dev.azure.com/dask-dev/dask/_build/latest?definitionId=1&branchName=main
:alt: CI Status
Dask-ML provides scalable machine learning in Python using `Dask <https://dask.org/>`__ alongside popular machine learning libraries like `Scikit-Learn <http://scikit-learn.org/>`__, `XGBoost <https://ml.dask.org/xgboost.html>`__, and others.

You can try Dask-ML on a small cloud instance by clicking the following button:

.. image:: https://mybinder.org/badge.svg
:target: https://mybinder.org/v2/gh/dask/dask-examples/main?filepath=machine-learning.ipynb

.. _dask: https://dask.org
.. _documentation: http://ml.dask.org
LICENSE
-------

New BSD. See `License File <https://github.com/dask/dask-ml/blob/main/LICENSE.txt>`__.

.. _documentation: https://dask.org
.. |Build Status| image:: https://github.com/dask/dask-ml/workflows/CI/badge.svg?branch=main
:target: https://github.com/dask/dask-ml/actions?query=workflow%3A%22CI%22
.. |Azure Pipelines| image:: https://dev.azure.com/dask-dev/dask/_apis/build/status/dask.dask-ml?branchName=main
:target: https://dev.azure.com/dask-dev/dask/_build/latest?definitionId=1&branchName=main
.. |Coverage| image:: https://codecov.io/gh/dask/dask-ml/branch/main/graph/badge.svg
:target: https://codecov.io/gh/dask/dask-ml/branch/main
:alt: Coverage status
.. |Doc Status| image:: https://readthedocs.org/projects/ml/badge/?version=latest
:target: https://ml.dask.org/
:alt: Documentation Status
.. |Gitter| image:: https://badges.gitter.im/Join%20Chat.svg
:alt: Join the chat at https://gitter.im/dask/dask
:target: https://gitter.im/dask/dask?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge
.. |Version Status| image:: https://img.shields.io/pypi/v/dask-ml.svg
:target: https://pypi.python.org/pypi/dask-ml/
.. |NumFOCUS| image:: https://img.shields.io/badge/powered%20by-NumFOCUS-orange.svg?style=flat&colorA=E1523D&colorB=007D8A
:target: https://www.numfocus.org/
6 changes: 3 additions & 3 deletions ci/posix.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,13 @@ jobs:
matrix:
linux37:
envFile: 'ci/environment-3.7.yaml'
SKLARN_DEV: "no"
SKLEARN_DEV: "no"
linux38:
envFile: 'ci/environment-3.8.yaml'
SKLARN_DEV: "no"
SKLEARN_DEV: "no"
earliest:
envFile: 'ci/environment-3.6.yaml'
SKLARN_DEV: "no"
SKLEARN_DEV: "no"
sklearnDev:
envFile: 'ci/environment-3.7.yaml'
SKLEARN_DEV: "yes"
Expand Down
1 change: 1 addition & 0 deletions dask_ml/metrics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
)
from .regression import ( # noqa
mean_absolute_error,
mean_absolute_percentage_error,
mean_squared_error,
mean_squared_log_error,
r2_score,
Expand Down
87 changes: 75 additions & 12 deletions dask_ml/metrics/regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def _check_sample_weight(sample_weight: Optional[ArrayLike]):
def _check_reg_targets(
y_true: ArrayLike, y_pred: ArrayLike, multioutput: Optional[str]
):
if multioutput != "uniform_average":
if multioutput is not None and multioutput != "uniform_average":
raise NotImplementedError("'multioutput' must be 'uniform_average'")

if y_true.ndim == 1:
Expand All @@ -40,12 +40,12 @@ def mean_squared_error(
_check_sample_weight(sample_weight)
output_errors = ((y_pred - y_true) ** 2).mean(axis=0)

if isinstance(multioutput, str):
if isinstance(multioutput, str) or multioutput is None:
if multioutput == "raw_values":
return output_errors
elif multioutput == "uniform_average":
# pass None as weights to np.average: uniform mean
multioutput = None
if compute:
return output_errors.compute()
else:
return output_errors
else:
raise ValueError("Weighted 'multioutput' not supported.")
result = output_errors.mean()
Expand All @@ -67,12 +67,75 @@ def mean_absolute_error(
_check_sample_weight(sample_weight)
output_errors = abs(y_pred - y_true).mean(axis=0)

if isinstance(multioutput, str):
if isinstance(multioutput, str) or multioutput is None:
if multioutput == "raw_values":
return output_errors
elif multioutput == "uniform_average":
# pass None as weights to np.average: uniform mean
multioutput = None
if compute:
return output_errors.compute()
else:
return output_errors
else:
raise ValueError("Weighted 'multioutput' not supported.")
result = output_errors.mean()
if compute:
result = result.compute()
return result


def mean_absolute_percentage_error(
y_true: ArrayLike,
y_pred: ArrayLike,
sample_weight: Optional[ArrayLike] = None,
multioutput: Optional[str] = "uniform_average",
compute: bool = True,
) -> ArrayLike:
"""Mean absolute percentage error regression loss.
Note here that we do not represent the output as a percentage in range
[0, 100]. Instead, we represent it in range [0, 1/eps]. Read more in
https://scikit-learn.org/stable/modules/model_evaluation.html#mean-absolute-percentage-error
Parameters
----------
y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)
Ground truth (correct) target values.
y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)
Estimated target values.
sample_weight : array-like of shape (n_samples,), default=None
Sample weights.
multioutput : {'raw_values', 'uniform_average'} or array-like
Defines aggregating of multiple output values.
Array-like value defines weights used to average errors.
If input is list then the shape must be (n_outputs,).
'raw_values' :
Returns a full set of errors in case of multioutput input.
'uniform_average' :
Errors of all outputs are averaged with uniform weight.
compute : bool
Whether to compute this result (default ``True``)
Returns
-------
loss : float or array-like of floats in the range [0, 1/eps]
If multioutput is 'raw_values', then mean absolute percentage error
is returned for each output separately.
If multioutput is 'uniform_average' or ``None``, then the
equally-weighted average of all output errors is returned.
MAPE output is non-negative floating point. The best value is 0.0.
But note the fact that bad predictions can lead to arbitrarily large
MAPE values, especially if some y_true values are very close to zero.
Note that we return a large value instead of `inf` when y_true is zero.
"""
_check_sample_weight(sample_weight)
epsilon = np.finfo(np.float64).eps
mape = abs(y_pred - y_true) / da.maximum(y_true, epsilon)
output_errors = mape.mean(axis=0)

if isinstance(multioutput, str) or multioutput is None:
if multioutput == "raw_values":
if compute:
return output_errors.compute()
else:
return output_errors
else:
raise ValueError("Weighted 'multioutput' not supported.")
result = output_errors.mean()
Expand All @@ -90,7 +153,7 @@ def r2_score(
compute: bool = True,
) -> ArrayLike:
_check_sample_weight(sample_weight)
_, y_true, y_pred, multioutput = _check_reg_targets(y_true, y_pred, multioutput)
_, y_true, y_pred, _ = _check_reg_targets(y_true, y_pred, multioutput)
weight = 1.0

numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0, dtype="f8")
Expand Down
3 changes: 2 additions & 1 deletion docs/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ Dask-ML
=======

Dask-ML provides scalable machine learning in Python using Dask_ alongside
popular machine learning libraries like Scikit-Learn_, XGBoost, and others.
popular machine learning libraries like Scikit-Learn_, XGBoost_, and others.

You can try Dask-ML on a small cloud instance by clicking the following button:

Expand Down Expand Up @@ -132,3 +132,4 @@ See :doc:`Dask-ML + XGBoost <xgboost>` for more information.

.. _Dask: https://dask.org/
.. _Scikit-Learn: http://scikit-learn.org/
.. _XGBoost: https://ml.dask.org/xgboost.html
1 change: 1 addition & 0 deletions docs/source/modules/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,7 @@ Regression Metrics
:toctree: generated/

metrics.mean_absolute_error
metrics.mean_absolute_percentage_error
metrics.mean_squared_error
metrics.mean_squared_log_error
metrics.r2_score
Expand Down
66 changes: 65 additions & 1 deletion tests/metrics/test_regression.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,32 @@
import numbers

import dask.array as da
import numpy as np
import pytest
import sklearn.metrics
from dask.array.utils import assert_eq

import dask_ml.metrics
from dask_ml._compat import SK_024

_METRICS_TO_TEST = [
"mean_squared_error",
"mean_absolute_error",
"r2_score",
]

@pytest.fixture(params=["mean_squared_error", "mean_absolute_error", "r2_score"])
# mean_absolute_percentage_error() was added in scikit-learn 0.24.0
if SK_024:
_METRICS_TO_TEST.append("mean_absolute_percentage_error")


@pytest.fixture(params=_METRICS_TO_TEST)
def metric_pairs(request):
"""Pairs of (dask-ml, sklearn) regression metrics.
* mean_squared_error
* mean_absolute_error
* mean_absolute_percentage_error (if scikit-learn >= 0.24.0)
* r2_score
"""
return (
Expand Down Expand Up @@ -60,3 +74,53 @@ def test_mean_squared_log_error():
result = m1(a, b)
expected = m2(a, b)
assert abs(result - expected) < 1e-5


@pytest.mark.parametrize("multioutput", ["uniform_average", None])
def test_regression_metrics_unweighted_average_multioutput(metric_pairs, multioutput):
    """Compare each metric pair on 1-D inputs for unweighted averaging.

    Both members of ``metric_pairs`` are called with the same random dask
    arrays and the same ``multioutput`` value (``"uniform_average"`` or
    ``None``); the two results must agree to within ``1e-5``.
    """
    dask_metric, reference_metric = metric_pairs

    # Two independent 1-D random vectors, split into four chunks of 25.
    y_true = da.random.uniform(size=(100,), chunks=(25,))
    y_pred = da.random.uniform(size=(100,), chunks=(25,))

    actual = dask_metric(y_true, y_pred, multioutput=multioutput)
    wanted = reference_metric(y_true, y_pred, multioutput=multioutput)

    assert abs(actual - wanted) < 1e-5


@pytest.mark.parametrize("compute", [True, False])
def test_regression_metrics_raw_values(metric_pairs, compute):
    """Check ``multioutput="raw_values"`` on three-column inputs.

    The first metric of the pair is called with ``compute`` forwarded; the
    second is called without it and serves as the expected value. With
    ``compute=True`` the result must be a NumPy array, otherwise a dask
    array. In both cases it must equal the expected value and have shape
    ``(3,)``. The ``r2_score`` pair is skipped.
    """
    dask_metric, reference_metric = metric_pairs

    if dask_metric.__name__ == "r2_score":
        pytest.skip("r2_score does not support multioutput='raw_values'")

    # 100 samples x 3 outputs, chunked along the sample axis only.
    y_true = da.random.uniform(size=(100, 3), chunks=(25, 3))
    y_pred = da.random.uniform(size=(100, 3), chunks=(25, 3))

    actual = dask_metric(y_true, y_pred, multioutput="raw_values", compute=compute)
    wanted = reference_metric(y_true, y_pred, multioutput="raw_values")

    # compute=True yields a concrete array; compute=False keeps it lazy.
    expected_type = np.ndarray if compute else da.Array
    assert isinstance(actual, expected_type)

    assert_eq(actual, wanted)
    assert actual.shape == (3,)


def test_regression_metrics_do_not_support_weighted_multioutput(metric_pairs):
    """Passing an array of output weights as ``multioutput`` must raise.

    Only the first metric of the pair is exercised. The expected message
    depends on the metric: ``r2_score`` has its own wording, every other
    metric uses the "Weighted 'multioutput' not supported." message.
    """
    dask_metric, _unused = metric_pairs

    y_true = da.random.uniform(size=(100, 3), chunks=(25, 3))
    y_pred = da.random.uniform(size=(100, 3), chunks=(25, 3))
    # One weight per output column.
    weights = da.random.uniform(size=(3,))

    error_msg = (
        "'multioutput' must be 'uniform_average'"
        if dask_metric.__name__ == "r2_score"
        else "Weighted 'multioutput' not supported."
    )

    with pytest.raises((NotImplementedError, ValueError), match=error_msg):
        dask_metric(y_true, y_pred, multioutput=weights)

0 comments on commit d762673

Please sign in to comment.