From 32fe2691f7eea7d2d2ed3bf3460965450f2ba256 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 23 Oct 2024 13:02:21 +0200 Subject: [PATCH 001/135] add finiteness_checker pybind11 bindings --- onedal/dal.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/onedal/dal.cpp b/onedal/dal.cpp index 814b22aa8b..14e0aed35d 100644 --- a/onedal/dal.cpp +++ b/onedal/dal.cpp @@ -75,6 +75,9 @@ namespace oneapi::dal::python { #if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240001 ONEDAL_PY_INIT_MODULE(logistic_regression); #endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240001 + #if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240700 + ONEDAL_PY_INIT_MODULE(finiteness_checker); + #endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240700 #endif // ONEDAL_DATA_PARALLEL_SPMD #ifdef ONEDAL_DATA_PARALLEL_SPMD @@ -133,6 +136,9 @@ namespace oneapi::dal::python { #if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240001 init_logistic_regression(m); #endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240001 + #if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240700 + init_finiteness_checker(m); + #endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240700 } #endif // ONEDAL_DATA_PARALLEL_SPMD From cdbf1b5e5bfdc8036beee80545ea11e553ceac99 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 23 Oct 2024 13:04:00 +0200 Subject: [PATCH 002/135] added finiteness checker --- onedal/primitives/finiteness_checker.cpp | 96 ++++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 onedal/primitives/finiteness_checker.cpp diff --git a/onedal/primitives/finiteness_checker.cpp b/onedal/primitives/finiteness_checker.cpp new file mode 100644 index 0000000000..6aaf7c52d6 --- /dev/null +++ b/onedal/primitives/finiteness_checker.cpp @@ -0,0 +1,96 @@ +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you 
may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "oneapi/dal/algo/finiteness_checker.hpp" + +#include "onedal/common.hpp" +#include "onedal/version.hpp" + +namespace py = pybind11; + +namespace oneapi::dal::python { + +template +struct method2t { + method2t(const Task& task, const Ops& ops) : ops(ops) {} + + template + auto operator()(const py::dict& params) { + using namespace finiteness_checker; + + const auto method = params["method"].cast(); + + ONEDAL_PARAM_DISPATCH_VALUE(method, "dense", ops, Float, method::dense); + ONEDAL_PARAM_DISPATCH_VALUE(method, "by_default", ops, Float, method::by_default); + ONEDAL_PARAM_DISPATCH_THROW_INVALID_VALUE(method); + } + + Ops ops; +}; + +struct params2desc { + template + auto operator()(const pybind11::dict& params) { + using namespace dal::finiteness_checker; + + auto desc = descriptor(); + desc.set_allow_NaN(params["allow_nan"].cast()); + return desc; + } +}; + +template +void init_compute_ops(py::module_& m) { + m.def("compute", + [](const Policy& policy, + const py::dict& params, + const table& data) { + using namespace finiteness_checker; + using input_t = compute_input; + + compute_ops ops(policy, input_t{ data}, params2desc{}); + return fptype2t{ method2t{ Task{}, ops } }(params); + }); +} + +template +void init_compute_result(py::module_& m) { + using namespace finiteness_checker; + using result_t = compute_result; + + py::class_(m, "compute_result") + .def(py::init()) + .DEF_ONEDAL_PY_PROPERTY(finite, 
result_t) +} + +ONEDAL_PY_TYPE2STR(finiteness_checker::task::compute, "compute"); + +ONEDAL_PY_DECLARE_INSTANTIATOR(init_compute_ops); +ONEDAL_PY_DECLARE_INSTANTIATOR(init_compute_result); + +ONEDAL_PY_INIT_MODULE(finiteness_checker) { + using namespace dal::detail; + using namespace finiteness_checker; + using namespace dal::finiteness; + + using task_list = types; + auto sub = m.def_submodule("finiteness_checker"); + + ONEDAL_PY_INSTANTIATE(init_compute_ops, sub, policy_list, task_list); + ONEDAL_PY_INSTANTIATE(init_compute_result, sub, task_list); +} + +} // namespace oneapi::dal::python From 62674a24547cf4f7771efbd48657666ed41a97fe Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Wed, 23 Oct 2024 13:37:53 +0200 Subject: [PATCH 003/135] Update finiteness_checker.cpp --- onedal/primitives/finiteness_checker.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onedal/primitives/finiteness_checker.cpp b/onedal/primitives/finiteness_checker.cpp index 6aaf7c52d6..51a3ef161a 100644 --- a/onedal/primitives/finiteness_checker.cpp +++ b/onedal/primitives/finiteness_checker.cpp @@ -14,7 +14,7 @@ * limitations under the License. *******************************************************************************/ -#include "oneapi/dal/algo/finiteness_checker.hpp" +#include "oneapi/dal/algo/finiteness_checker/compute.hpp" #include "onedal/common.hpp" #include "onedal/version.hpp" From c75c23b34e714ac22eace32d4a44ae5699286262 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Wed, 23 Oct 2024 13:46:49 +0200 Subject: [PATCH 004/135] Update finiteness_checker.cpp --- onedal/primitives/finiteness_checker.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/onedal/primitives/finiteness_checker.cpp b/onedal/primitives/finiteness_checker.cpp index 51a3ef161a..761ee28de9 100644 --- a/onedal/primitives/finiteness_checker.cpp +++ b/onedal/primitives/finiteness_checker.cpp @@ -14,7 +14,12 @@ * limitations under the License. 
*******************************************************************************/ +// fix error with missing headers +#if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20250200 +#include "oneapi/dal/algo/finiteness_checker.hpp +#else #include "oneapi/dal/algo/finiteness_checker/compute.hpp" +#endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20250200 #include "onedal/common.hpp" #include "onedal/version.hpp" From 6a20938aba804e69b09bf5d15c12f3128982df7d Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Wed, 23 Oct 2024 13:47:36 +0200 Subject: [PATCH 005/135] Update finiteness_checker.cpp --- onedal/primitives/finiteness_checker.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onedal/primitives/finiteness_checker.cpp b/onedal/primitives/finiteness_checker.cpp index 761ee28de9..531554f857 100644 --- a/onedal/primitives/finiteness_checker.cpp +++ b/onedal/primitives/finiteness_checker.cpp @@ -16,9 +16,9 @@ // fix error with missing headers #if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20250200 -#include "oneapi/dal/algo/finiteness_checker.hpp + #include "oneapi/dal/algo/finiteness_checker.hpp #else -#include "oneapi/dal/algo/finiteness_checker/compute.hpp" + #include "oneapi/dal/algo/finiteness_checker/compute.hpp" #endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20250200 #include "onedal/common.hpp" From 382d7a1268a4612f6eec162a30c02b18bcc0e041 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Wed, 23 Oct 2024 13:47:47 +0200 Subject: [PATCH 006/135] Update finiteness_checker.cpp --- onedal/primitives/finiteness_checker.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onedal/primitives/finiteness_checker.cpp b/onedal/primitives/finiteness_checker.cpp index 531554f857..ebc7bfd798 100644 --- a/onedal/primitives/finiteness_checker.cpp +++ b/onedal/primitives/finiteness_checker.cpp @@ -16,7 +16,7 @@ // fix error with missing headers #if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20250200 - #include 
"oneapi/dal/algo/finiteness_checker.hpp + #include "oneapi/dal/algo/finiteness_checker.hpp" #else #include "oneapi/dal/algo/finiteness_checker/compute.hpp" #endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20250200 From c8ffd9c0c2c9a132449020fa2ffc492b7c9bd1fb Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Wed, 23 Oct 2024 13:54:20 +0200 Subject: [PATCH 007/135] Update finiteness_checker.cpp --- onedal/primitives/finiteness_checker.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onedal/primitives/finiteness_checker.cpp b/onedal/primitives/finiteness_checker.cpp index ebc7bfd798..92a17a875d 100644 --- a/onedal/primitives/finiteness_checker.cpp +++ b/onedal/primitives/finiteness_checker.cpp @@ -52,7 +52,7 @@ struct params2desc { using namespace dal::finiteness_checker; auto desc = descriptor(); - desc.set_allow_NaN(params["allow_nan"].cast()); + desc.set_allow_NaN(params["allow_nan"].cast()); return desc; } }; From 9aa13d5e72340509c33986befce7ff5f3169a325 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Wed, 23 Oct 2024 13:58:13 +0200 Subject: [PATCH 008/135] Update finiteness_checker.cpp --- onedal/primitives/finiteness_checker.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onedal/primitives/finiteness_checker.cpp b/onedal/primitives/finiteness_checker.cpp index 92a17a875d..7189aec5d9 100644 --- a/onedal/primitives/finiteness_checker.cpp +++ b/onedal/primitives/finiteness_checker.cpp @@ -78,7 +78,7 @@ void init_compute_result(py::module_& m) { py::class_(m, "compute_result") .def(py::init()) - .DEF_ONEDAL_PY_PROPERTY(finite, result_t) + .DEF_ONEDAL_PY_PROPERTY(finite, result_t); } ONEDAL_PY_TYPE2STR(finiteness_checker::task::compute, "compute"); @@ -89,7 +89,7 @@ ONEDAL_PY_DECLARE_INSTANTIATOR(init_compute_result); ONEDAL_PY_INIT_MODULE(finiteness_checker) { using namespace dal::detail; using namespace finiteness_checker; - using namespace dal::finiteness; + using namespace dal::finiteness_checker; using task_list 
= types; auto sub = m.def_submodule("finiteness_checker"); From 84e15d598392ebf5da945468cd1cf110a25d3764 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Wed, 23 Oct 2024 14:21:02 +0200 Subject: [PATCH 009/135] Rename finiteness_checker.cpp to finiteness_checker.cpp --- onedal/{primitives => utils}/finiteness_checker.cpp | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename onedal/{primitives => utils}/finiteness_checker.cpp (100%) diff --git a/onedal/primitives/finiteness_checker.cpp b/onedal/utils/finiteness_checker.cpp similarity index 100% rename from onedal/primitives/finiteness_checker.cpp rename to onedal/utils/finiteness_checker.cpp From 63073c60d17c192781e30db5425eeee4832761d9 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Thu, 24 Oct 2024 10:58:08 +0200 Subject: [PATCH 010/135] Update finiteness_checker.cpp --- onedal/utils/finiteness_checker.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/onedal/utils/finiteness_checker.cpp b/onedal/utils/finiteness_checker.cpp index 7189aec5d9..6bc6a2e66b 100644 --- a/onedal/utils/finiteness_checker.cpp +++ b/onedal/utils/finiteness_checker.cpp @@ -94,8 +94,10 @@ ONEDAL_PY_INIT_MODULE(finiteness_checker) { using task_list = types; auto sub = m.def_submodule("finiteness_checker"); - ONEDAL_PY_INSTANTIATE(init_compute_ops, sub, policy_list, task_list); - ONEDAL_PY_INSTANTIATE(init_compute_result, sub, task_list); + #ifndef ONEDAL_DATA_PARALLEL_SPMD + ONEDAL_PY_INSTANTIATE(init_compute_ops, sub, policy_list, task_list); + ONEDAL_PY_INSTANTIATE(init_compute_result, sub, task_list); + #endif } } // namespace oneapi::dal::python From 3dddf2dc3469f197c7e539c73f407670173c9864 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Fri, 1 Nov 2024 00:30:15 +0100 Subject: [PATCH 011/135] add next step --- onedal/utils/validation.py | 41 +++++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/onedal/utils/validation.py b/onedal/utils/validation.py index 
bde2390e80..eb313cd980 100644 --- a/onedal/utils/validation.py +++ b/onedal/utils/validation.py @@ -20,6 +20,10 @@ import numpy as np from scipy import sparse as sp +from onedal import _backend +from ..common._policy import _get_policy +from ..datatypes import _convert_to_supported, to_table + if np.lib.NumpyVersion(np.__version__) >= np.lib.NumpyVersion("2.0.0a0"): # numpy_version >= 2.0 @@ -31,7 +35,9 @@ from sklearn.preprocessing import LabelEncoder from sklearn.utils.validation import check_array -from daal4py.sklearn.utils.validation import _assert_all_finite +from daal4py.sklearn.utils.validation import ( + _assert_all_finite as _daal4py_assert_all_finite, +) class DataConversionWarning(UserWarning): @@ -135,10 +141,10 @@ def _check_array( if force_all_finite: if sp.issparse(array): if hasattr(array, "data"): - _assert_all_finite(array.data) + _daal4py_assert_all_finite(array.data) force_all_finite = False else: - _assert_all_finite(array) + _daal4py_assert_all_finite(array) force_all_finite = False array = check_array( array=array, @@ -200,7 +206,7 @@ def _check_X_y( if y_numeric and y.dtype.kind == "O": y = y.astype(np.float64) if force_all_finite: - _assert_all_finite(y) + _daal4py_assert_all_finite(y) lengths = [X.shape[0], y.shape[0]] uniques = np.unique(lengths) @@ -285,7 +291,7 @@ def _type_of_target(y): # check float and contains non-integer float values if y.dtype.kind == "f" and np.any(y != y.astype(int)): # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.] 
- _assert_all_finite(y) + _daal4py_assert_all_finite(y) return "continuous" + suffix if (len(np.unique(y)) > 2) or (y.ndim >= 2 and len(y[0]) > 1): @@ -430,3 +436,28 @@ def _is_csr(x): return isinstance(x, sp.csr_matrix) or ( hasattr(sp, "csr_array") and isinstance(x, sp.csr_array) ) + + +def _assert_all_finite(X, allow_nan=False, input_name=""): + # NOTE: This function does not respond to target_offload, as the memory movement + # is likely to cause a significant reduction in performance + # requires extracting the queue to generate a policy for converting the data to fp32 + X = to_table(_convert_to_supported(_get_policy(None, X), X)) + if not _backend.finiteness_checker(allow_nan=allow_nan).compute(X).finite: + type_err = "infinity" if allow_nan else "NaN, infinity" + padded_input_name = input_name + " " if input_name else "" + msg_err = f"Input {padded_input_name}contains {type_err}." + raise ValueError(msg_err) + + +def assert_all_finite( + X, + *, + allow_nan=False, + input_name="", +): + _assert_all_finite( + X.data if sp.issparse(X) else X, + allow_nan=allow_nan, + input_name=input_name, + ) From 1e1213e60e2d52310b26625a1c749379affcd007 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Fri, 1 Nov 2024 00:37:07 +0100 Subject: [PATCH 012/135] follow conventions --- onedal/utils/validation.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/onedal/utils/validation.py b/onedal/utils/validation.py index eb313cd980..3a9d849486 100644 --- a/onedal/utils/validation.py +++ b/onedal/utils/validation.py @@ -442,8 +442,11 @@ def _assert_all_finite(X, allow_nan=False, input_name=""): # NOTE: This function does not respond to target_offload, as the memory movement # is likely to cause a significant reduction in performance # requires extracting the queue to generate a policy for converting the data to fp32 - X = to_table(_convert_to_supported(_get_policy(None, X), X)) - if not _backend.finiteness_checker(allow_nan=allow_nan).compute(X).finite: + 
policy = _get_policy(None, X) + X = to_table(_convert_to_supported(policy, X)) + if not _backend.finiteness_checker.compute( + policy, {"allow_nan": allow_nan}, X + ).finite: type_err = "infinity" if allow_nan else "NaN, infinity" padded_input_name = input_name + " " if input_name else "" msg_err = f"Input {padded_input_name}contains {type_err}." From 053171340099a68ced8fec11f79371f6bac253ef Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Fri, 1 Nov 2024 00:38:57 +0100 Subject: [PATCH 013/135] make xtable explicit --- onedal/utils/validation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onedal/utils/validation.py b/onedal/utils/validation.py index 3a9d849486..67c7a2dee0 100644 --- a/onedal/utils/validation.py +++ b/onedal/utils/validation.py @@ -443,9 +443,9 @@ def _assert_all_finite(X, allow_nan=False, input_name=""): # is likely to cause a significant reduction in performance # requires extracting the queue to generate a policy for converting the data to fp32 policy = _get_policy(None, X) - X = to_table(_convert_to_supported(policy, X)) + X_table = to_table(_convert_to_supported(policy, X)) if not _backend.finiteness_checker.compute( - policy, {"allow_nan": allow_nan}, X + policy, {"allow_nan": allow_nan}, X_table ).finite: type_err = "infinity" if allow_nan else "NaN, infinity" padded_input_name = input_name + " " if input_name else "" From e831167b32b85135b9e685c7dd83227db89603e2 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Fri, 1 Nov 2024 00:42:29 +0100 Subject: [PATCH 014/135] remove comment --- onedal/utils/validation.py | 1 - 1 file changed, 1 deletion(-) diff --git a/onedal/utils/validation.py b/onedal/utils/validation.py index 67c7a2dee0..10bb920291 100644 --- a/onedal/utils/validation.py +++ b/onedal/utils/validation.py @@ -441,7 +441,6 @@ def _is_csr(x): def _assert_all_finite(X, allow_nan=False, input_name=""): # NOTE: This function does not respond to target_offload, as the memory movement # is likely to cause a 
significant reduction in performance - # requires extracting the queue to generate a policy for converting the data to fp32 policy = _get_policy(None, X) X_table = to_table(_convert_to_supported(policy, X)) if not _backend.finiteness_checker.compute( From d6eb1d05e9de1c6bc0a1f9683659ddef4540480d Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Fri, 1 Nov 2024 00:57:56 +0100 Subject: [PATCH 015/135] Update validation.py --- onedal/utils/validation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onedal/utils/validation.py b/onedal/utils/validation.py index 10bb920291..f4597cd01c 100644 --- a/onedal/utils/validation.py +++ b/onedal/utils/validation.py @@ -21,8 +21,8 @@ import numpy as np from scipy import sparse as sp from onedal import _backend -from ..common._policy import _get_policy -from ..datatypes import _convert_to_supported, to_table +from onedal.common._policy import _get_policy +from onedal.datatypes import _convert_to_supported, to_table if np.lib.NumpyVersion(np.__version__) >= np.lib.NumpyVersion("2.0.0a0"): From fb30d6e69a2c6244112079a9c6a0dd75cd9a3a85 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Fri, 1 Nov 2024 22:34:52 +0100 Subject: [PATCH 016/135] Update __init__.py --- onedal/utils/__init__.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/onedal/utils/__init__.py b/onedal/utils/__init__.py index 0a1b05fbc2..0bc9ed35a3 100644 --- a/onedal/utils/__init__.py +++ b/onedal/utils/__init__.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== +import scipy.sparse as sp from .validation import ( _check_array, @@ -22,7 +23,6 @@ _column_or_1d, _is_arraylike, _is_arraylike_not_scalar, - _is_csr, _is_integral_float, _is_multilabel, _num_features, @@ -31,6 +31,12 @@ _validate_targets, ) +def _is_csr(x): + """Return True if x is scipy.sparse.csr_matrix or scipy.sparse.csr_array""" + return isinstance(x, sp.csr_matrix) or ( + hasattr(sp, "csr_array") and isinstance(x, sp.csr_array) + ) + __all__ = [ "_column_or_1d", "_validate_targets", From 63a18c2f66ad93720408c33aa3a3b05f74d58f48 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Fri, 1 Nov 2024 22:35:12 +0100 Subject: [PATCH 017/135] Update validation.py --- onedal/utils/validation.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/onedal/utils/validation.py b/onedal/utils/validation.py index f4597cd01c..1421bfaefc 100644 --- a/onedal/utils/validation.py +++ b/onedal/utils/validation.py @@ -431,13 +431,6 @@ def _num_samples(x): raise TypeError(message) from type_error -def _is_csr(x): - """Return True if x is scipy.sparse.csr_matrix or scipy.sparse.csr_array""" - return isinstance(x, sp.csr_matrix) or ( - hasattr(sp, "csr_array") and isinstance(x, sp.csr_array) - ) - - def _assert_all_finite(X, allow_nan=False, input_name=""): # NOTE: This function does not respond to target_offload, as the memory movement # is likely to cause a significant reduction in performance From 76c0856a12c04d4d3eb13d3c21382b1b84a23dc7 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Fri, 1 Nov 2024 22:40:03 +0100 Subject: [PATCH 018/135] Update __init__.py --- onedal/utils/__init__.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/onedal/utils/__init__.py b/onedal/utils/__init__.py index 0bc9ed35a3..a7e1495cf9 100644 --- a/onedal/utils/__init__.py +++ b/onedal/utils/__init__.py @@ -13,8 +13,12 @@ # See the License for the specific language governing permissions and # 
limitations under the License. # ============================================================================== -import scipy.sparse as sp +def _is_csr(x): + """Return True if x is scipy.sparse.csr_matrix or scipy.sparse.csr_array""" + return isinstance(x, sp.csr_matrix) or ( + hasattr(sp, "csr_array") and isinstance(x, sp.csr_array) + ) from .validation import ( _check_array, _check_classification_targets, @@ -23,6 +27,7 @@ _column_or_1d, _is_arraylike, _is_arraylike_not_scalar, + _is_csr, _is_integral_float, _is_multilabel, _num_features, @@ -31,12 +36,6 @@ _validate_targets, ) -def _is_csr(x): - """Return True if x is scipy.sparse.csr_matrix or scipy.sparse.csr_array""" - return isinstance(x, sp.csr_matrix) or ( - hasattr(sp, "csr_array") and isinstance(x, sp.csr_array) - ) - __all__ = [ "_column_or_1d", "_validate_targets", From 7deb2bbce9c0435b2484ae0fcfc754f5521bb01d Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Fri, 1 Nov 2024 22:40:24 +0100 Subject: [PATCH 019/135] Update __init__.py --- onedal/utils/__init__.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/onedal/utils/__init__.py b/onedal/utils/__init__.py index a7e1495cf9..0a1b05fbc2 100644 --- a/onedal/utils/__init__.py +++ b/onedal/utils/__init__.py @@ -14,11 +14,6 @@ # limitations under the License. 
# ============================================================================== -def _is_csr(x): - """Return True if x is scipy.sparse.csr_matrix or scipy.sparse.csr_array""" - return isinstance(x, sp.csr_matrix) or ( - hasattr(sp, "csr_array") and isinstance(x, sp.csr_array) - ) from .validation import ( _check_array, _check_classification_targets, From ed46b2907bb0a00678dab9c2516543941471b64a Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Fri, 1 Nov 2024 22:41:17 +0100 Subject: [PATCH 020/135] Update validation.py --- onedal/utils/validation.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/onedal/utils/validation.py b/onedal/utils/validation.py index 1421bfaefc..f4597cd01c 100644 --- a/onedal/utils/validation.py +++ b/onedal/utils/validation.py @@ -431,6 +431,13 @@ def _num_samples(x): raise TypeError(message) from type_error +def _is_csr(x): + """Return True if x is scipy.sparse.csr_matrix or scipy.sparse.csr_array""" + return isinstance(x, sp.csr_matrix) or ( + hasattr(sp, "csr_array") and isinstance(x, sp.csr_array) + ) + + def _assert_all_finite(X, allow_nan=False, input_name=""): # NOTE: This function does not respond to target_offload, as the memory movement # is likely to cause a significant reduction in performance From 67d6273f3520232daad4f7f16b49291240600e16 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Fri, 1 Nov 2024 22:42:45 +0100 Subject: [PATCH 021/135] Update _data_conversion.py --- onedal/datatypes/_data_conversion.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onedal/datatypes/_data_conversion.py b/onedal/datatypes/_data_conversion.py index 0caac10884..011a2eb89d 100644 --- a/onedal/datatypes/_data_conversion.py +++ b/onedal/datatypes/_data_conversion.py @@ -17,11 +17,11 @@ import warnings import numpy as np +import scipy.sparse as sp from daal4py.sklearn._utils import make2d from onedal import _backend, _is_dpc_backend -from ..utils import _is_csr from ..utils._dpep_helpers import is_dpctl_available 
dpctl_available = is_dpctl_available("0.14") @@ -46,7 +46,7 @@ def convert_one_to_table(arg): if isinstance(arg, dpt.usm_ndarray): return _backend.dpctl_to_table(arg) - if not _is_csr(arg): + if not sp.issparse(arg): arg = make2d(arg) return _backend.to_table(arg) From 8abead922bd8c2fceff7e8e6dffe4b76389fe1d4 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Fri, 1 Nov 2024 22:58:03 +0100 Subject: [PATCH 022/135] Update _data_conversion.py --- onedal/datatypes/_data_conversion.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onedal/datatypes/_data_conversion.py b/onedal/datatypes/_data_conversion.py index 386101eb14..12dc24eca3 100644 --- a/onedal/datatypes/_data_conversion.py +++ b/onedal/datatypes/_data_conversion.py @@ -103,7 +103,7 @@ def convert_one_to_table(arg, sua_iface=None): if sua_iface: return _backend.sua_iface_to_table(arg) - if not sp.sparse(arg): + if not sp.issparse(arg): arg = make2d(arg) return _backend.to_table(arg) @@ -130,7 +130,7 @@ def convert_one_to_table(arg, sua_iface=None): "SYCL usm array conversion to table requires the DPC backend" ) - if not sp.sparse(arg): + if not sp.issparse(arg): arg = make2d(arg) return _backend.to_table(arg) From 47d0f8bf7f0544089bcc2626dc06863be663757b Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Sat, 2 Nov 2024 00:39:18 +0100 Subject: [PATCH 023/135] Update policy_common.cpp --- onedal/common/policy_common.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/onedal/common/policy_common.cpp b/onedal/common/policy_common.cpp index bfb3c02cbd..3d8443378d 100644 --- a/onedal/common/policy_common.cpp +++ b/onedal/common/policy_common.cpp @@ -31,6 +31,10 @@ constexpr const char py_capsule_name[] = "PyCapsule"; constexpr const char get_capsule_name[] = "_get_capsule"; constexpr const char queue_capsule_name[] = "SyclQueueRef"; constexpr const char context_capsule_name[] = "SyclContextRef"; +constexpr const char device_name[] = "sycl_device"; +constexpr const char 
filter_name[] = "filter_selector"; + + sycl::queue extract_queue(py::capsule capsule) { constexpr const char* gtr_name = queue_capsule_name; @@ -79,7 +83,12 @@ sycl::queue get_queue_from_python(const py::object& syclobj) { const auto caps = syclobj.cast(); return extract_from_capsule(std::move(caps)); } - else { + else if (py::hasattr(syclobj, device_name) && py::hasattr(syclobj.attr(device_name), filter_name)) { + auto attr = syclobj.attr(device_name).attr(filter_name); + return get_queue_by_filter_string(attr.cast()); + } + else + { throw std::runtime_error("Unable to interpret \"syclobj\""); } } From e48c2bdca15b554e9b325508b8827465ae6d34bf Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Sat, 2 Nov 2024 00:45:56 +0100 Subject: [PATCH 024/135] Update policy_common.cpp --- onedal/common/policy_common.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/onedal/common/policy_common.cpp b/onedal/common/policy_common.cpp index 3d8443378d..364f248992 100644 --- a/onedal/common/policy_common.cpp +++ b/onedal/common/policy_common.cpp @@ -32,7 +32,7 @@ constexpr const char get_capsule_name[] = "_get_capsule"; constexpr const char queue_capsule_name[] = "SyclQueueRef"; constexpr const char context_capsule_name[] = "SyclContextRef"; constexpr const char device_name[] = "sycl_device"; -constexpr const char filter_name[] = "filter_selector"; +constexpr const char get_filter_name[] = "get_filter_string"; @@ -83,9 +83,9 @@ sycl::queue get_queue_from_python(const py::object& syclobj) { const auto caps = syclobj.cast(); return extract_from_capsule(std::move(caps)); } - else if (py::hasattr(syclobj, device_name) && py::hasattr(syclobj.attr(device_name), filter_name)) { - auto attr = syclobj.attr(device_name).attr(filter_name); - return get_queue_by_filter_string(attr.cast()); + else if (py::hasattr(syclobj, device_name) && py::hasattr(syclobj.attr(device_name), get_filter_name)) { + auto attr = syclobj.attr(device_name).attr(get_filter_name); + return 
get_queue_by_filter_string(attr().cast()); } else { From c6751c4bc2dea6fd8e38c470d9f398bb0b8f8161 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Sat, 2 Nov 2024 00:47:04 +0100 Subject: [PATCH 025/135] Update _policy.py --- onedal/common/_policy.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/onedal/common/_policy.py b/onedal/common/_policy.py index 90705854f6..abd267f4a6 100644 --- a/onedal/common/_policy.py +++ b/onedal/common/_policy.py @@ -48,12 +48,7 @@ def __init__(self): if _is_dpc_backend: - from onedal._device_offload import DummySyclQueue - class _DataParallelInteropPolicy(_backend.data_parallel_policy): def __init__(self, queue): self._queue = queue - if isinstance(queue, DummySyclQueue): - super().__init__(self._queue.sycl_device.get_filter_string()) - return super().__init__(self._queue) From f3e4a3a678298b7a7b135bae67ef29e293a45ee5 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Sat, 2 Nov 2024 01:01:33 +0100 Subject: [PATCH 026/135] Update policy_common.cpp --- onedal/common/policy_common.cpp | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/onedal/common/policy_common.cpp b/onedal/common/policy_common.cpp index 364f248992..3bd18c3689 100644 --- a/onedal/common/policy_common.cpp +++ b/onedal/common/policy_common.cpp @@ -34,8 +34,6 @@ constexpr const char context_capsule_name[] = "SyclContextRef"; constexpr const char device_name[] = "sycl_device"; constexpr const char get_filter_name[] = "get_filter_string"; - - sycl::queue extract_queue(py::capsule capsule) { constexpr const char* gtr_name = queue_capsule_name; constexpr std::size_t gtr_size = sizeof(queue_capsule_name); @@ -74,6 +72,20 @@ sycl::queue get_queue_by_get_capsule(const py::object& syclobj) { return extract_from_capsule(std::move(capsule)); } +sycl::queue get_queue_by_filter_string(const std::string& filter) { + filter_selector_wrapper selector{ filter }; + return sycl::queue{ selector }; +} + +sycl::queue 
get_queue_by_device_id(std::uint32_t id) { + if (auto device = get_device_by_id(id)) { + return sycl::queue{ device.value() }; + } + else { + throw std::runtime_error(unknown_device); + } +} + sycl::queue get_queue_from_python(const py::object& syclobj) { static auto pycapsule = py::cast(py_capsule_name); if (py::hasattr(syclobj, get_capsule_name)) { @@ -93,20 +105,6 @@ sycl::queue get_queue_from_python(const py::object& syclobj) { } } -sycl::queue get_queue_by_filter_string(const std::string& filter) { - filter_selector_wrapper selector{ filter }; - return sycl::queue{ selector }; -} - -sycl::queue get_queue_by_device_id(std::uint32_t id) { - if (auto device = get_device_by_id(id)) { - return sycl::queue{ device.value() }; - } - else { - throw std::runtime_error(unknown_device); - } -} - std::string get_device_name(const sycl::queue& queue) { const auto& device = queue.get_device(); if (device.is_gpu()) { From 39cdb5f3c48810a178b12608fa18eb2a8edecfd0 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Sat, 2 Nov 2024 01:28:12 +0100 Subject: [PATCH 027/135] Rename finiteness_checker.cpp to finiteness_checker.cpp --- onedal/{utils => primitives}/finiteness_checker.cpp | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename onedal/{utils => primitives}/finiteness_checker.cpp (100%) diff --git a/onedal/utils/finiteness_checker.cpp b/onedal/primitives/finiteness_checker.cpp similarity index 100% rename from onedal/utils/finiteness_checker.cpp rename to onedal/primitives/finiteness_checker.cpp From 0f39613063f153d054826cbcac9f931232c14177 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Sat, 2 Nov 2024 01:33:21 +0100 Subject: [PATCH 028/135] Create finiteness_checker.py --- onedal/primitives/finiteness_checker.py | 48 +++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 onedal/primitives/finiteness_checker.py diff --git a/onedal/primitives/finiteness_checker.py b/onedal/primitives/finiteness_checker.py new file mode 100644 index 
0000000000..c1a2b5c364 --- /dev/null +++ b/onedal/primitives/finiteness_checker.py @@ -0,0 +1,48 @@ +# ============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import scipy.sparse as sp + +from onedal import _backend +from onedal.common._policy import _get_policy +from onedal.datatypes import _convert_to_supported, to_table + + +def _assert_all_finite(X, allow_nan=False, input_name=""): + # NOTE: This function does not respond to target_offload, as the memory movement + # is likely to cause a significant reduction in performance + policy = _get_policy(None, X) + X_table = to_table(_convert_to_supported(policy, X)) + if not _backend.finiteness_checker.compute( + policy, {"allow_nan": allow_nan}, X_table + ).finite: + type_err = "infinity" if allow_nan else "NaN, infinity" + padded_input_name = input_name + " " if input_name else "" + msg_err = f"Input {padded_input_name}contains {type_err}." 
+ raise ValueError(msg_err) + + +def assert_all_finite( + X, + *, + allow_nan=False, + input_name="", +): + _assert_all_finite( + X.data if sp.issparse(X) else X, + allow_nan=allow_nan, + input_name=input_name, + ) From b42cfe365d6dba0735dee79e732b6f1bddd9b1dc Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Sat, 2 Nov 2024 01:33:45 +0100 Subject: [PATCH 029/135] Update validation.py --- onedal/utils/validation.py | 31 ------------------------------- 1 file changed, 31 deletions(-) diff --git a/onedal/utils/validation.py b/onedal/utils/validation.py index f4597cd01c..bb501617fa 100644 --- a/onedal/utils/validation.py +++ b/onedal/utils/validation.py @@ -20,10 +20,6 @@ import numpy as np from scipy import sparse as sp -from onedal import _backend -from onedal.common._policy import _get_policy -from onedal.datatypes import _convert_to_supported, to_table - if np.lib.NumpyVersion(np.__version__) >= np.lib.NumpyVersion("2.0.0a0"): # numpy_version >= 2.0 @@ -436,30 +432,3 @@ def _is_csr(x): return isinstance(x, sp.csr_matrix) or ( hasattr(sp, "csr_array") and isinstance(x, sp.csr_array) ) - - -def _assert_all_finite(X, allow_nan=False, input_name=""): - # NOTE: This function does not respond to target_offload, as the memory movement - # is likely to cause a significant reduction in performance - policy = _get_policy(None, X) - X_table = to_table(_convert_to_supported(policy, X)) - if not _backend.finiteness_checker.compute( - policy, {"allow_nan": allow_nan}, X_table - ).finite: - type_err = "infinity" if allow_nan else "NaN, infinity" - padded_input_name = input_name + " " if input_name else "" - msg_err = f"Input {padded_input_name}contains {type_err}." 
- raise ValueError(msg_err) - - -def assert_all_finite( - X, - *, - allow_nan=False, - input_name="", -): - _assert_all_finite( - X.data if sp.issparse(X) else X, - allow_nan=allow_nan, - input_name=input_name, - ) From 0ed615e9b44825e483aaad292187296416a08960 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Sat, 2 Nov 2024 01:34:51 +0100 Subject: [PATCH 030/135] Update __init__.py --- onedal/primitives/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/onedal/primitives/__init__.py b/onedal/primitives/__init__.py index 39213819b5..c501a78d67 100644 --- a/onedal/primitives/__init__.py +++ b/onedal/primitives/__init__.py @@ -15,13 +15,16 @@ # ============================================================================== from .get_tree import get_tree_state_cls, get_tree_state_reg +from .finiteness_checker import assert_all_finite, _assert_all_finite from .kernel_functions import linear_kernel, poly_kernel, rbf_kernel, sigmoid_kernel __all__ = [ + "assert_all_finite", "get_tree_state_cls", "get_tree_state_reg", "linear_kernel", "rbf_kernel", "poly_kernel", "sigmoid_kernel", + "_assert_all_finite", ] From f101affd5068f017edd6f399666528920a4e309f Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Sat, 2 Nov 2024 13:32:43 +0100 Subject: [PATCH 031/135] attempt at fixing circular imports again --- onedal/common/_policy.py | 1 + onedal/datatypes/_data_conversion.py | 31 ++++++++-------- onedal/primitives/finiteness_checker.py | 48 ------------------------- onedal/utils/validation.py | 31 ++++++++++++++++ 4 files changed, 49 insertions(+), 62 deletions(-) delete mode 100644 onedal/primitives/finiteness_checker.py diff --git a/onedal/common/_policy.py b/onedal/common/_policy.py index abd267f4a6..0d7d8ca6a3 100644 --- a/onedal/common/_policy.py +++ b/onedal/common/_policy.py @@ -48,6 +48,7 @@ def __init__(self): if _is_dpc_backend: + class _DataParallelInteropPolicy(_backend.data_parallel_policy): def __init__(self, queue): self._queue = queue diff --git 
a/onedal/datatypes/_data_conversion.py b/onedal/datatypes/_data_conversion.py index 12dc24eca3..af5b41eb6b 100644 --- a/onedal/datatypes/_data_conversion.py +++ b/onedal/datatypes/_data_conversion.py @@ -31,13 +31,23 @@ def _apply_and_pass(func, *args, **kwargs): if _is_dpc_backend: - from ..utils._dpep_helpers import dpctl_available, dpnp_available + try: + import dpnp - if dpctl_available: - import dpctl.tensor as dpt + def _onedal_gpu_table_to_array(table, xp=None): + # By default DPNP ndarray created with a copy. + # TODO: + # investigate why dpnp.array(table, copy=False) doesn't work. + # Work around with using dpctl.tensor.asarray. + if xp == dpnp: + return dpnp.array(dpnp.dpctl.tensor.asarray(table), copy=False) + else: + return xp.asarray(table) - if dpnp_available: - import dpnp + except ImportError: + + def _onedal_gpu_table_to_array(table, xp=None): + return xp.asarray(table) from ..common._policy import _HostInteropPolicy @@ -86,15 +96,8 @@ def convert_one_from_table(table, sycl_queue=None, sua_iface=None, xp=None): _backend.from_table(table), usm_type="device", sycl_queue=sycl_queue ) else: - xp_name = xp.__name__ - if dpnp_available and xp_name == "dpnp": - # By default DPNP ndarray created with a copy. - # TODO: - # investigate why dpnp.array(table, copy=False) doesn't work. - # Work around with using dpctl.tensor.asarray. 
- return dpnp.array(dpt.asarray(table), copy=False) - else: - return xp.asarray(table) + return _onedal_gpu_table_to_array(table, xp=xp) + return _backend.from_table(table) def convert_one_to_table(arg, sua_iface=None): diff --git a/onedal/primitives/finiteness_checker.py b/onedal/primitives/finiteness_checker.py deleted file mode 100644 index c1a2b5c364..0000000000 --- a/onedal/primitives/finiteness_checker.py +++ /dev/null @@ -1,48 +0,0 @@ -# ============================================================================== -# Copyright 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -import scipy.sparse as sp - -from onedal import _backend -from onedal.common._policy import _get_policy -from onedal.datatypes import _convert_to_supported, to_table - - -def _assert_all_finite(X, allow_nan=False, input_name=""): - # NOTE: This function does not respond to target_offload, as the memory movement - # is likely to cause a significant reduction in performance - policy = _get_policy(None, X) - X_table = to_table(_convert_to_supported(policy, X)) - if not _backend.finiteness_checker.compute( - policy, {"allow_nan": allow_nan}, X_table - ).finite: - type_err = "infinity" if allow_nan else "NaN, infinity" - padded_input_name = input_name + " " if input_name else "" - msg_err = f"Input {padded_input_name}contains {type_err}." 
- raise ValueError(msg_err) - - -def assert_all_finite( - X, - *, - allow_nan=False, - input_name="", -): - _assert_all_finite( - X.data if sp.issparse(X) else X, - allow_nan=allow_nan, - input_name=input_name, - ) diff --git a/onedal/utils/validation.py b/onedal/utils/validation.py index bb501617fa..c620b7b2e4 100644 --- a/onedal/utils/validation.py +++ b/onedal/utils/validation.py @@ -35,6 +35,10 @@ _assert_all_finite as _daal4py_assert_all_finite, ) +from onedal import _backend +from onedal.common._policy import _get_policy +from onedal.datatypes import _convert_to_supported, to_table + class DataConversionWarning(UserWarning): """Warning used to notify implicit data conversions happening in the code.""" @@ -432,3 +436,30 @@ def _is_csr(x): return isinstance(x, sp.csr_matrix) or ( hasattr(sp, "csr_array") and isinstance(x, sp.csr_array) ) + + +def _assert_all_finite(X, allow_nan=False, input_name=""): + # NOTE: This function does not respond to target_offload, as the memory movement + # is likely to cause a significant reduction in performance + policy = _get_policy(None, X) + X_table = to_table(_convert_to_supported(policy, X)) + if not _backend.finiteness_checker.compute( + policy, {"allow_nan": allow_nan}, X_table + ).finite: + type_err = "infinity" if allow_nan else "NaN, infinity" + padded_input_name = input_name + " " if input_name else "" + msg_err = f"Input {padded_input_name}contains {type_err}." 
+ raise ValueError(msg_err) + + +def assert_all_finite( + X, + *, + allow_nan=False, + input_name="", +): + _assert_all_finite( + X.data if sp.issparse(X) else X, + allow_nan=allow_nan, + input_name=input_name, + ) From 24c0e9472a85b2023ddb21a27fe6a783adb5cc1c Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Sat, 2 Nov 2024 13:33:06 +0100 Subject: [PATCH 032/135] fix isort --- onedal/primitives/__init__.py | 2 +- onedal/utils/validation.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/onedal/primitives/__init__.py b/onedal/primitives/__init__.py index c501a78d67..79d72e2f16 100644 --- a/onedal/primitives/__init__.py +++ b/onedal/primitives/__init__.py @@ -14,8 +14,8 @@ # limitations under the License. # ============================================================================== +from .finiteness_checker import _assert_all_finite, assert_all_finite from .get_tree import get_tree_state_cls, get_tree_state_reg -from .finiteness_checker import assert_all_finite, _assert_all_finite from .kernel_functions import linear_kernel, poly_kernel, rbf_kernel, sigmoid_kernel __all__ = [ diff --git a/onedal/utils/validation.py b/onedal/utils/validation.py index c620b7b2e4..4c5cc9746f 100644 --- a/onedal/utils/validation.py +++ b/onedal/utils/validation.py @@ -34,7 +34,6 @@ from daal4py.sklearn.utils.validation import ( _assert_all_finite as _daal4py_assert_all_finite, ) - from onedal import _backend from onedal.common._policy import _get_policy from onedal.datatypes import _convert_to_supported, to_table From 3f96166299d3ac5f07931ba64e5b0e96af345496 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Sat, 2 Nov 2024 13:35:06 +0100 Subject: [PATCH 033/135] remove __init__ changes --- onedal/primitives/__init__.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/onedal/primitives/__init__.py b/onedal/primitives/__init__.py index 79d72e2f16..39213819b5 100644 --- a/onedal/primitives/__init__.py +++ b/onedal/primitives/__init__.py @@ -14,17 +14,14 @@ # 
limitations under the License. # ============================================================================== -from .finiteness_checker import _assert_all_finite, assert_all_finite from .get_tree import get_tree_state_cls, get_tree_state_reg from .kernel_functions import linear_kernel, poly_kernel, rbf_kernel, sigmoid_kernel __all__ = [ - "assert_all_finite", "get_tree_state_cls", "get_tree_state_reg", "linear_kernel", "rbf_kernel", "poly_kernel", "sigmoid_kernel", - "_assert_all_finite", ] From d98505388701b670e037148e14490163e5675590 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Sat, 2 Nov 2024 13:35:50 +0100 Subject: [PATCH 034/135] last move --- onedal/{primitives => utils}/finiteness_checker.cpp | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename onedal/{primitives => utils}/finiteness_checker.cpp (100%) diff --git a/onedal/primitives/finiteness_checker.cpp b/onedal/utils/finiteness_checker.cpp similarity index 100% rename from onedal/primitives/finiteness_checker.cpp rename to onedal/utils/finiteness_checker.cpp From 90ec48b46bc0c06a1da5b07e7b5d93efc12c12b7 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Sat, 2 Nov 2024 14:39:03 +0100 Subject: [PATCH 035/135] Update policy_common.cpp --- onedal/common/policy_common.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/onedal/common/policy_common.cpp b/onedal/common/policy_common.cpp index 3bd18c3689..828be51547 100644 --- a/onedal/common/policy_common.cpp +++ b/onedal/common/policy_common.cpp @@ -87,11 +87,10 @@ sycl::queue get_queue_by_device_id(std::uint32_t id) { } sycl::queue get_queue_from_python(const py::object& syclobj) { - static auto pycapsule = py::cast(py_capsule_name); if (py::hasattr(syclobj, get_capsule_name)) { return get_queue_by_get_capsule(syclobj); } - else if (py::isinstance(syclobj, pycapsule)) { + else if (py::isinstance(syclobj, py::capsule)) { const auto caps = syclobj.cast(); return extract_from_capsule(std::move(caps)); } From 
8c2c854c06b0e4486aae563418ea047d24f528df Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Sat, 2 Nov 2024 14:59:19 +0100 Subject: [PATCH 036/135] Update policy_common.cpp --- onedal/common/policy_common.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/onedal/common/policy_common.cpp b/onedal/common/policy_common.cpp index 828be51547..224e7a04e1 100644 --- a/onedal/common/policy_common.cpp +++ b/onedal/common/policy_common.cpp @@ -19,7 +19,6 @@ #endif // ONEDAL_DATA_PARALLEL #include - #include "onedal/common/policy_common.hpp" namespace oneapi::dal::python { @@ -90,7 +89,7 @@ sycl::queue get_queue_from_python(const py::object& syclobj) { if (py::hasattr(syclobj, get_capsule_name)) { return get_queue_by_get_capsule(syclobj); } - else if (py::isinstance(syclobj, py::capsule)) { + else if (py::isinstance(syclobj) { const auto caps = syclobj.cast(); return extract_from_capsule(std::move(caps)); } From 6fa38d7f49d95a831d663101e076530297980865 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Sat, 2 Nov 2024 15:07:44 +0100 Subject: [PATCH 037/135] Update policy_common.cpp --- onedal/common/policy_common.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onedal/common/policy_common.cpp b/onedal/common/policy_common.cpp index 224e7a04e1..b10c60880d 100644 --- a/onedal/common/policy_common.cpp +++ b/onedal/common/policy_common.cpp @@ -89,7 +89,7 @@ sycl::queue get_queue_from_python(const py::object& syclobj) { if (py::hasattr(syclobj, get_capsule_name)) { return get_queue_by_get_capsule(syclobj); } - else if (py::isinstance(syclobj) { + else if (py::isinstance(syclobj)) { const auto caps = syclobj.cast(); return extract_from_capsule(std::move(caps)); } From 9c1ca9c3f29d3f00f5b10444e3e78101fb39adc0 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Sat, 2 Nov 2024 17:22:59 +0100 Subject: [PATCH 038/135] Update policy_common.cpp --- onedal/common/policy_common.cpp | 1 + 1 file changed, 1 insertion(+) diff --git 
a/onedal/common/policy_common.cpp b/onedal/common/policy_common.cpp index b10c60880d..284762b035 100644 --- a/onedal/common/policy_common.cpp +++ b/onedal/common/policy_common.cpp @@ -19,6 +19,7 @@ #endif // ONEDAL_DATA_PARALLEL #include + #include "onedal/common/policy_common.hpp" namespace oneapi::dal::python { From 4b67dbde880bfa8c3d5373473a589bd2f6577c56 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Sat, 2 Nov 2024 19:27:45 +0100 Subject: [PATCH 039/135] Update validation.py --- onedal/utils/validation.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/onedal/utils/validation.py b/onedal/utils/validation.py index 4c5cc9746f..2ea8de8f51 100644 --- a/onedal/utils/validation.py +++ b/onedal/utils/validation.py @@ -438,8 +438,6 @@ def _is_csr(x): def _assert_all_finite(X, allow_nan=False, input_name=""): - # NOTE: This function does not respond to target_offload, as the memory movement - # is likely to cause a significant reduction in performance policy = _get_policy(None, X) X_table = to_table(_convert_to_supported(policy, X)) if not _backend.finiteness_checker.compute( From fa59a3c0103e9bd9d31ac1c0bf94cc9d1f86ae26 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Sat, 2 Nov 2024 22:23:58 +0100 Subject: [PATCH 040/135] add testing --- onedal/utils/tests/test_validation.py | 115 ++++++++++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 onedal/utils/tests/test_validation.py diff --git a/onedal/utils/tests/test_validation.py b/onedal/utils/tests/test_validation.py new file mode 100644 index 0000000000..406a2fd7bc --- /dev/null +++ b/onedal/utils/tests/test_validation.py @@ -0,0 +1,115 @@ +# ============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import time + +import numpy as np +import numpy.random as rand +import pytest +from numpy.testing import assert_raises + +from onedal.tests.utils._dataframes_support import ( + _convert_to_dataframe, + get_dataframes_and_queues, +) +from onedal.utils.validation import assert_all_finite, _assert_all_finite + + +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize( + "shape", + [ + [16, 2048], + [ + 2**16 + 3, + ], + [1000, 1000], + [ + 3, + ], + ], +) +@pytest.mark.parametrize("allow_nan", [False, True]) +@pytest.mark.parametrize( + "dataframe, queue", get_dataframes_and_queues("numpy,dpnp,dpctl") +) +def test_sum_infinite_actually_finite(dtype, shape, allow_nan, dataframe, queue): + X = np.array(shape, dtype=dtype) + X.fill(np.finfo(dtype).max) + X = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe) + _assert_all_finite(X, allow_nan=allow_nan) + + +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize( + "shape", + [ + [16, 2048], + [ + 2**16 + 3, + ], + [1000, 1000], + [ + 3, + ], + ], +) +@pytest.mark.parametrize("allow_nan", [False, True]) +@pytest.mark.parametrize("check", ["inf", "NaN", None]) +@pytest.mark.parametrize("seed", [0, int(time.time())]) +@pytest.mark.parametrize( + "dataframe, queue", get_dataframes_and_queues("numpy,dpnp,dpctl") +) +def test_assert_finite_random_location( + dtype, shape, allow_nan, check, seed, dataframe, queue +): + rand.seed(seed) + X = 
rand.uniform(high=np.finfo(dtype).max, size=shape).astype(dtype) + X = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe) + + if check: + loc = rand.randint(0, X.size - 1) + X.reshape((-1,))[loc] = float(check) + + if check is None or (allow_nan and check == "NaN"): + _assert_all_finite(X, allow_nan=allow_nan) + else: + assert_raises(ValueError, _assert_all_finite, X, allow_nan=allow_nan) + + +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("allow_nan", [False, True]) +@pytest.mark.parametrize("check", ["inf", "NaN", None]) +@pytest.mark.parametrize("seed", [0, int(time.time())]) +@pytest.mark.parametrize( + "dataframe, queue", get_dataframes_and_queues("numpy,dpnp,dpctl") +) +def test_assert_finite_random_shape_and_location( + dtype, allow_nan, check, seed, dataframe, queue +): + lb, ub = 2, 1048576 # lb is a patching condition, ub 2^20 + rand.seed(seed) + X = rand.uniform(high=np.finfo(dtype).max, size=rand.randint(lb, ub)).astype(dtype) + X = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe) + + if check: + loc = rand.randint(0, X.size - 1) + X[loc] = float(check) + + if check is None or (allow_nan and check == "NaN"): + _assert_all_finite(X, allow_nan=allow_nan) + else: + assert_raises(ValueError, _assert_all_finite, X, allow_nan=allow_nan) From 3330b3312f07a751859d8e9c7639512e5d035ed3 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Sat, 2 Nov 2024 22:24:38 +0100 Subject: [PATCH 041/135] isort --- onedal/utils/tests/test_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onedal/utils/tests/test_validation.py b/onedal/utils/tests/test_validation.py index 406a2fd7bc..5788a9ccc3 100644 --- a/onedal/utils/tests/test_validation.py +++ b/onedal/utils/tests/test_validation.py @@ -25,7 +25,7 @@ _convert_to_dataframe, get_dataframes_and_queues, ) -from onedal.utils.validation import assert_all_finite, _assert_all_finite +from onedal.utils.validation import _assert_all_finite, 
assert_all_finite @pytest.mark.parametrize("dtype", [np.float32, np.float64]) From 48959403bde34845dd7bcc9bb357cc6e79eb846e Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Sat, 2 Nov 2024 22:53:23 +0100 Subject: [PATCH 042/135] attempt to fix module error --- onedal/utils/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onedal/utils/validation.py b/onedal/utils/validation.py index 2ea8de8f51..9b33d49fe0 100644 --- a/onedal/utils/validation.py +++ b/onedal/utils/validation.py @@ -440,7 +440,7 @@ def _is_csr(x): def _assert_all_finite(X, allow_nan=False, input_name=""): policy = _get_policy(None, X) X_table = to_table(_convert_to_supported(policy, X)) - if not _backend.finiteness_checker.compute( + if not _backend.finiteness_checker.compute.compute( policy, {"allow_nan": allow_nan}, X_table ).finite: type_err = "infinity" if allow_nan else "NaN, infinity" From 0c6dd5d284155478773d1d4cf88c4fab3c9b6558 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Sat, 2 Nov 2024 23:20:51 +0100 Subject: [PATCH 043/135] add fptype --- onedal/utils/validation.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/onedal/utils/validation.py b/onedal/utils/validation.py index 9b33d49fe0..f6e62bef14 100644 --- a/onedal/utils/validation.py +++ b/onedal/utils/validation.py @@ -439,10 +439,12 @@ def _is_csr(x): def _assert_all_finite(X, allow_nan=False, input_name=""): policy = _get_policy(None, X) - X_table = to_table(_convert_to_supported(policy, X)) - if not _backend.finiteness_checker.compute.compute( - policy, {"allow_nan": allow_nan}, X_table - ).finite: + X_t = to_table(_convert_to_supported(policy, X)) + params = { + "fptype": "float" if X_t.dtype.name == "float32" else "double", + "allow_nan": allow_nan, + } + if not _backend.finiteness_checker.compute.compute(policy, params, X_t).finite: type_err = "infinity" if allow_nan else "NaN, infinity" padded_input_name = input_name + " " if input_name else "" msg_err = f"Input 
{padded_input_name}contains {type_err}." From e2182fa81ffc0b35b485a01f43b1d0dca5bb79e1 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Sat, 2 Nov 2024 23:40:24 +0100 Subject: [PATCH 044/135] fix typo --- onedal/utils/validation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/onedal/utils/validation.py b/onedal/utils/validation.py index f6e62bef14..1ce7e5378d 100644 --- a/onedal/utils/validation.py +++ b/onedal/utils/validation.py @@ -438,12 +438,12 @@ def _is_csr(x): def _assert_all_finite(X, allow_nan=False, input_name=""): - policy = _get_policy(None, X) - X_t = to_table(_convert_to_supported(policy, X)) params = { - "fptype": "float" if X_t.dtype.name == "float32" else "double", + "fptype": "float" if X.dtype.name == "float32" else "double", "allow_nan": allow_nan, } + policy = _get_policy(None, X) + X_t = to_table(_convert_to_supported(policy, X)) if not _backend.finiteness_checker.compute.compute(policy, params, X_t).finite: type_err = "infinity" if allow_nan else "NaN, infinity" padded_input_name = input_name + " " if input_name else "" From 982ef2c8e57e56d4d018b72fa7cd3e7ba58e0ebb Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Sun, 3 Nov 2024 00:02:35 +0100 Subject: [PATCH 045/135] Update validation.py --- onedal/utils/validation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/onedal/utils/validation.py b/onedal/utils/validation.py index 1ce7e5378d..6298f3ee5a 100644 --- a/onedal/utils/validation.py +++ b/onedal/utils/validation.py @@ -438,11 +438,12 @@ def _is_csr(x): def _assert_all_finite(X, allow_nan=False, input_name=""): + policy = _get_policy(None, X) params = { "fptype": "float" if X.dtype.name == "float32" else "double", + "method": "dense", "allow_nan": allow_nan, } - policy = _get_policy(None, X) X_t = to_table(_convert_to_supported(policy, X)) if not _backend.finiteness_checker.compute.compute(policy, params, X_t).finite: type_err = "infinity" if allow_nan else "NaN, infinity" From 
2fb52a82bc27226d53ddfa27a462840e2011c9cb Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Sun, 3 Nov 2024 02:38:41 +0100 Subject: [PATCH 046/135] remove sua_ifcae from to_table --- onedal/datatypes/_data_conversion.py | 39 +++++++++++----------------- onedal/datatypes/table.cpp | 11 ++++---- onedal/datatypes/tests/test_data.py | 12 ++++----- sklearnex/tests/test_memory_usage.py | 6 ++--- 4 files changed, 30 insertions(+), 38 deletions(-) diff --git a/onedal/datatypes/_data_conversion.py b/onedal/datatypes/_data_conversion.py index af5b41eb6b..2ef6903041 100644 --- a/onedal/datatypes/_data_conversion.py +++ b/onedal/datatypes/_data_conversion.py @@ -19,15 +19,29 @@ import numpy as np import scipy.sparse as sp -from daal4py.sklearn._utils import make2d from onedal import _backend, _is_dpc_backend +def make2d(X): + # generalized for array-like inputs + if hasattr(X, "reshape") and hasattr(X, "ndim") and X.ndim == 1: + return X.reshape((-1, 1)) + if np.isscalar(X): + return np.atleast_2d(X) + return X + + def _apply_and_pass(func, *args, **kwargs): if len(args) == 1: return func(args[0], **kwargs) return tuple(map(lambda arg: func(arg, **kwargs), args)) +def convert_one_to_table(arg): + return _backend.to_table(arg if sp.issparse(arg) else make2d(arg)) + +def to_table(*args): + return _apply_and_pass(convert_one_to_table, *args) + if _is_dpc_backend: @@ -100,16 +114,6 @@ def convert_one_from_table(table, sycl_queue=None, sua_iface=None, xp=None): return _backend.from_table(table) - def convert_one_to_table(arg, sua_iface=None): - # Note: currently only oneDAL homogen tables are supported and the - # contiuginity of the input array should be checked in advance. 
- if sua_iface: - return _backend.sua_iface_to_table(arg) - - if not sp.issparse(arg): - arg = make2d(arg) - return _backend.to_table(arg) - else: def _convert_to_supported(policy, *data): @@ -127,22 +131,9 @@ def convert_one_from_table(table, sycl_queue=None, sua_iface=None, xp=None): ) return _backend.from_table(table) - def convert_one_to_table(arg, sua_iface=None): - if sua_iface: - raise RuntimeError( - "SYCL usm array conversion to table requires the DPC backend" - ) - - if not sp.issparse(arg): - arg = make2d(arg) - return _backend.to_table(arg) - def from_table(*args, sycl_queue=None, sua_iface=None, xp=None): return _apply_and_pass( convert_one_from_table, *args, sycl_queue=sycl_queue, sua_iface=sua_iface, xp=xp ) - -def to_table(*args, sua_iface=None): - return _apply_and_pass(convert_one_to_table, *args, sua_iface=sua_iface) diff --git a/onedal/datatypes/table.cpp b/onedal/datatypes/table.cpp index 9771306118..ce0f15936b 100644 --- a/onedal/datatypes/table.cpp +++ b/onedal/datatypes/table.cpp @@ -78,6 +78,12 @@ ONEDAL_PY_INIT_MODULE(table) { #endif // ONEDAL_DATA_PARALLEL m.def("to_table", [](py::object obj) { + #ifdef ONEDAL_DATA_PARALLEL + if (py::hasattr(obj, "__sycl_usm_array_interface__")) { + return convert_from_sua_iface(obj); + } + #endif // ONEDAL_DATA_PARALLEL + auto* obj_ptr = obj.ptr(); return convert_to_table(obj_ptr); }); @@ -87,11 +93,6 @@ ONEDAL_PY_INIT_MODULE(table) { return obj_ptr; }); -#ifdef ONEDAL_DATA_PARALLEL - m.def("sua_iface_to_table", [](py::object obj) { - return convert_from_sua_iface(obj); - }); -#endif // ONEDAL_DATA_PARALLEL } } // namespace oneapi::dal::python diff --git a/onedal/datatypes/tests/test_data.py b/onedal/datatypes/tests/test_data.py index 471d6f0a64..de47e18ad4 100644 --- a/onedal/datatypes/tests/test_data.py +++ b/onedal/datatypes/tests/test_data.py @@ -68,7 +68,7 @@ def fit(self, X, y=None): X = xp.astype(X, dtype=xp.float64) dtype = get_dtype(X) params = bs_DBSCAN._get_onedal_params(dtype) - X_table = 
to_table(X, sua_iface=sua_iface) + X_table = to_table(X) # TODO: # check other candidates for the dummy base oneDAL func. # oneDAL backend func is needed to check result table checks. @@ -251,7 +251,7 @@ def test_input_sua_iface_zero_copy(dataframe, queue, order, dtype): sua_iface, X_dp_namespace, _ = _get_sycl_namespace(X_dp) - X_table = to_table(X_dp, sua_iface=sua_iface) + X_table = to_table(X_dp) _assert_sua_iface_fields(X_dp, X_table) X_dp_from_table = from_table( @@ -339,7 +339,7 @@ def test_sua_iface_interop_invalid_shape(dataframe, queue, data_shape): "Unable to convert from SUA interface: only 1D & 2D tensors are allowed" ) with pytest.raises(ValueError, match=expected_err_msg): - to_table(X, sua_iface=sua_iface) + to_table(X) @pytest.mark.skipif( @@ -368,7 +368,7 @@ def test_sua_iface_interop_unsupported_dtypes(dataframe, queue, dtype): expected_err_msg = "Unable to convert from SUA interface: unknown data type" with pytest.raises(ValueError, match=expected_err_msg): - to_table(X, sua_iface=sua_iface) + to_table(X) @pytest.mark.parametrize( @@ -393,7 +393,7 @@ def test_to_table_non_contiguous_input(dataframe, queue): else: expected_err_msg = "Numpy input Could not convert Python object to onedal table." 
with pytest.raises(ValueError, match=expected_err_msg): - to_table(X, sua_iface=sua_iface) + to_table(X) @pytest.mark.skipif( @@ -411,4 +411,4 @@ def test_sua_iface_interop_if_no_dpc_backend(dataframe, queue, dtype): expected_err_msg = "SYCL usm array conversion to table requires the DPC backend" with pytest.raises(RuntimeError, match=expected_err_msg): - to_table(X, sua_iface=sua_iface) + to_table(X) diff --git a/sklearnex/tests/test_memory_usage.py b/sklearnex/tests/test_memory_usage.py index 4035832d37..6e7fdb72b5 100644 --- a/sklearnex/tests/test_memory_usage.py +++ b/sklearnex/tests/test_memory_usage.py @@ -142,8 +142,8 @@ class DummyEstimatorWithTableConversions(BaseEstimator): def fit(self, X, y=None): sua_iface, xp, _ = _get_sycl_namespace(X) - X_table = to_table(X, sua_iface=sua_iface) - y_table = to_table(y, sua_iface=sua_iface) + X_table = to_table(X) + y_table = to_table(y) # The presence of the fitted attributes (ending with a trailing # underscore) is required for the correct check. The cleanup of # the memory will occur at the estimator instance deletion. @@ -160,7 +160,7 @@ def predict(self, X): # fitted attributes (ending with a trailing underscore). 
check_is_fitted(self) sua_iface, xp, _ = _get_sycl_namespace(X) - X_table = to_table(X, sua_iface=sua_iface) + X_table = to_table(X) returned_X = from_table( X_table, sua_iface=sua_iface, sycl_queue=X.sycl_queue, xp=xp ) From 28dc267ab319edf2cef611340c0ab634eae036c4 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Sun, 3 Nov 2024 02:42:29 +0100 Subject: [PATCH 047/135] isort and black --- onedal/datatypes/_data_conversion.py | 3 ++- onedal/datatypes/table.cpp | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/onedal/datatypes/_data_conversion.py b/onedal/datatypes/_data_conversion.py index 2ef6903041..c08196f1d6 100644 --- a/onedal/datatypes/_data_conversion.py +++ b/onedal/datatypes/_data_conversion.py @@ -36,9 +36,11 @@ def _apply_and_pass(func, *args, **kwargs): return func(args[0], **kwargs) return tuple(map(lambda arg: func(arg, **kwargs), args)) + def convert_one_to_table(arg): return _backend.to_table(arg if sp.issparse(arg) else make2d(arg)) + def to_table(*args): return _apply_and_pass(convert_one_to_table, *args) @@ -136,4 +138,3 @@ def from_table(*args, sycl_queue=None, sua_iface=None, xp=None): return _apply_and_pass( convert_one_from_table, *args, sycl_queue=sycl_queue, sua_iface=sua_iface, xp=xp ) - diff --git a/onedal/datatypes/table.cpp b/onedal/datatypes/table.cpp index ce0f15936b..113d881228 100644 --- a/onedal/datatypes/table.cpp +++ b/onedal/datatypes/table.cpp @@ -92,7 +92,6 @@ ONEDAL_PY_INIT_MODULE(table) { auto* obj_ptr = convert_to_pyobject(t); return obj_ptr; }); - } } // namespace oneapi::dal::python From 2f85fd4713535424395acfe5d0f72d1451c27d16 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Sun, 3 Nov 2024 08:19:57 +0100 Subject: [PATCH 048/135] Update test_memory_usage.py --- sklearnex/tests/test_memory_usage.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sklearnex/tests/test_memory_usage.py b/sklearnex/tests/test_memory_usage.py index 6e7fdb72b5..6e3ef2b3f7 100644 --- 
a/sklearnex/tests/test_memory_usage.py +++ b/sklearnex/tests/test_memory_usage.py @@ -142,6 +142,14 @@ class DummyEstimatorWithTableConversions(BaseEstimator): def fit(self, X, y=None): sua_iface, xp, _ = _get_sycl_namespace(X) + assert X.flags['C_CONTIGUOUS'] or X.flags['F_CONTIGUOUS'] + assert y.flags['C_CONTIGUOUS'] or y.flags['F_CONTIGUOUS'] + if not (X.flags['C_CONTIGUOUS'] or X.flags['F_CONTIGUOUS']): + X = xp.copy(X) + if not (y.flags['C_CONTIGUOUS'] or y.flags['F_CONTIGUOUS']): + y = xp.copy(y) + assert X.flags['C_CONTIGUOUS'] or X.flags['F_CONTIGUOUS'] + assert y.flags['C_CONTIGUOUS'] or y.flags['F_CONTIGUOUS'] X_table = to_table(X) y_table = to_table(y) # The presence of the fitted attributes (ending with a trailing @@ -160,6 +168,10 @@ def predict(self, X): # fitted attributes (ending with a trailing underscore). check_is_fitted(self) sua_iface, xp, _ = _get_sycl_namespace(X) + assert X.flags['C_CONTIGUOUS'] or X.flags['F_CONTIGUOUS'] + if not (X.flags['C_CONTIGUOUS'] or X.flags['F_CONTIGUOUS']): + X = xp.copy(X) + assert X.flags['C_CONTIGUOUS'] or X.flags['F_CONTIGUOUS'] X_table = to_table(X) returned_X = from_table( X_table, sua_iface=sua_iface, sycl_queue=X.sycl_queue, xp=xp From 8659248f70dc78cc94058690e217fa6383747b9b Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Sun, 3 Nov 2024 09:19:39 +0100 Subject: [PATCH 049/135] format --- sklearnex/tests/test_memory_usage.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/sklearnex/tests/test_memory_usage.py b/sklearnex/tests/test_memory_usage.py index 6e3ef2b3f7..214c03a6ba 100644 --- a/sklearnex/tests/test_memory_usage.py +++ b/sklearnex/tests/test_memory_usage.py @@ -142,14 +142,14 @@ class DummyEstimatorWithTableConversions(BaseEstimator): def fit(self, X, y=None): sua_iface, xp, _ = _get_sycl_namespace(X) - assert X.flags['C_CONTIGUOUS'] or X.flags['F_CONTIGUOUS'] - assert y.flags['C_CONTIGUOUS'] or y.flags['F_CONTIGUOUS'] - if not (X.flags['C_CONTIGUOUS'] or 
X.flags['F_CONTIGUOUS']): + assert X.flags["C_CONTIGUOUS"] or X.flags["F_CONTIGUOUS"] + assert y.flags["C_CONTIGUOUS"] or y.flags["F_CONTIGUOUS"] + if not (X.flags["C_CONTIGUOUS"] or X.flags["F_CONTIGUOUS"]): X = xp.copy(X) - if not (y.flags['C_CONTIGUOUS'] or y.flags['F_CONTIGUOUS']): + if not (y.flags["C_CONTIGUOUS"] or y.flags["F_CONTIGUOUS"]): y = xp.copy(y) - assert X.flags['C_CONTIGUOUS'] or X.flags['F_CONTIGUOUS'] - assert y.flags['C_CONTIGUOUS'] or y.flags['F_CONTIGUOUS'] + assert X.flags["C_CONTIGUOUS"] or X.flags["F_CONTIGUOUS"] + assert y.flags["C_CONTIGUOUS"] or y.flags["F_CONTIGUOUS"] X_table = to_table(X) y_table = to_table(y) # The presence of the fitted attributes (ending with a trailing @@ -168,10 +168,10 @@ def predict(self, X): # fitted attributes (ending with a trailing underscore). check_is_fitted(self) sua_iface, xp, _ = _get_sycl_namespace(X) - assert X.flags['C_CONTIGUOUS'] or X.flags['F_CONTIGUOUS'] - if not (X.flags['C_CONTIGUOUS'] or X.flags['F_CONTIGUOUS']): + assert X.flags["C_CONTIGUOUS"] or X.flags["F_CONTIGUOUS"] + if not (X.flags["C_CONTIGUOUS"] or X.flags["F_CONTIGUOUS"]): X = xp.copy(X) - assert X.flags['C_CONTIGUOUS'] or X.flags['F_CONTIGUOUS'] + assert X.flags["C_CONTIGUOUS"] or X.flags["F_CONTIGUOUS"] X_table = to_table(X) returned_X = from_table( X_table, sua_iface=sua_iface, sycl_queue=X.sycl_queue, xp=xp From 3827d6f38cfcd5ef065d8d6a3ea34bc749de436a Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Sun, 3 Nov 2024 11:01:26 +0100 Subject: [PATCH 050/135] Update _data_conversion.py --- onedal/datatypes/_data_conversion.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/onedal/datatypes/_data_conversion.py b/onedal/datatypes/_data_conversion.py index c08196f1d6..0deacf4c74 100644 --- a/onedal/datatypes/_data_conversion.py +++ b/onedal/datatypes/_data_conversion.py @@ -24,8 +24,9 @@ def make2d(X): # generalized for array-like inputs + # dpnp -1 indexing is broken, use size if hasattr(X, "reshape") and hasattr(X, 
"ndim") and X.ndim == 1: - return X.reshape((-1, 1)) + return X.reshape((X.size, 1)) if np.isscalar(X): return np.atleast_2d(X) return X From 55fa7d214f7a2f0398f1a83a7961a8491c587269 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Sun, 3 Nov 2024 12:28:38 +0100 Subject: [PATCH 051/135] Update _data_conversion.py --- onedal/datatypes/_data_conversion.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/onedal/datatypes/_data_conversion.py b/onedal/datatypes/_data_conversion.py index 0deacf4c74..353fef7e9c 100644 --- a/onedal/datatypes/_data_conversion.py +++ b/onedal/datatypes/_data_conversion.py @@ -22,16 +22,6 @@ from onedal import _backend, _is_dpc_backend -def make2d(X): - # generalized for array-like inputs - # dpnp -1 indexing is broken, use size - if hasattr(X, "reshape") and hasattr(X, "ndim") and X.ndim == 1: - return X.reshape((X.size, 1)) - if np.isscalar(X): - return np.atleast_2d(X) - return X - - def _apply_and_pass(func, *args, **kwargs): if len(args) == 1: return func(args[0], **kwargs) @@ -39,7 +29,7 @@ def _apply_and_pass(func, *args, **kwargs): def convert_one_to_table(arg): - return _backend.to_table(arg if sp.issparse(arg) else make2d(arg)) + return _backend.to_table(np.atleast_2d(arg) if np.isscalar(arg) else arg) def to_table(*args): From 175cd7899f2a3851c60cd1964c7f7fe1f48712f3 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Sun, 3 Nov 2024 13:33:34 +0100 Subject: [PATCH 052/135] Update test_validation.py --- onedal/utils/tests/test_validation.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/onedal/utils/tests/test_validation.py b/onedal/utils/tests/test_validation.py index 5788a9ccc3..6f9f1c383f 100644 --- a/onedal/utils/tests/test_validation.py +++ b/onedal/utils/tests/test_validation.py @@ -78,12 +78,13 @@ def test_assert_finite_random_location( ): rand.seed(seed) X = rand.uniform(high=np.finfo(dtype).max, size=shape).astype(dtype) - X = _convert_to_dataframe(X, sycl_queue=queue, 
target_df=dataframe) if check: loc = rand.randint(0, X.size - 1) X.reshape((-1,))[loc] = float(check) + X = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe) + if check is None or (allow_nan and check == "NaN"): _assert_all_finite(X, allow_nan=allow_nan) else: @@ -103,12 +104,13 @@ def test_assert_finite_random_shape_and_location( lb, ub = 2, 1048576 # lb is a patching condition, ub 2^20 rand.seed(seed) X = rand.uniform(high=np.finfo(dtype).max, size=rand.randint(lb, ub)).astype(dtype) - X = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe) if check: loc = rand.randint(0, X.size - 1) X[loc] = float(check) + X = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe) + if check is None or (allow_nan and check == "NaN"): _assert_all_finite(X, allow_nan=allow_nan) else: From 7016ad0871a5f4c5f1d0c53bad5709752a88361c Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Sun, 3 Nov 2024 14:33:38 +0100 Subject: [PATCH 053/135] remove unnecessary code --- onedal/datatypes/_data_conversion.py | 1 - sklearnex/tests/test_memory_usage.py | 12 ------------ 2 files changed, 13 deletions(-) diff --git a/onedal/datatypes/_data_conversion.py b/onedal/datatypes/_data_conversion.py index 353fef7e9c..018b79524e 100644 --- a/onedal/datatypes/_data_conversion.py +++ b/onedal/datatypes/_data_conversion.py @@ -17,7 +17,6 @@ import warnings import numpy as np -import scipy.sparse as sp from onedal import _backend, _is_dpc_backend diff --git a/sklearnex/tests/test_memory_usage.py b/sklearnex/tests/test_memory_usage.py index 214c03a6ba..6e7fdb72b5 100644 --- a/sklearnex/tests/test_memory_usage.py +++ b/sklearnex/tests/test_memory_usage.py @@ -142,14 +142,6 @@ class DummyEstimatorWithTableConversions(BaseEstimator): def fit(self, X, y=None): sua_iface, xp, _ = _get_sycl_namespace(X) - assert X.flags["C_CONTIGUOUS"] or X.flags["F_CONTIGUOUS"] - assert y.flags["C_CONTIGUOUS"] or y.flags["F_CONTIGUOUS"] - if not (X.flags["C_CONTIGUOUS"] or 
X.flags["F_CONTIGUOUS"]): - X = xp.copy(X) - if not (y.flags["C_CONTIGUOUS"] or y.flags["F_CONTIGUOUS"]): - y = xp.copy(y) - assert X.flags["C_CONTIGUOUS"] or X.flags["F_CONTIGUOUS"] - assert y.flags["C_CONTIGUOUS"] or y.flags["F_CONTIGUOUS"] X_table = to_table(X) y_table = to_table(y) # The presence of the fitted attributes (ending with a trailing @@ -168,10 +160,6 @@ def predict(self, X): # fitted attributes (ending with a trailing underscore). check_is_fitted(self) sua_iface, xp, _ = _get_sycl_namespace(X) - assert X.flags["C_CONTIGUOUS"] or X.flags["F_CONTIGUOUS"] - if not (X.flags["C_CONTIGUOUS"] or X.flags["F_CONTIGUOUS"]): - X = xp.copy(X) - assert X.flags["C_CONTIGUOUS"] or X.flags["F_CONTIGUOUS"] X_table = to_table(X) returned_X = from_table( X_table, sua_iface=sua_iface, sycl_queue=X.sycl_queue, xp=xp From fb7375f796834d6dd6a2ed490bdcc38a018f80e3 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Tue, 19 Nov 2024 06:57:01 +0100 Subject: [PATCH 054/135] make reviewer changes --- onedal/utils/finiteness_checker.cpp | 2 +- onedal/utils/tests/test_validation.py | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/onedal/utils/finiteness_checker.cpp b/onedal/utils/finiteness_checker.cpp index 6bc6a2e66b..2b8d84bd6f 100644 --- a/onedal/utils/finiteness_checker.cpp +++ b/onedal/utils/finiteness_checker.cpp @@ -66,7 +66,7 @@ void init_compute_ops(py::module_& m) { using namespace finiteness_checker; using input_t = compute_input; - compute_ops ops(policy, input_t{ data}, params2desc{}); + compute_ops ops(policy, input_t{ data }, params2desc{}); return fptype2t{ method2t{ Task{}, ops } }(params); }); } diff --git a/onedal/utils/tests/test_validation.py b/onedal/utils/tests/test_validation.py index 6f9f1c383f..5f92a64bf7 100644 --- a/onedal/utils/tests/test_validation.py +++ b/onedal/utils/tests/test_validation.py @@ -19,7 +19,6 @@ import numpy as np import numpy.random as rand import pytest -from numpy.testing import assert_raises from 
onedal.tests.utils._dataframes_support import ( _convert_to_dataframe, @@ -88,7 +87,9 @@ def test_assert_finite_random_location( if check is None or (allow_nan and check == "NaN"): _assert_all_finite(X, allow_nan=allow_nan) else: - assert_raises(ValueError, _assert_all_finite, X, allow_nan=allow_nan) + msg_err = "Input contains " + ("infinity" if allow_nan else "NaN, infinity") + "." + with pytest.raises(ValueError, match=msg_err): + _assert_all_finite(X, allow_nan=allow_nan) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @@ -114,4 +115,6 @@ def test_assert_finite_random_shape_and_location( if check is None or (allow_nan and check == "NaN"): _assert_all_finite(X, allow_nan=allow_nan) else: - assert_raises(ValueError, _assert_all_finite, X, allow_nan=allow_nan) + msg_err = "Input contains " + ("infinity" if allow_nan else "NaN, infinity") + "." + with pytest.raises(ValueError, match=msg_err): + _assert_all_finite(X, allow_nan=allow_nan) From 30816bf546a8b5aa5470a34ec0b4e6c82577a3c9 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Tue, 19 Nov 2024 15:43:29 +0100 Subject: [PATCH 055/135] make dtype check change --- onedal/datatypes/table.cpp | 4 ++++ onedal/utils/validation.py | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/onedal/datatypes/table.cpp b/onedal/datatypes/table.cpp index 113d881228..634cc99a1d 100644 --- a/onedal/datatypes/table.cpp +++ b/onedal/datatypes/table.cpp @@ -72,6 +72,10 @@ ONEDAL_PY_INIT_MODULE(table) { const auto column_count = t.get_column_count(); return py::make_tuple(row_count, column_count); }); + table_obj.def_property_readonly("dtype", [](const table& t){ + // returns a numpy dtype, even if source was not from numpy + return convert_dal_to_npy_type(t.get_metadata().get_data_type(0)); + }); #ifdef ONEDAL_DATA_PARALLEL define_sycl_usm_array_property(table_obj); diff --git a/onedal/utils/validation.py b/onedal/utils/validation.py index 5294483ac2..836dd84a75 100644 --- a/onedal/utils/validation.py +++ 
b/onedal/utils/validation.py @@ -447,12 +447,12 @@ def _is_csr(x): def _assert_all_finite(X, allow_nan=False, input_name=""): policy = _get_policy(None, X) + X_t = to_table(_convert_to_supported(policy, X)) params = { - "fptype": "float" if X.dtype.name == "float32" else "double", + "fptype": "float" if X_t.dtype == np.float32 else "double", "method": "dense", "allow_nan": allow_nan, } - X_t = to_table(_convert_to_supported(policy, X)) if not _backend.finiteness_checker.compute.compute(policy, params, X_t).finite: type_err = "infinity" if allow_nan else "NaN, infinity" padded_input_name = input_name + " " if input_name else "" From abb3b1683f71fe758beec194795ab6a8b24545f3 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Tue, 19 Nov 2024 16:06:59 +0100 Subject: [PATCH 056/135] add sparse testing --- onedal/utils/tests/test_validation.py | 29 +++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/onedal/utils/tests/test_validation.py b/onedal/utils/tests/test_validation.py index 5f92a64bf7..aefa1dbb36 100644 --- a/onedal/utils/tests/test_validation.py +++ b/onedal/utils/tests/test_validation.py @@ -19,6 +19,7 @@ import numpy as np import numpy.random as rand import pytest +import scipy.sparse as sp from onedal.tests.utils._dataframes_support import ( _convert_to_dataframe, @@ -118,3 +119,31 @@ def test_assert_finite_random_shape_and_location( msg_err = "Input contains " + ("infinity" if allow_nan else "NaN, infinity") + "." 
with pytest.raises(ValueError, match=msg_err): _assert_all_finite(X, allow_nan=allow_nan) + + +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("allow_nan", [False, True]) +@pytest.mark.parametrize("check", ["inf", "NaN", None]) +@pytest.mark.parametrize("seed", [0, int(time.time())]) +def test_assert_finite_sparse(dtype, allow_nan, check, seed): + lb, ub = 2, 1048576 # lb is a patching condition, ub 2^20 + rand.seed(seed) + X = sp.random( + rand.randint(lb, ub), + rand.randint(lb, ub), + format="csr", + dtype=dtype, + random_state=rand.default_rng(seed), + ) + + if check: + locx = rand.randint(0, X.shape[0] - 1) + locy = rand.randint(0, X.shape[1] - 1) + X[locx, locy] = float(check) + + if check is None or (allow_nan and check == "NaN"): + assert_all_finite(X, allow_nan=allow_nan) + else: + msg_err = "Input contains " + ("infinity" if allow_nan else "NaN, infinity") + "." + with pytest.raises(ValueError, match=msg_err): + assert_all_finite(X, allow_nan=allow_nan) From 97aef73e5866db07206fdf47571f9fb94f93185c Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Tue, 19 Nov 2024 17:06:17 +0100 Subject: [PATCH 057/135] try again --- onedal/datatypes/table.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onedal/datatypes/table.cpp b/onedal/datatypes/table.cpp index 634cc99a1d..a06a08710d 100644 --- a/onedal/datatypes/table.cpp +++ b/onedal/datatypes/table.cpp @@ -74,7 +74,7 @@ ONEDAL_PY_INIT_MODULE(table) { }); table_obj.def_property_readonly("dtype", [](const table& t){ // returns a numpy dtype, even if source was not from numpy - return convert_dal_to_npy_type(t.get_metadata().get_data_type(0)); + return py::dtype(convert_dal_to_npy_type(t.get_metadata().get_data_type(0))); }); #ifdef ONEDAL_DATA_PARALLEL From 6e29651587f42226b06c2d733d386a0bc19e0168 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Tue, 19 Nov 2024 17:29:19 +0100 Subject: [PATCH 058/135] try again --- onedal/utils/tests/test_validation.py | 2 
+- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onedal/utils/tests/test_validation.py b/onedal/utils/tests/test_validation.py index aefa1dbb36..d953038f33 100644 --- a/onedal/utils/tests/test_validation.py +++ b/onedal/utils/tests/test_validation.py @@ -126,7 +126,7 @@ def test_assert_finite_random_shape_and_location( @pytest.mark.parametrize("check", ["inf", "NaN", None]) @pytest.mark.parametrize("seed", [0, int(time.time())]) def test_assert_finite_sparse(dtype, allow_nan, check, seed): - lb, ub = 2, 1048576 # lb is a patching condition, ub 2^20 + lb, ub = 2, 256 rand.seed(seed) X = sp.random( rand.randint(lb, ub), From 59363a8126643a1eb5aff981d1d7ce09cdbf711b Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Tue, 19 Nov 2024 17:30:46 +0100 Subject: [PATCH 059/135] try again --- onedal/utils/tests/test_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onedal/utils/tests/test_validation.py b/onedal/utils/tests/test_validation.py index d953038f33..7662f486f3 100644 --- a/onedal/utils/tests/test_validation.py +++ b/onedal/utils/tests/test_validation.py @@ -126,7 +126,7 @@ def test_assert_finite_random_shape_and_location( @pytest.mark.parametrize("check", ["inf", "NaN", None]) @pytest.mark.parametrize("seed", [0, int(time.time())]) def test_assert_finite_sparse(dtype, allow_nan, check, seed): - lb, ub = 2, 256 + lb, ub = 2, 2056 rand.seed(seed) X = sp.random( rand.randint(lb, ub), From 12de7038d719510df8043ae3dbce216afb39c6b2 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 20 Nov 2024 07:18:21 +0100 Subject: [PATCH 060/135] temporary commit --- sklearnex/utils/validation.py | 40 ++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py index b2d1898643..e41dec4a18 100755 --- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -14,4 +14,42 @@ # limitations under the License. 
# =============================================================================== -from daal4py.sklearn.utils.validation import _assert_all_finite +import scipy.sparse as sp +from sklearn.utils.validation import _assert_all_finite as _sklearn_assert_all_finite +from onedal.utils.validation import _assert_all_finite as _onedal_assert_all_finite +from daal4py.sklearn._utils import sklearn_check_version + +if sklearn_check_version("1.6"): + from sklearn.utils.validation import validate_data as _sklearn_validate_data + _finite_keyword = "ensure_all_finite" + +else: + from sklearn.base import BaseEstimator + _sklearn_validate_data = BaseEstimator._validate_data + _finite_keyword = "force_all_finite" + + + +def validate_data(*args, **kwargs): + # force finite check to not occur in sklearn, default is True + force_all_finite = _finite_keyword not in kwargs or kwargs[_finite_keyword] + kwargs[_finite_keyword] = False + out = _sklearn_validate_data(*args, **kwargs) + if force_all_finite: + # run local finite check + for arg in out: + assert_all_finite(arg) + return out + + +def assert_all_finite( + X, + *, + allow_nan=False, + input_name="", +): + _assert_all_finite( + X.data if sp.issparse(X) else X, + allow_nan=allow_nan, + input_name=input_name, + ) \ No newline at end of file From 07ec3d88ca0a5754edcf42a060ce03f1ab438dd7 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 20 Nov 2024 10:58:56 +0100 Subject: [PATCH 061/135] first attempt --- sklearnex/utils/validation.py | 137 +++++++++++++++++++++++++++++++--- 1 file changed, 125 insertions(+), 12 deletions(-) diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py index e41dec4a18..16b398380e 100755 --- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -16,30 +16,107 @@ import scipy.sparse as sp from sklearn.utils.validation import _assert_all_finite as _sklearn_assert_all_finite -from onedal.utils.validation import _assert_all_finite as _onedal_assert_all_finite + from 
daal4py.sklearn._utils import sklearn_check_version +from onedal.utils._array_api import _is_numpy_namespace +from onedal.utils.validation import _assert_all_finite as _onedal_assert_all_finite + +from ._array_api import get_namespace if sklearn_check_version("1.6"): from sklearn.utils.validation import validate_data as _sklearn_validate_data + _finite_keyword = "ensure_all_finite" else: from sklearn.base import BaseEstimator + _sklearn_validate_data = BaseEstimator._validate_data _finite_keyword = "force_all_finite" +def _is_contiguous(X): + # array_api does not have a `strides` or `flags` attribute for testing memory + # order. When dlpack support is brought in for oneDAL, the dlpack object can + # then be inspected and this must be updated. _is_contiguous is therefore + # conservative in verifying attributes and does not support array_api. This + # will block onedal_assert_all_finite from being used for array api inputs. + if hasattr(X, "flags") and X.flags["C_CONTIGUOUS"] or X.flags["F_CONTIGUOUS"]: + return True + return False -def validate_data(*args, **kwargs): - # force finite check to not occur in sklearn, default is True - force_all_finite = _finite_keyword not in kwargs or kwargs[_finite_keyword] - kwargs[_finite_keyword] = False - out = _sklearn_validate_data(*args, **kwargs) - if force_all_finite: - # run local finite check - for arg in out: - assert_all_finite(arg) - return out + +def _assert_all_finite_core(X, *, xp, allow_nan, input_name=""): + # This is a reproduction of code from sklearn.utils.validation + # necessary for older sklearn versions (<1.2) and for dpnp inputs + # which do not conform to the array_api standard, and cannot be + # checked in sklearn. 
+ first_pass_isfinite = xp.isfinite(xp.sum(X)) + if first_pass_isfinite: + return + + has_inf = xp.any(xp.isinf(X)) + has_nan_error = False if allow_nan else xp.any(xp.isnan(X)) + if has_inf or has_nan_error: + type_err = "infinity" if allow_nan else "NaN, infinity" + padded_input_name = input_name + " " if input_name else "" + msg_err = f"Input {padded_input_name}contains {type_err}." + raise ValueError(msg_err) + + +if sklearn_check_version("1.2"): + + def _array_api_assert_all_finite( + X, *, xp, is_array_api_compliant, allow_nan=False, input_name="" + ): + if _is_numpy_namespace(xp) or is_array_api_compliant: + _sklearn_assert_all_finite(X, allow_nan=allow_nan, input_name=input_name) + elif "float" not in xp.dtype.name or "complex" not in xp.dtype.name: + return + # handle dpnp inputs + _assert_all_finite_core(X, xp, allow_nan, input_name=input_name) + +else: + + def _array_api_assert_all_finite( + X, xp, is_array_api_compliant, *, allow_nan=False, input_name="" + ): + + if _is_numpy_namespace(xp): + _sklearn_assert_all_finite(X, allow_nan, input_name=input_name) + elif is_array_api_compliant and not xp.isdtype( + X, ("real floating", "complex floating") + ): + return + elif "float" not in xp.dtype.name or "complex" not in xp.dtype.name: + return + + # handle array_api and dpnp inputs + _assert_all_finite_core(X, xp, allow_nan, input_name=input_name) + + +def _assert_all_finite( + X, + *, + allow_nan=False, + input_name="", +): + # array_api compliance in sklearn varies betweeen the support sklearn versions + # therefore a separate check matching sklearn's assert_all_finite is necessary + # when the data is not float32 or float64 but of a float type. The onedal + # assert_all_finite is only for float32 and float64 contiguous arrays. 
+ + # initial match to daal4py, can be optimized later + xp, is_array_api_compliant = get_namespace(X) + if X.size < 32768 or X.dtype not in [xp.float32, xp.float64] or not _is_contiguous(X): + + # all non-numpy arrays for sklearn 1.0 and dpnp for sklearn are not handeled properly + # separate function for import-time sklearn version check + _array_api_assert_all_finite( + X, xp, is_array_api_compliant, allow_nan=allow_nan, input_name=input_name + ) + else: + _onedal_assert_all_finite(X, allow_nan=allow_nan, input_name=input_name) def assert_all_finite( @@ -52,4 +129,40 @@ def assert_all_finite( X.data if sp.issparse(X) else X, allow_nan=allow_nan, input_name=input_name, - ) \ No newline at end of file + ) + + +def validate_data( + _estimator, + /, + X="no_validation", + y="no_validation", + reset=True, + validate_separately=False, + skip_check_array=False, + **check_params, +): + # force finite check to not occur in sklearn, default is True + # `ensure_all_finite` is the most up-to-date keyword name in sklearn + # _finite_keyword provides backward compatability for `force_all_finite` + force_all_finite = ( + "ensure_all_finite" not in check_params or check_params["ensure_all_finite"] + ) + check_params[_finite_keyword] = False + out = _sklearn_validate_data( + _estimator, + X=X, + y=y, + reset=reset, + validate_separate=validate_separately, + skip_check_array=skip_check_array, + **check_params, + ) + if force_all_finite: + # run local finite check + arg = iter(out) + if not isinstance(X, str) or X != "no_validation": + assert_all_finite(next(arg), input_name="X") + if y is not None or not isinstance(y, str) or y != "no_validation": + assert_all_finite(next(arg), input_name="y") + return out From 32c565d42ad0d07ed37d5a2ea264c32b25510676 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 20 Nov 2024 11:18:13 +0100 Subject: [PATCH 062/135] missing change? 
--- sklearnex/utils/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearnex/utils/__init__.py b/sklearnex/utils/__init__.py index 4c3fe21154..686e089adf 100755 --- a/sklearnex/utils/__init__.py +++ b/sklearnex/utils/__init__.py @@ -14,6 +14,6 @@ # limitations under the License. # =============================================================================== -from .validation import _assert_all_finite +from .validation import assert_all_finite -__all__ = ["_assert_all_finite"] +__all__ = ["assert_all_finite"] From 5093ed7d8e35559c7966d3e4fd573cd2a6f19b80 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 20 Nov 2024 12:15:56 +0100 Subject: [PATCH 063/135] modify DummyEstimator for testing --- sklearnex/tests/test_memory_usage.py | 44 +++++----------------------- sklearnex/tests/utils/__init__.py | 2 ++ sklearnex/tests/utils/base.py | 35 ++++++++++++++++++++++ 3 files changed, 44 insertions(+), 37 deletions(-) diff --git a/sklearnex/tests/test_memory_usage.py b/sklearnex/tests/test_memory_usage.py index 6e7fdb72b5..570e061040 100644 --- a/sklearnex/tests/test_memory_usage.py +++ b/sklearnex/tests/test_memory_usage.py @@ -38,7 +38,12 @@ from onedal.utils._array_api import _get_sycl_namespace from onedal.utils._dpep_helpers import dpctl_available, dpnp_available from sklearnex import config_context -from sklearnex.tests.utils import PATCHED_FUNCTIONS, PATCHED_MODELS, SPECIAL_INSTANCES +from sklearnex.tests.utils import ( + PATCHED_FUNCTIONS, + PATCHED_MODELS, + SPECIAL_INSTANCES, + DummyEstimator, +) from sklearnex.utils._array_api import get_namespace if dpctl_available: @@ -132,41 +137,6 @@ def gen_functions(functions): ORDER_DICT = {"F": np.asfortranarray, "C": np.ascontiguousarray} -if _is_dpc_backend: - - from sklearn.utils.validation import check_is_fitted - - from onedal.datatypes import from_table, to_table - - class DummyEstimatorWithTableConversions(BaseEstimator): - - def fit(self, X, y=None): - sua_iface, xp, _ = 
_get_sycl_namespace(X) - X_table = to_table(X) - y_table = to_table(y) - # The presence of the fitted attributes (ending with a trailing - # underscore) is required for the correct check. The cleanup of - # the memory will occur at the estimator instance deletion. - self.x_attr_ = from_table( - X_table, sua_iface=sua_iface, sycl_queue=X.sycl_queue, xp=xp - ) - self.y_attr_ = from_table( - y_table, sua_iface=sua_iface, sycl_queue=X.sycl_queue, xp=xp - ) - return self - - def predict(self, X): - # Checks if the estimator is fitted by verifying the presence of - # fitted attributes (ending with a trailing underscore). - check_is_fitted(self) - sua_iface, xp, _ = _get_sycl_namespace(X) - X_table = to_table(X) - returned_X = from_table( - X_table, sua_iface=sua_iface, sycl_queue=X.sycl_queue, xp=xp - ) - return returned_X - - def gen_clsf_data(n_samples, n_features, dtype=None): data, label = make_classification( n_classes=2, n_samples=n_samples, n_features=n_features, random_state=777 @@ -370,7 +340,7 @@ def test_table_conversions_memory_leaks(dataframe, queue, order, data_shape, dty pytest.skip("SYCL device memory leak check requires the level zero sysman") _kfold_function_template( - DummyEstimatorWithTableConversions, + DummyEstimator, dataframe, data_shape, queue, diff --git a/sklearnex/tests/utils/__init__.py b/sklearnex/tests/utils/__init__.py index 60ca67fa37..db728fe913 100644 --- a/sklearnex/tests/utils/__init__.py +++ b/sklearnex/tests/utils/__init__.py @@ -21,6 +21,7 @@ SPECIAL_INSTANCES, UNPATCHED_FUNCTIONS, UNPATCHED_MODELS, + DummyEstimator, _get_processor_info, call_method, gen_dataset, @@ -39,6 +40,7 @@ "gen_models_info", "gen_dataset", "sklearn_clone_dict", + "DummyEstimator", ] _IS_INTEL = "GenuineIntel" in _get_processor_info() diff --git a/sklearnex/tests/utils/base.py b/sklearnex/tests/utils/base.py index 1949519585..248eb85a59 100755 --- a/sklearnex/tests/utils/base.py +++ b/sklearnex/tests/utils/base.py @@ -32,7 +32,9 @@ ) from sklearn.datasets 
import load_diabetes, load_iris from sklearn.neighbors._base import KNeighborsMixin +from sklearn.utils.validation import check_is_fitted +from onedal.datatypes import from_table, to_table from onedal.tests.utils._dataframes_support import _convert_to_dataframe from sklearnex import get_patch_map, patch_sklearn, sklearn_is_patched, unpatch_sklearn from sklearnex.basic_statistics import BasicStatistics, IncrementalBasicStatistics @@ -44,6 +46,7 @@ NearestNeighbors, ) from sklearnex.svm import SVC, NuSVC +from sklearnex.utils.validation import validate_data def _load_all_models(with_sklearnex=True, estimator=True): @@ -369,3 +372,35 @@ def _get_processor_info(): ) return proc + + +class DummyEstimator(BaseEstimator): + + def fit(self, X, y=None): + X_array, y_array = validate_data(self, X, y) + + sua_iface, xp, _ = _get_sycl_namespace(X_array) + X_table = to_table(X_array) + y_table = to_table(y_array) + # The presence of the fitted attributes (ending with a trailing + # underscore) is required for the correct check. The cleanup of + # the memory will occur at the estimator instance deletion. + self.x_attr_ = from_table( + X_table, sua_iface=sua_iface, sycl_queue=X_array.sycl_queue, xp=xp + ) + self.y_attr_ = from_table( + y_table, sua_iface=sua_iface, sycl_queue=X_array.sycl_queue, xp=xp + ) + return self + + def predict(self, X): + # Checks if the estimator is fitted by verifying the presence of + # fitted attributes (ending with a trailing underscore). 
+ check_is_fitted(self) + X_array = validate_data(self, X, reset=False) + sua_iface, xp, _ = _get_sycl_namespace(X_array) + X_table = to_table(X_array) + returned_X = from_table( + X_table, sua_iface=sua_iface, sycl_queue=X_array.sycl_queue, xp=xp + ) + return returned_X From f04deba338611c4367d3c7ca91f9fcfaf3e1c432 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 20 Nov 2024 12:21:32 +0100 Subject: [PATCH 064/135] generalize DummyEstimator --- sklearnex/tests/utils/base.py | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/sklearnex/tests/utils/base.py b/sklearnex/tests/utils/base.py index 248eb85a59..1d4eb3d0cf 100755 --- a/sklearnex/tests/utils/base.py +++ b/sklearnex/tests/utils/base.py @@ -385,12 +385,19 @@ def fit(self, X, y=None): # The presence of the fitted attributes (ending with a trailing # underscore) is required for the correct check. The cleanup of # the memory will occur at the estimator instance deletion. - self.x_attr_ = from_table( - X_table, sua_iface=sua_iface, sycl_queue=X_array.sycl_queue, xp=xp - ) - self.y_attr_ = from_table( - y_table, sua_iface=sua_iface, sycl_queue=X_array.sycl_queue, xp=xp - ) + if sua_iface: + self.x_attr_ = from_table( + X_table, sua_iface=sua_iface, sycl_queue=X_array.sycl_queue, xp=xp + ) + self.y_attr_ = from_table( + y_table, sua_iface=sua_iface, sycl_queue=X_array.sycl_queue, xp=xp + ) + else: + self.x_attr = from_table(X_table) + self.y_attr = from_table(y_table) + + assert type(self.x_attr) == type(X) + return self def predict(self, X): @@ -400,7 +407,13 @@ def predict(self, X): X_array = validate_data(self, X, reset=False) sua_iface, xp, _ = _get_sycl_namespace(X_array) X_table = to_table(X_array) - returned_X = from_table( - X_table, sua_iface=sua_iface, sycl_queue=X_array.sycl_queue, xp=xp - ) + if sua_iface: + returned_X = from_table( + X_table, sua_iface=sua_iface, sycl_queue=X_array.sycl_queue, xp=xp + ) + else: + returned_X = from_table(X_table) + + 
assert type(returned_X) == type(X) + return returned_X From 740a5e762788d989186222b79c9f467d4c0973c4 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 20 Nov 2024 12:42:42 +0100 Subject: [PATCH 065/135] switch test --- sklearnex/utils/tests/test_finite.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_finite.py index 2874ec3400..eaa39fe2c0 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_finite.py @@ -21,7 +21,7 @@ import pytest from numpy.testing import assert_raises -from sklearnex.utils import _assert_all_finite +from sklearnex.utils import assert_all_finite @pytest.mark.parametrize("dtype", [np.float32, np.float64]) From 27050bd5a4329dcc30d8f9ec39efce6212cd8694 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 20 Nov 2024 12:43:23 +0100 Subject: [PATCH 066/135] further testing changes --- sklearnex/utils/tests/test_finite.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_finite.py index eaa39fe2c0..487bb39369 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_finite.py @@ -65,7 +65,7 @@ def test_assert_finite_random_location(dtype, shape, allow_nan, check, seed): X.reshape((-1,))[loc] = float(check) if check is None or (allow_nan and check == "NaN"): - _assert_all_finite(X, allow_nan=allow_nan) + assert_all_finite(X, allow_nan=allow_nan) else: assert_raises(ValueError, _assert_all_finite, X, allow_nan=allow_nan) @@ -84,6 +84,6 @@ def test_assert_finite_random_shape_and_location(dtype, allow_nan, check, seed): X[loc] = float(check) if check is None or (allow_nan and check == "NaN"): - _assert_all_finite(X, allow_nan=allow_nan) + assert_all_finite(X, allow_nan=allow_nan) else: - assert_raises(ValueError, _assert_all_finite, X, allow_nan=allow_nan) + assert_raises(ValueError, assert_all_finite, X, 
allow_nan=allow_nan) From 53c8f7b7152d53019819fe7cbb30b382cf7b4e66 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 20 Nov 2024 13:34:29 +0100 Subject: [PATCH 067/135] add initial validate_data test, will be refactored --- sklearnex/utils/tests/test_finite.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_finite.py index 487bb39369..6468fde2cc 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_finite.py @@ -21,6 +21,8 @@ import pytest from numpy.testing import assert_raises +from onedal.tests.utils._dataframes_support import get_dataframes_and_queues +from sklearnex.tests.utils import DummyEstimator, gen_dataset from sklearnex.utils import assert_all_finite @@ -39,7 +41,7 @@ def test_sum_infinite_actually_finite(dtype, shape, allow_nan): X = np.array(shape, dtype=dtype) X.fill(np.finfo(dtype).max) - _assert_all_finite(X, allow_nan=allow_nan) + assert_all_finite(X, allow_nan=allow_nan) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @@ -67,7 +69,7 @@ def test_assert_finite_random_location(dtype, shape, allow_nan, check, seed): if check is None or (allow_nan and check == "NaN"): assert_all_finite(X, allow_nan=allow_nan) else: - assert_raises(ValueError, _assert_all_finite, X, allow_nan=allow_nan) + assert_raises(ValueError, assert_all_finite, X, allow_nan=allow_nan) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @@ -87,3 +89,13 @@ def test_assert_finite_random_shape_and_location(dtype, allow_nan, check, seed): assert_all_finite(X, allow_nan=allow_nan) else: assert_raises(ValueError, assert_all_finite, X, allow_nan=allow_nan) + + +@pytest.mark.parametrize("dataframe, queue", get_dataframes_and_queues()) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_validate_data(dtype, dataframe, queue): + est = DummyEstimator() + X, y = gen_dataset(est, queue=queue, target_df=dataframe, 
dtype=dtype)[0] + est.fit(X, y) + output = est.predict(X) + assert type(X) == type(output) From 90f59c442021b4c529e64ef9f4844296f412c014 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 20 Nov 2024 15:10:04 +0100 Subject: [PATCH 068/135] fixes for CI --- sklearnex/utils/validation.py | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py index 16b398380e..7bcfc3fdf6 100755 --- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -67,7 +67,7 @@ def _assert_all_finite_core(X, *, xp, allow_nan, input_name=""): if sklearn_check_version("1.2"): def _array_api_assert_all_finite( - X, *, xp, is_array_api_compliant, allow_nan=False, input_name="" + X, xp, is_array_api_compliant, *, allow_nan=False, input_name="" ): if _is_numpy_namespace(xp) or is_array_api_compliant: _sklearn_assert_all_finite(X, allow_nan=allow_nan, input_name=input_name) @@ -137,26 +137,18 @@ def validate_data( /, X="no_validation", y="no_validation", - reset=True, - validate_separately=False, - skip_check_array=False, - **check_params, + **kwargs, ): # force finite check to not occur in sklearn, default is True # `ensure_all_finite` is the most up-to-date keyword name in sklearn # _finite_keyword provides backward compatability for `force_all_finite` - force_all_finite = ( - "ensure_all_finite" not in check_params or check_params["ensure_all_finite"] - ) - check_params[_finite_keyword] = False + force_all_finite = "ensure_all_finite" not in kwargs or kwargs["ensure_all_finite"] + kwargs[_finite_keyword] = False out = _sklearn_validate_data( _estimator, X=X, y=y, - reset=reset, - validate_separate=validate_separately, - skip_check_array=skip_check_array, - **check_params, + **kwargs, ) if force_all_finite: # run local finite check From 7f170e2efc494d66b1a7b9b1f29c87eb1c3f9edf Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Wed, 20 Nov 2024 16:36:38 +0100 Subject: [PATCH 069/135] Update 
validation.py --- sklearnex/utils/validation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py index 7bcfc3fdf6..0fc31d53c0 100755 --- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -1,4 +1,4 @@ -# =============================================================================== +the# =============================================================================== # Copyright 2022 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -46,7 +46,7 @@ def _is_contiguous(X): return False -def _assert_all_finite_core(X, *, xp, allow_nan, input_name=""): +def _assert_all_finite_core(X, xp, allow_nan, *, input_name=""): # This is a reproduction of code from sklearn.utils.validation # necessary for older sklearn versions (<1.2) and for dpnp inputs # which do not conform to the array_api standard, and cannot be From 81e2bbc763b21bdd29b40e1a72c1ac41355de569 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Wed, 20 Nov 2024 16:54:08 +0100 Subject: [PATCH 070/135] Update validation.py --- sklearnex/utils/validation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py index 0fc31d53c0..3e65223331 100755 --- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -1,4 +1,4 @@ -the# =============================================================================== +# =============================================================================== # Copyright 2022 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -46,7 +46,7 @@ def _is_contiguous(X): return False -def _assert_all_finite_core(X, xp, allow_nan, *, input_name=""): +def _assert_all_finite_core(X, xp, *, allow_nan=False, input_name=""): # This is a reproduction of code from sklearn.utils.validation # necessary for older sklearn versions (<1.2) and for 
dpnp inputs # which do not conform to the array_api standard, and cannot be @@ -74,7 +74,7 @@ def _array_api_assert_all_finite( elif "float" not in xp.dtype.name or "complex" not in xp.dtype.name: return # handle dpnp inputs - _assert_all_finite_core(X, xp, allow_nan, input_name=input_name) + _assert_all_finite_core(X, xp, allow_nan=allow_nan, input_name=input_name) else: From 116bdba61f83fda8d66566cfd6bbeb999ca532df Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Wed, 20 Nov 2024 17:18:33 +0100 Subject: [PATCH 071/135] Update test_memory_usage.py --- sklearnex/tests/test_memory_usage.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearnex/tests/test_memory_usage.py b/sklearnex/tests/test_memory_usage.py index 570e061040..be501be218 100644 --- a/sklearnex/tests/test_memory_usage.py +++ b/sklearnex/tests/test_memory_usage.py @@ -35,7 +35,6 @@ get_dataframes_and_queues, ) from onedal.tests.utils._device_selection import get_queues, is_dpctl_device_available -from onedal.utils._array_api import _get_sycl_namespace from onedal.utils._dpep_helpers import dpctl_available, dpnp_available from sklearnex import config_context from sklearnex.tests.utils import ( From 076ebc401b4e9fbd872f9f1bc971bad1eb095f32 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Wed, 20 Nov 2024 17:19:15 +0100 Subject: [PATCH 072/135] Update base.py --- sklearnex/tests/utils/base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearnex/tests/utils/base.py b/sklearnex/tests/utils/base.py index 1d4eb3d0cf..35ba2811e2 100755 --- a/sklearnex/tests/utils/base.py +++ b/sklearnex/tests/utils/base.py @@ -36,6 +36,7 @@ from onedal.datatypes import from_table, to_table from onedal.tests.utils._dataframes_support import _convert_to_dataframe +from onedal.utils._array_api import _get_sycl_namespace from sklearnex import get_patch_map, patch_sklearn, sklearn_is_patched, unpatch_sklearn from sklearnex.basic_statistics import BasicStatistics, IncrementalBasicStatistics from sklearnex.linear_model import 
LogisticRegression From e1d074365e51fa77fa75f6457c090346eb6d527a Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Wed, 20 Nov 2024 18:03:03 +0100 Subject: [PATCH 073/135] Update base.py --- sklearnex/tests/utils/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearnex/tests/utils/base.py b/sklearnex/tests/utils/base.py index 35ba2811e2..0d58b5189b 100755 --- a/sklearnex/tests/utils/base.py +++ b/sklearnex/tests/utils/base.py @@ -396,7 +396,7 @@ def fit(self, X, y=None): else: self.x_attr = from_table(X_table) self.y_attr = from_table(y_table) - + assert type(X_array) == type(X) assert type(self.x_attr) == type(X) return self @@ -414,7 +414,7 @@ def predict(self, X): ) else: returned_X = from_table(X_table) - + assert type(X_array) == type(X) assert type(returned_X) == type(X) return returned_X From f59cdd33d29321c3989d0b4415b99b5055408f23 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 20 Nov 2024 22:38:30 +0100 Subject: [PATCH 074/135] improve tests --- sklearnex/tests/utils/base.py | 23 +++----- sklearnex/utils/tests/test_finite.py | 83 +++++++++++++++++++++++----- 2 files changed, 77 insertions(+), 29 deletions(-) diff --git a/sklearnex/tests/utils/base.py b/sklearnex/tests/utils/base.py index 0d58b5189b..e484423cfc 100755 --- a/sklearnex/tests/utils/base.py +++ b/sklearnex/tests/utils/base.py @@ -378,26 +378,24 @@ def _get_processor_info(): class DummyEstimator(BaseEstimator): def fit(self, X, y=None): - X_array, y_array = validate_data(self, X, y) + X, y = validate_data(self, X, y) - sua_iface, xp, _ = _get_sycl_namespace(X_array) - X_table = to_table(X_array) - y_table = to_table(y_array) + sua_iface, xp, _ = _get_sycl_namespace(X) + X_table = to_table(X) + y_table = to_table(y) # The presence of the fitted attributes (ending with a trailing # underscore) is required for the correct check. The cleanup of # the memory will occur at the estimator instance deletion. 
if sua_iface: self.x_attr_ = from_table( - X_table, sua_iface=sua_iface, sycl_queue=X_array.sycl_queue, xp=xp + X_table, sua_iface=sua_iface, sycl_queue=X.sycl_queue, xp=xp ) self.y_attr_ = from_table( - y_table, sua_iface=sua_iface, sycl_queue=X_array.sycl_queue, xp=xp + y_table, sua_iface=sua_iface, sycl_queue=X.sycl_queue, xp=xp ) else: self.x_attr = from_table(X_table) self.y_attr = from_table(y_table) - assert type(X_array) == type(X) - assert type(self.x_attr) == type(X) return self @@ -405,16 +403,13 @@ def predict(self, X): # Checks if the estimator is fitted by verifying the presence of # fitted attributes (ending with a trailing underscore). check_is_fitted(self) - X_array = validate_data(self, X, reset=False) - sua_iface, xp, _ = _get_sycl_namespace(X_array) - X_table = to_table(X_array) + sua_iface, xp, _ = _get_sycl_namespace(X) + X_table = to_table(X) if sua_iface: returned_X = from_table( - X_table, sua_iface=sua_iface, sycl_queue=X_array.sycl_queue, xp=xp + X_table, sua_iface=sua_iface, sycl_queue=X.sycl_queue, xp=xp ) else: returned_X = from_table(X_table) - assert type(X_array) == type(X) - assert type(returned_X) == type(X) return returned_X diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_finite.py index 6468fde2cc..5c3ee2d50e 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_finite.py @@ -19,11 +19,15 @@ import numpy as np import numpy.random as rand import pytest -from numpy.testing import assert_raises -from onedal.tests.utils._dataframes_support import get_dataframes_and_queues +from daal4py.sklearn._utils import sklearn_check_version +from onedal.tests.utils._dataframes_support import ( + _convert_to_dataframe, + get_dataframes_and_queues, +) +from sklearnex import config_context from sklearnex.tests.utils import DummyEstimator, gen_dataset -from sklearnex.utils import assert_all_finite +from sklearnex.utils import validate_data @pytest.mark.parametrize("dtype", [np.float32, 
np.float64]) @@ -39,9 +43,11 @@ ) @pytest.mark.parametrize("allow_nan", [False, True]) def test_sum_infinite_actually_finite(dtype, shape, allow_nan): + est = DummyEstimator() X = np.array(shape, dtype=dtype) X.fill(np.finfo(dtype).max) - assert_all_finite(X, allow_nan=allow_nan) + X_array = validate_data(est, X, allow_nan=allow_nan) + assert type(X_array) == type(X) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @@ -58,7 +64,11 @@ def test_sum_infinite_actually_finite(dtype, shape, allow_nan): @pytest.mark.parametrize("allow_nan", [False, True]) @pytest.mark.parametrize("check", ["inf", "NaN", None]) @pytest.mark.parametrize("seed", [0, int(time.time())]) -def test_assert_finite_random_location(dtype, shape, allow_nan, check, seed): +@pytest.mark.parametrize("dataframe, queue", get_dataframes_and_queues()) +def test_validate_data_random_location( + dataframe, queue, dtype, shape, allow_nan, check, seed +): + est = DummyEstimator() rand.seed(seed) X = rand.uniform(high=np.finfo(dtype).max, size=shape).astype(dtype) @@ -66,17 +76,29 @@ def test_assert_finite_random_location(dtype, shape, allow_nan, check, seed): loc = rand.randint(0, X.size - 1) X.reshape((-1,))[loc] = float(check) + X = _convert_to_dataframe( + X, + target_df=dataframe, + sycl_queue=queue, + ) + if check is None or (allow_nan and check == "NaN"): - assert_all_finite(X, allow_nan=allow_nan) + validate_data(est, X, allow_nan=allow_nan) else: - assert_raises(ValueError, assert_all_finite, X, allow_nan=allow_nan) + msg_err = "Input contains " + ("infinity" if allow_nan else "NaN, infinity") + "." 
+ with pytest.raises(ValueError, match=msg_err): + validate_data(est, X, allow_nan=allow_nan) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @pytest.mark.parametrize("allow_nan", [False, True]) @pytest.mark.parametrize("check", ["inf", "NaN", None]) @pytest.mark.parametrize("seed", [0, int(time.time())]) -def test_assert_finite_random_shape_and_location(dtype, allow_nan, check, seed): +@pytest.mark.parametrize("dataframe, queue", get_dataframes_and_queues()) +def test_validate_data_random_shape_and_location( + dataframe, queue, dtype, allow_nan, check, seed +): + est = DummyEstimator() lb, ub = 32768, 1048576 # lb is a patching condition, ub 2^20 rand.seed(seed) X = rand.uniform(high=np.finfo(dtype).max, size=rand.randint(lb, ub)).astype(dtype) @@ -85,17 +107,48 @@ def test_assert_finite_random_shape_and_location(dtype, allow_nan, check, seed): loc = rand.randint(0, X.size - 1) X[loc] = float(check) + X = _convert_to_dataframe( + X, + target_df=dataframe, + sycl_queue=queue, + ) + if check is None or (allow_nan and check == "NaN"): - assert_all_finite(X, allow_nan=allow_nan) + validate_data(est, X) else: - assert_raises(ValueError, assert_all_finite, X, allow_nan=allow_nan) + msg_err = "Input contains " + ("infinity" if allow_nan else "NaN, infinity") + "." 
+ with pytest.raises(ValueError, match=msg_err): + validate_data(est, X, allow_nan=allow_nan) -@pytest.mark.parametrize("dataframe, queue", get_dataframes_and_queues()) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) -def test_validate_data(dtype, dataframe, queue): +@pytest.mark.parametrize("array_api_dispatch", [True, False]) +@pytest.mark.parametrize( + "dataframe, queue", get_dataframes_and_queues("numpy,dpctl,dpnp") +) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_validate_data_output(array_api_dispatch, dtype, dataframe, queue): est = DummyEstimator() X, y = gen_dataset(est, queue=queue, target_df=dataframe, dtype=dtype)[0] - est.fit(X, y) - output = est.predict(X) - assert type(X) == type(output) + + dispatch = {} + if sklearn_check_version("1.2"): + dispatch["array_api_dispatch"] = array_api_dispatch + + with config_context(**dispatch): + validate_data(est, X, y) + est.fit(X, y) + X_array = validate_data(est, X, reset=False) + X_out = est.predict(X) + + if ( + sklearn_check_version("1.2") or dataframe != "array_api" + ) and dataframe != "pandas": + assert type(X) == type( + X_array + ), f"validate_data converted {type(X)} to {type(X_array)}" + assert type(X) == type(X_out), f"from_array converted {type(X)} to {type(X_out)}" + else: + # array_api_strict from sklearn < 1.2 and pandas will convert to numpy arrays + assert isinstance(X_array, np.ndarray) + assert isinstance(X_out, np.ndarray) From 7f9ea25aceaff20983895aab9770311211fb9211 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 20 Nov 2024 22:56:10 +0100 Subject: [PATCH 075/135] fix logic --- sklearnex/utils/tests/test_finite.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_finite.py index 5c3ee2d50e..9ddbed4d67 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_finite.py @@ -141,9 +141,9 @@ def 
test_validate_data_output(array_api_dispatch, dtype, dataframe, queue): X_array = validate_data(est, X, reset=False) X_out = est.predict(X) - if ( - sklearn_check_version("1.2") or dataframe != "array_api" - ) and dataframe != "pandas": + if dataframe != "pandas" and not ( + dataframe == "array_api" and sklearn_check_version("1.2") and array_api_dispatch + ): assert type(X) == type( X_array ), f"validate_data converted {type(X)} to {type(X_array)}" From 51247c050952481babace230e099f26750806ae5 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 20 Nov 2024 23:00:48 +0100 Subject: [PATCH 076/135] fix logic --- sklearnex/utils/tests/test_finite.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_finite.py index 9ddbed4d67..cd400c855c 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_finite.py @@ -142,7 +142,9 @@ def test_validate_data_output(array_api_dispatch, dtype, dataframe, queue): X_out = est.predict(X) if dataframe != "pandas" and not ( - dataframe == "array_api" and sklearn_check_version("1.2") and array_api_dispatch + dataframe == "array_api" + and not sklearn_check_version("1.2") + and not array_api_dispatch ): assert type(X) == type( X_array From 6e5c0efeae8743c2406cf0e89aca19197cc9654f Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 20 Nov 2024 23:09:24 +0100 Subject: [PATCH 077/135] fix logic again --- sklearnex/utils/tests/test_finite.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_finite.py index cd400c855c..9a789f274f 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_finite.py @@ -141,16 +141,15 @@ def test_validate_data_output(array_api_dispatch, dtype, dataframe, queue): X_array = validate_data(est, X, reset=False) X_out = est.predict(X) - if dataframe != "pandas" and not ( + if dataframe == 
"pandas" or ( dataframe == "array_api" - and not sklearn_check_version("1.2") - and not array_api_dispatch + and not (sklearn_check_version("1.2") and array_api_dispatch) ): + # array_api_strict from sklearn < 1.2 and pandas will convert to numpy arrays + assert isinstance(X_array, np.ndarray) + assert isinstance(X_out, np.ndarray) + else: assert type(X) == type( X_array ), f"validate_data converted {type(X)} to {type(X_array)}" assert type(X) == type(X_out), f"from_array converted {type(X)} to {type(X_out)}" - else: - # array_api_strict from sklearn < 1.2 and pandas will convert to numpy arrays - assert isinstance(X_array, np.ndarray) - assert isinstance(X_out, np.ndarray) From 8d47744f25c0b32e9b0ad639e772107710c56e98 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 20 Nov 2024 23:21:14 +0100 Subject: [PATCH 078/135] rename file --- sklearnex/utils/tests/{test_finite.py => test_validation.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename sklearnex/utils/tests/{test_finite.py => test_validation.py} (100%) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_validation.py similarity index 100% rename from sklearnex/utils/tests/test_finite.py rename to sklearnex/utils/tests/test_validation.py From 1ae9af5aa01ea34228e52e55f304b9c5e436e3fb Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 20 Nov 2024 23:25:59 +0100 Subject: [PATCH 079/135] Revert "rename file" This reverts commit 8d47744f25c0b32e9b0ad639e772107710c56e98. 
--- sklearnex/utils/tests/{test_validation.py => test_finite.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename sklearnex/utils/tests/{test_validation.py => test_finite.py} (100%) diff --git a/sklearnex/utils/tests/test_validation.py b/sklearnex/utils/tests/test_finite.py similarity index 100% rename from sklearnex/utils/tests/test_validation.py rename to sklearnex/utils/tests/test_finite.py From bf9b46e84bdc0833463aa99ad7a61090fc7bbd30 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 20 Nov 2024 23:26:43 +0100 Subject: [PATCH 080/135] remove duplication --- sklearnex/utils/tests/test_finite.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_finite.py index 9a789f274f..3fea947cd7 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_finite.py @@ -123,10 +123,7 @@ def test_validate_data_random_shape_and_location( @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @pytest.mark.parametrize("array_api_dispatch", [True, False]) -@pytest.mark.parametrize( - "dataframe, queue", get_dataframes_and_queues("numpy,dpctl,dpnp") -) -@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("dataframe, queue", get_dataframes_and_queues()) def test_validate_data_output(array_api_dispatch, dtype, dataframe, queue): est = DummyEstimator() X, y = gen_dataset(est, queue=queue, target_df=dataframe, dtype=dtype)[0] From 3101c3fb0b5bbcc4f3a8386de28da538c5ed4467 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 20 Nov 2024 23:41:44 +0100 Subject: [PATCH 081/135] fix imports --- sklearnex/utils/tests/test_finite.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_finite.py index 3fea947cd7..d9d8d461fe 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_finite.py @@ -27,7 +27,7 @@ ) from sklearnex import 
config_context from sklearnex.tests.utils import DummyEstimator, gen_dataset -from sklearnex.utils import validate_data +from sklearnex.utils.validation import validate_data @pytest.mark.parametrize("dtype", [np.float32, np.float64]) From ee799f60c000651eb828bd7586a91825706b644b Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Wed, 20 Nov 2024 23:42:45 +0100 Subject: [PATCH 082/135] Rename test_finite.py to test_validation.py --- sklearnex/utils/tests/{test_finite.py => test_validation.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename sklearnex/utils/tests/{test_finite.py => test_validation.py} (100%) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_validation.py similarity index 100% rename from sklearnex/utils/tests/test_finite.py rename to sklearnex/utils/tests/test_validation.py From db4a6c6fe00883b42b8c580b11ecee8b169bc237 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 20 Nov 2024 23:43:15 +0100 Subject: [PATCH 083/135] Revert "Rename test_finite.py to test_validation.py" This reverts commit ee799f60c000651eb828bd7586a91825706b644b. 
--- sklearnex/utils/tests/{test_validation.py => test_finite.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename sklearnex/utils/tests/{test_validation.py => test_finite.py} (100%) diff --git a/sklearnex/utils/tests/test_validation.py b/sklearnex/utils/tests/test_finite.py similarity index 100% rename from sklearnex/utils/tests/test_validation.py rename to sklearnex/utils/tests/test_finite.py From b5acbac8782f6022eff6ee85425d593ce9826e6e Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Thu, 21 Nov 2024 06:07:57 +0100 Subject: [PATCH 084/135] updates --- sklearnex/utils/tests/test_finite.py | 36 +++++++++++++++++----------- sklearnex/utils/validation.py | 11 +++++---- 2 files changed, 29 insertions(+), 18 deletions(-) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_finite.py index d9d8d461fe..180b256771 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_finite.py @@ -41,12 +41,12 @@ [1000, 1000], ], ) -@pytest.mark.parametrize("allow_nan", [False, True]) -def test_sum_infinite_actually_finite(dtype, shape, allow_nan): +@pytest.mark.parametrize("ensure_all_finite", [False, True]) +def test_sum_infinite_actually_finite(dtype, shape, ensure_all_finite): est = DummyEstimator() X = np.array(shape, dtype=dtype) X.fill(np.finfo(dtype).max) - X_array = validate_data(est, X, allow_nan=allow_nan) + X_array = validate_data(est, X, ensure_all_finite=ensure_all_finite) assert type(X_array) == type(X) @@ -61,12 +61,12 @@ def test_sum_infinite_actually_finite(dtype, shape, allow_nan): [1000, 1000], ], ) -@pytest.mark.parametrize("allow_nan", [False, True]) +@pytest.mark.parametrize("ensure_all_finite", [False, True]) @pytest.mark.parametrize("check", ["inf", "NaN", None]) @pytest.mark.parametrize("seed", [0, int(time.time())]) @pytest.mark.parametrize("dataframe, queue", get_dataframes_and_queues()) def test_validate_data_random_location( - dataframe, queue, dtype, shape, allow_nan, check, seed + dataframe, 
queue, dtype, shape, ensure_all_finite, check, seed ): est = DummyEstimator() rand.seed(seed) @@ -82,21 +82,25 @@ def test_validate_data_random_location( sycl_queue=queue, ) - if check is None or (allow_nan and check == "NaN"): - validate_data(est, X, allow_nan=allow_nan) + if check is None or (ensure_all_finite and check == "NaN"): + validate_data(est, X, ensure_all_finite=ensure_all_finite) else: - msg_err = "Input contains " + ("infinity" if allow_nan else "NaN, infinity") + "." + msg_err = ( + "Input contains " + + ("infinity" if ensure_all_finite else "NaN, infinity") + + "." + ) with pytest.raises(ValueError, match=msg_err): - validate_data(est, X, allow_nan=allow_nan) + validate_data(est, X, ensure_all_finite=ensure_all_finite) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) -@pytest.mark.parametrize("allow_nan", [False, True]) +@pytest.mark.parametrize("ensure_all_finite", [False, True]) @pytest.mark.parametrize("check", ["inf", "NaN", None]) @pytest.mark.parametrize("seed", [0, int(time.time())]) @pytest.mark.parametrize("dataframe, queue", get_dataframes_and_queues()) def test_validate_data_random_shape_and_location( - dataframe, queue, dtype, allow_nan, check, seed + dataframe, queue, dtype, ensure_all_finite, check, seed ): est = DummyEstimator() lb, ub = 32768, 1048576 # lb is a patching condition, ub 2^20 @@ -113,12 +117,16 @@ def test_validate_data_random_shape_and_location( sycl_queue=queue, ) - if check is None or (allow_nan and check == "NaN"): + if check is None or (ensure_all_finite and check == "NaN"): validate_data(est, X) else: - msg_err = "Input contains " + ("infinity" if allow_nan else "NaN, infinity") + "." + msg_err = ( + "Input contains " + + ("infinity" if ensure_all_finite else "NaN, infinity") + + "." 
+ ) with pytest.raises(ValueError, match=msg_err): - validate_data(est, X, allow_nan=allow_nan) + validate_data(est, X, ensure_all_finite=ensure_all_finite) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py index 3e65223331..34bb988748 100755 --- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -142,7 +142,9 @@ def validate_data( # force finite check to not occur in sklearn, default is True # `ensure_all_finite` is the most up-to-date keyword name in sklearn # _finite_keyword provides backward compatability for `force_all_finite` - force_all_finite = "ensure_all_finite" not in kwargs or kwargs["ensure_all_finite"] + ensure_all_finite = "ensure_all_finite" not in kwargs or kwargs.pop( + "ensure_all_finite" + ) kwargs[_finite_keyword] = False out = _sklearn_validate_data( _estimator, @@ -150,11 +152,12 @@ def validate_data( y=y, **kwargs, ) - if force_all_finite: + if ensure_all_finite: # run local finite check + allow_nan = ensure_all_finite == "allow-nan" arg = iter(out) if not isinstance(X, str) or X != "no_validation": - assert_all_finite(next(arg), input_name="X") + assert_all_finite(next(arg), allow_nan=allow_nan, input_name="X") if y is not None or not isinstance(y, str) or y != "no_validation": - assert_all_finite(next(arg), input_name="y") + assert_all_finite(next(arg), allow_nan=allow_nan, input_name="y") return out From ed57c15e7e08dee51970b4db316aaea16343d7c0 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Thu, 21 Nov 2024 06:14:53 +0100 Subject: [PATCH 085/135] Update validation.py --- sklearnex/utils/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py index 34bb988748..e3dd92b7ed 100755 --- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -142,7 +142,7 @@ def validate_data( # force finite check to not occur in sklearn, default is True # 
`ensure_all_finite` is the most up-to-date keyword name in sklearn # _finite_keyword provides backward compatability for `force_all_finite` - ensure_all_finite = "ensure_all_finite" not in kwargs or kwargs.pop( + ensure_all_finite = True if "ensure_all_finite" not in kwargs else kwargs.pop( "ensure_all_finite" ) kwargs[_finite_keyword] = False From 414f8979da5d44d8d0d19255d1b5f621733d8065 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Thu, 21 Nov 2024 07:07:15 +0100 Subject: [PATCH 086/135] fixes for some test failures --- sklearnex/utils/tests/test_finite.py | 29 ++++++--------- sklearnex/utils/validation.py | 55 +++++++++++++++------------- 2 files changed, 41 insertions(+), 43 deletions(-) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_finite.py index 180b256771..f75ff33301 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_finite.py @@ -41,11 +41,12 @@ [1000, 1000], ], ) -@pytest.mark.parametrize("ensure_all_finite", [False, True]) +@pytest.mark.parametrize("ensure_all_finite", ["allow-nan", True]) def test_sum_infinite_actually_finite(dtype, shape, ensure_all_finite): est = DummyEstimator() X = np.array(shape, dtype=dtype) X.fill(np.finfo(dtype).max) + X = np.atleast_2d(X) X_array = validate_data(est, X, ensure_all_finite=ensure_all_finite) assert type(X_array) == type(X) @@ -61,7 +62,7 @@ def test_sum_infinite_actually_finite(dtype, shape, ensure_all_finite): [1000, 1000], ], ) -@pytest.mark.parametrize("ensure_all_finite", [False, True]) +@pytest.mark.parametrize("ensure_all_finite", ["allow-nan", True]) @pytest.mark.parametrize("check", ["inf", "NaN", None]) @pytest.mark.parametrize("seed", [0, int(time.time())]) @pytest.mark.parametrize("dataframe, queue", get_dataframes_and_queues()) @@ -77,25 +78,22 @@ def test_validate_data_random_location( X.reshape((-1,))[loc] = float(check) X = _convert_to_dataframe( - X, + np.atleast_2d(X), target_df=dataframe, sycl_queue=queue, ) - if check is None 
or (ensure_all_finite and check == "NaN"): + allow_nan = ensure_all_finite == "allow_nan" + if check is None or (allow_nan and check == "NaN"): validate_data(est, X, ensure_all_finite=ensure_all_finite) else: - msg_err = ( - "Input contains " - + ("infinity" if ensure_all_finite else "NaN, infinity") - + "." - ) + msg_err = "Input contains " + ("infinity" if allow_nan else "NaN, infinity") + "." with pytest.raises(ValueError, match=msg_err): validate_data(est, X, ensure_all_finite=ensure_all_finite) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) -@pytest.mark.parametrize("ensure_all_finite", [False, True]) +@pytest.mark.parametrize("ensure_all_finite", ["allow-nan", True]) @pytest.mark.parametrize("check", ["inf", "NaN", None]) @pytest.mark.parametrize("seed", [0, int(time.time())]) @pytest.mark.parametrize("dataframe, queue", get_dataframes_and_queues()) @@ -112,19 +110,16 @@ def test_validate_data_random_shape_and_location( X[loc] = float(check) X = _convert_to_dataframe( - X, + np.atleast_2d(X), target_df=dataframe, sycl_queue=queue, ) - if check is None or (ensure_all_finite and check == "NaN"): + allow_nan = ensure_all_finite == "allow_nan" + if check is None or (allow_nan and check == "NaN"): validate_data(est, X) else: - msg_err = ( - "Input contains " - + ("infinity" if ensure_all_finite else "NaN, infinity") - + "." - ) + msg_err = "Input contains " + ("infinity" if allow_nan else "NaN, infinity") + "." with pytest.raises(ValueError, match=msg_err): validate_data(est, X, ensure_all_finite=ensure_all_finite) diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py index e3dd92b7ed..804fafdb48 100755 --- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -37,20 +37,20 @@ def _is_contiguous(X): # array_api does not have a `strides` or `flags` attribute for testing memory - # order. When dlpack support is brought in for oneDAL, the dlpack object can - # then be inspected and this must be updated. 
_is_contiguous is therefore - # conservative in verifying attributes and does not support array_api. This - # will block onedal_assert_all_finite from being used for array api inputs. + # order. When dlpack support is brought in for oneDAL, the dlpack python capsule + # can then be inspected for strides and this must be updated. _is_contiguous is + # therefore conservative in verifying attributes and does not support array_api. + # This will block onedal_assert_all_finite from being used for array_api inputs. if hasattr(X, "flags") and X.flags["C_CONTIGUOUS"] or X.flags["F_CONTIGUOUS"]: return True return False -def _assert_all_finite_core(X, xp, *, allow_nan=False, input_name=""): - # This is a reproduction of code from sklearn.utils.validation - # necessary for older sklearn versions (<1.2) and for dpnp inputs - # which do not conform to the array_api standard, and cannot be - # checked in sklearn. +def _sycl_usm_assert_all_finite(X, xp, *, allow_nan=False, input_name=""): + # This is a reproduction of code from sklearn.utils.validation necessary for + # non-contiguous or non-fp32/fp64 dpctl inputs when sklearn version is <1.2 or + # for non-contiguous or non-fp32/fp64 dpnp inputs, as these cannot be checked + # for finiteness in sklearn nor onedal while preserving their object type. 
first_pass_isfinite = xp.isfinite(xp.sum(X)) if first_pass_isfinite: return @@ -66,7 +66,7 @@ def _assert_all_finite_core(X, xp, *, allow_nan=False, input_name=""): if sklearn_check_version("1.2"): - def _array_api_assert_all_finite( + def _general_assert_all_finite( X, xp, is_array_api_compliant, *, allow_nan=False, input_name="" ): if _is_numpy_namespace(xp) or is_array_api_compliant: @@ -74,11 +74,11 @@ def _array_api_assert_all_finite( elif "float" not in xp.dtype.name or "complex" not in xp.dtype.name: return # handle dpnp inputs - _assert_all_finite_core(X, xp, allow_nan=allow_nan, input_name=input_name) + _sycl_usm_assert_all_finite(X, xp, allow_nan=allow_nan, input_name=input_name) else: - def _array_api_assert_all_finite( + def _general_assert_all_finite( X, xp, is_array_api_compliant, *, allow_nan=False, input_name="" ): @@ -90,9 +90,8 @@ def _array_api_assert_all_finite( return elif "float" not in xp.dtype.name or "complex" not in xp.dtype.name: return - - # handle array_api and dpnp inputs - _assert_all_finite_core(X, xp, allow_nan, input_name=input_name) + # handle dpctl and dpnp inputs + _sycl_usm_assert_all_finite(X, xp, allow_nan, input_name=input_name) def _assert_all_finite( @@ -101,18 +100,22 @@ def _assert_all_finite( allow_nan=False, input_name="", ): - # array_api compliance in sklearn varies betweeen the support sklearn versions - # therefore a separate check matching sklearn's assert_all_finite is necessary - # when the data is not float32 or float64 but of a float type. The onedal - # assert_all_finite is only for float32 and float64 contiguous arrays. - - # initial match to daal4py, can be optimized later + # unlike sklearnex, sklearn does not support sycl_usm_ndarrays by default + # therefore a separate finite check implementation matching sklearn's + # `_assert_all_finite` is necessary when the data is not float32 or float64 or + # non-contiguous. The onedal assert_all_finite is only for float32 and float64 + # contiguous arrays. 
+ + # size check is an initial match to daal4py for performance reasons, can be + # optimized later xp, is_array_api_compliant = get_namespace(X) if X.size < 32768 or X.dtype not in [xp.float32, xp.float64] or not _is_contiguous(X): - # all non-numpy arrays for sklearn 1.0 and dpnp for sklearn are not handeled properly - # separate function for import-time sklearn version check - _array_api_assert_all_finite( + # all sycl_usm_ndarrays for sklearn < 1.2 and dpnp for sklearn > 1.2 are not + # handled properly, it calls a separate function for an import-time sklearn + # version check before possible hand-off to sklearn's _assert_all_finite or to + # _assert_all_finite_core. + _general_assert_all_finite( X, xp, is_array_api_compliant, allow_nan=allow_nan, input_name=input_name ) else: @@ -142,8 +145,8 @@ def validate_data( # force finite check to not occur in sklearn, default is True # `ensure_all_finite` is the most up-to-date keyword name in sklearn # _finite_keyword provides backward compatability for `force_all_finite` - ensure_all_finite = True if "ensure_all_finite" not in kwargs else kwargs.pop( - "ensure_all_finite" + ensure_all_finite = ( + True if "ensure_all_finite" not in kwargs else kwargs.pop("ensure_all_finite") ) kwargs[_finite_keyword] = False out = _sklearn_validate_data( From 83253b3cba87bbec4e5a16b5a75519013e93a5b2 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Thu, 21 Nov 2024 07:13:13 +0100 Subject: [PATCH 087/135] fix text --- sklearnex/utils/validation.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py index 804fafdb48..5e85bc559d 100755 --- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -50,7 +50,7 @@ def _sycl_usm_assert_all_finite(X, xp, *, allow_nan=False, input_name=""): # This is a reproduction of code from sklearn.utils.validation necessary for # non-contiguous or non-fp32/fp64 dpctl inputs when sklearn version is <1.2 or 
# for non-contiguous or non-fp32/fp64 dpnp inputs, as these cannot be checked - # for finiteness in sklearn nor onedal while preserving their object type. + # for finiteness in onedal or by sklearn (while preserving their object type). first_pass_isfinite = xp.isfinite(xp.sum(X)) if first_pass_isfinite: return @@ -100,12 +100,6 @@ def _assert_all_finite( allow_nan=False, input_name="", ): - # unlike sklearnex, sklearn does not support sycl_usm_ndarrays by default - # therefore a separate finite check implementation matching sklearn's - # `_assert_all_finite` is necessary when the data is not float32 or float64 or - # non-contiguous. The onedal assert_all_finite is only for float32 and float64 - # contiguous arrays. - # size check is an initial match to daal4py for performance reasons, can be # optimized later xp, is_array_api_compliant = get_namespace(X) @@ -114,7 +108,7 @@ def _assert_all_finite( # all sycl_usm_ndarrays for sklearn < 1.2 and dpnp for sklearn > 1.2 are not # handled properly, it calls a separate function for an import-time sklearn # version check before possible hand-off to sklearn's _assert_all_finite or to - # _assert_all_finite_core. + # _sycl_usm_assert_all_finite. 
_general_assert_all_finite( X, xp, is_array_api_compliant, allow_nan=allow_nan, input_name=input_name ) From b22e23a47d1cb88d94d71dc29cf61f2f3f39fcc3 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Thu, 21 Nov 2024 08:22:54 +0100 Subject: [PATCH 088/135] fixes for some failures --- sklearnex/utils/tests/test_finite.py | 7 +++++-- sklearnex/utils/validation.py | 4 ++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_finite.py index f75ff33301..a790301a27 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_finite.py @@ -87,7 +87,9 @@ def test_validate_data_random_location( if check is None or (allow_nan and check == "NaN"): validate_data(est, X, ensure_all_finite=ensure_all_finite) else: - msg_err = "Input contains " + ("infinity" if allow_nan else "NaN, infinity") + "." + msg_err = ( + "Input X contains " + ("infinity" if allow_nan else "NaN, infinity") + "." + ) with pytest.raises(ValueError, match=msg_err): validate_data(est, X, ensure_all_finite=ensure_all_finite) @@ -119,7 +121,8 @@ def test_validate_data_random_shape_and_location( if check is None or (allow_nan and check == "NaN"): validate_data(est, X) else: - msg_err = "Input contains " + ("infinity" if allow_nan else "NaN, infinity") + "." + type_err = "infinity" if allow_nan else "NaN, infinity" + msg_err = f"Input X contains {type_err}." 
with pytest.raises(ValueError, match=msg_err): validate_data(est, X, ensure_all_finite=ensure_all_finite) diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py index 5e85bc559d..61cb9acba8 100755 --- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -153,8 +153,8 @@ def validate_data( # run local finite check allow_nan = ensure_all_finite == "allow-nan" arg = iter(out) - if not isinstance(X, str) or X != "no_validation": + if X != "no_validation": assert_all_finite(next(arg), allow_nan=allow_nan, input_name="X") - if y is not None or not isinstance(y, str) or y != "no_validation": + if y is not None and y != "no_validation": assert_all_finite(next(arg), allow_nan=allow_nan, input_name="y") return out From 2f8ec169a563ccc1c0d6fadb9dc27ee68d25fec3 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Thu, 21 Nov 2024 08:23:45 +0100 Subject: [PATCH 089/135] make consistent --- sklearnex/utils/tests/test_finite.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_finite.py index a790301a27..157b79f6c7 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_finite.py @@ -87,9 +87,8 @@ def test_validate_data_random_location( if check is None or (allow_nan and check == "NaN"): validate_data(est, X, ensure_all_finite=ensure_all_finite) else: - msg_err = ( - "Input X contains " + ("infinity" if allow_nan else "NaN, infinity") + "." - ) + type_err = "infinity" if allow_nan else "NaN, infinity" + msg_err = f"Input X contains {type_err}." 
with pytest.raises(ValueError, match=msg_err): validate_data(est, X, ensure_all_finite=ensure_all_finite) From 1fd9973d018eb1b059c85c555216ce2e9377daae Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Thu, 21 Nov 2024 09:21:14 +0100 Subject: [PATCH 090/135] fix bad logic --- sklearnex/utils/validation.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py index 61cb9acba8..996299f37b 100755 --- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -94,7 +94,7 @@ def _general_assert_all_finite( _sycl_usm_assert_all_finite(X, xp, allow_nan, input_name=input_name) -def _assert_all_finite( +def _sklearnex_assert_all_finite( X, *, allow_nan=False, @@ -122,7 +122,7 @@ def assert_all_finite( allow_nan=False, input_name="", ): - _assert_all_finite( + _sklearnex_assert_all_finite( X.data if sp.issparse(X) else X, allow_nan=allow_nan, input_name=input_name, @@ -139,9 +139,7 @@ def validate_data( # force finite check to not occur in sklearn, default is True # `ensure_all_finite` is the most up-to-date keyword name in sklearn # _finite_keyword provides backward compatability for `force_all_finite` - ensure_all_finite = ( - True if "ensure_all_finite" not in kwargs else kwargs.pop("ensure_all_finite") - ) + ensure_all_finite = kwargs.pop("ensure_all_finite", True) kwargs[_finite_keyword] = False out = _sklearn_validate_data( _estimator, @@ -153,8 +151,8 @@ def validate_data( # run local finite check allow_nan = ensure_all_finite == "allow-nan" arg = iter(out) - if X != "no_validation": + if not isinstance(X, str) or X != "no_validation": assert_all_finite(next(arg), allow_nan=allow_nan, input_name="X") - if y is not None and y != "no_validation": + if not (y is None or isinstance(y, str) and y == "no_validation"): assert_all_finite(next(arg), allow_nan=allow_nan, input_name="y") return out From c20c8cc5891d6b41e5ffb36898617c6d310344b2 Mon Sep 17 00:00:00 2001 From: "Faust, 
Ian" Date: Thu, 21 Nov 2024 10:03:43 +0100 Subject: [PATCH 091/135] fix in string --- sklearnex/utils/tests/test_finite.py | 4 ++-- sklearnex/utils/validation.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_finite.py index 157b79f6c7..c2dec65e00 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_finite.py @@ -83,7 +83,7 @@ def test_validate_data_random_location( sycl_queue=queue, ) - allow_nan = ensure_all_finite == "allow_nan" + allow_nan = ensure_all_finite == "allow-nan" if check is None or (allow_nan and check == "NaN"): validate_data(est, X, ensure_all_finite=ensure_all_finite) else: @@ -116,7 +116,7 @@ def test_validate_data_random_shape_and_location( sycl_queue=queue, ) - allow_nan = ensure_all_finite == "allow_nan" + allow_nan = ensure_all_finite == "allow-nan" if check is None or (allow_nan and check == "NaN"): validate_data(est, X) else: diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py index 996299f37b..10257623a0 100755 --- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -46,7 +46,7 @@ def _is_contiguous(X): return False -def _sycl_usm_assert_all_finite(X, xp, *, allow_nan=False, input_name=""): +def _assert_all_finite(X, xp, *, allow_nan=False, input_name=""): # This is a reproduction of code from sklearn.utils.validation necessary for # non-contiguous or non-fp32/fp64 dpctl inputs when sklearn version is <1.2 or # for non-contiguous or non-fp32/fp64 dpnp inputs, as these cannot be checked @@ -74,7 +74,7 @@ def _general_assert_all_finite( elif "float" not in xp.dtype.name or "complex" not in xp.dtype.name: return # handle dpnp inputs - _sycl_usm_assert_all_finite(X, xp, allow_nan=allow_nan, input_name=input_name) + _assert_all_finite(X, xp, allow_nan=allow_nan, input_name=input_name) else: @@ -91,7 +91,7 @@ def _general_assert_all_finite( elif "float" not in xp.dtype.name or 
"complex" not in xp.dtype.name: return # handle dpctl and dpnp inputs - _sycl_usm_assert_all_finite(X, xp, allow_nan, input_name=input_name) + _assert_all_finite(X, xp, allow_nan, input_name=input_name) def _sklearnex_assert_all_finite( From 1ce1b10df9ebd80cf5bf445373ff6e157cf4a207 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Thu, 21 Nov 2024 10:57:15 +0100 Subject: [PATCH 092/135] attempt to see if dataframe conversion is causing the issue --- sklearnex/utils/tests/test_finite.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_finite.py index c2dec65e00..e8995fe6d0 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_finite.py @@ -77,11 +77,12 @@ def test_validate_data_random_location( loc = rand.randint(0, X.size - 1) X.reshape((-1,))[loc] = float(check) - X = _convert_to_dataframe( + _ = _convert_to_dataframe( np.atleast_2d(X), target_df=dataframe, sycl_queue=queue, - ) + ) #test to see if convert_to_dataframe is causing problems + X = np.atleast_2d(X) allow_nan = ensure_all_finite == "allow-nan" if check is None or (allow_nan and check == "NaN"): validate_data(est, X, ensure_all_finite=ensure_all_finite) else: @@ -110,11 +111,12 @@ def test_validate_data_random_shape_and_location( loc = rand.randint(0, X.size - 1) X[loc] = float(check) - X = _convert_to_dataframe( + _ = _convert_to_dataframe( np.atleast_2d(X), target_df=dataframe, sycl_queue=queue, - ) + ) #test to see if convert_to_dataframe is causing problems + X = np.atleast_2d(X) allow_nan = ensure_all_finite == "allow-nan" if check is None or (allow_nan and check == "NaN"): From 5355039022d9f39c447f39c91ed46d65f4555810 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Thu, 21 Nov 2024 13:46:18 +0100 Subject: [PATCH 093/135] fix iter problem --- sklearnex/utils/validation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py index 10257623a0..acdd21323c 100755 
--- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -41,7 +41,7 @@ def _is_contiguous(X): # can then be inspected for strides and this must be updated. _is_contiguous is # therefore conservative in verifying attributes and does not support array_api. # This will block onedal_assert_all_finite from being used for array_api inputs. - if hasattr(X, "flags") and X.flags["C_CONTIGUOUS"] or X.flags["F_CONTIGUOUS"]: + if hasattr(X, "flags") and (X.flags["C_CONTIGUOUS"] or X.flags["F_CONTIGUOUS"]): return True return False @@ -150,7 +150,7 @@ def validate_data( if ensure_all_finite: # run local finite check allow_nan = ensure_all_finite == "allow-nan" - arg = iter(out) + arg = iter(out if isinstance(out, tuple) else (out,)) if not isinstance(X, str) or X != "no_validation": assert_all_finite(next(arg), allow_nan=allow_nan, input_name="X") if not (y is None or isinstance(y, str) and y == "no_validation"): From b5b84427f2b8c5d5ce39f34f75076190a36ffd6f Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Thu, 21 Nov 2024 14:26:05 +0100 Subject: [PATCH 094/135] fix testing issues --- sklearnex/utils/tests/test_finite.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_finite.py index e8995fe6d0..f20d95a05c 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_finite.py @@ -44,7 +44,7 @@ @pytest.mark.parametrize("ensure_all_finite", ["allow-nan", True]) def test_sum_infinite_actually_finite(dtype, shape, ensure_all_finite): est = DummyEstimator() - X = np.array(shape, dtype=dtype) + X = np.empty(shape, dtype=dtype) X.fill(np.finfo(dtype).max) X = np.atleast_2d(X) X_array = validate_data(est, X, ensure_all_finite=ensure_all_finite) @@ -120,7 +120,7 @@ def test_validate_data_random_shape_and_location( allow_nan = ensure_all_finite == "allow-nan" if check is None or (allow_nan and check == "NaN"): - validate_data(est, X) + 
validate_data(est, X, ensure_all_finite=ensure_all_finite) else: type_err = "infinity" if allow_nan else "NaN, infinity" msg_err = f"Input X contains {type_err}." @@ -129,26 +129,25 @@ def test_validate_data_random_shape_and_location( @pytest.mark.parametrize("dtype", [np.float32, np.float64]) -@pytest.mark.parametrize("array_api_dispatch", [True, False]) +@pytest.mark.parametrize("array_api_dispatch", [True, False] if sklearn_check_version("1.2") else [False]) @pytest.mark.parametrize("dataframe, queue", get_dataframes_and_queues()) def test_validate_data_output(array_api_dispatch, dtype, dataframe, queue): est = DummyEstimator() X, y = gen_dataset(est, queue=queue, target_df=dataframe, dtype=dtype)[0] dispatch = {} - if sklearn_check_version("1.2"): + if array_api_dispatch: + pytest.skip(dataframe == "pandas", "pandas inputs do not work with sklearn's array_api_dispatch") dispatch["array_api_dispatch"] = array_api_dispatch with config_context(**dispatch): - validate_data(est, X, y) - est.fit(X, y) + X_out, y_out = validate_data(est, X, y) + # check sklearn validate_data operations work underneath X_array = validate_data(est, X, reset=False) - X_out = est.predict(X) if dataframe == "pandas" or ( dataframe == "array_api" - and not (sklearn_check_version("1.2") and array_api_dispatch) - ): + and not array_api_dispatch): # array_api_strict from sklearn < 1.2 and pandas will convert to numpy arrays assert isinstance(X_array, np.ndarray) assert isinstance(X_out, np.ndarray) From d025c89547d7eb5f21deba665dd42ed173925400 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Thu, 21 Nov 2024 14:27:38 +0100 Subject: [PATCH 095/135] formatting --- sklearnex/utils/tests/test_finite.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_finite.py index f20d95a05c..884b3ec6c5 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_finite.py @@ -81,7 +81,7 
@@ def test_validate_data_random_location( np.atleast_2d(X), target_df=dataframe, sycl_queue=queue, - ) #test to see if convert_to_dataframe is causing problems + ) # test to see if convert_to_dataframe is causing problems X = np.atleast_2d(X) allow_nan = ensure_all_finite == "allow-nan" @@ -115,7 +115,7 @@ def test_validate_data_random_shape_and_location( np.atleast_2d(X), target_df=dataframe, sycl_queue=queue, - ) #test to see if convert_to_dataframe is causing problems + ) # test to see if convert_to_dataframe is causing problems X = np.atleast_2d(X) allow_nan = ensure_all_finite == "allow-nan" @@ -129,7 +129,9 @@ def test_validate_data_random_shape_and_location( @pytest.mark.parametrize("dtype", [np.float32, np.float64]) -@pytest.mark.parametrize("array_api_dispatch", [True, False] if sklearn_check_version("1.2") else [False]) +@pytest.mark.parametrize( + "array_api_dispatch", [True, False] if sklearn_check_version("1.2") else [False] +) @pytest.mark.parametrize("dataframe, queue", get_dataframes_and_queues()) def test_validate_data_output(array_api_dispatch, dtype, dataframe, queue): est = DummyEstimator() @@ -137,7 +139,10 @@ def test_validate_data_output(array_api_dispatch, dtype, dataframe, queue): dispatch = {} if array_api_dispatch: - pytest.skip(dataframe == "pandas", "pandas inputs do not work with sklearn's array_api_dispatch") + pytest.skip( + dataframe == "pandas", + "pandas inputs do not work with sklearn's array_api_dispatch", + ) dispatch["array_api_dispatch"] = array_api_dispatch with config_context(**dispatch): @@ -145,9 +150,7 @@ def test_validate_data_output(array_api_dispatch, dtype, dataframe, queue): # check sklearn validate_data operations work underneath X_array = validate_data(est, X, reset=False) - if dataframe == "pandas" or ( - dataframe == "array_api" - and not array_api_dispatch): + if dataframe == "pandas" or (dataframe == "array_api" and not array_api_dispatch): # array_api_strict from sklearn < 1.2 and pandas will convert to 
numpy arrays assert isinstance(X_array, np.ndarray) assert isinstance(X_out, np.ndarray) From 428bfb6f5a0db7df71a546d80e80221afbf8a32b Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Thu, 21 Nov 2024 14:31:43 +0100 Subject: [PATCH 096/135] revert change --- sklearnex/utils/tests/test_finite.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_finite.py index 884b3ec6c5..6be0f50841 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_finite.py @@ -77,7 +77,7 @@ def test_validate_data_random_location( loc = rand.randint(0, X.size - 1) X.reshape((-1,))[loc] = float(check) - _ = _convert_to_dataframe( + X = _convert_to_dataframe( np.atleast_2d(X), target_df=dataframe, sycl_queue=queue, @@ -111,7 +111,7 @@ def test_validate_data_random_shape_and_location( loc = rand.randint(0, X.size - 1) X[loc] = float(check) - _ = _convert_to_dataframe( + X = _convert_to_dataframe( np.atleast_2d(X), target_df=dataframe, sycl_queue=queue, From da2313873bb0db18bbfbe88a4b0756b735cb5533 Mon Sep 17 00:00:00 2001 From: icfaust Date: Thu, 21 Nov 2024 05:38:41 -0800 Subject: [PATCH 097/135] fixes for pandas --- sklearnex/utils/tests/test_finite.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_finite.py index 6be0f50841..637d12b631 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_finite.py @@ -139,10 +139,8 @@ def test_validate_data_output(array_api_dispatch, dtype, dataframe, queue): dispatch = {} if array_api_dispatch: - pytest.skip( - dataframe == "pandas", - "pandas inputs do not work with sklearn's array_api_dispatch", - ) + if dataframe == "pandas": + pytest.skip("pandas inputs do not work with sklearn's array_api_dispatch") dispatch["array_api_dispatch"] = array_api_dispatch with config_context(**dispatch): From 
1d0c330f513acd12af54ef5ca43286bf941585f9 Mon Sep 17 00:00:00 2001 From: icfaust Date: Thu, 21 Nov 2024 05:42:04 -0800 Subject: [PATCH 098/135] there is a slowdown with pandas that needs to be solved --- sklearnex/utils/tests/test_finite.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_finite.py index 637d12b631..2ad2341d6f 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_finite.py @@ -81,8 +81,7 @@ def test_validate_data_random_location( np.atleast_2d(X), target_df=dataframe, sycl_queue=queue, - ) # test to see if convert_to_dataframe is causing problems - X = np.atleast_2d(X) + ) allow_nan = ensure_all_finite == "allow-nan" if check is None or (allow_nan and check == "NaN"): @@ -115,8 +114,7 @@ def test_validate_data_random_shape_and_location( np.atleast_2d(X), target_df=dataframe, sycl_queue=queue, - ) # test to see if convert_to_dataframe is causing problems - X = np.atleast_2d(X) + ) allow_nan = ensure_all_finite == "allow-nan" if check is None or (allow_nan and check == "NaN"): From f3f63a6a11955670c3763c8cfd2932a0d4864aa7 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Thu, 21 Nov 2024 14:56:33 +0100 Subject: [PATCH 099/135] swap to transpose for speed --- sklearnex/utils/tests/test_finite.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_finite.py index 2ad2341d6f..2904ff2bf3 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_finite.py @@ -77,8 +77,10 @@ def test_validate_data_random_location( loc = rand.randint(0, X.size - 1) X.reshape((-1,))[loc] = float(check) + # column heavy pandas inputs are very slow in sklearn's check_array + # transpose inputs to guarantee fast processing in tests X = _convert_to_dataframe( - np.atleast_2d(X), + np.atleast_2d(X).T, target_df=dataframe, sycl_queue=queue, ) @@ -111,7 
+113,7 @@ def test_validate_data_random_shape_and_location( X[loc] = float(check) X = _convert_to_dataframe( - np.atleast_2d(X), + np.atleast_2d(X).T, target_df=dataframe, sycl_queue=queue, ) From 56c80545af46e1116f9445300fa4517f14476d32 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Thu, 21 Nov 2024 14:58:07 +0100 Subject: [PATCH 100/135] more clarity --- sklearnex/utils/tests/test_finite.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_finite.py index 2904ff2bf3..fdaec2e2e4 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_finite.py @@ -77,8 +77,8 @@ def test_validate_data_random_location( loc = rand.randint(0, X.size - 1) X.reshape((-1,))[loc] = float(check) - # column heavy pandas inputs are very slow in sklearn's check_array - # transpose inputs to guarantee fast processing in tests + # column heavy pandas inputs are very slow in sklearn's check_array even without + # the finite check, just transpose inputs to guarantee fast processing in tests X = _convert_to_dataframe( np.atleast_2d(X).T, target_df=dataframe, From 1580d770ed403475853cf4909438f61d028b1744 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Fri, 22 Nov 2024 14:24:05 +0100 Subject: [PATCH 101/135] add _check_sample_weight --- sklearnex/utils/validation.py | 120 +++++++++++++++++----------------- 1 file changed, 59 insertions(+), 61 deletions(-) diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py index acdd21323c..72876bcae6 100755 --- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -14,11 +14,16 @@ # limitations under the License. 
# =============================================================================== +import numbers +import warnings + +import numpy as np import scipy.sparse as sp from sklearn.utils.validation import _assert_all_finite as _sklearn_assert_all_finite +from sklearn.utils.validation import _num_samples, check_array, check_non_negative from daal4py.sklearn._utils import sklearn_check_version -from onedal.utils._array_api import _is_numpy_namespace +from onedal.utils._array_api import _get_sycl_namespace, _is_numpy_namespace from onedal.utils.validation import _assert_all_finite as _onedal_assert_all_finite from ._array_api import get_namespace @@ -41,57 +46,7 @@ def _is_contiguous(X): # can then be inspected for strides and this must be updated. _is_contiguous is # therefore conservative in verifying attributes and does not support array_api. # This will block onedal_assert_all_finite from being used for array_api inputs. - if hasattr(X, "flags") and (X.flags["C_CONTIGUOUS"] or X.flags["F_CONTIGUOUS"]): - return True - return False - - -def _assert_all_finite(X, xp, *, allow_nan=False, input_name=""): - # This is a reproduction of code from sklearn.utils.validation necessary for - # non-contiguous or non-fp32/fp64 dpctl inputs when sklearn version is <1.2 or - # for non-contiguous or non-fp32/fp64 dpnp inputs, as these cannot be checked - # for finiteness in onedal or by sklearn (while preserving their object type). - first_pass_isfinite = xp.isfinite(xp.sum(X)) - if first_pass_isfinite: - return - - has_inf = xp.any(xp.isinf(X)) - has_nan_error = False if allow_nan else xp.any(xp.isnan(X)) - if has_inf or has_nan_error: - type_err = "infinity" if allow_nan else "NaN, infinity" - padded_input_name = input_name + " " if input_name else "" - msg_err = f"Input {padded_input_name}contains {type_err}." 
- raise ValueError(msg_err) - - -if sklearn_check_version("1.2"): - - def _general_assert_all_finite( - X, xp, is_array_api_compliant, *, allow_nan=False, input_name="" - ): - if _is_numpy_namespace(xp) or is_array_api_compliant: - _sklearn_assert_all_finite(X, allow_nan=allow_nan, input_name=input_name) - elif "float" not in xp.dtype.name or "complex" not in xp.dtype.name: - return - # handle dpnp inputs - _assert_all_finite(X, xp, allow_nan=allow_nan, input_name=input_name) - -else: - - def _general_assert_all_finite( - X, xp, is_array_api_compliant, *, allow_nan=False, input_name="" - ): - - if _is_numpy_namespace(xp): - _sklearn_assert_all_finite(X, allow_nan, input_name=input_name) - elif is_array_api_compliant and not xp.isdtype( - X, ("real floating", "complex floating") - ): - return - elif "float" not in xp.dtype.name or "complex" not in xp.dtype.name: - return - # handle dpctl and dpnp inputs - _assert_all_finite(X, xp, allow_nan, input_name=input_name) + return hasattr(X, "flags") and (X.flags["C_CONTIGUOUS"] or X.flags["F_CONTIGUOUS"]) def _sklearnex_assert_all_finite( @@ -102,16 +57,9 @@ def _sklearnex_assert_all_finite( ): # size check is an initial match to daal4py for performance reasons, can be # optimized later - xp, is_array_api_compliant = get_namespace(X) + xp, _ = get_namespace(X) if X.size < 32768 or X.dtype not in [xp.float32, xp.float64] or not _is_contiguous(X): - - # all sycl_usm_ndarrays for sklearn < 1.2 and dpnp for sklearn > 1.2 are not - # handled properly, it calls a separate function for an import-time sklearn - # version check before possible hand-off to sklearn's _assert_all_finite or to - # _sycl_usm_assert_all_finite. 
- _general_assert_all_finite( - X, xp, is_array_api_compliant, allow_nan=allow_nan, input_name=input_name - ) + _sklearn_assert_all_finite(X, allow_nan=allow_nan, input_name=input_name) else: _onedal_assert_all_finite(X, allow_nan=allow_nan, input_name=input_name) @@ -141,6 +89,7 @@ def validate_data( # _finite_keyword provides backward compatability for `force_all_finite` ensure_all_finite = kwargs.pop("ensure_all_finite", True) kwargs[_finite_keyword] = False + out = _sklearn_validate_data( _estimator, X=X, @@ -156,3 +105,52 @@ def validate_data( if not (y is None or isinstance(y, str) and y == "no_validation"): assert_all_finite(next(arg), allow_nan=allow_nan, input_name="y") return out + + +def _check_sample_weight( + sample_weight, X, dtype=None, copy=False, only_non_negative=False +): + + n_samples = _num_samples(X) + xp, _ = get_namespace(X) + + if dtype is not None and dtype not in [xp.float32, xp.float64]: + dtype = xp.float64 + + if sample_weight is None: + sample_weight = xp.ones(n_samples, dtype=dtype) + elif isinstance(sample_weight, numbers.Number): + sample_weight = xp.full(n_samples, sample_weight, dtype=dtype) + else: + if dtype is None: + dtype = [xp.float64, xp.float32] + + # create param dict such that the variable finite_keyword can + # be added to it without direct sklearn_check_version maintenance + params = { + "accept_sparse": False, + "ensure_2d": False, + "dtype": dtype, + "order": "C", + "copy": copy, + "input_name": "sample_weight", + _finite_keyword: False, + } + + sample_weight = check_array(sample_weight, **params) + assert_all_finite(sample_weight, input_name="sample_weight") + + if sample_weight.ndim != 1: + raise ValueError("Sample weights must be 1D array or scalar") + + if sample_weight.shape != (n_samples,): + raise ValueError( + "sample_weight.shape == {}, expected {}!".format( + sample_weight.shape, (n_samples,) + ) + ) + + if only_non_negative: + check_non_negative(sample_weight, "`sample_weight`") + + return sample_weight 
From ffc9f1f33c361495177e8277f9d6fdda4bcce449 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Fri, 22 Nov 2024 15:07:22 +0100 Subject: [PATCH 102/135] add more testing' --- .../{test_finite.py => test_validation.py} | 76 ++++++++++++++++++- sklearnex/utils/validation.py | 2 +- 2 files changed, 75 insertions(+), 3 deletions(-) rename sklearnex/utils/tests/{test_finite.py => test_validation.py} (66%) diff --git a/sklearnex/utils/tests/test_finite.py b/sklearnex/utils/tests/test_validation.py similarity index 66% rename from sklearnex/utils/tests/test_finite.py rename to sklearnex/utils/tests/test_validation.py index fdaec2e2e4..31530c4866 100644 --- a/sklearnex/utils/tests/test_finite.py +++ b/sklearnex/utils/tests/test_validation.py @@ -27,7 +27,7 @@ ) from sklearnex import config_context from sklearnex.tests.utils import DummyEstimator, gen_dataset -from sklearnex.utils.validation import validate_data +from sklearnex.utils.validation import _check_sample_weight, validate_data @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @@ -129,11 +129,83 @@ def test_validate_data_random_shape_and_location( @pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("check", ["inf", "NaN", None]) @pytest.mark.parametrize( "array_api_dispatch", [True, False] if sklearn_check_version("1.2") else [False] ) -@pytest.mark.parametrize("dataframe, queue", get_dataframes_and_queues()) +@pytest.mark.parametrize("seed", [0, int(time.time())]) +@pytest.mark.parametrize( + "dataframe, queue", + get_dataframes_and_queues( + "numpy,pandas" + ("dpctl,array_api" if sklearn_check_version("1.2") else "") + ), +) +def test__check_sample_weights_random_shape_and_location( + dataframe, queue, dtype, array_api_dispatch, check, seed +): + # This testing assumes that array api inputs to validate_data will only occur + # with sklearn array_api support which began in sklearn 1.2. 
This would assume + # that somewhere upstream of the validate_data call, a data conversion of dpnp, + # dpctl, or array_api inputs to numpy inputs would have occurred. + + lb, ub = 32768, 1048576 # lb is a patching condition, ub 2^20 + rand.seed(seed) + shape = (rand.randint(lb, ub), 2) + X = rand.uniform(high=np.finfo(dtype).max, size=shape).astype(dtype) + sample_weight = rand.uniform(high=np.finfo(dtype).max, size=shape[0]).astype(dtype) + + if check: + loc = rand.randint(0, shape[0] - 1) + sample_weight[loc] = float(check) + + X = _convert_to_dataframe( + X, + target_df=dataframe, + sycl_queue=queue, + ) + sample_weight = _convert_to_dataframe( + sample_weight, + target_df=dataframe, + sycl_queue=queue, + ) + + dispatch = {} + if array_api_dispatch: + if dataframe == "pandas": + pytest.skip("pandas inputs do not work with sklearn's array_api_dispatch") + dispatch["array_api_dispatch"] = array_api_dispatch + + with config_context(**dispatch): + + if check is None: + X_out = _check_sample_weight(X, sample_weight) + if dataframe == "pandas" or ( + dataframe == "array_api" and not array_api_dispatch + ): + assert isinstance(X, np.ndarray) + else: + assert type(X_out) == type(X) + else: + msg_err = "Input sample_weight contains NaN, infinity." + with pytest.raises(ValueError, match=msg_err): + X_out = _check_sample_weight(X, sample_weight) + + +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize( + "array_api_dispatch", [True, False] if sklearn_check_version("1.2") else [False] +) +@pytest.mark.parametrize( + "dataframe, queue", + get_dataframes_and_queues( + "numpy,pandas" + ("dpctl,array_api" if sklearn_check_version("1.2") else "") + ), +) def test_validate_data_output(array_api_dispatch, dtype, dataframe, queue): + # This testing assumes that array api inputs to validate_data will only occur + # with sklearn array_api support which began in sklearn 1.2. 
This would assume + # that somewhere upstream of the validate_data call, a data conversion of dpnp, + # dpctl, or array_api inputs to numpy inputs would have occurred. est = DummyEstimator() X, y = gen_dataset(est, queue=queue, target_df=dataframe, dtype=dtype)[0] diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py index 72876bcae6..f0ed55d86a 100755 --- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -125,7 +125,7 @@ def _check_sample_weight( if dtype is None: dtype = [xp.float64, xp.float32] - # create param dict such that the variable finite_keyword can + # create param dict such that the variable _finite_keyword can # be added to it without direct sklearn_check_version maintenance params = { "accept_sparse": False, From d184ed044c1bd26b6f38362a6d706331e49714db Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Fri, 22 Nov 2024 15:09:29 +0100 Subject: [PATCH 103/135] rename --- sklearnex/utils/tests/test_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearnex/utils/tests/test_validation.py b/sklearnex/utils/tests/test_validation.py index 31530c4866..13934acc7c 100644 --- a/sklearnex/utils/tests/test_validation.py +++ b/sklearnex/utils/tests/test_validation.py @@ -140,7 +140,7 @@ def test_validate_data_random_shape_and_location( "numpy,pandas" + ("dpctl,array_api" if sklearn_check_version("1.2") else "") ), ) -def test__check_sample_weights_random_shape_and_location( +def test__check_sample_weight_random_shape_and_location( dataframe, queue, dtype, array_api_dispatch, check, seed ): # This testing assumes that array api inputs to validate_data will only occur From c68616f26b77f39e2dfcc7f502efb5079583070b Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Fri, 22 Nov 2024 15:18:58 +0100 Subject: [PATCH 104/135] remove unnecessary imports --- sklearnex/utils/validation.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py 
index f0ed55d86a..17a83ea054 100755 --- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -15,15 +15,12 @@ # =============================================================================== import numbers -import warnings -import numpy as np import scipy.sparse as sp from sklearn.utils.validation import _assert_all_finite as _sklearn_assert_all_finite from sklearn.utils.validation import _num_samples, check_array, check_non_negative from daal4py.sklearn._utils import sklearn_check_version -from onedal.utils._array_api import _get_sycl_namespace, _is_numpy_namespace from onedal.utils.validation import _assert_all_finite as _onedal_assert_all_finite from ._array_api import get_namespace From e7ea94e3fea7d28e213bd36a8816499742dfc15f Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Fri, 22 Nov 2024 15:42:59 +0100 Subject: [PATCH 105/135] fix test slowness --- sklearnex/utils/tests/test_validation.py | 40 +++++++++--------------- 1 file changed, 14 insertions(+), 26 deletions(-) diff --git a/sklearnex/utils/tests/test_validation.py b/sklearnex/utils/tests/test_validation.py index 13934acc7c..d1976decce 100644 --- a/sklearnex/utils/tests/test_validation.py +++ b/sklearnex/utils/tests/test_validation.py @@ -130,9 +130,6 @@ def test_validate_data_random_shape_and_location( @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @pytest.mark.parametrize("check", ["inf", "NaN", None]) -@pytest.mark.parametrize( - "array_api_dispatch", [True, False] if sklearn_check_version("1.2") else [False] -) @pytest.mark.parametrize("seed", [0, int(time.time())]) @pytest.mark.parametrize( "dataframe, queue", @@ -141,7 +138,7 @@ def test_validate_data_random_shape_and_location( ), ) def test__check_sample_weight_random_shape_and_location( - dataframe, queue, dtype, array_api_dispatch, check, seed + dataframe, queue, dtype, check, seed ): # This testing assumes that array api inputs to validate_data will only occur # with sklearn array_api support which began in 
sklearn 1.2. This would assume @@ -170,21 +167,17 @@ def test__check_sample_weight_random_shape_and_location( ) dispatch = {} - if array_api_dispatch: - if dataframe == "pandas": - pytest.skip("pandas inputs do not work with sklearn's array_api_dispatch") - dispatch["array_api_dispatch"] = array_api_dispatch + if dataframe in ["array_api", "dpctl"]: + dispatch["array_api_dispatch"] = True with config_context(**dispatch): if check is None: X_out = _check_sample_weight(X, sample_weight) - if dataframe == "pandas" or ( - dataframe == "array_api" and not array_api_dispatch - ): - assert isinstance(X, np.ndarray) - else: + if dispatch: assert type(X_out) == type(X) + else: + assert isinstance(X, np.ndarray) else: msg_err = "Input sample_weight contains NaN, infinity." with pytest.raises(ValueError, match=msg_err): @@ -192,16 +185,13 @@ def test__check_sample_weight_random_shape_and_location( @pytest.mark.parametrize("dtype", [np.float32, np.float64]) -@pytest.mark.parametrize( - "array_api_dispatch", [True, False] if sklearn_check_version("1.2") else [False] -) @pytest.mark.parametrize( "dataframe, queue", get_dataframes_and_queues( "numpy,pandas" + ("dpctl,array_api" if sklearn_check_version("1.2") else "") ), ) -def test_validate_data_output(array_api_dispatch, dtype, dataframe, queue): +def test_validate_data_output(dtype, dataframe, queue): # This testing assumes that array api inputs to validate_data will only occur # with sklearn array_api support which began in sklearn 1.2. 
This would assume # that somewhere upstream of the validate_data call, a data conversion of dpnp, @@ -210,22 +200,20 @@ def test_validate_data_output(array_api_dispatch, dtype, dataframe, queue): X, y = gen_dataset(est, queue=queue, target_df=dataframe, dtype=dtype)[0] dispatch = {} - if array_api_dispatch: - if dataframe == "pandas": - pytest.skip("pandas inputs do not work with sklearn's array_api_dispatch") - dispatch["array_api_dispatch"] = array_api_dispatch + if dataframe in ["array_api", "dpctl"]: + dispatch["array_api_dispatch"] = True with config_context(**dispatch): X_out, y_out = validate_data(est, X, y) # check sklearn validate_data operations work underneath X_array = validate_data(est, X, reset=False) - if dataframe == "pandas" or (dataframe == "array_api" and not array_api_dispatch): - # array_api_strict from sklearn < 1.2 and pandas will convert to numpy arrays - assert isinstance(X_array, np.ndarray) - assert isinstance(X_out, np.ndarray) - else: + if dispatch: assert type(X) == type( X_array ), f"validate_data converted {type(X)} to {type(X_array)}" assert type(X) == type(X_out), f"from_array converted {type(X)} to {type(X_out)}" + else: + # array_api_strict from sklearn < 1.2 and pandas will convert to numpy arrays + assert isinstance(X_array, np.ndarray) + assert isinstance(X_out, np.ndarray) From dbe108dd0d9c09bc4ec9801c9dce9a71739e874b Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Fri, 22 Nov 2024 15:45:40 +0100 Subject: [PATCH 106/135] focus get_dataframes_and_queues --- sklearnex/utils/tests/test_validation.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/sklearnex/utils/tests/test_validation.py b/sklearnex/utils/tests/test_validation.py index d1976decce..64bc18e280 100644 --- a/sklearnex/utils/tests/test_validation.py +++ b/sklearnex/utils/tests/test_validation.py @@ -65,7 +65,12 @@ def test_sum_infinite_actually_finite(dtype, shape, ensure_all_finite): @pytest.mark.parametrize("ensure_all_finite", 
["allow-nan", True]) @pytest.mark.parametrize("check", ["inf", "NaN", None]) @pytest.mark.parametrize("seed", [0, int(time.time())]) -@pytest.mark.parametrize("dataframe, queue", get_dataframes_and_queues()) +@pytest.mark.parametrize( + "dataframe, queue", + get_dataframes_and_queues( + "numpy,pandas" + ("dpctl,array_api" if sklearn_check_version("1.2") else "") + ), +) def test_validate_data_random_location( dataframe, queue, dtype, shape, ensure_all_finite, check, seed ): @@ -99,7 +104,12 @@ def test_validate_data_random_location( @pytest.mark.parametrize("ensure_all_finite", ["allow-nan", True]) @pytest.mark.parametrize("check", ["inf", "NaN", None]) @pytest.mark.parametrize("seed", [0, int(time.time())]) -@pytest.mark.parametrize("dataframe, queue", get_dataframes_and_queues()) +@pytest.mark.parametrize( + "dataframe, queue", + get_dataframes_and_queues( + "numpy,pandas" + ("dpctl,array_api" if sklearn_check_version("1.2") else "") + ), +) def test_validate_data_random_shape_and_location( dataframe, queue, dtype, ensure_all_finite, check, seed ): From 7284b59910cd839d571a6c586a9b302fcb5d5760 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Fri, 22 Nov 2024 15:50:22 +0100 Subject: [PATCH 107/135] put config_context around --- sklearnex/utils/tests/test_validation.py | 44 +++++++++++++++--------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/sklearnex/utils/tests/test_validation.py b/sklearnex/utils/tests/test_validation.py index 64bc18e280..3c1978e127 100644 --- a/sklearnex/utils/tests/test_validation.py +++ b/sklearnex/utils/tests/test_validation.py @@ -90,14 +90,20 @@ def test_validate_data_random_location( sycl_queue=queue, ) - allow_nan = ensure_all_finite == "allow-nan" - if check is None or (allow_nan and check == "NaN"): - validate_data(est, X, ensure_all_finite=ensure_all_finite) - else: - type_err = "infinity" if allow_nan else "NaN, infinity" - msg_err = f"Input X contains {type_err}." 
- with pytest.raises(ValueError, match=msg_err): + dispatch = {} + if sklearn_check_version("1.2") and dataframe != "pandas": + dispatch["array_api_dispatch"] = True + + with config_context(**dispatch): + + allow_nan = ensure_all_finite == "allow-nan" + if check is None or (allow_nan and check == "NaN"): validate_data(est, X, ensure_all_finite=ensure_all_finite) + else: + type_err = "infinity" if allow_nan else "NaN, infinity" + msg_err = f"Input X contains {type_err}." + with pytest.raises(ValueError, match=msg_err): + validate_data(est, X, ensure_all_finite=ensure_all_finite) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @@ -128,14 +134,20 @@ def test_validate_data_random_shape_and_location( sycl_queue=queue, ) - allow_nan = ensure_all_finite == "allow-nan" - if check is None or (allow_nan and check == "NaN"): - validate_data(est, X, ensure_all_finite=ensure_all_finite) - else: - type_err = "infinity" if allow_nan else "NaN, infinity" - msg_err = f"Input X contains {type_err}." - with pytest.raises(ValueError, match=msg_err): + dispatch = {} + if sklearn_check_version("1.2") and dataframe != "pandas": + dispatch["array_api_dispatch"] = True + + with config_context(**dispatch): + + allow_nan = ensure_all_finite == "allow-nan" + if check is None or (allow_nan and check == "NaN"): validate_data(est, X, ensure_all_finite=ensure_all_finite) + else: + type_err = "infinity" if allow_nan else "NaN, infinity" + msg_err = f"Input X contains {type_err}." 
+ with pytest.raises(ValueError, match=msg_err): + validate_data(est, X, ensure_all_finite=ensure_all_finite) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @@ -177,7 +189,7 @@ def test__check_sample_weight_random_shape_and_location( ) dispatch = {} - if dataframe in ["array_api", "dpctl"]: + if sklearn_check_version("1.2") and dataframe != "pandas": dispatch["array_api_dispatch"] = True with config_context(**dispatch): @@ -210,7 +222,7 @@ def test_validate_data_output(dtype, dataframe, queue): X, y = gen_dataset(est, queue=queue, target_df=dataframe, dtype=dtype)[0] dispatch = {} - if dataframe in ["array_api", "dpctl"]: + if sklearn_check_version("1.2") and dataframe != "pandas": dispatch["array_api_dispatch"] = True with config_context(**dispatch): From e1be91d13c5cef9815ecc9d8a7c3ced8e7386efa Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Sun, 24 Nov 2024 14:28:54 +0100 Subject: [PATCH 108/135] Update test_validation.py --- sklearnex/utils/tests/test_validation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearnex/utils/tests/test_validation.py b/sklearnex/utils/tests/test_validation.py index 3c1978e127..dc6117e6d4 100644 --- a/sklearnex/utils/tests/test_validation.py +++ b/sklearnex/utils/tests/test_validation.py @@ -195,7 +195,7 @@ def test__check_sample_weight_random_shape_and_location( with config_context(**dispatch): if check is None: - X_out = _check_sample_weight(X, sample_weight) + X_out = _check_sample_weight(sample_weight, X) if dispatch: assert type(X_out) == type(X) else: @@ -203,7 +203,7 @@ def test__check_sample_weight_random_shape_and_location( else: msg_err = "Input sample_weight contains NaN, infinity." 
with pytest.raises(ValueError, match=msg_err): - X_out = _check_sample_weight(X, sample_weight) + X_out = _check_sample_weight(sample_weight, X) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) From 8a0f9e9dd1219d2ad1e514c9fecd9055cdfb0d60 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Sun, 24 Nov 2024 15:20:57 +0100 Subject: [PATCH 109/135] Update base.py --- sklearnex/tests/utils/base.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/sklearnex/tests/utils/base.py b/sklearnex/tests/utils/base.py index e484423cfc..706de39a91 100755 --- a/sklearnex/tests/utils/base.py +++ b/sklearnex/tests/utils/base.py @@ -47,7 +47,6 @@ NearestNeighbors, ) from sklearnex.svm import SVC, NuSVC -from sklearnex.utils.validation import validate_data def _load_all_models(with_sklearnex=True, estimator=True): @@ -378,8 +377,6 @@ def _get_processor_info(): class DummyEstimator(BaseEstimator): def fit(self, X, y=None): - X, y = validate_data(self, X, y) - sua_iface, xp, _ = _get_sycl_namespace(X) X_table = to_table(X) y_table = to_table(y) From 52722077467d2844dcce2233f24efb7e5dafd7f4 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Sun, 24 Nov 2024 21:07:29 +0100 Subject: [PATCH 110/135] Update test_validation.py --- sklearnex/utils/tests/test_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearnex/utils/tests/test_validation.py b/sklearnex/utils/tests/test_validation.py index dc6117e6d4..d366a74560 100644 --- a/sklearnex/utils/tests/test_validation.py +++ b/sklearnex/utils/tests/test_validation.py @@ -199,7 +199,7 @@ def test__check_sample_weight_random_shape_and_location( if dispatch: assert type(X_out) == type(X) else: - assert isinstance(X, np.ndarray) + assert isinstance(X_out, np.ndarray) else: msg_err = "Input sample_weight contains NaN, infinity." 
with pytest.raises(ValueError, match=msg_err): From 56b5c4c4730de70243cb158e88ddda6ac38bc082 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Mon, 25 Nov 2024 06:46:06 +0100 Subject: [PATCH 111/135] generalize regex --- sklearnex/utils/tests/test_validation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearnex/utils/tests/test_validation.py b/sklearnex/utils/tests/test_validation.py index d366a74560..3f7fb0758d 100644 --- a/sklearnex/utils/tests/test_validation.py +++ b/sklearnex/utils/tests/test_validation.py @@ -100,8 +100,8 @@ def test_validate_data_random_location( if check is None or (allow_nan and check == "NaN"): validate_data(est, X, ensure_all_finite=ensure_all_finite) else: - type_err = "infinity" if allow_nan else "NaN, infinity" - msg_err = f"Input X contains {type_err}." + type_err = "infinity" if allow_nan else "[NaN|infinity]" + msg_err = f"Input X contains {type_err}" with pytest.raises(ValueError, match=msg_err): validate_data(est, X, ensure_all_finite=ensure_all_finite) @@ -144,7 +144,7 @@ def test_validate_data_random_shape_and_location( if check is None or (allow_nan and check == "NaN"): validate_data(est, X, ensure_all_finite=ensure_all_finite) else: - type_err = "infinity" if allow_nan else "NaN, infinity" + type_err = "infinity" if allow_nan else "[NaN|infinity]" msg_err = f"Input X contains {type_err}." with pytest.raises(ValueError, match=msg_err): validate_data(est, X, ensure_all_finite=ensure_all_finite) @@ -201,7 +201,7 @@ def test__check_sample_weight_random_shape_and_location( else: assert isinstance(X_out, np.ndarray) else: - msg_err = "Input sample_weight contains NaN, infinity." 
+ msg_err = "Input sample_weight contains [NaN|infinity]" with pytest.raises(ValueError, match=msg_err): X_out = _check_sample_weight(sample_weight, X) From 0d1b30607d0fa12c93eb7eeaf9c1b818cee44467 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Mon, 25 Nov 2024 10:29:42 +0100 Subject: [PATCH 112/135] add fixes for sklearn 1.0 and input_name --- sklearnex/utils/validation.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py index 17a83ea054..76470091ce 100755 --- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -56,7 +56,10 @@ def _sklearnex_assert_all_finite( # optimized later xp, _ = get_namespace(X) if X.size < 32768 or X.dtype not in [xp.float32, xp.float64] or not _is_contiguous(X): - _sklearn_assert_all_finite(X, allow_nan=allow_nan, input_name=input_name) + if sklearn_check_version("1.1"): + _sklearn_assert_all_finite(X, allow_nan=allow_nan, input_name=input_name) + else: + _sklearn_assert_all_finite(X, allow_nan=allow_nan) else: _onedal_assert_all_finite(X, allow_nan=allow_nan, input_name=input_name) @@ -122,17 +125,16 @@ def _check_sample_weight( if dtype is None: dtype = [xp.float64, xp.float32] - # create param dict such that the variable _finite_keyword can - # be added to it without direct sklearn_check_version maintenance params = { "accept_sparse": False, "ensure_2d": False, "dtype": dtype, "order": "C", "copy": copy, - "input_name": "sample_weight", _finite_keyword: False, } + if sklearn_check_version("1.1"): + params["input_name"] = "sample_weight" sample_weight = check_array(sample_weight, **params) assert_all_finite(sample_weight, input_name="sample_weight") From 8ff312eecb289815b8e5ff65c558a39dadb1a72d Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Mon, 25 Nov 2024 10:35:24 +0100 Subject: [PATCH 113/135] fixes for test failures --- sklearnex/utils/tests/test_validation.py | 23 +++++++++++------------ 1 file changed, 11 
insertions(+), 12 deletions(-) diff --git a/sklearnex/utils/tests/test_validation.py b/sklearnex/utils/tests/test_validation.py index 3f7fb0758d..92ba0d742a 100644 --- a/sklearnex/utils/tests/test_validation.py +++ b/sklearnex/utils/tests/test_validation.py @@ -29,6 +29,13 @@ from sklearnex.tests.utils import DummyEstimator, gen_dataset from sklearnex.utils.validation import _check_sample_weight, validate_data +# array_api support starts in sklearn 1.2, and array_api_strict conformance starts in sklearn 1.3 +_dataframes_supported = ( + "numpy,pandas" + + (",dpctl" if sklearn_check_version("1.2") else "") + + (",array_api" if sklearn_check_version("1.3") else "") +) + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @pytest.mark.parametrize( @@ -67,9 +74,7 @@ def test_sum_infinite_actually_finite(dtype, shape, ensure_all_finite): @pytest.mark.parametrize("seed", [0, int(time.time())]) @pytest.mark.parametrize( "dataframe, queue", - get_dataframes_and_queues( - "numpy,pandas" + ("dpctl,array_api" if sklearn_check_version("1.2") else "") - ), + get_dataframes_and_queues(_dataframes_supported), ) def test_validate_data_random_location( dataframe, queue, dtype, shape, ensure_all_finite, check, seed @@ -112,9 +117,7 @@ def test_validate_data_random_location( @pytest.mark.parametrize("seed", [0, int(time.time())]) @pytest.mark.parametrize( "dataframe, queue", - get_dataframes_and_queues( - "numpy,pandas" + ("dpctl,array_api" if sklearn_check_version("1.2") else "") - ), + get_dataframes_and_queues(_dataframes_supported), ) def test_validate_data_random_shape_and_location( dataframe, queue, dtype, ensure_all_finite, check, seed @@ -155,9 +158,7 @@ def test_validate_data_random_shape_and_location( @pytest.mark.parametrize("seed", [0, int(time.time())]) @pytest.mark.parametrize( "dataframe, queue", - get_dataframes_and_queues( - "numpy,pandas" + ("dpctl,array_api" if sklearn_check_version("1.2") else "") - ), + get_dataframes_and_queues(_dataframes_supported), ) 
def test__check_sample_weight_random_shape_and_location( dataframe, queue, dtype, check, seed @@ -209,9 +210,7 @@ def test__check_sample_weight_random_shape_and_location( @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @pytest.mark.parametrize( "dataframe, queue", - get_dataframes_and_queues( - "numpy,pandas" + ("dpctl,array_api" if sklearn_check_version("1.2") else "") - ), + get_dataframes_and_queues(_dataframes_supported), ) def test_validate_data_output(dtype, dataframe, queue): # This testing assumes that array api inputs to validate_data will only occur From 87b7e3b461c431d07da1114c15ea8e9ca3c9c4b9 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Mon, 25 Nov 2024 21:42:18 +0100 Subject: [PATCH 114/135] Update validation.py --- onedal/utils/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onedal/utils/validation.py b/onedal/utils/validation.py index 836dd84a75..38dcfd3fb3 100644 --- a/onedal/utils/validation.py +++ b/onedal/utils/validation.py @@ -449,7 +449,7 @@ def _assert_all_finite(X, allow_nan=False, input_name=""): policy = _get_policy(None, X) X_t = to_table(_convert_to_supported(policy, X)) params = { - "fptype": "float" if X_t.dtype == np.float32 else "double", + "fptype": X_t.dtype, "method": "dense", "allow_nan": allow_nan, } From 29e8f8c1a34aad809695d86d55bb197bb6e3fae1 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Mon, 25 Nov 2024 21:42:56 +0100 Subject: [PATCH 115/135] Update test_validation.py --- onedal/utils/tests/test_validation.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/onedal/utils/tests/test_validation.py b/onedal/utils/tests/test_validation.py index 7662f486f3..37486f0337 100644 --- a/onedal/utils/tests/test_validation.py +++ b/onedal/utils/tests/test_validation.py @@ -137,9 +137,8 @@ def test_assert_finite_sparse(dtype, allow_nan, check, seed): ) if check: - locx = rand.randint(0, X.shape[0] - 1) - locy = rand.randint(0, X.shape[1] - 1) - X[locx, locy] = 
float(check) + locx = rand.randint(0, X.data.shape[0] - 1) + X.data[locx] = float(check) if check is None or (allow_nan and check == "NaN"): assert_all_finite(X, allow_nan=allow_nan) From 1175a98419f4b613bf7fa36773c425e1a3b92f00 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Tue, 26 Nov 2024 12:45:10 +0100 Subject: [PATCH 116/135] don't have more time at the moment to do this. --- onedal/basic_statistics/basic_statistics.py | 66 ++++++++------------- 1 file changed, 25 insertions(+), 41 deletions(-) diff --git a/onedal/basic_statistics/basic_statistics.py b/onedal/basic_statistics/basic_statistics.py index c60d1599ac..549524a533 100644 --- a/onedal/basic_statistics/basic_statistics.py +++ b/onedal/basic_statistics/basic_statistics.py @@ -22,12 +22,14 @@ from ..common._base import BaseEstimator from ..datatypes import _convert_to_supported, from_table, to_table from ..utils import _is_csr -from ..utils.validation import _check_array -class BaseBasicStatistics(BaseEstimator, metaclass=ABCMeta): +class BasicStatistics(BaseEstimator, metaclass=ABCMeta): + """ + Basic Statistics oneDAL implementation. 
+ """ @abstractmethod - def __init__(self, result_options, algorithm): + def __init__(self, result_options="all", algorithm="by_default"): self.options = result_options self.algorithm = algorithm @@ -46,62 +48,44 @@ def get_all_result_options(): "second_order_raw_moment", ] - def _get_result_options(self, options): - if options == "all": - options = self.get_all_result_options() - if isinstance(options, list): - options = "|".join(options) - assert isinstance(options, str) - return options - + @property + def options(self): + if self._options == "all": + return self.get_all_result_options() + return self._options + + @options.setter + def options(self, options): + # options always to be an iterable + self._options = options.split("|") if isinstance(options, str) else options + def _get_onedal_params(self, is_csr, dtype=np.float32): - options = self._get_result_options(self.options) return { "fptype": dtype, "method": "sparse" if is_csr else self.algorithm, - "result_option": options, + "result_option": "|".join(self.options), } - -class BasicStatistics(BaseBasicStatistics): - """ - Basic Statistics oneDAL implementation. 
- """ - - def __init__(self, result_options="all", algorithm="by_default"): - super().__init__(result_options, algorithm) - def fit(self, data, sample_weight=None, queue=None): policy = self._get_policy(queue, data, sample_weight) is_csr = _is_csr(data) - if data is not None and not is_csr: - data = _check_array(data, ensure_2d=False) - if sample_weight is not None: - sample_weight = _check_array(sample_weight, ensure_2d=False) - data, sample_weight = _convert_to_supported(policy, data, sample_weight) is_single_dim = data.ndim == 1 data_table, weights_table = to_table(data, sample_weight) - dtype = data.dtype - raw_result = self._compute_raw(data_table, weights_table, policy, dtype, is_csr) - for opt, raw_value in raw_result.items(): - value = from_table(raw_value).ravel() + dtype = data_table.dtype + module = self._get_backend("basic_statistics") + params = self._get_onedal_params(is_csr, data_table.dtype) + result = module.compute(policy, params, data_table, weights_table) + + for opt in self.options: + value = from_table(getattr(result, opt)).ravel() if is_single_dim: - setattr(self, opt, value[0]) + setattr(self, getattr(raw_result, opt), value[0]) else: setattr(self, opt, value) return self - def _compute_raw( - self, data_table, weights_table, policy, dtype=np.float32, is_csr=False - ): - module = self._get_backend("basic_statistics") - params = self._get_onedal_params(is_csr, dtype) - result = module.compute(policy, params, data_table, weights_table) - options = self._get_result_options(self.options).split("|") - - return {opt: getattr(result, opt) for opt in options} From 50ba766ade5294a1c0bbc422b49aeee11ab4f09b Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Thu, 28 Nov 2024 06:52:24 +0100 Subject: [PATCH 117/135] remove old code --- onedal/basic_statistics/basic_statistics.py | 10 ++--- .../incremental_basic_statistics.py | 40 ++++++------------- 2 files changed, 17 insertions(+), 33 deletions(-) diff --git a/onedal/basic_statistics/basic_statistics.py 
b/onedal/basic_statistics/basic_statistics.py index 549524a533..580c5c5092 100644 --- a/onedal/basic_statistics/basic_statistics.py +++ b/onedal/basic_statistics/basic_statistics.py @@ -28,6 +28,7 @@ class BasicStatistics(BaseEstimator, metaclass=ABCMeta): """ Basic Statistics oneDAL implementation. """ + @abstractmethod def __init__(self, result_options="all", algorithm="by_default"): self.options = result_options @@ -58,7 +59,7 @@ def options(self): def options(self, options): # options always to be an iterable self._options = options.split("|") if isinstance(options, str) else options - + def _get_onedal_params(self, is_csr, dtype=np.float32): return { "fptype": dtype, @@ -79,13 +80,12 @@ def fit(self, data, sample_weight=None, queue=None): module = self._get_backend("basic_statistics") params = self._get_onedal_params(is_csr, data_table.dtype) result = module.compute(policy, params, data_table, weights_table) - + for opt in self.options: - value = from_table(getattr(result, opt)).ravel() + value = from_table(getattr(result, opt))[:, 0] # two-dimensional table [n, 1] if is_single_dim: - setattr(self, getattr(raw_result, opt), value[0]) + setattr(self, opt, value[0]) else: setattr(self, opt, value) return self - diff --git a/onedal/basic_statistics/incremental_basic_statistics.py b/onedal/basic_statistics/incremental_basic_statistics.py index 4935a57a47..3501ce3521 100644 --- a/onedal/basic_statistics/incremental_basic_statistics.py +++ b/onedal/basic_statistics/incremental_basic_statistics.py @@ -16,14 +16,11 @@ import numpy as np -from daal4py.sklearn._utils import get_dtype - from ..datatypes import _convert_to_supported, from_table, to_table -from ..utils import _check_array -from .basic_statistics import BaseBasicStatistics +from .basic_statistics import BasicStatistics -class IncrementalBasicStatistics(BaseBasicStatistics): +class IncrementalBasicStatistics(BasicStatistics): """ Incremental estimator for basic statistics based on oneDAL implementation. 
Allows to compute basic statistics if data are splitted into batches. @@ -65,8 +62,8 @@ class IncrementalBasicStatistics(BaseBasicStatistics): Second order moment of each feature over all samples. """ - def __init__(self, result_options="all"): - super().__init__(result_options, algorithm="by_default") + def __init__(self, result_options="all", algorithm="by_default"): + super().__init__(result_options, algorithm) self._reset() def _reset(self): @@ -74,7 +71,7 @@ def _reset(self): "basic_statistics", None, "partial_compute_result" ) - def partial_fit(self, X, weights=None, queue=None): + def partial_fit(self, X, sample_weight=None, queue=None): """ Computes partial data for basic statistics from data batch X and saves it to `_partial_result`. @@ -95,24 +92,11 @@ def partial_fit(self, X, weights=None, queue=None): """ self._queue = queue policy = self._get_policy(queue, X) - X, weights = _convert_to_supported(policy, X, weights) - - X = _check_array( - X, dtype=[np.float64, np.float32], ensure_2d=False, force_all_finite=False - ) - if weights is not None: - weights = _check_array( - weights, - dtype=[np.float64, np.float32], - ensure_2d=False, - force_all_finite=False, - ) + X, sample_weight = to_table(_convert_to_supported(policy, X, sample_weight)) if not hasattr(self, "_onedal_params"): - dtype = get_dtype(X) - self._onedal_params = self._get_onedal_params(False, dtype=dtype) + self._onedal_params = self._get_onedal_params(False, dtype=X.dtype) - X_table, weights_table = to_table(X, weights) self._partial_result = self._get_backend( "basic_statistics", None, @@ -120,8 +104,8 @@ def partial_fit(self, X, weights=None, queue=None): policy, self._onedal_params, self._partial_result, - X_table, - weights_table, + X, + sample_weight, ) def finalize_fit(self, queue=None): @@ -153,8 +137,8 @@ def finalize_fit(self, queue=None): self._onedal_params, self._partial_result, ) - options = self._get_result_options(self.options).split("|") - for opt in options: - setattr(self, 
opt, from_table(getattr(result, opt)).ravel()) + + for opt in self.options: + setattr(self, opt, from_table(getattr(result, opt))[:, 0]) return self From 05ef656a15cc70dd512d5a5b6460741e93367f04 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Thu, 28 Nov 2024 07:27:05 +0100 Subject: [PATCH 118/135] interim stop --- .../basic_statistics/basic_statistics.py | 38 ++++--------------- 1 file changed, 8 insertions(+), 30 deletions(-) diff --git a/sklearnex/basic_statistics/basic_statistics.py b/sklearnex/basic_statistics/basic_statistics.py index da82e3bd82..092bc0974d 100644 --- a/sklearnex/basic_statistics/basic_statistics.py +++ b/sklearnex/basic_statistics/basic_statistics.py @@ -19,7 +19,6 @@ import numpy as np from sklearn.base import BaseEstimator from sklearn.utils import check_array -from sklearn.utils.validation import _check_sample_weight from daal4py.sklearn._n_jobs_support import control_n_jobs from daal4py.sklearn._utils import sklearn_check_version @@ -27,11 +26,8 @@ from .._device_offload import dispatch from .._utils import IntelEstimator, PatchingConditionsChain - -if sklearn_check_version("1.6"): - from sklearn.utils.validation import validate_data -else: - validate_data = BaseEstimator._validate_data +from ..utils._array_api import get_namespace +from ..utils.validation import _check_sample_weight, validate_data if sklearn_check_version("1.2"): from sklearn.utils._param_validation import StrOptions @@ -130,30 +126,11 @@ def __init__(self, result_options="all"): def _save_attributes(self): assert hasattr(self, "_onedal_estimator") - - if self.result_options == "all": - result_options = onedal_BasicStatistics.get_all_result_options() - else: - result_options = self.result_options - - if isinstance(result_options, str): - setattr( - self, - result_options + "_", - getattr(self._onedal_estimator, result_options), - ) - elif isinstance(result_options, list): - for option in result_options: - setattr(self, option + "_", getattr(self._onedal_estimator, 
option)) + for option in self._onedal_estimator.options: + setattr(self, option + "_", getattr(self._onedal_estimator, option)) def __getattr__(self, attr): - if self.result_options == "all": - result_options = onedal_BasicStatistics.get_all_result_options() - else: - result_options = self.result_options - is_deprecated_attr = ( - isinstance(result_options, str) and (attr == result_options) - ) or (isinstance(result_options, list) and (attr in result_options)) + is_deprecated_attr = attr in self._onedal_estimator.options if is_deprecated_attr: warnings.warn( "Result attributes without a trailing underscore were deprecated in version 2025.1 and will be removed in 2026.0" @@ -179,10 +156,11 @@ def _onedal_fit(self, X, sample_weight=None, queue=None): if sklearn_check_version("1.2"): self._validate_params() + xp, _ = get_namespace(X) if sklearn_check_version("1.0"): - X = validate_data(self, X, dtype=[np.float64, np.float32], ensure_2d=False) + X = validate_data(self, X, dtype=[xp.float64, xp.float32], ensure_2d=False) else: - X = check_array(X, dtype=[np.float64, np.float32]) + X = check_array(X, dtype=[xp.float64, xp.float32]) if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X) From 68ffc45f7c0664871eb6dd39e9ec9dce62b82c39 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Thu, 28 Nov 2024 09:02:45 +0100 Subject: [PATCH 119/135] attempt at fixing --- .../basic_statistics/basic_statistics.py | 6 ++- .../incremental_basic_statistics.py | 47 +++++++------------ 2 files changed, 22 insertions(+), 31 deletions(-) diff --git a/sklearnex/basic_statistics/basic_statistics.py b/sklearnex/basic_statistics/basic_statistics.py index 092bc0974d..035a2160f6 100644 --- a/sklearnex/basic_statistics/basic_statistics.py +++ b/sklearnex/basic_statistics/basic_statistics.py @@ -130,7 +130,11 @@ def _save_attributes(self): setattr(self, option + "_", getattr(self._onedal_estimator, option)) def __getattr__(self, attr): - is_deprecated_attr = attr in 
self._onedal_estimator.options + is_deprecated_attr = ( + attr in self._onedal_estimator.options + if hasattr(self, "_onedal_estimator") + else False + ) if is_deprecated_attr: warnings.warn( "Result attributes without a trailing underscore were deprecated in version 2025.1 and will be removed in 2026.0" diff --git a/sklearnex/basic_statistics/incremental_basic_statistics.py b/sklearnex/basic_statistics/incremental_basic_statistics.py index bafd0d8a57..e0c0717142 100644 --- a/sklearnex/basic_statistics/incremental_basic_statistics.py +++ b/sklearnex/basic_statistics/incremental_basic_statistics.py @@ -17,7 +17,6 @@ import numpy as np from sklearn.base import BaseEstimator from sklearn.utils import check_array, gen_batches -from sklearn.utils.validation import _check_sample_weight from daal4py.sklearn._n_jobs_support import control_n_jobs from daal4py.sklearn._utils import sklearn_check_version @@ -34,10 +33,8 @@ import numbers import warnings -if sklearn_check_version("1.6"): - from sklearn.utils.validation import validate_data -else: - validate_data = BaseEstimator._validate_data +from ..utils._array_api import get_namespace +from ..utils.validation import _check_sample_weight, validate_data @control_n_jobs(decorated_methods=["partial_fit", "_onedal_finalize_fit"]) @@ -153,12 +150,7 @@ class IncrementalBasicStatistics(IntelEstimator, BaseEstimator): } def __init__(self, result_options="all", batch_size=None): - if result_options == "all": - self.result_options = ( - self._onedal_incremental_basic_statistics.get_all_result_options() - ) - else: - self.result_options = result_options + self.result_options = result_options self._need_to_finalize = False self.batch_size = batch_size @@ -171,14 +163,6 @@ def _onedal_supported(self, method_name, *data): _onedal_cpu_supported = _onedal_supported _onedal_gpu_supported = _onedal_supported - def _get_onedal_result_options(self, options): - if isinstance(options, list): - onedal_options = "|".join(self.result_options) - 
else: - onedal_options = options - assert isinstance(onedal_options, str) - return options - def _onedal_finalize_fit(self, queue=None): assert hasattr(self, "_onedal_estimator") self._onedal_estimator.finalize_fit(queue=queue) @@ -188,6 +172,7 @@ def _onedal_partial_fit(self, X, sample_weight=None, queue=None, check_input=Tru first_pass = not hasattr(self, "n_samples_seen_") or self.n_samples_seen_ == 0 if check_input: + xp, _ = get_namespace(X) if sklearn_check_version("1.0"): X = validate_data( self, @@ -210,27 +195,28 @@ def _onedal_partial_fit(self, X, sample_weight=None, queue=None, check_input=Tru else: self.n_samples_seen_ += X.shape[0] - onedal_params = { - "result_options": self._get_onedal_result_options(self.result_options) - } if not hasattr(self, "_onedal_estimator"): self._onedal_estimator = self._onedal_incremental_basic_statistics( - **onedal_params + result_options=self.result_options ) - self._onedal_estimator.partial_fit(X, weights=sample_weight, queue=queue) + + self._onedal_estimator.partial_fit(X, sample_weight=sample_weight, queue=queue) self._need_to_finalize = True def _onedal_fit(self, X, sample_weight=None, queue=None): if sklearn_check_version("1.2"): self._validate_params() + xp, _ = get_namespace(X) if sklearn_check_version("1.0"): - X = validate_data(self, X, dtype=[np.float64, np.float32]) + X = validate_data(self, X, dtype=[xp.float64, xp.float32]) else: - X = check_array(X, dtype=[np.float64, np.float32]) + X = check_array(X, dtype=[xp.float64, xp.float32]) if sample_weight is not None: - sample_weight = _check_sample_weight(sample_weight, X) + sample_weight = _check_sample_weight( + sample_weight, X, dtype=[xp.float64, xp.float32] + ) n_samples, n_features = X.shape if self.batch_size is None: @@ -256,11 +242,12 @@ def _onedal_fit(self, X, sample_weight=None, queue=None): return self def __getattr__(self, attr): - result_options = self.__dict__["result_options"] sattr = attr.removesuffix("_") is_statistic_attr = ( - 
isinstance(result_options, str) and (sattr == result_options) - ) or (isinstance(result_options, list) and (sattr in result_options)) + sattr in self._onedal_estimator.options + if hasattr(self, "_onedal_estimator") + else False + ) if is_statistic_attr: if self._need_to_finalize: self._onedal_finalize_fit() From cfeb2c5e0814980b8401c330d3bef91df5838010 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Thu, 28 Nov 2024 09:25:19 +0100 Subject: [PATCH 120/135] remover abstractmethod --- onedal/basic_statistics/basic_statistics.py | 1 - 1 file changed, 1 deletion(-) diff --git a/onedal/basic_statistics/basic_statistics.py b/onedal/basic_statistics/basic_statistics.py index 580c5c5092..f16741ff90 100644 --- a/onedal/basic_statistics/basic_statistics.py +++ b/onedal/basic_statistics/basic_statistics.py @@ -29,7 +29,6 @@ class BasicStatistics(BaseEstimator, metaclass=ABCMeta): Basic Statistics oneDAL implementation. """ - @abstractmethod def __init__(self, result_options="all", algorithm="by_default"): self.options = result_options self.algorithm = algorithm From d3a69c6b3a989cd7282b89ad29023fd822a630d5 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Thu, 28 Nov 2024 10:13:51 +0100 Subject: [PATCH 121/135] fix issues --- onedal/basic_statistics/basic_statistics.py | 10 +++++----- .../basic_statistics/incremental_basic_statistics.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/onedal/basic_statistics/basic_statistics.py b/onedal/basic_statistics/basic_statistics.py index f16741ff90..0d0cc99461 100644 --- a/onedal/basic_statistics/basic_statistics.py +++ b/onedal/basic_statistics/basic_statistics.py @@ -71,14 +71,14 @@ def fit(self, data, sample_weight=None, queue=None): is_csr = _is_csr(data) - data, sample_weight = _convert_to_supported(policy, data, sample_weight) is_single_dim = data.ndim == 1 - data_table, weights_table = to_table(data, sample_weight) + data, sample_weight = to_table( + *_convert_to_supported(policy, data, sample_weight) + ) 
- dtype = data_table.dtype module = self._get_backend("basic_statistics") - params = self._get_onedal_params(is_csr, data_table.dtype) - result = module.compute(policy, params, data_table, weights_table) + params = self._get_onedal_params(is_csr, data.dtype) + result = module.compute(policy, params, data, sample_weight) for opt in self.options: value = from_table(getattr(result, opt))[:, 0] # two-dimensional table [n, 1] diff --git a/onedal/basic_statistics/incremental_basic_statistics.py b/onedal/basic_statistics/incremental_basic_statistics.py index 3501ce3521..61c0634675 100644 --- a/onedal/basic_statistics/incremental_basic_statistics.py +++ b/onedal/basic_statistics/incremental_basic_statistics.py @@ -92,7 +92,7 @@ def partial_fit(self, X, sample_weight=None, queue=None): """ self._queue = queue policy = self._get_policy(queue, X) - X, sample_weight = to_table(_convert_to_supported(policy, X, sample_weight)) + X, sample_weight = to_table(*_convert_to_supported(policy, X, sample_weight)) if not hasattr(self, "_onedal_params"): self._onedal_params = self._get_onedal_params(False, dtype=X.dtype) From c74485d806f15cfff9c10bd68c92be0666af82c2 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Thu, 28 Nov 2024 10:26:16 +0100 Subject: [PATCH 122/135] fix sample weights --- sklearnex/basic_statistics/basic_statistics.py | 4 +++- sklearnex/basic_statistics/incremental_basic_statistics.py | 6 ++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/sklearnex/basic_statistics/basic_statistics.py b/sklearnex/basic_statistics/basic_statistics.py index 035a2160f6..32b9d50562 100644 --- a/sklearnex/basic_statistics/basic_statistics.py +++ b/sklearnex/basic_statistics/basic_statistics.py @@ -167,7 +167,9 @@ def _onedal_fit(self, X, sample_weight=None, queue=None): X = check_array(X, dtype=[xp.float64, xp.float32]) if sample_weight is not None: - sample_weight = _check_sample_weight(sample_weight, X) + sample_weight = _check_sample_weight( + sample_weight, X, 
dtype=[xp.float64, xp.float32] + ) onedal_params = { "result_options": self.result_options, diff --git a/sklearnex/basic_statistics/incremental_basic_statistics.py b/sklearnex/basic_statistics/incremental_basic_statistics.py index e0c0717142..9ec934ab9b 100644 --- a/sklearnex/basic_statistics/incremental_basic_statistics.py +++ b/sklearnex/basic_statistics/incremental_basic_statistics.py @@ -186,8 +186,10 @@ def _onedal_partial_fit(self, X, sample_weight=None, queue=None, check_input=Tru dtype=[np.float64, np.float32], ) - if sample_weight is not None: - sample_weight = _check_sample_weight(sample_weight, X) + if sample_weight is not None: + sample_weight = _check_sample_weight( + sample_weight, X, dtype=[np.float64, np.float32] + ) if first_pass: self.n_samples_seen_ = X.shape[0] From ee3c4751ad9cf3ebb992d7d7e6c8d3bff85e6e8a Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Thu, 28 Nov 2024 10:28:40 +0100 Subject: [PATCH 123/135] remove numpy --- sklearnex/basic_statistics/basic_statistics.py | 1 - sklearnex/basic_statistics/incremental_basic_statistics.py | 7 +++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/sklearnex/basic_statistics/basic_statistics.py b/sklearnex/basic_statistics/basic_statistics.py index 32b9d50562..eb7c6f7fe1 100644 --- a/sklearnex/basic_statistics/basic_statistics.py +++ b/sklearnex/basic_statistics/basic_statistics.py @@ -16,7 +16,6 @@ import warnings -import numpy as np from sklearn.base import BaseEstimator from sklearn.utils import check_array diff --git a/sklearnex/basic_statistics/incremental_basic_statistics.py b/sklearnex/basic_statistics/incremental_basic_statistics.py index 9ec934ab9b..c1935c88ef 100644 --- a/sklearnex/basic_statistics/incremental_basic_statistics.py +++ b/sklearnex/basic_statistics/incremental_basic_statistics.py @@ -14,7 +14,6 @@ # limitations under the License. 
# ============================================================================== -import numpy as np from sklearn.base import BaseEstimator from sklearn.utils import check_array, gen_batches @@ -177,18 +176,18 @@ def _onedal_partial_fit(self, X, sample_weight=None, queue=None, check_input=Tru X = validate_data( self, X, - dtype=[np.float64, np.float32], + dtype=[xp.float64, xp.float32], reset=first_pass, ) else: X = check_array( X, - dtype=[np.float64, np.float32], + dtype=[xp.float64, xp.float32], ) if sample_weight is not None: sample_weight = _check_sample_weight( - sample_weight, X, dtype=[np.float64, np.float32] + sample_weight, X, dtype=[xp.float64, xp.float32] ) if first_pass: From e135c47e498038b7ebc1ccc10ebdf58ea2affebb Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Thu, 28 Nov 2024 10:49:38 +0100 Subject: [PATCH 124/135] try again --- onedal/basic_statistics/basic_statistics.py | 2 +- onedal/basic_statistics/incremental_basic_statistics.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/onedal/basic_statistics/basic_statistics.py b/onedal/basic_statistics/basic_statistics.py index 0d0cc99461..44d9235966 100644 --- a/onedal/basic_statistics/basic_statistics.py +++ b/onedal/basic_statistics/basic_statistics.py @@ -81,7 +81,7 @@ def fit(self, data, sample_weight=None, queue=None): result = module.compute(policy, params, data, sample_weight) for opt in self.options: - value = from_table(getattr(result, opt))[:, 0] # two-dimensional table [n, 1] + value = from_table(getattr(result, opt))[0] # two-dimensional table [1, n] if is_single_dim: setattr(self, opt, value[0]) else: diff --git a/onedal/basic_statistics/incremental_basic_statistics.py b/onedal/basic_statistics/incremental_basic_statistics.py index 61c0634675..fd440e794d 100644 --- a/onedal/basic_statistics/incremental_basic_statistics.py +++ b/onedal/basic_statistics/incremental_basic_statistics.py @@ -139,6 +139,6 @@ def finalize_fit(self, queue=None): ) for opt in self.options: 
- setattr(self, opt, from_table(getattr(result, opt))[:, 0]) + setattr(self, opt, from_table(getattr(result, opt))[0]) return self From 39257bba5d8f25fe6879b7339a5a63a4b1e9b50a Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Thu, 28 Nov 2024 11:03:36 +0100 Subject: [PATCH 125/135] reintroduce _compute_raw for kmeans --- onedal/basic_statistics/basic_statistics.py | 14 +++++++++++--- onedal/cluster/kmeans.py | 2 +- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/onedal/basic_statistics/basic_statistics.py b/onedal/basic_statistics/basic_statistics.py index 44d9235966..40ebac3e88 100644 --- a/onedal/basic_statistics/basic_statistics.py +++ b/onedal/basic_statistics/basic_statistics.py @@ -76,9 +76,7 @@ def fit(self, data, sample_weight=None, queue=None): *_convert_to_supported(policy, data, sample_weight) ) - module = self._get_backend("basic_statistics") - params = self._get_onedal_params(is_csr, data.dtype) - result = module.compute(policy, params, data, sample_weight) + result = self._compute_raw(data, sample_weight, policy, data.dtype, is_csr) for opt in self.options: value = from_table(getattr(result, opt))[0] # two-dimensional table [1, n] @@ -88,3 +86,13 @@ def fit(self, data, sample_weight=None, queue=None): setattr(self, opt, value) return self + + def _compute_raw( + self, data_table, weights_table, policy, dtype=None, is_csr=False + ): + # This function is maintained for internal use by KMeans tolerance + # calculations, but is otherwise considered legacy code and is not + # to be used externally in any circumstance + module = self._get_backend("basic_statistics") + params = self._get_onedal_params(is_csr, dtype) + return module.compute(policy, params, data_table, weights_table) \ No newline at end of file diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index 93eadf8c6b..1d661f140c 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -89,7 +89,7 @@ def _tolerance(self, X_table, rtol, is_csr, policy, 
dtype): bs = self._get_basic_statistics_backend("variance") res = bs._compute_raw(X_table, dummy, policy, dtype, is_csr) - mean_var = from_table(res["variance"]).mean() + mean_var = from_table(res.variance).mean() return mean_var * rtol From afed17557006abd927f9bdd20cc289162eab796e Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Thu, 28 Nov 2024 11:04:09 +0100 Subject: [PATCH 126/135] formatting --- onedal/basic_statistics/basic_statistics.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/onedal/basic_statistics/basic_statistics.py b/onedal/basic_statistics/basic_statistics.py index 40ebac3e88..1f484377d1 100644 --- a/onedal/basic_statistics/basic_statistics.py +++ b/onedal/basic_statistics/basic_statistics.py @@ -87,12 +87,10 @@ def fit(self, data, sample_weight=None, queue=None): return self - def _compute_raw( - self, data_table, weights_table, policy, dtype=None, is_csr=False - ): + def _compute_raw(self, data_table, weights_table, policy, dtype=None, is_csr=False): # This function is maintained for internal use by KMeans tolerance # calculations, but is otherwise considered legacy code and is not # to be used externally in any circumstance module = self._get_backend("basic_statistics") params = self._get_onedal_params(is_csr, dtype) - return module.compute(policy, params, data_table, weights_table) \ No newline at end of file + return module.compute(policy, params, data_table, weights_table) From 71cb39c4d0529112bd7b509807321330147dfe4d Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Thu, 28 Nov 2024 11:34:41 +0100 Subject: [PATCH 127/135] iterable fix --- onedal/basic_statistics/basic_statistics.py | 10 ++++------ .../basic_statistics/incremental_basic_statistics.py | 2 -- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/onedal/basic_statistics/basic_statistics.py b/onedal/basic_statistics/basic_statistics.py index 1f484377d1..9a4c8ede79 100644 --- a/onedal/basic_statistics/basic_statistics.py +++ 
b/onedal/basic_statistics/basic_statistics.py @@ -17,8 +17,6 @@ import warnings from abc import ABCMeta, abstractmethod -import numpy as np - from ..common._base import BaseEstimator from ..datatypes import _convert_to_supported, from_table, to_table from ..utils import _is_csr @@ -50,16 +48,16 @@ def get_all_result_options(): @property def options(self): - if self._options == "all": + if self._options[0] == "all": return self.get_all_result_options() return self._options @options.setter - def options(self, options): + def options(self, opts): # options always to be an iterable - self._options = options.split("|") if isinstance(options, str) else options + self._options = opts.split("|") if isinstance(opts, str) else opts - def _get_onedal_params(self, is_csr, dtype=np.float32): + def _get_onedal_params(self, is_csr, dtype=None): return { "fptype": dtype, "method": "sparse" if is_csr else self.algorithm, diff --git a/onedal/basic_statistics/incremental_basic_statistics.py b/onedal/basic_statistics/incremental_basic_statistics.py index fd440e794d..457cf59507 100644 --- a/onedal/basic_statistics/incremental_basic_statistics.py +++ b/onedal/basic_statistics/incremental_basic_statistics.py @@ -14,8 +14,6 @@ # limitations under the License. 
# ============================================================================== -import numpy as np - from ..datatypes import _convert_to_supported, from_table, to_table from .basic_statistics import BasicStatistics From fcb543cfcce09abfb9542a6f3be9e45ef8c1b42d Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Thu, 28 Nov 2024 11:35:36 +0100 Subject: [PATCH 128/135] make stricter --- onedal/basic_statistics/basic_statistics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onedal/basic_statistics/basic_statistics.py b/onedal/basic_statistics/basic_statistics.py index 9a4c8ede79..9dc82e8757 100644 --- a/onedal/basic_statistics/basic_statistics.py +++ b/onedal/basic_statistics/basic_statistics.py @@ -48,7 +48,7 @@ def get_all_result_options(): @property def options(self): - if self._options[0] == "all": + if self._options == ["all"]: return self.get_all_result_options() return self._options From 11f3c76f128d9ede1e374ea098799f39c311c385 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Thu, 28 Nov 2024 14:05:51 +0100 Subject: [PATCH 129/135] attempt at fixing recursion issue --- sklearnex/basic_statistics/basic_statistics.py | 2 +- sklearnex/basic_statistics/incremental_basic_statistics.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearnex/basic_statistics/basic_statistics.py b/sklearnex/basic_statistics/basic_statistics.py index eb7c6f7fe1..94c6f607e2 100644 --- a/sklearnex/basic_statistics/basic_statistics.py +++ b/sklearnex/basic_statistics/basic_statistics.py @@ -131,7 +131,7 @@ def _save_attributes(self): def __getattr__(self, attr): is_deprecated_attr = ( attr in self._onedal_estimator.options - if hasattr(self, "_onedal_estimator") + if "_onedal_estimator" in self.__dict__ else False ) if is_deprecated_attr: diff --git a/sklearnex/basic_statistics/incremental_basic_statistics.py b/sklearnex/basic_statistics/incremental_basic_statistics.py index c1935c88ef..8bfe2377fb 100644 --- 
a/sklearnex/basic_statistics/incremental_basic_statistics.py +++ b/sklearnex/basic_statistics/incremental_basic_statistics.py @@ -246,7 +246,7 @@ def __getattr__(self, attr): sattr = attr.removesuffix("_") is_statistic_attr = ( sattr in self._onedal_estimator.options - if hasattr(self, "_onedal_estimator") + if "_onedal_estimator" in self.__dict__ else False ) if is_statistic_attr: From 85815518c20ac0f65ae3c0a176cd1d95f6a811a7 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Fri, 29 Nov 2024 08:45:45 +0100 Subject: [PATCH 130/135] Update basic_statistics.py --- onedal/spmd/basic_statistics/basic_statistics.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/onedal/spmd/basic_statistics/basic_statistics.py b/onedal/spmd/basic_statistics/basic_statistics.py index 8253aa6628..5a2e4e59bd 100644 --- a/onedal/spmd/basic_statistics/basic_statistics.py +++ b/onedal/spmd/basic_statistics/basic_statistics.py @@ -21,10 +21,6 @@ class BasicStatistics(BaseEstimatorSPMD, BasicStatistics_Batch): - @support_input_format() - def compute(self, data, weights=None, queue=None): - return super().compute(data, weights=weights, queue=queue) - @support_input_format() def fit(self, data, sample_weight=None, queue=None): return super().fit(data, sample_weight=sample_weight, queue=queue) From 5334b38a633e132d3854ee5f8b4c7a681db4fc41 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Fri, 29 Nov 2024 08:47:40 +0100 Subject: [PATCH 131/135] Update incremental_basic_statistics.py --- .../basic_statistics/incremental_basic_statistics.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/onedal/spmd/basic_statistics/incremental_basic_statistics.py b/onedal/spmd/basic_statistics/incremental_basic_statistics.py index a0bd62868a..a38a20465e 100644 --- a/onedal/spmd/basic_statistics/incremental_basic_statistics.py +++ b/onedal/spmd/basic_statistics/incremental_basic_statistics.py @@ -29,7 +29,7 @@ def _reset(self): "basic_statistics", None, "partial_compute_result" ) - def 
partial_fit(self, X, weights=None, queue=None): + def partial_fit(self, X, sample_weight=None, queue=None): """ Computes partial data for basic statistics from data batch X and saves it to `_partial_result`. @@ -50,13 +50,11 @@ def partial_fit(self, X, weights=None, queue=None): """ self._queue = queue policy = super(base_IncrementalBasicStatistics, self)._get_policy(queue, X) - X, weights = _convert_to_supported(policy, X, weights) + X, sample_weight = to_table(*_convert_to_supported(policy, X, sample_weight)) if not hasattr(self, "_onedal_params"): - dtype = get_dtype(X) - self._onedal_params = self._get_onedal_params(False, dtype=dtype) + self._onedal_params = self._get_onedal_params(False, dtype=X.dtype) - X_table, weights_table = to_table(X, weights) self._partial_result = super(base_IncrementalBasicStatistics, self)._get_backend( "basic_statistics", None, @@ -64,6 +62,6 @@ def partial_fit(self, X, weights=None, queue=None): policy, self._onedal_params, self._partial_result, - X_table, - weights_table, + X, + sample_weight, ) From 5f353c60ac95c1ada43611abcb3723d6fc664885 Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Fri, 29 Nov 2024 09:04:28 +0100 Subject: [PATCH 132/135] remove todo --- sklearnex/spmd/basic_statistics/basic_statistics.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/sklearnex/spmd/basic_statistics/basic_statistics.py b/sklearnex/spmd/basic_statistics/basic_statistics.py index eef0b666a8..0d5e522d7e 100644 --- a/sklearnex/spmd/basic_statistics/basic_statistics.py +++ b/sklearnex/spmd/basic_statistics/basic_statistics.py @@ -14,8 +14,11 @@ # limitations under the License. # ============================================================================== -from onedal.spmd.basic_statistics import BasicStatistics +from onedal.spmd.basic_statistics import BasicStatistics as onedal_BasicStatistics -# TODO: -# Currently it uses `onedal` module interface. -# Add sklearnex dispatching. 
+from ..basic_statistics import BasicStatistics as BasicStatistics_Batch + + +class BasicStatistics(BasicStatistics_Batch): + __doc__ = BasicStatistics_Batch.__doc__ + _onedal_basic_statistics = staticmethod(onedal_BasicStatistics) From b3ece1e38610d349f41c1ad5a169639255c8ce21 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Sun, 1 Dec 2024 13:57:12 +0100 Subject: [PATCH 133/135] Update basic_statistics.py --- sklearnex/spmd/basic_statistics/basic_statistics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearnex/spmd/basic_statistics/basic_statistics.py b/sklearnex/spmd/basic_statistics/basic_statistics.py index 0d5e522d7e..94854bd85c 100644 --- a/sklearnex/spmd/basic_statistics/basic_statistics.py +++ b/sklearnex/spmd/basic_statistics/basic_statistics.py @@ -16,7 +16,7 @@ from onedal.spmd.basic_statistics import BasicStatistics as onedal_BasicStatistics -from ..basic_statistics import BasicStatistics as BasicStatistics_Batch +from ...basic_statistics import BasicStatistics as BasicStatistics_Batch class BasicStatistics(BasicStatistics_Batch): From 2ebf71b31add78925a0a26abbae54efa9c842767 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Sun, 1 Dec 2024 22:01:49 +0100 Subject: [PATCH 134/135] Update basic_statistics.py --- onedal/spmd/basic_statistics/basic_statistics.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/onedal/spmd/basic_statistics/basic_statistics.py b/onedal/spmd/basic_statistics/basic_statistics.py index 5a2e4e59bd..f7f273b9ea 100644 --- a/onedal/spmd/basic_statistics/basic_statistics.py +++ b/onedal/spmd/basic_statistics/basic_statistics.py @@ -21,6 +21,4 @@ class BasicStatistics(BaseEstimatorSPMD, BasicStatistics_Batch): - @support_input_format() - def fit(self, data, sample_weight=None, queue=None): - return super().fit(data, sample_weight=sample_weight, queue=queue) + pass From 60aeaa6efea4d3e3913268341f0c26873815e4ad Mon Sep 17 00:00:00 2001 From: "Faust, Ian" Date: Wed, 4 Dec 2024 08:48:23 +0100 
Subject: [PATCH 135/135] warning removal from BS examples --- examples/sklearnex/basic_statistics_spmd.py | 4 ++-- examples/sklearnex/incremental_basic_statistics.py | 12 ++++++------ .../sklearnex/incremental_basic_statistics_dpctl.py | 12 ++++++------ 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/examples/sklearnex/basic_statistics_spmd.py b/examples/sklearnex/basic_statistics_spmd.py index 909c842cb9..e469f7d9cb 100644 --- a/examples/sklearnex/basic_statistics_spmd.py +++ b/examples/sklearnex/basic_statistics_spmd.py @@ -60,5 +60,5 @@ def generate_data(par, size, seed=777): bss = BasicStatisticsSpmd(["mean", "standard_deviation"]) bss.fit(dpt_data, dpt_weights) -print(f"Computed mean on rank {rank}:\n", bss.mean) -print(f"Computed std on rank {rank}:\n", bss.standard_deviation) +print(f"Computed mean on rank {rank}:\n", bss.mean_) +print(f"Computed std on rank {rank}:\n", bss.standard_deviation_) diff --git a/examples/sklearnex/incremental_basic_statistics.py b/examples/sklearnex/incremental_basic_statistics.py index b2713e1657..3e4ec8aa13 100644 --- a/examples/sklearnex/incremental_basic_statistics.py +++ b/examples/sklearnex/incremental_basic_statistics.py @@ -30,9 +30,9 @@ X_3 = np.array([[1, 1], [1, 2], [2, 3]]) result = incbs.partial_fit(X_3) -print(f"Mean:\n{result.mean}") -print(f"Max:\n{result.max}") -print(f"Sum:\n{result.sum}") +print(f"Mean:\n{result.mean_}") +print(f"Max:\n{result.max_}") +print(f"Sum:\n{result.sum_}") # We put the whole data to fit method, it is split automatically and then # partial_fit is called for each batch. 
@@ -40,6 +40,6 @@ X = np.array([[0, 1], [0, 1], [1, 2], [1, 1], [1, 2], [2, 3]]) result = incbs.fit(X) -print(f"Mean:\n{result.mean}") -print(f"Max:\n{result.max}") -print(f"Sum:\n{result.sum}") +print(f"Mean:\n{result.mean_}") +print(f"Max:\n{result.max_}") +print(f"Sum:\n{result.sum_}") diff --git a/examples/sklearnex/incremental_basic_statistics_dpctl.py b/examples/sklearnex/incremental_basic_statistics_dpctl.py index 170ba0e446..7b6a905dec 100644 --- a/examples/sklearnex/incremental_basic_statistics_dpctl.py +++ b/examples/sklearnex/incremental_basic_statistics_dpctl.py @@ -36,9 +36,9 @@ X_3 = dpt.asarray([[1, 1], [1, 2], [2, 3]], sycl_queue=queue) result = incbs.partial_fit(X_3) -print(f"Mean:\n{result.mean}") -print(f"Max:\n{result.max}") -print(f"Sum:\n{result.sum}") +print(f"Mean:\n{result.mean_}") +print(f"Max:\n{result.max_}") +print(f"Sum:\n{result.sum_}") # We put the whole data to fit method, it is split automatically and then # partial_fit is called for each batch. @@ -46,6 +46,6 @@ X = dpt.asarray([[0, 1], [0, 1], [1, 2], [1, 1], [1, 2], [2, 3]], sycl_queue=queue) result = incbs.fit(X) -print(f"Mean:\n{result.mean}") -print(f"Max:\n{result.max}") -print(f"Sum:\n{result.sum}") +print(f"Mean:\n{result.mean_}") +print(f"Max:\n{result.max_}") +print(f"Sum:\n{result.sum_}")