Remove all Dask-ML uses #886

Merged (13 commits) on Nov 29, 2022
9 changes: 3 additions & 6 deletions .github/workflows/test-upstream.yml
@@ -73,11 +73,10 @@ jobs:
mamba install -c conda-forge "sasl>=0.3.1"
docker pull bde2020/hive:2.3.2-postgresql-metastore
docker pull bde2020/hive-metastore-postgresql:2.3.0
- name: Install upstream dev Dask / dask-ml
- name: Install upstream dev Dask
if: env.which_upstream == 'Dask'
run: |
mamba update dask
python -m pip install --no-deps git+https://github.com/dask/dask-ml
- name: Test with pytest
run: |
pytest --junitxml=junit/test-results.xml --cov-report=xml -n auto tests --dist loadfile
@@ -112,11 +111,10 @@ jobs:
which python
pip list
mamba list
- name: Install upstream dev dask-ml
- name: Install upstream dev Dask
if: env.which_upstream == 'Dask'
run: |
mamba update dask
python -m pip install --no-deps git+https://github.com/dask/dask-ml
- name: run a dask cluster
run: |
if [[ $which_upstream == "Dask" ]]; then
@@ -161,12 +159,11 @@ jobs:
which python
pip list
mamba list
- name: Install upstream dev Dask / dask-ml
- name: Install upstream dev Dask
if: env.which_upstream == 'Dask'
run: |
python -m pip install --no-deps git+https://github.com/dask/dask
python -m pip install --no-deps git+https://github.com/dask/distributed
python -m pip install --no-deps git+https://github.com/dask/dask-ml
- name: Try to import dask-sql
run: |
python -c "import dask_sql; print('ok')"
9 changes: 3 additions & 6 deletions .github/workflows/test.yml
@@ -64,11 +64,10 @@ jobs:
mamba install -c conda-forge "sasl>=0.3.1"
docker pull bde2020/hive:2.3.2-postgresql-metastore
docker pull bde2020/hive-metastore-postgresql:2.3.0
- name: Optionally install upstream dev Dask / dask-ml
- name: Optionally install upstream dev Dask
if: needs.detect-ci-trigger.outputs.triggered == 'true'
run: |
mamba update dask
python -m pip install --no-deps git+https://github.com/dask/dask-ml
- name: Test with pytest
run: |
pytest --junitxml=junit/test-results.xml --cov-report=xml -n auto tests --dist loadfile
@@ -108,11 +107,10 @@ jobs:
which python
pip list
mamba list
- name: Optionally install upstream dev dask-ml
- name: Optionally install upstream dev Dask
if: needs.detect-ci-trigger.outputs.triggered == 'true'
run: |
mamba update dask
python -m pip install --no-deps git+https://github.com/dask/dask-ml
- name: run a dask cluster
env:
UPSTREAM: ${{ needs.detect-ci-trigger.outputs.triggered }}
@@ -153,12 +151,11 @@ jobs:
which python
pip list
mamba list
- name: Optionally install upstream dev Dask / dask-ml
- name: Optionally install upstream dev Dask
if: needs.detect-ci-trigger.outputs.triggered == 'true'
run: |
python -m pip install --no-deps git+https://github.com/dask/dask
python -m pip install --no-deps git+https://github.com/dask/distributed
python -m pip install --no-deps git+https://github.com/dask/dask-ml
- name: Try to import dask-sql
run: |
python -c "import dask_sql; print('ok')"
1 change: 0 additions & 1 deletion continuous_integration/environment-3.10-dev.yaml
@@ -3,7 +3,6 @@ channels:
- conda-forge
- nodefaults
dependencies:
- dask-ml>=2022.1.22
- dask>=2022.3.0
- fastapi>=0.69.0
- fugue>=0.7.0
1 change: 0 additions & 1 deletion continuous_integration/environment-3.8-dev.yaml
@@ -3,7 +3,6 @@ channels:
- conda-forge
- nodefaults
dependencies:
- dask-ml=2022.1.22
- dask=2022.3.0
- fastapi=0.69.0
- fugue=0.7.0
1 change: 0 additions & 1 deletion continuous_integration/environment-3.9-dev.yaml
@@ -3,7 +3,6 @@ channels:
- conda-forge
- nodefaults
dependencies:
- dask-ml>=2022.1.22
- dask>=2022.3.0
- fastapi>=0.69.0
- fugue>=0.7.0
1 change: 0 additions & 1 deletion continuous_integration/gpuci/environment.yaml
@@ -6,7 +6,6 @@ channels:
- conda-forge
- nodefaults
dependencies:
- dask-ml>=2022.1.22
- dask>=2022.3.0
- fastapi>=0.69.0
- fugue>=0.7.0
23 changes: 19 additions & 4 deletions dask_sql/physical/rel/custom/predict.py
@@ -2,6 +2,9 @@
import uuid
from typing import TYPE_CHECKING

import dask.dataframe as dd
import pandas as pd

from dask_sql.datacontainer import ColumnContainer, DataContainer
from dask_sql.physical.rel.base import BaseRelPlugin

@@ -30,8 +33,7 @@ class PredictModelPlugin(BaseRelPlugin):
Please note however, that it will need to act on Dask dataframes. If you
are using a model not optimized for this, it might be that you run out of memory if
your data is larger than the RAM of a single machine.
To prevent this, have a look into the dask-ml package,
especially the [ParallelPostFit](https://ml.dask.org/meta-estimators.html)
To prevent this, have a look into the dask_sql.physical.rel.custom.wrappers.ParallelPostFit
meta-estimator. If you are using a model trained with `CREATE MODEL`
and the `wrap_predict` flag, this is done automatically.

@@ -59,8 +61,21 @@ def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContai

model, training_columns = context.schema[schema_name].models[model_name]
df = context.sql(sql_select)
prediction = model.predict(df[training_columns])
predicted_df = df.assign(target=prediction)
try:
prediction = model.predict(df[training_columns])
predicted_df = df.assign(target=prediction)
except TypeError:

Collaborator:
In what scenarios do we hit this case?


Collaborator (Author):
This is to allow test_clustering_and_prediction to pass. It would otherwise error with TypeError: Column assignment doesn't support type numpy.ndarray, because sklearn returns the cluster labels as a NumPy array (like array([1, 3, 4, 7, 7, 2, 0, 7, 5, 1, 2, 0, 1, 6, 0, 3, 6, 6, 4, 7, 0, 3, 2, 0, 3, 4, 5, 4, 1, 0], dtype=int32)), but Dask does not support column assignment with this datatype. So we have to convert it to a Dask Series before assignment.

I think it should be OK to use Pandas in the except block because this should only happen in the CPU case with sklearn. As an example, test_gpu_clustering_and_prediction uses cuml.dask and doesn't need to go into the except block.


Collaborator:
That makes sense. So this is specifically for the case where predict returns a numpy array instead of a Dask Array.
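
As a quick illustration of the behaviour described above, here is a minimal, standalone sketch (the column values and cluster labels are made up for illustration):

```python
import dask.dataframe as dd
import numpy as np
import pandas as pd

pdf = pd.DataFrame({"x": range(6)})
df = dd.from_pandas(pdf, npartitions=2)

# A plain NumPy array, like the one sklearn's KMeans.predict returns
prediction = np.array([0, 1, 1, 0, 2, 2], dtype=np.int32)

# df.assign(target=prediction) raises:
#   TypeError: Column assignment doesn't support type numpy.ndarray

# Wrapping the array in a Dask Series with a matching index works
target = dd.from_pandas(
    pd.Series(prediction, index=pdf.index), npartitions=df.npartitions
)
print(df.assign(target=target).compute())
```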

df = df.set_index(df.columns[0], drop=False)
prediction = model.predict(df[training_columns])
# Convert numpy.ndarray to Dask Series
prediction = dd.from_pandas(
pd.Series(prediction, index=df.index),
npartitions=df.npartitions,
)
predicted_df = df.assign(target=prediction)
# Need to drop first column to reset index
# because the first column is equal to the index
predicted_df = predicted_df.drop(columns=[df.columns[0]]).reset_index()

# Create a temporary context, which includes the
# new "table" so that we can use the normal
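
For context, the end-to-end flow this plugin serves looks roughly like the sketch below; the table and model names are illustrative, and it assumes a local dask-sql Context with scikit-learn installed:

```python
import dask.datasets
from dask_sql import Context

c = Context()
# dask.datasets.timeseries() provides numeric columns x and y
c.create_table("timeseries", dask.datasets.timeseries())

# Train a model; wrap_predict wraps it so predictions run partition-wise
c.sql("""
    CREATE OR REPLACE MODEL my_model WITH (
        model_class = 'sklearn.ensemble.GradientBoostingClassifier',
        wrap_predict = True,
        target_column = 'target'
    ) AS (
        SELECT x, y, x*y > 0 AS target
        FROM timeseries
        LIMIT 100
    )
""")

# This statement is handled by PredictModelPlugin
result = c.sql("""
    SELECT * FROM PREDICT(MODEL my_model, SELECT x, y FROM timeseries LIMIT 100)
""")
print(result.compute().head())
```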
1 change: 0 additions & 1 deletion docker/conda.txt
@@ -16,7 +16,6 @@ uvicorn>=0.13.4
pyarrow>=6.0.1
prompt_toolkit>=3.0.8
pygments>=2.7.1
dask-ml>=2022.1.22
scikit-learn>=1.0.0
intake>=0.6.0
pre-commit>=2.11.1
1 change: 0 additions & 1 deletion docker/main.dockerfile
@@ -27,7 +27,6 @@ RUN mamba install -y \
nest-asyncio \
# additional dependencies
"pyarrow>=6.0.1" \
"dask-ml>=2022.1.22" \
"scikit-learn>=1.0.0" \
"intake>=0.6.0" \
&& conda clean -ay
8 changes: 3 additions & 5 deletions docs/source/machine_learning.rst
@@ -125,8 +125,7 @@ following sql statements
Want to increase the performance of your model by tuning the
parameters? Use the hyperparameter tuning directly
in SQL using below SQL syntax, choose different tuners
from the dask_ml package based on memory and compute constraints and
for more details refer to the `dask ml documentation <https://ml.dask.org/hyper-parameter-search.html#incremental-hyperparameter-optimization>`_
based on memory and compute constraints.

..
TODO - add a GPU section to these examples once we have working CREATE EXPERIMENT tests for GPU
@@ -135,7 +134,7 @@ for more details refer to the `dask ml documentation <https://ml.dask.org/hyper-

CREATE EXPERIMENT my_exp WITH (
model_class = 'sklearn.ensemble.GradientBoostingClassifier',
experiment_class = 'dask_ml.model_selection.GridSearchCV',
experiment_class = 'sklearn.model_selection.GridSearchCV',
tune_parameters = (n_estimators = ARRAY [16, 32, 2],
learning_rate = ARRAY [0.1,0.01,0.001],
max_depth = ARRAY [3,4,5,10]
@@ -258,7 +257,6 @@ and the boolean target ``label``.
SELECT * FROM training_data

-- We can now train a model from the sklearn package.
-- Make sure to install it together with dask-ml with conda or pip.
CREATE OR REPLACE MODEL my_model WITH (
model_class = 'sklearn.ensemble.GradientBoostingClassifier',
wrap_predict = True,
@@ -282,7 +280,7 @@ and the boolean target ``label``.
-- experiment to tune different hyperparameters
CREATE EXPERIMENT my_exp WITH(
model_class = 'sklearn.ensemble.GradientBoostingClassifier',
experiment_class = 'dask_ml.model_selection.GridSearchCV',
experiment_class = 'sklearn.model_selection.GridSearchCV',
tune_parameters = (n_estimators = ARRAY [16, 32, 2],
learning_rate = ARRAY [0.1,0.01,0.001],
max_depth = ARRAY [3,4,5,10]
3 changes: 1 addition & 2 deletions docs/source/sql/ml.rst
@@ -127,8 +127,7 @@ A model can be anything which has a ``predict`` function.
Please note however, that it will need to act on Dask dataframes. If you
are using a model not optimized for this, it might be that you run out of memory if
your data is larger than the RAM of a single machine.
To prevent this, have a look into the dask-ml package,
especially the `ParallelPostFit <https://ml.dask.org/meta-estimators.html>`_
To prevent this, have a look into the `dask_sql.physical.rel.custom.wrappers.ParallelPostFit`
meta-estimator. If you are using a model trained with ``CREATE MODEL``
and the ``wrap_predict`` flag set to true, this is done automatically.
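
For reference, the wrapper can also be used directly; below is a minimal sketch, assuming the vendored class keeps the same interface as dask-ml's ParallelPostFit (estimator passed at construction, predict mapped over partitions):

```python
import dask.dataframe as dd
import pandas as pd
from sklearn.cluster import KMeans

from dask_sql.physical.rel.custom.wrappers import ParallelPostFit

# Fit on a small in-memory sample ...
pdf = pd.DataFrame({"x": [0.0, 0.1, 5.0, 5.1], "y": [0.0, 0.2, 5.0, 4.9]})
estimator = KMeans(n_clusters=2).fit(pdf)

# ... then wrap it so predict() runs lazily, partition by partition,
# on a Dask dataframe that may be larger than memory
model = ParallelPostFit(estimator=estimator)
ddf = dd.from_pandas(pdf, npartitions=2)
labels = model.predict(ddf)  # a lazy Dask collection
print(labels.compute())
```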

4 changes: 2 additions & 2 deletions notebooks/Feature Overview.ipynb
@@ -590,7 +590,7 @@
"metadata": {},
"source": [
"- Tune single model with different Hyperparameters \n",
" - install **dask_ml** for tunning\n",
" - install **sklearn** for tuning\n",
"- Tune multiple model with different Hyperparameters\n",
" - install **tpot** for Automl"
]
@@ -604,7 +604,7 @@
"%%sql\n",
"CREATE EXPERIMENT my_exp WITH (\n",
" model_class = 'sklearn.ensemble.GradientBoostingClassifier',\n",
" experiment_class = 'dask_ml.model_selection.GridSearchCV',\n",
" experiment_class = 'sklearn.model_selection.GridSearchCV',\n",
" tune_parameters = (n_estimators = ARRAY [16, 32, 2],learning_rate = ARRAY [0.1,0.01,0.001],\n",
" max_depth = ARRAY [3,4,5,10]),\n",
" target_column = 'species'\n",
1 change: 0 additions & 1 deletion setup.py
@@ -59,7 +59,6 @@
"mock>=4.0.3",
"sphinx>=3.2.1",
"pyarrow>=6.0.1",
"dask-ml>=2022.1.22",
"scikit-learn>=1.0.0",
"intake>=0.6.0",
"pre-commit",
44 changes: 29 additions & 15 deletions tests/integration/test_model.py
@@ -18,8 +18,6 @@
xgboost = None
dask_cudf = None

pytest.importorskip("dask_ml")


def check_trained_model(c, model_name=None):
if model_name is None:
@@ -157,7 +155,24 @@ def test_clustering_and_prediction(c, training_df):
c.sql(
"""
CREATE MODEL my_model WITH (
model_class = 'dask_ml.cluster.KMeans'
model_class = 'sklearn.cluster.KMeans'
) AS (
SELECT x, y
FROM timeseries
LIMIT 100
)
"""
)

check_trained_model(c)


@pytest.mark.gpu
def test_gpu_clustering_and_prediction(c, gpu_training_df, gpu_client):
c.sql(
"""
CREATE MODEL my_model WITH (
model_class = 'cuml.dask.cluster.KMeans'
) AS (
SELECT x, y
FROM timeseries
@@ -244,7 +259,7 @@ def test_show_models(c, training_df):
c.sql(
"""
CREATE MODEL my_model2 WITH (
model_class = 'dask_ml.cluster.KMeans'
model_class = 'sklearn.cluster.KMeans'
) AS (
SELECT x, y
FROM timeseries
@@ -691,7 +706,7 @@ def test_ml_experiment(c, client, training_df):
c.sql(
"""
CREATE EXPERIMENT my_exp WITH (
experiment_class = 'dask_ml.model_selection.GridSearchCV',
experiment_class = 'sklearn.model_selection.GridSearchCV',
tune_parameters = (n_estimators = ARRAY [16, 32, 2],learning_rate = ARRAY [0.1,0.01,0.001],
max_depth = ARRAY [3,4,5,10]),
target_column = 'target'
@@ -731,7 +746,7 @@ def test_ml_experiment(c, client, training_df):
"""
CREATE EXPERIMENT IF NOT EXISTS my_exp WITH (
model_class = 'that.is.not.a.python.class',
experiment_class = 'dask_ml.model_selection.GridSearchCV',
experiment_class = 'sklearn.model_selection.GridSearchCV',
tune_parameters = (n_estimators = ARRAY [16, 32, 2],learning_rate = ARRAY [0.1,0.01,0.001],
max_depth = ARRAY [3,4,5,10]),
target_column = 'target'
@@ -794,7 +809,7 @@ def test_ml_experiment(c, client, training_df):
"""
CREATE EXPERIMENT my_exp WITH (
model_class = 'sklearn.ensemble.GradientBoostingClassifier',
experiment_class = 'dask_ml.model_selection.GridSearchCV',
experiment_class = 'sklearn.model_selection.GridSearchCV',

Collaborator (Author):
I can't really get experiment_class to work on the GPU. For example:

c.sql(
    """
    CREATE EXPERIMENT my_exp WITH (
    model_class = 'sklearn.ensemble.GradientBoostingClassifier',
    experiment_class = 'cuml.model_selection.GridSearchCV',
    tune_parameters = (n_estimators = ARRAY [16, 32, 2],learning_rate = ARRAY [0.1,0.01,0.001],
                       max_depth = ARRAY [3,4,5,10]),
    target_column = 'target'
) AS (
        SELECT x, y, x*y > 0 AS target
        FROM timeseries
        LIMIT 100
    )
    """
)

errors with:

INFO:dask_sql.physical.rel.custom.create_experiment:{'n_estimators': [16, 32, 2], 'learning_rate': [0.1, 0.01, 0.001], 'max_depth': [3, 4, 5, 10]}
INFO:dask_sql.physical.rel.custom.create_experiment:{}
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In [8], line 1
----> 1 c.sql(
      2     """
      3     CREATE EXPERIMENT my_exp WITH (
      4     model_class = 'sklearn.ensemble.GradientBoostingClassifier',
      5     experiment_class = 'cuml.model_selection.GridSearchCV',
      6     tune_parameters = (n_estimators = ARRAY [16, 32, 2],learning_rate = ARRAY [0.1,0.01,0.001],
      7                        max_depth = ARRAY [3,4,5,10]),
      8     target_column = 'target'
      9 ) AS (
     10         SELECT x, y, x*y > 0 AS target
     11         FROM timeseries
     12         LIMIT 100
     13     )
     14     """
     15 )

File ~/miniconda3/envs/dsql_rapids-22.12/lib/python3.9/site-packages/dask_sql/context.py:501, in Context.sql(self, sql, return_futures, dataframes, gpu, config_options)
    496 else:
    497     raise RuntimeError(
    498         f"Encountered unsupported `LogicalPlan` sql type: {type(sql)}"
    499     )
--> 501 return self._compute_table_from_rel(rel, return_futures)

File ~/miniconda3/envs/dsql_rapids-22.12/lib/python3.9/site-packages/dask_sql/context.py:830, in Context._compute_table_from_rel(self, rel, return_futures)
    829 def _compute_table_from_rel(self, rel: "LogicalPlan", return_futures: bool = True):
--> 830     dc = RelConverter.convert(rel, context=self)
    832     # Optimization might remove some alias projects. Make sure to keep them here.
    833     select_names = [field for field in rel.getRowType().getFieldList()]

File ~/miniconda3/envs/dsql_rapids-22.12/lib/python3.9/site-packages/dask_sql/physical/rel/convert.py:61, in RelConverter.convert(cls, rel, context)
     55     raise NotImplementedError(
     56         f"No relational conversion for node type {node_type} available (yet)."
     57     )
     58 logger.debug(
     59     f"Processing REL {rel} using {plugin_instance.__class__.__name__}..."
     60 )
---> 61 df = plugin_instance.convert(rel, context=context)
     62 logger.debug(f"Processed REL {rel} into {LoggableDataFrame(df)}")
     63 return df

File ~/miniconda3/envs/dsql_rapids-22.12/lib/python3.9/site-packages/dask_sql/physical/rel/custom/create_experiment.py:169, in CreateExperimentPlugin.convert(self, rel, context)
    167 search = ExperimentClass(model, {**parameters}, **experiment_kwargs)
    168 logger.info(tune_fit_kwargs)
--> 169 search.fit(
    170     X.to_dask_array(lengths=True),
    171     y.to_dask_array(lengths=True),
    172     **tune_fit_kwargs,
    173 )
    174 df = pd.DataFrame(search.cv_results_)
    175 df["model_class"] = model_class

File ~/miniconda3/envs/dsql_rapids-22.12/lib/python3.9/site-packages/sklearn/model_selection/_search.py:786, in BaseSearchCV.fit(self, X, y, groups, **fit_params)
    783 X, y, groups = indexable(X, y, groups)
    784 fit_params = _check_fit_params(X, fit_params)
--> 786 cv_orig = check_cv(self.cv, y, classifier=is_classifier(estimator))
    787 n_splits = cv_orig.get_n_splits(X, y, groups)
    789 base_estimator = clone(self.estimator)

File ~/miniconda3/envs/dsql_rapids-22.12/lib/python3.9/site-packages/sklearn/model_selection/_split.py:2331, in check_cv(cv, y, classifier)
   2326 cv = 5 if cv is None else cv
   2327 if isinstance(cv, numbers.Integral):
   2328     if (
   2329         classifier
   2330         and (y is not None)
-> 2331         and (type_of_target(y, input_name="y") in ("binary", "multiclass"))
   2332     ):
   2333         return StratifiedKFold(cv)
   2334     else:

File ~/miniconda3/envs/dsql_rapids-22.12/lib/python3.9/site-packages/sklearn/utils/multiclass.py:286, in type_of_target(y, input_name)
    283 if sparse_pandas:
    284     raise ValueError("y cannot be class 'SparseSeries' or 'SparseArray'")
--> 286 if is_multilabel(y):
    287     return "multilabel-indicator"
    289 # DeprecationWarning will be replaced by ValueError, see NEP 34
    290 # https://numpy.org/neps/nep-0034-infer-dtype-is-object.html

File ~/miniconda3/envs/dsql_rapids-22.12/lib/python3.9/site-packages/sklearn/utils/multiclass.py:152, in is_multilabel(y)
    150 warnings.simplefilter("error", np.VisibleDeprecationWarning)
    151 try:
--> 152     y = np.asarray(y)
    153 except (np.VisibleDeprecationWarning, ValueError):
    154     # dtype=object should be provided explicitly for ragged arrays,
    155     # see NEP 34
    156     y = np.array(y, dtype=object)

File ~/miniconda3/envs/dsql_rapids-22.12/lib/python3.9/site-packages/dask/array/core.py:1704, in Array.__array__(self, dtype, **kwargs)
   1702     x = x.astype(dtype)
   1703 if not isinstance(x, np.ndarray):
-> 1704     x = np.array(x)
   1705 return x

File cupy/_core/core.pyx:1473, in cupy._core.core._ndarray_base.__array__()

TypeError: Implicit conversion to a NumPy array is not allowed. Please use `.get()` to construct a NumPy array explicitly.

Using model_class = 'xgboost.XGBClassifier' or model_class = 'xgboost.dask.XGBClassifier' results in the same error as above.

When I try it with a model_class from cuML, more errors arise. For example, if I try it with model_class = 'cuml.dask.ensemble.RandomForestClassifier' (cuML has no GradientBoostingClassifier), sklearn raises a

TypeError: If no scoring is specified, the estimator passed should have a 'score' method. The estimator <cuml.dask.ensemble.randomforestclassifier.RandomForestClassifier object at 0x7f0c5f692820> does not.


Collaborator (Author):
I could open some issues to try and get experiment_class working on the GPU. From what I've tried so far, I think the fixes would lie on the Dask and/or cuML side of things.


Collaborator:
We can open an issue to track this and follow up in a future PR.


Collaborator (Author):
Opened #943

tune_parameters = (n_estimators = ARRAY [16, 32, 2],learning_rate = ARRAY [0.1,0.01,0.001],
max_depth = ARRAY [3,4,5,10]),
target_column = 'target'
@@ -816,7 +831,7 @@ def test_ml_experiment(c, client, training_df):
"""
CREATE EXPERIMENT my_exp WITH (
model_class = 'sklearn.ensemble.GradientBoostingClassifier',
experiment_class = 'dask_ml.model_selection.GridSearchCV',
experiment_class = 'sklearn.model_selection.GridSearchCV',
tune_parameters = (n_estimators = ARRAY [16, 32, 2],learning_rate = ARRAY [0.1,0.01,0.001],
max_depth = ARRAY [3,4,5,10]),
target_column = 'target'
@@ -831,7 +846,7 @@ def test_ml_experiment(c, client, training_df):
"""
CREATE EXPERIMENT IF NOT EXISTS my_exp WITH (
model_class = 'sklearn.ensemble.GradientBoostingClassifier',
experiment_class = 'dask_ml.model_selection.GridSearchCV',
experiment_class = 'sklearn.model_selection.GridSearchCV',
tune_parameters = (n_estimators = ARRAY [16, 32, 2],learning_rate = ARRAY [0.1,0.01,0.001],
max_depth = ARRAY [3,4,5,10]),
target_column = 'target'
@@ -847,7 +862,7 @@ def test_ml_experiment(c, client, training_df):
"""
CREATE OR REPLACE EXPERIMENT my_exp WITH (
model_class = 'sklearn.ensemble.GradientBoostingClassifier',
experiment_class = 'dask_ml.model_selection.GridSearchCV',
experiment_class = 'sklearn.model_selection.GridSearchCV',
tune_parameters = (n_estimators = ARRAY [16, 32, 2],learning_rate = ARRAY [0.1,0.01,0.001],
max_depth = ARRAY [3,4,5,10]),
target_column = 'target'
@@ -867,8 +882,8 @@ def test_ml_experiment(c, client, training_df):
c.sql(
"""
CREATE EXPERIMENT my_exp1 WITH (
model_class = 'dask_ml.cluster.KMeans',
experiment_class = 'dask_ml.model_selection.RandomizedSearchCV',
model_class = 'sklearn.cluster.KMeans',
experiment_class = 'sklearn.model_selection.RandomizedSearchCV',
tune_parameters = (n_clusters = ARRAY [3,4,16],tol = ARRAY [0.1,0.01,0.001],
max_iter = ARRAY [3,4,5,10])
) AS (
@@ -889,7 +904,7 @@ def test_experiment_automl_classifier(c, client, training_df):
"""
CREATE EXPERIMENT my_automl_exp1 WITH (
automl_class = 'tpot.TPOTClassifier',
automl_kwargs = (population_size = 2 ,generations=2,cv=2,n_jobs=-1,use_dask=True),
automl_kwargs = (population_size=2, generations=2, cv=2, n_jobs=-1),
target_column = 'target'
) AS (
SELECT x, y, x*y > 0 AS target
@@ -914,11 +929,10 @@ def test_experiment_automl_regressor(c, client, training_df):
"""
CREATE EXPERIMENT my_automl_exp2 WITH (
automl_class = 'tpot.TPOTRegressor',
automl_kwargs = (population_size = 2,
automl_kwargs = (population_size=2,
generations=2,
cv=2,
n_jobs=-1,
use_dask=True,

Collaborator (Author):
use_dask requires Dask-ML to be installed.

max_eval_time_mins=1),

target_column = 'target'