Remove all Dask-ML uses #886

Merged
merged 13 commits on Nov 29, 2022
9 changes: 3 additions & 6 deletions .github/workflows/test-upstream.yml
@@ -73,11 +73,10 @@ jobs:
mamba install -c conda-forge "sasl>=0.3.1"
docker pull bde2020/hive:2.3.2-postgresql-metastore
docker pull bde2020/hive-metastore-postgresql:2.3.0
- name: Install upstream dev Dask / dask-ml
- name: Install upstream dev Dask
if: env.which_upstream == 'Dask'
run: |
mamba update dask
python -m pip install --no-deps git+https://github.com/dask/dask-ml
- name: Test with pytest
run: |
pytest --junitxml=junit/test-results.xml --cov-report=xml -n auto tests --dist loadfile
@@ -112,11 +111,10 @@ jobs:
which python
pip list
mamba list
- name: Install upstream dev dask-ml
- name: Install upstream dev Dask
if: env.which_upstream == 'Dask'
run: |
mamba update dask
python -m pip install --no-deps git+https://github.com/dask/dask-ml
- name: run a dask cluster
run: |
if [[ $which_upstream == "Dask" ]]; then
@@ -161,12 +159,11 @@ jobs:
which python
pip list
mamba list
- name: Install upstream dev Dask / dask-ml
- name: Install upstream dev Dask
if: env.which_upstream == 'Dask'
run: |
python -m pip install --no-deps git+https://github.com/dask/dask
python -m pip install --no-deps git+https://github.com/dask/distributed
python -m pip install --no-deps git+https://github.com/dask/dask-ml
- name: Try to import dask-sql
run: |
python -c "import dask_sql; print('ok')"
9 changes: 3 additions & 6 deletions .github/workflows/test.yml
@@ -64,11 +64,10 @@ jobs:
mamba install -c conda-forge "sasl>=0.3.1"
docker pull bde2020/hive:2.3.2-postgresql-metastore
docker pull bde2020/hive-metastore-postgresql:2.3.0
- name: Optionally install upstream dev Dask / dask-ml
- name: Optionally install upstream dev Dask
if: needs.detect-ci-trigger.outputs.triggered == 'true'
run: |
mamba update dask
python -m pip install --no-deps git+https://github.com/dask/dask-ml
- name: Test with pytest
run: |
pytest --junitxml=junit/test-results.xml --cov-report=xml -n auto tests --dist loadfile
@@ -108,11 +107,10 @@ jobs:
which python
pip list
mamba list
- name: Optionally install upstream dev dask-ml
- name: Optionally install upstream dev Dask
if: needs.detect-ci-trigger.outputs.triggered == 'true'
run: |
mamba update dask
python -m pip install --no-deps git+https://github.com/dask/dask-ml
- name: run a dask cluster
env:
UPSTREAM: ${{ needs.detect-ci-trigger.outputs.triggered }}
@@ -153,12 +151,11 @@ jobs:
which python
pip list
mamba list
- name: Optionally install upstream dev Dask / dask-ml
- name: Optionally install upstream dev Dask
if: needs.detect-ci-trigger.outputs.triggered == 'true'
run: |
python -m pip install --no-deps git+https://github.com/dask/dask
python -m pip install --no-deps git+https://github.com/dask/distributed
python -m pip install --no-deps git+https://github.com/dask/dask-ml
- name: Try to import dask-sql
run: |
python -c "import dask_sql; print('ok')"
1 change: 0 additions & 1 deletion continuous_integration/environment-3.10-dev.yaml
@@ -3,7 +3,6 @@ channels:
- conda-forge
- nodefaults
dependencies:
- dask-ml>=2022.1.22
- dask>=2022.3.0
- fastapi>=0.69.0
- fugue>=0.7.0
1 change: 0 additions & 1 deletion continuous_integration/environment-3.8-dev.yaml
@@ -3,7 +3,6 @@ channels:
- conda-forge
- nodefaults
dependencies:
- dask-ml=2022.1.22
- dask=2022.3.0
- fastapi=0.69.0
- fugue=0.7.0
1 change: 0 additions & 1 deletion continuous_integration/environment-3.9-dev.yaml
@@ -3,7 +3,6 @@ channels:
- conda-forge
- nodefaults
dependencies:
- dask-ml>=2022.1.22
- dask>=2022.3.0
- fastapi>=0.69.0
- fugue>=0.7.0
1 change: 0 additions & 1 deletion continuous_integration/gpuci/environment.yaml
@@ -6,7 +6,6 @@ channels:
- conda-forge
- nodefaults
dependencies:
- dask-ml>=2022.1.22
- dask>=2022.3.0
- fastapi>=0.69.0
- fugue>=0.7.0
23 changes: 19 additions & 4 deletions dask_sql/physical/rel/custom/predict.py
@@ -2,6 +2,9 @@
import uuid
from typing import TYPE_CHECKING

import dask.dataframe as dd
import pandas as pd

from dask_sql.datacontainer import ColumnContainer, DataContainer
from dask_sql.physical.rel.base import BaseRelPlugin

@@ -30,8 +33,7 @@ class PredictModelPlugin(BaseRelPlugin):
Please note however, that it will need to act on Dask dataframes. If you
are using a model not optimized for this, it might be that you run out of memory if
your data is larger than the RAM of a single machine.
To prevent this, have a look into the dask-ml package,
especially the [ParallelPostFit](https://ml.dask.org/meta-estimators.html)
To prevent this, have a look into the dask_sql.physical.rel.custom.wrappers.ParallelPostFit
meta-estimator. If you are using a model trained with `CREATE MODEL`
and the `wrap_predict` flag, this is done automatically.
@@ -59,8 +61,21 @@ def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContainer:

model, training_columns = context.schema[schema_name].models[model_name]
df = context.sql(sql_select)
prediction = model.predict(df[training_columns])
predicted_df = df.assign(target=prediction)
try:
prediction = model.predict(df[training_columns])
predicted_df = df.assign(target=prediction)
except TypeError:
Collaborator: In what scenarios do we hit this case?

Author (Collaborator): This is to allow test_clustering_and_prediction to pass. It would error with `TypeError: Column assignment doesn't support type numpy.ndarray`, because sklearn returns the list of clusters as a NumPy array (like `array([1, 3, 4, 7, 7, 2, 0, 7, 5, 1, 2, 0, 1, 6, 0, 3, 6, 6, 4, 7, 0, 3, 2, 0, 3, 4, 5, 4, 1, 0], dtype=int32)`), but Dask does not support column assignment with this datatype, so we have to convert it to a Dask Series before assignment.

I think it should be OK to use pandas in the except block, because this should only happen in the CPU case with sklearn. For example, test_gpu_clustering_and_prediction uses cuml.dask and doesn't need to go into the except block.

Collaborator: That makes sense. So this is specifically for the case where predict returns a NumPy array instead of a Dask Array.
df = df.set_index(df.columns[0], drop=False)
prediction = model.predict(df[training_columns])
# Convert numpy.ndarray to Dask Series
prediction = dd.from_pandas(
pd.Series(prediction, index=df.index),
npartitions=df.npartitions,
)
predicted_df = df.assign(target=prediction)
# Need to drop first column to reset index
# because the first column is equal to the index
predicted_df = predicted_df.drop(columns=[df.columns[0]]).reset_index()

# Create a temporary context, which includes the
# new "table" so that we can use the normal
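To make the fallback discussed in the review thread concrete, here is a minimal, standalone sketch (the column names, data, and cluster count are made up for illustration; it assumes scikit-learn and Dask are installed) of the failure mode and the Series conversion that works around it:

```python
import dask.dataframe as dd
import pandas as pd
from sklearn.cluster import KMeans

# Illustrative data only; names and sizes are hypothetical.
pdf = pd.DataFrame({"x": range(30), "y": range(30)})
df = dd.from_pandas(pdf, npartitions=3)

model = KMeans(n_clusters=3, n_init=10).fit(pdf)

# sklearn's predict() returns a plain numpy.ndarray...
prediction = model.predict(pdf)

# ...so df.assign(target=prediction) would raise
# "TypeError: Column assignment doesn't support type numpy.ndarray".
# Wrapping the array in a Dask Series with a matching index works:
target = dd.from_pandas(
    pd.Series(prediction, index=pdf.index), npartitions=df.npartitions
)
predicted_df = df.assign(target=target)

print(predicted_df.head())
```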
1 change: 0 additions & 1 deletion docker/conda.txt
@@ -16,7 +16,6 @@ uvicorn>=0.13.4
pyarrow>=6.0.1
prompt_toolkit>=3.0.8
pygments>=2.7.1
dask-ml>=2022.1.22
scikit-learn>=1.0.0
intake>=0.6.0
pre-commit>=2.11.1
1 change: 0 additions & 1 deletion docker/main.dockerfile
@@ -27,7 +27,6 @@ RUN mamba install --freeze-installed -y \
nest-asyncio \
# additional dependencies
"pyarrow>=6.0.1" \
"dask-ml>=2022.1.22" \
"scikit-learn>=1.0.0" \
"intake>=0.6.0" \
&& conda clean -ay
8 changes: 3 additions & 5 deletions docs/source/machine_learning.rst
@@ -125,8 +125,7 @@ following sql statements
Want to increase the performance of your model by tuning the
parameters? Use the hyperparameter tuning directly
in SQL using below SQL syntax, choose different tuners
from the dask_ml package based on memory and compute constraints and
for more details refer to the `dask ml documentation <https://ml.dask.org/hyper-parameter-search.html#incremental-hyperparameter-optimization>`_
based on memory and compute constraints.

..
TODO - add a GPU section to these examples once we have working CREATE EXPERIMENT tests for GPU
@@ -135,7 +134,7 @@ for more details refer to the `dask ml documentation <https://ml.dask.org/hyper-
CREATE EXPERIMENT my_exp WITH (
model_class = 'sklearn.ensemble.GradientBoostingClassifier',
experiment_class = 'dask_ml.model_selection.GridSearchCV',
experiment_class = 'sklearn.model_selection.GridSearchCV',
tune_parameters = (n_estimators = ARRAY [16, 32, 2],
learning_rate = ARRAY [0.1,0.01,0.001],
max_depth = ARRAY [3,4,5,10]
@@ -258,7 +257,6 @@ and the boolean target ``label``.
SELECT * FROM training_data
-- We can now train a model from the sklearn package.
-- Make sure to install it together with dask-ml with conda or pip.
CREATE OR REPLACE MODEL my_model WITH (
model_class = 'sklearn.ensemble.GradientBoostingClassifier',
wrap_predict = True,
@@ -282,7 +280,7 @@ and the boolean target ``label``.
-- experiment to tune different hyperparameters
CREATE EXPERIMENT my_exp WITH(
model_class = 'sklearn.ensemble.GradientBoostingClassifier',
experiment_class = 'dask_ml.model_selection.GridSearchCV',
experiment_class = 'sklearn.model_selection.GridSearchCV',
tune_parameters = (n_estimators = ARRAY [16, 32, 2],
learning_rate = ARRAY [0.1,0.01,0.001],
max_depth = ARRAY [3,4,5,10]
3 changes: 1 addition & 2 deletions docs/source/sql/ml.rst
@@ -127,8 +127,7 @@ A model can be anything which has a ``predict`` function.
Please note however, that it will need to act on Dask dataframes. If you
are using a model not optimized for this, it might be that you run out of memory if
your data is larger than the RAM of a single machine.
To prevent this, have a look into the dask-ml package,
especially the `ParallelPostFit <https://ml.dask.org/meta-estimators.html>`_
To prevent this, have a look into the `dask_sql.physical.rel.custom.wrappers.ParallelPostFit`
meta-estimator. If you are using a model trained with ``CREATE MODEL``
and the ``wrap_predict`` flag set to true, this is done automatically.

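For the vendored meta-estimator referenced in the docs above, a rough sketch of using it outside of SQL might look like the following. The import path comes from the changed docs; the constructor signature is an assumption that the vendored wrapper keeps dask-ml's `ParallelPostFit(estimator=...)` interface, so treat this as unverified:

```python
import dask.dataframe as dd
import pandas as pd
from sklearn.linear_model import LogisticRegression

# Assumption: the vendored wrapper mirrors dask-ml's ParallelPostFit API.
from dask_sql.physical.rel.custom.wrappers import ParallelPostFit

pdf = pd.DataFrame({"a": range(100), "b": range(100), "label": [0, 1] * 50})
df = dd.from_pandas(pdf, npartitions=4)

# Fit in memory on pandas, then wrap the estimator so that predict()
# runs partition-wise over the Dask dataframe instead of pulling all
# of the data onto a single machine.
model = ParallelPostFit(estimator=LogisticRegression()).fit(
    pdf[["a", "b"]], pdf["label"]
)

prediction = model.predict(df[["a", "b"]])  # lazy Dask collection
print(prediction.compute()[:5])
```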
4 changes: 2 additions & 2 deletions notebooks/Feature Overview.ipynb
@@ -590,7 +590,7 @@
"metadata": {},
"source": [
"- Tune single model with different Hyperparameters \n",
" - install **dask_ml** for tunning\n",
" - install **sklearn** for tuning\n",
"- Tune multiple model with different Hyperparameters\n",
" - install **tpot** for Automl"
]
@@ -604,7 +604,7 @@
"%%sql\n",
"CREATE EXPERIMENT my_exp WITH (\n",
" model_class = 'sklearn.ensemble.GradientBoostingClassifier',\n",
" experiment_class = 'dask_ml.model_selection.GridSearchCV',\n",
" experiment_class = 'sklearn.model_selection.GridSearchCV',\n",
" tune_parameters = (n_estimators = ARRAY [16, 32, 2],learning_rate = ARRAY [0.1,0.01,0.001],\n",
" max_depth = ARRAY [3,4,5,10]),\n",
" target_column = 'species'\n",
1 change: 0 additions & 1 deletion setup.py
@@ -59,7 +59,6 @@
"mock>=4.0.3",
"sphinx>=3.2.1",
"pyarrow>=6.0.1",
"dask-ml>=2022.1.22",
"scikit-learn>=1.0.0",
"intake>=0.6.0",
"pre-commit",