-
Notifications
You must be signed in to change notification settings - Fork 72
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Remove all Dask-ML uses #886
Changes from all commits
9674fe5
0c7cdcd
0027dd6
c785aef
9007c0b
e139fa4
ac00961
07a060f
a6acef5
565db5c
9a0a5bc
264cc34
670aea0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,8 +18,6 @@ | |
xgboost = None | ||
dask_cudf = None | ||
|
||
pytest.importorskip("dask_ml") | ||
|
||
|
||
def check_trained_model(c, model_name=None): | ||
if model_name is None: | ||
|
@@ -157,7 +155,24 @@ def test_clustering_and_prediction(c, training_df): | |
c.sql( | ||
""" | ||
CREATE MODEL my_model WITH ( | ||
model_class = 'dask_ml.cluster.KMeans' | ||
model_class = 'sklearn.cluster.KMeans' | ||
) AS ( | ||
SELECT x, y | ||
FROM timeseries | ||
LIMIT 100 | ||
) | ||
""" | ||
) | ||
|
||
check_trained_model(c) | ||
|
||
|
||
@pytest.mark.gpu | ||
def test_gpu_clustering_and_prediction(c, gpu_training_df, gpu_client): | ||
c.sql( | ||
""" | ||
CREATE MODEL my_model WITH ( | ||
model_class = 'cuml.dask.cluster.KMeans' | ||
) AS ( | ||
SELECT x, y | ||
FROM timeseries | ||
|
@@ -244,7 +259,7 @@ def test_show_models(c, training_df): | |
c.sql( | ||
""" | ||
CREATE MODEL my_model2 WITH ( | ||
model_class = 'dask_ml.cluster.KMeans' | ||
model_class = 'sklearn.cluster.KMeans' | ||
) AS ( | ||
SELECT x, y | ||
FROM timeseries | ||
|
@@ -691,7 +706,7 @@ def test_ml_experiment(c, client, training_df): | |
c.sql( | ||
""" | ||
CREATE EXPERIMENT my_exp WITH ( | ||
experiment_class = 'dask_ml.model_selection.GridSearchCV', | ||
experiment_class = 'sklearn.model_selection.GridSearchCV', | ||
tune_parameters = (n_estimators = ARRAY [16, 32, 2],learning_rate = ARRAY [0.1,0.01,0.001], | ||
max_depth = ARRAY [3,4,5,10]), | ||
target_column = 'target' | ||
|
@@ -731,7 +746,7 @@ def test_ml_experiment(c, client, training_df): | |
""" | ||
CREATE EXPERIMENT IF NOT EXISTS my_exp WITH ( | ||
model_class = 'that.is.not.a.python.class', | ||
experiment_class = 'dask_ml.model_selection.GridSearchCV', | ||
experiment_class = 'sklearn.model_selection.GridSearchCV', | ||
tune_parameters = (n_estimators = ARRAY [16, 32, 2],learning_rate = ARRAY [0.1,0.01,0.001], | ||
max_depth = ARRAY [3,4,5,10]), | ||
target_column = 'target' | ||
|
@@ -794,7 +809,7 @@ def test_ml_experiment(c, client, training_df): | |
""" | ||
CREATE EXPERIMENT my_exp WITH ( | ||
model_class = 'sklearn.ensemble.GradientBoostingClassifier', | ||
experiment_class = 'dask_ml.model_selection.GridSearchCV', | ||
experiment_class = 'sklearn.model_selection.GridSearchCV', | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I can't really get
errors with:
Using When I try it with a
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I could open some issues to try and get There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We can open up an issue to track this and follow up in a future PR. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Opened #943 |
||
tune_parameters = (n_estimators = ARRAY [16, 32, 2],learning_rate = ARRAY [0.1,0.01,0.001], | ||
max_depth = ARRAY [3,4,5,10]), | ||
target_column = 'target' | ||
|
@@ -816,7 +831,7 @@ def test_ml_experiment(c, client, training_df): | |
""" | ||
CREATE EXPERIMENT my_exp WITH ( | ||
model_class = 'sklearn.ensemble.GradientBoostingClassifier', | ||
experiment_class = 'dask_ml.model_selection.GridSearchCV', | ||
experiment_class = 'sklearn.model_selection.GridSearchCV', | ||
tune_parameters = (n_estimators = ARRAY [16, 32, 2],learning_rate = ARRAY [0.1,0.01,0.001], | ||
max_depth = ARRAY [3,4,5,10]), | ||
target_column = 'target' | ||
|
@@ -831,7 +846,7 @@ def test_ml_experiment(c, client, training_df): | |
""" | ||
CREATE EXPERIMENT IF NOT EXISTS my_exp WITH ( | ||
model_class = 'sklearn.ensemble.GradientBoostingClassifier', | ||
experiment_class = 'dask_ml.model_selection.GridSearchCV', | ||
experiment_class = 'sklearn.model_selection.GridSearchCV', | ||
tune_parameters = (n_estimators = ARRAY [16, 32, 2],learning_rate = ARRAY [0.1,0.01,0.001], | ||
max_depth = ARRAY [3,4,5,10]), | ||
target_column = 'target' | ||
|
@@ -847,7 +862,7 @@ def test_ml_experiment(c, client, training_df): | |
""" | ||
CREATE OR REPLACE EXPERIMENT my_exp WITH ( | ||
model_class = 'sklearn.ensemble.GradientBoostingClassifier', | ||
experiment_class = 'dask_ml.model_selection.GridSearchCV', | ||
experiment_class = 'sklearn.model_selection.GridSearchCV', | ||
tune_parameters = (n_estimators = ARRAY [16, 32, 2],learning_rate = ARRAY [0.1,0.01,0.001], | ||
max_depth = ARRAY [3,4,5,10]), | ||
target_column = 'target' | ||
|
@@ -867,8 +882,8 @@ def test_ml_experiment(c, client, training_df): | |
c.sql( | ||
""" | ||
CREATE EXPERIMENT my_exp1 WITH ( | ||
model_class = 'dask_ml.cluster.KMeans', | ||
experiment_class = 'dask_ml.model_selection.RandomizedSearchCV', | ||
model_class = 'sklearn.cluster.KMeans', | ||
experiment_class = 'sklearn.model_selection.RandomizedSearchCV', | ||
tune_parameters = (n_clusters = ARRAY [3,4,16],tol = ARRAY [0.1,0.01,0.001], | ||
max_iter = ARRAY [3,4,5,10]) | ||
) AS ( | ||
|
@@ -889,7 +904,7 @@ def test_experiment_automl_classifier(c, client, training_df): | |
""" | ||
CREATE EXPERIMENT my_automl_exp1 WITH ( | ||
automl_class = 'tpot.TPOTClassifier', | ||
automl_kwargs = (population_size = 2 ,generations=2,cv=2,n_jobs=-1,use_dask=True), | ||
automl_kwargs = (population_size=2, generations=2, cv=2, n_jobs=-1), | ||
target_column = 'target' | ||
) AS ( | ||
SELECT x, y, x*y > 0 AS target | ||
|
@@ -914,11 +929,10 @@ def test_experiment_automl_regressor(c, client, training_df): | |
""" | ||
CREATE EXPERIMENT my_automl_exp2 WITH ( | ||
automl_class = 'tpot.TPOTRegressor', | ||
automl_kwargs = (population_size = 2, | ||
automl_kwargs = (population_size=2, | ||
generations=2, | ||
cv=2, | ||
n_jobs=-1, | ||
use_dask=True, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
max_eval_time_mins=1), | ||
|
||
target_column = 'target' | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In what scenarios do we hit this case?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is to allow test_clustering_and_prediction to pass. It would error with a
TypeError: Column assignment doesn't support type numpy.ndarray
because sklearn returns the list of clusters as a NumPy array (like array([1, 3, 4, 7, 7, 2, 0, 7, 5, 1, 2, 0, 1, 6, 0, 3, 6, 6, 4, 7, 0, 3, 2, 0, 3, 4, 5, 4, 1, 0], dtype=int32)
), but Dask does not support column assignment with this datatype. So we have to convert it to a Dask Series before assignment. I think it should be OK to use Pandas in the
except
block because this should only happen in the CPU case with sklearn. As an example, test_gpu_clustering_and_prediction
uses cuml.dask and doesn't need to go into the except
block. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That makes sense. So this is specifically for the case where predict returns a NumPy array instead of a Dask array.