From 2c7fec3b740d45194f203f9b464aeb0ef59373fb Mon Sep 17 00:00:00 2001 From: Daniel Dale Date: Sat, 1 Jun 2024 12:46:11 -0700 Subject: [PATCH] adjust FTS dependency adjustment warning to reference more informative object name, update HF Datasets `trust_remote_code` to `True` in examples as will be required with HF Datasets >= 3.x --- src/finetuning_scheduler/fts_supporters.py | 3 +-- src/fts_examples/stable/fts_superglue.py | 7 +++++-- src/fts_examples/stable/ipynb_src/fts_superglue_nb.py | 7 +++++-- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/finetuning_scheduler/fts_supporters.py b/src/finetuning_scheduler/fts_supporters.py index 75c1a68..e02fe74 100644 --- a/src/finetuning_scheduler/fts_supporters.py +++ b/src/finetuning_scheduler/fts_supporters.py @@ -1876,8 +1876,7 @@ def _add_fts_callback(trainer: "pl.Trainer", fts_cls: FTSCallbackDepType, cfg: D """ if cfg.get("monitor", None) is None: cfg["monitor"] = "val_loss" - rank_zero_warn(f"No monitor metric specified for {fts_cls.__class__.__name__}," - " using 'val_loss' as default.") + rank_zero_warn(f"No monitor metric specified for {fts_cls.__name__}, using 'val_loss' as default.") trainer.callbacks.append(fts_cls(**cfg)) def _callback_dep_setup(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: str) -> None: diff --git a/src/fts_examples/stable/fts_superglue.py b/src/fts_examples/stable/fts_superglue.py index 0d84e29..e9b1c41 100644 --- a/src/fts_examples/stable/fts_superglue.py +++ b/src/fts_examples/stable/fts_superglue.py @@ -126,6 +126,8 @@ def __init__( "dataloader_kwargs": dataloader_kwargs, "tokenizers_parallelism": tokenizers_parallelism, } + # starting with HF Datasets v3.x, trust_remote_code must be `True` https://bit.ly/hf_datasets_trust_remote_req + self.trust_remote_code = True self.save_hyperparameters(self.init_hparams) self.dataloader_kwargs = { "num_workers": dataloader_kwargs.get("num_workers", 0), @@ -139,11 +141,12 @@ def prepare_data(self): """Load the SuperGLUE dataset.""" # N.B. PL calls prepare_data from a single process (rank 0) so do not use it to assign # state (e.g. self.x=y) - datasets.load_dataset("super_glue", self.hparams.task_name) + datasets.load_dataset("super_glue", self.hparams.task_name, trust_remote_code=self.trust_remote_code) def setup(self, stage): """Setup our dataset splits for training/validation.""" - self.dataset = datasets.load_dataset("super_glue", self.hparams.task_name) + self.dataset = datasets.load_dataset("super_glue", self.hparams.task_name, + trust_remote_code=self.trust_remote_code) for split in self.dataset.keys(): self.dataset[split] = self.dataset[split].map( self._convert_to_features, batched=True, remove_columns=["label"] diff --git a/src/fts_examples/stable/ipynb_src/fts_superglue_nb.py b/src/fts_examples/stable/ipynb_src/fts_superglue_nb.py index de9633d..2d1981d 100644 --- a/src/fts_examples/stable/ipynb_src/fts_superglue_nb.py +++ b/src/fts_examples/stable/ipynb_src/fts_superglue_nb.py @@ -259,6 +259,8 @@ def __init__( "dataloader_kwargs": dataloader_kwargs, "tokenizers_parallelism": tokenizers_parallelism, } + # starting with HF Datasets v3.x, trust_remote_code must be `True` https://bit.ly/hf_datasets_trust_remote_req + self.trust_remote_code = True self.save_hyperparameters(self.init_hparams) self.dataloader_kwargs = { "num_workers": dataloader_kwargs.get("num_workers", 0), @@ -273,11 +275,12 @@ def prepare_data(self): """Load the SuperGLUE dataset.""" # N.B. PL calls prepare_data from a single process (rank 0) so do not use it to assign # state (e.g. self.x=y) - datasets.load_dataset("super_glue", self.hparams.task_name) + datasets.load_dataset("super_glue", self.hparams.task_name, trust_remote_code=self.trust_remote_code) def setup(self, stage): """Setup our dataset splits for training/validation.""" - self.dataset = datasets.load_dataset("super_glue", self.hparams.task_name) + self.dataset = datasets.load_dataset("super_glue", self.hparams.task_name, + trust_remote_code=self.trust_remote_code) for split in self.dataset.keys(): self.dataset[split] = self.dataset[split].map( self._convert_to_features, batched=True, remove_columns=["label"]