Update workflows for DeepSpeech v3 for PyTorch 2.x

Signed-off-by: Beat Buesser <[email protected]>
Trusted-AI · Dec 7, 2023 · cf85969 · cf85969
1 parent 34fe469
commit cf85969
Show file tree

Hide file tree

Showing 3 changed files with 15 additions and 25 deletions.
diff --git a/.github/actions/deepspeech-v3/Dockerfile b/.github/actions/deepspeech-v3/Dockerfile
@@ -1,5 +1,5 @@
-# Get base from a pytorch image
-FROM pytorch/pytorch:1.6.0-cuda10.1-cudnn7-runtime
+# Get base from a pytorch image
+FROM pytorch/pytorch:2.1.1-cuda12.1-cudnn8-runtime
 
 # Set to install things in non-interactive mode
 ENV DEBIAN_FRONTEND noninteractive
@@ -17,26 +17,16 @@ RUN apt-get update \
         curl \
         libsndfile-dev \
         libsndfile1 \
+        vim \
      && apt-get clean all \
      && rm -r /var/lib/apt/lists/*
 
-RUN /opt/conda/bin/conda install --yes \
-    astropy \
-    matplotlib \
-    pandas \
-    scikit-learn \
-    scikit-image
-
 # Install necessary libraries for deepspeech v3
-RUN pip install torch
-RUN pip install tensorflow
-RUN pip install torchaudio==0.6.0
-RUN pip install --no-build-isolation fairscale
+RUN pip install --ignore-installed PyYAML torch==2.1.1 tensorflow==2.14.1 torchaudio==2.1.1 pytorch-lightning==2.1.2 scikit-learn==1.3.2
+RUN pip install --no-build-isolation fairscale==0.4.13
 
 RUN git clone https://github.com/SeanNaren/deepspeech.pytorch.git
-RUN cd deepspeech.pytorch && pip install -r requirements.txt
-RUN cd deepspeech.pytorch && pip install -e .
+RUN cd deepspeech.pytorch && sed -i '/^sklearn/d' requirements.txt && pip install -r requirements.txt && pip install -e .
 
-RUN pip install numba==0.50.0
-RUN pip install pytest-cov
-RUN pip install pydub==0.25.1
+RUN pip install numba==0.56.4 pytest-cov==4.1.0 pydub==0.25.1
+RUN pip list
diff --git a/.github/workflows/ci-deepspeech-v3.yml b/.github/workflows/ci-deepspeech-v3.yml
@@ -23,9 +23,9 @@ on:
 
 jobs:
   test_deepspeech_v3_torch_1_10:
-    name: PyTorchDeepSpeech v3 / PyTorch 1.10
+    name: PyTorchDeepSpeech v3 / PyTorch 2.1.1
     runs-on: ubuntu-latest
-    container: adversarialrobustnesstoolbox/art_testing_envs:deepspeech_v3_torch_1_10
+    container: adversarialrobustnesstoolbox/art_testing_envs:deepspeech_v3_torch_2_1_1
     steps:
       - name: Checkout Repo
         uses: actions/checkout@v3

diff --git a/art/estimators/speech_recognition/pytorch_deep_speech.py b/art/estimators/speech_recognition/pytorch_deep_speech.py
@@ -146,7 +146,7 @@ def __init__(
         # Check DeepSpeech version
         if str(DeepSpeech.__base__) == "<class 'torch.nn.modules.module.Module'>":
             self._version = 2
-        elif str(DeepSpeech.__base__) == "<class 'pytorch_lightning.core.lightning.LightningModule'>":
+        elif str(DeepSpeech.__base__) in ["<class 'pytorch_lightning.core.lightning.LightningModule'>", "<class 'pytorch_lightning.core.module.LightningModule'>"]:
             self._version = 3
         else:
             raise NotImplementedError("Only DeepSpeech version 2 and DeepSpeech version 3 are currently supported.")
@@ -381,7 +381,7 @@ def predict(
 
             # Call to DeepSpeech model for prediction
             with torch.no_grad():
-                outputs, output_sizes = self._model(
+                outputs, output_sizes, _ = self._model(
                     inputs[begin:end].to(self._device), input_sizes[begin:end].to(self._device)
                 )
 
@@ -455,7 +455,7 @@ def loss_gradient(self, x: np.ndarray, y: np.ndarray, **kwargs) -> np.ndarray:
         input_sizes = input_rates.mul_(inputs.size()[-1]).int()
 
         # Call to DeepSpeech model for prediction
-        outputs, output_sizes = self._model(inputs.to(self._device), input_sizes.to(self._device))
+        outputs, output_sizes, _ = self._model(inputs.to(self._device), input_sizes.to(self._device))
         outputs = outputs.transpose(0, 1)
 
         if self._version == 2:
@@ -566,7 +566,7 @@ def fit(self, x: np.ndarray, y: np.ndarray, batch_size: int = 128, nb_epochs: in
                 self.optimizer.zero_grad()
 
                 # Call to DeepSpeech model for prediction
-                outputs, output_sizes = self._model(inputs.to(self._device), input_sizes.to(self._device))
+                outputs, output_sizes, _ = self._model(inputs.to(self._device), input_sizes.to(self._device))
                 outputs = outputs.transpose(0, 1)
 
                 if self._version == 2:
@@ -625,7 +625,7 @@ def compute_loss_and_decoded_output(
         input_sizes = input_rates.mul_(inputs.size()[-1]).int()
 
         # Call to DeepSpeech model for prediction
-        outputs, output_sizes = self.model(inputs.to(self.device), input_sizes.to(self.device))
+        outputs, output_sizes, _ = self.model(inputs.to(self.device), input_sizes.to(self.device))
         outputs_ = outputs.transpose(0, 1)
 
         if self._version == 2: