Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Whisperx integration #267

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
// README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-docker-compose
{
  "name": "Existing Docker Compose (Extend)",

  // Update the 'dockerComposeFile' list if you have more compose files or use different names.
  // The .devcontainer/docker-compose.yml file contains any overrides you need/want to make.
  // Order matters: the first file is the primary one; later files override it.
  "dockerComposeFile": [
    "../docker-compose.yml",
    "docker-compose.yml"
  ],

  // The 'service' property is the name of the service for the container that VS Code should
  // use. Update this value and .devcontainer/docker-compose.yml to the real service name.
  "service": "whisper-asr-webservice",

  // The optional 'workspaceFolder' property is the path VS Code should open by default when
  // connected. This is typically a file mount in .devcontainer/docker-compose.yml
  "workspaceFolder": "/workspaces/${localWorkspaceFolderBasename}",

  // Keeps the container alive so VS Code can attach instead of the service command exiting.
  // NOTE(review): the compose override also sets `command: sleep infinity` —
  // presumably one of the two keep-alive mechanisms is redundant; confirm which is intended.
  // "overrideCommand": "/bin/sh -c 'while sleep 1000; do :; done'"
  "overrideCommand": true

  // NOTE(review): "overrideCommand" is currently the last active property, so no trailing
  // comma follows `true`. Uncommenting any property below requires adding that comma.

  // Features to add to the dev container. More info: https://containers.dev/features.
  // "features": {},

  // Use 'forwardPorts' to make a list of ports inside the container available locally.
  // "forwardPorts": [],

  // Uncomment the next line if you want start specific services in your Docker Compose config.
  // "runServices": [],

  // Uncomment the next line if you want to keep your containers running after VS Code shuts down.
  // "shutdownAction": "none",

  // Uncomment the next line to run commands after the container is created.
  // "postCreateCommand": "cat /etc/os-release",

  // Configure tool-specific properties.
  // "customizations": {},

  // Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root.
  // "remoteUser": "devcontainer"
}
30 changes: 30 additions & 0 deletions .devcontainer/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Dev-container compose override; merged on top of the repository's primary
# docker-compose.yml (see devcontainer.json "dockerComposeFile").
version: '3.4'
services:
  # Update this to the name of the service you want to work with in your docker-compose.yml file
  whisper-asr-webservice:
    # Uncomment if you want to override the service's Dockerfile to one in the .devcontainer
    # folder. Note that the path of the Dockerfile and context is relative to the *primary*
    # docker-compose.yml file (the first in the devcontainer.json "dockerComposeFile"
    # array). The sample below assumes your primary file is in the root of your project.
    #
    # build:
    #   context: .
    #   dockerfile: .devcontainer/Dockerfile

    # Local, developer-specific settings; this file is git-ignored (.devcontainer/dev.env
    # is listed in .gitignore), so each developer supplies their own copy.
    env_file: .devcontainer/dev.env
    # NOTE(review): ${ASR_ENGINE}/${HF_TOKEN} below are interpolated by docker compose
    # from the *host* environment (or a top-level .env file), not from env_file above —
    # env_file only injects variables directly into the container. Confirm the values
    # from dev.env actually reach these settings as intended.
    environment:
      ASR_ENGINE: ${ASR_ENGINE}
      HF_TOKEN: ${HF_TOKEN}

    volumes:
      # Update this to wherever you want VS Code to mount the folder of your project
      - ..:/workspaces:cached

    # Uncomment the next four lines if you will use a ptrace-based debugger like C++, Go, and Rust.
    # cap_add:
    #   - SYS_PTRACE
    # security_opt:
    #   - seccomp:unconfined

    # Overrides default command so things don't shut down after the process ends.
    command: sleep infinity

6 changes: 5 additions & 1 deletion .github/workflows/docker-publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ env:
REPO_NAME: ${{secrets.REPO_NAME}}
jobs:
build:
runs-on: ubuntu-latest
runs-on: [self-hosted, ubuntu-latest]
strategy:
matrix:
include:
Expand All @@ -22,6 +22,10 @@ jobs:
tag_extension: -gpu
platforms: linux/amd64
steps:
- name: Remove unnecessary files
run: |
sudo rm -rf /usr/share/dotnet
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
- name: Checkout
uses: actions/checkout@v3
- name: Set up QEMU
Expand Down
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -41,4 +41,6 @@ pip-wheel-metadata

poetry/core/*

public
public

.devcontainer/dev.env
13 changes: 13 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ RUN export DEBIAN_FRONTEND=noninteractive \
pkg-config \
yasm \
ca-certificates \
gcc \
python3-dev \
&& rm -rf /var/lib/apt/lists/*

RUN git clone https://github.com/FFmpeg/FFmpeg.git --depth 1 --branch n6.1.1 --single-branch /FFmpeg-6.1.1
Expand Down Expand Up @@ -42,6 +44,12 @@ FROM swaggerapi/swagger-ui:v5.9.1 AS swagger-ui

FROM python:3.10-bookworm

RUN export DEBIAN_FRONTEND=noninteractive \
&& apt-get -qq update \
&& apt-get -qq install --no-install-recommends \
libsndfile1 \
&& rm -rf /var/lib/apt/lists/*

ENV POETRY_VENV=/app/.venv

RUN python3 -m venv $POETRY_VENV \
Expand All @@ -61,6 +69,11 @@ COPY --from=swagger-ui /usr/share/nginx/html/swagger-ui-bundle.js swagger-ui-ass
RUN poetry config virtualenvs.in-project true
RUN poetry install

RUN $POETRY_VENV/bin/pip install pandas transformers nltk pyannote.audio
RUN git clone --depth 1 https://github.com/m-bain/whisperX.git \
&& cd whisperX \
&& $POETRY_VENV/bin/pip install -e .

EXPOSE 9000

ENTRYPOINT ["whisper-asr-webservice"]
12 changes: 12 additions & 0 deletions Dockerfile.gpu
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,13 @@ FROM swaggerapi/swagger-ui:v5.9.1 AS swagger-ui
FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04

ENV PYTHON_VERSION=3.10

RUN export DEBIAN_FRONTEND=noninteractive \
&& apt-get -qq update \
&& apt-get -qq install --no-install-recommends \
libsndfile1 \
&& rm -rf /var/lib/apt/lists/*

ENV POETRY_VENV=/app/.venv

RUN export DEBIAN_FRONTEND=noninteractive \
Expand Down Expand Up @@ -79,6 +86,11 @@ COPY --from=swagger-ui /usr/share/nginx/html/swagger-ui-bundle.js swagger-ui-ass
RUN poetry install
RUN $POETRY_VENV/bin/pip install torch==1.13.1+cu117 -f https://download.pytorch.org/whl/torch

RUN $POETRY_VENV/bin/pip install pandas transformers nltk pyannote.audio
RUN git clone --depth 1 https://github.com/m-bain/whisperX.git \
&& cd whisperX \
&& $POETRY_VENV/bin/pip install -e .

EXPOSE 9000

CMD whisper-asr-webservice
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ Current release (v1.7.1) supports following whisper models:

- [openai/whisper](https://github.com/openai/whisper)@[v20240930](https://github.com/openai/whisper/releases/tag/v20240930)
- [SYSTRAN/faster-whisper](https://github.com/SYSTRAN/faster-whisper)@[v1.1.0](https://github.com/SYSTRAN/faster-whisper/releases/tag/v1.1.0)
- [whisperX](https://github.com/m-bain/whisperX)@[v3.1.1](https://github.com/m-bain/whisperX/releases/tag/v3.1.1)

## Quick Usage

Expand Down
27 changes: 18 additions & 9 deletions app/asr_models/asr_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,10 @@ class ASRModel(ABC):
"""
Abstract base class for ASR (Automatic Speech Recognition) models.
"""

model = None
diarize_model = None # used for WhisperX
x_models = dict() # used for WhisperX
model_lock = Lock()
last_activity_time = time.time()

Expand All @@ -28,14 +31,17 @@ def load_model(self):
pass

@abstractmethod
def transcribe(self,
audio,
task: Union[str, None],
language: Union[str, None],
initial_prompt: Union[str, None],
vad_filter: Union[bool, None],
word_timestamps: Union[bool, None]
):
def transcribe(
self,
audio,
task: Union[str, None],
language: Union[str, None],
initial_prompt: Union[str, None],
vad_filter: Union[bool, None],
word_timestamps: Union[bool, None],
options: Union[dict, None],
output,
):
"""
Perform transcription on the given audio file.
"""
Expand All @@ -52,7 +58,8 @@ def monitor_idleness(self):
"""
Monitors the idleness of the ASR model and releases the model if it has been idle for too long.
"""
if CONFIG.MODEL_IDLE_TIMEOUT <= 0: return
if CONFIG.MODEL_IDLE_TIMEOUT <= 0:
return
while True:
time.sleep(15)
if time.time() - self.last_activity_time > CONFIG.MODEL_IDLE_TIMEOUT:
Expand All @@ -68,4 +75,6 @@ def release_model(self):
torch.cuda.empty_cache()
gc.collect()
self.model = None
self.diarize_model = None
self.x_models = dict()
print("Model unloaded due to timeout")
1 change: 1 addition & 0 deletions app/asr_models/faster_whisper_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def transcribe(
initial_prompt: Union[str, None],
vad_filter: Union[bool, None],
word_timestamps: Union[bool, None],
options: Union[dict, None],
output,
):
self.last_activity_time = time.time()
Expand Down
111 changes: 111 additions & 0 deletions app/asr_models/mbain_whisperx_engine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
import time
from io import StringIO
from typing import BinaryIO, Union

import whisper
import whisperx
from whisperx.utils import ResultWriter, SubtitlesWriter

from app.asr_models.asr_model import ASRModel
from app.config import CONFIG
from app.utils import WriteJSON, WriteSRT, WriteTSV, WriteTXT, WriteVTT


class WhisperXASR(ASRModel):
    """ASR engine backed by m-bain/whisperX.

    Runs whisper transcription, then whisperX forced alignment for
    word-level timestamps, and optionally speaker diarization (only when
    ``CONFIG.HF_TOKEN`` is set, which gates creation of the pyannote
    diarization pipeline in :meth:`load_model`).
    """

    def __init__(self):
        # Per-language cache of (align_model, metadata) tuples used by
        # whisperx.align(); see transcribe() for the OOM caveat.
        self.x_models = dict()

    def load_model(self):
        """Lazily load the whisperX model (and diarization pipeline if an HF token is set)."""
        asr_options = {"without_timestamps": False}
        self.model = whisperx.load_model(
            CONFIG.MODEL_NAME, device=CONFIG.DEVICE, compute_type="float32", asr_options=asr_options
        )

        # The diarization pipeline needs a Hugging Face token (pyannote models
        # are gated); without one we leave self.diarize_model as None and
        # transcribe() skips diarization.
        if CONFIG.HF_TOKEN != "":
            self.diarize_model = whisperx.DiarizationPipeline(use_auth_token=CONFIG.HF_TOKEN, device=CONFIG.DEVICE)

    def transcribe(
        self,
        audio,
        task: Union[str, None],
        language: Union[str, None],
        initial_prompt: Union[str, None],
        vad_filter: Union[bool, None],
        word_timestamps: Union[bool, None],
        options: Union[dict, None],
        output,
    ):
        """Transcribe ``audio``, align word timestamps, optionally diarize.

        Returns a rewound StringIO containing the result rendered in the
        requested ``output`` format.

        ``vad_filter`` and ``word_timestamps`` are accepted for interface
        compatibility with the other engines but are not used by this one.
        ``options`` may carry ``diarize``, ``min_speakers``, ``max_speakers``.
        """
        # Keep the idle monitor (ASRModel.monitor_idleness) from unloading a
        # model that is actively in use — mirrors faster_whisper_engine.
        self.last_activity_time = time.time()
        # Fix: options is Optional; the previous code crashed on options.get()
        # when callers passed None.
        options = options or {}

        options_dict = {"task": task}
        if language:
            options_dict["language"] = language
        if initial_prompt:
            options_dict["initial_prompt"] = initial_prompt
        with self.model_lock:
            if self.model is None:
                self.load_model()
            result = self.model.transcribe(audio, **options_dict)

        # Load the alignment model for the detected language and cache it.
        # If we transcribe audio in many different languages, this cache may
        # lead to OOM problems.
        language_code = result["language"]
        if language_code not in self.x_models:
            self.x_models[language_code] = whisperx.load_align_model(
                language_code=language_code, device=CONFIG.DEVICE
            )
        model_x, metadata = self.x_models[language_code]

        # Align whisper output to get accurate word-level timestamps.
        result = whisperx.align(
            result["segments"], model_x, metadata, audio, CONFIG.DEVICE, return_char_alignments=False
        )

        if options.get("diarize", False):
            # Fix: previously this only *warned* when HF_TOKEN was unset and
            # then called self.diarize_model(...) anyway, raising a TypeError
            # because the pipeline is None. Skip diarization instead.
            if self.diarize_model is None:
                print("Warning! HF_TOKEN is not set. Diarization may not work as expected.")
            else:
                # Pass min/max number of speakers if known.
                min_speakers = options.get("min_speakers", None)
                max_speakers = options.get("max_speakers", None)
                diarize_segments = self.diarize_model(audio, min_speakers, max_speakers)
                result = whisperx.assign_word_speakers(diarize_segments, result)

        output_file = StringIO()
        self.write_result(result, output_file, output)
        output_file.seek(0)

        return output_file

    def language_detection(self, audio):
        """Detect the spoken language of ``audio`` and return its language code."""
        self.last_activity_time = time.time()

        # Load audio and pad/trim it to fit 30 seconds.
        audio = whisper.pad_or_trim(audio)

        with self.model_lock:
            # Fix: the model must be loaded *before* self.model.device is read;
            # previously the spectrogram was built outside the lock and crashed
            # with AttributeError when this was the first call on the engine.
            if self.model is None:
                self.load_model()

            # Make log-Mel spectrogram and move it to the model's device.
            # NOTE(review): .device and .detect_language are openai-whisper
            # model APIs; confirm the whisperx.load_model() pipeline exposes
            # the same attributes.
            mel = whisper.log_mel_spectrogram(audio).to(self.model.device)

            # Detect the spoken language.
            _, probs = self.model.detect_language(mel)
        detected_lang_code = max(probs, key=probs.get)

        return detected_lang_code

    def write_result(self, result: dict, file: BinaryIO, output: Union[str, None]):
        """Render ``result`` into ``file`` using the writer for ``output``.

        For srt/vtt, whisperx's SubtitlesWriter is used when HF_TOKEN is set
        (presumably because diarized output needs subtitle-style rendering —
        confirm against whisperx.utils), otherwise the plain ResultWriter.
        Unknown formats write nothing and return an error string.
        """
        if output == "srt":
            if CONFIG.HF_TOKEN != "":
                WriteSRT(SubtitlesWriter).write_result(result, file=file, options={})
            else:
                WriteSRT(ResultWriter).write_result(result, file=file, options={})
        elif output == "vtt":
            if CONFIG.HF_TOKEN != "":
                WriteVTT(SubtitlesWriter).write_result(result, file=file, options={})
            else:
                WriteVTT(ResultWriter).write_result(result, file=file, options={})
        elif output == "tsv":
            WriteTSV(ResultWriter).write_result(result, file=file, options={})
        elif output == "json":
            WriteJSON(ResultWriter).write_result(result, file=file, options={})
        elif output == "txt":
            WriteTXT(ResultWriter).write_result(result, file=file, options={})
        else:
            return 'Please select an output method!'
Loading