perf: Enhance VAD

microsoft · Dec 5, 2024 · 0497c8d · 0497c8d
1 parent bd2b3b8
commit 0497c8d
Show file tree

Hide file tree

Showing 6 changed files with 25 additions and 14 deletions.
diff --git a/README.md b/README.md
@@ -632,7 +632,6 @@ Conversation options are represented as features. They can be configured from Ap
 | `slow_llm_for_chat` | Whether to use the slow LLM for chat. | `bool` | false |
 | `vad_cutoff_timeout_ms` | The cutoff timeout for voice activity detection in milliseconds. | `int` | 400 |
 | `vad_silence_timeout_ms` | The timeout for phone silence in milliseconds. | `int` | 400 |
-| `vad_threshold` | The threshold for voice activity detection. | `float` | 0.5 |
 
 ### Use an OpenAI compatible model for the LLM
 

diff --git a/app/helpers/call_llm.py b/app/helpers/call_llm.py
@@ -13,7 +13,11 @@
 from azure.communication.callautomation.aio import CallAutomationClient
 from openai import APIError
 from pydub import AudioSegment
-from pydub.effects import high_pass_filter, low_pass_filter
+from pydub.effects import (
+    high_pass_filter,
+    low_pass_filter,
+)
+from webrtcvad import Vad
 
 from app.helpers.call_utils import (
     handle_clear_queue,
@@ -27,7 +31,6 @@
     answer_soft_timeout_sec,
     vad_cutoff_timeout_ms,
     vad_silence_timeout_ms,
-    vad_threshold,
 )
 from app.helpers.identity import token
 from app.helpers.llm_tools import DefaultPlugin
@@ -586,6 +589,11 @@ async def _in_audio(  # noqa: PLR0913
 ) -> None:
     clear_tts_task: asyncio.Task | None = None
     flush_task: asyncio.Task | None = None
+    vad = Vad(
+        # Aggressiveness mode (0, 1, 2, or 3)
+        # Sets the VAD operating mode. A more aggressive (higher mode) VAD is more restrictive in reporting speech. In other words, the probability that the audio is actually speech when the VAD returns 1 increases with the mode. As a consequence, the missed detection rate also goes up.
+        mode=3,
+    )
 
     async def _flush_callback() -> None:
         """
@@ -637,18 +645,18 @@ async def _clear_tts_callback() -> None:
         in_stream.task_done()
 
         # Apply high-pass and low-pass filters in a simple attempt to reduce noise
-        in_audio = high_pass_filter(in_audio, 200)
-        in_audio = low_pass_filter(in_audio, 3000)
+        in_audio = high_pass_filter(seg=in_audio, cutoff=85)
+        in_audio = low_pass_filter(seg=in_audio, cutoff=3000)
 
         # Always add the audio to the buffer
         assert isinstance(in_audio.raw_data, bytes)
         out_stream.write(in_audio.raw_data)
 
-        # Get the relative dB, silences shoudl be at 1 to 5% of the max, so 0.1 to 0.5 of the threshold
+        # Use WebRTC VAD algorithm to detect voice
         in_empty = False
-        if (
-            min(in_audio.rms / in_audio.max_possible_amplitude * 10, 1)
-            < await vad_threshold()
+        if not vad.is_speech(
+            buf=in_audio.raw_data,
+            sample_rate=in_audio.frame_rate,
         ):
             in_empty = True
             # Start timeout if not already started and VAD already triggered

diff --git a/app/helpers/features.py b/app/helpers/features.py
@@ -36,10 +36,6 @@ async def vad_cutoff_timeout_ms() -> int:
     return await _get(key="vad_cutoff_timeout_ms", type_res=int) or 400
 
 
-async def vad_threshold() -> float:
-    return await _get(key="vad_threshold", type_res=float) or 0.5
-
-
 async def recording_enabled() -> bool:
     return await _get(key="recording_enabled", type_res=bool) or False
 

diff --git a/cicd/bicep/app.bicep b/cicd/bicep/app.bicep
@@ -907,7 +907,6 @@ resource configValues 'Microsoft.AppConfiguration/configurationStores/keyValues@
     slow_llm_for_chat: false
     vad_cutoff_timeout_ms: 400
     vad_silence_timeout_ms: 400
-    vad_threshold: '0.5'
   }): {
     parent: configStore
     name: item.key

diff --git a/pyproject.toml b/pyproject.toml
@@ -51,6 +51,7 @@ dependencies = [
   "twilio~=9.3",  # Twilio SDK, used for SMS
   "typing-extensions~=4.12",  # Typing extensions for Python 3.6+
   "uvicorn[standard]~=0.32",  # Application middleware
+  "webrtcvad~=2.0",  # Voice activity detection
 ]
 
 [project.optional-dependencies]

diff --git a/uv.lock b/uv.lock