Skip to content

Commit

Permalink
perf: Enhance VAD
Browse files Browse the repository at this point in the history
  • Loading branch information
clemlesne committed Dec 5, 2024
1 parent bd2b3b8 commit 0497c8d
Show file tree
Hide file tree
Showing 6 changed files with 25 additions and 14 deletions.
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -632,7 +632,6 @@ Conversation options are represented as features. They can be configured from Ap
| `slow_llm_for_chat` | Whether to use the slow LLM for chat. | `bool` | false |
| `vad_cutoff_timeout_ms` | The cutoff timeout for voice activity detection in milliseconds. | `int` | 400 |
| `vad_silence_timeout_ms` | The timeout for phone silence in milliseconds. | `int` | 400 |
| `vad_threshold` | The threshold for voice activity detection. | `float` | 0.5 |

### Use an OpenAI compatible model for the LLM

Expand Down
24 changes: 16 additions & 8 deletions app/helpers/call_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,11 @@
from azure.communication.callautomation.aio import CallAutomationClient
from openai import APIError
from pydub import AudioSegment
from pydub.effects import high_pass_filter, low_pass_filter
from pydub.effects import (
high_pass_filter,
low_pass_filter,
)
from webrtcvad import Vad

from app.helpers.call_utils import (
handle_clear_queue,
Expand All @@ -27,7 +31,6 @@
answer_soft_timeout_sec,
vad_cutoff_timeout_ms,
vad_silence_timeout_ms,
vad_threshold,
)
from app.helpers.identity import token
from app.helpers.llm_tools import DefaultPlugin
Expand Down Expand Up @@ -586,6 +589,11 @@ async def _in_audio( # noqa: PLR0913
) -> None:
clear_tts_task: asyncio.Task | None = None
flush_task: asyncio.Task | None = None
vad = Vad(
# Aggressiveness mode (0, 1, 2, or 3)
# Sets the VAD operating mode. A more aggressive (higher mode) VAD is more restrictive in reporting speech. In other words, the probability that audio is actually speech when the VAD returns 1 increases with the mode. As a consequence, the missed detection rate also goes up.
mode=3,
)

async def _flush_callback() -> None:
"""
Expand Down Expand Up @@ -637,18 +645,18 @@ async def _clear_tts_callback() -> None:
in_stream.task_done()

# Apply high-pass and low-pass filters in a simple attempt to reduce noise
in_audio = high_pass_filter(in_audio, 200)
in_audio = low_pass_filter(in_audio, 3000)
in_audio = high_pass_filter(seg=in_audio, cutoff=85)
in_audio = low_pass_filter(seg=in_audio, cutoff=3000)

# Always add the audio to the buffer
assert isinstance(in_audio.raw_data, bytes)
out_stream.write(in_audio.raw_data)

# Get the relative dB, silences should be at 1 to 5% of the max, so 0.1 to 0.5 of the threshold
# Use WebRTC VAD algorithm to detect voice
in_empty = False
if (
min(in_audio.rms / in_audio.max_possible_amplitude * 10, 1)
< await vad_threshold()
if not vad.is_speech(
buf=in_audio.raw_data,
sample_rate=in_audio.frame_rate,
):
in_empty = True
# Start timeout if not already started and VAD already triggered
Expand Down
4 changes: 0 additions & 4 deletions app/helpers/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,6 @@ async def vad_cutoff_timeout_ms() -> int:
return await _get(key="vad_cutoff_timeout_ms", type_res=int) or 400


async def vad_threshold() -> float:
    """
    Return the `vad_threshold` feature value.

    Falls back to `0.5` when the lookup yields a falsy value.
    """
    threshold = await _get(key="vad_threshold", type_res=float)
    return threshold or 0.5


async def recording_enabled() -> bool:
    """
    Return the `recording_enabled` feature value.

    Falls back to `False` when the lookup yields a falsy value.
    """
    enabled = await _get(key="recording_enabled", type_res=bool)
    return enabled or False

Expand Down
1 change: 0 additions & 1 deletion cicd/bicep/app.bicep
Original file line number Diff line number Diff line change
Expand Up @@ -907,7 +907,6 @@ resource configValues 'Microsoft.AppConfiguration/configurationStores/keyValues@
slow_llm_for_chat: false
vad_cutoff_timeout_ms: 400
vad_silence_timeout_ms: 400
vad_threshold: '0.5'
}): {
parent: configStore
name: item.key
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ dependencies = [
"twilio~=9.3", # Twilio SDK, used for SMS
"typing-extensions~=4.12", # Typing extensions for Python 3.6+
"uvicorn[standard]~=0.32", # Application middleware
"webrtcvad~=2.0", # Voice activity detection
]

[project.optional-dependencies]
Expand Down
8 changes: 8 additions & 0 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 0497c8d

Please sign in to comment.