Merge pull request #86 from njbbaer/audio-input

Use transcription endpoint to convert voice messages to text
njbbaer · Jan 31, 2024 · 91e88e9 · 91e88e9
2 parents b4d28af + 2cf548b
commit 91e88e9
Show file tree

Hide file tree

Showing 2 changed files with 36 additions and 5 deletions.
diff --git a/src/telegram/telegram_bot.py b/src/telegram/telegram_bot.py
@@ -17,10 +17,15 @@ def __init__(self, context_filepath, telegram_token, authorized_users):
         self.app = ApplicationBuilder().token(telegram_token).build()
         self.sim = Simulacrum(context_filepath)
 
+        # Ignore stale messages
         self.app.add_handler(MessageHandler(StaleMessageFilter(), self.do_nothing))
+
+        # Disallow unauthorized users
         self.app.add_handler(
             MessageHandler(~filters.User(username=authorized_users), self.unauthorized)
         )
+
+        # Handle commands
         self.app.add_handler(
             CommandHandler("new", self.new_conversation_command_handler)
         )
@@ -32,12 +37,16 @@ def __init__(self, context_filepath, telegram_token, authorized_users):
         self.app.add_handler(CommandHandler("clear", self.clear_command_handler))
         self.app.add_handler(CommandHandler("help", self.help_command_handler))
         self.app.add_handler(CommandHandler("start", self.do_nothing))
+
+        # Handle messages
         self.app.add_handler(
             MessageHandler(
-                filters.TEXT & ~filters.COMMAND | filters.PHOTO,
+                (filters.TEXT & ~filters.COMMAND) | filters.PHOTO | filters.VOICE,
                 self.chat_message_handler,
             )
         )
+
+        # Handle unknown messages and errors
         self.app.add_handler(MessageHandler(filters.ALL, self.unknown_message_handler))
         self.app.add_error_handler(self.error_handler)
 
@@ -47,7 +56,7 @@ def run(self):
     @message_handler
     async def chat_message_handler(self, ctx):
         image_url = await ctx.get_image_url()
-        text = ctx.message.text or ctx.message.caption
+        text = await ctx.get_text()
         await self._chat(ctx, text, image_url)
 
     @message_handler
@@ -75,18 +84,15 @@ async def undo_command_handler(self, ctx):
     @message_handler
     async def stats_command_handler(self, ctx):
         lines = []
-
         lines.append("*Conversation*")
         lines.append(f"`Cost: ${self.sim.get_conversation_cost():.2f}`")
-
         lines.append("\n*Last Message*")
         if self.sim.last_cost:
             lines.append(f"`Cost: ${self.sim.last_cost:.2f}`")
             lines.append(f"`Prompt tokens: {self.sim.last_prompt_tokens}`")
             lines.append(f"`Completion tokens: {self.sim.last_completion_tokens}`")
         else:
             lines.append("`Not available`")
-
         await ctx.send_message("\n".join(lines))
 
     @message_handler

diff --git a/src/telegram/telegram_context.py b/src/telegram/telegram_context.py
@@ -1,3 +1,8 @@
+import os
+import uuid
+
+from openai import AsyncOpenAI
+
 from telegram.error import BadRequest
 
 
@@ -23,6 +28,11 @@ async def get_image_url(self):
         file = await self.app.bot.get_file(photo_file.file_id)
         return file.file_path
 
+    async def get_text(self):
+        if self.message.voice:
+            return await self._transcribe_voice()
+        return self.message.text or self.message.caption
+
     async def send_message(self, text):
         # Attempt to fix broken markdown
         if text.count("_") % 2 != 0 and text.endswith("_"):
@@ -35,3 +45,18 @@ async def send_message(self, text):
 
     async def send_typing_action(self):
         await self.app.bot.send_chat_action(chat_id=self.chat_id, action="typing")
+
+    async def _transcribe_voice(self):
+        file_id = self.message.voice.file_id
+        voice_file = await self.context.bot.get_file(file_id)
+        os.makedirs("tmp", exist_ok=True)
+        voice_filepath = f"tmp/{uuid.uuid4()}.ogg"
+        try:
+            await voice_file.download_to_drive(voice_filepath)
+            with open(voice_filepath, "rb") as file:
+                transcript = await AsyncOpenAI().audio.transcriptions.create(
+                    model="whisper-1", file=file
+                )
+            return transcript.text
+        finally:
+            os.remove(voice_filepath)