Merge pull request #19 from freemocap/jon/add-audio

Audio transcription method
freemocap · Jan 22, 2024 · 406e6a5 · 406e6a5
2 parents 08870b9 + 453efb0
commit 406e6a5
Show file tree

Hide file tree

Showing 22 changed files with 608 additions and 105 deletions.
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -26,6 +26,7 @@
     "@langchain/community": "^0.0.14",
     "@langchain/core": "^0.1.8",
     "@langchain/openai": "^0.0.9",
+    "@nestjs/axios": "^3.0.1",
     "@nestjs/common": "^10.0.0",
     "@nestjs/config": "^3.1.1",
     "@nestjs/core": "^10.0.0",
@@ -34,15 +35,18 @@
     "@nestjs/platform-express": "^10.0.0",
     "@nestjs/swagger": "^7.1.17",
     "@slack/bolt": "^3.17.0",
+    "axios": "^1.6.5",
     "bson": "^6.2.0",
     "bufferutil": "^4.0.8",
     "class-transformer": "^0.5.1",
     "class-validator": "^0.14.1",
     "discord.js": "^14.14.1",
+    "form-data": "^4.0.0",
     "langchain": "^0.0.209",
     "mongodb": "^5.9.2",
     "mongoose": "^8.0.3",
     "necord": "^6.4.2",
+    "openai": "^4.24.7",
     "reflect-metadata": "^0.1.13",
     "resource": "^0.8.1",
     "rxjs": "^7.8.1",
@@ -55,7 +59,7 @@
     "@nestjs/testing": "^10.0.0",
     "@types/express": "^4.17.17",
     "@types/jest": "^29.5.2",
-    "@types/node": "^20.10.6",
+    "@types/node": "^20.11.5",
     "@types/supertest": "^2.0.12",
     "@typescript-eslint/eslint-plugin": "^6.0.0",
     "@typescript-eslint/parser": "^6.0.0",

diff --git a/src/core/ai/langchain/langchain.service.ts b/src/core/ai/langchain/langchain.service.ts
@@ -22,7 +22,7 @@ export class LangchainService {
       this._logger.log('Creating model...');
       this._model = new OpenAI({
         modelName: modelName || 'gpt-4-1106-preview',
-        openAIApiKey: await this._openAiSecrets.getOpenAIKey(),
+        openAIApiKey: await this._openAiSecrets.getOpenaiApiKey(),
       });
     }
     this._logger.log('Returning model: ' + this._model.modelName);

diff --git a/src/core/ai/openai/dto/speech-to-text.dto.ts b/src/core/ai/openai/dto/speech-to-text.dto.ts
@@ -0,0 +1,50 @@
+import {
+  IsString,
+  IsIn,
+  IsOptional,
+  IsNumber,
+  Min,
+  Max,
+  MaxLength,
+  IsMimeType,
+} from 'class-validator';
+import { Type } from 'class-transformer';
+
+/**
+ * Validated request payload for a speech-to-text (audio transcription) call.
+ *
+ * Fields left out of the constructor argument keep the defaults declared on
+ * the class: `model = 'whisper-1'`, `response_format = 'verbose_json'` and
+ * `temperature = 0`.
+ */
+export class SpeechToTextDto {
+  /**
+   * The audio to transcribe.
+   *
+   * NOTE(review): `IsMimeType` validates MIME-type *strings* (and `each: true`
+   * expects a collection of them). Confirm that callers really pass MIME
+   * strings here rather than an uploaded file object or stream, otherwise this
+   * check will reject the payload.
+   */
+  @IsMimeType({
+    each: true,
+    message:
+      'File must be in one of the following formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, webm.',
+  })
+  @Type(() => Object)
+  file: any;
+
+  /** Transcription model identifier; only `'whisper-1'` is accepted. */
+  @IsOptional()
+  @IsIn(['whisper-1'])
+  model: 'whisper-1' = 'whisper-1';
+
+  /** Optional language of the input audio, as an ISO-639-1 code. */
+  @IsOptional()
+  @IsString({ message: 'Language code must be a valid ISO-639-1 string.' })
+  language?: string; // ISO-639-1 format
+
+  /**
+   * Optional text prompt, limited to 4096 characters.
+   *
+   * The limit is enforced with `MaxLength` (string length). `Max` compares
+   * numeric values only, so it would fail validation for every string.
+   */
+  @IsOptional()
+  @IsString({ message: 'Prompt must be a string.' })
+  @MaxLength(4096, {
+    message: 'Prompt must be less than or equal to 4096 characters.',
+  })
+  prompt?: string;
+
+  /** Desired output format of the transcription. */
+  @IsOptional()
+  @IsIn(['json', 'text', 'srt', 'verbose_json', 'vtt'])
+  response_format: 'json' | 'text' | 'srt' | 'verbose_json' | 'vtt' =
+    'verbose_json';
+
+  /** Sampling temperature, validated to lie in the range [0, 1]. */
+  @IsOptional()
+  @IsNumber()
+  @Min(0)
+  @Max(1)
+  temperature: number = 0;
+
+  /**
+   * Builds the DTO by copying the supplied fields over the class defaults.
+   *
+   * @param partial - Any subset of the DTO's fields.
+   */
+  constructor(partial: Partial<SpeechToTextDto>) {
+    Object.assign(this, partial);
+  }
+}
diff --git a/src/core/ai/openai/dto/text-to-speech.dto.ts b/src/core/ai/openai/dto/text-to-speech.dto.ts
@@ -0,0 +1,40 @@
+import {
+  IsString,
+  IsIn,
+  IsOptional,
+  IsNumber,
+  Min,
+  Max,
+  MaxLength,
+} from 'class-validator';
+import { Type } from 'class-transformer';
+
+/**
+ * Validated request payload for a text-to-speech (audio generation) call.
+ *
+ * `model`, `input` and `voice` are required; `response_format` and `speed`
+ * may be omitted.
+ */
+export class TextToSpeechDto {
+  /** Speech model identifier: `'tts-1'` or `'tts-1-hd'`. */
+  @IsIn(['tts-1', 'tts-1-hd'])
+  model: 'tts-1' | 'tts-1-hd';
+
+  /**
+   * Text to generate audio for, limited to 4096 characters.
+   *
+   * This field is required, so it is deliberately NOT marked `@IsOptional()`:
+   * that decorator would skip every other check when the value is missing.
+   * The limit is enforced with `MaxLength` (string length). `Max` compares
+   * numeric values only, so it would fail validation for every string.
+   */
+  @Type(() => String)
+  @IsString({ message: 'Input must be a string.' })
+  @MaxLength(4096, {
+    message: 'Input must be less than or equal to 4096 characters.',
+  })
+  input: string; // Text to generate audio for, max 4096 characters
+
+  /** Voice used for the generated audio. */
+  @IsIn(['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer'])
+  voice: 'alloy' | 'echo' | 'fable' | 'onyx' | 'nova' | 'shimmer';
+
+  /** Optional audio container/codec for the response. */
+  @IsOptional()
+  @IsIn(['mp3', 'opus', 'aac', 'flac'])
+  response_format?: 'mp3' | 'opus' | 'aac' | 'flac';
+
+  /** Optional playback speed, validated to lie in the range [0.25, 4.0]. */
+  @IsOptional()
+  @IsNumber()
+  @Min(0.25)
+  @Max(4.0)
+  speed?: number; // Between 0.25 and 4.0, defaults to 1
+
+  /**
+   * Builds the DTO by copying the supplied fields onto the new instance.
+   *
+   * @param partial - Any subset of the DTO's fields.
+   */
+  constructor(partial: Partial<TextToSpeechDto>) {
+    Object.assign(this, partial);
+  }
+}