Skip to content

Commit

Permalink
support Web Speech API
Browse files Browse the repository at this point in the history
  • Loading branch information
Mahmoudz committed May 10, 2024
1 parent 0b97b51 commit 51209be
Show file tree
Hide file tree
Showing 4 changed files with 191 additions and 28 deletions.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Voice AI Assistant (React JS)
# AI Voice Assistant (React JS)

Turn your App smart with a conversational AI assistant and interactive voice UI **in less than 10 minutes**!
Give your App a voice with a conversational AI assistant and an interactive voice UI **in less than 10 minutes**!

**No Code Changes! No Intent Definitions!** _Just add our magic button `<AiAssistantButton />`._

Expand Down Expand Up @@ -367,7 +367,7 @@ Change AI assistant's voice via the [Admin Panel](https://admin.sista.ai/applica
<img src="./assets/sista-icon.png" alt="Sista Logo" width="100"/>
</a>

Unlock the Future with our advacned **Voice AI Assistant**: Embrace top-tier components:
Unlock the Future with our advanced **AI Voice Assistant**: Embrace top-tier components:

- Conversational AI Agents
- Interactive Voice UI
Expand Down
99 changes: 76 additions & 23 deletions src/core/AiAssistantEngine.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import Scraper from './Scraper';
import config from './config';
import { VoiceFunction } from './commonTypes';
import User from './User';
import SpeechToText from './SpeechToText';

interface ApiResponse {
data: {
Expand All @@ -22,17 +23,24 @@ interface ApiResponse {
message: string;
}

// How a spoken user command is captured before being sent to the API:
// - AUDIO: record raw audio (AudioRecorder) and upload the blob for
//   server-side transcription.
// - TEXT:  transcribe locally via the browser's Web Speech API
//   (SpeechToText) and upload the resulting string.
enum UserInputMethod {
    AUDIO = 'AUDIO',
    TEXT = 'TEXT',
}

class AiAssistantEngine extends EventEmitter {
private readonly apiKey: string;
private readonly apiUrl: string;
private readonly scrapeContent: boolean;
private readonly sdkVersion: string;
private readonly audioPlayer: AudioPlayer;
private readonly audioRecorder: AudioRecorder;
private readonly speechToText: SpeechToText;
private readonly functionExecutor: FunctionExecutor;
private readonly scraper: Scraper;
private readonly user: User;
private readonly pageContent: Record<string, string[]> | null;
private userInputMethod: UserInputMethod;

constructor(
apiKey: string,
Expand All @@ -42,33 +50,45 @@ class AiAssistantEngine extends EventEmitter {
debugMode: boolean = false,
) {
super();
Logger.setDebugMode(debugMode);

if (!apiKey) {
throw new Error(
'Missing API Key for AiAssistantProvider. Get your FREE Key from https://admin.sista.ai/applications',
);
}

Logger.setDebugMode(debugMode);
// Control the user input method. TEXT = try to convert speech to text using
// browser's SpeechRecognition API first and fallback to Audio recording if it fails.
this.userInputMethod = UserInputMethod.TEXT;
this.sdkVersion = pkg.version;
Logger.log(
`--[SISTA]-- Initializing AiAssistantEngine Version: ${this.sdkVersion}`,
);
this.scrapeContent = scrapeContent;
this.apiKey = apiKey;
Logger.log(
'--[SISTA]-- Using Access Key:',
'...' + this.apiKey.slice(-8),
);
this.apiUrl = apiUrl;
Logger.log('--[SISTA]-- Using Base URL:', this.apiUrl);

this.audioPlayer = new AudioPlayer();
this.scrapeContent = scrapeContent;
this.user = new User(userId);
this.speechToText = new SpeechToText();
this.audioRecorder = new AudioRecorder();
this.audioPlayer = new AudioPlayer();
this.functionExecutor = new FunctionExecutor();
this.scraper = new Scraper();
this.pageContent = this.scrapeContent ? this.scraper.getText() : null;
this.user = new User(userId);

// Log the custom object
Logger.log(
'--[SISTA]-- Initialize Ai Assistant Engine:',
JSON.stringify(
{
Version: this.sdkVersion,
APIKey: '...' + this.apiKey.slice(-8),
APIUrl: this.apiUrl,
AutoScrapeContent: this.scrapeContent,
ConfiguredUserInputMethod: this.userInputMethod,
User: this.user,
},
null,
2,
),
);
}

registerFunctions(voiceFunctions: VoiceFunction[]): void {
Expand All @@ -79,40 +99,72 @@ class AiAssistantEngine extends EventEmitter {
Logger.log('--[SISTA]-- startProcessing');

this.emitStateChange(EventEmitter.STATE_LISTENING_START);

this.audioPlayer.playStartTone();

let userAudioCommand: Blob | undefined;
let inputUserCommand: string | Blob;

try {
userAudioCommand = await this.audioRecorder.startRecording();
inputUserCommand = await this.getUserInput();
Logger.log(
`--[SISTA]-- Used "User Input Method" = ${this.userInputMethod}`,
);
} catch (err) {
Logger.error('Error accessing the microphone:', err);
Logger.error('Error getting user input:', err);
this.emitStateChange(EventEmitter.STATE_IDLE);
return;
}

if (userAudioCommand) {
if (inputUserCommand) {
try {
await this._makeAPIRequest(userAudioCommand);
await this._makeAPIRequest(inputUserCommand);
} catch (err) {
Logger.error('Error making API request:', err);
this.emitStateChange(EventEmitter.STATE_IDLE);
}
}
};

private _makeAPIRequest = async (audioBlob: Blob): Promise<void> => {
/**
 * Captures one user command using the currently configured input method.
 *
 * TEXT uses the browser's SpeechRecognition API (returns a transcript
 * string); AUDIO records the microphone (returns an audio Blob). If the
 * active method fails, we switch to the other method once and retry.
 *
 * @param isFallbackAttempt internal flag — true when this call is already
 *        the one-time fallback retry. Defaults to false, so external
 *        callers are unaffected.
 * @returns the transcript string (TEXT) or recorded audio Blob (AUDIO).
 * @throws the underlying capture error when both input methods fail.
 */
private async getUserInput(
    isFallbackAttempt: boolean = false,
): Promise<string | Blob> {
    try {
        switch (this.userInputMethod) {
            case UserInputMethod.AUDIO:
                return await this.audioRecorder.startRecording();
            case UserInputMethod.TEXT:
                return await this.speechToText.convertSpeechToText();
            default:
                throw new Error('Invalid user input method!');
        }
    } catch (err) {
        // BUG FIX: the original recursed unconditionally, flipping the
        // method each time — if both AUDIO and TEXT kept failing it would
        // recurse forever. Allow exactly one fallback, then surface the
        // error to the caller (startProcessing already catches it).
        if (isFallbackAttempt) {
            throw err;
        }
        Logger.error('Error getting user input, switching method:', err);
        this.userInputMethod =
            this.userInputMethod === UserInputMethod.AUDIO
                ? UserInputMethod.TEXT
                : UserInputMethod.AUDIO;
        Logger.log(
            `--[SISTA]-- FALLBACK: Switching "User Input Method" To = ${this.userInputMethod}`,
        );
        return this.getUserInput(true);
    }
}

private _makeAPIRequest = async (
userInput: Blob | string,
): Promise<void> => {
Logger.log('--[SISTA]-- _makeAPIRequest');
this.emitStateChange(EventEmitter.STATE_THINKING_START);

const formData = new FormData();

if (this.userInputMethod === UserInputMethod.AUDIO) {
formData.append('userInputAsAudio', userInput as Blob);
} else if (this.userInputMethod === UserInputMethod.TEXT) {
formData.append('userInputAsText', userInput as string);
}

formData.append('sdkVersion', this.sdkVersion);
formData.append(
'endUser',
JSON.stringify(this.user.getEndUserDetails()),
);
formData.append('audio', audioBlob);
formData.append(
'functionsSignatures',
JSON.stringify(this.functionExecutor.functionSignatures),
Expand Down Expand Up @@ -154,7 +206,9 @@ class AiAssistantEngine extends EventEmitter {
// ----[ Step 1: Display User Input Command ]----
// Handle user command as text first. This is useful for debugging
if (response.data.inputVoiceCommandAsText) {
this._handleInputVoiceCommandAsText(response.data.inputVoiceCommandAsText);
this._handleInputVoiceCommandAsText(
response.data.inputVoiceCommandAsText,
);
}

// ----[ Step 2: Display AI Text Reply ]----
Expand All @@ -180,7 +234,6 @@ class AiAssistantEngine extends EventEmitter {
if (response.data.outputAudioReply) {
this._handleAudioResponse(response.data.outputAudioReply);
}

};

private _handleAudioResponse = (audioFile: string): void => {
Expand Down
109 changes: 109 additions & 0 deletions src/core/SpeechToText.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
import Logger from './Logger';

// Minimal structural types for the browser's Web Speech API, declared
// locally because the API is not covered by all TS DOM lib configs.
// NOTE(review): these model only the members this SDK touches; the real
// SpeechRecognitionEvent / SpeechRecognitionErrorEvent interfaces are richer.
interface SpeechRecognitionEvent {
    results: Array<{
        // Indexed alternatives; only [0].transcript is read by this SDK.
        [index: number]: { transcript: string };
        isFinal: boolean;
    }>;
    resultIndex: number;
    // Present on error events only (consumed by onerror handlers).
    error?: string;
}

// Combined constructor + instance shape: `new ()` lets the same type be
// used both for the constructor value and for the created instance.
interface SpeechRecognitionObject {
    new (): SpeechRecognitionObject;
    prototype: SpeechRecognitionObject;
    continuous: boolean;
    interimResults: boolean;
    lang: string;
    onresult?: (event: SpeechRecognitionEvent) => void;
    onerror?: (event: SpeechRecognitionEvent) => void;
    onend?: () => void;
    start: () => void;
    stop: () => void;
}

// Ambient globals: browsers expose either the standard name or the
// webkit-prefixed one (e.g. Chrome/Safari), possibly neither.
declare const SpeechRecognition: SpeechRecognitionObject | undefined;
declare const webkitSpeechRecognition: SpeechRecognitionObject | undefined;

/**
 * Thin wrapper around the browser's Web Speech API that converts one
 * spoken utterance into a text transcript.
 */
class SpeechToText {
    private recognition: SpeechRecognitionObject;
    private finalTranscript: string = '';
    // True while a recognition session is active; guards against
    // overlapping start() calls (start() throws if already running).
    private isListening: boolean = false;

    /**
     * Configures a single-utterance, final-results-only recognizer using
     * the standard or webkit-prefixed SpeechRecognition implementation.
     * @throws Error when the browser supports neither.
     */
    constructor() {
        const SpeechRecognition =
            window.SpeechRecognition || window.webkitSpeechRecognition;
        if (!SpeechRecognition) {
            throw new Error(
                'Speech recognition API not supported in this browser.',
            );
        }
        this.recognition = new SpeechRecognition();
        this.recognition.continuous = false; // stop after one utterance
        this.recognition.interimResults = false; // final results only
        this.recognition.lang = navigator.language || 'en-US';

        this.recognition.onresult = (event: SpeechRecognitionEvent) => {
            // Append only final (non-interim) results to the transcript.
            for (let i = event.resultIndex; i < event.results.length; ++i) {
                const result = event.results[i];
                if (result.isFinal) {
                    this.finalTranscript += result[0].transcript.trim();
                }
            }
            Logger.log(`Final result: ${this.finalTranscript}`);
        };

        this.recognition.onerror = (event: SpeechRecognitionEvent) => {
            Logger.error(`Speech recognition error: ${event.error}`);
            this.isListening = false;
        };

        this.recognition.onend = () => {
            Logger.log('Speech recognition stopped.');
            this.isListening = false;
        };
    }

    /**
     * Runs one recognition session and resolves with the accumulated final
     * transcript (empty string when nothing was recognized).
     * @returns the transcript once the recognition service ends.
     * @throws (rejects) when a session is already running or on a
     *         recognition error.
     */
    public async convertSpeechToText(): Promise<string> {
        return new Promise((resolve, reject) => {
            // BUG FIX: the original tested `this.recognition.continuous`,
            // a configuration flag permanently set to false in the
            // constructor, so this re-entrancy guard could never fire.
            // Track the actual session state instead, and reject with an
            // Error object rather than a bare string.
            if (this.isListening) {
                reject(new Error('Recognition is already in progress'));
                return;
            }

            this.finalTranscript = '';

            this.recognition.onend = () => {
                Logger.log('Speech recognition service has ended.');
                this.isListening = false;
                resolve(this.finalTranscript);
            };

            this.recognition.onerror = (event: SpeechRecognitionEvent) => {
                Logger.error(`Speech recognition error: ${event.error}`);
                this.isListening = false;
                reject(new Error(event.error));
            };

            this.recognition.start();
            // BUG FIX: the original never set isListening to true anywhere,
            // leaving every isListening guard in this class dead.
            this.isListening = true;
            Logger.log('Speech recognition started.');
        });
    }

    /**
     * Starts a fire-and-forget recognition session; results accumulate in
     * finalTranscript via the constructor's onresult handler.
     */
    public startListening(): void {
        if (!this.isListening) {
            this.finalTranscript = '';
            this.recognition.start();
            this.isListening = true; // BUG FIX: was never set (see above)
            Logger.log('Speech recognition started.');
        }
    }

    /** Stops the active recognition session, if any. */
    public stopListening(): void {
        if (this.isListening) {
            this.recognition.stop();
            Logger.log('Speech recognition stopped.');
        }
    }
}

export default SpeechToText;
5 changes: 3 additions & 2 deletions src/core/User.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,17 @@ interface EndUserDetails {

class User {
private providedUserId: string | null;
private generatedUserId: string;

// providedUserId: the integrator-supplied user id, or null when the host
// app did not identify the user.
constructor(providedUserId: string | null) {
    this.providedUserId = providedUserId;
    // Generated once at construction so repeated getEndUserDetails() calls
    // report a stable id for the lifetime of this User instance.
    this.generatedUserId = this._generateEndUserId();
}

/**
 * Builds the end-user identification payload sent alongside API requests:
 * the browser's user agent, the id generated at construction time, and
 * the integrator-provided id (null when none was supplied).
 *
 * @returns a fresh EndUserDetails object on every call.
 */
public getEndUserDetails(): EndUserDetails {
    // Property order is preserved deliberately: callers JSON.stringify
    // this object, and reordering would change the serialized form.
    const details: EndUserDetails = {
        endUserAgent: navigator.userAgent,
        generatedEndUserId: this.generatedUserId,
        providedEndUserId: this.providedUserId,
    };
    return details;
}
Expand All @@ -34,5 +36,4 @@ class User {
return endUserId;
}
}

export default User;

0 comments on commit 51209be

Please sign in to comment.