"use strict"; var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { if (k2 === undefined) k2 = k; var desc = Object.getOwnPropertyDescriptor(m, k); if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { desc = { enumerable: true, get: function() { return m[k]; } }; } Object.defineProperty(o, k2, desc); }) : (function(o, m, k, k2) { if (k2 === undefined) k2 = k; o[k2] = m[k]; })); var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { Object.defineProperty(o, "default", { enumerable: true, value: v }); }) : function(o, v) { o["default"] = v; }); var __importStar = (this && this.__importStar) || function (mod) { if (mod && mod.__esModule) return mod; var result = {}; if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k); __setModuleDefault(result, mod); return result; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.AudioNodeVAD = exports.MicVAD = exports.getDefaultRealTimeVADOptions = exports.ort = exports.DEFAULT_MODEL = void 0; const ortInstance = __importStar(require("onnxruntime-web")); const default_model_fetcher_1 = require("./default-model-fetcher"); const frame_processor_1 = require("./frame-processor"); const logging_1 = require("./logging"); const messages_1 = require("./messages"); const models_1 = require("./models"); const resampler_1 = require("./resampler"); exports.DEFAULT_MODEL = "legacy"; exports.ort = ortInstance; const workletFile = "vad.worklet.bundle.min.js"; const sileroV5File = "silero_vad_v5.onnx"; const sileroLegacyFile = "silero_vad_legacy.onnx"; const getDefaultRealTimeVADOptions = (model) => { const frameProcessorOptions = model === "v5" ? frame_processor_1.defaultV5FrameProcessorOptions : frame_processor_1.defaultLegacyFrameProcessorOptions; return { ...frameProcessorOptions, onFrameProcessed: (probabilities, frame) => { }, onVADMisfire: () => { logging_1.log.debug("VAD misfire"); }, onSpeechStart: () => { logging_1.log.debug("Detected speech start"); }, onSpeechEnd: () => { logging_1.log.debug("Detected speech end"); }, onSpeechRealStart: () => { logging_1.log.debug("Detected real speech start"); }, baseAssetPath: "https://cdn.jsdelivr.net/npm/@ricky0123/vad-web@latest/dist/", onnxWASMBasePath: "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.14.0/dist/", stream: undefined, ortConfig: undefined, model: model, workletOptions: {}, }; }; exports.getDefaultRealTimeVADOptions = getDefaultRealTimeVADOptions; class MicVAD { static async new(options = {}) { const fullOptions = { ...(0, exports.getDefaultRealTimeVADOptions)(options.model ?? exports.DEFAULT_MODEL), ...options, }; (0, frame_processor_1.validateOptions)(fullOptions); let stream; if (fullOptions.stream === undefined) stream = await navigator.mediaDevices.getUserMedia({ audio: { ...fullOptions.additionalAudioConstraints, channelCount: 1, echoCancellation: true, autoGainControl: true, noiseSuppression: true, }, }); else stream = fullOptions.stream; const audioContext = new AudioContext(); const sourceNode = new MediaStreamAudioSourceNode(audioContext, { mediaStream: stream, }); const audioNodeVAD = await AudioNodeVAD.new(audioContext, fullOptions); audioNodeVAD.receive(sourceNode); return new MicVAD(fullOptions, audioContext, stream, audioNodeVAD, sourceNode); } constructor(options, audioContext, stream, audioNodeVAD, sourceNode, listening = false) { this.options = options; this.audioContext = audioContext; this.stream = stream; this.audioNodeVAD = audioNodeVAD; this.sourceNode = sourceNode; this.listening = listening; this.pause = () => { this.audioNodeVAD.pause(); this.listening = false; }; this.start = () => { this.audioNodeVAD.start(); this.listening = true; }; this.destroy = () => { if (this.listening) { this.pause(); } if (this.options.stream === undefined) { this.stream.getTracks().forEach((track) => track.stop()); } this.sourceNode.disconnect(); this.audioNodeVAD.destroy(); this.audioContext.close(); }; this.setOptions = (options) => { this.audioNodeVAD.setFrameProcessorOptions(options); }; } } exports.MicVAD = MicVAD; class AudioNodeVAD { static async new(ctx, options = {}) { const fullOptions = { ...(0, exports.getDefaultRealTimeVADOptions)(options.model ?? exports.DEFAULT_MODEL), ...options, }; (0, frame_processor_1.validateOptions)(fullOptions); exports.ort.env.wasm.wasmPaths = fullOptions.onnxWASMBasePath; if (fullOptions.ortConfig !== undefined) { fullOptions.ortConfig(exports.ort); } const modelFile = fullOptions.model === "v5" ? sileroV5File : sileroLegacyFile; const modelURL = fullOptions.baseAssetPath + modelFile; const modelFactory = fullOptions.model === "v5" ? models_1.SileroV5.new : models_1.SileroLegacy.new; let model; try { model = await modelFactory(exports.ort, () => (0, default_model_fetcher_1.defaultModelFetcher)(modelURL)); } catch (e) { console.error(`Encountered an error while loading model file ${modelURL}`); throw e; } const frameProcessor = new frame_processor_1.FrameProcessor(model.process, model.reset_state, { frameSamples: fullOptions.frameSamples, positiveSpeechThreshold: fullOptions.positiveSpeechThreshold, negativeSpeechThreshold: fullOptions.negativeSpeechThreshold, redemptionFrames: fullOptions.redemptionFrames, preSpeechPadFrames: fullOptions.preSpeechPadFrames, minSpeechFrames: fullOptions.minSpeechFrames, submitUserSpeechOnPause: fullOptions.submitUserSpeechOnPause, }); const audioNodeVAD = new AudioNodeVAD(ctx, fullOptions, frameProcessor); await audioNodeVAD.setupAudioNode(); return audioNodeVAD; } constructor(ctx, options, frameProcessor) { this.ctx = ctx; this.options = options; this.bufferIndex = 0; this.pause = () => { this.frameProcessor.pause(this.handleFrameProcessorEvent); }; this.start = () => { this.frameProcessor.resume(); }; this.receive = (node) => { node.connect(this.audioNode); }; this.processFrame = async (frame) => { await this.frameProcessor.process(frame, this.handleFrameProcessorEvent); }; this.handleFrameProcessorEvent = (ev) => { switch (ev.msg) { case messages_1.Message.FrameProcessed: this.options.onFrameProcessed(ev.probs, ev.frame); break; case messages_1.Message.SpeechStart: this.options.onSpeechStart(); break; case messages_1.Message.SpeechRealStart: this.options.onSpeechRealStart(); break; case messages_1.Message.VADMisfire: this.options.onVADMisfire(); break; case messages_1.Message.SpeechEnd: this.options.onSpeechEnd(ev.audio); break; } }; this.destroy = () => { if (this.audioNode instanceof AudioWorkletNode) { this.audioNode.port.postMessage({ message: messages_1.Message.SpeechStop, }); } this.audioNode.disconnect(); this.gainNode?.disconnect(); }; this.setFrameProcessorOptions = (options) => { this.frameProcessor.options = { ...this.frameProcessor.options, ...options, }; }; this.frameProcessor = frameProcessor; } async setupAudioNode() { const hasAudioWorklet = "audioWorklet" in this.ctx && typeof AudioWorkletNode === "function"; if (hasAudioWorklet) { try { const workletURL = this.options.baseAssetPath + workletFile; await this.ctx.audioWorklet.addModule(workletURL); const workletOptions = this.options.workletOptions ?? {}; workletOptions.processorOptions = { ...(workletOptions.processorOptions ?? {}), frameSamples: this.options.frameSamples, }; this.audioNode = new AudioWorkletNode(this.ctx, "vad-helper-worklet", workletOptions); this.audioNode.port.onmessage = async (ev) => { switch (ev.data?.message) { case messages_1.Message.AudioFrame: let buffer = ev.data.data; if (!(buffer instanceof ArrayBuffer)) { buffer = new ArrayBuffer(ev.data.data.byteLength); new Uint8Array(buffer).set(new Uint8Array(ev.data.data)); } const frame = new Float32Array(buffer); await this.processFrame(frame); break; } }; return; } catch (e) { console.log("AudioWorklet setup failed, falling back to ScriptProcessor", e); } } // Initialize resampler for ScriptProcessor this.resampler = new resampler_1.Resampler({ nativeSampleRate: this.ctx.sampleRate, targetSampleRate: 16000, targetFrameSize: this.options.frameSamples ?? 480, }); // Fallback to ScriptProcessor const bufferSize = 4096; // Increased for more stable processing this.audioNode = this.ctx.createScriptProcessor(bufferSize, 1, 1); // Create a gain node with zero gain to handle the audio chain this.gainNode = this.ctx.createGain(); this.gainNode.gain.value = 0; let processingAudio = false; this.audioNode.onaudioprocess = async (e) => { if (processingAudio) return; processingAudio = true; try { const input = e.inputBuffer.getChannelData(0); const output = e.outputBuffer.getChannelData(0); output.fill(0); // Process through resampler if (this.resampler) { const frames = this.resampler.process(input); for (const frame of frames) { await this.processFrame(frame); } } } catch (error) { console.error("Error processing audio:", error); } finally { processingAudio = false; } }; // Connect the audio chain this.audioNode.connect(this.gainNode); this.gainNode.connect(this.ctx.destination); } } exports.AudioNodeVAD = AudioNodeVAD; //# sourceMappingURL=real-time-vad.js.map