167 lines
6.3 KiB
JavaScript
167 lines
6.3 KiB
JavaScript
"use strict";
|
|
/*
Some of this code, together with the default options found in index.ts,
was taken from (or inspired by) https://github.com/snakers4/silero-vad
*/
|
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
exports.FrameProcessor = exports.validateOptions = exports.defaultV5FrameProcessorOptions = exports.defaultLegacyFrameProcessorOptions = void 0;
|
|
const logging_1 = require("./logging");
|
|
const messages_1 = require("./messages");
|
|
// Frame sizes that validateOptions accepts without logging an
// "unusual frame size" warning; any other frameSamples value only warns.
const RECOMMENDED_FRAME_SAMPLES = [512, 1024, 1536];
|
|
exports.defaultLegacyFrameProcessorOptions = {
|
|
positiveSpeechThreshold: 0.5,
|
|
negativeSpeechThreshold: 0.5 - 0.15,
|
|
preSpeechPadFrames: 1,
|
|
redemptionFrames: 8,
|
|
frameSamples: 1536,
|
|
minSpeechFrames: 3,
|
|
submitUserSpeechOnPause: false,
|
|
};
|
|
exports.defaultV5FrameProcessorOptions = {
|
|
positiveSpeechThreshold: 0.5,
|
|
negativeSpeechThreshold: 0.5 - 0.15,
|
|
preSpeechPadFrames: 3,
|
|
redemptionFrames: 24,
|
|
frameSamples: 512,
|
|
minSpeechFrames: 9,
|
|
submitUserSpeechOnPause: false,
|
|
};
|
|
/**
 * Sanity-checks frame processor options and reports problems through the
 * logger. Nothing is thrown and `options` is not modified: an unusual frame
 * size produces a warning, every other violation produces an error log.
 *
 * NaN values are reported as violations (plain `<` / `>` comparisons would
 * silently accept them). Missing (`undefined`) values are not reported.
 *
 * @param {object} options - Options object with the fields used by
 *   FrameProcessor (frameSamples, positiveSpeechThreshold,
 *   negativeSpeechThreshold, preSpeechPadFrames, redemptionFrames).
 * @returns {void}
 */
function validateOptions(options) {
    // True when `value` is NaN or lies outside the closed range [min, max].
    const isOutOfRange = (value, min, max) => Number.isNaN(value) || value < min || value > max;
    // True when `value` is NaN or negative.
    const isInvalidCount = (value) => Number.isNaN(value) || value < 0;

    if (!RECOMMENDED_FRAME_SAMPLES.includes(options.frameSamples)) {
        logging_1.log.warn("You are using an unusual frame size");
    }
    if (isOutOfRange(options.positiveSpeechThreshold, 0, 1)) {
        logging_1.log.error("positiveSpeechThreshold should be a number between 0 and 1");
    }
    if (isOutOfRange(options.negativeSpeechThreshold, 0, options.positiveSpeechThreshold)) {
        logging_1.log.error("negativeSpeechThreshold should be between 0 and positiveSpeechThreshold");
    }
    if (isInvalidCount(options.preSpeechPadFrames)) {
        logging_1.log.error("preSpeechPadFrames should be positive");
    }
    if (isInvalidCount(options.redemptionFrames)) {
        logging_1.log.error("redemptionFrames should be positive");
    }
}
|
|
exports.validateOptions = validateOptions;
|
|
/**
 * Concatenates a list of numeric arrays into one Float32Array, preserving
 * order.
 *
 * @param {Array<ArrayLike<number>>} arrays - Chunks to join (may be empty).
 * @returns {Float32Array} A new array whose length is the sum of the chunk
 *   lengths; an empty Float32Array when `arrays` is empty.
 */
const concatArrays = (arrays) => {
    const totalLength = arrays.reduce((sum, chunk) => sum + chunk.length, 0);
    const outArray = new Float32Array(totalLength);
    // Running write position; each chunk is copied right after the previous one.
    let offset = 0;
    for (const chunk of arrays) {
        outArray.set(chunk, offset);
        offset += chunk.length;
    }
    return outArray;
};
|
|
/**
 * Turns per-frame speech probabilities into speech-segment events.
 *
 * Each frame passed to `process` is scored by `modelProcessFunc`; the score's
 * `isSpeech` value is compared with the configured thresholds and the
 * processor reports FrameProcessed, SpeechStart, SpeechRealStart, SpeechEnd
 * (with the concatenated segment audio) or VADMisfire through the
 * `handleEvent` callback supplied by the caller.
 *
 * All public operations are instance-bound arrow functions, so they can be
 * passed around without losing `this`.
 */
class FrameProcessor {
    /**
     * @param {Function} modelProcessFunc - Called with a frame and awaited;
     *   its result must expose a numeric `isSpeech` field.
     * @param {Function} modelResetFunc - Called (with no arguments) on every
     *   `reset()`, including the one performed by the constructor.
     * @param {object} options - Uses positiveSpeechThreshold,
     *   negativeSpeechThreshold, preSpeechPadFrames, redemptionFrames,
     *   minSpeechFrames and submitUserSpeechOnPause.
     */
    constructor(modelProcessFunc, modelResetFunc, options) {
        this.modelProcessFunc = modelProcessFunc;
        this.modelResetFunc = modelResetFunc;
        this.options = options;
        this.speaking = false;
        this.redemptionCounter = 0;
        this.speechFrameCount = 0;
        this.active = false;
        this.speechRealStartFired = false;

        // Hands out the buffered frames and leaves an empty buffer behind.
        const drainBuffer = () => {
            const segment = this.audioBuffer;
            this.audioBuffer = [];
            return segment;
        };

        // Reports a finished segment: SpeechEnd (with audio) when it holds at
        // least minSpeechFrames speech frames, VADMisfire otherwise.
        const emitSegmentOutcome = (segment, handleEvent) => {
            const speechFrames = segment.filter((item) => item.isSpeech).length;
            if (speechFrames >= this.options.minSpeechFrames) {
                const audio = concatArrays(segment.map((item) => item.frame));
                handleEvent({ msg: messages_1.Message.SpeechEnd, audio });
                return;
            }
            handleEvent({ msg: messages_1.Message.VADMisfire });
        };

        /**
         * Clears all segment state (buffer, counters, flags) and resets the
         * model. Does not change `active`.
         */
        this.reset = () => {
            this.speaking = false;
            this.speechRealStartFired = false;
            this.audioBuffer = [];
            this.modelResetFunc();
            this.redemptionCounter = 0;
            this.speechFrameCount = 0;
        };

        /**
         * Stops processing. Depending on `submitUserSpeechOnPause`, the
         * segment in progress is either ended (and reported) or discarded.
         *
         * @param {Function} handleEvent - Event callback.
         */
        this.pause = (handleEvent) => {
            this.active = false;
            if (this.options.submitUserSpeechOnPause) {
                this.endSegment(handleEvent);
            }
            else {
                this.reset();
            }
        };

        /** Re-enables processing of incoming frames. */
        this.resume = () => {
            this.active = true;
        };

        /**
         * Ends the current segment immediately: resets all state and, if
         * speech was in progress, reports SpeechEnd or VADMisfire.
         *
         * @param {Function} handleEvent - Event callback.
         * @returns {object} Always an empty object.
         */
        this.endSegment = (handleEvent) => {
            const segment = drainBuffer();
            // Capture before reset() clears the flag.
            const wasSpeaking = this.speaking;
            this.reset();
            if (wasSpeaking) {
                emitSegmentOutcome(segment, handleEvent);
            }
            return {};
        };

        /**
         * Scores one frame and advances the speech state machine.
         * Does nothing while the processor is not active.
         *
         * NOTE(review): `active` is only checked before awaiting the model;
         * a pause() issued while the model is running is not re-checked.
         *
         * @param {*} frame - Audio frame; passed to the model and buffered
         *   as-is (assumed to be a Float32Array-compatible chunk — confirm).
         * @param {Function} handleEvent - Event callback.
         * @returns {Promise<void>}
         */
        this.process = async (frame, handleEvent) => {
            if (!this.active) {
                return;
            }
            const probs = await this.modelProcessFunc(frame);
            const isSpeech = probs.isSpeech >= this.options.positiveSpeechThreshold;
            handleEvent({ probs, msg: messages_1.Message.FrameProcessed, frame });
            this.audioBuffer.push({ frame, isSpeech });

            if (isSpeech) {
                this.speechFrameCount++;
                // Any speech frame cancels the pending end-of-speech countdown.
                this.redemptionCounter = 0;
                if (!this.speaking) {
                    this.speaking = true;
                    handleEvent({ msg: messages_1.Message.SpeechStart });
                }
            }

            // `>=` (not `===`) mirrors the SpeechEnd criterion, so a segment
            // that can end with SpeechEnd always gets a SpeechRealStart, even
            // when minSpeechFrames is 0 or not an integer.
            if (this.speaking &&
                !this.speechRealStartFired &&
                this.speechFrameCount >= this.options.minSpeechFrames) {
                this.speechRealStartFired = true;
                handleEvent({ msg: messages_1.Message.SpeechRealStart });
            }

            if (this.speaking && probs.isSpeech < this.options.negativeSpeechThreshold) {
                this.redemptionCounter++;
                if (this.redemptionCounter >= this.options.redemptionFrames) {
                    // Segment is over: clear state first, then report it.
                    this.redemptionCounter = 0;
                    this.speechFrameCount = 0;
                    this.speaking = false;
                    this.speechRealStartFired = false;
                    emitSegmentOutcome(drainBuffer(), handleEvent);
                }
            }

            if (!this.speaking) {
                // Keep only the most recent pre-speech padding frames. The
                // length guard prevents an endless loop when
                // preSpeechPadFrames is negative.
                while (this.audioBuffer.length > 0 &&
                    this.audioBuffer.length > this.options.preSpeechPadFrames) {
                    this.audioBuffer.shift();
                }
                this.speechFrameCount = 0;
            }
        };

        this.audioBuffer = [];
        this.reset();
    }
}
|
|
exports.FrameProcessor = FrameProcessor;
|
|
//# sourceMappingURL=frame-processor.js.map
|