whisper.cpp en tiempo real con tiny
This commit is contained in:
@ -12,6 +12,8 @@ defmodule Recognition_VAD.Application do
|
|||||||
{DNSCluster, query: Application.get_env(:recognition_VAD, :dns_cluster_query) || :ignore},
|
{DNSCluster, query: Application.get_env(:recognition_VAD, :dns_cluster_query) || :ignore},
|
||||||
{Phoenix.PubSub, name: Recognition_VAD.PubSub},
|
{Phoenix.PubSub, name: Recognition_VAD.PubSub},
|
||||||
Recognition_VAD.AudioProcessor,
|
Recognition_VAD.AudioProcessor,
|
||||||
|
Recognition_VAD.WhisperStreamer,
|
||||||
|
|
||||||
# Start the Finch HTTP client for sending emails
|
# Start the Finch HTTP client for sending emails
|
||||||
{Finch, name: Recognition_VAD.Finch},
|
{Finch, name: Recognition_VAD.Finch},
|
||||||
# Start a worker by calling: Recognition_VAD.Worker.start_link(arg)
|
# Start a worker by calling: Recognition_VAD.Worker.start_link(arg)
|
||||||
|
46
recognition_VAD/lib/recognition_VAD/whisper.ex
Normal file
46
recognition_VAD/lib/recognition_VAD/whisper.ex
Normal file
@ -0,0 +1,46 @@
|
|||||||
|
defmodule Recognition_VAD.Whisper do
  @moduledoc """
  Runs a whisper.cpp transcription script through `wsl` and extracts the
  recognised text from its console output.

  The script location and the default model name are compile-time constants
  of this module.
  """

  require Logger

  @default_model "ggml-tiny.bin"
  @script_path "/home/aime-pc2/i_m/whisper.cpp/transcribe.sh"

  # Lines of the script output that start with one of these prefixes are
  # diagnostics rather than transcription text. "whisper_" also covers the
  # "whisper_print_timings:" lines.
  @log_prefixes ["whisper_", "system_info", "main: "]

  @doc """
  Transcribes the audio file at `path` using `model`.

  `path` may be a Windows path (`C:/...` or `C:\\...`, any drive letter); it is
  rewritten to its `/mnt/<drive>/...` form before being handed to the script.
  Any other path is passed through unchanged.

  Returns `{:ok, text}` when the script exits with status 0 (`text` may be an
  empty string), or `{:error, output}` with the combined stdout/stderr output
  for any other exit status.
  """
  @spec transcribe(String.t(), String.t()) :: {:ok, String.t()} | {:error, String.t()}
  def transcribe(path, model \\ @default_model) do
    args = [@script_path, convert_path_to_wsl(path), model]

    # stderr is merged into stdout so a failing run carries its error text.
    case System.cmd("wsl", args, stderr_to_stdout: true) do
      {output, 0} ->
        text = extract_transcription(output)
        Logger.info("📝 Transcripción: #{text}")
        {:ok, text}

      {error_output, _status} ->
        Logger.error("❌ Error al transcribir con whisper: #{error_output}")
        {:error, error_output}
    end
  end

  # Rewrites "<drive>:/rest" or "<drive>:\rest" into "/mnt/<drive>/rest",
  # normalising every backslash in the remainder to a forward slash.
  defp convert_path_to_wsl(<<drive, ?:, sep, rest::binary>>)
       when (drive in ?a..?z or drive in ?A..?Z) and sep in [?/, ?\\] do
    "/mnt/" <> String.downcase(<<drive>>) <> "/" <> String.replace(rest, "\\", "/")
  end

  # Anything that does not look like a Windows drive path is left untouched.
  defp convert_path_to_wsl(path), do: path

  # Keeps only the lines that look like transcription text and joins them
  # into a single space-separated string.
  defp extract_transcription(output) do
    output
    # Accept both LF and CRLF so no stray "\r" ends up inside the text.
    |> String.split(["\r\n", "\n"])
    |> Enum.filter(&transcript_line?/1)
    |> Enum.join(" ")
    |> String.trim()
  end

  # A transcript line contains at least one letter or digit and is not one of
  # the known diagnostic lines.
  defp transcript_line?(line) do
    line =~ ~r/[\p{L}\p{N}]/u and not String.starts_with?(line, @log_prefixes)
  end
end
|
56
recognition_VAD/lib/recognition_VAD/whisper_streamer.ex
Normal file
56
recognition_VAD/lib/recognition_VAD/whisper_streamer.ex
Normal file
@ -0,0 +1,56 @@
|
|||||||
|
defmodule Recognition_VAD.WhisperStreamer do
  @moduledoc """
  Buffers incoming PCM chunks and transcribes the buffered audio on a fixed
  interval.

  Every `@transcribe_interval` milliseconds the buffered chunks are written to
  a WAV file, passed to `Recognition_VAD.Whisper.transcribe/1`, and any
  non-empty text is broadcast on the `"audio_output"` PubSub topic as
  `{:realtime, %{"text" => text}}`. The buffer is then emptied.
  """

  use GenServer
  require Logger

  # Transcribe every 2 seconds.
  @transcribe_interval 2000
  # Maximum number of chunks kept in memory; older chunks are dropped.
  @max_chunks 100
  # Directory where the per-window WAV files are written.
  @tmp_dir "C:/Users/rolan/i_m/voice_recognition/recognition_VAD/tmp"

  @doc """
  Starts the streamer registered under the module name, with an empty buffer
  and a default sample rate of 48000.
  """
  def start_link(_opts) do
    GenServer.start_link(__MODULE__, %{chunks: [], sample_rate: 48000}, name: __MODULE__)
  end

  @doc """
  Queues one binary audio `chunk` recorded at `sample_rate`. Fire-and-forget.
  """
  def push_chunk(chunk, sample_rate) do
    GenServer.cast(__MODULE__, {:chunk, chunk, sample_rate})
  end

  @impl true
  def init(state) do
    schedule_transcription()
    {:ok, state}
  end

  @impl true
  def handle_cast({:chunk, binary, sr}, %{chunks: chunks} = state) do
    # Chunks are prepended (newest first), so taking the head of the list
    # keeps the most recent @max_chunks entries.
    new_chunks = Enum.take([binary | chunks], @max_chunks)
    {:noreply, %{state | chunks: new_chunks, sample_rate: sr}}
  end

  @impl true
  def handle_info(:transcribe_timer, %{chunks: []} = state) do
    # No audio buffered: just re-arm the timer.
    schedule_transcription()
    {:noreply, state}
  end

  def handle_info(:transcribe_timer, %{chunks: chunks, sample_rate: sr} = state) do
    # Run the transcription outside the server process so chunk intake is not
    # blocked while the external tool is running.
    Task.start(fn -> transcribe_window(chunks, sr) end)

    schedule_transcription()
    {:noreply, %{state | chunks: []}}
  end

  # Writes the buffered chunks (restored to arrival order) to a WAV file,
  # transcribes it and broadcasts the text. The file is removed afterwards,
  # whether or not the transcription succeeded, so the tmp directory does not
  # grow by one file per interval.
  # NOTE(review): assumes nothing else reads these realtime_*.wav files later.
  defp transcribe_window(chunks, sr) do
    path = "#{@tmp_dir}/realtime_#{System.system_time(:millisecond)}.wav"

    try do
      :ok = Recognition_VAD.WavWriter.write_pcm_chunks_to_wav(Enum.reverse(chunks), sr, path)

      case Recognition_VAD.Whisper.transcribe(path) do
        {:ok, text} when byte_size(text) > 0 ->
          Phoenix.PubSub.broadcast(
            Recognition_VAD.PubSub,
            "audio_output",
            {:realtime, %{"text" => text}}
          )

        _ ->
          Logger.debug("⏱ Nada para transcribir o error")
      end
    after
      # Best effort: the file may not exist if writing it failed.
      _ = File.rm(path)
    end
  end

  defp schedule_transcription do
    Process.send_after(self(), :transcribe_timer, @transcribe_interval)
  end
end
|
@ -6,8 +6,15 @@ defmodule Recognition_VADWeb.DataChannel do
|
|||||||
{:ok, socket}
|
{:ok, socket}
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Partial (realtime) result: forwards the payload of a `{:realtime, msg}`
# message to the client as a "realtime" channel event and keeps the socket
# unchanged.
# NOTE(review): presumably `msg` is the %{"text" => text} map broadcast by
# Recognition_VAD.WhisperStreamer — confirm this channel subscribes to that topic.
|
||||||
|
def handle_info({:realtime, msg}, socket) do
|
||||||
|
push(socket, "realtime", msg)
|
||||||
|
{:noreply, socket}
|
||||||
|
end
|
||||||
|
|
||||||
|
# Completo
|
||||||
def handle_info({:broadcast_audio, msg}, socket) do
|
def handle_info({:broadcast_audio, msg}, socket) do
|
||||||
push(socket, "transcription", Jason.decode!(msg))
|
push(socket, "transcription", msg)
|
||||||
{:noreply, socket}
|
{:noreply, socket}
|
||||||
end
|
end
|
||||||
|
|
||||||
@ -15,7 +22,7 @@ defmodule Recognition_VADWeb.DataChannel do
|
|||||||
def handle_in("audio_chunk", %{"data" => base64_chunk, "sample_rate" => sample_rate}, socket) do
|
def handle_in("audio_chunk", %{"data" => base64_chunk, "sample_rate" => sample_rate}, socket) do
|
||||||
case Base.decode64(base64_chunk) do
|
case Base.decode64(base64_chunk) do
|
||||||
{:ok, binary_audio} ->
|
{:ok, binary_audio} ->
|
||||||
GenServer.cast(Recognition_VAD.AudioProcessor, {:chunk, binary_audio, sample_rate})
|
Recognition_VAD.WhisperStreamer.push_chunk(binary_audio, sample_rate)
|
||||||
{:noreply, socket}
|
{:noreply, socket}
|
||||||
|
|
||||||
:error ->
|
:error ->
|
||||||
|
@ -7,114 +7,137 @@ defmodule Recognition_VADWeb.Stt.TestWithChannel do
|
|||||||
|
|
||||||
def render(assigns) do
|
def render(assigns) do
|
||||||
~H"""
|
~H"""
|
||||||
<div id="container">
|
<div id="container">
|
||||||
<div id="status">Presioná "Start Recording"…</div>
|
<div id="status">Presioná "Start Recording"…</div>
|
||||||
<button id="startButton">Start Recording</button>
|
<button id="startButton">Start Recording</button>
|
||||||
<button id="stopButton" disabled>Stop Recording</button>
|
<button id="stopButton" disabled>Stop Recording</button>
|
||||||
|
|
||||||
<div id="transcriptionContainer">
|
<div id="transcriptionContainer">
|
||||||
<div id="transcription" class="realtime"></div>
|
<div id="transcription" class="realtime"></div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div id="fullTextContainer">
|
<script type="module">
|
||||||
<div id="fullText"></div>
|
import { Socket } from "https://cdn.skypack.dev/phoenix";
|
||||||
</div>
|
|
||||||
|
|
||||||
<script type="module">
|
const statusDiv = document.getElementById("status");
|
||||||
import { Socket } from "https://cdn.skypack.dev/phoenix";
|
const transcriptionDiv = document.getElementById("transcription");
|
||||||
|
const startButton = document.getElementById("startButton");
|
||||||
|
const stopButton = document.getElementById("stopButton");
|
||||||
|
|
||||||
const statusDiv = document.getElementById("status");
|
let socket, channel;
|
||||||
const transcriptionDiv = document.getElementById("transcription");
|
let audioContext, mediaStream, mediaProcessor;
|
||||||
const fullTextDiv = document.getElementById("fullText");
|
|
||||||
const startButton = document.getElementById("startButton");
|
|
||||||
const stopButton = document.getElementById("stopButton");
|
|
||||||
|
|
||||||
let socket, channel;
|
async function startRecording() {
|
||||||
let audioContext, mediaStream, mediaProcessor;
|
startButton.disabled = true;
|
||||||
|
stopButton.disabled = false;
|
||||||
async function startRecording() {
|
statusDiv.textContent = "🎙 Grabando…";
|
||||||
startButton.disabled = true;
|
|
||||||
stopButton.disabled = false;
|
|
||||||
statusDiv.textContent = "Recording…";
|
|
||||||
transcriptionDiv.textContent = "";
|
|
||||||
fullTextDiv.textContent = "";
|
|
||||||
|
|
||||||
socket = new Socket("ws://localhost:4000/socket");
|
|
||||||
socket.connect();
|
|
||||||
|
|
||||||
channel = socket.channel("data:lobby");
|
|
||||||
channel.join()
|
|
||||||
.receive("ok", () => {
|
|
||||||
statusDiv.textContent = "🎙 Conectado a Phoenix STT";
|
|
||||||
console.log("Canal conectado");
|
|
||||||
})
|
|
||||||
.receive("error", () => {
|
|
||||||
statusDiv.textContent = "❌ Error al conectar";
|
|
||||||
console.error("Error al conectar canal");
|
|
||||||
});
|
|
||||||
|
|
||||||
channel.on("realtime", payload => {
|
|
||||||
const words = payload.text.split(" ");
|
|
||||||
const lastWord = words.pop();
|
|
||||||
transcriptionDiv.innerHTML = `${words.join(" ")} <span class="last-word">${lastWord}</span>`;
|
|
||||||
});
|
|
||||||
|
|
||||||
channel.on("fullSentence", payload => {
|
|
||||||
fullTextDiv.innerHTML += payload.text + " ";
|
|
||||||
transcriptionDiv.innerHTML = "";
|
transcriptionDiv.innerHTML = "";
|
||||||
});
|
|
||||||
|
|
||||||
audioContext = new AudioContext();
|
socket = new Socket("ws://localhost:4000/socket");
|
||||||
mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true });
|
socket.connect();
|
||||||
const input = audioContext.createMediaStreamSource(mediaStream);
|
|
||||||
|
|
||||||
mediaProcessor = audioContext.createScriptProcessor(1024, 1, 1);
|
channel = socket.channel("data:lobby");
|
||||||
mediaProcessor.onaudioprocess = (event) => {
|
|
||||||
const float32Array = event.inputBuffer.getChannelData(0);
|
|
||||||
const int16Array = new Int16Array(float32Array.length);
|
|
||||||
for (let i = 0; i < float32Array.length; i++) {
|
|
||||||
int16Array[i] = Math.max(-1, Math.min(1, float32Array[i])) * 0x7FFF;
|
|
||||||
}
|
|
||||||
|
|
||||||
const base64Audio = btoa(String.fromCharCode(...new Uint8Array(int16Array.buffer)));
|
channel.join()
|
||||||
channel.push("audio_chunk", {
|
.receive("ok", () => {
|
||||||
data: base64Audio,
|
statusDiv.textContent = "✅ Conectado a Phoenix STT";
|
||||||
sample_rate: audioContext.sampleRate
|
console.log("Canal conectado");
|
||||||
|
})
|
||||||
|
.receive("error", () => {
|
||||||
|
statusDiv.textContent = "❌ Error al conectar canal";
|
||||||
|
console.error("Error al conectar canal");
|
||||||
|
});
|
||||||
|
|
||||||
|
// Realtime parcial (palabras mientras habla)
|
||||||
|
let partialTranscript = "";
|
||||||
|
|
||||||
|
channel.on("realtime", payload => {
|
||||||
|
const words = payload.text.split(" ");
|
||||||
|
const lastWord = words.pop();
|
||||||
|
const rest = words.join(" ");
|
||||||
|
|
||||||
|
if (rest.length > 0) {
|
||||||
|
partialTranscript += rest + " ";
|
||||||
|
}
|
||||||
|
|
||||||
|
transcriptionDiv.innerHTML = `
|
||||||
|
${partialTranscript}<span class="last-word">${lastWord}</span>
|
||||||
|
`;
|
||||||
});
|
});
|
||||||
};
|
|
||||||
|
|
||||||
input.connect(mediaProcessor);
|
|
||||||
mediaProcessor.connect(audioContext.destination);
|
|
||||||
}
|
|
||||||
|
|
||||||
function stopRecording() {
|
// Frase completa (después de procesar chunks)
|
||||||
stopButton.disabled = true;
|
channel.on("transcription", payload => {
|
||||||
startButton.disabled = false;
|
const sentence = payload.text.trim();
|
||||||
statusDiv.textContent = "🛑 Grabación detenida.";
|
if (sentence.length > 0) {
|
||||||
|
partialTranscript = ""; // reseteamos el parcial
|
||||||
|
const span = document.createElement("div");
|
||||||
|
span.className = "sentence";
|
||||||
|
span.textContent = sentence;
|
||||||
|
transcriptionDiv.appendChild(span);
|
||||||
|
transcriptionDiv.innerHTML += "<br />";
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
// ✅ Enviamos evento especial para guardar
|
|
||||||
if (channel) {
|
// Audio setup
|
||||||
channel.push("save_audio", {});
|
audioContext = new AudioContext();
|
||||||
|
mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true });
|
||||||
|
const input = audioContext.createMediaStreamSource(mediaStream);
|
||||||
|
|
||||||
|
mediaProcessor = audioContext.createScriptProcessor(1024, 1, 1);
|
||||||
|
mediaProcessor.onaudioprocess = (event) => {
|
||||||
|
const float32Array = event.inputBuffer.getChannelData(0);
|
||||||
|
const int16Array = new Int16Array(float32Array.length);
|
||||||
|
for (let i = 0; i < float32Array.length; i++) {
|
||||||
|
const s = Math.max(-1, Math.min(1, float32Array[i]));
|
||||||
|
int16Array[i] = s < 0 ? s * 0x8000 : s * 0x7FFF;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
const base64Audio = btoa(String.fromCharCode(...new Uint8Array(int16Array.buffer)));
|
||||||
|
channel.push("audio_chunk", {
|
||||||
|
data: base64Audio,
|
||||||
|
sample_rate: audioContext.sampleRate
|
||||||
|
});
|
||||||
|
};
|
||||||
|
|
||||||
|
input.connect(mediaProcessor);
|
||||||
|
mediaProcessor.connect(audioContext.destination);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (mediaProcessor) mediaProcessor.disconnect();
|
function stopRecording() {
|
||||||
if (audioContext) audioContext.close();
|
stopButton.disabled = true;
|
||||||
if (mediaStream) mediaStream.getTracks().forEach(track => track.stop());
|
startButton.disabled = false;
|
||||||
if (channel) channel.leave();
|
statusDiv.textContent = "🛑 Grabación detenida.";
|
||||||
if (socket) socket.disconnect();
|
|
||||||
}
|
|
||||||
|
|
||||||
document.getElementById("startButton").onclick = startRecording;
|
if (mediaProcessor) mediaProcessor.disconnect();
|
||||||
document.getElementById("stopButton").onclick = stopRecording;
|
if (audioContext) audioContext.close();
|
||||||
</script>
|
if (mediaStream) mediaStream.getTracks().forEach(track => track.stop());
|
||||||
|
if (channel) channel.leave();
|
||||||
|
if (socket) socket.disconnect();
|
||||||
|
}
|
||||||
|
|
||||||
<style>
|
document.getElementById("startButton").onclick = startRecording;
|
||||||
.last-word {
|
document.getElementById("stopButton").onclick = stopRecording;
|
||||||
font-weight: bold;
|
</script>
|
||||||
color: orange;
|
|
||||||
}
|
<style>
|
||||||
</style>
|
.last-word {
|
||||||
</div>
|
font-weight: bold;
|
||||||
|
color: orange;
|
||||||
|
}
|
||||||
|
#transcriptionContainer {
|
||||||
|
margin-top: 1rem;
|
||||||
|
font-family: sans-serif;
|
||||||
|
font-size: 1.1rem;
|
||||||
|
}
|
||||||
|
.sentence {
|
||||||
|
margin-bottom: 0.5rem;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
</div>
|
||||||
"""
|
"""
|
||||||
end
|
end
|
||||||
|
|
||||||
end
|
end
|
||||||
|
Reference in New Issue
Block a user