corrección VAD en formato binario pcm16 - agrego transcripción al live

This commit is contained in:
2025-08-01 21:22:11 +00:00
parent e43a8c01a7
commit 976e350436
11 changed files with 133 additions and 123 deletions

View File

@ -1,8 +1,8 @@
{application,whisper, {application,whisper,
[{modules,['Elixir.AudioBuffer','Elixir.AudioFilesList', [{modules,['Elixir.AudioBuffer','Elixir.AudioSaver',
'Elixir.AudioSaver','Elixir.Whisper', 'Elixir.Whisper','Elixir.Whisper.Application',
'Elixir.Whisper.Application','Elixir.Whisper.Counter', 'Elixir.Whisper.Counter','Elixir.Whisper.LargeModel',
'Elixir.Whisper.LargeModel','Elixir.Whisper.Mailer', 'Elixir.Whisper.Mailer',
'Elixir.Whisper.RealtimeModel', 'Elixir.Whisper.RealtimeModel',
'Elixir.Whisper.SendToModel', 'Elixir.Whisper.SendToModel',
'Elixir.Whisper.Transcriber','Elixir.WhisperWeb', 'Elixir.Whisper.Transcriber','Elixir.WhisperWeb',

View File

@ -4,7 +4,6 @@ export const VadHook = {
async mounted() { async mounted() {
const statusDiv = document.getElementById("vad-status"); const statusDiv = document.getElementById("vad-status");
// Cargar onnxruntime y luego vad-web
const ortScript = document.createElement("script"); const ortScript = document.createElement("script");
ortScript.src = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.14.0/dist/ort.js"; ortScript.src = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.14.0/dist/ort.js";
@ -13,7 +12,6 @@ export const VadHook = {
ortScript.onload = () => { ortScript.onload = () => {
vadScript.onload = async () => { vadScript.onload = async () => {
// Inicializar canal Phoenix
this.socket = new Socket("ws://localhost:4003/socket"); this.socket = new Socket("ws://localhost:4003/socket");
this.socket.connect(); this.socket.connect();
this.channel = this.socket.channel("audio:lobby"); this.channel = this.socket.channel("audio:lobby");
@ -21,40 +19,32 @@ export const VadHook = {
console.log("✅ Canal audio:lobby unido."); console.log("✅ Canal audio:lobby unido.");
}); });
// Preparar VAD pero no arrancar aún const myvad = await vad.MicVAD.new({
this.myvad = await vad.MicVAD.new({
onSpeechStart: () => { onSpeechStart: () => {
statusDiv.textContent = "🎤 Voz detectada..."; statusDiv.textContent = "🎤 Voz detectada...";
}, },
onSpeechEnd: async (float32Audio) => { onSpeechEnd: async (float32Audio) => {
statusDiv.textContent = "✅ Voz finalizada. Enviando audio..."; statusDiv.textContent = "✅ Voz finalizada. Enviando audio...";
// Enviar el audio correctamente formateado
await sendAudioChunk(float32Audio, this.channel); await sendAudioChunk(float32Audio, this.channel);
// Indicar stop si querés (como payload vacío JSON)
this.channel.push("stop_audio", {}); this.channel.push("stop_audio", {});
} }
}); });
// Esperar eventos desde LiveView myvad.start();
this.handleEvent("init-vad", async () => { statusDiv.textContent = "🚀 VAD iniciado.";
await this.myvad.start();
statusDiv.textContent = "🚀 VAD iniciado.";
});
this.handleEvent("stop-vad", async () => {
if (this.myvad) {
await this.myvad.stop();
statusDiv.textContent = "🛑 VAD detenido.";
}
});
}; };
document.body.appendChild(vadScript); document.body.appendChild(vadScript);
}; };
document.body.appendChild(ortScript); document.body.appendChild(ortScript);
} }
}; };
// Convertir Float32Array a PCM 16-bit // Función de helper para enviar el chunk
function float32ToInt16(float32Array) { function float32ToInt16(float32Array) {
const int16Array = new Int16Array(float32Array.length); const int16Array = new Int16Array(float32Array.length);
for (let i = 0; i < float32Array.length; i++) { for (let i = 0; i < float32Array.length; i++) {
@ -64,16 +54,24 @@ function float32ToInt16(float32Array) {
return int16Array; return int16Array;
} }
// Enviar audio binario al canal
async function sendAudioChunk(float32Audio, channel) { async function sendAudioChunk(float32Audio, channel) {
const pcm16 = float32ToInt16(float32Audio); const pcm16 = float32ToInt16(float32Audio);
const header = JSON.stringify({ sample_rate: 16000 }); const header = JSON.stringify({ sample_rate: 16000 });
const headerBytes = new TextEncoder().encode(header); const headerBytes = new TextEncoder().encode(header);
const totalLength = 2 + headerBytes.length + pcm16.byteLength; const audioBytes = new Uint8Array(pcm16.buffer); // same as merged in el otro ejemplo
const totalLength = 2 + headerBytes.length + audioBytes.length;
const buffer = new ArrayBuffer(totalLength); const buffer = new ArrayBuffer(totalLength);
const view = new DataView(buffer); const view = new DataView(buffer);
view.setUint16(0, headerBytes.length, true);
// Encabezado: longitud en big endian
view.setUint16(0, headerBytes.length, false); // <== big endian
// Copiar header y audio al buffer
new Uint8Array(buffer, 2, headerBytes.length).set(headerBytes); new Uint8Array(buffer, 2, headerBytes.length).set(headerBytes);
new Uint8Array(buffer, 2 + headerBytes.length).set(new Uint8Array(pcm16.buffer)); new Uint8Array(buffer, 2 + headerBytes.length).set(audioBytes);
channel.pushBinary(buffer);
// Enviar el buffer binario
channel.push("audio_chunk", buffer);
console.log("📤 Chunk binario enviado");
} }

View File

@ -26,8 +26,8 @@ defmodule Whisper.Application do
{Phoenix.PubSub, name: Whisper.PubSub}, {Phoenix.PubSub, name: Whisper.PubSub},
WhisperWeb.Endpoint, WhisperWeb.Endpoint,
Whisper.Counter, Whisper.Counter,
AudioBuffer, AudioBuffer
AudioFilesList # AudioFilesList
] ]
opts = [strategy: :one_for_one, name: Whisper.Supervisor] opts = [strategy: :one_for_one, name: Whisper.Supervisor]

View File

@ -1,66 +1,66 @@
defmodule AudioFilesList do # defmodule AudioFilesList do
use GenServer # use GenServer
require Logger # require Logger
alias Phoenix.PubSub # alias Phoenix.PubSub
def start_link(_opts) do # def start_link(_opts) do
GenServer.start_link(__MODULE__, :idle, name: __MODULE__) # GenServer.start_link(__MODULE__, :idle, name: __MODULE__)
end # end
def add_file(path) do # def add_file(path) do
Logger.debug("add file") # Logger.debug("add file")
GenServer.cast(__MODULE__, {:new_file, path}) # GenServer.cast(__MODULE__, {:new_file, path})
end # end
def init(:idle) do # def init(:idle) do
Logger.info("AudioFilesList iniciado") # Logger.info("AudioFilesList iniciado")
{:ok, %{queue: [], processing: false}} # {:ok, %{queue: [], processing: false}}
end # end
def handle_cast({:new_file, path}, %{queue: queue, processing: false} = state) do # def handle_cast({:new_file, path}, %{queue: queue, processing: false} = state) do
Logger.info("📥 Archivo encolado: #{path}") # Logger.info("📥 Archivo encolado: #{path}")
queue = queue ++ [path] # queue = queue ++ [path]
[next | rest] = queue # [next | rest] = queue
{:noreply, %{queue: rest, processing: false}} # {:noreply, %{queue: rest, processing: false}}
end # end
def handle_cast({:new_file, path}, %{queue: queue, processing: true} = state) do # def handle_cast({:new_file, path}, %{queue: queue, processing: true} = state) do
{:noreply, %{state | queue: queue ++ [path]}} # {:noreply, %{state | queue: queue ++ [path]}}
end # end
def handle_info(:done, %{queue: []} = state) do # def handle_info(:done, %{queue: []} = state) do
{:noreply, %{state | processing: false}} # {:noreply, %{state | processing: false}}
end # end
def handle_info(:done, %{queue: [next | rest]} = state) do # def handle_info(:done, %{queue: [next | rest]} = state) do
{:noreply, %{state | queue: rest}} # {:noreply, %{state | queue: rest}}
end # end
# def handle_cast({:done_processing, path}, %{queue: queue} = state) do # # def handle_cast({:done_processing, path}, %{queue: queue} = state) do
# new_queue = Enum.reject(queue, fn p -> p == path end) # # new_queue = Enum.reject(queue, fn p -> p == path end)
# Logger.info("🗑️ Archivo eliminado y removido de la cola: #{path}") # # Logger.info("🗑️ Archivo eliminado y removido de la cola: #{path}")
# {:noreply, %{state | queue: new_queue}} # # {:noreply, %{state | queue: new_queue}}
# end # # end
# defp process_file(path) do # # defp process_file(path) do
# Logger.info("▶️ Inicia procesamiento realtime: #{path}") # # Logger.info("▶️ Inicia procesamiento realtime: #{path}")
# Task.start(fn -> # # Task.start(fn ->
# case Whisper.SendToModel.realtime(path) do # # case Whisper.SendToModel.realtime(path) do
# {:ok, text} when is_binary(text) -> # # {:ok, text} when is_binary(text) ->
# message = %{"chunks" => [%{"text" => text}]} # # message = %{"chunks" => [%{"text" => text}]}
# Phoenix.PubSub.broadcast(Whisper.PubSub, "transcription", {:transcription, Jason.encode!(message)}) # # Phoenix.PubSub.broadcast(Whisper.PubSub, "transcription", {:transcription, Jason.encode!(message)})
# Logger.info("✅ Transcripción (realtime): #{text}") # # Logger.info("✅ Transcripción (realtime): #{text}")
# File.rm!(path) # # File.rm!(path)
# # AudioFilesList.done_processing(path) # # # AudioFilesList.done_processing(path)
# {:error, reason} -> # # {:error, reason} ->
# Logger.error("❌ Error transcribiendo: #{inspect(reason)}") # # Logger.error("❌ Error transcribiendo: #{inspect(reason)}")
# end # # end
# send(__MODULE__, :done) # # send(__MODULE__, :done)
# end) # # end)
# end # # end
end # end

View File

@ -26,22 +26,20 @@ defmodule WhisperWeb.AudioChannel do
<<header_len::16, rest::binary>> = raw_binary <<header_len::16, rest::binary>> = raw_binary
<<header::binary-size(header_len), audio::binary>> = rest <<header::binary-size(header_len), audio::binary>> = rest
IO.inspect(header, label: "HEADER BINARIO RECIBIDO") %{"sample_rate" => rate} = Jason.decode!(header)
ref = socket_id(socket)
case Jason.decode(header) do Logger.info("Chunk recibido: #{byte_size(audio)} bytes, sample_rate: #{rate}")
{:ok, %{"sample_rate" => rate}} -> AudioBuffer.append(ref, {rate, audio})
Logger.info("Chunk recibido: #{byte_size(audio)} bytes, sample_rate: #{rate}")
AudioBuffer.append(socket_id(socket), {rate, audio})
{:noreply, socket}
{:error, reason} -> # {:ok, path} = AudioSaver.save_chunk_as_wav(ref, audio, rate, "part")
Logger.error("Error decodificando header JSON: #{inspect(reason)}") # AudioFilesList.add_file(path)
{:noreply, socket}
end
{:noreply, socket}
end end
@doc """ @doc """
Recupera todos los chunks acumulados en el buffer, los concatena y guarda un archivo WAV final (sufijo `"final"`). Recupera todos los chunks acumulados en el buffer, los concatena y guarda un archivo WAV final (sufijo `"final"`).
""" """
@ -60,8 +58,8 @@ defmodule WhisperWeb.AudioChannel do
Task.start(fn -> Task.start(fn ->
transcription = Whisper.SendToModel.large(path) transcription = Whisper.SendToModel.large(path)
Logger.info("✅ Transcripción completa:\n#{transcription}") Logger.info("✅ Transcripción completa:\n#{transcription}")
# message = %{"chunks" => [%{"text" => transcription}]} message = %{"chunks" => [%{"text" => transcription}]}
# Phoenix.PubSub.broadcast(Whisper.PubSub, "transcription", {:transcription_m, Jason.encode!(message)}) Phoenix.PubSub.broadcast(Whisper.PubSub, "transcription", {:transcription, Jason.encode!(message)})
File.rm!(path) File.rm!(path)
end) end)
end end

View File

@ -1,28 +1,48 @@
defmodule WhisperWeb.VadLive do defmodule WhisperWeb.VadLive do
use WhisperWeb, :live_view use WhisperWeb, :live_view
alias Phoenix.PubSub
def mount(_, _, socket) do def mount(_, _, socket) do
{:ok, assign(socket, started: false)} PubSub.subscribe(Whisper.PubSub, "transcription")
socket =
socket
|> assign(:transcription, "")
|> assign(:started, false)
{:ok, socket}
end
# Handles the "start_vad" click: queues the "init-vad" client event and marks
# the view as started.
#
# `push_event/3` does not mutate the socket; it returns a new socket carrying
# the queued event. That returned socket must be the one handed back in the
# `{:noreply, socket}` tuple, otherwise the event is silently dropped.
def handle_event("start_vad", _params, socket) do
  socket =
    socket
    |> push_event("init-vad", %{})
    |> assign(started: true)

  {:noreply, socket}
end
def handle_info({:transcription, raw_json}, socket) do
new_text =
raw_json
|> Jason.decode!()
|> get_in(["chunks", Access.at(0), "text"])
{:noreply, update(socket, :transcription, &(&1 <> " " <> new_text))}
end end
def render(assigns) do def render(assigns) do
~H""" ~H"""
<div id="vad-container" phx-hook="VadHook"> <div id="vad-container" phx-hook="VadHook">
<button phx-click="start_vad" class="btn btn-primary">🎙 Iniciar VAD</button> <button phx-click="start_vad" class="btn btn-primary">🎙 Iniciar VAD</button>
<button phx-click="stop_vad" class="btn btn-danger">🛑 Detener VAD</button>
<div id="vad-status" class="mt-4 text-sm text-gray-700"></div> <div id="vad-status" class="mt-4 text-sm text-gray-700"></div>
</div> </div>
<div id="transcriptionContainer" class="w-full max-w-2xl space-y-4">
<%= if @transcription != "" do %>
<div class="p-4 bg-gray-100 rounded shadow-md">
<h2 class="text-sm font-semibold text-gray-700 mb-2">✅ Transcripción</h2>
<p class="text-green-600 whitespace-pre-wrap break-words text-sm leading-relaxed"><%= @transcription %></p>
</div>
<% end %>
</div>
""" """
end end
# Handles the "start_vad" click by queuing the "init-vad" client event.
#
# The socket returned by `push_event/3` is the one that carries the queued
# event, so it (not the original socket) is returned to LiveView.
def handle_event("start_vad", _, socket) do
  {:noreply, push_event(socket, "init-vad", %{})}
end
# Handles the "stop_vad" click by queuing the "stop-vad" client event.
#
# The socket returned by `push_event/3` is the one that carries the queued
# event, so it (not the original socket) is returned to LiveView.
def handle_event("stop_vad", _, socket) do
  {:noreply, push_event(socket, "stop-vad", %{})}
end
end end

File diff suppressed because one or more lines are too long