correccion VAD en formato binario pcm16 - agrego transcripcion al live
This commit is contained in:
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -1,8 +1,8 @@
|
|||||||
{application,whisper,
|
{application,whisper,
|
||||||
[{modules,['Elixir.AudioBuffer','Elixir.AudioFilesList',
|
[{modules,['Elixir.AudioBuffer','Elixir.AudioSaver',
|
||||||
'Elixir.AudioSaver','Elixir.Whisper',
|
'Elixir.Whisper','Elixir.Whisper.Application',
|
||||||
'Elixir.Whisper.Application','Elixir.Whisper.Counter',
|
'Elixir.Whisper.Counter','Elixir.Whisper.LargeModel',
|
||||||
'Elixir.Whisper.LargeModel','Elixir.Whisper.Mailer',
|
'Elixir.Whisper.Mailer',
|
||||||
'Elixir.Whisper.RealtimeModel',
|
'Elixir.Whisper.RealtimeModel',
|
||||||
'Elixir.Whisper.SendToModel',
|
'Elixir.Whisper.SendToModel',
|
||||||
'Elixir.Whisper.Transcriber','Elixir.WhisperWeb',
|
'Elixir.Whisper.Transcriber','Elixir.WhisperWeb',
|
||||||
|
@ -4,7 +4,6 @@ export const VadHook = {
|
|||||||
async mounted() {
|
async mounted() {
|
||||||
const statusDiv = document.getElementById("vad-status");
|
const statusDiv = document.getElementById("vad-status");
|
||||||
|
|
||||||
// Cargar onnxruntime y luego vad-web
|
|
||||||
const ortScript = document.createElement("script");
|
const ortScript = document.createElement("script");
|
||||||
ortScript.src = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.14.0/dist/ort.js";
|
ortScript.src = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.14.0/dist/ort.js";
|
||||||
|
|
||||||
@ -13,7 +12,6 @@ export const VadHook = {
|
|||||||
|
|
||||||
ortScript.onload = () => {
|
ortScript.onload = () => {
|
||||||
vadScript.onload = async () => {
|
vadScript.onload = async () => {
|
||||||
// Inicializar canal Phoenix
|
|
||||||
this.socket = new Socket("ws://localhost:4003/socket");
|
this.socket = new Socket("ws://localhost:4003/socket");
|
||||||
this.socket.connect();
|
this.socket.connect();
|
||||||
this.channel = this.socket.channel("audio:lobby");
|
this.channel = this.socket.channel("audio:lobby");
|
||||||
@ -21,40 +19,32 @@ export const VadHook = {
|
|||||||
console.log("✅ Canal audio:lobby unido.");
|
console.log("✅ Canal audio:lobby unido.");
|
||||||
});
|
});
|
||||||
|
|
||||||
// Preparar VAD pero no arrancar aún
|
const myvad = await vad.MicVAD.new({
|
||||||
this.myvad = await vad.MicVAD.new({
|
|
||||||
onSpeechStart: () => {
|
onSpeechStart: () => {
|
||||||
statusDiv.textContent = "🎤 Voz detectada...";
|
statusDiv.textContent = "🎤 Voz detectada...";
|
||||||
},
|
},
|
||||||
onSpeechEnd: async (float32Audio) => {
|
onSpeechEnd: async (float32Audio) => {
|
||||||
statusDiv.textContent = "✅ Voz finalizada. Enviando audio...";
|
statusDiv.textContent = "✅ Voz finalizada. Enviando audio...";
|
||||||
|
|
||||||
|
// Enviar el audio correctamente formateado
|
||||||
await sendAudioChunk(float32Audio, this.channel);
|
await sendAudioChunk(float32Audio, this.channel);
|
||||||
|
|
||||||
|
// Indicar stop si querés (como payload vacío JSON)
|
||||||
this.channel.push("stop_audio", {});
|
this.channel.push("stop_audio", {});
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
// Esperar eventos desde LiveView
|
myvad.start();
|
||||||
this.handleEvent("init-vad", async () => {
|
statusDiv.textContent = "🚀 VAD iniciado.";
|
||||||
await this.myvad.start();
|
|
||||||
statusDiv.textContent = "🚀 VAD iniciado.";
|
|
||||||
});
|
|
||||||
|
|
||||||
this.handleEvent("stop-vad", async () => {
|
|
||||||
if (this.myvad) {
|
|
||||||
await this.myvad.stop();
|
|
||||||
statusDiv.textContent = "🛑 VAD detenido.";
|
|
||||||
}
|
|
||||||
});
|
|
||||||
};
|
};
|
||||||
|
|
||||||
document.body.appendChild(vadScript);
|
document.body.appendChild(vadScript);
|
||||||
};
|
};
|
||||||
|
|
||||||
document.body.appendChild(ortScript);
|
document.body.appendChild(ortScript);
|
||||||
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// Convertir Float32Array a PCM 16-bit
|
// Función de helper para enviar el chunk
|
||||||
function float32ToInt16(float32Array) {
|
function float32ToInt16(float32Array) {
|
||||||
const int16Array = new Int16Array(float32Array.length);
|
const int16Array = new Int16Array(float32Array.length);
|
||||||
for (let i = 0; i < float32Array.length; i++) {
|
for (let i = 0; i < float32Array.length; i++) {
|
||||||
@ -64,16 +54,24 @@ function float32ToInt16(float32Array) {
|
|||||||
return int16Array;
|
return int16Array;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Enviar audio binario al canal
|
|
||||||
async function sendAudioChunk(float32Audio, channel) {
|
async function sendAudioChunk(float32Audio, channel) {
|
||||||
const pcm16 = float32ToInt16(float32Audio);
|
const pcm16 = float32ToInt16(float32Audio);
|
||||||
const header = JSON.stringify({ sample_rate: 16000 });
|
const header = JSON.stringify({ sample_rate: 16000 });
|
||||||
const headerBytes = new TextEncoder().encode(header);
|
const headerBytes = new TextEncoder().encode(header);
|
||||||
const totalLength = 2 + headerBytes.length + pcm16.byteLength;
|
const audioBytes = new Uint8Array(pcm16.buffer); // same as merged in el otro ejemplo
|
||||||
|
const totalLength = 2 + headerBytes.length + audioBytes.length;
|
||||||
const buffer = new ArrayBuffer(totalLength);
|
const buffer = new ArrayBuffer(totalLength);
|
||||||
const view = new DataView(buffer);
|
const view = new DataView(buffer);
|
||||||
view.setUint16(0, headerBytes.length, true);
|
|
||||||
|
// Encabezado: longitud en big endian
|
||||||
|
view.setUint16(0, headerBytes.length, false); // <== big endian
|
||||||
|
|
||||||
|
// Copiar header y audio al buffer
|
||||||
new Uint8Array(buffer, 2, headerBytes.length).set(headerBytes);
|
new Uint8Array(buffer, 2, headerBytes.length).set(headerBytes);
|
||||||
new Uint8Array(buffer, 2 + headerBytes.length).set(new Uint8Array(pcm16.buffer));
|
new Uint8Array(buffer, 2 + headerBytes.length).set(audioBytes);
|
||||||
channel.pushBinary(buffer);
|
|
||||||
|
// Enviar el buffer binario
|
||||||
|
channel.push("audio_chunk", buffer);
|
||||||
|
console.log("📤 Chunk binario enviado");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -26,8 +26,8 @@ defmodule Whisper.Application do
|
|||||||
{Phoenix.PubSub, name: Whisper.PubSub},
|
{Phoenix.PubSub, name: Whisper.PubSub},
|
||||||
WhisperWeb.Endpoint,
|
WhisperWeb.Endpoint,
|
||||||
Whisper.Counter,
|
Whisper.Counter,
|
||||||
AudioBuffer,
|
AudioBuffer
|
||||||
AudioFilesList
|
# AudioFilesList
|
||||||
]
|
]
|
||||||
|
|
||||||
opts = [strategy: :one_for_one, name: Whisper.Supervisor]
|
opts = [strategy: :one_for_one, name: Whisper.Supervisor]
|
||||||
|
@ -1,66 +1,66 @@
|
|||||||
defmodule AudioFilesList do
|
# defmodule AudioFilesList do
|
||||||
use GenServer
|
# use GenServer
|
||||||
require Logger
|
# require Logger
|
||||||
alias Phoenix.PubSub
|
# alias Phoenix.PubSub
|
||||||
|
|
||||||
def start_link(_opts) do
|
# def start_link(_opts) do
|
||||||
GenServer.start_link(__MODULE__, :idle, name: __MODULE__)
|
# GenServer.start_link(__MODULE__, :idle, name: __MODULE__)
|
||||||
end
|
# end
|
||||||
|
|
||||||
def add_file(path) do
|
# def add_file(path) do
|
||||||
Logger.debug("add file")
|
# Logger.debug("add file")
|
||||||
GenServer.cast(__MODULE__, {:new_file, path})
|
# GenServer.cast(__MODULE__, {:new_file, path})
|
||||||
end
|
# end
|
||||||
|
|
||||||
def init(:idle) do
|
# def init(:idle) do
|
||||||
Logger.info("AudioFilesList iniciado")
|
# Logger.info("AudioFilesList iniciado")
|
||||||
{:ok, %{queue: [], processing: false}}
|
# {:ok, %{queue: [], processing: false}}
|
||||||
end
|
# end
|
||||||
|
|
||||||
|
|
||||||
def handle_cast({:new_file, path}, %{queue: queue, processing: false} = state) do
|
# def handle_cast({:new_file, path}, %{queue: queue, processing: false} = state) do
|
||||||
Logger.info("📥 Archivo encolado: #{path}")
|
# Logger.info("📥 Archivo encolado: #{path}")
|
||||||
queue = queue ++ [path]
|
# queue = queue ++ [path]
|
||||||
[next | rest] = queue
|
# [next | rest] = queue
|
||||||
{:noreply, %{queue: rest, processing: false}}
|
# {:noreply, %{queue: rest, processing: false}}
|
||||||
end
|
# end
|
||||||
|
|
||||||
def handle_cast({:new_file, path}, %{queue: queue, processing: true} = state) do
|
# def handle_cast({:new_file, path}, %{queue: queue, processing: true} = state) do
|
||||||
{:noreply, %{state | queue: queue ++ [path]}}
|
# {:noreply, %{state | queue: queue ++ [path]}}
|
||||||
end
|
# end
|
||||||
|
|
||||||
def handle_info(:done, %{queue: []} = state) do
|
# def handle_info(:done, %{queue: []} = state) do
|
||||||
{:noreply, %{state | processing: false}}
|
# {:noreply, %{state | processing: false}}
|
||||||
end
|
# end
|
||||||
|
|
||||||
def handle_info(:done, %{queue: [next | rest]} = state) do
|
# def handle_info(:done, %{queue: [next | rest]} = state) do
|
||||||
{:noreply, %{state | queue: rest}}
|
# {:noreply, %{state | queue: rest}}
|
||||||
end
|
# end
|
||||||
|
|
||||||
# def handle_cast({:done_processing, path}, %{queue: queue} = state) do
|
# # def handle_cast({:done_processing, path}, %{queue: queue} = state) do
|
||||||
# new_queue = Enum.reject(queue, fn p -> p == path end)
|
# # new_queue = Enum.reject(queue, fn p -> p == path end)
|
||||||
# Logger.info("🗑️ Archivo eliminado y removido de la cola: #{path}")
|
# # Logger.info("🗑️ Archivo eliminado y removido de la cola: #{path}")
|
||||||
# {:noreply, %{state | queue: new_queue}}
|
# # {:noreply, %{state | queue: new_queue}}
|
||||||
# end
|
# # end
|
||||||
|
|
||||||
# defp process_file(path) do
|
# # defp process_file(path) do
|
||||||
# Logger.info("▶️ Inicia procesamiento realtime: #{path}")
|
# # Logger.info("▶️ Inicia procesamiento realtime: #{path}")
|
||||||
|
|
||||||
# Task.start(fn ->
|
# # Task.start(fn ->
|
||||||
# case Whisper.SendToModel.realtime(path) do
|
# # case Whisper.SendToModel.realtime(path) do
|
||||||
# {:ok, text} when is_binary(text) ->
|
# # {:ok, text} when is_binary(text) ->
|
||||||
# message = %{"chunks" => [%{"text" => text}]}
|
# # message = %{"chunks" => [%{"text" => text}]}
|
||||||
# Phoenix.PubSub.broadcast(Whisper.PubSub, "transcription", {:transcription, Jason.encode!(message)})
|
# # Phoenix.PubSub.broadcast(Whisper.PubSub, "transcription", {:transcription, Jason.encode!(message)})
|
||||||
# Logger.info("✅ Transcripción (realtime): #{text}")
|
# # Logger.info("✅ Transcripción (realtime): #{text}")
|
||||||
# File.rm!(path)
|
# # File.rm!(path)
|
||||||
# # AudioFilesList.done_processing(path)
|
# # # AudioFilesList.done_processing(path)
|
||||||
# {:error, reason} ->
|
# # {:error, reason} ->
|
||||||
# Logger.error("❌ Error transcribiendo: #{inspect(reason)}")
|
# # Logger.error("❌ Error transcribiendo: #{inspect(reason)}")
|
||||||
# end
|
# # end
|
||||||
|
|
||||||
# send(__MODULE__, :done)
|
# # send(__MODULE__, :done)
|
||||||
|
|
||||||
# end)
|
# # end)
|
||||||
# end
|
# # end
|
||||||
|
|
||||||
end
|
# end
|
||||||
|
@ -26,22 +26,20 @@ defmodule WhisperWeb.AudioChannel do
|
|||||||
<<header_len::16, rest::binary>> = raw_binary
|
<<header_len::16, rest::binary>> = raw_binary
|
||||||
<<header::binary-size(header_len), audio::binary>> = rest
|
<<header::binary-size(header_len), audio::binary>> = rest
|
||||||
|
|
||||||
IO.inspect(header, label: "HEADER BINARIO RECIBIDO")
|
%{"sample_rate" => rate} = Jason.decode!(header)
|
||||||
|
ref = socket_id(socket)
|
||||||
|
|
||||||
case Jason.decode(header) do
|
Logger.info("Chunk recibido: #{byte_size(audio)} bytes, sample_rate: #{rate}")
|
||||||
{:ok, %{"sample_rate" => rate}} ->
|
AudioBuffer.append(ref, {rate, audio})
|
||||||
Logger.info("Chunk recibido: #{byte_size(audio)} bytes, sample_rate: #{rate}")
|
|
||||||
AudioBuffer.append(socket_id(socket), {rate, audio})
|
|
||||||
{:noreply, socket}
|
|
||||||
|
|
||||||
{:error, reason} ->
|
# {:ok, path} = AudioSaver.save_chunk_as_wav(ref, audio, rate, "part")
|
||||||
Logger.error("Error decodificando header JSON: #{inspect(reason)}")
|
# AudioFilesList.add_file(path)
|
||||||
{:noreply, socket}
|
|
||||||
end
|
|
||||||
|
{:noreply, socket}
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@doc """
|
@doc """
|
||||||
Recupera todos los chunks acumulados en el buffer, los concatena y guarda un archivo WAV final (sufijo `"final"`).
|
Recupera todos los chunks acumulados en el buffer, los concatena y guarda un archivo WAV final (sufijo `"final"`).
|
||||||
"""
|
"""
|
||||||
@ -60,8 +58,8 @@ defmodule WhisperWeb.AudioChannel do
|
|||||||
Task.start(fn ->
|
Task.start(fn ->
|
||||||
transcription = Whisper.SendToModel.large(path)
|
transcription = Whisper.SendToModel.large(path)
|
||||||
Logger.info("✅ Transcripción completa:\n#{transcription}")
|
Logger.info("✅ Transcripción completa:\n#{transcription}")
|
||||||
# message = %{"chunks" => [%{"text" => transcription}]}
|
message = %{"chunks" => [%{"text" => transcription}]}
|
||||||
# Phoenix.PubSub.broadcast(Whisper.PubSub, "transcription", {:transcription_m, Jason.encode!(message)})
|
Phoenix.PubSub.broadcast(Whisper.PubSub, "transcription", {:transcription, Jason.encode!(message)})
|
||||||
File.rm!(path)
|
File.rm!(path)
|
||||||
end)
|
end)
|
||||||
end
|
end
|
||||||
|
@ -1,28 +1,48 @@
|
|||||||
defmodule WhisperWeb.VadLive do
|
defmodule WhisperWeb.VadLive do
|
||||||
use WhisperWeb, :live_view
|
use WhisperWeb, :live_view
|
||||||
|
alias Phoenix.PubSub
|
||||||
|
|
||||||
def mount(_, _, socket) do
|
def mount(_, _, socket) do
|
||||||
{:ok, assign(socket, started: false)}
|
PubSub.subscribe(Whisper.PubSub, "transcription")
|
||||||
|
|
||||||
|
socket =
|
||||||
|
socket
|
||||||
|
|> assign(:transcription, "")
|
||||||
|
|> assign(:started, false)
|
||||||
|
|
||||||
|
{:ok, socket}
|
||||||
|
end
|
||||||
|
|
||||||
|
def handle_event("start_vad", _params, socket) do
|
||||||
|
push_event(socket, "init-vad", %{})
|
||||||
|
{:noreply, assign(socket, started: true)}
|
||||||
|
end
|
||||||
|
|
||||||
|
def handle_info({:transcription, raw_json}, socket) do
|
||||||
|
new_text =
|
||||||
|
raw_json
|
||||||
|
|> Jason.decode!()
|
||||||
|
|> get_in(["chunks", Access.at(0), "text"])
|
||||||
|
{:noreply, update(socket, :transcription, &(&1 <> " " <> new_text))}
|
||||||
end
|
end
|
||||||
|
|
||||||
def render(assigns) do
|
def render(assigns) do
|
||||||
~H"""
|
~H"""
|
||||||
<div id="vad-container" phx-hook="VadHook">
|
<div id="vad-container" phx-hook="VadHook">
|
||||||
<button phx-click="start_vad" class="btn btn-primary">🎙 Iniciar VAD</button>
|
<button phx-click="start_vad" class="btn btn-primary">🎙 Iniciar VAD</button>
|
||||||
<button phx-click="stop_vad" class="btn btn-danger">🛑 Detener VAD</button>
|
|
||||||
<div id="vad-status" class="mt-4 text-sm text-gray-700"></div>
|
<div id="vad-status" class="mt-4 text-sm text-gray-700"></div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<div id="transcriptionContainer" class="w-full max-w-2xl space-y-4">
|
||||||
|
<%= if @transcription != "" do %>
|
||||||
|
<div class="p-4 bg-gray-100 rounded shadow-md">
|
||||||
|
<h2 class="text-sm font-semibold text-gray-700 mb-2">✅ Transcripción</h2>
|
||||||
|
<p class="text-green-600 whitespace-pre-wrap break-words text-sm leading-relaxed"><%= @transcription %></p>
|
||||||
|
</div>
|
||||||
|
<% end %>
|
||||||
|
</div>
|
||||||
"""
|
"""
|
||||||
end
|
end
|
||||||
|
|
||||||
def handle_event("start_vad", _, socket) do
|
|
||||||
push_event(socket, "init-vad", %{})
|
|
||||||
{:noreply, socket}
|
|
||||||
end
|
|
||||||
|
|
||||||
def handle_event("stop_vad", _, socket) do
|
|
||||||
push_event(socket, "stop-vad", %{})
|
|
||||||
{:noreply, socket}
|
|
||||||
end
|
|
||||||
|
|
||||||
end
|
end
|
||||||
|
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user