From 46df5fc5eb067597b548e6969c5874d9ac8a43a3 Mon Sep 17 00:00:00 2001
From: aime-rolandi
Date: Wed, 2 Jul 2025 16:23:19 -0300
Subject: [PATCH] Realtime transcription and switch to the large model.
 Connect to the large-v3 model.

---
 recognition_VAD/assets/css/app.css                 |  4 +-
 recognition_VAD/assets/js/app.js                   | 13 ++-
 recognition_VAD/config/config.exs                  |  1 +
 .../lib/recognition_VAD/application.ex             |  1 +
 .../lib/recognition_VAD/audio_processor.ex         | 15 ++-
 .../lib/recognition_VAD/large_transcriber.ex       | 67 +++++++++++++
 .../lib/recognition_VAD/whisper.ex                 |  3 +-
 .../lib/recognition_VAD/whisper_streamer.ex        | 10 +-
 .../channels/data_channel.ex                       | 21 +++-
 .../lib/recognition_VAD_web/endpoint.ex            |  3 +-
 .../live/stt/test_with_channel.ex                  | 96 ++++++++++++++++---
 .../lib/recognition_VAD_web/router.ex              |  1 +
 12 files changed, 211 insertions(+), 24 deletions(-)
 create mode 100644 recognition_VAD/lib/recognition_VAD/large_transcriber.ex

diff --git a/recognition_VAD/assets/css/app.css b/recognition_VAD/assets/css/app.css
index d67ce14d..2723319c 100644
--- a/recognition_VAD/assets/css/app.css
+++ b/recognition_VAD/assets/css/app.css
@@ -30,7 +30,7 @@
   text-align: center;
 }
 #transcriptionContainer {
-  height: 90px; /* Fixed height for approximately 3 lines of text */
+  height: auto; /* Grow with the transcription instead of a fixed 3-line box */
   overflow-y: auto;
   width: 100%;
   padding: 10px;
@@ -83,4 +83,4 @@
 button:disabled {
   background-color: #cccccc;
   cursor: not-allowed;
- }
\ No newline at end of file
+ }

diff --git a/recognition_VAD/assets/js/app.js b/recognition_VAD/assets/js/app.js
index ce20ef7b..7597357d 100644
--- a/recognition_VAD/assets/js/app.js
+++ b/recognition_VAD/assets/js/app.js
@@ -22,9 +22,20 @@ import {Socket} from "phoenix"
 import {LiveSocket} from "phoenix_live_view"
 import topbar from "../vendor/topbar"
 import SttRecorder from "./stt_recorder.js";
+let csrfToken = document.querySelector("meta[name='csrf-token']").getAttribute("content");
+
+let Hooks = {};
+
+// Relays "audio_path" DOM events from the hook element to the LiveView.
+Hooks.AudioPathHook = {
+  mounted() {
+    this.el.addEventListener("audio_path", (event) => {
+      this.pushEvent("audio_path", { audio_path: event.detail.audio_path });
+    });
+  }
+};

 let liveSocket = new LiveSocket("/live", Socket, {
-  hooks: { SttRecorder },
+  hooks: { SttRecorder, ...Hooks }, // spread, so AudioPathHook is registered by name
   params: { _csrf_token: csrfToken }
 });

diff --git a/recognition_VAD/config/config.exs b/recognition_VAD/config/config.exs
index 3eee2063..4dd3ff03 100644
--- a/recognition_VAD/config/config.exs
+++ b/recognition_VAD/config/config.exs
@@ -18,6 +18,7 @@ config :recognition_VAD, Recognition_VADWeb.Endpoint,
     formats: [html: Recognition_VADWeb.ErrorHTML, json: Recognition_VADWeb.ErrorJSON],
     layout: false
   ],
+  server: true,
   pubsub_server: Recognition_VAD.PubSub,
   live_view: [signing_salt: "MLX284g+"]

diff --git a/recognition_VAD/lib/recognition_VAD/application.ex b/recognition_VAD/lib/recognition_VAD/application.ex
index c7d10410..aee46276 100644
--- a/recognition_VAD/lib/recognition_VAD/application.ex
+++ b/recognition_VAD/lib/recognition_VAD/application.ex
@@ -13,6 +13,7 @@ defmodule Recognition_VAD.Application do
       {Phoenix.PubSub, name: Recognition_VAD.PubSub},
       Recognition_VAD.AudioProcessor,
       Recognition_VAD.WhisperStreamer,
+      Recognition_VAD.LargeTranscriber,
       # Start the Finch HTTP client for sending emails
       {Finch, name: Recognition_VAD.Finch},

diff --git a/recognition_VAD/lib/recognition_VAD/audio_processor.ex b/recognition_VAD/lib/recognition_VAD/audio_processor.ex
index ae4e42ca..ef357848 100644
--- a/recognition_VAD/lib/recognition_VAD/audio_processor.ex
+++ b/recognition_VAD/lib/recognition_VAD/audio_processor.ex
@@ -11,22 +11,29 @@ defmodule Recognition_VAD.AudioProcessor do
   end

   def handle_cast({:chunk, binary_audio, sample_rate}, state) do
-    # 👇 Buffer the chunk
-    new_buffer = [binary_audio | state.buffer] |> Enum.take(100) # at most 100 chunks
+    # Keep every chunk: the old 100-chunk cap would truncate longer recordings
+    new_buffer = [binary_audio | state.buffer]

     Logger.info("🟡 Received chunk of #{byte_size(binary_audio)} bytes at #{sample_rate} Hz")
     {:noreply, %{state | buffer: new_buffer, sample_rate: sample_rate}}
   end
+
   def handle_cast(:save_wav, state) do
     timestamp = DateTime.utc_now() |> DateTime.to_unix()
     filename = "recording_#{timestamp}.wav"

     Recognition_VAD.WavWriter.write_pcm_chunks_to_wav(state.buffer, state.sample_rate, filename)
     Logger.info("💾 Saved file: #{filename}")
+    Recognition_VAD.LargeTranscriber.improve_transcription(filename)
+
+    # Notify the LiveView via PubSub
+    Phoenix.PubSub.broadcast(Recognition_VAD.PubSub, "audio_output", {:audio_saved, %{path: filename}})
+
     {:noreply, state}
   end
-
+
+  def handle_cast(:reset, state) do
+    Logger.info("🔄 Resetting the audio buffer for a new recording")
+    {:noreply, %{state | buffer: [], sample_rate: 0}}
+  end
 end
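Note: Recognition_VAD.WavWriter.write_pcm_chunks_to_wav/3 is called above but
is not part of this diff. For reference, a minimal sketch of such a writer,
assuming 16-bit mono PCM and a buffer built by prepending chunks (as
AudioProcessor does); module and function names mirror the call site, the body
is illustrative only:

    defmodule Recognition_VAD.WavWriter do
      # Hypothetical sketch, not part of this patch.
      def write_pcm_chunks_to_wav(chunks, sample_rate, filename) do
        # Chunks were prepended, so restore chronological order first.
        pcm = chunks |> Enum.reverse() |> IO.iodata_to_binary()
        data_size = byte_size(pcm)
        byte_rate = sample_rate * 2 # mono, 16-bit => 2 bytes per frame

        # Standard 44-byte RIFF/WAVE header for PCM (format tag 1, 1 channel).
        header =
          <<"RIFF", (36 + data_size)::little-32, "WAVE",
            "fmt ", 16::little-32, 1::little-16, 1::little-16,
            sample_rate::little-32, byte_rate::little-32,
            2::little-16, 16::little-16,
            "data", data_size::little-32>>

        File.write!(filename, [header, pcm])
      end
    end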
diff --git a/recognition_VAD/lib/recognition_VAD/large_transcriber.ex b/recognition_VAD/lib/recognition_VAD/large_transcriber.ex
new file mode 100644
index 00000000..3d4be5cd
--- /dev/null
+++ b/recognition_VAD/lib/recognition_VAD/large_transcriber.ex
@@ -0,0 +1,67 @@
+defmodule Recognition_VAD.LargeTranscriber do
+  use GenServer
+  require Logger
+
+  @default_model "ggml-large-v3-turbo.bin"
+  @script_path "/home/aime-pc2/i_m/whisper.cpp/large_transcribe.sh"
+
+  def start_link(_opts) do
+    GenServer.start_link(__MODULE__, %{}, name: __MODULE__)
+  end
+
+  @impl true
+  def init(state) do
+    {:ok, state}
+  end
+
+  @doc """
+  External entry point: re-transcribe the saved audio with the large model.
+  """
+  def improve_transcription(audio_path) do
+    GenServer.cast(__MODULE__, {:improve, audio_path})
+  end
+
+  @impl true
+  def handle_cast({:improve, path}, state) do
+    Logger.info("🚀 LargeTranscriber received path: #{path}")
+    # The app runs on Windows while whisper.cpp runs inside WSL, so the
+    # project directory is addressed through its /mnt/c/... mount point.
+    large_path = "/mnt/c/Users/rolan/i_m/voice_recognition/recognition_VAD/#{path}"
+    Phoenix.PubSub.broadcast(Recognition_VAD.PubSub, "large", {:large_path, :info, "#{path}"})
+
+    transcribe(large_path, @default_model)
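Note: the /mnt/c/... prefix above hard-codes one machine's layout. A sketch of
a more portable helper (hypothetical, not part of this patch), assuming the
wslpath utility is available in the default WSL distribution:

    # Let WSL translate a Windows path to its /mnt/... equivalent.
    defp to_wsl_path(windows_path) do
      case System.cmd("wsl", ["wslpath", "-a", windows_path], stderr_to_stdout: true) do
        {path, 0} -> {:ok, String.trim(path)}
        {error, _} -> {:error, error}
      end
    end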
+
+    {:noreply, state}
+  end
+
+  def transcribe(path, model) do
+    args = [@script_path, path, model]
+
+    case System.cmd("wsl", args, stderr_to_stdout: true) do
+      {output, 0} ->
+        text = extract_transcription(output)
+        Phoenix.PubSub.broadcast(Recognition_VAD.PubSub, "large", {:transcription_improved, :info, "#{text}"})
+        {:ok, text}
+
+      {error_output, _} ->
+        Logger.error("❌ whisper transcription failed: #{error_output}")
+        {:error, error_output}
+    end
+  end
+
+  # Keep only lines that look like spoken text, dropping whisper.cpp's
+  # model-loading, system-info and timing output.
+  defp extract_transcription(output) do
+    output
+    |> String.split("\n")
+    |> Enum.filter(fn line ->
+      line =~ ~r/[\p{L}\p{N}]/u and
+        not String.starts_with?(line, "whisper_") and
+        not String.starts_with?(line, "system_info") and
+        not String.starts_with?(line, "main: ") and
+        not String.starts_with?(line, "whisper_print_timings:")
+    end)
+    |> Enum.join(" ")
+    |> String.trim()
+  end
+end

diff --git a/recognition_VAD/lib/recognition_VAD/whisper.ex b/recognition_VAD/lib/recognition_VAD/whisper.ex
index a7791578..c7a1bd1d 100644
--- a/recognition_VAD/lib/recognition_VAD/whisper.ex
+++ b/recognition_VAD/lib/recognition_VAD/whisper.ex
@@ -11,7 +11,8 @@ defmodule Recognition_VAD.Whisper do
     case System.cmd("wsl", args, stderr_to_stdout: true) do
       {output, 0} ->
         text = extract_transcription(output)
-        Logger.info("📝 Transcription: #{text}")
+        Logger.info("📝 Realtime transcription: #{text}")
+
         {:ok, text}

       {error_output, _} ->

diff --git a/recognition_VAD/lib/recognition_VAD/whisper_streamer.ex b/recognition_VAD/lib/recognition_VAD/whisper_streamer.ex
index 1238e515..bd85c74c 100644
--- a/recognition_VAD/lib/recognition_VAD/whisper_streamer.ex
+++ b/recognition_VAD/lib/recognition_VAD/whisper_streamer.ex
@@ -2,7 +2,7 @@ defmodule Recognition_VAD.WhisperStreamer do
   use GenServer
   require Logger

-  @transcribe_interval 2000 # every 2 seconds
+  @transcribe_interval 1000 # every second
   @max_chunks 100 # maximum number of chunks kept in memory

   def start_link(_opts) do
@@ -47,9 +47,15 @@ defmodule Recognition_VAD.WhisperStreamer do
     end)

     schedule_transcription()
-    {:noreply, %{state | chunks: []}}
+
+    # 👉 Carry over ~25% of the previous audio as context for the next pass
+    overlap_chunks =
+      Enum.take(Enum.reverse(chunks), trunc(length(chunks) * 0.25))
+
+    {:noreply, %{state | chunks: overlap_chunks}}
   end

   defp schedule_transcription do
     Process.send_after(self(), :transcribe_timer, @transcribe_interval)
   end
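Note on the carry-over arithmetic: with eight buffered chunks,
trunc(8 * 0.25) leaves two. Assuming push_chunk/2 appends (so `chunks` is
oldest-first), the kept chunks are the most recent audio, returned
newest-first:

    chunks = [:c1, :c2, :c3, :c4, :c5, :c6, :c7, :c8]
    Enum.take(Enum.reverse(chunks), trunc(length(chunks) * 0.25))
    #=> [:c8, :c7]

If push_chunk/2 prepends instead, this keeps the oldest audio, so the
ordering convention is worth double-checking.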
diff --git a/recognition_VAD/lib/recognition_VAD_web/channels/data_channel.ex b/recognition_VAD/lib/recognition_VAD_web/channels/data_channel.ex
index 6b3c416b..59761d7e 100644
--- a/recognition_VAD/lib/recognition_VAD_web/channels/data_channel.ex
+++ b/recognition_VAD/lib/recognition_VAD_web/channels/data_channel.ex
@@ -1,11 +1,13 @@
 defmodule Recognition_VADWeb.DataChannel do
   use Phoenix.Channel
+  require Logger

   def join("data:lobby", _params, socket) do
     Phoenix.PubSub.subscribe(Recognition_VAD.PubSub, "audio_output")
     {:ok, socket}
   end

+  # Partial result
   def handle_info({:realtime, msg}, socket) do
     push(socket, "realtime", msg)
     {:noreply, socket}
   end

   # Complete result
   def handle_info({:broadcast_audio, msg}, socket) do
-    push(socket, "transcription", msg)
+    push(socket, "realtime", msg)
+    {:noreply, socket}
+  end
+
+  def handle_info({:audio_saved, %{path: _path}}, socket) do
+    {:noreply, socket}
+  end
+
+  def handle_in("start_recording", _params, socket) do
+    # A new recording starts: clear the PCM buffer accumulated so far
+    GenServer.cast(Recognition_VAD.AudioProcessor, :reset)
     {:noreply, socket}
   end

   # Receives base64-encoded audio (for safe transport)
   def handle_in("audio_chunk", %{"data" => base64_chunk, "sample_rate" => sample_rate}, socket) do
+    Logger.debug("📥 Received audio_chunk with sample_rate=#{sample_rate}")
     case Base.decode64(base64_chunk) do
       {:ok, binary_audio} ->
         Recognition_VAD.WhisperStreamer.push_chunk(binary_audio, sample_rate)
+        # Also buffer the raw PCM so the full recording can be saved later
+        GenServer.cast(Recognition_VAD.AudioProcessor, {:chunk, binary_audio, sample_rate})
         {:noreply, socket}

       :error ->
@@ -32,11 +46,14 @@
   end

   def handle_in("save_audio", _params, socket) do
+    GenServer.cast(Recognition_VAD.AudioProcessor, :save_wav)
     {:noreply, socket}
   end

+
   def handle_in(_unknown, _payload, socket) do
     {:noreply, socket}
   end
+
 end
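Note: a minimal way to exercise the new channel flow (a sketch using
Phoenix.ChannelTest; the ChannelCase and UserSocket modules are assumed to
exist as in a generated Phoenix app):

    defmodule Recognition_VADWeb.DataChannelTest do
      # Hypothetical test sketch; not part of this patch.
      use Recognition_VADWeb.ChannelCase

      test "reset, stream a chunk, then save" do
        {:ok, _reply, socket} =
          socket(Recognition_VADWeb.UserSocket, "user:1", %{})
          |> subscribe_and_join(Recognition_VADWeb.DataChannel, "data:lobby")

        push(socket, "start_recording", %{})

        # Three 16-bit little-endian samples, base64-encoded like the client does
        pcm = <<0::little-16, 1000::little-16, -1000::little-signed-16>>
        push(socket, "audio_chunk", %{"data" => Base.encode64(pcm), "sample_rate" => 16_000})

        push(socket, "save_audio", %{})
      end
    end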
diff --git a/recognition_VAD/lib/recognition_VAD_web/endpoint.ex b/recognition_VAD/lib/recognition_VAD_web/endpoint.ex
index 4836eb43..4f4fe40b 100644
--- a/recognition_VAD/lib/recognition_VAD_web/endpoint.ex
+++ b/recognition_VAD/lib/recognition_VAD_web/endpoint.ex
@@ -17,7 +17,8 @@ defmodule Recognition_VADWeb.Endpoint do
   socket "/socket", Recognition_VADWeb.UserSocket,
     websocket: true,
-    longpoll: false
+    longpoll: false,
+    pubsub_server: Recognition_VAD.PubSub

   # Serve at "/" the static files from "priv/static" directory.
   #
diff --git a/recognition_VAD/lib/recognition_VAD_web/live/stt/test_with_channel.ex b/recognition_VAD/lib/recognition_VAD_web/live/stt/test_with_channel.ex
index c5523e83..2efa9851 100644
--- a/recognition_VAD/lib/recognition_VAD_web/live/stt/test_with_channel.ex
+++ b/recognition_VAD/lib/recognition_VAD_web/live/stt/test_with_channel.ex
@@ -1,19 +1,81 @@
 defmodule Recognition_VADWeb.Stt.TestWithChannel do
   use Recognition_VADWeb, :live_view
+  require Logger

   def mount(_params, _session, socket) do
+    Phoenix.PubSub.subscribe(Recognition_VAD.PubSub, "large")
+
+    socket =
+      socket
+      |> assign(improved_transcription: "")
+      |> assign(audio_path: nil)
+      |> assign(realtime_transcription: "")
+      |> assign(improving?: false)
+      |> assign(view_stop: false)
+      |> assign(view_start: true)
+      |> assign(stop_recording: false)
+
     {:ok, socket}
   end

+  def handle_event("start", %{"value" => ""}, socket) do
+    socket = assign(socket, view_start: false, view_stop: true)
+    {:noreply, socket}
+  end
+
+  def handle_event("stop_recording", %{"value" => ""}, socket) do
+    Logger.debug("stop_recording event in LiveView")
+    socket = assign(socket, stop_recording: true)
+    {:noreply, socket}
+  end
+
+  def handle_info({:large_path, _level, large_path}, socket) do
+    Logger.debug("large_path received in LiveView: #{large_path}")
+    {:noreply, assign(socket, audio_path: large_path)}
+  end
+
+  def handle_info({:transcription_improved, _level, text}, socket) do
+    Logger.debug("improved transcription received in LiveView")
+    # The large pass is done, so the temporary WAV can be removed
+    File.rm!(socket.assigns.audio_path)
+    {:noreply, assign(socket, improved_transcription: text, improving?: true)}
+  end
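+
+  # Note: assets/js/app.js registers an AudioPathHook that pushes "audio_path"
+  # events to the server; using it would still need an element here with
+  # phx-hook="AudioPathHook" and a matching handle_event("audio_path", ...).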
+
   def render(assigns) do
     ~H"""
-    <div>Press "Start Recording"…</div>
-    <button>Start Recording</button>
-    <button>Stop Recording</button>
+    <%= if @view_start == true do %>
+      <button phx-click="start">Start Recording</button>
+    <% else %>
+      <button disabled>Start Recording</button>
+    <% end %>
+
+    <%= if @view_stop == true do %>
+      <button phx-click="stop_recording">Stop Recording</button>
+    <% else %>
+      <button disabled>Stop Recording</button>
+    <% end %>
+
+    <%= case [@stop_recording, @improving?] do %>
+      <% [true, false] -> %>
+        <div>Improving transcription...</div>
+      <% [true, true] -> %>
+        <div>Final transcription.</div>
+      <% _ -> %>
+        <div>Press "Start Recording"…</div>
+    <% end %>

-    <div id="transcriptionContainer" phx-update="ignore"></div>
+    <%= if @improving? == false do %>
+      <div id="transcriptionContainer" phx-update="ignore"></div>
+    <% else %>
+      <div id="transcriptionContainer"><%= @improved_transcription %></div>
+    <% end %>
     """
   end
 end
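Note: a quick way to exercise the save-and-improve pipeline end to end from
IEx, assuming the WSL/whisper.cpp setup this patch expects is in place
(filenames illustrative):

    Phoenix.PubSub.subscribe(Recognition_VAD.PubSub, "large")

    # One second of 16 kHz, 16-bit mono silence
    silence = :binary.copy(<<0, 0>>, 16_000)
    GenServer.cast(Recognition_VAD.AudioProcessor, {:chunk, silence, 16_000})
    GenServer.cast(Recognition_VAD.AudioProcessor, :save_wav)

    # With everything wired up, the mailbox should receive, in order:
    #   {:large_path, :info, "recording_<unix timestamp>.wav"}
    #   {:transcription_improved, :info, "<text from the large-v3 pass>"}
    flush()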