Realtime transcription plus switchover to the large-model transcription pass. Connects to the large-v3 model.
@@ -30,7 +30,7 @@
      text-align: center;
    }
    #transcriptionContainer {
-      height: 90px; /* fixed height, roughly 3 lines of text */
+      height: auto; /* grow with content; overflow-y keeps it scrollable */
      overflow-y: auto;
      width: 100%;
      padding: 10px;
@@ -83,4 +83,4 @@
    button:disabled {
      background-color: #cccccc;
      cursor: not-allowed;
-    }
+    }
@@ -22,9 +22,20 @@ import {Socket} from "phoenix"
import {LiveSocket} from "phoenix_live_view"
import topbar from "../vendor/topbar"
import SttRecorder from "./stt_recorder.js";
let csrfToken = document.querySelector("meta[name='csrf-token']").getAttribute("content");

+let Hooks = {};
+
+Hooks.AudioPathHook = {
+  mounted() {
+    this.el.addEventListener("audio_path", (event) => {
+      this.pushEvent("audio_path", { audio_path: event.detail.audio_path });
+    });
+  }
+};
+
let liveSocket = new LiveSocket("/live", Socket, {
-  hooks: { SttRecorder },
+  hooks: { SttRecorder, ...Hooks }, // spread, so AudioPathHook registers as a top-level hook
  params: { _csrf_token: csrfToken }
});

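For AudioPathHook to mount, some element in the markup must carry phx-hook="AudioPathHook"; the stopRecording script further down dispatches its CustomEvent at an element with id "lv-container". A sketch of that anchor element, under the assumption that it wraps the page content (it does not appear anywhere in this diff):

    <div id="lv-container" phx-hook="AudioPathHook">
      <!-- page content; the hook relays "audio_path" CustomEvents to the LiveView -->
    </div>
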
@@ -18,6 +18,7 @@ config :recognition_VAD, Recognition_VADWeb.Endpoint,
    formats: [html: Recognition_VADWeb.ErrorHTML, json: Recognition_VADWeb.ErrorJSON],
    layout: false
  ],
+  server: true,
  pubsub_server: Recognition_VAD.PubSub,
  live_view: [signing_salt: "MLX284g+"]

@@ -13,6 +13,7 @@ defmodule Recognition_VAD.Application do
      {Phoenix.PubSub, name: Recognition_VAD.PubSub},
      Recognition_VAD.AudioProcessor,
      Recognition_VAD.WhisperStreamer,
+      Recognition_VAD.LargeTranscriber,

      # Start the Finch HTTP client for sending emails
      {Finch, name: Recognition_VAD.Finch},

@@ -11,22 +11,29 @@ defmodule Recognition_VAD.AudioProcessor do
  end

  def handle_cast({:chunk, binary_audio, sample_rate}, state) do
    # 👇 Stash the chunk in the buffer
-    new_buffer = [binary_audio | state.buffer] |> Enum.take(100) # cap at 100 chunks
+    new_buffer = [binary_audio | state.buffer] # 🔥 dropped the Enum.take(100) cap

    Logger.info("🟡 Received chunk of #{byte_size(binary_audio)} bytes at #{sample_rate} Hz")

    {:noreply, %{state | buffer: new_buffer, sample_rate: sample_rate}}
  end

  def handle_cast(:save_wav, state) do
    timestamp = DateTime.utc_now() |> DateTime.to_unix()
    filename = "recording_#{timestamp}.wav"

    Recognition_VAD.WavWriter.write_pcm_chunks_to_wav(state.buffer, state.sample_rate, filename)
    Logger.info("💾 Saved file: #{filename}")
+    Recognition_VAD.LargeTranscriber.improve_transcription(filename)

+    # Notify the LiveView via PubSub
+    Phoenix.PubSub.broadcast(Recognition_VAD.PubSub, "audio_output", {:audio_saved, %{path: filename}})
+
    {:noreply, state}
  end

+  def handle_cast(:reset, state) do
+    Logger.info("🔄 Resetting the audio buffer for a new recording")
+    {:noreply, %{state | buffer: [], sample_rate: 0}}
+  end
end

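Dropping the Enum.take(100) cap means the buffer now grows for as long as a recording runs. If that becomes a problem, one alternative is to cap by byte budget rather than chunk count. A minimal sketch, assuming the newest-first chunk order built above and a made-up @max_buffer_bytes limit (neither is part of this commit):

    # Sketch only: cap the buffer by total bytes instead of chunk count.
    # @max_buffer_bytes is an assumed limit, not part of this commit.
    @max_buffer_bytes 16_000 * 2 * 60 * 10  # ~10 min of 16 kHz 16-bit mono PCM

    defp cap_buffer(buffer) do
      {kept, _bytes} =
        Enum.reduce_while(buffer, {[], 0}, fn chunk, {acc, bytes} ->
          new_bytes = bytes + byte_size(chunk)

          if new_bytes <= @max_buffer_bytes do
            {:cont, {[chunk | acc], new_bytes}}
          else
            {:halt, {acc, bytes}}
          end
        end)

      # kept accumulated oldest-first; restore the newest-first order
      Enum.reverse(kept)
    end

The chunk handler would then call new_buffer = cap_buffer([binary_audio | state.buffer]).
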
							
								
								
									
recognition_VAD/lib/recognition_VAD/large_transcriber.ex (new file, 67 lines)
@@ -0,0 +1,67 @@
defmodule Recognition_VAD.LargeTranscriber do
  use GenServer
  require Logger

  @default_model "ggml-large-v3-turbo.bin"
  @script_path "/home/aime-pc2/i_m/whisper.cpp/large_transcribe.sh"

  def start_link(_opts) do
    GenServer.start_link(__MODULE__, %{}, name: __MODULE__)
  end

  @impl true
  def init(state) do
    {:ok, state}
  end

  @doc """
  External entry point: kicks off the improvement pass with the large model.
  """
  def improve_transcription(audio_path) do
    GenServer.cast(__MODULE__, {:improve, audio_path})
  end

  @impl true
  def handle_cast({:improve, path}, state) do
    Logger.info("🚀 LargeTranscriber received path: #{path}")
    large_path = "/mnt/c/Users/rolan/i_m/voice_recognition/recognition_VAD/#{path}"
    Phoenix.PubSub.broadcast(Recognition_VAD.PubSub, "large", {:large_path, :info, path})

    # Runs synchronously inside this cast; the result is broadcast from transcribe/2.
    transcribe(large_path, @default_model)

    {:noreply, state}
  end

  def transcribe(path, model) do
    args = [@script_path, path, model]

    case System.cmd("wsl", args, stderr_to_stdout: true) do
      {output, 0} ->
        text = extract_transcription(output)
        # Logger.info("📝 Improved transcription: #{text}")
        Phoenix.PubSub.broadcast(Recognition_VAD.PubSub, "large", {:transcription_improved, :info, text})
        # Phoenix.PubSub.broadcast(Recognition_VAD.PubSub, "audio_output", {:log_message, :info, text, "large"})

        {:ok, text}

      {error_output, _} ->
        Logger.error("❌ Error transcribing with whisper: #{error_output}")
        {:error, error_output}
    end
  end

  defp extract_transcription(output) do
    output
    |> String.split("\n")
    |> Enum.filter(fn line ->
      line =~ ~r/[\p{L}\p{N}]/u and
        not String.starts_with?(line, "whisper_") and
        not String.starts_with?(line, "system_info") and
        not String.starts_with?(line, "main: ") and
        not String.starts_with?(line, "whisper_print_timings:")
    end)
    |> Enum.join(" ")
    |> String.trim()
  end
end
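The module is fire-and-forget: callers cast a file path and results come back over PubSub on the "large" topic. A minimal sketch of the round trip this commit wires up (message shapes taken from the broadcasts above; the filename is illustrative):

    # Kick off the improvement pass for a saved WAV (relative path, as AudioProcessor passes it):
    Recognition_VAD.LargeTranscriber.improve_transcription("recording_1700000000.wav")

    # Any process that subscribed to the "large" topic...
    Phoenix.PubSub.subscribe(Recognition_VAD.PubSub, "large")

    # ...then receives, in handle_info/2:
    #   {:large_path, :info, path}              # echo of the file about to be processed
    #   {:transcription_improved, :info, text}  # final large-model transcription
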
@@ -11,7 +11,8 @@ defmodule Recognition_VAD.Whisper do
    case System.cmd("wsl", args, stderr_to_stdout: true) do
      {output, 0} ->
        text = extract_transcription(output)
-        Logger.info("📝 Transcription: #{text}")
+        Logger.info("📝 Real-time transcription: #{text}")

        {:ok, text}

      {error_output, _} ->

@@ -2,7 +2,7 @@ defmodule Recognition_VAD.WhisperStreamer do
  use GenServer
  require Logger

-  @transcribe_interval 2000  # every 2 seconds
+  @transcribe_interval 1000  # every 1 second
  @max_chunks 100            # max chunks to keep in memory

  def start_link(_opts) do
@@ -47,9 +47,15 @@ defmodule Recognition_VAD.WhisperStreamer do
    end)

    schedule_transcription()
-    {:noreply, %{state | chunks: []}}
+
+    # 👉 Keep 25% of the previous audio as context for the next pass
+    overlap_chunks =
+      Enum.take(Enum.reverse(chunks), trunc(length(chunks) * 0.25))
+
+    {:noreply, %{state | chunks: overlap_chunks}}
  end

  defp schedule_transcription do
    Process.send_after(self(), :transcribe_timer, @transcribe_interval)
  end

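push_chunk/2 is not shown in this diff, so the storage order of chunks is an assumption. If chunks are prepended newest-first (the way AudioProcessor builds its buffer), then Enum.take(Enum.reverse(chunks), n) keeps the oldest 25% of the window, not the most recent audio. A sketch of the tail-keeping variant under that assumption:

    # Assumes chunks is newest-first (prepended on arrival). Taking from the head
    # keeps the most recent 25% and preserves the newest-first shape of state.chunks.
    overlap_chunks = Enum.take(chunks, trunc(length(chunks) * 0.25))
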
@@ -1,11 +1,13 @@
defmodule Recognition_VADWeb.DataChannel do
  use Phoenix.Channel
+  require Logger

  def join("data:lobby", _params, socket) do
    Phoenix.PubSub.subscribe(Recognition_VAD.PubSub, "audio_output")
    {:ok, socket}
  end

  # Partial result
  def handle_info({:realtime, msg}, socket) do
    push(socket, "realtime", msg)
@@ -14,15 +16,27 @@ defmodule Recognition_VADWeb.DataChannel do

  # Full sentence
  def handle_info({:broadcast_audio, msg}, socket) do
-    push(socket, "transcription", msg)
+    push(socket, "realtime", msg)
    {:noreply, socket}
  end

+  def handle_info({:audio_saved, %{path: _path}}, socket) do
+    {:noreply, socket}
+  end
+
+  def handle_in("start_recording", _params, socket) do
+    GenServer.cast(Recognition_VAD.AudioProcessor, :reset)
+    {:noreply, socket}
+  end
+
  # Receives base64-encoded audio (for safe transport)
  def handle_in("audio_chunk", %{"data" => base64_chunk, "sample_rate" => sample_rate}, socket) do
+    Logger.debug("📥 Received audio_chunk with sample_rate=#{sample_rate}")
    case Base.decode64(base64_chunk) do
      {:ok, binary_audio} ->
        Recognition_VAD.WhisperStreamer.push_chunk(binary_audio, sample_rate)
        # GenServer.cast(Recognition_VAD.AudioProcessor, :save_wav)
+
+        GenServer.cast(Recognition_VAD.AudioProcessor, {:chunk, binary_audio, sample_rate}) # ✅ now enabled
        {:noreply, socket}

      :error ->
@@ -32,11 +46,14 @@ defmodule Recognition_VADWeb.DataChannel do
  end

  def handle_in("save_audio", _params, socket) do
    GenServer.cast(Recognition_VAD.AudioProcessor, :save_wav)
    {:noreply, socket}
  end

  def handle_in(_unknown, _payload, socket) do
    {:noreply, socket}
  end
end

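One caveat: handle_in("save_audio", ...) returns {:noreply, socket}, so the client's channel.push("save_audio", {}).receive("ok", resp => ...) callback in stopRecording below never fires, and resp.audio_path never arrives by that route. A hedged sketch of a replying variant; it assumes the filename moves out of AudioProcessor and into the channel, a restructuring this commit does not do:

    def handle_in("save_audio", _params, socket) do
      # Assumption: generate the filename here so it can ride back in the reply;
      # the commit currently generates it inside AudioProcessor's :save_wav handler,
      # which would need to accept it as an argument ({:save_wav, filename}).
      filename = "recording_#{DateTime.to_unix(DateTime.utc_now())}.wav"
      GenServer.cast(Recognition_VAD.AudioProcessor, {:save_wav, filename})
      {:reply, {:ok, %{audio_path: filename}}, socket}
    end
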
@@ -17,7 +17,8 @@ defmodule Recognition_VADWeb.Endpoint do

  socket "/socket", Recognition_VADWeb.UserSocket,
    websocket: true,
-    longpoll: false
+    longpoll: false,
+    pubsub_server: Recognition_VAD.PubSub

  # Serve at "/" the static files from "priv/static" directory.
  #

@@ -1,19 +1,81 @@
defmodule Recognition_VADWeb.Stt.TestWithChannel do
  use Recognition_VADWeb, :live_view
+  require Logger

  def mount(_params, _session, socket) do
+    Phoenix.PubSub.subscribe(Recognition_VAD.PubSub, "large")
+    socket =
+      socket
+      |> assign(improved_transcription: "")
+      |> assign(audio_path: nil)
+      |> assign(realtime_transcription: "")
+      |> assign(improving?: false)
+      |> assign(view_stop: false)
+      |> assign(view_start: true)
+      |> assign(stop_recording: false)
+
    {:ok, socket}
  end

+  def handle_event("start", %{"value" => ""}, socket) do
+    socket = assign(socket, view_start: false, view_stop: true)
+    {:noreply, socket}
+  end
+
+  def handle_event("stop_recording", %{"value" => ""}, socket) do
+    IO.inspect("stop_recording event in LiveView ----------------------")
+    socket = assign(socket, stop_recording: true)
+    {:noreply, socket}
+  end
+
+  def handle_info({:large_path, _level, large_path}, socket) do
+    IO.inspect(large_path, label: "large_path in live view ----------------------\n")
+    {:noreply, assign(socket, audio_path: large_path)}
+  end
+
+  def handle_info({:transcription_improved, _level, text}, socket) do
+    IO.inspect(text, label: "Log message received in LiveView ----------------------\n")
+    File.rm!(socket.assigns.audio_path)
+    {:noreply, assign(socket, improved_transcription: text, improving?: true)}
+  end
+
  def render(assigns) do
    ~H"""
      <div id="container">
-        <div id="status">Press "Start Recording"…</div>
-        <button id="startButton">Start Recording</button>
-        <button id="stopButton" disabled>Stop Recording</button>
+        <%= if @view_start == true do %>
+          <button id="startButton" phx-click="start">Start Recording</button>
+        <% else %>
+          <button id="startButton" disabled>Start Recording</button>
+        <% end %>
+
+        <%= if @view_stop == true do %>
+          <button id="stopButton" phx-click="stop_recording">Stop Recording</button>
+        <% else %>
+          <button id="stopButton" disabled>Stop Recording</button>
+        <% end %>
+
+        <%= case [@stop_recording, @improving?] do %>
+          <% [true, false] -> %>
+            <div id="status" class="px-3 py-1 text-xs font-medium leading-none font-bold text-blue-900 rounded-full animate-pulse">Improving transcription...</div>
+          <% [true, true] -> %>
+            <div id="status">Final transcription.</div>
+          <% _ -> %>
+            <div id="status">Press "Start Recording"…</div>
+        <% end %>

        <div id="transcriptionContainer">
-          <div id="transcription" class="realtime"></div>
+          <%= if @improving? == false do %>
+            <div>
+              <div id="transcription" phx-update="ignore" class="realtime px-3 py-1 text-xs font-medium leading-none font-bold text-blue-900 rounded-full animate-pulse"></div>
+            </div>
+          <% else %>
+            <div><%= @improved_transcription %></div>
+          <% end %>
        </div>

        <script type="module">
@@ -28,8 +90,8 @@ defmodule Recognition_VADWeb.Stt.TestWithChannel do
          let audioContext, mediaStream, mediaProcessor;

          async function startRecording() {
-            startButton.disabled = true;
-            stopButton.disabled = false;
+            // startButton.disabled = true;
+            // stopButton.disabled = false;
            statusDiv.textContent = "🎙 Recording…";
            transcriptionDiv.innerHTML = "";

@@ -42,6 +104,7 @@ defmodule Recognition_VADWeb.Stt.TestWithChannel do
              .receive("ok", () => {
                statusDiv.textContent = "✅ Connected to Phoenix STT";
                console.log("Channel joined");
+                channel.push("start_recording", {});
              })
              .receive("error", () => {
                statusDiv.textContent = "❌ Error joining channel";
@@ -65,7 +128,6 @@ defmodule Recognition_VADWeb.Stt.TestWithChannel do
              `;
            });

-
            // Full sentence (after chunks are processed)
            channel.on("transcription", payload => {
              const sentence = payload.text.trim();
@@ -107,15 +169,28 @@ defmodule Recognition_VADWeb.Stt.TestWithChannel do
          }

          function stopRecording() {
-            stopButton.disabled = true;
-            startButton.disabled = false;
            statusDiv.textContent = "🛑 Recording stopped.";

            if (mediaProcessor) mediaProcessor.disconnect();
            if (audioContext) audioContext.close();
            if (mediaStream) mediaStream.getTracks().forEach(track => track.stop());
-            if (channel) channel.leave();
-            if (socket) socket.disconnect();
+            if (channel) {
+              channel.push("save_audio", {}).receive("ok", (resp) => {
+                console.log("Received audio_path from the channel:", resp.audio_path);
+                const hookElement = document.getElementById("lv-container");
+                if (hookElement && resp.audio_path) {
+                  hookElement.dispatchEvent(new CustomEvent("audio_path", { detail: { audio_path: resp.audio_path } }));
+                }
+              });
+
+              // Wait 5 seconds before closing the channel and socket
+              setTimeout(() => {
+                console.log("Closing channel and socket after a 5-second grace period for late messages...");
+                channel.leave();
+                if (socket) socket.disconnect();
+              }, 5000);
+            }
          }

          document.getElementById("startButton").onclick = startRecording;
@@ -139,5 +214,4 @@ defmodule Recognition_VADWeb.Stt.TestWithChannel do
      </div>
    """
  end
-
end

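Hooks.AudioPathHook (app.js above) forwards the browser's audio_path CustomEvent to the server via pushEvent("audio_path", ...), but no matching handle_event/3 appears in this diff; an unhandled event would crash the LiveView. Unless it is defined elsewhere, the module would need something like this sketch:

    def handle_event("audio_path", %{"audio_path" => path}, socket) do
      {:noreply, assign(socket, audio_path: path)}
    end
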
@@ -19,6 +19,7 @@ defmodule Recognition_VADWeb.Router do

    get "/", PageController, :home
    live "/sttest", Stt.TestWithChannel
+
  end

  # Other scopes may use custom stacks.