Real-time whisper.cpp with the tiny model
@@ -12,6 +12,8 @@ defmodule Recognition_VAD.Application do
       {DNSCluster, query: Application.get_env(:recognition_VAD, :dns_cluster_query) || :ignore},
       {Phoenix.PubSub, name: Recognition_VAD.PubSub},
       Recognition_VAD.AudioProcessor,
+      Recognition_VAD.WhisperStreamer,
+
       # Start the Finch HTTP client for sending emails
       {Finch, name: Recognition_VAD.Finch},
       # Start a worker by calling: Recognition_VAD.Worker.start_link(arg)
recognition_VAD/lib/recognition_VAD/whisper.ex (new file, 46 lines)
@@ -0,0 +1,46 @@
defmodule Recognition_VAD.Whisper do
  @default_model "ggml-tiny.bin"
  @script_path "/home/aime-pc2/i_m/whisper.cpp/transcribe.sh"
  require Logger

  def transcribe(path, model \\ @default_model) do
    path_to_run = convert_path_to_wsl(path)

    args = [@script_path, path_to_run, model]

    case System.cmd("wsl", args, stderr_to_stdout: true) do
      {output, 0} ->
        text = extract_transcription(output)
        Logger.info("📝 Transcription: #{text}")
        {:ok, text}

      {error_output, _} ->
        Logger.error("❌ whisper transcription failed: #{error_output}")
        {:error, error_output}
    end
  end

  # Map a Windows path onto the WSL mount, e.g. "C:/foo" -> "/mnt/c/foo".
  # Normalize backslashes first so "C:\Users\..." paths match the prefix too.
  defp convert_path_to_wsl(path) do
    path = String.replace(path, "\\", "/")

    if String.starts_with?(path, "C:/") do
      String.replace_prefix(path, "C:/", "/mnt/c/")
    else
      path
    end
  end

  # whisper.cpp prints its own logs to stdout; keep only lines that carry
  # transcribed text by dropping known log prefixes.
  defp extract_transcription(output) do
    output
    |> String.split("\n")
    |> Enum.filter(fn line ->
      line =~ ~r/[\p{L}\p{N}]/u and
        not String.starts_with?(line, "whisper_") and
        not String.starts_with?(line, "system_info") and
        not String.starts_with?(line, "main: ") and
        not String.starts_with?(line, "whisper_print_timings:")
    end)
    |> Enum.join(" ")
    |> String.trim()
  end
end
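For reference, a quick way to exercise this module from iex, assuming whisper.cpp and its transcribe.sh wrapper exist at the paths hard-coded above and a test WAV is on disk (the file path and output below are placeholders, not real results):

iex> Recognition_VAD.Whisper.transcribe("C:/Users/rolan/i_m/voice_recognition/recognition_VAD/tmp/sample.wav")
{:ok, "transcribed text"}

iex> Recognition_VAD.Whisper.transcribe("C:/tmp/sample.wav", "ggml-base.bin")
{:ok, "transcribed text"}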
recognition_VAD/lib/recognition_VAD/whisper_streamer.ex (new file, 56 lines)
@@ -0,0 +1,56 @@
defmodule Recognition_VAD.WhisperStreamer do
  use GenServer
  require Logger

  @transcribe_interval 2000  # every 2 seconds
  @max_chunks 100            # maximum number of chunks kept in memory

  def start_link(_opts) do
    GenServer.start_link(__MODULE__, %{chunks: [], sample_rate: 48000}, name: __MODULE__)
  end

  def push_chunk(chunk, sample_rate) do
    GenServer.cast(__MODULE__, {:chunk, chunk, sample_rate})
  end

  @impl true
  def init(state) do
    schedule_transcription()
    {:ok, state}
  end

  @impl true
  def handle_cast({:chunk, binary, sr}, %{chunks: chunks} = state) do
    new_chunks = [binary | chunks] |> Enum.take(@max_chunks)
    {:noreply, %{state | chunks: new_chunks, sample_rate: sr}}
  end

  @impl true
  def handle_info(:transcribe_timer, %{chunks: []} = state) do
    # No audio buffered; just reschedule.
    schedule_transcription()
    {:noreply, state}
  end

  def handle_info(:transcribe_timer, %{chunks: chunks, sample_rate: sr} = state) do
    # Transcription runs in an unlinked Task; if whisper takes longer than
    # @transcribe_interval, runs may overlap.
    Task.start(fn ->
      path = "C:/Users/rolan/i_m/voice_recognition/recognition_VAD/tmp/realtime_#{System.system_time(:millisecond)}.wav"
      File.mkdir_p!(Path.dirname(path))
      :ok = Recognition_VAD.WavWriter.write_pcm_chunks_to_wav(Enum.reverse(chunks), sr, path)

      case Recognition_VAD.Whisper.transcribe(path) do
        {:ok, text} when byte_size(text) > 0 ->
          Phoenix.PubSub.broadcast(Recognition_VAD.PubSub, "audio_output", {:realtime, %{"text" => text}})

        _ ->
          Logger.debug("⏱ Nothing to transcribe, or transcription failed")
      end

      # Remove the temporary WAV so files don't accumulate every cycle.
      File.rm(path)
    end)

    schedule_transcription()
    {:noreply, %{state | chunks: []}}
  end

  defp schedule_transcription do
    Process.send_after(self(), :transcribe_timer, @transcribe_interval)
  end
end
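Recognition_VAD.WavWriter is called above but is not part of this diff. For context, a minimal sketch of what write_pcm_chunks_to_wav/3 presumably does, assuming the buffered chunks are 16-bit little-endian mono PCM (the module layout here is an assumption, not the project's actual code):

defmodule Recognition_VAD.WavWriter do
  # Hypothetical sketch of the module referenced by WhisperStreamer.
  # Builds a canonical 44-byte RIFF/WAVE header for 16-bit mono PCM.
  def write_pcm_chunks_to_wav(chunks, sample_rate, path) do
    pcm = IO.iodata_to_binary(chunks)
    data_size = byte_size(pcm)
    riff_size = 36 + data_size
    byte_rate = sample_rate * 2   # channels (1) * bits (16) / 8

    header =
      <<"RIFF", riff_size::little-32, "WAVE",
        "fmt ", 16::little-32, 1::little-16, 1::little-16,
        sample_rate::little-32, byte_rate::little-32,
        2::little-16, 16::little-16,
        "data", data_size::little-32>>

    # Returns :ok or {:error, reason}, matching the `:ok =` assertion above.
    File.write(path, [header, pcm])
  end
end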
@@ -6,8 +6,15 @@ defmodule Recognition_VADWeb.DataChannel do
     {:ok, socket}
   end
 
+  # Partial transcript
+  def handle_info({:realtime, msg}, socket) do
+    push(socket, "realtime", msg)
+    {:noreply, socket}
+  end
+
+  # Complete sentence
   def handle_info({:broadcast_audio, msg}, socket) do
-    push(socket, "transcription", Jason.decode!(msg))
+    push(socket, "transcription", msg)
     {:noreply, socket}
   end
 
@@ -15,7 +22,7 @@ defmodule Recognition_VADWeb.DataChannel do
   def handle_in("audio_chunk", %{"data" => base64_chunk, "sample_rate" => sample_rate}, socket) do
     case Base.decode64(base64_chunk) do
       {:ok, binary_audio} ->
-        GenServer.cast(Recognition_VAD.AudioProcessor, {:chunk, binary_audio, sample_rate})
+        Recognition_VAD.WhisperStreamer.push_chunk(binary_audio, sample_rate)
         {:noreply, socket}
 
       :error ->
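Note that handle_info({:realtime, msg}, socket) only fires if the channel process is subscribed to the "audio_output" topic that WhisperStreamer broadcasts on; Phoenix does not subscribe channels to extra PubSub topics automatically. The subscription is not visible in this hunk; presumably join/3 does something like the following (a sketch, not the project's actual code):

def join("data:lobby", _payload, socket) do
  # Assumed: subscribe so PubSub broadcasts arrive as handle_info messages.
  :ok = Phoenix.PubSub.subscribe(Recognition_VAD.PubSub, "audio_output")
  {:ok, socket}
end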
@@ -7,114 +7,137 @@ defmodule Recognition_VADWeb.Stt.TestWithChannel do
 
   def render(assigns) do
     ~H"""
-    <div id="container">
-      <div id="status">Press "Start Recording"…</div>
-      <button id="startButton">Start Recording</button>
-      <button id="stopButton" disabled>Stop Recording</button>
-
-      <div id="transcriptionContainer">
-        <div id="transcription" class="realtime"></div>
-      </div>
-
-      <div id="fullTextContainer">
-        <div id="fullText"></div>
-      </div>
-
-      <script type="module">
-        import { Socket } from "https://cdn.skypack.dev/phoenix";
-
-        const statusDiv = document.getElementById("status");
-        const transcriptionDiv = document.getElementById("transcription");
-        const fullTextDiv = document.getElementById("fullText");
-        const startButton = document.getElementById("startButton");
-        const stopButton = document.getElementById("stopButton");
-
-        let socket, channel;
-        let audioContext, mediaStream, mediaProcessor;
-
-        async function startRecording() {
-          startButton.disabled = true;
-          stopButton.disabled = false;
-          statusDiv.textContent = "Recording…";
-          transcriptionDiv.textContent = "";
-          fullTextDiv.textContent = "";
-
-          socket = new Socket("ws://localhost:4000/socket");
-          socket.connect();
-
-          channel = socket.channel("data:lobby");
-          channel.join()
-            .receive("ok", () => {
-              statusDiv.textContent = "🎙 Connected to Phoenix STT";
-              console.log("Channel connected");
-            })
-            .receive("error", () => {
-              statusDiv.textContent = "❌ Connection error";
-              console.error("Channel connection error");
-            });
-
-          channel.on("realtime", payload => {
-            const words = payload.text.split(" ");
-            const lastWord = words.pop();
-            transcriptionDiv.innerHTML = `${words.join(" ")} <span class="last-word">${lastWord}</span>`;
-          });
-
-          channel.on("fullSentence", payload => {
-            fullTextDiv.innerHTML += payload.text + " ";
-          });
-
-          audioContext = new AudioContext();
-          mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true });
-          const input = audioContext.createMediaStreamSource(mediaStream);
-
-          mediaProcessor = audioContext.createScriptProcessor(1024, 1, 1);
-          mediaProcessor.onaudioprocess = (event) => {
-            const float32Array = event.inputBuffer.getChannelData(0);
-            const int16Array = new Int16Array(float32Array.length);
-            for (let i = 0; i < float32Array.length; i++) {
-              int16Array[i] = Math.max(-1, Math.min(1, float32Array[i])) * 0x7FFF;
-            }
-
-            const base64Audio = btoa(String.fromCharCode(...new Uint8Array(int16Array.buffer)));
-            channel.push("audio_chunk", {
-              data: base64Audio,
-              sample_rate: audioContext.sampleRate
-            });
-          };
-
-          input.connect(mediaProcessor);
-          mediaProcessor.connect(audioContext.destination);
-        }
-
-        function stopRecording() {
-          stopButton.disabled = true;
-          startButton.disabled = false;
-          statusDiv.textContent = "🛑 Recording stopped.";
-
-          // ✅ Send a special event to save the audio
-          if (channel) {
-            channel.push("save_audio", {});
-          }
-
-          if (mediaProcessor) mediaProcessor.disconnect();
-          if (audioContext) audioContext.close();
-          if (mediaStream) mediaStream.getTracks().forEach(track => track.stop());
-          if (channel) channel.leave();
-          if (socket) socket.disconnect();
-        }
-
-        document.getElementById("startButton").onclick = startRecording;
-        document.getElementById("stopButton").onclick = stopRecording;
-      </script>
-
-      <style>
-        .last-word {
-          font-weight: bold;
-          color: orange;
-        }
-      </style>
-    </div>
+      <div id="container">
+        <div id="status">Press "Start Recording"…</div>
+        <button id="startButton">Start Recording</button>
+        <button id="stopButton" disabled>Stop Recording</button>
+
+        <div id="transcriptionContainer">
+          <div id="transcription" class="realtime"></div>
+        </div>
+
+        <script type="module">
+          import { Socket } from "https://cdn.skypack.dev/phoenix";
+
+          const statusDiv = document.getElementById("status");
+          const transcriptionDiv = document.getElementById("transcription");
+          const startButton = document.getElementById("startButton");
+          const stopButton = document.getElementById("stopButton");
+
+          let socket, channel;
+          let audioContext, mediaStream, mediaProcessor;
+
+          async function startRecording() {
+            startButton.disabled = true;
+            stopButton.disabled = false;
+            statusDiv.textContent = "🎙 Recording…";
+            transcriptionDiv.innerHTML = "";
+
+            socket = new Socket("ws://localhost:4000/socket");
+            socket.connect();
+
+            channel = socket.channel("data:lobby");
+
+            channel.join()
+              .receive("ok", () => {
+                statusDiv.textContent = "✅ Connected to Phoenix STT";
+                console.log("Channel connected");
+              })
+              .receive("error", () => {
+                statusDiv.textContent = "❌ Channel connection error";
+                console.error("Channel connection error");
+              });
+
+            // Partial realtime transcript (words as they are spoken)
+            let partialTranscript = "";
+
+            channel.on("realtime", payload => {
+              const words = payload.text.split(" ");
+              const lastWord = words.pop();
+              const rest = words.join(" ");
+
+              if (rest.length > 0) {
+                partialTranscript += rest + " ";
+              }
+
+              transcriptionDiv.innerHTML = `
+                ${partialTranscript}<span class="last-word">${lastWord}</span>
+              `;
+            });
+
+            // Full sentence (after the chunks are processed)
+            channel.on("transcription", payload => {
+              const sentence = payload.text.trim();
+              if (sentence.length > 0) {
+                partialTranscript = ""; // reset the partial transcript
+                const span = document.createElement("div");
+                span.className = "sentence";
+                span.textContent = sentence;
+                transcriptionDiv.appendChild(span);
+                transcriptionDiv.innerHTML += "<br />";
+              }
+            });
+
+            // Audio setup
+            audioContext = new AudioContext();
+            mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true });
+            const input = audioContext.createMediaStreamSource(mediaStream);
+
+            mediaProcessor = audioContext.createScriptProcessor(1024, 1, 1);
+            mediaProcessor.onaudioprocess = (event) => {
+              const float32Array = event.inputBuffer.getChannelData(0);
+              const int16Array = new Int16Array(float32Array.length);
+              for (let i = 0; i < float32Array.length; i++) {
+                const s = Math.max(-1, Math.min(1, float32Array[i]));
+                int16Array[i] = s < 0 ? s * 0x8000 : s * 0x7FFF;
+              }
+
+              const base64Audio = btoa(String.fromCharCode(...new Uint8Array(int16Array.buffer)));
+              channel.push("audio_chunk", {
+                data: base64Audio,
+                sample_rate: audioContext.sampleRate
+              });
+            };
+
+            input.connect(mediaProcessor);
+            mediaProcessor.connect(audioContext.destination);
+          }
+
+          function stopRecording() {
+            stopButton.disabled = true;
+            startButton.disabled = false;
+            statusDiv.textContent = "🛑 Recording stopped.";
+
+            if (mediaProcessor) mediaProcessor.disconnect();
+            if (audioContext) audioContext.close();
+            if (mediaStream) mediaStream.getTracks().forEach(track => track.stop());
+            if (channel) channel.leave();
+            if (socket) socket.disconnect();
+          }
+
+          document.getElementById("startButton").onclick = startRecording;
+          document.getElementById("stopButton").onclick = stopRecording;
+        </script>
+
+        <style>
+          .last-word {
+            font-weight: bold;
+            color: orange;
+          }
+          #transcriptionContainer {
+            margin-top: 1rem;
+            font-family: sans-serif;
+            font-size: 1.1rem;
+          }
+          .sentence {
+            margin-bottom: 0.5rem;
+          }
+        </style>
+      </div>
     """
   end
 
 end
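The browser-side conversion above scales negative samples by 0x8000 and positive ones by 0x7FFF, so that -1.0 and 1.0 both land on valid Int16 extremes (-32768 and 32767). For illustration only, a server-side Elixir mirror of the same mapping (the module name is hypothetical, not part of the project):

defmodule PcmSketch do
  # Clamp to [-1.0, 1.0], then scale asymmetrically: Int16 ranges from
  # -32768 (0x8000) to 32767 (0x7FFF).
  def float_to_int16(s) when s < 0, do: trunc(max(s, -1.0) * 0x8000)
  def float_to_int16(s), do: trunc(min(s, 1.0) * 0x7FFF)

  # Encode a list of floats as 16-bit little-endian PCM, matching the
  # Int16Array buffer the page pushes over the channel.
  def encode(samples) do
    for s <- samples, into: <<>>, do: <<float_to_int16(s)::little-signed-16>>
  end
end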