diff --git a/recognition_VAD/lib/recognition_VAD/application.ex b/recognition_VAD/lib/recognition_VAD/application.ex
index 7c802ea4..c7d10410 100644
--- a/recognition_VAD/lib/recognition_VAD/application.ex
+++ b/recognition_VAD/lib/recognition_VAD/application.ex
@@ -12,6 +12,8 @@ defmodule Recognition_VAD.Application do
       {DNSCluster, query: Application.get_env(:recognition_VAD, :dns_cluster_query) || :ignore},
       {Phoenix.PubSub, name: Recognition_VAD.PubSub},
       Recognition_VAD.AudioProcessor,
+      Recognition_VAD.WhisperStreamer,
+
       # Start the Finch HTTP client for sending emails
       {Finch, name: Recognition_VAD.Finch},
       # Start a worker by calling: Recognition_VAD.Worker.start_link(arg)
diff --git a/recognition_VAD/lib/recognition_VAD/whisper.ex b/recognition_VAD/lib/recognition_VAD/whisper.ex
new file mode 100644
index 00000000..a7791578
--- /dev/null
+++ b/recognition_VAD/lib/recognition_VAD/whisper.ex
@@ -0,0 +1,58 @@
+defmodule Recognition_VAD.Whisper do
+  @moduledoc """
+  Runs the whisper.cpp transcription script through WSL and extracts the
+  recognized text from the script output.
+  """
+
+  require Logger
+
+  @default_model "ggml-tiny.bin"
+  @script_path "/home/aime-pc2/i_m/whisper.cpp/transcribe.sh"
+  # Output lines starting with any of these are tool diagnostics, not speech.
+  @noise_prefixes ["whisper_", "system_info", "main: "]
+
+  @doc """
+  Transcribes the audio file at `path` with the given whisper `model`.
+
+  Returns `{:ok, text}` when the script exits with status 0, otherwise
+  `{:error, output}` carrying the combined stdout/stderr of the script.
+  """
+  def transcribe(path, model \\ @default_model) do
+    args = [@script_path, convert_path_to_wsl(path), model]
+
+    case System.cmd("wsl", args, stderr_to_stdout: true) do
+      {output, 0} ->
+        text = extract_transcription(output)
+        Logger.info("📝 Transcripción: #{text}")
+        {:ok, text}
+
+      {error_output, _status} ->
+        Logger.error("❌ Error al transcribir con whisper: #{error_output}")
+        {:error, error_output}
+    end
+  end
+
+  # Maps a Windows drive path ("C:/..." or "C:\...") to its WSL mount point
+  # ("/mnt/c/..."). The drive separator is checked in both forms because a
+  # backslash path never starts with "C:/". Other paths are returned as-is.
+  defp convert_path_to_wsl(<<drive, ?:, sep, rest::binary>>)
+       when (drive in ?a..?z or drive in ?A..?Z) and sep in [?/, ?\\] do
+    "/mnt/" <> String.downcase(<<drive>>) <> "/" <> String.replace(rest, "\\", "/")
+  end
+
+  defp convert_path_to_wsl(path), do: path
+
+  # Keeps only lines that contain a letter or digit and are not diagnostics,
+  # then joins them into a single trimmed string.
+  defp extract_transcription(output) do
+    output
+    |> String.split("\n")
+    |> Enum.filter(&transcript_line?/1)
+    |> Enum.join(" ")
+    |> String.trim()
+  end
+
+  defp transcript_line?(line) do
+    line =~ ~r/[\p{L}\p{N}]/u and not String.starts_with?(line, @noise_prefixes)
+  end
+end
diff --git a/recognition_VAD/lib/recognition_VAD/whisper_streamer.ex b/recognition_VAD/lib/recognition_VAD/whisper_streamer.ex
new file mode 100644
index 00000000..1238e515
--- /dev/null
+++ b/recognition_VAD/lib/recognition_VAD/whisper_streamer.ex
@@ -0,0 +1,88 @@
+defmodule Recognition_VAD.WhisperStreamer do
+  @moduledoc """
+  Buffers incoming PCM audio chunks and, on a fixed interval, writes them to a
+  WAV file, transcribes it with `Recognition_VAD.Whisper` and broadcasts any
+  non-empty text on the "audio_output" PubSub topic.
+  """
+
+  use GenServer
+  require Logger
+
+  # Milliseconds between transcription runs.
+  @transcribe_interval 2000
+  # Maximum number of chunks kept in memory; the oldest are dropped first.
+  @max_chunks 100
+  # Where temporary WAV files go unless :realtime_tmp_dir is configured.
+  @default_tmp_dir "C:/Users/rolan/i_m/voice_recognition/recognition_VAD/tmp"
+
+  @doc "Starts the streamer registered under its module name; the options are ignored."
+  def start_link(_opts) do
+    GenServer.start_link(__MODULE__, %{chunks: [], sample_rate: 48000}, name: __MODULE__)
+  end
+
+  @doc "Queues a binary audio `chunk` recorded at `sample_rate` for the next run."
+  def push_chunk(chunk, sample_rate) do
+    GenServer.cast(__MODULE__, {:chunk, chunk, sample_rate})
+  end
+
+  @impl true
+  def init(state) do
+    schedule_transcription()
+    {:ok, state}
+  end
+
+  @impl true
+  def handle_cast({:chunk, binary, sr}, %{chunks: chunks} = state) do
+    # Chunks are kept newest-first, so the cap discards the oldest audio.
+    new_chunks = Enum.take([binary | chunks], @max_chunks)
+    {:noreply, %{state | chunks: new_chunks, sample_rate: sr}}
+  end
+
+  @impl true
+  def handle_info(:transcribe_timer, %{chunks: []} = state) do
+    # No audio buffered: only re-arm the timer.
+    schedule_transcription()
+    {:noreply, state}
+  end
+
+  def handle_info(:transcribe_timer, %{chunks: chunks, sample_rate: sr} = state) do
+    # Transcribe in a separate process so the server keeps accepting chunks.
+    Task.start(fn -> transcribe_chunks(Enum.reverse(chunks), sr) end)
+
+    schedule_transcription()
+    {:noreply, %{state | chunks: []}}
+  end
+
+  # Writes `chunks` (oldest first) to a temporary WAV file, transcribes it and
+  # broadcasts any non-empty text. The file is always removed afterwards;
+  # otherwise every interval would leave one more WAV file on disk.
+  defp transcribe_chunks(chunks, sample_rate) do
+    path = "#{tmp_dir()}/realtime_#{System.system_time(:millisecond)}.wav"
+
+    try do
+      :ok = Recognition_VAD.WavWriter.write_pcm_chunks_to_wav(chunks, sample_rate, path)
+
+      case Recognition_VAD.Whisper.transcribe(path) do
+        {:ok, text} when byte_size(text) > 0 ->
+          Phoenix.PubSub.broadcast(
+            Recognition_VAD.PubSub,
+            "audio_output",
+            {:realtime, %{"text" => text}}
+          )
+
+        _ ->
+          Logger.debug("⏱ Nada para transcribir o error")
+      end
+    after
+      File.rm(path)
+    end
+  end
+
+  defp tmp_dir do
+    Application.get_env(:recognition_VAD, :realtime_tmp_dir, @default_tmp_dir)
+  end
+
+  defp schedule_transcription do
+    Process.send_after(self(), :transcribe_timer, @transcribe_interval)
+  end
+end
diff --git a/recognition_VAD/lib/recognition_VAD_web/channels/data_channel.ex b/recognition_VAD/lib/recognition_VAD_web/channels/data_channel.ex
index ac87dfff..6b3c416b 100644
--- a/recognition_VAD/lib/recognition_VAD_web/channels/data_channel.ex
+++ b/recognition_VAD/lib/recognition_VAD_web/channels/data_channel.ex
@@ -6,8 +6,15 @@ defmodule Recognition_VADWeb.DataChannel do
     {:ok, socket}
   end
 
+  # Partial
+  def handle_info({:realtime, msg}, socket) do
+    push(socket, "realtime", msg)
+    {:noreply, socket}
+  end
+
+  # Complete
   def handle_info({:broadcast_audio, msg}, socket) do
-    push(socket, "transcription", Jason.decode!(msg))
+    push(socket, "transcription", msg)
     {:noreply, socket}
   end
 
@@ -15,7 +22,7 @@ defmodule Recognition_VADWeb.DataChannel do
   def handle_in("audio_chunk", %{"data" => base64_chunk, "sample_rate" => sample_rate}, socket) do
     case Base.decode64(base64_chunk) do
       {:ok, binary_audio} ->
-        GenServer.cast(Recognition_VAD.AudioProcessor, {:chunk, binary_audio, sample_rate})
+        Recognition_VAD.WhisperStreamer.push_chunk(binary_audio, sample_rate)
         {:noreply, socket}
 
       :error ->
diff --git a/recognition_VAD/lib/recognition_VAD_web/live/stt/test_with_channel.ex b/recognition_VAD/lib/recognition_VAD_web/live/stt/test_with_channel.ex
index 05e47615..c5523e83 100644
--- a/recognition_VAD/lib/recognition_VAD_web/live/stt/test_with_channel.ex
+++ b/recognition_VAD/lib/recognition_VAD_web/live/stt/test_with_channel.ex
@@ -7,114 +7,137 @@ defmodule Recognition_VADWeb.Stt.TestWithChannel do
   def render(assigns) do
     ~H"""
-
-
Presioná "Start Recording"…
- - +
+
Presioná "Start Recording"…
+ + -
-
-
+
+
+
-
-
-
+ + if (mediaProcessor) mediaProcessor.disconnect(); + if (audioContext) audioContext.close(); + if (mediaStream) mediaStream.getTracks().forEach(track => track.stop()); + if (channel) channel.leave(); + if (socket) socket.disconnect(); + } - -
+ document.getElementById("startButton").onclick = startRecording; + document.getElementById("stopButton").onclick = stopRecording; + + + +
""" end + end