mejora dictado

2025-07-18 10:50:51 +00:00
parent 92e307db34
commit 5249af7d23
7 changed files with 330 additions and 145 deletions
--- a/whisper_live/lib/whisper_live/audio_buffer.ex
+++ b/whisper_live/lib/whisper_live/audio_buffer.ex
@ -1,7 +1,7 @@
 defmodule WhisperLive.AudioBuffer do
    use GenServer

-  ## API
+    # API

    def start_link(ref), do: GenServer.start_link(__MODULE__, [], name: via(ref))

@ -9,19 +9,98 @@ defmodule WhisperLive.AudioBuffer do

    def get_all(ref), do: GenServer.call(via(ref), :get_all)

+    @doc """
+    Synchronously asks the buffer registered under `ref` for its `:get_tiny`
+    reply (the `tiny` chunk list in append order) and returns it.
+    """
+    def get_tiny(ref) do
+        ref |> via() |> GenServer.call(:get_tiny)
+    end
+
+    @doc """
+    Synchronously asks the buffer registered under `ref` to empty its `tiny`
+    chunk list; returns the server's reply (`:ok`).
+    """
+    def clear_tiny(ref) do
+        ref |> via() |> GenServer.call(:clear_tiny)
+    end
+
    def clear(ref), do: GenServer.call(via(ref), :clear)

    def stop(ref), do: GenServer.stop(via(ref))

+    @doc """
+    Synchronously sends `{:pop_chunk_with_overlap, keep_ms}` to the buffer
+    registered under `ref` and returns its reply. `keep_ms` defaults to 1000.
+    """
+    def pop_chunk_with_overlap(ref, keep_ms \\ 1000) do
+        request = {:pop_chunk_with_overlap, keep_ms}
+        GenServer.call(via(ref), request)
+    end
+
    defp via(ref), do: {:via, Registry, {WhisperLive.AudioRegistry, ref}}

-  ## Callbacks
+    # Callbacks

-  def init(_), do: {:ok, []}
+    # Initial state: two empty chunk lists. The start argument is ignored.
+    # Marked with @impl because this module already annotates another
+    # callback; leaving this one unmarked triggers a compiler warning.
+    @impl true
+    def init(_args), do: {:ok, %{tiny: [], full: []}}

-  def handle_cast({:append, chunk}, state), do: {:noreply, [chunk | state]}
+    # Records an incoming chunk in both lists. Chunks are prepended, so each
+    # list is newest-first; readers reverse it to recover append order.
+    # Marked with @impl because this module already annotates another
+    # callback; leaving this one unmarked triggers a compiler warning.
+    @impl true
+    def handle_cast({:append, chunk}, %{tiny: tiny, full: full} = state) do
+        {:noreply, %{state | tiny: [chunk | tiny], full: [chunk | full]}}
+    end

-  def handle_call(:get_all, _from, state), do: {:reply, Enum.reverse(state), state}
+    # Pops the audio accumulated in the `tiny` list while holding back its
+    # last `keep_ms` milliseconds, which stay buffered for the next call.
+    #
+    # The state built by init/1 only has the `tiny` and `full` keys, so this
+    # clause works on `tiny` instead of the `buffer`/`last_overlap`/`rate`
+    # keys of an earlier state layout (reading those raised KeyError).
+    #
+    # Chunks are expected to be `{rate, binary}` tuples, as appended by the
+    # audio channel; the rate of the oldest buffered chunk is used.
+    #
+    # Replies `{to_send, rate}`. With nothing buffered the reply is
+    # `{<<>>, nil}` and the state is left untouched.
+    @impl true
+    def handle_call({:pop_chunk_with_overlap, keep_ms}, _from, state) do
+        case Enum.reverse(state.tiny) do
+            [] ->
+                {:reply, {<<>>, nil}, state}
+
+            [{rate, _} | _] = chunks ->
+                total = chunks |> Enum.map(fn {_, bin} -> bin end) |> IO.iodata_to_binary()
+
+                # rate * 2 bytes per second: assumes 2 bytes per sample frame
+                # (16-bit mono PCM) - TODO confirm against the producer.
+                bytes_per_ms = div(rate * 2, 1000)
+                {to_send, to_keep} = split_bytes(total, keep_ms * bytes_per_ms)
+
+                # The held-back tail goes back into `tiny` as a single chunk.
+                held_back = if to_keep == <<>>, do: [], else: [{rate, to_keep}]
+                {:reply, {to_send, rate}, %{state | tiny: held_back}}
+        end
+    end

-  def handle_call(:clear, _from, _state), do: {:reply, :ok, []}
+    # Read and reset requests. Lists are stored newest-first, so reads reverse
+    # them to return chunks in append order.
+    def handle_call(:get_all, _from, state), do: {:reply, Enum.reverse(state.full), state}
+    def handle_call(:get_tiny, _from, state), do: {:reply, Enum.reverse(state.tiny), state}
+    def handle_call(:clear_tiny, _from, state), do: {:reply, :ok, %{state | tiny: []}}
+    def handle_call(:clear, _from, _state), do: {:reply, :ok, %{tiny: [], full: []}}
+
+    # Splits `binary` into `{head, tail}` where `tail` is its last `keep_bytes`
+    # bytes. `keep_bytes` is clamped to 0..byte_size(binary), so a negative or
+    # oversized request cannot make the binary match fail.
+    defp split_bytes(binary, keep_bytes) do
+        total = byte_size(binary)
+        tail_size = keep_bytes |> max(0) |> min(total)
+        head_size = total - tail_size
+        <<head::binary-size(head_size), tail::binary>> = binary
+        {head, tail}
+    end
 end
+
+# defmodule WhisperLive.AudioBuffer do
+#     use GenServer
+
+#     def start_link(ref), do: GenServer.start_link(__MODULE__, [], name: via(ref))
+#     def append(ref, chunk), do: GenServer.cast(via(ref), {:append, chunk})
+#     def pop_chunk_with_overlap(ref, keep_ms \\ 1000), do: GenServer.call(via(ref), {:pop_chunk_with_overlap, keep_ms})
+#     def get_all(pid) do
+#         GenServer.call(pid, :get_all)
+#     end
+
+#     defp via(ref), do: {:via, Registry, {WhisperLive.AudioRegistry, ref}}
+
+#     @impl true
+#     def init(_), do: {:ok, %{buffer: <<>>, last_overlap: <<>>, rate: 48_000}}
+
+#     @impl true
+#     def handle_cast({:append, {_ts, chunk}}, state) do
+#         {:noreply, %{state | buffer: state.buffer <> chunk}}
+#     end
+
+#     @impl true
+#     def handle_call({:pop_chunk_with_overlap, keep_ms}, _from, state) do
+#         bytes_per_ms = div(state.rate * 2, 1000)
+#         total = state.last_overlap <> state.buffer
+
+#         total_size = byte_size(total)
+#         keep_bytes = keep_ms * bytes_per_ms
+#         overlap_bytes = min(keep_bytes, total_size)
+
+#         {to_send, to_keep} = split_bytes(total, overlap_bytes)
+
+#         new_state = %{state | buffer: <<>>, last_overlap: to_keep}
+#         {:reply, {to_send, state.rate}, new_state}
+#     end
+
+#     defp split_bytes(binary, keep_bytes) do
+#         total = byte_size(binary)
+#         send_bytes = max(total - keep_bytes, 0)
+#         <<send::binary-size(send_bytes), keep::binary>> = binary
+#         {send, keep}
+#     end
+
+
+#     def handle_call(:get_all, _from, state) do
+#         {:reply, Enum.reverse(state.buffer), state}
+#     end
+# end
--- a/whisper_live/lib/whisper_live/audio_full_buffer.ex
+++ b/whisper_live/lib/whisper_live/audio_full_buffer.ex
@ -0,0 +1,17 @@
+defmodule WhisperLive.AudioFullBuffer do
+  @moduledoc """
+  GenServer registered under its own module name that accumulates every audio
+  chunk it is given and returns them in append order.
+
+  NOTE(review): the first element of each appended tuple is ignored;
+  `get_all/0` always reports the rate set in `init/1` (48_000). Confirm that
+  this is intended.
+  """
+  use GenServer
+
+  # Client API
+
+  @doc "Starts the buffer, registered under the module name. The argument is ignored."
+  def start_link(_), do: GenServer.start_link(__MODULE__, [], name: __MODULE__)
+
+  @doc "Asynchronously appends a two-element tuple whose second element is the audio binary."
+  def append(chunk), do: GenServer.cast(__MODULE__, {:append, chunk})
+
+  @doc "Returns `{rate, chunks}` with `chunks` in the order they were appended."
+  def get_all, do: GenServer.call(__MODULE__, :get_all)
+
+  # Server callbacks
+
+  @impl true
+  def init(_), do: {:ok, %{chunks: [], rate: 48_000}}
+
+  @impl true
+  def handle_cast({:append, {_rate, chunk}}, %{chunks: chunks} = state) do
+    # Prepend to the list; the order is restored when the chunks are read.
+    {:noreply, %{state | chunks: [chunk | chunks]}}
+  end
+
+  @impl true
+  def handle_call(:get_all, _from, %{rate: rate, chunks: chunks} = state) do
+    {:reply, {rate, Enum.reverse(chunks)}, state}
+  end
+end
--- a/whisper_live/lib/whisper_live/audio_stream_buffer.ex
+++ b/whisper_live/lib/whisper_live/audio_stream_buffer.ex
@ -0,0 +1,30 @@
+defmodule WhisperLive.AudioStreamBuffer do
+  @moduledoc """
+  Keeps a short-lived audio buffer and hands it out in chunks, each prefixed
+  with the tail of the previously returned audio (the overlap).
+
+  Sizes are computed as `rate * 2` bytes per second, i.e. two bytes per sample
+  frame (presumably 16-bit mono PCM - confirm against the producer). The rate
+  is the one set in `init/1`; the rate carried by appended chunks is ignored.
+  """
+  use GenServer
+
+  # Length of audio repeated at the start of the next reply.
+  @overlap_ms 1000
+  # Bytes per sample frame; pops are aligned to it so no sample is split.
+  @bytes_per_frame 2
+
+  # Client API
+
+  @doc "Starts the buffer, registered under the module name. The argument is ignored."
+  def start_link(_), do: GenServer.start_link(__MODULE__, [], name: __MODULE__)
+
+  @doc "Asynchronously appends a two-element tuple whose second element is the audio binary."
+  def append(chunk), do: GenServer.cast(__MODULE__, {:append, chunk})
+
+  @doc """
+  Removes up to `n_seconds` of audio from the buffer and returns
+  `{rate, audio}`, where `audio` is the previous overlap followed by the newly
+  removed audio. Returns `{rate, <<>>}` when no complete sample frame is
+  buffered.
+  """
+  def pop_chunk_with_overlap(n_seconds), do: GenServer.call(__MODULE__, {:pop_chunk, n_seconds})
+
+  # Server callbacks
+
+  @impl true
+  def init(_), do: {:ok, %{buffer: <<>>, last_overlap: <<>>, rate: 48_000}}
+
+  @impl true
+  def handle_cast({:append, {_rate, chunk}}, state) do
+    {:noreply, %{state | buffer: state.buffer <> chunk}}
+  end
+
+  @impl true
+  def handle_call({:pop_chunk, n}, _from, %{rate: rate} = state)
+      when is_number(n) and n >= 0 do
+    bytes_per_second = rate * @bytes_per_frame
+
+    # Take what is available instead of failing the binary match when fewer
+    # than `n` seconds are buffered; round down to a whole sample frame.
+    available = min(trunc(n * bytes_per_second), byte_size(state.buffer))
+    take = available - rem(available, @bytes_per_frame)
+
+    if take == 0 do
+      # Nothing new to hand out: do not resend the overlap on its own.
+      {:reply, {rate, <<>>}, state}
+    else
+      <<chunk::binary-size(take), rest::binary>> = state.buffer
+      reply = state.last_overlap <> chunk
+
+      # The overlap is the tail of what was just returned, clamped to its
+      # size so a short reply cannot make binary_part/3 raise.
+      overlap_bytes = min(div(@overlap_ms * bytes_per_second, 1000), byte_size(reply))
+      new_overlap = binary_part(reply, byte_size(reply) - overlap_bytes, overlap_bytes)
+
+      # The overlap is applied once, by prepending `last_overlap` to the next
+      # reply. It is not pushed back onto the buffer as well: doing both
+      # repeated the same audio and stopped the buffer from draining when
+      # `n` was no longer than the overlap.
+      {:reply, {rate, reply}, %{state | buffer: rest, last_overlap: new_overlap}}
+    end
+  end
+end
--- a/whisper_live/lib/whisper_live/transcriber.ex
+++ b/whisper_live/lib/whisper_live/transcriber.ex
@ -22,15 +22,14 @@ defmodule WhisperLive.Transcriber do
    end

    def handle_info(:transcribe, %{ref: ref} = state) do
-        case AudioBuffer.get_all(ref) do
-            [] ->
-            :noop
+        case AudioBuffer.get_tiny(ref) do
+            [] -> :noop

            [{rate, _} | _] = chunks ->
                merged = chunks |> Enum.map(fn {_, bin} -> bin end) |> IO.iodata_to_binary()
-            tmpfile = tmp_path("realtime_#{ref}")
-            :ok = File.write!(tmpfile, encode_wav(merged, rate))
-
+                tmpfile = "tmp/rt_#{System.system_time(:millisecond)}.wav"
+                File.mkdir_p!("tmp")
+                File.write!(tmpfile, encode_wav(merged, rate))
                case send_to_whisper(tmpfile) do
                {:ok, response} ->
                    PubSub.broadcast(WhisperLive.PubSub, "transcription", {:transcription, response})
@ -40,28 +39,54 @@ defmodule WhisperLive.Transcriber do
                end

                File.rm(tmpfile)
+
+                case GenServer.whereis({:via, Registry, {WhisperLive.AudioRegistry, ref}}) do
+                    pid when is_pid(pid) ->
+                        if Process.alive?(pid) do
+                            AudioBuffer.clear_tiny(ref)
+                        else
+                            Logger.debug("AudioBuffer #{inspect(ref)} no está vivo.")
+                        end
+
+                    _ ->
+                        Logger.debug("AudioBuffer #{inspect(ref)} no existe.")
+                end
+
        end

        schedule()
        {:noreply, state}
    end

+    # def handle_info(:transcribe, %{ref: ref} = state) do
+    #     case AudioBuffer.pop_chunk_with_overlap(ref, 1000) do
+    #         {"", _rate} ->
+    #         :noop
+
+    #         {audio, rate} ->
+    #         tmpfile = "tmp/rt_#{ref}_#{System.system_time(:millisecond)}.wav"
+    #         File.mkdir_p!("tmp")
+    #         File.write!(tmpfile, encode_wav(audio, rate))
+
+    #         case send_to_whisper(tmpfile) do
+    #             {:ok, response} ->
+    #             PubSub.broadcast(WhisperLive.PubSub, "transcription", {:transcription, response})
+
+    #             {:error, reason} ->
+    #             Logger.warning("Realtime transcription error: #{inspect(reason)}")
+    #         end
+    #     end
+
+    #     schedule()
+    #     {:noreply, state}
+    # end
+
    defp tmp_path(prefix) do
        unique = :erlang.unique_integer([:positive]) |> Integer.to_string()
        filename = prefix <> "_" <> unique <> ".wav"
        Path.join(System.tmp_dir!(), filename)
    end

-
-    # def handle_info({:transcription, raw_json}, socket) do
-    #     new_text =
-    #     raw_json
-    #     |> Jason.decode!()
-    #     |> get_in(["chunks", Access.at(0), "text"])
-
-    #     {:noreply, update(socket, :transcription, &(&1 <> " " <> new_text))}
-    # end
-
    defp schedule, do: Process.send_after(self(), :transcribe, @interval_ms)

    defp encode_wav(data, sample_rate) do
--- a/whisper_live/lib/whisper_live_web/channels/audio_channel.ex
+++ b/whisper_live/lib/whisper_live_web/channels/audio_channel.ex
@ -7,10 +7,20 @@ defmodule WhisperLiveWeb.AudioChannel do
        ref = socket_id(socket)
        Logger.info("Cliente conectado al canal audio:lobby")
        {:ok, _} = AudioBuffer.start_link(ref)
+        # {:ok, _} = AudioFullBuffer.start_link(ref)
        {:ok, _} = WhisperLive.Transcriber.start_link(ref)
        {:ok, socket}
    end

+    # def handle_in("audio_chunk", %{"chunk" => chunk}, socket) do
+    #     decoded_chunk = Base.decode64!(chunk)
+
+    #     AudioStreamBuffer.append({48_000, decoded_chunk})
+    #     AudioFullBuffer.append({48_000, decoded_chunk})
+
+    #     {:noreply, socket}
+    # end
+
    def handle_in("audio_chunk", %{"data" => data, "sample_rate" => rate}, socket) do
        {:ok, binary} = Base.decode64(data)
        AudioBuffer.append(socket_id(socket), {rate, binary})
@ -18,6 +28,29 @@ defmodule WhisperLiveWeb.AudioChannel do
        {:noreply, socket}
    end

+    # def handle_in("stop_audio", _payload, socket) do
+    #     Logger.info("🛑 Grabación detenida por cliente")
+
+    #     ref = socket_id(socket)
+
+    #     case AudioFullBuffer.get_all(ref) do
+    #     [{rate, _} | _] = chunks ->
+    #         merged = chunks |> Enum.map(fn {_, bin} -> bin end) |> IO.iodata_to_binary()
+    #         filename = "recordings/recording_#{System.system_time(:millisecond)}.wav"
+    #         File.mkdir_p!("recordings")
+    #         File.write!(filename, encode_wav(merged, rate))
+    #         whisper_large(filename)
+    #         File.rm!(filename)
+    #     _ ->
+    #         Logger.warning("No se recibieron chunks de audio")
+    #     end
+
+    #     AudioStreamBuffer.stop(ref)
+    #     AudioFullBuffer.stop(ref)
+    #     WhisperLive.Transcriber.stop(ref)
+    #     {:noreply, socket}
+    # end
+    
    def handle_in("stop_audio", _payload, socket) do
        Logger.info("🛑 Grabación detenida por cliente")

@ -32,7 +65,7 @@ defmodule WhisperLiveWeb.AudioChannel do
            whisper_large(filename)
            File.rm!(filename)
        _ ->
-            Logger.warning("⚠️ No se recibieron chunks de audio")
+            Logger.warning("No se recibieron chunks de audio")
        end

        AudioBuffer.stop(ref)
--- a/whisper_live/lib/whisper_live_web/live/recorder.ex
+++ b/whisper_live/lib/whisper_live_web/live/recorder.ex
@ -65,21 +65,22 @@ defmodule WhisperLiveWeb.Live.Recorder do
                </button>
            </div>

-            <div id="status" class="text-sm text-gray-600"></div>
+            <div id="status" class="hidden"></div>

-            <div id="transcriptionContainer" class="space-y-2">
-                <div class="p-2 bg-gray-100 rounded shadow">
-                <h2 class="text-sm font-semibold text-gray-700 mb-1">🟠 Transcripción en vivo</h2>
-                <p id="transcription" class="text-orange-600 whitespace-pre-wrap"><%= @transcription %></p>
+            <div id="transcriptionContainer" class="w-full max-w-2xl space-y-4">
+                <div class="p-4 bg-gray-100 rounded shadow-md">
+                    <h2 class="text-sm font-semibold text-gray-700 mb-2">🟠 Transcripción en vivo</h2>
+                    <p id="transcription" class="text-orange-600 whitespace-pre-wrap break-words text-sm leading-relaxed"><%= @transcription %></p>
                </div>

                <%= if @transcription_m != "" do %>
-                <div class="p-2 bg-gray-100 rounded shadow">
-                    <h2 class="text-sm font-semibold text-gray-700 mb-1">✅ Transcripción mejorada</h2>
-                    <p class="text-green-600 whitespace-pre-wrap"><%= @transcription_m %></p>
+                    <div class="p-4 bg-gray-100 rounded shadow-md">
+                        <h2 class="text-sm font-semibold text-gray-700 mb-2">✅ Transcripción mejorada</h2>
+                        <p class="text-green-600 whitespace-pre-wrap break-words text-sm leading-relaxed"><%= @transcription_m %></p>
                    </div>
                <% end %>
            </div>
+
            <script type="module">
                import { Socket } from "https://cdn.skypack.dev/phoenix"

@ -157,8 +158,8 @@ defmodule WhisperLiveWeb.Live.Recorder do

                        const base64 = encodeBase64(new Uint8Array(merged.buffer))
                        channel.push("audio_chunk", { data: base64, sample_rate: sampleRate })
-                console.log("📤 Enviado chunk")
-                }, 2000)
+                        console.log("Chunk enviado")
+                    }, 1000)
                }

                function stopRecording() {
--- a/whisper_live/recordings/recording_1752678344186.wav
+++ b/whisper_live/recordings/recording_1752678344186.wav