mejora dictado

This commit is contained in:
2025-07-18 10:50:51 +00:00
parent 92e307db34
commit 5249af7d23
7 changed files with 330 additions and 145 deletions

View File

@ -1,27 +1,106 @@
defmodule WhisperLive.AudioBuffer do defmodule WhisperLive.AudioBuffer do
use GenServer use GenServer
## API # API
def start_link(ref), do: GenServer.start_link(__MODULE__, [], name: via(ref)) def start_link(ref), do: GenServer.start_link(__MODULE__, [], name: via(ref))
def append(ref, chunk), do: GenServer.cast(via(ref), {:append, chunk}) def append(ref, chunk), do: GenServer.cast(via(ref), {:append, chunk})
def get_all(ref), do: GenServer.call(via(ref), :get_all) def get_all(ref), do: GenServer.call(via(ref), :get_all)
def clear(ref), do: GenServer.call(via(ref), :clear) def get_tiny(ref), do: GenServer.call(via(ref), :get_tiny)
def stop(ref), do: GenServer.stop(via(ref)) def clear_tiny(ref), do: GenServer.call(via(ref), :clear_tiny)
defp via(ref), do: {:via, Registry, {WhisperLive.AudioRegistry, ref}} def clear(ref), do: GenServer.call(via(ref), :clear)
## Callbacks def stop(ref), do: GenServer.stop(via(ref))
def init(_), do: {:ok, []} def pop_chunk_with_overlap(ref, keep_ms \\ 1000), do: GenServer.call(via(ref), {:pop_chunk_with_overlap, keep_ms})
def handle_cast({:append, chunk}, state), do: {:noreply, [chunk | state]} defp via(ref), do: {:via, Registry, {WhisperLive.AudioRegistry, ref}}
def handle_call(:get_all, _from, state), do: {:reply, Enum.reverse(state), state} # Callbacks
def handle_call(:clear, _from, _state), do: {:reply, :ok, []} def init(_), do: {:ok, %{tiny: [], full: []}}
# Fire-and-forget append: the chunk is prepended (O(1)) to both the
# short-lived :tiny buffer and the whole-session :full buffer; readers
# reverse the lists to restore arrival order.
def handle_cast({:append, chunk}, state) do
  new_tiny = [chunk | state.tiny]
  new_full = [chunk | state.full]
  {:noreply, %{state | tiny: new_tiny, full: new_full}}
end
@impl true
# NOTE(review): this clause reads `state.rate`, `state.last_overlap` and
# `state.buffer`, but init/1 in this module returns %{tiny: [], full: []} —
# none of those keys exist, so any call to pop_chunk_with_overlap/2 will
# crash with a KeyError. Either restore the binary-buffer state shape this
# clause expects or remove the clause. TODO confirm intended state shape.
def handle_call({:pop_chunk_with_overlap, keep_ms}, _from, state) do
# Bytes per millisecond, assuming 16-bit (2-byte) mono samples — TODO confirm.
bytes_per_ms = div(state.rate * 2, 1000)
total = state.last_overlap <> state.buffer
total_size = byte_size(total)
keep_bytes = keep_ms * bytes_per_ms
# Never keep more than we actually have buffered.
overlap_bytes = min(keep_bytes, total_size)
{to_send, to_keep} = split_bytes(total, overlap_bytes)
# The kept tail is carried forward as the overlap for the next pop.
new_state = %{state | buffer: <<>>, last_overlap: to_keep}
{:reply, {to_send, state.rate}, new_state}
end
# Splits `binary` into {head, tail}, where `tail` holds the trailing
# `keep_bytes` bytes (or the whole binary when it is shorter than that)
# and `head` holds everything before it.
defp split_bytes(binary, keep_bytes) do
  send_bytes = max(byte_size(binary) - keep_bytes, 0)
  head = binary_part(binary, 0, send_bytes)
  tail = binary_part(binary, send_bytes, byte_size(binary) - send_bytes)
  {head, tail}
end
# Read and reset operations on the two chunk lists. Chunks are stored
# newest-first, so both read clauses reverse before replying.
def handle_call(:get_all, _from, %{full: full} = state), do: {:reply, Enum.reverse(full), state}
def handle_call(:get_tiny, _from, %{tiny: tiny} = state), do: {:reply, Enum.reverse(tiny), state}
def handle_call(:clear_tiny, _from, state), do: {:reply, :ok, %{state | tiny: []}}
def handle_call(:clear, _from, _state), do: {:reply, :ok, %{tiny: [], full: []}}
end end
# defmodule WhisperLive.AudioBuffer do
# use GenServer
# def start_link(ref), do: GenServer.start_link(__MODULE__, [], name: via(ref))
# def append(ref, chunk), do: GenServer.cast(via(ref), {:append, chunk})
# def pop_chunk_with_overlap(ref, keep_ms \\ 1000), do: GenServer.call(via(ref), {:pop_chunk_with_overlap, keep_ms})
# def get_all(pid) do
# GenServer.call(pid, :get_all)
# end
# defp via(ref), do: {:via, Registry, {WhisperLive.AudioRegistry, ref}}
# @impl true
# def init(_), do: {:ok, %{buffer: <<>>, last_overlap: <<>>, rate: 48_000}}
# @impl true
# def handle_cast({:append, {_ts, chunk}}, state) do
# {:noreply, %{state | buffer: state.buffer <> chunk}}
# end
# @impl true
# def handle_call({:pop_chunk_with_overlap, keep_ms}, _from, state) do
# bytes_per_ms = div(state.rate * 2, 1000)
# total = state.last_overlap <> state.buffer
# total_size = byte_size(total)
# keep_bytes = keep_ms * bytes_per_ms
# overlap_bytes = min(keep_bytes, total_size)
# {to_send, to_keep} = split_bytes(total, overlap_bytes)
# new_state = %{state | buffer: <<>>, last_overlap: to_keep}
# {:reply, {to_send, state.rate}, new_state}
# end
# defp split_bytes(binary, keep_bytes) do
# total = byte_size(binary)
# send_bytes = max(total - keep_bytes, 0)
# <<send::binary-size(send_bytes), keep::binary>> = binary
# {send, keep}
# end
# def handle_call(:get_all, _from, state) do
# {:reply, Enum.reverse(state.buffer), state}
# end
# end

View File

@ -0,0 +1,17 @@
defmodule WhisperLive.AudioFullBuffer do
  @moduledoc """
  Singleton GenServer that accumulates every raw audio chunk for the whole
  recording session, so the complete take can be retrieved at the end.
  """
  use GenServer

  ## Client API

  # Started as a singleton (named __MODULE__); the argument is ignored.
  def start_link(_), do: GenServer.start_link(__MODULE__, [], name: __MODULE__)

  # Fire-and-forget append of a `{rate, binary}` chunk.
  def append(chunk), do: GenServer.cast(__MODULE__, {:append, chunk})

  # Returns `{rate, chunks}` with chunks in arrival order.
  def get_all(), do: GenServer.call(__MODULE__, :get_all)

  ## Server callbacks

  @impl true
  def init(_), do: {:ok, %{chunks: [], rate: 48_000}}

  @impl true
  def handle_cast({:append, {_rate, chunk}}, state) do
    # NOTE(review): the per-chunk rate is discarded; replies always report the
    # fixed 48_000 from init/1 — confirm producers really send 48 kHz audio.
    {:noreply, %{state | chunks: [chunk | state.chunks]}}
  end

  @impl true
  def handle_call(:get_all, _from, state) do
    # Chunks are stored newest-first; reverse to restore arrival order.
    {:reply, {state.rate, Enum.reverse(state.chunks)}, state}
  end
end

View File

@ -0,0 +1,30 @@
# Keeps a short rolling buffer of audio plus a one-second overlap.
defmodule WhisperLive.AudioStreamBuffer do
  @moduledoc """
  Singleton GenServer holding a short rolling buffer of raw 16-bit mono PCM.
  `pop_chunk_with_overlap/1` hands out the buffered audio prefixed with the
  previous pop's trailing second, so consecutive transcriptions overlap.
  """
  use GenServer

  # Overlap carried between consecutive pops, in milliseconds.
  @overlap_ms 1000

  ## Client API

  def start_link(_), do: GenServer.start_link(__MODULE__, [], name: __MODULE__)
  def append(chunk), do: GenServer.cast(__MODULE__, {:append, chunk})
  def pop_chunk_with_overlap(n_seconds), do: GenServer.call(__MODULE__, {:pop_chunk, n_seconds})

  ## Server callbacks

  @impl true
  def init(_), do: {:ok, %{buffer: <<>>, last_overlap: <<>>, rate: 48_000}}

  @impl true
  def handle_cast({:append, {_rate, chunk}}, state) do
    # NOTE(review): the per-chunk rate is ignored; the fixed 48_000 from
    # init/1 is assumed — confirm producers always send 48 kHz audio.
    {:noreply, %{state | buffer: state.buffer <> chunk}}
  end

  @impl true
  def handle_call({:pop_chunk, n}, _from, state) do
    rate = state.rate
    # 16-bit (2-byte) mono samples.
    bytes_per_second = rate * 2

    # Clamp to what is actually buffered: the previous unguarded binary match
    # raised MatchError whenever fewer than `n` seconds had arrived.
    bytes_to_take = min(n * bytes_per_second, byte_size(state.buffer))
    <<chunk::binary-size(bytes_to_take), rest::binary>> = state.buffer

    # Clamp the overlap as well so binary_part/3 never gets a negative start
    # when the popped chunk is shorter than @overlap_ms worth of audio.
    overlap_bytes = min(div(@overlap_ms * bytes_per_second, 1000), byte_size(chunk))
    new_overlap = binary_part(chunk, byte_size(chunk) - overlap_bytes, overlap_bytes)

    reply = state.last_overlap <> chunk
    # NOTE(review): the overlap is kept both in `last_overlap` and re-prepended
    # to the remaining buffer, so the same audio can be emitted twice on the
    # next pop — TODO confirm this duplication is intentional.
    new_buffer = new_overlap <> rest
    {:reply, {rate, reply}, %{state | buffer: new_buffer, last_overlap: new_overlap}}
  end
end

View File

@ -22,46 +22,71 @@ defmodule WhisperLive.Transcriber do
end end
def handle_info(:transcribe, %{ref: ref} = state) do def handle_info(:transcribe, %{ref: ref} = state) do
case AudioBuffer.get_all(ref) do case AudioBuffer.get_tiny(ref) do
[] -> [] -> :noop
:noop
[{rate, _} | _] = chunks -> [{rate, _} | _] = chunks ->
merged = chunks |> Enum.map(fn {_, bin} -> bin end) |> IO.iodata_to_binary() merged = chunks |> Enum.map(fn {_, bin} -> bin end) |> IO.iodata_to_binary()
tmpfile = tmp_path("realtime_#{ref}") tmpfile = "tmp/rt_#{System.system_time(:millisecond)}.wav"
:ok = File.write!(tmpfile, encode_wav(merged, rate)) File.mkdir_p!("tmp")
File.write!(tmpfile, encode_wav(merged, rate))
case send_to_whisper(tmpfile) do case send_to_whisper(tmpfile) do
{:ok, response} -> {:ok, response} ->
PubSub.broadcast(WhisperLive.PubSub, "transcription", {:transcription, response}) PubSub.broadcast(WhisperLive.PubSub, "transcription", {:transcription, response})
{:error, reason} -> {:error, reason} ->
Logger.warning("Realtime transcription error: #{inspect(reason)}") Logger.warning("Realtime transcription error: #{inspect(reason)}")
end end
File.rm(tmpfile)
case GenServer.whereis({:via, Registry, {WhisperLive.AudioRegistry, ref}}) do
pid when is_pid(pid) ->
if Process.alive?(pid) do
AudioBuffer.clear_tiny(ref)
else
Logger.debug("AudioBuffer #{inspect(ref)} no está vivo.")
end
_ ->
Logger.debug("AudioBuffer #{inspect(ref)} no existe.")
end
File.rm(tmpfile)
end end
schedule() schedule()
{:noreply, state} {:noreply, state}
end end
# def handle_info(:transcribe, %{ref: ref} = state) do
# case AudioBuffer.pop_chunk_with_overlap(ref, 1000) do
# {"", _rate} ->
# :noop
# {audio, rate} ->
# tmpfile = "tmp/rt_#{ref}_#{System.system_time(:millisecond)}.wav"
# File.mkdir_p!("tmp")
# File.write!(tmpfile, encode_wav(audio, rate))
# case send_to_whisper(tmpfile) do
# {:ok, response} ->
# PubSub.broadcast(WhisperLive.PubSub, "transcription", {:transcription, response})
# {:error, reason} ->
# Logger.warning("Realtime transcription error: #{inspect(reason)}")
# end
# end
# schedule()
# {:noreply, state}
# end
defp tmp_path(prefix) do defp tmp_path(prefix) do
unique = :erlang.unique_integer([:positive]) |> Integer.to_string() unique = :erlang.unique_integer([:positive]) |> Integer.to_string()
filename = prefix <> "_" <> unique <> ".wav" filename = prefix <> "_" <> unique <> ".wav"
Path.join(System.tmp_dir!(), filename) Path.join(System.tmp_dir!(), filename)
end end
# def handle_info({:transcription, raw_json}, socket) do
# new_text =
# raw_json
# |> Jason.decode!()
# |> get_in(["chunks", Access.at(0), "text"])
# {:noreply, update(socket, :transcription, &(&1 <> " " <> new_text))}
# end
defp schedule, do: Process.send_after(self(), :transcribe, @interval_ms) defp schedule, do: Process.send_after(self(), :transcribe, @interval_ms)
defp encode_wav(data, sample_rate) do defp encode_wav(data, sample_rate) do

View File

@ -7,10 +7,20 @@ defmodule WhisperLiveWeb.AudioChannel do
ref = socket_id(socket) ref = socket_id(socket)
Logger.info("Cliente conectado al canal audio:lobby") Logger.info("Cliente conectado al canal audio:lobby")
{:ok, _} = AudioBuffer.start_link(ref) {:ok, _} = AudioBuffer.start_link(ref)
# {:ok, _} = AudioFullBuffer.start_link(ref)
{:ok, _} = WhisperLive.Transcriber.start_link(ref) {:ok, _} = WhisperLive.Transcriber.start_link(ref)
{:ok, socket} {:ok, socket}
end end
# def handle_in("audio_chunk", %{"chunk" => chunk}, socket) do
# decoded_chunk = Base.decode64!(chunk)
# AudioStreamBuffer.append({48_000, decoded_chunk})
# AudioFullBuffer.append({48_000, decoded_chunk})
# {:noreply, socket}
# end
def handle_in("audio_chunk", %{"data" => data, "sample_rate" => rate}, socket) do def handle_in("audio_chunk", %{"data" => data, "sample_rate" => rate}, socket) do
{:ok, binary} = Base.decode64(data) {:ok, binary} = Base.decode64(data)
AudioBuffer.append(socket_id(socket), {rate, binary}) AudioBuffer.append(socket_id(socket), {rate, binary})
@ -18,6 +28,29 @@ defmodule WhisperLiveWeb.AudioChannel do
{:noreply, socket} {:noreply, socket}
end end
# def handle_in("stop_audio", _payload, socket) do
# Logger.info("🛑 Grabación detenida por cliente")
# ref = socket_id(socket)
# case AudioFullBuffer.get_all(ref) do
# [{rate, _} | _] = chunks ->
# merged = chunks |> Enum.map(fn {_, bin} -> bin end) |> IO.iodata_to_binary()
# filename = "recordings/recording_#{System.system_time(:millisecond)}.wav"
# File.mkdir_p!("recordings")
# File.write!(filename, encode_wav(merged, rate))
# whisper_large(filename)
# File.rm!(filename)
# _ ->
# Logger.warning("No se recibieron chunks de audio")
# end
# AudioStreamBuffer.stop(ref)
# AudioFullBuffer.stop(ref)
# WhisperLive.Transcriber.stop(ref)
# {:noreply, socket}
# end
def handle_in("stop_audio", _payload, socket) do def handle_in("stop_audio", _payload, socket) do
Logger.info("🛑 Grabación detenida por cliente") Logger.info("🛑 Grabación detenida por cliente")
@ -32,7 +65,7 @@ defmodule WhisperLiveWeb.AudioChannel do
whisper_large(filename) whisper_large(filename)
File.rm!(filename) File.rm!(filename)
_ -> _ ->
Logger.warning("⚠️ No se recibieron chunks de audio") Logger.warning("No se recibieron chunks de audio")
end end
AudioBuffer.stop(ref) AudioBuffer.stop(ref)

View File

@ -65,135 +65,136 @@ defmodule WhisperLiveWeb.Live.Recorder do
</button> </button>
</div> </div>
<div id="status" class="text-sm text-gray-600"></div> <div id="status" class="hidden"></div>
<div id="transcriptionContainer" class="space-y-2"> <div id="transcriptionContainer" class="w-full max-w-2xl space-y-4">
<div class="p-2 bg-gray-100 rounded shadow"> <div class="p-4 bg-gray-100 rounded shadow-md">
<h2 class="text-sm font-semibold text-gray-700 mb-1">🟠 Transcripción en vivo</h2> <h2 class="text-sm font-semibold text-gray-700 mb-2">🟠 Transcripción en vivo</h2>
<p id="transcription" class="text-orange-600 whitespace-pre-wrap"><%= @transcription %></p> <p id="transcription" class="text-orange-600 whitespace-pre-wrap break-words text-sm leading-relaxed"><%= @transcription %></p>
</div>
<%= if @transcription_m != "" do %>
<div class="p-2 bg-gray-100 rounded shadow">
<h2 class="text-sm font-semibold text-gray-700 mb-1">✅ Transcripción mejorada</h2>
<p class="text-green-600 whitespace-pre-wrap"><%= @transcription_m %></p>
</div> </div>
<% end %>
<%= if @transcription_m != "" do %>
<div class="p-4 bg-gray-100 rounded shadow-md">
<h2 class="text-sm font-semibold text-gray-700 mb-2">✅ Transcripción mejorada</h2>
<p class="text-green-600 whitespace-pre-wrap break-words text-sm leading-relaxed"><%= @transcription_m %></p>
</div>
<% end %>
</div> </div>
<script type="module"> <script type="module">
import { Socket } from "https://cdn.skypack.dev/phoenix" import { Socket } from "https://cdn.skypack.dev/phoenix"
const startButton = document.getElementById("startButton") const startButton = document.getElementById("startButton")
const stopButton = document.getElementById("stopButton") const stopButton = document.getElementById("stopButton")
const statusDiv = document.getElementById("status") const statusDiv = document.getElementById("status")
let socket = null let socket = null
let channel = null let channel = null
let audioContext = null let audioContext = null
let processor = null let processor = null
let mediaStream = null let mediaStream = null
let buffer = [] let buffer = []
let sendInterval = null let sendInterval = null
const sampleRate = 48000 const sampleRate = 48000
async function startRecording() { async function startRecording() {
startButton.disabled = true startButton.disabled = true
stopButton.disabled = false stopButton.disabled = false
statusDiv.textContent = "🎙 Grabando..." statusDiv.textContent = "🎙 Grabando..."
socket = new Socket("ws://localhost:4004/socket") socket = new Socket("ws://localhost:4004/socket")
socket.connect() socket.connect()
channel = socket.channel("audio:lobby") channel = socket.channel("audio:lobby")
await channel.join() await channel.join()
.receive("ok", () => { .receive("ok", () => {
console.log("✅ Canal conectado") console.log("✅ Canal conectado")
statusDiv.textContent = "✅ Canal conectado" statusDiv.textContent = "✅ Canal conectado"
}) })
.receive("error", () => { .receive("error", () => {
console.error("❌ Error al conectar canal") console.error("❌ Error al conectar canal")
statusDiv.textContent = "❌ Error canal" statusDiv.textContent = "❌ Error canal"
}) })
try { try {
audioContext = new AudioContext({ sampleRate }) audioContext = new AudioContext({ sampleRate })
mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true }) mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true })
} catch (err) { } catch (err) {
console.error("❌ Micrófono error:", err) console.error("❌ Micrófono error:", err)
statusDiv.textContent = "❌ Error accediendo al micrófono" statusDiv.textContent = "❌ Error accediendo al micrófono"
return return
}
const source = audioContext.createMediaStreamSource(mediaStream)
processor = audioContext.createScriptProcessor(4096, 1, 1)
source.connect(processor)
processor.connect(audioContext.destination)
buffer = []
processor.onaudioprocess = e => {
const input = e.inputBuffer.getChannelData(0)
const pcm = new Int16Array(input.length)
for (let i = 0; i < input.length; i++) {
let s = Math.max(-1, Math.min(1, input[i]))
pcm[i] = s < 0 ? s * 0x8000 : s * 0x7FFF
}
buffer.push(pcm)
}
sendInterval = setInterval(() => {
if (buffer.length === 0) return
const merged = flattenInt16(buffer)
buffer = []
function encodeBase64(uint8Array) {
let binary = ''
const len = uint8Array.byteLength
for (let i = 0; i < len; i++) {
binary += String.fromCharCode(uint8Array[i])
} }
return btoa(binary)
const source = audioContext.createMediaStreamSource(mediaStream)
processor = audioContext.createScriptProcessor(4096, 1, 1)
source.connect(processor)
processor.connect(audioContext.destination)
buffer = []
processor.onaudioprocess = e => {
const input = e.inputBuffer.getChannelData(0)
const pcm = new Int16Array(input.length)
for (let i = 0; i < input.length; i++) {
let s = Math.max(-1, Math.min(1, input[i]))
pcm[i] = s < 0 ? s * 0x8000 : s * 0x7FFF
}
buffer.push(pcm)
}
sendInterval = setInterval(() => {
if (buffer.length === 0) return
const merged = flattenInt16(buffer)
buffer = []
function encodeBase64(uint8Array) {
let binary = ''
const len = uint8Array.byteLength
for (let i = 0; i < len; i++) {
binary += String.fromCharCode(uint8Array[i])
}
return btoa(binary)
}
const base64 = encodeBase64(new Uint8Array(merged.buffer))
channel.push("audio_chunk", { data: base64, sample_rate: sampleRate })
console.log("Chunk enviado")
}, 1000)
} }
const base64 = encodeBase64(new Uint8Array(merged.buffer)) function stopRecording() {
channel.push("audio_chunk", { data: base64, sample_rate: sampleRate }) stopButton.disabled = true
console.log("📤 Enviado chunk") startButton.disabled = false
}, 2000) statusDiv.textContent = "🛑 Grabación detenida."
}
function stopRecording() { if (processor) processor.disconnect()
stopButton.disabled = true if (audioContext) audioContext.close()
startButton.disabled = false if (mediaStream) mediaStream.getTracks().forEach(t => t.stop())
statusDiv.textContent = "🛑 Grabación detenida." if (sendInterval) clearInterval(sendInterval)
if (processor) processor.disconnect() if (channel) {
if (audioContext) audioContext.close() channel.push("stop_audio")
if (mediaStream) mediaStream.getTracks().forEach(t => t.stop()) setTimeout(() => {
if (sendInterval) clearInterval(sendInterval) channel.leave()
socket.disconnect()
if (channel) { console.log("🔌 Socket cerrado")
channel.push("stop_audio") }, 500)
setTimeout(() => { }
channel.leave()
socket.disconnect()
console.log("🔌 Socket cerrado")
}, 500)
} }
}
function flattenInt16(buffers) { function flattenInt16(buffers) {
const length = buffers.reduce((acc, b) => acc + b.length, 0) const length = buffers.reduce((acc, b) => acc + b.length, 0)
const out = new Int16Array(length) const out = new Int16Array(length)
let offset = 0 let offset = 0
for (const b of buffers) { for (const b of buffers) {
out.set(b, offset) out.set(b, offset)
offset += b.length offset += b.length
}
return out
} }
return out
}
startButton.onclick = startRecording startButton.onclick = startRecording
stopButton.onclick = stopRecording stopButton.onclick = stopRecording
</script> </script>
</div> </div>
""" """