Transcripción en vivo + transcripción mejorada

2025-07-16 15:50:13 +00:00
parent 8386b685d6
commit 89168522b6
12 changed files with 293 additions and 223 deletions
--- a/whisper_live/assets/css/app.css
+++ b/whisper_live/assets/css/app.css
@ -3,8 +3,84 @@
@import "tailwindcss/utilities";

 /* This file is for your main application CSS */
-.realtime {
-  white-space: pre-wrap;
-  font-family: monospace;
-  margin-top: 1em;
-}
+    body {
+      background-color: #f4f4f9;
+      color: #333;
+      font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
+      display: flex;
+      align-items: center;
+      justify-content: center;
+      height: 100vh;
+      margin: 0;
+    }
+    #container {
+      display: flex;
+      flex-direction: column;
+      align-items: center;
+      width: 100%;
+      max-width: 700px;
+      padding: 20px;
+      box-sizing: border-box;
+      gap: 20px; /* Add more vertical space between items */
+      height: 90%; /* Fixed height to prevent layout shift */
+    }
+    #status {
+      color: #0056b3;
+      font-size: 20px;
+      text-align: center;
+    }
+    /* Bordered, padded panel; scrolls vertically if its content overflows. */
+    #transcriptionContainer {
+      height: auto; /* Height follows the content; no fixed height is set here */
+      overflow-y: auto;
+      width: 100%;
+      padding: 10px;
+      box-sizing: border-box;
+      background-color: #f9f9f9;
+      border: 1px solid #ddd;
+      border-radius: 5px;
+    }
+    #transcription {
+      font-size: 18px;
+      line-height: 1.6;
+      color: #333;
+      word-wrap: break-word;
+    }
+    #fullTextContainer {
+      height: 150px; /* Fixed height to prevent layout shift */
+      overflow-y: auto;
+      width: 100%;
+      padding: 10px;
+      box-sizing: border-box;
+      background-color: #f9f9f9;
+      border: 1px solid #ddd;
+      border-radius: 5px;
+    }
+    #fullText {
+      color: #4CAF50;
+      font-size: 18px;
+      font-weight: 600;
+      word-wrap: break-word;
+    }
+    .last-word {
+      color: #007bff;
+      font-weight: 600;
+    }
+    button {
+      padding: 12px 24px;
+      font-size: 16px;
+      cursor: pointer;
+      border: none;
+      border-radius: 5px;
+      margin: 5px;
+      transition: background-color 0.3s ease;
+      color: #fff;
+      background-color: #0056b3;
+      box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
+    }
+    button:hover {
+      background-color: #007bff;
+    }
+    button:disabled {
+      background-color: #cccccc;
+      cursor: not-allowed;
+    }
--- a/whisper_live/lib/whisper_live/audio_buffer.ex
+++ b/whisper_live/lib/whisper_live/audio_buffer.ex
@ -9,6 +9,8 @@ defmodule WhisperLive.AudioBuffer do

  def get_all(ref), do: GenServer.call(via(ref), :get_all)

+  # Synchronously sends :clear to the buffer process registered under `ref`.
+  def clear(ref), do: GenServer.call(via(ref), :clear)
+
  def stop(ref), do: GenServer.stop(via(ref))

  defp via(ref), do: {:via, Registry, {WhisperLive.AudioRegistry, ref}}
@ -20,4 +22,6 @@ defmodule WhisperLive.AudioBuffer do
  def handle_cast({:append, chunk}, state), do: {:noreply, [chunk | state]}

  def handle_call(:get_all, _from, state), do: {:reply, Enum.reverse(state), state}
+
+  # Handles :clear by discarding every buffered chunk: replies :ok and resets the state to [].
+  def handle_call(:clear, _from, _state), do: {:reply, :ok, []}
 end
--- a/whisper_live/lib/whisper_live/transcriber.ex
+++ b/whisper_live/lib/whisper_live/transcriber.ex
@ -33,7 +33,7 @@ defmodule WhisperLive.Transcriber do

            case send_to_whisper(tmpfile) do
                {:ok, response} ->
-                PubSub.broadcast(WhisperLive.PubSub, "transcription:#{ref}", {:transcription, response})
+                PubSub.broadcast(WhisperLive.PubSub, "transcription", {:transcription, response})

                {:error, reason} ->
                Logger.warning("Realtime transcription error: #{inspect(reason)}")
@ -90,7 +90,7 @@ defmodule WhisperLive.Transcriber do
    end

    defp send_to_whisper(filepath) do
-        url = "http://localhost:4000/infer"
+        url = "http://localhost:4000/tiny"
        {:ok, file_bin} = File.read(filepath)
        filename = Path.basename(filepath)

@ -108,9 +108,17 @@ defmodule WhisperLive.Transcriber do

        :httpc.request(:post, {url, headers, 'multipart/form-data; boundary=----ElixirBoundary', body}, [], [])
        |> case do
-            {:ok, {{_, 200, _}, _headers, body}} -> {:ok, to_string(body)}
-            {:ok, {{_, status, _}, _, body}} -> {:error, {:http_error, status, to_string(body)}}
-            error -> {:error, error}
+            {:ok, {{_, 200, _}, _headers, body}} ->
+                # Logger.info("en transcriber --------------------------\n   -> > #{IO.iodata_to_binary(body)}")
+                # Phoenix.PubSub.broadcast(WhisperLive.PubSub, "transcription", {:transcription, "#{IO.iodata_to_binary(body)}"})
+
+                {:ok, "#{IO.iodata_to_binary(body)}"}
+
+            {:ok, {{_, status, _}, _, body}} ->
+                {:error, {:http_error, status,"#{IO.iodata_to_binary(body)}"}}
+
+            error ->
+                {:error, error}
        end
    end
 end
--- a/whisper_live/lib/whisper_live_web/channels/audio_channel.ex
+++ b/whisper_live/lib/whisper_live_web/channels/audio_channel.ex
@ -11,31 +11,13 @@ defmodule WhisperLiveWeb.AudioChannel do
        {:ok, socket}
    end

-    def handle_in("audio_chunk", %{"data" => base64_audio, "sample_rate" => sample_rate}, socket) do
-        # 1. Decodificas el audio base64
-        {:ok, bin} = Base.decode64(base64_audio)
-
-        # 2. Guardas o procesas el chunk de audio
-        # Podrías escribirlo en un archivo temporal para enviar a Whisper
-        tmpfile = tmp_path("chunk_#{socket.assigns.ref}")
-        :ok = File.write!(tmpfile, encode_wav(bin, sample_rate))
-
-        # 3. Llamas a la transcripción del chunk (podría ser sync o async)
-        case send_to_whisper(tmpfile) do
-        {:ok, transcription} ->
-            # 4. Envías el texto parcial por PubSub o Push a LiveView/cliente
-            Phoenix.PubSub.broadcast(YourApp.PubSub, "transcription:#{socket.assigns.ref}", {:transcription, transcription})
-
-        {:error, reason} ->
-            Logger.error("Error en transcripción parcial: #{inspect(reason)}")
-        end
-
-        File.rm(tmpfile)
-
+    # Receives one base64-encoded audio chunk from the client and appends
+    # `{rate, binary}` to the AudioBuffer keyed by this socket's id.
+    #
+    # The payload comes from the browser, so it is not trusted: a chunk that is
+    # not valid base64 is logged and dropped instead of raising a MatchError
+    # that would crash the channel process.
+    def handle_in("audio_chunk", %{"data" => data, "sample_rate" => rate}, socket) do
+        case Base.decode64(data) do
+            {:ok, binary} ->
+                AudioBuffer.append(socket_id(socket), {rate, binary})
+                Logger.info("📦 Chunk recibido: #{byte_size(binary)} bytes, sample_rate: #{rate}")
+
+            :error ->
+                Logger.warning("⚠️ Chunk de audio descartado: base64 inválido")
+        end
+
        {:noreply, socket}
    end

-
    def handle_in("stop_audio", _payload, socket) do
        Logger.info("🛑 Grabación detenida por cliente")

@ -47,16 +29,8 @@ defmodule WhisperLiveWeb.AudioChannel do
            filename = "recordings/recording_#{System.system_time(:millisecond)}.wav"
            File.mkdir_p!("recordings")
            File.write!(filename, encode_wav(merged, rate))
-            Logger.info("💾 Audio guardado en #{filename}")
-
-            # 🔁 Transcribir automáticamente
-            case send_to_whisper(filename) do
-            {:ok, response} ->
-                Logger.info("📝 Transcripción recibida: #{response}")
-            {:error, reason} ->
-                Logger.error("❌ Error al transcribir: #{inspect(reason)}")
-            end
-
+            whisper_large(filename)
+            File.rm!(filename)
        _ ->
            Logger.warning("⚠️ No se recibieron chunks de audio")
        end
@ -93,18 +67,16 @@ defmodule WhisperLiveWeb.AudioChannel do
        >> <> data
    end

-    defp send_to_whisper(filepath) do
-        url = "http://localhost:4000/infer"
-
+    defp whisper_large(filepath) do
+        url = "http://localhost:4000/large"
        {:ok, file_bin} = File.read(filepath)
        filename = Path.basename(filepath)

        headers = [
-        {'Content-Type', 'multipart/form-data; boundary=----ElixirBoundary'}
+            {'Content-Type', 'multipart/form-data; boundary=----ElixirBoundary'}
        ]

-        body =
-        [
+        body = [
            "------ElixirBoundary\r\n",
            "Content-Disposition: form-data; name=\"file\"; filename=\"#{filename}\"\r\n",
            "Content-Type: audio/wav\r\n\r\n",
@ -114,21 +86,17 @@ defmodule WhisperLiveWeb.AudioChannel do

        :httpc.request(:post, {url, headers, 'multipart/form-data; boundary=----ElixirBoundary', body}, [], [])
        |> case do
-        {:ok, {{_, 200, _}, _headers, body}} ->
-            {:ok, to_string(body)}
+            {:ok, {{_, 200, _}, _headers, body}} ->
+                # Logger.info("transcripcion mejorada --------------------------\n   -> > #{IO.iodata_to_binary(body)}")
+                Phoenix.PubSub.broadcast(WhisperLive.PubSub, "transcription", {:transcription_m, "#{IO.iodata_to_binary(body)}"})

-        {:ok, {{_, status, _}, _, body}} ->
-            {:error, {:http_error, status, to_string(body)}}
+                {:ok, "#{IO.iodata_to_binary(body)}"}

-        error ->
-            {:error, error}
+            {:ok, {{_, status, _}, _, body}} ->
+                {:error, {:http_error, status, IO.iodata_to_binary(body)}}
+
+            error ->
+                {:error, error}
        end
    end
-
-    defp tmp_path(prefix) do
-        unique = :erlang.unique_integer([:positive]) |> Integer.to_string()
-        filename = prefix <> "_" <> unique <> ".wav"
-        Path.join(System.tmp_dir!(), filename)
-    end
-
 end
--- a/whisper_live/lib/whisper_live_web/components/layouts/app.html.heex
+++ b/whisper_live/lib/whisper_live_web/components/layouts/app.html.heex
@ -1,32 +1,5 @@
-<header class="px-4 sm:px-6 lg:px-8">
-  <div class="flex items-center justify-between border-b border-zinc-100 py-3 text-sm">
-    <div class="flex items-center gap-4">
-      <a href="/">
-        <img src={~p"/images/logo.svg"} width="36" />
-      </a>
-      <p class="bg-brand/5 text-brand rounded-full px-2 font-medium leading-6">
-        v{Application.spec(:phoenix, :vsn)}
-      </p>
-    </div>
-    <div class="flex items-center gap-4 font-semibold leading-6 text-zinc-900">
-      <a href="https://twitter.com/elixirphoenix" class="hover:text-zinc-700">
-        @elixirphoenix
-      </a>
-      <a href="https://github.com/phoenixframework/phoenix" class="hover:text-zinc-700">
-        GitHub
-      </a>
-      <a
-        href="https://hexdocs.pm/phoenix/overview.html"
-        class="rounded-lg bg-zinc-100 px-2 py-1 hover:bg-zinc-200/80"
-      >
-        Get Started <span aria-hidden="true">&rarr;</span>
-      </a>
-    </div>
-  </div>
-</header>
-<main class="px-4 py-20 sm:px-6 lg:px-8">
-  <div class="mx-auto max-w-2xl">
-    <.flash_group flash={@flash} />
+<main>
+  <div>
    {@inner_content}
  </div>
 </main>
--- a/whisper_live/lib/whisper_live_web/components/layouts/root.html.heex
+++ b/whisper_live/lib/whisper_live_web/components/layouts/root.html.heex
@ -11,7 +11,7 @@
    <script defer phx-track-static type="text/javascript" src={~p"/assets/app.js"}>
    </script>
  </head>
-  <body class="bg-white">
+  <body>
    {@inner_content}
  </body>
 </html>
--- a/whisper_live/lib/whisper_live_web/live/recorder.ex
+++ b/whisper_live/lib/whisper_live_web/live/recorder.ex
@ -1,160 +1,201 @@
 defmodule WhisperLiveWeb.Live.Recorder do
-  use WhisperLiveWeb, :live_view
-  alias Phoenix.PubSub
+    use WhisperLiveWeb, :live_view
+    alias Phoenix.PubSub

-  def mount(_, _, socket) do
-    if connected?(socket), do: PubSub.subscribe(WhisperLive.PubSub, "transcription:#{socket_id(socket)}")
-    {:ok, assign(socket, transcription: "")}
-  end
+    def mount(_, _, socket) do
+        PubSub.subscribe(WhisperLive.PubSub, "transcription")

-  def handle_info({:transcription, raw_json}, socket) do
-    new_text =
-      raw_json
-      |> Jason.decode!()
-      |> get_in(["chunks", Access.at(0), "text"])
+        socket =
+            socket
+            |> assign(:transcription, "")
+            |> assign(:transcription_m, "")

-    {:noreply, update(socket, :transcription, &(&1 <> " " <> new_text))}
-  end
+        {:ok, socket}
+    end

-  def handle_event("start_recording", _params, socket) do
-    push_event(socket, "start-recording", %{})
-    {:noreply, socket}
-  end
+    def handle_info({:transcription, raw_json}, socket) do
+        IO.inspect(raw_json, label: "en vivo ---------------->\n")

-  def handle_event("stop_recording", _params, socket) do
-    push_event(socket, "stop-recording", %{})
-    {:noreply, socket}
-  end
+        new_text =
+            raw_json
+            |> Jason.decode!()
+            |> get_in(["chunks", Access.at(0), "text"])

-  defp socket_id(socket), do: socket.transport_pid |> :erlang.pid_to_list() |> List.to_string()
+        old_text = socket.assigns.transcription

-  def render(assigns) do
-    ~H"""
-    <div id="recorder" data-hook="recorder">
-      <button id="startButton" phx-click="start_recording">Start Recording</button>
-      <button id="stopButton" phx-click="stop_recording">Stop Recording</button>
+        # Sacar lo ya incluido al inicio
+        added_part = String.replace_prefix(new_text, old_text, "")

-      <div id="transcriptionContainer">
-        <div id="transcription" class="realtime"><%= @transcription %></div>
-      </div>
-      <div id="status" class="realtime"></div>
+        {:noreply, update(socket, :transcription, &(&1 <> added_part))}
+    end

-      <script type="module">
-        import { Socket } from "https://cdn.skypack.dev/phoenix"

-        const startButton = document.getElementById("startButton")
-        const stopButton = document.getElementById("stopButton")
-        const statusDiv = document.getElementById("status")
+    def handle_info({:transcription_m, raw_json}, socket) do
+        IO.inspect(raw_json, label: "meojada ---------------->\n")

-        let socket = null
-        let channel = null
-        let audioContext = null
-        let processor = null
-        let mediaStream = null
-        let buffer = []
-        let sendInterval = null
+        new_text =
+            raw_json
+            |> Jason.decode!()
+            |> get_in(["chunks", Access.at(0), "text"])
+        {:noreply, update(socket, :transcription_m, &(&1 <> " " <> new_text))}
+    end

-        const sampleRate = 48000
+    # Handles the "start_recording" click: resets both transcription assigns and
+    # pushes the "start-recording" event to the client.
+    #
+    # push_event/3 returns an updated socket; the event is only delivered if that
+    # returned socket is the one handed back in the reply tuple, so it must be
+    # threaded through rather than called for its side effect.
+    def handle_event("start_recording", _params, socket) do
+        socket =
+            socket
+            |> assign(transcription: "", transcription_m: "")
+            |> push_event("start-recording", %{})
+
+        {:noreply, socket}
+    end

-        async function startRecording() {
-          startButton.disabled = true
-          stopButton.disabled = false
-          statusDiv.textContent = "🎙 Grabando..."

-          socket = new Socket("ws://localhost:4004/socket")
-          socket.connect()
-          channel = socket.channel("audio:lobby")
+    # Handles the "stop_recording" click by pushing the "stop-recording" event to
+    # the client.
+    #
+    # The socket returned by push_event/3 carries the queued event, so it is the
+    # one returned in the reply tuple; discarding it would silently drop the event.
+    def handle_event("stop_recording", _params, socket) do
+        {:noreply, push_event(socket, "stop-recording", %{})}
+    end

-          await channel.join()
-            .receive("ok", () => {
-              console.log("✅ Canal conectado")
-              statusDiv.textContent = "✅ Canal conectado"
-            })
-            .receive("error", () => {
-              console.error("❌ Error al conectar canal")
-              statusDiv.textContent = "❌ Error canal"
-            })
+    # Builds a string id from the socket's transport pid (pid -> charlist -> string).
+    # NOTE(review): no caller of socket_id/1 is visible in this module after this
+    # change (mount/3 no longer subscribes per socket) — confirm whether it is still needed.
+    defp socket_id(socket), do: socket.transport_pid |> :erlang.pid_to_list() |> List.to_string()

-          try {
-            audioContext = new AudioContext({ sampleRate })
-            mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true })
-          } catch (err) {
-            console.error("❌ Micrófono error:", err)
-            statusDiv.textContent = "❌ Error accediendo al micrófono"
-            return
-          }
+    def render(assigns) do
+        ~H"""
+        <div id="recorder" data-hook="recorder">
+            <div class="flex space-x-2">
+                <button id="startButton" phx-click="start_recording" class="px-4 py-2 bg-blue-500 text-white rounded hover:bg-blue-600">
+                Start Recording
+                </button>
+                <button id="stopButton" phx-click="stop_recording" class="px-4 py-2 bg-red-500 text-white rounded hover:bg-red-600">
+                Stop Recording
+                </button>
+            </div>

-          const source = audioContext.createMediaStreamSource(mediaStream)
-          processor = audioContext.createScriptProcessor(4096, 1, 1)
-          source.connect(processor)
-          processor.connect(audioContext.destination)
+            <div id="status" class="text-sm text-gray-600"></div>

-          buffer = []
-          processor.onaudioprocess = e => {
-            const input = e.inputBuffer.getChannelData(0)
-            const pcm = new Int16Array(input.length)
-            for (let i = 0; i < input.length; i++) {
-              let s = Math.max(-1, Math.min(1, input[i]))
-              pcm[i] = s < 0 ? s * 0x8000 : s * 0x7FFF
-            }
-            buffer.push(pcm)
-          }
+            <div id="transcriptionContainer" class="space-y-2">
+                <div class="p-2 bg-gray-100 rounded shadow">
+                <h2 class="text-sm font-semibold text-gray-700 mb-1">🟠 Transcripción en vivo</h2>
+                <p id="transcription" class="text-orange-600 whitespace-pre-wrap"><%= @transcription %></p>
+            </div>

-          sendInterval = setInterval(() => {
-            if (buffer.length === 0) return
-            const merged = flattenInt16(buffer)
-            buffer = []
+            <%= if @transcription_m != "" do %>
+                <div class="p-2 bg-gray-100 rounded shadow">
+                    <h2 class="text-sm font-semibold text-gray-700 mb-1">✅ Transcripción mejorada</h2>
+                    <p class="text-green-600 whitespace-pre-wrap"><%= @transcription_m %></p>
+                </div>
+            <% end %>
+            </div>
+            <script type="module">
+            import { Socket } from "https://cdn.skypack.dev/phoenix"

-            function encodeBase64(uint8Array) {
-              let binary = ''
-              const len = uint8Array.byteLength
-              for (let i = 0; i < len; i++) {
-                binary += String.fromCharCode(uint8Array[i])
-              }
-              return btoa(binary)
+            const startButton = document.getElementById("startButton")
+            const stopButton = document.getElementById("stopButton")
+            const statusDiv = document.getElementById("status")
+
+            let socket = null
+            let channel = null
+            let audioContext = null
+            let processor = null
+            let mediaStream = null
+            let buffer = []
+            let sendInterval = null
+
+            const sampleRate = 48000
+
+            async function startRecording() {
+                startButton.disabled = true
+                stopButton.disabled = false
+                statusDiv.textContent = "🎙 Grabando..."
+
+                socket = new Socket("ws://localhost:4004/socket")
+                socket.connect()
+                channel = socket.channel("audio:lobby")
+
+                await channel.join()
+                .receive("ok", () => {
+                    console.log("✅ Canal conectado")
+                    statusDiv.textContent = "✅ Canal conectado"
+                })
+                .receive("error", () => {
+                    console.error("❌ Error al conectar canal")
+                    statusDiv.textContent = "❌ Error canal"
+                })
+
+                try {
+                audioContext = new AudioContext({ sampleRate })
+                mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true })
+                } catch (err) {
+                console.error("❌ Micrófono error:", err)
+                statusDiv.textContent = "❌ Error accediendo al micrófono"
+                return
+                }
+
+                const source = audioContext.createMediaStreamSource(mediaStream)
+                processor = audioContext.createScriptProcessor(4096, 1, 1)
+                source.connect(processor)
+                processor.connect(audioContext.destination)
+
+                buffer = []
+                processor.onaudioprocess = e => {
+                const input = e.inputBuffer.getChannelData(0)
+                const pcm = new Int16Array(input.length)
+                for (let i = 0; i < input.length; i++) {
+                    let s = Math.max(-1, Math.min(1, input[i]))
+                    pcm[i] = s < 0 ? s * 0x8000 : s * 0x7FFF
+                }
+                buffer.push(pcm)
+                }
+
+                sendInterval = setInterval(() => {
+                if (buffer.length === 0) return
+                const merged = flattenInt16(buffer)
+                buffer = []
+
+                function encodeBase64(uint8Array) {
+                    let binary = ''
+                    const len = uint8Array.byteLength
+                    for (let i = 0; i < len; i++) {
+                    binary += String.fromCharCode(uint8Array[i])
+                    }
+                    return btoa(binary)
+                }
+
+                const base64 = encodeBase64(new Uint8Array(merged.buffer))
+                channel.push("audio_chunk", { data: base64, sample_rate: sampleRate })
+                console.log("📤 Enviado chunk")
+                }, 2000)
            }

-            const base64 = encodeBase64(new Uint8Array(merged.buffer))
-            channel.push("audio_chunk", { data: base64, sample_rate: sampleRate })
-            console.log("📤 Enviado chunk")
-          }, 2000)
-        }
+            function stopRecording() {
+                stopButton.disabled = true
+                startButton.disabled = false
+                statusDiv.textContent = "🛑 Grabación detenida."

-        function stopRecording() {
-          stopButton.disabled = true
-          startButton.disabled = false
-          statusDiv.textContent = "🛑 Grabación detenida."
+                if (processor) processor.disconnect()
+                if (audioContext) audioContext.close()
+                if (mediaStream) mediaStream.getTracks().forEach(t => t.stop())
+                if (sendInterval) clearInterval(sendInterval)

-          if (processor) processor.disconnect()
-          if (audioContext) audioContext.close()
-          if (mediaStream) mediaStream.getTracks().forEach(t => t.stop())
-          if (sendInterval) clearInterval(sendInterval)
+                if (channel) {
+                channel.push("stop_audio")
+                setTimeout(() => {
+                    channel.leave()
+                    socket.disconnect()
+                    console.log("🔌 Socket cerrado")
+                }, 500)
+                }
+            }

-          if (channel) {
-            channel.push("stop_audio")
-            setTimeout(() => {
-              channel.leave()
-              socket.disconnect()
-              console.log("🔌 Socket cerrado")
-            }, 500)
-          }
-        }
+            function flattenInt16(buffers) {
+                const length = buffers.reduce((acc, b) => acc + b.length, 0)
+                const out = new Int16Array(length)
+                let offset = 0
+                for (const b of buffers) {
+                out.set(b, offset)
+                offset += b.length
+                }
+                return out
+            }

-        function flattenInt16(buffers) {
-          const length = buffers.reduce((acc, b) => acc + b.length, 0)
-          const out = new Int16Array(length)
-          let offset = 0
-          for (const b of buffers) {
-            out.set(b, offset)
-            offset += b.length
-          }
-          return out
-        }
-
-        startButton.onclick = startRecording
-        stopButton.onclick = stopRecording
-      </script>
-    </div>
-    """
-  end
+            startButton.onclick = startRecording
+            stopButton.onclick = stopRecording
+            </script>
+        </div>
+        """
+    end
 end
--- a/whisper_live/recordings/recording_1752601669350.wav
+++ b/whisper_live/recordings/recording_1752601669350.wav
--- a/whisper_live/recordings/recording_1752602147301.wav
+++ b/whisper_live/recordings/recording_1752602147301.wav
--- a/whisper_live/recordings/recording_1752605184367.wav
+++ b/whisper_live/recordings/recording_1752605184367.wav
--- a/whisper_live/recordings/recording_1752605420377.wav
+++ b/whisper_live/recordings/recording_1752605420377.wav
--- a/whisper_live/recordings/recording_1752678344186.wav
+++ b/whisper_live/recordings/recording_1752678344186.wav