whisper large v3, run.sh startup

This commit is contained in:
2025-08-04 19:23:40 +00:00
parent 08cf12beb1
commit 4a22a68ce3
1244 changed files with 235 additions and 207 deletions

View File

@ -12,22 +12,20 @@ defmodule Whisper.Application do
client = args[:client] || System.get_env("CLIENT") || "cuda"
Application.put_env(:whisper, :client, String.to_atom(client))
# Application.put_env(:whisper, :model_name, args[:model] || System.get_env("MODEL") || "openai/whisper-base")
# Application.put_env(:whisper, :batch_size, args[:batch_size] || String.to_integer(System.get_env("BATCH_SIZE") || "3"))
# Application.put_env(:whisper, :batch_timeout, args[:batch_timeout] || String.to_integer(System.get_env("BATCH_TIMEOUT") || "3000"))
# Application.put_env(:whisper, :port, args[:port] || String.to_integer(System.get_env("PORT") || "4003"))
Application.put_env(:whisper, :model_name, args[:model] || System.get_env("MODEL") || "openai/whisper-large-v3-turbo")
Application.put_env(:whisper, :batch_size, args[:batch_size] || String.to_integer(System.get_env("BATCH_SIZE") || "3"))
Application.put_env(:whisper, :batch_timeout, args[:batch_timeout] || String.to_integer(System.get_env("BATCH_TIMEOUT") || "3000"))
Application.put_env(:whisper, :client, String.to_atom(client))
children = [
Whisper.RealtimeModel,
Whisper.LargeModel,
# {Plug.Cowboy, scheme: :http, plug: Whisper, options: [port: Application.get_env(:whisper, :port)]},
{Registry, keys: :unique, name: Whisper.Registry},
{Registry, keys: :unique, name: Whisper.AudioRegistry},
{Phoenix.PubSub, name: Whisper.PubSub},
WhisperWeb.Endpoint,
Whisper.Counter,
AudioBuffer
# AudioFilesList
]
opts = [strategy: :one_for_one, name: Whisper.Supervisor]
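The `args[...] || System.get_env(...) || default` chains above resolve each setting in order: an explicit argument wins, then the environment variable, then a hard-coded fallback. A minimal sketch of that resolution for the batch size, assuming BATCH_SIZE=5 is exported and no argument was passed (values are illustrative):
args = []
batch_size = args[:batch_size] || String.to_integer(System.get_env("BATCH_SIZE") || "3")
# args[:batch_size] is nil, so BATCH_SIZE is read and batch_size == 5;
# with BATCH_SIZE unset as well, the fallback "3" applies.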

View File

@ -0,0 +1,23 @@
defmodule AudioHelper do
def get_oldest_wav_file(ref) do
path = Path.expand("recordings")
pattern = Path.join(path, "#{ref}_*.wav")
files =
Path.wildcard(pattern)
|> Enum.map(fn filename ->
# Extract the number between the last "_" and ".wav"
case Regex.run(~r/#{Regex.escape(ref)}_(\d+)\.wav$/, filename) do
[_, number] -> {String.to_integer(number), filename}
_ -> nil
end
end)
|> Enum.reject(&is_nil/1)
|> Enum.sort_by(fn {num, _} -> num end)
case files do
[] -> nil
[{_, file} | _] -> file
end
end
end
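A minimal usage sketch for the new helper, assuming chunk files such as abc_1.wav and abc_2.wav already exist under recordings/ (the "abc" ref is hypothetical):
# Returns the path of the lowest-numbered chunk for the ref, or nil when there is none.
case AudioHelper.get_oldest_wav_file("abc") do
  nil -> IO.puts("no pending chunks")
  path -> IO.puts("oldest chunk: #{path}")
end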

View File

@ -28,15 +28,11 @@ defmodule AudioSaver do
chunk_number = Whisper.Counter.next(ref)
filename =
case type do
"part" -> "#{ref}_#{chunk_number}.wav"
"final" -> "#{ref}_final.wav"
_ -> "#{ref}_#{chunk_number}.wav"
end
filename = "#{ref}_#{chunk_number}.wav"
path = Path.join(@wav_dir, filename)
IO.inspect(path, label: "---> ")
File.write!(path, header <> chunk)
IO.inspect(path, label: "---> ")
{:ok, path}
end
end

View File

@ -20,11 +20,11 @@ defmodule Whisper.SendToModel do
def large(path) do
case Nx.Serving.batched_run(Whisper.LargeModel.Serving, {:file, path}) do
%{chunks: chunks} ->
chunks
|> Enum.map(& &1.text)
|> Enum.join(" ")
_ -> "Transcripción no disponible"
%{chunks: chunks} ->
chunks
|> Enum.map(& &1.text)
|> Enum.join(" ")
_ -> "Transcripción no disponible"
end
end
end

View File

@ -1,11 +0,0 @@
defmodule Whisper.Transcriber do
def transcribe_file(path) do
case Nx.Serving.batched_run(Whisper.LargeModel.Serving, {:file, path}) do
%{chunks: chunks} ->
chunks
|> Enum.map(& &1.text)
|> Enum.join(" ")
_ -> "Transcripción no disponible"
end
end
end

View File

@ -1,24 +1,24 @@
defmodule Whisper.LargeModel do
use Supervisor
@model "openai/whisper-large-v3"
# @model "openai/whisper-large-v3'turbo"
def start_link(_opts) do
Supervisor.start_link(__MODULE__, [], name: __MODULE__)
end
def init(_opts) do
{:ok, model} = Bumblebee.load_model({:hf, @model})
{:ok, featurizer} = Bumblebee.load_featurizer({:hf, @model})
{:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, @model})
{:ok, generation_config} = Bumblebee.load_generation_config({:hf, @model})
generation_config = Bumblebee.configure(generation_config, max_new_tokens: 448)
model_name = Application.get_env(:whisper, :model_name, "openai/whisper-large-v3")
IO.inspect("Supervisor modelo #{model_name}")
{:ok, model} = Bumblebee.load_model({:hf, model_name})
{:ok, featurizer} = Bumblebee.load_featurizer({:hf, model_name})
{:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, model_name})
{:ok, generation_config} = Bumblebee.load_generation_config({:hf, model_name})
generation_config = Bumblebee.configure(generation_config, max_new_tokens: 448)
serving =
Bumblebee.Audio.speech_to_text_whisper(
model, featurizer, tokenizer, generation_config,
chunk_num_seconds: 30,
chunk_num_seconds: 5,
language: "es",
timestamps: :segments,
defn_options: [compiler: EXLA, client: :cuda]
@ -28,8 +28,8 @@ defmodule Whisper.LargeModel do
{Nx.Serving,
serving: serving,
name: __MODULE__.Serving,
batch_size: 1,
batch_timeout: 5000}
batch_size: Application.get_env(:whisper, :batch_size, 1),
batch_timeout: Application.get_env(:whisper, :batch_timeout, 0)}
]
Supervisor.init(children, strategy: :one_for_one)

View File

@ -1,53 +0,0 @@
defmodule Whisper.RealtimeModel do
use Supervisor
@moduledoc """
Initializes the Whisper model and sets up the serving process.
"""
def start_link(opts) do
Supervisor.start_link(__MODULE__, opts, name: __MODULE__)
end
def init(_opts) do
model_name = Application.get_env(:whisper, :model_name, "openai/whisper-tiny")
raw_client = Application.get_env(:whisper, :client, :cuda)
client =
case String.to_atom(to_string(raw_client)) do
:rocm ->
:cuda
:cuda ->
:cuda
atom -> atom
end
batch_size = Application.get_env(:whisper, :batch_size, 3)
batch_timeout = Application.get_env(:whisper, :batch_timeout, 3000)
Nx.global_default_backend({EXLA.Backend, client: client})
{:ok, model} = Bumblebee.load_model({:hf, model_name})
{:ok, featurizer} = Bumblebee.load_featurizer({:hf, model_name})
{:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, model_name})
{:ok, generation_config} = Bumblebee.load_generation_config({:hf, model_name})
serving = Bumblebee.Audio.speech_to_text_whisper(
model, featurizer, tokenizer, generation_config,
chunk_num_seconds: 30,
language: "es",
defn_options: [compiler: EXLA, client: :cuda]
)
children = [
{Nx.Serving,
serving: serving,
name: __MODULE__.Serving,
batch_size: batch_size,
batch_timeout: batch_timeout}
]
Supervisor.init(children, strategy: :one_for_one)
end
end

View File

@ -31,49 +31,69 @@ defmodule WhisperWeb.AudioChannel do
Logger.info("Chunk recibido: #{byte_size(audio)} bytes, sample_rate: #{rate}")
AudioBuffer.append(ref, {rate, audio})
chunks = AudioBuffer.get_and_clear(ref)
# {:ok, path} = AudioSaver.save_chunk_as_wav(ref, audio, rate, "part")
# AudioFilesList.add_file(path)
start_total = System.monotonic_time(:millisecond)
if chunks != [] do
Task.start(fn ->
[{rate, _} | _] = chunks
full_audio = Enum.map(chunks, fn {_, bin} -> bin end) |> IO.iodata_to_binary()
{wav_time, {:ok, path}} =
:timer.tc(fn ->
AudioSaver.save_chunk_as_wav(ref, full_audio, rate, "part")
end)
model_start = System.monotonic_time(:millisecond)
Logger.info("WAV guardado en #{div(wav_time, 1000)} ms")
transcription =
if path do
case Nx.Serving.batched_run(Whisper.LargeModel.Serving, {:file, path}) do
%{chunks: chunks} ->
chunks
|> Enum.map(& &1.text)
|> Enum.join(" ")
_ ->
"Transcripción no disponible"
end
else
"Archivo no disponible"
end
model_end = System.monotonic_time(:millisecond)
Logger.info("El modelo procesó en #{model_end - model_start} ms")
Logger.info("✅ Transcripción:\n#{transcription}")
message = %{"chunks" => [%{"text" => transcription}]}
PubSub.broadcast(Whisper.PubSub, "transcription", {:transcription, %{
"received_at" => model_start,
"text" => transcription
}})
File.rm!(path)
end_total = System.monotonic_time(:millisecond)
Logger.info("⏱ Total procesamiento stop_audio: #{end_total - start_total} ms")
end)
end
{:noreply, socket}
end
@doc """
Retrieves all chunks accumulated in the buffer, concatenates them, and saves a final WAV file (suffix `"final"`).
"""
def handle_in("stop_audio", _payload, socket) do
Logger.info("🛑 Grabación detenida por cliente")
ref = socket_id(socket)
chunks = AudioBuffer.get_and_clear(ref)
if chunks != [] do
[{rate, _} | _] = chunks
full_audio = Enum.map(chunks, fn {_, bin} -> bin end) |> IO.iodata_to_binary()
{:ok, path} = AudioSaver.save_chunk_as_wav(ref, full_audio, rate, "final")
Task.start(fn ->
transcription = Whisper.SendToModel.large(path)
Logger.info("✅ Transcripción completa:\n#{transcription}")
message = %{"chunks" => [%{"text" => transcription}]}
Phoenix.PubSub.broadcast(Whisper.PubSub, "transcription", {:transcription, Jason.encode!(message)})
File.rm!(path)
end)
end
{:noreply, socket}
end
defp socket_id(socket), do: socket.transport_pid |> :erlang.pid_to_list() |> List.to_string()
def save_raw(ref, bin) do
File.mkdir_p!("recordings/")
filename = "#{ref}_#{Whisper.Counter.next(ref)}.raw"
path = Path.join("recordings", filename)
File.write!(path, bin)
{:ok, path}
end
end
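For context, the broadcast in handle_in/3 above and the new handle_info/2 clause in WhisperWeb.VadLive (later in this commit) now exchange a plain map rather than a JSON string. A hedged sketch of that contract, with illustrative values:
# Publisher side (AudioChannel): the payload is a map, not Jason-encoded.
payload = %{"received_at" => System.monotonic_time(:millisecond), "text" => "transcribed text"}
Phoenix.PubSub.broadcast(Whisper.PubSub, "transcription", {:transcription, payload})
# Subscriber side (VadLive) pattern-matches the same keys:
# def handle_info({:transcription, %{"received_at" => ts, "text" => new_text}}, socket), do: ...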

View File

@ -1,32 +1,6 @@
<header class="px-4 sm:px-6 lg:px-8">
<div class="flex items-center justify-between border-b border-zinc-100 py-3 text-sm">
<div class="flex items-center gap-4">
<a href="/">
<img src={~p"/images/logo.svg"} width="36" />
</a>
<p class="bg-brand/5 text-brand rounded-full px-2 font-medium leading-6">
v{Application.spec(:phoenix, :vsn)}
</p>
</div>
<div class="flex items-center gap-4 font-semibold leading-6 text-zinc-900">
<a href="https://twitter.com/elixirphoenix" class="hover:text-zinc-700">
@elixirphoenix
</a>
<a href="https://github.com/phoenixframework/phoenix" class="hover:text-zinc-700">
GitHub
</a>
<a
href="https://hexdocs.pm/phoenix/overview.html"
class="rounded-lg bg-zinc-100 px-2 py-1 hover:bg-zinc-200/80"
>
Get Started <span aria-hidden="true">&rarr;</span>
</a>
</div>
</div>
</header>
<main class="px-4 py-20 sm:px-6 lg:px-8">
<div class="mx-auto max-w-2xl">
<.flash_group flash={@flash} />
{@inner_content}
</div>
</main>

View File

@ -9,6 +9,7 @@ defmodule WhisperWeb.VadLive do
socket
|> assign(:transcription, "")
|> assign(:started, false)
|> assign(:transcriptions, [])
{:ok, socket}
end
@ -18,31 +19,50 @@ defmodule WhisperWeb.VadLive do
{:noreply, assign(socket, started: true)}
end
def handle_info({:transcription, raw_json}, socket) do
new_text =
raw_json
|> Jason.decode!()
|> get_in(["chunks", Access.at(0), "text"])
{:noreply, update(socket, :transcription, &(&1 <> " " <> new_text))}
def handle_event("stop_vad", _params, socket) do
push_event(socket, "stop-vad", %{})
{:noreply, assign(socket, started: false)}
end
def handle_info({:transcription, %{"received_at" => ts, "text" => new_text}}, socket) do
updated_transcriptions =
[%{received_at: ts, text: new_text} | socket.assigns.transcriptions]
|> Enum.sort_by(& &1.received_at)
final_text =
updated_transcriptions
|> Enum.map_join(" ", & &1.text)
socket =
socket
|> assign(:transcriptions, updated_transcriptions)
|> assign(:transcription, final_text)
{:noreply, socket}
end
def render(assigns) do
~H"""
~H"""
<div id="vad-container" phx-hook="VadHook">
<button phx-click="start_vad" class="btn btn-primary">🎙 Iniciar VAD</button>
<%= if !@started do %>
<button phx-click="start_vad" class="btn btn-primary">🎙 Iniciar VAD</button>
<% end %>
<div id="vad-status" class="mt-4 text-sm text-gray-700"></div>
</div>
<div id="transcriptionContainer" class="w-full max-w-2xl space-y-4">
<%= if @transcription != "" do %>
<div class="p-4 bg-gray-100 rounded shadow-md">
<h2 class="text-sm font-semibold text-gray-700 mb-2">✅ Transcripción</h2>
<p class="text-green-600 whitespace-pre-wrap break-words text-sm leading-relaxed"><%= @transcription %></p>
</div>
<div class="p-4 bg-gray-100 rounded shadow-md">
<h2 class="text-sm font-semibold text-gray-700 mb-2">✅ Transcripción</h2>
<p class="text-green-600 whitespace-pre-wrap break-words text-sm leading-relaxed">
<%= @transcription %>
</p>
</div>
<% end %>
</div>
"""
"""
end
end