whisper large v3, arranque run.sh
This commit is contained in:
@ -12,22 +12,20 @@ defmodule Whisper.Application do
|
||||
client = args[:client] || System.get_env("CLIENT") || "cuda"
|
||||
Application.put_env(:whisper, :client, String.to_atom(client))
|
||||
|
||||
# Application.put_env(:whisper, :model_name, args[:model] || System.get_env("MODEL") || "openai/whisper-base")
|
||||
# Application.put_env(:whisper, :batch_size, args[:batch_size] || String.to_integer(System.get_env("BATCH_SIZE") || "3"))
|
||||
# Application.put_env(:whisper, :batch_timeout, args[:batch_timeout] || String.to_integer(System.get_env("BATCH_TIMEOUT") || "3000"))
|
||||
# Application.put_env(:whisper, :port, args[:port] || String.to_integer(System.get_env("PORT") || "4003"))
|
||||
Application.put_env(:whisper, :model_name, args[:model] || System.get_env("MODEL") || "openai/whisper-large-v3-turbo")
|
||||
Application.put_env(:whisper, :batch_size, args[:batch_size] || String.to_integer(System.get_env("BATCH_SIZE") || "3"))
|
||||
Application.put_env(:whisper, :batch_timeout, args[:batch_timeout] || String.to_integer(System.get_env("BATCH_TIMEOUT") || "3000"))
|
||||
Application.put_env(:whisper, :client, String.to_atom(client))
|
||||
|
||||
|
||||
children = [
|
||||
Whisper.RealtimeModel,
|
||||
Whisper.LargeModel,
|
||||
# {Plug.Cowboy, scheme: :http, plug: Whisper, options: [port: Application.get_env(:whisper, :port)]},
|
||||
{Registry, keys: :unique, name: Whisper.Registry},
|
||||
{Registry, keys: :unique, name: Whisper.AudioRegistry},
|
||||
{Phoenix.PubSub, name: Whisper.PubSub},
|
||||
WhisperWeb.Endpoint,
|
||||
Whisper.Counter,
|
||||
AudioBuffer
|
||||
# AudioFilesList
|
||||
]
|
||||
|
||||
opts = [strategy: :one_for_one, name: Whisper.Supervisor]
|
||||
|
23
whisper/lib/whisper/audio_manager/audio_helper.ex
Normal file
23
whisper/lib/whisper/audio_manager/audio_helper.ex
Normal file
@ -0,0 +1,23 @@
|
||||
defmodule AudioHelper do
  @moduledoc """
  Helpers for locating recorded WAV chunks on disk.
  """

  @doc """
  Returns the path of the lowest-numbered `<ref>_<n>.wav` file inside the
  `recordings` directory (expanded against the current working directory),
  or `nil` when there is no such file.

  Files are ranked by the integer found between the last `_` and the `.wav`
  extension. Files that match the glob but carry no numeric suffix (for
  example `<ref>_final.wav`) are ignored. When several files share the lowest
  number, the first one returned by `Path.wildcard/1` wins.
  """
  def get_oldest_wav_file(ref) do
    # The glob below interpolates `ref` with `to_string/1`; use the same textual
    # form for the regex, and build it once instead of once per file.
    numbered = ~r/#{Regex.escape(to_string(ref))}_(\d+)\.wav$/

    candidates =
      "recordings"
      |> Path.expand()
      |> Path.join("#{ref}_*.wav")
      |> Path.wildcard()
      |> Enum.flat_map(fn filename ->
        # Extract the number between the last "_" and ".wav".
        case Regex.run(numbered, filename) do
          [_, number] -> [{String.to_integer(number), filename}]
          _ -> []
        end
      end)

    case candidates do
      [] ->
        nil

      _ ->
        # Only the minimum is needed, so there is no reason to sort the list.
        {_, file} = Enum.min_by(candidates, fn {num, _} -> num end)
        file
    end
  end
end
|
@ -28,15 +28,11 @@ defmodule AudioSaver do
|
||||
|
||||
chunk_number = Whisper.Counter.next(ref)
|
||||
|
||||
filename =
|
||||
case type do
|
||||
"part" -> "#{ref}_#{chunk_number}.wav"
|
||||
"final" -> "#{ref}_final.wav"
|
||||
_ -> "#{ref}_#{chunk_number}.wav"
|
||||
end
|
||||
filename = "#{ref}_#{chunk_number}.wav"
|
||||
path = Path.join(@wav_dir, filename)
|
||||
IO.inspect(path, label: "---> ")
|
||||
File.write!(path, header <> chunk)
|
||||
IO.inspect(path, label: "---> ")
|
||||
|
||||
{:ok, path}
|
||||
end
|
||||
end
|
||||
|
@ -20,11 +20,11 @@ defmodule Whisper.SendToModel do
|
||||
|
||||
@doc """
Transcribes the audio file at `path` through `Whisper.LargeModel.Serving`.

Returns the text of every chunk in the result joined with single spaces, or
the fallback string `"Transcripción no disponible"` when the serving result
has no `:chunks` key.
"""
def large(path) do
  case Nx.Serving.batched_run(Whisper.LargeModel.Serving, {:file, path}) do
    # One pass over the chunks; the duplicated clauses that followed were
    # unreachable and have been removed.
    %{chunks: chunks} -> Enum.map_join(chunks, " ", & &1.text)
    _ -> "Transcripción no disponible"
  end
end
|
||||
end
|
@ -1,11 +0,0 @@
|
||||
defmodule Whisper.Transcriber do
  @moduledoc """
  Thin wrapper that sends an audio file to the `Whisper.LargeModel.Serving`
  serving and flattens the result into plain text.
  """

  @doc """
  Transcribes the audio file at `path`.

  Returns the text of every chunk in the serving result joined with single
  spaces, or `"Transcripción no disponible"` when the result has no `:chunks`
  key.
  """
  def transcribe_file(path) do
    case Nx.Serving.batched_run(Whisper.LargeModel.Serving, {:file, path}) do
      %{chunks: chunks} -> Enum.map_join(chunks, " ", & &1.text)
      _ -> "Transcripción no disponible"
    end
  end
end
|
@ -1,24 +1,24 @@
|
||||
defmodule Whisper.LargeModel do
|
||||
use Supervisor
|
||||
|
||||
@model "openai/whisper-large-v3"
|
||||
# @model "openai/whisper-large-v3'turbo"
|
||||
|
||||
@doc """
Starts this supervisor and registers it under the module name.

The argument is ignored; `init/1` always receives an empty list.
"""
def start_link(_opts), do: Supervisor.start_link(__MODULE__, [], name: __MODULE__)
|
||||
|
||||
def init(_opts) do
|
||||
{:ok, model} = Bumblebee.load_model({:hf, @model})
|
||||
{:ok, featurizer} = Bumblebee.load_featurizer({:hf, @model})
|
||||
{:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, @model})
|
||||
{:ok, generation_config} = Bumblebee.load_generation_config({:hf, @model})
|
||||
generation_config = Bumblebee.configure(generation_config, max_new_tokens: 448)
|
||||
model_name = Application.get_env(:whisper, :model_name, "openai/whisper-large-v3")
|
||||
|
||||
IO.inspect("Supervisor modelo #{model_name}")
|
||||
|
||||
{:ok, model} = Bumblebee.load_model({:hf, model_name})
|
||||
{:ok, featurizer} = Bumblebee.load_featurizer({:hf, model_name})
|
||||
{:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, model_name})
|
||||
{:ok, generation_config} = Bumblebee.load_generation_config({:hf, model_name})
|
||||
generation_config = Bumblebee.configure(generation_config, max_new_tokens: 448)
|
||||
serving =
|
||||
Bumblebee.Audio.speech_to_text_whisper(
|
||||
model, featurizer, tokenizer, generation_config,
|
||||
chunk_num_seconds: 30,
|
||||
chunk_num_seconds: 5,
|
||||
language: "es",
|
||||
timestamps: :segments,
|
||||
defn_options: [compiler: EXLA, client: :cuda]
|
||||
@ -28,8 +28,8 @@ defmodule Whisper.LargeModel do
|
||||
{Nx.Serving,
|
||||
serving: serving,
|
||||
name: __MODULE__.Serving,
|
||||
batch_size: 1,
|
||||
batch_timeout: 5000}
|
||||
batch_size: Application.get_env(:whisper, :batch_size, 1),
|
||||
batch_timeout: Application.get_env(:whisper, :batch_timeout, 0)}
|
||||
]
|
||||
|
||||
Supervisor.init(children, strategy: :one_for_one)
|
||||
|
@ -1,53 +0,0 @@
|
||||
defmodule Whisper.RealtimeModel do
  @moduledoc """
  Initializes the Whisper model and sets up the serving process.

  Configuration is read from the `:whisper` application environment:

    * `:model_name` - model repository to load (default `"openai/whisper-tiny"`)
    * `:client` - EXLA client, atom or string (default `:cuda`); `:rocm` is
      mapped to `:cuda`
    * `:batch_size` - serving batch size (default `3`)
    * `:batch_timeout` - serving batch timeout (default `3000`)

  The serving is registered as `Whisper.RealtimeModel.Serving`.
  """

  use Supervisor

  @doc """
  Starts the supervisor, registered under the module name.
  """
  def start_link(opts) do
    Supervisor.start_link(__MODULE__, opts, name: __MODULE__)
  end

  @impl true
  def init(_opts) do
    model_name = Application.get_env(:whisper, :model_name, "openai/whisper-tiny")
    client = :whisper |> Application.get_env(:client, :cuda) |> normalize_client()
    batch_size = Application.get_env(:whisper, :batch_size, 3)
    batch_timeout = Application.get_env(:whisper, :batch_timeout, 3000)

    Nx.global_default_backend({EXLA.Backend, client: client})

    repo = {:hf, model_name}
    {:ok, model} = Bumblebee.load_model(repo)
    {:ok, featurizer} = Bumblebee.load_featurizer(repo)
    {:ok, tokenizer} = Bumblebee.load_tokenizer(repo)
    {:ok, generation_config} = Bumblebee.load_generation_config(repo)

    serving =
      Bumblebee.Audio.speech_to_text_whisper(
        model,
        featurizer,
        tokenizer,
        generation_config,
        chunk_num_seconds: 30,
        language: "es",
        # Compile for the same client the default backend was set to; this was
        # previously hard-coded to :cuda, which ignored the configured client.
        defn_options: [compiler: EXLA, client: client]
      )

    children = [
      {Nx.Serving,
       serving: serving,
       name: __MODULE__.Serving,
       batch_size: batch_size,
       batch_timeout: batch_timeout}
    ]

    Supervisor.init(children, strategy: :one_for_one)
  end

  # Accepts the configured client as an atom or a string and returns the atom
  # to hand to EXLA. `:rocm` is mapped to `:cuda`; anything else passes through.
  defp normalize_client(raw_client) do
    case raw_client |> to_string() |> String.to_atom() do
      :rocm -> :cuda
      client -> client
    end
  end
end
|
@ -31,49 +31,69 @@ defmodule WhisperWeb.AudioChannel do
|
||||
|
||||
Logger.info("Chunk recibido: #{byte_size(audio)} bytes, sample_rate: #{rate}")
|
||||
AudioBuffer.append(ref, {rate, audio})
|
||||
chunks = AudioBuffer.get_and_clear(ref)
|
||||
|
||||
# {:ok, path} = AudioSaver.save_chunk_as_wav(ref, audio, rate, "part")
|
||||
# AudioFilesList.add_file(path)
|
||||
start_total = System.monotonic_time(:millisecond)
|
||||
|
||||
if chunks != [] do
|
||||
|
||||
Task.start(fn ->
|
||||
[{rate, _} | _] = chunks
|
||||
full_audio = Enum.map(chunks, fn {_, bin} -> bin end) |> IO.iodata_to_binary()
|
||||
|
||||
{wav_time, {:ok, path}} =
|
||||
:timer.tc(fn ->
|
||||
AudioSaver.save_chunk_as_wav(ref, full_audio, rate, "part")
|
||||
end)
|
||||
model_start = System.monotonic_time(:millisecond)
|
||||
Logger.info("WAV guardado en #{div(wav_time, 1000)} ms")
|
||||
|
||||
|
||||
transcription =
|
||||
if path do
|
||||
case Nx.Serving.batched_run(Whisper.LargeModel.Serving, {:file, path}) do
|
||||
%{chunks: chunks} ->
|
||||
chunks
|
||||
|> Enum.map(& &1.text)
|
||||
|> Enum.join(" ")
|
||||
|
||||
_ ->
|
||||
"Transcripción no disponible"
|
||||
end
|
||||
else
|
||||
"Archivo no disponible"
|
||||
end
|
||||
|
||||
|
||||
model_end = System.monotonic_time(:millisecond)
|
||||
Logger.info("El modelo procesó en #{model_end - model_start} ms")
|
||||
|
||||
Logger.info("✅ Transcripción:\n#{transcription}")
|
||||
|
||||
message = %{"chunks" => [%{"text" => transcription}]}
|
||||
PubSub.broadcast(Whisper.PubSub, "transcription", {:transcription, %{
|
||||
"received_at" => model_start,
|
||||
"text" => transcription
|
||||
}})
|
||||
File.rm!(path)
|
||||
|
||||
end_total = System.monotonic_time(:millisecond)
|
||||
Logger.info("⏱ Total procesamiento stop_audio: #{end_total - start_total} ms")
|
||||
end)
|
||||
end
|
||||
{:noreply, socket}
|
||||
end
|
||||
|
||||
|
||||
@doc """
Handles the client's `"stop_audio"` event.

Drains every chunk buffered for this socket, concatenates the audio and saves
it as a single WAV file through `AudioSaver.save_chunk_as_wav/4` (type
`"final"`). A background task then transcribes the file and broadcasts the
result on the `"transcription"` topic as a JSON-encoded
`%{"chunks" => [%{"text" => text}]}` payload. Nothing is saved or broadcast
when the buffer is empty.
"""
def handle_in("stop_audio", _payload, socket) do
  Logger.info("🛑 Grabación detenida por cliente")

  ref = socket_id(socket)

  case AudioBuffer.get_and_clear(ref) do
    [] ->
      :ok

    # The sample rate of the first chunk is used for the whole recording.
    [{rate, _} | _] = chunks ->
      full_audio = chunks |> Enum.map(fn {_, bin} -> bin end) |> IO.iodata_to_binary()
      {:ok, path} = AudioSaver.save_chunk_as_wav(ref, full_audio, rate, "final")

      Task.start(fn ->
        try do
          transcription = Whisper.SendToModel.large(path)
          Logger.info("✅ Transcripción completa:\n#{transcription}")
          message = %{"chunks" => [%{"text" => transcription}]}

          Phoenix.PubSub.broadcast(
            Whisper.PubSub,
            "transcription",
            {:transcription, Jason.encode!(message)}
          )
        after
          # Remove the temporary WAV even when transcription or broadcast
          # raises, so failed runs do not leave files behind.
          File.rm!(path)
        end
      end)
  end

  {:noreply, socket}
end
|
||||
|
||||
|
||||
# Builds the per-connection reference: the textual form of the socket's
# transport pid.
defp socket_id(socket) do
  socket.transport_pid
  |> :erlang.pid_to_list()
  |> List.to_string()
end
|
||||
|
||||
@doc """
Writes `bin` unmodified to `recordings/<ref>_<n>.raw`, where `<n>` is the next
value of `Whisper.Counter` for `ref`, creating the directory if needed.

Returns `{:ok, path}`; raises if the directory or file cannot be written.
"""
def save_raw(ref, bin) do
  File.mkdir_p!("recordings/")

  sequence = Whisper.Counter.next(ref)
  target = Path.join("recordings", "#{ref}_#{sequence}.raw")

  File.write!(target, bin)
  {:ok, target}
end
|
||||
end
|
||||
|
@ -1,32 +1,6 @@
|
||||
<header class="px-4 sm:px-6 lg:px-8">
|
||||
<div class="flex items-center justify-between border-b border-zinc-100 py-3 text-sm">
|
||||
<div class="flex items-center gap-4">
|
||||
<a href="/">
|
||||
<img src={~p"/images/logo.svg"} width="36" />
|
||||
</a>
|
||||
<p class="bg-brand/5 text-brand rounded-full px-2 font-medium leading-6">
|
||||
v{Application.spec(:phoenix, :vsn)}
|
||||
</p>
|
||||
</div>
|
||||
<div class="flex items-center gap-4 font-semibold leading-6 text-zinc-900">
|
||||
<a href="https://twitter.com/elixirphoenix" class="hover:text-zinc-700">
|
||||
@elixirphoenix
|
||||
</a>
|
||||
<a href="https://github.com/phoenixframework/phoenix" class="hover:text-zinc-700">
|
||||
GitHub
|
||||
</a>
|
||||
<a
|
||||
href="https://hexdocs.pm/phoenix/overview.html"
|
||||
class="rounded-lg bg-zinc-100 px-2 py-1 hover:bg-zinc-200/80"
|
||||
>
|
||||
Get Started <span aria-hidden="true">→</span>
|
||||
</a>
|
||||
</div>
|
||||
</div>
|
||||
</header>
|
||||
|
||||
<main class="px-4 py-20 sm:px-6 lg:px-8">
|
||||
<div class="mx-auto max-w-2xl">
|
||||
<.flash_group flash={@flash} />
|
||||
{@inner_content}
|
||||
</div>
|
||||
</main>
|
||||
|
@ -9,6 +9,7 @@ defmodule WhisperWeb.VadLive do
|
||||
socket
|
||||
|> assign(:transcription, "")
|
||||
|> assign(:started, false)
|
||||
|> assign(:transcriptions, [])
|
||||
|
||||
{:ok, socket}
|
||||
end
|
||||
@ -18,31 +19,50 @@ defmodule WhisperWeb.VadLive do
|
||||
{:noreply, assign(socket, started: true)}
|
||||
end
|
||||
|
||||
def handle_info({:transcription, raw_json}, socket) do
|
||||
new_text =
|
||||
raw_json
|
||||
|> Jason.decode!()
|
||||
|> get_in(["chunks", Access.at(0), "text"])
|
||||
{:noreply, update(socket, :transcription, &(&1 <> " " <> new_text))}
|
||||
def handle_event("stop_vad", _params, socket) do
|
||||
push_event(socket, "stop-vad", %{})
|
||||
{:noreply, assign(socket, started: false)}
|
||||
end
|
||||
|
||||
def handle_info({:transcription, %{"received_at" => ts, "text" => new_text}}, socket) do
|
||||
updated_transcriptions =
|
||||
[%{received_at: ts, text: new_text} | socket.assigns.transcriptions]
|
||||
|> Enum.sort_by(& &1.received_at)
|
||||
|
||||
final_text =
|
||||
updated_transcriptions
|
||||
|> Enum.map_join(" ", & &1.text)
|
||||
|
||||
socket =
|
||||
socket
|
||||
|> assign(:transcriptions, updated_transcriptions)
|
||||
|> assign(:transcription, final_text)
|
||||
|
||||
{:noreply, socket}
|
||||
end
|
||||
|
||||
|
||||
def render(assigns) do
|
||||
~H"""
|
||||
~H"""
|
||||
<div id="vad-container" phx-hook="VadHook">
|
||||
<button phx-click="start_vad" class="btn btn-primary">🎙 Iniciar VAD</button>
|
||||
<%= if !@started do %>
|
||||
<button phx-click="start_vad" class="btn btn-primary">🎙 Iniciar VAD</button>
|
||||
<% end %>
|
||||
|
||||
<div id="vad-status" class="mt-4 text-sm text-gray-700"></div>
|
||||
</div>
|
||||
|
||||
|
||||
<div id="transcriptionContainer" class="w-full max-w-2xl space-y-4">
|
||||
<%= if @transcription != "" do %>
|
||||
<div class="p-4 bg-gray-100 rounded shadow-md">
|
||||
<h2 class="text-sm font-semibold text-gray-700 mb-2">✅ Transcripción</h2>
|
||||
<p class="text-green-600 whitespace-pre-wrap break-words text-sm leading-relaxed"><%= @transcription %></p>
|
||||
</div>
|
||||
<div class="p-4 bg-gray-100 rounded shadow-md">
|
||||
<h2 class="text-sm font-semibold text-gray-700 mb-2">✅ Transcripción</h2>
|
||||
<p class="text-green-600 whitespace-pre-wrap break-words text-sm leading-relaxed">
|
||||
<%= @transcription %>
|
||||
</p>
|
||||
</div>
|
||||
<% end %>
|
||||
</div>
|
||||
"""
|
||||
"""
|
||||
end
|
||||
|
||||
end
|
||||
|
Reference in New Issue
Block a user