mejora dictado

This commit is contained in:
2025-07-18 10:50:51 +00:00
parent 92e307db34
commit 5249af7d23
7 changed files with 330 additions and 145 deletions

View File

@ -1,7 +1,7 @@
defmodule WhisperLive.AudioBuffer do
use GenServer
## API
# API
def start_link(ref), do: GenServer.start_link(__MODULE__, [], name: via(ref))
@ -9,19 +9,98 @@ defmodule WhisperLive.AudioBuffer do
def get_all(ref), do: GenServer.call(via(ref), :get_all)
def get_tiny(ref), do: GenServer.call(via(ref), :get_tiny)
def clear_tiny(ref), do: GenServer.call(via(ref), :clear_tiny)
def clear(ref), do: GenServer.call(via(ref), :clear)
def stop(ref), do: GenServer.stop(via(ref))
def pop_chunk_with_overlap(ref, keep_ms \\ 1000), do: GenServer.call(via(ref), {:pop_chunk_with_overlap, keep_ms})
defp via(ref), do: {:via, Registry, {WhisperLive.AudioRegistry, ref}}
## Callbacks
# Callbacks
def init(_), do: {:ok, []}
def init(_), do: {:ok, %{tiny: [], full: []}}
def handle_cast({:append, chunk}, state), do: {:noreply, [chunk | state]}
def handle_cast({:append, chunk}, state) do
{:noreply, %{state | tiny: [chunk | state.tiny], full: [chunk | state.full]}}
end
def handle_call(:get_all, _from, state), do: {:reply, Enum.reverse(state), state}
@impl true
def handle_call({:pop_chunk_with_overlap, keep_ms}, _from, state) do
bytes_per_ms = div(state.rate * 2, 1000)
total = state.last_overlap <> state.buffer
def handle_call(:clear, _from, _state), do: {:reply, :ok, []}
total_size = byte_size(total)
keep_bytes = keep_ms * bytes_per_ms
overlap_bytes = min(keep_bytes, total_size)
{to_send, to_keep} = split_bytes(total, overlap_bytes)
new_state = %{state | buffer: <<>>, last_overlap: to_keep}
{:reply, {to_send, state.rate}, new_state}
end
defp split_bytes(binary, keep_bytes) do
total = byte_size(binary)
send_bytes = max(total - keep_bytes, 0)
<<send::binary-size(send_bytes), keep::binary>> = binary
{send, keep}
end
def handle_call(:get_all, _from, state), do: {:reply, Enum.reverse(state.full), state}
def handle_call(:get_tiny, _from, state), do: {:reply, Enum.reverse(state.tiny), state}
def handle_call(:clear_tiny, _from, state), do: {:reply, :ok, %{state | tiny: []}}
def handle_call(:clear, _from, _state), do: {:reply, :ok, %{tiny: [], full: []}}
end
# defmodule WhisperLive.AudioBuffer do
# use GenServer
# def start_link(ref), do: GenServer.start_link(__MODULE__, [], name: via(ref))
# def append(ref, chunk), do: GenServer.cast(via(ref), {:append, chunk})
# def pop_chunk_with_overlap(ref, keep_ms \\ 1000), do: GenServer.call(via(ref), {:pop_chunk_with_overlap, keep_ms})
# def get_all(pid) do
# GenServer.call(pid, :get_all)
# end
# defp via(ref), do: {:via, Registry, {WhisperLive.AudioRegistry, ref}}
# @impl true
# def init(_), do: {:ok, %{buffer: <<>>, last_overlap: <<>>, rate: 48_000}}
# @impl true
# def handle_cast({:append, {_ts, chunk}}, state) do
# {:noreply, %{state | buffer: state.buffer <> chunk}}
# end
# @impl true
# def handle_call({:pop_chunk_with_overlap, keep_ms}, _from, state) do
# bytes_per_ms = div(state.rate * 2, 1000)
# total = state.last_overlap <> state.buffer
# total_size = byte_size(total)
# keep_bytes = keep_ms * bytes_per_ms
# overlap_bytes = min(keep_bytes, total_size)
# {to_send, to_keep} = split_bytes(total, overlap_bytes)
# new_state = %{state | buffer: <<>>, last_overlap: to_keep}
# {:reply, {to_send, state.rate}, new_state}
# end
# defp split_bytes(binary, keep_bytes) do
# total = byte_size(binary)
# send_bytes = max(total - keep_bytes, 0)
# <<send::binary-size(send_bytes), keep::binary>> = binary
# {send, keep}
# end
# def handle_call(:get_all, _from, state) do
# {:reply, Enum.reverse(state.buffer), state}
# end
# end

View File

@ -0,0 +1,17 @@
defmodule WhisperLive.AudioFullBuffer do
  @moduledoc """
  Singleton GenServer that accumulates every appended audio chunk.

  Chunks are stored newest-first internally and handed back in arrival
  order by `get_all/0`, together with the sample rate held in the state.
  """
  use GenServer

  ## Client API

  @doc "Starts the buffer registered under the module name; the argument is ignored."
  def start_link(_) do
    GenServer.start_link(__MODULE__, [], name: __MODULE__)
  end

  @doc "Asynchronously stores `chunk`, which is expected to be a `{rate, binary}` tuple."
  def append(chunk) do
    GenServer.cast(__MODULE__, {:append, chunk})
  end

  @doc "Returns `{rate, chunks}` with the chunks in the order they were appended."
  def get_all do
    GenServer.call(__MODULE__, :get_all)
  end

  ## Server callbacks

  @impl true
  def init(_) do
    {:ok, %{chunks: [], rate: 48_000}}
  end

  # The rate carried by the incoming tuple is ignored; only the payload is kept.
  @impl true
  def handle_cast({:append, {_rate, payload}}, %{chunks: stored} = state) do
    {:noreply, %{state | chunks: [payload | stored]}}
  end

  # Chunks are prepended on append, so reverse once here to restore arrival order.
  @impl true
  def handle_call(:get_all, _from, %{rate: rate, chunks: stored} = state) do
    {:reply, {rate, Enum.reverse(stored)}, state}
  end
end

View File

@ -0,0 +1,30 @@
# Keeps a short-time audio buffer plus an overlap between consecutive pops.
defmodule WhisperLive.AudioStreamBuffer do
  @moduledoc """
  Singleton GenServer holding a short window of raw audio bytes.

  `pop_chunk_with_overlap/1` removes up to `n_seconds` of audio from the
  buffer and returns it prefixed with the tail of the previously returned
  audio, so consecutive replies overlap by up to one second.
  """
  use GenServer

  # Amount of already-returned audio that is prepended to the next reply.
  @overlap_ms 1000

  ## Client API

  @doc "Starts the buffer registered under the module name; the argument is ignored."
  def start_link(_), do: GenServer.start_link(__MODULE__, [], name: __MODULE__)

  @doc "Asynchronously appends `chunk`, which is expected to be a `{rate, binary}` tuple."
  def append(chunk), do: GenServer.cast(__MODULE__, {:append, chunk})

  @doc """
  Pops up to `n_seconds` of buffered audio.

  Returns `{rate, binary}`. The binary is empty when no new audio is
  buffered; otherwise it is the previous overlap followed by the new audio.
  """
  def pop_chunk_with_overlap(n_seconds), do: GenServer.call(__MODULE__, {:pop_chunk, n_seconds})

  ## Server callbacks

  @impl true
  def init(_), do: {:ok, %{buffer: <<>>, last_overlap: <<>>, rate: 48_000}}

  # The rate carried by the incoming tuple is ignored; the state's rate is used.
  @impl true
  def handle_cast({:append, {_rate, chunk}}, state) do
    {:noreply, %{state | buffer: state.buffer <> chunk}}
  end

  @impl true
  def handle_call({:pop_chunk, n}, _from, %{buffer: buffer, last_overlap: prev, rate: rate} = state) do
    # 2 bytes per sample: presumably 16-bit mono PCM -- TODO confirm against the client.
    bytes_per_second = rate * 2
    wanted = max(trunc(n * bytes_per_second), 0)
    # Never ask for more than is buffered; a strict match would crash the server.
    take = min(wanted, byte_size(buffer))

    if take == 0 do
      # Nothing new to hand out: reply empty instead of re-sending the old overlap.
      {:reply, {rate, <<>>}, state}
    else
      <<chunk::binary-size(take), rest::binary>> = buffer
      reply = prev <> chunk

      # The next overlap is the tail of what was just returned, clamped so a
      # short reply cannot make binary_part/3 raise.
      overlap_bytes = div(@overlap_ms * bytes_per_second, 1000)
      keep = min(overlap_bytes, byte_size(reply))
      new_overlap = binary_part(reply, byte_size(reply) - keep, keep)

      # The overlap lives only in :last_overlap. Pushing it back into :buffer as
      # well would make the next reply contain the same audio twice.
      {:reply, {rate, reply}, %{state | buffer: rest, last_overlap: new_overlap}}
    end
  end
end

View File

@ -22,15 +22,14 @@ defmodule WhisperLive.Transcriber do
end
def handle_info(:transcribe, %{ref: ref} = state) do
case AudioBuffer.get_all(ref) do
[] ->
:noop
case AudioBuffer.get_tiny(ref) do
[] -> :noop
[{rate, _} | _] = chunks ->
merged = chunks |> Enum.map(fn {_, bin} -> bin end) |> IO.iodata_to_binary()
tmpfile = tmp_path("realtime_#{ref}")
:ok = File.write!(tmpfile, encode_wav(merged, rate))
tmpfile = "tmp/rt_#{System.system_time(:millisecond)}.wav"
File.mkdir_p!("tmp")
File.write!(tmpfile, encode_wav(merged, rate))
case send_to_whisper(tmpfile) do
{:ok, response} ->
PubSub.broadcast(WhisperLive.PubSub, "transcription", {:transcription, response})
@ -40,28 +39,54 @@ defmodule WhisperLive.Transcriber do
end
File.rm(tmpfile)
case GenServer.whereis({:via, Registry, {WhisperLive.AudioRegistry, ref}}) do
pid when is_pid(pid) ->
if Process.alive?(pid) do
AudioBuffer.clear_tiny(ref)
else
Logger.debug("AudioBuffer #{inspect(ref)} no está vivo.")
end
_ ->
Logger.debug("AudioBuffer #{inspect(ref)} no existe.")
end
end
schedule()
{:noreply, state}
end
# def handle_info(:transcribe, %{ref: ref} = state) do
# case AudioBuffer.pop_chunk_with_overlap(ref, 1000) do
# {"", _rate} ->
# :noop
# {audio, rate} ->
# tmpfile = "tmp/rt_#{ref}_#{System.system_time(:millisecond)}.wav"
# File.mkdir_p!("tmp")
# File.write!(tmpfile, encode_wav(audio, rate))
# case send_to_whisper(tmpfile) do
# {:ok, response} ->
# PubSub.broadcast(WhisperLive.PubSub, "transcription", {:transcription, response})
# {:error, reason} ->
# Logger.warning("Realtime transcription error: #{inspect(reason)}")
# end
# end
# schedule()
# {:noreply, state}
# end
defp tmp_path(prefix) do
unique = :erlang.unique_integer([:positive]) |> Integer.to_string()
filename = prefix <> "_" <> unique <> ".wav"
Path.join(System.tmp_dir!(), filename)
end
# def handle_info({:transcription, raw_json}, socket) do
# new_text =
# raw_json
# |> Jason.decode!()
# |> get_in(["chunks", Access.at(0), "text"])
# {:noreply, update(socket, :transcription, &(&1 <> " " <> new_text))}
# end
defp schedule, do: Process.send_after(self(), :transcribe, @interval_ms)
defp encode_wav(data, sample_rate) do

View File

@ -7,10 +7,20 @@ defmodule WhisperLiveWeb.AudioChannel do
ref = socket_id(socket)
Logger.info("Cliente conectado al canal audio:lobby")
{:ok, _} = AudioBuffer.start_link(ref)
# {:ok, _} = AudioFullBuffer.start_link(ref)
{:ok, _} = WhisperLive.Transcriber.start_link(ref)
{:ok, socket}
end
# def handle_in("audio_chunk", %{"chunk" => chunk}, socket) do
# decoded_chunk = Base.decode64!(chunk)
# AudioStreamBuffer.append({48_000, decoded_chunk})
# AudioFullBuffer.append({48_000, decoded_chunk})
# {:noreply, socket}
# end
def handle_in("audio_chunk", %{"data" => data, "sample_rate" => rate}, socket) do
{:ok, binary} = Base.decode64(data)
AudioBuffer.append(socket_id(socket), {rate, binary})
@ -18,6 +28,29 @@ defmodule WhisperLiveWeb.AudioChannel do
{:noreply, socket}
end
# def handle_in("stop_audio", _payload, socket) do
# Logger.info("🛑 Grabación detenida por cliente")
# ref = socket_id(socket)
# case AudioFullBuffer.get_all(ref) do
# [{rate, _} | _] = chunks ->
# merged = chunks |> Enum.map(fn {_, bin} -> bin end) |> IO.iodata_to_binary()
# filename = "recordings/recording_#{System.system_time(:millisecond)}.wav"
# File.mkdir_p!("recordings")
# File.write!(filename, encode_wav(merged, rate))
# whisper_large(filename)
# File.rm!(filename)
# _ ->
# Logger.warning("No se recibieron chunks de audio")
# end
# AudioStreamBuffer.stop(ref)
# AudioFullBuffer.stop(ref)
# WhisperLive.Transcriber.stop(ref)
# {:noreply, socket}
# end
def handle_in("stop_audio", _payload, socket) do
Logger.info("🛑 Grabación detenida por cliente")
@ -32,7 +65,7 @@ defmodule WhisperLiveWeb.AudioChannel do
whisper_large(filename)
File.rm!(filename)
_ ->
Logger.warning("⚠️ No se recibieron chunks de audio")
Logger.warning("No se recibieron chunks de audio")
end
AudioBuffer.stop(ref)

View File

@ -65,21 +65,22 @@ defmodule WhisperLiveWeb.Live.Recorder do
</button>
</div>
<div id="status" class="text-sm text-gray-600"></div>
<div id="status" class="hidden"></div>
<div id="transcriptionContainer" class="space-y-2">
<div class="p-2 bg-gray-100 rounded shadow">
<h2 class="text-sm font-semibold text-gray-700 mb-1">🟠 Transcripción en vivo</h2>
<p id="transcription" class="text-orange-600 whitespace-pre-wrap"><%= @transcription %></p>
<div id="transcriptionContainer" class="w-full max-w-2xl space-y-4">
<div class="p-4 bg-gray-100 rounded shadow-md">
<h2 class="text-sm font-semibold text-gray-700 mb-2">🟠 Transcripción en vivo</h2>
<p id="transcription" class="text-orange-600 whitespace-pre-wrap break-words text-sm leading-relaxed"><%= @transcription %></p>
</div>
<%= if @transcription_m != "" do %>
<div class="p-2 bg-gray-100 rounded shadow">
<h2 class="text-sm font-semibold text-gray-700 mb-1">✅ Transcripción mejorada</h2>
<p class="text-green-600 whitespace-pre-wrap"><%= @transcription_m %></p>
<div class="p-4 bg-gray-100 rounded shadow-md">
<h2 class="text-sm font-semibold text-gray-700 mb-2">✅ Transcripción mejorada</h2>
<p class="text-green-600 whitespace-pre-wrap break-words text-sm leading-relaxed"><%= @transcription_m %></p>
</div>
<% end %>
</div>
<script type="module">
import { Socket } from "https://cdn.skypack.dev/phoenix"
@ -157,8 +158,8 @@ defmodule WhisperLiveWeb.Live.Recorder do
const base64 = encodeBase64(new Uint8Array(merged.buffer))
channel.push("audio_chunk", { data: base64, sample_rate: sampleRate })
console.log("📤 Enviado chunk")
}, 2000)
console.log("Chunk enviado")
}, 1000)
}
function stopRecording() {