Transcripción en vivo + transcripción mejorada

This commit is contained in:
2025-07-16 15:50:13 +00:00
parent 8386b685d6
commit 89168522b6
12 changed files with 293 additions and 223 deletions

View File

@ -3,8 +3,84 @@
@import "tailwindcss/utilities";
/* This file is for your main application CSS */
/* Monospace text that preserves whitespace and line breaks. */
.realtime {
white-space: pre-wrap;
font-family: monospace;
margin-top: 1em;
}
/* Page shell: a flex container that centres its children on both axes
   within the full viewport height. */
body {
background-color: #f4f4f9;
color: #333;
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
display: flex;
align-items: center;
justify-content: center;
height: 100vh;
margin: 0;
}
/* Vertical stack, capped at 700px wide, with padding included in its width. */
#container {
display: flex;
flex-direction: column;
align-items: center;
width: 100%;
max-width: 700px;
padding: 20px;
box-sizing: border-box;
gap: 20px; /* Vertical space between stacked items */
height: 90%; /* Percentage of the parent's height, not a fixed value; NOTE(review): confirm the parent has a definite height */
}
/* Centred blue status line. */
#status {
color: #0056b3;
font-size: 20px;
text-align: center;
}
/* Bordered, rounded panel for the transcription text. */
#transcriptionContainer {
height: auto; /* Sized by its content; NOTE(review): an earlier comment described a fixed ~3-line height — confirm intent */
overflow-y: auto;
width: 100%;
padding: 10px;
box-sizing: border-box;
background-color: #f9f9f9;
border: 1px solid #ddd;
border-radius: 5px;
}
/* Transcription text; long words are allowed to break. */
#transcription {
font-size: 18px;
line-height: 1.6;
color: #333;
word-wrap: break-word;
}
/* Same panel styling as #transcriptionContainer, but with a fixed height
   and vertical scrolling for overflow. */
#fullTextContainer {
height: 150px; /* Fixed height to prevent layout shift */
overflow-y: auto;
width: 100%;
padding: 10px;
box-sizing: border-box;
background-color: #f9f9f9;
border: 1px solid #ddd;
border-radius: 5px;
}
/* Green, semi-bold text; long words are allowed to break. */
#fullText {
color: #4CAF50;
font-size: 18px;
font-weight: 600;
word-wrap: break-word;
}
/* Blue, semi-bold highlight for elements carrying the last-word class. */
.last-word {
color: #007bff;
font-weight: 600;
}
/* Base look for every <button> element: blue background, white text,
   soft shadow, and an animated background-color change. */
button {
padding: 12px 24px;
font-size: 16px;
cursor: pointer;
border: none;
border-radius: 5px;
margin: 5px;
transition: background-color 0.3s ease;
color: #fff;
background-color: #0056b3;
box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
}
/* Lighter blue while hovered. */
button:hover {
background-color: #007bff;
}
/* Grey background and a not-allowed cursor while disabled. */
button:disabled {
background-color: #cccccc;
cursor: not-allowed;
}

View File

@ -9,6 +9,8 @@ defmodule WhisperLive.AudioBuffer do
# Sends a synchronous :get_all call to the process registered under `ref`
# and returns its reply.
def get_all(ref) do
  ref
  |> via()
  |> GenServer.call(:get_all)
end
# Sends a synchronous :clear call to the process registered under `ref`
# and returns its reply.
def clear(ref) do
  ref
  |> via()
  |> GenServer.call(:clear)
end
# Stops the process registered under `ref` via GenServer.stop/1.
def stop(ref) do
  ref
  |> via()
  |> GenServer.stop()
end
# Builds the `:via` tuple that names `ref` inside WhisperLive.AudioRegistry.
defp via(ref) do
  registry_key = {WhisperLive.AudioRegistry, ref}
  {:via, Registry, registry_key}
end
@ -20,4 +22,6 @@ defmodule WhisperLive.AudioBuffer do
# Prepends the incoming chunk to the list held as state; no reply is sent.
def handle_cast({:append, new_chunk}, buffered) do
  {:noreply, [new_chunk | buffered]}
end
# :get_all replies with the state list reversed and keeps the state unchanged.
def handle_call(:get_all, _from, buffered) do
  ordered = Enum.reverse(buffered)
  {:reply, ordered, buffered}
end

# :clear replies :ok and replaces the state with an empty list.
def handle_call(:clear, _from, _buffered) do
  {:reply, :ok, []}
end
end

View File

@ -33,7 +33,7 @@ defmodule WhisperLive.Transcriber do
case send_to_whisper(tmpfile) do
{:ok, response} ->
PubSub.broadcast(WhisperLive.PubSub, "transcription:#{ref}", {:transcription, response})
PubSub.broadcast(WhisperLive.PubSub, "transcription", {:transcription, response})
{:error, reason} ->
Logger.warning("Realtime transcription error: #{inspect(reason)}")
@ -90,7 +90,7 @@ defmodule WhisperLive.Transcriber do
end
defp send_to_whisper(filepath) do
url = "http://localhost:4000/infer"
url = "http://localhost:4000/tiny"
{:ok, file_bin} = File.read(filepath)
filename = Path.basename(filepath)
@ -108,9 +108,17 @@ defmodule WhisperLive.Transcriber do
:httpc.request(:post, {url, headers, 'multipart/form-data; boundary=----ElixirBoundary', body}, [], [])
|> case do
{:ok, {{_, 200, _}, _headers, body}} -> {:ok, to_string(body)}
{:ok, {{_, status, _}, _, body}} -> {:error, {:http_error, status, to_string(body)}}
error -> {:error, error}
{:ok, {{_, 200, _}, _headers, body}} ->
# Logger.info("en transcriber --------------------------\n -> > #{IO.iodata_to_binary(body)}")
# Phoenix.PubSub.broadcast(WhisperLive.PubSub, "transcription", {:transcription, "#{IO.iodata_to_binary(body)}"})
{:ok, "#{IO.iodata_to_binary(body)}"}
{:ok, {{_, status, _}, _, body}} ->
{:error, {:http_error, status,"#{IO.iodata_to_binary(body)}"}}
error ->
{:error, error}
end
end
end

View File

@ -11,31 +11,13 @@ defmodule WhisperLiveWeb.AudioChannel do
{:ok, socket}
end
def handle_in("audio_chunk", %{"data" => base64_audio, "sample_rate" => sample_rate}, socket) do
# 1. Decodificas el audio base64
{:ok, bin} = Base.decode64(base64_audio)
# 2. Guardas o procesas el chunk de audio
# Podrías escribirlo en un archivo temporal para enviar a Whisper
tmpfile = tmp_path("chunk_#{socket.assigns.ref}")
:ok = File.write!(tmpfile, encode_wav(bin, sample_rate))
# 3. Llamas a la transcripción del chunk (podría ser sync o async)
case send_to_whisper(tmpfile) do
{:ok, transcription} ->
# 4. Envías el texto parcial por PubSub o Push a LiveView/cliente
Phoenix.PubSub.broadcast(YourApp.PubSub, "transcription:#{socket.assigns.ref}", {:transcription, transcription})
{:error, reason} ->
Logger.error("Error en transcripción parcial: #{inspect(reason)}")
end
File.rm(tmpfile)
def handle_in("audio_chunk", %{"data" => data, "sample_rate" => rate}, socket) do
{:ok, binary} = Base.decode64(data)
AudioBuffer.append(socket_id(socket), {rate, binary})
Logger.info("📦 Chunk recibido: #{byte_size(binary)} bytes, sample_rate: #{rate}")
{:noreply, socket}
end
def handle_in("stop_audio", _payload, socket) do
Logger.info("🛑 Grabación detenida por cliente")
@ -47,16 +29,8 @@ defmodule WhisperLiveWeb.AudioChannel do
filename = "recordings/recording_#{System.system_time(:millisecond)}.wav"
File.mkdir_p!("recordings")
File.write!(filename, encode_wav(merged, rate))
Logger.info("💾 Audio guardado en #{filename}")
# 🔁 Transcribir automáticamente
case send_to_whisper(filename) do
{:ok, response} ->
Logger.info("📝 Transcripción recibida: #{response}")
{:error, reason} ->
Logger.error("❌ Error al transcribir: #{inspect(reason)}")
end
whisper_large(filename)
File.rm!(filename)
_ ->
Logger.warning("⚠️ No se recibieron chunks de audio")
end
@ -93,9 +67,8 @@ defmodule WhisperLiveWeb.AudioChannel do
>> <> data
end
defp send_to_whisper(filepath) do
url = "http://localhost:4000/infer"
defp whisper_large(filepath) do
url = "http://localhost:4000/large"
{:ok, file_bin} = File.read(filepath)
filename = Path.basename(filepath)
@ -103,8 +76,7 @@ defmodule WhisperLiveWeb.AudioChannel do
{'Content-Type', 'multipart/form-data; boundary=----ElixirBoundary'}
]
body =
[
body = [
"------ElixirBoundary\r\n",
"Content-Disposition: form-data; name=\"file\"; filename=\"#{filename}\"\r\n",
"Content-Type: audio/wav\r\n\r\n",
@ -115,20 +87,16 @@ defmodule WhisperLiveWeb.AudioChannel do
:httpc.request(:post, {url, headers, 'multipart/form-data; boundary=----ElixirBoundary', body}, [], [])
|> case do
{:ok, {{_, 200, _}, _headers, body}} ->
{:ok, to_string(body)}
# Logger.info("transcripcion mejorada --------------------------\n -> > #{IO.iodata_to_binary(body)}")
Phoenix.PubSub.broadcast(WhisperLive.PubSub, "transcription", {:transcription_m, "#{IO.iodata_to_binary(body)}"})
{:ok, "#{IO.iodata_to_binary(body)}"}
{:ok, {{_, status, _}, _, body}} ->
{:error, {:http_error, status, to_string(body)}}
{:error, {:http_error, status, IO.iodata_to_binary(body)}}
error ->
{:error, error}
end
end
# Returns a path inside the system temp directory of the form
# "<prefix>_<positive unique integer>.wav".
defp tmp_path(prefix) do
  suffix = Integer.to_string(:erlang.unique_integer([:positive]))
  basename = prefix <> "_" <> suffix <> ".wav"
  tmp_dir = System.tmp_dir!()
  Path.join(tmp_dir, basename)
end
end

View File

@ -1,32 +1,5 @@
<header class="px-4 sm:px-6 lg:px-8">
<div class="flex items-center justify-between border-b border-zinc-100 py-3 text-sm">
<div class="flex items-center gap-4">
<a href="/">
<img src={~p"/images/logo.svg"} width="36" />
</a>
<p class="bg-brand/5 text-brand rounded-full px-2 font-medium leading-6">
v{Application.spec(:phoenix, :vsn)}
</p>
</div>
<div class="flex items-center gap-4 font-semibold leading-6 text-zinc-900">
<a href="https://twitter.com/elixirphoenix" class="hover:text-zinc-700">
@elixirphoenix
</a>
<a href="https://github.com/phoenixframework/phoenix" class="hover:text-zinc-700">
GitHub
</a>
<a
href="https://hexdocs.pm/phoenix/overview.html"
class="rounded-lg bg-zinc-100 px-2 py-1 hover:bg-zinc-200/80"
>
Get Started <span aria-hidden="true">&rarr;</span>
</a>
</div>
</div>
</header>
<main class="px-4 py-20 sm:px-6 lg:px-8">
<div class="mx-auto max-w-2xl">
<.flash_group flash={@flash} />
<main>
<div>
{@inner_content}
</div>
</main>

View File

@ -11,7 +11,7 @@
<script defer phx-track-static type="text/javascript" src={~p"/assets/app.js"}>
</script>
</head>
<body class="bg-white">
<body>
{@inner_content}
</body>
</html>

View File

@ -3,24 +3,49 @@ defmodule WhisperLiveWeb.Live.Recorder do
alias Phoenix.PubSub
# Initialises the recorder LiveView: subscribes to transcription topics on
# WhisperLive.PubSub and sets empty :transcription / :transcription_m assigns.
# NOTE(review): this span looks like a rendered diff with removed and added
# lines interleaved — confirm which statements belong to the current revision.
def mount(_, _, socket) do
# Subscribes to the per-socket topic, only when the socket is connected.
if connected?(socket), do: PubSub.subscribe(WhisperLive.PubSub, "transcription:#{socket_id(socket)}")
# NOTE(review): this tuple is not the last expression, so its value is discarded.
{:ok, assign(socket, transcription: "")}
# NOTE(review): unlike the subscribe above, this one is not guarded by
# connected?/1 — confirm that subscribing on a non-connected mount is intended.
PubSub.subscribe(WhisperLive.PubSub, "transcription")
socket =
socket
|> assign(:transcription, "")
|> assign(:transcription_m, "")
{:ok, socket}
end
# Handles a {:transcription, raw_json} message: decodes the JSON payload, takes
# the text of the first entry under "chunks", and appends to :transcription the
# part of that text that does not repeat the current assign as a prefix.
# NOTE(review): this span looks like a rendered diff with removed and added
# lines interleaved — confirm which statements belong to the current revision.
def handle_info({:transcription, raw_json}, socket) do
# Debug output of the raw payload.
IO.inspect(raw_json, label: "en vivo ---------------->\n")
# Raises if the payload is not valid JSON (Jason.decode!/1).
new_text =
raw_json
|> Jason.decode!()
|> get_in(["chunks", Access.at(0), "text"])
# NOTE(review): this tuple is not the last expression, so its value is discarded.
{:noreply, update(socket, :transcription, &(&1 <> " " <> new_text))}
old_text = socket.assigns.transcription
# Strip the already-included text from the beginning
added_part = String.replace_prefix(new_text, old_text, "")
{:noreply, update(socket, :transcription, &(&1 <> added_part))}
end
# Handles a {:transcription_m, raw_json} message carrying the improved
# transcription as a JSON string.
#
# When the payload decodes to a map whose "chunks" list starts with an entry
# holding a binary "text", that text is appended (space-separated) to the
# :transcription_m assign. Any other payload (invalid JSON, missing or empty
# "chunks", non-binary text) leaves the socket unchanged instead of raising,
# so a single bad message does not crash the LiveView and discard the
# transcript gathered so far.
def handle_info({:transcription_m, raw_json}, socket) do
  IO.inspect(raw_json, label: "meojada ---------------->\n")

  case Jason.decode(raw_json) do
    {:ok, %{"chunks" => [%{"text" => new_text} | _rest]}} when is_binary(new_text) ->
      {:noreply, update(socket, :transcription_m, &(&1 <> " " <> new_text))}

    _unexpected ->
      # Nothing usable to append; keep the current assigns.
      {:noreply, socket}
  end
end
# Handles the "start_recording" click: clears both transcription assigns and
# queues the "start-recording" client event.
#
# push_event/3 returns an updated socket; the event is only delivered when
# that returned socket is the one handed back in the reply tuple, so it is
# threaded through the pipeline rather than discarded.
def handle_event("start_recording", _params, socket) do
  socket =
    socket
    |> assign(transcription: "", transcription_m: "")
    |> push_event("start-recording", %{})

  {:noreply, socket}
end
def handle_event("stop_recording", _params, socket) do
push_event(socket, "stop-recording", %{})
{:noreply, socket}
@ -31,14 +56,30 @@ defmodule WhisperLiveWeb.Live.Recorder do
def render(assigns) do
~H"""
<div id="recorder" data-hook="recorder">
<button id="startButton" phx-click="start_recording">Start Recording</button>
<button id="stopButton" phx-click="stop_recording">Stop Recording</button>
<div id="transcriptionContainer">
<div id="transcription" class="realtime"><%= @transcription %></div>
<div class="flex space-x-2">
<button id="startButton" phx-click="start_recording" class="px-4 py-2 bg-blue-500 text-white rounded hover:bg-blue-600">
Start Recording
</button>
<button id="stopButton" phx-click="stop_recording" class="px-4 py-2 bg-red-500 text-white rounded hover:bg-red-600">
Stop Recording
</button>
</div>
<div id="status" class="realtime"></div>
<div id="status" class="text-sm text-gray-600"></div>
<div id="transcriptionContainer" class="space-y-2">
<div class="p-2 bg-gray-100 rounded shadow">
<h2 class="text-sm font-semibold text-gray-700 mb-1">🟠 Transcripción en vivo</h2>
<p id="transcription" class="text-orange-600 whitespace-pre-wrap"><%= @transcription %></p>
</div>
<%= if @transcription_m != "" do %>
<div class="p-2 bg-gray-100 rounded shadow">
<h2 class="text-sm font-semibold text-gray-700 mb-1">✅ Transcripción mejorada</h2>
<p class="text-green-600 whitespace-pre-wrap"><%= @transcription_m %></p>
</div>
<% end %>
</div>
<script type="module">
import { Socket } from "https://cdn.skypack.dev/phoenix"

Binary file not shown.