Transcripción realtime y cambio a la transcripción large. Conexión al modelo large v3.
This commit is contained in:
@ -30,7 +30,7 @@
|
||||
text-align: center;
|
||||
}
|
||||
#transcriptionContainer {
|
||||
height: 90px; /* Fixed height for approximately 3 lines of text */
|
||||
height: auto; /* Grows with the transcription content (no longer a fixed 3-line height) */
|
||||
overflow-y: auto;
|
||||
width: 100%;
|
||||
padding: 10px;
|
||||
|
@ -22,9 +22,20 @@ import {Socket} from "phoenix"
|
||||
import {LiveSocket} from "phoenix_live_view"
|
||||
import topbar from "../vendor/topbar"
|
||||
import SttRecorder from "./stt_recorder.js";
|
||||
let csrfToken = document.querySelector("meta[name='csrf-token']").getAttribute("content");
|
||||
|
||||
let Hooks = {};
|
||||
|
||||
// LiveView hook: relays the DOM "audio_path" CustomEvent dispatched on the
// hooked element to the server as an "audio_path" LiveView event.
Hooks.AudioPathHook = {
  mounted() {
    // Arrow function so `this` keeps pointing at the hook instance.
    const forwardAudioPath = ({ detail }) => {
      this.pushEvent("audio_path", { audio_path: detail.audio_path });
    };

    this.el.addEventListener("audio_path", forwardAudioPath);
  }
};
|
||||
|
||||
// Create the LiveView socket.
//
// Hooks are looked up by the key they have at the top level of `hooks`, so the
// entries of the `Hooks` registry are spread in next to SttRecorder. Passing
// the registry itself (`{ SttRecorder, Hooks }`) would register a single hook
// named "Hooks" and leave AudioPathHook unavailable under its own name.
let liveSocket = new LiveSocket("/live", Socket, {
  hooks: { SttRecorder, ...Hooks },
  params: { _csrf_token: csrfToken }
});
|
||||
|
||||
|
@ -18,6 +18,7 @@ config :recognition_VAD, Recognition_VADWeb.Endpoint,
|
||||
formats: [html: Recognition_VADWeb.ErrorHTML, json: Recognition_VADWeb.ErrorJSON],
|
||||
layout: false
|
||||
],
|
||||
server: true,
|
||||
pubsub_server: Recognition_VAD.PubSub,
|
||||
live_view: [signing_salt: "MLX284g+"]
|
||||
|
||||
|
@ -13,6 +13,7 @@ defmodule Recognition_VAD.Application do
|
||||
{Phoenix.PubSub, name: Recognition_VAD.PubSub},
|
||||
Recognition_VAD.AudioProcessor,
|
||||
Recognition_VAD.WhisperStreamer,
|
||||
Recognition_VAD.LargeTranscriber,
|
||||
|
||||
# Start the Finch HTTP client for sending emails
|
||||
{Finch, name: Recognition_VAD.Finch},
|
||||
|
@ -11,22 +11,29 @@ defmodule Recognition_VAD.AudioProcessor do
|
||||
end
|
||||
|
||||
# Stores an incoming audio chunk and remembers the sample rate it came with.
#
# Chunks are prepended, so `state.buffer` is ordered newest-first. The buffer
# is not capped: the former `Enum.take(100)` binding was immediately rebound
# (dead code) and has been dropped.
def handle_cast({:chunk, binary_audio, sample_rate}, state) do
  Logger.info("🟡 Recibido chunk de #{byte_size(binary_audio)} bytes a #{sample_rate} Hz")

  {:noreply, %{state | buffer: [binary_audio | state.buffer], sample_rate: sample_rate}}
end
|
||||
|
||||
|
||||
# Writes the buffered chunks to a timestamped WAV file, hands that file to the
# large-model transcriber and announces it on the "audio_output" PubSub topic.
def handle_cast(:save_wav, state) do
  unix_now = DateTime.to_unix(DateTime.utc_now())
  filename = "recording_#{unix_now}.wav"
  chunks = state.buffer
  rate = state.sample_rate

  Recognition_VAD.WavWriter.write_pcm_chunks_to_wav(chunks, rate, filename)
  Logger.info("💾 Guardado archivo: #{filename}")
  Recognition_VAD.LargeTranscriber.improve_transcription(filename)

  # Tell every subscriber of "audio_output" where the recording was saved.
  saved_event = {:audio_saved, %{path: filename}}
  Phoenix.PubSub.broadcast(Recognition_VAD.PubSub, "audio_output", saved_event)

  {:noreply, state}
end
|
||||
|
||||
# Empties the audio buffer and zeroes the sample rate so the next recording
# starts from scratch.
def handle_cast(:reset, state) do
  Logger.info("🔄 Reset del buffer de audio para nueva grabación")

  cleared = %{state | buffer: [], sample_rate: 0}
  {:noreply, cleared}
end
|
||||
|
||||
end
|
||||
|
67
recognition_VAD/lib/recognition_VAD/large_transcriber.ex
Normal file
67
recognition_VAD/lib/recognition_VAD/large_transcriber.ex
Normal file
@ -0,0 +1,67 @@
|
||||
defmodule Recognition_VAD.LargeTranscriber do
  @moduledoc """
  GenServer that re-transcribes a saved recording with the large Whisper model.

  A request arrives through `improve_transcription/1`. The server broadcasts
  `{:large_path, :info, path}` on the `"large"` PubSub topic, runs the
  transcription script through `wsl` and, when the script exits with status 0,
  broadcasts `{:transcription_improved, :info, text}` on the same topic.

  The script location and the directory the recordings are resolved against
  default to the values below and can be overridden at runtime:

      config :recognition_VAD, Recognition_VAD.LargeTranscriber,
        script_path: "/path/to/large_transcribe.sh",
        recordings_dir: "/path/to/recordings"
  """

  use GenServer
  require Logger

  @default_model "ggml-large-v3-turbo.bin"
  @script_path "/home/aime-pc2/i_m/whisper.cpp/large_transcribe.sh"
  @recordings_dir "/mnt/c/Users/rolan/i_m/voice_recognition/recognition_VAD"

  # Output lines starting with any of these prefixes are treated as tool
  # diagnostics rather than transcript text.
  @noise_prefixes ["whisper_", "system_info", "main: "]

  def start_link(_opts) do
    GenServer.start_link(__MODULE__, %{}, name: __MODULE__)
  end

  @impl true
  def init(state) do
    {:ok, state}
  end

  @doc """
  Asynchronously requests an improved transcription of `audio_path` with the
  large model. Returns immediately; results are published on the `"large"`
  PubSub topic.
  """
  @spec improve_transcription(String.t()) :: :ok
  def improve_transcription(audio_path) do
    GenServer.cast(__MODULE__, {:improve, audio_path})
  end

  @impl true
  def handle_cast({:improve, path}, state) do
    Logger.info("🚀 LargeTranscriber recibió la ruta: #{path}")

    # The script is given the path inside the recordings directory, while
    # subscribers are told the path exactly as it was requested.
    large_path = "#{config(:recordings_dir, @recordings_dir)}/#{path}"
    Phoenix.PubSub.broadcast(Recognition_VAD.PubSub, "large", {:large_path, :info, "#{path}"})

    transcribe(large_path, @default_model)

    {:noreply, state}
  end

  @doc """
  Runs the transcription script for `path` with `model` through `wsl`.

  Returns `{:ok, text}` and broadcasts `{:transcription_improved, :info, text}`
  on the `"large"` topic when the script exits with status 0; otherwise logs
  the combined stdout/stderr output and returns `{:error, output}`.
  """
  @spec transcribe(String.t(), String.t()) :: {:ok, String.t()} | {:error, String.t()}
  def transcribe(path, model) do
    args = [config(:script_path, @script_path), path, model]

    case System.cmd("wsl", args, stderr_to_stdout: true) do
      {output, 0} ->
        text = extract_transcription(output)

        Phoenix.PubSub.broadcast(
          Recognition_VAD.PubSub,
          "large",
          {:transcription_improved, :info, "#{text}"}
        )

        {:ok, text}

      {error_output, _} ->
        Logger.error("❌ Error al transcribir con whisper: #{error_output}")
        {:error, error_output}
    end
  end

  # Keeps only the lines that look like transcript text and joins them with
  # single spaces.
  defp extract_transcription(output) do
    output
    |> String.split("\n")
    |> Enum.filter(&transcript_line?/1)
    |> Enum.join(" ")
    |> String.trim()
  end

  # A transcript line has at least one letter or digit and does not start with
  # a diagnostic prefix.
  defp transcript_line?(line) do
    line =~ ~r/[\p{L}\p{N}]/u and not String.starts_with?(line, @noise_prefixes)
  end

  # Reads an optional runtime override for this module from the application
  # environment, falling back to the built-in default.
  defp config(key, default) do
    :recognition_VAD
    |> Application.get_env(__MODULE__, [])
    |> Keyword.get(key, default)
  end
end
|
@ -11,7 +11,8 @@ defmodule Recognition_VAD.Whisper do
|
||||
case System.cmd("wsl", args, stderr_to_stdout: true) do
|
||||
{output, 0} ->
|
||||
text = extract_transcription(output)
|
||||
Logger.info("📝 Transcripción: #{text}")
|
||||
Logger.info("📝 Transcripción real time: #{text}")
|
||||
|
||||
{:ok, text}
|
||||
|
||||
{error_output, _} ->
|
||||
|
@ -2,7 +2,7 @@ defmodule Recognition_VAD.WhisperStreamer do
|
||||
use GenServer
|
||||
require Logger
|
||||
|
||||
@transcribe_interval 2000 # cada 2 segundos
|
||||
@transcribe_interval 1000 # cada 1 segundo
|
||||
@max_chunks 100 # máximo a mantener en memoria
|
||||
|
||||
def start_link(_opts) do
|
||||
@ -47,9 +47,15 @@ defmodule Recognition_VAD.WhisperStreamer do
|
||||
end)
|
||||
|
||||
schedule_transcription()
|
||||
{:noreply, %{state | chunks: []}}
|
||||
|
||||
# 👉 Conservamos un 25% del audio anterior para contexto
|
||||
overlap_chunks =
|
||||
Enum.take(Enum.reverse(chunks), trunc(length(chunks) * 0.25))
|
||||
|
||||
{:noreply, %{state | chunks: overlap_chunks}}
|
||||
end
|
||||
|
||||
|
||||
# Arms a timer that delivers :transcribe_timer to this process after
# @transcribe_interval milliseconds.
defp schedule_transcription do
  interval_ms = @transcribe_interval
  Process.send_after(self(), :transcribe_timer, interval_ms)
end
|
||||
|
@ -1,11 +1,13 @@
|
||||
defmodule Recognition_VADWeb.DataChannel do
|
||||
use Phoenix.Channel
|
||||
require Logger
|
||||
|
||||
# Accepts joins on "data:lobby" and subscribes the channel process to the
# "audio_output" PubSub topic.
def join("data:lobby", _params, socket) do
  pubsub = Recognition_VAD.PubSub
  Phoenix.PubSub.subscribe(pubsub, "audio_output")

  {:ok, socket}
end
|
||||
|
||||
|
||||
# Parcial
|
||||
def handle_info({:realtime, msg}, socket) do
|
||||
push(socket, "realtime", msg)
|
||||
@ -14,15 +16,27 @@ defmodule Recognition_VADWeb.DataChannel do
|
||||
|
||||
# Completo
|
||||
# Forwards a :broadcast_audio message to the client, first as a
# "transcription" event and then as a "realtime" event.
def handle_info({:broadcast_audio, msg}, socket) do
  Enum.each(["transcription", "realtime"], fn event ->
    push(socket, event, msg)
  end)

  {:noreply, socket}
end
|
||||
|
||||
# The saved-file notification needs no action in the channel; it is matched
# only so the message is consumed.
def handle_info({:audio_saved, %{path: _path}}, socket), do: {:noreply, socket}
|
||||
|
||||
# The client is starting a new recording: ask the AudioProcessor to reset.
def handle_in("start_recording", _params, socket) do
  processor = Recognition_VAD.AudioProcessor
  GenServer.cast(processor, :reset)

  {:noreply, socket}
end
|
||||
|
||||
# Recibe audio codificado en base64 (para transporte seguro)
|
||||
def handle_in("audio_chunk", %{"data" => base64_chunk, "sample_rate" => sample_rate}, socket) do
|
||||
Logger.debug("📥 Recibido audio_chunk con sample_rate=#{sample_rate}")
|
||||
case Base.decode64(base64_chunk) do
|
||||
{:ok, binary_audio} ->
|
||||
Recognition_VAD.WhisperStreamer.push_chunk(binary_audio, sample_rate)
|
||||
# GenServer.cast(Recognition_VAD.AudioProcessor, :save_wav)
|
||||
|
||||
GenServer.cast(Recognition_VAD.AudioProcessor, {:chunk, binary_audio, sample_rate}) # ✅ activa esta línea
|
||||
{:noreply, socket}
|
||||
|
||||
:error ->
|
||||
@ -32,11 +46,14 @@ defmodule Recognition_VADWeb.DataChannel do
|
||||
end
|
||||
|
||||
# The client finished recording: ask the AudioProcessor to save the audio.
# No reply is sent back on the channel.
def handle_in("save_audio", _params, socket) do
  processor = Recognition_VAD.AudioProcessor
  GenServer.cast(processor, :save_wav)

  {:noreply, socket}
end
|
||||
|
||||
|
||||
# Catch-all: any other client event is ignored.
def handle_in(_unknown, _payload, socket), do: {:noreply, socket}
|
||||
|
||||
end
|
||||
|
@ -17,7 +17,8 @@ defmodule Recognition_VADWeb.Endpoint do
|
||||
|
||||
socket "/socket", Recognition_VADWeb.UserSocket,
|
||||
websocket: true,
|
||||
longpoll: false
|
||||
longpoll: false,
|
||||
pubsub_server: Recognition_VAD.PubSub
|
||||
|
||||
# Serve at "/" the static files from "priv/static" directory.
|
||||
#
|
||||
|
@ -1,19 +1,81 @@
|
||||
defmodule Recognition_VADWeb.Stt.TestWithChannel do
|
||||
use Recognition_VADWeb, :live_view
|
||||
require Logger
|
||||
|
||||
# Sets the initial assigns and subscribes to the "large" PubSub topic, on which
# the large-model transcriber publishes its messages.
def mount(_params, _session, socket) do
  # mount/3 also runs for the initial static render; only the connected
  # LiveView process stays alive to receive broadcasts, so subscribe there.
  if Phoenix.LiveView.connected?(socket) do
    Phoenix.PubSub.subscribe(Recognition_VAD.PubSub, "large")
  end

  socket =
    assign(socket,
      improved_transcription: "",
      audio_path: nil,
      realtime_transcription: "",
      improving?: false,
      view_stop: false,
      view_start: true,
      stop_recording: false
    )

  {:ok, socket}
end
|
||||
|
||||
# "Start Recording" was clicked: hide the start control and show the stop one.
def handle_event("start", %{"value" => ""}, socket) do
  {:noreply, assign(socket, view_start: false, view_stop: true)}
end
|
||||
|
||||
# "Stop Recording" was clicked: flag the recording as stopped so the template
# can switch its status message.
def handle_event("stop_recording", %{"value" => ""}, socket) do
  IO.inspect("stop_recording event in LiveView ----------------------")

  {:noreply, assign(socket, stop_recording: true)}
end
|
||||
|
||||
# Remembers the path of the recording announced on the "large" topic.
def handle_info({:large_path, _level, large_path}, socket) do
  # IO.inspect/2 returns its argument, so the traced value is what gets stored.
  traced_path =
    IO.inspect(large_path, label: "large_path in live view ----------------------\n")

  {:noreply, assign(socket, :audio_path, traced_path)}
end
|
||||
|
||||
|
||||
# Shows the improved transcription and deletes the recording it came from.
#
# Deletion is best-effort: `audio_path` is still nil if no :large_path message
# was seen, and every LiveView subscribed to "large" receives this broadcast,
# so the file may already have been removed. Neither case may crash the view.
def handle_info({:transcription_improved, _level, text}, socket) do
  IO.inspect(text, label: "Log message received in LiveView ----------------------\n")

  case socket.assigns.audio_path do
    nil ->
      :ok

    path ->
      case File.rm(path) do
        :ok -> :ok
        {:error, :enoent} -> :ok
        {:error, reason} -> Logger.error("No se pudo borrar #{path}: #{inspect(reason)}")
      end
  end

  {:noreply, assign(socket, improved_transcription: text, improving?: true)}
end
|
||||
|
||||
def render(assigns) do
|
||||
~H"""
|
||||
<div id="container">
|
||||
<div id="status">Presioná "Start Recording"…</div>
|
||||
<button id="startButton">Start Recording</button>
|
||||
|
||||
<%= if @view_start == true do %>
|
||||
<button id="startButton" phx-click="start">Start Recording</button>
|
||||
<% else %>
|
||||
<button id="startButton" disabled>Start Recording</button>
|
||||
<% end %>
|
||||
|
||||
<%= if @view_stop == true do %>
|
||||
<button id="stopButton" phx-click="stop_recording">Stop Recording</button>
|
||||
<% else %>
|
||||
<button id="stopButton" disabled>Stop Recording</button>
|
||||
<% end %>
|
||||
|
||||
<%= case [@stop_recording, @improving?] do %>
|
||||
<% [true, false] -> %>
|
||||
<div id="status" class="px-3 py-1 text-xs font-medium leading-none font-bold text-blue-900 rounded-full animate-pulse">Mejorando transcripción...</div>
|
||||
<% [true, true] -> %>
|
||||
<div id="status">Transcripción Final.</div>
|
||||
<% _ -> %>
|
||||
<div id="status">Presioná "Start Recording"…</div>
|
||||
<% end %>
|
||||
|
||||
<div id="transcriptionContainer">
|
||||
<div id="transcription" class="realtime"></div>
|
||||
<%= if @improving? == false do %>
|
||||
<div>
|
||||
<div id="transcription" phx-update="ignore" class="realtime px-3 py-1 text-xs font-medium leading-none font-bold text-blue-900 rounded-full animate-pulse"></div>
|
||||
</div>
|
||||
<% else %>
|
||||
<div><%= @improved_transcription %></div>
|
||||
<% end %>
|
||||
</div>
|
||||
|
||||
<script type="module">
|
||||
@ -28,8 +90,8 @@ defmodule Recognition_VADWeb.Stt.TestWithChannel do
|
||||
let audioContext, mediaStream, mediaProcessor;
|
||||
|
||||
async function startRecording() {
|
||||
startButton.disabled = true;
|
||||
stopButton.disabled = false;
|
||||
//startButton.disabled = true;
|
||||
// stopButton.disabled = false;
|
||||
statusDiv.textContent = "🎙 Grabando…";
|
||||
transcriptionDiv.innerHTML = "";
|
||||
|
||||
@ -42,6 +104,7 @@ defmodule Recognition_VADWeb.Stt.TestWithChannel do
|
||||
.receive("ok", () => {
|
||||
statusDiv.textContent = "✅ Conectado a Phoenix STT";
|
||||
console.log("Canal conectado");
|
||||
channel.push("start_recording", {});
|
||||
})
|
||||
.receive("error", () => {
|
||||
statusDiv.textContent = "❌ Error al conectar canal";
|
||||
@ -65,7 +128,6 @@ defmodule Recognition_VADWeb.Stt.TestWithChannel do
|
||||
`;
|
||||
});
|
||||
|
||||
|
||||
// Frase completa (después de procesar chunks)
|
||||
channel.on("transcription", payload => {
|
||||
const sentence = payload.text.trim();
|
||||
@ -107,15 +169,28 @@ defmodule Recognition_VADWeb.Stt.TestWithChannel do
|
||||
}
|
||||
|
||||
// Stops audio capture, asks the server to save the recording and closes the
// channel and socket after a grace period.
function stopRecording() {
  stopButton.disabled = true;
  startButton.disabled = false;
  statusDiv.textContent = "🛑 Grabación detenida.";

  if (mediaProcessor) mediaProcessor.disconnect();
  if (audioContext) audioContext.close();
  if (mediaStream) mediaStream.getTracks().forEach(track => track.stop());

  // The channel has to stay joined while "save_audio" is pushed, so it is left
  // only in the delayed callback below, never before the push.
  if (channel) {
    channel.push("save_audio", {}).receive("ok", (resp) => {
      console.log("Recibí audio_path del canal:", resp.audio_path);
      const hookElement = document.getElementById("lv-container");
      if (hookElement && resp.audio_path) {
        hookElement.dispatchEvent(new CustomEvent("audio_path", { detail: { audio_path: resp.audio_path } }));
      }
    });

    // Wait 5 seconds before closing the channel and socket so late messages
    // can still arrive.
    setTimeout(() => {
      console.log("Cerrando canal y socket después de 5 segundos de espera para recibir mensajes tardíos...");
      channel.leave();
      if (socket) socket.disconnect();
    }, 5000);
  }
}
|
||||
|
||||
document.getElementById("startButton").onclick = startRecording;
|
||||
@ -139,5 +214,4 @@ defmodule Recognition_VADWeb.Stt.TestWithChannel do
|
||||
</div>
|
||||
"""
|
||||
end
|
||||
|
||||
end
|
||||
|
@ -19,6 +19,7 @@ defmodule Recognition_VADWeb.Router do
|
||||
|
||||
get "/", PageController, :home
|
||||
live "/sttest", Stt.TestWithChannel
|
||||
|
||||
end
|
||||
|
||||
# Other scopes may use custom stacks.
|
||||
|
Reference in New Issue
Block a user