minimo reconocimiento de voz

This commit is contained in:
2025-06-17 08:48:55 -03:00
commit 36fe9f603e
79 changed files with 7662 additions and 0 deletions

View File

@ -0,0 +1,3 @@
from .audio_recorder import AudioToTextRecorder
from .audio_recorder_client import AudioToTextRecorderClient
from .audio_input import AudioInput

View File

@ -0,0 +1,220 @@
from colorama import init, Fore, Style
from scipy.signal import butter, filtfilt, resample_poly
import pyaudio
import logging
# Default target sample rate in Hz (default for AudioInput's target_samplerate).
DESIRED_RATE = 16000
# Default number of frames requested per stream read (default chunk_size).
CHUNK_SIZE = 1024
# Default PyAudio sample-format constant (default audio_format).
AUDIO_FORMAT = pyaudio.paInt16
# Default number of channels opened on the input stream (default channels).
CHANNELS = 1
class AudioInput:
def __init__(
    self,
    input_device_index: int = None,
    debug_mode: bool = False,
    target_samplerate: int = DESIRED_RATE,
    chunk_size: int = CHUNK_SIZE,
    audio_format: int = AUDIO_FORMAT,
    channels: int = CHANNELS,
    resample_to_target: bool = True,
):
    """Remember the capture configuration; no audio resource is opened here.

    Args:
        input_device_index: Device index to record from; ``None`` makes
            ``setup()`` fall back to the default input device.
        debug_mode: When true, ``setup()`` prints progress information.
        target_samplerate: Sample rate ``setup()`` asks the device for.
        chunk_size: Frames per buffer / frames read by ``read_chunk()``.
        audio_format: PyAudio sample-format constant for the stream.
        channels: Number of channels opened on the stream.
        resample_to_target: Stored as-is; not consulted by this class's
            own methods.
    """
    # Handles that only exist once setup() has run.
    self.audio_interface = None
    self.stream = None
    self.device_sample_rate = None

    # Caller-supplied configuration, stored verbatim.
    self.input_device_index = input_device_index
    self.debug_mode = debug_mode
    self.target_samplerate = target_samplerate
    self.chunk_size = chunk_size
    self.audio_format = audio_format
    self.channels = channels
    self.resample_to_target = resample_to_target
def get_supported_sample_rates(self, device_index):
    """Return the standard sample rates accepted by the given input device.

    Every candidate rate is probed through ``is_format_supported`` using the
    device's reported ``maxInputChannels`` and this instance's
    ``audio_format``. A probe that raises an ordinary exception is treated
    as "rate not supported".

    Args:
        device_index: Index of the device to probe; ``self.audio_interface``
            must already be set.

    Returns:
        list: The supported rates, in ascending order (possibly empty).
    """
    standard_rates = (8000, 9600, 11025, 12000, 16000, 22050, 24000, 32000, 44100, 48000)
    device_info = self.audio_interface.get_device_info_by_index(device_index)
    max_channels = device_info.get('maxInputChannels')

    supported_rates = []
    for rate in standard_rates:
        try:
            is_supported = self.audio_interface.is_format_supported(
                rate,
                input_device=device_index,
                input_channels=max_channels,
                input_format=self.audio_format,
            )
        except Exception:
            # Deliberately not a bare ``except:`` so that KeyboardInterrupt
            # and SystemExit are not swallowed while probing.
            continue
        if is_supported:
            supported_rates.append(rate)
    return supported_rates
def _get_best_sample_rate(self, actual_device_index, desired_rate):
    """Pick the sample rate to open the device with.

    Preference order:
      1. ``desired_rate`` when the device supports it;
      2. otherwise the highest supported standard rate;
      3. otherwise (no standard rate supported) the device's reported
         ``defaultSampleRate``, or 44100 when that key is missing.

    Args:
        actual_device_index: Index of the device that will be opened.
        desired_rate: Preferred sample rate in Hz.

    Returns:
        int: The chosen sample rate. 44100 is returned (and a warning is
        logged) if querying the device fails.
    """
    try:
        device_info = self.audio_interface.get_device_info_by_index(actual_device_index)
        supported_rates = self.get_supported_sample_rates(actual_device_index)
        if desired_rate in supported_rates:
            return desired_rate
        if supported_rates:
            return max(supported_rates)
        # Nothing from the standard list is supported: use the device default
        # instead of letting max() raise on an empty list.
        return int(device_info.get('defaultSampleRate', 44100))
    except Exception as e:
        logging.warning(f"Error determining sample rate: {e}")
        return 44100  # Safe fallback
def list_devices(self):
    """Print every audio input device together with its supported sample rates.

    A temporary PyAudio instance is created for the enumeration and
    terminated afterwards. ``self.audio_interface`` is restored to whatever
    it was before the call, so an interface opened by ``setup()`` is neither
    replaced nor terminated by listing devices.
    """
    previous_interface = self.audio_interface
    probe_interface = None
    try:
        init()  # Initialize colorama
        probe_interface = pyaudio.PyAudio()
        # get_supported_sample_rates() reads self.audio_interface, so point
        # it at the temporary instance for the duration of the listing.
        self.audio_interface = probe_interface
        device_count = probe_interface.get_device_count()
        print("Available audio input devices:")
        for i in range(device_count):
            device_info = probe_interface.get_device_info_by_index(i)
            device_name = device_info.get('name')
            max_input_channels = device_info.get('maxInputChannels', 0)
            if max_input_channels <= 0:
                continue  # Only consider devices with input capabilities
            supported_rates = self.get_supported_sample_rates(i)
            print(f"{Fore.LIGHTGREEN_EX}Device {Style.RESET_ALL}{i}{Fore.LIGHTGREEN_EX}: {device_name}{Style.RESET_ALL}")
            if supported_rates:
                # Format each rate in cyan
                rates_formatted = ", ".join([f"{Fore.CYAN}{rate}{Style.RESET_ALL}" for rate in supported_rates])
                print(f" {Fore.YELLOW}Supported sample rates: {rates_formatted}{Style.RESET_ALL}")
            else:
                print(f" {Fore.YELLOW}Supported sample rates: None{Style.RESET_ALL}")
    except Exception as e:
        print(f"Error listing devices: {e}")
    finally:
        self.audio_interface = previous_interface
        if probe_interface is not None:
            probe_interface.terminate()
def setup(self):
    """Create the PyAudio interface and open the input stream.

    The device is ``self.input_device_index`` or, when that is ``None``, the
    default input device; the resolved index is written back to
    ``self.input_device_index``. The stream is opened at the rate chosen by
    ``_get_best_sample_rate`` (stored in ``self.device_sample_rate``).

    Returns:
        bool: ``True`` when the stream was opened. On any failure the error
        is printed, the PyAudio interface is terminated and
        ``self.audio_interface`` is reset to ``None``, and ``False`` is
        returned.
    """
    opened = False
    try:
        self.audio_interface = pyaudio.PyAudio()
        if self.debug_mode:
            print(f"Input device index: {self.input_device_index}")
        actual_device_index = (self.input_device_index if self.input_device_index is not None
                               else self.audio_interface.get_default_input_device_info()['index'])
        if self.debug_mode:
            print(f"Actual selected device index: {actual_device_index}")
        self.input_device_index = actual_device_index
        self.device_sample_rate = self._get_best_sample_rate(actual_device_index, self.target_samplerate)
        if self.debug_mode:
            print(f"Setting up audio on device {self.input_device_index} with sample rate {self.device_sample_rate}")
        try:
            self.stream = self.audio_interface.open(
                format=self.audio_format,
                channels=self.channels,
                rate=self.device_sample_rate,
                input=True,
                frames_per_buffer=self.chunk_size,
                input_device_index=self.input_device_index,
            )
        except Exception as e:
            print(f"Failed to initialize audio stream at {self.device_sample_rate} Hz: {e}")
        else:
            if self.debug_mode:
                print(f"Audio recording initialized successfully at {self.device_sample_rate} Hz")
            opened = True
    except Exception as e:
        print(f"Error initializing audio recording: {e}")

    if not opened and self.audio_interface is not None:
        # Both failure paths release the interface and drop the reference so
        # a later cleanup() does not terminate the same instance again.
        try:
            self.audio_interface.terminate()
        except Exception as e:
            print(f"Error releasing audio interface: {e}")
        self.audio_interface = None
    return opened
def lowpass_filter(self, signal, cutoff_freq, sample_rate):
    """Low-pass ``signal`` with a zero-phase 5th-order Butterworth filter.

    Args:
        signal: Audio samples to filter (array-like).
        cutoff_freq: Cutoff frequency in Hz.
        sample_rate: Sampling rate of ``signal`` in Hz.

    Returns:
        The filtered samples as returned by ``scipy.signal.filtfilt``.
    """
    # butter() takes the cutoff as a fraction of the Nyquist frequency,
    # i.e. of half the sampling rate.
    half_rate = sample_rate / 2.0
    numerator, denominator = butter(5, cutoff_freq / half_rate, btype='low', analog=False)
    # filtfilt runs the filter forwards and backwards, cancelling phase shift.
    return filtfilt(numerator, denominator, signal)
def resample_audio(self, pcm_data, target_sample_rate, original_sample_rate):
    """Convert ``pcm_data`` from ``original_sample_rate`` to ``target_sample_rate``.

    When the target rate is lower than the original one, the data is first
    passed through ``lowpass_filter`` with a cutoff of half the target rate;
    otherwise it is resampled directly. Resampling uses
    ``scipy.signal.resample_poly``.

    Args:
        pcm_data: Input audio samples (array-like).
        target_sample_rate: Desired output rate in Hz.
        original_sample_rate: Rate of ``pcm_data`` in Hz.

    Returns:
        The resampled samples.
    """
    is_downsampling = target_sample_rate < original_sample_rate
    if is_downsampling:
        source = self.lowpass_filter(pcm_data, target_sample_rate / 2, original_sample_rate)
    else:
        source = pcm_data
    return resample_poly(source, target_sample_rate, original_sample_rate)
def read_chunk(self):
    """Read ``self.chunk_size`` frames from the open stream and return them.

    The read is issued with ``exception_on_overflow=False``.
    """
    frames_wanted = self.chunk_size
    return self.stream.read(frames_wanted, exception_on_overflow=False)
def cleanup(self):
    """Stop and close the stream, terminate PyAudio, and drop both references.

    Each resource is released independently: a failure while stopping or
    closing the stream is printed but does not prevent the interface from
    being terminated. ``self.stream`` and ``self.audio_interface`` are always
    ``None`` afterwards, so calling this method again is a no-op.
    """
    # Detach first so the references are cleared even if releasing fails.
    stream, self.stream = self.stream, None
    interface, self.audio_interface = self.audio_interface, None

    if stream:
        try:
            try:
                stream.stop_stream()
            finally:
                # Still attempt to close when stopping raised.
                stream.close()
        except Exception as e:
            print(f"Error cleaning up audio resources: {e}")

    if interface:
        try:
            interface.terminate()
        except Exception as e:
            print(f"Error cleaning up audio resources: {e}")

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,881 @@
# Module-level switch: when True, record_and_send_audio prints a "." for
# every audio chunk it sends.
log_outgoing_chunks = False
# Module-level debug switch; start_server consults it before echoing the
# Windows launch command.
debug_mode = False
from typing import Iterable, List, Optional, Union
from urllib.parse import urlparse
from datetime import datetime
from websocket import WebSocketApp
from websocket import ABNF
import numpy as np
import subprocess
import threading
import platform
import logging
import struct
import base64
import wave
import json
import time
import sys
import os
# Import the AudioInput class
from .audio_input import AudioInput
# --- Server endpoints --------------------------------------------------------
DEFAULT_CONTROL_URL = "ws://127.0.0.1:8011"
DEFAULT_DATA_URL = "ws://127.0.0.1:8012"

# --- Transcription defaults --------------------------------------------------
INIT_MODEL_TRANSCRIPTION = "tiny"
INIT_MODEL_TRANSCRIPTION_REALTIME = "tiny"
INIT_REALTIME_PROCESSING_PAUSE = 0.2
INIT_REALTIME_INITIAL_PAUSE = 0.2

# --- Voice-activity defaults -------------------------------------------------
INIT_SILERO_SENSITIVITY = 0.4
INIT_WEBRTC_SENSITIVITY = 3
INIT_POST_SPEECH_SILENCE_DURATION = 0.6
INIT_MIN_LENGTH_OF_RECORDING = 0.5
INIT_MIN_GAP_BETWEEN_RECORDINGS = 0
INIT_PRE_RECORDING_BUFFER_DURATION = 1.0

# --- Wake-word defaults ------------------------------------------------------
INIT_WAKE_WORDS_SENSITIVITY = 0.6
INIT_WAKE_WORD_ACTIVATION_DELAY = 0.0
INIT_WAKE_WORD_TIMEOUT = 5.0
INIT_WAKE_WORD_BUFFER_DURATION = 0.1

# --- Audio / buffering defaults ----------------------------------------------
ALLOWED_LATENCY_LIMIT = 100
BUFFER_SIZE = 512
SAMPLE_RATE = 16000

# Buffer-overflow handling defaults to on everywhere except macOS ('Darwin').
INIT_HANDLE_BUFFER_OVERFLOW = platform.system() != 'Darwin'
class bcolors:
    """ANSI escape sequences used to colour and style terminal output."""

    # Foreground colours
    HEADER = '\033[95m'     # magenta
    OKBLUE = '\033[94m'     # blue
    OKCYAN = '\033[96m'     # cyan
    OKGREEN = '\033[92m'    # green
    WARNING = '\033[93m'    # yellow
    FAIL = '\033[91m'       # red

    # Text attributes
    ENDC = '\033[0m'        # reset everything back to the terminal default
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
def format_timestamp_ns(timestamp_ns: int) -> str:
    """Render a nanosecond epoch timestamp as local ``HH:MM:SS.mmm``.

    Args:
        timestamp_ns: Nanoseconds since the epoch (e.g. ``time.time_ns()``).

    Returns:
        The local wall-clock time with millisecond precision; the
        sub-millisecond part is truncated, not rounded.
    """
    whole_seconds, leftover_ns = divmod(timestamp_ns, 1_000_000_000)
    clock = datetime.fromtimestamp(whole_seconds).strftime("%H:%M:%S")
    return f"{clock}.{leftover_ns // 1_000_000:03d}"
class AudioToTextRecorderClient:
    """
    Client for a speech-to-text server reached over two WebSocket
    connections: a control channel and a data channel.

    The client can launch the `stt-server` command when no server answers,
    streams microphone audio captured through `AudioInput` (or chunks handed
    to `feed_audio`) to the server, and dispatches the server's JSON events
    to the callbacks supplied to the constructor.
    """
def __init__(self,
             model: str = INIT_MODEL_TRANSCRIPTION,
             download_root: str = None,
             language: str = "",
             compute_type: str = "default",
             input_device_index: int = None,
             gpu_device_index: Union[int, List[int]] = 0,
             device: str = "cuda",
             on_recording_start=None,
             on_recording_stop=None,
             on_transcription_start=None,
             ensure_sentence_starting_uppercase=True,
             ensure_sentence_ends_with_period=True,
             use_microphone=True,
             spinner=True,
             level=logging.WARNING,
             batch_size: int = 16,
             # Realtime transcription parameters
             enable_realtime_transcription=False,
             use_main_model_for_realtime=False,
             realtime_model_type=INIT_MODEL_TRANSCRIPTION_REALTIME,
             realtime_processing_pause=INIT_REALTIME_PROCESSING_PAUSE,
             init_realtime_after_seconds=INIT_REALTIME_INITIAL_PAUSE,
             on_realtime_transcription_update=None,
             on_realtime_transcription_stabilized=None,
             realtime_batch_size: int = 16,
             # Voice activation parameters
             silero_sensitivity: float = INIT_SILERO_SENSITIVITY,
             silero_use_onnx: bool = False,
             silero_deactivity_detection: bool = False,
             webrtc_sensitivity: int = INIT_WEBRTC_SENSITIVITY,
             post_speech_silence_duration: float = (
                 INIT_POST_SPEECH_SILENCE_DURATION
             ),
             min_length_of_recording: float = (
                 INIT_MIN_LENGTH_OF_RECORDING
             ),
             min_gap_between_recordings: float = (
                 INIT_MIN_GAP_BETWEEN_RECORDINGS
             ),
             pre_recording_buffer_duration: float = (
                 INIT_PRE_RECORDING_BUFFER_DURATION
             ),
             on_vad_start=None,
             on_vad_stop=None,
             on_vad_detect_start=None,
             on_vad_detect_stop=None,
             on_turn_detection_start=None,
             on_turn_detection_stop=None,
             # Wake word parameters
             wakeword_backend: str = "pvporcupine",
             openwakeword_model_paths: str = None,
             openwakeword_inference_framework: str = "onnx",
             wake_words: str = "",
             wake_words_sensitivity: float = INIT_WAKE_WORDS_SENSITIVITY,
             wake_word_activation_delay: float = (
                 INIT_WAKE_WORD_ACTIVATION_DELAY
             ),
             wake_word_timeout: float = INIT_WAKE_WORD_TIMEOUT,
             wake_word_buffer_duration: float = INIT_WAKE_WORD_BUFFER_DURATION,
             on_wakeword_detected=None,
             on_wakeword_timeout=None,
             on_wakeword_detection_start=None,
             on_wakeword_detection_end=None,
             on_recorded_chunk=None,
             debug_mode=False,
             handle_buffer_overflow: bool = INIT_HANDLE_BUFFER_OVERFLOW,
             beam_size: int = 5,
             beam_size_realtime: int = 3,
             buffer_size: int = BUFFER_SIZE,
             sample_rate: int = SAMPLE_RATE,
             initial_prompt: Optional[Union[str, Iterable[int]]] = None,
             initial_prompt_realtime: Optional[Union[str, Iterable[int]]] = None,
             suppress_tokens: Optional[List[int]] = [-1],
             print_transcription_time: bool = False,
             early_transcription_on_silence: int = 0,
             allowed_latency_limit: int = ALLOWED_LATENCY_LIMIT,
             no_log_file: bool = False,
             use_extended_logging: bool = False,
             # Server urls
             control_url: str = DEFAULT_CONTROL_URL,
             data_url: str = DEFAULT_DATA_URL,
             autostart_server: bool = True,
             output_wav_file: str = None,
             faster_whisper_vad_filter: bool = False,
             ):
    """Store the configuration, connect to the server and start recording.

    Every constructor argument is stored on the instance under the same
    name. ``start_server`` forwards part of them to the ``stt-server``
    command line; the ``on_*`` callables are invoked by the WebSocket
    message handlers.

    After storing the configuration the constructor:
      1. calls ``connect()`` (which may launch the server when
         ``autostart_server`` is true) and reports a failure on stderr;
      2. starts the microphone thread when ``use_microphone`` is true;
      3. when a server was already running, waits up to 10 seconds for the
         control connection and then pushes ``language`` and
         ``wake_word_activation_delay`` to it via ``set_parameter``.
    """
    # Set instance variables from constructor parameters
    self.model = model
    self.language = language
    self.compute_type = compute_type
    self.input_device_index = input_device_index
    self.gpu_device_index = gpu_device_index
    self.device = device
    self.on_recording_start = on_recording_start
    self.on_recording_stop = on_recording_stop
    self.on_transcription_start = on_transcription_start
    self.ensure_sentence_starting_uppercase = ensure_sentence_starting_uppercase
    self.ensure_sentence_ends_with_period = ensure_sentence_ends_with_period
    self.use_microphone = use_microphone
    self.spinner = spinner
    self.level = level
    self.batch_size = batch_size
    self.init_realtime_after_seconds = init_realtime_after_seconds
    self.realtime_batch_size = realtime_batch_size

    # Real-time transcription parameters
    self.enable_realtime_transcription = enable_realtime_transcription
    self.use_main_model_for_realtime = use_main_model_for_realtime
    self.download_root = download_root
    self.realtime_model_type = realtime_model_type
    self.realtime_processing_pause = realtime_processing_pause
    self.on_realtime_transcription_update = on_realtime_transcription_update
    self.on_realtime_transcription_stabilized = on_realtime_transcription_stabilized

    # Voice activation parameters
    self.silero_sensitivity = silero_sensitivity
    self.silero_use_onnx = silero_use_onnx
    self.silero_deactivity_detection = silero_deactivity_detection
    self.webrtc_sensitivity = webrtc_sensitivity
    self.post_speech_silence_duration = post_speech_silence_duration
    self.min_length_of_recording = min_length_of_recording
    self.min_gap_between_recordings = min_gap_between_recordings
    self.pre_recording_buffer_duration = pre_recording_buffer_duration
    self.on_vad_start = on_vad_start
    self.on_vad_stop = on_vad_stop
    self.on_vad_detect_start = on_vad_detect_start
    self.on_vad_detect_stop = on_vad_detect_stop
    self.on_turn_detection_start = on_turn_detection_start
    self.on_turn_detection_stop = on_turn_detection_stop

    # Wake word parameters
    self.wakeword_backend = wakeword_backend
    self.openwakeword_model_paths = openwakeword_model_paths
    self.openwakeword_inference_framework = openwakeword_inference_framework
    self.wake_words = wake_words
    self.wake_words_sensitivity = wake_words_sensitivity
    self.wake_word_activation_delay = wake_word_activation_delay
    self.wake_word_timeout = wake_word_timeout
    self.wake_word_buffer_duration = wake_word_buffer_duration
    self.on_wakeword_detected = on_wakeword_detected
    self.on_wakeword_timeout = on_wakeword_timeout
    self.on_wakeword_detection_start = on_wakeword_detection_start
    self.on_wakeword_detection_end = on_wakeword_detection_end
    self.on_recorded_chunk = on_recorded_chunk
    self.debug_mode = debug_mode
    self.handle_buffer_overflow = handle_buffer_overflow
    self.beam_size = beam_size
    self.beam_size_realtime = beam_size_realtime
    self.buffer_size = buffer_size
    self.sample_rate = sample_rate
    self.initial_prompt = initial_prompt
    self.initial_prompt_realtime = initial_prompt_realtime
    # Copy the list so instances never share (or mutate) the default
    # argument object.
    self.suppress_tokens = (
        list(suppress_tokens) if suppress_tokens is not None else None
    )
    self.print_transcription_time = print_transcription_time
    self.early_transcription_on_silence = early_transcription_on_silence
    self.allowed_latency_limit = allowed_latency_limit
    self.no_log_file = no_log_file
    self.use_extended_logging = use_extended_logging
    self.faster_whisper_vad_filter = faster_whisper_vad_filter

    # Server URLs
    self.control_url = control_url
    self.data_url = data_url
    self.autostart_server = autostart_server
    self.output_wav_file = output_wav_file

    # Instance variables
    self.muted = False
    self.recording_thread = None
    self.is_running = True
    self.connection_established = threading.Event()
    self.recording_start = threading.Event()
    self.final_text_ready = threading.Event()
    self.realtime_text = ""
    self.submitted_realtime_text = ""
    self.final_text = ""
    self._recording = False
    self.server_already_running = False
    self.wav_file = None
    self.request_counter = 0
    self.pending_requests = {}  # Map from request_id to threading.Event and value

    # Connection handles are created by connect(); define them up front so
    # shutdown() and on_close() can test them even if connecting fails
    # before they are assigned.
    self.control_ws = None
    self.data_ws = None
    self.control_ws_thread = None
    self.data_ws_thread = None

    if self.debug_mode:
        print("Checking STT server")
    if not self.connect():
        print("Failed to connect to the server.", file=sys.stderr)
    elif self.debug_mode:
        print("STT server is running and connected.")

    if self.use_microphone:
        self.start_recording()

    if self.server_already_running:
        if not self.connection_established.wait(timeout=10):
            print("Server connection not established within 10 seconds.")
        else:
            # A pre-existing server was not started with this client's
            # options, so push the ones that can be changed at runtime.
            self.set_parameter("language", self.language)
            print(f"Language set to {self.language}")
            self.set_parameter("wake_word_activation_delay", self.wake_word_activation_delay)
            print(f"Wake word activation delay set to {self.wake_word_activation_delay}")
def text(self, on_transcription_finished=None):
    """Block until the server delivers a full sentence and return it.

    Clears the previous result, sets ``recording_start`` so the recording
    loop begins forwarding audio, then polls ``final_text_ready`` every
    20 ms for at most 60 seconds of wall-clock time.

    Args:
        on_transcription_finished: Optional callable. When given it is
            invoked on a new thread with the final text, or with ``""`` on
            timeout.

    Returns:
        str: The final text, or ``""`` on timeout, when the client stopped
        running/recording while waiting, or when an unexpected error
        occurred.

    Raises:
        KeyboardInterrupt: Re-raised unchanged.
    """
    self.realtime_text = ""
    self.submitted_realtime_text = ""
    self.final_text = ""
    self.final_text_ready.clear()
    self.recording_start.set()
    try:
        wait_interval = 0.02  # Poll in 20 ms steps so state changes are noticed quickly
        max_wait_time = 60  # Timeout after 60 seconds
        # Measure the timeout against a monotonic clock rather than by
        # summing intervals, so loop overhead cannot stretch it.
        deadline = time.monotonic() + max_wait_time
        timed_out = False
        while self.is_running and self._recording:
            if self.final_text_ready.wait(timeout=wait_interval):
                break  # Transcription is ready
            if not self.is_running or not self._recording:
                break
            if time.monotonic() >= deadline:
                timed_out = True
                break

        self.recording_start.clear()

        if timed_out:
            if self.debug_mode:
                print("Timeout while waiting for text from the server.")
            if on_transcription_finished:
                threading.Thread(target=on_transcription_finished, args=("",)).start()
            return ""

        if not self.is_running or not self._recording:
            return ""

        if on_transcription_finished:
            threading.Thread(target=on_transcription_finished, args=(self.final_text,)).start()
        return self.final_text
    except KeyboardInterrupt:
        if self.debug_mode:
            print("KeyboardInterrupt in text(), exiting...")
        raise  # bare raise keeps the original traceback
    except Exception as e:
        print(f"Error in AudioToTextRecorderClient.text(): {e}")
        return ""
def feed_audio(self, chunk, audio_meta_data, original_sample_rate=16000):
    """Send one externally supplied audio chunk to the server's data socket.

    The packet layout is: 4-byte little-endian length of the JSON metadata,
    the UTF-8 encoded JSON metadata, then ``chunk``.

    Args:
        chunk: Raw audio bytes appended after the metadata header.
        audio_meta_data: Optional dict merged into the metadata. When
            truthy, a ``server_sent_to_stt`` nanosecond timestamp is written
            into this dict (the caller's object is modified) and a formatted
            copy is added to the metadata.
        original_sample_rate: Value sent as ``sampleRate``.
    """
    header = {"sampleRate": original_sample_rate}

    if audio_meta_data:
        sent_ns = time.time_ns()
        audio_meta_data["server_sent_to_stt"] = sent_ns
        header["server_sent_to_stt_formatted"] = format_timestamp_ns(sent_ns)
        header.update(audio_meta_data)

    header_json = json.dumps(header)
    length_prefix = struct.pack('<I', len(header_json))
    packet = length_prefix + header_json.encode('utf-8') + chunk

    # Nothing is sent once the client has stopped running.
    if not self.is_running:
        return
    self.data_ws.send(packet, opcode=ABNF.OPCODE_BINARY)
def set_microphone(self, microphone_on=True):
    """Enable or disable microphone forwarding.

    Stores the inverse of ``microphone_on`` in ``self.muted``, which the
    recording loop checks before reading and sending audio.
    """
    should_mute = not microphone_on
    self.muted = should_mute
def abort(self):
    """Ask the server, via the control channel, to run its ``abort`` method."""
    remote_method = "abort"
    self.call_method(remote_method)
def wakeup(self):
    """Ask the server, via the control channel, to run its ``wakeup`` method."""
    remote_method = "wakeup"
    self.call_method(remote_method)
def clear_audio_queue(self):
    """Ask the server, via the control channel, to run ``clear_audio_queue``."""
    remote_method = "clear_audio_queue"
    self.call_method(remote_method)
def perform_final_transcription(self):
    """Ask the server, via the control channel, to run ``perform_final_transcription``."""
    remote_method = "perform_final_transcription"
    self.call_method(remote_method)
def stop(self):
    """Ask the server, via the control channel, to run its ``stop`` method."""
    remote_method = "stop"
    self.call_method(remote_method)
def connect(self):
    """Open the control and data WebSocket connections.

    First makes sure a server is reachable (``ensure_server_running``), then
    creates one ``WebSocketApp`` per channel, each driven by its own
    non-daemon thread, and waits up to 10 seconds for
    ``connection_established`` to be set.

    Returns:
        bool: ``True`` once the connection event is set; ``False`` when no
        server is available, on timeout, or when an exception occurs.
    """
    if not self.ensure_server_running():
        print("Cannot start STT server. Exiting.")
        return False

    try:
        # Control channel
        self.control_ws = WebSocketApp(
            self.control_url,
            on_message=self.on_control_message,
            on_error=self.on_error,
            on_close=self.on_close,
            on_open=self.on_control_open,
        )
        self.control_ws_thread = threading.Thread(
            target=self.control_ws.run_forever, daemon=False
        )
        self.control_ws_thread.start()

        # Data channel
        self.data_ws = WebSocketApp(
            self.data_url,
            on_message=self.on_data_message,
            on_error=self.on_error,
            on_close=self.on_close,
            on_open=self.on_data_open,
        )
        self.data_ws_thread = threading.Thread(
            target=self.data_ws.run_forever, daemon=False
        )
        self.data_ws_thread.start()

        # Give the server up to 10 seconds to accept the connection.
        connected = self.connection_established.wait(timeout=10)
        if connected:
            if self.debug_mode:
                print("WebSocket connections established successfully.")
            return True
        print("Timeout while connecting to the server.")
        return False
    except Exception as e:
        print(f"Error while connecting to the server: {e}")
        return False
def start_server(self):
    """Launch the ``stt-server`` command with options taken from this client.

    The command line is assembled from an ordered option table. On Windows
    the command is started minimised through ``start /min cmd /c``; on other
    systems it is spawned detached with stdout/stderr discarded. The method
    returns immediately after issuing the command and does not wait for the
    server to come up.

    Not forwarded to the server: ``compute_type``, ``input_device_index``,
    ``gpu_device_index``, ``device``, ``spinner``,
    ``enable_realtime_transcription``, ``handle_buffer_overflow``,
    ``suppress_tokens``, ``print_transcription_time``,
    ``allowed_latency_limit`` and ``no_log_file``.
    """
    # How an entry of the option table is translated into CLI arguments:
    #   IF_TRUTHY -> "flag value" when the value is truthy
    #   IF_SET    -> "flag value" when the value is not None
    #   SWITCH    -> bare "flag" when the value is truthy
    #   PROMPT    -> "flag text" for a non-empty string, newlines escaped
    #   PORT_OF   -> "flag port" using the port parsed from a URL
    IF_TRUTHY, IF_SET, SWITCH, PROMPT, PORT_OF = range(5)
    option_table = (
        (IF_TRUTHY, '--model', self.model),
        (IF_TRUTHY, '--realtime_model_type', self.realtime_model_type),
        (IF_TRUTHY, '--root', self.download_root),
        (IF_SET, '--batch', self.batch_size),
        (IF_SET, '--realtime_batch_size', self.realtime_batch_size),
        (IF_SET, '--init_realtime_after_seconds', self.init_realtime_after_seconds),
        (PROMPT, '--initial_prompt_realtime', self.initial_prompt_realtime),
        (SWITCH, '--debug', self.debug_mode),
        (IF_TRUTHY, '--language', self.language),
        (IF_SET, '--silero_sensitivity', self.silero_sensitivity),
        (SWITCH, '--silero_use_onnx', self.silero_use_onnx),
        (IF_SET, '--webrtc_sensitivity', self.webrtc_sensitivity),
        (IF_SET, '--min_length_of_recording', self.min_length_of_recording),
        (IF_SET, '--min_gap_between_recordings', self.min_gap_between_recordings),
        (IF_SET, '--realtime_processing_pause', self.realtime_processing_pause),
        (IF_SET, '--early_transcription_on_silence', self.early_transcription_on_silence),
        (SWITCH, '--silero_deactivity_detection', self.silero_deactivity_detection),
        (IF_SET, '--beam_size', self.beam_size),
        (IF_SET, '--beam_size_realtime', self.beam_size_realtime),
        (IF_SET, '--wake_words', self.wake_words),
        (IF_SET, '--wake_words_sensitivity', self.wake_words_sensitivity),
        (IF_SET, '--wake_word_timeout', self.wake_word_timeout),
        (IF_SET, '--wake_word_activation_delay', self.wake_word_activation_delay),
        (IF_SET, '--wakeword_backend', self.wakeword_backend),
        (IF_TRUTHY, '--openwakeword_model_paths', self.openwakeword_model_paths),
        (IF_SET, '--openwakeword_inference_framework', self.openwakeword_inference_framework),
        (IF_SET, '--wake_word_buffer_duration', self.wake_word_buffer_duration),
        (SWITCH, '--use_main_model_for_realtime', self.use_main_model_for_realtime),
        (SWITCH, '--use_extended_logging', self.use_extended_logging),
        (PORT_OF, '--control_port', self.control_url),
        (PORT_OF, '--data_port', self.data_url),
        (PROMPT, '--initial_prompt', self.initial_prompt),
    )

    args = ['stt-server']
    for kind, flag, value in option_table:
        if kind == IF_TRUTHY:
            if value:
                args += [flag, str(value)]
        elif kind == IF_SET:
            if value is not None:
                args += [flag, str(value)]
        elif kind == SWITCH:
            if value:
                args.append(flag)
        elif kind == PROMPT:
            if not value:
                continue
            if isinstance(value, str):
                # Keep the prompt on one command-line token.
                args += [flag, value.replace("\n", "\\n")]
            else:
                # The type hints also allow token-id iterables, which have
                # no ``replace``; skip them instead of crashing.
                logging.warning(f"{flag} is not a string and is not passed to the server")
        elif kind == PORT_OF:
            if value:
                port = urlparse(value).port
                if port:
                    args += [flag, str(port)]

    # Start the subprocess with the mapped arguments
    if os.name == 'nt':  # Windows
        # NOTE(security): the command is handed to the shell as one string;
        # list2cmdline quotes for the MS C runtime, not for cmd.exe, so the
        # option values must come from trusted configuration.
        cmd = 'start /min cmd /c ' + subprocess.list2cmdline(args)
        # Honour the instance flag as well as the module-level switch.
        if self.debug_mode or debug_mode:
            print(f"Opening server with cli command: {cmd}")
        subprocess.Popen(cmd, shell=True)
    else:  # Unix-like systems
        subprocess.Popen(args, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, start_new_session=True)
    print("STT server start command issued. Please wait a moment for it to initialize.", file=sys.stderr)
def is_server_running(self):
    """Report whether a WebSocket handshake with the control URL succeeds.

    Returns:
        bool: ``True`` when a connection to ``self.control_url`` could be
        opened (3 second timeout) and closed again; ``False`` on any
        exception, which is printed in debug mode.
    """
    try:
        # Imported here so a failing import is reported like any other
        # connectivity problem.
        from websocket import create_connection
        probe = create_connection(self.control_url, timeout=3)
        probe.close()
    except Exception as e:
        if self.debug_mode:
            print(f"Server connectivity check failed: {e}")
        return False
    return True
def ensure_server_running(self):
    """Make sure an STT server is reachable, starting one if allowed.

    Returns:
        bool: ``True`` when a server was already running (also sets
        ``self.server_already_running``) or came up within 20 polls after
        being started; ``False`` when autostart is disabled or the server
        did not come up.
    """
    if self.is_server_running():
        self.server_already_running = True
        return True

    if self.debug_mode:
        print("STT server is not running.", file=sys.stderr)

    if not self.autostart_server:
        print("STT server is required. Please start it manually.", file=sys.stderr)
        return False

    self.start_server()
    if self.debug_mode:
        print("Waiting for STT server to start...", file=sys.stderr)

    attempts_left = 20  # Poll once per second, up to 20 times
    while attempts_left > 0:
        attempts_left -= 1
        if self.is_server_running():
            if self.debug_mode:
                print("STT server started successfully.", file=sys.stderr)
            time.sleep(2)  # Give the server a moment to fully initialize
            return True
        time.sleep(1)

    print("Failed to start STT server.", file=sys.stderr)
    return False
def list_devices(self):
    """Print the available audio input devices.

    Delegates to a throw-away ``AudioInput`` created with this client's
    ``debug_mode``.
    """
    AudioInput(debug_mode=self.debug_mode).list_devices()
def start_recording(self):
    """Run ``record_and_send_audio`` on a new non-daemon thread.

    The thread is stored in ``self.recording_thread`` before it is started.
    """
    worker = threading.Thread(target=self.record_and_send_audio, daemon=False)
    self.recording_thread = worker
    worker.start()
def setup_audio(self):
    """Create ``self.audio_input`` and open it.

    Returns:
        The result of ``AudioInput.setup()`` for the new instance.
    """
    capture = AudioInput(
        input_device_index=self.input_device_index,
        debug_mode=self.debug_mode,
    )
    self.audio_input = capture
    return capture.setup()
def record_and_send_audio(self):
    """Capture microphone audio and stream it to the server until stopped.

    Runs until ``self.is_running`` becomes false or reading/sending fails.
    For every chunk read:
      * it is appended to the WAV file when ``output_wav_file`` is set;
      * ``on_recorded_chunk`` is called with it when that callback is set;
      * it is sent on the data socket, prefixed with a length-tagged JSON
        header carrying the device sample rate, but only while
        ``recording_start`` is set and the client is not muted.

    On exit the audio input is cleaned up, the WAV file (if any) is closed,
    ``final_text_ready`` is set so a blocked ``text()`` call wakes up, and
    ``is_running`` / ``_recording`` are cleared.
    """
    self._recording = True
    try:
        if not self.setup_audio():
            raise Exception("Failed to set up audio recording.")

        device_rate = self.audio_input.device_sample_rate

        # Initialize WAV file writer if output_wav_file is provided
        if self.output_wav_file and not self.wav_file:
            self.wav_file = wave.open(self.output_wav_file, 'wb')
            self.wav_file.setnchannels(1)
            self.wav_file.setsampwidth(2)
            self.wav_file.setframerate(device_rate)

        if self.debug_mode:
            print("Recording and sending audio...")

        # The header only depends on the device sample rate, which is fixed
        # once the stream is open, so build it once instead of per chunk.
        metadata_bytes = json.dumps({"sampleRate": device_rate}).encode('utf-8')
        packet_header = struct.pack('<I', len(metadata_bytes)) + metadata_bytes

        while self.is_running:
            if self.muted:
                time.sleep(0.01)
                continue
            try:
                audio_data = self.audio_input.read_chunk()
                if self.wav_file:
                    self.wav_file.writeframes(audio_data)
                if self.on_recorded_chunk:
                    self.on_recorded_chunk(audio_data)
                # The mute flag may have been flipped while reading.
                if self.muted:
                    continue
                if self.recording_start.is_set() and self.is_running:
                    if log_outgoing_chunks:
                        print(".", flush=True, end='')
                    self.data_ws.send(packet_header + audio_data, opcode=ABNF.OPCODE_BINARY)
            except KeyboardInterrupt:
                if self.debug_mode:
                    print("KeyboardInterrupt in record_and_send_audio, exiting...")
                break
            except Exception as e:
                print(f"Error sending audio data: {e}")
                break
    except Exception as e:
        print(f"Error in record_and_send_audio: {e}", file=sys.stderr)
    finally:
        self.cleanup_audio()
        if self.wav_file:
            # Close the writer so the WAV header is finalised and the file
            # handle is released.
            try:
                self.wav_file.close()
            except Exception as e:
                print(f"Error closing WAV file: {e}", file=sys.stderr)
            self.wav_file = None
        self.final_text_ready.set()  # fake final text to stop the text() method
        self.is_running = False
        self._recording = False
def cleanup_audio(self):
    """Release the audio input, if one was ever created.

    Does nothing when ``self.audio_input`` does not exist (``setup_audio``
    was never called); otherwise calls its ``cleanup()``.
    """
    try:
        capture = self.audio_input
    except AttributeError:
        return
    capture.cleanup()
def on_control_message(self, ws, message):
    """Handle a JSON message received on the control socket.

    * ``status == 'success'`` with ``parameter``, ``value`` and a known
      ``request_id``: store the value on the pending request and set its
      event (this is what ``get_parameter`` waits for).
    * ``status == 'error'``: print the server's message.
    * no ``status`` key: print the message as unknown.

    Decoding and processing errors are printed, never raised.
    """
    try:
        data = json.loads(message)

        if 'status' not in data:
            print(f"Unknown control message format: {data}")
            return

        status = data['status']
        if status == 'error':
            print(f"Server Error: {data.get('message', '')}")
            return
        if status != 'success':
            return

        # Only parameter replies carry something to deliver.
        if 'parameter' not in data or 'value' not in data:
            return
        request_id = data.get('request_id')
        if request_id is None or request_id not in self.pending_requests:
            return

        if self.debug_mode:
            print(f"Parameter {data['parameter']} = {data['value']}")
        self.pending_requests[request_id]['value'] = data['value']
        self.pending_requests[request_id]['event'].set()
    except json.JSONDecodeError:
        print(f"Received non-JSON control message: {message}")
    except Exception as e:
        print(f"Error processing control message: {e}")
def on_data_message(self, ws, message):
    """Handle a JSON event received on the data socket.

    Dispatch is on the message's ``type`` field:
      * ``realtime``: update ``realtime_text`` and, when it changed, call
        ``on_realtime_transcription_update`` on a new thread;
      * ``fullSentence``: store ``final_text`` and set ``final_text_ready``;
      * ``transcription_start``: when ``on_transcription_start`` is set,
        decode the base64 int16 audio, scale it to float32 by dividing by
        32768 and pass it to the callback;
      * the events listed in ``simple_events``: call the matching no-argument
        callback when it is set;
      * ``recorded_chunk``: ignored;
      * anything else: printed as unknown.

    Decoding and processing errors (including exceptions raised by
    callbacks invoked inline) are printed, never raised.
    """
    # Event type -> name of the no-argument callback attribute to invoke.
    simple_events = {
        'recording_start': 'on_recording_start',
        'recording_stop': 'on_recording_stop',
        'vad_detect_start': 'on_vad_detect_start',
        'vad_detect_stop': 'on_vad_detect_stop',
        'vad_start': 'on_vad_start',
        'vad_stop': 'on_vad_stop',
        'start_turn_detection': 'on_turn_detection_start',
        'stop_turn_detection': 'on_turn_detection_stop',
        'wakeword_detected': 'on_wakeword_detected',
        'wakeword_detection_start': 'on_wakeword_detection_start',
        'wakeword_detection_end': 'on_wakeword_detection_end',
    }
    try:
        data = json.loads(message)
        event_type = data.get('type')

        if event_type == 'realtime':
            # Handle real-time transcription updates
            if data['text'] != self.realtime_text:
                self.realtime_text = data['text']
                if self.on_realtime_transcription_update:
                    # Call the callback in a new thread to avoid blocking
                    threading.Thread(
                        target=self.on_realtime_transcription_update,
                        args=(self.realtime_text,)
                    ).start()
        elif event_type == 'fullSentence':
            # Handle full sentences
            self.final_text = data['text']
            self.final_text_ready.set()
        elif event_type == 'transcription_start':
            # Decode only when somebody is listening; this avoids wasted
            # work and a spurious error when the payload is absent.
            if self.on_transcription_start:
                decoded_bytes = base64.b64decode(data.get('audio_bytes_base64'))
                # Reconstruct the np.int16 array from the decoded bytes
                audio_array = np.frombuffer(decoded_bytes, dtype=np.int16)
                INT16_MAX_ABS_VALUE = 32768.0
                normalized_audio = audio_array.astype(np.float32) / INT16_MAX_ABS_VALUE
                self.on_transcription_start(normalized_audio)
        elif isinstance(event_type, str) and event_type in simple_events:
            callback = getattr(self, simple_events[event_type])
            if callback:
                callback()
        elif event_type == 'recorded_chunk':
            pass
        else:
            print(f"Unknown data message format: {data}")
    except json.JSONDecodeError:
        print(f"Received non-JSON data message: {message}")
    except Exception as e:
        print(f"Error processing data message: {e}")
def on_error(self, ws, error):
    """
    Report an error raised for one of the WebSocket connections.

    Args:
        ws: The WebSocket the error belongs to (not used here).
        error: The error to report; it is formatted into the printed line.
    """
    report = f"WebSocket error: {error}"
    print(report)
def on_close(self, ws, close_status_code, close_msg):
    """
    Handle the closing of a WebSocket connection.

    In debug mode, a line naming the closed connection (data or control)
    is printed. The client is always marked as no longer running.

    Args:
        ws: The WebSocket that was closed; compared against
            `self.data_ws` and `self.control_ws`.
        close_status_code: Status code reported for the close.
        close_msg: Message reported for the close.
    """
    # Only the two known sockets are reported, and only when debugging.
    if self.debug_mode and ws == self.data_ws:
        print(f"Data WebSocket connection closed: {close_status_code} - {close_msg}")
    elif self.debug_mode and ws == self.control_ws:
        print(f"Control WebSocket connection closed: {close_status_code} - {close_msg}")

    # Any closed connection stops the client, regardless of debug mode.
    self.is_running = False
def on_control_open(self, ws):
    """
    Handle the opening of the control WebSocket connection.

    Sets `self.connection_established` so that anything waiting on that
    event can proceed. In debug mode a notice is printed first.

    Args:
        ws: The WebSocket that was opened (not used here).
    """
    verbose = self.debug_mode
    if verbose:
        print("Control WebSocket connection opened.")
    # Signal waiters regardless of debug mode.
    self.connection_established.set()
def on_data_open(self, ws):
    """
    Handle the opening of the data WebSocket connection.

    Only prints a notice, and only when debug mode is enabled.

    Args:
        ws: The WebSocket that was opened (not used here).
    """
    if not self.debug_mode:
        return
    print("Data WebSocket connection opened.")
def set_parameter(self, parameter, value):
    """
    Send a `set_parameter` command over the control WebSocket.

    Args:
        parameter: Name of the parameter to set.
        value: New value; it must be JSON-serializable.
    """
    payload = json.dumps(
        {
            "command": "set_parameter",
            "parameter": parameter,
            "value": value,
        }
    )
    self.control_ws.send(payload)
def get_parameter(self, parameter):
    """
    Request a parameter value over the control WebSocket and wait for it.

    A request id is taken from `self.request_counter`, a pending entry
    holding an event and a value slot is registered in
    `self.pending_requests`, and a `get_parameter` command is sent. The
    call then waits up to 5 seconds for the event to be set.
    NOTE(review): the code that stores the reply in the entry's 'value'
    slot and sets the event is not visible here; presumably it is the
    control message handler.

    Args:
        parameter: Name of the parameter to query.

    Returns:
        The value stored in the pending entry, or None when no response
        arrived within the timeout.

    Raises:
        Whatever `self.control_ws.send` raises; the pending entry is
        removed in that case as well.
    """
    # Generate a unique request_id
    request_id = self.request_counter
    self.request_counter += 1

    # Register the slot before sending, so a fast reply always finds it.
    event = threading.Event()
    self.pending_requests[request_id] = {'event': event, 'value': None}

    command = {
        "command": "get_parameter",
        "parameter": parameter,
        "request_id": request_id
    }

    try:
        # Send the command to the server
        self.control_ws.send(json.dumps(command))

        # Wait for the response or timeout after 5 seconds
        if event.wait(timeout=5):
            return self.pending_requests[request_id]['value']

        print(f"Timeout waiting for get_parameter {parameter}")
        return None
    finally:
        # Always drop the pending entry: on success, on timeout, and when
        # send() raises (previously the entry leaked in that case).
        self.pending_requests.pop(request_id, None)
def call_method(self, method, args=None, kwargs=None):
    """
    Send a `call_method` command over the control WebSocket.

    Args:
        method: Name of the method to call.
        args: Positional arguments for the call; an empty list is sent
            when this is None or empty.
        kwargs: Keyword arguments for the call; an empty dict is sent
            when this is None or empty.
    """
    positional = args if args else []
    named = kwargs if kwargs else {}
    payload = json.dumps(
        {
            "command": "call_method",
            "method": method,
            "args": positional,
            "kwargs": named,
        }
    )
    self.control_ws.send(payload)
def shutdown(self):
    """
    Shut down all resources held by the client.

    Marks the client as not running, closes the control and data
    WebSockets, joins the control, data and recording threads, and
    finally calls `cleanup_audio()`. Sockets and threads that are unset
    (falsy) are skipped.
    """
    self.is_running = False

    # Close the sockets first, control before data.
    for socket_attr in ("control_ws", "data_ws"):
        connection = getattr(self, socket_attr)
        if connection:
            connection.close()

    # Then wait for the worker threads, in the same fixed order.
    for thread_attr in ("control_ws_thread", "data_ws_thread", "recording_thread"):
        worker = getattr(self, thread_attr)
        if worker:
            worker.join()

    # Clean up audio
    self.cleanup_audio()
def __enter__(self):
    """
    Enter the runtime context of a `with` statement.

    No setup is performed here; the instance itself is handed back so it
    can be bound by `with ... as name`.

    Returns:
        self: The current instance of the class.
    """
    return self
def __exit__(self, exc_type, exc_value, traceback):
    """
    Leave the runtime context of a `with` statement.

    Calls `shutdown()` whether or not the block raised. Nothing is
    returned, so an exception raised inside the `with` block is not
    suppressed.

    Args:
        exc_type (Exception or None): The type of the exception that
            caused the context to be exited, if any.
        exc_value (Exception or None): The exception instance that caused
            the context to be exited, if any.
        traceback (Traceback or None): The traceback corresponding to the
            exception, if any.
    """
    self.shutdown()

View File

@ -0,0 +1,245 @@
import sys
import multiprocessing as mp
import queue
import threading
import time
import logging
# Configure logging. Adjust level and formatting as needed.
# logging.basicConfig(level=logging.DEBUG,
# format='[%(asctime)s] %(levelname)s:%(name)s: %(message)s')
logger = logging.getLogger(__name__)

# Select the "spawn" start method at import time. On Linux and macOS the
# call is always attempted; on other platforms it is attempted only when
# no start method has been chosen yet. If a start method was already set,
# set_start_method() raises RuntimeError, which is logged and ignored.
try:
    if (
        sys.platform.startswith('linux')
        or sys.platform == 'darwin'
        or mp.get_start_method(allow_none=True) is None
    ):
        mp.set_start_method("spawn")
except RuntimeError as e:
    logger.debug("Start method has already been set. Details: %s", e)
class ParentPipe:
    """
    A thread-safe wrapper around the 'parent end' of a multiprocessing pipe.
    All actual pipe operations happen in a dedicated worker thread, so it's safe
    for multiple threads to call send(), recv(), or poll() on the same ParentPipe
    without interfering.

    Once the worker stops (close() was called, the other end went away, or
    an unexpected error occurred) the wrapper counts as closed: send() does
    nothing, recv() returns None and poll() returns False, and callers that
    were still waiting for the worker are released with None instead of
    blocking forever.
    """

    def __init__(self, parent_synthesize_pipe):
        """
        Wrap a raw pipe end and start the worker thread that owns it.

        Args:
            parent_synthesize_pipe: The raw connection object; it must
                provide send(), recv(), poll() and close().
        """
        self.name = "ParentPipe"
        self._pipe = parent_synthesize_pipe  # The raw pipe.
        self._closed = False  # Set once no further operations are served.

        # The request queue for sending operations to the worker.
        self._request_queue = queue.Queue()

        # This event signals the worker thread to stop.
        self._stop_event = threading.Event()

        # Worker thread that executes actual .send(), .recv(), .poll() calls.
        self._worker_thread = threading.Thread(
            target=self._pipe_worker,
            name=f"{self.name}_Worker",
            daemon=True
        )
        self._worker_thread.start()

    def _pipe_worker(self):
        """
        Serve queued requests until told to stop or the pipe fails.

        Every served request gets exactly one item on its result queue:
        None for SEND, the received object for RECV, the poll result for
        POLL, None when the pipe is closed/broken, or the exception
        instance when an unexpected error occurred.
        """
        try:
            while not self._stop_event.is_set():
                try:
                    request = self._request_queue.get(timeout=0.1)
                except queue.Empty:
                    continue

                if request["type"] == "CLOSE":
                    # Exit worker loop on CLOSE request.
                    break

                try:
                    if request["type"] == "SEND":
                        data = request["data"]
                        logger.debug("[%s] Worker: sending => %s", self.name, data)
                        self._pipe.send(data)
                        request["result_queue"].put(None)
                    elif request["type"] == "RECV":
                        logger.debug("[%s] Worker: receiving...", self.name)
                        data = self._pipe.recv()
                        request["result_queue"].put(data)
                    elif request["type"] == "POLL":
                        timeout = request.get("timeout", 0.0)
                        logger.debug("[%s] Worker: poll() with timeout: %s", self.name, timeout)
                        result = self._pipe.poll(timeout)
                        request["result_queue"].put(result)
                except (EOFError, BrokenPipeError, OSError) as e:
                    # When the other end has closed or an error occurs,
                    # log and notify the waiting thread.
                    logger.debug("[%s] Worker: pipe closed or error occurred (%s). Shutting down.", self.name, e)
                    # Mark closed BEFORE answering, so the caller's next
                    # operation short-circuits instead of queueing a
                    # request nobody will serve.
                    self._closed = True
                    request["result_queue"].put(None)
                    break
                except Exception as e:
                    logger.exception("[%s] Worker: unexpected error.", self.name)
                    self._closed = True
                    request["result_queue"].put(e)
                    break
        finally:
            self._closed = True
            logger.debug("[%s] Worker: stopping.", self.name)
            # Requests that were queued but never served would otherwise
            # leave their callers blocked forever.
            self._release_pending_requests()
            try:
                self._pipe.close()
            except Exception as e:
                logger.debug("[%s] Worker: error during pipe close: %s", self.name, e)

    def _release_pending_requests(self):
        """Answer every still-queued request with None so its caller wakes up."""
        while True:
            try:
                request = self._request_queue.get_nowait()
            except queue.Empty:
                return
            request["result_queue"].put(None)

    def _wait_for_result(self, result_queue):
        """
        Block until the worker answers on result_queue.

        Returns the answer, or None if the worker thread has exited
        without answering (e.g. the request was queued just after the
        worker drained the queue on shutdown).
        """
        while True:
            try:
                return result_queue.get(timeout=0.1)
            except queue.Empty:
                if self._worker_thread.is_alive():
                    continue
                # The worker is gone; take a late answer if there is one.
                try:
                    return result_queue.get_nowait()
                except queue.Empty:
                    return None

    def send(self, data):
        """
        Synchronously asks the worker thread to perform .send().

        Does nothing if the wrapper is already closed. Errors raised by
        the underlying pipe are logged by the worker, not raised here.
        """
        if self._closed:
            logger.debug("[%s] send() called but pipe is already closed", self.name)
            return
        logger.debug("[%s] send() requested with: %s", self.name, data)
        result_queue = queue.Queue()
        request = {
            "type": "SEND",
            "data": data,
            "result_queue": result_queue
        }
        self._request_queue.put(request)
        self._wait_for_result(result_queue)  # Wait until sending completes.
        logger.debug("[%s] send() completed", self.name)

    def recv(self):
        """
        Synchronously asks the worker to perform .recv() and returns the data.

        Returns None if the wrapper is closed or the pipe was closed or
        broken. If the worker hit an unexpected error, the exception
        instance is returned (not raised), as before.
        """
        if self._closed:
            logger.debug("[%s] recv() called but pipe is already closed", self.name)
            return None
        logger.debug("[%s] recv() requested", self.name)
        result_queue = queue.Queue()
        request = {
            "type": "RECV",
            "result_queue": result_queue
        }
        self._request_queue.put(request)
        data = self._wait_for_result(result_queue)

        # Log a preview for huge byte blobs.
        if isinstance(data, tuple) and len(data) == 2 and isinstance(data[1], bytes):
            data_preview = (data[0], f"<{len(data[1])} bytes>")
        else:
            data_preview = data
        logger.debug("[%s] recv() returning => %s", self.name, data_preview)
        return data

    def poll(self, timeout=0.0):
        """
        Synchronously checks whether data is available.
        Returns True if data is ready, or False otherwise.

        Args:
            timeout: Seconds the worker may wait for data. None means
                wait until data arrives or the worker stops.
        """
        if self._closed:
            return False
        logger.debug("[%s] poll() requested with timeout: %s", self.name, timeout)
        result_queue = queue.Queue()
        request = {
            "type": "POLL",
            "timeout": timeout,
            "result_queue": result_queue
        }
        self._request_queue.put(request)
        if timeout is None:
            # `None + 0.1` would raise TypeError; wait without a deadline.
            result = self._wait_for_result(result_queue)
        else:
            try:
                # Use a slightly longer timeout to give the worker a chance.
                result = result_queue.get(timeout=max(timeout, 0.0) + 0.1)
            except queue.Empty:
                result = False
        # The worker answers None (pipe closed) or an exception instance
        # (unexpected error) on failure; neither means "data is ready".
        result = bool(result) and not isinstance(result, BaseException)
        logger.debug("[%s] poll() returning => %s", self.name, result)
        return result

    def close(self):
        """
        Closes the pipe and stops the worker thread. The _closed flag makes
        sure no further operations are attempted.

        Only the first call has an effect. The stop event (not _closed) is
        used to detect repeated calls, because the worker itself sets
        _closed when the pipe fails and close() must still join it then.
        """
        if self._stop_event.is_set():
            return
        logger.debug("[%s] close() called", self.name)
        self._closed = True
        stop_request = {"type": "CLOSE", "result_queue": queue.Queue()}
        self._request_queue.put(stop_request)
        self._stop_event.set()
        self._worker_thread.join()
        logger.debug("[%s] closed", self.name)
def SafePipe(debug=False):
    """
    Create a pipe whose parent end is wrapped for thread-safe use.

    Args:
        debug: Accepted for compatibility; not used by this function.

    Returns:
        A pair (parent_pipe, child_pipe): a ParentPipe wrapping one end of
        a new multiprocessing pipe, and the raw other end.
    """
    raw_parent_end, raw_child_end = mp.Pipe()
    return ParentPipe(raw_parent_end), raw_child_end
def child_process_code(child_end):
    """
    Example child-process body: echo three messages, then close.

    Each of the three received messages is logged at debug level and
    answered with the string "ACK: <message>".

    Args:
        child_end: The child's end of the pipe; must provide recv(),
            send() and close().
    """
    for _ in range(3):
        msg = child_end.recv()
        logger.debug("[Child] got: %s", msg)
        reply = f"ACK: {msg}"
        child_end.send(reply)

    # All exchanges are done; release this end of the pipe.
    child_end.close()
if __name__ == "__main__":
    # Demo: three threads share one ParentPipe to talk to a child process
    # that acknowledges three messages.
    parent_pipe, child_pipe = SafePipe()

    # Create child process with the child_process_code function.
    p = mp.Process(target=child_process_code, args=(child_pipe,))
    p.start()

    # Event to signal sender threads to stop if needed.
    stop_polling_event = threading.Event()

    def sender_thread(n):
        """Send one greeting, then poll up to 10 times for a reply."""
        try:
            parent_pipe.send(f"hello_from_thread_{n}")
        except Exception as e:
            logger.debug("[sender_thread_%s] send exception: %s", n, e)
            return

        # Use a poll loop with error handling.
        for _ in range(10):
            try:
                if parent_pipe.poll(0.1):
                    reply = parent_pipe.recv()
                    logger.debug("[sender_thread_%s] got: %s", n, reply)
                    break
                logger.debug("[sender_thread_%s] no data yet...", n)
            except (OSError, EOFError, BrokenPipeError) as e:
                logger.debug("[sender_thread_%s] poll/recv exception: %s. Exiting thread.", n, e)
                break

            # Allow exit if a shutdown is signaled.
            if stop_polling_event.is_set():
                logger.debug("[sender_thread_%s] stop event set. Exiting thread.", n)
                break

    threads = [threading.Thread(target=sender_thread, args=(i,)) for i in range(3)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

    # Signal shutdown to any polling threads, then close the pipe.
    stop_polling_event.set()
    parent_pipe.close()
    p.join()

View File

@ -0,0 +1,23 @@
from fastapi import FastAPI, WebSocket
from RealtimeSTT.audio_recorder import AudioToTextRecorder
import numpy as np
# Application object; the WebSocket route below is registered on it.
app = FastAPI()
# A single recorder is created at import time and used by every call to the
# WebSocket endpoint below.
# NOTE(review): device="cuda" with compute_type="float16" presumably requires
# a CUDA-capable GPU on the host — confirm for the deployment target.
recorder = AudioToTextRecorder(
    model="tiny",
    device="cuda",
    compute_type="float16",
    use_microphone=False,  # audio is supplied by the endpoint via feed_audio()
)
@app.websocket("/ws/transcribe")
async def websocket_endpoint(websocket: WebSocket):
    """
    Transcribe audio received over a WebSocket connection.

    For every binary message received, the bytes are interpreted as
    float32 samples, fed to the module-level recorder, and the text
    returned by `recorder.text()` is sent back as a text message. The
    loop ends quietly when the client disconnects.

    Args:
        websocket: The incoming WebSocket connection.
    """
    # Imported locally so this handler does not depend on a change to the
    # module's import block.
    from fastapi import WebSocketDisconnect

    await websocket.accept()
    try:
        while True:
            data = await websocket.receive_bytes()
            # Convert the bytes to a numpy array (adjust to your format).
            # NOTE(review): float32 is assumed for the incoming samples;
            # confirm this matches what the client sends and what
            # feed_audio() expects.
            audio = np.frombuffer(data, dtype=np.float32)
            recorder.feed_audio(audio)
            # NOTE(review): text() is called synchronously inside this
            # coroutine; if it blocks, the event loop is blocked too —
            # verify against the recorder's documentation.
            text = recorder.text()
            await websocket.send_text(text)
    except WebSocketDisconnect:
        # A client disconnect is the normal way for this loop to end;
        # without this handler it surfaced as an unhandled exception.
        return

Binary file not shown.