minimal voice recognition
3
minimal_server/RealtimeSTT/__init__.py
Normal file
@@ -0,0 +1,3 @@
from .audio_recorder import AudioToTextRecorder
from .audio_recorder_client import AudioToTextRecorderClient
from .audio_input import AudioInput
BIN
minimal_server/RealtimeSTT/__pycache__/__init__.cpython-310.pyc
Normal file
Binary file not shown.
BIN
minimal_server/RealtimeSTT/__pycache__/__init__.cpython-311.pyc
Normal file
Binary file not shown.
BIN
minimal_server/RealtimeSTT/__pycache__/__init__.cpython-312.pyc
Normal file
Binary file not shown.
BIN
minimal_server/RealtimeSTT/__pycache__/__init__.cpython-313.pyc
Normal file
Binary file not shown.
BIN
minimal_server/RealtimeSTT/__pycache__/safepipe.cpython-310.pyc
Normal file
Binary file not shown.
BIN
minimal_server/RealtimeSTT/__pycache__/safepipe.cpython-311.pyc
Normal file
Binary file not shown.
BIN
minimal_server/RealtimeSTT/__pycache__/safepipe.cpython-313.pyc
Normal file
Binary file not shown.
BIN
minimal_server/RealtimeSTT/__pycache__/server.cpython-311.pyc
Normal file
Binary file not shown.
220
minimal_server/RealtimeSTT/audio_input.py
Normal file
@@ -0,0 +1,220 @@
from colorama import init, Fore, Style
from scipy.signal import butter, filtfilt, resample_poly
import pyaudio
import logging

DESIRED_RATE = 16000
CHUNK_SIZE = 1024
AUDIO_FORMAT = pyaudio.paInt16
CHANNELS = 1


class AudioInput:
    def __init__(
            self,
            input_device_index: int = None,
            debug_mode: bool = False,
            target_samplerate: int = DESIRED_RATE,
            chunk_size: int = CHUNK_SIZE,
            audio_format: int = AUDIO_FORMAT,
            channels: int = CHANNELS,
            resample_to_target: bool = True,
    ):
        self.input_device_index = input_device_index
        self.debug_mode = debug_mode
        self.audio_interface = None
        self.stream = None
        self.device_sample_rate = None
        self.target_samplerate = target_samplerate
        self.chunk_size = chunk_size
        self.audio_format = audio_format
        self.channels = channels
        self.resample_to_target = resample_to_target

    def get_supported_sample_rates(self, device_index):
        """Test which standard sample rates are supported by the specified device."""
        standard_rates = [8000, 9600, 11025, 12000, 16000, 22050, 24000, 32000, 44100, 48000]
        supported_rates = []

        device_info = self.audio_interface.get_device_info_by_index(device_index)
        max_channels = device_info.get('maxInputChannels')  # Input device, so query input channels

        for rate in standard_rates:
            try:
                if self.audio_interface.is_format_supported(
                    rate,
                    input_device=device_index,
                    input_channels=max_channels,
                    input_format=self.audio_format,
                ):
                    supported_rates.append(rate)
            except Exception:
                # is_format_supported raises for unsupported combinations; skip those rates.
                continue
        return supported_rates

    def _get_best_sample_rate(self, actual_device_index, desired_rate):
        """Determine the best available sample rate for the device."""
        try:
            device_info = self.audio_interface.get_device_info_by_index(actual_device_index)
            supported_rates = self.get_supported_sample_rates(actual_device_index)

            if desired_rate in supported_rates:
                return desired_rate

            # Otherwise pick the highest supported rate, if any were found.
            if supported_rates:
                return max(supported_rates)

            return int(device_info.get('defaultSampleRate', 44100))

        except Exception as e:
            logging.warning(f"Error determining sample rate: {e}")
            return 44100  # Safe fallback

    def list_devices(self):
        """List all available audio input devices with supported sample rates."""
        try:
            init()  # Initialize colorama
            self.audio_interface = pyaudio.PyAudio()
            device_count = self.audio_interface.get_device_count()

            print("Available audio input devices:")
            for i in range(device_count):
                device_info = self.audio_interface.get_device_info_by_index(i)
                device_name = device_info.get('name')
                max_input_channels = device_info.get('maxInputChannels', 0)

                if max_input_channels > 0:  # Only consider devices with input capabilities
                    supported_rates = self.get_supported_sample_rates(i)
                    print(f"{Fore.LIGHTGREEN_EX}Device {Style.RESET_ALL}{i}{Fore.LIGHTGREEN_EX}: {device_name}{Style.RESET_ALL}")

                    # Format each rate in cyan
                    if supported_rates:
                        rates_formatted = ", ".join([f"{Fore.CYAN}{rate}{Style.RESET_ALL}" for rate in supported_rates])
                        print(f"  {Fore.YELLOW}Supported sample rates: {rates_formatted}{Style.RESET_ALL}")
                    else:
                        print(f"  {Fore.YELLOW}Supported sample rates: None{Style.RESET_ALL}")

        except Exception as e:
            print(f"Error listing devices: {e}")
        finally:
            if self.audio_interface:
                self.audio_interface.terminate()

    def setup(self):
        """Initialize the audio interface and open the input stream."""
        try:
            self.audio_interface = pyaudio.PyAudio()

            if self.debug_mode:
                print(f"Input device index: {self.input_device_index}")
            actual_device_index = (self.input_device_index if self.input_device_index is not None
                                   else self.audio_interface.get_default_input_device_info()['index'])

            if self.debug_mode:
                print(f"Actual selected device index: {actual_device_index}")
            self.input_device_index = actual_device_index
            self.device_sample_rate = self._get_best_sample_rate(actual_device_index, self.target_samplerate)

            if self.debug_mode:
                print(f"Setting up audio on device {self.input_device_index} with sample rate {self.device_sample_rate}")

            try:
                self.stream = self.audio_interface.open(
                    format=self.audio_format,
                    channels=self.channels,
                    rate=self.device_sample_rate,
                    input=True,
                    frames_per_buffer=self.chunk_size,
                    input_device_index=self.input_device_index,
                )
                if self.debug_mode:
                    print(f"Audio recording initialized successfully at {self.device_sample_rate} Hz")
                return True
            except Exception as e:
                print(f"Failed to initialize audio stream at {self.device_sample_rate} Hz: {e}")
                return False

        except Exception as e:
            print(f"Error initializing audio recording: {e}")
            if self.audio_interface:
                self.audio_interface.terminate()
            return False

    def lowpass_filter(self, signal, cutoff_freq, sample_rate):
        """
        Apply a low-pass Butterworth filter to prevent aliasing in the signal.

        Args:
            signal (np.ndarray): Input audio signal to filter
            cutoff_freq (float): Cutoff frequency in Hz
            sample_rate (float): Sampling rate of the input signal in Hz

        Returns:
            np.ndarray: Filtered audio signal

        Notes:
            - Uses a 5th order Butterworth filter
            - Applies zero-phase filtering using filtfilt
        """
        # Calculate the Nyquist frequency (half the sample rate)
        nyquist_rate = sample_rate / 2.0

        # Normalize cutoff frequency to Nyquist rate (required by butter())
        normal_cutoff = cutoff_freq / nyquist_rate

        # Design the Butterworth filter
        b, a = butter(5, normal_cutoff, btype='low', analog=False)

        # Apply zero-phase filtering (forward and backward)
        filtered_signal = filtfilt(b, a, signal)
        return filtered_signal

    def resample_audio(self, pcm_data, target_sample_rate, original_sample_rate):
        """
        Filter and resample audio data to a target sample rate.

        Args:
            pcm_data (np.ndarray): Input audio data
            target_sample_rate (int): Desired output sample rate in Hz
            original_sample_rate (int): Original sample rate of input in Hz

        Returns:
            np.ndarray: Resampled audio data

        Notes:
            - Applies an anti-aliasing filter before downsampling
            - Uses polyphase filtering for high-quality resampling
        """
        if target_sample_rate < original_sample_rate:
            # Downsampling: low-pass filter first to avoid aliasing
            pcm_filtered = self.lowpass_filter(pcm_data, target_sample_rate / 2, original_sample_rate)
            resampled = resample_poly(pcm_filtered, target_sample_rate, original_sample_rate)
        else:
            # Upsampling: no low-pass filter needed
            resampled = resample_poly(pcm_data, target_sample_rate, original_sample_rate)
        return resampled

    def read_chunk(self):
        """Read a chunk of audio data."""
        return self.stream.read(self.chunk_size, exception_on_overflow=False)

    def cleanup(self):
        """Clean up audio resources."""
        try:
            if self.stream:
                self.stream.stop_stream()
                self.stream.close()
                self.stream = None
            if self.audio_interface:
                self.audio_interface.terminate()
                self.audio_interface = None
        except Exception as e:
            print(f"Error cleaning up audio resources: {e}")
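A minimal usage sketch for this class (not part of the commit; the loop length is illustrative and the import assumes minimal_server is on sys.path): capture a few chunks at the device's native rate and resample them to the 16 kHz target.

import numpy as np
from RealtimeSTT import AudioInput

audio = AudioInput(debug_mode=True)
if audio.setup():  # opens the stream at the best supported device rate
    for _ in range(50):  # illustrative: ~50 chunks
        raw = audio.read_chunk()  # bytes of int16 PCM
        pcm = np.frombuffer(raw, dtype=np.int16).astype(np.float32)
        if audio.device_sample_rate != audio.target_samplerate:
            pcm = audio.resample_audio(pcm, audio.target_samplerate, audio.device_sample_rate)
        # pcm now holds float32 samples at the 16 kHz target, ready for an STT model
    audio.cleanup()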
2850
minimal_server/RealtimeSTT/audio_recorder.py
Normal file
File diff suppressed because it is too large
881
minimal_server/RealtimeSTT/audio_recorder_client.py
Normal file
@@ -0,0 +1,881 @@
log_outgoing_chunks = False
debug_mode = False

from typing import Iterable, List, Optional, Union
from urllib.parse import urlparse
from datetime import datetime
from websocket import WebSocketApp
from websocket import ABNF
import numpy as np
import subprocess
import threading
import platform
import logging
import struct
import base64
import wave
import json
import time
import sys
import os

# Import the AudioInput class
from .audio_input import AudioInput

DEFAULT_CONTROL_URL = "ws://127.0.0.1:8011"
DEFAULT_DATA_URL = "ws://127.0.0.1:8012"

INIT_MODEL_TRANSCRIPTION = "tiny"
INIT_MODEL_TRANSCRIPTION_REALTIME = "tiny"
INIT_REALTIME_PROCESSING_PAUSE = 0.2
INIT_REALTIME_INITIAL_PAUSE = 0.2
INIT_SILERO_SENSITIVITY = 0.4
INIT_WEBRTC_SENSITIVITY = 3
INIT_POST_SPEECH_SILENCE_DURATION = 0.6
INIT_MIN_LENGTH_OF_RECORDING = 0.5
INIT_MIN_GAP_BETWEEN_RECORDINGS = 0
INIT_WAKE_WORDS_SENSITIVITY = 0.6
INIT_PRE_RECORDING_BUFFER_DURATION = 1.0
INIT_WAKE_WORD_ACTIVATION_DELAY = 0.0
INIT_WAKE_WORD_TIMEOUT = 5.0
INIT_WAKE_WORD_BUFFER_DURATION = 0.1
ALLOWED_LATENCY_LIMIT = 100

BUFFER_SIZE = 512
SAMPLE_RATE = 16000

INIT_HANDLE_BUFFER_OVERFLOW = False
if platform.system() != 'Darwin':
    INIT_HANDLE_BUFFER_OVERFLOW = True

# Define ANSI color codes for terminal output
class bcolors:
    HEADER = '\033[95m'    # Magenta
    OKBLUE = '\033[94m'    # Blue
    OKCYAN = '\033[96m'    # Cyan
    OKGREEN = '\033[92m'   # Green
    WARNING = '\033[93m'   # Yellow
    FAIL = '\033[91m'      # Red
    ENDC = '\033[0m'       # Reset to default
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

def format_timestamp_ns(timestamp_ns: int) -> str:
    # Split into whole seconds and the nanosecond remainder
    seconds = timestamp_ns // 1_000_000_000
    remainder_ns = timestamp_ns % 1_000_000_000

    # Convert the seconds part into a datetime object (local time)
    dt = datetime.fromtimestamp(seconds)

    # Format the main time as HH:MM:SS
    time_str = dt.strftime("%H:%M:%S")

    # For milliseconds, divide the remainder by 1e6 and format as three digits
    milliseconds = remainder_ns // 1_000_000
    formatted_timestamp = f"{time_str}.{milliseconds:03d}"

    return formatted_timestamp

class AudioToTextRecorderClient:
    """
    A class responsible for capturing audio from the microphone, detecting
    voice activity, and then transcribing the captured audio using the
    `faster_whisper` model.
    """

    def __init__(self,
                 model: str = INIT_MODEL_TRANSCRIPTION,
                 download_root: str = None,
                 language: str = "",
                 compute_type: str = "default",
                 input_device_index: int = None,
                 gpu_device_index: Union[int, List[int]] = 0,
                 device: str = "cuda",
                 on_recording_start=None,
                 on_recording_stop=None,
                 on_transcription_start=None,
                 ensure_sentence_starting_uppercase=True,
                 ensure_sentence_ends_with_period=True,
                 use_microphone=True,
                 spinner=True,
                 level=logging.WARNING,
                 batch_size: int = 16,

                 # Realtime transcription parameters
                 enable_realtime_transcription=False,
                 use_main_model_for_realtime=False,
                 realtime_model_type=INIT_MODEL_TRANSCRIPTION_REALTIME,
                 realtime_processing_pause=INIT_REALTIME_PROCESSING_PAUSE,
                 init_realtime_after_seconds=INIT_REALTIME_INITIAL_PAUSE,
                 on_realtime_transcription_update=None,
                 on_realtime_transcription_stabilized=None,
                 realtime_batch_size: int = 16,

                 # Voice activation parameters
                 silero_sensitivity: float = INIT_SILERO_SENSITIVITY,
                 silero_use_onnx: bool = False,
                 silero_deactivity_detection: bool = False,
                 webrtc_sensitivity: int = INIT_WEBRTC_SENSITIVITY,
                 post_speech_silence_duration: float = INIT_POST_SPEECH_SILENCE_DURATION,
                 min_length_of_recording: float = INIT_MIN_LENGTH_OF_RECORDING,
                 min_gap_between_recordings: float = INIT_MIN_GAP_BETWEEN_RECORDINGS,
                 pre_recording_buffer_duration: float = INIT_PRE_RECORDING_BUFFER_DURATION,
                 on_vad_start=None,
                 on_vad_stop=None,
                 on_vad_detect_start=None,
                 on_vad_detect_stop=None,
                 on_turn_detection_start=None,
                 on_turn_detection_stop=None,

                 # Wake word parameters
                 wakeword_backend: str = "pvporcupine",
                 openwakeword_model_paths: str = None,
                 openwakeword_inference_framework: str = "onnx",
                 wake_words: str = "",
                 wake_words_sensitivity: float = INIT_WAKE_WORDS_SENSITIVITY,
                 wake_word_activation_delay: float = INIT_WAKE_WORD_ACTIVATION_DELAY,
                 wake_word_timeout: float = INIT_WAKE_WORD_TIMEOUT,
                 wake_word_buffer_duration: float = INIT_WAKE_WORD_BUFFER_DURATION,
                 on_wakeword_detected=None,
                 on_wakeword_timeout=None,
                 on_wakeword_detection_start=None,
                 on_wakeword_detection_end=None,
                 on_recorded_chunk=None,
                 debug_mode=False,
                 handle_buffer_overflow: bool = INIT_HANDLE_BUFFER_OVERFLOW,
                 beam_size: int = 5,
                 beam_size_realtime: int = 3,
                 buffer_size: int = BUFFER_SIZE,
                 sample_rate: int = SAMPLE_RATE,
                 initial_prompt: Optional[Union[str, Iterable[int]]] = None,
                 initial_prompt_realtime: Optional[Union[str, Iterable[int]]] = None,
                 suppress_tokens: Optional[List[int]] = [-1],
                 print_transcription_time: bool = False,
                 early_transcription_on_silence: int = 0,
                 allowed_latency_limit: int = ALLOWED_LATENCY_LIMIT,
                 no_log_file: bool = False,
                 use_extended_logging: bool = False,

                 # Server urls
                 control_url: str = DEFAULT_CONTROL_URL,
                 data_url: str = DEFAULT_DATA_URL,
                 autostart_server: bool = True,
                 output_wav_file: str = None,
                 faster_whisper_vad_filter: bool = False,
                 ):

        # Set instance variables from constructor parameters
        self.model = model
        self.language = language
        self.compute_type = compute_type
        self.input_device_index = input_device_index
        self.gpu_device_index = gpu_device_index
        self.device = device
        self.on_recording_start = on_recording_start
        self.on_recording_stop = on_recording_stop
        self.on_transcription_start = on_transcription_start
        self.ensure_sentence_starting_uppercase = ensure_sentence_starting_uppercase
        self.ensure_sentence_ends_with_period = ensure_sentence_ends_with_period
        self.use_microphone = use_microphone
        self.spinner = spinner
        self.level = level
        self.batch_size = batch_size
        self.init_realtime_after_seconds = init_realtime_after_seconds
        self.realtime_batch_size = realtime_batch_size

        # Real-time transcription parameters
        self.enable_realtime_transcription = enable_realtime_transcription
        self.use_main_model_for_realtime = use_main_model_for_realtime
        self.download_root = download_root
        self.realtime_model_type = realtime_model_type
        self.realtime_processing_pause = realtime_processing_pause
        self.on_realtime_transcription_update = on_realtime_transcription_update
        self.on_realtime_transcription_stabilized = on_realtime_transcription_stabilized

        # Voice activation parameters
        self.silero_sensitivity = silero_sensitivity
        self.silero_use_onnx = silero_use_onnx
        self.silero_deactivity_detection = silero_deactivity_detection
        self.webrtc_sensitivity = webrtc_sensitivity
        self.post_speech_silence_duration = post_speech_silence_duration
        self.min_length_of_recording = min_length_of_recording
        self.min_gap_between_recordings = min_gap_between_recordings
        self.pre_recording_buffer_duration = pre_recording_buffer_duration

        self.on_vad_start = on_vad_start
        self.on_vad_stop = on_vad_stop
        self.on_vad_detect_start = on_vad_detect_start
        self.on_vad_detect_stop = on_vad_detect_stop
        self.on_turn_detection_start = on_turn_detection_start
        self.on_turn_detection_stop = on_turn_detection_stop

        # Wake word parameters
        self.wakeword_backend = wakeword_backend
        self.openwakeword_model_paths = openwakeword_model_paths
        self.openwakeword_inference_framework = openwakeword_inference_framework
        self.wake_words = wake_words
        self.wake_words_sensitivity = wake_words_sensitivity
        self.wake_word_activation_delay = wake_word_activation_delay
        self.wake_word_timeout = wake_word_timeout
        self.wake_word_buffer_duration = wake_word_buffer_duration
        self.on_wakeword_detected = on_wakeword_detected
        self.on_wakeword_timeout = on_wakeword_timeout
        self.on_wakeword_detection_start = on_wakeword_detection_start
        self.on_wakeword_detection_end = on_wakeword_detection_end
        self.on_recorded_chunk = on_recorded_chunk
        self.debug_mode = debug_mode
        self.handle_buffer_overflow = handle_buffer_overflow
        self.beam_size = beam_size
        self.beam_size_realtime = beam_size_realtime
        self.buffer_size = buffer_size
        self.sample_rate = sample_rate
        self.initial_prompt = initial_prompt
        self.initial_prompt_realtime = initial_prompt_realtime
        self.suppress_tokens = suppress_tokens
        self.print_transcription_time = print_transcription_time
        self.early_transcription_on_silence = early_transcription_on_silence
        self.allowed_latency_limit = allowed_latency_limit
        self.no_log_file = no_log_file
        self.use_extended_logging = use_extended_logging
        self.faster_whisper_vad_filter = faster_whisper_vad_filter

        # Server URLs
        self.control_url = control_url
        self.data_url = data_url
        self.autostart_server = autostart_server
        self.output_wav_file = output_wav_file

        # Instance variables
        self.muted = False
        self.recording_thread = None
        self.is_running = True
        self.connection_established = threading.Event()
        self.recording_start = threading.Event()
        self.final_text_ready = threading.Event()
        self.realtime_text = ""
        self.final_text = ""
        self._recording = False
        self.server_already_running = False
        self.wav_file = None

        self.request_counter = 0
        self.pending_requests = {}  # Map from request_id to threading.Event and value

        if self.debug_mode:
            print("Checking STT server")
        if not self.connect():
            print("Failed to connect to the server.", file=sys.stderr)
        else:
            if self.debug_mode:
                print("STT server is running and connected.")

        if self.use_microphone:
            self.start_recording()

        if self.server_already_running:
            if not self.connection_established.wait(timeout=10):
                print("Server connection not established within 10 seconds.")
            else:
                self.set_parameter("language", self.language)
                print(f"Language set to {self.language}")
                self.set_parameter("wake_word_activation_delay", self.wake_word_activation_delay)
                print(f"Wake word activation delay set to {self.wake_word_activation_delay}")

    def text(self, on_transcription_finished=None):
        self.realtime_text = ""
        self.submitted_realtime_text = ""
        self.final_text = ""
        self.final_text_ready.clear()

        self.recording_start.set()

        try:
            total_wait_time = 0
            wait_interval = 0.02  # Wait in small intervals (20 ms)
            max_wait_time = 60    # Timeout after 60 seconds

            while total_wait_time < max_wait_time and self.is_running and self._recording:
                if self.final_text_ready.wait(timeout=wait_interval):
                    break  # Break if transcription is ready

                if not self.is_running or not self._recording:
                    break

                total_wait_time += wait_interval

            # Check whether the wait timed out
            if total_wait_time >= max_wait_time:
                if self.debug_mode:
                    print("Timeout while waiting for text from the server.")
                self.recording_start.clear()
                if on_transcription_finished:
                    threading.Thread(target=on_transcription_finished, args=("",)).start()
                return ""

            self.recording_start.clear()

            if not self.is_running or not self._recording:
                return ""

            if on_transcription_finished:
                threading.Thread(target=on_transcription_finished, args=(self.final_text,)).start()

            return self.final_text

        except KeyboardInterrupt:
            if self.debug_mode:
                print("KeyboardInterrupt in text(), exiting...")
            raise KeyboardInterrupt

        except Exception as e:
            print(f"Error in AudioToTextRecorderClient.text(): {e}")
            return ""

    def feed_audio(self, chunk, audio_meta_data, original_sample_rate=16000):
        # Start with the base metadata
        metadata = {"sampleRate": original_sample_rate}

        # Merge additional metadata if provided
        if audio_meta_data:
            server_sent_to_stt_ns = time.time_ns()
            audio_meta_data["server_sent_to_stt"] = server_sent_to_stt_ns
            metadata["server_sent_to_stt_formatted"] = format_timestamp_ns(server_sent_to_stt_ns)

            metadata.update(audio_meta_data)

        # Convert metadata to JSON and prepend its length to the message
        metadata_json = json.dumps(metadata)
        metadata_length = len(metadata_json)
        message = struct.pack('<I', metadata_length) + metadata_json.encode('utf-8') + chunk

        # Send the message if the connection is running
        if self.is_running:
            self.data_ws.send(message, opcode=ABNF.OPCODE_BINARY)

    def set_microphone(self, microphone_on=True):
        """
        Set the microphone on or off.
        """
        self.muted = not microphone_on

    def abort(self):
        self.call_method("abort")

    def wakeup(self):
        self.call_method("wakeup")

    def clear_audio_queue(self):
        self.call_method("clear_audio_queue")

    def perform_final_transcription(self):
        self.call_method("perform_final_transcription")

    def stop(self):
        self.call_method("stop")

    def connect(self):
        if not self.ensure_server_running():
            print("Cannot start STT server. Exiting.")
            return False

        try:
            # Connect to control WebSocket
            self.control_ws = WebSocketApp(self.control_url,
                                           on_message=self.on_control_message,
                                           on_error=self.on_error,
                                           on_close=self.on_close,
                                           on_open=self.on_control_open)

            self.control_ws_thread = threading.Thread(target=self.control_ws.run_forever)
            self.control_ws_thread.daemon = False
            self.control_ws_thread.start()

            # Connect to data WebSocket
            self.data_ws = WebSocketApp(self.data_url,
                                        on_message=self.on_data_message,
                                        on_error=self.on_error,
                                        on_close=self.on_close,
                                        on_open=self.on_data_open)

            self.data_ws_thread = threading.Thread(target=self.data_ws.run_forever)
            self.data_ws_thread.daemon = False
            self.data_ws_thread.start()

            # Wait for the connections to be established
            if not self.connection_established.wait(timeout=10):
                print("Timeout while connecting to the server.")
                return False

            if self.debug_mode:
                print("WebSocket connections established successfully.")
            return True
        except Exception as e:
            print(f"Error while connecting to the server: {e}")
            return False

    def start_server(self):
        args = ['stt-server']

        # Map constructor parameters to server arguments
        if self.model:
            args += ['--model', self.model]
        if self.realtime_model_type:
            args += ['--realtime_model_type', self.realtime_model_type]
        if self.download_root:
            args += ['--root', self.download_root]
        if self.batch_size is not None:
            args += ['--batch', str(self.batch_size)]
        if self.realtime_batch_size is not None:
            args += ['--realtime_batch_size', str(self.realtime_batch_size)]
        if self.init_realtime_after_seconds is not None:
            args += ['--init_realtime_after_seconds', str(self.init_realtime_after_seconds)]
        if self.initial_prompt_realtime:
            sanitized_prompt = self.initial_prompt_realtime.replace("\n", "\\n")
            args += ['--initial_prompt_realtime', sanitized_prompt]

        # if self.compute_type:
        #     args += ['--compute_type', self.compute_type]
        # if self.input_device_index is not None:
        #     args += ['--input_device_index', str(self.input_device_index)]
        # if self.gpu_device_index is not None:
        #     args += ['--gpu_device_index', str(self.gpu_device_index)]
        # if self.device:
        #     args += ['--device', self.device]
        # if self.spinner:
        #     args.append('--spinner')  # flag, no need for True/False
        # if self.enable_realtime_transcription:
        #     args.append('--enable_realtime_transcription')  # flag, no need for True/False
        # if self.handle_buffer_overflow:
        #     args.append('--handle_buffer_overflow')  # flag, no need for True/False
        # if self.suppress_tokens:
        #     args += ['--suppress_tokens', str(self.suppress_tokens)]
        # if self.print_transcription_time:
        #     args.append('--print_transcription_time')  # flag, no need for True/False
        # if self.allowed_latency_limit is not None:
        #     args += ['--allowed_latency_limit', str(self.allowed_latency_limit)]
        # if self.no_log_file:
        #     args.append('--no_log_file')  # flag, no need for True
        if self.debug_mode:
            args.append('--debug')  # flag, no need for True/False

        if self.language:
            args += ['--language', self.language]
        if self.silero_sensitivity is not None:
            args += ['--silero_sensitivity', str(self.silero_sensitivity)]
        if self.silero_use_onnx:
            args.append('--silero_use_onnx')  # flag, no need for True/False
        if self.webrtc_sensitivity is not None:
            args += ['--webrtc_sensitivity', str(self.webrtc_sensitivity)]
        if self.min_length_of_recording is not None:
            args += ['--min_length_of_recording', str(self.min_length_of_recording)]
        if self.min_gap_between_recordings is not None:
            args += ['--min_gap_between_recordings', str(self.min_gap_between_recordings)]
        if self.realtime_processing_pause is not None:
            args += ['--realtime_processing_pause', str(self.realtime_processing_pause)]
        if self.early_transcription_on_silence is not None:
            args += ['--early_transcription_on_silence', str(self.early_transcription_on_silence)]
        if self.silero_deactivity_detection:
            args.append('--silero_deactivity_detection')  # flag, no need for True/False
        if self.beam_size is not None:
            args += ['--beam_size', str(self.beam_size)]
        if self.beam_size_realtime is not None:
            args += ['--beam_size_realtime', str(self.beam_size_realtime)]
        if self.wake_words is not None:
            args += ['--wake_words', str(self.wake_words)]
        if self.wake_words_sensitivity is not None:
            args += ['--wake_words_sensitivity', str(self.wake_words_sensitivity)]
        if self.wake_word_timeout is not None:
            args += ['--wake_word_timeout', str(self.wake_word_timeout)]
        if self.wake_word_activation_delay is not None:
            args += ['--wake_word_activation_delay', str(self.wake_word_activation_delay)]
        if self.wakeword_backend is not None:
            args += ['--wakeword_backend', str(self.wakeword_backend)]
        if self.openwakeword_model_paths:
            args += ['--openwakeword_model_paths', str(self.openwakeword_model_paths)]
        if self.openwakeword_inference_framework is not None:
            args += ['--openwakeword_inference_framework', str(self.openwakeword_inference_framework)]
        if self.wake_word_buffer_duration is not None:
            args += ['--wake_word_buffer_duration', str(self.wake_word_buffer_duration)]
        if self.use_main_model_for_realtime:
            args.append('--use_main_model_for_realtime')  # flag, no need for True/False
        if self.use_extended_logging:
            args.append('--use_extended_logging')  # flag, no need for True/False

        if self.control_url:
            parsed_control_url = urlparse(self.control_url)
            if parsed_control_url.port:
                args += ['--control_port', str(parsed_control_url.port)]
        if self.data_url:
            parsed_data_url = urlparse(self.data_url)
            if parsed_data_url.port:
                args += ['--data_port', str(parsed_data_url.port)]
        if self.initial_prompt:
            sanitized_prompt = self.initial_prompt.replace("\n", "\\n")
            args += ['--initial_prompt', sanitized_prompt]

        # Start the subprocess with the mapped arguments
        if os.name == 'nt':  # Windows
            cmd = 'start /min cmd /c ' + subprocess.list2cmdline(args)
            if self.debug_mode:
                print(f"Opening server with cli command: {cmd}")
            subprocess.Popen(cmd, shell=True)
        else:  # Unix-like systems
            subprocess.Popen(args, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, start_new_session=True)
        print("STT server start command issued. Please wait a moment for it to initialize.", file=sys.stderr)

    def is_server_running(self):
        try:
            # Attempt a proper WebSocket handshake to the control URL.
            from websocket import create_connection
            ws = create_connection(self.control_url, timeout=3)
            ws.close()
            return True
        except Exception as e:
            if self.debug_mode:
                print(f"Server connectivity check failed: {e}")
            return False

    def ensure_server_running(self):
        if not self.is_server_running():
            if self.debug_mode:
                print("STT server is not running.", file=sys.stderr)
            if self.autostart_server:
                self.start_server()
                if self.debug_mode:
                    print("Waiting for STT server to start...", file=sys.stderr)
                for _ in range(20):  # Wait up to 20 seconds
                    if self.is_server_running():
                        if self.debug_mode:
                            print("STT server started successfully.", file=sys.stderr)
                        time.sleep(2)  # Give the server a moment to fully initialize
                        return True
                    time.sleep(1)
                print("Failed to start STT server.", file=sys.stderr)
                return False
            else:
                print("STT server is required. Please start it manually.", file=sys.stderr)
                return False

        else:
            self.server_already_running = True

        return True

    def list_devices(self):
        """List all available audio input devices."""
        audio = AudioInput(debug_mode=self.debug_mode)
        audio.list_devices()

    def start_recording(self):
        self.recording_thread = threading.Thread(target=self.record_and_send_audio)
        self.recording_thread.daemon = False
        self.recording_thread.start()

    def setup_audio(self):
        """Initialize audio input"""
        self.audio_input = AudioInput(
            input_device_index=self.input_device_index,
            debug_mode=self.debug_mode
        )
        return self.audio_input.setup()

    def record_and_send_audio(self):
        """Record and stream audio data"""
        self._recording = True

        try:
            if not self.setup_audio():
                raise Exception("Failed to set up audio recording.")

            # Initialize the WAV file writer if output_wav_file is provided
            if self.output_wav_file and not self.wav_file:
                self.wav_file = wave.open(self.output_wav_file, 'wb')
                self.wav_file.setnchannels(1)
                self.wav_file.setsampwidth(2)
                self.wav_file.setframerate(self.audio_input.device_sample_rate)  # Record at the device's native rate

            if self.debug_mode:
                print("Recording and sending audio...")

            while self.is_running:
                if self.muted:
                    time.sleep(0.01)
                    continue

                try:
                    audio_data = self.audio_input.read_chunk()

                    if self.wav_file:
                        self.wav_file.writeframes(audio_data)

                    if self.on_recorded_chunk:
                        self.on_recorded_chunk(audio_data)

                    if self.muted:
                        continue

                    if self.recording_start.is_set():
                        metadata = {"sampleRate": self.audio_input.device_sample_rate}
                        metadata_json = json.dumps(metadata)
                        metadata_length = len(metadata_json)
                        message = struct.pack('<I', metadata_length) + metadata_json.encode('utf-8') + audio_data

                        if self.is_running:
                            if log_outgoing_chunks:
                                print(".", flush=True, end='')
                            self.data_ws.send(message, opcode=ABNF.OPCODE_BINARY)
                except KeyboardInterrupt:
                    if self.debug_mode:
                        print("KeyboardInterrupt in record_and_send_audio, exiting...")
                    break
                except Exception as e:
                    print(f"Error sending audio data: {e}")
                    break

        except Exception as e:
            print(f"Error in record_and_send_audio: {e}", file=sys.stderr)
        finally:
            self.cleanup_audio()
            self.final_text_ready.set()  # Fake final text so a pending text() call can return
            self.is_running = False
            self._recording = False

    def cleanup_audio(self):
        """Clean up audio resources"""
        if hasattr(self, 'audio_input'):
            self.audio_input.cleanup()

    def on_control_message(self, ws, message):
        try:
            data = json.loads(message)
            # Handle server response with status
            if 'status' in data:
                if data['status'] == 'success':
                    if 'parameter' in data and 'value' in data:
                        request_id = data.get('request_id')
                        if request_id is not None and request_id in self.pending_requests:
                            if self.debug_mode:
                                print(f"Parameter {data['parameter']} = {data['value']}")
                            self.pending_requests[request_id]['value'] = data['value']
                            self.pending_requests[request_id]['event'].set()
                elif data['status'] == 'error':
                    print(f"Server Error: {data.get('message', '')}")
            else:
                print(f"Unknown control message format: {data}")
        except json.JSONDecodeError:
            print(f"Received non-JSON control message: {message}")
        except Exception as e:
            print(f"Error processing control message: {e}")

    # Handle real-time transcription and full sentence updates
    def on_data_message(self, ws, message):
        try:
            data = json.loads(message)
            # Handle real-time transcription updates
            if data.get('type') == 'realtime':
                if data['text'] != self.realtime_text:
                    self.realtime_text = data['text']

                    timestamp = datetime.now().strftime('%H:%M:%S.%f')[:-3]
                    # print(f"Realtime text [{timestamp}]: {bcolors.OKCYAN}{self.realtime_text}{bcolors.ENDC}")

                    if self.on_realtime_transcription_update:
                        # Call the callback in a new thread to avoid blocking
                        threading.Thread(
                            target=self.on_realtime_transcription_update,
                            args=(self.realtime_text,)
                        ).start()

            # Handle full sentences
            elif data.get('type') == 'fullSentence':
                self.final_text = data['text']
                self.final_text_ready.set()

            elif data.get('type') == 'recording_start':
                if self.on_recording_start:
                    self.on_recording_start()
            elif data.get('type') == 'recording_stop':
                if self.on_recording_stop:
                    self.on_recording_stop()
            elif data.get('type') == 'transcription_start':
                audio_bytes_base64 = data.get('audio_bytes_base64')
                decoded_bytes = base64.b64decode(audio_bytes_base64)

                # Reconstruct the np.int16 array from the decoded bytes
                audio_array = np.frombuffer(decoded_bytes, dtype=np.int16)

                # If the original data was normalized, convert to np.float32 and normalize
                INT16_MAX_ABS_VALUE = 32768.0
                normalized_audio = audio_array.astype(np.float32) / INT16_MAX_ABS_VALUE

                if self.on_transcription_start:
                    self.on_transcription_start(normalized_audio)
            elif data.get('type') == 'vad_detect_start':
                if self.on_vad_detect_start:
                    self.on_vad_detect_start()
            elif data.get('type') == 'vad_detect_stop':
                if self.on_vad_detect_stop:
                    self.on_vad_detect_stop()
            elif data.get('type') == 'vad_start':
                if self.on_vad_start:
                    self.on_vad_start()
            elif data.get('type') == 'vad_stop':
                if self.on_vad_stop:
                    self.on_vad_stop()
            elif data.get('type') == 'start_turn_detection':
                if self.on_turn_detection_start:
                    self.on_turn_detection_start()
            elif data.get('type') == 'stop_turn_detection':
                if self.on_turn_detection_stop:
                    self.on_turn_detection_stop()
            elif data.get('type') == 'wakeword_detected':
                if self.on_wakeword_detected:
                    self.on_wakeword_detected()
            elif data.get('type') == 'wakeword_detection_start':
                if self.on_wakeword_detection_start:
                    self.on_wakeword_detection_start()
            elif data.get('type') == 'wakeword_detection_end':
                if self.on_wakeword_detection_end:
                    self.on_wakeword_detection_end()
            elif data.get('type') == 'recorded_chunk':
                pass

            else:
                print(f"Unknown data message format: {data}")

        except json.JSONDecodeError:
            print(f"Received non-JSON data message: {message}")
        except Exception as e:
            print(f"Error processing data message: {e}")

    def on_error(self, ws, error):
        print(f"WebSocket error: {error}")

    def on_close(self, ws, close_status_code, close_msg):
        if self.debug_mode:
            if ws == self.data_ws:
                print(f"Data WebSocket connection closed: {close_status_code} - {close_msg}")
            elif ws == self.control_ws:
                print(f"Control WebSocket connection closed: {close_status_code} - {close_msg}")

        self.is_running = False

    def on_control_open(self, ws):
        if self.debug_mode:
            print("Control WebSocket connection opened.")
        self.connection_established.set()

    def on_data_open(self, ws):
        if self.debug_mode:
            print("Data WebSocket connection opened.")

    def set_parameter(self, parameter, value):
        command = {
            "command": "set_parameter",
            "parameter": parameter,
            "value": value
        }
        self.control_ws.send(json.dumps(command))

    def get_parameter(self, parameter):
        # Generate a unique request_id
        request_id = self.request_counter
        self.request_counter += 1

        # Prepare the command with the request_id
        command = {
            "command": "get_parameter",
            "parameter": parameter,
            "request_id": request_id
        }

        # Create an event to wait for the response
        event = threading.Event()
        self.pending_requests[request_id] = {'event': event, 'value': None}

        # Send the command to the server
        self.control_ws.send(json.dumps(command))

        # Wait for the response or timeout after 5 seconds
        if event.wait(timeout=5):
            value = self.pending_requests[request_id]['value']
            # Clean up the pending request
            del self.pending_requests[request_id]
            return value
        else:
            print(f"Timeout waiting for get_parameter {parameter}")
            # Clean up the pending request
            del self.pending_requests[request_id]
            return None

    def call_method(self, method, args=None, kwargs=None):
        command = {
            "command": "call_method",
            "method": method,
            "args": args or [],
            "kwargs": kwargs or {}
        }
        self.control_ws.send(json.dumps(command))

    def shutdown(self):
        """Shutdown all resources"""
        self.is_running = False
        if self.control_ws:
            self.control_ws.close()
        if self.data_ws:
            self.data_ws.close()

        # Join threads
        if self.control_ws_thread:
            self.control_ws_thread.join()
        if self.data_ws_thread:
            self.data_ws_thread.join()
        if self.recording_thread:
            self.recording_thread.join()

        # Clean up audio
        self.cleanup_audio()

    def __enter__(self):
        """
        Set up the context manager protocol.

        This enables the instance to be used in a `with` statement, ensuring
        proper resource management. When the `with` block is entered, this
        method is automatically called.

        Returns:
            self: The current instance of the class.
        """
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """
        Define behavior when the context manager protocol exits.

        This is called when exiting the `with` block and ensures that any
        necessary cleanup or resource release processes are executed, such as
        shutting down the system properly.

        Args:
            exc_type (Exception or None): The type of the exception that
                caused the context to be exited, if any.
            exc_value (Exception or None): The exception instance that caused
                the context to be exited, if any.
            traceback (Traceback or None): The traceback corresponding to the
                exception, if any.
        """
        self.shutdown()
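A short usage sketch for this client (not part of the commit; it assumes an `stt-server` executable is installed so the autostart path can work): the context manager wires up both WebSockets, and text() blocks until the server returns a full sentence.

from RealtimeSTT import AudioToTextRecorderClient

# Connects to ws://127.0.0.1:8011 (control) and ws://127.0.0.1:8012 (data),
# starting `stt-server` automatically if it is not already running.
with AudioToTextRecorderClient(language="en", debug_mode=True) as client:
    print("Speak now...")
    print("Transcription:", client.text())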
245
minimal_server/RealtimeSTT/safepipe.py
Normal file
@@ -0,0 +1,245 @@
import sys
import multiprocessing as mp
import queue
import threading
import time
import logging

# Configure logging. Adjust level and formatting as needed.
# logging.basicConfig(level=logging.DEBUG,
#                     format='[%(asctime)s] %(levelname)s:%(name)s: %(message)s')
logger = logging.getLogger(__name__)

try:
    # Only set the start method if it hasn't been set already.
    if sys.platform.startswith('linux') or sys.platform == 'darwin':  # For Linux or macOS
        mp.set_start_method("spawn")
    elif mp.get_start_method(allow_none=True) is None:
        mp.set_start_method("spawn")
except RuntimeError as e:
    logger.debug("Start method has already been set. Details: %s", e)


class ParentPipe:
    """
    A thread-safe wrapper around the 'parent end' of a multiprocessing pipe.
    All actual pipe operations happen in a dedicated worker thread, so it's safe
    for multiple threads to call send(), recv(), or poll() on the same ParentPipe
    without interfering.
    """
    def __init__(self, parent_synthesize_pipe):
        self.name = "ParentPipe"
        self._pipe = parent_synthesize_pipe  # The raw pipe.
        self._closed = False  # A flag to mark if close() has been called.

        # The request queue for sending operations to the worker.
        self._request_queue = queue.Queue()

        # This event signals the worker thread to stop.
        self._stop_event = threading.Event()

        # Worker thread that executes actual .send(), .recv(), .poll() calls.
        self._worker_thread = threading.Thread(
            target=self._pipe_worker,
            name=f"{self.name}_Worker",
            daemon=True
        )
        self._worker_thread.start()

    def _pipe_worker(self):
        while not self._stop_event.is_set():
            try:
                request = self._request_queue.get(timeout=0.1)
            except queue.Empty:
                continue

            if request["type"] == "CLOSE":
                # Exit worker loop on CLOSE request.
                break

            try:
                if request["type"] == "SEND":
                    data = request["data"]
                    logger.debug("[%s] Worker: sending => %s", self.name, data)
                    self._pipe.send(data)
                    request["result_queue"].put(None)

                elif request["type"] == "RECV":
                    logger.debug("[%s] Worker: receiving...", self.name)
                    data = self._pipe.recv()
                    request["result_queue"].put(data)

                elif request["type"] == "POLL":
                    timeout = request.get("timeout", 0.0)
                    logger.debug("[%s] Worker: poll() with timeout: %s", self.name, timeout)
                    result = self._pipe.poll(timeout)
                    request["result_queue"].put(result)

            except (EOFError, BrokenPipeError, OSError) as e:
                # When the other end has closed or an error occurs,
                # log and notify the waiting thread.
                logger.debug("[%s] Worker: pipe closed or error occurred (%s). Shutting down.", self.name, e)
                request["result_queue"].put(None)
                break

            except Exception as e:
                logger.exception("[%s] Worker: unexpected error.", self.name)
                request["result_queue"].put(e)
                break

        logger.debug("[%s] Worker: stopping.", self.name)
        try:
            self._pipe.close()
        except Exception as e:
            logger.debug("[%s] Worker: error during pipe close: %s", self.name, e)

    def send(self, data):
        """
        Synchronously asks the worker thread to perform .send().
        """
        if self._closed:
            logger.debug("[%s] send() called but pipe is already closed", self.name)
            return
        logger.debug("[%s] send() requested with: %s", self.name, data)
        result_queue = queue.Queue()
        request = {
            "type": "SEND",
            "data": data,
            "result_queue": result_queue
        }
        self._request_queue.put(request)
        result_queue.get()  # Wait until sending completes.
        logger.debug("[%s] send() completed", self.name)

    def recv(self):
        """
        Synchronously asks the worker to perform .recv() and returns the data.
        """
        if self._closed:
            logger.debug("[%s] recv() called but pipe is already closed", self.name)
            return None
        logger.debug("[%s] recv() requested", self.name)
        result_queue = queue.Queue()
        request = {
            "type": "RECV",
            "result_queue": result_queue
        }
        self._request_queue.put(request)
        data = result_queue.get()

        # Log a preview for huge byte blobs.
        if isinstance(data, tuple) and len(data) == 2 and isinstance(data[1], bytes):
            data_preview = (data[0], f"<{len(data[1])} bytes>")
        else:
            data_preview = data
        logger.debug("[%s] recv() returning => %s", self.name, data_preview)
        return data

    def poll(self, timeout=0.0):
        """
        Synchronously checks whether data is available.
        Returns True if data is ready, or False otherwise.
        """
        if self._closed:
            return False
        logger.debug("[%s] poll() requested with timeout: %s", self.name, timeout)
        result_queue = queue.Queue()
        request = {
            "type": "POLL",
            "timeout": timeout,
            "result_queue": result_queue
        }
        self._request_queue.put(request)
        try:
            # Use a slightly longer timeout to give the worker a chance.
            result = result_queue.get(timeout=timeout + 0.1)
        except queue.Empty:
            result = False
        logger.debug("[%s] poll() returning => %s", self.name, result)
        return result

    def close(self):
        """
        Closes the pipe and stops the worker thread. The _closed flag makes
        sure no further operations are attempted.
        """
        if self._closed:
            return
        logger.debug("[%s] close() called", self.name)
        self._closed = True
        stop_request = {"type": "CLOSE", "result_queue": queue.Queue()}
        self._request_queue.put(stop_request)
        self._stop_event.set()
        self._worker_thread.join()
        logger.debug("[%s] closed", self.name)


def SafePipe(debug=False):
    """
    Returns a pair: (thread-safe parent pipe, raw child pipe).
    """
    parent_synthesize_pipe, child_synthesize_pipe = mp.Pipe()
    parent_pipe = ParentPipe(parent_synthesize_pipe)
    return parent_pipe, child_synthesize_pipe


def child_process_code(child_end):
    """
    Example child process code that receives messages, logs them,
    sends acknowledgements, and then closes.
    """
    for i in range(3):
        msg = child_end.recv()
        logger.debug("[Child] got: %s", msg)
        child_end.send(f"ACK: {msg}")
    child_end.close()


if __name__ == "__main__":
    parent_pipe, child_pipe = SafePipe()

    # Create child process with the child_process_code function.
    p = mp.Process(target=child_process_code, args=(child_pipe,))
    p.start()

    # Event to signal sender threads to stop if needed.
    stop_polling_event = threading.Event()

    def sender_thread(n):
        try:
            parent_pipe.send(f"hello_from_thread_{n}")
        except Exception as e:
            logger.debug("[sender_thread_%s] send exception: %s", n, e)
            return

        # Use a poll loop with error handling.
        for _ in range(10):
            try:
                if parent_pipe.poll(0.1):
                    reply = parent_pipe.recv()
                    logger.debug("[sender_thread_%s] got: %s", n, reply)
                    break
                else:
                    logger.debug("[sender_thread_%s] no data yet...", n)
            except (OSError, EOFError, BrokenPipeError) as e:
                logger.debug("[sender_thread_%s] poll/recv exception: %s. Exiting thread.", n, e)
                break

            # Allow exit if a shutdown is signaled.
            if stop_polling_event.is_set():
                logger.debug("[sender_thread_%s] stop event set. Exiting thread.", n)
                break

    threads = []
    for i in range(3):
        t = threading.Thread(target=sender_thread, args=(i,))
        t.start()
        threads.append(t)

    for t in threads:
        t.join()

    # Signal shutdown to any polling threads, then close the pipe.
    stop_polling_event.set()
    parent_pipe.close()
    p.join()
23
minimal_server/RealtimeSTT/server.py
Normal file
@@ -0,0 +1,23 @@
from fastapi import FastAPI, WebSocket
from RealtimeSTT.audio_recorder import AudioToTextRecorder
import numpy as np

app = FastAPI()

recorder = AudioToTextRecorder(
    model="tiny",
    device="cuda",
    compute_type="float16",
    use_microphone=False,
)

@app.websocket("/ws/transcribe")
async def websocket_endpoint(websocket: WebSocket):
    await websocket.accept()
    while True:
        data = await websocket.receive_bytes()
        # Convert the bytes to a numpy array (adjust to your audio format)
        audio = np.frombuffer(data, dtype=np.float32)
        recorder.feed_audio(audio)
        text = recorder.text()
        await websocket.send_text(text)
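A hedged client-side sketch for exercising this endpoint (not part of the commit; the port assumes a default uvicorn run and sample.wav is a hypothetical test file): the endpoint expects raw float32 PCM bytes and answers with the transcribed text.

# pip install websockets soundfile
import asyncio
import numpy as np
import soundfile as sf
import websockets

async def main():
    audio, _ = sf.read("sample.wav", dtype="float32")  # hypothetical test file
    async with websockets.connect("ws://127.0.0.1:8000/ws/transcribe") as ws:
        await ws.send(audio.astype(np.float32).tobytes())  # raw float32 PCM, as the server expects
        print(await ws.recv())

asyncio.run(main())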
BIN
minimal_server/RealtimeSTT/warmup_audio.wav
Normal file
Binary file not shown.