Mirror of https://github.com/leon-ai/leon.git (synced 2024-11-27 16:16:48 +03:00)
feat(python tcp server): multi threading and new ASR engine
This commit is contained in: parent 72390e2fe6, commit ef368f89fb
tcp_server/src/lib/asr.py (new file, 126 lines)
@@ -0,0 +1,126 @@
import pyaudio
import audioop
import time
import torch
import numpy as np
from faster_whisper import WhisperModel


class ASR:
    def __init__(self, device='auto'):
        self.log('Loading model...')

        if device == 'auto':
            device = 'cpu'

            if torch.cuda.is_available():
                device = 'cuda'
            else:
                self.log('GPU not available. CUDA is not installed?')

            # Note: CTranslate2 (the faster-whisper backend) only accepts
            # 'cpu' and 'cuda', so an 'mps' device will be rejected at load time
            if torch.backends.mps.is_available():
                device = 'mps'

        if 'cuda' in device:
            assert torch.cuda.is_available()

        self.log(f'Device: {device}')

        self.device = device
        self.utterance = []
        self.circular_buffer = []
        self.is_voice_activity_detected = False
        self.silence_start_time = 0
        self.is_wake_word_detected = False
        self.saved_utterances = []
        self.segment_text = ''

        self.audio_format = pyaudio.paInt16
        self.channels = 1
        self.rate = 16000  # sample rate expected by the Whisper feature extractor
        self.chunk = 4096
        self.threshold = 200  # RMS level above which a chunk counts as speech
        self.silence_duration = 1  # duration of silence in seconds
        self.model_size = "distil-large-v3"
        self.buffer_size = 64  # size of the circular buffer, in chunks

        self.audio = pyaudio.PyAudio()
        # float16 requires GPU support; on CPU a compute type such as 'int8'
        # or 'float32' is typically used instead
        self.model = WhisperModel(self.model_size, device=self.device, compute_type="float16")
        self.stream = None

        self.log('Model loaded')

    def detect_wake_word(self, speech: str) -> bool:
        lowercased_speech = speech.lower().strip()
        wake_words = ["ok leon", "okay leon", "hi leon", "hey leon"]
        for wake_word in wake_words:
            if wake_word in lowercased_speech:
                return True
        return False
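
    # The wake-word check above is a plain substring match, so it can be
    # exercised without loading a model or opening an audio device
    # (hypothetical REPL session; __new__ bypasses __init__, which is safe
    # here because detect_wake_word uses no instance state):
    #   >>> asr = ASR.__new__(ASR)
    #   >>> asr.detect_wake_word('Hey Leon, what time is it?')
    #   True
    #   >>> asr.detect_wake_word('hello there')
    #   False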

    def process_circular_buffer(self):
        if len(self.circular_buffer) > self.buffer_size:
            self.circular_buffer.pop(0)

        # faster-whisper expects a float32 waveform in [-1, 1] when given a
        # NumPy array, so normalize the int16 PCM before transcribing
        audio_data = np.concatenate(self.circular_buffer).astype(np.float32) / 32768.0
        segments, info = self.model.transcribe(
            audio_data,
            beam_size=5,
            language="en",
            task="transcribe",
            condition_on_previous_text=False,
            hotwords="talking to Leon"
        )
        for segment in segments:
            words = segment.text.split()
            self.segment_text += ' '.join(words) + ' '
            if self.is_wake_word_detected:
                self.utterance.append(self.segment_text)
            if self.detect_wake_word(segment.text):
                self.log('Wake word detected')
                self.is_wake_word_detected = True
            self.log("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
            self.segment_text = ''

    def start_recording(self):
        self.stream = self.audio.open(format=self.audio_format,
                                      channels=self.channels,
                                      rate=self.rate,
                                      frames_per_buffer=self.chunk,
                                      input=True,
                                      # use the default input device
                                      input_device_index=self.audio.get_default_input_device_info()["index"])
        self.log("Recording...")
        while True:
            data = self.stream.read(self.chunk)
            data_np = np.frombuffer(data, dtype=np.int16)

            # Check if the audio data contains any non-finite values
            if not np.isfinite(data_np).all():
                self.log("Non-finite values detected in audio data. Replacing with zeros.")
                data_np = np.nan_to_num(data_np)  # replace non-finite values with zeros

            rms = audioop.rms(data, 2)  # width=2 for format=paInt16
            if rms >= self.threshold:  # loud enough to count as speech
                if not self.is_voice_activity_detected:
                    self.is_voice_activity_detected = True
                self.circular_buffer.append(data_np)
                self.process_circular_buffer()
            else:
                if self.is_voice_activity_detected:
                    self.silence_start_time = time.time()
                    self.is_voice_activity_detected = False
                # if silence has lasted for at least silence_duration seconds
                if time.time() - self.silence_start_time > self.silence_duration:
                    if len(self.utterance) > 0:
                        self.log('Reset')

                        if self.is_wake_word_detected:
                            self.saved_utterances.append(" ".join(self.utterance))
                        self.utterance = []
                        self.is_wake_word_detected = False

                    self.circular_buffer = []
                    # self.log('Silence detected')

    def stop_recording(self):
        self.stream.stop_stream()
        self.stream.close()
        self.audio.terminate()

    @staticmethod
    def log(*args, **kwargs):
        print('[ASR]', *args, **kwargs)
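
Taken on its own, the new class can be exercised with a short driver. A minimal sketch, assuming the file is importable as lib.asr and a default input device exists:

    from lib.asr import ASR

    asr = ASR(device='auto')
    try:
        # blocks: reads audio chunks and transcribes until interrupted
        asr.start_recording()
    except KeyboardInterrupt:
        asr.stop_recording()
        print(asr.saved_utterances)  # utterances captured after a wake word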
@@ -6,6 +6,7 @@ import time
 import re
 
 import lib.nlp as nlp
+from .asr import ASR
 from .tts.api import TTS
 from .constants import TTS_MODEL_CONFIG_PATH, TTS_MODEL_PATH, IS_TTS_ENABLED, TMP_PATH, IS_ASR_ENABLED
 
@@ -18,6 +19,7 @@ class TCPServer:
         self.conn = None
         self.addr = None
         self.tts = None
+        self.asr = None
 
     @staticmethod
     def log(*args, **kwargs):
@@ -47,7 +49,9 @@ class TCPServer:
             self.log('ASR is disabled')
             return
 
-        # TODO
+        # TODO: local model path
+        self.asr = ASR(device='auto')
+        self.asr.start_recording()
 
     def init(self):
         try:
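
The "# TODO: local model path" comment presumably points at loading the converted model from disk instead of downloading it from the Hugging Face Hub. A hedged sketch of what that could look like (the directory path is illustrative; per the notes in the removed test script below, it must hold config.json, preprocessor_config.json, model.bin, tokenizer.json and vocabulary.json):

    # hypothetical local directory containing the CTranslate2 model files
    model = WhisperModel('/path/to/models/distil-large-v3',
                         device='cuda',
                         compute_type='float16',
                         local_files_only=True)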
@@ -1,4 +1,5 @@
 import os
+import threading
 from os.path import join, dirname
 from dotenv import load_dotenv
 
@@ -14,6 +15,11 @@ tcp_server_host = os.environ.get('LEON_PY_TCP_SERVER_HOST', '0.0.0.0')
 tcp_server_port = os.environ.get('LEON_PY_TCP_SERVER_PORT', 1342)
 
 tcp_server = TCPServer(tcp_server_host, tcp_server_port)
-tcp_server.init_asr()
-tcp_server.init_tts()
+
+asr_thread = threading.Thread(target=tcp_server.init_asr)
+asr_thread.start()
+
+tts_thread = threading.Thread(target=tcp_server.init_tts)
+tts_thread.start()
+
 tcp_server.init()
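
Because ASR.start_recording() loops forever, a synchronous init_asr() call would never return and tcp_server.init() would never run; moving both engine initializations onto threads is what the commit title's "multi threading" refers to. One hedged variant (not in this commit) is to mark the threads as daemons so they cannot keep the process alive on their own:

    asr_thread = threading.Thread(target=tcp_server.init_asr, daemon=True)
    asr_thread.start()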
@@ -1,39 +0,0 @@
import socket
import pyaudio

# Set up a TCP socket
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
server_address = ('127.0.0.1', 1342)  # Replace with your server's IP and port
sock.connect(server_address)

# Set up the audio stream
chunk = 1024  # Record in chunks of 1024 samples
sample_format = pyaudio.paInt16  # 16 bits per sample
channels = 1
fs = 44100  # Record at 44100 samples per second

p = pyaudio.PyAudio()

stream = p.open(format=sample_format,
                channels=channels,
                rate=fs,
                frames_per_buffer=chunk,
                input=True)

# Start the stream and send audio data over the TCP socket
print('Recording')
while True:
    data = stream.read(chunk)
    sock.sendall(data)

# Stop and close the stream
stream.stop_stream()
stream.close()

# Terminate the PortAudio interface
p.terminate()

print('Finished recording')

# Close the socket
sock.close()
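
For reference, this removed prototype streamed raw int16 PCM straight over TCP, so the matching server side would just be a byte-receive loop. A minimal sketch of that counterpart (hypothetical, not part of the repository):

    import socket

    srv = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    srv.bind(('127.0.0.1', 1342))
    srv.listen(1)
    conn, addr = srv.accept()
    while True:
        data = conn.recv(1024 * 2)  # up to one chunk of int16 samples
        if not data:
            break
        # feed data into a decoder or VAD here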
@@ -1,33 +0,0 @@
from faster_whisper import WhisperModel

def detect_wake_word(speech: str) -> bool:
    lowercased_speech = speech.lower().strip()
    wake_words = ["ok leon", "hi leon", "hey leon"]
    for wake_word in wake_words:
        if wake_word in lowercased_speech:
            return True
    return False

# config.json; preprocessor_config.json; model.bin; tokenizer.json; vocabulary.json
model_size = "distil-medium.en"

audio_path = '/home/louis/Desktop/asr-test.wav'

# https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/transcribe.py
# model_size_or_path = "distil-large-v3"
# download_root = "/path/to/download"
# local_files_only = True
# files = ["config.json", "preprocessor_config.json", "model.bin", "tokenizer.json", "vocabulary.json"]
# TODO: auto device choice
model = WhisperModel(model_size, device="cuda", compute_type="float16")
segments, info = model.transcribe(
    audio_path,
    beam_size=5,
    language="en",
    condition_on_previous_text=False,
    hotwords="Leon"
)

for segment in segments:
    print(segment)
    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
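
Both this prototype and the RMS gate in the new asr.py handle silence by hand; faster-whisper can also filter non-speech itself through its built-in Silero VAD. A hedged sketch of that alternative (the parameters exist in the library, the values are illustrative):

    segments, info = model.transcribe(
        audio_path,
        beam_size=5,
        language="en",
        vad_filter=True,  # let the built-in Silero VAD drop non-speech
        vad_parameters=dict(min_silence_duration_ms=1000)
    )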