mirror of https://github.com/leon-ai/leon.git synced 2024-11-27 16:16:48 +03:00

feat(python tcp server): multi threading and new ASR engine

louistiti 2024-05-21 16:38:45 +08:00
parent 72390e2fe6
commit ef368f89fb
No known key found for this signature in database
GPG Key ID: 92CD6A2E497E1669
5 changed files with 139 additions and 75 deletions

tcp_server/src/lib/asr.py Normal file

@@ -0,0 +1,126 @@
import pyaudio
import audioop
import time
import torch
import numpy as np

from faster_whisper import WhisperModel


class ASR:
    def __init__(self, device='auto'):
        self.log('Loading model...')

        if device == 'auto':
            device = 'cpu'
            if torch.cuda.is_available(): device = 'cuda'
            else: self.log('GPU not available. CUDA is not installed?')
            if torch.backends.mps.is_available(): device = 'mps'
        if 'cuda' in device:
            assert torch.cuda.is_available()

        self.log(f'Device: {device}')

        self.device = device
        self.utterance = []
        self.circular_buffer = []
        self.is_voice_activity_detected = False
        self.silence_start_time = 0
        self.is_wake_word_detected = False
        self.saved_utterances = []
        self.segment_text = ''

        self.audio_format = pyaudio.paInt16
        self.channels = 1
        self.rate = 16000
        self.chunk = 4096
        self.threshold = 200
        self.silence_duration = 1  # duration of silence in seconds
        self.model_size = "distil-large-v3"
        self.buffer_size = 64  # Size of the circular buffer

        self.audio = pyaudio.PyAudio()
        self.model = WhisperModel(self.model_size, device=self.device, compute_type="float16")
        self.stream = None

        self.log('Model loaded')

    def detect_wake_word(self, speech: str) -> bool:
        lowercased_speech = speech.lower().strip()
        wake_words = ["ok leon", "okay leon", "hi leon", "hey leon"]

        for wake_word in wake_words:
            if wake_word in lowercased_speech:
                return True

        return False

    def process_circular_buffer(self):
        if len(self.circular_buffer) > self.buffer_size:
            self.circular_buffer.pop(0)

        audio_data = np.concatenate(self.circular_buffer)
        segments, info = self.model.transcribe(
            audio_data,
            beam_size=5,
            language="en",
            task="transcribe",
            condition_on_previous_text=False,
            hotwords="talking to Leon"
        )

        for segment in segments:
            words = segment.text.split()
            self.segment_text += ' '.join(words) + ' '

            if self.is_wake_word_detected:
                self.utterance.append(self.segment_text)

            if self.detect_wake_word(segment.text):
                self.log('Wake word detected')
                self.is_wake_word_detected = True

            self.log("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))

        self.segment_text = ''

    def start_recording(self):
        self.stream = self.audio.open(format=self.audio_format,
                                      channels=self.channels,
                                      rate=self.rate,
                                      frames_per_buffer=self.chunk,
                                      input=True,
                                      input_device_index=self.audio.get_default_input_device_info()["index"])  # Use the default input device

        self.log("Recording...")

        frames = []

        while True:
            data = self.stream.read(self.chunk)
            data_np = np.frombuffer(data, dtype=np.int16)

            # Check if the audio data contains any non-finite values
            if not np.isfinite(data_np).all():
                self.log("Non-finite values detected in audio data. Replacing with zeros.")
                data_np = np.nan_to_num(data_np)  # Replace non-finite values with zeros

            rms = audioop.rms(data, 2)  # width=2 for format=paInt16

            if rms >= self.threshold:  # audio threshold
                if not self.is_voice_activity_detected:
                    self.is_voice_activity_detected = True

                self.circular_buffer.append(data_np)
                self.process_circular_buffer()
            else:
                if self.is_voice_activity_detected:
                    self.silence_start_time = time.time()
                    self.is_voice_activity_detected = False

                if time.time() - self.silence_start_time > self.silence_duration:  # If silence for SILENCE_DURATION seconds
                    if len(self.utterance) > 0:
                        self.log('Reset')

                        if self.is_wake_word_detected:
                            self.saved_utterances.append(" ".join(self.utterance))

                        self.utterance = []
                        self.is_wake_word_detected = False

                    self.circular_buffer = []
                    # self.log('Silence detected')

    def stop_recording(self):
        self.stream.stop_stream()
        self.stream.close()
        self.audio.terminate()

    @staticmethod
    def log(*args, **kwargs):
        print('[ASR]', *args, **kwargs)
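
For orientation, a minimal sketch of driving this class standalone (a hypothetical harness, not part of the commit; it assumes a working microphone and that the distil-large-v3 weights are already downloaded):

# Hypothetical harness (not in the commit): run the ASR loop until Ctrl+C,
# then print whatever was captured after a wake word.
from lib.asr import ASR  # import path assumed from the file layout above

asr = ASR(device='auto')
try:
    asr.start_recording()  # blocks inside the while True read loop
except KeyboardInterrupt:
    asr.stop_recording()
    print(asr.saved_utterances)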


@@ -6,6 +6,7 @@ import time
 import re

 import lib.nlp as nlp

+from .asr import ASR
 from .tts.api import TTS
 from .constants import TTS_MODEL_CONFIG_PATH, TTS_MODEL_PATH, IS_TTS_ENABLED, TMP_PATH, IS_ASR_ENABLED
@@ -18,6 +19,7 @@ class TCPServer:
         self.conn = None
         self.addr = None
         self.tts = None
+        self.asr = None

     @staticmethod
     def log(*args, **kwargs):
@@ -47,7 +49,9 @@ class TCPServer:
             self.log('ASR is disabled')
             return

-        # TODO
+        # TODO: local model path
+        self.asr = ASR(device='auto')
+        self.asr.start_recording()

     def init(self):
         try:
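
The "local model path" TODO maps onto options faster-whisper already exposes (they also appear, commented out, in the deleted test script at the bottom of this diff). A possible sketch of how the model could be loaded locally, with a placeholder path:

# Sketch only: load the Whisper model from a local directory instead of downloading.
# download_root and local_files_only are existing WhisperModel parameters;
# the path below is a placeholder, not a value from the commit.
model = WhisperModel(
    "distil-large-v3",
    device="cuda",
    compute_type="float16",
    download_root="/path/to/models",
    local_files_only=True
)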


@@ -1,4 +1,5 @@
 import os
+import threading
 from os.path import join, dirname

 from dotenv import load_dotenv
@@ -14,6 +15,11 @@ tcp_server_host = os.environ.get('LEON_PY_TCP_SERVER_HOST', '0.0.0.0')
 tcp_server_port = os.environ.get('LEON_PY_TCP_SERVER_PORT', 1342)
 tcp_server = TCPServer(tcp_server_host, tcp_server_port)

-tcp_server.init_asr()
-tcp_server.init_tts()
+asr_thread = threading.Thread(target=tcp_server.init_asr)
+asr_thread.start()
+
+tts_thread = threading.Thread(target=tcp_server.init_tts)
+tts_thread.start()

 tcp_server.init()
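
Note that init_asr never returns once start_recording enters its while True loop, so moving it onto its own thread is what keeps the main thread free to run tcp_server.init(). If the process should be able to exit without waiting on these loops, a daemon variant would work; this is an assumption, not what the commit does:

# Sketch: daemon threads die with the main thread (assumption, not in the commit)
asr_thread = threading.Thread(target=tcp_server.init_asr, daemon=True)
asr_thread.start()

tts_thread = threading.Thread(target=tcp_server.init_tts, daemon=True)
tts_thread.start()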


@@ -1,39 +0,0 @@
import socket

import pyaudio

# Set up a TCP socket
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
server_address = ('127.0.0.1', 1342)  # Replace with your server's IP and port
sock.connect(server_address)

# Set up the audio stream
chunk = 1024  # Record in chunks of 1024 samples
sample_format = pyaudio.paInt16  # 16 bits per sample
channels = 1
fs = 44100  # Record at 44100 samples per second

p = pyaudio.PyAudio()
stream = p.open(format=sample_format,
                channels=channels,
                rate=fs,
                frames_per_buffer=chunk,
                input=True)

# Start the stream and send audio data over the TCP socket
print('Recording')

while True:
    data = stream.read(chunk)
    sock.sendall(data)

# Stop and close the stream
stream.stop_stream()
stream.close()

# Terminate the PortAudio interface
p.terminate()
print('Finished recording')

# Close the socket
sock.close()


@@ -1,33 +0,0 @@
from faster_whisper import WhisperModel


def detect_wake_word(speech: str) -> bool:
    lowercased_speech = speech.lower().strip()
    wake_words = ["ok leon", "hi leon", "hey leon"]

    for wake_word in wake_words:
        if wake_word in lowercased_speech:
            return True

    return False


# config.json; preprocessor_config.json; model.bin; tokenizer.json; vocabulary.json
model_size = "distil-medium.en"
audio_path = '/home/louis/Desktop/asr-test.wav'

# https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/transcribe.py
# model_size_or_path = "distil-large-v3"
# download_root = "/path/to/download"
# local_files_only = True
# files = ["config.json", "preprocessor_config.json", "model.bin", "tokenizer.json", "vocabulary.json"]

# TODO: auto device choice
model = WhisperModel(model_size, device="cuda", compute_type="float16")

segments, info = model.transcribe(
    audio_path,
    beam_size=5,
    language="en",
    condition_on_previous_text=False,
    hotwords="Leon"
)

for segment in segments:
    print(segment)
    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))