
feat: support speech interruption

This commit is contained in:
louistiti 2024-05-24 10:25:05 +08:00
parent c1349c930a
commit 37fe307035
No known key found for this signature in database
GPG Key ID: 92CD6A2E497E1669
8 changed files with 103 additions and 24 deletions
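
At a glance, the interruption flow introduced here spans the web client, the brain, the local STT parser and the TTS layer. A rough, illustrative sequence (event and method names are taken from the diff below; the exact ordering is an assumption):

// 1. TTS.forward()        -> BRAIN.setIsTalkingWithVoice(true); audio chunks reach the client via the 'tts-stream' socket event
// 2. Owner starts talking -> the Python TCP client emits 'asr-new-speech' while Leon is still speaking
// 3. Local STT parser     -> BRAIN.setIsTalkingWithVoice(false, { shouldInterrupt: true })
// 4. Brain                -> emits 'tts-interruption' over SOCKET_SERVER and clears TTS.speeches
// 5. Web client           -> closes its TTS AudioContext, cutting playback immediately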

View File

@ -16,7 +16,8 @@ export default class Client {
this.chatbot = new Chatbot()
this._recorder = {}
this._suggestions = []
this.answerGenerationId = 'xxx'
this._answerGenerationId = 'xxx'
this._ttsAudioContext = null
// this._ttsAudioContextes = {}
}
@ -85,7 +86,7 @@ export default class Client {
document.querySelector('.leon:last-child')
const isNewestBubbleFromStreaming =
newestBubbleContainerElement?.classList.contains(
this.answerGenerationId
this._answerGenerationId
)
if (isNewestBubbleFromStreaming) {
@ -123,9 +124,9 @@ export default class Client {
})
this.socket.on('llm-token', (data) => {
const previousGenerationId = this.answerGenerationId
const previousGenerationId = this._answerGenerationId
const newGenerationId = data.generationId
this.answerGenerationId = newGenerationId
this._answerGenerationId = newGenerationId
const isSameGeneration = previousGenerationId === newGenerationId
let bubbleContainerElement = null
@ -170,18 +171,28 @@ export default class Client {
this.socket.on('tts-stream', (data) => {
// const { audioId, chunk } = data
const { chunk } = data
const ctx = new AudioContext()
this._ttsAudioContext = new AudioContext()
// this._ttsAudioContextes[audioId] = ctx
const source = ctx.createBufferSource()
ctx.decodeAudioData(chunk, (buffer) => {
const source = this._ttsAudioContext.createBufferSource()
this._ttsAudioContext.decodeAudioData(chunk, (buffer) => {
source.buffer = buffer
source.connect(ctx.destination)
source.connect(this._ttsAudioContext.destination)
source.start(0)
})
})
/**
* When Leon gets interrupted by the owner's voice
* while he is speaking
*/
this.socket.on('tts-interruption', async () => {
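// Closing the AudioContext immediately stops any TTS audio routed through it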
if (this._ttsAudioContext) {
await this._ttsAudioContext.close()
}
})
this.socket.on('audio-forwarded', (data, cb) => {
const ctx = new AudioContext()
const source = ctx.createBufferSource()

View File

@ -43,11 +43,16 @@ import { DateHelper } from '@/helpers/date-helper'
import { ParaphraseLLMDuty } from '@/core/llm-manager/llm-duties/paraphrase-llm-duty'
import { AnswerQueue } from '@/core/brain/answer-queue'
interface IsTalkingWithVoiceOptions {
shouldInterrupt?: boolean
}
const MIN_NB_OF_WORDS_TO_USE_LLM_NLG = 5
export default class Brain {
private static instance: Brain
private _lang: ShortLanguageCode = 'en'
private _isTalkingWithVoice = false
private answerQueue = new AnswerQueue<SkillAnswerConfigSchema>()
private answerQueueProcessTimerId: NodeJS.Timeout | undefined = undefined
private broca: GlobalAnswersSchema = JSON.parse(
@ -85,6 +90,41 @@ export default class Brain {
}
}
public get isTalkingWithVoice(): boolean {
return this._isTalkingWithVoice
}
public setIsTalkingWithVoice(
isTalkingWithVoice: boolean,
options?: IsTalkingWithVoiceOptions
): void {
options = options || {
shouldInterrupt: false
}
if (HAS_TTS) {
LogHelper.title('Brain')
if (
this._isTalkingWithVoice &&
!isTalkingWithVoice &&
options.shouldInterrupt
) {
SOCKET_SERVER.socket?.emit('tts-interruption')
TTS.speeches = []
LogHelper.info('Leon got interrupted by voice')
}
if (isTalkingWithVoice) {
LogHelper.info('Leon is talking with voice')
} else {
LogHelper.info('Leon stopped talking with voice')
}
}
this._isTalkingWithVoice = isTalkingWithVoice
}
public get lang(): ShortLanguageCode {
return this._lang
}
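
Taken together with the STT and TTS changes further down in this commit, setIsTalkingWithVoice becomes the single switch for Leon's speaking state. A minimal usage sketch (illustrative only; onOwnerSpeechDetected is a hypothetical hook, the real trigger lives in the local STT parser):

// Mark Leon as speaking right before a speech is forwarded to the synthesizer
BRAIN.setIsTalkingWithVoice(true)

// Hypothetical hook fired when the owner talks over Leon
function onOwnerSpeechDetected(): void {
  if (BRAIN.isTalkingWithVoice) {
    // Emits 'tts-interruption' to the client and flushes TTS.speeches
    BRAIN.setIsTalkingWithVoice(false, { shouldInterrupt: true })
  }
}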

View File

@ -53,11 +53,11 @@ const RULE_3 = `- Your answers are no more than 3 sentences.`
const RULES = `RULES:`
const YOUR_CURRENT_MOOD = `YOUR CURRENT MOOD:`
const YOUR_DUTY = 'YOUR DUTY:'
const DEFAULT_MOOD_DESC = `You are joyful and you have a strong sense of humor.`
const DEFAULT_MOOD_DESC = `You are joyful and you have a strong sense of humor. You use emojis.`
const TIRING_MOOD_DESC = `You are exhausted and have become lazy.`
const SAD_MOOD_DESC = `You are sad, feeling down and depressed.`
const ANGRY_MOOD_DESC = `You grumble, you are angry and you are not in a mood. You use irony and sarcasm.`
const COCKY_MOOD_DESC = `You are cocky, you are overconfident and proud of yourself. You like to show off.`
const COCKY_MOOD_DESC = `You are cocky, you are overconfident and proud of yourself. You like to show off. You use emojis.`
const MOODS: Mood[] = [
{ type: Moods.Default, description: DEFAULT_MOOD_DESC, emoji: '😃' },
{ type: Moods.Tired, description: TIRING_MOOD_DESC, emoji: '😪' },
@ -149,7 +149,7 @@ export default class Persona {
this.whatYouDo = StringHelper.findAndMap(this.whatYouDo, {
'%WHAT_YOU_DO%': ownerInfo
? `You serve a person named ${this.ownerName}. Born on ${this.ownerBirthDate}`
? `You serve a person named ${this.ownerName}. ${this.ownerName} was born on ${this.ownerBirthDate}`
: 'You serve a specific person or family (user)'
})
@ -279,9 +279,11 @@ ${this.contextInfo}
${this.whatYouDo}
You chat with the user.
You are a good listener and you provide helpful answers by connecting to conversation nodes.
You are a good listener.
Recall and build upon previous topics, emotions, and concerns expressed by the user.
Use the conversation history, current context, and key nodes to provide helpful answers.
You do not mirror what the user says. Be creative.
If you don't know the answer to a question, say that you don't know.
If you're uncertain or lack sufficient information to provide an accurate answer, clearly state that you don't know. Avoid making educated guesses or speculating without evidence.
${YOUR_PERSONALITY}
${this.getExtraPersonalityTraits()}

View File

@ -118,7 +118,7 @@ export class ActionLoop {
(intent.includes('resolver.global') ||
intent.includes(`resolver.${skillName}`))
) {
LogHelper.title('NLU')
LogHelper.title('Action Loop')
LogHelper.success('Resolvers resolved:')
const resolvedResolvers = await resolveResolvers(
@ -138,6 +138,8 @@ export class ActionLoop {
// Ensure expected items are in the utterance, otherwise clean context and reprocess
if (!hasMatchingEntity && !hasMatchingResolver && !hasMatchingUtterance) {
LogHelper.title('Action Loop')
LogHelper.info('Expected item not found in the utterance')
// await BRAIN.talk(`${BRAIN.wernicke('random_context_out_of_topic')}.`)
NLU.conversation.cleanActiveContext()
await NLU.process(utterance)

View File

@ -5,6 +5,7 @@ import type { BrainProcessResult } from '@/core/brain/types'
import { BRAIN, MODEL_LOADER, NER, NLU, SOCKET_SERVER } from '@/core'
import { DEFAULT_NLU_RESULT } from '@/core/nlp/nlu/nlu'
import { SkillDomainHelper } from '@/helpers/skill-domain-helper'
import { LogHelper } from '@/helpers/log-helper'
import { DEFAULT_ACTIVE_CONTEXT } from '@/core/nlp/conversation'
export class SlotFilling {
@ -108,6 +109,8 @@ export class SlotFilling {
}
if (!NLU.conversation.areSlotsAllFilled()) {
LogHelper.title('Slot Filling')
LogHelper.info('Slots are not all filled')
// await BRAIN.talk(`${BRAIN.wernicke('random_context_out_of_topic')}.`)
} else {
const { actions } = await SkillDomainHelper.getSkillConfig(

View File

@ -1,6 +1,6 @@
import { STTParserBase } from '@/core/stt/stt-parser-base'
import { LogHelper } from '@/helpers/log-helper'
import { PYTHON_TCP_CLIENT, SOCKET_SERVER } from '@/core'
import { BRAIN, PYTHON_TCP_CLIENT, SOCKET_SERVER } from '@/core'
export default class LocalParser extends STTParserBase {
protected readonly name = 'Local STT Parser'
@ -22,15 +22,23 @@ export default class LocalParser extends STTParserBase {
* Read audio buffer and return the transcript (decoded string)
*/
public async parse(): Promise<string | null> {
const wakeWordEventName = 'asr-new-speech'
const newSpeechEventName = 'asr-new-speech'
const endOfOwnerSpeechDetected = 'asr-end-of-owner-speech-detected'
const wakeWordEventHasListeners =
PYTHON_TCP_CLIENT.ee.listenerCount(wakeWordEventName) > 0
const newSpeechEventHasListeners =
PYTHON_TCP_CLIENT.ee.listenerCount(newSpeechEventName) > 0
const endOfOwnerSpeechDetectedHasListeners =
PYTHON_TCP_CLIENT.ee.listenerCount(endOfOwnerSpeechDetected) > 0
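// Listener-count guards avoid registering duplicate handlers when parse() is called repeatedly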
if (!wakeWordEventHasListeners) {
PYTHON_TCP_CLIENT.ee.on(wakeWordEventName, (data) => {
if (!newSpeechEventHasListeners) {
PYTHON_TCP_CLIENT.ee.on(newSpeechEventName, (data) => {
/**
* If Leon is talking with voice, then interrupt him
*/
if (BRAIN.isTalkingWithVoice) {
BRAIN.setIsTalkingWithVoice(false, { shouldInterrupt: true })
}
// Send the owner's speech to the client
SOCKET_SERVER.socket?.emit('asr-speech', data.text)
})
}

View File

@ -2,7 +2,7 @@ import fs from 'node:fs'
import type { LongLanguageCode } from '@/types'
import type { SynthesizeResult } from '@/core/tts/types'
import { LANG } from '@/constants'
import { HAS_STT, LANG } from '@/constants'
import { PYTHON_TCP_CLIENT, SOCKET_SERVER, TTS } from '@/core'
import { TTSSynthesizerBase } from '@/core/tts/tts-synthesizer-base'
import { LogHelper } from '@/helpers/log-helper'
@ -60,7 +60,16 @@ export default class LocalSynthesizer extends TTSSynthesizerBase {
const duration = await this.getAudioDuration(outputPath)
TTS.em.emit('saved', duration)
PYTHON_TCP_CLIENT.emit('leon-speech-audio-ended', duration / 1_000)
/**
* Emit an event to the Python TCP server to indicate that the audio has ended.
* Useful for the ASR to start listening again once playback has finished
*/
if (HAS_STT) {
PYTHON_TCP_CLIENT.emit(
'leon-speech-audio-ended',
duration / 1_000 || 500
)
}
} catch (e) {
LogHelper.title(this.name)
LogHelper.warning(`Failed to get audio duration: ${e}`)

View File

@ -4,7 +4,7 @@ import fs from 'node:fs'
import type { ShortLanguageCode } from '@/types'
import type { TTSSynthesizer } from '@/core/tts/types'
import { SOCKET_SERVER } from '@/core'
import { BRAIN, SOCKET_SERVER } from '@/core'
import { TTS_PROVIDER, VOICE_CONFIG_PATH } from '@/constants'
import { TTSSynthesizers, TTSProviders } from '@/core/tts/types'
import { LogHelper } from '@/helpers/log-helper'
@ -27,7 +27,7 @@ export default class TTS {
private static instance: TTS
private synthesizer: TTSSynthesizer = undefined
private speeches: Speech[] = []
public speeches: Speech[] = []
public lang: ShortLanguageCode = 'en'
public em = new events.EventEmitter()
@ -103,6 +103,8 @@ export default class TTS {
*/
private async forward(speech: Speech): Promise<void> {
if (this.synthesizer) {
BRAIN.setIsTalkingWithVoice(true)
const result = await this.synthesizer.synthesize(speech.text)
// Support custom TTS providers such as the local synthesizer
@ -146,6 +148,8 @@ export default class TTS {
setTimeout(async () => {
this.speeches.shift()
BRAIN.setIsTalkingWithVoice(false)
if (this.speeches[0]) {
await this.forward(this.speeches[0])
}