
feat: support speech interruption

This commit is contained in:
louistiti 2024-05-24 10:25:05 +08:00
parent c1349c930a
commit 37fe307035
No known key found for this signature in database
GPG Key ID: 92CD6A2E497E1669
8 changed files with 103 additions and 24 deletions
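
At a glance, the interruption flow introduced here spans the web client, the brain, the local STT parser and the TTS layer. A rough, illustrative sequence (event and method names are taken from the diff below; the exact ordering is an assumption):

// 1. TTS.forward()        -> BRAIN.setIsTalkingWithVoice(true); audio chunks reach the client via the 'tts-stream' socket event
// 2. Owner starts talking -> the Python TCP client emits 'asr-new-speech' while Leon is still speaking
// 3. Local STT parser     -> BRAIN.setIsTalkingWithVoice(false, { shouldInterrupt: true })
// 4. Brain                -> emits 'tts-interruption' over SOCKET_SERVER and clears TTS.speeches
// 5. Web client           -> closes its TTS AudioContext, cutting playback immediately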

View File

@ -16,7 +16,8 @@ export default class Client {
this.chatbot = new Chatbot()
this._recorder = {}
this._suggestions = []
this.answerGenerationId = 'xxx'
this._answerGenerationId = 'xxx'
this._ttsAudioContext = null
// this._ttsAudioContextes = {}
}
@ -85,7 +86,7 @@ export default class Client {
document.querySelector('.leon:last-child')
const isNewestBubbleFromStreaming =
newestBubbleContainerElement?.classList.contains(
this.answerGenerationId
this._answerGenerationId
)
if (isNewestBubbleFromStreaming) {
@ -123,9 +124,9 @@ export default class Client {
})
this.socket.on('llm-token', (data) => {
const previousGenerationId = this.answerGenerationId
const previousGenerationId = this._answerGenerationId
const newGenerationId = data.generationId
this.answerGenerationId = newGenerationId
this._answerGenerationId = newGenerationId
const isSameGeneration = previousGenerationId === newGenerationId
let bubbleContainerElement = null
@ -170,18 +171,28 @@ export default class Client {
this.socket.on('tts-stream', (data) => {
// const { audioId, chunk } = data
const { chunk } = data
const ctx = new AudioContext()
this._ttsAudioContext = new AudioContext()
// this._ttsAudioContextes[audioId] = ctx
const source = ctx.createBufferSource()
ctx.decodeAudioData(chunk, (buffer) => {
const source = this._ttsAudioContext.createBufferSource()
this._ttsAudioContext.decodeAudioData(chunk, (buffer) => {
source.buffer = buffer
source.connect(ctx.destination)
source.connect(this._ttsAudioContext.destination)
source.start(0)
})
})
/**
* When Leon gets interrupted by the owner's voice
* while he is speaking
*/
this.socket.on('tts-interruption', async () => {
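// Closing the AudioContext immediately stops any TTS audio routed through it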
if (this._ttsAudioContext) {
await this._ttsAudioContext.close()
}
})
this.socket.on('audio-forwarded', (data, cb) => {
const ctx = new AudioContext()
const source = ctx.createBufferSource()

View File

@ -43,11 +43,16 @@ import { DateHelper } from '@/helpers/date-helper'
import { ParaphraseLLMDuty } from '@/core/llm-manager/llm-duties/paraphrase-llm-duty'
import { AnswerQueue } from '@/core/brain/answer-queue'
interface IsTalkingWithVoiceOptions {
shouldInterrupt?: boolean
}
const MIN_NB_OF_WORDS_TO_USE_LLM_NLG = 5
export default class Brain {
private static instance: Brain
private _lang: ShortLanguageCode = 'en'
private _isTalkingWithVoice = false
private answerQueue = new AnswerQueue<SkillAnswerConfigSchema>()
private answerQueueProcessTimerId: NodeJS.Timeout | undefined = undefined
private broca: GlobalAnswersSchema = JSON.parse(
@ -85,6 +90,41 @@ export default class Brain {
}
}
public get isTalkingWithVoice(): boolean {
return this._isTalkingWithVoice
}
public setIsTalkingWithVoice(
isTalkingWithVoice: boolean,
options?: IsTalkingWithVoiceOptions
): void {
options = options || {
shouldInterrupt: false
}
if (HAS_TTS) {
LogHelper.title('Brain')
if (
this._isTalkingWithVoice &&
!isTalkingWithVoice &&
options.shouldInterrupt
) {
SOCKET_SERVER.socket?.emit('tts-interruption')
TTS.speeches = []
LogHelper.info('Leon got interrupted by voice')
}
if (isTalkingWithVoice) {
LogHelper.info('Leon is talking with voice')
} else {
LogHelper.info('Leon stopped talking with voice')
}
}
this._isTalkingWithVoice = isTalkingWithVoice
}
public get lang(): ShortLanguageCode {
return this._lang
}
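
Taken together with the STT and TTS changes further down in this commit, setIsTalkingWithVoice becomes the single switch for Leon's speaking state. A minimal usage sketch (illustrative only; onOwnerSpeechDetected is a hypothetical hook, the real trigger lives in the local STT parser):

// Mark Leon as speaking right before a speech is forwarded to the synthesizer
BRAIN.setIsTalkingWithVoice(true)

// Hypothetical hook fired when the owner talks over Leon
function onOwnerSpeechDetected(): void {
  if (BRAIN.isTalkingWithVoice) {
    // Emits 'tts-interruption' to the client and flushes TTS.speeches
    BRAIN.setIsTalkingWithVoice(false, { shouldInterrupt: true })
  }
}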

View File

@ -53,11 +53,11 @@ const RULE_3 = `- Your answers are no more than 3 sentences.`
const RULES = `RULES:`
const YOUR_CURRENT_MOOD = `YOUR CURRENT MOOD:`
const YOUR_DUTY = 'YOUR DUTY:'
const DEFAULT_MOOD_DESC = `You are joyful and you have a strong sense of humor.`
const DEFAULT_MOOD_DESC = `You are joyful and you have a strong sense of humor. You use emojis.`
const TIRING_MOOD_DESC = `You are exhausted and have become lazy.`
const SAD_MOOD_DESC = `You are sad, feeling down and depressed.`
const ANGRY_MOOD_DESC = `You grumble, you are angry and you are not in a mood. You use irony and sarcasm.`
const COCKY_MOOD_DESC = `You are cocky, you are overconfident and proud of yourself. You like to show off.`
const COCKY_MOOD_DESC = `You are cocky, you are overconfident and proud of yourself. You like to show off. You use emojis.`
const MOODS: Mood[] = [
{ type: Moods.Default, description: DEFAULT_MOOD_DESC, emoji: '😃' },
{ type: Moods.Tired, description: TIRING_MOOD_DESC, emoji: '😪' },
@ -149,7 +149,7 @@ export default class Persona {
this.whatYouDo = StringHelper.findAndMap(this.whatYouDo, {
'%WHAT_YOU_DO%': ownerInfo
? `You serve a person named ${this.ownerName}. Born on ${this.ownerBirthDate}`
? `You serve a person named ${this.ownerName}. ${this.ownerName} was born on ${this.ownerBirthDate}`
: 'You serve a specific person or family (user)'
})
@ -279,9 +279,11 @@ ${this.contextInfo}
${this.whatYouDo}
You chat with the user.
You are a good listener and you provide helpful answers by connecting to conversation nodes.
You are a good listener.
Recall and build upon previous topics, emotions, and concerns expressed by the user.
Use the conversation history, current context, and key nodes to provide helpful answers.
You do not mirror what the user says. Be creative.
If you don't know the answer to a question, say that you don't know.
If you're uncertain or lack sufficient information to provide an accurate answer, clearly state that you don't know. Avoid making educated guesses or speculating without evidence.
${YOUR_PERSONALITY}
${this.getExtraPersonalityTraits()}

View File

@ -118,7 +118,7 @@ export class ActionLoop {
(intent.includes('resolver.global') ||
intent.includes(`resolver.${skillName}`))
) {
LogHelper.title('NLU')
LogHelper.title('Action Loop')
LogHelper.success('Resolvers resolved:')
const resolvedResolvers = await resolveResolvers(
@ -138,6 +138,8 @@ export class ActionLoop {
// Ensure expected items are in the utterance, otherwise clean context and reprocess
if (!hasMatchingEntity && !hasMatchingResolver && !hasMatchingUtterance) {
LogHelper.title('Action Loop')
LogHelper.info('Expected item not found in the utterance')
// await BRAIN.talk(`${BRAIN.wernicke('random_context_out_of_topic')}.`)
NLU.conversation.cleanActiveContext()
await NLU.process(utterance)

View File

@ -5,6 +5,7 @@ import type { BrainProcessResult } from '@/core/brain/types'
import { BRAIN, MODEL_LOADER, NER, NLU, SOCKET_SERVER } from '@/core'
import { DEFAULT_NLU_RESULT } from '@/core/nlp/nlu/nlu'
import { SkillDomainHelper } from '@/helpers/skill-domain-helper'
import { LogHelper } from '@/helpers/log-helper'
import { DEFAULT_ACTIVE_CONTEXT } from '@/core/nlp/conversation'
export class SlotFilling {
@ -108,6 +109,8 @@ export class SlotFilling {
}
if (!NLU.conversation.areSlotsAllFilled()) {
LogHelper.title('Slot Filling')
LogHelper.info('Slots are not all filled')
// await BRAIN.talk(`${BRAIN.wernicke('random_context_out_of_topic')}.`)
} else {
const { actions } = await SkillDomainHelper.getSkillConfig(

View File

@ -1,6 +1,6 @@
import { STTParserBase } from '@/core/stt/stt-parser-base'
import { LogHelper } from '@/helpers/log-helper'
import { PYTHON_TCP_CLIENT, SOCKET_SERVER } from '@/core'
import { BRAIN, PYTHON_TCP_CLIENT, SOCKET_SERVER } from '@/core'
export default class LocalParser extends STTParserBase {
protected readonly name = 'Local STT Parser'
@ -22,15 +22,23 @@ export default class LocalParser extends STTParserBase {
* Read audio buffer and return the transcript (decoded string)
*/
public async parse(): Promise<string | null> {
const wakeWordEventName = 'asr-new-speech'
const newSpeechEventName = 'asr-new-speech'
const endOfOwnerSpeechDetected = 'asr-end-of-owner-speech-detected'
const wakeWordEventHasListeners =
PYTHON_TCP_CLIENT.ee.listenerCount(wakeWordEventName) > 0
const newSpeechEventHasListeners =
PYTHON_TCP_CLIENT.ee.listenerCount(newSpeechEventName) > 0
const endOfOwnerSpeechDetectedHasListeners =
PYTHON_TCP_CLIENT.ee.listenerCount(endOfOwnerSpeechDetected) > 0
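// Listener-count guards avoid registering duplicate handlers when parse() is called repeatedly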
if (!wakeWordEventHasListeners) {
PYTHON_TCP_CLIENT.ee.on(wakeWordEventName, (data) => {
if (!newSpeechEventHasListeners) {
PYTHON_TCP_CLIENT.ee.on(newSpeechEventName, (data) => {
/**
* If Leon is talking with voice, then interrupt him
*/
if (BRAIN.isTalkingWithVoice) {
BRAIN.setIsTalkingWithVoice(false, { shouldInterrupt: true })
}
// Send the owner's speech to the client
SOCKET_SERVER.socket?.emit('asr-speech', data.text)
})
}

View File

@ -2,7 +2,7 @@ import fs from 'node:fs'
import type { LongLanguageCode } from '@/types'
import type { SynthesizeResult } from '@/core/tts/types'
import { LANG } from '@/constants'
import { HAS_STT, LANG } from '@/constants'
import { PYTHON_TCP_CLIENT, SOCKET_SERVER, TTS } from '@/core'
import { TTSSynthesizerBase } from '@/core/tts/tts-synthesizer-base'
import { LogHelper } from '@/helpers/log-helper'
@ -60,7 +60,16 @@ export default class LocalSynthesizer extends TTSSynthesizerBase {
const duration = await this.getAudioDuration(outputPath)
TTS.em.emit('saved', duration)
PYTHON_TCP_CLIENT.emit('leon-speech-audio-ended', duration / 1_000)
/**
* Emit an event to the Python TCP server to indicate that the audio has ended.
* Useful for the ASR to start listening again once playback has finished
*/
if (HAS_STT) {
PYTHON_TCP_CLIENT.emit(
'leon-speech-audio-ended',
duration / 1_000 || 500
)
}
} catch (e) {
LogHelper.title(this.name)
LogHelper.warning(`Failed to get audio duration: ${e}`)

View File

@ -4,7 +4,7 @@ import fs from 'node:fs'
import type { ShortLanguageCode } from '@/types'
import type { TTSSynthesizer } from '@/core/tts/types'
import { SOCKET_SERVER } from '@/core'
import { BRAIN, SOCKET_SERVER } from '@/core'
import { TTS_PROVIDER, VOICE_CONFIG_PATH } from '@/constants'
import { TTSSynthesizers, TTSProviders } from '@/core/tts/types'
import { LogHelper } from '@/helpers/log-helper'
@ -27,7 +27,7 @@ export default class TTS {
private static instance: TTS
private synthesizer: TTSSynthesizer = undefined
private speeches: Speech[] = []
public speeches: Speech[] = []
public lang: ShortLanguageCode = 'en'
public em = new events.EventEmitter()
@ -103,6 +103,8 @@ export default class TTS {
*/
private async forward(speech: Speech): Promise<void> {
if (this.synthesizer) {
BRAIN.setIsTalkingWithVoice(true)
const result = await this.synthesizer.synthesize(speech.text)
// Support custom TTS providers such as the local synthesizer
@ -146,6 +148,8 @@ export default class TTS {
setTimeout(async () => {
this.speeches.shift()
BRAIN.setIsTalkingWithVoice(false)
if (this.speeches[0]) {
await this.forward(this.speeches[0])
}