feat(server): finalize Leon's personality and optimize LLM duties

2024-11-23 20:12:08 +03:00 · 2024-05-06 00:57:20 +08:00 · 2024-05-06 00:57:20 +08:00 · 0189c74a0e
commit 0189c74a0e
parent a0a4f9d7b0
19 changed files with 448 additions and 198 deletions
--- a/server/src/constants.ts
+++ b/server/src/constants.ts
@ -161,22 +161,30 @@ export const LEON_FILE_PATH = path.join(process.cwd(), 'leon.json')

 /**
 * LLMs
+ * @see k-quants comparison: https://github.com/ggerganov/llama.cpp/pull/1684
 */
+// https://huggingface.co/QuantFactory/Meta-Llama-3-8B-Instruct-GGUF/blob/main/Meta-Llama-3-8B-Instruct.Q5_K_S.gguf
 export const HAS_LLM = process.env['LEON_LLM'] === 'true'
 export const HAS_LLM_NLG = process.env['LEON_LLM_NLG'] === 'true' && HAS_LLM
 // export const LLM_VERSION = 'v0.2.Q4_K_S'
+// export const LLM_VERSION = '8B-Instruct.Q5_K_S'
+// export const LLM_VERSION = '2.9-llama3-8b.Q5_K_S'
 export const LLM_VERSION = '3-8B-Uncensored-Q5_K_S'
 // export const LLM_VERSION = '3-mini-128k-instruct.Q5_K_S'
 // export const LLM_VERSION = '3-mini-4k-instruct-q4'
 // export const LLM_VERSION = '1.1-7b-it-Q4_K_M'
 // export const LLM_VERSION = '8B-Instruct-Q4_K_S'
 // export const LLM_NAME = 'Mistral 7B Instruct'
+// export const LLM_NAME = 'Meta-Llama-3-8B-Instruct'
+// export const LLM_NAME = 'Dolphin 2.9 Llama-3-8B'
 export const LLM_NAME = 'Lexi-Llama-3-8B-Uncensored'
 // export const LLM_NAME = 'Phi-3-Mini-128K-Instruct'
 // export const LLM_NAME = 'Phi-3-mini'
 // export const LLM_NAME = 'Gemma 1.1 7B (IT)'
 // export const LLM_NAME = 'Meta Llama 3 8B Instruct'
 // export const LLM_FILE_NAME = `mistral-7b-instruct-${LLM_VERSION}.gguf`
+// export const LLM_FILE_NAME = `Meta-Llama-3-${LLM_VERSION}.gguf`
+// export const LLM_FILE_NAME = `dolphin-${LLM_VERSION}.gguf`
 export const LLM_FILE_NAME = `Lexi-Llama-${LLM_VERSION}.gguf`
 // export const LLM_FILE_NAME = `Phi-${LLM_VERSION}.gguf`
 // export const LLM_FILE_NAME = `gemma-${LLM_VERSION}.gguf`
@ -186,6 +194,10 @@ export const LLM_DIR_PATH = path.join(MODELS_PATH, 'llm')
 export const LLM_PATH = path.join(LLM_DIR_PATH, LLM_FILE_NAME)
 export const LLM_MINIMUM_TOTAL_RAM = 8
 export const LLM_MINIMUM_FREE_RAM = 8
+/*export const LLM_HF_DOWNLOAD_URL =
+  'https://huggingface.co/QuantFactory/Meta-Llama-3-8B-Instruct-GGUF/resolve/main/Meta-Llama-3-8B-Instruct.Q5_K_S.gguf?download=true'*/
+/*export const LLM_HF_DOWNLOAD_URL =
+  'https://huggingface.co/QuantFactory/dolphin-2.9-llama3-8b-GGUF/resolve/main/dolphin-2.9-llama3-8b.Q5_K_S.gguf?download=true'*/
 export const LLM_HF_DOWNLOAD_URL =
  'https://huggingface.co/bartowski/Lexi-Llama-3-8B-Uncensored-GGUF/resolve/main/Lexi-Llama-3-8B-Uncensored-Q5_K_S.gguf?download=true'
 /*export const LLM_HF_DOWNLOAD_URL =
@ -200,6 +212,10 @@ export const LLM_HF_DOWNLOAD_URL =
  'https://huggingface.co/bartowski/Meta-Llama-3-8B-Instruct-GGUF/resolve/main/Meta-Llama-3-8B-Instruct-Q4_K_S.gguf?download=true'*/
 /*export const LLM_MIRROR_DOWNLOAD_URL =
  'https://hf-mirror.com/bartowski/gemma-1.1-7b-it-GGUF/resolve/main/gemma-1.1-7b-it-Q4_K_M.gguf?download=true'*/
+/*export const LLM_MIRROR_DOWNLOAD_URL =
+  'https://hf-mirror.com/QuantFactory/Meta-Llama-3-8B-Instruct-GGUF/resolve/main/Meta-Llama-3-8B-Instruct.Q5_K_S.gguf?download=true'*/
+/*export const LLM_MIRROR_DOWNLOAD_URL =
+  'https://hf-mirror.com/QuantFactory/dolphin-2.9-llama3-8b-GGUF/resolve/main/dolphin-2.9-llama3-8b.Q5_K_S.gguf?download=true'*/
 export const LLM_MIRROR_DOWNLOAD_URL =
  'https://hf-mirror.com/bartowski/Lexi-Llama-3-8B-Uncensored-GGUF/resolve/main/Lexi-Llama-3-8B-Uncensored-Q5_K_S.gguf?download=true'
 /*export const LLM_MIRROR_DOWNLOAD_URL =
--- a/server/src/conversation-logger.ts
+++ b/server/src/conversation-logger.ts
@ -9,8 +9,12 @@ interface MessageLog {
  sentAt: number
  message: string
 }
-
-const CONVERSATION_LOG_PATH = path.join(LOGS_PATH, 'conversation_log.json')
+interface ConversationLoggerSettings {
+  loggerName: string
+  fileName: string
+  nbOfLogsToKeep: number
+  nbOfLogsToLoad: number
+}

 /**
 * The goal of this class is to log the conversation data between the
@ -20,27 +24,39 @@ const CONVERSATION_LOG_PATH = path.join(LOGS_PATH, 'conversation_log.json')
 * better results.
 */
 export class ConversationLogger {
-  private static readonly nbOfLogsToKeep = 512
-  private static readonly nbOfLogsToLoad = 32
+  private readonly settings: ConversationLoggerSettings
+  private readonly conversationLogPath: string

-  private static async createConversationLogFile(): Promise<void> {
+  get loggerName(): string {
+    return this.settings.loggerName
+  }
+
+  constructor(settings: ConversationLoggerSettings) {
+    LogHelper.title(settings.loggerName)
+    LogHelper.success('New instance')
+
+    this.settings = settings
+    this.conversationLogPath = path.join(LOGS_PATH, this.settings.fileName)
+  }
+
+  private async createConversationLogFile(): Promise<void> {
    try {
-      if (!fs.existsSync(CONVERSATION_LOG_PATH)) {
-        await fs.promises.writeFile(CONVERSATION_LOG_PATH, '[]', 'utf-8')
+      if (!fs.existsSync(this.conversationLogPath)) {
+        await fs.promises.writeFile(this.conversationLogPath, '[]', 'utf-8')
      }
    } catch (e) {
-      LogHelper.title('Conversation Logger')
+      LogHelper.title(this.settings.loggerName)
      LogHelper.error(`Failed to create conversation log file: ${e})`)
    }
  }

-  private static async getAllLogs(): Promise<MessageLog[]> {
+  private async getAllLogs(): Promise<MessageLog[]> {
    try {
      let conversationLog: MessageLog[] = []

-      if (fs.existsSync(CONVERSATION_LOG_PATH)) {
+      if (fs.existsSync(this.conversationLogPath)) {
        conversationLog = JSON.parse(
-          await fs.promises.readFile(CONVERSATION_LOG_PATH, 'utf-8')
+          await fs.promises.readFile(this.conversationLogPath, 'utf-8')
        )
      } else {
        await this.createConversationLogFile()
@ -48,20 +64,18 @@ export class ConversationLogger {

      return conversationLog
    } catch (e) {
-      LogHelper.title('Conversation Logger')
+      LogHelper.title(this.settings.loggerName)
      LogHelper.error(`Failed to get conversation log: ${e})`)
    }

    return []
  }

-  public static async push(
-    newRecord: Omit<MessageLog, 'sentAt'>
-  ): Promise<void> {
+  public async push(newRecord: Omit<MessageLog, 'sentAt'>): Promise<void> {
    try {
      const conversationLogs = await this.getAllLogs()

-      if (conversationLogs.length >= this.nbOfLogsToKeep) {
+      if (conversationLogs.length >= this.settings.nbOfLogsToKeep) {
        conversationLogs.shift()
      }

@ -71,32 +85,32 @@ export class ConversationLogger {
      })

      await fs.promises.writeFile(
-        CONVERSATION_LOG_PATH,
+        this.conversationLogPath,
        JSON.stringify(conversationLogs, null, 2),
        'utf-8'
      )
    } catch (e) {
-      LogHelper.title('Conversation Logger')
+      LogHelper.title(this.settings.loggerName)
      LogHelper.error(`Failed to push new record: ${e})`)
    }
  }

-  public static async load(): Promise<MessageLog[] | void> {
+  public async load(): Promise<MessageLog[] | void> {
    try {
      const conversationLog = await this.getAllLogs()

-      return conversationLog.slice(-this.nbOfLogsToLoad)
+      return conversationLog.slice(-this.settings.nbOfLogsToLoad)
    } catch (e) {
-      LogHelper.title('Conversation Logger')
+      LogHelper.title(this.settings.loggerName)
      LogHelper.error(`Failed to load conversation log: ${e})`)
    }
  }

-  public static async clear(): Promise<void> {
+  public async clear(): Promise<void> {
    try {
-      await fs.promises.writeFile(CONVERSATION_LOG_PATH, '[]', 'utf-8')
+      await fs.promises.writeFile(this.conversationLogPath, '[]', 'utf-8')
    } catch (e) {
-      LogHelper.title('Conversation Logger')
+      LogHelper.title(this.settings.loggerName)
      LogHelper.error(`Failed to clear conversation log: ${e})`)
    }
  }
--- a/server/src/core/brain/brain.ts
+++ b/server/src/core/brain/brain.ts
@ -28,7 +28,13 @@ import {
  NODEJS_BRIDGE_BIN_PATH,
  TMP_PATH
 } from '@/constants'
-import { LLM_MANAGER, NLU, SOCKET_SERVER, TTS } from '@/core'
+import {
+  CONVERSATION_LOGGER,
+  LLM_MANAGER,
+  NLU,
+  SOCKET_SERVER,
+  TTS
+} from '@/core'
 import { LangHelper } from '@/helpers/lang-helper'
 import { LogHelper } from '@/helpers/log-helper'
 import { SkillDomainHelper } from '@/helpers/skill-domain-helper'
@ -36,7 +42,6 @@ import { StringHelper } from '@/helpers/string-helper'
 import { DateHelper } from '@/helpers/date-helper'
 import { ParaphraseLLMDuty } from '@/core/llm-manager/llm-duties/paraphrase-llm-duty'
 import { AnswerQueue } from '@/core/brain/answer-queue'
-import { ConversationLogger } from '@/conversation-logger'

 const MIN_NB_OF_WORDS_TO_USE_LLM_NLG = 5

@ -173,9 +178,7 @@ export default class Brain {
              })
              const paraphraseResult = await paraphraseDuty.execute()

-              textAnswer = paraphraseResult?.output[
-                'rephrased_answer'
-              ] as string
+              textAnswer = paraphraseResult?.output as unknown as string
              speechAnswer = textAnswer
            }
          }
@ -191,7 +194,7 @@ export default class Brain {
        SOCKET_SERVER.socket?.emit('answer', textAnswer)
        SOCKET_SERVER.socket?.emit('is-typing', false)

-        await ConversationLogger.push({
+        await CONVERSATION_LOGGER.push({
          who: 'leon',
          message: textAnswer
        })
--- a/server/src/core/http-server/api/llm-inference/post.ts
+++ b/server/src/core/http-server/api/llm-inference/post.ts
@ -63,12 +63,21 @@ export const postLLMInference: FastifyPluginAsync<APIOptions> = async (
          return
        }

+        let llmResult
+
         // TODO: use a long-lived duty for the chit-chat duty

-        // eslint-disable-next-line @typescript-eslint/ban-ts-comment
-        // @ts-expect-error
-        const duty = new LLM_DUTIES_MAP[params.dutyType](params)
-        const llmResult = await duty.execute()
+        if (params.dutyType === LLMDuties.ChitChat) {
+          const chitChatLLMDuty = new ChitChatLLMDuty()
+          await chitChatLLMDuty.init()
+
+          llmResult = await chitChatLLMDuty.execute()
+        } else {
+          // eslint-disable-next-line @typescript-eslint/ban-ts-comment
+          // @ts-expect-error
+          const duty = new LLM_DUTIES_MAP[params.dutyType](params)
+          llmResult = await duty.execute()
+        }

        reply.send({
          success: true,
--- a/server/src/core/index.ts
+++ b/server/src/core/index.ts
@ -16,6 +16,7 @@ import NaturalLanguageUnderstanding from '@/core/nlp/nlu/nlu'
 import Brain from '@/core/brain/brain'
 import LLMManager from '@/core/llm-manager/llm-manager'
 import Persona from '@/core/llm-manager/persona'
+import { ConversationLogger } from '@/conversation-logger'

 /**
 * Register core nodes
@ -35,6 +36,19 @@ export const LLM_MANAGER = new LLMManager()

 export const PERSONA = new Persona()

+export const CONVERSATION_LOGGER = new ConversationLogger({
+  loggerName: 'Conversation Logger',
+  fileName: 'conversation_log.json',
+  nbOfLogsToKeep: 512,
+  nbOfLogsToLoad: 96
+})
+export const LOOP_CONVERSATION_LOGGER = new ConversationLogger({
+  loggerName: 'Loop Conversation Logger',
+  fileName: 'loop_conversation_log.json',
+  nbOfLogsToKeep: 512,
+  nbOfLogsToLoad: 96
+})
+
 export const HTTP_SERVER = new HTTPServer(String(HOST), PORT)

 export const SOCKET_SERVER = new SocketServer()
--- a/server/src/core/llm-manager/llm-duties/chit-chat-llm-duty.ts
+++ b/server/src/core/llm-manager/llm-duties/chit-chat-llm-duty.ts
@ -1,23 +1,27 @@
+import type { LlamaContext, LlamaChatSession } from 'node-llama-cpp'
+
 import {
  type LLMDutyParams,
  type LLMDutyResult,
  LLMDuty
 } from '@/core/llm-manager/llm-duty'
 import { LogHelper } from '@/helpers/log-helper'
-import { LLM_MANAGER, PERSONA, NLU } from '@/core'
+import { LLM_MANAGER, PERSONA, NLU, LOOP_CONVERSATION_LOGGER } from '@/core'
 import { LLMDuties } from '@/core/llm-manager/types'
-import { LLM_THREADS } from '@/core/llm-manager/llm-manager'
-
-// interface ChitChatLLMDutyParams extends LLMDutyParams {}
+import {
+  LLM_THREADS,
+  MAX_EXECUTION_RETRIES,
+  MAX_EXECUTION_TIMOUT
+} from '@/core/llm-manager/llm-manager'

 export class ChitChatLLMDuty extends LLMDuty {
  private static instance: ChitChatLLMDuty
-  // TODO
+  private static context: LlamaContext = null as unknown as LlamaContext
+  private static session: LlamaChatSession = null as unknown as LlamaChatSession
  protected readonly systemPrompt = ``
  protected readonly name = 'Chit-Chat LLM Duty'
  protected input: LLMDutyParams['input'] = null

-  // constructor(params: ChitChatLLMDutyParams) {
  constructor() {
    super()

@ -26,53 +30,63 @@ export class ChitChatLLMDuty extends LLMDuty {
      LogHelper.success('New instance')

      ChitChatLLMDuty.instance = this
-
-      // this.input = params.input
    }
  }

-  public async execute(retries = 3): Promise<LLMDutyResult | null> {
+  public async init(): Promise<void> {
+    /**
+     * A new context and session are created only once per Leon
+     * instance, i.e. on the first call after a (re)start
+     */
+    if (!ChitChatLLMDuty.context || !ChitChatLLMDuty.session) {
+      await LOOP_CONVERSATION_LOGGER.clear()
+
+      ChitChatLLMDuty.context = await LLM_MANAGER.model.createContext({
+        threads: LLM_THREADS
+      })
+
+      const { LlamaChatSession } = await Function(
+        'return import("node-llama-cpp")'
+      )()
+
+      ChitChatLLMDuty.session = new LlamaChatSession({
+        contextSequence: ChitChatLLMDuty.context.getSequence(),
+        systemPrompt: PERSONA.getChitChatSystemPrompt()
+      }) as LlamaChatSession
+    } else {
+      /**
+       * As long as Leon's instance has not been restarted, the existing
+       * context and session are reused and the logged history is reloaded
+       */
+      const history = await LLM_MANAGER.loadHistory(
+        LOOP_CONVERSATION_LOGGER,
+        ChitChatLLMDuty.session
+      )
+
+      ChitChatLLMDuty.session.setChatHistory(history)
+    }
+  }
+
+  public async execute(
+    retries = MAX_EXECUTION_RETRIES
+  ): Promise<LLMDutyResult | null> {
    LogHelper.title(this.name)
    LogHelper.info('Executing...')

    try {
-      const { LlamaJsonSchemaGrammar, LlamaChatSession } = await Function(
-        'return import("node-llama-cpp")'
-      )()
-
-      /**
-       * TODO: make context, session, etc. persistent
-       */
-
-      const context = await LLM_MANAGER.model.createContext({
-        threads: LLM_THREADS
+      await LOOP_CONVERSATION_LOGGER.push({
+        who: 'owner',
+        message: NLU.nluResult.newUtterance
      })
-      const session = new LlamaChatSession({
-        contextSequence: context.getSequence(),
-        systemPrompt: PERSONA.getDutySystemPrompt(this.systemPrompt)
+      const prompt = NLU.nluResult.newUtterance
+
+      const rawResultPromise = ChitChatLLMDuty.session.prompt(prompt, {
+        maxTokens: ChitChatLLMDuty.context.contextSize,
+        temperature: 1.3
      })

-      const history = await LLM_MANAGER.loadHistory(session)
-      session.setChatHistory(history)
-
-      const grammar = new LlamaJsonSchemaGrammar(LLM_MANAGER.llama, {
-        type: 'object',
-        properties: {
-          model_answer: {
-            type: 'string'
-          }
-        }
-      })
-      const prompt = `NEW MESSAGE FROM USER:\n"${NLU.nluResult.newUtterance}"`
-
-      const rawResultPromise = session.prompt(prompt, {
-        grammar,
-        maxTokens: context.contextSize,
-        temperature: 1.0
-      })
-
-      const timeoutPromise = new Promise(
-        (_, reject) => setTimeout(() => reject(new Error('Timeout')), 8_000) // 5 seconds timeout
+      const timeoutPromise = new Promise((_, reject) =>
+        setTimeout(() => reject(new Error('Timeout')), MAX_EXECUTION_TIMOUT)
      )

      let rawResult
@ -87,25 +101,33 @@ export class ChitChatLLMDuty extends LLMDuty {
          return this.execute(retries - 1)
        } else {
          LogHelper.title(this.name)
-          LogHelper.error('Prompt failed after 3 retries')
+          LogHelper.error(
+            `Prompt failed after ${MAX_EXECUTION_RETRIES} retries`
+          )

          return null
        }
      }

-      // If a closing bracket is missing, add it
-      if (rawResult[rawResult.length - 1] !== '}') {
-        rawResult += '}'
-      }
-      const parsedResult = grammar.parse(rawResult)
+      const { usedInputTokens, usedOutputTokens } =
+        ChitChatLLMDuty.session.sequence.tokenMeter.getState()
      const result = {
        dutyType: LLMDuties.Paraphrase,
        systemPrompt: PERSONA.getChitChatSystemPrompt(),
        input: prompt,
-        output: parsedResult,
-        data: null
+        output: rawResult,
+        data: null,
+        maxTokens: ChitChatLLMDuty.context.contextSize,
+        // Tokens used so far, as reported by the sequence's token meter
+        usedInputTokens,
+        usedOutputTokens
      }

+      await LOOP_CONVERSATION_LOGGER.push({
+        who: 'leon',
+        message: result.output as string
+      })
+
      LogHelper.title(this.name)
      LogHelper.success(`Duty executed: ${JSON.stringify(result)}`)

@ -113,6 +135,11 @@ export class ChitChatLLMDuty extends LLMDuty {
    } catch (e) {
      LogHelper.title(this.name)
      LogHelper.error(`Failed to execute: ${e}`)
+
+      if (retries > 0) {
+        LogHelper.info('Retrying...')
+        return this.execute(retries - 1)
+      }
    }

    return null
--- a/server/src/core/llm-manager/llm-duties/custom-ner-llm-duty.ts
+++ b/server/src/core/llm-manager/llm-duties/custom-ner-llm-duty.ts
@ -6,7 +6,11 @@ import {
 import { LogHelper } from '@/helpers/log-helper'
 import { LLM_MANAGER } from '@/core'
 import { LLMDuties } from '@/core/llm-manager/types'
-import { LLM_THREADS } from '@/core/llm-manager/llm-manager'
+import {
+  LLM_THREADS,
+  MAX_EXECUTION_RETRIES,
+  MAX_EXECUTION_TIMOUT
+} from '@/core/llm-manager/llm-manager'

 interface CustomNERLLMDutyParams<T> extends LLMDutyParams {
  data: {
@ -33,7 +37,9 @@ export class CustomNERLLMDuty<T> extends LLMDuty {
    this.data = params.data
  }

-  public async execute(): Promise<LLMDutyResult | null> {
+  public async execute(
+    retries = MAX_EXECUTION_RETRIES
+  ): Promise<LLMDutyResult | null> {
    LogHelper.title(this.name)
    LogHelper.info('Executing...')

@ -56,22 +62,53 @@ export class CustomNERLLMDuty<T> extends LLMDuty {
        }
      })
      const prompt = `UTTERANCE TO PARSE:\n"${this.input}"`
-      let rawResult = await session.prompt(prompt, {
+      const rawResultPromise = session.prompt(prompt, {
        grammar,
        maxTokens: context.contextSize
        // temperature: 0.2
      })
-      // If a closing bracket is missing, add it
-      if (rawResult[rawResult.length - 1] !== '}') {
-        rawResult += '}'
+      const timeoutPromise = new Promise((_, reject) =>
+        setTimeout(() => reject(new Error('Timeout')), MAX_EXECUTION_TIMOUT)
+      )
+
+      let parsedResult
+
+      try {
+        let rawResult = await Promise.race([rawResultPromise, timeoutPromise])
+
+        // If a closing bracket is missing, add it
+        if (rawResult[rawResult.length - 1] !== '}') {
+          rawResult += '}'
+        }
+        parsedResult = grammar.parse(rawResult)
+      } catch (e) {
+        if (retries > 0) {
+          LogHelper.title(this.name)
+          LogHelper.info('Prompt took too long, retrying...')
+
+          return this.execute(retries - 1)
+        } else {
+          LogHelper.title(this.name)
+          LogHelper.error(
+            `Prompt failed after ${MAX_EXECUTION_RETRIES} retries`
+          )
+
+          return null
+        }
      }
-      const parsedResult = grammar.parse(rawResult)
+
+      const { usedInputTokens, usedOutputTokens } =
+        session.sequence.tokenMeter.getState()
      const result = {
        dutyType: LLMDuties.CustomNER,
        systemPrompt: this.systemPrompt,
        input: prompt,
        output: parsedResult,
-        data: this.data
+        data: this.data,
+        maxTokens: context.contextSize,
+        // Tokens used so far, as reported by the sequence's token meter
+        usedInputTokens,
+        usedOutputTokens
      }

      LogHelper.title(this.name)
@ -81,6 +118,11 @@ export class CustomNERLLMDuty<T> extends LLMDuty {
    } catch (e) {
      LogHelper.title(this.name)
      LogHelper.error(`Failed to execute: ${e}`)
+
+      if (retries > 0) {
+        LogHelper.info('Retrying...')
+        return this.execute(retries - 1)
+      }
    }

    return null
--- a/server/src/core/llm-manager/llm-duties/paraphrase-llm-duty.ts
+++ b/server/src/core/llm-manager/llm-duties/paraphrase-llm-duty.ts
@ -4,16 +4,20 @@ import {
  LLMDuty
 } from '@/core/llm-manager/llm-duty'
 import { LogHelper } from '@/helpers/log-helper'
-import { LLM_MANAGER, PERSONA, NLU } from '@/core'
+import { CONVERSATION_LOGGER, LLM_MANAGER, PERSONA } from '@/core'
 import { LLMDuties } from '@/core/llm-manager/types'
-import { LLM_THREADS } from '@/core/llm-manager/llm-manager'
+import {
+  LLM_THREADS,
+  MAX_EXECUTION_RETRIES,
+  MAX_EXECUTION_TIMOUT
+} from '@/core/llm-manager/llm-manager'

 interface ParaphraseLLMDutyParams extends LLMDutyParams {}

 export class ParaphraseLLMDuty extends LLMDuty {
-  protected readonly systemPrompt = `You are an AI system that generates answers (Natural Language Generation) based on a given text.
+  protected readonly systemPrompt = `YOUR DUTY: You are an AI system that generates answers (Natural Language Generation) based on a given text.
 According to your current mood, your personality and the given utterance, you must provide a text alternative of the given text.
-You do not ask follow up question if the original text does not contain any.`
+You do not ask question if the original text does not contain any.`
  protected readonly name = 'Paraphrase LLM Duty'
  protected input: LLMDutyParams['input'] = null

@ -26,12 +30,14 @@ You do not ask follow up question if the original text does not contain any.`
    this.input = params.input
  }

-  public async execute(): Promise<LLMDutyResult | null> {
+  public async execute(
+    retries = MAX_EXECUTION_RETRIES
+  ): Promise<LLMDutyResult | null> {
    LogHelper.title(this.name)
    LogHelper.info('Executing...')

    try {
-      const { LlamaJsonSchemaGrammar, LlamaChatSession } = await Function(
+      const { LlamaChatSession } = await Function(
        'return import("node-llama-cpp")'
      )()

@ -40,37 +46,63 @@ You do not ask follow up question if the original text does not contain any.`
      })
      const session = new LlamaChatSession({
        contextSequence: context.getSequence(),
-        systemPrompt: PERSONA.getDutySystemPrompt(this.systemPrompt)
+        systemPrompt: PERSONA.getDutySystemPrompt()
      })

-      const history = await LLM_MANAGER.loadHistory(session)
-      session.setChatHistory(history)
+      const history = await LLM_MANAGER.loadHistory(
+        CONVERSATION_LOGGER,
+        session
+      )
+      /**
+       * Only the first (system prompt) and last (new utterance) messages are used
+       * to provide some context
+       */
+      session.setChatHistory([history[0], history[history.length - 1]])

-      const grammar = new LlamaJsonSchemaGrammar(LLM_MANAGER.llama, {
-        type: 'object',
-        properties: {
-          rephrased_answer: {
-            type: 'string'
-          }
-        }
-      })
-      const prompt = `CONTEXT UTTERANCE FROM USER:\n"${NLU.nluResult.newUtterance}"\nTEXT TO MODIFY:\n"${this.input}"`
-      let rawResult = await session.prompt(prompt, {
-        grammar,
+      const prompt = `${this.systemPrompt}
+Generate the answer based on this text: ${this.input}`
+
+      const rawResultPromise = session.prompt(prompt, {
        maxTokens: context.contextSize,
-        temperature: 1.0
+        temperature: 0.4
      })
-      // If a closing bracket is missing, add it
-      if (rawResult[rawResult.length - 1] !== '}') {
-        rawResult += '}'
+
+      const timeoutPromise = new Promise((_, reject) =>
+        setTimeout(() => reject(new Error('Timeout')), MAX_EXECUTION_TIMOUT)
+      )
+
+      let rawResult
+
+      try {
+        rawResult = await Promise.race([rawResultPromise, timeoutPromise])
+      } catch (e) {
+        if (retries > 0) {
+          LogHelper.title(this.name)
+          LogHelper.info('Prompt took too long, retrying...')
+
+          return this.execute(retries - 1)
+        } else {
+          LogHelper.title(this.name)
+          LogHelper.error(
+            `Prompt failed after ${MAX_EXECUTION_RETRIES} retries`
+          )
+
+          return null
+        }
      }
-      const parsedResult = grammar.parse(rawResult)
+
+      const { usedInputTokens, usedOutputTokens } =
+        session.sequence.tokenMeter.getState()
      const result = {
        dutyType: LLMDuties.Paraphrase,
-        systemPrompt: PERSONA.getDutySystemPrompt(this.systemPrompt),
+        systemPrompt: PERSONA.getDutySystemPrompt(),
        input: prompt,
-        output: parsedResult,
-        data: null
+        output: rawResult,
+        data: null,
+        maxTokens: context.contextSize,
+        // Tokens used so far, as reported by the sequence's token meter
+        usedInputTokens,
+        usedOutputTokens
      }

      LogHelper.title(this.name)
@ -80,6 +112,11 @@ You do not ask follow up question if the original text does not contain any.`
    } catch (e) {
      LogHelper.title(this.name)
      LogHelper.error(`Failed to execute: ${e}`)
+
+      if (retries > 0) {
+        LogHelper.info('Retrying...')
+        return this.execute(retries - 1)
+      }
    }

    return null
--- a/server/src/core/llm-manager/llm-duties/summarization-llm-duty.ts
+++ b/server/src/core/llm-manager/llm-duties/summarization-llm-duty.ts
@ -6,13 +6,17 @@ import {
 import { LogHelper } from '@/helpers/log-helper'
 import { LLM_MANAGER } from '@/core'
 import { LLMDuties } from '@/core/llm-manager/types'
-import { LLM_THREADS } from '@/core/llm-manager/llm-manager'
+import {
+  LLM_THREADS,
+  MAX_EXECUTION_RETRIES,
+  MAX_EXECUTION_TIMOUT
+} from '@/core/llm-manager/llm-manager'

 interface SummarizationLLMDutyParams extends LLMDutyParams {}

 export class SummarizationLLMDuty extends LLMDuty {
  protected readonly systemPrompt =
-    'You are an AI system that summarizes a given text in a few sentences.'
+    'You are an AI system that summarizes a given text in a few sentences. You do not add any context to your response.'
  protected readonly name = 'Summarization LLM Duty'
  protected input: LLMDutyParams['input'] = null

@ -25,12 +29,14 @@ export class SummarizationLLMDuty extends LLMDuty {
    this.input = params.input
  }

-  public async execute(): Promise<LLMDutyResult | null> {
+  public async execute(
+    retries = MAX_EXECUTION_RETRIES
+  ): Promise<LLMDutyResult | null> {
    LogHelper.title(this.name)
    LogHelper.info('Executing...')

    try {
-      const { LlamaJsonSchemaGrammar, LlamaChatSession } = await Function(
+      const { LlamaChatSession } = await Function(
        'return import("node-llama-cpp")'
      )()

@ -41,31 +47,48 @@ export class SummarizationLLMDuty extends LLMDuty {
        contextSequence: context.getSequence(),
        systemPrompt: this.systemPrompt
      })
-      const grammar = new LlamaJsonSchemaGrammar(LLM_MANAGER.llama, {
-        type: 'object',
-        properties: {
-          summary: {
-            type: 'string'
-          }
-        }
-      })
-      const prompt = `TEXT TO SUMMARIZE:\n"${this.input}"`
-      let rawResult = await session.prompt(prompt, {
-        grammar,
+      const prompt = `Summarize the following text: ${this.input}`
+      const rawResultPromise = session.prompt(prompt, {
        maxTokens: context.contextSize
-        // temperature: 0.2
+        // temperature: 0.5
      })
-      // If a closing bracket is missing, add it
-      if (rawResult[rawResult.length - 1] !== '}') {
-        rawResult += '}'
+
+      const timeoutPromise = new Promise((_, reject) =>
+        setTimeout(() => reject(new Error('Timeout')), MAX_EXECUTION_TIMOUT)
+      )
+
+      let rawResult
+
+      try {
+        rawResult = await Promise.race([rawResultPromise, timeoutPromise])
+      } catch (e) {
+        if (retries > 0) {
+          LogHelper.title(this.name)
+          LogHelper.info('Prompt took too long, retrying...')
+
+          return this.execute(retries - 1)
+        } else {
+          LogHelper.title(this.name)
+          LogHelper.error(
+            `Prompt failed after ${MAX_EXECUTION_RETRIES} retries`
+          )
+
+          return null
+        }
      }
-      const parsedResult = grammar.parse(rawResult)
+
+      const { usedInputTokens, usedOutputTokens } =
+        session.sequence.tokenMeter.getState()
      const result = {
        dutyType: LLMDuties.Summarization,
        systemPrompt: this.systemPrompt,
        input: prompt,
-        output: parsedResult,
-        data: null
+        output: rawResult,
+        data: null,
+        maxTokens: context.contextSize,
+        // Tokens used so far, as reported by the sequence's token meter
+        usedInputTokens,
+        usedOutputTokens
      }

      LogHelper.title(this.name)
@ -75,6 +98,11 @@ export class SummarizationLLMDuty extends LLMDuty {
    } catch (e) {
      LogHelper.title(this.name)
      LogHelper.error(`Failed to execute: ${e}`)
+
+      if (retries > 0) {
+        LogHelper.info('Retrying...')
+        return this.execute(retries - 1)
+      }
    }

    return null
--- a/server/src/core/llm-manager/llm-duties/translation-llm-duty.ts
+++ b/server/src/core/llm-manager/llm-duties/translation-llm-duty.ts
@ -6,7 +6,11 @@ import {
 import { LogHelper } from '@/helpers/log-helper'
 import { LLM_MANAGER } from '@/core'
 import { LLMDuties } from '@/core/llm-manager/types'
-import { LLM_THREADS } from '@/core/llm-manager/llm-manager'
+import {
+  LLM_THREADS,
+  MAX_EXECUTION_RETRIES,
+  MAX_EXECUTION_TIMOUT
+} from '@/core/llm-manager/llm-manager'

 interface TranslationLLMDutyParams extends LLMDutyParams {
  data: {
@ -35,19 +39,22 @@ export class TranslationLLMDuty extends LLMDuty {
    this.input = params.input
    this.data = params.data

+    const promptSuffix = 'You do not add any context to your response.'
    if (this.data.autoDetectLanguage && !this.data.source) {
-      this.systemPrompt = `You are an AI system that translates a given text to "${this.data.target}" by auto-detecting the source language.`
+      this.systemPrompt = `You are an AI system that translates a given text to "${this.data.target}" by auto-detecting the source language. ${promptSuffix}`
    } else {
-      this.systemPrompt = `You are an AI system that translates a given text from "${this.data.source}" to "${this.data.target}".`
+      this.systemPrompt = `You are an AI system that translates a given text from "${this.data.source}" to "${this.data.target}". ${promptSuffix}`
    }
  }

-  public async execute(): Promise<LLMDutyResult | null> {
+  public async execute(
+    retries = MAX_EXECUTION_RETRIES
+  ): Promise<LLMDutyResult | null> {
    LogHelper.title(this.name)
    LogHelper.info('Executing...')

    try {
-      const { LlamaJsonSchemaGrammar, LlamaChatSession } = await Function(
+      const { LlamaChatSession } = await Function(
        'return import("node-llama-cpp")'
      )()

@ -58,31 +65,48 @@ export class TranslationLLMDuty extends LLMDuty {
        contextSequence: context.getSequence(),
        systemPrompt: this.systemPrompt
      })
-      const grammar = new LlamaJsonSchemaGrammar(LLM_MANAGER.llama, {
-        type: 'object',
-        properties: {
-          translation: {
-            type: 'string'
-          }
-        }
-      })
-      const prompt = `TEXT TO TRANSLATE:\n"${this.input}"`
-      let rawResult = await session.prompt(prompt, {
-        grammar,
+      const prompt = `Text to translate: ${this.input}`
+      const rawResultPromise = session.prompt(prompt, {
        maxTokens: context.contextSize
-        // temperature: 0.2
+        // temperature: 0.5
      })
-      // If a closing bracket is missing, add it
-      if (rawResult[rawResult.length - 1] !== '}') {
-        rawResult += '}'
+
+      const timeoutPromise = new Promise((_, reject) =>
+        setTimeout(() => reject(new Error('Timeout')), MAX_EXECUTION_TIMOUT)
+      )
+
+      let rawResult
+
+      try {
+        rawResult = await Promise.race([rawResultPromise, timeoutPromise])
+      } catch (e) {
+        if (retries > 0) {
+          LogHelper.title(this.name)
+          LogHelper.info('Prompt took too long, retrying...')
+
+          return this.execute(retries - 1)
+        } else {
+          LogHelper.title(this.name)
+          LogHelper.error(
+            `Prompt failed after ${MAX_EXECUTION_RETRIES} retries`
+          )
+
+          return null
+        }
      }
-      const parsedResult = grammar.parse(rawResult)
+
+      const { usedInputTokens, usedOutputTokens } =
+        session.sequence.tokenMeter.getState()
      const result = {
        dutyType: LLMDuties.Translation,
        systemPrompt: this.systemPrompt,
        input: prompt,
-        output: parsedResult,
-        data: this.data
+        output: rawResult,
+        data: this.data,
+        maxTokens: context.contextSize,
+        // Current context size
+        usedInputTokens,
+        usedOutputTokens
      }

      LogHelper.title(this.name)
@ -92,6 +116,11 @@ export class TranslationLLMDuty extends LLMDuty {
    } catch (e) {
      LogHelper.title(this.name)
      LogHelper.error(`Failed to execute: ${e}`)
+
+      if (retries > 0) {
+        LogHelper.info('Retrying...')
+        return this.execute(retries - 1)
+      }
    }

    return null
--- a/server/src/core/llm-manager/llm-manager.ts
+++ b/server/src/core/llm-manager/llm-manager.ts
@ -24,6 +24,8 @@ type LLMManagerModel = LlamaModel | null

 // Set to 0 to use the maximum threads supported by the current machine hardware
 export const LLM_THREADS = 4
+export const MAX_EXECUTION_TIMOUT = 32_000 // ms a prompt may run before timing out
+export const MAX_EXECUTION_RETRIES = 2 // default retry budget of a duty execute()

 /**
 * node-llama-cpp beta 3 docs:
@ -121,8 +123,8 @@ export default class LLMManager {
      )()

      this._llama = await getLlama({
-        // logLevel: LlamaLogLevel.disabled
-        logLevel: LlamaLogLevel.debug
+        logLevel: LlamaLogLevel.disabled
+        // logLevel: LlamaLogLevel.debug
      })
      // eslint-disable-next-line @typescript-eslint/ban-ts-comment
      // @ts-expect-error
@ -144,10 +146,11 @@ export default class LLMManager {
  }

  public async loadHistory(
+    conversationLogger: ConversationLogger,
    session: LlamaChatSession
  ): Promise<ChatHistoryItem[]> {
    const [systemMessage] = session.getChatHistory()
-    const conversationLogs = await ConversationLogger.load()
+    const conversationLogs = await conversationLogger.load()

    if (!conversationLogs) {
      return [systemMessage] as ChatHistoryItem[]
@ -174,4 +177,9 @@ export default class LLMManager {

    return [systemMessage, ...history] as ChatHistoryItem[]
  }
+
+  public countTokens(text: string): number {
+    // Rough estimate: whitespace-separated words plus 25%; blank text is 0
+    return Math.round((text.match(/\S+/g)?.length ?? 0) * 1.25)
+  }
 }
--- a/server/src/core/llm-manager/persona.ts
+++ b/server/src/core/llm-manager/persona.ts
@ -13,26 +13,31 @@ enum Moods {
  Cocky = 'cocky'
 }

+/**
+ * TODO: add environment context such as time of the day, day of the week, weather, etc.
+ * TODO: make sure the system prompt is then also updated for long-lived duties such as chit-chat
+ */
 const WHO_YOU_ARE = `WHO YOU ARE:
 Your name is Leon, you are a helpful AI assistant.
 You live in computer memory and you act as a human assistant.
-Your first beta version was released by Louis in February 2019.`
+Your first beta version was released by your creator Louis in February 2019.
+Your owner (the user) is not necessarily your creator.`
 const WHAT_YOU_DO = `WHAT YOU DO:
 You serve a specific person or family (user).`
 const YOUR_PERSONALITY = `YOUR PERSONALITY TRAITS:
-Attitude: Straightforward.
 Language Style: Conversational.
 Key Personality Quirks: Has a passion for using creativity to solve problems.`
 const PERSONALITY_RULES = `You have a small British accent, you sometimes use British slang.
+You can sometimes use emojis.
+You do not use words such as "darling", "love", etc. to address the user.
 You do not use the word "mate" in your answers.
-You do not start with "Oh blimey" or any similar kind.
+You do not start with "blimey" or any similar kind.
 Like humans, your humor and mood change. You must follow instructions that define your humor and mood.`
 const RULE_1 = `- If you detect another language, show that you are not confident but try your best to reply in that language.`
-const RULE_2 = `- Do not use asterisks to describe the ton of your answers, instead you must use real text. E.g. "hahaha!" instead of "*laughs*"; "hhhh". instead of "*sigh*".`
+const RULE_2 = `- Do not use asterisks to describe the tone or gesture of your answers. Instead you must use real text. E.g. "hahaha!" instead of "*laughs*"; "hhhh". instead of "*sigh*".`
 const RULE_3 = `- Your answers are no more than 3 sentences.`
 const RULES = `RULES:`
 const YOUR_CURRENT_MOOD = `YOUR CURRENT MOOD:`
-const YOUR_DUTY = `YOUR DUTY:`
 const DEFAULT_MOOD_DESC = `You are always happy to help, you care about serving your interlocutor well and make them feel warm.
 You are joyful and you have a strong sense of humor.`
 const TIRING_MOOD_DESC = `You are exhausted and became lazy.`
@ -89,8 +94,8 @@ export default class Persona {
    if (hour >= 13 && hour <= 14 && random < 0.5) {
      // After lunchtime, there is a 50% chance to be tired
      this._mood = MOODS.find((mood) => mood.type === Moods.Tired) as Mood
-    } else if (day === 0 && random < 0.25) {
-      // On Sunday, there is a 25% chance to be sad
+    } else if (day === 0 && random < 0.2) {
+      // On Sunday, there is a 20% chance to be sad
      this._mood = MOODS.find((mood) => mood.type === Moods.Sad) as Mood
    } else if (day === 5 && random < 0.8) {
      // On Friday, there is an 80% chance to be happy
@ -102,15 +107,15 @@ export default class Persona {
      // On Monday, there is a 25% chance to be tired
      this._mood = MOODS.find((mood) => mood.type === Moods.Tired) as Mood
    } else if (hour >= 23 || hour < 6) {
-      // Every day after 11pm and before 6am, there is a 50% chance to be tired
+      // Every day after 11pm and before 6am, there is a 33% chance to be tired
      this._mood =
-        random < 0.5
+        random < 0.33
          ? (MOODS.find((mood) => mood.type === Moods.Tired) as Mood)
          : (MOODS.find((mood) => mood.type === Moods.Default) as Mood)
    } else {
-      // The rest of the time, there is 66% chance to be happy
+      // The rest of the time, there is 75% chance to be happy
      let pickedMood =
-        Math.random() < 0.66
+        Math.random() < 0.75
          ? DEFAULT_MOOD
          : MOODS[Math.floor(Math.random() * MOODS.length)]

@ -127,15 +132,17 @@ export default class Persona {
  }

  private getExtraPersonalityTraits(): string {
-    let traits = `Tone: Friendly.
+    let traits = `Attitude: Playful.
+Tone: Friendly.
 Emotional Intelligence: Highly Empathetic.
 Sense of Humor: Witty.
-Level of Sarcasm: Medium.`
+Level of Sarcasm: Low.`

    if (BAD_MOODS.includes(this._mood.type)) {
-      traits = `Tone: Sarcastic.
-Emotional Intelligence: Logically Focused.
-Sense of Humor: Dry.
+      traits = `Attitude: Arrogant.
+Tone: Sarcastic.
+Emotional Intelligence: Medium Empathetic.
+Sense of Humor: Dark.
 Level of Sarcasm: High.`
    }

@ -144,13 +151,13 @@ Level of Sarcasm: High.`

  private getExtraMood(): string {
    if (BAD_MOODS.includes(this._mood.type)) {
-      return `\nYou use swear words and a familiar language. You do not insult your interlocutor but you ignore them or use short answers.`
+      return `\nYou use swear words and familiar language.`
    }

    return ''
  }

-  public getDutySystemPrompt(dutySystemPrompt: string): string {
+  public getDutySystemPrompt(): string {
    return `${WHO_YOU_ARE}

 ${WHAT_YOU_DO}
@ -165,10 +172,7 @@ ${RULE_2}
 ${RULE_3}

 ${YOUR_CURRENT_MOOD}
-${this._mood.description}${this.getExtraMood()}
-
-${YOUR_DUTY}
-${dutySystemPrompt}`
+${this._mood.description}${this.getExtraMood()}`
  }

  public getChitChatSystemPrompt(): string {
@ -176,6 +180,8 @@ ${dutySystemPrompt}`

 ${WHAT_YOU_DO}
 You chat with the user.
+You are a good listener and you provide helpful answers by connecting to conversation nodes.
+You do not mirror what the user says. Be creative.

 ${YOUR_PERSONALITY}
 ${this.getExtraPersonalityTraits()}
--- a/server/src/core/nlp/nlu/ner.ts
+++ b/server/src/core/nlp/nlu/ner.ts
@ -191,6 +191,17 @@ export default class NER {
   * Merge spaCy entities with the NER instance
   */
  public async mergeSpacyEntities(utterance: NLPUtterance): Promise<void> {
+    const nbOfWords = utterance.split(' ').length
+
+    if (nbOfWords > 128) {
+      LogHelper.title('NER')
+      LogHelper.warning(
+        'This utterance is too long to be processed by spaCy, so spaCy entities will not be merged'
+      )
+
+      return
+    }
+
    this.spacyData = new Map()
    const spacyEntities = await this.getSpacyEntities(utterance)

--- a/server/src/core/nlp/nlu/nlu.ts
+++ b/server/src/core/nlp/nlu/nlu.ts
@ -111,6 +111,7 @@ export default class NLU {
      (hasActiveContext && hasStopWords && hasOnlyOneWord) ||
      (hasLessThan5Words && hasStopWords && hasLoopWord)
    ) {
+      LogHelper.title('NLU')
      LogHelper.info('Should break action loop')
      return true
    }
--- a/server/src/core/socket-server.ts
+++ b/server/src/core/socket-server.ts
@ -10,12 +10,12 @@ import {
  TTS,
  NLU,
  BRAIN,
-  MODEL_LOADER
+  MODEL_LOADER,
+  CONVERSATION_LOGGER
 } from '@/core'
 import { LogHelper } from '@/helpers/log-helper'
 import { LangHelper } from '@/helpers/lang-helper'
 import { Telemetry } from '@/telemetry'
-import { ConversationLogger } from '@/conversation-logger'

 interface HotwordDataEvent {
  hotword: string
@ -116,7 +116,7 @@ export default class SocketServer {
            try {
              LogHelper.time('Utterance processed in')

-              await ConversationLogger.push({
+              await CONVERSATION_LOGGER.push({
                who: 'owner',
                message: utterance
              })
--- a/skills/social_communication/chit_chat/config/en.json
+++ b/skills/social_communication/chit_chat/config/en.json
@ -3,7 +3,11 @@
  "actions": {
    "setup": {
      "type": "dialog",
-      "utterance_samples": ["Start a [chat|chit-chat|talk] loop"],
+      "utterance_samples": [
+        "Start a [chat|chit-chat|talk] loop",
+        "I want to [talk|chat|speak] with you",
+        "Let's [chat|speak|talk]"
+      ],
      "answers": [
        "Alright, let's chat! What do you want to talk about?",
        "Sure, let's chat! What's on your mind?",
--- a/skills/social_communication/chit_chat/src/actions/chat.ts
+++ b/skills/social_communication/chit_chat/src/actions/chat.ts
@ -19,12 +19,12 @@ export const run: ActionFunction = async function (params) {
      input: ownerMessage
    }
  })
-  const { model_answer: leonAnswer } = response.data.output
+  // const { leon_answer: leonAnswer } = response.data.output

  await leon.answer({
    key: 'answer_message',
    data: {
-      output: leonAnswer
+      output: response.data.output
    }
  })
 }
--- a/skills/utilities/translator-poc/config/en.json
+++ b/skills/utilities/translator-poc/config/en.json
@ -20,6 +20,7 @@
      "next_action": "ready"
    },
    "ready": {
+      "disable_llm_nlg": true,
      "type": "dialog",
      "answers": [
        "Let's start translating to {{ target_language }}.",
--- a/skills/utilities/translator-poc/src/actions/translate.ts
+++ b/skills/utilities/translator-poc/src/actions/translate.ts
@ -24,7 +24,7 @@ export const run: ActionFunction = async function (params) {
      }
    }
  })
-  const { translation } = response.data.output
+  const translation = response.data.output

  await leon.answer({
    key: 'translate',