mirror of
https://github.com/leon-ai/leon.git
synced 2024-09-11 18:27:21 +03:00
feat(server): enable FlashAttention for faster inference
This commit is contained in:
parent
626c77d340
commit
fd5e952695
@@ -230,7 +230,8 @@ export default class LLMManager {
     // eslint-disable-next-line @typescript-eslint/ban-ts-comment
     // @ts-expect-error
     this._model = await this._llama.loadModel({
-      modelPath: LLM_PATH
+      modelPath: LLM_PATH,
+      defaultContextFlashAttention: true
     })

     if (HAS_LLM_NLG) {
|
Loading…
Reference in New Issue
Block a user