Mirror of https://github.com/leon-ai/leon.git (synced 2024-10-03 20:57:55 +03:00)
feat: use VRAM as LLM unit requirements

Parent: 6867e9c6db
Commit: 0c775ba2e5
@@ -36,6 +36,11 @@ document.addEventListener('DOMContentLoaded', async () => {
   const infoKeys = [
     'timeZone',
     'telemetry',
+    'gpu',
+    'graphicsComputeAPI',
+    'totalVRAM',
+    'freeVRAM',
+    'usedVRAM',
     'llm',
     'shouldWarmUpLLMDuties',
     'isLLMActionRecognitionEnabled',
@@ -7,7 +7,7 @@ import { command } from 'execa'
 import {
   LLM_NAME,
   LLM_NAME_WITH_VERSION,
-  LLM_MINIMUM_TOTAL_RAM,
+  LLM_MINIMUM_TOTAL_VRAM,
   LLM_DIR_PATH,
   LLM_PATH,
   LLM_VERSION,
@@ -31,8 +31,27 @@ import { FileHelper } from '@/helpers/file-helper'
 const LLM_MANIFEST_PATH = path.join(LLM_DIR_PATH, 'manifest.json')
 let manifest = null
 
-function checkMinimumHardwareRequirements() {
-  return SystemHelper.getTotalRAM() >= LLM_MINIMUM_TOTAL_RAM
+async function checkMinimumHardwareRequirements() {
+  const { getLlama, LlamaLogLevel } = await Function(
+    'return import("node-llama-cpp")'
+  )()
+  const llama = await getLlama({
+    logLevel: LlamaLogLevel.disabled
+  })
+
+  if (!(await SystemHelper.hasGPU(llama))) {
+    return false
+  }
+
+  LogHelper.info(
+    `GPU detected: ${(await SystemHelper.getGPUDeviceNames(llama))[0]}`
+  )
+  LogHelper.info(
+    `Graphics compute API: ${await SystemHelper.getGraphicsComputeAPI(llama)}`
+  )
+  LogHelper.info(`Total VRAM: ${await SystemHelper.getTotalVRAM(llama)} GB`)
+
+  return (await SystemHelper.getTotalVRAM(llama)) >= LLM_MINIMUM_TOTAL_VRAM
 }
 
 async function downloadLLM() {
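The `await Function('return import("node-llama-cpp")')()` call stands in for a plain dynamic `import()`. A likely reason, not stated in this commit, is that node-llama-cpp v3 is ESM-only: in a CommonJS build the transpiler rewrites `import()` into `require()`, which cannot load an ESM-only package, while building the import expression through the `Function` constructor hides it from the transpiler so a real dynamic import runs at runtime. A minimal sketch of the pattern; `importESM` and `loadLlama` are hypothetical helper names, not part of the Leon codebase:

// Hypothetical sketch: keep a dynamic import() out of reach of CommonJS transpilation.
const importESM = <T = unknown>(specifier: string): Promise<T> =>
  Function('specifier', 'return import(specifier)')(specifier) as Promise<T>

async function loadLlama() {
  // Same node-llama-cpp entry points as the setup script above uses.
  const { getLlama, LlamaLogLevel } = await importESM<
    typeof import('node-llama-cpp')
  >('node-llama-cpp')

  return getLlama({ logLevel: LlamaLogLevel.disabled })
}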
@@ -162,13 +181,19 @@ async function downloadAndCompileLlamaCPP() {
 }
 
 export default async () => {
-  const canSetupLLM = checkMinimumHardwareRequirements()
+  const canSetupLLM = await checkMinimumHardwareRequirements()
 
   if (!canSetupLLM) {
-    const totalRAM = SystemHelper.getTotalRAM()
+    const { getLlama, LlamaLogLevel } = await Function(
+      'return import("node-llama-cpp")'
+    )()
+    const llama = await getLlama({
+      logLevel: LlamaLogLevel.disabled
+    })
+    const totalVRAM = await SystemHelper.getTotalVRAM(llama)
 
     LogHelper.warning(
-      `LLM requires at least ${LLM_MINIMUM_TOTAL_RAM} of total RAM. Current total RAM is ${totalRAM} GB. No worries though, Leon can still run without LLM.`
+      `LLM requires at least ${LLM_MINIMUM_TOTAL_VRAM} GB of total VRAM. Current total VRAM is ${totalVRAM} GB. No worries though, Leon can still run without LLM.`
     )
   } else {
     await downloadLLM()
@@ -283,8 +283,8 @@ export const LLM_FILE_NAME = `Lexi-Llama-${LLM_VERSION}.gguf`
 export const LLM_NAME_WITH_VERSION = `${LLM_NAME} (${LLM_VERSION})`
 export const LLM_DIR_PATH = path.join(MODELS_PATH, 'llm')
 export const LLM_PATH = path.join(LLM_DIR_PATH, LLM_FILE_NAME)
-export const LLM_MINIMUM_TOTAL_RAM = 8
-export const LLM_MINIMUM_FREE_RAM = 8
+export const LLM_MINIMUM_TOTAL_VRAM = 8
+export const LLM_MINIMUM_FREE_VRAM = 8
 /*export const LLM_HF_DOWNLOAD_URL = NetworkHelper.setHuggingFaceURL(
   'https://huggingface.co/QuantFactory/Meta-Llama-3-8B-Instruct-GGUF/resolve/main/Meta-Llama-3-8B-Instruct.Q5_K_S.gguf?download=true'
 )*/
@@ -14,6 +14,7 @@ import {
 import { LLM_MANAGER, PERSONA } from '@/core'
 import { LogHelper } from '@/helpers/log-helper'
 import { DateHelper } from '@/helpers/date-helper'
+import { SystemHelper } from '@/helpers/system-helper'
 
 export const getInfo: FastifyPluginAsync<APIOptions> = async (
   fastify,
@@ -27,6 +28,20 @@ export const getInfo: FastifyPluginAsync<APIOptions> = async (
       const message = 'Information pulled.'
       LogHelper.success(message)
 
+      const [
+        gpuDeviceNames,
+        graphicsComputeAPI,
+        totalVRAM,
+        freeVRAM,
+        usedVRAM
+      ] = await Promise.all([
+        SystemHelper.getGPUDeviceNames(),
+        SystemHelper.getGraphicsComputeAPI(),
+        SystemHelper.getTotalVRAM(),
+        SystemHelper.getFreeVRAM(),
+        SystemHelper.getUsedVRAM()
+      ])
+
       reply.send({
         success: true,
         status: 200,
@@ -39,6 +54,11 @@ export const getInfo: FastifyPluginAsync<APIOptions> = async (
             LLM_MANAGER.isLLMActionRecognitionEnabled,
           isLLMNLGEnabled: LLM_MANAGER.isLLMNLGEnabled,
           timeZone: DateHelper.getTimeZone(),
+          gpu: gpuDeviceNames[0],
+          graphicsComputeAPI,
+          totalVRAM,
+          freeVRAM,
+          usedVRAM,
           llm: {
             enabled: LLM_MANAGER.isLLMEnabled,
             provider: LLM_PROVIDER
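The five GPU and VRAM reads added to this handler are independent of one another, so they are started together and awaited once rather than one after another. A minimal sketch of the difference, reusing the repository's SystemHelper (import path as in the diff); the timings are only illustrative:

import { SystemHelper } from '@/helpers/system-helper'

// Sequential awaits: total latency is roughly the sum of both calls.
const gpuNames = await SystemHelper.getGPUDeviceNames()
const computeAPI = await SystemHelper.getGraphicsComputeAPI()

// Parallel, as in the handler above: total latency is roughly the slowest call.
const [names, api] = await Promise.all([
  SystemHelper.getGPUDeviceNames(),
  SystemHelper.getGraphicsComputeAPI()
])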
@@ -19,6 +19,7 @@ import { llmInferencePlugin } from '@/core/http-server/api/llm-inference'
 import { keyMidd } from '@/core/http-server/plugins/key'
 import { utterancePlugin } from '@/core/http-server/api/utterance'
 import { LLM_MANAGER, PERSONA } from '@/core'
+import { SystemHelper } from '@/helpers/system-helper'
 
 const API_VERSION = 'v1'
 
@@ -65,6 +66,12 @@ export default class HTTPServer {
 
     LogHelper.info(`Mood: ${PERSONA.mood.type}`)
 
+    LogHelper.info(`GPU: ${(await SystemHelper.getGPUDeviceNames())[0]}`)
+    LogHelper.info(
+      `Graphics compute API: ${await SystemHelper.getGraphicsComputeAPI()}`
+    )
+    LogHelper.info(`Total VRAM: ${await SystemHelper.getTotalVRAM()} GB`)
+
     const isLLMEnabled = LLM_MANAGER.isLLMEnabled ? 'enabled' : 'disabled'
     LogHelper.info(`LLM: ${isLLMEnabled}`)
 
@@ -12,8 +12,8 @@ import {
   HAS_LLM,
   HAS_LLM_ACTION_RECOGNITION,
   HAS_LLM_NLG,
-  LLM_MINIMUM_FREE_RAM,
-  LLM_MINIMUM_TOTAL_RAM,
+  LLM_MINIMUM_FREE_VRAM,
+  LLM_MINIMUM_TOTAL_VRAM,
   LLM_NAME_WITH_VERSION,
   LLM_PATH,
   LLM_PROVIDER,
@@ -180,11 +180,13 @@ export default class LLMManager {
     }
 
     if (LLM_PROVIDER === LLMProviders.Local) {
-      const freeRAMInGB = SystemHelper.getFreeRAM()
-      const totalRAMInGB = SystemHelper.getTotalRAM()
+      const [freeVRAMInGB, totalVRAMInGB] = await Promise.all([
+        SystemHelper.getFreeVRAM(),
+        SystemHelper.getTotalVRAM()
+      ])
       const isLLMPathFound = fs.existsSync(LLM_PATH)
-      const isCurrentFreeRAMEnough = LLM_MINIMUM_FREE_RAM <= freeRAMInGB * 4 // Multiply by 4 to boost probability of success
-      const isTotalRAMEnough = LLM_MINIMUM_TOTAL_RAM <= totalRAMInGB
+      const isCurrentFreeRAMEnough = LLM_MINIMUM_FREE_VRAM <= freeVRAMInGB
+      const isTotalRAMEnough = LLM_MINIMUM_TOTAL_VRAM <= totalVRAMInGB
 
       /**
        * In case the LLM is not set up and
@@ -24,14 +24,14 @@ export class LogHelper {
    * This one looks obvious :)
    */
   public static info(value: string): void {
-    console.info('\x1b[36mℹ️ %s\x1b[0m', value)
+    console.info('\x1b[36mℹ️ %s\x1b[0m', value)
   }
 
   /**
    * This one looks obvious :)
    */
   public static warning(value: string): void {
-    console.warn('\x1b[33m⚠️ %s\x1b[0m', value)
+    console.warn('\x1b[33m⚠️ %s\x1b[0m', value)
   }
 
   /**
@@ -1,5 +1,7 @@
 import os from 'node:os'
 
+import type { Llama } from 'node-llama-cpp'
+
 import { OSTypes, CPUArchitectures } from '@/types'
 
 enum OSNames {
@@ -196,11 +198,11 @@ export class SystemHelper {
    * Get the names of the GPU devices on the machine
    * @example getGPUDeviceNames() // ['Apple M1 Pro']
    */
-  public static async getGPUDeviceNames(): Promise<string[]> {
-    const { LLM_MANAGER } = await import('@/core')
+  public static async getGPUDeviceNames(llama?: Llama): Promise<string[]> {
+    const llamaAPI = llama ? llama : (await import('@/core')).LLM_MANAGER.llama
 
-    if (LLM_MANAGER.llama) {
-      return LLM_MANAGER.llama.getGpuDeviceNames()
+    if (llamaAPI) {
+      return llamaAPI.getGpuDeviceNames()
     }
 
     return []
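The new optional `llama?: Llama` parameter lets the setup script probe the hardware with its own Llama instance before the core LLM manager exists, while existing callers can keep omitting the argument and fall back to `LLM_MANAGER.llama` through the lazy `@/core` import. A brief sketch of both call styles; the surrounding setup code is assumed for illustration, not taken from this commit:

import { getLlama, LlamaLogLevel, type Llama } from 'node-llama-cpp'

import { SystemHelper } from '@/helpers/system-helper'

// Setup time: the LLM manager is not constructed yet, so pass an explicit instance.
const llama: Llama = await getLlama({ logLevel: LlamaLogLevel.disabled })
const setupGPUNames = await SystemHelper.getGPUDeviceNames(llama)

// Runtime: omit the argument; the helper lazily imports '@/core'
// and uses LLM_MANAGER.llama instead.
const runtimeGPUNames = await SystemHelper.getGPUDeviceNames()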
@@ -210,11 +212,11 @@ export class SystemHelper {
    * Check if the machine has a GPU
    * @example hasGPU() // true
    */
-  public static async hasGPU(): Promise<boolean> {
-    const { LLM_MANAGER } = await import('@/core')
+  public static async hasGPU(llama?: Llama): Promise<boolean> {
+    const llamaAPI = llama ? llama : (await import('@/core')).LLM_MANAGER.llama
 
-    if (LLM_MANAGER.llama) {
-      return !!LLM_MANAGER.llama.gpu
+    if (llamaAPI) {
+      return !!llamaAPI.gpu
     }
 
     return false
@@ -224,11 +226,13 @@ export class SystemHelper {
    * Get the graphics compute API used by the machine
    * @example getGraphicsComputeAPI() // 'cuda'
    */
-  public static async getGraphicsComputeAPI(): Promise<GraphicsComputeAPIs> {
-    const { LLM_MANAGER } = await import('@/core')
+  public static async getGraphicsComputeAPI(
+    llama?: Llama
+  ): Promise<GraphicsComputeAPIs> {
+    const llamaAPI = llama ? llama : (await import('@/core')).LLM_MANAGER.llama
 
-    if (LLM_MANAGER.llama && LLM_MANAGER.llama.gpu) {
-      return LLM_MANAGER.llama.gpu as GraphicsComputeAPIs
+    if (llamaAPI && llamaAPI.gpu) {
+      return llamaAPI.gpu as GraphicsComputeAPIs
     }
 
     return GraphicsComputeAPIs.CPU
@@ -238,11 +242,11 @@ export class SystemHelper {
    * Get the amount of used VRAM (in GB) on the machine
    * @example getUsedVRAM() // 6.04
    */
-  public static async getUsedVRAM(): Promise<number> {
-    const { LLM_MANAGER } = await import('@/core')
+  public static async getUsedVRAM(llama?: Llama): Promise<number> {
+    const llamaAPI = llama ? llama : (await import('@/core')).LLM_MANAGER.llama
 
-    if (LLM_MANAGER.llama) {
-      const vramState = await LLM_MANAGER.llama.getVramState()
+    if (llamaAPI) {
+      const vramState = await llamaAPI.getVramState()
 
       return Number((vramState.used / (1_024 * 1_024 * 1_024)).toFixed(2))
     }
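getVramState() reports byte counts, so these helpers divide by 1,024³ (1,073,741,824 bytes per GiB) and round to two decimals. A tiny worked example with assumed byte values:

// 8 GiB of VRAM expressed in bytes: 8 * 1_024 ** 3
const usedBytes = 8_589_934_592
console.log(Number((usedBytes / (1_024 * 1_024 * 1_024)).toFixed(2))) // 8

// A non-round reading, roughly matching the 6.04 example in the docstring above:
const partialBytes = 6_485_000_000
console.log(Number((partialBytes / (1_024 * 1_024 * 1_024)).toFixed(2))) // 6.04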
@@ -254,11 +258,11 @@ export class SystemHelper {
    * Get the total amount of VRAM (in GB) on the machine
    * @example getTotalVRAM() // 12
    */
-  public static async getTotalVRAM(): Promise<number> {
-    const { LLM_MANAGER } = await import('@/core')
+  public static async getTotalVRAM(llama?: Llama): Promise<number> {
+    const llamaAPI = llama ? llama : (await import('@/core')).LLM_MANAGER.llama
 
-    if (LLM_MANAGER.llama) {
-      const vramState = await LLM_MANAGER.llama.getVramState()
+    if (llamaAPI) {
+      const vramState = await llamaAPI.getVramState()
 
       return Number((vramState.total / (1_024 * 1_024 * 1_024)).toFixed(2))
     }
@@ -270,11 +274,11 @@ export class SystemHelper {
    * Get the amount of free VRAM (in GB) on the machine
    * @example getFreeVRAM() // 6
    */
-  public static async getFreeVRAM(): Promise<number> {
-    const { LLM_MANAGER } = await import('@/core')
+  public static async getFreeVRAM(llama?: Llama): Promise<number> {
+    const llamaAPI = llama ? llama : (await import('@/core')).LLM_MANAGER.llama
 
-    if (LLM_MANAGER.llama) {
-      const vramState = await LLM_MANAGER.llama.getVramState()
+    if (llamaAPI) {
+      const vramState = await llamaAPI.getVramState()
 
       return Number((vramState.free / (1_024 * 1_024 * 1_024)).toFixed(2))
     }