Mirror of https://github.com/leon-ai/leon.git (synced 2024-10-03 20:57:55 +03:00)
feat: use VRAM as LLM unit requirements

Parent: 6867e9c6db
Commit: 0c775ba2e5
@@ -36,6 +36,11 @@ document.addEventListener('DOMContentLoaded', async () => {
   const infoKeys = [
     'timeZone',
     'telemetry',
+    'gpu',
+    'graphicsComputeAPI',
+    'totalVRAM',
+    'freeVRAM',
+    'usedVRAM',
     'llm',
     'shouldWarmUpLLMDuties',
     'isLLMActionRecognitionEnabled',
@@ -7,7 +7,7 @@ import { command } from 'execa'
 import {
   LLM_NAME,
   LLM_NAME_WITH_VERSION,
-  LLM_MINIMUM_TOTAL_RAM,
+  LLM_MINIMUM_TOTAL_VRAM,
   LLM_DIR_PATH,
   LLM_PATH,
   LLM_VERSION,
@@ -31,8 +31,27 @@ import { FileHelper } from '@/helpers/file-helper'
 const LLM_MANIFEST_PATH = path.join(LLM_DIR_PATH, 'manifest.json')
 let manifest = null
 
-function checkMinimumHardwareRequirements() {
-  return SystemHelper.getTotalRAM() >= LLM_MINIMUM_TOTAL_RAM
+async function checkMinimumHardwareRequirements() {
+  const { getLlama, LlamaLogLevel } = await Function(
+    'return import("node-llama-cpp")'
+  )()
+  const llama = await getLlama({
+    logLevel: LlamaLogLevel.disabled
+  })
+
+  if (!(await SystemHelper.hasGPU(llama))) {
+    return false
+  }
+
+  LogHelper.info(
+    `GPU detected: ${(await SystemHelper.getGPUDeviceNames(llama))[0]}`
+  )
+  LogHelper.info(
+    `Graphics compute API: ${await SystemHelper.getGraphicsComputeAPI(llama)}`
+  )
+  LogHelper.info(`Total VRAM: ${await SystemHelper.getTotalVRAM(llama)} GB`)
+
+  return (await SystemHelper.getTotalVRAM(llama)) >= LLM_MINIMUM_TOTAL_VRAM
 }
 
 async function downloadLLM() {
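The `await Function('return import("node-llama-cpp")')()` call stands in for a plain dynamic `import()`. A likely reason, not stated in this commit, is that node-llama-cpp v3 is ESM-only: in a CommonJS build the transpiler rewrites `import()` into `require()`, which cannot load an ESM-only package, while building the import expression through the `Function` constructor hides it from the transpiler so a real dynamic import runs at runtime. A minimal sketch of the pattern; `importESM` and `loadLlama` are hypothetical helper names, not part of the Leon codebase:

// Hypothetical sketch: keep a dynamic import() out of reach of CommonJS transpilation.
const importESM = <T = unknown>(specifier: string): Promise<T> =>
  Function('specifier', 'return import(specifier)')(specifier) as Promise<T>

async function loadLlama() {
  // Same node-llama-cpp entry points as the setup script above uses.
  const { getLlama, LlamaLogLevel } = await importESM<
    typeof import('node-llama-cpp')
  >('node-llama-cpp')

  return getLlama({ logLevel: LlamaLogLevel.disabled })
}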
@@ -162,13 +181,19 @@ async function downloadAndCompileLlamaCPP() {
 }
 
 export default async () => {
-  const canSetupLLM = checkMinimumHardwareRequirements()
+  const canSetupLLM = await checkMinimumHardwareRequirements()
 
   if (!canSetupLLM) {
-    const totalRAM = SystemHelper.getTotalRAM()
+    const { getLlama, LlamaLogLevel } = await Function(
+      'return import("node-llama-cpp")'
+    )()
+    const llama = await getLlama({
+      logLevel: LlamaLogLevel.disabled
+    })
+    const totalVRAM = await SystemHelper.getTotalVRAM(llama)
 
     LogHelper.warning(
-      `LLM requires at least ${LLM_MINIMUM_TOTAL_RAM} of total RAM. Current total RAM is ${totalRAM} GB. No worries though, Leon can still run without LLM.`
+      `LLM requires at least ${LLM_MINIMUM_TOTAL_VRAM} GB of total VRAM. Current total VRAM is ${totalVRAM} GB. No worries though, Leon can still run without LLM.`
     )
   } else {
     await downloadLLM()
@@ -283,8 +283,8 @@ export const LLM_FILE_NAME = `Lexi-Llama-${LLM_VERSION}.gguf`
 export const LLM_NAME_WITH_VERSION = `${LLM_NAME} (${LLM_VERSION})`
 export const LLM_DIR_PATH = path.join(MODELS_PATH, 'llm')
 export const LLM_PATH = path.join(LLM_DIR_PATH, LLM_FILE_NAME)
-export const LLM_MINIMUM_TOTAL_RAM = 8
-export const LLM_MINIMUM_FREE_RAM = 8
+export const LLM_MINIMUM_TOTAL_VRAM = 8
+export const LLM_MINIMUM_FREE_VRAM = 8
 /*export const LLM_HF_DOWNLOAD_URL = NetworkHelper.setHuggingFaceURL(
   'https://huggingface.co/QuantFactory/Meta-Llama-3-8B-Instruct-GGUF/resolve/main/Meta-Llama-3-8B-Instruct.Q5_K_S.gguf?download=true'
 )*/
@@ -14,6 +14,7 @@ import {
 import { LLM_MANAGER, PERSONA } from '@/core'
 import { LogHelper } from '@/helpers/log-helper'
 import { DateHelper } from '@/helpers/date-helper'
+import { SystemHelper } from '@/helpers/system-helper'
 
 export const getInfo: FastifyPluginAsync<APIOptions> = async (
   fastify,
@@ -27,6 +28,20 @@ export const getInfo: FastifyPluginAsync<APIOptions> = async (
       const message = 'Information pulled.'
       LogHelper.success(message)
 
+      const [
+        gpuDeviceNames,
+        graphicsComputeAPI,
+        totalVRAM,
+        freeVRAM,
+        usedVRAM
+      ] = await Promise.all([
+        SystemHelper.getGPUDeviceNames(),
+        SystemHelper.getGraphicsComputeAPI(),
+        SystemHelper.getTotalVRAM(),
+        SystemHelper.getFreeVRAM(),
+        SystemHelper.getUsedVRAM()
+      ])
+
       reply.send({
         success: true,
         status: 200,
@@ -39,6 +54,11 @@ export const getInfo: FastifyPluginAsync<APIOptions> = async (
             LLM_MANAGER.isLLMActionRecognitionEnabled,
           isLLMNLGEnabled: LLM_MANAGER.isLLMNLGEnabled,
           timeZone: DateHelper.getTimeZone(),
+          gpu: gpuDeviceNames[0],
+          graphicsComputeAPI,
+          totalVRAM,
+          freeVRAM,
+          usedVRAM,
           llm: {
             enabled: LLM_MANAGER.isLLMEnabled,
             provider: LLM_PROVIDER
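The five GPU and VRAM reads added to this handler are independent of one another, so they are started together and awaited once rather than one after another. A minimal sketch of the difference, reusing the repository's SystemHelper (import path as in the diff); the timings are only illustrative:

import { SystemHelper } from '@/helpers/system-helper'

// Sequential awaits: total latency is roughly the sum of both calls.
const gpuNames = await SystemHelper.getGPUDeviceNames()
const computeAPI = await SystemHelper.getGraphicsComputeAPI()

// Parallel, as in the handler above: total latency is roughly the slowest call.
const [names, api] = await Promise.all([
  SystemHelper.getGPUDeviceNames(),
  SystemHelper.getGraphicsComputeAPI()
])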
@@ -19,6 +19,7 @@ import { llmInferencePlugin } from '@/core/http-server/api/llm-inference'
 import { keyMidd } from '@/core/http-server/plugins/key'
 import { utterancePlugin } from '@/core/http-server/api/utterance'
 import { LLM_MANAGER, PERSONA } from '@/core'
+import { SystemHelper } from '@/helpers/system-helper'
 
 const API_VERSION = 'v1'
 
@@ -65,6 +66,12 @@ export default class HTTPServer {
 
     LogHelper.info(`Mood: ${PERSONA.mood.type}`)
 
+    LogHelper.info(`GPU: ${(await SystemHelper.getGPUDeviceNames())[0]}`)
+    LogHelper.info(
+      `Graphics compute API: ${await SystemHelper.getGraphicsComputeAPI()}`
+    )
+    LogHelper.info(`Total VRAM: ${await SystemHelper.getTotalVRAM()} GB`)
+
     const isLLMEnabled = LLM_MANAGER.isLLMEnabled ? 'enabled' : 'disabled'
     LogHelper.info(`LLM: ${isLLMEnabled}`)
 
@@ -12,8 +12,8 @@ import {
   HAS_LLM,
   HAS_LLM_ACTION_RECOGNITION,
   HAS_LLM_NLG,
-  LLM_MINIMUM_FREE_RAM,
-  LLM_MINIMUM_TOTAL_RAM,
+  LLM_MINIMUM_FREE_VRAM,
+  LLM_MINIMUM_TOTAL_VRAM,
   LLM_NAME_WITH_VERSION,
   LLM_PATH,
   LLM_PROVIDER,
@@ -180,11 +180,13 @@ export default class LLMManager {
     }
 
     if (LLM_PROVIDER === LLMProviders.Local) {
-      const freeRAMInGB = SystemHelper.getFreeRAM()
-      const totalRAMInGB = SystemHelper.getTotalRAM()
+      const [freeVRAMInGB, totalVRAMInGB] = await Promise.all([
+        SystemHelper.getFreeVRAM(),
+        SystemHelper.getTotalVRAM()
+      ])
       const isLLMPathFound = fs.existsSync(LLM_PATH)
-      const isCurrentFreeRAMEnough = LLM_MINIMUM_FREE_RAM <= freeRAMInGB * 4 // Multiply by 4 to boost probability of success
-      const isTotalRAMEnough = LLM_MINIMUM_TOTAL_RAM <= totalRAMInGB
+      const isCurrentFreeRAMEnough = LLM_MINIMUM_FREE_VRAM <= freeVRAMInGB
+      const isTotalRAMEnough = LLM_MINIMUM_TOTAL_VRAM <= totalVRAMInGB
 
       /**
        * In case the LLM is not set up and
@@ -24,14 +24,14 @@ export class LogHelper {
    * This one looks obvious :)
    */
   public static info(value: string): void {
-    console.info('\x1b[36mℹ️ %s\x1b[0m', value)
+    console.info('\x1b[36mℹ️ %s\x1b[0m', value)
   }
 
   /**
    * This one looks obvious :)
    */
   public static warning(value: string): void {
-    console.warn('\x1b[33m⚠️ %s\x1b[0m', value)
+    console.warn('\x1b[33m⚠️ %s\x1b[0m', value)
   }
 
   /**
@@ -1,5 +1,7 @@
 import os from 'node:os'
 
+import type { Llama } from 'node-llama-cpp'
+
 import { OSTypes, CPUArchitectures } from '@/types'
 
 enum OSNames {
@@ -196,11 +198,11 @@ export class SystemHelper {
    * Get the names of the GPU devices on the machine
    * @example getGPUDeviceNames() // ['Apple M1 Pro']
    */
-  public static async getGPUDeviceNames(): Promise<string[]> {
-    const { LLM_MANAGER } = await import('@/core')
+  public static async getGPUDeviceNames(llama?: Llama): Promise<string[]> {
+    const llamaAPI = llama ? llama : (await import('@/core')).LLM_MANAGER.llama
 
-    if (LLM_MANAGER.llama) {
-      return LLM_MANAGER.llama.getGpuDeviceNames()
+    if (llamaAPI) {
+      return llamaAPI.getGpuDeviceNames()
     }
 
     return []
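The new optional `llama?: Llama` parameter lets the setup script probe the hardware with its own Llama instance before the core LLM manager exists, while existing callers can keep omitting the argument and fall back to `LLM_MANAGER.llama` through the lazy `@/core` import. A brief sketch of both call styles; the surrounding setup code is assumed for illustration, not taken from this commit:

import { getLlama, LlamaLogLevel, type Llama } from 'node-llama-cpp'

import { SystemHelper } from '@/helpers/system-helper'

// Setup time: the LLM manager is not constructed yet, so pass an explicit instance.
const llama: Llama = await getLlama({ logLevel: LlamaLogLevel.disabled })
const setupGPUNames = await SystemHelper.getGPUDeviceNames(llama)

// Runtime: omit the argument; the helper lazily imports '@/core'
// and uses LLM_MANAGER.llama instead.
const runtimeGPUNames = await SystemHelper.getGPUDeviceNames()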
@@ -210,11 +212,11 @@ export class SystemHelper {
    * Check if the machine has a GPU
    * @example hasGPU() // true
    */
-  public static async hasGPU(): Promise<boolean> {
-    const { LLM_MANAGER } = await import('@/core')
+  public static async hasGPU(llama?: Llama): Promise<boolean> {
+    const llamaAPI = llama ? llama : (await import('@/core')).LLM_MANAGER.llama
 
-    if (LLM_MANAGER.llama) {
-      return !!LLM_MANAGER.llama.gpu
+    if (llamaAPI) {
+      return !!llamaAPI.gpu
     }
 
     return false
@@ -224,11 +226,13 @@ export class SystemHelper {
    * Get the graphics compute API used by the machine
    * @example getGraphicsComputeAPI() // 'cuda'
    */
-  public static async getGraphicsComputeAPI(): Promise<GraphicsComputeAPIs> {
-    const { LLM_MANAGER } = await import('@/core')
+  public static async getGraphicsComputeAPI(
+    llama?: Llama
+  ): Promise<GraphicsComputeAPIs> {
+    const llamaAPI = llama ? llama : (await import('@/core')).LLM_MANAGER.llama
 
-    if (LLM_MANAGER.llama && LLM_MANAGER.llama.gpu) {
-      return LLM_MANAGER.llama.gpu as GraphicsComputeAPIs
+    if (llamaAPI && llamaAPI.gpu) {
+      return llamaAPI.gpu as GraphicsComputeAPIs
     }
 
     return GraphicsComputeAPIs.CPU
@@ -238,11 +242,11 @@ export class SystemHelper {
    * Get the amount of used VRAM (in GB) on the machine
    * @example getUsedVRAM() // 6.04
    */
-  public static async getUsedVRAM(): Promise<number> {
-    const { LLM_MANAGER } = await import('@/core')
+  public static async getUsedVRAM(llama?: Llama): Promise<number> {
+    const llamaAPI = llama ? llama : (await import('@/core')).LLM_MANAGER.llama
 
-    if (LLM_MANAGER.llama) {
-      const vramState = await LLM_MANAGER.llama.getVramState()
+    if (llamaAPI) {
+      const vramState = await llamaAPI.getVramState()
 
       return Number((vramState.used / (1_024 * 1_024 * 1_024)).toFixed(2))
     }
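getVramState() reports byte counts, so these helpers divide by 1,024³ (1,073,741,824 bytes per GiB) and round to two decimals. A tiny worked example with assumed byte values:

// 8 GiB of VRAM expressed in bytes: 8 * 1_024 ** 3
const usedBytes = 8_589_934_592
console.log(Number((usedBytes / (1_024 * 1_024 * 1_024)).toFixed(2))) // 8

// A non-round reading, roughly matching the 6.04 example in the docstring above:
const partialBytes = 6_485_000_000
console.log(Number((partialBytes / (1_024 * 1_024 * 1_024)).toFixed(2))) // 6.04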
@@ -254,11 +258,11 @@ export class SystemHelper {
    * Get the total amount of VRAM (in GB) on the machine
    * @example getTotalVRAM() // 12
    */
-  public static async getTotalVRAM(): Promise<number> {
-    const { LLM_MANAGER } = await import('@/core')
+  public static async getTotalVRAM(llama?: Llama): Promise<number> {
+    const llamaAPI = llama ? llama : (await import('@/core')).LLM_MANAGER.llama
 
-    if (LLM_MANAGER.llama) {
-      const vramState = await LLM_MANAGER.llama.getVramState()
+    if (llamaAPI) {
+      const vramState = await llamaAPI.getVramState()
 
       return Number((vramState.total / (1_024 * 1_024 * 1_024)).toFixed(2))
     }
@@ -270,11 +274,11 @@ export class SystemHelper {
    * Get the amount of free VRAM (in GB) on the machine
    * @example getFreeVRAM() // 6
    */
-  public static async getFreeVRAM(): Promise<number> {
-    const { LLM_MANAGER } = await import('@/core')
+  public static async getFreeVRAM(llama?: Llama): Promise<number> {
+    const llamaAPI = llama ? llama : (await import('@/core')).LLM_MANAGER.llama
 
-    if (LLM_MANAGER.llama) {
-      const vramState = await LLM_MANAGER.llama.getVramState()
+    if (llamaAPI) {
+      const vramState = await llamaAPI.getVramState()
 
       return Number((vramState.free / (1_024 * 1_024 * 1_024)).toFixed(2))
     }