typescript bindings maintenance (#2363)

* remove outdated comments

Signed-off-by: limez <limez@protonmail.com>

* simpler build from source

Signed-off-by: limez <limez@protonmail.com>

* update unix build script to create .so runtimes correctly

Signed-off-by: limez <limez@protonmail.com>

* configure ci build type, use RelWithDebInfo for dev build script

Signed-off-by: limez <limez@protonmail.com>

* add clean script

Signed-off-by: limez <limez@protonmail.com>

* fix streamed token decoding / emoji

Signed-off-by: limez <limez@protonmail.com>

* remove deprecated nCtx

Signed-off-by: limez <limez@protonmail.com>

* update typings

Signed-off-by: jacob <jacoobes@sern.dev>

update typings

Signed-off-by: jacob <jacoobes@sern.dev>

* readme, misspellings

Signed-off-by: jacob <jacoobes@sern.dev>

* cuda/backend logic changes + name napi methods like their js counterparts

Signed-off-by: limez <limez@protonmail.com>

* convert llmodel example into a test, separate test suite that can run in ci

Signed-off-by: limez <limez@protonmail.com>

* update examples / naming

Signed-off-by: limez <limez@protonmail.com>

* update deps, remove the need for binding.ci.gyp, make node-gyp-build fallback easier testable

Signed-off-by: limez <limez@protonmail.com>

* make sure the assert-backend-sources.js script is published, but not the others

Signed-off-by: limez <limez@protonmail.com>

* build correctly on windows (regression on node-gyp-build)

Signed-off-by: Jacob Nguyen <76754747+jacoobes@users.noreply.github.com>

* codespell

Signed-off-by: limez <limez@protonmail.com>

* make sure dlhandle.cpp gets linked correctly

Signed-off-by: limez <limez@protonmail.com>

* add include for check_cxx_compiler_flag call during aarch64 builds

Signed-off-by: limez <limez@protonmail.com>

* x86 > arm64 cross compilation of runtimes and bindings

Signed-off-by: limez <limez@protonmail.com>

* default to cpu instead of kompute on arm64

Signed-off-by: limez <limez@protonmail.com>

* formatting, more minimal example

Signed-off-by: limez <limez@protonmail.com>

---------

Signed-off-by: limez <limez@protonmail.com>
Signed-off-by: jacob <jacoobes@sern.dev>
Signed-off-by: Jacob Nguyen <76754747+jacoobes@users.noreply.github.com>
Co-authored-by: Jacob Nguyen <76754747+jacoobes@users.noreply.github.com>
Co-authored-by: jacob <jacoobes@sern.dev>
Andreas Obersteiner 2024-06-03 18:12:55 +02:00 committed by GitHub
parent f001897a1a
commit a602f7fde7
30 changed files with 1112 additions and 873 deletions

View File

@ -570,7 +570,7 @@ jobs:
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update
sudo apt-get install -y cmake build-essential vulkan-sdk cuda-compiler-12-4 libcublas-dev-12-4 libnvidia-compute-550-server libmysqlclient21 libodbc2 libpq5
sudo apt-get install -y cmake build-essential g++-12-aarch64-linux-gnu gcc-12-aarch64-linux-gnu vulkan-sdk cuda-compiler-12-4 libcublas-dev-12-4 libnvidia-compute-550-server libmysqlclient21 libodbc2 libpq5
- run:
name: Build Libraries
command: |
@ -578,14 +578,19 @@ jobs:
cd gpt4all-backend
mkdir -p runtimes/build
cd runtimes/build
cmake ../..
cmake --build . --parallel --config Release
cmake ../.. -DCMAKE_BUILD_TYPE=Release
cmake --build . --parallel
mkdir ../linux-x64
cp -L *.so ../linux-x64 # otherwise persist_to_workspace seems to mess symlinks
cmake ../.. -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE="./toolchains/linux-arm64-toolchain.cmake"
cmake --build . --parallel
mkdir ../linux-arm64
cp -L *.so ../linux-arm64
- persist_to_workspace:
root: gpt4all-backend
paths:
- runtimes/linux-x64/*.so
- runtimes/linux-arm64/*.so
build-bindings-backend-macos:
macos:
@ -896,6 +901,11 @@ jobs:
- checkout
- attach_workspace:
at: /tmp/gpt4all-backend
- run:
name: Install dependencies
command: |
sudo apt-get update
sudo apt-get install -y g++-12-aarch64-linux-gnu gcc-12-aarch64-linux-gnu
- node/install:
install-yarn: true
node-version: "18.16"
@ -908,18 +918,24 @@ jobs:
- run:
command: |
cd gpt4all-bindings/typescript
yarn prebuildify -t 18.16.0 --napi
yarn build:prebuilds
- run:
command: |
mkdir -p gpt4all-backend/prebuilds/linux-x64
mkdir -p gpt4all-backend/runtimes/linux-x64
cp /tmp/gpt4all-backend/runtimes/linux-x64/*-*.so gpt4all-backend/runtimes/linux-x64
cp gpt4all-bindings/typescript/prebuilds/linux-x64/*.node gpt4all-backend/prebuilds/linux-x64
mkdir -p gpt4all-backend/prebuilds/linux-arm64
mkdir -p gpt4all-backend/runtimes/linux-arm64
cp /tmp/gpt4all-backend/runtimes/linux-arm64/*-*.so gpt4all-backend/runtimes/linux-arm64
cp gpt4all-bindings/typescript/prebuilds/linux-arm64/*.node gpt4all-backend/prebuilds/linux-arm64
- persist_to_workspace:
root: gpt4all-backend
paths:
- prebuilds/linux-x64/*.node
- runtimes/linux-x64/*-*.so
- prebuilds/linux-arm64/*.node
- runtimes/linux-arm64/*-*.so
build-nodejs-macos:
macos:
xcode: "14.0.0"
@ -1029,13 +1045,11 @@ jobs:
cp /tmp/gpt4all-backend/runtimes/darwin/*-*.* runtimes/darwin/native/
cp /tmp/gpt4all-backend/prebuilds/darwin-x64/*.node prebuilds/darwin-x64/
# Fallback build if user is not on above prebuilds
mv -f binding.ci.gyp binding.gyp
mkdir gpt4all-backend
# copy the backend source we depend on to make fallback builds work
mkdir backend
cd ../../gpt4all-backend
mv llmodel.h llmodel.cpp llmodel_c.cpp llmodel_c.h sysinfo.h dlhandle.h ../gpt4all-bindings/typescript/gpt4all-backend/
mv llmodel.h llmodel.cpp llmodel_c.cpp llmodel_c.h sysinfo.h dlhandle.h ../gpt4all-bindings/typescript/backend/
# Test install
- node/install-packages:
@ -1045,7 +1059,7 @@ jobs:
- run:
command: |
cd gpt4all-bindings/typescript
yarn run test
yarn run test:ci
- run:
command: |
cd gpt4all-bindings/typescript

View File

@ -79,6 +79,7 @@ if (LLMODEL_ROCM)
endif()
set(CMAKE_VERBOSE_MAKEFILE ON)
include(CheckCXXCompilerFlag)
# Go through each build variant
foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)

View File

@ -0,0 +1,11 @@
# Toolchain to crosscompile runtimes for arm64 on jammy x86_64
# You may have to `sudo apt-get install g++-12-aarch64-linux-gnu gcc-12-aarch64-linux-gnu`
set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_PROCESSOR aarch64)
set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc-12)
set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++-12)
# Supported backends
set(LLMODEL_CUDA off)
set(LLMODEL_KOMPUTE off)

View File

@ -8,4 +8,5 @@ prebuilds/
!.yarn/sdks
!.yarn/versions
runtimes/
backend/
compile_flags.txt

View File

@ -1,4 +1,5 @@
test/
spec/
scripts/
scripts/*
!scripts/assert-backend-sources.js
build

View File

@ -188,6 +188,8 @@ model.dispose();
* python 3
* On Windows and Linux, building GPT4All requires the complete Vulkan SDK. You may download it from here: https://vulkan.lunarg.com/sdk/home
* macOS users do not need Vulkan, as GPT4All will use Metal instead.
* CUDA Toolkit >= 11.4 (you can bypass this by adding a custom flag to the build step)
- Windows: Compiling with CUDA is difficult if the Visual Studio IDE is not present.
### Build (from source)
@ -196,23 +198,29 @@ git clone https://github.com/nomic-ai/gpt4all.git
cd gpt4all-bindings/typescript
```
* The below shell commands assume the current working directory is `typescript`.
* To Build and Rebuild:
```sh
node scripts/prebuild.js
```
* llama.cpp git submodule for gpt4all can be possibly absent. If this is the case, make sure to run in llama.cpp parent directory
The llama.cpp git submodule for gpt4all may be absent or outdated. Make sure to run
```sh
git submodule update --init --recursive
```
The below shell commands assume the current working directory is `typescript`.
Using yarn
```sh
yarn build:backend
yarn install
yarn build
```
This will build platform-dependent dynamic libraries, and will be located in runtimes/(platform)/native
Using npm
```sh
npm install
npm run build
```
The `build:runtimes` script will create runtime libraries for your platform in `runtimes` and `build:prebuilds` will create the bindings in `prebuilds`. `build` is a shortcut for both.
### Test
@ -259,7 +267,7 @@ yarn test
This package has been stabilizing over time, and breaking changes may happen until the API stabilizes. Here's the todo list:
* \[ ] Purely offline. Per the gui, which can be run completely offline, the bindings should be as well.
* \[x] [Purely offline](#Offline-usage). Per the gui, which can be run completely offline, the bindings should be as well.
* \[ ] NPM bundle size reduction via optionalDependencies strategy (need help)
* Should include prebuilds to avoid painful node-gyp errors
* \[x] createChatSession ( the python equivalent to create\_chat\_session )
@ -276,7 +284,7 @@ This package has been stabilizing over time development, and breaking changes ma
This repository serves as the new bindings for nodejs users.
- If you were a user of [these bindings](https://github.com/nomic-ai/gpt4all-ts), they are outdated.
- Version 4 includes the following breaking changes:
* `createEmbedding` & `EmbeddingModel.embed()` returns an object, `EmbeddingResult`, instead of a float32array.
* `createEmbedding` & `EmbeddingModel.embed()` returns an object, `EmbeddingResult`, instead of a Float32Array.
* Removed deprecated types `ModelType` and `ModelFile`
* Removed deprecated initiation of model by string path only
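For reference, a minimal sketch of the version 4 embedding result, assuming the `EmbeddingResult` shape (`n_prompt_tokens`, `embeddings`) set in the `index.cc` changes further down; the model file name and import path are placeholders:

```js
import { loadModel, createEmbedding } from "../src/gpt4all.js";

// placeholder embedding model; any embedding-capable gguf should work
const embedder = await loadModel("nomic-embed-text-v1.f16.gguf", {
    type: "embedding",
});

// single input: `embeddings` holds one vector
const single = createEmbedding(embedder, "Hello world");
console.log(single.n_prompt_tokens, single.embeddings.length);

// multiple inputs: `embeddings` holds one vector per input text
const batch = createEmbedding(embedder, ["first text", "second text"]);
console.log(batch.embeddings.length); // 2
```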

View File

@ -1,62 +0,0 @@
{
"targets": [
{
"target_name": "gpt4all", # gpt4all-ts will cause compile error
"include_dirs": [
"<!@(node -p \"require('node-addon-api').include\")",
"gpt4all-backend",
],
"sources": [
# PREVIOUS VERSION: had to required the sources, but with newest changes do not need to
#"../../gpt4all-backend/llama.cpp/examples/common.cpp",
#"../../gpt4all-backend/llama.cpp/ggml.c",
#"../../gpt4all-backend/llama.cpp/llama.cpp",
# "../../gpt4all-backend/utils.cpp",
"gpt4all-backend/llmodel_c.cpp",
"gpt4all-backend/llmodel.cpp",
"prompt.cc",
"index.cc",
],
"conditions": [
['OS=="mac"', {
'xcode_settings': {
'GCC_ENABLE_CPP_EXCEPTIONS': 'YES'
},
'defines': [
'LIB_FILE_EXT=".dylib"',
'NAPI_CPP_EXCEPTIONS',
],
'cflags_cc': [
"-fexceptions"
]
}],
['OS=="win"', {
'defines': [
'LIB_FILE_EXT=".dll"',
'NAPI_CPP_EXCEPTIONS',
],
"msvs_settings": {
"VCCLCompilerTool": {
"AdditionalOptions": [
"/std:c++20",
"/EHsc",
],
},
},
}],
['OS=="linux"', {
'defines': [
'LIB_FILE_EXT=".so"',
'NAPI_CPP_EXCEPTIONS',
],
'cflags_cc!': [
'-fno-rtti',
],
'cflags_cc': [
'-std=c++2a',
'-fexceptions'
]
}]
]
}]
}

View File

@ -1,19 +1,15 @@
{
"targets": [
{
"target_name": "gpt4all", # gpt4all-ts will cause compile error
"target_name": "gpt4all",
"include_dirs": [
"<!@(node -p \"require('node-addon-api').include\")",
"../../gpt4all-backend",
"backend",
],
"sources": [
# PREVIOUS VERSION: had to required the sources, but with newest changes do not need to
#"../../gpt4all-backend/llama.cpp/examples/common.cpp",
#"../../gpt4all-backend/llama.cpp/ggml.c",
#"../../gpt4all-backend/llama.cpp/llama.cpp",
# "../../gpt4all-backend/utils.cpp",
"../../gpt4all-backend/llmodel_c.cpp",
"../../gpt4all-backend/llmodel.cpp",
"backend/llmodel_c.cpp",
"backend/llmodel.cpp",
"backend/dlhandle.cpp",
"prompt.cc",
"index.cc",
],

View File

@ -3,23 +3,24 @@
Napi::Function NodeModelWrapper::GetClass(Napi::Env env)
{
Napi::Function self = DefineClass(env, "LLModel",
{InstanceMethod("type", &NodeModelWrapper::GetType),
InstanceMethod("isModelLoaded", &NodeModelWrapper::IsModelLoaded),
InstanceMethod("name", &NodeModelWrapper::GetName),
InstanceMethod("stateSize", &NodeModelWrapper::StateSize),
InstanceMethod("infer", &NodeModelWrapper::Infer),
InstanceMethod("setThreadCount", &NodeModelWrapper::SetThreadCount),
InstanceMethod("embed", &NodeModelWrapper::GenerateEmbedding),
InstanceMethod("threadCount", &NodeModelWrapper::ThreadCount),
InstanceMethod("getLibraryPath", &NodeModelWrapper::GetLibraryPath),
InstanceMethod("initGpuByString", &NodeModelWrapper::InitGpuByString),
InstanceMethod("hasGpuDevice", &NodeModelWrapper::HasGpuDevice),
InstanceMethod("listGpu", &NodeModelWrapper::GetGpuDevices),
InstanceMethod("memoryNeeded", &NodeModelWrapper::GetRequiredMemory),
InstanceMethod("dispose", &NodeModelWrapper::Dispose)});
Napi::Function self = DefineClass(
env, "LLModel",
{InstanceMethod("load", &NodeModelWrapper::Load),
InstanceMethod("initGpu", &NodeModelWrapper::InitGpu),
InstanceMethod("infer", &NodeModelWrapper::Infer),
InstanceMethod("embed", &NodeModelWrapper::Embed),
InstanceMethod("isModelLoaded", &NodeModelWrapper::IsModelLoaded),
InstanceMethod("getType", &NodeModelWrapper::GetType),
InstanceMethod("getName", &NodeModelWrapper::GetName),
InstanceMethod("getStateSize", &NodeModelWrapper::GetStateSize),
InstanceMethod("setThreadCount", &NodeModelWrapper::SetThreadCount),
InstanceMethod("getThreadCount", &NodeModelWrapper::GetThreadCount),
InstanceMethod("getLibraryPath", &NodeModelWrapper::GetLibraryPath),
InstanceMethod("hasGpuDevice", &NodeModelWrapper::HasGpuDevice),
InstanceMethod("getGpuDevices", &NodeModelWrapper::GetGpuDevices),
InstanceMethod("getRequiredMemory", &NodeModelWrapper::GetRequiredMemory),
InstanceMethod("dispose", &NodeModelWrapper::Dispose)});
// Keep a static reference to the constructor
//
Napi::FunctionReference *constructor = new Napi::FunctionReference();
*constructor = Napi::Persistent(self);
env.SetInstanceData(constructor);
@ -29,13 +30,13 @@ Napi::Value NodeModelWrapper::GetRequiredMemory(const Napi::CallbackInfo &info)
{
auto env = info.Env();
return Napi::Number::New(
env, static_cast<uint32_t>(llmodel_required_mem(GetInference(), full_model_path.c_str(), nCtx, nGpuLayers)));
env, static_cast<uint32_t>(llmodel_required_mem(GetInference(), model_file.c_str(), n_ctx, n_gpu_layers)));
}
Napi::Value NodeModelWrapper::GetGpuDevices(const Napi::CallbackInfo &info)
{
auto env = info.Env();
int num_devices = 0;
auto mem_size = llmodel_required_mem(GetInference(), full_model_path.c_str(), nCtx, nGpuLayers);
auto mem_size = llmodel_required_mem(GetInference(), model_file.c_str(), n_ctx, n_gpu_layers);
llmodel_gpu_device *all_devices = llmodel_available_gpu_devices(mem_size, &num_devices);
if (all_devices == nullptr)
{
@ -63,6 +64,7 @@ Napi::Value NodeModelWrapper::GetGpuDevices(const Napi::CallbackInfo &info)
js_gpu_device["heapSize"] = static_cast<uint32_t>(gpu_device.heapSize);
js_gpu_device["name"] = gpu_device.name;
js_gpu_device["vendor"] = gpu_device.vendor;
js_gpu_device["backend"] = gpu_device.backend;
js_array[i] = js_gpu_device;
}
@ -71,35 +73,13 @@ Napi::Value NodeModelWrapper::GetGpuDevices(const Napi::CallbackInfo &info)
Napi::Value NodeModelWrapper::GetType(const Napi::CallbackInfo &info)
{
if (type.empty())
if (model_type.empty())
{
return info.Env().Undefined();
}
return Napi::String::New(info.Env(), type);
return Napi::String::New(info.Env(), model_type);
}
Napi::Value NodeModelWrapper::InitGpuByString(const Napi::CallbackInfo &info)
{
auto env = info.Env();
size_t memory_required = static_cast<size_t>(info[0].As<Napi::Number>().Uint32Value());
std::string gpu_device_identifier = info[1].As<Napi::String>();
size_t converted_value;
if (memory_required <= std::numeric_limits<size_t>::max())
{
converted_value = static_cast<size_t>(memory_required);
}
else
{
Napi::Error::New(env, "invalid number for memory size. Exceeded bounds for memory.")
.ThrowAsJavaScriptException();
return env.Undefined();
}
auto result = llmodel_gpu_init_gpu_device_by_string(GetInference(), converted_value, gpu_device_identifier.c_str());
return Napi::Boolean::New(env, result);
}
Napi::Value NodeModelWrapper::HasGpuDevice(const Napi::CallbackInfo &info)
{
return Napi::Boolean::New(info.Env(), llmodel_has_gpu_device(GetInference()));
@ -110,82 +90,61 @@ NodeModelWrapper::NodeModelWrapper(const Napi::CallbackInfo &info) : Napi::Objec
auto env = info.Env();
auto config_object = info[0].As<Napi::Object>();
// sets the directory where models (gguf files) are to be searched
llmodel_set_implementation_search_path(
config_object.Has("library_path") ? config_object.Get("library_path").As<Napi::String>().Utf8Value().c_str()
: ".");
// sets the directories where runtime libs are to be searched
llmodel_set_implementation_search_path(config_object.Has("librariesPath")
? config_object.Get("librariesPath").As<Napi::String>().Utf8Value().c_str()
: ".");
std::string model_name = config_object.Get("model_name").As<Napi::String>();
fs::path model_path = config_object.Get("model_path").As<Napi::String>().Utf8Value();
std::string full_weight_path = (model_path / fs::path(model_name)).string();
model_file = config_object.Get("modelFile").As<Napi::String>().Utf8Value();
model_name = model_file.substr(model_file.find_last_of("/\\") + 1);
backend = config_object.Get("backend").As<Napi::String>().Utf8Value();
n_ctx = config_object.Get("nCtx").As<Napi::Number>().Int32Value();
n_gpu_layers = config_object.Get("nGpuLayers").As<Napi::Number>().Int32Value();
name = model_name.empty() ? model_path.filename().string() : model_name;
full_model_path = full_weight_path;
nCtx = config_object.Get("nCtx").As<Napi::Number>().Int32Value();
nGpuLayers = config_object.Get("ngl").As<Napi::Number>().Int32Value();
const char *e;
inference_ = llmodel_model_create2(full_weight_path.c_str(), "auto", &e);
const char *err;
inference_ = llmodel_model_create2(model_file.c_str(), backend.c_str(), &err);
if (!inference_)
{
Napi::Error::New(env, e).ThrowAsJavaScriptException();
Napi::Error::New(env, err).ThrowAsJavaScriptException();
return;
}
if (GetInference() == nullptr)
{
std::cerr << "Tried searching libraries in \"" << llmodel_get_implementation_search_path() << "\"" << std::endl;
std::cerr << "Tried searching for model weight in \"" << full_weight_path << "\"" << std::endl;
std::cerr << "Tried using model weights in \"" << model_file << "\"" << std::endl;
std::cerr << "Do you have runtime libraries installed?" << std::endl;
Napi::Error::New(env, "Had an issue creating llmodel object, inference is null").ThrowAsJavaScriptException();
return;
}
std::string device = config_object.Get("device").As<Napi::String>();
if (device != "cpu")
{
size_t mem = llmodel_required_mem(GetInference(), full_weight_path.c_str(), nCtx, nGpuLayers);
auto success = llmodel_gpu_init_gpu_device_by_string(GetInference(), mem, device.c_str());
if (!success)
{
// https://github.com/nomic-ai/gpt4all/blob/3acbef14b7c2436fe033cae9036e695d77461a16/gpt4all-bindings/python/gpt4all/pyllmodel.py#L215
// Haven't implemented this but it is still open to contribution
std::cout << "WARNING: Failed to init GPU\n";
}
}
auto success = llmodel_loadModel(GetInference(), full_weight_path.c_str(), nCtx, nGpuLayers);
if (!success)
{
Napi::Error::New(env, "Failed to load model at given path").ThrowAsJavaScriptException();
return;
}
// optional
if (config_object.Has("model_type"))
if (config_object.Has("modelType"))
{
type = config_object.Get("model_type").As<Napi::String>();
model_type = config_object.Get("modelType").As<Napi::String>();
}
};
// NodeModelWrapper::~NodeModelWrapper() {
// if(GetInference() != nullptr) {
// std::cout << "Debug: deleting model\n";
// llmodel_model_destroy(inference_);
// std::cout << (inference_ == nullptr);
// }
// }
// void NodeModelWrapper::Finalize(Napi::Env env) {
// if(inference_ != nullptr) {
// std::cout << "Debug: deleting model\n";
//
// }
// }
Napi::Value NodeModelWrapper::Load(const Napi::CallbackInfo &info)
{
auto env = info.Env();
auto success = llmodel_loadModel(GetInference(), model_file.c_str(), n_ctx, n_gpu_layers);
return Napi::Boolean::New(env, success);
}
Napi::Value NodeModelWrapper::InitGpu(const Napi::CallbackInfo &info)
{
auto env = info.Env();
auto device = info[0].As<Napi::String>().Utf8Value();
size_t mem_required = llmodel_required_mem(GetInference(), model_file.c_str(), n_ctx, n_gpu_layers);
auto success = llmodel_gpu_init_gpu_device_by_string(GetInference(), mem_required, device.c_str());
return Napi::Boolean::New(env, success);
}
Napi::Value NodeModelWrapper::IsModelLoaded(const Napi::CallbackInfo &info)
{
return Napi::Boolean::New(info.Env(), llmodel_isModelLoaded(GetInference()));
}
Napi::Value NodeModelWrapper::StateSize(const Napi::CallbackInfo &info)
Napi::Value NodeModelWrapper::GetStateSize(const Napi::CallbackInfo &info)
{
// Implement the binding for the stateSize method
return Napi::Number::New(info.Env(), static_cast<int64_t>(llmodel_get_state_size(GetInference())));
@ -220,7 +179,7 @@ Napi::Array ChunkedFloatPtr(float *embedding_ptr, int embedding_size, int text_l
return result;
}
Napi::Value NodeModelWrapper::GenerateEmbedding(const Napi::CallbackInfo &info)
Napi::Value NodeModelWrapper::Embed(const Napi::CallbackInfo &info)
{
auto env = info.Env();
@ -256,7 +215,7 @@ Napi::Value NodeModelWrapper::GenerateEmbedding(const Napi::CallbackInfo &info)
str_ptrs.push_back(text_arr[i].c_str());
str_ptrs.push_back(nullptr);
const char *_err = nullptr;
float *embeds = llmodel_embed(GetInference(), str_ptrs.data(), &embedding_size,
float *embeds = llmodel_embed(GetInference(), str_ptrs.data(), &embedding_size,
prefix.IsUndefined() ? nullptr : prefix.As<Napi::String>().Utf8Value().c_str(),
dimensionality, &token_count, do_mean, atlas, nullptr, &_err);
if (!embeds)
@ -271,9 +230,12 @@ Napi::Value NodeModelWrapper::GenerateEmbedding(const Napi::CallbackInfo &info)
llmodel_free_embedding(embeds);
auto res = Napi::Object::New(env);
res.Set("n_prompt_tokens", token_count);
if(is_single_text) {
if (is_single_text)
{
res.Set("embeddings", embedmat.Get(static_cast<uint32_t>(0)));
} else {
}
else
{
res.Set("embeddings", embedmat);
}
@ -308,7 +270,7 @@ Napi::Value NodeModelWrapper::Infer(const Napi::CallbackInfo &info)
llmodel_prompt_context promptContext = {.logits = nullptr,
.tokens = nullptr,
.n_past = 0,
.n_ctx = nCtx,
.n_ctx = n_ctx,
.n_predict = 4096,
.top_k = 40,
.top_p = 0.9f,
@ -323,6 +285,12 @@ Napi::Value NodeModelWrapper::Infer(const Napi::CallbackInfo &info)
auto inputObject = info[1].As<Napi::Object>();
if (!inputObject.Has("promptTemplate"))
{
Napi::Error::New(info.Env(), "Missing Prompt Template").ThrowAsJavaScriptException();
return info.Env().Undefined();
}
if (inputObject.Has("logits") || inputObject.Has("tokens"))
{
Napi::Error::New(info.Env(), "Invalid input: 'logits' or 'tokens' properties are not allowed")
@ -425,9 +393,9 @@ void NodeModelWrapper::SetThreadCount(const Napi::CallbackInfo &info)
Napi::Value NodeModelWrapper::GetName(const Napi::CallbackInfo &info)
{
return Napi::String::New(info.Env(), name);
return Napi::String::New(info.Env(), model_name);
}
Napi::Value NodeModelWrapper::ThreadCount(const Napi::CallbackInfo &info)
Napi::Value NodeModelWrapper::GetThreadCount(const Napi::CallbackInfo &info)
{
return Napi::Number::New(info.Env(), llmodel_threadCount(GetInference()));
}

View File

@ -16,30 +16,28 @@ class NodeModelWrapper : public Napi::ObjectWrap<NodeModelWrapper>
public:
NodeModelWrapper(const Napi::CallbackInfo &);
// virtual ~NodeModelWrapper();
Napi::Value GetType(const Napi::CallbackInfo &info);
Napi::Value IsModelLoaded(const Napi::CallbackInfo &info);
Napi::Value StateSize(const Napi::CallbackInfo &info);
// void Finalize(Napi::Env env) override;
Napi::Value Load(const Napi::CallbackInfo &info);
Napi::Value InitGpu(const Napi::CallbackInfo &info);
/**
* Prompting the model. This entails spawning a new thread and adding the response tokens
* into a thread local string variable.
*/
Napi::Value Infer(const Napi::CallbackInfo &info);
void SetThreadCount(const Napi::CallbackInfo &info);
void Dispose(const Napi::CallbackInfo &info);
Napi::Value Embed(const Napi::CallbackInfo &info);
Napi::Value IsModelLoaded(const Napi::CallbackInfo &info);
Napi::Value GetType(const Napi::CallbackInfo &info);
Napi::Value GetName(const Napi::CallbackInfo &info);
Napi::Value ThreadCount(const Napi::CallbackInfo &info);
Napi::Value GenerateEmbedding(const Napi::CallbackInfo &info);
Napi::Value HasGpuDevice(const Napi::CallbackInfo &info);
Napi::Value ListGpus(const Napi::CallbackInfo &info);
Napi::Value InitGpuByString(const Napi::CallbackInfo &info);
Napi::Value GetRequiredMemory(const Napi::CallbackInfo &info);
Napi::Value GetGpuDevices(const Napi::CallbackInfo &info);
Napi::Value GetStateSize(const Napi::CallbackInfo &info);
void SetThreadCount(const Napi::CallbackInfo &info);
Napi::Value GetThreadCount(const Napi::CallbackInfo &info);
/*
* The path that is used to search for the dynamic libraries
*/
Napi::Value GetLibraryPath(const Napi::CallbackInfo &info);
Napi::Value HasGpuDevice(const Napi::CallbackInfo &info);
Napi::Value GetGpuDevices(const Napi::CallbackInfo &info);
Napi::Value GetRequiredMemory(const Napi::CallbackInfo &info);
void Dispose(const Napi::CallbackInfo &info);
/**
* Creates the LLModel class
*/
@ -54,10 +52,10 @@ class NodeModelWrapper : public Napi::ObjectWrap<NodeModelWrapper>
std::mutex inference_mutex;
std::string type;
// corresponds to LLModel::name() in typescript
std::string name;
int nCtx{};
int nGpuLayers{};
std::string full_model_path;
std::string model_type;
std::string model_name;
std::string model_file;
std::string backend;
int n_ctx{};
int n_gpu_layers{};
};

View File

@ -5,32 +5,38 @@
"main": "src/gpt4all.js",
"repository": "nomic-ai/gpt4all",
"scripts": {
"install": "node-gyp-build",
"install": "node ./scripts/assert-backend-sources.js && node-gyp-build",
"test:ci": "jest test/ci.test.js",
"test": "jest",
"build:backend": "node scripts/build.js",
"build": "node-gyp-build",
"clean": "rimraf build runtimes prebuilds backend",
"prebuild": "npm run clean",
"build": "npm run build:runtimes && npm run build:prebuilds",
"build:runtimes": "node scripts/build.js",
"build:prebuilds": "node scripts/assert-backend-sources.js && node scripts/prebuild.js",
"docs:build": "node scripts/docs.js && documentation readme ./src/gpt4all.d.ts --parse-extension js d.ts --format md --section \"API Reference\" --readme-file ../python/docs/gpt4all_nodejs.md"
},
"files": [
"binding.gyp",
"src/**/*",
"runtimes/**/*",
"binding.gyp",
"prebuilds/**/*",
"backend/**/*",
"scripts/assert-backend-sources.js",
"*.h",
"*.cc",
"gpt4all-backend/**/*"
"*.cc"
],
"dependencies": {
"md5-file": "^5.0.0",
"node-addon-api": "^6.1.0",
"node-gyp-build": "^4.6.0"
"node-addon-api": "^8.0.0",
"node-gyp-build": "~4.8.0"
},
"devDependencies": {
"@types/node": "^20.1.5",
"@types/node": "^20.12.12",
"documentation": "^14.0.2",
"jest": "^29.5.0",
"prebuildify": "^5.0.1",
"prettier": "^2.8.8"
"jest": "^29.7.0",
"prebuildify": "^6.0.1",
"prettier": "^3.2.5",
"rimraf": "^5.0.7"
},
"optionalDependencies": {
"node-gyp": "9.x.x"

View File

@ -131,7 +131,8 @@ bool PromptWorker::ResponseCallback(int32_t token_id, const std::string token)
// Transform native data into JS data, passing it to the provided
// `jsCallback` -- the TSFN's JavaScript function.
auto token_id = Napi::Number::New(env, value->tokenId);
auto token = Napi::String::New(env, value->token);
auto token = Napi::Uint8Array::New(env, value->token.size());
memcpy(token.Data(), value->token.data(), value->token.size());
auto jsResult = jsCallback.Call({token_id, token}).ToBoolean();
promise.set_value(jsResult);
}
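Since the native response callback now passes raw token bytes instead of a decoded string, low-level consumers of `llm.infer` have to decode the bytes themselves. A minimal sketch (model name is a placeholder; higher-level code should rely on the `onResponseTokens` / `TokenDecoder` path shown later):

```js
import { loadModel } from "../src/gpt4all.js";

const model = await loadModel("Phi-3-mini-4k-instruct.Q4_0.gguf", { device: "cpu" });
const decoder = new TextDecoder();

const result = await model.llm.infer("What is 1 + 1?", {
    promptTemplate: model.config.promptTemplate,
    nPredict: 16,
    // the callback now receives a Uint8Array of token bytes, not a string
    onResponseToken: (tokenId, bytes) => {
        process.stdout.write(decoder.decode(bytes, { stream: true })); // streaming-safe decode
        return true; // returning false stops generation
    },
});
console.log("\n" + result.text);
model.dispose();
```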

View File

@ -0,0 +1,47 @@
const fs = require("fs");
const path = require("path");
// Copies the shared llmodel sources from gpt4all-backend into the backend folder.
// These are dependencies of the bindings and will be required in case node-gyp-build
// cannot find a prebuild. This script is used in the package install hook and will
// be executed BOTH when `yarn install` is run in the root folder AND when the package
// is installed as a dependency in another project.
const backendDeps = [
"llmodel.h",
"llmodel.cpp",
"llmodel_c.cpp",
"llmodel_c.h",
"sysinfo.h",
"dlhandle.h",
"dlhandle.cpp",
];
const sourcePath = path.resolve(__dirname, "../../../gpt4all-backend");
const destPath = path.resolve(__dirname, "../backend");
// Silently ignore if the backend sources are not available.
// When the package is installed as a dependency, gpt4all-backend will not be present.
if (fs.existsSync(sourcePath)) {
if (!fs.existsSync(destPath)) {
fs.mkdirSync(destPath);
}
for (const file of backendDeps) {
const sourceFile = path.join(sourcePath, file);
const destFile = path.join(destPath, file);
if (fs.existsSync(sourceFile)) {
console.info(`Copying ${sourceFile} to ${destFile}`);
fs.copyFileSync(sourceFile, destFile); // overwrite
} else {
throw new Error(`File ${sourceFile} does not exist`);
}
}
}
// assert that the backend sources are present
for (const file of backendDeps) {
const destFile = path.join(destPath, file);
if (!fs.existsSync(destFile)) {
throw new Error(`File ${destFile} does not exist`);
}
}

View File

@ -1,12 +1,42 @@
#!/bin/sh
# Build script for Unix-like systems (Linux, macOS).
# Script assumes the current working directory is the bindings project root.
SYSNAME=$(uname -s)
PLATFORM=$(uname -m)
# Allows overriding target sysname and platform via args
# If not provided, the current system's sysname and platform will be used
while [ $# -gt 0 ]; do
case "$1" in
--sysname=*)
SYSNAME="${1#*=}"
shift
;;
--platform=*)
PLATFORM="${1#*=}"
shift
;;
*)
echo "Unknown argument: $1" >&2
exit 1
;;
esac
done
if [ "$SYSNAME" = "Linux" ]; then
BASE_DIR="runtimes/linux-x64"
if [ "$PLATFORM" = "x86_64" ]; then
BASE_DIR="runtimes/linux-x64"
elif [ "$PLATFORM" = "aarch64" ]; then
BASE_DIR="runtimes/linux-arm64"
else
echo "Unsupported platform: $PLATFORM" >&2
exit 1
fi
LIB_EXT="so"
elif [ "$SYSNAME" = "Darwin" ]; then
BASE_DIR="runtimes/osx"
BASE_DIR="runtimes/darwin"
LIB_EXT="dylib"
elif [ -n "$SYSNAME" ]; then
echo "Unsupported system: $SYSNAME" >&2
@ -22,8 +52,24 @@ BUILD_DIR="$BASE_DIR/build"
rm -rf "$BASE_DIR"
mkdir -p "$NATIVE_DIR" "$BUILD_DIR"
cmake -S ../../gpt4all-backend -B "$BUILD_DIR" &&
cmake --build "$BUILD_DIR" -j --config Release && {
if [ "$PLATFORM" = "x86_64" ]; then
echo "Building for x86_64"
cmake -S ../../gpt4all-backend -B "$BUILD_DIR" -DCMAKE_BUILD_TYPE=RelWithDebInfo
fi
if [ "$PLATFORM" = "aarch64" ]; then
if [ "$(uname -m)" != "aarch64" ]; then
echo "Cross-compiling for aarch64"
cmake -S ../../gpt4all-backend \
-B "$BUILD_DIR" \
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
-DCMAKE_TOOLCHAIN_FILE="./toolchains/linux-arm64-toolchain.cmake"
else
cmake -S ../../gpt4all-backend -B "$BUILD_DIR" -DCMAKE_BUILD_TYPE=RelWithDebInfo
fi
fi
cmake --build "$BUILD_DIR" --parallel && {
cp "$BUILD_DIR"/libgptj*.$LIB_EXT "$NATIVE_DIR"/
cp "$BUILD_DIR"/libllama*.$LIB_EXT "$NATIVE_DIR"/
}
}

View File

@ -1,22 +1,21 @@
const prebuildify = require("prebuildify");
async function createPrebuilds(combinations) {
for (const { platform, arch } of combinations) {
async function createPrebuilds(configs) {
for (const config of configs) {
const opts = {
platform,
arch,
napi: true,
targets: ["18.16.0"]
targets: ["18.16.0"],
...config,
};
try {
await createPrebuild(opts);
console.log(
`Build succeeded for platform ${opts.platform} and architecture ${opts.arch}`
`Build succeeded for platform ${opts.platform} and architecture ${opts.arch}`,
);
} catch (err) {
console.error(
`Error building for platform ${opts.platform} and architecture ${opts.arch}:`,
err
err,
);
}
}
@ -24,6 +23,17 @@ async function createPrebuilds(combinations) {
function createPrebuild(opts) {
return new Promise((resolve, reject) => {
// if this prebuild is cross-compiling for arm64 on a non-arm64 machine,
// set the CXX and CC environment variables to the cross-compilers
if (
opts.arch === "arm64" &&
process.arch !== "arm64" &&
process.platform === "linux"
) {
process.env.CXX = "aarch64-linux-gnu-g++-12";
process.env.CC = "aarch64-linux-gnu-gcc-12";
}
prebuildify(opts, (err) => {
if (err) {
reject(err);
@ -35,22 +45,18 @@ function createPrebuild(opts) {
}
let prebuildConfigs;
if(process.platform === 'win32') {
prebuildConfigs = [
{ platform: "win32", arch: "x64" }
];
} else if(process.platform === 'linux') {
//Unsure if darwin works, need mac tester!
prebuildConfigs = [
{ platform: "linux", arch: "x64" },
//{ platform: "linux", arch: "arm64" },
//{ platform: "linux", arch: "armv7" },
]
} else if(process.platform === 'darwin') {
if (process.platform === "win32") {
prebuildConfigs = [{ platform: "win32", arch: "x64" }];
} else if (process.platform === "linux") {
prebuildConfigs = [
{ platform: "darwin", arch: "x64" },
{ platform: "darwin", arch: "arm64" },
]
{ platform: "linux", arch: "x64" },
{ platform: "linux", arch: "arm64" },
];
} else if (process.platform === "darwin") {
prebuildConfigs = [
{ platform: "darwin", arch: "x64" },
{ platform: "darwin", arch: "arm64" },
];
}
createPrebuilds(prebuildConfigs)

View File

@ -2,7 +2,6 @@ import { loadModel, createCompletion } from "../src/gpt4all.js";
const model = await loadModel("orca-mini-3b-gguf2-q4_0.gguf", {
verbose: true,
device: "gpu",
});
const chat = await model.createChatSession();
@ -12,8 +11,6 @@ await createCompletion(
"Why are bananas rather blue than bread at night sometimes?",
{
verbose: true,
nPredict: 10,
}
);
await createCompletion(chat, "Are you sure?", {
verbose: true,
});
);

View File

@ -7,12 +7,12 @@ const modelOptions = {
verbose: true,
};
const model1 = await loadModel("orca-mini-3b-gguf2-q4_0.gguf", {
const model1 = await loadModel("Phi-3-mini-4k-instruct.Q4_0.gguf", {
...modelOptions,
device: "gpu", // only one model can be on gpu
});
const model2 = await loadModel("orca-mini-3b-gguf2-q4_0.gguf", modelOptions);
const model3 = await loadModel("orca-mini-3b-gguf2-q4_0.gguf", modelOptions);
const model2 = await loadModel("Phi-3-mini-4k-instruct.Q4_0.gguf", modelOptions);
const model3 = await loadModel("Phi-3-mini-4k-instruct.Q4_0.gguf", modelOptions);
const promptContext = {
verbose: true,
@ -27,3 +27,6 @@ const responses = await Promise.all([
createCompletion(model3, "What is 1 + 3?", promptContext),
]);
console.log(responses.map((res) => res.choices[0].message));
model1.dispose();
model2.dispose();
model3.dispose();

View File

@ -1,61 +0,0 @@
import {
LLModel,
createCompletion,
DEFAULT_DIRECTORY,
DEFAULT_LIBRARIES_DIRECTORY,
loadModel,
} from "../src/gpt4all.js";
const model = await loadModel("mistral-7b-openorca.gguf2.Q4_0.gguf", {
verbose: true,
device: "gpu",
});
const ll = model.llm;
try {
class Extended extends LLModel {}
} catch (e) {
console.log("Extending from native class gone wrong " + e);
}
console.log("state size " + ll.stateSize());
console.log("thread count " + ll.threadCount());
ll.setThreadCount(5);
console.log("thread count " + ll.threadCount());
ll.setThreadCount(4);
console.log("thread count " + ll.threadCount());
console.log("name " + ll.name());
console.log("type: " + ll.type());
console.log("Default directory for models", DEFAULT_DIRECTORY);
console.log("Default directory for libraries", DEFAULT_LIBRARIES_DIRECTORY);
console.log("Has GPU", ll.hasGpuDevice());
console.log("gpu devices", ll.listGpu());
console.log("Required Mem in bytes", ll.memoryNeeded());
// to ingest a custom system prompt without using a chat session.
await createCompletion(
model,
"<|im_start|>system\nYou are an advanced mathematician.\n<|im_end|>\n",
{
promptTemplate: "%1",
nPredict: 0,
special: true,
}
);
const completion1 = await createCompletion(model, "What is 1 + 1?", {
verbose: true,
});
console.log(`🤖 > ${completion1.choices[0].message.content}`);
//Very specific:
// tested on Ubuntu 22.0, Linux Mint, if I set nPast to 100, the app hangs.
const completion2 = await createCompletion(model, "And if we add two?", {
verbose: true,
});
console.log(`🤖 > ${completion2.choices[0].message.content}`);
//CALLING DISPOSE WILL INVALID THE NATIVE MODEL. USE THIS TO CLEANUP
model.dispose();
console.log("model disposed, exiting...");

View File

@ -1,7 +1,6 @@
import { promises as fs } from "node:fs";
import { loadModel, createCompletion } from "../src/gpt4all.js";
const model = await loadModel("Nous-Hermes-2-Mistral-7B-DPO.Q4_0.gguf", {
const model = await loadModel("Phi-3-mini-4k-instruct.Q4_0.gguf", {
verbose: true,
device: "gpu",
});
@ -12,14 +11,15 @@ const res = await createCompletion(
{
onPromptToken: (tokenId) => {
console.debug("onPromptToken", { tokenId });
// throwing an error will cancel
// errors within the callback will cancel ingestion, inference will still run
throw new Error("This is an error");
// const foo = thisMethodDoesNotExist();
// returning false will cancel as well
// return false;
},
onResponseToken: (tokenId, token) => {
console.debug("onResponseToken", { tokenId, token });
onResponseTokens: ({ tokenIds, text }) => {
// console.debug("onResponseToken", { tokenIds, text });
process.stdout.write(text);
// same applies here
},
}

View File

@ -0,0 +1,37 @@
import {
loadModel,
createCompletion,
createCompletionStream,
createCompletionGenerator,
} from "../src/gpt4all.js";
const model = await loadModel("Phi-3-mini-4k-instruct.Q4_0.gguf", {
device: "cpu",
});
const prompt = "Tell a short story but only use emojis. Three sentences max.";
const result = await createCompletion(model, prompt, {
onResponseToken: (tokens) => {
console.debug(tokens)
},
});
console.debug(result.choices[0].message);
process.stdout.write("### Stream:");
const stream = createCompletionStream(model, prompt);
stream.tokens.on("data", (data) => {
process.stdout.write(data);
});
await stream.result;
process.stdout.write("\n");
process.stdout.write("### Generator:");
const gen = createCompletionGenerator(model, prompt);
for await (const chunk of gen) {
process.stdout.write(chunk);
}
model.dispose();

View File

@ -38,8 +38,8 @@ process.stdout.write("\n");
process.stdout.write("### Callback:");
await createCompletion(model, "Why not just callbacks?", {
onResponseToken: (tokenId, token) => {
process.stdout.write(token);
onResponseTokens: ({ text }) => {
process.stdout.write(text);
},
});
process.stdout.write("\n");

View File

@ -25,7 +25,7 @@ class ChatSession {
const { messages, systemPrompt, ...sessionDefaultPromptContext } =
chatSessionOpts;
this.model = model;
this.modelName = model.llm.name();
this.modelName = model.llm.getName();
this.messages = messages ?? [];
this.systemPrompt = systemPrompt ?? model.config.systemPrompt;
this.initialized = false;

View File

@ -5,10 +5,27 @@ interface LLModelOptions {
/**
* Model architecture. This argument currently does not have any functionality and is only used as a descriptive identifier for the user.
*/
type?: string;
model_name: string;
model_path: string;
library_path?: string;
modelType?: string;
/**
* Absolute path to the model file.
*/
modelFile: string;
/**
* Path to the llmodel implementation shared objects. This can be a single path or a list of paths separated by ';' delimiter.
*/
librariesPath?: string;
/**
* A string representing the implementation to use. One of 'auto', 'cpu', 'metal', 'kompute', or 'cuda'.
*/
backend: string;
/**
* The maximum window size of this model.
*/
nCtx: number;
/**
* Number of GPU layers to use (Vulkan)
*/
nGpuLayers: number;
}
interface ModelConfig {
@ -263,10 +280,10 @@ interface LLModelInferenceResult {
interface LLModelInferenceOptions extends Partial<LLModelPromptContext> {
/** Callback for response tokens, called for each generated token.
* @param {number} tokenId The token id.
* @param {string} token The token.
* @param {Uint8Array} bytes The token bytes.
* @returns {boolean | undefined} Whether to continue generating tokens.
* */
onResponseToken?: (tokenId: number, token: string) => boolean | void;
onResponseToken?: (tokenId: number, bytes: Uint8Array) => boolean | void;
/** Callback for prompt tokens, called for each input token in the prompt.
* @param {number} tokenId The token id.
* @returns {boolean | undefined} Whether to continue ingesting the prompt.
@ -281,30 +298,42 @@ interface LLModelInferenceOptions extends Partial<LLModelPromptContext> {
declare class LLModel {
/**
* Initialize a new LLModel.
* @param {string} path Absolute path to the model file.
* @throws {Error} If the model file does not exist.
* @param {LLModelOptions} options LLModel options.
* @throws {Error} If the model can't be loaded or necessary runtimes are not found.
*/
constructor(options: LLModelOptions);
/**
* Loads the LLModel.
* @return {boolean} true if the model was loaded successfully, false otherwise.
*/
load(): boolean;
/**
* Initiate a GPU by a string identifier. See LoadModelOptions.device for more information
* @param {string} device 'amd' | 'nvidia' | 'intel' | 'gpu' | gpu name.
* @return {boolean} true if the GPU was initialized successfully, false otherwise.
*/
initGpu(device: string): boolean;
/** undefined or user supplied */
type(): string | undefined;
getType(): string | undefined;
/** The name of the model. */
name(): string;
getName(): string;
/**
* Get the size of the internal state of the model.
* NOTE: This state data is specific to the type of model you have created.
* @return the size in bytes of the internal state of the model
*/
stateSize(): number;
getStateSize(): number;
/**
* Get the number of threads used for model inference.
* The default is the number of physical cores your computer has.
* @returns The number of threads used for model inference.
*/
threadCount(): number;
getThreadCount(): number;
/**
* Set the number of threads used for model inference.
@ -375,14 +404,6 @@ declare class LLModel {
*/
getLibraryPath(): string;
/**
* Initiate a GPU by a string identifier.
* @param {number} memory_required Should be in the range size_t or will throw
* @param {string} device_name 'amd' | 'nvidia' | 'intel' | 'gpu' | gpu name.
* read LoadModelOptions.device for more information
*/
initGpuByString(memory_required: number, device_name: string): boolean;
/**
* From C documentation
* @returns True if a GPU device is successfully initialized, false otherwise.
@ -391,11 +412,10 @@ declare class LLModel {
/**
* GPUs that are usable for this LLModel
* @param {number} nCtx Maximum size of context window
* @throws if hasGpuDevice returns false (i think)
* @returns
* @throws if gpu device list is not available
* @returns an array of GpuDevice objects
*/
listGpu(nCtx: number): GpuDevice[];
getGpuDevices(): GpuDevice[];
/**
* delete and cleanup the native model
@ -414,6 +434,7 @@ interface GpuDevice {
heapSize: number;
name: string;
vendor: string;
backend: string;
}
/**
@ -443,13 +464,15 @@ interface LoadModelOptions {
/**
* The processing unit on which the model will run. It can be set to
* - "cpu": Model will run on the central processing unit.
* - "gpu": Model will run on the best available graphics processing unit, irrespective of its vendor.
* - "amd", "nvidia", "intel": Model will run on the best available GPU from the specified vendor.
* - "kompute": Model will run using the kompute (vulkan) gpu backend
* - "cuda": Model will run using the cuda gpu backend
* - "gpu": Use Metal on ARM64 macOS, otherwise the same as "kompute"
* - "amd", "nvidia": Use the best GPU provided by the Kompute backend from this vendor.
* - "gpu name": Model will run on the GPU that matches the name if it's available.
* Note: If a GPU device lacks sufficient RAM to accommodate the model, an error will be thrown, and the GPT4All
* instance will be rendered invalid. It's advised to ensure the device has enough memory before initiating the
* model.
* @default "cpu"
* @default Metal on ARM64 macOS, "cpu" otherwise.
*/
device?: string;
/**
@ -458,10 +481,16 @@ interface LoadModelOptions {
*/
nCtx?: number;
/**
* Number of gpu layers needed
* Number of GPU layers to use (Vulkan)
* @default 100
* @alias ngl
*/
nGpuLayers?: number;
ngl?: number;
/**
* Number of CPU threads used by GPT4All. If not set, the number of threads is determined automatically.
*/
nThreads?: number;
}
interface InferenceModelOptions extends LoadModelOptions {
@ -507,15 +536,33 @@ interface CompletionProvider {
): Promise<InferenceResult>;
}
interface CompletionTokens {
/** The token ids. */
tokenIds: number[];
/** The token text. May be an empty string. */
text: string;
}
/**
* Options for creating a completion.
*/
interface CompletionOptions extends LLModelInferenceOptions {
interface CompletionOptions extends Partial<LLModelPromptContext> {
/**
* Indicates if verbose logging is enabled.
* @default false
*/
verbose?: boolean;
/** Called every time new tokens can be decoded to text.
* @param {CompletionTokens} tokens The token ids and decoded text.
* @returns {boolean | undefined} Whether to continue generating tokens.
* */
onResponseTokens?: (tokens: CompletionTokens) => boolean | void;
/** Callback for prompt tokens, called for each input token in the prompt.
* @param {number} tokenId The token id.
* @returns {boolean | undefined} Whether to continue ingesting the prompt.
* */
onPromptToken?: (tokenId: number) => boolean | void;
}
/**
@ -639,13 +686,6 @@ interface LLModelPromptContext {
*/
promptTemplate?: string;
/** The context window size. Do not use, it has no effect. See loadModel options.
* THIS IS DEPRECATED!!!
* Use loadModel's nCtx option instead.
* @default 2048
*/
nCtx: number;
/** The top-k logits to sample from.
* Top-K sampling selects the next token only from the top K most likely tokens predicted by the model.
* It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit
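A minimal sketch of the renamed low-level API described by these typings, assuming `LLModel` and `DEFAULT_LIBRARIES_DIRECTORY` are still exported from `src/gpt4all.js` as in the removed llmodel example; the model path is a placeholder, and `loadModel` remains the recommended entry point:

```js
import { LLModel, DEFAULT_LIBRARIES_DIRECTORY } from "../src/gpt4all.js";

const llmodel = new LLModel({
    modelFile: "/absolute/path/to/Phi-3-mini-4k-instruct.Q4_0.gguf", // placeholder
    librariesPath: DEFAULT_LIBRARIES_DIRECTORY, // ';'-separated runtime search paths
    backend: "kompute", // 'auto' | 'cpu' | 'metal' | 'kompute' | 'cuda'
    nCtx: 2048,
    nGpuLayers: 100,
});

// GPU initialization and weight loading are now separate, explicit steps
if (llmodel.initGpu("gpu")) {
    console.log(llmodel.getGpuDevices().map((d) => `${d.name} (${d.backend})`));
}
if (!llmodel.load()) {
    throw new Error("Failed to load model weights");
}
console.log(llmodel.getName(), llmodel.getStateSize(), llmodel.getThreadCount());
llmodel.dispose();
```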

View File

@ -37,9 +37,8 @@ async function loadModel(modelName, options = {}) {
type: "inference",
allowDownload: true,
verbose: false,
device: "cpu",
nCtx: 2048,
ngl: 100,
nGpuLayers: options.ngl ?? 100,
...options,
};
@ -54,27 +53,77 @@ async function loadModel(modelName, options = {}) {
typeof loadOptions.librariesPath === "string",
"Libraries path should be a string"
);
const existingPaths = loadOptions.librariesPath
const existingLibPaths = loadOptions.librariesPath
.split(";")
.filter(existsSync)
.join(";");
const llmOptions = {
model_name: appendBinSuffixIfMissing(modelName),
model_path: loadOptions.modelPath,
library_path: existingPaths,
device: loadOptions.device,
modelFile: modelConfig.path,
librariesPath: existingLibPaths,
nCtx: loadOptions.nCtx,
ngl: loadOptions.ngl,
nGpuLayers: loadOptions.nGpuLayers,
};
let initDevice;
if (process.platform === "darwin") {
if (!loadOptions.device) {
llmOptions.backend = "auto"; // 'auto' is effectively 'metal' due to currently non-functional fallback
} else if (loadOptions.device === "cpu") {
llmOptions.backend = "cpu";
} else {
if (process.arch !== "arm64" || loadOptions.device !== "gpu") {
throw new Error(
`Unknown device for this platform: ${loadOptions.device}`
);
}
llmOptions.backend = "metal";
}
} else {
// default to kompute. use cpu for arm64 because we currently don't build kompute runtimes for arm64
llmOptions.backend = process.arch === "arm64" ? "cpu" : "kompute";
if (!loadOptions.device || loadOptions.device === "cpu") {
// use the default backend
} else if (
loadOptions.device === "cuda" ||
loadOptions.device === "kompute"
) {
llmOptions.backend = loadOptions.device;
initDevice = "gpu";
} else if (loadOptions.device.startsWith("cuda:")) {
llmOptions.backend = "cuda";
initDevice = loadOptions.device.replace(/^cuda:/, "");
} else {
initDevice = loadOptions.device.replace(/^kompute:/, "");
}
}
if (loadOptions.verbose) {
console.debug("Creating LLModel:", {
initDevice,
llmOptions,
modelConfig,
});
}
const llmodel = new LLModel(llmOptions);
if (initDevice) {
const gpuInitSuccess = llmodel.initGpu(initDevice);
if (!gpuInitSuccess) {
const availableDevices = llmodel.getGpuDevices();
const deviceNames = availableDevices
.map((device) => device.name)
.join(", ");
console.warn(
`Failed to initialize GPU device "${initDevice}" - Available devices: ${deviceNames}`
);
}
}
llmodel.load();
if (loadOptions.nThreads) {
llmodel.setThreadCount(loadOptions.nThreads);
}
if (loadOptions.type === "embedding") {
return new EmbeddingModel(llmodel, modelConfig);
} else if (loadOptions.type === "inference") {
@ -84,7 +133,7 @@ async function loadModel(modelName, options = {}) {
}
}
function createEmbedding(model, text, options={}) {
function createEmbedding(model, text, options = {}) {
let {
dimensionality = undefined,
longTextMode = "mean",
@ -138,10 +187,7 @@ async function createCompletion(
...options,
};
const result = await provider.generate(
input,
completionOptions,
);
const result = await provider.generate(input, completionOptions);
return {
model: provider.modelName,
@ -174,10 +220,10 @@ function createCompletionStream(
const completionPromise = createCompletion(provider, input, {
...options,
onResponseToken: (tokenId, token) => {
completionStream.push(token);
if (options.onResponseToken) {
return options.onResponseToken(tokenId, token);
onResponseTokens: (tokens) => {
completionStream.push(tokens.text);
if (options.onResponseTokens) {
return options.onResponseTokens(tokens);
}
},
}).then((result) => {

View File

@ -11,7 +11,7 @@ class InferenceModel {
constructor(llmodel, config) {
this.llm = llmodel;
this.config = config;
this.modelName = this.llm.name();
this.modelName = this.llm.getName();
}
async createChatSession(options) {
@ -89,6 +89,25 @@ class InferenceModel {
}
let tokensGenerated = 0;
const decoder = new TokenDecoder((tokenIds, text) => {
let continueGeneration = true;
tokensGenerated += tokenIds.length;
if (options.onResponseTokens) {
// catch here because if errors bubble through cpp they will lose stacktraces
try {
// don't cancel the generation unless user explicitly returns false
continueGeneration =
options.onResponseTokens({ tokenIds, text }) !== false;
} catch (err) {
console.error("Error in onResponseToken callback", err);
continueGeneration = false;
}
}
return continueGeneration;
});
const result = await this.llm.infer(prompt, {
...promptContext,
@ -97,7 +116,7 @@ class InferenceModel {
let continueIngestion = true;
tokensIngested++;
if (options.onPromptToken) {
// catch errors because if they go through cpp they will loose stacktraces
// catch here because if errors bubble through cpp they will lose stacktraces
try {
// don't cancel ingestion unless user explicitly returns false
continueIngestion =
@ -109,20 +128,8 @@ class InferenceModel {
}
return continueIngestion;
},
onResponseToken: (tokenId, token) => {
let continueGeneration = true;
tokensGenerated++;
if (options.onResponseToken) {
try {
// don't cancel the generation unless user explicitly returns false
continueGeneration =
options.onResponseToken(tokenId, token) !== false;
} catch (err) {
console.error("Error in onResponseToken callback", err);
continueGeneration = false;
}
}
return continueGeneration;
onResponseToken: (tokenId, bytes) => {
return decoder.decode(tokenId, bytes);
},
});
@ -141,6 +148,63 @@ class InferenceModel {
}
}
// see https://github.com/nomic-ai/gpt4all/pull/1281
class TokenDecoder {
constructor(callback) {
this.callback = callback;
this.buffer = [];
this.tokenIds = [];
this.buffExpectingContBytes = 0;
this.textDecoder = new TextDecoder();
}
decode(tokenId, bytes) {
const decoded = [];
this.tokenIds.push(tokenId);
for (let i = 0; i < bytes.length; i++) {
const byte = bytes[i];
const bits = byte.toString(2).padStart(8, '0');
const highOnes = bits.split('0')[0];
if (highOnes.length === 1) {
// Continuation byte
this.buffer.push(byte);
this.buffExpectingContBytes -= 1;
} else {
// Beginning of a byte sequence
if (this.buffer.length > 0) {
decoded.push(this._decodeBuffer());
this.buffer = [];
}
this.buffer.push(byte);
this.buffExpectingContBytes = Math.max(0, highOnes.length - 1);
}
if (this.buffExpectingContBytes <= 0) {
// Received the whole sequence or an out-of-place continuation byte
decoded.push(this._decodeBuffer());
this.buffer = [];
this.buffExpectingContBytes = 0;
}
}
if (decoded.length === 0 && this.buffExpectingContBytes > 0) {
// Wait for more continuation bytes
return true;
}
const tokenIds = this.tokenIds;
this.tokenIds = [];
return this.callback(tokenIds, decoded.join(''));
}
_decodeBuffer() {
return this.textDecoder.decode(new Uint8Array(this.buffer));
}
}
class EmbeddingModel {
llm;
config;
@ -160,6 +224,7 @@ class EmbeddingModel {
}
module.exports = {
TokenDecoder,
InferenceModel,
EmbeddingModel,
};
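A quick sketch of how the new `TokenDecoder` buffers incomplete UTF-8 sequences (this is what fixes streamed emoji); the require path and token ids are illustrative:

```js
const { TokenDecoder } = require("./src/models.js");

const decoder = new TokenDecoder((tokenIds, text) => {
    console.log(tokenIds, JSON.stringify(text));
    return true; // returning false would stop generation
});

// "🤖" is the 4-byte UTF-8 sequence f0 9f a4 96. Pretend the model emitted it
// split across two tokens: the first call only buffers and waits for more bytes.
decoder.decode(101, new Uint8Array([0xf0, 0x9f])); // -> true, nothing emitted yet
decoder.decode(102, new Uint8Array([0xa4, 0x96])); // -> callback([101, 102], "🤖")
```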

View File

@ -0,0 +1,73 @@
const { loadModel } = require("../src/gpt4all.js");
// these tests require an internet connection / a real model
const testModel = "Phi-3-mini-4k-instruct.Q4_0.gguf";
describe("llmodel", () => {
let model;
test("load on cpu", async () => {
model = await loadModel(testModel, {
device: "cpu",
});
});
test("getter working", async () => {
const stateSize = model.llm.getStateSize();
expect(stateSize).toBeGreaterThan(0);
const name = model.llm.getName();
expect(name).toBe(testModel);
const type = model.llm.getType();
expect(type).toBeUndefined();
const devices = model.llm.getGpuDevices();
expect(Array.isArray(devices)).toBe(true);
const gpuEnabled = model.llm.hasGpuDevice();
expect(gpuEnabled).toBe(false);
const requiredMem = model.llm.getRequiredMemory();
expect(typeof requiredMem).toBe('number');
const threadCount = model.llm.getThreadCount();
expect(threadCount).toBe(4);
});
test("setting thread count", () => {
model.llm.setThreadCount(5);
expect(model.llm.getThreadCount()).toBe(5);
});
test("cpu inference", async () => {
const res = await model.llm.infer("what is the capital of france?", {
temp: 0,
promptTemplate: model.config.promptTemplate,
nPredict: 10,
onResponseToken: () => {
return true;
},
});
expect(res.text).toMatch(/paris/i);
}, 10000);
test("dispose and load model on gpu", async () => {
model.dispose();
model = await loadModel(testModel, {
device: "gpu",
});
const gpuEnabled = model.llm.hasGpuDevice();
expect(gpuEnabled).toBe(true);
});
test("gpu inference", async () => {
const res = await model.llm.infer("what is the capital of france?", {
temp: 0,
promptTemplate: model.config.promptTemplate,
nPredict: 10,
onResponseToken: () => {
return true;
},
});
expect(res.text).toMatch(/paris/i);
}, 10000);
afterAll(() => {
model.dispose();
});
});

View File

@ -2,7 +2,6 @@ const path = require("node:path");
const os = require("node:os");
const fsp = require("node:fs/promises");
const { existsSync } = require('node:fs');
const { LLModel } = require("node-gyp-build")(path.resolve(__dirname, ".."));
const {
listModels,
downloadModel,
@ -13,11 +12,8 @@ const {
DEFAULT_LIBRARIES_DIRECTORY,
DEFAULT_MODEL_LIST_URL,
} = require("../src/config.js");
const {
loadModel,
createPrompt,
createCompletion,
} = require("../src/gpt4all.js");
// these tests do not require an internet connection or an actual model
describe("config", () => {
test("default paths constants are available and correct", () => {

File diff suppressed because it is too large