typescript bindings maintenance (#2363)

* remove outdated comments

Signed-off-by: limez <limez@protonmail.com>

* simpler build from source

Signed-off-by: limez <limez@protonmail.com>

* update unix build script to create .so runtimes correctly

Signed-off-by: limez <limez@protonmail.com>

* configure ci build type, use RelWithDebInfo for dev build script

Signed-off-by: limez <limez@protonmail.com>

* add clean script

Signed-off-by: limez <limez@protonmail.com>

* fix streamed token decoding / emoji

Signed-off-by: limez <limez@protonmail.com>

* remove deprecated nCtx

Signed-off-by: limez <limez@protonmail.com>

* update typings

Signed-off-by: jacob <jacoobes@sern.dev>

update typings

Signed-off-by: jacob <jacoobes@sern.dev>

* readme, misspellings

Signed-off-by: jacob <jacoobes@sern.dev>

* cuda/backend logic changes + name napi methods like their js counterparts

Signed-off-by: limez <limez@protonmail.com>

* convert llmodel example into a test, separate test suite that can run in ci

Signed-off-by: limez <limez@protonmail.com>

* update examples / naming

Signed-off-by: limez <limez@protonmail.com>

* update deps, remove the need for binding.ci.gyp, make node-gyp-build fallback easier testable

Signed-off-by: limez <limez@protonmail.com>

* make sure the assert-backend-sources.js script is published, but not the others

Signed-off-by: limez <limez@protonmail.com>

* build correctly on windows (regression on node-gyp-build)

Signed-off-by: Jacob Nguyen <76754747+jacoobes@users.noreply.github.com>

* codespell

Signed-off-by: limez <limez@protonmail.com>

* make sure dlhandle.cpp gets linked correctly

Signed-off-by: limez <limez@protonmail.com>

* add include for check_cxx_compiler_flag call during aarch64 builds

Signed-off-by: limez <limez@protonmail.com>

* x86 > arm64 cross compilation of runtimes and bindings

Signed-off-by: limez <limez@protonmail.com>

* default to cpu instead of kompute on arm64

Signed-off-by: limez <limez@protonmail.com>

* formatting, more minimal example

Signed-off-by: limez <limez@protonmail.com>

---------

Signed-off-by: limez <limez@protonmail.com>
Signed-off-by: jacob <jacoobes@sern.dev>
Signed-off-by: Jacob Nguyen <76754747+jacoobes@users.noreply.github.com>
Co-authored-by: Jacob Nguyen <76754747+jacoobes@users.noreply.github.com>
Co-authored-by: jacob <jacoobes@sern.dev>
Andreas Obersteiner 2024-06-03 18:12:55 +02:00 committed by GitHub
parent f001897a1a
commit a602f7fde7
30 changed files with 1112 additions and 873 deletions

View File

@ -570,7 +570,7 @@ jobs:
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update
sudo apt-get install -y cmake build-essential vulkan-sdk cuda-compiler-12-4 libcublas-dev-12-4 libnvidia-compute-550-server libmysqlclient21 libodbc2 libpq5
sudo apt-get install -y cmake build-essential g++-12-aarch64-linux-gnu gcc-12-aarch64-linux-gnu vulkan-sdk cuda-compiler-12-4 libcublas-dev-12-4 libnvidia-compute-550-server libmysqlclient21 libodbc2 libpq5
- run:
name: Build Libraries
command: |
@ -578,14 +578,19 @@ jobs:
cd gpt4all-backend
mkdir -p runtimes/build
cd runtimes/build
cmake ../..
cmake --build . --parallel --config Release
cmake ../.. -DCMAKE_BUILD_TYPE=Release
cmake --build . --parallel
mkdir ../linux-x64
cp -L *.so ../linux-x64 # otherwise persist_to_workspace seems to mess symlinks
cmake ../.. -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE="./toolchains/linux-arm64-toolchain.cmake"
cmake --build . --parallel
mkdir ../linux-arm64
cp -L *.so ../linux-arm64
- persist_to_workspace:
root: gpt4all-backend
paths:
- runtimes/linux-x64/*.so
- runtimes/linux-arm64/*.so
build-bindings-backend-macos:
macos:
@ -896,6 +901,11 @@ jobs:
- checkout
- attach_workspace:
at: /tmp/gpt4all-backend
- run:
name: Install dependencies
command: |
sudo apt-get update
sudo apt-get install -y g++-12-aarch64-linux-gnu gcc-12-aarch64-linux-gnu
- node/install:
install-yarn: true
node-version: "18.16"
@ -908,18 +918,24 @@ jobs:
- run:
command: |
cd gpt4all-bindings/typescript
yarn prebuildify -t 18.16.0 --napi
yarn build:prebuilds
- run:
command: |
mkdir -p gpt4all-backend/prebuilds/linux-x64
mkdir -p gpt4all-backend/runtimes/linux-x64
cp /tmp/gpt4all-backend/runtimes/linux-x64/*-*.so gpt4all-backend/runtimes/linux-x64
cp gpt4all-bindings/typescript/prebuilds/linux-x64/*.node gpt4all-backend/prebuilds/linux-x64
mkdir -p gpt4all-backend/prebuilds/linux-arm64
mkdir -p gpt4all-backend/runtimes/linux-arm64
cp /tmp/gpt4all-backend/runtimes/linux-arm64/*-*.so gpt4all-backend/runtimes/linux-arm64
cp gpt4all-bindings/typescript/prebuilds/linux-arm64/*.node gpt4all-backend/prebuilds/linux-arm64
- persist_to_workspace:
root: gpt4all-backend
paths:
- prebuilds/linux-x64/*.node
- runtimes/linux-x64/*-*.so
- prebuilds/linux-arm64/*.node
- runtimes/linux-arm64/*-*.so
build-nodejs-macos:
macos:
xcode: "14.0.0"
@ -1029,13 +1045,11 @@ jobs:
cp /tmp/gpt4all-backend/runtimes/darwin/*-*.* runtimes/darwin/native/
cp /tmp/gpt4all-backend/prebuilds/darwin-x64/*.node prebuilds/darwin-x64/
# Fallback build if user is not on above prebuilds
mv -f binding.ci.gyp binding.gyp
mkdir gpt4all-backend
# copy the backend source we depend on to make fallback builds work
mkdir backend
cd ../../gpt4all-backend
mv llmodel.h llmodel.cpp llmodel_c.cpp llmodel_c.h sysinfo.h dlhandle.h ../gpt4all-bindings/typescript/gpt4all-backend/
mv llmodel.h llmodel.cpp llmodel_c.cpp llmodel_c.h sysinfo.h dlhandle.h ../gpt4all-bindings/typescript/backend/
# Test install
- node/install-packages:
@ -1045,7 +1059,7 @@ jobs:
- run:
command: |
cd gpt4all-bindings/typescript
yarn run test
yarn run test:ci
- run:
command: |
cd gpt4all-bindings/typescript

View File

@ -79,6 +79,7 @@ if (LLMODEL_ROCM)
endif()
set(CMAKE_VERBOSE_MAKEFILE ON)
include(CheckCXXCompilerFlag)
# Go through each build variant
foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)

View File

@ -0,0 +1,11 @@
# Toolchain to crosscompile runtimes for arm64 on jammy x86_64
# You may have to `sudo apt-get install g++-12-aarch64-linux-gnu gcc-12-aarch64-linux-gnu`
set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_PROCESSOR aarch64)
set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc-12)
set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++-12)
# Supported backends
set(LLMODEL_CUDA off)
set(LLMODEL_KOMPUTE off)

View File

@ -8,4 +8,5 @@ prebuilds/
!.yarn/sdks
!.yarn/versions
runtimes/
backend/
compile_flags.txt

View File

@ -1,4 +1,5 @@
test/
spec/
scripts/
scripts/*
!scripts/assert-backend-sources.js
build

View File

@ -188,6 +188,8 @@ model.dispose();
* python 3
* On Windows and Linux, building GPT4All requires the complete Vulkan SDK. You may download it from here: https://vulkan.lunarg.com/sdk/home
* macOS users do not need Vulkan, as GPT4All will use Metal instead.
* CUDA Toolkit >= 11.4 (you can bypass this by adding a custom flag to the build step)
- Windows: Compiling with CUDA is difficult if the Visual Studio IDE is not present.
### Build (from source)
@ -196,23 +198,29 @@ git clone https://github.com/nomic-ai/gpt4all.git
cd gpt4all-bindings/typescript
```
* The below shell commands assume the current working directory is `typescript`.
* To Build and Rebuild:
```sh
node scripts/prebuild.js
```
* llama.cpp git submodule for gpt4all can be possibly absent. If this is the case, make sure to run in llama.cpp parent directory
The llama.cpp git submodule for gpt4all may be absent or outdated. Make sure to run
```sh
git submodule update --init --recursive
```
The below shell commands assume the current working directory is `typescript`.
Using yarn
```sh
yarn build:backend
yarn install
yarn build
```
This will build platform-dependent dynamic libraries, and will be located in runtimes/(platform)/native
Using npm
```sh
npm install
npm run build
```
The `build:runtimes` script will create runtime libraries for your platform in `runtimes` and `build:prebuilds` will create the bindings in `prebuilds`. `build` is a shortcut for both.
### Test
@ -259,7 +267,7 @@ yarn test
This package has been stabilizing over time, and breaking changes may happen until the API stabilizes. Here's the todo list:
* \[ ] Purely offline. Per the gui, which can be run completely offline, the bindings should be as well.
* \[x] [Purely offline](#Offline-usage). Per the gui, which can be run completely offline, the bindings should be as well.
* \[ ] NPM bundle size reduction via optionalDependencies strategy (need help)
* Should include prebuilds to avoid painful node-gyp errors
* \[x] createChatSession ( the python equivalent to create\_chat\_session )
@ -276,7 +284,7 @@ This package has been stabilizing over time development, and breaking changes ma
This repository serves as the new bindings for nodejs users.
- If you were a user of [these bindings](https://github.com/nomic-ai/gpt4all-ts), they are outdated.
- Version 4 includes the following breaking changes:
* `createEmbedding` & `EmbeddingModel.embed()` returns an object, `EmbeddingResult`, instead of a float32array.
* `createEmbedding` & `EmbeddingModel.embed()` returns an object, `EmbeddingResult`, instead of a Float32Array.
* Removed deprecated types `ModelType` and `ModelFile`
* Removed deprecated initiation of model by string path only
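For reference, a minimal sketch of the version 4 embedding result, assuming the `EmbeddingResult` shape (`n_prompt_tokens`, `embeddings`) set in the `index.cc` changes further down; the model file name and import path are placeholders:

```js
import { loadModel, createEmbedding } from "../src/gpt4all.js";

// placeholder embedding model; any embedding-capable gguf should work
const embedder = await loadModel("nomic-embed-text-v1.f16.gguf", {
    type: "embedding",
});

// single input: `embeddings` holds one vector
const single = createEmbedding(embedder, "Hello world");
console.log(single.n_prompt_tokens, single.embeddings.length);

// multiple inputs: `embeddings` holds one vector per input text
const batch = createEmbedding(embedder, ["first text", "second text"]);
console.log(batch.embeddings.length); // 2
```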

View File

@ -1,62 +0,0 @@
{
"targets": [
{
"target_name": "gpt4all", # gpt4all-ts will cause compile error
"include_dirs": [
"<!@(node -p \"require('node-addon-api').include\")",
"gpt4all-backend",
],
"sources": [
# PREVIOUS VERSION: had to required the sources, but with newest changes do not need to
#"../../gpt4all-backend/llama.cpp/examples/common.cpp",
#"../../gpt4all-backend/llama.cpp/ggml.c",
#"../../gpt4all-backend/llama.cpp/llama.cpp",
# "../../gpt4all-backend/utils.cpp",
"gpt4all-backend/llmodel_c.cpp",
"gpt4all-backend/llmodel.cpp",
"prompt.cc",
"index.cc",
],
"conditions": [
['OS=="mac"', {
'xcode_settings': {
'GCC_ENABLE_CPP_EXCEPTIONS': 'YES'
},
'defines': [
'LIB_FILE_EXT=".dylib"',
'NAPI_CPP_EXCEPTIONS',
],
'cflags_cc': [
"-fexceptions"
]
}],
['OS=="win"', {
'defines': [
'LIB_FILE_EXT=".dll"',
'NAPI_CPP_EXCEPTIONS',
],
"msvs_settings": {
"VCCLCompilerTool": {
"AdditionalOptions": [
"/std:c++20",
"/EHsc",
],
},
},
}],
['OS=="linux"', {
'defines': [
'LIB_FILE_EXT=".so"',
'NAPI_CPP_EXCEPTIONS',
],
'cflags_cc!': [
'-fno-rtti',
],
'cflags_cc': [
'-std=c++2a',
'-fexceptions'
]
}]
]
}]
}

View File

@ -1,19 +1,15 @@
{
"targets": [
{
"target_name": "gpt4all", # gpt4all-ts will cause compile error
"target_name": "gpt4all",
"include_dirs": [
"<!@(node -p \"require('node-addon-api').include\")",
"../../gpt4all-backend",
"backend",
],
"sources": [
# PREVIOUS VERSION: had to required the sources, but with newest changes do not need to
#"../../gpt4all-backend/llama.cpp/examples/common.cpp",
#"../../gpt4all-backend/llama.cpp/ggml.c",
#"../../gpt4all-backend/llama.cpp/llama.cpp",
# "../../gpt4all-backend/utils.cpp",
"../../gpt4all-backend/llmodel_c.cpp",
"../../gpt4all-backend/llmodel.cpp",
"backend/llmodel_c.cpp",
"backend/llmodel.cpp",
"backend/dlhandle.cpp",
"prompt.cc",
"index.cc",
],

View File

@ -3,23 +3,24 @@
Napi::Function NodeModelWrapper::GetClass(Napi::Env env)
{
Napi::Function self = DefineClass(env, "LLModel",
{InstanceMethod("type", &NodeModelWrapper::GetType),
InstanceMethod("isModelLoaded", &NodeModelWrapper::IsModelLoaded),
InstanceMethod("name", &NodeModelWrapper::GetName),
InstanceMethod("stateSize", &NodeModelWrapper::StateSize),
InstanceMethod("infer", &NodeModelWrapper::Infer),
InstanceMethod("setThreadCount", &NodeModelWrapper::SetThreadCount),
InstanceMethod("embed", &NodeModelWrapper::GenerateEmbedding),
InstanceMethod("threadCount", &NodeModelWrapper::ThreadCount),
InstanceMethod("getLibraryPath", &NodeModelWrapper::GetLibraryPath),
InstanceMethod("initGpuByString", &NodeModelWrapper::InitGpuByString),
InstanceMethod("hasGpuDevice", &NodeModelWrapper::HasGpuDevice),
InstanceMethod("listGpu", &NodeModelWrapper::GetGpuDevices),
InstanceMethod("memoryNeeded", &NodeModelWrapper::GetRequiredMemory),
InstanceMethod("dispose", &NodeModelWrapper::Dispose)});
Napi::Function self = DefineClass(
env, "LLModel",
{InstanceMethod("load", &NodeModelWrapper::Load),
InstanceMethod("initGpu", &NodeModelWrapper::InitGpu),
InstanceMethod("infer", &NodeModelWrapper::Infer),
InstanceMethod("embed", &NodeModelWrapper::Embed),
InstanceMethod("isModelLoaded", &NodeModelWrapper::IsModelLoaded),
InstanceMethod("getType", &NodeModelWrapper::GetType),
InstanceMethod("getName", &NodeModelWrapper::GetName),
InstanceMethod("getStateSize", &NodeModelWrapper::GetStateSize),
InstanceMethod("setThreadCount", &NodeModelWrapper::SetThreadCount),
InstanceMethod("getThreadCount", &NodeModelWrapper::GetThreadCount),
InstanceMethod("getLibraryPath", &NodeModelWrapper::GetLibraryPath),
InstanceMethod("hasGpuDevice", &NodeModelWrapper::HasGpuDevice),
InstanceMethod("getGpuDevices", &NodeModelWrapper::GetGpuDevices),
InstanceMethod("getRequiredMemory", &NodeModelWrapper::GetRequiredMemory),
InstanceMethod("dispose", &NodeModelWrapper::Dispose)});
// Keep a static reference to the constructor
//
Napi::FunctionReference *constructor = new Napi::FunctionReference();
*constructor = Napi::Persistent(self);
env.SetInstanceData(constructor);
@ -29,13 +30,13 @@ Napi::Value NodeModelWrapper::GetRequiredMemory(const Napi::CallbackInfo &info)
{
auto env = info.Env();
return Napi::Number::New(
env, static_cast<uint32_t>(llmodel_required_mem(GetInference(), full_model_path.c_str(), nCtx, nGpuLayers)));
env, static_cast<uint32_t>(llmodel_required_mem(GetInference(), model_file.c_str(), n_ctx, n_gpu_layers)));
}
Napi::Value NodeModelWrapper::GetGpuDevices(const Napi::CallbackInfo &info)
{
auto env = info.Env();
int num_devices = 0;
auto mem_size = llmodel_required_mem(GetInference(), full_model_path.c_str(), nCtx, nGpuLayers);
auto mem_size = llmodel_required_mem(GetInference(), model_file.c_str(), n_ctx, n_gpu_layers);
llmodel_gpu_device *all_devices = llmodel_available_gpu_devices(mem_size, &num_devices);
if (all_devices == nullptr)
{
@ -63,6 +64,7 @@ Napi::Value NodeModelWrapper::GetGpuDevices(const Napi::CallbackInfo &info)
js_gpu_device["heapSize"] = static_cast<uint32_t>(gpu_device.heapSize);
js_gpu_device["name"] = gpu_device.name;
js_gpu_device["vendor"] = gpu_device.vendor;
js_gpu_device["backend"] = gpu_device.backend;
js_array[i] = js_gpu_device;
}
@ -71,35 +73,13 @@ Napi::Value NodeModelWrapper::GetGpuDevices(const Napi::CallbackInfo &info)
Napi::Value NodeModelWrapper::GetType(const Napi::CallbackInfo &info)
{
if (type.empty())
if (model_type.empty())
{
return info.Env().Undefined();
}
return Napi::String::New(info.Env(), type);
return Napi::String::New(info.Env(), model_type);
}
Napi::Value NodeModelWrapper::InitGpuByString(const Napi::CallbackInfo &info)
{
auto env = info.Env();
size_t memory_required = static_cast<size_t>(info[0].As<Napi::Number>().Uint32Value());
std::string gpu_device_identifier = info[1].As<Napi::String>();
size_t converted_value;
if (memory_required <= std::numeric_limits<size_t>::max())
{
converted_value = static_cast<size_t>(memory_required);
}
else
{
Napi::Error::New(env, "invalid number for memory size. Exceeded bounds for memory.")
.ThrowAsJavaScriptException();
return env.Undefined();
}
auto result = llmodel_gpu_init_gpu_device_by_string(GetInference(), converted_value, gpu_device_identifier.c_str());
return Napi::Boolean::New(env, result);
}
Napi::Value NodeModelWrapper::HasGpuDevice(const Napi::CallbackInfo &info)
{
return Napi::Boolean::New(info.Env(), llmodel_has_gpu_device(GetInference()));
@ -110,82 +90,61 @@ NodeModelWrapper::NodeModelWrapper(const Napi::CallbackInfo &info) : Napi::Objec
auto env = info.Env();
auto config_object = info[0].As<Napi::Object>();
// sets the directory where models (gguf files) are to be searched
llmodel_set_implementation_search_path(
config_object.Has("library_path") ? config_object.Get("library_path").As<Napi::String>().Utf8Value().c_str()
: ".");
// sets the directories where runtime libs are to be searched
llmodel_set_implementation_search_path(config_object.Has("librariesPath")
? config_object.Get("librariesPath").As<Napi::String>().Utf8Value().c_str()
: ".");
std::string model_name = config_object.Get("model_name").As<Napi::String>();
fs::path model_path = config_object.Get("model_path").As<Napi::String>().Utf8Value();
std::string full_weight_path = (model_path / fs::path(model_name)).string();
model_file = config_object.Get("modelFile").As<Napi::String>().Utf8Value();
model_name = model_file.substr(model_file.find_last_of("/\\") + 1);
backend = config_object.Get("backend").As<Napi::String>().Utf8Value();
n_ctx = config_object.Get("nCtx").As<Napi::Number>().Int32Value();
n_gpu_layers = config_object.Get("nGpuLayers").As<Napi::Number>().Int32Value();
name = model_name.empty() ? model_path.filename().string() : model_name;
full_model_path = full_weight_path;
nCtx = config_object.Get("nCtx").As<Napi::Number>().Int32Value();
nGpuLayers = config_object.Get("ngl").As<Napi::Number>().Int32Value();
const char *e;
inference_ = llmodel_model_create2(full_weight_path.c_str(), "auto", &e);
const char *err;
inference_ = llmodel_model_create2(model_file.c_str(), backend.c_str(), &err);
if (!inference_)
{
Napi::Error::New(env, e).ThrowAsJavaScriptException();
Napi::Error::New(env, err).ThrowAsJavaScriptException();
return;
}
if (GetInference() == nullptr)
{
std::cerr << "Tried searching libraries in \"" << llmodel_get_implementation_search_path() << "\"" << std::endl;
std::cerr << "Tried searching for model weight in \"" << full_weight_path << "\"" << std::endl;
std::cerr << "Tried using model weights in \"" << model_file << "\"" << std::endl;
std::cerr << "Do you have runtime libraries installed?" << std::endl;
Napi::Error::New(env, "Had an issue creating llmodel object, inference is null").ThrowAsJavaScriptException();
return;
}
std::string device = config_object.Get("device").As<Napi::String>();
if (device != "cpu")
{
size_t mem = llmodel_required_mem(GetInference(), full_weight_path.c_str(), nCtx, nGpuLayers);
auto success = llmodel_gpu_init_gpu_device_by_string(GetInference(), mem, device.c_str());
if (!success)
{
// https://github.com/nomic-ai/gpt4all/blob/3acbef14b7c2436fe033cae9036e695d77461a16/gpt4all-bindings/python/gpt4all/pyllmodel.py#L215
// Haven't implemented this but it is still open to contribution
std::cout << "WARNING: Failed to init GPU\n";
}
}
auto success = llmodel_loadModel(GetInference(), full_weight_path.c_str(), nCtx, nGpuLayers);
if (!success)
{
Napi::Error::New(env, "Failed to load model at given path").ThrowAsJavaScriptException();
return;
}
// optional
if (config_object.Has("model_type"))
if (config_object.Has("modelType"))
{
type = config_object.Get("model_type").As<Napi::String>();
model_type = config_object.Get("modelType").As<Napi::String>();
}
};
// NodeModelWrapper::~NodeModelWrapper() {
// if(GetInference() != nullptr) {
// std::cout << "Debug: deleting model\n";
// llmodel_model_destroy(inference_);
// std::cout << (inference_ == nullptr);
// }
// }
// void NodeModelWrapper::Finalize(Napi::Env env) {
// if(inference_ != nullptr) {
// std::cout << "Debug: deleting model\n";
//
// }
// }
Napi::Value NodeModelWrapper::Load(const Napi::CallbackInfo &info)
{
auto env = info.Env();
auto success = llmodel_loadModel(GetInference(), model_file.c_str(), n_ctx, n_gpu_layers);
return Napi::Boolean::New(env, success);
}
Napi::Value NodeModelWrapper::InitGpu(const Napi::CallbackInfo &info)
{
auto env = info.Env();
auto device = info[0].As<Napi::String>().Utf8Value();
size_t mem_required = llmodel_required_mem(GetInference(), model_file.c_str(), n_ctx, n_gpu_layers);
auto success = llmodel_gpu_init_gpu_device_by_string(GetInference(), mem_required, device.c_str());
return Napi::Boolean::New(env, success);
}
Napi::Value NodeModelWrapper::IsModelLoaded(const Napi::CallbackInfo &info)
{
return Napi::Boolean::New(info.Env(), llmodel_isModelLoaded(GetInference()));
}
Napi::Value NodeModelWrapper::StateSize(const Napi::CallbackInfo &info)
Napi::Value NodeModelWrapper::GetStateSize(const Napi::CallbackInfo &info)
{
// Implement the binding for the stateSize method
return Napi::Number::New(info.Env(), static_cast<int64_t>(llmodel_get_state_size(GetInference())));
@ -220,7 +179,7 @@ Napi::Array ChunkedFloatPtr(float *embedding_ptr, int embedding_size, int text_l
return result;
}
Napi::Value NodeModelWrapper::GenerateEmbedding(const Napi::CallbackInfo &info)
Napi::Value NodeModelWrapper::Embed(const Napi::CallbackInfo &info)
{
auto env = info.Env();
@ -256,7 +215,7 @@ Napi::Value NodeModelWrapper::GenerateEmbedding(const Napi::CallbackInfo &info)
str_ptrs.push_back(text_arr[i].c_str());
str_ptrs.push_back(nullptr);
const char *_err = nullptr;
float *embeds = llmodel_embed(GetInference(), str_ptrs.data(), &embedding_size,
float *embeds = llmodel_embed(GetInference(), str_ptrs.data(), &embedding_size,
prefix.IsUndefined() ? nullptr : prefix.As<Napi::String>().Utf8Value().c_str(),
dimensionality, &token_count, do_mean, atlas, nullptr, &_err);
if (!embeds)
@ -271,9 +230,12 @@ Napi::Value NodeModelWrapper::GenerateEmbedding(const Napi::CallbackInfo &info)
llmodel_free_embedding(embeds);
auto res = Napi::Object::New(env);
res.Set("n_prompt_tokens", token_count);
if(is_single_text) {
if (is_single_text)
{
res.Set("embeddings", embedmat.Get(static_cast<uint32_t>(0)));
} else {
}
else
{
res.Set("embeddings", embedmat);
}
@ -308,7 +270,7 @@ Napi::Value NodeModelWrapper::Infer(const Napi::CallbackInfo &info)
llmodel_prompt_context promptContext = {.logits = nullptr,
.tokens = nullptr,
.n_past = 0,
.n_ctx = nCtx,
.n_ctx = n_ctx,
.n_predict = 4096,
.top_k = 40,
.top_p = 0.9f,
@ -323,6 +285,12 @@ Napi::Value NodeModelWrapper::Infer(const Napi::CallbackInfo &info)
auto inputObject = info[1].As<Napi::Object>();
if (!inputObject.Has("promptTemplate"))
{
Napi::Error::New(info.Env(), "Missing Prompt Template").ThrowAsJavaScriptException();
return info.Env().Undefined();
}
if (inputObject.Has("logits") || inputObject.Has("tokens"))
{
Napi::Error::New(info.Env(), "Invalid input: 'logits' or 'tokens' properties are not allowed")
@ -425,9 +393,9 @@ void NodeModelWrapper::SetThreadCount(const Napi::CallbackInfo &info)
Napi::Value NodeModelWrapper::GetName(const Napi::CallbackInfo &info)
{
return Napi::String::New(info.Env(), name);
return Napi::String::New(info.Env(), model_name);
}
Napi::Value NodeModelWrapper::ThreadCount(const Napi::CallbackInfo &info)
Napi::Value NodeModelWrapper::GetThreadCount(const Napi::CallbackInfo &info)
{
return Napi::Number::New(info.Env(), llmodel_threadCount(GetInference()));
}

View File

@ -16,30 +16,28 @@ class NodeModelWrapper : public Napi::ObjectWrap<NodeModelWrapper>
public:
NodeModelWrapper(const Napi::CallbackInfo &);
// virtual ~NodeModelWrapper();
Napi::Value GetType(const Napi::CallbackInfo &info);
Napi::Value IsModelLoaded(const Napi::CallbackInfo &info);
Napi::Value StateSize(const Napi::CallbackInfo &info);
// void Finalize(Napi::Env env) override;
Napi::Value Load(const Napi::CallbackInfo &info);
Napi::Value InitGpu(const Napi::CallbackInfo &info);
/**
* Prompting the model. This entails spawning a new thread and adding the response tokens
* into a thread local string variable.
*/
Napi::Value Infer(const Napi::CallbackInfo &info);
void SetThreadCount(const Napi::CallbackInfo &info);
void Dispose(const Napi::CallbackInfo &info);
Napi::Value Embed(const Napi::CallbackInfo &info);
Napi::Value IsModelLoaded(const Napi::CallbackInfo &info);
Napi::Value GetType(const Napi::CallbackInfo &info);
Napi::Value GetName(const Napi::CallbackInfo &info);
Napi::Value ThreadCount(const Napi::CallbackInfo &info);
Napi::Value GenerateEmbedding(const Napi::CallbackInfo &info);
Napi::Value HasGpuDevice(const Napi::CallbackInfo &info);
Napi::Value ListGpus(const Napi::CallbackInfo &info);
Napi::Value InitGpuByString(const Napi::CallbackInfo &info);
Napi::Value GetRequiredMemory(const Napi::CallbackInfo &info);
Napi::Value GetGpuDevices(const Napi::CallbackInfo &info);
Napi::Value GetStateSize(const Napi::CallbackInfo &info);
void SetThreadCount(const Napi::CallbackInfo &info);
Napi::Value GetThreadCount(const Napi::CallbackInfo &info);
/*
* The path that is used to search for the dynamic libraries
*/
Napi::Value GetLibraryPath(const Napi::CallbackInfo &info);
Napi::Value HasGpuDevice(const Napi::CallbackInfo &info);
Napi::Value GetGpuDevices(const Napi::CallbackInfo &info);
Napi::Value GetRequiredMemory(const Napi::CallbackInfo &info);
void Dispose(const Napi::CallbackInfo &info);
/**
* Creates the LLModel class
*/
@ -54,10 +52,10 @@ class NodeModelWrapper : public Napi::ObjectWrap<NodeModelWrapper>
std::mutex inference_mutex;
std::string type;
// corresponds to LLModel::name() in typescript
std::string name;
int nCtx{};
int nGpuLayers{};
std::string full_model_path;
std::string model_type;
std::string model_name;
std::string model_file;
std::string backend;
int n_ctx{};
int n_gpu_layers{};
};

View File

@ -5,32 +5,38 @@
"main": "src/gpt4all.js",
"repository": "nomic-ai/gpt4all",
"scripts": {
"install": "node-gyp-build",
"install": "node ./scripts/assert-backend-sources.js && node-gyp-build",
"test:ci": "jest test/ci.test.js",
"test": "jest",
"build:backend": "node scripts/build.js",
"build": "node-gyp-build",
"clean": "rimraf build runtimes prebuilds backend",
"prebuild": "npm run clean",
"build": "npm run build:runtimes && npm run build:prebuilds",
"build:runtimes": "node scripts/build.js",
"build:prebuilds": "node scripts/assert-backend-sources.js && node scripts/prebuild.js",
"docs:build": "node scripts/docs.js && documentation readme ./src/gpt4all.d.ts --parse-extension js d.ts --format md --section \"API Reference\" --readme-file ../python/docs/gpt4all_nodejs.md"
},
"files": [
"binding.gyp",
"src/**/*",
"runtimes/**/*",
"binding.gyp",
"prebuilds/**/*",
"backend/**/*",
"scripts/assert-backend-sources.js",
"*.h",
"*.cc",
"gpt4all-backend/**/*"
"*.cc"
],
"dependencies": {
"md5-file": "^5.0.0",
"node-addon-api": "^6.1.0",
"node-gyp-build": "^4.6.0"
"node-addon-api": "^8.0.0",
"node-gyp-build": "~4.8.0"
},
"devDependencies": {
"@types/node": "^20.1.5",
"@types/node": "^20.12.12",
"documentation": "^14.0.2",
"jest": "^29.5.0",
"prebuildify": "^5.0.1",
"prettier": "^2.8.8"
"jest": "^29.7.0",
"prebuildify": "^6.0.1",
"prettier": "^3.2.5",
"rimraf": "^5.0.7"
},
"optionalDependencies": {
"node-gyp": "9.x.x"

View File

@ -131,7 +131,8 @@ bool PromptWorker::ResponseCallback(int32_t token_id, const std::string token)
// Transform native data into JS data, passing it to the provided
// `jsCallback` -- the TSFN's JavaScript function.
auto token_id = Napi::Number::New(env, value->tokenId);
auto token = Napi::String::New(env, value->token);
auto token = Napi::Uint8Array::New(env, value->token.size());
memcpy(token.Data(), value->token.data(), value->token.size());
auto jsResult = jsCallback.Call({token_id, token}).ToBoolean();
promise.set_value(jsResult);
}
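Since the native response callback now passes raw token bytes instead of a decoded string, low-level consumers of `llm.infer` have to decode the bytes themselves. A minimal sketch (model name is a placeholder; higher-level code should rely on the `onResponseTokens` / `TokenDecoder` path shown later):

```js
import { loadModel } from "../src/gpt4all.js";

const model = await loadModel("Phi-3-mini-4k-instruct.Q4_0.gguf", { device: "cpu" });
const decoder = new TextDecoder();

const result = await model.llm.infer("What is 1 + 1?", {
    promptTemplate: model.config.promptTemplate,
    nPredict: 16,
    // the callback now receives a Uint8Array of token bytes, not a string
    onResponseToken: (tokenId, bytes) => {
        process.stdout.write(decoder.decode(bytes, { stream: true })); // streaming-safe decode
        return true; // returning false stops generation
    },
});
console.log("\n" + result.text);
model.dispose();
```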

View File

@ -0,0 +1,47 @@
const fs = require("fs");
const path = require("path");
// Copies the shared llmodel sources from gpt4all-backend into the backend folder.
// These are dependencies of the bindings and will be required in case node-gyp-build
// cannot find a prebuild. This script is used in the package install hook and will
// be executed BOTH when `yarn install` is run in the root folder AND when the package
// is installed as a dependency in another project.
const backendDeps = [
"llmodel.h",
"llmodel.cpp",
"llmodel_c.cpp",
"llmodel_c.h",
"sysinfo.h",
"dlhandle.h",
"dlhandle.cpp",
];
const sourcePath = path.resolve(__dirname, "../../../gpt4all-backend");
const destPath = path.resolve(__dirname, "../backend");
// Silently ignore if the backend sources are not available.
// When the package is installed as a dependency, gpt4all-backend will not be present.
if (fs.existsSync(sourcePath)) {
if (!fs.existsSync(destPath)) {
fs.mkdirSync(destPath);
}
for (const file of backendDeps) {
const sourceFile = path.join(sourcePath, file);
const destFile = path.join(destPath, file);
if (fs.existsSync(sourceFile)) {
console.info(`Copying ${sourceFile} to ${destFile}`);
fs.copyFileSync(sourceFile, destFile); // overwrite
} else {
throw new Error(`File ${sourceFile} does not exist`);
}
}
}
// assert that the backend sources are present
for (const file of backendDeps) {
const destFile = path.join(destPath, file);
if (!fs.existsSync(destFile)) {
throw new Error(`File ${destFile} does not exist`);
}
}

View File

@ -1,12 +1,42 @@
#!/bin/sh
# Build script for Unix-like systems (Linux, macOS).
# Script assumes the current working directory is the bindings project root.
SYSNAME=$(uname -s)
PLATFORM=$(uname -m)
# Allows overriding target sysname and platform via args
# If not provided, the current system's sysname and platform will be used
while [ $# -gt 0 ]; do
case "$1" in
--sysname=*)
SYSNAME="${1#*=}"
shift
;;
--platform=*)
PLATFORM="${1#*=}"
shift
;;
*)
echo "Unknown argument: $1" >&2
exit 1
;;
esac
done
if [ "$SYSNAME" = "Linux" ]; then
BASE_DIR="runtimes/linux-x64"
if [ "$PLATFORM" = "x86_64" ]; then
BASE_DIR="runtimes/linux-x64"
elif [ "$PLATFORM" = "aarch64" ]; then
BASE_DIR="runtimes/linux-arm64"
else
echo "Unsupported platform: $PLATFORM" >&2
exit 1
fi
LIB_EXT="so"
elif [ "$SYSNAME" = "Darwin" ]; then
BASE_DIR="runtimes/osx"
BASE_DIR="runtimes/darwin"
LIB_EXT="dylib"
elif [ -n "$SYSNAME" ]; then
echo "Unsupported system: $SYSNAME" >&2
@ -22,8 +52,24 @@ BUILD_DIR="$BASE_DIR/build"
rm -rf "$BASE_DIR"
mkdir -p "$NATIVE_DIR" "$BUILD_DIR"
cmake -S ../../gpt4all-backend -B "$BUILD_DIR" &&
cmake --build "$BUILD_DIR" -j --config Release && {
if [ "$PLATFORM" = "x86_64" ]; then
echo "Building for x86_64"
cmake -S ../../gpt4all-backend -B "$BUILD_DIR" -DCMAKE_BUILD_TYPE=RelWithDebInfo
fi
if [ "$PLATFORM" = "aarch64" ]; then
if [ "$(uname -m)" != "aarch64" ]; then
echo "Cross-compiling for aarch64"
cmake -S ../../gpt4all-backend \
-B "$BUILD_DIR" \
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
-DCMAKE_TOOLCHAIN_FILE="./toolchains/linux-arm64-toolchain.cmake"
else
cmake -S ../../gpt4all-backend -B "$BUILD_DIR" -DCMAKE_BUILD_TYPE=RelWithDebInfo
fi
fi
cmake --build "$BUILD_DIR" --parallel && {
cp "$BUILD_DIR"/libgptj*.$LIB_EXT "$NATIVE_DIR"/
cp "$BUILD_DIR"/libllama*.$LIB_EXT "$NATIVE_DIR"/
}
}

View File

@ -1,22 +1,21 @@
const prebuildify = require("prebuildify");
async function createPrebuilds(combinations) {
for (const { platform, arch } of combinations) {
async function createPrebuilds(configs) {
for (const config of configs) {
const opts = {
platform,
arch,
napi: true,
targets: ["18.16.0"]
targets: ["18.16.0"],
...config,
};
try {
await createPrebuild(opts);
console.log(
`Build succeeded for platform ${opts.platform} and architecture ${opts.arch}`
`Build succeeded for platform ${opts.platform} and architecture ${opts.arch}`,
);
} catch (err) {
console.error(
`Error building for platform ${opts.platform} and architecture ${opts.arch}:`,
err
err,
);
}
}
@ -24,6 +23,17 @@ async function createPrebuilds(combinations) {
function createPrebuild(opts) {
return new Promise((resolve, reject) => {
// if this prebuild is cross-compiling for arm64 on a non-arm64 machine,
// set the CXX and CC environment variables to the cross-compilers
if (
opts.arch === "arm64" &&
process.arch !== "arm64" &&
process.platform === "linux"
) {
process.env.CXX = "aarch64-linux-gnu-g++-12";
process.env.CC = "aarch64-linux-gnu-gcc-12";
}
prebuildify(opts, (err) => {
if (err) {
reject(err);
@ -35,22 +45,18 @@ function createPrebuild(opts) {
}
let prebuildConfigs;
if(process.platform === 'win32') {
prebuildConfigs = [
{ platform: "win32", arch: "x64" }
];
} else if(process.platform === 'linux') {
//Unsure if darwin works, need mac tester!
prebuildConfigs = [
{ platform: "linux", arch: "x64" },
//{ platform: "linux", arch: "arm64" },
//{ platform: "linux", arch: "armv7" },
]
} else if(process.platform === 'darwin') {
if (process.platform === "win32") {
prebuildConfigs = [{ platform: "win32", arch: "x64" }];
} else if (process.platform === "linux") {
prebuildConfigs = [
{ platform: "darwin", arch: "x64" },
{ platform: "darwin", arch: "arm64" },
]
{ platform: "linux", arch: "x64" },
{ platform: "linux", arch: "arm64" },
];
} else if (process.platform === "darwin") {
prebuildConfigs = [
{ platform: "darwin", arch: "x64" },
{ platform: "darwin", arch: "arm64" },
];
}
createPrebuilds(prebuildConfigs)

View File

@ -2,7 +2,6 @@ import { loadModel, createCompletion } from "../src/gpt4all.js";
const model = await loadModel("orca-mini-3b-gguf2-q4_0.gguf", {
verbose: true,
device: "gpu",
});
const chat = await model.createChatSession();
@ -12,8 +11,6 @@ await createCompletion(
"Why are bananas rather blue than bread at night sometimes?",
{
verbose: true,
nPredict: 10,
}
);
await createCompletion(chat, "Are you sure?", {
verbose: true,
});
);

View File

@ -7,12 +7,12 @@ const modelOptions = {
verbose: true,
};
const model1 = await loadModel("orca-mini-3b-gguf2-q4_0.gguf", {
const model1 = await loadModel("Phi-3-mini-4k-instruct.Q4_0.gguf", {
...modelOptions,
device: "gpu", // only one model can be on gpu
});
const model2 = await loadModel("orca-mini-3b-gguf2-q4_0.gguf", modelOptions);
const model3 = await loadModel("orca-mini-3b-gguf2-q4_0.gguf", modelOptions);
const model2 = await loadModel("Phi-3-mini-4k-instruct.Q4_0.gguf", modelOptions);
const model3 = await loadModel("Phi-3-mini-4k-instruct.Q4_0.gguf", modelOptions);
const promptContext = {
verbose: true,
@ -27,3 +27,6 @@ const responses = await Promise.all([
createCompletion(model3, "What is 1 + 3?", promptContext),
]);
console.log(responses.map((res) => res.choices[0].message));
model1.dispose();
model2.dispose();
model3.dispose();

View File

@ -1,61 +0,0 @@
import {
LLModel,
createCompletion,
DEFAULT_DIRECTORY,
DEFAULT_LIBRARIES_DIRECTORY,
loadModel,
} from "../src/gpt4all.js";
const model = await loadModel("mistral-7b-openorca.gguf2.Q4_0.gguf", {
verbose: true,
device: "gpu",
});
const ll = model.llm;
try {
class Extended extends LLModel {}
} catch (e) {
console.log("Extending from native class gone wrong " + e);
}
console.log("state size " + ll.stateSize());
console.log("thread count " + ll.threadCount());
ll.setThreadCount(5);
console.log("thread count " + ll.threadCount());
ll.setThreadCount(4);
console.log("thread count " + ll.threadCount());
console.log("name " + ll.name());
console.log("type: " + ll.type());
console.log("Default directory for models", DEFAULT_DIRECTORY);
console.log("Default directory for libraries", DEFAULT_LIBRARIES_DIRECTORY);
console.log("Has GPU", ll.hasGpuDevice());
console.log("gpu devices", ll.listGpu());
console.log("Required Mem in bytes", ll.memoryNeeded());
// to ingest a custom system prompt without using a chat session.
await createCompletion(
model,
"<|im_start|>system\nYou are an advanced mathematician.\n<|im_end|>\n",
{
promptTemplate: "%1",
nPredict: 0,
special: true,
}
);
const completion1 = await createCompletion(model, "What is 1 + 1?", {
verbose: true,
});
console.log(`🤖 > ${completion1.choices[0].message.content}`);
//Very specific:
// tested on Ubuntu 22.0, Linux Mint, if I set nPast to 100, the app hangs.
const completion2 = await createCompletion(model, "And if we add two?", {
verbose: true,
});
console.log(`🤖 > ${completion2.choices[0].message.content}`);
//CALLING DISPOSE WILL INVALID THE NATIVE MODEL. USE THIS TO CLEANUP
model.dispose();
console.log("model disposed, exiting...");

View File

@ -1,7 +1,6 @@
import { promises as fs } from "node:fs";
import { loadModel, createCompletion } from "../src/gpt4all.js";
const model = await loadModel("Nous-Hermes-2-Mistral-7B-DPO.Q4_0.gguf", {
const model = await loadModel("Phi-3-mini-4k-instruct.Q4_0.gguf", {
verbose: true,
device: "gpu",
});
@ -12,14 +11,15 @@ const res = await createCompletion(
{
onPromptToken: (tokenId) => {
console.debug("onPromptToken", { tokenId });
// throwing an error will cancel
// errors within the callback will cancel ingestion, inference will still run
throw new Error("This is an error");
// const foo = thisMethodDoesNotExist();
// returning false will cancel as well
// return false;
},
onResponseToken: (tokenId, token) => {
console.debug("onResponseToken", { tokenId, token });
onResponseTokens: ({ tokenIds, text }) => {
// console.debug("onResponseToken", { tokenIds, text });
process.stdout.write(text);
// same applies here
},
}

View File

@ -0,0 +1,37 @@
import {
loadModel,
createCompletion,
createCompletionStream,
createCompletionGenerator,
} from "../src/gpt4all.js";
const model = await loadModel("Phi-3-mini-4k-instruct.Q4_0.gguf", {
device: "cpu",
});
const prompt = "Tell a short story but only use emojis. Three sentences max.";
const result = await createCompletion(model, prompt, {
onResponseToken: (tokens) => {
console.debug(tokens)
},
});
console.debug(result.choices[0].message);
process.stdout.write("### Stream:");
const stream = createCompletionStream(model, prompt);
stream.tokens.on("data", (data) => {
process.stdout.write(data);
});
await stream.result;
process.stdout.write("\n");
process.stdout.write("### Generator:");
const gen = createCompletionGenerator(model, prompt);
for await (const chunk of gen) {
process.stdout.write(chunk);
}
model.dispose();

View File

@ -38,8 +38,8 @@ process.stdout.write("\n");
process.stdout.write("### Callback:");
await createCompletion(model, "Why not just callbacks?", {
onResponseToken: (tokenId, token) => {
process.stdout.write(token);
onResponseTokens: ({ text }) => {
process.stdout.write(text);
},
});
process.stdout.write("\n");

View File

@ -25,7 +25,7 @@ class ChatSession {
const { messages, systemPrompt, ...sessionDefaultPromptContext } =
chatSessionOpts;
this.model = model;
this.modelName = model.llm.name();
this.modelName = model.llm.getName();
this.messages = messages ?? [];
this.systemPrompt = systemPrompt ?? model.config.systemPrompt;
this.initialized = false;

View File

@ -5,10 +5,27 @@ interface LLModelOptions {
/**
* Model architecture. This argument currently does not have any functionality and is only used as a descriptive identifier for the user.
*/
type?: string;
model_name: string;
model_path: string;
library_path?: string;
modelType?: string;
/**
* Absolute path to the model file.
*/
modelFile: string;
/**
* Path to the llmodel implementation shared objects. This can be a single path or a list of paths separated by ';' delimiter.
*/
librariesPath?: string;
/**
* A string representing the implementation to use. One of 'auto', 'cpu', 'metal', 'kompute', or 'cuda'.
*/
backend: string;
/**
* The maximum window size of this model.
*/
nCtx: number;
/**
* Number of GPU layers to use (Vulkan)
*/
nGpuLayers: number;
}
interface ModelConfig {
@ -263,10 +280,10 @@ interface LLModelInferenceResult {
interface LLModelInferenceOptions extends Partial<LLModelPromptContext> {
/** Callback for response tokens, called for each generated token.
* @param {number} tokenId The token id.
* @param {string} token The token.
* @param {Uint8Array} bytes The token bytes.
* @returns {boolean | undefined} Whether to continue generating tokens.
* */
onResponseToken?: (tokenId: number, token: string) => boolean | void;
onResponseToken?: (tokenId: number, bytes: Uint8Array) => boolean | void;
/** Callback for prompt tokens, called for each input token in the prompt.
* @param {number} tokenId The token id.
* @returns {boolean | undefined} Whether to continue ingesting the prompt.
@ -281,30 +298,42 @@ interface LLModelInferenceOptions extends Partial<LLModelPromptContext> {
declare class LLModel {
/**
* Initialize a new LLModel.
* @param {string} path Absolute path to the model file.
* @throws {Error} If the model file does not exist.
* @param {LLModelOptions} options LLModel options.
* @throws {Error} If the model can't be loaded or necessary runtimes are not found.
*/
constructor(options: LLModelOptions);
/**
* Loads the LLModel.
* @return {boolean} true if the model was loaded successfully, false otherwise.
*/
load(): boolean;
/**
* Initiate a GPU by a string identifier. See LoadModelOptions.device for more information
* @param {string} device 'amd' | 'nvidia' | 'intel' | 'gpu' | gpu name.
* @return {boolean} true if the GPU was initialized successfully, false otherwise.
*/
initGpu(device: string): boolean;
/** undefined or user supplied */
type(): string | undefined;
getType(): string | undefined;
/** The name of the model. */
name(): string;
getName(): string;
/**
* Get the size of the internal state of the model.
* NOTE: This state data is specific to the type of model you have created.
* @return the size in bytes of the internal state of the model
*/
stateSize(): number;
getStateSize(): number;
/**
* Get the number of threads used for model inference.
* The default is the number of physical cores your computer has.
* @returns The number of threads used for model inference.
*/
threadCount(): number;
getThreadCount(): number;
/**
* Set the number of threads used for model inference.
@ -375,14 +404,6 @@ declare class LLModel {
*/
getLibraryPath(): string;
/**
* Initiate a GPU by a string identifier.
* @param {number} memory_required Should be in the range size_t or will throw
* @param {string} device_name 'amd' | 'nvidia' | 'intel' | 'gpu' | gpu name.
* read LoadModelOptions.device for more information
*/
initGpuByString(memory_required: number, device_name: string): boolean;
/**
* From C documentation
* @returns True if a GPU device is successfully initialized, false otherwise.
@ -391,11 +412,10 @@ declare class LLModel {
/**
* GPUs that are usable for this LLModel
* @param {number} nCtx Maximum size of context window
* @throws if hasGpuDevice returns false (i think)
* @returns
* @throws if gpu device list is not available
* @returns an array of GpuDevice objects
*/
listGpu(nCtx: number): GpuDevice[];
getGpuDevices(): GpuDevice[];
/**
* delete and cleanup the native model
@ -414,6 +434,7 @@ interface GpuDevice {
heapSize: number;
name: string;
vendor: string;
backend: string;
}
/**
@ -443,13 +464,15 @@ interface LoadModelOptions {
/**
* The processing unit on which the model will run. It can be set to
* - "cpu": Model will run on the central processing unit.
* - "gpu": Model will run on the best available graphics processing unit, irrespective of its vendor.
* - "amd", "nvidia", "intel": Model will run on the best available GPU from the specified vendor.
* - "kompute": Model will run using the kompute (vulkan) gpu backend
* - "cuda": Model will run using the cuda gpu backend
* - "gpu": Use Metal on ARM64 macOS, otherwise the same as "kompute"
* - "amd", "nvidia": Use the best GPU provided by the Kompute backend from this vendor.
* - "gpu name": Model will run on the GPU that matches the name if it's available.
* Note: If a GPU device lacks sufficient RAM to accommodate the model, an error will be thrown, and the GPT4All
* instance will be rendered invalid. It's advised to ensure the device has enough memory before initiating the
* model.
* @default "cpu"
* @default Metal on ARM64 macOS, "cpu" otherwise.
*/
device?: string;
/**
@ -458,10 +481,16 @@ interface LoadModelOptions {
*/
nCtx?: number;
/**
* Number of gpu layers needed
* Number of GPU layers to use (Vulkan)
* @default 100
* @alias ngl
*/
nGpuLayers?: number;
ngl?: number;
/**
* Number of CPU threads used by GPT4All. If not set, the number of threads is determined automatically.
*/
nThreads?: number;
}
interface InferenceModelOptions extends LoadModelOptions {
@ -507,15 +536,33 @@ interface CompletionProvider {
): Promise<InferenceResult>;
}
interface CompletionTokens {
/** The token ids. */
tokenIds: number[];
/** The token text. May be an empty string. */
text: string;
}
/**
* Options for creating a completion.
*/
interface CompletionOptions extends LLModelInferenceOptions {
interface CompletionOptions extends Partial<LLModelPromptContext> {
/**
* Indicates if verbose logging is enabled.
* @default false
*/
verbose?: boolean;
/** Called every time new tokens can be decoded to text.
* @param {CompletionTokens} tokens The token ids and decoded text.
* @returns {boolean | undefined} Whether to continue generating tokens.
* */
onResponseTokens?: (tokens: CompletionTokens) => boolean | void;
/** Callback for prompt tokens, called for each input token in the prompt.
* @param {number} tokenId The token id.
* @returns {boolean | undefined} Whether to continue ingesting the prompt.
* */
onPromptToken?: (tokenId: number) => boolean | void;
}
/**
@ -639,13 +686,6 @@ interface LLModelPromptContext {
*/
promptTemplate?: string;
/** The context window size. Do not use, it has no effect. See loadModel options.
* THIS IS DEPRECATED!!!
* Use loadModel's nCtx option instead.
* @default 2048
*/
nCtx: number;
/** The top-k logits to sample from.
* Top-K sampling selects the next token only from the top K most likely tokens predicted by the model.
* It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit
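A minimal sketch of the renamed low-level API described by these typings, assuming `LLModel` and `DEFAULT_LIBRARIES_DIRECTORY` are still exported from `src/gpt4all.js` as in the removed llmodel example; the model path is a placeholder, and `loadModel` remains the recommended entry point:

```js
import { LLModel, DEFAULT_LIBRARIES_DIRECTORY } from "../src/gpt4all.js";

const llmodel = new LLModel({
    modelFile: "/absolute/path/to/Phi-3-mini-4k-instruct.Q4_0.gguf", // placeholder
    librariesPath: DEFAULT_LIBRARIES_DIRECTORY, // ';'-separated runtime search paths
    backend: "kompute", // 'auto' | 'cpu' | 'metal' | 'kompute' | 'cuda'
    nCtx: 2048,
    nGpuLayers: 100,
});

// GPU initialization and weight loading are now separate, explicit steps
if (llmodel.initGpu("gpu")) {
    console.log(llmodel.getGpuDevices().map((d) => `${d.name} (${d.backend})`));
}
if (!llmodel.load()) {
    throw new Error("Failed to load model weights");
}
console.log(llmodel.getName(), llmodel.getStateSize(), llmodel.getThreadCount());
llmodel.dispose();
```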

View File

@ -37,9 +37,8 @@ async function loadModel(modelName, options = {}) {
type: "inference",
allowDownload: true,
verbose: false,
device: "cpu",
nCtx: 2048,
ngl: 100,
nGpuLayers: options.ngl ?? 100,
...options,
};
@ -54,27 +53,77 @@ async function loadModel(modelName, options = {}) {
typeof loadOptions.librariesPath === "string",
"Libraries path should be a string"
);
const existingPaths = loadOptions.librariesPath
const existingLibPaths = loadOptions.librariesPath
.split(";")
.filter(existsSync)
.join(";");
const llmOptions = {
model_name: appendBinSuffixIfMissing(modelName),
model_path: loadOptions.modelPath,
library_path: existingPaths,
device: loadOptions.device,
modelFile: modelConfig.path,
librariesPath: existingLibPaths,
nCtx: loadOptions.nCtx,
ngl: loadOptions.ngl,
nGpuLayers: loadOptions.nGpuLayers,
};
let initDevice;
if (process.platform === "darwin") {
if (!loadOptions.device) {
llmOptions.backend = "auto"; // 'auto' is effectively 'metal' due to currently non-functional fallback
} else if (loadOptions.device === "cpu") {
llmOptions.backend = "cpu";
} else {
if (process.arch !== "arm64" || loadOptions.device !== "gpu") {
throw new Error(
`Unknown device for this platform: ${loadOptions.device}`
);
}
llmOptions.backend = "metal";
}
} else {
// default to kompute. use cpu for arm64 because we currently don't build kompute runtimes for arm64
llmOptions.backend = process.arch === "arm64" ? "cpu" : "kompute";
if (!loadOptions.device || loadOptions.device === "cpu") {
// use the default backend
} else if (
loadOptions.device === "cuda" ||
loadOptions.device === "kompute"
) {
llmOptions.backend = loadOptions.device;
initDevice = "gpu";
} else if (loadOptions.device.startsWith("cuda:")) {
llmOptions.backend = "cuda";
initDevice = loadOptions.device.replace(/^cuda:/, "");
} else {
initDevice = loadOptions.device.replace(/^kompute:/, "");
}
}
if (loadOptions.verbose) {
console.debug("Creating LLModel:", {
initDevice,
llmOptions,
modelConfig,
});
}
const llmodel = new LLModel(llmOptions);
if (initDevice) {
const gpuInitSuccess = llmodel.initGpu(initDevice);
if (!gpuInitSuccess) {
const availableDevices = llmodel.getGpuDevices();
const deviceNames = availableDevices
.map((device) => device.name)
.join(", ");
console.warn(
`Failed to initialize GPU device "${initDevice}" - Available devices: ${deviceNames}`
);
}
}
llmodel.load();
if (loadOptions.nThreads) {
llmodel.setThreadCount(loadOptions.nThreads);
}
if (loadOptions.type === "embedding") {
return new EmbeddingModel(llmodel, modelConfig);
} else if (loadOptions.type === "inference") {
@ -84,7 +133,7 @@ async function loadModel(modelName, options = {}) {
}
}
function createEmbedding(model, text, options={}) {
function createEmbedding(model, text, options = {}) {
let {
dimensionality = undefined,
longTextMode = "mean",
@ -138,10 +187,7 @@ async function createCompletion(
...options,
};
const result = await provider.generate(
input,
completionOptions,
);
const result = await provider.generate(input, completionOptions);
return {
model: provider.modelName,
@ -174,10 +220,10 @@ function createCompletionStream(
const completionPromise = createCompletion(provider, input, {
...options,
onResponseToken: (tokenId, token) => {
completionStream.push(token);
if (options.onResponseToken) {
return options.onResponseToken(tokenId, token);
onResponseTokens: (tokens) => {
completionStream.push(tokens.text);
if (options.onResponseTokens) {
return options.onResponseTokens(tokens);
}
},
}).then((result) => {

View File

@ -11,7 +11,7 @@ class InferenceModel {
constructor(llmodel, config) {
this.llm = llmodel;
this.config = config;
this.modelName = this.llm.name();
this.modelName = this.llm.getName();
}
async createChatSession(options) {
@ -89,6 +89,25 @@ class InferenceModel {
}
let tokensGenerated = 0;
const decoder = new TokenDecoder((tokenIds, text) => {
let continueGeneration = true;
tokensGenerated += tokenIds.length;
if (options.onResponseTokens) {
// catch here because if errors bubble through cpp they will lose stacktraces
try {
// don't cancel the generation unless user explicitly returns false
continueGeneration =
options.onResponseTokens({ tokenIds, text }) !== false;
} catch (err) {
console.error("Error in onResponseToken callback", err);
continueGeneration = false;
}
}
return continueGeneration;
});
const result = await this.llm.infer(prompt, {
...promptContext,
@ -97,7 +116,7 @@ class InferenceModel {
let continueIngestion = true;
tokensIngested++;
if (options.onPromptToken) {
// catch errors because if they go through cpp they will loose stacktraces
// catch here because if errors bubble through cpp they will lose stacktraces
try {
// don't cancel ingestion unless user explicitly returns false
continueIngestion =
@ -109,20 +128,8 @@ class InferenceModel {
}
return continueIngestion;
},
onResponseToken: (tokenId, token) => {
let continueGeneration = true;
tokensGenerated++;
if (options.onResponseToken) {
try {
// don't cancel the generation unless user explicitly returns false
continueGeneration =
options.onResponseToken(tokenId, token) !== false;
} catch (err) {
console.error("Error in onResponseToken callback", err);
continueGeneration = false;
}
}
return continueGeneration;
onResponseToken: (tokenId, bytes) => {
return decoder.decode(tokenId, bytes);
},
});
@ -141,6 +148,63 @@ class InferenceModel {
}
}
// see https://github.com/nomic-ai/gpt4all/pull/1281
class TokenDecoder {
constructor(callback) {
this.callback = callback;
this.buffer = [];
this.tokenIds = [];
this.buffExpectingContBytes = 0;
this.textDecoder = new TextDecoder();
}
decode(tokenId, bytes) {
const decoded = [];
this.tokenIds.push(tokenId);
for (let i = 0; i < bytes.length; i++) {
const byte = bytes[i];
const bits = byte.toString(2).padStart(8, '0');
const highOnes = bits.split('0')[0];
if (highOnes.length === 1) {
// Continuation byte
this.buffer.push(byte);
this.buffExpectingContBytes -= 1;
} else {
// Beginning of a byte sequence
if (this.buffer.length > 0) {
decoded.push(this._decodeBuffer());
this.buffer = [];
}
this.buffer.push(byte);
this.buffExpectingContBytes = Math.max(0, highOnes.length - 1);
}
if (this.buffExpectingContBytes <= 0) {
// Received the whole sequence or an out-of-place continuation byte
decoded.push(this._decodeBuffer());
this.buffer = [];
this.buffExpectingContBytes = 0;
}
}
if (decoded.length === 0 && this.buffExpectingContBytes > 0) {
// Wait for more continuation bytes
return true;
}
const tokenIds = this.tokenIds;
this.tokenIds = [];
return this.callback(tokenIds, decoded.join(''));
}
_decodeBuffer() {
return this.textDecoder.decode(new Uint8Array(this.buffer));
}
}
class EmbeddingModel {
llm;
config;
@ -160,6 +224,7 @@ class EmbeddingModel {
}
module.exports = {
TokenDecoder,
InferenceModel,
EmbeddingModel,
};
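A quick sketch of how the new `TokenDecoder` buffers incomplete UTF-8 sequences (this is what fixes streamed emoji); the require path and token ids are illustrative:

```js
const { TokenDecoder } = require("./src/models.js");

const decoder = new TokenDecoder((tokenIds, text) => {
    console.log(tokenIds, JSON.stringify(text));
    return true; // returning false would stop generation
});

// "🤖" is the 4-byte UTF-8 sequence f0 9f a4 96. Pretend the model emitted it
// split across two tokens: the first call only buffers and waits for more bytes.
decoder.decode(101, new Uint8Array([0xf0, 0x9f])); // -> true, nothing emitted yet
decoder.decode(102, new Uint8Array([0xa4, 0x96])); // -> callback([101, 102], "🤖")
```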

View File

@ -0,0 +1,73 @@
const { loadModel } = require("../src/gpt4all.js");
// these tests require an internet connection / a real model
const testModel = "Phi-3-mini-4k-instruct.Q4_0.gguf";
describe("llmodel", () => {
let model;
test("load on cpu", async () => {
model = await loadModel(testModel, {
device: "cpu",
});
});
test("getter working", async () => {
const stateSize = model.llm.getStateSize();
expect(stateSize).toBeGreaterThan(0);
const name = model.llm.getName();
expect(name).toBe(testModel);
const type = model.llm.getType();
expect(type).toBeUndefined();
const devices = model.llm.getGpuDevices();
expect(Array.isArray(devices)).toBe(true);
const gpuEnabled = model.llm.hasGpuDevice();
expect(gpuEnabled).toBe(false);
const requiredMem = model.llm.getRequiredMemory();
expect(typeof requiredMem).toBe('number');
const threadCount = model.llm.getThreadCount();
expect(threadCount).toBe(4);
});
test("setting thread count", () => {
model.llm.setThreadCount(5);
expect(model.llm.getThreadCount()).toBe(5);
});
test("cpu inference", async () => {
const res = await model.llm.infer("what is the capital of france?", {
temp: 0,
promptTemplate: model.config.promptTemplate,
nPredict: 10,
onResponseToken: () => {
return true;
},
});
expect(res.text).toMatch(/paris/i);
}, 10000);
test("dispose and load model on gpu", async () => {
model.dispose();
model = await loadModel(testModel, {
device: "gpu",
});
const gpuEnabled = model.llm.hasGpuDevice();
expect(gpuEnabled).toBe(true);
});
test("gpu inference", async () => {
const res = await model.llm.infer("what is the capital of france?", {
temp: 0,
promptTemplate: model.config.promptTemplate,
nPredict: 10,
onResponseToken: () => {
return true;
},
});
expect(res.text).toMatch(/paris/i);
}, 10000);
afterAll(() => {
model.dispose();
});
});

View File

@ -2,7 +2,6 @@ const path = require("node:path");
const os = require("node:os");
const fsp = require("node:fs/promises");
const { existsSync } = require('node:fs');
const { LLModel } = require("node-gyp-build")(path.resolve(__dirname, ".."));
const {
listModels,
downloadModel,
@ -13,11 +12,8 @@ const {
DEFAULT_LIBRARIES_DIRECTORY,
DEFAULT_MODEL_LIST_URL,
} = require("../src/config.js");
const {
loadModel,
createPrompt,
createCompletion,
} = require("../src/gpt4all.js");
// these tests do not require an internet connection or an actual model
describe("config", () => {
test("default paths constants are available and correct", () => {

File diff suppressed because it is too large