diff --git a/gpt4all-backend/llama.cpp-mainline b/gpt4all-backend/llama.cpp-mainline
index f67f4651..b2db03ac 160000
--- a/gpt4all-backend/llama.cpp-mainline
+++ b/gpt4all-backend/llama.cpp-mainline
@@ -1 +1 @@
-Subproject commit f67f4651fac0b2f377dc53fe853b1dafa96f9aa9
+Subproject commit b2db03acf299111885af2921a4230de07623eaf8
diff --git a/gpt4all-backend/llamamodel.cpp b/gpt4all-backend/llamamodel.cpp
index 0e10ab8a..ab560e89 100644
--- a/gpt4all-backend/llamamodel.cpp
+++ b/gpt4all-backend/llamamodel.cpp
@@ -371,6 +371,11 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
         d_ptr->model_params.main_gpu = d_ptr->device;
         d_ptr->model_params.n_gpu_layers = ngl;
         d_ptr->model_params.split_mode = LLAMA_SPLIT_MODE_NONE;
+    } else {
+#ifdef GGML_USE_CUDA
+        std::cerr << "Llama ERROR: CUDA loadModel was called without a device\n";
+        return false;
+#endif // GGML_USE_CUDA
     }
 #elif defined(GGML_USE_METAL)
     (void)ngl;
@@ -383,15 +388,17 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
     // always fully offload on Metal
     // TODO(cebtenzzre): use this parameter to allow using more than 53% of system RAM to load a model
     d_ptr->model_params.n_gpu_layers = 100;
-#else
+#else // !KOMPUTE && !VULKAN && !CUDA && !METAL
     (void)ngl;
 #endif

-    d_ptr->model = llama_load_model_from_file_gpt4all(modelPath.c_str(), &d_ptr->model_params);
+    d_ptr->model = llama_load_model_from_file(modelPath.c_str(), d_ptr->model_params);
     if (!d_ptr->model) {
         fflush(stdout);
+#ifndef GGML_USE_CUDA
         d_ptr->device = -1;
         d_ptr->deviceName.clear();
+#endif
         std::cerr << "LLAMA ERROR: failed to load model from " << modelPath << std::endl;
         return false;
     }
@@ -434,8 +441,10 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
         std::cerr << "LLAMA ERROR: failed to init context for model " << modelPath << std::endl;
         llama_free_model(d_ptr->model);
         d_ptr->model = nullptr;
+#ifndef GGML_USE_CUDA
         d_ptr->device = -1;
         d_ptr->deviceName.clear();
+#endif
         return false;
     }

@@ -723,31 +732,16 @@ bool LLamaModel::initializeGPUDevice(int device, std::string *unavail_reason) co
 #endif
 }

-bool LLamaModel::hasGPUDevice() const
-{
-#if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
-    return d_ptr->device != -1;
-#else
-    return false;
-#endif
-}
-
 bool LLamaModel::usingGPUDevice() const
 {
-    bool hasDevice;
+    if (!d_ptr->model)
+        return false;

+    bool usingGPU = llama_model_using_gpu(d_ptr->model);
 #ifdef GGML_USE_KOMPUTE
-    hasDevice = hasGPUDevice() && d_ptr->model_params.n_gpu_layers > 0;
-    assert(!hasDevice || ggml_vk_has_device());
-#elif defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
-    hasDevice = hasGPUDevice() && d_ptr->model_params.n_gpu_layers > 0;
-#elif defined(GGML_USE_METAL)
-    hasDevice = true;
-#else
-    hasDevice = false;
+    assert(!usingGPU || ggml_vk_has_device());
 #endif
-
-    return hasDevice;
+    return usingGPU;
 }

 const char *LLamaModel::backendName() const
@@ -760,6 +754,8 @@ const char *LLamaModel::gpuDeviceName() const
     if (usingGPUDevice()) {
 #if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
         return d_ptr->deviceName.c_str();
+#elif defined(GGML_USE_METAL)
+        return "Metal";
 #endif
     }
     return nullptr;
diff --git a/gpt4all-backend/llamamodel_impl.h b/gpt4all-backend/llamamodel_impl.h
index 9ab386bc..f162a94d 100644
--- a/gpt4all-backend/llamamodel_impl.h
+++ b/gpt4all-backend/llamamodel_impl.h
@@ -34,7 +34,6 @@ public:
     std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired = 0) const override;
     bool initializeGPUDevice(size_t memoryRequired, const std::string &name) const override;
     bool initializeGPUDevice(int device, std::string *unavail_reason = nullptr) const override;
-    bool hasGPUDevice() const override;
     bool usingGPUDevice() const override;
     const char *backendName() const override;
     const char *gpuDeviceName() const override;
diff --git a/gpt4all-backend/llmodel.h b/gpt4all-backend/llmodel.h
index de5d4b5c..5a26f4a4 100644
--- a/gpt4all-backend/llmodel.h
+++ b/gpt4all-backend/llmodel.h
@@ -2,6 +2,7 @@
 #define LLMODEL_H

 #include <algorithm>
+#include <cassert>
 #include <cstddef>
 #include <cstdint>
 #include <functional>
@@ -57,23 +58,30 @@
             backend(backend), index(index), type(type), heapSize(heapSize), name(std::move(name)), vendor(std::move(vendor)) {}

-        std::string selectionName() const { return m_backendNames.at(backend) + ": " + name; }
-        std::string reportedName() const { return name + " (" + m_backendNames.at(backend) + ")"; }
+        std::string selectionName() const
+        {
+            assert(backend == "cuda"s || backend == "kompute"s);
+            return backendName() + ": " + name;
+        }
+
+        std::string backendName() const { return backendIdToName(backend); }
+
+        static std::string backendIdToName(const std::string &backend) { return s_backendNames.at(backend); }

         static std::string updateSelectionName(const std::string &name) {
             if (name == "Auto" || name == "CPU" || name == "Metal")
                 return name;
-            auto it = std::find_if(m_backendNames.begin(), m_backendNames.end(), [&name](const auto &entry) {
+            auto it = std::find_if(s_backendNames.begin(), s_backendNames.end(), [&name](const auto &entry) {
                 return name.starts_with(entry.second + ": ");
             });
-            if (it != m_backendNames.end())
+            if (it != s_backendNames.end())
                 return name;
             return "Vulkan: " + name; // previously, there were only Vulkan devices
         }

     private:
-        static inline const std::unordered_map<std::string, std::string> m_backendNames {
-            {"cuda", "CUDA"}, {"kompute", "Vulkan"},
+        static inline const std::unordered_map<std::string, std::string> s_backendNames {
+            {"cpu", "CPU"}, {"metal", "Metal"}, {"cuda", "CUDA"}, {"kompute", "Vulkan"},
         };
     };
@@ -196,7 +204,6 @@ public:
         return false;
     }

-    virtual bool hasGPUDevice() const { return false; }
     virtual bool usingGPUDevice() const { return false; }
     virtual const char *backendName() const { return "cpu"; }
     virtual const char *gpuDeviceName() const { return nullptr; }
diff --git a/gpt4all-backend/llmodel_c.cpp b/gpt4all-backend/llmodel_c.cpp
index 62663d7f..b585e4bf 100644
--- a/gpt4all-backend/llmodel_c.cpp
+++ b/gpt4all-backend/llmodel_c.cpp
@@ -287,12 +287,6 @@ bool llmodel_gpu_init_gpu_device_by_int(llmodel_model model, int device)
     return wrapper->llModel->initializeGPUDevice(device);
 }

-bool llmodel_has_gpu_device(llmodel_model model)
-{
-    const auto *wrapper = static_cast<LLModelWrapper *>(model);
-    return wrapper->llModel->hasGPUDevice();
-}
-
 const char *llmodel_model_backend_name(llmodel_model model)
 {
     const auto *wrapper = static_cast<LLModelWrapper *>(model);
diff --git a/gpt4all-backend/llmodel_c.h b/gpt4all-backend/llmodel_c.h
index 3955260a..ced23a96 100644
--- a/gpt4all-backend/llmodel_c.h
+++ b/gpt4all-backend/llmodel_c.h
@@ -291,11 +291,6 @@ bool llmodel_gpu_init_gpu_device_by_struct(llmodel_model model, const llmodel_gp
  */
 bool llmodel_gpu_init_gpu_device_by_int(llmodel_model model, int device);

-/**
- * @return True if a GPU device is successfully initialized, false otherwise.
- */
-bool llmodel_has_gpu_device(llmodel_model model);
-
 /**
  * @return The name of the llama.cpp backend currently in use. One of "cpu", "kompute", or "metal".
  */
diff --git a/gpt4all-bindings/python/gpt4all/_pyllmodel.py b/gpt4all-bindings/python/gpt4all/_pyllmodel.py
index fa879cde..892d72e7 100644
--- a/gpt4all-bindings/python/gpt4all/_pyllmodel.py
+++ b/gpt4all-bindings/python/gpt4all/_pyllmodel.py
@@ -177,9 +177,6 @@ llmodel.llmodel_gpu_init_gpu_device_by_struct.restype = ctypes.c_bool
 llmodel.llmodel_gpu_init_gpu_device_by_int.argtypes = [ctypes.c_void_p, ctypes.c_int32]
 llmodel.llmodel_gpu_init_gpu_device_by_int.restype = ctypes.c_bool

-llmodel.llmodel_has_gpu_device.argtypes = [ctypes.c_void_p]
-llmodel.llmodel_has_gpu_device.restype = ctypes.c_bool
-
 llmodel.llmodel_model_backend_name.argtypes = [ctypes.c_void_p]
 llmodel.llmodel_model_backend_name.restype = ctypes.c_char_p
diff --git a/gpt4all-chat/chat.cpp b/gpt4all-chat/chat.cpp
index b5564657..5874abe3 100644
--- a/gpt4all-chat/chat.cpp
+++ b/gpt4all-chat/chat.cpp
@@ -64,8 +64,7 @@ void Chat::connectLLM()
     connect(m_llmodel, &ChatLLM::recalcChanged, this, &Chat::handleRecalculating, Qt::QueuedConnection);
     connect(m_llmodel, &ChatLLM::generatedNameChanged, this, &Chat::generatedNameChanged, Qt::QueuedConnection);
     connect(m_llmodel, &ChatLLM::reportSpeed, this, &Chat::handleTokenSpeedChanged, Qt::QueuedConnection);
-    connect(m_llmodel, &ChatLLM::reportDevice, this, &Chat::handleDeviceChanged, Qt::QueuedConnection);
-    connect(m_llmodel, &ChatLLM::reportFallbackReason, this, &Chat::handleFallbackReasonChanged, Qt::QueuedConnection);
+    connect(m_llmodel, &ChatLLM::loadedModelInfoChanged, this, &Chat::loadedModelInfoChanged, Qt::QueuedConnection);
     connect(m_llmodel, &ChatLLM::databaseResultsChanged, this, &Chat::handleDatabaseResultsChanged, Qt::QueuedConnection);
     connect(m_llmodel, &ChatLLM::modelInfoChanged, this, &Chat::handleModelInfoChanged, Qt::QueuedConnection);
     connect(m_llmodel, &ChatLLM::trySwitchContextOfLoadedModelCompleted, this, &Chat::handleTrySwitchContextOfLoadedModelCompleted, Qt::QueuedConnection);
@@ -327,16 +326,19 @@ void Chat::handleTokenSpeedChanged(const QString &tokenSpeed)
     emit tokenSpeedChanged();
 }

-void Chat::handleDeviceChanged(const QString &device)
+QString Chat::deviceBackend() const
 {
-    m_device = device;
-    emit deviceChanged();
+    return m_llmodel->deviceBackend();
 }

-void Chat::handleFallbackReasonChanged(const QString &fallbackReason)
+QString Chat::device() const
 {
-    m_fallbackReason = fallbackReason;
-    emit fallbackReasonChanged();
+    return m_llmodel->device();
+}
+
+QString Chat::fallbackReason() const
+{
+    return m_llmodel->fallbackReason();
 }

 void Chat::handleDatabaseResultsChanged(const QList<ResultInfo> &results)
diff --git a/gpt4all-chat/chat.h b/gpt4all-chat/chat.h
index 9da04459..019caf89 100644
--- a/gpt4all-chat/chat.h
+++ b/gpt4all-chat/chat.h
@@ -33,8 +33,9 @@ class Chat : public QObject
     Q_PROPERTY(QList<QString> collectionList READ collectionList NOTIFY collectionListChanged)
     Q_PROPERTY(QString modelLoadingError READ modelLoadingError NOTIFY modelLoadingErrorChanged)
     Q_PROPERTY(QString tokenSpeed READ tokenSpeed NOTIFY tokenSpeedChanged);
-    Q_PROPERTY(QString device READ device NOTIFY deviceChanged);
-    Q_PROPERTY(QString fallbackReason READ fallbackReason NOTIFY fallbackReasonChanged);
+    Q_PROPERTY(QString deviceBackend READ deviceBackend NOTIFY loadedModelInfoChanged)
+    Q_PROPERTY(QString device READ device NOTIFY loadedModelInfoChanged)
+    Q_PROPERTY(QString fallbackReason READ fallbackReason NOTIFY loadedModelInfoChanged)
     Q_PROPERTY(LocalDocsCollectionsModel *collectionModel READ collectionModel NOTIFY collectionModelChanged)
     // 0=no, 1=waiting, 2=working
     Q_PROPERTY(int trySwitchContextInProgress READ trySwitchContextInProgress NOTIFY trySwitchContextInProgressChanged)
@@ -111,8 +112,10 @@ public:

     QString modelLoadingError() const { return m_modelLoadingError; }
     QString tokenSpeed() const { return m_tokenSpeed; }
-    QString device() const { return m_device; }
-    QString fallbackReason() const { return m_fallbackReason; }
+    QString deviceBackend() const;
+    QString device() const;
+    // not loaded -> QString(), no fallback -> QString("")
+    QString fallbackReason() const;

     int trySwitchContextInProgress() const { return m_trySwitchContextInProgress; }
@@ -149,6 +152,7 @@ Q_SIGNALS:
     void fallbackReasonChanged();
     void collectionModelChanged();
     void trySwitchContextInProgressChanged();
+    void loadedModelInfoChanged();

 private Q_SLOTS:
     void handleResponseChanged(const QString &response);
@@ -159,8 +163,6 @@
     void handleRecalculating();
     void handleModelLoadingError(const QString &error);
     void handleTokenSpeedChanged(const QString &tokenSpeed);
-    void handleDeviceChanged(const QString &device);
-    void handleFallbackReasonChanged(const QString &device);
     void handleDatabaseResultsChanged(const QList<ResultInfo> &results);
     void handleModelInfoChanged(const ModelInfo &modelInfo);
     void handleTrySwitchContextOfLoadedModelCompleted(int value);
diff --git a/gpt4all-chat/chatllm.cpp b/gpt4all-chat/chatllm.cpp
index 581eaab5..96239fdb 100644
--- a/gpt4all-chat/chatllm.cpp
+++ b/gpt4all-chat/chatllm.cpp
@@ -93,6 +93,12 @@ void LLModelStore::destroy()
     m_availableModel.reset();
 }

+void LLModelInfo::resetModel(ChatLLM *cllm, LLModel *model) {
+    this->model.reset(model);
+    fallbackReason.reset();
+    emit cllm->loadedModelInfoChanged();
+}
+
 ChatLLM::ChatLLM(Chat *parent, bool isServer)
     : QObject{nullptr}
     , m_promptResponseTokens(0)
@@ -141,7 +147,7 @@ void ChatLLM::destroy()
     // The only time we should have a model loaded here is on shutdown
     // as we explicitly unload the model in all other circumstances
     if (isModelLoaded()) {
-        m_llModelInfo.model.reset();
+        m_llModelInfo.resetModel(this);
     }
 }
@@ -208,7 +214,7 @@ void ChatLLM::trySwitchContextOfLoadedModel(const ModelInfo &modelInfo)
     QString filePath = modelInfo.dirpath + modelInfo.filename();
     QFileInfo fileInfo(filePath);

-    m_llModelInfo = LLModelStore::globalInstance()->acquireModel();
+    acquireModel();
 #if defined(DEBUG_MODEL_LOADING)
     qDebug() << "acquired model from store" << m_llmThread.objectName() << m_llModelInfo.model.get();
 #endif
@@ -251,8 +257,6 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
     // reset status
     emit modelLoadingPercentageChanged(std::numeric_limits<float>::min()); // small non-zero positive value
     emit modelLoadingError("");
-    emit reportFallbackReason("");
-    emit reportDevice("");
     m_pristineLoadedState = false;

     QString filePath = modelInfo.dirpath + modelInfo.filename();
@@ -265,12 +269,12 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
 #if defined(DEBUG_MODEL_LOADING)
         qDebug() << "already acquired model deleted" << m_llmThread.objectName() << m_llModelInfo.model.get();
 #endif
-        m_llModelInfo.model.reset();
+        m_llModelInfo.resetModel(this);
     } else if (!m_isServer) {
         // This is a blocking call that tries to retrieve the model we need from the model store.
         // If it succeeds, then we just have to restore state. If the store has never had a model
         // returned to it, then the modelInfo.model pointer should be null which will happen on startup
-        m_llModelInfo = LLModelStore::globalInstance()->acquireModel();
+        acquireModel();
 #if defined(DEBUG_MODEL_LOADING)
         qDebug() << "acquired model from store" << m_llmThread.objectName() << m_llModelInfo.model.get();
 #endif
@@ -305,7 +309,7 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
 #if defined(DEBUG_MODEL_LOADING)
             qDebug() << "deleting model" << m_llmThread.objectName() << m_llModelInfo.model.get();
 #endif
-            m_llModelInfo.model.reset();
+            m_llModelInfo.resetModel(this);
         }
     }

@@ -335,7 +339,7 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
         model->setModelName(modelName);
         model->setRequestURL(modelInfo.url());
         model->setAPIKey(apiKey);
-        m_llModelInfo.model.reset(model);
+        m_llModelInfo.resetModel(this, model);
     } else {
         QElapsedTimer modelLoadTimer;
         modelLoadTimer.start();
@@ -360,10 +364,10 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
 #endif

         QString constructError;
-        m_llModelInfo.model.reset();
+        m_llModelInfo.resetModel(this);
         try {
             auto *model = LLModel::Implementation::construct(filePath.toStdString(), backend, n_ctx);
-            m_llModelInfo.model.reset(model);
+            m_llModelInfo.resetModel(this, model);
         } catch (const LLModel::MissingImplementationError &e) {
             modelLoadProps.insert("error", "missing_model_impl");
             constructError = e.what();
@@ -412,14 +416,15 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
                 memGB = std::floor(memGB * 10.f) / 10.f; // truncate to 1 decimal place
                 modelLoadProps.insert("default_device", QString::fromStdString(defaultDevice->name));
                 modelLoadProps.insert("default_device_mem", approxDeviceMemGB(defaultDevice));
+                modelLoadProps.insert("default_device_backend", QString::fromStdString(defaultDevice->backendName()));
             }
         }

-        QString actualDevice("CPU");
+        bool actualDeviceIsCPU = true;

 #if defined(Q_OS_MAC) && defined(__aarch64__)
         if (m_llModelInfo.model->implementation().buildVariant() == "metal")
-            actualDevice = "Metal";
+            actualDeviceIsCPU = false;
 #else
         if (requestedDevice != "CPU") {
             const auto *device = defaultDevice;
@@ -437,41 +442,39 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
             if (!device) {
                 // GPU not available
             } else if (!m_llModelInfo.model->initializeGPUDevice(device->index, &unavail_reason)) {
-                emit reportFallbackReason(QString::fromStdString("<br>" + unavail_reason));
+                m_llModelInfo.fallbackReason = QString::fromStdString(unavail_reason);
             } else {
-                actualDevice = QString::fromStdString(device->reportedName());
+                actualDeviceIsCPU = false;
                 modelLoadProps.insert("requested_device_mem", approxDeviceMemGB(device));
             }
         }
 #endif

         // Report which device we're actually using
-        emit reportDevice(actualDevice);

         bool success = m_llModelInfo.model->loadModel(filePath.toStdString(), n_ctx, ngl);
         if (!m_shouldBeLoaded) {
-            m_llModelInfo.model.reset();
+            m_llModelInfo.resetModel(this);
             if (!m_isServer)
                 LLModelStore::globalInstance()->releaseModel(std::move(m_llModelInfo));
-            m_llModelInfo = LLModelInfo();
+            resetModel();
             emit modelLoadingPercentageChanged(0.0f);
             return false;
         }

-        if (actualDevice == "CPU") {
+        if (actualDeviceIsCPU) {
             // we asked llama.cpp to use the CPU
         } else if (!success) {
             // llama_init_from_file returned nullptr
-            emit reportDevice("CPU");
-            emit reportFallbackReason("<br>GPU loading failed (out of VRAM?)");
+            m_llModelInfo.fallbackReason = "GPU loading failed (out of VRAM?)";
             modelLoadProps.insert("cpu_fallback_reason", "gpu_load_failed");
             success = m_llModelInfo.model->loadModel(filePath.toStdString(), n_ctx, 0);
             if (!m_shouldBeLoaded) {
-                m_llModelInfo.model.reset();
+                m_llModelInfo.resetModel(this);
                 if (!m_isServer)
                     LLModelStore::globalInstance()->releaseModel(std::move(m_llModelInfo));
-                m_llModelInfo = LLModelInfo();
+                resetModel();
                 emit modelLoadingPercentageChanged(0.0f);
                 return false;
             }
@@ -479,16 +482,15 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
             // ggml_vk_init was not called in llama.cpp
             // We might have had to fallback to CPU after load if the model is not possible to accelerate
             // for instance if the quantization method is not supported on Vulkan yet
-            emit reportDevice("CPU");
-            emit reportFallbackReason("<br>model or quant has no GPU support");
+            m_llModelInfo.fallbackReason = "model or quant has no GPU support";
             modelLoadProps.insert("cpu_fallback_reason", "gpu_unsupported_model");
         }

         if (!success) {
-            m_llModelInfo.model.reset();
+            m_llModelInfo.resetModel(this);
             if (!m_isServer)
                 LLModelStore::globalInstance()->releaseModel(std::move(m_llModelInfo));
-            m_llModelInfo = LLModelInfo();
+            resetModel();
             emit modelLoadingError(u"Could not load model due to invalid model file for %1"_s.arg(modelInfo.filename()));
             modelLoadProps.insert("error", "loadmodel_failed");
         } else {
@@ -497,10 +499,10 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
             case 'G': m_llModelType = LLModelType::GPTJ_; break;
             default:
                 {
-                    m_llModelInfo.model.reset();
+                    m_llModelInfo.resetModel(this);
                     if (!m_isServer)
                         LLModelStore::globalInstance()->releaseModel(std::move(m_llModelInfo));
-                    m_llModelInfo = LLModelInfo();
+                    resetModel();
                     emit modelLoadingError(u"Could not determine model type for %1"_s.arg(modelInfo.filename()));
                 }
             }
@@ -510,7 +512,7 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
         } else {
             if (!m_isServer)
                 LLModelStore::globalInstance()->releaseModel(std::move(m_llModelInfo));
-            m_llModelInfo = LLModelInfo();
+            resetModel();
             emit modelLoadingError(u"Error loading %1: %2"_s.arg(modelInfo.filename(), constructError));
         }
     }
@@ -523,6 +525,7 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
         fflush(stdout);
 #endif
         emit modelLoadingPercentageChanged(isModelLoaded() ? 1.0f : 0.0f);
+        emit loadedModelInfoChanged();

         modelLoadProps.insert("requestedDevice", MySettings::globalInstance()->device());
         modelLoadProps.insert("model", modelInfo.filename());
@@ -530,7 +533,7 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
     } else {
         if (!m_isServer)
             LLModelStore::globalInstance()->releaseModel(std::move(m_llModelInfo)); // release back into the store
-        m_llModelInfo = LLModelInfo();
+        resetModel();
         emit modelLoadingError(u"Could not find file for model %1"_s.arg(modelInfo.filename()));
     }

@@ -621,6 +624,16 @@ void ChatLLM::setModelInfo(const ModelInfo &modelInfo)
     emit modelInfoChanged(modelInfo);
 }

+void ChatLLM::acquireModel() {
+    m_llModelInfo = LLModelStore::globalInstance()->acquireModel();
+    emit loadedModelInfoChanged();
+}
+
+void ChatLLM::resetModel() {
+    m_llModelInfo = {};
+    emit loadedModelInfoChanged();
+}
+
 void ChatLLM::modelChangeRequested(const ModelInfo &modelInfo)
 {
     m_shouldBeLoaded = true;
@@ -809,7 +822,7 @@ void ChatLLM::unloadModel()
 #endif

     if (m_forceUnloadModel) {
-        m_llModelInfo.model.reset();
+        m_llModelInfo.resetModel(this);
         m_forceUnloadModel = false;
     }

diff --git a/gpt4all-chat/chatllm.h b/gpt4all-chat/chatllm.h
index fde97f1c..001eae6f 100644
--- a/gpt4all-chat/chatllm.h
+++ b/gpt4all-chat/chatllm.h
@@ -20,6 +20,7 @@
 #include <atomic>
 #include <cstdint>
 #include <memory>
+#include <optional>
 #include <string>

 using namespace Qt::Literals::StringLiterals;
@@ -32,11 +33,17 @@ enum LLModelType {
     API_,
 };

+class ChatLLM;
+
 struct LLModelInfo {
     std::unique_ptr<LLModel> model;
     QFileInfo fileInfo;
+    std::optional<QString> fallbackReason;
+
     // NOTE: This does not store the model type or name on purpose as this is left for ChatLLM which
     // must be able to serialize the information even if it is in the unloaded state
+
+    void resetModel(ChatLLM *cllm, LLModel *model = nullptr);
 };

 class TokenTimer : public QObject {
@@ -84,6 +91,9 @@
 class ChatLLM : public QObject {
     Q_OBJECT
     Q_PROPERTY(bool isRecalc READ isRecalc NOTIFY recalcChanged)
+    Q_PROPERTY(QString deviceBackend READ deviceBackend NOTIFY loadedModelInfoChanged)
+    Q_PROPERTY(QString device READ device NOTIFY loadedModelInfoChanged)
+    Q_PROPERTY(QString fallbackReason READ fallbackReason NOTIFY loadedModelInfoChanged)
 public:
     ChatLLM(Chat *parent, bool isServer = false);
     virtual ~ChatLLM();
@@ -110,6 +120,30 @@ public:

     bool isRecalc() const { return m_isRecalc; }

+    void acquireModel();
+    void resetModel();
+
+    QString deviceBackend() const
+    {
+        if (!isModelLoaded()) return QString();
+        std::string name = LLModel::GPUDevice::backendIdToName(m_llModelInfo.model->backendName());
+        return QString::fromStdString(name);
+    }
+
+    QString device() const
+    {
+        if (!isModelLoaded()) return QString();
+        const char *name = m_llModelInfo.model->gpuDeviceName();
+        return name ? QString(name) : u"CPU"_s;
+    }
+
+    // not loaded -> QString(), no fallback -> QString("")
+    QString fallbackReason() const
+    {
+        if (!isModelLoaded()) return QString();
+        return m_llModelInfo.fallbackReason.value_or(u""_s);
+    }
+
     QString generatedName() const { return QString::fromStdString(m_nameResponse); }

     bool serialize(QDataStream &stream, int version, bool serializeKV);
@@ -135,6 +169,7 @@ public Q_SLOTS:

 Q_SIGNALS:
     void recalcChanged();
+    void loadedModelInfoChanged();
     void modelLoadingPercentageChanged(float);
     void modelLoadingError(const QString &error);
     void modelLoadingWarning(const QString &warning);
diff --git a/gpt4all-chat/network.cpp b/gpt4all-chat/network.cpp
index 9b99683a..b30b4fdf 100644
--- a/gpt4all-chat/network.cpp
+++ b/gpt4all-chat/network.cpp
@@ -298,6 +298,7 @@ void Network::trackChatEvent(const QString &ev, QVariantMap props)
     const auto &curChat = ChatListModel::globalInstance()->currentChat();
     if (!props.contains("model"))
         props.insert("model", curChat->modelInfo().filename());
+    props.insert("device_backend", curChat->deviceBackend());
     props.insert("actualDevice", curChat->device());
     props.insert("doc_collections_enabled", curChat->collectionList().count());
     props.insert("doc_collections_total", LocalDocs::globalInstance()->localDocsModel()->rowCount());
diff --git a/gpt4all-chat/qml/ChatView.qml b/gpt4all-chat/qml/ChatView.qml
index 93cfe936..c7f13190 100644
--- a/gpt4all-chat/qml/ChatView.qml
+++ b/gpt4all-chat/qml/ChatView.qml
@@ -1294,7 +1294,21 @@ Rectangle {
                     visible: currentChat.tokenSpeed !== ""
                     elide: Text.ElideRight
                     wrapMode: Text.WordWrap
-                    text: currentChat.tokenSpeed + " \u00B7 " + currentChat.device + currentChat.fallbackReason
+                    text: {
+                        const segments = [currentChat.tokenSpeed];
+                        const device = currentChat.device;
+                        const backend = currentChat.deviceBackend;
+                        if (device !== null) { // device is null if we have no model loaded
+                            var deviceSegment = device;
+                            if (backend === "CUDA" || backend === "Vulkan")
+                                deviceSegment += ` (${backend})`;
+                            segments.push(deviceSegment);
+                        }
+                        const fallbackReason = currentChat.fallbackReason;
+                        if (fallbackReason !== null && fallbackReason !== "")
+                            segments.push(fallbackReason);
+                        return segments.join(" \u00B7 ");
+                    }
                     font.pixelSize: theme.fontSizeSmaller
                     font.bold: true
                 }