Only show GPU when we're actually using it.

Author: Adam Treat
Date:   2023-09-14 09:59:19 -04:00
Parent: 1fa67a585c
Commit: 3076e0bf26

6 changed files with 29 additions and 3 deletions


@@ -337,6 +337,16 @@ bool LLamaModel::hasGPUDevice()
 #endif
 }
 
+bool LLamaModel::usingGPUDevice()
+{
+#if defined(GGML_USE_KOMPUTE)
+    return ggml_vk_using_vulkan();
+#elif defined(GGML_USE_METAL)
+    return true;
+#endif
+    return false;
+}
+
 #if defined(_WIN32)
 #define DLL_EXPORT __declspec(dllexport)
 #else
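The Kompute path above cannot decide GPU use at compile time: a model can load successfully and still end up running on the CPU, so ggml_vk_using_vulkan() is consulted at runtime, while Metal is unconditional once compiled in. Below is a minimal, self-contained sketch of that compile-time/runtime split; the macros USE_KOMPUTE and USE_METAL and the probe gpu_runtime_active() are hypothetical stand-ins, not the real ggml symbols.

#include <iostream>

#if defined(USE_KOMPUTE)
// Hypothetical stand-in for ggml_vk_using_vulkan(): only the backend knows,
// at runtime, whether Vulkan is actually driving inference.
static bool gpu_runtime_active()
{
    return false; // pretend the backend fell back to CPU after load
}
#endif

bool usingGPUDevice()
{
#if defined(USE_KOMPUTE)
    return gpu_runtime_active(); // compiled for Vulkan, but verify it is really active
#elif defined(USE_METAL)
    return true;                 // Metal is all-or-nothing: compiled in means in use
#else
    return false;                // CPU-only build
#endif
}

int main()
{
    std::cout << (usingGPUDevice() ? "GPU" : "CPU") << '\n';
}

Compile with -DUSE_KOMPUTE or -DUSE_METAL to exercise each branch; with neither flag it reports CPU.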


@@ -30,6 +30,7 @@ public:
     bool initializeGPUDevice(const GPUDevice &device) override;
     bool initializeGPUDevice(int device) override;
     bool hasGPUDevice() override;
+    bool usingGPUDevice() override;
 
 private:
     LLamaPrivate *d_ptr;


@@ -100,6 +100,7 @@ public:
     virtual bool initializeGPUDevice(const GPUDevice &/*device*/) { return false; }
     virtual bool initializeGPUDevice(int /*device*/) { return false; }
     virtual bool hasGPUDevice() { return false; }
+    virtual bool usingGPUDevice() { return false; }
 
 protected:
     // These are pure virtual because subclasses need to implement as the default implementation of
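The base-class default above means CPU-only backends need no changes at all: they inherit the false answer, and only accelerated backends override the query. A short sketch of that pattern, with hypothetical class names:

#include <iostream>
#include <memory>
#include <vector>

// Base interface: the conservative default is "no GPU in use".
struct Backend {
    virtual ~Backend() = default;
    virtual bool usingGPUDevice() { return false; }
};

// A CPU-only backend needs no override at all.
struct CpuBackend : Backend {};

// An accelerated backend reports its actual runtime state.
struct VulkanBackend : Backend {
    bool active = true; // would be flipped off on a CPU fallback
    bool usingGPUDevice() override { return active; }
};

int main()
{
    std::vector<std::unique_ptr<Backend>> backends;
    backends.push_back(std::make_unique<CpuBackend>());
    backends.push_back(std::make_unique<VulkanBackend>());
    for (const auto &b : backends)
        std::cout << (b->usingGPUDevice() ? "GPU" : "CPU") << '\n';
}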


@@ -975,6 +975,14 @@ const std::vector<LLModel::Token> &Replit::endTokens() const
     return fres;
 }
 
+bool Replit::usingGPUDevice()
+{
+#if defined(GGML_USE_METAL)
+    return true;
+#endif
+    return false;
+}
+
 #if defined(_WIN32)
 #define DLL_EXPORT __declspec(dllexport)
 #else


@@ -27,6 +27,7 @@ public:
     size_t restoreState(const uint8_t *src) override;
     void setThreadCount(int32_t n_threads) override;
     int32_t threadCount() const override;
+    bool usingGPUDevice() override;
 
 private:
     ReplitPrivate *d_ptr;


@@ -302,6 +302,11 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
         m_llModelInfo = LLModelInfo();
         emit modelLoadingError(QString("Could not load model due to invalid model file for %1").arg(modelInfo.filename()));
     } else {
+        // We might have had to fallback to CPU after load if the model is not possible to accelerate
+        // for instance if the quantization method is not supported on Vulkan yet
+        if (actualDevice != "CPU" && !m_llModelInfo.model->usingGPUDevice())
+            emit reportDevice("CPU");
+
         switch (m_llModelInfo.model->implementation().modelType()[0]) {
         case 'L': m_llModelType = LLModelType::LLAMA_; break;
         case 'G': m_llModelType = LLModelType::GPTJ_; break;
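The chat-layer check above corrects an optimistic earlier report: the UI was told the requested device, but the backend may have silently fallen back to CPU during load. A hedged sketch of that decision without Qt; Model, actualDevice, and reportDevice here are stand-ins for the names used in the diff:

#include <iostream>
#include <string>

struct Model {
    // Stand-in for LLModel::usingGPUDevice(); false models a silent CPU fallback,
    // e.g. a quantization format the Vulkan backend cannot accelerate yet.
    bool usingGPUDevice() const { return false; }
};

// Stand-in for the reportDevice signal: tells the UI which device is in use.
static void reportDevice(const std::string &device)
{
    std::cout << "device in use: " << device << '\n';
}

int main()
{
    Model model;
    const std::string actualDevice = "Vulkan GPU"; // the device we asked to load on
    // Loading "succeeded", but only claim the GPU if inference really runs there.
    if (actualDevice != "CPU" && !model.usingGPUDevice())
        reportDevice("CPU"); // downgrade the earlier GPU report
}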