gpt4all/gpt4all-chat/chatllm.h

#ifndef CHATLLM_H
#define CHATLLM_H

#include <QByteArray>
#include <QElapsedTimer>
#include <QFileInfo>
#include <QList>
#include <QObject>
#include <QString>
#include <QThread>

#include <atomic>
#include <cstdint>
#include <string>

#include "localdocs.h"
#include "../gpt4all-backend/llmodel.h"

// Identifies which kind of model a chat is using.
// The numeric values are spelled out so that they stay fixed if enumerators are ever
// reordered; new kinds should be appended with the next free value.
enum LLModelType {
    MPT_     = 0,
    GPTJ_    = 1,
    LLAMA_   = 2,
    CHATGPT_ = 3,
    REPLIT_  = 4
};

// Bundles a model instance with the file it belongs to.
struct LLModelInfo {
    // Null by default, i.e. no model instance attached.
    LLModel *model{nullptr};
    // File information associated with the model.
    QFileInfo fileInfo;
    // NOTE: The model type and name are deliberately NOT kept here. ChatLLM holds them itself
    // because it must be able to serialize that information even while the model is unloaded.
};

// Counts tokens and reports throughput as a "<speed> tokens/sec" string via report().
//
// Usage as exposed by the public API: start() resets the counters, inc() is called once per
// token, and stop() forces a final report.
class TokenTimer : public QObject {
    Q_OBJECT
public:
    // Both counters start at zero so that inc()/stop() read defined values even when they are
    // called before the first start().
    explicit TokenTimer(QObject *parent)
        : QObject(parent)
        , m_elapsed(0)
        , m_tokens(0) {}

    // Returns the mean after folding the nth sample (newNumber) into oldAvg, the mean of the
    // previous n - 1 samples, rounded to the nearest integer.
    // For n < 1 there is no previous average to weight, so newNumber is returned unchanged
    // instead of dividing by zero.
    static int rollingAverage(int oldAvg, int newNumber, int n)
    {
        if (n < 1)
            return newNumber;
        // i.e. to calculate the new average after the nth number,
        // you multiply the old average by n - 1, add the new number, and divide the total by n.
        return qRound(((float(oldAvg) * (n - 1)) + newNumber) / float(n));
    }

    // Clears the token count and the accumulated time; the clock is (re)started by the next inc().
    void start() { m_tokens = 0; m_elapsed = 0; m_time.invalidate(); }

    // Emits a report for everything counted since start().
    void stop() { handleTimeout(); }

    // Records one token. The first call after start() starts the clock; a report is emitted
    // whenever more than 999 ms have passed since the clock was last (re)started.
    void inc() {
        if (!m_time.isValid())
            m_time.start();
        ++m_tokens;
        if (m_time.elapsed() > 999)
            handleTimeout();
    }

Q_SIGNALS:
    // Carries the formatted speed text, e.g. "12 tokens/sec".
    void report(const QString &speed);

private Q_SLOTS:
    // Adds the time since the last restart to the running total and emits report().
    void handleTimeout()
    {
        // QElapsedTimer::restart() is undefined on an invalid timer, which is the state after
        // start() until the first inc(); in that case no time has passed and nothing is added.
        if (m_time.isValid())
            m_elapsed += m_time.restart();

        // With no measurable elapsed time the rate cannot be computed; report 0 rather than
        // dividing by zero and showing "inf"/"nan" to the user.
        const float seconds = float(m_elapsed / 1000.0f);
        const float speed = seconds > 0.0f ? m_tokens / seconds : 0.0f;
        emit report(QString("%1 tokens/sec").arg(speed, 0, 'g', 2));
    }

private:
    QElapsedTimer m_time; // invalid between start() and the first inc()
    qint64 m_elapsed;     // accumulated milliseconds since start()
    quint32 m_tokens;     // tokens counted since start()
};

class Chat;
// QObject that pairs a Chat (m_chat) with a model handle (m_modelInfo) and the prompt/response
// state that goes with it.
//
// Only declarations are visible in this header. The notes below are limited to what the inline
// bodies and member types show; everything else is defined out of line.
class ChatLLM : public QObject
{
    Q_OBJECT
    // All properties are read-only except modelName, which is writable through setModelName().
    Q_PROPERTY(bool isModelLoaded READ isModelLoaded NOTIFY isModelLoadedChanged)
    Q_PROPERTY(QString response READ response NOTIFY responseChanged)
    Q_PROPERTY(QString modelName READ modelName WRITE setModelName NOTIFY modelNameChanged)
    Q_PROPERTY(bool isRecalc READ isRecalc NOTIFY recalcChanged)
    Q_PROPERTY(QString generatedName READ generatedName NOTIFY generatedNameChanged)

public:
    // isServer defaults to false.
    // NOTE(review): presumably selects server-mode behaviour/lifetime handling — confirm in chatllm.cpp.
    ChatLLM(Chat *parent, bool isServer = false);
    virtual ~ChatLLM();

    bool isModelLoaded() const;
    void regenerateResponse();
    void resetResponse();
    void resetContext();
    // Returns a copy of the stored database results.
    QList<ResultInfo> databaseResults() const { return m_databaseResults; }

    // Sets the atomic stop flag.
    // NOTE(review): presumably polled by the generation callbacks to end generation early — confirm.
    void stopGenerating() { m_stopGenerating = true; }

    // Reads the atomic "should be loaded" flag.
    bool shouldBeLoaded() const { return m_shouldBeLoaded; }
    void setShouldBeLoaded(bool b);

    QString response() const;
    QString modelName() const;

    void setModelName(const QString &modelName);

    bool isRecalc() const { return m_isRecalc; }

    // Converts the std::string name response to a QString on every call.
    QString generatedName() const { return QString::fromStdString(m_nameResponse); }

    // NOTE(review): the meaning of the bool result and of `version` is not visible here;
    // presumably success/failure and the on-disk format version — confirm in chatllm.cpp.
    bool serialize(QDataStream &stream, int version);
    bool deserialize(QDataStream &stream, int version);

public Q_SLOTS:
    // NOTE(review): the parameter names suggest sampling settings passed on to the backend;
    // how they are applied is not visible in this header.
    bool prompt(const QString &prompt, const QString &prompt_template, int32_t n_predict,
        int32_t top_k, float top_p, float temp, int32_t n_batch, float repeat_penalty, int32_t repeat_penalty_tokens,
        int32_t n_threads);
    bool loadDefaultModel();
    bool loadModel(const QString &modelName);
    void modelNameChangeRequested(const QString &modelName);
    void forceUnloadModel();
    void unloadModel();
    void reloadModel();
    void generateName();
    void handleChatIdChanged();
    void handleShouldBeLoadedChanged();
    void handleThreadStarted();

Q_SIGNALS:
    void isModelLoadedChanged();
    void modelLoadingError(const QString &error);
    void responseChanged();
    void promptProcessing();
    void responseStopped();
    void modelNameChanged();
    void recalcChanged();
    void sendStartup();
    void sendModelLoaded();
    void generatedNameChanged();
    void stateChanged();
    void threadStarted();
    void shouldBeLoadedChanged();
    // `results` is a raw pointer out-parameter.
    // NOTE(review): the receiver presumably fills it before the emitter reads it, which would
    // require a blocking connection — confirm at the connect() site.
    void requestRetrieveFromDB(const QList<QString> &collections, const QString &text, int retrievalSize, QList<ResultInfo> *results);
    void reportSpeed(const QString &speed);

protected:
    // NOTE(review): these look like the prompt/response/recalculate callbacks handed to the
    // backend, with a second set used for name generation — confirm in chatllm.cpp.
    bool handlePrompt(int32_t token);
    bool handleResponse(int32_t token, const std::string &response);
    bool handleRecalculate(bool isRecalc);
    bool handleNamePrompt(int32_t token);
    bool handleNameResponse(int32_t token, const std::string &response);
    bool handleNameRecalculate(bool isRecalc);
    void saveState();
    void restoreState();

protected:
    // None of the members below has an in-class initializer; initial values must come from the
    // constructor, which is defined out of line.
    LLModel::PromptContext m_ctx;
    quint32 m_promptTokens;
    quint32 m_promptResponseTokens;
    LLModelInfo m_modelInfo;
    LLModelType m_modelType;
    std::string m_response;
    std::string m_nameResponse;         // returned by generatedName()
    QString m_modelName;
    Chat *m_chat;
    TokenTimer *m_timer;
    QByteArray m_state;
    QThread m_llmThread;
    std::atomic<bool> m_stopGenerating; // set to true by stopGenerating()
    std::atomic<bool> m_shouldBeLoaded; // read by shouldBeLoaded()
    QList<ResultInfo> m_databaseResults; // returned (by copy) from databaseResults()
    bool m_isRecalc;                    // read by isRecalc()
    bool m_isServer;
    bool m_isChatGPT;
};

#endif // CHATLLM_H
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 16:10:05 +03:00
+								#ifndef CHATLLM_H
 								#define CHATLLM_H
 								#include <QObject>
 								#include <QThread>
-												Much better memory mgmt for multi-threaded model loading/unloading.

											
										
										
											2023-05-14 02:05:35 +03:00
+								#include <QFileInfo>
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 16:10:05 +03:00
-												Make localdocs work with server mode.

											
										
										
											2023-06-01 21:13:12 +03:00
+								#include "localdocs.h"
-												Move the llmodel C API to new top-level directory and version it.

											
										
										
											2023-05-10 18:46:40 +03:00
+								#include "../gpt4all-backend/llmodel.h"
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 16:10:05 +03:00
-												Much better memory mgmt for multi-threaded model loading/unloading.

											
										
										
											2023-05-14 02:05:35 +03:00
+								enum LLModelType {
 								    MPT_,
 								    GPTJ_,
-												Preliminary support for chatgpt models.

											
										
										
											2023-05-15 03:12:15 +03:00
+								    LLAMA_,
 								    CHATGPT_,
-												Replit Model (#713)

* porting over replit code model to gpt4all

* replaced memory with kv_self struct

* continuing debug

* welp it built but lot of sus things

* working model loading and somewhat working generate.. need to format response?

* revert back to semi working version

* finally got rid of weird formatting

* figured out problem is with python bindings - this is good to go for testing

* addressing PR feedback

* output refactor

* fixed prompt response collection

* cleanup

* addressing PR comments

* building replit backend with new ggmlver code

* chatllm replit and clean python files

* cleanup

* updated replit to match new llmodel api

* match llmodel api and change size_t to Token

* resolve PR comments

* replit model commit comment
											
										
										
											2023-06-07 00:09:00 +03:00
+								    REPLIT_
-												Much better memory mgmt for multi-threaded model loading/unloading.

											
										
										
											2023-05-14 02:05:35 +03:00
+								};
 								struct LLModelInfo {
 								    LLModel *model = nullptr;
 								    QFileInfo fileInfo;
 								    // NOTE: This does not store the model type or name on purpose as this is left for ChatLLM which
 								    // must be able to serialize the information even if it is in the unloaded state
 								};
-												Show token generation speed in gui. (#1020)


											
										
										
											2023-06-19 21:34:53 +03:00
+								class TokenTimer : public QObject {
 								    Q_OBJECT
 								public:
 								    explicit TokenTimer(QObject *parent)
 								        : QObject(parent)
 								        , m_elapsed(0) {}
 								    static int rollingAverage(int oldAvg, int newNumber, int n)
 								    {
 								        // i.e. to calculate the new average after then nth number,
 								        // you multiply the old average by n−1, add the new number, and divide the total by n.
 								        return qRound(((float(oldAvg) * (n - 1)) + newNumber) / float(n));
 								    }
 								    void start() { m_tokens = 0; m_elapsed = 0; m_time.invalidate(); }
 								    void stop() { handleTimeout(); }
 								    void inc() {
 								        if (!m_time.isValid())
 								            m_time.start();
 								        ++m_tokens;
 								        if (m_time.elapsed() > 999)
 								            handleTimeout();
 								    }
 								Q_SIGNALS:
 								    void report(const QString &speed);
 								private Q_SLOTS:
 								    void handleTimeout()
 								    {
 								        m_elapsed += m_time.restart();
 								        emit report(QString("%1 tokens/sec").arg(m_tokens / float(m_elapsed / 1000.0f), 0, 'g', 2));
 								    }
 								private:
 								    QElapsedTimer m_time;
 								    qint64 m_elapsed;
 								    quint32 m_tokens;
 								};
-												First attempt at providing a persistent chat list experience.

Limitations:

1) Context is not restored for gpt-j models
2) When you switch between different model types in an existing chat
   the context and all the conversation is lost
3) The settings are not chat or conversation specific
4) The sizes of the chat persisted files are very large due to how much
   data the llama.cpp backend tries to persist. Need to investigate how
   we can shrink this.

											
										
										
											2023-05-04 22:31:41 +03:00
+								class Chat;
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 16:10:05 +03:00
+								class ChatLLM : public QObject
 								{
 								    Q_OBJECT
 								    Q_PROPERTY(bool isModelLoaded READ isModelLoaded NOTIFY isModelLoadedChanged)
 								    Q_PROPERTY(QString response READ response NOTIFY responseChanged)
 								    Q_PROPERTY(QString modelName READ modelName WRITE setModelName NOTIFY modelNameChanged)
 								    Q_PROPERTY(bool isRecalc READ isRecalc NOTIFY recalcChanged)
-												Generate names via llm.

											
										
										
											2023-05-02 18:19:17 +03:00
+								    Q_PROPERTY(QString generatedName READ generatedName NOTIFY generatedNameChanged)
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 16:10:05 +03:00
 								public:
-												The server has different lifetime mgmt than the other chats.

											
										
										
											2023-05-14 02:33:19 +03:00
+								    ChatLLM(Chat *parent, bool isServer = false);
-												Cleanup the chatllm properly.

											
										
										
											2023-05-12 21:06:03 +03:00
+								    virtual ~ChatLLM();
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 16:10:05 +03:00
 								    bool isModelLoaded() const;
 								    void regenerateResponse();
 								    void resetResponse();
 								    void resetContext();
-												Better name for database results.

											
										
										
											2023-06-01 23:12:21 +03:00
+								    QList<ResultInfo> databaseResults() const { return m_databaseResults; }
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 16:10:05 +03:00
 								    void stopGenerating() { m_stopGenerating = true; }
-												Much better memory mgmt for multi-threaded model loading/unloading.

											
										
										
											2023-05-14 02:05:35 +03:00
+								    bool shouldBeLoaded() const { return m_shouldBeLoaded; }
 								    void setShouldBeLoaded(bool b);
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 16:10:05 +03:00
+								    QString response() const;
 								    QString modelName() const;
 								    void setModelName(const QString &modelName);
 								    bool isRecalc() const { return m_isRecalc; }
-												Generate names via llm.

											
										
										
											2023-05-02 18:19:17 +03:00
+								    QString generatedName() const { return QString::fromStdString(m_nameResponse); }
-												Convert the old format properly.

											
										
										
											2023-05-08 12:52:57 +03:00
+								    bool serialize(QDataStream &stream, int version);
 								    bool deserialize(QDataStream &stream, int version);
-												First attempt at providing a persistent chat list experience.

Limitations:

1) Context is not restored for gpt-j models
2) When you switch between different model types in an existing chat
   the context and all the conversation is lost
3) The settings are not chat or conversation specific
4) The sizes of the chat persisted files are very large due to how much
   data the llama.cpp backend tries to persist. Need to investigate how
   we can shrink this.

											
										
										
											2023-05-04 22:31:41 +03:00
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 16:10:05 +03:00
+								public Q_SLOTS:
-												First attempt at providing a persistent chat list experience.

Limitations:

1) Context is not restored for gpt-j models
2) When you switch between different model types in an existing chat
   the context and all the conversation is lost
3) The settings are not chat or conversation specific
4) The sizes of the chat persisted files are very large due to how much
   data the llama.cpp backend tries to persist. Need to investigate how
   we can shrink this.

											
										
										
											2023-05-04 22:31:41 +03:00
+								    bool prompt(const QString &prompt, const QString &prompt_template, int32_t n_predict,
 								        int32_t top_k, float top_p, float temp, int32_t n_batch, float repeat_penalty, int32_t repeat_penalty_tokens,
 								        int32_t n_threads);
 								    bool loadDefaultModel();
 								    bool loadModel(const QString &modelName);
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 16:10:05 +03:00
+								    void modelNameChangeRequested(const QString &modelName);
-												Much better memory mgmt for multi-threaded model loading/unloading.

											
										
										
											2023-05-14 02:05:35 +03:00
+								    void forceUnloadModel();
-												First attempt at providing a persistent chat list experience.

Limitations:

1) Context is not restored for gpt-j models
2) When you switch between different model types in an existing chat
   the context and all the conversation is lost
3) The settings are not chat or conversation specific
4) The sizes of the chat persisted files are very large due to how much
   data the llama.cpp backend tries to persist. Need to investigate how
   we can shrink this.

											
										
										
											2023-05-04 22:31:41 +03:00
+								    void unloadModel();
-												Much better memory mgmt for multi-threaded model loading/unloading.

											
										
										
											2023-05-14 02:05:35 +03:00
+								    void reloadModel();
-												Generate names via llm.

											
										
										
											2023-05-02 18:19:17 +03:00
+								    void generateName();
-												First attempt at providing a persistent chat list experience.

Limitations:

1) Context is not restored for gpt-j models
2) When you switch between different model types in an existing chat
   the context and all the conversation is lost
3) The settings are not chat or conversation specific
4) The sizes of the chat persisted files are very large due to how much
   data the llama.cpp backend tries to persist. Need to investigate how
   we can shrink this.

											
										
										
											2023-05-04 22:31:41 +03:00
+								    void handleChatIdChanged();
-												Much better memory mgmt for multi-threaded model loading/unloading.

											
										
										
											2023-05-14 02:05:35 +03:00
+								    void handleShouldBeLoadedChanged();
-												Show token generation speed in gui. (#1020)


											
										
										
											2023-06-19 21:34:53 +03:00
+								    void handleThreadStarted();
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 16:10:05 +03:00
 								Q_SIGNALS:
 								    void isModelLoadedChanged();
-												Gracefully handle when we have a previous chat where the model that it used has gone away.

											
										
										
											2023-05-09 03:51:03 +03:00
+								    void modelLoadingError(const QString &error);
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 16:10:05 +03:00
+								    void responseChanged();
-												Add prompt processing and localdocs to the busy indicator in UI.

											
										
										
											2023-05-21 03:04:36 +03:00
+								    void promptProcessing();
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 16:10:05 +03:00
+								    void responseStopped();
 								    void modelNameChanged();
 								    void recalcChanged();
 								    void sendStartup();
 								    void sendModelLoaded();
-												Generate names via llm.

											
										
										
											2023-05-02 18:19:17 +03:00
+								    void generatedNameChanged();
-												First attempt at providing a persistent chat list experience.

Limitations:

1) Context is not restored for gpt-j models
2) When you switch between different model types in an existing chat
   the context and all the conversation is lost
3) The settings are not chat or conversation specific
4) The sizes of the chat persisted files are very large due to how much
   data the llama.cpp backend tries to persist. Need to investigate how
   we can shrink this.

											
										
										
											2023-05-04 22:31:41 +03:00
+								    void stateChanged();
-												httpserver

											
										
										
											2023-05-11 23:46:25 +03:00
+								    void threadStarted();
-												Much better memory mgmt for multi-threaded model loading/unloading.

											
										
										
											2023-05-14 02:05:35 +03:00
+								    void shouldBeLoadedChanged();
-												Make localdocs work with server mode.

											
										
										
											2023-06-01 21:13:12 +03:00
+								    void requestRetrieveFromDB(const QList<QString> &collections, const QString &text, int retrievalSize, QList<ResultInfo> *results);
-												Show token generation speed in gui. (#1020)


											
										
										
											2023-06-19 21:34:53 +03:00
+								    void reportSpeed(const QString &speed);
-												httpserver

											
										
										
											2023-05-11 23:46:25 +03:00
 								protected:
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 16:10:05 +03:00
+								    bool handlePrompt(int32_t token);
 								    bool handleResponse(int32_t token, const std::string &response);
 								    bool handleRecalculate(bool isRecalc);
-												Generate names via llm.

											
										
										
											2023-05-02 18:19:17 +03:00
+								    bool handleNamePrompt(int32_t token);
 								    bool handleNameResponse(int32_t token, const std::string &response);
 								    bool handleNameRecalculate(bool isRecalc);
-												First attempt at providing a persistent chat list experience.

Limitations:

1) Context is not restored for gpt-j models
2) When you switch between different model types in an existing chat
   the context and all the conversation is lost
3) The settings are not chat or conversation specific
4) The sizes of the chat persisted files are very large due to how much
   data the llama.cpp backend tries to persist. Need to investigate how
   we can shrink this.

											
										
										
											2023-05-04 22:31:41 +03:00
+								    void saveState();
 								    void restoreState();
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 16:10:05 +03:00
-												The server has different lifetime mgmt than the other chats.

											
										
										
											2023-05-14 02:33:19 +03:00
+								protected:
 								    LLModel::PromptContext m_ctx;
 								    quint32 m_promptTokens;
 								    quint32 m_promptResponseTokens;
-												Much better memory mgmt for multi-threaded model loading/unloading.

											
										
										
											2023-05-14 02:05:35 +03:00
+								    LLModelInfo m_modelInfo;
 								    LLModelType m_modelType;
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 16:10:05 +03:00
+								    std::string m_response;
-												Generate names via llm.

											
										
										
											2023-05-02 18:19:17 +03:00
+								    std::string m_nameResponse;
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 16:10:05 +03:00
+								    QString m_modelName;
-												First attempt at providing a persistent chat list experience.

Limitations:

1) Context is not restored for gpt-j models
2) When you switch between different model types in an existing chat
   the context and all the conversation is lost
3) The settings are not chat or conversation specific
4) The sizes of the chat persisted files are very large due to how much
   data the llama.cpp backend tries to persist. Need to investigate how
   we can shrink this.

											
										
										
											2023-05-04 22:31:41 +03:00
+								    Chat *m_chat;
-												Show token generation speed in gui. (#1020)


											
										
										
											2023-06-19 21:34:53 +03:00
+								    TokenTimer *m_timer;
-												First attempt at providing a persistent chat list experience.

Limitations:

1) Context is not restored for gpt-j models
2) When you switch between different model types in an existing chat
   the context and all the conversation is lost
3) The settings are not chat or conversation specific
4) The sizes of the chat persisted files are very large due to how much
   data the llama.cpp backend tries to persist. Need to investigate how
   we can shrink this.

											
										
										
											2023-05-04 22:31:41 +03:00
+								    QByteArray m_state;
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 16:10:05 +03:00
+								    QThread m_llmThread;
 								    std::atomic<bool> m_stopGenerating;
-												Much better memory mgmt for multi-threaded model loading/unloading.

											
										
										
											2023-05-14 02:05:35 +03:00
+								    std::atomic<bool> m_shouldBeLoaded;
-												Better name for database results.

											
										
										
											2023-06-01 23:12:21 +03:00
+								    QList<ResultInfo> m_databaseResults;
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 16:10:05 +03:00
+								    bool m_isRecalc;
-												The server has different lifetime mgmt than the other chats.

											
										
										
											2023-05-14 02:33:19 +03:00
+								    bool m_isServer;
-												Add large network icon background for chatgpt and server modes.

											
										
										
											2023-05-15 21:08:08 +03:00
+								    bool m_isChatGPT;
-												Major refactor in prep for multiple conversations.

											
										
										
											2023-05-01 16:10:05 +03:00
+								};
 								#endif // CHATLLM_H