From b7c92c5afd32780e3c86741bd13bc57e05b6b2b0 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel
Date: Tue, 16 Jan 2024 16:36:21 -0500
Subject: [PATCH] sync llama.cpp with latest Vulkan PR and newer upstream
 (#1819)

---
 gpt4all-backend/llama.cpp-mainline |  2 +-
 gpt4all-backend/llama.cpp.cmake    | 76 +++++++++++++++----------
 gpt4all-backend/llamamodel.cpp     | 13 ++---
 gpt4all-backend/llmodel_shared.h   |  2 +-
 4 files changed, 47 insertions(+), 46 deletions(-)

diff --git a/gpt4all-backend/llama.cpp-mainline b/gpt4all-backend/llama.cpp-mainline
index 3cd95323..8eb893c5 160000
--- a/gpt4all-backend/llama.cpp-mainline
+++ b/gpt4all-backend/llama.cpp-mainline
@@ -1 +1 @@
-Subproject commit 3cd95323d995af7df4b42f6461f3d919a9267dad
+Subproject commit 8eb893c572a4eaffac173f6cf9b17f15562b0d58
diff --git a/gpt4all-backend/llama.cpp.cmake b/gpt4all-backend/llama.cpp.cmake
index 62b57226..8a2ce5ee 100644
--- a/gpt4all-backend/llama.cpp.cmake
+++ b/gpt4all-backend/llama.cpp.cmake
@@ -173,9 +173,9 @@ if (LLAMA_KOMPUTE)
         add_custom_command(
             OUTPUT ${spv_file}
             DEPENDS ${LLAMA_DIR}/${source}
-                ${LLAMA_DIR}/kompute/common.comp
-                ${LLAMA_DIR}/kompute/op_getrows.comp
-                ${LLAMA_DIR}/kompute/op_mul_mv_q_n.comp
+                ${LLAMA_DIR}/kompute-shaders/common.comp
+                ${LLAMA_DIR}/kompute-shaders/op_getrows.comp
+                ${LLAMA_DIR}/kompute-shaders/op_mul_mv_q_n.comp
             COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${LLAMA_DIR}/${source}
             COMMENT "Compiling ${source} to ${source}.spv"
         )
@@ -195,7 +195,7 @@ if (LLAMA_KOMPUTE)
             COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
             COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
             COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
-            COMMAND ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd -i ${spv_file} >> ${OUTPUT_HEADER_FILE}
+            COMMAND ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
             COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
             COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
             DEPENDS ${spv_file} xxd
@@ -209,7 +209,7 @@ if (LLAMA_KOMPUTE)
             COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
             COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
             COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
-            COMMAND ${CMAKE_BINARY_DIR}/bin/xxd -i ${spv_file} >> ${OUTPUT_HEADER_FILE}
+            COMMAND ${CMAKE_BINARY_DIR}/bin/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
             COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
             COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
             DEPENDS ${spv_file} xxd
@@ -226,35 +226,35 @@ if (LLAMA_KOMPUTE)

     # Compile our shaders
     compile_shader(SOURCES
-        kompute/op_scale.comp
-        kompute/op_scale_8.comp
-        kompute/op_add.comp
-        kompute/op_addrow.comp
-        kompute/op_mul.comp
-        kompute/op_mulrow.comp
-        kompute/op_silu.comp
-        kompute/op_relu.comp
-        kompute/op_gelu.comp
-        kompute/op_softmax.comp
-        kompute/op_norm.comp
-        kompute/op_rmsnorm.comp
-        kompute/op_diagmask.comp
-        kompute/op_mul_mat_mat_f32.comp
-        kompute/op_mul_mat_f16.comp
-        kompute/op_mul_mat_q8_0.comp
-        kompute/op_mul_mat_q4_0.comp
-        kompute/op_mul_mat_q4_1.comp
-        kompute/op_mul_mat_q6_k.comp
-        kompute/op_getrows_f16.comp
-        kompute/op_getrows_q4_0.comp
-        kompute/op_getrows_q4_1.comp
-        kompute/op_getrows_q6_k.comp
-        kompute/op_rope_f16.comp
-        kompute/op_rope_f32.comp
-        kompute/op_cpy_f16_f16.comp
-        kompute/op_cpy_f16_f32.comp
-        kompute/op_cpy_f32_f16.comp
-        kompute/op_cpy_f32_f32.comp
+        kompute-shaders/op_scale.comp
+        kompute-shaders/op_scale_8.comp
+        kompute-shaders/op_add.comp
+        kompute-shaders/op_addrow.comp
+        kompute-shaders/op_mul.comp
+        kompute-shaders/op_mulrow.comp
+        kompute-shaders/op_silu.comp
+        kompute-shaders/op_relu.comp
+        kompute-shaders/op_gelu.comp
+        kompute-shaders/op_softmax.comp
+        kompute-shaders/op_norm.comp
+        kompute-shaders/op_rmsnorm.comp
+        kompute-shaders/op_diagmask.comp
+        kompute-shaders/op_mul_mat_mat_f32.comp
+        kompute-shaders/op_mul_mat_f16.comp
+        kompute-shaders/op_mul_mat_q8_0.comp
+        kompute-shaders/op_mul_mat_q4_0.comp
+        kompute-shaders/op_mul_mat_q4_1.comp
+        kompute-shaders/op_mul_mat_q6_k.comp
+        kompute-shaders/op_getrows_f16.comp
+        kompute-shaders/op_getrows_q4_0.comp
+        kompute-shaders/op_getrows_q4_1.comp
+        kompute-shaders/op_getrows_q6_k.comp
+        kompute-shaders/op_rope_f16.comp
+        kompute-shaders/op_rope_f32.comp
+        kompute-shaders/op_cpy_f16_f16.comp
+        kompute-shaders/op_cpy_f16_f32.comp
+        kompute-shaders/op_cpy_f32_f16.comp
+        kompute-shaders/op_cpy_f32_f32.comp
     )

     # Create a custom target for our generated shaders
@@ -292,14 +292,14 @@ if (LLAMA_KOMPUTE)

     # Create a custom command that depends on the generated_shaders
     add_custom_command(
-        OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan.stamp
-        COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan.stamp
+        OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp
+        COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp
         DEPENDS generated_shaders
-        COMMENT "Ensuring shaders are generated before compiling ggml-vulkan.cpp"
+        COMMENT "Ensuring shaders are generated before compiling ggml-kompute.cpp"
     )

     # Add the stamp to the main sources to ensure dependency tracking
-    set(GGML_SOURCES_KOMPUTE ${LLAMA_DIR}/ggml-vulkan.cpp ${LLAMA_DIR}/ggml-vulkan.h ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan.stamp)
+    set(GGML_SOURCES_KOMPUTE ${LLAMA_DIR}/ggml-kompute.cpp ${LLAMA_DIR}/ggml-kompute.h ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
     add_compile_definitions(GGML_USE_KOMPUTE)
     set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} kompute)
     set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${CMAKE_BINARY_DIR})
diff --git a/gpt4all-backend/llamamodel.cpp b/gpt4all-backend/llamamodel.cpp
index 4a73e46b..d19d7b54 100644
--- a/gpt4all-backend/llamamodel.cpp
+++ b/gpt4all-backend/llamamodel.cpp
@@ -29,7 +29,7 @@
 #include <ggml.h>

 #ifdef GGML_USE_KOMPUTE
-#include "ggml-vulkan.h"
+#include "ggml-kompute.h"
 #endif

 namespace {
@@ -58,7 +58,7 @@ struct gpt_params {

     std::string prompt = "";

-    bool memory_f16 = true;                 // use f16 instead of f32 for memory kv
+    enum ggml_type kv_type = GGML_TYPE_F16; // use f16 instead of f32 for memory kv

     bool use_mmap  = true;  // use mmap for faster loads
     bool use_mlock = false; // use mlock to keep model in memory
@@ -199,9 +199,10 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx)

     d_ptr->ctx_params = llama_context_default_params();

-    d_ptr->ctx_params.n_ctx  = n_ctx;
-    d_ptr->ctx_params.seed   = params.seed;
-    d_ptr->ctx_params.f16_kv = params.memory_f16;
+    d_ptr->ctx_params.n_ctx  = n_ctx;
+    d_ptr->ctx_params.seed   = params.seed;
+    d_ptr->ctx_params.type_k = params.kv_type;
+    d_ptr->ctx_params.type_v = params.kv_type;

     // The new batch API provides space for n_vocab*n_tokens logits. Tell llama.cpp early
     // that we want this many logits so the state serializes consistently.
@@ -332,7 +333,7 @@ const std::vector<LLModel::Token> &LLamaModel::endTokens() const
 }

 #if defined(GGML_USE_KOMPUTE)
-#include "ggml-vulkan.h"
+#include "ggml-kompute.h"
 #endif

 std::vector<LLModel::GPUDevice> LLamaModel::availableGPUDevices(size_t memoryRequired)
diff --git a/gpt4all-backend/llmodel_shared.h b/gpt4all-backend/llmodel_shared.h
index c48f1fdf..b7b1a837 100644
--- a/gpt4all-backend/llmodel_shared.h
+++ b/gpt4all-backend/llmodel_shared.h
@@ -5,7 +5,7 @@
 #include <vector>

 #if defined(GGML_USE_KOMPUTE)
-#include "ggml-vulkan.h"
+#include "ggml-kompute.h"
 struct llm_buffer {
     uint8_t * addr = NULL;
     size_t size = 0;
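
Note on the xxd step: xxd -i derives the name of the emitted C array from
the path it is given, so passing ${RAW_FILE_NAME} instead of the absolute
${spv_file} path keeps the generated symbol names independent of the build
directory. Each compiled .spv blob ends up embedded in a header assembled
by the echo/xxd commands above, roughly like the sketch below (the header
and guard names follow the usual shaderop_* convention; the array contents
and length are illustrative, not taken from a real build):

    // e.g. shaderop_scale.h, as produced by the generator commands
    #ifndef SHADEROP_SCALE_H
    #define SHADEROP_SCALE_H
    namespace kp {
    namespace shader_data {
    unsigned char op_scale_comp_spv[] = {
        0x03, 0x02, 0x23, 0x07, /* SPIR-V magic, then the rest of the blob */
    };
    unsigned int op_scale_comp_spv_len = 1234; /* illustrative length */
    }}
    #endif // define SHADEROP_SCALE_H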
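Note on the kv_type change: this tracks upstream llama.cpp, which replaced
the single f16_kv boolean in llama_context_params with separate type_k and
type_v fields, so the K and V caches are typed independently. A minimal
sketch of the new API as of the pinned submodule revision (the model path
and context size below are placeholders, not part of this patch):

    #include <llama.h>

    int main() {
        llama_backend_init(false); // numa = false

        llama_model_params mparams = llama_model_default_params();
        llama_model *model = llama_load_model_from_file("model.gguf", mparams);
        if (!model) return 1;

        llama_context_params cparams = llama_context_default_params();
        cparams.n_ctx  = 2048;          // placeholder context size
        cparams.type_k = GGML_TYPE_F16; // formerly: cparams.f16_kv = true
        cparams.type_v = GGML_TYPE_F16; // K and V may now differ

        llama_context *ctx = llama_new_context_with_model(model, cparams);
        if (!ctx) return 1;

        // ... tokenize, llama_decode, sample ...

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }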