Add Triton Marian backend running in AzureML Inference Environment (#749)

delong-coder 2020-11-05 06:29:36 +08:00 committed by GitHub
parent e274ac76b2
commit ca7a887aa7
12 changed files with 1383 additions and 0 deletions

@@ -0,0 +1,96 @@
# It is recommended to use a machine which supports CUDA to build this image.
FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 AS BUILDER
RUN apt-get update --fix-missing
RUN apt-get install -y curl git autoconf automake libtool curl make g++ unzip cmake build-essential cpio
RUN apt-get -y clean && \
rm -rf /var/lib/apt/lists/*
# Install zlib.
WORKDIR /
RUN git clone --no-checkout https://github.com/madler/zlib
WORKDIR /zlib
RUN git checkout tags/v1.2.10 && \
./configure && \
make install
# Install protobuf.
WORKDIR /
RUN git clone --no-checkout https://github.com/protocolbuffers/protobuf.git
WORKDIR /protobuf
RUN git checkout tags/v3.8.0 && \
git submodule update --init --recursive && \
./autogen.sh
RUN ./configure --disable-shared --prefix=/usr CFLAGS="-fPIC" CXXFLAGS="-fPIC" && \
make && \
make check && \
make install && \
ldconfig # refresh shared library cache.
# Install Intel MKL.
WORKDIR /
RUN curl --tlsv1.2 --output l_mkl_2020.0.166.tgz https://registrationcenter-download.intel.com/akdlm/irc_nas/tec/16318/l_mkl_2020.0.166.tgz
RUN tar zxvf l_mkl_2020.0.166.tgz
WORKDIR /l_mkl_2020.0.166
RUN ./install.sh --silent ./silent.cfg --install_dir /opt/intel/ --accept_eula
# Install Boost.
WORKDIR /
RUN git clone --recursive https://github.com/boostorg/boost --branch boost-1.72.0 /boost
WORKDIR /boost
RUN ./bootstrap.sh
RUN ./b2 install --prefix=/usr --with-system --with-thread --with-date_time --with-regex --with-serialization
# Install Marian.
WORKDIR /
RUN git clone --no-checkout https://github.com/marian-nmt/marian-dev
WORKDIR marian-dev
RUN git checkout youki/quantize-embedding
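# Pin the build to a fixed Marian revision.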
RUN git checkout dad48865fd3b7f1d7b891de81040f7651e824510
RUN mkdir src/static
RUN mkdir build
COPY src/cmarian.cpp /marian-dev/src/static
COPY src/logging.cpp /marian-dev/src/common
RUN rm src/CMakeLists.txt
COPY src/CMakeLists.txt /marian-dev/src
WORKDIR /marian-dev/build
RUN cmake .. -DCOMPILE_CPU=on -DCOMPILE_CUDA=on -DUSE_SENTENCEPIECE=on -DUSE_STATIC_LIBS=off -DCOMPILE_SERVER=off -DUSE_FBGEMM=on -DCUDA_cublas_device_LIBRARY=/usr/lib/x86_64-linux-gnu/libcublas.so
RUN make -j $(grep -c ^processor /proc/cpuinfo)
# Second stage: assemble the Triton custom backend on top of the Triton
# server image, reusing the cmarian static libraries built above.
FROM nvcr.io/nvidia/tritonserver:20.09-py3
RUN mkdir -p /marian-dev/build/src/3rd_party/sentencepiece/src
COPY --from=BUILDER /usr/lib/libprotobuf.a /usr/lib
COPY --from=BUILDER /usr/lib/libboost_system.a /usr/lib
COPY --from=BUILDER /marian-dev/build/src/3rd_party/fbgemm/libfbgemm.a /usr/lib
COPY --from=BUILDER /marian-dev/build/src/3rd_party/fbgemm/asmjit/libasmjit.a /usr/lib
COPY --from=BUILDER /marian-dev/build/src/3rd_party/sentencepiece/src/libsentencepiece_train.a /usr/lib
COPY --from=BUILDER /marian-dev/build/src/3rd_party/sentencepiece/src/libsentencepiece.a /usr/lib
COPY --from=BUILDER /marian-dev/build/libmarian.a /usr/lib/libcmarian.a
COPY --from=BUILDER /marian-dev/build/src/libmarian_cuda.a /usr/lib/libcmarian_cuda.a
# Build the Triton custom backend.
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && \
apt-get install -y --no-install-recommends \
software-properties-common \
build-essential \
git \
libopencv-dev \
libopencv-core-dev \
libssl-dev \
libtool \
pkg-config \
rapidjson-dev
# Install CMake 3.19.0-rc1.
RUN wget https://github.com/Kitware/CMake/releases/download/v3.19.0-rc1/cmake-3.19.0-rc1-Linux-x86_64.sh
RUN sh cmake-3.19.0-rc1-Linux-x86_64.sh --prefix=/usr/local --exclude-subdir --skip-license
ADD marian_backend /opt/tritonserver/marian_backend
WORKDIR /opt/tritonserver/marian_backend
RUN mkdir build
RUN cd build && \
cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install .. && \
make install

@@ -0,0 +1,36 @@
Triton-AML
======

*Triton-AML* is a Triton custom backend that runs Marian in the AzureML Inference Environment. It is one implementation of a [Triton Backend Shared Library](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/backend.html#backend-shared-library).

The backend is compiled against a static library of Marian at a specific version.

Layout:

- marian_backend: Triton Marian backend source code
- src: Modified source files and CMakeLists.txt for Marian
- Dockerfile: Compiles the backend against the static library of Marian
- build.sh: A simple shell script that runs the Dockerfile and extracts the generated libtriton_marian.so

## Usage

Run `./build.sh` to get the Triton Marian backend shared library.

General users can place libtriton_marian.so in either of the following locations:

- `<model_repository>/<model_name>/<version_directory>/libtriton_marian.so`
- `<model_repository>/<model_name>/libtriton_marian.so`

AzureML Inference team members can instead place it into the *aml-triton* base image at:

- `<backend_directory>/marian/libtriton_marian.so`

where `<backend_directory>` is /opt/tritonserver/backends by default.

## Make changes

To compile against another version of Marian, replace the `RUN git checkout youki/quantize-embedding` line in the Dockerfile, copy the new CMakeLists.txt over the old one, add src/cmarian.cpp to it, and adjust it so that it still builds a static library of Marian.

## Limitation

For now the backend only serves the *nlxseq2seq* model: `ModelState::SetMarianConfigPath` hard-codes the model layout under `/var/azureml-app/$AZUREML_MODEL_DIR`, so it must be changed before running other models with Marian.

@@ -0,0 +1,10 @@
#!/bin/sh
echo Building Triton Marian backend ...
docker build -t triton-marian-build .
echo Copying artifacts ...
docker container create --name extract triton-marian-build
docker container cp extract:/opt/tritonserver/marian_backend/build/libtriton_marian.so .
docker container rm -f extract

@@ -0,0 +1,156 @@
cmake_minimum_required(VERSION 3.17)
project(tritonmarianbackend LANGUAGES C CXX)
#
# Options
#
# Must include options required for this project as well as any
# projects included in this one by FetchContent.
#
# GPU support in the backend API is disabled by default: this backend
# exchanges its input/output tensors in CPU memory only, while Marian
# itself runs on GPU through the statically linked cmarian_cuda library.
#
option(TRITON_ENABLE_GPU "Enable GPU support in backend" OFF)
option(TRITON_ENABLE_STATS "Include statistics collections in backend" ON)
set(TRITON_COMMON_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/common repo")
set(TRITON_CORE_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/core repo")
set(TRITON_BACKEND_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/backend repo")
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif()
#
# Dependencies
#
# FetchContent's composability isn't very good. We must include the
# transitive closure of all repos so that we can override the tag.
#
include(FetchContent)
FetchContent_Declare(
repo-common
GIT_REPOSITORY https://github.com/triton-inference-server/common.git
GIT_TAG ${TRITON_COMMON_REPO_TAG}
GIT_SHALLOW ON
)
FetchContent_Declare(
repo-core
GIT_REPOSITORY https://github.com/triton-inference-server/core.git
GIT_TAG ${TRITON_CORE_REPO_TAG}
GIT_SHALLOW ON
)
FetchContent_Declare(
repo-backend
GIT_REPOSITORY https://github.com/triton-inference-server/backend.git
GIT_TAG ${TRITON_BACKEND_REPO_TAG}
GIT_SHALLOW ON
)
FetchContent_MakeAvailable(repo-common repo-core repo-backend)
#
# Shared library implementing the Triton Backend API
#
configure_file(src/libtriton_marian.ldscript libtriton_marian.ldscript COPYONLY)
add_library(
triton-marian-backend SHARED
src/marian.cc
)
add_library(
TritonMarianBackend::triton-marian-backend ALIAS triton-marian-backend
)
target_include_directories(
triton-marian-backend
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/src
)
target_compile_features(triton-marian-backend PRIVATE cxx_std_11)
target_compile_options(
triton-marian-backend PRIVATE
$<$<OR:$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:GNU>>:
-Wall -Wextra -Wno-unused-parameter -Wno-type-limits -Werror>
)
target_link_libraries(
triton-marian-backend
PRIVATE
triton-backend-utils # from repo-backend
triton-core-serverstub # from repo-core
)
target_link_libraries(
triton-marian-backend
PRIVATE # from marian environment
cmarian
cmarian_cuda
sentencepiece
sentencepiece_train
fbgemm
asmjit
protobuf
)
set_target_properties(
triton-marian-backend PROPERTIES
POSITION_INDEPENDENT_CODE ON
OUTPUT_NAME triton_marian
LINK_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libtriton_marian.ldscript
LINK_FLAGS "-Wl,--version-script libtriton_marian.ldscript"
)
#
# Install
#
include(GNUInstallDirs)
set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/TritonMarianBackend)
install(
TARGETS
triton-marian-backend
EXPORT
triton-marian-backend-targets
LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/marian
ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/marian
)
install(
EXPORT
triton-marian-backend-targets
FILE
TritonMarianBackendTargets.cmake
NAMESPACE
TritonMarianBackend::
DESTINATION
${INSTALL_CONFIGDIR}
)
include(CMakePackageConfigHelpers)
configure_package_config_file(
${CMAKE_CURRENT_LIST_DIR}/cmake/TritonMarianBackendConfig.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/TritonMarianBackendConfig.cmake
INSTALL_DESTINATION ${INSTALL_CONFIGDIR}
)
install(
FILES
${CMAKE_CURRENT_BINARY_DIR}/TritonMarianBackendConfig.cmake
DESTINATION ${INSTALL_CONFIGDIR}
)
#
# Export from build tree
#
export(
EXPORT triton-marian-backend-targets
FILE ${CMAKE_CURRENT_BINARY_DIR}/TritonMarianBackendTargets.cmake
NAMESPACE TritonMarianBackend::
)
export(PACKAGE TritonMarianBackend)

@@ -0,0 +1,16 @@
Use cmake to build and install in a local directory.
```
$ mkdir build
$ cd build
$ cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install ..
$ make install
```
The following required Triton repositories will be pulled and used in
the build. By default the "main" branch/tag is used for each repo, but
the CMake arguments listed below can override that (see the example
after the list).
* triton-inference-server/backend: -DTRITON_BACKEND_REPO_TAG=[tag]
* triton-inference-server/core: -DTRITON_CORE_REPO_TAG=[tag]
* triton-inference-server/common: -DTRITON_COMMON_REPO_TAG=[tag]
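For example, to pin all three repositories to matching release branches
(the branch name below is illustrative, not a value from this repo):
```
$ cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install \
        -DTRITON_COMMON_REPO_TAG=r20.09 \
        -DTRITON_CORE_REPO_TAG=r20.09 \
        -DTRITON_BACKEND_REPO_TAG=r20.09 ..
```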

@@ -0,0 +1,13 @@
include(CMakeFindDependencyMacro)
get_filename_component(
TRITONMARIANBACKEND_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH
)
list(APPEND CMAKE_MODULE_PATH ${TRITONMARIANBACKEND_CMAKE_DIR})
if(NOT TARGET TritonMarianBackend::triton-marian-backend)
include("${TRITONMARIANBACKEND_CMAKE_DIR}/TritonMarianBackendTargets.cmake")
endif()
set(TRITONMARIANBACKEND_LIBRARIES TritonMarianBackend::triton-marian-backend)

@@ -0,0 +1,5 @@
{
global:
TRITONBACKEND_*;
local: *;
};
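/* Only the TRITONBACKEND_* entry points are exported from the shared
   library; everything else, including the statically linked Marian
   symbols, stays local. */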

@@ -0,0 +1,570 @@
#include <algorithm>
#include <cstdlib>
#include <cstring>

#include "marian.h"
#include "triton/backend/backend_common.h"
namespace triton { namespace backend { namespace marian {
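// Send an error response for request IDX when expression X fails, then
// null out that slot in RESPONSES so later code skips the request.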
#define GUARDED_RESPOND_IF_ERROR(RESPONSES, IDX, X) \
do { \
if ((RESPONSES)[IDX] != nullptr) { \
TRITONSERVER_Error* err__ = (X); \
if (err__ != nullptr) { \
LOG_IF_ERROR( \
TRITONBACKEND_ResponseSend( \
(RESPONSES)[IDX], TRITONSERVER_RESPONSE_COMPLETE_FINAL, \
err__), \
"failed to send error response"); \
(RESPONSES)[IDX] = nullptr; \
TRITONSERVER_ErrorDelete(err__); \
} \
} \
} while (false)
//
// ModelState
//
// State associated with a model that is using this backend. An object
// of this class is created and associated with each
// TRITONBACKEND_Model.
//
class ModelState {
public:
static TRITONSERVER_Error* Create(
TRITONBACKEND_Model* triton_model, ModelState** state);
TRITONSERVER_Error* SetMarianConfigPath();
// Get the handle to the TRITONBACKEND model.
TRITONBACKEND_Model* TritonModel() { return triton_model_; }
// Get the name of the model.
const std::string& Name() const { return name_; }
// Get the Marian config path of the model.
const std::string& MarianConfigPath() const { return marian_config_path_; }
private:
ModelState(
TRITONBACKEND_Model* triton_model, const char* name,
common::TritonJson::Value&& model_config);
TRITONBACKEND_Model* triton_model_;
const std::string name_;
common::TritonJson::Value model_config_;
std::string marian_config_path_;
};
TRITONSERVER_Error*
ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state)
{
TRITONSERVER_Message* config_message;
RETURN_IF_ERROR(TRITONBACKEND_ModelConfig(
triton_model, 1 /* config_version */, &config_message));
// Get the model configuration as a json string from
// config_message, parse it with the TritonJson.
const char* buffer;
size_t byte_size;
RETURN_IF_ERROR(
TRITONSERVER_MessageSerializeToJson(config_message, &buffer, &byte_size));
common::TritonJson::Value model_config;
TRITONSERVER_Error* err = model_config.Parse(buffer, byte_size);
RETURN_IF_ERROR(TRITONSERVER_MessageDelete(config_message));
RETURN_IF_ERROR(err);
const char* model_name;
RETURN_IF_ERROR(TRITONBACKEND_ModelName(triton_model, &model_name));
*state = new ModelState(
triton_model, model_name, std::move(model_config));
return nullptr; // success
}
ModelState::ModelState(
TRITONBACKEND_Model* triton_model, const char* name,
common::TritonJson::Value&& model_config)
: triton_model_(triton_model), name_(name),
model_config_(std::move(model_config))
{
}
TRITONSERVER_Error*
ModelState::SetMarianConfigPath()
{
common::TritonJson::WriteBuffer buffer;
RETURN_IF_ERROR(model_config_.PrettyWrite(&buffer));
LOG_MESSAGE(
TRITONSERVER_LOG_INFO,
(std::string("model configuration:\n") + buffer.Contents()).c_str());
std::string config_filepath_str;
common::TritonJson::Value parameters;
if (model_config_.Find("parameters", &parameters)) {
common::TritonJson::Value config_filepath;
if (parameters.Find("config_filepath", &config_filepath)) {
RETURN_IF_ERROR(config_filepath.MemberAsString(
"string_value", &config_filepath_str)
);
LOG_MESSAGE(
TRITONSERVER_LOG_INFO,
(std::string("model config path is set to : ") + config_filepath_str)
.c_str()
);
}
}
// Set the Marian config path. NOTE: the layout below is hard-coded for
// the nlxseq2seq model in the AzureML inference environment (see the
// Limitation section of the README).
const char* model_dir = std::getenv("AZUREML_MODEL_DIR");
if (model_dir == nullptr) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_NOT_FOUND,
"environment variable AZUREML_MODEL_DIR is not set");
}
std::string config_path("/var/azureml-app/");
config_path.append(model_dir);
config_path.append("/nlxseq2seq/triton/nlxseq2seq/1/data/model/");
config_path.append(config_filepath_str);
marian_config_path_ = config_path;
return nullptr; // success
}
//
// ModelInstanceState
//
// State associated with a model instance. An object of this class is
// created and associated with each TRITONBACKEND_ModelInstance.
//
class ModelInstanceState {
public:
static TRITONSERVER_Error* Create(
TRITONBACKEND_ModelInstance* triton_model_instance,
void* marian, ModelInstanceState **state);
// Get the handle to the TRITONBACKEND model instance.
TRITONBACKEND_ModelInstance* TritonModelInstance()
{
return triton_model_instance_;
}
// Get the name, kind, device ID and marian instance of the instance.
const std::string& Name() const { return name_; }
TRITONSERVER_InstanceGroupKind Kind() const { return kind_; }
int32_t DeviceId() const { return device_id_; }
void* Marian() const { return marian_; }
private:
ModelInstanceState(
TRITONBACKEND_ModelInstance* triton_model_instance,
void* marian, const char* name,
const TRITONSERVER_InstanceGroupKind kind, const int32_t device_id);
TRITONBACKEND_ModelInstance* triton_model_instance_;
void* marian_;
const std::string name_;
const TRITONSERVER_InstanceGroupKind kind_;
const int32_t device_id_;
};
TRITONSERVER_Error*
ModelInstanceState::Create(
TRITONBACKEND_ModelInstance* triton_model_instance,
void* marian, ModelInstanceState** state)
{
const char* instance_name;
RETURN_IF_ERROR(
TRITONBACKEND_ModelInstanceName(triton_model_instance, &instance_name));
TRITONSERVER_InstanceGroupKind instance_kind;
RETURN_IF_ERROR(
TRITONBACKEND_ModelInstanceKind(triton_model_instance, &instance_kind));
int32_t instance_id;
RETURN_IF_ERROR(
TRITONBACKEND_ModelInstanceDeviceId(triton_model_instance, &instance_id));
*state = new ModelInstanceState(
triton_model_instance, marian, instance_name,
instance_kind, instance_id);
return nullptr; // success
}
ModelInstanceState::ModelInstanceState(
TRITONBACKEND_ModelInstance* triton_model_instance,
void* marian, const char* name,
const TRITONSERVER_InstanceGroupKind kind, const int32_t device_id)
: triton_model_instance_(triton_model_instance), marian_(marian),
name_(name), kind_(kind), device_id_(device_id)
{
}
/////////////
extern "C" {
TRITONSERVER_Error*
TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model)
{
ModelState* model_state;
RETURN_IF_ERROR(ModelState::Create(model, &model_state));
RETURN_IF_ERROR(model_state->SetMarianConfigPath());
RETURN_IF_ERROR(
TRITONBACKEND_ModelSetState(model, reinterpret_cast<void*>(model_state))
);
return nullptr; // success
}
TRITONSERVER_Error*
TRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model)
{
void* vstate;
RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vstate));
ModelState* model_state = reinterpret_cast<ModelState*>(vstate);
LOG_MESSAGE(
TRITONSERVER_LOG_INFO, "TRITONBACKEND_ModelFinalize: delete model state");
delete model_state;
return nullptr; // success
}
TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance)
{
TRITONBACKEND_Model* model;
RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceModel(instance, &model));
void* vmodelstate;
RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vmodelstate));
ModelState* model_state = reinterpret_cast<ModelState*>(vmodelstate);
std::string marian_config_path = model_state->MarianConfigPath();
int32_t device;
RETURN_IF_ERROR(
TRITONBACKEND_ModelInstanceDeviceId(instance, &device));
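// Create a Marian translation service for this instance through the
// cmarian C API (init() is declared in marian.h).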
void* marian_instance = init(const_cast<char*>(marian_config_path.c_str()), device);
ModelInstanceState* instance_state;
RETURN_IF_ERROR(
ModelInstanceState::Create(instance, marian_instance, &instance_state));
RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceSetState(
instance, reinterpret_cast<void*>(instance_state)));
return nullptr; // success
}
TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance)
{
void* vstate;
RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, &vstate));
ModelInstanceState* instance_state =
reinterpret_cast<ModelInstanceState*>(vstate);
LOG_MESSAGE(
TRITONSERVER_LOG_INFO,
"TRITONBACKEND_ModelInstanceFinalize: delete instance state");
delete instance_state;
return nullptr; // success
}
TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceExecute(
TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests,
const uint32_t request_count)
{
LOG_MESSAGE(
TRITONSERVER_LOG_INFO,
("Marian model instance executing " + std::to_string(request_count) +
" requests").c_str()
);
// 'responses' is initialized with the response objects below and
// if/when an error response is sent the corresponding entry in
// 'responses' is set to nullptr to indicate that that response has
// already been sent.
std::vector<TRITONBACKEND_Response*> responses;
responses.reserve(request_count);
// Create a single response object for each request. If something
// goes wrong when attempting to create the response objects just
// fail all of the requests by returning an error.
for (uint32_t r = 0; r < request_count; ++r) {
TRITONBACKEND_Request* request = requests[r];
TRITONBACKEND_Response* response;
RETURN_IF_ERROR(TRITONBACKEND_ResponseNew(&response, request));
responses.push_back(response);
}
// We will execute all the requests at the same time, and so there
// will be a single compute-start / compute-end time-range.
uint64_t total_batch_size = 0;
uint64_t exec_start_ns = 0;
SET_TIMESTAMP(exec_start_ns);
std::vector<TRITONBACKEND_Input*> request_input;
std::vector<int> request_batch_size;
std::vector<std::string> inputs;
std::string input_strings;
// Gather the input from each request and concatenate all requests into
// a single newline-delimited batch that Marian translates in one call.
for (uint32_t r = 0; r < request_count; ++r) {
TRITONBACKEND_Request* request = requests[r];
const char* input_name;
GUARDED_RESPOND_IF_ERROR(
responses, r,
TRITONBACKEND_RequestInputName(request, 0 /* index */, &input_name)
);
TRITONBACKEND_Input* input = nullptr;
GUARDED_RESPOND_IF_ERROR(
responses, r,
TRITONBACKEND_RequestInput(request, input_name, &input)
);
request_input.push_back(input);
// If an error response was sent while getting the input name
// or input then display an error message and move on
// to next request.
if (responses[r] == nullptr) {
LOG_MESSAGE(
TRITONSERVER_LOG_ERROR,
(std::string("request ") + std::to_string(r) +
": failed to read input or requested output name, error response sent")
.c_str()
);
continue;
}
// Get input buffer count.
uint32_t input_buffer_count;
GUARDED_RESPOND_IF_ERROR(
responses, r,
TRITONBACKEND_InputProperties(
input, nullptr /* input_name */, nullptr, nullptr,
nullptr, nullptr, &input_buffer_count
)
);
if (responses[r] == nullptr) {
LOG_MESSAGE(
TRITONSERVER_LOG_ERROR,
(std::string("request ") + std::to_string(r) +
": failed to read input properties, error response sent")
.c_str()
);
continue;
}
// Compose all the requests input to make a batch request,
// record the sentences count of each request for further process.
std::vector<char> content_buffer;
for (uint32_t b = 0; b < input_buffer_count; ++b) {
const void* input_buffer = nullptr;
uint64_t buffer_byte_size = 0;
TRITONSERVER_MemoryType input_memory_type = TRITONSERVER_MEMORY_CPU;
int64_t input_memory_type_id = 0;
GUARDED_RESPOND_IF_ERROR(
responses, r,
TRITONBACKEND_InputBuffer(
input, b, &input_buffer, &buffer_byte_size,
&input_memory_type, &input_memory_type_id
)
);
if ((responses[r] == nullptr) ||
(input_memory_type == TRITONSERVER_MEMORY_GPU)) {
GUARDED_RESPOND_IF_ERROR(
responses, r,
TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_UNSUPPORTED,
"failed to get input buffer in CPU memory"
)
);
}
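// Each BYTES input element carries a 4-byte length prefix before the raw
// characters; strip the framing bytes at both ends here and keep only the
// text (a matching 4-byte prefix is written back when the output is set).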
content_buffer.insert(
content_buffer.end(), reinterpret_cast<const char*>(input_buffer) + 4,
reinterpret_cast<const char*>(input_buffer) + buffer_byte_size - 4
);
}
std::string s(content_buffer.begin(), content_buffer.end());
int count = std::count(s.begin(), s.end(), '\n');
request_batch_size.push_back(count + 1);
inputs.push_back(s);
content_buffer.clear();
if (input_strings.empty()) {
input_strings = s;
} else {
input_strings.append("\n");
input_strings.append(s);
}
total_batch_size++;
}
// Operate on the entire batch of requests for improved performance.
void* vstate;
RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, &vstate));
ModelInstanceState* instance_state =
reinterpret_cast<ModelInstanceState*>(vstate);
void* marian = instance_state->Marian();
char* result = translate(marian, const_cast<char*>(input_strings.c_str()));
// Assign the results to the corresponding request.
char* pos = result;
for (uint32_t r = 0; r < request_count; ++r) {
int batch_size = request_batch_size[r];
uint64_t output_byte_size = 0;
char* output_content = nullptr;
// Find this request's output content. The translated sentences are
// contiguous inside 'result', so walk over 'batch_size' newline-separated
// segments and terminate the last one in place. (Rejoining the segments
// with strcat would clobber the text that follows, because every segment
// lives in the same buffer.)
output_content = pos;
while (batch_size > 0) {
char* p = strchr(pos, '\n');
if (batch_size == 1) {
// Last segment of this request: cut the result string here.
if (p != nullptr) {
*p = '\0';
pos = p + 1;
}
} else {
// Intermediate segment: keep its newline and move on.
pos = p + 1;
}
batch_size--;
}
output_byte_size = strlen(output_content);
TRITONBACKEND_Input* input = request_input[r];
const char* input_name;
TRITONSERVER_DataType input_datatype;
const int64_t* input_shape;
uint32_t input_dims_count;
uint64_t input_byte_size;
uint32_t input_buffer_count;
GUARDED_RESPOND_IF_ERROR(
responses, r,
TRITONBACKEND_InputProperties(
input, &input_name, &input_datatype, &input_shape,
&input_dims_count, &input_byte_size, &input_buffer_count
)
);
if (responses[r] == nullptr) {
LOG_MESSAGE(
TRITONSERVER_LOG_ERROR,
(std::string("request ") + std::to_string(r) +
": failed to read input properties, error response sent")
.c_str()
);
continue;
}
TRITONBACKEND_Request* request = requests[r];
const char* requested_output_name = nullptr;
GUARDED_RESPOND_IF_ERROR(
responses, r,
TRITONBACKEND_RequestOutputName(
request, 0 /* index */, &requested_output_name
)
);
// Create an output tensor in the response,
// input and output have same datatype and shape...
TRITONBACKEND_Response* response = responses[r];
TRITONBACKEND_Output* output;
GUARDED_RESPOND_IF_ERROR(
responses, r,
TRITONBACKEND_ResponseOutput(
response, &output, requested_output_name, input_datatype,
input_shape, input_dims_count
)
);
// Get the output buffer. We request a buffer in CPU memory
// but we have to handle any returned type. If we get back
// a buffer in GPU memory we just fail the request.
void* output_buffer;
TRITONSERVER_MemoryType output_memory_type = TRITONSERVER_MEMORY_CPU;
int64_t output_memory_type_id = 0;
GUARDED_RESPOND_IF_ERROR(
responses, r,
TRITONBACKEND_OutputBuffer(
output, &output_buffer, output_byte_size + 4,
&output_memory_type, &output_memory_type_id
)
);
if ((responses[r] == nullptr) ||
(output_memory_type == TRITONSERVER_MEMORY_GPU)) {
GUARDED_RESPOND_IF_ERROR(
responses, r,
TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_UNSUPPORTED,
"failed to create output buffer in CPU memory"
)
);
LOG_MESSAGE(
TRITONSERVER_LOG_ERROR,
(std::string("request ") + std::to_string(r) +
": failed to create output buffer in CPU memory, error request sent")
.c_str()
);
continue;
}
// Copy the Marian result into the output as a length-prefixed BYTES
// element: 4 bytes of length followed by the translated text.
memcpy(output_buffer, reinterpret_cast<char*>(&output_byte_size), 4);
memcpy(reinterpret_cast<char*>(output_buffer) + 4, output_content, output_byte_size);
// Send the response.
LOG_IF_ERROR(
TRITONBACKEND_ResponseSend(
responses[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL,
nullptr /* success */),
"failed sending response"
);
// Report statistics for the successful request.
uint64_t request_exec_end_ns = 0;
SET_TIMESTAMP(request_exec_end_ns);
LOG_IF_ERROR(
TRITONBACKEND_ModelInstanceReportStatistics(
instance_state->TritonModelInstance(), request, true /* success */,
exec_start_ns, exec_start_ns, request_exec_end_ns, request_exec_end_ns),
"failed reporting request statistics"
);
// Release each request as soon as we sent the corresponding response.
LOG_IF_ERROR(
TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL),
"failed releasing request"
);
}
// Report statistics for the entire batch of requests.
uint64_t exec_end_ns = 0;
SET_TIMESTAMP(exec_end_ns);
LOG_IF_ERROR(
TRITONBACKEND_ModelInstanceReportBatchStatistics(
instance_state->TritonModelInstance(), total_batch_size,
exec_start_ns, exec_start_ns, exec_end_ns, exec_end_ns),
"failed reporting batch request statistics"
);
// Release Marian result.
free_result(result);
return nullptr; // success
}
} // extern "C"
}}} // namespace triton::backend::marian

@@ -0,0 +1,11 @@
#pragma once
#ifdef _WIN32
#define DLLEXPORT extern "C" __declspec(dllexport)
#else
#define DLLEXPORT extern "C"
#endif
DLLEXPORT void* init(char* path, int device_num);
DLLEXPORT char* translate(void* marian, char* sent);
DLLEXPORT void free_result(char* to_free);
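
For reference, a minimal sketch of how a host program would drive this C API (the config path and device id below are illustrative assumptions, not values from this repo):

```cpp
#include <cstdio>

#include "marian.h"  // the cmarian C API declared above

int main() {
  // Load a Marian model on device 0 from a hypothetical YAML config.
  void* marian = init(const_cast<char*>("/path/to/config.yml"), 0);

  // Translate a newline-delimited batch of sentences; the returned
  // buffer is malloc'ed inside translate().
  char* result = translate(marian, const_cast<char*>("Hello world"));
  std::printf("%s\n", result);

  // Release the result buffer through the matching free function.
  free_result(result);
  return 0;
}
```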

@@ -0,0 +1,239 @@
add_subdirectory(3rd_party)
include_directories(.)
include_directories(3rd_party)
include_directories(3rd_party/SQLiteCpp/include)
include_directories(3rd_party/sentencepiece)
include_directories(3rd_party/fbgemm/include)
include_directories(${CMAKE_BINARY_DIR}/local/include)
add_library(marian STATIC
static/cmarian.cpp
common/aliases.cpp
common/fastopt.cpp
common/version.cpp
common/utils.cpp
common/logging.cpp
common/cli_helper.cpp
common/cli_wrapper.cpp
common/config.cpp
common/config_parser.cpp
common/config_validator.cpp
common/options.cpp
common/binary.cpp
common/io.cpp
common/filesystem.cpp
common/file_stream.cpp
common/types.cpp
data/alignment.cpp
data/vocab.cpp
data/default_vocab.cpp
data/sentencepiece_vocab.cpp
data/factored_vocab.cpp
data/corpus_base.cpp
data/corpus.cpp
data/corpus_sqlite.cpp
data/corpus_nbest.cpp
data/text_input.cpp
3rd_party/cnpy/cnpy.cpp
3rd_party/ExceptionWithCallStack.cpp
3rd_party/phf/phf.cc
tensors/backend.cpp
tensors/rand.cpp
tensors/tensor.cpp
tensors/cpu/device.cpp
tensors/cpu/prod.cpp
tensors/cpu/tensor_operators.cpp
tensors/cpu/sharp/int_gemm.cpp
tensors/cpu/sharp/avx_gemm.cpp
tensors/cpu/sharp/sse_gemm.cpp
tensors/cpu/fbgemm/packed_gemm.cpp
graph/expression_graph.cpp
graph/expression_operators.cpp
graph/node.cpp
graph/node_operators.cpp
graph/node_initializers.cpp
layers/convolution.cpp
layers/generic.cpp
layers/loss.cpp
layers/weight.cpp
rnn/cells.cpp
rnn/attention.cpp
optimizers/clippers.cpp
optimizers/optimizers.cpp
models/model_factory.cpp
models/encoder_decoder.cpp
models/transformer_stub.cpp
rescorer/score_collector.cpp
translator/history.cpp
translator/output_collector.cpp
translator/output_printer.cpp
translator/nth_element.cpp
translator/helpers.cpp
translator/scorers.cpp
training/graph_group_async.cpp
training/graph_group_async_drop.cpp
training/graph_group_sync.cpp
training/graph_group_singleton.cpp
training/graph_group_multinode.cpp
training/graph_group_multinode_sync.cpp
training/validator.cpp
training/communicator.cpp
training/scheduler.cpp
# this is only compiled to catch build errors, but not linked
microsoft/quicksand.cpp
$<TARGET_OBJECTS:libyaml-cpp>
$<TARGET_OBJECTS:SQLiteCpp>
$<TARGET_OBJECTS:pathie-cpp>
$<TARGET_OBJECTS:zlib>
)
target_compile_options(marian PUBLIC ${ALL_WARNINGS})
# Generate git_revision.h to reflect current git revision information
# [https://stackoverflow.com/questions/1435953/how-can-i-pass-git-sha1-to-compiler-as-definition-using-cmake]
# Git updates .git/logs/HEAD file whenever you pull or commit something.
# If Marian is checked out as a submodule in another repository,
# there's no .git directory in ${CMAKE_SOURCE_DIR}. Instead .git is a
# file that specifies the relative path from ${CMAKE_SOURCE_DIR} to
# ./git/modules/<MARIAN_ROOT_DIR> in the root of the repository that
# contains Marian as a submodule. We set MARIAN_GIT_DIR to the appropriate
# path, depending on whether ${CMAKE_SOURCE_DIR}/.git is a directory or file.
if(IS_DIRECTORY ${CMAKE_SOURCE_DIR}/.git) # not a submodule
set(MARIAN_GIT_DIR ${CMAKE_SOURCE_DIR}/.git)
else(IS_DIRECTORY ${CMAKE_SOURCE_DIR}/.git)
file(READ ${CMAKE_SOURCE_DIR}/.git MARIAN_GIT_DIR)
string(REGEX REPLACE "gitdir: (.*)\n" "\\1" MARIAN_GIT_DIR ${MARIAN_GIT_DIR})
get_filename_component(MARIAN_GIT_DIR "${CMAKE_SOURCE_DIR}/${MARIAN_GIT_DIR}" ABSOLUTE)
endif(IS_DIRECTORY ${CMAKE_SOURCE_DIR}/.git)
add_custom_command(OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/common/git_revision.h
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
COMMAND git log -1 --pretty=format:\#define\ GIT_REVISION\ \"\%h\ \%ai\" > ${CMAKE_CURRENT_SOURCE_DIR}/common/git_revision.h
DEPENDS ${MARIAN_GIT_DIR}/logs/HEAD
VERBATIM
)
add_custom_target(marian_version DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/common/git_revision.h)
add_dependencies(marian marian_version) # marian must depend on it so that it gets created first
# make sure all local dependencies are installed first before this is built
add_dependencies(marian 3rd_party_installs)
if(CUDA_FOUND)
cuda_add_library(marian_cuda
tensors/gpu/device.cu
tensors/gpu/algorithm.cu
tensors/gpu/prod.cpp
tensors/gpu/element.cu
tensors/gpu/add.cu
tensors/gpu/add_all.cu
tensors/gpu/tensor_operators.cu
tensors/gpu/cudnn_wrappers.cu
translator/nth_element.cu
translator/helpers.cu
training/gradient_dropping/gpu/dropper.cu
training/gradient_dropping/gpu/sparse_algorithm.cu
STATIC)
target_compile_options(marian_cuda PUBLIC ${ALL_WARNINGS})
# make sure all local dependencies are installed first before this is built
add_dependencies(marian_cuda 3rd_party_installs)
set_target_properties(marian_cuda PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif(CUDA_FOUND)
set_target_properties(marian PROPERTIES POSITION_INDEPENDENT_CODE ON)
set_target_properties(marian PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
set_target_properties(marian PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
add_executable(marian_train command/marian_main.cpp)
set_target_properties(marian_train PROPERTIES OUTPUT_NAME marian)
target_compile_options(marian_train PUBLIC ${ALL_WARNINGS})
add_executable(marian_decoder command/marian_decoder.cpp)
set_target_properties(marian_decoder PROPERTIES OUTPUT_NAME marian-decoder)
target_compile_options(marian_decoder PUBLIC ${ALL_WARNINGS})
add_executable(marian_scorer command/marian_scorer.cpp)
set_target_properties(marian_scorer PROPERTIES OUTPUT_NAME marian-scorer)
target_compile_options(marian_scorer PUBLIC ${ALL_WARNINGS})
add_executable(marian_vocab command/marian_vocab.cpp)
set_target_properties(marian_vocab PROPERTIES OUTPUT_NAME marian-vocab)
target_compile_options(marian_vocab PUBLIC ${ALL_WARNINGS})
add_executable(marian_conv command/marian_conv.cpp)
set_target_properties(marian_conv PROPERTIES OUTPUT_NAME marian-conv)
target_compile_options(marian_conv PUBLIC ${ALL_WARNINGS})
set(EXECUTABLES ${EXECUTABLES} marian_train marian_decoder marian_scorer marian_vocab marian_conv)
# marian.zip and marian.tgz
# This combines marian, marian_decoder in a single ZIP or TAR file for
# execution in MSFT internal tools FLO and Philly.
# For Philly submission, we need statically-linked versions to deal with
# library dependencies, so this target is only enabled for static builds.
if(USE_STATIC_LIBS)
add_custom_command(
OUTPUT "${CMAKE_BINARY_DIR}/marian.zip"
COMMAND zip -v -0 -j "${CMAKE_BINARY_DIR}/marian.zip"
"${CMAKE_BINARY_DIR}/marian"
"${CMAKE_BINARY_DIR}/marian-decoder"
"${CMAKE_BINARY_DIR}/marian-scorer"
"${CMAKE_BINARY_DIR}/marian-vocab"
"${CMAKE_BINARY_DIR}/marian-conv"
DEPENDS marian_train marian_decoder marian_scorer marian_vocab marian_conv)
add_custom_target(marian_zip DEPENDS "${CMAKE_BINARY_DIR}/marian.zip")
add_custom_command(
OUTPUT "${CMAKE_BINARY_DIR}/marian.tgz"
COMMAND tar -cvvzf "${CMAKE_BINARY_DIR}/marian.tgz" -C "${CMAKE_BINARY_DIR}"
"marian"
"marian-decoder"
"marian-scorer"
"marian-vocab"
"marian-conv"
DEPENDS marian_train marian_decoder marian_scorer marian_vocab marian_conv)
add_custom_target(marian_tgz DEPENDS "${CMAKE_BINARY_DIR}/marian.tgz")
add_custom_target(philly DEPENDS marian_tgz marian_zip)
endif(USE_STATIC_LIBS)
if(COMPILE_SERVER)
add_executable(marian_server command/marian_server.cpp)
set_target_properties(marian_server PROPERTIES OUTPUT_NAME marian-server)
target_compile_options(marian_server PUBLIC ${ALL_WARNINGS})
set(EXECUTABLES ${EXECUTABLES} marian_server)
endif(COMPILE_SERVER)
foreach(exec ${EXECUTABLES})
target_link_libraries(${exec} marian ${EXT_LIBS} ${EXT_LIBS} ${CMAKE_THREAD_LIBS_INIT})
if(CUDA_FOUND)
target_link_libraries(${exec} marian marian_cuda ${EXT_LIBS} ${CMAKE_THREAD_LIBS_INIT})
endif(CUDA_FOUND)
set_target_properties(${exec} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
endforeach(exec)
if(COMPILE_TESTS)
set(CATCH_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/3rd_party)
add_library(Catch INTERFACE)
target_include_directories(Catch INTERFACE ${CATCH_INCLUDE_DIR})
add_subdirectory(tests)
endif(COMPILE_TESTS)
if(COMPILE_EXAMPLES)
add_subdirectory(examples)
endif(COMPILE_EXAMPLES)

@@ -0,0 +1,74 @@
#include "marian.h"
#include "translator/beam_search.h"
#include "translator/translator.h"
#include "common/utils.h"
#include <cstdio>
#include <cstring>
#include <iostream>
#include <string>
#ifdef _WIN32
#define DLLEXPORT extern "C" __declspec(dllexport)
#else
#define DLLEXPORT extern "C"
#endif
using namespace marian;
class CMarian {
private:
Ptr<Options> options_;
char* configPath_;
Ptr<TranslateService<BeamSearch>> task_;
public:
CMarian(char* configPath, int device_num) : configPath_(configPath) {
int argc = 5;
char** argv = new char*[argc];
argv[0] = new char[20];
strcpy(argv[0], "./marian-decoder");
argv[1] = new char[12];
strcpy(argv[1], "--config");
argv[2] = configPath_;
argv[3] = new char[12];
strcpy(argv[3], "--devices");
std::string device_str = std::to_string(device_num);
argv[4] = new char[device_str.length() + 1]; // size from the string, not sizeof(int)
strcpy(argv[4], device_str.c_str());
options_ = marian::parseOptions(argc, argv, cli::mode::translation, true);
task_ = New<TranslateService<BeamSearch>>(options_);
delete[] argv[0];
delete[] argv[1];
delete[] argv[3];
delete[] argv[4];
delete[] argv; // argv[2] aliases configPath_, which the caller owns
}
/**
* @brief Exposes Marian translation capabilities based on the loaded YAML config associated with this class.
* @param sent The sentence to run inference on.
* @return A string delimited by ||| with newlines separating beams.
*/
char* translate(char* sent) {
std::string strSent(sent);
auto outputText = task_->run(strSent);
char* ret = (char*) malloc(outputText.length() + 1);
snprintf(ret, outputText.length() + 1, "%s", outputText.c_str());
return ret;
}
};
DLLEXPORT void* init(char* path, int device_num) {
CMarian* m = new CMarian(path, device_num);
return (void*)m;
}
DLLEXPORT char* translate(void* marian, char* sent) {
CMarian* m = static_cast<CMarian*>(marian);
return m->translate(sent);
}
DLLEXPORT void free_result(char* to_free) {
free(to_free);
}

@@ -0,0 +1,157 @@
#include "logging.h"
#include "common/config.h"
#include "spdlog/sinks/null_sink.h"
#include "3rd_party/ExceptionWithCallStack.h"
#include <time.h>
#include <stdlib.h>
#ifdef __unix__
#include <signal.h>
#endif
#ifdef _MSC_VER
#define noinline __declspec(noinline)
#else
#define noinline __attribute__((noinline))
#endif
namespace marian {
static bool throwExceptionOnAbort = false;
bool getThrowExceptionOnAbort() { return throwExceptionOnAbort; }
void setThrowExceptionOnAbort(bool doThrowExceptionOnAbort) { throwExceptionOnAbort = doThrowExceptionOnAbort; };
}
std::shared_ptr<spdlog::logger> createStderrLogger(const std::string& name,
const std::string& pattern,
const std::vector<std::string>& files,
bool quiet) {
std::vector<spdlog::sink_ptr> sinks;
auto stderr_sink = spdlog::sinks::stderr_sink_mt::instance();
if(!quiet)
sinks.push_back(stderr_sink);
for(auto&& file : files) {
auto file_sink = std::make_shared<spdlog::sinks::simple_file_sink_st>(file, true);
sinks.push_back(file_sink);
}
auto existing = spdlog::get(name);
if (existing) return existing;
auto logger = std::make_shared<spdlog::logger>(name, begin(sinks), end(sinks));
spdlog::register_logger(logger);
logger->set_pattern(pattern);
return logger;
}
bool setLoggingLevel(spdlog::logger& logger, std::string const level) {
if(level == "trace")
logger.set_level(spdlog::level::trace);
else if(level == "debug")
logger.set_level(spdlog::level::debug);
else if(level == "info")
logger.set_level(spdlog::level::info);
else if(level == "warn")
logger.set_level(spdlog::level::warn);
else if(level == "err" || level == "error")
logger.set_level(spdlog::level::err);
else if(level == "critical")
logger.set_level(spdlog::level::critical);
else if(level == "off")
logger.set_level(spdlog::level::off);
else {
logger.warn("Unknown log level '{}' for logger '{}'", level.c_str(), logger.name().c_str());
return false;
}
return true;
}
static void setErrorHandlers();
void createLoggers(const marian::Config* config) {
std::vector<std::string> generalLogs;
std::vector<std::string> validLogs;
if(config && !config->get<std::string>("log").empty()) {
generalLogs.push_back(config->get<std::string>("log"));
#ifndef _WIN32
// can't open the same file twice in Windows for some reason
validLogs.push_back(config->get<std::string>("log"));
#endif
}
// valid-log is available only for training
if(config && config->has("valid-log") && !config->get<std::string>("valid-log").empty()) {
validLogs.push_back(config->get<std::string>("valid-log"));
}
bool quiet = config && config->get<bool>("quiet");
Logger general{createStderrLogger("general", "[%Y-%m-%d %T] %v", generalLogs, quiet)};
Logger valid{createStderrLogger("valid", "[%Y-%m-%d %T] [valid] %v", validLogs, quiet)};
if(config && config->has("log-level")) {
std::string loglevel = config->get<std::string>("log-level");
if(!setLoggingLevel(*general, loglevel))
return;
setLoggingLevel(*valid, loglevel);
}
if(config && !config->get<std::string>("log-time-zone").empty()) {
std::string timezone = config->get<std::string>("log-time-zone");
#ifdef _WIN32
#define setenv(var, val, over) SetEnvironmentVariableA(var, val) // ignoring over flag
#endif
setenv("TZ", timezone.c_str(), true);
tzset();
}
setErrorHandlers();
}
static void unhandledException() {
if(std::current_exception()) {
try {
throw; // rethrow so that we can get access to what()
} catch(const std::exception& e) {
ABORT("Unhandled exception of type '{}': {}", typeid(e).name(), e.what());
} catch(...) {
ABORT("Unhandled exception");
}
} else {
std::abort();
}
}
static void setErrorHandlers() {
// call stack for unhandled exceptions
std::set_terminate(unhandledException);
#ifdef __unix__
// catch segfaults
struct sigaction sa = { 0 };
sigemptyset(&sa.sa_mask);
sa.sa_flags = SA_SIGINFO;
sa.sa_sigaction = [](int /*signal*/, siginfo_t*, void*) { ABORT("Segmentation fault"); };
sigaction(SIGSEGV, &sa, NULL);
sa.sa_sigaction = [](int /*signal*/, siginfo_t*, void*) { ABORT("Floating-point exception"); };
sigaction(SIGFPE, &sa, NULL);
#endif
}
// Modify the log pattern for the "general" logger to include the MPI rank.
// This is called upon initializing MPI. It is needed to associate error
// messages with ranks.
void switchtoMultinodeLogging(std::string nodeIdStr) {
Logger log = spdlog::get("general");
if(log)
log->set_pattern("[%Y-%m-%d %T " + nodeIdStr + ":%t] %v");
}
namespace marian {
std::string noinline getCallStack(size_t skipLevels) {
return ::Microsoft::MSR::CNTK::DebugUtil::GetCallStack(skipLevels + 2, /*makeFunctionNamesStandOut=*/true);
}
void noinline logCallStack(size_t skipLevels) {
checkedLog("general", "critical", getCallStack(skipLevels));
}
}