Add Triton Marian backend running in AzureML Inference Environment (#749)

delong-coder 2020-11-05 06:29:36 +08:00 committed by GitHub
parent e274ac76b2
commit ca7a887aa7
12 changed files with 1383 additions and 0 deletions

@@ -0,0 +1,96 @@
# It is recommended to use a machine which supports CUDA to build this image.
FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 AS BUILDER
RUN apt-get update --fix-missing
RUN apt-get install -y curl git autoconf automake libtool curl make g++ unzip cmake build-essential cpio
RUN apt-get -y clean && \
rm -rf /var/lib/apt/lists/*
# Install zlib.
WORKDIR /
RUN git clone --no-checkout https://github.com/madler/zlib
WORKDIR /zlib
RUN git checkout tags/v1.2.10 && \
./configure && \
make install
# Install protobuf.
WORKDIR /
RUN git clone --no-checkout https://github.com/protocolbuffers/protobuf.git
WORKDIR /protobuf
RUN git checkout tags/v3.8.0 && \
git submodule update --init --recursive && \
./autogen.sh
RUN ./configure --disable-shared --prefix=/usr CFLAGS="-fPIC" CXXFLAGS="-fPIC" && \
make && \
make check && \
make install && \
ldconfig # refresh shared library cache.
# Install Intel MKL.
WORKDIR /
RUN curl --tlsv1.2 --output l_mkl_2020.0.166.tgz https://registrationcenter-download.intel.com/akdlm/irc_nas/tec/16318/l_mkl_2020.0.166.tgz
RUN tar zxvf l_mkl_2020.0.166.tgz
WORKDIR /l_mkl_2020.0.166
RUN ./install.sh --silent ./silent.cfg --install_dir /opt/intel/ --accept_eula
# Install Boost.
WORKDIR /
RUN git clone --recursive https://github.com/boostorg/boost --branch boost-1.72.0 /boost
WORKDIR /boost
RUN ./bootstrap.sh
RUN ./b2 install --prefix=/usr --with-system --with-thread --with-date_time --with-regex --with-serialization
# Install Marian.
WORKDIR /
RUN git clone --no-checkout https://github.com/marian-nmt/marian-dev
WORKDIR marian-dev
RUN git checkout youki/quantize-embedding
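# Pin the build to a fixed Marian revision.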
RUN git checkout dad48865fd3b7f1d7b891de81040f7651e824510
RUN mkdir src/static
RUN mkdir build
COPY src/cmarian.cpp /marian-dev/src/static
COPY src/logging.cpp /marian-dev/src/common
RUN rm src/CMakeLists.txt
COPY src/CMakeLists.txt /marian-dev/src
WORKDIR /marian-dev/build
RUN cmake .. -DCOMPILE_CPU=on -DCOMPILE_CUDA=on -DUSE_SENTENCEPIECE=on -DUSE_STATIC_LIBS=off -DCOMPILE_SERVER=off -DUSE_FBGEMM=on -DCUDA_cublas_device_LIBRARY=/usr/lib/x86_64-linux-gnu/libcublas.so
RUN make -j $(grep -c ^processor /proc/cpuinfo)
# Second stage: assemble the Triton custom backend on top of the Triton
# server image, reusing the cmarian static libraries built above.
FROM nvcr.io/nvidia/tritonserver:20.09-py3
RUN mkdir -p /marian-dev/build/src/3rd_party/sentencepiece/src
COPY --from=BUILDER /usr/lib/libprotobuf.a /usr/lib
COPY --from=BUILDER /usr/lib/libboost_system.a /usr/lib
COPY --from=BUILDER /marian-dev/build/src/3rd_party/fbgemm/libfbgemm.a /usr/lib
COPY --from=BUILDER /marian-dev/build/src/3rd_party/fbgemm/asmjit/libasmjit.a /usr/lib
COPY --from=BUILDER /marian-dev/build/src/3rd_party/sentencepiece/src/libsentencepiece_train.a /usr/lib
COPY --from=BUILDER /marian-dev/build/src/3rd_party/sentencepiece/src/libsentencepiece.a /usr/lib
COPY --from=BUILDER /marian-dev/build/libmarian.a /usr/lib/libcmarian.a
COPY --from=BUILDER /marian-dev/build/src/libmarian_cuda.a /usr/lib/libcmarian_cuda.a
# Build the Triton custom backend.
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && \
apt-get install -y --no-install-recommends \
software-properties-common \
build-essential \
git \
libopencv-dev \
libopencv-core-dev \
libssl-dev \
libtool \
pkg-config \
rapidjson-dev
# Install CMake 3.19.0-rc1.
RUN wget https://github.com/Kitware/CMake/releases/download/v3.19.0-rc1/cmake-3.19.0-rc1-Linux-x86_64.sh
RUN sh cmake-3.19.0-rc1-Linux-x86_64.sh --prefix=/usr/local --exclude-subdir --skip-license
ADD marian_backend /opt/tritonserver/marian_backend
WORKDIR /opt/tritonserver/marian_backend
RUN mkdir build
RUN cd build && \
cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install .. && \
make install

@@ -0,0 +1,36 @@
Triton-AML
======

*Triton-AML* is a Triton custom backend that runs Marian in the AzureML Inference Environment. It is one implementation of a [Triton Backend Shared Library](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/backend.html#backend-shared-library).

The backend is compiled against a static library of Marian at a specific version.

Layout:

- marian_backend: Triton Marian backend source code
- src: Modified source files and CMakeLists.txt for Marian
- Dockerfile: Compiles the backend against the static library of Marian
- build.sh: A simple shell script that runs the Dockerfile and extracts the generated libtriton_marian.so

## Usage

Run `./build.sh` to get the Triton Marian backend shared library.

General users can place libtriton_marian.so in either of the following locations:

- `<model_repository>/<model_name>/<version_directory>/libtriton_marian.so`
- `<model_repository>/<model_name>/libtriton_marian.so`

AzureML Inference team members can instead place it into the *aml-triton* base image at:

- `<backend_directory>/marian/libtriton_marian.so`

where `<backend_directory>` is /opt/tritonserver/backends by default.

## Make changes

To compile against another version of Marian, replace the `RUN git checkout youki/quantize-embedding` line in the Dockerfile, copy the new CMakeLists.txt over the old one, add src/cmarian.cpp to it, and adjust it so that it still builds a static library of Marian.

## Limitation

For now the backend only serves the *nlxseq2seq* model: `ModelState::SetMarianConfigPath` hard-codes the model layout under `/var/azureml-app/$AZUREML_MODEL_DIR`, so it must be changed before running other models with Marian.

@@ -0,0 +1,10 @@
#!/bin/sh
echo Building Triton Marian backend ...
docker build -t triton-marian-build .
echo Copying artifacts ...
docker container create --name extract triton-marian-build
docker container cp extract:/opt/tritonserver/marian_backend/build/libtriton_marian.so .
docker container rm -f extract

@@ -0,0 +1,156 @@
cmake_minimum_required(VERSION 3.17)
project(tritonmarianbackend LANGUAGES C CXX)
#
# Options
#
# Must include options required for this project as well as any
# projects included in this one by FetchContent.
#
# GPU support in the backend API is disabled by default: this backend
# exchanges its input/output tensors in CPU memory only, while Marian
# itself runs on GPU through the statically linked cmarian_cuda library.
#
option(TRITON_ENABLE_GPU "Enable GPU support in backend" OFF)
option(TRITON_ENABLE_STATS "Include statistics collections in backend" ON)
set(TRITON_COMMON_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/common repo")
set(TRITON_CORE_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/core repo")
set(TRITON_BACKEND_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/backend repo")
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif()
#
# Dependencies
#
# FetchContent's composability isn't very good. We must include the
# transitive closure of all repos so that we can override the tag.
#
include(FetchContent)
FetchContent_Declare(
repo-common
GIT_REPOSITORY https://github.com/triton-inference-server/common.git
GIT_TAG ${TRITON_COMMON_REPO_TAG}
GIT_SHALLOW ON
)
FetchContent_Declare(
repo-core
GIT_REPOSITORY https://github.com/triton-inference-server/core.git
GIT_TAG ${TRITON_CORE_REPO_TAG}
GIT_SHALLOW ON
)
FetchContent_Declare(
repo-backend
GIT_REPOSITORY https://github.com/triton-inference-server/backend.git
GIT_TAG ${TRITON_BACKEND_REPO_TAG}
GIT_SHALLOW ON
)
FetchContent_MakeAvailable(repo-common repo-core repo-backend)
#
# Shared library implementing the Triton Backend API
#
configure_file(src/libtriton_marian.ldscript libtriton_marian.ldscript COPYONLY)
add_library(
triton-marian-backend SHARED
src/marian.cc
)
add_library(
TritonMarianBackend::triton-marian-backend ALIAS triton-marian-backend
)
target_include_directories(
triton-marian-backend
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/src
)
target_compile_features(triton-marian-backend PRIVATE cxx_std_11)
target_compile_options(
triton-marian-backend PRIVATE
$<$<OR:$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:GNU>>:
-Wall -Wextra -Wno-unused-parameter -Wno-type-limits -Werror>
)
target_link_libraries(
triton-marian-backend
PRIVATE
triton-backend-utils # from repo-backend
triton-core-serverstub # from repo-core
)
target_link_libraries(
triton-marian-backend
PRIVATE # from marian environment
cmarian
cmarian_cuda
sentencepiece
sentencepiece_train
fbgemm
asmjit
protobuf
)
set_target_properties(
triton-marian-backend PROPERTIES
POSITION_INDEPENDENT_CODE ON
OUTPUT_NAME triton_marian
LINK_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libtriton_marian.ldscript
LINK_FLAGS "-Wl,--version-script libtriton_marian.ldscript"
)
#
# Install
#
include(GNUInstallDirs)
set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/TritonMarianBackend)
install(
TARGETS
triton-marian-backend
EXPORT
triton-marian-backend-targets
LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/marian
ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/marian
)
install(
EXPORT
triton-marian-backend-targets
FILE
TritonMarianBackendTargets.cmake
NAMESPACE
TritonMarianBackend::
DESTINATION
${INSTALL_CONFIGDIR}
)
include(CMakePackageConfigHelpers)
configure_package_config_file(
${CMAKE_CURRENT_LIST_DIR}/cmake/TritonMarianBackendConfig.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/TritonMarianBackendConfig.cmake
INSTALL_DESTINATION ${INSTALL_CONFIGDIR}
)
install(
FILES
${CMAKE_CURRENT_BINARY_DIR}/TritonMarianBackendConfig.cmake
DESTINATION ${INSTALL_CONFIGDIR}
)
#
# Export from build tree
#
export(
EXPORT triton-marian-backend-targets
FILE ${CMAKE_CURRENT_BINARY_DIR}/TritonMarianBackendTargets.cmake
NAMESPACE TritonMarianBackend::
)
export(PACKAGE TritonMarianBackend)

@@ -0,0 +1,16 @@
Use cmake to build and install in a local directory.
```
$ mkdir build
$ cd build
$ cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install ..
$ make install
```
The following required Triton repositories will be pulled and used in
the build. By default the "main" branch/tag is used for each repo, but
the CMake arguments listed below can override that (see the example
after the list).
* triton-inference-server/backend: -DTRITON_BACKEND_REPO_TAG=[tag]
* triton-inference-server/core: -DTRITON_CORE_REPO_TAG=[tag]
* triton-inference-server/common: -DTRITON_COMMON_REPO_TAG=[tag]
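For example, to pin all three repositories to matching release branches
(the branch name below is illustrative, not a value from this repo):
```
$ cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install \
        -DTRITON_COMMON_REPO_TAG=r20.09 \
        -DTRITON_CORE_REPO_TAG=r20.09 \
        -DTRITON_BACKEND_REPO_TAG=r20.09 ..
```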

@@ -0,0 +1,13 @@
include(CMakeFindDependencyMacro)
get_filename_component(
TRITONMARIANBACKEND_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH
)
list(APPEND CMAKE_MODULE_PATH ${TRITONMARIANBACKEND_CMAKE_DIR})
if(NOT TARGET TritonMarianBackend::triton-marian-backend)
include("${TRITONMARIANBACKEND_CMAKE_DIR}/TritonMarianBackendTargets.cmake")
endif()
set(TRITONMARIANBACKEND_LIBRARIES TritonMarianBackend::triton-marian-backend)

@@ -0,0 +1,5 @@
{
global:
TRITONBACKEND_*;
local: *;
};
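/* Only the TRITONBACKEND_* entry points are exported from the shared
   library; everything else, including the statically linked Marian
   symbols, stays local. */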

@@ -0,0 +1,570 @@
#include <algorithm>
#include <cstdlib>
#include <cstring>

#include "marian.h"
#include "triton/backend/backend_common.h"
namespace triton { namespace backend { namespace marian {
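// Send an error response for request IDX when expression X fails, then
// null out that slot in RESPONSES so later code skips the request.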
#define GUARDED_RESPOND_IF_ERROR(RESPONSES, IDX, X) \
do { \
if ((RESPONSES)[IDX] != nullptr) { \
TRITONSERVER_Error* err__ = (X); \
if (err__ != nullptr) { \
LOG_IF_ERROR( \
TRITONBACKEND_ResponseSend( \
(RESPONSES)[IDX], TRITONSERVER_RESPONSE_COMPLETE_FINAL, \
err__), \
"failed to send error response"); \
(RESPONSES)[IDX] = nullptr; \
TRITONSERVER_ErrorDelete(err__); \
} \
} \
} while (false)
//
// ModelState
//
// State associated with a model that is using this backend. An object
// of this class is created and associated with each
// TRITONBACKEND_Model.
//
class ModelState {
public:
static TRITONSERVER_Error* Create(
TRITONBACKEND_Model* triton_model, ModelState** state);
TRITONSERVER_Error* SetMarianConfigPath();
// Get the handle to the TRITONBACKEND model.
TRITONBACKEND_Model* TritonModel() { return triton_model_; }
// Get the name of the model.
const std::string& Name() const { return name_; }
// Get the Marian config path of the model.
const std::string& MarianConfigPath() const { return marian_config_path_; }
private:
ModelState(
TRITONBACKEND_Model* triton_model, const char* name,
common::TritonJson::Value&& model_config);
TRITONBACKEND_Model* triton_model_;
const std::string name_;
common::TritonJson::Value model_config_;
std::string marian_config_path_;
};
TRITONSERVER_Error*
ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state)
{
TRITONSERVER_Message* config_message;
RETURN_IF_ERROR(TRITONBACKEND_ModelConfig(
triton_model, 1 /* config_version */, &config_message));
// Get the model configuration as a json string from
// config_message, parse it with the TritonJson.
const char* buffer;
size_t byte_size;
RETURN_IF_ERROR(
TRITONSERVER_MessageSerializeToJson(config_message, &buffer, &byte_size));
common::TritonJson::Value model_config;
TRITONSERVER_Error* err = model_config.Parse(buffer, byte_size);
RETURN_IF_ERROR(TRITONSERVER_MessageDelete(config_message));
RETURN_IF_ERROR(err);
const char* model_name;
RETURN_IF_ERROR(TRITONBACKEND_ModelName(triton_model, &model_name));
*state = new ModelState(
triton_model, model_name, std::move(model_config));
return nullptr; // success
}
ModelState::ModelState(
TRITONBACKEND_Model* triton_model, const char* name,
common::TritonJson::Value&& model_config)
: triton_model_(triton_model), name_(name),
model_config_(std::move(model_config))
{
}
TRITONSERVER_Error*
ModelState::SetMarianConfigPath()
{
common::TritonJson::WriteBuffer buffer;
RETURN_IF_ERROR(model_config_.PrettyWrite(&buffer));
LOG_MESSAGE(
TRITONSERVER_LOG_INFO,
(std::string("model configuration:\n") + buffer.Contents()).c_str());
std::string config_filepath_str;
common::TritonJson::Value parameters;
if (model_config_.Find("parameters", &parameters)) {
common::TritonJson::Value config_filepath;
if (parameters.Find("config_filepath", &config_filepath)) {
RETURN_IF_ERROR(config_filepath.MemberAsString(
"string_value", &config_filepath_str)
);
LOG_MESSAGE(
TRITONSERVER_LOG_INFO,
(std::string("model config path is set to : ") + config_filepath_str)
.c_str()
);
}
}
// Set the Marian config path. NOTE: the layout below is hard-coded for
// the nlxseq2seq model in the AzureML inference environment (see the
// Limitation section of the README).
const char* model_dir = std::getenv("AZUREML_MODEL_DIR");
if (model_dir == nullptr) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_NOT_FOUND,
"environment variable AZUREML_MODEL_DIR is not set");
}
std::string config_path("/var/azureml-app/");
config_path.append(model_dir);
config_path.append("/nlxseq2seq/triton/nlxseq2seq/1/data/model/");
config_path.append(config_filepath_str);
marian_config_path_ = config_path;
return nullptr; // success
}
//
// ModelInstanceState
//
// State associated with a model instance. An object of this class is
// created and associated with each TRITONBACKEND_ModelInstance.
//
class ModelInstanceState {
public:
static TRITONSERVER_Error* Create(
TRITONBACKEND_ModelInstance* triton_model_instance,
void* marian, ModelInstanceState **state);
// Get the handle to the TRITONBACKEND model instance.
TRITONBACKEND_ModelInstance* TritonModelInstance()
{
return triton_model_instance_;
}
// Get the name, kind, device ID and marian instance of the instance.
const std::string& Name() const { return name_; }
TRITONSERVER_InstanceGroupKind Kind() const { return kind_; }
int32_t DeviceId() const { return device_id_; }
void* Marian() const { return marian_; }
private:
ModelInstanceState(
TRITONBACKEND_ModelInstance* triton_model_instance,
void* marian, const char* name,
const TRITONSERVER_InstanceGroupKind kind, const int32_t device_id);
TRITONBACKEND_ModelInstance* triton_model_instance_;
void* marian_;
const std::string name_;
const TRITONSERVER_InstanceGroupKind kind_;
const int32_t device_id_;
};
TRITONSERVER_Error*
ModelInstanceState::Create(
TRITONBACKEND_ModelInstance* triton_model_instance,
void* marian, ModelInstanceState** state)
{
const char* instance_name;
RETURN_IF_ERROR(
TRITONBACKEND_ModelInstanceName(triton_model_instance, &instance_name));
TRITONSERVER_InstanceGroupKind instance_kind;
RETURN_IF_ERROR(
TRITONBACKEND_ModelInstanceKind(triton_model_instance, &instance_kind));
int32_t instance_id;
RETURN_IF_ERROR(
TRITONBACKEND_ModelInstanceDeviceId(triton_model_instance, &instance_id));
*state = new ModelInstanceState(
triton_model_instance, marian, instance_name,
instance_kind, instance_id);
return nullptr; // success
}
ModelInstanceState::ModelInstanceState(
TRITONBACKEND_ModelInstance* triton_model_instance,
void* marian, const char* name,
const TRITONSERVER_InstanceGroupKind kind, const int32_t device_id)
: triton_model_instance_(triton_model_instance), marian_(marian),
name_(name), kind_(kind), device_id_(device_id)
{
}
/////////////
extern "C" {
TRITONSERVER_Error*
TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model)
{
ModelState* model_state;
RETURN_IF_ERROR(ModelState::Create(model, &model_state));
RETURN_IF_ERROR(model_state->SetMarianConfigPath());
RETURN_IF_ERROR(
TRITONBACKEND_ModelSetState(model, reinterpret_cast<void*>(model_state))
);
return nullptr; // success
}
TRITONSERVER_Error*
TRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model)
{
void* vstate;
RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vstate));
ModelState* model_state = reinterpret_cast<ModelState*>(vstate);
LOG_MESSAGE(
TRITONSERVER_LOG_INFO, "TRITONBACKEND_ModelFinalize: delete model state");
delete model_state;
return nullptr; // success
}
TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance)
{
TRITONBACKEND_Model* model;
RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceModel(instance, &model));
void* vmodelstate;
RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vmodelstate));
ModelState* model_state = reinterpret_cast<ModelState*>(vmodelstate);
std::string marian_config_path = model_state->MarianConfigPath();
int32_t device;
RETURN_IF_ERROR(
TRITONBACKEND_ModelInstanceDeviceId(instance, &device));
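// Create a Marian translation service for this instance through the
// cmarian C API (init() is declared in marian.h).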
void* marian_instance = init(const_cast<char*>(marian_config_path.c_str()), device);
ModelInstanceState* instance_state;
RETURN_IF_ERROR(
ModelInstanceState::Create(instance, marian_instance, &instance_state));
RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceSetState(
instance, reinterpret_cast<void*>(instance_state)));
return nullptr; // success
}
TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance)
{
void* vstate;
RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, &vstate));
ModelInstanceState* instance_state =
reinterpret_cast<ModelInstanceState*>(vstate);
LOG_MESSAGE(
TRITONSERVER_LOG_INFO,
"TRITONBACKEND_ModelInstanceFinalize: delete instance state");
delete instance_state;
return nullptr; // success
}
TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceExecute(
TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests,
const uint32_t request_count)
{
LOG_MESSAGE(
TRITONSERVER_LOG_INFO,
("Marian model instance executing " + std::to_string(request_count) +
" requests").c_str()
);
// 'responses' is initialized with the response objects below and
// if/when an error response is sent the corresponding entry in
// 'responses' is set to nullptr to indicate that that response has
// already been sent.
std::vector<TRITONBACKEND_Response*> responses;
responses.reserve(request_count);
// Create a single response object for each request. If something
// goes wrong when attempting to create the response objects just
// fail all of the requests by returning an error.
for (uint32_t r = 0; r < request_count; ++r) {
TRITONBACKEND_Request* request = requests[r];
TRITONBACKEND_Response* response;
RETURN_IF_ERROR(TRITONBACKEND_ResponseNew(&response, request));
responses.push_back(response);
}
// We will execute all the requests at the same time, and so there
// will be a single compute-start / compute-end time-range.
uint64_t total_batch_size = 0;
uint64_t exec_start_ns = 0;
SET_TIMESTAMP(exec_start_ns);
std::vector<TRITONBACKEND_Input*> request_input;
std::vector<int> request_batch_size;
std::vector<std::string> inputs;
std::string input_strings;
// Gather the input from each request and concatenate all requests into
// a single newline-delimited batch that Marian translates in one call.
for (uint32_t r = 0; r < request_count; ++r) {
TRITONBACKEND_Request* request = requests[r];
const char* input_name;
GUARDED_RESPOND_IF_ERROR(
responses, r,
TRITONBACKEND_RequestInputName(request, 0 /* index */, &input_name)
);
TRITONBACKEND_Input* input = nullptr;
GUARDED_RESPOND_IF_ERROR(
responses, r,
TRITONBACKEND_RequestInput(request, input_name, &input)
);
request_input.push_back(input);
// If an error response was sent while getting the input name
// or input then display an error message and move on
// to next request.
if (responses[r] == nullptr) {
LOG_MESSAGE(
TRITONSERVER_LOG_ERROR,
(std::string("request ") + std::to_string(r) +
": failed to read input or requested output name, error response sent")
.c_str()
);
continue;
}
// Get input buffer count.
uint32_t input_buffer_count;
GUARDED_RESPOND_IF_ERROR(
responses, r,
TRITONBACKEND_InputProperties(
input, nullptr /* input_name */, nullptr, nullptr,
nullptr, nullptr, &input_buffer_count
)
);
if (responses[r] == nullptr) {
LOG_MESSAGE(
TRITONSERVER_LOG_ERROR,
(std::string("request ") + std::to_string(r) +
": failed to read input properties, error response sent")
.c_str()
);
continue;
}
// Compose all the requests input to make a batch request,
// record the sentences count of each request for further process.
std::vector<char> content_buffer;
for (uint32_t b = 0; b < input_buffer_count; ++b) {
const void* input_buffer = nullptr;
uint64_t buffer_byte_size = 0;
TRITONSERVER_MemoryType input_memory_type = TRITONSERVER_MEMORY_CPU;
int64_t input_memory_type_id = 0;
GUARDED_RESPOND_IF_ERROR(
responses, r,
TRITONBACKEND_InputBuffer(
input, b, &input_buffer, &buffer_byte_size,
&input_memory_type, &input_memory_type_id
)
);
if ((responses[r] == nullptr) ||
(input_memory_type == TRITONSERVER_MEMORY_GPU)) {
GUARDED_RESPOND_IF_ERROR(
responses, r,
TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_UNSUPPORTED,
"failed to get input buffer in CPU memory"
)
);
}
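// Each BYTES input element carries a 4-byte length prefix before the raw
// characters; strip the framing bytes at both ends here and keep only the
// text (a matching 4-byte prefix is written back when the output is set).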
content_buffer.insert(
content_buffer.end(), reinterpret_cast<const char*>(input_buffer) + 4,
reinterpret_cast<const char*>(input_buffer) + buffer_byte_size - 4
);
}
std::string s(content_buffer.begin(), content_buffer.end());
int count = std::count(s.begin(), s.end(), '\n');
request_batch_size.push_back(count + 1);
inputs.push_back(s);
content_buffer.clear();
if (input_strings.empty()) {
input_strings = s;
} else {
input_strings.append("\n");
input_strings.append(s);
}
total_batch_size++;
}
// Operate on the entire batch of requests for improved performance.
void* vstate;
RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, &vstate));
ModelInstanceState* instance_state =
reinterpret_cast<ModelInstanceState*>(vstate);
void* marian = instance_state->Marian();
char* result = translate(marian, const_cast<char*>(input_strings.c_str()));
// Assign the results to the corresponding request.
char* pos = result;
for (uint32_t r = 0; r < request_count; ++r) {
int batch_size = request_batch_size[r];
uint64_t output_byte_size = 0;
char* output_content = nullptr;
// Find this request's output content. The translated sentences are
// contiguous inside 'result', so walk over 'batch_size' newline-separated
// segments and terminate the last one in place. (Rejoining the segments
// with strcat would clobber the text that follows, because every segment
// lives in the same buffer.)
output_content = pos;
while (batch_size > 0) {
char* p = strchr(pos, '\n');
if (batch_size == 1) {
// Last segment of this request: cut the result string here.
if (p != nullptr) {
*p = '\0';
pos = p + 1;
}
} else {
// Intermediate segment: keep its newline and move on.
pos = p + 1;
}
batch_size--;
}
output_byte_size = strlen(output_content);
TRITONBACKEND_Input* input = request_input[r];
const char* input_name;
TRITONSERVER_DataType input_datatype;
const int64_t* input_shape;
uint32_t input_dims_count;
uint64_t input_byte_size;
uint32_t input_buffer_count;
GUARDED_RESPOND_IF_ERROR(
responses, r,
TRITONBACKEND_InputProperties(
input, &input_name, &input_datatype, &input_shape,
&input_dims_count, &input_byte_size, &input_buffer_count
)
);
if (responses[r] == nullptr) {
LOG_MESSAGE(
TRITONSERVER_LOG_ERROR,
(std::string("request ") + std::to_string(r) +
": failed to read input properties, error response sent")
.c_str()
);
continue;
}
TRITONBACKEND_Request* request = requests[r];
const char* requested_output_name = nullptr;
GUARDED_RESPOND_IF_ERROR(
responses, r,
TRITONBACKEND_RequestOutputName(
request, 0 /* index */, &requested_output_name
)
);
// Create an output tensor in the response,
// input and output have same datatype and shape...
TRITONBACKEND_Response* response = responses[r];
TRITONBACKEND_Output* output;
GUARDED_RESPOND_IF_ERROR(
responses, r,
TRITONBACKEND_ResponseOutput(
response, &output, requested_output_name, input_datatype,
input_shape, input_dims_count
)
);
// Get the output buffer. We request a buffer in CPU memory
// but we have to handle any returned type. If we get back
// a buffer in GPU memory we just fail the request.
void* output_buffer;
TRITONSERVER_MemoryType output_memory_type = TRITONSERVER_MEMORY_CPU;
int64_t output_memory_type_id = 0;
GUARDED_RESPOND_IF_ERROR(
responses, r,
TRITONBACKEND_OutputBuffer(
output, &output_buffer, output_byte_size + 4,
&output_memory_type, &output_memory_type_id
)
);
if ((responses[r] == nullptr) ||
(output_memory_type == TRITONSERVER_MEMORY_GPU)) {
GUARDED_RESPOND_IF_ERROR(
responses, r,
TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_UNSUPPORTED,
"failed to create output buffer in CPU memory"
)
);
LOG_MESSAGE(
TRITONSERVER_LOG_ERROR,
(std::string("request ") + std::to_string(r) +
": failed to create output buffer in CPU memory, error request sent")
.c_str()
);
continue;
}
// Copy the Marian result into the output as a length-prefixed BYTES
// element: 4 bytes of length followed by the translated text.
memcpy(output_buffer, reinterpret_cast<char*>(&output_byte_size), 4);
memcpy(reinterpret_cast<char*>(output_buffer) + 4, output_content, output_byte_size);
// Send the response.
LOG_IF_ERROR(
TRITONBACKEND_ResponseSend(
responses[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL,
nullptr /* success */),
"failed sending response"
);
// Report statistics for the successful request.
uint64_t request_exec_end_ns = 0;
SET_TIMESTAMP(request_exec_end_ns);
LOG_IF_ERROR(
TRITONBACKEND_ModelInstanceReportStatistics(
instance_state->TritonModelInstance(), request, true /* success */,
exec_start_ns, exec_start_ns, request_exec_end_ns, request_exec_end_ns),
"failed reporting request statistics"
);
// Release each request as soon as we sent the corresponding response.
LOG_IF_ERROR(
TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL),
"failed releasing request"
);
}
// Report statistics for the entire batch of requests.
uint64_t exec_end_ns = 0;
SET_TIMESTAMP(exec_end_ns);
LOG_IF_ERROR(
TRITONBACKEND_ModelInstanceReportBatchStatistics(
instance_state->TritonModelInstance(), total_batch_size,
exec_start_ns, exec_start_ns, exec_end_ns, exec_end_ns),
"failed reporting batch request statistics"
);
// Release Marian result.
free_result(result);
return nullptr; // success
}
} // extern "C"
}}} // namespace triton::backend::marian

@@ -0,0 +1,11 @@
#pragma once
#ifdef _WIN32
#define DLLEXPORT extern "C" __declspec(dllexport)
#else
#define DLLEXPORT extern "C"
#endif
DLLEXPORT void* init(char* path, int device_num);
DLLEXPORT char* translate(void* marian, char* sent);
DLLEXPORT void free_result(char* to_free);
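
For reference, a minimal sketch of how a host program would drive this C API (the config path and device id below are illustrative assumptions, not values from this repo):

```cpp
#include <cstdio>

#include "marian.h"  // the cmarian C API declared above

int main() {
  // Load a Marian model on device 0 from a hypothetical YAML config.
  void* marian = init(const_cast<char*>("/path/to/config.yml"), 0);

  // Translate a newline-delimited batch of sentences; the returned
  // buffer is malloc'ed inside translate().
  char* result = translate(marian, const_cast<char*>("Hello world"));
  std::printf("%s\n", result);

  // Release the result buffer through the matching free function.
  free_result(result);
  return 0;
}
```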

@@ -0,0 +1,239 @@
add_subdirectory(3rd_party)
include_directories(.)
include_directories(3rd_party)
include_directories(3rd_party/SQLiteCpp/include)
include_directories(3rd_party/sentencepiece)
include_directories(3rd_party/fbgemm/include)
include_directories(${CMAKE_BINARY_DIR}/local/include)
add_library(marian STATIC
static/cmarian.cpp
common/aliases.cpp
common/fastopt.cpp
common/version.cpp
common/utils.cpp
common/logging.cpp
common/cli_helper.cpp
common/cli_wrapper.cpp
common/config.cpp
common/config_parser.cpp
common/config_validator.cpp
common/options.cpp
common/binary.cpp
common/io.cpp
common/filesystem.cpp
common/file_stream.cpp
common/types.cpp
data/alignment.cpp
data/vocab.cpp
data/default_vocab.cpp
data/sentencepiece_vocab.cpp
data/factored_vocab.cpp
data/corpus_base.cpp
data/corpus.cpp
data/corpus_sqlite.cpp
data/corpus_nbest.cpp
data/text_input.cpp
3rd_party/cnpy/cnpy.cpp
3rd_party/ExceptionWithCallStack.cpp
3rd_party/phf/phf.cc
tensors/backend.cpp
tensors/rand.cpp
tensors/tensor.cpp
tensors/cpu/device.cpp
tensors/cpu/prod.cpp
tensors/cpu/tensor_operators.cpp
tensors/cpu/sharp/int_gemm.cpp
tensors/cpu/sharp/avx_gemm.cpp
tensors/cpu/sharp/sse_gemm.cpp
tensors/cpu/fbgemm/packed_gemm.cpp
graph/expression_graph.cpp
graph/expression_operators.cpp
graph/node.cpp
graph/node_operators.cpp
graph/node_initializers.cpp
layers/convolution.cpp
layers/generic.cpp
layers/loss.cpp
layers/weight.cpp
rnn/cells.cpp
rnn/attention.cpp
optimizers/clippers.cpp
optimizers/optimizers.cpp
models/model_factory.cpp
models/encoder_decoder.cpp
models/transformer_stub.cpp
rescorer/score_collector.cpp
translator/history.cpp
translator/output_collector.cpp
translator/output_printer.cpp
translator/nth_element.cpp
translator/helpers.cpp
translator/scorers.cpp
training/graph_group_async.cpp
training/graph_group_async_drop.cpp
training/graph_group_sync.cpp
training/graph_group_singleton.cpp
training/graph_group_multinode.cpp
training/graph_group_multinode_sync.cpp
training/validator.cpp
training/communicator.cpp
training/scheduler.cpp
# this is only compiled to catch build errors, but not linked
microsoft/quicksand.cpp
$<TARGET_OBJECTS:libyaml-cpp>
$<TARGET_OBJECTS:SQLiteCpp>
$<TARGET_OBJECTS:pathie-cpp>
$<TARGET_OBJECTS:zlib>
)
target_compile_options(marian PUBLIC ${ALL_WARNINGS})
# Generate git_revision.h to reflect current git revision information
# [https://stackoverflow.com/questions/1435953/how-can-i-pass-git-sha1-to-compiler-as-definition-using-cmake]
# Git updates .git/logs/HEAD file whenever you pull or commit something.
# If Marian is checked out as a submodule in another repository,
# there's no .git directory in ${CMAKE_SOURCE_DIR}. Instead .git is a
# file that specifies the relative path from ${CMAKE_SOURCE_DIR} to
# ./git/modules/<MARIAN_ROOT_DIR> in the root of the repository that
# contains Marian as a submodule. We set MARIAN_GIT_DIR to the appropriate
# path, depending on whether ${CMAKE_SOURCE_DIR}/.git is a directory or file.
if(IS_DIRECTORY ${CMAKE_SOURCE_DIR}/.git) # not a submodule
set(MARIAN_GIT_DIR ${CMAKE_SOURCE_DIR}/.git)
else(IS_DIRECTORY ${CMAKE_SOURCE_DIR}/.git)
file(READ ${CMAKE_SOURCE_DIR}/.git MARIAN_GIT_DIR)
string(REGEX REPLACE "gitdir: (.*)\n" "\\1" MARIAN_GIT_DIR ${MARIAN_GIT_DIR})
get_filename_component(MARIAN_GIT_DIR "${CMAKE_SOURCE_DIR}/${MARIAN_GIT_DIR}" ABSOLUTE)
endif(IS_DIRECTORY ${CMAKE_SOURCE_DIR}/.git)
add_custom_command(OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/common/git_revision.h
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
COMMAND git log -1 --pretty=format:\#define\ GIT_REVISION\ \"\%h\ \%ai\" > ${CMAKE_CURRENT_SOURCE_DIR}/common/git_revision.h
DEPENDS ${MARIAN_GIT_DIR}/logs/HEAD
VERBATIM
)
add_custom_target(marian_version DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/common/git_revision.h)
add_dependencies(marian marian_version) # marian must depend on it so that it gets created first
# make sure all local dependencies are installed first before this is built
add_dependencies(marian 3rd_party_installs)
if(CUDA_FOUND)
cuda_add_library(marian_cuda
tensors/gpu/device.cu
tensors/gpu/algorithm.cu
tensors/gpu/prod.cpp
tensors/gpu/element.cu
tensors/gpu/add.cu
tensors/gpu/add_all.cu
tensors/gpu/tensor_operators.cu
tensors/gpu/cudnn_wrappers.cu
translator/nth_element.cu
translator/helpers.cu
training/gradient_dropping/gpu/dropper.cu
training/gradient_dropping/gpu/sparse_algorithm.cu
STATIC)
target_compile_options(marian_cuda PUBLIC ${ALL_WARNINGS})
# make sure all local dependencies are installed first before this is built
add_dependencies(marian_cuda 3rd_party_installs)
set_target_properties(marian_cuda PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif(CUDA_FOUND)
set_target_properties(marian PROPERTIES POSITION_INDEPENDENT_CODE ON)
set_target_properties(marian PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
set_target_properties(marian PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
add_executable(marian_train command/marian_main.cpp)
set_target_properties(marian_train PROPERTIES OUTPUT_NAME marian)
target_compile_options(marian_train PUBLIC ${ALL_WARNINGS})
add_executable(marian_decoder command/marian_decoder.cpp)
set_target_properties(marian_decoder PROPERTIES OUTPUT_NAME marian-decoder)
target_compile_options(marian_decoder PUBLIC ${ALL_WARNINGS})
add_executable(marian_scorer command/marian_scorer.cpp)
set_target_properties(marian_scorer PROPERTIES OUTPUT_NAME marian-scorer)
target_compile_options(marian_scorer PUBLIC ${ALL_WARNINGS})
add_executable(marian_vocab command/marian_vocab.cpp)
set_target_properties(marian_vocab PROPERTIES OUTPUT_NAME marian-vocab)
target_compile_options(marian_vocab PUBLIC ${ALL_WARNINGS})
add_executable(marian_conv command/marian_conv.cpp)
set_target_properties(marian_conv PROPERTIES OUTPUT_NAME marian-conv)
target_compile_options(marian_conv PUBLIC ${ALL_WARNINGS})
set(EXECUTABLES ${EXECUTABLES} marian_train marian_decoder marian_scorer marian_vocab marian_conv)
# marian.zip and marian.tgz
# This combines marian, marian_decoder in a single ZIP or TAR file for
# execution in MSFT internal tools FLO and Philly.
# For Philly submission, we need statically-linked versions to deal with
# library dependencies, so this target is only enabled for static builds.
if(USE_STATIC_LIBS)
add_custom_command(
OUTPUT "${CMAKE_BINARY_DIR}/marian.zip"
COMMAND zip -v -0 -j "${CMAKE_BINARY_DIR}/marian.zip"
"${CMAKE_BINARY_DIR}/marian"
"${CMAKE_BINARY_DIR}/marian-decoder"
"${CMAKE_BINARY_DIR}/marian-scorer"
"${CMAKE_BINARY_DIR}/marian-vocab"
"${CMAKE_BINARY_DIR}/marian-conv"
DEPENDS marian_train marian_decoder marian_scorer marian_vocab marian_conv)
add_custom_target(marian_zip DEPENDS "${CMAKE_BINARY_DIR}/marian.zip")
add_custom_command(
OUTPUT "${CMAKE_BINARY_DIR}/marian.tgz"
COMMAND tar -cvvzf "${CMAKE_BINARY_DIR}/marian.tgz" -C "${CMAKE_BINARY_DIR}"
"marian"
"marian-decoder"
"marian-scorer"
"marian-vocab"
"marian-conv"
DEPENDS marian_train marian_decoder marian_scorer marian_vocab marian_conv)
add_custom_target(marian_tgz DEPENDS "${CMAKE_BINARY_DIR}/marian.tgz")
add_custom_target(philly DEPENDS marian_tgz marian_zip)
endif(USE_STATIC_LIBS)
if(COMPILE_SERVER)
add_executable(marian_server command/marian_server.cpp)
set_target_properties(marian_server PROPERTIES OUTPUT_NAME marian-server)
target_compile_options(marian_server PUBLIC ${ALL_WARNINGS})
set(EXECUTABLES ${EXECUTABLES} marian_server)
endif(COMPILE_SERVER)
foreach(exec ${EXECUTABLES})
target_link_libraries(${exec} marian ${EXT_LIBS} ${EXT_LIBS} ${CMAKE_THREAD_LIBS_INIT})
if(CUDA_FOUND)
target_link_libraries(${exec} marian marian_cuda ${EXT_LIBS} ${CMAKE_THREAD_LIBS_INIT})
endif(CUDA_FOUND)
set_target_properties(${exec} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
endforeach(exec)
if(COMPILE_TESTS)
set(CATCH_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/3rd_party)
add_library(Catch INTERFACE)
target_include_directories(Catch INTERFACE ${CATCH_INCLUDE_DIR})
add_subdirectory(tests)
endif(COMPILE_TESTS)
if(COMPILE_EXAMPLES)
add_subdirectory(examples)
endif(COMPILE_EXAMPLES)

@@ -0,0 +1,74 @@
#include "marian.h"
#include "translator/beam_search.h"
#include "translator/translator.h"
#include "common/utils.h"
#include <cstdio>
#include <cstring>
#include <iostream>
#include <string>
#ifdef _WIN32
#define DLLEXPORT extern "C" __declspec(dllexport)
#else
#define DLLEXPORT extern "C"
#endif
using namespace marian;
class CMarian {
private:
Ptr<Options> options_;
char* configPath_;
Ptr<TranslateService<BeamSearch>> task_;
public:
CMarian(char* configPath, int device_num) : configPath_(configPath) {
int argc = 5;
char** argv = new char*[argc];
argv[0] = new char[20];
strcpy(argv[0], "./marian-decoder");
argv[1] = new char[12];
strcpy(argv[1], "--config");
argv[2] = configPath_;
argv[3] = new char[12];
strcpy(argv[3], "--devices");
std::string device_str = std::to_string(device_num);
argv[4] = new char[device_str.length() + 1]; // size from the string, not sizeof(int)
strcpy(argv[4], device_str.c_str());
options_ = marian::parseOptions(argc, argv, cli::mode::translation, true);
task_ = New<TranslateService<BeamSearch>>(options_);
delete[] argv[0];
delete[] argv[1];
delete[] argv[3];
delete[] argv[4];
delete[] argv; // argv[2] aliases configPath_, which the caller owns
}
/**
* @brief Exposes Marian translation capabilities based on the loaded YAML config associated with this class.
* @param sent The sentence to run inference on.
* @return A string delimited by ||| with newlines separating beams.
*/
char* translate(char* sent) {
std::string strSent(sent);
auto outputText = task_->run(strSent);
char* ret = (char*) malloc(outputText.length() + 1);
snprintf(ret, outputText.length() + 1, "%s", outputText.c_str());
return ret;
}
};
DLLEXPORT void* init(char* path, int device_num) {
CMarian* m = new CMarian(path, device_num);
return (void*)m;
}
DLLEXPORT char* translate(void* marian, char* sent) {
CMarian* m = static_cast<CMarian*>(marian);
return m->translate(sent);
}
DLLEXPORT void free_result(char* to_free) {
free(to_free);
}

@@ -0,0 +1,157 @@
#include "logging.h"
#include "common/config.h"
#include "spdlog/sinks/null_sink.h"
#include "3rd_party/ExceptionWithCallStack.h"
#include <time.h>
#include <stdlib.h>
#ifdef __unix__
#include <signal.h>
#endif
#ifdef _MSC_VER
#define noinline __declspec(noinline)
#else
#define noinline __attribute__((noinline))
#endif
namespace marian {
static bool throwExceptionOnAbort = false;
bool getThrowExceptionOnAbort() { return throwExceptionOnAbort; }
void setThrowExceptionOnAbort(bool doThrowExceptionOnAbort) { throwExceptionOnAbort = doThrowExceptionOnAbort; };
}
std::shared_ptr<spdlog::logger> createStderrLogger(const std::string& name,
const std::string& pattern,
const std::vector<std::string>& files,
bool quiet) {
std::vector<spdlog::sink_ptr> sinks;
auto stderr_sink = spdlog::sinks::stderr_sink_mt::instance();
if(!quiet)
sinks.push_back(stderr_sink);
for(auto&& file : files) {
auto file_sink = std::make_shared<spdlog::sinks::simple_file_sink_st>(file, true);
sinks.push_back(file_sink);
}
auto existing = spdlog::get(name);
if (existing) return existing;
auto logger = std::make_shared<spdlog::logger>(name, begin(sinks), end(sinks));
spdlog::register_logger(logger);
logger->set_pattern(pattern);
return logger;
}
bool setLoggingLevel(spdlog::logger& logger, std::string const level) {
if(level == "trace")
logger.set_level(spdlog::level::trace);
else if(level == "debug")
logger.set_level(spdlog::level::debug);
else if(level == "info")
logger.set_level(spdlog::level::info);
else if(level == "warn")
logger.set_level(spdlog::level::warn);
else if(level == "err" || level == "error")
logger.set_level(spdlog::level::err);
else if(level == "critical")
logger.set_level(spdlog::level::critical);
else if(level == "off")
logger.set_level(spdlog::level::off);
else {
logger.warn("Unknown log level '{}' for logger '{}'", level.c_str(), logger.name().c_str());
return false;
}
return true;
}
static void setErrorHandlers();
void createLoggers(const marian::Config* config) {
std::vector<std::string> generalLogs;
std::vector<std::string> validLogs;
if(config && !config->get<std::string>("log").empty()) {
generalLogs.push_back(config->get<std::string>("log"));
#ifndef _WIN32
// can't open the same file twice in Windows for some reason
validLogs.push_back(config->get<std::string>("log"));
#endif
}
// valid-log is available only for training
if(config && config->has("valid-log") && !config->get<std::string>("valid-log").empty()) {
validLogs.push_back(config->get<std::string>("valid-log"));
}
bool quiet = config && config->get<bool>("quiet");
Logger general{createStderrLogger("general", "[%Y-%m-%d %T] %v", generalLogs, quiet)};
Logger valid{createStderrLogger("valid", "[%Y-%m-%d %T] [valid] %v", validLogs, quiet)};
if(config && config->has("log-level")) {
std::string loglevel = config->get<std::string>("log-level");
if(!setLoggingLevel(*general, loglevel))
return;
setLoggingLevel(*valid, loglevel);
}
if(config && !config->get<std::string>("log-time-zone").empty()) {
std::string timezone = config->get<std::string>("log-time-zone");
#ifdef _WIN32
#define setenv(var, val, over) SetEnvironmentVariableA(var, val) // ignoring over flag
#endif
setenv("TZ", timezone.c_str(), true);
tzset();
}
setErrorHandlers();
}
static void unhandledException() {
if(std::current_exception()) {
try {
throw; // rethrow so that we can get access to what()
} catch(const std::exception& e) {
ABORT("Unhandled exception of type '{}': {}", typeid(e).name(), e.what());
} catch(...) {
ABORT("Unhandled exception");
}
} else {
std::abort();
}
}
static void setErrorHandlers() {
// call stack for unhandled exceptions
std::set_terminate(unhandledException);
#ifdef __unix__
// catch segfaults
struct sigaction sa = { 0 };
sigemptyset(&sa.sa_mask);
sa.sa_flags = SA_SIGINFO;
sa.sa_sigaction = [](int /*signal*/, siginfo_t*, void*) { ABORT("Segmentation fault"); };
sigaction(SIGSEGV, &sa, NULL);
sa.sa_sigaction = [](int /*signal*/, siginfo_t*, void*) { ABORT("Floating-point exception"); };
sigaction(SIGFPE, &sa, NULL);
#endif
}
// Modify the log pattern for the "general" logger to include the MPI rank.
// This is called upon initializing MPI. It is needed to associate error
// messages with ranks.
void switchtoMultinodeLogging(std::string nodeIdStr) {
Logger log = spdlog::get("general");
if(log)
log->set_pattern("[%Y-%m-%d %T " + nodeIdStr + ":%t] %v");
}
namespace marian {
std::string noinline getCallStack(size_t skipLevels) {
return ::Microsoft::MSR::CNTK::DebugUtil::GetCallStack(skipLevels + 2, /*makeFunctionNamesStandOut=*/true);
}
void noinline logCallStack(size_t skipLevels) {
checkedLog("general", "critical", getCallStack(skipLevels));
}
}