Enable binary shortlist loading from bytebuffer (#69)

Contains a "hack" that must be removed immediately by editing TranslationModel; that change will come in a following commit.

* add shortlist_memory and update service-cli-bytearray test

* update marian-dev

* address review comments

* fix compilation and test failures and further address review comments

* small update on marian-dev (based on browsermt/marian-dev PR#28)

* update marian-dev with upstream

* code refactoring according to review

* fix marian-dev submodule conflicts

* switch MemoryGift to AlignedVector

* copy aligned.h from kpu/intgemm for AlignedVector

* changes based on memory ownership and AlignedVector

* fix BatchTranslator inits

* small fixes according to review comments

* update submodule marian-dev to master

* update submodule marian-dev with upstream

Co-authored-by: Kenneth Heafield <kpu@users.noreply.github.com>
This commit is contained in:
Qianqian Zhu 2021-04-01 19:36:07 +01:00 committed by GitHub
parent 2e5daac978
commit f654ab0f71
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 211 additions and 95 deletions

View File

@ -9,7 +9,7 @@
#include "TranslationModel.h"
#include "translator/parser.h"
#include "translator/byteArrayExample.h"
#include "translator/byte_array_util.h"
int main(int argc, char **argv) {
@ -19,9 +19,11 @@ int main(int argc, char **argv) {
auto options = configParser.parseOptions(argc, argv, true);
std::string config = options->asYamlString();
// Prepare model byte array
marian::bergamot::AlignedMemory modelBytes = marian::bergamot::getModelMemoryFromConfig(options);
// Route the config string to construct marian model through TranslationModel
void * model_bytes = bergamot::getBinaryModelFromConfig(options);
auto model = std::make_shared<TranslationModel>(config, model_bytes);
TranslationModel model(config, modelBytes.begin());
TranslationRequest translationRequest;
std::vector<std::string> texts;
@ -42,7 +44,7 @@ int main(int argc, char **argv) {
"Prague, the University of Sheffield, University of Tartu, and "
"Mozilla.");
auto results = model->translate(std::move(texts), translationRequest);
auto results = model.translate(std::move(texts), translationRequest);
// Resolve the future and get the actual result
//std::vector<TranslationResult> results = futureResults.get();
@ -61,8 +63,5 @@ int main(int argc, char **argv) {
std::cout << std::endl;
}
// Clear the memory used for the byte array
free(model_bytes); // Ideally, this should be done after the translation model has been gracefully shut down.
return 0;
}

View File

@ -9,14 +9,17 @@
#include "translator/parser.h"
#include "translator/response.h"
#include "translator/service.h"
#include "translator/byteArrayExample.h"
#include "translator/byte_array_util.h"
int main(int argc, char *argv[]) {
auto cp = marian::bergamot::createConfigParser();
auto options = cp.parseOptions(argc, argv, true);
void * model_bytes = bergamot::getBinaryModelFromConfig(options);
marian::bergamot::Service service(options, model_bytes);
// Prepare memories for model and shortlist
marian::bergamot::AlignedMemory modelBytes = marian::bergamot::getModelMemoryFromConfig(options);
marian::bergamot::AlignedMemory shortlistBytes = marian::bergamot::getShortlistMemoryFromConfig(options);
marian::bergamot::Service service(options, std::move(modelBytes), std::move(shortlistBytes));
// Read a large input text blob from stdin
std::ostringstream std_input;
@ -30,8 +33,5 @@ int main(int argc, char *argv[]) {
Response response = responseFuture.get();
std::cout << response.target.text << std::endl;
// Clear the memory used for the byte array
free(model_bytes); // Ideally, this should be done after the translation model has been gracefully shut down.
return 0;
}

View File

@ -1,7 +1,6 @@
add_library(bergamot-translator STATIC
TranslationModel.cpp
byteArrayExample.cpp
byte_array_util.cpp
text_processor.cpp
sentence_splitter.cpp
batch_translator.cpp

71
src/translator/aligned.h Normal file
View File

@ -0,0 +1,71 @@
#pragma once
#include <cstdlib>
#include <new>
#ifdef _MSC_VER
#include <malloc.h>
#endif
// Aligned simple vector.
namespace marian {
namespace bergamot {
/// Move-only owner of a raw buffer holding `size` elements of T whose first
/// byte lies on an `alignment`-byte boundary.
///
/// The storage comes straight from the platform's aligned allocator; elements
/// are neither constructed nor initialised, so T is expected to be a trivial
/// type (bytes, integers, pointers).
template <class T> class AlignedVector {
  public:
    /// Creates an empty vector that owns no memory.
    AlignedVector() noexcept : mem_(nullptr), size_(0) {}

    /// Allocates uninitialised room for `size` elements.
    /// @param size       number of elements (not bytes)
    /// @param alignment  required alignment of the buffer start, in bytes
    /// @throws std::bad_alloc if the byte count overflows or the allocator fails
    explicit AlignedVector(std::size_t size, std::size_t alignment = 64 /* CPU cares about this */)
      : mem_(nullptr), size_(size) {
      // Guard the size * sizeof(T) multiplication below against wrap-around.
      if (size > static_cast<std::size_t>(-1) / sizeof(T)) throw std::bad_alloc();
#ifdef _MSC_VER
      mem_ = static_cast<T*>(_aligned_malloc(size * sizeof(T), alignment));
      if (!mem_) throw std::bad_alloc();
#else
      // Go through a real void* rather than type-punning &mem_ to void**.
      void *raw = nullptr;
      if (posix_memalign(&raw, alignment, size * sizeof(T))) {
        throw std::bad_alloc();
      }
      mem_ = static_cast<T*>(raw);
#endif
    }

    /// Takes over the buffer of `from`, leaving `from` empty.
    AlignedVector(AlignedVector &&from) noexcept : mem_(from.mem_), size_(from.size_) {
      from.mem_ = nullptr;
      from.size_ = 0;
    }

    /// Releases the buffer currently held (if any) and takes over the buffer
    /// of `from`, leaving `from` empty. Self-assignment is a no-op.
    AlignedVector &operator=(AlignedVector &&from) noexcept {
      if (this != &from) {
        release();  // without this the previously owned buffer would leak
        mem_ = from.mem_;
        size_ = from.size_;
        from.mem_ = nullptr;
        from.size_ = 0;
      }
      return *this;
    }

    AlignedVector(const AlignedVector&) = delete;
    AlignedVector& operator=(const AlignedVector&) = delete;

    ~AlignedVector() { release(); }

    /// Number of elements (not bytes) the buffer was allocated for.
    std::size_t size() const { return size_; }

    /// Unchecked element access.
    T &operator[](std::size_t offset) { return mem_[offset]; }
    const T &operator[](std::size_t offset) const { return mem_[offset]; }

    T *begin() { return mem_; }
    const T *begin() const { return mem_; }
    T *end() { return mem_ + size_; }
    const T *end() const { return mem_ + size_; }

    /// Reinterprets the start of the buffer as a pointer to ReturnType.
    template <typename ReturnType>
    ReturnType *as() { return reinterpret_cast<ReturnType*>(mem_); }

    /// Const counterpart of as().
    template <typename ReturnType>
    const ReturnType *as() const { return reinterpret_cast<const ReturnType*>(mem_); }

  private:
    /// Returns the buffer to the allocator it came from; safe on nullptr.
    void release() noexcept {
#ifdef _MSC_VER
      _aligned_free(mem_);
#else
      std::free(mem_);
#endif
      mem_ = nullptr;
      size_ = 0;
    }

    T *mem_;
    std::size_t size_;
};
} // namespace bergamot
} // namespace marian

View File

@ -11,17 +11,29 @@ namespace bergamot {
BatchTranslator::BatchTranslator(DeviceId const device,
std::vector<Ptr<Vocab const>> &vocabs,
Ptr<Options> options,
const void * model_memory)
: device_(device), options_(options), vocabs_(&vocabs), model_memory_(model_memory) {}
const AlignedMemory* modelMemory,
const AlignedMemory* shortlistMemory)
: device_(device), options_(options), vocabs_(&vocabs),
modelMemory_(modelMemory), shortlistMemory_(shortlistMemory) {}
void BatchTranslator::initialize() {
// Initializes the graph.
if (options_->hasAndNotEmpty("shortlist")) {
int srcIdx = 0, trgIdx = 1;
bool shared_vcb = vocabs_->front() == vocabs_->back();
slgen_ = New<data::LexicalShortlistGenerator>(options_, vocabs_->front(),
vocabs_->back(), srcIdx,
trgIdx, shared_vcb);
if (shortlistMemory_->size() > 0 && shortlistMemory_->begin() != nullptr) {
bool check = options_->get<bool>("check-bytearray",true);
slgen_ = New<data::BinaryShortlistGenerator>(shortlistMemory_->begin(), shortlistMemory_->size(),
vocabs_->front(), vocabs_->back(),
srcIdx, trgIdx, shared_vcb, check);
}
else {
// Changed to BinaryShortlistGenerator to enable loading binary shortlist file
// This class also supports text shortlist file
slgen_ = New<data::BinaryShortlistGenerator>(options_, vocabs_->front(),
vocabs_->back(), srcIdx,
trgIdx, shared_vcb);
}
}
graph_ = New<ExpressionGraph>(true); // always optimize
@ -30,12 +42,10 @@ void BatchTranslator::initialize() {
graph_->setDevice(device_);
graph_->getBackend()->configureDevice(options_);
graph_->reserveWorkspaceMB(options_->get<size_t>("workspace"));
if (model_memory_) { // If we have provided a byte array that contains the model memory, we can initialise the model from there, as opposed to from reading in the config file
if ((uintptr_t)model_memory_ % 256 != 0) {
std::cerr << "The provided memory is not aligned to 256 bytes and will crash when vector instructions are used on it." << std::endl;
exit(1);
}
const std::vector<const void *> container = {model_memory_}; // Marian supports multiple models initialised in this manner hence std::vector. However we will only ever use 1 during decoding.
if (modelMemory_->size() > 0 && modelMemory_->begin() != nullptr) { // If we have provided a byte array that contains the model memory, we can initialise the model from there, as opposed to from reading in the config file
ABORT_IF((uintptr_t)modelMemory_->begin() % 256 != 0,
"The provided memory is not aligned to 256 bytes and will crash when vector instructions are used on it.");
const std::vector<const void *> container = {modelMemory_->begin()}; // Marian supports multiple models initialised in this manner hence std::vector. However we will only ever use 1 during decoding.
scorers_ = createScorers(options_, container);
} else {
scorers_ = createScorers(options_);

View File

@ -31,10 +31,11 @@ public:
* @param device DeviceId that performs translation. Could be CPU or GPU
* @param vocabs Vector that contains ptrs to two vocabs
* @param options Marian options object
* @param model_memory byte array (aligned to 64!!!) that contains the bytes of a model.bin. Provide a nullptr if not used.
* @param modelMemory byte array (aligned to 256!!!) that contains the bytes of a model.bin. Provide a nullptr if not used.
* @param shortlistMemory byte array of shortlist (aligned to 64)
*/
explicit BatchTranslator(DeviceId const device, std::vector<Ptr<Vocab const>> &vocabs,
Ptr<Options> options, const void * model_memory);
Ptr<Options> options, const AlignedMemory* modelMemory, const AlignedMemory* shortlistMemory);
// convenience function for logging. TODO(jerin)
std::string _identifier() { return "worker" + std::to_string(device_.no); }
@ -48,7 +49,8 @@ private:
Ptr<ExpressionGraph> graph_;
std::vector<Ptr<Scorer>> scorers_;
Ptr<data::ShortlistGenerator const> slgen_;
const void * model_memory_;
const AlignedMemory* modelMemory_{nullptr};
const AlignedMemory* shortlistMemory_{nullptr};
};
} // namespace bergamot

View File

@ -1,45 +0,0 @@
#include "byteArrayExample.h"
#include <stdlib.h>
#include <fstream>
#include <iostream>
namespace bergamot {
/// Reads the whole file at `path` into a freshly allocated buffer whose start
/// is aligned to 256 bytes.
///
/// @param path  file to read, opened in binary mode
/// @return pointer to the buffer; it was obtained from posix_memalign, so the
///         caller releases it with free()
///
/// Any failure (open, size query, allocation, read) prints a message to
/// stderr and terminates the process with exit status 1.
void * getBinaryFile(std::string path) {
  std::ifstream is (path, std::ifstream::binary);
  if (!is) {
    std::cerr << "Failed opening file stream: " << path << std::endl;
    std::exit(1);
  }

  // Determine the length of the file in bytes.
  is.seekg(0, is.end);
  const std::streamoff end = is.tellg();  // -1 when the position cannot be determined
  is.seekg(0, is.beg);
  if (end < 0 || !is) {
    std::cerr << "Failed to determine size of file: " << path << std::endl;
    std::exit(1);
  }
  const uint64_t length = static_cast<uint64_t>(end);

  void *result = nullptr;
  if (posix_memalign(&result, 256, length)) {
    std::cerr << "Failed to allocate aligned memory." << std::endl;
    std::exit(1);
  }

  // A short or failed read would otherwise hand back uninitialised bytes.
  if (length > 0 && !is.read(static_cast<char *>(result), static_cast<std::streamsize>(length))) {
    std::cerr << "Failed reading file: " << path << std::endl;
    free(result);
    std::exit(1);
  }
  return result;
}
/// Loads the single binary model named by the "models" option into memory.
///
/// @param options  parsed options; "models" must hold exactly one path and
///                 that path must have the extension ".bin"
/// @return the buffer produced by getBinaryFile() for that path
///
/// Prints an error to stderr and exits with status 1 when the number of
/// models is not one or the model is not a ".bin" file.
void * getBinaryModelFromConfig(marian::Ptr<marian::Options> options) {
  std::vector<std::string> models = options->get<std::vector<std::string>>("models");
  if (models.size() != 1) {
    std::cerr << "Loading multiple binary models is not supported for now as it is not necessary." << std::endl;
    std::exit(1);
  }

  // This check used to sit after the exit() above, inside the wrong branch,
  // and therefore never ran.
  marian::filesystem::Path modelPath(models[0]);
  if (modelPath.extension() != marian::filesystem::Path(".bin")) {
    std::cerr << "Non binary models cannot be loaded as a byte array." << std::endl;
    std::exit(1);
  }

  return getBinaryFile(models[0]);
}
} // namespace bergamot

View File

@ -1,8 +0,0 @@
#include "marian.h"
namespace bergamot {
/// Reads the file at `path` into a buffer allocated with posix_memalign at
/// 256-byte alignment and returns that buffer. The process exits with status 1
/// if the file cannot be opened or the allocation fails.
void * getBinaryFile(std::string path);
/// Reads the "models" option, which must contain exactly one path (otherwise
/// the process exits with status 1), and returns the buffer that
/// getBinaryFile() produces for that path.
void * getBinaryModelFromConfig(marian::Ptr<marian::Options> options);
} // namespace bergamot

View File

@ -0,0 +1,33 @@
#include "byte_array_util.h"
#include <stdlib.h>
#include <iostream>
namespace marian {
namespace bergamot {
/// Reads the whole file at `path` into a newly allocated AlignedMemory.
///
/// @param path       file to load
/// @param alignment  alignment, in bytes, requested for the start of the buffer
/// @return buffer sized from filesystem::fileSize(path) holding the file contents
///
/// Aborts (ABORT_IF) when the stream is bad after opening or when fewer bytes
/// than the file size could be read.
AlignedMemory loadFileToMemory(const std::string& path, size_t alignment){
  uint64_t fileSize = filesystem::fileSize(path);
  io::InputFileStream in(path);
  ABORT_IF(in.bad(), "Failed opening file stream: {}", path);
  AlignedMemory alignedMemory(fileSize, alignment);
  in.read(reinterpret_cast<char *>(alignedMemory.begin()), static_cast<std::streamsize>(fileSize));
  // Verify what the stream actually delivered. Comparing alignedMemory.size()
  // with fileSize can never fail, because the buffer was built from fileSize.
  ABORT_IF(static_cast<uint64_t>(in.gcount()) != fileSize, "Error reading file {}", path);
  return alignedMemory;
}
/// Loads the binary model referenced by the "models" option into memory
/// aligned to 256 bytes.
///
/// Aborts unless "models" holds exactly one path and that path ends in ".bin".
AlignedMemory getModelMemoryFromConfig(marian::Ptr<marian::Options> options){
  const std::vector<std::string> modelPaths = options->get<std::vector<std::string>>("models");
  ABORT_IF(modelPaths.size() != 1, "Loading multiple binary models is not supported for now as it is not necessary.");

  const std::string& onlyModel = modelPaths.front();
  marian::filesystem::Path onlyModelPath(onlyModel);
  marian::filesystem::Path binaryExtension(".bin");
  ABORT_IF(onlyModelPath.extension() != binaryExtension, "The file of binary model should end with .bin");

  return loadFileToMemory(onlyModel, 256);
}
/// Loads the file named by the first entry of the "shortlist" option into
/// memory aligned to 64 bytes. Aborts when the option holds no entries.
AlignedMemory getShortlistMemoryFromConfig(marian::Ptr<marian::Options> options){
  const std::vector<std::string> shortlistArgs = options->get<std::vector<std::string>>("shortlist");
  ABORT_IF(shortlistArgs.empty(), "No path to shortlist file is given.");

  const std::string& shortlistPath = shortlistArgs.front();
  return loadFileToMemory(shortlistPath, 64);
}
} // namespace bergamot
} // namespace marian

View File

@ -0,0 +1,12 @@
#include "marian.h"
#include "definitions.h"
namespace marian {
namespace bergamot {
/// Reads the file at `path` into a newly allocated buffer whose start is
/// aligned to `alignment` bytes, and returns that buffer.
AlignedMemory loadFileToMemory(const std::string& path, size_t alignment);
/// Loads the single ".bin" model listed under the "models" option into memory
/// aligned to 256 bytes. Aborts if there is not exactly one model or its
/// extension is not ".bin".
AlignedMemory getModelMemoryFromConfig(marian::Ptr<marian::Options> options);
/// Loads the first file listed under the "shortlist" option into memory
/// aligned to 64 bytes. Aborts if the option holds no entries.
AlignedMemory getShortlistMemoryFromConfig(marian::Ptr<marian::Options> options);
} // namespace bergamot
} // namespace marian

View File

@ -3,6 +3,7 @@
#include "data/types.h"
#include "data/vocab_base.h"
#include "aligned.h"
#include <vector>
namespace marian {
@ -21,6 +22,9 @@ template <class T, typename... Args> UPtr<T> UNew(Args &&... args) {
template <class T> UPtr<T> UNew(UPtr<T> p) { return UPtr<T>(p); }
/// Shortcut to AlignedVector<char> for byte arrays.
/// The element type is a single byte so that the size passed to the
/// constructor and returned by size() is a byte count. With a pointer element
/// type every buffer was allocated sizeof(void*) times larger than requested.
typedef AlignedVector<char> AlignedMemory;
} // namespace bergamot
} // namespace marian

View File

@ -23,7 +23,11 @@ inline marian::ConfigParser createConfigParser() {
"--max-length-break", "Bergamot Options",
"Maximum input tokens to be processed in a single sentence.", 128);
return cp;
cp.addOption<bool>(
"--check-bytearray", "Bergamot Options",
"Flag holds whether to check the content of the bytearray (true by default)", true);
return cp;
}
inline std::shared_ptr<marian::Options>

View File

@ -28,10 +28,11 @@ loadVocabularies(marian::Ptr<marian::Options> options) {
namespace marian {
namespace bergamot {
Service::Service(Ptr<Options> options, const void *model_memory)
Service::Service(Ptr<Options> options, AlignedMemory modelMemory, AlignedMemory shortlistMemory)
: requestId_(0), vocabs_(std::move(loadVocabularies(options))),
text_processor_(vocabs_, options), batcher_(options),
numWorkers_(options->get<int>("cpu-threads")), model_memory_(model_memory)
numWorkers_(options->get<int>("cpu-threads")),
modelMemory_(std::move(modelMemory)), shortlistMemory_(std::move(shortlistMemory))
#ifndef WASM_COMPATIBLE_SOURCE
// 0 elements in PCQueue is illegal and can lead to failures. Adding a
// guard to have at least one entry allocated. In the single-threaded
@ -54,7 +55,7 @@ void Service::build_translators(Ptr<Options> options, size_t numTranslators) {
translators_.reserve(numTranslators);
for (size_t cpuId = 0; cpuId < numTranslators; cpuId++) {
marian::DeviceId deviceId(cpuId, DeviceType::cpu);
translators_.emplace_back(deviceId, vocabs_, options, model_memory_);
translators_.emplace_back(deviceId, vocabs_, options, &modelMemory_, &shortlistMemory_);
}
}

View File

@ -18,6 +18,33 @@
namespace marian {
namespace bergamot {
// Hack code to construct AlignedMemory from void*
/// Copies a model passed as a raw pointer into an owning AlignedMemory.
///
/// @param modelMemory  start of the model bytes, or nullptr when no model is given
/// @return an empty AlignedMemory for nullptr, otherwise a 256-byte-aligned
///         copy of the first 73837568 bytes at `modelMemory`
///
/// The byte count is hard-coded because the raw-pointer interface carries no
/// size; the caller must supply at least that many readable bytes.
inline AlignedMemory hackModel(const void* modelMemory) {
  if(modelMemory == nullptr){
    return AlignedMemory();
  }
  // Here is a hack to make TranslationModel works
  size_t modelMemorySize = 73837568; // Hack: model memory size should be changed to actual model size
  // Request 256-byte alignment explicitly: BatchTranslator::initialize aborts
  // on model memory that is not 256-byte aligned, and the default alignment
  // of AlignedMemory is only 64.
  AlignedMemory alignedMemory(modelMemorySize, 256);
  memcpy(alignedMemory.begin(), modelMemory, modelMemorySize);
  return alignedMemory;
}
/// Copies a binary shortlist passed as a raw pointer into an owning
/// AlignedMemory.
///
/// @param shortlistMemory  start of the shortlist bytes, or nullptr when none is given
/// @return an empty AlignedMemory for nullptr, otherwise a copy whose length is
///         derived from the 64-bit words at index 4 and 5 of the input
inline AlignedMemory hackShortLis(const void* shortlistMemory) {
  // Nothing supplied: hand back an empty buffer.
  if (shortlistMemory == nullptr) {
    return AlignedMemory();
  }

  // Hack: recover the total byte count from the leading 64-bit words, since
  // the raw-pointer interface carries no size.
  const uint64_t* words = static_cast<const uint64_t*>(shortlistMemory);
  size_t byteCount = sizeof(uint64_t) * (6 + words[4])
                   + sizeof(uint32_t) * words[5];

  // Hack to keep TranslationModel working: copy into owned, aligned storage.
  AlignedMemory copy(byteCount);
  memcpy(copy.begin(), shortlistMemory, byteCount);
  return copy;
}
/// Service exposes methods to translate an incoming blob of text to the
/// Consumer of bergamot API.
///
@ -38,18 +65,22 @@ class Service {
public:
/// @param options Marian options object
/// @param model_memory byte array (aligned to 64!!!) that contains the bytes
/// @param modelMemory byte array (aligned to 256!!!) that contains the bytes
/// of a model.bin. Optional, defaults to nullptr when not used
explicit Service(Ptr<Options> options, const void *model_memory = nullptr);
/// @param shortlistMemory byte array of shortlist (aligned to 64)
explicit Service(Ptr<Options> options, AlignedMemory modelMemory, AlignedMemory shortlistMemory);
/// Construct Service from a string configuration.
explicit Service(Ptr<Options> options) : Service(options, AlignedMemory(), AlignedMemory()){}
/// Construct Service from a string configuration.
/// @param [in] config string parsable as YAML expected to adhere with marian
/// config
/// @param [in] model_memory byte array (aligned to 64!!!) that contains the
/// @param [in] model_memory byte array (aligned to 256!!!) that contains the
/// bytes of a model.bin. Optional, defaults to nullptr when not used
/// @param [in] shortlistMemory byte array of shortlist (aligned to 64)
explicit Service(const std::string &config,
const void *model_memory = nullptr)
: Service(parseOptions(config), model_memory) {}
const void* modelMemory = nullptr, const void* shortlistMemory = nullptr)
: Service(parseOptions(config), hackModel(modelMemory), hackShortLis(shortlistMemory)) {}
/// Explicit destructor to clean up after any threads initialized in
/// asynchronous operation mode.
@ -85,13 +116,16 @@ private:
void async_translate();
/// Number of workers to launch.
size_t numWorkers_; // ORDER DEPENDENCY (pcqueue_)
const void *model_memory_; /// Model memory to load model passed as bytes.
size_t numWorkers_; // ORDER DEPENDENCY (pcqueue_)
/// Model memory to load model passed as bytes.
AlignedMemory modelMemory_; // ORDER DEPENDENCY (translators_)
/// Shortlist memory passed as bytes.
AlignedMemory shortlistMemory_; // ORDER DEPENDENCY (translators_)
/// Holds instances of batch translators, just one in case
/// of single-threaded application, numWorkers_ in case of multithreaded
/// setting.
std::vector<BatchTranslator> translators_;
std::vector<BatchTranslator> translators_; // ORDER DEPENDENCY (modelMemory_, shortlistMemory_)
/// Stores requestId of active request. Used to establish
/// ordering among requests and logging/book-keeping.