Fix failures when loading text shortlist (#154)

This commit is contained in:
Qianqian Zhu 2021-05-25 12:05:16 +01:00 committed by GitHub
parent 576afae6b3
commit 8bec1b7b6b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 15 additions and 10 deletions

View File

@ -19,7 +19,7 @@ int main(int argc, char *argv[]) {
// Prepare memories for bytearrays (including model, shortlist and vocabs)
marian::bergamot::MemoryBundle memoryBundle;
if (options->get<bool>("check-bytearray")) {
if (options->get<bool>("bytearray")) {
// Load legit values into bytearrays.
memoryBundle = marian::bergamot::getMemoryBundleFromConfig(options);
}

@ -1 +1 @@
Subproject commit 636af01c63f2f080a9e59e99b15ac4bfdaec76e1
Subproject commit 1b20a62f6614db371f59b97ff83262b8ebd235de

View File

@ -20,8 +20,6 @@ BatchTranslator::BatchTranslator(DeviceId const device, Vocabs &vocabs, Ptr<Opti
void BatchTranslator::initialize() {
// Initializes the graph.
bool check =
options_->get<bool>("check-bytearray", false); // Flag holds whether validate the bytearray (model and shortlist)
if (options_->hasAndNotEmpty("shortlist")) {
int srcIdx = 0, trgIdx = 1;
bool shared_vcb =
@ -30,7 +28,7 @@ void BatchTranslator::initialize() {
if (shortlistMemory_->size() > 0 && shortlistMemory_->begin() != nullptr) {
slgen_ = New<data::BinaryShortlistGenerator>(shortlistMemory_->begin(), shortlistMemory_->size(),
vocabs_.sources().front(), vocabs_.target(), srcIdx, trgIdx,
shared_vcb, check);
shared_vcb, options_->get<bool>("check-bytearray"));
} else {
// Changed to BinaryShortlistGenerator to enable loading binary shortlist file
// This class also supports text shortlist file
@ -51,7 +49,7 @@ void BatchTranslator::initialize() {
// from there, as opposed to from reading in the config file
ABORT_IF((uintptr_t)modelMemory_->begin() % 256 != 0,
"The provided memory is not aligned to 256 bytes and will crash when vector instructions are used on it.");
if (check) {
if (options_->get<bool>("check-bytearray")) {
ABORT_IF(!validateBinaryModel(*modelMemory_, modelMemory_->size()),
"The binary file is invalid. Incomplete or corrupted download?");
}

View File

@ -1,10 +1,10 @@
#include "byte_array_util.h"
#include <stdlib.h>
#include <iostream>
#include <cstdlib>
#include <memory>
#include "data/shortlist.h"
namespace marian {
namespace bergamot {
@ -102,6 +102,8 @@ AlignedMemory getModelMemoryFromConfig(marian::Ptr<marian::Options> options) {
/// Loads the shortlist file named by the first entry of the "shortlist" option into memory.
///
/// Aborts (via ABORT_IF) when the option holds no entries, or when
/// marian::data::isBinaryShortlist() rejects the first entry; only binary
/// shortlists may be loaded this way.
///
/// @param options configuration object queried for the "shortlist" string list.
/// @return the result of loadFileToMemory() for that path, called with 64 as its
///         second argument (presumably the byte alignment -- confirm against
///         loadFileToMemory's declaration).
AlignedMemory getShortlistMemoryFromConfig(marian::Ptr<marian::Options> options) {
  auto shortlistArgs = options->get<std::vector<std::string>>("shortlist");
  ABORT_IF(shortlistArgs.empty(), "No path to shortlist file is given.");

  // Only the first entry is used as the file path; any further entries are not read here.
  auto &shortlistPath = shortlistArgs.front();
  ABORT_IF(!marian::data::isBinaryShortlist(shortlistPath),
           "Loading non-binary shortlist file into memory is not supported");

  return loadFileToMemory(shortlistPath, 64);
}
@ -112,6 +114,8 @@ void getVocabsMemoryFromConfig(marian::Ptr<marian::Options> options,
vocabMemories.resize(vfiles.size());
std::unordered_map<std::string, std::shared_ptr<AlignedMemory>> vocabMap;
for (size_t i = 0; i < vfiles.size(); ++i) {
ABORT_IF(marian::filesystem::Path(vfiles[i]).extension() != marian::filesystem::Path(".spm"),
"Loading non-SentencePiece vocab files into memory is not supported");
auto m = vocabMap.emplace(std::make_pair(vfiles[i], std::shared_ptr<AlignedMemory>()));
if (m.second) {
m.first->second = std::make_shared<AlignedMemory>(loadFileToMemory(vfiles[i], 64));

View File

@ -20,8 +20,11 @@ inline marian::ConfigParser createConfigParser() {
cp.addOption<int>("--max-length-break", "Bergamot Options",
"Maximum input tokens to be processed in a single sentence.", 128);
cp.addOption<bool>("--bytearray", "Bergamot Options",
"Flag holds whether to construct service from bytearrays, only for testing purpose", false);
cp.addOption<bool>("--check-bytearray", "Bergamot Options",
"Flag holds whether to check the content of the bytearray (true by default)", true);
"Flag holds whether to check the content of the bytearrays (true by default)", true);
return cp;
}