Fix failures when loading text shortlist (#154)

This commit is contained in:
Qianqian Zhu 2021-05-25 12:05:16 +01:00 committed by GitHub
parent 576afae6b3
commit 8bec1b7b6b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 15 additions and 10 deletions

View File

@ -19,7 +19,7 @@ int main(int argc, char *argv[]) {
// Prepare memories for bytearrays (including model, shortlist and vocabs) // Prepare memories for bytearrays (including model, shortlist and vocabs)
marian::bergamot::MemoryBundle memoryBundle; marian::bergamot::MemoryBundle memoryBundle;
if (options->get<bool>("check-bytearray")) { if (options->get<bool>("bytearray")) {
// Load legit values into bytearrays. // Load legit values into bytearrays.
memoryBundle = marian::bergamot::getMemoryBundleFromConfig(options); memoryBundle = marian::bergamot::getMemoryBundleFromConfig(options);
} }

@ -1 +1 @@
Subproject commit 636af01c63f2f080a9e59e99b15ac4bfdaec76e1 Subproject commit 1b20a62f6614db371f59b97ff83262b8ebd235de

View File

@ -20,8 +20,6 @@ BatchTranslator::BatchTranslator(DeviceId const device, Vocabs &vocabs, Ptr<Opti
void BatchTranslator::initialize() { void BatchTranslator::initialize() {
// Initializes the graph. // Initializes the graph.
bool check =
options_->get<bool>("check-bytearray", false); // Flag holds whether validate the bytearray (model and shortlist)
if (options_->hasAndNotEmpty("shortlist")) { if (options_->hasAndNotEmpty("shortlist")) {
int srcIdx = 0, trgIdx = 1; int srcIdx = 0, trgIdx = 1;
bool shared_vcb = bool shared_vcb =
@ -30,7 +28,7 @@ void BatchTranslator::initialize() {
if (shortlistMemory_->size() > 0 && shortlistMemory_->begin() != nullptr) { if (shortlistMemory_->size() > 0 && shortlistMemory_->begin() != nullptr) {
slgen_ = New<data::BinaryShortlistGenerator>(shortlistMemory_->begin(), shortlistMemory_->size(), slgen_ = New<data::BinaryShortlistGenerator>(shortlistMemory_->begin(), shortlistMemory_->size(),
vocabs_.sources().front(), vocabs_.target(), srcIdx, trgIdx, vocabs_.sources().front(), vocabs_.target(), srcIdx, trgIdx,
shared_vcb, check); shared_vcb, options_->get<bool>("check-bytearray"));
} else { } else {
// Changed to BinaryShortlistGenerator to enable loading binary shortlist file // Changed to BinaryShortlistGenerator to enable loading binary shortlist file
// This class also supports text shortlist file // This class also supports text shortlist file
@ -51,7 +49,7 @@ void BatchTranslator::initialize() {
// from there, as opposed to from reading in the config file // from there, as opposed to from reading in the config file
ABORT_IF((uintptr_t)modelMemory_->begin() % 256 != 0, ABORT_IF((uintptr_t)modelMemory_->begin() % 256 != 0,
"The provided memory is not aligned to 256 bytes and will crash when vector instructions are used on it."); "The provided memory is not aligned to 256 bytes and will crash when vector instructions are used on it.");
if (check) { if (options_->get<bool>("check-bytearray")) {
ABORT_IF(!validateBinaryModel(*modelMemory_, modelMemory_->size()), ABORT_IF(!validateBinaryModel(*modelMemory_, modelMemory_->size()),
"The binary file is invalid. Incomplete or corrupted download?"); "The binary file is invalid. Incomplete or corrupted download?");
} }

View File

@ -1,10 +1,10 @@
#include "byte_array_util.h" #include "byte_array_util.h"
#include <stdlib.h> #include <cstdlib>
#include <iostream>
#include <memory> #include <memory>
#include "data/shortlist.h"
namespace marian { namespace marian {
namespace bergamot { namespace bergamot {
@ -102,6 +102,8 @@ AlignedMemory getModelMemoryFromConfig(marian::Ptr<marian::Options> options) {
AlignedMemory getShortlistMemoryFromConfig(marian::Ptr<marian::Options> options) { AlignedMemory getShortlistMemoryFromConfig(marian::Ptr<marian::Options> options) {
auto shortlist = options->get<std::vector<std::string>>("shortlist"); auto shortlist = options->get<std::vector<std::string>>("shortlist");
ABORT_IF(shortlist.empty(), "No path to shortlist file is given."); ABORT_IF(shortlist.empty(), "No path to shortlist file is given.");
ABORT_IF(!marian::data::isBinaryShortlist(shortlist[0]),
"Loading non-binary shortlist file into memory is not supported");
return loadFileToMemory(shortlist[0], 64); return loadFileToMemory(shortlist[0], 64);
} }
@ -112,6 +114,8 @@ void getVocabsMemoryFromConfig(marian::Ptr<marian::Options> options,
vocabMemories.resize(vfiles.size()); vocabMemories.resize(vfiles.size());
std::unordered_map<std::string, std::shared_ptr<AlignedMemory>> vocabMap; std::unordered_map<std::string, std::shared_ptr<AlignedMemory>> vocabMap;
for (size_t i = 0; i < vfiles.size(); ++i) { for (size_t i = 0; i < vfiles.size(); ++i) {
ABORT_IF(marian::filesystem::Path(vfiles[i]).extension() != marian::filesystem::Path(".spm"),
"Loading non-SentencePiece vocab files into memory is not supported");
auto m = vocabMap.emplace(std::make_pair(vfiles[i], std::shared_ptr<AlignedMemory>())); auto m = vocabMap.emplace(std::make_pair(vfiles[i], std::shared_ptr<AlignedMemory>()));
if (m.second) { if (m.second) {
m.first->second = std::make_shared<AlignedMemory>(loadFileToMemory(vfiles[i], 64)); m.first->second = std::make_shared<AlignedMemory>(loadFileToMemory(vfiles[i], 64));

View File

@ -20,8 +20,11 @@ inline marian::ConfigParser createConfigParser() {
cp.addOption<int>("--max-length-break", "Bergamot Options", cp.addOption<int>("--max-length-break", "Bergamot Options",
"Maximum input tokens to be processed in a single sentence.", 128); "Maximum input tokens to be processed in a single sentence.", 128);
cp.addOption<bool>("--bytearray", "Bergamot Options",
"Flag holds whether to construct service from bytearrays, only for testing purpose", false);
cp.addOption<bool>("--check-bytearray", "Bergamot Options", cp.addOption<bool>("--check-bytearray", "Bergamot Options",
"Flag holds whether to check the content of the bytearray (true by default)", true); "Flag holds whether to check the content of the bytearrays (true by default)", true);
return cp; return cp;
} }