Unify options with marian

Service specific options are renamed to align with marian-option naming
as follows:

1. max-input-sentence-tokens -> max-length-break (marian already has
   max-length-crop; this option is analogous, except that an over-long
   sentence is broken into multiple sentences rather than truncated/cropped).
2. max-input-tokens -> mini-batch-words.
This commit is contained in:
Jerin Philip 2021-02-17 00:54:30 +00:00
parent 0296a38cd4
commit 69201ba44c
4 changed files with 9 additions and 15 deletions

View File

@ -7,8 +7,8 @@ namespace marian {
namespace bergamot {
Batcher::Batcher(Ptr<Options> options) {
miniBatchWords = options->get<int>("max-input-tokens");
bucket_.resize(options->get<int>("max-input-sentence-tokens") + 1);
miniBatchWords = options->get<int>("mini-batch-words");
bucket_.resize(options->get<int>("max-length-break") + 1);
ABORT_IF(
miniBatchWords < bucket_.size() - 1,
"max-input-tokens cannot be less than than max-input-sentence-tokens, "

View File

@ -16,14 +16,9 @@ inline marian::ConfigParser createConfigParser() {
"[paragraph, sentence, wrapped_text]", "paragraph");
cp.addOption<int>(
"--max-input-sentence-tokens", "Bergamot Options",
"--max-length-break", "Bergamot Options",
"Maximum input tokens to be processed in a single sentence.", 128);
cp.addOption<int>("--max-input-tokens", "Bergamot Options",
"Maximum input tokens in a batch. control for"
"Bergamot Queue",
1024);
return cp;
}

View File

@ -20,10 +20,9 @@ TextProcessor::TextProcessor(std::vector<Ptr<Vocab const>> &vocabs,
Ptr<Options> options)
: vocabs_(&vocabs), sentence_splitter_(options) {
max_input_sentence_tokens_ = options->get<int>("max-input-sentence-tokens");
max_input_sentence_tokens_ = max_input_sentence_tokens_ - 1;
ABORT_IF(max_input_sentence_tokens_ < 0,
"max-input-sentence-tokens cannot be < 0");
max_length_break_ = options->get<int>("max-length-break");
max_length_break_ = max_length_break_ - 1;
ABORT_IF(max_length_break_ < 0, "max-length-break cannot be < 0");
}
void TextProcessor::process(const string_view &query, Segments &segments,
@ -52,11 +51,11 @@ void TextProcessor::truncate(Segment &segment,
std::vector<string_view> &wordRanges,
Segments &segments, SentenceRanges &sourceRanges) {
for (size_t offset = 0; offset < segment.size();
offset += max_input_sentence_tokens_) {
offset += max_length_break_) {
auto start = segment.begin() + offset;
size_t left = segment.size() - offset;
size_t diff = std::min(max_input_sentence_tokens_, left);
size_t diff = std::min(max_length_break_, left);
segments.emplace_back(start, start + diff);
segments.back().push_back(sourceEosId());

View File

@ -41,7 +41,7 @@ private:
std::vector<Ptr<Vocab const>> *vocabs_;
SentenceSplitter sentence_splitter_;
size_t max_input_sentence_tokens_;
size_t max_length_break_;
};
} // namespace bergamot