Add option to print word-level scores (#501)

* Add printing word level scores

* Add option --no-spm-decode

* Fix precision for word-level scores

* Fix getting the no-spm-decode option

* Update CHANGELOG

* Add comments and refactor

* Print word-level scores next to other scores in an n-best list

* Remove --word-scores from marian-scorer

* Add --no-spm-decode only if compiled with SentencePiece

* Add comments

* Printing word scores before model scores in n-best lists

* Update VERSION

Co-authored-by: Marcin Junczys-Dowmunt <Marcin.JunczysDowmunt@microsoft.com>
This commit is contained in:
Roman Grundkiewicz 2020-01-04 03:10:21 +00:00 committed by Marcin Junczys-Dowmunt
parent 2bd986d8a7
commit 24f062cd27
8 changed files with 94 additions and 42 deletions

View File

@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
## [Unreleased]
### Added
- An option to print word-level translation scores
- An option to turn off automatic detokenization from SentencePiece
- Separate quantization types for 8-bit FBGEMM for AVX2 and AVX512
- Sequence-level unlikelihood training
- Allow file name templated valid-translation-output files

View File

@ -1 +1 @@
v1.8.35
v1.8.36

View File

@ -545,6 +545,8 @@ void ConfigParser::addOptionsValidation(cli::CLIWrapper& cli) {
"Allow unknown words to appear in output");
cli.add<bool>("--n-best",
"Generate n-best list");
cli.add<bool>("--word-scores",
"Print word-level scores");
// efficiency options
cli.add<int>("--valid-mini-batch",
@ -607,6 +609,12 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) {
cli.add<std::string>("--alignment",
"Return word alignment. Possible values: 0.0-1.0, hard, soft")
->implicit_val("1");
cli.add<bool>("--word-scores",
"Print word-level scores");
#ifdef USE_SENTENCEPIECE
cli.add<bool>("--no-spm-decode",
"Keep the output segmented into SentencePiece subwords");
#endif
addSuboptionsDevices(cli);
addSuboptionsInputLength(cli);

View File

@ -8,9 +8,7 @@ namespace data {
WordAlignment::WordAlignment() {}
WordAlignment::WordAlignment(
const std::vector<Point>& align)
: data_(align) {}
WordAlignment::WordAlignment(const std::vector<Point>& align) : data_(align) {}
WordAlignment::WordAlignment(const std::string& line) {
std::vector<std::string> atok = utils::splitAny(line, " -");

View File

@ -36,17 +36,18 @@ private:
std::mt19937 generator_;
std::uniform_int_distribution<int> randInt_; // from 0 to INT_MAX
// Keeps sentences segmented into subword units
bool keepEncoded_{false};
// Sample from one file, based on first algorithm from:
// https://en.wikipedia.org/wiki/Reservoir_sampling
void reservoirSampling(std::vector<std::string>& sample, size_t& seenLines,
const std::string& trainPath, size_t maxLines, size_t maxBytes) {
ABORT_IF(maxLines == 0, "Sample needs to be larger 0");
std::unique_ptr<std::istream> trainStrm(
trainPath == "stdin" ? new std::istream(std::cin.rdbuf())
: new io::InputFileStream(trainPath)
);
std::unique_ptr<std::istream> trainStrm(trainPath == "stdin"
? new std::istream(std::cin.rdbuf())
: new io::InputFileStream(trainPath));
std::string line;
while(getline(*trainStrm, line)) {
@ -109,8 +110,10 @@ private:
public:
SentencePieceVocab(Ptr<Options> options, size_t batchIndex)
: options_(options), batchIndex_(batchIndex), generator_((uint32_t)Config::seed) {
: options_(options),
batchIndex_(batchIndex),
generator_((uint32_t)Config::seed),
keepEncoded_(options->get<bool>("no-spm-decode", false)) {
if(options_->has("sentencepiece-alphas")) {
auto alphas = options_->get<std::vector<float>>("sentencepiece-alphas");
if(alphas.size() <= batchIndex)
@ -221,11 +224,18 @@ public:
std::string decode(const Words& sentence, bool /*ignoreEOS*/) const override {
std::string line;
if(keepEncoded_) { // i.e. keep the sentence segmented into subword units
for(const Word& id : sentence)
line += (*this)[id] + " ";
line.pop_back(); // trim the trailing whitespace
} else {
// convert vector of Word to vector of int
std::vector<int> spmSentence; spmSentence.reserve(sentence.size());
for (auto&& word : sentence)
std::vector<int> spmSentence;
spmSentence.reserve(sentence.size());
for(auto&& word : sentence)
spmSentence.push_back(word.toWordIndex());
spm_->Decode(spmSentence, &line);
}
return line;
}

View File

@ -42,29 +42,40 @@ public:
float getPathScore() const { return pathScore_; }
const std::vector<float>& getScoreBreakdown() { return scoreBreakdown_; }
void setScoreBreakdown(const std::vector<float>& scoreBreaddown) { scoreBreakdown_ = scoreBreaddown; }
void setScoreBreakdown(const std::vector<float>& scoreBreakdown) { scoreBreakdown_ = scoreBreakdown; }
const std::vector<float>& getAlignment() { return alignment_; }
void setAlignment(const std::vector<float>& align) { alignment_ = align; };
// helpers to trace back paths referenced from this hypothesis
Words tracebackWords()
{
// trace back paths referenced from this hypothesis
Words tracebackWords() {
Words targetWords;
for (auto hyp = this; hyp->getPrevHyp(); hyp = hyp->getPrevHyp().get()) {
for(auto hyp = this; hyp->getPrevHyp(); hyp = hyp->getPrevHyp().get()) {
targetWords.push_back(hyp->getWord());
// std::cerr << hyp->getWord() << " " << hyp << std::endl;
}
std::reverse(targetWords.begin(), targetWords.end());
return targetWords;
}
// calculate word-level scores for each target word by de-aggregating the path score
// Calculate word-level scores for each target word by de-aggregating the
// cumulative path score along the hypothesis chain.
// Returns the per-word scores in left-to-right (sentence) order. Like
// tracebackWords(), the traversal stops at the first hypothesis (the one
// with no predecessor), so that initial hypothesis contributes no entry.
std::vector<float> tracebackWordScores() {
  std::vector<float> scores;
  // Traverse hypotheses backward, from this hypothesis to the start.
  for(auto hyp = this; hyp->getPrevHyp(); hyp = hyp->getPrevHyp().get()) {
    // A path score is cumulative, including scores from all preceding
    // hypotheses (words), so the word-level score is the difference between
    // the current and the previous path score.
    // Note: the loop condition guarantees getPrevHyp() is non-null here,
    // so no null check is needed before dereferencing it.
    scores.push_back(hyp->pathScore_ - hyp->getPrevHyp()->pathScore_);
  }
  // Scores were collected back-to-front; restore sentence order.
  std::reverse(scores.begin(), scores.end());
  return scores;
}
// get soft alignments [t][s] -> P(s|t) for each target word starting from the hyp one
typedef data::SoftAlignment SoftAlignment;
SoftAlignment tracebackAlignment()
{
SoftAlignment tracebackAlignment() {
SoftAlignment align;
for (auto hyp = this; hyp->getPrevHyp(); hyp = hyp->getPrevHyp().get()) {
for(auto hyp = this; hyp->getPrevHyp(); hyp = hyp->getPrevHyp().get()) {
align.push_back(hyp->getAlignment());
}
std::reverse(align.begin(), align.end());

View File

@ -1,5 +1,7 @@
#include "output_printer.h"
#include <sstream>
namespace marian {
std::string OutputPrinter::getAlignment(const Hypothesis::PtrType& hyp) {
@ -19,11 +21,18 @@ std::string OutputPrinter::getAlignment(const Hypothesis::PtrType& hyp) {
} else if(alignment_ == "hard") {
return data::ConvertSoftAlignToHardAlign(align, 1.f).toString();
} else if(alignmentThreshold_ > 0.f) {
return data::ConvertSoftAlignToHardAlign(align, alignmentThreshold_)
.toString();
return data::ConvertSoftAlignToHardAlign(align, alignmentThreshold_).toString();
} else {
ABORT("Unrecognized word alignment type");
}
}
// Format the word-level scores of a hypothesis as a space-prefixed,
// space-separated list with fixed-point notation and 5-digit precision,
// e.g. " -1.23456 -0.98765".
std::string OutputPrinter::getWordScores(const Hypothesis::PtrType& hyp) {
  std::ostringstream out;
  out.precision(5);
  out << std::fixed;  // sticky manipulator: applies to every score below
  for(const auto& wordScore : hyp->tracebackWordScores())
    out << " " << wordScore;
  return out.str();
}
} // namespace marian

View File

@ -20,12 +20,14 @@ public:
? options->get<size_t>("beam-size")
: 0),
alignment_(options->get<std::string>("alignment", "")),
alignmentThreshold_(getAlignmentThreshold(alignment_)) {}
alignmentThreshold_(getAlignmentThreshold(alignment_)),
wordScores_(options->get<bool>("word-scores")) {}
template <class OStream>
void print(Ptr<const History> history, OStream& best1, OStream& bestn) {
const auto& nbl = history->nBest(nbest_);
// prepare n-best list output
for(size_t i = 0; i < nbl.size(); ++i) {
const auto& result = nbl[i];
const auto& hypo = std::get<1>(result);
@ -40,6 +42,9 @@ public:
if(!alignment_.empty())
bestn << " ||| " << getAlignment(hypo);
if(wordScores_)
bestn << " ||| WordScores=" << getWordScores(hypo);
bestn << " |||";
if(hypo->getScoreBreakdown().empty()) {
bestn << " F0=" << hypo->getPathScore();
@ -72,17 +77,26 @@ public:
best1 << " ||| " << getAlignment(hypo);
}
if(wordScores_) {
const auto& hypo = std::get<1>(result);
best1 << " ||| WordScores=" << getWordScores(hypo);
}
best1 << std::flush;
}
private:
Ptr<Vocab const> vocab_;
bool reverse_{false};
size_t nbest_{0};
std::string alignment_;
float alignmentThreshold_{0.f};
bool reverse_{false}; // If it is a right-to-left model that needs reversed word order
size_t nbest_{0}; // Size of the n-best list to print
std::string alignment_; // A non-empty string indicates the type of word alignment
float alignmentThreshold_{0.f}; // Threshold for converting attention into hard word alignment
bool wordScores_{false}; // Whether to print word-level scores or not
// Get word alignment pairs or soft alignment
std::string getAlignment(const Hypothesis::PtrType& hyp);
// Get word-level scores
std::string getWordScores(const Hypothesis::PtrType& hyp);
float getAlignmentThreshold(const std::string& str) {
try {