mirror of
https://github.com/marian-nmt/marian.git
synced 2024-10-26 09:09:10 +03:00
Add option to print word-level scores (#501)
* Add printing word level scores * Add option --no-spm-decode * Fix precision for word-level scores * Fix getting the no-spm-decode option * Update CHANGELOG * Add comments and refactor * Print word-level scores next to other scores in an n-best list * Remove --word-scores from marian-scorer * Add --no-spm-decode only if compiled with SentencePiece * Add comments * Printing word scores before model scores in n-best lists * Update VERSION Co-authored-by: Marcin Junczys-Dowmunt <Marcin.JunczysDowmunt@microsoft.com>
This commit is contained in:
parent
2bd986d8a7
commit
24f062cd27
@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
|
||||
## [Unreleased]
|
||||
|
||||
### Added
|
||||
- An option to print word-level translation scores
|
||||
- An option to turn off automatic detokenization from SentencePiece
|
||||
- Separate quantization types for 8-bit FBGEMM for AVX2 and AVX512
|
||||
- Sequence-level unlikelihood training
|
||||
- Allow file name templated valid-translation-output files
|
||||
|
@ -545,6 +545,8 @@ void ConfigParser::addOptionsValidation(cli::CLIWrapper& cli) {
|
||||
"Allow unknown words to appear in output");
|
||||
cli.add<bool>("--n-best",
|
||||
"Generate n-best list");
|
||||
cli.add<bool>("--word-scores",
|
||||
"Print word-level scores");
|
||||
|
||||
// efficiency options
|
||||
cli.add<int>("--valid-mini-batch",
|
||||
@ -607,6 +609,12 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) {
|
||||
cli.add<std::string>("--alignment",
|
||||
"Return word alignment. Possible values: 0.0-1.0, hard, soft")
|
||||
->implicit_val("1");
|
||||
cli.add<bool>("--word-scores",
|
||||
"Print word-level scores");
|
||||
#ifdef USE_SENTENCEPIECE
|
||||
cli.add<bool>("--no-spm-decode",
|
||||
"Keep the output segmented into SentencePiece subwords");
|
||||
#endif
|
||||
|
||||
addSuboptionsDevices(cli);
|
||||
addSuboptionsInputLength(cli);
|
||||
|
@ -8,9 +8,7 @@ namespace data {
|
||||
|
||||
// Default constructor; performs no explicit member initialization.
WordAlignment::WordAlignment() {}
|
||||
|
||||
WordAlignment::WordAlignment(
|
||||
const std::vector<Point>& align)
|
||||
: data_(align) {}
|
||||
// Constructs a word alignment from a list of points by copying them into data_.
WordAlignment::WordAlignment(const std::vector<Point>& align) : data_(align) {}
|
||||
|
||||
WordAlignment::WordAlignment(const std::string& line) {
|
||||
std::vector<std::string> atok = utils::splitAny(line, " -");
|
||||
|
@ -36,17 +36,18 @@ private:
|
||||
std::mt19937 generator_;
|
||||
std::uniform_int_distribution<int> randInt_; // from 0 to INT_MAX
|
||||
|
||||
// Keeps sentences segmented into subword units
|
||||
bool keepEncoded_{false};
|
||||
|
||||
// Sample from one file, based on first algorithm from:
|
||||
// https://en.wikipedia.org/wiki/Reservoir_sampling
|
||||
void reservoirSampling(std::vector<std::string>& sample, size_t& seenLines,
|
||||
const std::string& trainPath, size_t maxLines, size_t maxBytes) {
|
||||
|
||||
ABORT_IF(maxLines == 0, "Sample needs to be larger 0");
|
||||
|
||||
std::unique_ptr<std::istream> trainStrm(
|
||||
trainPath == "stdin" ? new std::istream(std::cin.rdbuf())
|
||||
: new io::InputFileStream(trainPath)
|
||||
);
|
||||
std::unique_ptr<std::istream> trainStrm(trainPath == "stdin"
|
||||
? new std::istream(std::cin.rdbuf())
|
||||
: new io::InputFileStream(trainPath));
|
||||
|
||||
std::string line;
|
||||
while(getline(*trainStrm, line)) {
|
||||
@ -109,8 +110,10 @@ private:
|
||||
|
||||
public:
|
||||
SentencePieceVocab(Ptr<Options> options, size_t batchIndex)
|
||||
: options_(options), batchIndex_(batchIndex), generator_((uint32_t)Config::seed) {
|
||||
|
||||
: options_(options),
|
||||
batchIndex_(batchIndex),
|
||||
generator_((uint32_t)Config::seed),
|
||||
keepEncoded_(options->get<bool>("no-spm-decode", false)) {
|
||||
if(options_->has("sentencepiece-alphas")) {
|
||||
auto alphas = options_->get<std::vector<float>>("sentencepiece-alphas");
|
||||
if(alphas.size() <= batchIndex)
|
||||
@ -221,11 +224,18 @@ public:
|
||||
|
||||
std::string decode(const Words& sentence, bool /*ignoreEOS*/) const override {
|
||||
std::string line;
|
||||
if(keepEncoded_) { // i.e. keep the sentence segmented into subword units
|
||||
for(const Word& id : sentence)
|
||||
line += (*this)[id] + " ";
|
||||
line.pop_back(); // trim the trailing whitespace
|
||||
} else {
|
||||
// convert vector of Word to vector of int
|
||||
std::vector<int> spmSentence; spmSentence.reserve(sentence.size());
|
||||
for (auto&& word : sentence)
|
||||
std::vector<int> spmSentence;
|
||||
spmSentence.reserve(sentence.size());
|
||||
for(auto&& word : sentence)
|
||||
spmSentence.push_back(word.toWordIndex());
|
||||
spm_->Decode(spmSentence, &line);
|
||||
}
|
||||
return line;
|
||||
}
|
||||
|
||||
|
@ -42,29 +42,40 @@ public:
|
||||
// Returns the path score stored for this hypothesis.
float getPathScore() const { return pathScore_; }
|
||||
|
||||
// Returns a const reference to the stored score breakdown.
const std::vector<float>& getScoreBreakdown() { return scoreBreakdown_; }
|
||||
void setScoreBreakdown(const std::vector<float>& scoreBreaddown) { scoreBreakdown_ = scoreBreaddown; }
|
||||
// Replaces the stored score breakdown with a copy of the given vector.
void setScoreBreakdown(const std::vector<float>& scoreBreakdown) { scoreBreakdown_ = scoreBreakdown; }
|
||||
|
||||
// Returns a const reference to the alignment values stored for this hypothesis.
const std::vector<float>& getAlignment() { return alignment_; }
|
||||
// Stores a copy of the given alignment values in this hypothesis.
void setAlignment(const std::vector<float>& align) { alignment_ = align; };
|
||||
|
||||
// helpers to trace back paths referenced from this hypothesis
|
||||
Words tracebackWords()
|
||||
{
|
||||
// trace back paths referenced from this hypothesis
|
||||
Words tracebackWords() {
|
||||
Words targetWords;
|
||||
for (auto hyp = this; hyp->getPrevHyp(); hyp = hyp->getPrevHyp().get()) {
|
||||
for(auto hyp = this; hyp->getPrevHyp(); hyp = hyp->getPrevHyp().get()) {
|
||||
targetWords.push_back(hyp->getWord());
|
||||
// std::cerr << hyp->getWord() << " " << hyp << std::endl;
|
||||
}
|
||||
std::reverse(targetWords.begin(), targetWords.end());
|
||||
return targetWords;
|
||||
}
|
||||
|
||||
// calculate word-level scores for each target word by de-aggregating the path score
|
||||
std::vector<float> tracebackWordScores() {
|
||||
std::vector<float> scores;
|
||||
// traverse hypotheses backward
|
||||
for(auto hyp = this; hyp->getPrevHyp(); hyp = hyp->getPrevHyp().get()) {
|
||||
// a path score is a cumulative score including scores from all preceding hypotheses (words),
|
||||
// so calculate a word-level score by subtracting the previous path score from the current path score
|
||||
auto prevPathScore = hyp->getPrevHyp() ? hyp->getPrevHyp().get()->pathScore_ : 0.f;
|
||||
scores.push_back(hyp->pathScore_ - prevPathScore);
|
||||
}
|
||||
std::reverse(scores.begin(), scores.end());
|
||||
return scores;
|
||||
}
|
||||
|
||||
// get soft alignments [t][s] -> P(s|t) for each target word starting from the hyp one
|
||||
typedef data::SoftAlignment SoftAlignment;
|
||||
SoftAlignment tracebackAlignment()
|
||||
{
|
||||
SoftAlignment tracebackAlignment() {
|
||||
SoftAlignment align;
|
||||
for (auto hyp = this; hyp->getPrevHyp(); hyp = hyp->getPrevHyp().get()) {
|
||||
for(auto hyp = this; hyp->getPrevHyp(); hyp = hyp->getPrevHyp().get()) {
|
||||
align.push_back(hyp->getAlignment());
|
||||
}
|
||||
std::reverse(align.begin(), align.end());
|
||||
|
@ -1,5 +1,7 @@
|
||||
#include "output_printer.h"
|
||||
|
||||
#include <sstream>
|
||||
|
||||
namespace marian {
|
||||
|
||||
std::string OutputPrinter::getAlignment(const Hypothesis::PtrType& hyp) {
|
||||
@ -19,11 +21,18 @@ std::string OutputPrinter::getAlignment(const Hypothesis::PtrType& hyp) {
|
||||
} else if(alignment_ == "hard") {
|
||||
return data::ConvertSoftAlignToHardAlign(align, 1.f).toString();
|
||||
} else if(alignmentThreshold_ > 0.f) {
|
||||
return data::ConvertSoftAlignToHardAlign(align, alignmentThreshold_)
|
||||
.toString();
|
||||
return data::ConvertSoftAlignToHardAlign(align, alignmentThreshold_).toString();
|
||||
} else {
|
||||
ABORT("Unrecognized word alignment type");
|
||||
}
|
||||
}
|
||||
|
||||
// Format the word-level scores of a hypothesis as one string. Each score is written in
// fixed-point notation with a precision of 5 and is preceded by a single space, so a
// non-empty result starts with a space.
std::string OutputPrinter::getWordScores(const Hypothesis::PtrType& hyp) {
  std::ostringstream out;
  // configure the number format once; both settings persist for all insertions
  out << std::fixed;
  out.precision(5);

  const auto wordScores = hyp->tracebackWordScores();
  for(const auto& wordScore : wordScores) {
    out << " " << wordScore;
  }
  return out.str();
}
|
||||
|
||||
} // namespace marian
|
||||
|
@ -20,12 +20,14 @@ public:
|
||||
? options->get<size_t>("beam-size")
|
||||
: 0),
|
||||
alignment_(options->get<std::string>("alignment", "")),
|
||||
alignmentThreshold_(getAlignmentThreshold(alignment_)) {}
|
||||
alignmentThreshold_(getAlignmentThreshold(alignment_)),
|
||||
wordScores_(options->get<bool>("word-scores")) {}
|
||||
|
||||
template <class OStream>
|
||||
void print(Ptr<const History> history, OStream& best1, OStream& bestn) {
|
||||
const auto& nbl = history->nBest(nbest_);
|
||||
|
||||
// prepare n-best list output
|
||||
for(size_t i = 0; i < nbl.size(); ++i) {
|
||||
const auto& result = nbl[i];
|
||||
const auto& hypo = std::get<1>(result);
|
||||
@ -40,6 +42,9 @@ public:
|
||||
if(!alignment_.empty())
|
||||
bestn << " ||| " << getAlignment(hypo);
|
||||
|
||||
if(wordScores_)
|
||||
bestn << " ||| WordScores=" << getWordScores(hypo);
|
||||
|
||||
bestn << " |||";
|
||||
if(hypo->getScoreBreakdown().empty()) {
|
||||
bestn << " F0=" << hypo->getPathScore();
|
||||
@ -72,17 +77,26 @@ public:
|
||||
best1 << " ||| " << getAlignment(hypo);
|
||||
}
|
||||
|
||||
if(wordScores_) {
|
||||
const auto& hypo = std::get<1>(result);
|
||||
best1 << " ||| WordScores=" << getWordScores(hypo);
|
||||
}
|
||||
|
||||
best1 << std::flush;
|
||||
}
|
||||
|
||||
private:
|
||||
Ptr<Vocab const> vocab_;
|
||||
bool reverse_{false};
|
||||
size_t nbest_{0};
|
||||
std::string alignment_;
|
||||
float alignmentThreshold_{0.f};
|
||||
bool reverse_{false}; // If it is a right-to-left model that needs reversed word order
|
||||
size_t nbest_{0}; // Size of the n-best list to print
|
||||
std::string alignment_; // A non-empty string indicates the type of word alignment
|
||||
float alignmentThreshold_{0.f}; // Threshold for converting attention into hard word alignment
|
||||
bool wordScores_{false}; // Whether to print word-level scores or not
|
||||
|
||||
// Get word alignment pairs or soft alignment
|
||||
std::string getAlignment(const Hypothesis::PtrType& hyp);
|
||||
// Get word-level scores
|
||||
std::string getWordScores(const Hypothesis::PtrType& hyp);
|
||||
|
||||
float getAlignmentThreshold(const std::string& str) {
|
||||
try {
|
||||
|
Loading…
Reference in New Issue
Block a user