mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-26 13:23:25 +03:00
Support for using factors in mert and evaluator
Example: use --factors "0|2" to use only the first and third factors from the n-best list and from the reference. If you use an interpolated scorer, separate the per-scorer specifications with commas (e.g. --factors "0|2,1").
This commit is contained in:
parent
d1292d4d11
commit
457d004368
@ -139,6 +139,7 @@ void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles)
|
||||
string line;
|
||||
size_t sid = 0; //sentence counter
|
||||
while (getline(refin,line)) {
|
||||
line = this->applyFactors(line);
|
||||
if (i == 0) {
|
||||
NgramCounts *counts = new NgramCounts; //these get leaked
|
||||
m_ref_counts.push_back(counts);
|
||||
@ -183,8 +184,9 @@ void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
|
||||
}
|
||||
NgramCounts testcounts;
|
||||
// stats for this line
|
||||
vector<ScoreStatsType> stats(kLENGTH * 2);;
|
||||
const size_t length = countNgrams(text, testcounts, kLENGTH);
|
||||
vector<ScoreStatsType> stats(kLENGTH * 2);
|
||||
string sentence = this->applyFactors(text);
|
||||
const size_t length = countNgrams(sentence, testcounts, kLENGTH);
|
||||
|
||||
// Calculate effective reference length.
|
||||
switch (m_ref_length_type) {
|
||||
|
@ -31,6 +31,7 @@ void CderScorer::setReferenceFiles(const vector<string>& referenceFiles)
|
||||
m_ref_sentences.push_back(vector<sent_t>());
|
||||
string line;
|
||||
while (getline(refin,line)) {
|
||||
line = this->applyFactors(line);
|
||||
sent_t encoded;
|
||||
TokenizeAndEncode(line, encoded);
|
||||
m_ref_sentences[rid].push_back(encoded);
|
||||
@ -40,8 +41,10 @@ void CderScorer::setReferenceFiles(const vector<string>& referenceFiles)
|
||||
|
||||
void CderScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
|
||||
{
|
||||
string sentence = this->applyFactors(text);
|
||||
|
||||
vector<int> stats;
|
||||
prepareStatsVector(sid, text, stats);
|
||||
prepareStatsVector(sid, sentence, stats);
|
||||
entry.set(stats);
|
||||
}
|
||||
|
||||
|
@ -159,3 +159,24 @@ void InterpolatedScorer::prepareStats(size_t sid, const string& text, ScoreStats
|
||||
string str = buff.str();
|
||||
entry.set(str);
|
||||
}
|
||||
|
||||
void InterpolatedScorer::setFactors(const string& factors)
|
||||
{
|
||||
if (factors.empty()) return;
|
||||
|
||||
vector<string> fsplit;
|
||||
split(factors, ',', fsplit);
|
||||
|
||||
if (fsplit.size() != _scorers.size()) throw runtime_error("Number of factor specifications does not equal number of interpolated scorers.");
|
||||
|
||||
for (size_t i = 0; i < _scorers.size(); ++i)
|
||||
{
|
||||
_scorers[i]->setFactors(fsplit[i]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -42,6 +42,11 @@ public:
|
||||
|
||||
virtual void setScoreData(ScoreData* data);
|
||||
|
||||
/**
|
||||
* Set the factors, which should be used for this metric
|
||||
*/
|
||||
virtual void setFactors(const string& factors);
|
||||
|
||||
protected:
|
||||
ScopedVector<Scorer> _scorers;
|
||||
|
||||
|
@ -29,6 +29,7 @@ void PerScorer::setReferenceFiles(const vector<string>& referenceFiles)
|
||||
string line;
|
||||
int sid = 0;
|
||||
while (getline(in,line)) {
|
||||
line = this->applyFactors(line);
|
||||
vector<int> tokens;
|
||||
TokenizeAndEncode(line, tokens);
|
||||
m_ref_tokens.push_back(multiset<int>());
|
||||
@ -52,10 +53,13 @@ void PerScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
|
||||
msg << "Sentence id (" << sid << ") not found in reference set";
|
||||
throw runtime_error(msg.str());
|
||||
}
|
||||
|
||||
string sentence = this->applyFactors(text);
|
||||
|
||||
// Calculate correct, output_length and ref_length for
|
||||
// the line and store it in entry
|
||||
vector<int> testtokens;
|
||||
TokenizeAndEncode(text, testtokens);
|
||||
TokenizeAndEncode(sentence, testtokens);
|
||||
multiset<int> testtokens_all(testtokens.begin(),testtokens.end());
|
||||
set<int> testtokens_unique(testtokens.begin(),testtokens.end());
|
||||
int correct = 0;
|
||||
|
@ -1,5 +1,6 @@
|
||||
#include "Scorer.h"
|
||||
#include <limits>
|
||||
#include "Util.h"
|
||||
|
||||
namespace {
|
||||
|
||||
@ -95,6 +96,55 @@ void Scorer::TokenizeAndEncode(const string& line, vector<int>& encoded) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the factors, which should be used for this metric
|
||||
*/
|
||||
void Scorer::setFactors(const string& factors)
|
||||
{
|
||||
if (factors.empty()) return;
|
||||
vector<string> factors_vec;
|
||||
split(factors, '|', factors_vec);
|
||||
for(vector<string>::iterator it = factors_vec.begin(); it != factors_vec.end(); ++it)
|
||||
{
|
||||
int factor = atoi(it->c_str());
|
||||
m_factors.push_back(factor);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Take the factored sentence and return the desired factors
|
||||
*/
|
||||
string Scorer::applyFactors(const string& sentence)
|
||||
{
|
||||
if (m_factors.size() == 0) return sentence;
|
||||
|
||||
vector<string> tokens;
|
||||
split(sentence, ' ', tokens);
|
||||
|
||||
stringstream sstream;
|
||||
for (size_t i = 0; i < tokens.size(); ++i)
|
||||
{
|
||||
if (tokens[i] == "") continue;
|
||||
|
||||
vector<string> factors;
|
||||
split(tokens[i], '|', factors);
|
||||
|
||||
int fsize = factors.size();
|
||||
|
||||
if (i>0) sstream << " ";
|
||||
|
||||
for (size_t j = 0; j < m_factors.size(); ++j)
|
||||
{
|
||||
int findex = m_factors[j];
|
||||
if (findex < 0 || findex >= fsize) throw runtime_error("Factor index is out of range.");
|
||||
|
||||
if (j>0) sstream << "|";
|
||||
sstream << factors[findex];
|
||||
}
|
||||
}
|
||||
return sstream.str();
|
||||
}
|
||||
|
||||
StatisticsBasedScorer::StatisticsBasedScorer(const string& name, const string& config)
|
||||
: Scorer(name,config) {
|
||||
//configure regularisation
|
||||
|
@ -97,6 +97,16 @@ class Scorer
|
||||
m_score_data = data;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the factors, which should be used for this metric
|
||||
*/
|
||||
virtual void setFactors(const string& factors);
|
||||
|
||||
/**
|
||||
* Take the factored sentence and return the desired factors
|
||||
*/
|
||||
virtual string applyFactors(const string& sentece);
|
||||
|
||||
private:
|
||||
class Encoder {
|
||||
public:
|
||||
@ -114,6 +124,7 @@ class Scorer
|
||||
string m_name;
|
||||
Encoder* m_encoder;
|
||||
map<string, string> m_config;
|
||||
vector<int> m_factors;
|
||||
|
||||
protected:
|
||||
ScoreData* m_score_data;
|
||||
|
@ -33,6 +33,7 @@ void TerScorer::setReferenceFiles ( const vector<string>& referenceFiles )
|
||||
string line;
|
||||
int sid = 0;
|
||||
while ( getline ( in, line ) ) {
|
||||
line = this->applyFactors(line);
|
||||
vector<int> tokens;
|
||||
TokenizeAndEncode(line, tokens);
|
||||
m_references.push_back ( tokens );
|
||||
@ -48,6 +49,7 @@ void TerScorer::setReferenceFiles ( const vector<string>& referenceFiles )
|
||||
|
||||
void TerScorer::prepareStats ( size_t sid, const string& text, ScoreStats& entry )
|
||||
{
|
||||
string sentence = this->applyFactors(text);
|
||||
|
||||
terAlignment result;
|
||||
result.numEdits = 0.0 ;
|
||||
@ -74,7 +76,7 @@ void TerScorer::prepareStats ( size_t sid, const string& text, ScoreStats& entry
|
||||
averageLength+=(double)m_multi_references.at ( incRefsBis ).at ( sid ).size();
|
||||
}
|
||||
averageLength=averageLength/( double ) m_multi_references.size();
|
||||
TokenizeAndEncode(text, testtokens);
|
||||
TokenizeAndEncode(sentence, testtokens);
|
||||
terCalc * evaluation=new terCalc();
|
||||
evaluation->setDebugMode ( false );
|
||||
terAlignment tmp_result = evaluation->TER ( reftokens, testtokens );
|
||||
|
@ -131,6 +131,7 @@ void usage()
|
||||
cerr << "[--sctype|-s] the scorer type (default BLEU)" << endl;
|
||||
cerr << "[--scconfig|-c] configuration string passed to scorer" << endl;
|
||||
cerr << "\tThis is of the form NAME1:VAL1,NAME2:VAL2 etc " << endl;
|
||||
cerr << "[--factors|-f] list of factors passed to the scorer (e.g. 0|2)" << endl;
|
||||
cerr << "[--reference|-R] comma separated list of reference files" << endl;
|
||||
cerr << "[--candidate|-C] comma separated list of candidate files" << endl;
|
||||
cerr << "[--bootstrap|-b] number of booststraped samples (default 0 - no bootstraping)" << endl;
|
||||
@ -164,6 +165,7 @@ static struct option long_options[] = {
|
||||
{"candidate", required_argument, 0, 'C'},
|
||||
{"bootstrap", required_argument, 0, 'b'},
|
||||
{"rseed", required_argument, 0, 'r'},
|
||||
{"factors", required_argument, 0, 'f'},
|
||||
{"help", no_argument, 0, 'h'},
|
||||
{0, 0, 0, 0}
|
||||
};
|
||||
@ -174,6 +176,7 @@ struct ProgramOption {
|
||||
vector<string> scorer_configs;
|
||||
string reference;
|
||||
string candidate;
|
||||
vector<string> scorer_factors;
|
||||
int bootstrap;
|
||||
int seed;
|
||||
bool has_seed;
|
||||
@ -190,11 +193,12 @@ void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) {
|
||||
int c;
|
||||
int option_index;
|
||||
int last_scorer_index = -1;
|
||||
while ((c = getopt_long(argc, argv, "s:c:R:C:b:r:h", long_options, &option_index)) != -1) {
|
||||
while ((c = getopt_long(argc, argv, "s:c:R:C:b:r:f:h", long_options, &option_index)) != -1) {
|
||||
switch(c) {
|
||||
case 's':
|
||||
opt->scorer_types.push_back(string(optarg));
|
||||
opt->scorer_configs.push_back(string(""));
|
||||
opt->scorer_factors.push_back(string(""));
|
||||
last_scorer_index++;
|
||||
break;
|
||||
case 'c':
|
||||
@ -213,6 +217,9 @@ void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) {
|
||||
opt->seed = strtol(optarg, NULL, 10);
|
||||
opt->has_seed = true;
|
||||
break;
|
||||
case 'f':
|
||||
opt->scorer_factors[last_scorer_index] = string(optarg);
|
||||
break;
|
||||
default:
|
||||
usage();
|
||||
}
|
||||
@ -223,6 +230,7 @@ void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) {
|
||||
{
|
||||
opt->scorer_types.push_back(string("BLEU"));
|
||||
opt->scorer_configs.push_back(string(""));
|
||||
opt->scorer_factors.push_back(string(""));
|
||||
}
|
||||
}
|
||||
|
||||
@ -268,6 +276,7 @@ int main(int argc, char** argv)
|
||||
for (size_t i = 0; i < option.scorer_types.size(); i++)
|
||||
{
|
||||
g_scorer = ScorerFactory::getScorer(option.scorer_types[i], option.scorer_configs[i]);
|
||||
g_scorer->setFactors(option.scorer_factors[i]);
|
||||
g_scorer->setReferenceFiles(refFiles);
|
||||
EvaluatorUtil::evaluate(*fileIt, option.bootstrap);
|
||||
delete g_scorer;
|
||||
|
@ -26,6 +26,7 @@ void usage()
|
||||
cerr << "[--sctype|-s] the scorer type (default BLEU)" << endl;
|
||||
cerr << "[--scconfig|-c] configuration string passed to scorer" << endl;
|
||||
cerr << "\tThis is of the form NAME1:VAL1,NAME2:VAL2 etc " << endl;
|
||||
cerr << "[--factors|-f] list of factors passed to the scorer (e.g. 0|2)" << endl;
|
||||
cerr << "[--reference|-r] comma separated list of reference files" << endl;
|
||||
cerr << "[--binary|-b] use binary output format (default to text )" << endl;
|
||||
cerr << "[--nbest|-n] the nbest file" << endl;
|
||||
@ -41,6 +42,7 @@ void usage()
|
||||
static struct option long_options[] = {
|
||||
{"sctype", required_argument, 0, 's'},
|
||||
{"scconfig", required_argument,0, 'c'},
|
||||
{"factors", required_argument,0, 'f'},
|
||||
{"reference", required_argument, 0, 'r'},
|
||||
{"binary", no_argument, 0, 'b'},
|
||||
{"nbest", required_argument, 0, 'n'},
|
||||
@ -57,6 +59,7 @@ static struct option long_options[] = {
|
||||
struct ProgramOption {
|
||||
string scorerType;
|
||||
string scorerConfig;
|
||||
string scorerFactors;
|
||||
string referenceFile;
|
||||
string nbestFile;
|
||||
string scoreDataFile;
|
||||
@ -69,6 +72,7 @@ struct ProgramOption {
|
||||
ProgramOption()
|
||||
: scorerType("BLEU"),
|
||||
scorerConfig(""),
|
||||
scorerFactors(""),
|
||||
referenceFile(""),
|
||||
nbestFile(""),
|
||||
scoreDataFile("statscore.data"),
|
||||
@ -83,7 +87,7 @@ void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) {
|
||||
int c;
|
||||
int option_index;
|
||||
|
||||
while ((c = getopt_long(argc, argv, "s:r:n:S:F:R:E:v:hb", long_options, &option_index)) != -1) {
|
||||
while ((c = getopt_long(argc, argv, "s:r:f:n:S:F:R:E:v:hb", long_options, &option_index)) != -1) {
|
||||
switch (c) {
|
||||
case 's':
|
||||
opt->scorerType = string(optarg);
|
||||
@ -91,6 +95,9 @@ void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) {
|
||||
case 'c':
|
||||
opt->scorerConfig = string(optarg);
|
||||
break;
|
||||
case 'f':
|
||||
opt->scorerFactors = string(optarg);
|
||||
break;
|
||||
case 'r':
|
||||
opt->referenceFile = string(optarg);
|
||||
break;
|
||||
@ -180,6 +187,8 @@ int main(int argc, char** argv)
|
||||
|
||||
Scorer* scorer = ScorerFactory::getScorer(option.scorerType, option.scorerConfig);
|
||||
|
||||
scorer->setFactors(option.scorerFactors);
|
||||
|
||||
// load references
|
||||
if (referenceFiles.size() > 0)
|
||||
scorer->setReferenceFiles(referenceFiles);
|
||||
|
Loading…
Reference in New Issue
Block a user