Support for using factors in mert and evaluator

example:
Use --factors "0|2" to use only the first and third factors from the n-best list and from the reference.
If you use an interpolated scorer, separate the per-scorer records with commas (e.g. --factors "0|2,1").
This commit is contained in:
Matous Machacek 2012-02-28 02:27:23 +01:00
parent e38cd12ef3
commit ba987c94ba
10 changed files with 123 additions and 7 deletions

View File

@ -139,6 +139,7 @@ void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles)
string line;
size_t sid = 0; //sentence counter
while (getline(refin,line)) {
line = this->applyFactors(line);
if (i == 0) {
NgramCounts *counts = new NgramCounts; //these get leaked
m_ref_counts.push_back(counts);
@ -183,8 +184,9 @@ void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
}
NgramCounts testcounts;
// stats for this line
vector<ScoreStatsType> stats(kLENGTH * 2);;
const size_t length = countNgrams(text, testcounts, kLENGTH);
vector<ScoreStatsType> stats(kLENGTH * 2);
string sentence = this->applyFactors(text);
const size_t length = countNgrams(sentence, testcounts, kLENGTH);
// Calculate effective reference length.
switch (m_ref_length_type) {

View File

@ -31,6 +31,7 @@ void CderScorer::setReferenceFiles(const vector<string>& referenceFiles)
m_ref_sentences.push_back(vector<sent_t>());
string line;
while (getline(refin,line)) {
line = this->applyFactors(line);
sent_t encoded;
TokenizeAndEncode(line, encoded);
m_ref_sentences[rid].push_back(encoded);
@ -40,8 +41,10 @@ void CderScorer::setReferenceFiles(const vector<string>& referenceFiles)
// Computes the statistics for one candidate sentence (identified by sid) and
// stores them in entry. The configured factor selection is applied to the
// candidate text via applyFactors before it is used.
void CderScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
{
string sentence = this->applyFactors(text);
vector<int> stats;
// NOTE(review): prepareStatsVector is called twice on the same stats vector,
// first with the raw text and then with the factor-filtered sentence. This
// looks like the removed and the added line of a diff shown together --
// confirm which call is intended before relying on this block.
prepareStatsVector(sid, text, stats);
prepareStatsVector(sid, sentence, stats);
entry.set(stats);
}

View File

@ -159,3 +159,24 @@ void InterpolatedScorer::prepareStats(size_t sid, const string& text, ScoreStats
string str = buff.str();
entry.set(str);
}
void InterpolatedScorer::setFactors(const string& factors)
{
if (factors.empty()) return;
vector<string> fsplit;
split(factors, ',', fsplit);
if (fsplit.size() != _scorers.size()) throw runtime_error("Number of factor specifications does not equal number of interpolated scorers.");
for (size_t i = 0; i < _scorers.size(); ++i)
{
_scorers[i]->setFactors(fsplit[i]);
}
}

View File

@ -42,6 +42,11 @@ public:
virtual void setScoreData(ScoreData* data);
/**
* Set the factors, which should be used for this metric
*/
virtual void setFactors(const string& factors);
protected:
ScopedVector<Scorer> _scorers;

View File

@ -29,6 +29,7 @@ void PerScorer::setReferenceFiles(const vector<string>& referenceFiles)
string line;
int sid = 0;
while (getline(in,line)) {
line = this->applyFactors(line);
vector<int> tokens;
TokenizeAndEncode(line, tokens);
m_ref_tokens.push_back(multiset<int>());
@ -52,10 +53,13 @@ void PerScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
msg << "Sentence id (" << sid << ") not found in reference set";
throw runtime_error(msg.str());
}
string sentence = this->applyFactors(text);
// Calculate correct, output_length and ref_length for
// the line and store it in entry
vector<int> testtokens;
TokenizeAndEncode(text, testtokens);
TokenizeAndEncode(sentence, testtokens);
multiset<int> testtokens_all(testtokens.begin(),testtokens.end());
set<int> testtokens_unique(testtokens.begin(),testtokens.end());
int correct = 0;

View File

@ -1,5 +1,6 @@
#include "Scorer.h"
#include <limits>
#include "Util.h"
namespace {
@ -95,6 +96,55 @@ void Scorer::TokenizeAndEncode(const string& line, vector<int>& encoded) {
}
}
/**
* Set the factors, which should be used for this metric
*/
void Scorer::setFactors(const string& factors)
{
if (factors.empty()) return;
vector<string> factors_vec;
split(factors, '|', factors_vec);
for(vector<string>::iterator it = factors_vec.begin(); it != factors_vec.end(); ++it)
{
int factor = atoi(it->c_str());
m_factors.push_back(factor);
}
}
/**
 * Take the factored sentence and return the desired factors
 *
 * The sentence is split on single spaces and every non-empty token is split
 * on '|'. For each token, the factors selected in m_factors are written in
 * the order they were configured, joined by '|'. Output tokens are joined by
 * a single space. If no factors are configured, the sentence is returned
 * unchanged.
 *
 * Throws runtime_error if a configured index is negative or not smaller than
 * the number of factors of some token.
 */
string Scorer::applyFactors(const string& sentence)
{
  if (m_factors.empty()) return sentence;

  vector<string> tokens;
  split(sentence, ' ', tokens);

  stringstream sstream;
  // Tracks whether a token has been written. Keying the separator on the
  // token index instead would emit a leading space whenever the first
  // token(s) are empty and therefore skipped.
  bool wrote_token = false;
  for (size_t i = 0; i < tokens.size(); ++i)
  {
    // Empty tokens are skipped; presumably split yields them for leading or
    // repeated spaces -- verify against the split implementation.
    if (tokens[i].empty()) continue;

    vector<string> factors;
    split(tokens[i], '|', factors);
    const int fsize = static_cast<int>(factors.size());

    if (wrote_token) sstream << " ";
    wrote_token = true;

    for (size_t j = 0; j < m_factors.size(); ++j)
    {
      const int findex = m_factors[j];
      if (findex < 0 || findex >= fsize) throw runtime_error("Factor index is out of range.");
      if (j > 0) sstream << "|";
      sstream << factors[findex];
    }
  }
  return sstream.str();
}
StatisticsBasedScorer::StatisticsBasedScorer(const string& name, const string& config)
: Scorer(name,config) {
//configure regularisation

View File

@ -97,6 +97,16 @@ class Scorer
m_score_data = data;
}
/**
* Set the factors, which should be used for this metric
*/
virtual void setFactors(const string& factors);
/**
* Take the factored sentence and return the desired factors
*/
virtual string applyFactors(const string& sentece);
private:
class Encoder {
public:
@ -114,6 +124,7 @@ class Scorer
string m_name;
Encoder* m_encoder;
map<string, string> m_config;
vector<int> m_factors;
protected:
ScoreData* m_score_data;

View File

@ -33,6 +33,7 @@ void TerScorer::setReferenceFiles ( const vector<string>& referenceFiles )
string line;
int sid = 0;
while ( getline ( in, line ) ) {
line = this->applyFactors(line);
vector<int> tokens;
TokenizeAndEncode(line, tokens);
m_references.push_back ( tokens );
@ -48,6 +49,7 @@ void TerScorer::setReferenceFiles ( const vector<string>& referenceFiles )
void TerScorer::prepareStats ( size_t sid, const string& text, ScoreStats& entry )
{
string sentence = this->applyFactors(text);
terAlignment result;
result.numEdits = 0.0 ;
@ -74,7 +76,7 @@ void TerScorer::prepareStats ( size_t sid, const string& text, ScoreStats& entry
averageLength+=(double)m_multi_references.at ( incRefsBis ).at ( sid ).size();
}
averageLength=averageLength/( double ) m_multi_references.size();
TokenizeAndEncode(text, testtokens);
TokenizeAndEncode(sentence, testtokens);
terCalc * evaluation=new terCalc();
evaluation->setDebugMode ( false );
terAlignment tmp_result = evaluation->TER ( reftokens, testtokens );

View File

@ -131,6 +131,7 @@ void usage()
cerr << "[--sctype|-s] the scorer type (default BLEU)" << endl;
cerr << "[--scconfig|-c] configuration string passed to scorer" << endl;
cerr << "\tThis is of the form NAME1:VAL1,NAME2:VAL2 etc " << endl;
cerr << "[--factors|-f] list of factors passed to the scorer (e.g. 0|2)" << endl;
cerr << "[--reference|-R] comma separated list of reference files" << endl;
cerr << "[--candidate|-C] comma separated list of candidate files" << endl;
cerr << "[--bootstrap|-b] number of booststraped samples (default 0 - no bootstraping)" << endl;
@ -164,6 +165,7 @@ static struct option long_options[] = {
{"candidate", required_argument, 0, 'C'},
{"bootstrap", required_argument, 0, 'b'},
{"rseed", required_argument, 0, 'r'},
{"factors", required_argument, 0, 'f'},
{"help", no_argument, 0, 'h'},
{0, 0, 0, 0}
};
@ -174,6 +176,7 @@ struct ProgramOption {
vector<string> scorer_configs;
string reference;
string candidate;
vector<string> scorer_factors;
int bootstrap;
int seed;
bool has_seed;
@ -190,11 +193,12 @@ void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) {
int c;
int option_index;
int last_scorer_index = -1;
while ((c = getopt_long(argc, argv, "s:c:R:C:b:r:h", long_options, &option_index)) != -1) {
while ((c = getopt_long(argc, argv, "s:c:R:C:b:r:f:h", long_options, &option_index)) != -1) {
switch(c) {
case 's':
opt->scorer_types.push_back(string(optarg));
opt->scorer_configs.push_back(string(""));
opt->scorer_factors.push_back(string(""));
last_scorer_index++;
break;
case 'c':
@ -213,6 +217,9 @@ void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) {
opt->seed = strtol(optarg, NULL, 10);
opt->has_seed = true;
break;
case 'f':
opt->scorer_factors[last_scorer_index] = string(optarg);
break;
default:
usage();
}
@ -223,6 +230,7 @@ void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) {
{
opt->scorer_types.push_back(string("BLEU"));
opt->scorer_configs.push_back(string(""));
opt->scorer_factors.push_back(string(""));
}
}
@ -268,6 +276,7 @@ int main(int argc, char** argv)
for (size_t i = 0; i < option.scorer_types.size(); i++)
{
g_scorer = ScorerFactory::getScorer(option.scorer_types[i], option.scorer_configs[i]);
g_scorer->setFactors(option.scorer_factors[i]);
g_scorer->setReferenceFiles(refFiles);
EvaluatorUtil::evaluate(*fileIt, option.bootstrap);
delete g_scorer;

View File

@ -26,6 +26,7 @@ void usage()
cerr << "[--sctype|-s] the scorer type (default BLEU)" << endl;
cerr << "[--scconfig|-c] configuration string passed to scorer" << endl;
cerr << "\tThis is of the form NAME1:VAL1,NAME2:VAL2 etc " << endl;
cerr << "[--factors|-f] list of factors passed to the scorer (e.g. 0|2)" << endl;
cerr << "[--reference|-r] comma separated list of reference files" << endl;
cerr << "[--binary|-b] use binary output format (default to text )" << endl;
cerr << "[--nbest|-n] the nbest file" << endl;
@ -41,6 +42,7 @@ void usage()
static struct option long_options[] = {
{"sctype", required_argument, 0, 's'},
{"scconfig", required_argument,0, 'c'},
{"factors", required_argument,0, 'f'},
{"reference", required_argument, 0, 'r'},
{"binary", no_argument, 0, 'b'},
{"nbest", required_argument, 0, 'n'},
@ -57,6 +59,7 @@ static struct option long_options[] = {
struct ProgramOption {
string scorerType;
string scorerConfig;
string scorerFactors;
string referenceFile;
string nbestFile;
string scoreDataFile;
@ -69,6 +72,7 @@ struct ProgramOption {
ProgramOption()
: scorerType("BLEU"),
scorerConfig(""),
scorerFactors(""),
referenceFile(""),
nbestFile(""),
scoreDataFile("statscore.data"),
@ -83,7 +87,7 @@ void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) {
int c;
int option_index;
while ((c = getopt_long(argc, argv, "s:r:n:S:F:R:E:v:hb", long_options, &option_index)) != -1) {
while ((c = getopt_long(argc, argv, "s:r:f:n:S:F:R:E:v:hb", long_options, &option_index)) != -1) {
switch (c) {
case 's':
opt->scorerType = string(optarg);
@ -91,6 +95,9 @@ void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) {
case 'c':
opt->scorerConfig = string(optarg);
break;
case 'f':
opt->scorerFactors = string(optarg);
break;
case 'r':
opt->referenceFile = string(optarg);
break;
@ -180,6 +187,8 @@ int main(int argc, char** argv)
Scorer* scorer = ScorerFactory::getScorer(option.scorerType, option.scorerConfig);
scorer->setFactors(option.scorerFactors);
// load references
if (referenceFiles.size() > 0)
scorer->setReferenceFiles(referenceFiles);