2008-05-16 23:57:01 +04:00
|
|
|
/**
|
|
|
|
 * Extract features and score statistics from nbest file, optionally merging with
|
|
|
|
* those from the previous iteration.
|
|
|
|
* Developed during the 2nd MT marathon.
|
|
|
|
**/
|
2008-05-15 12:35:56 +04:00
|
|
|
|
|
|
|
#include <iostream>
|
2008-05-16 23:57:01 +04:00
|
|
|
#include <string>
|
|
|
|
#include <vector>
|
2008-05-15 12:35:56 +04:00
|
|
|
|
2008-05-16 23:57:01 +04:00
|
|
|
#include <getopt.h>
|
2008-05-15 12:35:56 +04:00
|
|
|
|
2008-05-16 23:57:01 +04:00
|
|
|
#include "Data.h"
|
|
|
|
#include "Scorer.h"
|
2008-05-27 20:50:52 +04:00
|
|
|
#include "ScorerFactory.h"
|
2008-05-16 23:57:01 +04:00
|
|
|
#include "Timer.h"
|
|
|
|
#include "Util.h"
|
2008-05-15 12:35:56 +04:00
|
|
|
|
2008-05-16 23:57:01 +04:00
|
|
|
using namespace std;
|
2008-05-15 12:35:56 +04:00
|
|
|
|
2011-02-24 15:42:19 +03:00
|
|
|
/**
 * Print the command-line help for the extractor to stderr and terminate
 * the process with exit status 1.
 *
 * This function never returns. It is reached for --help/-h as well as for
 * any option the parser does not recognise.
 */
void usage()
{
  std::cerr << "usage: extractor [options]" << std::endl;
  std::cerr << "[--sctype|-s] the scorer type (default BLEU)" << std::endl;
  std::cerr << "[--scconfig|-c] configuration string passed to scorer" << std::endl;
  std::cerr << "\tThis is of the form NAME1:VAL1,NAME2:VAL2 etc " << std::endl;
  std::cerr << "[--reference|-r] comma separated list of reference files" << std::endl;
  std::cerr << "[--binary|-b] use binary output format (default to text )" << std::endl;
  std::cerr << "[--nbest|-n] the nbest file" << std::endl;
  std::cerr << "[--scfile|-S] the scorer data output file" << std::endl;
  std::cerr << "[--ffile|-F] the feature data output file" << std::endl;
  std::cerr << "[--prev-ffile|-E] comma separated list of previous feature data" << std::endl;
  std::cerr << "[--prev-scfile|-R] comma separated list of previous scorer data" << std::endl;
  // The long form is accepted by the option table, so advertise it too.
  std::cerr << "[--verbose|-v] verbose level" << std::endl;
  std::cerr << "[--help|-h] print this message and exit" << std::endl;
  exit(1);
}
|
2008-05-15 12:35:56 +04:00
|
|
|
|
|
|
|
|
2011-02-24 15:42:19 +03:00
|
|
|
// Long command-line options understood by getopt_long().
// Each entry is { name, argument requirement, flag pointer, value returned };
// the flag pointer is always 0, so getopt_long() returns the short-option
// character given in the last column.
static struct option long_options[] = {
  { "sctype",      required_argument, 0, 's' },
  { "scconfig",    required_argument, 0, 'c' },
  { "reference",   required_argument, 0, 'r' },
  { "binary",      no_argument,       0, 'b' },
  { "nbest",       required_argument, 0, 'n' },
  { "scfile",      required_argument, 0, 'S' },
  { "ffile",       required_argument, 0, 'F' },
  { "prev-scfile", required_argument, 0, 'R' },
  { "prev-ffile",  required_argument, 0, 'E' },
  { "verbose",     required_argument, 0, 'v' },
  { "help",        no_argument,       0, 'h' },
  { 0, 0, 0, 0 }  // terminating all-zero entry
};

// Receives the index into long_options of the long option matched by
// getopt_long().
int option_index;
|
|
|
|
|
2011-02-24 15:42:19 +03:00
|
|
|
int main(int argc, char** argv)
|
|
|
|
{
|
|
|
|
|
|
|
|
|
|
|
|
ResetUserTime();
|
|
|
|
|
|
|
|
/*
|
|
|
|
Timer timer;
|
|
|
|
timer.start("Starting...");
|
|
|
|
*/
|
|
|
|
|
|
|
|
//defaults
|
|
|
|
string scorerType("BLEU");
|
|
|
|
string scorerConfig("");
|
|
|
|
string referenceFile("");
|
|
|
|
string nbestFile("");
|
|
|
|
string scoreDataFile("statscore.data");
|
|
|
|
string featureDataFile("features.data");
|
|
|
|
string prevScoreDataFile("");
|
|
|
|
string prevFeatureDataFile("");
|
|
|
|
bool binmode = false;
|
|
|
|
int verbosity = 0;
|
|
|
|
int c;
|
|
|
|
while ((c=getopt_long (argc,argv, "s:r:n:S:F:R:E:v:hb", long_options, &option_index)) != -1) {
|
|
|
|
switch(c) {
|
2011-11-12 04:24:19 +04:00
|
|
|
case 's':
|
|
|
|
scorerType = string(optarg);
|
|
|
|
break;
|
|
|
|
case 'c':
|
|
|
|
scorerConfig = string(optarg);
|
|
|
|
break;
|
|
|
|
case 'r':
|
|
|
|
referenceFile = string(optarg);
|
|
|
|
break;
|
|
|
|
case 'b':
|
|
|
|
binmode = true;
|
|
|
|
break;
|
|
|
|
case 'n':
|
|
|
|
nbestFile = string(optarg);
|
|
|
|
break;
|
|
|
|
case 'S':
|
|
|
|
scoreDataFile = string(optarg);
|
|
|
|
break;
|
|
|
|
case 'F':
|
|
|
|
featureDataFile = string(optarg);
|
|
|
|
break;
|
|
|
|
case 'E':
|
|
|
|
prevFeatureDataFile = string(optarg);
|
|
|
|
break;
|
|
|
|
case 'R':
|
|
|
|
prevScoreDataFile = string(optarg);
|
|
|
|
break;
|
|
|
|
case 'v':
|
|
|
|
verbosity = atoi(optarg);
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
usage();
|
2008-05-16 23:57:01 +04:00
|
|
|
}
|
2011-02-24 15:42:19 +03:00
|
|
|
}
|
|
|
|
try {
|
2011-11-12 03:58:23 +04:00
|
|
|
// check whether score statistics file is specified
|
2011-02-24 15:42:19 +03:00
|
|
|
if (scoreDataFile.length() == 0) {
|
|
|
|
throw runtime_error("Error: output score statistics file is not specified");
|
2008-05-16 23:57:01 +04:00
|
|
|
}
|
2008-05-15 12:35:56 +04:00
|
|
|
|
2011-11-12 03:58:23 +04:00
|
|
|
// check wheter feature file is specified
|
2011-02-24 15:42:19 +03:00
|
|
|
if (featureDataFile.length() == 0) {
|
|
|
|
throw runtime_error("Error: output feature file is not specified");
|
2008-06-03 12:56:37 +04:00
|
|
|
}
|
2008-05-15 12:35:56 +04:00
|
|
|
|
2011-11-12 03:58:23 +04:00
|
|
|
// check whether reference file is specified when nbest is specified
|
2011-02-24 15:42:19 +03:00
|
|
|
if ((nbestFile.length() > 0 && referenceFile.length() == 0)) {
|
|
|
|
throw runtime_error("Error: reference file is not specified; you can not score the nbest");
|
2008-06-03 12:56:37 +04:00
|
|
|
}
|
2011-02-24 15:42:19 +03:00
|
|
|
|
2008-06-03 12:56:37 +04:00
|
|
|
vector<string> nbestFiles;
|
2011-02-24 15:42:19 +03:00
|
|
|
if (nbestFile.length() > 0) {
|
2011-11-11 17:00:30 +04:00
|
|
|
Tokenize(nbestFile.c_str(), ',', &nbestFiles);
|
2008-05-16 23:57:01 +04:00
|
|
|
}
|
2008-05-15 12:35:56 +04:00
|
|
|
|
2008-05-16 23:57:01 +04:00
|
|
|
vector<string> referenceFiles;
|
2011-02-24 15:42:19 +03:00
|
|
|
if (referenceFile.length() > 0) {
|
2011-11-11 17:00:30 +04:00
|
|
|
Tokenize(referenceFile.c_str(), ',', &referenceFiles);
|
2008-06-03 12:56:37 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
vector<string> prevScoreDataFiles;
|
2011-02-24 15:42:19 +03:00
|
|
|
if (prevScoreDataFile.length() > 0) {
|
2011-11-11 17:00:30 +04:00
|
|
|
Tokenize(prevScoreDataFile.c_str(), ',', &prevScoreDataFiles);
|
2008-05-16 23:57:01 +04:00
|
|
|
}
|
2008-05-15 12:35:56 +04:00
|
|
|
|
2008-06-03 12:56:37 +04:00
|
|
|
vector<string> prevFeatureDataFiles;
|
2011-02-24 15:42:19 +03:00
|
|
|
if (prevFeatureDataFile.length() > 0) {
|
2011-11-11 17:00:30 +04:00
|
|
|
Tokenize(prevFeatureDataFile.c_str(), ',', &prevFeatureDataFiles);
|
2011-02-24 15:42:19 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
if (prevScoreDataFiles.size() != prevFeatureDataFiles.size()) {
|
|
|
|
throw runtime_error("Error: there is a different number of previous score and feature files");
|
2008-06-03 12:56:37 +04:00
|
|
|
}
|
|
|
|
|
2011-02-24 15:42:19 +03:00
|
|
|
|
2012-04-30 08:29:18 +04:00
|
|
|
// if (binmode) cerr << "Binary write mode is selected" << endl;
|
|
|
|
// else cerr << "Binary write mode is NOT selected" << endl;
|
2011-02-24 15:42:19 +03:00
|
|
|
|
2012-04-30 08:29:18 +04:00
|
|
|
// TRACE_ERR("Scorer type: " << scorerType << endl);
|
2011-11-11 15:40:59 +04:00
|
|
|
// ScorerFactory sfactory;
|
|
|
|
Scorer* scorer = ScorerFactory::getScorer(scorerType,scorerConfig);
|
2011-02-24 15:42:19 +03:00
|
|
|
|
2011-11-12 03:58:23 +04:00
|
|
|
// load references
|
2011-02-24 15:42:19 +03:00
|
|
|
if (referenceFiles.size() > 0)
|
|
|
|
scorer->setReferenceFiles(referenceFiles);
|
|
|
|
|
2012-04-30 08:29:18 +04:00
|
|
|
// PrintUserTime("References loaded");
|
2011-02-24 15:42:19 +03:00
|
|
|
|
|
|
|
Data data(*scorer);
|
|
|
|
|
2011-11-12 03:58:23 +04:00
|
|
|
// load old data
|
2011-02-24 15:42:19 +03:00
|
|
|
for (size_t i=0; i < prevScoreDataFiles.size(); i++) {
|
|
|
|
data.load(prevFeatureDataFiles.at(i), prevScoreDataFiles.at(i));
|
2008-06-03 12:56:37 +04:00
|
|
|
}
|
|
|
|
|
2012-04-30 08:29:18 +04:00
|
|
|
// PrintUserTime("Previous data loaded");
|
2011-02-24 15:42:19 +03:00
|
|
|
|
2011-11-12 03:58:23 +04:00
|
|
|
// computing score statistics of each nbest file
|
2011-02-24 15:42:19 +03:00
|
|
|
for (size_t i=0; i < nbestFiles.size(); i++) {
|
|
|
|
data.loadnbest(nbestFiles.at(i));
|
2008-05-15 18:13:32 +04:00
|
|
|
}
|
2008-05-15 12:35:56 +04:00
|
|
|
|
2012-04-30 08:29:18 +04:00
|
|
|
// PrintUserTime("Nbest entries loaded and scored");
|
2011-02-24 15:42:19 +03:00
|
|
|
|
2012-04-30 08:29:18 +04:00
|
|
|
// if (binmode)
|
|
|
|
// cerr << "Binary write mode is selected" << endl;
|
|
|
|
// else
|
|
|
|
// cerr << "Binary write mode is NOT selected" << endl;
|
2011-02-24 15:42:19 +03:00
|
|
|
|
|
|
|
data.save(featureDataFile, scoreDataFile, binmode);
|
2012-04-30 08:29:18 +04:00
|
|
|
// PrintUserTime("Stopping...");
|
2011-11-12 04:24:19 +04:00
|
|
|
|
|
|
|
// timer.stop("Stopping...");
|
2011-02-24 15:42:19 +03:00
|
|
|
|
2011-11-11 14:11:10 +04:00
|
|
|
delete scorer;
|
|
|
|
|
2011-02-24 15:42:19 +03:00
|
|
|
return EXIT_SUCCESS;
|
|
|
|
} catch (const exception& e) {
|
|
|
|
cerr << "Exception: " << e.what() << endl;
|
|
|
|
return EXIT_FAILURE;
|
|
|
|
}
|
|
|
|
|
2008-05-15 12:35:56 +04:00
|
|
|
}
|