2008-05-16 23:57:01 +04:00
|
|
|
/**
|
|
|
|
* Extract features and score statistics from nbest file, optionally merging with
|
|
|
|
* those from the previous iteration.
|
|
|
|
* Developed during the 2nd MT marathon.
|
|
|
|
**/
|
2008-05-15 12:35:56 +04:00
|
|
|
|
|
|
|
#include <iostream>
|
2008-05-16 23:57:01 +04:00
|
|
|
#include <string>
|
|
|
|
#include <vector>
|
2008-05-15 12:35:56 +04:00
|
|
|
|
2008-05-16 23:57:01 +04:00
|
|
|
#include <getopt.h>
|
2012-03-04 19:35:07 +04:00
|
|
|
#include <boost/scoped_ptr.hpp>
|
2008-05-15 12:35:56 +04:00
|
|
|
|
2008-05-16 23:57:01 +04:00
|
|
|
#include "Data.h"
|
|
|
|
#include "Scorer.h"
|
2008-05-27 20:50:52 +04:00
|
|
|
#include "ScorerFactory.h"
|
2008-05-16 23:57:01 +04:00
|
|
|
#include "Timer.h"
|
|
|
|
#include "Util.h"
|
2008-05-15 12:35:56 +04:00
|
|
|
|
2008-05-16 23:57:01 +04:00
|
|
|
using namespace std;
|
2012-07-01 00:39:10 +04:00
|
|
|
using namespace MosesTuning;
|
2008-05-15 12:35:56 +04:00
|
|
|
|
2013-05-29 21:16:15 +04:00
|
|
|
namespace
{

/**
 * Print the command line synopsis to stderr and terminate the process
 * with exit status 1. This function never returns.
 */
void usage()
{
  std::cerr << "usage: extractor [options]" << std::endl;
  std::cerr << "[--sctype|-s] the scorer type (default BLEU)" << std::endl;
  std::cerr << "[--scconfig|-c] configuration string passed to scorer" << std::endl;
  std::cerr << "\tThis is of the form NAME1:VAL1,NAME2:VAL2 etc " << std::endl;
  std::cerr << "[--reference|-r] comma separated list of reference files" << std::endl;
  std::cerr << "[--binary|-b] use binary output format (default to text )" << std::endl;
  std::cerr << "[--nbest|-n] the nbest file" << std::endl;
  std::cerr << "[--scfile|-S] the scorer data output file" << std::endl;
  std::cerr << "[--ffile|-F] the feature data output file" << std::endl;
  std::cerr << "[--prev-ffile|-E] comma separated list of previous feature data" << std::endl;
  std::cerr << "[--prev-scfile|-R] comma separated list of previous scorer data" << std::endl;
  std::cerr << "[--factors|-f] list of factors passed to the scorer (e.g. 0|2)" << std::endl;
  std::cerr << "[--filter|-l] filter command used to preprocess the sentences" << std::endl;
  std::cerr << "[--allow-duplicates|-d] omit the duplicate removal step" << std::endl;
  std::cerr << "[--verbose|-v] verbose level" << std::endl;
  std::cerr << "[--help|-h] print this message and exit" << std::endl;
  exit(1);
}

// Long option table for getopt_long(). Every entry maps to the short
// option character that ParseCommandOptions() switches on, so each long
// option behaves exactly like its short form.
static struct option long_options[] = {
  {"sctype", required_argument, 0, 's'},
  {"scconfig", required_argument, 0, 'c'},
  {"factors", required_argument, 0, 'f'},
  {"filter", required_argument, 0, 'l'},
  {"reference", required_argument, 0, 'r'},
  {"binary", no_argument, 0, 'b'},
  {"nbest", required_argument, 0, 'n'},
  {"scfile", required_argument, 0, 'S'},
  {"ffile", required_argument, 0, 'F'},
  {"prev-scfile", required_argument, 0, 'R'},
  {"prev-ffile", required_argument, 0, 'E'},
  {"verbose", required_argument, 0, 'v'},
  {"help", no_argument, 0, 'h'},
  {"allow-duplicates", no_argument, 0, 'd'},
  {0, 0, 0, 0}
};

// Command line options used in extractor.
struct ProgramOption {
  std::string scorerType;           // --sctype, defaults to "BLEU"
  std::string scorerConfig;         // --scconfig, NAME1:VAL1,NAME2:VAL2 ...
  std::string scorerFactors;        // --factors
  std::string scorerFilter;         // --filter
  std::string referenceFile;        // --reference, comma separated list
  std::string nbestFile;            // --nbest
  std::string scoreDataFile;        // --scfile, defaults to "statscore.data"
  std::string featureDataFile;      // --ffile, defaults to "features.data"
  std::string prevScoreDataFile;    // --prev-scfile, comma separated list
  std::string prevFeatureDataFile;  // --prev-ffile, comma separated list
  bool binmode;                     // --binary
  bool allowDuplicates;             // --allow-duplicates
  int verbosity;                    // --verbose

  ProgramOption()
    : scorerType("BLEU"),
      scorerConfig(""),
      scorerFactors(""),
      scorerFilter(""),
      referenceFile(""),
      nbestFile(""),
      scoreDataFile("statscore.data"),
      featureDataFile("features.data"),
      prevScoreDataFile(""),
      prevFeatureDataFile(""),
      binmode(false),
      allowDuplicates(false),
      verbosity(0) { }
};

/**
 * Fill *opt from the command line.
 *
 * Options that do not appear on the command line keep the value *opt
 * already holds. Unknown options, options with a missing argument and
 * --help/-h all end in usage(), which terminates the process.
 *
 * @param argc argument count as received by main()
 * @param argv argument vector as received by main()
 * @param opt  option record to update; must not be null
 */
void ParseCommandOptions(int argc, char** argv, ProgramOption* opt)
{
  int c;
  int option_index;

  // The short option string has to list every character handled in the
  // switch below. "c:" was missing, so "-c CONFIG" was rejected by getopt
  // even though usage() documents it and case 'c' handles it.
  while ((c = getopt_long(argc, argv, "s:c:r:f:l:n:S:F:R:E:v:hbd", long_options, &option_index)) != -1) {
    switch (c) {
    case 's':
      opt->scorerType = std::string(optarg);
      break;
    case 'c':
      opt->scorerConfig = std::string(optarg);
      break;
    case 'f':
      opt->scorerFactors = std::string(optarg);
      break;
    case 'l':
      opt->scorerFilter = std::string(optarg);
      break;
    case 'r':
      opt->referenceFile = std::string(optarg);
      break;
    case 'b':
      opt->binmode = true;
      break;
    case 'n':
      opt->nbestFile = std::string(optarg);
      break;
    case 'S':
      opt->scoreDataFile = std::string(optarg);
      break;
    case 'F':
      opt->featureDataFile = std::string(optarg);
      break;
    case 'E':
      opt->prevFeatureDataFile = std::string(optarg);
      break;
    case 'R':
      opt->prevScoreDataFile = std::string(optarg);
      break;
    case 'v':
      opt->verbosity = atoi(optarg);
      break;
    case 'd':
      opt->allowDuplicates = true;
      break;
    case 'h':  // explicit help request: same path as an invalid option
    default:
      usage();
    }
  }
}

} // anonymous namespace
|
|
|
|
|
|
|
|
int main(int argc, char** argv)
|
|
|
|
{
|
|
|
|
ResetUserTime();
|
|
|
|
|
|
|
|
ProgramOption option;
|
|
|
|
ParseCommandOptions(argc, argv, &option);
|
|
|
|
|
2011-02-24 15:42:19 +03:00
|
|
|
try {
|
2011-11-12 03:58:23 +04:00
|
|
|
// check whether score statistics file is specified
|
2012-02-01 07:23:15 +04:00
|
|
|
if (option.scoreDataFile.length() == 0) {
|
2011-02-24 15:42:19 +03:00
|
|
|
throw runtime_error("Error: output score statistics file is not specified");
|
2008-05-16 23:57:01 +04:00
|
|
|
}
|
2008-05-15 12:35:56 +04:00
|
|
|
|
2011-11-12 03:58:23 +04:00
|
|
|
// check wheter feature file is specified
|
2012-02-01 07:23:15 +04:00
|
|
|
if (option.featureDataFile.length() == 0) {
|
2011-02-24 15:42:19 +03:00
|
|
|
throw runtime_error("Error: output feature file is not specified");
|
2008-06-03 12:56:37 +04:00
|
|
|
}
|
2008-05-15 12:35:56 +04:00
|
|
|
|
2011-11-12 03:58:23 +04:00
|
|
|
// check whether reference file is specified when nbest is specified
|
2012-02-01 07:23:15 +04:00
|
|
|
if ((option.nbestFile.length() > 0 && option.referenceFile.length() == 0)) {
|
2011-02-24 15:42:19 +03:00
|
|
|
throw runtime_error("Error: reference file is not specified; you can not score the nbest");
|
2008-06-03 12:56:37 +04:00
|
|
|
}
|
2011-02-24 15:42:19 +03:00
|
|
|
|
2008-06-03 12:56:37 +04:00
|
|
|
vector<string> nbestFiles;
|
2012-02-01 07:23:15 +04:00
|
|
|
if (option.nbestFile.length() > 0) {
|
|
|
|
Tokenize(option.nbestFile.c_str(), ',', &nbestFiles);
|
2008-05-16 23:57:01 +04:00
|
|
|
}
|
2008-05-15 12:35:56 +04:00
|
|
|
|
2008-05-16 23:57:01 +04:00
|
|
|
vector<string> referenceFiles;
|
2012-02-01 07:23:15 +04:00
|
|
|
if (option.referenceFile.length() > 0) {
|
|
|
|
Tokenize(option.referenceFile.c_str(), ',', &referenceFiles);
|
2008-06-03 12:56:37 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
vector<string> prevScoreDataFiles;
|
2012-02-01 07:23:15 +04:00
|
|
|
if (option.prevScoreDataFile.length() > 0) {
|
|
|
|
Tokenize(option.prevScoreDataFile.c_str(), ',', &prevScoreDataFiles);
|
2008-05-16 23:57:01 +04:00
|
|
|
}
|
2008-05-15 12:35:56 +04:00
|
|
|
|
2008-06-03 12:56:37 +04:00
|
|
|
vector<string> prevFeatureDataFiles;
|
2012-02-01 07:23:15 +04:00
|
|
|
if (option.prevFeatureDataFile.length() > 0) {
|
|
|
|
Tokenize(option.prevFeatureDataFile.c_str(), ',', &prevFeatureDataFiles);
|
2011-02-24 15:42:19 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
if (prevScoreDataFiles.size() != prevFeatureDataFiles.size()) {
|
|
|
|
throw runtime_error("Error: there is a different number of previous score and feature files");
|
2008-06-03 12:56:37 +04:00
|
|
|
}
|
|
|
|
|
2012-02-01 07:23:15 +04:00
|
|
|
if (option.binmode) {
|
|
|
|
cerr << "Binary write mode is selected" << endl;
|
|
|
|
} else {
|
|
|
|
cerr << "Binary write mode is NOT selected" << endl;
|
|
|
|
}
|
2011-02-24 15:42:19 +03:00
|
|
|
|
2012-02-01 07:23:15 +04:00
|
|
|
TRACE_ERR("Scorer type: " << option.scorerType << endl);
|
2011-02-24 15:42:19 +03:00
|
|
|
|
2012-03-04 19:35:07 +04:00
|
|
|
boost::scoped_ptr<Scorer> scorer(
|
2013-05-29 21:16:15 +04:00
|
|
|
ScorerFactory::getScorer(option.scorerType, option.scorerConfig));
|
2011-02-24 15:42:19 +03:00
|
|
|
|
2012-05-09 21:21:41 +04:00
|
|
|
// set Factors and Filter used to preprocess the sentences
|
2012-02-28 05:27:23 +04:00
|
|
|
scorer->setFactors(option.scorerFactors);
|
2012-05-09 21:21:41 +04:00
|
|
|
scorer->setFilter(option.scorerFilter);
|
2011-02-24 15:42:19 +03:00
|
|
|
|
2011-11-12 03:58:23 +04:00
|
|
|
// load references
|
2011-02-24 15:42:19 +03:00
|
|
|
if (referenceFiles.size() > 0)
|
|
|
|
scorer->setReferenceFiles(referenceFiles);
|
|
|
|
|
2012-04-30 08:29:18 +04:00
|
|
|
// PrintUserTime("References loaded");
|
2011-02-24 15:42:19 +03:00
|
|
|
|
2012-03-10 12:28:38 +04:00
|
|
|
Data data(scorer.get());
|
2011-02-24 15:42:19 +03:00
|
|
|
|
2011-11-12 03:58:23 +04:00
|
|
|
// load old data
|
2012-02-01 07:29:45 +04:00
|
|
|
for (size_t i = 0; i < prevScoreDataFiles.size(); i++) {
|
2011-02-24 15:42:19 +03:00
|
|
|
data.load(prevFeatureDataFiles.at(i), prevScoreDataFiles.at(i));
|
2008-06-03 12:56:37 +04:00
|
|
|
}
|
|
|
|
|
2012-04-30 08:29:18 +04:00
|
|
|
// PrintUserTime("Previous data loaded");
|
2011-02-24 15:42:19 +03:00
|
|
|
|
2011-11-12 03:58:23 +04:00
|
|
|
// computing score statistics of each nbest file
|
2012-02-01 07:29:45 +04:00
|
|
|
for (size_t i = 0; i < nbestFiles.size(); i++) {
|
2012-03-10 12:47:01 +04:00
|
|
|
data.loadNBest(nbestFiles.at(i));
|
2008-05-15 18:13:32 +04:00
|
|
|
}
|
2008-05-15 12:35:56 +04:00
|
|
|
|
2012-04-30 08:29:18 +04:00
|
|
|
// PrintUserTime("Nbest entries loaded and scored");
|
2011-02-24 15:42:19 +03:00
|
|
|
|
2011-12-12 17:48:42 +04:00
|
|
|
//ADDED_BY_TS
|
2012-07-12 22:08:55 +04:00
|
|
|
if (!option.allowDuplicates) {
|
|
|
|
data.removeDuplicates();
|
|
|
|
}
|
2011-12-12 17:48:42 +04:00
|
|
|
//END_ADDED
|
2011-02-24 15:42:19 +03:00
|
|
|
|
2012-02-01 07:23:15 +04:00
|
|
|
data.save(option.featureDataFile, option.scoreDataFile, option.binmode);
|
2011-02-24 15:42:19 +03:00
|
|
|
PrintUserTime("Stopping...");
|
2011-11-11 14:11:10 +04:00
|
|
|
|
2011-02-24 15:42:19 +03:00
|
|
|
return EXIT_SUCCESS;
|
|
|
|
} catch (const exception& e) {
|
|
|
|
cerr << "Exception: " << e.what() << endl;
|
|
|
|
return EXIT_FAILURE;
|
|
|
|
}
|
2008-05-15 12:35:56 +04:00
|
|
|
}
|