2008-05-15 18:04:42 +04:00
|
|
|
/**
|
2009-02-12 15:58:27 +03:00
|
|
|
\description This is the main program for the new version of the MERT algorithm developed during the 2nd MT Marathon
|
2008-05-15 18:04:42 +04:00
|
|
|
*/
|
2008-05-15 15:06:32 +04:00
|
|
|
|
|
|
|
#include <limits>
|
2009-02-12 15:58:27 +03:00
|
|
|
#include <unistd.h>
|
|
|
|
#include <cstdlib>
|
|
|
|
#include <iostream>
|
|
|
|
#include <fstream>
|
|
|
|
#include <cmath>
|
|
|
|
#include <ctime>
|
|
|
|
|
|
|
|
#include <getopt.h>
|
|
|
|
|
2008-05-15 15:06:32 +04:00
|
|
|
#include "Data.h"
|
|
|
|
#include "Point.h"
|
|
|
|
#include "Scorer.h"
|
2008-05-27 20:50:52 +04:00
|
|
|
#include "ScorerFactory.h"
|
2008-05-15 15:06:32 +04:00
|
|
|
#include "ScoreData.h"
|
|
|
|
#include "FeatureData.h"
|
|
|
|
#include "Optimizer.h"
|
2008-05-15 23:09:01 +04:00
|
|
|
#include "Types.h"
|
2008-05-16 23:57:01 +04:00
|
|
|
#include "Timer.h"
|
2008-05-16 11:09:15 +04:00
|
|
|
#include "Util.h"
|
2008-05-15 15:06:32 +04:00
|
|
|
|
|
|
|
|
|
|
|
// Global interval threshold, initialised to 1e-3. It is not read anywhere in
// the visible part of this file; presumably another translation unit refers
// to it via an extern declaration -- TODO confirm before renaming or removing.
float min_interval = 1e-3;
|
|
|
|
|
2008-05-15 18:04:42 +04:00
|
|
|
using namespace std;
|
|
|
|
|
2011-02-24 15:42:19 +03:00
|
|
|
/**
 * Prints the command line help to stderr and terminates the process with
 * exit status 1. This function never returns.
 *
 * The default file names shown here must match the defaults that main()
 * assigns to its option variables (statscore.data, features.data, init.opt).
 */
void usage(void)
{
  std::cerr << "usage: mert -d <dimensions> (mandatory )" << std::endl;
  std::cerr << "[-n retry ntimes (default 1)]" << std::endl;
  std::cerr << "[-m number of random directions in powell (default 0)]" << std::endl;
  std::cerr << "[-o\tthe indexes to optimize(default all)]" << std::endl;
  std::cerr << "[-t\tthe optimizer(default powell)]" << std::endl;
  // Fixed: the closing bracket was missing on this line.
  std::cerr << "[-r\tthe random seed (defaults to system clock)]" << std::endl;
  std::cerr << "[--sctype|-s] the scorer type (default BLEU)" << std::endl;
  std::cerr << "[--scconfig|-c] configuration string passed to scorer" << std::endl;
  // Fixed: the documented defaults now match the values main() really uses.
  std::cerr << "[--scfile|-S] comma separated list of scorer data files (default statscore.data)" << std::endl;
  std::cerr << "[--ffile|-F] comma separated list of feature data files (default features.data)" << std::endl;
  std::cerr << "[--ifile|-i] the starting point data file (default init.opt)" << std::endl;
  std::cerr << "[-v] verbose level" << std::endl;
  std::cerr << "[--help|-h] print this message and exit" << std::endl;
  std::exit(1);
}
|
|
|
|
|
2011-02-24 15:42:19 +03:00
|
|
|
/**
 * Long option table handed to getopt_long(). Each row is
 * { long name, argument requirement, flag pointer, value returned }.
 * The flag pointer is always NULL, so getopt_long() returns the short option
 * character in the last column and main() handles both spellings in the same
 * switch case.
 */
static struct option long_options[] = {
  {"pdim",     required_argument, NULL, 'd'},
  {"ntry",     required_argument, NULL, 'n'},
  {"nrandom",  required_argument, NULL, 'm'},
  {"rseed",    required_argument, NULL, 'r'},
  {"optimize", required_argument, NULL, 'o'},
  {"type",     required_argument, NULL, 't'},
  {"sctype",   required_argument, NULL, 's'},
  {"scconfig", required_argument, NULL, 'c'},
  {"scfile",   required_argument, NULL, 'S'},
  {"ffile",    required_argument, NULL, 'F'},
  {"ifile",    required_argument, NULL, 'i'},
  {"verbose",  required_argument, NULL, 'v'},
  {"help",     no_argument,       NULL, 'h'},
  {NULL, 0, NULL, 0}  // terminating all-zero entry
};

// Receives the index of the matched long option from getopt_long().
int option_index;
|
2008-05-15 15:06:32 +04:00
|
|
|
|
2011-02-24 15:42:19 +03:00
|
|
|
int main (int argc, char **argv)
|
|
|
|
{
|
|
|
|
|
|
|
|
|
|
|
|
ResetUserTime();
|
|
|
|
|
|
|
|
/*
|
|
|
|
Timer timer;
|
|
|
|
timer.start("Starting...");
|
|
|
|
*/
|
|
|
|
|
2008-05-16 14:57:24 +04:00
|
|
|
int c,pdim,i;
|
|
|
|
pdim=-1;
|
2008-05-15 18:04:42 +04:00
|
|
|
int ntry=1;
|
2011-07-23 04:24:45 +04:00
|
|
|
int nrandom=0;
|
2009-02-12 15:58:27 +03:00
|
|
|
int seed=0;
|
|
|
|
bool hasSeed = false;
|
2008-05-15 18:04:42 +04:00
|
|
|
string type("powell");
|
|
|
|
string scorertype("BLEU");
|
2008-06-24 23:27:18 +04:00
|
|
|
string scorerconfig("");
|
2008-05-16 00:49:49 +04:00
|
|
|
string scorerfile("statscore.data");
|
|
|
|
string featurefile("features.data");
|
2008-05-23 15:48:16 +04:00
|
|
|
string initfile("init.opt");
|
2010-01-08 18:12:28 +03:00
|
|
|
|
|
|
|
string tooptimizestr("");
|
|
|
|
vector<unsigned> tooptimize;
|
2011-07-23 04:24:45 +04:00
|
|
|
vector<vector<parameter_t> > start_list;
|
2011-07-04 01:01:16 +04:00
|
|
|
vector<parameter_t> min;
|
|
|
|
vector<parameter_t> max;
|
|
|
|
//note: those mins and max are the bound for the starting points of the algorithm, not strict bound on the result!
|
|
|
|
|
2011-07-23 04:24:45 +04:00
|
|
|
while ((c=getopt_long (argc, argv, "o:r:d:n:m:t:s:S:F:v:", long_options, &option_index)) != -1) {
|
2008-05-15 15:06:32 +04:00
|
|
|
switch (c) {
|
2010-01-08 18:12:28 +03:00
|
|
|
case 'o':
|
|
|
|
tooptimizestr = string(optarg);
|
|
|
|
break;
|
2008-05-15 15:06:32 +04:00
|
|
|
case 'd':
|
2008-05-16 14:57:24 +04:00
|
|
|
pdim = strtol(optarg, NULL, 10);
|
2008-05-15 15:06:32 +04:00
|
|
|
break;
|
|
|
|
case 'n':
|
2008-05-15 18:04:42 +04:00
|
|
|
ntry=strtol(optarg, NULL, 10);
|
2008-05-15 15:06:32 +04:00
|
|
|
break;
|
2011-07-23 04:24:45 +04:00
|
|
|
case 'm':
|
|
|
|
nrandom=strtol(optarg, NULL, 10);
|
|
|
|
break;
|
2009-02-12 15:58:27 +03:00
|
|
|
case 'r':
|
|
|
|
seed=strtol(optarg, NULL, 10);
|
|
|
|
hasSeed = true;
|
|
|
|
break;
|
2008-05-15 18:04:42 +04:00
|
|
|
case 't':
|
|
|
|
type=string(optarg);
|
2008-05-16 00:49:49 +04:00
|
|
|
break;
|
2008-05-21 14:03:48 +04:00
|
|
|
case's':
|
|
|
|
scorertype=string(optarg);
|
2008-05-16 00:49:49 +04:00
|
|
|
break;
|
2008-06-24 23:27:18 +04:00
|
|
|
case 'c':
|
|
|
|
scorerconfig = string(optarg);
|
|
|
|
break;
|
2008-05-16 00:49:49 +04:00
|
|
|
case 'S':
|
|
|
|
scorerfile=string(optarg);
|
2008-05-21 14:03:48 +04:00
|
|
|
break;
|
2008-05-16 00:49:49 +04:00
|
|
|
case 'F':
|
|
|
|
featurefile=string(optarg);
|
|
|
|
break;
|
2008-05-23 15:48:16 +04:00
|
|
|
case 'i':
|
|
|
|
initfile=string(optarg);
|
|
|
|
break;
|
2008-05-16 11:09:15 +04:00
|
|
|
case 'v':
|
|
|
|
setverboselevel(strtol(optarg,NULL,10));
|
|
|
|
break;
|
2008-05-15 15:06:32 +04:00
|
|
|
default:
|
|
|
|
usage();
|
|
|
|
}
|
|
|
|
}
|
2008-05-16 14:57:24 +04:00
|
|
|
if (pdim < 0)
|
2008-05-16 11:09:15 +04:00
|
|
|
usage();
|
2009-02-12 15:58:27 +03:00
|
|
|
|
|
|
|
if (hasSeed) {
|
2011-02-24 15:42:19 +03:00
|
|
|
cerr << "Seeding random numbers with " << seed << endl;
|
|
|
|
srandom(seed);
|
2009-02-12 15:58:27 +03:00
|
|
|
} else {
|
2011-02-24 15:42:19 +03:00
|
|
|
cerr << "Seeding random numbers with system clock " << endl;
|
|
|
|
srandom(time(NULL));
|
2009-02-12 15:58:27 +03:00
|
|
|
}
|
2011-02-24 15:42:19 +03:00
|
|
|
|
2011-07-23 04:24:45 +04:00
|
|
|
// read in starting points
|
|
|
|
std::string onefile;
|
|
|
|
while (!initfile.empty()) {
|
|
|
|
getNextPound(initfile, onefile, ",");
|
|
|
|
vector<parameter_t> start;
|
|
|
|
ifstream opt(onefile.c_str());
|
|
|
|
if(opt.fail()) {
|
|
|
|
cerr<<"could not open initfile: " << initfile << endl;
|
|
|
|
exit(3);
|
|
|
|
}
|
|
|
|
start.resize(pdim);//to do:read from file
|
|
|
|
int j;
|
|
|
|
for( j=0; j<pdim&&!opt.fail(); j++)
|
|
|
|
opt>>start[j];
|
|
|
|
if(j<pdim) {
|
|
|
|
cerr<<initfile<<":Too few starting weights." << endl;
|
|
|
|
exit(3);
|
|
|
|
}
|
|
|
|
start_list.push_back(start);
|
|
|
|
// for the first time, also read in the min/max values for scores
|
|
|
|
if (start_list.size() == 1) {
|
|
|
|
min.resize(pdim);
|
|
|
|
for( j=0; j<pdim&&!opt.fail(); j++)
|
|
|
|
opt>>min[j];
|
|
|
|
if(j<pdim) {
|
|
|
|
cerr<<initfile<<":Too few minimum weights." << endl;
|
|
|
|
cerr<<"error could not initialize start point with " << initfile << endl;
|
|
|
|
exit(3);
|
|
|
|
}
|
|
|
|
max.resize(pdim);
|
|
|
|
for( j=0; j<pdim&&!opt.fail(); j++)
|
|
|
|
opt>>max[j];
|
|
|
|
if(j<pdim) {
|
|
|
|
cerr<<initfile<<":Too few maximum weights." << endl;
|
|
|
|
exit(3);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
opt.close();
|
2011-07-04 01:01:16 +04:00
|
|
|
}
|
2008-05-16 14:57:24 +04:00
|
|
|
|
2008-11-18 21:51:02 +03:00
|
|
|
vector<string> ScoreDataFiles;
|
2011-02-24 15:42:19 +03:00
|
|
|
if (scorerfile.length() > 0) {
|
2008-11-18 21:51:02 +03:00
|
|
|
std::string substring;
|
2011-02-24 15:42:19 +03:00
|
|
|
while (!scorerfile.empty()) {
|
2008-11-18 21:51:02 +03:00
|
|
|
getNextPound(scorerfile, substring, ",");
|
|
|
|
ScoreDataFiles.push_back(substring);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
vector<string> FeatureDataFiles;
|
2011-02-24 15:42:19 +03:00
|
|
|
if (featurefile.length() > 0) {
|
2008-11-18 21:51:02 +03:00
|
|
|
std::string substring;
|
2011-02-24 15:42:19 +03:00
|
|
|
while (!featurefile.empty()) {
|
2008-11-18 21:51:02 +03:00
|
|
|
getNextPound(featurefile, substring, ",");
|
|
|
|
FeatureDataFiles.push_back(substring);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-02-24 15:42:19 +03:00
|
|
|
if (ScoreDataFiles.size() != FeatureDataFiles.size()) {
|
2008-11-18 21:51:02 +03:00
|
|
|
throw runtime_error("Error: there is a different number of previous score and feature files");
|
|
|
|
}
|
|
|
|
|
2008-05-16 14:57:24 +04:00
|
|
|
//it make sense to know what parameter set were used to generate the nbest
|
2008-05-16 00:49:49 +04:00
|
|
|
ScorerFactory SF;
|
2008-06-24 23:27:18 +04:00
|
|
|
Scorer *TheScorer=SF.getScorer(scorertype,scorerconfig);
|
2008-05-21 14:03:48 +04:00
|
|
|
|
2008-11-18 21:51:02 +03:00
|
|
|
//load data
|
2008-05-27 20:50:52 +04:00
|
|
|
Data D(*TheScorer);
|
2011-02-24 15:42:19 +03:00
|
|
|
for (size_t i=0; i < ScoreDataFiles.size(); i++) {
|
2008-11-18 21:51:02 +03:00
|
|
|
cerr<<"Loading Data from: "<< ScoreDataFiles.at(i) << " and " << FeatureDataFiles.at(i) << endl;
|
|
|
|
D.load(FeatureDataFiles.at(i), ScoreDataFiles.at(i));
|
|
|
|
}
|
2008-05-21 14:03:48 +04:00
|
|
|
|
2010-01-08 18:12:28 +03:00
|
|
|
PrintUserTime("Data loaded");
|
2011-02-24 15:42:19 +03:00
|
|
|
|
|
|
|
if (tooptimizestr.length() > 0) {
|
2010-01-08 18:12:28 +03:00
|
|
|
cerr << "Weights to optimize: " << tooptimizestr << endl;
|
|
|
|
|
|
|
|
//parse string to get weights to optimize
|
|
|
|
//and set them as active
|
|
|
|
std::string substring;
|
|
|
|
int index;
|
2011-02-24 15:42:19 +03:00
|
|
|
while (!tooptimizestr.empty()) {
|
2010-01-08 18:12:28 +03:00
|
|
|
getNextPound(tooptimizestr, substring, ",");
|
|
|
|
index = D.getFeatureIndex(substring);
|
|
|
|
cerr << "FeatNameIndex:" << index << " to insert" << endl;
|
|
|
|
//index = strtol(substring.c_str(), NULL, 10);
|
2011-02-24 15:42:19 +03:00
|
|
|
if (index >= 0 && index < pdim) {
|
|
|
|
tooptimize.push_back(index);
|
|
|
|
} else {
|
|
|
|
cerr << "Index " << index << " is out of bounds. Allowed indexes are [0," << (pdim-1) << "]." << endl;
|
|
|
|
}
|
2010-01-08 18:12:28 +03:00
|
|
|
}
|
2011-02-24 15:42:19 +03:00
|
|
|
} else {
|
2010-01-08 18:12:28 +03:00
|
|
|
//set all weights as active
|
|
|
|
tooptimize.resize(pdim);//We'll optimize on everything
|
2011-02-24 15:42:19 +03:00
|
|
|
for(int i=0; i<pdim; i++) {
|
|
|
|
tooptimize[i]=1;
|
|
|
|
}
|
2010-01-08 18:12:28 +03:00
|
|
|
}
|
|
|
|
|
2011-07-23 04:24:45 +04:00
|
|
|
Optimizer *O=OptimizerFactory::BuildOptimizer(pdim,tooptimize,start_list[0],type,nrandom);
|
2008-05-15 18:04:42 +04:00
|
|
|
O->SetScorer(TheScorer);
|
2008-05-27 20:50:52 +04:00
|
|
|
O->SetFData(D.getFeatureData());
|
2009-01-07 16:30:06 +03:00
|
|
|
|
2011-07-23 04:24:45 +04:00
|
|
|
// run with specified starting points
|
2011-02-24 15:42:19 +03:00
|
|
|
stringstream oss;
|
2011-07-23 04:24:45 +04:00
|
|
|
statscore_t best=0, mean=0, var=0;
|
|
|
|
Point bestP;
|
|
|
|
for(int i=0;i<start_list.size();i++) {
|
|
|
|
Point P(start_list[i], min, max);//Generate from the full feature set. Warning: must be done after Optimizer initialization
|
|
|
|
statscore_t score=O->Run(P);
|
|
|
|
oss.str("");
|
|
|
|
oss << "Specified starting point number " << (1+i) << ", score: " << score;
|
|
|
|
if (i==0 || score>best) {
|
|
|
|
best=score;
|
|
|
|
bestP=P;
|
|
|
|
oss << " (new best)";
|
|
|
|
}
|
|
|
|
mean+=score;
|
|
|
|
var+=(score*score);
|
|
|
|
PrintUserTime(oss.str());
|
|
|
|
}
|
2011-02-24 15:42:19 +03:00
|
|
|
|
2011-07-23 04:24:45 +04:00
|
|
|
// run with random starting points
|
|
|
|
for(int i=0; i<ntry; i++) {
|
|
|
|
Point P(start_list[0], min, max);
|
2011-07-04 01:01:16 +04:00
|
|
|
P.Randomize(); // randomize within min and max as given to the constructor
|
2011-02-24 15:42:19 +03:00
|
|
|
statscore_t score=O->Run(P);
|
2011-07-23 04:24:45 +04:00
|
|
|
oss.str("");
|
|
|
|
oss << "Randomized starting point number " << (1+i) << ", score: " << score;
|
2011-02-24 15:42:19 +03:00
|
|
|
if(score>best) {
|
|
|
|
best=score;
|
|
|
|
bestP=P;
|
2011-07-23 04:24:45 +04:00
|
|
|
oss << " (new best)";
|
2011-02-24 15:42:19 +03:00
|
|
|
}
|
|
|
|
mean+=score;
|
|
|
|
var+=(score*score);
|
|
|
|
PrintUserTime(oss.str());
|
2008-05-15 18:04:42 +04:00
|
|
|
}
|
2011-02-24 15:42:19 +03:00
|
|
|
mean/=(float)ntry;
|
|
|
|
var/=(float)ntry;
|
|
|
|
var=sqrt(abs(var-mean*mean));
|
|
|
|
if (verboselevel()>1)
|
|
|
|
cerr<<"best score: "<< best << " variance of the score (for "<<ntry<<" try): "<<var<<endl;
|
|
|
|
|
2011-07-23 04:24:45 +04:00
|
|
|
// L1-Normalization of the best Point
|
|
|
|
if ((int)tooptimize.size() == pdim)
|
2011-01-25 19:10:47 +03:00
|
|
|
bestP.NormalizeL1();
|
|
|
|
|
2011-02-24 15:42:19 +03:00
|
|
|
cerr << "Best point: " << bestP << " => " << best << endl;
|
|
|
|
ofstream res("weights.txt");
|
|
|
|
res<<bestP<<endl;
|
|
|
|
|
|
|
|
PrintUserTime("Stopping...");
|
2008-05-15 15:06:32 +04:00
|
|
|
}
|