added random directions [Cer&al.,2008] and historic best as starting points [Foster&Kuhn,2009] to MERT

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4086 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
phkoehn 2011-07-23 00:24:45 +00:00
parent 6a27dc4f17
commit 1bd74fc87f
4 changed files with 185 additions and 72 deletions

View File

@ -28,7 +28,7 @@ void Optimizer::SetFData(FeatureData *F)
FData=F;
};
Optimizer::Optimizer(unsigned Pd,vector<unsigned> i2O,vector<parameter_t> start):scorer(NULL),FData(NULL)
Optimizer::Optimizer(unsigned Pd,vector<unsigned> i2O,vector<parameter_t> start, unsigned int nrandom):scorer(NULL),FData(NULL),number_of_random_directions(nrandom)
{
//warning: the init vector is a full set of parameters, of dimension pdim!
@ -381,15 +381,20 @@ statscore_t SimpleOptimizer::TrueRun(Point& P)const
Point linebest;
for(unsigned int d=0; d<Point::getdim(); d++) {
for(unsigned int d=0; d<Point::getdim()+number_of_random_directions; d++) {
if(verboselevel()>4) {
// cerr<<"minimizing along direction "<<d<<endl;
cerr<<"starting point: " << P << " => " << prevscore << endl;
}
Point direction;
for(unsigned int i=0; i<Point::getdim(); i++)
direction[i];
direction[d]=1.0;
if (d<Point::getdim()) { // regular updates along one dimension
for(unsigned int i=0; i<Point::getdim(); i++)
direction[i]=0.0;
direction[d]=1.0;
}
else { // random direction update
direction.Randomize();
}
statscore_t curscore=LineOptimize(P,direction,linebest);//find the minimum on the line
if(verboselevel()>5) {
cerr<<"direction: "<< d << " => " << curscore << endl;
@ -417,6 +422,39 @@ statscore_t SimpleOptimizer::TrueRun(Point& P)const
return bestscore;
}
//---------------- code for the optimizer with random directions
float RandomDirectionOptimizer::eps=0.0001;
/**
 * Improve P by repeated line searches along randomly drawn directions.
 *
 * Every iteration draws a fresh direction with Point::Randomize() and hands it
 * to LineOptimize(). P is passed both as the starting point and as the result
 * argument, so it is presumably overwritten with the best point found on that
 * line (the verbose output below reports it as the "ending point").
 *
 * The search stops once number_of_random_directions consecutive line searches
 * have each failed to raise the score by more than eps. An improving search
 * resets that count to zero and is itself not counted as a "no change" run.
 *
 * @param P starting point; P.score is read as the initial score and P is
 *          updated in place.
 * @return the score reported by the last line search, or the initial P.score
 *         when number_of_random_directions is 0 and no search is run.
 */
statscore_t RandomDirectionOptimizer::TrueRun(Point& P)const
{
  statscore_t prevscore=P.score;

  unsigned int nrun = 0;            // total number of line searches performed
  unsigned int nrun_no_change = 0;  // consecutive line searches without improvement

  while(nrun_no_change<number_of_random_directions) {
    // choose a random direction in which to optimize
    Point direction;
    direction.Randomize();

    // line search along that direction; the result is written back into P
    statscore_t score=LineOptimize(P,direction,P);
    if(verboselevel()>4) {
      cerr<<"direction: "<< direction << " => " << score;
      cerr<<" ("<< (score-prevscore) << ")" << endl;
      cerr<<"\tending point: "<< P << " => " << score << endl;
    }

    // Only non-improving searches count towards the stopping criterion.
    // (Previously the counter was reset to 0 and then immediately bumped to 1
    // by the loop increment, so an improving run was counted as "no change".)
    if (score-prevscore > eps)
      nrun_no_change=0;
    else
      nrun_no_change++;

    prevscore = score;
    nrun++;
  }

  if(verboselevel()>2) {
    // NOTE(review): the message says "Powell" although this is the
    // random-direction optimizer; text left unchanged in case logs are parsed.
    cerr<<"end Powell Algo, nrun="<<nrun<<endl;
  }
  return prevscore;
}
/**RandomOptimizer to use as baseline and test.\n
Just return a random point*/
@ -436,6 +474,7 @@ void OptimizerFactory::SetTypeNames()
if(typenames.empty()) {
typenames.resize(NOPTIMIZER);
typenames[POWELL]="powell";
typenames[RANDOM_DIRECTION]="random-direction";
typenames[RANDOM]="random";
//add new type there
}
@ -458,9 +497,8 @@ OptimizerFactory::OptType OptimizerFactory::GetOType(string type)
return((OptType)thetype);
};
Optimizer* OptimizerFactory::BuildOptimizer(unsigned dim,vector<unsigned> i2o,vector<parameter_t> start,string type)
Optimizer* OptimizerFactory::BuildOptimizer(unsigned dim,vector<unsigned> i2o,vector<parameter_t> start,string type, unsigned int nrandom)
{
OptType T=GetOType(type);
if(T==NOPTIMIZER) {
cerr<<"Error: unknown Optimizer type "<<type<<endl;
@ -473,10 +511,13 @@ Optimizer* OptimizerFactory::BuildOptimizer(unsigned dim,vector<unsigned> i2o,ve
switch((OptType)T) {
case POWELL:
return new SimpleOptimizer(dim,i2o,start);
return new SimpleOptimizer(dim,i2o,start,nrandom);
break;
case RANDOM_DIRECTION:
return new RandomDirectionOptimizer(dim,i2o,start,nrandom);
break;
case RANDOM:
return new RandomOptimizer(dim,i2o,start);
return new RandomOptimizer(dim,i2o,start,nrandom);
break;
default:
cerr<<"Error: unknown optimizer"<<type<<endl;

View File

@ -11,8 +11,6 @@
typedef float featurescore;
using namespace std;
/**abstract virtual class*/
class Optimizer
@ -20,8 +18,9 @@ class Optimizer
protected:
Scorer * scorer; //no accessor for them only child can use them
FeatureData * FData;//no accessor for them only child can use them
unsigned int number_of_random_directions;
public:
Optimizer(unsigned Pd,vector<unsigned> i2O,vector<parameter_t> start);
Optimizer(unsigned Pd,vector<unsigned> i2O,vector<parameter_t> start,unsigned int nrandom);
void SetScorer(Scorer *S);
void SetFData(FeatureData *F);
virtual ~Optimizer();
@ -46,34 +45,43 @@ public:
};
/**default basic optimizer*/
/**default basic optimizer*/
class SimpleOptimizer: public Optimizer
{
private:
static float eps;
static float eps;
public:
SimpleOptimizer(unsigned dim,vector<unsigned> i2O,vector<parameter_t> start):Optimizer(dim,i2O,start) {};
SimpleOptimizer(unsigned dim,vector<unsigned> i2O,vector<parameter_t> start,unsigned int nrandom):Optimizer(dim,i2O,start,nrandom) {};
virtual statscore_t TrueRun(Point&)const;
};
/**
 * Optimizer variant that searches along random directions.
 * All constructor arguments, including the number of random directions,
 * are forwarded unchanged to the Optimizer base class.
 */
class RandomDirectionOptimizer: public Optimizer
{
  // Class members are private by default; eps is defined out of class.
  static float eps;

public:
  RandomDirectionOptimizer(unsigned dim,vector<unsigned> i2O,vector<parameter_t> start,unsigned int nrandom)
    : Optimizer(dim,i2O,start,nrandom)
  {
  }

  /** Runs the optimization on the given point and returns the resulting score. */
  virtual statscore_t TrueRun(Point&)const;
};
/**dumb baseline optimizer: just picks a random point and quits*/
class RandomOptimizer: public Optimizer
{
public:
RandomOptimizer(unsigned dim,vector<unsigned> i2O,vector<parameter_t> start):Optimizer(dim,i2O,start) {};
RandomOptimizer(unsigned dim,vector<unsigned> i2O,vector<parameter_t> start, unsigned int nrandom):Optimizer(dim,i2O,start,nrandom) {};
virtual statscore_t TrueRun(Point&)const;
};
class OptimizerFactory
{
public:
// unsigned dim;
//Point Start;
static vector<string> GetTypeNames();
static Optimizer* BuildOptimizer(unsigned dim,vector<unsigned>tooptimize,vector<parameter_t> start,string type);
static Optimizer* BuildOptimizer(unsigned dim,vector<unsigned>tooptimize,vector<parameter_t> start,string type,unsigned int nrandom);
private:
enum OptType {POWELL=0,RANDOM,NOPTIMIZER}; //Add new optimizer here BEFORE NOPTIMZER
enum OptType {POWELL=0,RANDOM_DIRECTION=1,RANDOM,NOPTIMIZER}; //Add new optimizer here BEFORE NOPTIMZER
static OptType GetOType(string);
static vector<string> typenames;
static void SetTypeNames();

View File

@ -32,6 +32,7 @@ void usage(void)
{
cerr<<"usage: mert -d <dimensions> (mandatory )"<<endl;
cerr<<"[-n retry ntimes (default 1)]"<<endl;
cerr<<"[-m number of random directions in powell (default 0)]"<<endl;
cerr<<"[-o\tthe indexes to optimize(default all)]"<<endl;
cerr<<"[-t\tthe optimizer(default powell)]"<<endl;
cerr<<"[-r\tthe random seed (defaults to system clock)"<<endl;
@ -48,6 +49,7 @@ void usage(void)
static struct option long_options[] = {
{"pdim", 1, 0, 'd'},
{"ntry",1,0,'n'},
{"nrandom",1,0,'m'},
{"rseed",required_argument,0,'r'},
{"optimize",1,0,'o'},
{"type",1,0,'t'},
@ -76,6 +78,7 @@ int main (int argc, char **argv)
int c,pdim,i;
pdim=-1;
int ntry=1;
int nrandom=0;
int seed=0;
bool hasSeed = false;
string type("powell");
@ -87,13 +90,12 @@ int main (int argc, char **argv)
string tooptimizestr("");
vector<unsigned> tooptimize;
vector<parameter_t> start;
vector<vector<parameter_t> > start_list;
vector<parameter_t> min;
vector<parameter_t> max;
//note: those mins and max are the bound for the starting points of the algorithm, not strict bound on the result!
while ((c=getopt_long (argc, argv, "o:r:d:n:t:s:S:F:v:", long_options, &option_index)) != -1) {
while ((c=getopt_long (argc, argv, "o:r:d:n:m:t:s:S:F:v:", long_options, &option_index)) != -1) {
switch (c) {
case 'o':
tooptimizestr = string(optarg);
@ -104,6 +106,9 @@ int main (int argc, char **argv)
case 'n':
ntry=strtol(optarg, NULL, 10);
break;
case 'm':
nrandom=strtol(optarg, NULL, 10);
break;
case 'r':
seed=strtol(optarg, NULL, 10);
hasSeed = true;
@ -144,36 +149,45 @@ int main (int argc, char **argv)
srandom(time(NULL));
}
ifstream opt(initfile.c_str());
if(opt.fail()) {
cerr<<"could not open initfile: " << initfile << endl;
exit(3);
// read in starting points
std::string onefile;
while (!initfile.empty()) {
getNextPound(initfile, onefile, ",");
vector<parameter_t> start;
ifstream opt(onefile.c_str());
if(opt.fail()) {
cerr<<"could not open initfile: " << initfile << endl;
exit(3);
}
start.resize(pdim);//to do:read from file
int j;
for( j=0; j<pdim&&!opt.fail(); j++)
opt>>start[j];
if(j<pdim) {
cerr<<initfile<<":Too few starting weights." << endl;
exit(3);
}
start_list.push_back(start);
// for the first time, also read in the min/max values for scores
if (start_list.size() == 1) {
min.resize(pdim);
for( j=0; j<pdim&&!opt.fail(); j++)
opt>>min[j];
if(j<pdim) {
cerr<<initfile<<":Too few minimum weights." << endl;
cerr<<"error could not initialize start point with " << initfile << endl;
exit(3);
}
max.resize(pdim);
for( j=0; j<pdim&&!opt.fail(); j++)
opt>>max[j];
if(j<pdim) {
cerr<<initfile<<":Too few maximum weights." << endl;
exit(3);
}
}
opt.close();
}
start.resize(pdim);//to do:read from file
int j;
for( j=0; j<pdim&&!opt.fail(); j++)
opt>>start[j];
if(j<pdim) {
cerr<<initfile<<":Too few starting weights." << endl;
exit(3);
}
min.resize(pdim);
for( j=0; j<pdim&&!opt.fail(); j++)
opt>>min[j];
if(j<pdim) {
cerr<<initfile<<":Too few minimum weights." << endl;
cerr<<"error could not initialize start point with " << initfile << endl;
exit(3);
}
max.resize(pdim);
for( j=0; j<pdim&&!opt.fail(); j++)
opt>>max[j];
if(j<pdim) {
cerr<<initfile<<":Too few maximum weights." << endl;
exit(3);
}
opt.close();
vector<string> ScoreDataFiles;
if (scorerfile.length() > 0) {
@ -236,32 +250,43 @@ int main (int argc, char **argv)
}
}
Optimizer *O=OptimizerFactory::BuildOptimizer(pdim,tooptimize,start,type);
Optimizer *O=OptimizerFactory::BuildOptimizer(pdim,tooptimize,start_list[0],type,nrandom);
O->SetScorer(TheScorer);
O->SetFData(D.getFeatureData());
Point P(start, min, max);//Generate from the full feature set. Warning: must be done after Optimizer initialization
statscore_t best=O->Run(P);
Point bestP=P;
statscore_t mean=best;
statscore_t var=best*best;
// run with specified starting points
stringstream oss;
oss << "Try number 1";
PrintUserTime(oss.str());
for(int i=1; i<ntry; i++) {
P.Randomize(); // randomize within min and max as given to the constructor
statscore_t best=0, mean=0, var=0;
Point bestP;
for(int i=0;i<start_list.size();i++) {
Point P(start_list[i], min, max);//Generate from the full feature set. Warning: must be done after Optimizer initialization
statscore_t score=O->Run(P);
if(score>best) {
oss.str("");
oss << "Specified starting point number " << (1+i) << ", score: " << score;
if (i==0 || score>best) {
best=score;
bestP=P;
oss << " (new best)";
}
mean+=score;
var+=(score*score);
PrintUserTime(oss.str());
}
// run with random starting points
for(int i=0; i<ntry; i++) {
Point P(start_list[0], min, max);
P.Randomize(); // randomize within min and max as given to the constructor
statscore_t score=O->Run(P);
oss.str("");
oss << "Try number " << (i+1);
oss << "Randomized starting point number " << (1+i) << ", score: " << score;
if(score>best) {
best=score;
bestP=P;
oss << " (new best)";
}
mean+=score;
var+=(score*score);
PrintUserTime(oss.str());
}
mean/=(float)ntry;
@ -270,11 +295,10 @@ int main (int argc, char **argv)
if (verboselevel()>1)
cerr<<"best score: "<< best << " variance of the score (for "<<ntry<<" try): "<<var<<endl;
//L1-Normalization of the best Point
if (tooptimize.size() == pdim)
// L1-Normalization of the best Point
if ((int)tooptimize.size() == pdim)
bestP.NormalizeL1();
cerr << "Best point: " << bestP << " => " << best << endl;
ofstream res("weights.txt");
res<<bestP<<endl;

View File

@ -115,7 +115,9 @@ my $continue = 0; # should we try to continue from the last saved step?
my $skip_decoder = 0; # and should we skip the first decoder run (assuming we got interrupted during mert)
my $___FILTER_PHRASE_TABLE = 1; # filter phrase table
my $___PREDICTABLE_SEEDS = 0;
my $___START_WITH_HISTORIC_BESTS = 0; # use best settings from all previous iterations as starting points [Foster&Kuhn,2009]
my $___RANDOM_DIRECTIONS = 0; # search in random directions only
my $___NUM_RANDOM_DIRECTIONS = 0; # number of random directions, also works with default optimizer [Cer&al.,2008]
# Parameter for effective reference length when computing BLEU score
# Default is to use shortest reference
@ -193,6 +195,9 @@ GetOptions(
"old-sge" => \$old_sge, #passed to moses-parallel
"filter-phrase-table!" => \$___FILTER_PHRASE_TABLE, # allow (disallow)filtering of phrase tables
"predictable-seeds" => \$___PREDICTABLE_SEEDS, # allow (disallow) switch on/off reseeding of random restarts
"historic-bests" => \$___START_WITH_HISTORIC_BESTS, # use best settings from all previous iterations as starting points
"random-directions" => \$___RANDOM_DIRECTIONS, # search only in random directions
"number-of-random-directions=i" => \$___NUM_RANDOM_DIRECTIONS, # number of random directions
"efficient_scorenbest_flag" => \$efficient_scorenbest_flag, # activate a time-efficient scoring of nbest lists
"activate-features=s" => \$___ACTIVATE_FEATURES, #comma-separated (or blank-separated) list of features to work on (others are fixed to the starting values)
"prev-aggregate-nbestlist=i" => \$prev_aggregate_nbl_size, #number of previous step to consider when loading data (default =-1, i.e. all previous)
@ -273,6 +278,9 @@ Options:
the starting weights (and also as the fixed
weights if --activate-features is used).
default: yes (used to be 'no')
--random-directions ... search only in random directions
--number-of-random-directions=int ... number of random directions
(also works with regular optimizer, default: 0)
";
exit 1;
}
@ -481,6 +489,7 @@ my $devbleu = undef;
my $prev_feature_file = undef;
my $prev_score_file = undef;
my $prev_init_file = undef;
if ($continue) {
# getting the last finished step
@ -529,6 +538,16 @@ if ($continue) {
$prev_score_file = "run$prevstep.scores.dat";
}
}
if (! -e "run$prevstep.${weights_in_file}"){
die "Can't start from step $step, because run$prevstep.${weights_in_file} was not found!";
}else{
if (defined $prev_init_file){
$prev_init_file = "${prev_init_file},run$prevstep.${weights_in_file}";
}
else{
$prev_init_file = "run$prevstep.${weights_in_file}";
}
}
}
if (! -e "run$step.weights.txt"){
die "Can't start from step $step, because run$step.weights.txt was not found!";
@ -706,6 +725,15 @@ while(1) {
my $seed = $run * 1000;
$cmd = $cmd." -r $seed";
}
if ($___RANDOM_DIRECTIONS) {
if ($___NUM_RANDOM_DIRECTIONS == 0) {
$cmd .= " -m 50";
}
$cmd = $cmd." -t random-direction";
}
if ($___NUM_RANDOM_DIRECTIONS) {
$cmd .= " -m $___NUM_RANDOM_DIRECTIONS";
}
if (defined $prev_feature_file) {
$cmd = $cmd." --ffile $prev_feature_file,$feature_file";
@ -719,8 +747,12 @@ while(1) {
else{
$cmd = $cmd." --scfile $score_file";
}
$cmd = $cmd." --ifile run$run.$weights_in_file";
if ($___START_WITH_HISTORIC_BESTS && defined $prev_init_file) {
$cmd = $cmd." --ifile $prev_init_file,run$run.$weights_in_file";
}
else{
$cmd = $cmd." --ifile run$run.$weights_in_file";
}
if (defined $___JOBS && $___JOBS > 0) {
safesystem("$qsubwrapper $pass_old_sge -command='$cmd' -stdout=$mert_outfile -stderr=$mert_logfile -queue-parameter=\"$queue_flags\"") or die "Failed to start mert (via qsubwrapper $qsubwrapper)";
@ -793,6 +825,7 @@ while(1) {
print "loading data from $firstrun to $run (prev_aggregate_nbl_size=$prev_aggregate_nbl_size)\n";
$prev_feature_file = undef;
$prev_score_file = undef;
$prev_init_file = undef;
for (my $i=$firstrun;$i<=$run;$i++){
if (defined $prev_feature_file){
$prev_feature_file = "${prev_feature_file},run${i}.${base_feature_file}";
@ -806,9 +839,16 @@ while(1) {
else{
$prev_score_file = "run${i}.${base_score_file}";
}
if (defined $prev_init_file){
$prev_init_file = "${prev_init_file},run${i}.${weights_in_file}";
}
else{
$prev_init_file = "run${i}.${weights_in_file}";
}
}
print "loading data from $prev_feature_file\n" if defined($prev_feature_file);
print "loading data from $prev_score_file\n" if defined($prev_score_file);
print "loading data from $prev_init_file\n" if defined($prev_init_file);
}
print "Training finished at ".`date`;