fix handling of large memory jobs

git-svn-id: http://svn.statmt.org/repository/mira@3811 cc96ff50-19ce-11e0-b349-13d7f0bd23df
This commit is contained in:
ehasler 2011-02-28 19:49:15 +00:00 committed by Ondrej Bojar
parent eba945b61c
commit 9b3c9a6fd6

View File

@ -62,11 +62,11 @@ my $decoder_settings = &param("general.decoder-settings", "");
my $working_dir = &param("general.working-dir");
system("mkdir -p $working_dir") == 0 or die "Error: unable to create directory \"$working_dir\"";
my $train_script = "$name-train";
my $jobs = &param("train.jobs",8);
my $job_name = "$name-t";
my $hours = &param("train.hours",48);
#required training parameters
my $moses_ini_file = &param_required("train.moses-ini-file");
&check_exists ("moses ini file", $moses_ini_file);
my $input_file = &param_required("train.input-file");
@ -84,8 +84,9 @@ my $learner = &param("train.learner", "mira");
my $batch = &param("train.batch", 1);
my $extra_args = &param("train.extra-args");
my $continue_from_epoch = &param("train.continue-from-epoch", 0);
my $extra_memory_train = &param("train.extra-memory",0);
my $by_node = &param("train.by-node",0);
my $slots = &param("train.slots",8);
my $jobs = &param("train.jobs",8);
#test configuration
my ($test_input_file, $test_reference_file,$test_ini_file,$bleu_script,$use_moses);
@ -130,9 +131,9 @@ my $train_err = $train_script . ".err";
#write the script
open TRAIN, ">$train_script_file" or die "Unable to open \"$train_script_file\" for writing";
&header(*TRAIN,$job_name,$working_dir,$jobs,$hours,$vmem,$train_out,$train_err);
&header(*TRAIN,$job_name,$working_dir,$slots,$jobs,$hours,$vmem,$train_out,$train_err);
if ($by_node) {
print TRAIN "mpirun -np \$NSLOTS --bynode $trainer_exe \\\n";
print TRAIN "mpirun -np $jobs --bynode $trainer_exe \\\n";
}
else {
print TRAIN "mpirun -np \$NSLOTS $trainer_exe \\\n";
@ -160,7 +161,6 @@ print TRAIN "--epochs $epochs \\\n";
print TRAIN "-b $batch \\\n";
print TRAIN "--decoder-settings \"$decoder_settings\" \\\n";
print TRAIN $extra_args;
print TRAIN "\n";
print TRAIN "echo \"mpirun finished.\"\n";
close TRAIN;
@ -173,14 +173,8 @@ if (! $execute) {
#submit the training job
my $train_job_id;
if ($have_sge) {
if ($extra_memory_train) {
print "SUBMIT TRAINING JOB WITH EXTRA MEMORY: $extra_memory_train \n";
$train_job_id = &submit_job_sge_extra_memory($train_script_file,$extra_memory_train);
}
else {
print "SUBMIT TRAINING JOB WITH NO EXTRA MEMORY \n";
$train_job_id = &submit_job_sge($train_script_file);
}
$train_job_id = &submit_job_sge($train_script_file);
} else {
$train_job_id = &submit_job_no_sge($train_script_file, $train_out,$train_err);
}
@ -201,7 +195,12 @@ while(1) {
$train_iteration += 1;
my $new_weight_file = "$working_dir/$weight_file_stem" . "_";
if ($weight_frequency == 1) {
$new_weight_file .= $train_iteration;
if ($train_iteration < 10) {
$new_weight_file .= "0".$train_iteration;
}
else {
$new_weight_file .= $train_iteration;
}
} else {
#my $epoch = 1 + int $train_iteration / $weight_frequency;
$epoch = int $train_iteration / $weight_frequency;
@ -498,7 +497,7 @@ while(1) {
#launch testing
if ($have_sge) {
if ($extra_memory_test) {
print "SUBMIT TEST JOB WITH EXTRA MEMORY: $extra_memory_train \n";
print "SUBMIT TEST JOB WITH EXTRA MEMORY: $extra_memory_test \n";
&submit_job_sge_extra_memory($test_script_file,$extra_memory_test);
}
else {
@ -529,12 +528,12 @@ sub param_required {
}
sub header {
my ($OUT,$name,$working_dir,$jobs,$hours,$vmem,$out,$err) = @_;
my ($OUT,$name,$working_dir,$slots,$jobs,$hours,$vmem,$out,$err) = @_;
print $OUT "#!/bin/sh\n";
if ($have_sge) {
print $OUT "#\$ -N $name\n";
print $OUT "#\$ -wd $working_dir\n";
print $OUT "#\$ -pe $mpienv $jobs\n";
print $OUT "#\$ -pe $mpienv $slots\n";
print $OUT "#\$ -l h_rt=$hours:00:00\n";
print $OUT "#\$ -l h_vmem=$vmem" . "G" . "\n";
print $OUT "#\$ -o $out\n";