mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-11-10 10:59:21 +03:00
fix handling of large memory jobs
git-svn-id: http://svn.statmt.org/repository/mira@3811 cc96ff50-19ce-11e0-b349-13d7f0bd23df
This commit is contained in:
parent
eba945b61c
commit
9b3c9a6fd6
@ -62,11 +62,11 @@ my $decoder_settings = ¶m("general.decoder-settings", "");
|
||||
my $working_dir = ¶m("general.working-dir");
|
||||
system("mkdir -p $working_dir") == 0 or die "Error: unable to create directory \"$working_dir\"";
|
||||
my $train_script = "$name-train";
|
||||
my $jobs = ¶m("train.jobs",8);
|
||||
my $job_name = "$name-t";
|
||||
my $hours = ¶m("train.hours",48);
|
||||
|
||||
#required training parameters
|
||||
|
||||
my $moses_ini_file = ¶m_required("train.moses-ini-file");
|
||||
&check_exists ("moses ini file", $moses_ini_file);
|
||||
my $input_file = ¶m_required("train.input-file");
|
||||
@ -84,8 +84,9 @@ my $learner = ¶m("train.learner", "mira");
|
||||
my $batch = ¶m("train.batch", 1);
|
||||
my $extra_args = ¶m("train.extra-args");
|
||||
my $continue_from_epoch = ¶m("train.continue-from-epoch", 0);
|
||||
my $extra_memory_train = ¶m("train.extra-memory",0);
|
||||
my $by_node = ¶m("train.by-node",0);
|
||||
my $slots = ¶m("train.slots",8);
|
||||
my $jobs = ¶m("train.jobs",8);
|
||||
|
||||
#test configuration
|
||||
my ($test_input_file, $test_reference_file,$test_ini_file,$bleu_script,$use_moses);
|
||||
@ -130,9 +131,9 @@ my $train_err = $train_script . ".err";
|
||||
#write the script
|
||||
open TRAIN, ">$train_script_file" or die "Unable to open \"$train_script_file\" for writing";
|
||||
|
||||
&header(*TRAIN,$job_name,$working_dir,$jobs,$hours,$vmem,$train_out,$train_err);
|
||||
&header(*TRAIN,$job_name,$working_dir,$slots,$jobs,$hours,$vmem,$train_out,$train_err);
|
||||
if ($by_node) {
|
||||
print TRAIN "mpirun -np \$NSLOTS --bynode $trainer_exe \\\n";
|
||||
print TRAIN "mpirun -np $jobs --bynode $trainer_exe \\\n";
|
||||
}
|
||||
else {
|
||||
print TRAIN "mpirun -np \$NSLOTS $trainer_exe \\\n";
|
||||
@ -160,7 +161,6 @@ print TRAIN "--epochs $epochs \\\n";
|
||||
print TRAIN "-b $batch \\\n";
|
||||
print TRAIN "--decoder-settings \"$decoder_settings\" \\\n";
|
||||
print TRAIN $extra_args;
|
||||
|
||||
print TRAIN "\n";
|
||||
print TRAIN "echo \"mpirun finished.\"\n";
|
||||
close TRAIN;
|
||||
@ -173,14 +173,8 @@ if (! $execute) {
|
||||
#submit the training job
|
||||
my $train_job_id;
|
||||
if ($have_sge) {
|
||||
if ($extra_memory_train) {
|
||||
print "SUBMIT TRAINING JOB WITH EXTRA MEMORY: $extra_memory_train \n";
|
||||
$train_job_id = &submit_job_sge_extra_memory($train_script_file,$extra_memory_train);
|
||||
}
|
||||
else {
|
||||
print "SUBMIT TRAINING JOB WITH NO EXTRA MEMORY \n";
|
||||
$train_job_id = &submit_job_sge($train_script_file);
|
||||
}
|
||||
$train_job_id = &submit_job_sge($train_script_file);
|
||||
|
||||
} else {
|
||||
$train_job_id = &submit_job_no_sge($train_script_file, $train_out,$train_err);
|
||||
}
|
||||
@ -201,7 +195,12 @@ while(1) {
|
||||
$train_iteration += 1;
|
||||
my $new_weight_file = "$working_dir/$weight_file_stem" . "_";
|
||||
if ($weight_frequency == 1) {
|
||||
$new_weight_file .= $train_iteration;
|
||||
if ($train_iteration < 10) {
|
||||
$new_weight_file .= "0".$train_iteration;
|
||||
}
|
||||
else {
|
||||
$new_weight_file .= $train_iteration;
|
||||
}
|
||||
} else {
|
||||
#my $epoch = 1 + int $train_iteration / $weight_frequency;
|
||||
$epoch = int $train_iteration / $weight_frequency;
|
||||
@ -498,7 +497,7 @@ while(1) {
|
||||
#launch testing
|
||||
if ($have_sge) {
|
||||
if ($extra_memory_test) {
|
||||
print "SUBMIT TEST JOB WITH EXTRA MEMORY: $extra_memory_train \n";
|
||||
print "SUBMIT TEST JOB WITH EXTRA MEMORY: $extra_memory_test \n";
|
||||
&submit_job_sge_extra_memory($test_script_file,$extra_memory_test);
|
||||
}
|
||||
else {
|
||||
@ -529,12 +528,12 @@ sub param_required {
|
||||
}
|
||||
|
||||
sub header {
|
||||
my ($OUT,$name,$working_dir,$jobs,$hours,$vmem,$out,$err) = @_;
|
||||
my ($OUT,$name,$working_dir,$slots,$jobs,$hours,$vmem,$out,$err) = @_;
|
||||
print $OUT "#!/bin/sh\n";
|
||||
if ($have_sge) {
|
||||
print $OUT "#\$ -N $name\n";
|
||||
print $OUT "#\$ -wd $working_dir\n";
|
||||
print $OUT "#\$ -pe $mpienv $jobs\n";
|
||||
print $OUT "#\$ -pe $mpienv $slots\n";
|
||||
print $OUT "#\$ -l h_rt=$hours:00:00\n";
|
||||
print $OUT "#\$ -l h_vmem=$vmem" . "G" . "\n";
|
||||
print $OUT "#\$ -o $out\n";
|
||||
|
Loading…
Reference in New Issue
Block a user