mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-10-26 19:37:58 +03:00
support for use of baseline alignment model
This commit is contained in:
parent
438dcb1a34
commit
24e1df7520
@ -340,6 +340,18 @@ alignment-symmetrization-method = grow-diag-final-and
|
||||
#berkeley-process-options = "-EMWordAligner.numThreads 8"
|
||||
#berkeley-posterior = 0.5
|
||||
|
||||
### use of baseline alignment model (incremental training)
|
||||
#
|
||||
#baseline = 68
|
||||
#baseline-alignment-model = "$working-dir/training/prepared.$baseline/$input-extension.vcb \
|
||||
# $working-dir/training/prepared.$baseline/$output-extension.vcb \
|
||||
# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.cooc \
|
||||
# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.cooc \
|
||||
# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.thmm.5 \
|
||||
# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.hhmm.5 \
|
||||
# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.thmm.5 \
|
||||
# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.hhmm.5"
|
||||
|
||||
### if word alignment should be skipped,
|
||||
# point to word alignment files
|
||||
#
|
||||
|
@ -360,6 +360,18 @@ alignment-symmetrization-method = grow-diag-final-and
|
||||
#berkeley-process-options = "-EMWordAligner.numThreads 8"
|
||||
#berkeley-posterior = 0.5
|
||||
|
||||
### use of baseline alignment model (incremental training)
|
||||
#
|
||||
#baseline = 68
|
||||
#baseline-alignment-model = "$working-dir/training/prepared.$baseline/$input-extension.vcb \
|
||||
# $working-dir/training/prepared.$baseline/$output-extension.vcb \
|
||||
# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.cooc \
|
||||
# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.cooc \
|
||||
# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.thmm.5 \
|
||||
# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.hhmm.5 \
|
||||
# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.thmm.5 \
|
||||
# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.hhmm.5"
|
||||
|
||||
### if word alignment should be skipped,
|
||||
# point to word alignment files
|
||||
#
|
||||
|
@ -340,6 +340,18 @@ alignment-symmetrization-method = grow-diag-final-and
|
||||
#berkeley-process-options = "-EMWordAligner.numThreads 8"
|
||||
#berkeley-posterior = 0.5
|
||||
|
||||
### use of baseline alignment model (incremental training)
|
||||
#
|
||||
#baseline = 68
|
||||
#baseline-alignment-model = "$working-dir/training/prepared.$baseline/$input-extension.vcb \
|
||||
# $working-dir/training/prepared.$baseline/$output-extension.vcb \
|
||||
# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.cooc \
|
||||
# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.cooc \
|
||||
# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.thmm.5 \
|
||||
# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.hhmm.5 \
|
||||
# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.thmm.5 \
|
||||
# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.hhmm.5"
|
||||
|
||||
### if word alignment should be skipped,
|
||||
# point to word alignment files
|
||||
#
|
||||
|
@ -344,6 +344,18 @@ alignment-symmetrization-method = grow-diag-final-and
|
||||
#berkeley-process-options = "-EMWordAligner.numThreads 8"
|
||||
#berkeley-posterior = 0.5
|
||||
|
||||
### use of baseline alignment model (incremental training)
|
||||
#
|
||||
#baseline = 68
|
||||
#baseline-alignment-model = "$working-dir/training/prepared.$baseline/$input-extension.vcb \
|
||||
# $working-dir/training/prepared.$baseline/$output-extension.vcb \
|
||||
# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.cooc \
|
||||
# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.cooc \
|
||||
# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.thmm.5 \
|
||||
# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.hhmm.5 \
|
||||
# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.thmm.5 \
|
||||
# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.hhmm.5"
|
||||
|
||||
### if word alignment should be skipped,
|
||||
# point to word alignment files
|
||||
#
|
||||
|
@ -324,6 +324,18 @@ alignment-symmetrization-method = grow-diag-final-and
|
||||
#berkeley-process-options = "-EMWordAligner.numThreads 8"
|
||||
#berkeley-posterior = 0.5
|
||||
|
||||
### use of baseline alignment model (incremental training)
|
||||
#
|
||||
#baseline = 68
|
||||
#baseline-alignment-model = "$working-dir/training/prepared.$baseline/$input-extension.vcb \
|
||||
# $working-dir/training/prepared.$baseline/$output-extension.vcb \
|
||||
# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.cooc \
|
||||
# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.cooc \
|
||||
# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.thmm.5 \
|
||||
# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.hhmm.5 \
|
||||
# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.thmm.5 \
|
||||
# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.hhmm.5"
|
||||
|
||||
### if word alignment should be skipped,
|
||||
# point to word alignment files
|
||||
#
|
||||
|
@ -401,23 +401,25 @@ mml-filter-before-wa
|
||||
prepare-data
|
||||
in: corpus-mml-prefilter=OR=corpus
|
||||
out: prepared-data
|
||||
rerun-on-change: alignment-factors training-options script
|
||||
rerun-on-change: alignment-factors training-options script baseline-alignment-model
|
||||
ignore-if: use-berkeley
|
||||
default-name: prepared
|
||||
run-giza
|
||||
in: prepared-data
|
||||
out: giza-alignment
|
||||
ignore-if: use-berkeley
|
||||
rerun-on-change: giza-settings training-options script
|
||||
rerun-on-change: giza-settings training-options script baseline-alignment-model
|
||||
default-name: giza
|
||||
error: not found
|
||||
not-error: 0 not found
|
||||
run-giza-inverse
|
||||
in: prepared-data
|
||||
out: giza-alignment-inverse
|
||||
rerun-on-change: giza-settings training-options script
|
||||
rerun-on-change: giza-settings training-options script baseline-alignment-model
|
||||
ignore-if: use-berkeley
|
||||
default-name: giza-inverse
|
||||
error: not found
|
||||
not-error: 0 not found
|
||||
run-berkeley
|
||||
in: corpus-mml-prefilter
|
||||
out: berkeley-alignment
|
||||
|
@ -297,6 +297,10 @@ sub read_config {
|
||||
$line_count++;
|
||||
s/\#.*$//; # strip comments
|
||||
next if /^\#/ || /^\s*$/;
|
||||
while (/\\\s*$/) { # merge with next line
|
||||
s/\s*\\\s*$/ /;
|
||||
$_ .= <INI>;
|
||||
}
|
||||
if (/^\[(.+)\]/) {
|
||||
$module = $1;
|
||||
$ignore = /ignore/i;
|
||||
@ -329,7 +333,7 @@ sub read_config {
|
||||
# resolve parameters used in values
|
||||
my $resolve = 1;
|
||||
my $loop_count = 0;
|
||||
while($resolve && $loop_count++ < 10) {
|
||||
while($resolve && $loop_count++ < 100) {
|
||||
$resolve = 0;
|
||||
foreach my $parameter (keys %CONFIG) {
|
||||
foreach (@{$CONFIG{$parameter}}) {
|
||||
@ -2354,6 +2358,7 @@ sub get_training_setting {
|
||||
my $score_settings = &get("TRAINING:score-settings");
|
||||
my $parallel = &get("TRAINING:parallel");
|
||||
my $pcfg = &get("TRAINING:use-pcfg-feature");
|
||||
my $baseline_alignment = &get("TRAINING:baseline-alignment-model");
|
||||
|
||||
my $xml = $source_syntax || $target_syntax;
|
||||
|
||||
@ -2377,6 +2382,7 @@ sub get_training_setting {
|
||||
$cmd .= "-score-options '".$score_settings."' " if $score_settings;
|
||||
$cmd .= "-parallel " if $parallel;
|
||||
$cmd .= "-pcfg " if $pcfg;
|
||||
$cmd .= "-baseline-alignment-model $baseline_alignment " if defined($baseline_alignment) && ($step == 1 || $step == 2);
|
||||
|
||||
# factored training
|
||||
if (&backoff_and_get("TRAINING:input-factors")) {
|
||||
|
@ -38,7 +38,7 @@ my($_EXTERNAL_BINDIR, $_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_
|
||||
$_MEMSCORE, $_FINAL_ALIGNMENT_MODEL,
|
||||
$_CONTINUE,$_MAX_LEXICAL_REORDERING,$_DO_STEPS,
|
||||
@_ADDITIONAL_INI,$_ADDITIONAL_INI_FILE,
|
||||
$_SPARSE_TRANSLATION_TABLE,
|
||||
$_SPARSE_TRANSLATION_TABLE, @_BASELINE_ALIGNMENT_MODEL,
|
||||
$_DICTIONARY, $_SPARSE_PHRASE_FEATURES, $_EPPEX, $IGNORE);
|
||||
my $_CORES = 1;
|
||||
|
||||
@ -128,6 +128,7 @@ $_HELP = 1
|
||||
'additional-ini=s' => \@_ADDITIONAL_INI,
|
||||
'additional-ini-file=s' => \$_ADDITIONAL_INI_FILE,
|
||||
'sparse-translation-table' => \$_SPARSE_TRANSLATION_TABLE,
|
||||
'baseline-alignment-model=s{8}' => \@_BASELINE_ALIGNMENT_MODEL,
|
||||
'cores=i' => \$_CORES
|
||||
);
|
||||
|
||||
@ -373,6 +374,11 @@ my $___ALIGNMENT = "grow-diag-final";
|
||||
$___ALIGNMENT = $_ALIGNMENT if $_ALIGNMENT;
|
||||
my $___NOTE_ALIGNMENT_DROPS = 1;
|
||||
|
||||
# baseline alignment model for incremental updating
|
||||
die "ERROR: buggy definition of baseline alignment model, should have 8 values:\n\t".join("\n\t",@_BASELINE_ALIGNMENT_MODEL)."\n"
|
||||
unless scalar(@_BASELINE_ALIGNMENT_MODEL) == 8 || scalar(@_BASELINE_ALIGNMENT_MODEL) == 0;
|
||||
die "ERROR: use of baseline alignment model limited to HMM training (-hmm-align)\n"
|
||||
if defined($___FINAL_ALIGNMENT_MODEL) && $___FINAL_ALIGNMENT_MODEL ne 'hmm' && scalar(@_BASELINE_ALIGNMENT_MODEL) == 8;
|
||||
|
||||
# model dir and alignment/extract file
|
||||
my $___MODEL_DIR = $___ROOT_DIR."/model";
|
||||
@ -620,8 +626,8 @@ sub prepare {
|
||||
&make_classes($corpus.".".$___F,$___VCB_F.".classes");
|
||||
&make_classes($corpus.".".$___E,$___VCB_E.".classes");
|
||||
|
||||
$VCB_F = &get_vocabulary($corpus.".".$___F,$___VCB_F);
|
||||
$VCB_E = &get_vocabulary($corpus.".".$___E,$___VCB_E);
|
||||
$VCB_F = &get_vocabulary($corpus.".".$___F,$___VCB_F,0);
|
||||
$VCB_E = &get_vocabulary($corpus.".".$___E,$___VCB_E,1);
|
||||
|
||||
&numberize_txt_file($VCB_F,$corpus.".".$___F,
|
||||
$VCB_E,$corpus.".".$___E,
|
||||
@ -659,8 +665,8 @@ sub prepare {
|
||||
exit 0;
|
||||
}
|
||||
|
||||
$VCB_F = &get_vocabulary($corpus.".".$___F,$___VCB_F);
|
||||
$VCB_E = &get_vocabulary($corpus.".".$___E,$___VCB_E);
|
||||
$VCB_F = &get_vocabulary($corpus.".".$___F,$___VCB_F,0);
|
||||
$VCB_E = &get_vocabulary($corpus.".".$___E,$___VCB_E,1);
|
||||
|
||||
&numberize_txt_file($VCB_F,$corpus.".".$___F,
|
||||
$VCB_E,$corpus.".".$___E,
|
||||
@ -787,7 +793,7 @@ sub make_classes {
|
||||
|
||||
sub get_vocabulary {
|
||||
# return unless $___LEXICAL_WEIGHTING;
|
||||
my($corpus,$vcb) = @_;
|
||||
my($corpus,$vcb,$is_target) = @_;
|
||||
print STDERR "(1.2) creating vcb file $vcb @ ".`date`;
|
||||
|
||||
my %WORD;
|
||||
@ -797,17 +803,37 @@ sub get_vocabulary {
|
||||
foreach (split) { $WORD{$_}++; }
|
||||
}
|
||||
close(TXT);
|
||||
|
||||
|
||||
my ($id,%VCB);
|
||||
open(VCB,">", "$vcb") or die "ERROR: Can't write $vcb";
|
||||
|
||||
# words from baseline alignment model when incrementally updating
|
||||
if (scalar @_BASELINE_ALIGNMENT_MODEL) {
|
||||
open(BASELINE_VCB,$_BASELINE_ALIGNMENT_MODEL[$is_target]);
|
||||
while(<BASELINE_VCB>) {
|
||||
chop;
|
||||
my ($i,$word,$count) = split;
|
||||
if (defined($WORD{$word})) {
|
||||
$count += $WORD{$word};
|
||||
delete($WORD{$word});
|
||||
}
|
||||
printf VCB "%d\t%s\t%d\n",$i,$word,$count;
|
||||
$VCB{$word} = $i;
|
||||
$id = $i+1;
|
||||
}
|
||||
close(BASELINE_VCB);
|
||||
}
|
||||
# not incrementally updating
|
||||
else {
|
||||
print VCB "1\tUNK\t0\n";
|
||||
$id=2;
|
||||
}
|
||||
|
||||
my @NUM;
|
||||
foreach my $word (keys %WORD) {
|
||||
my $vcb_with_number = sprintf("%07d %s",$WORD{$word},$word);
|
||||
push @NUM,$vcb_with_number;
|
||||
}
|
||||
|
||||
my %VCB;
|
||||
open(VCB,">", "$vcb") or die "ERROR: Can't write $vcb";
|
||||
print VCB "1\tUNK\t0\n";
|
||||
my $id=2;
|
||||
foreach (reverse sort @NUM) {
|
||||
my($count,$word) = split;
|
||||
printf VCB "%d\t%s\t%d\n",$id,$word,$count;
|
||||
@ -986,15 +1012,30 @@ sub run_single_giza_on_parts {
|
||||
close(SNT);
|
||||
|
||||
# run snt2cooc in parts
|
||||
my @COOC_PART_FILE_NAME;
|
||||
for(my $i=1;$i<=$___PARTS;$i++) {
|
||||
&run_single_snt2cooc("$dir/part$i",$e,$f,$vcb_e,$vcb_f,"$___CORPUS_DIR/part$i/$f-$e-int-train.snt");
|
||||
push @COOC_PART_FILE_NAME, "$dir/part$i/$f-$e.cooc";
|
||||
}
|
||||
# also include the baseline cooc file if a baseline alignment model is given (incremental training)
|
||||
if (scalar @_BASELINE_ALIGNMENT_MODEL) {
|
||||
push @COOC_PART_FILE_NAME, $_BASELINE_ALIGNMENT_MODEL[2 + ($dir eq $___GIZA_F2E?1:0)];
|
||||
}
|
||||
&merge_cooc_files($dir,$e,$f,@COOC_PART_FILE_NAME);
|
||||
|
||||
# run giza
|
||||
&run_single_giza($dir,$e,$f,$vcb_e,$vcb_f,$train);
|
||||
}
|
||||
|
||||
sub merge_cooc_files {
|
||||
my ($dir,$e,$f,@COOC_PART_FILE_NAME) = @_;
|
||||
|
||||
# merge parts
|
||||
open(COOC,">$dir/$f-$e.cooc") or die "ERROR: Can't write $dir/$f-$e.cooc";
|
||||
my(@PF,@CURRENT);
|
||||
for(my $i=1;$i<=$___PARTS;$i++) {
|
||||
open($PF[$i],"$dir/part$i/$f-$e.cooc")or die "ERROR: Can't read $dir/part$i/$f-$e.cooc";
|
||||
for(my $i=0;$i<scalar(@COOC_PART_FILE_NAME);$i++) {
|
||||
print STDERR "merging cooc file $COOC_PART_FILE_NAME[$i]...\n";
|
||||
open($PF[$i],$COOC_PART_FILE_NAME[$i]) or die "ERROR: Can't read $COOC_PART_FILE_NAME[$i]";
|
||||
my $pf = $PF[$i];
|
||||
$CURRENT[$i] = <$pf>;
|
||||
chop($CURRENT[$i]) if $CURRENT[$i];
|
||||
@ -1002,7 +1043,7 @@ sub run_single_giza_on_parts {
|
||||
|
||||
while(1) {
|
||||
my ($min1,$min2) = (1e20,1e20);
|
||||
for(my $i=1;$i<=$___PARTS;$i++) {
|
||||
for(my $i=0;$i<scalar(@COOC_PART_FILE_NAME);$i++) {
|
||||
next unless $CURRENT[$i];
|
||||
my ($w1,$w2) = split(/ /,$CURRENT[$i]);
|
||||
if ($w1 < $min1 || ($w1 == $min1 && $w2 < $min2)) {
|
||||
@ -1012,7 +1053,7 @@ sub run_single_giza_on_parts {
|
||||
}
|
||||
last if $min1 == 1e20;
|
||||
print COOC "$min1 $min2\n";
|
||||
for(my $i=1;$i<=$___PARTS;$i++) {
|
||||
for(my $i=0;$i<scalar(@COOC_PART_FILE_NAME);$i++) {
|
||||
next unless $CURRENT[$i];
|
||||
my ($w1,$w2) = split(/ /,$CURRENT[$i]);
|
||||
if ($w1 == $min1 && $w2 == $min2) {
|
||||
@ -1022,13 +1063,10 @@ sub run_single_giza_on_parts {
|
||||
}
|
||||
}
|
||||
}
|
||||
for(my $i=1;$i<=$___PARTS;$i++) {
|
||||
for(my $i=0;$i<scalar(@COOC_PART_FILE_NAME);$i++) {
|
||||
close($PF[$i]);
|
||||
}
|
||||
close(COOC);
|
||||
|
||||
# run giza
|
||||
&run_single_giza($dir,$e,$f,$vcb_e,$vcb_f,$train);
|
||||
}
|
||||
|
||||
sub run_single_giza {
|
||||
@ -1083,6 +1121,12 @@ sub run_single_giza {
|
||||
$GizaDefaultOptions{m5} = ($___FINAL_ALIGNMENT_MODEL eq '5')? 3: 0;
|
||||
}
|
||||
|
||||
if (scalar(@_BASELINE_ALIGNMENT_MODEL)) {
|
||||
$GizaDefaultOptions{oldTrPrbs} = $_BASELINE_ALIGNMENT_MODEL[4 + ($dir eq $___GIZA_F2E?2:0)];
|
||||
$GizaDefaultOptions{oldAlPrbs} = $_BASELINE_ALIGNMENT_MODEL[5 + ($dir eq $___GIZA_F2E?2:0)];
|
||||
$GizaDefaultOptions{step_k} = 1;
|
||||
}
|
||||
|
||||
if ($___GIZA_OPTION) {
|
||||
foreach (split(/[ ,]+/,$___GIZA_OPTION)) {
|
||||
my ($option,$value) = split(/=/,$_,2);
|
||||
@ -1123,16 +1167,19 @@ sub run_single_giza {
|
||||
}
|
||||
|
||||
sub run_single_snt2cooc {
|
||||
my($dir,$e,$f,$vcb_e,$vcb_f,$train) = @_;
|
||||
print STDERR "(2.1a) running snt2cooc $f-$e @ ".`date`."\n";
|
||||
safesystem("mkdir -p $dir") or die("ERROR");
|
||||
if ($SNT2COOC eq "$_EXTERNAL_BINDIR/snt2cooc.out") {
|
||||
print "$SNT2COOC $vcb_e $vcb_f $train > $dir/$f-$e.cooc\n";
|
||||
safesystem("$SNT2COOC $vcb_e $vcb_f $train > $dir/$f-$e.cooc") or die("ERROR");
|
||||
} else {
|
||||
print "$SNT2COOC $dir/$f-$e.cooc $vcb_e $vcb_f $train\n";
|
||||
safesystem("$SNT2COOC $dir/$f-$e.cooc $vcb_e $vcb_f $train") or die("ERROR");
|
||||
}
|
||||
my($dir,$e,$f,$vcb_e,$vcb_f,$train) = @_;
|
||||
print STDERR "(2.1a) running snt2cooc $f-$e @ ".`date`."\n";
|
||||
my $suffix = (scalar @_BASELINE_ALIGNMENT_MODEL) ? ".new" : "";
|
||||
safesystem("mkdir -p $dir") or die("ERROR");
|
||||
if ($SNT2COOC eq "$_EXTERNAL_BINDIR/snt2cooc.out") {
|
||||
print "$SNT2COOC $vcb_e $vcb_f $train > $dir/$f-$e.cooc$suffix\n";
|
||||
safesystem("$SNT2COOC $vcb_e $vcb_f $train > $dir/$f-$e.cooc$suffix") or die("ERROR");
|
||||
} else {
|
||||
print "$SNT2COOC $dir/$f-$e.cooc$suffix $vcb_e $vcb_f $train\n";
|
||||
safesystem("$SNT2COOC $dir/$f-$e.cooc$suffix $vcb_e $vcb_f $train") or die("ERROR");
|
||||
}
|
||||
&merge_cooc_files($dir,$e,$f,"$dir/$f-$e.cooc.new",$_BASELINE_ALIGNMENT_MODEL[2 + ($dir eq $___GIZA_F2E?1:0)])
|
||||
if scalar @_BASELINE_ALIGNMENT_MODEL;
|
||||
}
|
||||
|
||||
### (3) CREATE WORD ALIGNMENT FROM GIZA ALIGNMENTS
|
||||
|
Loading…
Reference in New Issue
Block a user