support for use of baseline alignment model

This commit is contained in:
phikoehn 2012-12-12 03:59:14 +00:00
parent 438dcb1a34
commit 24e1df7520
8 changed files with 149 additions and 34 deletions

View File

@ -340,6 +340,18 @@ alignment-symmetrization-method = grow-diag-final-and
#berkeley-process-options = "-EMWordAligner.numThreads 8"
#berkeley-posterior = 0.5
### use of baseline alignment model (incremental training)
#
#baseline = 68
#baseline-alignment-model = "$working-dir/training/prepared.$baseline/$input-extension.vcb \
# $working-dir/training/prepared.$baseline/$output-extension.vcb \
# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.cooc \
# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.cooc \
# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.thmm.5 \
# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.hhmm.5 \
# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.thmm.5 \
# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.hhmm.5"
### if word alignment should be skipped,
# point to word alignment files
#

View File

@ -360,6 +360,18 @@ alignment-symmetrization-method = grow-diag-final-and
#berkeley-process-options = "-EMWordAligner.numThreads 8"
#berkeley-posterior = 0.5
### use of baseline alignment model (incremental training)
#
#baseline = 68
#baseline-alignment-model = "$working-dir/training/prepared.$baseline/$input-extension.vcb \
# $working-dir/training/prepared.$baseline/$output-extension.vcb \
# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.cooc \
# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.cooc \
# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.thmm.5 \
# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.hhmm.5 \
# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.thmm.5 \
# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.hhmm.5"
### if word alignment should be skipped,
# point to word alignment files
#

View File

@ -340,6 +340,18 @@ alignment-symmetrization-method = grow-diag-final-and
#berkeley-process-options = "-EMWordAligner.numThreads 8"
#berkeley-posterior = 0.5
### use of baseline alignment model (incremental training)
#
#baseline = 68
#baseline-alignment-model = "$working-dir/training/prepared.$baseline/$input-extension.vcb \
# $working-dir/training/prepared.$baseline/$output-extension.vcb \
# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.cooc \
# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.cooc \
# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.thmm.5 \
# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.hhmm.5 \
# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.thmm.5 \
# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.hhmm.5"
### if word alignment should be skipped,
# point to word alignment files
#

View File

@ -344,6 +344,18 @@ alignment-symmetrization-method = grow-diag-final-and
#berkeley-process-options = "-EMWordAligner.numThreads 8"
#berkeley-posterior = 0.5
### use of baseline alignment model (incremental training)
#
#baseline = 68
#baseline-alignment-model = "$working-dir/training/prepared.$baseline/$input-extension.vcb \
# $working-dir/training/prepared.$baseline/$output-extension.vcb \
# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.cooc \
# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.cooc \
# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.thmm.5 \
# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.hhmm.5 \
# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.thmm.5 \
# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.hhmm.5"
### if word alignment should be skipped,
# point to word alignment files
#

View File

@ -324,6 +324,18 @@ alignment-symmetrization-method = grow-diag-final-and
#berkeley-process-options = "-EMWordAligner.numThreads 8"
#berkeley-posterior = 0.5
### use of baseline alignment model (incremental training)
#
#baseline = 68
#baseline-alignment-model = "$working-dir/training/prepared.$baseline/$input-extension.vcb \
# $working-dir/training/prepared.$baseline/$output-extension.vcb \
# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.cooc \
# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.cooc \
# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.thmm.5 \
# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.hhmm.5 \
# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.thmm.5 \
# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.hhmm.5"
### if word alignment should be skipped,
# point to word alignment files
#

View File

@ -401,23 +401,25 @@ mml-filter-before-wa
prepare-data
in: corpus-mml-prefilter=OR=corpus
out: prepared-data
rerun-on-change: alignment-factors training-options script
rerun-on-change: alignment-factors training-options script baseline-alignment-model
ignore-if: use-berkeley
default-name: prepared
run-giza
in: prepared-data
out: giza-alignment
ignore-if: use-berkeley
rerun-on-change: giza-settings training-options script
rerun-on-change: giza-settings training-options script baseline-alignment-model
default-name: giza
error: not found
not-error: 0 not found
run-giza-inverse
in: prepared-data
out: giza-alignment-inverse
rerun-on-change: giza-settings training-options script
rerun-on-change: giza-settings training-options script baseline-alignment-model
ignore-if: use-berkeley
default-name: giza-inverse
error: not found
not-error: 0 not found
run-berkeley
in: corpus-mml-prefilter
out: berkeley-alignment

View File

@ -297,6 +297,10 @@ sub read_config {
$line_count++;
s/\#.*$//; # strip comments
next if /^\#/ || /^\s*$/;
while (/\\\s*$/) { # merge with next line
s/\s*\\\s*$/ /;
$_ .= <INI>;
}
if (/^\[(.+)\]/) {
$module = $1;
$ignore = /ignore/i;
@ -329,7 +333,7 @@ sub read_config {
# resolve parameters used in values
my $resolve = 1;
my $loop_count = 0;
while($resolve && $loop_count++ < 10) {
while($resolve && $loop_count++ < 100) {
$resolve = 0;
foreach my $parameter (keys %CONFIG) {
foreach (@{$CONFIG{$parameter}}) {
@ -2354,6 +2358,7 @@ sub get_training_setting {
my $score_settings = &get("TRAINING:score-settings");
my $parallel = &get("TRAINING:parallel");
my $pcfg = &get("TRAINING:use-pcfg-feature");
my $baseline_alignment = &get("TRAINING:baseline-alignment-model");
my $xml = $source_syntax || $target_syntax;
@ -2377,6 +2382,7 @@ sub get_training_setting {
$cmd .= "-score-options '".$score_settings."' " if $score_settings;
$cmd .= "-parallel " if $parallel;
$cmd .= "-pcfg " if $pcfg;
$cmd .= "-baseline-alignment-model $baseline_alignment " if defined($baseline_alignment) && ($step == 1 || $step == 2);
# factored training
if (&backoff_and_get("TRAINING:input-factors")) {

View File

@ -38,7 +38,7 @@ my($_EXTERNAL_BINDIR, $_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_
$_MEMSCORE, $_FINAL_ALIGNMENT_MODEL,
$_CONTINUE,$_MAX_LEXICAL_REORDERING,$_DO_STEPS,
@_ADDITIONAL_INI,$_ADDITIONAL_INI_FILE,
$_SPARSE_TRANSLATION_TABLE,
$_SPARSE_TRANSLATION_TABLE, @_BASELINE_ALIGNMENT_MODEL,
$_DICTIONARY, $_SPARSE_PHRASE_FEATURES, $_EPPEX, $IGNORE);
my $_CORES = 1;
@ -128,6 +128,7 @@ $_HELP = 1
'additional-ini=s' => \@_ADDITIONAL_INI,
'additional-ini-file=s' => \$_ADDITIONAL_INI_FILE,
'sparse-translation-table' => \$_SPARSE_TRANSLATION_TABLE,
'baseline-alignment-model=s{8}' => \@_BASELINE_ALIGNMENT_MODEL,
'cores=i' => \$_CORES
);
@ -373,6 +374,11 @@ my $___ALIGNMENT = "grow-diag-final";
$___ALIGNMENT = $_ALIGNMENT if $_ALIGNMENT;
my $___NOTE_ALIGNMENT_DROPS = 1;
# baseline alignment model for incremental updating
die "ERROR: buggy definition of baseline alignment model, should have 8 values:\n\t".join("\n\t",@_BASELINE_ALIGNMENT_MODEL)."\n"
unless scalar(@_BASELINE_ALIGNMENT_MODEL) == 8 || scalar(@_BASELINE_ALIGNMENT_MODEL) == 0;
die "ERROR: use of baseline alignment model limited to HMM training (-hmm-align)\n"
if defined($___FINAL_ALIGNMENT_MODEL) && $___FINAL_ALIGNMENT_MODEL ne 'hmm' && scalar(@_BASELINE_ALIGNMENT_MODEL) == 8;
# model dir and alignment/extract file
my $___MODEL_DIR = $___ROOT_DIR."/model";
@ -620,8 +626,8 @@ sub prepare {
&make_classes($corpus.".".$___F,$___VCB_F.".classes");
&make_classes($corpus.".".$___E,$___VCB_E.".classes");
$VCB_F = &get_vocabulary($corpus.".".$___F,$___VCB_F);
$VCB_E = &get_vocabulary($corpus.".".$___E,$___VCB_E);
$VCB_F = &get_vocabulary($corpus.".".$___F,$___VCB_F,0);
$VCB_E = &get_vocabulary($corpus.".".$___E,$___VCB_E,1);
&numberize_txt_file($VCB_F,$corpus.".".$___F,
$VCB_E,$corpus.".".$___E,
@ -659,8 +665,8 @@ sub prepare {
exit 0;
}
$VCB_F = &get_vocabulary($corpus.".".$___F,$___VCB_F);
$VCB_E = &get_vocabulary($corpus.".".$___E,$___VCB_E);
$VCB_F = &get_vocabulary($corpus.".".$___F,$___VCB_F,0);
$VCB_E = &get_vocabulary($corpus.".".$___E,$___VCB_E,1);
&numberize_txt_file($VCB_F,$corpus.".".$___F,
$VCB_E,$corpus.".".$___E,
@ -787,7 +793,7 @@ sub make_classes {
sub get_vocabulary {
# return unless $___LEXICAL_WEIGHTING;
my($corpus,$vcb) = @_;
my($corpus,$vcb,$is_target) = @_;
print STDERR "(1.2) creating vcb file $vcb @ ".`date`;
my %WORD;
@ -797,17 +803,37 @@ sub get_vocabulary {
foreach (split) { $WORD{$_}++; }
}
close(TXT);
my ($id,%VCB);
open(VCB,">", "$vcb") or die "ERROR: Can't write $vcb";
# words from baseline alignment model when incrementally updating
if (scalar @_BASELINE_ALIGNMENT_MODEL) {
open(BASELINE_VCB,$_BASELINE_ALIGNMENT_MODEL[$is_target]);
while(<BASELINE_VCB>) {
chop;
my ($i,$word,$count) = split;
if (defined($WORD{$word})) {
$count += $WORD{$word};
delete($WORD{$word});
}
printf VCB "%d\t%s\t%d\n",$i,$word,$count;
$VCB{$word} = $i;
$id = $i+1;
}
close(BASELINE_VCB);
}
# not incrementally updating
else {
print VCB "1\tUNK\t0\n";
$id=2;
}
my @NUM;
foreach my $word (keys %WORD) {
my $vcb_with_number = sprintf("%07d %s",$WORD{$word},$word);
push @NUM,$vcb_with_number;
}
my %VCB;
open(VCB,">", "$vcb") or die "ERROR: Can't write $vcb";
print VCB "1\tUNK\t0\n";
my $id=2;
foreach (reverse sort @NUM) {
my($count,$word) = split;
printf VCB "%d\t%s\t%d\n",$id,$word,$count;
@ -986,15 +1012,30 @@ sub run_single_giza_on_parts {
close(SNT);
# run snt2cooc in parts
my @COOC_PART_FILE_NAME;
for(my $i=1;$i<=$___PARTS;$i++) {
&run_single_snt2cooc("$dir/part$i",$e,$f,$vcb_e,$vcb_f,"$___CORPUS_DIR/part$i/$f-$e-int-train.snt");
push @COOC_PART_FILE_NAME, "$dir/part$i/$f-$e.cooc";
}
# include baseline cooc, if baseline alignment model (incremental training)
if (scalar @_BASELINE_ALIGNMENT_MODEL) {
push @COOC_PART_FILE_NAME, $_BASELINE_ALIGNMENT_MODEL[2 + ($dir eq $___GIZA_F2E?1:0)];
}
&merge_cooc_files($dir,$e,$f,@COOC_PART_FILE_NAME);
# run giza
&run_single_giza($dir,$e,$f,$vcb_e,$vcb_f,$train);
}
sub merge_cooc_files {
my ($dir,$e,$f,@COOC_PART_FILE_NAME) = @_;
# merge parts
open(COOC,">$dir/$f-$e.cooc") or die "ERROR: Can't write $dir/$f-$e.cooc";
my(@PF,@CURRENT);
for(my $i=1;$i<=$___PARTS;$i++) {
open($PF[$i],"$dir/part$i/$f-$e.cooc")or die "ERROR: Can't read $dir/part$i/$f-$e.cooc";
for(my $i=0;$i<scalar(@COOC_PART_FILE_NAME);$i++) {
print STDERR "merging cooc file $COOC_PART_FILE_NAME[$i]...\n";
open($PF[$i],$COOC_PART_FILE_NAME[$i]) or die "ERROR: Can't read $COOC_PART_FILE_NAME[$i]";
my $pf = $PF[$i];
$CURRENT[$i] = <$pf>;
chop($CURRENT[$i]) if $CURRENT[$i];
@ -1002,7 +1043,7 @@ sub run_single_giza_on_parts {
while(1) {
my ($min1,$min2) = (1e20,1e20);
for(my $i=1;$i<=$___PARTS;$i++) {
for(my $i=0;$i<scalar(@COOC_PART_FILE_NAME);$i++) {
next unless $CURRENT[$i];
my ($w1,$w2) = split(/ /,$CURRENT[$i]);
if ($w1 < $min1 || ($w1 == $min1 && $w2 < $min2)) {
@ -1012,7 +1053,7 @@ sub run_single_giza_on_parts {
}
last if $min1 == 1e20;
print COOC "$min1 $min2\n";
for(my $i=1;$i<=$___PARTS;$i++) {
for(my $i=0;$i<scalar(@COOC_PART_FILE_NAME);$i++) {
next unless $CURRENT[$i];
my ($w1,$w2) = split(/ /,$CURRENT[$i]);
if ($w1 == $min1 && $w2 == $min2) {
@ -1022,13 +1063,10 @@ sub run_single_giza_on_parts {
}
}
}
for(my $i=1;$i<=$___PARTS;$i++) {
for(my $i=0;$i<scalar(@COOC_PART_FILE_NAME);$i++) {
close($PF[$i]);
}
close(COOC);
# run giza
&run_single_giza($dir,$e,$f,$vcb_e,$vcb_f,$train);
}
sub run_single_giza {
@ -1083,6 +1121,12 @@ sub run_single_giza {
$GizaDefaultOptions{m5} = ($___FINAL_ALIGNMENT_MODEL eq '5')? 3: 0;
}
if (scalar(@_BASELINE_ALIGNMENT_MODEL)) {
$GizaDefaultOptions{oldTrPrbs} = $_BASELINE_ALIGNMENT_MODEL[4 + ($dir eq $___GIZA_F2E?2:0)];
$GizaDefaultOptions{oldAlPrbs} = $_BASELINE_ALIGNMENT_MODEL[5 + ($dir eq $___GIZA_F2E?2:0)];
$GizaDefaultOptions{step_k} = 1;
}
if ($___GIZA_OPTION) {
foreach (split(/[ ,]+/,$___GIZA_OPTION)) {
my ($option,$value) = split(/=/,$_,2);
@ -1123,16 +1167,19 @@ sub run_single_giza {
}
# Build the word co-occurrence file $dir/$f-$e.cooc for one alignment direction.
#
# Arguments:
#   $dir            - output directory (created with "mkdir -p" if missing)
#   $e, $f          - language identifiers; only used to name the output ($f-$e.cooc)
#   $vcb_e, $vcb_f  - vocabulary files handed to snt2cooc
#   $train          - sentence file handed to snt2cooc
#
# Without a baseline alignment model, snt2cooc writes $dir/$f-$e.cooc directly.
# With one (incremental training), snt2cooc writes $dir/$f-$e.cooc.new, which is
# then merged with the baseline co-occurrence file by merge_cooc_files, which
# writes $dir/$f-$e.cooc.
#
# Dies with "ERROR" whenever safesystem returns false.
sub run_single_snt2cooc {
    my($dir,$e,$f,$vcb_e,$vcb_f,$train) = @_;
    print STDERR "(2.1a) running snt2cooc $f-$e @ ".`date`."\n";

    # with a baseline model, the fresh counts go to a ".new" file first
    my $use_baseline = scalar @_BASELINE_ALIGNMENT_MODEL;
    my $suffix = $use_baseline ? ".new" : "";
    my $cooc_file = "$dir/$f-$e.cooc$suffix";

    safesystem("mkdir -p $dir") or die("ERROR");

    # snt2cooc.out prints to stdout; any other binary takes the output file
    # as its first argument
    if ($SNT2COOC eq "$_EXTERNAL_BINDIR/snt2cooc.out") {
        print "$SNT2COOC $vcb_e $vcb_f $train > $cooc_file\n";
        safesystem("$SNT2COOC $vcb_e $vcb_f $train > $cooc_file") or die("ERROR");
    } else {
        print "$SNT2COOC $cooc_file $vcb_e $vcb_f $train\n";
        safesystem("$SNT2COOC $cooc_file $vcb_e $vcb_f $train") or die("ERROR");
    }

    # baseline cooc files sit at index 2 and 3 of the baseline model list;
    # index 3 is chosen when $dir is exactly $___GIZA_F2E.
    # NOTE(review): a caller passing a sub-directory such as "$dir/part$i"
    # will never match this exact comparison and so always gets index 2 --
    # confirm this is intended for the inverse direction.
    &merge_cooc_files($dir,$e,$f,"$dir/$f-$e.cooc.new",$_BASELINE_ALIGNMENT_MODEL[2 + ($dir eq $___GIZA_F2E?1:0)])
        if $use_baseline;
}
### (3) CREATE WORD ALIGNMENT FROM GIZA ALIGNMENTS