mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-27 22:14:57 +03:00
Merge branch 'master' of github.com:moses-smt/mosesdecoder
This commit is contained in:
commit
ac764f9dce
@ -260,7 +260,8 @@ script = $moses-script-dir/training/train-model.perl
|
||||
### general options
|
||||
# these are options that are passed on to train-model.perl, for instance
|
||||
# * "-mgiza -mgiza-cpus 8" to use mgiza instead of giza
|
||||
# * "-sort-buffer-size 8G" to reduce on-disk sorting
|
||||
# * "-sort-buffer-size 8G -sort-compress gzip" to reduce on-disk sorting
|
||||
# * "-sort-parallel 8 -cores 8" to speed up phrase table building
|
||||
#
|
||||
#training-options = ""
|
||||
|
||||
|
@ -280,7 +280,8 @@ script = $moses-script-dir/training/train-model.perl
|
||||
### general options
|
||||
# these are options that are passed on to train-model.perl, for instance
|
||||
# * "-mgiza -mgiza-cpus 8" to use mgiza instead of giza
|
||||
# * "-sort-buffer-size 8G" to reduce on-disk sorting
|
||||
# * "-sort-buffer-size 8G -sort-compress gzip" to reduce on-disk sorting
|
||||
# * "-sort-parallel 8 -cores 8" to speed up phrase table building
|
||||
#
|
||||
#training-options = ""
|
||||
|
||||
|
@ -260,7 +260,8 @@ script = $moses-script-dir/training/train-model.perl
|
||||
### general options
|
||||
# these are options that are passed on to train-model.perl, for instance
|
||||
# * "-mgiza -mgiza-cpus 8" to use mgiza instead of giza
|
||||
# * "-sort-buffer-size 8G" to reduce on-disk sorting
|
||||
# * "-sort-buffer-size 8G -sort-compress gzip" to reduce on-disk sorting
|
||||
# * "-sort-parallel 8 -cores 8" to speed up phrase table building
|
||||
#
|
||||
#training-options = ""
|
||||
|
||||
|
@ -264,7 +264,8 @@ script = $moses-script-dir/training/train-model.perl
|
||||
### general options
|
||||
# these are options that are passed on to train-model.perl, for instance
|
||||
# * "-mgiza -mgiza-cpus 8" to use mgiza instead of giza
|
||||
# * "-sort-buffer-size 8G" to reduce on-disk sorting
|
||||
# * "-sort-buffer-size 8G -sort-compress gzip" to reduce on-disk sorting
|
||||
# * "-sort-parallel 8 -cores 8" to speed up phrase table building
|
||||
#
|
||||
#training-options = ""
|
||||
|
||||
|
@ -244,7 +244,8 @@ script = $moses-script-dir/training/train-model.perl
|
||||
### general options
|
||||
# these are options that are passed on to train-model.perl, for instance
|
||||
# * "-mgiza -mgiza-cpus 8" to use mgiza instead of giza
|
||||
# * "-sort-buffer-size 8G" to reduce on-disk sorting
|
||||
# * "-sort-buffer-size 8G -sort-compress gzip" to reduce on-disk sorting
|
||||
# * "-sort-parallel 8 -cores 8" to speed up phrase table building
|
||||
#
|
||||
#training-options = ""
|
||||
|
||||
|
@ -8,15 +8,23 @@ my $FILLER = ":s:es";
|
||||
my $MIN_SIZE = 3;
|
||||
my $MIN_COUNT = 5;
|
||||
my $MAX_COUNT = 5;
|
||||
my $FACTORED = 0;
|
||||
my $SYNTAX = 0;
|
||||
my $MARK_SPLIT = 0;
|
||||
my $BINARIZE = 0;
|
||||
$HELP = 1
|
||||
unless &GetOptions('corpus=s' => \$CORPUS,
|
||||
'model=s' => \$MODEL,
|
||||
'filler=s' => \$FILLER,
|
||||
'factored' => \$FACTORED,
|
||||
'min-size=i' => \$MIN_SIZE,
|
||||
'min-count=i' => \$MIN_COUNT,
|
||||
'max-count=i' => \$MAX_COUNT,
|
||||
'help' => \$HELP,
|
||||
'verbose' => \$VERBOSE,
|
||||
'syntax' => \$SYNTAX,
|
||||
'binarize' => \$BINARIZE,
|
||||
'mark-split' => \$MARK_SPLIT,
|
||||
'train' => \$TRAIN);
|
||||
|
||||
if ($HELP ||
|
||||
@ -29,59 +37,152 @@ if ($HELP ||
|
||||
print "options: -min-size: minimum word size (default $MIN_SIZE)\n";
|
||||
print " -min-count: minimum word count (default $MIN_COUNT)\n";
|
||||
print " -filler: filler letters between words (default $FILLER)\n";
|
||||
print " -factor: factored data, assuming factor 0 as surface (default $FACTORED)\n";
|
||||
print " -syntax: syntactically parsed data (default $SYNTAX)\n";
|
||||
print " -mark-split: mark non-terminal label of split words (default $MARK_SPLIT)\n";
|
||||
print " -binarize: binarize subtree for split word (default $BINARIZE)\n";
|
||||
exit;
|
||||
}
|
||||
|
||||
if ($TRAIN) {
|
||||
&train;
|
||||
if ($SYNTAX) { &train_syntax(); }
|
||||
elsif ($FACTORED) { &train_factored(); }
|
||||
else { &train(); }
|
||||
}
|
||||
else {
|
||||
&apply;
|
||||
&apply();
|
||||
}
|
||||
|
||||
# Train a plain (unfactored, unparsed) model: count how often every
# whitespace-separated token occurs in the corpus file named by $CORPUS
# and hand the counts to save_trained_model().
# Dies if the corpus cannot be opened.
sub train {
    my %COUNT;

    # three-argument open: the corpus name is always treated as a plain
    # file name, never as a mode/pipe specification
    open(my $corpus_fh, '<', $CORPUS)
        or die("ERROR: could not open corpus '$CORPUS': $!");
    while (my $line = <$corpus_fh>) {
        # chomp (not chop): a final line without trailing newline must
        # not lose the last character of its last word
        chomp($line);
        # split ' ' ignores leading/trailing whitespace and collapses runs
        foreach my $word (split(' ', $line)) {
            $COUNT{$word}++;
        }
    }
    close($corpus_fh);

    save_trained_model(\%COUNT);
}
|
||||
|
||||
# Write the trained model to the file named by $MODEL.
#   $COUNT - reference to a hash mapping model entry (word, factored word,
#            or "word label") to its count
# Each output line is "<id>\t<entry>\t<count>", with ids numbered from 1.
# Entries are written in sorted order so that repeated training runs on
# the same corpus produce identical model files.
# Dies if the model file cannot be opened or fully written.
sub save_trained_model {
    my ($COUNT) = @_;
    my $id = 0;

    open(my $model_fh, '>', $MODEL)
        or die("ERROR: could not write model '$MODEL': $!");
    foreach my $word (sort keys %$COUNT) {
        print {$model_fh} "".(++$id)."\t".$word."\t".$$COUNT{$word}."\n";
    }
    # buffered write errors (e.g. disk full) only surface at close
    close($model_fh)
        or die("ERROR: could not write model '$MODEL': $!");

    print STDERR "written model file with ".(scalar keys %$COUNT)." words.\n";
}
|
||||
|
||||
# Train a model from factored data (tokens of the form
# "surface|factor1|..."), read from the corpus file named by $CORPUS.
# For every surface form (the text before the first "|") only the most
# frequent factored interpretation is kept; it is stored with the summed
# count of all interpretations of that surface form.
# The result is handed to save_trained_model().
# Dies if the corpus cannot be opened.
sub train_factored {
    my (%COUNT, %FACTORED_COUNT);

    # collect counts for interpretations for each surface word
    open(my $corpus_fh, '<', $CORPUS)
        or die("ERROR: could not open corpus '$CORPUS': $!");
    while (my $line = <$corpus_fh>) {
        # chomp (not chop): keep the last character of a final line that
        # has no trailing newline
        chomp($line);
        foreach my $factored_word (split(' ', $line)) {
            my $word = $factored_word;
            $word =~ s/\|.+//g; # just first factor
            $FACTORED_COUNT{$word}{$factored_word}++;
        }
    }
    close($corpus_fh);

    # only preserve most frequent interpretation, assign sum of counts
    foreach my $word (keys %FACTORED_COUNT) {
        my ($max, $best, $total) = (0, "", 0);
        # sorted iteration makes the winner among equally frequent
        # interpretations independent of hash ordering
        foreach my $factored_word (sort keys %{$FACTORED_COUNT{$word}}) {
            my $count = $FACTORED_COUNT{$word}{$factored_word};
            $total += $count;
            if ($count > $max) {
                $max = $count;
                $best = $factored_word;
            }
        }
        $COUNT{$best} = $total;
    }

    save_trained_model(\%COUNT);
}
|
||||
|
||||
# Train a model from syntactically annotated data read from the corpus
# file named by $CORPUS. Tokens matching label="..." set the current
# label; every other token that does not start with "<" is counted as a
# word under the label seen most recently on the same line.
# For every word only its most frequent label is kept; the model entry is
# "word label" with the summed count over all labels of that word.
# The result is handed to save_trained_model().
# Dies if the corpus cannot be opened.
sub train_syntax {
    my (%COUNT, %LABELED_COUNT);

    # collect counts for interpretations for each surface word
    open(my $corpus_fh, '<', $CORPUS)
        or die("ERROR: could not open corpus '$CORPUS': $!");
    while (my $line = <$corpus_fh>) {
        # chomp (not chop): keep the last character of a final line that
        # has no trailing newline
        chomp($line);
        # start each line with an empty label so that a word seen before
        # any label attribute is not counted under an undefined hash key
        my $label = "";
        foreach my $token (split(' ', $line)) {
            if ($token =~ /^label="([^\"]+)"/) {
                $label = $1;
            }
            elsif ($token !~ /^</) {
                $LABELED_COUNT{$token}{$label}++;
            }
        }
    }
    close($corpus_fh);

    # only preserve most frequent label, assign sum of counts
    foreach my $word (keys %LABELED_COUNT) {
        my ($max, $best, $total) = (0, "", 0);
        # sorted iteration makes the winner among equally frequent labels
        # independent of hash ordering
        foreach my $label (sort keys %{$LABELED_COUNT{$word}}) {
            my $count = $LABELED_COUNT{$word}{$label};
            $total += $count;
            if ($count > $max) {
                $max = $count;
                $best = "$word $label";
            }
        }
        $COUNT{$best} = $total;
    }

    save_trained_model(\%COUNT);
}
|
||||
|
||||
sub apply {
|
||||
my (%WORD,%TRUECASE);
|
||||
my (%COUNT,%TRUECASE,%LABEL);
|
||||
open(MODEL,$MODEL) || die("ERROR: could not open model '$MODEL'");
|
||||
while(<MODEL>) {
|
||||
chomp;
|
||||
my ($id,$word,$count) = split(/\t/);
|
||||
my ($id,$factored_word,$count) = split(/\t/);
|
||||
my $label;
|
||||
($factored_word,$label) = split(/ /,$factored_word);
|
||||
my $word = $factored_word;
|
||||
$word =~ s/\|.+//g; # just first factor
|
||||
my $lc = lc($word);
|
||||
# if word exists with multiple casings, only record most frequent
|
||||
next if defined($WORD{$lc}) && $WORD{$lc} > $count;
|
||||
$WORD{$lc} = $count;
|
||||
$TRUECASE{$lc} = $word;
|
||||
next if defined($COUNT{$lc}) && $COUNT{$lc} > $count;
|
||||
$COUNT{$lc} = $count;
|
||||
$TRUECASE{$lc} = $factored_word;
|
||||
$LABEL{$lc} = $label if $SYNTAX;
|
||||
}
|
||||
close(MODEL);
|
||||
|
||||
while(<STDIN>) {
|
||||
my $first = 1;
|
||||
chop; s/\s+/ /g; s/^ //; s/ $//;
|
||||
foreach my $word (split) {
|
||||
my @BUFFER; # for xml tags
|
||||
foreach my $factored_word (split) {
|
||||
print " " unless $first;
|
||||
$first = 0;
|
||||
|
||||
# syntax: don't split xml
|
||||
if ($SYNTAX && ($factored_word =~ /^</ || $factored_word =~ />$/)) {
|
||||
push @BUFFER,$factored_word;
|
||||
$first = 1;
|
||||
next;
|
||||
}
|
||||
|
||||
# get case class
|
||||
my $word = $factored_word;
|
||||
$word =~ s/\|.+//g; # just first factor
|
||||
my $lc = lc($word);
|
||||
|
||||
# don't split frequent words
|
||||
if (defined($WORD{$word}) && $WORD{$word}>=$MAX_COUNT) {
|
||||
print $word;
|
||||
if (defined($COUNT{$lc}) && $COUNT{$lc}>=$MAX_COUNT) {
|
||||
print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer
|
||||
print $factored_word;
|
||||
next;
|
||||
}
|
||||
|
||||
@ -100,17 +201,18 @@ sub apply {
|
||||
my $subword = lc(substr($word,
|
||||
$start+length($filler),
|
||||
$end-$start+1-length($filler)));
|
||||
next unless defined($WORD{$subword});
|
||||
next unless $WORD{$subword} >= $MIN_COUNT;
|
||||
print STDERR "\tmatching word $start .. $end ($filler)$subword $WORD{$subword}\n" if $VERBOSE;
|
||||
push @{$REACHABLE{$end}},"$start $TRUECASE{$subword} $WORD{$subword}";
|
||||
next unless defined($COUNT{$subword});
|
||||
next unless $COUNT{$subword} >= $MIN_COUNT;
|
||||
print STDERR "\tmatching word $start .. $end ($filler)$subword $COUNT{$subword}\n" if $VERBOSE;
|
||||
push @{$REACHABLE{$end}},"$start $TRUECASE{$subword} $COUNT{$subword}";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# no matches at all?
|
||||
if (!defined($REACHABLE{$final})) {
|
||||
print $word;
|
||||
print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer
|
||||
print $factored_word;
|
||||
next;
|
||||
}
|
||||
|
||||
@ -152,9 +254,35 @@ sub apply {
|
||||
last unless scalar @{$REACHABLE{$final}} > $ITERATOR{$final};
|
||||
for(my $i=0;$i<$increase;$i++) { $ITERATOR{$i}=0; }
|
||||
}
|
||||
$best_split = $word unless $best_split =~ / /; # do not change case for unsplit words
|
||||
print $best_split;
|
||||
if ($best_split !~ / /) {
|
||||
print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer
|
||||
print $word; # do not change case for unsplit words
|
||||
next;
|
||||
}
|
||||
if (!$SYNTAX) {
|
||||
print $best_split;
|
||||
}
|
||||
else {
|
||||
$BUFFER[$#BUFFER] =~ s/label=\"/label=\"SPLIT-/ if $MARK_SPLIT;
|
||||
$BUFFER[$#BUFFER] =~ /label=\"([^\"]+)\"/ || die("ERROR: $BUFFER[$#BUFFER]\n");
|
||||
my $pos = $1;
|
||||
print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer
|
||||
|
||||
my @SPLIT = split(/ /,$best_split);
|
||||
my @OUT = ();
|
||||
if ($BINARIZE) {
|
||||
for(my $w=0;$w<scalar(@SPLIT)-2;$w++) {
|
||||
push @OUT,"<tree label=\"\@$pos\">";
|
||||
}
|
||||
}
|
||||
for(my $w=0;$w<scalar(@SPLIT);$w++) {
|
||||
if ($BINARIZE && $w>=2) { push @OUT, "</tree>"; }
|
||||
push @OUT,"<tree label=\"".$LABEL{lc($SPLIT[$w])}."\"> $SPLIT[$w] </tree>";
|
||||
}
|
||||
print join(" ",@OUT);
|
||||
}
|
||||
}
|
||||
print " ".join(" ",@BUFFER) if scalar(@BUFFER); @BUFFER = (); # clear buffer
|
||||
print "\n";
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user