mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-28 06:22:14 +03:00
3c07c5df4d
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1307 1f5c12ca-751b-0410-a591-d2e778427230
117 lines
3.2 KiB
Perl
Executable File
117 lines
3.2 KiB
Perl
Executable File
#!/usr/bin/perl
|
|
|
|
# $Id$
|
|
use strict;
|
|
use Getopt::Long "GetOptions";
|
|
|
|
# ---------------------------------------------------------------------------
# Command-line settings (filled in by GetOptions below).
# ---------------------------------------------------------------------------
my $_CORPUS;                  # --corpus: path of the factored corpus to read
my $_OUTPUT = "generation";   # --output: prefix of the table files to write
my $_GENERATION_FACTORS;      # --generation-factors: mapping spec, e.g. "0-1+0-2"

GetOptions(
    'corpus=s'             => \$_CORPUS,
    'output=s'             => \$_OUTPUT,
    'generation-factors=s' => \$_GENERATION_FACTORS,
) or die "specify options";

# Both the corpus and the factor specification are mandatory.
if (!$_CORPUS) {
    die "Please use --corpus to specify the factored input corpus\n";
}
die "Please use --generation-factors to set generation factors\n"
    unless defined $_GENERATION_FACTORS;

# A defined-but-false specification ("" or "0") falls back to "0-0".
my $___GENERATION_FACTORS = $_GENERATION_FACTORS ? $_GENERATION_FACTORS : "0-0";

# The specification is one or more "<source factors>-<target factors>"
# mappings joined by "+", each side being a comma-separated list of indices.
unless ($___GENERATION_FACTORS =~ /^\d+(\,\d+)*\-\d+(\,\d+)*(\+\d+(\,\d+)*\-\d+(\,\d+)*)*$/) {
    die("format for generation factors is \"0-1\" or \"0-1+0-2\" or \"0-1+0,1-1,2\", you provided $___GENERATION_FACTORS\n");
}

print "output=$_OUTPUT.<factor-map>\n";

get_generation_factored();

print "Done\n";
exit 0;
|
|
|
|
# Train one generation table per mapping listed in $___GENERATION_FACTORS.
# The specification is split on "+"; each piece has the form
# "<source factors>-<target factors>" and is passed to get_generation()
# together with its two halves.
sub get_generation_factored {
    my $started_at = `date`;
    print STDERR "(8) learn generation model @ " . $started_at;

    for my $mapping (split(/\+/, $___GENERATION_FACTORS)) {
        my ($factor_e_source, $factor_e) = split(/\-/, $mapping);
        get_generation($mapping, $factor_e_source, $factor_e);
    }
}
|
|
|
|
|
|
# Build the generation table for a single factor mapping.
#
# Arguments:
#   $factor          - the mapping as written on the command line (e.g.
#                      "0-1"); used as the suffix of the output file name
#   $factor_e_source - comma-separated indices of the source-side factors
#   $factor_e        - comma-separated indices of the target-side factors
#
# Every whitespace-separated token of the corpus ($_CORPUS) is split on "|"
# into its factors.  The selected source factors and the selected target
# factors are each re-joined with "|", and co-occurrences are counted.  One
# line per observed pair is written to "$_OUTPUT.$factor":
#
#   <source> <target> count(s,t)/count(s) count(s,t)/count(t)
#
# The file is then gzip-compressed, replacing any stale ".gz" left over from
# a previous run.  Dies if the corpus cannot be read, the table cannot be
# written, or compression fails.
sub get_generation {
    my ($factor, $factor_e_source, $factor_e) = @_;

    print STDERR "(8) [$factor] generate generation table @ ".`date`;

    # The factor index lists do not change per token, so split them once.
    my @source_factors = split(/,/, $factor_e_source);
    my @target_factors = split(/,/, $factor_e);

    my (%GENERATION, %GENERATION_TOTAL_SOURCE, %GENERATION_TOTAL_TARGET);

    # Three-argument open: the corpus path is never interpreted as a mode
    # or a pipe, whatever characters it contains.
    open(my $corpus_fh, '<', $_CORPUS) or die "Can't read $_CORPUS: $!";
    while (my $line = <$corpus_fh>) {
        chomp $line;
        foreach my $token (split(' ', $line)) {
            my @FACTOR = split(/\|/, $token);

            # A token with fewer factors than requested contributes an empty
            # string for the missing factor.
            my $source = join("|", map { defined $FACTOR[$_] ? $FACTOR[$_] : "" } @source_factors);
            my $target = join("|", map { defined $FACTOR[$_] ? $FACTOR[$_] : "" } @target_factors);

            $GENERATION{$source}{$target}++;
            $GENERATION_TOTAL_SOURCE{$source}++;
            $GENERATION_TOTAL_TARGET{$target}++;
        }
    }
    close($corpus_fh);

    my $table = "$_OUTPUT.$factor";
    open(my $table_fh, '>', $table) or die "Can't write $table: $!";
    # Sorted keys make the table byte-for-byte reproducible between runs.
    foreach my $source (sort keys %GENERATION) {
        foreach my $target (sort keys %{$GENERATION{$source}}) {
            printf {$table_fh} ("%s %s %.7f %.7f\n", $source, $target,
                $GENERATION{$source}{$target} / $GENERATION_TOTAL_SOURCE{$source},
                $GENERATION{$source}{$target} / $GENERATION_TOTAL_TARGET{$target});
        }
    }
    # Buffered write errors (e.g. disk full) only surface at close.
    close($table_fh) or die "Can't finish writing $table: $!";

    # List-form calls bypass the shell, so an output prefix containing
    # spaces or shell metacharacters is passed through verbatim.
    safesystem("rm", "-f", "$table.gz") or die;
    safesystem("gzip", $table) or die;
}
|
|
|
|
# Run an external command via system() and report the outcome on STDERR.
#
# Arguments: the command, exactly as accepted by system() (a single string
#            or a program followed by its argument list).
# Returns:   true if the command exited with status 0; false if it exited
#            with a non-zero status or was terminated by a signal.
# Exits the whole script with status 1 if the command could not be started.
sub safesystem {
    print STDERR "Executing: @_\n";
    system(@_);

    if ($? == -1) {
        print STDERR "Failed to execute: @_\n $!\n";
        exit(1);
    }

    if ($? & 127) {
        # The command text is passed as a %s argument so that a '%' inside
        # it cannot be misread as a format directive.
        printf STDERR "Execution of: %s\n died with signal %d, %s coredump\n",
            "@_", ($? & 127), ($? & 128) ? 'with' : 'without';
        # Death by signal is a failure; return false explicitly so that
        # `safesystem(...) or die` does not see printf's true return value.
        return 0;
    }

    my $exitcode = $? >> 8;
    print STDERR "Exit code: $exitcode\n" if $exitcode;
    return !$exitcode;
}
|
|
|