mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-27 05:55:02 +03:00
In-Decoding Transliteration Module
This commit is contained in:
parent
75213181ae
commit
5e3e50d4ec
230
scripts/Transliteration/in-decoding-transliteration.pl
Executable file
230
scripts/Transliteration/in-decoding-transliteration.pl
Executable file
@ -0,0 +1,230 @@
|
||||
#!/usr/bin/perl -w
|
||||
|
||||
use strict;
|
||||
|
||||
use utf8;
|
||||
use File::Basename;
|
||||
use Getopt::Long "GetOptions";
|
||||
use FindBin qw($RealBin);
|
||||
use Scalar::Util qw(looks_like_number);
|
||||
use IO::Handle;
|
||||
binmode(STDIN, ':utf8');
|
||||
binmode(STDOUT, ':utf8');
|
||||
binmode(STDERR, ':utf8');
|
||||
|
||||
# Character separating the factors of a token in the OOV file.
my $___FACTOR_DELIMITER = "|";
# Default location of the generated phrase table (suffixed with the process
# id); overridden by --out-file.
my $OUT_FILE = "/tmp/transliteration-phrase-table.$$";

my ($MOSES_SRC_DIR,$TRANSLIT_MODEL,$OOV_FILE, $OOV_FILE_NAME, $EXTERNAL_BIN_DIR, $LM_FILE, $INPUT_EXTENSION, $OUTPUT_EXTENSION);

# Fixed: the message used to name a different script.
die("ERROR: wrong syntax when invoking in-decoding-transliteration.pl")
    unless GetOptions('moses-src-dir=s' => \$MOSES_SRC_DIR,
                      'external-bin-dir=s' => \$EXTERNAL_BIN_DIR,
                      'transliteration-model-dir=s' => \$TRANSLIT_MODEL,
                      'input-extension=s' => \$INPUT_EXTENSION,
                      'output-extension=s' => \$OUTPUT_EXTENSION,
                      'transliteration-file=s' => \$OOV_FILE,
                      'out-file=s' => \$OUT_FILE);

# check if the files are in place
die("ERROR: you need to define --moses-src-dir --external-bin-dir, --transliteration-model-dir, --transliteration-file, --input-extension, and --output-extension")
    unless (defined($MOSES_SRC_DIR) &&
            defined($TRANSLIT_MODEL) &&
            defined($OOV_FILE) &&
            defined($INPUT_EXTENSION) &&
            defined($OUTPUT_EXTENSION) &&
            defined($EXTERNAL_BIN_DIR));

die("ERROR: could not find Transliteration Model '$TRANSLIT_MODEL'")
    unless -e $TRANSLIT_MODEL;
# Fixed: the opening quote around the file name was missing.
die("ERROR: could not find Transliteration file '$OOV_FILE'")
    unless -e $OOV_FILE;

$OOV_FILE_NAME = basename ($OOV_FILE);

# Working directory for all intermediate files. Created with the builtin so
# that an already existing directory (a rerun) is not reported as a failure.
my $evaluationDir = "$TRANSLIT_MODEL/evaluation";
unless (-d $evaluationDir) {
    mkdir $evaluationDir
        or die("ERROR: could not create directory '$evaluationDir': $!");
}

# List-form system() bypasses the shell, so paths containing spaces or shell
# metacharacters are passed through intact. A failed copy is only reported:
# prepare_for_transliteration() rewrites the target file anyway.
system("cp", $OOV_FILE, "$evaluationDir/") == 0
    or warn "WARNING: could not copy '$OOV_FILE' to '$evaluationDir/'\n";

my $translitFile = $TRANSLIT_MODEL . "/evaluation/" . $OOV_FILE_NAME;

print "Preparing for Transliteration\n";
prepare_for_transliteration ($OOV_FILE, $translitFile);
print "Run Transliteration\n";
run_transliteration ($MOSES_SRC_DIR , $EXTERNAL_BIN_DIR , $TRANSLIT_MODEL , $OOV_FILE_NAME);
print "Pick Best Transliteration\n";
form_corpus ($translitFile , $translitFile.".op.nBest" , $OUT_FILE);
|
||||
|
||||
|
||||
################### Read the UNK word file and prepare for Transliteration ###############################
|
||||
|
||||
# Collect the distinct words of the OOV file and write them, one per line and
# with the characters separated by single spaces, to the decoder input file.
#
# Arguments:
#   $testFile     - UTF-8 file with space-separated (possibly factored) tokens
#   $translitFile - file to create; receives one character-split word per line
#
# Only the first factor of each token is used. Tokens whose first factor
# contains a digit, '.' or ',' are reported on STDOUT and left out.
# Dies if either file cannot be opened or the output cannot be flushed.
sub prepare_for_transliteration
{
    my ($testFile, $translitFile) = @_;
    my %UNK;

    open my $in, "<:encoding(UTF-8)", $testFile or die "Can't open $testFile: $!\n";

    while (my $line = <$in>)
    {
        chomp $line;

        foreach my $token (split(/ /, $line))
        {
            my ($surface) = split /\Q$___FACTOR_DELIMITER\E/, $token;

            # Consecutive spaces yield empty tokens, and a token starting with
            # the delimiter yields an empty first factor; neither is a word.
            next unless defined $surface && $surface ne "";

            if ($surface =~ /[0-9.,]/)
            {
                print "Not transliterating $surface \n";
            }
            else
            {
                $UNK{$surface} = 1;
            }
        }
    }
    close ($in);

    open my $out, ">:encoding(UTF-8)", $translitFile or die "Can't open $translitFile: $!\n";

    # Sorted so that the generated file is identical from run to run.
    foreach my $key ( sort keys %UNK )
    {
        print $out join(' ', split('', $key)), "\n";
    }

    # Buffered write errors only surface here.
    close ($out) or die "Can't close $translitFile: $!\n";
    return;
}
|
||||
|
||||
################### Run Transliteration Module to Obtain Transliterations ###############################
|
||||
|
||||
# Run one shell command of the transliteration pipeline.
#
# The command's standard output is captured and discarded. A non-zero status
# is reported on STDERR but does not stop the pipeline.
# Returns true when the command succeeded.
sub _run_translit_step
{
    my ($label, $cmd) = @_;

    my $discarded = `$cmd`;
    my $status = $?;

    warn "WARNING: step '$label' failed with status $status\n" if $status != 0;
    return $status == 0;
}

# Filter the transliteration model for the given input and decode it,
# producing a 100-best list.
#
# Arguments:
#   $MOSES_SRC        - root of the Moses source tree (scripts/ and bin/)
#   $EXTERNAL_BIN_DIR - passed to train-model.perl as -external-bin-dir
#   $TRANSLIT_MODEL   - transliteration model directory
#   $eval_file        - name of the input file inside $TRANSLIT_MODEL/evaluation
#
# Also reads the file-level $INPUT_EXTENSION and $OUTPUT_EXTENSION.
# Writes "$eval_file.op" and "$eval_file.op.nBest" next to the input file.
sub run_transliteration
{
    my ($MOSES_SRC, $EXTERNAL_BIN_DIR, $TRANSLIT_MODEL, $eval_file) = @_;

    my $eval_prefix = "$TRANSLIT_MODEL/evaluation/$eval_file";
    my $table_ini   = "$eval_prefix.moses.table.ini";

    _run_translit_step("create table config", "touch $table_ini");

    print "Filter Table\n";

    # The -lm argument points at the config file itself as a placeholder model.
    _run_translit_step("train-model.perl",
        "$MOSES_SRC/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 -score-options '--KneserNey' -phrase-translation-table $TRANSLIT_MODEL/model/phrase-table -config ${table_ini} -lm 0:3:${table_ini}:8");

    _run_translit_step("filter-model-given-input.pl",
        "$MOSES_SRC/scripts/training/filter-model-given-input.pl $eval_prefix.filtered ${table_ini} $eval_prefix -Binarizer \"$MOSES_SRC/bin/processPhraseTable\"");

    _run_translit_step("remove table config", "rm ${table_ini}");

    print "Apply Filter\n";

    _run_translit_step("substitute-filtered-tables-and-weights.perl",
        "$MOSES_SRC/scripts/ems/support/substitute-filtered-tables-and-weights.perl $eval_prefix.filtered/moses.ini $TRANSLIT_MODEL/model/moses.ini $TRANSLIT_MODEL/tuning/moses.tuned.ini $eval_prefix.filtered.ini");

    _run_translit_step("moses",
        "$MOSES_SRC/bin/moses -search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 -threads 16 -drop-unknown -distortion-limit 0 -n-best-list $eval_prefix.op.nBest 100 distinct -f $eval_prefix.filtered.ini < $eval_prefix > $eval_prefix.op");

    return;
}
|
||||
|
||||
################### Read the output of Transliteration Model and Form Corpus ###############################
|
||||
|
||||
|
||||
# Turn the decoder's n-best list into a gzip-compressed phrase table.
#
# Arguments:
#   $inp_file    - decoder input: one character-split source word per line
#   $testFile    - n-best list produced for $inp_file
#   $phraseTable - phrase table to write; compressed to "$phraseTable.gz"
#
# Each n-best entry becomes one line of the form
#   source ||| target ||| scores ||| 0-0 ||| 0 0 0
# where source and target have their spaces removed and each score is exp()
# of a numeric feature value. Features labelled *Penalty0*, "Distortion0="
# and "LM0=" are left out together with the value that follows them.
#
# Dies if a file cannot be opened or the phrase table cannot be flushed.
sub form_corpus
{
    my ($inp_file, $testFile, $phraseTable) = @_;
    my @UNK;

    # Source words, indexed by their 0-based line number in the decoder input.
    open my $src_fh, "<:encoding(UTF-8)", $inp_file or die "Can't open $inp_file: $!\n";
    while (my $line = <$src_fh>)
    {
        chomp $line;
        $line =~ tr/ //d;
        push(@UNK, $line);
    }
    close ($src_fh);

    open my $nbest_fh, "<:encoding(UTF-8)", $testFile or die "Can't open $testFile: $!\n";
    open my $pt_fh, ">:encoding(UTF-8)", $phraseTable or die "Can't open $phraseTable: $!\n";

    while (my $line = <$nbest_fh>)
    {
        chomp $line;
        my @words = split(/ /, $line);

        # The first field is the index of the input line the entry belongs to.
        # Using it directly keeps entries aligned even when the decoder emits
        # nothing for some input line.
        my $sNum = $words[0];
        unless (defined $sNum && $sNum =~ /\A[0-9]+\z/ && $sNum < @UNK)
        {
            warn "WARNING: skipping n-best line $. of $testFile: bad sentence number\n";
            next;
        }

        # Hypothesis: everything between the first and the second "|||".
        my $i = 2;
        my $thisStr = "";
        while ($i < @words && $words[$i] ne "|||")
        {
            $thisStr = $thisStr . $words[$i];
            $i++;
        }
        if ($i >= @words)
        {
            # Truncated line; scanning on would never find the separator.
            warn "WARNING: skipping n-best line $. of $testFile: missing '|||'\n";
            next;
        }
        $i++;

        # Feature scores: everything up to the next "|||".
        my $features = "";
        while ($i < @words && $words[$i] ne "|||")
        {
            if ($words[$i] =~ /Penalty0/ || $words[$i] eq "Distortion0=" || $words[$i] eq "LM0=" )
            {
                $i++;    # step over the label; its value is skipped below
            }
            elsif (looks_like_number($words[$i]))
            {
                $features = $features . " " . exp($words[$i]);
            }
            $i++;
        }

        if ($thisStr ne "")
        {
            print $pt_fh "$UNK[$sNum] ||| $thisStr ||| $features ||| 0-0 ||| 0 0 0\n";
        }
    }
    close ($nbest_fh);
    # Buffered write errors only surface here.
    close ($pt_fh) or die "Can't close $phraseTable: $!\n";

    # List form bypasses the shell, so the path needs no quoting.
    system("gzip", $phraseTable) == 0
        or warn "WARNING: gzip of $phraseTable failed with status $?\n";

    return;
}
|
||||
|
||||
|
@ -533,6 +533,13 @@ build-transliteration-model
|
||||
ignore-unless: transliteration-module
|
||||
rerun-on-change: transliteration-module training-options script giza-settings
|
||||
default-name: model/Transliteration
|
||||
build-translit-table
|
||||
in: transliteration-model
|
||||
out: transliteration-table
|
||||
ignore-unless: in-decoding-transliteration
|
||||
rerun-on-change: in-decoding-transliteration transliteration-module
|
||||
default-name: model/transliteration-phrase-table
|
||||
template: $moses-script-dir/Transliteration/in-decoding-transliteration.pl --moses-src-dir $moses-src-dir --external-bin-dir $external-bin-dir --transliteration-model-dir IN --input-extension $input-extension --output-extension $output-extension --transliteration-file $transliteration-file --out-file OUT
|
||||
extract-phrases
|
||||
in: corpus-mml-postfilter=OR=word-alignment scored-corpus
|
||||
out: extracted-phrases
|
||||
@ -601,7 +608,7 @@ build-sparse
|
||||
default-name: model/sparse-features
|
||||
template: $moses-script-dir/ems/support/build-sparse-features.perl IN $input-extension $output-extension OUT "$sparse-features"
|
||||
create-config
|
||||
in: sigtest-filter-reordering-table sigtest-filter-phrase-translation-table transliteration-model generation-table sparse corpus-mml-prefilter=OR=corpus-mml-postfilter=OR=domains osm-model INTERPOLATED-LM:binlm LM:binlm
|
||||
in: sigtest-filter-reordering-table sigtest-filter-phrase-translation-table transliteration-table generation-table sparse corpus-mml-prefilter=OR=corpus-mml-postfilter=OR=domains osm-model INTERPOLATED-LM:binlm LM:binlm
|
||||
out: config
|
||||
ignore-if: use-hiero
|
||||
rerun-on-change: decoding-steps alignment-factors translation-factors reordering-factors generation-factors lexicalized-reordering training-options script decoding-graph-backoff score-settings additional-ini
|
||||
@ -863,7 +870,7 @@ split-reference-devtest
|
||||
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
|
||||
template: $output-splitter -model IN1.$output-extension < IN > OUT
|
||||
filter
|
||||
in: input TRAINING:sigtest-filter-phrase-translation-table TRAINING:sigtest-filter-reordering-table TRAINING:corpus-mml-prefilter=OR=TRAINING:corpus-mml-postfilter=OR=TRAINING:domains
|
||||
in: input TRAINING:sigtest-filter-phrase-translation-table TRAINING:sigtest-filter-reordering-table TRAINING:corpus-mml-prefilter=OR=TRAINING:corpus-mml-postfilter=OR=TRAINING:domains TRAINING:transliteration-table
|
||||
out: filtered-dir
|
||||
default-name: tuning/filtered
|
||||
rerun-on-change: filter-settings ttable-binarizer
|
||||
@ -989,8 +996,8 @@ split-input
|
||||
pass-unless: input-splitter
|
||||
template: $input-splitter -model IN1.$input-extension < IN > OUT
|
||||
filter
|
||||
in: input TRAINING:sigtest-filter-phrase-translation-table TRAINING:sigtest-filter-reordering-table TRAINING:corpus-mml-prefilter=OR=TRAINING:corpus-mml-postfilter=OR=TRAINING:domains
|
||||
out: filtered-dir
|
||||
in: input TRAINING:sigtest-filter-phrase-translation-table TRAINING:sigtest-filter-reordering-table TRAINING:corpus-mml-prefilter=OR=TRAINING:corpus-mml-postfilter=OR=TRAINING:domains TRAINING:transliteration-table
|
||||
out: filtered-dir
|
||||
default-name: evaluation/filtered
|
||||
rerun-on-change: filter-settings report-precision-by-coverage ttable-binarizer
|
||||
pass-if: TRAINING:binarize-all
|
||||
@ -1027,11 +1034,11 @@ remove-markup
|
||||
pass-unless: report-segmentation
|
||||
template: $moses-script-dir/ems/support/remove-segmentation-markup.perl < IN > OUT
|
||||
post-decoding-transliteration
|
||||
in: cleaned-output system-output TRAINING:transliteration-model LM:binlm
|
||||
in: cleaned-output system-output TRAINING:transliteration-model
|
||||
out: transliterated-output
|
||||
default-name: evaluation/transliterated
|
||||
pass-unless: TRAINING:post-decoding-transliteration
|
||||
template: $moses-script-dir/Transliteration/post-decoding-transliteration.pl --moses-src-dir $moses-src-dir --external-bin-dir $external-bin-dir --transliteration-model-dir IN2 --input-extension $input-extension --output-extension $output-extension --language-model IN3 --output-file IN0 --oov-file IN1.oov
|
||||
template: $moses-script-dir/Transliteration/post-decoding-transliteration.pl --moses-src-dir $moses-src-dir --external-bin-dir $external-bin-dir --transliteration-model-dir IN2 --input-extension $input-extension --output-extension $output-extension --language-model $TRAINING:language-model-file --output-file IN0 --oov-file IN1.oov
|
||||
recase-output
|
||||
in: transliterated-output RECASING:recase-config
|
||||
out: recased-output
|
||||
|
@ -2233,11 +2233,15 @@ sub get_config_tables {
|
||||
sub define_training_create_config {
|
||||
my ($step_id) = @_;
|
||||
|
||||
my ($config,$reordering_table,$phrase_translation_table,$translit_model,$generation_table,$sparse_lexical_features,$domains,$osm, @LM)
|
||||
my ($config,$reordering_table,$phrase_translation_table,$transliteration_pt,$generation_table,$sparse_lexical_features,$domains,$osm, @LM)
|
||||
= &get_output_and_input($step_id);
|
||||
|
||||
my $cmd = &get_config_tables($config,$reordering_table,$phrase_translation_table,$generation_table,$domains);
|
||||
|
||||
if($transliteration_pt){
|
||||
$cmd .= "-transliteration-phrase-table $transliteration_pt ";
|
||||
}
|
||||
|
||||
if($osm){
|
||||
|
||||
my $osm_settings = &get("TRAINING:operation-sequence-model-settings");
|
||||
@ -2623,7 +2627,7 @@ sub define_tuningevaluation_filter {
|
||||
my $tuning_flag = !defined($set);
|
||||
my $hierarchical = &get("TRAINING:hierarchical-rule-set");
|
||||
|
||||
my ($filter_dir,$input,$phrase_translation_table,$reordering_table,$domains) = &get_output_and_input($step_id);
|
||||
my ($filter_dir,$input,$phrase_translation_table,$reordering_table,$domains,$transliteration_table) = &get_output_and_input($step_id);
|
||||
|
||||
my $binarizer;
|
||||
$binarizer = &backoff_and_get("EVALUATION:$set:ttable-binarizer") unless $tuning_flag;
|
||||
@ -2683,7 +2687,14 @@ sub define_tuningevaluation_filter {
|
||||
|
||||
$cmd .= &get_config_tables($config,$reordering_table,$phrase_translation_table,undef,$domains);
|
||||
|
||||
if (&get("TRAINING:in-decoding-transliteration")) {
|
||||
|
||||
$cmd .= "-transliteration-phrase-table $dir/model/transliteration-phrase-table.$VERSION ";
|
||||
}
|
||||
|
||||
|
||||
$cmd .= "-lm 0:3:$config:8\n"; # dummy kenlm 3-gram model on factor 0
|
||||
|
||||
}
|
||||
|
||||
# filter command
|
||||
|
@ -31,7 +31,7 @@ my($_EXTERNAL_BINDIR, $_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_
|
||||
$_DECODING_GRAPH_BACKOFF,
|
||||
$_DECODING_STEPS, $_PARALLEL, $_FACTOR_DELIMITER, @_PHRASE_TABLE,
|
||||
@_REORDERING_TABLE, @_GENERATION_TABLE, @_GENERATION_TYPE, $_GENERATION_CORPUS,
|
||||
$_DONT_ZIP, $_MGIZA, $_MGIZA_CPUS, $_SNT2COOC, $_HMM_ALIGN, $_CONFIG, $_OSM, $_OSM_FACTORS, $_POST_DECODING_TRANSLIT,
|
||||
$_DONT_ZIP, $_MGIZA, $_MGIZA_CPUS, $_SNT2COOC, $_HMM_ALIGN, $_CONFIG, $_OSM, $_OSM_FACTORS, $_POST_DECODING_TRANSLIT, $_TRANSLITERATION_PHRASE_TABLE,
|
||||
$_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_GHKM_TREE_FRAGMENTS,$_PCFG,@_EXTRACT_OPTIONS,@_SCORE_OPTIONS,
|
||||
$_ALT_DIRECT_RULE_SCORE_1, $_ALT_DIRECT_RULE_SCORE_2, $_UNKNOWN_WORD_SOFT_MATCHES_FILE,
|
||||
$_OMIT_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES,
|
||||
@ -122,7 +122,8 @@ $_HELP = 1
|
||||
'config=s' => \$_CONFIG,
|
||||
'osm-model=s' => \$_OSM,
|
||||
'osm-setting=s' => \$_OSM_FACTORS,
|
||||
'post-decoding-translit=s' => \$_POST_DECODING_TRANSLIT,
|
||||
'post-decoding-translit=s' => \$_POST_DECODING_TRANSLIT,
|
||||
'transliteration-phrase-table=s' => \$_TRANSLITERATION_PHRASE_TABLE,
|
||||
'max-lexical-reordering' => \$_MAX_LEXICAL_REORDERING,
|
||||
'do-steps=s' => \$_DO_STEPS,
|
||||
'memscore:s' => \$_MEMSCORE,
|
||||
@ -1879,6 +1880,8 @@ sub create_ini {
|
||||
$path++;
|
||||
}
|
||||
print INI "1 T 1\n" if $_GLUE_GRAMMAR;
|
||||
|
||||
print INI "1 T 1\n" if $_TRANSLITERATION_PHRASE_TABLE;
|
||||
|
||||
if (defined($_DECODING_GRAPH_BACKOFF)) {
|
||||
$_DECODING_GRAPH_BACKOFF =~ s/\s+/ /g;
|
||||
@ -1962,6 +1965,13 @@ sub create_ini {
|
||||
exit 1 if $i < $stepsused{"T"}; # fatal to define less
|
||||
}
|
||||
|
||||
if ($_TRANSLITERATION_PHRASE_TABLE){
|
||||
|
||||
$feature_spec .= "PhraseDictionaryMemory name=TranslationModel$i table-limit=100 num-features=4 path=$_TRANSLITERATION_PHRASE_TABLE input-factor=0 output-factor=0\n";
|
||||
$weight_spec .= "TranslationModel$i= 0.2 0.2 0.2 0.2\n";
|
||||
$i++;
|
||||
}
|
||||
|
||||
# glue grammar
|
||||
if ($_GLUE_GRAMMAR) {
|
||||
&full_path(\$___GLUE_GRAMMAR_FILE);
|
||||
@ -2069,8 +2079,9 @@ sub create_ini {
|
||||
|
||||
my $lm_oov_prob = 0.1;
|
||||
|
||||
if ($_POST_DECODING_TRANSLIT){
|
||||
if ($_POST_DECODING_TRANSLIT || $_TRANSLITERATION_PHRASE_TABLE){
|
||||
$lm_oov_prob = -100.0;
|
||||
$_LMODEL_OOV_FEATURE = "yes";
|
||||
}
|
||||
|
||||
$feature_spec .= "$type_name name=LM$i factor=$f path=$fn order=$o\n";
|
||||
|
Loading…
Reference in New Issue
Block a user