#!/usr/bin/env perl
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

use strict;
use warnings;
use utf8;

use File::Basename;
use File::Copy qw( copy );
use FindBin qw( $RealBin );
use Getopt::Long "GetOptions";
use IO::Handle;
use Scalar::Util qw( looks_like_number );
# Read and write all standard streams as UTF-8.
binmode( STDIN,  ':utf8' );
binmode( STDOUT, ':utf8' );
binmode( STDERR, ':utf8' );

# Separator between the factors of a token; only the first factor (the
# surface form) of each token is transliterated.
my $___FACTOR_DELIMITER = "|";

# Default path of the generated phrase table; overridable with --out-file.
my $OUT_FILE = "/tmp/transliteration-phrase-table.$$";

my (
    $MOSES_SRC_DIR,    $TRANSLIT_MODEL, $OOV_FILE,        $OOV_FILE_NAME,
    $EXTERNAL_BIN_DIR, $LM_FILE,        $INPUT_EXTENSION, $OUTPUT_EXTENSION
);

die("ERROR: wrong syntax when invoking in-decoding-transliteration.perl")
    unless GetOptions(
    'moses-src-dir=s'             => \$MOSES_SRC_DIR,
    'external-bin-dir=s'          => \$EXTERNAL_BIN_DIR,
    'transliteration-model-dir=s' => \$TRANSLIT_MODEL,
    'input-extension=s'           => \$INPUT_EXTENSION,
    'output-extension=s'          => \$OUTPUT_EXTENSION,
    'transliteration-file=s'      => \$OOV_FILE,
    'out-file=s'                  => \$OUT_FILE
    );

# check if the files are in place
die("ERROR: you need to define --moses-src-dir --external-bin-dir, --transliteration-model-dir, --transliteration-file, --input-extension, and --output-extension")
    unless ( defined($MOSES_SRC_DIR)
    && defined($TRANSLIT_MODEL)
    && defined($OOV_FILE)
    && defined($INPUT_EXTENSION)
    && defined($OUTPUT_EXTENSION)
    && defined($EXTERNAL_BIN_DIR) );

die("ERROR: could not find Transliteration Model '$TRANSLIT_MODEL'")
    unless -e $TRANSLIT_MODEL;
die("ERROR: could not find Transliteration file '$OOV_FILE'")
    unless -e $OOV_FILE;

$OOV_FILE_NAME = basename($OOV_FILE);

# Working directory inside the model directory. It is created without going
# through a shell, so paths containing spaces or shell metacharacters are
# safe, and an already existing directory (e.g. on a rerun) is not an error.
my $evaluationDir = $TRANSLIT_MODEL . "/evaluation";
unless ( -d $evaluationDir ) {
    mkdir($evaluationDir)
        or die("ERROR: could not create directory '$evaluationDir': $!");
}

my $translitFile = $evaluationDir . "/" . $OOV_FILE_NAME;

# Best-effort copy, as before: prepare_for_transliteration() rewrites
# $translitFile below, so a failed copy is reported but is not fatal.
copy( $OOV_FILE, $translitFile )
    or warn("WARNING: could not copy '$OOV_FILE' to '$translitFile': $!\n");

print "Preparing for Transliteration\n";
prepare_for_transliteration( $OOV_FILE, $translitFile );

print "Run Transliteration\n";
run_transliteration( $MOSES_SRC_DIR, $EXTERNAL_BIN_DIR, $TRANSLIT_MODEL, $OOV_FILE_NAME );

print "Pick Best Transliteration\n";
form_corpus( $translitFile, $translitFile . ".op.nBest", $OUT_FILE );

################### Read the UNK word file and prepare for Transliteration ###############################
# prepare_for_transliteration( $testFile, $translitFile )
#
# Reads the file of unknown (OOV) words $testFile, collects the distinct
# surface forms worth transliterating, and writes them to $translitFile, one
# word per line with its characters separated by single spaces.
#
#   $testFile     - path of the UTF-8 input file; tokens are separated by a
#                   single space and may carry factors separated by
#                   $___FACTOR_DELIMITER (only the first factor is used).
#   $translitFile - path of the UTF-8 output file (overwritten).
#
# Tokens containing a digit, '.' or ',' are reported on STDOUT and skipped.
# Dies if either file cannot be opened or the output cannot be flushed.
sub prepare_for_transliteration
{
    my ( $testFile, $translitFile ) = @_;
    my %UNK;

    open( my $in, "<:encoding(UTF-8)", $testFile )
        or die "Can't open $testFile: $!\n";
    while ( my $line = <$in> ) {
        chomp $line;
        foreach my $token ( split( / /, $line ) ) {

            # Keep only the surface form, i.e. the first factor.
            my ($surface) = split( /\Q$___FACTOR_DELIMITER\E/, $token );

            # Consecutive spaces and tokens that start with the delimiter
            # yield no surface form; storing them would emit a blank line.
            next unless defined $surface && length $surface;

            if ( $surface =~ /[0-9.,]/ ) {
                print "Not transliterating $surface \n";
            }
            else {
                $UNK{$surface} = 1;
            }
        }
    }
    close($in);

    open( my $out, ">:encoding(UTF-8)", $translitFile )
        or die "Can't open $translitFile: $!\n";

    # Sorted so that the generated file is identical from run to run.
    foreach my $word ( sort keys %UNK ) {
        print {$out} join( ' ', split( //, $word ) ), "\n";
    }

    # Buffered write errors only surface when the handle is closed.
    close($out) or die "Can't close $translitFile: $!\n";
}
################### Run Transliteration Module to Obtain Transliterations ###############################
# run_transliteration( $MOSES_SRC, $EXTERNAL_BIN_DIR, $TRANSLIT_MODEL, $eval_file )
#
# Runs the external Moses tools that transliterate the prepared word list
# $TRANSLIT_MODEL/evaluation/$eval_file:
#   1. train-model.perl (from step 9) writes a temporary moses.table.ini,
#   2. filter-model-given-input.pl filters the model for the word list,
#   3. substitute-filtered-tables-and-weights.perl builds the filtered ini,
#   4. the moses decoder writes the output to <file>.op and a 100-best
#      list to <file>.op.nBest.
#
#   $MOSES_SRC        - root of the Moses installation (scripts/ and bin/).
#   $EXTERNAL_BIN_DIR - passed to train-model.perl as -external-bin-dir.
#   $TRANSLIT_MODEL   - transliteration model directory.
#   $eval_file        - name of the word list inside the evaluation directory.
#
# The source/target extensions come from the file-level $INPUT_EXTENSION and
# $OUTPUT_EXTENSION. Each command line is assembled as one single-line string.
# A step that fails is reported with a warning; execution continues, as the
# exit statuses were previously ignored altogether.
sub run_transliteration
{
    my ( $MOSES_SRC, $EXTERNAL_BIN_DIR, $TRANSLIT_MODEL, $eval_file ) = @_;

    my $eval_path = "$TRANSLIT_MODEL/evaluation/$eval_file";
    my $table_ini = "$eval_path.moses.table.ini";

    # Equivalent of `touch`: make sure the temporary ini file exists.
    if ( open( my $ini_fh, '>>', $table_ini ) ) {
        close($ini_fh);
    }
    else {
        warn "WARNING: could not create $table_ini: $!\n";
    }

    print "Filter Table\n";

    _run_transliteration_step(
        'train-model.perl',
        join( ' ',
            "$MOSES_SRC/scripts/training/train-model.perl",
            "-mgiza -mgiza-cpus 10 -dont-zip -first-step 9",
            "-external-bin-dir $EXTERNAL_BIN_DIR",
            "-f $INPUT_EXTENSION -e $OUTPUT_EXTENSION",
            "-alignment grow-diag-final-and -parts 5",
            "-score-options '--KneserNey'",
            "-phrase-translation-table $TRANSLIT_MODEL/model/phrase-table",
            "-config $table_ini",
            "-lm 0:3:${table_ini}:8" )
    );

    _run_transliteration_step(
        'filter-model-given-input.pl',
        join( ' ',
            "$MOSES_SRC/scripts/training/filter-model-given-input.pl",
            "$eval_path.filtered",
            $table_ini,
            $eval_path,
            qq{-Binarizer "$MOSES_SRC/bin/CreateOnDiskPt 1 1 4 100 2"} )
    );

    # Equivalent of `rm`: the temporary ini file is no longer needed.
    unlink($table_ini)
        or warn "WARNING: could not remove $table_ini: $!\n";

    print "Apply Filter\n";

    _run_transliteration_step(
        'substitute-filtered-tables-and-weights.perl',
        join( ' ',
            "$MOSES_SRC/scripts/ems/support/substitute-filtered-tables-and-weights.perl",
            "$eval_path.filtered/moses.ini",
            "$TRANSLIT_MODEL/model/moses.ini",
            "$TRANSLIT_MODEL/tuning/moses.tuned.ini",
            "$eval_path.filtered.ini" )
    );

    _run_transliteration_step(
        'moses',
        join( ' ',
            "$MOSES_SRC/bin/moses",
            "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000",
            "-threads 16 -drop-unknown -distortion-limit 0",
            "-n-best-list $eval_path.op.nBest 100 distinct",
            "-f $eval_path.filtered.ini",
            "< $eval_path",
            "> $eval_path.op" )
    );
}

# Runs one shell command line, discarding its standard output, and warns
# (without aborting) when it exits with a non-zero status.
# Returns true on success, false otherwise.
sub _run_transliteration_step
{
    my ( $label, $cmd ) = @_;

    my $discarded = `$cmd`;
    my $status    = $?;
    if ( $status != 0 ) {
        warn "WARNING: $label failed (wait status $status): $cmd\n";
        return 0;
    }
    return 1;
}
################### Read the output of Transliteration Model and Form Corpus ###############################
# form_corpus( $inp_file, $testFile, $phraseTable )
#
# Builds a phrase table from the decoder's n-best list and gzips it.
#
#   $inp_file    - the word list given to the decoder: one word per line,
#                  characters separated by single spaces.
#   $testFile    - the n-best list, with lines of the form
#                  "<id> ||| <target chars> ||| <features> ||| <score>".
#   $phraseTable - output path; gzip then replaces it with "$phraseTable.gz".
#
# For every n-best entry with a non-empty target, one line
#   "<source word> ||| <target word> ||| <features> ||| 0-0 ||| 0 0 0"
# is written, where the words are the space-joined characters and the
# features are exp() of every numeric score that does not belong to a
# *Penalty0*, Distortion0= or LM0= feature.
#
# Dies if a file cannot be opened or the table cannot be flushed; malformed
# n-best lines are skipped with a warning.
sub form_corpus
{
    my ( $inp_file, $testFile, $phraseTable ) = @_;
    my @UNK;

    open( my $src_fh, "<:encoding(UTF-8)", $inp_file )
        or die "Can't open $inp_file: $!\n";
    open( my $pt_fh, ">:encoding(UTF-8)", $phraseTable )
        or die "Can't open $phraseTable: $!\n";

    # Source words, indexed by their 0-based line number in $inp_file.
    while ( my $line = <$src_fh> ) {
        chomp $line;
        push( @UNK, join( '', split( / /, $line ) ) );
    }
    close($src_fh);

    open( my $nbest_fh, "<:encoding(UTF-8)", $testFile )
        or die "Can't open $testFile: $!\n";

    while ( my $line = <$nbest_fh> ) {
        chomp $line;
        my @fields = split( / /, $line );

        # The first field is taken to be the 0-based index of the input line
        # (Moses n-best format -- confirm against the decoder in use). The
        # source word is looked up by that index, so an input sentence with
        # no n-best entries cannot shift the alignment of later entries.
        my $sNum = $fields[0];
        unless ( defined $sNum && $sNum =~ /\A[0-9]+\z/ && $sNum < @UNK ) {
            warn "Skipping n-best line $. with unusable sentence id: $line\n";
            next;
        }

        # Target side: concatenate the characters up to the next "|||".
        # $fields[1] is the "|||" that follows the sentence id.
        my $i       = 2;
        my $thisStr = "";
        while ( $i < @fields && $fields[$i] ne "|||" ) {
            $thisStr .= $fields[$i];
            $i++;
        }
        if ( $i >= @fields ) {
            warn "Skipping n-best line $. without feature section: $line\n";
            next;
        }
        $i++;

        # Feature section: runs up to the next "|||" (or the end of line).
        my $features = "";
        while ( $i < @fields && $fields[$i] ne "|||" ) {
            my $field = $fields[$i];
            if (   $field =~ /Penalty0/
                || $field eq "Distortion0="
                || $field eq "LM0=" )
            {
                $i++;    # also skip the single value of this feature
            }
            elsif ( looks_like_number($field) ) {
                $features .= " " . exp($field);
            }
            $i++;
        }

        if ( $thisStr ne "" ) {
            print {$pt_fh} "$UNK[$sNum] ||| $thisStr ||| $features ||| 0-0 ||| 0 0 0\n";
        }
    }
    close($nbest_fh);

    # Buffered write errors only surface when the handle is closed.
    close($pt_fh) or die "Can't close $phraseTable: $!\n";

    # List form: no shell is involved, so any path is passed on verbatim.
    system( 'gzip', $phraseTable ) == 0
        or warn "WARNING: gzip failed for $phraseTable (wait status $?)\n";
}