mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-09-19 07:07:24 +03:00
Transliteration Scripts
This commit is contained in:
parent
1caadce208
commit
c527f0db8d
316
scripts/Transliteration/clean.pl
Executable file
316
scripts/Transliteration/clean.pl
Executable file
@ -0,0 +1,316 @@
|
|||||||
|
#!/usr/bin/perl
|
||||||
|
|
||||||
|
# Input: tab-separated word pairs (e.g. a Hindi word and an Urdu word); every pair that contains a digit or symbol on either side is deleted.
|
||||||
|
use utf8;
|
||||||
|
|
||||||
|
use Getopt::Std;
|
||||||
|
use IO::Handle;
|
||||||
|
|
||||||
|
binmode(STDIN, ':utf8');
|
||||||
|
binmode(STDOUT, ':utf8');
|
||||||
|
binmode(STDERR, ':utf8');
|
||||||
|
use open qw(:std :utf8);
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
# Main driver of clean.pl.
#
# Reads the tab-separated word-pair file named by the first command-line
# argument, drops pairs rejected by deleteSymbol() and deleteEnglish(),
# collects character statistics for the survivors, and finally prints the
# pairs accepted by charFreqFilter() to STDOUT.  Progress goes to STDERR.
# ---------------------------------------------------------------------------

# Character-frequency tables filled by charFreqFilterPreprocess().  They are
# package globals because the subs below access them by name.
%srcHash = ();
%trgHash = ();

my $file = $ARGV[0];
die "Usage: clean.pl <pair-file named like 1-1.SRC-TRG>\n" unless defined $file;

# The language pair is taken from the file name: strip any directory part,
# take the second dot-separated field and split it on '-'.
my @pathParts = split(/\//, $file);
my @nameParts = split(/\./, $pathParts[$#pathParts]);
my ($srcMark, $trgMark) = split(/\-/, defined $nameParts[1] ? $nameParts[1] : '');

# Language codes that are treated as Latin-script.
# NOTE(review): "pt-br" can never match here because the marker was split on
# '-' above; kept so the accepted list is unchanged -- confirm intent.
my %isLatin = map { $_ => 1 } qw(en de es fr it nl pt-br ro sl tr);

my $lang  = 0;    # number of Latin-script sides (0, 1 or 2)
my $lang1 = 1;    # 1 = source is non-Latin
my $lang2 = 1;    # 1 = target is non-Latin

if (defined $srcMark && $isLatin{$srcMark}) {
    print STDERR "Source is Latin\n";
    $lang1 = 0;
    $lang++;
}

if (defined $trgMark && $isLatin{$trgMark}) {
    print STDERR "Target is Latin\n";
    $lang2 = 0;
    $lang++;
}

my @inputArr;     # pairs that survived steps 1 and 2

if ($lang == 2) {
    print STDERR "No Transliteration Module Possible\n";
}
else {
    print STDERR "will run Transliteration module\n";
    print STDERR "Three preprocessing steps to do:\n 1) Delete Symbol \t 2) Delete Latin from non-Latin langauge \t 3) Character Frequency based filtering\n";
    print STDERR "STARTING 1 and 2 ...\n";

    open(my $IN, '<', $file) or die "Can't open $file: $!\n";

    # The loop deliberately keeps the current line in $_ : the filter subs
    # in this file read the line from $_ as well as from their argument.
    while (<$IN>) {
        chomp;

        my $retur = deleteSymbol($_);
        next unless defined $retur && $retur == 1;

        $retur = deleteEnglish($lang1, $lang2, $_);
        next unless defined $retur && $retur == 1;

        push(@inputArr, $_);
        charFreqFilterPreprocess($_);
    }

    close($IN);
}

print STDERR "DONE 1 and 2\nSTARTING 3) Preprocessing for Character filtering...\n";

charFreqFilterPreprocess2();
print STDERR "DONE 3\n";

foreach (@inputArr) {
    charFreqFilter($_);
}
|
||||||
|
|
||||||
|
###############################Delete English##################################
|
||||||
|
|
||||||
|
# deleteEnglish($srcNonLatin, $trgNonLatin, $line)
#
# Decides whether a tab-separated word pair may be kept, based on ASCII
# letters appearing on a side that is flagged as non-Latin.
#
#   $srcNonLatin  1 if the source (first field) is a non-Latin language
#   $trgNonLatin  1 if the target (second field) is a non-Latin language
#   $line         the pair; falls back to $_ when omitted
#
# Returns 1 when the pair is acceptable, 0 otherwise:
#   * both sides non-Latin : reject if [A-Za-z] occurs anywhere in the line
#   * only target non-Latin: reject if the second field contains [A-Za-z]
#   * only source non-Latin: reject if the first field contains [A-Za-z]
#   * both sides Latin     : always 0
sub deleteEnglish {
    my ($srcNonLatin, $trgNonLatin, $line) = @_;
    $line = $_ unless defined $line;

    if ($srcNonLatin == 1 && $trgNonLatin == 1) {
        return ($line =~ m/[A-Za-z]/) ? 0 : 1;
    }

    my @F = split(/\t/, $line);

    if ($srcNonLatin == 0 && $trgNonLatin == 1) {
        return (defined $F[1] && $F[1] =~ m/[A-Za-z]/) ? 0 : 1;
    }

    if ($srcNonLatin == 1 && $trgNonLatin == 0) {
        return (defined $F[0] && $F[0] =~ m/[A-Za-z]/) ? 0 : 1;
    }

    # Both sides are Latin-script: nothing to transliterate.
    return 0;
}
|
||||||
|
###############################Delete Symbol##################################
|
||||||
|
# deleteSymbol($line)
#
# First-stage filter for a tab-separated word pair.  $line falls back to $_
# when omitted.
#
# Returns 1 when the pair is kept, 0 when it is rejected.  A pair is
# rejected when
#   * the line contains a digit or any of  ? ! @ . # % $ - " ( ) & ; \ * + , < >
#   * either field is missing,
#   * both fields are identical, or
#   * either field is shorter than 3 characters.
sub deleteSymbol {
    my ($line) = @_;
    $line = $_ unless defined $line;

    # One character class replaces the former chain of single-symbol tests.
    return 0 if $line =~ /[\d?!\@.#%\$\-"()&;\\*+,<>]/;

    my @wrds = split(/\t/, $line);
    return 0 unless defined $wrds[0] && defined $wrds[1];
    return 0 if $wrds[0] eq $wrds[1];
    return 0 if length($wrds[0]) < 3;
    return 0 if length($wrds[1]) < 3;

    return 1;
}
|
||||||
|
#################################Char Frequency Filter Preprocess########################
|
||||||
|
# charFreqFilterPreprocess($line)
#
# Adds the characters of one tab-separated word pair to the global frequency
# tables %srcHash (first field) and %trgHash (second field).  Both words are
# lower-cased first; a pair whose lower-cased sides are identical contributes
# nothing.  $line falls back to $_ when omitted.
#
# Every occurrence is counted, including the first one (the earlier version
# stored 0 for the first sighting, so all counts were one too low).
sub charFreqFilterPreprocess {
    my ($line) = @_;
    $line = $_ unless defined $line;

    my @wrds   = split(/\t/, $line);
    my $srcWrd = lc(defined $wrds[0] ? $wrds[0] : '');
    my $trgWrd = lc(defined $wrds[1] ? $wrds[1] : '');

    return if $srcWrd eq $trgWrd;

    $srcHash{$_}++ foreach split('', $srcWrd);
    $trgHash{$_}++ foreach split('', $trgWrd);

    return;
}
|
||||||
|
##############################Preprocess Two#############################
|
||||||
|
# charFreqFilterPreprocess2()
#
# Turns the frequency tables %srcHash / %trgHash into the "good" character
# sets %srcChar / %trgChar and the "bad" character sets %srcBadChar /
# %trgBadChar that charFreqFilter() consults.
sub charFreqFilterPreprocess2 {
    _classifyChars(\%srcHash, \%srcChar, \%srcBadChar);
    _classifyChars(\%trgHash, \%trgChar, \%trgBadChar);
    return;
}

# _classifyChars(\%freq, \%good, \%bad)
#
# Ranks the characters of %freq by descending frequency and marks
#   * ranks 0..29  as good,
#   * ranks 30..50 as bad only when their frequency is below 0.5% of the
#                  most frequent character's frequency,
#   * ranks 51+    as bad unconditionally.
# Ties are broken by character so the result does not depend on hash order.
sub _classifyChars {
    my ($freq, $good, $bad) = @_;

    my @ranked = sort { $freq->{$b} <=> $freq->{$a} or $a cmp $b } keys %$freq;
    return unless @ranked;

    my $cutoff = $freq->{ $ranked[0] } * 0.005;

    foreach my $rank (0 .. $#ranked) {
        my $char = $ranked[$rank];
        if ($rank < 30) {
            $good->{$char} = 1;
        }
        elsif ($freq->{$char} < $cutoff || $rank > 50) {
            $bad->{$char} = 1;
        }
    }
    return;
}
|
||||||
|
|
||||||
|
###############################CharFreqFiltering###################################
|
||||||
|
# charFreqFilter($pair)
#
# Final filter for one tab-separated word pair.  The lower-cased characters of
# both words are checked against the global sets built by
# charFreqFilterPreprocess2():
#   * any source character in %srcBadChar, or any target character in
#     %trgBadChar, discards the pair;
#   * otherwise the pair is printed only if the source has at least one
#     character in %srcChar and the target at least one in %trgChar.
# An accepted pair is printed to STDOUT as two lines (source, then target)
# with the original-case characters separated by single spaces.
sub charFreqFilter {
    my ($pair) = @_;

    my @fields      = split(/\t/, $pair);
    my @sourceChars = split("", lc $fields[0]);
    my @targetChars = split("", lc $fields[1]);

    # Reject as soon as either side contains a blacklisted character.
    return if grep { exists $srcBadChar{$_} } @sourceChars;
    return if grep { exists $trgBadChar{$_} } @targetChars;

    # Both sides need at least one whitelisted character.
    return unless grep { exists $srcChar{$_} } @sourceChars;
    return unless grep { exists $trgChar{$_} } @targetChars;

    my $spacedSource = join(" ", split("", $fields[0]));
    my $spacedTarget = join(" ", split("", $fields[1]));
    print "$spacedSource\n$spacedTarget\n";
    return;
}
|
86
scripts/Transliteration/corpusCreator.pl
Executable file
86
scripts/Transliteration/corpusCreator.pl
Executable file
@ -0,0 +1,86 @@
|
|||||||
|
#!/usr/bin/perl -w
|
||||||
|
|
||||||
|
use strict;
|
||||||
|
|
||||||
|
use utf8;
|
||||||
|
use Getopt::Std;
|
||||||
|
use IO::Handle;
|
||||||
|
binmode(STDIN, ':utf8');
|
||||||
|
binmode(STDOUT, ':utf8');
|
||||||
|
binmode(STDERR, ':utf8');
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
# corpusCreator.pl <dir> <pair-file> <input-ext> <output-ext>
#
# Reads tab-separated word pairs from <dir>/<pair-file> and writes them as
# space-separated character sequences:
#   <dir>/training/corpus.<ext>    every pair
#   <dir>/tuning/input|reference   every 5th pair (at most 1000 pairs)
#   <dir>/training/corpusA.<ext>   all pairs not sent to tuning
# The corpusA files are removed again when fewer than 6000 pairs were read.
# ---------------------------------------------------------------------------

my ($tPath, $tFile, $inp_ext, $op_ext) = @ARGV;
die "Usage: corpusCreator.pl <dir> <pair-file> <input-ext> <output-ext>\n"
    unless defined $op_ext;

# Create the output directories without going through a shell.
foreach my $dir ("$tPath/training", "$tPath/tuning") {
    next if -d $dir;
    mkdir $dir or die "Can't create $dir: $!\n";
}

my @source;    # character-split source words, in input order
my @target;    # character-split target words, parallel to @source

open my $pairs_fh, "<:encoding(UTF-8)", "$tPath/$tFile" or die "Can't open $tPath/$tFile: $!\n";
open my $all_src_fh, ">:encoding(UTF-8)", "$tPath/training/corpus.$inp_ext" or die "Can't open $tPath/training/corpus.$inp_ext: $!\n";
open my $all_tgt_fh, ">:encoding(UTF-8)", "$tPath/training/corpus.$op_ext" or die "Can't open $tPath/training/corpus.$op_ext: $!\n";

while (my $line = <$pairs_fh>) {
    chomp $line;
    my ($src, $tgt) = split(/\t/, $line);

    # A missing field is written as an empty line, as before.
    my $s = join(' ', split('', defined $src ? $src : ''));
    my $t = join(' ', split('', defined $tgt ? $tgt : ''));

    print {$all_src_fh} "$s\n";
    print {$all_tgt_fh} "$t\n";
    push(@source, $s);
    push(@target, $t);
}

close($pairs_fh);
close($all_src_fh) or die "Can't write $tPath/training/corpus.$inp_ext: $!\n";
close($all_tgt_fh) or die "Can't write $tPath/training/corpus.$op_ext: $!\n";

open my $train_src_fh, ">:encoding(UTF-8)", "$tPath/training/corpusA.$inp_ext" or die "Can't open $tPath/training/corpusA.$inp_ext: $!\n";
open my $train_tgt_fh, ">:encoding(UTF-8)", "$tPath/training/corpusA.$op_ext" or die "Can't open $tPath/training/corpusA.$op_ext: $!\n";

open my $dev_src_fh, ">:encoding(UTF-8)", "$tPath/tuning/input" or die "Can't open $tPath/tuning/input: $!\n";
open my $dev_tgt_fh, ">:encoding(UTF-8)", "$tPath/tuning/reference" or die "Can't open $tPath/tuning/reference: $!\n";

my $corpus_size = @source;
my $dev_size    = 0;

foreach my $i (0 .. $#source) {
    # Pairs at positions 4, 9, 14, ... go to the tuning set until it holds
    # 1000 pairs; everything else goes to the reduced training corpus.
    if ($i % 5 == 4 && $dev_size < 1000) {
        print {$dev_src_fh} "$source[$i]\n";
        print {$dev_tgt_fh} "$target[$i]\n";
        $dev_size++;
    }
    else {
        print {$train_src_fh} "$source[$i]\n";
        print {$train_tgt_fh} "$target[$i]\n";
    }
}

close($train_src_fh) or die "Can't write $tPath/training/corpusA.$inp_ext: $!\n";
close($train_tgt_fh) or die "Can't write $tPath/training/corpusA.$op_ext: $!\n";
close($dev_src_fh)   or die "Can't write $tPath/tuning/input: $!\n";
close($dev_tgt_fh)   or die "Can't write $tPath/tuning/reference: $!\n";

# Small corpora keep only the full corpus.* files.
if ($corpus_size < 6000) {
    unlink("$tPath/training/corpusA.$inp_ext", "$tPath/training/corpusA.$op_ext");
}
|
||||||
|
|
||||||
|
|
169
scripts/Transliteration/prepare-transliteration-phrase-table.pl
Executable file
169
scripts/Transliteration/prepare-transliteration-phrase-table.pl
Executable file
@ -0,0 +1,169 @@
|
|||||||
|
#!/usr/bin/perl -w
|
||||||
|
|
||||||
|
use strict;
|
||||||
|
|
||||||
|
use utf8;
|
||||||
|
use Getopt::Long "GetOptions";
|
||||||
|
use FindBin qw($RealBin);
|
||||||
|
use IO::Handle;
|
||||||
|
use File::Basename;
|
||||||
|
binmode(STDIN, ':utf8');
|
||||||
|
binmode(STDOUT, ':utf8');
|
||||||
|
binmode(STDERR, ':utf8');
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
# Option handling and top-level flow: copy the OOV file into the working
# directory, write its words out character-split, run the transliteration
# model over them and print the resulting candidates.
# ---------------------------------------------------------------------------

my $OUT_DIR = "/tmp/Transliteration-Phrase-Table.$$";

my ($MOSES_SRC_DIR,$TRANSLIT_MODEL,$OOV_FILE,$EXTERNAL_BIN_DIR, $INPUT_EXTENSION, $OUTPUT_EXTENSION);
die("ERROR: wrong syntax when invoking train-transliteration-PT.pl")
    unless GetOptions('moses-src-dir=s' => \$MOSES_SRC_DIR,
                      'external-bin-dir=s' => \$EXTERNAL_BIN_DIR,
                      'transliteration-model-dir=s' => \$TRANSLIT_MODEL,
                      'input-extension=s' => \$INPUT_EXTENSION,
                      'output-extension=s' => \$OUTPUT_EXTENSION,
                      'out-dir=s' => \$OUT_DIR,
                      'oov-file=s' => \$OOV_FILE);

# check if the files are in place
# --external-bin-dir is interpolated into the train-model.perl command line
# in run_transliteration(), so it is required like the other options named
# in the message.
die("ERROR: you need to define --moses-src-dir --external-bin-dir, --transliteration-model-dir, --oov-file, --input-extension, --output-extension")
    unless (defined($MOSES_SRC_DIR) &&
            defined($EXTERNAL_BIN_DIR) &&
            defined($TRANSLIT_MODEL) &&
            defined($OOV_FILE) &&
            defined($INPUT_EXTENSION)&&
            defined($OUTPUT_EXTENSION));

die("ERROR: could not find Transliteration Model '$TRANSLIT_MODEL'")
    unless -e $TRANSLIT_MODEL;
die("ERROR: could not find OOV file '$OOV_FILE'")
    unless -e $OOV_FILE;

my $UNK_FILE_NAME = basename($OOV_FILE);

# List-form system() bypasses the shell, so paths containing spaces or shell
# metacharacters are passed through unchanged.
system('mkdir', '-p', "$OUT_DIR/$UNK_FILE_NAME/training") == 0
    or die("ERROR: could not create '$OUT_DIR/$UNK_FILE_NAME/training'");
system('cp', $OOV_FILE, "$OUT_DIR/$UNK_FILE_NAME/$UNK_FILE_NAME") == 0
    or die("ERROR: could not copy '$OOV_FILE' to '$OUT_DIR/$UNK_FILE_NAME/$UNK_FILE_NAME'");

my $translitFile = "$OUT_DIR/" . $UNK_FILE_NAME . "/" . $UNK_FILE_NAME . ".translit";

print "Preparing for Transliteration\n";
prepare_for_transliteration ($OOV_FILE , $translitFile);
print "Run Transliteration\n";
run_transliteration ($MOSES_SRC_DIR , $EXTERNAL_BIN_DIR , $TRANSLIT_MODEL , $translitFile);
print "Form Transliteration Corpus\n";
form_corpus ($translitFile , $translitFile.".op.nBest" , $OUT_DIR);
|
||||||
|
|
||||||
|
|
||||||
|
################### Read the UNK word file and prepare for Transliteration ###############################
|
||||||
|
|
||||||
|
# prepare_for_transliteration($testFile, $translitFile)
#
# Reads the UTF-8 file $testFile, collects the distinct space-separated words
# in it and writes each word to $translitFile (UTF-8) on its own line with its
# characters separated by single spaces.
#
# Empty tokens (produced by leading or repeated spaces) are skipped so that no
# blank lines are written, and the words are written in sorted order so the
# output is reproducible.  Dies if either file cannot be opened.
sub prepare_for_transliteration
{
    my ($testFile, $translitFile) = @_;
    my %UNK;

    open my $in_fh, "<:encoding(UTF-8)", $testFile or die "Can't open $testFile: $!\n";

    while (my $line = <$in_fh>)
    {
        chomp $line;
        foreach my $word (split(/ /, $line))
        {
            next if $word eq '';
            $UNK{$word} = 1;
        }
    }
    close ($in_fh);

    open my $out_fh, ">:encoding(UTF-8)", $translitFile or die "Can't open $translitFile: $!\n";

    foreach my $key ( sort keys %UNK )
    {
        my $src = join(' ', split('', $key));
        print {$out_fh} "$src\n";
    }
    close ($out_fh) or die "Can't write $translitFile: $!\n";
    return;
}
|
||||||
|
|
||||||
|
################### Run Transliteration Module to Obtain Transliterations ###############################
|
||||||
|
|
||||||
|
# run_transliteration($MOSES_SRC, $EXTERNAL_BIN_DIR, $TRANSLIT_MODEL, $eval_file)
#
# Runs the external Moses tool chain on $eval_file:
#   1. train-model.perl (first step 9) writes a temporary
#      "$eval_file.moses.table.ini" that points at the model's phrase and
#      reordering tables;
#   2. filter-model-given-input.pl filters those tables for $eval_file into
#      "$eval_file.filtered", after which the temporary ini is removed;
#   3. substitute-filtered-tables-and-weights.perl writes
#      "$eval_file.filtered.ini";
#   4. the moses decoder writes "$eval_file.op" and a 50-best list
#      "$eval_file.op.nBest".
# The input/output extensions come from the file-level $INPUT_EXTENSION and
# $OUTPUT_EXTENSION.  Command output and exit codes are not inspected.
sub run_transliteration
{
    my ($MOSES_SRC, $EXTERNAL_BIN_DIR, $TRANSLIT_MODEL, $eval_file) = @_;

    my $table_ini    = "$eval_file.moses.table.ini";
    my $filtered_dir = "$eval_file.filtered";
    my $filtered_ini = "$eval_file.filtered.ini";

    `touch $table_ini`;

    print "Filter Table\n";

    `$MOSES_SRC/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 -reordering msd-bidirectional-fe -score-options '--KneserNey' -phrase-translation-table $TRANSLIT_MODEL/model/phrase-table -reordering-table $TRANSLIT_MODEL/model/reordering-table -config $table_ini -lm 0:3:$table_ini:8`;

    `$MOSES_SRC/scripts/training/filter-model-given-input.pl $filtered_dir $table_ini $eval_file -Binarizer "$MOSES_SRC/bin/processPhraseTable"`;

    `rm $table_ini`;

    print "Apply Filter\n";

    `$MOSES_SRC/scripts/ems/support/substitute-filtered-tables-and-weights.perl $filtered_dir/moses.ini $TRANSLIT_MODEL/model/moses.ini $TRANSLIT_MODEL/tuning/moses.tuned.ini $filtered_ini`;

    `$MOSES_SRC/bin/moses -search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 -threads 16 -drop-unknown -distortion-limit 0 -n-best-list $eval_file.op.nBest 50 -f $filtered_ini < $eval_file > $eval_file.op`;
}
|
||||||
|
|
||||||
|
################### Read the output of Transliteration Model and Form Corpus ###############################
|
||||||
|
|
||||||
|
|
||||||
|
# form_corpus($inp_file, $nbest_file, $EVAL_DIR)
#
# Reads the n-best list $nbest_file (UTF-8, fields separated by " ||| ") and
# prints one line per entry to STDOUT in the form "<hypothesis> \t <score>\n",
# where <hypothesis> is the second field with its spaces removed and <score>
# is the first token after the third "|||".
#
# $inp_file and $EVAL_DIR are accepted for call compatibility but not used.
#
# Lines that do not contain the expected delimiters are reported on STDERR
# and skipped; previously such a line made the scanning loops run forever.
sub form_corpus
{
    my ($inp_file, $testFile, $EVAL_DIR) = @_;

    open my $nbest_fh, "<:encoding(UTF-8)", $testFile or die "Can't open $testFile: $!\n";

    while (my $line = <$nbest_fh>)
    {
        chomp $line;
        my @words = split(/ /, $line);

        # Tokens 0 and 1 are the sentence id and the first "|||".
        my $i = 2;
        my $thisStr = "";

        # Concatenate the hypothesis tokens up to the next delimiter.
        while ($i <= $#words && $words[$i] ne "|||")
        {
            $thisStr = $thisStr . $words[$i];
            $i++;
        }
        $i++;

        # Skip the feature-score field.
        while ($i <= $#words && $words[$i] ne "|||")
        {
            $i++;
        }
        $i++;

        if ($i > $#words)
        {
            print STDERR "WARNING: skipping malformed n-best line: $line\n";
            next;
        }

        my $prob = $words[$i];
        print "$thisStr \t $prob\n";
    }
    close ($nbest_fh);
    return;
}
|
||||||
|
|
30
scripts/Transliteration/threshold.pl
Executable file
30
scripts/Transliteration/threshold.pl
Executable file
@ -0,0 +1,30 @@
|
|||||||
|
#!/usr/bin/perl
|
||||||
|
|
||||||
|
use utf8;
|
||||||
|
require Encode;
|
||||||
|
use IO::Handle;
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
# threshold.pl <pair-probs-file>
#
# Reads a numeric threshold from the first line of STDIN, then prints the
# first two tab-separated fields of every line of <pair-probs-file> whose
# LAST field is numerically smaller than that threshold.
# ---------------------------------------------------------------------------

binmode(STDIN, ':utf8');
binmode(STDOUT, ':utf8');
binmode(STDERR, ':utf8');

my $input = <STDIN>;
die "Error: missing threshold on standard input!\n" unless defined $input;
chomp $input;

my $filename = shift or die "Error: missing hindi urdu file argument!\n";
open(my $pairs_fh, '<', $filename) or die "Error: unable to open file \"$filename\"!\n";
binmode($pairs_fh, ':utf8');

while (my $line = <$pairs_fh>)
{
    chomp $line;
    my @F = split(/\t/, $line);

    # A blank line has no fields at all; it used to be printed as "\t".
    next unless @F;

    if ($F[$#F] < $input)
    {
        my $second = defined $F[1] ? $F[1] : '';
        print "$F[0]\t$second\n";
    }
}
close($pairs_fh);
|
||||||
|
|
306
scripts/Transliteration/train-transliteration-module.pl
Executable file
306
scripts/Transliteration/train-transliteration-module.pl
Executable file
@ -0,0 +1,306 @@
|
|||||||
|
#!/usr/bin/perl -w
|
||||||
|
|
||||||
|
use utf8;
|
||||||
|
use strict;
|
||||||
|
use Getopt::Long "GetOptions";
|
||||||
|
use FindBin qw($RealBin);
|
||||||
|
binmode(STDIN, ':utf8');
|
||||||
|
binmode(STDOUT, ':utf8');
|
||||||
|
binmode(STDERR, ':utf8');
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
# Option handling and top-level flow: link (or factor-reduce) the parallel
# corpus and alignment into $OUT_DIR, mine transliteration pairs from them,
# then train and, when a held-out split exists, retrain the module.
# ---------------------------------------------------------------------------

print STDERR "Training Transliteration Module - Start\n".`date`;

my $ORDER = 5;
my $OUT_DIR = "/tmp/Transliteration-Model.$$";
my $___FACTOR_DELIMITER = "|";
my ($MOSES_SRC_DIR,$CORPUS_F,$CORPUS_E,$ALIGNMENT,$SRILM_DIR,$FACTOR,$EXTERNAL_BIN_DIR,$INPUT_EXTENSION, $OUTPUT_EXTENSION);

# utilities
my $ZCAT = "gzip -cd";
my $BZCAT = "bzcat";

die("ERROR: wrong syntax when invoking TransliterationModel.perl")
    unless GetOptions('moses-src-dir=s' => \$MOSES_SRC_DIR,
                      'external-bin-dir=s' => \$EXTERNAL_BIN_DIR,
                      'input-extension=s' => \$INPUT_EXTENSION,
                      'output-extension=s' => \$OUTPUT_EXTENSION,
                      'corpus-f=s' => \$CORPUS_F,
                      'corpus-e=s' => \$CORPUS_E,
                      'alignment=s' => \$ALIGNMENT,
                      'order=i' => \$ORDER,
                      'factor=s' => \$FACTOR,
                      'srilm-dir=s' => \$SRILM_DIR,
                      'out-dir=s' => \$OUT_DIR);

# check if the files are in place
die("ERROR: you need to define --corpus-e, --corpus-f, --alignment, --srilm-dir, --moses-src-dir --external-bin-dir, --input-extension and --output-extension")
    unless (defined($MOSES_SRC_DIR) &&
            defined($CORPUS_F) &&
            defined($CORPUS_E) &&
            defined($ALIGNMENT)&&
            defined($INPUT_EXTENSION)&&
            defined($OUTPUT_EXTENSION)&&
            defined($EXTERNAL_BIN_DIR)&&
            defined($SRILM_DIR));
die("ERROR: could not find input corpus file '$CORPUS_F'")
    unless -e $CORPUS_F;
die("ERROR: could not find output corpus file '$CORPUS_E'")
    unless -e $CORPUS_E;
die("ERROR: could not find algnment file '$ALIGNMENT'")
    unless -e $ALIGNMENT;

# create factors
`mkdir $OUT_DIR`;

if (defined($FACTOR)) {

    my @factor_values = split(',', $FACTOR);

    foreach my $factor_val (@factor_values) {
        `mkdir $OUT_DIR/$factor_val`;
        my ($factor_f,$factor_e) = split(/\-/,$factor_val);

        # Split each corpus path into stem and extension.  The extension is
        # the second capture group; it was previously replaced by $OUT_DIR,
        # which produced an unusable reduced-corpus file name.
        my ($corpus_stem_f,$ext_f) = $CORPUS_F =~ /^(.+)\.([^\.]+)/
            or die("ERROR: input corpus file '$CORPUS_F' has no extension");
        my ($corpus_stem_e,$ext_e) = $CORPUS_E =~ /^(.+)\.([^\.]+)/
            or die("ERROR: output corpus file '$CORPUS_E' has no extension");

        reduce_factors($CORPUS_F,"$corpus_stem_f.$factor_val.$ext_f",$factor_f);
        reduce_factors($CORPUS_E,"$corpus_stem_e.$factor_val.$ext_e",$factor_e);

        `ln -s $corpus_stem_f.$factor_val.$ext_f $OUT_DIR/$factor_val/f`;
        `ln -s $corpus_stem_e.$factor_val.$ext_e $OUT_DIR/$factor_val/e`;
        `ln -s $ALIGNMENT $OUT_DIR/$factor_val/a`;
        mine_transliterations($factor_val, $INPUT_EXTENSION, $OUTPUT_EXTENSION);
    }
}
else {
    `ln -s $CORPUS_F $OUT_DIR/f`;
    `ln -s $CORPUS_E $OUT_DIR/e`;
    `ln -s $ALIGNMENT $OUT_DIR/a`;
    mine_transliterations("", $INPUT_EXTENSION, $OUTPUT_EXTENSION);
}

train_transliteration_module();
retrain_transliteration_module();

# create model

print "Training Transliteration Module - End ".`date`;
|
||||||
|
|
||||||
|
# learn_transliteration_model($t)
#
# Builds the transliteration model from "$OUT_DIR/training/corpus$t.*"
# ($t is the corpus suffix: "A" for the reduced training split, "" for the
# full corpus) by running train-model.perl step by step, then trains and
# binarizes the target language model and writes $OUT_DIR/model/moses.ini.
#
# The language-model order comes from the --order option ($ORDER, default 5);
# it was previously hard-coded to 5, so the option had no effect.
# Exit codes of the external commands are not inspected.
sub learn_transliteration_model{

    my ($t) = @_;

    # Pieces shared by every train-model.perl invocation below.
    my $train_model = "$MOSES_SRC_DIR/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 -dont-zip";
    my $common = "-external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 -reordering msd-bidirectional-fe -score-options '--KneserNey'";

    # Runs one train-model.perl invocation: "<prefix> <steps> <common> <extra>".
    my $run_step = sub {
        my ($steps, $extra) = @_;
        `$train_model $steps $common $extra`;
        return;
    };

    `cp $OUT_DIR/training/corpus$t.$OUTPUT_EXTENSION $OUT_DIR/lm/target`;

    print "Align Corpus\n";

    $run_step->("-last-step 1",
                "-corpus $OUT_DIR/training/corpus$t -corpus-dir $OUT_DIR/training/prepared");
    $run_step->("-first-step 2 -last-step 2",
                "-corpus-dir $OUT_DIR/training/prepared -giza-e2f $OUT_DIR/training/giza -direction 2");
    $run_step->("-first-step 2 -last-step 2",
                "-corpus-dir $OUT_DIR/training/prepared -giza-f2e $OUT_DIR/training/giza-inverse -direction 1");
    $run_step->("-first-step 3 -last-step 3",
                "-giza-e2f $OUT_DIR/training/giza -giza-f2e $OUT_DIR/training/giza-inverse -alignment-file $OUT_DIR/model/aligned -alignment-stem $OUT_DIR/model/aligned -alignment grow-diag-final-and");

    print "Train Translation Models\n";

    $run_step->("-first-step 4 -last-step 4",
                "-lexical-file $OUT_DIR/model/lex -alignment-file $OUT_DIR/model/aligned -alignment-stem $OUT_DIR/model/aligned -corpus $OUT_DIR/training/corpus$t");
    $run_step->("-first-step 5 -last-step 5",
                "-alignment-file $OUT_DIR/model/aligned -alignment-stem $OUT_DIR/model/aligned -extract-file $OUT_DIR/model/extract -corpus $OUT_DIR/training/corpus$t");
    $run_step->("-first-step 6 -last-step 6",
                "-extract-file $OUT_DIR/model/extract -lexical-file $OUT_DIR/model/lex -phrase-translation-table $OUT_DIR/model/phrase-table");
    $run_step->("-first-step 7 -last-step 7",
                "-extract-file $OUT_DIR/model/extract -reordering-table $OUT_DIR/model/reordering-table");

    print "Train Language Models\n";

    `$SRILM_DIR/ngram-count -order $ORDER -interpolate -kndiscount -addsmooth1 0.0 -unk -text $OUT_DIR/lm/target -lm $OUT_DIR/lm/targetLM`;

    `$MOSES_SRC_DIR/bin/build_binary $OUT_DIR/lm/targetLM $OUT_DIR/lm/targetLM.bin`;

    print "Create Config File\n";

    $run_step->("-first-step 9",
                "-phrase-translation-table $OUT_DIR/model/phrase-table -reordering-table $OUT_DIR/model/reordering-table -config $OUT_DIR/model/moses.ini -lm 0:$ORDER:$OUT_DIR/lm/targetLM.bin:8");

}
|
||||||
|
|
||||||
|
# retrain_transliteration_module()
#
# When a reduced training split ("$OUT_DIR/training/corpusA.<output-ext>")
# exists, removes the model, language-model and GIZA working directories,
# recreates the model and lm directories, and learns the model again from
# the full corpus (empty corpus suffix).  Does nothing otherwise.
sub retrain_transliteration_module{

    return unless -e "$OUT_DIR/training/corpusA.$OUTPUT_EXTENSION";

    foreach my $stale ("model", "lm", "training/giza", "training/giza-inverse", "training/prepared")
    {
        `rm -r $OUT_DIR/$stale`;
    }

    foreach my $fresh ("model", "lm")
    {
        `mkdir $OUT_DIR/$fresh`;
    }

    learn_transliteration_model("");
}
|
||||||
|
|
||||||
|
# train_transliteration_module()
#
# Creates the model and lm directories, has corpusCreator.pl split the mined
# pairs into training and tuning data, learns the model (from the reduced
# "A" split when corpusCreator.pl left one, otherwise from the full corpus),
# and then tunes it: the tables are filtered for the tuning input, MERT is
# run, and the tuned weights are written to $OUT_DIR/tuning/moses.tuned.ini.
# Exit codes of the external commands are not inspected.
sub train_transliteration_module{

    my $tuning    = "$OUT_DIR/tuning";
    my $table_ini = "$tuning/moses.table.ini";

    `mkdir $OUT_DIR/model`;
    `mkdir $OUT_DIR/lm`;
    print "Preparing Corpus\n";
    `$MOSES_SRC_DIR/scripts/Transliteration/corpusCreator.pl $OUT_DIR 1-1.$INPUT_EXTENSION-$OUTPUT_EXTENSION.mined-pairs $INPUT_EXTENSION $OUTPUT_EXTENSION`;

    # corpusCreator.pl keeps corpusA.* only for sufficiently large corpora.
    my $suffix = (-e "$OUT_DIR/training/corpusA.$OUTPUT_EXTENSION") ? "A" : "";
    learn_transliteration_model($suffix);

    print "Running Tuning for Transliteration Module\n";

    `touch $table_ini`;

    `$MOSES_SRC_DIR/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 -reordering msd-bidirectional-fe -score-options '--KneserNey' -phrase-translation-table $OUT_DIR/model/phrase-table -reordering-table $OUT_DIR/model/reordering-table -config $table_ini -lm 0:3:$table_ini:8`;

    `$MOSES_SRC_DIR/scripts/training/filter-model-given-input.pl $tuning/filtered $table_ini $tuning/input -Binarizer "$MOSES_SRC_DIR/bin/processPhraseTable"`;

    `rm $table_ini`;

    `$MOSES_SRC_DIR/scripts/ems/support/substitute-filtered-tables.perl $tuning/filtered/moses.ini < $OUT_DIR/model/moses.ini > $tuning/moses.filtered.ini`;

    `$MOSES_SRC_DIR/scripts/training/mert-moses.pl $tuning/input $tuning/reference $MOSES_SRC_DIR/bin/moses $tuning/moses.filtered.ini --nbest 100 --working-dir $tuning/tmp --decoder-flags "-threads 16 -drop-unknown -v 0 -distortion-limit 0" --rootdir $MOSES_SRC_DIR/scripts -mertdir $MOSES_SRC_DIR/mert -threads=16 --no-filter-phrase-table`;

    `cp $tuning/tmp/moses.ini $tuning/moses.ini`;

    `$MOSES_SRC_DIR/scripts/ems/support/substitute-weights.perl $OUT_DIR/model/moses.ini $tuning/moses.ini $tuning/moses.tuned.ini`;
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# mine_transliterations($factor_val, $inp_ext, $op_ext)
#
# Mines transliteration pairs from the corpus linked under
# "$OUT_DIR/$factor_val" (f, e and a):
#   1. 1-1-Extraction writes the 1-1 aligned word pairs to "1-1.<in>-<out>";
#   2. clean.pl filters them into "....cleaned";
#   3. TMining writes "....pair-probs" unless that file already exists;
#   4. threshold.pl is fed the threshold 0.5 and writes "....mined-pairs".
# Exit codes of the external commands are not inspected.
sub mine_transliterations{

    my ($factor_val, $inp_ext, $op_ext) = @_;

    my $work_dir = "$OUT_DIR/$factor_val";
    my $pairs    = "$work_dir/1-1.$inp_ext-$op_ext";

    print "Creating Model ".$factor_val."\n";

    print "Extracting 1-1 Alignments\n";
    `$MOSES_SRC_DIR/bin/1-1-Extraction $work_dir/f $work_dir/e $work_dir/a > $pairs`;

    print "Cleaning the list for Miner\n";

    `$MOSES_SRC_DIR/scripts/Transliteration/clean.pl $pairs > $pairs.cleaned`;

    if (-e "$pairs.pair-probs")
    {
        print STDERR "1-1.$inp_ext-$op_ext.pair-probs in place, reusing\n";
    }
    else
    {
        print "Extracting Transliteration Pairs \n";
        `$MOSES_SRC_DIR/bin/TMining $pairs.cleaned > $pairs.pair-probs`;
    }

    print "Selecting Transliteration Pairs with threshold 0.5 \n";
    `echo 0.5 | $MOSES_SRC_DIR/scripts/Transliteration/threshold.pl $pairs.pair-probs > $pairs.mined-pairs`;

}
|
||||||
|
|
||||||
|
# from train-model.perl
|
||||||
|
# Write a copy of a factored corpus that keeps only the requested factors.
#
# Arguments (positional):
#   $full    - path of the input corpus; opened through open_or_zcat()
#   $reduced - path of the output corpus
#   $factors - comma-separated list of factor indices to keep; they are
#              emitted in ascending numeric order
#
# Behaviour:
#   * waits (polling every 10 s) while "$reduced.lock" exists;
#   * returns immediately if $reduced or "$reduced.gz" already exists;
#   * if the requested factors are exactly 0..N, where N is the number of
#     "|" characters in the first token of the corpus, only a symlink to
#     the input is created (pointing at the ".gz" file when only that
#     exists);
#   * otherwise every token is split on $___FACTOR_DELIMITER and the
#     selected factors are written out joined by "|".
#
# Dies if the corpus is empty, if the symlink cannot be created, if the
# output cannot be written, or if a token lacks a requested factor.  On
# any failure after the lock has been taken, the lock file and the
# partially written output are removed so that later runs neither hang on
# a stale lock nor reuse a truncated file.
#
# NOTE(review): the factor count and the output joiner hard-code "|",
# while tokens are split on $___FACTOR_DELIMITER; the two only agree when
# that delimiter is "|" -- confirm against the rest of the script.
sub reduce_factors {
    my ($full,$reduced,$factors) = @_;

    my @INCLUDE = sort {$a <=> $b} split(/,/,$factors);

    print "Reducing factors to produce $reduced @ ".`date`;
    while(-e $reduced.".lock") {
        sleep(10);
    }
    if (-e $reduced) {
        print STDERR " $reduced in place, reusing\n";
        return;
    }
    if (-e $reduced.".gz") {
        print STDERR " $reduced.gz in place, reusing\n";
        return;
    }

    # peek at input, to check if we are asked to produce exactly the
    # available factors
    my $inh = open_or_zcat($full);
    my $firstline = <$inh>;
    die "Corpus file $full is empty" unless defined $firstline;
    close $inh;
    # pick first word
    $firstline =~ s/^\s*//;
    $firstline =~ s/\s.*//;
    # count factors
    my $maxfactorindex = $firstline =~ tr/|/|/;
    if (join(",", @INCLUDE) eq join(",", 0..$maxfactorindex)) {
        # create just symlink; preserving compression
        my $realfull = $full;
        if (!-e $realfull && -e $realfull.".gz") {
            $realfull .= ".gz";
            $reduced =~ s/(\.gz)?$/.gz/;
        }
        safesystem("ln -s '$realfull' '$reduced'")
            or die "Failed to create symlink $realfull -> $reduced";
        return;
    }

    # The default is to select the needed factors
    my $lockfile = $reduced.".lock";
    if (open(my $lock, '>>', $lockfile)) {
        close $lock;
    }
    else {
        print STDERR "WARNING: could not create $lockfile: $!\n";
    }

    # Everything that can die while the lock is held runs inside eval so
    # that the lock is always released afterwards.
    my $ok = eval {
        my $in = open_or_zcat($full);
        open(my $out, '>', $reduced) or die "ERROR: Can't write $reduced";

        my $nr = 0;
        while (my $line = <$in>) {
            $nr++;
            print STDERR "." if $nr % 10000 == 0;
            print STDERR "($nr)" if $nr % 100000 == 0;

            chomp $line;
            $line =~ s/ +/ /g;
            $line =~ s/^ //;
            $line =~ s/ $//;

            my @reduced_tokens;
            foreach my $token (split ' ', $line) {
                # \Q causes to disable metacharacters in regex
                my @FACTOR = split /\Q$___FACTOR_DELIMITER/, $token;
                my @kept;
                foreach my $outfactor (@INCLUDE) {
                    my $value = $FACTOR[$outfactor];
                    die "ERROR: Couldn't find factor $outfactor in token \"$token\" in $full LINE $nr" if !defined $value;
                    push @kept, $value;
                }
                push @reduced_tokens, join("|", @kept);
            }
            print {$out} join(" ", @reduced_tokens), "\n";
        }
        print STDERR "\n";

        # Buffered write errors only surface when the handle is closed.
        close($out) or die "ERROR: Can't finish writing $reduced: $!";
        close($in);
        1;
    };
    my $error = $@;

    unlink $lockfile;
    if (!$ok) {
        # Do not leave a truncated corpus behind for a later run to reuse.
        unlink $reduced;
        die $error;
    }
}
|
||||||
|
|
||||||
|
# Open a corpus file for reading, decompressing it on the fly if needed.
#
# Argument:
#   $fn - path of the file.  If it does not exist but "$fn.gz" or
#         "$fn.bz2" does, that compressed file is used instead.
#
# Names ending in ".bz2" are read through the command held in $BZCAT and
# names ending in ".gz" through the command held in $ZCAT; anything else
# is opened directly.  The decompressor command is split on whitespace
# (so a value with options such as "gzip -cd" works) and run without a
# shell, which keeps file names containing spaces or shell metacharacters
# safe.  Plain files use three-argument open, so the name is never
# interpreted as an open mode or a pipe.
#
# Returns the read handle; dies with the system error if the file or the
# decompressor cannot be opened.
sub open_or_zcat {
    my $fn = shift;

    $fn = $fn.".gz"  if ! -e $fn && -e $fn.".gz";
    $fn = $fn.".bz2" if ! -e $fn && -e $fn.".bz2";

    # Pick the decompressor command, if the name calls for one.
    my $tool;
    if ($fn =~ /\.bz2$/) {
        $tool = $BZCAT;
    } elsif ($fn =~ /\.gz$/) {
        $tool = $ZCAT;
    }

    my $hdl;
    if (defined $tool) {
        my @command = split(' ', $tool);
        die "Can't read $fn: no decompressor command configured" unless @command;
        open($hdl, '-|', @command, $fn)
            or die "Can't read $fn ($tool $fn|): $!";
    }
    else {
        open($hdl, '<', $fn) or die "Can't read $fn ($fn): $!";
    }
    return $hdl;
}
|
Loading…
Reference in New Issue
Block a user