mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-28 06:22:14 +03:00
316 lines
6.2 KiB
Perl
Executable File
316 lines
6.2 KiB
Perl
Executable File
#!/usr/bin/perl
|
|
|
|
#input hindi word urdu word, delete all those entries that have number on any side
|
|
use utf8;
|
|
|
|
use Getopt::Std;
|
|
use IO::Handle;
|
|
|
|
binmode(STDIN, ':utf8');
|
|
binmode(STDOUT, ':utf8');
|
|
binmode(STDERR, ':utf8');
|
|
use open qw(:std :utf8);
|
|
|
|
$srcHash = ();
|
|
$trgHash = ();
|
|
|
|
$file = $ARGV[0];
|
|
|
|
@f0 = split(/\//, $file); # if file name has a path
|
|
@f1 = split(/\./, $f0[$#f0]); # last element would be the file name
|
|
@f2 = split(/\-/, $f1[1]);
|
|
$srcMark = $f2[0];
|
|
$trgMark = $f2[1];
|
|
|
|
$lang = 0;
|
|
$lang1 = 1;
|
|
$lang2 = 1;
|
|
|
|
if ($srcMark eq "en" || $srcMark eq "de" || $srcMark eq "es" || $srcMark eq "fr" || $srcMark eq "it" || $srcMark eq "nl" || $srcMark eq "pt-br" || $srcMark eq "ro" || $srcMark eq "sl" || $srcMark eq "tr" )
|
|
{
|
|
print STDERR "Source is Latin\n";
|
|
$lang1 = 0;
|
|
$lang = $lang + 1;
|
|
|
|
}
|
|
|
|
if ( "$trgMark" eq "en" || "$trgMark" eq "de" || "$trgMark" eq "es" || "$trgMark" eq "fr" || "$trgMark" eq "it" || "$trgMark" eq "nl" || "$trgMark" eq "pt-br" || "$trgMark" eq "ro" || "$trgMark" eq "sl" || "$trgMark" eq "tr" )
|
|
{
|
|
print STDERR "Target is Latin\n";
|
|
$lang2 = 0;
|
|
$lang = $lang + 1;
|
|
}
|
|
|
|
if ("$lang" == 2)
|
|
{
|
|
print STDERR "No Transliteration Module Possible\n";
|
|
}
|
|
else
|
|
{ print STDERR "will run Transliteration module\n";
|
|
print STDERR "Three preprocessing steps to do:\n 1) Delete Symbol \t 2) Delete Latin from non-Latin langauge \t 3) Character Frequency based filtering\n";
|
|
print STDERR "STARTING 1 and 2 ...\n";
|
|
open ($IN, $ARGV[0]);
|
|
while(<$IN>)
|
|
{
|
|
chomp;
|
|
$retur = deleteSymbol($_);
|
|
if($retur == 1)
|
|
{
|
|
#print "$_\n";
|
|
$retur = deleteEnglish($lang1, $lang2, $_);
|
|
if ($retur == 1)
|
|
{
|
|
#print "$_\n";
|
|
push (@inputArr, $_);
|
|
charFreqFilterPreprocess($_);
|
|
}
|
|
}
|
|
}
|
|
close ($IN);
|
|
}
|
|
print STDERR "DONE 1 and 2\nSTARTING 3) Preprocessing for Character filtering...\n";
|
|
|
|
charFreqFilterPreprocess2();
|
|
print STDERR "DONE 3\n";
|
|
|
|
foreach (@inputArr)
|
|
{
|
|
charFreqFilter($_);
|
|
}
|
|
|
|
###############################Delete English##################################
|
|
|
|
sub deleteEnglish{
|
|
@list = @_;
|
|
$backEng = 0;
|
|
|
|
if($list[0] == 1 && $list[1] == 1)
|
|
{
|
|
# print "Both are Non-Latin\n";
|
|
if (m/[A-Za-z]/) {}
|
|
else {$backEng = 1; return $backEng;}
|
|
}
|
|
elsif($list[0] == 0 && $list[1] == 1)
|
|
{
|
|
# print "Target is Non-Latin\n";
|
|
@F=split("\t");
|
|
if ($F[1] =~ m/[A-Za-z]/) {}
|
|
else {$backEng = 1; return $backEng;}
|
|
|
|
}
|
|
elsif($list[0] == 1 && $list[1] == 0)
|
|
{
|
|
# print "Source is Non-Latin\n";
|
|
@F=split("\t");
|
|
if ($F[0] =~ m/[A-Za-z]/) {}
|
|
else {$backEng = 1; return $backEng;}
|
|
}
|
|
}
|
|
###############################Delete Symbol##################################
|
|
sub deleteSymbol{
|
|
$back = 0;
|
|
if (/\d+/) {}
|
|
elsif(/\?/) {}
|
|
elsif(/\!/) {}
|
|
elsif(/@/) {}
|
|
elsif(/\./) {}
|
|
elsif(/\#/) {}
|
|
elsif(/\%/) {}
|
|
elsif(/\$/) {}
|
|
elsif(/-/) {}
|
|
elsif(/"/) {}
|
|
elsif(/\(/) {}
|
|
elsif(/\)/) {}
|
|
elsif(/\&/) {}
|
|
elsif(/\;/) {}
|
|
elsif(/\\/) {}
|
|
elsif(/\*/) {}
|
|
elsif(/\+/) {}
|
|
elsif(/\,/) {}
|
|
elsif(/\</){}
|
|
elsif(/\>/){}
|
|
else
|
|
{
|
|
@wrds = split(/\t/);
|
|
if($wrds[0] eq $wrds[1])
|
|
{}
|
|
elsif(length $wrds[0] < 3 )
|
|
{}
|
|
elsif(length $wrds[1] < 3)
|
|
{}
|
|
else
|
|
{
|
|
$back = 1;
|
|
return $back;
|
|
# print "$_\n";
|
|
}
|
|
}
|
|
}
|
|
#################################Char Frequency Filter Preprocess########################
|
|
sub charFreqFilterPreprocess{
|
|
|
|
@wrds = split(/\t/);
|
|
$srcWrd = lc $wrds[0];
|
|
$trgWrd = lc $wrds[1];
|
|
|
|
if($srcWrd eq $trgWrd)
|
|
{}
|
|
else
|
|
{
|
|
@src = split('',$srcWrd);
|
|
foreach (@src)
|
|
{
|
|
if(exists $srcHash{$_})
|
|
{
|
|
$srcHash{$_}++;
|
|
}
|
|
else
|
|
{
|
|
$srcHash{$_} = 0;
|
|
}
|
|
}
|
|
@trg = split('',$trgWrd);
|
|
foreach (@trg)
|
|
{
|
|
if(exists $trgHash{$_})
|
|
{
|
|
$trgHash{$_}++;
|
|
}
|
|
else
|
|
{
|
|
$trgHash{$_} = 0;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
##############################Preprocess Two#############################
|
|
sub charFreqFilterPreprocess2{
|
|
|
|
###################srchash###################################
|
|
|
|
@keys = sort { $srcHash{$b} <=> $srcHash{$a} } keys %srcHash;
|
|
|
|
$bestsrcfreq = $srcHash{$keys[0]};
|
|
$srcOnePer = $bestsrcfreq * 0.005;
|
|
|
|
$take = 0; # take top 30 character from hash
|
|
|
|
foreach (@keys)
|
|
{
|
|
# print "$srcHash{$_}\t$_\n";
|
|
|
|
if($take < 30)
|
|
{
|
|
$srcChar{$_} = 1;
|
|
# print "$srcHash{$_}\t$_\n";
|
|
|
|
}
|
|
else
|
|
{ ################# take worst characters that are not 1% of the best character################
|
|
if($srcHash{$_} < $srcOnePer || $take > 50)
|
|
{
|
|
$srcBadChar{$_} = 1;
|
|
}
|
|
}
|
|
# print "$_\t$srcHash{$_}\n";
|
|
$take++;
|
|
}
|
|
|
|
################### target hash ###################################
|
|
|
|
@keys = sort { $trgHash{$b} <=> $trgHash{$a} } keys %trgHash;
|
|
|
|
$besttrgfreq = $trgHash{$keys[0]};
|
|
$trgOnePer = $besttrgfreq * 0.005;
|
|
|
|
#print "$besttrgfreq\t$trgOnePer\n";
|
|
|
|
$take = 0; # take top 30 character from hash
|
|
foreach (@keys)
|
|
{
|
|
if($take < 30)
|
|
{
|
|
$trgChar{$_} = 1;
|
|
}
|
|
else
|
|
{ ################# take worst characters that are not 1% of the best character################
|
|
if($trgHash{$_} < $trgOnePer || $take > 50 )
|
|
{
|
|
$trgBadChar{$_} = 1;
|
|
}
|
|
}
|
|
# print "$_\t$trgHash{$_}\n";
|
|
$take++;
|
|
}
|
|
}
|
|
|
|
###############################CharFreqFiltering###################################
|
|
sub charFreqFilter{
|
|
@in = @_;
|
|
@wrds = split(/\t/, $in[0]);
|
|
$srcWrd = lc $wrds[0];
|
|
$trgWrd = lc $wrds[1];
|
|
|
|
@srcWrdArr = split("",$srcWrd);
|
|
@trgWrdArr = split("",$trgWrd);
|
|
|
|
|
|
$check = 0;
|
|
$remove = 0;
|
|
|
|
########################## search if word contain any of the bad characters ####################################
|
|
|
|
foreach (@srcWrdArr)
|
|
{
|
|
# print "$srcWrd\n";
|
|
if (exists $srcBadChar{$_}) # if this character is in the list of worst characters
|
|
{
|
|
$remove = 1;
|
|
# print "#######EXIT src: \t$srcWrd##########\n";
|
|
last;
|
|
}
|
|
}
|
|
|
|
if($remove == 1)
|
|
{}
|
|
else
|
|
{ foreach (@trgWrdArr)
|
|
{
|
|
if (exists $trgBadChar{$_}) # if this character is in the list of worst characters
|
|
{
|
|
$remove = 1;
|
|
# print "EXIT target: \t$trgWrd\n";
|
|
last;
|
|
}
|
|
}
|
|
}
|
|
########################## search if word contain any of the good characters ####################################
|
|
if($remove == 1)
|
|
{}
|
|
else
|
|
{
|
|
foreach (@srcWrdArr)
|
|
{
|
|
if(exists ($srcChar{$_}))
|
|
{
|
|
$check = 1;
|
|
last;
|
|
}
|
|
}
|
|
|
|
if($check == 1)
|
|
{
|
|
foreach (@trgWrdArr)
|
|
{
|
|
if(exists ($trgChar{$_}))
|
|
{
|
|
# print "$wrds[0]\t$wrds[1]\n";
|
|
$printSrc = join (" ", split("",$wrds[0]));
|
|
$printTrg = join (" ", split("",$wrds[1]));
|
|
print "$printSrc\n$printTrg\n";
|
|
last;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
} |