mosesdecoder/scripts/Transliteration/corpusCreator.pl
2015-04-13 20:42:33 +04:00

88 lines
2.0 KiB
Perl
Executable File

#!/usr/bin/env perl
use warnings;
use strict;
use utf8;
use Getopt::Std;
use IO::Handle;
binmode(STDIN, ':utf8');
binmode(STDOUT, ':utf8');
binmode(STDERR, ':utf8');
# Parallel arrays filled while reading the pair file: element i of @source
# and element i of @target come from the same input line.
my @source;
my @target;
my @words;    # not referenced in the visible part of the script

# Positional arguments:
#   1. working directory (output sub-directories are created inside it)
#   2. name of the tab-separated pair file, relative to the working directory
#   3. file extension used for the source-side output files
#   4. file extension used for the target-side output files
if (@ARGV < 4) {
    die "Usage: $0 <dir> <pair-file> <input-ext> <output-ext>\n";
}
my $tPath = $ARGV[0];
my $tFile = $ARGV[1];
my $inp_ext = $ARGV[2];
my $op_ext = $ARGV[3];

# Scratch scalars declared up front for the rest of the script.
my $src;
my $tgt;
my $t;
my $s;

# Create the output directories with the built-in mkdir instead of shelling
# out, so paths containing spaces or shell metacharacters work and a failure
# is reported immediately. Directories that already exist are left alone.
for my $out_dir ("$tPath/training", "$tPath/tuning") {
    next if -d $out_dir;
    mkdir $out_dir or die "Can't create directory $out_dir: $!\n";
}
# Pass 1: read the tab-separated pair file. For every pair, split both sides
# into single characters joined by spaces, write them to
# training/corpus.<ext>, and remember them in @source/@target for pass 2.
open my $pairs_fh, "<:encoding(UTF-8)", "$tPath/$tFile" or die "Can't open $tPath/$tFile: $!\n";
open my $all_src_fh, ">:encoding(UTF-8)", "$tPath/training/corpus.$inp_ext" or die "Can't open $tPath/training/corpus.$inp_ext: $!\n";
open my $all_tgt_fh, ">:encoding(UTF-8)", "$tPath/training/corpus.$op_ext" or die "Can't open $tPath/training/corpus.$op_ext: $!\n";
while (my $pair_line = <$pairs_fh>) {
    chomp $pair_line;
    # Only the first two tab-separated fields are used; extras are ignored.
    my ($src_word, $tgt_word) = split /\t/, $pair_line;

    # A line lacking either side (blank line, no tab, empty field) cannot
    # form a pair; writing it would put an empty line into one half of the
    # parallel corpus, so skip it and say so.
    unless (defined $src_word && length $src_word
        && defined $tgt_word && length $tgt_word) {
        warn "Skipping malformed line $. of $tPath/$tFile\n";
        next;
    }

    my $src_chars = join ' ', split //, $src_word;
    my $tgt_chars = join ' ', split //, $tgt_word;
    print {$all_src_fh} "$src_chars\n";
    print {$all_tgt_fh} "$tgt_chars\n";
    push @source, $src_chars;
    push @target, $tgt_chars;
}
close $pairs_fh;
# Buffered write errors only surface at close time, so check them.
close $all_src_fh or die "Can't close $tPath/training/corpus.$inp_ext: $!\n";
close $all_tgt_fh or die "Can't close $tPath/training/corpus.$op_ext: $!\n";
# Pass 2: distribute the collected pairs between the reduced training corpus
# (training/corpusA.<ext>) and the tuning set (tuning/input and
# tuning/reference).
open my $train_src_fh, ">:encoding(UTF-8)", "$tPath/training/corpusA.$inp_ext" or die "Can't open $tPath/training/corpusA.$inp_ext: $!\n";
open my $train_tgt_fh, ">:encoding(UTF-8)", "$tPath/training/corpusA.$op_ext" or die "Can't open $tPath/training/corpusA.$op_ext: $!\n";
open my $dev_src_fh, ">:encoding(UTF-8)", "$tPath/tuning/input" or die "Can't open $tPath/tuning/input: $!\n";
open my $dev_tgt_fh, ">:encoding(UTF-8)", "$tPath/tuning/reference" or die "Can't open $tPath/tuning/reference: $!\n";
my $corpus_size = @source;
my $dev_size = 0;
my $dev_every = 5;       # one pair out of every five goes to the tuning set
my $dev_limit = 1000;    # the tuning set never grows beyond this many pairs
for my $pair_idx (0 .. $#source) {
    # Every fifth pair (0-based indices 4, 9, 14, ...) is a tuning candidate
    # until the tuning set is full; all other pairs go to the training files.
    my $is_dev = $pair_idx % $dev_every == $dev_every - 1
        && $dev_size < $dev_limit;
    if ($is_dev) {
        print {$dev_src_fh} "$source[$pair_idx]\n";
        print {$dev_tgt_fh} "$target[$pair_idx]\n";
        $dev_size++;
    }
    else {
        print {$train_src_fh} "$source[$pair_idx]\n";
        print {$train_tgt_fh} "$target[$pair_idx]\n";
    }
}
# Buffered write errors only surface at close time, so check them.
close $train_src_fh or die "Can't close $tPath/training/corpusA.$inp_ext: $!\n";
close $train_tgt_fh or die "Can't close $tPath/training/corpusA.$op_ext: $!\n";
close $dev_src_fh or die "Can't close $tPath/tuning/input: $!\n";
close $dev_tgt_fh or die "Can't close $tPath/tuning/reference: $!\n";
# For corpora with fewer than 6000 pairs the reduced training files are
# deleted again; the tuning files are kept.
# NOTE(review): presumably downstream steps then train on the full corpus.*
# files instead - confirm against the calling pipeline.
if ($corpus_size < 6000) {
    # Built-in unlink avoids a shell round-trip and handles paths with spaces
    # or shell metacharacters; a failure is reported but is not fatal.
    for my $reduced_file ("$tPath/training/corpusA.$inp_ext",
                          "$tPath/training/corpusA.$op_ext") {
        unlink $reduced_file or warn "Can't remove $reduced_file: $!\n";
    }
}