#!/usr/bin/env perl

# Reads a tab-separated file of string pairs, rewrites each side as
# space-separated characters, and writes the result as parallel corpus files
# under <work-dir>/training and <work-dir>/tuning.
#
# Arguments (positional): work-dir, pairs-file, source-ext, target-ext.

use warnings;
use strict;
use utf8;
use Getopt::Std;
use IO::Handle;

# Use the strict UTF-8 encoding layer on the standard handles, matching the
# ':encoding(UTF-8)' layer used for every file opened by this script.
for my $std_handle (\*STDIN, \*STDOUT, \*STDERR) {
    binmode($std_handle, ':encoding(UTF-8)');
}

# Character-spaced source and target strings, one entry per input pair.
# Both arrays are filled in step, so index N of each refers to the same pair.
my @source;
my @target;

# NOTE(review): @words is not referenced anywhere in the visible script;
# kept in case code outside this view relies on it.
my @words;

# All four positional arguments are required: without them the paths built
# below would silently point at the filesystem root.
@ARGV >= 4
    or die "Usage: $0 <work-dir> <pairs-file> <source-ext> <target-ext>\n";

my $tPath   = $ARGV[0];    # working directory; output sub-directories go here
my $tFile   = $ARGV[1];    # name of the tab-separated pairs file inside $tPath
my $inp_ext = $ARGV[2];    # extension of the source-side corpus files
my $op_ext  = $ARGV[3];    # extension of the target-side corpus files

# File-scope scratch variables for the current pair and its spaced forms.
my $src;
my $tgt;
my $t;
my $s;

# Create the output directories with the mkdir builtin rather than a shell
# command, so paths containing spaces or shell metacharacters are safe and a
# failure is reported immediately. An already existing directory is accepted.
for my $subdir ('training', 'tuning') {
    my $dir = "$tPath/$subdir";
    next if -d $dir;
    mkdir $dir or die "Can't create directory $dir: $!\n";
}

# Read the tab-separated pairs file and write the full character-spaced
# corpus (training/corpus.<ext>), remembering every pair in @source/@target.
{
    my $pairs_path = "$tPath/$tFile";
    my $src_path   = "$tPath/training/corpus.$inp_ext";
    my $tgt_path   = "$tPath/training/corpus.$op_ext";

    open my $pairs_fh, '<:encoding(UTF-8)', $pairs_path
        or die "Can't open $pairs_path: $!\n";
    open my $src_fh, '>:encoding(UTF-8)', $src_path
        or die "Can't open $src_path: $!\n";
    open my $tgt_fh, '>:encoding(UTF-8)', $tgt_path
        or die "Can't open $tgt_path: $!\n";

    while (my $line = <$pairs_fh>) {
        chomp $line;

        # Only the first two tab-separated fields are used.
        my ($src_text, $tgt_text) = split /\t/, $line;

        # A line lacking either side cannot form a pair; skipping it on both
        # sides keeps the two corpus files aligned line by line.
        unless (defined $src_text && length $src_text
             && defined $tgt_text && length $tgt_text) {
            warn "Skipping line $. of $pairs_path: expected <source>TAB<target>\n";
            next;
        }

        # Put a single space between every character of each side.
        my $src_chars = join ' ', split //, $src_text;
        my $tgt_chars = join ' ', split //, $tgt_text;

        print {$src_fh} "$src_chars\n";
        print {$tgt_fh} "$tgt_chars\n";

        push @source, $src_chars;
        push @target, $tgt_chars;
    }

    close $pairs_fh;

    # Buffered write errors only surface at close, so check the outputs.
    close $src_fh or die "Can't close $src_path: $!\n";
    close $tgt_fh or die "Can't close $tgt_path: $!\n";
}

# Number of pairs collected; also consulted after the split is written.
my $corpus_size = @source;

# Split the collected pairs into a tuning set (tuning/input, tuning/reference)
# and the remaining training set (training/corpusA.<ext>).
{
    my $dev_every    = 5;       # every 5th pair is a tuning candidate
    my $max_dev_size = 1000;    # upper bound on the number of tuning pairs

    my $train_src_path = "$tPath/training/corpusA.$inp_ext";
    my $train_tgt_path = "$tPath/training/corpusA.$op_ext";
    my $dev_src_path   = "$tPath/tuning/input";
    my $dev_tgt_path   = "$tPath/tuning/reference";

    open my $train_src_fh, '>:encoding(UTF-8)', $train_src_path
        or die "Can't open $train_src_path: $!\n";
    open my $train_tgt_fh, '>:encoding(UTF-8)', $train_tgt_path
        or die "Can't open $train_tgt_path: $!\n";
    open my $dev_src_fh, '>:encoding(UTF-8)', $dev_src_path
        or die "Can't open $dev_src_path: $!\n";
    open my $dev_tgt_fh, '>:encoding(UTF-8)', $dev_tgt_path
        or die "Can't open $dev_tgt_path: $!\n";

    my $dev_size = 0;

    for my $idx (0 .. $#source) {
        # The 5th, 10th, 15th, ... pair goes to the tuning set until it holds
        # $max_dev_size pairs; every other pair goes to the training set.
        my $to_dev = ($idx + 1) % $dev_every == 0 && $dev_size < $max_dev_size;

        my ($src_fh, $tgt_fh) = $to_dev
            ? ($dev_src_fh,   $dev_tgt_fh)
            : ($train_src_fh, $train_tgt_fh);

        print {$src_fh} "$source[$idx]\n";
        print {$tgt_fh} "$target[$idx]\n";

        $dev_size++ if $to_dev;
    }

    # Buffered write errors only surface at close, so check every output.
    close $train_src_fh or die "Can't close $train_src_path: $!\n";
    close $train_tgt_fh or die "Can't close $train_tgt_path: $!\n";
    close $dev_src_fh   or die "Can't close $dev_src_path: $!\n";
    close $dev_tgt_fh   or die "Can't close $dev_tgt_path: $!\n";
}

# For corpora of fewer than 6000 pairs the reduced training files (corpusA.*)
# are deleted again; the full corpus.* files and the tuning files are left in
# place. unlink is used instead of a shell `rm` so that unusual characters in
# the path are safe and a failed removal is reported.
# NOTE(review): presumably small corpora are meant to train on the full
# corpus.* files -- confirm against the consumer of this directory.
if ($corpus_size < 6000) {
    for my $ext ($inp_ext, $op_ext) {
        my $reduced_path = "$tPath/training/corpusA.$ext";
        unlink $reduced_path
            or warn "Can't remove $reduced_path: $!\n";
    }
}