2015-05-17 16:04:04 +03:00
|
|
|
#!/usr/bin/env perl
|
2015-05-29 14:30:26 +03:00
|
|
|
#
|
|
|
|
# This file is part of moses. Its use is licensed under the GNU Lesser General
|
|
|
|
# Public License version 2.1 or, at your option, any later version.
|
2012-09-03 10:23:32 +04:00
|
|
|
|
2015-04-13 19:42:33 +03:00
|
|
|
use warnings;
|
2012-09-03 10:23:32 +04:00
|
|
|
use strict;
|
|
|
|
|
|
|
|
# Create domain file from corpora
|
|
|
|
# (helper for domain adaptation)
|
|
|
|
|
|
|
|
# Creates a file with domain names and end line numbers for different domains
|
2015-05-17 16:04:04 +03:00
|
|
|
# within the cleaned training corpus. This file is used by various domain
|
2012-09-03 10:23:32 +04:00
|
|
|
# adaptation methods.
|
|
|
|
|
|
|
|
# Usage: <this script> EXTENSION CORPUS_STEM [CORPUS_STEM ...]
#
# For each corpus stem, the file "STEM.EXTENSION" is examined (or
# "STEM.EXTENSION.gz" when the plain file does not exist). One line
#   "<cumulative line count> <domain name>"
# is printed to STDOUT per stem, in argument order.
my ($extension, @SUBCORPORA) = @ARGV;
die("ERROR: syntax: $0 extension corpus-stem [corpus-stem ...]\n")
    unless defined($extension);

my $line_count = 0;  # running total of lines over all sub corpora seen so far
my %UNIQUE_NAME;     # domain names already handed out
my $number = 1;      # next candidate for a numeric default name

foreach my $stem (@SUBCORPORA) {
    # get number of lines
    # The files are read directly (no shell involved), so stems containing
    # spaces or shell metacharacters are handled safely.
    my $file = "$stem.$extension";
    if (-e $file) {
        $line_count += count_lines_plain($file);
    }
    elsif (-e "$file.gz") {
        $line_count += count_lines_gzipped("$file.gz");
    }
    else {
        die("ERROR: could not open sub corpus file $file\n");
    }

    # construct name
    my $name = $number++; # default: cardinal number
    while (defined($UNIQUE_NAME{$name})) { $name = $number++; } # slightly paranoid

    # reconstruct corpus name: the part of the file name (after the last
    # slash) that precedes its first dot, provided it is not taken yet
    if ($stem =~ m{/([^./]+)\.[^/]+$} && !defined($UNIQUE_NAME{$1})) {
        $name = $1;
    }

    # Remember every name that was handed out (numeric defaults included),
    # so that no later sub corpus can be given the same name again.
    $UNIQUE_NAME{$name}++;

    print "$line_count $name\n";
}

# Count the newline characters readable from an open file handle.
# This yields the same number as "wc -l". Dies on a read error.
sub count_newlines {
    my ($fh, $file) = @_;
    binmode($fh); # count raw bytes, independent of any default I/O layers
    my $count = 0;
    my $buffer;
    while (1) {
        my $bytes = read($fh, $buffer, 65536);
        die("ERROR: could not read sub corpus file $file: $!\n")
            unless defined($bytes);
        last if $bytes == 0;
        $count += ($buffer =~ tr/\n//);
    }
    return $count;
}

# Return the number of lines of an uncompressed file.
sub count_lines_plain {
    my ($file) = @_;
    open(my $fh, '<', $file)
        or die("ERROR: could not open sub corpus file $file: $!\n");
    my $count = count_newlines($fh, $file);
    close($fh);
    return $count;
}

# Return the number of lines of a gzip-compressed file.
# The list form of the pipe open runs gzip without an intermediate shell.
sub count_lines_gzipped {
    my ($file) = @_;
    open(my $fh, '-|', 'gzip', '-cd', $file)
        or die("ERROR: could not open sub corpus file $file: $!\n");
    my $count = count_newlines($fh, $file);
    # close() on a pipe returns false when the command exited with an error
    close($fh)
        or die("ERROR: could not decompress sub corpus file $file (exit status $?)\n");
    return $count;
}
|
|
|
|
|