mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-09-20 23:58:15 +03:00
adding scripts to extract POSs from LOPAR output and to extract arbitrary sets of factors from a corpus
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@530 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
parent
8420ecf516
commit
0d91864621
19
scripts/generic/extract-factors.pl
Executable file
19
scripts/generic/extract-factors.pl
Executable file
@ -0,0 +1,19 @@
|
||||
#!/usr/bin/perl -w

# extract-factors.pl: extract only the desired factors from a factored corpus
#
# usage: extract-factors.pl corpusfile factor-index factor-index ... > outfile
#
# Each whitespace-separated token of the corpus is treated as a list of
# factors joined by '|'. Only the factors whose position was requested
# on the command line are written out.
#
# - factor indices start at 0
# - factor indices too large are ignored (they simply never match)
# - selected factors keep their order from the corpus, not the order in
#   which the indices were given; repeated indices have no extra effect
# - a token with none of the requested factors becomes an empty string, so
#   the number of tokens per line is preserved

use strict;
use warnings;

my ($filename, @factors) = @ARGV;
die "usage: $0 corpusfile factor-index [factor-index ...] > outfile\n"
    unless defined $filename;

# Set of requested factor positions, for constant-time membership tests.
my %wanted = map { $_ => 1 } @factors;

# Three-argument open with a lexical handle: the filename is never
# interpreted as containing a mode or pipe character.
open(my $in, '<', $filename)
    or die "couldn't open '$filename' for read: $!\n";

while (my $line = <$in>) {
    # chomp (not chop): only strip a trailing newline, so a final line
    # without one does not lose its last character.
    chomp $line;

    # split(' ') skips leading whitespace, so an indented line does not
    # produce an empty first token (and a spurious leading space).
    my @words = map {
        my $pos = 0;    # position of the current factor within the token
        join('|', grep($wanted{$pos++}, split(/\|/, $_)));
    } split(' ', $line);

    print join(' ', @words), "\n";
}

close($in);
|
14
scripts/generic/lopar2pos.pl
Executable file
14
scripts/generic/lopar2pos.pl
Executable file
@ -0,0 +1,14 @@
|
||||
#!/usr/bin/perl -w

# lopar2pos.pl: extract POSs from LOPAR output
#
# usage: lopar2pos.pl CORPUS.lopar > CORPUS.pos
#
# Each whitespace-separated token is expected to look like WORD_TAG; the
# run of uppercase letters following the first underscore is taken as the
# tag. One output line is written per input line, with tags separated by
# single spaces.
#
# A token that does not match that shape yields an empty tag, so token
# positions stay aligned with the input.

use strict;
use warnings;

my $infilename = shift @ARGV;
die "usage: $0 CORPUS.lopar > CORPUS.pos\n" unless defined $infilename;

# Three-argument open with a lexical handle: the filename is never
# interpreted as containing a mode or pipe character.
open(my $in, '<', $infilename)
    or die "couldn't open '$infilename' for read: $!\n";

while (my $line = <$in>) {
    # split(' ') ignores leading and trailing whitespace (including the
    # newline), so no empty pseudo-token is produced for indented lines.
    my @tags = map {
        # Only use $1 when this token actually matched; otherwise it would
        # be undefined or left over from an earlier successful match.
        m/^[^_]*_([A-Z]+)/ ? $1 : '';
    } split(' ', $line);

    print join(' ', @tags), "\n";
}

close($in);
|
Loading…
Reference in New Issue
Block a user