Implementation of corpus identifier feature

This commit is contained in:
Barry Haddow 2011-10-14 12:44:37 +01:00
parent 235f737c76
commit fc51c94dd2
5 changed files with 30 additions and 6 deletions

View File

@ -362,12 +362,18 @@ build-pairs-to-sids
ignore-unless: sparse-phrase-scripts-dir
default-name: model/pairs-to-sids
template: $sparse-phrase-scripts-dir/assign-pairs-to-sids.perl IN.sid | gzip -c > OUT.gz
get-corpus-sizes
in: CORPUS:clean-split-stem
out: corpus-sizes
default-name: corpus/corpus-sizes
template: $moses-script-dir/training/corpus-sizes.perl $input-extension $output-extension IN > OUT
build-sparse-ttable
in: corpus phrase-translation-table pairs-to-sids
in: corpus phrase-translation-table pairs-to-sids corpus-sizes
out: sparse-phrase-translation-table
default-name: model/phrase-table-sparse
ignore-unless: sparse-phrase-scripts-dir
template: $sparse-phrase-scripts-dir/extract-features.py -f $input-extension -e $output-extension -c IN -t IN1 -p IN2 -x $sparse-phrase-features | gzip -c > OUT.gz
template: $sparse-phrase-scripts-dir/extract-features.py -f $input-extension -e $output-extension -c IN -t IN1 -p IN2 -x $sparse-phrase-features -s IN3 | gzip -c > OUT.gz
rerun-on-change: TRAINING:sparse-phrase-features
build-generation
in: corpus
out: generation-table

View File

@ -15,7 +15,6 @@ if (scalar @PART == 1) {
exit;
}
my $part_id = 0;
foreach my $part (@PART) {
die("ERROR: no part $part.$in or $part.$out")
if (! -e "$part.$in" || ! -e "$part.$out");
@ -25,6 +24,4 @@ foreach my $part (@PART) {
if $in_size != $out_size;
`cat $part.$in >> $consolidated.$in`;
`cat $part.$out >> $consolidated.$out`;
print "$part_id $in_size";
++$part_id;
}

View File

@ -80,6 +80,7 @@ training/compact-rule-table/tools/compactify
training/eppex/counter
training/eppex/eppex
training/mbr/mbr
training/corpus-sizes.perl
training/filter-model-given-input.pl
training/filter-rule-table.py
training/lexical-reordering/score

View File

@ -0,0 +1,16 @@
#!/usr/bin/perl -w
# Derived from consolidate-training-data.perl.
#
# Usage: corpus-sizes.perl IN_EXT OUT_EXT PART [PART ...]
#
# For every PART, checks that PART.IN_EXT and PART.OUT_EXT both exist and
# hold the same number of lines, then prints that number on a line of its
# own to STDOUT (one line per part, in argument order).  Dies on a missing
# file or on a line-count mismatch.
use strict;
use warnings;

my ($in, $out, @PART) = @ARGV;

# Return the number of newline characters in $file -- the same figure
# `wc -l` reports -- without spawning a shell, so file names containing
# spaces or shell metacharacters are handled safely and the result is a
# plain integer with no padding or trailing newline.
sub count_lines {
    my ($file) = @_;
    open(my $fh, '<', $file) or die("ERROR: cannot open '$file': $!");
    binmode($fh);
    local $/ = \65536;    # read fixed-size records instead of whole lines
    my $count = 0;
    while (my $chunk = <$fh>) {
        $count += ($chunk =~ tr/\n//);
    }
    close($fh);
    return $count;
}

foreach my $part (@PART) {
    die("ERROR: no part $part.$in or $part.$out")
        if (! -e "$part.$in" || ! -e "$part.$out");
    my $in_size  = count_lines("$part.$in");
    my $out_size = count_lines("$part.$out");
    die("number of lines don't match: '$part.$in' ($in_size) != '$part.$out' ($out_size)")
        if $in_size != $out_size;
    print "$in_size\n";
}

View File

@ -30,13 +30,14 @@ BOOST_AUTO_TEST_SUITE(fv)
BOOST_AUTO_TEST_CASE(vector_sum_diff)
{
FVector f1,f2;
FVector f1,f2,f3;
FName n1("a");
FName n2("b");
FName n3("c");
FName n4("d");
f1[n1] = 1.2; f1[n2] = 1.4; f1[n3] = -0.1;
f2[n1] = 0.01; f2[n3] = 5.6; f2[n4] = 0.6;
f3[n1] =1.2;
FVector sum = f1 + f2;
FVector diff = f1 - f2;
BOOST_CHECK_CLOSE((FValue)sum[n1], 1.21, TOL);
@ -47,6 +48,9 @@ BOOST_AUTO_TEST_CASE(vector_sum_diff)
BOOST_CHECK_CLOSE((FValue)diff[n2], 1.4, TOL);
BOOST_CHECK_CLOSE((FValue)diff[n3], -5.7, TOL);
BOOST_CHECK_CLOSE((FValue)diff[n4], -0.6, TOL);
f1 -= f3;
cerr << f1 << endl << f3 << endl ;
BOOST_CHECK_CLOSE((FValue)f1[n1],0,TOL);
}