commit 9c27dc08e9

Merge branch 'master' of https://github.com/moses-smt/mosesdecoder

Conflicts:
	moses/FF/ConstrainedDecoding.cpp
@ -1,158 +1,3 @@
PRELIMINARIES

Please see the Moses website on how to compile and run Moses

http://www.statmt.org/moses/?n=Development.GetStarted

Moses is primarily targeted at gcc on UNIX.

Moses requires gcc, Boost >= 1.36, and zlib, including the headers that some
distributions package separately (i.e. -dev or -devel packages). Source is
available at http://boost.org .

There are several optional dependencies:

GIZA++ from http://code.google.com/p/giza-pp/ is used to align words in the
parallel corpus during training.

Moses server requires xmlrpc-c with abyss-server. Source is available from
http://xmlrpc-c.sourceforge.net/.

The scripts support building ARPA format language models with SRILM or
IRSTLM. To apply models inside the decoder, you can use SRILM, IRSTLM, or
KenLM. The ARPA format is exchangeable, so you can, e.g., build a model with
SRILM and run the decoder with IRSTLM or KenLM.

If you want to use SRILM, you will need to download its source and build it.
SRILM can be downloaded from
http://www.speech.sri.com/projects/srilm/download.html .
On x86_64, the default machine type is broken. Edit sbin/machine-type, find
this code

    else if (`uname -m` == x86_64) then
        set MACHINE_TYPE = i686

and change it to

    else if (`uname -m` == x86_64) then
        set MACHINE_TYPE = i686-m64

You may have to chmod +w sbin/machine-type first.

If you want to use IRSTLM, you will need to download its source and build it.
IRSTLM can be downloaded from either the SourceForge website

    http://sourceforge.net/projects/irstlm

or the official IRSTLM website

    http://hlt.fbk.eu/en/irstlm

KenLM is included with Moses.

--------------------------------------------------------------------------

ADVICE ON INSTALLING EXTERNAL LIBRARIES

Generally, if you have trouble installing external libraries, you should get
support directly from the library maker:

    Boost:  http://www.boost.org/doc/libs/release/more/getting_started/unix-variants.html
    IRSTLM: https://list.fbk.eu/sympa/subscribe/user-irstlm
    SRILM:  http://www.speech.sri.com/projects/srilm/#srilm-user

However, here's some general advice on installing software (for bash users):

    #Determine where you want to install packages
    PREFIX=$HOME/usr
    #If your system has lib64 directories, lib64 should be used, NOT lib
    if [ -d /lib64 ]; then
      LIBDIR=$PREFIX/lib64
    else
      LIBDIR=$PREFIX/lib
    fi
    #If you're installing to a non-standard path, tell programs where to find things:
    export PATH=$PREFIX/bin${PATH:+:$PATH}
    export LD_LIBRARY_PATH=$LIBDIR${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
    export LIBRARY_PATH=$LIBDIR${LIBRARY_PATH:+:$LIBRARY_PATH}
    export CPATH=$PREFIX/include${CPATH:+:$CPATH}

Add all the above code to your .bashrc or .bash_login as appropriate. Then
you're ready to install packages in non-standard paths:

    #For autotools packages e.g. xmlrpc-c and zlib
    ./configure --prefix=$PREFIX --libdir=$LIBDIR [other options here]

    #tcmalloc is a malloc implementation with better threaded performance.
    #To see how it improves Moses performance, read
    # http://www.mail-archive.com/moses-support@mit.edu/msg07303.html
    #It is part of gperftools, which can be downloaded from
    # https://code.google.com/p/gperftools/downloads/list
    #configure with this:
    ./configure --prefix=$PREFIX --libdir=$LIBDIR --enable-shared --enable-static --enable-minimal

    #For bzip2:
    wget http://www.bzip.org/1.0.6/bzip2-1.0.6.tar.gz
    tar xzvf bzip2-1.0.6.tar.gz
    cd bzip2-1.0.6/
    #Compile and install libbz2.a (static library)
    make
    make install PREFIX=$PREFIX
    mkdir -p $LIBDIR
    #Note this may be the same file; you can ignore the error
    mv $PREFIX/lib/libbz2.a $LIBDIR 2>/dev/null
    #Compile and install libbz2.so (dynamic library)
    make clean
    make -f Makefile-libbz2_so
    cp libbz2.so.* $LIBDIR
    ln -sf libbz2.so.1.0 $LIBDIR/libbz2.so

    #For Boost:
    ./bootstrap.sh
    ./b2 --prefix=$PWD --libdir=$PWD/lib64 --layout=tagged link=static,shared threading=multi,single install || echo FAILURE

This will put the header files and library files in the current directory,
rather than the system directory.

For most Linux systems, you should replace

    link=static,shared

with

    link=static

so it will only create static libraries. This minimises headaches when
linking with Moses.

To link Moses to your version of Boost:

    ./bjam --with-boost=[boost/path]
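For example, if you built Boost under your home directory (the path below is
only an illustration, not a default):

    ./bjam --with-boost=$HOME/workspace/boost_1_55_0 -j8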
Alternatively, you can run

    ./b2 --prefix=/usr/ --libdir=/usr/lib

to install Boost in the system folders. However, this may overwrite the
Boost that came with your OS and cause problems; it is therefore not
recommended.

--------------------------------------------------------------------------

BUILDING

Building consists of running

    ./bjam [options]

Common options are:

    --with-srilm=/path/to/srilm      compile the decoder with SRILM support
    --with-irstlm=/path/to/irstlm    compile the decoder with IRSTLM support
    -jN                              where N is the number of CPUs

    --with-macports=/path/to/macports    use MacPorts on Mac OS X.

If you leave out /path/to/macports, bjam will use /opt/local as the default.
You don't have to use --with-boost with --with-macports, as it is set
implicitly. Also note that using --with-macports automatically triggers
"using darwin".

Binaries will appear in dist/bin.

You can clean up data from previous builds using

    ./bjam --clean

For further documentation, run

    ./bjam --help
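As a concrete illustration of combining these options (all paths below are
placeholders, not defaults), a build with IRSTLM support on 8 CPUs, against
a locally installed Boost, might look like:

    ./bjam --with-boost=$HOME/usr --with-irstlm=$HOME/usr/irstlm -j8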
--------------------------------------------------------------------------

ALTERNATIVE WAYS TO BUILD ON UNIX AND OTHER PLATFORMS

Microsoft Windows
-----------------
Moses is primarily targeted at gcc on UNIX. Windows users should install
using Cygwin. Outdated instructions can be found here:
http://ssli.ee.washington.edu/people/amittai/Moses-on-Win7.pdf .

Binaries for all needed external libraries can be downloaded from
http://www.statmt.org/moses/?n=Moses.LibrariesUsed

Only the decoder is developed and tested under Windows. There are
difficulties using the training scripts under Windows, even with Cygwin,
but it can be done.
29	contrib/mert-sge-nosync/README	Normal file
@ -0,0 +1,29 @@
MERT-sge-nosync
Raymond Ng, University of Sheffield.
April 2014.

The parallel MERT tuning scripts in Moses cannot run in the SGE "no-sync"
mode, where job submission is done in one go, after which the user can go
offline and leave SGE to manage the whole process. The scripts provided here
implement parallel MERT in SGE no-sync mode. You will need SSH support in
Perl (step 2) and public ssh keys set up between the running hosts and the
submission host (step 3).

1. Untar the scripts in ${MOSES}/scripts/

2. Download and install the OpenSSH packages for Perl (a typical build
   sequence is sketched at the end of this README):
   Net-OpenSSH-Compat (http://search.cpan.org/CPAN/authors/id/S/SA/SALVA/Net-OpenSSH-Compat-0.06.tar.gz)
   $ cd Net-OpenSSH-Compat-0.06
   $ perl Makefile.PL
   (You may run into dependency issues and need to install IO-Tty and
   Net-OpenSSH first:)
   IO-Tty-1.10 (http://search.cpan.org/CPAN/authors/id/T/TO/TODDR/IO-Tty-1.10.tar.gz)
   Net-OpenSSH-0.60 (http://search.cpan.org/CPAN/authors/id/S/SA/SALVA/Net-OpenSSH-0.60.tar.gz)

3. Set up public ssh keys for accessing the submit host (the machine from
   which qsub is executed) from the running hosts (the machines which
   actually run the scripts); see also the sketch at the end of this README:
   http://www.linuxproblem.org/art_9.html

4. Run parallel MERT with:
   nohup nice ${MOSES}/scripts/training/mert-moses-sge-nosync.pl ${TM_DEVTEXT_SOURCE} \
     --threads 20 --jobs 20 \
     --queue-flags='-q normal.q -P project' \
     --submithost='squeal' \
     ${TM_DEVTEXT_TARGET} ${MOSES}/dist/bin/moses ${WORKINGDIR}/train/model/moses.ini \
     --mertdir ${MOSES}/dist/bin/ >& ${WORKINGDIR}/mert.out

(tested with a Moses version built in April 2012)
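The following sketches expand steps 2 and 3 above. They assume the standard
ExtUtils::MakeMaker workflow and the stock OpenSSH client tools; host and
user names are examples only, not part of the original instructions.

Building the Perl modules (step 2), repeated for IO-Tty, Net-OpenSSH and
Net-OpenSSH-Compat in that order:

    tar xzf Net-OpenSSH-Compat-0.06.tar.gz
    cd Net-OpenSSH-Compat-0.06
    perl Makefile.PL    # generates the Makefile
    make
    make test           # optional sanity check
    make install        # may need sudo, or PREFIX=... for a local install

Setting up passwordless ssh from a running host to the submit host (step 3):

    ssh-keygen -t rsa                 # accept the defaults
    ssh-copy-id user@submithost       # appends your public key remotely
    ssh user@submithost 'qstat'       # verify no password is asked for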
1388	contrib/mert-sge-nosync/generic/moses-parallel-sge-nosync.pl	Executable file
File diff suppressed because it is too large.
312	contrib/mert-sge-nosync/generic/qsub-wrapper-exit-sge-nosync.pl	Executable file
@ -0,0 +1,312 @@
#! /usr/bin/perl

# $Id$
use strict;
use Net::OpenSSH::Compat::Perl;
#######################
# Default parameters
# parameters for submitting processes through SGE
# NOTE: group name is ws06ossmt (with 2 's') and not ws06osmt (with 1 's')
my $queueparameters="";

# look for the correct pwdcmd
my $pwdcmd = getPwdCmd();

my $workingdir = `$pwdcmd`; chomp $workingdir;
# my $tmpdir="$workingdir/tmp$$";
# my $jobscript="$workingdir/job$$.sh";
# my $qsubout="$workingdir/out.job$$";
# my $qsuberr="$workingdir/err.job$$";


$SIG{INT} = \&kill_all_and_quit; # catch exception for CTRL-C

my $submithost="";
my $help="";
my $dbg="";
my $version="";
my $qsubname="WR$$";
my $cmd="";
my $cmdout=undef;
my $cmderr=undef;
my $jid=0;
my $jidfile=undef;
my $pid=0;
my $pidfile=undef;
my $prevjid=undef;
my $parameters="";
my $old_sge = 0; # assume grid engine < 6.0
my $prevjidarraysize = 0;
my $force_delete = 0;
my @prevjidarray = ();

sub init(){
  use Getopt::Long qw(:config pass_through);
  GetOptions('version'=>\$version,
             'help'=>\$help,
             'debug'=>\$dbg,
             'submithost=s'=> \$submithost,
             'qsub-prefix=s'=> \$qsubname,
             'stdout=s'=> \$cmdout,
             'stderr=s'=> \$cmderr,
             'jidfile=s'=> \$jidfile,
             'pidfile=s'=> \$pidfile, # process id of the target job
             'prevjid=s'=> \$prevjid,
             # accept both spellings; the usage text documents -queue-parameters
             'queue-parameters|queue-parameter=s'=> \$queueparameters,
             'force-delete=i' => \$force_delete,
             'old-sge' => \$old_sge,
            ) or exit(1);
  $parameters="@ARGV";

  # read $pid from file
  chomp($pid=`tail -n 1 $pidfile`);
  # print "PID=+$pidfile+\n";

  if (defined $jidfile) {
    chomp($jid=`tail -n 1 $jidfile`);
  }

  # print STDERR "INPUT prevjid =+$prevjid+\n";
  $prevjid =~ s/^\s+|\s+$//g;
  # print STDERR "TRIMMED prevjid =+$prevjid+\n";

  @prevjidarray = split(/\s+/,$prevjid);
  $prevjidarraysize = scalar(@prevjidarray);

  # print STDERR "arraysize: $prevjidarraysize\n";

  version() if $version;
  usage() if $help;
  print_parameters() if $dbg;
}

#######################
## print version
sub version(){
  # print STDERR "version 1.0 (29-07-2006)\n";
  print STDERR "version 1.1 (31-07-2006)\n";
  exit(1);
}

# usage
sub usage(){
  print STDERR "qsub-wrapper.pl [options]\n";
  print STDERR "Options:\n";
  print STDERR "-stdout <file>             file to find stdout from target cmd (optional)\n";
  print STDERR "-stderr <file>             file to find stderr from target cmd (optional)\n";
  print STDERR "-jidfile <file>            file to find the submitted job id\n";
  print STDERR "-pidfile <file>            file to find the process id of the target job for deletion\n";
  print STDERR "-prevjid <id>              wait for the previous job with jobid=id to finish before starting (optional)\n";
  print STDERR "-force-delete 1            force-delete without checking\n";
  print STDERR "-qsub-prefix <string>      name for submitted jobs (optional)\n";
  print STDERR "-queue-parameters <string> parameters for the queue (optional)\n";
  print STDERR "-old-sge                   assume Sun Grid Engine < 6.0\n";
  print STDERR "-debug                     debug\n";
  print STDERR "-version                   print version of the script\n";
  print STDERR "-help                      this help\n";
  exit(1);
}

# print parameters
sub print_parameters(){
  # print STDERR "command: $cmd\n";
  if (defined($cmdout)){ print STDERR "file for stdout: $cmdout\n"; }
  else { print STDERR "file for stdout is not defined, stdout is discarded\n"; }
  if (defined($cmderr)){ print STDERR "file for stderr: $cmderr\n"; } # was mislabelled "stdout"
  else { print STDERR "file for stderr is not defined, stderr is discarded\n"; }
  if (defined($jidfile)){ print STDERR "file for submit job id: $jidfile\n"; }
  else { print STDERR "file for submit job id is not defined, jidfile is discarded\n"; }
  print STDERR "Qsub name: $qsubname\n";
  print STDERR "Queue parameters: $queueparameters\n";
  print STDERR "parameters directly passed to cmd: $parameters\n";
  exit(1);
}


#######################
# Script starts here

init();

my $tmpdir="$workingdir/tmp$pid";
my $jobscript="$workingdir/job$pid.sh";
my $qsubout="$workingdir/out.job$pid";
my $qsuberr="$workingdir/err.job$pid";


### The block below is dead code inherited from the submission wrapper
### (qsub-wrapper-sge-nosync.pl); it is kept commented out for reference.
###
### usage() if $cmd eq "";
###
### safesystem("mkdir -p $tmpdir") or die;
###
### preparing_script();
###
#### my $maysync = $old_sge ? "" : "-sync y";
#### never run in sync mode
###my $maysync = "";
###
###my $qsubcmd = "";
#### create the qsubcmd to submit to the queue with the parameter "-b yes"
####my $qsubcmd="qsub $queueparameters $maysync -V -o $qsubout -e $qsuberr -N $qsubname -b yes $jobscript > $jobscript.log 2>&1";
###
#### add -b yes if not yet defined, otherwise leave empty
###$queueparameters .= " -b yes " if (index($queueparameters," -b ")==-1);
###
###
###if (defined $prevjid && $prevjid!=-1 && $prevjidarraysize == 1) {
###  $qsubcmd="qsub $queueparameters $maysync -V -hold_jid $prevjid -o $qsubout -e $qsuberr -N $qsubname $jobscript > $jobscript.log 2>&1";
###} elsif (defined $prevjid && $prevjidarraysize > 1) {
###  my $hj = "-hold_jid " . join(" -hold_jid ", @prevjidarray);
###  # print STDERR "hj is $hj\n";
###  $qsubcmd="qsub $queueparameters $maysync -V $hj -o $qsubout -e $qsuberr -N $qsubname $jobscript > $jobscript.log 2>&1";
###} else {
###  $qsubcmd="qsub $queueparameters $maysync -V -o $qsubout -e $qsuberr -N $qsubname $jobscript > $jobscript.log 2>&1";
###}
###
###print "submitting $qsubcmd\n";
###
####run the qsubcmd
###
###safesystem($qsubcmd) or die;
###
####getting id of submitted job#############
###my $res;
###open (IN,"$jobscript.log") or die "Can't read main job id: $jobscript.log";
###chomp($res=<IN>);
###my @arrayStr = split(/\s+/,$res);
###my $id=$arrayStr[2];
###die "Failed to get job id from $jobscript.log, got: $res"
###  if $id !~ /^[0-9]+$/;
###close(IN);
############################################
###print STDERR " res:$res\n";
###print STDERR " id:$id\n";
###
###open (JIDOUT,">$jidfile") or die "Can't open jid file to write";
###print JIDOUT "$id\n";
###close(JIDOUT);
###
###open (JOBNUMOUT,">$jidfile.job") or die "Can't open id.job file to write";
###print JOBNUMOUT "$$\n";
###close(JOBNUMOUT);
###
###if ($old_sge) {
###  # need to workaround -sync, add another job that will wait for the main one
###  # prepare a fake waiting script
###  my $syncscript = "$jobscript.sync_workaround_script.sh";
###  safesystem("echo 'date' > $syncscript") or die;
###
###  my $checkpointfile = "$jobscript.sync_workaround_checkpoint";
###
###  # ensure checkpoint does not exist
###  safesystem("\\rm -f $checkpointfile") or die;
###
###  # start the 'hold' job, i.e. the job that will wait
###  $cmd="qsub -cwd $queueparameters -hold_jid $id -o $checkpointfile -e /dev/null -N $qsubname.W $syncscript >& $qsubname.W.log";
###  safesystem($cmd) or die;
###
###  # and wait for checkpoint file to appear
###  my $nr=0;
###  while (!-e $checkpointfile) {
###    sleep(10);
###    $nr++;
###    print STDERR "w" if $nr % 3 == 0;
###  }
###  safesystem("\\rm -f $checkpointfile $syncscript") or die();
###  print STDERR "End of waiting workaround.\n";
###}


my $failure=0;

if (!$force_delete) {
  $failure=&check_exit_status();
  print STDERR "check_exit_status returned $failure\n";
}

# with -force-delete 1 the job is deleted without checking its exit status,
# as documented in usage() (the original only killed on a detected failure)
&kill_all_and_quit() if $failure || $force_delete;

&remove_temporary_files() if !$dbg;


sub check_exit_status(){
  my $failure=0;

  print STDERR "check_exit_status of submitted job $jid from file $qsubout\n";
  open(IN,"$qsubout") or die "Can't read $qsubout";
  while (<IN>){
    $failure=1 if (/failed with exit status/);
  }
  close(IN);
  return $failure;
}

sub kill_all_and_quit(){
  my $my_username = undef;

  # chomp($my_username = `whoami`);
  #
  # my $ssh = Net::OpenSSH::Compat::Perl->new($submithost, debug=>0);
  #
  # $ssh->login("$my_username",`cat /home/$my_username/accpw`);

  # connect back to the submit host (public-key login) and qdel the job there
  my $ssh = Net::OpenSSH::Compat::Perl->new($submithost, debug=>0);

  $ssh->login();

  print STDERR "kill_all_and_quit\n";
  print STDERR "qdel $jid\n";
  # safesystem("qdel $jid");
  $ssh->cmd("qdel $jid");

  print STDERR "The submitted jobs did not terminate correctly\n";
  print STDERR "Sent qdel to the submitted jobs\n";

  exit(1);
}

sub remove_temporary_files(){
  # removing temporary files
  unlink("${jobscript}");
  unlink("${jobscript}.log");
  unlink("$qsubout");
  unlink("$qsuberr");
  rmdir("$tmpdir");
}

sub safesystem {
  print STDERR "Executing: @_\n";
  system(@_);
  if ($? == -1) {
    print STDERR "Failed to execute: @_\n  $!\n";
    exit(1);
  }
  elsif ($? & 127) {
    printf STDERR "Execution of: @_\n  died with signal %d, %s coredump\n",
      ($? & 127), ($? & 128) ? 'with' : 'without';
    exit(1);
  }
  else {
    my $exitcode = $? >> 8;
    print STDERR "Exit code: $exitcode\n" if $exitcode;
    return ! $exitcode;
  }
}

# look for the correct pwdcmd (pwd by default, pawd if it exists)
# I assume that pwd always exists
sub getPwdCmd(){
  my $pwdcmd="pwd";
  my $a;
  chomp($a=`which pawd | head -1 | awk '{print \$1}'`); # \$1 so awk, not Perl, interpolates it
  if ($a && -e $a){ $pwdcmd=$a; }
  return $pwdcmd;
}
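A hypothetical invocation of this exit wrapper, based only on the options
documented in its usage() above (file names, host name and queue flags are
illustrative):

    perl qsub-wrapper-exit-sge-nosync.pl \
        -jidfile run1.id -pidfile run1.id.pid \
        -submithost squeal -queue-parameters '-q normal.q' \
        -force-delete 1

With -force-delete 1 the wrapper skips the exit-status check and qdels the
job recorded in run1.id by ssh-ing back to the submit host.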
320	contrib/mert-sge-nosync/generic/qsub-wrapper-sge-nosync.pl	Executable file
@ -0,0 +1,320 @@
#! /usr/bin/perl

# $Id$
use strict;

#######################
# Default parameters
# parameters for submitting processes through SGE
# NOTE: group name is ws06ossmt (with 2 's') and not ws06osmt (with 1 's')
my $queueparameters="";

# look for the correct pwdcmd
my $pwdcmd = getPwdCmd();

my $workingdir = `$pwdcmd`; chomp $workingdir;

# unique id: PID plus nanosecond timestamp
my $uniqtime = `date +"%s%N"`; chomp $uniqtime;
my $uid = "$$".".".$uniqtime;


my $tmpdir="$workingdir/tmp$uid";
my $jobscript="$workingdir/job$uid.sh";
my $qsubout="$workingdir/out.job$uid";
my $qsuberr="$workingdir/err.job$uid";


$SIG{INT} = \&kill_all_and_quit; # catch exception for CTRL-C

my $help="";
my $dbg="";
my $version="";
my $qsubname="WR$uid";
my $cmd="";
my $cmdout=undef;
my $cmderr=undef;
my $jidfile=undef;
my $pidfile=undef;
my $prevjid=undef;
my $parameters="";
my $old_sge = 0; # assume grid engine < 6.0
my $prevjidarraysize = 0;
my @prevjidarray = ();

sub init(){
  use Getopt::Long qw(:config pass_through);
  GetOptions('version'=>\$version,
             'help'=>\$help,
             'debug'=>\$dbg,
             'qsub-prefix=s'=> \$qsubname,
             'command=s'=> \$cmd,
             'stdout=s'=> \$cmdout,
             'stderr=s'=> \$cmderr,
             'jidfile=s'=> \$jidfile,
             'prevjid=s'=> \$prevjid,
             'queue-parameters=s'=> \$queueparameters,
             'old-sge' => \$old_sge,
            ) or exit(1);
  $parameters="@ARGV";

  # print STDERR "INPUT prevjid =+$prevjid+\n";
  $prevjid =~ s/^\s+|\s+$//g;
  # print STDERR "TRIMMED prevjid =+$prevjid+\n";

  @prevjidarray = split(/\s+/,$prevjid);
  $prevjidarraysize = scalar(@prevjidarray);

  # print STDERR "arraysize: $prevjidarraysize\n";

  version() if $version;
  usage() if $help;
  print_parameters() if $dbg;
}

#######################
## print version
sub version(){
  # print STDERR "version 1.0 (29-07-2006)\n";
  print STDERR "version 1.1 (31-07-2006)\n";
  exit(1);
}

# usage
sub usage(){
  print STDERR "qsub-wrapper.pl [options]\n";
  print STDERR "Options:\n";
  print STDERR "-command <file>            command to run\n";
  print STDERR "-stdout <file>             file to save stdout of cmd (optional)\n";
  print STDERR "-stderr <file>             file to save stderr of cmd (optional)\n";
  print STDERR "-jidfile <file>            file to save the submitted job id\n";
  print STDERR "-prevjid <id>              wait for the previous job with jobid=id to finish before starting (optional)\n";
  print STDERR "-qsub-prefix <string>      name for submitted jobs (optional)\n";
  print STDERR "-queue-parameters <string> parameters for the queue (optional)\n";
  print STDERR "-old-sge                   assume Sun Grid Engine < 6.0\n";
  print STDERR "-debug                     debug\n";
  print STDERR "-version                   print version of the script\n";
  print STDERR "-help                      this help\n";
  exit(1);
}

# print parameters
sub print_parameters(){
  print STDERR "command: $cmd\n";
  if (defined($cmdout)){ print STDERR "file for stdout: $cmdout\n"; }
  else { print STDERR "file for stdout is not defined, stdout is discarded\n"; }
  if (defined($cmderr)){ print STDERR "file for stderr: $cmderr\n"; } # was mislabelled "stdout"
  else { print STDERR "file for stderr is not defined, stderr is discarded\n"; }
  if (defined($jidfile)){ print STDERR "file for submit job id: $jidfile\n"; }
  else { print STDERR "file for submit job id is not defined, jidfile is discarded\n"; }
  print STDERR "Qsub name: $qsubname\n";
  print STDERR "Queue parameters: $queueparameters\n";
  print STDERR "parameters directly passed to cmd: $parameters\n";
  exit(1);
}

# script creation
sub preparing_script(){
  my $scriptheader="\#\!/bin/bash\n# the above line is ignored by qsub, unless parameter \"-b yes\" is set!\n\n";
  $scriptheader.="uname -a\n\n";

  $scriptheader.="cd $workingdir\n\n";

  open (OUT, "> $jobscript");
  print OUT $scriptheader;

  print OUT "if $cmd $parameters > $tmpdir/cmdout$uid 2> $tmpdir/cmderr$uid ; then
  echo 'succeeded'
else
  echo failed with exit status \$\?
  die=1
fi
";

  if (defined $cmdout){
    print OUT "mv -f $tmpdir/cmdout$uid $cmdout || echo failed to preserve the log: $tmpdir/cmdout$uid\n\n";
  }
  else{
    print OUT "rm -f $tmpdir/cmdout$uid\n\n";
  }

  if (defined $cmderr){
    print OUT "mv -f $tmpdir/cmderr$uid $cmderr || echo failed to preserve the log: $tmpdir/cmderr$uid\n\n";
  }
  else{
    print OUT "rm -f $tmpdir/cmderr$uid\n\n";
  }
  # both sides prefixed with 'x' so the test also works when $die is unset
  # (the original compared x$die against plain 1, which never matched)
  print OUT "if [ x\$die == x1 ]; then exit 1; fi\n";
  close(OUT);

  # setting permissions of the script
  chmod(oct(755),$jobscript);
}

#######################
# Script starts here

init();

usage() if $cmd eq "";

safesystem("mkdir -p $tmpdir") or die;

preparing_script();

# my $maysync = $old_sge ? "" : "-sync y";
# never run in sync mode
my $maysync = "";

my $qsubcmd = "";
# create the qsubcmd to submit to the queue with the parameter "-b yes"
#my $qsubcmd="qsub $queueparameters $maysync -V -o $qsubout -e $qsuberr -N $qsubname -b yes $jobscript > $jobscript.log 2>&1";

# add -b yes if not yet defined, otherwise leave empty
$queueparameters .= " -b yes " if (index($queueparameters," -b ")==-1);


if (defined $prevjid && $prevjid!=-1 && $prevjidarraysize == 1) {
  $qsubcmd="qsub $queueparameters $maysync -V -hold_jid $prevjid -o $qsubout -e $qsuberr -N $qsubname $jobscript > $jobscript.log 2>&1";
} elsif (defined $prevjid && $prevjidarraysize > 1) {
  my $hj = "-hold_jid " . join(" -hold_jid ", @prevjidarray);
  # print STDERR "hj is $hj\n";
  $qsubcmd="qsub $queueparameters $maysync -V $hj -o $qsubout -e $qsuberr -N $qsubname $jobscript > $jobscript.log 2>&1";
} else {
  $qsubcmd="qsub $queueparameters $maysync -V -o $qsubout -e $qsuberr -N $qsubname $jobscript > $jobscript.log 2>&1";
}

print "submitting $qsubcmd\n";

# run the qsubcmd

safesystem($qsubcmd) or die;

# getting id of submitted job #############
my $res;
open (IN,"$jobscript.log") or die "Can't read main job id: $jobscript.log";
chomp($res=<IN>);
my @arrayStr = split(/\s+/,$res);
my $id=$arrayStr[2];
die "Failed to get job id from $jobscript.log, got: $res"
  if $id !~ /^[0-9]+$/;
close(IN);
#########################################
print STDERR " res:$res\n";
print STDERR " id:$id\n";

open (JIDOUT,">$jidfile") or die "Can't open jid file to write";
print JIDOUT "$id\n";
close(JIDOUT);

open (PIDOUT,">$jidfile.pid") or die "Can't open id.pid file to write";
print PIDOUT "$uid\n";
close(PIDOUT);

if ($old_sge) {
  # need to workaround -sync, add another job that will wait for the main one
  # prepare a fake waiting script
  my $syncscript = "$jobscript.sync_workaround_script.sh";
  safesystem("echo 'date' > $syncscript") or die;

  my $checkpointfile = "$jobscript.sync_workaround_checkpoint";

  # ensure checkpoint does not exist
  safesystem("\\rm -f $checkpointfile") or die;

  # start the 'hold' job, i.e. the job that will wait
  $cmd="qsub -cwd $queueparameters -hold_jid $id -o $checkpointfile -e /dev/null -N $qsubname.W $syncscript >& $qsubname.W.log";
  safesystem($cmd) or die;

  # and wait for checkpoint file to appear
  my $nr=0;
  while (!-e $checkpointfile) {
    sleep(10);
    $nr++;
    print STDERR "w" if $nr % 3 == 0;
  }
  safesystem("\\rm -f $checkpointfile $syncscript") or die();
  print STDERR "End of waiting workaround.\n";
}


# In nosync mode the exit-status check and cleanup are deferred to
# qsub-wrapper-exit-sge-nosync.pl, so the calls below stay disabled:
# my $failure=&check_exit_status();
# print STDERR "check_exit_status returned $failure\n";

# &kill_all_and_quit() if $failure;

# &remove_temporary_files() if !$dbg;


sub check_exit_status(){
  my $failure=0;

  print STDERR "check_exit_status of submitted job $id\n";
  open(IN,"$qsubout") or die "Can't read $qsubout";
  while (<IN>){
    $failure=1 if (/failed with exit status/);
  }
  close(IN);
  return $failure;
}

sub kill_all_and_quit(){
  print STDERR "kill_all_and_quit\n";
  print STDERR "qdel $id\n";
  safesystem("qdel $id");

  print STDERR "The submitted jobs did not terminate correctly\n";
  print STDERR "Sent qdel to the submitted jobs\n";

  exit(1);
}

sub remove_temporary_files(){
  # removing temporary files
  unlink("${jobscript}");
  unlink("${jobscript}.log");
  unlink("$qsubout");
  unlink("$qsuberr");
  rmdir("$tmpdir");
}

sub safesystem {
  print STDERR "Executing: @_\n";
  system(@_);
  if ($? == -1) {
    print STDERR "Failed to execute: @_\n  $!\n";
    exit(1);
  }
  elsif ($? & 127) {
    printf STDERR "Execution of: @_\n  died with signal %d, %s coredump\n",
      ($? & 127), ($? & 128) ? 'with' : 'without';
    exit(1);
  }
  else {
    my $exitcode = $? >> 8;
    print STDERR "Exit code: $exitcode\n" if $exitcode;
    return ! $exitcode;
  }
}

# look for the correct pwdcmd (pwd by default, pawd if it exists)
# I assume that pwd always exists
sub getPwdCmd(){
  my $pwdcmd="pwd";
  my $a;
  chomp($a=`which pawd | head -1 | awk '{print \$1}'`); # \$1 so awk, not Perl, interpolates it
  if ($a && -e $a){ $pwdcmd=$a; }
  return $pwdcmd;
}
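A hypothetical invocation of the submission wrapper, based on its usage()
above (the decode script, file names and job id are illustrative):

    perl qsub-wrapper-sge-nosync.pl \
        -command ./run-decoder.sh -qsub-prefix MERT1 \
        -stdout run1.out -stderr run1.err \
        -jidfile run1.id -prevjid 12345 \
        -queue-parameters '-q normal.q -P project'

This submits run-decoder.sh as job MERT1, held with -hold_jid until job
12345 finishes, and writes the assigned SGE job id to run1.id (and the
wrapper's unique id to run1.id.pid) for the exit wrapper to pick up later.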
1633	contrib/mert-sge-nosync/training/mert-moses-sge-nosync.pl	Executable file
File diff suppressed because it is too large.
64	contrib/mert-sge-nosync/training/sge-nosync/cleartmpfiles.pl	Executable file
@ -0,0 +1,64 @@
#!/usr/bin/perl

# merge the per-job bookkeeping files left behind by the nosync scripts
# into all.*.all archives, then delete the originals

my @filename_id = ();
my $this_id = "";

# remove exitjob and forceexitjob scripts
chomp(my @rddfile_list = `ls exitjob* forceexitjob*`);
foreach my $rddfile (@rddfile_list) {
  unlink("$rddfile");
}

chomp(@filename_id = `ls *.id | grep -v 'clear'`);
open (OUT, "> all.id.all");
print OUT "==Combine log at ".`date`;
print OUT `tail -n +1 *.id`;
print OUT "==Log combined ".`date`;
close(OUT);
foreach $this_id (@filename_id) {
  # print OUT `cat $this_id`;
  unlink("$this_id");
}

chomp(@filename_id = `ls *.id.pid | grep -v 'clear'`);
open (OUT, "> all.id.pid.all");
print OUT "==Combine log at ".`date`;
print OUT `tail -n +1 *.id.pid`;
print OUT "==Log combined ".`date`;
close(OUT);
foreach $this_id (@filename_id) {
  # print OUT `cat $this_id`;
  unlink("$this_id");
}


chomp(@filename_id = `ls *.out | grep -v 'clear'`);
open (OUT, "> all.out.all");
print OUT "==Combine log at ".`date`;
print OUT `tail -n +1 *.out`;
print OUT "==Log combined ".`date`;
close(OUT);
foreach $this_id (@filename_id) {
  # print OUT `cat $this_id`;
  unlink("$this_id");
}

chomp(@filename_id = `ls *.err | grep -v 'clear'`);
open (OUT, "> all.err.all");
print OUT "==Combine log at ".`date`;
print OUT `tail -n +1 *.err`;
print OUT "==Log combined ".`date`;
close(OUT);
foreach $this_id (@filename_id) {
  # print OUT `cat $this_id`;
  unlink("$this_id");
}

# waitall.sh scripts which cannot be deleted inside moses-parallel-sge-nosync.pl
chomp(@filename_id = `ls *waitall.sh`);
foreach $this_id (@filename_id) {
  unlink("$this_id");
}
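This cleanup script takes no arguments and operates on the current
directory; a typical call from the MERT working directory might be (the
installation path reflects the untar location suggested in the README and
is only an assumption):

    cd ${WORKINGDIR}/mert
    perl ${MOSES}/scripts/training/sge-nosync/cleartmpfiles.pl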
426	contrib/mert-sge-nosync/training/sge-nosync/create-config-sge-nosync.pl	Executable file
@ -0,0 +1,426 @@
#!/usr/bin/perl -w

# $Id$
# after filter-mode-given-input.pl, process the feature list

# original code by Philipp Koehn
# changes by Ondrej Bojar
# adapted for hierarchical models by Phil Williams

use strict;

use FindBin qw($Bin);
use File::Basename; # for dirname() below
use Getopt::Long;



my $SCRIPTS_ROOTDIR;
if (defined($ENV{"SCRIPTS_ROOTDIR"})) {
  $SCRIPTS_ROOTDIR = $ENV{"SCRIPTS_ROOTDIR"};
} else {
  $SCRIPTS_ROOTDIR = $Bin;
  if ($SCRIPTS_ROOTDIR eq '') {
    $SCRIPTS_ROOTDIR = dirname(__FILE__);
  }
  $SCRIPTS_ROOTDIR =~ s/\/training$//;
  $ENV{"SCRIPTS_ROOTDIR"} = $SCRIPTS_ROOTDIR;
}


# moses.ini file uses FULL names for lambdas, while this training script
# internally (and on the command line) uses ABBR names.
my @ABBR_FULL_MAP = qw(d=weight-d lm=weight-l tm=weight-t w=weight-w
  g=weight-generation lex=weight-lex I=weight-i);
my %ABBR2FULL = map {split/=/,$_,2} @ABBR_FULL_MAP;
my %FULL2ABBR = map {my ($a, $b) = split/=/,$_,2; ($b, $a);} @ABBR_FULL_MAP;


my $verbose = 0;
my $usage = 0; # request for --help


##!# # consider phrases in input up to $MAX_LENGTH
##!# # in other words, all phrase-tables will be truncated at least to 10 words per
##!# # phrase.
##!# my $MAX_LENGTH = 10;

# utilities
##!# my $ZCAT = "gzip -cd";

# get optional parameters
##!# my $opt_hierarchical = 0;
##!# my $binarizer = undef;
##!# my $opt_min_non_initial_rule_count = undef;
##!# my $opt_gzip = 1; # gzip output files (so far only phrase-based ttable until someone tests remaining models and formats)

my $___RANGES = undef;
my $___ACTIVATE_FEATURES = undef; # comma-separated (or blank-separated) list of features to work on
                                  # if undef, work on all features
                                  # (others are fixed to the starting values)
my $___DECODER_FLAGS = ""; # additional parameters to pass to the decoder

my $devbleu = undef;
my $___WORKING_DIR = undef;
my $___DEV_F = undef;
my $run = undef; # either first or final
my $runid_final = undef;
my $runid_finalplus=0;
my $sparse_weights_file = undef;


# set 0 if input type is text, set 1 if input type is confusion network
my $___INPUTTYPE = 0;

my $___DECODER = undef; # required, pathname to the decoder executable
my $___CONFIG = undef;  # required, pathname to startup ini file


GetOptions(
  "activate-features=s" => \$___ACTIVATE_FEATURES, # comma-separated (or blank-separated) list of features to work on (others are fixed to the starting values)
  "range=s@" => \$___RANGES,
  "decoder-flags=s" => \$___DECODER_FLAGS,
  "inputtype=i" => \$___INPUTTYPE,
  "devbleu=s" => \$devbleu,
  "sparse_weight_file=s" => \$sparse_weights_file,
  "working-dir=s" => \$___WORKING_DIR,
) or exit(1);

##!# GetOptions(
##!#   "gzip!" => \$opt_gzip,
##!#   "Hierarchical" => \$opt_hierarchical,
##!#   "Binarizer=s" => \$binarizer,
##!#   "MinNonInitialRuleCount=i" => \$opt_min_non_initial_rule_count
##!# ) or exit(1);


# the 4 required parameters are supplied positionally on the command line
if (scalar @ARGV == 4) {
  # required parameters
  $___DEV_F = shift;
  $___DECODER = shift;
  $___CONFIG = shift;
  $run = shift; # first or final
}

if ($usage || !defined $___DECODER || !defined $___CONFIG) {
  print STDERR "usage: $0 [options] \$___DEV_F \$___DECODER \$___CONFIG(decoder.ini) first|final
Options:
  --activate-features=STRING ... comma-separated list of features to optimize,
      others are fixed to the starting values
      default: optimize all features
      example: tm_0,tm_4,d_0
  --range=tm:0..1,-1..1 ... specify min and max value for some features
      --range can be repeated as needed.
      The order of the various --range specifications
      is important only within a feature name.
      E.g.:
        --range=tm:0..1,-1..1 --range=tm:0..2
      is identical to:
        --range=tm:0..1,-1..1,0..2
      but not to:
        --range=tm:0..2 --range=tm:0..1,-1..1
  --decoder-flags=STRING ... extra parameters for the decoder
  --inputtype=[0|1|2] ... handle different input types (0 for text,
      1 for confusion network, 2 for lattices; default is 0)
";
  exit 1;
}


##!# # get command line parameters
##!# my $dir = shift;
##!# my $config = shift;
##!# my $input = shift;

##!# $dir = ensure_full_path($dir);

############################################################
############################################################
############################################################

# main

# we run moses to check validity of moses.ini and to obtain all the feature
# names

if ($run eq "first") {
  my $featlist = get_featlist_from_moses($___CONFIG,$___CONFIG,"first");
  $featlist = insert_ranges_to_featlist($featlist, $___RANGES);
  create_config($___CONFIG,"$___WORKING_DIR/run1.moses.ini",$featlist,1,(defined $devbleu ? $devbleu : "--not-estimated--"),$sparse_weights_file);
} else { # $run eq "final"
  chomp ($runid_final = `cat $___WORKING_DIR/finished_step.txt | tail -n 1`);
  $runid_finalplus = $runid_final + 1;
  `mv run${runid_finalplus}.moses.ini run_final.moses.ini`;
  chomp ($devbleu = `cat $___WORKING_DIR/run_final.moses.ini | tail -n +3 | head -n 1 | gawk '{print \$3}'`);
  my $featlist = get_featlist_from_moses($___CONFIG,"$___WORKING_DIR/run_final.moses.ini","final");
  $featlist = insert_ranges_to_featlist($featlist, $___RANGES);
  create_config($___CONFIG,"$___WORKING_DIR/moses.ini",$featlist,$runid_finalplus,$devbleu,$sparse_weights_file);
}

##COPIED## Mark which features are disabled:
##COPIED#if (defined $___ACTIVATE_FEATURES) {
##COPIED#  my %enabled = map { ($_, 1) } split /[, ]+/, $___ACTIVATE_FEATURES;
##COPIED#  my %cnt;
##COPIED#  for(my $i=0; $i<scalar(@{$featlist->{"names"}}); $i++) {
##COPIED#    my $name = $featlist->{"names"}->[$i];
##COPIED#    $cnt{$name} = 0 if !defined $cnt{$name};
##COPIED#    $featlist->{"enabled"}->[$i] = $enabled{$name."_".$cnt{$name}};
##COPIED#    $cnt{$name}++;
##COPIED#  }
##COPIED#} else {
##COPIED#  # all enabled
##COPIED#  for(my $i=0; $i<scalar(@{$featlist->{"names"}}); $i++) {
##COPIED#    $featlist->{"enabled"}->[$i] = 1;
##COPIED#  }
##COPIED#}
##COPIED#
##COPIED#print STDERR "MERT starting values and ranges for random generation:\n";
##COPIED#for(my $i=0; $i<scalar(@{$featlist->{"names"}}); $i++) {
##COPIED#  my $name = $featlist->{"names"}->[$i];
##COPIED#  my $val = $featlist->{"values"}->[$i];
##COPIED#  my $min = $featlist->{"mins"}->[$i];
##COPIED#  my $max = $featlist->{"maxs"}->[$i];
##COPIED#  my $enabled = $featlist->{"enabled"}->[$i];
##COPIED#  printf STDERR " %5s = %7.3f", $name, $val;
##COPIED#  if ($enabled) {
##COPIED#    printf STDERR " (%5.2f .. %5.2f)\n", $min, $max;
##COPIED#  } else {
##COPIED#    print STDERR " --- inactive, not optimized ---\n";
##COPIED#  }
##COPIED#}


sub get_featlist_from_moses {
  # run moses with the given config file and return the list of features and
  # their initial values
  my $configfn = shift;
  my $config_score = shift;
  my $run = shift;

  my $featlistfn = "";
  if ($run eq 'first') {
    $featlistfn = "./features.list"; # given feature list
  } elsif ($run eq "final") {
    $featlistfn = "./features.list.run_final";
  }
  if (-e $featlistfn) {
    print STDERR "Using cached features list: $featlistfn\n";
  } else {
    print STDERR "Asking moses for feature names and values from $config_score\n";
    my $cmd = "$___DECODER $___DECODER_FLAGS -config $config_score -inputtype $___INPUTTYPE -show-weights > $featlistfn";
    print STDERR "$cmd\n"; #DEBUG
    safesystem($cmd) or die "Failed to run moses with the config $config_score";
  }

  # read feature list
  my @names = ();
  my @startvalues = ();
  open(INI,$featlistfn) or die "Can't read $featlistfn";
  my $nr = 0;
  my @errs = ();
  while (<INI>) {
    $nr++;
    chomp;
    /^(.+) (\S+) (\S+)$/ || die("invalid feature: $_");
    my ($longname, $feature, $value) = ($1,$2,$3);
    next if $value eq "sparse";
    push @errs, "$featlistfn:$nr:Bad initial value of $feature: $value\n"
      if $value !~ /^[+-]?[0-9.e]+$/;
    push @errs, "$featlistfn:$nr:Unknown feature '$feature', please add it to \@ABBR_FULL_MAP\n"
      if !defined $ABBR2FULL{$feature};
    push @names, $feature;
    push @startvalues, $value;
  }
  close INI;
  if (scalar @errs) {
    print STDERR join("", @errs);
    exit 1;
  }
  return {"names"=>\@names, "values"=>\@startvalues};
}


sub insert_ranges_to_featlist {
  my $featlist = shift;
  my $ranges = shift;

  $ranges = [] if !defined $ranges;

  # first collect the ranges from options
  my $niceranges;
  foreach my $range (@$ranges) {
    my $name = undef;
    foreach my $namedpair (split /,/, $range) {
      if ($namedpair =~ /^(.*?):/) {
        $name = $1;
        $namedpair =~ s/^.*?://;
        die "Unrecognized name '$name' in --range=$range"
          if !defined $ABBR2FULL{$name};
      }
      my ($min, $max) = split /\.\./, $namedpair;
      die "Bad min '$min' in --range=$range" if $min !~ /^-?[0-9.]+$/;
      die "Bad max '$max' in --range=$range" if $max !~ /^-?[0-9.]+$/; # was: tested $min twice
      die "No name given in --range=$range" if !defined $name;
      push @{$niceranges->{$name}}, [$min, $max];
    }
  }

  # now populate featlist
  my $seen = undef;
  for(my $i=0; $i<scalar(@{$featlist->{"names"}}); $i++) {
    my $name = $featlist->{"names"}->[$i];
    $seen->{$name} ++;
    my $min = 0.0;
    my $max = 1.0;
    if (defined $niceranges->{$name}) {
      my $minmax = shift @{$niceranges->{$name}};
      ($min, $max) = @$minmax if defined $minmax;
    }
    $featlist->{"mins"}->[$i] = $min;
    $featlist->{"maxs"}->[$i] = $max;
  }
  return $featlist;
}

sub safesystem {
  print STDERR "Executing: @_\n";
  system(@_);
  if ($? == -1) {
    print STDERR "Failed to execute: @_\n  $!\n";
    exit(1);
  }
  elsif ($? & 127) {
    printf STDERR "Execution of: @_\n  died with signal %d, %s coredump\n",
      ($? & 127), ($? & 128) ? 'with' : 'without';
    exit(1);
  }
  else {
    my $exitcode = $? >> 8;
    print STDERR "Exit code: $exitcode\n" if $exitcode;
    return ! $exitcode;
  }
}


sub create_config {
  my $infn = shift; # source config
  my $outfn = shift; # where to save the config
  my $featlist = shift; # the lambdas we should write
  my $iteration = shift; # just for verbosity
  my $bleu_achieved = shift; # just for verbosity
  my $sparse_weights_file = shift; # only defined when optimizing sparse features

  my %P; # the hash of all parameters we wish to override

  # first convert the command line parameters to the hash
  { # ensure local scope of vars
    my $parameter=undef;
    print "Parsing --decoder-flags: |$___DECODER_FLAGS|\n";
    $___DECODER_FLAGS =~ s/^\s*|\s*$//;
    $___DECODER_FLAGS =~ s/\s+/ /;
    foreach (split(/ /,$___DECODER_FLAGS)) {
      if (/^\-([^\d].*)$/) {
        $parameter = $1;
        $parameter = $ABBR2FULL{$parameter} if defined($ABBR2FULL{$parameter});
      }
      else {
        die "Found value with no -paramname before it: $_"
          if !defined $parameter;
        push @{$P{$parameter}},$_;
      }
    }
  }

  # First delete all weights params from the input, we're overwriting them.
  # Delete both short- and long-named versions.
  for(my $i=0; $i<scalar(@{$featlist->{"names"}}); $i++) {
    my $name = $featlist->{"names"}->[$i];
    delete($P{$name});
    delete($P{$ABBR2FULL{$name}});
  }

  # Convert weights to elements in P
  for(my $i=0; $i<scalar(@{$featlist->{"names"}}); $i++) {
    my $name = $featlist->{"names"}->[$i];
    my $val = $featlist->{"values"}->[$i];
    $name = defined $ABBR2FULL{$name} ? $ABBR2FULL{$name} : $name;
    # ensure long name
    push @{$P{$name}}, $val;
  }

  if (defined($sparse_weights_file)) {
    push @{$P{"weights-file"}}, $___WORKING_DIR."/".$sparse_weights_file;
  }

  # create new moses.ini decoder config file by cloning and overriding the original one
  open(INI,$infn) or die "Can't read $infn";
  delete($P{"config"}); # never output
  print "Saving new config to: $outfn\n";
  open(OUT,"> $outfn") or die "Can't write $outfn";
  print OUT "# MERT optimized configuration\n";
  print OUT "# decoder $___DECODER\n";
  print OUT "# BLEU $bleu_achieved on dev $___DEV_F\n";
  print OUT "# We were before running iteration $iteration\n";
  print OUT "# finished ".`date`;
  my $line = <INI>;
  while(1) {
    last unless $line;

    # skip until we hit [parameter]
    if ($line !~ /^\[(.+)\]\s*$/) {
      $line = <INI>;
      print OUT $line if $line =~ /^\#/ || $line =~ /^\s+$/;
      next;
    }

    # parameter name
    my $parameter = $1;
    $parameter = $ABBR2FULL{$parameter} if defined($ABBR2FULL{$parameter});
    print OUT "[$parameter]\n";

    # change parameter, if new values
    if (defined($P{$parameter})) {
      # write new values
      foreach (@{$P{$parameter}}) {
        print OUT $_."\n";
      }
      delete($P{$parameter});
      # skip until new parameter, only write comments
      while($line = <INI>) {
        print OUT $line if $line =~ /^\#/ || $line =~ /^\s+$/;
        last if $line =~ /^\[/;
        last unless $line;
      }
      next;
    }
    # unchanged parameter, write old
    while($line = <INI>) {
      last if $line =~ /^\[/;
      print OUT $line;
    }
  }

  # write all additional parameters
  foreach my $parameter (keys %P) {
    print OUT "\n[$parameter]\n";
    foreach (@{$P{$parameter}}) {
      print OUT $_."\n";
    }
  }

  close(INI);
  close(OUT);
  print STDERR "Saved: $outfn\n";
}
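A hypothetical "first" run of this config helper; the four positional
arguments are the dev source text, the decoder binary, the decoder config
and the run tag ("first" or "final"), and all paths are illustrative:

    perl create-config-sge-nosync.pl --working-dir ${WORKINGDIR}/mert \
        --range tm:0..1,-1..1 \
        dev.input ${MOSES}/dist/bin/moses ${WORKINGDIR}/train/model/moses.ini first

In "first" mode it queries moses for the feature list (cached in
./features.list) and writes ${WORKINGDIR}/mert/run1.moses.ini with the
starting weights.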
@ -0,0 +1,235 @@
|
|||||||
|
#!/usr/bin/perl
|
||||||
|
|
||||||
|
|
||||||
|
my $logflag="";
|
||||||
|
my $logfile="";
|
||||||
|
my $alifile=undef;
|
||||||
|
my $nbestflag=0;
|
||||||
|
my $processid=0;
|
||||||
|
my $idxliststr="";
|
||||||
|
my $workingdir="";
|
||||||
|
my $inputfile="";
|
||||||
|
my $tmpdir="";
|
||||||
|
my $splitpfx="";
|
||||||
|
my $jobscript="";
|
||||||
|
my $qsubout="";
|
||||||
|
my $qsuberr="";
|
||||||
|
my $nbestfile=undef;
|
||||||
|
my $nbestlist=undef;
|
||||||
|
my $outnbest="";
|
||||||
|
my $lsamp_filename="";
|
||||||
|
my @idxlist=();
|
||||||
|
|
||||||
|
|
||||||
|
###############################
|
||||||
|
# Script starts here
|
||||||
|
|
||||||
|
init();
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#concatenating translations and removing temporary files
|
||||||
|
concatenate_1best();
|
||||||
|
concatenate_logs() if $logflag;
|
||||||
|
concatenate_ali() if defined $alifile;
|
||||||
|
concatenate_nbest() if $nbestflag;
|
||||||
|
safesystem("cat nbest$$ >> /dev/stdout") if $nbestlist[0] eq '-';
|
||||||
|
|
||||||
|
|
||||||
|
print STDERR "Not support searchgraphflag for sync mert\n" if $searchgraphflag;
|
||||||
|
# concatenate_searchgraph() if $searchgraphflag;
|
||||||
|
# safesystem("cat searchgraph$$ >> /dev/stdout") if $searchgraphlist eq '-';
|
||||||
|
|
||||||
|
print STDERR "Not support wordgraphflag for sync mert\n" if $searchgraphflag;
|
||||||
|
# concatenate_wordgraph() if $wordgraphflag;
|
||||||
|
# safesystem("cat wordgraph$$ >> /dev/stdout") if $wordgraphlist[0] eq '-';
|
||||||
|
|
||||||
|
remove_temporary_files();
|
||||||
|
####
|
||||||
|
#### ### ending scripts in run_decoder() ##############
|
||||||
|
#### sanity_check_order_of_lambdas($featlist, $filename);
|
||||||
|
#### ## how to do return???
|
||||||
|
#### return ($filename, $lsamp_filename);
|
||||||
|
######################################################
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
sub init(){
  use strict;
  use Getopt::Long qw(:config pass_through no_ignore_case permute);

  GetOptions('alignment-output-file=s'=>\$alifile,
             'process-id=s'=>\$processid,
             'idxliststr=s'=>\$idxliststr,
             'logfile=s'=>\$logfile,
             'nbestfile=s'=>\$nbestfile,
             'outnbest=s'=>\$outnbest,
             'lsamp-filename=s'=>\$lsamp_filename,
             'input-file=s'=>\$inputfile
            ) or exit(1);

  if ($logfile){ $logflag=1; }

  if (defined $nbestfile) { $nbestflag=1; }

  $idxliststr =~ s/^\s+|\s+$//g;
  @idxlist = split(/\s+/,$idxliststr);

  my $pwdcmd = getPwdCmd();

  $workingdir = `$pwdcmd`; chomp $workingdir;
  $tmpdir="$workingdir/tmp$processid";
  $splitpfx="split$processid";

  $jobscript="$workingdir/job$processid";
  $qsubout="$workingdir/out.job$processid";
  $qsuberr="$workingdir/err.job$processid";

  # print STDERR "$idxliststr\n";
}
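
# Example invocation (hypothetical paths and ids), assuming the split files
# produced by the nosync decoder wrapper follow the naming scheme used above:
#
#   perl <this-script> --process-id=5 --idxliststr="0 1 2" \
#     --input-file=input.txt --nbestfile=run1.best100.out --outnbest=-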
|
||||||
|
|
||||||
|
|
||||||
|
sub concatenate_nbest(){
|
||||||
|
my $oldcode="";
|
||||||
|
my $newcode=-1;
|
||||||
|
my %inplength = ();
|
||||||
|
my $offset = 0;
|
||||||
|
|
||||||
|
# get the list of feature and set a fictitious string with zero scores
|
||||||
|
open (IN, "${nbestfile}.${splitpfx}$idxlist[0]");
|
||||||
|
my $str = <IN>;
|
||||||
|
chomp($str);
|
||||||
|
close(IN);
|
||||||
|
my ($code,$trans,$featurescores,$globalscore)=split(/\|\|\|/,$str);
|
||||||
|
|
||||||
|
my $emptytrans = " ";
|
||||||
|
my $emptyglobalscore = " 0.0";
|
||||||
|
my $emptyfeaturescores = $featurescores;
|
||||||
|
$emptyfeaturescores =~ s/[-0-9\.]+/0/g;
|
||||||
|
|
||||||
|
if ($outnbest eq '-'){ $outnbest="nbest$processid"; }
|
||||||
|
|
||||||
|
# my $outnbest=$nbestlist[0];
|
||||||
|
# if ($nbestlist[0] eq '-'){ $outnbest="nbest$$"; }
|
||||||
|
|
||||||
|
open (OUT, "> $outnbest");
|
||||||
|
foreach my $idx (@idxlist){
|
||||||
|
|
||||||
|
#computing the length of each input file
|
||||||
|
# print STDERR "this idx: $idx\n";
|
||||||
|
|
||||||
|
my @in=();
|
||||||
|
open (IN, "${inputfile}.${splitpfx}${idx}.trans");
|
||||||
|
@in=<IN>;
|
||||||
|
close(IN);
|
||||||
|
$inplength{$idx} = scalar(@in);
|
||||||
|
|
||||||
|
open (IN, "${nbestfile}.${splitpfx}${idx}");
|
||||||
|
while (<IN>){
|
||||||
|
my ($code,@extra)=split(/\|\|\|/,$_);
|
||||||
|
$code += $offset;
|
||||||
|
if ($code ne $oldcode){
|
||||||
|
# if there is a jump between two consecutive codes
|
||||||
|
# it means that an input sentence is not translated
|
||||||
|
# fill this hole with a "fictitious" list of translation
|
||||||
|
# comprising just one "emtpy translation" with zero scores
|
||||||
|
while ($code - $oldcode > 1){
|
||||||
|
$oldcode++;
|
||||||
|
print OUT join("\|\|\|",($oldcode,$emptytrans,$emptyfeaturescores,$emptyglobalscore)),"\n";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
$oldcode=$code;
|
||||||
|
print OUT join("\|\|\|",($oldcode,@extra));
|
||||||
|
}
|
||||||
|
close(IN);
|
||||||
|
$offset += $inplength{$idx};
|
||||||
|
|
||||||
|
while ($offset - $oldcode > 1){
|
||||||
|
$oldcode++;
|
||||||
|
print OUT join("\|\|\|",($oldcode,$emptytrans,$emptyfeaturescores,$emptyglobalscore)),"\n";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
close(OUT);
|
||||||
|
}
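
# Example (hypothetical codes): if split 0 covers sentences 0-2 but its
# n-best file only contains codes 0 and 2, sentence 1 was never translated;
# the inner while loop then emits one placeholder entry
#   1 |||   ||| <all-zero feature scores> ||| 0.0
# so downstream extractors still see one entry per input sentence.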
|
||||||
|
|
||||||
|
|
||||||
|
sub concatenate_1best(){
|
||||||
|
foreach my $idx (@idxlist){
|
||||||
|
# print STDERR "reading 1best file ${inputfile}.${splitpfx}$idx.trans\n";
|
||||||
|
my @in=();
|
||||||
|
open (IN, "${inputfile}.${splitpfx}${idx}.trans");
|
||||||
|
@in=<IN>;
|
||||||
|
# print STDERR "in array is : @in";
|
||||||
|
print STDOUT "@in";
|
||||||
|
close(IN);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
sub concatenate_logs(){
|
||||||
|
open (OUT, "> ${logfile}");
|
||||||
|
foreach my $idx (@idxlist){
|
||||||
|
my @in=();
|
||||||
|
open (IN, "$qsubout$idx");
|
||||||
|
@in=<IN>;
|
||||||
|
print OUT "@in";
|
||||||
|
close(IN);
|
||||||
|
}
|
||||||
|
close(OUT);
|
||||||
|
}
|
||||||
|
|
||||||
|
sub concatenate_ali(){
|
||||||
|
open (OUT, "> ${alifile}");
|
||||||
|
foreach my $idx (@idxlist){
|
||||||
|
my @in=();
|
||||||
|
open (IN, "$alifile.$splitpfx$idx");
|
||||||
|
@in=<IN>;
|
||||||
|
print OUT "@in";
|
||||||
|
close(IN);
|
||||||
|
}
|
||||||
|
close(OUT);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# look for the correct pwdcmd (pwd by default, pawd if it exists)
|
||||||
|
# I assume that pwd always exists
|
||||||
|
sub getPwdCmd(){
|
||||||
|
my $pwdcmd="pwd";
|
||||||
|
my $a;
|
||||||
|
chomp($a=`which pawd | head -1 | awk '{print $1}'`);
|
||||||
|
if ($a && -e $a){ $pwdcmd=$a; }
|
||||||
|
return $pwdcmd;
|
||||||
|
}

sub remove_temporary_files(){
  # removing temporary files
  foreach my $idx (@idxlist){
    unlink("${inputfile}.${splitpfx}${idx}.trans");
    unlink("${inputfile}.${splitpfx}${idx}");
    if (defined $alifile){ unlink("${alifile}.${splitpfx}${idx}"); }
    if ($nbestflag){ unlink("${nbestfile}.${splitpfx}${idx}"); }
    if ($searchgraphflag){ unlink("${searchgraphfile}.${splitpfx}${idx}"); }
    if ($wordgraphflag){ unlink("${wordgraphfile}.${splitpfx}${idx}"); }

    # print STDERR "Deleting ${jobscript}${idx}.bash\n";
    unlink("${jobscript}${idx}.bash");
    unlink("${jobscript}${idx}.log");
    unlink("$qsubname.W.log");
    unlink("$qsubout$idx");
    unlink("$qsuberr$idx");
    rmdir("$tmpdir");
  }
  # unlink("${jobscript}.sync_workaround_script.sh");
  if ($nbestflag && $nbestlist[0] eq '-'){ unlink("${nbestfile}$$"); };
  if ($searchgraphflag && $searchgraphlist eq '-'){ unlink("${searchgraphfile}$$"); };
  if ($wordgraphflag && $wordgraphlist eq '-'){ unlink("${wordgraphfile}$$"); };
}

30 contrib/mert-sge-nosync/training/sge-nosync/poll-decoder.pl Executable file
@@ -0,0 +1,30 @@
#!/usr/bin/perl

use Getopt::Long qw(:config pass_through no_ignore_case permute);

my $poll_target = undef;
my $working_dir = undef;

GetOptions('poll-target=s'=> \$poll_target,
           'working-dir=s'=> \$working_dir
          ) or exit(1);

if (defined $working_dir) {
  chdir($working_dir);
}

my $cnt = 1;

print STDERR "Waiting for file: $poll_target\n";

while (1) {
  if (-e $poll_target){
    print STDERR "\n File found!!\n";
    last;
  } else {
    sleep(10);
    print STDERR ".";
  }
}
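
# Usage sketch (hypothetical target file): block until the decoder drops
# its done-marker, then let the rest of the pipeline continue:
#   perl poll-decoder.pl --poll-target=run1.best100.out.DONE --working-dir=mert-work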

283 contrib/mert-sge-nosync/training/sge-nosync/process-featlist-sge-nosync.pl Executable file
@@ -0,0 +1,283 @@
#!/usr/bin/perl -w

# $Id$
# after filter-model-given-input.pl, process the feature list

# original code by Philipp Koehn
# changes by Ondrej Bojar
# adapted for hierarchical models by Phil Williams

use strict;

use FindBin qw($Bin);
use File::Basename;
use Getopt::Long;

my $SCRIPTS_ROOTDIR;
if (defined($ENV{"SCRIPTS_ROOTDIR"})) {
  $SCRIPTS_ROOTDIR = $ENV{"SCRIPTS_ROOTDIR"};
} else {
  $SCRIPTS_ROOTDIR = $Bin;
  if ($SCRIPTS_ROOTDIR eq '') {
    $SCRIPTS_ROOTDIR = dirname(__FILE__);
  }
  $SCRIPTS_ROOTDIR =~ s/\/training$//;
  $ENV{"SCRIPTS_ROOTDIR"} = $SCRIPTS_ROOTDIR;
}

# moses.ini file uses FULL names for lambdas, while this training script
# internally (and on the command line) uses ABBR names.
my @ABBR_FULL_MAP = qw(d=weight-d lm=weight-l tm=weight-t w=weight-w
  g=weight-generation lex=weight-lex I=weight-i);
my %ABBR2FULL = map {split/=/,$_,2} @ABBR_FULL_MAP;
my %FULL2ABBR = map {my ($a, $b) = split/=/,$_,2; ($b, $a);} @ABBR_FULL_MAP;
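# Example: the maps above give $ABBR2FULL{"tm"} eq "weight-t" and
# $FULL2ABBR{"weight-t"} eq "tm", so "tm" on the command line addresses
# the [weight-t] section of moses.ini.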

my $verbose = 0;
my $usage = 0; # request for --help

##!# # consider phrases in input up to $MAX_LENGTH
##!# # in other words, all phrase-tables will be truncated at least to 10 words per
##!# # phrase.
##!# my $MAX_LENGTH = 10;

# utilities
##!# my $ZCAT = "gzip -cd";

# get optional parameters
##!# my $opt_hierarchical = 0;
##!# my $binarizer = undef;
##!# my $opt_min_non_initial_rule_count = undef;
##!# my $opt_gzip = 1; # gzip output files (so far only phrase-based ttable until someone tests remaining models and formats)

my $___RANGES = undef;
my $___ACTIVATE_FEATURES = undef; # comma-separated (or blank-separated) list of features to work on
                                  # if undef work on all features
                                  # (others are fixed to the starting values)
my $___DECODER_FLAGS = ""; # additional parameters to pass to the decoder

# set 0 if input type is text, set 1 if input type is confusion network
my $___INPUTTYPE = 0;

my $___DECODER = undef; # required, pathname to the decoder executable
my $___CONFIG = undef; # required, pathname to startup ini file

GetOptions(
  "activate-features=s" => \$___ACTIVATE_FEATURES, # comma-separated (or blank-separated) list of features to work on (others are fixed to the starting values)
  "range=s@" => \$___RANGES,
  "decoder-flags=s" => \$___DECODER_FLAGS,
  "inputtype=i" => \$___INPUTTYPE
) or exit(1);

# the 2 required parameters can be supplied on the command line directly
# or using the --options
if (scalar @ARGV == 2) {
  # required parameters: decoder config
  $___DECODER = shift;
  $___CONFIG = shift;
}

if ($usage || !defined $___DECODER || !defined $___CONFIG) {
  print STDERR "usage: $0 \$___DECODER \$___CONFIG(decoder.ini)
Options:
  --activate-features=STRING ... comma-separated list of features to optimize,
                                 others are fixed to the starting values
                                 default: optimize all features
                                 example: tm_0,tm_4,d_0
  --range=tm:0..1,-1..1      ... specify min and max value for some features
                                 --range can be repeated as needed.
                                 The order of the various --range specifications
                                 is important only within a feature name.
                                 E.g.:
                                   --range=tm:0..1,-1..1 --range=tm:0..2
                                 is identical to:
                                   --range=tm:0..1,-1..1,0..2
                                 but not to:
                                   --range=tm:0..2 --range=tm:0..1,-1..1
  --decoder-flags=STRING     ... extra parameters for the decoder
  --inputtype=[0|1|2]        ... Handle different input types: (0 for text,
                                 1 for confusion network, 2 for lattices,
                                 default is 0)
";
  exit 1;
}

##!# # get command line parameters
##!# my $dir = shift;
##!# my $config = shift;
##!# my $input = shift;

##!# $dir = ensure_full_path($dir);

############################################################
############################################################
############################################################

# main

# we run moses to check validity of moses.ini and to obtain all the feature
# names
my $featlist = get_featlist_from_moses($___CONFIG);
$featlist = insert_ranges_to_featlist($featlist, $___RANGES);

# Mark which features are disabled:
if (defined $___ACTIVATE_FEATURES) {
  my %enabled = map { ($_, 1) } split /[, ]+/, $___ACTIVATE_FEATURES;
  my %cnt;
  for(my $i=0; $i<scalar(@{$featlist->{"names"}}); $i++) {
    my $name = $featlist->{"names"}->[$i];
    $cnt{$name} = 0 if !defined $cnt{$name};
    $featlist->{"enabled"}->[$i] = $enabled{$name."_".$cnt{$name}};
    $cnt{$name}++;
  }
} else {
  # all enabled
  for(my $i=0; $i<scalar(@{$featlist->{"names"}}); $i++) {
    $featlist->{"enabled"}->[$i] = 1;
  }
}
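
# Example: with feature names ("tm","tm","lm") the loop above keys
# %enabled by tm_0, tm_1 and lm_0, so --activate-features=tm_1 enables
# only the second translation-model weight and leaves the rest fixed.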

print STDERR "MERT starting values and ranges for random generation:\n";
for(my $i=0; $i<scalar(@{$featlist->{"names"}}); $i++) {
  my $name = $featlist->{"names"}->[$i];
  my $val = $featlist->{"values"}->[$i];
  my $min = $featlist->{"mins"}->[$i];
  my $max = $featlist->{"maxs"}->[$i];
  my $enabled = $featlist->{"enabled"}->[$i];
  printf STDERR "  %5s = %7.3f", $name, $val;
  if ($enabled) {
    printf STDERR " (%5.2f .. %5.2f)\n", $min, $max;
  } else {
    print STDERR " --- inactive, not optimized ---\n";
  }
}

sub get_featlist_from_moses {
  # run moses with the given config file and return the list of features and
  # their initial values
  my $configfn = shift;
  my $featlistfn = "./features.list";
  if (-e $featlistfn) {
    print STDERR "Using cached features list: $featlistfn\n";
  } else {
    print STDERR "Asking moses for feature names and values from $___CONFIG\n";
    my $cmd = "$___DECODER $___DECODER_FLAGS -config $configfn -inputtype $___INPUTTYPE -show-weights > $featlistfn";
    print STDERR "$cmd\n"; # DEBUG
    safesystem($cmd) or die "Failed to run moses with the config $configfn";
  }

  # read feature list
  my @names = ();
  my @startvalues = ();
  open(INI,$featlistfn) or die "Can't read $featlistfn";
  my $nr = 0;
  my @errs = ();
  while (<INI>) {
    $nr++;
    chomp;
    /^(.+) (\S+) (\S+)$/ || die("invalid feature: $_");
    my ($longname, $feature, $value) = ($1,$2,$3);
    next if $value eq "sparse";
    push @errs, "$featlistfn:$nr:Bad initial value of $feature: $value\n"
      if $value !~ /^[+-]?[0-9.e]+$/;
    push @errs, "$featlistfn:$nr:Unknown feature '$feature', please add it to \@ABBR_FULL_MAP\n"
      if !defined $ABBR2FULL{$feature};
    push @names, $feature;
    push @startvalues, $value;
  }
  close INI;
  if (scalar @errs) {
    print STDERR join("", @errs);
    exit 1;
  }
  return {"names"=>\@names, "values"=>\@startvalues};
}
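
# Example features.list line (hypothetical): "Distortion d 0.3" parses as
# longname="Distortion", feature="d", value=0.3; sparse features report the
# literal value "sparse" and are skipped.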

sub insert_ranges_to_featlist {
  my $featlist = shift;
  my $ranges = shift;

  $ranges = [] if !defined $ranges;

  # first collect the ranges from options
  my $niceranges;
  foreach my $range (@$ranges) {
    my $name = undef;
    foreach my $namedpair (split /,/, $range) {
      if ($namedpair =~ /^(.*?):/) {
        $name = $1;
        $namedpair =~ s/^.*?://;
        die "Unrecognized name '$name' in --range=$range"
          if !defined $ABBR2FULL{$name};
      }
      my ($min, $max) = split /\.\./, $namedpair;
      die "Bad min '$min' in --range=$range" if $min !~ /^-?[0-9.]+$/;
      die "Bad max '$max' in --range=$range" if $max !~ /^-?[0-9.]+$/;
      die "No name given in --range=$range" if !defined $name;
      push @{$niceranges->{$name}}, [$min, $max];
    }
  }

  # now populate featlist
  my $seen = undef;
  for(my $i=0; $i<scalar(@{$featlist->{"names"}}); $i++) {
    my $name = $featlist->{"names"}->[$i];
    $seen->{$name} ++;
    my $min = 0.0;
    my $max = 1.0;
    if (defined $niceranges->{$name}) {
      my $minmax = shift @{$niceranges->{$name}};
      ($min, $max) = @$minmax if defined $minmax;
    }
    $featlist->{"mins"}->[$i] = $min;
    $featlist->{"maxs"}->[$i] = $max;
  }
  return $featlist;
}
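
# Example: --range=tm:0..1,-1..1 stores [[0,1],[-1,1]] under "tm"; the
# first two tm features then get those bounds, and any further tm features
# fall back to the default 0.0 .. 1.0.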

sub safesystem {
  print STDERR "Executing: @_\n";
  system(@_);
  if ($? == -1) {
    print STDERR "Failed to execute: @_\n  $!\n";
    exit(1);
  }
  elsif ($? & 127) {
    printf STDERR "Execution of: @_\n  died with signal %d, %s coredump\n",
      ($? & 127), ($? & 128) ? 'with' : 'without';
    exit(1);
  }
  else {
    my $exitcode = $? >> 8;
    print STDERR "Exit code: $exitcode\n" if $exitcode;
    return ! $exitcode;
  }
}

1377 contrib/mert-sge-nosync/training/sge-nosync/process-moses-result-sge-nosync.pl Executable file
File diff suppressed because it is too large

271 contrib/mert-sge-nosync/training/sge-nosync/run-decoder-sge-nosync.pl Executable file
@@ -0,0 +1,271 @@
#!/usr/bin/perl -w

# $Id$
# after filter-model-given-input.pl, process the feature list

# original code by Philipp Koehn
# changes by Ondrej Bojar
# adapted for hierarchical models by Phil Williams

use strict;

use FindBin qw($Bin);
use File::Basename;
use Getopt::Long;

my $SCRIPTS_ROOTDIR;
if (defined($ENV{"SCRIPTS_ROOTDIR"})) {
  $SCRIPTS_ROOTDIR = $ENV{"SCRIPTS_ROOTDIR"};
} else {
  $SCRIPTS_ROOTDIR = $Bin;
  if ($SCRIPTS_ROOTDIR eq '') {
    $SCRIPTS_ROOTDIR = dirname(__FILE__);
  }
  $SCRIPTS_ROOTDIR =~ s/\/training$//;
  $ENV{"SCRIPTS_ROOTDIR"} = $SCRIPTS_ROOTDIR;
}

# moses.ini file uses FULL names for lambdas, while this training script
# internally (and on the command line) uses ABBR names.
# (These maps are kept active because get_featlist_from_moses and
# insert_ranges_to_featlist below rely on %ABBR2FULL under "use strict".)
my @ABBR_FULL_MAP = qw(d=weight-d lm=weight-l tm=weight-t w=weight-w
  g=weight-generation lex=weight-lex I=weight-i);
my %ABBR2FULL = map {split/=/,$_,2} @ABBR_FULL_MAP;
my %FULL2ABBR = map {my ($a, $b) = split/=/,$_,2; ($b, $a);} @ABBR_FULL_MAP;

my $verbose = 0;
my $usage = 0; # request for --help

##!# # consider phrases in input up to $MAX_LENGTH
##!# # in other words, all phrase-tables will be truncated at least to 10 words per
##!# # phrase.
##!# my $MAX_LENGTH = 10;

# utilities
##!# my $ZCAT = "gzip -cd";

# get optional parameters
##!# my $opt_hierarchical = 0;
##!# my $binarizer = undef;
##!# my $opt_min_non_initial_rule_count = undef;
##!# my $opt_gzip = 1; # gzip output files (so far only phrase-based ttable until someone tests remaining models and formats)

my $___RANGES = undef;
my $___ACTIVATE_FEATURES = undef; # comma-separated (or blank-separated) list of features to work on
                                  # if undef work on all features
                                  # (others are fixed to the starting values)
my $___DECODER_FLAGS = ""; # additional parameters to pass to the decoder

# set 0 if input type is text, set 1 if input type is confusion network
my $___INPUTTYPE = 0;

my $___DECODER = undef; # required, pathname to the decoder executable
my $___CONFIG = undef; # required, pathname to startup ini file

GetOptions(
  ##!# "activate-features=s" => \$___ACTIVATE_FEATURES, # comma-separated (or blank-separated) list of features to work on (others are fixed to the starting values)
  ##!# "range=s@" => \$___RANGES,
  ##!# "decoder-flags=s" => \$___DECODER_FLAGS,
  ##!# "inputtype=i" => \$___INPUTTYPE
) or exit(1);

# the 2 required parameters can be supplied on the command line directly
# or using the --options
if (scalar @ARGV == 2) {
  # required parameters: decoder config
  $___DECODER = shift;
  $___CONFIG = shift;
}

if ($usage || !defined $___DECODER || !defined $___CONFIG) {
  print STDERR "usage: $0 \$___DECODER \$___CONFIG(decoder.ini)
Options:
  --activate-features=STRING ... comma-separated list of features to optimize,
                                 others are fixed to the starting values
                                 default: optimize all features
                                 example: tm_0,tm_4,d_0
  --range=tm:0..1,-1..1      ... specify min and max value for some features
                                 --range can be repeated as needed.
                                 The order of the various --range specifications
                                 is important only within a feature name.
                                 E.g.:
                                   --range=tm:0..1,-1..1 --range=tm:0..2
                                 is identical to:
                                   --range=tm:0..1,-1..1,0..2
                                 but not to:
                                   --range=tm:0..2 --range=tm:0..1,-1..1
  --decoder-flags=STRING     ... extra parameters for the decoder
  --inputtype=[0|1|2]        ... Handle different input types: (0 for text,
                                 1 for confusion network, 2 for lattices,
                                 default is 0)
";
  exit 1;
}

############################################################
############################################################
############################################################

# main

# we run moses to check validity of moses.ini and to obtain all the feature
# names
my $featlist = get_featlist_from_moses($___CONFIG);
$featlist = insert_ranges_to_featlist($featlist, $___RANGES);

# Mark which features are disabled:
if (defined $___ACTIVATE_FEATURES) {
  my %enabled = map { ($_, 1) } split /[, ]+/, $___ACTIVATE_FEATURES;
  my %cnt;
  for(my $i=0; $i<scalar(@{$featlist->{"names"}}); $i++) {
    my $name = $featlist->{"names"}->[$i];
    $cnt{$name} = 0 if !defined $cnt{$name};
    $featlist->{"enabled"}->[$i] = $enabled{$name."_".$cnt{$name}};
    $cnt{$name}++;
  }
} else {
  # all enabled
  for(my $i=0; $i<scalar(@{$featlist->{"names"}}); $i++) {
    $featlist->{"enabled"}->[$i] = 1;
  }
}

print STDERR "MERT starting values and ranges for random generation:\n";
for(my $i=0; $i<scalar(@{$featlist->{"names"}}); $i++) {
  my $name = $featlist->{"names"}->[$i];
  my $val = $featlist->{"values"}->[$i];
  my $min = $featlist->{"mins"}->[$i];
  my $max = $featlist->{"maxs"}->[$i];
  my $enabled = $featlist->{"enabled"}->[$i];
  printf STDERR "  %5s = %7.3f", $name, $val;
  if ($enabled) {
    printf STDERR " (%5.2f .. %5.2f)\n", $min, $max;
  } else {
    print STDERR " --- inactive, not optimized ---\n";
  }
}

sub get_featlist_from_moses {
  # run moses with the given config file and return the list of features and
  # their initial values
  my $configfn = shift;
  my $featlistfn = "./features.list";
  if (-e $featlistfn) {
    print STDERR "Using cached features list: $featlistfn\n";
  } else {
    print STDERR "Asking moses for feature names and values from $___CONFIG\n";
    my $cmd = "$___DECODER $___DECODER_FLAGS -config $configfn -inputtype $___INPUTTYPE -show-weights > $featlistfn";
    print STDERR "$cmd\n"; # DEBUG
    safesystem($cmd) or die "Failed to run moses with the config $configfn";
  }

  # read feature list
  my @names = ();
  my @startvalues = ();
  open(INI,$featlistfn) or die "Can't read $featlistfn";
  my $nr = 0;
  my @errs = ();
  while (<INI>) {
    $nr++;
    chomp;
    /^(.+) (\S+) (\S+)$/ || die("invalid feature: $_");
    my ($longname, $feature, $value) = ($1,$2,$3);
    next if $value eq "sparse";
    push @errs, "$featlistfn:$nr:Bad initial value of $feature: $value\n"
      if $value !~ /^[+-]?[0-9.e]+$/;
    push @errs, "$featlistfn:$nr:Unknown feature '$feature', please add it to \@ABBR_FULL_MAP\n"
      if !defined $ABBR2FULL{$feature};
    push @names, $feature;
    push @startvalues, $value;
  }
  close INI;
  if (scalar @errs) {
    print STDERR join("", @errs);
    exit 1;
  }
  return {"names"=>\@names, "values"=>\@startvalues};
}

sub insert_ranges_to_featlist {
  my $featlist = shift;
  my $ranges = shift;

  $ranges = [] if !defined $ranges;

  # first collect the ranges from options
  my $niceranges;
  foreach my $range (@$ranges) {
    my $name = undef;
    foreach my $namedpair (split /,/, $range) {
      if ($namedpair =~ /^(.*?):/) {
        $name = $1;
        $namedpair =~ s/^.*?://;
        die "Unrecognized name '$name' in --range=$range"
          if !defined $ABBR2FULL{$name};
      }
      my ($min, $max) = split /\.\./, $namedpair;
      die "Bad min '$min' in --range=$range" if $min !~ /^-?[0-9.]+$/;
      die "Bad max '$max' in --range=$range" if $max !~ /^-?[0-9.]+$/;
      die "No name given in --range=$range" if !defined $name;
      push @{$niceranges->{$name}}, [$min, $max];
    }
  }

  # now populate featlist
  my $seen = undef;
  for(my $i=0; $i<scalar(@{$featlist->{"names"}}); $i++) {
    my $name = $featlist->{"names"}->[$i];
    $seen->{$name} ++;
    my $min = 0.0;
    my $max = 1.0;
    if (defined $niceranges->{$name}) {
      my $minmax = shift @{$niceranges->{$name}};
      ($min, $max) = @$minmax if defined $minmax;
    }
    $featlist->{"mins"}->[$i] = $min;
    $featlist->{"maxs"}->[$i] = $max;
  }
  return $featlist;
}

sub safesystem {
  print STDERR "Executing: @_\n";
  system(@_);
  if ($? == -1) {
    print STDERR "Failed to execute: @_\n  $!\n";
    exit(1);
  }
  elsif ($? & 127) {
    printf STDERR "Execution of: @_\n  died with signal %d, %s coredump\n",
      ($? & 127), ($? & 128) ? 'with' : 'without';
    exit(1);
  }
  else {
    my $exitcode = $? >> 8;
    print STDERR "Exit code: $exitcode\n" if $exitcode;
    return ! $exitcode;
  }
}

832 contrib/mert-sge-nosync/training/sge-nosync/zipextract-decoder-result.pl Executable file
@@ -0,0 +1,832 @@
#!/usr/bin/perl -w
# $Id$
# Usage:
# mert-moses.pl <foreign> <english> <decoder-executable> <decoder-config>
# For other options see below or run 'mert-moses.pl --help'

# Notes:
# <foreign> and <english> should be raw text files, one sentence per line
# <english> can be a prefix, in which case the files <english>0, <english>1, etc. are used

# Excerpts from revision history

# Sept 2011   multi-threaded mert (Barry Haddow)
# 3 Aug 2011  Added random directions, historic best, pairwise ranked (PK)
# Jul 2011    simplifications (Ondrej Bojar)
#             -- rely on moses' -show-weights instead of parsing moses.ini
#                ... so moses is also run once *before* mert starts, checking
#                the model to some extent
#             -- got rid of the 'triples' mess;
#                use --range to supply bounds for random starting values:
#                --range tm:-3..3 --range lm:-3..3
# 5 Aug 2009  Handling of different reference length policies (shortest, average, closest) for BLEU
#             and case-sensitive/insensitive evaluation (Nicola Bertoldi)
# 5 Jun 2008  Forked previous version to support new mert implementation.
# 13 Feb 2007 Better handling of default values for lambda, now works with multiple
#             models and lexicalized reordering
# 11 Oct 2006 Handle different input types through parameter --inputtype=[0|1]
#             (0 for text, 1 for confusion network, default is 0) (Nicola Bertoldi)
# 10 Oct 2006 Allow skip of filtering of phrase tables (--no-filter-phrase-table)
#             useful if binary phrase tables are used (Nicola Bertoldi)
# 28 Aug 2006 Use either closest or average or shortest (default) reference
#             length as effective reference length
#             Use either normalization or not (default) of texts (Nicola Bertoldi)
# 31 Jul 2006 move gzip run*.out to avoid failure with restarts
#             adding default paths
# 29 Jul 2006 run-filter, score-nbest and mert run on the queue (Nicola; Ondrej had to type it in again)
# 28 Jul 2006 attempt at foolproof usage, strong checking of input validity, merged the parallel and nonparallel version (Ondrej Bojar)
# 27 Jul 2006 adding the safesystem() function to handle process failure
# 22 Jul 2006 fixed a bug about handling relative path of configuration file (Nicola Bertoldi)
# 21 Jul 2006 adapted for Moses-in-parallel (Nicola Bertoldi)
# 18 Jul 2006 adapted for Moses and cleaned up (PK)
# 21 Jan 2005 unified various versions, thorough cleanup (DWC)
#             now indexing accumulated n-best list solely by feature vectors
# 14 Dec 2004 reimplemented find_threshold_points in C (NMD)
# 25 Oct 2004 Use either average or shortest (default) reference
#             length as effective reference length (DWC)
# 13 Oct 2004 Use alternative decoders (DWC)
# Original version by Philipp Koehn

use strict;
use FindBin qw($Bin);
use File::Basename;
use File::Path;
use File::Spec;
use Cwd;

my $SCRIPTS_ROOTDIR = $Bin;
$SCRIPTS_ROOTDIR =~ s/\/training$//;
$SCRIPTS_ROOTDIR = $ENV{"SCRIPTS_ROOTDIR"} if defined($ENV{"SCRIPTS_ROOTDIR"});

## We preserve this bit of comments to keep the traditional weight ranges.
#     "w"  => [ [ 0.0, -1.0, 1.0 ] ],  # word penalty
#     "d"  => [ [ 1.0,  0.0, 2.0 ] ],  # lexicalized reordering model
#     "lm" => [ [ 1.0,  0.0, 2.0 ] ],  # language model
#     "g"  => [ [ 1.0,  0.0, 2.0 ],    # generation model
#               [ 1.0,  0.0, 2.0 ] ],
#     "tm" => [ [ 0.3,  0.0, 0.5 ],    # translation model
#               [ 0.2,  0.0, 0.5 ],
#               [ 0.3,  0.0, 0.5 ],
#               [ 0.2,  0.0, 0.5 ],
#               [ 0.0, -1.0, 1.0 ] ],  # ... last weight is phrase penalty
#     "lex"=> [ [ 0.1,  0.0, 0.2 ] ],  # global lexical model
#     "I"  => [ [ 0.0, -1.0, 1.0 ] ],  # input lattice scores

# moses.ini file uses FULL names for lambdas, while this training script
# internally (and on the command line) uses ABBR names.
my @ABBR_FULL_MAP = qw(d=weight-d lm=weight-l tm=weight-t w=weight-w
  g=weight-generation lex=weight-lex I=weight-i);
my %ABBR2FULL = map {split/=/,$_,2} @ABBR_FULL_MAP;
my %FULL2ABBR = map {my ($a, $b) = split/=/,$_,2; ($b, $a);} @ABBR_FULL_MAP;

my $minimum_required_change_in_weights = 0.00001;
# stop if no lambda changes more than this

my $verbose = 0;
my $usage = 0; # request for --help

# If no working directory is specified, this script defaults to the current
# directory; the original `pwd`/mert-work default is kept below as a comment.
#@@# my $___WORKING_DIR = File::Spec->catfile(Cwd::getcwd(), "mert-work");
my $___WORKING_DIR = undef;
my $___DEV_F = undef; # required, input text to decode
my $___DEV_E = undef; # required, basename of files with references
my $___DECODER = undef; # required, pathname to the decoder executable
my $___CONFIG = undef; # required, pathname to startup ini file
my $___N_BEST_LIST_SIZE = 100;
my $___LATTICE_SAMPLES = 0;
my $submithost = "";
my $queue_flags = "-hard"; # extra parameters for parallelizer
                           # the -l ws0ssmt was relevant only to JHU 2006 workshop
my $___JOBS = undef; # if parallel, number of jobs to use (undef or 0 -> serial)
my $___DECODER_FLAGS = ""; # additional parameters to pass to the decoder
my $continue = 0; # should we try to continue from the last saved step?
my $skip_decoder = 0; # and should we skip the first decoder run (assuming we got interrupted during mert)
my $___FILTER_PHRASE_TABLE = 1; # filter phrase table
my $___PREDICTABLE_SEEDS = 0;
my $___START_WITH_HISTORIC_BESTS = 0; # use best settings from all previous iterations as starting points [Foster&Kuhn,2009]
my $___RANDOM_DIRECTIONS = 0; # search in random directions only
my $___NUM_RANDOM_DIRECTIONS = 0; # number of random directions, also works with default optimizer [Cer&al.,2008]
my $___PAIRWISE_RANKED_OPTIMIZER = 0; # use Hopkins&May[2011]
my $___PRO_STARTING_POINT = 0; # get a starting point from pairwise ranked optimizer
my $___RANDOM_RESTARTS = 20;
my $___HISTORIC_INTERPOLATION = 0; # interpolate optimized weights with previous iteration's weights [Hopkins&May,2011,5.4.3]
my $__THREADS = 0;
my $run = 0;

# Parameter for effective reference length when computing BLEU score
# Default is to use shortest reference
# Use "--shortest" to use shortest reference length
# Use "--average" to use average reference length
# Use "--closest" to use closest reference length
# Only one of --shortest, --average and --closest can be set
# If more than one is chosen, the default (--shortest) is used
my $___SHORTEST = 0;
my $___AVERAGE = 0;
my $___CLOSEST = 0;

# Use "--nocase" to compute case-insensitive scores
my $___NOCASE = 0;

# Use "--nonorm" to skip normalizing translations before computing scores
my $___NONORM = 0;

# set 0 if input type is text, set 1 if input type is confusion network
my $___INPUTTYPE = 0;

my $mertdir = undef; # path to new mert directory
my $mertargs = undef; # args to pass through to mert & extractor
my $mertmertargs = undef; # args to pass through to mert only
my $extractorargs = undef; # args to pass through to extractor only
my $filtercmd = undef; # path to filter-model-given-input.pl
my $filterfile = undef;
my $qsubwrapper = undef;
my $qsubwrapper_exit = undef;
my $moses_parallel_cmd = undef;
my $old_sge = 0; # assume sge<6.0
my $___CONFIG_ORIG = undef; # pathname to startup ini file before filtering
my $___ACTIVATE_FEATURES = undef; # comma-separated (or blank-separated) list of features to work on
                                  # if undef work on all features
                                  # (others are fixed to the starting values)
my $___RANGES = undef;
my $prev_aggregate_nbl_size = -1; # number of previous steps to consider when loading data (default = -1)
                                  # -1 means all previous, i.e. from iteration 1
                                  #  0 means no previous data, i.e. only the current iteration
                                  #  1 means the current iteration and the previous one
                                  # and so on
my $maximum_iterations = 25;

#####################
my $processfeatlistcmd = undef;
my $processfeatlistargs = undef;
my $createconfigcmd = undef;
my $createconfigargs = undef;
my $decoderargs = undef;
#####################

use Getopt::Long;
GetOptions(
  "working-dir=s" => \$___WORKING_DIR,
  "input=s" => \$___DEV_F,
  "inputtype=i" => \$___INPUTTYPE,
  "refs=s" => \$___DEV_E,
  "decoder=s" => \$___DECODER,
  "config=s" => \$___CONFIG,
  "nbest=i" => \$___N_BEST_LIST_SIZE,
  "lattice-samples=i" => \$___LATTICE_SAMPLES,
  "submithost=s" => \$submithost,
  "queue-flags=s" => \$queue_flags,
  "jobs=i" => \$___JOBS,
  "decoder-flags=s" => \$___DECODER_FLAGS,
  "continue" => \$continue,
  "skip-decoder" => \$skip_decoder,
  "shortest" => \$___SHORTEST,
  "average" => \$___AVERAGE,
  "closest" => \$___CLOSEST,
  "nocase" => \$___NOCASE,
  "nonorm" => \$___NONORM,
  "help" => \$usage,
  "verbose" => \$verbose,
  "mertdir=s" => \$mertdir,
  "mertargs=s" => \$mertargs,
  "extractorargs=s" => \$extractorargs,
  "mertmertargs=s" => \$mertmertargs,
  "rootdir=s" => \$SCRIPTS_ROOTDIR,
  "filtercmd=s" => \$filtercmd, # allow to override the default location
  "filterfile=s" => \$filterfile, # input to filtering script (useful for lattices/confnets)
  "qsubwrapper=s" => \$qsubwrapper, # allow to override the default location
  "mosesparallelcmd=s" => \$moses_parallel_cmd, # allow to override the default location
  "old-sge" => \$old_sge, # passed to moses-parallel
  "filter-phrase-table!" => \$___FILTER_PHRASE_TABLE, # (dis)allow filtering of phrase tables
  "predictable-seeds" => \$___PREDICTABLE_SEEDS, # make random restarts deterministic
  "historic-bests" => \$___START_WITH_HISTORIC_BESTS, # use best settings from all previous iterations as starting points
  "random-directions" => \$___RANDOM_DIRECTIONS, # search only in random directions
  "run=i" => \$run,
  "number-of-random-directions=i" => \$___NUM_RANDOM_DIRECTIONS, # number of random directions
  "random-restarts=i" => \$___RANDOM_RESTARTS, # number of random restarts
  "activate-features=s" => \$___ACTIVATE_FEATURES, # comma-separated (or blank-separated) list of features to work on (others are fixed to the starting values)
  "range=s@" => \$___RANGES,
  "prev-aggregate-nbestlist=i" => \$prev_aggregate_nbl_size, # number of previous steps to consider when loading data (default = -1, i.e. all previous)
  "maximum-iterations=i" => \$maximum_iterations,
  "pairwise-ranked" => \$___PAIRWISE_RANKED_OPTIMIZER,
  "pro-starting-point" => \$___PRO_STARTING_POINT,
  "historic-interpolation=f" => \$___HISTORIC_INTERPOLATION,
  "threads=i" => \$__THREADS
) or exit(1);

# the 2 required parameters can be supplied on the command line directly
# or using the --options
if (scalar @ARGV == 2) {
  # required parameters: references_basename decoder_config
  # $___DEV_F = shift;
  $___DEV_E = shift;
  # $___DECODER = shift;
  $___CONFIG = shift;
}

# if ($usage || !defined $___DEV_F || !defined $___DEV_E || !defined $___DECODER || !defined $___CONFIG) {
if ($usage || !defined $___CONFIG || !defined $___DEV_E ){
  print STDERR "usage: $0 reference decoder.ini
Options:
  --working-dir=mert-dir ... where all the files are created
  --nbest=100            ... how big an nbestlist to generate
  --lattice-samples      ... how many lattice samples (Chatterjee & Cancedda, emnlp 2010)
  --jobs=N               ... set this to anything to run moses in parallel
  --mosesparallelcmd=STR ... use a different script instead of moses-parallel
  --queue-flags=STRING   ... anything you wish to pass to qsub, e.g.
                             '-l ws06osssmt=true'. The default is: '-hard'
                             To reset the parameters, please use
                             --queue-flags=' '
                             (i.e. a space between the quotes).
  --decoder-flags=STRING ... extra parameters for the decoder
  --continue             ... continue from the last successful iteration
  --skip-decoder         ... skip the decoder run for the first time,
                             assuming that we got interrupted during
                             optimization
  --shortest --average --closest
                         ... Use shortest/average/closest reference length
                             as effective reference length (mutually exclusive)
  --nocase               ... Do not preserve case information; i.e.
                             case-insensitive evaluation (default is false).
  --nonorm               ... Do not use text normalization (flag is not active,
                             i.e. text is NOT normalized)
  --filtercmd=STRING     ... path to filter-model-given-input.pl
  --filterfile=STRING    ... path to alternative to input-text for filtering
                             model. useful for lattice decoding
  --rootdir=STRING       ... where do helpers reside (if not given explicitly)
  --mertdir=STRING       ... path to new mert implementation
  --mertargs=STRING      ... extra args for both extractor and mert
  --extractorargs=STRING ... extra args for extractor only
  --mertmertargs=STRING  ... extra args for mert only
  --scorenbestcmd=STRING ... path to score-nbest.py
  --old-sge              ... passed to parallelizers, assume Grid Engine < 6.0
  --inputtype=[0|1|2]    ... Handle different input types: (0 for text,
                             1 for confusion network, 2 for lattices,
                             default is 0)
  --no-filter-phrase-table ... disallow filtering of phrase tables
                             (useful if binary phrase tables are available)
  --random-restarts=INT  ... number of random restarts (default: 20)
  --predictable-seeds    ... provide predictable seeds to mert so that random
                             restarts are the same on every run
  --range=tm:0..1,-1..1  ... specify min and max value for some features
                             --range can be repeated as needed.
                             The order of the various --range specifications
                             is important only within a feature name.
                             E.g.:
                               --range=tm:0..1,-1..1 --range=tm:0..2
                             is identical to:
                               --range=tm:0..1,-1..1,0..2
                             but not to:
                               --range=tm:0..2 --range=tm:0..1,-1..1
  --activate-features=STRING ... comma-separated list of features to optimize,
                             others are fixed to the starting values
                             default: optimize all features
                             example: tm_0,tm_4,d_0
  --prev-aggregate-nbestlist=INT ... number of previous steps to consider when
                             loading data (default = $prev_aggregate_nbl_size)
                             -1 means all previous, i.e. from iteration 1
                             0 means no previous data, i.e. only the
                             current iteration
                             N means this and N previous iterations

  --maximum-iterations=ITERS ... Maximum number of iterations. Default: $maximum_iterations
  --random-directions    ... search only in random directions
  --number-of-random-directions=int ... number of random directions
                             (also works with regular optimizer, default: 0)
  --pairwise-ranked      ... Use PRO for optimisation (Hopkins and May, emnlp 2011)
  --pro-starting-point   ... Use PRO to get a starting point for MERT
  --threads=NUMBER       ... Use multi-threaded mert (must be compiled in).
  --historic-interpolation ... Interpolate optimized weights with prior iterations' weights
                             (parameter sets factor [0;1] given to current weights)
";
  exit 1;
}

# Check validity of input parameters and set defaults if needed

print STDERR "Using SCRIPTS_ROOTDIR: $SCRIPTS_ROOTDIR\n";

# path of script for filtering phrase tables and running the decoder
$filtercmd="$SCRIPTS_ROOTDIR/training/filter-model-given-input.pl" if !defined $filtercmd;

if ( ! -x $filtercmd && ! $___FILTER_PHRASE_TABLE) {
  print STDERR "Filtering command not found: $filtercmd.\n";
  print STDERR "Use --filtercmd=PATH to specify a valid one or --no-filter-phrase-table\n";
  exit 1;
}

# $qsubwrapper="$SCRIPTS_ROOTDIR/generic/qsub-wrapper.pl" if !defined $qsubwrapper;
$qsubwrapper = "$SCRIPTS_ROOTDIR/generic/qsub-wrapper-sge-nosync.pl" if !defined $qsubwrapper;

$qsubwrapper_exit = "$SCRIPTS_ROOTDIR/generic/qsub-wrapper-exit-sge-nosync.pl" if !defined $qsubwrapper_exit;

# $moses_parallel_cmd = "$SCRIPTS_ROOTDIR/generic/moses-parallel.pl"
#   if !defined $moses_parallel_cmd;
$moses_parallel_cmd = "$SCRIPTS_ROOTDIR/generic/moses-parallel-sge-nosync.pl"
  if !defined $moses_parallel_cmd;

if (!defined $mertdir) {
  $mertdir = "$SCRIPTS_ROOTDIR/../mert";
  print STDERR "Assuming --mertdir=$mertdir\n";
}

my $mert_extract_cmd = "$mertdir/extractor";
my $mert_mert_cmd = "$mertdir/mert";
my $mert_pro_cmd = "$mertdir/pro";

die "Not executable: $mert_extract_cmd" if ! -x $mert_extract_cmd;
die "Not executable: $mert_mert_cmd" if ! -x $mert_mert_cmd;
die "Not executable: $mert_pro_cmd" if ! -x $mert_pro_cmd;

my $pro_optimizer = "$mertdir/megam_i686.opt"; # or set to your installation
if (($___PAIRWISE_RANKED_OPTIMIZER || $___PRO_STARTING_POINT) && ! -x $pro_optimizer) {
  print "did not find $pro_optimizer, installing it in $mertdir\n";
  `cd $mertdir; wget http://www.cs.utah.edu/~hal/megam/megam_i686.opt.gz;`;
  `gunzip $pro_optimizer.gz`;
  `chmod +x $pro_optimizer`;
  die("ERROR: Installation of megam_i686.opt failed! Install by hand from http://www.cs.utah.edu/~hal/megam/") unless -x $pro_optimizer;
}

$mertargs = "" if !defined $mertargs;

my $scconfig = undef;
if ($mertargs =~ /\-\-scconfig\s+(.+?)(\s|$)/){
  $scconfig=$1;
  $scconfig =~ s/\,/ /g;
  $mertargs =~ s/\-\-scconfig\s+(.+?)(\s|$)//;
}

# handling reference length strategy
if (($___CLOSEST + $___AVERAGE + $___SHORTEST) > 1){
  die "You can specify just ONE reference length strategy (closest or shortest or average)\n";
}

if ($___SHORTEST){
  $scconfig .= " reflen:shortest";
}elsif ($___AVERAGE){
  $scconfig .= " reflen:average";
}elsif ($___CLOSEST){
  $scconfig .= " reflen:closest";
}

# handling case-insensitive flag
if ($___NOCASE) {
  $scconfig .= " case:false";
}else{
  $scconfig .= " case:true";
}
$scconfig =~ s/^\s+//;
$scconfig =~ s/\s+$//;
$scconfig =~ s/\s+/,/g;

$scconfig = "--scconfig $scconfig" if ($scconfig);

# avoid uninitialized-value warnings when --extractorargs is not given
$extractorargs = "" if !defined $extractorargs;

my $mert_extract_args=$mertargs;
$mert_extract_args .=" $scconfig";
$mert_extract_args .=" $extractorargs";

$mertmertargs = "" if !defined $mertmertargs;

my $mert_mert_args="$mertargs $mertmertargs";
$mert_mert_args =~ s/\-+(binary|b)\b//;
$mert_mert_args .=" $scconfig";
if ($___ACTIVATE_FEATURES){ $mert_mert_args .=" -o \"$___ACTIVATE_FEATURES\""; }

my ($just_cmd_filtercmd,$x) = split(/ /,$filtercmd);
die "Not executable: $just_cmd_filtercmd" if ! -x $just_cmd_filtercmd;
die "Not executable: $moses_parallel_cmd" if defined $___JOBS && ! -x $moses_parallel_cmd;
die "Not executable: $qsubwrapper" if defined $___JOBS && ! -x $qsubwrapper;
#@@# die "Not executable: $___DECODER" if ! -x $___DECODER;

#@@# my $input_abs = ensure_full_path($___DEV_F);
#@@# die "File not found: $___DEV_F (interpreted as $input_abs)."
#@@#   if ! -e $input_abs;
#@@# $___DEV_F = $input_abs;

# Option to pass to qsubwrapper and moses-parallel
my $pass_old_sge = $old_sge ? "-old-sge" : "";

#@@# my $decoder_abs = ensure_full_path($___DECODER);
#@@# die "File not executable: $___DECODER (interpreted as $decoder_abs)."
#@@#   if ! -x $decoder_abs;
#@@# $___DECODER = $decoder_abs;

my $ref_abs = ensure_full_path($___DEV_E);
# check if English dev set (reference translations) exists and store a list of all references
my @references;
if (-e $ref_abs) {
  push @references, $ref_abs;
}
else {
  # if there are multiple files, get a full list of them
  my $part = 0;
  if (! -e $ref_abs."0" && -e $ref_abs.".ref0") {
    $ref_abs .= ".ref";
  }
  while (-e $ref_abs.$part) {
    push @references, $ref_abs.$part;
    $part++;
  }
  die("Reference translations not found: $___DEV_E (interpreted as $ref_abs)") unless $part;
}
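
# Example (hypothetical names): --refs=dev/nc-dev2007.en first tries the
# exact file, then nc-dev2007.en0, nc-dev2007.en1, ... and, failing that,
# the nc-dev2007.en.ref0 naming scheme.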

my $config_abs = ensure_full_path($___CONFIG);
die "File not found: $___CONFIG (interpreted as $config_abs)."
  if ! -e $config_abs;
$___CONFIG = $config_abs;

# moses should use our config
if ($___DECODER_FLAGS =~ /(^|\s)-(config|f) /
    || $___DECODER_FLAGS =~ /(^|\s)-(ttable-file|t) /
    || $___DECODER_FLAGS =~ /(^|\s)-(distortion-file) /
    || $___DECODER_FLAGS =~ /(^|\s)-(generation-file) /
    || $___DECODER_FLAGS =~ /(^|\s)-(lmodel-file) /
    || $___DECODER_FLAGS =~ /(^|\s)-(global-lexical-file) /
   ) {
  die "It is forbidden to supply any of -config, -ttable-file, -distortion-file, -generation-file or -lmodel-file in the --decoder-flags.\nPlease use only the --config option to give the config file that lists all the supplementary files.";
}

# as weights are normalized in the next steps (by cmert)
# normalize initial LAMBDAs, too
my $need_to_normalize = 1;

# store current directory and create the working directory (if needed)
my $cwd = `pawd 2>/dev/null`;
if(!$cwd){$cwd = `pwd`;}
chomp($cwd);

$___WORKING_DIR = $cwd if (!defined $___WORKING_DIR);
chomp $___WORKING_DIR;

print STDERR "working dir is $___WORKING_DIR\n";
#@@# mkpath($___WORKING_DIR);

{
  # open local scope

  # chdir to the working directory
  chdir($___WORKING_DIR) or die "Can't chdir to $___WORKING_DIR";

  # fixed file names
  my $mert_outfile = "mert.out";
  my $mert_logfile = "mert.log";
  my $weights_in_file = "init.opt";
  my $weights_out_file = "weights.txt";

  # set start run
  my $start_run = 1;
  my $bestpoint = undef;
  my $devbleu = undef;
  my $sparse_weights_file = undef;
  my $jobid = -1;

  my $prev_feature_file = undef;
  my $prev_score_file = undef;
  my $prev_init_file = undef;

  #########################
  # set jobid to trace different jobs
  my $prevjid = undef;

  #### my $run=$start_run-1;

  my $oldallsorted = undef;
  my $allsorted = undef;

  my $nbest_file=undef;
  my $lsamp_file=undef; # lattice samples
  my $orig_nbest_file=undef; # replaced if lattice sampling
  my $cmd=undef;

  $nbest_file="run$run.best$___N_BEST_LIST_SIZE.out";
  safesystem("gzip -f $nbest_file") or die "Failed to gzip run*out";
  $nbest_file = $nbest_file.".gz";

  # extract score statistics and features from the nbest lists
  print STDERR "Scoring the nbestlist.\n";

  my $base_feature_file = "features.dat";
  my $base_score_file = "scores.dat";
  my $feature_file = "run$run.${base_feature_file}";
  my $score_file = "run$run.${base_score_file}";

  $cmd = "$mert_extract_cmd $mert_extract_args --scfile $score_file --ffile $feature_file -r ".join(",", @references)." -n $nbest_file";
  $cmd = create_extractor_script($cmd, $___WORKING_DIR);

  &submit_or_exec($cmd,"extract.out","extract.err","extract.id");

} # end of local scope
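
# Illustrative extractor invocation assembled above (hypothetical run 1,
# 100-best list, two references):
#   <mertdir>/extractor <args> --scfile run1.scores.dat --ffile run1.features.dat \
#     -r ref0,ref1 -n run1.best100.out.gz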

sub get_weights_from_mert {
  my ($outfile,$logfile,$weight_count,$sparse_weights) = @_;
  my ($bestpoint,$devbleu);
  if ($___PAIRWISE_RANKED_OPTIMIZER || ($___PRO_STARTING_POINT && $logfile =~ /pro/)) {
    open(IN,$outfile) or die "Can't open $outfile";
    my (@WEIGHT,$sum);
    for(my $i=0;$i<$weight_count;$i++) { push @WEIGHT, 0; }
    while(<IN>) {
      # regular features
      if (/^F(\d+) ([\-\.\de]+)/) {
        $WEIGHT[$1] = $2;
        $sum += abs($2);
      }
      # sparse features
      elsif(/^(.+_.+) ([\-\.\de]+)/) {
        $$sparse_weights{$1} = $2;
      }
    }
    $devbleu = "unknown";
    foreach (@WEIGHT) { $_ /= $sum; }
    foreach (keys %{$sparse_weights}) { $$sparse_weights{$_} /= $sum; }
    $bestpoint = join(" ",@WEIGHT);
    close IN;
  }
  else {
    open(IN,$logfile) or die "Can't open $logfile";
    while (<IN>) {
      if (/Best point:\s*([\s\d\.\-e]+?)\s*=> ([\-\d\.]+)/) {
        $bestpoint = $1;
        $devbleu = $2;
        last;
      }
    }
    close IN;
  }
  return ($bestpoint,$devbleu);
}
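
# Example mert.log line matched above (values hypothetical):
#   Best point: 0.057 0.014 0.141 ... => 0.2763
# yielding $bestpoint = "0.057 0.014 0.141 ..." and $devbleu = 0.2763.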

sub sanity_check_order_of_lambdas {
  my $featlist = shift;
  my $filename_or_stream = shift;

  my @expected_lambdas = @{$featlist->{"names"}};
  my @got = get_order_of_scores_from_nbestlist($filename_or_stream);
  die "Mismatched lambdas. Decoder returned @got, we expected @expected_lambdas"
    if "@got" ne "@expected_lambdas";
}
|
||||||
|
|
||||||
|
sub get_featlist_from_moses {
  # run moses with the given config file and return the list of features and
  # their initial values
  my $configfn = shift;
  my $featlistfn = "./features.list";
  if (-e $featlistfn) {
    print STDERR "Using cached features list: $featlistfn\n";
  } else {
    print STDERR "Asking moses for feature names and values from $___CONFIG\n";
    my $cmd = "$___DECODER $___DECODER_FLAGS -config $configfn -inputtype $___INPUTTYPE -show-weights > $featlistfn";
    safesystem($cmd) or die "Failed to run moses with the config $configfn";
  }

  # read feature list
  my @names = ();
  my @startvalues = ();
  open(INI, $featlistfn) or die "Can't read $featlistfn";
  my $nr = 0;
  my @errs = ();
  while (<INI>) {
    $nr++;
    chomp;
    /^(.+) (\S+) (\S+)$/ || die("invalid feature: $_");
    my ($longname, $feature, $value) = ($1, $2, $3);
    next if $value eq "sparse";
    push @errs, "$featlistfn:$nr:Bad initial value of $feature: $value\n"
      if $value !~ /^[+-]?[0-9.e]+$/;
    push @errs, "$featlistfn:$nr:Unknown feature '$feature', please add it to \@ABBR_FULL_MAP\n"
      if !defined $ABBR2FULL{$feature};
    push @names, $feature;
    push @startvalues, $value;
  }
  close INI;
  if (scalar @errs) {
    print STDERR join("", @errs);
    exit 1;
  }
  return {"names" => \@names, "values" => \@startvalues};
}

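# A made-up features.list excerpt matching the /^(.+) (\S+) (\S+)$/ pattern
# parsed above; real names and values depend on the moses.ini in use:
#
#   LM0 lm 0.5
#   Distortion0 d 0.3
#   WordPenalty0 w -1
#
# The middle field must be a known abbreviation (a key of %ABBR2FULL) and the
# last field either a number or the word "sparse", which is skipped.
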
sub get_order_of_scores_from_nbestlist {
  # read the first line and interpret the ||| label: num num num label2: num ||| column in nbestlist
  # return the score labels in order
  my $fname_or_source = shift;
  # print STDERR "Peeking at the beginning of nbestlist to get order of scores: $fname_or_source\n";
  open IN, $fname_or_source or die "Failed to get order of scores from nbestlist '$fname_or_source'";
  my $line = <IN>;
  close IN;
  die "Line empty in nbestlist '$fname_or_source'" if !defined $line;
  my ($sent, $hypo, $scores, $total) = split /\|\|\|/, $line;
  $scores =~ s/^\s*|\s*$//g;
  die "No scores in line: $line" if $scores eq "";

  my @order = ();
  my $label = undef;
  my $sparse = 0; # we ignore sparse features here
  foreach my $tok (split /\s+/, $scores) {
    if ($tok =~ /.+_.+:/) {
      $sparse = 1;
    } elsif ($tok =~ /^([a-z][0-9a-z]*):/i) {
      $label = $1;
    } elsif ($tok =~ /^-?[-0-9.e]+$/) {
      if (!$sparse) {
        # a score found, remember it
        die "Found a score but no label before it! Bad nbestlist '$fname_or_source'!"
          if !defined $label;
        push @order, $label;
      }
      $sparse = 0;
    } else {
      die "Not a label, not a score '$tok'. Failed to parse the scores string: '$scores' of nbestlist '$fname_or_source'";
    }
  }
  print STDERR "The decoder returns the scores in this order: @order\n";
  return @order;
}

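# An invented n-best list line in the format this routine peeks at:
#
#   0 ||| das haus ||| d: -2.1 lm: -34.2 w: -3 tm: -5.1 -4.0 ||| -7.96
#
# The third "|||" field is scanned token by token: "label:" tokens set the
# current label, bare numbers emit that label once per score (so the example
# yields d lm w tm tm), and underscore-bearing "name_pair:" sparse groups
# are skipped.
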
sub safesystem {
  print STDERR "Executing: @_\n";
  system(@_);
  if ($? == -1) {
    print STDERR "Failed to execute: @_\n $!\n";
    exit(1);
  } elsif ($? & 127) {
    printf STDERR "Execution of: @_\n died with signal %d, %s coredump\n",
      ($? & 127), ($? & 128) ? 'with' : 'without';
    exit(1);
  } else {
    my $exitcode = $? >> 8;
    print STDERR "Exit code: $exitcode\n" if $exitcode;
    return ! $exitcode;
  }
}

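# How safesystem() decodes the $? status word, with illustrative outcomes:
#   $? == -1   the command could not be started at all      -> exit(1)
#   $? & 127   the child died on a signal (coredump noted)  -> exit(1)
#   $? >> 8    normal exit code; a non-zero code is logged and the sub
#              returns false, which is why callers chain it with "or die"
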
sub ensure_full_path {
  my $PATH = shift;
  $PATH =~ s/\/nfsmnt//;
  return $PATH if $PATH =~ /^\//;
  my $dir = `pawd 2>/dev/null`;
  if (!$dir) { $dir = `pwd`; }
  chomp($dir);
  $PATH = $dir."/".$PATH;
  $PATH =~ s/[\r\n]//g;
  $PATH =~ s/\/\.\//\//g;
  $PATH =~ s/\/+/\//g;
  my $sanity = 0;
  while ($PATH =~ /\/\.\.\// && $sanity++ < 10) {
    $PATH =~ s/\/+/\//g;
    $PATH =~ s/\/[^\/]+\/\.\.\//\//g;
  }
  $PATH =~ s/\/[^\/]+\/\.\.$//;
  $PATH =~ s/\/+$//;
  $PATH =~ s/\/nfsmnt//;
  return $PATH;
}

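# Illustrative behaviour of ensure_full_path() (paths invented):
#   ensure_full_path("tuning/../run1.out") -> "/home/user/expt/run1.out",
#       anchored at the current directory with "." and ".." collapsed
#   ensure_full_path("/already/absolute")  -> returned unchanged (after the
#       site-specific "/nfsmnt" prefix strip)
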
sub submit_or_exec {
  my $argvlen = @_;
  my $cmd = undef;
  my $stdout = undef;
  my $stderr = undef;
  my $jidfile = undef;
  my $prevjid = undef;

  # if supplied 3 arguments, exec without submitting
  # if supplied 4 arguments, submit a new job
  # if supplied 5 arguments, also wait for the previous job to finish
  if ($argvlen == 3) {
    ($cmd, $stdout, $stderr) = @_;
  } elsif ($argvlen == 4) {
    ($cmd, $stdout, $stderr, $jidfile) = @_;
  } elsif ($argvlen == 5) {
    ($cmd, $stdout, $stderr, $jidfile, $prevjid) = @_;
  }

  print STDERR "exec: $cmd\n";
  if (defined $___JOBS && $___JOBS > 0 && $argvlen == 5) {
    safesystem("$qsubwrapper $pass_old_sge -command='$cmd' -queue-parameters=\"$queue_flags\" -stdout=$stdout -stderr=$stderr -jidfile=$jidfile -prevjid=$prevjid")
      or die "ERROR: Failed to submit '$cmd' (via $qsubwrapper)";
  } elsif (defined $___JOBS && $___JOBS > 0 && $argvlen == 4) {
    safesystem("$qsubwrapper $pass_old_sge -command='$cmd' -queue-parameters=\"$queue_flags\" -stdout=$stdout -stderr=$stderr -jidfile=$jidfile")
      or die "ERROR: Failed to submit '$cmd' (via $qsubwrapper)";
  } else {
    safesystem("$cmd > $stdout 2> $stderr") or die "ERROR: Failed to run '$cmd'.";
  }
}

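# The three calling conventions of submit_or_exec(), with illustrative
# file names:
#   submit_or_exec($cmd, "run1.out", "run1.err");              # run locally
#   submit_or_exec($cmd, "x.out", "x.err", "x.id");            # qsub, job id to x.id
#   submit_or_exec($cmd, "x.out", "x.err", "x.id", $prevjid);  # qsub, hold on $prevjid
# The qsub paths are only taken when $___JOBS is defined and positive;
# otherwise the command runs in-process with redirected stdout/stderr.
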
sub exit_submit {
  my $argvlen = @_;
  my $cmd = undef;
  my $stdout = undef;
  my $stderr = undef;
  my $jidfile = undef;
  my $pidfile = undef;
  my $prevjid = undef;
  my $prevjidarraysize = 0;
  my @prevjidarray = ();
  my $pid = undef;
  my $qsubcmd = "";
  my $hj = "";

  # if supplied 4 arguments, submit a new job
  # if supplied 5 arguments, also wait for the previous job to finish
  if ($argvlen == 2) {
    ($stdout, $stderr) = @_;
  } elsif ($argvlen == 4) {
    ($stdout, $stderr, $jidfile, $pidfile) = @_;
  } elsif ($argvlen == 5) {
    ($stdout, $stderr, $jidfile, $pidfile, $prevjid) = @_;
  }

  # parse prevjid ########################
  $prevjid =~ s/^\s+|\s+$//g;
  @prevjidarray = split(/\s+/, $prevjid);
  $prevjidarraysize = scalar(@prevjidarray);
  ########################################

  # print STDERR "exec: $stdout\n";

  # read pid from file, and draft exit script ##################
  chomp($pid = `tail -n 1 $pidfile`);
  open(OUT, ">exitjob$pid.sh");

  my $scriptheader = "\#\!/bin/bash\n\#\$ -S /bin/sh\n# Both lines are needed to invoke bash\n# the above line is ignored by qsub, unless parameter \"-b yes\" is set!\n\n";
  $scriptheader .= "uname -a\n\n";
  $scriptheader .= "cd $___WORKING_DIR\n\n";

  print OUT $scriptheader;

  print OUT "if $qsubwrapper_exit -submithost=$submithost -stdout=$stdout -stderr=$stderr -jidfile=$jidfile -pidfile=$pidfile > exitjob$pid.out 2> exitjob$pid.err ; then
echo 'succeeded'
else
echo failed with exit status \$\?
die=1
fi
";
  print OUT "\n\n";

  close(OUT);
  # set permissions of the script
  chmod(oct(755), "exitjob$pid.sh");
  ##############################################################

  if (defined $___JOBS && $___JOBS > 0 && $argvlen == 5) {
    if (defined $prevjid && $prevjid != -1 && $prevjidarraysize == 1) {
      $hj = "-hold_jid $prevjid";
    } elsif (defined $prevjid && $prevjidarraysize > 1) {
      $hj = "-hold_jid " . join(" -hold_jid ", @prevjidarray);
    }
    $qsubcmd = "qsub $queue_flags -V $hj exitjob$pid.sh > exitjob$pid.log 2>&1";
    safesystem($qsubcmd) or die "ERROR: Failed to exit-submit $pid (via $qsubwrapper_exit)";
  } elsif (defined $___JOBS && $___JOBS > 0 && $argvlen == 4) {
    $qsubcmd = "qsub $queue_flags -V exitjob$pid.sh > exitjob$pid.log 2>&1";
    safesystem($qsubcmd) or die "ERROR: Failed to exit-submit $pid (via $qsubwrapper_exit)";
  } else {
    safesystem("rm $stdout") or die "ERROR: Failed to remove '$stdout'.";
    safesystem("rm $stderr") or die "ERROR: Failed to remove '$stderr'.";
  }
}

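# A sketch of the exitjob$pid.sh script drafted above, for an illustrative
# pid 4242 (the real header also carries the qsub bookkeeping comments):
#
#   #!/bin/bash
#   #$ -S /bin/sh
#   uname -a
#   cd $___WORKING_DIR
#   if <qsubwrapper_exit call> > exitjob4242.out 2> exitjob4242.err ; then
#     echo 'succeeded'
#   else
#     echo failed with exit status $?
#     die=1
#   fi
#
# Wrapping the cleanup in its own script lets it be queued with
# qsub -hold_jid, so it runs only after the jobs it waits for have finished.
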
sub create_extractor_script() {
  my ($cmd, $outdir) = @_;
  my $script_path = File::Spec->catfile($outdir, "extractor.sh");

  open my $out, '>', $script_path
    or die "Couldn't open $script_path for writing: $!\n";
  print $out "#!/bin/bash\n";
  print $out "cd $outdir\n";
  print $out "$cmd\n";
  close($out);

  `chmod +x $script_path`;

  return $script_path;
}
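
# For a working directory "/home/user/tuning" and an extractor command $cmd
# (both illustrative), the generated extractor.sh is simply:
#
#   #!/bin/bash
#   cd /home/user/tuning
#   <the extractor command>
#
# Fixing the working directory inside the script keeps relative paths valid
# when the command is later executed on a cluster node via qsub.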
@@ -18,16 +18,16 @@
 <folderInfo id="cdt.managedbuild.config.gnu.cross.exe.debug.1624346127." name="/" resourcePath="">
 <toolChain id="cdt.managedbuild.toolchain.gnu.cross.exe.debug.499747849" name="Cross GCC" superClass="cdt.managedbuild.toolchain.gnu.cross.exe.debug">
 <targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.798364121" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
-<builder buildPath="${workspace_loc:/extract-ordering}/Debug" id="cdt.managedbuild.builder.gnu.cross.1976289814" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.builder.gnu.cross"/>
+<builder buildPath="${workspace_loc:/extract-ordering}/Debug" id="cdt.managedbuild.builder.gnu.cross.1976289814" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.builder.gnu.cross"/>
 <tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.1699460827" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
-<option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.option.optimization.level.1324749613" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" useByScannerDiscovery="false" valueType="enumerated"/>
+<option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.option.optimization.level.1324749613" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" valueType="enumerated"/>
-<option id="gnu.c.compiler.option.debugging.level.1750299246" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" useByScannerDiscovery="false" value="gnu.c.debugging.level.max" valueType="enumerated"/>
+<option id="gnu.c.compiler.option.debugging.level.1750299246" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" value="gnu.c.debugging.level.max" valueType="enumerated"/>
 <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.719498215" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
 </tool>
 <tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.1317297964" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
-<option id="gnu.cpp.compiler.option.optimization.level.251118848" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" useByScannerDiscovery="false" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
+<option id="gnu.cpp.compiler.option.optimization.level.251118848" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
-<option id="gnu.cpp.compiler.option.debugging.level.99297656" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" useByScannerDiscovery="false" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
+<option id="gnu.cpp.compiler.option.debugging.level.99297656" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
-<option id="gnu.cpp.compiler.option.include.paths.106920816" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
+<option id="gnu.cpp.compiler.option.include.paths.106920816" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
 <listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost/include&quot;"/>
 </option>
 <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1327002489" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
@@ -123,4 +123,12 @@
 </scannerConfigBuildInfo>
 </storageModule>
 <storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
+<storageModule moduleId="refreshScope" versionNumber="2">
+<configuration configurationName="Release">
+<resource resourceType="PROJECT" workspacePath="/extract-ordering"/>
+</configuration>
+<configuration configurationName="Debug">
+<resource resourceType="PROJECT" workspacePath="/extract-ordering"/>
+</configuration>
+</storageModule>
 </cproject>
@@ -241,51 +241,6 @@
 <type>1</type>
 <locationURI>PARENT-3-PROJECT_LOC/moses/ChartTranslationOptions.h</locationURI>
 </link>
-<link>
-<name>ChartTrellisDetour.cpp</name>
-<type>1</type>
-<locationURI>PARENT-3-PROJECT_LOC/moses/ChartTrellisDetour.cpp</locationURI>
-</link>
-<link>
-<name>ChartTrellisDetour.h</name>
-<type>1</type>
-<locationURI>PARENT-3-PROJECT_LOC/moses/ChartTrellisDetour.h</locationURI>
-</link>
-<link>
-<name>ChartTrellisDetourQueue.cpp</name>
-<type>1</type>
-<locationURI>PARENT-3-PROJECT_LOC/moses/ChartTrellisDetourQueue.cpp</locationURI>
-</link>
-<link>
-<name>ChartTrellisDetourQueue.h</name>
-<type>1</type>
-<locationURI>PARENT-3-PROJECT_LOC/moses/ChartTrellisDetourQueue.h</locationURI>
-</link>
-<link>
-<name>ChartTrellisNode.cpp</name>
-<type>1</type>
-<locationURI>PARENT-3-PROJECT_LOC/moses/ChartTrellisNode.cpp</locationURI>
-</link>
-<link>
-<name>ChartTrellisNode.h</name>
-<type>1</type>
-<locationURI>PARENT-3-PROJECT_LOC/moses/ChartTrellisNode.h</locationURI>
-</link>
-<link>
-<name>ChartTrellisPath.cpp</name>
-<type>1</type>
-<locationURI>PARENT-3-PROJECT_LOC/moses/ChartTrellisPath.cpp</locationURI>
-</link>
-<link>
-<name>ChartTrellisPath.h</name>
-<type>1</type>
-<locationURI>PARENT-3-PROJECT_LOC/moses/ChartTrellisPath.h</locationURI>
-</link>
-<link>
-<name>ChartTrellisPathList.h</name>
-<type>1</type>
-<locationURI>PARENT-3-PROJECT_LOC/moses/ChartTrellisPathList.h</locationURI>
-</link>
 <link>
 <name>ConfusionNet.cpp</name>
 <type>1</type>
@@ -586,6 +541,11 @@
 <type>1</type>
 <locationURI>PARENT-3-PROJECT_LOC/moses/PDTAimp.h</locationURI>
 </link>
+<link>
+<name>PP</name>
+<type>2</type>
+<locationURI>virtual:/virtual</locationURI>
+</link>
 <link>
 <name>Parameter.cpp</name>
 <type>1</type>
@@ -1211,6 +1171,26 @@
 <type>2</type>
 <locationURI>virtual:/virtual</locationURI>
 </link>
+<link>
+<name>FF/MaxSpanFreeNonTermSource.cpp</name>
+<type>1</type>
+<locationURI>PARENT-3-PROJECT_LOC/moses/FF/MaxSpanFreeNonTermSource.cpp</locationURI>
+</link>
+<link>
+<name>FF/MaxSpanFreeNonTermSource.h</name>
+<type>1</type>
+<locationURI>PARENT-3-PROJECT_LOC/moses/FF/MaxSpanFreeNonTermSource.h</locationURI>
+</link>
+<link>
+<name>FF/NieceTerminal.cpp</name>
+<type>1</type>
+<locationURI>PARENT-3-PROJECT_LOC/moses/FF/NieceTerminal.cpp</locationURI>
+</link>
+<link>
+<name>FF/NieceTerminal.h</name>
+<type>1</type>
+<locationURI>PARENT-3-PROJECT_LOC/moses/FF/NieceTerminal.h</locationURI>
+</link>
 <link>
 <name>FF/OSM-Feature</name>
 <type>2</type>
@@ -1272,14 +1252,14 @@
 <locationURI>PARENT-3-PROJECT_LOC/moses/FF/ReferenceComparison.h</locationURI>
 </link>
 <link>
-<name>FF/RuleAmbiguity.cpp</name>
+<name>FF/RuleScope.cpp</name>
 <type>1</type>
-<locationURI>PARENT-3-PROJECT_LOC/moses/FF/RuleAmbiguity.cpp</locationURI>
+<locationURI>PARENT-3-PROJECT_LOC/moses/FF/RuleScope.cpp</locationURI>
 </link>
 <link>
-<name>FF/RuleAmbiguity.h</name>
+<name>FF/RuleScope.h</name>
 <type>1</type>
-<locationURI>PARENT-3-PROJECT_LOC/moses/FF/RuleAmbiguity.h</locationURI>
+<locationURI>PARENT-3-PROJECT_LOC/moses/FF/RuleScope.h</locationURI>
 </link>
 <link>
 <name>FF/SetSourcePhrase.cpp</name>
@@ -1626,6 +1606,26 @@
 <type>1</type>
 <locationURI>PARENT-3-PROJECT_LOC/moses/LM/backward.arpa</locationURI>
 </link>
+<link>
+<name>PP/Factory.cpp</name>
+<type>1</type>
+<locationURI>PARENT-3-PROJECT_LOC/moses/PP/Factory.cpp</locationURI>
+</link>
+<link>
+<name>PP/Factory.h</name>
+<type>1</type>
+<locationURI>PARENT-3-PROJECT_LOC/moses/PP/Factory.h</locationURI>
+</link>
+<link>
+<name>PP/PhraseProperty.h</name>
+<type>1</type>
+<locationURI>PARENT-3-PROJECT_LOC/moses/PP/PhraseProperty.h</locationURI>
+</link>
+<link>
+<name>PP/TreeStructurePhraseProperty.h</name>
+<type>1</type>
+<locationURI>PARENT-3-PROJECT_LOC/moses/PP/TreeStructurePhraseProperty.h</locationURI>
+</link>
 <link>
 <name>TranslationModel/BilingualDynSuffixArray.cpp</name>
 <type>1</type>
@@ -20,14 +20,14 @@
 <targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.2040884960" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
 <builder buildPath="${workspace_loc:/score/Debug}" id="cdt.managedbuild.builder.gnu.cross.1709170788" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.builder.gnu.cross"/>
 <tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.786339685" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
-<option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.option.optimization.level.1516054114" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" useByScannerDiscovery="false" valueType="enumerated"/>
+<option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.option.optimization.level.1516054114" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" valueType="enumerated"/>
-<option id="gnu.c.compiler.option.debugging.level.1061705384" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" useByScannerDiscovery="false" value="gnu.c.debugging.level.max" valueType="enumerated"/>
+<option id="gnu.c.compiler.option.debugging.level.1061705384" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" value="gnu.c.debugging.level.max" valueType="enumerated"/>
 <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.2108019237" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
 </tool>
 <tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.1013232238" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
-<option id="gnu.cpp.compiler.option.optimization.level.1874109813" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" useByScannerDiscovery="false" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
+<option id="gnu.cpp.compiler.option.optimization.level.1874109813" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
-<option id="gnu.cpp.compiler.option.debugging.level.2032778777" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" useByScannerDiscovery="false" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
+<option id="gnu.cpp.compiler.option.debugging.level.2032778777" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
-<option id="gnu.cpp.compiler.option.include.paths.1713606194" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" useByScannerDiscovery="false" valueType="includePath">
+<option id="gnu.cpp.compiler.option.include.paths.1713606194" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
 <listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost/include&quot;"/>
 <listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../&quot;"/>
 </option>
@@ -42,11 +42,12 @@
 </option>
 <option id="gnu.cpp.link.option.libs.936233947" name="Libraries (-l)" superClass="gnu.cpp.link.option.libs" valueType="libs">
 <listOptionValue builtIn="false" value="z"/>
-<listOptionValue builtIn="false" value="util"/>
 <listOptionValue builtIn="false" value="moses"/>
+<listOptionValue builtIn="false" value="util"/>
 <listOptionValue builtIn="false" value="boost_iostreams-mt"/>
 <listOptionValue builtIn="false" value="boost_system-mt"/>
 <listOptionValue builtIn="false" value="boost_filesystem-mt"/>
+<listOptionValue builtIn="false" value="rt"/>
 </option>
 <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.589709979" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
 <additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
@@ -82,13 +83,13 @@
 <targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.targetPlatform.gnu.cross.1353054437" isAbstract="false" osList="all" superClass="cdt.managedbuild.targetPlatform.gnu.cross"/>
 <builder buildPath="${workspace_loc:/score/Release}" id="cdt.managedbuild.builder.gnu.cross.1851758128" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.builder.gnu.cross"/>
 <tool id="cdt.managedbuild.tool.gnu.cross.c.compiler.323743241" name="Cross GCC Compiler" superClass="cdt.managedbuild.tool.gnu.cross.c.compiler">
-<option defaultValue="gnu.c.optimization.level.most" id="gnu.c.compiler.option.optimization.level.534423111" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" useByScannerDiscovery="false" valueType="enumerated"/>
+<option defaultValue="gnu.c.optimization.level.most" id="gnu.c.compiler.option.optimization.level.534423111" name="Optimization Level" superClass="gnu.c.compiler.option.optimization.level" valueType="enumerated"/>
-<option id="gnu.c.compiler.option.debugging.level.518786530" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" useByScannerDiscovery="false" value="gnu.c.debugging.level.none" valueType="enumerated"/>
+<option id="gnu.c.compiler.option.debugging.level.518786530" name="Debug Level" superClass="gnu.c.compiler.option.debugging.level" value="gnu.c.debugging.level.none" valueType="enumerated"/>
 <inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.392640311" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
 </tool>
 <tool id="cdt.managedbuild.tool.gnu.cross.cpp.compiler.307472312" name="Cross G++ Compiler" superClass="cdt.managedbuild.tool.gnu.cross.cpp.compiler">
-<option id="gnu.cpp.compiler.option.optimization.level.407718562" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" useByScannerDiscovery="false" value="gnu.cpp.compiler.optimization.level.most" valueType="enumerated"/>
+<option id="gnu.cpp.compiler.option.optimization.level.407718562" name="Optimization Level" superClass="gnu.cpp.compiler.option.optimization.level" value="gnu.cpp.compiler.optimization.level.most" valueType="enumerated"/>
-<option id="gnu.cpp.compiler.option.debugging.level.1687450255" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" useByScannerDiscovery="false" value="gnu.cpp.compiler.debugging.level.none" valueType="enumerated"/>
+<option id="gnu.cpp.compiler.option.debugging.level.1687450255" name="Debug Level" superClass="gnu.cpp.compiler.option.debugging.level" value="gnu.cpp.compiler.debugging.level.none" valueType="enumerated"/>
 <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.593478428" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
 </tool>
 <tool id="cdt.managedbuild.tool.gnu.cross.c.linker.165176764" name="Cross GCC Linker" superClass="cdt.managedbuild.tool.gnu.cross.c.linker"/>
@@ -3,6 +3,8 @@
 <name>score</name>
 <comment></comment>
 <projects>
+<project>moses</project>
+<project>util</project>
 </projects>
 <buildSpec>
 <buildCommand>
@@ -279,6 +279,12 @@ public:
 manager.ProcessSentence();
 const ChartHypothesis *hypo = manager.GetBestHypothesis();
 outputChartHypo(out,hypo);
+if (addGraphInfo) {
+  const size_t translationId = tinput.GetTranslationId();
+  std::ostringstream sgstream;
+  manager.GetSearchGraph(translationId,sgstream);
+  retData.insert(pair<string, xmlrpc_c::value>("sg", xmlrpc_c::value_string(sgstream.str())));
+}
 } else {
 Sentence sentence;
 const vector<FactorType> &inputFactorOrder =
@@ -310,7 +316,7 @@ public:
 retData.insert(pair<string, xmlrpc_c::value_array>("word-align", alignments));
 }

-if(addGraphInfo) {
+if (addGraphInfo) {
 insertGraphInfo(manager,retData);
 (const_cast<StaticData&>(staticData)).SetOutputSearchGraph(false);
 }
lm/Jamfile
@@ -13,7 +13,16 @@ update-if-changed $(ORDER-LOG) $(max-order) ;

 max-order += <dependency>$(ORDER-LOG) ;

-fakelib kenlm : [ glob *.cc : *main.cc *test.cc ] ../util//kenutil : <include>.. $(max-order) : : <include>.. $(max-order) ;
+wrappers = ;
+local with-nplm = [ option.get "with-nplm" ] ;
+if $(with-nplm) {
+  lib neuralLM : : <search>$(with-nplm)/src ;
+  obj nplm.o : wrappers/nplm.cc : <include>.. <include>$(with-nplm)/src <cxxflags>-fopenmp ;
+  alias nplm : nplm.o neuralLM ..//boost_thread : : : <cxxflags>-fopenmp <linkflags>-fopenmp <define>WITH_NPLM <library>..//boost_thread ;
+  wrappers += nplm ;
+}
+
+fakelib kenlm : $(wrappers) [ glob *.cc : *main.cc *test.cc ] ../util//kenutil : <include>.. $(max-order) : : <include>.. $(max-order) ;

 import testing ;
@@ -10,8 +10,8 @@
 * Currently only used for next pointers.
 */

-#ifndef LM_BHIKSHA__
-#define LM_BHIKSHA__
+#ifndef LM_BHIKSHA_H
+#define LM_BHIKSHA_H

 #include <stdint.h>
 #include <assert.h>
@@ -109,4 +109,4 @@ class ArrayBhiksha {
 } // namespace ngram
 } // namespace lm

-#endif // LM_BHIKSHA__
+#endif // LM_BHIKSHA_H
@@ -149,7 +149,7 @@ void BinaryFormat::InitializeBinary(int fd, ModelType model_type, unsigned int s

 void BinaryFormat::ReadForConfig(void *to, std::size_t amount, uint64_t offset_excluding_header) const {
 assert(header_size_ != kInvalidSize);
-util::PReadOrThrow(file_.get(), to, amount, offset_excluding_header + header_size_);
+util::ErsatzPRead(file_.get(), to, amount, offset_excluding_header + header_size_);
 }

 void *BinaryFormat::LoadBinary(std::size_t size) {
@@ -1,5 +1,5 @@
-#ifndef LM_BINARY_FORMAT__
-#define LM_BINARY_FORMAT__
+#ifndef LM_BINARY_FORMAT_H
+#define LM_BINARY_FORMAT_H

 #include "lm/config.hh"
 #include "lm/model_type.hh"
@@ -103,4 +103,4 @@ bool IsBinaryFormat(int fd);

 } // namespace ngram
 } // namespace lm
-#endif // LM_BINARY_FORMAT__
+#endif // LM_BINARY_FORMAT_H
@@ -1,5 +1,5 @@
-#ifndef LM_BLANK__
-#define LM_BLANK__
+#ifndef LM_BLANK_H
+#define LM_BLANK_H

 #include <limits>

@@ -40,4 +40,4 @@ inline bool HasExtension(const float &backoff) {

 } // namespace ngram
 } // namespace lm
-#endif // LM_BLANK__
+#endif // LM_BLANK_H
@@ -1,8 +1,9 @@
 #include "lm/builder/adjust_counts.hh"
-#include "lm/builder/multi_stream.hh"
+#include "lm/builder/ngram_stream.hh"
 #include "util/stream/timer.hh"

 #include <algorithm>
+#include <iostream>

 namespace lm { namespace builder {

@@ -10,19 +11,19 @@ BadDiscountException::BadDiscountException() throw() {}
 BadDiscountException::~BadDiscountException() throw() {}

 namespace {
 // Return last word in full that is different.
 const WordIndex* FindDifference(const NGram &full, const NGram &lower_last) {
 const WordIndex *cur_word = full.end() - 1;
 const WordIndex *pre_word = lower_last.end() - 1;
 // Find last difference.
 for (; pre_word >= lower_last.begin() && *pre_word == *cur_word; --cur_word, --pre_word) {}
 return cur_word;
 }

 class StatCollector {
 public:
-StatCollector(std::size_t order, std::vector<uint64_t> &counts, std::vector<Discount> &discounts)
+StatCollector(std::size_t order, std::vector<uint64_t> &counts, std::vector<uint64_t> &counts_pruned, std::vector<Discount> &discounts)
-: orders_(order), full_(orders_.back()), counts_(counts), discounts_(discounts) {
+: orders_(order), full_(orders_.back()), counts_(counts), counts_pruned_(counts_pruned), discounts_(discounts) {
 memset(&orders_[0], 0, sizeof(OrderStat) * order);
 }

@@ -30,10 +31,12 @@ class StatCollector {

 void CalculateDiscounts() {
 counts_.resize(orders_.size());
+counts_pruned_.resize(orders_.size());
 discounts_.resize(orders_.size());
 for (std::size_t i = 0; i < orders_.size(); ++i) {
 const OrderStat &s = orders_[i];
 counts_[i] = s.count;
+counts_pruned_[i] = s.count_pruned;

 for (unsigned j = 1; j < 4; ++j) {
 // TODO: Specialize error message for j == 3, meaning 3+
@@ -52,14 +55,18 @@ class StatCollector {
 }
 }

-void Add(std::size_t order_minus_1, uint64_t count) {
+void Add(std::size_t order_minus_1, uint64_t count, bool pruned = false) {
 OrderStat &stat = orders_[order_minus_1];
 ++stat.count;
+if (!pruned)
+  ++stat.count_pruned;
 if (count < 5) ++stat.n[count];
 }

-void AddFull(uint64_t count) {
+void AddFull(uint64_t count, bool pruned = false) {
 ++full_.count;
+if (!pruned)
+  ++full_.count_pruned;
 if (count < 5) ++full_.n[count];
 }

@@ -68,24 +75,27 @@ class StatCollector {
 // n_1 in equation 26 of Chen and Goodman etc
 uint64_t n[5];
 uint64_t count;
+uint64_t count_pruned;
 };

 std::vector<OrderStat> orders_;
 OrderStat &full_;

 std::vector<uint64_t> &counts_;
+std::vector<uint64_t> &counts_pruned_;
 std::vector<Discount> &discounts_;
 };

 // Reads all entries in order like NGramStream does.
 // But deletes any entries that have <s> in the 1st (not 0th) position on the
 // way out by putting other entries in their place. This disrupts the sort
 // order but we don't care because the data is going to be sorted again.
 class CollapseStream {
 public:
-CollapseStream(const util::stream::ChainPosition &position) :
+CollapseStream(const util::stream::ChainPosition &position, uint64_t prune_threshold) :
 current_(NULL, NGram::OrderFromSize(position.GetChain().EntrySize())),
-block_(position) {
+prune_threshold_(prune_threshold),
+block_(position) {
 StartBlock();
 }

@@ -96,10 +106,18 @@ class CollapseStream {
 CollapseStream &operator++() {
 assert(block_);

 if (current_.begin()[1] == kBOS && current_.Base() < copy_from_) {
 memcpy(current_.Base(), copy_from_, current_.TotalSize());
 UpdateCopyFrom();
+
+// Mark highest order n-grams for later pruning
+if(current_.Count() <= prune_threshold_) {
+  current_.Mark();
+}
+
 }

 current_.NextInMemory();
 uint8_t *block_base = static_cast<uint8_t*>(block_->Get());
 if (current_.Base() == block_base + block_->ValidSize()) {
@@ -107,6 +125,12 @@ class CollapseStream {
 ++block_;
 StartBlock();
 }
+
+// Mark highest order n-grams for later pruning
+if(current_.Count() <= prune_threshold_) {
+  current_.Mark();
+}
+
 return *this;
 }

@@ -119,9 +143,15 @@ class CollapseStream {
 current_.ReBase(block_->Get());
 copy_from_ = static_cast<uint8_t*>(block_->Get()) + block_->ValidSize();
 UpdateCopyFrom();
+
+// Mark highest order n-grams for later pruning
+if(current_.Count() <= prune_threshold_) {
+  current_.Mark();
+}
+
 }

 // Find last without bos.
 void UpdateCopyFrom() {
 for (copy_from_ -= current_.TotalSize(); copy_from_ >= current_.Base(); copy_from_ -= current_.TotalSize()) {
 if (NGram(copy_from_, current_.Order()).begin()[1] != kBOS) break;
@@ -132,79 +162,103 @@ class CollapseStream {

 // Goes backwards in the block
 uint8_t *copy_from_;
+uint64_t prune_threshold_;
 util::stream::Link block_;
 };

 } // namespace

-void AdjustCounts::Run(const ChainPositions &positions) {
+void AdjustCounts::Run(const util::stream::ChainPositions &positions) {
 UTIL_TIMER("(%w s) Adjusted counts\n");

 const std::size_t order = positions.size();
-StatCollector stats(order, counts_, discounts_);
+StatCollector stats(order, counts_, counts_pruned_, discounts_);
 if (order == 1) {
+
 // Only unigrams. Just collect stats.
 for (NGramStream full(positions[0]); full; ++full)
 stats.AddFull(full->Count());

 stats.CalculateDiscounts();
 return;
 }

 NGramStreams streams;
 streams.Init(positions, positions.size() - 1);
-CollapseStream full(positions[positions.size() - 1]);
+
+CollapseStream full(positions[positions.size() - 1], prune_thresholds_.back());

 // Initialization: <unk> has count 0 and so does <s>.
 NGramStream *lower_valid = streams.begin();
 streams[0]->Count() = 0;
 *streams[0]->begin() = kUNK;
 stats.Add(0, 0);
 (++streams[0])->Count() = 0;
 *streams[0]->begin() = kBOS;
 // not in stats because it will get put in later.

+std::vector<uint64_t> lower_counts(positions.size(), 0);
+
 // iterate over full (the stream of the highest order ngrams)
 for (; full; ++full) {
 const WordIndex *different = FindDifference(*full, **lower_valid);
 std::size_t same = full->end() - 1 - different;
 // Increment the adjusted count.
 if (same) ++streams[same - 1]->Count();

 // Output all the valid ones that changed.
 for (; lower_valid >= &streams[same]; --lower_valid) {
-stats.Add(lower_valid - streams.begin(), (*lower_valid)->Count());
+
+// mjd: review this!
+uint64_t order = (*lower_valid)->Order();
+uint64_t realCount = lower_counts[order - 1];
+if(order > 1 && prune_thresholds_[order - 1] && realCount <= prune_thresholds_[order - 1])
+  (*lower_valid)->Mark();
+
+stats.Add(lower_valid - streams.begin(), (*lower_valid)->UnmarkedCount(), (*lower_valid)->IsMarked());
 ++*lower_valid;
 }

+// Count the true occurrences of lower-order n-grams
+for (std::size_t i = 0; i < lower_counts.size(); ++i) {
+  if (i >= same) {
+    lower_counts[i] = 0;
+  }
+  lower_counts[i] += full->UnmarkedCount();
+}
+
 // This is here because bos is also const WordIndex *, so copy gets
 // consistent argument types.
 const WordIndex *full_end = full->end();
 // Initialize and mark as valid up to bos.
 const WordIndex *bos;
 for (bos = different; (bos > full->begin()) && (*bos != kBOS); --bos) {
 ++lower_valid;
 std::copy(bos, full_end, (*lower_valid)->begin());
 (*lower_valid)->Count() = 1;
 }
 // Now bos indicates where <s> is or is the 0th word of full.
 if (bos != full->begin()) {
 // There is an <s> beyond the 0th word.
 NGramStream &to = *++lower_valid;
 std::copy(bos, full_end, to->begin());
-to->Count() = full->Count();
+
+// mjd: what is this doing?
+to->Count() = full->UnmarkedCount();
 } else {
-stats.AddFull(full->Count());
+stats.AddFull(full->UnmarkedCount(), full->IsMarked());
 }
 assert(lower_valid >= &streams[0]);
 }

 // Output everything valid.
 for (NGramStream *s = streams.begin(); s <= lower_valid; ++s) {
-stats.Add(s - streams.begin(), (*s)->Count());
+if((*s)->Count() <= prune_thresholds_[(*s)->Order() - 1])
+  (*s)->Mark();
+stats.Add(s - streams.begin(), (*s)->UnmarkedCount(), (*s)->IsMarked());
 ++*s;
 }
 // Poison everyone! Except the N-grams which were already poisoned by the input.
 for (NGramStream *s = streams.begin(); s != streams.end(); ++s)
 s->Poison();
|
|||||||
#ifndef LM_BUILDER_ADJUST_COUNTS__
|
#ifndef LM_BUILDER_ADJUST_COUNTS_H
|
||||||
#define LM_BUILDER_ADJUST_COUNTS__
|
#define LM_BUILDER_ADJUST_COUNTS_H
|
||||||
|
|
||||||
#include "lm/builder/discount.hh"
|
#include "lm/builder/discount.hh"
|
||||||
#include "util/exception.hh"
|
#include "util/exception.hh"
|
||||||
@ -8,11 +8,11 @@
|
|||||||
|
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
|
namespace util { namespace stream { class ChainPositions; } }
|
||||||
|
|
||||||
namespace lm {
|
namespace lm {
|
||||||
namespace builder {
|
namespace builder {
|
||||||
|
|
||||||
class ChainPositions;
|
|
||||||
|
|
||||||
class BadDiscountException : public util::Exception {
|
class BadDiscountException : public util::Exception {
|
||||||
public:
|
public:
|
||||||
BadDiscountException() throw();
|
BadDiscountException() throw();
|
||||||
@ -27,18 +27,21 @@ class BadDiscountException : public util::Exception {
|
|||||||
*/
|
*/
|
||||||
class AdjustCounts {
|
class AdjustCounts {
|
||||||
public:
|
public:
|
||||||
AdjustCounts(std::vector<uint64_t> &counts, std::vector<Discount> &discounts)
|
AdjustCounts(std::vector<uint64_t> &counts, std::vector<uint64_t> &counts_pruned, std::vector<Discount> &discounts, std::vector<uint64_t> &prune_thresholds)
|
||||||
: counts_(counts), discounts_(discounts) {}
|
: counts_(counts), counts_pruned_(counts_pruned), discounts_(discounts), prune_thresholds_(prune_thresholds)
|
||||||
|
{}
|
||||||
|
|
||||||
void Run(const ChainPositions &positions);
|
void Run(const util::stream::ChainPositions &positions);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
std::vector<uint64_t> &counts_;
|
std::vector<uint64_t> &counts_;
|
||||||
|
std::vector<uint64_t> &counts_pruned_;
|
||||||
std::vector<Discount> &discounts_;
|
std::vector<Discount> &discounts_;
|
||||||
|
std::vector<uint64_t> &prune_thresholds_;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace builder
|
} // namespace builder
|
||||||
} // namespace lm
|
} // namespace lm
|
||||||
|
|
||||||
#endif // LM_BUILDER_ADJUST_COUNTS__
|
#endif // LM_BUILDER_ADJUST_COUNTS_H
|
||||||
|
|
||||||
|
@@ -1,6 +1,6 @@
 #include "lm/builder/adjust_counts.hh"

-#include "lm/builder/multi_stream.hh"
+#include "lm/builder/ngram_stream.hh"
 #include "util/scoped.hh"

 #include <boost/thread/thread.hpp>
@@ -61,19 +61,21 @@ BOOST_AUTO_TEST_CASE(Simple) {
 util::stream::ChainConfig config;
 config.total_memory = 100;
 config.block_count = 1;
-Chains chains(4);
+util::stream::Chains chains(4);
 for (unsigned i = 0; i < 4; ++i) {
 config.entry_size = NGram::TotalSize(i + 1);
 chains.push_back(config);
 }

 chains[3] >> WriteInput();
-ChainPositions for_adjust(chains);
+util::stream::ChainPositions for_adjust(chains);
 for (unsigned i = 0; i < 4; ++i) {
 chains[i] >> boost::ref(outputs[i]);
 }
 chains >> util::stream::kRecycle;
-BOOST_CHECK_THROW(AdjustCounts(counts, discount).Run(for_adjust), BadDiscountException);
+std::vector<uint64_t> counts_pruned(4);
+std::vector<uint64_t> prune_thresholds(4);
+BOOST_CHECK_THROW(AdjustCounts(counts, counts_pruned, discount, prune_thresholds).Run(for_adjust), BadDiscountException);
 }
 BOOST_REQUIRE_EQUAL(4UL, counts.size());
 BOOST_CHECK_EQUAL(4UL, counts[0]);
@@ -2,6 +2,7 @@

 #include "lm/builder/ngram.hh"
 #include "lm/lm_exception.hh"
+#include "lm/vocab.hh"
 #include "lm/word_index.hh"
 #include "util/fake_ofstream.hh"
 #include "util/file.hh"
@@ -37,60 +38,6 @@ struct VocabEntry {
 };
 #pragma pack(pop)

-const float kProbingMultiplier = 1.5;
-
-class VocabHandout {
- public:
-  static std::size_t MemUsage(WordIndex initial_guess) {
-    if (initial_guess < 2) initial_guess = 2;
-    return util::CheckOverflow(Table::Size(initial_guess, kProbingMultiplier));
-  }
-
-  explicit VocabHandout(int fd, WordIndex initial_guess) :
-      table_backing_(util::CallocOrThrow(MemUsage(initial_guess))),
-      table_(table_backing_.get(), MemUsage(initial_guess)),
-      double_cutoff_(std::max<std::size_t>(initial_guess * 1.1, 1)),
-      word_list_(fd) {
-    Lookup("<unk>"); // Force 0
-    Lookup("<s>"); // Force 1
-    Lookup("</s>"); // Force 2
-  }
-
-  WordIndex Lookup(const StringPiece &word) {
-    VocabEntry entry;
-    entry.key = util::MurmurHashNative(word.data(), word.size());
-    entry.value = table_.SizeNoSerialization();
-
-    Table::MutableIterator it;
-    if (table_.FindOrInsert(entry, it))
-      return it->value;
-    word_list_ << word << '\0';
-    UTIL_THROW_IF(Size() >= std::numeric_limits<lm::WordIndex>::max(), VocabLoadException, "Too many vocabulary words. Change WordIndex to uint64_t in lm/word_index.hh.");
-    if (Size() >= double_cutoff_) {
-      table_backing_.call_realloc(table_.DoubleTo());
-      table_.Double(table_backing_.get());
-      double_cutoff_ *= 2;
-    }
-    return entry.value;
-  }
-
-  WordIndex Size() const {
-    return table_.SizeNoSerialization();
-  }
-
- private:
-  // TODO: factor out a resizable probing hash table.
-  // TODO: use mremap on linux to get all zeros on resizes.
-  util::scoped_malloc table_backing_;
-
-  typedef util::ProbingHashTable<VocabEntry, util::IdentityHash> Table;
-  Table table_;
-
-  std::size_t double_cutoff_;
-
-  util::FakeOFStream word_list_;
-};
-
 class DedupeHash : public std::unary_function<const WordIndex *, bool> {
 public:
 explicit DedupeHash(std::size_t order) : size_(order * sizeof(WordIndex)) {}
@@ -127,6 +74,10 @@ struct DedupeEntry {
 }
 };

+
+// TODO: don't have this here, should be with probing hash table defaults?
+const float kProbingMultiplier = 1.5;
+
 typedef util::ProbingHashTable<DedupeEntry, DedupeHash, DedupeEquals> Dedupe;

 class Writer {
@@ -220,37 +171,50 @@ float CorpusCount::DedupeMultiplier(std::size_t order) {
 }

 std::size_t CorpusCount::VocabUsage(std::size_t vocab_estimate) {
-return VocabHandout::MemUsage(vocab_estimate);
+return ngram::GrowableVocab<ngram::WriteUniqueWords>::MemUsage(vocab_estimate);
 }

-CorpusCount::CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::size_t entries_per_block)
+CorpusCount::CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::size_t entries_per_block, WarningAction disallowed_symbol)
 : from_(from), vocab_write_(vocab_write), token_count_(token_count), type_count_(type_count),
|
||||||
dedupe_mem_size_(Dedupe::Size(entries_per_block, kProbingMultiplier)),
|
dedupe_mem_size_(Dedupe::Size(entries_per_block, kProbingMultiplier)),
|
||||||
dedupe_mem_(util::MallocOrThrow(dedupe_mem_size_)) {
|
dedupe_mem_(util::MallocOrThrow(dedupe_mem_size_)),
|
||||||
|
disallowed_symbol_action_(disallowed_symbol) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void CorpusCount::Run(const util::stream::ChainPosition &position) {
|
namespace {
|
||||||
UTIL_TIMER("(%w s) Counted n-grams\n");
|
void ComplainDisallowed(StringPiece word, WarningAction &action) {
|
||||||
|
switch (action) {
|
||||||
|
case SILENT:
|
||||||
|
return;
|
||||||
|
case COMPLAIN:
|
||||||
|
std::cerr << "Warning: " << word << " appears in the input. All instances of <s>, </s>, and <unk> will be interpreted as whitespace." << std::endl;
|
||||||
|
action = SILENT;
|
||||||
|
return;
|
||||||
|
case THROW_UP:
|
||||||
|
UTIL_THROW(FormatLoadException, "Special word " << word << " is not allowed in the corpus. I plan to support models containing <unk> in the future. Pass --skip_symbols to convert these symbols to whitespace.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} // namespace
|
||||||
|
|
||||||
VocabHandout vocab(vocab_write_, type_count_);
|
void CorpusCount::Run(const util::stream::ChainPosition &position) {
|
||||||
|
ngram::GrowableVocab<ngram::WriteUniqueWords> vocab(type_count_, vocab_write_);
|
||||||
token_count_ = 0;
|
token_count_ = 0;
|
||||||
type_count_ = 0;
|
type_count_ = 0;
|
||||||
const WordIndex end_sentence = vocab.Lookup("</s>");
|
const WordIndex end_sentence = vocab.FindOrInsert("</s>");
|
||||||
Writer writer(NGram::OrderFromSize(position.GetChain().EntrySize()), position, dedupe_mem_.get(), dedupe_mem_size_);
|
Writer writer(NGram::OrderFromSize(position.GetChain().EntrySize()), position, dedupe_mem_.get(), dedupe_mem_size_);
|
||||||
uint64_t count = 0;
|
uint64_t count = 0;
|
||||||
bool delimiters[256];
|
bool delimiters[256];
|
||||||
memset(delimiters, 0, sizeof(delimiters));
|
util::BoolCharacter::Build("\0\t\n\r ", delimiters);
|
||||||
const char kDelimiterSet[] = "\0\t\n\r ";
|
|
||||||
for (const char *i = kDelimiterSet; i < kDelimiterSet + sizeof(kDelimiterSet); ++i) {
|
|
||||||
delimiters[static_cast<unsigned char>(*i)] = true;
|
|
||||||
}
|
|
||||||
try {
|
try {
|
||||||
while(true) {
|
while(true) {
|
||||||
StringPiece line(from_.ReadLine());
|
StringPiece line(from_.ReadLine());
|
||||||
writer.StartSentence();
|
writer.StartSentence();
|
||||||
for (util::TokenIter<util::BoolCharacter, true> w(line, delimiters); w; ++w) {
|
for (util::TokenIter<util::BoolCharacter, true> w(line, delimiters); w; ++w) {
|
||||||
WordIndex word = vocab.Lookup(*w);
|
WordIndex word = vocab.FindOrInsert(*w);
|
||||||
UTIL_THROW_IF(word <= 2, FormatLoadException, "Special word " << *w << " is not allowed in the corpus. I plan to support models containing <unk> in the future.");
|
if (word <= 2) {
|
||||||
|
ComplainDisallowed(*w, disallowed_symbol_action_);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
writer.Append(word);
|
writer.Append(word);
|
||||||
++count;
|
++count;
|
||||||
}
|
}
|
||||||
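The new ComplainDisallowed replaces a hard UTIL_THROW with a configurable action that warns exactly once and then goes quiet. Below is a minimal standalone sketch of that warn-once pattern; the names here are illustrative, not the library's API.

#include <iostream>
#include <stdexcept>
#include <string>

enum WarningAction { SILENT, COMPLAIN, THROW_UP };

void HandleDisallowed(const std::string &word, WarningAction &action) {
  switch (action) {
    case SILENT:
      return; // already warned once; stay quiet
    case COMPLAIN:
      std::cerr << "Warning: " << word << " treated as whitespace.\n";
      action = SILENT; // downgrade so later occurrences are silent
      return;
    case THROW_UP:
      throw std::runtime_error("Special word " + word + " is not allowed.");
  }
}

int main() {
  WarningAction action = COMPLAIN;
  HandleDisallowed("<s>", action); // prints once
  HandleDisallowed("<s>", action); // silent thereafter
}

Passing the action by reference is what makes the downgrade stick across calls; the same enum value then selects throwing behavior when --skip_symbols is not given.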
@@ -1,6 +1,7 @@
-#ifndef LM_BUILDER_CORPUS_COUNT__
-#define LM_BUILDER_CORPUS_COUNT__
+#ifndef LM_BUILDER_CORPUS_COUNT_H
+#define LM_BUILDER_CORPUS_COUNT_H

+#include "lm/lm_exception.hh"
 #include "lm/word_index.hh"
 #include "util/scoped.hh"

@@ -28,7 +29,7 @@ class CorpusCount {

   // token_count: out.
   // type_count aka vocabulary size. Initialize to an estimate. It is set to the exact value.
-  CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::size_t entries_per_block);
+  CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::size_t entries_per_block, WarningAction disallowed_symbol);

   void Run(const util::stream::ChainPosition &position);

@@ -40,8 +41,10 @@ class CorpusCount {

   std::size_t dedupe_mem_size_;
   util::scoped_malloc dedupe_mem_;

+  WarningAction disallowed_symbol_action_;
 };

 } // namespace builder
 } // namespace lm
-#endif // LM_BUILDER_CORPUS_COUNT__
+#endif // LM_BUILDER_CORPUS_COUNT_H
@@ -45,7 +45,7 @@ BOOST_AUTO_TEST_CASE(Short) {
   NGramStream stream;
   uint64_t token_count;
   WordIndex type_count = 10;
-  CorpusCount counter(input_piece, vocab.get(), token_count, type_count, chain.BlockSize() / chain.EntrySize());
+  CorpusCount counter(input_piece, vocab.get(), token_count, type_count, chain.BlockSize() / chain.EntrySize(), SILENT);
   chain >> boost::ref(counter) >> stream >> util::stream::kRecycle;

   const char *v[] = {"<unk>", "<s>", "</s>", "looking", "on", "a", "little", "more", "loin", "foo", "bar"};
@@ -1,5 +1,5 @@
-#ifndef BUILDER_DISCOUNT__
-#define BUILDER_DISCOUNT__
+#ifndef LM_BUILDER_DISCOUNT_H
+#define LM_BUILDER_DISCOUNT_H

 #include <algorithm>

@@ -23,4 +23,4 @@ struct Discount {
 } // namespace builder
 } // namespace lm

-#endif // BUILDER_DISCOUNT__
+#endif // LM_BUILDER_DISCOUNT_H
lm/builder/hash_gamma.hh (new file)
@@ -0,0 +1,19 @@
+#ifndef LM_BUILDER_HASH_GAMMA__
+#define LM_BUILDER_HASH_GAMMA__
+
+#include <stdint.h>
+
+namespace lm { namespace builder {
+
+#pragma pack(push)
+#pragma pack(4)
+
+struct HashGamma {
+  uint64_t hash_value;
+  float gamma;
+};
+
+#pragma pack(pop)
+
+}} // namespaces
+#endif // LM_BUILDER_HASH_GAMMA__
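The #pragma pack(4) around HashGamma is not cosmetic: without it, the uint64_t member forces 8-byte alignment and the struct typically pads out to 16 bytes; packed to 4-byte boundaries it is 12 bytes, a 25% saving per entry in the gamma stream. A small sketch (not from the patch; sizes are what typical x86-64 compilers produce):

#include <cstdint>
#include <iostream>

struct Unpacked {
  uint64_t hash_value;
  float gamma; // 4 bytes of payload plus 4 bytes of tail padding
};

#pragma pack(push)
#pragma pack(4)
struct Packed {
  uint64_t hash_value;
  float gamma; // no tail padding at 4-byte alignment
};
#pragma pack(pop)

int main() {
  std::cout << sizeof(Unpacked) << '\n'; // typically 16
  std::cout << sizeof(Packed) << '\n';   // typically 12
}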
@@ -1,5 +1,5 @@
-#ifndef LM_BUILDER_HEADER_INFO__
-#define LM_BUILDER_HEADER_INFO__
+#ifndef LM_BUILDER_HEADER_INFO_H
+#define LM_BUILDER_HEADER_INFO_H

 #include <string>
 #include <stdint.h>
@@ -3,6 +3,8 @@
 #include "lm/builder/discount.hh"
 #include "lm/builder/ngram_stream.hh"
 #include "lm/builder/sort.hh"
+#include "lm/builder/hash_gamma.hh"
+#include "util/murmur_hash.hh"
 #include "util/file.hh"
 #include "util/stream/chain.hh"
 #include "util/stream/io.hh"
@@ -14,55 +16,179 @@ namespace lm { namespace builder {

 namespace {
 struct BufferEntry {
   // Gamma from page 20 of Chen and Goodman.
   float gamma;
   // \sum_w a(c w) for all w.
   float denominator;
 };

-// Extract an array of gamma from an array of BufferEntry.
+struct HashBufferEntry : public BufferEntry {
+  // Hash value of ngram. Used to join contexts with backoffs.
+  uint64_t hash_value;
+};
+
+// Reads all entries in order like NGramStream does.
+// But deletes any entries that have CutoffCount below or equal to the pruning
+// threshold.
+class PruneNGramStream {
+  public:
+    PruneNGramStream(const util::stream::ChainPosition &position) :
+      current_(NULL, NGram::OrderFromSize(position.GetChain().EntrySize())),
+      dest_(NULL, NGram::OrderFromSize(position.GetChain().EntrySize())),
+      currentCount_(0),
+      block_(position)
+    {
+      StartBlock();
+    }
+
+    NGram &operator*() { return current_; }
+    NGram *operator->() { return &current_; }
+
+    operator bool() const {
+      return block_;
+    }
+
+    PruneNGramStream &operator++() {
+      assert(block_);
+
+      if (current_.Order() > 1) {
+        if(currentCount_ > 0) {
+          if(dest_.Base() < current_.Base()) {
+            memcpy(dest_.Base(), current_.Base(), current_.TotalSize());
+          }
+          dest_.NextInMemory();
+        }
+      } else {
+        dest_.NextInMemory();
+      }
+
+      current_.NextInMemory();
+
+      uint8_t *block_base = static_cast<uint8_t*>(block_->Get());
+      if (current_.Base() == block_base + block_->ValidSize()) {
+        block_->SetValidSize(dest_.Base() - block_base);
+        ++block_;
+        StartBlock();
+      }
+
+      currentCount_ = current_.CutoffCount();
+
+      return *this;
+    }
+
+  private:
+    void StartBlock() {
+      for (; ; ++block_) {
+        if (!block_) return;
+        if (block_->ValidSize()) break;
+      }
+      current_.ReBase(block_->Get());
+      currentCount_ = current_.CutoffCount();
+
+      dest_.ReBase(block_->Get());
+    }
+
+    NGram current_; // input iterator
+    NGram dest_;    // output iterator
+
+    uint64_t currentCount_;
+
+    util::stream::Link block_;
+};
+
+// Extract an array of HashedGamma from an array of BufferEntry.
 class OnlyGamma {
   public:
+    OnlyGamma(bool pruning) : pruning_(pruning) {}
+
     void Run(const util::stream::ChainPosition &position) {
       for (util::stream::Link block_it(position); block_it; ++block_it) {
-        float *out = static_cast<float*>(block_it->Get());
-        const float *in = out;
-        const float *end = static_cast<const float*>(block_it->ValidEnd());
-        for (out += 1, in += 2; in < end; out += 1, in += 2) {
-          *out = *in;
+        if(pruning_) {
+          const HashBufferEntry *in = static_cast<const HashBufferEntry*>(block_it->Get());
+          const HashBufferEntry *end = static_cast<const HashBufferEntry*>(block_it->ValidEnd());
+
+          // Just make it point to the beginning of the stream so it can be overwritten
+          // with HashGamma values. Do not attempt to interpret the values until set below.
+          HashGamma *out = static_cast<HashGamma*>(block_it->Get());
+          for (; in < end; out += 1, in += 1) {
+            // Buffering, otherwise we might overwrite values too early.
+            float gamma_buf = in->gamma;
+            uint64_t hash_buf = in->hash_value;
+
+            out->gamma = gamma_buf;
+            out->hash_value = hash_buf;
+          }
+          block_it->SetValidSize((block_it->ValidSize() * sizeof(HashGamma)) / sizeof(HashBufferEntry));
+        }
+        else {
+          float *out = static_cast<float*>(block_it->Get());
+          const float *in = out;
+          const float *end = static_cast<const float*>(block_it->ValidEnd());
+          for (out += 1, in += 2; in < end; out += 1, in += 2) {
+            *out = *in;
+          }
+          block_it->SetValidSize(block_it->ValidSize() / 2);
         }
-        block_it->SetValidSize(block_it->ValidSize() / 2);
       }
     }
+
+  private:
+    bool pruning_;
 };

 class AddRight {
   public:
-    AddRight(const Discount &discount, const util::stream::ChainPosition &input)
-      : discount_(discount), input_(input) {}
+    AddRight(const Discount &discount, const util::stream::ChainPosition &input, bool pruning)
+      : discount_(discount), input_(input), pruning_(pruning) {}

     void Run(const util::stream::ChainPosition &output) {
       NGramStream in(input_);
       util::stream::Stream out(output);

       std::vector<WordIndex> previous(in->Order() - 1);
+      // Silly Windows requires this workaround to just get an invalid pointer when empty.
+      void *const previous_raw = previous.empty() ? NULL : static_cast<void*>(&previous[0]);
       const std::size_t size = sizeof(WordIndex) * previous.size();
+
       for(; in; ++out) {
-        memcpy(&previous[0], in->begin(), size);
+        memcpy(previous_raw, in->begin(), size);
         uint64_t denominator = 0;
+        uint64_t normalizer = 0;
+
         uint64_t counts[4];
         memset(counts, 0, sizeof(counts));
         do {
-          denominator += in->Count();
-          ++counts[std::min(in->Count(), static_cast<uint64_t>(3))];
-        } while (++in && !memcmp(&previous[0], in->begin(), size));
+          denominator += in->UnmarkedCount();
+
+          // Collect unused probability mass from pruning.
+          // Becomes 0 for unpruned ngrams.
+          normalizer += in->UnmarkedCount() - in->CutoffCount();
+
+          // Chen&Goodman do not mention counting based on cutoffs, but
+          // backoff becomes larger than 1 otherwise, so probably needs
+          // to count cutoffs. Counts normally without pruning.
+          if(in->CutoffCount() > 0)
+            ++counts[std::min(in->CutoffCount(), static_cast<uint64_t>(3))];
+
+        } while (++in && !memcmp(previous_raw, in->begin(), size));
+
         BufferEntry &entry = *reinterpret_cast<BufferEntry*>(out.Get());
         entry.denominator = static_cast<float>(denominator);
         entry.gamma = 0.0;
         for (unsigned i = 1; i <= 3; ++i) {
           entry.gamma += discount_.Get(i) * static_cast<float>(counts[i]);
         }
+
+        // Makes model sum to 1 with pruning (I hope).
+        entry.gamma += normalizer;
+
         entry.gamma /= entry.denominator;
+
+        if(pruning_) {
+          // If pruning is enabled the stream actually contains HashBufferEntry, see InitialProbabilities(...),
+          // so add a hash value that identifies the current ngram.
+          static_cast<HashBufferEntry*>(&entry)->hash_value = util::MurmurHashNative(previous_raw, size);
+        }
       }
       out.Poison();
     }
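The pruning change to AddRight folds the probability mass of pruned n-grams back into gamma, on top of the usual discount-weighted counts-of-counts. A standalone sketch of that arithmetic with made-up numbers (the discounts and counts below are assumptions for illustration, not values from the patch):

#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
  const double discount[4] = {0.0, 0.5, 1.0, 1.5}; // D1..D3, illustrative only
  // Extensions of one context: raw counts and post-cutoff counts.
  const uint64_t raw[]    = {4, 2, 1, 1};
  const uint64_t cutoff[] = {4, 2, 0, 0}; // two singletons pruned away

  uint64_t denominator = 0, pruned_mass = 0, n[4] = {0, 0, 0, 0};
  for (int i = 0; i < 4; ++i) {
    denominator += raw[i];                 // adjusted counts, pruned or not
    pruned_mass += raw[i] - cutoff[i];     // 0 for unpruned n-grams
    if (cutoff[i] > 0) ++n[std::min<uint64_t>(cutoff[i], 3)];
  }

  // gamma = (sum_i D_i * n_i + pruned_mass) / denominator
  double gamma = 0.0;
  for (unsigned i = 1; i <= 3; ++i) gamma += discount[i] * n[i];
  gamma += pruned_mass;
  gamma /= denominator;
  std::cout << "gamma = " << gamma << '\n'; // (1.5 + 1.0 + 2) / 8 = 0.5625
}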
@@ -70,6 +196,7 @@ class AddRight {
   private:
     const Discount &discount_;
     const util::stream::ChainPosition input_;
+    bool pruning_;
 };

 class MergeRight {
@@ -82,7 +209,7 @@ class MergeRight {
     void Run(const util::stream::ChainPosition &primary) {
       util::stream::Stream summed(from_adder_);

-      NGramStream grams(primary);
+      PruneNGramStream grams(primary);

       // Without interpolation, the interpolation weight goes to <unk>.
       if (grams->Order() == 1 && !interpolate_unigrams_) {
@@ -97,15 +224,16 @@ class MergeRight {
         ++summed;
         return;
       }

       std::vector<WordIndex> previous(grams->Order() - 1);
       const std::size_t size = sizeof(WordIndex) * previous.size();
       for (; grams; ++summed) {
         memcpy(&previous[0], grams->begin(), size);
         const BufferEntry &sums = *static_cast<const BufferEntry*>(summed.Get());
+
         do {
           Payload &pay = grams->Value();
-          pay.uninterp.prob = discount_.Apply(pay.count) / sums.denominator;
+          pay.uninterp.prob = discount_.Apply(grams->UnmarkedCount()) / sums.denominator;
           pay.uninterp.gamma = sums.gamma;
         } while (++grams && !memcmp(&previous[0], grams->begin(), size));
       }
@@ -119,17 +247,29 @@ class MergeRight {

 } // namespace

-void InitialProbabilities(const InitialProbabilitiesConfig &config, const std::vector<Discount> &discounts, Chains &primary, Chains &second_in, Chains &gamma_out) {
-  util::stream::ChainConfig gamma_config = config.adder_out;
-  gamma_config.entry_size = sizeof(BufferEntry);
+void InitialProbabilities(
+    const InitialProbabilitiesConfig &config,
+    const std::vector<Discount> &discounts,
+    util::stream::Chains &primary,
+    util::stream::Chains &second_in,
+    util::stream::Chains &gamma_out,
+    const std::vector<uint64_t> &prune_thresholds) {
   for (size_t i = 0; i < primary.size(); ++i) {
+    util::stream::ChainConfig gamma_config = config.adder_out;
+    if(prune_thresholds[i] > 0)
+      gamma_config.entry_size = sizeof(HashBufferEntry);
+    else
+      gamma_config.entry_size = sizeof(BufferEntry);
+
     util::stream::ChainPosition second(second_in[i].Add());
     second_in[i] >> util::stream::kRecycle;
     gamma_out.push_back(gamma_config);
-    gamma_out[i] >> AddRight(discounts[i], second);
+    gamma_out[i] >> AddRight(discounts[i], second, prune_thresholds[i] > 0);
+
     primary[i] >> MergeRight(config.interpolate_unigrams, gamma_out[i].Add(), discounts[i]);
+
     // Don't bother with the OnlyGamma thread for something to discard.
-    if (i) gamma_out[i] >> OnlyGamma();
+    if (i) gamma_out[i] >> OnlyGamma(prune_thresholds[i] > 0);
   }
 }
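PruneNGramStream, which MergeRight now iterates, deletes pruned records in place: a read cursor walks every record in the block while a write cursor keeps only survivors, then the block's valid size shrinks to what was written. A standalone sketch of that two-pointer compaction (the function and types are illustrative, not the library's API):

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <iostream>

// Keep records with count > cutoff; returns the new number of records.
std::size_t CompactBlock(uint64_t *records, std::size_t count, uint64_t cutoff) {
  uint64_t *dest = records; // write cursor never passes the read cursor
  for (uint64_t *cur = records; cur != records + count; ++cur) {
    if (*cur > cutoff) {
      if (dest < cur) std::memmove(dest, cur, sizeof(*cur));
      ++dest;
    }
  }
  return dest - records; // caller shrinks the block's valid size to this
}

int main() {
  uint64_t counts[] = {5, 1, 3, 1, 7};
  std::size_t kept = CompactBlock(counts, 5, 1); // drop singletons
  for (std::size_t i = 0; i < kept; ++i) std::cout << counts[i] << ' ';
  std::cout << '\n'; // 5 3 7
}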
@@ -1,14 +1,15 @@
-#ifndef LM_BUILDER_INITIAL_PROBABILITIES__
-#define LM_BUILDER_INITIAL_PROBABILITIES__
+#ifndef LM_BUILDER_INITIAL_PROBABILITIES_H
+#define LM_BUILDER_INITIAL_PROBABILITIES_H

 #include "lm/builder/discount.hh"
 #include "util/stream/config.hh"

 #include <vector>

+namespace util { namespace stream { class Chains; } }
+
 namespace lm {
 namespace builder {
-class Chains;

 struct InitialProbabilitiesConfig {
   // These should be small buffers to keep the adder from getting too far ahead
@@ -26,9 +27,15 @@ struct InitialProbabilitiesConfig {
  * The values are bare floats and should be buffered for interpolation to
  * use.
  */
-void InitialProbabilities(const InitialProbabilitiesConfig &config, const std::vector<Discount> &discounts, Chains &primary, Chains &second_in, Chains &gamma_out);
+void InitialProbabilities(
+    const InitialProbabilitiesConfig &config,
+    const std::vector<Discount> &discounts,
+    util::stream::Chains &primary,
+    util::stream::Chains &second_in,
+    util::stream::Chains &gamma_out,
+    const std::vector<uint64_t> &prune_thresholds);

 } // namespace builder
 } // namespace lm

-#endif // LM_BUILDER_INITIAL_PROBABILITIES__
+#endif // LM_BUILDER_INITIAL_PROBABILITIES_H
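The header change above swaps a local forward declaration for one inside the owning namespace, because a forward declaration names a type in the namespace where it appears. A minimal sketch of the distinction (the local declaration is shown commented out as the pitfall):

namespace util { namespace stream { class Chains; } } // declares util::stream::Chains

namespace lm { namespace builder {
// class Chains; // would declare a *different* type, lm::builder::Chains

// OK with the nested-namespace forward declaration above; an incomplete
// type is fine for a reference parameter in a declaration.
void InitialProbabilities(util::stream::Chains &primary);
}} // namespaces

int main() {}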
@@ -1,9 +1,12 @@
 #include "lm/builder/interpolate.hh"

+#include "lm/builder/hash_gamma.hh"
 #include "lm/builder/joint_order.hh"
-#include "lm/builder/multi_stream.hh"
+#include "lm/builder/ngram_stream.hh"
 #include "lm/builder/sort.hh"
 #include "lm/lm_exception.hh"
+#include "util/fixed_array.hh"
+#include "util/murmur_hash.hh"

 #include <assert.h>

@@ -12,7 +15,8 @@ namespace {

 class Callback {
   public:
-    Callback(float uniform_prob, const ChainPositions &backoffs) : backoffs_(backoffs.size()), probs_(backoffs.size() + 2) {
+    Callback(float uniform_prob, const util::stream::ChainPositions &backoffs, const std::vector<uint64_t> &prune_thresholds)
+      : backoffs_(backoffs.size()), probs_(backoffs.size() + 2), prune_thresholds_(prune_thresholds) {
       probs_[0] = uniform_prob;
       for (std::size_t i = 0; i < backoffs.size(); ++i) {
         backoffs_.push_back(backoffs[i]);
@@ -33,12 +37,37 @@ class Callback {
       pay.complete.prob = pay.uninterp.prob + pay.uninterp.gamma * probs_[order_minus_1];
       probs_[order_minus_1 + 1] = pay.complete.prob;
       pay.complete.prob = log10(pay.complete.prob);
-      // TODO: this is a hack to skip n-grams that don't appear as context. Pruning will require some different handling.
+
       if (order_minus_1 < backoffs_.size() && *(gram.end() - 1) != kUNK && *(gram.end() - 1) != kEOS) {
-        pay.complete.backoff = log10(*static_cast<const float*>(backoffs_[order_minus_1].Get()));
-        ++backoffs_[order_minus_1];
+        // This skips over ngrams if backoffs have been exhausted.
+        if(!backoffs_[order_minus_1]) {
+          pay.complete.backoff = 0.0;
+          return;
+        }
+
+        if(prune_thresholds_[order_minus_1 + 1] > 0) {
+          // Compute hash value for current context.
+          uint64_t current_hash = util::MurmurHashNative(gram.begin(), gram.Order() * sizeof(WordIndex));
+
+          const HashGamma *hashed_backoff = static_cast<const HashGamma*>(backoffs_[order_minus_1].Get());
+          while(backoffs_[order_minus_1] && current_hash != hashed_backoff->hash_value) {
+            hashed_backoff = static_cast<const HashGamma*>(backoffs_[order_minus_1].Get());
+            ++backoffs_[order_minus_1];
+          }
+
+          if(current_hash == hashed_backoff->hash_value) {
+            pay.complete.backoff = log10(hashed_backoff->gamma);
+            ++backoffs_[order_minus_1];
+          } else {
+            // Has been pruned away so it is not a context anymore.
+            pay.complete.backoff = 0.0;
+          }
+        } else {
+          pay.complete.backoff = log10(*static_cast<const float*>(backoffs_[order_minus_1].Get()));
+          ++backoffs_[order_minus_1];
+        }
       } else {
         // Not a context.
         pay.complete.backoff = 0.0;
       }
     }
@@ -46,19 +75,22 @@ class Callback {
     void Exit(unsigned, const NGram &) const {}

   private:
-    FixedArray<util::stream::Stream> backoffs_;
+    util::FixedArray<util::stream::Stream> backoffs_;

     std::vector<float> probs_;
+    const std::vector<uint64_t>& prune_thresholds_;
 };
 } // namespace

-Interpolate::Interpolate(uint64_t unigram_count, const ChainPositions &backoffs)
-  : uniform_prob_(1.0 / static_cast<float>(unigram_count - 1)), backoffs_(backoffs) {}
+Interpolate::Interpolate(uint64_t vocab_size, const util::stream::ChainPositions &backoffs, const std::vector<uint64_t>& prune_thresholds)
+  : uniform_prob_(1.0 / static_cast<float>(vocab_size)), // Includes <unk> but excludes <s>.
+    backoffs_(backoffs),
+    prune_thresholds_(prune_thresholds) {}

 // perform order-wise interpolation
-void Interpolate::Run(const ChainPositions &positions) {
+void Interpolate::Run(const util::stream::ChainPositions &positions) {
   assert(positions.size() == backoffs_.size() + 1);
-  Callback callback(uniform_prob_, backoffs_);
+  Callback callback(uniform_prob_, backoffs_, prune_thresholds_);
   JointOrder<Callback, SuffixOrder>(positions, callback);
 }
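With pruning enabled, the backoff stream no longer lines up one-to-one with the n-gram stream, so the Callback above joins the two by hashing the context and skipping backoff entries until the hashes agree; a miss means the context was pruned and gets backoff 0. A simplified stand-in for that join over a plain vector (types and names here are assumptions, and this only works because both sequences arrive in the same order):

#include <cstdint>
#include <iostream>
#include <vector>

struct HashGamma { uint64_t hash_value; float gamma; };

// Returns the gamma for context_hash, or 0 if the context was pruned away.
// pos persists across calls, mirroring the advancing backoff stream.
float FindBackoff(const std::vector<HashGamma> &backoffs, std::size_t &pos,
                  uint64_t context_hash) {
  while (pos < backoffs.size() && backoffs[pos].hash_value != context_hash) ++pos;
  if (pos < backoffs.size()) return backoffs[pos++].gamma;
  return 0.0f; // pruned: not a context anymore
}

int main() {
  std::vector<HashGamma> backoffs = {{11, 0.5f}, {42, 0.25f}};
  std::size_t pos = 0;
  std::cout << FindBackoff(backoffs, pos, 11) << '\n'; // 0.5
  std::cout << FindBackoff(backoffs, pos, 99) << '\n'; // 0 (pruned)
}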
@@ -1,10 +1,12 @@
-#ifndef LM_BUILDER_INTERPOLATE__
-#define LM_BUILDER_INTERPOLATE__
+#ifndef LM_BUILDER_INTERPOLATE_H
+#define LM_BUILDER_INTERPOLATE_H

+#include "util/stream/multi_stream.hh"
+
+#include <vector>
+
 #include <stdint.h>

-#include "lm/builder/multi_stream.hh"
-
 namespace lm { namespace builder {

 /* Interpolate step.
@@ -14,14 +16,17 @@ namespace lm { namespace builder {
 */
 class Interpolate {
   public:
-    explicit Interpolate(uint64_t unigram_count, const ChainPositions &backoffs);
+    // Normally vocab_size is the unigram count-1 (since p(<s>) = 0) but might
+    // be larger when the user specifies a consistent vocabulary size.
+    explicit Interpolate(uint64_t vocab_size, const util::stream::ChainPositions &backoffs, const std::vector<uint64_t> &prune_thresholds);

-    void Run(const ChainPositions &positions);
+    void Run(const util::stream::ChainPositions &positions);

   private:
     float uniform_prob_;
-    ChainPositions backoffs_;
+    util::stream::ChainPositions backoffs_;
+    const std::vector<uint64_t> prune_thresholds_;
 };

 }} // namespaces
-#endif // LM_BUILDER_INTERPOLATE__
+#endif // LM_BUILDER_INTERPOLATE_H
@@ -1,14 +1,14 @@
-#ifndef LM_BUILDER_JOINT_ORDER__
-#define LM_BUILDER_JOINT_ORDER__
+#ifndef LM_BUILDER_JOINT_ORDER_H
+#define LM_BUILDER_JOINT_ORDER_H

-#include "lm/builder/multi_stream.hh"
+#include "lm/builder/ngram_stream.hh"
 #include "lm/lm_exception.hh"

 #include <string.h>

 namespace lm { namespace builder {

-template <class Callback, class Compare> void JointOrder(const ChainPositions &positions, Callback &callback) {
+template <class Callback, class Compare> void JointOrder(const util::stream::ChainPositions &positions, Callback &callback) {
   // Allow matching to reference streams[-1].
   NGramStreams streams_with_dummy;
   streams_with_dummy.InitWithDummy(positions);
@@ -40,4 +40,4 @@ template <class Callback, class Compare> void JointOrder(const ChainPositions &p

 }} // namespaces

-#endif // LM_BUILDER_JOINT_ORDER__
+#endif // LM_BUILDER_JOINT_ORDER_H
@@ -1,4 +1,5 @@
 #include "lm/builder/pipeline.hh"
+#include "lm/lm_exception.hh"
 #include "util/file.hh"
 #include "util/file_piece.hh"
 #include "util/usage.hh"
@@ -7,6 +8,7 @@
 #include <boost/program_options.hpp>
 #include <boost/version.hpp>
+#include <vector>

 namespace {
 class SizeNotify {
@@ -25,6 +27,46 @@ boost::program_options::typed_value<std::string> *SizeOption(std::size_t &to, co
   return boost::program_options::value<std::string>()->notifier(SizeNotify(to))->default_value(default_value);
 }

+// Parse and validate pruning thresholds, then return a vector of threshold
+// counts, one for each n-gram order.
+std::vector<uint64_t> ParsePruning(const std::vector<std::string> &param, std::size_t order) {
+  // Convert to a vector of integers.
+  std::vector<uint64_t> prune_thresholds;
+  prune_thresholds.reserve(order);
+  std::cerr << "Pruning ";
+  for (std::vector<std::string>::const_iterator it(param.begin()); it != param.end(); ++it) {
+    try {
+      prune_thresholds.push_back(boost::lexical_cast<uint64_t>(*it));
+    } catch(const boost::bad_lexical_cast &) {
+      UTIL_THROW(util::Exception, "Bad pruning threshold " << *it);
+    }
+  }
+
+  // Fill with zeros by default.
+  if (prune_thresholds.empty()) {
+    prune_thresholds.resize(order, 0);
+    return prune_thresholds;
+  }
+
+  // Validate thresholds if specified: throw if more thresholds were given
+  // than the model has orders.
+  UTIL_THROW_IF(prune_thresholds.size() > order, util::Exception, "You specified pruning thresholds for orders 1 through " << prune_thresholds.size() << " but the model only has order " << order);
+  // The unigram threshold can only be 0 (no pruning).
+  UTIL_THROW_IF(prune_thresholds[0] != 0, util::Exception, "Unigram pruning is not implemented, so the first pruning threshold must be 0.");
+
+  // Check that thresholds are in non-decreasing order.
+  uint64_t lower_threshold = 0;
+  for (std::vector<uint64_t>::iterator it = prune_thresholds.begin(); it != prune_thresholds.end(); ++it) {
+    UTIL_THROW_IF(lower_threshold > *it, util::Exception, "Pruning thresholds should be in non-decreasing order. Otherwise substrings would be removed, which is bad for query-time data structures.");
+    lower_threshold = *it;
+  }
+
+  // Pad to all orders using the last value.
+  prune_thresholds.resize(order, prune_thresholds.back());
+  return prune_thresholds;
+}
+
 } // namespace

 int main(int argc, char *argv[]) {
@@ -34,25 +76,30 @@ int main(int argc, char *argv[]) {
   lm::builder::PipelineConfig pipeline;

   std::string text, arpa;
+  std::vector<std::string> pruning;

   options.add_options()
-    ("help", po::bool_switch(), "Show this help message")
+    ("help,h", po::bool_switch(), "Show this help message")
     ("order,o", po::value<std::size_t>(&pipeline.order)
 #if BOOST_VERSION >= 104200
       ->required()
 #endif
       , "Order of the model")
     ("interpolate_unigrams", po::bool_switch(&pipeline.initial_probs.interpolate_unigrams), "Interpolate the unigrams (default: emulate SRILM by not interpolating)")
+    ("skip_symbols", po::bool_switch(), "Treat <s>, </s>, and <unk> as whitespace instead of throwing an exception")
     ("temp_prefix,T", po::value<std::string>(&pipeline.sort.temp_prefix)->default_value("/tmp/lm"), "Temporary file prefix")
     ("memory,S", SizeOption(pipeline.sort.total_memory, util::GuessPhysicalMemory() ? "80%" : "1G"), "Sorting memory")
     ("minimum_block", SizeOption(pipeline.minimum_block, "8K"), "Minimum block size to allow")
     ("sort_block", SizeOption(pipeline.sort.buffer_size, "64M"), "Size of IO operations for sort (determines arity)")
-    ("vocab_estimate", po::value<lm::WordIndex>(&pipeline.vocab_estimate)->default_value(1000000), "Assume this vocabulary size for purposes of calculating memory in step 1 (corpus count) and pre-sizing the hash table")
     ("block_count", po::value<std::size_t>(&pipeline.block_count)->default_value(2), "Block count (per order)")
-    ("vocab_file", po::value<std::string>(&pipeline.vocab_file)->default_value(""), "Location to write vocabulary file")
+    ("vocab_estimate", po::value<lm::WordIndex>(&pipeline.vocab_estimate)->default_value(1000000), "Assume this vocabulary size for purposes of calculating memory in step 1 (corpus count) and pre-sizing the hash table")
+    ("vocab_file", po::value<std::string>(&pipeline.vocab_file)->default_value(""), "Location to write a file containing the unique vocabulary strings delimited by null bytes")
+    ("vocab_pad", po::value<uint64_t>(&pipeline.vocab_size_for_unk)->default_value(0), "If the vocabulary is smaller than this value, pad with <unk> to reach this size. Requires --interpolate_unigrams")
     ("verbose_header", po::bool_switch(&pipeline.verbose_header), "Add a verbose header to the ARPA file that includes information such as token count, smoothing type, etc.")
     ("text", po::value<std::string>(&text), "Read text from a file instead of stdin")
-    ("arpa", po::value<std::string>(&arpa), "Write ARPA to a file instead of stdout");
+    ("arpa", po::value<std::string>(&arpa), "Write ARPA to a file instead of stdout")
+    ("prune", po::value<std::vector<std::string> >(&pruning)->multitoken(), "Prune n-grams with count less than or equal to the given threshold. Specify one value for each order i.e. 0 0 1 to prune singleton trigrams and above. The sequence of values must be non-decreasing and the last value applies to any remaining orders. Unigram pruning is not implemented, so the first value must be zero. Default is to not prune, which is equivalent to --prune 0.");
   po::variables_map vm;
   po::store(po::parse_command_line(argc, argv, options), vm);

@@ -95,6 +142,20 @@ int main(int argc, char *argv[]) {
   }
 #endif

+  if (pipeline.vocab_size_for_unk && !pipeline.initial_probs.interpolate_unigrams) {
+    std::cerr << "--vocab_pad requires --interpolate_unigrams" << std::endl;
+    return 1;
+  }
+
+  if (vm["skip_symbols"].as<bool>()) {
+    pipeline.disallowed_symbol_action = lm::COMPLAIN;
+  } else {
+    pipeline.disallowed_symbol_action = lm::THROW_UP;
+  }
+
+  // Parse pruning thresholds. These depend on order, so it is not done as a notifier.
+  pipeline.prune_thresholds = ParsePruning(pruning, pipeline.order);
+
   util::NormalizeTempPrefix(pipeline.sort.temp_prefix);

   lm::builder::InitialProbabilitiesConfig &initial = pipeline.initial_probs;
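ParsePruning's padding rule means a short --prune list covers all orders: for example, "--prune 0 0 1" on a 5-gram model expands to thresholds 0 0 1 1 1. A hypothetical helper mirroring the documented rules (not the function above, just its contract):

#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <vector>

std::vector<uint64_t> ExpandThresholds(std::vector<uint64_t> t, std::size_t order) {
  if (t.empty()) return std::vector<uint64_t>(order, 0); // default: no pruning
  if (t.size() > order) throw std::runtime_error("more thresholds than orders");
  if (t[0] != 0) throw std::runtime_error("unigram pruning not implemented");
  for (std::size_t i = 1; i < t.size(); ++i)
    if (t[i] < t[i - 1]) throw std::runtime_error("thresholds must be non-decreasing");
  t.resize(order, t.back()); // pad remaining orders with the last value
  return t;
}

int main() {
  std::vector<uint64_t> in = {0, 0, 1};
  std::vector<uint64_t> out = ExpandThresholds(in, 5);
  for (std::size_t i = 0; i < out.size(); ++i) std::cout << out[i] << ' ';
  std::cout << '\n'; // 0 0 1 1 1
}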
@@ -1,180 +0,0 @@
-#ifndef LM_BUILDER_MULTI_STREAM__
-#define LM_BUILDER_MULTI_STREAM__
-
-#include "lm/builder/ngram_stream.hh"
-#include "util/scoped.hh"
-#include "util/stream/chain.hh"
-
-#include <cstddef>
-#include <new>
-
-#include <assert.h>
-#include <stdlib.h>
-
-namespace lm { namespace builder {
-
-template <class T> class FixedArray {
-  public:
-    explicit FixedArray(std::size_t count) {
-      Init(count);
-    }
-
-    FixedArray() : newed_end_(NULL) {}
-
-    void Init(std::size_t count) {
-      assert(!block_.get());
-      block_.reset(malloc(sizeof(T) * count));
-      if (!block_.get()) throw std::bad_alloc();
-      newed_end_ = begin();
-    }
-
-    FixedArray(const FixedArray &from) {
-      std::size_t size = from.newed_end_ - static_cast<const T*>(from.block_.get());
-      Init(size);
-      for (std::size_t i = 0; i < size; ++i) {
-        new(end()) T(from[i]);
-        Constructed();
-      }
-    }
-
-    ~FixedArray() { clear(); }
-
-    T *begin() { return static_cast<T*>(block_.get()); }
-    const T *begin() const { return static_cast<const T*>(block_.get()); }
-    // Always call Constructed after successful completion of new.
-    T *end() { return newed_end_; }
-    const T *end() const { return newed_end_; }
-
-    T &back() { return *(end() - 1); }
-    const T &back() const { return *(end() - 1); }
-
-    std::size_t size() const { return end() - begin(); }
-    bool empty() const { return begin() == end(); }
-
-    T &operator[](std::size_t i) { return begin()[i]; }
-    const T &operator[](std::size_t i) const { return begin()[i]; }
-
-    template <class C> void push_back(const C &c) {
-      new (end()) T(c);
-      Constructed();
-    }
-
-    void clear() {
-      for (T *i = begin(); i != end(); ++i)
-        i->~T();
-      newed_end_ = begin();
-    }
-
-  protected:
-    void Constructed() {
-      ++newed_end_;
-    }
-
-  private:
-    util::scoped_malloc block_;
-
-    T *newed_end_;
-};
-
-class Chains;
-
-class ChainPositions : public FixedArray<util::stream::ChainPosition> {
-  public:
-    ChainPositions() {}
-
-    void Init(Chains &chains);
-
-    explicit ChainPositions(Chains &chains) {
-      Init(chains);
-    }
-};
-
-class Chains : public FixedArray<util::stream::Chain> {
-  private:
-    template <class T, void (T::*ptr)(const ChainPositions &) = &T::Run> struct CheckForRun {
-      typedef Chains type;
-    };
-
-  public:
-    explicit Chains(std::size_t limit) : FixedArray<util::stream::Chain>(limit) {}
-
-    template <class Worker> typename CheckForRun<Worker>::type &operator>>(const Worker &worker) {
-      threads_.push_back(new util::stream::Thread(ChainPositions(*this), worker));
-      return *this;
-    }
-
-    template <class Worker> typename CheckForRun<Worker>::type &operator>>(const boost::reference_wrapper<Worker> &worker) {
-      threads_.push_back(new util::stream::Thread(ChainPositions(*this), worker));
-      return *this;
-    }
-
-    Chains &operator>>(const util::stream::Recycler &recycler) {
-      for (util::stream::Chain *i = begin(); i != end(); ++i)
-        *i >> recycler;
-      return *this;
-    }
-
-    void Wait(bool release_memory = true) {
-      threads_.clear();
-      for (util::stream::Chain *i = begin(); i != end(); ++i) {
-        i->Wait(release_memory);
-      }
-    }
-
-  private:
-    boost::ptr_vector<util::stream::Thread> threads_;
-
-    Chains(const Chains &);
-    void operator=(const Chains &);
-};
-
-inline void ChainPositions::Init(Chains &chains) {
-  FixedArray<util::stream::ChainPosition>::Init(chains.size());
-  for (util::stream::Chain *i = chains.begin(); i != chains.end(); ++i) {
-    new (end()) util::stream::ChainPosition(i->Add()); Constructed();
-  }
-}
-
-inline Chains &operator>>(Chains &chains, ChainPositions &positions) {
-  positions.Init(chains);
-  return chains;
-}
-
-class NGramStreams : public FixedArray<NGramStream> {
-  public:
-    NGramStreams() {}
-
-    // This puts a dummy NGramStream at the beginning (useful to algorithms that need to reference something at the beginning).
-    void InitWithDummy(const ChainPositions &positions) {
-      FixedArray<NGramStream>::Init(positions.size() + 1);
-      new (end()) NGramStream(); Constructed();
-      for (const util::stream::ChainPosition *i = positions.begin(); i != positions.end(); ++i) {
-        push_back(*i);
-      }
-    }
-
-    // Limit restricts to positions[0,limit)
-    void Init(const ChainPositions &positions, std::size_t limit) {
-      FixedArray<NGramStream>::Init(limit);
-      for (const util::stream::ChainPosition *i = positions.begin(); i != positions.begin() + limit; ++i) {
-        push_back(*i);
-      }
-    }
-    void Init(const ChainPositions &positions) {
-      Init(positions, positions.size());
-    }
-
-    NGramStreams(const ChainPositions &positions) {
-      Init(positions);
-    }
-};
-
-inline Chains &operator>>(Chains &chains, NGramStreams &streams) {
-  ChainPositions positions;
-  chains >> positions;
-  streams.Init(positions);
-  return chains;
-}
-
-}} // namespaces
-#endif // LM_BUILDER_MULTI_STREAM__
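The deleted FixedArray (the diff shows its users now reach for util::FixedArray via util/fixed_array.hh) is built on a classic idiom: allocate raw storage once, construct elements with placement new as they are pushed, and run destructors manually. A minimal sketch of that idiom, independent of the library:

#include <cstdlib>
#include <iostream>
#include <new>
#include <string>

template <class T> class SimpleFixedArray {
  public:
    explicit SimpleFixedArray(std::size_t limit)
      : block_(std::malloc(sizeof(T) * limit)), end_(begin()) {
      if (!block_) throw std::bad_alloc();
    }
    ~SimpleFixedArray() {
      for (T *i = begin(); i != end_; ++i) i->~T(); // manual destruction
      std::free(block_);
    }
    // No bounds check, as in the original: the caller promises count <= limit.
    template <class C> void push_back(const C &c) { new (end_) T(c); ++end_; }
    T *begin() { return static_cast<T*>(block_); }
    std::size_t size() { return end_ - begin(); }
  private:
    void *block_;
    T *end_; // one past the last constructed element
};

int main() {
  SimpleFixedArray<std::string> arr(2);
  arr.push_back("hello");
  arr.push_back("world");
  std::cout << arr.size() << '\n'; // 2
}

The payoff over std::vector is that elements are never copied or moved after construction, which matters for types like chains and threads that are not safely copyable.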
@@ -1,5 +1,5 @@
-#ifndef LM_BUILDER_NGRAM__
-#define LM_BUILDER_NGRAM__
+#ifndef LM_BUILDER_NGRAM_H
+#define LM_BUILDER_NGRAM_H

 #include "lm/weights.hh"
 #include "lm/word_index.hh"
@@ -26,7 +26,7 @@ union Payload {

 class NGram {
   public:
     NGram(void *begin, std::size_t order)
      : begin_(static_cast<WordIndex*>(begin)), end_(begin_ + order) {}

     const uint8_t *Base() const { return reinterpret_cast<const uint8_t*>(begin_); }
@@ -38,12 +38,12 @@ class NGram {
       end_ = begin_ + difference;
     }

     // Would do operator++ but that can get confusing for a stream.
     void NextInMemory() {
       ReBase(&Value() + 1);
     }

     // Lower-case in deference to STL.
     const WordIndex *begin() const { return begin_; }
     WordIndex *begin() { return begin_; }
     const WordIndex *end() const { return end_; }
@@ -61,7 +61,7 @@ class NGram {
       return order * sizeof(WordIndex) + sizeof(Payload);
     }
     std::size_t TotalSize() const {
       // Compiler should optimize this.
       return TotalSize(Order());
     }
     static std::size_t OrderFromSize(std::size_t size) {
@@ -69,6 +69,31 @@ class NGram {
       assert(size == TotalSize(ret));
       return ret;
     }
+
+    // Manipulate the MSB of the count to signal that an n-gram can be pruned.
+    /*mjd**********************************************************************/
+
+    bool IsMarked() const {
+      return Value().count >> (sizeof(Value().count) * 8 - 1);
+    }
+
+    void Mark() {
+      Value().count |= (1ul << (sizeof(Value().count) * 8 - 1));
+    }
+
+    void Unmark() {
+      Value().count &= ~(1ul << (sizeof(Value().count) * 8 - 1));
+    }
+
+    uint64_t UnmarkedCount() const {
+      return Value().count & ~(1ul << (sizeof(Value().count) * 8 - 1));
+    }
+
+    uint64_t CutoffCount() const {
+      return IsMarked() ? 0 : UnmarkedCount();
+    }
+
+    /*mjd**********************************************************************/

   private:
     WordIndex *begin_, *end_;
@@ -81,4 +106,4 @@ const WordIndex kEOS = 2;
 } // namespace builder
 } // namespace lm

-#endif // LM_BUILDER_NGRAM__
+#endif // LM_BUILDER_NGRAM_H
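The MSB trick above stores the "pruned" flag in the top bit of the 64-bit count, so no extra field (and no extra bytes per record) is needed. A standalone sketch of the same bit manipulation; note it uses 1ull rather than the patch's 1ul, since unsigned long is only guaranteed 32 bits on some platforms:

#include <cstdint>
#include <iostream>

const uint64_t kMarkBit = 1ull << 63;

bool IsMarked(uint64_t count) { return count >> 63; }
uint64_t Mark(uint64_t count) { return count | kMarkBit; }
uint64_t UnmarkedCount(uint64_t count) { return count & ~kMarkBit; }
// Pruned n-grams contribute nothing; unpruned ones keep their raw count.
uint64_t CutoffCount(uint64_t count) { return IsMarked(count) ? 0 : UnmarkedCount(count); }

int main() {
  uint64_t count = 42;
  count = Mark(count); // flag for pruning; the count payload is preserved
  std::cout << IsMarked(count) << ' ' << UnmarkedCount(count) << ' '
            << CutoffCount(count) << '\n'; // 1 42 0
}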
@@ -1,8 +1,9 @@
-#ifndef LM_BUILDER_NGRAM_STREAM__
-#define LM_BUILDER_NGRAM_STREAM__
+#ifndef LM_BUILDER_NGRAM_STREAM_H
+#define LM_BUILDER_NGRAM_STREAM_H

 #include "lm/builder/ngram.hh"
 #include "util/stream/chain.hh"
+#include "util/stream/multi_stream.hh"
 #include "util/stream/stream.hh"

 #include <cstddef>
@@ -51,5 +52,7 @@ inline util::stream::Chain &operator>>(util::stream::Chain &chain, NGramStream &
   return chain;
 }

+typedef util::stream::GenericStreams<NGramStream> NGramStreams;
+
 }} // namespaces
-#endif // LM_BUILDER_NGRAM_STREAM__
+#endif // LM_BUILDER_NGRAM_STREAM_H
@ -2,6 +2,7 @@
|
|||||||
|
|
||||||
#include "lm/builder/adjust_counts.hh"
|
#include "lm/builder/adjust_counts.hh"
|
||||||
#include "lm/builder/corpus_count.hh"
|
#include "lm/builder/corpus_count.hh"
|
||||||
|
#include "lm/builder/hash_gamma.hh"
|
||||||
#include "lm/builder/initial_probabilities.hh"
|
#include "lm/builder/initial_probabilities.hh"
|
||||||
#include "lm/builder/interpolate.hh"
|
#include "lm/builder/interpolate.hh"
|
||||||
#include "lm/builder/print.hh"
|
#include "lm/builder/print.hh"
|
||||||
@ -20,10 +21,13 @@
|
|||||||
namespace lm { namespace builder {
|
namespace lm { namespace builder {
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
void PrintStatistics(const std::vector<uint64_t> &counts, const std::vector<Discount> &discounts) {
|
void PrintStatistics(const std::vector<uint64_t> &counts, const std::vector<uint64_t> &counts_pruned, const std::vector<Discount> &discounts) {
|
||||||
std::cerr << "Statistics:\n";
|
std::cerr << "Statistics:\n";
|
||||||
for (size_t i = 0; i < counts.size(); ++i) {
|
for (size_t i = 0; i < counts.size(); ++i) {
|
||||||
std::cerr << (i + 1) << ' ' << counts[i];
|
std::cerr << (i + 1) << ' ' << counts_pruned[i];
|
||||||
|
if(counts[i] != counts_pruned[i])
|
||||||
|
std::cerr << "/" << counts[i];
|
||||||
|
|
||||||
for (size_t d = 1; d <= 3; ++d)
|
for (size_t d = 1; d <= 3; ++d)
|
||||||
std::cerr << " D" << d << (d == 3 ? "+=" : "=") << discounts[i].amount[d];
|
std::cerr << " D" << d << (d == 3 ? "+=" : "=") << discounts[i].amount[d];
|
||||||
std::cerr << '\n';
|
std::cerr << '\n';
|
||||||
@ -39,7 +43,7 @@ class Master {
|
|||||||
|
|
||||||
const PipelineConfig &Config() const { return config_; }
|
const PipelineConfig &Config() const { return config_; }
|
||||||
|
|
||||||
Chains &MutableChains() { return chains_; }
|
util::stream::Chains &MutableChains() { return chains_; }
|
||||||
|
|
||||||
template <class T> Master &operator>>(const T &worker) {
|
template <class T> Master &operator>>(const T &worker) {
|
||||||
chains_ >> worker;
|
chains_ >> worker;
|
||||||
@ -64,7 +68,7 @@ class Master {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// For initial probabilities, but this is generic.
|
// For initial probabilities, but this is generic.
|
||||||
void SortAndReadTwice(const std::vector<uint64_t> &counts, Sorts<ContextOrder> &sorts, Chains &second, util::stream::ChainConfig second_config) {
|
void SortAndReadTwice(const std::vector<uint64_t> &counts, Sorts<ContextOrder> &sorts, util::stream::Chains &second, util::stream::ChainConfig second_config) {
|
||||||
// Do merge first before allocating chain memory.
|
// Do merge first before allocating chain memory.
|
||||||
for (std::size_t i = 1; i < config_.order; ++i) {
|
for (std::size_t i = 1; i < config_.order; ++i) {
|
||||||
sorts[i - 1].Merge(0);
|
sorts[i - 1].Merge(0);
|
||||||
@@ -198,9 +202,9 @@ class Master {
 
 PipelineConfig config_;
 
-Chains chains_;
+util::stream::Chains chains_;
 // Often only unigrams, but sometimes all orders.
-FixedArray<util::stream::FileBuffer> files_;
+util::FixedArray<util::stream::FileBuffer> files_;
 };
 
 void CountText(int text_file /* input */, int vocab_file /* output */, Master &master, uint64_t &token_count, std::string &text_file_name) {
@@ -221,7 +225,7 @@ void CountText(int text_file /* input */, int vocab_file /* output */, Master &m
 WordIndex type_count = config.vocab_estimate;
 util::FilePiece text(text_file, NULL, &std::cerr);
 text_file_name = text.FileName();
-CorpusCount counter(text, vocab_file, token_count, type_count, chain.BlockSize() / chain.EntrySize());
+CorpusCount counter(text, vocab_file, token_count, type_count, chain.BlockSize() / chain.EntrySize(), config.disallowed_symbol_action);
 chain >> boost::ref(counter);
 
 util::stream::Sort<SuffixOrder, AddCombiner> sorter(chain, config.sort, SuffixOrder(config.order), AddCombiner());
@@ -231,21 +235,22 @@ void CountText(int text_file /* input */, int vocab_file /* output */, Master &m
 master.InitForAdjust(sorter, type_count);
 }
 
-void InitialProbabilities(const std::vector<uint64_t> &counts, const std::vector<Discount> &discounts, Master &master, Sorts<SuffixOrder> &primary, FixedArray<util::stream::FileBuffer> &gammas) {
+void InitialProbabilities(const std::vector<uint64_t> &counts, const std::vector<uint64_t> &counts_pruned, const std::vector<Discount> &discounts, Master &master, Sorts<SuffixOrder> &primary,
+util::FixedArray<util::stream::FileBuffer> &gammas, const std::vector<uint64_t> &prune_thresholds) {
 const PipelineConfig &config = master.Config();
-Chains second(config.order);
+util::stream::Chains second(config.order);
 
 {
 Sorts<ContextOrder> sorts;
 master.SetupSorts(sorts);
-PrintStatistics(counts, discounts);
-lm::ngram::ShowSizes(counts);
+PrintStatistics(counts, counts_pruned, discounts);
+lm::ngram::ShowSizes(counts_pruned);
 std::cerr << "=== 3/5 Calculating and sorting initial probabilities ===" << std::endl;
-master.SortAndReadTwice(counts, sorts, second, config.initial_probs.adder_in);
+master.SortAndReadTwice(counts_pruned, sorts, second, config.initial_probs.adder_in);
 }
 
-Chains gamma_chains(config.order);
-InitialProbabilities(config.initial_probs, discounts, master.MutableChains(), second, gamma_chains);
+util::stream::Chains gamma_chains(config.order);
+InitialProbabilities(config.initial_probs, discounts, master.MutableChains(), second, gamma_chains, prune_thresholds);
 // Don't care about gamma for 0.
 gamma_chains[0] >> util::stream::kRecycle;
 gammas.Init(config.order - 1);
@@ -257,19 +262,25 @@ void InitialProbabilities(const std::vector<uint64_t> &counts, const std::vector
 master.SetupSorts(primary);
 }
 
-void InterpolateProbabilities(const std::vector<uint64_t> &counts, Master &master, Sorts<SuffixOrder> &primary, FixedArray<util::stream::FileBuffer> &gammas) {
+void InterpolateProbabilities(const std::vector<uint64_t> &counts, Master &master, Sorts<SuffixOrder> &primary, util::FixedArray<util::stream::FileBuffer> &gammas) {
 std::cerr << "=== 4/5 Calculating and writing order-interpolated probabilities ===" << std::endl;
 const PipelineConfig &config = master.Config();
 master.MaximumLazyInput(counts, primary);
 
-Chains gamma_chains(config.order - 1);
-util::stream::ChainConfig read_backoffs(config.read_backoffs);
-read_backoffs.entry_size = sizeof(float);
+util::stream::Chains gamma_chains(config.order - 1);
 for (std::size_t i = 0; i < config.order - 1; ++i) {
+util::stream::ChainConfig read_backoffs(config.read_backoffs);
+
+// Add 1 because here we are skipping unigrams
+if(config.prune_thresholds[i + 1] > 0)
+read_backoffs.entry_size = sizeof(HashGamma);
+else
+read_backoffs.entry_size = sizeof(float);
+
 gamma_chains.push_back(read_backoffs);
 gamma_chains.back() >> gammas[i].Source();
 }
-master >> Interpolate(counts[0], ChainPositions(gamma_chains));
+master >> Interpolate(std::max(master.Config().vocab_size_for_unk, counts[0] - 1 /* <s> is not included */), util::stream::ChainPositions(gamma_chains), config.prune_thresholds);
 gamma_chains >> util::stream::kRecycle;
 master.BufferFinal(counts);
 }
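When pruning is active for an order, the backoff chain above switches its entry size from a bare float to a HashGamma record, so interpolation can match each surviving context to its backoff weight by hash rather than by position in the stream. A minimal sketch of the idea; the field names are assumed from lm/builder/hash_gamma.hh and the exact layout should be treated as an assumption:

    // Sketch only: a hashed backoff record as used when an order is pruned.
    // Field names assumed from lm/builder/hash_gamma.hh; layout illustrative.
    #include <stdint.h>
    #include <cstddef>

    namespace lm { namespace builder {
    struct HashGamma {
      uint64_t hash_value; // hash of the context words
      float gamma;         // backoff weight for that context
    };
    }} // namespaces

    // Choosing the stream entry size, as InterpolateProbabilities does above:
    // pruned orders carry (hash, gamma) pairs, unpruned orders a bare float.
    inline std::size_t BackoffEntrySize(bool pruned) {
      return pruned ? sizeof(lm::builder::HashGamma) : sizeof(float);
    }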
@@ -301,21 +312,22 @@ void Pipeline(PipelineConfig config, int text_file, int out_arpa) {
 CountText(text_file, vocab_file.get(), master, token_count, text_file_name);
 
 std::vector<uint64_t> counts;
+std::vector<uint64_t> counts_pruned;
 std::vector<Discount> discounts;
-master >> AdjustCounts(counts, discounts);
+master >> AdjustCounts(counts, counts_pruned, discounts, config.prune_thresholds);
 
 {
-FixedArray<util::stream::FileBuffer> gammas;
+util::FixedArray<util::stream::FileBuffer> gammas;
 Sorts<SuffixOrder> primary;
-InitialProbabilities(counts, discounts, master, primary, gammas);
-InterpolateProbabilities(counts, master, primary, gammas);
+InitialProbabilities(counts, counts_pruned, discounts, master, primary, gammas, config.prune_thresholds);
+InterpolateProbabilities(counts_pruned, master, primary, gammas);
 }
 
 std::cerr << "=== 5/5 Writing ARPA model ===" << std::endl;
 VocabReconstitute vocab(vocab_file.get());
 UTIL_THROW_IF(vocab.Size() != counts[0], util::Exception, "Vocab words don't match up. Is there a null byte in the input?");
 HeaderInfo header_info(text_file_name, token_count);
-master >> PrintARPA(vocab, counts, (config.verbose_header ? &header_info : NULL), out_arpa) >> util::stream::kRecycle;
+master >> PrintARPA(vocab, counts_pruned, (config.verbose_header ? &header_info : NULL), out_arpa) >> util::stream::kRecycle;
 master.MutableChains().Wait(true);
 }
 
@@ -1,8 +1,9 @@
-#ifndef LM_BUILDER_PIPELINE__
-#define LM_BUILDER_PIPELINE__
+#ifndef LM_BUILDER_PIPELINE_H
+#define LM_BUILDER_PIPELINE_H
 
 #include "lm/builder/initial_probabilities.hh"
 #include "lm/builder/header_info.hh"
+#include "lm/lm_exception.hh"
 #include "lm/word_index.hh"
 #include "util/stream/config.hh"
 #include "util/file_piece.hh"
@@ -30,6 +31,28 @@ struct PipelineConfig {
 // Number of blocks to use. This will be overridden to 1 if everything fits.
 std::size_t block_count;
 
+// n-gram count thresholds for pruning. A threshold of 0 means no pruning
+// for the corresponding n-gram order.
+std::vector<uint64_t> prune_thresholds; //mjd
+
+/* Computing the perplexity of LMs with different vocabularies is hard. For
+ * example, the lowest perplexity is attained by a unigram model that
+ * predicts p(<unk>) = 1 and has no other vocabulary. Also, linearly
+ * interpolated models will sum to more than 1 because <unk> is duplicated
+ * (SRI just pretends p(<unk>) = 0 for these purposes, which makes it sum to
+ * 1 but comes with its own problems). This option will make the vocabulary
+ * a particular size by replicating <unk> multiple times for purposes of
+ * computing vocabulary size. It has no effect if the actual vocabulary is
+ * larger. This parameter serves the same purpose as IRSTLM's "dub".
+ */
+uint64_t vocab_size_for_unk;
+
+/* What to do the first time <s>, </s>, or <unk> appears in the input. If
+ * this is anything but THROW_UP, then the symbol will always be treated as
+ * whitespace.
+ */
+WarningAction disallowed_symbol_action;
+
 const std::string &TempPrefix() const { return sort.temp_prefix; }
 std::size_t TotalMemory() const { return sort.total_memory; }
 };
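Taken together, the new members make pruning, vocabulary padding, and disallowed-symbol handling caller-controlled. A minimal sketch of filling them in, assuming only the fields shown above plus the pre-existing order member; the nonzero thresholds are illustrative, since the exact cutoff semantics live in adjust_counts:

    // Sketch: configuring the new PipelineConfig fields (other setup elided).
    lm::builder::PipelineConfig config;
    config.order = 3;
    // One threshold per order; index 0 is unigrams. 0 disables pruning.
    config.prune_thresholds.resize(config.order);
    config.prune_thresholds[0] = 0; // unigrams: never pruned
    config.prune_thresholds[1] = 1; // bigrams: threshold 1 (illustrative)
    config.prune_thresholds[2] = 2; // trigrams: threshold 2 (illustrative)
    config.vocab_size_for_unk = 0;  // 0: just use the real vocabulary size
    config.disallowed_symbol_action = lm::THROW_UP; // abort on raw <s>, </s>, <unk>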
@@ -38,4 +61,4 @@ struct PipelineConfig {
 void Pipeline(PipelineConfig config, int text_file, int out_arpa);
 
 }} // namespaces
-#endif // LM_BUILDER_PIPELINE__
+#endif // LM_BUILDER_PIPELINE_H
@@ -42,14 +42,14 @@ PrintARPA::PrintARPA(const VocabReconstitute &vocab, const std::vector<uint64_t>
 util::WriteOrThrow(out_fd, as_string.data(), as_string.size());
 }
 
-void PrintARPA::Run(const ChainPositions &positions) {
+void PrintARPA::Run(const util::stream::ChainPositions &positions) {
 util::scoped_fd closer(out_fd_);
 UTIL_TIMER("(%w s) Wrote ARPA file\n");
 util::FakeOFStream out(out_fd_);
 for (unsigned order = 1; order <= positions.size(); ++order) {
 out << "\\" << order << "-grams:" << '\n';
 for (NGramStream stream(positions[order - 1]); stream; ++stream) {
 // Correcting for numerical precision issues. Take that IRST.
 out << std::min(0.0f, stream->Value().complete.prob) << '\t' << vocab_.Lookup(*stream->begin());
 for (const WordIndex *i = stream->begin() + 1; i != stream->end(); ++i) {
 out << ' ' << vocab_.Lookup(*i);
@@ -58,6 +58,7 @@ void PrintARPA::Run(const ChainPositions &positions) {
 if (backoff != 0.0)
 out << '\t' << backoff;
 out << '\n';
+
 }
 out << '\n';
 }
@@ -1,8 +1,8 @@
-#ifndef LM_BUILDER_PRINT__
-#define LM_BUILDER_PRINT__
+#ifndef LM_BUILDER_PRINT_H
+#define LM_BUILDER_PRINT_H
 
 #include "lm/builder/ngram.hh"
-#include "lm/builder/multi_stream.hh"
+#include "lm/builder/ngram_stream.hh"
 #include "lm/builder/header_info.hh"
 #include "util/file.hh"
 #include "util/mmap.hh"
@@ -59,7 +59,7 @@ template <class V> class Print {
 public:
 explicit Print(const VocabReconstitute &vocab, std::ostream &to) : vocab_(vocab), to_(to) {}
 
-void Run(const ChainPositions &chains) {
+void Run(const util::stream::ChainPositions &chains) {
 NGramStreams streams(chains);
 for (NGramStream *s = streams.begin(); s != streams.end(); ++s) {
 DumpStream(*s);
@@ -92,7 +92,7 @@ class PrintARPA {
 // Takes ownership of out_fd upon Run().
 explicit PrintARPA(const VocabReconstitute &vocab, const std::vector<uint64_t> &counts, const HeaderInfo* header_info, int out_fd);
 
-void Run(const ChainPositions &positions);
+void Run(const util::stream::ChainPositions &positions);
 
 private:
 const VocabReconstitute &vocab_;
@@ -100,4 +100,4 @@ class PrintARPA {
 };
 
 }} // namespaces
-#endif // LM_BUILDER_PRINT__
+#endif // LM_BUILDER_PRINT_H
@@ -1,7 +1,7 @@
-#ifndef LM_BUILDER_SORT__
-#define LM_BUILDER_SORT__
+#ifndef LM_BUILDER_SORT_H
+#define LM_BUILDER_SORT_H
 
-#include "lm/builder/multi_stream.hh"
+#include "lm/builder/ngram_stream.hh"
 #include "lm/builder/ngram.hh"
 #include "lm/word_index.hh"
 #include "util/stream/sort.hh"
@@ -14,24 +14,71 @@
 namespace lm {
 namespace builder {
 
+/**
+* Abstract parent class for defining custom n-gram comparators.
+*/
 template <class Child> class Comparator : public std::binary_function<const void *, const void *, bool> {
 public:
 
+/**
+* Constructs a comparator capable of comparing two n-grams.
+*
+* @param order Number of words in each n-gram
+*/
 explicit Comparator(std::size_t order) : order_(order) {}
 
+/**
+* Applies the comparator using the Compare method that must be defined in any class that inherits from this class.
+*
+* @param lhs A pointer to the n-gram on the left-hand side of the comparison
+* @param rhs A pointer to the n-gram on the right-hand side of the comparison
+*
+* @see ContextOrder::Compare
+* @see PrefixOrder::Compare
+* @see SuffixOrder::Compare
+*/
 inline bool operator()(const void *lhs, const void *rhs) const {
 return static_cast<const Child*>(this)->Compare(static_cast<const WordIndex*>(lhs), static_cast<const WordIndex*>(rhs));
 }
 
+/** Gets the n-gram order defined for this comparator. */
 std::size_t Order() const { return order_; }
 
 protected:
 std::size_t order_;
 };
 
+/**
+* N-gram comparator that compares n-grams according to their reverse (suffix) order.
+*
+* This comparator compares n-grams lexicographically, one word at a time,
+* beginning with the last word of each n-gram and ending with the first word of each n-gram.
+*
+* Some examples of n-gram comparisons as defined by this comparator:
+* - a b c == a b c
+* - a b c < a b d
+* - a b c > a d b
+* - a b c > a b b
+* - a b c > x a c
+* - a b c < x y z
+*/
 class SuffixOrder : public Comparator<SuffixOrder> {
 public:
 
+/**
+* Constructs a comparator capable of comparing two n-grams.
+*
+* @param order Number of words in each n-gram
+*/
 explicit SuffixOrder(std::size_t order) : Comparator<SuffixOrder>(order) {}
 
+/**
+* Compares two n-grams lexicographically, one word at a time,
+* beginning with the last word of each n-gram and ending with the first word of each n-gram.
+*
+* @param lhs A pointer to the n-gram on the left-hand side of the comparison
+* @param rhs A pointer to the n-gram on the right-hand side of the comparison
+*/
 inline bool Compare(const WordIndex *lhs, const WordIndex *rhs) const {
 for (std::size_t i = order_ - 1; i != 0; --i) {
 if (lhs[i] != rhs[i])
@@ -43,10 +90,40 @@ class SuffixOrder : public Comparator<SuffixOrder> {
 static const unsigned kMatchOffset = 1;
 };
 
 
+/**
+* N-gram comparator that compares n-grams according to the reverse (suffix) order of the n-gram context.
+*
+* This comparator compares n-grams lexicographically, one word at a time,
+* beginning with the penultimate word of each n-gram and ending with the first word of each n-gram;
+* finally, this comparator compares the last word of each n-gram.
+*
+* Some examples of n-gram comparisons as defined by this comparator:
+* - a b c == a b c
+* - a b c < a b d
+* - a b c < a d b
+* - a b c > a b b
+* - a b c > x a c
+* - a b c < x y z
+*/
 class ContextOrder : public Comparator<ContextOrder> {
 public:
 
+/**
+* Constructs a comparator capable of comparing two n-grams.
+*
+* @param order Number of words in each n-gram
+*/
 explicit ContextOrder(std::size_t order) : Comparator<ContextOrder>(order) {}
 
+/**
+* Compares two n-grams lexicographically, one word at a time,
+* beginning with the penultimate word of each n-gram and ending with the first word of each n-gram;
+* finally, this comparator compares the last word of each n-gram.
+*
+* @param lhs A pointer to the n-gram on the left-hand side of the comparison
+* @param rhs A pointer to the n-gram on the right-hand side of the comparison
+*/
 inline bool Compare(const WordIndex *lhs, const WordIndex *rhs) const {
 for (int i = order_ - 2; i >= 0; --i) {
 if (lhs[i] != rhs[i])
@@ -56,10 +133,37 @@ class ContextOrder : public Comparator<ContextOrder> {
 }
 };
 
+/**
+* N-gram comparator that compares n-grams according to their natural (prefix) order.
+*
+* This comparator compares n-grams lexicographically, one word at a time,
+* beginning with the first word of each n-gram and ending with the last word of each n-gram.
+*
+* Some examples of n-gram comparisons as defined by this comparator:
+* - a b c == a b c
+* - a b c < a b d
+* - a b c < a d b
+* - a b c > a b b
+* - a b c < x a c
+* - a b c < x y z
+*/
 class PrefixOrder : public Comparator<PrefixOrder> {
 public:
 
+/**
+* Constructs a comparator capable of comparing two n-grams.
+*
+* @param order Number of words in each n-gram
+*/
 explicit PrefixOrder(std::size_t order) : Comparator<PrefixOrder>(order) {}
 
+/**
+* Compares two n-grams lexicographically, one word at a time,
+* beginning with the first word of each n-gram and ending with the last word of each n-gram.
+*
+* @param lhs A pointer to the n-gram on the left-hand side of the comparison
+* @param rhs A pointer to the n-gram on the right-hand side of the comparison
+*/
 inline bool Compare(const WordIndex *lhs, const WordIndex *rhs) const {
 for (std::size_t i = 0; i < order_; ++i) {
 if (lhs[i] != rhs[i])
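The three comparators differ only in which word position breaks ties first. A short sketch exercising the documented examples, under the assumptions that Compare implements less-than (the bodies are truncated in the hunks above) and that the toy vocabulary ids preserve the a < b < c ordering used in the comments:

    // Minimal sketch of the comparators above (assumes lm/builder/sort.hh).
    #include "lm/builder/sort.hh"
    #include <cassert>

    void CompareExamples() {
      using namespace lm::builder;
      // "a b c" vs "x a c" with toy vocabulary ids a=1, b=2, c=3, x=24.
      lm::WordIndex abc[] = {1, 2, 3};
      lm::WordIndex xac[] = {24, 1, 3};
      assert(!SuffixOrder(3).Compare(abc, xac));  // suffix order:  a b c > x a c
      assert(!ContextOrder(3).Compare(abc, xac)); // context order: a b c > x a c
      assert(PrefixOrder(3).Compare(abc, xac));   // prefix order:  a b c < x a c
    }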
@@ -84,15 +188,52 @@ struct AddCombiner {
 };
 
 // The combiner is only used on a single chain, so I didn't bother to allow
 // that template.
-template <class Compare> class Sorts : public FixedArray<util::stream::Sort<Compare> > {
+/**
+* Represents an @ref util::FixedArray "array" capable of storing @ref util::stream::Sort "Sort" objects.
+*
+* In the anticipated use case, an instance of this class will maintain one @ref util::stream::Sort "Sort" object
+* for each n-gram order (ranging from 1 up to the maximum n-gram order being processed).
+* Use in this manner would enable the n-grams of each n-gram order to be sorted in parallel.
+*
+* @tparam Compare An @ref Comparator "ngram comparator" to use during sorting.
+*/
+template <class Compare> class Sorts : public util::FixedArray<util::stream::Sort<Compare> > {
 private:
 typedef util::stream::Sort<Compare> S;
-typedef FixedArray<S> P;
+typedef util::FixedArray<S> P;
 
 public:
 
+/**
+* Constructs, but does not initialize.
+*
+* @ref util::FixedArray::Init() "Init" must be called before use.
+*
+* @see util::FixedArray::Init()
+*/
+Sorts() {}
+
+/**
+* Constructs an @ref util::FixedArray "array" capable of storing a fixed number of @ref util::stream::Sort "Sort" objects.
+*
+* @param number The maximum number of @ref util::stream::Sort "sorters" that can be held by this @ref util::FixedArray "array"
+* @see util::FixedArray::FixedArray()
+*/
+explicit Sorts(std::size_t number) : util::FixedArray<util::stream::Sort<Compare> >(number) {}
+
+/**
+* Constructs a new @ref util::stream::Sort "Sort" object which is stored in this @ref util::FixedArray "array".
+*
+* The new @ref util::stream::Sort "Sort" object is constructed using the provided @ref util::stream::SortConfig "SortConfig" and @ref Comparator "ngram comparator";
+* once constructed, a new worker @ref util::stream::Thread "thread" (owned by the @ref util::stream::Chain "chain") will sort the n-gram data stored
+* in the @ref util::stream::Block "blocks" of the provided @ref util::stream::Chain "chain".
+*
+* @see util::stream::Sort::Sort()
+* @see util::stream::Chain::operator>>()
+*/
 void push_back(util::stream::Chain &chain, const util::stream::SortConfig &config, const Compare &compare) {
-new (P::end()) S(chain, config, compare);
+new (P::end()) S(chain, config, compare); // use "placement new" syntax to initialize S in an already-allocated memory location
 P::Constructed();
 }
 };
@@ -100,4 +241,4 @@ template <class Compare> class Sorts : public FixedArray<util::stream::Sort<Comp
 } // namespace builder
 } // namespace lm
 
-#endif // LM_BUILDER_SORT__
+#endif // LM_BUILDER_SORT_H
|
@ -1,5 +1,5 @@
|
|||||||
#ifndef LM_CONFIG__
|
#ifndef LM_CONFIG_H
|
||||||
#define LM_CONFIG__
|
#define LM_CONFIG_H
|
||||||
|
|
||||||
#include "lm/lm_exception.hh"
|
#include "lm/lm_exception.hh"
|
||||||
#include "util/mmap.hh"
|
#include "util/mmap.hh"
|
||||||
@ -120,4 +120,4 @@ struct Config {
|
|||||||
|
|
||||||
} /* namespace ngram */ } /* namespace lm */
|
} /* namespace ngram */ } /* namespace lm */
|
||||||
|
|
||||||
#endif // LM_CONFIG__
|
#endif // LM_CONFIG_H
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
#ifndef LM_ENUMERATE_VOCAB__
|
#ifndef LM_ENUMERATE_VOCAB_H
|
||||||
#define LM_ENUMERATE_VOCAB__
|
#define LM_ENUMERATE_VOCAB_H
|
||||||
|
|
||||||
#include "lm/word_index.hh"
|
#include "lm/word_index.hh"
|
||||||
#include "util/string_piece.hh"
|
#include "util/string_piece.hh"
|
||||||
@ -24,5 +24,5 @@ class EnumerateVocab {
|
|||||||
|
|
||||||
} // namespace lm
|
} // namespace lm
|
||||||
|
|
||||||
#endif // LM_ENUMERATE_VOCAB__
|
#endif // LM_ENUMERATE_VOCAB_H
|
||||||
|
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
#ifndef LM_FACADE__
|
#ifndef LM_FACADE_H
|
||||||
#define LM_FACADE__
|
#define LM_FACADE_H
|
||||||
|
|
||||||
#include "lm/virtual_interface.hh"
|
#include "lm/virtual_interface.hh"
|
||||||
#include "util/string_piece.hh"
|
#include "util/string_piece.hh"
|
||||||
@ -70,4 +70,4 @@ template <class Child, class StateT, class VocabularyT> class ModelFacade : publ
|
|||||||
} // mamespace base
|
} // mamespace base
|
||||||
} // namespace lm
|
} // namespace lm
|
||||||
|
|
||||||
#endif // LM_FACADE__
|
#endif // LM_FACADE_H
|
||||||
|
@@ -1,5 +1,5 @@
-#ifndef LM_FILTER_ARPA_IO__
-#define LM_FILTER_ARPA_IO__
+#ifndef LM_FILTER_ARPA_IO_H
+#define LM_FILTER_ARPA_IO_H
 /* Input and output for ARPA format language model files.
 */
 #include "lm/read_arpa.hh"
@@ -111,4 +111,4 @@ template <class Output> void ReadARPA(util::FilePiece &in_lm, Output &out) {
 
 } // namespace lm
 
-#endif // LM_FILTER_ARPA_IO__
+#endif // LM_FILTER_ARPA_IO_H

@@ -1,5 +1,5 @@
-#ifndef LM_FILTER_COUNT_IO__
-#define LM_FILTER_COUNT_IO__
+#ifndef LM_FILTER_COUNT_IO_H
+#define LM_FILTER_COUNT_IO_H
 
 #include <fstream>
 #include <iostream>
@@ -86,4 +86,4 @@ template <class Output> void ReadCount(util::FilePiece &in_file, Output &out) {
 
 } // namespace lm
 
-#endif // LM_FILTER_COUNT_IO__
+#endif // LM_FILTER_COUNT_IO_H

@@ -1,5 +1,5 @@
-#ifndef LM_FILTER_FORMAT_H__
-#define LM_FILTER_FORMAT_H__
+#ifndef LM_FILTER_FORMAT_H
+#define LM_FILTER_FORMAT_H
 
 #include "lm/filter/arpa_io.hh"
 #include "lm/filter/count_io.hh"
@@ -247,4 +247,4 @@ class MultipleOutputBuffer {
 
 } // namespace lm
 
-#endif // LM_FILTER_FORMAT_H__
+#endif // LM_FILTER_FORMAT_H

@@ -1,5 +1,5 @@
-#ifndef LM_FILTER_PHRASE_H__
-#define LM_FILTER_PHRASE_H__
+#ifndef LM_FILTER_PHRASE_H
+#define LM_FILTER_PHRASE_H
 
 #include "util/murmur_hash.hh"
 #include "util/string_piece.hh"
@@ -165,4 +165,4 @@ class Multiple : public detail::ConditionCommon {
 
 } // namespace phrase
 } // namespace lm
-#endif // LM_FILTER_PHRASE_H__
+#endif // LM_FILTER_PHRASE_H

@@ -1,5 +1,5 @@
-#ifndef LM_FILTER_THREAD_H__
-#define LM_FILTER_THREAD_H__
+#ifndef LM_FILTER_THREAD_H
+#define LM_FILTER_THREAD_H
 
 #include "util/thread_pool.hh"
 
@@ -164,4 +164,4 @@ template <class Filter, class OutputBuffer, class RealOutput> class Controller :
 
 } // namespace lm
 
-#endif // LM_FILTER_THREAD_H__
+#endif // LM_FILTER_THREAD_H

@@ -1,5 +1,5 @@
-#ifndef LM_FILTER_VOCAB_H__
-#define LM_FILTER_VOCAB_H__
+#ifndef LM_FILTER_VOCAB_H
+#define LM_FILTER_VOCAB_H
 
 // Vocabulary-based filters for language models.
 
@@ -130,4 +130,4 @@ class Multiple {
 } // namespace vocab
 } // namespace lm
 
-#endif // LM_FILTER_VOCAB_H__
+#endif // LM_FILTER_VOCAB_H

@@ -1,5 +1,5 @@
-#ifndef LM_FILTER_WRAPPER_H__
-#define LM_FILTER_WRAPPER_H__
+#ifndef LM_FILTER_WRAPPER_H
+#define LM_FILTER_WRAPPER_H
 
 #include "util/string_piece.hh"
 
@@ -53,4 +53,4 @@ template <class FilterT> class ContextFilter {
 
 } // namespace lm
 
-#endif // LM_FILTER_WRAPPER_H__
+#endif // LM_FILTER_WRAPPER_H
@@ -35,8 +35,8 @@
 * phrase, even if hypotheses are generated left-to-right.
 */
 
-#ifndef LM_LEFT__
-#define LM_LEFT__
+#ifndef LM_LEFT_H
+#define LM_LEFT_H
 
 #include "lm/max_order.hh"
 #include "lm/state.hh"
@@ -213,4 +213,4 @@ template <class M> class RuleScore {
 } // namespace ngram
 } // namespace lm
 
-#endif // LM_LEFT__
+#endif // LM_LEFT_H

@@ -1,5 +1,5 @@
-#ifndef LM_LM_EXCEPTION__
-#define LM_LM_EXCEPTION__
+#ifndef LM_LM_EXCEPTION_H
+#define LM_LM_EXCEPTION_H
 
 // Named to avoid conflict with util/exception.hh.
 
@@ -1,9 +1,13 @@
-/* IF YOUR BUILD SYSTEM PASSES -DKENLM_MAX_ORDER, THEN CHANGE THE BUILD SYSTEM.
+#ifndef LM_MAX_ORDER_H
+#define LM_MAX_ORDER_H
+/* IF YOUR BUILD SYSTEM PASSES -DKENLM_MAX_ORDER_H, THEN CHANGE THE BUILD SYSTEM.
 * If not, this is the default maximum order.
 * Having this limit means that State can be
 * (kMaxOrder - 1) * sizeof(float) bytes instead of
 * sizeof(float*) + (kMaxOrder - 1) * sizeof(float) + malloc overhead
 */
 #ifndef KENLM_ORDER_MESSAGE
-#define KENLM_ORDER_MESSAGE "If your build system supports changing KENLM_MAX_ORDER, change it there and recompile. In the KenLM tarball or Moses, use e.g. `bjam --max-kenlm-order=6 -a'. Otherwise, edit lm/max_order.hh."
+#define KENLM_ORDER_MESSAGE "If your build system supports changing KENLM_MAX_ORDER_H, change it there and recompile. In the KenLM tarball or Moses, use e.g. `bjam --max-kenlm-order=6 -a'. Otherwise, edit lm/max_order.hh."
 #endif
 
+#endif // LM_MAX_ORDER_H
@@ -1,5 +1,5 @@
-#ifndef LM_MODEL__
-#define LM_MODEL__
+#ifndef LM_MODEL_H
+#define LM_MODEL_H
 
 #include "lm/bhiksha.hh"
 #include "lm/binary_format.hh"
@@ -153,4 +153,4 @@ base::Model *LoadVirtual(const char *file_name, const Config &config = Config(),
 } // namespace ngram
 } // namespace lm
 
-#endif // LM_MODEL__
+#endif // LM_MODEL_H

@@ -1,5 +1,5 @@
-#ifndef LM_MODEL_TYPE__
-#define LM_MODEL_TYPE__
+#ifndef LM_MODEL_TYPE_H
+#define LM_MODEL_TYPE_H
 
 namespace lm {
 namespace ngram {
@@ -20,4 +20,4 @@ const static ModelType kArrayAdd = static_cast<ModelType>(ARRAY_TRIE - TRIE);
 
 } // namespace ngram
 } // namespace lm
-#endif // LM_MODEL_TYPE__
+#endif // LM_MODEL_TYPE_H
@@ -1,8 +1,9 @@
-#ifndef LM_NGRAM_QUERY__
-#define LM_NGRAM_QUERY__
+#ifndef LM_NGRAM_QUERY_H
+#define LM_NGRAM_QUERY_H
 
 #include "lm/enumerate_vocab.hh"
 #include "lm/model.hh"
+#include "util/file_piece.hh"
 #include "util/usage.hh"
 
 #include <cstdlib>
@@ -16,64 +17,94 @@
 namespace lm {
 namespace ngram {
 
-template <class Model> void Query(const Model &model, bool sentence_context, std::istream &in_stream, std::ostream &out_stream) {
+struct BasicPrint {
+void Word(StringPiece, WordIndex, const FullScoreReturn &) const {}
+void Line(uint64_t oov, float total) const {
+std::cout << "Total: " << total << " OOV: " << oov << '\n';
+}
+void Summary(double, double, uint64_t, uint64_t) {}
+
+};
+
+struct FullPrint : public BasicPrint {
+void Word(StringPiece surface, WordIndex vocab, const FullScoreReturn &ret) const {
+std::cout << surface << '=' << vocab << ' ' << static_cast<unsigned int>(ret.ngram_length) << ' ' << ret.prob << '\t';
+}
+
+void Summary(double ppl_including_oov, double ppl_excluding_oov, uint64_t corpus_oov, uint64_t corpus_tokens) {
+std::cout <<
+"Perplexity including OOVs:\t" << ppl_including_oov << "\n"
+"Perplexity excluding OOVs:\t" << ppl_excluding_oov << "\n"
+"OOVs:\t" << corpus_oov << "\n"
+"Tokens:\t" << corpus_tokens << '\n'
+;
+}
+};
+
+template <class Model, class Printer> void Query(const Model &model, bool sentence_context) {
+Printer printer;
 typename Model::State state, out;
 lm::FullScoreReturn ret;
-std::string word;
+StringPiece word;
 
+util::FilePiece in(0);
+
 double corpus_total = 0.0;
+double corpus_total_oov_only = 0.0;
 uint64_t corpus_oov = 0;
 uint64_t corpus_tokens = 0;
 
-while (in_stream) {
+while (true) {
 state = sentence_context ? model.BeginSentenceState() : model.NullContextState();
 float total = 0.0;
-bool got = false;
 uint64_t oov = 0;
-while (in_stream >> word) {
-got = true;
+while (in.ReadWordSameLine(word)) {
 lm::WordIndex vocab = model.GetVocabulary().Index(word);
-if (vocab == 0) ++oov;
 ret = model.FullScore(state, vocab, out);
+if (vocab == model.GetVocabulary().NotFound()) {
+++oov;
+corpus_total_oov_only += ret.prob;
+}
 total += ret.prob;
-out_stream << word << '=' << vocab << ' ' << static_cast<unsigned int>(ret.ngram_length) << ' ' << ret.prob << '\t';
+printer.Word(word, vocab, ret);
 ++corpus_tokens;
 state = out;
-char c;
-while (true) {
-c = in_stream.get();
-if (!in_stream) break;
-if (c == '\n') break;
-if (!isspace(c)) {
-in_stream.unget();
-break;
-}
-}
-if (c == '\n') break;
 }
-if (!got && !in_stream) break;
+// If people don't have a newline after their last query, this won't add a </s>.
+// Sue me.
+try {
+UTIL_THROW_IF('\n' != in.get(), util::Exception, "FilePiece is confused.");
+} catch (const util::EndOfFileException &e) { break; }
 if (sentence_context) {
 ret = model.FullScore(state, model.GetVocabulary().EndSentence(), out);
 total += ret.prob;
 ++corpus_tokens;
-out_stream << "</s>=" << model.GetVocabulary().EndSentence() << ' ' << static_cast<unsigned int>(ret.ngram_length) << ' ' << ret.prob << '\t';
+printer.Word("</s>", model.GetVocabulary().EndSentence(), ret);
 }
-out_stream << "Total: " << total << " OOV: " << oov << '\n';
+printer.Line(oov, total);
 corpus_total += total;
 corpus_oov += oov;
 }
-out_stream << "Perplexity " << pow(10.0, -(corpus_total / static_cast<double>(corpus_tokens))) << std::endl;
+printer.Summary(
+pow(10.0, -(corpus_total / static_cast<double>(corpus_tokens))), // PPL including OOVs
+pow(10.0, -((corpus_total - corpus_total_oov_only) / static_cast<double>(corpus_tokens - corpus_oov))), // PPL excluding OOVs
+corpus_oov,
+corpus_tokens);
 }
 
-template <class M> void Query(const char *file, bool sentence_context, std::istream &in_stream, std::ostream &out_stream) {
-Config config;
-M model(file, config);
-Query(model, sentence_context, in_stream, out_stream);
+template <class Model> void Query(const char *file, const Config &config, bool sentence_context, bool show_words) {
+Model model(file, config);
+if (show_words) {
+Query<Model, FullPrint>(model, sentence_context);
+} else {
+Query<Model, BasicPrint>(model, sentence_context);
+}
 }
 
 } // namespace ngram
 } // namespace lm
 
-#endif // LM_NGRAM_QUERY__
+#endif // LM_NGRAM_QUERY_H
 
 
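The rewritten Query is a policy template: the Printer type parameter decides what gets written, so per-word output is compiled out entirely on the sentence-totals path rather than branched at runtime. Since the policy is a template parameter, hiding a base method is enough; no virtual dispatch is involved. A hedged sketch of plugging in a custom printer (TSVPrint is hypothetical; the interface is the one BasicPrint and FullPrint implement above):

    // Hypothetical printer policy: one tab-separated line per sentence.
    struct TSVPrint : public lm::ngram::BasicPrint {
      void Line(uint64_t oov, float total) const {
        std::cout << total << '\t' << oov << '\n';
      }
    };

    // Usage sketch, assuming a loaded model:
    // lm::ngram::Query<lm::ngram::ProbingModel, TSVPrint>(model, true);

For reference, Summary receives perplexities computed as 10^(-(sum of log10 probabilities)/N); the OOV-excluding figure subtracts both the OOV log-mass (corpus_total_oov_only) and the OOV token count before exponentiating.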
@@ -1,5 +1,5 @@
-#ifndef LM_PARTIAL__
-#define LM_PARTIAL__
+#ifndef LM_PARTIAL_H
+#define LM_PARTIAL_H
 
 #include "lm/return.hh"
 #include "lm/state.hh"
@@ -164,4 +164,4 @@ template <class Model> float Subsume(const Model &model, Left &first_left, const
 } // namespace ngram
 } // namespace lm
 
-#endif // LM_PARTIAL__
+#endif // LM_PARTIAL_H
@@ -1,5 +1,5 @@
-#ifndef LM_QUANTIZE_H__
-#define LM_QUANTIZE_H__
+#ifndef LM_QUANTIZE_H
+#define LM_QUANTIZE_H
 
 #include "lm/blank.hh"
 #include "lm/config.hh"
@@ -230,4 +230,4 @@ class SeparatelyQuantize {
 } // namespace ngram
 } // namespace lm
 
-#endif // LM_QUANTIZE_H__
+#endif // LM_QUANTIZE_H
@@ -1,4 +1,5 @@
 #include "lm/ngram_query.hh"
+#include "util/getopt.hh"
 
 #ifdef WITH_NPLM
 #include "lm/wrappers/nplm.hh"
@@ -7,47 +8,76 @@
 #include <stdlib.h>
 
 void Usage(const char *name) {
-std::cerr << "KenLM was compiled with maximum order " << KENLM_MAX_ORDER << "." << std::endl;
-std::cerr << "Usage: " << name << " [-n] lm_file" << std::endl;
-std::cerr << "Input is wrapped in <s> and </s> unless -n is passed." << std::endl;
+std::cerr <<
+"KenLM was compiled with maximum order " << KENLM_MAX_ORDER << ".\n"
+"Usage: " << name << " [-n] [-s] lm_file\n"
+"-n: Do not wrap the input in <s> and </s>.\n"
+"-s: Sentence totals only.\n"
+"-l lazy|populate|read|parallel: Load lazily, with populate, or malloc+read\n"
+"The default loading method is populate on Linux and read on others.\n";
 exit(1);
 }
 
 int main(int argc, char *argv[]) {
+if (argc == 1 || (argc == 2 && !strcmp(argv[1], "--help")))
+Usage(argv[0]);
+
+lm::ngram::Config config;
 bool sentence_context = true;
-const char *file = NULL;
-for (char **arg = argv + 1; arg != argv + argc; ++arg) {
-if (!strcmp(*arg, "-n")) {
-sentence_context = false;
-} else if (!strcmp(*arg, "-h") || !strcmp(*arg, "--help") || file) {
-Usage(argv[0]);
-} else {
-file = *arg;
+bool show_words = true;
+int opt;
+while ((opt = getopt(argc, argv, "hnsl:")) != -1) {
+switch (opt) {
+case 'n':
+sentence_context = false;
+break;
+case 's':
+show_words = false;
+break;
+case 'l':
+if (!strcmp(optarg, "lazy")) {
+config.load_method = util::LAZY;
+} else if (!strcmp(optarg, "populate")) {
+config.load_method = util::POPULATE_OR_READ;
+} else if (!strcmp(optarg, "read")) {
+config.load_method = util::READ;
+} else if (!strcmp(optarg, "parallel")) {
+config.load_method = util::PARALLEL_READ;
+} else {
+Usage(argv[0]);
+}
+break;
+case 'h':
+default:
+Usage(argv[0]);
 }
 }
-if (!file) Usage(argv[0]);
+if (optind + 1 != argc)
+Usage(argv[0]);
+const char *file = argv[optind];
 try {
 using namespace lm::ngram;
 ModelType model_type;
 if (RecognizeBinary(file, model_type)) {
 switch(model_type) {
 case PROBING:
-Query<lm::ngram::ProbingModel>(file, sentence_context, std::cin, std::cout);
+Query<lm::ngram::ProbingModel>(file, config, sentence_context, show_words);
 break;
 case REST_PROBING:
-Query<lm::ngram::RestProbingModel>(file, sentence_context, std::cin, std::cout);
+Query<lm::ngram::RestProbingModel>(file, config, sentence_context, show_words);
 break;
 case TRIE:
-Query<TrieModel>(file, sentence_context, std::cin, std::cout);
+Query<TrieModel>(file, config, sentence_context, show_words);
 break;
 case QUANT_TRIE:
-Query<QuantTrieModel>(file, sentence_context, std::cin, std::cout);
+Query<QuantTrieModel>(file, config, sentence_context, show_words);
 break;
 case ARRAY_TRIE:
-Query<ArrayTrieModel>(file, sentence_context, std::cin, std::cout);
+Query<ArrayTrieModel>(file, config, sentence_context, show_words);
 break;
 case QUANT_ARRAY_TRIE:
-Query<QuantArrayTrieModel>(file, sentence_context, std::cin, std::cout);
+Query<QuantArrayTrieModel>(file, config, sentence_context, show_words);
 break;
 default:
 std::cerr << "Unrecognized kenlm model type " << model_type << std::endl;
@@ -56,12 +86,15 @@ int main(int argc, char *argv[]) {
 #ifdef WITH_NPLM
 } else if (lm::np::Model::Recognize(file)) {
 lm::np::Model model(file);
-Query(model, sentence_context, std::cin, std::cout);
+if (show_words) {
+Query<lm::np::Model, lm::ngram::FullPrint>(model, sentence_context);
+} else {
+Query<lm::np::Model, lm::ngram::BasicPrint>(model, sentence_context);
+}
 #endif
 } else {
-Query<ProbingModel>(file, sentence_context, std::cin, std::cout);
+Query<ProbingModel>(file, config, sentence_context, show_words);
 }
-std::cerr << "Total time including destruction:\n";
 util::PrintUsage(std::cerr);
 } catch (const std::exception &e) {
 std::cerr << e.what() << std::endl;
@@ -1,5 +1,5 @@
-#ifndef LM_READ_ARPA__
-#define LM_READ_ARPA__
+#ifndef LM_READ_ARPA_H
+#define LM_READ_ARPA_H
 
 #include "lm/lm_exception.hh"
 #include "lm/word_index.hh"
@@ -28,7 +28,7 @@ void ReadEnd(util::FilePiece &in);
 
 extern const bool kARPASpaces[256];
 
 // Positive log probability warning.
 class PositiveProbWarn {
 public:
 PositiveProbWarn() : action_(THROW_UP) {}
@@ -41,24 +41,29 @@ class PositiveProbWarn {
 WarningAction action_;
 };
 
-template <class Voc, class Weights> void Read1Gram(util::FilePiece &f, Voc &vocab, Weights *unigrams, PositiveProbWarn &warn) {
+template <class Weights> StringPiece Read1Gram(util::FilePiece &f, Weights &weights, PositiveProbWarn &warn) {
 try {
-float prob = f.ReadFloat();
-if (prob > 0.0) {
-warn.Warn(prob);
-prob = 0.0;
+weights.prob = f.ReadFloat();
+if (weights.prob > 0.0) {
+warn.Warn(weights.prob);
+weights.prob = 0.0;
 }
-if (f.get() != '\t') UTIL_THROW(FormatLoadException, "Expected tab after probability");
-Weights &value = unigrams[vocab.Insert(f.ReadDelimited(kARPASpaces))];
-value.prob = prob;
-ReadBackoff(f, value);
+UTIL_THROW_IF(f.get() != '\t', FormatLoadException, "Expected tab after probability");
+StringPiece ret(f.ReadDelimited(kARPASpaces));
+ReadBackoff(f, weights);
+return ret;
 } catch(util::Exception &e) {
 e << " in the 1-gram at byte " << f.Offset();
 throw;
 }
 }
 
-// Return true if a positive log probability came out.
+template <class Voc, class Weights> void Read1Gram(util::FilePiece &f, Voc &vocab, Weights *unigrams, PositiveProbWarn &warn) {
+Weights temp;
+WordIndex word = vocab.Insert(Read1Gram(f, temp, warn));
+unigrams[word] = temp;
+}
+
 template <class Voc, class Weights> void Read1Grams(util::FilePiece &f, std::size_t count, Voc &vocab, Weights *unigrams, PositiveProbWarn &warn) {
 ReadNGramHeader(f, 1);
 for (std::size_t i = 0; i < count; ++i) {
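The refactor splits Read1Gram into a parsing-only overload that returns the surface word, plus a thin wrapper that inserts it into the vocabulary. That lets callers without a vocabulary scan unigram entries directly. A hedged usage sketch (ProbBackoff is the weights type from lm/weights.hh; f is assumed positioned at a 1-gram line):

    // Sketch: scanning one unigram entry without owning a vocabulary.
    #include "lm/read_arpa.hh"
    #include <string>

    void ScanOneUnigram(util::FilePiece &f) {
      lm::PositiveProbWarn warn;
      lm::ProbBackoff weights;
      StringPiece word = lm::Read1Gram(f, weights, warn);
      // The StringPiece aliases FilePiece's buffer, so copy it before the
      // next read if it must outlive this call.
      std::string copy(word.data(), word.size());
    }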
@@ -67,16 +72,16 @@ template <class Voc, class Weights> void Read1Grams(util::FilePiece &f, std::siz
 vocab.FinishedLoading(unigrams);
 }
 
-// Return true if a positive log probability came out.
-template <class Voc, class Weights> void ReadNGram(util::FilePiece &f, const unsigned char n, const Voc &vocab, WordIndex *const reverse_indices, Weights &weights, PositiveProbWarn &warn) {
+// Read ngram, write vocab ids to indices_out.
+template <class Voc, class Weights, class Iterator> void ReadNGram(util::FilePiece &f, const unsigned char n, const Voc &vocab, Iterator indices_out, Weights &weights, PositiveProbWarn &warn) {
 try {
 weights.prob = f.ReadFloat();
 if (weights.prob > 0.0) {
 warn.Warn(weights.prob);
 weights.prob = 0.0;
 }
-for (WordIndex *vocab_out = reverse_indices + n - 1; vocab_out >= reverse_indices; --vocab_out) {
-*vocab_out = vocab.Index(f.ReadDelimited(kARPASpaces));
+for (unsigned char i = 0; i < n; ++i, ++indices_out) {
+*indices_out = vocab.Index(f.ReadDelimited(kARPASpaces));
 }
 ReadBackoff(f, weights);
 } catch(util::Exception &e) {
@@ -87,4 +92,4 @@ template <class Voc, class Weights> void ReadNGram(util::FilePiece &f, const uns
 
 } // namespace lm
 
-#endif // LM_READ_ARPA__
+#endif // LM_READ_ARPA_H
@@ -1,5 +1,5 @@
-#ifndef LM_RETURN__
-#define LM_RETURN__
+#ifndef LM_RETURN_H
+#define LM_RETURN_H
 
 #include <stdint.h>
 
@@ -39,4 +39,4 @@ struct FullScoreReturn {
 };
 
 } // namespace lm
-#endif // LM_RETURN__
+#endif // LM_RETURN_H
@@ -178,7 +178,7 @@ template <class Build, class Activate, class Store> void ReadNGrams(
 typename Store::Entry entry;
 std::vector<typename Value::Weights *> between;
 for (size_t i = 0; i < count; ++i) {
-ReadNGram(f, n, vocab, &*vocab_ids.begin(), entry.value, warn);
+ReadNGram(f, n, vocab, vocab_ids.rbegin(), entry.value, warn);
 build.SetRest(&*vocab_ids.begin(), n, entry.value);
 
 keys[0] = detail::CombineWordHash(static_cast<uint64_t>(vocab_ids.front()), vocab_ids[1]);
@@ -1,5 +1,5 @@
-#ifndef LM_SEARCH_HASHED__
-#define LM_SEARCH_HASHED__
+#ifndef LM_SEARCH_HASHED_H
+#define LM_SEARCH_HASHED_H
 
 #include "lm/model_type.hh"
 #include "lm/config.hh"
@@ -189,4 +189,4 @@ template <class Value> class HashedSearch {
 } // namespace ngram
 } // namespace lm
 
-#endif // LM_SEARCH_HASHED__
+#endif // LM_SEARCH_HASHED_H
@@ -561,6 +561,7 @@ template <class Quant, class Bhiksha> uint8_t *TrieSearch<Quant, Bhiksha>::Setup
 }
 // Crazy backwards thing so we initialize using pointers to ones that have already been initialized
 for (unsigned char i = counts.size() - 1; i >= 2; --i) {
+// use "placement new" syntax to initialize Middle in an already-allocated memory location
 new (middle_begin_ + i - 2) Middle(
 middle_starts[i-2],
 quant_.MiddleBits(config),
@@ -1,5 +1,5 @@
-#ifndef LM_SEARCH_TRIE__
-#define LM_SEARCH_TRIE__
+#ifndef LM_SEARCH_TRIE_H
+#define LM_SEARCH_TRIE_H
 
 #include "lm/config.hh"
 #include "lm/model_type.hh"
@@ -127,4 +127,4 @@ template <class Quant, class Bhiksha> class TrieSearch {
 } // namespace ngram
 } // namespace lm
 
-#endif // LM_SEARCH_TRIE__
+#endif // LM_SEARCH_TRIE_H
@@ -1,5 +1,5 @@
-#ifndef LM_SIZES__
-#define LM_SIZES__
+#ifndef LM_SIZES_H
+#define LM_SIZES_H
 
 #include <vector>
 
@@ -14,4 +14,4 @@ void ShowSizes(const std::vector<uint64_t> &counts);
 void ShowSizes(const char *file, const lm::ngram::Config &config);
 
 }} // namespaces
-#endif // LM_SIZES__
+#endif // LM_SIZES_H
@@ -1,5 +1,5 @@
-#ifndef LM_STATE__
-#define LM_STATE__
+#ifndef LM_STATE_H
+#define LM_STATE_H
 
 #include "lm/max_order.hh"
 #include "lm/word_index.hh"
@@ -122,4 +122,4 @@ inline uint64_t hash_value(const ChartState &state) {
 } // namespace ngram
 } // namespace lm
 
-#endif // LM_STATE__
+#endif // LM_STATE_H
@@ -1,5 +1,5 @@
-#ifndef LM_TRIE__
-#define LM_TRIE__
+#ifndef LM_TRIE_H
+#define LM_TRIE_H
 
 #include "lm/weights.hh"
 #include "lm/word_index.hh"
@@ -143,4 +143,4 @@ class BitPackedLongest : public BitPacked {
 } // namespace ngram
 } // namespace lm
 
-#endif // LM_TRIE__
+#endif // LM_TRIE_H
@@ -16,6 +16,7 @@
 #include <cstdio>
 #include <cstdlib>
 #include <deque>
+#include <iterator>
 #include <limits>
 #include <vector>
 
@@ -248,11 +249,13 @@ void SortedFiles::ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vo
       uint8_t *out_end = out + std::min(count - done, batch_size) * entry_size;
       if (order == counts.size()) {
         for (; out != out_end; out += entry_size) {
-          ReadNGram(f, order, vocab, reinterpret_cast<WordIndex*>(out), *reinterpret_cast<Prob*>(out + words_size), warn);
+          std::reverse_iterator<WordIndex*> it(reinterpret_cast<WordIndex*>(out) + order);
+          ReadNGram(f, order, vocab, it, *reinterpret_cast<Prob*>(out + words_size), warn);
         }
       } else {
         for (; out != out_end; out += entry_size) {
-          ReadNGram(f, order, vocab, reinterpret_cast<WordIndex*>(out), *reinterpret_cast<ProbBackoff*>(out + words_size), warn);
+          std::reverse_iterator<WordIndex*> it(reinterpret_cast<WordIndex*>(out) + order);
+          ReadNGram(f, order, vocab, it, *reinterpret_cast<ProbBackoff*>(out + words_size), warn);
         }
       }
       // Sort full records by full n-gram.
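Both call sites now hand ReadNGram a std::reverse_iterator over the freshly allocated word slots, so the n-gram's words are written back to front: slot order-1 receives the first word read, slot 0 the last. A standalone sketch of that iterator trick:

    #include <iterator>
    #include <iostream>

    int main() {
      // Writing through a reverse_iterator built from (base + order) fills
      // base[order-1], base[order-2], ..., base[0], in that sequence.
      unsigned int words[3] = {0, 0, 0};
      std::reverse_iterator<unsigned int*> it(words + 3);
      for (unsigned int w = 1; w <= 3; ++w)
        *it++ = w;  // stand-in for ReadNGram storing vocabulary ids
      std::cout << words[0] << ' ' << words[1] << ' ' << words[2] << '\n';
      // Prints "3 2 1": the n-gram ends up stored in reversed order.
      return 0;
    }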
@@ -1,7 +1,7 @@
 // Step of trie builder: create sorted files.
 
-#ifndef LM_TRIE_SORT__
-#define LM_TRIE_SORT__
+#ifndef LM_TRIE_SORT_H
+#define LM_TRIE_SORT_H
 
 #include "lm/max_order.hh"
 #include "lm/word_index.hh"
@@ -111,4 +111,4 @@ class SortedFiles {
 } // namespace ngram
 } // namespace lm
 
-#endif // LM_TRIE_SORT__
+#endif // LM_TRIE_SORT_H
@@ -1,5 +1,5 @@
-#ifndef LM_VALUE__
-#define LM_VALUE__
+#ifndef LM_VALUE_H
+#define LM_VALUE_H
 
 #include "lm/model_type.hh"
 #include "lm/value_build.hh"
@@ -154,4 +154,4 @@ struct RestValue {
 } // namespace ngram
 } // namespace lm
 
-#endif // LM_VALUE__
+#endif // LM_VALUE_H
@@ -1,5 +1,5 @@
-#ifndef LM_VALUE_BUILD__
-#define LM_VALUE_BUILD__
+#ifndef LM_VALUE_BUILD_H
+#define LM_VALUE_BUILD_H
 
 #include "lm/weights.hh"
 #include "lm/word_index.hh"
@@ -94,4 +94,4 @@ template <class Model> class LowerRestBuild {
 } // namespace ngram
 } // namespace lm
 
-#endif // LM_VALUE_BUILD__
+#endif // LM_VALUE_BUILD_H
@@ -1,5 +1,5 @@
-#ifndef LM_VIRTUAL_INTERFACE__
-#define LM_VIRTUAL_INTERFACE__
+#ifndef LM_VIRTUAL_INTERFACE_H
+#define LM_VIRTUAL_INTERFACE_H
 
 #include "lm/return.hh"
 #include "lm/word_index.hh"
@@ -157,4 +157,4 @@ class Model {
 } // mamespace base
 } // namespace lm
 
-#endif // LM_VIRTUAL_INTERFACE__
+#endif // LM_VIRTUAL_INTERFACE_H
lm/vocab.cc (14 changed lines)
@@ -170,11 +170,15 @@ struct ProbingVocabularyHeader {
 
 ProbingVocabulary::ProbingVocabulary() : enumerate_(NULL) {}
 
-uint64_t ProbingVocabulary::Size(uint64_t entries, const Config &config) {
-  return ALIGN8(sizeof(detail::ProbingVocabularyHeader)) + Lookup::Size(entries, config.probing_multiplier);
+uint64_t ProbingVocabulary::Size(uint64_t entries, float probing_multiplier) {
+  return ALIGN8(sizeof(detail::ProbingVocabularyHeader)) + Lookup::Size(entries, probing_multiplier);
 }
 
-void ProbingVocabulary::SetupMemory(void *start, std::size_t allocated, std::size_t /*entries*/, const Config &/*config*/) {
+uint64_t ProbingVocabulary::Size(uint64_t entries, const Config &config) {
+  return Size(entries, config.probing_multiplier);
+}
+
+void ProbingVocabulary::SetupMemory(void *start, std::size_t allocated) {
   header_ = static_cast<detail::ProbingVocabularyHeader*>(start);
   lookup_ = Lookup(static_cast<uint8_t*>(start) + ALIGN8(sizeof(detail::ProbingVocabularyHeader)), allocated);
   bound_ = 1;
@@ -201,12 +205,12 @@ WordIndex ProbingVocabulary::Insert(const StringPiece &str) {
     return 0;
   } else {
     if (enumerate_) enumerate_->Add(bound_, str);
-    lookup_.Insert(ProbingVocabuaryEntry::Make(hashed, bound_));
+    lookup_.Insert(ProbingVocabularyEntry::Make(hashed, bound_));
     return bound_++;
   }
 }
 
-void ProbingVocabulary::InternalFinishedLoading() {
+void ProbingVocabulary::FinishedLoading() {
   lookup_.FinishedInserting();
   header_->bound = bound_;
   header_->version = kProbingVocabularyVersion;
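The new float overload lets callers size a probing vocabulary without constructing a full Config. A sketch of the intended use, assuming KenLM's headers are on the include path and using 1.5 as the probing multiplier (1.5 is Config's usual default, but treat both the driver and the constant as assumptions, not code from this commit):

    #include "lm/vocab.hh"
    #include <stdint.h>
    #include <cstdlib>

    void BuildVocab(uint64_t expected_entries) {
      uint64_t bytes = lm::ngram::ProbingVocabulary::Size(expected_entries, 1.5f);
      void *block = std::malloc(bytes);    // caller owns the memory
      lm::ngram::ProbingVocabulary vocab;
      vocab.SetupMemory(block, bytes);     // the new two-argument overload
      // ... Insert() words, then FinishedLoading() ...
      std::free(block);
    }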
lm/vocab.hh (90 changed lines)
@@ -1,9 +1,11 @@
-#ifndef LM_VOCAB__
-#define LM_VOCAB__
+#ifndef LM_VOCAB_H
+#define LM_VOCAB_H
 
 #include "lm/enumerate_vocab.hh"
 #include "lm/lm_exception.hh"
 #include "lm/virtual_interface.hh"
+#include "util/fake_ofstream.hh"
+#include "util/murmur_hash.hh"
 #include "util/pool.hh"
 #include "util/probing_hash_table.hh"
 #include "util/sorted_uniform.hh"
@@ -104,17 +106,16 @@ class SortedVocabulary : public base::Vocabulary {
 
 #pragma pack(push)
 #pragma pack(4)
-struct ProbingVocabuaryEntry {
+struct ProbingVocabularyEntry {
   uint64_t key;
   WordIndex value;
 
   typedef uint64_t Key;
-  uint64_t GetKey() const {
-    return key;
-  }
+  uint64_t GetKey() const { return key; }
+  void SetKey(uint64_t to) { key = to; }
 
-  static ProbingVocabuaryEntry Make(uint64_t key, WordIndex value) {
-    ProbingVocabuaryEntry ret;
+  static ProbingVocabularyEntry Make(uint64_t key, WordIndex value) {
+    ProbingVocabularyEntry ret;
     ret.key = key;
     ret.value = value;
     return ret;
@@ -132,13 +133,18 @@ class ProbingVocabulary : public base::Vocabulary {
     return lookup_.Find(detail::HashForVocab(str), i) ? i->value : 0;
   }
 
+  static uint64_t Size(uint64_t entries, float probing_multiplier);
+  // This just unwraps Config to get the probing_multiplier.
   static uint64_t Size(uint64_t entries, const Config &config);
 
   // Vocab words are [0, Bound()).
   WordIndex Bound() const { return bound_; }
 
   // Everything else is for populating.  I'm too lazy to hide and friend these, but you'll only get a const reference anyway.
-  void SetupMemory(void *start, std::size_t allocated, std::size_t entries, const Config &config);
+  void SetupMemory(void *start, std::size_t allocated);
+  void SetupMemory(void *start, std::size_t allocated, std::size_t /*entries*/, const Config &/*config*/) {
+    SetupMemory(start, allocated);
+  }
 
   void Relocate(void *new_start);
 
@@ -147,8 +153,9 @@ class ProbingVocabulary : public base::Vocabulary {
   WordIndex Insert(const StringPiece &str);
 
   template <class Weights> void FinishedLoading(Weights * /*reorder_vocab*/) {
-    InternalFinishedLoading();
+    FinishedLoading();
   }
+  void FinishedLoading();
 
   std::size_t UnkCountChangePadding() const { return 0; }
 
@@ -157,9 +164,7 @@ class ProbingVocabulary : public base::Vocabulary {
   void LoadedBinary(bool have_words, int fd, EnumerateVocab *to, uint64_t offset);
 
  private:
-  void InternalFinishedLoading();
-
-  typedef util::ProbingHashTable<ProbingVocabuaryEntry, util::IdentityHash> Lookup;
+  typedef util::ProbingHashTable<ProbingVocabularyEntry, util::IdentityHash> Lookup;
 
   Lookup lookup_;
 
@@ -181,7 +186,64 @@ template <class Vocab> void CheckSpecials(const Config &config, const Vocab &voc
   if (vocab.EndSentence() == vocab.NotFound()) MissingSentenceMarker(config, "</s>");
 }
 
+class WriteUniqueWords {
+  public:
+    explicit WriteUniqueWords(int fd) : word_list_(fd) {}
+
+    void operator()(const StringPiece &word) {
+      word_list_ << word << '\0';
+    }
+
+  private:
+    util::FakeOFStream word_list_;
+};
+
+class NoOpUniqueWords {
+  public:
+    NoOpUniqueWords() {}
+    void operator()(const StringPiece &word) {}
+};
+
+template <class NewWordAction = NoOpUniqueWords> class GrowableVocab {
+  public:
+    static std::size_t MemUsage(WordIndex content) {
+      return Lookup::MemUsage(content > 2 ? content : 2);
+    }
+
+    // Does not take ownership of write_wordi
+    template <class NewWordConstruct> GrowableVocab(WordIndex initial_size, const NewWordConstruct &new_word_construct = NewWordAction())
+      : lookup_(initial_size), new_word_(new_word_construct) {
+      FindOrInsert("<unk>"); // Force 0
+      FindOrInsert("<s>"); // Force 1
+      FindOrInsert("</s>"); // Force 2
+    }
+
+    WordIndex Index(const StringPiece &str) const {
+      Lookup::ConstIterator i;
+      return lookup_.Find(detail::HashForVocab(str), i) ? i->value : 0;
+    }
+
+    WordIndex FindOrInsert(const StringPiece &word) {
+      ProbingVocabularyEntry entry = ProbingVocabularyEntry::Make(util::MurmurHashNative(word.data(), word.size()), Size());
+      Lookup::MutableIterator it;
+      if (!lookup_.FindOrInsert(entry, it)) {
+        new_word_(word);
+        UTIL_THROW_IF(Size() >= std::numeric_limits<lm::WordIndex>::max(), VocabLoadException, "Too many vocabulary words.  Change WordIndex to uint64_t in lm/word_index.hh");
+      }
+      return it->value;
+    }
+
+    WordIndex Size() const { return lookup_.Size(); }
+
+  private:
+    typedef util::AutoProbing<ProbingVocabularyEntry, util::IdentityHash> Lookup;
+
+    Lookup lookup_;
+
+    NewWordAction new_word_;
+};
+
 } // namespace ngram
 } // namespace lm
 
-#endif // LM_VOCAB__
+#endif // LM_VOCAB_H
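GrowableVocab is the genuinely new piece in this header: an auto-resizing probing table that hands out dense word ids as text streams past, optionally recording each first-seen word through the NewWordAction functor. A usage sketch built only from the members added above (the call site itself is hypothetical):

    #include "lm/vocab.hh"

    void CountWords() {
      // The constructor forces <unk> = 0, <s> = 1, </s> = 2 before anything
      // else is inserted; NoOpUniqueWords ignores new-word notifications.
      lm::ngram::GrowableVocab<> vocab(64, lm::ngram::NoOpUniqueWords());
      lm::WordIndex first = vocab.FindOrInsert("dog");   // assigns a new id
      lm::WordIndex again = vocab.FindOrInsert("dog");   // same id, no growth
      // Index() is a pure lookup: unknown words map to 0 (<unk>).
      lm::WordIndex missing = vocab.Index("cat");
      (void)first; (void)again; (void)missing;
    }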
@@ -1,5 +1,5 @@
-#ifndef LM_WEIGHTS__
-#define LM_WEIGHTS__
+#ifndef LM_WEIGHTS_H
+#define LM_WEIGHTS_H
 
 // Weights for n-grams.  Probability and possibly a backoff.
 
@@ -19,4 +19,4 @@ struct RestWeights {
 };
 
 } // namespace lm
-#endif // LM_WEIGHTS__
+#endif // LM_WEIGHTS_H
@@ -1,6 +1,6 @@
 // Separate header because this is used often.
-#ifndef LM_WORD_INDEX__
-#define LM_WORD_INDEX__
+#ifndef LM_WORD_INDEX_H
+#define LM_WORD_INDEX_H
 
 #include <limits.h>
 
@@ -24,8 +24,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 #include "moses/InputType.h"
 #include "moses/Phrase.h"
 #include "moses/TrellisPathList.h"
-#include "moses/ChartTrellisPathList.h"
-#include "moses/ChartTrellisPath.h"
+#include "moses/ChartKBestExtractor.h"
 
 using namespace std;
 using namespace Moses;
@@ -223,28 +222,28 @@ vector< vector<const Word*> > MosesDecoder::runChartDecoder(const std::string& s
   // run the decoder
   m_chartManager = new ChartManager(*m_sentence);
   m_chartManager->ProcessSentence();
-  ChartTrellisPathList nBestList;
+  ChartKBestExtractor::KBestVec nBestList;
   m_chartManager->CalcNBest(nBestSize, nBestList, distinct);
 
   // read off the feature values and bleu scores for each sentence in the nbest list
-  ChartTrellisPathList::const_iterator iter;
-  for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
-    const Moses::ChartTrellisPath &path = **iter;
-    featureValues.push_back(path.GetScoreBreakdown());
+  for (ChartKBestExtractor::KBestVec::const_iterator p = nBestList.begin();
+       p != nBestList.end(); ++p) {
+    const ChartKBestExtractor::Derivation &derivation = **p;
+    featureValues.push_back(derivation.scoreBreakdown);
     float bleuScore, dynBleuScore, realBleuScore;
     dynBleuScore = getBleuScore(featureValues.back());
-    realBleuScore = m_bleuScoreFeature->CalculateBleu(path.GetOutputPhrase());
+    Phrase outputPhrase = ChartKBestExtractor::GetOutputPhrase(derivation);
+    realBleuScore = m_bleuScoreFeature->CalculateBleu(outputPhrase);
     bleuScore = realBleu ? realBleuScore : dynBleuScore;
     bleuScores.push_back(bleuScore);
 
-    //std::cout << "Score breakdown: " << path.GetScoreBreakdown() << endl;
-    float scoreWithoutBleu = path.GetTotalScore() - (bleuObjectiveWeight * bleuScoreWeight * bleuScore);
+    float scoreWithoutBleu = derivation.score - (bleuObjectiveWeight * bleuScoreWeight * bleuScore);
     modelScores.push_back(scoreWithoutBleu);
 
-    if (iter != nBestList.begin())
+    if (p != nBestList.begin())
       cerr << endl;
-    cerr << "Rank " << rank << ", epoch " << epoch << ", \"" << path.GetOutputPhrase() << "\", score: "
-         << scoreWithoutBleu << ", Bleu: " << bleuScore << ", total: " << path.GetTotalScore();
+    cerr << "Rank " << rank << ", epoch " << epoch << ", \"" << outputPhrase << "\", score: "
+         << scoreWithoutBleu << ", Bleu: " << bleuScore << ", total: " << derivation.score;
     if (m_bleuScoreFeature->Enabled() && realBleu)
       cerr << " (d-bleu: " << dynBleuScore << ", r-bleu: " << realBleuScore << ") ";
 
@@ -254,9 +253,10 @@ vector< vector<const Word*> > MosesDecoder::runChartDecoder(const std::string& s
 
   // prepare translations to return
   vector< vector<const Word*> > translations;
-  for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
-    const ChartTrellisPath &path = **iter;
-    Phrase phrase = path.GetOutputPhrase();
+  for (ChartKBestExtractor::KBestVec::const_iterator p = nBestList.begin();
+       p != nBestList.end(); ++p) {
+    const ChartKBestExtractor::Derivation &derivation = **p;
+    Phrase phrase = ChartKBestExtractor::GetOutputPhrase(derivation);
 
     vector<const Word*> translation;
     for (size_t pos = 0; pos < phrase.GetSize(); ++pos) {
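For readers porting similar code: KBestVec, per the IOWrapper declarations later in this diff, is a std::vector of boost::shared_ptr<Derivation>, so the n-best entries are reference counted and nothing needs manual deletion. A sketch of the minimal consumer loop, assuming a ChartManager named manager:

    Moses::ChartKBestExtractor::KBestVec nBestList;
    manager.CalcNBest(10, nBestList, /* onlyDistinct = */ true);
    for (Moses::ChartKBestExtractor::KBestVec::const_iterator p = nBestList.begin();
         p != nBestList.end(); ++p) {
      const Moses::ChartKBestExtractor::Derivation &d = **p;
      Moses::Phrase output = Moses::ChartKBestExtractor::GetOutputPhrase(d);
      // d.score and d.scoreBreakdown carry the model scores, as used above.
    }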
@@ -24,7 +24,6 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 #include <sstream>
 
 
-#include "moses/ChartTrellisPathList.h"
 #include "moses/Hypothesis.h"
 #include "moses/Parameter.h"
 #include "moses/SearchNormal.h"
@@ -37,8 +37,6 @@ namespace mpi = boost::mpi;
 #include "Hildreth.h"
 #include "HypothesisQueue.h"
 #include "moses/StaticData.h"
-#include "moses/ChartTrellisPathList.h"
-#include "moses/ChartTrellisPath.h"
 #include "moses/ScoreComponentCollection.h"
 #include "moses/ThreadPool.h"
 #include "mert/BleuScorer.h"
@@ -42,15 +42,13 @@ POSSIBILITY OF SUCH DAMAGE.
 #include "moses/InputFileStream.h"
 #include "moses/Incremental.h"
 #include "moses/TranslationModel/PhraseDictionary.h"
-#include "moses/ChartTrellisPathList.h"
-#include "moses/ChartTrellisPath.h"
-#include "moses/ChartTrellisNode.h"
 #include "moses/ChartTranslationOptions.h"
 #include "moses/ChartHypothesis.h"
 #include "moses/FeatureVector.h"
 #include "moses/FF/StatefulFeatureFunction.h"
 #include "moses/FF/StatelessFeatureFunction.h"
 #include "moses/FF/TreeStructureFeature.h"
+#include "moses/PP/TreeStructurePhraseProperty.h"
 #include "util/exception.hh"
 
 using namespace std;
@@ -413,17 +411,15 @@ void IOWrapper::OutputTreeFragmentsTranslationOptions(std::ostream &out, Applica
   if (hypo != NULL) {
     OutputTranslationOption(out, applicationContext, hypo, sentence, translationId);
 
-    const std::string key = "Tree";
-    std::string value;
-    bool hasProperty;
     const TargetPhrase &currTarPhr = hypo->GetCurrTargetPhrase();
-    currTarPhr.GetProperty(key, value, hasProperty);
+    boost::shared_ptr<PhraseProperty> property;
 
     out << " ||| ";
-    if (hasProperty)
-      out << " " << value;
-    else
+    if (currTarPhr.GetProperty("Tree", property)) {
+      out << " " << property->GetValueString();
+    } else {
       out << " " << "noTreeInfo";
+    }
     out << std::endl;
   }
 
@@ -442,17 +438,15 @@ void IOWrapper::OutputTreeFragmentsTranslationOptions(std::ostream &out, Applica
   if (applied != NULL) {
     OutputTranslationOption(out, applicationContext, applied, sentence, translationId);
 
-    const std::string key = "Tree";
-    std::string value;
-    bool hasProperty;
     const TargetPhrase &currTarPhr = *static_cast<const TargetPhrase*>(applied->GetNote().vp);
-    currTarPhr.GetProperty(key, value, hasProperty);
+    boost::shared_ptr<PhraseProperty> property;
 
     out << " ||| ";
-    if (hasProperty)
-      out << " " << value;
-    else
+    if (currTarPhr.GetProperty("Tree", property)) {
+      out << " " << property->GetValueString();
+    } else {
       out << " " << "noTreeInfo";
+    }
     out << std::endl;
   }
 
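Both hunks switch to the same lookup pattern, so it is worth spelling out once: GetProperty now fills a boost::shared_ptr<PhraseProperty> and returns whether the key exists, replacing the old string-and-bool out-parameters. A sketch of the pattern against a hypothetical TargetPhrase reference tp and output stream out:

    boost::shared_ptr<Moses::PhraseProperty> property;
    if (tp.GetProperty("Tree", property)) {
      // GetValueString() exposes the raw property payload.
      out << " " << property->GetValueString();
    } else {
      out << " " << "noTreeInfo";
    }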
@@ -705,94 +699,6 @@ void IOWrapper::OutputFeatureScores( std::ostream& out, const ScoreComponentColl
   }
 }
 
-void IOWrapper::OutputNBestList(const ChartTrellisPathList &nBestList, long translationId)
-{
-  std::ostringstream out;
-
-  // Check if we're writing to std::cout.
-  if (m_nBestOutputCollector->OutputIsCout()) {
-    // Set precision only if we're writing the n-best list to cout.  This is to
-    // preserve existing behaviour, but should probably be done either way.
-    IOWrapper::FixPrecision(out);
-
-    // Used to check StaticData's GetOutputHypoScore(), but it makes no sense with nbest output.
-  }
-
-  //bool includeAlignment = StaticData::Instance().NBestIncludesAlignment();
-  bool includeWordAlignment = StaticData::Instance().PrintAlignmentInfoInNbest();
-
-  ChartTrellisPathList::const_iterator iter;
-  for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
-    const ChartTrellisPath &path = **iter;
-    //cerr << path << endl << endl;
-
-    Moses::Phrase outputPhrase = path.GetOutputPhrase();
-
-    // delete 1st & last
-    UTIL_THROW_IF2(outputPhrase.GetSize() < 2,
-        "Output phrase should have contained at least 2 words (beginning and end-of-sentence)");
-
-    outputPhrase.RemoveWord(0);
-    outputPhrase.RemoveWord(outputPhrase.GetSize() - 1);
-
-    // print the surface factor of the translation
-    out << translationId << " ||| ";
-    OutputSurface(out, outputPhrase, m_outputFactorOrder, false);
-    out << " ||| ";
-
-    // print the scores in a hardwired order
-    // before each model type, the corresponding command-line-like name must be emitted
-    // MERT script relies on this
-
-    OutputAllFeatureScores(path.GetScoreBreakdown(), out);
-
-    // total
-    out << " ||| " << path.GetTotalScore();
-
-    /*
-    if (includeAlignment) {
-      *m_nBestStream << " |||";
-      for (int currEdge = (int)edges.size() - 2 ; currEdge >= 0 ; currEdge--)
-      {
-        const ChartHypothesis &edge = *edges[currEdge];
-        WordsRange sourceRange = edge.GetCurrSourceWordsRange();
-        WordsRange targetRange = edge.GetCurrTargetWordsRange();
-        *m_nBestStream << " " << sourceRange.GetStartPos();
-        if (sourceRange.GetStartPos() < sourceRange.GetEndPos()) {
-          *m_nBestStream << "-" << sourceRange.GetEndPos();
-        }
-        *m_nBestStream << "=" << targetRange.GetStartPos();
-        if (targetRange.GetStartPos() < targetRange.GetEndPos()) {
-          *m_nBestStream << "-" << targetRange.GetEndPos();
-        }
-      }
-    }
-    */
-
-    if (includeWordAlignment) {
-      out << " ||| ";
-
-      Alignments retAlign;
-
-      const ChartTrellisNode &node = path.GetFinalNode();
-      OutputAlignmentNBest(retAlign, node, 0);
-
-      Alignments::const_iterator iter;
-      for (iter = retAlign.begin(); iter != retAlign.end(); ++iter) {
-        const pair<size_t, size_t> &alignPoint = *iter;
-        out << alignPoint.first << "-" << alignPoint.second << " ";
-      }
-    }
-
-    out << endl;
-  }
-
-  out <<std::flush;
-
-  assert(m_nBestOutputCollector);
-  m_nBestOutputCollector->Write(translationId, out.str());
-}
-
 void IOWrapper::OutputNBestList(const ChartKBestExtractor::KBestVec &nBestList,
                                 long translationId)
 {
@@ -904,81 +810,6 @@ size_t CalcSourceSize(const Moses::ChartHypothesis *hypo)
   return ret;
 }
 
-size_t IOWrapper::OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartTrellisNode &node, size_t startTarget)
-{
-  const ChartHypothesis *hypo = &node.GetHypothesis();
-
-  size_t totalTargetSize = 0;
-  size_t startSource = hypo->GetCurrSourceRange().GetStartPos();
-
-  const TargetPhrase &tp = hypo->GetCurrTargetPhrase();
-
-  size_t thisSourceSize = CalcSourceSize(hypo);
-
-  // position of each terminal word in translation rule, irrespective of alignment
-  // if non-term, number is undefined
-  vector<size_t> sourceOffsets(thisSourceSize, 0);
-  vector<size_t> targetOffsets(tp.GetSize(), 0);
-
-  const ChartTrellisNode::NodeChildren &prevNodes = node.GetChildren();
-
-  const AlignmentInfo &aiNonTerm = hypo->GetCurrTargetPhrase().GetAlignNonTerm();
-  vector<size_t> sourceInd2pos = aiNonTerm.GetSourceIndex2PosMap();
-  const AlignmentInfo::NonTermIndexMap &targetPos2SourceInd = aiNonTerm.GetNonTermIndexMap();
-
-  UTIL_THROW_IF2(sourceInd2pos.size() != prevNodes.size(), "Error");
-
-  size_t targetInd = 0;
-  for (size_t targetPos = 0; targetPos < tp.GetSize(); ++targetPos) {
-    if (tp.GetWord(targetPos).IsNonTerminal()) {
-      UTIL_THROW_IF2(targetPos >= targetPos2SourceInd.size(), "Error");
-      size_t sourceInd = targetPos2SourceInd[targetPos];
-      size_t sourcePos = sourceInd2pos[sourceInd];
-
-      const ChartTrellisNode &prevNode = *prevNodes[sourceInd];
-
-      // calc source size
-      size_t sourceSize = prevNode.GetHypothesis().GetCurrSourceRange().GetNumWordsCovered();
-      sourceOffsets[sourcePos] = sourceSize;
-
-      // calc target size.
-      // Recursively look thru child hypos
-      size_t currStartTarget = startTarget + totalTargetSize;
-      size_t targetSize = OutputAlignmentNBest(retAlign, prevNode, currStartTarget);
-      targetOffsets[targetPos] = targetSize;
-
-      totalTargetSize += targetSize;
-      ++targetInd;
-    } else {
-      ++totalTargetSize;
-    }
-  }
-
-  // convert position within translation rule to absolute position within
-  // source sentence / output sentence
-  ShiftOffsets(sourceOffsets, startSource);
-  ShiftOffsets(targetOffsets, startTarget);
-
-  // get alignments from this hypo
-  const AlignmentInfo &aiTerm = hypo->GetCurrTargetPhrase().GetAlignTerm();
-
-  // add to output arg, offsetting by source & target
-  AlignmentInfo::const_iterator iter;
-  for (iter = aiTerm.begin(); iter != aiTerm.end(); ++iter) {
-    const std::pair<size_t,size_t> &align = *iter;
-    size_t relSource = align.first;
-    size_t relTarget = align.second;
-    size_t absSource = sourceOffsets[relSource];
-    size_t absTarget = targetOffsets[relTarget];
-
-    pair<size_t, size_t> alignPoint(absSource, absTarget);
-    pair<Alignments::iterator, bool> ret = retAlign.insert(alignPoint);
-    UTIL_THROW_IF2(!ret.second, "Error");
-  }
-
-  return totalTargetSize;
-}
-
 size_t IOWrapper::OutputAlignmentNBest(
   Alignments &retAlign,
   const Moses::ChartKBestExtractor::Derivation &derivation,
@@ -41,17 +41,14 @@ POSSIBILITY OF SUCH DAMAGE.
 #include "moses/Sentence.h"
 #include "moses/FactorTypeSet.h"
 #include "moses/ChartKBestExtractor.h"
-#include "moses/ChartTrellisPathList.h"
 #include "moses/OutputCollector.h"
 #include "moses/ChartHypothesis.h"
-#include "moses/ChartTrellisPath.h"
 #include "search/applied.hh"
 #include "moses/ChartManager.h"
 
 namespace Moses
 {
 class FactorCollection;
-class ChartTrellisPathList;
 class ScoreComponentCollection;
 }
 
|
|||||||
Moses::OutputCollector *m_unknownsCollector;
|
Moses::OutputCollector *m_unknownsCollector;
|
||||||
|
|
||||||
typedef std::set< std::pair<size_t, size_t> > Alignments;
|
typedef std::set< std::pair<size_t, size_t> > Alignments;
|
||||||
size_t OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartTrellisNode &node, size_t startTarget);
|
|
||||||
std::size_t OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartKBestExtractor::Derivation &derivation, std::size_t startTarget);
|
std::size_t OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartKBestExtractor::Derivation &derivation, std::size_t startTarget);
|
||||||
size_t OutputAlignment(Alignments &retAlign, const Moses::ChartHypothesis *hypo, size_t startTarget);
|
size_t OutputAlignment(Alignments &retAlign, const Moses::ChartHypothesis *hypo, size_t startTarget);
|
||||||
void OutputAlignment(std::vector< std::set<size_t> > &retAlignmentsS2T, const Moses::AlignmentInfo &ai);
|
void OutputAlignment(std::vector< std::set<size_t> > &retAlignmentsS2T, const Moses::AlignmentInfo &ai);
|
||||||
@ -130,7 +126,6 @@ public:
|
|||||||
void OutputBestHypo(search::Applied applied, long translationId);
|
void OutputBestHypo(search::Applied applied, long translationId);
|
||||||
void OutputBestHypo(const std::vector<const Moses::Factor*>& mbrBestHypo, long translationId);
|
void OutputBestHypo(const std::vector<const Moses::Factor*>& mbrBestHypo, long translationId);
|
||||||
void OutputBestNone(long translationId);
|
void OutputBestNone(long translationId);
|
||||||
void OutputNBestList(const Moses::ChartTrellisPathList &nBestList, long translationId);
|
|
||||||
void OutputNBestList(const std::vector<boost::shared_ptr<Moses::ChartKBestExtractor::Derivation> > &nBestList, long translationId);
|
void OutputNBestList(const std::vector<boost::shared_ptr<Moses::ChartKBestExtractor::Derivation> > &nBestList, long translationId);
|
||||||
void OutputNBestList(const std::vector<search::Applied> &nbest, long translationId);
|
void OutputNBestList(const std::vector<search::Applied> &nbest, long translationId);
|
||||||
void OutputDetailedTranslationReport(const Moses::ChartHypothesis *hypo, const Moses::Sentence &sentence, long translationId);
|
void OutputDetailedTranslationReport(const Moses::ChartHypothesis *hypo, const Moses::Sentence &sentence, long translationId);
|
||||||
|
@ -56,8 +56,6 @@ POSSIBILITY OF SUCH DAMAGE.
|
|||||||
#include "moses/ThreadPool.h"
|
#include "moses/ThreadPool.h"
|
||||||
#include "moses/ChartManager.h"
|
#include "moses/ChartManager.h"
|
||||||
#include "moses/ChartHypothesis.h"
|
#include "moses/ChartHypothesis.h"
|
||||||
#include "moses/ChartTrellisPath.h"
|
|
||||||
#include "moses/ChartTrellisPathList.h"
|
|
||||||
#include "moses/Incremental.h"
|
#include "moses/Incremental.h"
|
||||||
#include "moses/FF/StatefulFeatureFunction.h"
|
#include "moses/FF/StatefulFeatureFunction.h"
|
||||||
#include "moses/FF/StatelessFeatureFunction.h"
|
#include "moses/FF/StatelessFeatureFunction.h"
|
||||||
|
@@ -27,6 +27,8 @@
 
 #include <vector>
 
+using namespace std;
+
 namespace Moses
 {
 
@@ -52,7 +54,7 @@ void ChartKBestExtractor::Extract(
   // recombined.
   for (++p; p != topLevelHypos.end(); ++p) {
     // Check that the first item in topLevelHypos really was the best.
-    UTIL_THROW_IF2((*p)->GetTotalScore() <= bestTopLevelHypo.GetTotalScore(),
+    UTIL_THROW_IF2((*p)->GetTotalScore() > bestTopLevelHypo.GetTotalScore(),
                    "top-level hypotheses are not correctly sorted");
     // Note: there's no need for a smart pointer here: supremeHypo will take
     // ownership of altHypo.
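The small-looking change above is a real bug fix. UTIL_THROW_IF2(condition, message) throws when the condition is true, and the old test fired when a later hypothesis scored no better than the first, which is exactly the correctly sorted case. The fixed check trips only on a genuine ordering violation:

    // Throws only if some later hypothesis beats the supposed best one.
    UTIL_THROW_IF2((*p)->GetTotalScore() > bestTopLevelHypo.GetTotalScore(),
                   "top-level hypotheses are not correctly sorted");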
@@ -25,10 +25,6 @@
 #include "ChartHypothesis.h"
 #include "ChartKBestExtractor.h"
 #include "ChartTranslationOptions.h"
-#include "ChartTrellisDetourQueue.h"
-#include "ChartTrellisNode.h"
-#include "ChartTrellisPath.h"
-#include "ChartTrellisPathList.h"
 #include "StaticData.h"
 #include "DecodeStep.h"
 #include "TreeInput.h"
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Calculate the n-best paths through the output hypergraph.
|
|
||||||
* Return the list of paths with the variable ret
|
|
||||||
* \param count how may paths to return
|
|
||||||
* \param ret return argument
|
|
||||||
* \param onlyDistinct whether to check for distinct output sentence or not (default - don't check, just return top n-paths)
|
|
||||||
*/
|
|
||||||
void ChartManager::CalcNBest(size_t count, ChartTrellisPathList &ret,bool onlyDistinct) const
|
|
||||||
{
|
|
||||||
size_t size = m_source.GetSize();
|
|
||||||
if (count == 0 || size == 0)
|
|
||||||
return;
|
|
||||||
|
|
||||||
// Build a ChartTrellisPath for the 1-best path, if any.
|
|
||||||
WordsRange range(0, size-1);
|
|
||||||
const ChartCell &lastCell = m_hypoStackColl.Get(range);
|
|
||||||
const ChartHypothesis *hypo = lastCell.GetBestHypothesis();
|
|
||||||
if (hypo == NULL) {
|
|
||||||
// no hypothesis
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
boost::shared_ptr<ChartTrellisPath> basePath(new ChartTrellisPath(*hypo));
|
|
||||||
|
|
||||||
// Add it to the n-best list.
|
|
||||||
if (count == 1) {
|
|
||||||
ret.Add(basePath);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Set a limit on the number of detours to pop. If the n-best list is
|
|
||||||
// restricted to distinct translations then this limit should be bigger
|
|
||||||
// than n. The n-best factor determines how much bigger the limit should be.
|
|
||||||
const StaticData &staticData = StaticData::Instance();
|
|
||||||
const size_t nBestFactor = staticData.GetNBestFactor();
|
|
||||||
size_t popLimit;
|
|
||||||
if (!onlyDistinct) {
|
|
||||||
popLimit = count-1;
|
|
||||||
} else if (nBestFactor == 0) {
|
|
||||||
// 0 = 'unlimited.' This actually sets a large-ish limit in case too many
|
|
||||||
// translations are identical.
|
|
||||||
popLimit = count * 1000;
|
|
||||||
} else {
|
|
||||||
popLimit = count * nBestFactor;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Create an empty priority queue of detour objects. It is bounded to
|
|
||||||
// contain no more than popLimit items.
|
|
||||||
ChartTrellisDetourQueue contenders(popLimit);
|
|
||||||
|
|
||||||
// Get all complete translations
|
|
||||||
const HypoList *topHypos = lastCell.GetAllSortedHypotheses();
|
|
||||||
|
|
||||||
// Create a ChartTrellisDetour for each complete translation and add it to the queue
|
|
||||||
HypoList::const_iterator iter;
|
|
||||||
for (iter = topHypos->begin(); iter != topHypos->end(); ++iter) {
|
|
||||||
const ChartHypothesis &hypo = **iter;
|
|
||||||
boost::shared_ptr<ChartTrellisPath> basePath(new ChartTrellisPath(hypo));
|
|
||||||
ChartTrellisDetour *detour = new ChartTrellisDetour(basePath, basePath->GetFinalNode(), hypo);
|
|
||||||
contenders.Push(detour);
|
|
||||||
}
|
|
||||||
|
|
||||||
delete topHypos;
|
|
||||||
|
|
||||||
// Record the output phrase if distinct translations are required.
|
|
||||||
set<Phrase> distinctHyps;
|
|
||||||
|
|
||||||
// MAIN loop
|
|
||||||
for (size_t i = 0; ret.GetSize() < count && !contenders.Empty() && i < popLimit; ++i) {
|
|
||||||
// Get the best detour from the queue.
|
|
||||||
std::auto_ptr<const ChartTrellisDetour> detour(contenders.Pop());
|
|
||||||
UTIL_THROW_IF2(detour.get() == NULL, "Empty detour");
|
|
||||||
|
|
||||||
// Create a full base path from the chosen detour.
|
|
||||||
//basePath.reset(new ChartTrellisPath(*detour));
|
|
||||||
boost::shared_ptr<ChartTrellisPath> path(new ChartTrellisPath(*detour));
|
|
||||||
|
|
||||||
// Generate new detours from this base path and add them to the queue of
|
|
||||||
// contenders. The new detours deviate from the base path by a single
|
|
||||||
// replacement along the previous detour sub-path.
|
|
||||||
UTIL_THROW_IF2(path->GetDeviationPoint() == NULL, "Empty deviant path");
|
|
||||||
CreateDeviantPaths(path, *(path->GetDeviationPoint()), contenders);
|
|
||||||
|
|
||||||
// If the n-best list is allowed to contain duplicate translations (at the
|
|
||||||
// surface level) then add the new path unconditionally, otherwise check
|
|
||||||
// whether the translation has seen before.
|
|
||||||
if (!onlyDistinct) {
|
|
||||||
ret.Add(path);
|
|
||||||
} else {
|
|
||||||
Phrase tgtPhrase = path->GetOutputPhrase();
|
|
||||||
if (distinctHyps.insert(tgtPhrase).second) {
|
|
||||||
ret.Add(path);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Calculate the n-best paths through the output hypergraph.
|
/** Calculate the n-best paths through the output hypergraph.
|
||||||
* Return the list of paths with the variable ret
|
* Return the list of paths with the variable ret
|
||||||
* \param n how may paths to return
|
* \param n how may paths to return
|
||||||
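The deleted detour-based CalcNBest is superseded by ChartKBestExtractor, but its pop-limit heuristic is still useful to know when tuning distinct n-best lists. Restated as a standalone helper, a sketch whose constants come straight from the removed code:

    #include <cstddef>

    // How many candidate paths may be popped while collecting n translations.
    std::size_t PopLimit(std::size_t count, bool onlyDistinct, std::size_t nBestFactor) {
      if (!onlyDistinct) return count - 1;        // every pop yields a fresh path
      if (nBestFactor == 0) return count * 1000;  // "unlimited", but still capped
      return count * nBestFactor;                 // user-tunable slack for duplicates
    }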
@@ -374,34 +275,4 @@ void ChartManager::FindReachableHypotheses( const ChartHypothesis *hypo, std::ma
   }
 }
 
-void ChartManager::CreateDeviantPaths(
-  boost::shared_ptr<const ChartTrellisPath> basePath,
-  ChartTrellisDetourQueue &q)
-{
-  CreateDeviantPaths(basePath, basePath->GetFinalNode(), q);
-}
-
-void ChartManager::CreateDeviantPaths(
-  boost::shared_ptr<const ChartTrellisPath> basePath,
-  const ChartTrellisNode &substitutedNode,
-  ChartTrellisDetourQueue &queue)
-{
-  const ChartArcList *arcList = substitutedNode.GetHypothesis().GetArcList();
-  if (arcList) {
-    for (ChartArcList::const_iterator iter = arcList->begin();
-         iter != arcList->end(); ++iter) {
-      const ChartHypothesis &replacement = **iter;
-      queue.Push(new ChartTrellisDetour(basePath, substitutedNode,
-                                        replacement));
-    }
-  }
-  // recusively create deviant paths for child nodes
-  const ChartTrellisNode::NodeChildren &children = substitutedNode.GetChildren();
-  ChartTrellisNode::NodeChildren::const_iterator iter;
-  for (iter = children.begin(); iter != children.end(); ++iter) {
-    const ChartTrellisNode &child = **iter;
-    CreateDeviantPaths(basePath, child, queue);
-  }
-}
-
 } // namespace Moses
@@ -38,23 +38,12 @@ namespace Moses
 {
 
 class ChartHypothesis;
-class ChartTrellisDetourQueue;
-class ChartTrellisNode;
-class ChartTrellisPath;
-class ChartTrellisPathList;
 
 /** Holds everything you need to decode 1 sentence with the hierachical/syntax decoder
  */
 class ChartManager
 {
 private:
-  static void CreateDeviantPaths(boost::shared_ptr<const ChartTrellisPath>,
-                                 ChartTrellisDetourQueue &);
-
-  static void CreateDeviantPaths(boost::shared_ptr<const ChartTrellisPath>,
-                                 const ChartTrellisNode &,
-                                 ChartTrellisDetourQueue &);
-
   InputType const& m_source; /**< source sentence to be translated */
   ChartCellCollection m_hypoStackColl;
   std::auto_ptr<SentenceStats> m_sentenceStats;
@@ -71,7 +60,6 @@ public:
   void ProcessSentence();
   void AddXmlChartOptions();
   const ChartHypothesis *GetBestHypothesis() const;
-  void CalcNBest(size_t count, ChartTrellisPathList &ret, bool onlyDistinct=0) const;
   void CalcNBest(size_t n, std::vector<boost::shared_ptr<ChartKBestExtractor::Derivation> > &nBestList, bool onlyDistinct=false) const;
 
   void GetSearchGraph(long translationId, std::ostream &outputSearchGraphStream) const;
@@ -10,13 +10,15 @@ ChartTranslationOption::ChartTranslationOption(const TargetPhrase &targetPhrase)
 {
 }
 
-void ChartTranslationOption::Evaluate(const InputType &input, const InputPath &inputPath)
+void ChartTranslationOption::Evaluate(const InputType &input,
+                                      const InputPath &inputPath,
+                                      const StackVec &stackVec)
 {
   const std::vector<FeatureFunction*> &ffs = FeatureFunction::GetFeatureFunctions();
 
   for (size_t i = 0; i < ffs.size(); ++i) {
     const FeatureFunction &ff = *ffs[i];
-    ff.Evaluate(input, inputPath, m_targetPhrase, m_scoreBreakdown);
+    ff.Evaluate(input, inputPath, m_targetPhrase, &stackVec, m_scoreBreakdown);
   }
 }
 
@@ -7,6 +7,7 @@ namespace Moses
 class TargetPhrase;
 class InputPath;
 class InputType;
+class StackVec;
 
 class ChartTranslationOption
 {
@@ -43,7 +44,9 @@ public:
     return m_scoreBreakdown;
  }
 
-  void Evaluate(const InputType &input, const InputPath &inputPath);
+  void Evaluate(const InputType &input,
+                const InputPath &inputPath,
+                const StackVec &stackVec);
 };
 
 }
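The extra parameter threads chart context into scoring: StackVec is, as far as I can tell from moses/StackVec.h (treat this description as an assumption), Moses's vector of chart-cell labels covering the rule's non-terminals. Callers pass it down and feature functions receive its address, as the ChartTranslationOptions hunk later in this diff shows:

    // Caller side (ChartTranslationOptions::Evaluate):
    transOpt.SetInputPath(&inputPath);
    transOpt.Evaluate(input, inputPath, m_stackVec);

    // Inside ChartTranslationOption::Evaluate each feature now also sees it:
    ff.Evaluate(input, inputPath, m_targetPhrase, &stackVec, m_scoreBreakdown);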
@@ -176,6 +176,31 @@ void ChartTranslationOptionList::Evaluate(const InputType &input, const InputPat
     ChartTranslationOptions &transOpts = **iter;
     transOpts.Evaluate(input, inputPath);
   }
 
+  // get rid of empty trans opts
+  size_t numDiscard = 0;
+  for (size_t i = 0; i < m_size; ++i) {
+    ChartTranslationOptions *transOpts = m_collection[i];
+    if (transOpts->GetSize() == 0) {
+      //delete transOpts;
+      ++numDiscard;
+    }
+    else if (numDiscard) {
+      SwapTranslationOptions(i - numDiscard, i);
+      //m_collection[] = transOpts;
+    }
+  }
+
+  size_t newSize = m_size - numDiscard;
+  m_size = newSize;
+}
+
+void ChartTranslationOptionList::SwapTranslationOptions(size_t a, size_t b)
+{
+  ChartTranslationOptions *transOptsA = m_collection[a];
+  ChartTranslationOptions *transOptsB = m_collection[b];
+  m_collection[a] = transOptsB;
+  m_collection[b] = transOptsA;
 }
 
 std::ostream& operator<<(std::ostream &out, const ChartTranslationOptionList &obj)
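The discard loop above is the classic stable compaction idiom, a hand-rolled cousin of std::remove_if: count the dead entries and swap each survivor back over the gap, then shrink the logical size. A standalone analogue with ints standing in for translation options:

    #include <vector>
    #include <algorithm>  // std::swap
    #include <cstddef>

    // Drop zeros while preserving the order of the survivors, in one pass.
    void Compact(std::vector<int> &v) {
      std::size_t numDiscard = 0;
      for (std::size_t i = 0; i < v.size(); ++i) {
        if (v[i] == 0) {                      // stand-in for "empty trans opts"
          ++numDiscard;
        } else if (numDiscard) {
          std::swap(v[i - numDiscard], v[i]); // slide survivor over the gap
        }
      }
      v.resize(v.size() - numDiscard);
    }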
@@ -78,6 +78,8 @@ private:
   float m_thresholdScore;
 };
 
+  void SwapTranslationOptions(size_t a, size_t b);
+
   CollType m_collection;
   size_t m_size;
   float m_scoreThreshold;
@@ -62,9 +62,24 @@ void ChartTranslationOptions::Evaluate(const InputType &input, const InputPath &
   for (iter = m_collection.begin(); iter != m_collection.end(); ++iter) {
     ChartTranslationOption &transOpt = **iter;
     transOpt.SetInputPath(&inputPath);
-    transOpt.Evaluate(input, inputPath);
+    transOpt.Evaluate(input, inputPath, m_stackVec);
   }
 
+  // get rid of -inf trans opts
+  size_t numDiscard = 0;
+  for (size_t i = 0; i < m_collection.size(); ++i) {
+    ChartTranslationOption *transOpt = m_collection[i].get();
+
+    if (transOpt->GetScores().GetWeightedScore() == - std::numeric_limits<float>::infinity()) {
+      ++numDiscard;
+    }
+    else if (numDiscard) {
+      m_collection[i - numDiscard] = boost::shared_ptr<ChartTranslationOption>(transOpt);
+    }
+  }
+
+  size_t newSize = m_collection.size() - numDiscard;
+  m_collection.resize(newSize);
 }
 
 void ChartTranslationOptions::SetInputPath(const InputPath *inputPath)
@@ -1,42 +0,0 @@
-/***********************************************************************
- Moses - statistical machine translation system
- Copyright (C) 2006-2011 University of Edinburgh
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- ***********************************************************************/
-
-#include "ChartTrellisDetour.h"
-
-#include "ChartHypothesis.h"
-#include "ChartTrellisNode.h"
-#include "ChartTrellisPath.h"
-
-namespace Moses
-{
-
-ChartTrellisDetour::ChartTrellisDetour(
-  boost::shared_ptr<const ChartTrellisPath> basePath,
-  const ChartTrellisNode &substitutedNode,
-  const ChartHypothesis &replacementHypo)
-  : m_basePath(basePath)
-  , m_substitutedNode(substitutedNode)
-  , m_replacementHypo(replacementHypo)
-{
-  float diff = replacementHypo.GetTotalScore()
-               - substitutedNode.GetHypothesis().GetTotalScore();
-  m_totalScore = basePath->GetTotalScore() + diff;
-}
-
-} // namespace Moses
@@ -1,58 +0,0 @@
-/***********************************************************************
- Moses - statistical machine translation system
- Copyright (C) 2006-2011 University of Edinburgh
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- ***********************************************************************/
-
-#pragma once
-
-#include <boost/shared_ptr.hpp>
-
-namespace Moses
-{
-class ChartHypothesis;
-class ChartTrellisNode;
-class ChartTrellisPath;
-
-/** @todo Something to do with make deviant paths
- */
-class ChartTrellisDetour
-{
-public:
-  ChartTrellisDetour(boost::shared_ptr<const ChartTrellisPath>,
-                     const ChartTrellisNode &, const ChartHypothesis &);
-
-  const ChartTrellisPath &GetBasePath() const {
-    return *m_basePath;
-  }
-  const ChartTrellisNode &GetSubstitutedNode() const {
-    return m_substitutedNode;
-  }
-  const ChartHypothesis &GetReplacementHypo() const {
-    return m_replacementHypo;
-  }
-  float GetTotalScore() const {
-    return m_totalScore;
-  }
-
-private:
-  boost::shared_ptr<const ChartTrellisPath> m_basePath;
-  const ChartTrellisNode &m_substitutedNode;
-  const ChartHypothesis &m_replacementHypo;
-  float m_totalScore;
-};
-
-} // namespace Moses