Merge remote branch 'github/master' into miramerge

Compiles, but not tested. Had to disable relent filter. Strangely, it seems to contain the
whole of moses-cmd.

Conflicts:
	Jamroot
	OnDiskPt/TargetPhrase.cpp
	moses-cmd/src/Main.cpp
	moses/src/AlignmentInfo.cpp
	moses/src/AlignmentInfo.h
	moses/src/ChartTranslationOptionCollection.cpp
	moses/src/ChartTranslationOptionCollection.h
	moses/src/GenerationDictionary.cpp
	moses/src/Jamfile
	moses/src/Parameter.cpp
	moses/src/PhraseDictionary.cpp
	moses/src/StaticData.cpp
	moses/src/StaticData.h
	moses/src/TargetPhrase.h
	moses/src/TranslationSystem.cpp
	moses/src/TranslationSystem.h
	moses/src/Word.cpp
	phrase-extract/score.cpp
	regression-testing/Jamfile
	scripts/ems/experiment.meta
	scripts/ems/experiment.perl
	scripts/training/train-model.perl
This commit is contained in:
Barry Haddow 2012-09-26 22:49:33 +01:00
commit 0a950ee9f4
324 changed files with 59117 additions and 3283 deletions

4
.gitignore vendored
View File

@ -61,3 +61,7 @@ scripts/training/train-model.perl
dist
bin
previous.sh
contrib/other-builds/*.xcodeproj/project.xcworkspace/
contrib/other-builds/*.xcodeproj/xcuserdata/
*/*.xcodeproj/project.xcworkspace
*/*.xcodeproj/xcuserdata

3
.gitmodules vendored Normal file
View File

@ -0,0 +1,3 @@
[submodule "regression-testing/tests"]
path = regression-testing/tests
url = ../moses-regression-tests.git

23
Jamroot
View File

@ -15,13 +15,15 @@
#Note that, like language models, this is the --prefix where the library was
#installed, not some executable within the library.
#
#Compact phrase table and compact lexical reordering table
#--with-cmph=/path/to/cmph
#
#Thread-caching malloc (optional):
#--with-tcmalloc
#
#REGRESSION TESTING
#--with-regtest=/path/to/moses-reg-test-data
#
#
#INSTALLATION
#--prefix=/path/to/prefix sets the install prefix [default is source root].
#--bindir=/path/to/prefix/bin sets the bin directory [PREFIX/bin]
@ -29,6 +31,7 @@
#--includedir=/path/to/prefix/include installs headers.
# Does not install if missing. No argument defaults to PREFIX/include .
#--install-scripts=/path/to/scripts copies scripts into a directory.
# Does not install if missing. No argument defaults to PREFIX/scripts .
#--git appends the git revision to the prefix directory.
#
#
@ -41,7 +44,9 @@
# variant=release|debug|profile builds optimized (default), for debug, or for
# profiling
#
# link=static|shared controls linking (default static)
# link=static|shared controls preferred linking (default static)
# --static forces static linking (the default will fall
# back to shared)
#
# debug-symbols=on|off include (default) or exclude debugging
# information also known as -g
@ -50,6 +55,9 @@
# --enable-boost-pool uses Boost pools for the memory SCFG table
#
# --enable-mpi switch on mpi
# --without-libsegfault does not link with libSegFault
#
# --max-kenlm-order maximum ngram order that kenlm can process (default 6)
#
#CONTROLLING THE BUILD
#-a to build from scratch
@ -84,6 +92,10 @@ if [ option.get "enable-mpi" : : "yes" ] {
requirements += [ option.get "notrace" : <define>TRACE_ENABLE=1 ] ;
requirements += [ option.get "enable-boost-pool" : : <define>USE_BOOST_POOL ] ;
if [ option.get "with-cmph" ] {
requirements += <define>HAVE_CMPH ;
}
project : default-build
<threading>multi
<warnings>on
@ -99,12 +111,13 @@ project : requirements
$(requirements)
;
build-projects util lm mert moses-cmd/src moses-chart-cmd/src mira scripts regression-testing ;
#Add directories here if you want their incidental targets too (i.e. tests).
build-projects util lm mert moses-cmd/src moses-chart-cmd/src mira scripts regression-testing ;
alias programs : lm//query lm//build_binary moses-chart-cmd/src//moses_chart moses-cmd/src//programs OnDiskPt//CreateOnDiskPt OnDiskPt//queryOnDiskPt mert//programs contrib/server//mosesserver misc//programs mira//programs symal phrase-extract phrase-extract//lexical-reordering phrase-extract//extract-ghkm phrase-extract//pcfg-extract phrase-extract//pcfg-score biconcor ;
alias programs : lm//query lm//build_binary lm//kenlm_max_order moses-chart-cmd/src//moses_chart moses-cmd/src//programs OnDiskPt//CreateOnDiskPt OnDiskPt//queryOnDiskPt mert//programs contrib/server//mosesserver misc//programs mira//programs symal phrase-extract phrase-extract//lexical-reordering phrase-extract//extract-ghkm phrase-extract//pcfg-extract phrase-extract//pcfg-score biconcor ;
install-bin-libs programs ;
install-headers headers-base : [ glob-tree *.h *.hh : jam-files dist bin lib include kenlm moses ] : . ;
install-headers headers-base : [ path.glob-tree biconcor contrib lm mert misc moses-chart-cmd moses-cmd OnDiskPt phrase-extract symal util : *.hh *.h ] : . ;
install-headers headers-moses : moses/src//headers-to-install : moses/src ;
alias install : prefix-bin prefix-lib headers-base headers-moses ;

View File

@ -240,9 +240,7 @@ Moses::TargetPhrase *TargetPhrase::ConvertToMoses(const std::vector<Moses::Facto
--phraseSize;
for (size_t pos = 0; pos < phraseSize; ++pos) {
Moses::Word *mosesWord = GetWord(pos).ConvertToMoses(Moses::Output, outputFactors, vocab);
ret->AddWord(*mosesWord);
delete mosesWord;
GetWord(pos).ConvertToMoses(outputFactors, vocab, ret->AddWord());
}
// scores
@ -261,16 +259,12 @@ Moses::TargetPhrase *TargetPhrase::ConvertToMoses(const std::vector<Moses::Facto
}
ret->SetAlignmentInfo(alignmentInfo, indicator);
Moses::Word *lhs = GetWord(GetSize() - 1).ConvertToMoses(Moses::Output, outputFactors, vocab);
ret->SetTargetLHS(*lhs);
delete lhs;
GetWord(GetSize() - 1).ConvertToMoses(outputFactors, vocab, ret->MutableTargetLHS());
// set source phrase
Moses::Phrase mosesSP(Moses::Input);
for (size_t pos = 0; pos < sp->GetSize(); ++pos) {
Moses::Word *mosesWord = sp->GetWord(pos).ConvertToMoses(Moses::Input, inputFactors, vocab);
mosesSP.AddWord(*mosesWord);
delete mosesWord;
sp->GetWord(pos).ConvertToMoses(inputFactors, vocab, mosesSP.AddWord());
}
ret->SetSourcePhrase(mosesSP);

View File

@ -23,6 +23,9 @@
#include "../moses/src/Word.h"
#include "Word.h"
#include "util/tokenize_piece.hh"
#include "util/exception.hh"
using namespace std;
namespace OnDiskPt
@ -94,23 +97,21 @@ size_t Word::ReadFromFile(std::fstream &file)
return memUsed;
}
Moses::Word *Word::ConvertToMoses(Moses::FactorDirection direction
, const std::vector<Moses::FactorType> &outputFactorsVec
, const Vocab &vocab) const
{
void Word::ConvertToMoses(
const std::vector<Moses::FactorType> &outputFactorsVec,
const Vocab &vocab,
Moses::Word &overwrite) const {
Moses::FactorCollection &factorColl = Moses::FactorCollection::Instance();
Moses::Word *ret = new Moses::Word(m_isNonTerminal);
overwrite = Moses::Word(m_isNonTerminal);
const string &str = vocab.GetString(m_vocabId);
vector<string> toks = Moses::Tokenize(str, "|");
for (size_t ind = 0; ind < toks.size(); ++ind) {
Moses::FactorType factorType = outputFactorsVec[ind];
const Moses::Factor *factor = factorColl.AddFactor(direction, factorType, toks[ind]);
ret->SetFactor(factorType, factor);
// TODO: this conversion should have been done at load time.
util::TokenIter<util::SingleCharacter> tok(vocab.GetString(m_vocabId), '|');
for (std::vector<Moses::FactorType>::const_iterator t = outputFactorsVec.begin(); t != outputFactorsVec.end(); ++t, ++tok) {
UTIL_THROW_IF(!tok, util::Exception, "Too few factors in \"" << vocab.GetString(m_vocabId) << "\"; was expecting " << outputFactorsVec.size());
overwrite.SetFactor(*t, factorColl.AddFactor(*tok));
}
return ret;
UTIL_THROW_IF(tok, util::Exception, "Too many factors in \"" << vocab.GetString(m_vocabId) << "\"; was expecting " << outputFactorsVec.size());
}
int Word::Compare(const Word &compare) const

View File

@ -71,9 +71,10 @@ public:
m_vocabId = vocabId;
}
Moses::Word *ConvertToMoses(Moses::FactorDirection direction
, const std::vector<Moses::FactorType> &outputFactorsVec
, const Vocab &vocab) const;
void ConvertToMoses(
const std::vector<Moses::FactorType> &outputFactorsVec,
const Vocab &vocab,
Moses::Word &overwrite) const;
virtual void DebugPrint(std::ostream &out, const Vocab &vocab) const;

4
bjam
View File

@ -4,8 +4,8 @@ if
bjam="$(which bjam 2>/dev/null)" && #exists
[ ${#bjam} != 0 ] && #paranoia about which printing nothing then returning true
! grep UFIHGUFIHBDJKNCFZXAEVA "${bjam}" </dev/null >/dev/null && #bjam in path isn't this script
"${bjam}" --help >/dev/null 2>/dev/null && #bjam in path isn't broken (i.e. has boost-build)
"${bjam}" --version |grep "Boost.Build 201" >/dev/null 2>/dev/null #It's recent enough.
"${bjam}" --sanity-test 2>/dev/null |grep Sane >/dev/null && #The test in jam-files/sanity.jam passes
(cd jam-files/fail && ! "${bjam}") >/dev/null #Returns non-zero on failure
then
#Delegate to system bjam
exec "${bjam}" "$@"

View File

@ -0,0 +1,139 @@
`combine-ptables.pl`: fill-up and other techniques of translation models combination.
Author:
Arianna Bisazza bisazza[AT]fbk.eu
ABOUT
-----
This tool implements "fill-up" and other operations that are useful to combine translation and reordering tables.
In the "fill-up" approach, the weights of out-domain data sources are estimated directly by MERT along with the
other model weights.
This tool also supports linear interpolation, but weights must be provided by the user.
If you want to automatically estimate linear interpolation weights, use `contrib/tmcombine` instead.
REFERENCE
---------
When using this script, please cite:
Arianna Bisazza, Nick Ruiz, and Marcello Federico. 2011.
"Fill-up versus Interpolation Methods for Phrase-based SMT Adaptation."
In International Workshop on Spoken Language Translation (IWSLT), San Francisco, CA.
FILL-UP
-------
This combination technique is useful when the relevance of the models is known a priori,
e.g. when one is trained on in-domain data and the others on out-of-domain data.
This mode preserves all the entries and scores coming from the first model, and adds
entries from the other models only if new.
If more than two tables are provided, each entry is taken only from the first table
that contains it.
Moreover, a binary feature is added for each additional table to denote the provenance
of an entry. For in-domain entries, the binary features are all set to 1 (=exp(0)).
Entries coming from the 2nd table will have the 1st binary feature set to 2.718 (=exp(1)).
This technique was proposed in the following works:
Preslav Nakov. 2008.
"Improving English-Spanish Statistical Machine Translation: Experiments in Domain
Adaptation, Sentence Paraphrasing, Tokenization, and Recasing."
In Workshop on Statistical Machine Translation.
Arianna Bisazza, Nick Ruiz, and Marcello Federico. 2011.
"Fill-up versus Interpolation Methods for Phrase-based SMT Adaptation."
In International Workshop on Spoken Language Translation (IWSLT), San Francisco, CA.
The latter paper contains details about the present implementation as well as an empirical
evaluation of fill-up against other combination techniques.
Reordering model fill-up, cascaded fill-up and pruning criteria are also discussed in the
same paper.
Among the findings of this paper, pruning new (out-of-domain) phrases with more than 4
source words appeared to be beneficial on the Arabic-English TED task when combining the
in-domain models with MultiUn models.
This corresponds to the option:
`--newSourceMaxLength=4`
LINEAR INTERPOLATION
--------------------
This combination technique consists in linearly combining the feature values coming
from all tables. The combination weights should be provided by the user, otherwise
uniform weights are assumed.
When a phrase pair is absent from a table, a constant value (epsilon) is assumed for
the corresponding feature values. You may want to set your own epsilon.
See [Bisazza et al. 2011] for an empirical comparison of uniformly weighted linear
interpolation against fill-up and decoding-time log-linear interpolation. In that paper,
epsilon was always set to 1e-06.
UNION
-----
This combination technique creates the union of all phrase pairs and assigns to each
of them the concatenation of all tables scores.
INTERSECTION
------------
This combination technique creates the intersection of all phrase pairs: each phrase
pair that occurs in all phrase tables is output along with the feature vector taken
from the *first* table.
The intersection can be used to prune the reordering table in order to match the
entries of a corresponding pruned phrase table.
USAGE
-----
Get statistics about overlap of entries:
`combine-ptables.pl --mode=stats ptable1 ptable2 ... ptableN > ptables-overlap-stats`
Interpolate phrase tables...
- with uniform weights:
`combine-ptables.pl --mode=interp --phpenalty-at=4 ptable1 ptable2 ptable3 > interp-ptable.X`
- with custom weights:
`combine-ptables.pl --mode=interp --phpenalty-at=4 --weights=0.8,0.1,0.1 ptable1 ptable2 ptable3 > interp-ptable.Y`
- with custom epsilon:
`combine-ptables.pl --mode=interp --phpenalty-at=4 --epsilon=1e-05 ptable1 ptable2 ptable3 > interp-ptable.Z`
Fillup phrase tables...
- unpruned:
`combine-ptables.pl --mode=fillup ptable1 ptable2 ... ptableN > fillup-ptable`
- pruned (new phrases only with max. 4 source words):
`combine-ptables.pl --mode=fillup --newSourceMaxLength=4 ptable1 ptable2 ... ptableN > fillup-ptable`
Given a pruned phrase table, prune the corresponding reordering table:
`combine-ptables.pl --mode=intersect1 reotable1-unpruned ptable1-pruned > reotable1-pruned`
NOTES
-----
The script works only with textual (non-binarized) phrase or reordering tables
that were *previously sorted* with `LC_ALL=C sort`
The resulting combined tables are also textual and need to be binarized in the usual way.
The script combine-ptables.pl can be used on lexicalized reordering tables as well.
Input tables can be gzipped.
When integrating filled up models into a Moses system, remember to:
- specify the correct number of features (typically 6) under [ttable-file] in the configuration file `moses.ini`
- add a weight under [weight-t] in `moses.ini`
- if you binarize the models, provide the correct number of features to the command:
`$moses/bin/processPhraseTable -ttable 0 0 - -nscores $nbFeatures`

View File

@ -0,0 +1,425 @@
#! /usr/bin/perl
#******************************************************************************
# Arianna Bisazza @ FBK-irst. March 2012
#******************************************************************************
# combine-ptables.pl : Combine Moses-style phrase tables, using different approaches
use strict;
use open ':utf8';
binmode STDIN, ':utf8';
binmode STDOUT, ':utf8';
use Getopt::Long "GetOptions";
sub main {
my $usage = "
USAGE
-----
combine-ptables.pl --mode=(interp|union|fillup|intersect1|stats) ptable1 ptable2 ... ptableN > combined-ptable
combine-ptables.pl --mode=intersect1 reotable-unpruned ptable-pruned > reotable-pruned
-----
#
# This scripts reads two or more *sorted* phrase tables and combines them in different modes.
#
# (Note: if present, word alignments are ignored).
#
# ----------------
# OPTIONS
# ----------------
#
# Required:
# --mode fillup: Each entry is taken only from the first table that contains it.
# A binary feature is added from each table except the first.
# interp: Linear interpolation.
# union: Union of entries, feature vectors are concatenated.
# intersect1: Intersection of entries, feature vectors taken from the first table.
# stats: Only compute some statistics about tables overlap. No table is produced.
#
# NOTE: if present, additional fields such as word alignment, phrase counts etc. are always
# taken from the first table.
#
# Generic options:
# --phpenalty=FLOAT Constant value for phrase penalty. Default is exp(1)=2.718
# --phpenalty-at=N The (N+1)th score of each table is considered as phrase penalty with a constant value.
# In 'interp' mode, the corresponding feature is not interpolated but simply set to the constant.
# In 'union' mode, the ph.penalty (constant) is output only once, after all the other scores.
# By default, no score is considered as phrase penalty.
#
#
# Options for 'fillup':
# --newSourceMaxLength=INT Don't include \"new\" source phrases if longer than INT words.
#
# Options for 'interp':
# --weights=W1,W2,...WN Weights for interpolation. By default, uniform weights are applied.
# --epsilon=X Score to assume when a phrase pair is not contained in a table (in 'interp' and 'union' modes).
# Default epsilon is 1e-06.
#
# Options for 'union':
#
#
";
my $combination_mode = '';
my $debug = '';
my $weights_str = '';
my $epsilon = 0.000001;
my $phPenalty = 2.718; # exp(1)
my $phPenalty_idx = -1;
my $delim= " ||| ";
my $delim_RE= ' \|\|\| ';
my $exp_one = 2.718;
my $exp_zero = 1;
my $newSourceMaxLength = -1;
my $help = '';
GetOptions ('debug' => \$debug,
'mode=s' => \$combination_mode,
'weights=s' => \$weights_str,
'epsilon=f' => \$epsilon,
'phpenalty=f' => \$phPenalty,
'phpenalty-at=i' => \$phPenalty_idx,
'newSourceMaxLength=i' => \$newSourceMaxLength,
'help' => \$help);
if($help) { die "$usage\n\n"; }
if($combination_mode!~/(interp|union|fillup|intersect1|stats)/) {die "$usage\nUnknown combination mode!\n"};
if(@ARGV < 2) {die "$usage\n\n Please provide at least 2 tables to combine \n\n";}
print STDERR "
WARNING: Your phrase tables must be sorted (with LC_ALL=C) !!
******************************
Combination mode is [$combination_mode]
******************************
";
my @tables = @ARGV;
my $nbtables = scalar(@tables);
###########################################
# The newSourceMaxLength option requires reading all the first PT before starting the combination
my %sourcePhrasesPT1;
if($combination_mode eq "fillup" && $newSourceMaxLength>-1) {
my $table1=$tables[0];
$table1 =~ s/(.*\.gz)\s*$/gzip -dc < $1|/;
open(TABLE1, "$table1") or die "Cannot open $table1: ($!)\n";
while(my $line=<TABLE1>) {
$line=~m/^(.*?)$delim_RE/;
$sourcePhrasesPT1{$1}++;
}
close(TABLE1);
}
my @table_files=();
foreach my $table (@tables) {
$table =~ s/(.*\.gz)\s*$/gzip -dc < $1|/;
#localize the file glob, so FILE is unique to the inner loop.
local *FILE;
open(FILE, "$table") or die "Cannot open $table: ($!)\n";
push(@table_files, *FILE);
}
# Read first line from all tables to find number of weights (and sanity checks)
my @read_ppairs=();
my $nbscores = &read_line_from_tables(\@table_files, \@read_ppairs);
print STDERR "Each phrase table contains $nbscores features.\n";
###########################################
if($phPenalty_idx!=-1) {
if($phPenalty_idx<0 || $phPenalty_idx>=$nbscores) {
die "Invalid value for option phpenalty-at! Should be in the range [0,($nbscores-1)]\n\n";
}
else { print STDERR "Phrase penalty at index $phPenalty_idx\n"; }
}
#if($weights_str ne "") { die "Weights option NOT supported yet. Can only use uniform (1/nbscores)\n\n"; }
#my $unifw = 1/$nbtables;
my @weights=(); # Array of arrays each containing the feature weights for a phrase table
if($combination_mode eq "interp") {
my @table_level_weights=();
if($weights_str eq "") {
@table_level_weights= ((1/$nbtables) x $nbtables); # assuming uniform weights
}
else {
@table_level_weights= split(/,/, $weights_str);
if(scalar(@table_level_weights) != $nbtables) {
die "$usage\n Invalid string for option --weights! Must be a comma-separated list of floats, one per ph.table.\n";
}
}
for(my $i=0; $i<$nbtables; $i++) {
my @weights_pt = (($table_level_weights[$i]) x $nbscores);
if($phPenalty_idx!=-1) {
$weights_pt[$phPenalty_idx]=0;
}
print STDERR "WEIGHTS-PT_$i: ", join(" -- ", @weights_pt), "\n";
$weights[$i] = \@weights_pt;
}
print STDERR "EPSILON: $epsilon \n";
}
###########################################
my @empty_ppair=("");
my @epsilons = (($epsilon) x $nbscores);
if($phPenalty_idx>-1) {
pop @epsilons;
}
my $nbPpairs_inAll=0;
my @nbPairs_found_only_in=((0) x $nbtables);
my $MINSCORE=1;
print STDERR "Working...\n\n";
while(1) {
my $min_ppair="";
my $reached_end_of_tables=1;
my @tablesContainingPpair=((0) x $nbtables);
for(my $i=0; $i<$nbtables; $i++) {
my $ppair=$read_ppairs[$i]->[0];
if($ppair ne "") {
$reached_end_of_tables=0;
if($min_ppair eq "" || $ppair lt $min_ppair) {
$min_ppair=$ppair;
@tablesContainingPpair=((0) x $nbtables);
$tablesContainingPpair[$i]=1;
}
elsif($ppair eq $min_ppair) {
$tablesContainingPpair[$i]=1;
}
}
}
last if($reached_end_of_tables);
## Actual combination is performed here:
&combine_ppair(\@read_ppairs, \@tablesContainingPpair);
&read_line_from_tables(\@table_files, \@read_ppairs, \@tablesContainingPpair);
}
print STDERR "...done!\n";
print STDERR "The minimum score in all tables is $MINSCORE\n";
if($combination_mode eq "stats") {
my $tot_ppairs=0;
print "
# entries
found in all tables: $nbPpairs_inAll\n";
for(my $i=0; $i<$nbtables; $i++) {
print "found only in PT_$i: $nbPairs_found_only_in[$i]\n";
}
}
####################################
# Emit (or, in 'stats' mode, tally) the combined entry for the single
# phrase pair currently at the head of the per-table read buffers.
#   Arg 1: ref to array of per-table entries [key, \@scores, additional-info].
#   Arg 2: ref to array of 0/1 flags saying which tables contain this pair.
# Relies on the enclosing main()'s lexicals ($combination_mode, $nbtables,
# @weights, $epsilon, ...) being shared via closure.
# NOTE(review): the word list in the parentheses is not a valid Perl
# prototype; it only works because all calls go through &combine_ppair(...),
# which bypasses prototype checking.
sub combine_ppair(PPAIRS_REFARRAY, TABLE_INDICES_REFARRAY) {
my $ra_ppairs=shift; # 1st item: phrase-pair key (string);
# 2nd item: ref.array of scores;
# 3rd item: additional info (string, may be empty)
my $ra_toRead=shift; # Important: this says which phrase tables contain the ph.pair currently processed
my $ppair="";
my @scores=();
my $additional_info="";
my $to_print=1;
if($debug) {
print STDERR "combine_ppair:\n";
for(my $i=0; $i<$nbtables; $i++) {
if($ra_toRead->[$i]) {
print STDERR "ppair_$i= ", join (" // ", @{$ra_ppairs->[$i]}), "\n";
}
}
}
# 'stats' mode: only update the overlap counters; nothing is printed.
if($combination_mode eq "stats") {
$to_print=0;
my $found_in=-1;
my $nb_found=0;
for(my $i=0; $i<$nbtables; $i++) {
if($ra_toRead->[$i]) {
$found_in=$i;
$nb_found++;
}
}
if($nb_found==1) { $nbPairs_found_only_in[$found_in]++; }
elsif($nb_found==$nbtables) { $nbPpairs_inAll++; }
}
### Fill-up + additional binary feature
# Take the entry from the FIRST table that contains it (note the 'last'),
# and append one provenance feature per non-primary table.
elsif($combination_mode eq "fillup") {
my @bin_feats=(($exp_zero) x ($nbtables-1));
for(my $i=0; $i<$nbtables; $i++) {
if($ra_toRead->[$i]) {
$ppair= shift(@{$ra_ppairs->[$i]});
# pruning criteria are applied here:
# drop "new" (non-PT1) source phrases longer than newSourceMaxLength
if($i>0 && $newSourceMaxLength>-1) {
$ppair=~m/^(.*?)$delim_RE/;
if(scalar(split(/ +/, $1)) > $newSourceMaxLength &&
!defined($sourcePhrasesPT1{$1}))
{ $to_print=0; }
}
# @scores= @{$ra_ppairs->[$i]};
@scores = @{shift(@{$ra_ppairs->[$i]})};
# binary feature for ph.pair provenance fires here
if($i>0) { $bin_feats[$i-1]=$exp_one; }
$additional_info=shift(@{$ra_ppairs->[$i]});
last;
}
}
push(@scores, @bin_feats);
}
### Linear interpolation
# Weighted sum of each feature across tables; absent tables contribute
# $epsilon. The phrase penalty column (if declared) is kept constant.
elsif($combination_mode eq "interp") {
my $firstPpair=-1;
@scores=((0) x $nbscores);
for(my $i=0; $i<$nbtables; $i++) {
if($ra_toRead->[$i]) {
if($firstPpair==-1) { $firstPpair=$i; }
$ppair= shift(@{$ra_ppairs->[$i]});
my @scoresPT = @{shift(@{$ra_ppairs->[$i]})};
for(my $j=0; $j<$nbscores; $j++) {
# $scores[$j]+= $weights[$i]->[$j]* $ra_ppairs->[$i][$j];
$scores[$j]+= $weights[$i]->[$j]* $scoresPT[$j];
}
}
else {
for(my $j=0; $j<$nbscores; $j++) {
$scores[$j]+= $weights[$i]->[$j]* $epsilon;
}
}
if($phPenalty_idx!=-1) {
$scores[$phPenalty_idx]= $phPenalty;
}
}
if($debug) { print STDERR "..taking info from ptable_$firstPpair\n"; }
$additional_info= shift(@{$ra_ppairs->[$firstPpair]});
}
### Union + feature concatenation
# Concatenate every table's feature vector; absent tables contribute
# a vector of epsilons. A declared phrase penalty is emitted once, last.
elsif($combination_mode eq "union") {
my $firstPpair=-1;
for(my $i=0; $i<$nbtables; $i++) {
if($ra_toRead->[$i]) {
if($firstPpair==-1) { $firstPpair=$i; }
$ppair= shift(@{$ra_ppairs->[$i]});
my @scoresPT= @{shift(@{$ra_ppairs->[$i]})};
if($phPenalty_idx!=-1) {
# splice(@{$ra_ppairs->[$i]}, $phPenalty_idx, 1);
splice(@scoresPT, $phPenalty_idx, 1);
}
# push(@scores, @{$ra_ppairs->[$i]});
push(@scores, @scoresPT);
}
else {
push(@scores, @epsilons);
}
}
if($phPenalty_idx!=-1) {
push(@scores, $phPenalty);
}
if($debug) { print STDERR "..taking info from ptable_$firstPpair\n"; }
$additional_info= shift(@{$ra_ppairs->[$firstPpair]});
}
### Intersect + features from first table
# Print the entry only when ALL tables contain it; scores come from table 0.
elsif($combination_mode eq "intersect1") {
$to_print=0;
my $found_in_all=1;
for(my $i=0; $i<$nbtables; $i++) {
if(!$ra_toRead->[$i]) {
$found_in_all=0;
last;
}
}
if($found_in_all) {
$to_print=1;
$ppair= shift(@{$ra_ppairs->[0]});
# @scores= @{$ra_ppairs->[0]};
@scores= @{shift(@{$ra_ppairs->[0]})};
$additional_info= shift(@{$ra_ppairs->[0]});
}
}
else {
die "$usage\nUnknown combination mode!\n";
}
# $ppair already ends with the field delimiter, so scores follow directly.
if($to_print) {
if($additional_info eq "") {
print $ppair, join(" ", @scores), "\n";
}else {
print $ppair, join(" ", @scores), $delim, $additional_info, "\n";
}
}
}
####################################
# Read lines from all filehandles given in FILES_REFARRAY,
# or from the files whose indices are assigned 1 in the array TABLE_INDICES_REFARRAY
# Parse each of them as a phrase pair entry and stores it to the corresponding position of PPAIRS_REFARRAY
# Each stored entry is [key, \@scores, additional-info]; a table at EOF gets [""].
# Returns the number of scores seen on the lines just read (-1 if none were read).
# NOTE(review): the parenthesized word list is not a valid Perl prototype;
# calls go through &read_line_from_tables(...), which bypasses it.
sub read_line_from_tables(FILES_REFARRAY, PPAIRS_REFARRAY, TABLE_INDICES_REFARRAY) {
my $ra_files=shift;
my $ra_ppairs=shift;
my $ra_toRead=shift;
my @toRead=((1) x $nbtables); # by default read from all files
if($ra_toRead ne "") {
@toRead=@$ra_toRead;
}
my $nbscores=-1;
# NOTE(review): this outer $key is shadowed by the inner `my $key` below
# and is never used.
my $key=""; my $additional_info="";
for(my $i=0; $i<$nbtables; $i++) {
next if($toRead[$i]==0);
my @ppair=();
my $file=$ra_files->[$i];
if(my $line = <$file>) {
chomp $line;
# Fields: source ||| target ||| scores [||| extra fields...]
my @fields = split(/$delim_RE/, $line);
if(scalar(@fields)<3) {
die "Invalid phrase table entry:\n$line\n";
}
my @scores = split(/\s+/, $fields[2]);
# Track the global minimum score (reported at the end of main).
foreach my $score (@scores) {
if($score<$MINSCORE) { $MINSCORE=$score; }
}
# Get nb of scores from the 1st table. Check that all tables provide the same nb of scores,
# unless mode is 'intersect' (then it doesn't matter as scores are taken only from 1st table)
if($nbscores==-1) {
$nbscores=scalar(@scores);
} elsif($nbscores!=scalar(@scores) && $combination_mode ne "intersect1") {
die "Wrong number of scores in table-$i! Should be $nbscores\n";
}
# Get additional fields if any (word alignment, phrase counts etc.)
if(scalar(@fields)>3) {
$additional_info=join($delim, splice(@fields,3));
#print STDOUT "additional_info:__{$additional_info}__\n";
}
my $key = "$fields[0]$delim$fields[1]$delim"; ## IMPORTANT: the | delimiter at the end of the phrase pair is crucial to preserve sorting!!
push(@ppair, $key, \@scores, $additional_info);
}
else {
push(@ppair, "");
}
$ra_ppairs->[$i]=\@ppair;
}
return $nbscores;
}
#########
}
&main;

View File

@ -0,0 +1,16 @@
# Builds the fuzzy-match utilities: suffix-test, fuzzy-match, fuzzy-match2.
# NOTE(review): recipe lines below appear without leading tabs in this
# rendering; GNU make requires tab-indented recipe lines.
all: suffix-test fuzzy-match fuzzy-match2
clean:
rm -f *.o
# Old-style suffix rule: compile any .cpp to .o.
# NOTE(review): g++ clamps -O6 to -O3; kept as written.
.cpp.o:
g++ -O6 -g -c $<
suffix-test: Vocabulary.o SuffixArray.o suffix-test.o
g++ Vocabulary.o SuffixArray.o suffix-test.o -o suffix-test
# NOTE(review): the prerequisite names old/fuzzy-match.o but the link line
# uses fuzzy-match.o -- one of the two is stale; confirm where
# fuzzy-match.cpp actually lives before relying on this target.
fuzzy-match: Vocabulary.o SuffixArray.o old/fuzzy-match.o
g++ Vocabulary.o SuffixArray.o fuzzy-match.o -o fuzzy-match
fuzzy-match2: Vocabulary.o SuffixArray.o fuzzy-match2.o Util.o
g++ Vocabulary.o SuffixArray.o fuzzy-match2.o Util.o -o fuzzy-match2

View File

@ -0,0 +1,29 @@
//
// Match.h
// fuzzy-match
//
// Created by Hieu Hoang on 25/07/2012.
// Copyright 2012 __MyCompanyName__. All rights reserved.
//
#ifndef fuzzy_match_Match_h
#define fuzzy_match_Match_h
/* data structure for n-gram match between input and corpus */
// Records one n-gram match between the input sentence and a
// translation-memory (corpus) sentence, together with the cost bounds
// the fuzzy-match search associates with it.
// NOTE(review): field meanings are inferred from the names -- confirm
// against the fuzzy-match search code that fills these in.
class Match {
public:
  int input_start;   // first input position covered by the match
  int input_end;     // last input position covered by the match
  int tm_start;      // first matched position in the TM sentence
  int tm_end;        // last matched position in the TM sentence
  int min_cost;      // lower cost bound attached to this match
  int max_cost;      // upper cost bound attached to this match
  int internal_cost; // cost accumulated inside the matched span

  // Plain data holder: the constructor simply copies each argument into
  // the corresponding field.
  Match( int is, int ie, int ts, int te, int min, int max, int i )
  {
    input_start   = is;
    input_end     = ie;
    tm_start      = ts;
    tm_end        = te;
    min_cost      = min;
    max_cost      = max;
    internal_cost = i;
  }
};
#endif

View File

@ -0,0 +1,48 @@
//
// SentenceAlignment.h
// fuzzy-match
//
// Created by Hieu Hoang on 25/07/2012.
// Copyright 2012 __MyCompanyName__. All rights reserved.
//
#ifndef fuzzy_match_SentenceAlignment_h
#define fuzzy_match_SentenceAlignment_h
#include <sstream>
#include "Vocabulary.h"
extern Vocabulary vocabulary;
// Target side of one aligned sentence pair from the translation memory:
// the target words (as vocabulary IDs) plus the word alignment.
// NOTE(review): relies on `using namespace std` and the WORD/WORD_ID
// typedefs pulled in via Vocabulary.h, and on the global `vocabulary`.
struct SentenceAlignment
{
int count; // NOTE(review): presumably an occurrence count -- confirm against the code that fills this struct
vector< WORD_ID > target; // target sentence as vocabulary IDs
vector< pair<int,int> > alignment; // alignment point pairs -- TODO confirm which side is first
SentenceAlignment()
{}
// Render the target side as surface words (looked up in the global
// `vocabulary`), space-separated; note the trailing space.
string getTargetString() const
{
stringstream strme;
for (size_t i = 0; i < target.size(); ++i) {
const WORD &word = vocabulary.GetWord(target[i]);
strme << word << " ";
}
return strme.str();
}
// Render the alignment as "a-b" pairs, space-separated; note the
// trailing space.
string getAlignmentString() const
{
stringstream strme;
for (size_t i = 0; i < alignment.size(); ++i) {
const pair<int,int> &alignPair = alignment[i];
strme << alignPair.first << "-" << alignPair.second << " ";
}
return strme.str();
}
};
#endif

View File

@ -0,0 +1,244 @@
#include "SuffixArray.h"
#include <string>
#include <stdlib.h>
#include <cstring>
using namespace std;
// Build a suffix array over the tokenized corpus in fileName.
// Two passes: the first counts words (plus one end-of-sentence marker per
// sentence) to size the arrays; the second fills them in; finally the
// index is sorted so m_index lists corpus positions in suffix order.
SuffixArray::SuffixArray( string fileName )
{
// Reserve vocabulary IDs for unknown words and the sentence boundary.
// NOTE(review): the odd "<uNk>" capitalization looks deliberate (to avoid
// colliding with a real token) -- confirm against Vocabulary usage.
m_vcb.StoreIfNew( "<uNk>" );
m_endOfSentence = m_vcb.StoreIfNew( "<s>" );
ifstream extractFile;
char line[LINE_MAX_LENGTH];
// count the number of words first;
extractFile.open(fileName.c_str());
istream *fileP = &extractFile;
m_size = 0;
size_t sentenceCount = 0;
while(!fileP->eof()) {
// SAFE_GETLINE is a project macro (presumably from Vocabulary.h) that
// reads one line into the fixed buffer -- lines longer than
// LINE_MAX_LENGTH cannot be handled; TODO confirm its behavior.
SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
if (fileP->eof()) break;
vector< WORD_ID > words = m_vcb.Tokenize( line );
m_size += words.size() + 1; // +1 for the end-of-sentence marker
sentenceCount++;
}
extractFile.close();
cerr << m_size << " words (incl. sentence boundaries)" << endl;
// allocate memory
// All five buffers are calloc'd here; the destructor must free them.
m_array = (WORD_ID*) calloc( sizeof( WORD_ID ), m_size );
m_index = (INDEX*) calloc( sizeof( INDEX ), m_size );
m_wordInSentence = (char*) calloc( sizeof( char ), m_size );
m_sentence = (size_t*) calloc( sizeof( size_t ), m_size );
// NOTE(review): sentence lengths are stored in a char -- sentences longer
// than the char range will silently overflow.
m_sentenceLength = (char*) calloc( sizeof( char ), sentenceCount );
// fill the array
int wordIndex = 0;
int sentenceId = 0;
// NOTE(review): the stream is reopened after hitting EOF without
// clearing its state; pre-C++11 this can leave eofbit set and skip the
// second pass -- confirm the build mode this is compiled under.
extractFile.open(fileName.c_str());
fileP = &extractFile;
while(!fileP->eof()) {
SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
if (fileP->eof()) break;
vector< WORD_ID > words = m_vcb.Tokenize( line );
vector< WORD_ID >::const_iterator i;
for( i=words.begin(); i!=words.end(); i++)
{
m_index[ wordIndex ] = wordIndex;
m_sentence[ wordIndex ] = sentenceId;
m_wordInSentence[ wordIndex ] = i-words.begin();
m_array[ wordIndex++ ] = *i;
}
// One end-of-sentence marker terminates every sentence.
m_index[ wordIndex ] = wordIndex;
m_array[ wordIndex++ ] = m_endOfSentence;
m_sentenceLength[ sentenceId++ ] = words.size();
}
extractFile.close();
cerr << "done reading " << wordIndex << " words, " << sentenceId << " sentences." << endl;
// List(0,9);
// sort
// m_buffer is scratch space for the merge in Sort(); freed right after.
m_buffer = (INDEX*) calloc( sizeof( INDEX ), m_size );
Sort( 0, m_size-1 );
free( m_buffer );
cerr << "done sorting" << endl;
}
// Top-down merge sort of m_index[start..end] by suffix order (the original
// comment said "quick sort", but this recursively sorts two halves and
// merges them through m_buffer -- it is a merge sort).
// Precondition: m_buffer has room for end-start+1 entries.
// NOTE(review): assumes start <= end; callers pass Sort(0, m_size-1), so an
// empty corpus (m_size == 0) would wrap around with unsigned INDEX.
void SuffixArray::Sort(INDEX start, INDEX end) {
if (start == end) return;
INDEX mid = (start+end+1)/2;
Sort( start, mid-1 );
Sort( mid, end );
// merge the two sorted halves into m_buffer
int i = start;
int j = mid;
int k = 0;
int length = end-start+1;
while( k<length )
{
if (i == mid )
{
m_buffer[ k++ ] = m_index[ j++ ];
}
else if (j > end )
{
m_buffer[ k++ ] = m_index[ i++ ];
}
else {
if (CompareIndex( m_index[i], m_index[j] ) < 0)
{
m_buffer[ k++ ] = m_index[ i++ ];
}
else
{
m_buffer[ k++ ] = m_index[ j++ ];
}
}
}
// copy the merged run back into place
memcpy( ((char*)m_index) + sizeof( INDEX ) * start,
((char*)m_buffer), sizeof( INDEX ) * (end-start+1) );
}
// Release every buffer calloc'd in the constructor.
// Fix: the original freed only m_index and m_array, leaking
// m_wordInSentence, m_sentence and m_sentenceLength (all allocated
// alongside them in the constructor; m_buffer is already freed there).
SuffixArray::~SuffixArray()
{
  free(m_index);
  free(m_array);
  free(m_wordInSentence);
  free(m_sentence);
  free(m_sentenceLength);
}
// Lexicographically compare the corpus suffixes starting at positions a
// and b: negative if suffix(a) sorts first, positive if suffix(b) does.
// A suffix that reaches the end of the corpus first sorts first.
int SuffixArray::CompareIndex( INDEX a, INDEX b ) const
{
  // Walk both suffixes past their common prefix.
  INDEX i = a;
  INDEX j = b;
  while (i < m_size && j < m_size && m_array[i] == m_array[j]) {
    ++i;
    ++j;
  }
  // Shorter suffix (hit the corpus end) wins.
  if (i == m_size) return -1;
  if (j == m_size) return 1;
  // Otherwise the first differing word decides.
  return CompareWord(m_array[i], m_array[j]);
}
// Compare two vocabulary IDs by the string ordering of their surface
// forms (strcmp-style result from string::compare).
inline int SuffixArray::CompareWord( WORD_ID a, WORD_ID b ) const
{
// cerr << "c(" << m_vcb.GetWord(a) << ":" << m_vcb.GetWord(b) << ")=" << m_vcb.GetWord(a).compare( m_vcb.GetWord(b) ) << endl;
return m_vcb.GetWord(a).compare( m_vcb.GetWord(b) );
}
int SuffixArray::Count( const vector< WORD > &phrase )
{
INDEX dummy;
return LimitedCount( phrase, m_size, dummy, dummy, 0, m_size-1 );
}
bool SuffixArray::MinCount( const vector< WORD > &phrase, INDEX min )
{
INDEX dummy;
return LimitedCount( phrase, min, dummy, dummy, 0, m_size-1 ) >= min;
}
bool SuffixArray::Exists( const vector< WORD > &phrase )
{
INDEX dummy;
return LimitedCount( phrase, 1, dummy, dummy, 0, m_size-1 ) == 1;
}
// Count occurrences of `phrase` within the index range
// [search_start, search_end], returning the first and last matching
// index positions through the out-parameters. Uncapped (cap = m_size).
int SuffixArray::FindMatches( const vector< WORD > &phrase, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start, INDEX search_end )
{
return LimitedCount( phrase, m_size, firstMatch, lastMatch, search_start, search_end );
}
// Count the occurrences of phrase within [search_start, search_end] of the
// suffix index. search_end == -1 (which wraps to the maximum INDEX, since
// INDEX is unsigned) means "to the end of the array". Returns 0 if the
// phrase does not occur; returns 1 immediately when min == 1 (existence
// check — note firstMatch/lastMatch are left unset in that case);
// otherwise returns the exact count and sets firstMatch/lastMatch to the
// contiguous block of matching suffix-index entries.
int SuffixArray::LimitedCount( const vector< WORD > &phrase, INDEX min, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start, INDEX search_end )
{
  // locate any one matching entry by binary search
  // cerr << "FindFirst\n";
  INDEX start = search_start;
  INDEX end = (search_end == -1) ? (m_size-1) : search_end;
  INDEX mid = FindFirst( phrase, start, end );
  // cerr << "done\n";
  if (mid == m_size) return 0; // no matches
  if (min == 1) return 1; // only existance check

  int matchCount = 1;

  // expand from the known match to the first matching entry...
  //cerr << "before...\n";
  firstMatch = FindLast( phrase, mid, start, -1 );
  matchCount += mid - firstMatch;

  // ...and to the last matching entry
  //cerr << "after...\n";
  lastMatch = FindLast( phrase, mid, end, 1 );
  matchCount += lastMatch - mid;

  return matchCount;
}
// From a known matching entry at `start`, binary-search in the given
// direction (+1 towards `end`, -1 towards the front) for the outermost
// suffix-index entry that still matches the phrase.
// Precondition: Match(phrase, start) == 0 (start itself matches).
SuffixArray::INDEX SuffixArray::FindLast( const vector< WORD > &phrase, INDEX start, INDEX end, int direction )
{
  end += direction;
  while(true)
  {
    // round towards `start` so the loop always makes progress
    INDEX mid = ( start + end + (direction>0 ? 0 : 1) )/2;

    int match = Match( phrase, mid );
    int matchNext = Match( phrase, mid+direction );
    //cerr << "\t" << start << ";" << mid << ";" << end << " -> " << match << "," << matchNext << endl;

    // boundary found: mid matches but its neighbour does not
    if (match == 0 && matchNext != 0) return mid;

    if (match == 0) // mid point is a match
      start = mid;
    else
      end = mid;
  }
}
// Binary search for any one suffix-index entry in [start, end] whose suffix
// begins with the phrase; start/end are narrowed in place as a side effect.
// Returns the index of a matching entry, or m_size if there is none.
SuffixArray::INDEX SuffixArray::FindFirst( const vector< WORD > &phrase, INDEX &start, INDEX &end )
{
  while(true)
  {
    INDEX mid = ( start + end + 1 )/2;
    //cerr << "FindFirst(" << start << ";" << mid << ";" << end << ")\n";
    int match = Match( phrase, mid );

    if (match == 0) return mid;
    // range exhausted without a match
    if (start >= end && match != 0 ) return m_size;

    if (match > 0)
      start = mid+1;
    else
      end = mid-1;
  }
}
int SuffixArray::Match( const vector< WORD > &phrase, INDEX index )
{
INDEX pos = m_index[ index ];
for(INDEX i=0; i<phrase.size() && i+pos<m_size; i++)
{
int match = CompareWord( m_vcb.GetWordID( phrase[i] ), m_array[ pos+i ] );
// cerr << "{" << index << "+" << i << "," << pos+i << ":" << match << "}" << endl;
if (match != 0)
return match;
}
return 0;
}
// Debug helper: for each suffix-index entry in [start, end], print (to
// stdout) the first up-to-five words of the suffix it points at.
void SuffixArray::List(INDEX start, INDEX end)
{
  for(INDEX entry=start; entry<=end; entry++)
  {
    INDEX pos = m_index[ entry ];
    for(int w=0; w<5 && w+pos<m_size; w++)
    {
      cout << " " << m_vcb.GetWord( m_array[ pos+w ] );
    }
  }
}

View File

@ -0,0 +1,45 @@
#pragma once
#include "Vocabulary.h"

#define LINE_MAX_LENGTH 10000

// Suffix array over a corpus of word ids, supporting counted and
// range-limited phrase lookups.
class SuffixArray
{
public:
  typedef unsigned int INDEX;

private:
  WORD_ID *m_array;         // the corpus: one word id per token
  INDEX *m_index;           // suffix index: token positions, suffix-sorted
  INDEX *m_buffer;          // scratch space for the merge sort
  char *m_wordInSentence;   // per token: its position within its sentence
  size_t *m_sentence;       // per token: the id of its sentence
  char *m_sentenceLength;   // per sentence: its length
  WORD_ID m_endOfSentence;
  Vocabulary m_vcb;
  INDEX m_size;             // total number of tokens in the corpus

public:
  SuffixArray( string fileName );
  ~SuffixArray();

  void Sort(INDEX start, INDEX end);
  int CompareIndex( INDEX a, INDEX b ) const;
  inline int CompareWord( WORD_ID a, WORD_ID b ) const;
  int Count( const vector< WORD > &phrase );
  bool MinCount( const vector< WORD > &phrase, INDEX min );
  bool Exists( const vector< WORD > &phrase );
  int FindMatches( const vector< WORD > &phrase, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = 0, INDEX search_end = -1 );
  // BUGFIX: the default arguments were swapped (search_start = -1,
  // search_end = 0), disagreeing with FindMatches and with the
  // implementation's convention that search_end == -1 (wrapped to the
  // unsigned INDEX maximum) means "search to the end".
  int LimitedCount( const vector< WORD > &phrase, INDEX min, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = 0, INDEX search_end = -1 );
  INDEX FindFirst( const vector< WORD > &phrase, INDEX &start, INDEX &end );
  INDEX FindLast( const vector< WORD > &phrase, INDEX start, INDEX end, int direction );
  int Match( const vector< WORD > &phrase, INDEX index );
  void List( INDEX start, INDEX end );

  inline INDEX GetPosition( INDEX index ) { return m_index[ index ]; }
  inline size_t GetSentence( INDEX position ) { return m_sentence[position]; }
  inline char GetWordInSentence( INDEX position ) { return m_wordInSentence[position]; }
  inline char GetSentenceLength( size_t sentenceId ) { return m_sentenceLength[sentenceId]; }
  inline INDEX GetSize() { return m_size; }
};

View File

@ -0,0 +1,147 @@
//
// Util.cpp
// fuzzy-match
//
// Created by Hieu Hoang on 26/07/2012.
// Copyright 2012 __MyCompanyName__. All rights reserved.
//
#include <iostream>
#include <stdio.h>
#include "Util.h"
#include "SentenceAlignment.h"
#include "SuffixArray.h"
void load_corpus( const char* fileName, vector< vector< WORD_ID > > &corpus )
{ // source
ifstream fileStream;
fileStream.open(fileName);
if (!fileStream) {
cerr << "file not found: " << fileName << endl;
exit(1);
}
cerr << "loading " << fileName << endl;
istream *fileStreamP = &fileStream;
char line[LINE_MAX_LENGTH];
while(true)
{
SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n');
if (fileStreamP->eof()) break;
corpus.push_back( vocabulary.Tokenize( line ) );
}
}
// Read the target side of the translation memory. Each line holds one or
// more target sentences separated by "|||"; each sentence begins with an
// integer count followed by its words, i.e.
//   count w1 w2 ... ||| count w1 w2 ... ||| ...
// One vector<SentenceAlignment> is appended to `corpus` per line.
// NOTE(review): assumes every line has at least one token (toks[0] is read
// unconditionally) and that a word follows each "|||" — confirm the input
// format guarantees this.
void load_target( const char* fileName, vector< vector< SentenceAlignment > > &corpus)
{
  ifstream fileStream;
  fileStream.open(fileName);
  if (!fileStream) {
    cerr << "file not found: " << fileName << endl;
    exit(1);
  }
  cerr << "loading " << fileName << endl;

  istream *fileStreamP = &fileStream;

  // "|||" is stored in the vocabulary so it can be compared as a word id
  WORD_ID delimiter = vocabulary.StoreIfNew("|||");
  int lineNum = 0;
  char line[LINE_MAX_LENGTH];
  while(true)
  {
    SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n');
    if (fileStreamP->eof()) break;

    vector<WORD_ID> toks = vocabulary.Tokenize( line );

    corpus.push_back(vector< SentenceAlignment >());
    vector< SentenceAlignment > &vec = corpus.back();

    vec.push_back(SentenceAlignment());
    SentenceAlignment *sentence = &vec.back();

    // first token of the line is the count of the first sentence
    const WORD &countStr = vocabulary.GetWord(toks[0]);
    sentence->count = atoi(countStr.c_str());

    for (size_t i = 1; i < toks.size(); ++i) {
      WORD_ID wordId = toks[i];

      if (wordId == delimiter) {
        // target and alignments can have multiple sentences.
        vec.push_back(SentenceAlignment());
        sentence = &vec.back();

        // the token right after "|||" is the next sentence's count
        ++i;

        const WORD &countStr = vocabulary.GetWord(toks[i]);
        sentence->count = atoi(countStr.c_str());
      }
      else {
        // just a normal word, add
        sentence->target.push_back(wordId);
      }
    }

    ++lineNum;
  }
}
// Read the word alignments matching the target file: line i holds the
// alignments for the sentences in corpus[i], separated by "|||", with each
// alignment point written as "src-tgt".
// NOTE(review): this fills entries created by load_target(), so it indexes
// corpus[lineNum] directly — load_target() must have been called first on a
// file with the same number of lines and sentences per line; confirm.
void load_alignment( const char* fileName, vector< vector< SentenceAlignment > > &corpus )
{
  ifstream fileStream;
  fileStream.open(fileName);
  if (!fileStream) {
    cerr << "file not found: " << fileName << endl;
    exit(1);
  }
  cerr << "loading " << fileName << endl;

  istream *fileStreamP = &fileStream;

  string delimiter = "|||";
  int lineNum = 0;
  char line[LINE_MAX_LENGTH];
  while(true)
  {
    SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n');
    if (fileStreamP->eof()) break;

    vector< SentenceAlignment > &vec = corpus[lineNum];
    size_t targetInd = 0;
    SentenceAlignment *sentence = &vec[targetInd];

    vector<string> toks = Tokenize(line);

    for (size_t i = 0; i < toks.size(); ++i) {
      string &tok = toks[i];

      if (tok == delimiter) {
        // target and alignments can have multiple sentences.
        ++targetInd;
        sentence = &vec[targetInd];

        // skip the count token that follows "|||" in the target file format
        ++i;
      }
      else {
        // just a normal alignment, add
        vector<int> alignPoint = Tokenize<int>(tok, "-");
        assert(alignPoint.size() == 2);
        sentence->alignment.push_back(pair<int,int>(alignPoint[0], alignPoint[1]));
      }
    }

    ++lineNum;
  }
}

View File

@ -0,0 +1,87 @@
//
// Util.h
// fuzzy-match
//
// Created by Hieu Hoang on 25/07/2012.
// Copyright 2012 __MyCompanyName__. All rights reserved.
//
#ifndef fuzzy_match_Util_h
#define fuzzy_match_Util_h
#include <vector>
#include <sstream>
#include "Vocabulary.h"
class SentenceAlignment;

// Corpus loaders (defined in Util.cpp). load_alignment() fills alignments
// into the entries created by load_target(), so it is presumably meant to
// be called after it — confirm against the caller.
void load_corpus( const char* fileName, std::vector< std::vector< WORD_ID > > &corpus );
void load_target( const char* fileName, std::vector< std::vector< SentenceAlignment > > &corpus);
void load_alignment( const char* fileName, std::vector< std::vector< SentenceAlignment > > &corpus );
/**
 * Join the elements of a vector into one string, with `delimiter` between
 * consecutive elements. An empty vector yields the empty string.
 */
template <typename T>
std::string Join(const std::string& delimiter, const std::vector<T>& items)
{
  std::ostringstream joined;
  bool first = true;
  for (typename std::vector<T>::const_iterator it = items.begin(); it != items.end(); ++it) {
    if (!first)
      joined << delimiter;
    joined << *it;
    first = false;
  }
  return joined.str();
}
//! Convert a string to a value of type T (used for reading floats, ints
//! etc. from files); parsing is done via stream extraction.
template<typename T>
inline T Scan(const std::string &input)
{
  T parsed;
  std::stringstream reader(input);
  reader >> parsed;
  return parsed;
}
//! Convert a vector of strings element-wise to a vector of T values.
template<typename T>
inline std::vector<T> Scan(const std::vector< std::string > &input)
{
  std::vector<T> converted;
  converted.resize(input.size());
  for (size_t pos = 0 ; pos < input.size() ; pos++) {
    converted[pos] = Scan<T>( input[pos] );
  }
  return converted;
}
// Split `str` on any of the characters in `delimiters` (default: space and
// tab), skipping empty tokens; leading/trailing delimiters produce nothing.
inline std::vector<std::string> Tokenize(const std::string& str,
                                         const std::string& delimiters = " \t")
{
  std::vector<std::string> tokens;
  // position of the first character of the current token
  std::string::size_type tokenStart = str.find_first_not_of(delimiters, 0);
  // position one past the current token (next delimiter)
  std::string::size_type tokenEnd = str.find_first_of(delimiters, tokenStart);

  while (tokenStart != std::string::npos || tokenEnd != std::string::npos) {
    tokens.push_back(str.substr(tokenStart, tokenEnd - tokenStart));
    tokenStart = str.find_first_not_of(delimiters, tokenEnd);
    tokenEnd = str.find_first_of(delimiters, tokenStart);
  }
  return tokens;
}
// Split `input` on `delimiters`, then convert every token to type T.
template<typename T>
inline std::vector<T> Tokenize( const std::string &input
                               , const std::string& delimiters = " \t")
{
  return Scan<T>( Tokenize(input, delimiters) );
}
#endif

View File

@ -0,0 +1,45 @@
// $Id: Vocabulary.cpp 1565 2008-02-22 14:42:01Z bojar $
#include "Vocabulary.h"
// Split a C string on spaces/tabs and convert each token to its word id,
// storing previously unseen words in the vocabulary.
vector<WORD_ID> Vocabulary::Tokenize( const char input[] ) {
  vector< WORD_ID > token;
  int wordStart = -1;   // index of the current word's first char, -1 = none
  int i = 0;
  for(; input[i] != '\0'; i++) {
    bool isSpace = (input[i] == ' ' || input[i] == '\t');
    if (isSpace) {
      if (wordStart >= 0) {
        // word just ended: store it
        token.push_back( StoreIfNew ( string( input+wordStart, i-wordStart ) ) );
        wordStart = -1;
      }
    }
    else if (wordStart < 0) {
      // word just started
      wordStart = i;
    }
  }
  // flush a word that runs to the end of the input
  if (wordStart >= 0)
    token.push_back( StoreIfNew ( string( input+wordStart, i-wordStart ) ) );
  return token;
}
// Return the id of `word`, assigning and storing the next free id if the
// word has not been seen before.
WORD_ID Vocabulary::StoreIfNew( const WORD& word ) {
  map<WORD, WORD_ID>::iterator found = lookup.find( word );
  if( found != lookup.end() )
    return found->second;

  // unseen word: its id is its position in the vocab list
  WORD_ID newId = vocab.size();
  vocab.push_back( word );
  lookup[ word ] = newId;
  return newId;
}
// Look up the id of a word; returns 0 if the word is unknown.
// NOTE(review): 0 is also the id assigned to the first word ever stored
// (StoreIfNew uses vocab.size()), so "unknown" is indistinguishable from
// that word — confirm callers never rely on 0 meaning "not found".
WORD_ID Vocabulary::GetWordID( const WORD &word ) {
  map<WORD, WORD_ID>::iterator i = lookup.find( word );
  if( i == lookup.end() )
    return 0;
  WORD_ID w= (WORD_ID) i->second;
  return w;
}

View File

@ -0,0 +1,40 @@
// $Id: tables-core.h 1470 2007-10-02 21:43:54Z redpony $
#pragma once
#include <iostream>
#include <fstream>
#include <assert.h>
#include <stdlib.h>
#include <string>
#include <queue>
#include <map>
#include <vector>
#include <cmath>
// FIX: <vector> was missing even though vector<WORD>/vector<WORD_ID> are
// used below (it only compiled via another header's transitive include).

using namespace std;

#define MAX_LENGTH 10000

// Read one '\n'-terminated line into a fixed buffer; clears a recoverable
// stream failure, and aborts the program if the line would overflow.
#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM) { \
    _IS.getline(_LINE, _SIZE, _DELIM); \
    if(_IS.fail() && !_IS.bad() && !_IS.eof()) _IS.clear(); \
    if (_IS.gcount() == _SIZE-1) { \
      cerr << "Line too long! Buffer overflow. Delete lines >=" \
           << _SIZE << " chars or raise MAX_LENGTH in phrase-extract/tables-core.cpp" \
           << endl; \
      exit(1); \
    } \
  }

typedef string WORD;
typedef unsigned int WORD_ID;

// Bidirectional word <-> id mapping: ids are positions in `vocab`,
// `lookup` maps surface strings back to ids.
class Vocabulary {
public:
  map<WORD, WORD_ID> lookup;   // word -> id
  vector< WORD > vocab;        // id -> word
  WORD_ID StoreIfNew( const WORD& );
  WORD_ID GetWordID( const WORD& );
  vector<WORD_ID> Tokenize( const char[] );
  // casts away const so a non-const reference can be handed out from a
  // const Vocabulary (kept as-is for caller compatibility)
  inline WORD &GetWord( WORD_ID id ) const { WORD &i = (WORD&) vocab[ id ]; return i; }
};

View File

@ -0,0 +1,460 @@
#include <stdio.h>
#include <stdlib.h>
#include <getopt.h>
#include <map>
#include <algorithm>
#include <iostream>
#include <fstream>
#include <cstring>
#include <time.h>
#include <fstream>
#include "SentenceAlignment.h"
#include "fuzzy-match2.h"
#include "SuffixArray.h"
/** This implementation is explained in
Koehn and Senellart: "Fast Approximate String Matching
with Suffix Arrays and A* Parsing" (AMTA 2010) ***/
using namespace std;
// Fuzzy-match driver: for every input sentence, find the translation-memory
// sentence(s) with the lowest (word-based, optionally letter-refined) edit
// distance, using suffix-array substring matches to prune candidates.
// Usage: fuzzy-match input source target alignment [--basic] [--word]
//        [--unrefined] [--nolengthfilter] [--noparse] [--multiple]
//        [--minmatch 1..100]
int main(int argc, char* argv[])
{
  vector< vector< WORD_ID > > source, input;
  vector< vector< SentenceAlignment > > targetAndAlignment;

  // parse command-line flags; most options just toggle the globals
  // declared in fuzzy-match2.h via getopt_long's flag mechanism
  while(1) {
    static struct option long_options[] = {
      {"basic", no_argument, &basic_flag, 1},
      {"word", no_argument, &lsed_flag, 0},
      {"unrefined", no_argument, &refined_flag, 0},
      {"nolengthfilter", no_argument, &length_filter_flag, 0},
      {"noparse", no_argument, &parse_flag, 0},
      {"multiple", no_argument, &multiple_flag, 1},
      {"minmatch", required_argument, 0, 'm'},
      {0, 0, 0, 0}
    };
    int option_index = 0;
    int c = getopt_long (argc, argv, "m:", long_options, &option_index);
    if (c == -1) break;
    switch (c) {
    case 0:
      // if (long_options[option_index].flag != 0)
      // break;
      // printf ("option %s", long_options[option_index].name);
      // if (optarg)
      // printf (" with arg %s", optarg);
      // printf ("\n");
      break;
    case 'm':
      min_match = atoi(optarg);
      if (min_match < 1 || min_match > 100) {
        cerr << "error: --minmatch must have value in range 1..100\n";
        exit(1);
      }
      cerr << "setting min match to " << min_match << endl;
      break;
    default:
      cerr << "usage: syntax: ./fuzzy-match input corpus [--basic] [--word] [--minmatch 1..100]\n";
      exit(1);
    }
  }
  if (lsed_flag) { cerr << "lsed\n"; }
  if (basic_flag) { cerr << "basic\n"; }
  if (refined_flag) { cerr << "refined\n"; }
  if (length_filter_flag) { cerr << "length filter\n"; }
  if (parse_flag) { cerr << "parse\n"; }
  // exit(1);

  if (optind+4 != argc) {
    cerr << "syntax: ./fuzzy-match input source target alignment [--basic] [--word] [--minmatch 1..100]\n";
    exit(1);
  }

  load_corpus(argv[optind], input);
  load_corpus(argv[optind+1], source);
  load_target(argv[optind+2], targetAndAlignment);
  load_alignment(argv[optind+3], targetAndAlignment);

  // ./fuzzy-match input corpus [-basic]
  // load_corpus("../corpus/tm.truecased.4.en", source);
  // load_corpus("../corpus/tm.truecased.4.it", target);
  // load_corpus("../evaluation/test.input.tc.4", input);
  // load_corpus("../../acquis-truecase/corpus/acquis.truecased.190.en", source);
  // load_corpus("../../acquis-truecase/evaluation/ac-test.input.tc.190", input);
  // load_corpus("../corpus/tm.truecased.16.en", source);
  // load_corpus("../evaluation/test.input.tc.16", input);

  // --basic: brute-force comparison against every TM sentence, then exit
  if (basic_flag) {
    cerr << "using basic method\n";
    clock_t start_main_clock2 = clock();
    basic_fuzzy_match( source, input );
    cerr << "total: " << (1000 * (clock()-start_main_clock2) / CLOCKS_PER_SEC) << endl;
    exit(1);
  }

  cerr << "number of input sentences " << input.size() << endl;

  cerr << "creating suffix array...\n";
  // SuffixArray suffixArray( "../corpus/tm.truecased.4.en" );
  // SuffixArray suffixArray( "../../acquis-truecase/corpus/acquis.truecased.190.en" );
  SuffixArray suffixArray( argv[optind+1] );

  clock_t start_main_clock = clock();

  // looping through all input sentences...
  cerr << "looping...\n";
  for(unsigned int sentenceInd = 0; sentenceInd < input.size(); sentenceInd++)
  {
    clock_t start_clock = clock();
    // if (i % 10 == 0) cerr << ".";

    // establish some basic statistics

    // int input_length = compute_length( input[i] );
    int input_length = input[sentenceInd].size();
    // worst edit distance still admissible under --minmatch
    int best_cost = input_length * (100-min_match) / 100 + 1;

    int match_count = 0; // how many substring matches to be considered
    //cerr << endl << "sentence " << i << ", length " << input_length << ", best_cost " << best_cost << endl;

    // find match ranges in suffix array:
    // match_range[start][len-1] = suffix-array index range of matches for
    // the input substring starting at `start` of length `len`
    vector< vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > > match_range;
    for(size_t start=0;start<input[sentenceInd].size();start++)
    {
      SuffixArray::INDEX prior_first_match = 0;
      SuffixArray::INDEX prior_last_match = suffixArray.GetSize()-1;
      vector< string > substring;
      bool stillMatched = true;
      vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > matchedAtThisStart;
      //cerr << "start: " << start;
      for(int word=start; stillMatched && word<input[sentenceInd].size(); word++)
      {
        substring.push_back( vocabulary.GetWord( input[sentenceInd][word] ) );
        // only look up, if needed (i.e. no unnecessary short gram lookups)
        // if (! word-start+1 <= short_match_max_length( input_length ) )
        // {
        SuffixArray::INDEX first_match, last_match;
        stillMatched = false;
        // each longer substring can only match within the previous range
        if (suffixArray.FindMatches( substring, first_match, last_match, prior_first_match, prior_last_match ) )
        {
          stillMatched = true;
          matchedAtThisStart.push_back( make_pair( first_match, last_match ) );
          //cerr << " (" << first_match << "," << last_match << ")";
          //cerr << " " << ( last_match - first_match + 1 );
          prior_first_match = first_match;
          prior_last_match = last_match;
        }
        //}
      }
      //cerr << endl;
      match_range.push_back( matchedAtThisStart );
    }

    clock_t clock_range = clock();

    map< int, vector< Match > > sentence_match;
    map< int, int > sentence_match_word_count;

    // go through all matches, longest first
    for(int length = input[sentenceInd].size(); length >= 1; length--)
    {
      // do not create matches, if these are handled by the short match function
      if (length <= short_match_max_length( input_length ) )
      {
        continue;
      }

      unsigned int count = 0;
      for(int start = 0; start <= input[sentenceInd].size() - length; start++)
      {
        if (match_range[start].size() >= length)
        {
          pair< SuffixArray::INDEX, SuffixArray::INDEX > &range = match_range[start][length-1];
          // cerr << " (" << range.first << "," << range.second << ")";
          count += range.second - range.first + 1;

          for(SuffixArray::INDEX i=range.first; i<=range.second; i++)
          {
            int position = suffixArray.GetPosition( i );

            // sentence length mismatch
            size_t sentence_id = suffixArray.GetSentence( position );
            int sentence_length = suffixArray.GetSentenceLength( sentence_id );
            int diff = abs( (int)sentence_length - (int)input_length );
            // cerr << endl << i << "\tsentence " << sentence_id << ", length " << sentence_length;
            //if (length <= 2 && input_length>=5 &&
            // sentence_match.find( sentence_id ) == sentence_match.end())
            // continue;

            if (diff > best_cost)
              continue;

            // compute minimal cost
            int start_pos = suffixArray.GetWordInSentence( position );
            int end_pos = start_pos + length-1;
            // cerr << endl << "\t" << start_pos << "-" << end_pos << " (" << sentence_length << ") vs. "
            // << start << "-" << (start+length-1) << " (" << input_length << ")";
            // different number of prior words -> cost is at least diff
            int min_cost = abs( start - start_pos );

            // same number of words, but not sent. start -> cost is at least 1
            if (start == start_pos && start>0)
              min_cost++;

            // different number of remaining words -> cost is at least diff
            min_cost += abs( ( sentence_length-1 - end_pos ) -
                             ( input_length-1 - (start+length-1) ) );

            // same number of words, but not sent. end -> cost is at least 1
            if ( sentence_length-1 - end_pos ==
                 input_length-1 - (start+length-1)
                 && end_pos != sentence_length-1 )
              min_cost++;

            // cerr << " -> min_cost " << min_cost;
            if (min_cost > best_cost)
              continue;

            // valid match
            match_count++;

            // compute maximal cost
            int max_cost = max( start, start_pos )
              + max( sentence_length-1 - end_pos,
                     input_length-1 - (start+length-1) );
            // cerr << ", max_cost " << max_cost;

            Match m = Match( start, start+length-1,
                             start_pos, start_pos+length-1,
                             min_cost, max_cost, 0);
            sentence_match[ sentence_id ].push_back( m );
            sentence_match_word_count[ sentence_id ] += length;

            // a match's max_cost is an upper bound on the true edit
            // distance, so it can tighten the global pruning threshold
            if (max_cost < best_cost)
            {
              best_cost = max_cost;
              if (best_cost == 0) break;
            }
            //if (match_count >= MAX_MATCH_COUNT) break;
          }
        }
        // cerr << endl;
        if (best_cost == 0) break;
        //if (match_count >= MAX_MATCH_COUNT) break;
      }
      // cerr << count << " matches at length " << length << " in " << sentence_match.size() << " tm." << endl;
      if (best_cost == 0) break;
      //if (match_count >= MAX_MATCH_COUNT) break;
    }
    cerr << match_count << " matches in " << sentence_match.size() << " sentences." << endl;

    clock_t clock_matches = clock();

    // consider each sentence for which we have matches
    int old_best_cost = best_cost;
    int tm_count_word_match = 0;
    int tm_count_word_match2 = 0;
    int pruned_match_count = 0;
    if (short_match_max_length( input_length ))
    {
      init_short_matches( input[sentenceInd] );
    }
    vector< int > best_tm;
    typedef map< int, vector< Match > >::iterator I;

    clock_t clock_validation_sum = 0;

    for(I tm=sentence_match.begin(); tm!=sentence_match.end(); tm++)
    {
      int tmID = tm->first;
      int tm_length = suffixArray.GetSentenceLength(tmID);
      vector< Match > &match = tm->second;
      add_short_matches( match, source[tmID], input_length, best_cost );

      //cerr << "match in sentence " << tmID << ": " << match.size() << " [" << tm_length << "]" << endl;

      // quick look: how many words are matched
      int words_matched = 0;
      for(int m=0;m<match.size();m++) {

        if (match[m].min_cost <= best_cost) // makes no difference
          words_matched += match[m].input_end - match[m].input_start + 1;
      }
      if (max(input_length,tm_length) - words_matched > best_cost)
      {
        if (length_filter_flag) continue;
      }
      tm_count_word_match++;

      // prune, check again how many words are matched
      vector< Match > pruned = prune_matches( match, best_cost );
      words_matched = 0;
      for(int p=0;p<pruned.size();p++) {
        words_matched += pruned[p].input_end - pruned[p].input_start + 1;
      }
      if (max(input_length,tm_length) - words_matched > best_cost)
      {
        if (length_filter_flag) continue;
      }
      tm_count_word_match2++;

      pruned_match_count += pruned.size();
      int prior_best_cost = best_cost;
      int cost;

      clock_t clock_validation_start = clock();
      // validate: either a full string edit distance, or (by default)
      // the A* parse over the pruned matches
      if (! parse_flag ||
          pruned.size()>=10) // to prevent worst cases
      {
        string path;
        cost = sed( input[sentenceInd], source[tmID], path, false );
        if (cost < best_cost)
        {
          best_cost = cost;
        }
      }
      else
      {
        cost = parse_matches( pruned, input_length, tm_length, best_cost );
        if (prior_best_cost != best_cost)
        {
          best_tm.clear();
        }
      }
      clock_validation_sum += clock() - clock_validation_start;
      if (cost == best_cost)
      {
        best_tm.push_back( tmID );
      }
    }
    cerr << "reduced best cost from " << old_best_cost << " to " << best_cost << endl;
    cerr << "tm considered: " << sentence_match.size()
         << " word-matched: " << tm_count_word_match
         << " word-matched2: " << tm_count_word_match2
         << " best: " << best_tm.size() << endl;

    cerr << "pruned matches: " << ((float)pruned_match_count/(float)tm_count_word_match2) << endl;

    // create xml and extract files
    string inputStr, sourceStr;
    for (size_t pos = 0; pos < input_length; ++pos) {
      inputStr += vocabulary.GetWord(input[sentenceInd][pos]) + " ";
    }

    // do not try to find the best ... report multiple matches
    if (multiple_flag) {
      int input_letter_length = compute_length( input[sentenceInd] );
      for(int si=0; si<best_tm.size(); si++) {
        int s = best_tm[si];
        string path;
        unsigned int letter_cost = sed( input[sentenceInd], source[s], path, true );
        // do not report multiple identical sentences, but just their count
        cout << sentenceInd << " "; // sentence number
        cout << letter_cost << "/" << input_letter_length << " ";
        cout << "(" << best_cost <<"/" << input_length <<") ";
        cout << "||| " << s << " ||| " << path << endl;

        vector<WORD_ID> &sourceSentence = source[s];
        vector<SentenceAlignment> &targets = targetAndAlignment[s];
        create_extract(sentenceInd, best_cost, sourceSentence, targets, inputStr, path);

      }
    } // if (multiple_flag)
    else {

      // find the best matches according to letter sed
      string best_path = "";
      int best_match = -1;
      int best_letter_cost;
      if (lsed_flag) {
        best_letter_cost = compute_length( input[sentenceInd] ) * min_match / 100 + 1;
        for(int si=0; si<best_tm.size(); si++)
        {
          int s = best_tm[si];
          string path;
          unsigned int letter_cost = sed( input[sentenceInd], source[s], path, true );
          if (letter_cost < best_letter_cost)
          {
            best_letter_cost = letter_cost;
            best_path = path;
            best_match = s;
          }
        }
      }
      // if letter sed turned off, just compute path for first match
      else {
        if (best_tm.size() > 0) {
          string path;
          sed( input[sentenceInd], source[best_tm[0]], path, false );
          best_path = path;
          best_match = best_tm[0];
        }
      }
      cerr << "elapsed: " << (1000 * (clock()-start_clock) / CLOCKS_PER_SEC)
           << " ( range: " << (1000 * (clock_range-start_clock) / CLOCKS_PER_SEC)
           << " match: " << (1000 * (clock_matches-clock_range) / CLOCKS_PER_SEC)
           << " tm: " << (1000 * (clock()-clock_matches) / CLOCKS_PER_SEC)
           << " (validation: " << (1000 * (clock_validation_sum) / CLOCKS_PER_SEC) << ")"
           << " )" << endl;
      if (lsed_flag) {
        cout << best_letter_cost << "/" << compute_length( input[sentenceInd] ) << " (";
      }
      cout << best_cost <<"/" << input_length;
      if (lsed_flag) cout << ")";
      cout << " ||| " << best_match << " ||| " << best_path << endl;

      // create xml & extracts
      // NOTE(review): if no candidate survived (best_tm empty, or none beat
      // the letter-cost threshold), best_match is still -1 here and
      // source[-1]/targetAndAlignment[-1] below is undefined behavior —
      // confirm whether that case can occur in practice.
      vector<WORD_ID> &sourceSentence = source[best_match];
      vector<SentenceAlignment> &targets = targetAndAlignment[best_match];
      create_extract(sentenceInd, best_cost, sourceSentence, targets, inputStr, best_path);

    } // else if (multiple_flag)
  }
  cerr << "total: " << (1000 * (clock()-start_main_clock) / CLOCKS_PER_SEC) << endl;
}
// Write one extraction record per target/alignment pair of the matched TM
// sentence to a temporary file: sentence number, cost, TM source, input,
// target, alignment, edit path and count, one item per line.
// NOTE(review): tmpnam() is insecure (race between name generation and
// open) — mkstemp would be safer. The "create_xml.perl" command is built
// and printed to cerr but never executed, and the temp file is never
// removed; presumably the post-processing step was deliberately disabled
// in this merge — confirm.
void create_extract(int sentenceInd, int cost, const vector< WORD_ID > &sourceSentence, const vector<SentenceAlignment> &targets, const string &inputStr, const string &path)
{
  // reconstruct the TM source sentence as a surface string
  string sourceStr;
  for (size_t pos = 0; pos < sourceSentence.size(); ++pos) {
    WORD_ID wordId = sourceSentence[pos];
    sourceStr += vocabulary.GetWord(wordId) + " ";
  }

  char *inputFileName = tmpnam(NULL);
  ofstream inputFile(inputFileName);

  for (size_t targetInd = 0; targetInd < targets.size(); ++targetInd) {
    const SentenceAlignment &sentenceAlignment = targets[targetInd];
    string targetStr = sentenceAlignment.getTargetString();
    string alignStr = sentenceAlignment.getAlignmentString();

    inputFile
      << sentenceInd << endl
      << cost << endl
      << sourceStr << endl
      << inputStr << endl
      << targetStr << endl
      << alignStr << endl
      << path << endl
      << sentenceAlignment.count << endl;
  }

  string cmd = string("perl create_xml.perl < ") + inputFileName;
  cerr << cmd << endl;
  inputFile.close();
}

View File

@ -0,0 +1,561 @@
//
// fuzzy-match2.h
// fuzzy-match
//
// Created by Hieu Hoang on 25/07/2012.
// Copyright 2012 __MyCompanyName__. All rights reserved.
//
#ifndef fuzzy_match_fuzzy_match2_h
#define fuzzy_match_fuzzy_match2_h
#include <string>
#include <sstream>
#include <vector>
#include "Vocabulary.h"
#include "SuffixArray.h"
#include "Util.h"
#include "Match.h"
#define MAX_MATCH_COUNT 10000000

// Global state shared by the matcher. NOTE(review): these are definitions
// in a header, so this header can be included from only one translation
// unit without multiple-definition link errors — confirm that is intended.
Vocabulary vocabulary;   // shared word <-> id mapping

int basic_flag = false;         // --basic: brute-force comparison against every TM sentence
int lsed_flag = true;           // letter-based edit distance refinement (--word turns it off)
int refined_flag = true;        // short-match refinement (--unrefined turns it off)
int length_filter_flag = true;  // length-based candidate pruning (--nolengthfilter turns it off)
int parse_flag = true;          // A* parse validation (--noparse turns it off)
int min_match = 70;             // --minmatch: required match percentage, 1..100
int multiple_flag = false;      // --multiple: report all best matches, not just one
int multiple_slack = 0;
int multiple_max = 100;

// positions of each input word, filled by init_short_matches()
map< WORD_ID,vector< int > > single_word_index;
// global cache for word pairs
map< pair< WORD_ID, WORD_ID >, unsigned int > lsed;

void create_extract(int sentenceInd, int cost, const vector< WORD_ID > &sourceSentence, const vector<SentenceAlignment> &targets, const string &inputStr, const string &path);
/* Letter string edit distance, e.g. sub 'their' to 'there' costs 2.
   Results are memoized per (word id, word id) pair in the global `lsed`
   cache. */
// FIX: the original allocated the DP matrix with calloc's count/size
// arguments reversed (calloc(sizeof(unsigned int*), a.size()+1)) — the
// byte total happened to be sufficient, but the call was wrong as written.
// The matrix is now a std::vector (RAII, no manual free); results are
// unchanged.
unsigned int letter_sed( WORD_ID aIdx, WORD_ID bIdx )
{
  // check if already computed -> lookup in cache
  pair< WORD_ID, WORD_ID > pIdx = make_pair( aIdx, bIdx );
  map< pair< WORD_ID, WORD_ID >, unsigned int >::const_iterator lookup = lsed.find( pIdx );
  if (lookup != lsed.end())
  {
    return (lookup->second);
  }

  // get surface strings for word indices
  const string &a = vocabulary.GetWord( aIdx );
  const string &b = vocabulary.GetWord( bIdx );

  // initialize cost matrix: first row/column = delete/insert everything
  vector< vector< unsigned int > > cost( a.size()+1, vector< unsigned int >( b.size()+1, 0 ) );
  for( unsigned int i=0; i<=a.size(); i++ ) {
    cost[i][0] = i;
  }
  for( unsigned int j=0; j<=b.size(); j++ ) {
    cost[0][j] = j;
  }

  // core string edit distance loop
  for( unsigned int i=1; i<=a.size(); i++ ) {
    for( unsigned int j=1; j<=b.size(); j++ ) {
      unsigned int ins = cost[i-1][j] + 1;
      unsigned int del = cost[i][j-1] + 1;
      bool match = (a[i-1] == b[j-1]);
      unsigned int diag = cost[i-1][j-1] + (match ? 0 : 1);

      unsigned int min = (ins < del) ? ins : del;
      min = (diag < min) ? diag : min;

      cost[i][j] = min;
    }
  }

  // cache and return result
  unsigned int final = cost[a.size()][b.size()];
  lsed[ pIdx ] = final;
  return final;
}
/* string edit distance implementation */
// Word-level edit distance between sentences a and b. If use_letter_sed is
// set, each operation is weighted by letters (insert/delete cost a word's
// length, substitution costs letter_sed of the pair); otherwise every
// operation costs 1. best_path is set to the backtraced operation string:
// 'I' consumes a word of a, 'D' a word of b, 'M' match, 'S' substitution.
// NOTE(review): the calloc calls pass (size, count) instead of
// (count, size); the byte total is the same, so behavior is unaffected.
unsigned int sed( const vector< WORD_ID > &a, const vector< WORD_ID > &b, string &best_path, bool use_letter_sed ) {

  // initialize cost and path matrices
  unsigned int **cost = (unsigned int**) calloc( sizeof( unsigned int* ), a.size()+1 );
  char **path = (char**) calloc( sizeof( char* ), a.size()+1 );

  // first column: delete every word of a
  for( unsigned int i=0; i<=a.size(); i++ ) {
    cost[i] = (unsigned int*) calloc( sizeof(unsigned int), b.size()+1 );
    path[i] = (char*) calloc( sizeof(char), b.size()+1 );
    if (i>0)
    {
      cost[i][0] = cost[i-1][0];
      if (use_letter_sed)
      {
        cost[i][0] += vocabulary.GetWord( a[i-1] ).size();
      }
      else
      {
        cost[i][0]++;
      }
    }
    else
    {
      cost[i][0] = 0;
    }
    path[i][0] = 'I';
  }

  // first row: insert every word of b
  for( unsigned int j=0; j<=b.size(); j++ ) {
    if (j>0)
    {
      cost[0][j] = cost[0][j-1];
      if (use_letter_sed)
      {
        cost[0][j] += vocabulary.GetWord( b[j-1] ).size();
      }
      else
      {
        cost[0][j]++;
      }
    }
    else
    {
      cost[0][j] = 0;
    }
    path[0][j] = 'D';
  }

  // core string edit distance algorithm
  for( unsigned int i=1; i<=a.size(); i++ ) {
    for( unsigned int j=1; j<=b.size(); j++ ) {
      unsigned int ins = cost[i-1][j];
      unsigned int del = cost[i][j-1];
      unsigned int match;
      if (use_letter_sed)
      {
        ins += vocabulary.GetWord( a[i-1] ).size();
        del += vocabulary.GetWord( b[j-1] ).size();
        match = letter_sed( a[i-1], b[j-1] );
      }
      else
      {
        ins++;
        del++;
        match = ( a[i-1] == b[j-1] ) ? 0 : 1;
      }
      unsigned int diag = cost[i-1][j-1] + match;

      char action = (ins < del) ? 'I' : 'D';
      unsigned int min = (ins < del) ? ins : del;
      if (diag < min)
      {
        action = (match>0) ? 'S' : 'M';
        min = diag;
      }

      cost[i][j] = min;
      path[i][j] = action;
    }
  }

  // construct string for best path (backtrace from the bottom-right cell;
  // char + string prepends the action)
  unsigned int i = a.size();
  unsigned int j = b.size();
  best_path = "";
  while( i>0 || j>0 )
  {
    best_path = path[i][j] + best_path;
    if (path[i][j] == 'I')
    {
      i--;
    }
    else if (path[i][j] == 'D')
    {
      j--;
    }
    else
    {
      i--;
      j--;
    }
  }

  // clear out memory
  unsigned int final = cost[a.size()][b.size()];
  for( unsigned int i=0; i<=a.size(); i++ ) {
    free( cost[i] );
    free( path[i] );
  }
  free( cost );
  free( path );

  // return result
  return final;
}
/* utility function: compute the length of a sentence in characters
   (spaces do not count) */
unsigned int compute_length( const vector< WORD_ID > &sentence )
{
  unsigned int total = 0;
  for( unsigned int w=0; w<sentence.size(); w++ )
  {
    total += vocabulary.GetWord( sentence[w] ).size();
  }
  return total;
}
/* brute force method: compare input to all corpus sentences */
int basic_fuzzy_match( vector< vector< WORD_ID > > source,
vector< vector< WORD_ID > > input )
{
// go through input set...
for(unsigned int i=0;i<input.size();i++)
{
bool use_letter_sed = false;
// compute sentence length and worst allowed cost
unsigned int input_length;
if (use_letter_sed)
{
input_length = compute_length( input[i] );
}
else
{
input_length = input[i].size();
}
unsigned int best_cost = input_length * (100-min_match) / 100 + 2;
string best_path = "";
int best_match = -1;
// go through all corpus sentences
for(unsigned int s=0;s<source.size();s++)
{
int source_length;
if (use_letter_sed)
{
source_length = compute_length( source[s] );
}
else
{
source_length = source[s].size();
}
int diff = abs((int)source_length - (int)input_length);
if (length_filter_flag && (diff >= best_cost))
{
continue;
}
// compute string edit distance
string path;
unsigned int cost = sed( input[i], source[s], path, use_letter_sed );
// update if new best
if (cost < best_cost)
{
best_cost = cost;
best_path = path;
best_match = s;
}
}
cout << best_cost << " ||| " << best_match << " ||| " << best_path << endl;
}
}
/* definition of short matches
   very short n-gram matches (1-grams) will not be looked up in
   the suffix array, since there are too many matches
   and for longer sentences, at least one 2-gram match must occur */
inline int short_match_max_length( int input_length )
{
  // refinement off -> no short matches at all; otherwise 1-grams count as
  // "short" once the sentence has at least 5 words
  if ( refined_flag && input_length >= 5 )
    return 1;
  return 0;
}
/* if we have non-short matches in a sentence, we need to
   take a closer look at it.
   this function rebuilds the global single_word_index: a hash map from each
   input word id to all of its positions in the input sentence
   (done here, because this has to be done only once per input sentence) */
void init_short_matches( const vector< WORD_ID > &input )
{
  // nothing to index if short matches are disabled for this length
  if (short_match_max_length( input.size() ) == 0)
    return;

  single_word_index.clear();

  // store input words and their positions in the hash map; operator[]
  // creates the (empty) position vector on first sight of a word
  for(int pos=0; pos<input.size(); pos++)
  {
    single_word_index[ input[pos] ].push_back( pos );
  }
}
/* add all short matches to list of matches for a sentence */
// For every TM word that also occurs in the input (per the
// single_word_index built by init_short_matches), create a 1-word Match
// with lower/upper edit-cost bounds, keeping only those whose minimal cost
// does not exceed best_cost.
void add_short_matches( vector< Match > &match, const vector< WORD_ID > &tm, int input_length, int best_cost )
{
  int max_length = short_match_max_length( input_length );
  if (max_length == 0)
    return;

  int tm_length = tm.size();
  map< WORD_ID,vector< int > >::iterator input_word_hit;
  for(int t_pos=0; t_pos<tm.size(); t_pos++)
  {
    input_word_hit = single_word_index.find( tm[t_pos] );
    if (input_word_hit != single_word_index.end())
    {
      // this TM word occurs in the input: one candidate per input position
      vector< int > &position_vector = input_word_hit->second;
      for(int j=0; j<position_vector.size(); j++)
      {
        int &i_pos = position_vector[j];

        // before match: at most everything before the match differs,
        // at least the position difference must be edited
        int max_cost = max( i_pos , t_pos );
        int min_cost = abs( i_pos - t_pos );
        // same offset but not at sentence start -> at least one edit
        if ( i_pos>0 && i_pos == t_pos )
          min_cost++;

        // after match: same reasoning for the remaining words
        max_cost += max( (input_length-i_pos) , (tm_length-t_pos));
        min_cost += abs( (input_length-i_pos) - (tm_length-t_pos));
        // same remainder but not at sentence end -> at least one edit
        if ( i_pos != input_length-1 && (input_length-i_pos) == (tm_length-t_pos))
          min_cost++;

        if (min_cost <= best_cost)
        {
          Match new_match( i_pos,i_pos, t_pos,t_pos, min_cost,max_cost,0 );
          match.push_back( new_match );
        }
      }
    }
  }
}
/* remove matches that are subsumed by a larger match */
/* A match i is subsumed if some other match j spans at least as many
   input words and shares either its start point or its end point (in
   both input and tm coordinates). Matches whose minimum cost exceeds
   best_cost are dropped as well. Surviving matches are returned in
   reverse order of the original list. */
vector< Match > prune_matches( const vector< Match > &match, int best_cost )
{
  vector< Match > pruned;
  for(int i=match.size()-1; i>=0; i--)
  {
    bool subsumed = false;
    for(int j=match.size()-1; j>=0; j--)
    {
      if (i!=j // do not compare match with itself
          && ( match[i].input_end - match[i].input_start <=
               match[j].input_end - match[j].input_start ) // i shorter than j
          && ((match[i].input_start == match[j].input_start &&
               match[i].tm_start == match[j].tm_start ) ||
              (match[i].input_end == match[j].input_end &&
               match[i].tm_end == match[j].tm_end) ) )
      {
        subsumed = true;
        break; // one subsuming match is enough; no need to keep scanning
      }
    }
    if (! subsumed && match[i].min_cost <= best_cost)
    {
      pruned.push_back( match[i] );
    }
  }
  return pruned;
}
/* A* parsing method to compute string edit distance */
/* Combines n-gram matches bottom-up into ever larger spans:
   a level-k span joins a level-i and a level-j span (i+j+1 == k)
   that overlap in neither input nor tm coordinates. Every span
   carries a lower bound (min_cost) and an upper bound (max_cost)
   on the full-sentence edit cost; combinations whose lower bound
   already exceeds best_cost are discarded, and best_cost is
   tightened in place whenever a new span's upper bound beats it.
   Returns the best upper bound found. */
int parse_matches( vector< Match > &match, int input_length, int tm_length, int &best_cost )
{
  // cerr << "sentence has " << match.size() << " matches, best cost: " << best_cost << ", lengths input: " << input_length << " tm: " << tm_length << endl;

  // trivial cases: a single match, or none at all
  if (match.size() == 1)
    return match[0].max_cost;
  if (match.size() == 0)
    return input_length+tm_length;

  // start from the tightest single-match upper bound
  int this_best_cost = input_length + tm_length;
  for(int i=0;i<match.size();i++)
  {
    this_best_cost = min( this_best_cost, match[i].max_cost );
  }
  // cerr << "\tthis best cost: " << this_best_cost << endl;

  // bottom up combination of spans
  vector< vector< Match > > multi_match;
  multi_match.push_back( match );

  int match_level = 1;
  while(multi_match[ match_level-1 ].size()>0)
  {
    // init vector
    vector< Match > empty;
    multi_match.push_back( empty );

    // iterating first_level only up to (match_level-1)/2 visits each
    // unordered level pair exactly once
    for(int first_level = 0; first_level <= (match_level-1)/2; first_level++)
    {
      int second_level = match_level - first_level -1;
      //cerr << "\tcombining level " << first_level << " and " << second_level << endl;

      vector< Match > &first_match = multi_match[ first_level ];
      vector< Match > &second_match = multi_match[ second_level ];

      for(int i1 = 0; i1 < first_match.size(); i1++) {
        for(int i2 = 0; i2 < second_match.size(); i2++) {

          // do not combine the same pair twice
          if (first_level == second_level && i2 <= i1)
          {
            continue;
          }

          // get sorted matches (first is before second)
          Match *first, *second;
          if (first_match[i1].input_start < second_match[i2].input_start )
          {
            first = &first_match[i1];
            second = &second_match[i2];
          }
          else
          {
            second = &first_match[i1];
            first = &second_match[i2];
          }

          //cerr << "\tcombining "
          //     << "(" << first->input_start << "," << first->input_end << "), "
          //     << first->tm_start << " [" << first->internal_cost << "]"
          //     << " with "
          //     << "(" << second->input_start << "," << second->input_end << "), "
          //     << second->tm_start<< " [" << second->internal_cost << "]"
          //     << endl;

          // do not process overlapping matches
          if (first->input_end >= second->input_start)
          {
            continue;
          }

          // no overlap / mismatch in tm
          if (first->tm_end >= second->tm_start)
          {
            continue;
          }

          // compute cost
          int min_cost = 0;
          int max_cost = 0;

          // initial: words before the first span
          min_cost += abs( first->input_start - first->tm_start );
          max_cost += max( first->input_start, first->tm_start );
          // same number of words, but not sent. start -> cost is at least 1
          if (first->input_start == first->tm_start && first->input_start > 0)
          {
            min_cost++;
          }

          // in-between: words between the two combined spans
          int skipped_words = second->input_start - first->input_end -1;
          int skipped_words_tm = second->tm_start - first->tm_end -1;
          int internal_cost = max( skipped_words, skipped_words_tm );
          internal_cost += first->internal_cost + second->internal_cost;
          min_cost += internal_cost;
          max_cost += internal_cost;

          // final: words after the second span
          min_cost += abs( (tm_length-1 - second->tm_end) -
                           (input_length-1 - second->input_end) );
          max_cost += max( (tm_length-1 - second->tm_end),
                           (input_length-1 - second->input_end) );
          // same number of words, but not sent. end -> cost is at least 1
          if ( ( input_length-1 - second->input_end
                 == tm_length-1 - second->tm_end )
               && input_length-1 != second->input_end )
          {
            min_cost++;
          }

          // cerr << "\tcost: " << min_cost << "-" << max_cost << endl;

          // if worse than best cost, forget it
          if (min_cost > best_cost)
          {
            continue;
          }

          // add match
          Match new_match( first->input_start,
                           second->input_end,
                           first->tm_start,
                           second->tm_end,
                           min_cost,
                           max_cost,
                           internal_cost);
          multi_match[ match_level ].push_back( new_match );
          // cerr << "\tstored\n";

          // possibly updating this_best_cost
          if (max_cost < this_best_cost)
          {
            // cerr << "\tupdating this best cost to " << max_cost << "\n";
            this_best_cost = max_cost;

            // possibly updating best_cost
            if (max_cost < best_cost)
            {
              // cerr << "\tupdating best cost to " << max_cost << "\n";
              best_cost = max_cost;
            }
          }
        }
      }
    }
    match_level++;
  }
  return this_best_cost;
}
#endif

View File

@ -0,0 +1,214 @@
#!/usr/bin/perl -w
use strict;

# Build an XML-frame input file for Moses from fuzzy-match output:
# for each input sentence that has a translation-memory match, emit an
# XML frame constructed from the matched source/target pair and its
# word alignment; otherwise pass the input sentence through unchanged.

# NOTE(review): set but never checked anywhere in this script -- the
# debug prints in create_xml are unconditional; confirm intent.
my $DEBUG = 1;

my $match_file = "tm/BEST.acquis-xml-escaped.4.uniq";
my $source_file = "data/acquis.truecased.4.en.uniq";
my $target_file = "data/acquis.truecased.4.fr.uniq.most-frequent";
my $alignment_file = "data/acquis.truecased.4.align.uniq.most-frequent";
my $out_file = "data/ac-test.input.xml.4.uniq";
my $in_file = "evaluation/ac-test.input.tc.4";

#my $match_file = "tm/BEST.acquis-xml-escaped.4";
#my $source_file = "corpus/acquis.truecased.4.en";
#my $target_file = "corpus/acquis.truecased.4.fr";
#my $alignment_file = "model/aligned.4.grow-diag-final-and";
#my $out_file = "data/ac-test.input.xml.4";
#my $in_file = "evaluation/ac-test.input.tc.4";

#my $match_file = "tm/BEST.acquis.with";
#my $source_file = "../acquis-truecase/corpus/acquis.truecased.190.en";
#my $target_file = "../acquis-truecase/corpus/acquis.truecased.190.fr";
#my $alignment_file = "../acquis-truecase/model/aligned.190.grow-diag-final-and";
#my $out_file = "data/ac-test.input.xml";
#my $in_file = "evaluation/ac-test.input.tc.1";

my @INPUT = `cat $in_file`; chop(@INPUT);
my @SOURCE = `cat $source_file`; chop(@SOURCE);
my @TARGET = `cat $target_file`; chop(@TARGET);
my @ALIGNMENT = `cat $alignment_file`; chop(@ALIGNMENT);

# fail early instead of silently reading/writing nothing
open(MATCH,$match_file) or die "cannot open match file $match_file: $!";
open(FRAME,">$out_file") or die "cannot write output file $out_file: $!";

# one line of match data per input sentence
# (was a hard-coded count of 4107, which only worked for one data set)
for(my $i=0;$i<scalar(@INPUT);$i++) {
  # get match data: "score ||| matched-sentence-id ||| edit-path"
  my $match = <MATCH>;
  chop($match);
  my ($score,$sentence,$path) = split(/ \|\|\| /,$match);

  # construct frame
  if ($sentence < 1e9 && $sentence >= 0) {
    my $frame = &create_xml($SOURCE[$sentence],
                            $INPUT[$i],
                            $TARGET[$sentence],
                            $ALIGNMENT[$sentence],
                            $path);
    print FRAME $frame."\n";
  }
  # no frame -> output source
  else {
    print FRAME $INPUT[$i]."\n";
  }
}
close(FRAME);
close(MATCH);
# Build an XML frame for one input sentence from its fuzzy match.
# Arguments:
#   $source    - matched tm source sentence (space-separated tokens)
#   $input     - input sentence to be translated
#   $target    - tm target sentence
#   $alignment - word alignment "s-t s-t ..." between source and target
#   $path      - edit path from input to source (letters such as M/I/D)
# Returns the frame string: kept target spans wrapped in
# <xml translation="..."> markup, interleaved with the input words
# that still need to be translated.
# NOTE(review): the prints below are unconditional debug output;
# the script's $DEBUG flag is never consulted -- confirm intent.
sub create_xml {
  my ($source,$input,$target,$alignment,$path) = @_;
  my @INPUT = split(/ /,$input);
  my @SOURCE = split(/ /,$source);
  my @TARGET = split(/ /,$target);
  my %ALIGN = &create_alignment($alignment);
  # %FRAME_INPUT: target position -> input words to insert before it
  my %FRAME_INPUT;
  # @TARGET_BITMAP: 1 = target word kept, 0 = removed by a mismatch
  my @TARGET_BITMAP;
  foreach (@TARGET) { push @TARGET_BITMAP,1 }

  ### STEP 1: FIND MISMATCHES
  # walk the edit path, tracking source position $s and input position $i
  my ($s,$i) = (0,0);
  my $currently_matching = 0;
  my ($start_s,$start_i) = (0,0);
  $path .= "X"; # indicate end
  print "$input\n$source\n$target\n$path\n";
  for(my $p=0;$p<length($path);$p++) {
    my $action = substr($path,$p,1);
    # beginning of a mismatch
    if ($currently_matching && $action ne "M" && $action ne "X") {
      $start_i = $i;
      $start_s = $s;
      $currently_matching = 0;
    }
    # end of a mismatch
    elsif (!$currently_matching &&
           ($action eq "M" || $action eq "X")) {
      # remove use of affected target words
      for(my $ss = $start_s; $ss<$s; $ss++) {
        foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) {
          $TARGET_BITMAP[$tt] = 0;
        }
        # also remove enclosed unaligned words?
      }
      # are there input words that need to be inserted ?
      print "($start_i<$i)?\n";
      if ($start_i<$i) {
        # take note of input words to be inserted
        my $insertion = "";
        for(my $ii = $start_i; $ii<$i; $ii++) {
          $insertion .= $INPUT[$ii]." ";
        }
        # find position for inserted input words
        # find first removed target word
        my $start_t = 1000;
        for(my $ss = $start_s; $ss<$s; $ss++) {
          foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) {
            $start_t = $tt if $tt < $start_t;
          }
        }
        # end of sentence? add to end
        if ($start_t == 1000 && $i > $#INPUT) {
          $start_t = $#TARGET;
        }
        # backtrack to previous words if unaligned
        if ($start_t == 1000) {
          $start_t = -1;
          for(my $ss = $s-1; $start_t==-1 && $ss>=0; $ss--) {
            foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) {
              $start_t = $tt if $tt > $start_t;
            }
          }
        }
        $FRAME_INPUT{$start_t} .= $insertion;
      }
      $currently_matching = 1;
    }
    # debug trace: action, positions, and aligned target words
    print "$action $s $i ($start_s $start_i) $currently_matching";
    if ($action ne "I") {
      print " ->";
      foreach my $tt (keys %{${$ALIGN{'s'}}[$s]}) {
        print " ".$tt;
      }
    }
    print "\n";
    # 'I' consumes no source word, 'D' consumes no input word
    $s++ unless $action eq "I";
    $i++ unless $action eq "D";
  }
  print $target."\n";
  foreach (@TARGET_BITMAP) { print $_; } print "\n";
  foreach (sort keys %FRAME_INPUT) {
    print "$_: $FRAME_INPUT{$_}\n";
  }

  ### STEP 2: BUILD FRAME
  # modify frame
  my $frame = "";
  # input words to be inserted before any target word
  $frame = $FRAME_INPUT{-1} if defined $FRAME_INPUT{-1};
  my $currently_included = 0;
  my $start_t = -1;
  push @TARGET_BITMAP,0; # indicate end
  for(my $t=0;$t<=scalar(@TARGET);$t++) {
    # beginning of tm target inclusion
    if (!$currently_included && $TARGET_BITMAP[$t]) {
      $start_t = $t;
      $currently_included = 1;
    }
    # end of tm target inclusion (not included word or inserted input)
    elsif ($currently_included &&
           (!$TARGET_BITMAP[$t] || defined($FRAME_INPUT{$t}))) {
      # add xml (unless change is at the beginning of the sentence
      if ($start_t >= 0) {
        my $target = "";
        print "for(tt=$start_t;tt<$t+$TARGET_BITMAP[$t]);\n";
        for(my $tt=$start_t;$tt<$t+$TARGET_BITMAP[$t];$tt++) {
          $target .= $TARGET[$tt] . " ";
        }
        chop($target);
        $frame .= "<xml translation=\"$target\"> x </xml> ";
      }
      $currently_included = 0;
    }
    # splice in any input words queued for this target position
    $frame .= $FRAME_INPUT{$t} if defined $FRAME_INPUT{$t};
    print "$TARGET_BITMAP[$t] $t ($start_t) $currently_included\n";
  }
  print $frame."\n-------------------------------------\n";
  return $frame;
}
# Parse a word-alignment string ("0-0 1-2 ...") into two lookup tables:
# 's' maps each source position to a hash of aligned target positions,
# 't' maps each target position to a hash of aligned source positions.
sub create_alignment {
  my ($alignment_string) = @_;
  my (@source_to_target,@target_to_source);
  foreach my $link (split(/ /,$alignment_string)) {
    my ($src,$tgt) = split(/\-/,$link);
    $source_to_target[$src]{$tgt}++;
    $target_to_source[$tgt]{$src}++;
  }
  return ( 's' => \@source_to_target, 't' => \@target_to_source );
}

View File

@ -0,0 +1,982 @@
#include <stdio.h>
#include <stdlib.h>
#include <getopt.h>
#include <vector>
#include <map>
#include <string>
#include <algorithm>
#include <iostream>
#include <fstream>
#include <cstring>
#include <time.h>
#include "Vocabulary.h"
#include "SuffixArray.h"
/** This implementation is explained in
Koehn and Senellart: "Fast Approximate String Matching
with Suffix Arrays and A* Parsing" (AMTA 2010) ***/
using namespace std;
Vocabulary vocabulary;          // global word <-> WORD_ID mapping

// runtime settings, toggled by the command-line options parsed in main()
int basic_flag = false;         // --basic: brute-force comparison against every corpus sentence
int lsed_flag = true;           // cleared by --word: word-level instead of letter-level edit distance
int refined_flag = true;        // cleared by --unrefined: disables the short-match refinement
int length_filter_flag = true;  // cleared by --nolengthfilter: skip the sentence-length pre-filter
int parse_flag = true;          // cleared by --noparse: disables A* parsing of matches
int min_match = 70;             // --minmatch 1..100: minimum match percentage
int multiple_flag = false;      // set by --multiple (semantics not visible in this chunk)
int multiple_slack = 0;         // NOTE(review): defined but unused in the visible code
int multiple_max = 100;         // NOTE(review): defined but unused in the visible code
/* Read a tokenized corpus file, one sentence per line; each line is
   mapped to vocabulary word IDs and appended to `corpus`.
   Exits the process if the file cannot be opened. */
void load_corpus( char* fileName, vector< vector< WORD_ID > > &corpus )
{
  ifstream fileStream;
  fileStream.open(fileName);
  if (!fileStream) {
    cerr << "file not found: " << fileName << endl;
    exit(1);
  }
  istream *fileStreamP = &fileStream;

  // SAFE_GETLINE and LINE_MAX_LENGTH come from the suffix-array headers;
  // lines longer than LINE_MAX_LENGTH cannot be represented here
  char line[LINE_MAX_LENGTH];
  while(true)
  {
    SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n');
    if (fileStreamP->eof()) break;
    corpus.push_back( vocabulary.Tokenize( line ) );
  }
}
/* Letter string edit distance, e.g. sub 'their' to 'there' costs 2 */
// global cache for word pairs
map< pair< WORD_ID, WORD_ID >, unsigned int > lsed;

/* Character-level Levenshtein distance between the surface forms of
   two vocabulary words. Results are memoized in the global `lsed`
   cache, keyed by the ordered word-ID pair. */
unsigned int letter_sed( WORD_ID aIdx, WORD_ID bIdx )
{
  // check if already computed -> lookup in cache
  pair< WORD_ID, WORD_ID > pIdx = make_pair( aIdx, bIdx );
  map< pair< WORD_ID, WORD_ID >, unsigned int >::const_iterator lookup = lsed.find( pIdx );
  if (lookup != lsed.end())
  {
    return (lookup->second);
  }

  // get surface strings for word indices
  const string &a = vocabulary.GetWord( aIdx );
  const string &b = vocabulary.GetWord( bIdx );

  // cost matrix; std::vector replaces the original calloc/free pairs
  // (RAII: no manual cleanup, exception safe)
  vector< vector< unsigned int > > cost( a.size()+1, vector< unsigned int >( b.size()+1, 0 ) );
  for( unsigned int i=0; i<=a.size(); i++ ) {
    cost[i][0] = i;
  }
  for( unsigned int j=0; j<=b.size(); j++ ) {
    cost[0][j] = j;
  }

  // core string edit distance loop
  for( unsigned int i=1; i<=a.size(); i++ ) {
    for( unsigned int j=1; j<=b.size(); j++ ) {
      unsigned int ins = cost[i-1][j] + 1;
      unsigned int del = cost[i][j-1] + 1;
      // direct char comparison, equivalent to the old substr().compare()
      bool match = ( a[i-1] == b[j-1] );
      unsigned int diag = cost[i-1][j-1] + (match ? 0 : 1);

      unsigned int best = (ins < del) ? ins : del;
      best = (diag < best) ? diag : best;
      cost[i][j] = best;
    }
  }

  // cache and return result
  unsigned int final = cost[a.size()][b.size()];
  lsed[ pIdx ] = final;
  return final;
}
/* string edit distance implementation */
unsigned int sed( const vector< WORD_ID > &a, const vector< WORD_ID > &b, string &best_path, bool use_letter_sed ) {
// initialize cost and path matrices
unsigned int **cost = (unsigned int**) calloc( sizeof( unsigned int* ), a.size()+1 );
char **path = (char**) calloc( sizeof( char* ), a.size()+1 );
for( unsigned int i=0; i<=a.size(); i++ ) {
cost[i] = (unsigned int*) calloc( sizeof(unsigned int), b.size()+1 );
path[i] = (char*) calloc( sizeof(char), b.size()+1 );
if (i>0)
{
cost[i][0] = cost[i-1][0];
if (use_letter_sed)
{
cost[i][0] += vocabulary.GetWord( a[i-1] ).size();
}
else
{
cost[i][0]++;
}
}
else
{
cost[i][0] = 0;
}
path[i][0] = 'I';
}
for( unsigned int j=0; j<=b.size(); j++ ) {
if (j>0)
{
cost[0][j] = cost[0][j-1];
if (use_letter_sed)
{
cost[0][j] += vocabulary.GetWord( b[j-1] ).size();
}
else
{
cost[0][j]++;
}
}
else
{
cost[0][j] = 0;
}
path[0][j] = 'D';
}
// core string edit distance algorithm
for( unsigned int i=1; i<=a.size(); i++ ) {
for( unsigned int j=1; j<=b.size(); j++ ) {
unsigned int ins = cost[i-1][j];
unsigned int del = cost[i][j-1];
unsigned int match;
if (use_letter_sed)
{
ins += vocabulary.GetWord( a[i-1] ).size();
del += vocabulary.GetWord( b[j-1] ).size();
match = letter_sed( a[i-1], b[j-1] );
}
else
{
ins++;
del++;
match = ( a[i-1] == b[j-1] ) ? 0 : 1;
}
unsigned int diag = cost[i-1][j-1] + match;
char action = (ins < del) ? 'I' : 'D';
unsigned int min = (ins < del) ? ins : del;
if (diag < min)
{
action = (match>0) ? 'S' : 'M';
min = diag;
}
cost[i][j] = min;
path[i][j] = action;
}
}
// construct string for best path
unsigned int i = a.size();
unsigned int j = b.size();
best_path = "";
while( i>0 || j>0 )
{
best_path = path[i][j] + best_path;
if (path[i][j] == 'I')
{
i--;
}
else if (path[i][j] == 'D')
{
j--;
}
else
{
i--;
j--;
}
}
// clear out memory
unsigned int final = cost[a.size()][b.size()];
for( unsigned int i=0; i<=a.size(); i++ ) {
free( cost[i] );
free( path[i] );
}
free( cost );
free( path );
// return result
return final;
}
/* utility function: number of characters in a sentence
   (spaces do not count) */
unsigned int compute_length( const vector< WORD_ID > &sentence )
{
  unsigned int total = 0;
  for( unsigned int w=0; w<sentence.size(); w++ )
  {
    total += vocabulary.GetWord( sentence[w] ).size();
  }
  return total;
}
/* brute force method: compare input to all corpus sentences */
/* For every input sentence, compute the string edit distance to every
   corpus sentence (subject to the length pre-filter) and print
   "best_cost ||| best_match_index ||| edit_path" to stdout
   (best_match_index is -1 if nothing beat the threshold).
   Returns 0; the previous version was declared int but fell off the
   end without a return statement (undefined behavior). */
int basic_fuzzy_match( vector< vector< WORD_ID > > source,
                       vector< vector< WORD_ID > > input )
{
  // go through input set...
  for(unsigned int i=0;i<input.size();i++)
  {
    // NOTE(review): letter-based SED is hard-wired off here, so the
    // global lsed_flag has no effect on this method -- confirm intended
    bool use_letter_sed = false;

    // compute sentence length and worst allowed cost
    unsigned int input_length;
    if (use_letter_sed)
    {
      input_length = compute_length( input[i] );
    }
    else
    {
      input_length = input[i].size();
    }
    unsigned int best_cost = input_length * (100-min_match) / 100 + 2;
    string best_path = "";
    int best_match = -1;

    // go through all corpus sentences
    for(unsigned int s=0;s<source.size();s++)
    {
      int source_length;
      if (use_letter_sed)
      {
        source_length = compute_length( source[s] );
      }
      else
      {
        source_length = source[s].size();
      }
      int diff = abs((int)source_length - (int)input_length);
      // length difference alone already exceeds best cost -> skip
      // (explicit cast avoids a signed/unsigned comparison)
      if (length_filter_flag && (diff >= (int)best_cost))
      {
        continue;
      }

      // compute string edit distance
      string path;
      unsigned int cost = sed( input[i], source[s], path, use_letter_sed );

      // update if new best
      if (cost < best_cost)
      {
        best_cost = cost;
        best_path = path;
        best_match = s;
      }
    }
    cout << best_cost << " ||| " << best_match << " ||| " << best_path << endl;
  }
  return 0;
}
#define MAX_MATCH_COUNT 10000000

/* data structure for n-gram match between input and corpus */
class Match {
public:
  int input_start;    // first matched word position in the input sentence
  int input_end;      // last matched word position in the input sentence (inclusive)
  int tm_start;       // first matched word position in the tm sentence
  int tm_end;         // last matched word position in the tm sentence (inclusive)
  int min_cost;       // lower bound on full-sentence edit cost implied by this match
  int max_cost;       // upper bound on full-sentence edit cost implied by this match
  int internal_cost;  // edit cost accumulated inside a combined span (0 for n-gram matches)
  Match( int is, int ie, int ts, int te, int min, int max, int i )
    :input_start(is), input_end(ie), tm_start(ts), tm_end(te), min_cost(min), max_cost(max), internal_cost(i)
  {}
};
// input word -> its positions in the current input sentence;
// built per sentence by init_short_matches(), read by add_short_matches()
map< WORD_ID,vector< int > > single_word_index;
/* Maximum n-gram length that counts as a "short" match.
   Very short n-gram matches (1-grams) are never looked up in the
   suffix array, since there are too many of them, and for longer
   sentences at least one 2-gram match must occur. */
inline int short_match_max_length( int input_length )
{
  // short-match handling only applies in refined mode, and only once
  // the input sentence is long enough (>= 5 words)
  return ( refined_flag && input_length >= 5 ) ? 1 : 0;
}
/* if we have non-short matches in a sentence, we need to
   take a closer look at it.
   this function creates a hash map for all input words and their positions
   (to be used by the next function)
   (done here, because this has to be done only once per input sentence) */
void init_short_matches( const vector< WORD_ID > &input )
{
  // nothing to do unless short matches are enabled for this length
  int max_length = short_match_max_length( input.size() );
  if (max_length == 0)
    return;

  single_word_index.clear();

  // store each input word's positions; map::operator[] default-constructs
  // the position vector on first access, so the previous explicit
  // find-then-insert sequence was redundant
  for(int i=0; i<(int)input.size(); i++)
  {
    single_word_index[ input[i] ].push_back( i );
  }
}
/* add all short matches to list of matches for a sentence */
/* For every tm (corpus) word that also occurs in the input sentence
   (per single_word_index, built by init_short_matches), create a
   one-word Match with lower/upper bounds on the full-sentence edit
   cost implied by the word positions; only matches whose lower bound
   does not exceed best_cost are appended to `match`. */
void add_short_matches( vector< Match > &match, const vector< WORD_ID > &tm, int input_length, int best_cost )
{
  // short matches disabled for this input length -> nothing to add
  int max_length = short_match_max_length( input_length );
  if (max_length == 0)
    return;

  int tm_length = tm.size();
  map< WORD_ID,vector< int > >::iterator input_word_hit;
  for(int t_pos=0; t_pos<tm.size(); t_pos++)
  {
    // does this tm word occur anywhere in the input sentence?
    input_word_hit = single_word_index.find( tm[t_pos] );
    if (input_word_hit != single_word_index.end())
    {
      // one candidate match per occurrence in the input
      vector< int > &position_vector = input_word_hit->second;
      for(int j=0; j<position_vector.size(); j++)
      {
        int &i_pos = position_vector[j];

        // before match: at least |i_pos - t_pos| edits are needed,
        // at most max(i_pos, t_pos) (rewrite the longer prefix)
        int max_cost = max( i_pos , t_pos );
        int min_cost = abs( i_pos - t_pos );
        // equal-length prefixes that are not at sentence start must
        // still differ somewhere -> cost is at least 1
        if ( i_pos>0 && i_pos == t_pos )
          min_cost++;

        // after match: same reasoning for the remaining suffixes
        max_cost += max( (input_length-i_pos) , (tm_length-t_pos));
        min_cost += abs( (input_length-i_pos) - (tm_length-t_pos));
        if ( i_pos != input_length-1 && (input_length-i_pos) == (tm_length-t_pos))
          min_cost++;

        // keep only matches that could still beat the best cost
        if (min_cost <= best_cost)
        {
          Match new_match( i_pos,i_pos, t_pos,t_pos, min_cost,max_cost,0 );
          match.push_back( new_match );
        }
      }
    }
  }
}
/* remove matches that are subsumed by a larger match */
/* A match i is subsumed if some other match j spans at least as many
   input words and shares either its start point or its end point (in
   both input and tm coordinates). Matches whose minimum cost exceeds
   best_cost are dropped as well. Surviving matches are returned in
   reverse order of the original list. */
vector< Match > prune_matches( const vector< Match > &match, int best_cost )
{
  vector< Match > pruned;
  for(int i=match.size()-1; i>=0; i--)
  {
    bool subsumed = false;
    for(int j=match.size()-1; j>=0; j--)
    {
      if (i!=j // do not compare match with itself
          && ( match[i].input_end - match[i].input_start <=
               match[j].input_end - match[j].input_start ) // i shorter than j
          && ((match[i].input_start == match[j].input_start &&
               match[i].tm_start == match[j].tm_start ) ||
              (match[i].input_end == match[j].input_end &&
               match[i].tm_end == match[j].tm_end) ) )
      {
        subsumed = true;
        break; // one subsuming match is enough; no need to keep scanning
      }
    }
    if (! subsumed && match[i].min_cost <= best_cost)
    {
      pruned.push_back( match[i] );
    }
  }
  return pruned;
}
/* A* parsing method to compute string edit distance */
/* Combines n-gram matches bottom-up into ever larger spans:
   a level-k span joins a level-i and a level-j span (i+j+1 == k)
   that overlap in neither input nor tm coordinates. Every span
   carries a lower bound (min_cost) and an upper bound (max_cost)
   on the full-sentence edit cost; combinations whose lower bound
   already exceeds best_cost are discarded, and best_cost is
   tightened in place whenever a new span's upper bound beats it.
   Returns the best upper bound found. */
int parse_matches( vector< Match > &match, int input_length, int tm_length, int &best_cost )
{
  // cerr << "sentence has " << match.size() << " matches, best cost: " << best_cost << ", lengths input: " << input_length << " tm: " << tm_length << endl;

  // trivial cases: a single match, or none at all
  if (match.size() == 1)
    return match[0].max_cost;
  if (match.size() == 0)
    return input_length+tm_length;

  // start from the tightest single-match upper bound
  int this_best_cost = input_length + tm_length;
  for(int i=0;i<match.size();i++)
  {
    this_best_cost = min( this_best_cost, match[i].max_cost );
  }
  // cerr << "\tthis best cost: " << this_best_cost << endl;

  // bottom up combination of spans
  vector< vector< Match > > multi_match;
  multi_match.push_back( match );

  int match_level = 1;
  while(multi_match[ match_level-1 ].size()>0)
  {
    // init vector
    vector< Match > empty;
    multi_match.push_back( empty );

    // iterating first_level only up to (match_level-1)/2 visits each
    // unordered level pair exactly once
    for(int first_level = 0; first_level <= (match_level-1)/2; first_level++)
    {
      int second_level = match_level - first_level -1;
      //cerr << "\tcombining level " << first_level << " and " << second_level << endl;

      vector< Match > &first_match = multi_match[ first_level ];
      vector< Match > &second_match = multi_match[ second_level ];

      for(int i1 = 0; i1 < first_match.size(); i1++) {
        for(int i2 = 0; i2 < second_match.size(); i2++) {

          // do not combine the same pair twice
          if (first_level == second_level && i2 <= i1)
          {
            continue;
          }

          // get sorted matches (first is before second)
          Match *first, *second;
          if (first_match[i1].input_start < second_match[i2].input_start )
          {
            first = &first_match[i1];
            second = &second_match[i2];
          }
          else
          {
            second = &first_match[i1];
            first = &second_match[i2];
          }

          //cerr << "\tcombining "
          //     << "(" << first->input_start << "," << first->input_end << "), "
          //     << first->tm_start << " [" << first->internal_cost << "]"
          //     << " with "
          //     << "(" << second->input_start << "," << second->input_end << "), "
          //     << second->tm_start<< " [" << second->internal_cost << "]"
          //     << endl;

          // do not process overlapping matches
          if (first->input_end >= second->input_start)
          {
            continue;
          }

          // no overlap / mismatch in tm
          if (first->tm_end >= second->tm_start)
          {
            continue;
          }

          // compute cost
          int min_cost = 0;
          int max_cost = 0;

          // initial: words before the first span
          min_cost += abs( first->input_start - first->tm_start );
          max_cost += max( first->input_start, first->tm_start );
          // same number of words, but not sent. start -> cost is at least 1
          if (first->input_start == first->tm_start && first->input_start > 0)
          {
            min_cost++;
          }

          // in-between: words between the two combined spans
          int skipped_words = second->input_start - first->input_end -1;
          int skipped_words_tm = second->tm_start - first->tm_end -1;
          int internal_cost = max( skipped_words, skipped_words_tm );
          internal_cost += first->internal_cost + second->internal_cost;
          min_cost += internal_cost;
          max_cost += internal_cost;

          // final: words after the second span
          min_cost += abs( (tm_length-1 - second->tm_end) -
                           (input_length-1 - second->input_end) );
          max_cost += max( (tm_length-1 - second->tm_end),
                           (input_length-1 - second->input_end) );
          // same number of words, but not sent. end -> cost is at least 1
          if ( ( input_length-1 - second->input_end
                 == tm_length-1 - second->tm_end )
               && input_length-1 != second->input_end )
          {
            min_cost++;
          }

          // cerr << "\tcost: " << min_cost << "-" << max_cost << endl;

          // if worse than best cost, forget it
          if (min_cost > best_cost)
          {
            continue;
          }

          // add match
          Match new_match( first->input_start,
                           second->input_end,
                           first->tm_start,
                           second->tm_end,
                           min_cost,
                           max_cost,
                           internal_cost);
          multi_match[ match_level ].push_back( new_match );
          // cerr << "\tstored\n";

          // possibly updating this_best_cost
          if (max_cost < this_best_cost)
          {
            // cerr << "\tupdating this best cost to " << max_cost << "\n";
            this_best_cost = max_cost;

            // possibly updating best_cost
            if (max_cost < best_cost)
            {
              // cerr << "\tupdating best cost to " << max_cost << "\n";
              best_cost = max_cost;
            }
          }
        }
      }
    }
    match_level++;
  }
  return this_best_cost;
}
int main(int argc, char* argv[])
{
vector< vector< WORD_ID > > source, input;
while(1) {
static struct option long_options[] = {
{"basic", no_argument, &basic_flag, 1},
{"word", no_argument, &lsed_flag, 0},
{"unrefined", no_argument, &refined_flag, 0},
{"nolengthfilter", no_argument, &length_filter_flag, 0},
{"noparse", no_argument, &parse_flag, 0},
{"multiple", no_argument, &multiple_flag, 1},
{"minmatch", required_argument, 0, 'm'},
{0, 0, 0, 0}
};
int option_index = 0;
int c = getopt_long (argc, argv, "m:", long_options, &option_index);
if (c == -1) break;
switch (c) {
case 0:
// if (long_options[option_index].flag != 0)
// break;
// printf ("option %s", long_options[option_index].name);
// if (optarg)
// printf (" with arg %s", optarg);
// printf ("\n");
break;
case 'm':
min_match = atoi(optarg);
if (min_match < 1 || min_match > 100) {
cerr << "error: --minmatch must have value in range 1..100\n";
exit(1);
}
cerr << "setting min match to " << min_match << endl;
break;
default:
cerr << "usage: syntax: ./fuzzy-match input corpus [--basic] [--word] [--minmatch 1..100]\n";
exit(1);
}
}
if (lsed_flag) { cerr << "lsed\n"; }
if (basic_flag) { cerr << "basic\n"; }
if (refined_flag) { cerr << "refined\n"; }
if (length_filter_flag) { cerr << "length filter\n"; }
if (parse_flag) { cerr << "parse\n"; }
// exit(1);
if (optind+2 != argc) {
cerr << "syntax: ./fuzzy-match input corpus [--basic] [--word] [--minmatch 1..100]\n";
exit(1);
}
cerr << "loading corpus...\n";
load_corpus(argv[optind], input);
load_corpus(argv[optind+1], source);
// ./fuzzy-match input corpus [-basic]
// load_corpus("../corpus/tm.truecased.4.en", source);
// load_corpus("../corpus/tm.truecased.4.it", target);
// load_corpus("../evaluation/test.input.tc.4", input);
// load_corpus("../../acquis-truecase/corpus/acquis.truecased.190.en", source);
// load_corpus("../../acquis-truecase/evaluation/ac-test.input.tc.190", input);
// load_corpus("../corpus/tm.truecased.16.en", source);
// load_corpus("../evaluation/test.input.tc.16", input);
if (basic_flag) {
cerr << "using basic method\n";
clock_t start_main_clock2 = clock();
basic_fuzzy_match( source, input );
cerr << "total: " << (1000 * (clock()-start_main_clock2) / CLOCKS_PER_SEC) << endl;
exit(1);
}
cerr << "number of input sentences " << input.size() << endl;
cerr << "creating suffix array...\n";
// SuffixArray suffixArray( "../corpus/tm.truecased.4.en" );
// SuffixArray suffixArray( "../../acquis-truecase/corpus/acquis.truecased.190.en" );
SuffixArray suffixArray( argv[optind+1] );
clock_t start_main_clock = clock();
// looping through all input sentences...
cerr << "looping...\n";
for(unsigned int i=0;i<input.size();i++)
{
clock_t start_clock = clock();
// if (i % 10 == 0) cerr << ".";
int input_id = i; // clean up this mess!
// establish some basic statistics
// int input_length = compute_length( input[i] );
int input_length = input[i].size();
int best_cost = input_length * (100-min_match) / 100 + 1;
int match_count = 0; // how many substring matches to be considered
//cerr << endl << "sentence " << i << ", length " << input_length << ", best_cost " << best_cost << endl;
// find match ranges in suffix array
vector< vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > > match_range;
for(size_t start=0;start<input[i].size();start++)
{
SuffixArray::INDEX prior_first_match = 0;
SuffixArray::INDEX prior_last_match = suffixArray.GetSize()-1;
vector< string > substring;
bool stillMatched = true;
vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > matchedAtThisStart;
//cerr << "start: " << start;
for(int word=start; stillMatched && word<input[i].size(); word++)
{
substring.push_back( vocabulary.GetWord( input[i][word] ) );
// only look up, if needed (i.e. no unnecessary short gram lookups)
// if (! word-start+1 <= short_match_max_length( input_length ) )
// {
SuffixArray::INDEX first_match, last_match;
stillMatched = false;
if (suffixArray.FindMatches( substring, first_match, last_match, prior_first_match, prior_last_match ) )
{
stillMatched = true;
matchedAtThisStart.push_back( make_pair( first_match, last_match ) );
//cerr << " (" << first_match << "," << last_match << ")";
//cerr << " " << ( last_match - first_match + 1 );
prior_first_match = first_match;
prior_last_match = last_match;
}
//}
}
//cerr << endl;
match_range.push_back( matchedAtThisStart );
}
clock_t clock_range = clock();
map< int, vector< Match > > sentence_match;
map< int, int > sentence_match_word_count;
// go through all matches, longest first
for(int length = input[i].size(); length >= 1; length--)
{
// do not create matches, if these are handled by the short match function
if (length <= short_match_max_length( input_length ) )
{
continue;
}
unsigned int count = 0;
for(int start = 0; start <= input[i].size() - length; start++)
{
if (match_range[start].size() >= length)
{
pair< SuffixArray::INDEX, SuffixArray::INDEX > &range = match_range[start][length-1];
// cerr << " (" << range.first << "," << range.second << ")";
count += range.second - range.first + 1;
for(SuffixArray::INDEX i=range.first; i<=range.second; i++)
{
int position = suffixArray.GetPosition( i );
// sentence length mismatch
size_t sentence_id = suffixArray.GetSentence( position );
int sentence_length = suffixArray.GetSentenceLength( sentence_id );
int diff = abs( (int)sentence_length - (int)input_length );
// cerr << endl << i << "\tsentence " << sentence_id << ", length " << sentence_length;
//if (length <= 2 && input_length>=5 &&
// sentence_match.find( sentence_id ) == sentence_match.end())
// continue;
if (diff > best_cost)
continue;
// compute minimal cost
int start_pos = suffixArray.GetWordInSentence( position );
int end_pos = start_pos + length-1;
// cerr << endl << "\t" << start_pos << "-" << end_pos << " (" << sentence_length << ") vs. "
// << start << "-" << (start+length-1) << " (" << input_length << ")";
// different number of prior words -> cost is at least diff
int min_cost = abs( start - start_pos );
// same number of words, but not sent. start -> cost is at least 1
if (start == start_pos && start>0)
min_cost++;
// different number of remaining words -> cost is at least diff
min_cost += abs( ( sentence_length-1 - end_pos ) -
( input_length-1 - (start+length-1) ) );
// same number of words, but not sent. end -> cost is at least 1
if ( sentence_length-1 - end_pos ==
input_length-1 - (start+length-1)
&& end_pos != sentence_length-1 )
min_cost++;
// cerr << " -> min_cost " << min_cost;
if (min_cost > best_cost)
continue;
// valid match
match_count++;
// compute maximal cost
int max_cost = max( start, start_pos )
+ max( sentence_length-1 - end_pos,
input_length-1 - (start+length-1) );
// cerr << ", max_cost " << max_cost;
Match m = Match( start, start+length-1,
start_pos, start_pos+length-1,
min_cost, max_cost, 0);
sentence_match[ sentence_id ].push_back( m );
sentence_match_word_count[ sentence_id ] += length;
if (max_cost < best_cost)
{
best_cost = max_cost;
if (best_cost == 0) break;
}
//if (match_count >= MAX_MATCH_COUNT) break;
}
}
// cerr << endl;
if (best_cost == 0) break;
//if (match_count >= MAX_MATCH_COUNT) break;
}
// cerr << count << " matches at length " << length << " in " << sentence_match.size() << " tm." << endl;
if (best_cost == 0) break;
//if (match_count >= MAX_MATCH_COUNT) break;
}
cerr << match_count << " matches in " << sentence_match.size() << " sentences." << endl;
clock_t clock_matches = clock();
// consider each sentence for which we have matches
int old_best_cost = best_cost;
int tm_count_word_match = 0;
int tm_count_word_match2 = 0;
int pruned_match_count = 0;
if (short_match_max_length( input_length ))
{
init_short_matches( input[i] );
}
vector< int > best_tm;
typedef map< int, vector< Match > >::iterator I;
clock_t clock_validation_sum = 0;
for(I tm=sentence_match.begin(); tm!=sentence_match.end(); tm++)
{
int tmID = tm->first;
int tm_length = suffixArray.GetSentenceLength(tmID);
vector< Match > &match = tm->second;
add_short_matches( match, source[tmID], input_length, best_cost );
//cerr << "match in sentence " << tmID << ": " << match.size() << " [" << tm_length << "]" << endl;
// quick look: how many words are matched
int words_matched = 0;
for(int m=0;m<match.size();m++) {
if (match[m].min_cost <= best_cost) // makes no difference
words_matched += match[m].input_end - match[m].input_start + 1;
}
if (max(input_length,tm_length) - words_matched > best_cost)
{
if (length_filter_flag) continue;
}
tm_count_word_match++;
// prune, check again how many words are matched
vector< Match > pruned = prune_matches( match, best_cost );
words_matched = 0;
for(int p=0;p<pruned.size();p++) {
words_matched += pruned[p].input_end - pruned[p].input_start + 1;
}
if (max(input_length,tm_length) - words_matched > best_cost)
{
if (length_filter_flag) continue;
}
tm_count_word_match2++;
pruned_match_count += pruned.size();
int prior_best_cost = best_cost;
int cost;
clock_t clock_validation_start = clock();
if (! parse_flag ||
pruned.size()>=10) // to prevent worst cases
{
string path;
cost = sed( input[input_id], source[tmID], path, false );
if (cost < best_cost)
{
best_cost = cost;
}
}
else
{
cost = parse_matches( pruned, input_length, tm_length, best_cost );
if (prior_best_cost != best_cost)
{
best_tm.clear();
}
}
clock_validation_sum += clock() - clock_validation_start;
if (cost == best_cost)
{
best_tm.push_back( tmID );
}
}
cerr << "reduced best cost from " << old_best_cost << " to " << best_cost << endl;
cerr << "tm considered: " << sentence_match.size()
<< " word-matched: " << tm_count_word_match
<< " word-matched2: " << tm_count_word_match2
<< " best: " << best_tm.size() << endl;
cerr << "pruned matches: " << ((float)pruned_match_count/(float)tm_count_word_match2) << endl;
// do not try to find the best ... report multiple matches
if (multiple_flag) {
int input_letter_length = compute_length( input[input_id] );
for(int si=0; si<best_tm.size(); si++) {
int s = best_tm[si];
string path;
unsigned int letter_cost = sed( input[input_id], source[s], path, true );
// do not report multiple identical sentences, but just their count
cout << i << " "; // sentence number
cout << letter_cost << "/" << input_letter_length << " ";
cout << "(" << best_cost <<"/" << input_length <<") ";
cout << "||| " << s << " ||| " << path << endl;
}
continue;
}
// find the best matches according to letter sed
string best_path = "";
int best_match = -1;
int best_letter_cost;
if (lsed_flag) {
best_letter_cost = compute_length( input[input_id] ) * min_match / 100 + 1;
for(int si=0; si<best_tm.size(); si++)
{
int s = best_tm[si];
string path;
unsigned int letter_cost = sed( input[input_id], source[s], path, true );
if (letter_cost < best_letter_cost)
{
best_letter_cost = letter_cost;
best_path = path;
best_match = s;
}
}
}
// if letter sed turned off, just compute path for first match
else {
if (best_tm.size() > 0) {
string path;
sed( input[input_id], source[best_tm[0]], path, false );
best_path = path;
best_match = best_tm[0];
}
}
cerr << "elapsed: " << (1000 * (clock()-start_clock) / CLOCKS_PER_SEC)
<< " ( range: " << (1000 * (clock_range-start_clock) / CLOCKS_PER_SEC)
<< " match: " << (1000 * (clock_matches-clock_range) / CLOCKS_PER_SEC)
<< " tm: " << (1000 * (clock()-clock_matches) / CLOCKS_PER_SEC)
<< " (validation: " << (1000 * (clock_validation_sum) / CLOCKS_PER_SEC) << ")"
<< " )" << endl;
if (lsed_flag) {
cout << best_letter_cost << "/" << compute_length( input[input_id] ) << " (";
}
cout << best_cost <<"/" << input_length;
if (lsed_flag) cout << ")";
cout << " ||| " << best_match << " ||| " << best_path << endl;
}
cerr << "total: " << (1000 * (clock()-start_main_clock) / CLOCKS_PER_SEC) << endl;
}

View File

@ -0,0 +1,58 @@
#!/usr/bin/perl -w
use strict;

# Collapse a parallel corpus to unique source sentences.
# For every distinct source line, collect all of its target translations
# (with occurrence counts) and their word alignments.  Outputs:
#   $src_out   - unique source sentences
#   $tgt_out   - per source: "count target" entries joined by " ||| "
#   $tgt_mf    - per source: the most frequent target only
#   $align_out - alignments, parallel to the entries in $tgt_out
#   $align_mf  - alignment of the most frequent target
# NOTE(review): input/output paths are hard-coded for the acquis data set.

my $src_in = "corpus/acquis.truecased.4.en";
my $tgt_in = "corpus/acquis.truecased.4.fr";
my $align_in = "model/aligned.4.grow-diag-final-and";

my $src_out = "data/acquis.truecased.4.en.uniq";
my $tgt_out = "data/acquis.truecased.4.fr.uniq";
my $tgt_mf = "data/acquis.truecased.4.fr.uniq.most-frequent";
my $align_out = "data/acquis.truecased.4.align.uniq";
my $align_mf = "data/acquis.truecased.4.align.uniq.most-frequent";

my (%TRANS,%ALIGN);
# fail loudly instead of silently producing empty output (opens were unchecked)
open(SRC,$src_in) or die "Cannot open $src_in: $!";
open(TGT,$tgt_in) or die "Cannot open $tgt_in: $!";
open(ALIGN,$align_in) or die "Cannot open $align_in: $!";
while(my $src = <SRC>) {
  my $tgt = <TGT>;
  my $align = <ALIGN>;
  chop($tgt);
  chop($align);
  # $src deliberately keeps its trailing newline: it is used verbatim as the
  # hash key and printed unchanged to SRC_OUT below
  $TRANS{$src}{$tgt}++;
  # keep one alignment (the last seen) per source/target pair
  $ALIGN{$src}{$tgt} = $align;
}
close(SRC);
close(TGT);
close(ALIGN);  # was missing: close the alignment input handle too

open(SRC_OUT,">$src_out") or die "Cannot open $src_out: $!";
open(TGT_OUT,">$tgt_out") or die "Cannot open $tgt_out: $!";
open(TGT_MF, ">$tgt_mf") or die "Cannot open $tgt_mf: $!";
open(ALIGN_OUT,">$align_out") or die "Cannot open $align_out: $!";
open(ALIGN_MF, ">$align_mf") or die "Cannot open $align_mf: $!";
foreach my $src (keys %TRANS) {
  print SRC_OUT $src;
  my $first = 1;
  # $best is always assigned: every source has at least one target with
  # count >= 1 > 0 (initializing it avoids an undef warning under -w)
  my ($max,$best) = (0,"");
  foreach my $tgt (keys %{$TRANS{$src}}) {
    print TGT_OUT " ||| " unless $first;
    print TGT_OUT $TRANS{$src}{$tgt}." ".$tgt;
    print ALIGN_OUT " ||| " unless $first;
    print ALIGN_OUT $ALIGN{$src}{$tgt};
    if ($TRANS{$src}{$tgt} > $max) {
      $max = $TRANS{$src}{$tgt};
      $best = $tgt;
    }
    $first = 0;
  }
  print TGT_OUT "\n";
  print ALIGN_OUT "\n";
  print TGT_MF $best."\n";
  print ALIGN_MF $ALIGN{$src}{$best}."\n";
}
close(SRC_OUT);
close(TGT_OUT);
close(TGT_MF);     # was missing
close(ALIGN_OUT);  # was missing
close(ALIGN_MF);   # was missing

View File

@ -0,0 +1,308 @@
#!/usr/bin/perl -w
use strict;
use FindBin qw($RealBin);
use File::Basename;

# Driver: fuzzy-match an input file against a translation memory, then turn
# the matches into (a) an XML frame file for constrained decoding and
# (b) a hierarchical phrase table via train-model.perl.
#
# usage: <script> <input> <tm-source> <tm-target> <tm-alignment> <lex> <pt-out>

my $DEBUG = 1;
my $OUTPUT_RULES = 1;

#my $data_root = "/Users/hieuhoang/workspace/experiment/data/tm-mt-integration/";
my $in_file = $ARGV[0]; #"$data_root/in/ac-test.input.tc.4";
my $source_file = $ARGV[1]; #"$data_root/in/acquis.truecased.4.en.uniq";
my $target_file = $ARGV[2]; #"$data_root/in/acquis.truecased.4.fr.uniq";
my $alignment_file = $ARGV[3]; #"$data_root/in/acquis.truecased.4.align.uniq";
my $lex_file = $ARGV[4]; #$data_root/in/lex.4;
my $pt_file = $ARGV[5]; #"$data_root/out/pt";

my $cmd;

# scratch directory next to the output phrase table, unique per process
my $TMPDIR=dirname($pt_file) ."/tmp.$$";
$cmd = "mkdir -p $TMPDIR";
`$cmd`;

my $match_file = "$TMPDIR/match";

# suffix array creation and extraction
$cmd = "$RealBin/fuzzy-match --multiple $in_file $source_file > $match_file";
print STDERR "$cmd \n";
`$cmd`;

# make into xml and pt
my $out_file = "$TMPDIR/ac-test.input.xml.4.uniq.multi.tuning";

# slurp the corpora into memory; array indices correspond to sentence ids
my @INPUT = `cat $in_file`; chop(@INPUT);
my @ALL_SOURCE = `cat $source_file`; chop(@ALL_SOURCE);
my @ALL_TARGET = `cat $target_file`; chop(@ALL_TARGET);
my @ALL_ALIGNMENT = `cat $alignment_file`; chop(@ALL_ALIGNMENT);

# fail loudly on unreadable/unwritable files instead of producing empty output
open(MATCH,$match_file) or die "Cannot open $match_file: $!";
open(FRAME,">$out_file") or die "Cannot open $out_file: $!";
open(RULE,">$out_file.extract") or die "Cannot open $out_file.extract: $!" if $OUTPUT_RULES;
open(RULE_INV,">$out_file.extract.inv") or die "Cannot open $out_file.extract.inv: $!" if $OUTPUT_RULES;
open(INFO,">$out_file.info") or die "Cannot open $out_file.info: $!";
while( my $match = <MATCH> ) {
  chop($match);
  # each match line: "<input-id> <match-score> ||| <tm-sentence-id> ||| <edit-path>"
  my ($score,$sentence,$path) = split(/ \|\|\| /,$match);
  $score =~ /^(\d+) (.+)/ || die;
  my ($i,$match_score) = ($1,$2);
  print STDERR "i=$i match_score=$match_score\n";

  # construct frame (skip sentinel/out-of-range sentence ids)
  if ($sentence < 1e9 && $sentence >= 0) {
    my $SOURCE = $ALL_SOURCE[$sentence];
    my @ALIGNMENT = split(/ \|\|\| /,$ALL_ALIGNMENT[$sentence]);
    my @TARGET = split(/ \|\|\| /,$ALL_TARGET[$sentence]);
    # one frame/rule per alternative target translation of the TM sentence
    for(my $j=0;$j<scalar(@TARGET);$j++) {
      $TARGET[$j] =~ /^(\d+) (.+)$/ || die;
      my ($target_count,$target) = ($1,$2);
      my ($frame,$rule_s,$rule_t,$rule_alignment,$rule_alignment_inv) =
        &create_xml($SOURCE,
                    $INPUT[$i],
                    $target,
                    $ALIGNMENT[$j],
                    $path);
      print FRAME $frame."\n";
      print RULE "$rule_s [X] ||| $rule_t [X] ||| $rule_alignment ||| $target_count\n" if $OUTPUT_RULES;
      print RULE_INV "$rule_t [X] ||| $rule_s [X] ||| $rule_alignment_inv ||| $target_count\n" if $OUTPUT_RULES;
      print INFO "$i ||| $match_score ||| $target_count\n";
    }
  }
}
close(FRAME);
close(MATCH);
close(RULE) if $OUTPUT_RULES;
close(RULE_INV) if $OUTPUT_RULES;
close(INFO);  # was missing

`LC_ALL=C sort $out_file.extract | gzip -c > $out_file.extract.sorted.gz`;
`LC_ALL=C sort $out_file.extract.inv | gzip -c > $out_file.extract.inv.sorted.gz`;

if ($OUTPUT_RULES)
{
  $cmd = "$RealBin/../../scripts/training/train-model.perl -dont-zip -first-step 6 -last-step 6 -f en -e fr -hierarchical -extract-file $out_file.extract -lexical-file $lex_file -phrase-translation-table $pt_file";
  print STDERR "Executing: $cmd \n";
  `$cmd`;
}

#$cmd = "rm -rf $TMPDIR";
#`$cmd`;
#######################################################
# Build an XML frame plus a hierarchical rule from one fuzzy match.
# Arguments:
#   $source    - TM source sentence (space-separated words)
#   $input     - input sentence to be translated
#   $target    - one TM target translation of $source
#   $alignment - source-target word alignment ("s-t s-t ...")
#   $path      - edit path from the sed alignment of input vs. TM source,
#                a string over M(atch)/S(ubst)/I(nsert)/D(elete)
# Returns ($frame,$rule_s,$rule_t,$rule_alignment,$rule_alignment_inv):
#   $frame          - TM target with mismatched regions replaced by input
#                     words, matched regions wrapped in <xml translation="...">
#   $rule_s/$rule_t - source/target side of a hierarchical rule, with
#                     [X][X] non-terminals at the mismatch positions
#   $rule_alignment / $rule_alignment_inv - rule-internal word alignment
#                     and its inverse
# NOTE(review): all prints below are debug traces on STDERR (incl. "HIEU").
sub create_xml {
my ($source,$input,$target,$alignment,$path) = @_;
print STDERR " HIEU \n $source \n $input \n $target \n $alignment \n $path \n";
my @INPUT = split(/ /,$input);
my @SOURCE = split(/ /,$source);
my @TARGET = split(/ /,$target);
my %ALIGN = &create_alignment($alignment);
# input words to be inserted into the frame, keyed by preceding target position
my %FRAME_INPUT;
# @NT: one record per non-terminal; bitmaps mark which input/target words are kept
my (@NT,@INPUT_BITMAP,@TARGET_BITMAP,%ALIGNMENT_I_TO_S);
foreach (@TARGET) { push @TARGET_BITMAP,1 }
### STEP 1: FIND MISMATCHES
# walk the edit path, tracking positions in TM source ($s) and input ($i)
my ($s,$i) = (0,0);
my $currently_matching = 0;
my ($start_s,$start_i) = (0,0);
$path .= "X"; # indicate end
print STDERR "$input\n$source\n$target\n$path\n";
for(my $p=0;$p<length($path);$p++) {
my $action = substr($path,$p,1);
# beginning of a mismatch
if ($currently_matching && $action ne "M" && $action ne "X") {
$start_i = $i;
$start_s = $s;
$currently_matching = 0;
}
# end of a mismatch
elsif (!$currently_matching &&
($action eq "M" || $action eq "X")) {
# remove use of affected target words
for(my $ss = $start_s; $ss<$s; $ss++) {
foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) {
$TARGET_BITMAP[$tt] = 0;
}
# also remove enclosed unaligned words?
}
# are there input words that need to be inserted ?
print STDERR "($start_i<$i)?\n";
if ($start_i<$i) {
# take note of input words to be inserted
my $insertion = "";
for(my $ii = $start_i; $ii<$i; $ii++) {
$insertion .= $INPUT[$ii]." ";
}
# find position for inserted input words
# find first removed target word
# 1000 acts as a sentinel for "no aligned target word found"
my $start_t = 1000;
for(my $ss = $start_s; $ss<$s; $ss++) {
foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) {
$start_t = $tt if $tt < $start_t;
}
}
# end of sentence? add to end
if ($start_t == 1000 && $i > $#INPUT) {
$start_t = $#TARGET;
}
# backtrack to previous words if unaligned
if ($start_t == 1000) {
$start_t = -1;
for(my $ss = $s-1; $start_t==-1 && $ss>=0; $ss--) {
foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) {
$start_t = $tt if $tt > $start_t;
}
}
}
$FRAME_INPUT{$start_t} .= $insertion;
my %NT = ("start_t" => $start_t,
"start_i" => $start_i );
push @NT,\%NT;
}
$currently_matching = 1;
}
print STDERR "$action $s $i ($start_s $start_i) $currently_matching";
if ($action ne "I") {
print STDERR " ->";
foreach my $tt (keys %{${$ALIGN{'s'}}[$s]}) {
print STDERR " ".$tt;
}
}
print STDERR "\n";
# advance positions: insertions consume no source word, deletions no input word
$s++ unless $action eq "I";
$i++ unless $action eq "D";
$ALIGNMENT_I_TO_S{$i} = $s unless $action eq "D";
push @INPUT_BITMAP, 1 if $action eq "M";
push @INPUT_BITMAP, 0 if $action eq "I" || $action eq "S";
}
print STDERR $target."\n";
foreach (@TARGET_BITMAP) { print STDERR $_; } print STDERR "\n";
foreach (sort keys %FRAME_INPUT) {
print STDERR "$_: $FRAME_INPUT{$_}\n";
}
### STEP 2: BUILD RULE AND FRAME
# hierarchical rule
# source side: matched input words plus [X][X] at each non-terminal position
my $rule_s = "";
my $rule_pos_s = 0;
my %RULE_ALIGNMENT_S;
for(my $i=0;$i<scalar(@INPUT_BITMAP);$i++) {
if ($INPUT_BITMAP[$i]) {
$rule_s .= $INPUT[$i]." ";
$RULE_ALIGNMENT_S{$ALIGNMENT_I_TO_S{$i}} = $rule_pos_s++;
}
foreach my $NT (@NT) {
if ($i == $$NT{"start_i"}) {
$rule_s .= "[X][X] ";
$$NT{"rule_pos_s"} = $rule_pos_s++;
}
}
}
# target side: kept TM target words plus [X][X] (loop starts at -1 so a
# non-terminal anchored before the first word is also emitted)
my $rule_t = "";
my $rule_pos_t = 0;
my %RULE_ALIGNMENT_T;
for(my $t=-1;$t<scalar(@TARGET_BITMAP);$t++) {
if ($t>=0 && $TARGET_BITMAP[$t]) {
$rule_t .= $TARGET[$t]." ";
$RULE_ALIGNMENT_T{$t} = $rule_pos_t++;
}
foreach my $NT (@NT) {
if ($t == $$NT{"start_t"}) {
$rule_t .= "[X][X] ";
$$NT{"rule_pos_t"} = $rule_pos_t++;
}
}
}
# rule-internal alignment: word-word links first, then NT-NT links
my $rule_alignment = "";
foreach my $s (sort { $a <=> $b} keys %RULE_ALIGNMENT_S) {
foreach my $t (keys %{$ALIGN{"s"}[$s]}) {
next unless defined($RULE_ALIGNMENT_T{$t});
$rule_alignment .= $RULE_ALIGNMENT_S{$s}."-".$RULE_ALIGNMENT_T{$t}." ";
}
}
foreach my $NT (@NT) {
$rule_alignment .= $$NT{"rule_pos_s"}."-".$$NT{"rule_pos_t"}." ";
}
chop($rule_s);
chop($rule_t);
chop($rule_alignment);
my $rule_alignment_inv = "";
foreach (split(/ /,$rule_alignment)) {
/^(\d+)\-(\d+)$/;
$rule_alignment_inv .= "$2-$1 ";
}
chop($rule_alignment_inv);
# frame
my $frame = "";
$frame = $FRAME_INPUT{-1} if defined $FRAME_INPUT{-1};
my $currently_included = 0;
my $start_t = -1;
push @TARGET_BITMAP,0; # indicate end
for(my $t=0;$t<=scalar(@TARGET);$t++) {
# beginning of tm target inclusion
if (!$currently_included && $TARGET_BITMAP[$t]) {
$start_t = $t;
$currently_included = 1;
}
# end of tm target inclusion (not included word or inserted input)
elsif ($currently_included &&
(!$TARGET_BITMAP[$t] || defined($FRAME_INPUT{$t}))) {
# add xml (unless change is at the beginning of the sentence
if ($start_t >= 0) {
my $target = "";
print STDERR "for(tt=$start_t;tt<$t+$TARGET_BITMAP[$t]);\n";
for(my $tt=$start_t;$tt<$t+$TARGET_BITMAP[$t];$tt++) {
$target .= $TARGET[$tt] . " ";
}
chop($target);
$frame .= "<xml translation=\"$target\"> x </xml> ";
}
$currently_included = 0;
}
$frame .= $FRAME_INPUT{$t} if defined $FRAME_INPUT{$t};
print STDERR "$TARGET_BITMAP[$t] $t ($start_t) $currently_included\n";
}
print STDERR $frame."\n-------------------------------------\n";
return ($frame,$rule_s,$rule_t,$rule_alignment,$rule_alignment_inv);
}
# Parse a word-alignment string ("s-t s-t ...") into two lookup tables.
# Returns a hash with keys 's' and 't': under 's', index by source position
# to get a hash whose keys are the aligned target positions (and vice versa
# under 't').  Values count how often a link appeared.
sub create_alignment {
  my ($alignment_string) = @_;
  my @source_to_target;
  my @target_to_source;
  for my $pair (split / /, $alignment_string) {
    my ($src_pos, $tgt_pos) = split /\-/, $pair;
    $source_to_target[$src_pos]{$tgt_pos}++;
    $target_to_source[$tgt_pos]{$src_pos}++;
  }
  return ( 's' => \@source_to_target, 't' => \@target_to_source );
}

View File

@ -0,0 +1,300 @@
#!/usr/bin/perl -w -d
use strict;
use FindBin qw($RealBin);
use File::Basename;

# Debugger-enabled variant of the fuzzy-match driver: match input against a
# translation memory, write XML frames and extract a hierarchical phrase
# table via train-model.perl.
# NOTE(review): shebang runs under the Perl debugger (-d) — presumably a
# development leftover; confirm before using in a pipeline.
#
# usage: <script> <input> <tm-source> <tm-target> <tm-alignment> <lex> <pt-out>

my $DEBUG = 1;
my $OUTPUT_RULES = 1;

#my $data_root = "/Users/hieuhoang/workspace/experiment/data/tm-mt-integration/";
my $in_file = $ARGV[0]; #"$data_root/in/ac-test.input.tc.4";
my $source_file = $ARGV[1]; #"$data_root/in/acquis.truecased.4.en.uniq";
my $target_file = $ARGV[2]; #"$data_root/in/acquis.truecased.4.fr.uniq";
my $alignment_file = $ARGV[3]; #"$data_root/in/acquis.truecased.4.align.uniq";
my $lex_file = $ARGV[4]; #$data_root/in/lex.4;
my $pt_file = $ARGV[5]; #"$data_root/out/pt";

my $cmd;

my $TMPDIR= "/tmp/tmp.$$";
$cmd = "mkdir -p $TMPDIR";
`$cmd`;
# BUG(review): this override pointed at a developer-specific directory and
# defeated the freshly created $TMPDIR above; disabled.
#$TMPDIR = "/Users/hieuhoang/workspace/experiment/data/tm-mt-integration/out/tmp.3196";

my $match_file = "$TMPDIR/match";

# suffix array creation and extraction
$cmd = "$RealBin/fuzzy-match --multiple $in_file $source_file > $match_file";
`$cmd`;

# make into xml and pt
my $out_file = "$TMPDIR/ac-test.input.xml.4.uniq.multi.tuning";

# these loads were missing: @INPUT/@ALL_* are referenced in the loop below,
# so the script died under "use strict" without them
my @INPUT = `cat $in_file`; chop(@INPUT);
my @ALL_SOURCE = `cat $source_file`; chop(@ALL_SOURCE);
my @ALL_TARGET = `cat $target_file`; chop(@ALL_TARGET);
my @ALL_ALIGNMENT = `cat $alignment_file`; chop(@ALL_ALIGNMENT);

# fail loudly on unreadable/unwritable files instead of producing empty output
open(MATCH,$match_file) or die "Cannot open $match_file: $!";
open(FRAME,">$out_file") or die "Cannot open $out_file: $!";
open(RULE,">$out_file.extract") or die "Cannot open $out_file.extract: $!" if $OUTPUT_RULES;
open(RULE_INV,">$out_file.extract.inv") or die "Cannot open $out_file.extract.inv: $!" if $OUTPUT_RULES;
open(INFO,">$out_file.info") or die "Cannot open $out_file.info: $!";
while( my $match = <MATCH> ) {
  chop($match);
  # each match line: "<input-id> <match-score> ||| <tm-sentence-id> ||| <edit-path>"
  my ($score,$sentence,$path) = split(/ \|\|\| /,$match);
  $score =~ /^(\d+) (.+)/ || die;
  my ($i,$match_score) = ($1,$2);

  # construct frame (skip sentinel/out-of-range sentence ids)
  if ($sentence < 1e9 && $sentence >= 0) {
    my $SOURCE = $ALL_SOURCE[$sentence];
    my @ALIGNMENT = split(/ \|\|\| /,$ALL_ALIGNMENT[$sentence]);
    my @TARGET = split(/ \|\|\| /,$ALL_TARGET[$sentence]);
    # one frame/rule per alternative target translation of the TM sentence
    for(my $j=0;$j<scalar(@TARGET);$j++) {
      $TARGET[$j] =~ /^(\d+) (.+)$/ || die;
      my ($target_count,$target) = ($1,$2);
      my ($frame,$rule_s,$rule_t,$rule_alignment,$rule_alignment_inv) =
        &create_xml($SOURCE,
                    $INPUT[$i],
                    $target,
                    $ALIGNMENT[$j],
                    $path);
      print FRAME $frame."\n";
      print RULE "$rule_s [X] ||| $rule_t [X] ||| $rule_alignment ||| $target_count\n" if $OUTPUT_RULES;
      print RULE_INV "$rule_t [X] ||| $rule_s [X] ||| $rule_alignment_inv ||| $target_count\n" if $OUTPUT_RULES;
      print INFO "$i ||| $match_score ||| $target_count\n";
    }
  }
}
close(FRAME);
close(MATCH);
close(RULE) if $OUTPUT_RULES;
close(RULE_INV) if $OUTPUT_RULES;
close(INFO);  # was missing

`LC_ALL=C sort $out_file.extract | gzip -c > $out_file.extract.sorted.gz`;
`LC_ALL=C sort $out_file.extract.inv | gzip -c > $out_file.extract.inv.sorted.gz`;

if ($OUTPUT_RULES)
{
  $cmd = "$RealBin/../../scripts/training/train-model.perl -dont-zip -first-step 6 -last-step 6 -f en -e fr -hierarchical -extract-file $out_file.extract -lexical-file $lex_file -phrase-translation-table $pt_file";
  print STDERR "Executing: $cmd \n";
  `$cmd`;
}

#$cmd = "rm -rf $TMPDIR";
#`$cmd`;
#######################################################
# Build an XML frame plus a hierarchical rule from one fuzzy match.
# Arguments:
#   $source    - TM source sentence (space-separated words)
#   $input     - input sentence to be translated
#   $target    - one TM target translation of $source
#   $alignment - source-target word alignment ("s-t s-t ...")
#   $path      - edit path from the sed alignment of input vs. TM source,
#                a string over M(atch)/S(ubst)/I(nsert)/D(elete)
# Returns ($frame,$rule_s,$rule_t,$rule_alignment,$rule_alignment_inv):
#   $frame          - TM target with mismatched regions replaced by input
#                     words, matched regions wrapped in <xml translation="...">
#   $rule_s/$rule_t - source/target side of a hierarchical rule, with
#                     [X][X] non-terminals at the mismatch positions
#   $rule_alignment / $rule_alignment_inv - rule-internal word alignment
#                     and its inverse
# NOTE(review): all prints below are debug traces on STDERR.
sub create_xml {
my ($source,$input,$target,$alignment,$path) = @_;
my @INPUT = split(/ /,$input);
my @SOURCE = split(/ /,$source);
my @TARGET = split(/ /,$target);
my %ALIGN = &create_alignment($alignment);
# input words to be inserted into the frame, keyed by preceding target position
my %FRAME_INPUT;
# @NT: one record per non-terminal; bitmaps mark which input/target words are kept
my (@NT,@INPUT_BITMAP,@TARGET_BITMAP,%ALIGNMENT_I_TO_S);
foreach (@TARGET) { push @TARGET_BITMAP,1 }
### STEP 1: FIND MISMATCHES
# walk the edit path, tracking positions in TM source ($s) and input ($i)
my ($s,$i) = (0,0);
my $currently_matching = 0;
my ($start_s,$start_i) = (0,0);
$path .= "X"; # indicate end
print STDERR "$input\n$source\n$target\n$path\n";
for(my $p=0;$p<length($path);$p++) {
my $action = substr($path,$p,1);
# beginning of a mismatch
if ($currently_matching && $action ne "M" && $action ne "X") {
$start_i = $i;
$start_s = $s;
$currently_matching = 0;
}
# end of a mismatch
elsif (!$currently_matching &&
($action eq "M" || $action eq "X")) {
# remove use of affected target words
for(my $ss = $start_s; $ss<$s; $ss++) {
foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) {
$TARGET_BITMAP[$tt] = 0;
}
# also remove enclosed unaligned words?
}
# are there input words that need to be inserted ?
print STDERR "($start_i<$i)?\n";
if ($start_i<$i) {
# take note of input words to be inserted
my $insertion = "";
for(my $ii = $start_i; $ii<$i; $ii++) {
$insertion .= $INPUT[$ii]." ";
}
# find position for inserted input words
# find first removed target word
# 1000 acts as a sentinel for "no aligned target word found"
my $start_t = 1000;
for(my $ss = $start_s; $ss<$s; $ss++) {
foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) {
$start_t = $tt if $tt < $start_t;
}
}
# end of sentence? add to end
if ($start_t == 1000 && $i > $#INPUT) {
$start_t = $#TARGET;
}
# backtrack to previous words if unaligned
if ($start_t == 1000) {
$start_t = -1;
for(my $ss = $s-1; $start_t==-1 && $ss>=0; $ss--) {
foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) {
$start_t = $tt if $tt > $start_t;
}
}
}
$FRAME_INPUT{$start_t} .= $insertion;
my %NT = ("start_t" => $start_t,
"start_i" => $start_i );
push @NT,\%NT;
}
$currently_matching = 1;
}
print STDERR "$action $s $i ($start_s $start_i) $currently_matching";
if ($action ne "I") {
print STDERR " ->";
foreach my $tt (keys %{${$ALIGN{'s'}}[$s]}) {
print STDERR " ".$tt;
}
}
print STDERR "\n";
# advance positions: insertions consume no source word, deletions no input word
$s++ unless $action eq "I";
$i++ unless $action eq "D";
$ALIGNMENT_I_TO_S{$i} = $s unless $action eq "D";
push @INPUT_BITMAP, 1 if $action eq "M";
push @INPUT_BITMAP, 0 if $action eq "I" || $action eq "S";
}
print STDERR $target."\n";
foreach (@TARGET_BITMAP) { print STDERR $_; } print STDERR "\n";
foreach (sort keys %FRAME_INPUT) {
print STDERR "$_: $FRAME_INPUT{$_}\n";
}
### STEP 2: BUILD RULE AND FRAME
# hierarchical rule
# source side: matched input words plus [X][X] at each non-terminal position
my $rule_s = "";
my $rule_pos_s = 0;
my %RULE_ALIGNMENT_S;
for(my $i=0;$i<scalar(@INPUT_BITMAP);$i++) {
if ($INPUT_BITMAP[$i]) {
$rule_s .= $INPUT[$i]." ";
$RULE_ALIGNMENT_S{$ALIGNMENT_I_TO_S{$i}} = $rule_pos_s++;
}
foreach my $NT (@NT) {
if ($i == $$NT{"start_i"}) {
$rule_s .= "[X][X] ";
$$NT{"rule_pos_s"} = $rule_pos_s++;
}
}
}
# target side: kept TM target words plus [X][X] (loop starts at -1 so a
# non-terminal anchored before the first word is also emitted)
my $rule_t = "";
my $rule_pos_t = 0;
my %RULE_ALIGNMENT_T;
for(my $t=-1;$t<scalar(@TARGET_BITMAP);$t++) {
if ($t>=0 && $TARGET_BITMAP[$t]) {
$rule_t .= $TARGET[$t]." ";
$RULE_ALIGNMENT_T{$t} = $rule_pos_t++;
}
foreach my $NT (@NT) {
if ($t == $$NT{"start_t"}) {
$rule_t .= "[X][X] ";
$$NT{"rule_pos_t"} = $rule_pos_t++;
}
}
}
# rule-internal alignment: word-word links first, then NT-NT links
my $rule_alignment = "";
foreach my $s (sort { $a <=> $b} keys %RULE_ALIGNMENT_S) {
foreach my $t (keys %{$ALIGN{"s"}[$s]}) {
next unless defined($RULE_ALIGNMENT_T{$t});
$rule_alignment .= $RULE_ALIGNMENT_S{$s}."-".$RULE_ALIGNMENT_T{$t}." ";
}
}
foreach my $NT (@NT) {
$rule_alignment .= $$NT{"rule_pos_s"}."-".$$NT{"rule_pos_t"}." ";
}
chop($rule_s);
chop($rule_t);
chop($rule_alignment);
my $rule_alignment_inv = "";
foreach (split(/ /,$rule_alignment)) {
/^(\d+)\-(\d+)$/;
$rule_alignment_inv .= "$2-$1 ";
}
chop($rule_alignment_inv);
# frame
my $frame = "";
$frame = $FRAME_INPUT{-1} if defined $FRAME_INPUT{-1};
my $currently_included = 0;
my $start_t = -1;
push @TARGET_BITMAP,0; # indicate end
for(my $t=0;$t<=scalar(@TARGET);$t++) {
# beginning of tm target inclusion
if (!$currently_included && $TARGET_BITMAP[$t]) {
$start_t = $t;
$currently_included = 1;
}
# end of tm target inclusion (not included word or inserted input)
elsif ($currently_included &&
(!$TARGET_BITMAP[$t] || defined($FRAME_INPUT{$t}))) {
# add xml (unless change is at the beginning of the sentence
if ($start_t >= 0) {
my $target = "";
print STDERR "for(tt=$start_t;tt<$t+$TARGET_BITMAP[$t]);\n";
for(my $tt=$start_t;$tt<$t+$TARGET_BITMAP[$t];$tt++) {
$target .= $TARGET[$tt] . " ";
}
chop($target);
$frame .= "<xml translation=\"$target\"> x </xml> ";
}
$currently_included = 0;
}
$frame .= $FRAME_INPUT{$t} if defined $FRAME_INPUT{$t};
print STDERR "$TARGET_BITMAP[$t] $t ($start_t) $currently_included\n";
}
print STDERR $frame."\n-------------------------------------\n";
return ($frame,$rule_s,$rule_t,$rule_alignment,$rule_alignment_inv);
}
# Parse a word-alignment string ("s-t s-t ...") into two lookup tables.
# Returns a hash with keys 's' and 't': under 's', index by source position
# to get a hash whose keys are the aligned target positions (and vice versa
# under 't').  Values count how often a link appeared.
sub create_alignment {
  my ($alignment_string) = @_;
  my @source_to_target;
  my @target_to_source;
  for my $pair (split / /, $alignment_string) {
    my ($src_pos, $tgt_pos) = split /\-/, $pair;
    $source_to_target[$src_pos]{$tgt_pos}++;
    $target_to_source[$tgt_pos]{$src_pos}++;
  }
  return ( 's' => \@source_to_target, 't' => \@target_to_source );
}

View File

@ -0,0 +1,288 @@
#!/usr/bin/perl -w
use strict;

# Turn precomputed BEST fuzzy matches into XML frames and (optionally) a
# hierarchical phrase table via train-model.perl.
# NOTE(review): all paths below are hard-coded to a developer machine;
# parameterize before reuse.

my $DEBUG = 1;
my $OUTPUT_RULES = 1;

my $scripts_root_dir = "/Users/hieuhoang/workspace/github/hieuhoang/scripts";
my $data_root = "/Users/hieuhoang/workspace/experiment/data/tm-mt-integration/";

#my $match_file = "$data_root/in/BEST.acquis-xml-escaped.4.uniq.multi.tuning";
my $match_file = "$data_root/out/BEST";
my $source_file = "$data_root/in/acquis.truecased.4.en.uniq";
my $target_file = "$data_root/in/acquis.truecased.4.fr.uniq";
my $alignment_file = "$data_root/in/acquis.truecased.4.align.uniq";
my $out_file = "$data_root/out/ac-test.input.xml.4.uniq.multi.tuning";
my $in_file = "$data_root/in/ac-test.input.tc.4";

#my $match_file = "tm/BEST.acquis-xml-escaped.4.uniq.multi";
#my $source_file = "data/acquis.truecased.4.en.uniq";
#my $target_file = "data/acquis.truecased.4.fr.uniq";
#my $alignment_file = "data/acquis.truecased.4.align.uniq";
#my $out_file = "data/ac-test.input.xml.4.uniq.multi.xxx";
#my $in_file = "evaluation/ac-test.input.tc.4";

# slurp the corpora into memory; array indices correspond to sentence ids
my @INPUT = `cat $in_file`; chop(@INPUT);
my @ALL_SOURCE = `cat $source_file`; chop(@ALL_SOURCE);
my @ALL_TARGET = `cat $target_file`; chop(@ALL_TARGET);
my @ALL_ALIGNMENT = `cat $alignment_file`; chop(@ALL_ALIGNMENT);

# fail loudly on unreadable/unwritable files instead of producing empty output
open(MATCH,$match_file) or die "Cannot open $match_file: $!";
open(FRAME,">$out_file") or die "Cannot open $out_file: $!";
open(RULE,">$out_file.extract") or die "Cannot open $out_file.extract: $!" if $OUTPUT_RULES;
open(RULE_INV,">$out_file.extract.inv") or die "Cannot open $out_file.extract.inv: $!" if $OUTPUT_RULES;
open(INFO,">$out_file.info") or die "Cannot open $out_file.info: $!";
while( my $match = <MATCH> ) {
  chop($match);
  # each match line: "<input-id> <match-score> ||| <tm-sentence-id> ||| <edit-path>"
  my ($score,$sentence,$path) = split(/ \|\|\| /,$match);
  $score =~ /^(\d+) (.+)/ || die;
  my ($i,$match_score) = ($1,$2);

  # construct frame (skip sentinel/out-of-range sentence ids)
  if ($sentence < 1e9 && $sentence >= 0) {
    my $SOURCE = $ALL_SOURCE[$sentence];
    my @ALIGNMENT = split(/ \|\|\| /,$ALL_ALIGNMENT[$sentence]);
    my @TARGET = split(/ \|\|\| /,$ALL_TARGET[$sentence]);
    # one frame/rule per alternative target translation of the TM sentence
    for(my $j=0;$j<scalar(@TARGET);$j++) {
      $TARGET[$j] =~ /^(\d+) (.+)$/ || die;
      my ($target_count,$target) = ($1,$2);
      my ($frame,$rule_s,$rule_t,$rule_alignment,$rule_alignment_inv) =
        &create_xml($SOURCE,
                    $INPUT[$i],
                    $target,
                    $ALIGNMENT[$j],
                    $path);
      print FRAME $frame."\n";
      print RULE "$rule_s [X] ||| $rule_t [X] ||| $rule_alignment ||| $target_count\n" if $OUTPUT_RULES;
      print RULE_INV "$rule_t [X] ||| $rule_s [X] ||| $rule_alignment_inv ||| $target_count\n" if $OUTPUT_RULES;
      print INFO "$i ||| $match_score ||| $target_count\n";
    }
  }
}
close(FRAME);
close(MATCH);
close(RULE) if $OUTPUT_RULES;
close(RULE_INV) if $OUTPUT_RULES;
close(INFO);  # was missing

`LC_ALL=C sort $out_file.extract | gzip -c > $out_file.extract.sorted.gz`;
`LC_ALL=C sort $out_file.extract.inv | gzip -c > $out_file.extract.inv.sorted.gz`;

`$scripts_root_dir/training/train-model.perl -dont-zip -first-step 6 -last-step 6 -f en -e fr -hierarchical -extract-file $out_file.extract -lexical-file $data_root/in/lex.4 -phrase-translation-table $out_file.phrase-table` if $OUTPUT_RULES;
# Build an XML frame plus a hierarchical rule from one fuzzy match.
# Arguments:
#   $source    - TM source sentence (space-separated words)
#   $input     - input sentence to be translated
#   $target    - one TM target translation of $source
#   $alignment - source-target word alignment ("s-t s-t ...")
#   $path      - edit path from the sed alignment of input vs. TM source,
#                a string over M(atch)/S(ubst)/I(nsert)/D(elete)
# Returns ($frame,$rule_s,$rule_t,$rule_alignment,$rule_alignment_inv):
#   $frame          - TM target with mismatched regions replaced by input
#                     words, matched regions wrapped in <xml translation="...">
#   $rule_s/$rule_t - source/target side of a hierarchical rule, with
#                     [X][X] non-terminals at the mismatch positions
#   $rule_alignment / $rule_alignment_inv - rule-internal word alignment
#                     and its inverse
# NOTE(review): this variant prints its debug traces to STDOUT, unlike the
# STDERR-printing sibling implementations.
sub create_xml {
my ($source,$input,$target,$alignment,$path) = @_;
my @INPUT = split(/ /,$input);
my @SOURCE = split(/ /,$source);
my @TARGET = split(/ /,$target);
my %ALIGN = &create_alignment($alignment);
# input words to be inserted into the frame, keyed by preceding target position
my %FRAME_INPUT;
# @NT: one record per non-terminal; bitmaps mark which input/target words are kept
my (@NT,@INPUT_BITMAP,@TARGET_BITMAP,%ALIGNMENT_I_TO_S);
foreach (@TARGET) { push @TARGET_BITMAP,1 }
### STEP 1: FIND MISMATCHES
# walk the edit path, tracking positions in TM source ($s) and input ($i)
my ($s,$i) = (0,0);
my $currently_matching = 0;
my ($start_s,$start_i) = (0,0);
$path .= "X"; # indicate end
print "$input\n$source\n$target\n$path\n";
for(my $p=0;$p<length($path);$p++) {
my $action = substr($path,$p,1);
# beginning of a mismatch
if ($currently_matching && $action ne "M" && $action ne "X") {
$start_i = $i;
$start_s = $s;
$currently_matching = 0;
}
# end of a mismatch
elsif (!$currently_matching &&
($action eq "M" || $action eq "X")) {
# remove use of affected target words
for(my $ss = $start_s; $ss<$s; $ss++) {
foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) {
$TARGET_BITMAP[$tt] = 0;
}
# also remove enclosed unaligned words?
}
# are there input words that need to be inserted ?
print "($start_i<$i)?\n";
if ($start_i<$i) {
# take note of input words to be inserted
my $insertion = "";
for(my $ii = $start_i; $ii<$i; $ii++) {
$insertion .= $INPUT[$ii]." ";
}
# find position for inserted input words
# find first removed target word
# 1000 acts as a sentinel for "no aligned target word found"
my $start_t = 1000;
for(my $ss = $start_s; $ss<$s; $ss++) {
foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) {
$start_t = $tt if $tt < $start_t;
}
}
# end of sentence? add to end
if ($start_t == 1000 && $i > $#INPUT) {
$start_t = $#TARGET;
}
# backtrack to previous words if unaligned
if ($start_t == 1000) {
$start_t = -1;
for(my $ss = $s-1; $start_t==-1 && $ss>=0; $ss--) {
foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) {
$start_t = $tt if $tt > $start_t;
}
}
}
$FRAME_INPUT{$start_t} .= $insertion;
my %NT = ("start_t" => $start_t,
"start_i" => $start_i );
push @NT,\%NT;
}
$currently_matching = 1;
}
print "$action $s $i ($start_s $start_i) $currently_matching";
if ($action ne "I") {
print " ->";
foreach my $tt (keys %{${$ALIGN{'s'}}[$s]}) {
print " ".$tt;
}
}
print "\n";
# advance positions: insertions consume no source word, deletions no input word
$s++ unless $action eq "I";
$i++ unless $action eq "D";
$ALIGNMENT_I_TO_S{$i} = $s unless $action eq "D";
push @INPUT_BITMAP, 1 if $action eq "M";
push @INPUT_BITMAP, 0 if $action eq "I" || $action eq "S";
}
print $target."\n";
foreach (@TARGET_BITMAP) { print $_; } print "\n";
foreach (sort keys %FRAME_INPUT) {
print "$_: $FRAME_INPUT{$_}\n";
}
### STEP 2: BUILD RULE AND FRAME
# hierarchical rule
# source side: matched input words plus [X][X] at each non-terminal position
my $rule_s = "";
my $rule_pos_s = 0;
my %RULE_ALIGNMENT_S;
for(my $i=0;$i<scalar(@INPUT_BITMAP);$i++) {
if ($INPUT_BITMAP[$i]) {
$rule_s .= $INPUT[$i]." ";
$RULE_ALIGNMENT_S{$ALIGNMENT_I_TO_S{$i}} = $rule_pos_s++;
}
foreach my $NT (@NT) {
if ($i == $$NT{"start_i"}) {
$rule_s .= "[X][X] ";
$$NT{"rule_pos_s"} = $rule_pos_s++;
}
}
}
# target side: kept TM target words plus [X][X] (loop starts at -1 so a
# non-terminal anchored before the first word is also emitted)
my $rule_t = "";
my $rule_pos_t = 0;
my %RULE_ALIGNMENT_T;
for(my $t=-1;$t<scalar(@TARGET_BITMAP);$t++) {
if ($t>=0 && $TARGET_BITMAP[$t]) {
$rule_t .= $TARGET[$t]." ";
$RULE_ALIGNMENT_T{$t} = $rule_pos_t++;
}
foreach my $NT (@NT) {
if ($t == $$NT{"start_t"}) {
$rule_t .= "[X][X] ";
$$NT{"rule_pos_t"} = $rule_pos_t++;
}
}
}
# rule-internal alignment: word-word links first, then NT-NT links
my $rule_alignment = "";
foreach my $s (sort { $a <=> $b} keys %RULE_ALIGNMENT_S) {
foreach my $t (keys %{$ALIGN{"s"}[$s]}) {
next unless defined($RULE_ALIGNMENT_T{$t});
$rule_alignment .= $RULE_ALIGNMENT_S{$s}."-".$RULE_ALIGNMENT_T{$t}." ";
}
}
foreach my $NT (@NT) {
$rule_alignment .= $$NT{"rule_pos_s"}."-".$$NT{"rule_pos_t"}." ";
}
chop($rule_s);
chop($rule_t);
chop($rule_alignment);
my $rule_alignment_inv = "";
foreach (split(/ /,$rule_alignment)) {
/^(\d+)\-(\d+)$/;
$rule_alignment_inv .= "$2-$1 ";
}
chop($rule_alignment_inv);
# frame
my $frame = "";
$frame = $FRAME_INPUT{-1} if defined $FRAME_INPUT{-1};
my $currently_included = 0;
my $start_t = -1;
push @TARGET_BITMAP,0; # indicate end
for(my $t=0;$t<=scalar(@TARGET);$t++) {
# beginning of tm target inclusion
if (!$currently_included && $TARGET_BITMAP[$t]) {
$start_t = $t;
$currently_included = 1;
}
# end of tm target inclusion (not included word or inserted input)
elsif ($currently_included &&
(!$TARGET_BITMAP[$t] || defined($FRAME_INPUT{$t}))) {
# add xml (unless change is at the beginning of the sentence
if ($start_t >= 0) {
my $target = "";
print "for(tt=$start_t;tt<$t+$TARGET_BITMAP[$t]);\n";
for(my $tt=$start_t;$tt<$t+$TARGET_BITMAP[$t];$tt++) {
$target .= $TARGET[$tt] . " ";
}
chop($target);
$frame .= "<xml translation=\"$target\"> x </xml> ";
}
$currently_included = 0;
}
$frame .= $FRAME_INPUT{$t} if defined $FRAME_INPUT{$t};
print "$TARGET_BITMAP[$t] $t ($start_t) $currently_included\n";
}
print $frame."\n-------------------------------------\n";
return ($frame,$rule_s,$rule_t,$rule_alignment,$rule_alignment_inv);
}
# Parse a word-alignment string ("s-t s-t ...") into two lookup tables.
# Returns a hash with keys 's' and 't': under 's', index by source position
# to get a hash whose keys are the aligned target positions (and vice versa
# under 't').  Values count how often a link appeared.
sub create_alignment {
  my ($alignment_string) = @_;
  my @source_to_target;
  my @target_to_source;
  for my $pair (split / /, $alignment_string) {
    my ($src_pos, $tgt_pos) = split /\-/, $pair;
    $source_to_target[$src_pos]{$tgt_pos}++;
    $target_to_source[$tgt_pos]{$src_pos}++;
  }
  return ( 's' => \@source_to_target, 't' => \@target_to_source );
}

View File

@ -0,0 +1,27 @@
#include "SuffixArray.h"
using namespace std;
int main(int argc, char* argv[])
{
SuffixArray suffixArray( "/home/pkoehn/syntax/grammars/wmt09-de-en/corpus.1k.de" );
//suffixArray.List(10,20);
vector< string > der;
der.push_back("der");
vector< string > inDer;
inDer.push_back("in");
inDer.push_back("der");
vector< string > zzz;
zzz.push_back("zzz");
vector< string > derDer;
derDer.push_back("der");
derDer.push_back("der");
cout << "count of 'der' " << suffixArray.Count( der ) << endl;
cout << "limited count of 'der' " << suffixArray.MinCount( der, 2 ) << endl;
cout << "count of 'in der' " << suffixArray.Count( inDer ) << endl;
cout << "count of 'der der' " << suffixArray.Count( derDer ) << endl;
cout << "limited count of 'der der' " << suffixArray.MinCount( derDer, 1 ) << endl;
// cout << "count of 'zzz' " << suffixArray.Count( zzz ) << endl;
// cout << "limited count of 'zzz' " << suffixArray.LimitedCount( zzz, 1 ) << endl;
}

View File

@ -1 +0,0 @@
/usr/share/automake-1.9/INSTALL

View File

@ -43,6 +43,8 @@
<OutDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(SolutionDir)$(Configuration)\</OutDir>
<IntDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(Configuration)\</IntDir>
<LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</LinkIncremental>
<IncludePath Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">C:\Program Files\boost\boost_1_47;$(IncludePath)</IncludePath>
<IncludePath Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">C:\Program Files\boost\boost_1_47;$(IncludePath)</IncludePath>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile>
@ -58,10 +60,11 @@
<AdditionalIncludeDirectories>C:\boost\boost_1_47;$(SolutionDir)/../../moses/src;$(SolutionDir)/../..;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<AdditionalDependencies>zdll.lib;$(SolutionDir)/$(Configuration)/moses.lib;$(SolutionDir)/$(Configuration)/kenlm.lib;$(SolutionDir)/$(Configuration)/OnDiskPt.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>C:\GnuWin32\lib\zlib.lib;$(SolutionDir)/$(Configuration)/moses.lib;$(SolutionDir)/$(Configuration)/kenlm.lib;$(SolutionDir)/$(Configuration)/OnDiskPt.lib;%(AdditionalDependencies)</AdditionalDependencies>
<GenerateDebugInformation>true</GenerateDebugInformation>
<SubSystem>Console</SubSystem>
<TargetMachine>MachineX86</TargetMachine>
<AdditionalLibraryDirectories>C:\boost\boost_1_47\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
@ -69,7 +72,7 @@
<Optimization>MaxSpeed</Optimization>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WITH_THREADS;NO_PIPES;WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
<RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
<FunctionLevelLinking>true</FunctionLevelLinking>
<PrecompiledHeader>
</PrecompiledHeader>
@ -78,12 +81,13 @@
<AdditionalIncludeDirectories>C:\boost\boost_1_47;$(SolutionDir)/../../moses/src;$(SolutionDir)/../..;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<AdditionalDependencies>zdll.lib;$(SolutionDir)/$(Configuration)/moses.lib;$(SolutionDir)/$(Configuration)/kenlm.lib;$(SolutionDir)/$(Configuration)/OnDiskPt.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>C:\GnuWin32\lib\zlib.lib;$(SolutionDir)/$(Configuration)/moses.lib;$(SolutionDir)/$(Configuration)/kenlm.lib;$(SolutionDir)/$(Configuration)/OnDiskPt.lib;%(AdditionalDependencies)</AdditionalDependencies>
<GenerateDebugInformation>true</GenerateDebugInformation>
<SubSystem>Console</SubSystem>
<OptimizeReferences>true</OptimizeReferences>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<TargetMachine>MachineX86</TargetMachine>
<AdditionalLibraryDirectories>C:\boost\boost_1_47\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
</Link>
</ItemDefinitionGroup>
<ItemGroup>

View File

@ -69,7 +69,7 @@
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<PreprocessorDefinitions>WITH_THREADS;NO_PIPES;WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<MinimalRebuild>true</MinimalRebuild>
<BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
<RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
@ -84,7 +84,7 @@
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<PreprocessorDefinitions>WITH_THREADS;NO_PIPES;WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
<FunctionLevelLinking>true</FunctionLevelLinking>
<PrecompiledHeader>

View File

@ -41,9 +41,13 @@
<option id="gnu.cpp.compilermacosx.exe.debug.option.optimization.level.676959181" name="Optimization Level" superClass="gnu.cpp.compilermacosx.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
<option id="gnu.cpp.compiler.macosx.exe.debug.option.debugging.level.1484480101" name="Debug Level" superClass="gnu.cpp.compiler.macosx.exe.debug.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.include.paths.1556683035" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="/Users/hieuhoang/unison/workspace/github/moses-smt"/>
<listOptionValue builtIn="false" value="${workspace_loc}/../../"/>
<listOptionValue builtIn="false" value="${workspace_loc}/../../moses/src"/>
<listOptionValue builtIn="false" value="/opt/local/include"/>
</option>
<option id="gnu.cpp.compiler.option.preprocessor.def.1052680347" name="Defined symbols (-D)" superClass="gnu.cpp.compiler.option.preprocessor.def" valueType="definedSymbols">
<listOptionValue builtIn="false" value="TRACE_ENABLE"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1930757481" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.c.compiler.macosx.exe.debug.1161943634" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.macosx.exe.debug">
@ -128,4 +132,5 @@
<storageModule moduleId="refreshScope" versionNumber="1">
<resource resourceType="PROJECT" workspacePath="/OnDiskPt"/>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
</cproject>

View File

@ -0,0 +1,292 @@
// !$*UTF8*$!
{
archiveVersion = 1;
classes = {
};
objectVersion = 46;
objects = {
/* Begin PBXBuildFile section */
1E42EFB615BEFAEB00E937EB /* fuzzy-match2.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1E42EFA515BEFABD00E937EB /* fuzzy-match2.cpp */; };
1E42EFB715BEFAEB00E937EB /* SuffixArray.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1E806DCF15BED3D4001914A2 /* SuffixArray.cpp */; };
1E42EFB815BEFAEB00E937EB /* SuffixArray.h in Sources */ = {isa = PBXBuildFile; fileRef = 1E806DD015BED3D4001914A2 /* SuffixArray.h */; };
1E42EFB915BEFAEB00E937EB /* Vocabulary.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1E806DCA15BED3AC001914A2 /* Vocabulary.cpp */; };
1E42EFBA15BEFAEB00E937EB /* Vocabulary.h in Sources */ = {isa = PBXBuildFile; fileRef = 1E806DCB15BED3AC001914A2 /* Vocabulary.h */; };
1E806DCC15BED3AC001914A2 /* Vocabulary.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1E806DCA15BED3AC001914A2 /* Vocabulary.cpp */; };
1E806DD115BED3D4001914A2 /* SuffixArray.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1E806DCF15BED3D4001914A2 /* SuffixArray.cpp */; };
1ECD60A815C15E28004172A4 /* Util.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1ECD60A515C15D3A004172A4 /* Util.cpp */; };
/* End PBXBuildFile section */
/* Begin PBXCopyFilesBuildPhase section */
1E42EFAA15BEFAD300E937EB /* CopyFiles */ = {
isa = PBXCopyFilesBuildPhase;
buildActionMask = 2147483647;
dstPath = /usr/share/man/man1/;
dstSubfolderSpec = 0;
files = (
);
runOnlyForDeploymentPostprocessing = 1;
};
1ED87EEB15BED331003E47AA /* CopyFiles */ = {
isa = PBXCopyFilesBuildPhase;
buildActionMask = 2147483647;
dstPath = /usr/share/man/man1/;
dstSubfolderSpec = 0;
files = (
);
runOnlyForDeploymentPostprocessing = 1;
};
/* End PBXCopyFilesBuildPhase section */
/* Begin PBXFileReference section */
1E42EFA515BEFABD00E937EB /* fuzzy-match2.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = "fuzzy-match2.cpp"; path = "../tm-mt-integration/fuzzy-match2.cpp"; sourceTree = "<group>"; };
1E42EFAC15BEFAD300E937EB /* fuzzy-match2 */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = "fuzzy-match2"; sourceTree = BUILT_PRODUCTS_DIR; };
1E42EFD115C00AC100E937EB /* fuzzy-match2.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "fuzzy-match2.h"; path = "../tm-mt-integration/fuzzy-match2.h"; sourceTree = "<group>"; };
1E42EFD215C00BAE00E937EB /* Util.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = Util.h; path = "../tm-mt-integration/Util.h"; sourceTree = "<group>"; };
1E42EFD315C00C0A00E937EB /* SentenceAlignment.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = SentenceAlignment.h; path = "../tm-mt-integration/SentenceAlignment.h"; sourceTree = "<group>"; };
1E42EFD715C00D6300E937EB /* Match.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = Match.h; path = "../tm-mt-integration/Match.h"; sourceTree = "<group>"; };
1E806DCA15BED3AC001914A2 /* Vocabulary.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Vocabulary.cpp; path = "../tm-mt-integration/Vocabulary.cpp"; sourceTree = "<group>"; };
1E806DCB15BED3AC001914A2 /* Vocabulary.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = Vocabulary.h; path = "../tm-mt-integration/Vocabulary.h"; sourceTree = "<group>"; };
1E806DCF15BED3D4001914A2 /* SuffixArray.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = SuffixArray.cpp; path = "../tm-mt-integration/SuffixArray.cpp"; sourceTree = "<group>"; };
1E806DD015BED3D4001914A2 /* SuffixArray.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = SuffixArray.h; path = "../tm-mt-integration/SuffixArray.h"; sourceTree = "<group>"; };
1ECD60A515C15D3A004172A4 /* Util.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Util.cpp; path = "../tm-mt-integration/Util.cpp"; sourceTree = "<group>"; };
1ED87EED15BED331003E47AA /* fuzzy-match */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = "fuzzy-match"; sourceTree = BUILT_PRODUCTS_DIR; };
/* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */
1E42EFA915BEFAD300E937EB /* Frameworks */ = {
isa = PBXFrameworksBuildPhase;
buildActionMask = 2147483647;
files = (
);
runOnlyForDeploymentPostprocessing = 0;
};
1ED87EEA15BED331003E47AA /* Frameworks */ = {
isa = PBXFrameworksBuildPhase;
buildActionMask = 2147483647;
files = (
);
runOnlyForDeploymentPostprocessing = 0;
};
/* End PBXFrameworksBuildPhase section */
/* Begin PBXGroup section */
1ED87EE215BED32F003E47AA = {
isa = PBXGroup;
children = (
1E42EFD715C00D6300E937EB /* Match.h */,
1E42EFD315C00C0A00E937EB /* SentenceAlignment.h */,
1E42EFD215C00BAE00E937EB /* Util.h */,
1ECD60A515C15D3A004172A4 /* Util.cpp */,
1E806DCF15BED3D4001914A2 /* SuffixArray.cpp */,
1E806DD015BED3D4001914A2 /* SuffixArray.h */,
1E42EFD115C00AC100E937EB /* fuzzy-match2.h */,
1E42EFA515BEFABD00E937EB /* fuzzy-match2.cpp */,
1E806DCA15BED3AC001914A2 /* Vocabulary.cpp */,
1E806DCB15BED3AC001914A2 /* Vocabulary.h */,
1ED87EEE15BED331003E47AA /* Products */,
);
sourceTree = "<group>";
};
1ED87EEE15BED331003E47AA /* Products */ = {
isa = PBXGroup;
children = (
1ED87EED15BED331003E47AA /* fuzzy-match */,
1E42EFAC15BEFAD300E937EB /* fuzzy-match2 */,
);
name = Products;
sourceTree = "<group>";
};
/* End PBXGroup section */
/* Begin PBXNativeTarget section */
1E42EFAB15BEFAD300E937EB /* fuzzy-match2 */ = {
isa = PBXNativeTarget;
buildConfigurationList = 1E42EFB315BEFAD300E937EB /* Build configuration list for PBXNativeTarget "fuzzy-match2" */;
buildPhases = (
1E42EFA815BEFAD300E937EB /* Sources */,
1E42EFA915BEFAD300E937EB /* Frameworks */,
1E42EFAA15BEFAD300E937EB /* CopyFiles */,
);
buildRules = (
);
dependencies = (
);
name = "fuzzy-match2";
productName = "fuzzy-match2";
productReference = 1E42EFAC15BEFAD300E937EB /* fuzzy-match2 */;
productType = "com.apple.product-type.tool";
};
1ED87EEC15BED331003E47AA /* fuzzy-match */ = {
isa = PBXNativeTarget;
buildConfigurationList = 1ED87EF715BED331003E47AA /* Build configuration list for PBXNativeTarget "fuzzy-match" */;
buildPhases = (
1ED87EE915BED331003E47AA /* Sources */,
1ED87EEA15BED331003E47AA /* Frameworks */,
1ED87EEB15BED331003E47AA /* CopyFiles */,
);
buildRules = (
);
dependencies = (
);
name = "fuzzy-match";
productName = "fuzzy-match";
productReference = 1ED87EED15BED331003E47AA /* fuzzy-match */;
productType = "com.apple.product-type.tool";
};
/* End PBXNativeTarget section */
/* Begin PBXProject section */
1ED87EE415BED32F003E47AA /* Project object */ = {
isa = PBXProject;
buildConfigurationList = 1ED87EE715BED32F003E47AA /* Build configuration list for PBXProject "fuzzy-match" */;
compatibilityVersion = "Xcode 3.2";
developmentRegion = English;
hasScannedForEncodings = 0;
knownRegions = (
en,
);
mainGroup = 1ED87EE215BED32F003E47AA;
productRefGroup = 1ED87EEE15BED331003E47AA /* Products */;
projectDirPath = "";
projectRoot = "";
targets = (
1ED87EEC15BED331003E47AA /* fuzzy-match */,
1E42EFAB15BEFAD300E937EB /* fuzzy-match2 */,
);
};
/* End PBXProject section */
/* Begin PBXSourcesBuildPhase section */
1E42EFA815BEFAD300E937EB /* Sources */ = {
isa = PBXSourcesBuildPhase;
buildActionMask = 2147483647;
files = (
1ECD60A815C15E28004172A4 /* Util.cpp in Sources */,
1E42EFB615BEFAEB00E937EB /* fuzzy-match2.cpp in Sources */,
1E42EFB715BEFAEB00E937EB /* SuffixArray.cpp in Sources */,
1E42EFB815BEFAEB00E937EB /* SuffixArray.h in Sources */,
1E42EFB915BEFAEB00E937EB /* Vocabulary.cpp in Sources */,
1E42EFBA15BEFAEB00E937EB /* Vocabulary.h in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
1ED87EE915BED331003E47AA /* Sources */ = {
isa = PBXSourcesBuildPhase;
buildActionMask = 2147483647;
files = (
1E806DCC15BED3AC001914A2 /* Vocabulary.cpp in Sources */,
1E806DD115BED3D4001914A2 /* SuffixArray.cpp in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
/* End PBXSourcesBuildPhase section */
/* Begin XCBuildConfiguration section */
1E42EFB415BEFAD300E937EB /* Debug */ = {
isa = XCBuildConfiguration;
buildSettings = {
PRODUCT_NAME = "$(TARGET_NAME)";
};
name = Debug;
};
1E42EFB515BEFAD300E937EB /* Release */ = {
isa = XCBuildConfiguration;
buildSettings = {
PRODUCT_NAME = "$(TARGET_NAME)";
};
name = Release;
};
1ED87EF515BED331003E47AA /* Debug */ = {
isa = XCBuildConfiguration;
buildSettings = {
ALWAYS_SEARCH_USER_PATHS = NO;
ARCHS = "$(ARCHS_STANDARD_64_BIT)";
COPY_PHASE_STRIP = NO;
GCC_C_LANGUAGE_STANDARD = gnu99;
GCC_DYNAMIC_NO_PIC = NO;
GCC_ENABLE_OBJC_EXCEPTIONS = YES;
GCC_OPTIMIZATION_LEVEL = 0;
GCC_PREPROCESSOR_DEFINITIONS = (
"DEBUG=1",
"$(inherited)",
);
GCC_SYMBOLS_PRIVATE_EXTERN = NO;
GCC_VERSION = com.apple.compilers.llvm.clang.1_0;
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
GCC_WARN_ABOUT_MISSING_PROTOTYPES = YES;
GCC_WARN_ABOUT_RETURN_TYPE = YES;
GCC_WARN_UNUSED_VARIABLE = YES;
MACOSX_DEPLOYMENT_TARGET = 10.7;
ONLY_ACTIVE_ARCH = YES;
SDKROOT = macosx;
};
name = Debug;
};
1ED87EF615BED331003E47AA /* Release */ = {
isa = XCBuildConfiguration;
buildSettings = {
ALWAYS_SEARCH_USER_PATHS = NO;
ARCHS = "$(ARCHS_STANDARD_64_BIT)";
COPY_PHASE_STRIP = YES;
DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
GCC_C_LANGUAGE_STANDARD = gnu99;
GCC_ENABLE_OBJC_EXCEPTIONS = YES;
GCC_VERSION = com.apple.compilers.llvm.clang.1_0;
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
GCC_WARN_ABOUT_MISSING_PROTOTYPES = YES;
GCC_WARN_ABOUT_RETURN_TYPE = YES;
GCC_WARN_UNUSED_VARIABLE = YES;
MACOSX_DEPLOYMENT_TARGET = 10.7;
SDKROOT = macosx;
};
name = Release;
};
1ED87EF815BED331003E47AA /* Debug */ = {
isa = XCBuildConfiguration;
buildSettings = {
PRODUCT_NAME = "$(TARGET_NAME)";
};
name = Debug;
};
1ED87EF915BED331003E47AA /* Release */ = {
isa = XCBuildConfiguration;
buildSettings = {
PRODUCT_NAME = "$(TARGET_NAME)";
};
name = Release;
};
/* End XCBuildConfiguration section */
/* Begin XCConfigurationList section */
1E42EFB315BEFAD300E937EB /* Build configuration list for PBXNativeTarget "fuzzy-match2" */ = {
isa = XCConfigurationList;
buildConfigurations = (
1E42EFB415BEFAD300E937EB /* Debug */,
1E42EFB515BEFAD300E937EB /* Release */,
);
defaultConfigurationIsVisible = 0;
defaultConfigurationName = Release;
};
1ED87EE715BED32F003E47AA /* Build configuration list for PBXProject "fuzzy-match" */ = {
isa = XCConfigurationList;
buildConfigurations = (
1ED87EF515BED331003E47AA /* Debug */,
1ED87EF615BED331003E47AA /* Release */,
);
defaultConfigurationIsVisible = 0;
defaultConfigurationName = Release;
};
1ED87EF715BED331003E47AA /* Build configuration list for PBXNativeTarget "fuzzy-match" */ = {
isa = XCConfigurationList;
buildConfigurations = (
1ED87EF815BED331003E47AA /* Debug */,
1ED87EF915BED331003E47AA /* Release */,
);
defaultConfigurationIsVisible = 0;
defaultConfigurationName = Release;
};
/* End XCConfigurationList section */
};
rootObject = 1ED87EE415BED32F003E47AA /* Project object */;
}

View File

@ -0,0 +1,21 @@
<?xml version="1.0" encoding="UTF-8"?>
<Bucket
type = "1"
version = "1.0">
<FileBreakpoints>
<FileBreakpoint
shouldBeEnabled = "Yes"
ignoreCount = "0"
continueAfterRunningActions = "No"
isPathRelative = "0"
filePath = "/Users/hieuhoang/unison/workspace/github/hieuhoang/contrib/tm-mt-integration/fuzzy-match2.cpp"
timestampString = "364996019.762643"
startingColumnNumber = "9223372036854775807"
endingColumnNumber = "9223372036854775807"
startingLineNumber = "456"
endingLineNumber = "456"
landmarkName = "create_extract(int sentenceInd, int cost, const vector&lt; WORD_ID &gt; &amp;sourceSentence, const vector&lt;SentenceAlignment&gt; &amp;targets, const string &amp;inputStr, const string &amp;path)"
landmarkType = "7">
</FileBreakpoint>
</FileBreakpoints>
</Bucket>

View File

@ -0,0 +1,78 @@
<?xml version="1.0" encoding="UTF-8"?>
<Scheme
version = "1.3">
<BuildAction
parallelizeBuildables = "YES"
buildImplicitDependencies = "YES">
<BuildActionEntries>
<BuildActionEntry
buildForTesting = "YES"
buildForRunning = "YES"
buildForProfiling = "YES"
buildForArchiving = "YES"
buildForAnalyzing = "YES">
<BuildableReference
BuildableIdentifier = "primary"
BlueprintIdentifier = "1ED87EEC15BED331003E47AA"
BuildableName = "fuzzy-match"
BlueprintName = "fuzzy-match"
ReferencedContainer = "container:fuzzy-match.xcodeproj">
</BuildableReference>
</BuildActionEntry>
</BuildActionEntries>
</BuildAction>
<TestAction
selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.GDB"
selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.GDB"
shouldUseLaunchSchemeArgsEnv = "YES"
buildConfiguration = "Debug">
<Testables>
</Testables>
</TestAction>
<LaunchAction
selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.GDB"
selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.GDB"
launchStyle = "0"
useCustomWorkingDirectory = "NO"
buildConfiguration = "Debug">
<BuildableProductRunnable>
<BuildableReference
BuildableIdentifier = "primary"
BlueprintIdentifier = "1ED87EEC15BED331003E47AA"
BuildableName = "fuzzy-match"
BlueprintName = "fuzzy-match"
ReferencedContainer = "container:fuzzy-match.xcodeproj">
</BuildableReference>
</BuildableProductRunnable>
<CommandLineArguments>
<CommandLineArgument
argument = "--multiple /Users/hieuhoang/workspace/experiment/data/tm-mt-integration//in/ac-test.input.tc.4 /Users/hieuhoang/workspace/experiment/data/tm-mt-integration//in/acquis.truecased.4.en.uniq"
isEnabled = "YES">
</CommandLineArgument>
</CommandLineArguments>
<AdditionalOptions>
</AdditionalOptions>
</LaunchAction>
<ProfileAction
shouldUseLaunchSchemeArgsEnv = "YES"
savedToolIdentifier = ""
useCustomWorkingDirectory = "NO"
buildConfiguration = "Release">
<BuildableProductRunnable>
<BuildableReference
BuildableIdentifier = "primary"
BlueprintIdentifier = "1ED87EEC15BED331003E47AA"
BuildableName = "fuzzy-match"
BlueprintName = "fuzzy-match"
ReferencedContainer = "container:fuzzy-match.xcodeproj">
</BuildableReference>
</BuildableProductRunnable>
</ProfileAction>
<AnalyzeAction
buildConfiguration = "Debug">
</AnalyzeAction>
<ArchiveAction
buildConfiguration = "Release"
revealArchiveInOrganizer = "YES">
</ArchiveAction>
</Scheme>

View File

@ -0,0 +1,79 @@
<?xml version="1.0" encoding="UTF-8"?>
<Scheme
version = "1.3">
<BuildAction
parallelizeBuildables = "YES"
buildImplicitDependencies = "YES">
<BuildActionEntries>
<BuildActionEntry
buildForTesting = "YES"
buildForRunning = "YES"
buildForProfiling = "YES"
buildForArchiving = "YES"
buildForAnalyzing = "YES">
<BuildableReference
BuildableIdentifier = "primary"
BlueprintIdentifier = "1E42EFAB15BEFAD300E937EB"
BuildableName = "fuzzy-match2"
BlueprintName = "fuzzy-match2"
ReferencedContainer = "container:fuzzy-match.xcodeproj">
</BuildableReference>
</BuildActionEntry>
</BuildActionEntries>
</BuildAction>
<TestAction
selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.GDB"
selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.GDB"
shouldUseLaunchSchemeArgsEnv = "YES"
buildConfiguration = "Debug">
<Testables>
</Testables>
</TestAction>
<LaunchAction
selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.GDB"
selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.GDB"
launchStyle = "0"
useCustomWorkingDirectory = "YES"
customWorkingDirectory = "/Users/hieuhoang/unison/workspace/experiment/data/tm-mt-integration/in"
buildConfiguration = "Debug">
<BuildableProductRunnable>
<BuildableReference
BuildableIdentifier = "primary"
BlueprintIdentifier = "1E42EFAB15BEFAD300E937EB"
BuildableName = "fuzzy-match2"
BlueprintName = "fuzzy-match2"
ReferencedContainer = "container:fuzzy-match.xcodeproj">
</BuildableReference>
</BuildableProductRunnable>
<CommandLineArguments>
<CommandLineArgument
argument = "--multiple ac-test.input.tc.4 acquis.truecased.4.en.uniq acquis.truecased.4.fr.uniq acquis.truecased.4.align.uniq"
isEnabled = "YES">
</CommandLineArgument>
</CommandLineArguments>
<AdditionalOptions>
</AdditionalOptions>
</LaunchAction>
<ProfileAction
shouldUseLaunchSchemeArgsEnv = "YES"
savedToolIdentifier = ""
useCustomWorkingDirectory = "NO"
buildConfiguration = "Release">
<BuildableProductRunnable>
<BuildableReference
BuildableIdentifier = "primary"
BlueprintIdentifier = "1E42EFAB15BEFAD300E937EB"
BuildableName = "fuzzy-match2"
BlueprintName = "fuzzy-match2"
ReferencedContainer = "container:fuzzy-match.xcodeproj">
</BuildableReference>
</BuildableProductRunnable>
</ProfileAction>
<AnalyzeAction
buildConfiguration = "Debug">
</AnalyzeAction>
<ArchiveAction
buildConfiguration = "Release"
revealArchiveInOrganizer = "YES">
</ArchiveAction>
</Scheme>

View File

@ -0,0 +1,32 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>SchemeUserState</key>
<dict>
<key>fuzzy-match.xcscheme</key>
<dict>
<key>orderHint</key>
<integer>0</integer>
</dict>
<key>fuzzy-match2.xcscheme</key>
<dict>
<key>orderHint</key>
<integer>1</integer>
</dict>
</dict>
<key>SuppressBuildableAutocreation</key>
<dict>
<key>1E42EFAB15BEFAD300E937EB</key>
<dict>
<key>primary</key>
<true/>
</dict>
<key>1ED87EEC15BED331003E47AA</key>
<dict>
<key>primary</key>
<true/>
</dict>
</dict>
</dict>
</plist>

View File

@ -1,4 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|Win32">
@ -123,7 +123,12 @@
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<IncludePath>C:\Program Files\boost\boost_1_47;$(IncludePath)</IncludePath>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<IncludePath>C:\Program Files\boost\boost_1_47;$(IncludePath)</IncludePath>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile>
<PrecompiledHeader>
@ -131,7 +136,7 @@
<WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WITH_THREADS;NO_PIPES;WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(SolutionDir)\..\..\lm\msinttypes;C:\boost\boost_1_47;$(SolutionDir)/../..</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories>C:\boost\boost_1_47;$(SolutionDir)/../..</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
@ -147,7 +152,7 @@
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WITH_THREADS;NO_PIPES;WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(SolutionDir)\..\..\lm\msinttypes;C:\boost\boost_1_47;$(SolutionDir)/../..</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories>C:\boost\boost_1_47;$(SolutionDir)/../..</AdditionalIncludeDirectories>
<RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
</ClCompile>
<Link>

View File

@ -405,6 +405,9 @@
/* Begin PBXProject section */
1EE8C2E01476A48E002496F2 /* Project object */ = {
isa = PBXProject;
attributes = {
LastUpgradeCheck = 0420;
};
buildConfigurationList = 1EE8C2E31476A48E002496F2 /* Build configuration list for PBXProject "lm" */;
compatibilityVersion = "Xcode 3.2";
developmentRegion = English;
@ -539,6 +542,7 @@
isa = XCBuildConfiguration;
buildSettings = {
EXECUTABLE_PREFIX = lib;
GCC_PREPROCESSOR_DEFINITIONS = "KENLM_MAX_ORDER=7";
LIBRARY_SEARCH_PATHS = (
"$(inherited)",
"\"$(SRCROOT)/../../lm/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi\"",
@ -556,6 +560,7 @@
isa = XCBuildConfiguration;
buildSettings = {
EXECUTABLE_PREFIX = lib;
GCC_PREPROCESSOR_DEFINITIONS = "KENLM_MAX_ORDER=7";
LIBRARY_SEARCH_PATHS = (
"$(inherited)",
"\"$(SRCROOT)/../../lm/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi\"",

View File

@ -42,7 +42,11 @@
<option id="gnu.cpp.compiler.macosx.exe.debug.option.debugging.level.7139692" name="Debug Level" superClass="gnu.cpp.compiler.macosx.exe.debug.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.include.paths.1988092227" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="/opt/local/include"/>
<listOptionValue builtIn="false" value="/Users/hieuhoang/unison/workspace/github/moses-smt"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../&quot;"/>
</option>
<option id="gnu.cpp.compiler.option.preprocessor.def.1980966336" name="Defined symbols (-D)" superClass="gnu.cpp.compiler.option.preprocessor.def" valueType="definedSymbols">
<listOptionValue builtIn="false" value="KENLM_MAX_ORDER=7"/>
<listOptionValue builtIn="false" value="TRACE_ENABLE"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.20502600" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
@ -53,6 +57,9 @@
</tool>
</toolChain>
</folderInfo>
<sourceEntries>
<entry excluding="left_test.cc|model_test.cc" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/>
</sourceEntries>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
@ -122,4 +129,5 @@
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="refreshScope"/>
<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
</cproject>

View File

@ -326,6 +326,21 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/trie_sort.hh</locationURI>
</link>
<link>
<name>value.hh</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/value.hh</locationURI>
</link>
<link>
<name>value_build.cc</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/value_build.cc</locationURI>
</link>
<link>
<name>value_build.hh</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/value_build.hh</locationURI>
</link>
<link>
<name>virtual_interface.cc</name>
<type>1</type>

View File

@ -312,6 +312,7 @@
1E1D826815AC640800FE42E9 /* Release */,
);
defaultConfigurationIsVisible = 0;
defaultConfigurationName = Release;
};
1EB0AEFF1593A2180007E2A4 /* Build configuration list for PBXProject "mert" */ = {
isa = XCConfigurationList;

View File

@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<Workspace
version = "1.0">
<FileRef
location = "self:mert.xcodeproj">
</FileRef>
</Workspace>

View File

@ -0,0 +1,35 @@
<?xml version="1.0" encoding="UTF-8"?>
<Bucket
type = "1"
version = "1.0">
<FileBreakpoints>
<FileBreakpoint
shouldBeEnabled = "Yes"
ignoreCount = "0"
continueAfterRunningActions = "No"
isPathRelative = "0"
filePath = "/Users/hieuhoang/unison/workspace/github/hieuhoang/mert/mert.cpp"
timestampString = "363625029.073606"
startingColumnNumber = "9223372036854775807"
endingColumnNumber = "9223372036854775807"
startingLineNumber = "316"
endingLineNumber = "316"
landmarkName = "main(int argc, char **argv)"
landmarkType = "7">
</FileBreakpoint>
<FileBreakpoint
shouldBeEnabled = "Yes"
ignoreCount = "0"
continueAfterRunningActions = "No"
isPathRelative = "0"
filePath = "/Users/hieuhoang/unison/workspace/github/hieuhoang/mert/mert.cpp"
timestampString = "363625081.848519"
startingColumnNumber = "9223372036854775807"
endingColumnNumber = "9223372036854775807"
startingLineNumber = "326"
endingLineNumber = "326"
landmarkName = "main(int argc, char **argv)"
landmarkType = "7">
</FileBreakpoint>
</FileBreakpoints>
</Bucket>

View File

@ -13,10 +13,10 @@
buildForAnalyzing = "YES">
<BuildableReference
BuildableIdentifier = "primary"
BlueprintIdentifier = "D2AAC045055464E500DB518D"
BuildableName = "libmoses.a"
BlueprintName = "moses"
ReferencedContainer = "container:moses.xcodeproj">
BlueprintIdentifier = "1E1D825E15AC640800FE42E9"
BuildableName = "extractor"
BlueprintName = "extractor"
ReferencedContainer = "container:mert.xcodeproj">
</BuildableReference>
</BuildActionEntry>
</BuildActionEntries>
@ -35,6 +35,15 @@
launchStyle = "0"
useCustomWorkingDirectory = "NO"
buildConfiguration = "Debug">
<BuildableProductRunnable>
<BuildableReference
BuildableIdentifier = "primary"
BlueprintIdentifier = "1E1D825E15AC640800FE42E9"
BuildableName = "extractor"
BlueprintName = "extractor"
ReferencedContainer = "container:mert.xcodeproj">
</BuildableReference>
</BuildableProductRunnable>
<AdditionalOptions>
</AdditionalOptions>
</LaunchAction>
@ -43,6 +52,15 @@
savedToolIdentifier = ""
useCustomWorkingDirectory = "NO"
buildConfiguration = "Release">
<BuildableProductRunnable>
<BuildableReference
BuildableIdentifier = "primary"
BlueprintIdentifier = "1E1D825E15AC640800FE42E9"
BuildableName = "extractor"
BlueprintName = "extractor"
ReferencedContainer = "container:mert.xcodeproj">
</BuildableReference>
</BuildableProductRunnable>
</ProfileAction>
<AnalyzeAction
buildConfiguration = "Debug">

View File

@ -13,10 +13,10 @@
buildForAnalyzing = "YES">
<BuildableReference
BuildableIdentifier = "primary"
BlueprintIdentifier = "D2AAC045055464E500DB518D"
BuildableName = "libOnDiskPt.a"
BlueprintName = "OnDiskPt"
ReferencedContainer = "container:OnDiskPt.xcodeproj">
BlueprintIdentifier = "1EB0AF041593A2180007E2A4"
BuildableName = "mert"
BlueprintName = "mert"
ReferencedContainer = "container:mert.xcodeproj">
</BuildableReference>
</BuildActionEntry>
</BuildActionEntries>
@ -35,6 +35,15 @@
launchStyle = "0"
useCustomWorkingDirectory = "NO"
buildConfiguration = "Debug">
<BuildableProductRunnable>
<BuildableReference
BuildableIdentifier = "primary"
BlueprintIdentifier = "1EB0AF041593A2180007E2A4"
BuildableName = "mert"
BlueprintName = "mert"
ReferencedContainer = "container:mert.xcodeproj">
</BuildableReference>
</BuildableProductRunnable>
<AdditionalOptions>
</AdditionalOptions>
</LaunchAction>
@ -43,6 +52,15 @@
savedToolIdentifier = ""
useCustomWorkingDirectory = "NO"
buildConfiguration = "Release">
<BuildableProductRunnable>
<BuildableReference
BuildableIdentifier = "primary"
BlueprintIdentifier = "1EB0AF041593A2180007E2A4"
BuildableName = "mert"
BlueprintName = "mert"
ReferencedContainer = "container:mert.xcodeproj">
</BuildableReference>
</BuildableProductRunnable>
</ProfileAction>
<AnalyzeAction
buildConfiguration = "Debug">

View File

@ -0,0 +1,32 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>SchemeUserState</key>
<dict>
<key>extractor.xcscheme</key>
<dict>
<key>orderHint</key>
<integer>1</integer>
</dict>
<key>mert.xcscheme</key>
<dict>
<key>orderHint</key>
<integer>2</integer>
</dict>
</dict>
<key>SuppressBuildableAutocreation</key>
<dict>
<key>1E1D825E15AC640800FE42E9</key>
<dict>
<key>primary</key>
<true/>
</dict>
<key>1EB0AF041593A2180007E2A4</key>
<dict>
<key>primary</key>
<true/>
</dict>
</dict>
</dict>
</plist>

View File

@ -308,6 +308,7 @@
../../irstlm/lib,
../../srilm/lib/macosx,
/opt/local/lib,
../../cmph/lib,
);
OTHER_LDFLAGS = (
"-lz",
@ -318,6 +319,9 @@
"-lflm",
"-llattice",
"-lboost_thread-mt",
"-lboost_filesystem-mt",
"-lboost_system-mt",
"-lcmph",
);
PRODUCT_NAME = "moses-chart-cmd";
USER_HEADER_SEARCH_PATHS = "../../ ../../moses/src";
@ -341,6 +345,7 @@
../../irstlm/lib,
../../srilm/lib/macosx,
/opt/local/lib,
../../cmph/lib,
);
OTHER_LDFLAGS = (
"-lz",
@ -351,6 +356,9 @@
"-lflm",
"-llattice",
"-lboost_thread-mt",
"-lboost_filesystem-mt",
"-lboost_system-mt",
"-lcmph",
);
PRODUCT_NAME = "moses-chart-cmd";
USER_HEADER_SEARCH_PATHS = "../../ ../../moses/src";

View File

@ -43,6 +43,10 @@
<OutDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(SolutionDir)$(Configuration)\</OutDir>
<IntDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(Configuration)\</IntDir>
<LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</LinkIncremental>
<IncludePath Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">C:\Program Files\boost\boost_1_47;$(IncludePath)</IncludePath>
<IncludePath Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">C:\Program Files\boost\boost_1_47;$(IncludePath)</IncludePath>
<LibraryPath Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">C:\Program Files\boost\boost_1_47\lib;$(LibraryPath)</LibraryPath>
<LibraryPath Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">C:\Program Files\boost\boost_1_47\lib;$(LibraryPath)</LibraryPath>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile>

View File

@ -326,15 +326,20 @@
../../irstlm/lib,
../../srilm/lib/macosx,
/opt/local/lib,
../../cmph/lib,
);
OTHER_LDFLAGS = (
"-lflm",
"-lmisc",
"-loolm",
"-ldstruct",
"-lz",
"-lirstlm",
"-lmisc",
"-ldstruct",
"-loolm",
"-lflm",
"-llattice",
"-lboost_thread-mt",
"-lboost_filesystem-mt",
"-lboost_system-mt",
"-lcmph",
);
PREBINDING = NO;
PRODUCT_NAME = "moses-cmd";
@ -369,15 +374,20 @@
../../irstlm/lib,
../../srilm/lib/macosx,
/opt/local/lib,
../../cmph/lib,
);
OTHER_LDFLAGS = (
"-lflm",
"-lmisc",
"-loolm",
"-ldstruct",
"-lz",
"-lirstlm",
"-lmisc",
"-ldstruct",
"-loolm",
"-lflm",
"-llattice",
"-lboost_thread-mt",
"-lboost_filesystem-mt",
"-lboost_system-mt",
"-lcmph",
);
PREBINDING = NO;
PRODUCT_NAME = "moses-cmd";
@ -409,15 +419,20 @@
../../irstlm/lib,
../../srilm/lib/macosx,
/opt/local/lib,
../../cmph/lib,
);
OTHER_LDFLAGS = (
"-lflm",
"-lmisc",
"-loolm",
"-ldstruct",
"-lz",
"-lirstlm",
"-lmisc",
"-ldstruct",
"-loolm",
"-lflm",
"-llattice",
"-lboost_thread-mt",
"-lboost_filesystem-mt",
"-lboost_system-mt",
"-lcmph",
);
PREBINDING = NO;
PRODUCT_NAME = "moses-cmd";

View File

@ -0,0 +1,72 @@
<?xml version="1.0" encoding="UTF-8"?>
<Scheme
version = "1.3">
<BuildAction
parallelizeBuildables = "YES"
buildImplicitDependencies = "YES">
<BuildActionEntries>
<BuildActionEntry
buildForTesting = "YES"
buildForRunning = "YES"
buildForProfiling = "YES"
buildForArchiving = "YES"
buildForAnalyzing = "YES">
<BuildableReference
BuildableIdentifier = "primary"
BlueprintIdentifier = "8DD76F620486A84900D96B5E"
BuildableName = "moses-cmd"
BlueprintName = "moses-cmd"
ReferencedContainer = "container:moses-cmd.xcodeproj">
</BuildableReference>
</BuildActionEntry>
</BuildActionEntries>
</BuildAction>
<TestAction
selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.GDB"
selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.GDB"
shouldUseLaunchSchemeArgsEnv = "YES"
buildConfiguration = "Debug">
<Testables>
</Testables>
</TestAction>
<LaunchAction
selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.GDB"
selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.GDB"
launchStyle = "0"
useCustomWorkingDirectory = "NO"
buildConfiguration = "Debug">
<BuildableProductRunnable>
<BuildableReference
BuildableIdentifier = "primary"
BlueprintIdentifier = "8DD76F620486A84900D96B5E"
BuildableName = "moses-cmd"
BlueprintName = "moses-cmd"
ReferencedContainer = "container:moses-cmd.xcodeproj">
</BuildableReference>
</BuildableProductRunnable>
<AdditionalOptions>
</AdditionalOptions>
</LaunchAction>
<ProfileAction
shouldUseLaunchSchemeArgsEnv = "YES"
savedToolIdentifier = ""
useCustomWorkingDirectory = "NO"
buildConfiguration = "Release">
<BuildableProductRunnable>
<BuildableReference
BuildableIdentifier = "primary"
BlueprintIdentifier = "8DD76F620486A84900D96B5E"
BuildableName = "moses-cmd"
BlueprintName = "moses-cmd"
ReferencedContainer = "container:moses-cmd.xcodeproj">
</BuildableReference>
</BuildableProductRunnable>
</ProfileAction>
<AnalyzeAction
buildConfiguration = "Debug">
</AnalyzeAction>
<ArchiveAction
buildConfiguration = "Release"
revealArchiveInOrganizer = "YES">
</ArchiveAction>
</Scheme>

View File

@ -0,0 +1,22 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>SchemeUserState</key>
<dict>
<key>moses-cmd.xcscheme</key>
<dict>
<key>orderHint</key>
<integer>2</integer>
</dict>
</dict>
<key>SuppressBuildableAutocreation</key>
<dict>
<key>8DD76F620486A84900D96B5E</key>
<dict>
<key>primary</key>
<true/>
</dict>
</dict>
</dict>
</plist>

View File

@ -25,17 +25,27 @@
<tool id="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug.84059290" name="MacOS X C++ Linker" superClass="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug">
<option id="macosx.cpp.link.option.libs.1641794848" name="Libraries (-l)" superClass="macosx.cpp.link.option.libs" valueType="libs">
<listOptionValue builtIn="false" value="moses"/>
<listOptionValue builtIn="false" value="rt"/>
<listOptionValue builtIn="false" value="misc"/>
<listOptionValue builtIn="false" value="dstruct"/>
<listOptionValue builtIn="false" value="oolm"/>
<listOptionValue builtIn="false" value="flm"/>
<listOptionValue builtIn="false" value="lattice"/>
<listOptionValue builtIn="false" value="OnDiskPt"/>
<listOptionValue builtIn="false" value="lm"/>
<listOptionValue builtIn="false" value="util"/>
<listOptionValue builtIn="false" value="irstlm"/>
<listOptionValue builtIn="false" value="z"/>
<listOptionValue builtIn="false" value="boost_system"/>
<listOptionValue builtIn="false" value="boost_filesystem"/>
</option>
<option id="macosx.cpp.link.option.paths.1615268628" name="Library search path (-L)" superClass="macosx.cpp.link.option.paths" valueType="libPaths">
<listOptionValue builtIn="false" value="/Users/hieuhoang/workspace/github/moses-smt/contrib/other-builds/moses/Debug"/>
<listOptionValue builtIn="false" value="/Users/hieuhoang/workspace/github/moses-smt/contrib/other-builds/OnDiskPt/Debug"/>
<listOptionValue builtIn="false" value="/Users/hieuhoang/workspace/github/moses-smt/contrib/other-builds/lm/Debug"/>
<listOptionValue builtIn="false" value="/Users/hieuhoang/workspace/github/moses-smt/contrib/other-builds/util/Debug"/>
<listOptionValue builtIn="false" value="/Users/hieuhoang/workspace/github/moses-smt/irstlm/lib"/>
<listOptionValue builtIn="false" value="${workspace_loc:/moses}/Debug"/>
<listOptionValue builtIn="false" value="${workspace_loc:}/../../srilm/lib/i686-m64"/>
<listOptionValue builtIn="false" value="${workspace_loc:/OnDiskPt}/Debug"/>
<listOptionValue builtIn="false" value="${workspace_loc:/lm}/Debug"/>
<listOptionValue builtIn="false" value="${workspace_loc:/util}/Debug"/>
<listOptionValue builtIn="false" value="${workspace_loc:}/../../irstlm/lib"/>
</option>
<inputType id="cdt.managedbuild.tool.macosx.cpp.linker.input.412058804" superClass="cdt.managedbuild.tool.macosx.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
@ -51,8 +61,11 @@
<option id="gnu.cpp.compiler.macosx.exe.debug.option.debugging.level.1176009559" name="Debug Level" superClass="gnu.cpp.compiler.macosx.exe.debug.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.include.paths.1024398579" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="/opt/local/include"/>
<listOptionValue builtIn="false" value="/Users/hieuhoang/unison/workspace/github/moses-smt/moses/src"/>
<listOptionValue builtIn="false" value="/Users/hieuhoang/unison/workspace/github/moses-smt"/>
<listOptionValue builtIn="false" value="${workspace_loc}/../../moses/src"/>
<listOptionValue builtIn="false" value="${workspace_loc}/../../"/>
</option>
<option id="gnu.cpp.compiler.option.preprocessor.def.491464216" name="Defined symbols (-D)" superClass="gnu.cpp.compiler.option.preprocessor.def" valueType="definedSymbols">
<listOptionValue builtIn="false" value="TRACE_ENABLE"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.240921565" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
@ -122,12 +135,13 @@
<storageModule moduleId="refreshScope" versionNumber="1">
<resource resourceType="PROJECT" workspacePath="/moses-cmd"/>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
<storageModule moduleId="scannerConfiguration">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.macosx.exe.debug.341255150;cdt.managedbuild.config.gnu.macosx.exe.debug.341255150.;cdt.managedbuild.tool.gnu.c.compiler.macosx.exe.debug.1201400609;cdt.managedbuild.tool.gnu.c.compiler.input.2031799877">
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.macosx.exe.release.1916112479;cdt.managedbuild.config.macosx.exe.release.1916112479.;cdt.managedbuild.tool.gnu.c.compiler.macosx.exe.release.759110223;cdt.managedbuild.tool.gnu.c.compiler.input.1452105399">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.macosx.exe.release.1916112479;cdt.managedbuild.config.macosx.exe.release.1916112479.;cdt.managedbuild.tool.gnu.c.compiler.macosx.exe.release.759110223;cdt.managedbuild.tool.gnu.c.compiler.input.1452105399">
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.macosx.exe.debug.341255150;cdt.managedbuild.config.gnu.macosx.exe.debug.341255150.;cdt.managedbuild.tool.gnu.c.compiler.macosx.exe.debug.1201400609;cdt.managedbuild.tool.gnu.c.compiler.input.2031799877">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.macosx.exe.release.1916112479;cdt.managedbuild.config.macosx.exe.release.1916112479.;cdt.managedbuild.tool.gnu.cpp.compiler.macosx.exe.release.1219375865;cdt.managedbuild.tool.gnu.cpp.compiler.input.604224475">

View File

@ -20,6 +20,8 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CreateOnDisk", "CreateOnDis
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "kenlm", "kenlm.vcxproj", "{A5402E0B-6ED7-465C-9669-E4124A0CDDCB}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mosesserver", "mosesserver.vcxproj", "{85811FDF-8AD1-4490-A545-B2F51931A18C}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Win32 = Debug|Win32
@ -39,11 +41,17 @@ Global
{E2233DB1-5592-46FE-9420-E529420612FA}.Release|Win32.ActiveCfg = Release|Win32
{E2233DB1-5592-46FE-9420-E529420612FA}.Release|Win32.Build.0 = Release|Win32
{88AE90C9-72D2-42ED-8389-770ACDCD4308}.Debug|Win32.ActiveCfg = Debug|Win32
{88AE90C9-72D2-42ED-8389-770ACDCD4308}.Debug|Win32.Build.0 = Debug|Win32
{88AE90C9-72D2-42ED-8389-770ACDCD4308}.Release|Win32.ActiveCfg = Release|Win32
{88AE90C9-72D2-42ED-8389-770ACDCD4308}.Release|Win32.Build.0 = Release|Win32
{A5402E0B-6ED7-465C-9669-E4124A0CDDCB}.Debug|Win32.ActiveCfg = Debug|Win32
{A5402E0B-6ED7-465C-9669-E4124A0CDDCB}.Debug|Win32.Build.0 = Debug|Win32
{A5402E0B-6ED7-465C-9669-E4124A0CDDCB}.Release|Win32.ActiveCfg = Release|Win32
{A5402E0B-6ED7-465C-9669-E4124A0CDDCB}.Release|Win32.Build.0 = Release|Win32
{85811FDF-8AD1-4490-A545-B2F51931A18C}.Debug|Win32.ActiveCfg = Debug|Win32
{85811FDF-8AD1-4490-A545-B2F51931A18C}.Debug|Win32.Build.0 = Debug|Win32
{85811FDF-8AD1-4490-A545-B2F51931A18C}.Release|Win32.ActiveCfg = Release|Win32
{85811FDF-8AD1-4490-A545-B2F51931A18C}.Release|Win32.Build.0 = Release|Win32
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE

View File

@ -13,6 +13,7 @@
<ItemGroup>
<ClInclude Include="..\..\moses\src\AlignmentInfo.h" />
<ClInclude Include="..\..\moses\src\AlignmentInfoCollection.h" />
<ClInclude Include="..\..\moses\src\BilingualDynSuffixArray.h" />
<ClInclude Include="..\..\moses\src\BitmapContainer.h" />
<ClInclude Include="..\..\moses\src\CellCollection.h" />
<ClInclude Include="..\..\moses\src\ChartCell.h" />
@ -162,6 +163,7 @@
<ItemGroup>
<ClCompile Include="..\..\moses\src\AlignmentInfo.cpp" />
<ClCompile Include="..\..\moses\src\AlignmentInfoCollection.cpp" />
<ClCompile Include="..\..\moses\src\BilingualDynSuffixArray.cpp" />
<ClCompile Include="..\..\moses\src\BitmapContainer.cpp" />
<ClCompile Include="..\..\moses\src\ChartCell.cpp" />
<ClCompile Include="..\..\moses\src\ChartCellCollection.cpp" />
@ -319,13 +321,13 @@
<IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(Configuration)\</IntDir>
<OutDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(SolutionDir)$(Configuration)\</OutDir>
<IntDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(Configuration)\</IntDir>
<IncludePath Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">C:\GnuWin32\include;C:\Program Files\boost\boost_1_47;$(IncludePath)</IncludePath>
<IncludePath Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">C:\GnuWin32\include;C:\Program Files\boost\boost_1_47;$(IncludePath)</IncludePath>
<IncludePath Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">C:\Program Files\boost\boost_1_47;C:\GnuWin32\include;$(IncludePath)</IncludePath>
<IncludePath Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">C:\Program Files\boost\boost_1_47;C:\GnuWin32\include;$(IncludePath)</IncludePath>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile>
<Optimization>Disabled</Optimization>
<AdditionalIncludeDirectories>$(SolutionDir)\..\..\lm\msinttypes;C:\boost\boost_1_47;$(SolutionDir)/../../moses/src;$(SolutionDir)/../../;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories>C:\boost\boost_1_47;$(SolutionDir)/../../moses/src;$(SolutionDir)/../../;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<PreprocessorDefinitions>WITH_THREADS;NO_PIPES;WIN32;_DEBUG;_CONSOLE;TRACE_ENABLE;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<MinimalRebuild>true</MinimalRebuild>
<BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
@ -344,7 +346,7 @@
<InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
<IntrinsicFunctions>true</IntrinsicFunctions>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
<AdditionalIncludeDirectories>$(SolutionDir)\..\..\lm\msinttypes;C:\boost\boost_1_47;$(SolutionDir)/../../moses/src;$(SolutionDir)/../../;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories>C:\boost\boost_1_47;$(SolutionDir)/../../moses/src;$(SolutionDir)/../../;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<PreprocessorDefinitions>WITH_THREADS;NO_PIPES;WIN32;NDEBUG;_CONSOLE;LM_INTERNAL;TRACE_ENABLE;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
<PrecompiledHeader>

View File

@ -7,8 +7,38 @@
objects = {
/* Begin PBXBuildFile section */
1E0BA41815B70E5F00AC70E1 /* PhraseDictionaryFuzzyMatch.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1E0BA41615B70E5F00AC70E1 /* PhraseDictionaryFuzzyMatch.cpp */; };
1E0BA41915B70E5F00AC70E1 /* PhraseDictionaryFuzzyMatch.h in Headers */ = {isa = PBXBuildFile; fileRef = 1E0BA41715B70E5F00AC70E1 /* PhraseDictionaryFuzzyMatch.h */; };
1E1D824015AC29BB00FE42E9 /* FileHandler.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1E1D823E15AC29BB00FE42E9 /* FileHandler.cpp */; };
1E1D824115AC29BB00FE42E9 /* FileHandler.h in Headers */ = {isa = PBXBuildFile; fileRef = 1E1D823F15AC29BB00FE42E9 /* FileHandler.h */; };
1E365EEA16120F4600BA335B /* ChartTranslationOptions.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1E365EE816120F4600BA335B /* ChartTranslationOptions.cpp */; };
1E365EEB16120F4600BA335B /* ChartTranslationOptions.h in Headers */ = {isa = PBXBuildFile; fileRef = 1E365EE916120F4600BA335B /* ChartTranslationOptions.h */; };
1E619EA115B8713700C2D7A7 /* ChartRuleLookupManagerMemoryPerSentence.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1E619E9F15B8713600C2D7A7 /* ChartRuleLookupManagerMemoryPerSentence.cpp */; };
1E619EA215B8713700C2D7A7 /* ChartRuleLookupManagerMemoryPerSentence.h in Headers */ = {isa = PBXBuildFile; fileRef = 1E619EA015B8713700C2D7A7 /* ChartRuleLookupManagerMemoryPerSentence.h */; };
1E6D9FD615D027560064D436 /* BlockHashIndex.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1E6D9FBD15D027560064D436 /* BlockHashIndex.cpp */; };
1E6D9FD715D027560064D436 /* BlockHashIndex.h in Headers */ = {isa = PBXBuildFile; fileRef = 1E6D9FBE15D027560064D436 /* BlockHashIndex.h */; };
1E6D9FD815D027560064D436 /* CanonicalHuffman.h in Headers */ = {isa = PBXBuildFile; fileRef = 1E6D9FBF15D027560064D436 /* CanonicalHuffman.h */; };
1E6D9FD915D027560064D436 /* CmphStringVectorAdapter.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1E6D9FC015D027560064D436 /* CmphStringVectorAdapter.cpp */; };
1E6D9FDA15D027560064D436 /* CmphStringVectorAdapter.h in Headers */ = {isa = PBXBuildFile; fileRef = 1E6D9FC115D027560064D436 /* CmphStringVectorAdapter.h */; };
1E6D9FDB15D027560064D436 /* ConsistantPhrases.h in Headers */ = {isa = PBXBuildFile; fileRef = 1E6D9FC215D027560064D436 /* ConsistantPhrases.h */; };
1E6D9FDD15D027560064D436 /* LexicalReorderingTableCompact.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1E6D9FC415D027560064D436 /* LexicalReorderingTableCompact.cpp */; };
1E6D9FDE15D027560064D436 /* LexicalReorderingTableCompact.h in Headers */ = {isa = PBXBuildFile; fileRef = 1E6D9FC515D027560064D436 /* LexicalReorderingTableCompact.h */; };
1E6D9FDF15D027560064D436 /* LexicalReorderingTableCreator.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1E6D9FC615D027560064D436 /* LexicalReorderingTableCreator.cpp */; };
1E6D9FE015D027560064D436 /* LexicalReorderingTableCreator.h in Headers */ = {isa = PBXBuildFile; fileRef = 1E6D9FC715D027560064D436 /* LexicalReorderingTableCreator.h */; };
1E6D9FE115D027560064D436 /* ListCoders.h in Headers */ = {isa = PBXBuildFile; fileRef = 1E6D9FC815D027560064D436 /* ListCoders.h */; };
1E6D9FE215D027560064D436 /* MmapAllocator.h in Headers */ = {isa = PBXBuildFile; fileRef = 1E6D9FC915D027560064D436 /* MmapAllocator.h */; };
1E6D9FE315D027560064D436 /* MonotonicVector.h in Headers */ = {isa = PBXBuildFile; fileRef = 1E6D9FCA15D027560064D436 /* MonotonicVector.h */; };
1E6D9FE415D027560064D436 /* MurmurHash3.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1E6D9FCB15D027560064D436 /* MurmurHash3.cpp */; };
1E6D9FE515D027560064D436 /* MurmurHash3.h in Headers */ = {isa = PBXBuildFile; fileRef = 1E6D9FCC15D027560064D436 /* MurmurHash3.h */; };
1E6D9FE615D027560064D436 /* PackedArray.h in Headers */ = {isa = PBXBuildFile; fileRef = 1E6D9FCD15D027560064D436 /* PackedArray.h */; };
1E6D9FE715D027560064D436 /* PhraseDecoder.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1E6D9FCE15D027560064D436 /* PhraseDecoder.cpp */; };
1E6D9FE815D027560064D436 /* PhraseDecoder.h in Headers */ = {isa = PBXBuildFile; fileRef = 1E6D9FCF15D027560064D436 /* PhraseDecoder.h */; };
1E6D9FE915D027560064D436 /* PhraseDictionaryCompact.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1E6D9FD015D027560064D436 /* PhraseDictionaryCompact.cpp */; };
1E6D9FEA15D027560064D436 /* PhraseDictionaryCompact.h in Headers */ = {isa = PBXBuildFile; fileRef = 1E6D9FD115D027560064D436 /* PhraseDictionaryCompact.h */; };
1E6D9FEB15D027560064D436 /* PhraseTableCreator.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1E6D9FD215D027560064D436 /* PhraseTableCreator.cpp */; };
1E6D9FEC15D027560064D436 /* PhraseTableCreator.h in Headers */ = {isa = PBXBuildFile; fileRef = 1E6D9FD315D027560064D436 /* PhraseTableCreator.h */; };
1E6D9FED15D027560064D436 /* StringVector.h in Headers */ = {isa = PBXBuildFile; fileRef = 1E6D9FD415D027560064D436 /* StringVector.h */; };
1E6D9FEE15D027560064D436 /* TargetPhraseCollectionCache.h in Headers */ = {isa = PBXBuildFile; fileRef = 1E6D9FD515D027560064D436 /* TargetPhraseCollectionCache.h */; };
1E879EA715A346F90051F346 /* SearchNormalBatch.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1E879EA515A346F90051F346 /* SearchNormalBatch.cpp */; };
1E879EA815A346F90051F346 /* SearchNormalBatch.h in Headers */ = {isa = PBXBuildFile; fileRef = 1E879EA615A346F90051F346 /* SearchNormalBatch.h */; };
1EAC363514CDC79300DF97C3 /* Loader.h in Headers */ = {isa = PBXBuildFile; fileRef = 1EAC362C14CDC79300DF97C3 /* Loader.h */; };
@ -20,6 +50,8 @@
1EAC363B14CDC79300DF97C3 /* LoaderHiero.h in Headers */ = {isa = PBXBuildFile; fileRef = 1EAC363214CDC79300DF97C3 /* LoaderHiero.h */; };
1EAC363C14CDC79300DF97C3 /* LoaderStandard.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EAC363314CDC79300DF97C3 /* LoaderStandard.cpp */; };
1EAC363D14CDC79300DF97C3 /* LoaderStandard.h in Headers */ = {isa = PBXBuildFile; fileRef = 1EAC363414CDC79300DF97C3 /* LoaderStandard.h */; };
1EC32DB815D2D90700A313B1 /* ThrowingFwrite.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EC32DB615D2D90700A313B1 /* ThrowingFwrite.cpp */; };
1EC32DB915D2D90700A313B1 /* ThrowingFwrite.h in Headers */ = {isa = PBXBuildFile; fileRef = 1EC32DB715D2D90700A313B1 /* ThrowingFwrite.h */; };
1EC7374614B977AB00238410 /* AlignmentInfo.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EC735D314B977AA00238410 /* AlignmentInfo.cpp */; };
1EC7374714B977AB00238410 /* AlignmentInfo.h in Headers */ = {isa = PBXBuildFile; fileRef = 1EC735D414B977AA00238410 /* AlignmentInfo.h */; };
1EC7374814B977AB00238410 /* AlignmentInfoCollection.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EC735D514B977AA00238410 /* AlignmentInfoCollection.cpp */; };
@ -28,7 +60,6 @@
1EC7374B14B977AB00238410 /* BilingualDynSuffixArray.h in Headers */ = {isa = PBXBuildFile; fileRef = 1EC735D814B977AA00238410 /* BilingualDynSuffixArray.h */; };
1EC7374C14B977AB00238410 /* BitmapContainer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EC735D914B977AA00238410 /* BitmapContainer.cpp */; };
1EC7374D14B977AB00238410 /* BitmapContainer.h in Headers */ = {isa = PBXBuildFile; fileRef = 1EC735DA14B977AA00238410 /* BitmapContainer.h */; };
1EC7374E14B977AB00238410 /* CellCollection.h in Headers */ = {isa = PBXBuildFile; fileRef = 1EC735DB14B977AA00238410 /* CellCollection.h */; };
1EC7374F14B977AB00238410 /* ChartCell.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EC735DC14B977AA00238410 /* ChartCell.cpp */; };
1EC7375014B977AB00238410 /* ChartCell.h in Headers */ = {isa = PBXBuildFile; fileRef = 1EC735DD14B977AA00238410 /* ChartCell.h */; };
1EC7375114B977AB00238410 /* ChartCellCollection.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EC735DE14B977AA00238410 /* ChartCellCollection.cpp */; };
@ -42,10 +73,6 @@
1EC7375914B977AB00238410 /* ChartManager.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EC735E614B977AA00238410 /* ChartManager.cpp */; };
1EC7375A14B977AB00238410 /* ChartManager.h in Headers */ = {isa = PBXBuildFile; fileRef = 1EC735E714B977AA00238410 /* ChartManager.h */; };
1EC7375C14B977AB00238410 /* ChartRuleLookupManager.h in Headers */ = {isa = PBXBuildFile; fileRef = 1EC735E914B977AA00238410 /* ChartRuleLookupManager.h */; };
1EC7376114B977AB00238410 /* ChartTranslationOption.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EC735EE14B977AA00238410 /* ChartTranslationOption.cpp */; };
1EC7376214B977AB00238410 /* ChartTranslationOption.h in Headers */ = {isa = PBXBuildFile; fileRef = 1EC735EF14B977AA00238410 /* ChartTranslationOption.h */; };
1EC7376314B977AB00238410 /* ChartTranslationOptionCollection.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EC735F014B977AA00238410 /* ChartTranslationOptionCollection.cpp */; };
1EC7376414B977AB00238410 /* ChartTranslationOptionCollection.h in Headers */ = {isa = PBXBuildFile; fileRef = 1EC735F114B977AA00238410 /* ChartTranslationOptionCollection.h */; };
1EC7376514B977AB00238410 /* ChartTranslationOptionList.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EC735F214B977AA00238410 /* ChartTranslationOptionList.cpp */; };
1EC7376614B977AB00238410 /* ChartTranslationOptionList.h in Headers */ = {isa = PBXBuildFile; fileRef = 1EC735F314B977AA00238410 /* ChartTranslationOptionList.h */; };
1EC7376714B977AB00238410 /* ChartTrellisDetour.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EC735F414B977AA00238410 /* ChartTrellisDetour.cpp */; };
@ -295,14 +322,53 @@
1EDA809114D19FBF003D2191 /* UTrie.h in Headers */ = {isa = PBXBuildFile; fileRef = 1EDA808314D19FBF003D2191 /* UTrie.h */; };
1EDA809214D19FBF003D2191 /* UTrieNode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EDA808414D19FBF003D2191 /* UTrieNode.cpp */; };
1EDA809314D19FBF003D2191 /* UTrieNode.h in Headers */ = {isa = PBXBuildFile; fileRef = 1EDA808514D19FBF003D2191 /* UTrieNode.h */; };
1EE418ED15C7FDCB0028F9AB /* Match.h in Headers */ = {isa = PBXBuildFile; fileRef = 1EE418E415C7FDCB0028F9AB /* Match.h */; };
1EE418EE15C7FDCB0028F9AB /* SentenceAlignment.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EE418E515C7FDCB0028F9AB /* SentenceAlignment.cpp */; };
1EE418EF15C7FDCB0028F9AB /* SentenceAlignment.h in Headers */ = {isa = PBXBuildFile; fileRef = 1EE418E615C7FDCB0028F9AB /* SentenceAlignment.h */; };
1EE418F015C7FDCB0028F9AB /* SuffixArray.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EE418E715C7FDCB0028F9AB /* SuffixArray.cpp */; };
1EE418F115C7FDCB0028F9AB /* SuffixArray.h in Headers */ = {isa = PBXBuildFile; fileRef = 1EE418E815C7FDCB0028F9AB /* SuffixArray.h */; };
1EE418F215C7FDCB0028F9AB /* FuzzyMatchWrapper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EE418E915C7FDCB0028F9AB /* FuzzyMatchWrapper.cpp */; };
1EE418F315C7FDCB0028F9AB /* FuzzyMatchWrapper.h in Headers */ = {isa = PBXBuildFile; fileRef = 1EE418EA15C7FDCB0028F9AB /* FuzzyMatchWrapper.h */; };
1EE418F415C7FDCB0028F9AB /* Vocabulary.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EE418EB15C7FDCB0028F9AB /* Vocabulary.cpp */; };
1EE418F515C7FDCB0028F9AB /* Vocabulary.h in Headers */ = {isa = PBXBuildFile; fileRef = 1EE418EC15C7FDCB0028F9AB /* Vocabulary.h */; };
1EF0709314B9EFCC0052152A /* ParallelBackoff.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EF0709114B9EFCC0052152A /* ParallelBackoff.cpp */; };
1EF0709414B9EFCC0052152A /* ParallelBackoff.h in Headers */ = {isa = PBXBuildFile; fileRef = 1EF0709214B9EFCC0052152A /* ParallelBackoff.h */; };
1EF8F2C4159A61970047B613 /* HypoList.h in Headers */ = {isa = PBXBuildFile; fileRef = 1EF8F2C3159A61970047B613 /* HypoList.h */; };
/* End PBXBuildFile section */
/* Begin PBXFileReference section */
1E0BA41615B70E5F00AC70E1 /* PhraseDictionaryFuzzyMatch.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = PhraseDictionaryFuzzyMatch.cpp; path = ../../moses/src/RuleTable/PhraseDictionaryFuzzyMatch.cpp; sourceTree = "<group>"; };
1E0BA41715B70E5F00AC70E1 /* PhraseDictionaryFuzzyMatch.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = PhraseDictionaryFuzzyMatch.h; path = ../../moses/src/RuleTable/PhraseDictionaryFuzzyMatch.h; sourceTree = "<group>"; };
1E1D823E15AC29BB00FE42E9 /* FileHandler.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = FileHandler.cpp; sourceTree = "<group>"; };
1E1D823F15AC29BB00FE42E9 /* FileHandler.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = FileHandler.h; sourceTree = "<group>"; };
1E365EE816120F4600BA335B /* ChartTranslationOptions.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = ChartTranslationOptions.cpp; path = ../../moses/src/ChartTranslationOptions.cpp; sourceTree = "<group>"; };
1E365EE916120F4600BA335B /* ChartTranslationOptions.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ChartTranslationOptions.h; path = ../../moses/src/ChartTranslationOptions.h; sourceTree = "<group>"; };
1E619E9F15B8713600C2D7A7 /* ChartRuleLookupManagerMemoryPerSentence.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = ChartRuleLookupManagerMemoryPerSentence.cpp; path = ../../moses/src/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.cpp; sourceTree = "<group>"; };
1E619EA015B8713700C2D7A7 /* ChartRuleLookupManagerMemoryPerSentence.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ChartRuleLookupManagerMemoryPerSentence.h; path = ../../moses/src/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.h; sourceTree = "<group>"; };
1E6D9FBD15D027560064D436 /* BlockHashIndex.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = BlockHashIndex.cpp; path = ../../moses/src/CompactPT/BlockHashIndex.cpp; sourceTree = "<group>"; };
1E6D9FBE15D027560064D436 /* BlockHashIndex.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = BlockHashIndex.h; path = ../../moses/src/CompactPT/BlockHashIndex.h; sourceTree = "<group>"; };
1E6D9FBF15D027560064D436 /* CanonicalHuffman.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = CanonicalHuffman.h; path = ../../moses/src/CompactPT/CanonicalHuffman.h; sourceTree = "<group>"; };
1E6D9FC015D027560064D436 /* CmphStringVectorAdapter.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = CmphStringVectorAdapter.cpp; path = ../../moses/src/CompactPT/CmphStringVectorAdapter.cpp; sourceTree = "<group>"; };
1E6D9FC115D027560064D436 /* CmphStringVectorAdapter.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = CmphStringVectorAdapter.h; path = ../../moses/src/CompactPT/CmphStringVectorAdapter.h; sourceTree = "<group>"; };
1E6D9FC215D027560064D436 /* ConsistantPhrases.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ConsistantPhrases.h; path = ../../moses/src/CompactPT/ConsistantPhrases.h; sourceTree = "<group>"; };
1E6D9FC415D027560064D436 /* LexicalReorderingTableCompact.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = LexicalReorderingTableCompact.cpp; path = ../../moses/src/CompactPT/LexicalReorderingTableCompact.cpp; sourceTree = "<group>"; };
1E6D9FC515D027560064D436 /* LexicalReorderingTableCompact.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = LexicalReorderingTableCompact.h; path = ../../moses/src/CompactPT/LexicalReorderingTableCompact.h; sourceTree = "<group>"; };
1E6D9FC615D027560064D436 /* LexicalReorderingTableCreator.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = LexicalReorderingTableCreator.cpp; path = ../../moses/src/CompactPT/LexicalReorderingTableCreator.cpp; sourceTree = "<group>"; };
1E6D9FC715D027560064D436 /* LexicalReorderingTableCreator.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = LexicalReorderingTableCreator.h; path = ../../moses/src/CompactPT/LexicalReorderingTableCreator.h; sourceTree = "<group>"; };
1E6D9FC815D027560064D436 /* ListCoders.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ListCoders.h; path = ../../moses/src/CompactPT/ListCoders.h; sourceTree = "<group>"; };
1E6D9FC915D027560064D436 /* MmapAllocator.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = MmapAllocator.h; path = ../../moses/src/CompactPT/MmapAllocator.h; sourceTree = "<group>"; };
1E6D9FCA15D027560064D436 /* MonotonicVector.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = MonotonicVector.h; path = ../../moses/src/CompactPT/MonotonicVector.h; sourceTree = "<group>"; };
1E6D9FCB15D027560064D436 /* MurmurHash3.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = MurmurHash3.cpp; path = ../../moses/src/CompactPT/MurmurHash3.cpp; sourceTree = "<group>"; };
1E6D9FCC15D027560064D436 /* MurmurHash3.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = MurmurHash3.h; path = ../../moses/src/CompactPT/MurmurHash3.h; sourceTree = "<group>"; };
1E6D9FCD15D027560064D436 /* PackedArray.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = PackedArray.h; path = ../../moses/src/CompactPT/PackedArray.h; sourceTree = "<group>"; };
1E6D9FCE15D027560064D436 /* PhraseDecoder.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = PhraseDecoder.cpp; path = ../../moses/src/CompactPT/PhraseDecoder.cpp; sourceTree = "<group>"; };
1E6D9FCF15D027560064D436 /* PhraseDecoder.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = PhraseDecoder.h; path = ../../moses/src/CompactPT/PhraseDecoder.h; sourceTree = "<group>"; };
1E6D9FD015D027560064D436 /* PhraseDictionaryCompact.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = PhraseDictionaryCompact.cpp; path = ../../moses/src/CompactPT/PhraseDictionaryCompact.cpp; sourceTree = "<group>"; };
1E6D9FD115D027560064D436 /* PhraseDictionaryCompact.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = PhraseDictionaryCompact.h; path = ../../moses/src/CompactPT/PhraseDictionaryCompact.h; sourceTree = "<group>"; };
1E6D9FD215D027560064D436 /* PhraseTableCreator.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = PhraseTableCreator.cpp; path = ../../moses/src/CompactPT/PhraseTableCreator.cpp; sourceTree = "<group>"; };
1E6D9FD315D027560064D436 /* PhraseTableCreator.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = PhraseTableCreator.h; path = ../../moses/src/CompactPT/PhraseTableCreator.h; sourceTree = "<group>"; };
1E6D9FD415D027560064D436 /* StringVector.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = StringVector.h; path = ../../moses/src/CompactPT/StringVector.h; sourceTree = "<group>"; };
1E6D9FD515D027560064D436 /* TargetPhraseCollectionCache.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = TargetPhraseCollectionCache.h; path = ../../moses/src/CompactPT/TargetPhraseCollectionCache.h; sourceTree = "<group>"; };
1E879EA515A346F90051F346 /* SearchNormalBatch.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = SearchNormalBatch.cpp; path = ../../moses/src/SearchNormalBatch.cpp; sourceTree = "<group>"; };
1E879EA615A346F90051F346 /* SearchNormalBatch.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = SearchNormalBatch.h; path = ../../moses/src/SearchNormalBatch.h; sourceTree = "<group>"; };
1EAC362C14CDC79300DF97C3 /* Loader.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = Loader.h; path = ../../moses/src/RuleTable/Loader.h; sourceTree = "<group>"; };
@ -314,6 +380,8 @@
1EAC363214CDC79300DF97C3 /* LoaderHiero.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = LoaderHiero.h; path = ../../moses/src/RuleTable/LoaderHiero.h; sourceTree = "<group>"; };
1EAC363314CDC79300DF97C3 /* LoaderStandard.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = LoaderStandard.cpp; path = ../../moses/src/RuleTable/LoaderStandard.cpp; sourceTree = "<group>"; };
1EAC363414CDC79300DF97C3 /* LoaderStandard.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = LoaderStandard.h; path = ../../moses/src/RuleTable/LoaderStandard.h; sourceTree = "<group>"; };
1EC32DB615D2D90700A313B1 /* ThrowingFwrite.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = ThrowingFwrite.cpp; path = ../../moses/src/CompactPT/ThrowingFwrite.cpp; sourceTree = "<group>"; };
1EC32DB715D2D90700A313B1 /* ThrowingFwrite.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ThrowingFwrite.h; path = ../../moses/src/CompactPT/ThrowingFwrite.h; sourceTree = "<group>"; };
1EC735D314B977AA00238410 /* AlignmentInfo.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = AlignmentInfo.cpp; path = ../../moses/src/AlignmentInfo.cpp; sourceTree = "<group>"; };
1EC735D414B977AA00238410 /* AlignmentInfo.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = AlignmentInfo.h; path = ../../moses/src/AlignmentInfo.h; sourceTree = "<group>"; };
1EC735D514B977AA00238410 /* AlignmentInfoCollection.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = AlignmentInfoCollection.cpp; path = ../../moses/src/AlignmentInfoCollection.cpp; sourceTree = "<group>"; };
@ -322,7 +390,6 @@
1EC735D814B977AA00238410 /* BilingualDynSuffixArray.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = BilingualDynSuffixArray.h; path = ../../moses/src/BilingualDynSuffixArray.h; sourceTree = "<group>"; };
1EC735D914B977AA00238410 /* BitmapContainer.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = BitmapContainer.cpp; path = ../../moses/src/BitmapContainer.cpp; sourceTree = "<group>"; };
1EC735DA14B977AA00238410 /* BitmapContainer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = BitmapContainer.h; path = ../../moses/src/BitmapContainer.h; sourceTree = "<group>"; };
1EC735DB14B977AA00238410 /* CellCollection.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = CellCollection.h; path = ../../moses/src/CellCollection.h; sourceTree = "<group>"; };
1EC735DC14B977AA00238410 /* ChartCell.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = ChartCell.cpp; path = ../../moses/src/ChartCell.cpp; sourceTree = "<group>"; };
1EC735DD14B977AA00238410 /* ChartCell.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ChartCell.h; path = ../../moses/src/ChartCell.h; sourceTree = "<group>"; };
1EC735DE14B977AA00238410 /* ChartCellCollection.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = ChartCellCollection.cpp; path = ../../moses/src/ChartCellCollection.cpp; sourceTree = "<group>"; };
@ -336,10 +403,6 @@
1EC735E614B977AA00238410 /* ChartManager.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = ChartManager.cpp; path = ../../moses/src/ChartManager.cpp; sourceTree = "<group>"; };
1EC735E714B977AA00238410 /* ChartManager.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ChartManager.h; path = ../../moses/src/ChartManager.h; sourceTree = "<group>"; };
1EC735E914B977AA00238410 /* ChartRuleLookupManager.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ChartRuleLookupManager.h; path = ../../moses/src/ChartRuleLookupManager.h; sourceTree = "<group>"; };
1EC735EE14B977AA00238410 /* ChartTranslationOption.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = ChartTranslationOption.cpp; path = ../../moses/src/ChartTranslationOption.cpp; sourceTree = "<group>"; };
1EC735EF14B977AA00238410 /* ChartTranslationOption.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ChartTranslationOption.h; path = ../../moses/src/ChartTranslationOption.h; sourceTree = "<group>"; };
1EC735F014B977AA00238410 /* ChartTranslationOptionCollection.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = ChartTranslationOptionCollection.cpp; path = ../../moses/src/ChartTranslationOptionCollection.cpp; sourceTree = "<group>"; };
1EC735F114B977AA00238410 /* ChartTranslationOptionCollection.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ChartTranslationOptionCollection.h; path = ../../moses/src/ChartTranslationOptionCollection.h; sourceTree = "<group>"; };
1EC735F214B977AA00238410 /* ChartTranslationOptionList.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = ChartTranslationOptionList.cpp; path = ../../moses/src/ChartTranslationOptionList.cpp; sourceTree = "<group>"; };
1EC735F314B977AA00238410 /* ChartTranslationOptionList.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ChartTranslationOptionList.h; path = ../../moses/src/ChartTranslationOptionList.h; sourceTree = "<group>"; };
1EC735F414B977AA00238410 /* ChartTrellisDetour.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = ChartTrellisDetour.cpp; path = ../../moses/src/ChartTrellisDetour.cpp; sourceTree = "<group>"; };
@ -591,6 +654,15 @@
1EDA808314D19FBF003D2191 /* UTrie.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = UTrie.h; path = ../../moses/src/RuleTable/UTrie.h; sourceTree = "<group>"; };
1EDA808414D19FBF003D2191 /* UTrieNode.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = UTrieNode.cpp; path = ../../moses/src/RuleTable/UTrieNode.cpp; sourceTree = "<group>"; };
1EDA808514D19FBF003D2191 /* UTrieNode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = UTrieNode.h; path = ../../moses/src/RuleTable/UTrieNode.h; sourceTree = "<group>"; };
1EE418E415C7FDCB0028F9AB /* Match.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = Match.h; path = "../../moses/src/fuzzy-match/Match.h"; sourceTree = "<group>"; };
1EE418E515C7FDCB0028F9AB /* SentenceAlignment.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = SentenceAlignment.cpp; path = "../../moses/src/fuzzy-match/SentenceAlignment.cpp"; sourceTree = "<group>"; };
1EE418E615C7FDCB0028F9AB /* SentenceAlignment.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = SentenceAlignment.h; path = "../../moses/src/fuzzy-match/SentenceAlignment.h"; sourceTree = "<group>"; };
1EE418E715C7FDCB0028F9AB /* SuffixArray.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = SuffixArray.cpp; path = "../../moses/src/fuzzy-match/SuffixArray.cpp"; sourceTree = "<group>"; };
1EE418E815C7FDCB0028F9AB /* SuffixArray.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = SuffixArray.h; path = "../../moses/src/fuzzy-match/SuffixArray.h"; sourceTree = "<group>"; };
1EE418E915C7FDCB0028F9AB /* FuzzyMatchWrapper.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = FuzzyMatchWrapper.cpp; path = "../../moses/src/fuzzy-match/FuzzyMatchWrapper.cpp"; sourceTree = "<group>"; };
1EE418EA15C7FDCB0028F9AB /* FuzzyMatchWrapper.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = FuzzyMatchWrapper.h; path = "../../moses/src/fuzzy-match/FuzzyMatchWrapper.h"; sourceTree = "<group>"; };
1EE418EB15C7FDCB0028F9AB /* Vocabulary.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Vocabulary.cpp; path = "../../moses/src/fuzzy-match/Vocabulary.cpp"; sourceTree = "<group>"; };
1EE418EC15C7FDCB0028F9AB /* Vocabulary.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = Vocabulary.h; path = "../../moses/src/fuzzy-match/Vocabulary.h"; sourceTree = "<group>"; };
1EF0709114B9EFCC0052152A /* ParallelBackoff.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ParallelBackoff.cpp; sourceTree = "<group>"; };
1EF0709214B9EFCC0052152A /* ParallelBackoff.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ParallelBackoff.h; sourceTree = "<group>"; };
1EF8F2C3159A61970047B613 /* HypoList.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = HypoList.h; path = ../../moses/src/HypoList.h; sourceTree = "<group>"; };
@ -621,8 +693,8 @@
08FB7795FE84155DC02AAC07 /* Source */ = {
isa = PBXGroup;
children = (
1E879EA515A346F90051F346 /* SearchNormalBatch.cpp */,
1E879EA615A346F90051F346 /* SearchNormalBatch.h */,
1E6D9FF015D027680064D436 /* CompactPT */,
1ECF13DE15C1A82400EA1DCE /* fuzzy-match */,
1EDA803514D19ECD003D2191 /* Scope3Parser */,
1EDA803414D19EB8003D2191 /* CYKPlusParser */,
1EC7365B14B977AA00238410 /* LM */,
@ -636,7 +708,6 @@
1EC735D814B977AA00238410 /* BilingualDynSuffixArray.h */,
1EC735D914B977AA00238410 /* BitmapContainer.cpp */,
1EC735DA14B977AA00238410 /* BitmapContainer.h */,
1EC735DB14B977AA00238410 /* CellCollection.h */,
1EC735DC14B977AA00238410 /* ChartCell.cpp */,
1EC735DD14B977AA00238410 /* ChartCell.h */,
1EC735DE14B977AA00238410 /* ChartCellCollection.cpp */,
@ -650,10 +721,8 @@
1EC735E614B977AA00238410 /* ChartManager.cpp */,
1EC735E714B977AA00238410 /* ChartManager.h */,
1EC735E914B977AA00238410 /* ChartRuleLookupManager.h */,
1EC735EE14B977AA00238410 /* ChartTranslationOption.cpp */,
1EC735EF14B977AA00238410 /* ChartTranslationOption.h */,
1EC735F014B977AA00238410 /* ChartTranslationOptionCollection.cpp */,
1EC735F114B977AA00238410 /* ChartTranslationOptionCollection.h */,
1E365EE816120F4600BA335B /* ChartTranslationOptions.cpp */,
1E365EE916120F4600BA335B /* ChartTranslationOptions.h */,
1EC735F214B977AA00238410 /* ChartTranslationOptionList.cpp */,
1EC735F314B977AA00238410 /* ChartTranslationOptionList.h */,
1EC735F414B977AA00238410 /* ChartTrellisDetour.cpp */,
@ -782,6 +851,8 @@
1EC736F414B977AB00238410 /* SearchCubePruning.h */,
1EC736F514B977AB00238410 /* SearchNormal.cpp */,
1EC736F614B977AB00238410 /* SearchNormal.h */,
1E879EA515A346F90051F346 /* SearchNormalBatch.cpp */,
1E879EA615A346F90051F346 /* SearchNormalBatch.h */,
1EC736F714B977AB00238410 /* Sentence.cpp */,
1EC736F814B977AB00238410 /* Sentence.h */,
1EC736F914B977AB00238410 /* SentenceStats.cpp */,
@ -845,6 +916,39 @@
name = Products;
sourceTree = "<group>";
};
1E6D9FF015D027680064D436 /* CompactPT */ = {
isa = PBXGroup;
children = (
1EC32DB615D2D90700A313B1 /* ThrowingFwrite.cpp */,
1EC32DB715D2D90700A313B1 /* ThrowingFwrite.h */,
1E6D9FBD15D027560064D436 /* BlockHashIndex.cpp */,
1E6D9FBE15D027560064D436 /* BlockHashIndex.h */,
1E6D9FBF15D027560064D436 /* CanonicalHuffman.h */,
1E6D9FC015D027560064D436 /* CmphStringVectorAdapter.cpp */,
1E6D9FC115D027560064D436 /* CmphStringVectorAdapter.h */,
1E6D9FC215D027560064D436 /* ConsistantPhrases.h */,
1E6D9FC415D027560064D436 /* LexicalReorderingTableCompact.cpp */,
1E6D9FC515D027560064D436 /* LexicalReorderingTableCompact.h */,
1E6D9FC615D027560064D436 /* LexicalReorderingTableCreator.cpp */,
1E6D9FC715D027560064D436 /* LexicalReorderingTableCreator.h */,
1E6D9FC815D027560064D436 /* ListCoders.h */,
1E6D9FC915D027560064D436 /* MmapAllocator.h */,
1E6D9FCA15D027560064D436 /* MonotonicVector.h */,
1E6D9FCB15D027560064D436 /* MurmurHash3.cpp */,
1E6D9FCC15D027560064D436 /* MurmurHash3.h */,
1E6D9FCD15D027560064D436 /* PackedArray.h */,
1E6D9FCE15D027560064D436 /* PhraseDecoder.cpp */,
1E6D9FCF15D027560064D436 /* PhraseDecoder.h */,
1E6D9FD015D027560064D436 /* PhraseDictionaryCompact.cpp */,
1E6D9FD115D027560064D436 /* PhraseDictionaryCompact.h */,
1E6D9FD215D027560064D436 /* PhraseTableCreator.cpp */,
1E6D9FD315D027560064D436 /* PhraseTableCreator.h */,
1E6D9FD415D027560064D436 /* StringVector.h */,
1E6D9FD515D027560064D436 /* TargetPhraseCollectionCache.h */,
);
name = CompactPT;
sourceTree = "<group>";
};
1EAC362B14CDC76200DF97C3 /* RuleTable */ = {
isa = PBXGroup;
children = (
@ -856,6 +960,8 @@
1EDA807D14D19FBF003D2191 /* PhraseDictionaryOnDisk.h */,
1EDA807E14D19FBF003D2191 /* PhraseDictionarySCFG.cpp */,
1EDA807F14D19FBF003D2191 /* PhraseDictionarySCFG.h */,
1E0BA41615B70E5F00AC70E1 /* PhraseDictionaryFuzzyMatch.cpp */,
1E0BA41715B70E5F00AC70E1 /* PhraseDictionaryFuzzyMatch.h */,
1EDA808014D19FBF003D2191 /* Trie.cpp */,
1EDA808114D19FBF003D2191 /* Trie.h */,
1EDA808214D19FBF003D2191 /* UTrie.cpp */,
@ -930,9 +1036,27 @@
path = ../../moses/src/LM;
sourceTree = "<group>";
};
1ECF13DE15C1A82400EA1DCE /* fuzzy-match */ = {
isa = PBXGroup;
children = (
1EE418E415C7FDCB0028F9AB /* Match.h */,
1EE418E515C7FDCB0028F9AB /* SentenceAlignment.cpp */,
1EE418E615C7FDCB0028F9AB /* SentenceAlignment.h */,
1EE418E715C7FDCB0028F9AB /* SuffixArray.cpp */,
1EE418E815C7FDCB0028F9AB /* SuffixArray.h */,
1EE418E915C7FDCB0028F9AB /* FuzzyMatchWrapper.cpp */,
1EE418EA15C7FDCB0028F9AB /* FuzzyMatchWrapper.h */,
1EE418EB15C7FDCB0028F9AB /* Vocabulary.cpp */,
1EE418EC15C7FDCB0028F9AB /* Vocabulary.h */,
);
name = "fuzzy-match";
sourceTree = "<group>";
};
1EDA803414D19EB8003D2191 /* CYKPlusParser */ = {
isa = PBXGroup;
children = (
1E619E9F15B8713600C2D7A7 /* ChartRuleLookupManagerMemoryPerSentence.cpp */,
1E619EA015B8713700C2D7A7 /* ChartRuleLookupManagerMemoryPerSentence.h */,
1EDA806214D19F12003D2191 /* ChartRuleLookupManagerCYKPlus.cpp */,
1EDA806314D19F12003D2191 /* ChartRuleLookupManagerCYKPlus.h */,
1EDA806414D19F12003D2191 /* ChartRuleLookupManagerMemory.cpp */,
@ -986,7 +1110,6 @@
1EC7374914B977AB00238410 /* AlignmentInfoCollection.h in Headers */,
1EC7374B14B977AB00238410 /* BilingualDynSuffixArray.h in Headers */,
1EC7374D14B977AB00238410 /* BitmapContainer.h in Headers */,
1EC7374E14B977AB00238410 /* CellCollection.h in Headers */,
1EC7375014B977AB00238410 /* ChartCell.h in Headers */,
1EC7375214B977AB00238410 /* ChartCellCollection.h in Headers */,
1EC7375314B977AB00238410 /* ChartCellLabel.h in Headers */,
@ -995,8 +1118,6 @@
1EC7375814B977AB00238410 /* ChartHypothesisCollection.h in Headers */,
1EC7375A14B977AB00238410 /* ChartManager.h in Headers */,
1EC7375C14B977AB00238410 /* ChartRuleLookupManager.h in Headers */,
1EC7376214B977AB00238410 /* ChartTranslationOption.h in Headers */,
1EC7376414B977AB00238410 /* ChartTranslationOptionCollection.h in Headers */,
1EC7376614B977AB00238410 /* ChartTranslationOptionList.h in Headers */,
1EC7376814B977AB00238410 /* ChartTrellisDetour.h in Headers */,
1EC7376A14B977AB00238410 /* ChartTrellisDetourQueue.h in Headers */,
@ -1143,6 +1264,31 @@
1EF8F2C4159A61970047B613 /* HypoList.h in Headers */,
1E879EA815A346F90051F346 /* SearchNormalBatch.h in Headers */,
1E1D824115AC29BB00FE42E9 /* FileHandler.h in Headers */,
1E0BA41915B70E5F00AC70E1 /* PhraseDictionaryFuzzyMatch.h in Headers */,
1E619EA215B8713700C2D7A7 /* ChartRuleLookupManagerMemoryPerSentence.h in Headers */,
1EE418ED15C7FDCB0028F9AB /* Match.h in Headers */,
1EE418EF15C7FDCB0028F9AB /* SentenceAlignment.h in Headers */,
1EE418F115C7FDCB0028F9AB /* SuffixArray.h in Headers */,
1EE418F315C7FDCB0028F9AB /* FuzzyMatchWrapper.h in Headers */,
1EE418F515C7FDCB0028F9AB /* Vocabulary.h in Headers */,
1E6D9FD715D027560064D436 /* BlockHashIndex.h in Headers */,
1E6D9FD815D027560064D436 /* CanonicalHuffman.h in Headers */,
1E6D9FDA15D027560064D436 /* CmphStringVectorAdapter.h in Headers */,
1E6D9FDB15D027560064D436 /* ConsistantPhrases.h in Headers */,
1E6D9FDE15D027560064D436 /* LexicalReorderingTableCompact.h in Headers */,
1E6D9FE015D027560064D436 /* LexicalReorderingTableCreator.h in Headers */,
1E6D9FE115D027560064D436 /* ListCoders.h in Headers */,
1E6D9FE215D027560064D436 /* MmapAllocator.h in Headers */,
1E6D9FE315D027560064D436 /* MonotonicVector.h in Headers */,
1E6D9FE515D027560064D436 /* MurmurHash3.h in Headers */,
1E6D9FE615D027560064D436 /* PackedArray.h in Headers */,
1E6D9FE815D027560064D436 /* PhraseDecoder.h in Headers */,
1E6D9FEA15D027560064D436 /* PhraseDictionaryCompact.h in Headers */,
1E6D9FEC15D027560064D436 /* PhraseTableCreator.h in Headers */,
1E6D9FED15D027560064D436 /* StringVector.h in Headers */,
1E6D9FEE15D027560064D436 /* TargetPhraseCollectionCache.h in Headers */,
1EC32DB915D2D90700A313B1 /* ThrowingFwrite.h in Headers */,
1E365EEB16120F4600BA335B /* ChartTranslationOptions.h in Headers */,
);
runOnlyForDeploymentPostprocessing = 0;
};
@ -1172,7 +1318,7 @@
08FB7793FE84155DC02AAC07 /* Project object */ = {
isa = PBXProject;
attributes = {
LastUpgradeCheck = 0410;
LastUpgradeCheck = 0420;
};
buildConfigurationList = 1DEB91EF08733DB70010E9CD /* Build configuration list for PBXProject "moses" */;
compatibilityVersion = "Xcode 3.2";
@ -1207,8 +1353,6 @@
1EC7375514B977AB00238410 /* ChartHypothesis.cpp in Sources */,
1EC7375714B977AB00238410 /* ChartHypothesisCollection.cpp in Sources */,
1EC7375914B977AB00238410 /* ChartManager.cpp in Sources */,
1EC7376114B977AB00238410 /* ChartTranslationOption.cpp in Sources */,
1EC7376314B977AB00238410 /* ChartTranslationOptionCollection.cpp in Sources */,
1EC7376514B977AB00238410 /* ChartTranslationOptionList.cpp in Sources */,
1EC7376714B977AB00238410 /* ChartTrellisDetour.cpp in Sources */,
1EC7376914B977AB00238410 /* ChartTrellisDetourQueue.cpp in Sources */,
@ -1328,6 +1472,22 @@
1EDA809214D19FBF003D2191 /* UTrieNode.cpp in Sources */,
1E879EA715A346F90051F346 /* SearchNormalBatch.cpp in Sources */,
1E1D824015AC29BB00FE42E9 /* FileHandler.cpp in Sources */,
1E0BA41815B70E5F00AC70E1 /* PhraseDictionaryFuzzyMatch.cpp in Sources */,
1E619EA115B8713700C2D7A7 /* ChartRuleLookupManagerMemoryPerSentence.cpp in Sources */,
1EE418EE15C7FDCB0028F9AB /* SentenceAlignment.cpp in Sources */,
1EE418F015C7FDCB0028F9AB /* SuffixArray.cpp in Sources */,
1EE418F215C7FDCB0028F9AB /* FuzzyMatchWrapper.cpp in Sources */,
1EE418F415C7FDCB0028F9AB /* Vocabulary.cpp in Sources */,
1E6D9FD615D027560064D436 /* BlockHashIndex.cpp in Sources */,
1E6D9FD915D027560064D436 /* CmphStringVectorAdapter.cpp in Sources */,
1E6D9FDD15D027560064D436 /* LexicalReorderingTableCompact.cpp in Sources */,
1E6D9FDF15D027560064D436 /* LexicalReorderingTableCreator.cpp in Sources */,
1E6D9FE415D027560064D436 /* MurmurHash3.cpp in Sources */,
1E6D9FE715D027560064D436 /* PhraseDecoder.cpp in Sources */,
1E6D9FE915D027560064D436 /* PhraseDictionaryCompact.cpp in Sources */,
1E6D9FEB15D027560064D436 /* PhraseTableCreator.cpp in Sources */,
1EC32DB815D2D90700A313B1 /* ThrowingFwrite.cpp in Sources */,
1E365EEA16120F4600BA335B /* ChartTranslationOptions.cpp in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
@ -1338,6 +1498,7 @@
isa = XCBuildConfiguration;
buildSettings = {
ALWAYS_SEARCH_USER_PATHS = NO;
ARCHS = "$(ARCHS_STANDARD_64_BIT)";
COPY_PHASE_STRIP = NO;
GCC_DYNAMIC_NO_PIC = NO;
GCC_MODEL_TUNING = G5;
@ -1352,6 +1513,9 @@
"_FILE_OFFSET_BITS=64",
_LARGE_FILES,
WITH_THREADS,
IS_XCODE,
HAVE_CMPH,
"KENLM_MAX_ORDER=7",
);
HEADER_SEARCH_PATHS = (
../..,
@ -1376,6 +1540,7 @@
"\"$(SRCROOT)/../../moses/src/Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi\"",
);
PRODUCT_NAME = moses;
USER_HEADER_SEARCH_PATHS = "../.. ../../moses/src ../../irstlm/include ../../srilm/include ../../kenlm ../../randlm/include /opt/local/include ../../synlm/hhmm/wsjparse/include ../../synlm/hhmm/rvtl/include/ ../.. ../../cmph/include";
};
name = Debug;
};
@ -1383,6 +1548,7 @@
isa = XCBuildConfiguration;
buildSettings = {
ALWAYS_SEARCH_USER_PATHS = NO;
ARCHS = "$(ARCHS_STANDARD_64_BIT)";
DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
GCC_MODEL_TUNING = G5;
GCC_PREPROCESSOR_DEFINITIONS = (
@ -1395,6 +1561,9 @@
"_FILE_OFFSET_BITS=64",
_LARGE_FILES,
WITH_THREADS,
IS_XCODE,
HAVE_CMPH,
"KENLM_MAX_ORDER=7",
);
HEADER_SEARCH_PATHS = (
../..,
@ -1419,6 +1588,7 @@
"\"$(SRCROOT)/../../moses/src/Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi\"",
);
PRODUCT_NAME = moses;
USER_HEADER_SEARCH_PATHS = "../.. ../../moses/src ../../irstlm/include ../../srilm/include ../../kenlm ../../randlm/include /opt/local/include ../../synlm/hhmm/wsjparse/include ../../synlm/hhmm/rvtl/include/ ../.. ../../cmph/include";
};
name = Release;
};

View File

@ -3,8 +3,8 @@
<cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
<storageModule moduleId="org.eclipse.cdt.core.settings">
<cconfiguration id="cdt.managedbuild.config.gnu.macosx.exe.debug.1895695426">
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.macosx.exe.debug.1895695426" moduleId="org.eclipse.cdt.core.settings" name="Debug">
<cconfiguration id="cdt.managedbuild.config.gnu.exe.debug.656913512">
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.656913512" moduleId="org.eclipse.cdt.core.settings" name="Debug">
<externalSettings>
<externalSetting>
<entry flags="VALUE_WORKSPACE_PATH" kind="includePath" name="/moses"/>
@ -13,7 +13,7 @@
</externalSetting>
</externalSettings>
<extensions>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
@ -21,65 +21,70 @@
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<configuration artifactExtension="a" artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.staticLib" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.staticLib" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.macosx.exe.debug.1895695426" name="Debug" parent="cdt.managedbuild.config.gnu.macosx.exe.debug">
<folderInfo id="cdt.managedbuild.config.gnu.macosx.exe.debug.1895695426." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.macosx.exe.debug.497902212" name="MacOSX GCC" superClass="cdt.managedbuild.toolchain.gnu.macosx.exe.debug">
<targetPlatform id="cdt.managedbuild.target.gnu.platform.macosx.exe.debug.1820609450" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.macosx.exe.debug"/>
<builder buildPath="${workspace_loc:/moses/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.1998579330" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/>
<tool id="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug.1330311562" name="MacOS X C Linker" superClass="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug"/>
<tool id="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug.1226580551" name="MacOS X C++ Linker" superClass="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug">
<inputType id="cdt.managedbuild.tool.macosx.cpp.linker.input.102127808" superClass="cdt.managedbuild.tool.macosx.cpp.linker.input">
<configuration artifactExtension="a" artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.staticLib" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.staticLib" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.exe.debug.656913512" name="Debug" parent="cdt.managedbuild.config.gnu.exe.debug">
<folderInfo id="cdt.managedbuild.config.gnu.exe.debug.656913512." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1793369992" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
<targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.debug.1051650049" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
<builder buildPath="${workspace_loc:/moses/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.505583888" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<tool id="cdt.managedbuild.tool.gnu.archiver.base.1976472988" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1774992327" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
<option id="gnu.cpp.compiler.exe.debug.option.optimization.level.1759650532" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
<option id="gnu.cpp.compiler.exe.debug.option.debugging.level.2123672332" name="Debug Level" superClass="gnu.cpp.compiler.exe.debug.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.include.paths.57896781" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="/opt/local/include/"/>
<listOptionValue builtIn="false" value="${workspace_loc}/../../irstlm/include"/>
<listOptionValue builtIn="false" value="${workspace_loc}/../../srilm/include"/>
<listOptionValue builtIn="false" value="${workspace_loc}/../../moses/src"/>
<listOptionValue builtIn="false" value="${workspace_loc}/../../"/>
</option>
<option id="gnu.cpp.compiler.option.preprocessor.def.752586397" name="Defined symbols (-D)" superClass="gnu.cpp.compiler.option.preprocessor.def" valueType="definedSymbols">
<listOptionValue builtIn="false" value="KENLM_MAX_ORDER=7"/>
<listOptionValue builtIn="false" value="TRACE_ENABLE"/>
<listOptionValue builtIn="false" value="LM_IRST"/>
<listOptionValue builtIn="false" value="_FILE_OFFSET_BIT=64"/>
<listOptionValue builtIn="false" value="_LARGE_FILES"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1905116220" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.debug.2126314903" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.debug">
<option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.exe.debug.option.optimization.level.1524900118" name="Optimization Level" superClass="gnu.c.compiler.exe.debug.option.optimization.level" valueType="enumerated"/>
<option id="gnu.c.compiler.exe.debug.option.debugging.level.581728958" name="Debug Level" superClass="gnu.c.compiler.exe.debug.option.debugging.level" value="gnu.c.debugging.level.max" valueType="enumerated"/>
<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.877210753" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.c.linker.exe.debug.1168585173" name="GCC C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.exe.debug"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.2074660557" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug">
<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.340054018" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
<tool command="as" commandLinePattern="${COMMAND} ${FLAGS} ${OUTPUT_FLAG} ${OUTPUT_PREFIX}${OUTPUT} ${INPUTS}" id="cdt.managedbuild.tool.gnu.assembler.macosx.exe.debug.1556759720" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.macosx.exe.debug">
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.897776351" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.archiver.macosx.base.1820797229" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.macosx.base"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.macosx.exe.debug.1867588805" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.macosx.exe.debug">
<option id="gnu.cpp.compilermacosx.exe.debug.option.optimization.level.1898625650" name="Optimization Level" superClass="gnu.cpp.compilermacosx.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
<option id="gnu.cpp.compiler.macosx.exe.debug.option.debugging.level.806998992" name="Debug Level" superClass="gnu.cpp.compiler.macosx.exe.debug.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.include.paths.1819917957" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="/opt/local/include"/>
<listOptionValue builtIn="false" value="/Users/hieuhoang/unison/workspace/github/moses-smt/moses/src"/>
<listOptionValue builtIn="false" value="/Users/hieuhoang/unison/workspace/github/moses-smt"/>
<listOptionValue builtIn="false" value="/Users/hieuhoang/unison/workspace/github/moses-smt/srilm/include"/>
<listOptionValue builtIn="false" value="/Users/hieuhoang/unison/workspace/github/moses-smt/irstlm/include"/>
</option>
<option id="gnu.cpp.compiler.option.preprocessor.def.1569452418" name="Defined symbols (-D)" superClass="gnu.cpp.compiler.option.preprocessor.def" valueType="definedSymbols">
<listOptionValue builtIn="false" value="LM_SRI"/>
<listOptionValue builtIn="false" value="LM_IRST"/>
<listOptionValue builtIn="false" value="TRACE_ENABLE"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1110302565" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.c.compiler.macosx.exe.debug.401409202" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.macosx.exe.debug">
<option defaultValue="gnu.c.optimization.level.none" id="gnu.c.compiler.macosx.exe.debug.option.optimization.level.753046525" name="Optimization Level" superClass="gnu.c.compiler.macosx.exe.debug.option.optimization.level" valueType="enumerated"/>
<option id="gnu.c.compiler.macosx.exe.debug.option.debugging.level.1396911098" name="Debug Level" superClass="gnu.c.compiler.macosx.exe.debug.option.debugging.level" value="gnu.c.debugging.level.max" valueType="enumerated"/>
<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.1919272901" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
<tool id="cdt.managedbuild.tool.gnu.assembler.exe.debug.933467113" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.exe.debug">
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.99047750" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
<fileInfo id="cdt.managedbuild.config.gnu.macosx.exe.debug.1895695426.1722029461" name="SyntacticLanguageModelState.h" rcbsApplicability="disable" resourcePath="SyntacticLanguageModelState.h" toolsToInvoke=""/>
<fileInfo id="cdt.managedbuild.config.gnu.macosx.exe.debug.1895695426.1432960145" name="SyntacticLanguageModelFiles.h" rcbsApplicability="disable" resourcePath="SyntacticLanguageModelFiles.h" toolsToInvoke=""/>
<fileInfo id="cdt.managedbuild.config.gnu.macosx.exe.debug.1895695426.1906856645" name="SyntacticLanguageModel.h" rcbsApplicability="disable" resourcePath="SyntacticLanguageModel.h" toolsToInvoke=""/>
<fileInfo id="cdt.managedbuild.config.gnu.macosx.exe.debug.1895695426.460380900" name="Rand.h" rcbsApplicability="disable" resourcePath="LM/Rand.h" toolsToInvoke=""/>
<fileInfo id="cdt.managedbuild.config.gnu.macosx.exe.debug.1895695426.1692203139" name="ORLM.h" rcbsApplicability="disable" resourcePath="LM/ORLM.h" toolsToInvoke=""/>
<fileInfo id="cdt.managedbuild.config.gnu.macosx.exe.debug.1895695426.538301588" name="Remote.h" rcbsApplicability="disable" resourcePath="LM/Remote.h" toolsToInvoke=""/>
<fileInfo id="cdt.managedbuild.config.gnu.macosx.exe.debug.1895695426.854427429" name="LDHT.h" rcbsApplicability="disable" resourcePath="LM/LDHT.h" toolsToInvoke=""/>
<fileInfo id="cdt.managedbuild.config.gnu.exe.debug.656913512.558758254" name="SyntacticLanguageModelState.h" rcbsApplicability="disable" resourcePath="SyntacticLanguageModelState.h" toolsToInvoke=""/>
<fileInfo id="cdt.managedbuild.config.gnu.exe.debug.656913512.1930327037" name="SyntacticLanguageModelFiles.h" rcbsApplicability="disable" resourcePath="SyntacticLanguageModelFiles.h" toolsToInvoke=""/>
<fileInfo id="cdt.managedbuild.config.gnu.exe.debug.656913512.1751563578" name="PhraseTableCreator.cpp" rcbsApplicability="disable" resourcePath="CompactPT/PhraseTableCreator.cpp" toolsToInvoke="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1774992327.1652631861">
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1774992327.1652631861" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1774992327"/>
</fileInfo>
<fileInfo id="cdt.managedbuild.config.gnu.exe.debug.656913512.1174630266" name="Rand.h" rcbsApplicability="disable" resourcePath="LM/Rand.h" toolsToInvoke=""/>
<fileInfo id="cdt.managedbuild.config.gnu.exe.debug.656913512.707830535" name="SRI.h" rcbsApplicability="disable" resourcePath="LM/SRI.h" toolsToInvoke=""/>
<fileInfo id="cdt.managedbuild.config.gnu.exe.debug.656913512.160366559" name="LDHT.h" rcbsApplicability="disable" resourcePath="LM/LDHT.h" toolsToInvoke=""/>
<fileInfo id="cdt.managedbuild.config.gnu.exe.debug.656913512.622077510" name="ParallelBackoff.h" rcbsApplicability="disable" resourcePath="LM/ParallelBackoff.h" toolsToInvoke=""/>
<fileInfo id="cdt.managedbuild.config.gnu.exe.debug.656913512.1084194539" name="SyntacticLanguageModel.h" rcbsApplicability="disable" resourcePath="SyntacticLanguageModel.h" toolsToInvoke=""/>
<sourceEntries>
<entry excluding="SyntacticLanguageModelState.h|SyntacticLanguageModelFiles.h|SyntacticLanguageModel.h|SyntacticLanguageModel.cpp|LM/LDHT.cpp|LM/LDHT.h|LM/Remote.h|LM/Remote.cpp|LM/Rand.h|LM/Rand.cpp|LM/ORLM.h|LM/ORLM.cpp" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/>
<entry excluding="CompactPT/PhraseTableCreator.cpp|CompactPT/LexicalReorderingTableCreator.cpp|LM/SRI.h|LM/SRI.cpp|SyntacticLanguageModelState.h|SyntacticLanguageModelFiles.h|SyntacticLanguageModel.h|SyntacticLanguageModel.cpp|LM/ParallelBackoff.h|LM/ParallelBackoff.cpp|LM/Rand.h|LM/Rand.cpp|LM/LDHT.h|LM/LDHT.cpp" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/>
</sourceEntries>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
</cconfiguration>
<cconfiguration id="cdt.managedbuild.config.macosx.exe.release.722580523">
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.macosx.exe.release.722580523" moduleId="org.eclipse.cdt.core.settings" name="Release">
<cconfiguration id="cdt.managedbuild.config.gnu.exe.release.401150096">
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.release.401150096" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
@ -88,59 +93,41 @@
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.macosx.exe.release.722580523" name="Release" parent="cdt.managedbuild.config.macosx.exe.release">
<folderInfo id="cdt.managedbuild.config.macosx.exe.release.722580523." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.macosx.exe.release.2070671582" name="MacOSX GCC" superClass="cdt.managedbuild.toolchain.gnu.macosx.exe.release">
<targetPlatform id="cdt.managedbuild.target.gnu.platform.macosx.exe.release.503591386" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.macosx.exe.release"/>
<builder buildPath="${workspace_loc:/moses/Release}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.release.108117223" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.release"/>
<tool id="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.release.1203406445" name="MacOS X C Linker" superClass="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.release"/>
<tool id="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.release.1539915639" name="MacOS X C++ Linker" superClass="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.release">
<inputType id="cdt.managedbuild.tool.macosx.cpp.linker.input.1333560300" superClass="cdt.managedbuild.tool.macosx.cpp.linker.input">
<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.exe.release.401150096" name="Release" parent="cdt.managedbuild.config.gnu.exe.release">
<folderInfo id="cdt.managedbuild.config.gnu.exe.release.401150096." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.exe.release.36295137" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.release">
<targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.release.538725710" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.release"/>
<builder buildPath="${workspace_loc:/moses/Release}" id="cdt.managedbuild.target.gnu.builder.exe.release.1875953334" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.release"/>
<tool id="cdt.managedbuild.tool.gnu.archiver.base.1633496039" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.release.2060881562" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.release">
<option id="gnu.cpp.compiler.exe.release.option.optimization.level.1375372870" name="Optimization Level" superClass="gnu.cpp.compiler.exe.release.option.optimization.level" value="gnu.cpp.compiler.optimization.level.most" valueType="enumerated"/>
<option id="gnu.cpp.compiler.exe.release.option.debugging.level.815283803" name="Debug Level" superClass="gnu.cpp.compiler.exe.release.option.debugging.level" value="gnu.cpp.compiler.debugging.level.none" valueType="enumerated"/>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1020483420" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.release.85324871" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.release">
<option defaultValue="gnu.c.optimization.level.most" id="gnu.c.compiler.exe.release.option.optimization.level.1137534635" name="Optimization Level" superClass="gnu.c.compiler.exe.release.option.optimization.level" valueType="enumerated"/>
<option id="gnu.c.compiler.exe.release.option.debugging.level.143589037" name="Debug Level" superClass="gnu.c.compiler.exe.release.option.debugging.level" value="gnu.c.debugging.level.none" valueType="enumerated"/>
<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.304912704" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.c.linker.exe.release.283583965" name="GCC C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.exe.release"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.release.2059280959" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.release">
<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.2020956494" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
<tool id="cdt.managedbuild.tool.gnu.assembler.macosx.exe.release.1693865756" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.macosx.exe.release">
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.2000339940" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.archiver.macosx.base.505919286" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.macosx.base"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.macosx.exe.release.1662892925" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.macosx.exe.release">
<option id="gnu.cpp.compiler.macosx.exe.release.option.optimization.level.1036481202" name="Optimization Level" superClass="gnu.cpp.compiler.macosx.exe.release.option.optimization.level" value="gnu.cpp.compiler.optimization.level.most" valueType="enumerated"/>
<option id="gnu.cpp.compiler.macosx.exe.release.option.debugging.level.484015287" name="Debug Level" superClass="gnu.cpp.compiler.macosx.exe.release.option.debugging.level" value="gnu.cpp.compiler.debugging.level.none" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.preprocessor.def.1089615214" name="Defined symbols (-D)" superClass="gnu.cpp.compiler.option.preprocessor.def" valueType="definedSymbols">
<listOptionValue builtIn="false" value="LM_SRI"/>
<listOptionValue builtIn="false" value="LM_IRST"/>
<listOptionValue builtIn="false" value="TRACE_ENABLE"/>
</option>
<option id="gnu.cpp.compiler.option.include.paths.1722702487" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="/opt/local/include"/>
<listOptionValue builtIn="false" value="/Users/hieuhoang/unison/workspace/github/moses-smt/moses/src"/>
<listOptionValue builtIn="false" value="/Users/hieuhoang/unison/workspace/github/moses-smt"/>
<listOptionValue builtIn="false" value="/Users/hieuhoang/unison/workspace/github/moses-smt/srilm/include"/>
<listOptionValue builtIn="false" value="/Users/hieuhoang/unison/workspace/github/moses-smt/irstlm/include"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.936283391" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.c.compiler.macosx.exe.release.1404156839" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.macosx.exe.release">
<option defaultValue="gnu.c.optimization.level.most" id="gnu.c.compiler.macosx.exe.release.option.optimization.level.1487222992" name="Optimization Level" superClass="gnu.c.compiler.macosx.exe.release.option.optimization.level" valueType="enumerated"/>
<option id="gnu.c.compiler.macosx.exe.release.option.debugging.level.1171203697" name="Debug Level" superClass="gnu.c.compiler.macosx.exe.release.option.debugging.level" value="gnu.c.debugging.level.none" valueType="enumerated"/>
<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.1172147378" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
<tool id="cdt.managedbuild.tool.gnu.assembler.exe.release.782286837" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.exe.release">
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.1766138143" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
<fileInfo id="cdt.managedbuild.config.macosx.exe.release.722580523.1831545277" name="Rand.h" rcbsApplicability="disable" resourcePath="LM/Rand.h" toolsToInvoke=""/>
<fileInfo id="cdt.managedbuild.config.macosx.exe.release.722580523.1743378025" name="ORLM.h" rcbsApplicability="disable" resourcePath="LM/ORLM.h" toolsToInvoke=""/>
<fileInfo id="cdt.managedbuild.config.macosx.exe.release.722580523.1490362543" name="Remote.h" rcbsApplicability="disable" resourcePath="LM/Remote.h" toolsToInvoke=""/>
<sourceEntries>
<entry excluding="LM/LDHT.cpp|LM/Rand.h|LM/Rand.cpp|LM/ORLM.h|LM/ORLM.cpp" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/>
</sourceEntries>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
</cconfiguration>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<project id="moses.cdt.managedbuild.target.macosx.exe.1209017164" name="Executable" projectType="cdt.managedbuild.target.macosx.exe"/>
<project id="moses.cdt.managedbuild.target.gnu.exe.1375079569" name="Executable" projectType="cdt.managedbuild.target.gnu.exe"/>
</storageModule>
<storageModule moduleId="scannerConfiguration">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
@ -150,12 +137,24 @@
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.macosx.exe.release.722580523;cdt.managedbuild.config.macosx.exe.release.722580523.;cdt.managedbuild.tool.gnu.c.compiler.macosx.exe.release.1404156839;cdt.managedbuild.tool.gnu.c.compiler.input.1172147378">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.release.401150096;cdt.managedbuild.config.gnu.exe.release.401150096.;cdt.managedbuild.tool.gnu.c.compiler.exe.release.85324871;cdt.managedbuild.tool.gnu.c.compiler.input.304912704">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.debug.656913512;cdt.managedbuild.config.gnu.exe.debug.656913512.;cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1774992327;cdt.managedbuild.tool.gnu.cpp.compiler.input.1905116220">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.macosx.exe.debug.1895695426;cdt.managedbuild.config.gnu.macosx.exe.debug.1895695426.;cdt.managedbuild.tool.gnu.cpp.compiler.macosx.exe.debug.1867588805;cdt.managedbuild.tool.gnu.cpp.compiler.input.1110302565">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.macosx.exe.release.722580523;cdt.managedbuild.config.macosx.exe.release.722580523.;cdt.managedbuild.tool.gnu.cpp.compiler.macosx.exe.release.1662892925;cdt.managedbuild.tool.gnu.cpp.compiler.input.936283391">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.debug.656913512;cdt.managedbuild.config.gnu.exe.debug.656913512.;cdt.managedbuild.tool.gnu.c.compiler.exe.debug.2126314903;cdt.managedbuild.tool.gnu.c.compiler.input.877210753">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.release.401150096;cdt.managedbuild.config.gnu.exe.release.401150096.;cdt.managedbuild.tool.gnu.cpp.compiler.exe.release.2060881562;cdt.managedbuild.tool.gnu.cpp.compiler.input.1020483420">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="refreshScope" versionNumber="1">
<resource resourceType="PROJECT" workspacePath="/moses"/>

View File

@ -101,6 +101,16 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/AlignmentInfoCollection.h</locationURI>
</link>
<link>
<name>ApplicableRuleTrie.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/ApplicableRuleTrie.cpp</locationURI>
</link>
<link>
<name>ApplicableRuleTrie.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/ApplicableRuleTrie.h</locationURI>
</link>
<link>
<name>BilingualDynSuffixArray.cpp</name>
<type>1</type>
@ -271,6 +281,11 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/ChartTrellisPathList.h</locationURI>
</link>
<link>
<name>CompactPT</name>
<type>2</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/CompactPT</locationURI>
</link>
<link>
<name>ConfusionNet.cpp</name>
<type>1</type>
@ -441,6 +456,16 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/FloydWarshall.h</locationURI>
</link>
<link>
<name>FuzzyMatchWrapper.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/fuzzy-match/FuzzyMatchWrapper.cpp</locationURI>
</link>
<link>
<name>FuzzyMatchWrapper.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/fuzzy-match/FuzzyMatchWrapper.h</locationURI>
</link>
<link>
<name>GenerationDictionary.cpp</name>
<type>1</type>
@ -536,6 +561,11 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/InputType.h</locationURI>
</link>
<link>
<name>IntermediateVarSpanNode.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/IntermediateVarSpanNode.h</locationURI>
</link>
<link>
<name>Jamfile</name>
<type>1</type>
@ -606,6 +636,11 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/Manager.h</locationURI>
</link>
<link>
<name>Match.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/fuzzy-match/Match.h</locationURI>
</link>
<link>
<name>NonTerminal.cpp</name>
<type>1</type>
@ -661,6 +696,16 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/Parameter.h</locationURI>
</link>
<link>
<name>Parser.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/Parser.cpp</locationURI>
</link>
<link>
<name>Parser.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/Parser.h</locationURI>
</link>
<link>
<name>PartialTranslOptColl.cpp</name>
<type>1</type>
@ -809,7 +854,7 @@
<link>
<name>RuleTable</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable</locationURI>
</link>
<link>
<name>SRI.lo</name>
@ -821,11 +866,6 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/SRI.o</locationURI>
</link>
<link>
<name>Scope3Parser</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>ScoreComponentCollection.cpp</name>
<type>1</type>
@ -886,6 +926,16 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/SearchNormal.h</locationURI>
</link>
<link>
<name>SearchNormalBatch.cpp</name>
<type>1</type>
<locationURI>PARENT-1-ECLIPSE_HOME/workspace/github/hieuhoang/moses/src/SearchNormalBatch.cpp</locationURI>
</link>
<link>
<name>SearchNormalBatch.h</name>
<type>1</type>
<locationURI>PARENT-1-ECLIPSE_HOME/workspace/github/hieuhoang/moses/src/SearchNormalBatch.h</locationURI>
</link>
<link>
<name>Sentence.cpp</name>
<type>1</type>
@ -896,6 +946,21 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/Sentence.h</locationURI>
</link>
<link>
<name>SentenceAlignment.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/fuzzy-match/SentenceAlignment.cpp</locationURI>
</link>
<link>
<name>SentenceAlignment.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/fuzzy-match/SentenceAlignment.h</locationURI>
</link>
<link>
<name>SentenceMap.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/SentenceMap.h</locationURI>
</link>
<link>
<name>SentenceStats.cpp</name>
<type>1</type>
@ -916,6 +981,26 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/SquareMatrix.h</locationURI>
</link>
<link>
<name>StackLattice.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/StackLattice.h</locationURI>
</link>
<link>
<name>StackLatticeBuilder.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/StackLatticeBuilder.cpp</locationURI>
</link>
<link>
<name>StackLatticeBuilder.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/StackLatticeBuilder.h</locationURI>
</link>
<link>
<name>StackLatticeSearcher.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/StackLatticeSearcher.h</locationURI>
</link>
<link>
<name>StackVec.h</name>
<type>1</type>
@ -941,6 +1026,16 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/StaticData.o</locationURI>
</link>
<link>
<name>SuffixArray.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/fuzzy-match/SuffixArray.cpp</locationURI>
</link>
<link>
<name>SuffixArray.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/fuzzy-match/SuffixArray.h</locationURI>
</link>
<link>
<name>SyntacticLanguageModel.cpp</name>
<type>1</type>
@ -1181,6 +1276,31 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/Util.o</locationURI>
</link>
<link>
<name>VarSpanNode.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/VarSpanNode.h</locationURI>
</link>
<link>
<name>VarSpanTrieBuilder.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/VarSpanTrieBuilder.cpp</locationURI>
</link>
<link>
<name>VarSpanTrieBuilder.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/VarSpanTrieBuilder.h</locationURI>
</link>
<link>
<name>Vocabulary.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/fuzzy-match/Vocabulary.cpp</locationURI>
</link>
<link>
<name>Vocabulary.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/fuzzy-match/Vocabulary.h</locationURI>
</link>
<link>
<name>Word.cpp</name>
<type>1</type>
@ -1336,6 +1456,16 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/CYKPlusParser/ChartRuleLookupManagerMemory.h</locationURI>
</link>
<link>
<name>CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.cpp</locationURI>
</link>
<link>
<name>CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.h</locationURI>
</link>
<link>
<name>CYKPlusParser/ChartRuleLookupManagerOnDisk.cpp</name>
<type>1</type>
@ -1381,6 +1511,16 @@
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>DynSAInclude/FileHandler.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/DynSAInclude/FileHandler.cpp</locationURI>
</link>
<link>
<name>DynSAInclude/FileHandler.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/DynSAInclude/FileHandler.h</locationURI>
</link>
<link>
<name>DynSAInclude/Jamfile</name>
<type>1</type>
@ -1396,26 +1536,11 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/DynSAInclude/RandLMFilter.h</locationURI>
</link>
<link>
<name>DynSAInclude/bin</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>DynSAInclude/fdstream.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/DynSAInclude/fdstream.h</locationURI>
</link>
<link>
<name>DynSAInclude/file.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/DynSAInclude/file.cpp</locationURI>
</link>
<link>
<name>DynSAInclude/file.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/DynSAInclude/file.h</locationURI>
</link>
<link>
<name>DynSAInclude/hash.h</name>
<type>1</type>
@ -1616,211 +1741,16 @@
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>RuleTable/Jamfile</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/Jamfile</locationURI>
</link>
<link>
<name>RuleTable/Loader.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/Loader.h</locationURI>
</link>
<link>
<name>RuleTable/LoaderCompact.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/LoaderCompact.cpp</locationURI>
</link>
<link>
<name>RuleTable/LoaderCompact.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/LoaderCompact.h</locationURI>
</link>
<link>
<name>RuleTable/LoaderFactory.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/LoaderFactory.cpp</locationURI>
</link>
<link>
<name>RuleTable/LoaderFactory.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/LoaderFactory.h</locationURI>
</link>
<link>
<name>RuleTable/LoaderHiero.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/LoaderHiero.cpp</locationURI>
</link>
<link>
<name>RuleTable/LoaderHiero.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/LoaderHiero.h</locationURI>
</link>
<link>
<name>RuleTable/LoaderStandard.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/LoaderStandard.cpp</locationURI>
</link>
<link>
<name>RuleTable/LoaderStandard.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/LoaderStandard.h</locationURI>
</link>
<link>
<name>RuleTable/PhraseDictionaryALSuffixArray.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/PhraseDictionaryALSuffixArray.cpp</locationURI>
</link>
<link>
<name>RuleTable/PhraseDictionaryALSuffixArray.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/PhraseDictionaryALSuffixArray.h</locationURI>
</link>
<link>
<name>RuleTable/PhraseDictionaryNodeSCFG.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/PhraseDictionaryNodeSCFG.cpp</locationURI>
</link>
<link>
<name>RuleTable/PhraseDictionaryNodeSCFG.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/PhraseDictionaryNodeSCFG.h</locationURI>
</link>
<link>
<name>RuleTable/PhraseDictionaryOnDisk.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/PhraseDictionaryOnDisk.cpp</locationURI>
</link>
<link>
<name>RuleTable/PhraseDictionaryOnDisk.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/PhraseDictionaryOnDisk.h</locationURI>
</link>
<link>
<name>RuleTable/PhraseDictionarySCFG.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/PhraseDictionarySCFG.cpp</locationURI>
</link>
<link>
<name>RuleTable/PhraseDictionarySCFG.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/PhraseDictionarySCFG.h</locationURI>
</link>
<link>
<name>RuleTable/Trie.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/Trie.cpp</locationURI>
</link>
<link>
<name>RuleTable/Trie.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/Trie.h</locationURI>
</link>
<link>
<name>RuleTable/UTrie.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/UTrie.cpp</locationURI>
</link>
<link>
<name>RuleTable/UTrie.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/UTrie.h</locationURI>
</link>
<link>
<name>RuleTable/UTrieNode.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/UTrieNode.cpp</locationURI>
</link>
<link>
<name>RuleTable/UTrieNode.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/UTrieNode.h</locationURI>
</link>
<link>
<name>RuleTable/bin</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>Scope3Parser/ApplicableRuleTrie.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/ApplicableRuleTrie.cpp</locationURI>
</link>
<link>
<name>Scope3Parser/ApplicableRuleTrie.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/ApplicableRuleTrie.h</locationURI>
</link>
<link>
<name>Scope3Parser/IntermediateVarSpanNode.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/IntermediateVarSpanNode.h</locationURI>
</link>
<link>
<name>Scope3Parser/Jamfile</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/Jamfile</locationURI>
</link>
<link>
<name>Scope3Parser/Parser.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/Parser.cpp</locationURI>
</link>
<link>
<name>Scope3Parser/Parser.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/Parser.h</locationURI>
</link>
<link>
<name>Scope3Parser/SentenceMap.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/SentenceMap.h</locationURI>
</link>
<link>
<name>Scope3Parser/StackLattice.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/StackLattice.h</locationURI>
</link>
<link>
<name>Scope3Parser/StackLatticeBuilder.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/StackLatticeBuilder.cpp</locationURI>
</link>
<link>
<name>Scope3Parser/StackLatticeBuilder.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/StackLatticeBuilder.h</locationURI>
</link>
<link>
<name>Scope3Parser/StackLatticeSearcher.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/StackLatticeSearcher.h</locationURI>
</link>
<link>
<name>Scope3Parser/VarSpanNode.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/VarSpanNode.h</locationURI>
</link>
<link>
<name>Scope3Parser/VarSpanTrieBuilder.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/VarSpanTrieBuilder.cpp</locationURI>
</link>
<link>
<name>Scope3Parser/VarSpanTrieBuilder.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/VarSpanTrieBuilder.h</locationURI>
</link>
<link>
<name>Scope3Parser/bin</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>bin/darwin-4.2.1</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>bin/gcc-4.6</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>CYKPlusParser/bin/clang-darwin-4.2.1</name>
<type>2</type>
@ -1832,12 +1762,7 @@
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>DynSAInclude/bin/clang-darwin-4.2.1</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>DynSAInclude/bin/darwin-4.2.1</name>
<name>CYKPlusParser/bin/gcc-4.6</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
@ -1856,21 +1781,16 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/LM/bin/lm.log</locationURI>
</link>
<link>
<name>RuleTable/bin/darwin-4.2.1</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>Scope3Parser/bin/darwin-4.2.1</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>bin/darwin-4.2.1/release</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>bin/gcc-4.6/release</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>CYKPlusParser/bin/clang-darwin-4.2.1/release</name>
<type>2</type>
@ -1882,12 +1802,7 @@
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>DynSAInclude/bin/clang-darwin-4.2.1/release</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>DynSAInclude/bin/darwin-4.2.1/release</name>
<name>CYKPlusParser/bin/gcc-4.6/release</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
@ -1901,21 +1816,16 @@
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>RuleTable/bin/darwin-4.2.1/release</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>Scope3Parser/bin/darwin-4.2.1/release</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>bin/darwin-4.2.1/release/debug-symbols-on</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>bin/gcc-4.6/release/debug-symbols-on</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>CYKPlusParser/bin/clang-darwin-4.2.1/release/debug-symbols-on</name>
<type>2</type>
@ -1927,12 +1837,7 @@
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>DynSAInclude/bin/clang-darwin-4.2.1/release/debug-symbols-on</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>DynSAInclude/bin/darwin-4.2.1/release/debug-symbols-on</name>
<name>CYKPlusParser/bin/gcc-4.6/release/debug-symbols-on</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
@ -1951,21 +1856,16 @@
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>RuleTable/bin/darwin-4.2.1/release/debug-symbols-on</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>bin/darwin-4.2.1/release/debug-symbols-on/link-static</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>bin/gcc-4.6/release/debug-symbols-on/link-static</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>CYKPlusParser/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static</name>
<type>2</type>
@ -1982,12 +1882,7 @@
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>DynSAInclude/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>DynSAInclude/bin/darwin-4.2.1/release/debug-symbols-on/link-static</name>
<name>CYKPlusParser/bin/gcc-4.6/release/debug-symbols-on/link-static</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
@ -2011,31 +1906,16 @@
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on/link-static</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>CYKPlusParser/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi</name>
<type>2</type>
@ -2072,12 +1952,7 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/src/CYKPlusParser/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/DotChartOnDisk.o</locationURI>
</link>
<link>
<name>DynSAInclude/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>DynSAInclude/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi</name>
<name>CYKPlusParser/bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
@ -2191,91 +2066,6 @@
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/LoaderCompact.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/LoaderCompact.o</locationURI>
</link>
<link>
<name>RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/LoaderFactory.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/LoaderFactory.o</locationURI>
</link>
<link>
<name>RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/LoaderHiero.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/LoaderHiero.o</locationURI>
</link>
<link>
<name>RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/LoaderStandard.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/LoaderStandard.o</locationURI>
</link>
<link>
<name>RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/PhraseDictionaryALSuffixArray.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/PhraseDictionaryALSuffixArray.o</locationURI>
</link>
<link>
<name>RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/PhraseDictionaryNodeSCFG.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/PhraseDictionaryNodeSCFG.o</locationURI>
</link>
<link>
<name>RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/PhraseDictionaryOnDisk.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/PhraseDictionaryOnDisk.o</locationURI>
</link>
<link>
<name>RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/PhraseDictionarySCFG.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/PhraseDictionarySCFG.o</locationURI>
</link>
<link>
<name>RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/Trie.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/Trie.o</locationURI>
</link>
<link>
<name>RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/UTrie.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/UTrie.o</locationURI>
</link>
<link>
<name>RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/UTrieNode.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/UTrieNode.o</locationURI>
</link>
<link>
<name>Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/ApplicableRuleTrie.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/ApplicableRuleTrie.o</locationURI>
</link>
<link>
<name>Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/Parser.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/Parser.o</locationURI>
</link>
<link>
<name>Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/StackLatticeBuilder.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/StackLatticeBuilder.o</locationURI>
</link>
<link>
<name>Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/VarSpanTrieBuilder.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on/threading-multi/VarSpanTrieBuilder.o</locationURI>
</link>
<link>
<name>bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/AlignmentInfo.o</name>
<type>1</type>
@ -2751,6 +2541,56 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/libmoses_internal.a</locationURI>
</link>
<link>
<name>bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/ApplicableRuleTrie.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/ApplicableRuleTrie.o</locationURI>
</link>
<link>
<name>bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/FuzzyMatchWrapper.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/fuzzy-match/bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/FuzzyMatchWrapper.o</locationURI>
</link>
<link>
<name>bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/Parser.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/Parser.o</locationURI>
</link>
<link>
<name>bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/SentenceAlignment.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/fuzzy-match/bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/SentenceAlignment.o</locationURI>
</link>
<link>
<name>bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/StackLatticeBuilder.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/StackLatticeBuilder.o</locationURI>
</link>
<link>
<name>bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/SuffixArray.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/fuzzy-match/bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/SuffixArray.o</locationURI>
</link>
<link>
<name>bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/VarSpanTrieBuilder.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/VarSpanTrieBuilder.o</locationURI>
</link>
<link>
<name>bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/Vocabulary.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/fuzzy-match/bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/Vocabulary.o</locationURI>
</link>
<link>
<name>bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/libScope3Parser.a</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/libScope3Parser.a</locationURI>
</link>
<link>
<name>bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/libfuzzy-match.a</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/fuzzy-match/bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/libfuzzy-match.a</locationURI>
</link>
<link>
<name>CYKPlusParser/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DotChartOnDisk.o</name>
<type>1</type>
@ -2787,24 +2627,39 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/src/CYKPlusParser/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/libCYKPlusParser.a</locationURI>
</link>
<link>
<name>DynSAInclude/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DynSAInclude</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>DynSAInclude/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/libdynsa.a</name>
<name>CYKPlusParser/bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/ChartRuleLookupManagerCYKPlus.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/DynSAInclude/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/libdynsa.a</locationURI>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/CYKPlusParser/bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/ChartRuleLookupManagerCYKPlus.o</locationURI>
</link>
<link>
<name>DynSAInclude/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DynSAInclude</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
<name>DynSAInclude/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/libdynsa.a</name>
<name>CYKPlusParser/bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/ChartRuleLookupManagerMemory.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/DynSAInclude/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/libdynsa.a</locationURI>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/CYKPlusParser/bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/ChartRuleLookupManagerMemory.o</locationURI>
</link>
<link>
<name>CYKPlusParser/bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/ChartRuleLookupManagerMemoryPerSentence.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/CYKPlusParser/bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/ChartRuleLookupManagerMemoryPerSentence.o</locationURI>
</link>
<link>
<name>CYKPlusParser/bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/ChartRuleLookupManagerOnDisk.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/CYKPlusParser/bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/ChartRuleLookupManagerOnDisk.o</locationURI>
</link>
<link>
<name>CYKPlusParser/bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/DotChartInMemory.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/CYKPlusParser/bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/DotChartInMemory.o</locationURI>
</link>
<link>
<name>CYKPlusParser/bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/DotChartOnDisk.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/CYKPlusParser/bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/DotChartOnDisk.o</locationURI>
</link>
<link>
<name>CYKPlusParser/bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/libCYKPlusParser.a</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/CYKPlusParser/bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/libCYKPlusParser.a</locationURI>
</link>
<link>
<name>LM/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/Base.o</name>
@ -2921,91 +2776,6 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/LM/bin/gcc-4.2.1/release/debug-symbols-on/link-static/threading-multi/libLM.a</locationURI>
</link>
<link>
<name>RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/LoaderCompact.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/LoaderCompact.o</locationURI>
</link>
<link>
<name>RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/LoaderFactory.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/LoaderFactory.o</locationURI>
</link>
<link>
<name>RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/LoaderHiero.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/LoaderHiero.o</locationURI>
</link>
<link>
<name>RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/LoaderStandard.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/LoaderStandard.o</locationURI>
</link>
<link>
<name>RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/PhraseDictionaryALSuffixArray.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/PhraseDictionaryALSuffixArray.o</locationURI>
</link>
<link>
<name>RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/PhraseDictionaryNodeSCFG.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/PhraseDictionaryNodeSCFG.o</locationURI>
</link>
<link>
<name>RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/PhraseDictionaryOnDisk.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/PhraseDictionaryOnDisk.o</locationURI>
</link>
<link>
<name>RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/PhraseDictionarySCFG.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/PhraseDictionarySCFG.o</locationURI>
</link>
<link>
<name>RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/Trie.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/Trie.o</locationURI>
</link>
<link>
<name>RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/UTrie.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/UTrie.o</locationURI>
</link>
<link>
<name>RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/UTrieNode.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/UTrieNode.o</locationURI>
</link>
<link>
<name>RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/libRuleTable.a</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/RuleTable/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/libRuleTable.a</locationURI>
</link>
<link>
<name>Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ApplicableRuleTrie.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/ApplicableRuleTrie.o</locationURI>
</link>
<link>
<name>Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/Parser.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/Parser.o</locationURI>
</link>
<link>
<name>Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/StackLatticeBuilder.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/StackLatticeBuilder.o</locationURI>
</link>
<link>
<name>Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/VarSpanTrieBuilder.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/VarSpanTrieBuilder.o</locationURI>
</link>
<link>
<name>Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/libScope3Parser.a</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/Scope3Parser/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/libScope3Parser.a</locationURI>
</link>
<link>
<name>bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DynSAInclude/file.o</name>
<type>1</type>
@ -3021,35 +2791,5 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DynSAInclude/vocab.o</locationURI>
</link>
<link>
<name>DynSAInclude/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DynSAInclude/file.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/DynSAInclude/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DynSAInclude/file.o</locationURI>
</link>
<link>
<name>DynSAInclude/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DynSAInclude/params.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/DynSAInclude/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DynSAInclude/params.o</locationURI>
</link>
<link>
<name>DynSAInclude/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DynSAInclude/vocab.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/DynSAInclude/bin/clang-darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DynSAInclude/vocab.o</locationURI>
</link>
<link>
<name>DynSAInclude/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DynSAInclude/file.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/DynSAInclude/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DynSAInclude/file.o</locationURI>
</link>
<link>
<name>DynSAInclude/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DynSAInclude/params.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/DynSAInclude/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DynSAInclude/params.o</locationURI>
</link>
<link>
<name>DynSAInclude/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DynSAInclude/vocab.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/src/DynSAInclude/bin/darwin-4.2.1/release/debug-symbols-on/link-static/threading-multi/DynSAInclude/vocab.o</locationURI>
</link>
</linkedResources>
</projectDescription>

View File

@ -0,0 +1,102 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|Win32">
<Configuration>Debug</Configuration>
<Platform>Win32</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|Win32">
<Configuration>Release</Configuration>
<Platform>Win32</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{85811FDF-8AD1-4490-A545-B2F51931A18C}</ProjectGuid>
<RootNamespace>mosescmd</RootNamespace>
<Keyword>Win32Proj</Keyword>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>Unicode</CharacterSet>
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion>
<OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(SolutionDir)$(Configuration)\</OutDir>
<IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(Configuration)\</IntDir>
<LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</LinkIncremental>
<OutDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(SolutionDir)$(Configuration)\</OutDir>
<IntDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(Configuration)\</IntDir>
<LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</LinkIncremental>
<IncludePath Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">C:\Program Files\boost\boost_1_47;$(IncludePath)</IncludePath>
<IncludePath Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">C:\Program Files\boost\boost_1_47;$(IncludePath)</IncludePath>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile>
<Optimization>Disabled</Optimization>
<AdditionalIncludeDirectories>C:\xmlrpc-c\include;C:\boost\boost_1_47;$(SolutionDir)/../../moses/src;$(SolutionDir)/../..;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<PreprocessorDefinitions>WITH_THREADS;NO_PIPES;WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<MinimalRebuild>true</MinimalRebuild>
<BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
<RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
<PrecompiledHeader>
</PrecompiledHeader>
<WarningLevel>Level3</WarningLevel>
<DebugInformationFormat>EditAndContinue</DebugInformationFormat>
</ClCompile>
<Link>
<AdditionalDependencies>libxmlrpc_server_abyss.lib;libxmlrpc_server.lib;libxmlrpc_abyss.lib;libxmlrpc.lib;libxmlrpc_util.lib;libxmlrpc_xmlparse.lib;libxmlrpc_xmltok.lib;libxmlrpc++.lib;C:\GnuWin32\lib\zlib.lib;$(SolutionDir)$(Configuration)\moses.lib;$(SolutionDir)$(Configuration)\kenlm.lib;$(SolutionDir)$(Configuration)\OnDiskPt.lib;%(AdditionalDependencies)</AdditionalDependencies>
<GenerateDebugInformation>true</GenerateDebugInformation>
<SubSystem>Console</SubSystem>
<RandomizedBaseAddress>false</RandomizedBaseAddress>
<DataExecutionPrevention>
</DataExecutionPrevention>
<TargetMachine>MachineX86</TargetMachine>
<AdditionalLibraryDirectories>C:\xmlrpc-c\bin\Debug-Static-Win32;C:\boost\boost_1_47\lib</AdditionalLibraryDirectories>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<ClCompile>
<AdditionalIncludeDirectories>C:\xmlrpc-c\include;C:\boost\boost_1_47;$(SolutionDir)/../../moses/src;$(SolutionDir)/../..;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<PreprocessorDefinitions>WITH_THREADS;NO_PIPES;WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
<PrecompiledHeader>
</PrecompiledHeader>
<WarningLevel>Level3</WarningLevel>
<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
</ClCompile>
<Link>
<AdditionalDependencies>libxmlrpc_server_abyss.lib;libxmlrpc_server.lib;libxmlrpc_abyss.lib;libxmlrpc.lib;libxmlrpc_util.lib;libxmlrpc_xmlparse.lib;libxmlrpc_xmltok.lib;libxmlrpc++.lib;C:\GnuWin32\lib\zlib.lib;$(SolutionDir)$(Configuration)\moses.lib;$(SolutionDir)$(Configuration)\kenlm.lib;$(SolutionDir)$(Configuration)\OnDiskPt.lib;%(AdditionalDependencies)</AdditionalDependencies>
<GenerateDebugInformation>true</GenerateDebugInformation>
<SubSystem>Console</SubSystem>
<OptimizeReferences>true</OptimizeReferences>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<RandomizedBaseAddress>false</RandomizedBaseAddress>
<DataExecutionPrevention>
</DataExecutionPrevention>
<TargetMachine>MachineX86</TargetMachine>
<AdditionalLibraryDirectories>C:\xmlrpc-c\bin\Release-Static-Win32;C:\boost\boost_1_47\lib</AdditionalLibraryDirectories>
</Link>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="..\server\mosesserver.cpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>

View File

@ -0,0 +1,297 @@
// !$*UTF8*$!
{
archiveVersion = 1;
classes = {
};
objectVersion = 46;
objects = {
/* Begin PBXBuildFile section */
1E6D9FF115D027F00064D436 /* libmoses.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 1EB3EBD515D0269B006B9CF1 /* libmoses.a */; };
1EB3EBB315D024C7006B9CF1 /* processLexicalTableMin.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EB3EBB215D024C7006B9CF1 /* processLexicalTableMin.cpp */; };
/* End PBXBuildFile section */
/* Begin PBXContainerItemProxy section */
1E6D9FF215D0292D0064D436 /* PBXContainerItemProxy */ = {
isa = PBXContainerItemProxy;
containerPortal = 1EB3EBD015D0269B006B9CF1 /* moses.xcodeproj */;
proxyType = 1;
remoteGlobalIDString = D2AAC045055464E500DB518D;
remoteInfo = moses;
};
1EB3EBD415D0269B006B9CF1 /* PBXContainerItemProxy */ = {
isa = PBXContainerItemProxy;
containerPortal = 1EB3EBD015D0269B006B9CF1 /* moses.xcodeproj */;
proxyType = 2;
remoteGlobalIDString = D2AAC046055464E500DB518D;
remoteInfo = moses;
};
/* End PBXContainerItemProxy section */
/* Begin PBXCopyFilesBuildPhase section */
1E3A0AEA15D0242A003EF9B4 /* CopyFiles */ = {
isa = PBXCopyFilesBuildPhase;
buildActionMask = 2147483647;
dstPath = /usr/share/man/man1/;
dstSubfolderSpec = 0;
files = (
);
runOnlyForDeploymentPostprocessing = 1;
};
/* End PBXCopyFilesBuildPhase section */
/* Begin PBXFileReference section */
1E3A0AEC15D0242A003EF9B4 /* processLexicalTableMin */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = processLexicalTableMin; sourceTree = BUILT_PRODUCTS_DIR; };
1EB3EBB215D024C7006B9CF1 /* processLexicalTableMin.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = processLexicalTableMin.cpp; path = ../../misc/processLexicalTableMin.cpp; sourceTree = "<group>"; };
1EB3EBD015D0269B006B9CF1 /* moses.xcodeproj */ = {isa = PBXFileReference; lastKnownFileType = "wrapper.pb-project"; path = moses.xcodeproj; sourceTree = "<group>"; };
/* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */
1E3A0AE915D0242A003EF9B4 /* Frameworks */ = {
isa = PBXFrameworksBuildPhase;
buildActionMask = 2147483647;
files = (
1E6D9FF115D027F00064D436 /* libmoses.a in Frameworks */,
);
runOnlyForDeploymentPostprocessing = 0;
};
/* End PBXFrameworksBuildPhase section */
/* Begin PBXGroup section */
1E3A0AE115D02427003EF9B4 = {
isa = PBXGroup;
children = (
1EB3EBB215D024C7006B9CF1 /* processLexicalTableMin.cpp */,
1E3A0AED15D0242A003EF9B4 /* Products */,
1EB3EBD015D0269B006B9CF1 /* moses.xcodeproj */,
);
sourceTree = "<group>";
};
1E3A0AED15D0242A003EF9B4 /* Products */ = {
isa = PBXGroup;
children = (
1E3A0AEC15D0242A003EF9B4 /* processLexicalTableMin */,
);
name = Products;
sourceTree = "<group>";
};
1EB3EBD115D0269B006B9CF1 /* Products */ = {
isa = PBXGroup;
children = (
1EB3EBD515D0269B006B9CF1 /* libmoses.a */,
);
name = Products;
sourceTree = "<group>";
};
/* End PBXGroup section */
/* Begin PBXNativeTarget section */
1E3A0AEB15D0242A003EF9B4 /* processLexicalTableMin */ = {
isa = PBXNativeTarget;
buildConfigurationList = 1E3A0AF615D0242B003EF9B4 /* Build configuration list for PBXNativeTarget "processLexicalTableMin" */;
buildPhases = (
1E3A0AE815D0242A003EF9B4 /* Sources */,
1E3A0AE915D0242A003EF9B4 /* Frameworks */,
1E3A0AEA15D0242A003EF9B4 /* CopyFiles */,
);
buildRules = (
);
dependencies = (
1E6D9FF315D0292D0064D436 /* PBXTargetDependency */,
);
name = processLexicalTableMin;
productName = processLexicalTableMin;
productReference = 1E3A0AEC15D0242A003EF9B4 /* processLexicalTableMin */;
productType = "com.apple.product-type.tool";
};
/* End PBXNativeTarget section */
/* Begin PBXProject section */
1E3A0AE315D02427003EF9B4 /* Project object */ = {
isa = PBXProject;
buildConfigurationList = 1E3A0AE615D02427003EF9B4 /* Build configuration list for PBXProject "processLexicalTableMin" */;
compatibilityVersion = "Xcode 3.2";
developmentRegion = English;
hasScannedForEncodings = 0;
knownRegions = (
en,
);
mainGroup = 1E3A0AE115D02427003EF9B4;
productRefGroup = 1E3A0AED15D0242A003EF9B4 /* Products */;
projectDirPath = "";
projectReferences = (
{
ProductGroup = 1EB3EBD115D0269B006B9CF1 /* Products */;
ProjectRef = 1EB3EBD015D0269B006B9CF1 /* moses.xcodeproj */;
},
);
projectRoot = "";
targets = (
1E3A0AEB15D0242A003EF9B4 /* processLexicalTableMin */,
);
};
/* End PBXProject section */
/* Begin PBXReferenceProxy section */
1EB3EBD515D0269B006B9CF1 /* libmoses.a */ = {
isa = PBXReferenceProxy;
fileType = archive.ar;
path = libmoses.a;
remoteRef = 1EB3EBD415D0269B006B9CF1 /* PBXContainerItemProxy */;
sourceTree = BUILT_PRODUCTS_DIR;
};
/* End PBXReferenceProxy section */
/* Begin PBXSourcesBuildPhase section */
1E3A0AE815D0242A003EF9B4 /* Sources */ = {
isa = PBXSourcesBuildPhase;
buildActionMask = 2147483647;
files = (
1EB3EBB315D024C7006B9CF1 /* processLexicalTableMin.cpp in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
/* End PBXSourcesBuildPhase section */
/* Begin PBXTargetDependency section */
1E6D9FF315D0292D0064D436 /* PBXTargetDependency */ = {
isa = PBXTargetDependency;
name = moses;
targetProxy = 1E6D9FF215D0292D0064D436 /* PBXContainerItemProxy */;
};
/* End PBXTargetDependency section */
/* Begin XCBuildConfiguration section */
1E3A0AF415D0242B003EF9B4 /* Debug */ = {
isa = XCBuildConfiguration;
buildSettings = {
ALWAYS_SEARCH_USER_PATHS = NO;
ARCHS = "$(ARCHS_STANDARD_64_BIT)";
COPY_PHASE_STRIP = NO;
GCC_C_LANGUAGE_STANDARD = gnu99;
GCC_DYNAMIC_NO_PIC = NO;
GCC_ENABLE_OBJC_EXCEPTIONS = YES;
GCC_OPTIMIZATION_LEVEL = 0;
GCC_PREPROCESSOR_DEFINITIONS = (
"DEBUG=1",
"$(inherited)",
);
GCC_SYMBOLS_PRIVATE_EXTERN = NO;
GCC_VERSION = com.apple.compilers.llvm.clang.1_0;
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
GCC_WARN_ABOUT_MISSING_PROTOTYPES = YES;
GCC_WARN_ABOUT_RETURN_TYPE = YES;
GCC_WARN_UNUSED_VARIABLE = YES;
HEADER_SEARCH_PATHS = (
../../,
../../irstlm/include,
/opt/local/include,
);
MACOSX_DEPLOYMENT_TARGET = 10.7;
ONLY_ACTIVE_ARCH = YES;
SDKROOT = macosx;
USER_HEADER_SEARCH_PATHS = "../../ ../../irstlm/include /opt/local/include ../../moses/src";
};
name = Debug;
};
1E3A0AF515D0242B003EF9B4 /* Release */ = {
isa = XCBuildConfiguration;
buildSettings = {
ALWAYS_SEARCH_USER_PATHS = NO;
ARCHS = "$(ARCHS_STANDARD_64_BIT)";
COPY_PHASE_STRIP = YES;
DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
GCC_C_LANGUAGE_STANDARD = gnu99;
GCC_ENABLE_OBJC_EXCEPTIONS = YES;
GCC_VERSION = com.apple.compilers.llvm.clang.1_0;
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
GCC_WARN_ABOUT_MISSING_PROTOTYPES = YES;
GCC_WARN_ABOUT_RETURN_TYPE = YES;
GCC_WARN_UNUSED_VARIABLE = YES;
HEADER_SEARCH_PATHS = (
../../,
../../irstlm/include,
/opt/local/include,
);
MACOSX_DEPLOYMENT_TARGET = 10.7;
SDKROOT = macosx;
USER_HEADER_SEARCH_PATHS = "../../ ../../irstlm/include /opt/local/include ../../moses/src";
};
name = Release;
};
1E3A0AF715D0242B003EF9B4 /* Debug */ = {
isa = XCBuildConfiguration;
buildSettings = {
GCC_PREPROCESSOR_DEFINITIONS = WITH_THREADS;
"GCC_PREPROCESSOR_DEFINITIONS[arch=*]" = WITH_THREADS;
LIBRARY_SEARCH_PATHS = (
../../irstlm/lib,
../../srilm/lib/macosx,
../../randlm/lib,
/opt/local/lib,
);
OTHER_LDFLAGS = (
"-lz",
"-lirstlm",
"-lmisc",
"-ldstruct",
"-loolm",
"-lflm",
"-llattice",
"-lrandlm",
"-lboost_thread-mt",
);
PRODUCT_NAME = "$(TARGET_NAME)";
};
name = Debug;
};
1E3A0AF815D0242B003EF9B4 /* Release */ = {
isa = XCBuildConfiguration;
buildSettings = {
GCC_PREPROCESSOR_DEFINITIONS = WITH_THREADS;
LIBRARY_SEARCH_PATHS = (
../../irstlm/lib,
../../srilm/lib/macosx,
../../randlm/lib,
/opt/local/lib,
);
OTHER_LDFLAGS = (
"-lz",
"-lirstlm",
"-lmisc",
"-ldstruct",
"-loolm",
"-lflm",
"-llattice",
"-lrandlm",
"-lboost_thread-mt",
);
PRODUCT_NAME = "$(TARGET_NAME)";
};
name = Release;
};
/* End XCBuildConfiguration section */
/* Begin XCConfigurationList section */
1E3A0AE615D02427003EF9B4 /* Build configuration list for PBXProject "processLexicalTableMin" */ = {
isa = XCConfigurationList;
buildConfigurations = (
1E3A0AF415D0242B003EF9B4 /* Debug */,
1E3A0AF515D0242B003EF9B4 /* Release */,
);
defaultConfigurationIsVisible = 0;
defaultConfigurationName = Release;
};
1E3A0AF615D0242B003EF9B4 /* Build configuration list for PBXNativeTarget "processLexicalTableMin" */ = {
isa = XCConfigurationList;
buildConfigurations = (
1E3A0AF715D0242B003EF9B4 /* Debug */,
1E3A0AF815D0242B003EF9B4 /* Release */,
);
defaultConfigurationIsVisible = 0;
defaultConfigurationName = Release;
};
/* End XCConfigurationList section */
};
rootObject = 1E3A0AE315D02427003EF9B4 /* Project object */;
}

View File

@ -0,0 +1,304 @@
// !$*UTF8*$!
{
archiveVersion = 1;
classes = {
};
objectVersion = 46;
objects = {
/* Begin PBXBuildFile section */
1EF3D68A15D02AEF00969478 /* processPhraseTableMin.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EF3D68915D02AEF00969478 /* processPhraseTableMin.cpp */; };
1EF3D6A415D02B6400969478 /* libmoses.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 1EF3D69915D02B4400969478 /* libmoses.a */; };
/* End PBXBuildFile section */
/* Begin PBXContainerItemProxy section */
1EF3D69815D02B4400969478 /* PBXContainerItemProxy */ = {
isa = PBXContainerItemProxy;
containerPortal = 1EF3D69415D02B4400969478 /* moses.xcodeproj */;
proxyType = 2;
remoteGlobalIDString = D2AAC046055464E500DB518D;
remoteInfo = moses;
};
1EF3D6A515D02B6B00969478 /* PBXContainerItemProxy */ = {
isa = PBXContainerItemProxy;
containerPortal = 1EF3D69415D02B4400969478 /* moses.xcodeproj */;
proxyType = 1;
remoteGlobalIDString = D2AAC045055464E500DB518D;
remoteInfo = moses;
};
/* End PBXContainerItemProxy section */
/* Begin PBXCopyFilesBuildPhase section */
1E6D9FFD15D02A8D0064D436 /* CopyFiles */ = {
isa = PBXCopyFilesBuildPhase;
buildActionMask = 2147483647;
dstPath = /usr/share/man/man1/;
dstSubfolderSpec = 0;
files = (
);
runOnlyForDeploymentPostprocessing = 1;
};
/* End PBXCopyFilesBuildPhase section */
/* Begin PBXFileReference section */
1E6D9FFF15D02A8D0064D436 /* processPhraseTableMin */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = processPhraseTableMin; sourceTree = BUILT_PRODUCTS_DIR; };
1EF3D68915D02AEF00969478 /* processPhraseTableMin.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = processPhraseTableMin.cpp; path = ../../misc/processPhraseTableMin.cpp; sourceTree = "<group>"; };
1EF3D69415D02B4400969478 /* moses.xcodeproj */ = {isa = PBXFileReference; lastKnownFileType = "wrapper.pb-project"; path = moses.xcodeproj; sourceTree = "<group>"; };
/* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */
1E6D9FFC15D02A8D0064D436 /* Frameworks */ = {
isa = PBXFrameworksBuildPhase;
buildActionMask = 2147483647;
files = (
1EF3D6A415D02B6400969478 /* libmoses.a in Frameworks */,
);
runOnlyForDeploymentPostprocessing = 0;
};
/* End PBXFrameworksBuildPhase section */
/* Begin PBXGroup section */
1E6D9FF415D02A8C0064D436 = {
isa = PBXGroup;
children = (
1EF3D68915D02AEF00969478 /* processPhraseTableMin.cpp */,
1E6DA00015D02A8D0064D436 /* Products */,
1EF3D69415D02B4400969478 /* moses.xcodeproj */,
);
sourceTree = "<group>";
};
1E6DA00015D02A8D0064D436 /* Products */ = {
isa = PBXGroup;
children = (
1E6D9FFF15D02A8D0064D436 /* processPhraseTableMin */,
);
name = Products;
sourceTree = "<group>";
};
1EF3D69515D02B4400969478 /* Products */ = {
isa = PBXGroup;
children = (
1EF3D69915D02B4400969478 /* libmoses.a */,
);
name = Products;
sourceTree = "<group>";
};
/* End PBXGroup section */
/* Begin PBXNativeTarget section */
1E6D9FFE15D02A8D0064D436 /* processPhraseTableMin */ = {
isa = PBXNativeTarget;
buildConfigurationList = 1E6DA00915D02A8D0064D436 /* Build configuration list for PBXNativeTarget "processPhraseTableMin" */;
buildPhases = (
1E6D9FFB15D02A8D0064D436 /* Sources */,
1E6D9FFC15D02A8D0064D436 /* Frameworks */,
1E6D9FFD15D02A8D0064D436 /* CopyFiles */,
);
buildRules = (
);
dependencies = (
1EF3D6A615D02B6B00969478 /* PBXTargetDependency */,
);
name = processPhraseTableMin;
productName = processPhraseTableMin;
productReference = 1E6D9FFF15D02A8D0064D436 /* processPhraseTableMin */;
productType = "com.apple.product-type.tool";
};
/* End PBXNativeTarget section */
/* Begin PBXProject section */
1E6D9FF615D02A8C0064D436 /* Project object */ = {
isa = PBXProject;
buildConfigurationList = 1E6D9FF915D02A8C0064D436 /* Build configuration list for PBXProject "processPhraseTableMin" */;
compatibilityVersion = "Xcode 3.2";
developmentRegion = English;
hasScannedForEncodings = 0;
knownRegions = (
en,
);
mainGroup = 1E6D9FF415D02A8C0064D436;
productRefGroup = 1E6DA00015D02A8D0064D436 /* Products */;
projectDirPath = "";
projectReferences = (
{
ProductGroup = 1EF3D69515D02B4400969478 /* Products */;
ProjectRef = 1EF3D69415D02B4400969478 /* moses.xcodeproj */;
},
);
projectRoot = "";
targets = (
1E6D9FFE15D02A8D0064D436 /* processPhraseTableMin */,
);
};
/* End PBXProject section */
/* Begin PBXReferenceProxy section */
1EF3D69915D02B4400969478 /* libmoses.a */ = {
isa = PBXReferenceProxy;
fileType = archive.ar;
path = libmoses.a;
remoteRef = 1EF3D69815D02B4400969478 /* PBXContainerItemProxy */;
sourceTree = BUILT_PRODUCTS_DIR;
};
/* End PBXReferenceProxy section */
/* Begin PBXSourcesBuildPhase section */
1E6D9FFB15D02A8D0064D436 /* Sources */ = {
isa = PBXSourcesBuildPhase;
buildActionMask = 2147483647;
files = (
1EF3D68A15D02AEF00969478 /* processPhraseTableMin.cpp in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
/* End PBXSourcesBuildPhase section */
/* Begin PBXTargetDependency section */
1EF3D6A615D02B6B00969478 /* PBXTargetDependency */ = {
isa = PBXTargetDependency;
name = moses;
targetProxy = 1EF3D6A515D02B6B00969478 /* PBXContainerItemProxy */;
};
/* End PBXTargetDependency section */
/* Begin XCBuildConfiguration section */
1E6DA00715D02A8D0064D436 /* Debug */ = {
isa = XCBuildConfiguration;
buildSettings = {
ALWAYS_SEARCH_USER_PATHS = NO;
ARCHS = "$(ARCHS_STANDARD_64_BIT)";
COPY_PHASE_STRIP = NO;
GCC_C_LANGUAGE_STANDARD = gnu99;
GCC_DYNAMIC_NO_PIC = NO;
GCC_ENABLE_OBJC_EXCEPTIONS = YES;
GCC_OPTIMIZATION_LEVEL = 0;
GCC_PREPROCESSOR_DEFINITIONS = (
"DEBUG=1",
"$(inherited)",
);
GCC_SYMBOLS_PRIVATE_EXTERN = NO;
GCC_VERSION = com.apple.compilers.llvm.clang.1_0;
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
GCC_WARN_ABOUT_MISSING_PROTOTYPES = YES;
GCC_WARN_ABOUT_RETURN_TYPE = YES;
GCC_WARN_UNUSED_VARIABLE = YES;
LIBRARY_SEARCH_PATHS = "";
MACOSX_DEPLOYMENT_TARGET = 10.7;
ONLY_ACTIVE_ARCH = YES;
SDKROOT = macosx;
};
name = Debug;
};
1E6DA00815D02A8D0064D436 /* Release */ = {
isa = XCBuildConfiguration;
buildSettings = {
ALWAYS_SEARCH_USER_PATHS = NO;
ARCHS = "$(ARCHS_STANDARD_64_BIT)";
COPY_PHASE_STRIP = YES;
DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
GCC_C_LANGUAGE_STANDARD = gnu99;
GCC_ENABLE_OBJC_EXCEPTIONS = YES;
GCC_VERSION = com.apple.compilers.llvm.clang.1_0;
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
GCC_WARN_ABOUT_MISSING_PROTOTYPES = YES;
GCC_WARN_ABOUT_RETURN_TYPE = YES;
GCC_WARN_UNUSED_VARIABLE = YES;
LIBRARY_SEARCH_PATHS = "";
MACOSX_DEPLOYMENT_TARGET = 10.7;
SDKROOT = macosx;
};
name = Release;
};
1E6DA00A15D02A8D0064D436 /* Debug */ = {
isa = XCBuildConfiguration;
buildSettings = {
GCC_PREPROCESSOR_DEFINITIONS = WITH_THREADS;
HEADER_SEARCH_PATHS = (
../../,
../../irstlm/include,
/opt/local/include,
../../moses/src,
../../cmph/include,
);
LIBRARY_SEARCH_PATHS = (
../../irstlm/lib,
../../srilm/lib/macosx,
../../randlm/lib,
/opt/local/lib,
../../cmph/lib,
);
OTHER_LDFLAGS = (
"-lz",
"-lirstlm",
"-lmisc",
"-ldstruct",
"-loolm",
"-lflm",
"-llattice",
"-lrandlm",
"-lboost_thread-mt",
"-lcmph",
);
PRODUCT_NAME = "$(TARGET_NAME)";
};
name = Debug;
};
1E6DA00B15D02A8D0064D436 /* Release */ = {
isa = XCBuildConfiguration;
buildSettings = {
GCC_PREPROCESSOR_DEFINITIONS = WITH_THREADS;
HEADER_SEARCH_PATHS = (
../../,
../../irstlm/include,
/opt/local/include,
../../moses/src,
../../cmph/include,
);
LIBRARY_SEARCH_PATHS = (
../../irstlm/lib,
../../srilm/lib/macosx,
../../randlm/lib,
/opt/local/lib,
../../cmph/lib,
);
OTHER_LDFLAGS = (
"-lz",
"-lirstlm",
"-lmisc",
"-ldstruct",
"-loolm",
"-lflm",
"-llattice",
"-lrandlm",
"-lboost_thread-mt",
"-lcmph",
);
PRODUCT_NAME = "$(TARGET_NAME)";
};
name = Release;
};
/* End XCBuildConfiguration section */
/* Begin XCConfigurationList section */
1E6D9FF915D02A8C0064D436 /* Build configuration list for PBXProject "processPhraseTableMin" */ = {
isa = XCConfigurationList;
buildConfigurations = (
1E6DA00715D02A8D0064D436 /* Debug */,
1E6DA00815D02A8D0064D436 /* Release */,
);
defaultConfigurationIsVisible = 0;
defaultConfigurationName = Release;
};
1E6DA00915D02A8D0064D436 /* Build configuration list for PBXNativeTarget "processPhraseTableMin" */ = {
isa = XCConfigurationList;
buildConfigurations = (
1E6DA00A15D02A8D0064D436 /* Debug */,
1E6DA00B15D02A8D0064D436 /* Release */,
);
defaultConfigurationIsVisible = 0;
defaultConfigurationName = Release;
};
/* End XCConfigurationList section */
};
rootObject = 1E6D9FF615D02A8C0064D436 /* Project object */;
}

View File

@ -41,9 +41,12 @@
<option id="gnu.cpp.compilermacosx.exe.debug.option.optimization.level.623959371" name="Optimization Level" superClass="gnu.cpp.compilermacosx.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
<option id="gnu.cpp.compiler.macosx.exe.debug.option.debugging.level.892917290" name="Debug Level" superClass="gnu.cpp.compiler.macosx.exe.debug.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.include.paths.1401298824" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="/Users/hieuhoang/unison/workspace/github/moses-smt"/>
<listOptionValue builtIn="false" value="${workspace_loc}/../../"/>
<listOptionValue builtIn="false" value="/opt/local/include"/>
</option>
<option id="gnu.cpp.compiler.option.preprocessor.def.1952961175" name="Defined symbols (-D)" superClass="gnu.cpp.compiler.option.preprocessor.def" valueType="definedSymbols">
<listOptionValue builtIn="false" value="TRACE_ENABLE"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1420621104" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.c.compiler.macosx.exe.debug.1724141901" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.macosx.exe.debug">
@ -130,4 +133,5 @@
<storageModule moduleId="refreshScope" versionNumber="1">
<resource resourceType="PROJECT" workspacePath="/util"/>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
</cproject>

28
contrib/python/README.md Normal file
View File

@ -0,0 +1,28 @@
# Python interface to Moses
The idea is to have some of Moses' internals exposed to Python (inspired on pycdec).
## What's been interfaced?
* Binary phrase table:
Moses::PhraseDictionaryTree.h
## Building
1. Build the python extension
python setup.py build_ext -i [--with-cmph]
2. Check the example code
echo "casa" | python example.py examples/phrase-table 5 1
echo "essa casa" | python example.py examples/phrase-table 5 1
## Changing the code
If you want to add your changes you are going to have to recompile the cython code.
1. Compile the cython code (use Cython 0.16): this will generate binpt/binpt.cpp
cython --cplus binpt/binpt.pyx

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,25 @@
from libcpp.string cimport string
from libcpp.vector cimport vector
from libcpp.pair cimport pair
# Convenience alias: Moses stores target words as pointers to strings.
ctypedef string* str_pointer

# Moses typedefs re-declared for Cython: a target candidate is a pair of
# (pointers to target words, score vector).
cdef extern from 'TypeDef.h' namespace 'Moses':
    ctypedef vector[float] Scores
    ctypedef pair[vector[str_pointer], Scores] StringTgtCand

# Minimal interface to the Moses binary phrase-table reader.
cdef extern from 'PhraseDictionaryTree.h' namespace 'Moses':
    cdef cppclass PhraseDictionaryTree:
        PhraseDictionaryTree(unsigned nscores)
        # Enable/disable reading of word-alignment info from the table.
        void UseWordAlignment(bint use)
        bint UseWordAlignment()
        # Loads the table files whose stem is 'path'.
        int Read(string& path)
        # Fills 'rv' with the target candidates for source phrase 'fs'.
        void GetTargetCandidates(vector[string]& fs,
                                 vector[StringTgtCand]& rv)
        # Overload that additionally returns word-alignment strings in 'wa'.
        void GetTargetCandidates(vector[string]& fs,
                                 vector[StringTgtCand]& rv,
                                 vector[string]& wa)

# Moses' tokenizer, used here to split a query line into source words.
cdef extern from 'Util.h' namespace 'Moses':
    cdef vector[string] Tokenize(string& text, string& delimiters)

View File

@ -0,0 +1,166 @@
from libcpp.string cimport string
from libcpp.vector cimport vector
import os
import cython
cpdef int fsign(float x):
    '''Return the sign of float x as an int: +1 when x >= 0 (zero counts
    as positive), -1 otherwise. Declared cpdef so the static typing makes
    the comparison cheap.'''
    if x >= 0:
        return 1
    return -1
cdef bytes as_str(data):
    '''Coerce a text object to a byte string: unicode is UTF-8 encoded,
    bytes pass through unchanged, anything else raises TypeError.'''
    if isinstance(data, unicode):
        return data.encode('UTF-8')
    if isinstance(data, bytes):
        return data
    raise TypeError('Cannot convert %s to string' % type(data))
cdef class QueryResult(object):
    '''This class represents a query result, that is,
    a target phrase (tuple of words/strings),
    a feature vector (tuple of floats)
    and possibly an alignment info (string).
    Here we don't bother parsing the alignment info, as it's often only
    used as is, therefore saving some time.'''

    # Target phrase as a tuple of byte strings.
    cdef tuple _words
    # Feature/score vector as a tuple of floats.
    cdef tuple _scores
    # Raw word-alignment string, or None when the table has no alignment info.
    cdef bytes _wa

    def __cinit__(self, words, scores, wa = None):
        '''Requires a tuple of words (as strings) and a tuple of scores (as floats).
        Word-alignment info (as string) may be provided'''
        self._words = words
        self._scores = scores
        self._wa = wa

    @property
    def words(self):
        '''Tuple of words (as strings)'''
        return self._words

    @property
    def scores(self):
        '''Tuple of scores (as floats)'''
        return self._scores

    @property
    def wa(self):
        '''Word-alignment info (as string)'''
        return self._wa

    @staticmethod
    def desc(x, y, keys = lambda r: r.scores[0]):
        '''Returns the sign of keys(y) - keys(x), i.e. a descending-order
        comparator for sorting results.
        Can only be used if scores is not an empty vector as
        keys defaults to scores[0]'''
        return fsign(keys(y) - keys(x))

    def __str__(self):
        '''Returns a string such as: <words> ||| <scores> [||| word-alignment info]'''
        if self._wa:
            return ' ||| '.join( (' '.join(self._words),
                ' '.join([str(x) for x in self._scores]),
                self._wa) )
        else:
            return ' ||| '.join( (' '.join(self._words),
                ' '.join([str(x) for x in self._scores]) ) )

    def __repr__(self):
        return repr((repr(self._words), repr(self._scores), repr(self._wa)))
cdef QueryResult get_query_result(StringTgtCand& cand, object wa = None):
    '''Converts a StringTgtCandidate (c++ object) and possibly a word-alignment info (string)
    to a QueryResult (python object).'''
    # cand.first holds pointers to the target words; c_str() copies each one
    # into an independent Python byte string, so the result does not alias C++ memory.
    cdef tuple words = tuple([cand.first[i].c_str() for i in range(cand.first.size())])
    cdef tuple scores = tuple([cand.second[i] for i in range(cand.second.size())])
    return QueryResult(words, scores, wa)
cdef class BinaryPhraseTable(object):
    '''This class encapsulates a Moses::PhraseDictionaryTree for operations over
    binary phrase tables.'''

    # Owned pointer to the underlying C++ dictionary, allocated in __cinit__
    # and released in __dealloc__.
    cdef PhraseDictionaryTree* __tree
    # Stem path of the table files (e.g. 'europarl.fr-en' for europarl.fr-en.binphr.*).
    cdef bytes _path
    # Number of scores per target candidate (usually 5).
    cdef unsigned _nscores
    # Whether the table carries word-alignment info.
    cdef bint _wa
    # Token delimiters handed to Moses::Tokenize when splitting a query line.
    cdef bytes _delimiters

    def __cinit__(self, bytes path, unsigned nscores = 5, bint wa = False, delimiters = ' \t'):
        '''It requires a path to a binary phrase table (stem of the table, e.g europarl.fr-en
        is the stem for europarl.fr-en.binphr.*).
        Moses::PhraseDictionaryTree also needs to be aware of the number of scores (usually 5),
        and whether or not there is word-alignment info in the table (usually not).
        One can also specify the token delimiters, for Moses::Tokenize(text, delimiters), which is space or tab by default.'''
        # Fail fast with a clear message rather than letting the C++ reader choke.
        if not BinaryPhraseTable.isValidBinaryTable(path, wa):
            raise ValueError, "'%s' doesn't seem a valid binary table." % path
        self._path = path
        self._nscores = nscores
        self._wa = wa
        self._delimiters = delimiters
        self.__tree = new PhraseDictionaryTree(nscores)
        self.__tree.UseWordAlignment(wa)
        self.__tree.Read(string(path))

    def __dealloc__(self):
        # Release the C++ object allocated in __cinit__.
        del self.__tree

    @staticmethod
    def isValidBinaryTable(stem, bint wa = False):
        '''This sanity check was added to the constructor, but you can access it from outside this class
        to determine whether or not you are providing a valid stem to BinaryPhraseTable.'''
        # Tables built with word-alignment info use the '.wa' variants of
        # the srctree/tgtdata files.
        if wa:
            return os.path.isfile(stem + ".binphr.idx") \
                and os.path.isfile(stem + ".binphr.srctree.wa") \
                and os.path.isfile(stem + ".binphr.srcvoc") \
                and os.path.isfile(stem + ".binphr.tgtdata.wa") \
                and os.path.isfile(stem + ".binphr.tgtvoc")
        else:
            return os.path.isfile(stem + ".binphr.idx") \
                and os.path.isfile(stem + ".binphr.srctree") \
                and os.path.isfile(stem + ".binphr.srcvoc") \
                and os.path.isfile(stem + ".binphr.tgtdata") \
                and os.path.isfile(stem + ".binphr.tgtvoc")

    @property
    def path(self):
        '''Stem path of the binary table (as bytes).'''
        return self._path

    @property
    def nscores(self):
        '''Number of scores per target candidate.'''
        return self._nscores

    @property
    def wa(self):
        '''True if the table contains word-alignment info.'''
        return self._wa

    @property
    def delimiters(self):
        '''Delimiters used when tokenizing source phrases.'''
        return self._delimiters

    def query(self, line, cmp = None, top = 0):
        '''Queries the phrase table and returns a list of matches.
        Each match is a QueryResult.
        If 'cmp' is defined the return list is sorted.
        If 'top' is defined, only the top elements will be returned.'''
        cdef bytes text = as_str(line)
        cdef vector[string] fphrase = Tokenize(string(text), string(self._delimiters))
        # Heap-allocated so the C++ API can fill them in place; freed below.
        cdef vector[StringTgtCand]* rv = new vector[StringTgtCand]()
        cdef vector[string]* wa = NULL
        cdef list phrases
        if not self.__tree.UseWordAlignment():
            self.__tree.GetTargetCandidates(fphrase, rv[0])
            phrases = [get_query_result(rv[0][i]) for i in range(rv.size())]
        else:
            # With word alignment enabled, wa[0][i] is the alignment string
            # for candidate i.
            wa = new vector[string]()
            self.__tree.GetTargetCandidates(fphrase, rv[0], wa[0])
            phrases = [get_query_result(rv[0][i], wa[0][i].c_str()) for i in range(rv.size())]
            del wa
        del rv
        # NOTE: 'sort(cmp=...)' is Python 2 only; this module predates Python 3.
        if cmp:
            phrases.sort(cmp=cmp)
        if top > 0:
            return phrases[0:top]
        else:
            return phrases

31
contrib/python/example.py Normal file
View File

@ -0,0 +1,31 @@
import binpt
#from binpt import QueryResult
import sys
if len(sys.argv) < 3:
print "Usage: %s phrase-table nscores [wa] < query > result" % (sys.argv[0])
sys.exit(0)
pt_file = sys.argv[1]
nscores = int(sys.argv[2])
wa = len(sys.argv) == 4
pt = binpt.BinaryPhraseTable(pt_file, nscores, wa)
print >> sys.stderr, "-ttable %s -nscores %d -alignment-info %s -delimiter '%s'\n" %(pt.path, pt.nscores, str(pt.wa), pt.delimiters)
for line in sys.stdin:
f = line.strip()
matches = pt.query(f, cmp = binpt.QueryResult.desc, top = 20)
print '\n'.join([' ||| '.join((f, str(e))) for e in matches])
'''
# This is how one would use the QueryResult object
for e in matches:
print ' '.join(e.words) # tuple of strings
print e.scores # tuple of floats
if e.wa:
print e.wa # string
'''

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,2 @@
1 essa
0 casa

Binary file not shown.

View File

@ -0,0 +1,4 @@
3 this
2 location
1 house
0 building

View File

@ -0,0 +1,4 @@
casa ||| building ||| 0.6 0.75 0.35 0.35 2.718 ||| 0-0 ||| 2 2
casa ||| house ||| 0.7 0.75 0.35 0.35 2.718 ||| 0-0 ||| 2 2
casa ||| location ||| 0.5 0.75 0.35 0.35 2.718 ||| 0-0 ||| 2 2
essa casa ||| this house ||| 0.7 0.5 0.8 0.6 2.718 ||| 0-0 1-1 ||| 2 2

47
contrib/python/setup.py Normal file
View File

@ -0,0 +1,47 @@
from distutils.core import setup
from distutils.extension import Extension
import os
import sys

# Home-grown switches are consumed from the end of sys.argv so that
# distutils never sees them.
available_switches = ['--with-cmph']
with_cmph = False
while sys.argv[-1] in available_switches:
    switch = sys.argv.pop()
    if switch == '--with-cmph':
        with_cmph = True

#### From here you probably don't need to change anything
#### unless a new dependency shows up in Moses
mosesdir = os.path.abspath('../../')
includes = [mosesdir, os.path.join(mosesdir, 'moses/src'), os.path.join(mosesdir, 'util')]
libdir = os.path.join(mosesdir, 'lib')

# System/runtime libraries, Moses libraries, and optional extras.
basic = ['z', 'stdc++', 'pthread', 'm', 'gcc_s', 'c',
         'boost_system', 'boost_thread', 'boost_filesystem', 'rt']
moses = ['OnDiskPt', 'kenutil', 'kenlm', 'LM', 'mert_lib', 'moses_internal',
         'CYKPlusParser', 'Scope3Parser', 'fuzzy-match', 'RuleTable',
         'CompactPT', 'moses', 'dynsa', 'pcfg_common' ]
additional = []
if with_cmph:
    additional.append('cmph')

# Link the Moses shared objects directly in addition to -l flags.
exobj = [os.path.join(libdir, 'lib' + name + '.so') for name in moses]

ext_modules = [
    Extension(name = 'binpt',
              sources = ['binpt/binpt.cpp'],
              language = 'C++',
              include_dirs = includes,
              extra_objects = exobj,
              library_dirs = [libdir],
              runtime_library_dirs = [libdir],
              libraries = basic + moses + additional,
              extra_compile_args = ['-O3', '-DNDEBUG'],
              )
]

setup(
    name='binpt',
    ext_modules=ext_modules
)

View File

@ -0,0 +1 @@
Wang Ling - lingwang at cs dot cmu dot edu

View File

@ -0,0 +1,91 @@
Implementation of the Relative Entropy-based Phrase table filtering algorithm by Wang Ling (Ling et al, 2012).
This implementation also calculates the significance scores for the phrase tables based on Fisher's exact test (Johnson et al., 2007). It uses a slightly modified version of the "sigtest-filter" by Chris Dyer.
-------BUILD INSTRUCTIONS-------
1 - Build the sigtest-filter binary
1.1 - Download and build SALM available at http://projectile.sv.cmu.edu/research/public/tools/salm/salm.htm
1.2 - Run "make SALMDIR=<path_to_salm>" in "<path_to_moses>/contrib/relent-filter/sigtest-filter" to create the executable filter-pt
2 - Build moses project by running "./bjam <options>", this will create the executables for relent filtering
-------USAGE INSTRUCTIONS-------
Required files:
s_train - source training file
t_train - target training file
moses_ini - path to the moses configuration file ( after tuning )
pruning_binaries - path to the relent pruning binaries ( should be "<path_to_moses>/bin" )
pruning_scripts - path to the relent pruning scripts ( should be "<path_to_moses>/contrib/relent-filter/scripts" )
sigbin - path to the sigtest filter binaries ( should be "<path_to_moses>/contrib/relent-filter/sigtest-filter" )
output_dir - path to write the output
1 - build suffix arrays for the source and target parallel training data
1.1 - run "<path to salm>/Bin/Linux/Index/IndexSA.O32 <s_train>" (or IndexSA.O64)
1.2 - run "<path to salm>/Bin/Linux/Index/IndexSA.O32 <t_train>" (or IndexSA.O64)
2 - calculate phrase pair scores by running:
perl <pruning_scripts>/calcPruningScores.pl -moses_ini <moses_ini> -training_s <s_train> -training_t <t_train> -prune_bin <pruning_binaries> -prune_scripts <pruning_scripts> -moses_scripts <path_to_moses>/scripts/training/ -workdir <output_dir> -dec_size 10000
this will create the following files in the <output_dir/scores/> dir:
count.txt - counts of the phrase pairs for N(s,t) N(s,*) and N(*,t)
divergence.txt - negative log of the divergence of the phrase pair
empirical.txt - empirical distribution of the phrase pairs N(s,t)/N(*,*)
rel_ent.txt - relative entropy of the phrase pairs
significance.txt - significance of the phrase pairs
You can use any one of these files for pruning and also combine these scores using <pruning_scripts>/interpolateScores.pl
3 - To actually prune a phrase table you should run <pruning_scripts>/prunePT.pl
For instance, to prune 30% of the phrase table using rel_ent run:
perl <pruning_scripts>/prunePT.pl -table <phrase_table_file> -scores <output_dir>/scores/rel_ent.txt -percentage 70 > <pruned_phrase_table_file>
You can also prune by threshold
perl <pruning_scripts>/prunePT.pl -table <phrase_table_file> -scores <output_dir>/scores/rel_ent.txt -threshold 0.1 > <pruned_phrase_table_file>
The same must be done for the reordering table by replacing <phrase_table_file> with the <reord_table_file>
perl <pruning_scripts>/prunePT.pl -table <reord_table_file> -scores <output_dir>/scores/rel_ent.txt -percentage 70 > <pruned_reord_table_file>
-------RUNNING STEP 2 IN PARALLEL-------
Step 2 requires the forced decoding of the whole set of phrase pairs in the table, so unless you test it on a small corpus, it usually requires a large amount of time to process.
Thus, we recommend users to run multiple instances of "<pruning_scripts>/calcPruningScores.pl" in parallel to process different parts of the phrase table.
To do this, run:
perl <pruning_scripts>/calcPruningScores.pl -moses_ini <moses_ini> -training_s <s_train> -training_t <t_train> -prune_bin <pruning_binaries> -prune_scripts <pruning_scripts> -moses_scripts <path_to_moses>/scripts/training/ -workdir <output_dir> -dec_size 10000 -start 0 -end 100000
The -start and -end tags tell the script to only calculate the results for phrase pairs between 0 and 99999.
Thus, an example of a shell script to run for the whole phrase table would be:
size=`wc <phrase_table_file> | gawk '{print $1}'`
phrases_per_process=100000
for i in $(seq 0 $phrases_per_process $size)
do
end=`expr $i + $phrases_per_process`
perl <pruning_scripts>/calcPruningScores.pl -moses_ini <moses_ini> -training_s <s_train> -training_t <t_train> -prune_bin <pruning_binaries> -prune_scripts <pruning_scripts> -moses_scripts <path_to_moses>/scripts/training/ -workdir <output_dir>.$i-$end -dec_size 10000 -start $i -end $end
done
After all processes finish, simply join the partial score files together in the same order.
-------REFERENCES-------
Ling, W., Graça, J., Trancoso, I., and Black, A. (2012). Entropy-based pruning for phrase-based
machine translation. In Proceedings of the 2012
Joint Conference on Empirical Methods in Natural Language Processing and
Computational Natural Language Learning (EMNLP-CoNLL), pp. 962-971.
H. Johnson, J. Martin, G. Foster and R. Kuhn. (2007) Improving Translation
Quality by Discarding Most of the Phrasetable. In Proceedings of the 2007
Joint Conference on Empirical Methods in Natural Language Processing and
Computational Natural Language Learning (EMNLP-CoNLL), pp. 967-975.

View File

@ -0,0 +1,53 @@
#!/usr/bin/perl -w
# Computes the empirical distribution N(s,t)/N(*,*) for a list of phrase-pair
# counts: every input line starts with a count, and the output is that count
# divided by the sum of all counts (one score per line, same order).
# read arguments
my $countFile = $ARGV[0];
my $ZCAT = "gzip -cd";
my $BZCAT = "bzcat";
&process_count_file($countFile);
sub process_count_file {
  # Two passes over the count file: first sum all counts to obtain the
  # normalizer N(*,*), then print count/N(*,*) for every line.
  $file = $_[0];
  open(COUNT_READER, &open_compressed($file)) or die "ERROR: Can't read $file";
  # Bug fix: the progress messages lacked a trailing newline, so successive
  # STDERR lines ran together.
  print STDERR "reading file to calculate normalizer\n";
  $normalizer=0;
  while(<COUNT_READER>) {
    my $line = $_;
    chomp($line);
    my @line_array = split(/\s+/, $line);
    my $count = $line_array[0];
    $normalizer+=$count;
  }
  close(COUNT_READER);
  # Guard against an empty file or all-zero counts: dividing by zero below
  # would only produce warnings and bogus scores.
  die "ERROR: normalizer is zero (empty count file '$file'?)\n" if $normalizer == 0;
  print STDERR "reading file again to print the counts\n";
  open(COUNT_READER, &open_compressed($file)) or die "ERROR: Can't read $file";
  while(<COUNT_READER>) {
    my $line = $_;
    chomp($line);
    my @line_array = split(/\s+/, $line);
    my $score = $line_array[0]/$normalizer;
    print $score."\n";
  }
  close(COUNT_READER);
}
sub open_compressed {
  # Resolve $file to something open() understands: fall back to a .bz2/.gz
  # sibling when the plain file is missing, and return a decompression pipe
  # for compressed inputs.
  my ($file) = @_;
  print STDERR "FILE: $file\n";
  # add extensions, if necessary
  unless (-e $file) {
    if    (-e "$file.bz2") { $file = "$file.bz2"; }
    elsif (-e "$file.gz")  { $file = "$file.gz"; }
  }
  # pipe zipped, if necessary
  if ($file =~ /\.bz2$/) { return "$BZCAT $file|"; }
  if ($file =~ /\.gz$/)  { return "$ZCAT $file|"; }
  return $file;
}

View File

@ -0,0 +1,351 @@
#!/usr/bin/perl -w
# Driver for relative-entropy phrase-table pruning: for every phrase pair of
# a Moses model it computes counts/significance, the empirical distribution,
# the divergence, and finally the relative entropy (see the relent-filter
# README for the overall workflow).
use Getopt::Long;
use File::Basename;
use POSIX;
# read arguments
my $line_start = 0;
my $line_end = LONG_MAX;
my $tmp_dir = "";
my $dec_size = LONG_MAX;
$_HELP = 1 if (@ARGV < 1 or !GetOptions ("moses_ini=s" => \$moses_ini, #moses conf file
  "start:i" => \$line_start, #first phrase to process
  "end:i" => \$line_end, #last sentence to process (not including)
  "training_s=s" => \$training_s, #source training file
  "training_t=s" => \$training_t, #target training file
  "prune_bin=s" => \$prune_bin, #binary files in the pruning toolkit
  "prune_scripts=s" => \$prune_scripts, #scripts in the pruning toolkit
  "sig_bin=s" => \$sig_bin, #binary files to calculate significance
  "moses_scripts=s" => \$moses_scripts, #dir with the moses scripts
  "tmp_dir:s" => \$tmp_dir, #dir for temporary files (defaults to <workdir>/tmp)
  "dec_size:i" => \$dec_size, #number of phrase pairs force-decoded per batch
  "workdir=s" => \$workdir)); #directory to put all the output files
# help message if arguments are not correct
if ($_HELP) {
  print "
Usage: perl calcPruningScores.pl [PARAMS]
Function: Calculates relative entropy for each phrase pair in a translation model.
Authors: Wang Ling ( lingwang at cs dot cmu dot edu )
PARAMS:
  -moses_ini : moses configuration file with the model to prune (phrase table, reordering table, weights etc...)
  -training_s : source training file, please run salm first
  -training_t : target training file, please run salm first
  -prune_bin : path to the binaries for pruning (probably <PATH_TO_MOSES>/bin)
  -prune_scripts : path to the scripts for pruning (probably the directory where this script is)
  -sig_bin : path to the binary for significance testing included in this toolkit
  -moses_scripts : path to the moses training scripts (where filter-model-given-input.pl is)
  -workdir : directory to produce the output
  -tmp_dir : directory to store temporary files (improve performance if stored in a local disk), omit to store in workdir
  -dec_size : number of phrase pairs to be decoded at a time, omit to decode all selected phrase pairs at once
  -start and -end : starting and ending phrase pairs to process, to be used if you want to launch multiple processes in parallel for different parts of the phrase table. If specified the process will process the phrase pairs from <start> to <end-1>
For any questions contact lingwang at cs dot cmu dot edu
";
  exit(1);
}
# setting up working dirs
my $TMP_DIR = $tmp_dir;
if ($tmp_dir eq ""){
  # Default the scratch space to a subdirectory of the work dir.
  $TMP_DIR = "$workdir/tmp";
}
my $SCORE_DIR = "$workdir/scores";
my $FILTER_DIR = "$TMP_DIR/filter";
# files for divergence module
my $SOURCE_FILE = "$TMP_DIR/source.txt";
my $CONSTRAINT_FILE = "$TMP_DIR/constraint.txt";
my $DIVERGENCE_FILE = "$SCORE_DIR/divergence.txt";
# files for significance module
my $SIG_TABLE_FILE = "$TMP_DIR/source_target.txt";
my $SIG_MOD_OUTPUT = "$TMP_DIR/sig_mod.out";
my $SIG_FILE = "$SCORE_DIR/significance.txt";
my $COUNT_FILE = "$SCORE_DIR/count.txt";
my $EMP_DIST_FILE= "$SCORE_DIR/empirical.txt";
my $REL_ENT_FILE= "$SCORE_DIR/rel_ent.txt";
# setting up executables
my $ZCAT = "gzip -cd";
my $BZCAT = "bzcat";
my $CP = "cp";
my $SED = "sed";
my $RM = "rm";
my $SORT_EXEC = "sort";
my $PRUNE_EXEC = "$prune_bin/calcDivergence";
my $SIG_EXEC = "$sig_bin/filter-pt";
my $FILTER_EXEC = "perl $moses_scripts/filter-model-given-input.pl";
my $CALC_EMP_EXEC ="perl $prune_scripts/calcEmpiricalDistribution.pl";
my $INT_TABLE_EXEC = "perl $prune_scripts/interpolateScores.pl";
# moses ini variables
my ($TRANSLATION_TABLE_FILE, $REORDERING_TABLE_FILE);
# phrase table variables
my ($N_PHRASES, $N_PHRASES_TO_PROCESS);
# main functions: each stage writes its results under $SCORE_DIR.
&prepare();
&calc_sig_and_counts();
&calc_div();
&clear_up();
# (1) preparing data
sub prepare {
  # Create the working directories, read the table paths from moses.ini,
  # copy the tables to scratch space, dump the source/constraint/table
  # files, and clamp the [start, end) range to the table size.
  print STDERR "(1) preparing data @ ".`date`;
  safesystem("mkdir -p $workdir") or die("ERROR: could not create work dir $workdir");
  safesystem("mkdir -p $TMP_DIR") or die("ERROR: could not create work dir $TMP_DIR");
  safesystem("mkdir -p $SCORE_DIR") or die("ERROR: could not create work dir $SCORE_DIR");
  &get_moses_ini_params();
  &copy_tables_to_tmp_dir();
  &write_data_files();
  $N_PHRASES = &get_number_of_phrases();
  $line_end = ($line_end > $N_PHRASES) ? $N_PHRASES : $line_end;
  $N_PHRASES_TO_PROCESS = $line_end - $line_start;
}
sub write_data_files {
  # From the phrase table, write three parallel files restricted to the
  # [start, end) range: the source phrases (decoder input), the target
  # phrases (forced-decoding constraints), and the src|||tgt|||scores
  # triples fed to the significance filter.
  open(SOURCE_WRITER,">".$SOURCE_FILE) or die "ERROR: Can't write $SOURCE_FILE";
  open(CONSTRAINT_WRITER,">".$CONSTRAINT_FILE) or die "ERROR: Can't write $CONSTRAINT_FILE";
  open(TABLE_WRITER,">".$SIG_TABLE_FILE) or die "ERROR: Can't write $SIG_TABLE_FILE";
  open(TTABLE_READER, &open_compressed($TRANSLATION_TABLE_FILE)) or die "ERROR: Can't read $TRANSLATION_TABLE_FILE";
  # Skip the phrase pairs before the requested start line.
  $line_number = 0;
  while($line_number < $line_start && !eof(TTABLE_READER)){
    <TTABLE_READER>;
    $line_number++;
  }
  while($line_number < $line_end && !eof(TTABLE_READER)) {
    my $line = <TTABLE_READER>;
    chomp($line);
    my @line_array = split(/\s+\|\|\|\s+/, $line);
    my $source = $line_array[0];
    my $target = $line_array[1];
    my $scores = $line_array[2];
    print TABLE_WRITER $source." ||| ".$target." ||| ".$scores."\n";
    print SOURCE_WRITER $source."\n";
    print CONSTRAINT_WRITER $target."\n";
    $line_number++;
  }
  close(SOURCE_WRITER);
  close(CONSTRAINT_WRITER);
  close(TABLE_WRITER);
  close(TTABLE_READER);
}
sub copy_tables_to_tmp_dir {
  # Copy the phrase and reordering tables into scratch space and rewrite
  # moses.ini (via sed) to point at the copies; all later stages then work
  # on the scratch copies only.  The three globals are updated in place.
  $tmp_t_table = "$TMP_DIR/".basename($TRANSLATION_TABLE_FILE);
  $tmp_r_table = "$TMP_DIR/".basename($REORDERING_TABLE_FILE);
  $tmp_moses_ini = "$TMP_DIR/moses.ini";
  $cp_t_cmd = "$CP $TRANSLATION_TABLE_FILE $TMP_DIR";
  $cp_r_cmd = "$CP $REORDERING_TABLE_FILE $TMP_DIR";
  safesystem("$cp_t_cmd") or die("ERROR: could not run:\n $cp_t_cmd");
  safesystem("$cp_r_cmd") or die("ERROR: could not run:\n $cp_r_cmd");
  # NOTE(review): the table paths are substituted with sed using '#' as the
  # delimiter — paths containing '#' or regex metacharacters would break this.
  $sed_cmd = "$SED s#$TRANSLATION_TABLE_FILE#$tmp_t_table#g $moses_ini | $SED s#$REORDERING_TABLE_FILE#$tmp_r_table#g > $tmp_moses_ini";
  safesystem("$sed_cmd") or die("ERROR: could not run:\n $sed_cmd");
  $TRANSLATION_TABLE_FILE = $tmp_t_table;
  $REORDERING_TABLE_FILE = $tmp_r_table;
  $moses_ini = $tmp_moses_ini;
}
# (2) calculating sig and counts
sub calc_sig_and_counts {
  print STDERR "(2) calculating counts and significance".`date`;
  print STDERR "(2.1) running significance module".`date`;
  &run_significance_module();
  print STDERR "(2.2) writing counts and significance tables".`date`;
  &write_counts_and_significance_table();
  # NOTE(review): this "(2.3)" message is printed here, but the empirical
  # distribution is actually computed in step (3.1) by calc_div.
  print STDERR "(2.3) calculating empirical distribution".`date`;
}
sub write_counts_and_significance_table {
  # Split the significance module output ("count ||| sig" per line) into
  # two parallel one-column files: counts and significance scores.
  open(COUNT_WRITER,">".$COUNT_FILE) or die "ERROR: Can't write $COUNT_FILE";
  open(SIG_WRITER,">".$SIG_FILE) or die "ERROR: Can't write $SIG_FILE";
  open(SIG_MOD_READER, &open_compressed($SIG_MOD_OUTPUT)) or die "ERROR: Can't read $SIG_MOD_OUTPUT";
  while(<SIG_MOD_READER>) {
    my($line) = $_;
    chomp($line);
    my @line_array = split(/\s+\|\|\|\s+/, $line);
    my $count = $line_array[0];
    my $sig = $line_array[1];
    print COUNT_WRITER $count."\n";
    print SIG_WRITER $sig."\n";
  }
  close(SIG_MOD_READER);
  close(COUNT_WRITER);
  close(SIG_WRITER);
}
sub run_significance_module {
  # Pipe the src|||tgt|||scores file through the sigtest filter-pt binary;
  # -l -10000 disables threshold filtering so every pair gets a score,
  # -p -c request the significance value and the counts in the output.
  my $sig_cmd = "cat $SIG_TABLE_FILE | $SIG_EXEC -e $training_t -f $training_s -l -10000 -p -c > $SIG_MOD_OUTPUT";
  safesystem("$sig_cmd") or die("ERROR: could not run:\n $sig_cmd");
}
# (3) calculating divergence
sub calc_div {
  # Compute the empirical distribution, force-decode every phrase pair to
  # get its divergence (in batches if the table is larger than $dec_size),
  # then combine both into the relative-entropy score file.
  print STDERR "(3) calculating relative entropy".`date`;
  print STDERR "(3.1) calculating empirical distribution".`date`;
  &calculate_empirical_distribution();
  print STDERR "(3.2) calculating divergence (this might take a while)".`date`;
  if($N_PHRASES_TO_PROCESS > $dec_size) {
    &calculate_divergence_shared("$FILTER_DIR");
  }
  else{
    &calculate_divergence($moses_ini);
  }
  print STDERR "(3.3) calculating relative entropy from empirical and divergence distributions".`date`;
  &calculate_relative_entropy();
}
sub calculate_empirical_distribution {
  # Delegates to calcEmpiricalDistribution.pl: count / total count per line.
  my $emp_cmd = "$CALC_EMP_EXEC $COUNT_FILE > $EMP_DIST_FILE";
  safesystem("$emp_cmd") or die("ERROR: could not run:\n $emp_cmd");
}
sub get_fragmented_file_name {
  # Name of the chunk file covering phrases [$frag, $frag + $interval).
  my ($name, $frag, $interval) = @_;
  my $upper = $frag + $interval;
  return "$name-$frag-$upper";
}
sub calculate_divergence {
  # Force-decode every source phrase against its target constraint with the
  # calcDivergence decoder; pruning options are disabled (-early-discarding
  # 0, huge stack, unlimited ttable) so every pair is scored.
  my $moses_ini_file = $_[0];
  print STDERR "force decoding phrase pairs\n";
  my $prune_cmd = "cat $SOURCE_FILE | $PRUNE_EXEC -f $moses_ini_file -constraint $CONSTRAINT_FILE -early-discarding-threshold 0 -s 100000 -ttable-limit 0 > $DIVERGENCE_FILE 2> /dev/null";
  safesystem("$prune_cmd") or die("ERROR: could not run:\n $prune_cmd");
}
sub calculate_divergence_shared {
  # Batched variant of calculate_divergence: split the source/constraint
  # files into $dec_size-line chunks, filter the model for each chunk
  # (keeps the per-batch table small), force-decode it, append the scores
  # to $DIVERGENCE_FILE, and remove the per-chunk filtered model.
  my $filter_dir = $_[0];
  &split_file_into_chunks($SOURCE_FILE, $dec_size, $N_PHRASES_TO_PROCESS);
  &split_file_into_chunks($CONSTRAINT_FILE, $dec_size, $N_PHRASES_TO_PROCESS);
  for(my $i = 0; $i < $N_PHRASES_TO_PROCESS; $i = $i + $dec_size) {
    my $filter_cmd = "$FILTER_EXEC ".&get_fragmented_file_name($FILTER_DIR, $i, $dec_size)." $moses_ini ".&get_fragmented_file_name($SOURCE_FILE, $i, $dec_size);
    safesystem("$filter_cmd") or die("ERROR: could not run:\n $filter_cmd");
    my $moses_ini_file = &get_fragmented_file_name($filter_dir, $i, $dec_size)."/moses.ini";
    my $source_file = &get_fragmented_file_name($SOURCE_FILE, $i, $dec_size);
    my $constraint_file = &get_fragmented_file_name($CONSTRAINT_FILE, $i, $dec_size);
    my $prune_cmd;
    print STDERR "force decoding phrase pairs $i to ".($i + $dec_size)."\n";
    # First batch truncates the output file (>), later batches append (>>).
    if($i == 0){
      $prune_cmd = "cat $source_file | $PRUNE_EXEC -f $moses_ini_file -constraint $constraint_file -early-discarding-threshold 0 -s 100000 -ttable-limit 0 > $DIVERGENCE_FILE 2> /dev/null";
    }
    else{
      $prune_cmd = "cat $source_file | $PRUNE_EXEC -f $moses_ini_file -constraint $constraint_file -early-discarding-threshold 0 -s 100000 -ttable-limit 0 >> $DIVERGENCE_FILE 2> /dev/null";
    }
    safesystem("$prune_cmd") or die("ERROR: could not run:\n $prune_cmd");
    my $rm_cmd = "$RM -r ".&get_fragmented_file_name($FILTER_DIR, $i, $dec_size);
    safesystem("$rm_cmd") or die("ERROR: could not run:\n $rm_cmd");
  }
}
sub calculate_relative_entropy {
  # rel_ent = empirical * divergence (per line), computed by
  # interpolateScores.pl with the "*" operation and unit weights.
  my $int_cmd = "$INT_TABLE_EXEC -files \"$EMP_DIST_FILE $DIVERGENCE_FILE\" -weights \"1 1\" -operation \"*\" > $REL_ENT_FILE";
  safesystem("$int_cmd") or die("ERROR: could not run:\n $int_cmd");
}
# (4) clear up stuff that is not needed
sub clear_up {
  # Remove the scratch directory (table copies, chunk files, filter dirs).
  print STDERR "(4) removing tmp dir".`date`;
  $rm_cmd = "$RM -r $TMP_DIR";
  safesystem("$rm_cmd") or die("ERROR: could not run:\n $rm_cmd");
}
# utility functions
sub safesystem {
  # Run a shell command, report abnormal termination on STDERR, and return
  # true iff the command exited with status 0 (Moses' usual helper).
  print STDERR "Executing: @_\n";
  system(@_);
  my $status = $?;
  if ($status == -1) {
    print STDERR "ERROR: Failed to execute: @_\n $!\n";
    exit(1);
  }
  if ($status & 127) {
    printf STDERR "ERROR: Execution of: @_\n died with signal %d, %s coredump\n",
      ($status & 127), ($status & 128) ? 'with' : 'without';
    exit(1);
  }
  my $exitcode = $status >> 8;
  print STDERR "Exit code: $exitcode\n" if $exitcode;
  return ! $exitcode;
}
sub open_compressed {
  # Resolve $file to something open() understands: fall back to a .bz2/.gz
  # sibling when the plain file is missing, and return a decompression pipe
  # for compressed inputs.
  my ($file) = @_;
  print STDERR "FILE: $file\n";
  # add extensions, if necessary
  unless (-e $file) {
    if    (-e "$file.bz2") { $file = "$file.bz2"; }
    elsif (-e "$file.gz")  { $file = "$file.gz"; }
  }
  # pipe zipped, if necessary
  if ($file =~ /\.bz2$/) { return "$BZCAT $file|"; }
  if ($file =~ /\.gz$/)  { return "$ZCAT $file|"; }
  return $file;
}
sub get_moses_ini_params {
  # Extract the phrase-table and reordering-table paths from moses.ini into
  # the $TRANSLATION_TABLE_FILE / $REORDERING_TABLE_FILE globals.
  # NOTE(review): this assumes the path is the 5th (ttable) / 4th
  # (distortion) whitespace-separated field on the single line that follows
  # the section header — confirm against the ini format actually in use.
  open(MOSES_READER, $moses_ini);
  while(<MOSES_READER>) {
    my($line) = $_;
    chomp($line);
    if($line eq "[ttable-file]"){
      $tableLine = <MOSES_READER>;
      chomp($tableLine);
      ($_,$_,$_,$_,$TRANSLATION_TABLE_FILE) = split(" ",$tableLine); # put the other parameters there if needed
    }
    if($line eq "[distortion-file]"){
      $tableLine = <MOSES_READER>;
      chomp($tableLine);
      ($_,$_,$_,$REORDERING_TABLE_FILE) = split(" ",$tableLine); # put the other parameters there if needed
    }
  }
  close(MOSES_READER);
}
sub get_number_of_phrases {
  # Count the lines (= phrase pairs) of the translation table.
  my $count = 0;
  open(TABLE_READER, &open_compressed($TRANSLATION_TABLE_FILE)) or die "ERROR: Can't read $TRANSLATION_TABLE_FILE";
  $count++ while <TABLE_READER>;
  close (TABLE_READER);
  return $count;
}
sub split_file_into_chunks {
  # Split $file_to_split into consecutive chunk files of $chunk_size lines
  # (the last chunk may be shorter), stopping after
  # $number_of_phrases_to_process lines; chunk files are named by
  # &get_fragmented_file_name.
  my ($file_to_split, $chunk_size, $number_of_phrases_to_process) = @_;
  open(SOURCE_READER, &open_compressed($file_to_split)) or die "ERROR: Can't read $file_to_split";
  my $FRAG_SOURCE_WRITER;
  for(my $i = 0; $i < $number_of_phrases_to_process && !eof(SOURCE_READER); $i++) {
    if(($i % $chunk_size) == 0){ # open fragmented file to write
      my $frag_file = &get_fragmented_file_name($file_to_split, $i, $chunk_size);
      open(FRAG_SOURCE_WRITER, ">".$frag_file) or die "ERROR: Can't write $frag_file";
    }
    my $line = <SOURCE_READER>;
    print FRAG_SOURCE_WRITER $line;
    # BUG FIX: the original tested "%i" (an empty hash, numerically 0)
    # instead of "$i", so the close condition was wrong and chunk files were
    # only flushed by the implicit close on re-open or script exit.  Close
    # explicitly at the end of each chunk and after the final line.
    if(($i % $chunk_size) == $chunk_size - 1 || $i == $number_of_phrases_to_process - 1){
      close(FRAG_SOURCE_WRITER);
    }
  }
  # Also release the read handle (the original leaked it until exit).
  close(SOURCE_READER);
}

View File

@ -0,0 +1,94 @@
#!/usr/bin/perl -w
# Combine several per-line score files into a single score per line, giving
# each file a weight and joining the weighted values with +, * or min.
use Getopt::Long;
use File::Basename;
use POSIX;
$operation="+";
# read arguments (the original comments were copy-pasted from another
# script and described the wrong options)
$_HELP = 1 if (@ARGV < 1 or !GetOptions ("files=s" => \$files, #space-separated list of score files
  "weights=s" => \$weights, #one interpolation weight per file
  "operation=s" => \$operation)); #+, * or min
# help message if arguments are not correct
if ($_HELP) {
  # Typo fix in the user-facing help: "interlated" -> proper wording.
  print "Relative Entropy Pruning
Usage: perl interpolateScores.pl [PARAMS]
Function: interpolates any number of score files weighted by their respective weights
Authors: Wang Ling ( lingwang at cs dot cmu dot edu )
PARAMS:
  -files=s : table files to interpolate separated by a space (Ex \"file1 file2 file3\")
  -weights : interpolation weights separated by a space (Ex \"0.3 0.3 0.4\")
  -operation : +,* or min depending on the operation to perform to combine scores
For any questions contact lingwang at cs dot cmu dot edu
";
  exit(1);
}
# Parallel arrays: one input file per weight, in the same order.
@FILES = split(/\s+/, $files);
@WEIGHTS = split(/\s+/, $weights);
my $ZCAT = "gzip -cd";
my $BZCAT = "bzcat";
&interpolate();
sub interpolate {
  # Open one reader per score file and combine them line by line according
  # to $operation, printing one combined score per line.
  my @READERS;
  for($i = 0; $i < @FILES; $i++){
    local *FILE;
    open(FILE, &open_compressed($FILES[$i])) or die "ERROR: Can't read $FILES[$i]";
    push(@READERS, *FILE);
  }
  # BUG FIX: an unsupported -operation used to make the loop below spin
  # forever, because no branch consumed any input and eof() never advanced.
  # Fail fast instead.
  die "ERROR: unknown operation '$operation' (expected +, * or min)\n"
    unless $operation eq "+" || $operation eq "*" || $operation eq "min";
  $FIRST = $READERS[0];
  while(!eof($FIRST)) {
    if($operation eq "+"){
      # weighted sum of the scores
      my $score = 0;
      for($i = 0; $i < @FILES; $i++){
        my $READER = $READERS[$i];
        my $line = <$READER>;
        chomp($line);
        $score += $line*$WEIGHTS[$i];
      }
      print "$score\n";
    }
    if($operation eq "*"){
      # weighted geometric combination: product of score^weight
      my $score = 1;
      for($i = 0; $i < @FILES; $i++){
        my $READER = $READERS[$i];
        my $line = <$READER>;
        chomp($line);
        $score *= $line ** $WEIGHTS[$i];
      }
      print "$score\n";
    }
    if($operation eq "min"){
      # minimum of the weighted scores
      my $score = 99999;
      for($i = 0; $i < @FILES; $i++){
        my $READER = $READERS[$i];
        my $line = <$READER>;
        chomp($line);
        if ($score > $line*$WEIGHTS[$i]){
          $score = $line*$WEIGHTS[$i];
        }
      }
      print "$score\n";
    }
  }
}
sub open_compressed {
  # Resolve $file to something open() understands: fall back to a .bz2/.gz
  # sibling when the plain file is missing, and return a decompression pipe
  # for compressed inputs.
  my ($file) = @_;
  print STDERR "FILE: $file\n";
  # add extensions, if necessary
  unless (-e $file) {
    if    (-e "$file.bz2") { $file = "$file.bz2"; }
    elsif (-e "$file.gz")  { $file = "$file.gz"; }
  }
  # pipe zipped, if necessary
  if ($file =~ /\.bz2$/) { return "$BZCAT $file|"; }
  if ($file =~ /\.gz$/)  { return "$ZCAT $file|"; }
  return $file;
}

View File

@ -0,0 +1,114 @@
#!/usr/bin/perl -w
# Prune a phrase (or reordering) table: keep only the entries whose score,
# read line-by-line from a parallel score file, reaches a threshold given
# directly (-threshold) or derived from a retention percentage (-percentage).
# read arguments
my $tmp_dir = "";
my $percentage = -1;
my $threshold = -1;
use Getopt::Long;
# BUG FIX: "threshold=f" (was "=i") — relative-entropy thresholds such as
# 0.1 (see the relent-filter README) are fractional, and Getopt::Long's
# integer spec rejected them outright.
$_HELP = 1 if (@ARGV < 1 or !GetOptions ("table=s" => \$table, #table to filter
  "scores=s" => \$scores_file, #scores of each phrase pair, should have same size as the table to filter
  "percentage=i" => \$percentage, # percentage of phrase table to remain
  "threshold=f" => \$threshold)); # threshold (score < threshold equals prune entry)
# help message if arguments are not correct
if ($_HELP) {
  # The help now also documents the mandatory -scores option, which the
  # original text omitted.
  print "Relative Entropy Pruning
Usage: perl prunePT.pl [PARAMS]
Function: prunes a phrase table given a score file
Authors: Wang Ling ( lingwang at cs dot cmu dot edu )
PARAMS:
  -table : table to prune
  -scores : file with one score per line, parallel to the table entries
  -percentage : percentage of phrase table to remain (if the scores do not allow the exact percentage if multiple entries have the same threshold, the script chooses to retain more than the given percentage)
  -threshold : threshold to prune (score < threshold equals prune entry), do not use this if percentage is specified
For any questions contact lingwang at cs dot cmu dot edu
";
  exit(1);
}
my $THRESHOLD = $threshold;
if ($percentage != -1){
  $THRESHOLD = &get_threshold_by_percentage($percentage);
}
my $ZCAT = "gzip -cd";
my $BZCAT = "bzcat";
&prune_by_threshold($THRESHOLD);
sub prune_by_threshold {
  # Stream the table and its parallel score file together; print only the
  # table lines whose score reaches the threshold, and report how many
  # entries were dropped.
  my $th = $_[0];
  print STDERR "pruning using threshold $th \n";
  open (SCORE_READER, &open_compressed($scores_file));
  open (TABLE_READER, &open_compressed($table));
  $number_of_phrases=0;
  $number_of_unpruned_phrases=0;
  while(!eof(SCORE_READER) && !eof(TABLE_READER)){
    $score_line = <SCORE_READER>;
    $table_line = <TABLE_READER>;
    chomp($score_line);
    # Keep entries at or above the threshold (score < threshold == prune).
    if($score_line >= $th){
      print $table_line;
      $number_of_unpruned_phrases++;
    }
    $number_of_phrases++;
  }
  print STDERR "pruned ".($number_of_phrases - $number_of_unpruned_phrases)." phrase pairs out of $number_of_phrases\n";
}
sub get_threshold_by_percentage {
  # Derive the pruning threshold from a retention percentage by sorting the
  # scores ascending and taking the score at the cut index.
  # NOTE(review): with $stop_phrase = percentage*N/100, entries kept are
  # those ABOVE the percentage-th quantile, i.e. (100 - percentage)% of the
  # table — verify against the intended "-percentage = percent to remain"
  # semantics before relying on exact retention rates.
  my ($percentage) = @_;
  $ret = 0;
  $number_of_phrases = &get_number_of_phrases();
  $stop_phrase = ($percentage * $number_of_phrases) / 100;
  $phrase_number = 0;
  # The original made a useless extra pass here, reading the whole score
  # file and discarding every line; that dead O(n) pass is removed.
  # Sort numerically (-g handles scientific notation) in the C locale for a
  # deterministic order; the original spelled the locale "c", which is not
  # a valid POSIX locale name.
  open (SCORE_READER, "cat $scores_file | LC_ALL=C sort -g |");
  while(<SCORE_READER>) {
    my $line = $_;
    if($phrase_number >= $stop_phrase){
      chomp($line);
      $ret = $line;
      last;
    }
    $phrase_number++;
  }
  close (SCORE_READER);
  return $ret;
}
sub get_number_of_phrases {
  # Number of lines in the score file == number of phrase pairs.
  my $count = 0;
  open (SCORE_READER, $scores_file);
  $count++ while <SCORE_READER>;
  close (SCORE_READER);
  return $count;
}
sub open_compressed {
  # Resolve $file to something open() understands: fall back to a .bz2/.gz
  # sibling when the plain file is missing, and return a decompression pipe
  # for compressed inputs.
  my ($file) = @_;
  print STDERR "FILE: $file\n";
  # add extensions, if necessary
  unless (-e $file) {
    if    (-e "$file.bz2") { $file = "$file.bz2"; }
    elsif (-e "$file.gz")  { $file = "$file.gz"; }
  }
  # pipe zipped, if necessary
  if ($file =~ /\.bz2$/) { return "$BZCAT $file|"; }
  if ($file =~ /\.gz$/)  { return "$ZCAT $file|"; }
  return $file;
}

View File

@ -0,0 +1,10 @@
# Build the sigtest-filter binary against Joy Zhang's SALM toolkit.
# Override on the command line: make SALMDIR=/path/to/salm [FLAVOR=o32]
SALMDIR=/Users/hieuhoang/workspace/salm
# Suffix of the prebuilt SALM object files (o32 / o64 builds).
FLAVOR?=o64
INC=-I$(SALMDIR)/Src/Shared -I$(SALMDIR)/Src/SuffixArrayApplications -I$(SALMDIR)/Src/SuffixArrayApplications/SuffixArraySearch
OBJS=$(SALMDIR)/Distribution/Linux/Objs/Search/_SuffixArrayApplicationBase.$(FLAVOR) $(SALMDIR)/Distribution/Linux/Objs/Search/_SuffixArraySearchApplicationBase.$(FLAVOR) $(SALMDIR)/Distribution/Linux/Objs/Shared/_String.$(FLAVOR) $(SALMDIR)/Distribution/Linux/Objs/Shared/_IDVocabulary.$(FLAVOR)

all: filter-pt

# check-install verifies SALMDIR before compiling.
# Note: -O6 is non-standard; GCC silently clamps it to -O3.
filter-pt: filter-pt.cpp
	./check-install $(SALMDIR)
	$(CXX) -O6 $(INC) $(OBJS) -o filter-pt filter-pt.cpp

View File

@ -0,0 +1,42 @@
Re-implementation of Johnson et al. (2007)'s phrasetable filtering strategy.
This implementation relies on Joy Zhang's SALM Suffix Array toolkit. It is
available here:
http://projectile.sv.cmu.edu/research/public/tools/salm/salm.htm
--Chris Dyer <redpony@umd.edu>
BUILD INSTRUCTIONS
---------------------------------
1. Download and build SALM.
2. make SALMDIR=/path/to/SALM
USAGE INSTRUCTIONS
---------------------------------
1. Using the SALM/Bin/Linux/Index/IndexSA.O32, create a suffix array index
of the source and target sides of your training bitext.
2. cat phrase-table.txt | ./filter-pt -e TARG.suffix -f SOURCE.suffix \
-l <FILTER-VALUE>
FILTER-VALUE is the -log prob threshold described in Johnson et al.
(2007)'s paper. It may be either 'a+e', 'a-e', or a positive real
value. 'a+e' is a good setting- it filters out <1,1,1> phrase pairs.
I also recommend using -n 30, which filters out all but the top
30 phrase pairs, sorted by P(e|f). This was used in the paper.
3. Run with no options to see more use-cases.
REFERENCES
---------------------------------
H. Johnson, J. Martin, G. Foster and R. Kuhn. (2007) Improving Translation
Quality by Discarding Most of the Phrasetable. In Proceedings of the 2007
Joint Conference on Empirical Methods in Natural Language Processing and
Computational Natural Language Learning (EMNLP-CoNLL), pp. 967-975.

View File

@ -0,0 +1,231 @@
// XGetopt.cpp Version 1.2
//
// Author: Hans Dietrich
// hdietrich2@hotmail.com
//
// Description:
// XGetopt.cpp implements getopt(), a function to parse command lines.
//
// History
// Version 1.2 - 2003 May 17
// - Added Unicode support
//
// Version 1.1 - 2002 March 10
// - Added example to XGetopt.cpp module header
//
// This software is released into the public domain.
// You are free to use it in any way you like.
//
// This software is provided "as is" with no expressed
// or implied warranty. I accept no liability for any
// damage or loss of business that this software may cause.
//
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// if you are using precompiled headers then include this line:
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// if you are not using precompiled headers then include these lines:
//#include <windows.h>
//#include <stdio.h>
//#include <tchar.h>
///////////////////////////////////////////////////////////////////////////////
#include <stdio.h>
#include <string.h>
#include <math.h>
#include "WIN32_functions.h"
///////////////////////////////////////////////////////////////////////////////
//
// X G e t o p t . c p p
//
//
// NAME
// getopt -- parse command line options
//
// SYNOPSIS
// int getopt(int argc, char *argv[], char *optstring)
//
// extern char *optarg;
// extern int optind;
//
// DESCRIPTION
// The getopt() function parses the command line arguments. Its
// arguments argc and argv are the argument count and array as
// passed into the application on program invocation. In the case
// of Visual C++ programs, argc and argv are available via the
// variables __argc and __argv (double underscores), respectively.
// getopt returns the next option letter in argv that matches a
// letter in optstring. (Note: Unicode programs should use
// __targv instead of __argv. Also, all character and string
// literals should be enclosed in ( ) ).
//
// optstring is a string of recognized option letters; if a letter
// is followed by a colon, the option is expected to have an argument
// that may or may not be separated from it by white space. optarg
// is set to point to the start of the option argument on return from
// getopt.
//
// Option letters may be combined, e.g., "-ab" is equivalent to
// "-a -b". Option letters are case sensitive.
//
// getopt places in the external variable optind the argv index
// of the next argument to be processed. optind is initialized
// to 0 before the first call to getopt.
//
// When all options have been processed (i.e., up to the first
// non-option argument), getopt returns EOF, optarg will point
// to the argument, and optind will be set to the argv index of
// the argument. If there are no non-option arguments, optarg
// will be set to NULL.
//
// The special option "--" may be used to delimit the end of the
// options; EOF will be returned, and "--" (and everything after it)
// will be skipped.
//
// RETURN VALUE
// For option letters contained in the string optstring, getopt
// will return the option letter. getopt returns a question mark (?)
// when it encounters an option letter not included in optstring.
// EOF is returned when processing is finished.
//
// BUGS
// 1) Long options are not supported.
// 2) The GNU double-colon extension is not supported.
// 3) The environment variable POSIXLY_CORRECT is not supported.
// 4) The + syntax is not supported.
// 5) The automatic permutation of arguments is not supported.
// 6) This implementation of getopt() returns EOF if an error is
// encountered, instead of -1 as the latest standard requires.
//
// EXAMPLE
// BOOL CMyApp::ProcessCommandLine(int argc, char *argv[])
// {
// int c;
//
// while ((c = getopt(argc, argv, ("aBn:"))) != EOF)
// {
// switch (c)
// {
// case ('a'):
// TRACE(("option a\n"));
// //
// // set some flag here
// //
// break;
//
// case ('B'):
// TRACE( ("option B\n"));
// //
// // set some other flag here
// //
// break;
//
// case ('n'):
// TRACE(("option n: value=%d\n"), atoi(optarg));
// //
// // do something with value here
// //
// break;
//
// case ('?'):
// TRACE(("ERROR: illegal option %s\n"), argv[optind-1]);
// return FALSE;
// break;
//
// default:
// TRACE(("WARNING: no handler for option %c\n"), c);
// return FALSE;
// break;
// }
// }
// //
// // check for non-option args here
// //
// return TRUE;
// }
//
///////////////////////////////////////////////////////////////////////////////
char *optarg; // global argument pointer
int optind = 0; // global argv index

// Parse the next command-line option from argv (Windows replacement for
// POSIX getopt; see the file header comment for the full contract).
// Returns the option letter, '?' for an unrecognized option or a missing
// required argument, and EOF when option processing is finished.
// Resetting optind to 0 restarts the scan from the beginning.
int getopt(int argc, char *argv[], char *optstring)
{
  static char *next = NULL;  // cursor inside a bundled group such as "-ab"
  if (optind == 0)
    next = NULL;
  optarg = NULL;
  if (next == NULL || *next =='\0') {
    // start scanning a new argv element
    if (optind == 0)
      optind++;
    // stop at end of argv, at a non-option word, or at a bare "-";
    // leave optarg pointing at the first non-option argument (if any)
    if (optind >= argc || argv[optind][0] != ('-') || argv[optind][1] == ('\0')) {
      optarg = NULL;
      if (optind < argc)
        optarg = argv[optind];
      return EOF;
    }
    // "--" explicitly terminates option processing
    if (strcmp(argv[optind], "--") == 0) {
      optind++;
      optarg = NULL;
      if (optind < argc)
        optarg = argv[optind];
      return EOF;
    }
    next = argv[optind];
    next++; // skip past -
    optind++;
  }
  char c = *next++;
  char *cp = strchr(optstring, c);
  if (cp == NULL || c == (':'))
    return ('?');  // option letter not in optstring
  cp++;
  if (*cp == (':')) {
    // option requires an argument: either the remainder of this token
    // ("-ofile") or the following argv element ("-o file")
    if (*next != ('\0')) {
      optarg = next;
      next = NULL;
    } else if (optind < argc) {
      optarg = argv[optind];
      optind++;
    } else {
      return ('?');  // required argument is missing
    }
  }
  return c;
}
// for an overview, see
// W. Press, S. Teukolsky and W. Vetterling. (1992) Numerical Recipes in C. Chapter 6.1.
//
// Log-gamma for positive integer arguments via the Lanczos series, i.e.
// lgamma(x) == log((x-1)!). For x <= 2 the exact value is 0 (0! = 1! = 1);
// smaller (invalid) arguments are clamped to 0.0 as well.
double lgamma(int x)
{
  if (x <= 2) {
    return 0.0;
  }
  static double coefs[6] = {76.18009172947146, -86.50532032941677, 24.01409824083091, -1.231739572450155, 0.1208650973866179e-2, -0.5395239384953e-5};
  const double xd = (double)x;
  // Stirling-like leading term of the Lanczos formula
  double lead = xd + 5.5;
  lead -= (xd + 0.5) * log(lead);
  // rational series; denominators are x+1, x+2, ..., x+6
  double denom = xd;
  double series = 1.000000000190015;
  for (size_t term = 0; term < 6; ++term) {
    series += coefs[term] / ++denom;
  }
  return -lead + log(2.5066282746310005 * series / xd);
}

View File

@ -0,0 +1,24 @@
// XGetopt.h Version 1.2
//
// Author: Hans Dietrich
// hdietrich2@hotmail.com
//
// This software is released into the public domain.
// You are free to use it in any way you like.
//
// This software is provided "as is" with no expressed
// or implied warranty. I accept no liability for any
// damage or loss of business that this software may cause.
//
///////////////////////////////////////////////////////////////////////////////
#ifndef XGETOPT_H
#define XGETOPT_H

// Minimal getopt() replacement for Windows builds (implemented in XGetopt.cpp).
// NOTE(review): opterr is declared here but never defined in XGetopt.cpp;
// referencing it will fail at link time — confirm whether it is needed.
extern int optind, opterr;
extern char *optarg;

int getopt(int argc, char *argv[], char *optstring);

// log-gamma for positive integer arguments (Lanczos approximation)
double lgamma(int x);

#endif //XGETOPT_H

View File

@ -0,0 +1,5 @@
#!/usr/bin/perl -w
# Sanity check used by the build: verify that the SALM installation
# directory given as the first argument exists, else abort with advice.
use strict;
my $path = shift @ARGV;
if (! -d $path) {
  die "Can't find SALM installation path: $path\nPlease use:\n\n make SALMDIR=/path/to/SALM\n\n";
}
exit 0;

View File

@ -0,0 +1,377 @@
#include <cstring>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <algorithm>
#include "_SuffixArraySearchApplicationBase.h"
#include <vector>
#include <iostream>
#include <set>
#ifdef WIN32
#include "WIN32_functions.h"
#else
#include <unistd.h>
#endif
typedef std::set<TextLenType> SentIdSet;
typedef std::map<std::string, SentIdSet> PhraseSetMap;
#undef min
// constants
const size_t MINIMUM_SIZE_TO_KEEP = 10000; // reduce this to improve memory usage,
// increase for speed
const std::string SEPARATOR = " ||| ";
const double ALPHA_PLUS_EPS = -1000.0; // dummy value
const double ALPHA_MINUS_EPS = -2000.0; // dummy value
// configuration params
int pfe_filter_limit = 0; // 0 = don't filter anything based on P(f|e)
bool print_cooc_counts = false; // add cooc counts to phrase table?
bool print_neglog_significance = false; // add -log(p) to phrase table?
double sig_filter_limit = 0; // keep phrase pairs with -log(sig) > sig_filter_limit
// higher = filter-more
bool pef_filter_only = false; // only filter based on pef
// globals
PhraseSetMap esets;
double p_111 = 0.0; // alpha
size_t nremoved_sigfilter = 0;
size_t nremoved_pfefilter = 0;
C_SuffixArraySearchApplicationBase e_sa;
C_SuffixArraySearchApplicationBase f_sa;
int num_lines;
// Print the command-line help text to stderr and terminate with status 1.
void usage()
{
  static const char kHelp[] =
    "\nFilter phrase table using significance testing as described\n"
    "in H. Johnson, et al. (2007) Improving Translation Quality\n"
    "by Discarding Most of the Phrasetable. EMNLP 2007.\n"
    "\nUsage:\n"
    "\n filter-pt -e english.suf-arr -f french.suf-arr\n"
    " [-c] [-p] [-l threshold] [-n num] < PHRASE-TABLE > FILTERED-PHRASE-TABLE\n\n"
    " [-l threshold] >0.0, a+e, or a-e: keep values that have a -log significance > this\n"
    " [-n num ] 0, 1...: 0=no filtering, >0 sort by P(e|f) and keep the top num elements\n"
    " [-c ] add the cooccurence counts to the phrase table\n"
    " [-p ] add -log(significance) to the phrasetable\n\n";
  std::cerr << kHelp;
  exit(1);
}
// One phrase-table line, split into its fields, plus the co-occurrence
// statistics attached later by compute_cooc_stats_and_filter().
struct PTEntry {
  PTEntry(const std::string& str, int index);
  std::string f_phrase;  // source phrase
  std::string e_phrase;  // target phrase
  std::string extra;     // everything after the third separator (e.g. alignments)
  std::string scores;    // raw space-separated score field
  float pfe;             // P(f|e), extracted from `scores` at position `index`
  int cf;                // # sentences containing the source phrase
  int ce;                // # sentences containing the target phrase
  int cfe;               // # sentences containing both
  float nlog_pte;        // -log(significance) from Fisher's exact test
  // Record the co-occurrence counts and significance for this entry.
  void set_cooc_stats(int _cef, int _cf, int _ce, float nlp) {
    cfe = _cef;
    cf = _cf;
    ce = _ce;
    nlog_pte = nlp;
  }
};
// Parse one phrase-table line of the form
//   "f-phrase ||| e-phrase ||| scores ||| extra"
// and extract P(f|e) as the `index`-th space-separated field of `scores`.
PTEntry::PTEntry(const std::string& str, int index) :
  cf(0), ce(0), cfe(0), nlog_pte(0.0)
{
  size_t pos = 0;
  std::string::size_type nextPos = str.find(SEPARATOR, pos);
  this->f_phrase = str.substr(pos,nextPos);
  pos = nextPos + SEPARATOR.size();
  nextPos = str.find(SEPARATOR, pos);
  this->e_phrase = str.substr(pos,nextPos-pos);
  pos = nextPos + SEPARATOR.size();
  nextPos = str.find(SEPARATOR, pos);
  this->scores = str.substr(pos,nextPos-pos);
  pos = nextPos + SEPARATOR.size();
  this->extra = str.substr(pos);
  // advance the iterator to the space preceding the wanted score field
  int c = 0;
  std::string::iterator i=scores.begin();
  if (index > 0) {
    for (; i != scores.end(); ++i) {
      if ((*i) == ' ') {
        c++;
        if (c == index) break;
      }
    }
    // step past the separating space (bug fix: the unconditional ++i also
    // skipped the first digit of the score when index == 0)
    if (i != scores.end()) {
      ++i;
    }
  }
  // collect the token into a std::string (bug fix: the old fixed char[24]
  // buffer overflowed on score fields of 24 characters or more)
  std::string token;
  while (i != scores.end() && *i != ' ') {
    token += *i++;
  }
  this->pfe = atof(token.c_str());
}
struct PfeComparer {
bool operator()(const PTEntry* a, const PTEntry* b) const {
return a->pfe > b->pfe;
}
};
// Predicate for std::remove_if: an entry whose negative log-significance
// falls below the threshold is freed and reported as removable.
struct NlogSigThresholder {
  NlogSigThresholder(float threshold) : t(threshold) {}
  float t;
  bool operator()(const PTEntry* a) const {
    if (a->nlog_pte >= t) {
      return false;  // significant enough: keep it
    }
    delete a;
    return true;
  }
};
// Stream one phrase-table entry in the output format.
// NOTE(review): the phrase, score and extra fields below are commented out,
// so only the optional co-occurrence counts and significance are printed —
// this looks like a merge leftover; confirm the intended output format.
std::ostream& operator << (std::ostream& os, const PTEntry& pp)
{
  //os << pp.f_phrase << " ||| " << pp.e_phrase;
  //os << " ||| " << pp.scores;
  //if (pp.extra.size()>0) os << " ||| " << pp.extra;
  if (print_cooc_counts) os << pp.cfe << " " << pp.cf << " " << pp.ce;
  if (print_neglog_significance) os << " ||| " << pp.nlog_pte;
  return os;
}
// Debug helper: dump one 2x2 contingency table plus the ratio used to step
// to the next hypergeometric term, to stderr.
void print(int a, int b, int c, int d, float p)
{
  const double ratio = (double)(b) * (double)(c) / (double)(a + 1) / (double)(d + 1);
  std::cerr << a << "\t" << b << "\t P=" << p << "\n"
            << c << "\t" << d << "\t xf=" << ratio << "\n\n";
}
// 2x2 (one-sided) Fisher's exact test
// see B. Moore. (2004) On Log Likelihood and the Significance of Rare Events
//
// Returns the one-sided p-value P(X >= cfe) for the 2x2 contingency table
// built from the pair counts, using the global num_lines as the corpus size.
// Log-gamma is used so factorials never overflow.
double fisher_exact(int cfe, int ce, int cf)
{
  assert(cfe <= ce);
  assert(cfe <= cf);
  // contingency-table cells
  int a = cfe;                          // sentences containing both phrases
  int b = (cf - cfe);                   // f without e
  int c = (ce - cfe);                   // e without f
  int d = (num_lines - ce - cf + cfe);  // neither
  int n = a + b + c + d;                // total sentence pairs
  // hypergeometric probability of the observed table
  double cp = exp(lgamma(1+a+c) + lgamma(1+b+d) + lgamma(1+a+b) + lgamma(1+c+d) - lgamma(1+n) - lgamma(1+a) - lgamma(1+b) - lgamma(1+c) - lgamma(1+d));
  double total_p = 0.0;
  int tc = std::min(b,c);  // number of more-extreme tables in the tail
  // Sum the tail. Each successive term is derived from the previous one via
  // the ratio of hypergeometric terms, so the gammas are evaluated only once.
  for (int i=0; i<=tc; i++) {
    total_p += cp;
    // double lg = lgamma(1+a+c) + lgamma(1+b+d) + lgamma(1+a+b) + lgamma(1+c+d) - lgamma(1+n) - lgamma(1+a) - lgamma(1+b) - lgamma(1+c) - lgamma(1+d); double cp = exp(lg);
    // print(a,b,c,d,cp);
    double coef = (double)(b)*(double)(c)/(double)(a+1)/(double)(d+1);
    cp *= coef;
    // shift one count from the off-diagonal to the diagonal cells
    ++a;
    --c;
    ++d;
    --b;
  }
  return total_p;
}
// input: unordered list of translation options for a single source phrase
//
// Attaches co-occurrence counts and Fisher-test significance to every entry,
// after optionally pruning `options` down to the pfe_filter_limit best
// entries by P(f|e). Entries failing the significance threshold are deleted
// and removed from `options` in place.
void compute_cooc_stats_and_filter(std::vector<PTEntry*>& options)
{
  // Step 1: optional count-based pruning by P(f|e).
  if (pfe_filter_limit>0 && options.size() > pfe_filter_limit) {
    nremoved_pfefilter += (options.size() - pfe_filter_limit);
    std::nth_element(options.begin(), options.begin()+pfe_filter_limit, options.end(), PfeComparer());
    for (std::vector<PTEntry*>::iterator i=options.begin()+pfe_filter_limit; i != options.end(); ++i)
      delete *i;
    options.erase(options.begin()+pfe_filter_limit,options.end());
  }
  if (pef_filter_only) return;
  // Step 2: collect the set of sentence ids containing the source phrase
  // via a suffix-array lookup; its size is the marginal count cf.
  SentIdSet fset;
  vector<S_SimplePhraseLocationElement> locations;
  //std::cerr << "Looking up f-phrase: " << options.front()->f_phrase << "\n";
  locations = f_sa.locateExactPhraseInCorpus(options.front()->f_phrase.c_str());
  if(locations.size()==0) {
    cerr<<"No occurrences found!!\n";
  }
  for (vector<S_SimplePhraseLocationElement>::iterator i=locations.begin();
       i != locations.end();
       ++i) {
    fset.insert(i->sentIdInCorpus);
  }
  size_t cf = fset.size();
  for (std::vector<PTEntry*>::iterator i=options.begin(); i != options.end(); ++i) {
    const std::string& e_phrase = (*i)->e_phrase;
    size_t cef=0;
    // esets caches target-phrase sentence sets across source phrases;
    // an empty entry means "not looked up yet"
    SentIdSet& eset = esets[(*i)->e_phrase];
    if (eset.empty()) {
      //std::cerr << "Looking up e-phrase: " << e_phrase << "\n";
      vector<S_SimplePhraseLocationElement> locations = e_sa.locateExactPhraseInCorpus(e_phrase.c_str());
      for (vector<S_SimplePhraseLocationElement>::iterator i=locations.begin(); i!= locations.end(); ++i) {
        TextLenType curSentId = i->sentIdInCorpus;
        eset.insert(curSentId);
      }
    }
    size_t ce=eset.size();
    // count co-occurring sentence ids by scanning the smaller of the two sets
    if (ce < cf) {
      for (SentIdSet::iterator i=eset.begin(); i != eset.end(); ++i) {
        if (fset.find(*i) != fset.end()) cef++;
      }
    } else {
      for (SentIdSet::iterator i=fset.begin(); i != fset.end(); ++i) {
        if (eset.find(*i) != eset.end()) cef++;
      }
    }
    double nlp = -log(fisher_exact(cef, cf, ce));
    (*i)->set_cooc_stats(cef, cf, ce, nlp);
    // evict small cached sets to bound memory use (cheap to recompute)
    if (ce < MINIMUM_SIZE_TO_KEEP) {
      esets.erase(e_phrase);
    }
  }
  // Step 3: significance pruning; the predicate deletes rejected entries.
  std::vector<PTEntry*>::iterator new_end =
    std::remove_if(options.begin(), options.end(), NlogSigThresholder(sig_filter_limit));
  nremoved_sigfilter += (options.end() - new_end);
  options.erase(new_end,options.end());
}
// Driver: parse options, load the two suffix-array corpora, then stream the
// phrase table from stdin, grouping consecutive lines by source phrase,
// filtering each group and writing survivors to stdout. Progress and a final
// summary go to stderr.
int main(int argc, char * argv[])
{
  int c;
  const char* efile=0;   // target-side suffix-array corpus
  const char* ffile=0;   // source-side suffix-array corpus
  int pfe_index = 2;     // position of P(f|e) among the score fields
  // ---- option parsing ----
  while ((c = getopt(argc, argv, "cpf:e:i:n:l:")) != -1) {
    switch (c) {
    case 'e':
      efile = optarg;
      break;
    case 'f':
      ffile = optarg;
      break;
    case 'i': // index of pfe in phrase table
      pfe_index = atoi(optarg);
      break;
    case 'n': // keep only the top n entries in phrase table sorted by p(f|e) (0=all)
      pfe_filter_limit = atoi(optarg);
      std::cerr << "P(f|e) filter limit: " << pfe_filter_limit << std::endl;
      break;
    case 'c':
      print_cooc_counts = true;
      break;
    case 'p':
      print_neglog_significance = true;
      break;
    case 'l':
      // threshold: a numeric value, or "a+e"/"a-e" for alpha +/- epsilon
      // (resolved after alpha is computed below)
      std::cerr << "-l = " << optarg << "\n";
      if (strcmp(optarg,"a+e") == 0) {
        sig_filter_limit = ALPHA_PLUS_EPS;
      } else if (strcmp(optarg,"a-e") == 0) {
        sig_filter_limit = ALPHA_MINUS_EPS;
      } else {
        char *x;
        sig_filter_limit = strtod(optarg, &x);
      }
      break;
    default:
      usage();
    }
  }
  //-----------------------------------------------------------------------------
  if (optind != argc || ((!efile || !ffile) && !pef_filter_only)) {
    usage();
  }
  //load the indexed corpus with vocabulary(noVoc=false) and with offset(noOffset=false)
  if (!pef_filter_only) {
    e_sa.loadData_forSearch(efile, false, false);
    f_sa.loadData_forSearch(ffile, false, false);
    size_t elines = e_sa.returnTotalSentNumber();
    size_t flines = f_sa.returnTotalSentNumber();
    if (elines != flines) {
      std::cerr << "Number of lines in e-corpus != number of lines in f-corpus!\n";
      usage();
    } else {
      std::cerr << "Training corpus: " << elines << " lines\n";
      num_lines = elines;
    }
    // alpha = -log significance of a (1,1,1) pair, the weakest possible evidence
    p_111 = -log(fisher_exact(1,1,1));
    std::cerr << "\\alpha = " << p_111 << "\n";
    if (sig_filter_limit == ALPHA_MINUS_EPS) {
      sig_filter_limit = p_111 - 0.001;
    } else if (sig_filter_limit == ALPHA_PLUS_EPS) {
      sig_filter_limit = p_111 + 0.001;
    }
    std::cerr << "Sig filter threshold is = " << sig_filter_limit << "\n";
  } else {
    std::cerr << "Filtering using P(e|f) only. n=" << pfe_filter_limit << std::endl;
  }
  // ---- main loop: group phrase-table lines by source phrase ----
  char tmpString[10000];
  std::string prev = "";
  std::vector<PTEntry*> options;   // pending entries for the current source phrase
  size_t pt_lines = 0;
  while(!cin.eof()) {
    cin.getline(tmpString,10000,'\n');
    if(++pt_lines%10000==0) {
      std::cerr << ".";
      if(pt_lines%500000==0) std::cerr << "[n:"<<pt_lines<<"]\n";
    }
    if(strlen(tmpString)>0) {
      PTEntry* pp = new PTEntry(tmpString, pfe_index);
      if (prev != pp->f_phrase) {
        // source phrase changed: filter and flush the previous group
        prev = pp->f_phrase;
        if (!options.empty()) { // always true after first line
          compute_cooc_stats_and_filter(options);
        }
        for (std::vector<PTEntry*>::iterator i=options.begin(); i != options.end(); ++i) {
          std::cout << **i << std::endl;
          delete *i;
        }
        options.clear();
        options.push_back(pp);
      } else {
        options.push_back(pp);
      }
      // for(int i=0;i<locations.size(); i++){
      // cout<<"SentId="<<locations[i].sentIdInCorpus<<" Pos="<<(int)locations[i].posInSentInCorpus<<endl;
      // }
    }
  }
  // flush the final group
  compute_cooc_stats_and_filter(options);
  for (std::vector<PTEntry*>::iterator i=options.begin(); i != options.end(); ++i) {
    std::cout << **i << std::endl;
    delete *i;
  }
  // ---- summary statistics ----
  float pfefper = (100.0*(float)nremoved_pfefilter)/(float)pt_lines;
  float sigfper = (100.0*(float)nremoved_sigfilter)/(float)pt_lines;
  std::cerr << "\n\n------------------------------------------------------\n"
            << " unfiltered phrases pairs: " << pt_lines << "\n"
            << "\n"
            << " P(f|e) filter [first]: " << nremoved_pfefilter << " (" << pfefper << "%)\n"
            << " significance filter: " << nremoved_sigfilter << " (" << sigfper << "%)\n"
            << " TOTAL FILTERED: " << (nremoved_pfefilter + nremoved_sigfilter) << " (" << (sigfper + pfefper) << "%)\n"
            << "\n"
            << " FILTERED phrase pairs: " << (pt_lines - nremoved_pfefilter - nremoved_sigfilter) << " (" << (100.0-sigfper - pfefper) << "%)\n"
            << "------------------------------------------------------\n";
  return 0;
}

View File

@ -0,0 +1,20 @@

Microsoft Visual Studio Solution File, Format Version 9.00
# Visual Studio 2005
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "sigtest-filter", "sigtest-filter.vcproj", "{FA2910DF-FD9D-4E6D-A393-9F9F9E309E78}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Win32 = Debug|Win32
Release|Win32 = Release|Win32
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{FA2910DF-FD9D-4E6D-A393-9F9F9E309E78}.Debug|Win32.ActiveCfg = Debug|Win32
{FA2910DF-FD9D-4E6D-A393-9F9F9E309E78}.Debug|Win32.Build.0 = Debug|Win32
{FA2910DF-FD9D-4E6D-A393-9F9F9E309E78}.Release|Win32.ActiveCfg = Release|Win32
{FA2910DF-FD9D-4E6D-A393-9F9F9E309E78}.Release|Win32.Build.0 = Release|Win32
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@ -0,0 +1,580 @@
// $Id$
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (c) 2006 University of Edinburgh
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the University of Edinburgh nor the names of its contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
***********************************************************************/
// example file on how to use moses library
#include <iostream>
#include <stack>
#include "TypeDef.h"
#include "Util.h"
#include "IOWrapper.h"
#include "Hypothesis.h"
#include "WordsRange.h"
#include "TrellisPathList.h"
#include "StaticData.h"
#include "DummyScoreProducers.h"
#include "InputFileStream.h"
using namespace std;
using namespace Moses;
namespace MosesCmd
{
// Construct an IOWrapper that reads input sentences from standard input
// (no input file path is given); output streams are set up by Initialization().
IOWrapper::IOWrapper(
  const vector<FactorType> &inputFactorOrder
  , const vector<FactorType> &outputFactorOrder
  , const FactorMask &inputFactorUsed
  , size_t nBestSize
  , const string &nBestFilePath)
  :m_inputFactorOrder(inputFactorOrder)
  ,m_outputFactorOrder(outputFactorOrder)
  ,m_inputFactorUsed(inputFactorUsed)
  ,m_inputFile(NULL)
  ,m_inputStream(&std::cin)  // read from stdin; stream is not owned
  ,m_nBestStream(NULL)
  ,m_outputWordGraphStream(NULL)
  ,m_outputSearchGraphStream(NULL)
  ,m_detailedTranslationReportingStream(NULL)
  ,m_alignmentOutputStream(NULL)
{
  Initialization(inputFactorOrder, outputFactorOrder
                 , inputFactorUsed
                 , nBestSize, nBestFilePath);
}
// Construct an IOWrapper that reads input sentences from the given file;
// the InputFileStream is owned and released by the destructor.
IOWrapper::IOWrapper(const std::vector<FactorType> &inputFactorOrder
                     , const std::vector<FactorType> &outputFactorOrder
                     , const FactorMask &inputFactorUsed
                     , size_t nBestSize
                     , const std::string &nBestFilePath
                     , const std::string &inputFilePath)
  :m_inputFactorOrder(inputFactorOrder)
  ,m_outputFactorOrder(outputFactorOrder)
  ,m_inputFactorUsed(inputFactorUsed)
  ,m_inputFilePath(inputFilePath)
  ,m_inputFile(new InputFileStream(inputFilePath))  // owned
  ,m_nBestStream(NULL)
  ,m_outputWordGraphStream(NULL)
  ,m_outputSearchGraphStream(NULL)
  ,m_detailedTranslationReportingStream(NULL)
  ,m_alignmentOutputStream(NULL)
{
  Initialization(inputFactorOrder, outputFactorOrder
                 , inputFactorUsed
                 , nBestSize, nBestFilePath);
  // read from the opened file instead of stdin
  m_inputStream = m_inputFile;
}
// Release all owned streams. Note that when single-best output is
// suppressed, m_nBestStream aliases std::cout and must NOT be deleted —
// hence the m_surpressSingleBestOutput guard below.
IOWrapper::~IOWrapper()
{
  if (m_inputFile != NULL)
    delete m_inputFile;
  if (m_nBestStream != NULL && !m_surpressSingleBestOutput) {
    // outputting n-best to file, rather than stdout. need to close file and delete obj
    delete m_nBestStream;
  }
  if (m_outputWordGraphStream != NULL) {
    delete m_outputWordGraphStream;
  }
  if (m_outputSearchGraphStream != NULL) {
    delete m_outputSearchGraphStream;
  }
  // deleting NULL is harmless, so these need no guard
  delete m_detailedTranslationReportingStream;
  delete m_alignmentOutputStream;
}
// Shared constructor body: open every output stream requested by the
// configuration (n-best list, word graph, search graph, detailed
// translation report, alignment file).
// NOTE(review): ofstream::open success is only CHECKed for the last two
// streams; n-best/word-graph/search-graph opens are unchecked — confirm.
void IOWrapper::Initialization(const std::vector<FactorType> &/*inputFactorOrder*/
                               , const std::vector<FactorType> &/*outputFactorOrder*/
                               , const FactorMask &/*inputFactorUsed*/
                               , size_t nBestSize
                               , const std::string &nBestFilePath)
{
  const StaticData &staticData = StaticData::Instance();
  // n-best
  m_surpressSingleBestOutput = false;
  if (nBestSize > 0) {
    if (nBestFilePath == "-" || nBestFilePath == "/dev/stdout") {
      // n-best goes to stdout, so the single-best line must be suppressed
      // to keep the output parseable
      m_nBestStream = &std::cout;
      m_surpressSingleBestOutput = true;
    } else {
      std::ofstream *file = new std::ofstream;
      m_nBestStream = file;
      file->open(nBestFilePath.c_str());
    }
  }
  // wordgraph output
  if (staticData.GetOutputWordGraph()) {
    string fileName = staticData.GetParam("output-word-graph")[0];
    std::ofstream *file = new std::ofstream;
    m_outputWordGraphStream = file;
    file->open(fileName.c_str());
  }
  // search graph output
  if (staticData.GetOutputSearchGraph()) {
    string fileName;
    if (staticData.GetOutputSearchGraphExtended())
      fileName = staticData.GetParam("output-search-graph-extended")[0];
    else
      fileName = staticData.GetParam("output-search-graph")[0];
    std::ofstream *file = new std::ofstream;
    m_outputSearchGraphStream = file;
    file->open(fileName.c_str());
  }
  // detailed translation reporting
  if (staticData.IsDetailedTranslationReportingEnabled()) {
    const std::string &path = staticData.GetDetailedTranslationReportingFilePath();
    m_detailedTranslationReportingStream = new std::ofstream(path.c_str());
    CHECK(m_detailedTranslationReportingStream->good());
  }
  // sentence alignment output
  if (! staticData.GetAlignmentOutputFile().empty()) {
    m_alignmentOutputStream = new ofstream(staticData.GetAlignmentOutputFile().c_str());
    CHECK(m_alignmentOutputStream->good());
  }
}
// Read the next input sentence into `inputType`. Returns the populated
// object on success; on end of input, frees it and returns NULL.
// Sentences carrying an explicit id advance the internal counter past it;
// otherwise the next sequential id is assigned.
// NOTE(review): an explicit id of 0 is indistinguishable from "no id" here
// and would be renumbered — confirm id 0 is never used explicitly.
InputType*IOWrapper::GetInput(InputType* inputType)
{
  if(inputType->Read(*m_inputStream, m_inputFactorOrder)) {
    if (long x = inputType->GetTranslationId()) {
      if (x>=m_translationId) m_translationId = x+1;
    } else inputType->SetTranslationId(m_translationId++);
    return inputType;
  } else {
    delete inputType;
    return NULL;
  }
}
/***
 * print surface factor only for the given phrase
 *
 * Writes the target phrase of one hypothesis to `out`: either all factors
 * (reportAllFactors) or the requested factors joined with '|', one word per
 * token followed by a space. With reportSegmentation, appends the covered
 * source range as "|start-end| ".
 */
void OutputSurface(std::ostream &out, const Hypothesis &edge, const std::vector<FactorType> &outputFactorOrder,
                   bool reportSegmentation, bool reportAllFactors)
{
  CHECK(outputFactorOrder.size() > 0);
  const Phrase& phrase = edge.GetCurrTargetPhrase();
  if (reportAllFactors == true) {
    out << phrase;
  } else {
    size_t size = phrase.GetSize();
    for (size_t pos = 0 ; pos < size ; pos++) {
      const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[0]);
      // bug fix: validate the pointer BEFORE dereferencing it
      // (was: out << *factor; CHECK(factor);)
      CHECK(factor);
      out << *factor;
      for (size_t i = 1 ; i < outputFactorOrder.size() ; i++) {
        const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[i]);
        CHECK(factor);
        out << "|" << *factor;
      }
      out << " ";
    }
  }
  // trace option "-t"
  if (reportSegmentation == true && phrase.GetSize() > 0) {
    out << "|" << edge.GetCurrSourceWordsRange().GetStartPos()
        << "-" << edge.GetCurrSourceWordsRange().GetEndPos() << "| ";
  }
}
void OutputBestSurface(std::ostream &out, const Hypothesis *hypo, const std::vector<FactorType> &outputFactorOrder,
bool reportSegmentation, bool reportAllFactors)
{
if (hypo != NULL) {
// recursively retrace this best path through the lattice, starting from the end of the hypothesis sentence
OutputBestSurface(out, hypo->GetPrevHypo(), outputFactorOrder, reportSegmentation, reportAllFactors);
OutputSurface(out, *hypo, outputFactorOrder, reportSegmentation, reportAllFactors);
}
}
// Print one phrase pair's word alignments as "src-tgt " pairs, with the
// given offsets added to map phrase-internal positions to sentence positions.
void OutputAlignment(ostream &out, const AlignmentInfo &ai, size_t sourceOffset, size_t targetOffset)
{
  typedef std::vector< const std::pair<size_t,size_t>* > AlignVec;
  const AlignVec sorted = ai.GetSortedAlignments();
  for (AlignVec::const_iterator iter = sorted.begin(); iter != sorted.end(); ++iter) {
    const std::pair<size_t,size_t> &point = **iter;
    out << point.first + sourceOffset << "-" << point.second + targetOffset << " ";
  }
}
// Print the word alignments of a full path (edges stored last-to-first),
// walking the hypotheses in source order and accumulating target offsets.
void OutputAlignment(ostream &out, const vector<const Hypothesis *> &edges)
{
  size_t targetOffset = 0;
  for (vector<const Hypothesis *>::const_reverse_iterator it = edges.rbegin(); it != edges.rend(); ++it) {
    const Hypothesis &edge = **it;
    const TargetPhrase &tp = edge.GetCurrTargetPhrase();
    const size_t sourceOffset = edge.GetCurrSourceWordsRange().GetStartPos();
    OutputAlignment(out, tp.GetAlignmentInfo(), sourceOffset, targetOffset);
    targetOffset += tp.GetSize();
  }
  out << std::endl;
}
// Render the path's alignments to a string and hand it to the collector
// keyed by line number (keeps multi-threaded output in input order).
void OutputAlignment(OutputCollector* collector, size_t lineNo , const vector<const Hypothesis *> &edges)
{
  ostringstream buffer;
  OutputAlignment(buffer, edges);
  collector->Write(lineNo, buffer.str());
}
// Collect the back-pointer chain of the best hypothesis and emit its
// alignments; a NULL collector disables alignment output entirely.
void OutputAlignment(OutputCollector* collector, size_t lineNo , const Hypothesis *hypo)
{
  if (!collector) {
    return;
  }
  std::vector<const Hypothesis *> chain;
  for (const Hypothesis *h = hypo; h != NULL; h = h->GetPrevHypo()) {
    chain.push_back(h);
  }
  OutputAlignment(collector, lineNo, chain);
}
// Emit the alignments of an n-best path; NULL collector means disabled.
void OutputAlignment(OutputCollector* collector, size_t lineNo , const TrellisPath &path)
{
  if (!collector) {
    return;
  }
  OutputAlignment(collector, lineNo, path.GetEdges());
}
// Print the surface string of an n-best path (edges stored last-to-first)
// followed by a newline, using the globally configured output factors.
void OutputBestHypo(const Moses::TrellisPath &path, long /*translationId*/, bool reportSegmentation, bool reportAllFactors, std::ostream &out)
{
  const std::vector<const Hypothesis *> &edges = path.GetEdges();
  for (std::vector<const Hypothesis *>::const_reverse_iterator it = edges.rbegin(); it != edges.rend(); ++it) {
    OutputSurface(out, **it, StaticData::Instance().GetOutputFactorOrder(), reportSegmentation, reportAllFactors);
  }
  out << endl;
}
void IOWrapper::Backtrack(const Hypothesis *hypo)
{
if (hypo->GetPrevHypo() != NULL) {
VERBOSE(3,hypo->GetId() << " <= ");
Backtrack(hypo->GetPrevHypo());
}
}
// Print an MBR-decoded best hypothesis (a plain word sequence) as
// space-separated primary-factor strings, followed by a newline.
void OutputBestHypo(const std::vector<Word>& mbrBestHypo, long /*translationId*/, bool /*reportSegmentation*/, bool /*reportAllFactors*/, ostream& out)
{
  for (size_t i = 0 ; i < mbrBestHypo.size() ; i++) {
    const Factor *factor = mbrBestHypo[i].GetFactor(StaticData::Instance().GetOutputFactorOrder()[0]);
    CHECK(factor);
    if (i > 0) {
      out << " ";
    }
    out << *factor;
  }
  out << endl;
}
// Walk the back-pointer chain and record each covered source phrase at its
// start position in `map`; the initial (empty) hypothesis is skipped.
void OutputInput(std::vector<const Phrase*>& map, const Hypothesis* hypo)
{
  const Hypothesis* prev = hypo->GetPrevHypo();
  if (prev == NULL) {
    return;
  }
  OutputInput(map, prev);
  map[hypo->GetCurrSourceWordsRange().GetStartPos()] = hypo->GetSourcePhrase();
}
// Print the source phrases covered by the best path, in source order.
void OutputInput(std::ostream& os, const Hypothesis* hypo)
{
  const size_t len = hypo->GetInput().GetSize();
  std::vector<const Phrase*> phrases(len, 0);
  OutputInput(phrases, hypo);
  for (size_t pos = 0; pos < len; ++pos) {
    if (phrases[pos] != 0) {
      os << *phrases[pos];
    }
  }
}
// Emit the single-best translation for one sentence to stdout, unless the
// n-best list is already going to stdout (m_surpressSingleBestOutput).
// A NULL hypothesis means decoding failed; an empty line is emitted so the
// output stays line-aligned with the input.
void IOWrapper::OutputBestHypo(const Hypothesis *hypo, long /*translationId*/, bool reportSegmentation, bool reportAllFactors)
{
  if (hypo != NULL) {
    VERBOSE(1,"BEST TRANSLATION: " << *hypo << endl);
    VERBOSE(3,"Best path: ");
    Backtrack(hypo);
    VERBOSE(3,"0" << std::endl);
    if (!m_surpressSingleBestOutput) {
      if (StaticData::Instance().IsPathRecoveryEnabled()) {
        // prefix the translation with the segmented source it was built from
        OutputInput(cout, hypo);
        cout << "||| ";
      }
      OutputBestSurface(cout, hypo, m_outputFactorOrder, reportSegmentation, reportAllFactors);
      cout << endl;
    }
  } else {
    VERBOSE(1, "NO BEST TRANSLATION" << endl);
    if (!m_surpressSingleBestOutput) {
      cout << endl;
    }
  }
}
// Write an n-best list in the standard Moses format:
//   id ||| surface ||| feature scores ||| total [||| alignments] [||| input]
// Feature scores are grouped per producer; with labeled output each group is
// prefixed by the producer's short weight name.
void OutputNBest(std::ostream& out, const Moses::TrellisPathList &nBestList, const std::vector<Moses::FactorType>& outputFactorOrder, const TranslationSystem* system, long translationId, bool reportSegmentation)
{
  const StaticData &staticData = StaticData::Instance();
  bool labeledOutput = staticData.IsLabeledNBestList();
  bool reportAllFactors = staticData.GetReportAllFactorsNBest();
  bool includeAlignment = staticData.NBestIncludesAlignment();
  bool includeWordAlignment = staticData.PrintAlignmentInfoInNbest();
  TrellisPathList::const_iterator iter;
  for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
    const TrellisPath &path = **iter;
    const std::vector<const Hypothesis *> &edges = path.GetEdges();
    // print the surface factor of the translation
    out << translationId << " ||| ";
    for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
      const Hypothesis &edge = *edges[currEdge];
      OutputSurface(out, edge, outputFactorOrder, reportSegmentation, reportAllFactors);
    }
    out << " |||";
    // feature scores: stateful, then stateless producers; a label is printed
    // whenever the producer's short name changes
    std::string lastName = "";
    const vector<const StatefulFeatureFunction*>& sff = system->GetStatefulFeatureFunctions();
    for( size_t i=0; i<sff.size(); i++ ) {
      if( labeledOutput && lastName != sff[i]->GetScoreProducerWeightShortName() ) {
        lastName = sff[i]->GetScoreProducerWeightShortName();
        out << " " << lastName << ":";
      }
      vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer( sff[i] );
      for (size_t j = 0; j<scores.size(); ++j) {
        out << " " << scores[j];
      }
    }
    const vector<const StatelessFeatureFunction*>& slf = system->GetStatelessFeatureFunctions();
    for( size_t i=0; i<slf.size(); i++ ) {
      if( labeledOutput && lastName != slf[i]->GetScoreProducerWeightShortName() ) {
        lastName = slf[i]->GetScoreProducerWeightShortName();
        out << " " << lastName << ":";
      }
      vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer( slf[i] );
      for (size_t j = 0; j<scores.size(); ++j) {
        out << " " << scores[j];
      }
    }
    // translation components
    const vector<PhraseDictionaryFeature*>& pds = system->GetPhraseDictionaries();
    if (pds.size() > 0) {
      for( size_t i=0; i<pds.size(); i++ ) {
        // a new label is emitted at the boundary between input scores and
        // regular translation scores (index pd_numinputscore)
        size_t pd_numinputscore = pds[i]->GetNumInputScores();
        vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer( pds[i] );
        for (size_t j = 0; j<scores.size(); ++j){
          if (labeledOutput && (i == 0) ){
            if ((j == 0) || (j == pd_numinputscore)){
              lastName = pds[i]->GetScoreProducerWeightShortName(j);
              out << " " << lastName << ":";
            }
          }
          out << " " << scores[j];
        }
      }
    }
    // generation
    const vector<GenerationDictionary*>& gds = system->GetGenerationDictionaries();
    if (gds.size() > 0) {
      for( size_t i=0; i<gds.size(); i++ ) {
        size_t pd_numinputscore = gds[i]->GetNumInputScores();
        vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer( gds[i] );
        for (size_t j = 0; j<scores.size(); ++j){
          if (labeledOutput && (i == 0) ){
            if ((j == 0) || (j == pd_numinputscore)){
              lastName = gds[i]->GetScoreProducerWeightShortName(j);
              out << " " << lastName << ":";
            }
          }
          out << " " << scores[j];
        }
      }
    }
    // total
    out << " ||| " << path.GetTotalScore();
    //phrase-to-phrase alignment
    if (includeAlignment) {
      out << " |||";
      // size()-2 skips the initial empty hypothesis, which covers nothing
      for (int currEdge = (int)edges.size() - 2 ; currEdge >= 0 ; currEdge--) {
        const Hypothesis &edge = *edges[currEdge];
        const WordsRange &sourceRange = edge.GetCurrSourceWordsRange();
        WordsRange targetRange = path.GetTargetWordsRange(edge);
        out << " " << sourceRange.GetStartPos();
        if (sourceRange.GetStartPos() < sourceRange.GetEndPos()) {
          out << "-" << sourceRange.GetEndPos();
        }
        out<< "=" << targetRange.GetStartPos();
        if (targetRange.GetStartPos() < targetRange.GetEndPos()) {
          out<< "-" << targetRange.GetEndPos();
        }
      }
    }
    if (includeWordAlignment) {
      out << " ||| ";
      for (int currEdge = (int)edges.size() - 2 ; currEdge >= 0 ; currEdge--) {
        const Hypothesis &edge = *edges[currEdge];
        const WordsRange &sourceRange = edge.GetCurrSourceWordsRange();
        WordsRange targetRange = path.GetTargetWordsRange(edge);
        const int sourceOffset = sourceRange.GetStartPos();
        const int targetOffset = targetRange.GetStartPos();
        const AlignmentInfo &ai = edge.GetCurrTargetPhrase().GetAlignmentInfo();
        OutputAlignment(out, ai, sourceOffset, targetOffset);
      }
    }
    if (StaticData::Instance().IsPathRecoveryEnabled()) {
      out << "|||";
      OutputInput(out, edges[0]);
    }
    out << endl;
  }
  out <<std::flush;
}
void OutputLatticeMBRNBest(std::ostream& out, const vector<LatticeMBRSolution>& solutions,long translationId)
{
for (vector<LatticeMBRSolution>::const_iterator si = solutions.begin(); si != solutions.end(); ++si) {
out << translationId;
out << " |||";
const vector<Word> mbrHypo = si->GetWords();
for (size_t i = 0 ; i < mbrHypo.size() ; i++) {
const Factor *factor = mbrHypo[i].GetFactor(StaticData::Instance().GetOutputFactorOrder()[0]);
if (i>0) out << " " << *factor;
else out << *factor;
}
out << " |||";
out << " map: " << si->GetMapScore();
out << " w: " << mbrHypo.size();
const vector<float>& ngramScores = si->GetNgramScores();
for (size_t i = 0; i < ngramScores.size(); ++i) {
out << " " << ngramScores[i];
}
out << " ||| " << si->GetScore();
out << endl;
}
}
/** Write the lattice-MBR n-best list to the configured n-best output stream. */
void IOWrapper::OutputLatticeMBRNBestList(const vector<LatticeMBRSolution>& solutions,long translationId)
{
  // Delegates to the free function with this wrapper's n-best stream.
  OutputLatticeMBRNBest(*m_nBestStream, solutions,translationId);
}
/** Read the next input unit of the requested type into `source`,
 *  freeing whatever `source` pointed to before.
 *  @param ioWrapper  the IO object to read from
 *  @param inputType  kind of input expected (sentence, confusion net, lattice)
 *  @param source     in/out: previous input (deleted), new input (or NULL)
 *  @return true if an input was read, false on end of input or unknown type.
 */
bool ReadInput(IOWrapper &ioWrapper, InputTypeEnum inputType, InputType*& source)
{
  delete source;
  // Reset immediately: the old code left `source` dangling on an unknown
  // input type and then reported success via the non-NULL (freed) pointer.
  source = NULL;
  switch(inputType) {
  case SentenceInput:
    source = ioWrapper.GetInput(new Sentence);
    break;
  case ConfusionNetworkInput:
    source = ioWrapper.GetInput(new ConfusionNet);
    break;
  case WordLatticeInput:
    source = ioWrapper.GetInput(new WordLattice);
    break;
  default:
    TRACE_ERR("Unknown input type: " << inputType << "\n");
  }
  // GetInput returns NULL at end of input, which also yields false here.
  return (source ? true : false);
}
/** Build the process-wide IOWrapper: reads from the file named by the
 *  single --input-file parameter when given, otherwise from stdin/stdout. */
IOWrapper *GetIOWrapper(const StaticData &staticData)
{
  const std::vector<FactorType> &inFactors = staticData.GetInputFactorOrder();
  const std::vector<FactorType> &outFactors = staticData.GetOutputFactorOrder();
  FactorMask inFactorsUsed(inFactors);

  // Choose file-based or stream-based IO.
  IOWrapper *wrapper = NULL;
  if (staticData.GetParam("input-file").size() == 1) {
    VERBOSE(2,"IO from File" << endl);
    string filePath = staticData.GetParam("input-file")[0];
    wrapper = new IOWrapper(inFactors, outFactors, inFactorsUsed
                            , staticData.GetNBestSize()
                            , staticData.GetNBestFilePath()
                            , filePath);
  } else {
    VERBOSE(1,"IO from STDOUT/STDIN" << endl);
    wrapper = new IOWrapper(inFactors, outFactors, inFactorsUsed
                            , staticData.GetNBestSize()
                            , staticData.GetNBestFilePath());
  }

  wrapper->ResetTranslationId();

  IFVERBOSE(1)
  PrintUserTime("Created input-output object");

  return wrapper;
}
}

View File

@ -0,0 +1,142 @@
// $Id$
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (c) 2006 University of Edinburgh
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the University of Edinburgh nor the names of its contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
***********************************************************************/
// example file on how to use moses library
#ifndef moses_cmd_IOWrapper_h
#define moses_cmd_IOWrapper_h
#include <cassert>
#include <fstream>
#include <ostream>
#include <vector>
#include "util/check.hh"
#include "TypeDef.h"
#include "Sentence.h"
#include "FactorTypeSet.h"
#include "FactorCollection.h"
#include "Hypothesis.h"
#include "OutputCollector.h"
#include "TrellisPathList.h"
#include "InputFileStream.h"
#include "InputType.h"
#include "WordLattice.h"
#include "LatticeMBR.h"
namespace MosesCmd
{
/** Helper class that holds misc variables to write data out to command line.
*/
class IOWrapper
{
protected:
  long m_translationId;  // id of the sentence currently being processed

  // Factor configuration; references owned by StaticData.
  const std::vector<Moses::FactorType> &m_inputFactorOrder;
  const std::vector<Moses::FactorType> &m_outputFactorOrder;
  const Moses::FactorMask &m_inputFactorUsed;

  // Input source: a file when m_inputFilePath is set, otherwise stdin.
  std::string m_inputFilePath;
  Moses::InputFileStream *m_inputFile;
  std::istream *m_inputStream;

  // Optional output streams; NOTE(review): presumably NULL/unused when the
  // corresponding feature is switched off — confirm in Initialization().
  std::ostream *m_nBestStream
  ,*m_outputWordGraphStream,*m_outputSearchGraphStream;
  std::ostream *m_detailedTranslationReportingStream;
  std::ofstream *m_alignmentOutputStream;
  bool m_surpressSingleBestOutput;

  // Shared setup used by both public constructors.
  void Initialization(const std::vector<Moses::FactorType> &inputFactorOrder
                      , const std::vector<Moses::FactorType> &outputFactorOrder
                      , const Moses::FactorMask &inputFactorUsed
                      , size_t nBestSize
                      , const std::string &nBestFilePath);

public:
  /// Construct for reading input from stdin.
  IOWrapper(const std::vector<Moses::FactorType> &inputFactorOrder
            , const std::vector<Moses::FactorType> &outputFactorOrder
            , const Moses::FactorMask &inputFactorUsed
            , size_t nBestSize
            , const std::string &nBestFilePath);

  /// Construct for reading input from the file at infilePath.
  IOWrapper(const std::vector<Moses::FactorType> &inputFactorOrder
            , const std::vector<Moses::FactorType> &outputFactorOrder
            , const Moses::FactorMask &inputFactorUsed
            , size_t nBestSize
            , const std::string &nBestFilePath
            , const std::string &infilePath);
  ~IOWrapper();

  /// Read the next input into the given (caller-allocated) InputType object.
  Moses::InputType* GetInput(Moses::InputType *inputType);

  void OutputBestHypo(const Moses::Hypothesis *hypo, long translationId, bool reportSegmentation, bool reportAllFactors);
  void OutputLatticeMBRNBestList(const std::vector<LatticeMBRSolution>& solutions,long translationId);
  void Backtrack(const Moses::Hypothesis *hypo);

  /// Restart sentence numbering at 0 (e.g. for a new input stream).
  void ResetTranslationId() {
    m_translationId = 0;
  }

  std::ofstream *GetAlignmentOutputStream() {
    return m_alignmentOutputStream;
  }

  std::ostream &GetOutputWordGraphStream() {
    return *m_outputWordGraphStream;
  }
  std::ostream &GetOutputSearchGraphStream() {
    return *m_outputSearchGraphStream;
  }

  std::ostream &GetDetailedTranslationReportingStream() {
    // Only valid when detailed reporting was configured.
    assert (m_detailedTranslationReportingStream);
    return *m_detailedTranslationReportingStream;
  }
};
/// Factory: build the IOWrapper according to the static configuration.
IOWrapper *GetIOWrapper(const Moses::StaticData &staticData);
/// Read the next input of the given type; returns false when none is available.
bool ReadInput(IOWrapper &ioWrapper, Moses::InputTypeEnum inputType, Moses::InputType*& source);
/// Print the best hypothesis' surface string to `out`.
void OutputBestSurface(std::ostream &out, const Moses::Hypothesis *hypo, const std::vector<Moses::FactorType> &outputFactorOrder, bool reportSegmentation, bool reportAllFactors);
/// Print a standard n-best list for one sentence.
void OutputNBest(std::ostream& out, const Moses::TrellisPathList &nBestList, const std::vector<Moses::FactorType>&,
                 const Moses::TranslationSystem* system, long translationId, bool reportSegmentation);
/// Print the n-best list produced by lattice MBR decoding.
void OutputLatticeMBRNBest(std::ostream& out, const std::vector<LatticeMBRSolution>& solutions,long translationId);
/// Print a best hypothesis given as a word sequence (MBR output path).
void OutputBestHypo(const std::vector<Moses::Word>& mbrBestHypo, long /*translationId*/,
                    bool reportSegmentation, bool reportAllFactors, std::ostream& out);
/// Print a best hypothesis given as a trellis path (consensus decoding path).
void OutputBestHypo(const Moses::TrellisPath &path, long /*translationId*/,bool reportSegmentation, bool reportAllFactors, std::ostream &out);
/// Echo the source segment covered by a hypothesis chain.
void OutputInput(std::ostream& os, const Moses::Hypothesis* hypo);
/// Word alignment output for a single hypothesis / a whole trellis path.
void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo, const Moses::Hypothesis *hypo);
void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo, const Moses::TrellisPath &path);
}
#endif

View File

@ -0,0 +1,6 @@
# Link against the main moses library.
alias deps : ../../../moses/src//moses ;
# Divergence-calculator executable.
# NOTE(review): this target compiles its own copies of IOWrapper/LatticeMBR —
# verify these are intentionally duplicated from moses-cmd.
exe calcDivergence : Main.cpp mbr.cpp IOWrapper.cpp TranslationAnalysis.cpp LatticeMBR.cpp RelativeEntropyCalc.cpp deps ;
alias programs : calcDivergence ;

View File

@ -0,0 +1,669 @@
/*
* LatticeMBR.cpp
* moses-cmd
*
* Created by Abhishek Arun on 26/01/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include "LatticeMBR.h"
#include "StaticData.h"
#include <algorithm>
#include <set>
using namespace std;
using namespace Moses;
namespace MosesCmd
{
size_t bleu_order = 4;        // maximum n-gram length used in the BLEU-style MBR loss
float UNKNGRAMLOGPROB = -20;  // floor log-probability for n-grams not found in the lattice
void GetOutputWords(const TrellisPath &path, vector <Word> &translation)
{
const std::vector<const Hypothesis *> &edges = path.GetEdges();
// print the surface factor of the translation
for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
const Hypothesis &edge = *edges[currEdge];
const Phrase &phrase = edge.GetCurrTargetPhrase();
size_t size = phrase.GetSize();
for (size_t pos = 0 ; pos < size ; pos++) {
translation.push_back(phrase.GetWord(pos));
}
}
}
/** Count every n-gram of the sentence, for n = 1..bleu_order, into allngrams. */
void extract_ngrams(const vector<Word >& sentence, map < Phrase, int > & allngrams)
{
  const int sentSize = (int)sentence.size();
  // Enumerate by n-gram length, then by start position.
  for (int len = 1; len <= (int)bleu_order; ++len) {
    for (int start = 0; start + len <= sentSize; ++start) {
      Phrase ngram(len);
      for (int pos = start; pos < start + len; ++pos) {
        ngram.AddWord(sentence[pos]);
      }
      ++allngrams[ngram];
    }
  }
}
void NgramScores::addScore(const Hypothesis* node, const Phrase& ngram, float score)
{
set<Phrase>::const_iterator ngramIter = m_ngrams.find(ngram);
if (ngramIter == m_ngrams.end()) {
ngramIter = m_ngrams.insert(ngram).first;
}
map<const Phrase*,float>& ngramScores = m_scores[node];
map<const Phrase*,float>::iterator scoreIter = ngramScores.find(&(*ngramIter));
if (scoreIter == ngramScores.end()) {
ngramScores[&(*ngramIter)] = score;
} else {
ngramScores[&(*ngramIter)] = log_sum(score,scoreIter->second);
}
}
/** First (ngram, score) entry recorded for `node`.
 *  Note: operator[] inserts an empty map for a node never scored. */
NgramScores::NodeScoreIterator NgramScores::nodeBegin(const Hypothesis* node)
{
  return m_scores[node].begin();
}
/** Past-the-end iterator for the (ngram, score) entries of `node`.
 *  Note: operator[] inserts an empty map for a node never scored. */
NgramScores::NodeScoreIterator NgramScores::nodeEnd(const Hypothesis* node)
{
  return m_scores[node].end();
}
/** Read the surface words off the trellis path; remember the model (MAP)
 *  score only when this path is the MAP hypothesis. */
LatticeMBRSolution::LatticeMBRSolution(const TrellisPath& path, bool isMap) :
  m_score(0.0f)
{
  const std::vector<const Hypothesis *> &edges = path.GetEdges();

  // Edges are stored last-to-first; reverse iteration restores word order.
  for (size_t i = edges.size(); i > 0; --i) {
    const Phrase &phrase = edges[i-1]->GetCurrTargetPhrase();
    const size_t phraseSize = phrase.GetSize();
    for (size_t pos = 0; pos < phraseSize; ++pos) {
      m_words.push_back(phrase.GetWord(pos));
    }
  }

  m_mapScore = isMap ? path.GetTotalScore() : 0;
}
/** Score this solution against the lattice n-gram posteriors:
 *  m_score = thetas[0]*length + sum_i thetas[i+1]*ngramScore_i + mapWeight*mapScore.
 *  @param finalNgramScores  log-posteriors of n-grams, keyed by phrase
 *  @param thetas            weights: [0] length, [i] n-grams of order i
 *  @param mapWeight         weight given to the MAP model score
 */
void LatticeMBRSolution::CalcScore(map<Phrase, float>& finalNgramScores, const vector<float>& thetas, float mapWeight)
{
  // One accumulator per n-gram order; -10000 acts as log(~0).
  m_ngramScores.assign(thetas.size()-1, -10000);

  map < Phrase, int > counts;
  extract_ngrams(m_words,counts);

  //Now score this translation
  m_score = thetas[0] * m_words.size();

  //Calculate the ngramScores, working in log space at first
  for (map < Phrase, int >::iterator ngrams = counts.begin(); ngrams != counts.end(); ++ngrams) {
    // N-grams absent from the lattice get the floor log-probability.
    float ngramPosterior = UNKNGRAMLOGPROB;
    map<Phrase,float>::const_iterator ngramPosteriorIt = finalNgramScores.find(ngrams->first);
    if (ngramPosteriorIt != finalNgramScores.end()) {
      ngramPosterior = ngramPosteriorIt->second;
    }
    size_t ngramSize = ngrams->first.GetSize();
    // Accumulate count * posterior in log space, bucketed by order.
    m_ngramScores[ngramSize-1] = log_sum(log((float)ngrams->second) + ngramPosterior,m_ngramScores[ngramSize-1]);
  }

  //convert from log to probability and create weighted sum
  for (size_t i = 0; i < m_ngramScores.size(); ++i) {
    m_ngramScores[i] = exp(m_ngramScores[i]);
    m_score += thetas[i+1] * m_ngramScores[i];
  }

  //The map score
  m_score += m_mapScore*mapWeight;
}
/** Prune the search lattice down to roughly (edgeDensity * MAP-length) edges,
 *  admitting hypotheses in decreasing order of estimated score.
 *  On return: incomingEdges holds the surviving edges per head node, and
 *  connectedHyp is rewritten to contain only the surviving hypotheses.
 *  @param scale  multiplier applied to every edge score
 */
void pruneLatticeFB(Lattice & connectedHyp, map < const Hypothesis*, set <const Hypothesis* > > & outgoingHyps, map<const Hypothesis*, vector<Edge> >& incomingEdges,
                    const vector< float> & estimatedScores, const Hypothesis* bestHypo, size_t edgeDensity, float scale)
{
  //Need hyp 0 in connectedHyp - Find empty hypothesis
  VERBOSE(2,"Pruning lattice to edge density " << edgeDensity << endl);
  const Hypothesis* emptyHyp = connectedHyp.at(0);
  while (emptyHyp->GetId() != 0) {
    emptyHyp = emptyHyp->GetPrevHypo();
  }
  connectedHyp.push_back(emptyHyp); //Add it to list of hyps

  //Need hyp 0's outgoing Hyps
  for (size_t i = 0; i < connectedHyp.size(); ++i) {
    if (connectedHyp[i]->GetId() > 0 && connectedHyp[i]->GetPrevHypo()->GetId() == 0)
      outgoingHyps[emptyHyp].insert(connectedHyp[i]);
  }

  //sort hyps based on estimated scores - do so by copying to multimap
  multimap<float, const Hypothesis*> sortHypsByVal;
  for (size_t i =0; i < estimatedScores.size(); ++i) {
    sortHypsByVal.insert(make_pair(estimatedScores[i], connectedHyp[i]));
  }

  multimap<float, const Hypothesis*>::const_iterator it = --sortHypsByVal.end();
  float bestScore = it->first;
  //store best score as score of hyp 0
  sortHypsByVal.insert(make_pair(bestScore, emptyHyp));

  IFVERBOSE(3) {
    for (multimap<float, const Hypothesis*>::const_iterator it = --sortHypsByVal.end(); it != --sortHypsByVal.begin(); --it) {
      const Hypothesis* currHyp = it->second;
      cerr << "Hyp " << currHyp->GetId() << ", estimated score: " << it->first << endl;
    }
  }

  set <const Hypothesis*> survivingHyps; //store hyps that make the cut in this

  VERBOSE(2, "BEST HYPO TARGET LENGTH : " << bestHypo->GetSize() << endl)
  size_t numEdgesTotal = edgeDensity * bestHypo->GetSize(); //as per Shankar, aim for (density * target length of MAP solution) arcs
  size_t numEdgesCreated = 0;
  VERBOSE(2, "Target edge count: " << numEdgesTotal << endl);

  float prevScore = -999999;

  //now iterate over multimap, from highest estimated score downwards
  for (multimap<float, const Hypothesis*>::const_iterator it = --sortHypsByVal.end(); it != --sortHypsByVal.begin(); --it) {
    float currEstimatedScore = it->first;
    const Hypothesis* currHyp = it->second;
    if (numEdgesCreated >= numEdgesTotal && prevScore > currEstimatedScore) //if this hyp has equal estimated score to previous, include its edges too
      break;
    prevScore = currEstimatedScore;
    VERBOSE(3, "Num edges created : "<< numEdgesCreated << ", numEdges wanted " << numEdgesTotal << endl)
    VERBOSE(3, "Considering hyp " << currHyp->GetId() << ", estimated score: " << it->first << endl)

    survivingHyps.insert(currHyp); //CurrHyp made the cut

    // is its best predecessor already included ?
    if (survivingHyps.find(currHyp->GetPrevHypo()) != survivingHyps.end()) { //yes, then add an edge
      vector <Edge>& edges = incomingEdges[currHyp];
      // Edge score is the score delta between the two hypotheses, scaled.
      Edge winningEdge(currHyp->GetPrevHypo(),currHyp,scale*(currHyp->GetScore() - currHyp->GetPrevHypo()->GetScore()),currHyp->GetCurrTargetPhrase());
      edges.push_back(winningEdge);
      ++numEdgesCreated;
    }

    //let's try the arcs too (recombined hypotheses attached to currHyp)
    const ArcList *arcList = currHyp->GetArcList();
    if (arcList != NULL) {
      ArcList::const_iterator iterArcList;
      for (iterArcList = arcList->begin() ; iterArcList != arcList->end() ; ++iterArcList) {
        const Hypothesis *loserHypo = *iterArcList;
        const Hypothesis* loserPrevHypo = loserHypo->GetPrevHypo();
        if (survivingHyps.find(loserPrevHypo) != survivingHyps.end()) { //found it, add edge
          double arcScore = loserHypo->GetScore() - loserPrevHypo->GetScore();
          Edge losingEdge(loserPrevHypo, currHyp, arcScore*scale, loserHypo->GetCurrTargetPhrase());
          vector <Edge>& edges = incomingEdges[currHyp];
          edges.push_back(losingEdge);
          ++numEdgesCreated;
        }
      }
    }

    //Now if a successor node has already been visited, add an edge connecting the two
    map < const Hypothesis*, set < const Hypothesis* > >::const_iterator outgoingIt = outgoingHyps.find(currHyp);

    if (outgoingIt != outgoingHyps.end()) {//currHyp does have successors
      const set<const Hypothesis*> & outHyps = outgoingIt->second; //the successors
      for (set<const Hypothesis*>::const_iterator outHypIts = outHyps.begin(); outHypIts != outHyps.end(); ++outHypIts) {
        const Hypothesis* succHyp = *outHypIts;

        if (survivingHyps.find(succHyp) == survivingHyps.end()) //Have we encountered the successor yet?
          continue; //No, move on to next

        //Curr Hyp can be : a) the best predecessor of succ b) or an arc attached to succ
        if (succHyp->GetPrevHypo() == currHyp) { //best predecessor
          vector <Edge>& succEdges = incomingEdges[succHyp];
          Edge succWinningEdge(currHyp, succHyp, scale*(succHyp->GetScore() - currHyp->GetScore()), succHyp->GetCurrTargetPhrase());
          succEdges.push_back(succWinningEdge);
          survivingHyps.insert(succHyp);
          ++numEdgesCreated;
        }

        //now, let's find an arc
        const ArcList *arcList = succHyp->GetArcList();
        if (arcList != NULL) {
          ArcList::const_iterator iterArcList;
          //QUESTION: What happens if there's more than one loserPrevHypo?
          for (iterArcList = arcList->begin() ; iterArcList != arcList->end() ; ++iterArcList) {
            const Hypothesis *loserHypo = *iterArcList;
            const Hypothesis* loserPrevHypo = loserHypo->GetPrevHypo();
            if (loserPrevHypo == currHyp) { //found it
              vector <Edge>& succEdges = incomingEdges[succHyp];
              double arcScore = loserHypo->GetScore() - currHyp->GetScore();
              Edge losingEdge(currHyp, succHyp,scale* arcScore, loserHypo->GetCurrTargetPhrase());
              succEdges.push_back(losingEdge);
              ++numEdgesCreated;
            }
          }
        }
      }
    }
  }

  // Replace the node list with the survivors only.
  connectedHyp.clear();
  for (set <const Hypothesis*>::iterator it = survivingHyps.begin(); it != survivingHyps.end(); ++it) {
    connectedHyp.push_back(*it);
  }

  VERBOSE(2, "Done! Num edges created : "<< numEdgesCreated << ", numEdges wanted " << numEdgesTotal << endl)

  IFVERBOSE(3) {
    cerr << "Surviving hyps: " ;
    for (set <const Hypothesis*>::iterator it = survivingHyps.begin(); it != survivingHyps.end(); ++it) {
      cerr << (*it)->GetId() << " ";
    }
    cerr << endl;
  }
}
/** Forward pass over the pruned lattice: compute, for each complete
 *  hypothesis, the log-score mass of every n-gram reaching it, then
 *  normalise by the total lattice mass Z to obtain n-gram
 *  posteriors (posteriors==true) or expectations (posteriors==false)
 *  in finalNgramScores.
 *  Nodes must be processed in topological-ish order, hence the sort. */
void calcNgramExpectations(Lattice & connectedHyp, map<const Hypothesis*, vector<Edge> >& incomingEdges,
                           map<Phrase, float>& finalNgramScores, bool posteriors)
{
  sort(connectedHyp.begin(),connectedHyp.end(),ascendingCoverageCmp); //sort by increasing source word cov

  /*cerr << "Lattice:" << endl;
  for (Lattice::const_iterator i = connectedHyp.begin(); i != connectedHyp.end(); ++i) {
    const Hypothesis* h = *i;
    cerr << *h << endl;
    const vector<Edge>& edges = incomingEdges[h];
    for (size_t e = 0; e < edges.size(); ++e) {
      cerr << edges[e];
    }
  }*/

  map<const Hypothesis*, float> forwardScore;
  forwardScore[connectedHyp[0]] = 0.0f; //forward score of hyp 0 is 1 (or 0 in logprob space)
  set< const Hypothesis *> finalHyps; //store completed hyps

  NgramScores ngramScores;//ngram scores for each hyp

  for (size_t i = 1; i < connectedHyp.size(); ++i) {
    const Hypothesis* currHyp = connectedHyp[i];
    if (currHyp->GetWordsBitmap().IsComplete()) {
      finalHyps.insert(currHyp);
    }

    VERBOSE(3, "Processing hyp: " << currHyp->GetId() << ", num words cov= " << currHyp->GetWordsBitmap().GetNumWordsCovered() <<  endl)

    // Forward score = log-sum over all incoming edges.
    vector <Edge> & edges = incomingEdges[currHyp];
    for (size_t e = 0; e < edges.size(); ++e) {
      const Edge& edge = edges[e];
      if (forwardScore.find(currHyp) == forwardScore.end()) {
        forwardScore[currHyp] = forwardScore[edge.GetTailNode()] + edge.GetScore();
        VERBOSE(3, "Fwd score["<<currHyp->GetId()<<"] = fwdScore["<<edge.GetTailNode()->GetId() << "] + edge Score: " << edge.GetScore() << endl)
      } else {
        forwardScore[currHyp] = log_sum(forwardScore[currHyp], forwardScore[edge.GetTailNode()] + edge.GetScore());
        VERBOSE(3, "Fwd score["<<currHyp->GetId()<<"] += fwdScore["<<edge.GetTailNode()->GetId() << "] + edge Score: " << edge.GetScore() << endl)
      }
    }

    //Process ngrams now
    for (size_t j =0 ; j < edges.size(); ++j) {
      Edge& edge = edges[j];
      const NgramHistory & incomingPhrases = edge.GetNgrams(incomingEdges);

      //let's first score ngrams introduced by this edge
      for (NgramHistory::const_iterator it = incomingPhrases.begin(); it != incomingPhrases.end(); ++it) {
        const Phrase& ngram = it->first;
        const PathCounts& pathCounts = it->second;
        VERBOSE(4, "Calculating score for: " << it->first << endl)

        for (PathCounts::const_iterator pathCountIt = pathCounts.begin(); pathCountIt != pathCounts.end(); ++pathCountIt) {
          //Score of an n-gram is forward score of head node of leftmost edge + all edge scores
          const Path& path = pathCountIt->first;
          //cerr << "path count for " << ngram << " is " << pathCountIt->second << endl;
          float score = forwardScore[path[0]->GetTailNode()];
          for (size_t i = 0; i < path.size(); ++i) {
            score += path[i]->GetScore();
          }
          //if we're doing expectations, then the number of times the ngram
          //appears on the path is relevant.
          size_t count = posteriors ? 1 : pathCountIt->second;
          for (size_t k = 0; k < count; ++k) {
            ngramScores.addScore(currHyp,ngram,score);
          }
        }
      }

      //Now score ngrams that are just being propagated from the history
      for (NgramScores::NodeScoreIterator it = ngramScores.nodeBegin(edge.GetTailNode());
           it != ngramScores.nodeEnd(edge.GetTailNode()); ++it) {
        const Phrase & currNgram = *(it->first);
        float currNgramScore = it->second;
        VERBOSE(4, "Calculating score for: " << currNgram << endl)

        // For posteriors, don't double count ngrams
        if (!posteriors || incomingPhrases.find(currNgram) == incomingPhrases.end()) {
          float score = edge.GetScore() + currNgramScore;
          ngramScores.addScore(currHyp,currNgram,score);
        }
      }
    }
  }

  float Z = 9999999; //the total score of the lattice (sentinel until first final hyp)

  //Done - Print out ngram posteriors for final hyps
  for (set< const Hypothesis *>::iterator finalHyp = finalHyps.begin(); finalHyp != finalHyps.end(); ++finalHyp) {
    const Hypothesis* hyp = *finalHyp;

    for (NgramScores::NodeScoreIterator it = ngramScores.nodeBegin(hyp); it != ngramScores.nodeEnd(hyp); ++it) {
      const Phrase& ngram = *(it->first);
      if (finalNgramScores.find(ngram) == finalNgramScores.end()) {
        finalNgramScores[ngram] = it->second;
      } else {
        finalNgramScores[ngram] = log_sum(it->second, finalNgramScores[ngram]);
      }
    }

    if (Z == 9999999) {
      Z = forwardScore[hyp];
    } else {
      Z = log_sum(Z, forwardScore[hyp]);
    }
  }

  //Z *= scale;  //scale the score

  // Normalise every accumulated score by the total lattice mass.
  for (map<Phrase, float>::iterator finalScoresIt = finalNgramScores.begin(); finalScoresIt != finalNgramScores.end(); ++finalScoresIt) {
    finalScoresIt->second =  finalScoresIt->second - Z;
    IFVERBOSE(2) {
      VERBOSE(2,finalScoresIt->first << " [" << finalScoresIt->second << "]" << endl);
    }
  }
}
/** Lazily compute (and memoise in m_ngrams) all n-grams ending on this edge:
 *  those fully contained in this edge's phrase, plus those straddling this
 *  edge and its incoming edges, found by matching suffixes recursively. */
const NgramHistory& Edge::GetNgrams(map<const Hypothesis*, vector<Edge> > & incomingEdges)
{
  // Memoised: non-empty means we've already been computed.
  if (m_ngrams.size() > 0)
    return m_ngrams;

  const Phrase& currPhrase = GetWords();
  //Extract the n-grams local to this edge
  for (size_t start = 0; start < currPhrase.GetSize(); ++start) {
    for (size_t end = start; end < start + bleu_order; ++end) {
      if (end < currPhrase.GetSize()) {
        Phrase edgeNgram(end-start+1);
        for (size_t index = start; index <= end; ++index) {
          edgeNgram.AddWord(currPhrase.GetWord(index));
        }
        //cout << "Inserting Phrase : " << edgeNgram << endl;
        vector<const Edge*> edgeHistory;
        edgeHistory.push_back(this);
        storeNgramHistory(edgeNgram, edgeHistory);
      } else {
        break;
      }
    }
  }

  map<const Hypothesis*, vector<Edge> >::iterator it = incomingEdges.find(m_tailNode);
  if (it != incomingEdges.end()) { //node has incoming edges
    vector<Edge> & inEdges = it->second;

    for (vector<Edge>::iterator edge = inEdges.begin(); edge != inEdges.end(); ++edge) {//add the ngrams straddling prev and curr edge
      const NgramHistory & edgeIncomingNgrams = edge->GetNgrams(incomingEdges);
      for (NgramHistory::const_iterator edgeInNgramHist = edgeIncomingNgrams.begin(); edgeInNgramHist != edgeIncomingNgrams.end(); ++edgeInNgramHist) {
        const Phrase& edgeIncomingNgram = edgeInNgramHist->first;
        const PathCounts &  edgeIncomingNgramPaths = edgeInNgramHist->second;
        // Compare only the overlapping suffix region.
        size_t back = min(edgeIncomingNgram.GetSize(), edge->GetWordsSize());
        const Phrase& edgeWords = edge->GetWords();
        IFVERBOSE(3) {
          cerr << "Edge: "<< *edge <<endl;
          cerr << "edgeWords: " << edgeWords << endl;
          cerr << "edgeInNgram: " << edgeIncomingNgram << endl;
        }

        Phrase edgeSuffix(ARRAY_SIZE_INCR);
        Phrase ngramSuffix(ARRAY_SIZE_INCR);
        GetPhraseSuffix(edgeWords,back,edgeSuffix);
        GetPhraseSuffix(edgeIncomingNgram,back,ngramSuffix);

        if (ngramSuffix == edgeSuffix) { //we've got the suffix of previous edge
          // Extend the incoming ngram with words from this edge, up to bleu_order.
          size_t edgeInNgramSize =  edgeIncomingNgram.GetSize();

          for (size_t i = 0; i < GetWordsSize() && i + edgeInNgramSize < bleu_order ; ++i) {
            Phrase newNgram(edgeIncomingNgram);
            for (size_t j = 0; j <= i ; ++j) {
              newNgram.AddWord(GetWords().GetWord(j));
            }
            VERBOSE(3, "Inserting New Phrase : " << newNgram << endl)

            for (PathCounts::const_iterator pathIt = edgeIncomingNgramPaths.begin(); pathIt !=  edgeIncomingNgramPaths.end(); ++pathIt) {
              Path newNgramPath = pathIt->first;
              newNgramPath.push_back(this);
              storeNgramHistory(newNgram, newNgramPath, pathIt->second);
            }
          }
        }
      }
    }
  }
  return m_ngrams;
}
/** Append the last `lastN` words of origPhrase to targetPhrase.
 *  If lastN exceeds the phrase length, the whole phrase is appended. */
void Edge::GetPhraseSuffix(const Phrase&  origPhrase, size_t lastN, Phrase& targetPhrase) const
{
  size_t origSize = origPhrase.GetSize();
  // Clamp to avoid size_t underflow when lastN > origSize (callers currently
  // pass min(...) <= origSize, but be defensive).
  size_t startIndex = (lastN >= origSize) ? 0 : origSize - lastN;
  for (size_t index = startIndex; index < origPhrase.GetSize(); ++index) {
    targetPhrase.AddWord(origPhrase.GetWord(index));
  }
}
/** Lexicographic ordering on (head id, tail id, score). */
bool Edge::operator< (const Edge & compare ) const
{
  // Compare by head node id first, then tail node id, then score.
  if (m_headNode->GetId() != compare.m_headNode->GetId())
    return m_headNode->GetId() < compare.m_headNode->GetId();
  if (m_tailNode->GetId() != compare.m_tailNode->GetId())
    return m_tailNode->GetId() < compare.m_tailNode->GetId();
  return GetScore() < compare.GetScore();
}
/** Debug dump of an edge; note it emits a trailing endl. */
ostream& operator<< (ostream& out, const Edge& edge)
{
  out << "Head: " << edge.m_headNode->GetId() << ", Tail: " << edge.m_tailNode->GetId() << ", Score: " << edge.m_score << ", Phrase: " << edge.m_targetPhrase << endl;
  return out;
}
/** Orders hypotheses by number of covered source words (topological proxy). */
bool ascendingCoverageCmp(const Hypothesis* a, const Hypothesis* b)
{
  return a->GetWordsBitmap().GetNumWordsCovered() < b->GetWordsBitmap().GetNumWordsCovered();
}
/** Lattice-MBR decoding: prune the search graph, compute n-gram posteriors,
 *  then rescore the n-best list with the linearised BLEU gain and keep
 *  the top `n` solutions (sorted, best first) in `solutions`. */
void getLatticeMBRNBest(Manager& manager, TrellisPathList& nBestList,
                        vector<LatticeMBRSolution>& solutions, size_t n)
{
  const StaticData& staticData = StaticData::Instance();
  std::map < int, bool > connected;
  std::vector< const Hypothesis *> connectedList;
  map<Phrase, float> ngramPosteriors;
  std::map < const Hypothesis*, set <const Hypothesis*> > outgoingHyps;
  map<const Hypothesis*, vector<Edge> > incomingEdges;
  vector< float> estimatedScores;
  manager.GetForwardBackwardSearchGraph(&connected, &connectedList, &outgoingHyps, &estimatedScores);
  pruneLatticeFB(connectedList, outgoingHyps, incomingEdges, estimatedScores, manager.GetBestHypothesis(), staticData.GetLatticeMBRPruningFactor(),staticData.GetMBRScale());
  calcNgramExpectations(connectedList, incomingEdges, ngramPosteriors,true);

  vector<float> mbrThetas = staticData.GetLatticeMBRThetas();
  float p = staticData.GetLatticeMBRPrecision();
  float r = staticData.GetLatticeMBRPRatio();
  float mapWeight = staticData.GetLatticeMBRMapWeight();
  if (mbrThetas.size() == 0) { //thetas not specified on the command line, use p and r instead
    // theta_0 = -1 (length penalty); theta_i decays geometrically by r.
    mbrThetas.push_back(-1); //Theta 0
    mbrThetas.push_back(1/(bleu_order*p));
    for (size_t i = 2; i <= bleu_order; ++i) {
      mbrThetas.push_back( mbrThetas[i-1] / r);
    }
  }
  IFVERBOSE(2) {
    VERBOSE(2,"Thetas: ");
    for (size_t i = 0; i < mbrThetas.size(); ++i) {
      VERBOSE(2,mbrThetas[i] << " ");
    }
    VERBOSE(2,endl);
  }
  TrellisPathList::const_iterator iter;
  size_t ctr = 0;
  LatticeMBRSolutionComparator comparator;
  for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter, ++ctr) {
    const TrellisPath &path = **iter;
    // Only the first (MAP) path carries its model score into the solution.
    solutions.push_back(LatticeMBRSolution(path,iter==nBestList.begin()));
    solutions.back().CalcScore(ngramPosteriors,mbrThetas,mapWeight);
    // Incremental sort+prune keeps at most n solutions in memory.
    sort(solutions.begin(), solutions.end(), comparator);
    while (solutions.size() > n) {
      solutions.pop_back();
    }
  }
  VERBOSE(2,"LMBR Score: " << solutions[0].GetScore() << endl);
}
/** Run lattice MBR keeping only the single best solution; return its words. */
vector<Word> doLatticeMBR(Manager& manager, TrellisPathList& nBestList)
{
  vector<LatticeMBRSolution> best;
  getLatticeMBRNBest(manager, nBestList, best, 1);
  return best.at(0).GetWords();
}
/** Consensus decoding: compute n-gram expectations over the pruned lattice,
 *  then pick the n-best entry with the highest smoothed expected-BLEU score
 *  against those expectations. Returns the winning trellis path.
 *  Asserts if the n-best list is empty or no candidate has matching unigrams. */
const TrellisPath doConsensusDecoding(Manager& manager, TrellisPathList& nBestList)
{
  static const int BLEU_ORDER = 4;
  static const float SMOOTH = 1;

  //calculate the ngram expectations
  const StaticData& staticData = StaticData::Instance();
  std::map < int, bool > connected;
  std::vector< const Hypothesis *> connectedList;
  map<Phrase, float> ngramExpectations;
  std::map < const Hypothesis*, set <const Hypothesis*> > outgoingHyps;
  map<const Hypothesis*, vector<Edge> > incomingEdges;
  vector< float> estimatedScores;
  manager.GetForwardBackwardSearchGraph(&connected, &connectedList, &outgoingHyps, &estimatedScores);
  pruneLatticeFB(connectedList, outgoingHyps, incomingEdges, estimatedScores, manager.GetBestHypothesis(), staticData.GetLatticeMBRPruningFactor(),staticData.GetMBRScale());
  calcNgramExpectations(connectedList, incomingEdges, ngramExpectations,false);

  //expected length is sum of expected unigram counts
  //cerr << "Thread " << pthread_self() <<  " Ngram expectations size: " << ngramExpectations.size() << endl;
  float ref_length = 0.0f;
  for (map<Phrase,float>::const_iterator ref_iter = ngramExpectations.begin();
       ref_iter != ngramExpectations.end(); ++ref_iter) {
    //cerr << "Ngram: " << ref_iter->first << " score: " <<
    //    ref_iter->second << endl;
    if (ref_iter->first.GetSize() == 1) {
      ref_length += exp(ref_iter->second);
      //    cerr << "Expected for " << ref_iter->first << " is " << exp(ref_iter->second) << endl;
    }
  }
  VERBOSE(2,"REF Length: " << ref_length << endl);

  //use the ngram expectations to rescore the nbest list.
  TrellisPathList::const_iterator iter;
  TrellisPathList::const_iterator best = nBestList.end();
  float bestScore = -100000;
  //cerr << "nbest list size: " << nBestList.GetSize() << endl;
  for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
    const TrellisPath &path = **iter;
    vector<Word> words;
    map<Phrase,int> ngrams;
    GetOutputWords(path,words);
    /*for (size_t i = 0; i < words.size(); ++i) {
      cerr << words[i].GetFactor(0)->GetString() << " ";
    }
    cerr << endl;
    */
    extract_ngrams(words,ngrams);

    // comps holds alternating (clipped match count, possible count) per
    // order, with the expected reference length in the final slot.
    vector<float> comps(2*BLEU_ORDER+1);
    float logbleu = 0.0;
    float brevity = 0.0;
    int hyp_length = words.size();
    for (int i = 0; i < BLEU_ORDER; ++i) {
      comps[2*i] = 0.0;
      comps[2*i+1] = max(hyp_length-i,0);
    }

    for (map<Phrase,int>::const_iterator hyp_iter = ngrams.begin();
         hyp_iter != ngrams.end(); ++hyp_iter) {
      map<Phrase,float>::const_iterator ref_iter = ngramExpectations.find(hyp_iter->first);
      if (ref_iter != ngramExpectations.end()) {
        // Clip the hypothesis count by the expected reference count.
        comps[2*(hyp_iter->first.GetSize()-1)] += min(exp(ref_iter->second), (float)(hyp_iter->second));
      }

    }
    comps[comps.size()-1] = ref_length;
    /*for (size_t i = 0; i < comps.size(); ++i) {
      cerr << comps[i] << " ";
    }
    cerr << endl;
    */

    float score = 0.0f;
    if (comps[0] != 0) {
      // Smoothed BLEU: unsmoothed unigram term, add-SMOOTH for higher orders.
      for (int i=0; i<BLEU_ORDER; i++) {
        if ( i > 0 ) {
          logbleu += log((float)comps[2*i]+SMOOTH)-log((float)comps[2*i+1]+SMOOTH);
        } else {
          logbleu += log((float)comps[2*i])-log((float)comps[2*i+1]);
        }
      }
      logbleu /= BLEU_ORDER;
      brevity = 1.0-(float) comps[comps.size()-1]/comps[1]; // comps[comps_n-1] is the ref length, comps[1] is the test length
      if (brevity < 0.0) {
        logbleu += brevity;
      }
      score =  exp(logbleu);
    }

    //cerr << "score: " << score << " bestScore: " << bestScore <<  endl;
    if (score > bestScore) {
      bestScore = score;
      best = iter;
      VERBOSE(2,"NEW BEST: " << score << endl);
      //for (size_t i = 0; i < comps.size(); ++i) {
      //  cerr << comps[i] << " ";
      //}
      //cerr << endl;
    }
  }

  assert (best != nBestList.end());
  return **best;
  //vector<Word> bestWords;
  //GetOutputWords(**best,bestWords);
  //return bestWords;
}
}

View File

@ -0,0 +1,153 @@
/*
* LatticeMBR.h
* moses-cmd
*
* Created by Abhishek Arun on 26/01/2010.
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#ifndef moses_cmd_LatticeMBR_h
#define moses_cmd_LatticeMBR_h
#include <map>
#include <vector>
#include <set>
#include "Hypothesis.h"
#include "Manager.h"
#include "TrellisPathList.h"
namespace MosesCmd
{
class Edge;

typedef std::vector< const Moses::Hypothesis *> Lattice;    // nodes of the (pruned) search lattice
typedef std::vector<const Edge*> Path;                      // a sequence of edges through the lattice
typedef std::map<Path, size_t> PathCounts;                  // occurrence count of an ngram per path
typedef std::map<Moses::Phrase, PathCounts > NgramHistory;  // ngram -> paths on which it occurs
/** A scored arc of the pruned search lattice, carrying the target phrase
 *  produced when moving from the tail hypothesis to the head hypothesis. */
class Edge
{
  const Moses::Hypothesis* m_tailNode;   // predecessor hypothesis
  const Moses::Hypothesis* m_headNode;   // successor hypothesis
  float m_score;                         // (scaled) score delta of this transition
  Moses::TargetPhrase m_targetPhrase;    // words emitted by this transition
  NgramHistory m_ngrams;                 // memoised ngrams ending on this edge

public:
  Edge(const Moses::Hypothesis* from, const Moses::Hypothesis* to, float score, const Moses::TargetPhrase& targetPhrase) : m_tailNode(from), m_headNode(to), m_score(score), m_targetPhrase(targetPhrase) {
    //cout << "Creating new edge from Node " << from->GetId() << ", to Node : " << to->GetId() << ", score: " << score << " phrase: " << targetPhrase << endl;
  }

  const Moses::Hypothesis* GetHeadNode() const {
    return m_headNode;
  }

  const Moses::Hypothesis* GetTailNode() const {
    return m_tailNode;
  }

  float GetScore() const {
    return m_score;
  }

  size_t GetWordsSize() const {
    return m_targetPhrase.GetSize();
  }

  const Moses::Phrase& GetWords() const {
    return m_targetPhrase;
  }

  friend std::ostream& operator<< (std::ostream& out, const Edge& edge);

  /// Lazily computed; memoised in m_ngrams on first call.
  const NgramHistory&  GetNgrams(  std::map<const Moses::Hypothesis*, std::vector<Edge> > & incomingEdges) ;

  bool operator < (const Edge & compare) const;

  /// Append the last lastN words of origPhrase to targetPhrase.
  void GetPhraseSuffix(const Moses::Phrase&  origPhrase, size_t lastN, Moses::Phrase& targetPhrase) const;

  /// Record that `phrase` occurs `count` more times on `path`.
  void storeNgramHistory(const Moses::Phrase& phrase, Path & path, size_t count = 1) {
    m_ngrams[phrase][path]+= count;
  }
};
/**
* Data structure to hold the ngram scores as we traverse the lattice. Maps (hypo,ngram) to score
*/
/**
 * Data structure to hold the ngram scores as we traverse the lattice.
 * Maps (hypothesis, ngram) to a log score.
 */
class NgramScores
{
public:
  NgramScores() {}

  /** logsum this score to the existing score for (node, ngram) */
  void addScore(const Moses::Hypothesis* node, const Moses::Phrase& ngram, float score);

  /** Iterate through ngrams for selected node */
  typedef std::map<const Moses::Phrase*, float>::const_iterator NodeScoreIterator;
  NodeScoreIterator nodeBegin(const Moses::Hypothesis* node);
  NodeScoreIterator nodeEnd(const Moses::Hypothesis* node);

private:
  // Set of all n-grams seen; presumably the Phrase* keys of the inner maps
  // in m_scores point into this set -- confirm in the .cpp.
  std::set<Moses::Phrase> m_ngrams;
  // Per-hypothesis score table: node -> (ngram -> score).
  std::map<const Moses::Hypothesis*, std::map<const Moses::Phrase*, float> > m_scores;
};
/** Holds a lattice mbr solution, and its scores */
class LatticeMBRSolution
{
public:
/** Read the words from the path */
LatticeMBRSolution(const Moses::TrellisPath& path, bool isMap);
const std::vector<float>& GetNgramScores() const {
return m_ngramScores;
}
const std::vector<Moses::Word>& GetWords() const {
return m_words;
}
float GetMapScore() const {
return m_mapScore;
}
float GetScore() const {
return m_score;
}
/** Initialise ngram scores */
void CalcScore(std::map<Moses::Phrase, float>& finalNgramScores, const std::vector<float>& thetas, float mapWeight);
private:
std::vector<Moses::Word> m_words;
float m_mapScore;
std::vector<float> m_ngramScores;
float m_score;
};
/** Strict-weak ordering that ranks LatticeMBRSolutions best-first
 *  (descending combined score). */
struct LatticeMBRSolutionComparator {
  bool operator()(const LatticeMBRSolution& a, const LatticeMBRSolution& b) {
    // a precedes b exactly when a's score is strictly larger.
    return b.GetScore() < a.GetScore();
  }
};
// Forward-backward pruning of the lattice; edgeDensity controls how many
// edges survive and scale rescales the model scores.
// NOTE(review): exact pruning criterion inferred from parameter names --
// confirm against the definition in LatticeMBR.cpp.
void pruneLatticeFB(Lattice & connectedHyp, std::map < const Moses::Hypothesis*, std::set <const Moses::Hypothesis* > > & outgoingHyps, std::map<const Moses::Hypothesis*, std::vector<Edge> >& incomingEdges,
                    const std::vector< float> & estimatedScores, const Moses::Hypothesis*, size_t edgeDensity,float scale);

// Use the ngram scores to rerank the nbest list, return at most n solutions.
void getLatticeMBRNBest(Moses::Manager& manager, Moses::TrellisPathList& nBestList, std::vector<LatticeMBRSolution>& solutions, size_t n);

// Calculate expected ngram counts, clipping at 1 (i.e. calculating posteriors)
// if posteriors==true.
void calcNgramExpectations(Lattice & connectedHyp, std::map<const Moses::Hypothesis*, std::vector<Edge> >& incomingEdges, std::map<Moses::Phrase,
                           float>& finalNgramScores, bool posteriors);

// Collect the output words along a trellis path into `translation`.
void GetOutputFactors(const Moses::TrellisPath &path, std::vector <Moses::Word> &translation);

// Count the n-grams of `sentence` into `allngrams`.
void extract_ngrams(const std::vector<Moses::Word >& sentence, std::map < Moses::Phrase, int > & allngrams);

// Comparator over hypotheses; presumably orders by source coverage -- confirm.
bool ascendingCoverageCmp(const Moses::Hypothesis* a, const Moses::Hypothesis* b);

// Decision rules: lattice MBR and consensus decoding over the n-best list.
std::vector<Moses::Word> doLatticeMBR(Moses::Manager& manager, Moses::TrellisPathList& nBestList);
const Moses::TrellisPath doConsensusDecoding(Moses::Manager& manager, Moses::TrellisPathList& nBestList);
//std::vector<Moses::Word> doConsensusDecoding(Moses::Manager& manager, Moses::TrellisPathList& nBestList);
}
#endif

View File

@ -0,0 +1,213 @@
// $Id: LatticeMBRGrid.cpp 3045 2010-04-05 13:07:29Z hieuhoang1972 $
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (c) 2010 University of Edinburgh
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the University of Edinburgh nor the names of its contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
***********************************************************************/
/**
* Lattice MBR grid search. Enables a grid search through the four parameters (p,r,scale and prune) used in lattice MBR.
See 'Lattice Minimum Bayes-Risk Decoding for Statistical Machine Translation by Tromble, Kumar, Och and Macherey,
EMNLP 2008 for details of the parameters.
The grid search is controlled by specifying comma separated lists for the lmbr parameters (-lmbr-p, -lmbr-r,
-lmbr-pruning-factor and -mbr-scale). All other parameters are passed through to moses. If any of the lattice mbr
parameters are missing, then they are set to their default values. Output is of the form:
sentence-id ||| p r prune scale ||| translation-hypothesis
**/
#include <cstdlib>
#include <iostream>
#include <map>
#include <stdexcept>
#include <set>
#include "IOWrapper.h"
#include "LatticeMBR.h"
#include "Manager.h"
#include "StaticData.h"
using namespace std;
using namespace Moses;
using namespace MosesCmd;
// Keys for the four grid-search dimensions: lattice-MBR precision (p),
// ratio (r), pruning factor, and MBR scale.
enum gridkey {lmbr_p,lmbr_r,lmbr_prune,lmbr_scale};
namespace MosesCmd
{
/** Holds the grid of parameter values to search over, and parses the
 *  comma-separated per-parameter value lists off the command line. */
class Grid
{
public:
  /** Add a parameter with key, command line argument, and default value */
  void addParam(gridkey key, const string& arg, float defaultValue) {
    m_args[arg] = key;
    CHECK(m_grid.find(key) == m_grid.end());
    m_grid[key].push_back(defaultValue);
  }

  /** Parse the arguments, removing those that define the grid and returning
   *  a copy of the rest for moses to consume. argc/argv are rewritten
   *  in place. */
  void parseArgs(int& argc, char**& argv) {
    char** newargv = new char*[argc+1]; //Space to add mbr parameter
    int newargc = 0;
    for (int i = 0; i < argc; ++i) {
      bool consumed = false;
      for (map<string,gridkey>::const_iterator argi = m_args.begin(); argi != m_args.end(); ++argi) {
        if (!strcmp(argv[i], argi->first.c_str())) {
          ++i;
          if (i >= argc) {
            cerr << "Error: missing parameter for " << argi->first << endl;
            throw runtime_error("Missing parameter");
          } else {
            string value = argv[i];
            gridkey key = argi->second;
            if (m_grid[key].size() != 1) {
              throw runtime_error("Duplicate grid argument");
            }
            // Replace the default with the user-supplied value list.
            m_grid[key].clear();
            char delim = ',';
            string::size_type lastpos = value.find_first_not_of(delim);
            string::size_type pos = value.find_first_of(delim,lastpos);
            while (string::npos != pos || string::npos != lastpos) {
              const string token = value.substr(lastpos, pos-lastpos);
              // BUGFIX: the original used atof() and rejected any value that
              // parsed to 0, so a legitimate "0" (e.g. -lmbr-p 0) was treated
              // as an error. Validate with strtod's end pointer instead.
              char* end = NULL;
              float param = static_cast<float>(strtod(token.c_str(), &end));
              if (end == token.c_str() || *end != '\0') {
                cerr << "Error: Illegal grid parameter for " << argi->first << endl;
                throw runtime_error("Illegal grid parameter");
              }
              m_grid[key].push_back(param);
              lastpos = value.find_first_not_of(delim,pos);
              pos = value.find_first_of(delim,lastpos);
            }
            consumed = true;
          }
          if (consumed) break;
        }
      }
      if (!consumed) {
        // Pass-through argument: deep-copy it into the new argv.
        newargv[newargc] = new char[strlen(argv[i]) + 1];
        strcpy(newargv[newargc],argv[i]);
        ++newargc;
      }
    }
    argc = newargc;
    argv = newargv;
  }

  /** Get the grid (list of values) for a particular key. */
  const vector<float>& getGrid(gridkey key) const {
    map<gridkey,vector<float> >::const_iterator iter = m_grid.find(key);
    assert (iter != m_grid.end());
    return iter->second;
  }

private:
  map<gridkey,vector<float> > m_grid;  // key -> values to search over
  map<string,gridkey> m_args;          // command-line flag -> key
};
} // namespace
/** Entry point of the lattice-MBR grid search: decodes each input sentence
 *  once, then re-scores its n-best list at every (p, r, prune, scale) grid
 *  point, printing one output line per grid point. */
int main(int argc, char* argv[])
{
  cerr << "Lattice MBR Grid search" << endl;

  // Register the four searchable parameters with their defaults.
  Grid grid;
  grid.addParam(lmbr_p, "-lmbr-p", 0.5);
  grid.addParam(lmbr_r, "-lmbr-r", 0.5);
  grid.addParam(lmbr_prune, "-lmbr-pruning-factor",30.0);
  grid.addParam(lmbr_scale, "-mbr-scale",1.0);
  grid.parseArgs(argc,argv);

  Parameter* params = new Parameter();
  if (!params->LoadParam(argc,argv)) {
    params->Explain();
    exit(1);
  }
  if (!StaticData::LoadDataStatic(params, argv[0])) {
    exit(1);
  }

  StaticData& staticData = const_cast<StaticData&>(StaticData::Instance());
  staticData.SetUseLatticeMBR(true);
  IOWrapper* ioWrapper = GetIOWrapper(staticData);

  if (!ioWrapper) {
    throw runtime_error("Failed to initialise IOWrapper");
  }

  size_t nBestSize = staticData.GetMBRSize();

  // BUGFIX: the original wrote "throw new runtime_error(...)", which throws
  // a raw pointer that no handler catches; throw by value instead. Also,
  // nBestSize is unsigned, so "<= 0" can only mean "== 0".
  if (nBestSize == 0) {
    throw runtime_error("Non-positive size specified for n-best list");
  }

  size_t lineCount = 0;
  InputType* source = NULL;

  const vector<float>& pgrid = grid.getGrid(lmbr_p);
  const vector<float>& rgrid = grid.getGrid(lmbr_r);
  const vector<float>& prune_grid = grid.getGrid(lmbr_prune);
  const vector<float>& scale_grid = grid.getGrid(lmbr_scale);

  while(ReadInput(*ioWrapper,staticData.GetInputType(),source)) {
    ++lineCount;
    Sentence sentence;
    const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT);
    // Run the search once; the n-best list is reused for every grid point.
    Manager manager(*source,staticData.GetSearchAlgorithm(), &system);
    manager.ProcessSentence();
    TrellisPathList nBestList;
    manager.CalcNBest(nBestSize, nBestList,true);

    //grid search
    for (vector<float>::const_iterator pi = pgrid.begin(); pi != pgrid.end(); ++pi) {
      float p = *pi;
      staticData.SetLatticeMBRPrecision(p);
      for (vector<float>::const_iterator ri = rgrid.begin(); ri != rgrid.end(); ++ri) {
        float r = *ri;
        staticData.SetLatticeMBRPRatio(r);
        for (vector<float>::const_iterator prune_i = prune_grid.begin(); prune_i != prune_grid.end(); ++prune_i) {
          size_t prune = (size_t)(*prune_i);
          staticData.SetLatticeMBRPruningFactor(prune);
          for (vector<float>::const_iterator scale_i = scale_grid.begin(); scale_i != scale_grid.end(); ++scale_i) {
            float scale = *scale_i;
            staticData.SetMBRScale(scale);
            // Output format: sentence-id ||| p r prune scale ||| hypothesis
            cout << lineCount << " ||| " << p << " " << r << " " << prune << " " << scale << " ||| ";
            vector<Word> mbrBestHypo = doLatticeMBR(manager,nBestList);
            OutputBestHypo(mbrBestHypo, lineCount, staticData.GetReportSegmentation(),
                           staticData.GetReportAllFactors(),cout);
          }
        }
      }
    }
  }
}

View File

@ -0,0 +1,282 @@
/***********************************************************************
Relative Entropy-based Phrase table Pruning
Copyright (C) 2012 Wang Ling
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
/**
* Moses main, for single-threaded and multi-threaded.
**/
#include <exception>
#include <fstream>
#include <sstream>
#include <vector>
#ifdef WIN32
// Include Visual Leak Detector
//#include <vld.h>
#endif
#include "Hypothesis.h"
#include "Manager.h"
#include "IOWrapper.h"
#include "StaticData.h"
#include "Util.h"
#include "ThreadPool.h"
#include "TranslationAnalysis.h"
#include "OutputCollector.h"
#include "RelativeEntropyCalc.h"
#include "LexicalReordering.h"
#include "LexicalReorderingState.h"
#ifdef HAVE_PROTOBUF
#include "hypergraph.pb.h"
#endif
using namespace std;
using namespace Moses;
using namespace MosesCmd;
namespace MosesCmd
{
// output floats with three significant digits
static const size_t PRECISION = 3;
/** Enforce rounding */
/** Put `os` into fixed-point notation, printing `digits` digits after the
 *  decimal point, so all float output is rounded consistently. */
void fix(std::ostream& os, size_t digits)
{
  os.precision(digits);
  os.setf(std::ios::fixed);
}
/** Translates a sentence.
* - calls the search (Manager)
* - applies the decision rule
* - outputs best translation and additional reporting
**/
/** Translates a sentence.
 * - calls the search (Manager)
 * - computes the relative entropy of the resulting search graph
 * - writes the result through the OutputCollector
 **/
class TranslationTask : public Task
{
public:
  /** Takes ownership of `source` (deleted in the destructor);
   *  `searchGraphCollector` is borrowed and may be NULL. */
  TranslationTask(size_t lineNumber,
                  InputType* source, OutputCollector* searchGraphCollector) :
    m_source(source), m_lineNumber(lineNumber),
    m_searchGraphCollector(searchGraphCollector) {}

  /** Translate one sentence
   * gets called by main function implemented at end of this source file */
  void Run() {
    // report thread number
#if defined(WITH_THREADS) && defined(BOOST_HAS_PTHREADS)
    TRACE_ERR("Translating line " << m_lineNumber << " in thread id " << pthread_self() << std::endl);
#endif

    // shorthand for "global data"
    const StaticData &staticData = StaticData::Instance();

    // BUGFIX: the original declared "Sentence sentence();" here -- the most
    // vexing parse, which declares a function rather than an object. The
    // variable was unused in any case, so the statement has been removed.

    // set translation system
    const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT);

    // execute the translation
    // note: this executes the search, resulting in a search graph
    // we still need to apply the decision rule (MAP, MBR, ...)
    Manager manager(m_lineNumber, *m_source,staticData.GetSearchAlgorithm(), &system);
    manager.ProcessSentence();

    // output search graph
    if (m_searchGraphCollector) {
      ostringstream out;
      fix(out,PRECISION);
      vector<SearchGraphNode> searchGraph;
      manager.GetSearchGraph(searchGraph);
      out << RelativeEntropyCalc::CalcRelativeEntropy(m_lineNumber,searchGraph) << endl;
      m_searchGraphCollector->Write(m_lineNumber, out.str());
    }
    manager.CalcDecoderStatistics();
  }

  ~TranslationTask() {
    delete m_source;
  }

private:
  InputType* m_source;                      // owned; deleted in the destructor
  size_t m_lineNumber;                      // input line number, used for ordered output
  OutputCollector* m_searchGraphCollector;  // not owned; may be NULL
  std::ofstream *m_alignmentStream;         // unused in this file; never initialised
};
/** Print one line per weight of the given feature function, in the form
 *  "<description> <short-name> <weight>". */
static void PrintFeatureWeight(const FeatureFunction* ff)
{
  const size_t id = ff->GetScoreBookkeepingID();
  const size_t begin = StaticData::Instance().GetScoreIndexManager().GetBeginIndex(id);
  const size_t end = StaticData::Instance().GetScoreIndexManager().GetEndIndex(id);
  for (size_t idx = begin; idx < end; ++idx) {
    const size_t offset = idx - begin;
    cout << ff->GetScoreProducerDescription(offset) << " "
         << ff->GetScoreProducerWeightShortName(offset) << " "
         << StaticData::Instance().GetAllWeights()[idx] << endl;
  }
}
static void ShowWeights()
{
fix(cout,6);
const StaticData& staticData = StaticData::Instance();
const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT);
const vector<const StatelessFeatureFunction*>& slf =system.GetStatelessFeatureFunctions();
const vector<const StatefulFeatureFunction*>& sff = system.GetStatefulFeatureFunctions();
const vector<PhraseDictionaryFeature*>& pds = system.GetPhraseDictionaries();
const vector<GenerationDictionary*>& gds = system.GetGenerationDictionaries();
for (size_t i = 0; i < sff.size(); ++i) {
PrintFeatureWeight(sff[i]);
}
for (size_t i = 0; i < slf.size(); ++i) {
PrintFeatureWeight(slf[i]);
}
for (size_t i = 0; i < pds.size(); ++i) {
PrintFeatureWeight(pds[i]);
}
for (size_t i = 0; i < gds.size(); ++i) {
PrintFeatureWeight(gds[i]);
}
}
} //namespace
/** main function of the command line version of the relent-filter decoder:
 *  reads sentences, force-decodes each one, and emits its relative-entropy
 *  score through the OutputCollector (single- or multi-threaded). **/
int main(int argc, char** argv)
{
  try {
    // echo command line, if verbose
    IFVERBOSE(1) {
      TRACE_ERR("command: ");
      for(int i=0; i<argc; ++i) TRACE_ERR(argv[i]<<" ");
      TRACE_ERR(endl);
    }

    // set number of significant decimals in output
    fix(cout,PRECISION);
    fix(cerr,PRECISION);

    // load all the settings into the Parameter class
    // (stores them as strings, or array of strings)
    Parameter* params = new Parameter();
    if (!params->LoadParam(argc,argv)) {
      params->Explain();
      exit(1);
    }

    // initialize all "global" variables, which are stored in StaticData
    // note: this also loads models such as the language model, etc.
    if (!StaticData::LoadDataStatic(params, argv[0])) {
      exit(1);
    }

    // setting "-show-weights" -> just dump out weights and exit
    if (params->isParamSpecified("show-weights")) {
      ShowWeights();
      exit(0);
    }

    // shorthand for accessing information in StaticData
    const StaticData& staticData = StaticData::Instance();

    //initialise random numbers
    srand(time(NULL));

    // set up read/writing class
    IOWrapper* ioWrapper = GetIOWrapper(staticData);
    if (!ioWrapper) {
      cerr << "Error; Failed to create IO object" << endl;
      exit(1);
    }

    // check on weights
    vector<float> weights = staticData.GetAllWeights();
    IFVERBOSE(2) {
      TRACE_ERR("The score component vector looks like this:\n" << staticData.GetScoreIndexManager());
      TRACE_ERR("The global weight vector looks like this:");
      for (size_t j=0; j<weights.size(); j++) {
        TRACE_ERR(" " << weights[j]);
      }
      TRACE_ERR("\n");
    }

    // every score must have a weight! check that here:
    if(weights.size() != staticData.GetScoreIndexManager().GetTotalNumberOfScores()) {
      TRACE_ERR("ERROR: " << staticData.GetScoreIndexManager().GetTotalNumberOfScores() << " score components, but " << weights.size() << " weights defined" << std::endl);
      exit(1);
    }

    // setting lexicalized reordering setup
    // NOTE(review): globally disables the first backward reordering score;
    // presumably required by the relative-entropy computation -- confirm.
    PhraseBasedReorderingState::m_useFirstBackwardScore = false;

    // collector serialises per-sentence output so multi-threaded runs
    // print results in input order
    auto_ptr<OutputCollector> outputCollector;
    outputCollector.reset(new OutputCollector());

#ifdef WITH_THREADS
    ThreadPool pool(staticData.ThreadCount());
#endif

    // main loop over set of input sentences
    InputType* source = NULL;
    size_t lineCount = 0;
    while(ReadInput(*ioWrapper,staticData.GetInputType(),source)) {
      IFVERBOSE(1) {
        ResetUserTime();
      }
      // set up task of translating one sentence;
      // the task takes ownership of `source`
      TranslationTask* task =
        new TranslationTask(lineCount,source, outputCollector.get());

      // execute task
#ifdef WITH_THREADS
      pool.Submit(task);
#else
      task->Run();
      delete task;
#endif
      source = NULL; //make sure it doesn't get deleted
      ++lineCount;
    }

    // we are done, finishing up
#ifdef WITH_THREADS
    pool.Stop(true); //flush remaining jobs
#endif

  } catch (const std::exception &e) {
    std::cerr << "Exception: " << e.what() << std::endl;
    return EXIT_FAILURE;
  }

#ifndef EXIT_RETURN
  //This avoids that destructors are called (it can take a long time)
  exit(EXIT_SUCCESS);
#else
  return EXIT_SUCCESS;
#endif
}

View File

@ -0,0 +1,39 @@
/*********************************************************************
Relative Entropy-based Phrase table Pruning
Copyright (C) 2012 Wang Ling
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the University of Edinburgh nor the names of its contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
***********************************************************************/
#ifndef moses_cmd_Main_h
#define moses_cmd_Main_h
#include "StaticData.h"
class IOWrapper;
int main(int argc, char* argv[]);
#endif

View File

@ -0,0 +1,83 @@
/***********************************************************************
Relative Entropy-based Phrase table Pruning
Copyright (C) 2012 Wang Ling
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <vector>
#include "Hypothesis.h"
#include "StaticData.h"
#include "RelativeEntropyCalc.h"
#include "Manager.h"
using namespace std;
using namespace Moses;
using namespace MosesCmd;
namespace MosesCmd
{
/** Relative-entropy pruning score for one input sentence. Scans the search
 *  graph for final hypotheses whose full output equals the forced-decoding
 *  constraint, and returns the score gap between the best such derivation
 *  overall (unpruned) and the best one built from more than one phrase
 *  application (pruned). The result is capped at 100. */
double RelativeEntropyCalc::CalcRelativeEntropy(int translationId, std::vector<SearchGraphNode>& searchGraph){
  const StaticData &staticData = StaticData::Instance();
  // Reference output this sentence was decoded against.
  // NOTE(review): despite the m_ prefix this is a local, not a member.
  const Phrase *m_constraint = staticData.GetConstrainingPhrase(translationId);

  // -max doubles act as "no matching derivation found yet" sentinels.
  double prunedScore = -numeric_limits<double>::max();
  double unprunedScore = -numeric_limits<double>::max();

  for (size_t i = 0; i < searchGraph.size(); ++i) {
    const SearchGraphNode& searchNode = searchGraph[i];
    int nodeId = searchNode.hypo->GetId();
    if(nodeId == 0) continue; // initial hypothesis

    int forwardId = searchNode.forward;
    if(forwardId == -1){ // is final hypothesis
      // Rebuild the complete output string of this derivation.
      Phrase catOutput(0);
      ConcatOutputPhraseRecursive(catOutput, searchNode.hypo);
      if(catOutput == *m_constraint){ // is the output actually the same as the constraint (forced decoding does not always force the output)
        const Hypothesis *prevHypo = searchNode.hypo->GetPrevHypo();
        int backId = prevHypo->GetId();
        double derivationScore = searchNode.hypo->GetScore();
        if(backId != 0){ // derivation using smaller units (more than one phrase application)
          if(prunedScore < derivationScore){
            prunedScore = derivationScore;
          }
        }
        if(unprunedScore < derivationScore){
          unprunedScore = derivationScore;
        }
      }
    }
  }

  double neg_log_div = 0;
  if( unprunedScore == -numeric_limits<double>::max()){
    // Sentinel untouched: no derivation matched the constraint, so the
    // divergence is undefined. Assign the maximum, which the cap below
    // turns into 100. (The original comment claimed this gives a "low
    // score so that it doesnt get pruned" -- confirm the threshold
    // direction against the filter that consumes this value.)
    neg_log_div = numeric_limits<double>::max();
  }
  else{
    neg_log_div = unprunedScore - prunedScore;
  }

  // Cap the divergence so downstream arithmetic stays well-behaved.
  if (neg_log_div > 100){
    return 100;
  }
  return neg_log_div;
}
/** Append to `phrase` the full target output of the derivation ending at
 *  `hypo`, earliest hypothesis first. The initial hypothesis (id 0)
 *  contributes nothing. (Implemented iteratively; the name is kept for
 *  source compatibility.) */
void RelativeEntropyCalc::ConcatOutputPhraseRecursive(Phrase& phrase, const Hypothesis *hypo){
  // Walk back to the start of the derivation, remembering each hypothesis.
  std::vector<const Hypothesis*> chain;
  for (const Hypothesis* h = hypo; h->GetId() != 0; h = h->GetPrevHypo()) {
    chain.push_back(h);
  }
  // Append the target phrases in chronological (left-to-right) order.
  for (std::vector<const Hypothesis*>::reverse_iterator it = chain.rbegin(); it != chain.rend(); ++it) {
    phrase.Append((*it)->GetCurrTargetPhrase());
  }
}
}

View File

@ -0,0 +1,51 @@
/*********************************************************************
Relative Entropy-based Phrase table Pruning
Copyright (C) 2012 Wang Ling
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the University of Edinburgh nor the names of its contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
***********************************************************************/
#include <vector>
#include "Hypothesis.h"
#include "StaticData.h"
#include "Manager.h"
using namespace std;
using namespace Moses;
namespace MosesCmd
{
/** Computes the relative-entropy score of a sentence's search graph,
 *  used by the relent-filter to decide which phrase pairs to prune. */
class RelativeEntropyCalc
{
public:
  /** Score the search graph of sentence `translationId`; see the .cpp for the definition. */
  static double CalcRelativeEntropy(int translationId, std::vector<SearchGraphNode>& searchGraph);

protected:
  /** Append the target words of the derivation ending at `hypo` to `phrase`. */
  static void ConcatOutputPhraseRecursive(Phrase& phrase, const Hypothesis *hypo);
};
}

View File

@ -0,0 +1,126 @@
// $Id$
#include <iostream>
#include <sstream>
#include <algorithm>
#include "StaticData.h"
#include "Hypothesis.h"
#include "TranslationAnalysis.h"
using namespace Moses;
namespace TranslationAnalysis
{
/** Print a detailed, human-readable trace of one translation hypothesis to
 *  `os`: phrase-by-phrase source/target segments and alignments, source/
 *  target span maps, dropped words, language-model n-gram statistics and
 *  the final weighted score breakdown. */
void PrintTranslationAnalysis(const TranslationSystem* system, std::ostream &os, const Hypothesis* hypo)
{
  os << std::endl << "TRANSLATION HYPOTHESIS DETAILS:" << std::endl;

  // Collect the back-pointer chain and reverse it into decoding order.
  std::vector<const Hypothesis*> translationPath;
  while (hypo) {
    translationPath.push_back(hypo);
    hypo = hypo->GetPrevHypo();
  }
  std::reverse(translationPath.begin(), translationPath.end());

  std::vector<std::string> droppedWords;
  std::vector<const Hypothesis*>::iterator tpi = translationPath.begin();
  if(tpi == translationPath.end())
    return;
  ++tpi; // skip initial translation state

  std::vector<std::string> sourceMap;
  std::vector<std::string> targetMap;
  std::vector<unsigned int> lmAcc(0);   // per-LM accumulated n-gram lengths
  size_t lmCalls = 0;                   // total number of LM queries seen
  bool doLMStats = ((*tpi)->GetLMStats() != 0);
  if (doLMStats)
    lmAcc.resize((*tpi)->GetLMStats()->size(), 0);

  for (; tpi != translationPath.end(); ++tpi) {
    std::ostringstream sms;
    std::ostringstream tms;
    std::string target = (*tpi)->GetTargetPhraseStringRep();
    std::string source = (*tpi)->GetSourcePhraseStringRep();
    WordsRange twr = (*tpi)->GetCurrTargetWordsRange();
    WordsRange swr = (*tpi)->GetCurrSourceWordsRange();
    const AlignmentInfo &alignmentInfo = (*tpi)->GetCurrTargetPhrase().GetAlignmentInfo();

    // language model backoff stats: sum every reported n-gram length per LM
    if (doLMStats) {
      std::vector<std::vector<unsigned int> >& lmstats = *(*tpi)->GetLMStats();
      std::vector<std::vector<unsigned int> >::iterator i = lmstats.begin();
      std::vector<unsigned int>::iterator acc = lmAcc.begin();
      for (; i != lmstats.end(); ++i, ++acc) {
        std::vector<unsigned int>::iterator j = i->begin();
        lmCalls += i->size();
        for (; j != i->end(); ++j) {
          (*acc) += *j;
        }
      }
    }

    // an empty target phrase means the source words were dropped
    bool epsilon = false;
    if (target == "") {
      target="<EPSILON>";
      epsilon = true;
      droppedWords.push_back(source);
    }
    os << " SOURCE: " << swr << " " << source << std::endl
       << " TRANSLATED AS: " << target << std::endl
       << " WORD ALIGNED: " << alignmentInfo << std::endl;

    // Build span-map strings like "3-4-5", or "del(3)" for dropped words.
    size_t twr_i = twr.GetStartPos();
    size_t swr_i = swr.GetStartPos();
    if (!epsilon) {
      sms << twr_i;
    }
    if (epsilon) {
      tms << "del(" << swr_i << ")";
    } else {
      tms << swr_i;
    }
    swr_i++;
    twr_i++;
    for (; twr_i <= twr.GetEndPos() && twr.GetEndPos() != NOT_FOUND; twr_i++) {
      sms << '-' << twr_i;
    }
    for (; swr_i <= swr.GetEndPos() && swr.GetEndPos() != NOT_FOUND; swr_i++) {
      tms << '-' << swr_i;
    }
    if (!epsilon) targetMap.push_back(sms.str());
    sourceMap.push_back(tms.str());
  }

  // Print the collected source/target span maps.
  std::vector<std::string>::iterator si = sourceMap.begin();
  std::vector<std::string>::iterator ti = targetMap.begin();
  os << std::endl << "SOURCE/TARGET SPANS:";
  os << std::endl << " SOURCE:";
  for (; si != sourceMap.end(); ++si) {
    os << " " << *si;
  }
  os << std::endl << " TARGET:";
  for (; ti != targetMap.end(); ++ti) {
    os << " " << *ti;
  }
  os << std::endl << std::endl;

  // Average n-gram length per language model, if LM stats were collected.
  if (doLMStats && lmCalls > 0) {
    std::vector<unsigned int>::iterator acc = lmAcc.begin();
    const LMList& lmlist = system->GetLanguageModels();
    LMList::const_iterator i = lmlist.begin();
    for (; acc != lmAcc.end(); ++acc, ++i) {
      char buf[256];
      sprintf(buf, "%.4f", (float)(*acc)/(float)lmCalls);
      os << (*i)->GetScoreProducerDescription() <<", AVG N-GRAM LENGTH: " << buf << std::endl;
    }
  }

  if (droppedWords.size() > 0) {
    std::vector<std::string>::iterator dwi = droppedWords.begin();
    os << std::endl << "WORDS/PHRASES DROPPED:" << std::endl;
    for (; dwi != droppedWords.end(); ++dwi) {
      os << "\tdropped=" << *dwi << std::endl;
    }
  }

  os << std::endl << "SCORES (UNWEIGHTED/WEIGHTED): ";
  StaticData::Instance().GetScoreIndexManager().PrintLabeledWeightedScores(os, translationPath.back()->GetScoreBreakdown(), StaticData::Instance().GetAllWeights());
  os << std::endl;
}
}

View File

@ -0,0 +1,25 @@
// $Id$
/*
* also see moses/SentenceStats
*/
#ifndef moses_cmd_TranslationAnalysis_h
#define moses_cmd_TranslationAnalysis_h
#include <iostream>
#include "Hypothesis.h"
#include "TranslationSystem.h"
namespace TranslationAnalysis
{
/***
* print details about the translation represented in hypothesis to
* os. Included information: phrase alignment, words dropped, scores
*/
void PrintTranslationAnalysis(const Moses::TranslationSystem* system, std::ostream &os, const Moses::Hypothesis* hypo);
}
#endif

178
contrib/relent-filter/src/mbr.cpp Executable file
View File

@ -0,0 +1,178 @@
#include <iostream>
#include <fstream>
#include <sstream>
#include <iomanip>
#include <vector>
#include <map>
#include <stdlib.h>
#include <math.h>
#include <algorithm>
#include <stdio.h>
#include "TrellisPathList.h"
#include "TrellisPath.h"
#include "StaticData.h"
#include "Util.h"
#include "mbr.h"
using namespace std ;
using namespace Moses;
/* Input :
1. a sorted n-best list, with duplicates filtered out in the following format
0 ||| amr moussa is currently on a visit to libya , tomorrow , sunday , to hold talks with regard to the in sudan . ||| 0 -4.94418 0 0 -2.16036 0 0 -81.4462 -106.593 -114.43 -105.55 -12.7873 -26.9057 -25.3715 -52.9336 7.99917 -24 ||| -4.58432
2. a weight vector
3. bleu order ( default = 4)
4. scaling factor to weigh the weight vector (default = 1.0)
Output :
translations that minimise the Bayes Risk of the n-best list
*/
// BLEU configuration: maximum n-gram order, and add-one smoothing applied
// to orders > 1 in calculate_score().
int BLEU_ORDER = 4;
int SMOOTH = 1;
// Convergence/step threshold; not referenced in this excerpt --
// TODO(review): confirm where min_interval is used.
float min_interval = 1e-4;
/** Count every n-gram of `sentence` with length 1..BLEU_ORDER, incrementing
 *  its entry in `allngrams`. */
void extract_ngrams(const vector<const Factor* >& sentence, map < vector < const Factor* >, int > & allngrams)
{
  const int len = (int)sentence.size();
  // order = n-gram length, start = index of its first token
  for (int order = 1; order <= BLEU_ORDER; ++order) {
    for (int start = 0; start + order <= len; ++start) {
      vector<const Factor*> ngram(sentence.begin() + start,
                                  sentence.begin() + start + order);
      ++allngrams[ngram];
    }
  }
}
/** Smoothed sentence-level BLEU of sents[hyp] against sents[ref], computed
 *  from the pre-extracted n-gram counts in ngram_stats. Returns 0 when the
 *  hypothesis has no unigram matches. */
float calculate_score(const vector< vector<const Factor*> > & sents, int ref, int hyp, vector < map < vector < const Factor *>, int > > & ngram_stats )
{
  const int comps_n = 2*BLEU_ORDER+1;
  vector<int> comps(comps_n);
  const int hyp_length = sents[hyp].size();

  // comps[2i] = clipped matches for (i+1)-grams; comps[2i+1] = hyp (i+1)-gram total
  for (int i = 0; i < BLEU_ORDER; i++) {
    comps[2*i] = 0;
    comps[2*i+1] = max(hyp_length-i,0);
  }

  // Clip each hypothesis n-gram count by the reference count.
  map< vector < const Factor * > ,int > & hyp_ngrams = ngram_stats[hyp] ;
  map< vector < const Factor * >, int > & ref_ngrams = ngram_stats[ref] ;
  for (map< vector< const Factor * >, int >::iterator it = hyp_ngrams.begin(); it != hyp_ngrams.end(); ++it) {
    map< vector< const Factor * >, int >::iterator ref_it = ref_ngrams.find(it->first);
    if (ref_it != ref_ngrams.end()) {
      comps[2*(it->first.size()-1)] += min(ref_it->second, it->second);
    }
  }
  comps[comps_n-1] = sents[ref].size();

  // No unigram matches -> BLEU is zero. (comps[0] does not change below, so
  // the check is hoisted out of the loop.)
  if (comps[0] == 0) {
    return 0.0;
  }

  float logbleu = 0.0;
  for (int i = 0; i < BLEU_ORDER; i++) {
    if (i > 0) {
      // add-SMOOTH smoothing for higher orders
      logbleu += log((float)comps[2*i]+SMOOTH)-log((float)comps[2*i+1]+SMOOTH);
    } else {
      logbleu += log((float)comps[2*i])-log((float)comps[2*i+1]);
    }
  }
  logbleu /= BLEU_ORDER;

  // Brevity penalty: comps[comps_n-1] is the ref length, comps[1] is the test length
  float brevity = 1.0-(float)comps[comps_n-1]/comps[1];
  if (brevity < 0.0) {
    logbleu += brevity;
  }
  return exp(logbleu);
}
/**
 * Minimum-Bayes-Risk selection under a (1 - BLEU) loss: returns the
 * n-best entry whose expected loss against the rest of the list,
 * weighted by the (scaled) posterior of each competitor, is smallest.
 */
const TrellisPath doMBR(const TrellisPathList& nBestList)
{
  TrellisPathList::const_iterator iter;

  // Pass 1: maximum scaled model score, subtracted later so that
  // UntransformScore does not underflow.
  float maxScore = -1e20;
  for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
    const TrellisPath &path = **iter;
    float score = StaticData::Instance().GetMBRScale()
                  * path.GetScoreBreakdown().InnerProduct(StaticData::Instance().GetAllWeights());
    if (score > maxScore)
      maxScore = score;
  }

  // Pass 2: per-hypothesis posterior mass, surface words, and n-gram counts.
  float marginal = 0;
  vector<float> joint_prob_vec;
  vector< vector<const Factor*> > translations;
  vector< map < vector <const Factor *>, int > > ngram_stats;
  for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
    const TrellisPath &path = **iter;
    float joint_prob = UntransformScore(StaticData::Instance().GetMBRScale() * path.GetScoreBreakdown().InnerProduct(StaticData::Instance().GetAllWeights()) - maxScore);
    marginal += joint_prob;
    joint_prob_vec.push_back(joint_prob);

    vector<const Factor*> translation;
    GetOutputFactors(path, translation);

    map < vector < const Factor *>, int > counts;
    extract_ngrams(translation,counts);
    ngram_stats.push_back(counts);
    translations.push_back(translation);
  }

  // Pass 3: expected-loss minimisation. The inner sum is abandoned as soon
  // as it exceeds the best total found so far.
  float minMBRLoss = 1000000;
  int minMBRLossIdx = -1;
  for (unsigned int i = 0; i < nBestList.GetSize(); i++) {
    float lossCumul = 0;
    for (unsigned int j = 0; j < nBestList.GetSize(); j++) {
      if (i == j)
        continue;
      float bleu = calculate_score(translations, j, i,ngram_stats );
      lossCumul += ( 1 - bleu) * ( joint_prob_vec[j]/marginal);
      if (lossCumul > minMBRLoss)
        break; // already worse than the current best
    }
    if (lossCumul < minMBRLoss) {
      minMBRLoss = lossCumul;
      minMBRLossIdx = i;
    }
  }

  // Hypothesis minimising the Bayes risk under the 1 - BLEU loss.
  return nBestList.at(minMBRLossIdx);
}
void GetOutputFactors(const TrellisPath &path, vector <const Factor*> &translation)
{
const std::vector<const Hypothesis *> &edges = path.GetEdges();
const std::vector<FactorType>& outputFactorOrder = StaticData::Instance().GetOutputFactorOrder();
assert (outputFactorOrder.size() == 1);
// print the surface factor of the translation
for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
const Hypothesis &edge = *edges[currEdge];
const Phrase &phrase = edge.GetCurrTargetPhrase();
size_t size = phrase.GetSize();
for (size_t pos = 0 ; pos < size ; pos++) {
const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[0]);
translation.push_back(factor);
}
}
}

Some files were not shown because too many files have changed in this diff Show More