support for sparse feature functions (mert support only when using PRO)

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4184 1f5c12ca-751b-0410-a591-d2e778427230
phkoehn 2011-09-07 16:37:33 +00:00
parent 9fee4a97f2
commit 41a1849437
16 changed files with 391 additions and 126 deletions
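This commit teaches mert to read sparse features from the n-best list: a token ending in ':' introduces a dense feature block (e.g. "d:", "lm:"), except that a name containing an underscore marks a sparse feature whose value follows as the next token. A minimal sketch of that classification, assuming a made-up feature string (feature names and values here are hypothetical, not taken from the commit):

// Illustrative sketch only -- mirrors the token checks added in
// Data::loadnbest(); feature names and values are made up.
#include <iostream>
#include <sstream>
#include <string>

int main() {
  // two dense blocks ("d:", "lm:") and one sparse name/value pair
  std::istringstream feats("d: 0.5 -1.2 lm: -20.1 word_pair_house_haus: 1");
  std::string tok;
  while (feats >> tok) {
    std::string::size_type loc = tok.find_last_of(":");
    if (loc != tok.size() - 1) {
      std::cout << "dense value: " << tok << "\n";    // no trailing ':'
    } else if (tok.find("_") != std::string::npos) {
      std::string value;
      feats >> value;                                 // sparse value follows its name
      std::cout << "sparse: " << tok << " " << value << "\n";
    } else {
      std::cout << "dense label: " << tok << "\n";    // e.g. "d:" or "lm:"
    }
  }
  return 0;
}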

View File

@ -13,7 +13,8 @@
Data::Data(Scorer& ptr):
- theScorer(&ptr)
+ theScorer(&ptr),
+   _sparse_flag(false)
{
score_type = (*theScorer).getName();
TRACE_ERR("Data::score_type " << score_type << std::endl);
@ -40,7 +41,6 @@ void Data::loadnbest(const std::string &file)
std::string theSentence;
std::string::size_type loc;
while (getline(inp,stringBuf,'\n')) {
if (stringBuf.empty()) continue;
@ -56,16 +56,15 @@ void Data::loadnbest(const std::string &file)
featentry.reset();
scoreentry.clear();
theScorer->prepareStats(sentence_index, theSentence, scoreentry);
scoredata->add(scoreentry, sentence_index);
getNextPound(stringBuf, substring, "|||"); //third field
// examine first line for name of features
if (!existsFeatureNames()) {
std::string stringsupport=substring;
// adding feature names
std::string features="";
std::string tmpname="";
@ -75,10 +74,17 @@ void Data::loadnbest(const std::string &file)
getNextPound(stringsupport, subsubstring);
// string ending with ":" are skipped, because they are the names of the features
if ((loc = subsubstring.find(":")) != subsubstring.length()-1) {
if ((loc = subsubstring.find_last_of(":")) != subsubstring.length()-1) {
features+=tmpname+"_"+stringify(tmpidx)+" ";
tmpidx++;
- } else {
+ }
+ // ignore sparse feature name
+ else if (subsubstring.find("_") != string::npos) {
+   // also ignore its value
+   getNextPound(stringsupport, subsubstring);
+ }
+ // update current feature name
+ else {
tmpidx=0;
tmpname=subsubstring.substr(0,subsubstring.size() - 1);
}
@ -87,22 +93,36 @@ void Data::loadnbest(const std::string &file)
featdata->setFeatureMap(features);
}
// adding features
while (!substring.empty()) {
// TRACE_ERR("Decompounding: " << substring << std::endl);
getNextPound(substring, subsubstring);
- // string ending with ":" are skipped, because they are the names of the features
- if ((loc = subsubstring.find(":")) != subsubstring.length()-1) {
+ // no ':' -> feature value that needs to be stored
+ if ((loc = subsubstring.find_last_of(":")) != subsubstring.length()-1) {
featentry.add(ATOFST(subsubstring.c_str()));
}
+ // sparse feature name? store as well
+ else if (subsubstring.find("_") != string::npos) {
+   std::string name = subsubstring;
+   getNextPound(substring, subsubstring);
+   featentry.addSparse( name, atof(subsubstring.c_str()) );
+   _sparse_flag = true;
+ }
}
//cerr << "number of sparse features: " << featentry.getSparse().size() << endl;
featdata->add(featentry,sentence_index);
}
inp.close();
}
// TODO
void Data::mergeSparseFeatures() {
std::cerr << "ERROR: sparse features can only be trained with pairwise ranked optimizer (PRO), not traditional MERT\n";
exit(1);
}
// really not the right place...
float sentenceLevelBleuPlusOne( ScoreStats &stats ) {
float logbleu = 0.0;
@ -144,7 +164,7 @@ public:
};
- void Data::sample_ranked_pairs( const std::string &rankedpairfile ) {
+ void Data::sampleRankedPairs( const std::string &rankedpairfile ) {
cout << "Sampling ranked pairs." << endl;
ofstream *outFile = new ofstream();
@ -187,20 +207,15 @@ void Data::sample_ranked_pairs( const std::string &rankedpairfile ) {
for(unsigned int i=0; i<samples.size() && collected < n_samples; i++) {
if (samples[i]->getDiff() >= min_diff) {
collected++;
- FeatureStats &f1 = featdata->get(S,samples[i]->getTranslation1());
- FeatureStats &f2 = featdata->get(S,samples[i]->getTranslation2());
*out << "1";
- for(unsigned int j=0; j<f1.size(); j++)
-   if (abs(f1.get(j)-f2.get(j)) > 0.00001)
-     *out << " F" << j << " " << (f1.get(j)-f2.get(j));
- *out << endl;
+ outputSample( *out, featdata->get(S,samples[i]->getTranslation1()),
+               featdata->get(S,samples[i]->getTranslation2()) );
+ *out << endl;
*out << "0";
- for(unsigned int j=0; j<f1.size(); j++)
-   if (abs(f1.get(j)-f2.get(j)) > 0.00001)
-     *out << " F" << j << " " << (f2.get(j)-f1.get(j));
- *out << endl;
+ outputSample( *out, featdata->get(S,samples[i]->getTranslation2()),
+               featdata->get(S,samples[i]->getTranslation1()) );
+ *out << endl;
}
delete samples[i];
}
@ -209,3 +224,31 @@ void Data::sample_ranked_pairs( const std::string &rankedpairfile ) {
out->flush();
outFile->close();
}
void Data::outputSample( ostream &out, const FeatureStats &f1, const FeatureStats &f2 )
{
// difference in score in regular features
for(unsigned int j=0; j<f1.size(); j++)
if (abs(f1.get(j)-f2.get(j)) > 0.00001)
out << " F" << j << " " << (f1.get(j)-f2.get(j));
if (!hasSparseFeatures())
return;
// sparse features
const sparse_featstats_t &s1 = f1.getSparse();
const sparse_featstats_t &s2 = f2.getSparse();
for( sparse_featstats_t::const_iterator i=s1.begin(); i!=s1.end(); i++) {
if (s2.find(i->first) == s2.end())
out << " " << i->first << " " << i->second;
else {
float diff = i->second - s2.find(i->first)->second;
if (abs(diff) > 0.00001)
out << " " << i->first << " " << diff;
}
}
for( sparse_featstats_t::const_iterator i=s2.begin(); i!=s2.end(); i++) {
if (s1.find(i->first) == s1.end())
out << " " << i->first << " " << (- i->second);
}
}
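// Illustration only (not in the commit): each sampled pair now yields two
// complementary lines in the ranked-pair file, "1" followed by the feature
// differences f1-f2 and "0" with the signs flipped, sparse features emitted
// by name, e.g.
//   1 F0 0.25 F3 -0.5 word_pair_house_haus 1
//   0 F0 -0.25 F3 0.5 word_pair_house_haus -1
// (feature names and values made up).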

View File

@ -31,10 +31,10 @@ private:
Scorer* theScorer;
std::string score_type;
size_t number_of_scores; //number of scores
bool _sparse_flag;
public:
Data(Scorer& sc);
~Data() {};
inline void clear() {
@ -62,11 +62,16 @@ public:
featdata->Features(f);
}
inline bool hasSparseFeatures() const { return _sparse_flag; }
void mergeSparseFeatures();
void loadnbest(const std::string &file);
void load(const std::string &featfile,const std::string &scorefile) {
featdata->load(featfile);
scoredata->load(scorefile);
if (featdata->hasSparseFeatures())
_sparse_flag = true;
}
void save(const std::string &featfile,const std::string &scorefile, bool bin=false) {
@ -90,7 +95,8 @@ public:
return featdata->getFeatureIndex(name);
};
- void sample_ranked_pairs( const std::string &rankedPairFile );
+ void sampleRankedPairs( const std::string &rankedPairFile );
+ void outputSample( std::ostream &out, const FeatureStats &f1, const FeatureStats &f2 );
};

View File

@ -11,7 +11,7 @@
#include "Util.h"
FeatureArray::FeatureArray(): idx("")
FeatureArray::FeatureArray(): idx(""), _sparse_flag(false)
{};
void FeatureArray::savetxt(std::ofstream& outFile)
@ -69,6 +69,8 @@ void FeatureArray::loadtxt(ifstream& inFile, size_t n)
for (size_t i=0 ; i < n; i++) {
entry.loadtxt(inFile);
add(entry);
if (entry.getSparse().size()>0)
_sparse_flag = true;
}
}

View File

@ -30,6 +30,7 @@ protected:
featarray_t array_;
size_t number_of_features;
std::string features;
bool _sparse_flag;
private:
std::string idx; // idx to identify the utterance, it can differ from the index inside the vector
@ -43,6 +44,10 @@ public:
array_.clear();
}
inline bool hasSparseFeatures() const {
return _sparse_flag;
}
inline std::string getIndex() {
return idx;
}

View File

@ -51,9 +51,12 @@ void FeatureData::load(ifstream& inFile)
if (entry.size() == 0)
break;
- if (size() == 0) {
+ if (size() == 0)
setFeatureMap(entry.Features());
- }
+ if (entry.hasSparseFeatures())
+   _sparse_flag = true;
add(entry);
}
}

View File

@ -26,10 +26,10 @@ protected:
idx2name idx2arrayname_; //map from index to name of array
name2idx arrayname2idx_; //map from name to index of array
private:
size_t number_of_features;
std::string features;
bool _sparse_flag;
map<std::string, size_t> featname2idx_; //map from name to index of features
map<size_t, std::string> idx2featname_; //map from index to name of features
@ -43,6 +43,9 @@ public:
array_.clear();
}
inline bool hasSparseFeatures() const {
return _sparse_flag;
}
inline FeatureArray get(const std::string& idx) {
return array_.at(getIndex(idx));
}

View File

@ -30,6 +30,7 @@ FeatureStats::FeatureStats(const FeatureStats &stats)
entries_ = stats.size();
array_ = new FeatureStatsType[available_];
memcpy(array_,stats.getArray(),featbytes_);
map_ = stats.getSparse();
};
FeatureStats::FeatureStats(const size_t size)
@ -61,6 +62,11 @@ void FeatureStats::add(FeatureStatsType v)
array_[entries_++]=v;
}
void FeatureStats::addSparse(string name, FeatureStatsType v)
{
map_[name]=v;
}
void FeatureStats::set(std::string &theString)
{
std::string substring, stringBuf;
@ -68,7 +74,15 @@ void FeatureStats::set(std::string &theString)
while (!theString.empty()) {
getNextPound(theString, substring);
- add(ATOFST(substring.c_str()));
+ // regular feature
+ if (substring.find(":") == string::npos) {
+   add(ATOFST(substring.c_str()));
+ }
+ // sparse feature
+ else {
+   size_t separator = substring.find_last_of(":");
+   addSparse(substring.substr(0,separator), atof(substring.substr(separator+1).c_str()) );
+ }
}
}
@ -123,6 +137,7 @@ FeatureStats& FeatureStats::operator=(const FeatureStats &stats)
entries_ = stats.size();
array_ = new FeatureStatsType[available_];
memcpy(array_,stats.getArray(),featbytes_);
map_ = stats.getSparse();
return *this;
}
@ -131,7 +146,14 @@ FeatureStats& FeatureStats::operator=(const FeatureStats &stats)
/**write the whole object to a stream*/
ostream& operator<<(ostream& o, const FeatureStats& e)
{
- for (size_t i=0; i< e.size(); i++)
+ // print regular features
+ for (size_t i=0; i< e.size(); i++) {
o << e.get(i) << " ";
+ }
+ // sparse features
+ const sparse_featstats_t &sparse = e.getSparse();
+ for(sparse_featstats_t::const_iterator i = sparse.begin(); i != sparse.end(); i++) {
+   o << i->first << ":" << i->second << " ";
+ }
return o;
}
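// Illustration only (not in the commit): a serialized entry now reads
//   0.5 -1.2 word_pair_house_haus:1
// (values made up) -- dense values first, then name:value pairs; the ':'
// separator is what FeatureStats::set() splits on with find_last_of(":")
// when reading the entry back.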

View File

@ -26,6 +26,7 @@ class FeatureStats
{
private:
featstats_t array_;
sparse_featstats_t map_;
size_t entries_;
size_t available_;
@ -43,9 +44,11 @@ public:
}
void expand();
void add(FeatureStatsType v);
void addSparse(string name, FeatureStatsType v);
inline void clear() {
memset((void*) array_,0,featbytes_);
map_.clear();
}
inline FeatureStatsType get(size_t i) {
@ -57,6 +60,9 @@ public:
inline featstats_t getArray() const {
return array_;
}
inline sparse_featstats_t getSparse() const {
return map_;
}
void set(std::string &theString);

View File

@ -26,6 +26,7 @@ typedef vector<statscore_t> statscores_t;
typedef float FeatureStatsType;
typedef FeatureStatsType* featstats_t;
typedef map<string,FeatureStatsType> sparse_featstats_t;
//typedef vector<FeatureStatsType> featstats_t;
typedef vector<FeatureStats> featarray_t;
typedef vector<FeatureArray> featdata_t;

View File

@ -278,6 +278,12 @@ int main (int argc, char **argv)
PrintUserTime("Data loaded");
// starting point score over latest n-best, accumulative n-best
//vector<unsigned> bests;
//computing bests with sparse features still needs to be implemented
//currently sparse weights are not even loaded
//statscore_t score = TheScorer->score(bests);
if (tooptimizestr.length() > 0) {
cerr << "Weights to optimize: " << tooptimizestr << endl;
@ -305,16 +311,20 @@ int main (int argc, char **argv)
}
if (pairedrankfile.compare("") != 0) {
- D.sample_ranked_pairs(pairedrankfile);
+ D.sampleRankedPairs(pairedrankfile);
PrintUserTime("Stopping...");
exit(0);
}
// treat sparse features just like regular features
if (D.hasSparseFeatures()) {
D.mergeSparseFeatures();
}
Optimizer *O=OptimizerFactory::BuildOptimizer(pdim,tooptimize,start_list[0],type,nrandom);
O->SetScorer(TheScorer);
O->SetFData(D.getFeatureData());
#ifdef WITH_THREADS
cerr << "Creating a pool of " << threads << " threads" << endl;
Moses::ThreadPool pool(threads);

View File

@ -11,6 +11,7 @@
# Excerpts from revision history
# Sept 2011 multi-threaded mert (Barry Haddow)
# 3 Aug 2011 Added random directions, historic best, pairwise ranked (PK)
# Jul 2011 simplifications (Ondrej Bojar)
# -- rely on moses' -show-weights instead of parsing moses.ini
# ... so moses is also run once *before* mert starts, checking
@ -287,8 +288,6 @@ $qsubwrapper="$SCRIPTS_ROOTDIR/generic/qsub-wrapper.pl" if !defined $qsubwrapper
$moses_parallel_cmd = "$SCRIPTS_ROOTDIR/generic/moses-parallel.pl"
if !defined $moses_parallel_cmd;
if (!defined $mertdir) {
$mertdir = "$SCRIPTS_ROOTDIR/../mert";
print STDERR "Assuming --mertdir=$mertdir\n";
@ -357,13 +356,11 @@ die "Not executable: $moses_parallel_cmd" if defined $___JOBS && ! -x $moses_par
die "Not executable: $qsubwrapper" if defined $___JOBS && ! -x $qsubwrapper;
die "Not executable: $___DECODER" if ! -x $___DECODER;
my $input_abs = ensure_full_path($___DEV_F);
die "File not found: $___DEV_F (interpreted as $input_abs)."
if ! -e $input_abs;
$___DEV_F = $input_abs;
# Option to pass to qsubwrapper and moses-parallel
my $pass_old_sge = $old_sge ? "-old-sge" : "";
@ -372,7 +369,6 @@ die "File not executable: $___DECODER (interpreted as $decoder_abs)."
if ! -x $decoder_abs;
$___DECODER = $decoder_abs;
my $ref_abs = ensure_full_path($___DEV_E);
# check if English dev set (reference translations) exist and store a list of all references
my @references;
@ -409,9 +405,6 @@ if ($___DECODER_FLAGS =~ /(^|\s)-(config|f) /
# normalize initial LAMBDAs, too
my $need_to_normalize = 1;
#store current directory and create the working directory (if needed)
my $cwd = `pawd 2>/dev/null`;
if(!$cwd){$cwd = `pwd`;}
@ -431,17 +424,16 @@ my $mert_logfile = "mert.log";
my $weights_in_file = "init.opt";
my $weights_out_file = "weights.txt";
# set start run
my $start_run = 1;
my $bestpoint = undef;
my $devbleu = undef;
my $sparse_weights_file = undef;
my $prev_feature_file = undef;
my $prev_score_file = undef;
my $prev_init_file = undef;
if ($___FILTER_PHRASE_TABLE) {
my $outdir = "filtered";
if (-e "$outdir/moses.ini") {
@ -471,7 +463,6 @@ else{
$___CONFIG_ORIG = $___CONFIG;
}
# we run moses to check validity of moses.ini and to obtain all the feature
# names
my $featlist = get_featlist_from_moses($___CONFIG);
@ -579,28 +570,19 @@ if ($continue) {
print STDERR "All needed data are available\n";
print STDERR "Loading information from last step ($step)\n";
open(IN,"run$step.$mert_logfile") or die "Can't open run$step.$mert_logfile";
while (<IN>) {
if (/Best point:\s*([\s\d\.\-e]+?)\s*=> ([\-\d\.]+)/) {
$bestpoint = $1;
$devbleu = $2;
last;
}
}
close IN;
my %dummy; # sparse features
($bestpoint,$devbleu) = &get_weights_from_mert("run$step.$mert_outfile","run$step.$mert_logfile",scalar @{$featlist->{"names"}},\%dummy);
die "Failed to parse mert.log, missed Best point there."
if !defined $bestpoint || !defined $devbleu;
print "($step) BEST at $step $bestpoint => $devbleu at ".`date`;
my @newweights = split /\s+/, $bestpoint;
# Sanity check: order of lambdas must match
sanity_check_order_of_lambdas($featlist,
"gunzip -c < run$step.best$___N_BEST_LIST_SIZE.out.gz |");
# update my cache of lambda values
$featlist->{"values"} = \@newweights;
}
else{
print STDERR "No previous data are needed\n";
@ -630,10 +612,10 @@ while(1) {
print "run $run start at ".`date`;
# In case something dies later, we might wish to have a copy
- create_config($___CONFIG, "./run$run.moses.ini", $featlist, $run, (defined$devbleu?$devbleu:"--not-estimated--"));
+ create_config($___CONFIG, "./run$run.moses.ini", $featlist, $run, (defined$devbleu?$devbleu:"--not-estimated--"),$sparse_weights_file);
- # skip if the user wanted
+ # skip running the decoder if the user wanted
if (!$skip_decoder) {
print "($run) run decoder to produce n-best lists\n";
$nbest_file = run_decoder($featlist, $run, $need_to_normalize);
@ -648,8 +630,6 @@ while(1) {
$need_to_normalize = 0;
}
# extract score statistics and features from the nbest lists
print STDERR "Scoring the nbestlist.\n";
@ -740,7 +720,7 @@ while(1) {
if ! -s $weights_out_file;
# backup copies
safesystem ("\\cp -f extract.err run$run.extract.err") or die;
safesystem ("\\cp -f extract.out run$run.extract.out") or die;
if ($___PAIRWISE_RANKED_OPTIMIZER) { safesystem ("\\cp -f pro.data run$run.pro.data") or die; }
@ -751,34 +731,10 @@ while(1) {
print "run $run end at ".`date`;
$bestpoint = undef;
$devbleu = undef;
- if ($___PAIRWISE_RANKED_OPTIMIZER) {
-   open(IN,"run$run.$mert_outfile") or die "Can't open run$run.$mert_outfile";
-   my (@WEIGHT,$sum);
-   foreach (@CURR) { push @WEIGHT, 0; }
-   while(<IN>) {
-     if (/^F(\d+) ([\-\.\de]+)/) {
-       $WEIGHT[$1] = $2;
-       $sum += abs($2);
-     }
-   }
-   $devbleu = "unknown";
-   foreach (@WEIGHT) { $_ /= $sum; }
-   $bestpoint = join(" ",@WEIGHT);
-   close IN;
- }
- else {
-   open(IN,"run$run.$mert_logfile") or die "Can't open run$run.$mert_logfile";
-   while (<IN>) {
-     if (/Best point:\s*([\s\d\.\-e]+?)\s*=> ([\-\d\.]+)/) {
-       $bestpoint = $1;
-       $devbleu = $2;
-       last;
-     }
-   }
-   close IN;
- }
+ my %sparse_weights; # sparse features
+ ($bestpoint,$devbleu) = &get_weights_from_mert("run$run.$mert_outfile","run$run.$mert_logfile",scalar @{$featlist->{"names"}},\%sparse_weights);
die "Failed to parse mert.log, missed Best point there."
if !defined $bestpoint || !defined $devbleu;
print "($run) BEST at $run: $bestpoint => $devbleu at ".`date`;
@ -788,6 +744,15 @@ while(1) {
$featlist->{"values"} = \@newweights;
if (scalar keys %sparse_weights) {
$sparse_weights_file = "run".($run+1).".sparse-weights";
open(SPARSE,">".$sparse_weights_file);
foreach my $feature (keys %sparse_weights) {
print SPARSE "$feature $sparse_weights{$feature}\n";
}
close(SPARSE);
}
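# Illustration only (not in the commit): runN.sparse-weights holds one
# "name weight" pair per line, e.g. with a hypothetical feature
#   word_pair_house_haus 0.05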
## additional stopping criterion: weights have not changed
my $shouldstop = 1;
for(my $i=0; $i<@CURR; $i++) {
@ -864,6 +829,43 @@ chdir($cwd);
} # end of local scope
sub get_weights_from_mert {
my ($outfile,$logfile,$weight_count,$sparse_weights) = @_;
my ($bestpoint,$devbleu);
if ($___PAIRWISE_RANKED_OPTIMIZER) {
open(IN,$outfile) or die "Can't open $outfile";
my (@WEIGHT,$sum);
for(my $i=0;$i<$weight_count;$i++) { push @WEIGHT, 0; }
while(<IN>) {
# regular features
if (/^F(\d+) ([\-\.\de]+)/) {
$WEIGHT[$1] = $2;
$sum += abs($2);
}
# sparse features
elsif(/^(.+_.+) ([\-\.\de]+)/) {
$$sparse_weights{$1} = $2;
}
}
$devbleu = "unknown";
foreach (@WEIGHT) { $_ /= $sum; }
$bestpoint = join(" ",@WEIGHT);
close IN;
}
else {
open(IN,$logfile) or die "Can't open $logfile";
while (<IN>) {
if (/Best point:\s*([\s\d\.\-e]+?)\s*=> ([\-\d\.]+)/) {
$bestpoint = $1;
$devbleu = $2;
last;
}
}
close IN;
}
return ($bestpoint,$devbleu);
}
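# Illustration only (not in the commit): under PRO, the mert output file
# parsed above is expected to hold lines like
#   F0 0.25
#   F1 -0.31
#   word_pair_house_haus 0.05
# (values made up). F<i> lines become dense weights, normalized by the sum
# of their absolute values; any "name weight" line whose name contains an
# underscore goes into %$sparse_weights unnormalized.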
sub run_decoder {
my ($featlist, $run, $need_to_normalize) = @_;
my $filename_template = "run%d.best$___N_BEST_LIST_SIZE.out";
@ -984,6 +986,7 @@ sub get_featlist_from_moses {
$nr++;
chomp;
my ($longname, $feature, $value) = split / /;
next if $value eq "sparse";
push @errs, "$featlistfn:$nr:Bad initial value of $feature: $value\n"
if $value !~ /^[+-]?[0-9.e]+$/;
push @errs, "$featlistfn:$nr:Unknown feature '$feature', please add it to \@ABBR_FULL_MAP\n"
@ -1015,14 +1018,20 @@ sub get_order_of_scores_from_nbestlist {
my @order = ();
my $label = undef;
+ my $sparse = 0; # we ignore sparse features here
foreach my $tok (split /\s+/, $scores) {
- if ($tok =~ /^([a-z][0-9a-z]*):/i) {
+ if ($tok =~ /.+_.+:/) {
+   $sparse = 1;
+ } elsif ($tok =~ /^([a-z][0-9a-z]*):/i) {
$label = $1;
} elsif ($tok =~ /^-?[-0-9.e]+$/) {
- # a score found, remember it
- die "Found a score but no label before it! Bad nbestlist '$fname_or_source'!"
-   if !defined $label;
- push @order, $label;
+ if (!$sparse) {
+   # a score found, remember it
+   die "Found a score but no label before it! Bad nbestlist '$fname_or_source'!"
+     if !defined $label;
+   push @order, $label;
+ }
+ $sparse = 0;
} else {
die "Not a label, not a score '$tok'. Failed to parse the scores string: '$scores' of nbestlist '$fname_or_source'";
}
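# Illustration only (not in the commit): for a hypothetical scores string
#   "d: 0.3 0.2 lm: -20.1 word_pair_house_haus: 1 tm: 0.5"
# the recorded order is (d, d, lm, tm); the sparse "name: value" pair is
# skipped because its label contains an underscore.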
@ -1037,6 +1046,7 @@ sub create_config {
my $featlist = shift; # the lambdas we should write
my $iteration = shift; # just for verbosity
my $bleu_achieved = shift; # just for verbosity
my $sparse_weights_file = shift; # only defined when optimizing sparse features
my %P; # the hash of all parameters we wish to override
@ -1076,6 +1086,10 @@ sub create_config {
push @{$P{$name}}, $val;
}
if (defined($sparse_weights_file)) {
push @{$P{"weights-file"}}, $___WORKING_DIR."/".$sparse_weights_file;
}
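# Illustration only (not in the commit): with a sparse weights file defined,
# the generated moses.ini gains a section like
#   [weights-file]
#   /path/to/working-dir/run2.sparse-weights
# (path hypothetical).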
# create new moses.ini decoder config file by cloning and overriding the original one
open(INI,$infn) or die "Can't read $infn";
delete($P{"config"}); # never output

View File

@ -36,10 +36,15 @@ using namespace std;
bool hierarchicalFlag = false;
bool onlyDirectFlag = false;
bool phraseCountFlag = true;
bool lowCountFlag = false;
bool goodTuringFlag = false;
bool kneserNeyFlag = false;
bool logProbFlag = false;
- char line[LINE_MAX_LENGTH];
- void processFiles( char*, char*, char* );
+ inline float maybeLogProb( float a ) { return logProbFlag ? log(a) : a; }
+ char line[LINE_MAX_LENGTH];
+ void processFiles( char*, char*, char*, char* );
+ void loadCountOfCounts( char* );
bool getLine( istream &fileP, vector< string > &item );
vector< string > splitLine();
@ -55,6 +60,7 @@ int main(int argc, char* argv[])
char* &fileNameDirect = argv[1];
char* &fileNameIndirect = argv[2];
char* &fileNameConsolidated = argv[3];
char* fileNameCountOfCounts = NULL;
for(int i=4; i<argc; i++) {
if (strcmp(argv[i],"--Hierarchical") == 0) {
@ -66,6 +72,25 @@ int main(int argc, char* argv[])
} else if (strcmp(argv[i],"--NoPhraseCount") == 0) {
phraseCountFlag = false;
cerr << "not including the phrase count feature\n";
} else if (strcmp(argv[i],"--GoodTuring") == 0) {
goodTuringFlag = true;
if (i+1==argc) {
cerr << "ERROR: specify count of count files for Good Turing discounting!\n";
exit(1);
}
fileNameCountOfCounts = argv[++i];
cerr << "adjusting phrase translation probabilities with Good Turing discounting\n";
} else if (strcmp(argv[i],"--KneserNey") == 0) {
kneserNeyFlag = true;
if (i+1==argc) {
cerr << "ERROR: specify count of count files for Kneser Ney discounting!\n";
exit(1);
}
fileNameCountOfCounts = argv[++i];
cerr << "adjusting phrase translation probabilities with Kneser Ney discounting\n";
} else if (strcmp(argv[i],"--LowCountFeature") == 0) {
lowCountFlag = true;
cerr << "including the low count feature\n";
} else if (strcmp(argv[i],"--LogProb") == 0) {
logProbFlag = true;
cerr << "using log-probabilities\n";
@ -75,11 +100,61 @@ int main(int argc, char* argv[])
}
}
- processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated );
+ processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated, fileNameCountOfCounts );
}
- void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated )
vector< float > countOfCounts;
vector< float > goodTuringDiscount;
float kneserNey_D1, kneserNey_D2, kneserNey_D3, totalCount = -1;
void loadCountOfCounts( char* fileNameCountOfCounts )
{
Moses::InputFileStream fileCountOfCounts(fileNameCountOfCounts);
if (fileCountOfCounts.fail()) {
cerr << "ERROR: could not open count of counts file " << fileNameCountOfCounts << endl;
exit(1);
}
istream &fileP = fileCountOfCounts;
countOfCounts.push_back(0.0);
while(1) {
if (fileP.eof()) break;
SAFE_GETLINE((fileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
if (fileP.eof()) break;
if (totalCount < 0)
totalCount = atof(line); // total number of distinct phrase pairs
else
countOfCounts.push_back( atof(line) );
}
fileCountOfCounts.Close();
// compute Good Turing discounts
if (goodTuringFlag) {
goodTuringDiscount.push_back(0.01); // floor value
for( size_t i=1; i<countOfCounts.size()-1; i++ ) {
goodTuringDiscount.push_back(((float)i+1)/(float)i*((countOfCounts[i+1]+0.1) / ((float)countOfCounts[i]+0.1)));
if (goodTuringDiscount[i]>1)
goodTuringDiscount[i] = 1;
if (goodTuringDiscount[i]<goodTuringDiscount[i-1])
goodTuringDiscount[i] = goodTuringDiscount[i-1];
}
}
// compute Kneser Ney coefficients [Chen&Goodman, 1998]
float Y = countOfCounts[1] / (countOfCounts[1] + 2*countOfCounts[2]);
kneserNey_D1 = 1 - 2*Y * countOfCounts[2] / countOfCounts[1];
kneserNey_D2 = 2 - 3*Y * countOfCounts[3] / countOfCounts[2];
kneserNey_D3 = 3 - 4*Y * countOfCounts[4] / countOfCounts[3];
// sanity constraints
if (kneserNey_D1 > 0.9) kneserNey_D1 = 0.9;
if (kneserNey_D2 > 1.9) kneserNey_D2 = 1.9;
if (kneserNey_D3 > 2.9) kneserNey_D3 = 2.9;
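// Illustration only (not in the commit): with made-up counts n1=100, n2=40,
// n3=20, n4=10 we get Y = 100/(100+2*40) = 0.56, so D1 = 1-2*0.56*40/100 = 0.56,
// D2 = 2-3*0.56*20/40 = 1.17, D3 = 3-4*0.56*10/20 = 1.89, all within the
// sanity bounds above; the Good Turing discount for count 1 would be
// 2*(40+0.1)/(100+0.1) = 0.80.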
}
+ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated, char* fileNameCountOfCounts )
{
if (goodTuringFlag || kneserNeyFlag)
loadCountOfCounts( fileNameCountOfCounts );
// open input files
Moses::InputFileStream fileDirect(fileNameDirect);
Moses::InputFileStream fileIndirect(fileNameIndirect);
@ -134,29 +209,67 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
// output hierarchical phrase pair (with separated labels)
fileConsolidated << itemDirect[0] << " ||| " << itemDirect[1];
- // probs
- fileConsolidated << " ||| ";
- if (!onlyDirectFlag) {
-   fileConsolidated << itemIndirect[2]; // prob indirect
+ // SCORES ...
+ fileConsolidated << " |||";
vector<string> directCounts = tokenize(itemDirect[4].c_str());
vector<string> indirectCounts = tokenize(itemIndirect[4].c_str());
float countF = atof(directCounts[0].c_str());
float countE = atof(indirectCounts[0].c_str());
float countEF = atof(indirectCounts[1].c_str());
float n1_F, n1_E;
if (kneserNeyFlag) {
n1_F = atof(directCounts[2].c_str());
n1_E = atof(indirectCounts[2].c_str());
}
fileConsolidated << " " << itemDirect[2]; // prob direct
// Good Turing discounting
float adjustedCountEF = countEF;
if (goodTuringFlag && countEF+0.99999 < goodTuringDiscount.size()-1)
adjustedCountEF *= goodTuringDiscount[(int)(countEF+0.99998)];
float adjustedCountEF_indirect = adjustedCountEF;
// Kneser Ney discounting [Foster et al, 2006]
if (kneserNeyFlag) {
float D = kneserNey_D3;
if (countEF < 2) D = kneserNey_D1;
if (countEF < 3) D = kneserNey_D2;
if (D > countEF) D = countEF - 0.01; // sanity constraint
float p_b_E = n1_E / totalCount; // target phrase prob based on distinct
float alpha_F = D * n1_F / countF; // available mass
adjustedCountEF = countEF - D + countF * alpha_F * p_b_E;
// for indirect
float p_b_F = n1_F / totalCount; // target phrase prob based on distinct
float alpha_E = D * n1_E / countE; // available mass
adjustedCountEF_indirect = countEF - D + countE * alpha_E * p_b_F;
}
// prob indirect
if (!onlyDirectFlag) {
fileConsolidated << " " << maybeLogProb(adjustedCountEF_indirect/countE);
fileConsolidated << " " << itemIndirect[2];
}
// prob direct
fileConsolidated << " " << maybeLogProb(adjustedCountEF/countF);
fileConsolidated << " " << itemDirect[2];
// phrase count feature
if (phraseCountFlag) {
fileConsolidated << " " << (logProbFlag ? 1 : 2.718); // phrase count feature
fileConsolidated << " " << maybeLogProb(2.718);
}
// low count feature
if (lowCountFlag) {
fileConsolidated << " " << maybeLogProb(exp(-1.0/countEF));
}
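// Illustration only (not in the commit): the low count feature decays
// quickly with evidence, e.g. exp(-1/1) = 0.37 for a pair seen once but
// exp(-1/10) = 0.90 for one seen ten times.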
// alignment
fileConsolidated << " ||| " << itemDirect[3];
// counts, for debugging
- vector<string> directCounts = tokenize(itemDirect[4].c_str());
- vector<string> indirectCounts = tokenize(itemIndirect[4].c_str());
- fileConsolidated << "||| " << indirectCounts[0] << " " << directCounts[0];
// output rule count if present in either file
if (directCounts.size() > 1) {
fileConsolidated << " " << directCounts[1];
} else if (indirectCounts.size() > 1) {
fileConsolidated << " " << indirectCounts[1];
}
fileConsolidated << "||| " << countE << " " << countF; // << " " << countEF;
fileConsolidated << endl;
}
@ -165,6 +278,7 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
fileConsolidated.close();
}
bool getLine( istream &fileP, vector< string > &item )
{
if (fileP.eof())

View File

@ -45,7 +45,7 @@
#include "tables-core.h"
#include "XmlTree.h"
- #define LINE_MAX_LENGTH 60000
+ #define LINE_MAX_LENGTH 500000
using namespace std;

View File

@ -24,7 +24,7 @@
using namespace std;
- #define LINE_MAX_LENGTH 60000
+ #define LINE_MAX_LENGTH 500000
// HPhraseVertex represents a point in the alignment matrix
typedef pair <int, int> HPhraseVertex;

View File

@ -29,6 +29,10 @@ vector<string> tokenize( const char* input )
return token;
}
bool isNonTerminal( const WORD &symbol ) {
return symbol.substr(0, 1) == "[" && symbol.substr(symbol.size()-1, 1) == "]";
}
WORD_ID Vocabulary::storeIfNew( const WORD& word )
{
map<WORD, WORD_ID>::iterator i = lookup.find( word );

View File

@ -34,9 +34,10 @@ my($_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_DIR, $_TEMP_DIR, $_
$_PHRASE_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES,
$_MEMSCORE, $_FINAL_ALIGNMENT_MODEL,
$_CONTINUE,$_MAX_LEXICAL_REORDERING,$_DO_STEPS,
$_ADDITIONAL_INI,
$_DICTIONARY, $_EPPEX);
- my $debug = 0; # debug this script, do not delete any files in debug mode
+ my $debug = 1; # debug this script, do not delete any files in debug mode
# the following line is set installation time by 'make release'. BEWARE!
my $BINDIR="/home/pkoehn/statmt/bin";
@ -109,7 +110,7 @@ $_HELP = 1
'memscore:s' => \$_MEMSCORE,
'force-factored-filenames' => \$_FORCE_FACTORED_FILENAMES,
'dictionary=s' => \$_DICTIONARY,
'eppex:s' => \$_EPPEX,
'additional-ini=s' => \$_ADDITIONAL_INI
);
if ($_HELP) {
@ -1372,11 +1373,28 @@ sub score_phrase {
sub score_phrase_phrase_extract {
my ($ttable_file,$lexical_file,$extract_file) = @_;
- # remove consolidation options
my $ONLY_DIRECT = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /OnlyDirect/);
my $PHRASE_COUNT = (!defined($_SCORE_OPTIONS) || $_SCORE_OPTIONS !~ /NoPhraseCount/);
- my $CORE_SCORE_OPTIONS = defined($_SCORE_OPTIONS) ? $_SCORE_OPTIONS : "";
- $CORE_SCORE_OPTIONS =~ s/\-+OnlyDirect//i;
- $CORE_SCORE_OPTIONS =~ s/\-+NoPhraseCount//i;
my $LOW_COUNT = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /LowCountFeature/);
my $UNALIGNED_COUNT = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /UnalignedPenalty/);
my ($UNALIGNED_FW_COUNT,$UNALIGNED_FW_F,$UNALIGNED_FW_E);
if (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /UnalignedFunctionWordPenalty +(\S+) +(\S+)/) {
$UNALIGNED_FW_COUNT = 1;
$UNALIGNED_FW_F = $1;
$UNALIGNED_FW_E = $2;
}
my $GOOD_TURING = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /GoodTuring/);
my $KNESER_NEY = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /KneserNey/);
my $LOG_PROB = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /LogProb/);
my $NEG_LOG_PROB = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NegLogProb/);
my $NO_LEX = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoLex/);
my $MIN_COUNT_HIERARCHICAL = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /MinCountHierarchical ([\d\.]+)/) ? $1 : undef;
my $CORE_SCORE_OPTIONS = "";
$CORE_SCORE_OPTIONS .= " --LogProb" if $LOG_PROB;
$CORE_SCORE_OPTIONS .= " --NegLogProb" if $NEG_LOG_PROB;
$CORE_SCORE_OPTIONS .= " --NoLex" if $NO_LEX;
my $substep = 1;
for my $direction ("f2e","e2f") {
next if $___CONTINUE && -e "$ttable_file.half.$direction";
@ -1405,6 +1423,11 @@ sub score_phrase_phrase_extract {
my $cmd = "$PHRASE_SCORE $extract $lexical_file.$direction $ttable_file.half.$direction $inverse";
$cmd .= " --Hierarchical" if $_HIERARCHICAL;
$cmd .= " --WordAlignment" if $_PHRASE_WORD_ALIGNMENT;
$cmd .= " --KneserNey $ttable_file.coc" if $KNESER_NEY;
$cmd .= " --GoodTuring $ttable_file.coc" if $GOOD_TURING && $inverse eq "";
$cmd .= " --UnalignedPenalty" if $UNALIGNED_COUNT;
$cmd .= " --UnalignedFunctionWordPenalty ".($inverse ? $UNALIGNED_FW_F : $UNALIGNED_FW_E) if $UNALIGNED_FW_COUNT;
$cmd .= " --MinCountHierarchical $MIN_COUNT_HIERARCHICAL" if $MIN_COUNT_HIERARCHICAL;
$cmd .= " $CORE_SCORE_OPTIONS" if defined($_SCORE_OPTIONS);
print $cmd."\n";
safesystem($cmd) or die "ERROR: Scoring of phrases failed";
@ -1423,8 +1446,13 @@ sub score_phrase_phrase_extract {
return if $___CONTINUE && -e "$ttable_file.gz";
my $cmd = "$PHRASE_CONSOLIDATE $ttable_file.half.f2e $ttable_file.half.e2f.sorted $ttable_file";
$cmd .= " --Hierarchical" if $_HIERARCHICAL;
$cmd .= " --LogProb" if $LOG_PROB;
$cmd .= " --NegLogProb" if $NEG_LOG_PROB;
$cmd .= " --OnlyDirect" if $ONLY_DIRECT;
$cmd .= " --NoPhraseCount" unless $PHRASE_COUNT;
$cmd .= " --LowCountFeature" if $LOW_COUNT;
$cmd .= " --GoodTuring $ttable_file.coc" if $GOOD_TURING;
$cmd .= " --KneserNey $ttable_file.coc" if $KNESER_NEY;
safesystem($cmd) or die "ERROR: Consolidating the two phrase table halves failed";
if (! $debug) { safesystem("rm -f $ttable_file.half.*") or die("ERROR"); }
if (! $___DONT_ZIP) {
@ -1681,6 +1709,13 @@ sub create_ini {
[ttable-file]\n";
my $num_of_ttables = 0;
my @SPECIFIED_TABLE = @_PHRASE_TABLE;
my $basic_weight_count = 4; # both directions, lex and phrase
$basic_weight_count-=2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoLex/;
$basic_weight_count+=2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /UnalignedPenalty/; # word ins/del
$basic_weight_count+=2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /UnalignedFunctionWordPenalty/;
$basic_weight_count /= 2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /OnlyDirect/;
$basic_weight_count++ unless defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoPhraseCount/; # phrase count feature
$basic_weight_count++ if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /LowCountFeature/; # low count feature
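# Illustration only (not in the commit): with default score options this
# yields 4 translation model weights plus the phrase count feature = 5;
# adding UnalignedPenalty gives 4+2+1 = 7, while OnlyDirect with NoLex
# leaves (4-2)/2 = 1 plus the phrase count feature = 2.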
foreach my $f (split(/\+/,$___TRANSLATION_FACTORS)) {
$num_of_ttables++;
my $ff = $f;
@ -1688,10 +1723,6 @@ sub create_ini {
my $file = "$___MODEL_DIR/".($_HIERARCHICAL?"rule-table":"phrase-table").($___NOT_FACTORED ? "" : ".$f").".gz";
$file = shift @SPECIFIED_TABLE if scalar(@SPECIFIED_TABLE);
my $phrase_table_impl = ($_HIERARCHICAL ? 6 : 0);
- my $basic_weight_count = 4; # both directions, lex and phrase
- $basic_weight_count /= 2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /OnlyDirect/;
- $basic_weight_count /= 2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoLex/;
- $basic_weight_count++ unless defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoPhraseCount/; # phrase count feature
print INI "$phrase_table_impl $ff $basic_weight_count $file\n";
}
if ($_GLUE_GRAMMAR) {
@ -1783,10 +1814,6 @@ sub create_ini {
print INI "\n\n# translation model weights\n[weight-t]\n";
foreach my $f (split(/\+/,$___TRANSLATION_FACTORS)) {
- my $basic_weight_count = 4; # both directions, lex and phrase
- $basic_weight_count /= 2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /OnlyDirect/;
- $basic_weight_count /= 2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoLex/;
- $basic_weight_count++ unless defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoPhraseCount/; # phrase count feature
for(1..$basic_weight_count) {
printf INI "%.2f\n", 1/$basic_weight_count;
}
@ -1826,6 +1853,11 @@ sub create_ini {
print INI "\n# delimiter between factors in input\n[factor-delimiter]\n$___FACTOR_DELIMITER\n\n"
}
if ($_ADDITIONAL_INI) {
print INI "\n# additional settings\n\n";
foreach (split(/<br>/i,$_ADDITIONAL_INI)) { print INI $_."\n"; }
}
close(INI);
}