mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-26 13:23:25 +03:00
support for sparse feature functions (mert support only when using PRO)
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4184 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
parent
9fee4a97f2
commit
41a1849437
@ -13,7 +13,8 @@
|
||||
|
||||
|
||||
Data::Data(Scorer& ptr):
|
||||
theScorer(&ptr)
|
||||
theScorer(&ptr),
|
||||
_sparse_flag(false)
|
||||
{
|
||||
score_type = (*theScorer).getName();
|
||||
TRACE_ERR("Data::score_type " << score_type << std::endl);
|
||||
@ -40,7 +41,6 @@ void Data::loadnbest(const std::string &file)
|
||||
std::string theSentence;
|
||||
std::string::size_type loc;
|
||||
|
||||
|
||||
while (getline(inp,stringBuf,'\n')) {
|
||||
if (stringBuf.empty()) continue;
|
||||
|
||||
@ -56,16 +56,15 @@ void Data::loadnbest(const std::string &file)
|
||||
featentry.reset();
|
||||
scoreentry.clear();
|
||||
|
||||
|
||||
theScorer->prepareStats(sentence_index, theSentence, scoreentry);
|
||||
|
||||
scoredata->add(scoreentry, sentence_index);
|
||||
|
||||
getNextPound(stringBuf, substring, "|||"); //third field
|
||||
|
||||
// examine first line for name of features
|
||||
if (!existsFeatureNames()) {
|
||||
std::string stringsupport=substring;
|
||||
// adding feature names
|
||||
std::string features="";
|
||||
std::string tmpname="";
|
||||
|
||||
@ -75,10 +74,17 @@ void Data::loadnbest(const std::string &file)
|
||||
getNextPound(stringsupport, subsubstring);
|
||||
|
||||
// string ending with ":" are skipped, because they are the names of the features
|
||||
if ((loc = subsubstring.find(":")) != subsubstring.length()-1) {
|
||||
if ((loc = subsubstring.find_last_of(":")) != subsubstring.length()-1) {
|
||||
features+=tmpname+"_"+stringify(tmpidx)+" ";
|
||||
tmpidx++;
|
||||
} else {
|
||||
}
|
||||
// ignore sparse feature name
|
||||
else if (subsubstring.find("_") != string::npos) {
|
||||
// also ignore its value
|
||||
getNextPound(stringsupport, subsubstring);
|
||||
}
|
||||
// update current feature name
|
||||
else {
|
||||
tmpidx=0;
|
||||
tmpname=subsubstring.substr(0,subsubstring.size() - 1);
|
||||
}
|
||||
@ -87,22 +93,36 @@ void Data::loadnbest(const std::string &file)
|
||||
featdata->setFeatureMap(features);
|
||||
}
|
||||
|
||||
// adding features
|
||||
// adding features
|
||||
while (!substring.empty()) {
|
||||
// TRACE_ERR("Decompounding: " << substring << std::endl);
|
||||
getNextPound(substring, subsubstring);
|
||||
|
||||
// string ending with ":" are skipped, because they are the names of the features
|
||||
if ((loc = subsubstring.find(":")) != subsubstring.length()-1) {
|
||||
// no ':' -> feature value that needs to be stored
|
||||
if ((loc = subsubstring.find_last_of(":")) != subsubstring.length()-1) {
|
||||
featentry.add(ATOFST(subsubstring.c_str()));
|
||||
}
|
||||
// sparse feature name? store as well
|
||||
else if (subsubstring.find("_") != string::npos) {
|
||||
std::string name = subsubstring;
|
||||
getNextPound(substring, subsubstring);
|
||||
featentry.addSparse( name, atof(subsubstring.c_str()) );
|
||||
_sparse_flag = true;
|
||||
}
|
||||
}
|
||||
//cerr << "number of sparse features: " << featentry.getSparse().size() << endl;
|
||||
featdata->add(featentry,sentence_index);
|
||||
}
|
||||
|
||||
inp.close();
|
||||
}
|
||||
|
||||
// TODO
|
||||
/**
 * Placeholder for folding sparse features into the regular feature set.
 * Not implemented: reports on stderr that sparse features require the
 * pairwise ranked optimizer and terminates the process with exit status 1.
 */
void Data::mergeSparseFeatures()
{
  static const char* const notSupported =
    "ERROR: sparse features can only be trained with pairwise ranked optimizer (PRO), not traditional MERT\n";
  std::cerr << notSupported;
  exit(1);
}
|
||||
|
||||
// really not the right place...
|
||||
float sentenceLevelBleuPlusOne( ScoreStats &stats ) {
|
||||
float logbleu = 0.0;
|
||||
@ -144,7 +164,7 @@ public:
|
||||
};
|
||||
|
||||
|
||||
void Data::sample_ranked_pairs( const std::string &rankedpairfile ) {
|
||||
void Data::sampleRankedPairs( const std::string &rankedpairfile ) {
|
||||
cout << "Sampling ranked pairs." << endl;
|
||||
|
||||
ofstream *outFile = new ofstream();
|
||||
@ -187,20 +207,15 @@ void Data::sample_ranked_pairs( const std::string &rankedpairfile ) {
|
||||
for(unsigned int i=0; i<samples.size() && collected < n_samples; i++) {
|
||||
if (samples[i]->getDiff() >= min_diff) {
|
||||
collected++;
|
||||
FeatureStats &f1 = featdata->get(S,samples[i]->getTranslation1());
|
||||
FeatureStats &f2 = featdata->get(S,samples[i]->getTranslation2());
|
||||
|
||||
*out << "1";
|
||||
for(unsigned int j=0; j<f1.size(); j++)
|
||||
if (abs(f1.get(j)-f2.get(j)) > 0.00001)
|
||||
*out << " F" << j << " " << (f1.get(j)-f2.get(j));
|
||||
*out << endl;
|
||||
|
||||
outputSample( *out, featdata->get(S,samples[i]->getTranslation1()),
|
||||
featdata->get(S,samples[i]->getTranslation2()) );
|
||||
*out << endl;
|
||||
*out << "0";
|
||||
for(unsigned int j=0; j<f1.size(); j++)
|
||||
if (abs(f1.get(j)-f2.get(j)) > 0.00001)
|
||||
*out << " F" << j << " " << (f2.get(j)-f1.get(j));
|
||||
*out << endl;
|
||||
outputSample( *out, featdata->get(S,samples[i]->getTranslation2()),
|
||||
featdata->get(S,samples[i]->getTranslation1()) );
|
||||
*out << endl;
|
||||
}
|
||||
delete samples[i];
|
||||
}
|
||||
@ -209,3 +224,31 @@ void Data::sample_ranked_pairs( const std::string &rankedpairfile ) {
|
||||
out->flush();
|
||||
outFile->close();
|
||||
}
|
||||
|
||||
/**
 * Write the feature differences (f1 - f2) of two translations to `out` as a
 * sequence of " name value" pairs. No line terminator is written.
 *
 * Regular features are labelled "F<index>" and are written only when the
 * magnitude of the difference exceeds 0.00001.
 * Sparse features are written only when hasSparseFeatures() is true, under
 * their stored name. A sparse feature present on just one side is always
 * written (its value for f1-only, its negated value for f2-only); one present
 * on both sides is written only when the difference exceeds 0.00001.
 *
 * @param out stream that receives the pairs
 * @param f1  features of the first translation (minuend)
 * @param f2  features of the second translation (subtrahend)
 */
void Data::outputSample( ostream &out, const FeatureStats &f1, const FeatureStats &f2 )
{
  // difference in score in regular features
  for(unsigned int j=0; j<f1.size(); j++) {
    const FeatureStatsType diff = f1.get(j)-f2.get(j);
    // explicit two-sided test instead of abs(): an unqualified abs() may bind
    // to the integer overload and truncate small differences to 0
    if (diff > 0.00001 || diff < -0.00001)
      out << " F" << j << " " << diff;
  }

  if (!hasSparseFeatures())
    return;

  // sparse features
  const sparse_featstats_t &s1 = f1.getSparse();
  const sparse_featstats_t &s2 = f2.getSparse();

  // features known to f1: emit the raw value if f2 lacks the feature,
  // otherwise the difference when it is large enough
  for( sparse_featstats_t::const_iterator i=s1.begin(); i!=s1.end(); ++i) {
    const sparse_featstats_t::const_iterator other = s2.find(i->first); // single lookup
    if (other == s2.end())
      out << " " << i->first << " " << i->second;
    else {
      const float diff = i->second - other->second;
      if (diff > 0.00001 || diff < -0.00001)
        out << " " << i->first << " " << diff;
    }
  }

  // features known only to f2 count as 0 on the f1 side
  for( sparse_featstats_t::const_iterator i=s2.begin(); i!=s2.end(); ++i) {
    if (s1.find(i->first) == s1.end())
      out << " " << i->first << " " << (- i->second);
  }
}
|
||||
|
10
mert/Data.h
10
mert/Data.h
@ -31,10 +31,10 @@ private:
|
||||
Scorer* theScorer;
|
||||
std::string score_type;
|
||||
size_t number_of_scores; //number of scores
|
||||
bool _sparse_flag;
|
||||
|
||||
public:
|
||||
Data(Scorer& sc);
|
||||
|
||||
~Data() {};
|
||||
|
||||
inline void clear() {
|
||||
@ -62,11 +62,16 @@ public:
|
||||
featdata->Features(f);
|
||||
}
|
||||
|
||||
inline bool hasSparseFeatures() const { return _sparse_flag; }
|
||||
void mergeSparseFeatures();
|
||||
|
||||
void loadnbest(const std::string &file);
|
||||
|
||||
void load(const std::string &featfile,const std::string &scorefile) {
|
||||
featdata->load(featfile);
|
||||
scoredata->load(scorefile);
|
||||
if (featdata->hasSparseFeatures())
|
||||
_sparse_flag = true;
|
||||
}
|
||||
|
||||
void save(const std::string &featfile,const std::string &scorefile, bool bin=false) {
|
||||
@ -90,7 +95,8 @@ public:
|
||||
return featdata->getFeatureIndex(name);
|
||||
};
|
||||
|
||||
void sample_ranked_pairs( const std::string &rankedPairFile );
|
||||
void sampleRankedPairs( const std::string &rankedPairFile );
|
||||
void outputSample( std::ostream &out, const FeatureStats &f1, const FeatureStats &f2 );
|
||||
};
|
||||
|
||||
|
||||
|
@ -11,7 +11,7 @@
|
||||
#include "Util.h"
|
||||
|
||||
|
||||
FeatureArray::FeatureArray(): idx("")
|
||||
FeatureArray::FeatureArray(): idx(""), _sparse_flag(false)
|
||||
{};
|
||||
|
||||
void FeatureArray::savetxt(std::ofstream& outFile)
|
||||
@ -69,6 +69,8 @@ void FeatureArray::loadtxt(ifstream& inFile, size_t n)
|
||||
for (size_t i=0 ; i < n; i++) {
|
||||
entry.loadtxt(inFile);
|
||||
add(entry);
|
||||
if (entry.getSparse().size()>0)
|
||||
_sparse_flag = true;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -30,6 +30,7 @@ protected:
|
||||
featarray_t array_;
|
||||
size_t number_of_features;
|
||||
std::string features;
|
||||
bool _sparse_flag;
|
||||
|
||||
private:
|
||||
std::string idx; // idx to identify the utterance, it can differ from the index inside the vector
|
||||
@ -43,6 +44,10 @@ public:
|
||||
array_.clear();
|
||||
}
|
||||
|
||||
inline bool hasSparseFeatures() const {
|
||||
return _sparse_flag;
|
||||
}
|
||||
|
||||
inline std::string getIndex() {
|
||||
return idx;
|
||||
}
|
||||
|
@ -51,9 +51,12 @@ void FeatureData::load(ifstream& inFile)
|
||||
if (entry.size() == 0)
|
||||
break;
|
||||
|
||||
if (size() == 0) {
|
||||
if (size() == 0)
|
||||
setFeatureMap(entry.Features());
|
||||
}
|
||||
|
||||
if (entry.hasSparseFeatures())
|
||||
_sparse_flag = true;
|
||||
|
||||
add(entry);
|
||||
}
|
||||
}
|
||||
|
@ -26,10 +26,10 @@ protected:
|
||||
idx2name idx2arrayname_; //map from index to name of array
|
||||
name2idx arrayname2idx_; //map from name to index of array
|
||||
|
||||
|
||||
private:
|
||||
size_t number_of_features;
|
||||
std::string features;
|
||||
bool _sparse_flag;
|
||||
|
||||
map<std::string, size_t> featname2idx_; //map from name to index of features
|
||||
map<size_t, std::string> idx2featname_; //map from index to name of features
|
||||
@ -43,6 +43,9 @@ public:
|
||||
array_.clear();
|
||||
}
|
||||
|
||||
inline bool hasSparseFeatures() const {
|
||||
return _sparse_flag;
|
||||
}
|
||||
inline FeatureArray get(const std::string& idx) {
|
||||
return array_.at(getIndex(idx));
|
||||
}
|
||||
|
@ -30,6 +30,7 @@ FeatureStats::FeatureStats(const FeatureStats &stats)
|
||||
entries_ = stats.size();
|
||||
array_ = new FeatureStatsType[available_];
|
||||
memcpy(array_,stats.getArray(),featbytes_);
|
||||
map_ = stats.getSparse();
|
||||
};
|
||||
|
||||
FeatureStats::FeatureStats(const size_t size)
|
||||
@ -61,6 +62,11 @@ void FeatureStats::add(FeatureStatsType v)
|
||||
array_[entries_++]=v;
|
||||
}
|
||||
|
||||
/**
 * Store the value of a sparse (named) feature.
 * If a value is already stored under `name` it is overwritten.
 *
 * @param name key of the sparse feature
 * @param v    value to store
 */
void FeatureStats::addSparse(string name, FeatureStatsType v)
{
  sparse_featstats_t::iterator slot = map_.find(name);
  if (slot == map_.end()) {
    // first occurrence of this feature
    map_.insert(sparse_featstats_t::value_type(name, v));
  } else {
    // replace the previously stored value
    slot->second = v;
  }
}
|
||||
|
||||
void FeatureStats::set(std::string &theString)
|
||||
{
|
||||
std::string substring, stringBuf;
|
||||
@ -68,7 +74,15 @@ void FeatureStats::set(std::string &theString)
|
||||
|
||||
while (!theString.empty()) {
|
||||
getNextPound(theString, substring);
|
||||
add(ATOFST(substring.c_str()));
|
||||
// regular feature
|
||||
if (substring.find(":") == string::npos) {
|
||||
add(ATOFST(substring.c_str()));
|
||||
}
|
||||
// sparse feature
|
||||
else {
|
||||
size_t separator = substring.find_last_of(":");
|
||||
addSparse(substring.substr(0,separator), atof(substring.substr(separator+1).c_str()) );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -123,6 +137,7 @@ FeatureStats& FeatureStats::operator=(const FeatureStats &stats)
|
||||
entries_ = stats.size();
|
||||
array_ = new FeatureStatsType[available_];
|
||||
memcpy(array_,stats.getArray(),featbytes_);
|
||||
map_ = stats.getSparse();
|
||||
|
||||
return *this;
|
||||
}
|
||||
@ -131,7 +146,14 @@ FeatureStats& FeatureStats::operator=(const FeatureStats &stats)
|
||||
/**write the whole object to a stream*/
|
||||
ostream& operator<<(ostream& o, const FeatureStats& e)
|
||||
{
|
||||
for (size_t i=0; i< e.size(); i++)
|
||||
// print regular features
|
||||
for (size_t i=0; i< e.size(); i++) {
|
||||
o << e.get(i) << " ";
|
||||
}
|
||||
// sparse features
|
||||
const sparse_featstats_t &sparse = e.getSparse();
|
||||
for(sparse_featstats_t::const_iterator i = sparse.begin(); i != sparse.end(); i++) {
|
||||
o << i->first << i->second << " ";
|
||||
}
|
||||
return o;
|
||||
}
|
||||
|
@ -26,6 +26,7 @@ class FeatureStats
|
||||
{
|
||||
private:
|
||||
featstats_t array_;
|
||||
sparse_featstats_t map_;
|
||||
size_t entries_;
|
||||
size_t available_;
|
||||
|
||||
@ -43,9 +44,11 @@ public:
|
||||
}
|
||||
void expand();
|
||||
void add(FeatureStatsType v);
|
||||
void addSparse(string name, FeatureStatsType v);
|
||||
|
||||
inline void clear() {
|
||||
memset((void*) array_,0,featbytes_);
|
||||
map_.clear();
|
||||
}
|
||||
|
||||
inline FeatureStatsType get(size_t i) {
|
||||
@ -57,6 +60,9 @@ public:
|
||||
inline featstats_t getArray() const {
|
||||
return array_;
|
||||
}
|
||||
inline sparse_featstats_t getSparse() const {
|
||||
return map_;
|
||||
}
|
||||
|
||||
void set(std::string &theString);
|
||||
|
||||
|
@ -26,6 +26,7 @@ typedef vector<statscore_t> statscores_t;
|
||||
|
||||
typedef float FeatureStatsType;
|
||||
typedef FeatureStatsType* featstats_t;
|
||||
typedef map<string,FeatureStatsType> sparse_featstats_t;
|
||||
//typedef vector<FeatureStatsType> featstats_t;
|
||||
typedef vector<FeatureStats> featarray_t;
|
||||
typedef vector<FeatureArray> featdata_t;
|
||||
|
@ -278,6 +278,12 @@ int main (int argc, char **argv)
|
||||
|
||||
PrintUserTime("Data loaded");
|
||||
|
||||
// starting point score over latest n-best, accumulative n-best
|
||||
//vector<unsigned> bests;
|
||||
//compute bests with sparse features needs to be implemented
|
||||
//currently sparse weights are not even loaded
|
||||
//statscore_t score = TheScorer->score(bests);
|
||||
|
||||
if (tooptimizestr.length() > 0) {
|
||||
cerr << "Weights to optimize: " << tooptimizestr << endl;
|
||||
|
||||
@ -305,16 +311,20 @@ int main (int argc, char **argv)
|
||||
}
|
||||
|
||||
if (pairedrankfile.compare("") != 0) {
|
||||
D.sample_ranked_pairs(pairedrankfile);
|
||||
D.sampleRankedPairs(pairedrankfile);
|
||||
PrintUserTime("Stopping...");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
// treat sparse features just like regular features
|
||||
if (D.hasSparseFeatures()) {
|
||||
D.mergeSparseFeatures();
|
||||
}
|
||||
|
||||
Optimizer *O=OptimizerFactory::BuildOptimizer(pdim,tooptimize,start_list[0],type,nrandom);
|
||||
O->SetScorer(TheScorer);
|
||||
O->SetFData(D.getFeatureData());
|
||||
|
||||
|
||||
#ifdef WITH_THREADS
|
||||
cerr << "Creating a pool of " << threads << " threads" << endl;
|
||||
Moses::ThreadPool pool(threads);
|
||||
|
@ -11,6 +11,7 @@
|
||||
# Excerpts from revision history
|
||||
|
||||
# Sept 2011 multi-threaded mert (Barry Haddow)
|
||||
# 3 Aug 2011 Added random directions, historic best, pairwise ranked (PK)
|
||||
# Jul 2011 simplifications (Ondrej Bojar)
|
||||
# -- rely on moses' -show-weights instead of parsing moses.ini
|
||||
# ... so moses is also run once *before* mert starts, checking
|
||||
@ -287,8 +288,6 @@ $qsubwrapper="$SCRIPTS_ROOTDIR/generic/qsub-wrapper.pl" if !defined $qsubwrapper
|
||||
$moses_parallel_cmd = "$SCRIPTS_ROOTDIR/generic/moses-parallel.pl"
|
||||
if !defined $moses_parallel_cmd;
|
||||
|
||||
|
||||
|
||||
if (!defined $mertdir) {
|
||||
$mertdir = "$SCRIPTS_ROOTDIR/../mert";
|
||||
print STDERR "Assuming --mertdir=$mertdir\n";
|
||||
@ -357,13 +356,11 @@ die "Not executable: $moses_parallel_cmd" if defined $___JOBS && ! -x $moses_par
|
||||
die "Not executable: $qsubwrapper" if defined $___JOBS && ! -x $qsubwrapper;
|
||||
die "Not executable: $___DECODER" if ! -x $___DECODER;
|
||||
|
||||
|
||||
my $input_abs = ensure_full_path($___DEV_F);
|
||||
die "File not found: $___DEV_F (interpreted as $input_abs)."
|
||||
if ! -e $input_abs;
|
||||
$___DEV_F = $input_abs;
|
||||
|
||||
|
||||
# Option to pass to qsubwrapper and moses-parallel
|
||||
my $pass_old_sge = $old_sge ? "-old-sge" : "";
|
||||
|
||||
@ -372,7 +369,6 @@ die "File not executable: $___DECODER (interpreted as $decoder_abs)."
|
||||
if ! -x $decoder_abs;
|
||||
$___DECODER = $decoder_abs;
|
||||
|
||||
|
||||
my $ref_abs = ensure_full_path($___DEV_E);
|
||||
# check if English dev set (reference translations) exist and store a list of all references
|
||||
my @references;
|
||||
@ -409,9 +405,6 @@ if ($___DECODER_FLAGS =~ /(^|\s)-(config|f) /
|
||||
# normalize initial LAMBDAs, too
|
||||
my $need_to_normalize = 1;
|
||||
|
||||
|
||||
|
||||
|
||||
#store current directory and create the working directory (if needed)
|
||||
my $cwd = `pawd 2>/dev/null`;
|
||||
if(!$cwd){$cwd = `pwd`;}
|
||||
@ -431,17 +424,16 @@ my $mert_logfile = "mert.log";
|
||||
my $weights_in_file = "init.opt";
|
||||
my $weights_out_file = "weights.txt";
|
||||
|
||||
|
||||
# set start run
|
||||
my $start_run = 1;
|
||||
my $bestpoint = undef;
|
||||
my $devbleu = undef;
|
||||
my $sparse_weights_file = undef;
|
||||
|
||||
my $prev_feature_file = undef;
|
||||
my $prev_score_file = undef;
|
||||
my $prev_init_file = undef;
|
||||
|
||||
|
||||
if ($___FILTER_PHRASE_TABLE) {
|
||||
my $outdir = "filtered";
|
||||
if (-e "$outdir/moses.ini") {
|
||||
@ -471,7 +463,6 @@ else{
|
||||
$___CONFIG_ORIG = $___CONFIG;
|
||||
}
|
||||
|
||||
|
||||
# we run moses to check validity of moses.ini and to obtain all the feature
|
||||
# names
|
||||
my $featlist = get_featlist_from_moses($___CONFIG);
|
||||
@ -579,28 +570,19 @@ if ($continue) {
|
||||
print STDERR "All needed data are available\n";
|
||||
|
||||
print STDERR "Loading information from last step ($step)\n";
|
||||
open(IN,"run$step.$mert_logfile") or die "Can't open run$step.$mert_logfile";
|
||||
while (<IN>) {
|
||||
if (/Best point:\s*([\s\d\.\-e]+?)\s*=> ([\-\d\.]+)/) {
|
||||
$bestpoint = $1;
|
||||
$devbleu = $2;
|
||||
last;
|
||||
}
|
||||
}
|
||||
close IN;
|
||||
my %dummy; # sparse features
|
||||
($bestpoint,$devbleu) = &get_weights_from_mert("run$step.$mert_outfile","run$step.$mert_logfile",scalar @{$featlist->{"names"}},\%dummy);
|
||||
die "Failed to parse mert.log, missed Best point there."
|
||||
if !defined $bestpoint || !defined $devbleu;
|
||||
print "($step) BEST at $step $bestpoint => $devbleu at ".`date`;
|
||||
|
||||
my @newweights = split /\s+/, $bestpoint;
|
||||
|
||||
# Sanity check: order of lambdas must match
|
||||
sanity_check_order_of_lambdas($featlist,
|
||||
"gunzip -c < run$step.best$___N_BEST_LIST_SIZE.out.gz |");
|
||||
|
||||
|
||||
# update my cache of lambda values
|
||||
$featlist->{"values"} = \@newweights;
|
||||
|
||||
}
|
||||
else{
|
||||
print STDERR "No previous data are needed\n";
|
||||
@ -630,10 +612,10 @@ while(1) {
|
||||
print "run $run start at ".`date`;
|
||||
|
||||
# In case something dies later, we might wish to have a copy
|
||||
create_config($___CONFIG, "./run$run.moses.ini", $featlist, $run, (defined$devbleu?$devbleu:"--not-estimated--"));
|
||||
create_config($___CONFIG, "./run$run.moses.ini", $featlist, $run, (defined$devbleu?$devbleu:"--not-estimated--"),$sparse_weights_file);
|
||||
|
||||
|
||||
# skip if the user wanted
|
||||
# skip running the decoder if the user wanted
|
||||
if (!$skip_decoder) {
|
||||
print "($run) run decoder to produce n-best lists\n";
|
||||
$nbest_file = run_decoder($featlist, $run, $need_to_normalize);
|
||||
@ -648,8 +630,6 @@ while(1) {
|
||||
$need_to_normalize = 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
# extract score statistics and features from the nbest lists
|
||||
print STDERR "Scoring the nbestlist.\n";
|
||||
|
||||
@ -740,7 +720,7 @@ while(1) {
|
||||
if ! -s $weights_out_file;
|
||||
|
||||
|
||||
# backup copies
|
||||
# backup copies
|
||||
safesystem ("\\cp -f extract.err run$run.extract.err") or die;
|
||||
safesystem ("\\cp -f extract.out run$run.extract.out") or die;
|
||||
if ($___PAIRWISE_RANKED_OPTIMIZER) { safesystem ("\\cp -f pro.data run$run.pro.data") or die; }
|
||||
@ -751,34 +731,10 @@ while(1) {
|
||||
|
||||
print "run $run end at ".`date`;
|
||||
|
||||
$bestpoint = undef;
|
||||
$devbleu = undef;
|
||||
if ($___PAIRWISE_RANKED_OPTIMIZER) {
|
||||
open(IN,"run$run.$mert_outfile") or die "Can't open run$run.$mert_outfile";
|
||||
my (@WEIGHT,$sum);
|
||||
foreach (@CURR) { push @WEIGHT, 0; }
|
||||
while(<IN>) {
|
||||
if (/^F(\d+) ([\-\.\de]+)/) {
|
||||
$WEIGHT[$1] = $2;
|
||||
$sum += abs($2);
|
||||
}
|
||||
}
|
||||
$devbleu = "unknown";
|
||||
foreach (@WEIGHT) { $_ /= $sum; }
|
||||
$bestpoint = join(" ",@WEIGHT);
|
||||
close IN;
|
||||
}
|
||||
else {
|
||||
open(IN,"run$run.$mert_logfile") or die "Can't open run$run.$mert_logfile";
|
||||
while (<IN>) {
|
||||
if (/Best point:\s*([\s\d\.\-e]+?)\s*=> ([\-\d\.]+)/) {
|
||||
$bestpoint = $1;
|
||||
$devbleu = $2;
|
||||
last;
|
||||
}
|
||||
}
|
||||
close IN;
|
||||
}
|
||||
my %sparse_weights; # sparse features
|
||||
($bestpoint,$devbleu) = &get_weights_from_mert("run$run.$mert_outfile","run$run.$mert_logfile",scalar @{$featlist->{"names"}},\%sparse_weights);
|
||||
|
||||
|
||||
die "Failed to parse mert.log, missed Best point there."
|
||||
if !defined $bestpoint || !defined $devbleu;
|
||||
print "($run) BEST at $run: $bestpoint => $devbleu at ".`date`;
|
||||
@ -788,6 +744,15 @@ while(1) {
|
||||
|
||||
$featlist->{"values"} = \@newweights;
|
||||
|
||||
if (scalar keys %sparse_weights) {
|
||||
$sparse_weights_file = "run".($run+1).".sparse-weights";
|
||||
open(SPARSE,">".$sparse_weights_file);
|
||||
foreach my $feature (keys %sparse_weights) {
|
||||
print SPARSE "$feature $sparse_weights{$feature}\n";
|
||||
}
|
||||
close(SPARSE);
|
||||
}
|
||||
|
||||
## additional stopping criterion: weights have not changed
|
||||
my $shouldstop = 1;
|
||||
for(my $i=0; $i<@CURR; $i++) {
|
||||
@ -864,6 +829,43 @@ chdir($cwd);
|
||||
|
||||
} # end of local scope
|
||||
|
||||
# Read the weights produced by one optimizer run.
#
# Arguments:
#   $outfile        - optimizer output, read when $___PAIRWISE_RANKED_OPTIMIZER is set;
#                     lines "F<n> <weight>" are regular features, lines whose name
#                     contains "_" are sparse features
#   $logfile        - mert log, read otherwise; scanned for the first "Best point:" line
#   $weight_count   - number of regular features (weights not found in $outfile stay 0)
#   $sparse_weights - hash ref that receives the sparse feature weights
#
# Returns ($bestpoint, $devbleu): the space-separated regular weights and the
# dev-set score ("unknown" in pairwise-ranked mode). In MERT mode both are
# undef when the log contains no "Best point:" line.
sub get_weights_from_mert {
  my ($outfile,$logfile,$weight_count,$sparse_weights) = @_;
  my ($bestpoint,$devbleu);
  if ($___PAIRWISE_RANKED_OPTIMIZER) {
    # lexical handle: does not clobber a global IN that a caller may have open
    open(my $in, "<", $outfile) or die "Can't open $outfile";
    my @WEIGHT = (0) x $weight_count;
    my $sum = 0;
    while(<$in>) {
      # regular features ("+" is accepted so that exponents like 1e+05 are not cut short)
      if (/^F(\d+) ([\-\+\.\de]+)/) {
        $WEIGHT[$1] = $2;
        $sum += abs($2);
      }
      # sparse features
      elsif(/^(.+_.+) ([\-\+\.\de]+)/) {
        $$sparse_weights{$1} = $2;
      }
    }
    close $in;
    # without this check the normalisation below dies with an opaque
    # "Illegal division by zero"
    die "No non-zero regular feature weights (lines 'F<n> <weight>') found in $outfile"
      if $sum == 0;
    $devbleu = "unknown";
    # L1-normalise the regular weights; the sparse weights are scaled by the
    # same factor so that their scale relative to the regular weights is kept
    foreach (@WEIGHT) { $_ /= $sum; }
    foreach (keys %{$sparse_weights}) { $$sparse_weights{$_} /= $sum; }
    $bestpoint = join(" ",@WEIGHT);
  }
  else {
    open(my $in, "<", $logfile) or die "Can't open $logfile";
    while (<$in>) {
      # only the first "Best point:" line is used
      if (/Best point:\s*([\s\d\.\-e]+?)\s*=> ([\-\d\.]+)/) {
        $bestpoint = $1;
        $devbleu = $2;
        last;
      }
    }
    close $in;
  }
  return ($bestpoint,$devbleu);
}
|
||||
|
||||
sub run_decoder {
|
||||
my ($featlist, $run, $need_to_normalize) = @_;
|
||||
my $filename_template = "run%d.best$___N_BEST_LIST_SIZE.out";
|
||||
@ -984,6 +986,7 @@ sub get_featlist_from_moses {
|
||||
$nr++;
|
||||
chomp;
|
||||
my ($longname, $feature, $value) = split / /;
|
||||
next if $value eq "sparse";
|
||||
push @errs, "$featlistfn:$nr:Bad initial value of $feature: $value\n"
|
||||
if $value !~ /^[+-]?[0-9.e]+$/;
|
||||
push @errs, "$featlistfn:$nr:Unknown feature '$feature', please add it to \@ABBR_FULL_MAP\n"
|
||||
@ -1015,14 +1018,20 @@ sub get_order_of_scores_from_nbestlist {
|
||||
|
||||
my @order = ();
|
||||
my $label = undef;
|
||||
my $sparse = 0; # we ignore sparse features here
|
||||
foreach my $tok (split /\s+/, $scores) {
|
||||
if ($tok =~ /^([a-z][0-9a-z]*):/i) {
|
||||
if ($tok =~ /.+_.+:/) {
|
||||
$sparse = 1;
|
||||
} elsif ($tok =~ /^([a-z][0-9a-z]*):/i) {
|
||||
$label = $1;
|
||||
} elsif ($tok =~ /^-?[-0-9.e]+$/) {
|
||||
# a score found, remember it
|
||||
die "Found a score but no label before it! Bad nbestlist '$fname_or_source'!"
|
||||
if !defined $label;
|
||||
push @order, $label;
|
||||
if (!$sparse) {
|
||||
# a score found, remember it
|
||||
die "Found a score but no label before it! Bad nbestlist '$fname_or_source'!"
|
||||
if !defined $label;
|
||||
push @order, $label;
|
||||
}
|
||||
$sparse = 0;
|
||||
} else {
|
||||
die "Not a label, not a score '$tok'. Failed to parse the scores string: '$scores' of nbestlist '$fname_or_source'";
|
||||
}
|
||||
@ -1037,6 +1046,7 @@ sub create_config {
|
||||
my $featlist = shift; # the lambdas we should write
|
||||
my $iteration = shift; # just for verbosity
|
||||
my $bleu_achieved = shift; # just for verbosity
|
||||
my $sparse_weights_file = shift; # only defined when optimizing sparse features
|
||||
|
||||
my %P; # the hash of all parameters we wish to override
|
||||
|
||||
@ -1076,6 +1086,10 @@ sub create_config {
|
||||
push @{$P{$name}}, $val;
|
||||
}
|
||||
|
||||
if (defined($sparse_weights_file)) {
|
||||
push @{$P{"weights-file"}}, $___WORKING_DIR."/".$sparse_weights_file;
|
||||
}
|
||||
|
||||
# create new moses.ini decoder config file by cloning and overriding the original one
|
||||
open(INI,$infn) or die "Can't read $infn";
|
||||
delete($P{"config"}); # never output
|
||||
|
@ -36,10 +36,15 @@ using namespace std;
|
||||
bool hierarchicalFlag = false;
|
||||
bool onlyDirectFlag = false;
|
||||
bool phraseCountFlag = true;
|
||||
bool lowCountFlag = false;
|
||||
bool goodTuringFlag = false;
|
||||
bool kneserNeyFlag = false;
|
||||
bool logProbFlag = false;
|
||||
char line[LINE_MAX_LENGTH];
|
||||
inline float maybeLogProb( float a ) { return logProbFlag ? log(a) : a; }
|
||||
|
||||
void processFiles( char*, char*, char* );
|
||||
char line[LINE_MAX_LENGTH];
|
||||
void processFiles( char*, char*, char*, char* );
|
||||
void loadCountOfCounts( char* );
|
||||
bool getLine( istream &fileP, vector< string > &item );
|
||||
vector< string > splitLine();
|
||||
|
||||
@ -55,6 +60,7 @@ int main(int argc, char* argv[])
|
||||
char* &fileNameDirect = argv[1];
|
||||
char* &fileNameIndirect = argv[2];
|
||||
char* &fileNameConsolidated = argv[3];
|
||||
char* fileNameCountOfCounts;
|
||||
|
||||
for(int i=4; i<argc; i++) {
|
||||
if (strcmp(argv[i],"--Hierarchical") == 0) {
|
||||
@ -66,6 +72,25 @@ int main(int argc, char* argv[])
|
||||
} else if (strcmp(argv[i],"--NoPhraseCount") == 0) {
|
||||
phraseCountFlag = false;
|
||||
cerr << "not including the phrase count feature\n";
|
||||
} else if (strcmp(argv[i],"--GoodTuring") == 0) {
|
||||
goodTuringFlag = true;
|
||||
if (i+1==argc) {
|
||||
cerr << "ERROR: specify count of count files for Good Turing discounting!\n";
|
||||
exit(1);
|
||||
}
|
||||
fileNameCountOfCounts = argv[++i];
|
||||
cerr << "adjusting phrase translation probabilities with Good Turing discounting\n";
|
||||
} else if (strcmp(argv[i],"--KneserNey") == 0) {
|
||||
kneserNeyFlag = true;
|
||||
if (i+1==argc) {
|
||||
cerr << "ERROR: specify count of count files for Kneser Ney discounting!\n";
|
||||
exit(1);
|
||||
}
|
||||
fileNameCountOfCounts = argv[++i];
|
||||
cerr << "adjusting phrase translation probabilities with Kneser Ney discounting\n";
|
||||
} else if (strcmp(argv[i],"--LowCountFeature") == 0) {
|
||||
lowCountFlag = true;
|
||||
cerr << "including the low count feature\n";
|
||||
} else if (strcmp(argv[i],"--LogProb") == 0) {
|
||||
logProbFlag = true;
|
||||
cerr << "using log-probabilities\n";
|
||||
@ -75,11 +100,61 @@ int main(int argc, char* argv[])
|
||||
}
|
||||
}
|
||||
|
||||
processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated );
|
||||
processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated, fileNameCountOfCounts );
|
||||
}
|
||||
|
||||
void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated )
|
||||
vector< float > countOfCounts;
|
||||
vector< float > goodTuringDiscount;
|
||||
float kneserNey_D1, kneserNey_D2, kneserNey_D3, totalCount = -1;
|
||||
void loadCountOfCounts( char* fileNameCountOfCounts )
|
||||
{
|
||||
Moses::InputFileStream fileCountOfCounts(fileNameCountOfCounts);
|
||||
if (fileCountOfCounts.fail()) {
|
||||
cerr << "ERROR: could not open count of counts file " << fileNameCountOfCounts << endl;
|
||||
exit(1);
|
||||
}
|
||||
istream &fileP = fileCountOfCounts;
|
||||
|
||||
countOfCounts.push_back(0.0);
|
||||
while(1) {
|
||||
if (fileP.eof()) break;
|
||||
SAFE_GETLINE((fileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
|
||||
if (fileP.eof()) break;
|
||||
if (totalCount < 0)
|
||||
totalCount = atof(line); // total number of distinct phrase pairs
|
||||
else
|
||||
countOfCounts.push_back( atof(line) );
|
||||
}
|
||||
fileCountOfCounts.Close();
|
||||
|
||||
// compute Good Turing discounts
|
||||
if (goodTuringFlag) {
|
||||
goodTuringDiscount.push_back(0.01); // floor value
|
||||
for( size_t i=1; i<countOfCounts.size()-1; i++ ) {
|
||||
goodTuringDiscount.push_back(((float)i+1)/(float)i*((countOfCounts[i+1]+0.1) / ((float)countOfCounts[i]+0.1)));
|
||||
if (goodTuringDiscount[i]>1)
|
||||
goodTuringDiscount[i] = 1;
|
||||
if (goodTuringDiscount[i]<goodTuringDiscount[i-1])
|
||||
goodTuringDiscount[i] = goodTuringDiscount[i-1];
|
||||
}
|
||||
}
|
||||
|
||||
// compute Kneser Ney co-efficients [Chen&Goodman, 1998]
|
||||
float Y = countOfCounts[1] / (countOfCounts[1] + 2*countOfCounts[2]);
|
||||
kneserNey_D1 = 1 - 2*Y * countOfCounts[2] / countOfCounts[1];
|
||||
kneserNey_D2 = 2 - 3*Y * countOfCounts[3] / countOfCounts[2];
|
||||
kneserNey_D3 = 3 - 4*Y * countOfCounts[4] / countOfCounts[3];
|
||||
// sanity constraints
|
||||
if (kneserNey_D1 > 0.9) kneserNey_D1 = 0.9;
|
||||
if (kneserNey_D2 > 1.9) kneserNey_D2 = 1.9;
|
||||
if (kneserNey_D3 > 2.9) kneserNey_D3 = 2.9;
|
||||
}
|
||||
|
||||
void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated, char* fileNameCountOfCounts )
|
||||
{
|
||||
if (goodTuringFlag || kneserNeyFlag)
|
||||
loadCountOfCounts( fileNameCountOfCounts );
|
||||
|
||||
// open input files
|
||||
Moses::InputFileStream fileDirect(fileNameDirect);
|
||||
Moses::InputFileStream fileIndirect(fileNameIndirect);
|
||||
@ -134,29 +209,67 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
|
||||
// output hierarchical phrase pair (with separated labels)
|
||||
fileConsolidated << itemDirect[0] << " ||| " << itemDirect[1];
|
||||
|
||||
// probs
|
||||
fileConsolidated << " ||| ";
|
||||
if (!onlyDirectFlag) {
|
||||
fileConsolidated << itemIndirect[2]; // prob indirect
|
||||
// SCORES ...
|
||||
fileConsolidated << " |||";
|
||||
vector<string> directCounts = tokenize(itemDirect[4].c_str());
|
||||
vector<string> indirectCounts = tokenize(itemIndirect[4].c_str());
|
||||
float countF = atof(directCounts[0].c_str());
|
||||
float countE = atof(indirectCounts[0].c_str());
|
||||
float countEF = atof(indirectCounts[1].c_str());
|
||||
float n1_F, n1_E;
|
||||
if (kneserNeyFlag) {
|
||||
n1_F = atof(directCounts[2].c_str());
|
||||
n1_E = atof(indirectCounts[2].c_str());
|
||||
}
|
||||
fileConsolidated << " " << itemDirect[2]; // prob direct
|
||||
|
||||
// Good Turing discounting
|
||||
float adjustedCountEF = countEF;
|
||||
if (goodTuringFlag && countEF+0.99999 < goodTuringDiscount.size()-1)
|
||||
adjustedCountEF *= goodTuringDiscount[(int)(countEF+0.99998)];
|
||||
float adjustedCountEF_indirect = adjustedCountEF;
|
||||
|
||||
// Kneser Ney discounting [Foster et al, 2006]
|
||||
if (kneserNeyFlag) {
  // choose the discount by joint count: D1 below 2, D2 below 3, D3 otherwise.
  // The branches must be chained; with two independent ifs the D1 case is
  // immediately overwritten by D2.
  float D = kneserNey_D3;
  if (countEF < 2) D = kneserNey_D1;
  else if (countEF < 3) D = kneserNey_D2;
  if (D > countEF) D = countEF - 0.01; // sanity constraint

  // for direct: mass removed by discounting is redistributed using n1_E
  float p_b_E = n1_E / totalCount; // n1_E as a fraction of totalCount
  float alpha_F = D * n1_F / countF; // available mass
  adjustedCountEF = countEF - D + countF * alpha_F * p_b_E;

  // for indirect: same computation with the roles of E and F swapped
  float p_b_F = n1_F / totalCount; // n1_F as a fraction of totalCount
  float alpha_E = D * n1_E / countE; // available mass
  adjustedCountEF_indirect = countEF - D + countE * alpha_E * p_b_F;
}
|
||||
|
||||
// prob indirect
|
||||
if (!onlyDirectFlag) {
|
||||
fileConsolidated << " " << maybeLogProb(adjustedCountEF_indirect/countE);
|
||||
fileConsolidated << " " << itemIndirect[2];
|
||||
}
|
||||
|
||||
// prob direct
|
||||
fileConsolidated << " " << maybeLogProb(adjustedCountEF/countF);
|
||||
fileConsolidated << " " << itemDirect[2];
|
||||
|
||||
// phrase count feature
|
||||
if (phraseCountFlag) {
|
||||
fileConsolidated << " " << (logProbFlag ? 1 : 2.718); // phrase count feature
|
||||
fileConsolidated << " " << maybeLogProb(2.718);
|
||||
}
|
||||
|
||||
// low count feature
|
||||
if (lowCountFlag) {
|
||||
fileConsolidated << " " << maybeLogProb(exp(-1.0/countEF));
|
||||
}
|
||||
|
||||
// alignment
|
||||
fileConsolidated << " ||| " << itemDirect[3];
|
||||
|
||||
// counts, for debugging
|
||||
vector<string> directCounts = tokenize(itemDirect[4].c_str());
|
||||
vector<string> indirectCounts = tokenize(itemIndirect[4].c_str());
|
||||
fileConsolidated << "||| " << indirectCounts[0] << " " << directCounts[0];
|
||||
// output rule count if present in either file
|
||||
if (directCounts.size() > 1) {
|
||||
fileConsolidated << " " << directCounts[1];
|
||||
} else if (indirectCounts.size() > 1) {
|
||||
fileConsolidated << " " << indirectCounts[1];
|
||||
}
|
||||
fileConsolidated << "||| " << countE << " " << countF; // << " " << countEF;
|
||||
|
||||
fileConsolidated << endl;
|
||||
}
|
||||
@ -165,6 +278,7 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
|
||||
fileConsolidated.close();
|
||||
}
|
||||
|
||||
|
||||
bool getLine( istream &fileP, vector< string > &item )
|
||||
{
|
||||
if (fileP.eof())
|
||||
|
@ -45,7 +45,7 @@
|
||||
#include "tables-core.h"
|
||||
#include "XmlTree.h"
|
||||
|
||||
#define LINE_MAX_LENGTH 60000
|
||||
#define LINE_MAX_LENGTH 500000
|
||||
|
||||
using namespace std;
|
||||
|
||||
|
@ -24,7 +24,7 @@
|
||||
|
||||
using namespace std;
|
||||
|
||||
#define LINE_MAX_LENGTH 60000
|
||||
#define LINE_MAX_LENGTH 500000
|
||||
|
||||
// HPhraseVertex represents a point in the alignment matrix
|
||||
typedef pair <int, int> HPhraseVertex;
|
||||
|
@ -29,6 +29,10 @@ vector<string> tokenize( const char* input )
|
||||
return token;
|
||||
}
|
||||
|
||||
bool isNonTerminal( const WORD &symbol ) {
|
||||
return symbol.substr(0, 1) == "[" && symbol.substr(symbol.size()-1, 1) == "]";
|
||||
}
|
||||
|
||||
WORD_ID Vocabulary::storeIfNew( const WORD& word )
|
||||
{
|
||||
map<WORD, WORD_ID>::iterator i = lookup.find( word );
|
||||
|
@ -34,9 +34,10 @@ my($_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_DIR, $_TEMP_DIR, $_
|
||||
$_PHRASE_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES,
|
||||
$_MEMSCORE, $_FINAL_ALIGNMENT_MODEL,
|
||||
$_CONTINUE,$_MAX_LEXICAL_REORDERING,$_DO_STEPS,
|
||||
$_ADDITIONAL_INI,
|
||||
$_DICTIONARY, $_EPPEX);
|
||||
|
||||
my $debug = 0; # debug this script, do not delete any files in debug mode
|
||||
my $debug = 1; # debug this script, do not delete any files in debug mode
|
||||
|
||||
# the following line is set at installation time by 'make release'. BEWARE!
|
||||
my $BINDIR="/home/pkoehn/statmt/bin";
|
||||
@ -109,7 +110,7 @@ $_HELP = 1
|
||||
'memscore:s' => \$_MEMSCORE,
|
||||
'force-factored-filenames' => \$_FORCE_FACTORED_FILENAMES,
|
||||
'dictionary=s' => \$_DICTIONARY,
|
||||
'eppex:s' => \$_EPPEX,
|
||||
'additional-ini=s' => \$_ADDITIONAL_INI
|
||||
);
|
||||
|
||||
if ($_HELP) {
|
||||
@ -1372,11 +1373,28 @@ sub score_phrase {
|
||||
sub score_phrase_phrase_extract {
|
||||
my ($ttable_file,$lexical_file,$extract_file) = @_;
|
||||
|
||||
# remove consolidation options
|
||||
my $ONLY_DIRECT = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /OnlyDirect/);
|
||||
my $PHRASE_COUNT = (!defined($_SCORE_OPTIONS) || $_SCORE_OPTIONS !~ /NoPhraseCount/);
|
||||
my $CORE_SCORE_OPTIONS = defined($_SCORE_OPTIONS) ? $_SCORE_OPTIONS : "";
|
||||
$CORE_SCORE_OPTIONS =~ s/\-+OnlyDirect//i;
|
||||
$CORE_SCORE_OPTIONS =~ s/\-+NoPhraseCount//i;
|
||||
my $LOW_COUNT = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /LowCountFeature/);
|
||||
my $UNALIGNED_COUNT = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /UnalignedPenalty/);
|
||||
my ($UNALIGNED_FW_COUNT,$UNALIGNED_FW_F,$UNALIGNED_FW_E);
|
||||
if (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /UnalignedFunctionWordPenalty +(\S+) +(\S+)/) {
|
||||
$UNALIGNED_FW_COUNT = 1;
|
||||
$UNALIGNED_FW_F = $1;
|
||||
$UNALIGNED_FW_E = $2;
|
||||
}
|
||||
my $GOOD_TURING = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /GoodTuring/);
|
||||
my $KNESER_NEY = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /KneserNey/);
|
||||
my $LOG_PROB = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /LogProb/);
|
||||
my $NEG_LOG_PROB = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NegLogProb/);
|
||||
my $NO_LEX = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoLex/);
|
||||
my $MIN_COUNT_HIERARCHICAL = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /MinCountHierarchical ([\d\.]+)/) ? $1 : undef;
|
||||
my $CORE_SCORE_OPTIONS = "";
|
||||
$CORE_SCORE_OPTIONS .= " --LogProb" if $LOG_PROB;
|
||||
$CORE_SCORE_OPTIONS .= " --NegLogProb" if $NEG_LOG_PROB;
|
||||
$CORE_SCORE_OPTIONS .= " --NoLex" if $NO_LEX;
|
||||
|
||||
my $substep = 1;
|
||||
for my $direction ("f2e","e2f") {
|
||||
next if $___CONTINUE && -e "$ttable_file.half.$direction";
|
||||
@ -1405,6 +1423,11 @@ sub score_phrase_phrase_extract {
|
||||
my $cmd = "$PHRASE_SCORE $extract $lexical_file.$direction $ttable_file.half.$direction $inverse";
|
||||
$cmd .= " --Hierarchical" if $_HIERARCHICAL;
|
||||
$cmd .= " --WordAlignment" if $_PHRASE_WORD_ALIGNMENT;
|
||||
$cmd .= " --KneserNey $ttable_file.coc" if $KNESER_NEY;
|
||||
$cmd .= " --GoodTuring $ttable_file.coc" if $GOOD_TURING && $inverse eq "";
|
||||
$cmd .= " --UnalignedPenalty" if $UNALIGNED_COUNT;
|
||||
$cmd .= " --UnalignedFunctionWordPenalty ".($inverse ? $UNALIGNED_FW_F : $UNALIGNED_FW_E) if $UNALIGNED_FW_COUNT;
|
||||
$cmd .= " --MinCountHierarchical $MIN_COUNT_HIERARCHICAL" if $MIN_COUNT_HIERARCHICAL;
|
||||
$cmd .= " $CORE_SCORE_OPTIONS" if defined($_SCORE_OPTIONS);
|
||||
print $cmd."\n";
|
||||
safesystem($cmd) or die "ERROR: Scoring of phrases failed";
|
||||
@ -1423,8 +1446,13 @@ sub score_phrase_phrase_extract {
|
||||
return if $___CONTINUE && -e "$ttable_file.gz";
|
||||
my $cmd = "$PHRASE_CONSOLIDATE $ttable_file.half.f2e $ttable_file.half.e2f.sorted $ttable_file";
|
||||
$cmd .= " --Hierarchical" if $_HIERARCHICAL;
|
||||
$cmd .= " --LogProb" if $LOG_PROB;
|
||||
$cmd .= " --NegLogProb" if $NEG_LOG_PROB;
|
||||
$cmd .= " --OnlyDirect" if $ONLY_DIRECT;
|
||||
$cmd .= " --NoPhraseCount" unless $PHRASE_COUNT;
|
||||
$cmd .= " --LowCountFeature" if $LOW_COUNT;
|
||||
$cmd .= " --GoodTuring $ttable_file.coc" if $GOOD_TURING;
|
||||
$cmd .= " --KneserNey $ttable_file.coc" if $KNESER_NEY;
|
||||
safesystem($cmd) or die "ERROR: Consolidating the two phrase table halves failed";
|
||||
if (! $debug) { safesystem("rm -f $ttable_file.half.*") or die("ERROR"); }
|
||||
if (! $___DONT_ZIP) {
|
||||
@ -1681,6 +1709,13 @@ sub create_ini {
|
||||
[ttable-file]\n";
|
||||
my $num_of_ttables = 0;
|
||||
my @SPECIFIED_TABLE = @_PHRASE_TABLE;
|
||||
my $basic_weight_count = 4; # both directions, lex and phrase
|
||||
$basic_weight_count-=2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoLex/;
|
||||
$basic_weight_count+=2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /UnalignedPenalty/; # word ins/del
|
||||
$basic_weight_count+=2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /UnalignedFunctionWordPenalty/;
|
||||
$basic_weight_count /= 2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /OnlyDirect/;
|
||||
$basic_weight_count++ unless defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoPhraseCount/; # phrase count feature
|
||||
$basic_weight_count++ if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /LowCountFeature/; # low count feature
|
||||
foreach my $f (split(/\+/,$___TRANSLATION_FACTORS)) {
|
||||
$num_of_ttables++;
|
||||
my $ff = $f;
|
||||
@ -1688,10 +1723,6 @@ sub create_ini {
|
||||
my $file = "$___MODEL_DIR/".($_HIERARCHICAL?"rule-table":"phrase-table").($___NOT_FACTORED ? "" : ".$f").".gz";
|
||||
$file = shift @SPECIFIED_TABLE if scalar(@SPECIFIED_TABLE);
|
||||
my $phrase_table_impl = ($_HIERARCHICAL ? 6 : 0);
|
||||
my $basic_weight_count = 4; # both directions, lex and phrase
|
||||
$basic_weight_count /= 2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /OnlyDirect/;
|
||||
$basic_weight_count /= 2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoLex/;
|
||||
$basic_weight_count++ unless defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoPhraseCount/; # phrase count feature
|
||||
print INI "$phrase_table_impl $ff $basic_weight_count $file\n";
|
||||
}
|
||||
if ($_GLUE_GRAMMAR) {
|
||||
@ -1783,10 +1814,6 @@ sub create_ini {
|
||||
|
||||
print INI "\n\n# translation model weights\n[weight-t]\n";
|
||||
foreach my $f (split(/\+/,$___TRANSLATION_FACTORS)) {
|
||||
my $basic_weight_count = 4; # both directions, lex and phrase
|
||||
$basic_weight_count /= 2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /OnlyDirect/;
|
||||
$basic_weight_count /= 2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoLex/;
|
||||
$basic_weight_count++ unless defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoPhraseCount/; # phrase count feature
|
||||
for(1..$basic_weight_count) {
|
||||
printf INI "%.2f\n", 1/$basic_weight_count;
|
||||
}
|
||||
@ -1826,6 +1853,11 @@ sub create_ini {
|
||||
print INI "\n# delimiter between factors in input\n[factor-delimiter]\n$___FACTOR_DELIMITER\n\n"
|
||||
}
|
||||
|
||||
if ($_ADDITIONAL_INI) {
|
||||
print INI "\n# additional settings\n\n";
|
||||
foreach (split(/<br>/i,$_ADDITIONAL_INI)) { print INI $_."\n"; }
|
||||
}
|
||||
|
||||
close(INI);
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user