Rewrote the lexical reordering model scoring in C++. Adapted train-factored-phrase-model.perl to that change. Minor fixes in other places, for compatibility

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/branches/hierarchical-reo@2884 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
sarst 2010-02-10 17:19:06 +00:00
parent 9adc3ee500
commit 92368ba490
9 changed files with 1267 additions and 2735 deletions

View File

@ -40,8 +40,8 @@ class LexicalReorderingState : public FFState {
static const LexicalReordering::ReorderingType D = 2; // discontinuous
static const LexicalReordering::ReorderingType DL = 2; // discontinuous, left
static const LexicalReordering::ReorderingType DR = 3; // discontinuous, right
static const LexicalReordering::ReorderingType L = 0; // left
static const LexicalReordering::ReorderingType R = 1; // right
static const LexicalReordering::ReorderingType R = 0; // right
static const LexicalReordering::ReorderingType L = 1; // left
};
//! State for the standard Moses implementation of lexical reordering models

View File

@ -26,7 +26,7 @@ RELEASEDIR=$(TARGETDIR)/scripts-$(TS)
all: compile
SUBDIRS=cmert-0.5 phrase-extract symal mbr
SUBDIRS=cmert-0.5 phrase-extract symal mbr lexical-reordering
SUBDIRS_CLEAN=$(SUBDIRS) memscore
compile: compile-memscore

View File

@ -0,0 +1,15 @@
all: score
clean:
rm -f *.o
.cpp.o:
$(CXX) -O6 -g -c $<
# Build the scorer binary. The zlib library is listed after the sources and
# objects that reference it: linkers that resolve symbols left to right (or
# use --as-needed) drop a library that appears before its users.
score: score.cpp reordering_classes.o
	$(CXX) score.cpp reordering_classes.o -lz -o score
#reordering_classes.o: reordering_classes.h reordering_classes.cpp
# $(CXX) reordering_classes.cpp

View File

@ -0,0 +1,427 @@
#include <vector>
#include <iostream>
#include <cstdlib>
#include <numeric>
#include <cstdio>
//#include <iostream>
#include <sstream>
#include <string>
#include "zlib.h"
#include "reordering_classes.h"
using namespace std;
// Creates the four count vectors with one zero-initialised slot for every
// ORIENTATION value in the range MONO..NOMONO.
ModelScore::ModelScore()
  : count_fe_prev(NOMONO - MONO + 1, 0.0),
    count_fe_next(NOMONO - MONO + 1, 0.0),
    count_f_prev(NOMONO - MONO + 1, 0.0),
    count_f_next(NOMONO - MONO + 1, 0.0) {
}
// Factory for the counting object that matches a model type name.
//
// modeltype: one of "mslr", "msd", "monotonicity" or "leftright".
// Returns a heap-allocated ModelScore subclass; the caller takes ownership.
// An unknown name is reported on stderr and terminates the process.
ModelScore* ModelScore::createModelScore(const string& modeltype) {
  if (modeltype == "mslr") {
    return new ModelScoreMSLR();
  }
  if (modeltype == "msd") {
    return new ModelScoreMSD();
  }
  // "monotonicity" is the spelling used by Model::createModel; the historic
  // misspelling "monotonoicity" is still accepted so existing callers keep working.
  if (modeltype == "monotonicity" || modeltype == "monotonoicity") {
    return new ModelScoreMonotonicity();
  }
  if (modeltype == "leftright") {
    return new ModelScoreLR();
  }
  cerr << "Illegal model type given for lexical reordering model scoring: " << modeltype << endl;
  exit(1);
}
// Zeroes the counts kept per (source, target) phrase pair, for both the
// previous and the next orientation.
void ModelScore::reset_fe() {
  count_fe_prev.assign(count_fe_prev.size(), 0.0);
  count_fe_next.assign(count_fe_next.size(), 0.0);
}
// Zeroes the counts kept per source phrase, for both the previous and the
// next orientation.
void ModelScore::reset_f() {
  count_f_prev.assign(count_f_prev.size(), 0.0);
  count_f_next.assign(count_f_next.size(), 0.0);
}
void ModelScore::add_example(const std::string& previous, std::string& next) {
count_fe_prev[getType(previous)]++;
count_f_prev[getType(previous)]++;
count_fe_next[getType(next)]++;
count_f_next[getType(next)]++;
}
// Read-only accessors for the four count vectors. Each vector is indexed by
// ORIENTATION and is returned by reference, so the caller sees the current
// counts without a copy.

// Counts for the previous orientation, accumulated since the last reset_fe().
const std::vector<double>& ModelScore::get_scores_fe_prev() const {
return count_fe_prev;
}
// Counts for the next orientation, accumulated since the last reset_fe().
const std::vector<double>& ModelScore::get_scores_fe_next() const {
return count_fe_next;
}
// Counts for the previous orientation, accumulated since the last reset_f().
const std::vector<double>& ModelScore::get_scores_f_prev() const {
return count_f_prev;
}
// Counts for the next orientation, accumulated since the last reset_f().
const std::vector<double>& ModelScore::get_scores_f_next() const {
return count_f_next;
}
// Default name-to-orientation mapping: every orientation name maps to the
// enumerator of the same name. Any other string is reported on stderr and
// terminates the process.
ORIENTATION ModelScore::getType(const std::string& s) {
  static const char* const kNames[] = {"mono", "swap", "dright", "dleft", "other", "nomono"};
  static const ORIENTATION kTypes[] = {MONO, SWAP, DRIGHT, DLEFT, OTHER, NOMONO};
  const int entries = sizeof(kTypes) / sizeof(kTypes[0]);

  for (int k = 0; k < entries; ++k) {
    if (s == kNames[k]) {
      return kTypes[k];
    }
  }
  cerr << "Illegal reordering type used: " << s << endl;
  exit(1);
}
// MSLR mapping: mono, swap, dright and dleft map to their own enumerators.
// "other" and "nomono" are known names but not valid for this model type;
// they, like any unknown name, are reported on stderr and end the process.
ORIENTATION ModelScoreMSLR::getType(const std::string& s) {
  if (s == "mono") return MONO;
  if (s == "swap") return SWAP;
  if (s == "dright") return DRIGHT;
  if (s == "dleft") return DLEFT;

  cerr << "Illegal reordering type used: " << s;
  if (s == "other" || s == "nomono") {
    cerr << " for model type MSLR";
  }
  cerr << endl;
  exit(1);
}
// Left/right mapping: mono and dright are counted as DRIGHT, swap and dleft
// as DLEFT. "other", "nomono" and unknown names are reported on stderr and
// end the process.
ORIENTATION ModelScoreLR::getType(const std::string& s) {
  const bool toRight = (s == "mono") || (s == "dright");
  if (toRight) return DRIGHT;

  const bool toLeft = (s == "swap") || (s == "dleft");
  if (toLeft) return DLEFT;

  cerr << "Illegal reordering type used: " << s;
  if (s == "other" || s == "nomono") {
    cerr << " for model type LeftRight";
  }
  cerr << endl;
  exit(1);
}
// MSD mapping: mono and swap keep their own class; dleft, dright and other
// are all counted as OTHER. "nomono" and unknown names are reported on stderr
// and end the process.
ORIENTATION ModelScoreMSD::getType(const std::string& s) {
  if (s == "mono") return MONO;
  if (s == "swap") return SWAP;
  if (s == "dleft" || s == "dright" || s == "other") return OTHER;

  cerr << "Illegal reordering type used: " << s;
  if (s == "nomono") {
    cerr << " for model type MSD";
  }
  cerr << endl;
  exit(1);
}
// Monotonicity mapping: "mono" is MONO, every other known orientation name is
// counted as NOMONO. Unknown names are reported on stderr and end the process.
ORIENTATION ModelScoreMonotonicity::getType(const std::string& s) {
  if (s == "mono") return MONO;

  static const char* const kNonMonotone[] = {"swap", "dleft", "dright", "other", "nomono"};
  const int entries = sizeof(kNonMonotone) / sizeof(kNonMonotone[0]);
  for (int k = 0; k < entries; ++k) {
    if (s == kNonMonotone[k]) return NOMONO;
  }

  cerr << "Illegal reordering type used: " << s << endl;
  exit(1);
}
// Count-based smoothing for the MSLR model.
//
// scores: counts indexed by ORIENTATION; weight: smoothing weight.
// Returns one value per output class (mono, swap, dright, dleft), each being
// weight * (count + 0.1) / total, where total is the sum of all counts.
std::vector<double> ScorerMSLR::createSmoothing(std::vector<double> scores, double weight) const {
  // The initial value must be a double: with a plain 0 accumulate() sums in
  // int, which truncates every partial sum and overflows on large counts.
  const double total = accumulate(scores.begin(), scores.end(), 0.0);
  vector<double> res;
  res.push_back(weight*(scores[MONO]+0.1)/total);
  res.push_back(weight*(scores[SWAP]+0.1)/total);
  res.push_back(weight*(scores[DRIGHT]+0.1)/total);
  res.push_back(weight*(scores[DLEFT]+0.1)/total);
  return res;
}
std::vector<double> ScorerMSLR::createConstSmoothing(double weight) const {
vector<double> smoothing;
for (int i=1; i<=4; ++i) {
smoothing.push_back(weight);
}
return smoothing;
}
// Count-based smoothing for the MSD model.
//
// scores: counts indexed by ORIENTATION; weight: smoothing weight.
// Returns three values (mono, swap, discontinuous); the discontinuous class
// pools the DLEFT, DRIGHT and OTHER counts. Each value is
// weight * (count + 0.1) / total, where total is the sum of all counts.
std::vector<double> ScorerMSD::createSmoothing(std::vector<double> scores, double weight) const {
  // The initial value must be a double: with a plain 0 accumulate() sums in
  // int, which truncates every partial sum and overflows on large counts.
  const double total = accumulate(scores.begin(), scores.end(), 0.0);
  vector<double> res;
  res.push_back(weight*(scores[MONO]+0.1)/total);
  res.push_back(weight*(scores[SWAP]+0.1)/total);
  res.push_back(weight*(scores[DLEFT]+scores[DRIGHT]+scores[OTHER]+0.1)/total);
  return res;
}
std::vector<double> ScorerMSD::createConstSmoothing(double weight) const {
vector<double> smoothing;
for (int i=1; i<=3; ++i) {
smoothing.push_back(weight);
}
return smoothing;
}
// Count-based smoothing for the monotonicity model.
//
// scores: counts indexed by ORIENTATION; weight: smoothing weight.
// Returns two values (mono, non-mono); the non-mono class pools every count
// except MONO. Each value is weight * (count + 0.1) / total, where total is
// the sum of all counts.
std::vector<double> ScorerMonotonicity::createSmoothing(std::vector<double> scores, double weight) const {
  // The initial value must be a double: with a plain 0 accumulate() sums in
  // int, which truncates every partial sum and overflows on large counts.
  const double total = accumulate(scores.begin(), scores.end(), 0.0);
  vector<double> res;
  res.push_back(weight*(scores[MONO]+0.1)/total);
  res.push_back(weight*(scores[SWAP]+scores[DLEFT]+scores[DRIGHT]+scores[OTHER]+scores[NOMONO]+0.1)/total);
  return res;
}
std::vector<double> ScorerMonotonicity::createConstSmoothing(double weight) const {
vector<double> smoothing;
for (double i=1; i<=2; ++i) {
smoothing.push_back(weight);
}
return smoothing;
}
// Count-based smoothing for the left/right model.
//
// scores: counts indexed by ORIENTATION; weight: smoothing weight.
// Returns two values: the right class pools MONO and DRIGHT, the left class
// pools SWAP and DLEFT. Each value is weight * (count + 0.1) / total, where
// total is the sum of all counts.
std::vector<double> ScorerLR::createSmoothing(std::vector<double> scores, double weight) const {
  // The initial value must be a double: with a plain 0 accumulate() sums in
  // int, which truncates every partial sum and overflows on large counts.
  const double total = accumulate(scores.begin(), scores.end(), 0.0);
  vector<double> res;
  res.push_back(weight*(scores[MONO]+scores[DRIGHT]+0.1)/total);
  // The left class gets the same 0.1 floor as every other class of every
  // scorer, so an unseen orientation never ends up with zero smoothing mass.
  res.push_back(weight*(scores[SWAP]+scores[DLEFT]+0.1)/total);
  return res;
}
std::vector<double> ScorerLR::createConstSmoothing(double weight) const {
vector<double> smoothing;
for (int i=1; i<=2; ++i) {
smoothing.push_back(weight);
}
return smoothing;
}
// Projects the full per-orientation counts onto the four MSLR classes, in
// the order mono, swap, dright, dleft.
std::vector<double> ScorerMSLR::score(vector<double> all_scores) const {
  vector<double> picked(4);
  picked[0] = all_scores[MONO];
  picked[1] = all_scores[SWAP];
  picked[2] = all_scores[DRIGHT];
  picked[3] = all_scores[DLEFT];
  return picked;
}
// Projects the full per-orientation counts onto the three MSD classes:
// mono, swap, and the sum of dright, dleft and other.
std::vector<double> ScorerMSD::score(vector<double> all_scores) const {
  vector<double> picked(3);
  picked[0] = all_scores[MONO];
  picked[1] = all_scores[SWAP];
  picked[2] = all_scores[DRIGHT]+all_scores[DLEFT]+all_scores[OTHER];
  return picked;
}
// Projects the full per-orientation counts onto two classes: mono, and the
// sum of every other orientation.
std::vector<double> ScorerMonotonicity::score(vector<double> all_scores) const {
  // Summed in the order swap, dright, dleft, other, nomono.
  double nonMono = all_scores[SWAP];
  nonMono += all_scores[DRIGHT];
  nonMono += all_scores[DLEFT];
  nonMono += all_scores[OTHER];
  nonMono += all_scores[NOMONO];

  vector<double> picked(2);
  picked[0] = all_scores[MONO];
  picked[1] = nonMono;
  return picked;
}
// Projects the full per-orientation counts onto two classes: right
// (mono + dright) and left (swap + dleft).
std::vector<double> ScorerLR::score(vector<double> all_scores) const {
  vector<double> picked(2);
  picked[0] = all_scores[MONO]+all_scores[DRIGHT];
  picked[1] = all_scores[SWAP]+all_scores[DLEFT];
  return picked;
}
void Model::score_fe(const string& f, const string& e) {
if (!fe) //Make sure we do not do anything if it is not a fe model
return;
//file >> f >> " " >> e >> " ||| ";
fprintf(file,"%s ||| %s ||| ",f.c_str(),e.c_str());
//condition on the previous phrase
if (previous) {
vector<double> scores = scorer->score(modelscore->get_scores_fe_prev());
double sum = 0;
for(int i=0; i<scores.size(); ++i) {
scores[i] += smoothing_prev[i];
sum += scores[i];
}
for(int i=0; i<scores.size(); ++i) {
//file >> scores[i]/sum >> " ";
fprintf(file,"%f ",scores[i]/sum);
}
}
//condition on the next phrase
if (next) {
//file >> "||| ";
fprintf(file, "||| ");
vector<double> scores = scorer->score(modelscore->get_scores_fe_next());
double sum = 0;
for(int i=0; i<scores.size(); ++i) {
scores[i] += smoothing_next[i];
sum += scores[i];
}
for(int i=0; i<scores.size(); ++i) {
//file >> scores[i]/sum >> " ";
fprintf(file, "%f ", scores[i]/sum);
}
}
//file >> "\n";
fprintf(file,"\n");
}
void Model::score_f(const string& f) {
if (fe) //Make sure we do not do anything if it is not a f model
return;
//file >> f >> " ||| ";
fprintf(file, "%s ||| ", f.c_str());
//condition on the previous phrase
if (previous) {
vector<double> scores = scorer->score(modelscore->get_scores_f_prev());
double sum = 0;
for(int i=0; i<scores.size(); ++i) {
scores[i] += smoothing_prev[i];
sum += scores[i];
}
for(int i=0; i<scores.size(); ++i) {
fprintf(file, "%f ", scores[i]/sum);
}
}
//condition on the next phrase
if (next) {
//file >> "||| ";
fprintf(file, "||| ");
vector<double> scores = scorer->score(modelscore->get_scores_f_next());
double sum = 0;
for(int i=0; i<scores.size(); ++i) {
scores[i] += smoothing_next[i];
sum += scores[i];
}
for(int i=0; i<scores.size(); ++i) {
//file >> scores[i]/sum >> " ";
fprintf(file, "%f ", scores[i]/sum);
}
}
//file >> "\n";
fprintf(file, "\n");
}
// Opens the output file fn for writing and records the model configuration.
//
// ms, sc: counting object and scorer used by this model.
// dir:    "backward" scores only the previous phrase, "forward" only the
//         next phrase; any other value scores both.
// lang:   "fe" conditions on both languages, "f" on the source language only.
// A file that cannot be opened or an unknown lang value is reported on
// stderr and terminates the process.
Model::Model(ModelScore* ms, Scorer* sc, const string& dir, const string& lang, const string& fn)
  : modelscore(ms), scorer(sc), filename(fn) {
  file = fopen(filename.c_str(),"w");
  if (file == NULL) {
    cerr << "Could not open the model output file: " << filename << endl;
    exit(1);
  }

  const bool bothLanguages = (lang == "fe");
  if (!bothLanguages && lang != "f") {
    cerr << "You have given an illegal language to condition on: " << lang
         << "\nLegal types: fe (on both languages), f (only on source language)\n";
    exit(1);
  }
  fe = bothLanguages;

  // Each direction stays enabled unless the opposite one was requested.
  previous = (dir != "forward");
  next = (dir != "backward");
}
Model::~Model() {
fclose(file);
delete modelscore;
delete scorer;
}
void Model::zipFile() {
fclose(file);
file = fopen(filename.c_str(), "rb");
FILE* gzfile = (FILE*) gzopen((filename+".gz").c_str(),"wb");
char inbuffer[128];
int num_read;
while ((num_read = fread(inbuffer, 1, sizeof(inbuffer), file)) > 0) {
gzwrite(gzfile, inbuffer, num_read);
}
fclose(file);
gzclose(gzfile);
//Remove the unzipped file
remove(filename.c_str());
}
void Model::split_config(const string& config, string& dir, string& lang, string& orient) {
istringstream is(config);
string type;
getline(is, type, '-');
getline(is, orient, '-');
getline(is, dir, '-');
getline(is, lang, '-');
}
// Factory: parses a configuration string, picks the Scorer that matches its
// orientation part and creates a Model writing to filepath + config.
// An unknown orientation is reported on stderr and terminates the process.
Model* Model::createModel(ModelScore* modelscore, const std::string& config, const std::string& filepath) {
  string dir, lang, orient;
  split_config(config,dir,lang,orient);
  const string filename = filepath + config;

  Scorer* chosen = NULL;
  if (orient == "mslr") {
    chosen = new ScorerMSLR();
  } else if (orient == "msd") {
    chosen = new ScorerMSD();
  } else if (orient == "monotonicity") {
    chosen = new ScorerMonotonicity();
  } else if (orient == "leftright") {
    chosen = new ScorerLR();
  } else {
    cerr << "Illegal orientation type of reordering model: " << orient
         << "\n allowed types: mslr, msd, monotonicity, leftright\n";
    exit(1);
  }
  return new Model(modelscore, chosen, dir, lang, filename);
}
// Derives count-based smoothing values with weight w from the counts
// currently held by the ModelScore: smoothing_prev from the previous-phrase
// counts and smoothing_next from the next-phrase counts.
void Model::createSmoothing(double w) {
  smoothing_prev = scorer->createSmoothing(modelscore->get_scores_fe_prev(),w);
  // Must use the *next* counts; the prev counts were used here by mistake,
  // which gave both directions the same smoothing distribution.
  smoothing_next = scorer->createSmoothing(modelscore->get_scores_fe_next(),w);
}
// Sets both smoothing vectors to the scorer's constant smoothing for weight w.
void Model::createConstSmoothing(double w) {
  smoothing_prev = scorer->createConstSmoothing(w);
  smoothing_next = scorer->createConstSmoothing(w);
}

View File

@ -0,0 +1,134 @@
/*
* reordering_classes.h
* Utility classes for lexical reordering table scoring
*
* Created by: Sara Stymne - Linköping University
* Machine Translation Marathon 2010, Dublin
*/
#pragma once
#include <vector>
#include <string>
#include <fstream>
// Orientation classes. The enumerators keep their default values (MONO is 0,
// NOMONO is 5) and are used as indices into the per-orientation count vectors.
enum ORIENTATION {MONO, SWAP, DRIGHT, DLEFT, OTHER, NOMONO};
//Keeps the counts for the different reordering types
//(Instantiated in 1-3 instances, one for each type of model (hier, phrase, wbe))
//
//Counts are kept in four vectors indexed by ORIENTATION: for the previous and
//the next phrase, each once per phrase pair (fe) and once per source phrase (f).
//Subclasses override getType() to merge or reject orientation names.
class ModelScore {
private:
  std::vector<double> count_fe_prev;
  std::vector<double> count_fe_next;
  std::vector<double> count_f_prev;
  std::vector<double> count_f_next;
protected:
  //Maps an orientation name to the slot it is counted in
  virtual ORIENTATION getType(const std::string& s);
public:
  ModelScore();
  //Virtual because subclasses are created by createModelScore() and
  //destroyed through ModelScore pointers
  virtual ~ModelScore() {}
  //Adds one example to the fe and f counts of both directions
  void add_example(const std::string& previous, std::string& next);
  //Zero the per-phrase-pair counts / the per-source-phrase counts
  void reset_fe();
  void reset_f();
  const std::vector<double>& get_scores_fe_prev() const;
  const std::vector<double>& get_scores_fe_next() const;
  const std::vector<double>& get_scores_f_prev() const;
  const std::vector<double>& get_scores_f_next() const;
  //Creates the subclass matching a model type name; the caller owns the result
  static ModelScore* createModelScore(const std::string& modeltype);
};
//Counts for the mslr model type: getType() accepts mono, swap, dright and
//dleft and rejects other and nomono
class ModelScoreMSLR : public ModelScore {
protected:
virtual ORIENTATION getType(const std::string& s);
};
//Counts for the leftright model type: getType() counts mono and dright as
//DRIGHT, swap and dleft as DLEFT, and rejects other and nomono
class ModelScoreLR : public ModelScore {
protected:
virtual ORIENTATION getType(const std::string& s);
};
//Counts for the msd model type: getType() keeps mono and swap, counts dleft,
//dright and other as OTHER, and rejects nomono
class ModelScoreMSD : public ModelScore {
protected:
virtual ORIENTATION getType(const std::string& s);
};
//Counts for the monotonicity model type: getType() keeps mono and counts
//every other known orientation as NOMONO
class ModelScoreMonotonicity : public ModelScore {
protected:
virtual ORIENTATION getType(const std::string& s);
};
//Class for calculating total counts, and to calculate smoothing
//Abstract interface; one subclass per orientation model type.
class Scorer {
public:
  //Virtual because concrete scorers are destroyed through Scorer pointers;
  //deleting a derived object through a base pointer is undefined behaviour
  //without it.
  virtual ~Scorer() {}
  //Projects the per-orientation counts onto the classes of the model type
  virtual std::vector<double> score(std::vector<double>) const = 0;
  //Smoothing values derived from counts, scaled by the given weight
  virtual std::vector<double> createSmoothing(std::vector<double>, double) const = 0;
  //Smoothing values that are the given weight for every class
  virtual std::vector<double> createConstSmoothing(double) const = 0;
};
//Scorer for the mslr model type: four classes (mono, swap, dright, dleft)
class ScorerMSLR : public Scorer {
public:
virtual std::vector<double> score(std::vector<double>) const;
virtual std::vector<double> createSmoothing(std::vector<double>, double) const;
virtual std::vector<double> createConstSmoothing(double) const;
};
//Scorer for the msd model type: three classes (mono, swap, and the sum of
//dright, dleft and other)
class ScorerMSD : public Scorer {
public:
virtual std::vector<double> score(std::vector<double>) const;
virtual std::vector<double> createSmoothing(std::vector<double>, double) const;
virtual std::vector<double> createConstSmoothing(double) const;
};
//Scorer for the monotonicity model type: two classes (mono, and the sum of
//all other orientations)
class ScorerMonotonicity : public Scorer {
public:
virtual std::vector<double> score(std::vector<double>) const;
virtual std::vector<double> createSmoothing(std::vector<double>, double) const;
virtual std::vector<double> createConstSmoothing(double) const;
};
//Scorer for the leftright model type: two classes (mono + dright, and
//swap + dleft)
class ScorerLR : public Scorer {
public:
virtual std::vector<double> score(std::vector<double>) const;
virtual std::vector<double> createSmoothing(std::vector<double>, double) const;
virtual std::vector<double> createConstSmoothing(double) const;
};
//Class for representing each model
//Contains a modelscore and scorer (which can be of different model types (mslr, msd...)),
//and file handling.
//This class also keeps track of bidirectionality, and which language to condition on
//NOTE(review): the class holds raw pointers and a FILE* and deletes/closes
//them in its destructor, but copying is not disabled - avoid copying Model objects.
class Model {
private:
//counts this model reads; deleted in the destructor
ModelScore* modelscore;
//projection and smoothing strategy; deleted in the destructor
Scorer* scorer;
//output file, opened for writing by the constructor
std::FILE* file;
//path of the output file
std::string filename;
//true: conditioned on both languages ("fe"); false: source only ("f")
bool fe;
//whether scores conditioned on the previous phrase are written
bool previous;
//whether scores conditioned on the next phrase are written
bool next;
//smoothing values added to the projected counts before normalisation
std::vector<double> smoothing_prev;
std::vector<double> smoothing_next;
//splits a '-'-separated configuration string into its parts
static void split_config(const std::string& config, std::string& dir,
std::string& lang, std::string& orient);
public:
Model(ModelScore* ms, Scorer* sc, const std::string& dir,
const std::string& lang, const std::string& fn);
~Model();
//creates a model from (counts, configuration string, output path prefix)
static Model* createModel(ModelScore*, const std::string&, const std::string&);
//smoothing derived from the current counts, scaled by w
void createSmoothing(double w);
//constant smoothing with value w
void createConstSmoothing(double w);
//write one line for a phrase pair; no-op unless the model is an fe model
void score_fe(const std::string& f, const std::string& e);
//write one line for a source phrase; no-op if the model is an fe model
void score_f(const std::string& f);
//gzip the output file and remove the uncompressed version
void zipFile();
};

View File

@ -0,0 +1,221 @@
/*
* score_reordering.cpp
*
* Created by: Sara Stymne - Linköping University
* Machine Translation Marathon 2010, Dublin
*/
#include <string>
#include <vector>
#include <map>
#include <iostream>
#include <fstream>
#include <sstream>
#include <cstdlib>
#include <cstring>
#include "reordering_classes.h"
using namespace std;
void split_line(const string& line, string& foreign, string& english, string& wbe, string& phrase, string& hier);
void get_orientations(const string& pair, string& previous, string& next);
int main(int argc, char* argv[])
{
cerr << "Lexical Reordering Scorer, written by Sara Stymne\n"
<< "scores lexical reordering models of several types (hierarchical, phrase-based and word-based-extraction\n";
if (argc < 3) {
cerr << "syntax: score_reordering extractFile smoothingValue filepath (--model \"type max-orientation (specification-strings)\" )+\n";
exit(1);
}
char* extractFileName = argv[1];
double smoothingValue = atof(argv[2]);
string filepath = argv[3];
ifstream eFile(extractFileName);
if (!eFile) {
cerr << "Could not open the extract file " << extractFileName <<"for scoring of lexical reordering models\n";
exit(1);
}
bool smoothWithCounts = false;
map<string,ModelScore*> modelScores;
vector<Model*> models;
bool hier = false;
bool phrase = false;
bool wbe = false;
string e,f,w,p,h;
string prev, next;
int i = 4;
while (i<argc) {
if (strcmp(argv[i],"--SmoothWithCounts") == 0) {
smoothWithCounts = true;
} else if (strcmp(argv[i],"--model") == 0) {
if (i+1 >= argc){
cerr << "score: syntax error, no model information provided to the option" << argv[i] << endl;
exit(1);
}
istringstream is(argv[++i]);
string m,t;
is >> m >> t;
modelScores[m] = ModelScore::createModelScore(t);
if (m.compare("hier") == 0) {
hier = true;
} else if (m.compare("phrase") == 0) {
phrase = true;
} if (m.compare("wbe") == 0) {
wbe = true;
}
if (!hier && !phrase && !wbe) {
cerr << "WARNING: No models specified for lexical reordering. No lexical reordering table will be trained.\n";
return 0;
}
string config;
//Store all models
while (is >> config) {
models.push_back(Model::createModel(modelScores[m],config,filepath));
}
} else {
cerr << "illegal option given to lexical reordering model score\n";
exit(1);
}
i++;
}
////////////////////////////////////
//calculate smoothing
if (smoothWithCounts) {
string line;
while (getline(eFile,line)) {
split_line(line,e,f,w,p,h);
if (hier) {
get_orientations(h, prev, next);
modelScores["hier"]->add_example(prev,next);
}
if (phrase) {
get_orientations(p, prev, next);
modelScores["phrase"]->add_example(prev,next);
}
if (wbe) {
get_orientations(w, prev, next);
modelScores["wbe"]->add_example(prev,next);
}
}
// calculate smoothing for each model
for (int i=0; i<models.size();++i) {
models[i]->createSmoothing(smoothingValue);
}
//reopen eFile
eFile.close();
eFile.open(extractFileName);
}
else {
//constant smoothing
for (int i=0; i<models.size();++i) {
models[i]->createConstSmoothing(smoothingValue);
}
}
////////////////////////////////////
//calculate scores for reordering table
string line,f_current,e_current;
bool first = true;
while (getline(eFile, line)) {
split_line(line,f,e,w,p,h);
if (first) {
f_current = f;
e_current = e;
first = false;
} else if (f.compare(f_current) != 0 || e.compare(e_current) != 0) {
//fe - score
for (int i=0; i<models.size();++i) {
models[i]->score_fe(f,e);
}
//reset
for(map<string,ModelScore*>::const_iterator it = modelScores.begin(); it != modelScores.end(); ++it) {
it->second->reset_fe();
}
if (f.compare(f_current) != 0) {
//f - score
for (int i=0; i<models.size();++i) {
models[i]->score_f(f);
}
//reset
for(map<string,ModelScore*>::const_iterator it = modelScores.begin(); it != modelScores.end(); ++it) {
it->second->reset_f();
}
}
f_current = f;
e_current = e;
}
// uppdate counts
if (hier) {
get_orientations(h, prev, next);
modelScores["hier"]->add_example(prev,next);
}
if (phrase) {
get_orientations(p, prev, next);
modelScores["phrase"]->add_example(prev,next);
}
if (wbe) {
get_orientations(w, prev, next);
modelScores["wbe"]->add_example(prev,next);
}
}
//Score the last phrases
for (int i=0; i<models.size();++i) {
models[i]->score_fe(f,e);
}
for (int i=0; i<models.size();++i) {
models[i]->score_f(f);
}
//Zip all files
for (int i=0; i<models.size();++i) {
models[i]->zipFile();
}
return 0;
}
// Copies the text from pos up to the next occurrence of delim into field and
// moves pos just past that delimiter. If the delimiter is missing, field gets
// the rest of the line and pos becomes npos; if pos is already npos, field is
// left empty.
static void take_field(const string& line, string::size_type& pos,
                       const string& delim, string& field)
{
  if (pos == string::npos) {
    field.clear();
    return;
  }
  const string::size_type end = line.find(delim, pos);
  if (end == string::npos) {
    field = line.substr(pos);
    pos = string::npos;
  } else {
    field = line.substr(pos, end - pos);
    pos = end + delim.size();
  }
}

// Splits one extract line of the form
//   foreign ||| english ||| wbe | phrase | hier
// into its five fields. Fields that are missing from a truncated line are
// returned empty instead of triggering out-of-range accesses.
void split_line(const string& line, string& foreign, string& english, string& wbe, string& phrase, string& hier) {
  // string::size_type (not int) so that npos is represented exactly
  string::size_type pos = 0;
  take_field(line, pos, " ||| ", foreign);
  take_field(line, pos, " ||| ", english);
  take_field(line, pos, " | ", wbe);
  take_field(line, pos, " | ", phrase);
  // the last field is whatever remains of the line
  if (pos == string::npos) {
    hier.clear();
  } else {
    hier = line.substr(pos);
  }
}
// Reads the first two whitespace-separated tokens of pair into previous and
// next, in that order.
void get_orientations(const string& pair, string& previous, string& next) {
  istringstream tokens(pair);
  tokens >> previous;
  tokens >> next;
}

View File

@ -21,16 +21,16 @@
using namespace std;
#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM) { \
_IS.getline(_LINE, _SIZE, _DELIM); \
if(_IS.fail() && !_IS.bad() && !_IS.eof()) _IS.clear(); \
if (_IS.gcount() == _SIZE-1) { \
cerr << "Line too long! Buffer overflow. Delete lines >=" \
<< _SIZE << " chars or raise LINE_MAX_LENGTH in phrase-extract/extract.cpp" \
<< endl; \
exit(1); \
} \
}
#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM) { \
_IS.getline(_LINE, _SIZE, _DELIM); \
if(_IS.fail() && !_IS.bad() && !_IS.eof()) _IS.clear(); \
if (_IS.gcount() == _SIZE-1) { \
cerr << "Line too long! Buffer overflow. Delete lines >=" \
<< _SIZE << " chars or raise LINE_MAX_LENGTH in phrase-extract/extract.cpp" \
<< endl; \
exit(1); \
} \
}
#define LINE_MAX_LENGTH 60000
// HPhraseVertex represents a point in the alignment matrix
@ -51,32 +51,32 @@ enum REO_MODEL_TYPE {REO_MSD, REO_MSLR, REO_MONO};
enum REO_POS {LEFT, RIGHT, DLEFT, DRIGHT, UNKNOWN};
class SentenceAlignment {
public:
vector<string> english;
vector<string> foreign;
vector<int> alignedCountF;
vector< vector<int> > alignedToE;
public:
vector<string> english;
vector<string> foreign;
vector<int> alignedCountF;
vector< vector<int> > alignedToE;
int create( char[], char[], char[], int );
// void clear() { delete(alignment); };
int create( char[], char[], char[], int );
// void clear() { delete(alignment); };
};
REO_POS getOrientWordModel(SentenceAlignment &, REO_MODEL_TYPE,
int, int, int, int, int, int, int,
bool (*)(int, int), bool (*)(int, int));
int, int, int, int, int, int, int,
bool (*)(int, int), bool (*)(int, int));
REO_POS getOrientPhraseModel(REO_MODEL_TYPE,
int, int, int, int, int, int, int,
bool (*)(int, int), bool (*)(int, int),
const HSenteceVertices &, const HSenteceVertices &);
int, int, int, int, int, int, int,
bool (*)(int, int), bool (*)(int, int),
const HSenteceVertices &, const HSenteceVertices &);
REO_POS getOrientHierModel(REO_MODEL_TYPE,
int, int, int, int, int, int, int,
bool (*)(int, int), bool (*)(int, int),
const HSenteceVertices &, const HSenteceVertices &,
REO_POS);
int, int, int, int, int, int, int,
bool (*)(int, int), bool (*)(int, int),
const HSenteceVertices &, const HSenteceVertices &,
REO_POS);
void insertVertex(HSenteceVertices &, int, int);
void insertPhraseVertices(HSenteceVertices &, HSenteceVertices &, HSenteceVertices &, HSenteceVertices &,
int, int, int, int);
int, int, int, int);
string getOrientString(REO_POS, REO_MODEL_TYPE);
bool ge(int, int);
@ -113,439 +113,440 @@ bool properConditioning = false;
int main(int argc, char* argv[])
{
cerr << "PhraseExtract v1.4, written by Philipp Koehn\n"
<< "phrase extraction from an aligned parallel corpus\n";
time_t starttime = time(NULL);
cerr << "PhraseExtract v1.4, written by Philipp Koehn\n"
<< "phrase extraction from an aligned parallel corpus\n";
time_t starttime = time(NULL);
if (argc < 6) {
cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] | --OnlyOutputSpanInfo | --NoFileLimit | --ProperConditioning]\n";
exit(1);
if (argc < 6) {
cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] | --OnlyOutputSpanInfo | --NoFileLimit | --ProperConditioning]\n";
exit(1);
}
char* &fileNameE = argv[1];
char* &fileNameF = argv[2];
char* &fileNameA = argv[3];
fileNameExtract = argv[4];
maxPhraseLength = atoi(argv[5]);
for(int i=6;i<argc;i++) {
if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) {
onlyOutputSpanInfo = true;
}
else if (strcmp(argv[i],"--NoFileLimit") == 0) {
noFileLimit = true;
}
else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) {
orientationFlag = true;
}
else if(strcmp(argv[i],"--model") == 0){
if (i+1 >= argc){
cerr << "extract: syntax error, no model's information provided to the option --model " << endl;
exit(1);
}
char* modelParams = argv[++i];
char* modelName = strtok(modelParams, "-");
char* modelType = strtok(NULL, "-");
REO_MODEL_TYPE intModelType;
if(strcmp(modelName, "wbe") == 0){
wordModel = true;
if(strcmp(modelType, "msd") == 0)
wordType = REO_MSD;
else if(strcmp(modelType, "mslr") == 0)
wordType = REO_MSLR;
else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
wordType = REO_MONO;
else{
cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
exit(1);
}
char* &fileNameE = argv[1];
char* &fileNameF = argv[2];
char* &fileNameA = argv[3];
fileNameExtract = argv[4];
maxPhraseLength = atoi(argv[5]);
for(int i=6;i<argc;i++) {
if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) {
onlyOutputSpanInfo = true;
}
else if (strcmp(argv[i],"--NoFileLimit") == 0) {
noFileLimit = true;
}
else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) {
orientationFlag = true;
}
else if(strcmp(argv[i],"--model") == 0){
if (i+1 >= argc){
cerr << "extract: syntax error, no model's information provided to the option --model " << endl;
exit(1);
}
char* modelParams = argv[++i];
char* modelName = strtok(modelParams, "-");
char* modelType = strtok(NULL, "-");
REO_MODEL_TYPE intModelType;
if(strcmp(modelName, "wbe") == 0){
wordModel = true;
if(strcmp(modelType, "msd") == 0)
wordType = REO_MSD;
else if(strcmp(modelType, "mslr") == 0)
wordType = REO_MSLR;
else if(strcmp(modelType, "mono") == 0)
wordType = REO_MONO;
else{
cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
exit(1);
}
}
else if(strcmp(modelName, "phrase") == 0){
phraseModel = true;
if(strcmp(modelType, "msd") == 0)
phraseType = REO_MSD;
else if(strcmp(modelType, "mslr") == 0)
phraseType = REO_MSLR;
else if(strcmp(modelType, "mono") == 0)
phraseType = REO_MONO;
else{
cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
exit(1);
}
}
else if(strcmp(modelName, "hier") == 0){
hierModel = true;
if(strcmp(modelType, "msd") == 0)
hierType = REO_MSD;
else if(strcmp(modelType, "mslr") == 0)
hierType = REO_MSLR;
else if(strcmp(modelType, "mono") == 0)
hierType = REO_MONO;
else{
cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
exit(1);
}
}
else{
cerr << "extract: syntax error, unknown reordering model: " << modelName << endl;
exit(1);
}
allModelsOutputFlag = true;
}
else if (strcmp(argv[i],"--ZipFiles") == 0) {
zipFiles = true;
}
else if (strcmp(argv[i],"--ProperConditioning") == 0) {
properConditioning = true;
}
else {
cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
exit(1);
}
}
else if(strcmp(modelName, "phrase") == 0){
phraseModel = true;
if(strcmp(modelType, "msd") == 0)
phraseType = REO_MSD;
else if(strcmp(modelType, "mslr") == 0)
phraseType = REO_MSLR;
else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
phraseType = REO_MONO;
else{
cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
exit(1);
}
// default reordreing model if no model selected
// allows for the old syntax to be used
if(orientationFlag && !allModelsOutputFlag){
wordModel = true;
wordType = REO_MSD;
}
else if(strcmp(modelName, "hier") == 0){
hierModel = true;
if(strcmp(modelType, "msd") == 0)
hierType = REO_MSD;
else if(strcmp(modelType, "mslr") == 0)
hierType = REO_MSLR;
else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
hierType = REO_MONO;
else{
cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
exit(1);
}
}
else{
cerr << "extract: syntax error, unknown reordering model: " << modelName << endl;
exit(1);
}
ifstream eFile;
ifstream fFile;
ifstream aFile;
eFile.open(fileNameE);
fFile.open(fileNameF);
aFile.open(fileNameA);
istream *eFileP = &eFile;
istream *fFileP = &fFile;istream *aFileP = &aFile;
allModelsOutputFlag = true;
}
else if (strcmp(argv[i],"--ZipFiles") == 0) {
zipFiles = true;
}
else if (strcmp(argv[i],"--ProperConditioning") == 0) {
properConditioning = true;
}
else {
cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
exit(1);
}
}
int i=0;
while(true) {
i++;
if (i%10000 == 0) cerr << "." << flush;
char englishString[LINE_MAX_LENGTH];
char foreignString[LINE_MAX_LENGTH];
char alignmentString[LINE_MAX_LENGTH];
SAFE_GETLINE((*eFileP), englishString, LINE_MAX_LENGTH, '\n');
if (eFileP->eof()) break;
SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n');
SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n');
SentenceAlignment sentence;
// cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl;
//az: output src, tgt, and alingment line
if (onlyOutputSpanInfo) {
cout << "LOG: SRC: " << foreignString << endl;
cout << "LOG: TGT: " << englishString << endl;
cout << "LOG: ALT: " << alignmentString << endl;
cout << "LOG: PHRASES_BEGIN:" << endl;
}
// default reordering model if no model selected
// allows for the old syntax to be used
if(orientationFlag && !allModelsOutputFlag){
wordModel = true;
wordType = REO_MSD;
}
if (sentence.create( englishString, foreignString, alignmentString, i )) {
extract(sentence);
}
if (onlyOutputSpanInfo) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
}
eFile.close();
fFile.close();
aFile.close();
//az: only close if we actually opened it
if (!onlyOutputSpanInfo) {
extractFile.close();
extractFileInv.close();
if (orientationFlag) extractFileOrientation.close();
}
ifstream eFile;
ifstream fFile;
ifstream aFile;
eFile.open(fileNameE);
fFile.open(fileNameF);
aFile.open(fileNameA);
istream *eFileP = &eFile;
istream *fFileP = &fFile;
istream *aFileP = &aFile;
int i=0;
while(true) {
i++;
if (i%10000 == 0) cerr << "." << flush;
char englishString[LINE_MAX_LENGTH];
char foreignString[LINE_MAX_LENGTH];
char alignmentString[LINE_MAX_LENGTH];
SAFE_GETLINE((*eFileP), englishString, LINE_MAX_LENGTH, '\n');
if (eFileP->eof()) break;
SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n');
SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n');
SentenceAlignment sentence;
// cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl;
//az: output src, tgt, and alingment line
if (onlyOutputSpanInfo) {
cout << "LOG: SRC: " << foreignString << endl;
cout << "LOG: TGT: " << englishString << endl;
cout << "LOG: ALT: " << alignmentString << endl;
cout << "LOG: PHRASES_BEGIN:" << endl;
}
if (sentence.create( englishString, foreignString, alignmentString, i )) {
extract(sentence);
}
if (onlyOutputSpanInfo) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
}
eFile.close();
fFile.close();
aFile.close();
//az: only close if we actually opened it
if (!onlyOutputSpanInfo) {
extractFile.close();
extractFileInv.close();
if (orientationFlag) extractFileOrientation.close();
}
}
// Extracts all phrase pairs of `sentence` that are consistent with its word
// alignment and writes each of them out through addPhrase(), together with
// the orientation information of the enabled reordering models.
//
// Two modes, selected by the global flags phraseModel / hierModel:
//  * neither set: every phrase pair is written immediately, with only the
//    word-based orientation (if wordModel is set);
//  * either set ("buildExtraStructure"): phrase pairs are first collected,
//    because the phrase-based and hierarchical orientations need the corner
//    points of *all* extracted phrases; they are written in a second pass.
//
// When hierModel is set the length limit is relaxed during enumeration, so
// over-long phrases are found as well; their corners go into the out* sets
// and they are never written out themselves.
void extract(SentenceAlignment &sentence) {
  int countE = sentence.english.size();
  int countF = sentence.foreign.size();

  // phrase pairs within maxPhraseLength, written out in the second pass
  HPhraseVector inboundPhrases;

  // corner points of the phrase pairs that respect maxPhraseLength
  HSenteceVertices inTopLeft;
  HSenteceVertices inTopRight;
  HSenteceVertices inBottomLeft;
  HSenteceVertices inBottomRight;

  // corner points of the phrase pairs that exceed maxPhraseLength
  HSenteceVertices outTopLeft;
  HSenteceVertices outTopRight;
  HSenteceVertices outBottomLeft;
  HSenteceVertices outBottomRight;

  bool relaxLimit = hierModel;
  bool buildExtraStructure = phraseModel || hierModel;

  // check alignments for english phrase startE...endE
  // loop over extracted phrases which are compatible with the word-alignments
  for(int startE=0;startE<countE;startE++) {
    for(int endE=startE;
        (endE<countE && (relaxLimit || endE<startE+maxPhraseLength));
        endE++) {

      // smallest foreign span covering all words aligned to startE...endE;
      // usedF[fi] ends up as the number of alignment points of foreign word
      // fi that lie outside the english span
      int minF = 9999;
      int maxF = -1;
      vector< int > usedF = sentence.alignedCountF;
      for(int ei=startE;ei<=endE;ei++) {
        for(size_t i=0;i<sentence.alignedToE[ei].size();i++) {
          int fi = sentence.alignedToE[ei][i];
          if (fi<minF) { minF = fi; }
          if (fi>maxF) { maxF = fi; }
          usedF[ fi ]--;
        }
      }

      if (maxF >= 0 && // aligned to any foreign words at all
          (relaxLimit || maxF-minF < maxPhraseLength)) { // foreign phrase within limits

        // check if foreign words are aligned to out of bound english words
        bool out_of_bounds = false;
        for(int fi=minF;fi<=maxF && !out_of_bounds;fi++)
          if (usedF[fi]>0) {
            out_of_bounds = true;
          }

        if (!out_of_bounds){
          // start point of foreign phrase may retreat over unaligned
          for(int startF=minF;
              (startF>=0 &&
               (relaxLimit || startF>maxF-maxPhraseLength) && // within length limit
               (startF==minF || sentence.alignedCountF[startF]==0)); // unaligned
              startF--) {
            // end point of foreign phrase may advance over unaligned
            for(int endF=maxF;
                (endF<countF &&
                 (relaxLimit || endF<startF+maxPhraseLength) && // within length limit
                 (endF==maxF || sentence.alignedCountF[endF]==0)); // unaligned
                endF++){ // at this point we have extracted a phrase
              if(buildExtraStructure){ // phrase || hier
                if(endE-startE < maxPhraseLength && endF-startF < maxPhraseLength){ // within limit
                  inboundPhrases.push_back(
                    HPhrase(
                      HPhraseVertex(startF,startE),
                      HPhraseVertex(endF,endE)
                    )
                  );
                  insertPhraseVertices(inTopLeft, inTopRight, inBottomLeft, inBottomRight,
                                       startF, startE, endF, endE);
                }
                else
                  insertPhraseVertices(outTopLeft, outTopRight, outBottomLeft, outBottomRight,
                                       startF, startE, endF, endE);
              }
              else{
                string orientationInfo = "";
                if(wordModel){
                  REO_POS wordPrevOrient, wordNextOrient;
                  wordPrevOrient = getOrientWordModel(sentence, wordType, startF, endF, startE, endE, countF, 0, 1, &ge, &lt);
                  wordNextOrient = getOrientWordModel(sentence, wordType, endF, startF, endE, startE, 0, countF, -1, &lt, &ge);
                  if(allModelsOutputFlag)
                    orientationInfo += getOrientString(wordPrevOrient, wordType) + " " + getOrientString(wordNextOrient, wordType) + " | | ";
                  else
                    orientationInfo += getOrientString(wordPrevOrient, wordType) + " " + getOrientString(wordNextOrient, wordType);
                }
                addPhrase(sentence, startE, endE, startF, endF, orientationInfo);
              }
            }
          }
        }
      }
    }
  }

  // second pass: all corner points are known now, so the phrase-based and
  // hierarchical orientations of the collected phrases can be determined
  if(buildExtraStructure){ // phrase || hier
    string orientationInfo = "";
    REO_POS wordPrevOrient, wordNextOrient, phrasePrevOrient, phraseNextOrient, hierPrevOrient, hierNextOrient;

    for(size_t i = 0; i < inboundPhrases.size(); i++){
      int startF = inboundPhrases[i].first.first;
      int startE = inboundPhrases[i].first.second;
      int endF = inboundPhrases[i].second.first;
      int endE = inboundPhrases[i].second.second;

      if(wordModel){
        wordPrevOrient = getOrientWordModel(sentence, wordType,
                                            startF, endF, startE, endE, countF, 0, 1,
                                            &ge, &lt);
        wordNextOrient = getOrientWordModel(sentence, wordType,
                                            endF, startF, endE, startE, 0, countF, -1,
                                            &lt, &ge);
      }

      // always computed: the hierarchical model takes it as an input
      phrasePrevOrient = getOrientPhraseModel(phraseType, startF, endF, startE, endE, countF-1, 0, 1, &ge, &lt, inBottomRight, inBottomLeft);
      phraseNextOrient = getOrientPhraseModel(phraseType, endF, startF, endE, startE, 0, countF-1, -1, &lt, &ge, inBottomLeft, inBottomRight);

      if(hierModel){
        // classify with hierType: that is the type the result is rendered
        // with below, so both must agree
        hierPrevOrient = getOrientHierModel(hierType, startF, endF, startE, endE, countF-1, 0, 1, &ge, &lt, outBottomRight, outBottomLeft, phrasePrevOrient);
        hierNextOrient = getOrientHierModel(hierType, endF, startF, endE, startE, 0, countF-1, -1, &lt, &ge, outBottomLeft, outBottomRight, phraseNextOrient);
      }

      // "<word prev next> | <phrase prev next> | <hier prev next>";
      // the field of a model that is switched off is a single blank
      orientationInfo = ((wordModel)? getOrientString(wordPrevOrient, wordType) + " " + getOrientString(wordNextOrient, wordType) : " ") + " | " +
                        ((phraseModel)? getOrientString(phrasePrevOrient, phraseType) + " " + getOrientString(phraseNextOrient, phraseType) : " ") + " | " +
                        ((hierModel)? getOrientString(hierPrevOrient, hierType) + " " + getOrientString(hierNextOrient, hierType) : " ");

      addPhrase(sentence, startE, endE, startF, endF, orientationInfo);
    }
  }
}
// Word-based orientation of the phrase pair (startF..endF, startE..endE)
// with respect to the english position startE-unit.
//
// The function is direction-agnostic: in this file the "previous"
// orientation is requested with unit = 1, zero = 0 and the comparators
// (ge, lt) = (&ge, &lt); the "next" orientation is requested with start/end
// swapped, unit = -1, zero and countF swapped and the comparators swapped.
//
// Returns
//   LEFT    if only the diagonal neighbour (startF-unit, startE-unit) is aligned,
//   RIGHT   if only the anti-diagonal neighbour (endF+unit, startE-unit) is aligned,
//   DRIGHT / DLEFT (model types beyond REO_MSD only) if an alignment point of
//           english word startE-unit is found only further out on the
//           startF side / only further out on the endF side,
//   UNKNOWN otherwise, or as soon as modelType does not distinguish further.
REO_POS getOrientWordModel(SentenceAlignment & sentence, REO_MODEL_TYPE modelType,
                           int startF, int endF, int startE, int endE, int countF, int zero, int unit,
                           bool (*ge)(int, int), bool (*lt)(int, int) ){

  bool connectedLeftTop  = isAligned( sentence, startF-unit, startE-unit );
  bool connectedRightTop = isAligned( sentence, endF+unit,   startE-unit );
  if( connectedLeftTop && !connectedRightTop)
    return LEFT;
  if(modelType == REO_MONO)
    return UNKNOWN;
  if (!connectedLeftTop && connectedRightTop)
    return RIGHT;
  if(modelType == REO_MSD)
    return UNKNOWN;

  // look further away from the phrase, on either side, along row startE-unit
  for(int indexF=startF-2*unit; (*ge)(indexF, zero) && !connectedLeftTop; indexF=indexF-unit)
    connectedLeftTop = isAligned(sentence, indexF, startE-unit);
  for(int indexF=endF+2*unit; (*lt)(indexF,countF) && !connectedRightTop; indexF=indexF+unit)
    connectedRightTop = isAligned(sentence, indexF, startE-unit);
  if(connectedLeftTop && !connectedRightTop)
    return DRIGHT;
  else if(!connectedLeftTop && connectedRightTop)
    return DLEFT;
  return UNKNOWN;
}
// Phrase-based orientation of the phrase pair (startF..endF, startE..endE),
// decided from the corner points of the other extracted phrase pairs.
//
// to be called with countF-1 instead of countF
//
// inBottomRight / inBottomLeft map an english position to the set of foreign
// positions at which some extracted phrase has that corner. Like the
// word-based variant the function is direction-agnostic: the caller mirrors
// start/end, zero/countF, unit and the two comparators (and swaps the two
// corner maps) to obtain the orientation towards the following phrase.
//
// Returns
//   LEFT    if the phrase starts at (zero, zero) or a phrase corner lies at
//           (startF-unit, startE-unit),
//   RIGHT   if a phrase corner lies at (endF+unit, startE-unit),
//   DRIGHT / DLEFT (model types beyond REO_MSD only) if such a corner is
//           found further out on the startF side / on the endF side,
//   UNKNOWN when modelType does not distinguish further,
//   DRIGHT  as the fallback when nothing is found.
REO_POS getOrientPhraseModel (REO_MODEL_TYPE modelType,
                              int startF, int endF, int startE, int endE, int countF, int zero, int unit,
                              bool (*ge)(int, int), bool (*le)(int, int),
                              const HSenteceVertices & inBottomRight, const HSenteceVertices & inBottomLeft){

  HSenteceVertices::const_iterator it;

  if((startE == zero && startF == zero) ||
     ((it = inBottomRight.find(startE - unit)) != inBottomRight.end() &&
      it->second.find(startF-unit) != it->second.end()))
    return LEFT;
  if(modelType == REO_MONO)
    return UNKNOWN;
  if((it = inBottomLeft.find(startE - unit)) != inBottomLeft.end() &&
     it->second.find(endF + unit) != it->second.end())
    return RIGHT;
  if(modelType == REO_MSD)
    return UNKNOWN;

  // The row for startE-unit does not change while scanning, so look it up
  // once per side instead of once per foreign position.
  it = inBottomRight.find(startE - unit);
  if(it != inBottomRight.end())
    for(int indexF=startF-2*unit; (*ge)(indexF, zero); indexF=indexF-unit)
      if(it->second.find(indexF) != it->second.end())
        return DRIGHT;

  // An iterator must only be compared with end() of the container it came
  // from, hence inBottomLeft.end() here.
  it = inBottomLeft.find(startE - unit);
  if(it != inBottomLeft.end())
    for(int indexF=endF+2*unit; (*le)(indexF, countF); indexF=indexF+unit)
      if(it->second.find(indexF) != it->second.end())
        return DLEFT;

  return DRIGHT;
}
// Hierarchical orientation of the phrase pair (startF..endF, startE..endE).
//
// to be called with countF-1 instead of countF
//
// phraseOrient is the orientation the phrase-based model assigned to the
// same phrase pair; it is taken over whenever it is already decisive.
// Otherwise the corner points of the phrases that exceed the length limit
// (outBottomRight / outBottomLeft, english position -> set of foreign
// positions) are consulted in the same way the phrase-based model consults
// the corners of the regular phrases.
//
// Returns LEFT, RIGHT, DRIGHT or DLEFT as found, and UNKNOWN when modelType
// does not distinguish further or no matching corner exists.
REO_POS getOrientHierModel (REO_MODEL_TYPE modelType,
                            int startF, int endF, int startE, int endE, int countF, int zero, int unit,
                            bool (*ge)(int, int), bool (*le)(int, int),
                            const HSenteceVertices & outBottomRight, const HSenteceVertices & outBottomLeft,
                            REO_POS phraseOrient){

  HSenteceVertices::const_iterator it;

  if(phraseOrient == LEFT ||
     ((it = outBottomRight.find(startE - unit)) != outBottomRight.end() &&
      it->second.find(startF-unit) != it->second.end()))
    return LEFT;
  if(modelType == REO_MONO)
    return UNKNOWN;
  if(phraseOrient == RIGHT ||
     ((it = outBottomLeft.find(startE - unit)) != outBottomLeft.end() &&
      it->second.find(endF + unit) != it->second.end()))
    return RIGHT;
  if(modelType == REO_MSD)
    return UNKNOWN;
  if(phraseOrient == DRIGHT)
    return DRIGHT;
  if(phraseOrient == DLEFT)
    return DLEFT;

  // The row for startE-unit does not change while scanning, so look it up
  // once per side instead of once per foreign position.
  it = outBottomRight.find(startE - unit);
  if(it != outBottomRight.end())
    for(int indexF=startF-2*unit; (*ge)(indexF, zero); indexF=indexF-unit)
      if(it->second.find(indexF) != it->second.end())
        return DRIGHT;

  // An iterator must only be compared with end() of the container it came
  // from, hence outBottomLeft.end() here.
  it = outBottomLeft.find(startE - unit);
  if(it != outBottomLeft.end())
    for(int indexF=endF+2*unit; (*le)(indexF, countF); indexF=indexF+unit)
      if(it->second.find(indexF) != it->second.end())
        return DLEFT;

  return UNKNOWN;
}
// Tells whether foreign position fi and english position ei are linked by an
// alignment point of `sentence`.
//
// Two virtual points just outside the sentence count as aligned: (-1, -1)
// and (foreign.size(), english.size()). Any other position outside the
// sentence is not aligned.
bool isAligned ( SentenceAlignment &sentence, int fi, int ei ){
  if (ei == -1 && fi == -1)
    return true;
  if (ei <= -1 || fi <= -1)
    return false;

  // both indices are non-negative from here on
  const int countE = sentence.english.size();
  const int countF = sentence.foreign.size();
  if (ei == countE && fi == countF)
    return true;
  if (ei >= countE || fi >= countF)
    return false;

  for(size_t i=0;i<sentence.alignedToE[ei].size();i++)
    if (sentence.alignedToE[ei][i] == fi)
      return true;
  return false;
}
// "greater or equal" as a plain function, so it can be passed as a
// bool (*)(int, int) comparator.
bool ge(int first, int second){
  return first >= second;
}
// "less or equal" as a plain function, so it can be passed as a
// bool (*)(int, int) comparator.
bool le(int first, int second){
  return first <= second;
}
// "strictly less" as a plain function, so it can be passed as a
// bool (*)(int, int) comparator.
bool lt(int first, int second){
  return first < second;
}
// Records the corner point (x, y) in `corners`: x is added to the set stored
// under key y, and that set is created first if y has no entry yet.
void insertVertex( HSenteceVertices & corners, int x, int y ){
  // operator[] default-constructs the empty set for a new key, which gives
  // the same result as inserting a one-element set and falling back to
  // set::insert when the key already exists
  corners[y].insert(x);
}
// Registers the four corner points of the phrase pair
// (startF..endF, startE..endE), one in each of the given corner maps:
// "top" corners use startE, "bottom" corners use endE, "left" corners use
// startF and "right" corners use endF.
void insertPhraseVertices(
  HSenteceVertices & topLeft,
  HSenteceVertices & topRight,
  HSenteceVertices & bottomLeft,
  HSenteceVertices & bottomRight,
  int startF, int startE, int endF, int endE) {

  insertVertex(topLeft, startF, startE);
  insertVertex(topRight, endF, startE);
  insertVertex(bottomLeft, startF, endE);
  insertVertex(bottomRight, endF, endE);
}
// Maps an orientation to the label that is written to the extract file.
//
// LEFT, RIGHT, DRIGHT and DLEFT have fixed labels. UNKNOWN is rendered as
// the catch-all class of the given model type: "nomono" for REO_MONO,
// "other" for REO_MSD and "dright" for REO_MSLR.
// An empty string is returned for any value not covered above, so that
// control never flows off the end of the function.
string getOrientString(REO_POS orient, REO_MODEL_TYPE modelType){
  switch(orient){
  case LEFT:   return "mono";
  case RIGHT:  return "swap";
  case DRIGHT: return "dright";
  case DLEFT:  return "dleft";
  case UNKNOWN:
    switch(modelType){
    case REO_MONO: return "nomono";
    case REO_MSD:  return "other";
    case REO_MSLR: return "dright";
    }
    break;
  }
  return "";
}
void addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, int endF , string &orientationInfo) {
@ -620,7 +621,7 @@ void addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, i
}
if (orientationFlag)
extractFileOrientation << orientationInfo;
extractFileOrientation << orientationInfo;
extractFile << "\n";
extractFileInv << "\n";
@ -688,7 +689,7 @@ int SentenceAlignment::create( char englishString[], char foreignString[], char
cerr << "E: " << englishString << endl << "F: " << foreignString << endl;
return 0;
}
// cout << "alignmentSequence[i] " << alignmentSequence[i] << " is " << f << ", " << e << endl;
// cout << "alignmentSequence[i] " << alignmentSequence[i] << " is " << f << ", " << e << endl;
if (e >= english.size() || f >= foreign.size()) {
cerr << "WARNING: sentence " << sentenceID << " has alignment point (" << f << ", " << e << ") out of bounds (" << foreign.size() << ", " << english.size() << ")\n";
cerr << "E: " << englishString << endl << "F: " << foreignString << endl;

File diff suppressed because it is too large Load Diff

View File

@ -135,6 +135,7 @@ my $MKCLS = "$BINDIR/mkcls";
# supporting scripts/binaries from this package
my $PHRASE_EXTRACT = "$SCRIPTS_ROOTDIR/training/phrase-extract/extract";
my $LEXICAL_REO_SCORER = "$SCRIPTS_ROOTDIR/training/lexical-reordering/score";
my $MEMSCORE = "$SCRIPTS_ROOTDIR/training/memscore/memscore";
my $SYMAL = "$SCRIPTS_ROOTDIR/training/symal/symal";
my $GIZA2BAL = "$SCRIPTS_ROOTDIR/training/symal/giza2bal.pl";
@ -296,7 +297,7 @@ foreach my $r (split(/\,/,$___REORDERING)) {
#set default values
push @REORDERING_MODELS, {};
$REORDERING_MODELS[$model_num]{"dir"} = "backward";
$REORDERING_MODELS[$model_num]{"type"} = "word";
$REORDERING_MODELS[$model_num]{"type"} = "wbe";
$REORDERING_MODELS[$model_num]{"collapse"} = "allff";
#handle the options set in the config string
@ -305,27 +306,19 @@ foreach my $r (split(/\,/,$___REORDERING)) {
$REORDERING_LEXICAL = 0;
next;
}
if ($reoconf =~ /(msd)|(mslr)|(monotonicity)|(leftright)/) {
if ($reoconf =~ /^((msd)|(mslr)|(monotonicity)|(leftright))/) {
$REORDERING_MODELS[$model_num]{"orient"} = $reoconf;
}
elsif ($reoconf =~ /((bidirectional)|(backward)|(forward))/) {
elsif ($reoconf =~ /^((bidirectional)|(backward)|(forward))/) {
$REORDERING_MODELS[$model_num]{"dir"} = $reoconf;
}
elsif ($reoconf =~ /^(fe)|(f))/) {
elsif ($reoconf =~ /^((fe)|(f))/) {
$REORDERING_MODELS[$model_num]{"lang"} = $reoconf;
}
elsif ($reoconf =~ /(hier)|(phrase)|(word)/) {
if ($model_num == 0) {
$reotype = $reoconf;
}
elsif ($reotype ne $reoconf) {
#TODO: update extract to make it possible to have more types of model (return the options for all models used!!)
print STDERR "you are not allowed to use more than one reordering model type, now using: $reotype and $reoconf";
exit(1);
}
elsif ($reoconf =~ /^((hier)|(phrase)|(wbe))/) {
$REORDERING_MODELS[$model_num]{"type"} = $reoconf;
}
elsif ($reoconf =~ /(collapseff)|(allff)/) {
elsif ($reoconf =~ /^((collapseff)|(allff))/) {
$REORDERING_MODELS[$model_num]{"collapse"} = $reoconf;
}
else {
@ -333,8 +326,18 @@ foreach my $r (split(/\,/,$___REORDERING)) {
exit(1);
}
}
#check that the required attributes are given
if (!defined($REORDERING_MODELS[$model_num]{"type"})) {
print STDERR "you have to give the type of the reordering models (mslr, msd, monotonicity or leftright); it is not done in $r\n";
exit(1);
}
if (!defined($REORDERING_MODELS[$model_num]{"lang"})) {
print STDERR "you have specify which languages to condition on (f or fe); it is not done in $r\n";
exit(1);
}
#fix the all-string
$REORDERING_MODELS[$model_num]{"all"} = $REORDERING_MODELS[$model_num]{"orient"}.'-'.$REORDERING_MODELS[$model_num]{"dir"}."-".$REORDERING_MODELS[$model_num]{"lang"}."-".$REORDERING_MODELS[$model_num]{"type"}."-".$REORDERING_MODELS[$model_num]{"collapse"};
$REORDERING_MODELS[$model_num]{"all"} = $REORDERING_MODELS[$model_num]{"type"}."-".$REORDERING_MODELS[$model_num]{"orient"}.'-'.$REORDERING_MODELS[$model_num]{"dir"}."-".$REORDERING_MODELS[$model_num]{"lang"}."-".$REORDERING_MODELS[$model_num]{"collapse"};
# fix numfeatures
$REORDERING_MODELS[$model_num]{"numfeatures"} = 1;
@ -365,7 +368,7 @@ foreach my $r (split(/\,/,$___REORDERING)) {
# pick the overall most specific model for each reordering model type
for my $mtype ( keys %REORDERING_MODEL_TYPES) {
if ($REORDERING_MODEL_TYPES{$mtype} =~ /lr/) {
if ($REORDERING_MODEL_TYPES{$mtype} =~ /(mslr)|(leftright)/) {
$REORDERING_MODEL_TYPES{$mtype} = "mslr"
}
elsif ($REORDERING_MODEL_TYPES{$mtype} =~ /msd/) {
@ -376,6 +379,7 @@ for my $mtype ( keys %REORDERING_MODEL_TYPES) {
}
}
#TODO - remove the below
my ($mono_previous_f,$swap_previous_f,$left_previous_f,$right_previous_f,$other_previous_f);
my ($mono_previous_fe,$swap_previous_fe,$left_previous_fe,$right_previous_fe,$other_previous_fe);
my ($mono_following_f,$swap_following_f,$left_following_f,$right_following_f,$other_following_f);
@ -1084,7 +1088,6 @@ sub get_extract_reordering_flags {
$config_string .= " --model $type-".$REORDERING_MODEL_TYPES{$type};
}
print STDERR "extract-flags: $config_string\n";
return ""; #comment out when using new training scripts (do we need an option for backward compatibility???
return $config_string;
}
@ -1325,38 +1328,34 @@ sub score_phrase_memscore {
sub get_reordering_factored {
print STDERR "(7) learn reordering model @ ".`date`;
my @SPECIFIED_TABLE = @_REORDERING_TABLE;
# my @TYPE = ("msd-f","msd-fe","msd-bidirectional-f","msd-bidirectional-fe","monotonicity-f","monotonicity-fe","monotonicity-bidirectional-f","monotonicity-bidirectional-fe");
#This @REORDERING_TABLE is now not used. Did anyone use it???
# my @SPECIFIED_TABLE = @_REORDERING_TABLE;
if (scalar(@_REORDERING_TABLE)) {
print STDERR "WARNING: you specified -reordering-table. That feature is not implemented in this version of train-factored-phrase-model.perl. Standard file names will be used.\n";
}
if ($REORDERING_LEXICAL) {
if ($___NOT_FACTORED) {
# my %FILE;
foreach my $model (@REORDERING_MODELS) {
# if (defined($REORDERING_MODELS{$type})) {
my $file = "$___MODEL_DIR/reordering-table.";
$file .= $model->{"all"};
#$file .= ".$type" if (scalar keys %REORDERING_MODELS) > 2;
$file = shift @SPECIFIED_TABLE if scalar(@SPECIFIED_TABLE);
$model->{"file"} = $file;
# }
}
&get_reordering($___EXTRACT_FILE);
print STDERR "(7.1) [no factors] learn reordering model @ ".`date`;
# foreach my $model (@REORDERING_MODELS) {
# #my $file = "$___MODEL_DIR/reordering-table.";
# $file .= $model->{"all"};
# #$file = shift @SPECIFIED_TABLE if scalar(@SPECIFIED_TABLE);
# $model->{"file"} = $file;
# }
&get_reordering($___EXTRACT_FILE,"$___MODEL_DIR/reordering-table.");
}
else {
foreach my $factor (split(/\+/,$___REORDERING_FACTORS)) {
print STDERR "(7) [$factor] learn reordering model @ ".`date`;
print STDERR "(7.1) [$factor] learn reordering model @ ".`date`;
my ($factor_f,$factor_e) = split(/\-/,$factor);
# my %FILE;
foreach my $model (@REORDERING_MODELS) { #$type (@TYPE) {
#if (defined($REORDERING_MODELS{$type})) {
my $file = "$___MODEL_DIR/reordering-table.$factor";
#$file .= ".$type" if (scalar keys %REORDERING_MODELS) > 2;
$file .= $model->{"all"};
$file = shift @SPECIFIED_TABLE if scalar(@SPECIFIED_TABLE);
$model->{"file"} = $file;
}
# &get_reordering(\%FILE,"$___EXTRACT_FILE.$factor");
&get_reordering("$___EXTRACT_FILE.$factor");
# foreach my $model (@REORDERING_MODELS) {
# my $file = "$___MODEL_DIR/reordering-table.$factor";
# $file .= $model->{"all"};
# $file = shift @SPECIFIED_TABLE if scalar(@SPECIFIED_TABLE);
# $model->{"file"} = $file;
# }
&get_reordering("$___EXTRACT_FILE.$factor","$___MODEL_DIR/reordering-table.$factor");
}
}
}
@ -1366,8 +1365,7 @@ sub get_reordering_factored {
}
sub get_reordering {
#my ($MODEL_FILE,$extract_file) = @_;
my ($extract_file) = @_;
my ($extract_file,$reo_model_path) = @_;
if (-e "$extract_file.o.gz") {
safesystem("gunzip < $extract_file.o.gz | LC_ALL=C sort -T $___TEMP_DIR > $extract_file.o.sorted") or die("ERROR");
}
@ -1376,233 +1374,28 @@ sub get_reordering {
}
my $smooth = $___REORDERING_SMOOTH;
@REORDERING_SMOOTH_PREVIOUS = ($smooth,$smooth,$smooth,$smooth,$smooth);
@REORDERING_SMOOTH_FOLLOWING = ($smooth,$smooth,$smooth,$smooth,$smooth);
my (%SMOOTH_PREVIOUS,%SMOOTH_FOLLOWING);
if ($smooth =~ /(.+)u$/) {
$smooth = $1;
my $smooth_total = 0;
open(O,"$extract_file.o.sorted")
or die "ERROR: Can't read $extract_file.o.sorted";
while(<O>) {
chomp;
my ($f,$e,$o) = split(/ \|\|\| /);
my ($o_previous,$o_following) = split(/ /,$o);
$SMOOTH_PREVIOUS{$o_previous}++;
$SMOOTH_FOLLOWING{$o_following}++;
$smooth_total++;
}
close(O);
@REORDERING_SMOOTH_PREVIOUS = ($smooth*($SMOOTH_PREVIOUS{"mono"}+0.1)/$smooth_total,
$smooth*($SMOOTH_PREVIOUS{"swap"}+0.1)/$smooth_total,
$smooth*($SMOOTH_PREVIOUS{"left"}+0.1)/$smooth_total,
$smooth*($SMOOTH_PREVIOUS{"right"}+0.1)/$smooth_total,
$smooth*($SMOOTH_PREVIOUS{"other"}+0.1)/$smooth_total);
@REORDERING_SMOOTH_FOLLOWING = ($smooth*($SMOOTH_FOLLOWING{"mono"}+0.1)/$smooth_total,
$smooth*($SMOOTH_FOLLOWING{"swap"}+0.1)/$smooth_total,
$smooth*($SMOOTH_FOLLOWING{"left"}+0.1)/$smooth_total,
$smooth*($SMOOTH_FOLLOWING{"right"}+0.1)/$smooth_total,
$smooth*($SMOOTH_FOLLOWING{"other"}+0.1)/$smooth_total);
printf "$smooth*($SMOOTH_FOLLOWING{mono}+0.1)/$smooth_total,
$smooth*($SMOOTH_FOLLOWING{swap}+0.1)/$smooth_total,
$smooth*($SMOOTH_FOLLOWING{other}+0.1)/$smooth_total\n";
printf "smoothed following to %f,%f,%f\n",@REORDERING_SMOOTH_FOLLOWING;
}
($mono_previous_f,$swap_previous_f,$left_previous_f,$right_previous_f,$other_previous_f) = (0,0,0,0,0); #@REORDERING_SMOOTH_PREVIOUS;
($mono_previous_fe,$swap_previous_fe,$left_previous_fe,$right_previous_fe,$other_previous_fe) = (0,0,0,0,0); #@REORDERING_SMOOTH_PREVIOUS;
($mono_following_f,$swap_following_f,$left_following_f,$right_following_f,$other_following_f) = (0,0,0,0,0); #@REORDERING_SMOOTH_FOLLOWING;
($mono_following_fe,$swap_following_fe,$left_following_fe,$right_following_fe,$other_following_fe) = (0,0,0,0,0); #@REORDERING_SMOOTH_FOLLOWING;
print STDERR "(7.2) building tables @ ".`date`;
open(O,"$extract_file.o.sorted")
or die "ERROR: Can't read $extract_file.o.sorted";
foreach my $model ( @REORDERING_MODELS ) {
local *FILE;
open(FILE, "|gzip >".$model->{"file"}.".gz");
$model->{"filehandle"} = *FILE;
}
my $first = 1;
while(<O>) {
chomp;
my ($f,$e,$o) = split(/ \|\|\| /);
my ($o_previous,$o_following) = split(/ /,$o);
# store counts if new f,e
if ($first) {
$f_current = $f;
$e_current = $e;
$first = 0;
}
elsif ($f ne $f_current || $e ne $e_current) {
#always store the counts for both directions.
# if (defined($REORDERING_MODELS{"fe"})) {
# compute probs, store them
&store_reordering_fe();
# reset counters
($mono_previous_fe,$swap_previous_fe,$left_previous_fe,$right_previous_fe,$other_previous_fe) = (0,0,0,0,0);
($mono_following_fe,$swap_following_fe,$left_following_fe,$right_following_fe,$other_following_fe) = (0,0,0,0,0);
# }
# store counts if new f
if ($f ne $f_current) { # && defined($REORDERING_MODELS{"f"})) {
# compute probs, store them
&store_reordering_f();
# reset counters
($mono_previous_f,$swap_previous_f,$left_previous_f,$right_previous_f,$other_previous_f) = (0,0,0,0,0);
($mono_following_f,$swap_following_f,$left_following_f,$right_following_f,$other_following_fe) = (0,0,0,0,0);
#create cmd string for lexical reordering scoring
my $cmd = "$LEXICAL_REO_SCORER $extract_file.o.sorted $smooth $reo_model_path";
$cmd .= " --SmoothWithCounts" if ($smooth =~ /(.+)u$/);
for my $mtype (keys %REORDERING_MODEL_TYPES) {
$cmd .= " --model \"$mtype $REORDERING_MODEL_TYPES{$mtype}";
foreach my $model (@REORDERING_MODELS) {
if ($model->{"type"} eq $mtype) {
$cmd .= " ".$model->{"all"};
}
$f_current = $f;
$e_current = $e;
}
# update counts
if ($o_previous eq 'mono') { $mono_previous_f++; $mono_previous_fe++; }
elsif ($o_previous eq 'swap') { $swap_previous_f++; $swap_previous_fe++; }
elsif ($o_previous eq 'left'){ $left_previous_f++; $left_previous_fe++; }
elsif ($o_previous eq 'right'){ $right_previous_f++; $right_previous_fe++; }
#keep other option for backward compatibility
elsif ($o_previous eq 'other'){ $other_previous_f++; $other_previous_fe++; }
else { print STDERR "buggy line (o_previous:$o_previous): $_\n"; }
if ($o_following eq 'mono') { $mono_following_f++; $mono_following_fe++; }
elsif ($o_following eq 'swap') { $swap_following_f++; $swap_following_fe++; }
elsif ($o_following eq 'left') { $left_following_f++; $left_following_fe++; }
elsif ($o_following eq 'right') { $right_previous_f++; $right_previous_fe++; }
#keep other option for backward compatibility
elsif ($o_following eq 'other') { $other_previous_f++; $other_previous_fe++; }
else { print STDERR "buggy line (o_following:$o_following): $_\n"; }
}
$cmd .= "\"";
}
&store_reordering_f();
&store_reordering_fe();
#Call the lexical reordering scorer
safesystem("$cmd") or die "ERROR: Lexical reordering scoring failed";
if (! $debug) { safesystem("rm $extract_file.o.sorted") or die("ERROR");}
}
# Write one smoothed orientation-distribution line for the current source
# phrase to every reordering model whose {"lang"} is "f".
#
# Takes no arguments.  Reads the file-level counters $mono_previous_f,
# $swap_previous_f, $left_previous_f, $right_previous_f, $other_previous_f,
# the smoothing constants in @REORDERING_SMOOTH_PREVIOUS (used here with
# index 0 = mono, 1 = swap, 2 = left, 3 = right, 4 = other) and $f_current.
# Output goes to each model's {"filehandle"}; nothing is returned.
sub store_reordering_f {
    # Raw number of observations for this source phrase, before smoothing.
    my $count_previous_f = $mono_previous_f + $swap_previous_f + $left_previous_f
                         + $right_previous_f + $other_previous_f;

    foreach my $model (@REORDERING_MODELS) {
        next if ($model->{"lang"} ne "f");
        my $orient = $model->{"orient"};
        my $fh     = $model->{"filehandle"};

        # The denominator is rebuilt from the raw count for every model.
        # Accumulating the smoothing mass into one shared total would inflate
        # the denominator of every model after the first.
        if ($orient eq "mslr") {
            my $total = $count_previous_f
                      + $REORDERING_SMOOTH_PREVIOUS[0] + $REORDERING_SMOOTH_PREVIOUS[1]
                      + $REORDERING_SMOOTH_PREVIOUS[2] + $REORDERING_SMOOTH_PREVIOUS[3];
            printf { $fh } ("%s ||| %g %g %g %g\n",
                $f_current,
                ($mono_previous_f  + $REORDERING_SMOOTH_PREVIOUS[0]) / $total,
                ($swap_previous_f  + $REORDERING_SMOOTH_PREVIOUS[1]) / $total,
                ($left_previous_f  + $REORDERING_SMOOTH_PREVIOUS[2]) / $total,
                ($right_previous_f + $REORDERING_SMOOTH_PREVIOUS[3]) / $total);
        }
        elsif ($orient eq "msd") {
            # NOTE(review): left/right counts are part of the denominator but
            # of no numerator here; presumably msd input only carries
            # mono/swap/other labels -- confirm against the extractor.
            my $total = $count_previous_f
                      + $REORDERING_SMOOTH_PREVIOUS[0] + $REORDERING_SMOOTH_PREVIOUS[1]
                      + $REORDERING_SMOOTH_PREVIOUS[4];
            printf { $fh } ("%s ||| %g %g %g\n",
                $f_current,
                ($mono_previous_f  + $REORDERING_SMOOTH_PREVIOUS[0]) / $total,
                ($swap_previous_f  + $REORDERING_SMOOTH_PREVIOUS[1]) / $total,
                ($other_previous_f + $REORDERING_SMOOTH_PREVIOUS[4]) / $total);
        }
        elsif ($orient eq "monotonicity") {
            # Two classes: monotone versus everything else (swap, left, right).
            my $total = $count_previous_f
                      + $REORDERING_SMOOTH_PREVIOUS[0] + $REORDERING_SMOOTH_PREVIOUS[1];
            printf { $fh } ("%s ||| %g %g\n",
                $f_current,
                ($mono_previous_f + $REORDERING_SMOOTH_PREVIOUS[0]) / $total,
                ($swap_previous_f + $left_previous_f + $right_previous_f
                    + $REORDERING_SMOOTH_PREVIOUS[1]) / $total);
        }
        elsif ($orient eq "leftright") {
            # Two classes: mono is pooled with left, swap is pooled with right.
            my $total = $count_previous_f
                      + $REORDERING_SMOOTH_PREVIOUS[0] + $REORDERING_SMOOTH_PREVIOUS[1];
            printf { $fh } ("%s ||| %g %g\n",
                $f_current,
                ($mono_previous_f + $left_previous_f
                    + $REORDERING_SMOOTH_PREVIOUS[0]) / $total,
                ($swap_previous_f + $right_previous_f
                    + $REORDERING_SMOOTH_PREVIOUS[1]) / $total);
        }
    }
}
# Write one smoothed orientation-distribution line for the current
# (source, target) phrase pair to every reordering model whose {"lang"} is
# "fe".
#
# Takes no arguments.  Reads the file-level *_previous_fe and *_following_fe
# counters, the smoothing constants in @REORDERING_SMOOTH_PREVIOUS and
# @REORDERING_SMOOTH_FOLLOWING (used here with index 0 = mono, 1 = swap,
# 2 = left, 3 = right, 4 = other), and $f_current / $e_current.
# Each line has the form "f ||| e ||| <previous scores> [<following scores>]";
# the following scores are only written when the model's {"dir"} is
# "bidirectional".  Output goes to each model's {"filehandle"}.
sub store_reordering_fe {
    # Raw observation counts before smoothing.  'other' is included so that
    # the msd rows (which use it as a numerator) stay normalised, matching
    # what store_reordering_f does.
    my $count_previous_fe  = $mono_previous_fe + $swap_previous_fe + $left_previous_fe
                           + $right_previous_fe + $other_previous_fe;
    my $count_following_fe = $mono_following_fe + $swap_following_fe + $left_following_fe
                           + $right_following_fe + $other_following_fe;

    foreach my $model (@REORDERING_MODELS) {
        next if ($model->{"lang"} ne "fe");
        my $orient = $model->{"orient"};
        my $fh     = $model->{"filehandle"};

        # Denominators are rebuilt from the raw counts for every model;
        # accumulating smoothing into shared totals would inflate the
        # denominators of every model after the first.
        if ($orient eq "mslr") {
            my $total_prev = $count_previous_fe
                + $REORDERING_SMOOTH_PREVIOUS[0] + $REORDERING_SMOOTH_PREVIOUS[1]
                + $REORDERING_SMOOTH_PREVIOUS[2] + $REORDERING_SMOOTH_PREVIOUS[3];
            my $total_next = $count_following_fe
                + $REORDERING_SMOOTH_FOLLOWING[0] + $REORDERING_SMOOTH_FOLLOWING[1]
                + $REORDERING_SMOOTH_FOLLOWING[2] + $REORDERING_SMOOTH_FOLLOWING[3];
            printf { $fh } ("%s ||| %s ||| %g %g %g %g ",
                $f_current, $e_current,
                ($mono_previous_fe  + $REORDERING_SMOOTH_PREVIOUS[0]) / $total_prev,
                ($swap_previous_fe  + $REORDERING_SMOOTH_PREVIOUS[1]) / $total_prev,
                ($left_previous_fe  + $REORDERING_SMOOTH_PREVIOUS[2]) / $total_prev,
                ($right_previous_fe + $REORDERING_SMOOTH_PREVIOUS[3]) / $total_prev);
            if ($model->{"dir"} eq "bidirectional") {
                printf { $fh } ("%g %g %g %g",
                    ($mono_following_fe  + $REORDERING_SMOOTH_FOLLOWING[0]) / $total_next,
                    ($swap_following_fe  + $REORDERING_SMOOTH_FOLLOWING[1]) / $total_next,
                    ($left_following_fe  + $REORDERING_SMOOTH_FOLLOWING[2]) / $total_next,
                    ($right_following_fe + $REORDERING_SMOOTH_FOLLOWING[3]) / $total_next);
            }
            printf { $fh } ("\n");
        }
        elsif ($orient eq "msd") {
            my $total_prev = $count_previous_fe
                + $REORDERING_SMOOTH_PREVIOUS[0] + $REORDERING_SMOOTH_PREVIOUS[1]
                + $REORDERING_SMOOTH_PREVIOUS[4];
            my $total_next = $count_following_fe
                + $REORDERING_SMOOTH_FOLLOWING[0] + $REORDERING_SMOOTH_FOLLOWING[1]
                + $REORDERING_SMOOTH_FOLLOWING[4];
            printf { $fh } ("%s ||| %s ||| %g %g %g ",
                $f_current, $e_current,
                ($mono_previous_fe  + $REORDERING_SMOOTH_PREVIOUS[0]) / $total_prev,
                ($swap_previous_fe  + $REORDERING_SMOOTH_PREVIOUS[1]) / $total_prev,
                ($other_previous_fe + $REORDERING_SMOOTH_PREVIOUS[4]) / $total_prev);
            if ($model->{"dir"} eq "bidirectional") {
                printf { $fh } ("%g %g %g",
                    ($mono_following_fe  + $REORDERING_SMOOTH_FOLLOWING[0]) / $total_next,
                    ($swap_following_fe  + $REORDERING_SMOOTH_FOLLOWING[1]) / $total_next,
                    ($other_following_fe + $REORDERING_SMOOTH_FOLLOWING[4]) / $total_next);
            }
            printf { $fh } ("\n");
        }
        elsif ($orient eq "monotonicity") {
            # Two classes: monotone versus everything else (swap, left, right).
            my $total_prev = $count_previous_fe
                + $REORDERING_SMOOTH_PREVIOUS[0] + $REORDERING_SMOOTH_PREVIOUS[1];
            my $total_next = $count_following_fe
                + $REORDERING_SMOOTH_FOLLOWING[0] + $REORDERING_SMOOTH_FOLLOWING[1];
            # The phrase pair is separated by " ||| " like in every other
            # branch; the separator was previously missing here.
            printf { $fh } ("%s ||| %s ||| %g %g ",
                $f_current, $e_current,
                ($mono_previous_fe + $REORDERING_SMOOTH_PREVIOUS[0]) / $total_prev,
                ($swap_previous_fe + $left_previous_fe + $right_previous_fe
                    + $REORDERING_SMOOTH_PREVIOUS[1]) / $total_prev);
            if ($model->{"dir"} eq "bidirectional") {
                printf { $fh } ("%g %g",
                    ($mono_following_fe + $REORDERING_SMOOTH_FOLLOWING[0]) / $total_next,
                    ($swap_following_fe + $left_following_fe + $right_following_fe
                        + $REORDERING_SMOOTH_FOLLOWING[1]) / $total_next);
            }
            printf { $fh } ("\n");
        }
        elsif ($orient eq "leftright") {
            # Two classes: mono is pooled with left, swap is pooled with right.
            my $total_prev = $count_previous_fe
                + $REORDERING_SMOOTH_PREVIOUS[0] + $REORDERING_SMOOTH_PREVIOUS[1];
            my $total_next = $count_following_fe
                + $REORDERING_SMOOTH_FOLLOWING[0] + $REORDERING_SMOOTH_FOLLOWING[1];
            printf { $fh } ("%s ||| %s ||| %g %g ",
                $f_current, $e_current,
                ($mono_previous_fe + $left_previous_fe
                    + $REORDERING_SMOOTH_PREVIOUS[0]) / $total_prev,
                ($swap_previous_fe + $right_previous_fe
                    + $REORDERING_SMOOTH_PREVIOUS[1]) / $total_prev);
            if ($model->{"dir"} eq "bidirectional") {
                printf { $fh } ("%g %g",
                    ($mono_following_fe + $left_following_fe
                        + $REORDERING_SMOOTH_FOLLOWING[0]) / $total_next,
                    ($swap_following_fe + $right_following_fe
                        + $REORDERING_SMOOTH_FOLLOWING[1]) / $total_next);
            }
            printf { $fh } ("\n");
        }
    }
}
### (8) LEARN GENERATION MODEL
@ -1703,7 +1496,7 @@ sub create_ini {
if (defined $___TRANSLATION_FACTORS) {
print INI "# input factors\n";
print INI "[input-factors]\n";
print INI "[input-factors]\n";
my $INPUT_FACTOR_MAX = 0;
foreach my $table (split /\+/, $___TRANSLATION_FACTORS) {
my ($factor_list, $output) = split /-+/, $table;
@ -1798,25 +1591,16 @@ print INI "\n\n\# limit on how many phrase translations e for each phrase f are
my $file = "# distortion (reordering) files\n\[distortion-file]\n";
my $factor_i = 0;
my @SPECIFIED_TABLE = @_REORDERING_TABLE;
#my @SPECIFIED_TABLE = @_REORDERING_TABLE;
foreach my $factor (split(/\+/,$___REORDERING_FACTORS)) {
# foreach my $type (keys %REORDERING_MODELS) {
foreach my $model (@REORDERING_MODELS) {
# next if $type eq "fe" || $type eq "f";
# next if $type eq "distance";
# my $w;
# if ($type =~ /msd/) { $w = 3; } else { $w = 1; }
# if ($type =~ /bi/) { $w *= 2; }
$weight_d_count += $model->{"numfeatures"};
my $table_file = "$___MODEL_DIR/reordering-table.";
$table_file .= ".$factor" unless $___NOT_FACTORED;
# $table_file .= ".$type" if (scalar keys %REORDERING_MODELS) > 2;
$table_file .= $model->{"all"};
$table_file .= ".gz";
$table_file = shift @SPECIFIED_TABLE if scalar(@SPECIFIED_TABLE);
#$type =~ s/\-f/\-unidirectional\-f/ unless $type =~ /\-bi/;
$file .= ".$factor ".$model->{"all"}." ".$model->{"numfeatures"}." $table_file\n";
$weight_d_count += $model->{"numfeatures"};
my $table_file = "$___MODEL_DIR/reordering-table.";
$table_file .= ".$factor" unless $___NOT_FACTORED;
$table_file .= $model->{"all"};
$table_file .= ".gz";
#$table_file = shift @SPECIFIED_TABLE if scalar(@SPECIFIED_TABLE);
$file .= ".$factor ".$model->{"all"}." ".$model->{"numfeatures"}." $table_file\n";
}
$factor_i++;
}