mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-29 15:04:05 +03:00
Rewrote the lexical reordering model scoring in C++. Adapted train-factored-phrase-model.perl to that change. Minor fixes in other places, for compatibility
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/branches/hierarchical-reo@2884 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
parent
9adc3ee500
commit
92368ba490
@ -40,8 +40,8 @@ class LexicalReorderingState : public FFState {
|
||||
static const LexicalReordering::ReorderingType D = 2; // discontinuous
|
||||
static const LexicalReordering::ReorderingType DL = 2; // discontinuous, left
|
||||
static const LexicalReordering::ReorderingType DR = 3; // discontinuous, right
|
||||
static const LexicalReordering::ReorderingType L = 0; // left
|
||||
static const LexicalReordering::ReorderingType R = 1; // right
|
||||
static const LexicalReordering::ReorderingType R = 0; // right
|
||||
static const LexicalReordering::ReorderingType L = 1; // left
|
||||
};
|
||||
|
||||
//! State for the standard Moses implementation of lexical reordering models
|
||||
|
@ -26,7 +26,7 @@ RELEASEDIR=$(TARGETDIR)/scripts-$(TS)
|
||||
|
||||
all: compile
|
||||
|
||||
SUBDIRS=cmert-0.5 phrase-extract symal mbr
|
||||
SUBDIRS=cmert-0.5 phrase-extract symal mbr lexical-reordering
|
||||
SUBDIRS_CLEAN=$(SUBDIRS) memscore
|
||||
|
||||
compile: compile-memscore
|
||||
|
15
scripts/training/lexical-reordering/Makefile
Normal file
15
scripts/training/lexical-reordering/Makefile
Normal file
@ -0,0 +1,15 @@
|
||||
|
||||
# Builds the lexical reordering scorer ("score").
.PHONY: all clean

all: score

clean:
	rm -f *.o

.cpp.o:
	$(CXX) -O6 -g -c $<

# Libraries are listed after the sources/objects that use them: linkers that
# resolve symbols left to right (or run with --as-needed) otherwise drop zlib.
score: score.cpp reordering_classes.o
	$(CXX) score.cpp reordering_classes.o -lz -o score

#reordering_classes.o: reordering_classes.h reordering_classes.cpp
#	$(CXX) reordering_classes.cpp
|
||||
|
427
scripts/training/lexical-reordering/reordering_classes.cpp
Normal file
427
scripts/training/lexical-reordering/reordering_classes.cpp
Normal file
@ -0,0 +1,427 @@
|
||||
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
#include <numeric>
|
||||
#include <cstdio>
|
||||
//#include <iostream>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include "zlib.h"
|
||||
|
||||
#include "reordering_classes.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
// Creates the four count tables with one zeroed slot per ORIENTATION value
// (MONO through NOMONO).
ModelScore::ModelScore()
{
  const size_t slots = NOMONO - MONO + 1;
  count_fe_prev.assign(slots, 0.0);
  count_fe_next.assign(slots, 0.0);
  count_f_prev.assign(slots, 0.0);
  count_f_next.assign(slots, 0.0);
}
|
||||
|
||||
// Factory: returns a newly allocated ModelScore subclass for the given
// orientation model type ("mslr", "msd", "monotonicity" or "leftright").
// The caller owns the returned object. Prints an error and exits with
// status 1 for any other type string.
ModelScore* ModelScore::createModelScore(const string& modeltype) {
  if (modeltype == "mslr") {
    return new ModelScoreMSLR();
  }
  if (modeltype == "msd") {
    return new ModelScoreMSD();
  }
  // "monotonicity" is the spelling Model::createModel expects; the misspelt
  // "monotonoicity" was the only form accepted here before and is kept so
  // existing callers keep working.
  if (modeltype == "monotonicity" || modeltype == "monotonoicity") {
    return new ModelScoreMonotonicity();
  }
  if (modeltype == "leftright") {
    return new ModelScoreLR();
  }
  cerr << "Illegal model type given for lexical reordering model scoring: " << modeltype << endl;
  exit(1);
}
|
||||
|
||||
void ModelScore::reset_fe() {
|
||||
for(int i=MONO; i<=NOMONO; ++i) {
|
||||
count_fe_prev[i] = 0;
|
||||
count_fe_next[i] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
void ModelScore::reset_f() {
|
||||
for(int i=MONO; i<=NOMONO; ++i) {
|
||||
count_f_prev[i] = 0;
|
||||
count_f_next[i] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
void ModelScore::add_example(const std::string& previous, std::string& next) {
|
||||
count_fe_prev[getType(previous)]++;
|
||||
count_f_prev[getType(previous)]++;
|
||||
count_fe_next[getType(next)]++;
|
||||
count_f_next[getType(next)]++;
|
||||
}
|
||||
|
||||
const std::vector<double>& ModelScore::get_scores_fe_prev() const {
|
||||
return count_fe_prev;
|
||||
}
|
||||
|
||||
const std::vector<double>& ModelScore::get_scores_fe_next() const {
|
||||
return count_fe_next;
|
||||
}
|
||||
|
||||
const std::vector<double>& ModelScore::get_scores_f_prev() const {
|
||||
return count_f_prev;
|
||||
}
|
||||
|
||||
const std::vector<double>& ModelScore::get_scores_f_next() const {
|
||||
return count_f_next;
|
||||
}
|
||||
|
||||
|
||||
// Default label mapping: every known orientation label maps to the
// ORIENTATION value of the same name. An unknown label is reported on stderr
// and terminates the program with status 1.
ORIENTATION ModelScore::getType(const std::string& s) {
  if (s == "mono")   return MONO;
  if (s == "swap")   return SWAP;
  if (s == "dright") return DRIGHT;
  if (s == "dleft")  return DLEFT;
  if (s == "other")  return OTHER;
  if (s == "nomono") return NOMONO;

  cerr << "Illegal reordering type used: " << s << endl;
  exit(1);
}
|
||||
|
||||
|
||||
// MSLR label mapping: accepts only mono, swap, dright and dleft. The labels
// "other" and "nomono" are rejected with a model-specific message; anything
// else is rejected with the generic message. Both rejections exit(1).
ORIENTATION ModelScoreMSLR::getType(const std::string& s) {
  if (s == "mono")   return MONO;
  if (s == "swap")   return SWAP;
  if (s == "dright") return DRIGHT;
  if (s == "dleft")  return DLEFT;

  const bool knownButUnsupported = (s == "other" || s == "nomono");
  if (knownButUnsupported) {
    cerr << "Illegal reordering type used: " << s << " for model type MSLR" << endl;
  } else {
    cerr << "Illegal reordering type used: " << s << endl;
  }
  exit(1);
}
|
||||
|
||||
|
||||
// Left/right label mapping: mono and dright are counted as DRIGHT, swap and
// dleft as DLEFT. "other" and "nomono" are rejected with a model-specific
// message; anything else with the generic message. Both rejections exit(1).
ORIENTATION ModelScoreLR::getType(const std::string& s) {
  if (s == "mono" || s == "dright") return DRIGHT;
  if (s == "swap" || s == "dleft")  return DLEFT;

  const bool knownButUnsupported = (s == "other" || s == "nomono");
  if (knownButUnsupported) {
    cerr << "Illegal reordering type used: " << s << " for model type LeftRight" << endl;
  } else {
    cerr << "Illegal reordering type used: " << s << endl;
  }
  exit(1);
}
|
||||
|
||||
|
||||
// MSD label mapping: mono and swap keep their own class; dleft, dright and
// other are all counted as OTHER. "nomono" is rejected with a model-specific
// message; anything else with the generic message. Both rejections exit(1).
ORIENTATION ModelScoreMSD::getType(const std::string& s) {
  if (s == "mono") return MONO;
  if (s == "swap") return SWAP;
  if (s == "dleft" || s == "dright" || s == "other") return OTHER;

  if (s == "nomono") {
    cerr << "Illegal reordering type used: " << s << " for model type MSD" << endl;
  } else {
    cerr << "Illegal reordering type used: " << s << endl;
  }
  exit(1);
}
|
||||
|
||||
// Monotonicity label mapping: mono stays MONO, every other known label
// (swap, dleft, dright, other, nomono) is counted as NOMONO. An unknown label
// is reported on stderr and terminates the program with status 1.
ORIENTATION ModelScoreMonotonicity::getType(const std::string& s) {
  if (s == "mono") return MONO;

  static const char* const nonMonotone[] = { "swap", "dleft", "dright", "other", "nomono" };
  for (size_t k = 0; k < sizeof(nonMonotone) / sizeof(nonMonotone[0]); ++k) {
    if (s == nonMonotone[k]) return NOMONO;
  }

  cerr << "Illegal reordering type used: " << s << endl;
  exit(1);
}
|
||||
|
||||
|
||||
// Count-based smoothing for the four MSLR classes, in the order
// mono, swap, dright, dleft: weight * (count + 0.1) / total, where total is
// the sum of every entry of `scores`.
// scores: counts indexed by ORIENTATION; weight: smoothing weight.
std::vector<double> ScorerMSLR::createSmoothing(std::vector<double> scores, double weight) const {
  // The init value must be a double: with an int 0, std::accumulate would
  // carry the running sum in an int.
  const double total = accumulate(scores.begin(), scores.end(), 0.0);
  vector<double> res;
  res.push_back(weight*(scores[MONO]+0.1)/total);
  res.push_back(weight*(scores[SWAP]+0.1)/total);
  res.push_back(weight*(scores[DRIGHT]+0.1)/total);
  res.push_back(weight*(scores[DLEFT]+0.1)/total);
  return res;
}
|
||||
|
||||
// Constant smoothing: the same weight for each of the four MSLR classes.
std::vector<double> ScorerMSLR::createConstSmoothing(double weight) const
{
  return vector<double>(4, weight);
}
|
||||
|
||||
|
||||
// Count-based smoothing for the three MSD classes, in the order
// mono, swap, discontinuous (dleft + dright + other):
// weight * (count + 0.1) / total, where total is the sum of every entry of
// `scores`.
// scores: counts indexed by ORIENTATION; weight: smoothing weight.
std::vector<double> ScorerMSD::createSmoothing(std::vector<double> scores, double weight) const {
  // The init value must be a double: with an int 0, std::accumulate would
  // carry the running sum in an int.
  const double total = accumulate(scores.begin(), scores.end(), 0.0);
  vector<double> res;
  res.push_back(weight*(scores[MONO]+0.1)/total);
  res.push_back(weight*(scores[SWAP]+0.1)/total);
  res.push_back(weight*(scores[DLEFT]+scores[DRIGHT]+scores[OTHER]+0.1)/total);
  return res;
}
|
||||
|
||||
// Constant smoothing: the same weight for each of the three MSD classes.
std::vector<double> ScorerMSD::createConstSmoothing(double weight) const
{
  return vector<double>(3, weight);
}
|
||||
|
||||
// Count-based smoothing for the two monotonicity classes, in the order
// mono, non-mono (swap + dleft + dright + other + nomono):
// weight * (count + 0.1) / total, where total is the sum of every entry of
// `scores`.
// scores: counts indexed by ORIENTATION; weight: smoothing weight.
std::vector<double> ScorerMonotonicity::createSmoothing(std::vector<double> scores, double weight) const {
  // The init value must be a double: with an int 0, std::accumulate would
  // carry the running sum in an int.
  const double total = accumulate(scores.begin(), scores.end(), 0.0);
  vector<double> res;
  res.push_back(weight*(scores[MONO]+0.1)/total);
  res.push_back(weight*(scores[SWAP]+scores[DLEFT]+scores[DRIGHT]+scores[OTHER]+scores[NOMONO]+0.1)/total);
  return res;
}
|
||||
|
||||
// Constant smoothing: the same weight for each of the two monotonicity classes.
std::vector<double> ScorerMonotonicity::createConstSmoothing(double weight) const
{
  return vector<double>(2, weight);
}
|
||||
|
||||
|
||||
// Count-based smoothing for the two left/right classes, in the order
// right (mono + dright), left (swap + dleft):
// weight * (count + 0.1) / total, where total is the sum of every entry of
// `scores`.
// scores: counts indexed by ORIENTATION; weight: smoothing weight.
std::vector<double> ScorerLR::createSmoothing(std::vector<double> scores, double weight) const {
  // The init value must be a double: with an int 0, std::accumulate would
  // carry the running sum in an int.
  const double total = accumulate(scores.begin(), scores.end(), 0.0);
  vector<double> res;
  res.push_back(weight*(scores[MONO]+scores[DRIGHT]+0.1)/total);
  // The +0.1 floor was missing for the left class only; every other class of
  // every scorer has it, and without it an unseen class gets zero smoothing.
  res.push_back(weight*(scores[SWAP]+scores[DLEFT]+0.1)/total);
  return res;
}
|
||||
|
||||
// Constant smoothing: the same weight for each of the two left/right classes.
std::vector<double> ScorerLR::createConstSmoothing(double weight) const
{
  return vector<double>(2, weight);
}
|
||||
|
||||
// Projects the per-ORIENTATION counts onto the four MSLR classes, in the
// order mono, swap, dright, dleft.
std::vector<double> ScorerMSLR::score(vector<double> all_scores) const
{
  const double classes[] = {
    all_scores[MONO],
    all_scores[SWAP],
    all_scores[DRIGHT],
    all_scores[DLEFT]
  };
  return vector<double>(classes, classes + 4);
}
|
||||
|
||||
// Projects the per-ORIENTATION counts onto the three MSD classes, in the
// order mono, swap, discontinuous (dright + dleft + other).
std::vector<double> ScorerMSD::score(vector<double> all_scores) const
{
  const double classes[] = {
    all_scores[MONO],
    all_scores[SWAP],
    all_scores[DRIGHT]+all_scores[DLEFT]+all_scores[OTHER]
  };
  return vector<double>(classes, classes + 3);
}
|
||||
|
||||
// Projects the per-ORIENTATION counts onto the two monotonicity classes, in
// the order mono, non-mono (swap + dright + dleft + other + nomono).
std::vector<double> ScorerMonotonicity::score(vector<double> all_scores) const
{
  const double classes[] = {
    all_scores[MONO],
    all_scores[SWAP]+all_scores[DRIGHT]+all_scores[DLEFT]+all_scores[OTHER]+all_scores[NOMONO]
  };
  return vector<double>(classes, classes + 2);
}
|
||||
|
||||
|
||||
// Projects the per-ORIENTATION counts onto the two left/right classes, in
// the order right (mono + dright), left (swap + dleft).
std::vector<double> ScorerLR::score(vector<double> all_scores) const
{
  const double classes[] = {
    all_scores[MONO]+all_scores[DRIGHT],
    all_scores[SWAP]+all_scores[DLEFT]
  };
  return vector<double>(classes, classes + 2);
}
|
||||
|
||||
void Model::score_fe(const string& f, const string& e) {
|
||||
if (!fe) //Make sure we do not do anything if it is not a fe model
|
||||
return;
|
||||
//file >> f >> " " >> e >> " ||| ";
|
||||
fprintf(file,"%s ||| %s ||| ",f.c_str(),e.c_str());
|
||||
//condition on the previous phrase
|
||||
if (previous) {
|
||||
vector<double> scores = scorer->score(modelscore->get_scores_fe_prev());
|
||||
double sum = 0;
|
||||
for(int i=0; i<scores.size(); ++i) {
|
||||
scores[i] += smoothing_prev[i];
|
||||
sum += scores[i];
|
||||
}
|
||||
for(int i=0; i<scores.size(); ++i) {
|
||||
//file >> scores[i]/sum >> " ";
|
||||
fprintf(file,"%f ",scores[i]/sum);
|
||||
}
|
||||
}
|
||||
//condition on the next phrase
|
||||
if (next) {
|
||||
//file >> "||| ";
|
||||
fprintf(file, "||| ");
|
||||
vector<double> scores = scorer->score(modelscore->get_scores_fe_next());
|
||||
double sum = 0;
|
||||
for(int i=0; i<scores.size(); ++i) {
|
||||
scores[i] += smoothing_next[i];
|
||||
sum += scores[i];
|
||||
}
|
||||
for(int i=0; i<scores.size(); ++i) {
|
||||
//file >> scores[i]/sum >> " ";
|
||||
fprintf(file, "%f ", scores[i]/sum);
|
||||
}
|
||||
}
|
||||
//file >> "\n";
|
||||
fprintf(file,"\n");
|
||||
}
|
||||
|
||||
void Model::score_f(const string& f) {
|
||||
if (fe) //Make sure we do not do anything if it is not a f model
|
||||
return;
|
||||
//file >> f >> " ||| ";
|
||||
fprintf(file, "%s ||| ", f.c_str());
|
||||
//condition on the previous phrase
|
||||
if (previous) {
|
||||
vector<double> scores = scorer->score(modelscore->get_scores_f_prev());
|
||||
double sum = 0;
|
||||
for(int i=0; i<scores.size(); ++i) {
|
||||
scores[i] += smoothing_prev[i];
|
||||
sum += scores[i];
|
||||
}
|
||||
for(int i=0; i<scores.size(); ++i) {
|
||||
fprintf(file, "%f ", scores[i]/sum);
|
||||
}
|
||||
}
|
||||
//condition on the next phrase
|
||||
if (next) {
|
||||
//file >> "||| ";
|
||||
fprintf(file, "||| ");
|
||||
vector<double> scores = scorer->score(modelscore->get_scores_f_next());
|
||||
double sum = 0;
|
||||
for(int i=0; i<scores.size(); ++i) {
|
||||
scores[i] += smoothing_next[i];
|
||||
sum += scores[i];
|
||||
}
|
||||
for(int i=0; i<scores.size(); ++i) {
|
||||
//file >> scores[i]/sum >> " ";
|
||||
fprintf(file, "%f ", scores[i]/sum);
|
||||
}
|
||||
}
|
||||
//file >> "\n";
|
||||
fprintf(file, "\n");
|
||||
}
|
||||
|
||||
// Opens the output file `fn` for writing and records what the model
// conditions on.
//   lang: "fe" (both languages) or "f" (source language only); anything else
//         is a fatal error.
//   dir:  "backward" disables the next-phrase scores, "forward" disables the
//         previous-phrase scores; any other value keeps both.
// Exits with status 1 when the file cannot be opened or lang is illegal.
Model::Model(ModelScore* ms, Scorer* sc, const string& dir, const string& lang, const string& fn)
  : modelscore(ms), scorer(sc), filename(fn)
{
  file = fopen(filename.c_str(), "w");
  if (file == NULL) {
    cerr << "Could not open the model output file: " << filename << endl;
    exit(1);
  }

  const bool bothLanguages = (lang == "fe");
  if (!bothLanguages && lang != "f") {
    cerr << "You have given an illegal language to condition on: " << lang
         << "\nLegal types: fe (on both languages), f (only on source language)\n";
    exit(1);
  }
  fe = bothLanguages;

  previous = (dir != "forward");
  next = (dir != "backward");
}
|
||||
|
||||
Model::~Model() {
|
||||
fclose(file);
|
||||
delete modelscore;
|
||||
delete scorer;
|
||||
}
|
||||
|
||||
void Model::zipFile() {
|
||||
fclose(file);
|
||||
file = fopen(filename.c_str(), "rb");
|
||||
FILE* gzfile = (FILE*) gzopen((filename+".gz").c_str(),"wb");
|
||||
char inbuffer[128];
|
||||
int num_read;
|
||||
while ((num_read = fread(inbuffer, 1, sizeof(inbuffer), file)) > 0) {
|
||||
gzwrite(gzfile, inbuffer, num_read);
|
||||
}
|
||||
fclose(file);
|
||||
gzclose(gzfile);
|
||||
|
||||
//Remove the unzipped file
|
||||
remove(filename.c_str());
|
||||
}
|
||||
|
||||
// Splits a '-'-separated model configuration string. The first field is read
// and discarded; the following three are stored, in this order, in orient,
// dir and lang.
void Model::split_config(const string& config, string& dir, string& lang, string& orient)
{
  istringstream fields(config);
  string leading;  // first field, not used by the caller
  getline(fields, leading, '-');
  getline(fields, orient, '-');
  getline(fields, dir, '-');
  getline(fields, lang, '-');
}
|
||||
|
||||
// Factory: builds the Model described by `config`, pairing `modelscore` with
// the Scorer matching the orientation field of the configuration
// (mslr, msd, monotonicity or leftright). The output file is named
// filepath + config. Prints an error and exits with status 1 for any other
// orientation type.
Model* Model::createModel(ModelScore* modelscore, const std::string& config, const std::string& filepath) {
  string dir, lang, orient;
  split_config(config, dir, lang, orient);

  Scorer* chosen = NULL;
  if (orient == "mslr") {
    chosen = new ScorerMSLR();
  } else if (orient == "msd") {
    chosen = new ScorerMSD();
  } else if (orient == "monotonicity") {
    chosen = new ScorerMonotonicity();
  } else if (orient == "leftright") {
    chosen = new ScorerLR();
  }

  if (chosen == NULL) {
    cerr << "Illegal orientation type of reordering model: " << orient
         << "\n allowed types: mslr, msd, monotonicity, leftright\n";
    exit(1);
  }
  return new Model(modelscore, chosen, dir, lang, filepath + config);
}
|
||||
|
||||
// Derives count-based smoothing values from the fe counts currently held by
// the model score, separately for the previous-phrase and the next-phrase
// orientation. w is the smoothing weight passed on to the scorer.
void Model::createSmoothing(double w) {
  smoothing_prev = scorer->createSmoothing(modelscore->get_scores_fe_prev(),w);
  // The next-phrase smoothing must come from the next-phrase counts; it was
  // computed from the previous-phrase counts before.
  smoothing_next = scorer->createSmoothing(modelscore->get_scores_fe_next(),w);
}
|
||||
|
||||
// Uses the constant w as smoothing value for every class, for both the
// previous-phrase and the next-phrase orientation.
void Model::createConstSmoothing(double w)
{
  const vector<double> flat = scorer->createConstSmoothing(w);
  smoothing_prev = flat;
  smoothing_next = scorer->createConstSmoothing(w);
}
|
134
scripts/training/lexical-reordering/reordering_classes.h
Normal file
134
scripts/training/lexical-reordering/reordering_classes.h
Normal file
@ -0,0 +1,134 @@
|
||||
/*
|
||||
* reordering_classes.h
|
||||
* Utility classes for lexical reordering table scoring
|
||||
*
|
||||
* Created by: Sara Stymne - Linköping University
|
||||
* Machine Translation Marathon 2010, Dublin
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <fstream>
|
||||
|
||||
|
||||
// Orientation classes; the values index the count vectors of ModelScore.
enum ORIENTATION {
  MONO,
  SWAP,
  DRIGHT,
  DLEFT,
  OTHER,
  NOMONO
};
|
||||
|
||||
|
||||
//Keeps the counts for the different reordering types
|
||||
//(Instantiated in 1-3 instances, one for each type of model (hier, phrase, wbe))
|
||||
class ModelScore {
|
||||
private:
|
||||
std::vector<double> count_fe_prev;
|
||||
std::vector<double> count_fe_next;
|
||||
std::vector<double> count_f_prev;
|
||||
std::vector<double> count_f_next;
|
||||
|
||||
protected:
|
||||
virtual ORIENTATION getType(const std::string& s);
|
||||
|
||||
public:
|
||||
ModelScore();
|
||||
void add_example(const std::string& previous, std::string& next);
|
||||
void reset_fe();
|
||||
void reset_f();
|
||||
const std::vector<double>& get_scores_fe_prev() const;
|
||||
const std::vector<double>& get_scores_fe_next() const;
|
||||
const std::vector<double>& get_scores_f_prev() const;
|
||||
const std::vector<double>& get_scores_f_next() const;
|
||||
|
||||
static ModelScore* createModelScore(const std::string& modeltype);
|
||||
};
|
||||
|
||||
class ModelScoreMSLR : public ModelScore {
|
||||
protected:
|
||||
virtual ORIENTATION getType(const std::string& s);
|
||||
};
|
||||
|
||||
class ModelScoreLR : public ModelScore {
|
||||
protected:
|
||||
virtual ORIENTATION getType(const std::string& s);
|
||||
};
|
||||
|
||||
class ModelScoreMSD : public ModelScore {
|
||||
protected:
|
||||
virtual ORIENTATION getType(const std::string& s);
|
||||
};
|
||||
|
||||
class ModelScoreMonotonicity : public ModelScore {
|
||||
protected:
|
||||
virtual ORIENTATION getType(const std::string& s);
|
||||
};
|
||||
|
||||
//Class for calculating total counts, and to calculate smoothing
//Abstract interface; one subclass per orientation model type.
class Scorer {
 public:
  // Scorers are deleted through Scorer* (see Model::~Model), so the
  // destructor has to be virtual.
  virtual ~Scorer() {}
  // Reduces counts indexed by ORIENTATION to one value per model class.
  virtual std::vector<double> score(std::vector<double>) const = 0;
  // Smoothing values per model class derived from counts and a weight.
  virtual std::vector<double> createSmoothing(std::vector<double>, double) const = 0;
  // Smoothing values per model class, all equal to the given constant.
  virtual std::vector<double> createConstSmoothing(double) const = 0;
};
|
||||
|
||||
// Scorer for mslr models; class order: mono, swap, dright, dleft.
class ScorerMSLR : public Scorer
{
 public:
  virtual std::vector<double> score(std::vector<double>) const;
  virtual std::vector<double> createSmoothing(std::vector<double>, double) const;
  virtual std::vector<double> createConstSmoothing(double) const;
};
|
||||
|
||||
// Scorer for msd models; class order: mono, swap, discontinuous.
class ScorerMSD : public Scorer
{
 public:
  virtual std::vector<double> score(std::vector<double>) const;
  virtual std::vector<double> createSmoothing(std::vector<double>, double) const;
  virtual std::vector<double> createConstSmoothing(double) const;
};
|
||||
|
||||
// Scorer for monotonicity models; class order: mono, non-mono.
class ScorerMonotonicity : public Scorer
{
 public:
  virtual std::vector<double> score(std::vector<double>) const;
  virtual std::vector<double> createSmoothing(std::vector<double>, double) const;
  virtual std::vector<double> createConstSmoothing(double) const;
};
|
||||
|
||||
// Scorer for leftright models; class order: right (mono + dright), left (swap + dleft).
class ScorerLR : public Scorer
{
 public:
  virtual std::vector<double> score(std::vector<double>) const;
  virtual std::vector<double> createSmoothing(std::vector<double>, double) const;
  virtual std::vector<double> createConstSmoothing(double) const;
};
|
||||
|
||||
|
||||
//Class for representing each model
|
||||
//Contains a modelscore and scorer (which can be of different model types (mslr, msd...)),
|
||||
//and file handling.
|
||||
//This class also keeps track of bidirectionality, and which language to condition on
|
||||
class Model {
|
||||
private:
|
||||
ModelScore* modelscore;
|
||||
Scorer* scorer;
|
||||
|
||||
std::FILE* file;
|
||||
std::string filename;
|
||||
|
||||
bool fe;
|
||||
bool previous;
|
||||
bool next;
|
||||
|
||||
std::vector<double> smoothing_prev;
|
||||
std::vector<double> smoothing_next;
|
||||
|
||||
static void split_config(const std::string& config, std::string& dir,
|
||||
std::string& lang, std::string& orient);
|
||||
public:
|
||||
Model(ModelScore* ms, Scorer* sc, const std::string& dir,
|
||||
const std::string& lang, const std::string& fn);
|
||||
~Model();
|
||||
static Model* createModel(ModelScore*, const std::string&, const std::string&);
|
||||
void createSmoothing(double w);
|
||||
void createConstSmoothing(double w);
|
||||
void score_fe(const std::string& f, const std::string& e);
|
||||
void score_f(const std::string& f);
|
||||
void zipFile();
|
||||
};
|
||||
|
221
scripts/training/lexical-reordering/score.cpp
Normal file
221
scripts/training/lexical-reordering/score.cpp
Normal file
@ -0,0 +1,221 @@
|
||||
/*
|
||||
* score_reordering.cpp
|
||||
*
|
||||
* Created by: Sara Stymne - Linköping University
|
||||
* Machine Translation Marathon 2010, Dublin
|
||||
*/
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
|
||||
#include "reordering_classes.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
void split_line(const string& line, string& foreign, string& english, string& wbe, string& phrase, string& hier);
|
||||
void get_orientations(const string& pair, string& previous, string& next);
|
||||
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
cerr << "Lexical Reordering Scorer, written by Sara Stymne\n"
|
||||
<< "scores lexical reordering models of several types (hierarchical, phrase-based and word-based-extraction\n";
|
||||
|
||||
if (argc < 3) {
|
||||
cerr << "syntax: score_reordering extractFile smoothingValue filepath (--model \"type max-orientation (specification-strings)\" )+\n";
|
||||
exit(1);
|
||||
}
|
||||
|
||||
char* extractFileName = argv[1];
|
||||
double smoothingValue = atof(argv[2]);
|
||||
string filepath = argv[3];
|
||||
|
||||
ifstream eFile(extractFileName);
|
||||
if (!eFile) {
|
||||
cerr << "Could not open the extract file " << extractFileName <<"for scoring of lexical reordering models\n";
|
||||
exit(1);
|
||||
}
|
||||
|
||||
bool smoothWithCounts = false;
|
||||
map<string,ModelScore*> modelScores;
|
||||
vector<Model*> models;
|
||||
bool hier = false;
|
||||
bool phrase = false;
|
||||
bool wbe = false;
|
||||
|
||||
string e,f,w,p,h;
|
||||
string prev, next;
|
||||
|
||||
int i = 4;
|
||||
while (i<argc) {
|
||||
if (strcmp(argv[i],"--SmoothWithCounts") == 0) {
|
||||
smoothWithCounts = true;
|
||||
} else if (strcmp(argv[i],"--model") == 0) {
|
||||
if (i+1 >= argc){
|
||||
cerr << "score: syntax error, no model information provided to the option" << argv[i] << endl;
|
||||
exit(1);
|
||||
}
|
||||
istringstream is(argv[++i]);
|
||||
string m,t;
|
||||
is >> m >> t;
|
||||
modelScores[m] = ModelScore::createModelScore(t);
|
||||
if (m.compare("hier") == 0) {
|
||||
hier = true;
|
||||
} else if (m.compare("phrase") == 0) {
|
||||
phrase = true;
|
||||
} if (m.compare("wbe") == 0) {
|
||||
wbe = true;
|
||||
}
|
||||
|
||||
if (!hier && !phrase && !wbe) {
|
||||
cerr << "WARNING: No models specified for lexical reordering. No lexical reordering table will be trained.\n";
|
||||
return 0;
|
||||
}
|
||||
|
||||
string config;
|
||||
//Store all models
|
||||
while (is >> config) {
|
||||
models.push_back(Model::createModel(modelScores[m],config,filepath));
|
||||
}
|
||||
} else {
|
||||
cerr << "illegal option given to lexical reordering model score\n";
|
||||
exit(1);
|
||||
}
|
||||
i++;
|
||||
}
|
||||
|
||||
////////////////////////////////////
|
||||
//calculate smoothing
|
||||
if (smoothWithCounts) {
|
||||
string line;
|
||||
while (getline(eFile,line)) {
|
||||
split_line(line,e,f,w,p,h);
|
||||
if (hier) {
|
||||
get_orientations(h, prev, next);
|
||||
modelScores["hier"]->add_example(prev,next);
|
||||
}
|
||||
if (phrase) {
|
||||
get_orientations(p, prev, next);
|
||||
modelScores["phrase"]->add_example(prev,next);
|
||||
}
|
||||
if (wbe) {
|
||||
get_orientations(w, prev, next);
|
||||
modelScores["wbe"]->add_example(prev,next);
|
||||
}
|
||||
}
|
||||
|
||||
// calculate smoothing for each model
|
||||
for (int i=0; i<models.size();++i) {
|
||||
models[i]->createSmoothing(smoothingValue);
|
||||
}
|
||||
|
||||
//reopen eFile
|
||||
eFile.close();
|
||||
eFile.open(extractFileName);
|
||||
}
|
||||
else {
|
||||
//constant smoothing
|
||||
for (int i=0; i<models.size();++i) {
|
||||
models[i]->createConstSmoothing(smoothingValue);
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////
|
||||
//calculate scores for reordering table
|
||||
string line,f_current,e_current;
|
||||
bool first = true;
|
||||
while (getline(eFile, line)) {
|
||||
split_line(line,f,e,w,p,h);
|
||||
|
||||
if (first) {
|
||||
f_current = f;
|
||||
e_current = e;
|
||||
first = false;
|
||||
} else if (f.compare(f_current) != 0 || e.compare(e_current) != 0) {
|
||||
//fe - score
|
||||
for (int i=0; i<models.size();++i) {
|
||||
models[i]->score_fe(f,e);
|
||||
}
|
||||
//reset
|
||||
for(map<string,ModelScore*>::const_iterator it = modelScores.begin(); it != modelScores.end(); ++it) {
|
||||
it->second->reset_fe();
|
||||
}
|
||||
|
||||
if (f.compare(f_current) != 0) {
|
||||
//f - score
|
||||
for (int i=0; i<models.size();++i) {
|
||||
models[i]->score_f(f);
|
||||
}
|
||||
//reset
|
||||
for(map<string,ModelScore*>::const_iterator it = modelScores.begin(); it != modelScores.end(); ++it) {
|
||||
it->second->reset_f();
|
||||
}
|
||||
}
|
||||
f_current = f;
|
||||
e_current = e;
|
||||
}
|
||||
|
||||
// uppdate counts
|
||||
if (hier) {
|
||||
get_orientations(h, prev, next);
|
||||
modelScores["hier"]->add_example(prev,next);
|
||||
}
|
||||
if (phrase) {
|
||||
get_orientations(p, prev, next);
|
||||
modelScores["phrase"]->add_example(prev,next);
|
||||
}
|
||||
if (wbe) {
|
||||
get_orientations(w, prev, next);
|
||||
modelScores["wbe"]->add_example(prev,next);
|
||||
}
|
||||
}
|
||||
//Score the last phrases
|
||||
for (int i=0; i<models.size();++i) {
|
||||
models[i]->score_fe(f,e);
|
||||
}
|
||||
for (int i=0; i<models.size();++i) {
|
||||
models[i]->score_f(f);
|
||||
}
|
||||
|
||||
//Zip all files
|
||||
for (int i=0; i<models.size();++i) {
|
||||
models[i]->zipFile();
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// Splits one extract line of the form
//   foreign ||| english ||| wbe | phrase | hier
// into its five fields. If a delimiter is missing, the rest of the line goes
// into the field being read and all later fields are set to the empty string,
// so a malformed line never yields garbage fields or an out-of-range throw.
void split_line(const std::string& line, std::string& foreign, std::string& english,
                std::string& wbe, std::string& phrase, std::string& hier) {
  static const std::string phraseSep = " ||| ";
  static const std::string orientSep = " | ";

  std::string* const fields[5] = { &foreign, &english, &wbe, &phrase, &hier };

  // std::string::find reports "not found" as npos, which only compares
  // correctly when positions are kept as size_type (not int).
  std::string::size_type begin = 0;
  for (int k = 0; k < 4; ++k) {
    // the two phrases end with " ||| ", the first two orientation pairs with " | "
    const std::string& sep = (k < 2) ? phraseSep : orientSep;
    const std::string::size_type end = line.find(sep, begin);
    if (end == std::string::npos) {
      *fields[k] = line.substr(begin);
      for (int rest = k + 1; rest < 5; ++rest) {
        fields[rest]->clear();
      }
      return;
    }
    *fields[k] = line.substr(begin, end - begin);
    begin = end + sep.size();
  }
  // the last field runs to the end of the line
  hier = line.substr(begin);
}
|
||||
|
||||
// Reads the first two whitespace-separated tokens of `pair` into `previous`
// and `next`, in that order.
void get_orientations(const std::string& pair, std::string& previous, std::string& next)
{
  std::istringstream tokens(pair);
  tokens >> previous;
  tokens >> next;
}
|
@ -21,16 +21,16 @@
|
||||
|
||||
using namespace std;
|
||||
|
||||
#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM) { \
|
||||
_IS.getline(_LINE, _SIZE, _DELIM); \
|
||||
if(_IS.fail() && !_IS.bad() && !_IS.eof()) _IS.clear(); \
|
||||
if (_IS.gcount() == _SIZE-1) { \
|
||||
cerr << "Line too long! Buffer overflow. Delete lines >=" \
|
||||
<< _SIZE << " chars or raise LINE_MAX_LENGTH in phrase-extract/extract.cpp" \
|
||||
<< endl; \
|
||||
exit(1); \
|
||||
} \
|
||||
}
|
||||
#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM) { \
|
||||
_IS.getline(_LINE, _SIZE, _DELIM); \
|
||||
if(_IS.fail() && !_IS.bad() && !_IS.eof()) _IS.clear(); \
|
||||
if (_IS.gcount() == _SIZE-1) { \
|
||||
cerr << "Line too long! Buffer overflow. Delete lines >=" \
|
||||
<< _SIZE << " chars or raise LINE_MAX_LENGTH in phrase-extract/extract.cpp" \
|
||||
<< endl; \
|
||||
exit(1); \
|
||||
} \
|
||||
}
|
||||
#define LINE_MAX_LENGTH 60000
|
||||
|
||||
// HPhraseVertex represents a point in the alignment matrix
|
||||
@ -51,32 +51,32 @@ enum REO_MODEL_TYPE {REO_MSD, REO_MSLR, REO_MONO};
|
||||
enum REO_POS {LEFT, RIGHT, DLEFT, DRIGHT, UNKNOWN};
|
||||
|
||||
class SentenceAlignment {
|
||||
public:
|
||||
vector<string> english;
|
||||
vector<string> foreign;
|
||||
vector<int> alignedCountF;
|
||||
vector< vector<int> > alignedToE;
|
||||
public:
|
||||
vector<string> english;
|
||||
vector<string> foreign;
|
||||
vector<int> alignedCountF;
|
||||
vector< vector<int> > alignedToE;
|
||||
|
||||
int create( char[], char[], char[], int );
|
||||
// void clear() { delete(alignment); };
|
||||
int create( char[], char[], char[], int );
|
||||
// void clear() { delete(alignment); };
|
||||
};
|
||||
|
||||
REO_POS getOrientWordModel(SentenceAlignment &, REO_MODEL_TYPE,
|
||||
int, int, int, int, int, int, int,
|
||||
bool (*)(int, int), bool (*)(int, int));
|
||||
int, int, int, int, int, int, int,
|
||||
bool (*)(int, int), bool (*)(int, int));
|
||||
REO_POS getOrientPhraseModel(REO_MODEL_TYPE,
|
||||
int, int, int, int, int, int, int,
|
||||
bool (*)(int, int), bool (*)(int, int),
|
||||
const HSenteceVertices &, const HSenteceVertices &);
|
||||
int, int, int, int, int, int, int,
|
||||
bool (*)(int, int), bool (*)(int, int),
|
||||
const HSenteceVertices &, const HSenteceVertices &);
|
||||
REO_POS getOrientHierModel(REO_MODEL_TYPE,
|
||||
int, int, int, int, int, int, int,
|
||||
bool (*)(int, int), bool (*)(int, int),
|
||||
const HSenteceVertices &, const HSenteceVertices &,
|
||||
REO_POS);
|
||||
int, int, int, int, int, int, int,
|
||||
bool (*)(int, int), bool (*)(int, int),
|
||||
const HSenteceVertices &, const HSenteceVertices &,
|
||||
REO_POS);
|
||||
|
||||
void insertVertex(HSenteceVertices &, int, int);
|
||||
void insertPhraseVertices(HSenteceVertices &, HSenteceVertices &, HSenteceVertices &, HSenteceVertices &,
|
||||
int, int, int, int);
|
||||
int, int, int, int);
|
||||
string getOrientString(REO_POS, REO_MODEL_TYPE);
|
||||
|
||||
bool ge(int, int);
|
||||
@ -113,439 +113,440 @@ bool properConditioning = false;
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
cerr << "PhraseExtract v1.4, written by Philipp Koehn\n"
|
||||
<< "phrase extraction from an aligned parallel corpus\n";
|
||||
time_t starttime = time(NULL);
|
||||
cerr << "PhraseExtract v1.4, written by Philipp Koehn\n"
|
||||
<< "phrase extraction from an aligned parallel corpus\n";
|
||||
time_t starttime = time(NULL);
|
||||
|
||||
if (argc < 6) {
|
||||
cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] | --OnlyOutputSpanInfo | --NoFileLimit | --ProperConditioning]\n";
|
||||
exit(1);
|
||||
if (argc < 6) {
|
||||
cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] | --OnlyOutputSpanInfo | --NoFileLimit | --ProperConditioning]\n";
|
||||
exit(1);
|
||||
}
|
||||
char* &fileNameE = argv[1];
|
||||
char* &fileNameF = argv[2];
|
||||
char* &fileNameA = argv[3];
|
||||
fileNameExtract = argv[4];
|
||||
maxPhraseLength = atoi(argv[5]);
|
||||
|
||||
for(int i=6;i<argc;i++) {
|
||||
if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) {
|
||||
onlyOutputSpanInfo = true;
|
||||
}
|
||||
else if (strcmp(argv[i],"--NoFileLimit") == 0) {
|
||||
noFileLimit = true;
|
||||
}
|
||||
else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) {
|
||||
orientationFlag = true;
|
||||
}
|
||||
else if(strcmp(argv[i],"--model") == 0){
|
||||
if (i+1 >= argc){
|
||||
cerr << "extract: syntax error, no model's information provided to the option --model " << endl;
|
||||
exit(1);
|
||||
}
|
||||
char* modelParams = argv[++i];
|
||||
char* modelName = strtok(modelParams, "-");
|
||||
char* modelType = strtok(NULL, "-");
|
||||
|
||||
REO_MODEL_TYPE intModelType;
|
||||
|
||||
if(strcmp(modelName, "wbe") == 0){
|
||||
wordModel = true;
|
||||
if(strcmp(modelType, "msd") == 0)
|
||||
wordType = REO_MSD;
|
||||
else if(strcmp(modelType, "mslr") == 0)
|
||||
wordType = REO_MSLR;
|
||||
else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
|
||||
wordType = REO_MONO;
|
||||
else{
|
||||
cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
|
||||
exit(1);
|
||||
}
|
||||
char* &fileNameE = argv[1];
|
||||
char* &fileNameF = argv[2];
|
||||
char* &fileNameA = argv[3];
|
||||
fileNameExtract = argv[4];
|
||||
maxPhraseLength = atoi(argv[5]);
|
||||
|
||||
for(int i=6;i<argc;i++) {
|
||||
if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) {
|
||||
onlyOutputSpanInfo = true;
|
||||
}
|
||||
else if (strcmp(argv[i],"--NoFileLimit") == 0) {
|
||||
noFileLimit = true;
|
||||
}
|
||||
else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) {
|
||||
orientationFlag = true;
|
||||
}
|
||||
else if(strcmp(argv[i],"--model") == 0){
|
||||
if (i+1 >= argc){
|
||||
cerr << "extract: syntax error, no model's information provided to the option --model " << endl;
|
||||
exit(1);
|
||||
}
|
||||
char* modelParams = argv[++i];
|
||||
char* modelName = strtok(modelParams, "-");
|
||||
char* modelType = strtok(NULL, "-");
|
||||
|
||||
REO_MODEL_TYPE intModelType;
|
||||
|
||||
if(strcmp(modelName, "wbe") == 0){
|
||||
wordModel = true;
|
||||
if(strcmp(modelType, "msd") == 0)
|
||||
wordType = REO_MSD;
|
||||
else if(strcmp(modelType, "mslr") == 0)
|
||||
wordType = REO_MSLR;
|
||||
else if(strcmp(modelType, "mono") == 0)
|
||||
wordType = REO_MONO;
|
||||
else{
|
||||
cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
else if(strcmp(modelName, "phrase") == 0){
|
||||
phraseModel = true;
|
||||
if(strcmp(modelType, "msd") == 0)
|
||||
phraseType = REO_MSD;
|
||||
else if(strcmp(modelType, "mslr") == 0)
|
||||
phraseType = REO_MSLR;
|
||||
else if(strcmp(modelType, "mono") == 0)
|
||||
phraseType = REO_MONO;
|
||||
else{
|
||||
cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
else if(strcmp(modelName, "hier") == 0){
|
||||
hierModel = true;
|
||||
if(strcmp(modelType, "msd") == 0)
|
||||
hierType = REO_MSD;
|
||||
else if(strcmp(modelType, "mslr") == 0)
|
||||
hierType = REO_MSLR;
|
||||
else if(strcmp(modelType, "mono") == 0)
|
||||
hierType = REO_MONO;
|
||||
else{
|
||||
cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
else{
|
||||
cerr << "extract: syntax error, unknown reordering model: " << modelName << endl;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
allModelsOutputFlag = true;
|
||||
}
|
||||
else if (strcmp(argv[i],"--ZipFiles") == 0) {
|
||||
zipFiles = true;
|
||||
}
|
||||
else if (strcmp(argv[i],"--ProperConditioning") == 0) {
|
||||
properConditioning = true;
|
||||
}
|
||||
else {
|
||||
cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
else if(strcmp(modelName, "phrase") == 0){
|
||||
phraseModel = true;
|
||||
if(strcmp(modelType, "msd") == 0)
|
||||
phraseType = REO_MSD;
|
||||
else if(strcmp(modelType, "mslr") == 0)
|
||||
phraseType = REO_MSLR;
|
||||
else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
|
||||
phraseType = REO_MONO;
|
||||
else{
|
||||
cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// default reordreing model if no model selected
|
||||
// allows for the old syntax to be used
|
||||
if(orientationFlag && !allModelsOutputFlag){
|
||||
wordModel = true;
|
||||
wordType = REO_MSD;
|
||||
}
|
||||
else if(strcmp(modelName, "hier") == 0){
|
||||
hierModel = true;
|
||||
if(strcmp(modelType, "msd") == 0)
|
||||
hierType = REO_MSD;
|
||||
else if(strcmp(modelType, "mslr") == 0)
|
||||
hierType = REO_MSLR;
|
||||
else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
|
||||
hierType = REO_MONO;
|
||||
else{
|
||||
cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
else{
|
||||
cerr << "extract: syntax error, unknown reordering model: " << modelName << endl;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
ifstream eFile;
|
||||
ifstream fFile;
|
||||
ifstream aFile;
|
||||
eFile.open(fileNameE);
|
||||
fFile.open(fileNameF);
|
||||
aFile.open(fileNameA);
|
||||
istream *eFileP = &eFile;
|
||||
istream *fFileP = &fFile;istream *aFileP = &aFile;
|
||||
allModelsOutputFlag = true;
|
||||
}
|
||||
else if (strcmp(argv[i],"--ZipFiles") == 0) {
|
||||
zipFiles = true;
|
||||
}
|
||||
else if (strcmp(argv[i],"--ProperConditioning") == 0) {
|
||||
properConditioning = true;
|
||||
}
|
||||
else {
|
||||
cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
int i=0;
|
||||
while(true) {
|
||||
i++;
|
||||
if (i%10000 == 0) cerr << "." << flush;
|
||||
char englishString[LINE_MAX_LENGTH];
|
||||
char foreignString[LINE_MAX_LENGTH];
|
||||
char alignmentString[LINE_MAX_LENGTH];
|
||||
SAFE_GETLINE((*eFileP), englishString, LINE_MAX_LENGTH, '\n');
|
||||
if (eFileP->eof()) break;
|
||||
SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n');
|
||||
SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n');
|
||||
SentenceAlignment sentence;
|
||||
// cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl;
|
||||
//az: output src, tgt, and alingment line
|
||||
if (onlyOutputSpanInfo) {
|
||||
cout << "LOG: SRC: " << foreignString << endl;
|
||||
cout << "LOG: TGT: " << englishString << endl;
|
||||
cout << "LOG: ALT: " << alignmentString << endl;
|
||||
cout << "LOG: PHRASES_BEGIN:" << endl;
|
||||
}
|
||||
// default reordering model if no model selected
|
||||
// allows for the old syntax to be used
|
||||
if(orientationFlag && !allModelsOutputFlag){
|
||||
wordModel = true;
|
||||
wordType = REO_MSD;
|
||||
}
|
||||
|
||||
if (sentence.create( englishString, foreignString, alignmentString, i )) {
|
||||
extract(sentence);
|
||||
}
|
||||
if (onlyOutputSpanInfo) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
|
||||
}
|
||||
eFile.close();
|
||||
fFile.close();
|
||||
aFile.close();
|
||||
//az: only close if we actually opened it
|
||||
if (!onlyOutputSpanInfo) {
|
||||
extractFile.close();
|
||||
extractFileInv.close();
|
||||
if (orientationFlag) extractFileOrientation.close();
|
||||
}
|
||||
ifstream eFile;
|
||||
ifstream fFile;
|
||||
ifstream aFile;
|
||||
eFile.open(fileNameE);
|
||||
fFile.open(fileNameF);
|
||||
aFile.open(fileNameA);
|
||||
istream *eFileP = &eFile;
|
||||
istream *fFileP = &fFile;
|
||||
istream *aFileP = &aFile;
|
||||
|
||||
int i=0;
|
||||
while(true) {
|
||||
i++;
|
||||
if (i%10000 == 0) cerr << "." << flush;
|
||||
char englishString[LINE_MAX_LENGTH];
|
||||
char foreignString[LINE_MAX_LENGTH];
|
||||
char alignmentString[LINE_MAX_LENGTH];
|
||||
SAFE_GETLINE((*eFileP), englishString, LINE_MAX_LENGTH, '\n');
|
||||
if (eFileP->eof()) break;
|
||||
SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n');
|
||||
SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n');
|
||||
SentenceAlignment sentence;
|
||||
// cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl;
|
||||
//az: output src, tgt, and alingment line
|
||||
if (onlyOutputSpanInfo) {
|
||||
cout << "LOG: SRC: " << foreignString << endl;
|
||||
cout << "LOG: TGT: " << englishString << endl;
|
||||
cout << "LOG: ALT: " << alignmentString << endl;
|
||||
cout << "LOG: PHRASES_BEGIN:" << endl;
|
||||
}
|
||||
|
||||
if (sentence.create( englishString, foreignString, alignmentString, i )) {
|
||||
extract(sentence);
|
||||
}
|
||||
if (onlyOutputSpanInfo) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
|
||||
}
|
||||
eFile.close();
|
||||
fFile.close();
|
||||
aFile.close();
|
||||
//az: only close if we actually opened it
|
||||
if (!onlyOutputSpanInfo) {
|
||||
extractFile.close();
|
||||
extractFileInv.close();
|
||||
if (orientationFlag) extractFileOrientation.close();
|
||||
}
|
||||
}
|
||||
|
||||
// Extracts all phrase pairs of `sentence` that are consistent with its word
// alignment and hands each one to addPhrase() with its orientation string.
//
// Without a phrase or hierarchical model, each phrase is written as soon as it
// is found, with word-based orientation only. With either model, the phrases
// and their corner points are collected first, because those orientations are
// looked up in the corner-point sets; the phrases are written in a second pass.
void extract(SentenceAlignment &sentence) {
  int countE = static_cast<int>(sentence.english.size());
  int countF = static_cast<int>(sentence.foreign.size());

  // phrases within the length limit, kept for the second pass
  HPhraseVector inboundPhrases;

  // corner points of phrases within the length limit
  HSenteceVertices inTopLeft;
  HSenteceVertices inTopRight;
  HSenteceVertices inBottomLeft;
  HSenteceVertices inBottomRight;

  // corner points of phrases exceeding the length limit
  HSenteceVertices outTopLeft;
  HSenteceVertices outTopRight;
  HSenteceVertices outBottomLeft;
  HSenteceVertices outBottomRight;

  // the hierarchical model also needs the over-long phrases
  bool relaxLimit = hierModel;
  bool buildExtraStructure = phraseModel || hierModel;

  // check alignments for english phrase startE...endE
  // loop over extracted phrases which are compatible with the word-alignments
  for(int startE=0;startE<countE;startE++) {
    for(int endE=startE;
        (endE<countE && (relaxLimit || endE<startE+maxPhraseLength));
        endE++) {

      // foreign span covered by the alignment links of startE...endE;
      // usedF counts, per foreign word, the links that leave the english span
      int minF = 9999;
      int maxF = -1;
      std::vector< int > usedF = sentence.alignedCountF;
      for(int ei=startE;ei<=endE;ei++) {
        for(std::size_t k=0;k<sentence.alignedToE[ei].size();k++) {
          int fi = sentence.alignedToE[ei][k];
          if (fi<minF) { minF = fi; }
          if (fi>maxF) { maxF = fi; }
          usedF[ fi ]--;
        }
      }

      if (!(maxF >= 0 && // aligned to any foreign words at all
            (relaxLimit || maxF-minF < maxPhraseLength))) // foreign phrase within limits
        continue;

      // check if foreign words are aligned to out of bound english words
      bool out_of_bounds = false;
      for(int fi=minF;fi<=maxF && !out_of_bounds;fi++)
        if (usedF[fi]>0) {
          // cout << "out of bounds: " << fi << "\n";
          out_of_bounds = true;
        }

      // cout << "doing if for ( " << minF << "-" << maxF << ", " << startE << "," << endE << ")\n";
      if (out_of_bounds)
        continue;

      // start point of foreign phrase may retreat over unaligned
      for(int startF=minF;
          (startF>=0 &&
           (relaxLimit || startF>maxF-maxPhraseLength) && // within length limit
           (startF==minF || sentence.alignedCountF[startF]==0)); // unaligned
          startF--) {
        // end point of foreign phrase may advance over unaligned
        for(int endF=maxF;
            (endF<countF &&
             (relaxLimit || endF<startF+maxPhraseLength) && // within length limit
             (endF==maxF || sentence.alignedCountF[endF]==0)); // unaligned
            endF++){ // at this point we have extracted a phrase
          if(buildExtraStructure){ // phrase || hier
            if(endE-startE < maxPhraseLength && endF-startF < maxPhraseLength){ // within limit
              inboundPhrases.push_back(
                HPhrase(
                  HPhraseVertex(startF,startE),
                  HPhraseVertex(endF,endE)
                )
              );
              insertPhraseVertices(inTopLeft, inTopRight, inBottomLeft, inBottomRight,
                                   startF, startE, endF, endE);
            }
            else
              insertPhraseVertices(outTopLeft, outTopRight, outBottomLeft, outBottomRight,
                                   startF, startE, endF, endE);
          }
          else{
            std::string orientationInfo = "";
            if(wordModel){
              REO_POS wordPrevOrient, wordNextOrient;
              wordPrevOrient = getOrientWordModel(sentence, wordType, startF, endF, startE, endE, countF, 0, 1, ge, lt);
              wordNextOrient = getOrientWordModel(sentence, wordType, endF, startF, endE, startE, 0, countF, -1, lt, ge);
              orientationInfo += getOrientString(wordPrevOrient, wordType) + " " + getOrientString(wordNextOrient, wordType);
              // with --model the phrase and hier columns are emitted empty
              if(allModelsOutputFlag)
                orientationInfo += " | | ";
            }
            addPhrase(sentence, startE, endE, startF, endF, orientationInfo);
          }
        }
      }
    }
  }

  if(buildExtraStructure){ // phrase || hier
    std::string orientationInfo = "";
    REO_POS wordPrevOrient, wordNextOrient, phrasePrevOrient, phraseNextOrient, hierPrevOrient, hierNextOrient;

    for(std::size_t p = 0; p < inboundPhrases.size(); p++){
      int startF = inboundPhrases[p].first.first;
      int startE = inboundPhrases[p].first.second;
      int endF = inboundPhrases[p].second.first;
      int endE = inboundPhrases[p].second.second;

      if(wordModel){
        wordPrevOrient = getOrientWordModel(sentence, wordType,
                                            startF, endF, startE, endE, countF, 0, 1,
                                            ge, lt);
        wordNextOrient = getOrientWordModel(sentence, wordType,
                                            endF, startF, endE, startE, 0, countF, -1,
                                            lt, ge);
      }
      // always computed: the hierarchical orientation takes it as input
      phrasePrevOrient = getOrientPhraseModel(phraseType, startF, endF, startE, endE, countF-1, 0, 1, ge, lt, inBottomRight, inBottomLeft);
      phraseNextOrient = getOrientPhraseModel(phraseType, endF, startF, endE, startE, 0, countF-1, -1, lt, ge, inBottomLeft, inBottomRight);
      if(hierModel){
        // uses hierType so the computed value matches the type it is printed with
        hierPrevOrient = getOrientHierModel(hierType, startF, endF, startE, endE, countF-1, 0, 1, ge, lt, outBottomRight, outBottomLeft, phrasePrevOrient);
        hierNextOrient = getOrientHierModel(hierType, endF, startF, endE, startE, 0, countF-1, -1, lt, ge, outBottomLeft, outBottomRight, phraseNextOrient);
      }

      // three " | "-separated columns: word, phrase, hier (blank when disabled)
      orientationInfo = ((wordModel)? getOrientString(wordPrevOrient, wordType) + " " + getOrientString(wordNextOrient, wordType) : " ") + " | " +
                        ((phraseModel)? getOrientString(phrasePrevOrient, phraseType) + " " + getOrientString(phraseNextOrient, phraseType) : " ") + " | " +
                        ((hierModel)? getOrientString(hierPrevOrient, hierType) + " " + getOrientString(hierNextOrient, hierType) : " ");

      addPhrase(sentence, startE, endE, startF, endF, orientationInfo);
    }
  }
}
|
||||
|
||||
// Word-based orientation of the phrase (startF..endF, startE..endE).
//
// Checks the alignment points diagonally adjacent to the phrase on the
// english row startE-unit. extract() calls this once with unit=1, zero=0 and
// (ge, lt) for the previous orientation, and once with start/end swapped,
// unit=-1, zero/countF swapped and (lt, ge) for the next orientation.
//
// modelType limits the distinctions made: REO_MONO only separates LEFT from
// UNKNOWN, REO_MSD adds RIGHT, REO_MSLR also searches further along the row
// for DRIGHT / DLEFT.
REO_POS getOrientWordModel(SentenceAlignment & sentence, REO_MODEL_TYPE modelType,
                           int startF, int endF, int startE, int endE, int countF, int zero, int unit,
                           bool (*ge)(int, int), bool (*lt)(int, int) ){

  bool connectedLeftTop  = isAligned( sentence, startF-unit, startE-unit );
  bool connectedRightTop = isAligned( sentence, endF+unit,   startE-unit );
  if( connectedLeftTop && !connectedRightTop)
    return LEFT;
  if(modelType == REO_MONO)
    return UNKNOWN;
  if (!connectedLeftTop && connectedRightTop)
    return RIGHT;
  if(modelType == REO_MSD)
    return UNKNOWN;
  // REO_MSLR: keep scanning the same english row away from the phrase
  for(int indexF=startF-2*unit; (*ge)(indexF, zero) && !connectedLeftTop; indexF=indexF-unit)
    connectedLeftTop = isAligned(sentence, indexF, startE-unit);
  for(int indexF=endF+2*unit; (*lt)(indexF,countF) && !connectedRightTop; indexF=indexF+unit)
    connectedRightTop = isAligned(sentence, indexF, startE-unit);
  if(connectedLeftTop && !connectedRightTop)
    return DRIGHT;
  else if(!connectedLeftTop && connectedRightTop)
    return DLEFT;
  return UNKNOWN;
}
|
||||
|
||||
// to be called with countF-1 instead of countF
|
||||
REO_POS getOrientPhraseModel (REO_MODEL_TYPE modelType,
|
||||
int startF, int endF, int startE, int endE, int countF, int zero, int unit,
|
||||
bool (*ge)(int, int), bool (*le)(int, int),
|
||||
const HSenteceVertices & inBottomRight, const HSenteceVertices & inBottomLeft){
|
||||
int startF, int endF, int startE, int endE, int countF, int zero, int unit,
|
||||
bool (*ge)(int, int), bool (*le)(int, int),
|
||||
const HSenteceVertices & inBottomRight, const HSenteceVertices & inBottomLeft){
|
||||
|
||||
HSenteceVertices::const_iterator it;
|
||||
HSenteceVertices::const_iterator it;
|
||||
|
||||
if((startE == zero && startF == zero) ||
|
||||
(it = inBottomRight.find(startE - unit)) != inBottomRight.end() &&
|
||||
it->second.find(startF-unit) != it->second.end())
|
||||
return LEFT;
|
||||
if(modelType == REO_MONO)
|
||||
return UNKNOWN;
|
||||
if((it = inBottomLeft.find(startE - unit)) != inBottomLeft.end() && it->second.find(endF + unit) != it->second.end())
|
||||
return RIGHT;
|
||||
if(modelType == REO_MSD)
|
||||
return UNKNOWN;
|
||||
bool connectedLeftTop = false;
|
||||
for(int indexF=startF-2*unit; (*ge)(indexF, zero) && !connectedLeftTop; indexF=indexF-unit)
|
||||
if(connectedLeftTop = (it = inBottomRight.find(startE - unit)) != inBottomRight.end() &&
|
||||
it->second.find(indexF) != it->second.end())
|
||||
return DRIGHT;
|
||||
bool connectedRightTop = false;
|
||||
for(int indexF=endF+2*unit; (*le)(indexF, countF) && !connectedRightTop; indexF=indexF+unit)
|
||||
if(connectedRightTop = (it = inBottomLeft.find(startE - unit)) != inBottomRight.end() &&
|
||||
it->second.find(indexF) != it->second.end())
|
||||
return DLEFT;
|
||||
return DRIGHT;
|
||||
if((startE == zero && startF == zero) ||
|
||||
(it = inBottomRight.find(startE - unit)) != inBottomRight.end() &&
|
||||
it->second.find(startF-unit) != it->second.end())
|
||||
return LEFT;
|
||||
if(modelType == REO_MONO)
|
||||
return UNKNOWN;
|
||||
if((it = inBottomLeft.find(startE - unit)) != inBottomLeft.end() && it->second.find(endF + unit) != it->second.end())
|
||||
return RIGHT;
|
||||
if(modelType == REO_MSD)
|
||||
return UNKNOWN;
|
||||
bool connectedLeftTop = false;
|
||||
for(int indexF=startF-2*unit; (*ge)(indexF, zero) && !connectedLeftTop; indexF=indexF-unit)
|
||||
if(connectedLeftTop = (it = inBottomRight.find(startE - unit)) != inBottomRight.end() &&
|
||||
it->second.find(indexF) != it->second.end())
|
||||
return DRIGHT;
|
||||
bool connectedRightTop = false;
|
||||
for(int indexF=endF+2*unit; (*le)(indexF, countF) && !connectedRightTop; indexF=indexF+unit)
|
||||
if(connectedRightTop = (it = inBottomLeft.find(startE - unit)) != inBottomRight.end() &&
|
||||
it->second.find(indexF) != it->second.end())
|
||||
return DLEFT;
|
||||
return DRIGHT;
|
||||
}
|
||||
|
||||
// to be called with countF-1 instead of countF
|
||||
REO_POS getOrientHierModel (REO_MODEL_TYPE modelType,
|
||||
int startF, int endF, int startE, int endE, int countF, int zero, int unit,
|
||||
bool (*ge)(int, int), bool (*le)(int, int),
|
||||
const HSenteceVertices & outBottomRight, const HSenteceVertices & outBottomLeft,
|
||||
REO_POS phraseOrient){
|
||||
int startF, int endF, int startE, int endE, int countF, int zero, int unit,
|
||||
bool (*ge)(int, int), bool (*le)(int, int),
|
||||
const HSenteceVertices & outBottomRight, const HSenteceVertices & outBottomLeft,
|
||||
REO_POS phraseOrient){
|
||||
|
||||
HSenteceVertices::const_iterator it;
|
||||
HSenteceVertices::const_iterator it;
|
||||
|
||||
if(phraseOrient == LEFT || ((it = outBottomRight.find(startE - unit)) != outBottomRight.end() &&
|
||||
it->second.find(startF-unit) != it->second.end()))
|
||||
return LEFT;
|
||||
if(modelType == REO_MONO)
|
||||
return UNKNOWN;
|
||||
if(phraseOrient == RIGHT || ((it = outBottomLeft.find(startE - unit)) != outBottomLeft.end() && it->second.find(endF + unit) != it->second.end()))
|
||||
return RIGHT;
|
||||
if(modelType == REO_MSD)
|
||||
return UNKNOWN;
|
||||
if(phraseOrient == DRIGHT)
|
||||
return DRIGHT;
|
||||
if(phraseOrient == DLEFT)
|
||||
return DLEFT;
|
||||
bool connectedLeftTop = false;
|
||||
for(int indexF=startF-2*unit; (*ge)(indexF, zero) && !connectedLeftTop; indexF=indexF-unit)
|
||||
if(connectedLeftTop = (it = outBottomRight.find(startE - unit)) != outBottomRight.end() &&
|
||||
it->second.find(indexF) != it->second.end())
|
||||
return DRIGHT;
|
||||
bool connectedRightTop = false;
|
||||
for(int indexF=endF+2*unit; (*le)(indexF, countF) && !connectedRightTop; indexF=indexF+unit)
|
||||
if(connectedRightTop = (it = outBottomLeft.find(startE - unit)) != outBottomRight.end() &&
|
||||
it->second.find(indexF) != it->second.end())
|
||||
return DLEFT;
|
||||
return UNKNOWN;
|
||||
if(phraseOrient == LEFT || ((it = outBottomRight.find(startE - unit)) != outBottomRight.end() &&
|
||||
it->second.find(startF-unit) != it->second.end()))
|
||||
return LEFT;
|
||||
if(modelType == REO_MONO)
|
||||
return UNKNOWN;
|
||||
if(phraseOrient == RIGHT || ((it = outBottomLeft.find(startE - unit)) != outBottomLeft.end() && it->second.find(endF + unit) != it->second.end()))
|
||||
return RIGHT;
|
||||
if(modelType == REO_MSD)
|
||||
return UNKNOWN;
|
||||
if(phraseOrient == DRIGHT)
|
||||
return DRIGHT;
|
||||
if(phraseOrient == DLEFT)
|
||||
return DLEFT;
|
||||
bool connectedLeftTop = false;
|
||||
for(int indexF=startF-2*unit; (*ge)(indexF, zero) && !connectedLeftTop; indexF=indexF-unit)
|
||||
if(connectedLeftTop = (it = outBottomRight.find(startE - unit)) != outBottomRight.end() &&
|
||||
it->second.find(indexF) != it->second.end())
|
||||
return DRIGHT;
|
||||
bool connectedRightTop = false;
|
||||
for(int indexF=endF+2*unit; (*le)(indexF, countF) && !connectedRightTop; indexF=indexF+unit)
|
||||
if(connectedRightTop = (it = outBottomLeft.find(startE - unit)) != outBottomRight.end() &&
|
||||
it->second.find(indexF) != it->second.end())
|
||||
return DLEFT;
|
||||
return UNKNOWN;
|
||||
}
|
||||
|
||||
// Returns true when foreign word `fi` is aligned to english word `ei`.
// The positions just before both sentences (-1, -1) and just after both
// sentences (size, size) count as aligned to each other; any other position
// outside either sentence is unaligned.
bool isAligned ( SentenceAlignment &sentence, int fi, int ei ){
  if (ei == -1 && fi == -1)
    return true;
  if (ei <= -1 || fi <= -1)
    return false;

  // both indices are non-negative from here on, so int comparison is safe
  const int countE = static_cast<int>(sentence.english.size());
  const int countF = static_cast<int>(sentence.foreign.size());
  if (ei == countE && fi == countF)
    return true;
  if (ei >= countE || fi >= countF)
    return false;

  for(std::size_t i=0;i<sentence.alignedToE[ei].size();i++)
    if (sentence.alignedToE[ei][i] == fi)
      return true;
  return false;
}
|
||||
|
||||
// Returns true when first >= second. Passed by pointer to the orientation
// functions so the same scan can run in either direction.
bool ge(int first, int second){
  return first >= second;
}
|
||||
|
||||
// Returns true when first <= second.
bool le(int first, int second){
  return first <= second;
}
|
||||
|
||||
// Returns true when first < second. Passed by pointer to the orientation
// functions so the same scan can run in either direction.
bool lt(int first, int second){
  return first < second;
}
|
||||
|
||||
// Records the corner point (x, y) in `corners`: x is added to the set stored
// under key y, and the entry for y is created when it does not exist yet.
void insertVertex( HSenteceVertices & corners, int x, int y ){
  // insert() leaves an existing entry untouched and returns its position,
  // so the same statement handles new and existing keys
  corners.insert( std::pair<int, std::set<int> > (y, std::set<int>()) ).first->second.insert(x);
}
|
||||
|
||||
// Records the four corner points of the phrase (startF..endF, startE..endE),
// one in each of the given vertex sets. "Top" is the english start, "bottom"
// the english end, "left" the foreign start and "right" the foreign end.
void insertPhraseVertices(
  HSenteceVertices & topLeft,
  HSenteceVertices & topRight,
  HSenteceVertices & bottomLeft,
  HSenteceVertices & bottomRight,
  int startF, int startE, int endF, int endE) {

  insertVertex(topLeft, startF, startE);
  insertVertex(topRight, endF, startE);
  insertVertex(bottomLeft, startF, endE);
  insertVertex(bottomRight, endF, endE);
}
|
||||
|
||||
/**
 * Maps an orientation value to the label written to the orientation
 * extract file.
 *
 * LEFT, RIGHT, DRIGHT and DLEFT map to "mono", "swap", "dright" and
 * "dleft" respectively.  For UNKNOWN the label depends on the model type:
 * "nomono" for REO_MONO, "other" for REO_MSD and "dright" for REO_MSLR.
 *
 * @param orient     orientation to translate
 * @param modelType  model type; only consulted when `orient` is UNKNOWN
 * @return the label, or an empty string if neither switch has a matching
 *         case (previously control ran off the end of the function, which
 *         is undefined behaviour for a function returning a value)
 */
string getOrientString(REO_POS orient, REO_MODEL_TYPE modelType){
  switch(orient){
  case LEFT: return "mono";
  case RIGHT: return "swap";
  case DRIGHT: return "dright";
  case DLEFT: return "dleft";
  case UNKNOWN:
    switch(modelType){
    case REO_MONO: return "nomono";
    case REO_MSD: return "other";
    case REO_MSLR: return "dright";
    }
    break;
  }
  // No case matched: return a defined value instead of falling off the end.
  return "";
}
|
||||
|
||||
void addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, int endF , string &orientationInfo) {
|
||||
@ -620,7 +621,7 @@ void addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, i
|
||||
}
|
||||
|
||||
if (orientationFlag)
|
||||
extractFileOrientation << orientationInfo;
|
||||
extractFileOrientation << orientationInfo;
|
||||
|
||||
extractFile << "\n";
|
||||
extractFileInv << "\n";
|
||||
@ -688,7 +689,7 @@ int SentenceAlignment::create( char englishString[], char foreignString[], char
|
||||
cerr << "E: " << englishString << endl << "F: " << foreignString << endl;
|
||||
return 0;
|
||||
}
|
||||
// cout << "alignmentSequence[i] " << alignmentSequence[i] << " is " << f << ", " << e << endl;
|
||||
// cout << "alignmentSequence[i] " << alignmentSequence[i] << " is " << f << ", " << e << endl;
|
||||
if (e >= english.size() || f >= foreign.size()) {
|
||||
cerr << "WARNING: sentence " << sentenceID << " has alignment point (" << f << ", " << e << ") out of bounds (" << foreign.size() << ", " << english.size() << ")\n";
|
||||
cerr << "E: " << englishString << endl << "F: " << foreignString << endl;
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -135,6 +135,7 @@ my $MKCLS = "$BINDIR/mkcls";
|
||||
|
||||
# supporting scripts/binaries from this package
|
||||
my $PHRASE_EXTRACT = "$SCRIPTS_ROOTDIR/training/phrase-extract/extract";
|
||||
my $LEXICAL_REO_SCORER = "$SCRIPTS_ROOTDIR/training/lexical-reordering/score";
|
||||
my $MEMSCORE = "$SCRIPTS_ROOTDIR/training/memscore/memscore";
|
||||
my $SYMAL = "$SCRIPTS_ROOTDIR/training/symal/symal";
|
||||
my $GIZA2BAL = "$SCRIPTS_ROOTDIR/training/symal/giza2bal.pl";
|
||||
@ -296,7 +297,7 @@ foreach my $r (split(/\,/,$___REORDERING)) {
|
||||
#set default values
|
||||
push @REORDERING_MODELS, {};
|
||||
$REORDERING_MODELS[$model_num]{"dir"} = "backward";
|
||||
$REORDERING_MODELS[$model_num]{"type"} = "word";
|
||||
$REORDERING_MODELS[$model_num]{"type"} = "wbe";
|
||||
$REORDERING_MODELS[$model_num]{"collapse"} = "allff";
|
||||
|
||||
#handle the options set in the config string
|
||||
@ -305,27 +306,19 @@ foreach my $r (split(/\,/,$___REORDERING)) {
|
||||
$REORDERING_LEXICAL = 0;
|
||||
next;
|
||||
}
|
||||
if ($reoconf =~ /(msd)|(mslr)|(monotonicity)|(leftright)/) {
|
||||
if ($reoconf =~ /^((msd)|(mslr)|(monotonicity)|(leftright))/) {
|
||||
$REORDERING_MODELS[$model_num]{"orient"} = $reoconf;
|
||||
}
|
||||
elsif ($reoconf =~ /((bidirectional)|(backward)|(forward))/) {
|
||||
elsif ($reoconf =~ /^((bidirectional)|(backward)|(forward))/) {
|
||||
$REORDERING_MODELS[$model_num]{"dir"} = $reoconf;
|
||||
}
|
||||
elsif ($reoconf =~ /^(fe)|(f))/) {
|
||||
elsif ($reoconf =~ /^((fe)|(f))/) {
|
||||
$REORDERING_MODELS[$model_num]{"lang"} = $reoconf;
|
||||
}
|
||||
elsif ($reoconf =~ /(hier)|(phrase)|(word)/) {
|
||||
if ($model_num == 0) {
|
||||
$reotype = $reoconf;
|
||||
}
|
||||
elsif ($reotype ne $reoconf) {
|
||||
#TODO: update extract to make it possible to have more types of model (return the options for all models used!!)
|
||||
print STDERR "you are not allowed to use more than one reordering model type, now using: $reotype and $reoconf";
|
||||
exit(1);
|
||||
}
|
||||
elsif ($reoconf =~ /^((hier)|(phrase)|(wbe))/) {
|
||||
$REORDERING_MODELS[$model_num]{"type"} = $reoconf;
|
||||
}
|
||||
elsif ($reoconf =~ /(collapseff)|(allff)/) {
|
||||
elsif ($reoconf =~ /^((collapseff)|(allff))/) {
|
||||
$REORDERING_MODELS[$model_num]{"collapse"} = $reoconf;
|
||||
}
|
||||
else {
|
||||
@ -333,8 +326,18 @@ foreach my $r (split(/\,/,$___REORDERING)) {
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
#check that the required attributes are given
|
||||
if (!defined($REORDERING_MODELS[$model_num]{"type"})) {
|
||||
print STDERR "you have to give the type of the reordering models (mslr, msd, monotonicity or leftright); it is not done in $r\n";
|
||||
exit(1);
|
||||
}
|
||||
if (!defined($REORDERING_MODELS[$model_num]{"lang"})) {
|
||||
print STDERR "you have specify which languages to condition on (f or fe); it is not done in $r\n";
|
||||
exit(1);
|
||||
}
|
||||
|
||||
#fix the all-string
|
||||
$REORDERING_MODELS[$model_num]{"all"} = $REORDERING_MODELS[$model_num]{"orient"}.'-'.$REORDERING_MODELS[$model_num]{"dir"}."-".$REORDERING_MODELS[$model_num]{"lang"}."-".$REORDERING_MODELS[$model_num]{"type"}."-".$REORDERING_MODELS[$model_num]{"collapse"};
|
||||
$REORDERING_MODELS[$model_num]{"all"} = $REORDERING_MODELS[$model_num]{"type"}."-".$REORDERING_MODELS[$model_num]{"orient"}.'-'.$REORDERING_MODELS[$model_num]{"dir"}."-".$REORDERING_MODELS[$model_num]{"lang"}."-".$REORDERING_MODELS[$model_num]{"collapse"};
|
||||
|
||||
# fix numfeatures
|
||||
$REORDERING_MODELS[$model_num]{"numfeatures"} = 1;
|
||||
@ -365,7 +368,7 @@ foreach my $r (split(/\,/,$___REORDERING)) {
|
||||
|
||||
# pick the overall most specific model for each reordering model type
|
||||
for my $mtype ( keys %REORDERING_MODEL_TYPES) {
|
||||
if ($REORDERING_MODEL_TYPES{$mtype} =~ /lr/) {
|
||||
if ($REORDERING_MODEL_TYPES{$mtype} =~ /(mslr)|(leftright)/) {
|
||||
$REORDERING_MODEL_TYPES{$mtype} = "mslr"
|
||||
}
|
||||
elsif ($REORDERING_MODEL_TYPES{$mtype} =~ /msd/) {
|
||||
@ -376,6 +379,7 @@ for my $mtype ( keys %REORDERING_MODEL_TYPES) {
|
||||
}
|
||||
}
|
||||
|
||||
#TODO - remove the below
|
||||
my ($mono_previous_f,$swap_previous_f,$left_previous_f,$right_previous_f,$other_previous_f);
|
||||
my ($mono_previous_fe,$swap_previous_fe,$left_previous_fe,$right_previous_fe,$other_previous_fe);
|
||||
my ($mono_following_f,$swap_following_f,$left_following_f,$right_following_f,$other_following_f);
|
||||
@ -1084,7 +1088,6 @@ sub get_extract_reordering_flags {
|
||||
$config_string .= " --model $type-".$REORDERING_MODEL_TYPES{$type};
|
||||
}
|
||||
print STDERR "extract-flags: $config_string\n";
|
||||
return ""; #comment out when using new training scripts (do we need an option for backward compatibility???
|
||||
return $config_string;
|
||||
}
|
||||
|
||||
@ -1325,38 +1328,34 @@ sub score_phrase_memscore {
|
||||
|
||||
sub get_reordering_factored {
|
||||
print STDERR "(7) learn reordering model @ ".`date`;
|
||||
my @SPECIFIED_TABLE = @_REORDERING_TABLE;
|
||||
# my @TYPE = ("msd-f","msd-fe","msd-bidirectional-f","msd-bidirectional-fe","monotonicity-f","monotonicity-fe","monotonicity-bidirectional-f","monotonicity-bidirectional-fe");
|
||||
|
||||
#This @REORDERING_TABLE is now not used. Did anyone use it???
|
||||
# my @SPECIFIED_TABLE = @_REORDERING_TABLE;
|
||||
if (scalar(@_REORDERING_TABLE)) {
|
||||
print STDERR "WARNING: you specified -reordering-table. That feature is not implemented in this version of train-factored-phrase-model.perl. Standard file names will be used.\n";
|
||||
}
|
||||
if ($REORDERING_LEXICAL) {
|
||||
if ($___NOT_FACTORED) {
|
||||
# my %FILE;
|
||||
foreach my $model (@REORDERING_MODELS) {
|
||||
# if (defined($REORDERING_MODELS{$type})) {
|
||||
my $file = "$___MODEL_DIR/reordering-table.";
|
||||
$file .= $model->{"all"};
|
||||
#$file .= ".$type" if (scalar keys %REORDERING_MODELS) > 2;
|
||||
$file = shift @SPECIFIED_TABLE if scalar(@SPECIFIED_TABLE);
|
||||
$model->{"file"} = $file;
|
||||
# }
|
||||
}
|
||||
&get_reordering($___EXTRACT_FILE);
|
||||
print STDERR "(7.1) [no factors] learn reordering model @ ".`date`;
|
||||
# foreach my $model (@REORDERING_MODELS) {
|
||||
# #my $file = "$___MODEL_DIR/reordering-table.";
|
||||
# $file .= $model->{"all"};
|
||||
# #$file = shift @SPECIFIED_TABLE if scalar(@SPECIFIED_TABLE);
|
||||
# $model->{"file"} = $file;
|
||||
# }
|
||||
&get_reordering($___EXTRACT_FILE,"$___MODEL_DIR/reordering-table.");
|
||||
}
|
||||
else {
|
||||
foreach my $factor (split(/\+/,$___REORDERING_FACTORS)) {
|
||||
print STDERR "(7) [$factor] learn reordering model @ ".`date`;
|
||||
print STDERR "(7.1) [$factor] learn reordering model @ ".`date`;
|
||||
my ($factor_f,$factor_e) = split(/\-/,$factor);
|
||||
# my %FILE;
|
||||
foreach my $model (@REORDERING_MODELS) { #$type (@TYPE) {
|
||||
#if (defined($REORDERING_MODELS{$type})) {
|
||||
my $file = "$___MODEL_DIR/reordering-table.$factor";
|
||||
#$file .= ".$type" if (scalar keys %REORDERING_MODELS) > 2;
|
||||
$file .= $model->{"all"};
|
||||
$file = shift @SPECIFIED_TABLE if scalar(@SPECIFIED_TABLE);
|
||||
$model->{"file"} = $file;
|
||||
}
|
||||
|
||||
# &get_reordering(\%FILE,"$___EXTRACT_FILE.$factor");
|
||||
&get_reordering("$___EXTRACT_FILE.$factor");
|
||||
# foreach my $model (@REORDERING_MODELS) {
|
||||
# my $file = "$___MODEL_DIR/reordering-table.$factor";
|
||||
# $file .= $model->{"all"};
|
||||
# $file = shift @SPECIFIED_TABLE if scalar(@SPECIFIED_TABLE);
|
||||
# $model->{"file"} = $file;
|
||||
# }
|
||||
&get_reordering("$___EXTRACT_FILE.$factor","$___MODEL_DIR/reordering-table.$factor");
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1366,8 +1365,7 @@ sub get_reordering_factored {
|
||||
}
|
||||
|
||||
sub get_reordering {
|
||||
#my ($MODEL_FILE,$extract_file) = @_;
|
||||
my ($extract_file) = @_;
|
||||
my ($extract_file,$reo_model_path) = @_;
|
||||
if (-e "$extract_file.o.gz") {
|
||||
safesystem("gunzip < $extract_file.o.gz | LC_ALL=C sort -T $___TEMP_DIR > $extract_file.o.sorted") or die("ERROR");
|
||||
}
|
||||
@ -1376,233 +1374,28 @@ sub get_reordering {
|
||||
}
|
||||
|
||||
my $smooth = $___REORDERING_SMOOTH;
|
||||
@REORDERING_SMOOTH_PREVIOUS = ($smooth,$smooth,$smooth,$smooth,$smooth);
|
||||
@REORDERING_SMOOTH_FOLLOWING = ($smooth,$smooth,$smooth,$smooth,$smooth);
|
||||
|
||||
my (%SMOOTH_PREVIOUS,%SMOOTH_FOLLOWING);
|
||||
if ($smooth =~ /(.+)u$/) {
|
||||
$smooth = $1;
|
||||
my $smooth_total = 0;
|
||||
open(O,"$extract_file.o.sorted")
|
||||
or die "ERROR: Can't read $extract_file.o.sorted";
|
||||
while(<O>) {
|
||||
chomp;
|
||||
my ($f,$e,$o) = split(/ \|\|\| /);
|
||||
my ($o_previous,$o_following) = split(/ /,$o);
|
||||
$SMOOTH_PREVIOUS{$o_previous}++;
|
||||
$SMOOTH_FOLLOWING{$o_following}++;
|
||||
$smooth_total++;
|
||||
}
|
||||
close(O);
|
||||
@REORDERING_SMOOTH_PREVIOUS = ($smooth*($SMOOTH_PREVIOUS{"mono"}+0.1)/$smooth_total,
|
||||
$smooth*($SMOOTH_PREVIOUS{"swap"}+0.1)/$smooth_total,
|
||||
$smooth*($SMOOTH_PREVIOUS{"left"}+0.1)/$smooth_total,
|
||||
$smooth*($SMOOTH_PREVIOUS{"right"}+0.1)/$smooth_total,
|
||||
$smooth*($SMOOTH_PREVIOUS{"other"}+0.1)/$smooth_total);
|
||||
@REORDERING_SMOOTH_FOLLOWING = ($smooth*($SMOOTH_FOLLOWING{"mono"}+0.1)/$smooth_total,
|
||||
$smooth*($SMOOTH_FOLLOWING{"swap"}+0.1)/$smooth_total,
|
||||
$smooth*($SMOOTH_FOLLOWING{"left"}+0.1)/$smooth_total,
|
||||
$smooth*($SMOOTH_FOLLOWING{"right"}+0.1)/$smooth_total,
|
||||
$smooth*($SMOOTH_FOLLOWING{"other"}+0.1)/$smooth_total);
|
||||
printf "$smooth*($SMOOTH_FOLLOWING{mono}+0.1)/$smooth_total,
|
||||
$smooth*($SMOOTH_FOLLOWING{swap}+0.1)/$smooth_total,
|
||||
$smooth*($SMOOTH_FOLLOWING{other}+0.1)/$smooth_total\n";
|
||||
printf "smoothed following to %f,%f,%f\n",@REORDERING_SMOOTH_FOLLOWING;
|
||||
}
|
||||
|
||||
($mono_previous_f,$swap_previous_f,$left_previous_f,$right_previous_f,$other_previous_f) = (0,0,0,0,0); #@REORDERING_SMOOTH_PREVIOUS;
|
||||
($mono_previous_fe,$swap_previous_fe,$left_previous_fe,$right_previous_fe,$other_previous_fe) = (0,0,0,0,0); #@REORDERING_SMOOTH_PREVIOUS;
|
||||
($mono_following_f,$swap_following_f,$left_following_f,$right_following_f,$other_following_f) = (0,0,0,0,0); #@REORDERING_SMOOTH_FOLLOWING;
|
||||
($mono_following_fe,$swap_following_fe,$left_following_fe,$right_following_fe,$other_following_fe) = (0,0,0,0,0); #@REORDERING_SMOOTH_FOLLOWING;
|
||||
|
||||
print STDERR "(7.2) building tables @ ".`date`;
|
||||
open(O,"$extract_file.o.sorted")
|
||||
or die "ERROR: Can't read $extract_file.o.sorted";
|
||||
|
||||
foreach my $model ( @REORDERING_MODELS ) {
|
||||
local *FILE;
|
||||
open(FILE, "|gzip >".$model->{"file"}.".gz");
|
||||
$model->{"filehandle"} = *FILE;
|
||||
}
|
||||
|
||||
my $first = 1;
|
||||
while(<O>) {
|
||||
chomp;
|
||||
my ($f,$e,$o) = split(/ \|\|\| /);
|
||||
my ($o_previous,$o_following) = split(/ /,$o);
|
||||
|
||||
# store counts if new f,e
|
||||
if ($first) {
|
||||
$f_current = $f;
|
||||
$e_current = $e;
|
||||
$first = 0;
|
||||
}
|
||||
elsif ($f ne $f_current || $e ne $e_current) {
|
||||
|
||||
#always store the counts for both directions.
|
||||
# if (defined($REORDERING_MODELS{"fe"})) {
|
||||
# compute probs, store them
|
||||
&store_reordering_fe();
|
||||
|
||||
# reset counters
|
||||
($mono_previous_fe,$swap_previous_fe,$left_previous_fe,$right_previous_fe,$other_previous_fe) = (0,0,0,0,0);
|
||||
($mono_following_fe,$swap_following_fe,$left_following_fe,$right_following_fe,$other_following_fe) = (0,0,0,0,0);
|
||||
# }
|
||||
|
||||
# store counts if new f
|
||||
if ($f ne $f_current) { # && defined($REORDERING_MODELS{"f"})) {
|
||||
|
||||
# compute probs, store them
|
||||
&store_reordering_f();
|
||||
|
||||
# reset counters
|
||||
($mono_previous_f,$swap_previous_f,$left_previous_f,$right_previous_f,$other_previous_f) = (0,0,0,0,0);
|
||||
($mono_following_f,$swap_following_f,$left_following_f,$right_following_f,$other_following_fe) = (0,0,0,0,0);
|
||||
|
||||
#create cmd string for lexical reordering scoring
|
||||
my $cmd = "$LEXICAL_REO_SCORER $extract_file.o.sorted $smooth $reo_model_path";
|
||||
$cmd .= " --SmoothWithCounts" if ($smooth =~ /(.+)u$/);
|
||||
for my $mtype (keys %REORDERING_MODEL_TYPES) {
|
||||
$cmd .= " --model \"$mtype $REORDERING_MODEL_TYPES{$mtype}";
|
||||
foreach my $model (@REORDERING_MODELS) {
|
||||
if ($model->{"type"} eq $mtype) {
|
||||
$cmd .= " ".$model->{"all"};
|
||||
}
|
||||
$f_current = $f;
|
||||
$e_current = $e;
|
||||
}
|
||||
# update counts
|
||||
if ($o_previous eq 'mono') { $mono_previous_f++; $mono_previous_fe++; }
|
||||
elsif ($o_previous eq 'swap') { $swap_previous_f++; $swap_previous_fe++; }
|
||||
elsif ($o_previous eq 'left'){ $left_previous_f++; $left_previous_fe++; }
|
||||
elsif ($o_previous eq 'right'){ $right_previous_f++; $right_previous_fe++; }
|
||||
#keep other option for backward compatibility
|
||||
elsif ($o_previous eq 'other'){ $other_previous_f++; $other_previous_fe++; }
|
||||
else { print STDERR "buggy line (o_previous:$o_previous): $_\n"; }
|
||||
|
||||
if ($o_following eq 'mono') { $mono_following_f++; $mono_following_fe++; }
|
||||
elsif ($o_following eq 'swap') { $swap_following_f++; $swap_following_fe++; }
|
||||
elsif ($o_following eq 'left') { $left_following_f++; $left_following_fe++; }
|
||||
elsif ($o_following eq 'right') { $right_previous_f++; $right_previous_fe++; }
|
||||
#keep other option for backward compatibility
|
||||
elsif ($o_following eq 'other') { $other_previous_f++; $other_previous_fe++; }
|
||||
else { print STDERR "buggy line (o_following:$o_following): $_\n"; }
|
||||
|
||||
}
|
||||
$cmd .= "\"";
|
||||
}
|
||||
&store_reordering_f();
|
||||
&store_reordering_fe();
|
||||
|
||||
#Call the lexical reordering scorer
|
||||
safesystem("$cmd") or die "ERROR: Lexical reordering scoring failed";
|
||||
|
||||
if (! $debug) { safesystem("rm $extract_file.o.sorted") or die("ERROR");}
|
||||
}
|
||||
|
||||
# Writes one table line for the current foreign phrase ($f_current) to the
# file handle of every reordering model whose "lang" is "f".
#
# Reads the file-level "previous" orientation counters
# ($mono_previous_f, $swap_previous_f, $left_previous_f, $right_previous_f,
# $other_previous_f) and the smoothing values in @REORDERING_SMOOTH_PREVIOUS
# (indices: 0 mono, 1 swap, 2 left, 3 right, 4 other).
#
# Line format: "<f> ||| p1 p2 ...\n", with one %g-formatted value per
# orientation class of the model:
#   mslr          mono, swap, left, right
#   msd           mono, swap, other
#   monotonicity  mono, swap+left+right
#   leftright     mono+left, swap+right
# Models with any other "orient" value are skipped.
#
# The denominator is computed per model.  It used to be accumulated with
# "+=" in a variable shared by all loop iterations, so every "f" model
# after the first was divided by a total that still contained the
# smoothing mass of the models before it.
sub store_reordering_f {
    # Raw count mass shared by all models (all five counters, as before).
    my $count_total = $mono_previous_f + $swap_previous_f + $left_previous_f
                    + $right_previous_f + $other_previous_f;

    foreach my $model (@REORDERING_MODELS) {
        next if ($model->{"lang"} ne "f");

        my $orient = $model->{"orient"};
        my @numerators;     # smoothed count per orientation class
        my $smooth_mass;    # total smoothing added for this model

        if ($orient eq "mslr") {
            @numerators = ($mono_previous_f  + $REORDERING_SMOOTH_PREVIOUS[0],
                           $swap_previous_f  + $REORDERING_SMOOTH_PREVIOUS[1],
                           $left_previous_f  + $REORDERING_SMOOTH_PREVIOUS[2],
                           $right_previous_f + $REORDERING_SMOOTH_PREVIOUS[3]);
            $smooth_mass = $REORDERING_SMOOTH_PREVIOUS[0] + $REORDERING_SMOOTH_PREVIOUS[1]
                         + $REORDERING_SMOOTH_PREVIOUS[2] + $REORDERING_SMOOTH_PREVIOUS[3];
        }
        elsif ($orient eq "msd") {
            @numerators = ($mono_previous_f  + $REORDERING_SMOOTH_PREVIOUS[0],
                           $swap_previous_f  + $REORDERING_SMOOTH_PREVIOUS[1],
                           $other_previous_f + $REORDERING_SMOOTH_PREVIOUS[4]);
            $smooth_mass = $REORDERING_SMOOTH_PREVIOUS[0] + $REORDERING_SMOOTH_PREVIOUS[1]
                         + $REORDERING_SMOOTH_PREVIOUS[4];
        }
        elsif ($orient eq "monotonicity") {
            @numerators = ($mono_previous_f + $REORDERING_SMOOTH_PREVIOUS[0],
                           $swap_previous_f + $left_previous_f + $right_previous_f
                               + $REORDERING_SMOOTH_PREVIOUS[1]);
            $smooth_mass = $REORDERING_SMOOTH_PREVIOUS[0] + $REORDERING_SMOOTH_PREVIOUS[1];
        }
        elsif ($orient eq "leftright") {
            @numerators = ($mono_previous_f + $left_previous_f  + $REORDERING_SMOOTH_PREVIOUS[0],
                           $swap_previous_f + $right_previous_f + $REORDERING_SMOOTH_PREVIOUS[1]);
            $smooth_mass = $REORDERING_SMOOTH_PREVIOUS[0] + $REORDERING_SMOOTH_PREVIOUS[1];
        }
        else {
            next;   # unknown orientation set: nothing is written
        }

        my $total = $count_total + $smooth_mass;
        printf { $model->{"filehandle"} } ("%s ||| %s\n",
            $f_current,
            join(" ", map { sprintf("%g", $_ / $total) } @numerators));
    }
}
|
||||
|
||||
# Writes one table line for the current phrase pair ($f_current,
# $e_current) to the file handle of every reordering model whose "lang"
# is "fe".
#
# Reads the file-level *_previous_fe / *_following_fe orientation counters
# and the smoothing values in @REORDERING_SMOOTH_PREVIOUS and
# @REORDERING_SMOOTH_FOLLOWING (indices: 0 mono, 1 swap, 2 left, 3 right,
# 4 other).
#
# Line format: "<f> ||| <e> ||| <previous values> <following values>\n".
# The following values are written only when the model's "dir" is
# "bidirectional"; the space after the previous values is always written.
# Orientation classes per model:
#   mslr          mono, swap, left, right
#   msd           mono, swap, other
#   monotonicity  mono, swap+left+right
#   leftright     mono+left, swap+right
# Models with any other "orient" value are skipped.
#
# Changes from the earlier version:
#  * Denominators are computed per model.  They used to be accumulated
#    with "+=" across loop iterations, so later "fe" models were divided
#    by totals that still contained earlier models' smoothing mass.
#  * Each denominator is the sum of exactly the values printed.  For msd
#    the old total left out the "other" counts that its numerator uses.
#  * The monotonicity line now has "|||" between f and e, like every
#    other branch.
sub store_reordering_fe {
    foreach my $model (@REORDERING_MODELS) {
        next if ($model->{"lang"} ne "fe");

        my $orient = $model->{"orient"};
        my (@previous, @following);   # smoothed counts per orientation class

        if ($orient eq "mslr") {
            @previous  = ($mono_previous_fe  + $REORDERING_SMOOTH_PREVIOUS[0],
                          $swap_previous_fe  + $REORDERING_SMOOTH_PREVIOUS[1],
                          $left_previous_fe  + $REORDERING_SMOOTH_PREVIOUS[2],
                          $right_previous_fe + $REORDERING_SMOOTH_PREVIOUS[3]);
            @following = ($mono_following_fe  + $REORDERING_SMOOTH_FOLLOWING[0],
                          $swap_following_fe  + $REORDERING_SMOOTH_FOLLOWING[1],
                          $left_following_fe  + $REORDERING_SMOOTH_FOLLOWING[2],
                          $right_following_fe + $REORDERING_SMOOTH_FOLLOWING[3]);
        }
        elsif ($orient eq "msd") {
            @previous  = ($mono_previous_fe  + $REORDERING_SMOOTH_PREVIOUS[0],
                          $swap_previous_fe  + $REORDERING_SMOOTH_PREVIOUS[1],
                          $other_previous_fe + $REORDERING_SMOOTH_PREVIOUS[4]);
            @following = ($mono_following_fe  + $REORDERING_SMOOTH_FOLLOWING[0],
                          $swap_following_fe  + $REORDERING_SMOOTH_FOLLOWING[1],
                          $other_following_fe + $REORDERING_SMOOTH_FOLLOWING[4]);
        }
        elsif ($orient eq "monotonicity") {
            @previous  = ($mono_previous_fe + $REORDERING_SMOOTH_PREVIOUS[0],
                          $swap_previous_fe + $left_previous_fe + $right_previous_fe
                              + $REORDERING_SMOOTH_PREVIOUS[1]);
            @following = ($mono_following_fe + $REORDERING_SMOOTH_FOLLOWING[0],
                          $swap_following_fe + $left_following_fe + $right_following_fe
                              + $REORDERING_SMOOTH_FOLLOWING[1]);
        }
        elsif ($orient eq "leftright") {
            @previous  = ($mono_previous_fe + $left_previous_fe  + $REORDERING_SMOOTH_PREVIOUS[0],
                          $swap_previous_fe + $right_previous_fe + $REORDERING_SMOOTH_PREVIOUS[1]);
            @following = ($mono_following_fe + $left_following_fe  + $REORDERING_SMOOTH_FOLLOWING[0],
                          $swap_following_fe + $right_following_fe + $REORDERING_SMOOTH_FOLLOWING[1]);
        }
        else {
            next;   # unknown orientation set: nothing is written
        }

        # Normalise the "previous" block over its own mass.
        my $total_previous = 0;
        $total_previous += $_ foreach @previous;
        my $line = sprintf("%s ||| %s ||| ", $f_current, $e_current);
        $line .= join(" ", map { sprintf("%g", $_ / $total_previous) } @previous) . " ";

        # The "following" block is only needed (and only divided) when the
        # model is bidirectional.
        if ($model->{"dir"} eq "bidirectional") {
            my $total_following = 0;
            $total_following += $_ foreach @following;
            $line .= join(" ", map { sprintf("%g", $_ / $total_following) } @following);
        }

        print { $model->{"filehandle"} } $line . "\n";
    }
}
|
||||
|
||||
|
||||
### (8) LEARN GENERATION MODEL
|
||||
@ -1703,7 +1496,7 @@ sub create_ini {
|
||||
|
||||
if (defined $___TRANSLATION_FACTORS) {
|
||||
print INI "# input factors\n";
|
||||
print INI "[input-factors]\n";
|
||||
print INI "[input-factors]\n";
|
||||
my $INPUT_FACTOR_MAX = 0;
|
||||
foreach my $table (split /\+/, $___TRANSLATION_FACTORS) {
|
||||
my ($factor_list, $output) = split /-+/, $table;
|
||||
@ -1798,25 +1591,16 @@ print INI "\n\n\# limit on how many phrase translations e for each phrase f are
|
||||
my $file = "# distortion (reordering) files\n\[distortion-file]\n";
|
||||
my $factor_i = 0;
|
||||
|
||||
my @SPECIFIED_TABLE = @_REORDERING_TABLE;
|
||||
#my @SPECIFIED_TABLE = @_REORDERING_TABLE;
|
||||
foreach my $factor (split(/\+/,$___REORDERING_FACTORS)) {
|
||||
# foreach my $type (keys %REORDERING_MODELS) {
|
||||
foreach my $model (@REORDERING_MODELS) {
|
||||
# next if $type eq "fe" || $type eq "f";
|
||||
# next if $type eq "distance";
|
||||
# my $w;
|
||||
# if ($type =~ /msd/) { $w = 3; } else { $w = 1; }
|
||||
# if ($type =~ /bi/) { $w *= 2; }
|
||||
$weight_d_count += $model->{"numfeatures"};
|
||||
|
||||
my $table_file = "$___MODEL_DIR/reordering-table.";
|
||||
$table_file .= ".$factor" unless $___NOT_FACTORED;
|
||||
# $table_file .= ".$type" if (scalar keys %REORDERING_MODELS) > 2;
|
||||
$table_file .= $model->{"all"};
|
||||
$table_file .= ".gz";
|
||||
$table_file = shift @SPECIFIED_TABLE if scalar(@SPECIFIED_TABLE);
|
||||
#$type =~ s/\-f/\-unidirectional\-f/ unless $type =~ /\-bi/;
|
||||
$file .= ".$factor ".$model->{"all"}." ".$model->{"numfeatures"}." $table_file\n";
|
||||
$weight_d_count += $model->{"numfeatures"};
|
||||
my $table_file = "$___MODEL_DIR/reordering-table.";
|
||||
$table_file .= ".$factor" unless $___NOT_FACTORED;
|
||||
$table_file .= $model->{"all"};
|
||||
$table_file .= ".gz";
|
||||
#$table_file = shift @SPECIFIED_TABLE if scalar(@SPECIFIED_TABLE);
|
||||
$file .= ".$factor ".$model->{"all"}." ".$model->{"numfeatures"}." $table_file\n";
|
||||
}
|
||||
$factor_i++;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user