lex reordering scoring uses FilePiece/StringPiece

This commit is contained in:
Barry Haddow 2013-01-09 17:38:48 +00:00
parent 1530ae4f5f
commit 2e8bad22e4
4 changed files with 101 additions and 40 deletions

View File

@ -1,2 +1,2 @@
exe lexical-reordering-score : InputFileStream.cpp reordering_classes.cpp score.cpp ../..//z ;
exe lexical-reordering-score : InputFileStream.cpp reordering_classes.cpp score.cpp ../../util//kenutil ../..//z ;

View File

@ -56,7 +56,7 @@ void ModelScore::reset_f()
}
}
void ModelScore::add_example(const string& previous, string& next)
void ModelScore::add_example(const StringPiece& previous, const StringPiece& next)
{
count_fe_prev[getType(previous)]++;
count_f_prev[getType(previous)]++;
@ -85,7 +85,7 @@ const vector<double>& ModelScore::get_scores_f_next() const
}
ORIENTATION ModelScore::getType(const string& s)
ORIENTATION ModelScore::getType(const StringPiece& s)
{
if (s.compare("mono") == 0) {
return MONO;
@ -106,7 +106,7 @@ ORIENTATION ModelScore::getType(const string& s)
}
ORIENTATION ModelScoreMSLR::getType(const string& s)
ORIENTATION ModelScoreMSLR::getType(const StringPiece& s)
{
if (s.compare("mono") == 0) {
return MONO;
@ -126,7 +126,7 @@ ORIENTATION ModelScoreMSLR::getType(const string& s)
}
ORIENTATION ModelScoreLR::getType(const string& s)
ORIENTATION ModelScoreLR::getType(const StringPiece& s)
{
if (s.compare("mono") == 0 || s.compare("dright") == 0) {
return DRIGHT;
@ -142,7 +142,7 @@ ORIENTATION ModelScoreLR::getType(const string& s)
}
ORIENTATION ModelScoreMSD::getType(const string& s)
ORIENTATION ModelScoreMSD::getType(const StringPiece& s)
{
if (s.compare("mono") == 0) {
return MONO;
@ -161,7 +161,7 @@ ORIENTATION ModelScoreMSD::getType(const string& s)
}
}
ORIENTATION ModelScoreMonotonicity::getType(const string& s)
ORIENTATION ModelScoreMonotonicity::getType(const StringPiece& s)
{
if (s.compare("mono") == 0) {
return MONO;

View File

@ -12,6 +12,8 @@
#include <string>
#include <fstream>
#include "util/string_piece.hh"
enum ORIENTATION {MONO, SWAP, DRIGHT, DLEFT, OTHER, NOMONO};
@ -27,12 +29,12 @@ private:
std::vector<double> count_f_next;
protected:
virtual ORIENTATION getType(const std::string& s);
virtual ORIENTATION getType(const StringPiece& s);
public:
ModelScore();
virtual ~ModelScore();
void add_example(const std::string& previous, std::string& next);
void add_example(const StringPiece& previous, const StringPiece& next);
void reset_fe();
void reset_f();
const std::vector<double>& get_scores_fe_prev() const;
@ -46,25 +48,25 @@ public:
class ModelScoreMSLR : public ModelScore
{
protected:
virtual ORIENTATION getType(const std::string& s);
virtual ORIENTATION getType(const StringPiece& s);
};
class ModelScoreLR : public ModelScore
{
protected:
virtual ORIENTATION getType(const std::string& s);
virtual ORIENTATION getType(const StringPiece& s);
};
class ModelScoreMSD : public ModelScore
{
protected:
virtual ORIENTATION getType(const std::string& s);
virtual ORIENTATION getType(const StringPiece& s);
};
class ModelScoreMonotonicity : public ModelScore
{
protected:
virtual ORIENTATION getType(const std::string& s);
virtual ORIENTATION getType(const StringPiece& s);
};
//Class for calculating total counts, and to calculate smoothing

View File

@ -13,15 +13,28 @@
#include <sstream>
#include <cstdlib>
#include <cstring>
#include "InputFileStream.h"
#include "util/exception.hh"
#include "util/file_piece.hh"
#include "util/string_piece.hh"
#include "util/tokenize_piece.hh"
#include "InputFileStream.h"
#include "reordering_classes.h"
using namespace std;
void split_line(const string& line, string& foreign, string& english, string& wbe, string& phrase, string& hier);
void get_orientations(const string& pair, string& previous, string& next);
void split_line(const StringPiece& line, StringPiece& foreign, StringPiece& english, StringPiece& wbe, StringPiece& phrase, StringPiece& hier);
void get_orientations(const StringPiece& pair, StringPiece& previous, StringPiece& next);
class FileFormatException : public util::Exception
{
public:
FileFormatException() throw() {
*this << "Invalid extract file format: ";
}
~FileFormatException() throw() {}
};
int main(int argc, char* argv[])
{
@ -38,11 +51,7 @@ int main(int argc, char* argv[])
double smoothingValue = atof(argv[2]);
string filepath = argv[3];
Moses::InputFileStream eFile(extractFileName);
if (!eFile) {
cerr << "Could not open the extract file " << extractFileName <<"for scoring of lexical reordering models\n";
exit(1);
}
util::FilePiece eFile(extractFileName);
bool smoothWithCounts = false;
map<string,ModelScore*> modelScores;
@ -51,8 +60,8 @@ int main(int argc, char* argv[])
bool phrase = false;
bool wbe = false;
string e,f,w,p,h;
string prev, next;
StringPiece e,f,w,p,h;
StringPiece prev, next;
int i = 4;
while (i<argc) {
@ -96,8 +105,14 @@ int main(int argc, char* argv[])
////////////////////////////////////
//calculate smoothing
if (smoothWithCounts) {
string line;
while (getline(eFile,line)) {
util::FilePiece eFileForCounts(extractFileName);
while (true) {
StringPiece line;
try {
line = eFileForCounts.ReadLine();
} catch (util::EndOfFileException &e) {
break;
}
split_line(line,e,f,w,p,h);
if (hier) {
get_orientations(h, prev, next);
@ -118,9 +133,6 @@ int main(int argc, char* argv[])
models[i]->createSmoothing(smoothingValue);
}
//reopen eFile
eFile.Close();
eFile.Open(extractFileName);
} else {
//constant smoothing
for (size_t i=0; i<models.size(); ++i) {
@ -130,14 +142,20 @@ int main(int argc, char* argv[])
////////////////////////////////////
//calculate scores for reordering table
string line,f_current,e_current;
string f_current,e_current;
bool first = true;
while (getline(eFile, line)) {
while (true) {
StringPiece line;
try {
line = eFile.ReadLine();
} catch (util::EndOfFileException &e) {
break;
}
split_line(line,f,e,w,p,h);
if (first) {
f_current = f;
e_current = e;
f_current = f.as_string(); //FIXME: Avoid the copy.
e_current = e.as_string();
first = false;
} else if (f.compare(f_current) != 0 || e.compare(e_current) != 0) {
//fe - score
@ -159,8 +177,8 @@ int main(int argc, char* argv[])
it->second->reset_f();
}
}
f_current = f;
e_current = e;
f_current = f.as_string();
e_current = e.as_string();
}
// uppdate counts
@ -179,10 +197,10 @@ int main(int argc, char* argv[])
}
//Score the last phrases
for (size_t i=0; i<models.size(); ++i) {
models[i]->score_fe(f,e);
models[i]->score_fe(f_current,e_current);
}
for (size_t i=0; i<models.size(); ++i) {
models[i]->score_f(f);
models[i]->score_f(f_current);
}
//Zip all files
@ -193,22 +211,61 @@ int main(int argc, char* argv[])
return 0;
}
template <class It> StringPiece
GrabOrDie(It &it, const StringPiece& line) {
UTIL_THROW_IF(!it, FileFormatException, line.as_string());
return *it++;
}
void split_line(const string& line, string& foreign, string& english, string& wbe, string& phrase, string& hier)
void split_line(
const StringPiece& line,
StringPiece& foreign,
StringPiece& english,
StringPiece& wbe,
StringPiece& phrase,
StringPiece& hier)
{
/*Format is source ||| target ||| orientations
followed by one of the following 4 possibilities
eps
||| weights
| phrase | hier
| phrase | hier ||| weight
*/
util::TokenIter<util::MultiCharacter> pipes(line, util::MultiCharacter(" ||| "));
foreign = GrabOrDie(pipes,line);
english = GrabOrDie(pipes,line);
StringPiece rest = GrabOrDie(pipes,line);
util::TokenIter<util::MultiCharacter> singlePipe(rest, util::MultiCharacter(" | "));
wbe = GrabOrDie(singlePipe,line);
if (singlePipe) {
phrase = GrabOrDie(singlePipe, line);
hier = GrabOrDie(singlePipe, line);
} else {
phrase.clear();
hier.clear();
}
/*
int begin = 0;
int end = line.find(" ||| ");
foreign = line.substr(begin, end - begin);
cerr << begin << " " << end << " " << foreign << endl;
begin = end+5;
end = line.find(" ||| ", begin);
english = line.substr(begin, end - begin);
cerr << begin << " " << end << " " << english << endl;
begin = end+5;
end = line.find(" | ", begin);
wbe = line.substr(begin, end - begin);
cerr << begin << " " << end << " " << wbe << endl;
exit(0);
begin = end+3;
end = line.find(" | ", begin);
@ -216,10 +273,12 @@ void split_line(const string& line, string& foreign, string& english, string& wb
begin = end+3;
hier = line.substr(begin, line.size() - begin);
*/
}
void get_orientations(const string& pair, string& previous, string& next)
void get_orientations(const StringPiece& pair, StringPiece& previous, StringPiece& next)
{
istringstream is(pair);
is >> previous >> next;
util::TokenIter<util::SingleCharacter> tok(pair, util::SingleCharacter(' '));
previous = GrabOrDie(tok,pair);
next = GrabOrDie(tok,pair);
}