I am adding new objects for handling statistics for error measures

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1633 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
nicolabertoldi 2008-05-14 08:14:13 +00:00
parent 49891adeed
commit 9c12330785
8 changed files with 603 additions and 1 deletions

View File

@ -30,7 +30,7 @@ protected:
private:
char databuf_[BUFSIZ];
size_t bufLen_;
size_t idx; // idx to identify the utterance
size_t idx; // idx to identify the utterance, it can differ from the index inside the vector
public:
FeatureArray();

23
mert/Makefile Executable file
View File

@ -0,0 +1,23 @@
# Build rules for the mert tools: compiles the shared objects and links the
# two executables `main` and `feature_extractor`.
# NOTE(review): recipe lines must start with a TAB for make to accept them;
# the leading whitespace is not visible in this view -- confirm in the real file.

# Object files shared by both executables.
OBJS= Util.o Timer.o Parameter.o \
ScoreStats.o ScoreArray.o ScoreData.o \
FeatureStats.o FeatureArray.o FeatureData.o
# -DTRACE_ENABLE defines the TRACE_ENABLE macro for every compilation
# (presumably what switches TRACE_ERR output on -- verify in Util.h).
CFLAGS=-O3 -DTRACE_ENABLE
GCC=g++
# LDFLAGS is defined (empty) but not referenced by any rule below.
LDFLAGS=
LDLIBS=-lm
# Default target: build both executables.
all: main feature_extractor
# Removes the object files only; the executables are left in place.
clean:
rm -f *.o
# Pattern rule: each object depends on its .cpp and the matching .h.
%.o : %.cpp %.h
$(GCC) -c $(CFLAGS) $< -o $@
# Each executable is compiled from <target>.cpp and linked against $(OBJS) in one step.
feature_extractor: $(OBJS) feature_extractor.cpp
$(GCC) $(CFLAGS) $(OBJS) $(LDLIBS) -o $@ $@.cpp
main: $(OBJS) main.cpp
$(GCC) $(CFLAGS) $(OBJS) $(LDLIBS) -o $@ $@.cpp

95
mert/ScoreArray.cpp Normal file
View File

@ -0,0 +1,95 @@
/*
* ScoreArray.cpp
* met - Minimum Error Training
*
* Created by Nicola Bertoldi on 13/05/08.
*
*/
#include <fstream>
#include "ScoreArray.h"
#include "Util.h"
// Default constructor: the entry vector starts empty, the buffer length and
// the utterance index start at 0. databuf_ is left uninitialised.
ScoreArray::ScoreArray()
  : bufLen_(0),
    idx(0)
{
}
void ScoreArray::savetxt(std::ofstream& outFile)
{
ScoreStats entry;
outFile << SCORES_BEGIN << " " << idx << " " << array_.size() << std::endl;
for (vector<ScoreStats>::iterator i = array_.begin(); i !=array_.end(); i++)
(*i).savetxt(outFile);
outFile << SCORES_END << std::endl;
}
/*
 * Opens `file` for writing and saves the array into it in text format.
 * If the file cannot be opened an error is traced and nothing is written.
 */
void ScoreArray::savetxt(const std::string &file)
{
  TRACE_ERR("saving the array into " << file << std::endl);

  std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file
  if (!outFile){
    // Report the failure instead of silently writing into a dead stream.
    TRACE_ERR("ERROR: ScoreArray::savetxt(): cannot open " << file << std::endl);
    return;
  }

  savetxt(outFile);
}
/*
 * Reads one text section (as written by savetxt) from inFile and appends its
 * entries to this array. The array is NOT cleared first; callers that want a
 * fresh array must call clear() themselves.
 *
 * Expected layout: a header line whose 2nd and 3rd tokens are the utterance
 * index and the number of entries, then that many entry lines, then a footer
 * line starting with SCORES_END.
 *
 * An empty header line, a premature end of input, or a wrong footer is traced
 * as an error and the function returns; entries read so far are kept.
 */
void ScoreArray::loadtxt(ifstream& inFile)
{
  ScoreStats entry;
  std::string substring, stringBuf;

  TRACE_ERR("starting loadtxt..." << std::endl);

  std::getline(inFile, stringBuf);
  if (stringBuf.empty()){
    TRACE_ERR("ERROR: Empty string" << std::endl);
    return;
  }

  // Header tokens. getNextPound is assumed to strip the next token from
  // stringBuf into substring -- TODO confirm against Util.h.
  getNextPound(stringBuf, substring);   // 1st token (header tag) is discarded
  getNextPound(stringBuf, substring);   // 2nd token: utterance index
  idx = atoi(substring.c_str());
  getNextPound(stringBuf, substring);   // 3rd token: number of entries
  const int number_of_entries = atoi(substring.c_str());

  for (int i = 0; i < number_of_entries; ++i)
  {
    // Stop on truncated input instead of adding entries parsed from a stale
    // or empty line.
    if (!std::getline(inFile, stringBuf)){
      TRACE_ERR("ERROR: ScoreArray::loadtxt(): unexpected end of input" << std::endl);
      return;
    }
    entry.clear();
    entry.set(stringBuf);
    add(entry);
  }

  // Footer: an empty line is tolerated, anything else must start with SCORES_END.
  std::getline(inFile, stringBuf);
  if (!stringBuf.empty() && stringBuf.find(SCORES_END) != 0){
    TRACE_ERR("ERROR: ScoreArray::loadtxt(): Wrong footer" << std::endl);
    return;
  }
}
/*
 * Convenience overload: opens `file` for reading, loads one section from it
 * through loadtxt(ifstream&), then closes the file.
 */
void ScoreArray::loadtxt(const std::string &file)
{
  TRACE_ERR("loading data from " << file << std::endl);

  std::ifstream input;
  input.open(file.c_str(), std::ios::in);

  loadtxt(input);

  input.close();
}

62
mert/ScoreArray.h Normal file
View File

@ -0,0 +1,62 @@
/*
* ScoreArray.h
* met - Minimum Error Training
*
* Created by Nicola Bertoldi on 13/05/08.
*
*/
#ifndef SCORE_ARRAY_H
#define SCORE_ARRAY_H
#define SCORES_BEGIN "SCORES_BEGIN_0"
#define SCORES_END "SCORES_END_0"
using namespace std;
#include <limits>
#include <vector>
#include <iostream>
#include <fstream>
#include "Util.h"
#include "ScoreStats.h"
/*
 * ScoreArray: a growable list of ScoreStats entries tagged with an utterance
 * index, with text load/save support.
 */
class ScoreArray
{
protected:
  vector<ScoreStats> array_;   // the stored entries, in insertion order

private:
  char databuf_[BUFSIZ];       // raw buffer; not used by any member visible in this class
  size_t bufLen_;              // set to 0 by the constructor, reported by memsize()
  int idx; // idx to identify the utterance, it can differ from the index inside the vector

public:
  ScoreArray();
  ~ScoreArray(){}

  // Removes all entries; the utterance index is left untouched.
  inline void clear() { array_.clear(); }

  // Utterance index accessors.
  inline size_t getIndex() const { return idx; }
  inline void setIndex(size_t value){ idx=value; }

  // Returns a copy of entry i; vector::at() throws std::out_of_range on a bad index.
  inline ScoreStats get(int i) const { return array_.at(i); }
  // Appends a copy of e.
  void add(const ScoreStats& e){ array_.push_back(e); }

  inline size_t size() const { return array_.size(); }
  inline size_t memsize() const { return bufLen_; }

  // Text output: to a named file, to an open stream, or to /dev/stdout.
  void savetxt(const std::string &file);
  void savetxt(ofstream& outFile);
  inline void savetxt(){ savetxt("/dev/stdout"); }

  // Text input: from an open stream or from a named file.
  void loadtxt(ifstream& inFile);
  void loadtxt(const std::string &file);
};
#endif

133
mert/ScoreData.cpp Normal file
View File

@ -0,0 +1,133 @@
/*
* ScoreData.cpp
* met - Minimum Error Training
*
* Created by Nicola Bertoldi on 13/05/08.
*
*/
#include <fstream>
#include "ScoreData.h"
#include "Util.h"
// Default constructor: no arrays stored, buffer length 0. databuf_ is left
// uninitialised.
ScoreData::ScoreData()
  : bufLen_(0)
{
}
void ScoreData::savetxt(std::ofstream& outFile)
{
ScoreArray entry;
for (vector<ScoreArray>::iterator i = array_.begin(); i !=array_.end(); i++)
(*i).savetxt(outFile);
}
/*
 * Opens `file` for writing and saves all stored arrays into it in text format.
 * If the file cannot be opened an error is traced and nothing is written.
 */
void ScoreData::savetxt(const std::string &file)
{
  TRACE_ERR("saving the array into " << file << std::endl);

  std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file
  if (!outFile){
    // Report the failure instead of silently writing into a dead stream.
    TRACE_ERR("ERROR: ScoreData::savetxt(): cannot open " << file << std::endl);
    return;
  }

  savetxt(outFile);
}
/*
 * Reads ScoreArray sections from inFile until the stream is exhausted or
 * fails, appending every non-empty section to this object.
 *
 * A section that yields no entries (e.g. a blank line) is skipped.
 */
void ScoreData::loadtxt(ifstream& inFile)
{
  ScoreArray entry;
  int iter=0;

  // good() rather than !eof(): a stream that failed without reaching EOF
  // (for instance a file that was never opened) keeps eof() false forever,
  // and the `continue` below would then spin in an endless loop.
  while (inFile.good()){
    TRACE_ERR("iter " << iter << " size " << size() << std::endl);

    entry.clear();
    entry.loadtxt(inFile);

    if (entry.size() == 0){
      TRACE_ERR("no more data" << std::endl);
      continue;
    }

    // NOTE(review): both calls below dump to /dev/stdout on every iteration
    // (the second one re-prints everything loaded so far). This looks like
    // debugging output -- confirm whether it can be removed.
    entry.savetxt();
    add(entry);
    savetxt();

    ++iter;
  }
}
/*
 * Convenience overload: opens `file` for reading, loads every section from it
 * through loadtxt(ifstream&), then closes the file.
 */
void ScoreData::loadtxt(const std::string &file)
{
  TRACE_ERR("loading data from " << file << std::endl);

  std::ifstream input;
  input.open(file.c_str(), std::ios::in);

  loadtxt(input);

  input.close();
}
/*
 * Reads an n-best list from `file`, one candidate per line with fields
 * separated by "|||". The first field is parsed as the sentence index and the
 * second is taken as the translation. One ScoreStats entry per non-empty line
 * is added under that sentence index.
 *
 * If the file cannot be opened an error is traced and nothing is loaded.
 */
void ScoreData::loadnbest(const std::string &file)
{
  TRACE_ERR("loading nbest from " << file << std::endl);

  std::ifstream inFile(file.c_str(), std::ios::in); // matches a stream with a file. Opens the file
  if (!inFile){
    // Without this check a failed open never sets eof and the read loop
    // would not terminate.
    TRACE_ERR("ERROR: ScoreData::loadnbest(): cannot open " << file << std::endl);
    return;
  }

  ScoreStats entry;
  std::string stringBuf;

  // Loop on the result of getline so that any stream failure ends the loop.
  while (std::getline(inFile, stringBuf)){
    if (stringBuf.empty()) continue;

    std::string substring;
    std::string theSentence;

    // getNextPound is assumed to strip the next "|||"-delimited field from
    // stringBuf into substring -- TODO confirm against Util.h.
    getNextPound(stringBuf, substring, "|||"); //first field
    const int sentence_index = atoi(substring.c_str());

    getNextPound(stringBuf, substring, "|||"); //second field
    theSentence = substring;

    entry.clear();

    /* HERE IS THE SECTION TO COMPUTE STATISTICS FOR ERROR MEASURE
     * theSentence contains the translation
     * append each statistic in the entry object
     * with a command like the following:
     *
     * entry.add(value);
     *
     */

    /* DO NOT MODIFY BELOW */
    // entry.savetxt();
    add(entry,sentence_index);
  }

  inFile.close();
}
/*
 * Adds entry e to the array for sentence sent_idx.
 *
 * If position sent_idx already exists in the vector, e is appended to the
 * array stored there; otherwise a new ScoreArray holding e and tagged with
 * sent_idx is pushed at the end of the vector.
 *
 * NOTE(review): a new array always lands at the END of the vector, so its
 * position equals sent_idx only when indices arrive as 0, 1, 2, ... without
 * gaps -- confirm that callers guarantee this.
 */
void ScoreData::add(ScoreStats e, int sent_idx){
  if (exists(sent_idx)){
    array_.at(sent_idx).add(e);
  }
  else{
    ScoreArray a;
    a.add(e);
    a.setIndex(sent_idx);
    add(a);
  }
}

61
mert/ScoreData.h Normal file
View File

@ -0,0 +1,61 @@
/*
* ScoreData.h
* met - Minimum Error Training
*
* Created by Nicola Bertoldi on 13/05/08.
*
*/
#ifndef SCORE_DATA_H
#define SCORE_DATA_H
using namespace std;
#include <limits>
#include <vector>
#include <iostream>
#include "Util.h"
#include "ScoreArray.h"
/*
 * ScoreData: a growable list of ScoreArray objects (one per sentence), with
 * text load/save support and an n-best list loader.
 */
class ScoreData
{
protected:
  vector<ScoreArray> array_;   // the stored arrays, in insertion order

private:
  char databuf_[BUFSIZ];       // raw buffer; not used by any member visible in this class
  size_t bufLen_;              // set to 0 by the constructor, reported by memsize()

public:
  ScoreData();
  ~ScoreData(){}

  // Removes all stored arrays.
  inline void clear() { array_.clear(); }

  // Returns a copy of array i; vector::at() throws std::out_of_range on a bad index.
  inline ScoreArray get(int i) const { return array_.at(i); }
  // True when i is a valid position in the vector. Negative values are
  // rejected explicitly instead of relying on signed-to-unsigned conversion.
  inline bool exists(int i) const { return i >= 0 && static_cast<size_t>(i) < array_.size(); }
  // Returns a copy of entry j of array i.
  inline ScoreStats get(int i, int j){ return array_.at(i).get(j); }

  // Appends a copy of e at the end of the vector.
  void add(const ScoreArray& e){ array_.push_back(e); }
  // Adds e to the array at position sent_idx, creating a new array if needed.
  void add(ScoreStats e, int sent_idx);

  inline size_t size() const { return array_.size(); }
  inline size_t memsize() const { return bufLen_; }

  // Text output: to a named file, to an open stream, or to /dev/stdout.
  void savetxt(const std::string &file);
  void savetxt(ofstream& outFile);
  inline void savetxt(){ savetxt("/dev/stdout"); }

  // Text input: from an open stream or from a named file.
  void loadtxt(ifstream& inFile);
  void loadtxt(const std::string &file);

  // Loads entries from an n-best list file.
  void loadnbest(const std::string &file);
};
#endif

146
mert/ScoreStats.cpp Normal file
View File

@ -0,0 +1,146 @@
/*
* ScoreStats.cpp
* met - Minimum Error Training
*
* Created by Nicola Bertoldi on 13/05/08.
*
*/
#include <fstream>
#include "ScoreStats.h"
// Default constructor: no statistics stored, buffer length 0. databuf_ is
// left uninitialised.
ScoreStats::ScoreStats()
  : bufLen_(0)
{
}
// Copy constructor: copies the statistics vector only. The raw buffer is not
// copied and the buffer length starts again at 0.
ScoreStats::ScoreStats(const ScoreStats &stats)
  : array_(stats.array_),
    bufLen_(0)
{
}
// Creates `size` statistics, all set to 0; buffer length 0.
// The vector is filled directly by its fill constructor, which avoids the
// int-vs-size_t loop counter of a hand-written loop.
ScoreStats::ScoreStats(const size_t size)
  : array_(size, 0),
    bufLen_(0)
{
}
// Builds the statistics by parsing theString with set(). The string is passed
// by non-const reference and set() loops until it is empty, so the caller's
// string is consumed. bufLen_ is initialised to 0 like in the other
// constructors (it was previously left indeterminate here).
ScoreStats::ScoreStats(std::string &theString)
  : bufLen_(0)
{
  set(theString);
}
void ScoreStats::set(std::string &theString)
{
std::string substring, stringBuf;
std::string::size_type loc;
int nextPound;
ScoreStatsType sc;
// TRACE_ERR("Decompounding string: " << theString << std::endl);
while (!theString.empty()){
nextPound = getNextPound(theString, substring);
sc = ATOSST(substring.c_str());
array_.push_back(sc);
}
}
// Reads a single line from inFile and hands it to set() for parsing.
void ScoreStats::loadtxt(std::ifstream& inFile)
{
  std::string line;
  std::getline(inFile, line);

  set(line);
}
// Convenience overload: opens `file` for reading and loads one line of
// statistics from it through loadtxt(ifstream&).
void ScoreStats::loadtxt(const std::string &file)
{
  // TRACE_ERR("loading the stats from " << file << std::endl);

  std::ifstream input;
  input.open(file.c_str(), std::ios::in);

  loadtxt(input);
}
// Convenience overload: opens `file` for writing and saves the statistics
// into it through savetxt(ofstream&).
void ScoreStats::savetxt(const std::string &file)
{
  // TRACE_ERR("saving the stats into " << file << std::endl);

  std::ofstream output;
  output.open(file.c_str(), std::ios::out);

  savetxt(output);
}
/*
 * Writes all statistics on one line, separated by single spaces and
 * terminated by a newline.
 *
 * Each element is written exactly once (the first one used to be emitted
 * twice), and an empty vector now produces just an empty line instead of
 * throwing std::out_of_range from at(0).
 */
void ScoreStats::savetxt(std::ofstream& outFile)
{
  for (vector<ScoreStatsType>::iterator i = array_.begin(); i != array_.end(); ++i){
    if (i != array_.begin())
      outFile << " ";   // separator goes between elements only
    outFile << *i;
  }
  outFile << std::endl;
}
// Copy assignment: takes over the other object's statistics vector and resets
// the buffer length to 0. The raw buffer itself is not copied.
ScoreStats& ScoreStats::operator=(const ScoreStats &stats)
{
  this->array_ = stats.array_;
  this->bufLen_ = 0;

  return *this;
}
/*
 * Copies sz bytes from `buffer` into the internal buffer and rebuilds the
 * statistics vector from it with unpackVector().
 *
 * A request larger than the internal buffer is rejected with an error message
 * and leaves the object unchanged, instead of writing past databuf_.
 */
void ScoreStats::setBuffer(char* buffer, size_t sz)
{
  if (sz > sizeof(databuf_)){
    std::cerr << "ERROR: ScoreStats::setBuffer(): " << sz
              << " bytes do not fit into the internal buffer" << std::endl;
    return;
  }

  // memmove rather than memcpy: unpack() may pass databuf_ itself as the
  // source, and memcpy is undefined for overlapping regions.
  memmove(databuf_, buffer, sz);

  // Now unpack the data from the contiguous buffer into the vector.
  bufLen_ = 0;
  unpackVector(databuf_, bufLen_, array_);
}
/*
 * Marshals this class's data members into a single contiguous memory
 * location (the internal buffer) for the purpose of storing the data in a
 * database. Returns a pointer to that internal buffer; bufLen_ is reset to 0
 * and then handed to packVector() along with the buffer.
 */
char *ScoreStats::getBuffer()
{
  // Start from a zero length and an all-zero buffer.
  bufLen_ = 0;
  memset(databuf_, 0, sizeof(databuf_));

  // Pack the vector into the buffer.
  packVector(databuf_, bufLen_, array_);

  return databuf_;
}
/*
 * Serialises this object into `buffer` starting at offset `bufferlen`:
 * first bufLen_ is written through packVariable(), then the packed data
 * bytes are copied in. `bufferlen` is advanced past the copied data.
 * Returns the value reported by packVariable() plus the number of data bytes.
 * No bound on the size of `buffer` is checked here.
 */
int ScoreStats::pack(char *buffer, size_t &bufferlen)
{
  // Refresh databuf_ / bufLen_ from the current vector contents.
  getBuffer();

  const size_t headerBytes = packVariable(buffer, bufferlen, bufLen_);

  char *dest = buffer + bufferlen;
  memcpy(dest, databuf_, bufLen_);
  bufferlen += bufLen_;

  return headerBytes + bufLen_;
}
/*
 * Restores this object from `buffer` starting at offset `bufferlen`:
 * the data length is read through unpackVariable(), then that many bytes are
 * handed to setBuffer(), which copies them into the internal buffer and
 * rebuilds the vector. `bufferlen` is advanced past the data bytes.
 * Returns the value reported by unpackVariable() plus bufLen_ as left by
 * setBuffer().
 *
 * A data length larger than the internal buffer is treated as corrupt input:
 * an error is printed, nothing is copied, `bufferlen` is not advanced past
 * the data, and only the unpackVariable() result is returned.
 */
int ScoreStats::unpack(char *buffer, size_t &bufferlen)
{
  const size_t headerBytes = unpackVariable(buffer, bufferlen, bufLen_);

  // setBuffer() resets bufLen_, so remember the data length first.
  const size_t payload = bufLen_;

  if (payload > sizeof(databuf_)){
    std::cerr << "ERROR: ScoreStats::unpack(): data length " << payload
              << " exceeds the internal buffer" << std::endl;
    bufLen_ = 0;
    return headerBytes;
  }

  // Hand the source bytes straight to setBuffer(); copying them into
  // databuf_ first and then passing databuf_ would make setBuffer() copy the
  // buffer onto itself.
  setBuffer(buffer + bufferlen, payload);
  bufferlen += payload;

  return headerBytes + bufLen_;
}

82
mert/ScoreStats.h Normal file
View File

@ -0,0 +1,82 @@
/*
* ScoreStats.h
* met - Minimum Error Training
*
* Created by Nicola Bertoldi on 13/05/08.
*
*/
#ifndef SCORE_STATS_H
#define SCORE_STATS_H
using namespace std;
#include <limits>
#include <vector>
#include <iostream>
#include "Util.h"
typedef int ScoreStatsType;
#define SCORE_STATS_MIN (numeric_limits<ScoreStatsType>::min())
#define ATOSST(str) ((ScoreStatsType) atoi(str))
class ScoreStats
{
protected:
vector<ScoreStatsType> array_;
private:
char databuf_[BUFSIZ];
size_t bufLen_;
public:
ScoreStats();
ScoreStats(const size_t size);
ScoreStats(const ScoreStats &stats);
ScoreStats(std::string &theString);
ScoreStats& operator=(const ScoreStats &stats);
~ScoreStats(){};
inline void clear() { array_.clear(); }
inline ScoreStatsType get(int i){ return array_.at(i); }
void set(std::string &theString);
inline size_t size(){ return array_.size(); }
inline size_t memsize(){ return bufLen_; }
void savetxt(const std::string &file);
void savetxt(ofstream& outFile);
inline void savetxt(){ savetxt("/dev/stdout"); }
void loadtxt(ifstream& inFile);
void loadtxt(const std::string &file);
inline void reset()
{
for (vector<ScoreStatsType>::iterator i = array_.begin(); i != array_.end(); i++)
*i = 0;
}
/*
* Marshalls this classes data members into a single
* contiguous memory location for the purpose of storing
* the data in a database.
*/
char *getBuffer();
void setBuffer(char* buffer, size_t sz);
int pack(char *buffer, size_t &bufferlen);
int unpack(char *buffer, size_t &bufferlen);
};
#endif