reading from textual gzipped file is now possible

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1786 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
nicolabertoldi 2008-05-20 14:15:30 +00:00
parent f30000b875
commit 8a594fc254
12 changed files with 145 additions and 175 deletions

View File

@ -29,23 +29,18 @@ void Data::loadnbest(const std::string &file)
int sentence_index;
int nextPound;
std::ifstream inFile(file.c_str(), std::ios::in); // matches a stream with a file. Opens the file
inputfilestream inp(file); // matches a stream with a file. Opens the file
if (!inFile) {
throw runtime_error("Unable to open: " + file);
}
if (!inp.good())
throw runtime_error("Unable to open: " + file);
while (!inFile.eof()){
std::string substring, subsubstring, stringBuf;
std::string theSentence;
std::string::size_type loc;
std::string substring, subsubstring, stringBuf;
std::string theSentence;
std::string::size_type loc;
std::getline(inFile, stringBuf);
while (getline(inp,stringBuf,'\n')){
if (stringBuf.empty()) continue;
// TRACE_ERR("Reading: " << stringBuf << std::endl);
nextPound = getNextPound(stringBuf, substring, "|||"); //first field
sentence_index = atoi(substring.c_str());
@ -53,19 +48,19 @@ void Data::loadnbest(const std::string &file)
theSentence = substring;
// adding statistics for error measures
scoreentry.clear();
theScorer->prepareStats(sentence_index, theSentence,scoreentry);
scoredata->add(scoreentry,sentence_index);
featentry.clear();
scoreentry.clear();
theScorer->prepareStats(sentence_index, theSentence, scoreentry);
scoredata->add(scoreentry, sentence_index);
nextPound = getNextPound(stringBuf, substring, "|||"); //third field
// adding features
featentry.clear();
scoreentry.clear();
while (!substring.empty()){
// TRACE_ERR("Decompounding: " << substring << std::endl);
nextPound = getNextPound(substring, subsubstring);
// string ending with ":" are skipped, because they are the names of the features
if ((loc = subsubstring.find(":")) != subsubstring.length()-1){
featentry.add(ATOFST(subsubstring.c_str()));
}
@ -73,5 +68,5 @@ void Data::loadnbest(const std::string &file)
featdata->add(featentry,sentence_index);
}
inFile.close();
inp.close();
}

View File

@ -66,13 +66,12 @@ void FeatureArray::loadtxt(ifstream& inFile)
std::getline(inFile, stringBuf);
if (stringBuf.empty()){
TRACE_ERR("ERROR: Empty string" << std::endl);
if (!inFile.good()){
return;
}
}
if (!stringBuf.empty()){
// TRACE_ERR("Reading: " << stringBuf << std::endl);
TRACE_ERR("Reading: " << stringBuf << std::endl);
if ((loc = stringBuf.find(FEATURES_TXT_BEGIN)) != 0){
TRACE_ERR("ERROR: FeatureArray::loadtxt(): Wrong header");
return;
@ -121,9 +120,9 @@ void FeatureArray::load(const std::string &file, bool bin)
{
TRACE_ERR("loading data from " << file << std::endl);
std::ifstream inFile(file.c_str(), std::ios::in); // matches a stream with a file. Opens the file
inputfilestream inFile(file); // matches a stream with a file. Opens the file
load(inFile, bin);
load((ifstream&) inFile, bin);
inFile.close();

View File

@ -39,12 +39,16 @@ void FeatureData::load(ifstream& inFile)
int iter=0;
while (!inFile.eof()){
if (!inFile.good()){
std::cerr << "ERROR FeatureData::load inFile.good()" << std::endl;
}
entry.clear();
entry.load(inFile);
if (entry.size() == 0){
TRACE_ERR("no more data" << std::endl);
continue;
return;
}
add(entry);
iter++;
@ -56,54 +60,16 @@ void FeatureData::load(const std::string &file)
{
TRACE_ERR("loading data from " << file << std::endl);
std::ifstream inFile(file.c_str(), std::ios::in); // matches a stream with a file. Opens the file
inputfilestream inFile(file); // matches a stream with a file. Opens the file
load(inFile);
inFile.close();
}
void FeatureData::loadnbest(const std::string &file)
{
TRACE_ERR("loading nbest from " << file << std::endl);
FeatureStats entry;
int sentence_index;
int nextPound;
std::ifstream inFile(file.c_str(), std::ios::in); // matches a stream with a file. Opens the file
while (!inFile.eof()){
std::string substring, subsubstring, stringBuf;
std::string::size_type loc;
std::getline(inFile, stringBuf);
if (stringBuf.empty()) continue;
// TRACE_ERR("Reading: " << stringBuf << std::endl);
nextPound = getNextPound(stringBuf, substring, "|||"); //first field
sentence_index = atoi(substring.c_str());
nextPound = getNextPound(stringBuf, substring, "|||"); //second field
nextPound = getNextPound(stringBuf, substring, "|||"); //third field
entry.clear();
while (!substring.empty()){
// TRACE_ERR("Decompounding: " << substring << std::endl);
nextPound = getNextPound(substring, subsubstring);
if ((loc = subsubstring.find(":")) != subsubstring.length()-1){
entry.add(ATOFST(subsubstring.c_str()));
}
}
// entry.save();
add(entry,sentence_index);
if (!inFile) {
throw runtime_error("Unable to open feature file: " + file);
}
load((ifstream&) inFile);
inFile.close();
}
void FeatureData::add(FeatureArray& e){
if (e.getIndex() < size()){ // array at poistion e.getIndex() already exists
//enlarge array at position e.getIndex()

View File

@ -23,6 +23,7 @@ class FeatureData
protected:
vector<FeatureArray> array_;
vector<int> idxmap_;
size_t number_of_feature;
private:
@ -57,6 +58,7 @@ public:
void add(FeatureArray& e);
void add(FeatureStats e, int sent_idx);
inline size_t FeatureSize(){ return number_of_feature; }
inline size_t size(){ return array_.size(); }
void save(const std::string &file, bool bin=false);
@ -65,8 +67,6 @@ public:
void load(ifstream& inFile);
void load(const std::string &file);
void loadnbest(const std::string &file);
};

View File

@ -4,9 +4,9 @@ FeatureStats.o FeatureArray.o FeatureData.o \
Data.o \
BleuScorer.o \
Point.o \
Optimizer.o \
PerScorer.o \
Scorer.o
#Optimizer.o \
ifndef DEBUG
CFLAGS=-O3 -DTRACE_ENABLE
@ -17,12 +17,13 @@ endif
GCC=g++
LDFLAGS=
LDLIBS=-lm
LDLIBS=-lm -lz
all: \
mert \
extractor \
test_scorer
prova-gz
#test_scorer \
#mert \
clean:
rm -f *.o
@ -39,3 +40,6 @@ mert: $(OBJS) mert.cpp
test_scorer: $(OBJS) test_scorer.cpp
$(GCC) $(CFLAGS) $(OBJS) $(LDLIBS) -o $@ $@.cpp
prova-gz: $(OBJS) prova-gz.cpp
$(GCC) $(CFLAGS) $(OBJS) $(LDLIBS) -o $@ $@.cpp

View File

@ -67,10 +67,9 @@ void ScoreArray::loadtxt(ifstream& inFile)
TRACE_ERR("starting loadtxt..." << std::endl);
std::getline(inFile, stringBuf);
if (stringBuf.empty()){
TRACE_ERR("ERROR: Empty string" << std::endl);
if (!inFile.good()){
return;
}
}
if (!stringBuf.empty()){
// TRACE_ERR("Reading: " << stringBuf << std::endl);
@ -116,10 +115,11 @@ void ScoreArray::load(ifstream& inFile, bool bin)
void ScoreArray::load(const std::string &file , bool bin)
{
TRACE_ERR("loading data from " << file << std::endl);
TRACE_ERR("loading data from " << file << std::endl);
std::ifstream inFile(file.c_str(), std::ios::in); // matches a stream with a file. Opens the file
inputfilestream inFile(file); // matches a stream with a file. Opens the file
load((ifstream&) inFile, bin);
load(inFile, bin);
inFile.close();
}

View File

@ -53,7 +53,6 @@ void ScoreData::load(ifstream& inFile)
entry.loadtxt(inFile);
if (entry.size() == 0){
TRACE_ERR("no more data" << std::endl);
continue;
}
add(entry);
@ -64,58 +63,19 @@ void ScoreData::load(ifstream& inFile)
void ScoreData::load(const std::string &file)
{
TRACE_ERR("loading score data from " << file << std::endl);
TRACE_ERR("loading score data from " << file << std::endl);
std::ifstream inFile(file.c_str(), std::ios::in); // matches a stream with a file. Opens the file
inputfilestream inFile(file); // matches a stream with a file. Opens the file
if (!inFile) {
throw runtime_error("Unable to open score file: " + file);
}
load(inFile);
inFile.close();
}
void ScoreData::loadnbest(const std::string &file)
{
TRACE_ERR("loading nbest from " << file << std::endl);
ScoreStats entry;
int sentence_index;
int nextPound;
std::ifstream inFile(file.c_str(), std::ios::in); // matches a stream with a file. Opens the file
while (!inFile.eof()){
std::string substring, subsubstring, stringBuf;
std::string theSentence;
std::string::size_type loc;
std::getline(inFile, stringBuf);
if (stringBuf.empty()) continue;
// TRACE_ERR("Reading: " << stringBuf << std::endl);
nextPound = getNextPound(stringBuf, substring, "|||"); //first field
sentence_index = atoi(substring.c_str());
nextPound = getNextPound(stringBuf, substring, "|||"); //second field
theSentence = substring;
entry.clear();
theScorer->prepareStats(sentence_index, theSentence,entry);
add(entry,sentence_index);
if (!inFile) {
throw runtime_error("Unable to open score file: " + file);
}
load((ifstream&) inFile);
inFile.close();
}
void ScoreData::add(const ScoreStats& e, int sent_idx){
if (exists(sent_idx)){
array_.at(sent_idx).add(e);

View File

@ -28,6 +28,7 @@ protected:
private:
Scorer* theScorer;
std::string score_type;
size_t number_of_scores;
public:
ScoreData(Scorer& sc);
@ -46,6 +47,7 @@ public:
void add(const ScoreArray& e){ array_.push_back(e); }
void add(const ScoreStats& e, int sent_idx);
inline size_t ScoreSize(){ return number_of_scores; }
inline size_t size(){ return array_.size(); }
void save(const std::string &file, bool bin=false);
@ -54,9 +56,6 @@ public:
void load(ifstream& inFile);
void load(const std::string &file);
void loadnbest(const std::string &file);
};

View File

@ -6,6 +6,7 @@
*
*/
#include <stdexcept>
#include "Util.h"
int verbose=0;
@ -37,3 +38,57 @@ int getNextPound(std::string &theString, std::string &substring, const std::stri
}
return (pos);
};
// Builds an input stream over filePath.
// The path is always probed with a plain std::filebuf first; whether that
// open succeeded is stored in _good. When the path ends in ".gz" the probe
// buffer is discarded and a gzfilebuf is installed instead; otherwise the
// probe buffer itself becomes the stream buffer.
inputfilestream::inputfilestream(const std::string &filePath)
  : std::istream(0),
    m_streambuf(0)
{
  // check if file is readable
  std::filebuf* plainBuf = new std::filebuf();
  _good = (plainBuf->open(filePath.c_str(), std::ios::in) != NULL);

  const std::string::size_type pathLen = filePath.size();
  const bool gzipped = pathLen > 3 && filePath.compare(pathLen - 3, 3, ".gz") == 0;

  if (!gzipped) {
    m_streambuf = plainBuf;
  } else {
    // The plain buffer was only needed for the readability probe.
    plainBuf->close();
    delete plainBuf;
    m_streambuf = new gzfilebuf(filePath.c_str());
  }

  this->init(m_streambuf);
}
// Destroys the stream buffer installed by the constructor and clears the
// pointer so it does not dangle.
inputfilestream::~inputfilestream()
{
  delete m_streambuf;
  m_streambuf = 0;
}
// No-op: the body is empty, so calling close() releases nothing. The stream
// buffer is deleted in ~inputfilestream().
// NOTE(review): presumably kept so call sites that previously used
// std::ifstream::close() keep compiling -- confirm.
void inputfilestream::close()
{
}
// Builds an output stream over filePath.
// Writing gzipped output is not supported: a path ending in ".gz" raises
// std::runtime_error. The suffix is checked before anything is allocated or
// opened, so a rejected path is neither created/truncated on disk nor does
// the file buffer leak when the exception propagates.
// For any other path a std::filebuf is opened for writing; whether that open
// succeeded is stored in _good.
outputfilestream::outputfilestream(const std::string &filePath)
  : std::ostream(0),
    m_streambuf(0)
{
  if (filePath.size() > 3 && filePath.substr(filePath.size() - 3, 3) == ".gz")
  {
    throw std::runtime_error("Output to a zipped file not supported!");
  }

  // check if file is writable
  std::filebuf* fb = new std::filebuf();
  _good = (fb->open(filePath.c_str(), std::ios::out) != NULL);
  m_streambuf = fb;
  this->init(m_streambuf);
}
// Destroys the stream buffer installed by the constructor and clears the
// pointer so it does not dangle.
outputfilestream::~outputfilestream()
{
  delete m_streambuf;
  m_streambuf = 0;
}
// No-op: the body is empty, so calling close() neither flushes nor releases
// anything. The stream buffer is deleted in ~outputfilestream().
// NOTE(review): presumably kept for interface parity with
// std::ofstream::close() -- confirm.
void outputfilestream::close()
{
}

View File

@ -11,15 +11,21 @@
using namespace std;
#include <stdexcept>
#include <limits>
#define US_NOSET (numeric_limits<unsigned short>::max())
#define MAX_LINE 1024
#include <vector>
#include <map>
#include <iostream>
#include <sstream>
#include <string>
#include <fstream>
#include "gzfilebuf.h"
#include "ScoreStats.h"
#include "FeatureStats.h"
@ -48,50 +54,30 @@ inline T Scan(const std::string &input)
return ret;
};
template<typename T>
int packVariable(char *buffer, size_t &bufferlen, T theVariable)
class inputfilestream : public std::istream
{
size_t variable_size = sizeof(T);
memcpy(buffer + bufferlen, (char*) &theVariable, variable_size);
bufferlen += variable_size;
return variable_size;
protected:
std::streambuf *m_streambuf;
bool _good;
public:
inputfilestream(const std::string &filePath);
~inputfilestream();
bool good(){return _good;}
void close();
};
template<typename T>
int unpackVariable(char *buffer, size_t &bufferlen, T &theVariable)
class outputfilestream : public std::ostream
{
size_t variable_size = sizeof(T);
theVariable = *((T*)(buffer + bufferlen));
bufferlen += variable_size;
return variable_size;
};
template<typename T>
int packVector(char *buffer, size_t &bufferlen, vector<T> theVector)
{
int vector_size = packVariable(buffer, bufferlen, theVector.size());
for (int i = 0; i < theVector.size(); i++)
vector_size += packVariable(buffer, bufferlen, theVector.at(i));
return vector_size;
};
template<typename T>
int unpackVector(char *buffer, size_t &bufferlen, vector<T> &theVector)
{
int vector_size;
int vector_memsize = unpackVariable(buffer, bufferlen, vector_size);
theVector.clear();
T theVariable;
for (int i = 0; i < vector_size; i++)
{
vector_memsize += unpackVariable(buffer, bufferlen, theVariable);
theVector.push_back(theVariable);
}
return vector_memsize;
protected:
std::streambuf *m_streambuf;
bool _good;
public:
outputfilestream(const std::string &filePath);
~outputfilestream();
bool good(){return _good;}
void close();
};
#endif

View File

@ -1,6 +1,13 @@
cmd=../extractor
#$cmd -NbestFile NBEST -Reference REF -OutputFeatureStatistics FEATSTAT.out -OutputScoreStatistics SCORESTAT.out
$cmd --nbest NBEST --reference REF.0,REF.1,REF.2 --ffile FEATSTAT.out --scfile SCORESTAT.out
#to read an nbest file; output is in text format
$cmd --nbest NBEST --reference REF.0,REF.1,REF.2 --ffile FEATSTAT.out --scfile SCORESTAT.out --sctype BLEU4
#to read a gzipped nbest file; output is in text format
$cmd --nbest NBEST.gz --reference REF.0,REF.1,REF.2 --ffile FEATSTAT.out --scfile SCORESTAT.out --sctype BLEU4
exit
cp FEATSTAT.out FEATSTAT.in
cp SCORESTAT.out SCORESTAT.in

View File

@ -6,15 +6,14 @@ for normtype in '' '-n' ; do
for reflentype in '' '-a' '-s' '-e' ; do
basename=OLDMERT${casetype}${normtype}${reflentype}
#cat NBEST | $scorecmd ${casetype} ${normtype} ${reflentype} REF.0 REF.1 REF.2 ./$basename.
cat NBEST NBEST | sort -mnk 1,1 | $scorecmd ${casetype} ${normtype} ${reflentype} REF.0 REF.1 REF.2 ./$basename.
cat NBEST | $scorecmd ${casetype} ${normtype} ${reflentype} REF.0 REF.1 REF.2 ./$basename.
#cat NBEST NBEST | sort -mnk 1,1 | $scorecmd ${casetype} ${normtype} ${reflentype} REF.0 REF.1 REF.2 ./$basename.
cat ./$basename.feats.opt | cut -d' ' -f 16- > ./$basename.SCORESTAT.out
cat ./$basename.feats.opt | cut -d' ' -f 1-15 > ./$basename.FEATSTAT.out
echo comparing SCORESTAT.out and $basename.SCORESTAT.out
cat SCORESTAT.out | sort | grep -v "^SCORE"> AAA$$
cat $basename.SCORESTAT.out | sort >BBB$$
#head -3 AAA$$ BBB$$
cmp AAA$$ BBB$$
echo comparing FEATSTAT.out and $basename.FEATSTAT.out