Finish and test feature and score data iterators.

This commit is contained in:
Barry Haddow 2011-11-15 13:12:14 +00:00
parent 3a6c0e0680
commit 0a2e0f44a6
6 changed files with 242 additions and 27 deletions

View File

@ -1,6 +1,3 @@
// $Id$
// vim:tabstop=2
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2011- University of Edinburgh
@ -22,7 +19,6 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <iostream>
#include <sstream>
#include "util/string_piece.hh"
#include "util/tokenize_piece.hh"
#include "FeatureArray.h"
@ -32,16 +28,36 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
using namespace std;
using namespace util;
int ParseInt(const StringPiece& str ) {
char* errIndex;
//could wrap?
int value = static_cast<int>(strtol(str.data(), &errIndex,10));
if (errIndex == str.data()) {
throw util::ParseNumberException(str);
}
return value;
}
float ParseFloat(const StringPiece& str) {
char* errIndex;
float value = static_cast<float>(strtod(str.data(), &errIndex));
if (errIndex == str.data()) {
throw util::ParseNumberException(str);
}
return value;
}
/** Default constructor: opens no file. FeatureDataIterator::end() returns an iterator built this way. */
FeatureDataIterator::FeatureDataIterator() {}
FeatureDataIterator::FeatureDataIterator(const string filename) {
/**
 * Opens `filename` through a newly allocated FilePiece held in m_in, then
 * calls readNext() so that the iterator is positioned on the first record.
 */
FeatureDataIterator::FeatureDataIterator(const string& filename) {
m_in.reset(new FilePiece(filename.c_str()));
readNext();
}
void FeatureDataIterator::readNext() {
m_next.clear();
try {
StringPiece marker = m_in->ReadDelimited();
if (marker != StringPiece(FEATURES_TXT_BEGIN)) {
@ -49,20 +65,30 @@ void FeatureDataIterator::readNext() {
}
size_t sentenceId = m_in->ReadULong();
size_t count = m_in->ReadULong();
cerr << "Expecting " << count << endl;
size_t length = m_in->ReadULong();
m_in->ReadLine(); //discard rest of line
for (size_t i = 0; i < count; ++i) {
StringPiece line = m_in->ReadLine();
for (util::TokenIter<util::AnyCharacter, true> token(line, util::AnyCharacter(" \t")); token; ++token) {
//TODO: Create FeatureDataItem
char* err_ind;
float value = static_cast<float>(strtod(token->data(), &err_ind));
if (err_ind == token->data()) {
throw FileFormatException(m_in->FileName(), line.as_string());
m_next.push_back(FeatureDataItem());
for (TokenIter<AnyCharacter, true> token(line, AnyCharacter(" \t")); token; ++token) {
TokenIter<AnyCharacter,false> value(*token,AnyCharacter(":"));
if (!value) throw FileFormatException(m_in->FileName(), line.as_string());
StringPiece first = *value;
++value;
if (!value) {
//regular feature
float floatValue = ParseFloat(first);
m_next.back().dense.push_back(floatValue);
} else {
//sparse feature
StringPiece second = *value;
float floatValue = ParseFloat(second);
m_next.back().sparse.set(first.as_string(),floatValue);
}
cerr << value << ",";
}
cerr << "\n";
if (length != m_next.back().dense.size()) {
throw FileFormatException(m_in->FileName(), line.as_string());
}
}
StringPiece line = m_in->ReadLine();
if (line != StringPiece(FEATURES_TXT_END)) {

View File

@ -1,6 +1,3 @@
// $Id$
// vim:tabstop=2
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2011- University of Edinburgh
@ -29,7 +26,6 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <fstream>
#include <map>
#include <memory>
#include <stdexcept>
#include <vector>
@ -37,23 +33,34 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <boost/shared_ptr.hpp>
#include "util/file_piece.hh"
#include "util/string_piece.hh"
#include "FeatureStats.h"
/** Feature values read from a single line of a feature data file. */
class FeatureDataItem {
public:
// Values of the unnamed (plain numeric) features, in the order they appear on the line.
std::vector<float> dense;
// Values of the named features, i.e. tokens written as name:value.
SparseVector sparse;
};
class FileFormatException : public util::Exception {
/**
 * Raised by the data iterators when a line of an input file does not have the
 * expected form. The exception message quotes the offending text and names
 * the file it was read from.
 */
class FileFormatException : public util::Exception
{
public:
  /**
   * @param filename name of the file being read; taken by const reference so
   *                 that no copy of the string is made.
   * @param line     the text that could not be interpreted.
   */
  explicit FileFormatException(const std::string& filename, const std::string& line) {
    *this << "Error in line \"" << line << "\" of " << filename;
  }
};
/**
 * Converts the base-10 integer at the start of str to an int.
 * Assumes a delimiter, so only apply to tokens.
 * Throws util::ParseNumberException when no conversion can be performed.
 */
int ParseInt(const StringPiece& str );
/**
 * Converts the floating point number at the start of str to a float.
 * Assumes a delimiter, so only apply to tokens.
 * Throws util::ParseNumberException when no conversion can be performed.
 */
float ParseFloat(const StringPiece& str);
/** Feature values read from a single line of a feature data file. */
class FeatureDataItem
{
public:
// Values of the unnamed (plain numeric) features, in the order they appear on the line.
std::vector<float> dense;
// Values of the named features, i.e. tokens written as name:value.
SparseVector sparse;
};
class FeatureDataIterator :
public boost::iterator_facade<FeatureDataIterator,
const std::vector<FeatureDataItem>,
@ -61,7 +68,7 @@ class FeatureDataIterator :
{
public:
FeatureDataIterator();
FeatureDataIterator(const std::string filename);
FeatureDataIterator(const std::string& filename);
static FeatureDataIterator end() {
return FeatureDataIterator();

View File

@ -6,6 +6,7 @@ libmert_la_SOURCES = \
Util.cpp \
Timer.cpp \
ScoreStats.cpp ScoreArray.cpp ScoreData.cpp \
ScoreDataIterator.cpp \
FeatureStats.cpp FeatureArray.cpp FeatureData.cpp \
FeatureDataIterator.cpp \
Data.cpp \
@ -38,5 +39,5 @@ extractor_LDADD = libmert.la -lm -lz
mert_LDADD = libmert.la -lm -lz $(BOOST_THREAD_LDFLAGS) $(BOOST_THREAD_LIBS)
evaluator_LDADD = libmert.la -lm -lz
pro_LDADD = libmert.la @KENLM_LDFLAGS@ $(BOOST_LDFLAGS) $(BOOST_PROGRAM_OPTIONS_LDFLAGS) $(BOOST_PROGRAM_OPTIONS_LIBS)
pro_DEPENDENCIES = $(top_srcdir)/kenlm/libkenlm.la
pro_DEPENDENCIES = $(top_srcdir)/kenlm/libkenlm.la libmert.la

View File

@ -0,0 +1,90 @@
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2011- University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <iostream>
#include "util/tokenize_piece.hh"
#include "ScoreArray.h"
#include "ScoreDataIterator.h"
using namespace std;
using namespace util;
/** Default constructor: m_in stays empty, which is the state equal() treats as the end iterator. */
ScoreDataIterator::ScoreDataIterator() {}
/**
 * Opens the score file `filename` and positions the iterator on its first
 * record by reading it straight away.
 */
ScoreDataIterator::ScoreDataIterator(const string& filename)
  : m_in(new FilePiece(filename.c_str()))
{
  readNext();
}
void ScoreDataIterator::readNext() {
m_next.clear();
try {
StringPiece marker = m_in->ReadDelimited();
if (marker != StringPiece(SCORES_TXT_BEGIN)) {
throw FileFormatException(m_in->FileName(), marker.as_string());
}
size_t sentenceId = m_in->ReadULong();
size_t count = m_in->ReadULong();
size_t length = m_in->ReadULong();
m_in->ReadLine(); //ignore rest of line
for (size_t i = 0; i < count; ++i) {
StringPiece line = m_in->ReadLine();
cerr << line << endl;
m_next.push_back(ScoreDataItem());
for (TokenIter<AnyCharacter, true> token(line,AnyCharacter(" \t")); token; ++token) {
float value = ParseFloat(*token);
m_next.back().push_back(value);
}
if (length != m_next.back().size()) {
throw FileFormatException(m_in->FileName(), line.as_string());
}
}
StringPiece line = m_in->ReadLine();
if (line != StringPiece(SCORES_TXT_END)) {
throw FileFormatException(m_in->FileName(), line.as_string());
}
} catch (EndOfFileException& e) {
m_in.reset();
}
}
/** iterator_facade hook for operator++: advances by reading the next record. */
void ScoreDataIterator::increment() {
readNext();
}
/**
 * iterator_facade hook for operator== / operator!=.
 *
 * Two iterators without an open file compare equal (both are "end"); an
 * iterator with a file never equals one without. Otherwise they are equal
 * when they refer to the same file name at the same offset.
 */
bool ScoreDataIterator::equal(const ScoreDataIterator& rhs) const {
  const bool thisAtEnd = !m_in;
  const bool otherAtEnd = !rhs.m_in;
  if (thisAtEnd || otherAtEnd) {
    // Equal only if both sides have run out of input.
    return thisAtEnd && otherAtEnd;
  }
  const bool sameFile = m_in->FileName() == rhs.m_in->FileName();
  return sameFile && m_in->Offset() == rhs.m_in->Offset();
}
/** iterator_facade hook for operator*: returns the record most recently loaded by readNext(). */
const vector<ScoreDataItem>& ScoreDataIterator::dereference() const {
return m_next;
}

67
mert/ScoreDataIterator.h Normal file
View File

@ -0,0 +1,67 @@
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2011- University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#ifndef _SCORE_DATA_ITERATOR_
#define _SCORE_DATA_ITERATOR_
/*
* For loading from the score data file.
**/
#include <vector>
#include <boost/iterator/iterator_facade.hpp>
#include <boost/shared_ptr.hpp>
#include "util/file_piece.hh"
#include "util/string_piece.hh"
#include "FeatureDataIterator.h"
/** The scores parsed from one line of a score data file. */
typedef std::vector<float> ScoreDataItem;
/**
 * Forward iterator over the records of a score data file. Dereferencing
 * yields the ScoreDataItems (one per score line) of the current record.
 */
class ScoreDataIterator :
public boost::iterator_facade<ScoreDataIterator,
const std::vector<ScoreDataItem>,
boost::forward_traversal_tag>
{
public:
/** Creates an iterator with no file attached; this is the end iterator. */
ScoreDataIterator();
/** Opens filename and loads its first record. */
ScoreDataIterator(const std::string& filename);
/** Returns the end iterator to compare against when looping over a file. */
static ScoreDataIterator end() {
return ScoreDataIterator();
}
private:
// iterator_facade reaches the three hooks below through iterator_core_access.
friend class boost::iterator_core_access;
void increment();
bool equal(const ScoreDataIterator& rhs) const;
const std::vector<ScoreDataItem>& dereference() const;
// Loads the next record into m_next; releases m_in at end of file.
void readNext();
// Input file; empty for the end iterator.
boost::shared_ptr<util::FilePiece> m_in;
// The record the iterator currently points at.
std::vector<ScoreDataItem> m_next;
};
#endif

View File

@ -36,6 +36,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <boost/program_options.hpp>
#include "FeatureDataIterator.h"
#include "ScoreDataIterator.h"
using namespace std;
@ -80,6 +81,29 @@ int main(int argc, char** argv)
//cerr << featureFiles[0] << endl;
for (; fi != FeatureDataIterator::end(); ++fi) {
const vector<FeatureDataItem>& featureData = *fi;
cerr << "Read " << featureData.size() << " items " << endl;
for (size_t i = 0; i < featureData.size(); ++i) {
cerr << "Dense: ";
for (size_t j = 0; j < featureData[i].dense.size(); ++j) {
cerr << featureData[i].dense[j] << " ";
}
cerr << "\n";
}
cerr << "\n";
}
ScoreDataIterator si(scoreFiles[0]);
for (; si != ScoreDataIterator::end(); ++si) {
const vector<ScoreDataItem>& scoreData = *si;
cerr << "Read " << scoreData.size() << " items " << endl;
for (size_t i = 0; i < scoreData.size(); ++i) {
cerr << "SD: ";
for (size_t j = 0; j < scoreData[i].size(); ++j) {
cerr << scoreData[i][j] << " ";
}
cerr << "\n";
}
cerr << "\n";
}
}