mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-10-26 11:28:48 +03:00
Finish and test feature and score data iterators.
This commit is contained in:
parent
3a6c0e0680
commit
0a2e0f44a6
@ -1,6 +1,3 @@
|
||||
// $Id$
|
||||
// vim:tabstop=2
|
||||
|
||||
/***********************************************************************
|
||||
Moses - factored phrase-based language decoder
|
||||
Copyright (C) 2011- University of Edinburgh
|
||||
@ -22,7 +19,6 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
|
||||
#include "util/string_piece.hh"
|
||||
#include "util/tokenize_piece.hh"
|
||||
|
||||
#include "FeatureArray.h"
|
||||
@ -32,16 +28,36 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
using namespace std;
|
||||
using namespace util;
|
||||
|
||||
int ParseInt(const StringPiece& str ) {
|
||||
char* errIndex;
|
||||
//could wrap?
|
||||
int value = static_cast<int>(strtol(str.data(), &errIndex,10));
|
||||
if (errIndex == str.data()) {
|
||||
throw util::ParseNumberException(str);
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
float ParseFloat(const StringPiece& str) {
|
||||
char* errIndex;
|
||||
float value = static_cast<float>(strtod(str.data(), &errIndex));
|
||||
if (errIndex == str.data()) {
|
||||
throw util::ParseNumberException(str);
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// Default construction opens no file; FeatureDataIterator::end() returns
// exactly such an object.
FeatureDataIterator::FeatureDataIterator()
{
}
|
||||
|
||||
FeatureDataIterator::FeatureDataIterator(const string filename) {
|
||||
// Opens the feature file and immediately reads the first record, so the
// iterator can be dereferenced straight after construction.
FeatureDataIterator::FeatureDataIterator(const string& filename)
{
  const char* const path = filename.c_str();
  m_in.reset(new FilePiece(path));
  readNext();
}
|
||||
|
||||
void FeatureDataIterator::readNext() {
|
||||
m_next.clear();
|
||||
try {
|
||||
StringPiece marker = m_in->ReadDelimited();
|
||||
if (marker != StringPiece(FEATURES_TXT_BEGIN)) {
|
||||
@ -49,20 +65,30 @@ void FeatureDataIterator::readNext() {
|
||||
}
|
||||
size_t sentenceId = m_in->ReadULong();
|
||||
size_t count = m_in->ReadULong();
|
||||
cerr << "Expecting " << count << endl;
|
||||
size_t length = m_in->ReadULong();
|
||||
m_in->ReadLine(); //discard rest of line
|
||||
for (size_t i = 0; i < count; ++i) {
|
||||
StringPiece line = m_in->ReadLine();
|
||||
for (util::TokenIter<util::AnyCharacter, true> token(line, util::AnyCharacter(" \t")); token; ++token) {
|
||||
//TODO: Create FeatureDataItem
|
||||
char* err_ind;
|
||||
float value = static_cast<float>(strtod(token->data(), &err_ind));
|
||||
if (err_ind == token->data()) {
|
||||
throw FileFormatException(m_in->FileName(), line.as_string());
|
||||
m_next.push_back(FeatureDataItem());
|
||||
for (TokenIter<AnyCharacter, true> token(line, AnyCharacter(" \t")); token; ++token) {
|
||||
TokenIter<AnyCharacter,false> value(*token,AnyCharacter(":"));
|
||||
if (!value) throw FileFormatException(m_in->FileName(), line.as_string());
|
||||
StringPiece first = *value;
|
||||
++value;
|
||||
if (!value) {
|
||||
//regular feature
|
||||
float floatValue = ParseFloat(first);
|
||||
m_next.back().dense.push_back(floatValue);
|
||||
} else {
|
||||
//sparse feature
|
||||
StringPiece second = *value;
|
||||
float floatValue = ParseFloat(second);
|
||||
m_next.back().sparse.set(first.as_string(),floatValue);
|
||||
}
|
||||
cerr << value << ",";
|
||||
}
|
||||
cerr << "\n";
|
||||
if (length != m_next.back().dense.size()) {
|
||||
throw FileFormatException(m_in->FileName(), line.as_string());
|
||||
}
|
||||
}
|
||||
StringPiece line = m_in->ReadLine();
|
||||
if (line != StringPiece(FEATURES_TXT_END)) {
|
||||
|
@ -1,6 +1,3 @@
|
||||
// $Id$
|
||||
// vim:tabstop=2
|
||||
|
||||
/***********************************************************************
|
||||
Moses - factored phrase-based language decoder
|
||||
Copyright (C) 2011- University of Edinburgh
|
||||
@ -29,7 +26,6 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
|
||||
#include <fstream>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <stdexcept>
|
||||
#include <vector>
|
||||
|
||||
@ -37,23 +33,34 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
#include <boost/shared_ptr.hpp>
|
||||
|
||||
#include "util/file_piece.hh"
|
||||
#include "util/string_piece.hh"
|
||||
|
||||
#include "FeatureStats.h"
|
||||
|
||||
|
||||
class FeatureDataItem {
|
||||
public:
|
||||
std::vector<float> dense;
|
||||
SparseVector sparse;
|
||||
};
|
||||
|
||||
class FileFormatException : public util::Exception {
|
||||
class FileFormatException : public util::Exception
|
||||
{
|
||||
public:
|
||||
explicit FileFormatException(const std::string filename, const std::string& line) {
|
||||
*this << "Error in line \"" << line << "\" of " << filename;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/** Assumes a delimiter, so only apply to tokens */
|
||||
int ParseInt(const StringPiece& str );
|
||||
|
||||
/** Assumes a delimiter, so only apply to tokens */
|
||||
float ParseFloat(const StringPiece& str);
|
||||
|
||||
|
||||
class FeatureDataItem
|
||||
{
|
||||
public:
|
||||
std::vector<float> dense;
|
||||
SparseVector sparse;
|
||||
};
|
||||
|
||||
class FeatureDataIterator :
|
||||
public boost::iterator_facade<FeatureDataIterator,
|
||||
const std::vector<FeatureDataItem>,
|
||||
@ -61,7 +68,7 @@ class FeatureDataIterator :
|
||||
{
|
||||
public:
|
||||
FeatureDataIterator();
|
||||
FeatureDataIterator(const std::string filename);
|
||||
FeatureDataIterator(const std::string& filename);
|
||||
|
||||
static FeatureDataIterator end() {
|
||||
return FeatureDataIterator();
|
||||
|
@ -6,6 +6,7 @@ libmert_la_SOURCES = \
|
||||
Util.cpp \
|
||||
Timer.cpp \
|
||||
ScoreStats.cpp ScoreArray.cpp ScoreData.cpp \
|
||||
ScoreDataIterator.cpp \
|
||||
FeatureStats.cpp FeatureArray.cpp FeatureData.cpp \
|
||||
FeatureDataIterator.cpp \
|
||||
Data.cpp \
|
||||
@ -38,5 +39,5 @@ extractor_LDADD = libmert.la -lm -lz
|
||||
mert_LDADD = libmert.la -lm -lz $(BOOST_THREAD_LDFLAGS) $(BOOST_THREAD_LIBS)
|
||||
evaluator_LDADD = libmert.la -lm -lz
|
||||
pro_LDADD = libmert.la @KENLM_LDFLAGS@ $(BOOST_LDFLAGS) $(BOOST_PROGRAM_OPTIONS_LDFLAGS) $(BOOST_PROGRAM_OPTIONS_LIBS)
|
||||
pro_DEPENDENCIES = $(top_srcdir)/kenlm/libkenlm.la
|
||||
pro_DEPENDENCIES = $(top_srcdir)/kenlm/libkenlm.la libmert.la
|
||||
|
||||
|
90
mert/ScoreDataIterator.cpp
Normal file
90
mert/ScoreDataIterator.cpp
Normal file
@ -0,0 +1,90 @@
|
||||
/***********************************************************************
|
||||
Moses - factored phrase-based language decoder
|
||||
Copyright (C) 2011- University of Edinburgh
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
This library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
***********************************************************************/
|
||||
#include <iostream>
|
||||
|
||||
#include "util/tokenize_piece.hh"
|
||||
|
||||
#include "ScoreArray.h"
|
||||
#include "ScoreDataIterator.h"
|
||||
|
||||
using namespace std;
|
||||
using namespace util;
|
||||
|
||||
// Default construction opens no file; ScoreDataIterator::end() returns
// exactly such an object.
ScoreDataIterator::ScoreDataIterator()
{
}
|
||||
|
||||
// Opens the score file and immediately reads the first record, so the
// iterator can be dereferenced straight after construction.
ScoreDataIterator::ScoreDataIterator(const string& filename)
  : m_in(new FilePiece(filename.c_str()))
{
  readNext();
}
|
||||
|
||||
void ScoreDataIterator::readNext() {
|
||||
m_next.clear();
|
||||
try {
|
||||
StringPiece marker = m_in->ReadDelimited();
|
||||
if (marker != StringPiece(SCORES_TXT_BEGIN)) {
|
||||
throw FileFormatException(m_in->FileName(), marker.as_string());
|
||||
}
|
||||
size_t sentenceId = m_in->ReadULong();
|
||||
size_t count = m_in->ReadULong();
|
||||
size_t length = m_in->ReadULong();
|
||||
m_in->ReadLine(); //ignore rest of line
|
||||
for (size_t i = 0; i < count; ++i) {
|
||||
StringPiece line = m_in->ReadLine();
|
||||
cerr << line << endl;
|
||||
m_next.push_back(ScoreDataItem());
|
||||
for (TokenIter<AnyCharacter, true> token(line,AnyCharacter(" \t")); token; ++token) {
|
||||
float value = ParseFloat(*token);
|
||||
m_next.back().push_back(value);
|
||||
}
|
||||
if (length != m_next.back().size()) {
|
||||
throw FileFormatException(m_in->FileName(), line.as_string());
|
||||
}
|
||||
}
|
||||
StringPiece line = m_in->ReadLine();
|
||||
if (line != StringPiece(SCORES_TXT_END)) {
|
||||
throw FileFormatException(m_in->FileName(), line.as_string());
|
||||
}
|
||||
} catch (EndOfFileException& e) {
|
||||
m_in.reset();
|
||||
}
|
||||
}
|
||||
|
||||
// iterator_facade hook for operator++: advancing means loading the next
// record from the file.
void ScoreDataIterator::increment()
{
  readNext();
}
|
||||
|
||||
|
||||
// iterator_facade hook for operator== / operator!=.
// Two iterators without an open file are equal; an iterator with a file never
// equals one without; otherwise they are equal when they refer to the same
// file name at the same offset.
bool ScoreDataIterator::equal(const ScoreDataIterator& rhs) const
{
  const bool lhsClosed = !m_in;
  const bool rhsClosed = !rhs.m_in;
  if (lhsClosed || rhsClosed) {
    return lhsClosed && rhsClosed;
  }
  const bool sameFile = m_in->FileName() == rhs.m_in->FileName();
  return sameFile && m_in->Offset() == rhs.m_in->Offset();
}
|
||||
|
||||
|
||||
// iterator_facade hook for operator*: exposes the record most recently
// loaded by readNext().
const vector<ScoreDataItem>& ScoreDataIterator::dereference() const
{
  return m_next;
}
|
||||
|
67
mert/ScoreDataIterator.h
Normal file
67
mert/ScoreDataIterator.h
Normal file
@ -0,0 +1,67 @@
|
||||
/***********************************************************************
|
||||
Moses - factored phrase-based language decoder
|
||||
Copyright (C) 2011- University of Edinburgh
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
This library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
***********************************************************************/
|
||||
|
||||
#ifndef _SCORE_DATA_ITERATOR_
|
||||
#define _SCORE_DATA_ITERATOR_
|
||||
|
||||
/*
|
||||
* For loading from the score data file.
|
||||
**/
|
||||
#include <vector>
|
||||
|
||||
|
||||
#include <boost/iterator/iterator_facade.hpp>
|
||||
#include <boost/shared_ptr.hpp>
|
||||
|
||||
#include "util/file_piece.hh"
|
||||
#include "util/string_piece.hh"
|
||||
|
||||
#include "FeatureDataIterator.h"
|
||||
|
||||
typedef std::vector<float> ScoreDataItem;
|
||||
|
||||
class ScoreDataIterator :
|
||||
public boost::iterator_facade<ScoreDataIterator,
|
||||
const std::vector<ScoreDataItem>,
|
||||
boost::forward_traversal_tag>
|
||||
{
|
||||
public:
|
||||
ScoreDataIterator();
|
||||
ScoreDataIterator(const std::string& filename);
|
||||
|
||||
static ScoreDataIterator end() {
|
||||
return ScoreDataIterator();
|
||||
}
|
||||
|
||||
private:
|
||||
friend class boost::iterator_core_access;
|
||||
|
||||
void increment();
|
||||
bool equal(const ScoreDataIterator& rhs) const;
|
||||
const std::vector<ScoreDataItem>& dereference() const;
|
||||
|
||||
void readNext();
|
||||
|
||||
boost::shared_ptr<util::FilePiece> m_in;
|
||||
std::vector<ScoreDataItem> m_next;
|
||||
};
|
||||
|
||||
|
||||
#endif
|
||||
|
24
mert/pro.cpp
24
mert/pro.cpp
@ -36,6 +36,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
#include <boost/program_options.hpp>
|
||||
|
||||
#include "FeatureDataIterator.h"
|
||||
#include "ScoreDataIterator.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
@ -80,6 +81,29 @@ int main(int argc, char** argv)
|
||||
//cerr << featureFiles[0] << endl;
|
||||
for (; fi != FeatureDataIterator::end(); ++fi) {
|
||||
const vector<FeatureDataItem>& featureData = *fi;
|
||||
cerr << "Read " << featureData.size() << " items " << endl;
|
||||
for (size_t i = 0; i < featureData.size(); ++i) {
|
||||
cerr << "Dense: ";
|
||||
for (size_t j = 0; j < featureData[i].dense.size(); ++j) {
|
||||
cerr << featureData[i].dense[j] << " ";
|
||||
}
|
||||
cerr << "\n";
|
||||
}
|
||||
cerr << "\n";
|
||||
}
|
||||
|
||||
ScoreDataIterator si(scoreFiles[0]);
|
||||
for (; si != ScoreDataIterator::end(); ++si) {
|
||||
const vector<ScoreDataItem>& scoreData = *si;
|
||||
cerr << "Read " << scoreData.size() << " items " << endl;
|
||||
for (size_t i = 0; i < scoreData.size(); ++i) {
|
||||
cerr << "SD: ";
|
||||
for (size_t j = 0; j < scoreData[i].size(); ++j) {
|
||||
cerr << scoreData[i][j] << " ";
|
||||
}
|
||||
cerr << "\n";
|
||||
}
|
||||
cerr << "\n";
|
||||
}
|
||||
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user