Use util::TokenIter to tokenize n-best lists.

Also reduce the creation of std::string objects. In both the ScoreArray
and FeatureArray classes, the private member that tracks the sentence
index (namely, "m_index") was unnecessarily declared as std::string;
it is now declared directly as 'int'.
This commit is contained in:
Tetsuo Kiso 2012-12-07 01:39:22 +09:00
parent cd3fb3b831
commit 38e145e556
14 changed files with 72 additions and 79 deletions

View File

@ -17,6 +17,10 @@
#include "Util.h"
#include "util/check.hh"
#include "util/tokenize_piece.hh"
#include "util/string_piece.hh"
#include "FeatureDataIterator.h"
using namespace std;
namespace MosesTuning
@ -137,24 +141,32 @@ void Data::loadNBest(const string &file)
throw runtime_error("Unable to open: " + file);
ScoreStats scoreentry;
string line, sentence_index, sentence, feature_str, alignment;
string line, sentence, feature_str, alignment;
int sentence_index;
while (getline(inp, line, '\n')) {
if (line.empty()) continue;
// adding statistics for error measures
scoreentry.clear();
getNextPound(line, sentence_index, "|||"); // first field
getNextPound(line, sentence, "|||"); // second field
getNextPound(line, feature_str, "|||"); // third field
util::TokenIter<util::MultiCharacter> it(line, util::MultiCharacter("|||"));
if (line.length() > 0) {
string temp;
getNextPound(line, temp, "|||"); //fourth field sentence score
if (line.length() > 0) {
getNextPound(line, alignment, "|||"); //fifth field (if present) is either phrase or word alignment
if (line.length() > 0) {
getNextPound(line, alignment, "|||"); //sixth field (if present) is word alignment
sentence_index = ParseInt(*it);
++it;
sentence = it->as_string();
++it;
feature_str = it->as_string();
++it;
if (it) {
++it; // skip model score.
if (it) {
++it;
alignment = it->as_string(); //fifth field (if present) is either phrase or word alignment
if (it) {
++it;
alignment = it->as_string(); //sixth field (if present) is word alignment
}
}
}
@ -216,7 +228,7 @@ void Data::InitFeatureMap(const string& str) {
}
void Data::AddFeatures(const string& str,
const string& sentence_index) {
int sentence_index) {
string buf = str;
string substr;
FeatureStats feature_entry;

View File

@ -18,7 +18,7 @@
namespace MosesTuning
{
class Scorer;
typedef boost::shared_ptr<ScoreData> ScoreDataHandle;
@ -91,7 +91,7 @@ public:
// Helper functions for loadnbest();
void InitFeatureMap(const std::string& str);
void AddFeatures(const std::string& str,
const std::string& sentence_index);
int sentence_index);
};
}

View File

@ -15,14 +15,14 @@ BOOST_AUTO_TEST_CASE(shard_basic) {
Data data(scorer.get());
FeatureArray fa1, fa2, fa3, fa4;
ScoreArray sa1, sa2, sa3, sa4;
fa1.setIndex("1");
fa2.setIndex("2");
fa3.setIndex("3");
fa4.setIndex("4");
sa1.setIndex("1");
sa2.setIndex("2");
sa3.setIndex("3");
sa4.setIndex("4");
fa1.setIndex(1);
fa2.setIndex(2);
fa3.setIndex(3);
fa4.setIndex(4);
sa1.setIndex(1);
sa2.setIndex(2);
sa3.setIndex(3);
sa4.setIndex(4);
data.getFeatureData()->add(fa1);
data.getFeatureData()->add(fa2);
data.getFeatureData()->add(fa3);

View File

@ -16,10 +16,10 @@ using namespace std;
namespace MosesTuning
{
FeatureArray::FeatureArray()
: m_index(""), m_num_features(0){}
: m_index(0), m_num_features(0){}
FeatureArray::~FeatureArray() {}
@ -115,7 +115,7 @@ void FeatureArray::load(istream* is, const SparseVector& sparseWeights)
}
getNextPound(stringBuf, substring);
getNextPound(stringBuf, substring);
m_index = substring;
m_index = atoi(substring.c_str());
getNextPound(stringBuf, substring);
number_of_entries = atoi(substring.c_str());
getNextPound(stringBuf, substring);
@ -160,4 +160,3 @@ bool FeatureArray::check_consistency() const
}
}

View File

@ -15,7 +15,7 @@
namespace MosesTuning
{
const char FEATURES_TXT_BEGIN[] = "FEATURES_TXT_BEGIN_0";
const char FEATURES_TXT_END[] = "FEATURES_TXT_END_0";
@ -27,8 +27,7 @@ class FeatureArray
private:
// idx to identify the utterance. It can differ from
// the index inside the vector.
std::string m_index;
int m_index;
featarray_t m_array;
std::size_t m_num_features;
std::string m_features;
@ -40,8 +39,8 @@ public:
void clear() { m_array.clear(); }
std::string getIndex() const { return m_index; }
void setIndex(const std::string& value) { m_index = value; }
int getIndex() const { return m_index; }
void setIndex(const int value) { m_index = value; }
FeatureStats& get(std::size_t i) { return m_array.at(i); }
const FeatureStats& get(std::size_t i) const { return m_array.at(i); }

View File

@ -16,7 +16,7 @@ using namespace std;
namespace MosesTuning
{
FeatureData::FeatureData()
@ -90,7 +90,7 @@ void FeatureData::add(FeatureArray& e)
}
}
void FeatureData::add(FeatureStats& e, const string& sent_idx)
void FeatureData::add(FeatureStats& e, int sent_idx)
{
if (exists(sent_idx)) { // array at position e.getIndex() already exists
//enlarge array at position e.getIndex()
@ -168,4 +168,3 @@ string FeatureData::ToString() const {
}
}

View File

@ -16,7 +16,7 @@
namespace MosesTuning
{
class FeatureData
{
@ -35,9 +35,6 @@ public:
void clear() { m_array.clear(); }
FeatureArray get(const std::string& idx) {
return m_array.at(getIndex(idx));
}
FeatureArray& get(size_t idx) {
return m_array.at(idx);
}
@ -45,11 +42,11 @@ public:
return m_array.at(idx);
}
inline bool exists(const std::string& sent_idx) const {
return exists(getIndex(sent_idx));
inline bool exists(int sent_idx) const {
return existsInternal(getIndex(sent_idx));
}
inline bool exists(int sent_idx) const {
inline bool existsInternal(int sent_idx) const {
return (sent_idx > -1 && sent_idx < static_cast<int>(m_array.size())) ? true : false;
}
@ -62,7 +59,7 @@ public:
}
void add(FeatureArray& e);
void add(FeatureStats& e, const std::string& sent_idx);
void add(FeatureStats& e, int sent_idx);
std::size_t size() const { return m_array.size(); }
@ -83,7 +80,7 @@ public:
void setIndex();
inline int getIndex(const std::string& idx) const {
inline int getIndex(int idx) const {
name2idx::const_iterator i = m_array_name_to_index.find(idx);
if (i != m_array_name_to_index.end())
return i->second;
@ -91,7 +88,7 @@ public:
return -1;
}
inline std::string getIndex(std::size_t idx) const {
inline int getName(std::size_t idx) const {
idx2name::const_iterator i = m_index_to_array_name.find(idx);
if (i != m_index_to_array_name.end())
throw std::runtime_error("there is no entry at index " + idx);

View File

@ -6,7 +6,7 @@ using namespace std;
namespace MosesTuning
{
// TODO: This is too long. Consider creating a function for
// initialization such as Init().
@ -88,10 +88,6 @@ void InterpolatedScorer::setScoreData(ScoreData* data)
for (size_t i = 0; i < data->size(); i++) {
ScoreArray scoreArray = data->get(i);
ScoreArray newScoreArray;
std::string istr;
std::stringstream out;
out << i;
istr = out.str();
size_t numNBest = scoreArray.size();
//cout << " Datasize " << data->size() << " NumNBest " << numNBest << endl ;
for (size_t j = 0; j < numNBest ; j++) {
@ -105,7 +101,7 @@ void InterpolatedScorer::setScoreData(ScoreData* data)
//cout << " last " << last << " NumScores " << numScoresScorer << "newScorestats " << newScoreStats << endl;
newScoreArray.add(newScoreStats);
}
newScoreArray.setIndex(istr);
newScoreArray.setIndex(i);
newData->add(newScoreArray);
}
//newData->dump();
@ -224,4 +220,3 @@ void InterpolatedScorer::setFilter(const string& filterCommand)
}
}

View File

@ -14,10 +14,10 @@ using namespace std;
namespace MosesTuning
{
ScoreArray::ScoreArray()
: m_num_scores(0), m_index("") {}
: m_num_scores(0), m_index(0) {}
void ScoreArray::savetxt(ostream* os, const string& sctype)
{
@ -109,7 +109,7 @@ void ScoreArray::load(istream* is)
}
getNextPound(stringBuf, substring);
getNextPound(stringBuf, substring);
m_index = substring;
m_index = atoi(substring.c_str());
getNextPound(stringBuf, substring);
number_of_entries = atoi(substring.c_str());
getNextPound(stringBuf, substring);
@ -166,4 +166,3 @@ bool ScoreArray::check_consistency() const
}
}

View File

@ -17,7 +17,7 @@
namespace MosesTuning
{
const char SCORES_TXT_BEGIN[] = "SCORES_TXT_BEGIN_0";
const char SCORES_TXT_END[] = "SCORES_TXT_END_0";
const char SCORES_BIN_BEGIN[] = "SCORES_BIN_BEGIN_0";
@ -32,7 +32,7 @@ class ScoreArray
// indexx to identify the utterance.
// It can differ from the index inside the vector.
std::string m_index;
int m_index;
public:
ScoreArray();
@ -40,9 +40,9 @@ public:
void clear() { m_array.clear(); }
std::string getIndex() const { return m_index; }
int getIndex() const { return m_index; }
void setIndex(const std::string& value) { m_index = value; }
void setIndex(int value) { m_index = value; }
ScoreStats& get(std::size_t i) { return m_array.at(i); }

View File

@ -18,7 +18,7 @@ using namespace std;
namespace MosesTuning
{
ScoreData::ScoreData(Scorer* scorer) :
m_scorer(scorer)
@ -95,7 +95,7 @@ void ScoreData::add(ScoreArray& e)
}
}
void ScoreData::add(const ScoreStats& e, const string& sent_idx)
void ScoreData::add(const ScoreStats& e, int sent_idx)
{
if (exists(sent_idx)) { // array at position e.getIndex() already exists
// Enlarge array at position e.getIndex()
@ -139,4 +139,3 @@ void ScoreData::setIndex()
}
}

View File

@ -18,7 +18,7 @@
namespace MosesTuning
{
class Scorer;
@ -42,10 +42,6 @@ public:
void clear() { m_array.clear(); }
inline ScoreArray get(const std::string& idx) {
return m_array.at(getIndex(idx));
}
inline ScoreArray& get(std::size_t idx) {
return m_array.at(idx);
}
@ -54,11 +50,11 @@ public:
return m_array.at(idx);
}
inline bool exists(const std::string& sent_idx) const {
return exists(getIndex(sent_idx));
inline bool exists(int sent_idx) const {
return existsInternal(getIndex(sent_idx));
}
inline bool exists(int sent_idx) const {
inline bool existsInternal(int sent_idx) const {
return (sent_idx > -1 && sent_idx < static_cast<int>(m_array.size())) ? true : false;
}
@ -77,7 +73,7 @@ public:
}
void add(ScoreArray& e);
void add(const ScoreStats& e, const std::string& sent_idx);
void add(const ScoreStats& e, int sent_idx);
std::size_t NumberOfScores() const { return m_num_scores; }
std::size_t size() const { return m_array.size(); }
@ -93,7 +89,7 @@ public:
void setIndex();
inline int getIndex(const std::string& idx) const {
inline int getIndex(const int idx) const {
name2idx::const_iterator i = m_array_name_to_index.find(idx);
if (i != m_array_name_to_index.end())
return i->second;
@ -101,7 +97,7 @@ public:
return -1;
}
inline std::string getIndex(std::size_t idx) const {
inline int getName(std::size_t idx) const {
idx2name::const_iterator i = m_index_to_array_name.find(idx);
if (i != m_index_to_array_name.end())
throw std::runtime_error("there is no entry at index " + idx);

View File

@ -39,8 +39,8 @@ typedef ScoreStatsType* scorestats_t;
typedef std::vector<ScoreStats> scorearray_t;
typedef std::vector<ScoreArray> scoredata_t;
typedef std::map<std::size_t, std::string> idx2name;
typedef std::map<std::string, std::size_t> name2idx;
typedef std::map<std::size_t, int> idx2name;
typedef std::map<int, std::size_t> name2idx;
typedef enum { HAMMING_DISTANCE=0, KENDALL_DISTANCE } distanceMetric_t;
typedef enum { REFERENCE_CHOICE_AVERAGE=0, REFERENCE_CHOICE_CLOSEST } distanceMetricReferenceChoice_t;

View File

@ -60,8 +60,7 @@ void EvaluatorUtil::evaluate(const string& candFile, int bootstrap)
for (int j = 0; j < n; ++j)
{
int randomIndex = random() % n;
string str_j = int2string(j);
scoredata->add(entries[randomIndex], str_j);
scoredata->add(entries[randomIndex], j);
}
g_scorer->setScoreData(scoredata);
candidates_t candidates(n, 0);
@ -93,8 +92,7 @@ void EvaluatorUtil::evaluate(const string& candFile, int bootstrap)
ScoreData* scoredata = new ScoreData(g_scorer);
for (int sid = 0; sid < n; ++sid)
{
string str_sid = int2string(sid);
scoredata->add(entries[sid], str_sid);
scoredata->add(entries[sid], sid);
}
g_scorer->setScoreData(scoredata);
candidates_t candidates(n, 0);