Merge branch 'master' of github.com:moses-smt/mosesdecoder

Kenneth Heafield 2012-03-20 10:45:40 -04:00
commit ee580d22be
71 changed files with 2186 additions and 1542 deletions


@ -66,8 +66,6 @@
1EBA458F14B97E92003CC0EA /* Jamfile in Sources */ = {isa = PBXBuildFile; fileRef = 1EBA454814B97E92003CC0EA /* Jamfile */; };
1EBA459014B97E92003CC0EA /* joint_sort_test.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1EBA454914B97E92003CC0EA /* joint_sort_test.cc */; };
1EBA459114B97E92003CC0EA /* joint_sort.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1EBA454A14B97E92003CC0EA /* joint_sort.hh */; };
1EBA459214B97E92003CC0EA /* key_value_packing_test.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1EBA454B14B97E92003CC0EA /* key_value_packing_test.cc */; };
1EBA459314B97E92003CC0EA /* key_value_packing.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1EBA454C14B97E92003CC0EA /* key_value_packing.hh */; };
1EBA459414B97E92003CC0EA /* mmap.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1EBA454E14B97E92003CC0EA /* mmap.cc */; };
1EBA459514B97E92003CC0EA /* mmap.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1EBA454F14B97E92003CC0EA /* mmap.hh */; };
1EBA459614B97E92003CC0EA /* murmur_hash.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1EBA455014B97E92003CC0EA /* murmur_hash.cc */; };
@ -165,8 +163,6 @@
1EBA454814B97E92003CC0EA /* Jamfile */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.jam; name = Jamfile; path = ../../util/Jamfile; sourceTree = "<group>"; };
1EBA454914B97E92003CC0EA /* joint_sort_test.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = joint_sort_test.cc; path = ../../util/joint_sort_test.cc; sourceTree = "<group>"; };
1EBA454A14B97E92003CC0EA /* joint_sort.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = joint_sort.hh; path = ../../util/joint_sort.hh; sourceTree = "<group>"; };
1EBA454B14B97E92003CC0EA /* key_value_packing_test.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = key_value_packing_test.cc; path = ../../util/key_value_packing_test.cc; sourceTree = "<group>"; };
1EBA454C14B97E92003CC0EA /* key_value_packing.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = key_value_packing.hh; path = ../../util/key_value_packing.hh; sourceTree = "<group>"; };
1EBA454D14B97E92003CC0EA /* LICENSE */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; name = LICENSE; path = ../../util/LICENSE; sourceTree = "<group>"; };
1EBA454E14B97E92003CC0EA /* mmap.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = mmap.cc; path = ../../util/mmap.cc; sourceTree = "<group>"; };
1EBA454F14B97E92003CC0EA /* mmap.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = mmap.hh; path = ../../util/mmap.hh; sourceTree = "<group>"; };
@ -277,8 +273,6 @@
1EBA454814B97E92003CC0EA /* Jamfile */,
1EBA454914B97E92003CC0EA /* joint_sort_test.cc */,
1EBA454A14B97E92003CC0EA /* joint_sort.hh */,
1EBA454B14B97E92003CC0EA /* key_value_packing_test.cc */,
1EBA454C14B97E92003CC0EA /* key_value_packing.hh */,
1EBA454D14B97E92003CC0EA /* LICENSE */,
1EBA454E14B97E92003CC0EA /* mmap.cc */,
1EBA454F14B97E92003CC0EA /* mmap.hh */,
@ -363,7 +357,6 @@
1EBA458D14B97E92003CC0EA /* getopt.hh in Headers */,
1EBA458E14B97E92003CC0EA /* have.hh in Headers */,
1EBA459114B97E92003CC0EA /* joint_sort.hh in Headers */,
1EBA459314B97E92003CC0EA /* key_value_packing.hh in Headers */,
1EBA459514B97E92003CC0EA /* mmap.hh in Headers */,
1EBA459714B97E92003CC0EA /* murmur_hash.hh in Headers */,
1EBA459914B97E92003CC0EA /* probing_hash_table.hh in Headers */,
@ -466,7 +459,6 @@
1EBA458C14B97E92003CC0EA /* getopt.c in Sources */,
1EBA458F14B97E92003CC0EA /* Jamfile in Sources */,
1EBA459014B97E92003CC0EA /* joint_sort_test.cc in Sources */,
1EBA459214B97E92003CC0EA /* key_value_packing_test.cc in Sources */,
1EBA459414B97E92003CC0EA /* mmap.cc in Sources */,
1EBA459614B97E92003CC0EA /* murmur_hash.cc in Sources */,
1EBA459814B97E92003CC0EA /* probing_hash_table_test.cc in Sources */,


@ -112,7 +112,7 @@ inline size_t hash_value(const ChartState &state) {
size_t hashes[2];
hashes[0] = hash_value(state.left);
hashes[1] = hash_value(state.right);
return util::MurmurHashNative(hashes, sizeof(size_t), state.full);
return util::MurmurHashNative(hashes, sizeof(size_t) * 2, state.full);
}
template <class M> class RuleScore {
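An illustrative sketch (not part of the diff) of what the length argument to util::MurmurHashNative covers in the hash_value() call above, assuming a 64-bit build where sizeof(size_t) is 8 bytes:

  size_t hashes[2];                  // hash of the left state, hash of the right state
  // MurmurHashNative(data, length_in_bytes, seed):
  //   length sizeof(size_t)      ->  8 bytes, i.e. hashes[0] only
  //   length sizeof(size_t) * 2  -> 16 bytes, i.e. both hashes[0] and hashes[1]
  //   (sizeof(hashes) would express the same 16 bytes without the explicit factor)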


@ -7,6 +7,7 @@
#include <vector>
#include <ctype.h>
#include <math.h>
#include <string.h>
#include <stdint.h>
@ -93,7 +94,11 @@ void ReadBackoff(util::FilePiece &in, ProbBackoff &weights) {
case '\t':
weights.backoff = in.ReadFloat();
if (weights.backoff == ngram::kExtensionBackoff) weights.backoff = ngram::kNoExtensionBackoff;
if ((in.get() != '\n')) UTIL_THROW(FormatLoadException, "Expected newline after backoff");
{
int float_class = fpclassify(weights.backoff);
UTIL_THROW_IF(float_class == FP_NAN || float_class == FP_INFINITE, FormatLoadException, "Bad backoff " << weights.backoff);
}
UTIL_THROW_IF((in.get() != '\n'), FormatLoadException, "Expected newline after backoff");
break;
case '\n':
weights.backoff = ngram::kNoExtensionBackoff;
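A minimal standalone sketch (not Moses code) of the fpclassify() check used above; it assumes a toolchain where <math.h> exposes the C99 classification macros, which is what the include added in this file relies on:

  #include <math.h>
  #include <stdio.h>

  int main() {
    float good = -0.25f;       /* a plausible ARPA backoff weight */
    float bad = logf(0.0f);    /* -inf, the kind of value the check rejects */
    printf("%d\n", fpclassify(good) == FP_NORMAL);    /* prints 1 */
    printf("%d\n", fpclassify(bad) == FP_INFINITE);   /* prints 1 */
    return 0;
  }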


@ -10,6 +10,8 @@
#include <iosfwd>
#include <vector>
#include <math.h>
namespace lm {
void ReadARPACounts(util::FilePiece &in, std::vector<uint64_t> &number);
@ -29,20 +31,26 @@ class PositiveProbWarn {
explicit PositiveProbWarn(WarningAction action) : action_(action) {}
void Warn(float prob);
float ReadProb(util::FilePiece &f) {
float prob = f.ReadFloat();
UTIL_THROW_IF(f.get() != '\t', FormatLoadException, "Expected tab after probability");
UTIL_THROW_IF(isnan(prob), FormatLoadException, "NaN probability");
if (prob > 0.0) {
Warn(prob);
prob = 0.0;
}
return prob;
}
private:
void Warn(float prob);
WarningAction action_;
};
template <class Voc> void Read1Gram(util::FilePiece &f, Voc &vocab, ProbBackoff *unigrams, PositiveProbWarn &warn) {
try {
float prob = f.ReadFloat();
if (prob > 0.0) {
warn.Warn(prob);
prob = 0.0;
}
if (f.get() != '\t') UTIL_THROW(FormatLoadException, "Expected tab after probability");
float prob = warn.ReadProb(f);
ProbBackoff &value = unigrams[vocab.Insert(f.ReadDelimited(kARPASpaces))];
value.prob = prob;
ReadBackoff(f, value);
@ -64,11 +72,7 @@ template <class Voc> void Read1Grams(util::FilePiece &f, std::size_t count, Voc
// Return true if a positive log probability came out.
template <class Voc, class Weights> void ReadNGram(util::FilePiece &f, const unsigned char n, const Voc &vocab, WordIndex *const reverse_indices, Weights &weights, PositiveProbWarn &warn) {
try {
weights.prob = f.ReadFloat();
if (weights.prob > 0.0) {
warn.Warn(weights.prob);
weights.prob = 0.0;
}
weights.prob = warn.ReadProb(f);
for (WordIndex *vocab_out = reverse_indices + n - 1; vocab_out >= reverse_indices; --vocab_out) {
*vocab_out = vocab.Index(f.ReadDelimited(kARPASpaces));
}
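For reference, the ARPA lines that ReadProb(), Read1Gram(), and ReadNGram() consume look like the following (fields are tab-separated; probabilities and backoffs are log10 values, and the trailing backoff is absent on highest-order entries). The numbers here are invented for illustration:

  \1-grams:
  -1.0986   the     -0.30103
  -2.3026   house

  \2-grams:
  -0.5108   the house

ReadProb() reads the leading probability and the tab, the vocabulary lookup reads the word(s), and ReadBackoff() then accepts either a tab plus a backoff weight or a bare newline.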


@ -1,12 +1,16 @@
#include "BleuScorer.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <climits>
#include <fstream>
#include <iostream>
#include <stdexcept>
#include "Ngram.h"
#include "Reference.h"
#include "Util.h"
#include "Vocabulary.h"
namespace {
@ -18,74 +22,8 @@ const char REFLEN_CLOSEST[] = "closest";
} // namespace
// A simple STL-map based n-gram counts store.
// Basically, we provide the typical accessors and mutators, but
// we intentionally do not allow erasing elements.
class BleuScorer::NgramCounts {
public:
// Used to construct the ngram map
struct NgramComparator {
bool operator()(const vector<int>& a, const vector<int>& b) const {
size_t i;
const size_t as = a.size();
const size_t bs = b.size();
for (i = 0; i < as && i < bs; ++i) {
if (a[i] < b[i]) {
return true;
}
if (a[i] > b[i]) {
return false;
}
}
// entries are equal, shortest wins
return as < bs;
}
};
typedef vector<int> Key;
typedef int Value;
typedef map<Key, Value, NgramComparator>::iterator iterator;
typedef map<Key, Value, NgramComparator>::const_iterator const_iterator;
NgramCounts() : kDefaultCount(1) { }
virtual ~NgramCounts() { }
// If the specified "ngram" is found, we add counts.
// If not, we insert the default count in the container.
void add(const Key& ngram) {
const_iterator it = find(ngram);
if (it != end()) {
m_counts[ngram] = it->second + 1;
} else {
m_counts[ngram] = kDefaultCount;
}
}
void clear() { m_counts.clear(); }
bool empty() const { return m_counts.empty(); }
size_t size() const { return m_counts.size(); }
size_t max_size() const { return m_counts.max_size(); }
iterator find(const Key& ngram) { return m_counts.find(ngram); }
const_iterator find(const Key& ngram) const { return m_counts.find(ngram); }
Value& operator[](const Key& ngram) { return m_counts[ngram]; }
iterator begin() { return m_counts.begin(); }
const_iterator begin() const { return m_counts.begin(); }
iterator end() { return m_counts.end(); }
const_iterator end() const { return m_counts.end(); }
private:
const int kDefaultCount;
map<Key, Value, NgramComparator> m_counts;
};
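The replacement NgramCounts class now lives in Ngram.h and is not shown in this diff. A minimal usage sketch, inferred only from the calls made below and in BleuScorerTest.cpp (Add, Lookup, operator[]); treat the exact behaviour of Add on an already-present key as an assumption:

  NgramCounts counts;
  NgramCounts::Key ngram;            // encoded token ids, e.g. from mert::Vocabulary
  ngram.push_back(7);
  ngram.push_back(12);
  counts.Add(ngram);                 // insert the bigram (or bump its count)

  NgramCounts::Value v = 0;
  if (counts.Lookup(ngram, &v)) {    // true when the key exists; fills v with its count
    counts[ngram] = v + 1;           // operator[] still allows direct assignment
  }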
BleuScorer::BleuScorer(const string& config)
: StatisticsBasedScorer("BLEU", config),
kLENGTH(4),
m_ref_length_type(CLOSEST) {
const string reflen = getConfig(KEY_REFLEN, REFLEN_CLOSEST);
if (reflen == REFLEN_AVERAGE) {
@ -101,9 +39,10 @@ BleuScorer::BleuScorer(const string& config)
BleuScorer::~BleuScorer() {}
size_t BleuScorer::countNgrams(const string& line, NgramCounts& counts,
size_t BleuScorer::CountNgrams(const string& line, NgramCounts& counts,
unsigned int n)
{
assert(n > 0);
vector<int> encoded_tokens;
TokenizeAndEncode(line, encoded_tokens);
for (size_t k = 1; k <= n; ++k) {
@ -116,7 +55,7 @@ size_t BleuScorer::countNgrams(const string& line, NgramCounts& counts,
for (size_t j = i; j < i+k && j < encoded_tokens.size(); ++j) {
ngram.push_back(encoded_tokens[j]);
}
counts.add(ngram);
counts.Add(ngram);
}
}
return encoded_tokens.size();
@ -124,10 +63,9 @@ size_t BleuScorer::countNgrams(const string& line, NgramCounts& counts,
void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles)
{
//make sure reference data is clear
m_ref_counts.reset();
m_ref_lengths.clear();
ClearEncoder();
// Make sure reference data is clear
m_references.reset();
mert::VocabularyFactory::GetVocabulary()->clear();
//load reference data
for (size_t i = 0; i < referenceFiles.size(); ++i) {
@ -139,33 +77,30 @@ void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles)
string line;
size_t sid = 0; //sentence counter
while (getline(refin,line)) {
line = this->applyFactors(line);
line = applyFactors(line);
if (i == 0) {
NgramCounts *counts = new NgramCounts; //these get leaked
m_ref_counts.push_back(counts);
vector<size_t> lengths;
m_ref_lengths.push_back(lengths);
Reference* ref = new Reference;
m_references.push_back(ref); // Take ownership of the Reference object.
}
if (m_ref_counts.size() <= sid) {
if (m_references.size() <= sid) {
throw runtime_error("File " + referenceFiles[i] + " has too many sentences");
}
NgramCounts counts;
size_t length = countNgrams(line, counts, kLENGTH);
size_t length = CountNgrams(line, counts, kBleuNgramOrder);
//for any counts larger than those already there, merge them in
for (NgramCounts::const_iterator ci = counts.begin(); ci != counts.end(); ++ci) {
NgramCounts::const_iterator oldcount_it = m_ref_counts[sid]->find(ci->first);
int oldcount = 0;
if (oldcount_it != m_ref_counts[sid]->end()) {
oldcount = oldcount_it->second;
}
int newcount = ci->second;
const NgramCounts::Key& ngram = ci->first;
const NgramCounts::Value newcount = ci->second;
NgramCounts::Value oldcount = 0;
m_references[sid]->get_counts()->Lookup(ngram, &oldcount);
if (newcount > oldcount) {
m_ref_counts[sid]->operator[](ci->first) = newcount;
m_references[sid]->get_counts()->operator[](ngram) = newcount;
}
}
//add in the length
m_ref_lengths[sid].push_back(length);
m_references[sid]->push_back(length);
if (sid > 0 && sid % 100 == 0) {
TRACE_ERR(".");
}
@ -177,44 +112,33 @@ void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles)
void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
{
if (sid >= m_ref_counts.size()) {
if (sid >= m_references.size()) {
stringstream msg;
msg << "Sentence id (" << sid << ") not found in reference set";
throw runtime_error(msg.str());
}
NgramCounts testcounts;
// stats for this line
vector<ScoreStatsType> stats(kLENGTH * 2);
string sentence = this->applyFactors(text);
const size_t length = countNgrams(sentence, testcounts, kLENGTH);
vector<ScoreStatsType> stats(kBleuNgramOrder * 2);
string sentence = applyFactors(text);
const size_t length = CountNgrams(sentence, testcounts, kBleuNgramOrder);
// Calculate effective reference length.
switch (m_ref_length_type) {
case SHORTEST:
CalcShortest(sid, stats);
break;
case AVERAGE:
CalcAverage(sid, stats);
break;
case CLOSEST:
CalcClosest(sid, length, stats);
break;
default:
throw runtime_error("Unsupported reflength strategy");
}
const int reference_len = CalcReferenceLength(sid, length);
stats.push_back(reference_len);
//precision on each ngram type
for (NgramCounts::const_iterator testcounts_it = testcounts.begin();
testcounts_it != testcounts.end(); ++testcounts_it) {
NgramCounts::const_iterator refcounts_it = m_ref_counts[sid]->find(testcounts_it->first);
int correct = 0;
const int guess = testcounts_it->second;
if (refcounts_it != m_ref_counts[sid]->end()) {
correct = min(refcounts_it->second,guess);
}
const NgramCounts::Value guess = testcounts_it->second;
const size_t len = testcounts_it->first.size();
stats[len*2-2] += correct;
stats[len*2-1] += guess;
NgramCounts::Value correct = 0;
NgramCounts::Value v = 0;
if (m_references[sid]->get_counts()->Lookup(testcounts_it->first, &v)) {
correct = min(v, guess);
}
stats[len * 2 - 2] += correct;
stats[len * 2 - 1] += guess;
}
entry.set(stats);
}
@ -222,23 +146,41 @@ void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
float BleuScorer::calculateScore(const vector<int>& comps) const
{
float logbleu = 0.0;
for (int i = 0; i < kLENGTH; ++i) {
for (int i = 0; i < kBleuNgramOrder; ++i) {
if (comps[2*i] == 0) {
return 0.0;
}
logbleu += log(comps[2*i]) - log(comps[2*i+1]);
}
logbleu /= kLENGTH;
const float brevity = 1.0 - static_cast<float>(comps[kLENGTH*2]) / comps[1];//reflength divided by test length
logbleu /= kBleuNgramOrder;
// reflength divided by test length
const float brevity = 1.0 - static_cast<float>(comps[kBleuNgramOrder * 2]) / comps[1];
if (brevity < 0.0) {
logbleu += brevity;
}
return exp(logbleu);
}
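As a worked illustration of the statistics layout consumed above (all numbers invented): prepareStats() stores the matched n-gram count in stats[2*i] and the hypothesis n-gram count in stats[2*i+1] for each order i, and appends the effective reference length at the end, so with kBleuNgramOrder = 4:

  comps   = { 5,8,  4,7,  3,6,  2,5,  9 }   // 1..4-gram correct/guess pairs, then ref length
  logbleu = ( log(5/8) + log(4/7) + log(3/6) + log(2/5) ) / 4  ~= -0.660
  brevity = 1 - 9/8 = -0.125                // hypothesis (8 tokens) shorter than reference (9)
  BLEU    = exp(-0.660 - 0.125)  ~= 0.456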
void BleuScorer::dump_counts(ostream* os,
const NgramCounts& counts) const {
int BleuScorer::CalcReferenceLength(size_t sentence_id, size_t length) {
switch (m_ref_length_type) {
case AVERAGE:
return m_references[sentence_id]->CalcAverage();
break;
case CLOSEST:
return m_references[sentence_id]->CalcClosest(length);
break;
case SHORTEST:
return m_references[sentence_id]->CalcShortest();
break;
default:
cerr << "unknown reference types." << endl;
exit(1);
}
}
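For example (lengths invented): with reference lengths {8, 10, 11} for a sentence and a hypothesis of 9 tokens, SHORTEST returns 8, AVERAGE is based on the mean 29/3 ~= 9.7 (rounded or truncated to the int returned here by Reference::CalcAverage, which this diff does not show), and CLOSEST returns 8, assuming Reference::CalcClosest keeps the old tie-breaking rule of preferring the shorter reference when two are equally close (|8-9| = |10-9| = 1).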
void BleuScorer::DumpCounts(ostream* os,
const NgramCounts& counts) const {
for (NgramCounts::const_iterator it = counts.begin();
it != counts.end(); ++it) {
*os << "(";
@ -254,44 +196,3 @@ void BleuScorer::dump_counts(ostream* os,
*os << endl;
}
void BleuScorer::CalcAverage(size_t sentence_id,
vector<ScoreStatsType>& stats) const {
int total = 0;
for (size_t i = 0;
i < m_ref_lengths[sentence_id].size(); ++i) {
total += m_ref_lengths[sentence_id][i];
}
const float mean = static_cast<float>(total) /
m_ref_lengths[sentence_id].size();
stats.push_back(static_cast<ScoreStatsType>(mean));
}
void BleuScorer::CalcClosest(size_t sentence_id,
size_t length,
vector<ScoreStatsType>& stats) const {
int min_diff = INT_MAX;
int min_idx = 0;
for (size_t i = 0; i < m_ref_lengths[sentence_id].size(); ++i) {
const int reflength = m_ref_lengths[sentence_id][i];
const int length_diff = abs(reflength - static_cast<int>(length));
// Look for the closest reference
if (length_diff < abs(min_diff)) {
min_diff = reflength - length;
min_idx = i;
// if two references have the same closest length, take the shortest
} else if (length_diff == abs(min_diff)) {
if (reflength < static_cast<int>(m_ref_lengths[sentence_id][min_idx])) {
min_idx = i;
}
}
}
stats.push_back(m_ref_lengths[sentence_id][min_idx]);
}
void BleuScorer::CalcShortest(size_t sentence_id,
vector<ScoreStatsType>& stats) const {
const int shortest = *min_element(m_ref_lengths[sentence_id].begin(),
m_ref_lengths[sentence_id].end());
stats.push_back(shortest);
}


@ -12,55 +12,50 @@
using namespace std;
const int kBleuNgramOrder = 4;
class NgramCounts;
class Reference;
/**
* Bleu scoring
*/
class BleuScorer: public StatisticsBasedScorer
{
public:
enum ReferenceLengthType {
AVERAGE,
CLOSEST,
SHORTEST
};
explicit BleuScorer(const string& config = "");
~BleuScorer();
virtual void setReferenceFiles(const vector<string>& referenceFiles);
virtual void prepareStats(size_t sid, const string& text, ScoreStats& entry);
virtual float calculateScore(const vector<int>& comps) const;
virtual size_t NumberOfScores() const { return 2 * kLENGTH + 1; }
virtual size_t NumberOfScores() const { return 2 * kBleuNgramOrder + 1; }
private:
enum ReferenceLengthType {
AVERAGE,
SHORTEST,
CLOSEST
};
int CalcReferenceLength(size_t sentence_id, size_t length);
/**
* A NgramCounts is a key-value store.
* Clients don't have to worry about the actual implementation
* since this type is used in internal only.
*/
class NgramCounts;
ReferenceLengthType GetReferenceLengthType() const { return m_ref_length_type; }
void SetReferenceLengthType(ReferenceLengthType type) { m_ref_length_type = type; }
const std::vector<Reference*>& GetReferences() const { return m_references.get(); }
/**
* Count the ngrams of each type, up to the given length in the input line.
*/
size_t countNgrams(const string& line, NgramCounts& counts, unsigned int n);
size_t CountNgrams(const string& line, NgramCounts& counts, unsigned int n);
void dump_counts(std::ostream* os, const NgramCounts& counts) const;
void DumpCounts(std::ostream* os, const NgramCounts& counts) const;
// For calculating effective reference length.
void CalcAverage(size_t sentence_id,
vector<ScoreStatsType>& stats) const;
void CalcClosest(size_t sentence_id, size_t length,
vector<ScoreStatsType>& stats) const;
void CalcShortest(size_t sentence_id,
vector<ScoreStatsType>& stats) const;
const int kLENGTH;
private:
ReferenceLengthType m_ref_length_type;
// data extracted from reference files
ScopedVector<NgramCounts> m_ref_counts;
vector<vector<size_t> > m_ref_lengths;
// reference translations.
ScopedVector<Reference> m_references;
// no copying allowed
BleuScorer(const BleuScorer&);

mert/BleuScorerTest.cpp (new file, 155 lines)

@ -0,0 +1,155 @@
#include "BleuScorer.h"
#define BOOST_TEST_MODULE MertBleuScorer
#include <boost/test/unit_test.hpp>
#include "Ngram.h"
#include "Vocabulary.h"
#include "Util.h"
namespace {
NgramCounts* g_counts = NULL;
NgramCounts* GetNgramCounts() {
assert(g_counts);
return g_counts;
}
void SetNgramCounts(NgramCounts* counts) {
g_counts = counts;
}
struct Unigram {
Unigram(const std::string& a) {
instance.push_back(mert::VocabularyFactory::GetVocabulary()->Encode(a));
}
NgramCounts::Key instance;
};
struct Bigram {
Bigram(const std::string& a, const std::string& b) {
instance.push_back(mert::VocabularyFactory::GetVocabulary()->Encode(a));
instance.push_back(mert::VocabularyFactory::GetVocabulary()->Encode(b));
}
NgramCounts::Key instance;
};
struct Trigram {
Trigram(const std::string& a, const std::string& b, const std::string& c) {
instance.push_back(mert::VocabularyFactory::GetVocabulary()->Encode(a));
instance.push_back(mert::VocabularyFactory::GetVocabulary()->Encode(b));
instance.push_back(mert::VocabularyFactory::GetVocabulary()->Encode(c));
}
NgramCounts::Key instance;
};
struct Fourgram {
Fourgram(const std::string& a, const std::string& b,
const std::string& c, const std::string& d) {
instance.push_back(mert::VocabularyFactory::GetVocabulary()->Encode(a));
instance.push_back(mert::VocabularyFactory::GetVocabulary()->Encode(b));
instance.push_back(mert::VocabularyFactory::GetVocabulary()->Encode(c));
instance.push_back(mert::VocabularyFactory::GetVocabulary()->Encode(d));
}
NgramCounts::Key instance;
};
bool CheckUnigram(const std::string& str) {
Unigram unigram(str);
NgramCounts::Value v;
return GetNgramCounts()->Lookup(unigram.instance, &v);
}
bool CheckBigram(const std::string& a, const std::string& b) {
Bigram bigram(a, b);
NgramCounts::Value v;
return GetNgramCounts()->Lookup(bigram.instance, &v);
}
bool CheckTrigram(const std::string& a, const std::string& b,
const std::string& c) {
Trigram trigram(a, b, c);
NgramCounts::Value v;
return GetNgramCounts()->Lookup(trigram.instance, &v);
}
bool CheckFourgram(const std::string& a, const std::string& b,
const std::string& c, const std::string& d) {
Fourgram fourgram(a, b, c, d);
NgramCounts::Value v;
return GetNgramCounts()->Lookup(fourgram.instance, &v);
}
} // namespace
BOOST_AUTO_TEST_CASE(bleu_reference_type) {
BleuScorer scorer;
// BleuScorer will use "closest" by default.
BOOST_CHECK_EQUAL(scorer.GetReferenceLengthType(), BleuScorer::CLOSEST);
scorer.SetReferenceLengthType(BleuScorer::AVERAGE);
BOOST_CHECK_EQUAL(scorer.GetReferenceLengthType(), BleuScorer::AVERAGE);
scorer.SetReferenceLengthType(BleuScorer::SHORTEST);
BOOST_CHECK_EQUAL(scorer.GetReferenceLengthType(), BleuScorer::SHORTEST);
}
BOOST_AUTO_TEST_CASE(bleu_count_ngrams) {
BleuScorer scorer;
std::string line = "I saw a girl with a telescope .";
// From the above string, we will get the following 25 distinct ngrams.
//
// unigram: "I", "saw", "a", "girl", "with", "telescope", "."
// bigram: "I saw", "saw a", "a girl", "girl with", "with a", "a telescope"
// "telescope ."
// trigram: "I saw a", "saw a girl", "a girl with", "girl with a",
// "with a telescope", "a telescope ."
// 4-gram: "I saw a girl", "saw a girl with", "a girl with a",
// "girl with a telescope", "with a telescope ."
NgramCounts counts;
BOOST_REQUIRE(scorer.CountNgrams(line, counts, kBleuNgramOrder) == 8);
BOOST_CHECK_EQUAL(25, counts.size());
mert::Vocabulary* vocab = scorer.GetVocab();
BOOST_CHECK_EQUAL(7, vocab->size());
std::vector<std::string> res;
Tokenize(line.c_str(), ' ', &res);
std::vector<int> ids(res.size());
for (std::size_t i = 0; i < res.size(); ++i) {
BOOST_CHECK(vocab->Lookup(res[i], &ids[i]));
}
SetNgramCounts(&counts);
// unigram
for (std::size_t i = 0; i < res.size(); ++i) {
BOOST_CHECK(CheckUnigram(res[i]));
}
// bigram
BOOST_CHECK(CheckBigram("I", "saw"));
BOOST_CHECK(CheckBigram("saw", "a"));
BOOST_CHECK(CheckBigram("a", "girl"));
BOOST_CHECK(CheckBigram("girl", "with"));
BOOST_CHECK(CheckBigram("with", "a"));
BOOST_CHECK(CheckBigram("a", "telescope"));
BOOST_CHECK(CheckBigram("telescope", "."));
// trigram
BOOST_CHECK(CheckTrigram("I", "saw", "a"));
BOOST_CHECK(CheckTrigram("saw", "a", "girl"));
BOOST_CHECK(CheckTrigram("a", "girl", "with"));
BOOST_CHECK(CheckTrigram("girl", "with", "a"));
BOOST_CHECK(CheckTrigram("with", "a", "telescope"));
BOOST_CHECK(CheckTrigram("a", "telescope", "."));
// 4-gram
BOOST_CHECK(CheckFourgram("I", "saw", "a", "girl"));
BOOST_CHECK(CheckFourgram("saw", "a", "girl", "with"));
BOOST_CHECK(CheckFourgram("a", "girl", "with", "a"));
BOOST_CHECK(CheckFourgram("girl", "with", "a", "telescope"));
BOOST_CHECK(CheckFourgram("with", "a", "telescope", "."));
}


@ -70,6 +70,7 @@ float CderScorer::calculateScore(const vector<int>& comps) const
if (comps.size() != 2) {
throw runtime_error("Size of stat vector for CDER is not 2");
}
if (comps[1] == 0) return 1.0f;
return 1.0f - (comps[0] / static_cast<float>(comps[1]));
}


@ -7,7 +7,6 @@
*/
#include <algorithm>
#include "util/check.hh"
#include <cmath>
#include <fstream>
@ -16,87 +15,82 @@
#include "Scorer.h"
#include "ScorerFactory.h"
#include "Util.h"
#include "util/check.hh"
Data::Data()
: theScorer(NULL),
number_of_scores(0),
_sparse_flag(false),
scoredata(),
featdata() {}
: m_scorer(NULL),
m_num_scores(0),
m_sparse_flag(false),
m_score_data(),
m_feature_data() {}
Data::Data(Scorer& ptr)
: theScorer(&ptr),
score_type(theScorer->getName()),
number_of_scores(0),
_sparse_flag(false),
scoredata(new ScoreData(*theScorer)),
featdata(new FeatureData)
Data::Data(Scorer* scorer)
: m_scorer(scorer),
m_score_type(m_scorer->getName()),
m_num_scores(0),
m_sparse_flag(false),
m_score_data(new ScoreData(m_scorer)),
m_feature_data(new FeatureData)
{
TRACE_ERR("Data::score_type " << score_type << std::endl);
TRACE_ERR("Data::Scorer type from Scorer: " << theScorer->getName() << endl);
TRACE_ERR("Data::m_score_type " << m_score_type << endl);
TRACE_ERR("Data::Scorer type from Scorer: " << m_scorer->getName() << endl);
}
//ADDED BY TS
void Data::remove_duplicates() {
// TODO: This is too long; consider creating additional functions to
// reduce the lines of this function.
void Data::removeDuplicates() {
size_t nSentences = m_feature_data->size();
assert(m_score_data->size() == nSentences);
size_t nSentences = featdata->size();
assert(scoredata->size() == nSentences);
for (size_t s=0; s < nSentences; s++) {
FeatureArray& feat_array = featdata->get(s);
ScoreArray& score_array = scoredata->get(s);
for (size_t s = 0; s < nSentences; s++) {
FeatureArray& feat_array = m_feature_data->get(s);
ScoreArray& score_array = m_score_data->get(s);
assert(feat_array.size() == score_array.size());
//serves as a hash-map:
std::map<double, std::vector<size_t> > lookup;
map<double, vector<size_t> > lookup;
size_t end_pos = feat_array.size() - 1;
size_t nRemoved = 0;
for (size_t k=0; k <= end_pos; k++) {
for (size_t k = 0; k <= end_pos; k++) {
const FeatureStats& cur_feats = feat_array.get(k);
double sum = 0.0;
for (size_t l=0; l < cur_feats.size(); l++)
sum += cur_feats.get(l);
for (size_t l = 0; l < cur_feats.size(); l++)
sum += cur_feats.get(l);
if (lookup.find(sum) != lookup.end()) {
//std::cerr << "hit" << std::endl;
//cerr << "hit" << endl;
vector<size_t>& cur_list = lookup[sum];
std::vector<size_t>& cur_list = lookup[sum];
// TODO: Make sure this is correct because we have already used 'l'.
// If this does not impact on the removing duplicates, it is better
// to change
size_t l = 0;
for (l = 0; l < cur_list.size(); l++) {
size_t j = cur_list[l];
size_t l=0;
for (l=0; l < cur_list.size(); l++) {
size_t j=cur_list[l];
if (cur_feats == feat_array.get(j)
&& score_array.get(k) == score_array.get(j)) {
if (k < end_pos) {
feat_array.swap(k,end_pos);
score_array.swap(k,end_pos);
k--;
}
end_pos--;
nRemoved++;
break;
}
}
if (l == lookup[sum].size())
cur_list.push_back(k);
if (cur_feats == feat_array.get(j)
&& score_array.get(k) == score_array.get(j)) {
if (k < end_pos) {
feat_array.swap(k,end_pos);
score_array.swap(k,end_pos);
k--;
}
end_pos--;
nRemoved++;
break;
}
}
if (l == lookup[sum].size())
cur_list.push_back(k);
} else {
lookup[sum].push_back(k);
}
else
lookup[sum].push_back(k);
// for (size_t j=0; j < k; j++) {
// if (feat_array.get(k) == feat_array.get(j)
@ -115,11 +109,9 @@ void Data::remove_duplicates() {
// break;
// }
// }
}
} // end for k
if (nRemoved > 0) {
feat_array.resize(end_pos+1);
score_array.resize(end_pos+1);
}
@ -127,124 +119,133 @@ void Data::remove_duplicates() {
}
//END_ADDED
void Data::load(const std::string &featfile, const std::string &scorefile) {
m_feature_data->load(featfile);
m_score_data->load(scorefile);
if (m_feature_data->hasSparseFeatures())
m_sparse_flag = true;
}
void Data::loadnbest(const std::string &file)
void Data::loadNBest(const string &file)
{
TRACE_ERR("loading nbest from " << file << std::endl);
FeatureStats featentry;
ScoreStats scoreentry;
std::string sentence_index;
TRACE_ERR("loading nbest from " << file << endl);
inputfilestream inp(file); // matches a stream with a file. Opens the file
if (!inp.good())
throw runtime_error("Unable to open: " + file);
std::string substring, subsubstring, stringBuf;
std::string theSentence;
std::string::size_type loc;
while (getline(inp,stringBuf,'\n')) {
if (stringBuf.empty()) continue;
// TRACE_ERR("stringBuf: " << stringBuf << std::endl);
getNextPound(stringBuf, substring, "|||"); //first field
sentence_index = substring;
getNextPound(stringBuf, substring, "|||"); //second field
theSentence = substring;
ScoreStats scoreentry;
string line, sentence_index, sentence, feature_str;
while (getline(inp, line, '\n')) {
if (line.empty()) continue;
// adding statistics for error measures
featentry.reset();
scoreentry.clear();
theScorer->prepareStats(sentence_index, theSentence, scoreentry);
getNextPound(line, sentence_index, "|||"); // first field
getNextPound(line, sentence, "|||"); // second field
getNextPound(line, feature_str, "|||"); // third field
scoredata->add(scoreentry, sentence_index);
getNextPound(stringBuf, substring, "|||"); //third field
m_scorer->prepareStats(sentence_index, sentence, scoreentry);
m_score_data->add(scoreentry, sentence_index);
// examine first line for name of features
if (!existsFeatureNames()) {
std::string stringsupport=substring;
std::string features="";
std::string tmpname="";
size_t tmpidx=0;
while (!stringsupport.empty()) {
// TRACE_ERR("Decompounding: " << substring << std::endl);
getNextPound(stringsupport, subsubstring);
// strings ending with ":" are skipped, because they are the names of the features
if ((loc = subsubstring.find_last_of(":")) != subsubstring.length()-1) {
features+=tmpname+"_"+stringify(tmpidx)+" ";
tmpidx++;
}
// ignore sparse feature name
else if (subsubstring.find("_") != string::npos) {
// also ignore its value
getNextPound(stringsupport, subsubstring);
}
// update current feature name
else {
tmpidx=0;
tmpname=subsubstring.substr(0,subsubstring.size() - 1);
}
}
featdata->setFeatureMap(features);
InitFeatureMap(feature_str);
}
// adding features
while (!substring.empty()) {
// TRACE_ERR("Decompounding: " << substring << std::endl);
getNextPound(substring, subsubstring);
// no ':' -> feature value that needs to be stored
if ((loc = subsubstring.find_last_of(":")) != subsubstring.length()-1) {
featentry.add(ConvertStringToFeatureStatsType(subsubstring));
}
// sparse feature name? store as well
else if (subsubstring.find("_") != string::npos) {
std::string name = subsubstring;
getNextPound(substring, subsubstring);
featentry.addSparse( name, atof(subsubstring.c_str()) );
_sparse_flag = true;
}
}
//cerr << "number of sparse features: " << featentry.getSparse().size() << endl;
featdata->add(featentry,sentence_index);
AddFeatures(feature_str, sentence_index);
}
inp.close();
}
void Data::save(const std::string &featfile, const std::string &scorefile, bool bin) {
if (bin)
cerr << "Binary write mode is selected" << endl;
else
cerr << "Binary write mode is NOT selected" << endl;
m_feature_data->save(featfile, bin);
m_score_data->save(scorefile, bin);
}
void Data::InitFeatureMap(const string& str) {
string buf = str;
string substr;
string features = "";
string tmp_name = "";
size_t tmp_index = 0;
string::size_type loc;
char tmp[64]; // for snprintf();
while (!buf.empty()) {
getNextPound(buf, substr);
// strings ending with ":" are skipped, because they are the names of the features
if ((loc = substr.find_last_of(":")) != substr.length()-1) {
snprintf(tmp, sizeof(tmp), "%s_%lu ", tmp_name.c_str(), tmp_index);
features.append(tmp);
tmp_index++;
} else if (substr.find("_") != string::npos) {
// ignore sparse feature name and its value
getNextPound(buf, substr);
} else { // update current feature name
tmp_index = 0;
tmp_name = substr.substr(0, substr.size() - 1);
}
}
m_feature_data->setFeatureMap(features);
}
void Data::AddFeatures(const string& str,
const string& sentence_index) {
string::size_type loc;
string buf = str;
string substr;
FeatureStats feature_entry;
feature_entry.reset();
while (!buf.empty()) {
getNextPound(buf, substr);
// no ':' -> feature value that needs to be stored
if ((loc = substr.find_last_of(":")) != substr.length()-1) {
feature_entry.add(ConvertStringToFeatureStatsType(substr));
} else if (substr.find("_") != string::npos) {
// sparse feature name? store as well
string name = substr;
getNextPound(buf, substr);
feature_entry.addSparse(name, atof(substr.c_str()));
m_sparse_flag = true;
}
}
m_feature_data->add(feature_entry, sentence_index);
}
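To make the parsing above concrete, here is a made-up n-best line in the field layout loadNBest() expects (sentence index, hypothesis, feature string, then the model score, separated by "|||"); the feature names and values are invented:

  0 ||| das ist ein haus ||| d: -4 0 lm: -59.3 w: -4 ||| -13.7

  InitFeatureMap("d: -4 0 lm: -59.3 w: -4")  ->  feature map "d_0 d_1 lm_0 w_0 "
  AddFeatures(same string, "0")              ->  dense values -4, 0, -59.3, -4 added
                                                 for sentence index "0"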
// TODO
void Data::mergeSparseFeatures() {
std::cerr << "ERROR: sparse features can only be trained with pairwise ranked optimizer (PRO), not traditional MERT\n";
cerr << "ERROR: sparse features can only be trained with pairwise ranked optimizer (PRO), not traditional MERT\n";
exit(1);
}
void Data::createShards(size_t shard_count, float shard_size, const string& scorerconfig,
std::vector<Data>& shards)
vector<Data>& shards)
{
CHECK(shard_count);
CHECK(shard_size >= 0);
CHECK(shard_size <= 1);
size_t data_size = scoredata->size();
CHECK(data_size == featdata->size());
size_t data_size = m_score_data->size();
CHECK(data_size == m_feature_data->size());
shard_size *= data_size;
const float coeff = static_cast<float>(data_size) / shard_count;
for (size_t shard_id = 0; shard_id < shard_count; ++shard_id) {
vector<size_t> shard_contents;
if (shard_size == 0) {
//split into roughly equal size shards
const size_t shard_start = floor(0.5 + shard_id * static_cast<float>(data_size) / shard_count);
const size_t shard_end = floor(0.5 + (shard_id + 1) * static_cast<float>(data_size) / shard_count);
const size_t shard_start = floor(0.5 + shard_id * coeff);
const size_t shard_end = floor(0.5 + (shard_id + 1) * coeff);
for (size_t i = shard_start; i < shard_end; ++i) {
shard_contents.push_back(i);
}
@ -255,15 +256,15 @@ void Data::createShards(size_t shard_count, float shard_size, const string& scor
}
}
Scorer* scorer = ScorerFactory::getScorer(score_type, scorerconfig);
Scorer* scorer = ScorerFactory::getScorer(m_score_type, scorerconfig);
shards.push_back(Data(*scorer));
shards.back().score_type = score_type;
shards.back().number_of_scores = number_of_scores;
shards.back()._sparse_flag = _sparse_flag;
shards.push_back(Data(scorer));
shards.back().m_score_type = m_score_type;
shards.back().m_num_scores = m_num_scores;
shards.back().m_sparse_flag = m_sparse_flag;
for (size_t i = 0; i < shard_contents.size(); ++i) {
shards.back().featdata->add(featdata->get(shard_contents[i]));
shards.back().scoredata->add(scoredata->get(shard_contents[i]));
shards.back().m_feature_data->add(m_feature_data->get(shard_contents[i]));
shards.back().m_score_data->add(m_score_data->get(shard_contents[i]));
}
//cerr << endl;
}


@ -11,11 +11,8 @@
using namespace std;
#include <limits>
#include <vector>
#include <iostream>
#include<boost/shared_ptr.hpp>
#include <boost/shared_ptr.hpp>
#include "Util.h"
#include "FeatureData.h"
@ -26,90 +23,70 @@ class Scorer;
typedef boost::shared_ptr<ScoreData> ScoreDataHandle;
typedef boost::shared_ptr<FeatureData> FeatureDataHandle;
// NOTE: there is no copy constructor implemented, so only the
// compiler synthesised shallow copy is available.
class Data
{
private:
Scorer* theScorer;
std::string score_type;
size_t number_of_scores;
bool _sparse_flag;
Scorer* m_scorer;
std::string m_score_type;
size_t m_num_scores;
bool m_sparse_flag;
ScoreDataHandle m_score_data;
FeatureDataHandle m_feature_data;
protected:
ScoreDataHandle scoredata;
FeatureDataHandle featdata;
// Helper functions for loadnbest();
void InitFeatureMap(const std::string& str);
void AddFeatures(const std::string& str,
const std::string& sentence_index);
public:
explicit Data(Scorer& sc);
explicit Data(Scorer* scorer);
Data();
//Note that there is no copy constructor implemented, so only the
//compiler synthesised shallow copy is available
inline void clear() {
scoredata->clear();
featdata->clear();
void clear() {
m_score_data->clear();
m_feature_data->clear();
}
ScoreDataHandle getScoreData() {
return scoredata;
ScoreDataHandle getScoreData() { return m_score_data; }
FeatureDataHandle getFeatureData() { return m_feature_data; }
Scorer* getScorer() { return m_scorer; }
size_t NumberOfFeatures() const {
return m_feature_data->NumberOfFeatures();
}
FeatureDataHandle getFeatureData() {
return featdata;
}
void NumberOfFeatures(size_t v) { m_feature_data->NumberOfFeatures(v); }
Scorer* getScorer() {
return theScorer;
}
std::string Features() const { return m_feature_data->Features(); }
void Features(const std::string &f) { m_feature_data->Features(f); }
inline size_t NumberOfFeatures() const {
return featdata->NumberOfFeatures();
}
inline void NumberOfFeatures(size_t v) {
featdata->NumberOfFeatures(v);
}
inline std::string Features() const {
return featdata->Features();
}
inline void Features(const std::string &f) {
featdata->Features(f);
}
inline bool hasSparseFeatures() const { return _sparse_flag; }
bool hasSparseFeatures() const { return m_sparse_flag; }
void mergeSparseFeatures();
void loadnbest(const std::string &file);
void load(const std::string &featfile,const std::string &scorefile) {
featdata->load(featfile);
scoredata->load(scorefile);
if (featdata->hasSparseFeatures())
_sparse_flag = true;
}
void loadNBest(const std::string &file);
void load(const std::string &featfile, const std::string &scorefile);
void save(const std::string &featfile, const std::string &scorefile, bool bin=false);
//ADDED BY TS
void remove_duplicates();
void removeDuplicates();
//END_ADDED
void save(const std::string &featfile,const std::string &scorefile, bool bin=false) {
if (bin) cerr << "Binary write mode is selected" << endl;
else cerr << "Binary write mode is NOT selected" << endl;
featdata->save(featfile, bin);
scoredata->save(scorefile, bin);
}
inline bool existsFeatureNames() const {
return featdata->existsFeatureNames();
return m_feature_data->existsFeatureNames();
}
inline std::string getFeatureName(size_t idx) const {
return featdata->getFeatureName(idx);
return m_feature_data->getFeatureName(idx);
}
inline size_t getFeatureIndex(const std::string& name) const {
return featdata->getFeatureIndex(name);
return m_feature_data->getFeatureIndex(name);
}
/**


@ -10,7 +10,7 @@
//very basic test of sharding
BOOST_AUTO_TEST_CASE(shard_basic) {
boost::scoped_ptr<Scorer> scorer(ScorerFactory::getScorer("BLEU", ""));
Data data(*scorer);
Data data(scorer.get());
FeatureArray fa1, fa2, fa3, fa4;
ScoreArray sa1, sa2, sa3, sa4;
fa1.setIndex("1");


@ -6,135 +6,147 @@
*
*/
#include <fstream>
#include "FeatureArray.h"
#include "FileStream.h"
#include "Util.h"
FeatureArray::FeatureArray()
: idx(""), number_of_features(0), _sparse_flag(false) {}
: m_index(""), m_num_features(0), m_sparse_flag(false) {}
FeatureArray::~FeatureArray() {}
void FeatureArray::savetxt(std::ofstream& outFile)
void FeatureArray::savetxt(ostream* os)
{
outFile << FEATURES_TXT_BEGIN << " " << idx << " " << array_.size()
<< " " << number_of_features << " " << features << std::endl;
for (featarray_t::iterator i = array_.begin(); i !=array_.end(); i++) {
i->savetxt(outFile);
outFile << std::endl;
*os << FEATURES_TXT_BEGIN << " " << m_index << " " << m_array.size()
<< " " << m_num_features << " " << m_features << endl;
for (featarray_t::iterator i = m_array.begin(); i != m_array.end(); ++i) {
i->savetxt(os);
*os << endl;
}
outFile << FEATURES_TXT_END << std::endl;
*os << FEATURES_TXT_END << endl;
}
void FeatureArray::savebin(std::ofstream& outFile)
void FeatureArray::savebin(ostream* os)
{
outFile << FEATURES_BIN_BEGIN << " " << idx << " " << array_.size()
<< " " << number_of_features << " " << features << std::endl;
for (featarray_t::iterator i = array_.begin(); i !=array_.end(); i++)
i->savebin(outFile);
*os << FEATURES_BIN_BEGIN << " " << m_index << " " << m_array.size()
<< " " << m_num_features << " " << m_features << endl;
for (featarray_t::iterator i = m_array.begin(); i != m_array.end(); ++i)
i->savebin(os);
outFile << FEATURES_BIN_END << std::endl;
*os << FEATURES_BIN_END << endl;
}
void FeatureArray::save(std::ofstream& inFile, bool bin)
void FeatureArray::save(ostream* os, bool bin)
{
if (size()>0)
(bin)?savebin(inFile):savetxt(inFile);
if (size() <= 0) return;
if (bin) {
savebin(os);
} else {
savetxt(os);
}
}
void FeatureArray::save(const std::string &file, bool bin)
void FeatureArray::save(const string &file, bool bin)
{
std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file
save(outFile);
outFile.close();
ofstream ofs(file.c_str(), ios::out);
if (!ofs) {
cerr << "Failed to open " << file << endl;
exit(1);
}
ostream *os = &ofs;
save(os, bin);
ofs.close();
}
void FeatureArray::loadbin(ifstream& inFile, size_t n)
void FeatureArray::save(bool bin)
{
FeatureStats entry(number_of_features);
save(&cout, bin);
}
for (size_t i=0 ; i < n; i++) {
entry.loadbin(inFile);
void FeatureArray::loadbin(istream* is, size_t n)
{
FeatureStats entry(m_num_features);
for (size_t i = 0 ; i < n; i++) {
entry.loadbin(is);
add(entry);
}
}
void FeatureArray::loadtxt(ifstream& inFile, size_t n)
void FeatureArray::loadtxt(istream* is, size_t n)
{
FeatureStats entry(number_of_features);
FeatureStats entry(m_num_features);
for (size_t i=0 ; i < n; i++) {
entry.loadtxt(inFile);
for (size_t i = 0; i < n; i++) {
entry.loadtxt(is);
add(entry);
if (entry.getSparse().size()>0)
_sparse_flag = true;
m_sparse_flag = true;
}
}
void FeatureArray::load(ifstream& inFile)
void FeatureArray::load(istream* is)
{
size_t number_of_entries=0;
bool binmode=false;
size_t number_of_entries = 0;
bool binmode = false;
std::string substring, stringBuf;
std::string::size_type loc;
string substring, stringBuf;
string::size_type loc;
std::getline(inFile, stringBuf);
if (!inFile.good()) {
getline(*is, stringBuf);
if (!is->good()) {
return;
}
if (!stringBuf.empty()) {
if ((loc = stringBuf.find(FEATURES_TXT_BEGIN)) == 0) {
binmode=false;
binmode = false;
} else if ((loc = stringBuf.find(FEATURES_BIN_BEGIN)) == 0) {
binmode=true;
binmode = true;
} else {
TRACE_ERR("ERROR: FeatureArray::load(): Wrong header");
return;
}
getNextPound(stringBuf, substring);
getNextPound(stringBuf, substring);
idx = substring;
m_index = substring;
getNextPound(stringBuf, substring);
number_of_entries = atoi(substring.c_str());
getNextPound(stringBuf, substring);
number_of_features = atoi(substring.c_str());
features = stringBuf;
m_num_features = atoi(substring.c_str());
m_features = stringBuf;
}
(binmode)?loadbin(inFile, number_of_entries):loadtxt(inFile, number_of_entries);
if (binmode) {
loadbin(is, number_of_entries);
} else {
loadtxt(is, number_of_entries);
}
std::getline(inFile, stringBuf);
getline(*is, stringBuf);
if (!stringBuf.empty()) {
if ((loc = stringBuf.find(FEATURES_TXT_END)) != 0 && (loc = stringBuf.find(FEATURES_BIN_END)) != 0) {
if ((loc = stringBuf.find(FEATURES_TXT_END)) != 0 &&
(loc = stringBuf.find(FEATURES_BIN_END)) != 0) {
TRACE_ERR("ERROR: FeatureArray::load(): Wrong footer");
return;
}
}
}
void FeatureArray::load(const std::string &file)
void FeatureArray::load(const string &file)
{
TRACE_ERR("loading data from " << file << std::endl);
inputfilestream inFile(file); // matches a stream with a file. Opens the file
load((ifstream&) inFile);
inFile.close();
TRACE_ERR("loading data from " << file << endl);
inputfilestream input_stream(file); // matches a stream with a file. Opens the file
istream* is = &input_stream;
load(is);
input_stream.close();
}
void FeatureArray::merge(FeatureArray& e)
{
//dummy implementation
for (size_t i=0; i<e.size(); i++)
for (size_t i = 0; i < e.size(); i++)
add(e.get(i));
}
@ -144,10 +156,9 @@ bool FeatureArray::check_consistency() const
if (sz == 0)
return true;
for (featarray_t::const_iterator i = array_.begin(); i != array_.end(); i++) {
for (featarray_t::const_iterator i = m_array.begin(); i != m_array.end(); i++) {
if (i->size() != sz)
return false;
}
return true;
}
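For orientation, the text form written by savetxt() above and read back by load() is framed like this (BEGIN and END stand for the FEATURES_TXT_BEGIN / FEATURES_TXT_END marker strings defined elsewhere; the header fields follow the order written above):

  FEATURES_TXT_BEGIN <index> <number-of-entries> <number-of-features> <feature-names>
  <FeatureStats entry 1>
  ...
  <FeatureStats entry N>
  FEATURES_TXT_END

The binary form uses the same header and footer layout with the _BIN_ markers, but each entry is written by FeatureStats::savebin() instead.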


@ -11,7 +11,6 @@
#include <vector>
#include <iostream>
#include <fstream>
#include "FeatureStats.h"
using namespace std;
@ -26,82 +25,57 @@ class FeatureArray
private:
// idx to identify the utterance. It can differ from
// the index inside the vector.
std::string idx;
protected:
featarray_t array_;
size_t number_of_features;
std::string features;
bool _sparse_flag;
std::string m_index;
featarray_t m_array;
size_t m_num_features;
std::string m_features;
bool m_sparse_flag;
public:
FeatureArray();
~FeatureArray();
inline void clear() {
array_.clear();
}
void clear() { m_array.clear(); }
inline bool hasSparseFeatures() const {
return _sparse_flag;
}
bool hasSparseFeatures() const { return m_sparse_flag; }
inline std::string getIndex() const {
return idx;
}
inline void setIndex(const std::string& value) {
idx = value;
}
std::string getIndex() const { return m_index; }
void setIndex(const std::string& value) { m_index = value; }
inline FeatureStats& get(size_t i) {
return array_.at(i);
}
inline const FeatureStats& get(size_t i)const {
return array_.at(i);
}
void add(FeatureStats& e) {
array_.push_back(e);
}
FeatureStats& get(size_t i) { return m_array.at(i); }
const FeatureStats& get(size_t i) const { return m_array.at(i); }
void add(FeatureStats& e) { m_array.push_back(e); }
//ADDED BY TS
void swap(size_t i, size_t j) {
std::swap(array_[i],array_[j]);
std::swap(m_array[i], m_array[j]);
}
void resize(size_t new_size) {
array_.resize(std::min(new_size,array_.size()));
m_array.resize(std::min(new_size, m_array.size()));
}
//END_ADDED
void merge(FeatureArray& e);
inline size_t size() const {
return array_.size();
}
inline size_t NumberOfFeatures() const {
return number_of_features;
}
inline void NumberOfFeatures(size_t v) {
number_of_features = v;
}
inline std::string Features() const {
return features;
}
inline void Features(const std::string& f) {
features = f;
}
size_t size() const { return m_array.size(); }
void savetxt(ofstream& outFile);
void savebin(ofstream& outFile);
void save(ofstream& outFile, bool bin=false);
size_t NumberOfFeatures() const { return m_num_features; }
void NumberOfFeatures(size_t v) { m_num_features = v; }
std::string Features() const { return m_features; }
void Features(const std::string& f) { m_features = f; }
void savetxt(std::ostream* os);
void savebin(std::ostream* os);
void save(std::ostream* os, bool bin=false);
void save(const std::string &file, bool bin=false);
inline void save(bool bin=false) {
save("/dev/stdout",bin);
}
void save(bool bin=false);
void loadtxt(ifstream& inFile, size_t n);
void loadbin(ifstream& inFile, size_t n);
void load(ifstream& inFile);
void loadtxt(std::istream* is, size_t n);
void loadbin(std::istream* is, size_t n);
void load(std::istream* is);
void load(const std::string &file);
bool check_consistency() const;


@ -13,44 +13,45 @@
#include "Util.h"
#include <cstdio>
static const float MIN_FLOAT=-1.0*numeric_limits<float>::max();
static const float MAX_FLOAT=numeric_limits<float>::max();
static const float MIN_FLOAT = -1.0 * numeric_limits<float>::max();
static const float MAX_FLOAT = numeric_limits<float>::max();
FeatureData::FeatureData()
: number_of_features(0),
_sparse_flag(false) {}
: m_num_features(0),
m_sparse_flag(false) {}
void FeatureData::save(std::ofstream& outFile, bool bin)
void FeatureData::save(ostream* os, bool bin)
{
for (featdata_t::iterator i = array_.begin(); i !=array_.end(); i++)
i->save(outFile, bin);
for (featdata_t::iterator i = m_array.begin(); i != m_array.end(); i++)
i->save(os, bin);
}
void FeatureData::save(const std::string &file, bool bin)
void FeatureData::save(const string &file, bool bin)
{
if (file.empty()) return;
TRACE_ERR("saving the array into " << file << std::endl);
std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file
save(outFile, bin);
outFile.close();
TRACE_ERR("saving the array into " << file << endl);
ofstream ofs(file.c_str(), ios::out); // matches a stream with a file. Opens the file
ostream* os = &ofs;
save(os, bin);
ofs.close();
}
void FeatureData::load(ifstream& inFile)
void FeatureData::save(bool bin) {
save(&cout, bin);
}
void FeatureData::load(istream* is)
{
FeatureArray entry;
while (!inFile.eof()) {
while (!is->eof()) {
if (!inFile.good()) {
std::cerr << "ERROR FeatureData::load inFile.good()" << std::endl;
if (!is->good()) {
cerr << "ERROR FeatureData::load inFile.good()" << endl;
}
entry.clear();
entry.load(inFile);
entry.load(is);
if (entry.size() == 0)
break;
@ -59,26 +60,23 @@ void FeatureData::load(ifstream& inFile)
setFeatureMap(entry.Features());
if (entry.hasSparseFeatures())
_sparse_flag = true;
m_sparse_flag = true;
add(entry);
}
}
void FeatureData::load(const std::string &file)
void FeatureData::load(const string &file)
{
TRACE_ERR("loading feature data from " << file << std::endl);
inputfilestream inFile(file); // matches a stream with a file. Opens the file
if (!inFile) {
TRACE_ERR("loading feature data from " << file << endl);
inputfilestream input_stream(file); // matches a stream with a file. Opens the file
if (!input_stream) {
throw runtime_error("Unable to open feature file: " + file);
}
load((ifstream&) inFile);
inFile.close();
istream* is = &input_stream;
load(is);
input_stream.close();
}
void FeatureData::add(FeatureArray& e)
@ -86,25 +84,25 @@ void FeatureData::add(FeatureArray& e)
if (exists(e.getIndex())) { // array at position e.getIndex() already exists
//enlarge array at position e.getIndex()
size_t pos = getIndex(e.getIndex());
array_.at(pos).merge(e);
m_array.at(pos).merge(e);
} else {
array_.push_back(e);
m_array.push_back(e);
setIndex();
}
}
void FeatureData::add(FeatureStats& e, const std::string& sent_idx)
void FeatureData::add(FeatureStats& e, const string& sent_idx)
{
if (exists(sent_idx)) { // array at position e.getIndex() already exists
//enlarge array at position e.getIndex()
size_t pos = getIndex(sent_idx);
// TRACE_ERR("Inserting " << e << " in array " << sent_idx << std::endl);
array_.at(pos).add(e);
m_array.at(pos).add(e);
} else {
// TRACE_ERR("Creating a new entry in the array and inserting " << e << std::endl);
FeatureArray a;
a.NumberOfFeatures(number_of_features);
a.Features(features);
a.NumberOfFeatures(m_num_features);
a.Features(m_features);
a.setIndex(sent_idx);
a.add(e);
add(a);
@ -113,10 +111,10 @@ void FeatureData::add(FeatureStats& e, const std::string& sent_idx)
bool FeatureData::check_consistency() const
{
if (array_.size() == 0)
if (m_array.size() == 0)
return true;
for (featdata_t::const_iterator i = array_.begin(); i != array_.end(); i++)
for (featdata_t::const_iterator i = m_array.begin(); i != m_array.end(); i++)
if (!i->check_consistency()) return false;
return true;
@ -125,26 +123,26 @@ bool FeatureData::check_consistency() const
void FeatureData::setIndex()
{
size_t j=0;
for (featdata_t::iterator i = array_.begin(); i !=array_.end(); i++) {
idx2arrayname_[j]=(*i).getIndex();
arrayname2idx_[(*i).getIndex()] = j;
for (featdata_t::iterator i = m_array.begin(); i !=m_array.end(); i++) {
m_index_to_array_name[j]=(*i).getIndex();
m_array_name_to_index[(*i).getIndex()] = j;
j++;
}
}
void FeatureData::setFeatureMap(const std::string& feat)
void FeatureData::setFeatureMap(const string& feat)
{
number_of_features = 0;
features = feat;
m_num_features = 0;
m_features = feat;
std::string substring, stringBuf;
stringBuf = features;
while (!stringBuf.empty()) {
getNextPound(stringBuf, substring);
featname2idx_[substring] = idx2featname_.size();
idx2featname_[idx2featname_.size()] = substring;
number_of_features++;
vector<string> buf;
Tokenize(feat.c_str(), ' ', &buf);
for (vector<string>::const_iterator it = buf.begin();
it != buf.end(); ++it) {
const size_t size = m_index_to_feature_name.size();
m_feature_name_to_index[*it] = size;
m_index_to_feature_name[size] = *it;
++m_num_features;
}
}
@ -152,26 +150,23 @@ string FeatureData::ToString() const {
string res;
char buf[100];
snprintf(buf, sizeof(buf), "number of features: %lu, ", number_of_features);
snprintf(buf, sizeof(buf), "number of features: %lu, ", m_num_features);
res.append(buf);
snprintf(buf, sizeof(buf), "features: ");
res.append(buf);
res.append(features);
res.append("features: ");
res.append(m_features);
snprintf(buf, sizeof(buf), ", sparse flag: %s, ", (_sparse_flag) ? "yes" : "no");
snprintf(buf, sizeof(buf), ", sparse flag: %s, ", (m_sparse_flag) ? "yes" : "no");
res.append(buf);
snprintf(buf, sizeof(buf), "feature_id_map = { ");
res.append(buf);
for (map<string, size_t>::const_iterator it = featname2idx_.begin();
it != featname2idx_.end(); ++it) {
res.append("feature_id_map = { ");
for (map<string, size_t>::const_iterator it = m_feature_name_to_index.begin();
it != m_feature_name_to_index.end(); ++it) {
snprintf(buf, sizeof(buf), "%s => %lu, ",
it->first.c_str(), it->second);
res.append(buf);
}
snprintf(buf, sizeof(buf), "}");
res.append(buf);
res.append("}");
return res;
}


@ -19,109 +19,92 @@ using namespace std;
class FeatureData
{
private:
size_t number_of_features;
std::string features;
bool _sparse_flag;
map<std::string, size_t> featname2idx_; // map from name to index of features
map<size_t, std::string> idx2featname_; // map from index to name of features
protected:
featdata_t array_;
idx2name idx2arrayname_; // map from index to name of array
name2idx arrayname2idx_; // map from name to index of array
size_t m_num_features;
std::string m_features;
bool m_sparse_flag;
map<std::string, size_t> m_feature_name_to_index; // map from name to index of features
map<size_t, std::string> m_index_to_feature_name; // map from index to name of features
featdata_t m_array;
idx2name m_index_to_array_name; // map from index to name of array
name2idx m_array_name_to_index; // map from name to index of array
public:
FeatureData();
~FeatureData() {}
inline void clear() {
array_.clear();
void clear() { m_array.clear(); }
bool hasSparseFeatures() const { return m_sparse_flag; }
FeatureArray get(const std::string& idx) {
return m_array.at(getIndex(idx));
}
inline bool hasSparseFeatures() const {
return _sparse_flag;
}
inline FeatureArray get(const std::string& idx) {
return array_.at(getIndex(idx));
}
inline FeatureArray& get(size_t idx) {
return array_.at(idx);
}
inline const FeatureArray& get(size_t idx) const {
return array_.at(idx);
}
FeatureArray& get(size_t idx) { return m_array.at(idx); }
const FeatureArray& get(size_t idx) const { return m_array.at(idx); }
inline bool exists(const std::string& sent_idx) const {
return exists(getIndex(sent_idx));
}
inline bool exists(int sent_idx) const {
return (sent_idx > -1 && sent_idx < static_cast<int>(array_.size())) ? true : false;
return (sent_idx > -1 && sent_idx < static_cast<int>(m_array.size())) ? true : false;
}
inline FeatureStats& get(size_t i, size_t j) {
return array_.at(i).get(j);
return m_array.at(i).get(j);
}
inline const FeatureStats& get(size_t i, size_t j) const {
return array_.at(i).get(j);
inline const FeatureStats& get(size_t i, size_t j) const {
return m_array.at(i).get(j);
}
void add(FeatureArray& e);
void add(FeatureStats& e, const std::string& sent_idx);
inline size_t size() const {
return array_.size();
}
inline size_t NumberOfFeatures() const {
return number_of_features;
}
inline void NumberOfFeatures(size_t v) {
number_of_features = v;
}
inline std::string Features() const {
return features;
}
inline void Features(const std::string& f) {
features = f;
}
size_t size() const { return m_array.size(); }
size_t NumberOfFeatures() const { return m_num_features; }
void NumberOfFeatures(size_t v) { m_num_features = v; }
std::string Features() const { return m_features; }
void Features(const std::string& f) { m_features = f; }
void save(const std::string &file, bool bin=false);
void save(ofstream& outFile, bool bin=false);
inline void save(bool bin=false) {
save("/dev/stdout", bin);
}
void save(std::ostream* os, bool bin=false);
void save(bool bin=false);
void load(ifstream& inFile);
void load(std::istream* is);
void load(const std::string &file);
bool check_consistency() const;
void setIndex();
inline int getIndex(const std::string& idx) const {
name2idx::const_iterator i = arrayname2idx_.find(idx);
if (i != arrayname2idx_.end())
name2idx::const_iterator i = m_array_name_to_index.find(idx);
if (i != m_array_name_to_index.end())
return i->second;
else
return -1;
}
inline std::string getIndex(size_t idx) const {
idx2name::const_iterator i = idx2arrayname_.find(idx);
if (i != idx2arrayname_.end())
idx2name::const_iterator i = m_index_to_array_name.find(idx);
if (i != m_index_to_array_name.end())
throw runtime_error("there is no entry at index " + idx);
return i->second;
}
bool existsFeatureNames() const {
return (idx2featname_.size() > 0) ? true : false;
return (m_index_to_feature_name.size() > 0) ? true : false;
}
std::string getFeatureName(size_t idx) const {
if (idx >= idx2featname_.size())
if (idx >= m_index_to_feature_name.size())
throw runtime_error("Error: you requested a too-large index");
map<size_t, std::string>::const_iterator it = idx2featname_.find(idx);
if (it == idx2featname_.end()) {
map<size_t, std::string>::const_iterator it = m_index_to_feature_name.find(idx);
if (it == m_index_to_feature_name.end()) {
throw runtime_error("Error: specified id is unknown: " + idx);
} else {
return it->second;
@ -129,8 +112,8 @@ public:
}
size_t getFeatureIndex(const std::string& name) const {
map<std::string, size_t>::const_iterator it = featname2idx_.find(name);
if (it == featname2idx_.end())
map<std::string, size_t>::const_iterator it = m_feature_name_to_index.find(name);
if (it == m_feature_name_to_index.end())
throw runtime_error("Error: feature " + name + " is unknown");
return it->second;
}
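
For illustration only (not part of this commit), a minimal sketch of how the renamed feature name/index maps are typically exercised; the feature-name string below is made up:

    #include <iostream>
    #include "FeatureData.h"

    int main() {
      FeatureData feature_data;
      // Hypothetical feature-name string in the usual "name_index " format.
      std::string names("lm_0 tm_0 tm_1 w_0 ");
      feature_data.setFeatureMap(names);
      std::cout << feature_data.NumberOfFeatures() << std::endl;      // 4
      std::cout << feature_data.getFeatureIndex("tm_1") << std::endl; // 2
      std::cout << feature_data.getFeatureName(3) << std::endl;       // w_0
      return 0;
    }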

mert/FeatureDataTest.cpp (new file, 39 lines)
View File

@ -0,0 +1,39 @@
#include "FeatureData.h"
#define BOOST_TEST_MODULE FeatureData
#include <boost/test/unit_test.hpp>
#include <cstdio>
namespace {
void CheckFeatureMap(const FeatureData* feature_data,
const char* str, int num_feature, int* cnt) {
char tmp[32];
for (int i = 0; i < num_feature; ++i) {
std::snprintf(tmp, sizeof(tmp), "%s_%d", str, i);
BOOST_CHECK_EQUAL(feature_data->getFeatureIndex(tmp), *cnt);
BOOST_CHECK_EQUAL(feature_data->getFeatureName(*cnt).c_str(), tmp);
++(*cnt);
}
}
} // namespace
BOOST_AUTO_TEST_CASE(set_feature_map) {
std::string str("d_0 d_1 d_2 d_3 d_4 d_5 d_6 lm_0 lm_1 tm_0 tm_1 tm_2 tm_3 tm_4 w_0 ");
FeatureData feature_data;
feature_data.setFeatureMap(str);
BOOST_REQUIRE(feature_data.Features() == str);
BOOST_REQUIRE(feature_data.NumberOfFeatures() == 15);
int cnt = 0;
CheckFeatureMap(&feature_data, "d", 7, &cnt);
CheckFeatureMap(&feature_data, "lm", 2, &cnt);
CheckFeatureMap(&feature_data, "tm", 5, &cnt);
BOOST_CHECK_EQUAL(feature_data.getFeatureIndex("w_0"), cnt);
BOOST_CHECK_EQUAL(feature_data.getFeatureName(cnt).c_str(), "w_0");
}

View File

@ -8,6 +8,7 @@
#include "FeatureStats.h"
#include <fstream>
#include <cmath>
#include "Util.h"
@ -15,58 +16,58 @@ namespace {
const int kAvailableSize = 8;
} // namespace
SparseVector::name2id_t SparseVector::name2id_;
SparseVector::id2name_t SparseVector::id2name_;
SparseVector::name2id_t SparseVector::m_name_to_id;
SparseVector::id2name_t SparseVector::m_id_to_name;
FeatureStatsType SparseVector::get(const string& name) const {
name2id_t::const_iterator name2id_iter = name2id_.find(name);
if (name2id_iter == name2id_.end()) return 0;
name2id_t::const_iterator name2id_iter = m_name_to_id.find(name);
if (name2id_iter == m_name_to_id.end()) return 0;
size_t id = name2id_iter->second;
return get(id);
}
FeatureStatsType SparseVector::get(size_t id) const {
fvector_t::const_iterator fvector_iter = fvector_.find(id);
if (fvector_iter == fvector_.end()) return 0;
fvector_t::const_iterator fvector_iter = m_fvector.find(id);
if (fvector_iter == m_fvector.end()) return 0;
return fvector_iter->second;
}
void SparseVector::set(const string& name, FeatureStatsType value) {
name2id_t::const_iterator name2id_iter = name2id_.find(name);
name2id_t::const_iterator name2id_iter = m_name_to_id.find(name);
size_t id = 0;
if (name2id_iter == name2id_.end()) {
id = id2name_.size();
id2name_.push_back(name);
name2id_[name] = id;
if (name2id_iter == m_name_to_id.end()) {
id = m_id_to_name.size();
m_id_to_name.push_back(name);
m_name_to_id[name] = id;
} else {
id = name2id_iter->second;
}
fvector_[id] = value;
m_fvector[id] = value;
}
void SparseVector::write(ostream& out, const string& sep) const {
for (fvector_t::const_iterator i = fvector_.begin(); i != fvector_.end(); ++i) {
for (fvector_t::const_iterator i = m_fvector.begin(); i != m_fvector.end(); ++i) {
if (abs(i->second) < 0.00001) continue;
string name = id2name_[i->first];
string name = m_id_to_name[i->first];
out << name << sep << i->second << " ";
}
}
void SparseVector::clear() {
fvector_.clear();
m_fvector.clear();
}
SparseVector& SparseVector::operator-=(const SparseVector& rhs) {
//All the elements that have values in *this
for (fvector_t::iterator i = fvector_.begin(); i != fvector_.end(); ++i) {
fvector_[i->first] = i->second - rhs.get(i->first);
for (fvector_t::iterator i = m_fvector.begin(); i != m_fvector.end(); ++i) {
m_fvector[i->first] = i->second - rhs.get(i->first);
}
//Any elements in rhs, that have no value in *this
for (fvector_t::const_iterator i = rhs.fvector_.begin();
i != rhs.fvector_.end(); ++i) {
if (fvector_.find(i->first) == fvector_.end()) {
fvector_[i->first] = -(i->second);
for (fvector_t::const_iterator i = rhs.m_fvector.begin();
i != rhs.m_fvector.end(); ++i) {
if (m_fvector.find(i->first) == m_fvector.end()) {
m_fvector[i->first] = -(i->second);
}
}
return *this;
@ -79,37 +80,37 @@ SparseVector operator-(const SparseVector& lhs, const SparseVector& rhs) {
}
FeatureStats::FeatureStats()
: available_(kAvailableSize), entries_(0),
array_(new FeatureStatsType[available_]) {}
: m_available_size(kAvailableSize), m_entries(0),
m_array(new FeatureStatsType[m_available_size]) {}
FeatureStats::FeatureStats(const size_t size)
: available_(size), entries_(size),
array_(new FeatureStatsType[available_])
: m_available_size(size), m_entries(size),
m_array(new FeatureStatsType[m_available_size])
{
memset(array_, 0, GetArraySizeWithBytes());
memset(m_array, 0, GetArraySizeWithBytes());
}
FeatureStats::FeatureStats(std::string &theString)
: available_(0), entries_(0), array_(NULL)
FeatureStats::FeatureStats(string &theString)
: m_available_size(0), m_entries(0), m_array(NULL)
{
set(theString);
}
FeatureStats::~FeatureStats()
{
if (array_) {
delete [] array_;
array_ = NULL;
if (m_array) {
delete [] m_array;
m_array = NULL;
}
}
void FeatureStats::Copy(const FeatureStats &stats)
{
available_ = stats.available();
entries_ = stats.size();
array_ = new FeatureStatsType[available_];
memcpy(array_, stats.getArray(), GetArraySizeWithBytes());
map_ = stats.getSparse();
m_available_size = stats.available();
m_entries = stats.size();
m_array = new FeatureStatsType[m_available_size];
memcpy(m_array, stats.getArray(), GetArraySizeWithBytes());
m_map = stats.getSparse();
}
FeatureStats::FeatureStats(const FeatureStats &stats)
@ -119,34 +120,34 @@ FeatureStats::FeatureStats(const FeatureStats &stats)
FeatureStats& FeatureStats::operator=(const FeatureStats &stats)
{
delete [] array_;
delete [] m_array;
Copy(stats);
return *this;
}
void FeatureStats::expand()
{
available_ *= 2;
featstats_t t_ = new FeatureStatsType[available_];
memcpy(t_, array_, GetArraySizeWithBytes());
delete [] array_;
array_ = t_;
m_available_size *= 2;
featstats_t t_ = new FeatureStatsType[m_available_size];
memcpy(t_, m_array, GetArraySizeWithBytes());
delete [] m_array;
m_array = t_;
}
void FeatureStats::add(FeatureStatsType v)
{
if (isfull()) expand();
array_[entries_++]=v;
m_array[m_entries++]=v;
}
void FeatureStats::addSparse(const string& name, FeatureStatsType v)
{
map_.set(name,v);
m_map.set(name,v);
}
void FeatureStats::set(std::string &theString)
void FeatureStats::set(string &theString)
{
std::string substring, stringBuf;
string substring, stringBuf;
reset();
while (!theString.empty()) {
@ -163,48 +164,50 @@ void FeatureStats::set(std::string &theString)
}
}
void FeatureStats::loadbin(std::ifstream& inFile)
void FeatureStats::loadbin(istream* is)
{
inFile.read((char*) array_, GetArraySizeWithBytes());
is->read(reinterpret_cast<char*>(m_array),
static_cast<streamsize>(GetArraySizeWithBytes()));
}
void FeatureStats::loadtxt(std::ifstream& inFile)
void FeatureStats::loadtxt(istream* is)
{
std::string theString;
std::getline(inFile, theString);
set(theString);
string line;
getline(*is, line);
set(line);
}
void FeatureStats::loadtxt(const std::string &file)
void FeatureStats::loadtxt(const string &file)
{
// TRACE_ERR("loading the stats from " << file << std::endl);
std::ifstream inFile(file.c_str(), std::ios::in); // matches a stream with a file. Opens the file
loadtxt(inFile);
ifstream ifs(file.c_str(), ios::in);
if (!ifs) {
cerr << "Failed to open " << file << endl;
exit(1);
}
istream* is = &ifs;
loadtxt(is);
}
void FeatureStats::savetxt(const std::string &file)
void FeatureStats::savetxt(const string &file)
{
// TRACE_ERR("saving the stats into " << file << std::endl);
std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file
savetxt(outFile);
ofstream ofs(file.c_str(), ios::out);
ostream* os = &ofs;
savetxt(os);
}
void FeatureStats::savetxt(std::ofstream& outFile)
void FeatureStats::savetxt(ostream* os)
{
// TRACE_ERR("saving the stats" << std::endl);
outFile << *this;
*os << *this;
}
void FeatureStats::savebin(std::ofstream& outFile)
void FeatureStats::savetxt() {
savetxt(&cout);
}
void FeatureStats::savebin(ostream* os)
{
outFile.write((char*) array_, GetArraySizeWithBytes());
os->write(reinterpret_cast<char*>(m_array),
static_cast<streamsize>(GetArraySizeWithBytes()));
}
ostream& operator<<(ostream& o, const FeatureStats& e)
@ -230,7 +233,7 @@ bool operator==(const FeatureStats& f1, const FeatureStats& f2) {
if (f1.get(k) != f2.get(k))
return false;
}
return true;
}
//END_ADDED
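
The load/save entry points above now take std::istream* / std::ostream* instead of fstream references. A usage sketch (illustrative only; the file names are made up):

    #include <fstream>
    #include <iostream>
    #include "FeatureStats.h"

    int main() {
      FeatureStats stats;
      stats.loadtxt("features.dat");          // hypothetical text stats file
      stats.savetxt(&std::cout);              // text dump to stdout (replaces "/dev/stdout")
      std::ofstream ofs("features.bin", std::ios::binary);
      stats.savebin(&ofs);                    // raw dump of the dense score array
      return 0;
    }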

View File

@ -10,7 +10,6 @@
#define MERT_FEATURE_STATS_H_
#include <cstring>
#include <fstream>
#include <iostream>
#include <map>
#include <string>
@ -30,18 +29,16 @@ public:
FeatureStatsType get(size_t id) const;
void set(const std::string& name, FeatureStatsType value);
void clear();
size_t size() const {
return fvector_.size();
}
size_t size() const { return m_fvector.size(); }
void write(std::ostream& out, const std::string& sep = " ") const;
SparseVector& operator-=(const SparseVector& rhs);
private:
static name2id_t name2id_;
static id2name_t id2name_;
fvector_t fvector_;
static name2id_t m_name_to_id;
static id2name_t m_id_to_name;
fvector_t m_fvector;
};
SparseVector operator-(const SparseVector& lhs, const SparseVector& rhs);
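
A small sketch (illustrative only; the feature names are made up) of the renamed SparseVector members in use:

    #include <iostream>
    #include "FeatureStats.h"

    int main() {
      SparseVector v, w;
      v.set("lm_oov", 2.0f);          // hypothetical sparse feature names
      v.set("phrase_penalty", 1.0f);
      w.set("lm_oov", 0.5f);
      v -= w;                          // per-feature subtraction; missing entries count as 0
      v.write(std::cout);              // prints "name value" pairs, skipping near-zero entries
      std::cout << std::endl;
      return 0;
    }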
@ -49,12 +46,12 @@ SparseVector operator-(const SparseVector& lhs, const SparseVector& rhs);
class FeatureStats
{
private:
size_t available_;
size_t entries_;
size_t m_available_size;
size_t m_entries;
// TODO: Use smart pointer for exceptional-safety.
featstats_t array_;
SparseVector map_;
featstats_t m_array;
SparseVector m_map;
public:
FeatureStats();
@ -69,64 +66,47 @@ public:
void Copy(const FeatureStats &stats);
bool isfull() const {
return (entries_ < available_) ? 0 : 1;
}
bool isfull() const { return (m_entries < m_available_size) ? 0 : 1; }
void expand();
void add(FeatureStatsType v);
void addSparse(const string& name, FeatureStatsType v);
void clear() {
memset((void*)array_, 0, GetArraySizeWithBytes());
map_.clear();
memset((void*)m_array, 0, GetArraySizeWithBytes());
m_map.clear();
}
void reset() {
entries_ = 0;
m_entries = 0;
clear();
}
inline FeatureStatsType get(size_t i) {
return array_[i];
}
inline FeatureStatsType get(size_t i)const {
return array_[i];
}
inline featstats_t getArray() const {
return array_;
}
inline const SparseVector& getSparse() const {
return map_;
}
FeatureStatsType get(size_t i) { return m_array[i]; }
FeatureStatsType get(size_t i)const { return m_array[i]; }
featstats_t getArray() const { return m_array; }
const SparseVector& getSparse() const { return m_map; }
void set(std::string &theString);
inline size_t bytes() const {
return GetArraySizeWithBytes();
}
inline size_t bytes() const { return GetArraySizeWithBytes(); }
size_t GetArraySizeWithBytes() const {
return entries_ * sizeof(FeatureStatsType);
return m_entries * sizeof(FeatureStatsType);
}
inline size_t size() const {
return entries_;
}
size_t size() const { return m_entries; }
inline size_t available() const {
return available_;
}
size_t available() const { return m_available_size; }
void savetxt(const std::string &file);
void savetxt(ofstream& outFile);
void savebin(ofstream& outFile);
inline void savetxt() {
savetxt("/dev/stdout");
}
void savetxt(std::ostream* os);
void savebin(std::ostream* os);
void savetxt();
void loadtxt(const std::string &file);
void loadtxt(ifstream& inFile);
void loadbin(ifstream& inFile);
void loadtxt(std::istream* is);
void loadbin(std::istream* is);
/**
* Write the whole object to a stream.

View File

@ -13,11 +13,11 @@ bool IsGzipFile(const std::string &filename) {
} // namespace
inputfilestream::inputfilestream(const std::string &filePath)
: std::istream(0), m_streambuf(0), is_good(false)
: std::istream(0), m_streambuf(0), m_is_good(false)
{
// check if file is readable
std::filebuf* fb = new std::filebuf();
is_good = (fb->open(filePath.c_str(), std::ios::in) != NULL);
m_is_good = (fb->open(filePath.c_str(), std::ios::in) != NULL);
if (IsGzipFile(filePath)) {
fb->close();
@ -40,11 +40,11 @@ void inputfilestream::close()
}
outputfilestream::outputfilestream(const std::string &filePath)
: std::ostream(0), m_streambuf(0), is_good(false)
: std::ostream(0), m_streambuf(0), m_is_good(false)
{
// check if file is readable
std::filebuf* fb = new std::filebuf();
is_good = (fb->open(filePath.c_str(), std::ios::out) != NULL);
m_is_good = (fb->open(filePath.c_str(), std::ios::out) != NULL);
if (IsGzipFile(filePath)) {
throw runtime_error("Output to a zipped file not supported!");

View File

@ -2,6 +2,7 @@
#define MERT_FILE_STREAM_H_
#include <fstream>
#include <iostream>
#include <streambuf>
#include <string>
@ -9,13 +10,13 @@ class inputfilestream : public std::istream
{
protected:
std::streambuf *m_streambuf;
bool is_good;
bool m_is_good;
public:
explicit inputfilestream(const std::string &filePath);
virtual ~inputfilestream();
bool good() const { return is_good; }
bool good() const { return m_is_good; }
void close();
};
@ -23,13 +24,13 @@ class outputfilestream : public std::ostream
{
protected:
std::streambuf *m_streambuf;
bool is_good;
bool m_is_good;
public:
explicit outputfilestream(const std::string &filePath);
virtual ~outputfilestream();
bool good() const { return is_good; }
bool good() const { return m_is_good; }
void close();
};
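
A usage sketch of the renamed stream wrappers (illustrative only; the path is made up, and gzip handling follows the detection shown above):

    #include <iostream>
    #include <string>
    #include "FileStream.h"

    int main() {
      inputfilestream in("nbest.out.gz");   // hypothetical path
      if (!in.good()) {
        std::cerr << "failed to open file" << std::endl;
        return 1;
      }
      std::string line;
      while (getline(in, line)) {
        // ... process one n-best line ...
      }
      in.close();
      return 0;
    }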

View File

@ -1,35 +1,36 @@
#include "ScorerFactory.h"
#include "InterpolatedScorer.h"
#include "ScorerFactory.h"
#include "Util.h"
using namespace std;
InterpolatedScorer::InterpolatedScorer (const string& name, const string& config): Scorer(name,config)
// TODO: This is too long. Consider creating a function for
// initialization such as Init().
InterpolatedScorer::InterpolatedScorer(const string& name, const string& config)
: Scorer(name,config)
{
// name would be: HAMMING,BLEU or similar
string scorers = name;
while (scorers.length() > 0) {
string scorertype = "";
getNextPound(scorers,scorertype,",");
Scorer *theScorer=ScorerFactory::getScorer(scorertype,config);
_scorers.push_back(theScorer);
getNextPound(scorers, scorertype,",");
Scorer *scorer = ScorerFactory::getScorer(scorertype,config);
m_scorers.push_back(scorer);
}
if (_scorers.size() == 0) {
if (m_scorers.size() == 0) {
throw runtime_error("There are no scorers");
}
cerr << "Number of scorers: " << _scorers.size() << endl;
cerr << "Number of scorers: " << m_scorers.size() << endl;
//TODO debug this
string wtype = getConfig("weights","");
//Default weights set to uniform ie. if two weights 0.5 each
//weights should add to 1
if (wtype.length() == 0) {
float weight = 1.0/_scorers.size() ;
float weight = 1.0 / m_scorers.size() ;
//cout << " Default weights:" << weight << endl;
for (size_t i = 0; i < _scorers.size(); i ++) {
_scorerWeights.push_back(weight);
for (size_t i = 0; i < m_scorers.size(); i ++) {
m_scorer_weights.push_back(weight);
}
} else {
float tot=0;
@ -38,24 +39,24 @@ InterpolatedScorer::InterpolatedScorer (const string& name, const string& config
string scoreweight = "";
getNextPound(wtype,scoreweight,"+");
float weight = atof(scoreweight.c_str());
_scorerWeights.push_back(weight);
m_scorer_weights.push_back(weight);
tot += weight;
//cout << " :" << weight ;
}
//cout << endl;
if (tot != float(1)) {
for (vector<float>::iterator it = _scorerWeights.begin(); it != _scorerWeights.end(); ++it)
{
if (tot != float(1)) { // TODO: fix this checking in terms of readability.
for (vector<float>::iterator it = m_scorer_weights.begin();
it != m_scorer_weights.end(); ++it) {
*it /= tot;
}
}
if (_scorers.size() != _scorerWeights.size()) {
if (m_scorers.size() != m_scorer_weights.size()) {
throw runtime_error("The number of weights does not equal the number of scorers!");
}
}
cerr << "The weights for the interpolated scorers are: " << endl;
for (vector<float>::iterator it = _scorerWeights.begin(); it < _scorerWeights.end(); it++) {
for (vector<float>::iterator it = m_scorer_weights.begin(); it < m_scorer_weights.end(); it++) {
cerr << *it << " " ;
}
cerr <<endl;
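
As the constructor above shows, explicit weights come in as a '+'-separated list and are rescaled so they sum to one. A standalone sketch of that normalization step (not taken from the commit):

    #include <iostream>
    #include <vector>

    int main() {
      std::vector<float> weights;       // e.g. parsed from "2+1+1"
      weights.push_back(2.0f);
      weights.push_back(1.0f);
      weights.push_back(1.0f);
      float total = 0.0f;
      for (size_t i = 0; i < weights.size(); ++i) total += weights[i];
      for (size_t i = 0; i < weights.size(); ++i) weights[i] /= total;
      for (size_t i = 0; i < weights.size(); ++i) std::cout << weights[i] << " ";
      std::cout << std::endl;           // prints "0.5 0.25 0.25"
      return 0;
    }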
@ -65,9 +66,10 @@ void InterpolatedScorer::setScoreData(ScoreData* data)
{
size_t last = 0;
m_score_data = data;
for (ScopedVector<Scorer>::iterator itsc = _scorers.begin(); itsc!=_scorers.end(); itsc++) {
for (ScopedVector<Scorer>::iterator itsc = m_scorers.begin();
itsc != m_scorers.end(); ++itsc) {
int numScoresScorer = (*itsc)->NumberOfScores();
ScoreData* newData =new ScoreData(**itsc);
ScoreData* newData =new ScoreData(*itsc);
for (size_t i = 0; i < data->size(); i++) {
ScoreArray scoreArray = data->get(i);
ScoreArray newScoreArray;
@ -110,14 +112,16 @@ void InterpolatedScorer::score(const candidates_t& candidates, const diffs_t& di
{
//cout << "*******InterpolatedScorer::score" << endl;
size_t scorerNum = 0;
for (ScopedVector<Scorer>::const_iterator itsc = _scorers.begin(); itsc!=_scorers.end(); itsc++) {
for (ScopedVector<Scorer>::const_iterator itsc = m_scorers.begin();
itsc != m_scorers.end(); ++itsc) {
//int numScores = (*itsc)->NumberOfScores();
statscores_t tscores;
(*itsc)->score(candidates,diffs,tscores);
size_t inc = 0;
for (statscores_t::iterator itstatsc = tscores.begin(); itstatsc!=tscores.end(); itstatsc++) {
for (statscores_t::iterator itstatsc = tscores.begin();
itstatsc != tscores.end(); ++itstatsc) {
//cout << "Scores " << (*itstatsc) << endl;
float weight = _scorerWeights[scorerNum];
float weight = m_scorer_weights[scorerNum];
if (weight == 0) {
stringstream msg;
msg << "No weights for scorer" << scorerNum ;
@ -139,7 +143,8 @@ void InterpolatedScorer::score(const candidates_t& candidates, const diffs_t& di
void InterpolatedScorer::setReferenceFiles(const vector<string>& referenceFiles)
{
for (ScopedVector<Scorer>::iterator itsc = _scorers.begin(); itsc!=_scorers.end(); itsc++) {
for (ScopedVector<Scorer>::iterator itsc = m_scorers.begin();
itsc != m_scorers.end(); ++itsc) {
(*itsc)->setReferenceFiles(referenceFiles);
}
}
@ -147,8 +152,9 @@ void InterpolatedScorer::setReferenceFiles(const vector<string>& referenceFiles)
void InterpolatedScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
{
stringstream buff;
int i=0;
for (ScopedVector<Scorer>::iterator itsc = _scorers.begin(); itsc!=_scorers.end(); itsc++) {
int i = 0;
for (ScopedVector<Scorer>::iterator itsc = m_scorers.begin();
itsc != m_scorers.end(); ++itsc) {
ScoreStats tempEntry;
(*itsc)->prepareStats(sid, text, tempEntry);
if (i > 0) buff << " ";
@ -167,16 +173,10 @@ void InterpolatedScorer::setFactors(const string& factors)
vector<string> fsplit;
split(factors, ',', fsplit);
if (fsplit.size() != _scorers.size()) throw runtime_error("Number of factor specifications does not equal number of interpolated scorers.");
for (size_t i = 0; i < _scorers.size(); ++i)
{
_scorers[i]->setFactors(fsplit[i]);
if (fsplit.size() != m_scorers.size())
throw runtime_error("Number of factor specifications does not equal number of interpolated scorers.");
for (size_t i = 0; i < m_scorers.size(); ++i) {
m_scorers[i]->setFactors(fsplit[i]);
}
}

View File

@ -1,14 +1,6 @@
#ifndef __INTERPOLATED_SCORER_H__
#define __INTERPOLATED_SCORER_H__
#ifndef MERT_INTERPOLATED_SCORER_H_
#define MERT_INTERPOLATED_SCORER_H_
#include <algorithm>
#include <cmath>
#include <iostream>
#include <iterator>
#include <limits>
#include <set>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
#include "Types.h"
@ -33,12 +25,13 @@ public:
virtual void prepareStats(size_t sid, const string& text, ScoreStats& entry);
virtual size_t NumberOfScores() const {
size_t sz=0;
for (ScopedVector<Scorer>::const_iterator itsc = _scorers.begin(); itsc != _scorers.end(); itsc++) {
size_t sz = 0;
for (ScopedVector<Scorer>::const_iterator itsc = m_scorers.begin();
itsc != m_scorers.end(); ++itsc) {
sz += (*itsc)->NumberOfScores();
}
return sz;
};
}
virtual void setScoreData(ScoreData* data);
@ -48,13 +41,13 @@ public:
virtual void setFactors(const string& factors);
protected:
ScopedVector<Scorer> _scorers;
ScopedVector<Scorer> m_scorers;
// Takes ownership of the heap-allocated ScoreData objects created
// for the Scorer objects.
ScopedVector<ScoreData> m_scorers_score_data;
vector<float> _scorerWeights;
vector<float> m_scorer_weights;
};
#endif //__INTERPOLATED_SCORER_H
#endif // MERT_INTERPOLATED_SCORER_H_

View File

@ -6,9 +6,13 @@ lib mert_lib :
Util.cpp
FileStream.cpp
Timer.cpp
ScoreStats.cpp ScoreArray.cpp ScoreData.cpp
ScoreStats.cpp
ScoreArray.cpp
ScoreData.cpp
ScoreDataIterator.cpp
FeatureStats.cpp FeatureArray.cpp FeatureData.cpp
FeatureStats.cpp
FeatureArray.cpp
FeatureData.cpp
FeatureDataIterator.cpp
Data.cpp
BleuScorer.cpp
@ -18,6 +22,7 @@ PerScorer.cpp
Scorer.cpp
ScorerFactory.cpp
Optimizer.cpp
OptimizerFactory.cpp
TER/alignmentStruct.cpp
TER/hashMap.cpp
TER/hashMapStringInfos.cpp
@ -32,6 +37,7 @@ TER/tools.cpp
TerScorer.cpp
CderScorer.cpp
MergeScorer.cpp
Vocabulary.cpp
../util//kenutil m ..//z ;
exe mert : mert.cpp mert_lib ../moses/src//ThreadPool ;
@ -44,8 +50,15 @@ exe pro : pro.cpp mert_lib ..//boost_program_options ;
alias programs : mert extractor evaluator pro ;
unit-test bleu_scorer_test : BleuScorerTest.cpp mert_lib ..//boost_unit_test_framework ;
unit-test feature_data_test : FeatureDataTest.cpp mert_lib ..//boost_unit_test_framework ;
unit-test data_test : DataTest.cpp mert_lib ..//boost_unit_test_framework ;
unit-test ngram_test : NgramTest.cpp mert_lib ..//boost_unit_test_framework ;
unit-test optimizer_factory_test : OptimizerFactoryTest.cpp mert_lib ..//boost_unit_test_framework ;
unit-test reference_test : ReferenceTest.cpp mert_lib ..//boost_unit_test_framework ;
unit-test singleton_test : SingletonTest.cpp mert_lib ..//boost_unit_test_framework ;
unit-test timer_test : TimerTest.cpp mert_lib ..//boost_unit_test_framework ;
unit-test util_test : UtilTest.cpp mert_lib ..//boost_unit_test_framework ;
unit-test vocabulary_test : VocabularyTest.cpp mert_lib ..//boost_unit_test_framework ;
install legacy : programs : <location>. ;

View File

@ -14,7 +14,8 @@
using namespace TERCpp;
MergeScorer::MergeScorer(const string& config)
: StatisticsBasedScorer("MERGE",config), kLENGTH(4) {}
: StatisticsBasedScorer("MERGE", config) {}
MergeScorer::~MergeScorer() {}
void MergeScorer::setReferenceFiles(const vector<string>& referenceFiles)

View File

@ -13,6 +13,8 @@ using namespace std;
class PerScorer;
class ScoreStats;
const int kMergeScorerLength = 4;
/**
* Merge scoring.
*/
@ -23,23 +25,13 @@ public:
virtual void setReferenceFiles(const vector<string>& referenceFiles);
virtual void prepareStats(size_t sid, const string& text, ScoreStats& entry);
virtual size_t NumberOfScores() const
{
return 0;
}
void whoami() const {
cerr << "I AM MergeScorer" << endl;
}
virtual size_t NumberOfScores() const { return 0; }
protected:
friend class PerScorer;
virtual float calculateScore(const vector<int>& comps) const;
private:
const int kLENGTH;
// no copying allowed
MergeScorer(const MergeScorer&);
MergeScorer& operator=(const MergeScorer&);

mert/Ngram.h (new file, 98 lines)
View File

@ -0,0 +1,98 @@
#ifndef MERT_NGRAM_H_
#define MERT_NGRAM_H_
#include <vector>
#include <map>
#include <string>
/** A simple STL std::map-based container for n-gram counts. Basically, we provide
* typical accessors and mutators, but we intentionally do not allow
* erasing elements.
*/
class NgramCounts {
public:
// Used to construct the ngram map
struct NgramComparator {
bool operator()(const std::vector<int>& a, const std::vector<int>& b) const {
std::size_t i;
const std::size_t as = a.size();
const std::size_t bs = b.size();
for (i = 0; i < as && i < bs; ++i) {
if (a[i] < b[i]) {
return true;
}
if (a[i] > b[i]) {
return false;
}
}
// entries are equal, shortest wins
return as < bs;
}
};
typedef std::vector<int> Key;
typedef int Value;
typedef std::map<Key, Value, NgramComparator>::iterator iterator;
typedef std::map<Key, Value, NgramComparator>::const_iterator const_iterator;
NgramCounts() : kDefaultCount(1) { }
virtual ~NgramCounts() { }
/**
* If the specified "ngram" is found, we add counts.
* If not, we insert the default count in the container. */
void Add(const Key& ngram) {
const_iterator it = find(ngram);
if (it != end()) {
m_counts[ngram] = it->second + 1;
} else {
m_counts[ngram] = kDefaultCount;
}
}
/**
* Return true iff the specified "ngram" is found in the container.
*/
bool Lookup(const Key& ngram, Value* v) const {
const_iterator it = m_counts.find(ngram);
if (it == m_counts.end()) return false;
*v = it->second;
return true;
}
/**
* Clear all elements in the container.
*/
void clear() { m_counts.clear(); }
/**
* Return true iff the container is empty.
*/
bool empty() const { return m_counts.empty(); }
/**
* Return the number of elements in the container.
*/
std::size_t size() const { return m_counts.size(); }
std::size_t max_size() const { return m_counts.max_size(); }
// Note: This is mainly used by unit tests.
int get_default_count() const { return kDefaultCount; }
iterator find(const Key& ngram) { return m_counts.find(ngram); }
const_iterator find(const Key& ngram) const { return m_counts.find(ngram); }
Value& operator[](const Key& ngram) { return m_counts[ngram]; }
iterator begin() { return m_counts.begin(); }
const_iterator begin() const { return m_counts.begin(); }
iterator end() { return m_counts.end(); }
const_iterator end() const { return m_counts.end(); }
private:
const int kDefaultCount;
std::map<Key, Value, NgramComparator> m_counts;
};
#endif // MERT_NGRAM_H_
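
An illustrative sketch (not part of the commit) of the counting and ordering behaviour: repeated Add() calls increment a key's count, and NgramComparator orders a shorter key before a longer key that shares its prefix.

    #include <iostream>
    #include "Ngram.h"

    int main() {
      NgramCounts counts;
      NgramCounts::Key bigram;
      bigram.push_back(7);
      bigram.push_back(9);
      counts.Add(bigram);
      counts.Add(bigram);                    // second Add() bumps the count to 2

      NgramCounts::Key trigram(bigram);
      trigram.push_back(11);
      counts.Add(trigram);

      // Iteration visits {7,9} (count 2) before {7,9,11} (count 1).
      for (NgramCounts::const_iterator it = counts.begin(); it != counts.end(); ++it) {
        std::cout << it->first.size() << "-gram: " << it->second << std::endl;
      }
      return 0;
    }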

mert/NgramTest.cpp (new file, 83 lines)
View File

@ -0,0 +1,83 @@
#include "Ngram.h"
#define BOOST_TEST_MODULE MertNgram
#include <boost/test/unit_test.hpp>
BOOST_AUTO_TEST_CASE(ngram_basic) {
NgramCounts counts;
NgramCounts::Key key;
key.push_back(1);
key.push_back(2);
key.push_back(4);
counts.Add(key);
BOOST_REQUIRE(!counts.empty());
BOOST_CHECK_EQUAL(counts.size(), 1);
NgramCounts::const_iterator it = counts.find(key);
BOOST_CHECK(it != counts.end());
BOOST_CHECK_EQUAL(it->first.size(), key.size());
for (size_t i = 0; i < key.size(); ++i) {
BOOST_CHECK_EQUAL(it->first[i], key[i]);
}
BOOST_CHECK_EQUAL(it->second, 1);
}
BOOST_AUTO_TEST_CASE(ngram_Add) {
NgramCounts counts;
NgramCounts::Key key;
key.push_back(1);
key.push_back(2);
counts.Add(key);
BOOST_REQUIRE(!counts.empty());
BOOST_CHECK_EQUAL(counts[key], counts.get_default_count());
NgramCounts::Key key2;
key2.push_back(1);
key2.push_back(2);
counts.Add(key2);
BOOST_CHECK_EQUAL(counts.size(), 1);
BOOST_CHECK_EQUAL(counts[key], counts.get_default_count() + 1);
BOOST_CHECK_EQUAL(counts[key2], counts.get_default_count() + 1);
NgramCounts::Key key3;
key3.push_back(10);
counts.Add(key3);
BOOST_CHECK_EQUAL(counts.size(), 2);
BOOST_CHECK_EQUAL(counts[key3], counts.get_default_count());
}
BOOST_AUTO_TEST_CASE(ngram_lookup) {
NgramCounts counts;
NgramCounts::Key key;
key.push_back(1);
key.push_back(2);
key.push_back(4);
counts.Add(key);
{
NgramCounts::Value v;
BOOST_REQUIRE(counts.Lookup(key, &v));
BOOST_CHECK_EQUAL(v, 1);
}
// the case the key is not found.
{
NgramCounts::Key key2;
key2.push_back(0);
key2.push_back(4);
NgramCounts::Value v;
// We only check the return value;
// we don't check the value of "v" because it makes sense
// to check the value when the specified ngram is found.
BOOST_REQUIRE(!counts.Lookup(key2, &v));
}
// test after clear
counts.clear();
BOOST_CHECK(counts.empty());
{
NgramCounts::Value v;
BOOST_CHECK(!counts.Lookup(key, &v));
}
}

View File

@ -32,36 +32,25 @@ inline float intersect(float m1, float b1, float m2, float b2)
} // namespace
void Optimizer::SetScorer(Scorer *_scorer)
Optimizer::Optimizer(unsigned Pd, const vector<unsigned>& i2O, const vector<parameter_t>& start, unsigned int nrandom)
: m_scorer(NULL), m_feature_data(), m_num_random_directions(nrandom)
{
scorer = _scorer;
}
void Optimizer::SetFData(FeatureDataHandle _FData)
{
FData = _FData;
}
Optimizer::Optimizer(unsigned Pd, vector<unsigned> i2O, vector<parameter_t> start, unsigned int nrandom)
: scorer(NULL), FData(), number_of_random_directions(nrandom)
{
// Warning: the init vector is a full set of parameters, of dimension pdim!
Point::pdim = Pd;
// Warning: the init vector is a full set of parameters, of dimension m_pdim!
Point::m_pdim = Pd;
CHECK(start.size() == Pd);
Point::dim = i2O.size();
Point::optindices = i2O;
if (Point::pdim > Point::dim) {
for (unsigned int i = 0; i < Point::pdim; i++) {
Point::m_dim = i2O.size();
Point::m_opt_indices = i2O;
if (Point::m_pdim > Point::m_dim) {
for (unsigned int i = 0; i < Point::m_pdim; i++) {
unsigned int j = 0;
while (j < Point::dim && i != i2O[j])
while (j < Point::m_dim && i != i2O[j])
j++;
// The index i wasnt found on optindices, it is a fixed index,
// The index i wasn't found in m_opt_indices; it is a fixed index, so
// we use the value of the start vector.
if (j == Point::dim)
Point::fixedweights[i] = start[i];
if (j == Point::m_dim)
Point::m_fixed_weights[i] = start[i];
}
}
}
@ -76,7 +65,7 @@ statscore_t Optimizer::GetStatScore(const Point& param) const
return score;
}
map<float,diff_t >::iterator AddThreshold(map<float,diff_t >& thresholdmap, float newt, pair<unsigned,unsigned> newdiff)
map<float,diff_t >::iterator AddThreshold(map<float,diff_t >& thresholdmap, float newt, const pair<unsigned,unsigned>& newdiff)
{
map<float,diff_t>::iterator it = thresholdmap.find(newt);
if (it != thresholdmap.end()) {
@ -112,12 +101,12 @@ statscore_t Optimizer::LineOptimize(const Point& origin, const Point& direction,
//cerr << "Sentence " << S << endl;
multimap<float, unsigned> gradient;
vector<float> f0;
f0.resize(FData->get(S).size());
for (unsigned j = 0; j < FData->get(S).size(); j++) {
f0.resize(m_feature_data->get(S).size());
for (unsigned j = 0; j < m_feature_data->get(S).size(); j++) {
// gradient of the feature function for this particular target sentence
gradient.insert(pair<float, unsigned>(direction * (FData->get(S,j)), j));
gradient.insert(pair<float, unsigned>(direction * (m_feature_data->get(S,j)), j));
// compute the feature function at the origin point
f0[j] = origin * FData->get(S, j);
f0[j] = origin * m_feature_data->get(S, j);
}
// Now let's compute the 1best for each value of x.
@ -308,7 +297,7 @@ statscore_t Optimizer::LineOptimize(const Point& origin, const Point& direction,
void Optimizer::Get1bests(const Point& P, vector<unsigned>& bests) const
{
CHECK(FData);
CHECK(m_feature_data);
bests.clear();
bests.resize(size());
@ -316,8 +305,8 @@ void Optimizer::Get1bests(const Point& P, vector<unsigned>& bests) const
float bestfs = MIN_FLOAT;
unsigned idx = 0;
unsigned j;
for (j = 0; j < FData->get(i).size(); j++) {
float curfs = P * FData->get(i, j);
for (j = 0; j < m_feature_data->get(i).size(); j++) {
float curfs = P * m_feature_data->get(i, j);
if (curfs > bestfs) {
bestfs = curfs;
idx = j;
@ -330,15 +319,15 @@ void Optimizer::Get1bests(const Point& P, vector<unsigned>& bests) const
statscore_t Optimizer::Run(Point& P) const
{
if (!FData) {
if (!m_feature_data) {
cerr << "error trying to optimize without Features loaded" << endl;
exit(2);
}
if (!scorer) {
if (!m_scorer) {
cerr << "error trying to optimize without a Scorer loaded" << endl;
exit(2);
}
if (scorer->getReferenceSize() != FData->size()) {
if (m_scorer->getReferenceSize() != m_feature_data->size()) {
cerr << "error length mismatch between feature file and score file" << endl;
exit(2);
}
@ -359,13 +348,13 @@ statscore_t Optimizer::Run(Point& P) const
}
vector<statscore_t> Optimizer::GetIncStatScore(vector<unsigned> thefirst, vector<vector <pair<unsigned,unsigned> > > thediffs) const
vector<statscore_t> Optimizer::GetIncStatScore(const vector<unsigned>& thefirst, const vector<vector <pair<unsigned,unsigned> > >& thediffs) const
{
CHECK(scorer);
CHECK(m_scorer);
vector<statscore_t> theres;
scorer->score(thefirst, thediffs, theres);
m_scorer->score(thefirst, thediffs, theres);
return theres;
}
@ -392,7 +381,7 @@ statscore_t SimpleOptimizer::TrueRun(Point& P) const
Point linebest;
for (unsigned int d = 0; d < Point::getdim()+number_of_random_directions; d++) {
for (unsigned int d = 0; d < Point::getdim() + m_num_random_directions; d++) {
if (verboselevel() > 4) {
// cerr<<"minimizing along direction "<<d<<endl;
cerr << "starting point: " << P << " => " << prevscore << endl;
@ -440,7 +429,7 @@ statscore_t RandomDirectionOptimizer::TrueRun(Point& P) const
// do specified number of random direction optimizations
unsigned int nrun = 0;
unsigned int nrun_no_change = 0;
for (; nrun_no_change < number_of_random_directions; nrun++, nrun_no_change++)
for (; nrun_no_change < m_num_random_directions; nrun++, nrun_no_change++)
{
// choose a random direction in which to optimize
Point direction;
@ -473,63 +462,3 @@ statscore_t RandomOptimizer::TrueRun(Point& P) const
P.SetScore(score);
return score;
}
//--------------------------------------
vector<string> OptimizerFactory::typenames;
void OptimizerFactory::SetTypeNames()
{
if (typenames.empty()) {
typenames.resize(NOPTIMIZER);
typenames[POWELL]="powell";
typenames[RANDOM_DIRECTION]="random-direction";
typenames[RANDOM]="random";
// Add new type there
}
}
vector<string> OptimizerFactory::GetTypeNames()
{
if (typenames.empty())
SetTypeNames();
return typenames;
}
OptimizerFactory::OptType OptimizerFactory::GetOType(const string& type)
{
unsigned int thetype;
if (typenames.empty())
SetTypeNames();
for (thetype = 0; thetype < typenames.size(); thetype++)
if (typenames[thetype] == type)
break;
return((OptType)thetype);
}
Optimizer* OptimizerFactory::BuildOptimizer(unsigned dim, vector<unsigned> i2o, vector<parameter_t> start, const string& type, unsigned int nrandom)
{
OptType T = GetOType(type);
if (T == NOPTIMIZER) {
cerr << "Error: unknown Optimizer type " << type << endl;
cerr << "Known Algorithm are:" << endl;
unsigned int thetype;
for (thetype = 0; thetype < typenames.size(); thetype++)
cerr << typenames[thetype] << endl;
throw ("unknown Optimizer Type");
}
switch ((OptType)T) {
case POWELL:
return new SimpleOptimizer(dim, i2o, start, nrandom);
break;
case RANDOM_DIRECTION:
return new RandomDirectionOptimizer(dim, i2o, start, nrandom);
break;
case RANDOM:
return new RandomOptimizer(dim, i2o, start, nrandom);
break;
default:
cerr << "Error: unknown optimizer" << type << endl;
return NULL;
}
}

View File

@ -10,8 +10,6 @@
using namespace std;
typedef float featurescore;
class Point;
/**
@ -20,18 +18,19 @@ class Point;
class Optimizer
{
protected:
Scorer *scorer; // no accessor for them only child can use them
FeatureDataHandle FData; // no accessor for them only child can use them
unsigned int number_of_random_directions;
Scorer *m_scorer; // no accessor for them only child can use them
FeatureDataHandle m_feature_data; // no accessor for them only child can use them
unsigned int m_num_random_directions;
public:
Optimizer(unsigned Pd, vector<unsigned> i2O, vector<parameter_t> start, unsigned int nrandom);
void SetScorer(Scorer *_scorer);
void SetFData(FeatureDataHandle _FData);
Optimizer(unsigned Pd, const vector<unsigned>& i2O, const vector<parameter_t>& start, unsigned int nrandom);
void SetScorer(Scorer *scorer) { m_scorer = scorer; }
void SetFeatureData(FeatureDataHandle feature_data) { m_feature_data = feature_data; }
virtual ~Optimizer();
unsigned size() const {
return FData ? FData->size() : 0;
return m_feature_data ? m_feature_data->size() : 0;
}
/**
@ -53,12 +52,12 @@ public:
* Given a set of nbests, get the Statistical score.
*/
statscore_t GetStatScore(const vector<unsigned>& nbests) const {
return scorer->score(nbests);
return m_scorer->score(nbests);
}
statscore_t GetStatScore(const Point& param) const;
vector<statscore_t> GetIncStatScore(vector<unsigned> ref, vector<vector<pair<unsigned,unsigned> > >) const;
vector<statscore_t> GetIncStatScore(const vector<unsigned>& ref, const vector<vector<pair<unsigned,unsigned> > >& diffs) const;
/**
* Get the optimal Lambda and the best score in a particular direction from a given Point.
@ -76,7 +75,7 @@ class SimpleOptimizer : public Optimizer
private:
const float kEPS;
public:
SimpleOptimizer(unsigned dim, vector<unsigned> i2O, vector<parameter_t> start, unsigned int nrandom)
SimpleOptimizer(unsigned dim, const vector<unsigned>& i2O, const vector<parameter_t>& start, unsigned int nrandom)
: Optimizer(dim, i2O, start,nrandom), kEPS(0.0001) {}
virtual statscore_t TrueRun(Point&) const;
};
@ -89,7 +88,7 @@ class RandomDirectionOptimizer : public Optimizer
private:
const float kEPS;
public:
RandomDirectionOptimizer(unsigned dim, vector<unsigned> i2O, vector<parameter_t> start, unsigned int nrandom)
RandomDirectionOptimizer(unsigned dim, const vector<unsigned>& i2O, const vector<parameter_t>& start, unsigned int nrandom)
: Optimizer(dim, i2O, start, nrandom), kEPS(0.0001) {}
virtual statscore_t TrueRun(Point&) const;
};
@ -100,36 +99,9 @@ public:
class RandomOptimizer : public Optimizer
{
public:
RandomOptimizer(unsigned dim, vector<unsigned> i2O, vector<parameter_t> start, unsigned int nrandom)
RandomOptimizer(unsigned dim, const vector<unsigned>& i2O, const vector<parameter_t>& start, unsigned int nrandom)
: Optimizer(dim, i2O, start, nrandom) {}
virtual statscore_t TrueRun(Point&) const;
};
class OptimizerFactory
{
public:
static vector<string> GetTypeNames();
static Optimizer* BuildOptimizer(unsigned dim, vector<unsigned> tooptimize, vector<parameter_t> start, const string& type, unsigned int nrandom);
private:
OptimizerFactory() {}
~OptimizerFactory() {}
// Add new optimizer here BEFORE NOPTIMZER
enum OptType {
POWELL = 0,
RANDOM_DIRECTION = 1,
RANDOM,
NOPTIMIZER
};
// Get optimizer type.
static OptType GetOType(const string& type);
// Setup optimization types.
static void SetTypeNames();
static vector<string> typenames;
};
#endif // OPTIMIZER_H
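
A rough sketch (illustrative only; the scorer and feature data are assumed to be loaded elsewhere, and all values are hypothetical) of wiring an optimizer built by the new factory to its inputs and running it:

    #include <vector>
    #include "Optimizer.h"
    #include "OptimizerFactory.h"
    #include "Point.h"

    // Sketch only: run one Powell optimization over the given feature data.
    statscore_t OptimizeOnce(Scorer* scorer, FeatureDataHandle feature_data,
                             const std::vector<parameter_t>& start,
                             const std::vector<parameter_t>& min,
                             const std::vector<parameter_t>& max) {
      std::vector<unsigned> to_optimize;
      for (unsigned i = 0; i < start.size(); ++i)
        to_optimize.push_back(i);             // optimize every dimension here
      Optimizer* opt = OptimizerFactory::BuildOptimizer(
          start.size(), to_optimize, start, "powell", 10);
      opt->SetScorer(scorer);                 // renamed inline setters
      opt->SetFeatureData(feature_data);
      Point p(start, min, max);               // Point statics were set by the ctor above
      statscore_t best = opt->Run(p);
      delete opt;
      return best;
    }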

mert/OptimizerFactory.cpp (new file, 66 lines)
View File

@ -0,0 +1,66 @@
#include "OptimizerFactory.h"
#include "Optimizer.h"
using namespace std;
vector<string> OptimizerFactory::m_type_names;
void OptimizerFactory::SetTypeNames()
{
if (m_type_names.empty()) {
m_type_names.resize(NOPTIMIZER);
m_type_names[POWELL] = "powell";
m_type_names[RANDOM_DIRECTION] = "random-direction";
m_type_names[RANDOM] = "random";
// Add new type there
}
}
vector<string> OptimizerFactory::GetTypeNames()
{
if (m_type_names.empty())
SetTypeNames();
return m_type_names;
}
OptimizerFactory::OptimizerType OptimizerFactory::GetOptimizerType(const string& type)
{
unsigned int t;
if (m_type_names.empty())
SetTypeNames();
for (t = 0; t < m_type_names.size(); t++)
if (m_type_names[t] == type)
break;
return((OptimizerType)t);
}
Optimizer* OptimizerFactory::BuildOptimizer(unsigned dim,
const vector<unsigned>& i2o,
const vector<parameter_t>& start,
const string& type,
unsigned int nrandom)
{
OptimizerType opt_type = GetOptimizerType(type);
if (opt_type == NOPTIMIZER) {
cerr << "Error: unknown Optimizer type " << type << endl;
cerr << "Known Algorithm are:" << endl;
unsigned int t;
for (t = 0; t < m_type_names.size(); t++)
cerr << m_type_names[t] << endl;
throw ("unknown Optimizer Type");
}
switch (opt_type) {
case POWELL:
return new SimpleOptimizer(dim, i2o, start, nrandom);
break;
case RANDOM_DIRECTION:
return new RandomDirectionOptimizer(dim, i2o, start, nrandom);
break;
case RANDOM:
return new RandomOptimizer(dim, i2o, start, nrandom);
break;
default:
cerr << "Error: unknown optimizer" << type << endl;
return NULL;
}
}

mert/OptimizerFactory.h (new file, 41 lines)
View File

@ -0,0 +1,41 @@
#ifndef MERT_OPTIMIZER_FACTORY_H_
#define MERT_OPTIMIZER_FACTORY_H_
#include <vector>
#include "Types.h"
class Optimizer;
class OptimizerFactory
{
public:
// NOTE: Add new optimizer here BEFORE NOPTIMZER
enum OptimizerType {
POWELL = 0,
RANDOM_DIRECTION = 1,
RANDOM,
NOPTIMIZER
};
static std::vector<string> GetTypeNames();
// Setup optimization types.
static void SetTypeNames();
// Get optimizer type.
static OptimizerType GetOptimizerType(const std::string& type);
static Optimizer* BuildOptimizer(unsigned dim,
const std::vector<unsigned>& to_optimize,
const std::vector<parameter_t>& start,
const std::string& type,
unsigned int nrandom);
private:
OptimizerFactory() {}
~OptimizerFactory() {}
static vector<string> m_type_names;
};
#endif // MERT_OPTIMIZER_FACTORY_H_

View File

@ -0,0 +1,46 @@
#include "OptimizerFactory.h"
#include "Optimizer.h"
#define BOOST_TEST_MODULE MertOptimizerFactory
#include <boost/test/unit_test.hpp>
#include <boost/scoped_ptr.hpp>
namespace {
inline void CheckBuildOptimizer(unsigned dim,
const vector<unsigned>& to_optimize,
const vector<parameter_t>& start,
const string& type,
unsigned int num_random) {
boost::scoped_ptr<Optimizer> optimizer(
OptimizerFactory::BuildOptimizer(dim, to_optimize, start, type, num_random));
BOOST_CHECK(optimizer.get() != NULL);
}
} // namespace
BOOST_AUTO_TEST_CASE(optimizer_type) {
BOOST_CHECK_EQUAL(OptimizerFactory::GetOptimizerType("powell"),
OptimizerFactory::POWELL);
BOOST_CHECK_EQUAL(OptimizerFactory::GetOptimizerType("random"),
OptimizerFactory::RANDOM);
BOOST_CHECK_EQUAL(OptimizerFactory::GetOptimizerType("random-direction"),
OptimizerFactory::RANDOM_DIRECTION);
}
BOOST_AUTO_TEST_CASE(optimizer_build) {
const unsigned dim = 3;
std::vector<unsigned> to_optimize;
to_optimize.push_back(1);
to_optimize.push_back(2);
to_optimize.push_back(3);
std::vector<parameter_t> start;
start.push_back(0.3);
start.push_back(0.1);
start.push_back(0.2);
const unsigned int num_random = 1;
CheckBuildOptimizer(dim, to_optimize, start, "powell", num_random);
CheckBuildOptimizer(dim, to_optimize, start, "random", num_random);
CheckBuildOptimizer(dim, to_optimize, start, "random-direction", num_random);
}

View File

@ -1,9 +1,7 @@
#ifndef MERT_PER_SCORER_H_
#define MERT_PER_SCORER_H_
#include <iostream>
#include <set>
#include <sstream>
#include <string>
#include <vector>
#include "Types.h"
@ -27,18 +25,9 @@ public:
virtual void setReferenceFiles(const vector<string>& referenceFiles);
virtual void prepareStats(size_t sid, const string& text, ScoreStats& entry);
virtual size_t NumberOfScores() const {
// cerr << "PerScorer: 3" << endl;
return 3;
}
virtual size_t NumberOfScores() const { return 3; }
virtual float calculateScore(const vector<int>& comps) const;
void whoami() const {
cerr << "I AM PerScorer" << std::endl;
}
private:
// no copying allowed
PerScorer(const PerScorer&);

View File

@ -8,41 +8,41 @@
using namespace std;
vector<unsigned> Point::optindices;
vector<unsigned> Point::m_opt_indices;
unsigned Point::dim = 0;
unsigned Point::m_dim = 0;
map<unsigned,statscore_t> Point::fixedweights;
map<unsigned,statscore_t> Point::m_fixed_weights;
unsigned Point::pdim = 0;
unsigned Point::ncall = 0;
unsigned Point::m_pdim = 0;
unsigned Point::m_ncall = 0;
vector<parameter_t> Point::m_min;
vector<parameter_t> Point::m_max;
Point::Point() : vector<parameter_t>(dim), score_(0.0) {}
Point::Point() : vector<parameter_t>(m_dim), m_score(0.0) {}
//Can initialize from a vector of dim or pdim
//Can initialize from a vector of dim or m_pdim
Point::Point(const vector<parameter_t>& init,
const vector<parameter_t>& min,
const vector<parameter_t>& max)
: vector<parameter_t>(Point::dim), score_(0.0)
: vector<parameter_t>(Point::m_dim), m_score(0.0)
{
m_min.resize(Point::dim);
m_max.resize(Point::dim);
if(init.size()==dim) {
for (unsigned int i=0; i<Point::dim; i++) {
operator[](i)=init[i];
m_min.resize(Point::m_dim);
m_max.resize(Point::m_dim);
if (init.size() == m_dim) {
for (unsigned int i = 0; i < Point::m_dim; i++) {
operator[](i) = init[i];
m_min[i] = min[i];
m_max[i] = max[i];
}
} else {
CHECK(init.size()==pdim);
CHECK(optindices.size() == Point::dim);
for (unsigned int i=0; i<Point::dim; i++) {
operator[](i)=init[optindices[i]];
m_min[i] = min[optindices[i]];
m_max[i] = max[optindices[i]];
CHECK(init.size() == m_pdim);
CHECK(m_opt_indices.size() == Point::m_dim);
for (unsigned int i = 0; i < Point::m_dim; i++) {
operator[](i) = init[m_opt_indices[i]];
m_min[i] = min[m_opt_indices[i]];
m_max[i] = max[m_opt_indices[i]];
}
}
}
@ -51,9 +51,9 @@ Point::~Point() {}
void Point::Randomize()
{
CHECK(m_min.size()==Point::dim);
CHECK(m_max.size()==Point::dim);
for (unsigned int i=0; i<size(); i++) {
CHECK(m_min.size() == Point::m_dim);
CHECK(m_max.size() == Point::m_dim);
for (unsigned int i = 0; i < size(); i++) {
operator[](i) = m_min[i] +
static_cast<float>(random()) / static_cast<float>(RAND_MAX) * (m_max[i] - m_min[i]);
}
@ -61,16 +61,17 @@ void Point::Randomize()
double Point::operator*(const FeatureStats& F) const
{
ncall++; // to track performance
double prod=0.0;
if(OptimizeAll())
m_ncall++; // to track performance
double prod = 0.0;
if (OptimizeAll())
for (unsigned i=0; i<size(); i++)
prod+= operator[](i)*F.get(i);
prod += operator[](i) * F.get(i);
else {
for (unsigned i=0; i<size(); i++)
prod+= operator[](i)*F.get(optindices[i]);
for(map<unsigned,float >::iterator it=fixedweights.begin(); it!=fixedweights.end(); it++)
prod+=it->second*F.get(it->first);
for (unsigned i = 0; i < size(); i++)
prod += operator[](i) * F.get(m_opt_indices[i]);
for(map<unsigned, float>::iterator it = m_fixed_weights.begin();
it != m_fixed_weights.end(); ++it)
prod += it->second * F.get(it->first);
}
return prod;
}
@ -83,7 +84,7 @@ Point Point::operator+(const Point& p2) const
Res[i] += p2[i];
}
Res.score_ = numeric_limits<statscore_t>::max();
Res.m_score = numeric_limits<statscore_t>::max();
return Res;
}
@ -93,7 +94,7 @@ void Point::operator+=(const Point& p2)
for (unsigned i = 0; i < size(); i++) {
operator[](i) += p2[i];
}
score_ = numeric_limits<statscore_t>::max();
m_score = numeric_limits<statscore_t>::max();
}
Point Point::operator*(float l) const
@ -102,14 +103,14 @@ Point Point::operator*(float l) const
for (unsigned i = 0; i < size(); i++) {
Res[i] *= l;
}
Res.score_ = numeric_limits<statscore_t>::max();
Res.m_score = numeric_limits<statscore_t>::max();
return Res;
}
ostream& operator<<(ostream& o, const Point& P)
{
vector<parameter_t> w = P.GetAllWeights();
for (unsigned int i = 0; i < Point::pdim; i++) {
for (unsigned int i = 0; i < Point::m_pdim; i++) {
o << w[i] << " ";
}
return o;
@ -118,24 +119,24 @@ ostream& operator<<(ostream& o, const Point& P)
void Point::NormalizeL2()
{
parameter_t norm=0.0;
for (unsigned int i=0; i<size(); i++)
norm+= operator[](i)*operator[](i);
if(norm!=0.0) {
norm=sqrt(norm);
for (unsigned int i=0; i<size(); i++)
operator[](i)/=norm;
for (unsigned int i = 0; i < size(); i++)
norm += operator[](i) * operator[](i);
if (norm != 0.0) {
norm = sqrt(norm);
for (unsigned int i = 0; i < size(); i++)
operator[](i) /= norm;
}
}
void Point::NormalizeL1()
{
parameter_t norm=0.0;
for (unsigned int i=0; i<size(); i++)
norm+= abs(operator[](i));
if(norm!=0.0) {
for (unsigned int i=0; i<size(); i++)
operator[](i)/=norm;
parameter_t norm = 0.0;
for (unsigned int i = 0; i < size(); i++)
norm += abs(operator[](i));
if (norm != 0.0) {
for (unsigned int i = 0; i < size(); i++)
operator[](i) /= norm;
}
}
@ -143,14 +144,16 @@ void Point::NormalizeL1()
vector<parameter_t> Point::GetAllWeights()const
{
vector<parameter_t> w;
if(OptimizeAll()) {
w=*this;
if (OptimizeAll()) {
w = *this;
} else {
w.resize(pdim);
for (unsigned int i=0; i<size(); i++)
w[optindices[i]]=operator[](i);
for(map<unsigned,float >::iterator it=fixedweights.begin(); it!=fixedweights.end(); it++)
w.resize(m_pdim);
for (unsigned int i = 0; i < size(); i++)
w[m_opt_indices[i]] = operator[](i);
for (map<unsigned, float>::iterator it = m_fixed_weights.begin();
it != m_fixed_weights.end(); ++it) {
w[it->first]=it->second;
}
}
return w;
}

View File

@ -1,7 +1,7 @@
#ifndef MERT_POINT_H_
#define MERT_POINT_H_
#include <fstream>
#include <ostream>
#include <map>
#include <vector>
#include "Types.h"
@ -16,61 +16,55 @@ class Optimizer;
class Point : public vector<parameter_t>
{
friend class Optimizer;
private:
/**
* The indices over which we optimize.
*/
static vector<unsigned int> optindices;
static vector<unsigned int> m_opt_indices;
/**
* Dimension of optindices and of the parent vector.
* Dimension of m_opt_indices and of the parent vector.
*/
static unsigned int dim;
static unsigned int m_dim;
/**
* Fixed weights in case of partial optimization.
*/
static map<unsigned int,parameter_t> fixedweights;
static map<unsigned int,parameter_t> m_fixed_weights;
/**
* Total size of the parameter space; we have
* pdim = FixedWeight.size() + optinidices.size().
* m_pdim = m_fixed_weights.size() + m_opt_indices.size().
*/
static unsigned int pdim;
static unsigned int ncall;
static unsigned int m_pdim;
static unsigned int m_ncall;
/**
* The limits for randomization, both vectors are of full length, pdim.
* The limits for randomization, both vectors are of full length, m_pdim.
*/
static vector<parameter_t> m_min;
static vector<parameter_t> m_max;
statscore_t score_;
statscore_t m_score;
public:
static unsigned int getdim() {
return dim;
}
static unsigned int getpdim() {
return pdim;
}
static void setpdim(size_t pd) {
pdim = pd;
}
static void setdim(size_t d) {
dim = d;
}
static unsigned int getdim() { return m_dim; }
static void setdim(size_t d) { m_dim = d; }
static unsigned int getpdim() { return m_pdim; }
static void setpdim(size_t pd) { m_pdim = pd; }
static void set_optindices(const vector<unsigned int>& indices) {
optindices = indices;
m_opt_indices = indices;
}
static const vector<unsigned int>& get_optindices() {
return optindices;
return m_opt_indices;
}
static bool OptimizeAll() {
return fixedweights.empty();
return m_fixed_weights.empty();
}
Point();
@ -88,7 +82,7 @@ public:
Point operator*(float) const;
/**
* Write the Whole featureweight to a stream (ie pdim float).
* Write the Whole featureweight to a stream (ie m_pdim float).
*/
friend ostream& operator<<(ostream& o,const Point& P);
@ -97,16 +91,13 @@ public:
void NormalizeL1();
/**
* Return a vector of size pdim where all weights have been
* Return a vector of size m_pdim where all weights have been
* put (including fixed ones).
*/
vector<parameter_t> GetAllWeights() const;
statscore_t GetScore() const {
return score_;
}
void SetScore(statscore_t score) { score_ = score; }
statscore_t GetScore() const { return m_score; }
void SetScore(statscore_t score) { m_score = score; }
};
#endif // MERT_POINT_H
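
For illustration (not from the commit): a minimal use of the renamed Point statics and the dot product with a FeatureStats. The dimensions and values are made up; in normal use the Optimizer constructor performs this static setup.

    #include <iostream>
    #include <vector>
    #include "FeatureStats.h"
    #include "Point.h"

    int main() {
      Point::setpdim(2);                    // full parameter space size
      Point::setdim(2);                     // ... and all of it is optimized here
      std::vector<parameter_t> init, lower, upper;
      init.push_back(0.6f);  lower.push_back(0.0f); upper.push_back(1.0f);
      init.push_back(0.4f);  lower.push_back(0.0f); upper.push_back(1.0f);
      Point p(init, lower, upper);
      p.NormalizeL1();                      // weights now sum to 1 (they already do)

      FeatureStats f(2);                    // two dense feature values, all zero here
      double model_score = p * f;           // sum_i p[i] * f.get(i) == 0.0
      std::cout << model_score << std::endl;
      return 0;
    }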

mert/Reference.h (new file, 80 lines)
View File

@ -0,0 +1,80 @@
#ifndef MERT_REFERENCE_H_
#define MERT_REFERENCE_H_
#include <algorithm>
#include <climits>
#include <vector>
#include "Ngram.h"
/**
* Reference class represents reference translations for an output
* translation used in calculating BLEU score.
*/
class Reference {
public:
// for m_length
typedef std::vector<size_t>::iterator iterator;
typedef std::vector<size_t>::const_iterator const_iterator;
Reference() : m_counts(new NgramCounts) { }
~Reference() { delete m_counts; }
NgramCounts* get_counts() { return m_counts; }
const NgramCounts* get_counts() const { return m_counts; }
iterator begin() { return m_length.begin(); }
const_iterator begin() const { return m_length.begin(); }
iterator end() { return m_length.end(); }
const_iterator end() const { return m_length.end(); }
void push_back(size_t len) { m_length.push_back(len); }
size_t num_references() const { return m_length.size(); }
int CalcAverage() const;
int CalcClosest(size_t length) const;
int CalcShortest() const;
private:
NgramCounts* m_counts;
// multiple reference lengths
std::vector<size_t> m_length;
};
inline int Reference::CalcAverage() const {
int total = 0;
for (size_t i = 0; i < m_length.size(); ++i) {
total += m_length[i];
}
return static_cast<int>(
static_cast<float>(total) / m_length.size());
}
inline int Reference::CalcClosest(size_t length) const {
int min_diff = INT_MAX;
int closest_ref_id = 0; // an index of the closest reference translation
for (size_t i = 0; i < m_length.size(); ++i) {
const int ref_length = m_length[i];
const int length_diff = abs(ref_length - static_cast<int>(length));
const int abs_min_diff = abs(min_diff);
// Look for the closest reference
if (length_diff < abs_min_diff) {
min_diff = ref_length - length;
closest_ref_id = i;
// if two references have the same closest length, take the shortest
} else if (length_diff == abs_min_diff) {
if (ref_length < static_cast<int>(m_length[closest_ref_id])) {
closest_ref_id = i;
}
}
}
return static_cast<int>(m_length[closest_ref_id]);
}
inline int Reference::CalcShortest() const {
return *std::min_element(m_length.begin(), m_length.end());
}
#endif // MERT_REFERENCE_H_
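
An illustrative sketch (not part of the commit) of filling a Reference and querying its length statistics; token ids and lengths are made up:

    #include <iostream>
    #include "Reference.h"

    int main() {
      Reference ref;
      NgramCounts::Key unigram;
      unigram.push_back(42);                 // hypothetical vocabulary id
      ref.get_counts()->Add(unigram);        // reference n-gram counts live here

      ref.push_back(12);                     // lengths of two reference translations
      ref.push_back(15);
      std::cout << ref.CalcClosest(14) << std::endl;   // 15, closest to a 14-token hypothesis
      std::cout << ref.CalcShortest() << std::endl;    // 12
      return 0;
    }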

mert/ReferenceTest.cpp (new file, 116 lines)
View File

@ -0,0 +1,116 @@
#include "Reference.h"
#define BOOST_TEST_MODULE MertReference
#include <boost/test/unit_test.hpp>
BOOST_AUTO_TEST_CASE(refernece_count) {
Reference ref;
BOOST_CHECK(ref.get_counts() != NULL);
}
BOOST_AUTO_TEST_CASE(refernece_length_iterator) {
Reference ref;
ref.push_back(4);
ref.push_back(2);
BOOST_REQUIRE(ref.num_references() == 2);
Reference::iterator it = ref.begin();
BOOST_CHECK_EQUAL(*it, 4);
++it;
BOOST_CHECK_EQUAL(*it, 2);
++it;
BOOST_CHECK(it == ref.end());
}
BOOST_AUTO_TEST_CASE(refernece_length_average) {
{
Reference ref;
ref.push_back(4);
ref.push_back(1);
BOOST_CHECK_EQUAL(2, ref.CalcAverage());
}
{
Reference ref;
ref.push_back(4);
ref.push_back(3);
BOOST_CHECK_EQUAL(3, ref.CalcAverage());
}
{
Reference ref;
ref.push_back(4);
ref.push_back(3);
ref.push_back(4);
ref.push_back(5);
BOOST_CHECK_EQUAL(4, ref.CalcAverage());
}
}
BOOST_AUTO_TEST_CASE(refernece_length_closest) {
{
Reference ref;
ref.push_back(4);
ref.push_back(1);
BOOST_REQUIRE(ref.num_references() == 2);
BOOST_CHECK_EQUAL(1, ref.CalcClosest(2));
BOOST_CHECK_EQUAL(1, ref.CalcClosest(1));
BOOST_CHECK_EQUAL(4, ref.CalcClosest(3));
BOOST_CHECK_EQUAL(4, ref.CalcClosest(4));
BOOST_CHECK_EQUAL(4, ref.CalcClosest(5));
}
{
Reference ref;
ref.push_back(4);
ref.push_back(3);
BOOST_REQUIRE(ref.num_references() == 2);
BOOST_CHECK_EQUAL(3, ref.CalcClosest(1));
BOOST_CHECK_EQUAL(3, ref.CalcClosest(2));
BOOST_CHECK_EQUAL(3, ref.CalcClosest(3));
BOOST_CHECK_EQUAL(4, ref.CalcClosest(4));
BOOST_CHECK_EQUAL(4, ref.CalcClosest(5));
}
{
Reference ref;
ref.push_back(4);
ref.push_back(3);
ref.push_back(4);
ref.push_back(5);
BOOST_REQUIRE(ref.num_references() == 4);
BOOST_CHECK_EQUAL(3, ref.CalcClosest(1));
BOOST_CHECK_EQUAL(3, ref.CalcClosest(2));
BOOST_CHECK_EQUAL(3, ref.CalcClosest(3));
BOOST_CHECK_EQUAL(4, ref.CalcClosest(4));
BOOST_CHECK_EQUAL(5, ref.CalcClosest(5));
}
}
BOOST_AUTO_TEST_CASE(refernece_length_shortest) {
{
Reference ref;
ref.push_back(4);
ref.push_back(1);
BOOST_CHECK_EQUAL(1, ref.CalcShortest());
}
{
Reference ref;
ref.push_back(4);
ref.push_back(3);
BOOST_CHECK_EQUAL(3, ref.CalcShortest());
}
{
Reference ref;
ref.push_back(4);
ref.push_back(3);
ref.push_back(4);
ref.push_back(5);
BOOST_CHECK_EQUAL(3, ref.CalcShortest());
}
}

View File

@ -12,39 +12,39 @@ class ScopedVector {
ScopedVector() {}
virtual ~ScopedVector() { reset(); }
bool empty() const { return vec_.empty(); }
bool empty() const { return m_vec.empty(); }
void push_back(T *e) { vec_.push_back(e); }
void push_back(T *e) { m_vec.push_back(e); }
void reset() {
for (iterator it = vec_.begin(); it != vec_.end(); ++it) {
for (iterator it = m_vec.begin(); it != m_vec.end(); ++it) {
delete *it;
}
vec_.clear();
m_vec.clear();
}
void reserve(size_t capacity) { vec_.reserve(capacity); }
void resize(size_t size) { vec_.resize(size); }
void reserve(size_t capacity) { m_vec.reserve(capacity); }
void resize(size_t size) { m_vec.resize(size); }
size_t size() const {return vec_.size(); }
size_t size() const {return m_vec.size(); }
iterator begin() { return vec_.begin(); }
const_iterator begin() const { return vec_.begin(); }
iterator begin() { return m_vec.begin(); }
const_iterator begin() const { return m_vec.begin(); }
iterator end() { return vec_.end(); }
const_iterator end() const { return vec_.end(); }
iterator end() { return m_vec.end(); }
const_iterator end() const { return m_vec.end(); }
std::vector<T*>& get() { return vec_; }
const std::vector<T*>& get() const { return vec_; }
std::vector<T*>& get() { return m_vec; }
const std::vector<T*>& get() const { return m_vec; }
std::vector<T*>* operator->() { return &vec_; }
const std::vector<T*>* operator->() const { return &vec_; }
std::vector<T*>* operator->() { return &m_vec; }
const std::vector<T*>* operator->() const { return &m_vec; }
T*& operator[](size_t i) { return vec_[i]; }
const T* operator[](size_t i) const { return vec_[i]; }
T*& operator[](size_t i) { return m_vec[i]; }
const T* operator[](size_t i) const { return m_vec[i]; }
private:
std::vector<T*> vec_;
std::vector<T*> m_vec;
// no copying allowed.
ScopedVector<T>(const ScopedVector<T>&);
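
A usage sketch of the renamed ScopedVector (illustrative only; the payload struct is a stand-in, not a mert type):

    #include <iostream>
    #include "ScopedVector.h"

    struct Payload {
      int value;
      explicit Payload(int v) : value(v) {}
    };

    int main() {
      ScopedVector<Payload> owned;
      owned.push_back(new Payload(1));       // the vector takes ownership
      owned.push_back(new Payload(2));
      std::cout << owned.size() << " elements, first = "
                << owned[0]->value << std::endl;
      owned.reset();                         // deletes both Payload objects
      std::cout << (owned.empty() ? "empty" : "not empty") << std::endl;
      return 0;
    }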

View File

@ -10,76 +10,85 @@
#include "Util.h"
#include "FileStream.h"
ScoreArray::ScoreArray()
: number_of_scores(0), idx("") {}
: m_num_scores(0), m_index("") {}
void ScoreArray::savetxt(std::ofstream& outFile, const std::string& sctype)
void ScoreArray::savetxt(ostream* os, const string& sctype)
{
outFile << SCORES_TXT_BEGIN << " " << idx << " " << array_.size()
<< " " << number_of_scores << " " << sctype << std::endl;
for (scorearray_t::iterator i = array_.begin(); i !=array_.end(); i++) {
i->savetxt(outFile);
outFile << std::endl;
*os << SCORES_TXT_BEGIN << " " << m_index << " " << m_array.size()
<< " " << m_num_scores << " " << sctype << endl;
for (scorearray_t::iterator i = m_array.begin(); i !=m_array.end(); i++) {
i->savetxt(os);
*os << endl;
}
outFile << SCORES_TXT_END << std::endl;
*os << SCORES_TXT_END << endl;
}
void ScoreArray::savebin(std::ofstream& outFile, const std::string& sctype)
void ScoreArray::savebin(ostream* os, const string& score_type)
{
outFile << SCORES_BIN_BEGIN << " " << idx << " " << array_.size()
<< " " << number_of_scores << " " << sctype << std::endl;
for (scorearray_t::iterator i = array_.begin(); i !=array_.end(); i++)
i->savebin(outFile);
outFile << SCORES_BIN_END << std::endl;
*os << SCORES_BIN_BEGIN << " " << m_index << " " << m_array.size()
<< " " << m_num_scores << " " << score_type << endl;
for (scorearray_t::iterator i = m_array.begin();
i != m_array.end(); i++) {
i->savebin(os);
}
*os << SCORES_BIN_END << endl;
}
void ScoreArray::save(std::ofstream& inFile, const std::string& sctype, bool bin)
void ScoreArray::save(ostream* os, const string& score_type, bool bin)
{
if (size()>0)
(bin)?savebin(inFile, sctype):savetxt(inFile, sctype);
if (size() <= 0) return;
if (bin) {
savebin(os, score_type);
} else {
savetxt(os, score_type);
}
}
void ScoreArray::save(const std::string &file, const std::string& sctype, bool bin)
void ScoreArray::save(const string &file, const string& score_type, bool bin)
{
std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file
save(outFile, sctype, bin);
outFile.close();
ofstream ofs(file.c_str(), ios::out);
if (!ofs) {
cerr << "Failed to open " << file << endl;
exit(1);
}
ostream* os = &ofs;
save(os, score_type, bin);
ofs.close();
}
void ScoreArray::loadbin(ifstream& inFile, size_t n)
{
ScoreStats entry(number_of_scores);
void ScoreArray::save(const string& score_type, bool bin) {
save(&cout, score_type, bin);
}
for (size_t i=0 ; i < n; i++) {
entry.loadbin(inFile);
void ScoreArray::loadbin(istream* is, size_t n)
{
ScoreStats entry(m_num_scores);
for (size_t i = 0; i < n; i++) {
entry.loadbin(is);
add(entry);
}
}
void ScoreArray::loadtxt(ifstream& inFile, size_t n)
void ScoreArray::loadtxt(istream* is, size_t n)
{
ScoreStats entry(number_of_scores);
for (size_t i=0 ; i < n; i++) {
entry.loadtxt(inFile);
ScoreStats entry(m_num_scores);
for (size_t i = 0; i < n; i++) {
entry.loadtxt(is);
add(entry);
}
}
void ScoreArray::load(ifstream& inFile)
void ScoreArray::load(istream* is)
{
size_t number_of_entries=0;
bool binmode=false;
size_t number_of_entries = 0;
bool binmode = false;
std::string substring, stringBuf;
std::string::size_type loc;
string substring, stringBuf;
string::size_type loc;
std::getline(inFile, stringBuf);
if (!inFile.good()) {
getline(*is, stringBuf);
if (!is->good()) {
return;
}
@ -94,35 +103,38 @@ void ScoreArray::load(ifstream& inFile)
}
getNextPound(stringBuf, substring);
getNextPound(stringBuf, substring);
idx = substring;
m_index = substring;
getNextPound(stringBuf, substring);
number_of_entries = atoi(substring.c_str());
getNextPound(stringBuf, substring);
number_of_scores = atoi(substring.c_str());
m_num_scores = atoi(substring.c_str());
getNextPound(stringBuf, substring);
score_type = substring;
m_score_type = substring;
}
(binmode)?loadbin(inFile, number_of_entries):loadtxt(inFile, number_of_entries);
if (binmode) {
loadbin(is, number_of_entries);
} else {
loadtxt(is, number_of_entries);
}
std::getline(inFile, stringBuf);
getline(*is, stringBuf);
if (!stringBuf.empty()) {
if ((loc = stringBuf.find(SCORES_TXT_END)) != 0 && (loc = stringBuf.find(SCORES_BIN_END)) != 0) {
if ((loc = stringBuf.find(SCORES_TXT_END)) != 0 &&
(loc = stringBuf.find(SCORES_BIN_END)) != 0) {
TRACE_ERR("ERROR: ScoreArray::load(): Wrong footer");
return;
}
}
}
void ScoreArray::load(const std::string &file)
void ScoreArray::load(const string &file)
{
TRACE_ERR("loading data from " << file << std::endl);
inputfilestream inFile(file); // matches a stream with a file. Opens the file
load((ifstream&) inFile);
inFile.close();
TRACE_ERR("loading data from " << file << endl);
inputfilestream input_stream(file); // matches a stream with a file. Opens the file
istream* is = &input_stream;
load(is);
input_stream.close();
}
@ -139,7 +151,8 @@ bool ScoreArray::check_consistency() const
if (sz == 0)
return true;
for (scorearray_t::const_iterator i = array_.begin(); i != array_.end(); ++i) {
for (scorearray_t::const_iterator i = m_array.begin();
i != m_array.end(); ++i) {
if (i->size() != sz)
return false;
}


@ -24,85 +24,62 @@ const char SCORES_BIN_END[] = "SCORES_BIN_END_0";
class ScoreArray
{
protected:
scorearray_t array_;
std::string score_type;
size_t number_of_scores;
private:
scorearray_t m_array;
std::string m_score_type;
size_t m_num_scores;
private:
// idx to identify the utterance.
// index to identify the utterance.
// It can differ from the index inside the vector.
std::string idx;
std::string m_index;
public:
ScoreArray();
~ScoreArray() {}
inline void clear() {
array_.clear();
}
void clear() { m_array.clear(); }
inline std::string getIndex() const {
return idx;
}
inline void setIndex(const std::string& value) {
idx=value;
}
std::string getIndex() const { return m_index; }
// inline ScoreStats get(size_t i){ return array_.at(i); }
void setIndex(const std::string& value) { m_index = value; }
inline ScoreStats& get(size_t i) {
return array_.at(i);
}
inline const ScoreStats& get(size_t i)const {
return array_.at(i);
}
ScoreStats& get(size_t i) { return m_array.at(i); }
void add(const ScoreStats& e) {
array_.push_back(e);
}
const ScoreStats& get(size_t i) const { return m_array.at(i); }
void add(const ScoreStats& e) { m_array.push_back(e); }
//ADDED BY TS
void swap(size_t i, size_t j) {
std::swap(array_[i],array_[j]);
std::swap(m_array[i], m_array[j]);
}
void resize(size_t new_size) {
array_.resize(std::min(new_size,array_.size()));
m_array.resize(std::min(new_size, m_array.size()));
}
//END_ADDED
void merge(ScoreArray& e);
inline std::string name() const {
return score_type;
}
std::string name() const { return m_score_type; }
inline void name(std::string &sctype) {
score_type = sctype;
}
void name(std::string &score_type) { m_score_type = score_type; }
inline size_t size() const {
return array_.size();
}
inline size_t NumberOfScores() const {
return number_of_scores;
}
inline void NumberOfScores(size_t v) {
number_of_scores = v;
}
size_t size() const { return m_array.size(); }
void savetxt(ofstream& outFile, const std::string& sctype);
void savebin(ofstream& outFile, const std::string& sctype);
void save(ofstream& outFile, const std::string& sctype, bool bin=false);
void save(const std::string &file, const std::string& sctype, bool bin=false);
inline void save(const std::string& sctype, bool bin=false) {
save("/dev/stdout", sctype, bin);
}
size_t NumberOfScores() const { return m_num_scores; }
void loadtxt(ifstream& inFile, size_t n);
void loadbin(ifstream& inFile, size_t n);
void load(ifstream& inFile);
void NumberOfScores(size_t v) { m_num_scores = v; }
void savetxt(std::ostream* os, const std::string& score_type);
void savebin(std::ostream* os, const std::string& score_type);
void save(std::ostream* os, const std::string& score_type, bool bin=false);
void save(const std::string &file, const std::string& score_type, bool bin=false);
void save(const std::string& score_type, bool bin=false);
void loadtxt(std::istream* is, size_t n);
void loadbin(std::istream* is, size_t n);
void load(std::istream* is);
void load(const std::string &file);
bool check_consistency() const;


@ -7,55 +7,56 @@
*/
#include "ScoreData.h"
#include <fstream>
#include "Scorer.h"
#include "Util.h"
#include "FileStream.h"
ScoreData::ScoreData(Scorer& ptr):
theScorer(&ptr)
ScoreData::ScoreData(Scorer* scorer) :
m_scorer(scorer)
{
score_type = theScorer->getName();
m_score_type = m_scorer->getName();
// This is not dangerous: we don't use the this pointer in SetScoreData.
theScorer->setScoreData(this);
number_of_scores = theScorer->NumberOfScores();
// TRACE_ERR("ScoreData: number_of_scores: " << number_of_scores << std::endl);
m_scorer->setScoreData(this);
m_num_scores = m_scorer->NumberOfScores();
// TRACE_ERR("ScoreData: m_num_scores: " << m_num_scores << std::endl);
}
void ScoreData::save(std::ofstream& outFile, bool bin)
void ScoreData::save(ostream* os, bool bin)
{
for (scoredata_t::iterator i = array_.begin(); i !=array_.end(); i++) {
i->save(outFile, score_type, bin);
for (scoredata_t::iterator i = m_array.begin();
i != m_array.end(); ++i) {
i->save(os, m_score_type, bin);
}
}
void ScoreData::save(const std::string &file, bool bin)
void ScoreData::save(const string &file, bool bin)
{
if (file.empty()) return;
TRACE_ERR("saving the array into " << file << std::endl);
TRACE_ERR("saving the array into " << file << endl);
// matches a stream with a file. Opens the file.
std::ofstream outFile(file.c_str(), std::ios::out);
ScoreStats entry;
save(outFile, bin);
outFile.close();
ofstream ofs(file.c_str(), ios::out);
ostream* os = &ofs;
save(os, bin);
ofs.close();
}
void ScoreData::load(ifstream& inFile)
void ScoreData::save(bool bin) {
save(&cout, bin);
}
void ScoreData::load(istream* is)
{
ScoreArray entry;
while (!inFile.eof()) {
if (!inFile.good()) {
std::cerr << "ERROR ScoreData::load inFile.good()" << std::endl;
while (!is->eof()) {
if (!is->good()) {
cerr << "ERROR ScoreData::load inFile.good()" << endl;
}
entry.clear();
entry.load(inFile);
entry.load(is);
if (entry.size() == 0) {
break;
}
@ -63,63 +64,58 @@ void ScoreData::load(ifstream& inFile)
}
}
void ScoreData::load(const std::string &file)
void ScoreData::load(const string &file)
{
TRACE_ERR("loading score data from " << file << std::endl);
inputfilestream inFile(file); // matches a stream with a file. Opens the file
if (!inFile) {
TRACE_ERR("loading score data from " << file << endl);
inputfilestream input_stream(file); // matches a stream with a file. Opens the file
if (!input_stream) {
throw runtime_error("Unable to open score file: " + file);
}
load((ifstream&) inFile);
inFile.close();
istream* is = &input_stream;
load(is);
input_stream.close();
}
void ScoreData::add(ScoreArray& e)
{
if (exists(e.getIndex())) { // array at position e.getIndex() already exists
//enlarge array at position e.getIndex()
size_t pos = getIndex(e.getIndex());
array_.at(pos).merge(e);
m_array.at(pos).merge(e);
} else {
array_.push_back(e);
m_array.push_back(e);
setIndex();
}
}
void ScoreData::add(const ScoreStats& e, const std::string& sent_idx)
void ScoreData::add(const ScoreStats& e, const string& sent_idx)
{
if (exists(sent_idx)) { // array at position e.getIndex() already exists
// Enlarge array at position e.getIndex()
size_t pos = getIndex(sent_idx);
// TRACE_ERR("Inserting in array " << sent_idx << std::endl);
array_.at(pos).add(e);
m_array.at(pos).add(e);
// TRACE_ERR("size: " << size() << " -> " << a.size() << std::endl);
} else {
// TRACE_ERR("Creating a new entry in the array" << std::endl);
ScoreArray a;
a.NumberOfScores(number_of_scores);
a.NumberOfScores(m_num_scores);
a.add(e);
a.setIndex(sent_idx);
size_t idx = array_.size();
array_.push_back(a);
idx2arrayname_[idx] = sent_idx;
arrayname2idx_[sent_idx]=idx;
size_t idx = m_array.size();
m_array.push_back(a);
m_index_to_array_name[idx] = sent_idx;
m_array_name_to_index[sent_idx]=idx;
// TRACE_ERR("size: " << size() << " -> " << a.size() << std::endl);
}
}
bool ScoreData::check_consistency() const
{
if (array_.size() == 0)
if (m_array.size() == 0)
return true;
for (scoredata_t::const_iterator i = array_.begin(); i != array_.end(); ++i)
for (scoredata_t::const_iterator i = m_array.begin(); i != m_array.end(); ++i)
if (!i->check_consistency()) return false;
return true;
@ -127,10 +123,10 @@ bool ScoreData::check_consistency() const
void ScoreData::setIndex()
{
size_t j=0;
for (scoredata_t::iterator i = array_.begin(); i !=array_.end(); i++) {
idx2arrayname_[j]=i->getIndex();
arrayname2idx_[i->getIndex()]=j;
size_t j = 0;
for (scoredata_t::iterator i = m_array.begin(); i != m_array.end(); ++i) {
m_index_to_array_name[j] = i->getIndex();
m_array_name_to_index[i->getIndex()]=j;
j++;
}
}


@ -9,9 +9,8 @@
#ifndef MERT_SCORE_DATA_H_
#define MERT_SCORE_DATA_H_
#include <fstream>
#include <vector>
#include <iostream>
#include <vector>
#include <stdexcept>
#include <string>
#include "ScoreArray.h"
@ -23,35 +22,34 @@ class Scorer;
class ScoreData
{
protected:
scoredata_t array_;
idx2name idx2arrayname_; // map from index to name of array
name2idx arrayname2idx_; // map from name to index of array
private:
// Do not allow the user to instantiate without arguments.
ScoreData() {}
Scorer* theScorer;
std::string score_type;
size_t number_of_scores;
scoredata_t m_array;
idx2name m_index_to_array_name; // map from index to name of array
name2idx m_array_name_to_index; // map from name to index of array
Scorer* m_scorer;
std::string m_score_type;
size_t m_num_scores;
public:
ScoreData(Scorer& sc);
ScoreData(Scorer* scorer);
~ScoreData() {}
inline void clear() {
array_.clear();
}
void clear() { m_array.clear(); }
inline ScoreArray get(const std::string& idx) {
return array_.at(getIndex(idx));
return m_array.at(getIndex(idx));
}
inline ScoreArray& get(size_t idx) {
return array_.at(idx);
return m_array.at(idx);
}
inline const ScoreArray& get(size_t idx) const {
return array_.at(idx);
return m_array.at(idx);
}
inline bool exists(const std::string& sent_idx) const {
@ -59,56 +57,51 @@ public:
}
inline bool exists(int sent_idx) const {
return (sent_idx > -1 && sent_idx < static_cast<int>(array_.size())) ? true : false;
return (sent_idx > -1 && sent_idx < static_cast<int>(m_array.size())) ? true : false;
}
inline ScoreStats& get(size_t i, size_t j) {
return array_.at(i).get(j);
}
inline const ScoreStats& get(size_t i, size_t j) const {
return array_.at(i).get(j);
return m_array.at(i).get(j);
}
inline std::string name() const {
return score_type;
inline const ScoreStats& get(size_t i, size_t j) const {
return m_array.at(i).get(j);
}
inline std::string name(const std::string &sctype) {
return score_type = sctype;
std::string name() const { return m_score_type; }
std::string name(const std::string &score_type) {
return m_score_type = score_type;
}
void add(ScoreArray& e);
void add(const ScoreStats& e, const std::string& sent_idx);
inline size_t NumberOfScores() const {
return number_of_scores;
}
inline size_t size() const {
return array_.size();
}
size_t NumberOfScores() const { return m_num_scores; }
size_t size() const { return m_array.size(); }
void save(const std::string &file, bool bin=false);
void save(ofstream& outFile, bool bin=false);
inline void save(bool bin=false) {
save("/dev/stdout", bin);
}
void save(std::ostream* os, bool bin=false);
void save(bool bin=false);
void load(ifstream& inFile);
void load(std::istream* is);
void load(const std::string &file);
bool check_consistency() const;
void setIndex();
inline int getIndex(const std::string& idx) const {
name2idx::const_iterator i = arrayname2idx_.find(idx);
if (i != arrayname2idx_.end())
name2idx::const_iterator i = m_array_name_to_index.find(idx);
if (i != m_array_name_to_index.end())
return i->second;
else
return -1;
}
inline std::string getIndex(size_t idx) const {
idx2name::const_iterator i = idx2arrayname_.find(idx);
if (i != idx2arrayname_.end())
idx2name::const_iterator i = m_index_to_array_name.find(idx);
if (i != m_index_to_array_name.end())
throw runtime_error("there is no entry at index " + idx);
return i->second;
}


@ -14,30 +14,30 @@ const int kAvailableSize = 8;
} // namespace
ScoreStats::ScoreStats()
: available_(kAvailableSize), entries_(0),
array_(new ScoreStatsType[available_]) {}
: m_available_size(kAvailableSize), m_entries(0),
m_array(new ScoreStatsType[m_available_size]) {}
ScoreStats::ScoreStats(const size_t size)
: available_(size), entries_(size),
array_(new ScoreStatsType[available_])
: m_available_size(size), m_entries(size),
m_array(new ScoreStatsType[m_available_size])
{
memset(array_, 0, GetArraySizeWithBytes());
memset(m_array, 0, GetArraySizeWithBytes());
}
ScoreStats::~ScoreStats()
{
if (array_) {
delete [] array_;
array_ = NULL;
if (m_array) {
delete [] m_array;
m_array = NULL;
}
}
void ScoreStats::Copy(const ScoreStats &stats)
{
available_ = stats.available();
entries_ = stats.size();
array_ = new ScoreStatsType[available_];
memcpy(array_, stats.getArray(), GetArraySizeWithBytes());
m_available_size = stats.available();
m_entries = stats.size();
m_array = new ScoreStatsType[m_available_size];
memcpy(m_array, stats.getArray(), GetArraySizeWithBytes());
}
ScoreStats::ScoreStats(const ScoreStats &stats)
@ -47,27 +47,27 @@ ScoreStats::ScoreStats(const ScoreStats &stats)
ScoreStats& ScoreStats::operator=(const ScoreStats &stats)
{
delete [] array_;
delete [] m_array;
Copy(stats);
return *this;
}
void ScoreStats::expand()
{
available_ *= 2;
scorestats_t buf = new ScoreStatsType[available_];
memcpy(buf, array_, GetArraySizeWithBytes());
delete [] array_;
array_ = buf;
m_available_size *= 2;
scorestats_t buf = new ScoreStatsType[m_available_size];
memcpy(buf, m_array, GetArraySizeWithBytes());
delete [] m_array;
m_array = buf;
}
void ScoreStats::add(ScoreStatsType v)
{
if (isfull()) expand();
array_[entries_++]=v;
m_array[m_entries++]=v;
}
void ScoreStats::set(const std::string& str)
void ScoreStats::set(const string& str)
{
reset();
vector<string> out;
@ -78,46 +78,51 @@ void ScoreStats::set(const std::string& str)
}
}
void ScoreStats::loadbin(std::ifstream& inFile)
void ScoreStats::loadbin(istream* is)
{
inFile.read((char*)array_, GetArraySizeWithBytes());
is->read(reinterpret_cast<char*>(m_array),
static_cast<streamsize>(GetArraySizeWithBytes()));
}
void ScoreStats::loadtxt(std::ifstream& inFile)
void ScoreStats::loadtxt(istream* is)
{
std::string theString;
std::getline(inFile, theString);
set(theString);
string line;
getline(*is, line);
set(line);
}
void ScoreStats::loadtxt(const std::string &file)
void ScoreStats::loadtxt(const string &file)
{
// TRACE_ERR("loading the stats from " << file << std::endl);
std::ifstream inFile(file.c_str(), std::ios::in); // matches a stream with a file. Opens the file
loadtxt(inFile);
ifstream ifs(file.c_str(), ios::in); // matches a stream with a file. Opens the file
if (!ifs) {
cerr << "Failed to open " << file << endl;
exit(1);
}
istream* is = &ifs;
loadtxt(is);
}
void ScoreStats::savetxt(const std::string &file)
void ScoreStats::savetxt(const string &file)
{
// TRACE_ERR("saving the stats into " << file << std::endl);
std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file
savetxt(outFile);
ofstream ofs(file.c_str(), ios::out); // matches a stream with a file. Opens the file
ostream* os = &ofs;
savetxt(os);
}
void ScoreStats::savetxt(std::ofstream& outFile)
void ScoreStats::savetxt(ostream* os)
{
outFile << *this;
*os << *this;
}
void ScoreStats::savebin(std::ofstream& outFile)
void ScoreStats::savetxt() {
savetxt(&cout);
}
void ScoreStats::savebin(ostream* os)
{
outFile.write((char*)array_, GetArraySizeWithBytes());
os->write(reinterpret_cast<char*>(m_array),
static_cast<streamsize>(GetArraySizeWithBytes()));
}
ostream& operator<<(ostream& o, const ScoreStats& e)


@ -22,11 +22,11 @@ using namespace std;
class ScoreStats
{
private:
size_t available_;
size_t entries_;
size_t m_available_size;
size_t m_entries;
// TODO: Use smart pointer for exception safety.
scorestats_t array_;
scorestats_t m_array;
public:
ScoreStats();
@ -40,31 +40,23 @@ public:
void Copy(const ScoreStats &stats);
bool isfull() const {
return (entries_ < available_) ? 0 : 1;
}
bool isfull() const { return (m_entries < m_available_size) ? 0 : 1; }
void expand();
void add(ScoreStatsType v);
void clear() {
memset((void*)array_, 0, GetArraySizeWithBytes());
memset((void*)m_array, 0, GetArraySizeWithBytes());
}
void reset() {
entries_ = 0;
m_entries = 0;
clear();
}
inline ScoreStatsType get(size_t i) {
return array_[i];
}
inline ScoreStatsType get(size_t i)const {
return array_[i];
}
inline scorestats_t getArray() const {
return array_;
}
ScoreStatsType get(size_t i) { return m_array[i]; }
ScoreStatsType get(size_t i) const { return m_array[i]; }
scorestats_t getArray() const { return m_array; }
void set(const std::string& str);
@ -76,31 +68,24 @@ public:
}
}
inline size_t bytes() const {
return GetArraySizeWithBytes();
}
size_t bytes() const { return GetArraySizeWithBytes(); }
size_t GetArraySizeWithBytes() const {
return entries_ * sizeof(ScoreStatsType);
return m_entries * sizeof(ScoreStatsType);
}
inline size_t size() const {
return entries_;
}
inline size_t available() const {
return available_;
}
size_t size() const { return m_entries; }
size_t available() const { return m_available_size; }
void savetxt(const std::string &file);
void savetxt(ofstream& outFile);
void savebin(ofstream& outFile);
inline void savetxt() {
savetxt("/dev/stdout");
}
void savetxt(ostream* os);
void savebin(ostream* os);
void savetxt();
void loadtxt(const std::string &file);
void loadtxt(ifstream& inFile);
void loadbin(ifstream& inFile);
void loadtxt(istream* is);
void loadbin(istream* is);
/**
* Write the whole object to a stream.


@ -1,6 +1,9 @@
#include "Scorer.h"
#include <limits>
#include "Vocabulary.h"
#include "Util.h"
#include "Singleton.h"
namespace {
@ -34,14 +37,14 @@ inline float score_average(const statscores_t& scores, size_t start, size_t end)
Scorer::Scorer(const string& name, const string& config)
: m_name(name),
m_encoder(new Encoder),
m_vocab(mert::VocabularyFactory::GetVocabulary()),
m_score_data(0),
m_enable_preserve_case(true) {
InitConfig(config);
}
Scorer::~Scorer() {
delete m_encoder;
Singleton<mert::Vocabulary>::Delete();
}
void Scorer::InitConfig(const string& config) {
@ -65,23 +68,6 @@ void Scorer::InitConfig(const string& config) {
}
}
Scorer::Encoder::Encoder() {}
Scorer::Encoder::~Encoder() {}
int Scorer::Encoder::Encode(const string& token) {
map<string, int>::iterator it = m_vocab.find(token);
int encoded_token;
if (it == m_vocab.end()) {
// Add a new entry to the vocabulary.
encoded_token = static_cast<int>(m_vocab.size());
m_vocab[token] = encoded_token;
} else {
encoded_token = it->second;
}
return encoded_token;
}
void Scorer::TokenizeAndEncode(const string& line, vector<int>& encoded) {
std::istringstream in(line);
std::string token;
@ -92,7 +78,7 @@ void Scorer::TokenizeAndEncode(const string& line, vector<int>& encoded) {
*it = tolower(*it);
}
}
encoded.push_back(m_encoder->Encode(token));
encoded.push_back(m_vocab->Encode(token));
}
}
@ -107,40 +93,40 @@ void Scorer::setFactors(const string& factors)
for(vector<string>::iterator it = factors_vec.begin(); it != factors_vec.end(); ++it)
{
int factor = atoi(it->c_str());
m_factors.push_back(factor);
m_factors.push_back(factor);
}
}
/**
* Take the factored sentence and return the desired factors
*/
string Scorer::applyFactors(const string& sentence)
string Scorer::applyFactors(const string& sentence) const
{
if (m_factors.size() == 0) return sentence;
vector<string> tokens;
split(sentence, ' ', tokens);
stringstream sstream;
stringstream sstream;
for (size_t i = 0; i < tokens.size(); ++i)
{
if (tokens[i] == "") continue;
if (tokens[i] == "") continue;
vector<string> factors;
split(tokens[i], '|', factors);
int fsize = factors.size();
if (i>0) sstream << " ";
if (i > 0) sstream << " ";
for (size_t j = 0; j < m_factors.size(); ++j)
{
int findex = m_factors[j];
if (findex < 0 || findex >= fsize) throw runtime_error("Factor index is out of range.");
if (j>0) sstream << "|";
if (j > 0) sstream << "|";
sstream << factors[findex];
}
}
}
return sstream.str();
}
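To make the factor selection concrete, a hypothetical example (sentence and indices chosen for illustration only, not taken from the commit): with m_factors = {0, 2}, the factored input

  houses|NNS|house are|VBP|be big|JJ|big

comes back as

  houses|house are|be big|big

and any configured index at or beyond a token's factor count triggers the "Factor index is out of range." exception above.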


@ -13,6 +13,12 @@ using namespace std;
class ScoreStats;
namespace mert {
class Vocabulary;
} // namespace mert
/**
* Superclass of all scorers and dummy implementation.
*
@ -105,24 +111,15 @@ class Scorer
/**
* Take the factored sentence and return the desired factors
*/
virtual string applyFactors(const string& sentence);
virtual string applyFactors(const string& sentence) const;
mert::Vocabulary* GetVocab() const { return m_vocab; }
private:
class Encoder {
public:
Encoder();
virtual ~Encoder();
int Encode(const std::string& token);
void Clear() { m_vocab.clear(); }
private:
std::map<std::string, int> m_vocab;
};
void InitConfig(const string& config);
string m_name;
Encoder* m_encoder;
mert::Vocabulary* m_vocab;
map<string, string> m_config;
vector<int> m_factors;
@ -144,14 +141,11 @@ class Scorer
/**
* Tokenise line and encode.
* Note: We assume that all tokens are separated by single spaces.
* Note: We assume that all tokens are separated by whitespace.
*/
void TokenizeAndEncode(const string& line, vector<int>& encoded);
void ClearEncoder() { m_encoder->Clear(); }
};
/**
* Abstract base class for Scorers that work by adding statistics across all
* output sentences, then apply some formula, e.g., BLEU, PER.

mert/Singleton.h Normal file

@ -0,0 +1,33 @@
#ifndef MERT_SINGLETON_H_
#define MERT_SINGLETON_H_
#include <cstdlib>
// thread *un*safe singleton.
// TODO: replace this with thread-safe singleton.
template <typename T>
class Singleton {
public:
static T* GetInstance() {
if (m_instance == NULL) {
m_instance = new T;
}
return m_instance;
}
static void Delete() {
if (m_instance) {
delete m_instance;
m_instance = NULL;
}
}
private:
Singleton();
static T* m_instance;
};
template <typename T>
T* Singleton<T>::m_instance = NULL;
#endif // MERT_SINGLETON_H_
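The TODO above asks for a thread-safe replacement. One candidate direction, sketched here under the assumption of a C++11 (or later) compiler, is a function-local static, whose initialization the standard guarantees to be thread-safe; note it drops the explicit Delete() that the test below relies on, so it is not a drop-in substitute:

  // Sketch of an alternative, not part of this commit.
  template <typename T>
  class StaticSingleton {
   public:
    static T& GetInstance() {
      static T instance;   // initialized once, thread-safely, in C++11 and later
      return instance;
    }
   private:
    StaticSingleton();     // no instances of the wrapper itself
  };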

mert/SingletonTest.cpp Normal file

@ -0,0 +1,27 @@
#include "Singleton.h"
#define BOOST_TEST_MODULE MertSingleton
#include <boost/test/unit_test.hpp>
namespace {
static int g_count = 0;
class Instance {
public:
Instance() { ++g_count; }
~Instance() {}
};
} // namespace
BOOST_AUTO_TEST_CASE(singleton_basic) {
Instance* instance1 = Singleton<Instance>::GetInstance();
Instance* instance2 = Singleton<Instance>::GetInstance();
Instance* instance3 = Singleton<Instance>::GetInstance();
BOOST_REQUIRE(instance1 == instance2);
BOOST_REQUIRE(instance2 == instance3);
BOOST_CHECK_EQUAL(1, g_count);
Singleton<Instance>::Delete();
}

mert/Vocabulary.cpp Normal file

@ -0,0 +1,21 @@
#include "Vocabulary.h"
#include "Singleton.h"
namespace mert {
namespace {
Vocabulary* g_vocab = NULL;
} // namespace
Vocabulary* VocabularyFactory::GetVocabulary() {
if (g_vocab == NULL) {
return Singleton<Vocabulary>::GetInstance();
} else {
return g_vocab;
}
}
void VocabularyFactory::SetVocabulary(Vocabulary* vocab) {
g_vocab = vocab;
}
} // namespace mert

mert/Vocabulary.h Normal file

@ -0,0 +1,79 @@
#ifndef MERT_VOCABULARY_H_
#define MERT_VOCABULARY_H_
#include <map>
#include <string>
namespace mert {
/**
* An embarrassingly simple map to handle vocabularies to calculate
* various scores such as BLEU.
*
* TODO: replace this with more efficient data structure.
*/
class Vocabulary {
public:
typedef std::map<std::string, int>::iterator iterator;
typedef std::map<std::string, int>::const_iterator const_iterator;
Vocabulary() {}
virtual ~Vocabulary() {}
/** Returns the assigned id for the given "token". */
int Encode(const std::string& token) {
iterator it = m_vocab.find(token);
int encoded_token;
if (it == m_vocab.end()) {
// Add a new entry to the vocabulary.
encoded_token = static_cast<int>(m_vocab.size());
m_vocab[token] = encoded_token;
} else {
encoded_token = it->second;
}
return encoded_token;
}
/**
* Return true iff the specified "str" is found in the container.
*/
bool Lookup(const std::string& str, int* v) const {
const_iterator it = m_vocab.find(str);
if (it == m_vocab.end()) return false;
*v = it->second;
return true;
}
void clear() { m_vocab.clear(); }
bool empty() const { return m_vocab.empty(); }
size_t size() const { return m_vocab.size(); }
iterator find(const std::string& str) { return m_vocab.find(str); }
const_iterator find(const std::string& str) const { return m_vocab.find(str); }
int& operator[](const std::string& str) { return m_vocab[str]; }
iterator begin() { return m_vocab.begin(); }
const_iterator begin() const { return m_vocab.begin(); }
iterator end() { return m_vocab.end(); }
const_iterator end() const { return m_vocab.end(); }
private:
std::map<std::string, int> m_vocab;
};
class VocabularyFactory {
public:
static Vocabulary* GetVocabulary();
static void SetVocabulary(Vocabulary* vocab);
private:
VocabularyFactory() {}
virtual ~VocabularyFactory() {}
};
} // namespace mert
#endif // MERT_VOCABULARY_H_

mert/VocabularyTest.cpp Normal file

@ -0,0 +1,52 @@
#include "Vocabulary.h"
#define BOOST_TEST_MODULE MertVocabulary
#include <boost/test/unit_test.hpp>
#include "Singleton.h"
namespace mert {
namespace {
void TearDown() {
Singleton<Vocabulary>::Delete();
}
} // namespace
BOOST_AUTO_TEST_CASE(vocab_basic) {
Vocabulary vocab;
BOOST_REQUIRE(vocab.empty());
vocab.clear();
BOOST_CHECK_EQUAL(0, vocab.Encode("hello"));
BOOST_CHECK_EQUAL(0, vocab.Encode("hello"));
BOOST_CHECK_EQUAL(1, vocab.Encode("world"));
BOOST_CHECK_EQUAL(2, vocab.size());
int v;
BOOST_CHECK(vocab.Lookup("hello", &v));
BOOST_CHECK_EQUAL(0, v);
BOOST_CHECK(vocab.Lookup("world", &v));
BOOST_CHECK_EQUAL(1, v);
BOOST_CHECK(!vocab.Lookup("java", &v));
vocab.clear();
BOOST_CHECK(!vocab.Lookup("hello", &v));
BOOST_CHECK(!vocab.Lookup("world", &v));
}
BOOST_AUTO_TEST_CASE(vocab_factory_test) {
Vocabulary* vocab1 = VocabularyFactory::GetVocabulary();
Vocabulary* vocab2 = VocabularyFactory::GetVocabulary();
Vocabulary* vocab3 = VocabularyFactory::GetVocabulary();
BOOST_REQUIRE(vocab1 != NULL);
BOOST_CHECK(vocab1 == vocab2);
BOOST_CHECK(vocab2 == vocab3);
TearDown();
}
} // namespace mert


@ -55,7 +55,7 @@ void EvaluatorUtil::evaluate(const string& candFile, int bootstrap)
for (int i = 0; i < bootstrap; ++i)
{
// TODO: Use smart pointer for exception safety.
ScoreData* scoredata = new ScoreData(*g_scorer);
ScoreData* scoredata = new ScoreData(g_scorer);
for (int j = 0; j < n; ++j)
{
int randomIndex = random() % n;
@ -89,7 +89,7 @@ void EvaluatorUtil::evaluate(const string& candFile, int bootstrap)
else
{
// TODO: Use smart pointer for exception safety.
ScoreData* scoredata = new ScoreData(*g_scorer);
ScoreData* scoredata = new ScoreData(g_scorer);
for (int sid = 0; sid < n; ++sid)
{
string str_sid = int2string(sid);


@ -197,7 +197,7 @@ int main(int argc, char** argv)
PrintUserTime("References loaded");
Data data(*scorer);
Data data(scorer.get());
// load old data
for (size_t i = 0; i < prevScoreDataFiles.size(); i++) {
@ -208,13 +208,13 @@ int main(int argc, char** argv)
// computing score statistics of each nbest file
for (size_t i = 0; i < nbestFiles.size(); i++) {
data.loadnbest(nbestFiles.at(i));
data.loadNBest(nbestFiles.at(i));
}
PrintUserTime("Nbest entries loaded and scored");
//ADDED_BY_TS
data.remove_duplicates();
data.removeDuplicates();
//END_ADDED
data.save(option.featureDataFile, option.scoreDataFile, option.binmode);

mert/mert.cpp Executable file → Normal file

@ -20,6 +20,7 @@
#include "ScoreData.h"
#include "FeatureData.h"
#include "Optimizer.h"
#include "OptimizerFactory.h"
#include "Types.h"
#include "Timer.h"
#include "Util.h"
@ -338,7 +339,7 @@ int main(int argc, char **argv)
ScorerFactory::getScorer(option.scorer_type, option.scorer_config));
//load data
Data data(*scorer);
Data data(scorer.get());
for (size_t i = 0; i < ScoreDataFiles.size(); i++) {
cerr<<"Loading Data from: "<< ScoreDataFiles.at(i) << " and " << FeatureDataFiles.at(i) << endl;
@ -348,7 +349,7 @@ int main(int argc, char **argv)
scorer->setScoreData(data.getScoreData().get());
//ADDED_BY_TS
data.remove_duplicates();
data.removeDuplicates();
//END_ADDED
PrintUserTime("Data loaded");
@ -434,7 +435,7 @@ int main(int argc, char **argv)
vector<OptimizationTask*>& tasks = allTasks[i];
Optimizer *optimizer = OptimizerFactory::BuildOptimizer(option.pdim, to_optimize, start_list[0], option.optimize_type, option.nrandom);
optimizer->SetScorer(data_ref.getScorer());
optimizer->SetFData(data_ref.getFeatureData());
optimizer->SetFeatureData(data_ref.getFeatureData());
// A task for each start point
for (size_t j = 0; j < startingPoints.size(); ++j) {
OptimizationTask* task = new OptimizationTask(optimizer, startingPoints[j]);


@ -21,8 +21,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
/**
* This is part of the PRO implementation. It converts the features and scores
/**
* This is part of the PRO implementation. It converts the features and scores
* files into a form suitable for input into the megam maxent trainer.
*
* For details of PRO, refer to Hopkins & May (EMNLP 2011)
@ -34,9 +34,11 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <iostream>
#include <string>
#include <vector>
#include <utility>
#include <boost/program_options.hpp>
#include "BleuScorer.h"
#include "FeatureDataIterator.h"
#include "ScoreDataIterator.h"
@ -46,49 +48,49 @@ namespace po = boost::program_options;
class SampledPair {
private:
pair<size_t,size_t> translation1;
pair<size_t,size_t> translation2;
float scoreDiff;
pair<size_t,size_t> m_translation1;
pair<size_t,size_t> m_translation2;
float m_score_diff;
public:
SampledPair(const pair<size_t,size_t>& t1, const pair<size_t,size_t>& t2, float diff ) {
if (diff > 0) {
translation1 = t1;
translation2 = t2;
scoreDiff = diff;
}
else {
translation1 = t2;
translation2 = t1;
scoreDiff = -diff;
}
}
float getDiff() const { return scoreDiff; }
const pair<size_t,size_t>& getTranslation1() const { return translation1; }
const pair<size_t,size_t>& getTranslation2() const { return translation2; }
SampledPair(const pair<size_t,size_t>& t1, const pair<size_t,size_t>& t2, float diff ) {
if (diff > 0) {
m_translation1 = t1;
m_translation2 = t2;
m_score_diff = diff;
} else {
m_translation1 = t2;
m_translation2 = t1;
m_score_diff = -diff;
}
}
float getDiff() const { return m_score_diff; }
const pair<size_t,size_t>& getTranslation1() const { return m_translation1; }
const pair<size_t,size_t>& getTranslation2() const { return m_translation2; }
};
static float sentenceLevelBleuPlusOne(const vector<float>& stats) {
float logbleu = 0.0;
const unsigned int bleu_order = 4;
for (unsigned int j=0; j<bleu_order; j++) {
//cerr << (stats.get(2*j)+1) << "/" << (stats.get(2*j+1)+1) << " ";
logbleu += log(stats[2*j]+1) - log(stats[2*j+1]+1);
}
logbleu /= bleu_order;
const float brevity = 1.0 - static_cast<float>(stats[(bleu_order*2)]) / stats[1];
if (brevity < 0.0) {
logbleu += brevity;
}
//cerr << brevity << " -> " << exp(logbleu) << endl;
return exp(logbleu);
float logbleu = 0.0;
for (unsigned int j=0; j<kBleuNgramOrder; j++) {
//cerr << (stats.get(2*j)+1) << "/" << (stats.get(2*j+1)+1) << " ";
logbleu += log(stats[2*j]+1) - log(stats[2*j+1]+1);
}
logbleu /= kBleuNgramOrder;
const float brevity = 1.0 - static_cast<float>(stats[(kBleuNgramOrder * 2)]) / stats[1];
if (brevity < 0.0) {
logbleu += brevity;
}
//cerr << brevity << " -> " << exp(logbleu) << endl;
return exp(logbleu);
}
static void outputSample(ostream& out, const FeatureDataItem& f1, const FeatureDataItem& f2) {
// difference in score in regular features
for(unsigned int j=0; j<f1.dense.size(); j++)
if (abs(f1.dense[j]-f2.dense[j]) > 0.00001)
out << " F" << j << " " << (f1.dense[j]-f2.dense[j]);
for(unsigned int j=0; j<f1.dense.size(); j++)
if (abs(f1.dense[j]-f2.dense[j]) > 0.00001)
out << " F" << j << " " << (f1.dense[j]-f2.dense[j]);
if (f1.sparse.size() || f2.sparse.size()) {
out << " ";
@ -101,27 +103,27 @@ static void outputSample(ostream& out, const FeatureDataItem& f1, const FeatureD
}
}
int main(int argc, char** argv)
int main(int argc, char** argv)
{
bool help;
vector<string> scoreFiles;
vector<string> featureFiles;
int seed;
string outputFile;
//TODO: options
const unsigned int n_candidates = 5000; // Gamma, in Hopkins & May
const unsigned int n_samples = 50; // Xi, in Hopkins & May
const float min_diff = 0.05;
// TODO: Add these constants to options
const unsigned int n_candidates = 5000; // Gamma, in Hopkins & May
const unsigned int n_samples = 50; // Xi, in Hopkins & May
const float min_diff = 0.05;
po::options_description desc("Allowed options");
desc.add_options()
("help,h", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
("scfile,S", po::value<vector<string> >(&scoreFiles), "Scorer data files")
("ffile,F", po::value<vector<string> > (&featureFiles), "Feature data files")
("random-seed,r", po::value<int>(&seed), "Seed for random number generation")
("output-file,o", po::value<string>(&outputFile), "Output file")
;
("help,h", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
("scfile,S", po::value<vector<string> >(&scoreFiles), "Scorer data files")
("ffile,F", po::value<vector<string> > (&featureFiles), "Feature data files")
("random-seed,r", po::value<int>(&seed), "Seed for random number generation")
("output-file,o", po::value<string>(&outputFile), "Output file")
;
po::options_description cmdline_options;
cmdline_options.add(desc);
@ -134,7 +136,7 @@ int main(int argc, char** argv)
cout << desc << endl;
exit(0);
}
if (vm.count("random-seed")) {
cerr << "Initialising random seed to " << seed << endl;
srand(seed);
@ -167,7 +169,7 @@ int main(int argc, char** argv)
out = &cout;
}
vector<FeatureDataIterator> featureDataIters;
vector<ScoreDataIterator> scoreDataIters;
for (size_t i = 0; i < featureFiles.size(); ++i) {
@ -179,7 +181,7 @@ int main(int argc, char** argv)
size_t sentenceId = 0;
while(1) {
vector<pair<size_t,size_t> > hypotheses;
//TODO: de-duping. Collect hashes of score,feature pairs and
//TODO: de-duping. Collect hashes of score,feature pairs and
//only add index if it's unique.
if (featureDataIters[0] == FeatureDataIterator::end()) {
break;
@ -214,7 +216,7 @@ int main(int argc, char** argv)
size_t rand2 = rand() % n_translations;
pair<size_t,size_t> translation2 = hypotheses[rand2];
float bleu2 = sentenceLevelBleuPlusOne(scoreDataIters[translation2.first]->operator[](translation2.second));
/*
cerr << "t(" << translation1.first << "," << translation1.second << ") = " << bleu1 <<
" t(" << translation2.first << "," << translation2.second << ") = " <<
@ -222,7 +224,7 @@ int main(int argc, char** argv)
*/
if (abs(bleu1-bleu2) < min_diff)
continue;
samples.push_back(SampledPair(translation1, translation2, bleu1-bleu2));
scores.push_back(1.0-abs(bleu1-bleu2));
}
@ -261,4 +263,3 @@ int main(int argc, char** argv)
outFile.close();
}


@ -42,7 +42,6 @@ GlobalLexicalModel::~GlobalLexicalModel()
}
delete iter->first; // delete output word
}
// if (m_cache != NULL) delete m_cache;
}
void GlobalLexicalModel::LoadData(const string &filePath,
@ -153,7 +152,7 @@ float GlobalLexicalModel::ScorePhrase( const TargetPhrase& targetPhrase ) const
float GlobalLexicalModel::GetFromCacheOrScorePhrase( const TargetPhrase& targetPhrase ) const
{
LexiconCache& m_cache = m_local->cache;
map< const TargetPhrase*, float >::const_iterator query = m_cache.find( &targetPhrase );
const LexiconCache::const_iterator query = m_cache.find( &targetPhrase );
if ( query != m_cache.end() ) {
return query->second;
}


@ -1034,14 +1034,13 @@ sub execute_steps {
}
elsif (! -e &versionize(&step_file($i)).".DONE") {
my $step = &versionize(&step_file($i));
print "\texecuting $step via ";
&define_step($i);
&write_info($i);
# cluster job submission
if ($CLUSTER && ! &is_qsub_script($i)) {
$DO{$i}++;
print "qsub\n";
print "\texecuting $step via qsub ($active active)\n";
my $qsub_args = &get_qsub_args($DO_STEP[$i]);
`qsub $qsub_args -e $step.STDERR -o $step.STDOUT $step`;
}
@ -1050,16 +1049,13 @@ sub execute_steps {
elsif ($CLUSTER || $active < $MAX_ACTIVE) {
$active++;
$DO{$i}++;
print "sh ($active active)\n";
print "\texecuting $step via sh ($active active)\n";
sleep(5);
if (!fork) {
`sh $step >$step.STDOUT 2> $step.STDERR`;
exit;
}
}
else {
print " --- on hold\n";
}
}
}
@ -1853,6 +1849,9 @@ sub define_training_create_config {
$cmd .= "-lm $factor:$order:$lm_file:$type ";
}
my $additional_ini = &get("TRAINING:additional-ini");
$cmd .= "-additional-ini '$additional_ini' " if defined($additional_ini);
&create_step($step_id,$cmd);
}
@ -2185,6 +2184,7 @@ sub define_evaluation_decode {
my $nbest = &backoff_and_get("EVALUATION:$set:nbest");
my $moses_parallel = &backoff_and_get("EVALUATION:$set:moses-parallel");
my $report_segmentation = &backoff_and_get("EVALUATION:$set:report-segmentation");
my $analyze_search_graph = &backoff_and_get("EVALUATION:$set:analyze-search-graph");
my $report_precision_by_coverage = &backoff_and_get("EVALUATION:$set:report-precision-by-coverage");
my $hierarchical = &get("TRAINING:hierarchical-rule-set");
@ -2193,6 +2193,9 @@ sub define_evaluation_decode {
$settings .= " -use-alignment-info -alignment-output-file $system_output.wa";
$report_segmentation = "yes";
}
if (defined($analyze_search_graph) && $analyze_search_graph eq "yes") {
$settings .= " -unpruned-search-graph -osg $system_output.graph";
}
if (defined($report_segmentation) && $report_segmentation eq "yes") {
if ($hierarchical) {
$settings .= " -T $system_output.trace";
@ -2237,12 +2240,17 @@ sub define_evaluation_analysis {
$output,$reference,$input) = &get_output_and_input($step_id);
my $script = &backoff_and_get("EVALUATION:$set:analysis");
my $report_segmentation = &backoff_and_get("EVALUATION:$set:report-segmentation");
my $analyze_search_graph = &backoff_and_get("EVALUATION:$set:analyze-search-graph");
my $cmd = "$script -system $output -reference $reference -input $input -dir $analysis";
if (defined($report_segmentation) && $report_segmentation eq "yes") {
my $segmentation_file = &get_default_file("EVALUATION",$set,"decode");
$cmd .= " -segmentation $segmentation_file";
}
if (defined($analyze_search_graph) && $analyze_search_graph eq "yes") {
my $search_graph_file = &get_default_file("EVALUATION",$set,"decode");
$cmd .= " -search-graph $search_graph_file.graph";
}
if (&get("TRAINING:hierarchical-rule-set")) {
$cmd .= " -hierarchical";
}


@ -110,7 +110,7 @@ print STDERR "\n=== BUILDING FINAL LM ===\n\n";
sub interpolate {
my ($name,@LM) = @_;
die("cannot interpolate more than 10 language models at once.")
die("cannot interpolate more than 10 language models at once: ",join(",",@LM))
if scalar(@LM) > 10;
my $tmp = tempdir(DIR=>$TEMPDIR);


@ -17,12 +17,17 @@ close(ORDER);
# get from sgm file which lines belong to which system
my %DOC;
my $system_from_refset = 0;
my ($doc,$system);
open(REF,$ref);
while(<REF>) {
if (/<refset/ && /refid="([^\"]+)"/i) {
$system = $1;
$system_from_refset = 1;
}
if (/<doc/i) {
die unless /sysid="([^\"]+)"/i;
$system = $1;
die unless /sysid="([^\"]+)"/i || $system_from_refset;
$system = $1 unless $system_from_refset;
die unless /docid="([^\"]+)"/i;
$doc = $1;
}


@ -18,8 +18,9 @@ while(<SRC>) {
elsif (/^<\/srcset/) {
s/<\/srcset/<\/tstset/;
}
elsif (/^<DOC/i) {
s/<DOC/<DOC sysid="$system"/i;
elsif (/^<doc/i) {
s/ *sysid="[^\"]+"//;
s/<doc/<doc sysid="$system"/i;
}
elsif (/<seg/) {
my $line = shift(@OUT);


@ -101,13 +101,14 @@ if ($numParallel > 1)
print STDERR $extractCmd;
print STDERR $extractInvCmd;
print STDERR $extractOrderingCmd;
`$extractCmd`;
`$extractInvCmd`;
systemCheck($extractCmd);
systemCheck($extractInvCmd);
my $numStr = NumStr(0);
if (-e "$TMPDIR/extract.$numStr.o")
{
`$extractOrderingCmd`;
systemCheck($extractOrderingCmd);
}
}
else
@ -130,6 +131,15 @@ print STDERR $cmd;
print STDERR "Finished ".localtime() ."\n";
sub systemCheck($)
{
my $cmd = shift;
my $retVal = system($cmd);
if ($retVal != 0)
{
exit(1);
}
}
sub NumStr($)
{


@ -63,8 +63,13 @@ sub detokenize {
my($text) = @_;
chomp($text);
$text = " $text ";
$text =~ s/ \@\-\@ /-/g;
$text =~ s/ \@\-\@ /-/g;
# de-escape special chars
$text =~ s/\&bar;/\|/g;
$text =~ s/\&lt;/\</g;
$text =~ s/\&gt;/\>/g;
$text =~ s/\&amp;/\&/g;
my $word;
my $i;
my @words = split(/ /,$text);


@ -18,6 +18,7 @@ my $language = "en";
my $QUIET = 0;
my $HELP = 0;
my $AGGRESSIVE = 0;
my $SKIP_XML = 0;
#my $start = [ Time::HiRes::gettimeofday( ) ];
@ -27,6 +28,7 @@ while (@ARGV) {
/^-l$/ && ($language = shift, next);
/^-q$/ && ($QUIET = 1, next);
/^-h$/ && ($HELP = 1, next);
/^-x$/ && ($SKIP_XML = 1, next);
/^-a$/ && ($AGGRESSIVE = 1, next);
}
@ -50,7 +52,7 @@ if (scalar(%NONBREAKING_PREFIX) eq 0){
}
while(<STDIN>) {
if (/^<.+>$/ || /^\s*$/) {
if (($SKIP_XML && /^<.+>$/) || /^\s*$/) {
#don't try to tokenize XML/HTML tag lines
print $_;
}
@ -141,7 +143,13 @@ sub tokenize {
$text =~ s/DOTDOTMULTI/DOTMULTI./g;
}
$text =~ s/DOTMULTI/./g;
#escape special chars
$text =~ s/\&/\&amp;/g;
$text =~ s/\|/\&bar;/g;
$text =~ s/\</\&lt;/g;
$text =~ s/\>/\&gt;/g;
#ensure final line break
$text .= "\n" unless $text =~ /\n$/;
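For clarity, the escaping added in this hunk rewrites four characters, in this order (the ampersand first, so entities produced by the later substitutions are not re-escaped):

  &  ->  &amp;
  |  ->  &bar;
  <  ->  &lt;
  >  ->  &gt;

The detokenizer change earlier in this commit applies the inverse mapping, restoring the original characters on output.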


@ -404,6 +404,9 @@ if (-e $ref_abs) {
else {
# if multiple file, get a full list of the files
my $part = 0;
if (! -e $ref_abs."0" && -e $ref_abs.".ref0") {
$ref_abs .= ".ref";
}
while (-e $ref_abs.$part) {
push @references, $ref_abs.$part;
$part++;


@ -38,6 +38,10 @@
1E2C902D141FDED400EA06A6 /* InputFileStream.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EB8A211129C024C00041956 /* InputFileStream.cpp */; };
1E2C902E141FDF6D00EA06A6 /* tables-core.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1CE8CE4B0FC6EAA200924FEA /* tables-core.cpp */; };
1EB1C8321200D5C00079FCBB /* PhraseAlignment.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EB1C8311200D5C00079FCBB /* PhraseAlignment.cpp */; };
1EB29A3B1511C253005BC4BA /* InputFileStream.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EB8A211129C024C00041956 /* InputFileStream.cpp */; };
1EB29A3C1511C253005BC4BA /* InputFileStream.h in Sources */ = {isa = PBXBuildFile; fileRef = 1EB8A210129C024C00041956 /* InputFileStream.h */; };
1EB29A3E1511C2D9005BC4BA /* InputFileStream.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EB8A211129C024C00041956 /* InputFileStream.cpp */; };
1EB29A3F1511C2D9005BC4BA /* InputFileStream.h in Sources */ = {isa = PBXBuildFile; fileRef = 1EB8A210129C024C00041956 /* InputFileStream.h */; };
1EB8A212129C024C00041956 /* InputFileStream.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EB8A211129C024C00041956 /* InputFileStream.cpp */; };
1EB8A261129C04C700041956 /* InputFileStream.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EB8A211129C024C00041956 /* InputFileStream.cpp */; };
1EB8A297129C06A300041956 /* gzfilebuf.h in Sources */ = {isa = PBXBuildFile; fileRef = 1EB8A20C129C022000041956 /* gzfilebuf.h */; };
@ -354,6 +358,8 @@
isa = PBXSourcesBuildPhase;
buildActionMask = 2147483647;
files = (
1EB29A3B1511C253005BC4BA /* InputFileStream.cpp in Sources */,
1EB29A3C1511C253005BC4BA /* InputFileStream.h in Sources */,
1C05BA281174CF10003585B2 /* extract-rules.cpp in Sources */,
1C05BA251174CF03003585B2 /* Hole.h in Sources */,
1C05BA261174CF03003585B2 /* HoleCollection.cpp in Sources */,
@ -376,6 +382,8 @@
isa = PBXSourcesBuildPhase;
buildActionMask = 2147483647;
files = (
1EB29A3E1511C2D9005BC4BA /* InputFileStream.cpp in Sources */,
1EB29A3F1511C2D9005BC4BA /* InputFileStream.h in Sources */,
1C05BA381174CFAD003585B2 /* tables-core.cpp in Sources */,
1C05BA391174CFAD003585B2 /* tables-core.h in Sources */,
1C05BA351174CF98003585B2 /* AlignmentPhrase.cpp in Sources */,
@ -444,6 +452,7 @@
GCC_MODEL_TUNING = G5;
GCC_OPTIMIZATION_LEVEL = 0;
INSTALL_PATH = /usr/local/bin;
OTHER_LDFLAGS = "-lz";
PREBINDING = NO;
PRODUCT_NAME = "extract-rules";
SDKROOT = macosx10.6;
@ -461,6 +470,7 @@
GCC_MODEL_TUNING = G5;
INSTALL_PATH = /usr/local/bin;
ONLY_ACTIVE_ARCH = YES;
OTHER_LDFLAGS = "-lz";
PREBINDING = NO;
PRODUCT_NAME = "extract-rules";
SDKROOT = macosx10.6;
@ -479,6 +489,7 @@
GCC_MODEL_TUNING = G5;
GCC_OPTIMIZATION_LEVEL = 0;
INSTALL_PATH = /usr/local/bin;
OTHER_LDFLAGS = "-lz";
PREBINDING = NO;
PRODUCT_NAME = statistics;
SDKROOT = macosx10.6;
@ -496,6 +507,7 @@
GCC_MODEL_TUNING = G5;
INSTALL_PATH = /usr/local/bin;
ONLY_ACTIVE_ARCH = YES;
OTHER_LDFLAGS = "-lz";
PREBINDING = NO;
PRODUCT_NAME = statistics;
SDKROOT = macosx10.6;


@ -40,7 +40,7 @@ my($_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_DIR, $_TEMP_DIR, $_
my $debug = 0; # debug this script, do not delete any files in debug mode
# the following line is set installation time by 'make release'. BEWARE!
my $BINDIR="/Users/hieuhoang/workspace/bin";
my $BINDIR="/Users/hieuhoang/workspace/bin/";
$_HELP = 1
unless &GetOptions('root-dir=s' => \$_ROOT_DIR,
@ -1490,10 +1490,10 @@ sub score_phrase_phrase_extract {
# sorting
print STDERR "(6.".($substep++).") sorting $direction @ ".`date`;
if (-e "$extract_filename.gz") {
safesystem("gunzip < $extract_filename.gz | LC_ALL=C sort $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR > $extract_filename.sorted") or die("ERROR");
safesystem("gunzip < $extract_filename.gz | LC_ALL=C $SORT_EXEC $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR > $extract_filename.sorted") or die("ERROR");
}
else {
safesystem("LC_ALL=C sort $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR $extract_filename > $extract_filename.sorted") or die("ERROR");
safesystem("LC_ALL=C $SORT_EXEC $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR $extract_filename > $extract_filename.sorted") or die("ERROR");
}
}
@ -1515,7 +1515,7 @@ sub score_phrase_phrase_extract {
# sorting inverse phrase-table-half to sync up with regular one
if ($direction eq "e2f" && ! ($___CONTINUE && -e "$ttable_file.half.e2f.sorted")) {
print STDERR "(6." . ($substep++) . ") sorting inverse e2f table@ ".`date`;
safesystem("LC_ALL=C sort $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR $ttable_file.half.e2f > $ttable_file.half.e2f.sorted") or die("ERROR");
safesystem("LC_ALL=C $SORT_EXEC $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR $ttable_file.half.e2f > $ttable_file.half.e2f.sorted") or die("ERROR");
if (! $debug) { safesystem("rm -f $ttable_file.half.e2f") or die("ERROR"); }
}
@ -1570,7 +1570,7 @@ sub score_phrase_memscore {
# The output is sorted to avoid breaking scripts that rely on the
# sorting behaviour of the previous scoring algorithm.
my $cmd = "$MEMSCORE $options | LC_ALL=C sort $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR | gzip >$ttable_file.gz";
my $cmd = "$MEMSCORE $options | LC_ALL=C $SORT_EXEC $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR | gzip >$ttable_file.gz";
if (-e "$extract_file.gz") {
$cmd = "$ZCAT $extract_file.gz | ".$cmd;
} else {
@ -1626,10 +1626,10 @@ sub get_reordering_factored {
sub get_reordering {
my ($extract_file,$reo_model_path) = @_;
if (-e "$extract_file.o.gz") {
safesystem("gunzip < $extract_file.o.gz | LC_ALL=C sort $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR > $extract_file.o.sorted") or die("ERROR");
safesystem("gunzip < $extract_file.o.gz | LC_ALL=C $SORT_EXEC $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR > $extract_file.o.sorted") or die("ERROR");
}
else {
safesystem("LC_ALL=C sort -T $___TEMP_DIR $extract_file.o > $extract_file.o.sorted") or die("ERROR");
safesystem("LC_ALL=C $SORT_EXEC -T $___TEMP_DIR $extract_file.o > $extract_file.o.sorted") or die("ERROR");
}
my $smooth = $___REORDERING_SMOOTH;


@ -6,8 +6,8 @@ my ($size,$in,$out) = @ARGV;
open(IN,$in);
open(OUT,">$out");
binmode(IN, ":utf8");
binmode(OUT, ":utf8");
binmode(IN, ":UTF8");
binmode(OUT, ":UTF8");
while(<IN>) {
my $first = 1;


@ -42,6 +42,16 @@ int OpenReadOrThrow(const char *name) {
return ret;
}
int CreateOrThrow(const char *name) {
int ret;
#if defined(_WIN32) || defined(_WIN64)
UTIL_THROW_IF(-1 == (ret = _open(name, _O_CREAT | _O_TRUNC | _O_RDWR, _S_IREAD | _S_IWRITE)), ErrnoException, "while creating " << name);
#else
UTIL_THROW_IF(-1 == (ret = open(name, O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)), ErrnoException, "while creating " << name);
#endif
return ret;
}
uint64_t SizeFile(int fd) {
#if defined(_WIN32) || defined(_WIN64)
__int64 ret = _filelengthi64(fd);


@ -65,7 +65,10 @@ class scoped_FILE {
std::FILE *file_;
};
// Open for read only.
int OpenReadOrThrow(const char *name);
// Create file if it doesn't exist, truncate if it does. Opened for write.
int CreateOrThrow(const char *name);
// Return value for SizeFile when it can't size properly.
const uint64_t kBadSize = (uint64_t)-1;


@ -10,6 +10,7 @@ Code given out at the 1985 UNIFORUM conference in Dallas.
#include "getopt.hh"
#include <stdio.h>
#include <string.h>
#define NULL 0
#define EOF (-1)
@ -74,4 +75,4 @@ char **argv, *opts;
return(c);
}
#endif /* __GNUC__ */
#endif /* __GNUC__ */


@ -170,20 +170,6 @@ void *MapZeroedWrite(int fd, std::size_t size) {
return MapOrThrow(size, true, kFileFlags, false, fd, 0);
}
namespace {
int CreateOrThrow(const char *name) {
int ret;
#if defined(_WIN32) || defined(_WIN64)
UTIL_THROW_IF(-1 == (ret = _open(name, _O_CREAT | _O_TRUNC | _O_RDWR, _S_IREAD | _S_IWRITE)), ErrnoException, "while creating " << name);
#else
UTIL_THROW_IF(-1 == (ret = open(name, O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)), ErrnoException, "while creating " << name);
#endif
return ret;
}
} // namespace
void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file) {
file.reset(CreateOrThrow(name));
try {