sentence-bleu-nbest

This commit is contained in:
Matthias Huck 2015-04-30 19:44:29 +01:00
parent e98a2fc980
commit 34d1d3a904
9 changed files with 130 additions and 93 deletions

View File

@ -45,14 +45,14 @@ BleuScorer::BleuScorer(const string& config)
} else if (reflen == REFLEN_CLOSEST) {
m_ref_length_type = CLOSEST;
} else {
throw runtime_error("Unknown reference length strategy: " + reflen);
UTIL_THROW2("Unknown reference length strategy: " + reflen);
}
}
BleuScorer::~BleuScorer() {}
size_t BleuScorer::CountNgrams(const string& line, NgramCounts& counts,
unsigned int n, bool is_testing)
unsigned int n, bool is_testing) const
{
assert(n > 0);
vector<int> encoded_tokens;
@ -94,25 +94,18 @@ void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles)
mert::VocabularyFactory::GetVocabulary()->clear();
//load reference data
for (size_t i = 0; i < referenceFiles.size(); ++i) {
for (size_t i = 0; i < referenceFiles.size(); ++i)
{
TRACE_ERR("Loading reference from " << referenceFiles[i] << endl);
if (!OpenReference(referenceFiles[i].c_str(), i)) {
throw runtime_error("Unable to open " + referenceFiles[i]);
ifstream ifs(referenceFiles[i].c_str());
UTIL_THROW_IF2(!ifs, "Cannot open " << referenceFiles[i]);
if (!OpenReferenceStream(&ifs, i)) {
UTIL_THROW2("Unable to open " + referenceFiles[i]);
}
}
}
// Open the reference file named `filename` and hand its contents to
// OpenReferenceStream for parsing under reference set `file_id`.
// Reports on stderr and returns false when the file cannot be opened.
bool BleuScorer::OpenReference(const char* filename, size_t file_id)
{
  ifstream ifs(filename);
  if (ifs) {
    return OpenReferenceStream(&ifs, file_id);
  }
  cerr << "Cannot open " << filename << endl;
  return false;
}
bool BleuScorer::OpenReferenceStream(istream* is, size_t file_id)
{
if (is == NULL) return false;
@ -120,15 +113,27 @@ bool BleuScorer::OpenReferenceStream(istream* is, size_t file_id)
string line;
size_t sid = 0;
while (getline(*is, line)) {
// TODO: rather than loading the whole reference corpus into memory, can we stream it line by line?
// (loading the whole reference corpus can take gigabytes of RAM if done with millions of sentences)
line = preprocessSentence(line);
if (file_id == 0) {
Reference* ref = new Reference;
m_references.push_back(ref); // Take ownership of the Reference object.
}
if (m_references.size() <= sid) {
cerr << "Reference " << file_id << "has too many sentences." << endl;
return false;
UTIL_THROW_IF2(m_references.size() <= sid, "Reference " << file_id << "has too many sentences.");
ProcessReferenceLine(line, m_references[sid]);
if (sid > 0 && sid % 100 == 0) {
TRACE_ERR(".");
}
++sid;
}
return true;
}
void BleuScorer::ProcessReferenceLine(const std::string& line, Reference* ref) const
{
NgramCounts counts;
size_t length = CountNgrams(line, counts, kBleuNgramOrder);
@ -138,35 +143,30 @@ bool BleuScorer::OpenReferenceStream(istream* is, size_t file_id)
const NgramCounts::Value newcount = ci->second;
NgramCounts::Value oldcount = 0;
m_references[sid]->get_counts()->Lookup(ngram, &oldcount);
ref->get_counts()->Lookup(ngram, &oldcount);
if (newcount > oldcount) {
m_references[sid]->get_counts()->operator[](ngram) = newcount;
ref->get_counts()->operator[](ngram) = newcount;
}
}
//add in the length
m_references[sid]->push_back(length);
if (sid > 0 && sid % 100 == 0) {
TRACE_ERR(".");
}
++sid;
}
return true;
ref->push_back(length);
}
void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
{
if (sid >= m_references.size()) {
stringstream msg;
msg << "Sentence id (" << sid << ") not found in reference set";
throw runtime_error(msg.str());
}
UTIL_THROW_IF2(sid >= m_references.size(), "Sentence id (" << sid << ") not found in reference set");
CalcBleuStats(m_references[sid], text, entry);
}
void BleuScorer::CalcBleuStats(const Reference* ref, const std::string& text, ScoreStats& entry) const
{
NgramCounts testcounts;
// stats for this line
vector<ScoreStatsType> stats(kBleuNgramOrder * 2);
string sentence = preprocessSentence(text);
const size_t length = CountNgrams(sentence, testcounts, kBleuNgramOrder, true);
const int reference_len = CalcReferenceLength(sid, length);
const int reference_len = CalcReferenceLength(ref, length);
stats.push_back(reference_len);
//precision on each ngram type
@ -177,7 +177,7 @@ void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
NgramCounts::Value correct = 0;
NgramCounts::Value v = 0;
if (m_references[sid]->get_counts()->Lookup(testcounts_it->first, &v)) {
if (ref->get_counts()->Lookup(testcounts_it->first, &v)) {
correct = min(v, guess);
}
stats[len * 2 - 2] += correct;
@ -207,21 +207,20 @@ statscore_t BleuScorer::calculateScore(const vector<ScoreStatsType>& comps) cons
return exp(logbleu);
}
int BleuScorer::CalcReferenceLength(size_t sentence_id, size_t length)
int BleuScorer::CalcReferenceLength(const Reference* ref, std::size_t length) const
{
switch (m_ref_length_type) {
case AVERAGE:
return m_references[sentence_id]->CalcAverage();
return ref->CalcAverage();
break;
case CLOSEST:
return m_references[sentence_id]->CalcClosest(length);
return ref->CalcClosest(length);
break;
case SHORTEST:
return m_references[sentence_id]->CalcShortest();
return ref->CalcShortest();
break;
default:
cerr << "unknown reference types." << endl;
exit(1);
UTIL_THROW2("Unknown reference types");
}
}
@ -298,29 +297,23 @@ vector<float> BleuScorer::ScoreNbestList(const string& scoreFile, const string&
vector<FeatureDataIterator> featureDataIters;
vector<ScoreDataIterator> scoreDataIters;
for (size_t i = 0; i < featureFiles.size(); ++i) {
for (size_t i = 0; i < featureFiles.size(); ++i)
{
featureDataIters.push_back(FeatureDataIterator(featureFiles[i]));
scoreDataIters.push_back(ScoreDataIterator(scoreFiles[i]));
}
vector<pair<size_t,size_t> > hypotheses;
if (featureDataIters[0] == FeatureDataIterator::end()) {
cerr << "Error: at the end of feature data iterator" << endl;
exit(1);
}
for (size_t i = 0; i < featureFiles.size(); ++i) {
if (featureDataIters[i] == FeatureDataIterator::end()) {
cerr << "Error: Feature file " << i << " ended prematurely" << endl;
exit(1);
}
if (scoreDataIters[i] == ScoreDataIterator::end()) {
cerr << "Error: Score file " << i << " ended prematurely" << endl;
exit(1);
}
if (featureDataIters[i]->size() != scoreDataIters[i]->size()) {
cerr << "Error: features and scores have different size" << endl;
exit(1);
}
UTIL_THROW_IF2(featureDataIters[0] == FeatureDataIterator::end(),
"At the end of feature data iterator");
for (size_t i = 0; i < featureFiles.size(); ++i)
{
UTIL_THROW_IF2(featureDataIters[i] == FeatureDataIterator::end(),
"Feature file " << i << " ended prematurely");
UTIL_THROW_IF2(scoreDataIters[i] == ScoreDataIterator::end(),
"Score file " << i << " ended prematurely");
UTIL_THROW_IF2(featureDataIters[i]->size() != scoreDataIters[i]->size(),
"Features and scores have different size");
for (size_t j = 0; j < featureDataIters[i]->size(); ++j) {
hypotheses.push_back(pair<size_t,size_t>(i,j));
}

View File

@ -42,11 +42,14 @@ public:
return 2 * kBleuNgramOrder + 1;
}
int CalcReferenceLength(std::size_t sentence_id, std::size_t length);
void CalcBleuStats(const Reference* ref, const std::string& text, ScoreStats& entry) const;
int CalcReferenceLength(const Reference* ref, std::size_t length) const;
ReferenceLengthType GetReferenceLengthType() const {
return m_ref_length_type;
}
void SetReferenceLengthType(ReferenceLengthType type) {
m_ref_length_type = type;
}
@ -62,14 +65,14 @@ public:
/**
* Count the ngrams of each type, up to the given length in the input line.
*/
std::size_t CountNgrams(const std::string& line, NgramCounts& counts, unsigned int n, bool is_testing=false);
std::size_t CountNgrams(const std::string& line, NgramCounts& counts, unsigned int n, bool is_testing=false) const;
void DumpCounts(std::ostream* os, const NgramCounts& counts) const;
bool OpenReference(const char* filename, std::size_t file_id);
// NOTE: this function is used for unit testing.
virtual bool OpenReferenceStream(std::istream* is, std::size_t file_id);
bool OpenReferenceStream(std::istream* is, std::size_t file_id);
void ProcessReferenceLine(const std::string& line, Reference* ref) const;
//private:
protected:

View File

@ -66,11 +66,13 @@ exe evaluator : evaluator.cpp mert_lib ;
exe sentence-bleu : sentence-bleu.cpp mert_lib ;
exe sentence-bleu-nbest : sentence-bleu-nbest.cpp mert_lib ;
exe pro : pro.cpp mert_lib ..//boost_program_options ;
exe kbmira : kbmira.cpp mert_lib ..//boost_program_options ..//boost_filesystem ;
alias programs : mert extractor evaluator pro kbmira sentence-bleu ;
alias programs : mert extractor evaluator pro kbmira sentence-bleu sentence-bleu-nbest ;
unit-test bleu_scorer_test : BleuScorerTest.cpp mert_lib ..//boost_unit_test_framework ;
unit-test feature_data_test : FeatureDataTest.cpp mert_lib ..//boost_unit_test_framework ;

View File

@ -64,7 +64,7 @@ void Scorer::InitConfig(const string& config)
}
}
void Scorer::TokenizeAndEncode(const string& line, vector<int>& encoded)
void Scorer::TokenizeAndEncode(const string& line, vector<int>& encoded) const
{
for (util::TokenIter<util::AnyCharacter, true> it(line, util::AnyCharacter(" "));
it; ++it) {
@ -81,7 +81,7 @@ void Scorer::TokenizeAndEncode(const string& line, vector<int>& encoded)
}
}
void Scorer::TokenizeAndEncodeTesting(const string& line, vector<int>& encoded)
void Scorer::TokenizeAndEncodeTesting(const string& line, vector<int>& encoded) const
{
for (util::TokenIter<util::AnyCharacter, true> it(line, util::AnyCharacter(" "));
it; ++it) {

View File

@ -187,12 +187,12 @@ protected:
* Tokenise line and encode.
* Note: We assume that all tokens are separated by whitespaces.
*/
void TokenizeAndEncode(const std::string& line, std::vector<int>& encoded);
void TokenizeAndEncode(const std::string& line, std::vector<int>& encoded) const;
/*
* Tokenize functions for testing only.
*/
void TokenizeAndEncodeTesting(const std::string& line, std::vector<int>& encoded);
void TokenizeAndEncodeTesting(const std::string& line, std::vector<int>& encoded) const;
/**
* Every inherited scorer should call this function for each sentence

View File

@ -0,0 +1,44 @@
#include <iostream>
#include <vector>
#include <string>
#include "BleuScorer.h"
#include "moses/Util.h"
using namespace MosesTuning;
// Read a Moses n-best list ("sid ||| hypothesis ||| ...") from stdin and
// print one smoothed sentence-level BLEU score per hypothesis, computed
// against the reference files given on the command line.
// Returns 1 (with a usage message) when no reference file is supplied.
int main(int argc, char **argv)
{
  if (argc == 1) {
    std::cerr << "Usage: ./sentence-bleu-nbest ref1 [ref2 ...] < plain-nbest > bleu-scores" << std::endl;
    return 1;
  }

  std::vector<std::string> refFiles(argv + 1, argv + argc);

  // TODO all of these are empty for now
  std::string config;
  std::string factors;
  std::string filter;

  BleuScorer scorer(config);
  scorer.setFactors(factors);
  scorer.setFilter(filter);

  scorer.setReferenceFiles(refFiles); // TODO: we don't need to load the whole reference corpus into memory (this can take gigabytes of RAM if done with millions of sentences)

  // Loading sentences and preparing statistics
  std::string nbestLine;
  while ( getline(std::cin, nbestLine) )
  {
    std::vector<std::string> items;
    Moses::TokenizeMultiCharSeparator(items, nbestLine, " ||| ");
    // Guard against malformed input: indexing items[0]/items[1] below is
    // undefined behavior unless the line has at least "sid ||| hypothesis".
    if (items.size() < 2) {
      std::cerr << "Warning: skipping malformed n-best line: " << nbestLine << std::endl;
      continue;
    }
    size_t sid = Moses::Scan<size_t>(items[0]);

    ScoreStats scoreStats;
    scorer.prepareStats(sid, items[1], scoreStats);

    std::vector<float> stats(scoreStats.getArray(), scoreStats.getArray() + scoreStats.size());
    std::cout << smoothedSentenceBleu(stats) << std::endl;
  }

  return 0;
}

View File

@ -23,22 +23,19 @@ int main(int argc, char **argv)
BleuScorer scorer(config);
scorer.setFactors(factors);
scorer.setFilter(filter);
scorer.setReferenceFiles(refFiles);
vector<ScoreStats> entries;
scorer.setReferenceFiles(refFiles); // TODO: we don't need to load the whole reference corpus into memory (this can take gigabytes of RAM if done with millions of sentences)
// Loading sentences and preparing statistics
ScoreStats scoreentry;
string line;
while (getline(cin, line)) {
scorer.prepareStats(entries.size(), line, scoreentry);
entries.push_back(scoreentry);
string hypothesisLine;
size_t sid = 0;
while (getline(std::cin, hypothesisLine))
{
ScoreStats scoreStats;
scorer.prepareStats(sid, hypothesisLine, scoreStats);
vector<float> stats(scoreStats.getArray(), scoreStats.getArray() + scoreStats.size());
std::cout << smoothedSentenceBleu(stats) << std::endl;
++sid;
}
vector<ScoreStats>::const_iterator sentIt;
for (sentIt = entries.begin(); sentIt != entries.end(); sentIt++) {
vector<float> stats(sentIt->getArray(), sentIt->getArray() + sentIt->size());
cout << smoothedSentenceBleu(stats) << "\n";
}
return 0;
}

View File

@ -90,13 +90,6 @@ bool FileExists(const std::string& filePath)
return !ifs.fail();
}
// Return a copy of `str` with any leading and trailing characters that
// occur in `dropChars` removed; a string consisting entirely of such
// characters trims down to the empty string.
const std::string Trim(const std::string& str, const std::string dropChars)
{
  const std::string::size_type first = str.find_first_not_of(dropChars);
  if (first == std::string::npos) {
    return std::string();
  }
  const std::string::size_type last = str.find_last_not_of(dropChars);
  return str.substr(first, last - first + 1);
}
void ResetUserTime()
{
g_timer.start();

View File

@ -19,8 +19,7 @@ License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#ifndef moses_Util_h
#define moses_Util_h
#pragma once
#include <iostream>
#include <fstream>
@ -89,10 +88,17 @@ namespace Moses
#define NTH_ELEMENT4(begin, middle, end, orderer) std::nth_element(begin, middle, end, orderer)
#endif
//! delete white spaces at beginning and end of string
const std::string Trim(const std::string& str, const std::string dropChars = " \t\n\r");
const std::string ToLower(const std::string& str);
//! Return a copy of `str` with leading and trailing characters from
//! `dropChars` (common whitespace by default) removed. A string made up
//! entirely of such characters trims to the empty string.
// NOTE(review): dropChars is now taken by const reference — the by-value
// parameter copied a std::string on every call of this header-inline
// utility for no benefit; callers are unaffected.
inline std::string Trim(const std::string& str, const std::string& dropChars = " \t\n\r")
{
  std::string res = str;
  // Drop the tail first, then the head (use `res` consistently; the old
  // body mixed `str` and `res`, which only worked because res == str here).
  res.erase(res.find_last_not_of(dropChars) + 1);
  return res.erase(0, res.find_first_not_of(dropChars));
}
//! get string representation of any object/variable, as long as it can pipe to a stream
template<typename T>
inline std::string SPrint(const T &input)
@ -533,4 +539,3 @@ void ShowWeights();
} // namespace
#endif