mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-10-05 15:58:03 +03:00
sentence-bleu-nbest
This commit is contained in:
parent
e98a2fc980
commit
34d1d3a904
@ -45,14 +45,14 @@ BleuScorer::BleuScorer(const string& config)
|
||||
} else if (reflen == REFLEN_CLOSEST) {
|
||||
m_ref_length_type = CLOSEST;
|
||||
} else {
|
||||
throw runtime_error("Unknown reference length strategy: " + reflen);
|
||||
UTIL_THROW2("Unknown reference length strategy: " + reflen);
|
||||
}
|
||||
}
|
||||
|
||||
BleuScorer::~BleuScorer() {}
|
||||
|
||||
size_t BleuScorer::CountNgrams(const string& line, NgramCounts& counts,
|
||||
unsigned int n, bool is_testing)
|
||||
unsigned int n, bool is_testing) const
|
||||
{
|
||||
assert(n > 0);
|
||||
vector<int> encoded_tokens;
|
||||
@ -94,25 +94,18 @@ void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles)
|
||||
mert::VocabularyFactory::GetVocabulary()->clear();
|
||||
|
||||
//load reference data
|
||||
for (size_t i = 0; i < referenceFiles.size(); ++i) {
|
||||
for (size_t i = 0; i < referenceFiles.size(); ++i)
|
||||
{
|
||||
TRACE_ERR("Loading reference from " << referenceFiles[i] << endl);
|
||||
|
||||
if (!OpenReference(referenceFiles[i].c_str(), i)) {
|
||||
throw runtime_error("Unable to open " + referenceFiles[i]);
|
||||
ifstream ifs(referenceFiles[i].c_str());
|
||||
UTIL_THROW_IF2(!ifs, "Cannot open " << referenceFiles[i]);
|
||||
if (!OpenReferenceStream(&ifs, i)) {
|
||||
UTIL_THROW2("Unable to open " + referenceFiles[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool BleuScorer::OpenReference(const char* filename, size_t file_id)
|
||||
{
|
||||
ifstream ifs(filename);
|
||||
if (!ifs) {
|
||||
cerr << "Cannot open " << filename << endl;
|
||||
return false;
|
||||
}
|
||||
return OpenReferenceStream(&ifs, file_id);
|
||||
}
|
||||
|
||||
bool BleuScorer::OpenReferenceStream(istream* is, size_t file_id)
|
||||
{
|
||||
if (is == NULL) return false;
|
||||
@ -120,15 +113,27 @@ bool BleuScorer::OpenReferenceStream(istream* is, size_t file_id)
|
||||
string line;
|
||||
size_t sid = 0;
|
||||
while (getline(*is, line)) {
|
||||
// TODO: rather than loading the whole reference corpus into memory, can we stream it line by line?
|
||||
// (loading the whole reference corpus can take gigabytes of RAM if done with millions of sentences)
|
||||
line = preprocessSentence(line);
|
||||
if (file_id == 0) {
|
||||
Reference* ref = new Reference;
|
||||
m_references.push_back(ref); // Take ownership of the Reference object.
|
||||
}
|
||||
if (m_references.size() <= sid) {
|
||||
cerr << "Reference " << file_id << "has too many sentences." << endl;
|
||||
return false;
|
||||
UTIL_THROW_IF2(m_references.size() <= sid, "Reference " << file_id << "has too many sentences.");
|
||||
|
||||
ProcessReferenceLine(line, m_references[sid]);
|
||||
|
||||
if (sid > 0 && sid % 100 == 0) {
|
||||
TRACE_ERR(".");
|
||||
}
|
||||
++sid;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void BleuScorer::ProcessReferenceLine(const std::string& line, Reference* ref) const
|
||||
{
|
||||
NgramCounts counts;
|
||||
size_t length = CountNgrams(line, counts, kBleuNgramOrder);
|
||||
|
||||
@ -138,35 +143,30 @@ bool BleuScorer::OpenReferenceStream(istream* is, size_t file_id)
|
||||
const NgramCounts::Value newcount = ci->second;
|
||||
|
||||
NgramCounts::Value oldcount = 0;
|
||||
m_references[sid]->get_counts()->Lookup(ngram, &oldcount);
|
||||
ref->get_counts()->Lookup(ngram, &oldcount);
|
||||
if (newcount > oldcount) {
|
||||
m_references[sid]->get_counts()->operator[](ngram) = newcount;
|
||||
ref->get_counts()->operator[](ngram) = newcount;
|
||||
}
|
||||
}
|
||||
//add in the length
|
||||
m_references[sid]->push_back(length);
|
||||
if (sid > 0 && sid % 100 == 0) {
|
||||
TRACE_ERR(".");
|
||||
}
|
||||
++sid;
|
||||
}
|
||||
return true;
|
||||
ref->push_back(length);
|
||||
}
|
||||
|
||||
void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
|
||||
{
|
||||
if (sid >= m_references.size()) {
|
||||
stringstream msg;
|
||||
msg << "Sentence id (" << sid << ") not found in reference set";
|
||||
throw runtime_error(msg.str());
|
||||
}
|
||||
UTIL_THROW_IF2(sid >= m_references.size(), "Sentence id (" << sid << ") not found in reference set");
|
||||
CalcBleuStats(m_references[sid], text, entry);
|
||||
}
|
||||
|
||||
void BleuScorer::CalcBleuStats(const Reference* ref, const std::string& text, ScoreStats& entry) const
|
||||
{
|
||||
NgramCounts testcounts;
|
||||
// stats for this line
|
||||
vector<ScoreStatsType> stats(kBleuNgramOrder * 2);
|
||||
string sentence = preprocessSentence(text);
|
||||
const size_t length = CountNgrams(sentence, testcounts, kBleuNgramOrder, true);
|
||||
|
||||
const int reference_len = CalcReferenceLength(sid, length);
|
||||
const int reference_len = CalcReferenceLength(ref, length);
|
||||
stats.push_back(reference_len);
|
||||
|
||||
//precision on each ngram type
|
||||
@ -177,7 +177,7 @@ void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
|
||||
NgramCounts::Value correct = 0;
|
||||
|
||||
NgramCounts::Value v = 0;
|
||||
if (m_references[sid]->get_counts()->Lookup(testcounts_it->first, &v)) {
|
||||
if (ref->get_counts()->Lookup(testcounts_it->first, &v)) {
|
||||
correct = min(v, guess);
|
||||
}
|
||||
stats[len * 2 - 2] += correct;
|
||||
@ -207,21 +207,20 @@ statscore_t BleuScorer::calculateScore(const vector<ScoreStatsType>& comps) cons
|
||||
return exp(logbleu);
|
||||
}
|
||||
|
||||
int BleuScorer::CalcReferenceLength(size_t sentence_id, size_t length)
|
||||
int BleuScorer::CalcReferenceLength(const Reference* ref, std::size_t length) const
|
||||
{
|
||||
switch (m_ref_length_type) {
|
||||
case AVERAGE:
|
||||
return m_references[sentence_id]->CalcAverage();
|
||||
return ref->CalcAverage();
|
||||
break;
|
||||
case CLOSEST:
|
||||
return m_references[sentence_id]->CalcClosest(length);
|
||||
return ref->CalcClosest(length);
|
||||
break;
|
||||
case SHORTEST:
|
||||
return m_references[sentence_id]->CalcShortest();
|
||||
return ref->CalcShortest();
|
||||
break;
|
||||
default:
|
||||
cerr << "unknown reference types." << endl;
|
||||
exit(1);
|
||||
UTIL_THROW2("Unknown reference types");
|
||||
}
|
||||
}
|
||||
|
||||
@ -298,29 +297,23 @@ vector<float> BleuScorer::ScoreNbestList(const string& scoreFile, const string&
|
||||
|
||||
vector<FeatureDataIterator> featureDataIters;
|
||||
vector<ScoreDataIterator> scoreDataIters;
|
||||
for (size_t i = 0; i < featureFiles.size(); ++i) {
|
||||
for (size_t i = 0; i < featureFiles.size(); ++i)
|
||||
{
|
||||
featureDataIters.push_back(FeatureDataIterator(featureFiles[i]));
|
||||
scoreDataIters.push_back(ScoreDataIterator(scoreFiles[i]));
|
||||
}
|
||||
|
||||
vector<pair<size_t,size_t> > hypotheses;
|
||||
if (featureDataIters[0] == FeatureDataIterator::end()) {
|
||||
cerr << "Error: at the end of feature data iterator" << endl;
|
||||
exit(1);
|
||||
}
|
||||
for (size_t i = 0; i < featureFiles.size(); ++i) {
|
||||
if (featureDataIters[i] == FeatureDataIterator::end()) {
|
||||
cerr << "Error: Feature file " << i << " ended prematurely" << endl;
|
||||
exit(1);
|
||||
}
|
||||
if (scoreDataIters[i] == ScoreDataIterator::end()) {
|
||||
cerr << "Error: Score file " << i << " ended prematurely" << endl;
|
||||
exit(1);
|
||||
}
|
||||
if (featureDataIters[i]->size() != scoreDataIters[i]->size()) {
|
||||
cerr << "Error: features and scores have different size" << endl;
|
||||
exit(1);
|
||||
}
|
||||
UTIL_THROW_IF2(featureDataIters[0] == FeatureDataIterator::end(),
|
||||
"At the end of feature data iterator");
|
||||
for (size_t i = 0; i < featureFiles.size(); ++i)
|
||||
{
|
||||
UTIL_THROW_IF2(featureDataIters[i] == FeatureDataIterator::end(),
|
||||
"Feature file " << i << " ended prematurely");
|
||||
UTIL_THROW_IF2(scoreDataIters[i] == ScoreDataIterator::end(),
|
||||
"Score file " << i << " ended prematurely");
|
||||
UTIL_THROW_IF2(featureDataIters[i]->size() != scoreDataIters[i]->size(),
|
||||
"Features and scores have different size");
|
||||
for (size_t j = 0; j < featureDataIters[i]->size(); ++j) {
|
||||
hypotheses.push_back(pair<size_t,size_t>(i,j));
|
||||
}
|
||||
|
@ -42,11 +42,14 @@ public:
|
||||
return 2 * kBleuNgramOrder + 1;
|
||||
}
|
||||
|
||||
int CalcReferenceLength(std::size_t sentence_id, std::size_t length);
|
||||
void CalcBleuStats(const Reference* ref, const std::string& text, ScoreStats& entry) const;
|
||||
|
||||
int CalcReferenceLength(const Reference* ref, std::size_t length) const;
|
||||
|
||||
ReferenceLengthType GetReferenceLengthType() const {
|
||||
return m_ref_length_type;
|
||||
}
|
||||
|
||||
void SetReferenceLengthType(ReferenceLengthType type) {
|
||||
m_ref_length_type = type;
|
||||
}
|
||||
@ -62,14 +65,14 @@ public:
|
||||
/**
|
||||
* Count the ngrams of each type, up to the given length in the input line.
|
||||
*/
|
||||
std::size_t CountNgrams(const std::string& line, NgramCounts& counts, unsigned int n, bool is_testing=false);
|
||||
std::size_t CountNgrams(const std::string& line, NgramCounts& counts, unsigned int n, bool is_testing=false) const;
|
||||
|
||||
void DumpCounts(std::ostream* os, const NgramCounts& counts) const;
|
||||
|
||||
bool OpenReference(const char* filename, std::size_t file_id);
|
||||
|
||||
// NOTE: this function is used for unit testing.
|
||||
virtual bool OpenReferenceStream(std::istream* is, std::size_t file_id);
|
||||
bool OpenReferenceStream(std::istream* is, std::size_t file_id);
|
||||
|
||||
void ProcessReferenceLine(const std::string& line, Reference* ref) const;
|
||||
|
||||
//private:
|
||||
protected:
|
||||
|
@ -66,11 +66,13 @@ exe evaluator : evaluator.cpp mert_lib ;
|
||||
|
||||
exe sentence-bleu : sentence-bleu.cpp mert_lib ;
|
||||
|
||||
exe sentence-bleu-nbest : sentence-bleu-nbest.cpp mert_lib ;
|
||||
|
||||
exe pro : pro.cpp mert_lib ..//boost_program_options ;
|
||||
|
||||
exe kbmira : kbmira.cpp mert_lib ..//boost_program_options ..//boost_filesystem ;
|
||||
|
||||
alias programs : mert extractor evaluator pro kbmira sentence-bleu ;
|
||||
alias programs : mert extractor evaluator pro kbmira sentence-bleu sentence-bleu-nbest ;
|
||||
|
||||
unit-test bleu_scorer_test : BleuScorerTest.cpp mert_lib ..//boost_unit_test_framework ;
|
||||
unit-test feature_data_test : FeatureDataTest.cpp mert_lib ..//boost_unit_test_framework ;
|
||||
|
@ -64,7 +64,7 @@ void Scorer::InitConfig(const string& config)
|
||||
}
|
||||
}
|
||||
|
||||
void Scorer::TokenizeAndEncode(const string& line, vector<int>& encoded)
|
||||
void Scorer::TokenizeAndEncode(const string& line, vector<int>& encoded) const
|
||||
{
|
||||
for (util::TokenIter<util::AnyCharacter, true> it(line, util::AnyCharacter(" "));
|
||||
it; ++it) {
|
||||
@ -81,7 +81,7 @@ void Scorer::TokenizeAndEncode(const string& line, vector<int>& encoded)
|
||||
}
|
||||
}
|
||||
|
||||
void Scorer::TokenizeAndEncodeTesting(const string& line, vector<int>& encoded)
|
||||
void Scorer::TokenizeAndEncodeTesting(const string& line, vector<int>& encoded) const
|
||||
{
|
||||
for (util::TokenIter<util::AnyCharacter, true> it(line, util::AnyCharacter(" "));
|
||||
it; ++it) {
|
||||
|
@ -187,12 +187,12 @@ protected:
|
||||
* Tokenise line and encode.
|
||||
* Note: We assume that all tokens are separated by whitespaces.
|
||||
*/
|
||||
void TokenizeAndEncode(const std::string& line, std::vector<int>& encoded);
|
||||
void TokenizeAndEncode(const std::string& line, std::vector<int>& encoded) const;
|
||||
|
||||
/*
|
||||
* Tokenize functions for testing only.
|
||||
*/
|
||||
void TokenizeAndEncodeTesting(const std::string& line, std::vector<int>& encoded);
|
||||
void TokenizeAndEncodeTesting(const std::string& line, std::vector<int>& encoded) const;
|
||||
|
||||
/**
|
||||
* Every inherited scorer should call this function for each sentence
|
||||
|
44
mert/sentence-bleu-nbest.cpp
Normal file
44
mert/sentence-bleu-nbest.cpp
Normal file
@ -0,0 +1,44 @@
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#include "BleuScorer.h"
|
||||
#include "moses/Util.h"
|
||||
|
||||
using namespace MosesTuning;
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
if (argc == 1) {
|
||||
std::cerr << "Usage: ./sentence-bleu-nbest ref1 [ref2 ...] < plain-nbest > bleu-scores" << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
std::vector<std::string> refFiles(argv + 1, argv + argc);
|
||||
|
||||
// TODO all of these are empty for now
|
||||
std::string config;
|
||||
std::string factors;
|
||||
std::string filter;
|
||||
|
||||
BleuScorer scorer(config);
|
||||
scorer.setFactors(factors);
|
||||
scorer.setFilter(filter);
|
||||
scorer.setReferenceFiles(refFiles); // TODO: we don't need to load the whole reference corpus into memory (this can take gigabytes of RAM if done with millions of sentences)
|
||||
|
||||
// Loading sentences and preparing statistics
|
||||
std::string nbestLine;
|
||||
while ( getline(std::cin, nbestLine) )
|
||||
{
|
||||
std::vector<std::string> items;
|
||||
Moses::TokenizeMultiCharSeparator(items, nbestLine, " ||| ");
|
||||
size_t sid = Moses::Scan<size_t>(items[0]);
|
||||
|
||||
ScoreStats scoreStats;
|
||||
scorer.prepareStats(sid, items[1], scoreStats);
|
||||
std::vector<float> stats(scoreStats.getArray(), scoreStats.getArray() + scoreStats.size());
|
||||
std::cout << smoothedSentenceBleu(stats) << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
@ -23,22 +23,19 @@ int main(int argc, char **argv)
|
||||
BleuScorer scorer(config);
|
||||
scorer.setFactors(factors);
|
||||
scorer.setFilter(filter);
|
||||
scorer.setReferenceFiles(refFiles);
|
||||
|
||||
vector<ScoreStats> entries;
|
||||
scorer.setReferenceFiles(refFiles); // TODO: we don't need to load the whole reference corpus into memory (this can take gigabytes of RAM if done with millions of sentences)
|
||||
|
||||
// Loading sentences and preparing statistics
|
||||
ScoreStats scoreentry;
|
||||
string line;
|
||||
while (getline(cin, line)) {
|
||||
scorer.prepareStats(entries.size(), line, scoreentry);
|
||||
entries.push_back(scoreentry);
|
||||
string hypothesisLine;
|
||||
size_t sid = 0;
|
||||
while (getline(std::cin, hypothesisLine))
|
||||
{
|
||||
ScoreStats scoreStats;
|
||||
scorer.prepareStats(sid, hypothesisLine, scoreStats);
|
||||
vector<float> stats(scoreStats.getArray(), scoreStats.getArray() + scoreStats.size());
|
||||
std::cout << smoothedSentenceBleu(stats) << std::endl;
|
||||
++sid;
|
||||
}
|
||||
|
||||
vector<ScoreStats>::const_iterator sentIt;
|
||||
for (sentIt = entries.begin(); sentIt != entries.end(); sentIt++) {
|
||||
vector<float> stats(sentIt->getArray(), sentIt->getArray() + sentIt->size());
|
||||
cout << smoothedSentenceBleu(stats) << "\n";
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
@ -90,13 +90,6 @@ bool FileExists(const std::string& filePath)
|
||||
return !ifs.fail();
|
||||
}
|
||||
|
||||
const std::string Trim(const std::string& str, const std::string dropChars)
|
||||
{
|
||||
std::string res = str;
|
||||
res.erase(str.find_last_not_of(dropChars)+1);
|
||||
return res.erase(0, res.find_first_not_of(dropChars));
|
||||
}
|
||||
|
||||
void ResetUserTime()
|
||||
{
|
||||
g_timer.start();
|
||||
|
15
moses/Util.h
15
moses/Util.h
@ -19,8 +19,7 @@ License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
***********************************************************************/
|
||||
|
||||
#ifndef moses_Util_h
|
||||
#define moses_Util_h
|
||||
#pragma once
|
||||
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
@ -89,10 +88,17 @@ namespace Moses
|
||||
#define NTH_ELEMENT4(begin, middle, end, orderer) std::nth_element(begin, middle, end, orderer)
|
||||
#endif
|
||||
|
||||
//! delete white spaces at beginning and end of string
|
||||
const std::string Trim(const std::string& str, const std::string dropChars = " \t\n\r");
|
||||
|
||||
const std::string ToLower(const std::string& str);
|
||||
|
||||
//! Return a copy of str with leading and trailing characters that occur in
//! dropChars removed (by default: space, tab, newline, carriage return).
//! An input consisting only of such characters yields an empty string.
inline std::string Trim(const std::string& str, const std::string dropChars = " \t\n\r")
{
  const std::string::size_type firstKept = str.find_first_not_of(dropChars);
  if (firstKept == std::string::npos) {
    // empty input, or nothing but characters to drop
    return std::string();
  }
  const std::string::size_type lastKept = str.find_last_not_of(dropChars);
  return str.substr(firstKept, lastKept - firstKept + 1);
}
|
||||
|
||||
//! get string representation of any object/variable, as long as it can pipe to a stream
|
||||
template<typename T>
|
||||
inline std::string SPrint(const T &input)
|
||||
@ -533,4 +539,3 @@ void ShowWeights();
|
||||
|
||||
} // namespace
|
||||
|
||||
#endif
|
||||
|
Loading…
Reference in New Issue
Block a user