diff --git a/mert/BleuDocScorer.cpp b/mert/BleuDocScorer.cpp index 48c17ee96..d71c5171d 100644 --- a/mert/BleuDocScorer.cpp +++ b/mert/BleuDocScorer.cpp @@ -174,7 +174,7 @@ statscore_t BleuDocScorer::calculateScore(const vector& comps) const UTIL_THROW_IF(comps.size() != kBleuNgramOrder * 2 + 1, util::Exception, "Error"); float logbleu = 0.0; - for (int i = 0; i < kBleuNgramOrder; ++i) { + for (size_t i = 0; i < kBleuNgramOrder; ++i) { if (comps[2*i] == 0) { return 0.0; } diff --git a/mert/BleuDocScorer.h b/mert/BleuDocScorer.h index 9677410f8..d27088254 100644 --- a/mert/BleuDocScorer.h +++ b/mert/BleuDocScorer.h @@ -1,5 +1,4 @@ -#ifndef MERT_BLEU_DOC_SCORER_H_ -#define MERT_BLEU_DOC_SCORER_H_ +#pragma once #include #include @@ -64,4 +63,3 @@ private: } -#endif // MERT_BLEU_DOC_SCORER_H_ diff --git a/mert/BleuScorer.cpp b/mert/BleuScorer.cpp index 13a472447..dc926054f 100644 --- a/mert/BleuScorer.cpp +++ b/mert/BleuScorer.cpp @@ -99,9 +99,8 @@ void BleuScorer::setReferenceFiles(const vector& referenceFiles) TRACE_ERR("Loading reference from " << referenceFiles[i] << endl); ifstream ifs(referenceFiles[i].c_str()); - UTIL_THROW_IF2(!ifs, "Cannot open " << referenceFiles[i]); if (!OpenReferenceStream(&ifs, i)) { - UTIL_THROW2("Unable to open " + referenceFiles[i]); + UTIL_THROW2("Cannot open " + referenceFiles[i]); } } } @@ -152,13 +151,26 @@ void BleuScorer::ProcessReferenceLine(const std::string& line, Reference* ref) c ref->push_back(length); } +bool BleuScorer::GetNextReferenceFromStreams(std::vector >& referenceStreams, Reference& ref) const +{ + for (vector >::iterator ifs=referenceStreams.begin(); ifs!=referenceStreams.end(); ++ifs) + { + if (!(*ifs)) return false; + string line; + if (!getline(**ifs, line)) return false; + line = preprocessSentence(line); + ProcessReferenceLine(line, &ref); + } + return true; +} + void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) { UTIL_THROW_IF2(sid >= m_references.size(), "Sentence id (" << sid << ") not found in reference set"); - CalcBleuStats(m_references[sid], text, entry); + CalcBleuStats(*(m_references[sid]), text, entry); } -void BleuScorer::CalcBleuStats(const Reference* ref, const std::string& text, ScoreStats& entry) const +void BleuScorer::CalcBleuStats(const Reference& ref, const std::string& text, ScoreStats& entry) const { NgramCounts testcounts; // stats for this line @@ -177,7 +189,7 @@ void BleuScorer::CalcBleuStats(const Reference* ref, const std::string& text, Sc NgramCounts::Value correct = 0; NgramCounts::Value v = 0; - if (ref->get_counts()->Lookup(testcounts_it->first, &v)) { + if (ref.get_counts()->Lookup(testcounts_it->first, &v)) { correct = min(v, guess); } stats[len * 2 - 2] += correct; @@ -207,17 +219,17 @@ statscore_t BleuScorer::calculateScore(const vector& comps) cons return exp(logbleu); } -int BleuScorer::CalcReferenceLength(const Reference* ref, std::size_t length) const +int BleuScorer::CalcReferenceLength(const Reference& ref, std::size_t length) const { switch (m_ref_length_type) { case AVERAGE: - return ref->CalcAverage(); + return ref.CalcAverage(); break; case CLOSEST: - return ref->CalcClosest(length); + return ref.CalcClosest(length); break; case SHORTEST: - return ref->CalcShortest(); + return ref.CalcShortest(); break; default: UTIL_THROW2("Unknown reference types"); diff --git a/mert/BleuScorer.h b/mert/BleuScorer.h index e90915822..d7ee8e4e7 100644 --- a/mert/BleuScorer.h +++ b/mert/BleuScorer.h @@ -1,23 +1,23 @@ -#ifndef MERT_BLEU_SCORER_H_ -#define MERT_BLEU_SCORER_H_ +#pragma once -#include +#include #include #include -#include "Types.h" +#include + +#include "Ngram.h" +#include "Reference.h" +#include "ScopedVector.h" #include "ScoreData.h" #include "StatisticsBasedScorer.h" -#include "ScopedVector.h" +#include "Types.h" namespace MosesTuning { const size_t kBleuNgramOrder = 4; -class NgramCounts; -class Reference; - /** * Bleu scoring */ @@ -42,9 +42,9 @@ public: return 2 * kBleuNgramOrder + 1; } - void CalcBleuStats(const Reference* ref, const std::string& text, ScoreStats& entry) const; + void CalcBleuStats(const Reference& ref, const std::string& text, ScoreStats& entry) const; - int CalcReferenceLength(const Reference* ref, std::size_t length) const; + int CalcReferenceLength(const Reference& ref, std::size_t length) const; ReferenceLengthType GetReferenceLengthType() const { return m_ref_length_type; @@ -65,7 +65,7 @@ public: /** * Count the ngrams of each type, up to the given length in the input line. */ - std::size_t CountNgrams(const std::string& line, NgramCounts& counts, unsigned int n, bool is_testing=false) const; + size_t CountNgrams(const std::string& line, NgramCounts& counts, unsigned int n, bool is_testing=false) const; void DumpCounts(std::ostream* os, const NgramCounts& counts) const; @@ -74,6 +74,8 @@ public: void ProcessReferenceLine(const std::string& line, Reference* ref) const; + bool GetNextReferenceFromStreams(std::vector >& referenceStreams, Reference& ref) const; + //private: protected: ReferenceLengthType m_ref_length_type; @@ -102,4 +104,3 @@ float sentenceLevelBackgroundBleu(const std::vector& sent, const std::vec } -#endif // MERT_BLEU_SCORER_H_ diff --git a/mert/HopeFearDecoder.cpp b/mert/HopeFearDecoder.cpp index 5288116d6..be9d8f2c9 100644 --- a/mert/HopeFearDecoder.cpp +++ b/mert/HopeFearDecoder.cpp @@ -98,7 +98,7 @@ void NbestHopeFearDecoder::HopeFear( size_t hope_index=0, fear_index=0, model_index=0; ValType hope_score=0, fear_score=0, model_score=0; for(size_t safe_loop=0; safe_loop<2; safe_loop++) { - ValType hope_bleu, hope_model; + ValType hope_bleu=0, hope_model=0; for(size_t i=0; i< train_->cur_size(); i++) { const MiraFeatureVector& vec=train_->featuresAt(i); ValType score = wv.score(vec); diff --git a/mert/HopeFearDecoder.h b/mert/HopeFearDecoder.h index 53c0e935d..73f0e97d9 100644 --- a/mert/HopeFearDecoder.h +++ b/mert/HopeFearDecoder.h @@ -16,8 +16,7 @@ You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ***********************************************************************/ -#ifndef MERT_HOPEFEARDECODER_H -#define MERT_HOPEFEARDECODER_H +#pragma once #include @@ -160,5 +159,3 @@ private: }; -#endif - diff --git a/mert/Ngram.h b/mert/Ngram.h index 521dc4928..de2703605 100644 --- a/mert/Ngram.h +++ b/mert/Ngram.h @@ -1,5 +1,4 @@ -#ifndef MERT_NGRAM_H_ -#define MERT_NGRAM_H_ +#pragma once #include #include @@ -121,4 +120,3 @@ private: } -#endif // MERT_NGRAM_H_ diff --git a/mert/Reference.h b/mert/Reference.h index 2c12f2ed7..a7878f3e7 100644 --- a/mert/Reference.h +++ b/mert/Reference.h @@ -59,6 +59,11 @@ public: int CalcClosest(std::size_t length) const; int CalcShortest() const; + void clear() { + m_length.clear(); + m_counts->clear(); + } + private: NgramCounts* m_counts; diff --git a/mert/Scorer.h b/mert/Scorer.h index 4383c68f2..a08fc436d 100644 --- a/mert/Scorer.h +++ b/mert/Scorer.h @@ -1,5 +1,4 @@ -#ifndef MERT_SCORER_H_ -#define MERT_SCORER_H_ +#pragma once #include #include @@ -236,4 +235,3 @@ inline float score_average(const statscores_t& scores, size_t start, size_t end) } -#endif // MERT_SCORER_H_ diff --git a/mert/sentence-bleu-nbest.cpp b/mert/sentence-bleu-nbest.cpp index 023e9faae..f869386e3 100644 --- a/mert/sentence-bleu-nbest.cpp +++ b/mert/sentence-bleu-nbest.cpp @@ -1,9 +1,14 @@ +#include #include #include #include +#include + #include "BleuScorer.h" +#include "Reference.h" #include "moses/Util.h" +#include "util/exception.hh" using namespace MosesTuning; @@ -24,21 +29,40 @@ int main(int argc, char **argv) BleuScorer scorer(config); scorer.setFactors(factors); scorer.setFilter(filter); - scorer.setReferenceFiles(refFiles); // TODO: we don't need to load the whole reference corpus into memory (this can take gigabytes of RAM if done with millions of sentences) - // Loading sentences and preparing statistics + // initialize reference streams + std::vector > refStreams; + for (std::vector::const_iterator refFile=refFiles.begin(); refFile!=refFiles.end(); ++refFile) + { + TRACE_ERR("Loading reference from " << *refFile << std::endl); + boost::shared_ptr ifs(new std::ifstream(refFile->c_str())); + UTIL_THROW_IF2(!ifs, "Cannot open " << *refFile); + refStreams.push_back(ifs); + } + + // load sentences, preparing statistics, score std::string nbestLine; + int sid = -1; + Reference ref; while ( getline(std::cin, nbestLine) ) { std::vector items; Moses::TokenizeMultiCharSeparator(items, nbestLine, " ||| "); - size_t sid = Moses::Scan(items[0]); + int sidCurrent = Moses::Scan(items[0]); + if (sidCurrent != sid) { + ref.clear(); + if (!scorer.GetNextReferenceFromStreams(refStreams, ref)) { + UTIL_THROW2("Missing references"); + } + sid = sidCurrent; + } ScoreStats scoreStats; - scorer.prepareStats(sid, items[1], scoreStats); + scorer.CalcBleuStats(ref, items[1], scoreStats); std::vector stats(scoreStats.getArray(), scoreStats.getArray() + scoreStats.size()); std::cout << smoothedSentenceBleu(stats) << std::endl; } return 0; } + diff --git a/mert/sentence-bleu.cpp b/mert/sentence-bleu.cpp index 425122c05..9bdab30d2 100644 --- a/mert/sentence-bleu.cpp +++ b/mert/sentence-bleu.cpp @@ -1,18 +1,26 @@ +#include #include #include #include +#include + #include "BleuScorer.h" +#include "Reference.h" +#include "moses/Util.h" +#include "util/exception.hh" using namespace std; using namespace MosesTuning; + int main(int argc, char **argv) { if (argc == 1) { cerr << "Usage: ./sentence-bleu ref1 [ref2 ...] < candidate > bleu-scores" << endl; return 1; } + vector refFiles(argv + 1, argv + argc); // TODO all of these are empty for now @@ -23,15 +31,28 @@ int main(int argc, char **argv) BleuScorer scorer(config); scorer.setFactors(factors); scorer.setFilter(filter); - scorer.setReferenceFiles(refFiles); // TODO: we don't need to load the whole reference corpus into memory (this can take gigabytes of RAM if done with millions of sentences) - // Loading sentences and preparing statistics + // initialize reference streams + vector > refStreams; + for (vector::const_iterator refFile=refFiles.begin(); refFile!=refFiles.end(); ++refFile) + { + TRACE_ERR("Loading reference from " << *refFile << endl); + boost::shared_ptr ifs(new ifstream(refFile->c_str())); + UTIL_THROW_IF2(!ifs, "Cannot open " << *refFile); + refStreams.push_back(ifs); + } + + // load sentences, preparing statistics, score string hypothesisLine; size_t sid = 0; while (getline(std::cin, hypothesisLine)) { + Reference ref; + if (!scorer.GetNextReferenceFromStreams(refStreams, ref)) { + UTIL_THROW2("Missing references"); + } ScoreStats scoreStats; - scorer.prepareStats(sid, hypothesisLine, scoreStats); + scorer.CalcBleuStats(ref, hypothesisLine, scoreStats); vector stats(scoreStats.getArray(), scoreStats.getArray() + scoreStats.size()); std::cout << smoothedSentenceBleu(stats) << std::endl; ++sid; @@ -39,3 +60,4 @@ int main(int argc, char **argv) return 0; } +