/*
 * CHRFScorer.cpp
 *
 *  Created on: Dec 28, 2016
 *      Author: pramathur@ebay.com
 */

#include "CHRFScorer.h"

#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

#include "math.h"
#include "Util.h"
#include "ScoreStats.h"
#include "util/exception.hh"
#include "ScoreDataIterator.h"
#include "FeatureDataIterator.h"
#include "Vocabulary.h"

namespace
{

const char KEY_REFLEN[] = "reflen";
const char REFLEN_AVERAGE[] = "average";
const char REFLEN_SHORTEST[] = "shortest";
const char REFLEN_CLOSEST[] = "closest";
const char KEY_BETA[] = "beta";
const char KEY_BETA_DEF[] = "3";
const char KEY_SMOOTH[] = "smooth";
const char KEY_SMOOTH_DEF[] = "0";

float BETA = 3;
float SMOOTH = 0;

} // namespace

namespace MosesTuning
{

CHRFScorer::CHRFScorer(const std::string& config)
  : StatisticsBasedScorer("CHRF", config),
    m_ref_length_type(CLOSEST),
    m_beta(3),
    m_smooth(0)
{
  const std::string reflen = getConfig(KEY_REFLEN, REFLEN_CLOSEST);
  if (reflen == REFLEN_AVERAGE) {
    m_ref_length_type = AVERAGE;
  } else if (reflen == REFLEN_SHORTEST) {
    m_ref_length_type = SHORTEST;
  } else if (reflen == REFLEN_CLOSEST) {
    m_ref_length_type = CLOSEST;
  } else {
    UTIL_THROW2("Unknown reference length strategy: " + reflen);
  }

  const std::string beta = getConfig(KEY_BETA, KEY_BETA_DEF);
  const std::string smooth = getConfig(KEY_SMOOTH, KEY_SMOOTH_DEF);
  if (beta == KEY_BETA_DEF) {
    m_beta = 3.0;
  } else {
    m_beta = ::atof(beta.c_str());
  }
  if (smooth == KEY_SMOOTH_DEF) {
    m_smooth = 0.0;
  } else {
    m_smooth = ::atof(smooth.c_str());
  }
  BETA = m_beta;
  SMOOTH = m_smooth;
}

CHRFScorer::~CHRFScorer() {}

void CHRFScorer::setReferenceFiles(const std::vector<std::string>& referenceFiles)
{
  // Make sure reference data is clear
  m_references.reset();
  mert::VocabularyFactory::GetVocabulary()->clear();

  // Load reference data
  for (size_t i = 0; i < referenceFiles.size(); ++i) {
    TRACE_ERR("Loading reference from " << referenceFiles[i] << std::endl);

    std::ifstream ifs(referenceFiles[i].c_str());
    if (!OpenReferenceStream(&ifs, i)) {
      UTIL_THROW2("Cannot open " + referenceFiles[i]);
    }
  }
}

bool CHRFScorer::OpenReferenceStream(std::istream* is, size_t file_id)
{
  if (is == NULL) return false;

  std::string line;
  size_t sid = 0;
  while (getline(*is, line)) {
    // TODO: rather than loading the whole reference corpus into memory, can we
    // stream it line by line? (loading the whole reference corpus can take
    // gigabytes of RAM if done with millions of sentences)
    line = preprocessSentence(line);

    // chrf stuff here
    // split line into whitespace-separated characters so that CountNgrams()
    // collects character n-grams
    std::string temp_line;
    for (size_t i = 0; i < line.size(); ++i) {
      temp_line += line[i];
      temp_line += " ";
    }

    if (file_id == 0) {
      Reference* ref = new Reference;
      m_references.push_back(ref); // Take ownership of the Reference object.
    }
    UTIL_THROW_IF2(m_references.size() <= sid,
                   "Reference " << file_id << " has too many sentences.");

    ProcessReferenceLine(temp_line, m_references[sid]);

    if (sid > 0 && sid % 100 == 0) {
      TRACE_ERR(".");
    }
    ++sid;
  }
  return true;
}

void CHRFScorer::ProcessReferenceLine(const std::string& line, Reference* ref) const
{
  NgramCounts counts;
  size_t length = CountNgrams(line, counts, CHRFNgramOrder);

  // for any counts larger than those already there, merge them in
  for (NgramCounts::const_iterator ci = counts.begin(); ci != counts.end(); ++ci) {
    const NgramCounts::Key& ngram = ci->first;
    const NgramCounts::Value newcount = ci->second;

    NgramCounts::Value oldcount = 0;
    ref->get_counts()->Lookup(ngram, &oldcount);
    if (newcount > oldcount) {
      ref->get_counts()->operator[](ngram) = newcount;
    }
  }
  // add in the length
  ref->push_back(length);
}

size_t CHRFScorer::CountNgrams(const std::string& line, NgramCounts& counts,
                               unsigned int n, bool is_testing) const
{
  assert(n > 0);
  std::vector<int> encoded_tokens;

  // When performing tokenization of a hypothesis translation, we don't have
  // to update the Scorer's word vocabulary.
  // However, the tokenization of reference translations requires modifying
  // the vocabulary, which means this procedure might be slower than the
  // tokenization of the hypothesis translation.
  if (is_testing) {
    TokenizeAndEncodeTesting(line, encoded_tokens);
  } else {
    TokenizeAndEncode(line, encoded_tokens);
  }
  const size_t len = encoded_tokens.size();
  std::vector<int> ngram;

  for (size_t k = 1; k <= n; ++k) {
    // ngram order longer than sentence - no point
    if (k > len) {
      continue;
    }
    for (size_t i = 0; i < len - k + 1; ++i) {
      ngram.clear();
      ngram.reserve(len);
      for (size_t j = i; j < i + k && j < len; ++j) {
        ngram.push_back(encoded_tokens[j]);
      }
      counts.Add(ngram);
    }
  }
  // DumpCounts(&std::cerr, counts);
  return len;
}

void CHRFScorer::prepareStats(size_t sid, const std::string& text, ScoreStats& entry)
{
  UTIL_THROW_IF2(sid >= m_references.size(),
                 "Sentence id (" << sid << ") not found in reference set");
  CalcCHRFStats(*(m_references[sid]), text, entry);
}

void CHRFScorer::CalcCHRFStats(const Reference& ref, const std::string& text, ScoreStats& entry) const
{
  NgramCounts testcounts;
  // stats for this line
  std::vector<ScoreStatsType> stats(CHRFNgramOrder * 3);
  std::string sentence = preprocessSentence(text);

  // chrf stuff here
  // split line into whitespace-separated characters, as for the references
  std::string temp_line;
  for (size_t i = 0; i < sentence.size(); ++i) {
    temp_line += sentence[i];
    temp_line += " ";
  }

  const size_t length = CountNgrams(temp_line, testcounts, CHRFNgramOrder, true);

  // hypothesis length and reference length: the two extra statistics expected
  // by calculateScore()
  stats.push_back(length);
  stats.push_back(CalcReferenceLength(ref, length));

  // correct, hypothesis and reference counts for each n-gram order
  for (NgramCounts::const_iterator testcounts_it = testcounts.begin();
       testcounts_it != testcounts.end(); ++testcounts_it) {
    const NgramCounts::Value guess = testcounts_it->second;
    const size_t len = testcounts_it->first.size();
    NgramCounts::Value correct = 0;

    NgramCounts::Value v = 0;
    if (ref.get_counts()->Lookup(testcounts_it->first, &v)) {
      correct = std::min(v, guess);
    }
    stats[len * 3 - 3] += correct;
    stats[len * 3 - 2] += guess;
    stats[len * 3 - 1] += v;
  }
  entry.set(stats);
}

statscore_t CHRFScorer::calculateScore(const std::vector<ScoreStatsType>& comps) const
{
  UTIL_THROW_IF(comps.size() != CHRFNgramOrder * 3 + 2, util::Exception, "Error");

  float f1 = 0.0;
  float precision = 0.0;
  float recall = 0.0;

  // Per-order precision and recall, averaged over the n-gram orders.
  for (size_t i = 0; i < CHRFNgramOrder; i++) {
    precision += ((comps[3 * i] + m_smooth) * 1.0) / ((comps[3 * i + 1] + m_smooth) * 1.0);
    recall += ((comps[3 * i] + m_smooth) * 1.0) / ((comps[3 * i + 2] + m_smooth) * 1.0);
  }
  precision /= CHRFNgramOrder;
  recall /= CHRFNgramOrder;

  // chrF_beta = (1 + beta^2) * P * R / (beta^2 * P + R)
  f1 = ((1 + pow(m_beta, 2)) * (precision * recall)) / ((pow(m_beta, 2) * precision) + recall);
  return f1;
}

int CHRFScorer::CalcReferenceLength(const Reference& ref, std::size_t length) const
{
  switch (m_ref_length_type) {
  case AVERAGE:
    return ref.CalcAverage();
    break;
  case CLOSEST:
    return ref.CalcClosest(length);
    break;
  case SHORTEST:
    return ref.CalcShortest();
    break;
  default:
    UTIL_THROW2("Unknown reference types");
  }
}

void CHRFScorer::DumpCounts(std::ostream* os, const NgramCounts& counts) const
{
  for (NgramCounts::const_iterator it = counts.begin();
       it != counts.end(); ++it) {
    *os << "(";
    const NgramCounts::Key& keys = it->first;
    for (size_t i = 0; i < keys.size(); ++i) {
      if (i != 0) {
        *os << " ";
      }
      *os << keys[i];
    }
    *os << ") : " << it->second << ", ";
  }
  *os << std::endl;
}

} /* namespace MosesTuning */