From 2a3c9fc6797c97595f5f6bda7dab0b17ce758afc Mon Sep 17 00:00:00 2001
From: Tetsuo Kiso
Date: Fri, 7 Dec 2012 08:45:47 +0900
Subject: [PATCH] Further optimization for extractor.

Fixes inefficient updating of N-gram counts.

NOTE: Using the '--binary' option (this option is not enabled by
default yet) for saving outputs leads to a significant speed-up.
---
 mert/BleuScorer.cpp | 15 +++++++++------
 mert/Ngram.h        |  9 +--------
 2 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/mert/BleuScorer.cpp b/mert/BleuScorer.cpp
index 8fb814390..1adbd0276 100644
--- a/mert/BleuScorer.cpp
+++ b/mert/BleuScorer.cpp
@@ -65,21 +65,24 @@ size_t BleuScorer::CountNgrams(const string& line, NgramCounts& counts,
   } else {
     TokenizeAndEncode(line, encoded_tokens);
   }
+  const size_t len = encoded_tokens.size();
+  vector<int> ngram;
+
   for (size_t k = 1; k <= n; ++k) {
     //ngram order longer than sentence - no point
-    if (k > encoded_tokens.size()) {
+    if (k > len) {
       continue;
     }
-    for (size_t i = 0; i < encoded_tokens.size()-k+1; ++i) {
-      vector<int> ngram;
-      ngram.reserve(encoded_tokens.size());
-      for (size_t j = i; j < i+k && j < encoded_tokens.size(); ++j) {
+    for (size_t i = 0; i < len - k + 1; ++i) {
+      ngram.clear();
+      ngram.reserve(len);
+      for (size_t j = i; j < i+k && j < len; ++j) {
         ngram.push_back(encoded_tokens[j]);
       }
       counts.Add(ngram);
     }
   }
-  return encoded_tokens.size();
+  return len;
 }
 
 void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles)
diff --git a/mert/Ngram.h b/mert/Ngram.h
index 811f21f27..6363c847c 100644
--- a/mert/Ngram.h
+++ b/mert/Ngram.h
@@ -45,14 +45,7 @@ class NgramCounts {
   /**
    * If the specified "ngram" is found, we add counts.
    * If not, we insert the default count in the container.
   */
-  void Add(const Key& ngram) {
-    const_iterator it = find(ngram);
-    if (it != end()) {
-      m_counts[ngram] = it->second + 1;
-    } else {
-      m_counts[ngram] = kDefaultCount;
-    }
-  }
+  inline void Add(const Key& ngram) { m_counts[ngram]++; }
 
  /**
   * Return true iff the specified "ngram" is found in the container.
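
For reference, the Ngram.h one-liner works because std::map::operator[] value-initializes a missing mapped value to zero before the increment, so an unseen n-gram ends up stored as 1 (matching the removed kDefaultCount branch, assuming that default is 1), while a seen n-gram is bumped with a single lookup instead of find() followed by a second operator[] store. Below is a minimal standalone sketch of the two update patterns, using a plain std::map<vector<int>, int> as a stand-in for the real NgramCounts container; CountMap, AddOld, and AddNew are illustrative names, not part of the patch.

#include <cassert>
#include <map>
#include <vector>

// Stand-in for NgramCounts' internal container:
// encoded token sequence -> occurrence count.
typedef std::vector<int> Key;
typedef std::map<Key, int> CountMap;

// Old pattern: find() first, then a second lookup through operator[]
// to either bump the existing count or insert the default.
void AddOld(CountMap& counts, const Key& ngram) {
  const int kDefaultCount = 1;  // assumed default, as in the removed branch
  CountMap::const_iterator it = counts.find(ngram);
  if (it != counts.end()) {
    counts[ngram] = it->second + 1;
  } else {
    counts[ngram] = kDefaultCount;
  }
}

// New pattern: operator[] inserts a value-initialized 0 on a miss,
// so one lookup both inserts and increments.
void AddNew(CountMap& counts, const Key& ngram) {
  counts[ngram]++;
}

int main() {
  CountMap a, b;
  Key bigram;
  bigram.push_back(7);
  bigram.push_back(42);

  AddOld(a, bigram);  AddOld(a, bigram);
  AddNew(b, bigram);  AddNew(b, bigram);

  // Both patterns end up with the same count.
  assert(a[bigram] == 2 && b[bigram] == 2);
  return 0;
}

The BleuScorer.cpp hunk applies the same idea to allocations: the ngram buffer is declared once per call and reused via clear()/reserve(), rather than constructing a fresh vector for every extracted n-gram.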