Further optimization for extractor.

Fixes inefficient updating of N-gram counts. NOTE: Using the '--binary' option (which is not yet enabled by default) for saving outputs would lead to a significant speedup.
2024-10-26 19:37:58 +03:00 · 2012-12-07 08:45:47 +09:00 · 2012-12-07 08:45:47 +09:00 · 2a3c9fc679
commit 2a3c9fc679
parent 8fdec9bf30
2 changed files with 10 additions and 14 deletions
--- a/mert/BleuScorer.cpp
+++ b/mert/BleuScorer.cpp
@ -65,21 +65,24 @@ size_t BleuScorer::CountNgrams(const string& line, NgramCounts& counts,
  } else {
    TokenizeAndEncode(line, encoded_tokens);
  }
+  const size_t len = encoded_tokens.size();
+  vector<int> ngram;
+
  for (size_t k = 1; k <= n; ++k) {
    //ngram order longer than sentence - no point
-    if (k > encoded_tokens.size()) {
+    if (k > len) {
      continue;
    }
-    for (size_t i = 0; i < encoded_tokens.size()-k+1; ++i) {
-      vector<int> ngram;
-      ngram.reserve(encoded_tokens.size());
-      for (size_t j = i; j < i+k && j < encoded_tokens.size(); ++j) {
+    for (size_t i = 0; i < len - k + 1; ++i) {
+      ngram.clear();
+      ngram.reserve(len);
+      for (size_t j = i; j < i+k && j < len; ++j) {
        ngram.push_back(encoded_tokens[j]);
      }
      counts.Add(ngram);
    }
  }
-  return encoded_tokens.size();
+  return len;
 }

 void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles)
--- a/mert/Ngram.h
+++ b/mert/Ngram.h
@ -45,14 +45,7 @@ class NgramCounts {
  /**
   * If the specified "ngram" is found, we add counts.
   * If not, we insert the default count in the container. */
-  void Add(const Key& ngram) {
-    const_iterator it = find(ngram);
-    if (it != end()) {
-      m_counts[ngram] = it->second + 1;
-    } else {
-      m_counts[ngram] = kDefaultCount;
-    }
-  }
+  inline void Add(const Key& ngram) { m_counts[ngram]++; }

  /**
   * Return true iff the specified "ngram" is found in the container.