Further optimization of the extractor.

Fixes inefficient updating of N-gram counts.

NOTE: Using the '--binary' option (not yet enabled by default)
for saving outputs would lead to a significant speedup.
This commit is contained in:
Tetsuo Kiso 2012-12-07 08:45:47 +09:00
parent 8fdec9bf30
commit 2a3c9fc679
2 changed files with 10 additions and 14 deletions

View File

@ -65,21 +65,24 @@ size_t BleuScorer::CountNgrams(const string& line, NgramCounts& counts,
} else {
TokenizeAndEncode(line, encoded_tokens);
}
const size_t len = encoded_tokens.size();
vector<int> ngram;
for (size_t k = 1; k <= n; ++k) {
//ngram order longer than sentence - no point
if (k > encoded_tokens.size()) {
if (k > len) {
continue;
}
for (size_t i = 0; i < encoded_tokens.size()-k+1; ++i) {
vector<int> ngram;
ngram.reserve(encoded_tokens.size());
for (size_t j = i; j < i+k && j < encoded_tokens.size(); ++j) {
for (size_t i = 0; i < len - k + 1; ++i) {
ngram.clear();
ngram.reserve(len);
for (size_t j = i; j < i+k && j < len; ++j) {
ngram.push_back(encoded_tokens[j]);
}
counts.Add(ngram);
}
}
return encoded_tokens.size();
return len;
}
void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles)

View File

@ -45,14 +45,7 @@ class NgramCounts {
/**
* If the specified "ngram" is found, we increment its count by one.
* If not, we insert it into the container with the default count. */
// Pre-change implementation: records one occurrence of "ngram".
void Add(const Key& ngram) {
// Look the key up first to tell an already-seen n-gram from a new one.
const_iterator it = find(ngram);
if (it != end()) {
// Already present: store the old count plus one. Note that m_counts[ngram]
// performs a second lookup of the same key.
m_counts[ngram] = it->second + 1;
} else {
// First occurrence: seed the entry with kDefaultCount.
m_counts[ngram] = kDefaultCount;
}
}
// Post-change implementation: records one occurrence of "ngram" with a single
// m_counts[ngram] access followed by an increment.
// NOTE(review): presumably m_counts is a std::map-like container whose
// operator[] inserts a zero-initialized count for a missing key; if so, this
// matches the previous find-then-assign behavior only when kDefaultCount is 1
// -- confirm against the class definition.
inline void Add(const Key& ngram) { m_counts[ngram]++; }
/**
* Return true iff the specified "ngram" is found in the container.