mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-10-26 19:37:58 +03:00
Further optimization for extractor.
Fixes inefficient updating of N-gram counts. NOTE: Using the '--binary' option (which is not yet enabled by default) for saving outputs leads to a significant speed-up.
This commit is contained in:
parent
8fdec9bf30
commit
2a3c9fc679
@ -65,21 +65,24 @@ size_t BleuScorer::CountNgrams(const string& line, NgramCounts& counts,
|
||||
} else {
|
||||
TokenizeAndEncode(line, encoded_tokens);
|
||||
}
|
||||
const size_t len = encoded_tokens.size();
|
||||
vector<int> ngram;
|
||||
|
||||
for (size_t k = 1; k <= n; ++k) {
|
||||
//ngram order longer than sentence - no point
|
||||
if (k > encoded_tokens.size()) {
|
||||
if (k > len) {
|
||||
continue;
|
||||
}
|
||||
for (size_t i = 0; i < encoded_tokens.size()-k+1; ++i) {
|
||||
vector<int> ngram;
|
||||
ngram.reserve(encoded_tokens.size());
|
||||
for (size_t j = i; j < i+k && j < encoded_tokens.size(); ++j) {
|
||||
for (size_t i = 0; i < len - k + 1; ++i) {
|
||||
ngram.clear();
|
||||
ngram.reserve(len);
|
||||
for (size_t j = i; j < i+k && j < len; ++j) {
|
||||
ngram.push_back(encoded_tokens[j]);
|
||||
}
|
||||
counts.Add(ngram);
|
||||
}
|
||||
}
|
||||
return encoded_tokens.size();
|
||||
return len;
|
||||
}
|
||||
|
||||
void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles)
|
||||
|
@ -45,14 +45,7 @@ class NgramCounts {
|
||||
/**
|
||||
* If the specified "ngram" is found, we increment its count by one.
|
||||
* If not, we insert the default count in the container. */
|
||||
void Add(const Key& ngram) {
|
||||
const_iterator it = find(ngram);
|
||||
if (it != end()) {
|
||||
m_counts[ngram] = it->second + 1;
|
||||
} else {
|
||||
m_counts[ngram] = kDefaultCount;
|
||||
}
|
||||
}
|
||||
// Bump the count stored for "ngram" by one using a single subscript access.
// NOTE(review): presumably m_counts is a std::map-like container whose
// operator[] creates a zero-initialized entry for an unseen key, so a new
// n-gram ends up with a count of 1 -- confirm against the member declaration.
inline void Add(const Key& ngram) {
  ++m_counts[ngram];
}
|
||||
|
||||
/**
|
||||
* Return true iff the specified "ngram" is found in the container.
|
||||
|
Loading…
Reference in New Issue
Block a user