mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2025-01-02 17:09:36 +03:00
5cca6fafcb
Mostly fixes from Tetsuo Kiso and Jonathan Graehl
293 lines
8.1 KiB
C++
293 lines
8.1 KiB
C++
#include "lm/filter/phrase.hh"
|
|
|
|
#include "lm/filter/format.hh"
|
|
|
|
#include <algorithm>
|
|
#include <functional>
|
|
#include <iostream>
|
|
#include <queue>
|
|
#include <string>
|
|
#include <vector>
|
|
|
|
#include <ctype.h>
|
|
|
|
namespace lm {
|
|
namespace phrase {
|
|
|
|
unsigned int ReadMultiple(std::istream &in, Substrings &out) {
|
|
bool sentence_content = false;
|
|
unsigned int sentence_id = 0;
|
|
std::vector<Hash> phrase;
|
|
std::string word;
|
|
while (in) {
|
|
char c;
|
|
// Gather a word.
|
|
while (!isspace(c = in.get()) && in) word += c;
|
|
// Treat EOF like a newline.
|
|
if (!in) c = '\n';
|
|
// Add the word to the phrase.
|
|
if (!word.empty()) {
|
|
phrase.push_back(util::MurmurHashNative(word.data(), word.size()));
|
|
word.clear();
|
|
}
|
|
if (c == ' ') continue;
|
|
// It's more than just a space. Close out the phrase.
|
|
if (!phrase.empty()) {
|
|
sentence_content = true;
|
|
out.AddPhrase(sentence_id, phrase.begin(), phrase.end());
|
|
phrase.clear();
|
|
}
|
|
if (c == '\t' || c == '\v') continue;
|
|
// It's more than a space or tab: a newline.
|
|
if (sentence_content) {
|
|
++sentence_id;
|
|
sentence_content = false;
|
|
}
|
|
}
|
|
if (!in.eof()) in.exceptions(std::istream::failbit | std::istream::badbit);
|
|
return sentence_id + sentence_content;
|
|
}
|
|
|
|
namespace {

// Identifier of one input sentence (the ids handed out by ReadMultiple).
typedef unsigned int Sentence;

// A list of sentence ids.  The Arc code below searches these lists with
// std::lower_bound, so they are expected to be sorted in increasing order.
typedef std::vector<Sentence> Sentences;

} // namespace
|
|
|
|
namespace detail {
|
|
|
|
// The literal token "</s>" as a StringPiece.
// NOTE(review): not referenced anywhere in the visible part of this file;
// presumably the end-of-sentence marker used by the header -- confirm.
const StringPiece kEndSentence("</s>");
|
|
|
|
class Arc {
|
|
public:
|
|
Arc() {}
|
|
|
|
// For arcs from one vertex to another.
|
|
void SetPhrase(detail::Vertex &from, detail::Vertex &to, const Sentences &intersect) {
|
|
Set(to, intersect);
|
|
from_ = &from;
|
|
}
|
|
|
|
/* For arcs from before the n-gram begins to somewhere in the n-gram (right
|
|
* aligned). These have no from_ vertex; it implictly matches every
|
|
* sentence. This also handles when the n-gram is a substring of a phrase.
|
|
*/
|
|
void SetRight(detail::Vertex &to, const Sentences &complete) {
|
|
Set(to, complete);
|
|
from_ = NULL;
|
|
}
|
|
|
|
Sentence Current() const {
|
|
return *current_;
|
|
}
|
|
|
|
bool Empty() const {
|
|
return current_ == last_;
|
|
}
|
|
|
|
/* When this function returns:
|
|
* If Empty() then there's nothing left from this intersection.
|
|
*
|
|
* If Current() == to then to is part of the intersection.
|
|
*
|
|
* Otherwise, Current() > to. In this case, to is not part of the
|
|
* intersection and neither is anything < Current(). To determine if
|
|
* any value >= Current() is in the intersection, call LowerBound again
|
|
* with the value.
|
|
*/
|
|
void LowerBound(const Sentence to);
|
|
|
|
private:
|
|
void Set(detail::Vertex &to, const Sentences &sentences);
|
|
|
|
const Sentence *current_;
|
|
const Sentence *last_;
|
|
detail::Vertex *from_;
|
|
};
|
|
|
|
struct ArcGreater : public std::binary_function<const Arc *, const Arc *, bool> {
|
|
bool operator()(const Arc *first, const Arc *second) const {
|
|
return first->Current() > second->Current();
|
|
}
|
|
};
|
|
|
|
class Vertex {
|
|
public:
|
|
Vertex() : current_(0) {}
|
|
|
|
Sentence Current() const {
|
|
return current_;
|
|
}
|
|
|
|
bool Empty() const {
|
|
return incoming_.empty();
|
|
}
|
|
|
|
void LowerBound(const Sentence to);
|
|
|
|
private:
|
|
friend class Arc;
|
|
|
|
void AddIncoming(Arc *arc) {
|
|
if (!arc->Empty()) incoming_.push(arc);
|
|
}
|
|
|
|
unsigned int current_;
|
|
std::priority_queue<Arc*, std::vector<Arc*>, ArcGreater> incoming_;
|
|
};
|
|
|
|
void Arc::LowerBound(const Sentence to) {
|
|
current_ = std::lower_bound(current_, last_, to);
|
|
// If *current_ > to, don't advance from_. The intervening values of
|
|
// from_ may be useful for another one of its outgoing arcs.
|
|
if (!from_ || Empty() || (Current() > to)) return;
|
|
assert(Current() == to);
|
|
from_->LowerBound(to);
|
|
if (from_->Empty()) {
|
|
current_ = last_;
|
|
return;
|
|
}
|
|
assert(from_->Current() >= to);
|
|
if (from_->Current() > to) {
|
|
current_ = std::lower_bound(current_ + 1, last_, from_->Current());
|
|
}
|
|
}
|
|
|
|
// Point this arc at the half-open range covering all of sentences and
// register it as an incoming arc of `to`.  The vector is not copied; it must
// stay alive and unmodified while the arc is in use.
void Arc::Set(Vertex &to, const Sentences &sentences) {
  // Derive the bounds from front() and size().  Dereferencing end() (or
  // begin() of an empty vector) is undefined behaviour and aborts under
  // checked-iterator builds.
  if (sentences.empty()) {
    current_ = NULL;
    last_ = NULL;
  } else {
    current_ = &sentences.front();
    last_ = current_ + sentences.size();
  }
  // AddIncoming ignores the arc when it is already Empty().
  to.AddIncoming(this);
}
|
|
|
|
void Vertex::LowerBound(const Sentence to) {
|
|
if (Empty()) return;
|
|
// Union lower bound.
|
|
while (true) {
|
|
Arc *top = incoming_.top();
|
|
if (top->Current() > to) {
|
|
current_ = top->Current();
|
|
return;
|
|
}
|
|
// If top->Current() == to, we still need to verify that's an actual
|
|
// element and not just a bound.
|
|
incoming_.pop();
|
|
top->LowerBound(to);
|
|
if (!top->Empty()) {
|
|
incoming_.push(top);
|
|
if (top->Current() == to) {
|
|
current_ = to;
|
|
return;
|
|
}
|
|
} else if (Empty()) {
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
|
|
} // namespace detail
|
|
|
|
namespace {
|
|
|
|
void BuildGraph(const Substrings &phrase, const std::vector<Hash> &hashes, detail::Vertex *const vertices, detail::Arc *free_arc) {
|
|
using detail::Vertex;
|
|
using detail::Arc;
|
|
assert(!hashes.empty());
|
|
|
|
const Hash *const first_word = &*hashes.begin();
|
|
const Hash *const last_word = &*hashes.end() - 1;
|
|
|
|
Hash hash = 0;
|
|
const Sentences *found;
|
|
// Phrases starting at or before the first word in the n-gram.
|
|
{
|
|
Vertex *vertex = vertices;
|
|
for (const Hash *word = first_word; ; ++word, ++vertex) {
|
|
hash = util::MurmurHashNative(&hash, sizeof(uint64_t), *word);
|
|
// Now hash is [hashes.begin(), word].
|
|
if (word == last_word) {
|
|
if (phrase.FindSubstring(hash, found))
|
|
(free_arc++)->SetRight(*vertex, *found);
|
|
break;
|
|
}
|
|
if (!phrase.FindRight(hash, found)) break;
|
|
(free_arc++)->SetRight(*vertex, *found);
|
|
}
|
|
}
|
|
|
|
// Phrases starting at the second or later word in the n-gram.
|
|
Vertex *vertex_from = vertices;
|
|
for (const Hash *word_from = first_word + 1; word_from != &*hashes.end(); ++word_from, ++vertex_from) {
|
|
hash = 0;
|
|
Vertex *vertex_to = vertex_from + 1;
|
|
for (const Hash *word_to = word_from; ; ++word_to, ++vertex_to) {
|
|
// Notice that word_to and vertex_to have the same index.
|
|
hash = util::MurmurHashNative(&hash, sizeof(uint64_t), *word_to);
|
|
// Now hash covers [word_from, word_to].
|
|
if (word_to == last_word) {
|
|
if (phrase.FindLeft(hash, found))
|
|
(free_arc++)->SetPhrase(*vertex_from, *vertex_to, *found);
|
|
break;
|
|
}
|
|
if (!phrase.FindPhrase(hash, found)) break;
|
|
(free_arc++)->SetPhrase(*vertex_from, *vertex_to, *found);
|
|
}
|
|
}
|
|
}
|
|
|
|
} // namespace
|
|
|
|
namespace detail {
|
|
|
|
// Here instead of header due to forward declaration.
|
|
ConditionCommon::ConditionCommon(const Substrings &substrings) : substrings_(substrings) {}
|
|
|
|
// Rest of the variables are temporaries anyway
|
|
ConditionCommon::ConditionCommon(const ConditionCommon &from) : substrings_(from.substrings_) {}
|
|
|
|
ConditionCommon::~ConditionCommon() {}
|
|
|
|
// Rebuild the matching graph for the n-gram currently stored in hashes_ and
// return the vertex of its last word.  hashes_ must not be empty.  The
// returned reference points into vertices_ and is invalidated by the next
// call.
detail::Vertex &ConditionCommon::MakeGraph() {
  assert(!hashes_.empty());
  const std::size_t words = hashes_.size();

  // clear() before resize() so every vertex and arc is freshly constructed.
  vertices_.clear();
  vertices_.resize(words);

  // One for every substring.
  arcs_.clear();
  arcs_.resize(((words + 1) * words) / 2);

  BuildGraph(substrings_, hashes_, &vertices_.front(), &arcs_.front());
  return vertices_.back();
}
|
|
|
|
} // namespace detail
|
|
|
|
bool Union::Evaluate() {
|
|
detail::Vertex &last_vertex = MakeGraph();
|
|
unsigned int lower = 0;
|
|
while (true) {
|
|
last_vertex.LowerBound(lower);
|
|
if (last_vertex.Empty()) return false;
|
|
if (last_vertex.Current() == lower) return true;
|
|
lower = last_vertex.Current();
|
|
}
|
|
}
|
|
|
|
template <class Output> void Multiple::Evaluate(const StringPiece &line, Output &output) {
|
|
detail::Vertex &last_vertex = MakeGraph();
|
|
unsigned int lower = 0;
|
|
while (true) {
|
|
last_vertex.LowerBound(lower);
|
|
if (last_vertex.Empty()) return;
|
|
if (last_vertex.Current() == lower) {
|
|
output.SingleAddNGram(lower, line);
|
|
++lower;
|
|
} else {
|
|
lower = last_vertex.Current();
|
|
}
|
|
}
|
|
}
|
|
|
|
// Explicit instantiations: the template body lives in this file, so every
// Output type it is used with has to be listed here.
template void Multiple::Evaluate<CountFormat::Multiple>(
    const StringPiece &line, CountFormat::Multiple &output);
template void Multiple::Evaluate<ARPAFormat::Multiple>(
    const StringPiece &line, ARPAFormat::Multiple &output);
template void Multiple::Evaluate<MultipleOutputBuffer>(
    const StringPiece &line, MultipleOutputBuffer &output);
|
|
|
|
} // namespace phrase
|
|
} // namespace lm
|