mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-29 06:52:34 +03:00
169 lines
5.4 KiB
C++
169 lines
5.4 KiB
C++
#include "TsgFilter.h"
|
|
|
|
#include "boost/scoped_ptr.hpp"
|
|
|
|
#include "util/string_piece.hh"
|
|
#include "util/string_piece_hash.hh"
|
|
#include "util/tokenize_piece.hh"
|
|
|
|
namespace MosesTraining
|
|
{
|
|
namespace Syntax
|
|
{
|
|
namespace FilterRuleTable
|
|
{
|
|
|
|
// Read a rule table from 'in' and filter it according to the test sentences.
|
|
//
|
|
// This involves testing TSG fragments for matches against at potential match
|
|
// sites in the set of test parse trees / forests. There are a few
|
|
// optimizations that make this reasonably fast in practice:
|
|
//
|
|
// Optimization 1
|
|
// If a rule has the same TSG fragment as the previous rule then re-use the
|
|
// result of the previous filtering decision.
|
|
//
|
|
// Optimization 2
|
|
// Test if the TSG fragment contains any symbols that don't occur in the
|
|
// symbol vocabulary of the test set. If it does then the rule can be
|
|
// discarded.
|
|
//
|
|
// Optimization 3
|
|
// Prior to filtering, a map is constructed from each distinct test set tree /
|
|
// forest vertex symbol to the set of vertices having that symbol. During
|
|
// filtering, for each rule's TSG fragment the leaf with the smallest number of
|
|
// corresponding test nodes nodes is determined. Matching is only attempted
|
|
// at those sites (this is done in MatchFragment, which has tree- and
|
|
// forest-specific implementations).
|
|
//
|
|
// Some statistics from real data (WMT14, English-German, tree-version):
|
|
//
|
|
// 4.4M Parallel sentences (source-side parsed with Berkeley parser)
|
|
// 2.7K Test sentences (newstest2014)
|
|
//
|
|
// 73.4M Original rule table size (number of distinct, composed GHKM rules)
|
|
// 22.9M Number of rules with same source-side as previous rule
|
|
// 50.5M Number of rules requiring vocabulary matching test
|
|
// 24.1M Number of rules requiring full tree matching test
|
|
// 6.7M Number of rules retained after filtering
|
|
//
|
|
void TsgFilter::Filter(std::istream &in, std::ostream &out)
|
|
{
|
|
const util::MultiCharacter delimiter("|||");
|
|
|
|
std::string line;
|
|
std::string prevLine;
|
|
StringPiece source;
|
|
bool keep = true;
|
|
int lineNum = 0;
|
|
std::vector<TreeFragmentToken> tokens;
|
|
std::vector<IdTree *> leaves;
|
|
|
|
while (std::getline(in, line)) {
|
|
++lineNum;
|
|
|
|
// Read the source-side of the rule.
|
|
util::TokenIter<util::MultiCharacter> it(line, delimiter);
|
|
|
|
// Check if this rule has the same source-side as the previous rule. If
|
|
// it does then we already know whether or not to keep the rule. This
|
|
// optimisation is based on the assumption that the rule table is sorted
|
|
// (which is the case in the standard Moses training pipeline).
|
|
if (*it == source) {
|
|
if (keep) {
|
|
out << line << std::endl;
|
|
}
|
|
continue;
|
|
}
|
|
|
|
// The source-side is different from the previous rule's.
|
|
source = *it;
|
|
|
|
// Tokenize the source-side tree fragment.
|
|
tokens.clear();
|
|
for (TreeFragmentTokenizer p(source); p != TreeFragmentTokenizer(); ++p) {
|
|
tokens.push_back(*p);
|
|
}
|
|
|
|
// Construct an IdTree representing the source-side tree fragment. This
|
|
// will fail if the fragment contains any symbols that don't occur in
|
|
// m_testVocab and in that case the rule can be discarded. In practice,
|
|
// this catches a lot of discardable rules (see comment at the top of this
|
|
// function). If the fragment is successfully created then we attempt to
|
|
// match the tree fragment against the test trees. This test is exact, but
|
|
// slow.
|
|
int i = 0;
|
|
leaves.clear();
|
|
boost::scoped_ptr<IdTree> fragment(BuildTree(tokens, i, leaves));
|
|
keep = fragment.get() && MatchFragment(*fragment, leaves);
|
|
if (keep) {
|
|
out << line << std::endl;
|
|
}
|
|
|
|
// Retain line for the next iteration (in order that the source StringPiece
|
|
// remains valid).
|
|
prevLine.swap(line);
|
|
}
|
|
}
|
|
|
|
TsgFilter::IdTree *TsgFilter::BuildTree(
|
|
const std::vector<TreeFragmentToken> &tokens, int &i,
|
|
std::vector<IdTree *> &leaves)
|
|
{
|
|
// The subtree starting at tokens[i] is either:
|
|
// 1. a single non-variable symbol (like NP or dog), or
|
|
// 2. a variable symbol (like [NP]), or
|
|
// 3. a subtree with children (like [NP [DT] [NN dog]])
|
|
|
|
// First check for case 1.
|
|
if (tokens[i].type == TreeFragmentToken_WORD) {
|
|
Vocabulary::IdType id = m_testVocab.Lookup(tokens[i++].value,
|
|
StringPieceCompatibleHash(),
|
|
StringPieceCompatibleEquals());
|
|
if (id == Vocabulary::NullId()) {
|
|
return 0;
|
|
}
|
|
leaves.push_back(new IdTree(id));
|
|
return leaves.back();
|
|
}
|
|
|
|
// We must be dealing with either case 2 or 3. Case 2 looks like case 3 but
|
|
// without the children.
|
|
assert(tokens[i].type == TreeFragmentToken_LSB);
|
|
|
|
// Skip over the opening [
|
|
++i;
|
|
|
|
// Read the root symbol of the subtree.
|
|
Vocabulary::IdType id = m_testVocab.Lookup(tokens[i++].value,
|
|
StringPieceCompatibleHash(),
|
|
StringPieceCompatibleEquals());
|
|
if (id == Vocabulary::NullId()) {
|
|
return 0;
|
|
}
|
|
IdTree *root = new IdTree(id);
|
|
|
|
// Read the children (in case 2 there won't be any).
|
|
while (tokens[i].type != TreeFragmentToken_RSB) {
|
|
IdTree *child = BuildTree(tokens, i, leaves);
|
|
if (!child) {
|
|
delete root;
|
|
return 0;
|
|
}
|
|
root->children().push_back(child);
|
|
child->parent() = root;
|
|
}
|
|
|
|
if (root->IsLeaf()) {
|
|
leaves.push_back(root);
|
|
}
|
|
|
|
// Skip over the closing ] and we're done.
|
|
++i;
|
|
return root;
|
|
}
|
|
|
|
} // namespace FilterRuleTable
|
|
} // namespace Syntax
|
|
} // namespace MosesTraining
|