mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-29 06:52:34 +03:00
121 lines
3.7 KiB
C++
121 lines
3.7 KiB
C++
#include "TreeTsgFilter.h"
|
|
|
|
namespace MosesTraining
|
|
{
|
|
namespace Syntax
|
|
{
|
|
namespace FilterRuleTable
|
|
{
|
|
|
|
// Builds the filter's internal state from the given sentence trees:
//   - m_sentences receives one IdTree per input StringTree, in input order;
//   - m_labelToTree receives, for every vocabulary id, pointers to all the
//     IdTree nodes carrying that id.
TreeTsgFilter::TreeTsgFilter(
  const std::vector<boost::shared_ptr<StringTree> > &sentences)
{
  // First pass: convert every StringTree.  Each conversion inserts node
  // values into m_testVocab, so this pass has to finish before the map
  // below is sized from the vocabulary.
  const std::size_t numSentences = sentences.size();
  m_sentences.reserve(numSentences);
  for (std::size_t i = 0; i < numSentences; ++i) {
    boost::shared_ptr<IdTree> idTree(StringTreeToIdTree(*sentences[i]));
    m_sentences.push_back(idTree);
  }

  // Second pass: one slot per vocabulary entry, then register every node of
  // every converted tree under its value.
  m_labelToTree.resize(m_testVocab.Size());
  for (std::size_t i = 0; i < m_sentences.size(); ++i) {
    AddNodesToMap(*m_sentences[i]);
  }
}
|
|
|
|
// Recursively builds an IdTree with the same shape as s.  Every node's value
// is the id returned by inserting the corresponding StringTree value into
// m_testVocab.  The returned node is allocated with new; the constructor in
// this file hands the result to a boost::shared_ptr.
TreeTsgFilter::IdTree *TreeTsgFilter::StringTreeToIdTree(const StringTree &s)
{
  // The node's own value is inserted into the vocabulary before any of its
  // children are visited.
  IdTree *root = new IdTree(m_testVocab.Insert(s.value()));

  const std::vector<StringTree*> &srcKids = s.children();
  const std::size_t numKids = srcKids.size();

  std::vector<IdTree*> &dstKids = root->children();
  dstKids.reserve(numKids);

  // Convert the children left to right and link each one to the new node.
  for (std::size_t i = 0; i < numKids; ++i) {
    IdTree *kid = StringTreeToIdTree(*srcKids[i]);
    kid->parent() = root;
    dstKids.push_back(kid);
  }
  return root;
}
|
|
|
|
void TreeTsgFilter::AddNodesToMap(const IdTree &tree)
|
|
{
|
|
m_labelToTree[tree.value()].push_back(&tree);
|
|
const std::vector<IdTree*> &children = tree.children();
|
|
for (std::vector<IdTree*>::const_iterator p = children.begin();
|
|
p != children.end(); ++p) {
|
|
AddNodesToMap(**p);
|
|
}
|
|
}
|
|
|
|
bool TreeTsgFilter::MatchFragment(const IdTree &fragment,
|
|
const std::vector<IdTree *> &leaves)
|
|
{
|
|
typedef std::vector<const IdTree *> TreeVec;
|
|
|
|
// Determine which of the fragment's leaves has the smallest number of
|
|
// subtree matches in the test set. If the fragment contains a rare word
|
|
// (which is pretty likely assuming a Zipfian distribution) then we only
|
|
// have to try matching the fragment against a small number of potential
|
|
// match sites.
|
|
const IdTree *rarestLeaf = leaves[0];
|
|
std::size_t lowestCount = m_labelToTree[rarestLeaf->value()].size();
|
|
for (std::size_t i = 1; i < leaves.size(); ++i) {
|
|
const IdTree *leaf = leaves[i];
|
|
std::size_t count = m_labelToTree[leaf->value()].size();
|
|
if (count < lowestCount) {
|
|
lowestCount = count;
|
|
rarestLeaf = leaf;
|
|
}
|
|
}
|
|
|
|
// Determine the depth of the chosen leaf.
|
|
const std::size_t depth = rarestLeaf->Depth();
|
|
|
|
// Try to match the rule fragment against the test set subtrees where a
|
|
// leaf match was found.
|
|
TreeVec &nodes = m_labelToTree[rarestLeaf->value()];
|
|
for (TreeVec::const_iterator p = nodes.begin(); p != nodes.end(); ++p) {
|
|
// Navigate 'depth' positions up the subtree to find the root of the
|
|
// potential match site.
|
|
const IdTree *t = *p;
|
|
std::size_t d = depth;
|
|
while (d && t->parent()) {
|
|
t = t->parent();
|
|
--d;
|
|
}
|
|
if (d > 0) {
|
|
// The potential match site is not tall enough.
|
|
continue;
|
|
}
|
|
if (MatchFragment(fragment, *t)) {
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
bool TreeTsgFilter::MatchFragment(const IdTree &fragment, const IdTree &tree)
|
|
{
|
|
if (fragment.value() != tree.value()) {
|
|
return false;
|
|
}
|
|
const std::vector<IdTree*> &fragChildren = fragment.children();
|
|
const std::vector<IdTree*> &treeChildren = tree.children();
|
|
if (!fragChildren.empty() && fragChildren.size() != treeChildren.size()) {
|
|
return false;
|
|
}
|
|
for (std::size_t i = 0; i < fragChildren.size(); ++i) {
|
|
if (!MatchFragment(*fragChildren[i], *treeChildren[i])) {
|
|
return false;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
} // namespace FilterRuleTable
|
|
} // namespace Syntax
|
|
} // namespace MosesTraining
|