filter-rule-table: comments + minor clean-up.

This commit is contained in:
Phil Williams 2015-02-11 12:03:27 +00:00
parent 8fcf00869d
commit e1d60211a4
3 changed files with 27 additions and 50 deletions

View File

@ -12,10 +12,6 @@
#include <boost/make_shared.hpp>
#include <boost/program_options.hpp>
#include "util/string_piece.hh"
#include "util/string_piece_hash.hh"
#include "util/tokenize_piece.hh"
#include "syntax-common/exception.h"
#include "syntax-common/xml_tree_parser.h"
@ -111,7 +107,6 @@ void FilterRuleTable::ReadTestSet(
std::istream &input,
std::vector<boost::shared_ptr<std::string> > &sentences)
{
const util::AnyCharacter symbolDelimiter(" \t");
int lineNum = 0;
std::string line;
while (std::getline(input, line)) {
@ -121,13 +116,7 @@ void FilterRuleTable::ReadTestSet(
<< std::endl;
continue;
}
std::ostringstream tmp;
tmp << " ";
for (util::TokenIter<util::AnyCharacter, true> p(line, symbolDelimiter);
p; ++p) {
tmp << *p << " ";
}
sentences.push_back(boost::make_shared<std::string>(tmp.str()));
sentences.push_back(boost::make_shared<std::string>(line));
}
}

View File

@ -89,6 +89,9 @@ void StringCfgFilter::Filter(std::istream &in, std::ostream &out)
p; ++p) {
symbols.push_back(*p);
}
// Generate a pattern (fails if any source-side terminal is not in the
// test set vocabulary) and attempt to match it against the test sentences.
keep = GeneratePattern(symbols, pattern) && MatchPattern(pattern);
if (keep) {
out << line << std::endl;
@ -221,39 +224,27 @@ bool StringCfgFilter::MatchPattern(const Pattern &pattern) const
intersection.swap(tmp);
}
// Step 3: For each sentence in the intersection, construct a trellis
// with a column of intra-sentence positions for each subpattern.
// If there is a consistent path of position values through the
// trellis then there is a match ('consistent' here means that the
// subpatterns occur in the right order and are separated by at
// least the minimum widths required by the pattern's gaps).
// Step 3: For each sentence in the intersection, try to find a consistent
// sequence of intra-sentence positions (one for each subpattern).
// 'Consistent' here means that the subpatterns occur in the right
// order and are separated by at least the minimum widths required
// by the pattern's gaps.
for (std::vector<int>::const_iterator p = intersection.begin();
p != intersection.end(); ++p) {
const int sentenceId = *p;
const int sentenceLength = m_sentenceLengths[sentenceId];
SentenceTrellis trellis;
// For each subpattern's CoordinateTable:
for (std::vector<const CoordinateTable *>::const_iterator
q = tables.begin(); q != tables.end(); ++q) {
const CoordinateTable &table = **q;
// Add the intra-sentence position sequence as a column of the trellis.
boost::unordered_map<int, PositionSeq>::const_iterator r =
table.intraSentencePositions.find(sentenceId);
assert(r != table.intraSentencePositions.end());
trellis.columns.push_back(&(r->second));
}
// Search the trellis for a consistent sequence of position values.
if (MatchPattern(trellis, sentenceLength, pattern)) {
if (MatchPattern(pattern, tables, *p)) {
return true;
}
}
return false;
}
bool StringCfgFilter::MatchPattern(const SentenceTrellis &trellis,
int sentenceLength,
const Pattern &pattern) const
bool StringCfgFilter::MatchPattern(
const Pattern &pattern,
std::vector<const CoordinateTable *> &tables,
int sentenceId) const
{
const int sentenceLength = m_sentenceLengths[sentenceId];
// In the for loop below, we need to know the set of start position ranges
// where subpattern i is allowed to occur (rangeSet) and we are generating
// the ranges for subpattern i+1 (nextRangeSet).
@ -268,11 +259,16 @@ bool StringCfgFilter::MatchPattern(const SentenceTrellis &trellis,
// Attempt to match subpatterns.
for (int i = 0; i < pattern.subpatterns.size(); ++i) {
const PositionSeq &col = *trellis.columns[i];
// Look up the intra-sentence position sequence.
boost::unordered_map<int, PositionSeq>::const_iterator r =
tables[i]->intraSentencePositions.find(sentenceId);
assert(r != tables[i]->intraSentencePositions.end());
const PositionSeq &col = r->second;
for (PositionSeq::const_iterator p = col.begin(); p != col.end(); ++p) {
bool inRange = false;
for (std::vector<Range>::const_iterator q = rangeSet.begin();
q != rangeSet.end(); ++q) {
// TODO Use the fact that the ranges are ordered to break early.
if (*p >= q->first && *p <= q->second) {
inRange = true;
break;

View File

@ -49,7 +49,7 @@ class StringCfgFilter : public CfgFilter {
// Max NGram length.
static const std::size_t kMaxNGramLength;
// Maps symbols (terminals and non-terminals) from strings to integers.
// Maps words from strings to integers.
typedef NumberedSet<std::string, std::size_t> Vocabulary;
// An NGram is a sequence of words.
@ -83,13 +83,6 @@ class StringCfgFilter : public CfgFilter {
// A range of start positions.
typedef std::pair<int, int> Range;
// A SentenceTrellis holds the positions at which each of a pattern's
// subpatterns occur in a single sentence.
struct SentenceTrellis
{
std::vector<const PositionSeq *> columns;
};
// A CoordinateTable records the set of sentences in which a single
// n-gram occurs and for each of those sentences, the start positions
struct CoordinateTable {
@ -118,7 +111,6 @@ class StringCfgFilter : public CfgFilter {
// do not occur in the test sentence vocabulary.
bool GeneratePattern(const std::vector<StringPiece> &, Pattern &) const;
// Calculate the minimum width of the pattern suffix starting
// at subpattern i.
int MinWidth(const Pattern &p, int i) const;
@ -128,10 +120,10 @@ class StringCfgFilter : public CfgFilter {
// Try to match the pattern p against any sentence in the test set.
bool MatchPattern(const Pattern &p) const;
// Try to match the pattern p against the SentenceTrellis t of a single
// sentence.
bool MatchPattern(const SentenceTrellis &t, int sentenceLength,
const Pattern &p) const;
// Try to match the pattern p against the sentence with the given ID.
bool MatchPattern(const Pattern &p,
std::vector<const CoordinateTable *> &tables,
int id) const;
// The main search structure constructed from the test set sentences.
NGramCoordinateMap m_ngramCoordinateMap;