Add score-stsg, a program for scoring STSG extract files

This commit is contained in:
Phil Williams 2014-12-02 17:10:20 +00:00
parent 56921cae3b
commit f84f159247
20 changed files with 1295 additions and 0 deletions

View File

@@ -225,6 +225,7 @@ phrase-extract//extract-ghkm
phrase-extract//pcfg-extract
phrase-extract//pcfg-score
phrase-extract//extract-mixed-syntax
phrase-extract//score-stsg
biconcor
mira//mira
contrib/server//mosesserver

View File

@@ -0,0 +1,23 @@
#pragma once
#include <string>
namespace Moses
{
namespace ScoreStsg
{
class Exception
{
public:
Exception(const char *msg) : m_msg(msg) {}
Exception(const std::string &msg) : m_msg(msg) {}
const std::string &GetMsg() const {
return m_msg;
}
private:
std::string m_msg;
};
} // namespace ScoreStsg
} // namespace Moses

View File

@@ -0,0 +1 @@
exe score-stsg : [ glob *.cpp ] ..//deps ../..//boost_iostreams ../..//boost_program_options ../..//z : <include>.. ;

View File

@@ -0,0 +1,53 @@
#include "LexicalTable.h"
#include "util/tokenize_piece.hh"
#include <cstdlib>
#include <iostream>
namespace Moses
{
namespace ScoreStsg
{
LexicalTable::LexicalTable(Vocabulary &srcVocab, Vocabulary &tgtVocab)
: m_srcVocab(srcVocab)
, m_tgtVocab(tgtVocab)
{
}
void LexicalTable::Load(std::istream &input)
{
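// Each input line is expected to carry three whitespace-separated fields:
//
//   <target-word> <source-word> <probability>
//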
const util::AnyCharacter delimiter(" \t");
std::string line;
std::string tmp;
int i = 0;
while (getline(input, line)) {
++i;
if (i%100000 == 0) {
std::cerr << ".";
}
util::TokenIter<util::AnyCharacter> it(line, delimiter);
// Target word
it->CopyToString(&tmp);
Vocabulary::IdType tgtId = m_tgtVocab.Insert(tmp);
++it;
// Source word.
it->CopyToString(&tmp);
Vocabulary::IdType srcId = m_srcVocab.Insert(tmp);
++it;
// Probability.
it->CopyToString(&tmp);
double prob = atof(tmp.c_str());
m_table[srcId][tgtId] = prob;
}
std::cerr << std::endl;
}
} // namespace ScoreStsg
} // namespace Moses

View File

@@ -0,0 +1,43 @@
#pragma once
#include <istream>
#include <string>
#include <vector>
#include <boost/unordered_map.hpp>
#include "Vocabulary.h"
namespace Moses
{
namespace ScoreStsg
{
class LexicalTable
{
public:
LexicalTable(Vocabulary &, Vocabulary &);
void Load(std::istream &);
double PermissiveLookup(Vocabulary::IdType s, Vocabulary::IdType t) {
OuterMap::const_iterator p = m_table.find(s);
if (p == m_table.end()) {
return 1.0;
}
const InnerMap &inner = p->second;
InnerMap::const_iterator q = inner.find(t);
return q == inner.end() ? 1.0 : q->second;
}
private:
typedef boost::unordered_map<Vocabulary::IdType, double> InnerMap;
typedef boost::unordered_map<Vocabulary::IdType, InnerMap> OuterMap;
Vocabulary &m_srcVocab;
Vocabulary &m_tgtVocab;
OuterMap m_table;
};
} // namespace ScoreStsg
} // namespace Moses
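
A minimal sketch of how LexicalTable and Vocabulary are meant to interact (the function name, words and probabilities below are illustrative only, not part of this commit):

#include <sstream>
#include "LexicalTable.h"
#include "Vocabulary.h"

void LexicalTableExample()
{
  using namespace Moses::ScoreStsg;
  Vocabulary srcVocab, tgtVocab;
  LexicalTable lexTable(srcVocab, tgtVocab);

  // One "<target> <source> <probability>" triple per line.
  std::istringstream in("house Haus 0.8\nbuilding Haus 0.2\n");
  lexTable.Load(in);

  // Look up using the IDs assigned by the shared vocabularies.
  Vocabulary::IdType s = srcVocab.Lookup("Haus");
  Vocabulary::IdType t = tgtVocab.Lookup("house");
  double p = lexTable.PermissiveLookup(s, t);                       // 0.8
  double q = lexTable.PermissiveLookup(s, tgtVocab.Lookup("moon")); // unseen pair -> 1.0
  (void)p; (void)q;
}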

View File

@@ -0,0 +1,7 @@
#include "ScoreStsg.h"
int main(int argc, char *argv[])
{
Moses::ScoreStsg::ScoreStsg tool;
return tool.Main(argc, argv);
}

View File

@@ -0,0 +1,110 @@
#pragma once
#include <limits>
#include <sstream>
#include <vector>
#include <boost/unordered_map.hpp>
#include "Exception.h"
namespace Moses {
namespace ScoreStsg {
// Stores a set of elements of type T, each of which is allocated an integral
// ID of type I. IDs are contiguous starting at 0. Individual elements cannot
// be removed once inserted (but the whole set can be cleared).
template<typename T, typename I=size_t>
class NumberedSet {
private:
typedef boost::unordered_map<T, I> ElementToIdMap;
typedef std::vector<const T *> IdToElementMap;
public:
typedef I IdType;
typedef typename IdToElementMap::const_iterator const_iterator;
NumberedSet() {}
const_iterator begin() const { return id_to_element_.begin(); }
const_iterator end() const { return id_to_element_.end(); }
// Special ID value used to indicate that an element is not present.
static I NullId() { return std::numeric_limits<I>::max(); }
bool IsEmpty() const { return id_to_element_.empty(); }
size_t Size() const { return id_to_element_.size(); }
// Insert the given object and return its ID.
I Insert(const T &);
// Look up the given object and return its ID.
I Lookup(const T &) const;
// Look up the given object using an alternative key type (this is useful if,
// for example, T is std::string and you want to look up the object using a
// StringPiece).
template<typename CompatibleKey, typename CompatibleHash,
typename CompatiblePredicate>
I Lookup(const CompatibleKey &, const CompatibleHash &,
const CompatiblePredicate &) const;
// Look up the object with the given ID.
const T &Lookup(I) const;
void Clear();
private:
ElementToIdMap element_to_id_;
IdToElementMap id_to_element_;
};
template<typename T, typename I>
I NumberedSet<T, I>::Lookup(const T &s) const {
typename ElementToIdMap::const_iterator p = element_to_id_.find(s);
return (p == element_to_id_.end()) ? NullId() : p->second;
}
template<typename T, typename I>
template<typename CompatibleKey, typename CompatibleHash,
typename CompatiblePredicate>
I NumberedSet<T, I>::Lookup(const CompatibleKey &key,
const CompatibleHash &hash,
const CompatiblePredicate &pred) const {
typename ElementToIdMap::const_iterator p =
element_to_id_.find(key, hash, pred);
return (p == element_to_id_.end()) ? NullId() : p->second;
}
template<typename T, typename I>
const T &NumberedSet<T, I>::Lookup(I id) const {
// FIXME Need to check id is > 0 iff I is a signed type.
//if (id < 0 || id >= id_to_element_.size()) {
if (id >= id_to_element_.size()) {
std::ostringstream msg;
msg << "Value not found: " << id;
throw Exception(msg.str());
}
return *(id_to_element_[id]);
}
template<typename T, typename I>
I NumberedSet<T, I>::Insert(const T &x) {
std::pair<T, I> value(x, id_to_element_.size());
std::pair<typename ElementToIdMap::iterator, bool> result =
element_to_id_.insert(value);
if (result.second) {
// x is a new element.
id_to_element_.push_back(&result.first->first);
}
return result.first->second;
}
template<typename T, typename I>
void NumberedSet<T, I>::Clear() {
element_to_id_.clear();
id_to_element_.clear();
}
} // namespace ScoreStsg
} // namespace Moses
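
A minimal usage sketch for NumberedSet (the function name and element values are illustrative only):

#include <cassert>
#include <string>
#include "NumberedSet.h"

void NumberedSetExample()
{
  typedef Moses::ScoreStsg::NumberedSet<std::string> StringSet;
  StringSet words;
  StringSet::IdType a = words.Insert("der");    // first element gets ID 0
  StringSet::IdType b = words.Insert("Hund");   // IDs are contiguous: 1
  assert(words.Insert("der") == a);             // re-inserting returns the old ID
  assert(words.Lookup("Hund") == b);            // element -> ID
  assert(words.Lookup(b) == "Hund");            // ID -> element
  assert(words.Lookup("Katze") == StringSet::NullId());  // not present
}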

View File

@@ -0,0 +1,41 @@
#pragma once
#include <string>
namespace Moses
{
namespace ScoreStsg
{
struct Options {
public:
Options()
: goodTuring(false)
, inverse(false)
, kneserNey(false)
, logProb(false)
, minCountHierarchical(0)
, negLogProb(false)
, noLex(false)
, noWordAlignment(false)
, pcfg(false) {}
// Positional options
std::string extractFile;
std::string lexFile;
std::string tableFile;
// All other options
bool goodTuring;
bool inverse;
bool kneserNey;
bool logProb;
int minCountHierarchical;
bool negLogProb;
bool noLex;
bool noWordAlignment;
bool pcfg;
};
} // namespace ScoreStsg
} // namespace Moses

View File

@@ -0,0 +1,42 @@
#include "RuleGroup.h"
namespace Moses
{
namespace ScoreStsg
{
void RuleGroup::SetNewSource(const StringPiece &source)
{
source.CopyToString(&m_source);
m_distinctRules.clear();
m_totalCount = 0;
}
void RuleGroup::AddRule(const StringPiece &target, const StringPiece &ntAlign,
const StringPiece &fullAlign, int count)
{
if (m_distinctRules.empty() ||
ntAlign != m_distinctRules.back().ntAlign ||
target != m_distinctRules.back().target) {
DistinctRule r;
target.CopyToString(&r.target);
ntAlign.CopyToString(&r.ntAlign);
r.alignments.resize(r.alignments.size()+1);
fullAlign.CopyToString(&r.alignments.back().first);
r.alignments.back().second = count;
r.count = count;
m_distinctRules.push_back(r);
} else {
DistinctRule &r = m_distinctRules.back();
if (r.alignments.back().first != fullAlign) {
r.alignments.resize(r.alignments.size()+1);
fullAlign.CopyToString(&r.alignments.back().first);
}
r.alignments.back().second += count;
r.count += count;
}
m_totalCount += count;
}
} // namespace ScoreStsg
} // namespace Moses

View File

@@ -0,0 +1,68 @@
#pragma once
#include <string>
#include <utility>
#include <vector>
#include "util/string_piece.hh"
namespace Moses
{
namespace ScoreStsg
{
// A group of rules that share the same source-side. Rules are added through
// calls to SetNewSource() and AddRule(). They can then be accessed via the
// iterators.
//
// It is assumed that rules with the same (target, ntAlign, alignment) value
// will be added consecutively, and so will rules with the same
// (target, ntAlign) value. In other words, it is assumed that rules will be
// added in the order they occur in a correctly-sorted extract file.
class RuleGroup
{
public:
// Stores the target-side and NT-alignment of a distinct rule. Also records
// the rule's count and the observed symbol alignments (plus their
// frequencies).
struct DistinctRule {
std::string target;
std::string ntAlign;
std::vector<std::pair<std::string, int> > alignments;
int count;
};
typedef std::vector<DistinctRule>::const_iterator ConstIterator;
// Begin and End iterators for iterating over the group's distinct rules.
ConstIterator Begin() const { return m_distinctRules.begin(); }
ConstIterator End() const { return m_distinctRules.end(); }
// Get the current source-side value.
const std::string &GetSource() const { return m_source; }
// Get the number of distinct rules.
int GetSize() const { return m_distinctRules.size(); }
// Get the total count.
int GetTotalCount() const { return m_totalCount; }
// Clear the rule group and set a new source-side value. This must be
// done once for every new source-side value, prior to the first call to
// AddRule().
void SetNewSource(const StringPiece &source);
// Add a rule. To determine rule distinctness, the target and ntAlign
// values will be checked against those of the previous rule only (in other
// words, the input is assumed to be ordered).
void AddRule(const StringPiece &target, const StringPiece &ntAlign,
const StringPiece &fullAlign, int count);
private:
std::string m_source;
std::vector<DistinctRule> m_distinctRules;
int m_totalCount;
};
} // namespace ScoreStsg
} // namespace Moses
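
A minimal sketch of the calling pattern RuleGroup expects, assuming the extract-file fields have already been split (all values below are made up):

#include <cassert>
#include "RuleGroup.h"

void RuleGroupExample()
{
  Moses::ScoreStsg::RuleGroup group;

  // One group per source side; rules must arrive sorted by source.
  group.SetNewSource("[X] [X] Haus [X]");
  group.AddRule("[X] [X] house [X]", "0-0 1-1 3-2", "0-0 1-1 2-2 3-3", 2);
  group.AddRule("[X] [X] house [X]", "0-0 1-1 3-2", "0-0 1-1 2-2 3-3", 1);
  group.AddRule("[X] [X] building [X]", "0-0 1-1 3-2", "0-0 1-1 2-2 3-3", 1);

  assert(group.GetSize() == 2);        // two distinct (target, ntAlign) pairs
  assert(group.GetTotalCount() == 4);  // counts are accumulated
}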

View File

@@ -0,0 +1,17 @@
#pragma once
#include "util/string_piece.hh"
namespace Moses
{
namespace ScoreStsg
{
struct RuleSymbol
{
StringPiece value;
bool isNonTerminal;
};
} // namespace ScoreStsg
} // namespace Moses

View File

@@ -0,0 +1,78 @@
#include "RuleTableWriter.h"
#include <cassert>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <iterator>
#include <string>
#include <sstream>
#include <vector>
#include "util/string_piece.hh"
#include "util/tokenize_piece.hh"
#include "Exception.h"
#include "InputFileStream.h"
#include "LexicalTable.h"
#include "OutputFileStream.h"
#include "Options.h"
#include "RuleGroup.h"
namespace Moses
{
namespace ScoreStsg
{
void RuleTableWriter::WriteLine(const TokenizedRuleHalf &source,
const TokenizedRuleHalf &target,
const std::string &bestAlignment,
double lexScore, int count, int totalCount,
int distinctCount)
{
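// Illustrative sketch of the emitted line (field values are made up):
//
//   <source> ||| <target> ||| 0-0 1-1 ||| 0.25 ||| 10 4 |||
//
// i.e. the two rule halves, the most frequent symbol alignment, the lexical
// score (omitted with NoLex), then "<totalCount> <count>", with the distinct
// count appended when KneserNey is set.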
if (m_options.inverse) {
WriteRuleHalf(target);
m_out << " ||| ";
WriteRuleHalf(source);
} else {
WriteRuleHalf(source);
m_out << " ||| ";
WriteRuleHalf(target);
}
m_out << " |||" << bestAlignment << "||| ";
if (!m_options.noLex) {
m_out << MaybeLog(lexScore);
}
// TODO PCFG
m_out << " ||| " << totalCount << " " << count;
if (m_options.kneserNey) {
m_out << " " << distinctCount;
}
m_out << " |||";
m_out << std::endl;
}
void RuleTableWriter::WriteRuleHalf(const TokenizedRuleHalf &half)
{
if (half.IsTree()) {
m_out << half.string;
return;
}
for (std::vector<RuleSymbol>::const_iterator p = half.frontierSymbols.begin();
p != half.frontierSymbols.end(); ++p) {
if (p->isNonTerminal) {
m_out << "[" << p->value << "][" << p->value << "] ";
} else {
m_out << p->value << " ";
}
}
m_out << "[X]";
}
} // namespace ScoreStsg
} // namespace Moses

View File

@@ -0,0 +1,41 @@
#pragma once
#include <cmath>
#include <string>
#include "OutputFileStream.h"
#include "Options.h"
#include "TokenizedRuleHalf.h"
namespace Moses
{
namespace ScoreStsg
{
class RuleTableWriter
{
public:
RuleTableWriter(const Options &options, OutputFileStream &out)
: m_options(options)
, m_out(out) {}
void WriteLine(const TokenizedRuleHalf &, const TokenizedRuleHalf &,
const std::string &, double, int, int, int);
private:
double MaybeLog(double a) const {
if (!m_options.logProb) {
return a;
}
return m_options.negLogProb ? -log(a) : log(a);
}
void WriteRuleHalf(const TokenizedRuleHalf &);
const Options &m_options;
OutputFileStream &m_out;
};
} // namespace ScoreStsg
} // namespace Moses

View File

@@ -0,0 +1,440 @@
#include "ScoreStsg.h"
#include <cassert>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <iterator>
#include <string>
#include <sstream>
#include <vector>
#include <boost/program_options.hpp>
#include "util/string_piece.hh"
#include "util/string_piece_hash.hh"
#include "util/tokenize_piece.hh"
#include "Exception.h"
#include "InputFileStream.h"
#include "LexicalTable.h"
#include "OutputFileStream.h"
#include "Options.h"
#include "RuleGroup.h"
#include "RuleTableWriter.h"
namespace Moses
{
namespace ScoreStsg
{
const int ScoreStsg::kCountOfCountsMax = 10;
ScoreStsg::ScoreStsg()
: m_name("score-stsg")
, m_lexTable(m_srcVocab, m_tgtVocab)
, m_countOfCounts(kCountOfCountsMax+1, 0)  // index 0 unused; buckets 1..kCountOfCountsMax
, m_totalDistinct(0)
{
}
int ScoreStsg::Main(int argc, char *argv[])
{
// Process command-line options.
ProcessOptions(argc, argv, m_options);
// Open input files.
InputFileStream extractStream(m_options.extractFile);
InputFileStream lexStream(m_options.lexFile);
// Open output files.
OutputFileStream outStream;
OutputFileStream countOfCountsStream;
OpenOutputFileOrDie(m_options.tableFile, outStream);
if (m_options.goodTuring || m_options.kneserNey) {
OpenOutputFileOrDie(m_options.tableFile+".coc", countOfCountsStream);
}
// Load lexical table.
if (!m_options.noLex) {
m_lexTable.Load(lexStream);
}
const util::MultiCharacter delimiter("|||");
std::size_t lineNum = 0;
std::size_t startLine = 0;
std::string line;
std::string tmp;
RuleGroup ruleGroup;
RuleTableWriter ruleTableWriter(m_options, outStream);
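// Each line of the (source-sorted) extract file is assumed to have the form:
//
//   <source> ||| <target> ||| <nt-alignment> ||| <full-alignment> ||| <count>
//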
while (std::getline(extractStream, line)) {
++lineNum;
// Tokenize the input line.
util::TokenIter<util::MultiCharacter> it(line, delimiter);
StringPiece source = *it++;
StringPiece target = *it++;
StringPiece ntAlign = *it++;
StringPiece fullAlign = *it++;
it->CopyToString(&tmp);
int count = std::atoi(tmp.c_str());
// If this is the first line or if source has changed since the last
// line then process the current rule group and start a new one.
if (source != ruleGroup.GetSource()) {
if (lineNum > 1) {
ProcessRuleGroupOrDie(ruleGroup, ruleTableWriter, startLine, lineNum-1);
}
startLine = lineNum;
ruleGroup.SetNewSource(source);
}
// Add the rule to the current rule group.
ruleGroup.AddRule(target, ntAlign, fullAlign, count);
}
// Process the final rule group.
ProcessRuleGroupOrDie(ruleGroup, ruleTableWriter, startLine, lineNum);
// Write count of counts file.
if (m_options.goodTuring || m_options.kneserNey) {
// Kneser-Ney needs the total number of distinct rules.
countOfCountsStream << m_totalDistinct << std::endl;
// Write out counts of counts.
for (int i = 1; i <= kCountOfCountsMax; ++i) {
countOfCountsStream << m_countOfCounts[i] << std::endl;
}
}
return 0;
}
void ScoreStsg::TokenizeRuleHalf(const std::string &s, TokenizedRuleHalf &half)
{
// Copy s to half.string, but strip any leading or trailing whitespace.
std::size_t start = s.find_first_not_of(" \t");
if (start == std::string::npos) {
throw Exception("rule half is empty");
}
std::size_t end = s.find_last_not_of(" \t");
assert(end != std::string::npos);
half.string = s.substr(start, end-start+1);
// Tokenize half.string.
half.tokens.clear();
for (TreeFragmentTokenizer p(half.string);
p != TreeFragmentTokenizer(); ++p) {
half.tokens.push_back(*p);
}
// Extract the frontier symbols.
half.frontierSymbols.clear();
const std::size_t numTokens = half.tokens.size();
for (int i = 0; i < numTokens; ++i) {
if (half.tokens[i].type != TreeFragmentToken_WORD) {
continue;
}
if (i == 0 || half.tokens[i-1].type != TreeFragmentToken_LSB) {
// A word is a terminal iff it doesn't follow '['
half.frontierSymbols.resize(half.frontierSymbols.size()+1);
half.frontierSymbols.back().value = half.tokens[i].value;
half.frontierSymbols.back().isNonTerminal = false;
} else if (i+1 < numTokens &&
half.tokens[i+1].type == TreeFragmentToken_RSB) {
// A word is a non-terminal iff it follows '[' and is succeeded by ']'
half.frontierSymbols.resize(half.frontierSymbols.size()+1);
half.frontierSymbols.back().value = half.tokens[i].value;
half.frontierSymbols.back().isNonTerminal = true;
++i; // Skip over the ']'
}
}
}
void ScoreStsg::ProcessRuleGroupOrDie(const RuleGroup &group,
RuleTableWriter &writer,
std::size_t start,
std::size_t end)
{
try {
ProcessRuleGroup(group, writer);
} catch (const Exception &e) {
std::ostringstream msg;
msg << "failed to process rule group at lines " << start << "-" << end
<< ": " << e.GetMsg();
Error(msg.str());
} catch (const std::exception &e) {
std::ostringstream msg;
msg << "failed to process rule group at lines " << start << "-" << end
<< ": " << e.what();
Error(msg.str());
}
}
void ScoreStsg::ProcessRuleGroup(const RuleGroup &group,
RuleTableWriter &writer)
{
const std::size_t totalCount = group.GetTotalCount();
const std::size_t distinctCount = group.GetSize();
TokenizeRuleHalf(group.GetSource(), m_sourceHalf);
const bool fullyLexical = m_sourceHalf.IsFullyLexical();
// Process each distinct rule in turn.
for (RuleGroup::ConstIterator p = group.Begin(); p != group.End(); ++p) {
const RuleGroup::DistinctRule &rule = *p;
// Update count of count statistics.
if (m_options.goodTuring || m_options.kneserNey) {
++m_totalDistinct;
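// Round up to the nearest integer bucket (a no-op while counts are integral).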
int countInt = rule.count + 0.99999;
if (countInt <= kCountOfCountsMax) {
++m_countOfCounts[countInt];
}
}
// If the rule is not fully lexical then discard it if the count is below
// the threshold value.
if (!fullyLexical && rule.count < m_options.minCountHierarchical) {
continue;
}
TokenizeRuleHalf(rule.target, m_targetHalf);
// Find the most frequent alignment (if there's a tie, take the first one).
std::vector<std::pair<std::string, int> >::const_iterator q =
rule.alignments.begin();
const std::pair<std::string, int> *bestAlignmentAndCount = &(*q++);
for (; q != rule.alignments.end(); ++q) {
if (q->second > bestAlignmentAndCount->second) {
bestAlignmentAndCount = &(*q);
}
}
const std::string &bestAlignment = bestAlignmentAndCount->first;
ParseAlignmentString(bestAlignment, m_targetHalf.frontierSymbols.size(),
m_tgtToSrc);
// Compute the lexical translation probability.
double lexProb = ComputeLexProb(m_sourceHalf.frontierSymbols,
m_targetHalf.frontierSymbols, m_tgtToSrc);
// TODO PCFG score
// Write a line to the rule table.
writer.WriteLine(m_sourceHalf, m_targetHalf, bestAlignment, lexProb,
p->count, totalCount, distinctCount);
}
}
void ScoreStsg::ParseAlignmentString(const std::string &s, int numTgtWords,
MosesTraining::ALIGNMENT &tgtToSrc)
{
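// The alignment string is assumed to hold space-separated "src-tgt" index
// pairs, e.g. "0-0 1-2 3-2".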
tgtToSrc.clear();
tgtToSrc.resize(numTgtWords);
const std::string digits = "0123456789";
std::string::size_type begin = 0;
while (true) {
std::string::size_type end = s.find("-", begin);
if (end == std::string::npos) {
return;
}
int src = std::atoi(s.substr(begin, end-begin).c_str());
if (end+1 == s.size()) {
throw Exception("Target index missing");
}
begin = end+1;
end = s.find_first_not_of(digits, begin+1);
int tgt;
if (end == std::string::npos) {
tgt = std::atoi(s.substr(begin).c_str());
tgtToSrc[tgt].insert(src);
return;
} else {
tgt = std::atoi(s.substr(begin, end-begin).c_str());
tgtToSrc[tgt].insert(src);
}
begin = end+1;
}
}
double ScoreStsg::ComputeLexProb(const std::vector<RuleSymbol> &sourceFrontier,
const std::vector<RuleSymbol> &targetFrontier,
const MosesTraining::ALIGNMENT &tgtToSrc)
{
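// Lexical weighting: for each target-side terminal, average the lexical
// probabilities of the source words it is aligned to (unaligned target
// words are scored against NULL), then multiply the per-word averages.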
double lexScore = 1.0;
for (std::size_t i = 0; i < targetFrontier.size(); ++i) {
if (targetFrontier[i].isNonTerminal) {
continue;
}
Vocabulary::IdType tgtId = m_tgtVocab.Lookup(targetFrontier[i].value,
StringPieceCompatibleHash(),
StringPieceCompatibleEquals());
const std::set<std::size_t> &srcIndices = tgtToSrc[i];
if (srcIndices.empty()) {
// Explain unaligned word by NULL.
lexScore *= m_lexTable.PermissiveLookup(Vocabulary::NullId(), tgtId);
} else {
double thisWordScore = 0.0;
for (std::set<std::size_t>::const_iterator p = srcIndices.begin();
p != srcIndices.end(); ++p) {
Vocabulary::IdType srcId =
m_srcVocab.Lookup(sourceFrontier[*p].value,
StringPieceCompatibleHash(),
StringPieceCompatibleEquals());
thisWordScore += m_lexTable.PermissiveLookup(srcId, tgtId);
}
lexScore *= thisWordScore / static_cast<double>(srcIndices.size());
}
}
return lexScore;
}
void ScoreStsg::OpenOutputFileOrDie(const std::string &filename,
OutputFileStream &stream)
{
bool ret = stream.Open(filename);
if (!ret) {
std::ostringstream msg;
msg << "failed to open output file: " << filename;
Error(msg.str());
}
}
void ScoreStsg::ProcessOptions(int argc, char *argv[], Options &options) const
{
namespace po = boost::program_options;
namespace cls = boost::program_options::command_line_style;
// Construct the 'top' of the usage message: the bit that comes before the
// options list.
std::ostringstream usageTop;
usageTop << "Usage: " << GetName()
<< " [OPTION]... EXTRACT LEX TABLE\n\n"
<< "STSG rule scorer\n\n"
<< "Options";
// Construct the 'bottom' of the usage message.
std::ostringstream usageBottom;
usageBottom << "TODO";
// Declare the command line options that are visible to the user.
po::options_description visible(usageTop.str());
visible.add_options()
("GoodTuring",
"apply Good-Turing smoothing to relative frequency probability estimates")
("Hierarchical",
"ignored (included for compatibility with score)")
("Inverse",
"use inverse mode")
("KneserNey",
"apply Kneser-Ney smoothing to relative frequency probability estimates")
("LogProb",
"output log probabilities")
("MinCountHierarchical",
po::value(&options.minCountHierarchical)->
default_value(options.minCountHierarchical),
"filter out rules with frequency < arg (except fully lexical rules)")
("NegLogProb",
"output negative log probabilities")
("NoLex",
"do not compute lexical translation score")
("NoWordAlignment",
"do not output word alignments")
("PCFG",
"include pre-computed PCFG score from extract")
("UnpairedExtractFormat",
"ignored (included for compatibility with score)")
;
// Declare the command line options that are hidden from the user
// (these are used as positional options).
po::options_description hidden("Hidden options");
hidden.add_options()
("ExtractFile",
po::value(&options.extractFile),
"extract file")
("LexFile",
po::value(&options.lexFile),
"lexical probability file")
("TableFile",
po::value(&options.tableFile),
"output file")
;
// Compose the full set of command-line options.
po::options_description cmdLineOptions;
cmdLineOptions.add(visible).add(hidden);
// Register the positional options.
po::positional_options_description p;
p.add("ExtractFile", 1);
p.add("LexFile", 1);
p.add("TableFile", 1);
// Process the command-line.
po::variables_map vm;
const int optionStyle = cls::allow_long
| cls::long_allow_adjacent
| cls::long_allow_next;
try {
po::store(po::command_line_parser(argc, argv).style(optionStyle).
options(cmdLineOptions).positional(p).run(), vm);
po::notify(vm);
} catch (const std::exception &e) {
std::ostringstream msg;
msg << e.what() << "\n\n" << visible << usageBottom.str();
Error(msg.str());
}
if (vm.count("help")) {
std::cout << visible << usageBottom.str() << std::endl;
std::exit(0);
}
// Check all positional options were given.
if (!vm.count("ExtractFile") ||
!vm.count("LexFile") ||
!vm.count("TableFile")) {
std::cerr << visible << usageBottom.str() << std::endl;
std::exit(1);
}
// Process Boolean options.
if (vm.count("GoodTuring")) {
options.goodTuring = true;
}
if (vm.count("Inverse")) {
options.inverse = true;
}
if (vm.count("KneserNey")) {
options.kneserNey = true;
}
if (vm.count("LogProb")) {
options.logProb = true;
}
if (vm.count("NegLogProb")) {
options.negLogProb = true;
}
if (vm.count("NoLex")) {
options.noLex = true;
}
if (vm.count("NoWordAlignment")) {
options.noWordAlignment = true;
}
if (vm.count("PCFG")) {
options.pcfg = true;
}
}
void ScoreStsg::Error(const std::string &msg) const
{
std::cerr << GetName() << ": " << msg << std::endl;
std::exit(1);
}
} // namespace ScoreStsg
} // namespace Moses

View File

@@ -0,0 +1,75 @@
#pragma once
#include <map>
#include <ostream>
#include <set>
#include <string>
#include <vector>
#include "ExtractionPhrasePair.h"
#include "LexicalTable.h"
#include "Options.h"
#include "RuleSymbol.h"
#include "TokenizedRuleHalf.h"
#include "Vocabulary.h"
namespace Moses
{
class OutputFileStream;
namespace ScoreStsg
{
class RuleGroup;
class RuleTableWriter;
class ScoreStsg
{
public:
ScoreStsg();
const std::string &GetName() const {
return m_name;
}
int Main(int argc, char *argv[]);
private:
static const int kCountOfCountsMax;
double ComputeLexProb(const std::vector<RuleSymbol> &,
const std::vector<RuleSymbol> &,
const MosesTraining::ALIGNMENT &);
void Error(const std::string &) const;
void OpenOutputFileOrDie(const std::string &, OutputFileStream &);
void ParseAlignmentString(const std::string &, int,
MosesTraining::ALIGNMENT &);
void ProcessOptions(int, char *[], Options &) const;
void ProcessRuleGroup(const RuleGroup &, RuleTableWriter &);
void ProcessRuleGroupOrDie(const RuleGroup &, RuleTableWriter &,
std::size_t, std::size_t);
void TokenizeRuleHalf(const std::string &, TokenizedRuleHalf &);
std::string m_name;
Options m_options;
Vocabulary m_srcVocab;
Vocabulary m_tgtVocab;
LexicalTable m_lexTable;
std::vector<int> m_countOfCounts;
int m_totalDistinct;
TokenizedRuleHalf m_sourceHalf;
TokenizedRuleHalf m_targetHalf;
MosesTraining::ALIGNMENT m_tgtToSrc;
};
} // namespace ScoreStsg
} // namespace Moses

View File

@@ -0,0 +1,37 @@
#include "TokenizedRuleHalf.h"
namespace Moses
{
namespace ScoreStsg
{
bool TokenizedRuleHalf::IsFullyLexical() const
{
for (std::vector<RuleSymbol>::const_iterator p = frontierSymbols.begin();
p != frontierSymbols.end(); ++p) {
if (p->isNonTerminal) {
return false;
}
}
return true;
}
bool TokenizedRuleHalf::IsString() const
{
// A rule half is either a string (like "[X] and [X]") or a tree (like
// "[NP [NP] [CC and] [NP]]").
//
// A string must start with a terminal or a non-terminal (in square brackets).
// A tree must start with '[' followed by a word then either another word or
// another '['.
return (tokens[0].type == TreeFragmentToken_WORD ||
tokens[2].type == TreeFragmentToken_RSB);
}
bool TokenizedRuleHalf::IsTree() const
{
return !IsString();
}
} // namespace ScoreStsg
} // namespace Moses

View File

@@ -0,0 +1,45 @@
#pragma once
#include <string>
#include <vector>
#include "RuleSymbol.h"
#include "TreeFragmentTokenizer.h"
namespace Moses
{
namespace ScoreStsg
{
// Stores one half of a STSG rule, as represented in the extract file. The
// original string is stored as the member 'string', along with its token
// sequence ('tokens') and frontier symbol sequence ('frontierSymbols'). Note
// that 'tokens' and 'frontierSymbols' use StringPiece objects that depend on
// the original string. Therefore changing the value of 'string' invalidates
// both 'tokens' and 'frontierSymbols'.
struct TokenizedRuleHalf
{
bool IsFullyLexical() const;
bool IsString() const;
bool IsTree() const;
// The rule half as it appears in the extract file, except with any trailing
// or leading spaces removed (here a space is defined as a blank or a tab).
std::string string;
// The token sequence for the string.
std::vector<TreeFragmentToken> tokens;
// The frontier symbols of the rule half. For example:
//
// string: "[VP [VBN] [PP [IN] [NP [DT] [JJ positive] [NN light]]]]"
// frontier: ("VBN",t), ("IN",t), ("DT",t), ("positive",f), ("light",f)
//
// string: "[X] [X] Sinne [X]"
// frontier: ("X",t), ("X",t), ("Sinne",f), ("X",t)
//
std::vector<RuleSymbol> frontierSymbols;
};
} // namespace ScoreStsg
} // namespace Moses

View File

@@ -0,0 +1,90 @@
#include "TreeFragmentTokenizer.h"
#include <cctype>
namespace Moses
{
namespace ScoreStsg
{
TreeFragmentToken::TreeFragmentToken(TreeFragmentTokenType t,
StringPiece v, std::size_t p)
: type(t)
, value(v)
, pos(p) {
}
TreeFragmentTokenizer::TreeFragmentTokenizer()
: value_(TreeFragmentToken_EOS, "", -1) {
}
TreeFragmentTokenizer::TreeFragmentTokenizer(const StringPiece &s)
: str_(s)
, value_(TreeFragmentToken_EOS, "", -1)
, iter_(s.begin())
, end_(s.end())
, pos_(0) {
++(*this);
}
TreeFragmentTokenizer &TreeFragmentTokenizer::operator++() {
while (iter_ != end_ && (*iter_ == ' ' || *iter_ == '\t')) {
++iter_;
++pos_;
}
if (iter_ == end_) {
value_ = TreeFragmentToken(TreeFragmentToken_EOS, "", pos_);
return *this;
}
if (*iter_ == '[') {
value_ = TreeFragmentToken(TreeFragmentToken_LSB, "[", pos_);
++iter_;
++pos_;
} else if (*iter_ == ']') {
value_ = TreeFragmentToken(TreeFragmentToken_RSB, "]", pos_);
++iter_;
++pos_;
} else {
std::size_t start = pos_;
while (true) {
++iter_;
++pos_;
if (iter_ == end_ || *iter_ == ' ' || *iter_ == '\t') {
break;
}
if (*iter_ == '[' || *iter_ == ']') {
break;
}
}
StringPiece word = str_.substr(start, pos_-start);
value_ = TreeFragmentToken(TreeFragmentToken_WORD, word, start);
}
return *this;
}
TreeFragmentTokenizer TreeFragmentTokenizer::operator++(int) {
TreeFragmentTokenizer tmp(*this);
++*this;
return tmp;
}
bool operator==(const TreeFragmentTokenizer &lhs,
const TreeFragmentTokenizer &rhs) {
if (lhs.value_.type == TreeFragmentToken_EOS ||
rhs.value_.type == TreeFragmentToken_EOS) {
return lhs.value_.type == TreeFragmentToken_EOS &&
rhs.value_.type == TreeFragmentToken_EOS;
}
return lhs.iter_ == rhs.iter_;
}
bool operator!=(const TreeFragmentTokenizer &lhs,
const TreeFragmentTokenizer &rhs) {
return !(lhs == rhs);
}
} // namespace ScoreStsg
} // namespace Moses

View File

@@ -0,0 +1,70 @@
#pragma once
#include "util/string_piece.hh"
namespace Moses
{
namespace ScoreStsg
{
enum TreeFragmentTokenType {
TreeFragmentToken_EOS,
TreeFragmentToken_LSB,
TreeFragmentToken_RSB,
TreeFragmentToken_WORD
};
struct TreeFragmentToken {
public:
TreeFragmentToken(TreeFragmentTokenType, StringPiece, std::size_t);
TreeFragmentTokenType type;
StringPiece value;
std::size_t pos;
};
// Tokenizes tree fragment strings in Moses format.
//
// For example, the string "[NP [NP [NN a]] [NP]]" is tokenized to the sequence:
//
// 1 LSB "["
// 2 WORD "NP"
// 3 LSB "["
// 4 WORD "NP"
// 5 LSB "["
// 6 WORD "NN"
// 7 WORD "a"
// 8 RSB "]"
// 9 RSB "]"
// 10 LSB "["
// 11 WORD "NP"
// 12 RSB "]"
// 13 RSB "]"
// 14 EOS undefined
//
class TreeFragmentTokenizer {
public:
TreeFragmentTokenizer();
TreeFragmentTokenizer(const StringPiece &);
const TreeFragmentToken &operator*() const { return value_; }
const TreeFragmentToken *operator->() const { return &value_; }
TreeFragmentTokenizer &operator++();
TreeFragmentTokenizer operator++(int);
friend bool operator==(const TreeFragmentTokenizer &,
const TreeFragmentTokenizer &);
friend bool operator!=(const TreeFragmentTokenizer &,
const TreeFragmentTokenizer &);
private:
StringPiece str_;
TreeFragmentToken value_;
StringPiece::const_iterator iter_;
StringPiece::const_iterator end_;
std::size_t pos_;
};
} // namespace ScoreStsg
} // namespace Moses

View File

@@ -0,0 +1,13 @@
#pragma once
#include <string>
#include "NumberedSet.h"
namespace Moses {
namespace ScoreStsg {
typedef NumberedSet<std::string, std::size_t> Vocabulary;
} // namespace ScoreStsg
} // namespace Moses