Add score-stsg, a program for scoring STSG extract files

This commit is contained in:
Phil Williams 2014-12-02 17:10:20 +00:00
parent 56921cae3b
commit f84f159247
20 changed files with 1295 additions and 0 deletions

View File

@@ -225,6 +225,7 @@ phrase-extract//extract-ghkm
phrase-extract//pcfg-extract
phrase-extract//pcfg-score
phrase-extract//extract-mixed-syntax
phrase-extract//score-stsg
biconcor
mira//mira
contrib/server//mosesserver

View File

@@ -0,0 +1,23 @@
#pragma once
#include <string>
namespace Moses
{
namespace ScoreStsg
{
class Exception
{
public:
Exception(const char *msg) : m_msg(msg) {}
Exception(const std::string &msg) : m_msg(msg) {}
const std::string &GetMsg() const {
return m_msg;
}
private:
std::string m_msg;
};
} // namespace ScoreStsg
} // namespace Moses

View File

@@ -0,0 +1 @@
exe score-stsg : [ glob *.cpp ] ..//deps ../..//boost_iostreams ../..//boost_program_options ../..//z : <include>.. ;

View File

@@ -0,0 +1,53 @@
#include "LexicalTable.h"
#include "util/tokenize_piece.hh"
#include <cstdlib>
#include <iostream>
namespace Moses
{
namespace ScoreStsg
{
LexicalTable::LexicalTable(Vocabulary &srcVocab, Vocabulary &tgtVocab)
: m_srcVocab(srcVocab)
, m_tgtVocab(tgtVocab)
{
}
void LexicalTable::Load(std::istream &input)
{
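// Each input line is expected to carry three whitespace-separated fields:
//
//   <target-word> <source-word> <probability>
//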
const util::AnyCharacter delimiter(" \t");
std::string line;
std::string tmp;
int i = 0;
while (getline(input, line)) {
++i;
if (i%100000 == 0) {
std::cerr << ".";
}
util::TokenIter<util::AnyCharacter> it(line, delimiter);
// Target word
it->CopyToString(&tmp);
Vocabulary::IdType tgtId = m_tgtVocab.Insert(tmp);
++it;
// Source word.
it->CopyToString(&tmp);
Vocabulary::IdType srcId = m_srcVocab.Insert(tmp);
++it;
// Probability.
it->CopyToString(&tmp);
double prob = atof(tmp.c_str());
m_table[srcId][tgtId] = prob;
}
std::cerr << std::endl;
}
} // namespace ScoreStsg
} // namespace Moses

View File

@@ -0,0 +1,43 @@
#pragma once
#include <istream>
#include <string>
#include <vector>
#include <boost/unordered_map.hpp>
#include "Vocabulary.h"
namespace Moses
{
namespace ScoreStsg
{
class LexicalTable
{
public:
LexicalTable(Vocabulary &, Vocabulary &);
void Load(std::istream &);
double PermissiveLookup(Vocabulary::IdType s, Vocabulary::IdType t) {
OuterMap::const_iterator p = m_table.find(s);
if (p == m_table.end()) {
return 1.0;
}
const InnerMap &inner = p->second;
InnerMap::const_iterator q = inner.find(t);
return q == inner.end() ? 1.0 : q->second;
}
private:
typedef boost::unordered_map<Vocabulary::IdType, double> InnerMap;
typedef boost::unordered_map<Vocabulary::IdType, InnerMap> OuterMap;
Vocabulary &m_srcVocab;
Vocabulary &m_tgtVocab;
OuterMap m_table;
};
} // namespace ScoreStsg
} // namespace Moses
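
A minimal sketch of how LexicalTable and Vocabulary are meant to interact (the function name, words and probabilities below are illustrative only, not part of this commit):

#include <sstream>
#include "LexicalTable.h"
#include "Vocabulary.h"

void LexicalTableExample()
{
  using namespace Moses::ScoreStsg;
  Vocabulary srcVocab, tgtVocab;
  LexicalTable lexTable(srcVocab, tgtVocab);

  // One "<target> <source> <probability>" triple per line.
  std::istringstream in("house Haus 0.8\nbuilding Haus 0.2\n");
  lexTable.Load(in);

  // Look up using the IDs assigned by the shared vocabularies.
  Vocabulary::IdType s = srcVocab.Lookup("Haus");
  Vocabulary::IdType t = tgtVocab.Lookup("house");
  double p = lexTable.PermissiveLookup(s, t);                       // 0.8
  double q = lexTable.PermissiveLookup(s, tgtVocab.Lookup("moon")); // unseen pair -> 1.0
  (void)p; (void)q;
}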

View File

@@ -0,0 +1,7 @@
#include "ScoreStsg.h"
int main(int argc, char *argv[])
{
Moses::ScoreStsg::ScoreStsg tool;
return tool.Main(argc, argv);
}

View File

@@ -0,0 +1,110 @@
#pragma once
#include <limits>
#include <sstream>
#include <vector>
#include <boost/unordered_map.hpp>
#include "Exception.h"
namespace Moses {
namespace ScoreStsg {
// Stores a set of elements of type T, each of which is allocated an integral
// ID of type I. IDs are contiguous starting at 0. Individual elements cannot
// be removed once inserted (but the whole set can be cleared).
template<typename T, typename I=size_t>
class NumberedSet {
private:
typedef boost::unordered_map<T, I> ElementToIdMap;
typedef std::vector<const T *> IdToElementMap;
public:
typedef I IdType;
typedef typename IdToElementMap::const_iterator const_iterator;
NumberedSet() {}
const_iterator begin() const { return id_to_element_.begin(); }
const_iterator end() const { return id_to_element_.end(); }
// Special ID value used to indicate that an element is not present.
static I NullId() { return std::numeric_limits<I>::max(); }
bool IsEmpty() const { return id_to_element_.empty(); }
size_t Size() const { return id_to_element_.size(); }
// Insert the given object and return its ID.
I Insert(const T &);
// Look up the given object and return its ID.
I Lookup(const T &) const;
// Look up the given object using an alternative key type (this is useful if,
// for example, T is std::string and you want to look up the object using a
// StringPiece).
template<typename CompatibleKey, typename CompatibleHash,
typename CompatiblePredicate>
I Lookup(const CompatibleKey &, const CompatibleHash &,
const CompatiblePredicate &) const;
// Look up the object with the given ID.
const T &Lookup(I) const;
void Clear();
private:
ElementToIdMap element_to_id_;
IdToElementMap id_to_element_;
};
template<typename T, typename I>
I NumberedSet<T, I>::Lookup(const T &s) const {
typename ElementToIdMap::const_iterator p = element_to_id_.find(s);
return (p == element_to_id_.end()) ? NullId() : p->second;
}
template<typename T, typename I>
template<typename CompatibleKey, typename CompatibleHash,
typename CompatiblePredicate>
I NumberedSet<T, I>::Lookup(const CompatibleKey &key,
const CompatibleHash &hash,
const CompatiblePredicate &pred) const {
typename ElementToIdMap::const_iterator p =
element_to_id_.find(key, hash, pred);
return (p == element_to_id_.end()) ? NullId() : p->second;
}
template<typename T, typename I>
const T &NumberedSet<T, I>::Lookup(I id) const {
// FIXME Need to check id is > 0 iff I is a signed type.
//if (id < 0 || id >= id_to_element_.size()) {
if (id >= id_to_element_.size()) {
std::ostringstream msg;
msg << "Value not found: " << id;
throw Exception(msg.str());
}
return *(id_to_element_[id]);
}
template<typename T, typename I>
I NumberedSet<T, I>::Insert(const T &x) {
std::pair<T, I> value(x, id_to_element_.size());
std::pair<typename ElementToIdMap::iterator, bool> result =
element_to_id_.insert(value);
if (result.second) {
// x is a new element.
id_to_element_.push_back(&result.first->first);
}
return result.first->second;
}
template<typename T, typename I>
void NumberedSet<T, I>::Clear() {
element_to_id_.clear();
id_to_element_.clear();
}
} // namespace ScoreStsg
} // namespace Moses
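
A minimal usage sketch for NumberedSet (the function name and element values are illustrative only):

#include <cassert>
#include <string>
#include "NumberedSet.h"

void NumberedSetExample()
{
  typedef Moses::ScoreStsg::NumberedSet<std::string> StringSet;
  StringSet words;
  StringSet::IdType a = words.Insert("der");    // first element gets ID 0
  StringSet::IdType b = words.Insert("Hund");   // IDs are contiguous: 1
  assert(words.Insert("der") == a);             // re-inserting returns the old ID
  assert(words.Lookup("Hund") == b);            // element -> ID
  assert(words.Lookup(b) == "Hund");            // ID -> element
  assert(words.Lookup("Katze") == StringSet::NullId());  // not present
}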

View File

@@ -0,0 +1,41 @@
#pragma once
#include <string>
namespace Moses
{
namespace ScoreStsg
{
struct Options {
public:
Options()
: goodTuring(false)
, inverse(false)
, kneserNey(false)
, logProb(false)
, minCountHierarchical(0)
, negLogProb(false)
, noLex(false)
, noWordAlignment(false)
, pcfg(false) {}
// Positional options
std::string extractFile;
std::string lexFile;
std::string tableFile;
// All other options
bool goodTuring;
bool inverse;
bool kneserNey;
bool logProb;
int minCountHierarchical;
bool negLogProb;
bool noLex;
bool noWordAlignment;
bool pcfg;
};
} // namespace ScoreStsg
} // namespace Moses

View File

@@ -0,0 +1,42 @@
#include "RuleGroup.h"
namespace Moses
{
namespace ScoreStsg
{
void RuleGroup::SetNewSource(const StringPiece &source)
{
source.CopyToString(&m_source);
m_distinctRules.clear();
m_totalCount = 0;
}
void RuleGroup::AddRule(const StringPiece &target, const StringPiece &ntAlign,
const StringPiece &fullAlign, int count)
{
if (m_distinctRules.empty() ||
ntAlign != m_distinctRules.back().ntAlign ||
target != m_distinctRules.back().target) {
DistinctRule r;
target.CopyToString(&r.target);
ntAlign.CopyToString(&r.ntAlign);
r.alignments.resize(r.alignments.size()+1);
fullAlign.CopyToString(&r.alignments.back().first);
r.alignments.back().second = count;
r.count = count;
m_distinctRules.push_back(r);
} else {
DistinctRule &r = m_distinctRules.back();
if (r.alignments.back().first != fullAlign) {
r.alignments.resize(r.alignments.size()+1);
fullAlign.CopyToString(&r.alignments.back().first);
}
r.alignments.back().second += count;
r.count += count;
}
m_totalCount += count;
}
} // namespace ScoreStsg
} // namespace Moses

View File

@@ -0,0 +1,68 @@
#pragma once
#include <string>
#include <utility>
#include <vector>
#include "util/string_piece.hh"
namespace Moses
{
namespace ScoreStsg
{
// A group of rules that share the same source-side. Rules are added through
// calls to SetNewSource() and AddRule(). They can then be accessed via the
// iterators.
//
// It is assumed that rules with the same (target, ntAlign, alignment) value
// will be added consecutively, and so will rules with the same
// (target, ntAlign) value. In other words, it is assumed that rules will be
// added in the order they occur in a correctly-sorted extract file.
class RuleGroup
{
public:
// Stores the target-side and NT-alignment of a distinct rule. Also records
// the rule's count and the observed symbol alignments (plus their
// frequencies).
struct DistinctRule {
std::string target;
std::string ntAlign;
std::vector<std::pair<std::string, int> > alignments;
int count;
};
typedef std::vector<DistinctRule>::const_iterator ConstIterator;
// Begin and End iterators for iterating over the group's distinct rules.
ConstIterator Begin() const { return m_distinctRules.begin(); }
ConstIterator End() const { return m_distinctRules.end(); }
// Get the current source-side value.
const std::string &GetSource() const { return m_source; }
// Get the number of distinct rules.
int GetSize() const { return m_distinctRules.size(); }
// Get the total count.
int GetTotalCount() const { return m_totalCount; }
// Clear the rule group and set a new source-side value. This must be
// done once for every new source-side value, prior to the first call to
// AddRule().
void SetNewSource(const StringPiece &source);
// Add a rule. To determine rule distinctness, the target and ntAlign
// values will be checked against those of the previous rule only (in other
// words, the input is assumed to be ordered).
void AddRule(const StringPiece &target, const StringPiece &ntAlign,
const StringPiece &fullAlign, int count);
private:
std::string m_source;
std::vector<DistinctRule> m_distinctRules;
int m_totalCount;
};
} // namespace ScoreStsg
} // namespace Moses
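
A minimal sketch of the calling pattern RuleGroup expects, assuming the extract-file fields have already been split (all values below are made up):

#include <cassert>
#include "RuleGroup.h"

void RuleGroupExample()
{
  Moses::ScoreStsg::RuleGroup group;

  // One group per source side; rules must arrive sorted by source.
  group.SetNewSource("[X] [X] Haus [X]");
  group.AddRule("[X] [X] house [X]", "0-0 1-1 3-2", "0-0 1-1 2-2 3-3", 2);
  group.AddRule("[X] [X] house [X]", "0-0 1-1 3-2", "0-0 1-1 2-2 3-3", 1);
  group.AddRule("[X] [X] building [X]", "0-0 1-1 3-2", "0-0 1-1 2-2 3-3", 1);

  assert(group.GetSize() == 2);        // two distinct (target, ntAlign) pairs
  assert(group.GetTotalCount() == 4);  // counts are accumulated
}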

View File

@@ -0,0 +1,17 @@
#pragma once
#include "util/string_piece.hh"
namespace Moses
{
namespace ScoreStsg
{
struct RuleSymbol
{
StringPiece value;
bool isNonTerminal;
};
} // namespace ScoreStsg
} // namespace Moses

View File

@@ -0,0 +1,78 @@
#include "RuleTableWriter.h"
#include <cassert>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <iterator>
#include <string>
#include <sstream>
#include <vector>
#include "util/string_piece.hh"
#include "util/tokenize_piece.hh"
#include "Exception.h"
#include "InputFileStream.h"
#include "LexicalTable.h"
#include "OutputFileStream.h"
#include "Options.h"
#include "RuleGroup.h"
namespace Moses
{
namespace ScoreStsg
{
void RuleTableWriter::WriteLine(const TokenizedRuleHalf &source,
const TokenizedRuleHalf &target,
const std::string &bestAlignment,
double lexScore, int count, int totalCount,
int distinctCount)
{
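// Illustrative sketch of the emitted line (field values are made up):
//
//   <source> ||| <target> ||| 0-0 1-1 ||| 0.25 ||| 10 4 |||
//
// i.e. the two rule halves, the most frequent symbol alignment, the lexical
// score (omitted with NoLex), then "<totalCount> <count>", with the distinct
// count appended when KneserNey is set.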
if (m_options.inverse) {
WriteRuleHalf(target);
m_out << " ||| ";
WriteRuleHalf(source);
} else {
WriteRuleHalf(source);
m_out << " ||| ";
WriteRuleHalf(target);
}
m_out << " |||" << bestAlignment << "||| ";
if (!m_options.noLex) {
m_out << MaybeLog(lexScore);
}
// TODO PCFG
m_out << " ||| " << totalCount << " " << count;
if (m_options.kneserNey) {
m_out << " " << distinctCount;
}
m_out << " |||";
m_out << std::endl;
}
void RuleTableWriter::WriteRuleHalf(const TokenizedRuleHalf &half)
{
if (half.IsTree()) {
m_out << half.string;
return;
}
for (std::vector<RuleSymbol>::const_iterator p = half.frontierSymbols.begin();
p != half.frontierSymbols.end(); ++p) {
if (p->isNonTerminal) {
m_out << "[" << p->value << "][" << p->value << "] ";
} else {
m_out << p->value << " ";
}
}
m_out << "[X]";
}
} // namespace ScoreStsg
} // namespace Moses

View File

@@ -0,0 +1,41 @@
#pragma once
#include <cmath>
#include <string>
#include "OutputFileStream.h"
#include "Options.h"
#include "TokenizedRuleHalf.h"
namespace Moses
{
namespace ScoreStsg
{
class RuleTableWriter
{
public:
RuleTableWriter(const Options &options, OutputFileStream &out)
: m_options(options)
, m_out(out) {}
void WriteLine(const TokenizedRuleHalf &, const TokenizedRuleHalf &,
const std::string &, double, int, int, int);
private:
double MaybeLog(double a) const {
if (!m_options.logProb) {
return a;
}
return m_options.negLogProb ? -log(a) : log(a);
}
void WriteRuleHalf(const TokenizedRuleHalf &);
const Options &m_options;
OutputFileStream &m_out;
};
} // namespace ScoreStsg
} // namespace Moses

View File

@@ -0,0 +1,440 @@
#include "ScoreStsg.h"
#include <cassert>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <iterator>
#include <string>
#include <sstream>
#include <vector>
#include <boost/program_options.hpp>
#include "util/string_piece.hh"
#include "util/string_piece_hash.hh"
#include "util/tokenize_piece.hh"
#include "Exception.h"
#include "InputFileStream.h"
#include "LexicalTable.h"
#include "OutputFileStream.h"
#include "Options.h"
#include "RuleGroup.h"
#include "RuleTableWriter.h"
namespace Moses
{
namespace ScoreStsg
{
const int ScoreStsg::kCountOfCountsMax = 10;
ScoreStsg::ScoreStsg()
: m_name("score-stsg")
, m_lexTable(m_srcVocab, m_tgtVocab)
, m_countOfCounts(kCountOfCountsMax+1, 0)  // index 0 unused; buckets 1..kCountOfCountsMax
, m_totalDistinct(0)
{
}
int ScoreStsg::Main(int argc, char *argv[])
{
// Process command-line options.
ProcessOptions(argc, argv, m_options);
// Open input files.
InputFileStream extractStream(m_options.extractFile);
InputFileStream lexStream(m_options.lexFile);
// Open output files.
OutputFileStream outStream;
OutputFileStream countOfCountsStream;
OpenOutputFileOrDie(m_options.tableFile, outStream);
if (m_options.goodTuring || m_options.kneserNey) {
OpenOutputFileOrDie(m_options.tableFile+".coc", countOfCountsStream);
}
// Load lexical table.
if (!m_options.noLex) {
m_lexTable.Load(lexStream);
}
const util::MultiCharacter delimiter("|||");
std::size_t lineNum = 0;
std::size_t startLine = 0;
std::string line;
std::string tmp;
RuleGroup ruleGroup;
RuleTableWriter ruleTableWriter(m_options, outStream);
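// Each line of the (source-sorted) extract file is assumed to have the form:
//
//   <source> ||| <target> ||| <nt-alignment> ||| <full-alignment> ||| <count>
//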
while (std::getline(extractStream, line)) {
++lineNum;
// Tokenize the input line.
util::TokenIter<util::MultiCharacter> it(line, delimiter);
StringPiece source = *it++;
StringPiece target = *it++;
StringPiece ntAlign = *it++;
StringPiece fullAlign = *it++;
it->CopyToString(&tmp);
int count = std::atoi(tmp.c_str());
// If this is the first line or if source has changed since the last
// line then process the current rule group and start a new one.
if (source != ruleGroup.GetSource()) {
if (lineNum > 1) {
ProcessRuleGroupOrDie(ruleGroup, ruleTableWriter, startLine, lineNum-1);
}
startLine = lineNum;
ruleGroup.SetNewSource(source);
}
// Add the rule to the current rule group.
ruleGroup.AddRule(target, ntAlign, fullAlign, count);
}
// Process the final rule group.
ProcessRuleGroupOrDie(ruleGroup, ruleTableWriter, startLine, lineNum);
// Write count of counts file.
if (m_options.goodTuring || m_options.kneserNey) {
// Kneser-Ney needs the total number of distinct rules.
countOfCountsStream << m_totalDistinct << std::endl;
// Write out counts of counts.
for (int i = 1; i <= kCountOfCountsMax; ++i) {
countOfCountsStream << m_countOfCounts[i] << std::endl;
}
}
return 0;
}
void ScoreStsg::TokenizeRuleHalf(const std::string &s, TokenizedRuleHalf &half)
{
// Copy s to half.string, but strip any leading or trailing whitespace.
std::size_t start = s.find_first_not_of(" \t");
if (start == std::string::npos) {
throw Exception("rule half is empty");
}
std::size_t end = s.find_last_not_of(" \t");
assert(end != std::string::npos);
half.string = s.substr(start, end-start+1);
// Tokenize half.string.
half.tokens.clear();
for (TreeFragmentTokenizer p(half.string);
p != TreeFragmentTokenizer(); ++p) {
half.tokens.push_back(*p);
}
// Extract the frontier symbols.
half.frontierSymbols.clear();
const std::size_t numTokens = half.tokens.size();
for (int i = 0; i < numTokens; ++i) {
if (half.tokens[i].type != TreeFragmentToken_WORD) {
continue;
}
if (i == 0 || half.tokens[i-1].type != TreeFragmentToken_LSB) {
// A word is a terminal iff it doesn't follow '['
half.frontierSymbols.resize(half.frontierSymbols.size()+1);
half.frontierSymbols.back().value = half.tokens[i].value;
half.frontierSymbols.back().isNonTerminal = false;
} else if (i+1 < numTokens &&
half.tokens[i+1].type == TreeFragmentToken_RSB) {
// A word is a non-terminal iff it follows '[' and is succeeded by ']'
half.frontierSymbols.resize(half.frontierSymbols.size()+1);
half.frontierSymbols.back().value = half.tokens[i].value;
half.frontierSymbols.back().isNonTerminal = true;
++i; // Skip over the ']'
}
}
}
void ScoreStsg::ProcessRuleGroupOrDie(const RuleGroup &group,
RuleTableWriter &writer,
std::size_t start,
std::size_t end)
{
try {
ProcessRuleGroup(group, writer);
} catch (const Exception &e) {
std::ostringstream msg;
msg << "failed to process rule group at lines " << start << "-" << end
<< ": " << e.GetMsg();
Error(msg.str());
} catch (const std::exception &e) {
std::ostringstream msg;
msg << "failed to process rule group at lines " << start << "-" << end
<< ": " << e.what();
Error(msg.str());
}
}
void ScoreStsg::ProcessRuleGroup(const RuleGroup &group,
RuleTableWriter &writer)
{
const std::size_t totalCount = group.GetTotalCount();
const std::size_t distinctCount = group.GetSize();
TokenizeRuleHalf(group.GetSource(), m_sourceHalf);
const bool fullyLexical = m_sourceHalf.IsFullyLexical();
// Process each distinct rule in turn.
for (RuleGroup::ConstIterator p = group.Begin(); p != group.End(); ++p) {
const RuleGroup::DistinctRule &rule = *p;
// Update count of count statistics.
if (m_options.goodTuring || m_options.kneserNey) {
++m_totalDistinct;
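// Round up to the nearest integer bucket (a no-op while counts are integral).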
int countInt = rule.count + 0.99999;
if (countInt <= kCountOfCountsMax) {
++m_countOfCounts[countInt];
}
}
// If the rule is not fully lexical then discard it if the count is below
// the threshold value.
if (!fullyLexical && rule.count < m_options.minCountHierarchical) {
continue;
}
TokenizeRuleHalf(rule.target, m_targetHalf);
// Find the most frequent alignment (if there's a tie, take the first one).
std::vector<std::pair<std::string, int> >::const_iterator q =
rule.alignments.begin();
const std::pair<std::string, int> *bestAlignmentAndCount = &(*q++);
for (; q != rule.alignments.end(); ++q) {
if (q->second > bestAlignmentAndCount->second) {
bestAlignmentAndCount = &(*q);
}
}
const std::string &bestAlignment = bestAlignmentAndCount->first;
ParseAlignmentString(bestAlignment, m_targetHalf.frontierSymbols.size(),
m_tgtToSrc);
// Compute the lexical translation probability.
double lexProb = ComputeLexProb(m_sourceHalf.frontierSymbols,
m_targetHalf.frontierSymbols, m_tgtToSrc);
// TODO PCFG score
// Write a line to the rule table.
writer.WriteLine(m_sourceHalf, m_targetHalf, bestAlignment, lexProb,
p->count, totalCount, distinctCount);
}
}
void ScoreStsg::ParseAlignmentString(const std::string &s, int numTgtWords,
MosesTraining::ALIGNMENT &tgtToSrc)
{
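// The alignment string is assumed to hold space-separated "src-tgt" index
// pairs, e.g. "0-0 1-2 3-2".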
tgtToSrc.clear();
tgtToSrc.resize(numTgtWords);
const std::string digits = "0123456789";
std::string::size_type begin = 0;
while (true) {
std::string::size_type end = s.find("-", begin);
if (end == std::string::npos) {
return;
}
int src = std::atoi(s.substr(begin, end-begin).c_str());
if (end+1 == s.size()) {
throw Exception("Target index missing");
}
begin = end+1;
end = s.find_first_not_of(digits, begin+1);
int tgt;
if (end == std::string::npos) {
tgt = std::atoi(s.substr(begin).c_str());
tgtToSrc[tgt].insert(src);
return;
} else {
tgt = std::atoi(s.substr(begin, end-begin).c_str());
tgtToSrc[tgt].insert(src);
}
begin = end+1;
}
}
double ScoreStsg::ComputeLexProb(const std::vector<RuleSymbol> &sourceFrontier,
const std::vector<RuleSymbol> &targetFrontier,
const MosesTraining::ALIGNMENT &tgtToSrc)
{
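// Lexical weighting: for each target-side terminal, average the lexical
// probabilities of the source words it is aligned to (unaligned target
// words are scored against NULL), then multiply the per-word averages.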
double lexScore = 1.0;
for (std::size_t i = 0; i < targetFrontier.size(); ++i) {
if (targetFrontier[i].isNonTerminal) {
continue;
}
Vocabulary::IdType tgtId = m_tgtVocab.Lookup(targetFrontier[i].value,
StringPieceCompatibleHash(),
StringPieceCompatibleEquals());
const std::set<std::size_t> &srcIndices = tgtToSrc[i];
if (srcIndices.empty()) {
// Explain unaligned word by NULL.
lexScore *= m_lexTable.PermissiveLookup(Vocabulary::NullId(), tgtId);
} else {
double thisWordScore = 0.0;
for (std::set<std::size_t>::const_iterator p = srcIndices.begin();
p != srcIndices.end(); ++p) {
Vocabulary::IdType srcId =
m_srcVocab.Lookup(sourceFrontier[*p].value,
StringPieceCompatibleHash(),
StringPieceCompatibleEquals());
thisWordScore += m_lexTable.PermissiveLookup(srcId, tgtId);
}
lexScore *= thisWordScore / static_cast<double>(srcIndices.size());
}
}
return lexScore;
}
void ScoreStsg::OpenOutputFileOrDie(const std::string &filename,
OutputFileStream &stream)
{
bool ret = stream.Open(filename);
if (!ret) {
std::ostringstream msg;
msg << "failed to open output file: " << filename;
Error(msg.str());
}
}
void ScoreStsg::ProcessOptions(int argc, char *argv[], Options &options) const
{
namespace po = boost::program_options;
namespace cls = boost::program_options::command_line_style;
// Construct the 'top' of the usage message: the bit that comes before the
// options list.
std::ostringstream usageTop;
usageTop << "Usage: " << GetName()
<< " [OPTION]... EXTRACT LEX TABLE\n\n"
<< "STSG rule scorer\n\n"
<< "Options";
// Construct the 'bottom' of the usage message.
std::ostringstream usageBottom;
usageBottom << "TODO";
// Declare the command line options that are visible to the user.
po::options_description visible(usageTop.str());
visible.add_options()
("GoodTuring",
"apply Good-Turing smoothing to relative frequency probability estimates")
("Hierarchical",
"ignored (included for compatibility with score)")
("Inverse",
"use inverse mode")
("KneserNey",
"apply Kneser-Ney smoothing to relative frequency probability estimates")
("LogProb",
"output log probabilities")
("MinCountHierarchical",
po::value(&options.minCountHierarchical)->
default_value(options.minCountHierarchical),
"filter out rules with frequency < arg (except fully lexical rules)")
("NegLogProb",
"output negative log probabilities")
("NoLex",
"do not compute lexical translation score")
("NoWordAlignment",
"do not output word alignments")
("PCFG",
"include pre-computed PCFG score from extract")
("UnpairedExtractFormat",
"ignored (included for compatibility with score)")
;
// Declare the command line options that are hidden from the user
// (these are used as positional options).
po::options_description hidden("Hidden options");
hidden.add_options()
("ExtractFile",
po::value(&options.extractFile),
"extract file")
("LexFile",
po::value(&options.lexFile),
"lexical probability file")
("TableFile",
po::value(&options.tableFile),
"output file")
;
// Compose the full set of command-line options.
po::options_description cmdLineOptions;
cmdLineOptions.add(visible).add(hidden);
// Register the positional options.
po::positional_options_description p;
p.add("ExtractFile", 1);
p.add("LexFile", 1);
p.add("TableFile", 1);
// Process the command-line.
po::variables_map vm;
const int optionStyle = cls::allow_long
| cls::long_allow_adjacent
| cls::long_allow_next;
try {
po::store(po::command_line_parser(argc, argv).style(optionStyle).
options(cmdLineOptions).positional(p).run(), vm);
po::notify(vm);
} catch (const std::exception &e) {
std::ostringstream msg;
msg << e.what() << "\n\n" << visible << usageBottom.str();
Error(msg.str());
}
if (vm.count("help")) {
std::cout << visible << usageBottom.str() << std::endl;
std::exit(0);
}
// Check all positional options were given.
if (!vm.count("ExtractFile") ||
!vm.count("LexFile") ||
!vm.count("TableFile")) {
std::cerr << visible << usageBottom.str() << std::endl;
std::exit(1);
}
// Process Boolean options.
if (vm.count("GoodTuring")) {
options.goodTuring = true;
}
if (vm.count("Inverse")) {
options.inverse = true;
}
if (vm.count("KneserNey")) {
options.kneserNey = true;
}
if (vm.count("LogProb")) {
options.logProb = true;
}
if (vm.count("NegLogProb")) {
options.negLogProb = true;
}
if (vm.count("NoLex")) {
options.noLex = true;
}
if (vm.count("NoWordAlignment")) {
options.noWordAlignment = true;
}
if (vm.count("PCFG")) {
options.pcfg = true;
}
}
void ScoreStsg::Error(const std::string &msg) const
{
std::cerr << GetName() << ": " << msg << std::endl;
std::exit(1);
}
} // namespace ScoreStsg
} // namespace Moses

View File

@@ -0,0 +1,75 @@
#pragma once
#include <map>
#include <ostream>
#include <set>
#include <string>
#include <vector>
#include "ExtractionPhrasePair.h"
#include "LexicalTable.h"
#include "Options.h"
#include "RuleSymbol.h"
#include "TokenizedRuleHalf.h"
#include "Vocabulary.h"
namespace Moses
{
class OutputFileStream;
namespace ScoreStsg
{
class RuleGroup;
class RuleTableWriter;
class ScoreStsg
{
public:
ScoreStsg();
const std::string &GetName() const {
return m_name;
}
int Main(int argc, char *argv[]);
private:
static const int kCountOfCountsMax;
double ComputeLexProb(const std::vector<RuleSymbol> &,
const std::vector<RuleSymbol> &,
const MosesTraining::ALIGNMENT &);
void Error(const std::string &) const;
void OpenOutputFileOrDie(const std::string &, OutputFileStream &);
void ParseAlignmentString(const std::string &, int,
MosesTraining::ALIGNMENT &);
void ProcessOptions(int, char *[], Options &) const;
void ProcessRuleGroup(const RuleGroup &, RuleTableWriter &);
void ProcessRuleGroupOrDie(const RuleGroup &, RuleTableWriter &,
std::size_t, std::size_t);
void TokenizeRuleHalf(const std::string &, TokenizedRuleHalf &);
std::string m_name;
Options m_options;
Vocabulary m_srcVocab;
Vocabulary m_tgtVocab;
LexicalTable m_lexTable;
std::vector<int> m_countOfCounts;
int m_totalDistinct;
TokenizedRuleHalf m_sourceHalf;
TokenizedRuleHalf m_targetHalf;
MosesTraining::ALIGNMENT m_tgtToSrc;
};
} // namespace ScoreStsg
} // namespace Moses

View File

@@ -0,0 +1,37 @@
#include "TokenizedRuleHalf.h"
namespace Moses
{
namespace ScoreStsg
{
bool TokenizedRuleHalf::IsFullyLexical() const
{
for (std::vector<RuleSymbol>::const_iterator p = frontierSymbols.begin();
p != frontierSymbols.end(); ++p) {
if (p->isNonTerminal) {
return false;
}
}
return true;
}
bool TokenizedRuleHalf::IsString() const
{
// A rule half is either a string (like "[X] and [X]") or a tree (like
// "[NP [NP] [CC and] [NP]]").
//
// A string must start with a terminal or a non-terminal (in square brackets).
// A tree must start with '[' followed by a word then either another word or
// another '['.
return (tokens[0].type == TreeFragmentToken_WORD ||
tokens[2].type == TreeFragmentToken_RSB);
}
bool TokenizedRuleHalf::IsTree() const
{
return !IsString();
}
} // namespace ScoreStsg
} // namespace Moses

View File

@@ -0,0 +1,45 @@
#pragma once
#include <string>
#include <vector>
#include "RuleSymbol.h"
#include "TreeFragmentTokenizer.h"
namespace Moses
{
namespace ScoreStsg
{
// Stores one half of a STSG rule, as represented in the extract file. The
// original string is stored as the member 'string', along with its token
// sequence ('tokens') and frontier symbol sequence ('frontierSymbols'). Note
// that 'tokens' and 'frontierSymbols' use StringPiece objects that depend on
// the original string. Therefore changing the value of 'string' invalidates
// both 'tokens' and 'frontierSymbols'.
struct TokenizedRuleHalf
{
bool IsFullyLexical() const;
bool IsString() const;
bool IsTree() const;
// The rule half as it appears in the extract file, except with any trailing
// or leading spaces removed (here a space is defined as a blank or a tab).
std::string string;
// The token sequence for the string.
std::vector<TreeFragmentToken> tokens;
// The frontier symbols of the rule half. For example:
//
// string: "[VP [VBN] [PP [IN] [NP [DT] [JJ positive] [NN light]]]]"
// frontier: ("VBN",t), ("IN",t), ("DT",t), ("positive",f), ("light",f)
//
// string: "[X] [X] Sinne [X]"
// frontier: ("X",t), ("X",t), ("Sinne",f), ("X",t)
//
std::vector<RuleSymbol> frontierSymbols;
};
} // namespace ScoreStsg
} // namespace Moses

View File

@@ -0,0 +1,90 @@
#include "TreeFragmentTokenizer.h"
#include <cctype>
namespace Moses
{
namespace ScoreStsg
{
TreeFragmentToken::TreeFragmentToken(TreeFragmentTokenType t,
StringPiece v, std::size_t p)
: type(t)
, value(v)
, pos(p) {
}
TreeFragmentTokenizer::TreeFragmentTokenizer()
: value_(TreeFragmentToken_EOS, "", -1) {
}
TreeFragmentTokenizer::TreeFragmentTokenizer(const StringPiece &s)
: str_(s)
, value_(TreeFragmentToken_EOS, "", -1)
, iter_(s.begin())
, end_(s.end())
, pos_(0) {
++(*this);
}
TreeFragmentTokenizer &TreeFragmentTokenizer::operator++() {
while (iter_ != end_ && (*iter_ == ' ' || *iter_ == '\t')) {
++iter_;
++pos_;
}
if (iter_ == end_) {
value_ = TreeFragmentToken(TreeFragmentToken_EOS, "", pos_);
return *this;
}
if (*iter_ == '[') {
value_ = TreeFragmentToken(TreeFragmentToken_LSB, "[", pos_);
++iter_;
++pos_;
} else if (*iter_ == ']') {
value_ = TreeFragmentToken(TreeFragmentToken_RSB, "]", pos_);
++iter_;
++pos_;
} else {
std::size_t start = pos_;
while (true) {
++iter_;
++pos_;
if (iter_ == end_ || *iter_ == ' ' || *iter_ == '\t') {
break;
}
if (*iter_ == '[' || *iter_ == ']') {
break;
}
}
StringPiece word = str_.substr(start, pos_-start);
value_ = TreeFragmentToken(TreeFragmentToken_WORD, word, start);
}
return *this;
}
TreeFragmentTokenizer TreeFragmentTokenizer::operator++(int) {
TreeFragmentTokenizer tmp(*this);
++*this;
return tmp;
}
bool operator==(const TreeFragmentTokenizer &lhs,
const TreeFragmentTokenizer &rhs) {
if (lhs.value_.type == TreeFragmentToken_EOS ||
rhs.value_.type == TreeFragmentToken_EOS) {
return lhs.value_.type == TreeFragmentToken_EOS &&
rhs.value_.type == TreeFragmentToken_EOS;
}
return lhs.iter_ == rhs.iter_;
}
bool operator!=(const TreeFragmentTokenizer &lhs,
const TreeFragmentTokenizer &rhs) {
return !(lhs == rhs);
}
} // namespace ScoreStsg
} // namespace Moses

View File

@@ -0,0 +1,70 @@
#pragma once
#include "util/string_piece.hh"
namespace Moses
{
namespace ScoreStsg
{
enum TreeFragmentTokenType {
TreeFragmentToken_EOS,
TreeFragmentToken_LSB,
TreeFragmentToken_RSB,
TreeFragmentToken_WORD
};
struct TreeFragmentToken {
public:
TreeFragmentToken(TreeFragmentTokenType, StringPiece, std::size_t);
TreeFragmentTokenType type;
StringPiece value;
std::size_t pos;
};
// Tokenizes tree fragment strings in Moses format.
//
// For example, the string "[NP [NP [NN a]] [NP]]" is tokenized to the sequence:
//
// 1 LSB "["
// 2 WORD "NP"
// 3 LSB "["
// 4 WORD "NP"
// 5 LSB "["
// 6 WORD "NN"
// 7 WORD "a"
// 8 RSB "]"
// 9 RSB "]"
// 10 LSB "["
// 11 WORD "NP"
// 12 RSB "]"
// 13 RSB "]"
// 14 EOS undefined
//
class TreeFragmentTokenizer {
public:
TreeFragmentTokenizer();
TreeFragmentTokenizer(const StringPiece &);
const TreeFragmentToken &operator*() const { return value_; }
const TreeFragmentToken *operator->() const { return &value_; }
TreeFragmentTokenizer &operator++();
TreeFragmentTokenizer operator++(int);
friend bool operator==(const TreeFragmentTokenizer &,
const TreeFragmentTokenizer &);
friend bool operator!=(const TreeFragmentTokenizer &,
const TreeFragmentTokenizer &);
private:
StringPiece str_;
TreeFragmentToken value_;
StringPiece::const_iterator iter_;
StringPiece::const_iterator end_;
std::size_t pos_;
};
} // namespace ScoreStsg
} // namespace Moses

View File

@@ -0,0 +1,13 @@
#pragma once
#include <string>
#include "NumberedSet.h"
namespace Moses {
namespace ScoreStsg {
typedef NumberedSet<std::string, std::size_t> Vocabulary;
} // namespace ScoreStsg
} // namespace Moses