Merge pull request #165 from moses-smt/mjdenkowski

Phrase distance feature
Michael Denkowski 2016-08-12 12:20:23 -04:00 committed by GitHub
commit 950c7de458
36 changed files with 435 additions and 59 deletions
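A quick orientation for reviewers: the sketch below shows how the pieces in this diff are meant to fit together at run time. The moses.ini lines and file names are illustrative assumptions, not taken from this commit; only the parameter names (space, measure with choices euc/var, the phrase table's coord=name:file list) and the coord XML tag come from the code changes below.

[feature]
# two dense scores: log distance to the input coordinate, log distance to the phrase's own centroid
PhraseDistanceFeature name=PD0 space=domain measure=euc
# suffix-array phrase table loads one coordinate line per training sentence for space "domain"
PhraseDictionaryBitextSampling name=PT0 ... coord=domain:train.domain-coords.gz

[weight]
PD0= 0.1 0.1

# decoder input (with XML handling enabled): tag each sentence with its coordinate in the same space
<coord space="domain" coord="0.2 1.5" /> the sentence to translate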

View File

@ -1319,6 +1319,16 @@
<name>FF/PhraseBoundaryFeature.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/PhraseBoundaryFeature.h</locationURI>
</link>
<link>
<name>FF/PhraseDistanceFeature.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/PhraseDistanceFeature.cpp</locationURI>
</link>
<link>
<name>FF/PhraseDistanceFeature.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/PhraseDistanceFeature.h</locationURI>
</link>
<link>
<name>FF/PhraseLengthFeature.cpp</name>
@ -3654,7 +3664,7 @@
<name>TranslationModel/UG/sapt_pscore_coherence.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_coherence.h</locationURI>
</link>
</link>
<link>
<name>TranslationModel/UG/sapt_pscore_lex1.h</name>
<type>1</type>
@ -3699,7 +3709,7 @@
<name>TranslationModel/UG/sapt_pscore_wordcount.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_wordcount.h</locationURI>
</link>
</link>
<link>
<name>TranslationModel/UG/sim-pe.cc</name>
<type>1</type>

View File

@ -30,6 +30,7 @@
#include "moses/FF/TargetBigramFeature.h"
#include "moses/FF/TargetNgramFeature.h"
#include "moses/FF/PhraseBoundaryFeature.h"
#include "moses/FF/PhraseDistanceFeature.h"
#include "moses/FF/PhrasePairFeature.h"
#include "moses/FF/RulePairUnlexicalizedSource.h"
#include "moses/FF/PhraseLengthFeature.h"
@ -252,6 +253,7 @@ FeatureRegistry::FeatureRegistry()
MOSES_FNAME(SourceWordDeletionFeature);
MOSES_FNAME(TargetWordInsertionFeature);
MOSES_FNAME(PhraseBoundaryFeature);
MOSES_FNAME(PhraseDistanceFeature);
MOSES_FNAME(PhraseLengthFeature);
MOSES_FNAME(WordTranslationFeature);
MOSES_FNAME(TargetBigramFeature);

View File

@ -0,0 +1,123 @@
#include "PhraseDistanceFeature.h"
#include <algorithm>
#include <cmath>
#include <vector>
#include <boost/foreach.hpp>
#include "moses/InputType.h"
#include "moses/ScoreComponentCollection.h"
#include "moses/StaticData.h"
#include "util/exception.hh"
using namespace std;
namespace Moses
{
PhraseDistanceFeature::PhraseDistanceFeature(const string &line)
: StatelessFeatureFunction(2, line)
, m_space("")
, m_spaceID(0)
, m_measure(EuclideanDistance)
{
ReadParameters();
}
void PhraseDistanceFeature::EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection *estimatedScores) const
{
vector<float> scores(m_numScoreComponents, 0);
bool broken = false;
// Input coord
map<size_t const, vector<float> >::const_iterator ii;
if (input.m_coordMap) {
ii = input.m_coordMap->find(m_spaceID);
if (ii == input.m_coordMap->end()) {
TRACE_ERR("No coordinates for space " << m_space << " on input (specify with coord XML tag)" << endl);
TRACE_ERR("Scores for " << m_description << " will be incorrect and probably all zeros" << endl);
broken = true;
}
} else {
// No coordinate map at all: do not dereference the null pointer below
TRACE_ERR("No coordinates for space " << m_space << " on input (specify with coord XML tag)" << endl);
TRACE_ERR("Scores for " << m_description << " will be incorrect and probably all zeros" << endl);
broken = true;
}
// Target phrase coord
vector<SPTR<vector<float> > > const* tpp = targetPhrase.GetCoordList(m_spaceID);
if (tpp == NULL) {
TRACE_ERR("No coordinates for space " << m_space << " on target phrase (PhraseDictionary implementation needs to set)" << endl);
TRACE_ERR("Scores for " << m_description << " will be incorrect and probably all zeros" << endl);
broken = true;
}
// Compute scores
if (!broken) {
vector<float> const& inputCoord = ii->second;
vector<SPTR<vector<float> > > const& tpCoord = *tpp;
// Centroid of target phrase instances (from phrase extraction)
vector<float> centroid = vector<float>(inputCoord.size(), 0);
BOOST_FOREACH(SPTR<vector<float> > const coord, tpCoord) {
for (size_t i = 0; i < inputCoord.size(); ++i) {
centroid[i] += (*coord)[i];
}
}
for (size_t i = 0; i < inputCoord.size(); ++i) {
centroid[i] /= tpCoord.size();
}
// Average distance from the target phrase instances to (1) the input and
// (2) the target phrase centroid
float inputDistance = 0;
float centroidDistance = 0;
if (m_measure == EuclideanDistance) {
BOOST_FOREACH(SPTR<vector<float> > const coord, tpCoord) {
float pointInputDistance = 0;
float pointCentroidDistance = 0;
for (size_t i = 0; i < inputCoord.size(); ++i) {
pointInputDistance += pow(inputCoord[i] - (*coord)[i], 2);
pointCentroidDistance += pow(centroid[i] - (*coord)[i], 2);
}
inputDistance += sqrt(pointInputDistance);
centroidDistance += sqrt(pointCentroidDistance);
}
} else if (m_measure == TotalVariationDistance) {
BOOST_FOREACH(SPTR<vector<float> > const coord, tpCoord) {
float pointInputDistance = 0;
float pointCentroidDistance = 0;
for (size_t i = 0; i < inputCoord.size(); ++i) {
pointInputDistance += std::fabs(inputCoord[i] - (*coord)[i]); // fabs: avoid binding to the int abs() overload
pointCentroidDistance += std::fabs(centroid[i] - (*coord)[i]);
}
inputDistance += pointInputDistance / 2;
centroidDistance += pointCentroidDistance / 2;
}
}
inputDistance /= tpCoord.size();
centroidDistance /= tpCoord.size();
// Log transform scores, max with float epsilon to avoid domain error
scores[0] = log(max(inputDistance, Moses::FLOAT_EPSILON));
scores[1] = log(max(centroidDistance, Moses::FLOAT_EPSILON));
}
// Set scores
scoreBreakdown.Assign(this, scores);
return;
}
void PhraseDistanceFeature::SetParameter(const string& key, const string& value)
{
if (key == "space") {
m_space = value;
m_spaceID = StaticData::InstanceNonConst().MapCoordSpace(m_space);
} else if (key == "measure") {
if (value == "euc") {
m_measure = EuclideanDistance;
} else if (value == "var") {
m_measure = TotalVariationDistance;
} else {
UTIL_THROW2("Unknown measure " << value << ", choices: euc var");
}
} else {
StatelessFeatureFunction::SetParameter(key, value);
}
}
} // namespace
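The distance computation above is compact but easy to misread. Here is a minimal standalone sketch of the same math (my own names, no Moses types), for one hypothetical target phrase whose sampled instances carry 2-D coordinates; it reproduces the centroid, the averaged Euclidean distances, and the epsilon-floored log transform:

// phrase_distance_sketch.cc -- illustrative only, mirrors PhraseDistanceFeature's Euclidean path
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  // Coordinates of the input sentence and of three sampled instances of a target phrase
  std::vector<float> input = {0.2f, 1.5f};
  std::vector<std::vector<float> > instances = {{0.0f, 1.0f}, {0.5f, 2.0f}, {0.1f, 1.2f}};
  const float kEps = 1e-6f;  // stand-in for Moses::FLOAT_EPSILON

  // Centroid of the target phrase instances (from phrase extraction)
  std::vector<float> centroid(input.size(), 0.0f);
  for (const auto& c : instances)
    for (size_t i = 0; i < c.size(); ++i) centroid[i] += c[i];
  for (float& v : centroid) v /= instances.size();

  // Average Euclidean distance from the instances to (1) the input and (2) the centroid
  float inputDist = 0, centroidDist = 0;
  for (const auto& c : instances) {
    float di = 0, dc = 0;
    for (size_t i = 0; i < c.size(); ++i) {
      di += std::pow(input[i] - c[i], 2.0f);
      dc += std::pow(centroid[i] - c[i], 2.0f);
    }
    inputDist += std::sqrt(di);
    centroidDist += std::sqrt(dc);
  }
  inputDist /= instances.size();
  centroidDist /= instances.size();

  // Log transform with an epsilon floor, exactly as the feature does
  std::printf("score[0] = %f\n", std::log(std::max(inputDist, kEps)));
  std::printf("score[1] = %f\n", std::log(std::max(centroidDist, kEps)));
  return 0;
}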

View File

@ -0,0 +1,57 @@
#pragma once
#include "StatelessFeatureFunction.h"
namespace Moses
{
class PhraseDistanceFeature : public StatelessFeatureFunction
{
enum Measure
{
EuclideanDistance,
TotalVariationDistance,
};
public:
PhraseDistanceFeature(const std::string &line);
bool IsUseable(const FactorMask &mask) const {
return true;
}
virtual void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedScores) const {
}
void EvaluateWhenApplied(const Hypothesis& hypo,
ScoreComponentCollection* accumulator) const {
}
void EvaluateWhenApplied(const ChartHypothesis &hypo,
ScoreComponentCollection* accumulator) const {
}
void EvaluateWhenApplied(const Syntax::SHyperedge &hyperedge,
ScoreComponentCollection* accumulator) const {
}
void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection *estimatedScores = NULL) const;
void EvaluateTranslationOptionListWithSourceContext(const InputType &input
, const TranslationOptionList &translationOptionList) const {
}
void SetParameter(const std::string& key, const std::string& value);
protected:
Measure m_measure;
std::string m_space;
size_t m_spaceID;
};
} //namespace

View File

@ -68,6 +68,9 @@ public:
size_t m_frontSpanCoveredLength;
// how many words from the beginning are covered
// Coordinates in user-defined spaces (see "coord" XML tag)
SPTR<std::map<size_t const, std::vector<float> > > m_coordMap;
InputType(AllOptions::ptr const& opts, long translationId = 0);
virtual ~InputType();

View File

@ -154,7 +154,8 @@ aux_interpret_xml(std::string& line, std::vector<size_t> & xmlWalls,
bool OK = ProcessAndStripXMLTags(*m_options, line,
m_xmlOptions,
m_reorderingConstraint,
xmlWalls, placeholders);
xmlWalls, placeholders,
*this);
if (!OK) {
TRACE_ERR("Unable to parse XML in line: " << line);
}

View File

@ -936,4 +936,25 @@ void StaticData::ResetWeights(const std::string &denseWeights, const std::string
}
}
size_t StaticData::GetCoordSpace(string space) const
{
map<string, size_t>::const_iterator m = m_coordSpaceMap.find(space);
if(m == m_coordSpaceMap.end()) {
return 0;
}
return m->second;
}
size_t StaticData::MapCoordSpace(string space)
{
map<string, size_t>::const_iterator m = m_coordSpaceMap.find(space);
if (m != m_coordSpaceMap.end()) {
return m->second;
}
size_t id = m_coordSpaceNextID;
m_coordSpaceNextID += 1;
m_coordSpaceMap[space] = id;
return id;
}
} // namespace
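The two helpers above form a small name-to-ID registry shared by feature functions (which register spaces via MapCoordSpace at load time) and XML input parsing (which looks names up via GetCoordSpace at decode time); IDs start at 1 so that 0 can mean "no model uses this space". A minimal stand-alone sketch of that contract, with names of my own:

// coord_space_registry_sketch.cc -- illustrative stand-in for StaticData's coord space map
#include <cstdio>
#include <map>
#include <string>

static std::map<std::string, size_t> spaces;
static size_t nextID = 1;  // 0 is reserved to mean "unknown space"

size_t MapCoordSpace(const std::string& name) {   // registration side (feature functions, phrase tables)
  auto it = spaces.find(name);
  if (it != spaces.end()) return it->second;
  return spaces[name] = nextID++;
}

size_t GetCoordSpace(const std::string& name) {   // lookup side (coord XML tag)
  auto it = spaces.find(name);
  return it == spaces.end() ? 0 : it->second;
}

int main() {
  std::printf("%zu\n", MapCoordSpace("domain"));   // 1: first registration
  std::printf("%zu\n", MapCoordSpace("domain"));   // 1: same space, same ID
  std::printf("%zu\n", GetCoordSpace("domain"));   // 1
  std::printf("%zu\n", GetCoordSpace("unknown"));  // 0: no model uses this space
}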

View File

@ -60,7 +60,7 @@ class PhraseDictionaryDynamicCacheBased;
typedef std::pair<std::string, float> UnknownLHSEntry;
typedef std::vector<UnknownLHSEntry> UnknownLHSList;
/** Contains global variables and contants.
/** Contains global variables and constants.
* Only 1 object of this class should be instantiated.
* A const object of this class is accessible by any function during decoding by calling StaticData::Instance();
*/
@ -152,6 +152,12 @@ protected:
bool ini_performance_options();
void initialize_features();
// Coordinate space name map for matching spaces across XML input ("coord"
// tag) and feature functions that assign or use coordinates on target phrases
std::map< std::string const, size_t > m_coordSpaceMap;
size_t m_coordSpaceNextID = 1;
public:
//! destructor
@ -394,6 +400,9 @@ public:
return m_requireSortingAfterSourceContext;
}
// Coordinate spaces
size_t GetCoordSpace(std::string space) const;
size_t MapCoordSpace(std::string space);
};
}

View File

@ -125,6 +125,7 @@ TargetPhrase::TargetPhrase(const Phrase &phrase, const PhraseDictionary *pt)
TargetPhrase::TargetPhrase(const TargetPhrase &copy)
: Phrase(copy)
, m_cached_coord(copy.m_cached_coord)
, m_cached_scores(copy.m_cached_scores)
, m_scope(copy.m_scope)
, m_futureScore(copy.m_futureScore)
@ -333,6 +334,31 @@ SetExtraScores(FeatureFunction const* ff,
m_cached_scores[ff] = s;
}
vector<SPTR<vector<float> > > const*
TargetPhrase::
GetCoordList(size_t const spaceID) const
{
if(!m_cached_coord) {
return NULL;
}
CoordCache_t::const_iterator m = m_cached_coord->find(spaceID);
if(m == m_cached_coord->end()) {
return NULL;
}
return &m->second;
}
void
TargetPhrase::
PushCoord(size_t const spaceID,
SPTR<vector<float> > const coord)
{
if (!m_cached_coord) {
m_cached_coord.reset(new CoordCache_t);
}
vector<SPTR<vector<float> > >& coordList = (*m_cached_coord)[spaceID];
coordList.push_back(coord);
}
void TargetPhrase::SetProperties(const StringPiece &str)
{

View File

@ -56,9 +56,13 @@ public:
Scores const* GetExtraScores(FeatureFunction const* ff) const;
void SetExtraScores(FeatureFunction const* ff,boost::shared_ptr<Scores> const& scores);
typedef std::map<size_t const, std::vector<SPTR<std::vector<float> > > > CoordCache_t;
std::vector<SPTR<std::vector<float> > > const* GetCoordList(size_t const spaceID) const;
void PushCoord(size_t const spaceID, SPTR<std::vector<float> > const coord);
private:
ScoreCache_t m_cached_scores;
SPTR<CoordCache_t> m_cached_coord;
WPTR<ContextScope> m_scope;
private:

View File

@ -130,7 +130,6 @@ namespace sapt
mutable pplist_cache_t m_pplist_cache1, m_pplist_cache2;
// caches for unbiased sampling; biased sampling uses the caches that
// are stored locally on the translation task
public:
SPTR<Ttrack<char> > Tx; // word alignments
SPTR<Ttrack<Token> > T1; // token track
@ -164,7 +163,8 @@ namespace sapt
#ifndef NO_MOSES
SPTR<pstats>
prep2(ttasksptr const& ttask, iter const& phrase, int max_sample = -1) const;
prep2(ttasksptr const& ttask, iter const& phrase, bool const track_sids,
int max_sample = -1) const;
#endif
protected:
@ -189,7 +189,7 @@ namespace sapt
SPTR<pstats>
lookup(ttasksptr const& ttask, iter const& phrase, int max_sample = -1) const;
void prep(ttasksptr const& ttask, iter const& phrase) const;
void prep(ttasksptr const& ttask, iter const& phrase, bool const track_sids) const;
#endif
void setDefaultSampleSize(size_t const max_samples);

View File

@ -33,7 +33,8 @@ public:
SPTR<pstats>
add_job(Bitext<Token> const* const theBitext,
typename TSA<Token>::tree_iterator const& phrase,
size_t const max_samples, SPTR<SamplingBias const> const& bias);
size_t const max_samples, SPTR<SamplingBias const> const& bias,
bool const track_sids);
// add_job(Bitext<Token> const* const theBitext,
// typename TSA<Token>::tree_iterator const& phrase,
// size_t const max_samples, SamplingBias const* const bias);
@ -93,13 +94,14 @@ SPTR<pstats> Bitext<Token>
::agenda
::add_job(Bitext<Token> const* const theBitext,
typename TSA<Token>::tree_iterator const& phrase,
size_t const max_samples, SPTR<SamplingBias const> const& bias)
size_t const max_samples, SPTR<SamplingBias const> const& bias,
bool const track_sids)
{
boost::unique_lock<boost::mutex> lk(this->lock);
static boost::posix_time::time_duration nodelay(0,0,0,0);
bool fwd = phrase.root == bt.I1.get();
SPTR<job> j(new job(theBitext, phrase, fwd ? bt.I1 : bt.I2,
max_samples, fwd, bias));
max_samples, fwd, bias, track_sids));
j->stats->register_worker();
joblist.push_back(j);

View File

@ -35,6 +35,8 @@ public:
SPTR<pstats> stats; // stores statistics collected during sampling
SPTR<SamplingBias const> const m_bias; // sentence-level bias for sampling
float bias_total;
bool m_track_sids; // track sentence ids in sample?
bool nextSample(uint64_t & sid, uint64_t & offset); // select next occurrence
int
@ -46,7 +48,7 @@ public:
job(Bitext<Token> const* const theBitext,
typename TSA<Token>::tree_iterator const& m,
SPTR<TSA<Token> > const& r, size_t maxsmpl, bool isfwd,
SPTR<SamplingBias const> const& bias);
SPTR<SamplingBias const> const& bias, bool const track_sids);
~job();
};
@ -66,7 +68,8 @@ Bitext<Token>::agenda::job
::job(Bitext<Token> const* const theBitext,
typename TSA<Token>::tree_iterator const& m,
SPTR<TSA<Token> > const& r, size_t maxsmpl,
bool isfwd, SPTR<SamplingBias const> const& bias)
bool isfwd, SPTR<SamplingBias const> const& bias,
bool const track_sids)
: m_bitext(theBitext)
, rnd(0)
, rnddenom(rnd.max() + 1.)
@ -80,8 +83,9 @@ Bitext<Token>::agenda::job
, len(m.size())
, fwd(isfwd)
, m_bias(bias)
, m_track_sids(track_sids)
{
stats.reset(new pstats());
stats.reset(new pstats(m_track_sids));
stats->raw_cnt = m.approxOccurrenceCount();
bias_total = 0;

View File

@ -90,7 +90,7 @@ Bitext<Token>::agenda
size_t raw2 = b->approxOccurrenceCount();
float bwgt = j->m_bias ? (*j->m_bias)[sid] : 1;
j->stats->add(tpid, sample_weight, bwgt, aln, raw2,
po_fwd, po_bwd, docid);
po_fwd, po_bwd, docid, sid);
bool ok = (i == e2) || b->extend(o[i].id());
UTIL_THROW_IF2(!ok, "Could not extend target phrase.");
}

View File

@ -29,6 +29,7 @@ namespace sapt
my_wcnt = other.wcnt();
my_bcnt = other.bcnt();
my_aln = other.aln();
sids = other.sids;
indoc = other.indoc;
for (int i = 0; i <= LRModel::NONE; i++)
{
@ -56,7 +57,8 @@ namespace sapt
size_t
jstats::
add(float w, float b, std::vector<unsigned char> const& a, uint32_t const cnt2,
uint32_t fwd_orient, uint32_t bwd_orient, int const docid)
uint32_t fwd_orient, uint32_t bwd_orient, int const docid,
uint32_t const sid, bool const track_sid)
{
boost::lock_guard<boost::mutex> lk(this->lock);
my_cnt2 = cnt2;
@ -76,6 +78,13 @@ namespace sapt
}
++ofwd[fwd_orient];
++obwd[bwd_orient];
// Record sentence id if requested
if (track_sid)
{
if (!sids)
sids.reset(new std::vector<uint32_t>);
sids->push_back(sid);
}
if (docid >= 0)
{
// while (int(indoc.size()) <= docid) indoc.push_back(0);
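The track_sids plumbing threaded through the bitext, agenda, pstats and jstats changes boils down to this: when requested, each phrase pair's statistics lazily allocate a sentence-id list and append the id of every sample. A toy stand-in (my own types, not the sapt classes):

// sid_tracking_sketch.cc -- distilled version of the optional sentence-id recording
#include <cstdint>
#include <cstdio>
#include <memory>
#include <vector>

struct JStats {                                        // stand-in for sapt::jstats
  std::shared_ptr<std::vector<uint32_t> > sids;        // allocated lazily, only when tracking
  void add(uint32_t sid, bool track_sid) {
    if (track_sid) {
      if (!sids) sids.reset(new std::vector<uint32_t>);
      sids->push_back(sid);                            // remember where this sample came from
    }
  }
};

int main() {
  JStats js;
  js.add(42, /*track_sid=*/true);
  js.add(7,  /*track_sid=*/true);
  std::printf("%zu sampled sentence ids\n", js.sids ? js.sids->size() : (size_t)0);  // 2
}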

View File

@ -28,6 +28,7 @@ namespace sapt
uint32_t obwd[LRModel::NONE+1]; // backward distortion type counts
public:
SPTR<std::vector<uint32_t> > sids; // list of sentence ids in this sample
std::map<uint32_t,uint32_t> indoc;
// std::vector<uint32_t> indoc; // counts origin of samples (for biased sampling)
jstats();
@ -41,7 +42,8 @@ namespace sapt
size_t
add(float w, float b, std::vector<unsigned char> const& a, uint32_t const cnt2,
uint32_t fwd_orient, uint32_t bwd_orient, int const docid);
uint32_t fwd_orient, uint32_t bwd_orient, int const docid, uint32_t const sid,
bool const track_sid);
void invalidate();
void validate();

View File

@ -30,9 +30,9 @@ lookup(ttasksptr const& ttask, iter const& phrase, int max_sample) const
template<typename Token>
void
Bitext<Token>::
prep(ttasksptr const& ttask, iter const& phrase) const
prep(ttasksptr const& ttask, iter const& phrase, bool const track_sids) const
{
prep2(ttask, phrase, m_default_sample_size);
prep2(ttask, phrase, track_sids, m_default_sample_size);
}
@ -44,7 +44,8 @@ template<typename Token>
SPTR<pstats>
Bitext<Token>
::prep2
( ttasksptr const& ttask, iter const& phrase, int max_sample) const
( ttasksptr const& ttask, iter const& phrase, bool const track_sids,
int max_sample) const
{
if (max_sample < 0) max_sample = m_default_sample_size;
SPTR<SamplingBias> bias;
@ -74,7 +75,7 @@ Bitext<Token>
if (m_num_workers > 1)
ag->add_workers(m_num_workers);
}
ret = ag->add_job(this, phrase, max_sample, bias);
ret = ag->add_job(this, phrase, max_sample, bias, track_sids);
if (cache) cache->set(phrase.getPid(),ret);
UTIL_THROW_IF2(ret == NULL, "Couldn't schedule sampling job.");
return ret;

View File

@ -10,7 +10,7 @@ namespace sapt
#endif
pstats::
pstats() : raw_cnt(0), sample_cnt(0), good(0), sum_pairs(0), in_progress(0)
pstats(bool const track_sids) : raw_cnt(0), sample_cnt(0), good(0), sum_pairs(0), in_progress(0), track_sids(track_sids)
{
for (int i = 0; i <= LRModel::NONE; ++i)
ofwd[i] = obwd[i] = 0;
@ -69,11 +69,11 @@ namespace sapt
std::vector<unsigned char> const& a,
uint32_t const cnt2,
uint32_t fwd_o,
uint32_t bwd_o, int const docid)
uint32_t bwd_o, int const docid, uint32_t const sid)
{
boost::lock_guard<boost::mutex> guard(this->lock);
jstats& entry = this->trg[pid];
size_t ret = entry.add(w, b, a, cnt2, fwd_o, bwd_o, docid);
size_t ret = entry.add(w, b, a, cnt2, fwd_o, bwd_o, docid, sid, track_sids);
if (this->good < entry.rcnt())
{
UTIL_THROW(util::Exception, "more joint counts than good counts:"

View File

@ -35,7 +35,8 @@ namespace sapt
indoc_map_t indoc;
trg_map_t trg;
pstats();
bool track_sids;
pstats(bool const track_sids);
~pstats();
void release();
void register_worker();
@ -49,7 +50,8 @@ namespace sapt
uint32_t const cnt2, // raw target phrase count
uint32_t fwd_o, // fwd. phrase orientation
uint32_t bwd_o, // bwd. phrase orientation
int const docid); // document where sample was found
int const docid, // document where sample was found
uint32_t const sid); // index of sentence where sample was found
void
count_sample(int const docid, // document where sample was found

View File

@ -70,6 +70,7 @@ BitextSampler : public Moses::reference_counter
size_t m_num_occurrences; // estimated number of phrase occurrences in corpus
boost::taus88 m_rnd; // every job has its own pseudo random generator
double m_bias_total;
bool m_track_sids; // track sentence ids in stats?
size_t consider_sample(TokenPosition const& p);
size_t perform_random_sampling();
@ -86,7 +87,8 @@ public:
SPTR<SamplingBias const> const& bias,
size_t const min_samples,
size_t const max_samples,
sampling_method const method);
sampling_method const method,
bool const track_sids);
~BitextSampler();
SPTR<pstats> stats();
bool done() const;
@ -185,7 +187,7 @@ BitextSampler<Token>::
BitextSampler(SPTR<Bitext<Token> const> const& bitext,
typename bitext::iter const& phrase,
SPTR<SamplingBias const> const& bias, size_t const min_samples, size_t const max_samples,
sampling_method const method)
sampling_method const method, bool const track_sids)
: m_bitext(bitext)
, m_plen(phrase.size())
, m_fwd(phrase.root == bitext->I1.get())
@ -201,8 +203,9 @@ BitextSampler(SPTR<Bitext<Token> const> const& bitext,
, m_finished(false)
, m_num_occurrences(phrase.ca())
, m_rnd(0)
, m_track_sids(track_sids)
{
m_stats.reset(new pstats);
m_stats.reset(new pstats(m_track_sids));
m_stats->raw_cnt = phrase.ca();
m_stats->register_worker();
}
@ -332,7 +335,8 @@ consider_sample(TokenPosition const& p)
size_t raw2 = b->approxOccurrenceCount();
size_t evid = m_stats->add(tpid, sample_weight,
m_bias ? (*m_bias)[p.sid] : 1,
aln, raw2, rec.po_fwd, rec.po_bwd, docid);
aln, raw2, rec.po_fwd, rec.po_bwd, docid,
p.sid);
max_evidence = std::max(max_evidence, evid);
bool ok = (i == rec.e2) || b->extend(o[i].id());
UTIL_THROW_IF2(!ok, "Could not extend target phrase.");

View File

@ -31,6 +31,8 @@ namespace sapt
std::vector<unsigned char> aln;
float score;
bool inverse;
SPTR<std::vector<uint32_t> > sids; // list of sampled sentence ids where
// this phrase pair was found
// std::vector<uint32_t> indoc;
std::map<uint32_t,uint32_t> indoc;
PhrasePair() { };
@ -132,6 +134,7 @@ namespace sapt
dbwd[i] = js.dcnt_bwd(po);
}
sids = js.sids;
indoc = js.indoc;
return *this;
}
@ -182,6 +185,8 @@ namespace sapt
sample2 += o.sample2;
cum_bias += o.cum_bias;
// todo: add distortion counts
if (sids && o.sids)
sids->insert(sids->end(), o.sids->begin(), o.sids->end());
return *this;
}
@ -199,6 +204,7 @@ namespace sapt
, aln(o.aln)
, score(o.score)
, inverse(o.inverse)
, sids(o.sids)
, indoc(o.indoc)
{
for (int i = 0; i <= LRModel::NONE; ++i)

View File

@ -275,6 +275,40 @@ namespace Moses
m = param.find("name");
if (m != param.end()) m_name = m->second;
// Optional coordinates for training corpus
// Takes form coord=name1:file1.gz,name2:file2.gz,...
// Names should match with XML input (coord tag)
param.insert(pair<string,string>("coord","0"));
if(param["coord"] != "0")
{
m_track_coord = true;
vector<string> coord_instances = Tokenize(param["coord"], ",");
BOOST_FOREACH(std::string instance, coord_instances)
{
vector<string> toks = Moses::Tokenize(instance, ":");
string space = toks[0];
string file = toks[1];
// Register that this model uses the given space
m_coord_spaces.push_back(StaticData::InstanceNonConst().MapCoordSpace(space));
// Load sid coordinates from file
m_sid_coord_list.push_back(vector<SPTR<vector<float> > >());
vector<SPTR<vector<float> > >& sid_coord = m_sid_coord_list[m_sid_coord_list.size() - 1];
//TODO: support extra data for btdyn, here? extra?
sid_coord.reserve(btfix->T1->size());
string line;
cerr << "Loading coordinate lines for space \"" << space << "\" from " << file << endl;
iostreams::filtering_istream in;
ugdiss::open_input_stream(file, in);
while(getline(in, line))
{
SPTR<vector<float> > coord(new vector<float>);
Scan<float>(*coord, Tokenize(line));
sid_coord.push_back(coord);
}
cerr << "Loaded " << sid_coord.size() << " lines" << endl;
}
}
// check for unknown parameters
vector<string> known_parameters; known_parameters.reserve(50);
known_parameters.push_back("L1");
@ -290,6 +324,7 @@ namespace Moses
known_parameters.push_back("cache");
known_parameters.push_back("coh");
known_parameters.push_back("config");
known_parameters.push_back("coord");
known_parameters.push_back("cumb");
known_parameters.push_back("extra");
known_parameters.push_back("feature-sets");
@ -616,6 +651,29 @@ namespace Moses
}
#endif
// Track coordinates if requested
if (m_track_coord)
{
BOOST_FOREACH(uint32_t const sid, *pool.sids)
{
for(size_t i = 0; i < m_coord_spaces.size(); ++i)
{
tp->PushCoord(m_coord_spaces[i], m_sid_coord_list[i][sid]);
}
}
/*
cerr << btfix->toString(pool.p1, 0) << " ::: " << btfix->toString(pool.p2, 1);
BOOST_FOREACH(size_t id, m_coord_spaces)
{
cerr << " [" << id << "]";
vector<vector<float> const*> const* coordList = tp->GetCoordList(id);
BOOST_FOREACH(vector<float> const* coord, *coordList)
cerr << " : " << Join(" ", *coord);
}
cerr << endl;
*/
}
return tp;
}
@ -691,7 +749,7 @@ namespace Moses
SPTR<ContextScope> const& scope = ttask->GetScope();
SPTR<TPCollCache> cache = scope->get<TPCollCache>(cache_key);
if (!cache) cache = m_cache; // no context-specific cache, use global one
ret = cache->get(phrasekey, dyn->revision());
// TO DO: we should revise the revision mechanism: we take the
// length of the dynamic bitext (in sentences) at the time the PT
@ -705,12 +763,12 @@ namespace Moses
// std::cerr << ret << " with " << ret->refCount << " references at "
// << HERE << std::endl;
boost::upgrade_lock<boost::shared_mutex> rlock(ret->lock);
if (ret->GetSize()) return ret;
if (ret->GetSize()) return ret;
// new TPC (not found or old one was not up to date)
boost::upgrade_to_unique_lock<boost::shared_mutex> wlock(rlock);
// maybe another thread did the work while we waited for the lock ?
if (ret->GetSize()) return ret;
if (ret->GetSize()) return ret;
// OK: pt entry NOT found or NOT up to date
// lookup and expansion could be done in parallel threads,
@ -730,7 +788,8 @@ namespace Moses
BitextSampler<Token> s(btfix, mfix, context->bias,
m_min_sample_size,
m_default_sample_size,
m_sampling_method);
m_sampling_method,
m_track_coord);
s();
sfix = s.stats();
}
@ -918,7 +977,7 @@ namespace Moses
{
BitextSampler<Token> s(btfix, mfix, context->bias,
m_min_sample_size, m_default_sample_size,
m_sampling_method);
m_sampling_method, m_track_coord);
if (*context->cache1->get(pid, s.stats()) == s.stats())
m_thread_pool->add(s);
}
@ -939,7 +998,7 @@ namespace Moses
for (size_t i = 0; mdyn.size() == i && i < myphrase.size(); ++i)
mdyn.extend(myphrase[i]);
// let's assume a uniform bias over the foreground corpus
if (mdyn.size() == myphrase.size()) dyn->prep(ttask, mdyn);
if (mdyn.size() == myphrase.size()) dyn->prep(ttask, mdyn, m_track_coord);
}
return mdyn.size() == myphrase.size();
}
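For context on the coord= option added above: for each named space it expects a plain-text (optionally gzipped) file with one whitespace-separated coordinate vector per sentence of the fixed training bitext, in corpus order; the sentence ids tracked during sampling are then used to attach those vectors to sampled target phrases, where PhraseDistanceFeature can read them. A hypothetical single-space setup (names and values are illustrative):

PhraseDictionaryBitextSampling ... coord=domain:train.domain-coords.gz

train.domain-coords.gz, one line per training sentence:
0.0 1.0
0.5 2.0
0.1 1.2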

View File

@ -119,6 +119,12 @@ namespace Moses
std::vector<SPTR<pscorer > > m_active_ff_common;
// activated feature functions (dyn)
bool m_track_coord = false; // track coordinates? Track sids when sampling
// from bitext, append coords to target phrases
// Space < Sid < sptr sentence coords > >
std::vector<std::vector<SPTR<std::vector<float> > > > m_sid_coord_list;
std::vector<size_t> m_coord_spaces;
void
parse_factor_spec(std::vector<FactorType>& flist, std::string const key);

View File

@ -28,8 +28,8 @@ namespace sapt {
void
operator()(Bitext<Token> const& bt,
PhrasePair<Token>& pp,
std::vector<float> * dest = NULL) const
PhrasePair<Token>& pp,
std::vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
(*dest)[this->m_index] = log(std::max(m_floor,pp.cum_bias));

View File

@ -48,8 +48,8 @@ namespace sapt {
void
operator()(Bitext<Token> const& bt,
PhrasePair<Token>& pp,
std::vector<float> * dest = NULL) const
PhrasePair<Token>& pp,
std::vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
float p = float(bt.T1->numTokens());

View File

@ -36,8 +36,8 @@ namespace sapt
void
operator()(Bitext<Token> const& bt,
PhrasePair<Token>& pp,
std::vector<float> * dest = NULL) const
PhrasePair<Token>& pp,
std::vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
// uint32_t sid1=0,sid2=0,off1=0,off2=0,len1=0,len2=0;

View File

@ -37,8 +37,8 @@ namespace sapt {
void
operator()(Bitext<Token> const& bt,
PhrasePair<Token>& pp,
std::vector<float> * dest = NULL) const
PhrasePair<Token>& pp,
std::vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
assert(pp.raw1);

View File

@ -38,8 +38,8 @@ namespace sapt
void
operator()(Bitext<Token> const& bt,
PhrasePair<Token>& pp,
std::vector<float> * dest = NULL) const
PhrasePair<Token>& pp,
std::vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
// we use the denominator specification to scale the raw counts on the

View File

@ -38,8 +38,9 @@ namespace sapt
}
void
operator()(Bitext<Token> const& bt, PhrasePair<Token> & pp,
std::vector<float> * dest = NULL) const
operator()(Bitext<Token> const& bt,
PhrasePair<Token>& pp,
std::vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
if (pp.joint > pp.good1)

View File

@ -22,8 +22,8 @@ namespace sapt
void
operator()(Bitext<Token> const& bt,
PhrasePair<Token>& pp,
std::vector<float> * dest = NULL) const
PhrasePair<Token>& pp,
std::vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
(*dest)[this->m_index] = 1;

View File

@ -28,8 +28,8 @@ namespace sapt {
void
operator()(Bitext<Token> const& bt,
PhrasePair<Token>& pp,
std::vector<float> * dest = NULL) const
PhrasePair<Token>& pp,
std::vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
size_t i = this->m_index;

View File

@ -26,8 +26,8 @@ namespace sapt {
void
operator()(Bitext<Token> const& bt,
PhrasePair<Token>& pp,
std::vector<float> * dest = NULL) const
PhrasePair<Token>& pp,
std::vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
size_t i = this->m_index;

View File

@ -37,8 +37,8 @@ namespace sapt
void
operator()(Bitext<Token> const& bt,
PhrasePair<Token>& pp,
std::vector<float> * dest = NULL) const
PhrasePair<Token>& pp,
std::vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
// uint32_t sid1=0,sid2=0,off1=0,off2=0,len1=0,len2=0;

View File

@ -22,8 +22,8 @@ namespace sapt
void
operator()(Bitext<Token> const& bt,
PhrasePair<Token>& pp,
std::vector<float> * dest = NULL) const
PhrasePair<Token>& pp,
std::vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
(*dest)[this->m_index] = pp.len2;

View File

@ -163,7 +163,8 @@ ProcessAndStripXMLTags(AllOptions const& opts, string &line,
vector<XmlOption const*> &res,
ReorderingConstraint &reorderingConstraint,
vector< size_t > &walls,
std::vector< std::pair<size_t, std::string> > &placeholders)
std::vector< std::pair<size_t, std::string> > &placeholders,
InputType &input)
{
//parse XML markup in translation line
@ -401,6 +402,28 @@ ProcessAndStripXMLTags(AllOptions const& opts, string &line,
StaticData::InstanceNonConst().SetAllWeights(allWeights);
}
// Coord: coordinates of the input sentence in a user-defined space
// <coord space="NAME" coord="X Y Z ..." />
// where NAME is the name of the space and X Y Z ... are floats. See
// PhraseDistanceFeature for an example of using this information for
// feature scoring.
else if (tagName == "coord") {
// Parse tag
string space = ParseXmlTagAttribute(tagContent, "space");
vector<string> tok = Tokenize(ParseXmlTagAttribute(tagContent, "coord"));
size_t id = StaticData::Instance().GetCoordSpace(space);
if (!id) {
TRACE_ERR("ERROR: no models use space " << space << ", will be ignored" << endl);
} else {
// Init if needed
if (!input.m_coordMap) {
input.m_coordMap.reset(new map<size_t const, vector<float> >);
}
vector<float>& coord = (*input.m_coordMap)[id];
Scan<float>(coord, tok);
}
}
// default: opening tag that specifies translation options
else {
if (startPos > endPos) {

View File

@ -34,7 +34,8 @@ bool ProcessAndStripXMLTags(AllOptions const& opts,
std::string &line, std::vector<XmlOption const*> &res,
ReorderingConstraint &reorderingConstraint,
std::vector< size_t > &walls,
std::vector< std::pair<size_t, std::string> > &placeholders);
std::vector< std::pair<size_t, std::string> > &placeholders,
InputType &input);
}