Merge branch 'master' of github.com:moses-smt/mosesdecoder

2024-09-11 19:27:11 +03:00 · 2014-04-23 12:11:50 +01:00 · 2014-04-23 12:11:50 +01:00 · 3f32e48f97
commit 3f32e48f97
parent 00505ba048 ff8ac92be1
13 changed files with 435 additions and 120 deletions
--- a/contrib/server/Jamfile
+++ b/contrib/server/Jamfile
@ -35,7 +35,7 @@ if $(build-moses-server) = true
  xmlrpc-linkflags = [ shell_or_die "$(xmlrpc-command) c++2 abyss-server --libs" ] ;
  xmlrpc-cxxflags = [ shell_or_die "$(xmlrpc-command) c++2 abyss-server --cflags" ] ;

-  exe mosesserver : mosesserver.cpp ../../moses//moses ../../OnDiskPt//OnDiskPt : <linkflags>$(xmlrpc-linkflags) <cxxflags>$(xmlrpc-cxxflags) ;
+  exe mosesserver : mosesserver.cpp ../../moses//moses ../../moses-cmd/IOWrapper.cpp ../../OnDiskPt//OnDiskPt : <linkflags>$(xmlrpc-linkflags) <cxxflags>$(xmlrpc-cxxflags) ;
 } else {
  alias mosesserver ;
 }
--- a/contrib/server/mosesserver.cpp
+++ b/contrib/server/mosesserver.cpp
@ -12,6 +12,7 @@
 #include "moses/TranslationModel/PhraseDictionaryMultiModelCounts.h"
 #include "moses/TreeInput.h"
 #include "moses/LM/ORLM.h"
+#include "moses-cmd/IOWrapper.h"

 #ifdef WITH_THREADS
 #include <boost/thread.hpp>
@ -22,6 +23,7 @@
 #include <xmlrpc-c/server_abyss.hpp>

 using namespace Moses;
+using namespace MosesCmd;
 using namespace std;

 typedef std::map<std::string, xmlrpc_c::value> params_t;
@ -215,6 +217,8 @@ public:
    cerr << "Input: " << source << endl;
    si = params.find("align");
    bool addAlignInfo = (si != params.end());
+    si = params.find("word-align");
+    bool addWordAlignInfo = (si != params.end());
    si = params.find("sg");
    bool addGraphInfo = (si != params.end());
    si = params.find("topt");
@ -278,6 +282,20 @@ public:
        if (addAlignInfo) {
          retData.insert(pair<string, xmlrpc_c::value>("align", xmlrpc_c::value_array(alignInfo)));
        }
+        if (addWordAlignInfo) {
+          stringstream wordAlignment;
+          OutputAlignment(wordAlignment, hypo);
+          vector<xmlrpc_c::value> alignments;
+          string alignmentPair;
+          while (wordAlignment >> alignmentPair) {
+          	int pos = alignmentPair.find('-');
+          	map<string, xmlrpc_c::value> wordAlignInfo;
+          	wordAlignInfo["source-word"] = xmlrpc_c::value_int(atoi(alignmentPair.substr(0, pos).c_str()));
+          	wordAlignInfo["target-word"] = xmlrpc_c::value_int(atoi(alignmentPair.substr(pos + 1).c_str()));
+          	alignments.push_back(xmlrpc_c::value_struct(wordAlignInfo));
+          }
+          retData.insert(pair<string, xmlrpc_c::value_array>("word-align", alignments));
+        }

        if(addGraphInfo) {
          insertGraphInfo(manager,retData);
@ -415,9 +433,25 @@ public:
      }
      nBestXMLItem["hyp"] = xmlrpc_c::value_string(out.str());

-      if (addAlignmentInfo)
+      if (addAlignmentInfo) {
        nBestXMLItem["align"] = xmlrpc_c::value_array(alignInfo);

+        if ((int)edges.size() > 0) {
+          stringstream wordAlignment;
+          OutputAlignment(wordAlignment, edges[0]);
+          vector<xmlrpc_c::value> alignments;
+          string alignmentPair;
+          while (wordAlignment >> alignmentPair) {
+          	int pos = alignmentPair.find('-');
+          	map<string, xmlrpc_c::value> wordAlignInfo;
+          	wordAlignInfo["source-word"] = xmlrpc_c::value_int(atoi(alignmentPair.substr(0, pos).c_str()));
+          	wordAlignInfo["target-word"] = xmlrpc_c::value_int(atoi(alignmentPair.substr(pos + 1).c_str()));
+          	alignments.push_back(xmlrpc_c::value_struct(wordAlignInfo));
+          }
+          nBestXMLItem["word-align"] = xmlrpc_c::value_array(alignments);
+        }
+      }
+
      // weighted score
      nBestXMLItem["totalScore"] = xmlrpc_c::value_double(path.GetTotalScore());
      nBestXml.push_back(xmlrpc_c::value_struct(nBestXMLItem));
--- a/moses-chart-cmd/IOWrapper.cpp
+++ b/moses-chart-cmd/IOWrapper.cpp
@ -553,7 +553,7 @@ void IOWrapper::OutputDetailedTreeFragmentsTranslationReport(

 //DIMw
 void IOWrapper::OutputDetailedAllTranslationReport(
-  const ChartTrellisPathList &nBestList,
+  const std::vector<boost::shared_ptr<Moses::ChartKBestExtractor::Derivation> > &nBestList,
  const ChartManager &manager,
  const Sentence &sentence,
  long translationId)
@ -793,6 +793,58 @@ void IOWrapper::OutputNBestList(const ChartTrellisPathList &nBestList, long tran
  m_nBestOutputCollector->Write(translationId, out.str());
 }

+void IOWrapper::OutputNBestList(const ChartKBestExtractor::KBestVec &nBestList,
+                                long translationId)
+{
+  std::ostringstream out;
+
+  if (m_nBestOutputCollector->OutputIsCout()) {
+    // Set precision only if we're writing the n-best list to cout.  This is to
+    // preserve existing behaviour, but should probably be done either way.
+    IOWrapper::FixPrecision(out);
+  }
+
+  bool includeWordAlignment =
+      StaticData::Instance().PrintAlignmentInfoInNbest();
+
+  for (ChartKBestExtractor::KBestVec::const_iterator p = nBestList.begin();
+       p != nBestList.end(); ++p) {
+    const ChartKBestExtractor::Derivation &derivation = **p;
+
+    // get the derivation's target-side yield
+    Phrase outputPhrase = ChartKBestExtractor::GetOutputPhrase(derivation);
+
+    // delete <s> and </s>
+    UTIL_THROW_IF2(outputPhrase.GetSize() < 2,
+        "Output phrase should have contained at least 2 words (beginning and end-of-sentence)");
+    outputPhrase.RemoveWord(0);
+    outputPhrase.RemoveWord(outputPhrase.GetSize() - 1);
+
+    // print the translation ID, surface factors, and scores
+    out << translationId << " ||| ";
+    OutputSurface(out, outputPhrase, m_outputFactorOrder, false);
+    out << " ||| ";
+    OutputAllFeatureScores(derivation.scoreBreakdown, out);
+    out << " ||| " << derivation.score;
+
+    // optionally, print word alignments
+    if (includeWordAlignment) {
+      out << " ||| ";
+      Alignments align;
+      OutputAlignmentNBest(align, derivation, 0);
+      for (Alignments::const_iterator q = align.begin(); q != align.end();
+           ++q) {
+        out << q->first << "-" << q->second << " ";
+      }
+    }
+
+    out << std::endl;
+  }
+
+  assert(m_nBestOutputCollector);
+  m_nBestOutputCollector->Write(translationId, out.str());
+}
+
 void IOWrapper::OutputNBestList(const std::vector<search::Applied> &nbest, long translationId)
 {
  std::ostringstream out;
@ -927,6 +979,85 @@ size_t IOWrapper::OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartT
  return totalTargetSize;
 }

+size_t IOWrapper::OutputAlignmentNBest(
+    Alignments &retAlign,
+    const Moses::ChartKBestExtractor::Derivation &derivation,
+    size_t startTarget)
+{
+  const ChartHypothesis &hypo = derivation.edge.head->hypothesis;
+
+  size_t totalTargetSize = 0;
+  size_t startSource = hypo.GetCurrSourceRange().GetStartPos();
+
+  const TargetPhrase &tp = hypo.GetCurrTargetPhrase();
+
+  size_t thisSourceSize = CalcSourceSize(&hypo);
+
+  // position of each terminal word in translation rule, irrespective of alignment
+  // if non-term, number is undefined
+  vector<size_t> sourceOffsets(thisSourceSize, 0);
+  vector<size_t> targetOffsets(tp.GetSize(), 0);
+
+  const AlignmentInfo &aiNonTerm = hypo.GetCurrTargetPhrase().GetAlignNonTerm();
+  vector<size_t> sourceInd2pos = aiNonTerm.GetSourceIndex2PosMap();
+  const AlignmentInfo::NonTermIndexMap &targetPos2SourceInd = aiNonTerm.GetNonTermIndexMap();
+
+  UTIL_THROW_IF2(sourceInd2pos.size() != derivation.subderivations.size(),
+                 "Error");
+
+  size_t targetInd = 0;
+  for (size_t targetPos = 0; targetPos < tp.GetSize(); ++targetPos) {
+    if (tp.GetWord(targetPos).IsNonTerminal()) {
+      UTIL_THROW_IF2(targetPos >= targetPos2SourceInd.size(), "Error");
+      size_t sourceInd = targetPos2SourceInd[targetPos];
+      size_t sourcePos = sourceInd2pos[sourceInd];
+
+      const Moses::ChartKBestExtractor::Derivation &subderivation =
+        *derivation.subderivations[sourceInd];
+
+      // calc source size
+      size_t sourceSize = subderivation.edge.head->hypothesis.GetCurrSourceRange().GetNumWordsCovered();
+      sourceOffsets[sourcePos] = sourceSize;
+
+      // calc target size.
+      // Recursively look through the child subderivation
+      size_t currStartTarget = startTarget + totalTargetSize;
+      size_t targetSize = OutputAlignmentNBest(retAlign, subderivation,
+                                               currStartTarget);
+      targetOffsets[targetPos] = targetSize;
+
+      totalTargetSize += targetSize;
+      ++targetInd;
+    } else {
+      ++totalTargetSize;
+    }
+  }
+
+  // convert position within translation rule to absolute position within
+  // source sentence / output sentence
+  ShiftOffsets(sourceOffsets, startSource);
+  ShiftOffsets(targetOffsets, startTarget);
+
+  // get alignments from this hypo
+  const AlignmentInfo &aiTerm = hypo.GetCurrTargetPhrase().GetAlignTerm();
+
+  // add to output arg, offsetting by source & target
+  AlignmentInfo::const_iterator iter;
+  for (iter = aiTerm.begin(); iter != aiTerm.end(); ++iter) {
+    const std::pair<size_t,size_t> &align = *iter;
+    size_t relSource = align.first;
+    size_t relTarget = align.second;
+    size_t absSource = sourceOffsets[relSource];
+    size_t absTarget = targetOffsets[relTarget];
+
+    pair<size_t, size_t> alignPoint(absSource, absTarget);
+    pair<Alignments::iterator, bool> ret = retAlign.insert(alignPoint);
+    UTIL_THROW_IF2(!ret.second, "Error");
+  }
+
+  return totalTargetSize;
+}
+
 void IOWrapper::OutputAlignment(size_t translationId , const Moses::ChartHypothesis *hypo)
 {
  ostringstream out;
--- a/moses-chart-cmd/IOWrapper.h
+++ b/moses-chart-cmd/IOWrapper.h
@ -40,6 +40,7 @@ POSSIBILITY OF SUCH DAMAGE.
 #include "moses/TypeDef.h"
 #include "moses/Sentence.h"
 #include "moses/FactorTypeSet.h"
+#include "moses/ChartKBestExtractor.h"
 #include "moses/ChartTrellisPathList.h"
 #include "moses/OutputCollector.h"
 #include "moses/ChartHypothesis.h"
@ -90,6 +91,7 @@ protected:

  typedef std::set< std::pair<size_t, size_t>  > Alignments;
  size_t OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartTrellisNode &node, size_t startTarget);
+  std::size_t OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartKBestExtractor::Derivation &derivation, std::size_t startTarget);
  size_t OutputAlignment(Alignments &retAlign, const Moses::ChartHypothesis *hypo, size_t startTarget);
  void OutputAlignment(std::vector< std::set<size_t> > &retAlignmentsS2T, const Moses::AlignmentInfo &ai);
  void OutputTranslationOption(std::ostream &out, ApplicationContext &applicationContext, const Moses::ChartHypothesis *hypo, const Moses::Sentence &sentence, long translationId);
@ -129,12 +131,13 @@ public:
  void OutputBestHypo(const std::vector<const Moses::Factor*>&  mbrBestHypo, long translationId);
  void OutputBestNone(long translationId);
  void OutputNBestList(const Moses::ChartTrellisPathList &nBestList, long translationId);
+  void OutputNBestList(const std::vector<boost::shared_ptr<Moses::ChartKBestExtractor::Derivation> > &nBestList, long translationId);
  void OutputNBestList(const std::vector<search::Applied> &nbest, long translationId);
  void OutputDetailedTranslationReport(const Moses::ChartHypothesis *hypo, const Moses::Sentence &sentence, long translationId);
  void OutputDetailedTranslationReport(const search::Applied *applied, const Moses::Sentence &sentence, long translationId);
  void OutputDetailedTreeFragmentsTranslationReport(const Moses::ChartHypothesis *hypo, const Moses::Sentence &sentence, long translationId);
  void OutputDetailedTreeFragmentsTranslationReport(const search::Applied *applied, const Moses::Sentence &sentence, long translationId);
-  void OutputDetailedAllTranslationReport(const Moses::ChartTrellisPathList &nBestList, const Moses::ChartManager &manager, const Moses::Sentence &sentence, long translationId);
+  void OutputDetailedAllTranslationReport(const std::vector<boost::shared_ptr<Moses::ChartKBestExtractor::Derivation> > &nBestList, const Moses::ChartManager &manager, const Moses::Sentence &sentence, long translationId);
  void Backtrack(const Moses::ChartHypothesis *hypo);

  void ResetTranslationId();
--- a/moses-chart-cmd/Main.cpp
+++ b/moses-chart-cmd/Main.cpp
@ -151,7 +151,7 @@ public:
    if (staticData.IsDetailedAllTranslationReportingEnabled()) {
      const Sentence &sentence = dynamic_cast<const Sentence &>(*m_source);
      size_t nBestSize = staticData.GetNBestSize();
-      ChartTrellisPathList nBestList;
+      std::vector<boost::shared_ptr<ChartKBestExtractor::Derivation> > nBestList;
      manager.CalcNBest(nBestSize, nBestList, staticData.GetDistinctNBest());
      m_ioWrapper.OutputDetailedAllTranslationReport(nBestList, manager, sentence, translationId);
    }
@ -160,7 +160,7 @@ public:
    size_t nBestSize = staticData.GetNBestSize();
    if (nBestSize > 0) {
      VERBOSE(2,"WRITING " << nBestSize << " TRANSLATION ALTERNATIVES TO " << staticData.GetNBestFilePath() << endl);
-      ChartTrellisPathList nBestList;
+      std::vector<boost::shared_ptr<ChartKBestExtractor::Derivation> > nBestList;
      manager.CalcNBest(nBestSize, nBestList,staticData.GetDistinctNBest());
      m_ioWrapper.OutputNBestList(nBestList, translationId);
      IFVERBOSE(2) {
--- a/moses-cmd/Main.cpp
+++ b/moses-cmd/Main.cpp
@ -180,6 +180,7 @@ public:
      } else {
        TRACE_ERR("Cannot output HTK standard lattice for line " << m_lineNumber << " because the output file is not open or not ready for writing" << std::endl);
      }
+      delete file;
    }

    // Output search graph in hypergraph format for Kenneth Heafield's lazy hypergraph decoder
@ -233,7 +234,7 @@ public:

        } else {
          stringstream hypergraphDirName;
-          hypergraphDirName << boost::filesystem::current_path() << "/hypergraph";
+          hypergraphDirName << boost::filesystem::current_path().string() << "/hypergraph";
          hypergraphDir = hypergraphDirName.str();
        }
      }
@ -530,9 +531,7 @@ size_t OutputFeatureWeightsForHypergraph(size_t index, const FeatureFunction* ff
    }
    return index+numScoreComps;
  } else {
-    cerr << "Sparse features are not yet supported when outputting hypergraph format" << endl;
-    assert(false);
-    return 0;
+    UTIL_THROW2("Sparse features are not yet supported when outputting hypergraph format");
  }
 }

@ -644,7 +643,7 @@ int main(int argc, char** argv)
          boost::filesystem::path nbestPath(nbestFile);
          weightsFilename << nbestPath.parent_path().filename() << "/weights";
        } else {
-          weightsFilename << boost::filesystem::current_path() << "/hypergraph/weights";
+          weightsFilename << boost::filesystem::current_path().string() << "/hypergraph/weights";
        }
      }
      boost::filesystem::path weightsFilePath(weightsFilename.str());
--- a/moses/ChartHypothesis.h
+++ b/moses/ChartHypothesis.h
@ -45,6 +45,7 @@ typedef std::vector<ChartHypothesis*> ChartArcList;
 class ChartHypothesis
 {
  friend std::ostream& operator<<(std::ostream&, const ChartHypothesis&);
+  friend class ChartKBestExtractor;

 protected:
 #ifdef USE_HYPO_POOL
@ -75,6 +76,9 @@ protected:
  //! not implemented
  ChartHypothesis(const ChartHypothesis &copy);

+  //! only used by ChartKBestExtractor
+  ChartHypothesis(const ChartHypothesis &, const ChartKBestExtractor &);
+
 public:
 #ifdef USE_HYPO_POOL
  void *operator new(size_t /* num_bytes */) {
@ -93,9 +97,6 @@ public:
  }
 #endif

-  //! only used by ChartKBestExtractor
-  ChartHypothesis(const ChartHypothesis &, const ChartKBestExtractor &);
-
  ChartHypothesis(const ChartTranslationOptions &, const RuleCubeItem &item,
                  ChartManager &manager);

--- a/moses/ChartKBestExtractor.cpp
+++ b/moses/ChartKBestExtractor.cpp
@ -32,52 +32,48 @@ namespace Moses

 // Extract the k-best list from the search graph.
 void ChartKBestExtractor::Extract(
-    const std::vector<const ChartHypothesis*> &topHypos, std::size_t k,
+    const std::vector<const ChartHypothesis*> &topLevelHypos, std::size_t k,
    KBestVec &kBestList)
 {
-  typedef std::vector<const ChartHypothesis*> HypoVec;
-
  kBestList.clear();
-  if (topHypos.empty()) {
+  if (topLevelHypos.empty()) {
    return;
  }

-  // Create a new top-level ChartHypothesis that has the best hypothesis as its
-  // predecessor.  This is the search hypergraph's target vertex.
-  HypoVec::const_iterator iter = topHypos.begin();
+  // Create a new ChartHypothesis object, supremeHypo, that has the best
+  // top-level hypothesis as its predecessor and has the same score.
+  std::vector<const ChartHypothesis*>::const_iterator p = topLevelHypos.begin();
+  const ChartHypothesis &bestTopLevelHypo = **p;
  boost::scoped_ptr<ChartHypothesis> supremeHypo(
-    new ChartHypothesis(**iter, *this));
+      new ChartHypothesis(bestTopLevelHypo, *this));

  // Do the same for each alternative top-level hypothesis, but add the new
  // ChartHypothesis objects as arcs from supremeHypo, as if they had been
  // recombined.
-  float prevScore = (*iter)->GetTotalScore();
-  for (++iter; iter != topHypos.end(); ++iter) {
-    // Check that the first item in topHypos really was the best.
-    UTIL_THROW_IF2((*iter)->GetTotalScore() <= prevScore,
-                   "top-level vertices are not correctly sorted");
+  for (++p; p != topLevelHypos.end(); ++p) {
+    // Check that the first item in topLevelHypos really was the best.
+    UTIL_THROW_IF2((*p)->GetTotalScore() <= bestTopLevelHypo.GetTotalScore(),
+                   "top-level hypotheses are not correctly sorted");
    // Note: there's no need for a smart pointer here: supremeHypo will take
    // ownership of altHypo.
-    ChartHypothesis *altHypo = new ChartHypothesis(**iter, *this);
+    ChartHypothesis *altHypo = new ChartHypothesis(**p, *this);
    supremeHypo->AddArc(altHypo);
  }

-  // Create the target vertex corresponding to supremeHypo then generate
-  // it's k-best list.
-  boost::shared_ptr<Vertex> top = FindOrCreateVertex(*supremeHypo);
-  LazyKthBest(*top, k, k);
+  // Create the target vertex then lazily fill its k-best list.
+  boost::shared_ptr<Vertex> targetVertex = FindOrCreateVertex(*supremeHypo);
+  LazyKthBest(*targetVertex, k, k);

  // Copy the k-best list from the target vertex, but drop the top edge from
  // each derivation.
-  kBestList.reserve(top->kBestList.size());
-  for (KBestVec::const_iterator p = top->kBestList.begin();
-       p != top->kBestList.end(); ++p) {
-    const Derivation &d = **p;
-    assert(d.edge.tail.size() == 1);  // d should have exactly one predecessor.
-    assert(d.backPointers.size() == 1);
-    std::size_t i = d.backPointers[0];
-    boost::shared_ptr<Derivation> pred = d.edge.tail[0]->kBestList[i];
-    kBestList.push_back(pred);
+  kBestList.reserve(targetVertex->kBestList.size());
+  for (std::vector<boost::weak_ptr<Derivation> >::const_iterator
+        q = targetVertex->kBestList.begin();
+        q != targetVertex->kBestList.end(); ++q) {
+    const boost::shared_ptr<Derivation> d(*q);
+    assert(d);
+    assert(d->subderivations.size() == 1);
+    kBestList.push_back(d->subderivations[0]);
  }
 }

@ -96,8 +92,7 @@ Phrase ChartKBestExtractor::GetOutputPhrase(const Derivation &d)
    const Word &word = phrase.GetWord(pos);
    if (word.IsNonTerminal()) {
      std::size_t nonTermInd = nonTermIndexMap[pos];
-      const Derivation &subderivation =
-        *d.edge.tail[nonTermInd]->kBestList[d.backPointers[nonTermInd]];
+      const Derivation &subderivation = *d.subderivations[nonTermInd];
      Phrase subPhrase = GetOutputPhrase(subderivation);
      ret.Append(subPhrase);
    } else {
@ -142,26 +137,6 @@ ChartKBestExtractor::UnweightedHyperarc ChartKBestExtractor::CreateEdge(
  return edge;
 }

-void ChartKBestExtractor::GetCandidates(Vertex &v, std::size_t k)
-{
-  // Create a derivation for v's best incoming edge.
-  UnweightedHyperarc bestEdge = CreateEdge(v.hypothesis);
-  boost::shared_ptr<Derivation> d(new Derivation(bestEdge));
-  v.candidates.push(d);
-  v.seen.insert(d);
-  // Create derivations for the rest of v's incoming edges.
-  const ChartArcList *arcList = v.hypothesis.GetArcList();
-  if (arcList) {
-    for (std::size_t i = 0; i < arcList->size(); ++i) {
-      const ChartHypothesis &recombinedHypo = *(*arcList)[i];
-      UnweightedHyperarc edge = CreateEdge(recombinedHypo);
-      boost::shared_ptr<Derivation> d(new Derivation(edge));
-      v.candidates.push(d);
-      v.seen.insert(d);
-    }
-  }
-}
-
 // Look for the vertex corresponding to a given ChartHypothesis, creating
 // a new one if necessary.
 boost::shared_ptr<ChartKBestExtractor::Vertex>
@ -174,66 +149,110 @@ ChartKBestExtractor::FindOrCreateVertex(const ChartHypothesis &h)
    return sp;  // Vertex was already in m_vertexMap.
  }
  sp.reset(new Vertex(h));
+  // Create the 1-best derivation and add it to the vertex's kBestList.
+  UnweightedHyperarc bestEdge;
+  bestEdge.head = sp;
+  const std::vector<const ChartHypothesis*> &prevHypos = h.GetPrevHypos();
+  bestEdge.tail.resize(prevHypos.size());
+  for (std::size_t i = 0; i < prevHypos.size(); ++i) {
+    const ChartHypothesis *prevHypo = prevHypos[i];
+    bestEdge.tail[i] = FindOrCreateVertex(*prevHypo);
+  }
+  boost::shared_ptr<Derivation> bestDerivation(new Derivation(bestEdge));
+  std::pair<DerivationSet::iterator, bool> q =
+    m_derivations.insert(bestDerivation);
+  assert(q.second);
+  sp->kBestList.push_back(bestDerivation);
  return sp;
 }

+// Create the 1-best derivation for each edge in BS(v) (except the best one)
+// and add it to v's candidate queue.
+void ChartKBestExtractor::GetCandidates(Vertex &v, std::size_t k)
+{
+  // Create derivations for all of v's incoming edges except the best.  This
+  // means everything in v.hypothesis.GetArcList() and not the edge defined
+  // by v.hypothesis itself.  The 1-best derivation for that edge will already
+  // have been created.
+  const ChartArcList *arcList = v.hypothesis.GetArcList();
+  if (arcList) {
+    for (std::size_t i = 0; i < arcList->size(); ++i) {
+      const ChartHypothesis &recombinedHypo = *(*arcList)[i];
+      boost::shared_ptr<Vertex> w = FindOrCreateVertex(recombinedHypo);
+      assert(w->kBestList.size() == 1);
+      v.candidates.push(w->kBestList[0]);
+    }
+  }
+}
+
+// Lazily fill v's k-best list.
 void ChartKBestExtractor::LazyKthBest(Vertex &v, std::size_t k,
                                      std::size_t globalK)
 {
  // If this is the first visit to vertex v then initialize the priority queue.
  if (v.visited == false) {
+    // The 1-best derivation should already be in v's k-best list.
+    assert(v.kBestList.size() == 1);
+    // Initialize v's priority queue.
    GetCandidates(v, globalK);
    v.visited = true;
  }
  // Add derivations to the k-best list until it contains k or there are none
  // left to add.
  while (v.kBestList.size() < k) {
-    if (!v.kBestList.empty()) {
-      // Update the priority queue by adding the successors of the last
-      // derivation (unless they've been seen before).
-      const Derivation &d = *v.kBestList.back();
-      LazyNext(v, d, globalK);
-    }
+    assert(!v.kBestList.empty());
+    // Update the priority queue by adding the successors of the last
+    // derivation (unless they've been seen before).
+    boost::shared_ptr<Derivation> d(v.kBestList.back());
+    LazyNext(v, *d, globalK);
    // Check if there are any derivations left in the queue.
    if (v.candidates.empty()) {
      break;
    }
    // Get the next best derivation and delete it from the queue.
-    boost::shared_ptr<Derivation> d = v.candidates.top();
+    boost::weak_ptr<Derivation> next = v.candidates.top();
    v.candidates.pop();
    // Add it to the k-best list.
-    v.kBestList.push_back(d);
+    v.kBestList.push_back(next);
  }
 }

+// Create the neighbours of Derivation d and add them to v's candidate queue.
 void ChartKBestExtractor::LazyNext(Vertex &v, const Derivation &d,
                                   std::size_t globalK)
 {
-  // Create the neighbours of Derivation d.
-  for (std::size_t i = 0; i < d.backPointers.size(); ++i) {
-    Vertex &predVertex = *d.edge.tail[i];
-    // Ensure that predVertex's k-best list contains enough derivations.
+  for (std::size_t i = 0; i < d.edge.tail.size(); ++i) {
+    Vertex &pred = *d.edge.tail[i];
+    // Ensure that pred's k-best list contains enough derivations.
    std::size_t k = d.backPointers[i] + 2;
-    LazyKthBest(predVertex, k, globalK);
-    if (predVertex.kBestList.size() < k) {
-      // predVertex's derivations have been exhausted.
+    LazyKthBest(pred, k, globalK);
+    if (pred.kBestList.size() < k) {
+      // pred's derivations have been exhausted.
      continue;
    }
    // Create the neighbour.
    boost::shared_ptr<Derivation> next(new Derivation(d, i));
    // Check if it has been created before.
-    std::pair<Vertex::DerivationSet::iterator, bool> p = v.seen.insert(next);
+    std::pair<DerivationSet::iterator, bool> p = m_derivations.insert(next);
    if (p.second) {
      v.candidates.push(next);  // Haven't previously seen it.
    }
  }
 }

-// Construct a Derivation corresponding to a ChartHypothesis.
+// Construct the 1-best Derivation that ends at edge e.
 ChartKBestExtractor::Derivation::Derivation(const UnweightedHyperarc &e)
 {
  edge = e;
-  backPointers.resize(edge.tail.size(), 0);
+  std::size_t arity = edge.tail.size();
+  backPointers.resize(arity, 0);
+  subderivations.reserve(arity);
+  for (std::size_t i = 0; i < arity; ++i) {
+    const Vertex &pred = *edge.tail[i];
+    assert(pred.kBestList.size() >= 1);
+    boost::shared_ptr<Derivation> sub(pred.kBestList[0]);
+    subderivations.push_back(sub);
+  }
  scoreBreakdown = edge.head->hypothesis.GetScoreBreakdown();
  score = edge.head->hypothesis.GetTotalScore();
 }
@ -244,14 +263,16 @@ ChartKBestExtractor::Derivation::Derivation(const Derivation &d, std::size_t i)
  edge.head = d.edge.head;
  edge.tail = d.edge.tail;
  backPointers = d.backPointers;
+  subderivations = d.subderivations;
  std::size_t j = ++backPointers[i];
  scoreBreakdown = d.scoreBreakdown;
  // Deduct the score of the old subderivation.
-  const Derivation &oldSubderivation = *(edge.tail[i]->kBestList[j-1]);
-  scoreBreakdown.MinusEquals(oldSubderivation.scoreBreakdown);
+  scoreBreakdown.MinusEquals(subderivations[i]->scoreBreakdown);
+  // Update the subderivation pointer.
+  boost::shared_ptr<Derivation> newSub(edge.tail[i]->kBestList[j]);
+  subderivations[i] = newSub;
  // Add the score of the new subderivation.
-  const Derivation &newSubderivation = *(edge.tail[i]->kBestList[j]);
-  scoreBreakdown.PlusEquals(newSubderivation.scoreBreakdown);
+  scoreBreakdown.PlusEquals(subderivations[i]->scoreBreakdown);
  score = scoreBreakdown.GetWeightedScore();
 }

--- a/moses/ChartKBestExtractor.h
+++ b/moses/ChartKBestExtractor.h
@ -24,6 +24,7 @@
 #include "ScoreComponentCollection.h"

 #include <boost/unordered_set.hpp>
+#include <boost/weak_ptr.hpp>

 #include <queue>
 #include <vector>
@ -53,17 +54,46 @@ public:

    UnweightedHyperarc edge;
    std::vector<std::size_t> backPointers;
+    std::vector<boost::shared_ptr<Derivation> > subderivations;
    ScoreComponentCollection scoreBreakdown;
    float score;
  };

  struct DerivationOrderer {
-    bool operator()(const boost::shared_ptr<Derivation> &d1,
-                    const boost::shared_ptr<Derivation> &d2) const {
-      return d1->score < d2->score;
+    bool operator()(const boost::weak_ptr<Derivation> &d1,
+                    const boost::weak_ptr<Derivation> &d2) const {
+      boost::shared_ptr<Derivation> s1(d1);
+      boost::shared_ptr<Derivation> s2(d2);
+      return s1->score < s2->score;
    }
  };

+  struct Vertex {
+    typedef std::priority_queue<boost::weak_ptr<Derivation>,
+                                std::vector<boost::weak_ptr<Derivation> >,
+                                DerivationOrderer> DerivationQueue;
+
+    Vertex(const ChartHypothesis &h) : hypothesis(h), visited(false) {}
+
+    const ChartHypothesis &hypothesis;
+    std::vector<boost::weak_ptr<Derivation> > kBestList;
+    DerivationQueue candidates;
+    bool visited;
+  };
+
+  typedef std::vector<boost::shared_ptr<Derivation> > KBestVec;
+
+  // Extract the k-best list from the search hypergraph given the full, sorted
+  // list of top-level vertices.
+  void Extract(const std::vector<const ChartHypothesis*> &topHypos,
+               std::size_t k, KBestVec &);
+
+  static Phrase GetOutputPhrase(const Derivation &);
+
+private:
+  typedef boost::unordered_map<const ChartHypothesis *,
+                               boost::shared_ptr<Vertex> > VertexMap;
+
  struct DerivationHasher {
    std::size_t operator()(const boost::shared_ptr<Derivation> &d) const {
      std::size_t seed = 0;
@ -83,36 +113,8 @@ public:
    }
  };

-  struct Vertex {
-    typedef std::priority_queue<boost::shared_ptr<Derivation>,
-                                std::vector<boost::shared_ptr<Derivation> >,
-                                DerivationOrderer> DerivationQueue;
-
-    typedef boost::unordered_set<boost::shared_ptr<Derivation>,
-                                 DerivationHasher,
-                                 DerivationEqualityPred> DerivationSet;
-
-    Vertex(const ChartHypothesis &h) : hypothesis(h), visited(false) {}
-
-    const ChartHypothesis &hypothesis;
-    std::vector<boost::shared_ptr<Derivation> > kBestList;
-    DerivationQueue candidates;
-    DerivationSet seen;
-    bool visited;
-  };
-
-  typedef std::vector<boost::shared_ptr<Derivation> > KBestVec;
-
-  // Extract the k-best list from the search hypergraph given the full, sorted
-  // list of top-level vertices.
-  void Extract(const std::vector<const ChartHypothesis*> &topHypos,
-               std::size_t k, KBestVec &);
-
-  static Phrase GetOutputPhrase(const Derivation &);
-
-private:
-  typedef boost::unordered_map<const ChartHypothesis *,
-                               boost::shared_ptr<Vertex> > VertexMap;
+  typedef boost::unordered_set<boost::shared_ptr<Derivation>, DerivationHasher,
+                               DerivationEqualityPred> DerivationSet;

  UnweightedHyperarc CreateEdge(const ChartHypothesis &);
  boost::shared_ptr<Vertex> FindOrCreateVertex(const ChartHypothesis &);
@ -121,6 +123,7 @@ private:
  void LazyNext(Vertex &, const Derivation &, std::size_t);

  VertexMap m_vertexMap;
+  DerivationSet m_derivations;
 };

 }  // namespace Moses
--- a/moses/ChartManager.cpp
+++ b/moses/ChartManager.cpp
@ -23,6 +23,7 @@
 #include "ChartManager.h"
 #include "ChartCell.h"
 #include "ChartHypothesis.h"
+#include "ChartKBestExtractor.h"
 #include "ChartTranslationOptions.h"
 #include "ChartTrellisDetourQueue.h"
 #include "ChartTrellisNode.h"
@ -261,6 +262,65 @@ void ChartManager::CalcNBest(size_t count, ChartTrellisPathList &ret,bool onlyDi
  }
 }

+/** Extract an n-best list of derivations from the output hypergraph.
+ * \param n number of derivations requested
+ * \param nBestList output argument; cleared on entry, then filled with the
+ *        selected derivations
+ * \param onlyDistinct if true, a derivation is skipped when its output phrase
+ *        has already been produced by an earlier derivation in the list
+ */
+void ChartManager::CalcNBest(
+    std::size_t n,
+    std::vector<boost::shared_ptr<ChartKBestExtractor::Derivation> > &nBestList,
+    bool onlyDistinct) const
+{
+  nBestList.clear();
+
+  // Nothing to extract for an empty request or an empty source.
+  if (n == 0) {
+    return;
+  }
+  const std::size_t sourceSize = m_source.GetSize();
+  if (sourceSize == 0) {
+    return;
+  }
+
+  // Fetch the sorted hypotheses of the cell that spans the whole source.
+  // The scoped_ptr takes ownership of the returned vector.
+  WordsRange fullSpan(0, sourceSize-1);
+  const ChartCell &topCell = m_hypoStackColl.Get(fullSpan);
+  boost::scoped_ptr<const std::vector<const ChartHypothesis*> > sortedHypos(
+      topCell.GetAllSortedHypotheses());
+  if (!sortedHypos) {
+    return;
+  }
+
+  ChartKBestExtractor extractor;
+
+  if (onlyDistinct) {
+    // Over-generate so that enough derivations remain once repeated
+    // translations have been dropped.  The n-best factor is the
+    // over-generation multiplier; a factor of 0 means 'unlimited', which is
+    // approximated here by a multiplier of 1000.
+    const std::size_t nBestFactor = StaticData::Instance().GetNBestFactor();
+    const std::size_t multiplier = (nBestFactor == 0) ? 1000 : nBestFactor;
+    const std::size_t numCandidates = n * multiplier;
+
+    ChartKBestExtractor::KBestVec candidates;
+    candidates.reserve(numCandidates);
+    extractor.Extract(*sortedHypos, numCandidates, candidates);
+
+    // Walk the candidates in order, keeping the first derivation seen for
+    // each distinct translation, until n have been collected.
+    std::set<Phrase> seenTranslations;
+    ChartKBestExtractor::KBestVec::const_iterator it = candidates.begin();
+    while (nBestList.size() < n && it != candidates.end()) {
+      const Phrase output = ChartKBestExtractor::GetOutputPhrase(**it);
+      if (seenTranslations.insert(output).second) {
+        nBestList.push_back(*it);
+      }
+      ++it;
+    }
+  } else {
+    // Duplicate translations are acceptable: extract straight into the
+    // caller's list.
+    extractor.Extract(*sortedHypos, n, nBestList);
+  }
+}
+
+
 void ChartManager::GetSearchGraph(long translationId, std::ostream &outputSearchGraphStream) const
 {
  size_t size = m_source.GetSize();
--- a/moses/ChartManager.h
+++ b/moses/ChartManager.h
@ -30,6 +30,7 @@
 #include "SentenceStats.h"
 #include "ChartTranslationOptionList.h"
 #include "ChartParser.h"
+#include "ChartKBestExtractor.h"

 #include <boost/shared_ptr.hpp>

@ -71,6 +72,7 @@ public:
  void AddXmlChartOptions();
  const ChartHypothesis *GetBestHypothesis() const;
  void CalcNBest(size_t count, ChartTrellisPathList &ret, bool onlyDistinct=0) const;
+  void CalcNBest(size_t n, std::vector<boost::shared_ptr<ChartKBestExtractor::Derivation> > &nBestList, bool onlyDistinct=false) const; /* k-best extractor variant: fills nBestList with derivations rather than trellis paths */

  void GetSearchGraph(long translationId, std::ostream &outputSearchGraphStream) const;
  void FindReachableHypotheses( const ChartHypothesis *hypo, std::map<unsigned,bool> &reachable ) const; /* auxilliary function for GetSearchGraph */
--- a/moses/Parameter.cpp
+++ b/moses/Parameter.cpp
@ -102,8 +102,8 @@ Parameter::Parameter()
  AddParam("output-search-graph", "osg", "Output connected hypotheses of search into specified filename");
  AddParam("output-search-graph-extended", "osgx", "Output connected hypotheses of search into specified filename, in extended format");
  AddParam("unpruned-search-graph", "usg", "When outputting chart search graph, do not exclude dead ends. Note: stack pruning may have eliminated some hypotheses");
-  AddParam("output-search-graph-slf", "slf", "Output connected hypotheses of search into specified directory, one file per sentence, in HTK standard lattice format (SLF)");
-  AddParam("output-search-graph-hypergraph", "Output connected hypotheses of search into specified directory, one file per sentence, in a hypergraph format (see Kenneth Heafield's lazy hypergraph decoder)");
+  AddParam("output-search-graph-slf", "slf", "Output connected hypotheses of search into specified directory, one file per sentence, in HTK standard lattice format (SLF) - the flag should be followed by a directory name, which must exist");
+  AddParam("output-search-graph-hypergraph", "Output connected hypotheses of search into specified directory, one file per sentence, in a hypergraph format (see Kenneth Heafield's lazy hypergraph decoder). This flag is followed by 3 values: 'true (gz|txt|bz) directory-name'");
  AddParam("include-lhs-in-search-graph", "lhssg", "When outputting chart search graph, include the label of the LHS of the rule (useful when using syntax)");
 #ifdef HAVE_PROTOBUF
  AddParam("output-search-graph-pb", "pb", "Write phrase lattice to protocol buffer objects in the specified path.");
--- a/scripts/other/delete-scores.perl
+++ b/scripts/other/delete-scores.perl
@ -0,0 +1,61 @@
+#!/usr/bin/perl 
+
+use strict;
+use Getopt::Long "GetOptions";
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+
+sub trim($);
+sub DeleteScore;
+
+# Comma-separated, zero-based indices of the scores to keep, in output order.
+my $keepScoresStr;
+GetOptions(
+  "keep-scores=s" => \$keepScoresStr
+) or exit(1);
+
+# Without --keep-scores the list is empty and every score is dropped.
+my @keepScores = defined($keepScoresStr) ? split(/,/, $keepScoresStr) : ();
+
+#MAIN LOOP
+# Lines are split on single '|' characters, so in a line of the form
+# "a ||| b ||| scores ||| ..." the score field is element 6.
+while (my $line = <STDIN>) {
+  chomp($line);
+
+  # A limit of -1 keeps trailing empty fields, so a line that ends in "|||"
+  # is written back with its final delimiters intact.
+  my @toks = split(/\|/, $line, -1);
+
+  # Lines that have no score field are passed through unchanged instead of
+  # being padded with extra separators.
+  if (scalar(@toks) > 6) {
+    $toks[6] = DeleteScore($toks[6], \@keepScores);
+  }
+
+  # output
+  print join("|", @toks) . "\n";
+}
+
+######################
+# Return a copy of the argument with leading and trailing whitespace removed.
+sub trim($) {
+  my ($text) = @_;
+  # One pass: strip a whitespace run anchored at either end of the string.
+  $text =~ s/^\s+|\s+$//g;
+  return $text;
+}
+
+# Build the replacement score field from the scores selected by index.
+#   $_[0]  whitespace-separated score string (surrounding whitespace ignored)
+#   $_[1]  reference to an array of zero-based score indices to keep
+# Returns the kept scores in the order the indices are listed, each followed
+# by one space and the whole string preceded by one space, e.g. " 0.3 0.1 ".
+sub DeleteScore
+{
+  my ($scoreField, $keepRef) = @_;
+
+  # split(' ', ...) skips leading whitespace and splits on any whitespace run,
+  # so tabs or doubled spaces between scores cannot shift the indices.
+  my @scores = defined($scoreField) ? split(' ', $scoreField) : ();
+
+  my $kept = " ";
+  foreach my $index (@{$keepRef}) {
+    # An index past the last score contributes an empty entry.
+    my $score = $scores[$index];
+    $kept .= (defined($score) ? $score : "") . " ";
+  }
+
+  return $kept;
+}