merge remaining changes to mira, word pair features, phrase pair features

This commit is contained in:
Eva Hasler 2012-10-03 18:53:55 +01:00
commit e7e4dbd405
910 changed files with 79145 additions and 17577 deletions

4
.gitignore vendored
View File

@ -61,3 +61,7 @@ scripts/training/train-model.perl
dist
bin
previous.sh
contrib/other-builds/*.xcodeproj/project.xcworkspace/
contrib/other-builds/*.xcodeproj/xcuserdata/
*/*.xcodeproj/project.xcworkspace
*/*.xcodeproj/xcuserdata

3
.gitmodules vendored Normal file
View File

@ -0,0 +1,3 @@
[submodule "regression-testing/tests"]
path = regression-testing/tests
url = ../moses-regression-tests.git

View File

@ -8,7 +8,7 @@ available at http://boost.org .
There are several optional dependencies:
GIZA++ from http://code.google.com/p/giza-pp/ is used to build phrase tables.
GIZA++ from http://code.google.com/p/giza-pp/ is used to align words in the parallel corpus during training.
Moses server requires xmlrpc-c with abyss-server. Source is available from
http://xmlrpc-c.sourceforge.net/.
@ -85,7 +85,6 @@ Building consists of running
Common options are:
--with-srilm=/path/to/srilm to compile the decoder with SRILM support
--with-irstlm=/path/to/irstlm to compile the decoder with IRSTLM support
--with-giza=/path/to/giza to enable training scripts
-jN where N is the number of CPUs
--with-macports=/path/to/macports use MacPorts on Mac OS X.

43
Jamroot
View File

@ -15,9 +15,8 @@
#Note that, like language models, this is the --prefix where the library was
#installed, not some executable within the library.
#
#--with-giza=/path/to/giza
#Indicates where binaries GIZA++, snt2cooc.out, and mkcls live.
#Builds scripts/training/train-model.perl using these paths.
#Compact phrase table and compact lexical reordering table
#--with-cmph=/path/to/cmph
#
#Thread-caching malloc (optional):
#--with-tcmalloc
@ -25,14 +24,14 @@
#REGRESSION TESTING
#--with-regtest=/path/to/moses-reg-test-data
#
#
#INSTALLATION
#--prefix=/path/to/prefix sets the install prefix [dist].
#--prefix=/path/to/prefix sets the install prefix [default is source root].
#--bindir=/path/to/prefix/bin sets the bin directory [PREFIX/bin]
#--libdir=/path/to/prefix/lib sets the lib directory [PREFIX/lib]
#--includedir=/path/to/prefix/include installs headers.
# Does not install if missing. No argument defaults to PREFIX/include .
#--install-scripts=/path/to/scripts copies scripts into a directory.
# Does not install if missing. No argument defaults to PREFIX/scripts .
#--git appends the git revision to the prefix directory.
#
#
@ -45,7 +44,9 @@
# variant=release|debug|profile builds optimized (default), for debug, or for
# profiling
#
# link=static|shared controls linking (default static)
# link=static|shared controls preferred linking (default static)
# --static forces static linking (the default will fall
# back to shared)
#
# debug-symbols=on|off include (default) or exclude debugging
# information also known as -g
@ -54,6 +55,9 @@
# --enable-boost-pool uses Boost pools for the memory SCFG table
#
# --enable-mpi switch on mpi
# --without-libsegfault does not link with libSegFault
#
# --max-kenlm-order maximum ngram order that kenlm can process (default 6)
#
#CONTROLLING THE BUILD
#-a to build from scratch
@ -88,6 +92,10 @@ if [ option.get "enable-mpi" : : "yes" ] {
requirements += [ option.get "notrace" : <define>TRACE_ENABLE=1 ] ;
requirements += [ option.get "enable-boost-pool" : : <define>USE_BOOST_POOL ] ;
if [ option.get "with-cmph" ] {
requirements += <define>HAVE_CMPH ;
}
project : default-build
<threading>multi
<warnings>on
@ -104,23 +112,12 @@ project : requirements
;
#Add directories here if you want their incidental targets too (i.e. tests).
build-project lm ;
build-project util ;
#Trigger installation into legacy paths.
build-project mert ;
build-project moses-cmd/src ;
build-project moses-chart-cmd/src ;
build-project mira ;
build-project moses/src ;
#Scripts have their own binaries.
build-project scripts ;
#Regression tests (only does anything if --with-regtest is passed)
build-project regression-testing ;
build-projects util lm mert moses-cmd/src moses-chart-cmd/src mira scripts regression-testing ;
alias programs : lm//query lm//build_binary moses-chart-cmd/src//moses_chart moses-cmd/src//programs OnDiskPt//CreateOnDisk OnDiskPt//queryOnDiskPt mert//programs contrib/server//mosesserver misc//programs mira//programs ;
alias programs : lm//query lm//build_binary lm//kenlm_max_order moses-chart-cmd/src//moses_chart moses-cmd/src//programs OnDiskPt//CreateOnDiskPt OnDiskPt//queryOnDiskPt mert//programs contrib/server//mosesserver misc//programs mira//programs symal phrase-extract phrase-extract//lexical-reordering phrase-extract//extract-ghkm phrase-extract//pcfg-extract phrase-extract//pcfg-score biconcor ;
install-bin-libs programs ;
install-headers headers-base : [ glob-tree *.h *.hh : jam-files dist kenlm moses ] : . ;
install-headers headers-base : [ path.glob-tree biconcor contrib lm mert misc moses-chart-cmd moses-cmd OnDiskPt phrase-extract symal util : *.hh *.h ] : . ;
install-headers headers-moses : moses/src//headers-to-install : moses/src ;
alias install : prefix-bin prefix-lib headers-base headers-moses ;
@ -128,3 +125,9 @@ alias install : prefix-bin prefix-lib headers-base headers-moses ;
if ! [ option.get "includedir" : : $(prefix)/include ] {
explicit install headers-base headers-moses ;
}
if [ path.exists $(TOP)/dist ] && $(prefix) != dist {
echo "You have a $(TOP)/dist directory, but the build system now places files directly in the root i.e. $(TOP)/bin ." ;
echo "To disable this message, delete $(TOP)/dist ." ;
echo ;
}

3
NOTICE Normal file
View File

@ -0,0 +1,3 @@
This code includes data from Daniel Naber's Language Tools (czech abbreviations).
This code includes data from czech wiktionary (also czech abbreviations).

View File

@ -1,5 +1,5 @@
lib OnDiskPt : OnDiskWrapper.cpp SourcePhrase.cpp TargetPhrase.cpp Word.cpp Phrase.cpp PhraseNode.cpp TargetPhraseCollection.cpp Vocab.cpp ../moses/src//headers ;
exe CreateOnDisk : Main.cpp ../moses/src//moses OnDiskPt ;
exe CreateOnDiskPt : Main.cpp ../moses/src//moses OnDiskPt ;
exe queryOnDiskPt : queryOnDiskPt.cpp ../moses/src//moses OnDiskPt ;

View File

@ -77,7 +77,7 @@ int main (int argc, char * const argv[])
std::vector<float> misc(1);
SourcePhrase sourcePhrase;
TargetPhrase *targetPhrase = new TargetPhrase(numScores);
OnDiskPt::Phrase *spShort = Tokenize(sourcePhrase, *targetPhrase, line, onDiskWrapper, numScores, misc);
OnDiskPt::PhrasePtr spShort = Tokenize(sourcePhrase, *targetPhrase, line, onDiskWrapper, numScores, misc);
assert(misc.size() == onDiskWrapper.GetNumCounts());
rootNode.AddTargetPhrase(sourcePhrase, targetPhrase, onDiskWrapper, tableLimit, misc, spShort);
@ -105,7 +105,7 @@ bool Flush(const OnDiskPt::SourcePhrase *prevSourcePhrase, const OnDiskPt::Sourc
return ret;
}
OnDiskPt::Phrase *Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhrase, char *line, OnDiskWrapper &onDiskWrapper, int numScores, vector<float> &misc)
OnDiskPt::PhrasePtr Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhrase, char *line, OnDiskWrapper &onDiskWrapper, int numScores, vector<float> &misc)
{
size_t scoreInd = 0;
@ -118,14 +118,14 @@ OnDiskPt::Phrase *Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhras
4 = count
*/
char *tok = strtok (line," ");
OnDiskPt::Phrase *out = new Phrase();
OnDiskPt::PhrasePtr out(new Phrase());
while (tok != NULL) {
if (0 == strcmp(tok, "|||")) {
++stage;
} else {
switch (stage) {
case 0: {
Word *w = Tokenize(sourcePhrase, tok, true, true, onDiskWrapper);
WordPtr w = Tokenize(sourcePhrase, tok, true, true, onDiskWrapper);
if (w != NULL)
out->AddWord(w);
@ -184,7 +184,7 @@ OnDiskPt::Phrase *Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhras
return out;
} // Tokenize()
OnDiskPt::Word *Tokenize(OnDiskPt::Phrase &phrase
OnDiskPt::WordPtr Tokenize(OnDiskPt::Phrase &phrase
, const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm
, OnDiskPt::OnDiskWrapper &onDiskWrapper)
{
@ -198,7 +198,7 @@ OnDiskPt::Word *Tokenize(OnDiskPt::Phrase &phrase
nonTerm = comStr == 0;
}
OnDiskPt::Word *out = NULL;
OnDiskPt::WordPtr out;
if (nonTerm) {
// non-term
size_t splitPos = token.find_first_of("[", 2);
@ -206,20 +206,20 @@ OnDiskPt::Word *Tokenize(OnDiskPt::Phrase &phrase
if (splitPos == string::npos) {
// lhs - only 1 word
Word *word = new Word();
WordPtr word(new Word());
word->CreateFromString(wordStr, onDiskWrapper.GetVocab());
phrase.AddWord(word);
} else {
// source & target non-terms
if (addSourceNonTerm) {
Word *word = new Word();
WordPtr word(new Word());
word->CreateFromString(wordStr, onDiskWrapper.GetVocab());
phrase.AddWord(word);
}
wordStr = token.substr(splitPos, tokSize - splitPos);
if (addTargetNonTerm) {
Word *word = new Word();
WordPtr word(new Word());
word->CreateFromString(wordStr, onDiskWrapper.GetVocab());
phrase.AddWord(word);
out = word;
@ -228,7 +228,7 @@ OnDiskPt::Word *Tokenize(OnDiskPt::Phrase &phrase
}
} else {
// term
Word *word = new Word();
WordPtr word(new Word());
word->CreateFromString(token, onDiskWrapper.GetVocab());
phrase.AddWord(word);
out = word;

View File

@ -25,10 +25,10 @@
typedef std::pair<size_t, size_t> AlignPair;
typedef std::vector<AlignPair> AlignType;
OnDiskPt::Word *Tokenize(OnDiskPt::Phrase &phrase
OnDiskPt::WordPtr Tokenize(OnDiskPt::Phrase &phrase
, const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm
, OnDiskPt::OnDiskWrapper &onDiskWrapper);
OnDiskPt::Phrase *Tokenize(OnDiskPt::SourcePhrase &sourcePhrase, OnDiskPt::TargetPhrase &targetPhrase
OnDiskPt::PhrasePtr Tokenize(OnDiskPt::SourcePhrase &sourcePhrase, OnDiskPt::TargetPhrase &targetPhrase
, char *line, OnDiskPt::OnDiskWrapper &onDiskWrapper
, int numScores
, std::vector<float> &misc);

View File

@ -163,7 +163,7 @@ void OnDiskWrapper::EndSave()
void OnDiskWrapper::SaveMisc()
{
m_fileMisc << "Version 3" << endl;
m_fileMisc << "Version 4" << endl;
m_fileMisc << "NumSourceFactors " << m_numSourceFactors << endl;
m_fileMisc << "NumTargetFactors " << m_numTargetFactors << endl;
m_fileMisc << "NumScores " << m_numScores << endl;
@ -172,12 +172,12 @@ void OnDiskWrapper::SaveMisc()
size_t OnDiskWrapper::GetSourceWordSize() const
{
return m_numSourceFactors * sizeof(UINT64) + sizeof(char);
return sizeof(UINT64) + sizeof(char);
}
size_t OnDiskWrapper::GetTargetWordSize() const
{
return m_numTargetFactors * sizeof(UINT64) + sizeof(char);
return sizeof(UINT64) + sizeof(char);
}
UINT64 OnDiskWrapper::GetMisc(const std::string &key) const
@ -199,32 +199,37 @@ Word *OnDiskWrapper::ConvertFromMoses(Moses::FactorDirection /* direction */
, const Moses::Word &origWord) const
{
bool isNonTerminal = origWord.IsNonTerminal();
Word *newWord = new Word(1, isNonTerminal); // TODO - num of factors
Word *newWord = new Word(isNonTerminal);
stringstream strme;
for (size_t ind = 0 ; ind < factorsVec.size() ; ++ind) {
size_t factorType = factorsVec[0];
const Moses::Factor *factor = origWord.GetFactor(factorType);
CHECK(factor);
string str = factor->GetString();
strme << str;
for (size_t ind = 1 ; ind < factorsVec.size() ; ++ind) {
size_t factorType = factorsVec[ind];
const Moses::Factor *factor = origWord.GetFactor(factorType);
if (factor == NULL)
{ // can have less factors than factorType.size()
break;
}
CHECK(factor);
string str = factor->GetString();
if (isNonTerminal) {
str = "[" + str + "]";
}
bool found;
UINT64 vocabId = m_vocab.GetVocabId(str, found);
if (!found) {
// factor not in phrase table -> phrase definitely not in. exit
delete newWord;
return NULL;
} else {
newWord->SetVocabId(ind, vocabId);
}
strme << "|" << str;
} // for (size_t factorType
return newWord;
bool found;
UINT64 vocabId = m_vocab.GetVocabId(strme.str(), found);
if (!found) {
// factor not in phrase table -> phrase definitely not in. exit
delete newWord;
return NULL;
} else {
newWord->SetVocabId(vocabId);
return newWord;
}
}

View File

@ -28,6 +28,10 @@ namespace OnDiskPt
{
const float DEFAULT_COUNT = 66666;
/** Global class with misc information needed to create and use the on-disk rule table.
* 1 object of this class should be instantiated per rule table.
* Currently only hierarchical/syntax models use this, but can & should be used with pb models too
*/
class OnDiskWrapper
{
protected:

View File

@ -27,27 +27,13 @@ using namespace std;
namespace OnDiskPt
{
Phrase::Phrase(const Phrase &copy)
:m_words(copy.GetSize())
{
for (size_t pos = 0; pos < copy.GetSize(); ++pos) {
const Word &oldWord = copy.GetWord(pos);
Word *newWord = new Word(oldWord);
m_words[pos] = newWord;
}
}
Phrase::~Phrase()
{
Moses::RemoveAllInColl(m_words);
}
void Phrase::AddWord(Word *word)
void Phrase::AddWord(WordPtr word)
{
m_words.push_back(word);
}
void Phrase::AddWord(Word *word, size_t pos)
void Phrase::AddWord(WordPtr word, size_t pos)
{
CHECK(pos < m_words.size());
m_words.insert(m_words.begin() + pos + 1, word);

View File

@ -20,27 +20,29 @@
***********************************************************************/
#include <vector>
#include <iostream>
#include <boost/shared_ptr.hpp>
#include "Word.h"
namespace OnDiskPt
{
class Vocab;
/** A contiguous phrase. SourcePhrase & TargetPhrase inherit from this and add the on-disk functionality
*/
class Phrase
{
friend std::ostream& operator<<(std::ostream&, const Phrase&);
protected:
std::vector<Word*> m_words;
std::vector<WordPtr> m_words;
public:
Phrase()
{}
Phrase(const Phrase &copy);
virtual ~Phrase();
void AddWord(Word *word);
void AddWord(Word *word, size_t pos);
void AddWord(WordPtr word);
void AddWord(WordPtr word, size_t pos);
const Word &GetWord(size_t pos) const {
return *m_words[pos];
@ -57,4 +59,6 @@ public:
bool operator==(const Phrase &compare) const;
};
typedef boost::shared_ptr<Phrase> PhrasePtr;
}

View File

@ -160,14 +160,14 @@ void PhraseNode::Save(OnDiskWrapper &onDiskWrapper, size_t pos, size_t tableLimi
void PhraseNode::AddTargetPhrase(const SourcePhrase &sourcePhrase, TargetPhrase *targetPhrase
, OnDiskWrapper &onDiskWrapper, size_t tableLimit
, const std::vector<float> &counts, OnDiskPt::Phrase *spShort)
, const std::vector<float> &counts, OnDiskPt::PhrasePtr spShort)
{
AddTargetPhrase(0, sourcePhrase, targetPhrase, onDiskWrapper, tableLimit, counts, spShort);
}
void PhraseNode::AddTargetPhrase(size_t pos, const SourcePhrase &sourcePhrase
, TargetPhrase *targetPhrase, OnDiskWrapper &onDiskWrapper
, size_t tableLimit, const std::vector<float> &counts, OnDiskPt::Phrase *spShort)
, size_t tableLimit, const std::vector<float> &counts, OnDiskPt::PhrasePtr spShort)
{
size_t phraseSize = sourcePhrase.GetSize();
if (pos < phraseSize) {
@ -228,20 +228,19 @@ void PhraseNode::GetChild(Word &wordFound, UINT64 &childFilePos, size_t ind, OnD
size_t wordSize = onDiskWrapper.GetSourceWordSize();
size_t childSize = wordSize + sizeof(UINT64);
size_t numFactors = onDiskWrapper.GetNumSourceFactors();
char *currMem = m_memLoad
+ sizeof(UINT64) * 2 // size & file pos of target phrase coll
+ sizeof(float) * onDiskWrapper.GetNumCounts() // count info
+ childSize * ind;
size_t memRead = ReadChild(wordFound, childFilePos, currMem, numFactors);
size_t memRead = ReadChild(wordFound, childFilePos, currMem);
CHECK(memRead == childSize);
}
size_t PhraseNode::ReadChild(Word &wordFound, UINT64 &childFilePos, const char *mem, size_t numFactors) const
size_t PhraseNode::ReadChild(Word &wordFound, UINT64 &childFilePos, const char *mem) const
{
size_t memRead = wordFound.ReadFromMemory(mem, numFactors);
size_t memRead = wordFound.ReadFromMemory(mem);
const char *currMem = mem + memRead;
UINT64 *memArray = (UINT64*) (currMem);

View File

@ -31,6 +31,7 @@ namespace OnDiskPt
class OnDiskWrapper;
class SourcePhrase;
/** A node in the source tree trie */
class PhraseNode
{
friend std::ostream& operator<<(std::ostream&, const PhraseNode&);
@ -51,8 +52,8 @@ protected:
void AddTargetPhrase(size_t pos, const SourcePhrase &sourcePhrase
, TargetPhrase *targetPhrase, OnDiskWrapper &onDiskWrapper
, size_t tableLimit, const std::vector<float> &counts, OnDiskPt::Phrase *spShort);
size_t ReadChild(Word &wordFound, UINT64 &childFilePos, const char *mem, size_t numFactors) const;
, size_t tableLimit, const std::vector<float> &counts, OnDiskPt::PhrasePtr spShort);
size_t ReadChild(Word &wordFound, UINT64 &childFilePos, const char *mem) const;
void GetChild(Word &wordFound, UINT64 &childFilePos, size_t ind, OnDiskWrapper &onDiskWrapper) const;
public:
@ -67,7 +68,7 @@ public:
void AddTargetPhrase(const SourcePhrase &sourcePhrase, TargetPhrase *targetPhrase
, OnDiskWrapper &onDiskWrapper, size_t tableLimit
, const std::vector<float> &counts, OnDiskPt::Phrase *spShort);
, const std::vector<float> &counts, OnDiskPt::PhrasePtr spShort);
UINT64 GetFilePos() const {
return m_filePos;

View File

@ -25,6 +25,8 @@
namespace OnDiskPt
{
/** A source phrase. Not an extension of a normal Phrase class because source phrases are saved as tries.
*/
class SourcePhrase: public Phrase
{
protected:
@ -32,4 +34,5 @@ protected:
public:
};
}

View File

@ -50,7 +50,7 @@ TargetPhrase::~TargetPhrase()
{
}
void TargetPhrase::SetLHS(Word *lhs)
void TargetPhrase::SetLHS(WordPtr lhs)
{
AddWord(lhs);
}
@ -99,7 +99,7 @@ char *TargetPhrase::WriteToMemory(OnDiskWrapper &onDiskWrapper, size_t &memUsed)
size_t phraseSize = GetSize();
size_t targetWordSize = onDiskWrapper.GetTargetWordSize();
const Phrase* sp = GetSourcePhrase();
const PhrasePtr sp = GetSourcePhrase();
size_t spSize = sp->GetSize();
size_t sourceWordSize = onDiskWrapper.GetSourceWordSize();
@ -240,9 +240,7 @@ Moses::TargetPhrase *TargetPhrase::ConvertToMoses(const std::vector<Moses::Facto
--phraseSize;
for (size_t pos = 0; pos < phraseSize; ++pos) {
Moses::Word *mosesWord = GetWord(pos).ConvertToMoses(Moses::Output, outputFactors, vocab);
ret->AddWord(*mosesWord);
delete mosesWord;
GetWord(pos).ConvertToMoses(outputFactors, vocab, ret->AddWord());
}
// scores
@ -252,7 +250,7 @@ Moses::TargetPhrase *TargetPhrase::ConvertToMoses(const std::vector<Moses::Facto
int indicator[m_align.size()];
int index = 0;
std::set<std::pair<size_t, size_t> > alignmentInfo;
const Phrase* sp = GetSourcePhrase();
const PhrasePtr sp = GetSourcePhrase();
for (size_t ind = 0; ind < m_align.size(); ++ind) {
const std::pair<size_t, size_t> &entry = m_align[ind];
alignmentInfo.insert(entry);
@ -261,18 +259,14 @@ Moses::TargetPhrase *TargetPhrase::ConvertToMoses(const std::vector<Moses::Facto
}
ret->SetAlignmentInfo(alignmentInfo, indicator);
Moses::Word *lhs = GetWord(GetSize() - 1).ConvertToMoses(Moses::Output, outputFactors, vocab);
ret->SetTargetLHS(*lhs);
delete lhs;
GetWord(GetSize() - 1).ConvertToMoses(outputFactors, vocab, ret->MutableTargetLHS());
// set source phrase
Moses::Phrase *mosesSP = new Moses::Phrase(Moses::Input);
Moses::Phrase mosesSP(Moses::Input);
for (size_t pos = 0; pos < sp->GetSize(); ++pos) {
Moses::Word *mosesWord = sp->GetWord(pos).ConvertToMoses(Moses::Input, inputFactors, vocab);
mosesSP->AddWord(*mosesWord);
delete mosesWord;
sp->GetWord(pos).ConvertToMoses(inputFactors, vocab, mosesSP.AddWord());
}
ret->SetSourcePhrase(*mosesSP);
ret->SetSourcePhrase(mosesSP);
return ret;
}
@ -295,7 +289,7 @@ UINT64 TargetPhrase::ReadOtherInfoFromFile(UINT64 filePos, std::fstream &fileTPC
return memUsed;
}
UINT64 TargetPhrase::ReadFromFile(std::fstream &fileTP, size_t numFactors, size_t numSourceFactors)
UINT64 TargetPhrase::ReadFromFile(std::fstream &fileTP)
{
UINT64 bytesRead = 0;
@ -306,8 +300,8 @@ UINT64 TargetPhrase::ReadFromFile(std::fstream &fileTP, size_t numFactors, size_
bytesRead += sizeof(UINT64);
for (size_t ind = 0; ind < numWords; ++ind) {
Word *word = new Word();
bytesRead += word->ReadFromFile(fileTP, numFactors);
WordPtr word(new Word());
bytesRead += word->ReadFromFile(fileTP);
AddWord(word);
}
@ -316,10 +310,10 @@ UINT64 TargetPhrase::ReadFromFile(std::fstream &fileTP, size_t numFactors, size_
fileTP.read((char*) &numSourceWords, sizeof(UINT64));
bytesRead += sizeof(UINT64);
SourcePhrase *sp = new SourcePhrase();
PhrasePtr sp(new SourcePhrase());
for (size_t ind = 0; ind < numSourceWords; ++ind) {
Word *word = new Word();
bytesRead += word->ReadFromFile(fileTP, numSourceFactors);
WordPtr word( new Word());
bytesRead += word->ReadFromFile(fileTP);
sp->AddWord(word);
}
SetSourcePhrase(sp);

View File

@ -43,12 +43,15 @@ typedef std::vector<AlignPair> AlignType;
class Vocab;
/** A target phrase, with the score breakdowns, alignment info and assorted other information it needs.
* Readable and writeable to disk
*/
class TargetPhrase: public Phrase
{
friend std::ostream& operator<<(std::ostream&, const TargetPhrase&);
protected:
AlignType m_align;
Phrase* m_sourcePhrase;
PhrasePtr m_sourcePhrase;
std::vector<float> m_scores;
UINT64 m_filePos;
@ -64,15 +67,14 @@ public:
TargetPhrase(const TargetPhrase &copy);
virtual ~TargetPhrase();
void SetSourcePhrase(Phrase *p) {
Phrase *copy = new Phrase(*p);
m_sourcePhrase = copy;
void SetSourcePhrase(PhrasePtr p) {
m_sourcePhrase = p;
}
const Phrase* GetSourcePhrase() const {
return m_sourcePhrase;
const PhrasePtr GetSourcePhrase() const {
return m_sourcePhrase;
}
void SetLHS(Word *lhs);
void SetLHS(WordPtr lhs);
void Create1AlignFromString(const std::string &align1Str);
void CreateAlignFromString(const std::string &align1Str);
@ -102,7 +104,7 @@ public:
, const Moses::WordPenaltyProducer* wpProducer
, const Moses::LMList &lmList) const;
UINT64 ReadOtherInfoFromFile(UINT64 filePos, std::fstream &fileTPColl);
UINT64 ReadFromFile(std::fstream &fileTP, size_t numFactors, size_t numSourceFactors);
UINT64 ReadFromFile(std::fstream &fileTP);
virtual void DebugPrint(std::ostream &out, const Vocab &vocab) const;

View File

@ -156,9 +156,8 @@ void TargetPhraseCollection::ReadFromFile(size_t tableLimit, UINT64 filePos, OnD
fstream &fileTP = onDiskWrapper.GetFileTargetInd();
size_t numScores = onDiskWrapper.GetNumScores();
size_t numTargetFactors = onDiskWrapper.GetNumTargetFactors();
size_t numSourceFactors = onDiskWrapper.GetNumSourceFactors();
UINT64 numPhrases;
UINT64 currFilePos = filePos;
@ -172,8 +171,9 @@ void TargetPhraseCollection::ReadFromFile(size_t tableLimit, UINT64 filePos, OnD
for (size_t ind = 0; ind < numPhrases; ++ind) {
TargetPhrase *tp = new TargetPhrase(numScores);
UINT64 sizeOtherInfo = tp->ReadOtherInfoFromFile(currFilePos, fileTPColl);
tp->ReadFromFile(fileTP, numTargetFactors, numSourceFactors);
tp->ReadFromFile(fileTP);
currFilePos += sizeOtherInfo;

View File

@ -33,6 +33,8 @@ class WordPenaltyProducer;
namespace OnDiskPt
{
/** A vector of target phrases
*/
class TargetPhraseCollection
{
class TargetPhraseOrderByScore

View File

@ -21,7 +21,6 @@
#include <fstream>
#include "OnDiskWrapper.h"
#include "Vocab.h"
#include "../moses/src/FactorCollection.h"
using namespace std;
@ -69,13 +68,13 @@ void Vocab::Save(OnDiskWrapper &onDiskWrapper)
}
}
UINT64 Vocab::AddVocabId(const std::string &factorString)
UINT64 Vocab::AddVocabId(const std::string &str)
{
// find string id
CollType::const_iterator iter = m_vocabColl.find(factorString);
CollType::const_iterator iter = m_vocabColl.find(str);
if (iter == m_vocabColl.end()) {
// add new vocab entry
m_vocabColl[factorString] = m_nextId;
m_vocabColl[str] = m_nextId;
return m_nextId++;
} else {
// return existing entry
@ -83,10 +82,10 @@ UINT64 Vocab::AddVocabId(const std::string &factorString)
}
}
UINT64 Vocab::GetVocabId(const std::string &factorString, bool &found) const
UINT64 Vocab::GetVocabId(const std::string &str, bool &found) const
{
// find string id
CollType::const_iterator iter = m_vocabColl.find(factorString);
CollType::const_iterator iter = m_vocabColl.find(str);
if (iter == m_vocabColl.end()) {
found = false;
return 0; //return whatever
@ -97,14 +96,4 @@ UINT64 Vocab::GetVocabId(const std::string &factorString, bool &found) const
}
}
const Moses::Factor *Vocab::GetFactor(UINT32 vocabId, Moses::FactorType factorType, Moses::FactorDirection direction, bool isNonTerminal) const
{
string str = GetString(vocabId);
if (isNonTerminal) {
str = str.substr(1, str.size() - 2);
}
const Moses::Factor *factor = Moses::FactorCollection::Instance().AddFactor(direction, factorType, str);
return factor;
}
}

View File

@ -22,16 +22,15 @@
#include <map>
#include "../moses/src/TypeDef.h"
namespace Moses
{
class Factor;
}
namespace OnDiskPt
{
class OnDiskWrapper;
/* A bidirectional map of string<->contiguous id
* No distinction between source and target language
*/
class Vocab
{
protected:
@ -45,9 +44,8 @@ public:
Vocab()
:m_nextId(1)
{}
UINT64 AddVocabId(const std::string &factorString);
UINT64 GetVocabId(const std::string &factorString, bool &found) const;
const Moses::Factor *GetFactor(UINT32 vocabId, Moses::FactorType factorType, Moses::FactorDirection direction, bool isNonTerminal) const;
UINT64 AddVocabId(const std::string &str);
UINT64 GetVocabId(const std::string &str, bool &found) const;
const std::string &GetString(UINT32 vocabId) const {
return m_lookup[vocabId];
}

View File

@ -18,10 +18,14 @@
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include "../moses/src/FactorCollection.h"
#include "../moses/src/Util.h"
#include "../moses/src/Word.h"
#include "Word.h"
#include "util/tokenize_piece.hh"
#include "util/exception.hh"
using namespace std;
namespace OnDiskPt
@ -29,7 +33,7 @@ namespace OnDiskPt
Word::Word(const Word &copy)
:m_isNonTerminal(copy.m_isNonTerminal)
,m_factors(copy.m_factors)
,m_vocabId(copy.m_vocabId)
{}
Word::~Word()
@ -40,23 +44,21 @@ void Word::CreateFromString(const std::string &inString, Vocab &vocab)
if (inString.substr(0, 1) == "[" && inString.substr(inString.size() - 1, 1) == "]") {
// non-term
m_isNonTerminal = true;
string str = inString.substr(1, inString.size() - 2);
m_vocabId = vocab.AddVocabId(str);
} else {
m_isNonTerminal = false;
m_vocabId = vocab.AddVocabId(inString);
}
m_factors.resize(1);
m_factors[0] = vocab.AddVocabId(inString);
}
size_t Word::WriteToMemory(char *mem) const
{
UINT64 *vocabMem = (UINT64*) mem;
vocabMem[0] = m_vocabId;
// factors
for (size_t ind = 0; ind < m_factors.size(); ind++)
vocabMem[ind] = m_factors[ind];
size_t size = sizeof(UINT64) * m_factors.size();
size_t size = sizeof(UINT64);
// is non-term
char bNonTerm = (char) m_isNonTerminal;
@ -66,16 +68,12 @@ size_t Word::WriteToMemory(char *mem) const
return size;
}
size_t Word::ReadFromMemory(const char *mem, size_t numFactors)
size_t Word::ReadFromMemory(const char *mem)
{
m_factors.resize(numFactors);
UINT64 *vocabMem = (UINT64*) mem;
m_vocabId = vocabMem[0];
// factors
for (size_t ind = 0; ind < m_factors.size(); ind++)
m_factors[ind] = vocabMem[ind];
size_t memUsed = sizeof(UINT64) * m_factors.size();
size_t memUsed = sizeof(UINT64);
// is non-term
char bNonTerm;
@ -86,34 +84,34 @@ size_t Word::ReadFromMemory(const char *mem, size_t numFactors)
return memUsed;
}
size_t Word::ReadFromFile(std::fstream &file, size_t numFactors)
size_t Word::ReadFromFile(std::fstream &file)
{
size_t memAlloc = numFactors * sizeof(UINT64) + sizeof(char);
size_t memAlloc = sizeof(UINT64) + sizeof(char);
char *mem = (char*) malloc(memAlloc);
file.read(mem, memAlloc);
size_t memUsed = ReadFromMemory(mem, numFactors);
size_t memUsed = ReadFromMemory(mem);
CHECK(memAlloc == memUsed);
free(mem);
return memUsed;
}
Moses::Word *Word::ConvertToMoses(Moses::FactorDirection direction
, const std::vector<Moses::FactorType> &outputFactorsVec
, const Vocab &vocab) const
{
Moses::Word *ret = new Moses::Word(m_isNonTerminal);
void Word::ConvertToMoses(
const std::vector<Moses::FactorType> &outputFactorsVec,
const Vocab &vocab,
Moses::Word &overwrite) const {
Moses::FactorCollection &factorColl = Moses::FactorCollection::Instance();
overwrite = Moses::Word(m_isNonTerminal);
for (size_t ind = 0; ind < m_factors.size(); ++ind) {
Moses::FactorType factorType = outputFactorsVec[ind];
UINT32 vocabId = m_factors[ind];
const Moses::Factor *factor = vocab.GetFactor(vocabId, factorType, direction, m_isNonTerminal);
ret->SetFactor(factorType, factor);
// TODO: this conversion should have been done at load time.
util::TokenIter<util::SingleCharacter> tok(vocab.GetString(m_vocabId), '|');
for (std::vector<Moses::FactorType>::const_iterator t = outputFactorsVec.begin(); t != outputFactorsVec.end(); ++t, ++tok) {
UTIL_THROW_IF(!tok, util::Exception, "Too few factors in \"" << vocab.GetString(m_vocabId) << "\"; was expecting " << outputFactorsVec.size());
overwrite.SetFactor(*t, factorColl.AddFactor(*tok));
}
return ret;
UTIL_THROW_IF(tok, util::Exception, "Too many factors in \"" << vocab.GetString(m_vocabId) << "\"; was expecting " << outputFactorsVec.size());
}
int Word::Compare(const Word &compare) const
@ -123,9 +121,9 @@ int Word::Compare(const Word &compare) const
if (m_isNonTerminal != compare.m_isNonTerminal)
return m_isNonTerminal ?-1 : 1;
if (m_factors < compare.m_factors)
if (m_vocabId < compare.m_vocabId)
ret = -1;
else if (m_factors > compare.m_factors)
else if (m_vocabId > compare.m_vocabId)
ret = 1;
else
ret = 0;
@ -147,27 +145,14 @@ bool Word::operator==(const Word &compare) const
void Word::DebugPrint(ostream &out, const Vocab &vocab) const
{
std::vector<UINT64>::const_iterator iter;
for (size_t ind = 0; ind < m_factors.size() - 1; ++ind) {
UINT64 vocabId = *iter;
const string &str = vocab.GetString(vocabId);
out << str << "|";
}
// last
UINT64 vocabId = m_factors.back();
const string &str = vocab.GetString(vocabId);
out << str;
const string &str = vocab.GetString(m_vocabId);
out << str;
}
std::ostream& operator<<(std::ostream &out, const Word &word)
{
out << "(";
std::vector<UINT64>::const_iterator iter;
for (iter = word.m_factors.begin(); iter != word.m_factors.end(); ++iter) {
out << *iter << "|";
}
out << word.m_vocabId;
out << (word.m_isNonTerminal ? "n" : "t");
out << ")";

View File

@ -22,6 +22,7 @@
#include <vector>
#include <iostream>
#include <fstream>
#include <boost/shared_ptr.hpp>
#include "Vocab.h"
namespace Moses
@ -33,21 +34,24 @@ namespace OnDiskPt
{
class Vocab;
/* A wrapper around a vocab id, and a boolean indicating whether it is a terminal or non-terminal.
* Factors can be represented by using a vocab string with | character, eg go|VB
*/
class Word
{
friend std::ostream& operator<<(std::ostream&, const Word&);
protected:
bool m_isNonTerminal;
std::vector<UINT64> m_factors;
UINT64 m_vocabId;
public:
explicit Word()
{}
explicit Word(size_t numFactors, bool isNonTerminal)
explicit Word(bool isNonTerminal)
:m_isNonTerminal(isNonTerminal)
,m_factors(numFactors)
,m_vocabId(0)
{}
Word(const Word &copy);
@ -60,16 +64,17 @@ public:
}
size_t WriteToMemory(char *mem) const;
size_t ReadFromMemory(const char *mem, size_t numFactors);
size_t ReadFromFile(std::fstream &file, size_t numFactors);
size_t ReadFromMemory(const char *mem);
size_t ReadFromFile(std::fstream &file);
void SetVocabId(size_t ind, UINT32 vocabId) {
m_factors[ind] = vocabId;
void SetVocabId(UINT32 vocabId) {
m_vocabId = vocabId;
}
Moses::Word *ConvertToMoses(Moses::FactorDirection direction
, const std::vector<Moses::FactorType> &outputFactorsVec
, const Vocab &vocab) const;
void ConvertToMoses(
const std::vector<Moses::FactorType> &outputFactorsVec,
const Vocab &vocab,
Moses::Word &overwrite) const;
virtual void DebugPrint(std::ostream &out, const Vocab &vocab) const;
@ -78,5 +83,7 @@ public:
bool operator==(const Word &compare) const;
};
typedef boost::shared_ptr<Word> WordPtr;
}

View File

@ -38,20 +38,20 @@ void Tokenize(OnDiskPt::Phrase &phrase
if (splitPos == string::npos) {
// lhs - only 1 word
Word *word = new Word();
WordPtr word (new Word());
word->CreateFromString(wordStr, onDiskWrapper.GetVocab());
phrase.AddWord(word);
} else {
// source & target non-terms
if (addSourceNonTerm) {
Word *word = new Word();
WordPtr word( new Word());
word->CreateFromString(wordStr, onDiskWrapper.GetVocab());
phrase.AddWord(word);
}
wordStr = token.substr(splitPos, tokSize - splitPos);
if (addTargetNonTerm) {
Word *word = new Word();
WordPtr word(new Word());
word->CreateFromString(wordStr, onDiskWrapper.GetVocab());
phrase.AddWord(word);
}
@ -59,7 +59,7 @@ void Tokenize(OnDiskPt::Phrase &phrase
}
} else {
// term
Word *word = new Word();
WordPtr word(new Word());
word->CreateFromString(token, onDiskWrapper.GetVocab());
phrase.AddWord(word);
}

View File

@ -1,3 +1,2 @@
exe biconcor : Vocabulary.cpp SuffixArray.cpp TargetCorpus.cpp Alignment.cpp Mismatch.cpp PhrasePair.cpp PhrasePairCollection.cpp biconcor.cpp base64.cpp ;
install legacy : biconcor : <location>. ;

4
bjam
View File

@ -4,8 +4,8 @@ if
bjam="$(which bjam 2>/dev/null)" && #exists
[ ${#bjam} != 0 ] && #paranoia about which printing nothing then returning true
! grep UFIHGUFIHBDJKNCFZXAEVA "${bjam}" </dev/null >/dev/null && #bjam in path isn't this script
"${bjam}" --help >/dev/null 2>/dev/null && #bjam in path isn't broken (i.e. has boost-build)
"${bjam}" --version |grep "Boost.Build 201" >/dev/null 2>/dev/null #It's recent enough.
"${bjam}" --sanity-test 2>/dev/null |grep Sane >/dev/null && #The test in jam-files/sanity.jam passes
(cd jam-files/fail && ! "${bjam}") >/dev/null #Returns non-zero on failure
then
#Delegate to system bjam
exec "${bjam}" "$@"

View File

@ -1,594 +0,0 @@
#! /usr/bin/env python
# -*- coding: utf_8 -*-
"""This program is used to prepare corpora extracted from TMX files.
It is particularly useful for translators not very familiar
with machine translation systems that want to use Moses with a highly customised
corpus.
It extracts from a directory containing TMX files (and from all of its subdirectories)
all the segments of one or more language pairs (except empty segments and segments that are equal in both languages)
and removes all other information. It then creates 2 separate monolingual files per language pair,
both of which have strictly parallel (aligned) segments. This kind of corpus can easily be transformed
in other formats, if need be.
The program requires that Pythoncard and wxPython (as well as Python) be previously installed.
Copyright 2009, João L. A. C. Rosas
Distributed under GNU GPL v3 licence (see http://www.gnu.org/licenses/)
E-mail: extracttmxcorpus@gmail.com """
__version__ = "$Revision: 1.043$"
__date__ = "$Date: 2011/08/13$"
__author__="$João L. A. C. Rosas$"
#Special thanks to Gary Daine for a helpful suggestion about a regex expression
#Updated to run on Linux by Tom Hoar
from PythonCard import clipboard, dialog, graphic, model
from PythonCard.components import button, combobox,statictext,checkbox,staticbox
import wx
import os, re
import string
import sys
from time import strftime
import codecs
class Extract_TMX_Corpus(model.Background):
    # PythonCard GUI application that extracts strictly-aligned bilingual
    # corpora (one file per language) from a directory tree of TMX files.
    def on_initialize(self, event):
        """Initialize values
        @self.inputdir: directory whose files will be treated
        @self.outputfile: base name of the resulting corpora files
        @self.outputpath: root directory of the resulting corpora files
        @currdir: program's current working directory
        @self.languages: list of languages whose segments can be processed
        @self.startinglanguage: something like 'EN-GB'
        @self.destinationlanguage: something like 'FR-FR'
        @self.components.cbStartingLanguage.items: list of values of the Starting Language combobox of the program's window
        @self.components.cbDestinationLanguage.items: list of values of the Destination Language combobox of the program's window
        @self.numtus: number of translation units extracted so far
        @self.presentfile: TMX file being currently processed
        @self.errortypes: variable that stocks the types of errors detected in the TMX file that is being processed
        @self.wroteactions: variable that indicates whether the actions files has already been written to
        """
        self.inputdir=''
        self.outputfile=''
        self.outputpath=''
        #Get directory where program file is and ...
        currdir=os.path.abspath(os.path.dirname(os.path.realpath(sys.argv[0])))
        #... load the file ("LanguageCodes.txt") with the list of languages that the program can process
        try:
            self.languages=open(currdir+os.sep+r'LanguageCodes.txt','r+').readlines()
        except:
            # If the languages file doesn't exist in the program directory, alert user that it is essential for the good working of the program and exit
            result = dialog.alertDialog(self, 'The file "LanguageCodes.txt" is missing. The program will now close.', 'Essential file missing')
            sys.exit()
        #remove end of line marker from each line in "LanguageCodes.txt"
        for lang in range(len(self.languages)):
            self.languages[lang]=self.languages[lang].rstrip()
        self.startinglanguage=''
        self.destinationlanguage=''
        #Insert list of language names in appropriate program window's combo boxes
        self.components.cbStartingLanguage.items=self.languages
        self.components.cbDestinationLanguage.items=self.languages
        # Counters updated by extract_language_segments_tmx():
        # tottus = translation units seen, numtus = units extracted,
        # numequaltus = units skipped because source == target.
        self.tottus=0
        self.numtus=0
        self.numequaltus=0
        self.presentfile=''
        self.errortypes=''
        self.wroteactions=False
        # File handle for the per-language-pair error log (opened later).
        self.errors=''
    def extract_language_segments_tmx(self,text):
        """Extracts TMX language segments from TMX files
        @text: the text of the TMX file
        @pattern: compiled regular expression object, which can be used for matching
        @tus: list that collects the translation units of the text
        @segs: list that collects the segment units of the relevant pair of languages
        @numtus: number of translation units extracted
        @present_tu: variable that stocks the translation unit relevant segments (of the chosen language pair) that are being processed
        @self.errortypes: variable that stocks the types of errors detected in the TMX file that is being processed

        Returns a (source_text, target_text) pair of newline-joined segments;
        returns ('','') when @text is empty or an error occurs.
        """
        #print 'extract_language_segments: start at '+strftime('%H-%M-%S')
        result=('','')
        try:
            if text:
                # Convert character entities to "normal" characters
                pattern=re.compile('&gt;',re.U)
                text=re.sub(pattern,'>',text)
                pattern=re.compile('&lt;',re.U)
                text=re.sub(pattern,'<',text)
                pattern=re.compile('&amp;',re.U)
                text=re.sub(pattern,'&',text)
                pattern=re.compile('&quot;',re.U)
                text=re.sub(pattern,'"',text)
                pattern=re.compile('&apos;',re.U)
                text=re.sub(pattern,"'",text)
                # Extract translation units
                pattern=re.compile('(?s)<tu.*?>(.*?)</tu>')
                tus=re.findall(pattern,text)
                ling1=''
                ling2=''
                #Extract relevant segments and store them in the @text variable
                if tus:
                    for tu in tus:
                        # First try the order: starting language <tuv> before destination language <tuv>
                        pattern=re.compile('(?s)<tuv.*?lang="'+self.startinglanguage+'">.*?<seg>(.*?)</seg>.*?<tuv.*?lang="'+self.destinationlanguage+'">.*?<seg>(.*?)</seg>')
                        present_tu=re.findall(pattern,tu)
                        self.tottus+=1
                        #reject empty segments
                        if present_tu: # and not present_tu[0][0].startswith("<")
                            present_tu1=present_tu[0][0].strip()
                            present_tu2=present_tu[0][1].strip()
                            # Strip inline TMX markup (<bpt>, <ept>, <ut>, <ph>) from both segments
                            present_tu1 = re.sub('<bpt.*</bpt>', '', present_tu1)
                            present_tu2 = re.sub('<bpt.*</bpt>', '', present_tu2)
                            present_tu1 = re.sub(r'<ept.*</ept>', '', present_tu1)
                            present_tu2 = re.sub(r'<ept.*</ept>', '', present_tu2)
                            present_tu1 = re.sub(r'<ut.*</ut>', '', present_tu1)
                            present_tu2 = re.sub(r'<ut.*</ut>', '', present_tu2)
                            present_tu1 = re.sub(r'<ph.*</ph>', '', present_tu1)
                            present_tu2 = re.sub(r'<ph.*</ph>', '', present_tu2)
                            #Thanks to Gary Daine
                            present_tu1 = re.sub('^[0-9\.() \t\-_]*$', '', present_tu1)
                            #Thanks to Gary Daine
                            present_tu2 = re.sub('^[0-9\.() \t\-_]*$', '', present_tu2)
                            if present_tu1 != present_tu2:
                                x=len(present_tu1)
                                y=len(present_tu2)
                                # Reject pairs whose lengths differ by more than 3x (likely misaligned)
                                if (x <= y*3) and (y <= x*3):
                                    ling1=ling1+present_tu1+'\n'
                                    ling2=ling2+present_tu2+'\n'
                                    self.numtus+=1
                            else:
                                self.numequaltus+=1
                        # Also try the reverse order: destination language <tuv> before starting language <tuv>
                        pattern=re.compile('(?s)<tuv.*?lang="'+self.destinationlanguage+'">.*?<seg>(.*?)</seg>.*?<tuv.*?lang="'+self.startinglanguage+'">.*?<seg>(.*?)</seg>')
                        present_tu=re.findall(pattern,tu)
                        #print present_tu
                        if present_tu:
                            # NOTE: capture groups are swapped here so present_tu1 is still the starting language
                            present_tu1=present_tu[0][1].strip()
                            present_tu2=present_tu[0][0].strip()
                            present_tu1 = re.sub('<bpt.*</bpt>', '', present_tu1)
                            present_tu2 = re.sub('<bpt.*</bpt>', '', present_tu2)
                            present_tu1 = re.sub(r'<ept.*</ept>', '', present_tu1)
                            present_tu2 = re.sub(r'<ept.*</ept>', '', present_tu2)
                            present_tu1 = re.sub(r'<ut.*</ut>', '', present_tu1)
                            present_tu2 = re.sub(r'<ut.*</ut>', '', present_tu2)
                            present_tu1 = re.sub(r'<ph.*</ph>', '', present_tu1)
                            present_tu2 = re.sub(r'<ph.*</ph>', '', present_tu2)
                            #Thanks to Gary Daine
                            present_tu1 = re.sub('^[0-9\.() \t\-_]*$', '', present_tu1)
                            #Thanks to Gary Daine
                            present_tu2 = re.sub('^[0-9\.() \t\-_]*$', '', present_tu2)
                            if present_tu1 != present_tu2:
                                x=len(present_tu1)
                                y=len(present_tu2)
                                if (x <= y*3) and (y <= x*3):
                                    ling1=ling1+present_tu1+'\n'
                                    ling2=ling2+present_tu2+'\n'
                                    self.numtus+=1
                            else:
                                self.numequaltus+=1
                result=(ling1,ling2)
        except:
            # NOTE(review): blanket except hides the actual exception type
            self.errortypes=self.errortypes+' - Extract Language Segments error\n'
        return result
def locate(self,pattern, basedir):
"""Locate all files matching supplied filename pattern in and below
supplied root directory.
@pattern: something like '*.tmx'
@basedir:whole directory to be treated
"""
import fnmatch
for path, dirs, files in os.walk(os.path.abspath(basedir)):
for filename in fnmatch.filter(files, pattern):
yield os.path.join(path, filename)
    def getallsegments(self):
        """Get all language segments from the TMX files in the specified
        directory
        @self.startinglanguage: something like 'EN-GB'
        @self.destinationlanguage: something like 'FR-FR'
        @fileslist: list of files that should be processed
        @self.inputdir: directory whose files will be treated
        @startfile:output file containing all segments in the @startinglanguage; file
        will be created in @self.inputdir
        @destfile:output file containing all segments in the @destinationlanguage; file
        will be created in @self.inputdir
        @actions:output file indicating the names of all files that were processed without errors; file
        will be created in @self.inputdir
        @self.errortypes: variable that stocks the types of errors detected in the TMX file that is being processed
        @self.presentfile: TMX file being currently processed
        @preptext: parsed XML text with all tags extracted and in string format
        @tus: list that receives the extracted TMX language translation units just with segments of the relevant language pair
        @num: loop control variable between 0 and length of @tus - 1
        @self.numtus: number of translation units extracted so far
        """
        self.statusBar.text='Processing '+ self.inputdir
        try:
            # Get a list of all TMX files that need to be processed
            fileslist=self.locate('*.tmx',self.inputdir)
            # Open output files for writing
            startfile=open(self.outputpath+os.sep+self.startinglanguage+ ' ('+self.destinationlanguage+')_' +self.outputfile,'w+b')
            destfile=open(self.outputpath+os.sep+self.destinationlanguage+' ('+self.startinglanguage+')_'+self.outputfile,'w+b')
            actions=open(self.outputpath+os.sep+'_processing_info'+os.sep+self.startinglanguage+ '-'+self.destinationlanguage+'_'+'actions_'+self.outputfile+'.txt','w+')
        except:
            # if any error up to now, add the name of the TMX file to the output file @errors
            # NOTE(review): if the try block failed before assigning fileslist,
            # the reference below raises NameError — TODO confirm intended behaviour
            self.errortypes=self.errortypes+' - Get All Segments: creation of output files error\n'
        if fileslist:
            # For each relevant TMX file ...
            for self.presentfile in fileslist:
                self.errortypes=''
                try:
                    print self.presentfile
                    # TMX files are read as UTF-16 with undecodable bytes replaced
                    fileObj = codecs.open(self.presentfile, "rb", "utf-16","replace",0 )
                    # NOTE(review): pos is assigned but never used
                    pos=0
                    while True:
                        # read a new chunk of text...
                        preptext = fileObj.read(692141)
                        if not preptext:
                            break
                        last5=''
                        y=''
                        #... and make it end at the end of a translation unit
                        while True:
                            y=fileObj.read(1)
                            if not y:
                                break
                            last5=last5+y
                            if '</tu>' in last5:
                                break
                        preptext=preptext+last5
                        # ... and extract its relevant segments ...
                        if not self.errortypes:
                            segs1,segs2=self.extract_language_segments_tmx(preptext)
                            preptext=''
                            #... and write those segments to the output files
                            if segs1 and segs2:
                                try:
                                    startfile.write('%s' % (segs1.encode('utf-8','strict')))
                                    destfile.write('%s' % (segs2.encode('utf-8','strict')))
                                except:
                                    self.errortypes=self.errortypes+' - Get All Segments: writing of output files error\n'
                                    print 'erro'
                    #if no errors up to now, insert the name of the TMX file in the @actions output file
                    #encoding is necessary because @actions may be in a directory whose name has special diacritic characters
                    if self.errortypes=='':
                        try:
                            actions.write(self.presentfile.encode('utf_8','replace')+'\n')
                            self.wroteactions=True
                        except:
                            self.errortypes=self.errortypes+' - Get All Segments: writing of actions file error\n'
                    fileObj.close()
                except:
                    self.errortypes=self.errortypes+' - Error reading input file\n'
            # Append summary statistics to the actions file once all files are done
            try:
                if self.wroteactions:
                    actions.write('\n*************************************************\n\n')
                    actions.write('Total number of translation units: '+str(self.tottus)+'\n')
                    actions.write('Number of extracted translation units (source segment not equal to destination segment): '+str(self.numtus)+'\n')
                    actions.write('Number of removed translation units (source segment equal to destination segment): '+str(self.numequaltus)+'\n')
                    actions.write('Number of empty translation units (source segment and/or destination segment not present): '+str(self.tottus-self.numequaltus-self.numtus))
            except:
                self.errortypes=self.errortypes+' - Get All Segments: writing of actions file error\n'
            # Close output files
            actions.close()
            destfile.close()
            startfile.close()
    def SelectDirectory(self):
        """Select the directory where the TMX files to be processed are
        @result: object returned by the dialog window with attributes accepted (true if user clicked OK button, false otherwise) and
        path (list of strings containing the full pathnames to all files selected by the user)
        @self.inputdir: directory where TMX files to be processed are (and where output files will be written)
        @self.statusBar.text: text displayed in the program window status bar"""
        result= dialog.directoryDialog(self, 'Choose a directory', 'a')
        if result.accepted:
            self.inputdir=result.path
            self.statusBar.text=self.inputdir+' selected.'

    def on_menuFileSelectDirectory_select(self, event):
        # Menu entry delegates to the shared implementation.
        self.SelectDirectory()

    def on_btnSelectDirectory_mouseClick(self, event):
        # Button click delegates to the shared implementation.
        self.SelectDirectory()
    def GetOutputFileBaseName(self):
        """Get base name of the corpus files
        @expr: variable containing the base name of the output files
        @wildcard: list of wildcards used in the dialog window to filter types of files
        @result: object returned by the Open File dialog window with attributes accepted (true if user clicked OK button, false otherwise) and
        path (list of strings containing the full pathnames to all files selected by the user)
        @self.inputdir: directory where TMX files to be processed are (and where output files will be written)
        @location: variable containing the full path to the base name output file
        @self.outputpath: base directory of output files
        @self.outputfile: base name of the output files
        """
        # Default base name of the corpora files that will be produced. If you choose as base name "Corpus.txt", as starting language "EN-GB" and as destination
        # language "FR-FR" the corpora files will be named "Corpus_EN-GB.txt" and "Corpus_FR-FR.txt"
        expr='Corpus'
        #open a dialog that lets you choose the base name of the corpora files that will be produced.
        wildcard = "Text files (*.txt;*.TXT)|*.txt;*.TXT"
        result = dialog.openFileDialog(None, "Name of corpus file", self.inputdir,expr,wildcard=wildcard)
        if result.accepted:
            location=os.path.split(result.paths[0])
            self.outputpath=location[0]
            self.outputfile = location[1]
            # Ensure the "_processing_info" subdirectory (for actions/errors logs) exists
            if not os.path.exists(self.outputpath+os.sep+'_processing_info'):
                try:
                    os.mkdir(self.outputpath+os.sep+'_processing_info')
                except:
                    result1 = dialog.alertDialog(self, "The program can't create the directory " + self.outputpath+os.sep+r'_processing_info, which is necessary for ' + \
                        'the creation of the output files. The program will now close.','Error')
                    sys.exit()

    def on_menuGetOutputFileBaseName_select(self, event):
        # Menu entry delegates to the shared implementation.
        self.GetOutputFileBaseName()

    def on_btnGetOutputFileBaseName_mouseClick(self, event):
        # Button click delegates to the shared implementation.
        self.GetOutputFileBaseName()
    def ExtractCorpus(self):
        """Get the directory where TMX files to be processed are, get the choice of the pair of languages that will be treated and launch the extraction
        of the corpus
        @self.errortypes: variable that stocks the types of errors detected in the TMX file that is being processed
        @self.presentfile: TMX file being currently processed
        @self.numtus: number of translation units extracted so far
        @self.startinglanguage: something like 'EN-GB'
        @self.destinationlanguage: something like 'FR-FR'
        @self.inputdir: directory whose files will be treated
        @self.components.cbStartingLanguage.items: list of values of the Starting Language combobox of the program's window
        @self.components.cbDestinationLanguage.items: list of values of the Destination Language combobox of the program's window
        @self.outputfile: base name of the resulting corpora files
        @self.errors:output file indicating the types of error that occurred in each processed TMX file
        @self.numtus: number of translation units extracted so far
        """
        print 'Extract corpus: started at '+strftime('%H-%M-%S')
        self.errortypes=''
        self.presentfile=''
        self.numtus=0
        #get the startinglanguage name (e.g.: "EN-GB") from the program window
        self.startinglanguage=self.components.cbStartingLanguage.text
        #get the destinationlanguage name from the program window
        self.destinationlanguage=self.components.cbDestinationLanguage.text
        #if the directory where TMX files (@inputdir) or the pair of languages were not previously chosen, open a dialog box explaining
        #the conditions that have to be met so that the extraction can be made and do nothing...
        if (self.inputdir=='') or (self.components.cbStartingLanguage.text=='') or (self.components.cbDestinationLanguage.text=='') or (self.outputfile=='') \
            or (self.components.cbStartingLanguage.text==self.components.cbDestinationLanguage.text):
            result = dialog.alertDialog(self, 'In order to extract a corpus, you need to:\n\n 1) indicate the directory where the TMX files are,\n 2)' \
                +' the starting language,\n 3) the destination language (the 2 languages must be different), and\n 4) the base name of the output files.', 'Error')
        #...else, go ahead
        else:
            try:
                self.errors=open(self.outputpath+os.sep+'_processing_info'+os.sep+self.startinglanguage+ '-'+self.destinationlanguage+'_'+'errors_'+self.outputfile+'.txt','w+')
            except:
                pass
            self.statusBar.text='Please wait. This can be a long process ...'
            #Launch the segment extraction
            self.numtus=0
            self.getallsegments()
            # if any error up to now, add the name of the TMX file to the output file @errors
            if self.errortypes:
                try:
                    self.errors.write(self.presentfile.encode('utf_8','replace')+':\n'+self.errortypes)
                except:
                    pass
            try:
                self.errors.close()
            except:
                pass
            self.statusBar.text='Processing finished.'
            #Open dialog box telling that processing is finished and where can the resulting files be found
            # NOTE(review): inputdir/outputfile/outputpath are cleared here, BEFORE the
            # dialog below interpolates them — the paths shown to the user are empty.
            self.inputdir=''
            self.outputfile=''
            self.outputpath=''
            print 'Extract corpus: finished at '+strftime('%H-%M-%S')
            result = dialog.alertDialog(self, 'Processing done. Results found in:\n\n1) '+ \
                self.outputpath+os.sep+self.startinglanguage+ ' ('+self.destinationlanguage+')_' +self.outputfile+ ' (starting language corpus)\n2) '+ \
                self.outputpath+os.sep+self.destinationlanguage+' ('+self.startinglanguage+')_'+self.outputfile+ \
                ' (destination language corpus)\n3) '+self.outputpath+os.sep+'_processing_info'+os.sep+self.startinglanguage+ '-'+self.destinationlanguage+'_'+ \
                'errors_'+self.outputfile+'.txt'+ ' (list of files that caused errors)\n4) '+self.outputpath+os.sep+'_processing_info'+os.sep+self.startinglanguage+ \
                '-'+self.destinationlanguage+'_'+'actions_'+self.outputfile+'.txt'+ ' (list of files where processing was successful)', 'Processing Done')

    def on_menuFileExtractCorpus_select(self, event):
        # Menu entry delegates to the shared implementation.
        self.ExtractCorpus()

    def on_btnExtractCorpus_mouseClick(self, event):
        # Button click delegates to the shared implementation.
        self.ExtractCorpus()
    def ExtractAllCorpora(self):
        """Extracts all the LanguagePairs that can be composed with the languages indicated in the file "LanguageCodes.txt"
        @self.presentfile: TMX file being currently processed
        @self.numtus: number of translation units extracted so far
        @numcorpora: number of language pair being processed
        @self.inputdir: directory whose files will be treated
        @self.outputfile: base name of the resulting corpora files
        @self.errors:output file indicating the types of error that occurred in each processed TMX file
        @self.startinglanguage: something like 'EN-GB'
        @self.destinationlanguage: something like 'FR-FR'
        @lang1: code of the starting language
        @lang2: code of the destination language
        @self.errortypes: variable that stocks the types of errors detected in the TMX file that is being processed
        @self.wroteactions: variable that indicates whether the actions files has already been written to
        """
        print 'Extract All Corpora: started at '+strftime('%H-%M-%S')
        self.presentfile=''
        self.numtus=0
        numcorpora=0
        #if the directory where TMX files (@inputdir) or the base name of the output files were not previously chosen, open a dialog box explaining
        #the conditions that have to be met so that the extraction can be made and do nothing...
        if (self.inputdir=='') or (self.outputfile==''):
            result = dialog.alertDialog(self, 'In order to extract all corpora, you need to:\n\n 1) indicate the directory where the TMX files are, and\n ' \
                + '2) the base name of the output files.', 'Error')
        #...else, go ahead
        else:
            try:
                # Process every unordered pair of distinct languages exactly once
                # (lang2 > lang1 avoids both duplicates and same-language pairs).
                for lang1 in self.languages:
                    for lang2 in self.languages:
                        if lang2 > lang1:
                            print lang1+'/'+lang2+' corpus being created...'
                            numcorpora=numcorpora+1
                            self.errortypes=''
                            self.numtus=0
                            self.wroteactions=False
                            #get the startinglanguage name (e.g.: "EN-GB") from the program window
                            self.startinglanguage=lang1
                            #get the destinationlanguage name from the program window
                            self.destinationlanguage=lang2
                            try:
                                self.errors=open(self.outputpath+os.sep+'_processing_info'+os.sep+self.startinglanguage+ '-'+self.destinationlanguage+'_'+'errors.txt','w+')
                            except:
                                pass
                            self.statusBar.text='Language pair '+str(numcorpora)+' being processed. Please wait.'
                            #Launch the segment extraction
                            self.getallsegments()
                            # if any error up to now, add the name of the TMX file to the output file @errors
                            if self.errortypes:
                                try:
                                    self.errors.write(self.presentfile.encode('utf_8','replace')+':\n'+self.errortypes.encode('utf_8','replace'))
                                except:
                                    pass
                            try:
                                self.errors.close()
                            except:
                                pass
                self.statusBar.text='Processing finished.'
            except:
                self.errortypes=self.errortypes+' - Extract All Corpora error\n'
                self.errors.write(self.presentfile.encode('utf_8','replace')+':\n'+self.errortypes.encode('utf_8','replace'))
                self.errors.close()
            #Open dialog box telling that processing is finished and where can the resulting files be found
            # NOTE(review): outputpath is cleared before being shown in the dialog below.
            self.inputdir=''
            self.outputfile=''
            self.outputpath=''
            print 'Extract All Corpora: finished at '+strftime('%H-%M-%S')
            result = dialog.alertDialog(self, 'Results found in: '+ self.outputpath+'.', 'Processing done')

    def on_menuFileExtractAllCorpora_select(self, event):
        # Menu entry delegates to the shared implementation.
        self.ExtractAllCorpora()

    def on_btnExtractAllCorpora_mouseClick(self, event):
        # Button click delegates to the shared implementation.
        self.ExtractAllCorpora()
    def ExtractSomeCorpora(self):
        """Extracts the segments of the LanguagePairs indicated in the file "LanguagePairs.txt" located in the program's root directory
        @self.presentfile: TMX file being currently processed
        @self.numtus: number of translation units extracted so far
        @currdir: current working directory of the program
        @pairsoflanguages: list of the pairs of language that are going to be processed
        @self.languages: list of languages whose segments can be processed
        @numcorpora: number of language pair being processed
        @self.inputdir: directory whose files will be treated
        @self.outputfile: base name of the resulting corpora files
        @self.errors:output file indicating the types of error that occurred in each processed TMX file
        @self.startinglanguage: something like 'EN-GB'
        @self.destinationlanguage: something like 'FR-FR'
        @lang1: code of the starting language
        @lang2: code of the destination language
        @self.errortypes: variable that stocks the types of errors detected in the TMX file that is being processed
        @self.wroteactions: variable that indicates whether the actions files has already been written to
        """
        print 'Extract Some Corpora: started at '+strftime('%H-%M-%S')
        self.presentfile=''
        self.numtus=0
        currdir=os.path.abspath(os.path.dirname(os.path.realpath(sys.argv[0])))
        #... load the file ("LanguageCodes.txt") with the list of languages that the program can process
        try:
            pairsoflanguages=open(currdir+os.sep+r'LanguagePairs.txt','r+').readlines()
        except:
            # If the languages file doesn't exist in the program directory, alert user that it is essential for the good working of the program and exit
            result = dialog.alertDialog(self, 'The file "LanguagePairs.txt" is missing. The program will now close.', 'Essential file missing')
            sys.exit()
        #remove end of line marker from each line in "LanguageCodes.txt"
        if pairsoflanguages:
            for item in range(len(pairsoflanguages)):
                # Each line "XX-XX/YY-YY" becomes a (lang1, lang2) tuple
                pairsoflanguages[item]=pairsoflanguages[item].strip()
                pos=pairsoflanguages[item].find("/")
                pairsoflanguages[item]=(pairsoflanguages[item][:pos],pairsoflanguages[item][pos+1:])
        else:
            # If the languages file is empty, alert user that it is essential for the good working of the program and exit
            result = dialog.alertDialog(self, 'The file "LanguagePairs.txt" is an essential file and is empty. The program will now close.', 'Empty file')
            sys.exit()
        #if the directory where TMX files (@inputdir) or the base name of the output files were not previously chosen, open a dialog box explaining
        #the conditions that have to be met so that the extraction can be made and do nothing...
        if (self.inputdir=='') or (self.outputfile==''):
            result = dialog.alertDialog(self, 'In order to extract all corpora, you need to:\n\n 1) indicate the directory where the TMX files are, and\n ' \
                + '2) the base name of the output files.', 'Error')
        #...else, go ahead
        else:
            numcorpora=0
            for (lang1,lang2) in pairsoflanguages:
                # NOTE: "<>" is the Python 2 inequality operator (this file is Python 2 only)
                if lang1<>lang2:
                    print lang1+'/'+lang2+' corpus being created...'
                    self.errortypes=''
                    numcorpora=numcorpora+1
                    #get the startinglanguage code (e.g.: "EN-GB")
                    self.startinglanguage=lang1
                    #get the destinationlanguage code
                    self.destinationlanguage=lang2
                    try:
                        self.errors=open(self.outputpath+os.sep+'_processing_info'+os.sep+self.startinglanguage+ '-'+self.destinationlanguage+'_'+'errors.txt','w+')
                    except:
                        pass
                    self.statusBar.text='Language pair '+str(numcorpora)+' being processed. Please wait.'
                    #Launch the segment extraction
                    self.numtus=0
                    self.wroteactions=False
                    self.getallsegments()
                    # if any error up to now, add the name of the TMX file to the output file @errors
                    if self.errortypes:
                        try:
                            self.errors.write(self.presentfile.encode('utf_8','replace')+':\n'+self.errortypes.encode('utf_8','replace'))
                        except:
                            pass
                    try:
                        self.errors.close()
                    except:
                        pass
                else:
                    result = dialog.alertDialog(self, 'A bilingual corpus involves two different languages. The pair "'+lang1+'/'+lang2 + \
                        '" will not be processed.', 'Alert')
            self.statusBar.text='Processing finished.'
            #Open dialog box telling that processing is finished and where can the resulting files be found
            # NOTE(review): outputpath is cleared before being shown in the dialog below.
            self.inputdir=''
            self.outputfile=''
            self.outputpath=''
            print 'Extract Some Corpora: finished at '+strftime('%H-%M-%S')
            result = dialog.alertDialog(self, 'Results found in: '+ self.outputpath+'.', 'Processing done')

    def on_menuFileExtractSomeCorpora_select(self, event):
        # Menu entry delegates to the shared implementation.
        self.ExtractSomeCorpora()

    def on_btnExtractSomeCorpora_mouseClick(self, event):
        # Button click delegates to the shared implementation.
        self.ExtractSomeCorpora()
    def on_menuHelpHelp_select(self, event):
        """Show the help file (_READ_ME_FIRST.txt) in a scrolled dialog,
        or an alert if the file cannot be read."""
        try:
            f = open('_READ_ME_FIRST.txt', "r")
            msg = f.read()
            result = dialog.scrolledMessageDialog(self, msg, 'readme.txt')
        except:
            result = dialog.alertDialog(self, 'Help file missing', 'Problem with the Help file')
# Entry point: start the PythonCard application with the main window class.
if __name__ == '__main__':
    app = model.Application(Extract_TMX_Corpus)
    app.MainLoop()

View File

@ -1,141 +0,0 @@
{'application':{'type':'Application',
'name':'Extract_TMX_Corpus',
'backgrounds': [
{'type':'Background',
'name':'bgExtract_TMX_Corpus',
'title':u'Extract_TMX_Corpus',
'size':(275, 410),
'statusBar':1,
'menubar': {'type':'MenuBar',
'menus': [
{'type':'Menu',
'name':'menuFile',
'label':'&File',
'items': [
{'type':'MenuItem',
'name':'menuFileSelectDirectory',
'label':u'Select &input/output directory...\tCtrl+I',
'command':'SelectListOfDirectories',
},
{'type':'MenuItem',
'name':'menuGetOutputFileBaseName',
'label':u'Get &output file base name...\tCtrl+O',
'command':'GetOutputFileBaseName',
},
{'type':'MenuItem',
'name':'fileSep1',
'label':'-',
},
{'type':'MenuItem',
'name':'menuFileExtractCorpus',
'label':u'&Extract corpus\tCtrl+E',
'command':'ExtractCorpus',
},
{'type':'MenuItem',
'name':'menuFileExtractSomeCorpora',
'label':u'Extract &some corpora\tCtrl+S',
'command':'ExtractSomeCorpora',
},
{'type':'MenuItem',
'name':'menuFileExtractAllCorpora',
'label':u'Extract &all corpora\tCtrl+A',
'command':'ExtractAllCorpora',
},
{'type':'MenuItem',
'name':'fileSep2',
'label':u'-',
},
{'type':'MenuItem',
'name':'menuFileExit',
'label':'E&xit\tAlt+X',
'command':'Doexit',
},
]
},
{'type':'Menu',
'name':'menuHelp',
'label':u'&Help',
'items': [
{'type':'MenuItem',
'name':'menuHelpHelp',
'label':u'&Help...\tCtrl+H',
},
]
},
]
},
'components': [
{'type':'Button',
'name':'btnExtractSomeCorpora',
'position':(18, 267),
'size':(225, 25),
'font':{'faceName': u'Arial', 'family': 'sansSerif', 'size': 10},
'label':u'Extract some corpora',
},
{'type':'Button',
'name':'btnExtractAllCorpora',
'position':(18, 233),
'size':(225, 25),
'font':{'faceName': u'Arial', 'family': 'sansSerif', 'size': 10},
'label':u'Extract all corpora',
},
{'type':'StaticText',
'name':'StaticText3',
'position':(18, 107),
'font':{'faceName': u'Arial', 'family': 'sansSerif', 'size': 10},
'text':u'Destination Language:',
},
{'type':'ComboBox',
'name':'cbDestinationLanguage',
'position':(18, 129),
'size':(225, -1),
'items':[],
},
{'type':'Button',
'name':'btnSelectDirectory',
'position':(18, 19),
'size':(225, 25),
'font':{'faceName': u'Arial', 'family': 'sansSerif', 'size': 10},
'label':u'Select input / output directory...',
},
{'type':'ComboBox',
'name':'cbStartingLanguage',
'position':(18, 74),
'size':(225, -1),
'items':[u'DE-PT', u'EN-PT', u'ES-PT', u'FR-PT'],
},
{'type':'Button',
'name':'btnGetOutputFileBaseName',
'position':(18, 166),
'size':(225, 25),
'font':{'faceName': u'Arial', 'family': 'sansSerif', 'size': 10},
'label':u'Select base name of output file...',
},
{'type':'Button',
'name':'btnExtractCorpus',
'position':(18, 200),
'size':(225, 25),
'font':{'faceName': u'Arial', 'family': 'sansSerif', 'size': 10},
'label':u'Extract one corpus',
},
{'type':'StaticText',
'name':'StaticText1',
'position':(18, 53),
'font':{'faceName': u'Arial', 'family': 'sansSerif', 'size': 10},
'text':u'Starting Language:',
},
] # end components
} # end background
] # end backgrounds
} }

View File

@ -1,22 +0,0 @@
BG-01
CS-01
DA-01
DE-DE
EL-01
EN-GB
ES-ES
ET-01
FI-01
FR-FR
HU-01
IT-IT
LT-01
LV-01
MT-01
NL-NL
PL-01
PT-PT
RO-RO
SK-01
SL-01
SV-SE

View File

@ -1,3 +0,0 @@
BG-01/CS-01
FR-FR/PT-PT
EN-GB/LT-01

View File

@ -1,241 +0,0 @@
Summary:
PURPOSE
PERFORMANCE
REQUIREMENTS
INSTALLATION
HOW TO USE
GETTING THE RESULTS
THANKS
LICENSE
********************************************************************************
PURPOSE:
********************************************************************************
This is the MS Windows and Linux version (tested with Ubuntu 10.10 and 11.04)
of Extract_Tmx_Corpus_1.044.
Extract_Tmx_Corpus_1.044 was created initially as a Windows program (tested in
Windows 7, Vista and XP) with a view to enable translators not necessarily with
a deep knowledge of linguistic tools to create highly customised corpora that
can be used with the Moses machine translation system and with other systems.
Some users call it "et cetera", playing a bit with its initials (ETC) and
meaning that it can treat a never-ending number of files.
In order to create corpora that are most useful to train machine translation
systems, one should strive to include segments that are relevant for the task in
hand. One of the ways of finding such segments could involve the usage of
previous translation memory files (TMX files). This way the corpora could be
customised for the person or for the type of task in question. The present
program uses such files as input.
The program can create strictly aligned corpora for a single pair of languages,
several pairs of languages or all the pairs of languages contained in the TMX
files.
The program creates 2 separate files (UTF-8 format; Unix line endings) for each
language pair that it processes: one for the starting language and another for
the destination language. The lines of a given TMX translation unit are placed
in strictly the same line in both files. The program suppresses empty TMX
translation units, as well as those where the text for the first language is the
same as that of the second language (like translation units consisting solely of
numbers, or those in which the first language segment has not been translated
into the second language). If you are interested in another format of corpus, it
should be relatively easy to adapt this format to the format you are interested
in.
The program also informs about errors that might occur during processing and
creates a file that lists the name(s) of the TMX files that caused them, as well
as a separate one listing the files successfully treated and the number of
segments extracted for the language pair.
********************************************************************************
REQUIREMENTS:
********************************************************************************
The program requires the following to be pre-installed in your computer:
1) Python 2.5 or higher (The program has been tested on Python 2.5 to 2.7.)
Windows users download and install from http://www.python.org/download/
Ubuntu users can use the pre-installed Python distribution
2) wxPython 2.8 or higher
Windows users download and install the Unicode version from
http://www.wxpython.org/download.php
Ubuntu users install with:
sudo apt-get install python-wxtools
3) Pythoncard 0.8.2 or higher
Windows users download and install
http://sourceforge.net/projects/pythoncard/files/PythonCard/0.8.2/PythonCard-0.8.2.win32.exe/download
Ubuntu/Debian users install with:
sudo apt-get install pythoncard
********************************************************************************
INSTALLATION:
********************************************************************************
Windows users:
1) Download the Extract_TMX_Corpus_1.044.exe file
2) Double-click Extract_TMX_Corpus_1.044.exe and follow the wizard's
instructions.
NOTE: Windows Vista users: to run the installation program, right-click on
the installation file in Windows Explorer and choose "Execute as administrator"
in the contextual menu.
Ubuntu users:
1) Download the Extract_TMX_Corpus.tgz compressed file to a directory of your choice.
2) Expand the compressed file and run from the expanded directory.
***IMPORTANT***: Never erase the file "LanguageCodes.txt" in that directory. It
is necessary for telling the program the languages that it has to process. If
your TMX files use language codes that are different from those contained in
this file, please replace the codes contained in the file with the codes used in
your TMX files. You can always add or delete new codes to this file (when the
program is not running).
********************************************************************************
HOW TO USE:
********************************************************************************
1) Create a directory where you will copy the TMX files that you want to
process.
2) Copy the TMX files to that directory.
Note: If you do not have TMX files, try the following site:
http://langtech.jrc.it/DGT-TM.html#Download. It contains the European Union
DGT's Translation Memory, containing legislative documents of the European
Union. For more details, see http://wt.jrc.it/lt/Acquis/DGT_TU_1.0/data/. These
files are compressed in zip format and need to be unzipped before they can be
used.
3) Launch the program.
4) Operate on the main window of the program in the direction from top to
bottom:
a) Click the "Select input/output directory" button to tell the root
directory where the TMX files are (this directory can have subdirectories,
all of which will also be processed), as well as where the output files
produced by the program will be placed;
NOTE: Please take note of this directory because the result files will also
be placed there.
b) In case you want to extract a ***single*** pair of languages, choose them
in the "Starting Language" and "Destination Language" comboboxes. Do nothing
if you want to extract more than one pair of languages.
c) Click the "Select base name of output file" button and choose a base name
for the output files (default: "Corpus.txt").
Note: This base name is used to compose the names of the output files, which
will also include the names of the starting and destination languages. If
you accept the default "Corpus.txt" and choose "EN-GB" as starting language
and "PT-PT" as destination language, for that corpus pair the respective
corpora files will be named, respectively, "EN-GB (PT-PT)_Corpus.txt" and
"PT-PT (EN-GB)_Corpus.txt".
***TIP***: The base name is useful for getting different names for different
corpora of the same language.
d) Click one (***just one***) of the following buttons:
- "Extract one corpus": this creates a single pair of strictly aligned
corpora in the languages chosen in the "Starting Language" and
"Destination Language" comboboxes;
- "Extract all corpora": this extracts all the combination pairs of
languages for all the languages available in the "Starting Language" and
"Destination language" comboboxes; if a language pair does not have
segments of both languages in all of the translation units of all the
TMX files, the result will be two empty corpora files for that language
pair. If, however, there is just a single relevant translation unit, the
corpus won't be empty.
- "Extract some corpora": this extracts the pairs of languages listed in
the file "LanguagePairs.txt". Each line of this file has the following
structure:
{Starting Language}/{Destination Language}.
Here is an example of a file with 2 lines:
EN-GB/PT-PT
FR-FR/PT-PT
This will create corpora for 4 pairs of languages: EN-PT, PT-EN, FR-PT and
PT-FR. A sample "LanguagePairs.txt" comes with the program to serve as an
example. Customise it to your needs respecting the syntax described above.
NOTE: Never erase the "LanguagePairs.txt" file and always make sure that each
pair of languages that you choose does exist in your TMX files. Otherwise, you
won't get any results.
The "Extract some corpora" and "Extract all corpora" functions are particularly
useful if you want to prepare corpora for several or many language pairs. If
your TMX files have translation units in all of the languages you are interested
in, put them in a single directory (it can have subdirectories) and use those
functions!
********************************************************************************
GETTING THE RESULTS:
********************************************************************************
The results are the aligned corpora files, as well as other files indicating how
well the processing was done.
When the processing is finished, you will find the corpora files in the
directory you have chosen when you selected "Select input/output directory". In
the "_processing_info" subdirectory of that directory you will find one or more
*errors.txt file(s), listing the name of the TMX files that caused an error, and
*actions.txt file(s), listing the files that were successfully processed as well
as the number of translation units extracted.
If you ask for the extraction of several corpora at once, you'll get lots of
corpora files. If you feel somewhat confused by that abundance, please note 2
things:
a) If you sort the files by order of modified date, you'll reconstitute the
chronological order in which the corpora were made (corpora are always made in
pairs one after the other);
b) The name of the corpora file has the following structure:
{Language of the segments} ({Language with which they are aligned})_{Base name
of the corpus}.txt
Example: the file "BG-01 (MT-01)_Corpus.txt" has segments in the BG-01
(Bulgarian) language that also have a translation in the MT-01 (Maltese)
language and corresponds to the corpus whose base name is "Corpus.txt". There
should be an equivalent "MT-01 (BG-01)_Corpus.txt", this time with all the
Maltese segments that have a translation in Bulgarian. Together, these 2 files
constitute an aligned corpus ready to be fed to Moses.
You can now feed Moses your customised corpora :-)
********************************************************************************
PERFORMANCE:
********************************************************************************
The program can process very large numbers of TMX files (tens of thousands or
more). It can also process extremely big TMX files (500 MB or more; it
successfully processed a 2.3 GB file). The extraction of the corpus of a pair of
languages in a very large (6.15 GB) set of TMX files took approximately 45
minutes in an Intel Core 2 Solo U3500 computer @ 1.4 GHz with 4 GB RAM.
The starting language and the destination language segments can be in any order
in the TMX files (e.g., the starting language segment may be found either before
or after the destination language segment in one, several or all translation
units of the TMX file).
The program accepts and preserves text in any language (including special
diacritical characters), but has only been tested with European Union official
languages.
********************************************************************************
THANKS:
********************************************************************************
Thanks to Gary Daine, who pointed out a way to improve one of the regex
expressions used in the code.
********************************************************************************
LICENSE:
********************************************************************************
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation (version 3 of the License).
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.

View File

@ -1,674 +0,0 @@
GNU GENERAL PUBLIC LICENSE
Version 3, 29 June 2007
Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
Preamble
The GNU General Public License is a free, copyleft license for
software and other kinds of works.
The licenses for most software and other practical works are designed
to take away your freedom to share and change the works. By contrast,
the GNU General Public License is intended to guarantee your freedom to
share and change all versions of a program--to make sure it remains free
software for all its users. We, the Free Software Foundation, use the
GNU General Public License for most of our software; it applies also to
any other work released this way by its authors. You can apply it to
your programs, too.
When we speak of free software, we are referring to freedom, not
price. Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
them if you wish), that you receive source code or can get it if you
want it, that you can change the software or use pieces of it in new
free programs, and that you know you can do these things.
To protect your rights, we need to prevent others from denying you
these rights or asking you to surrender the rights. Therefore, you have
certain responsibilities if you distribute copies of the software, or if
you modify it: responsibilities to respect the freedom of others.
For example, if you distribute copies of such a program, whether
gratis or for a fee, you must pass on to the recipients the same
freedoms that you received. You must make sure that they, too, receive
or can get the source code. And you must show them these terms so they
know their rights.
Developers that use the GNU GPL protect your rights with two steps:
(1) assert copyright on the software, and (2) offer you this License
giving you legal permission to copy, distribute and/or modify it.
For the developers' and authors' protection, the GPL clearly explains
that there is no warranty for this free software. For both users' and
authors' sake, the GPL requires that modified versions be marked as
changed, so that their problems will not be attributed erroneously to
authors of previous versions.
Some devices are designed to deny users access to install or run
modified versions of the software inside them, although the manufacturer
can do so. This is fundamentally incompatible with the aim of
protecting users' freedom to change the software. The systematic
pattern of such abuse occurs in the area of products for individuals to
use, which is precisely where it is most unacceptable. Therefore, we
have designed this version of the GPL to prohibit the practice for those
products. If such problems arise substantially in other domains, we
stand ready to extend this provision to those domains in future versions
of the GPL, as needed to protect the freedom of users.
Finally, every program is threatened constantly by software patents.
States should not allow patents to restrict development and use of
software on general-purpose computers, but in those that do, we wish to
avoid the special danger that patents applied to a free program could
make it effectively proprietary. To prevent this, the GPL assures that
patents cannot be used to render the program non-free.
The precise terms and conditions for copying, distribution and
modification follow.
TERMS AND CONDITIONS
0. Definitions.
"This License" refers to version 3 of the GNU General Public License.
"Copyright" also means copyright-like laws that apply to other kinds of
works, such as semiconductor masks.
"The Program" refers to any copyrightable work licensed under this
License. Each licensee is addressed as "you". "Licensees" and
"recipients" may be individuals or organizations.
To "modify" a work means to copy from or adapt all or part of the work
in a fashion requiring copyright permission, other than the making of an
exact copy. The resulting work is called a "modified version" of the
earlier work or a work "based on" the earlier work.
A "covered work" means either the unmodified Program or a work based
on the Program.
To "propagate" a work means to do anything with it that, without
permission, would make you directly or secondarily liable for
infringement under applicable copyright law, except executing it on a
computer or modifying a private copy. Propagation includes copying,
distribution (with or without modification), making available to the
public, and in some countries other activities as well.
To "convey" a work means any kind of propagation that enables other
parties to make or receive copies. Mere interaction with a user through
a computer network, with no transfer of a copy, is not conveying.
An interactive user interface displays "Appropriate Legal Notices"
to the extent that it includes a convenient and prominently visible
feature that (1) displays an appropriate copyright notice, and (2)
tells the user that there is no warranty for the work (except to the
extent that warranties are provided), that licensees may convey the
work under this License, and how to view a copy of this License. If
the interface presents a list of user commands or options, such as a
menu, a prominent item in the list meets this criterion.
1. Source Code.
The "source code" for a work means the preferred form of the work
for making modifications to it. "Object code" means any non-source
form of a work.
A "Standard Interface" means an interface that either is an official
standard defined by a recognized standards body, or, in the case of
interfaces specified for a particular programming language, one that
is widely used among developers working in that language.
The "System Libraries" of an executable work include anything, other
than the work as a whole, that (a) is included in the normal form of
packaging a Major Component, but which is not part of that Major
Component, and (b) serves only to enable use of the work with that
Major Component, or to implement a Standard Interface for which an
implementation is available to the public in source code form. A
"Major Component", in this context, means a major essential component
(kernel, window system, and so on) of the specific operating system
(if any) on which the executable work runs, or a compiler used to
produce the work, or an object code interpreter used to run it.
The "Corresponding Source" for a work in object code form means all
the source code needed to generate, install, and (for an executable
work) run the object code and to modify the work, including scripts to
control those activities. However, it does not include the work's
System Libraries, or general-purpose tools or generally available free
programs which are used unmodified in performing those activities but
which are not part of the work. For example, Corresponding Source
includes interface definition files associated with source files for
the work, and the source code for shared libraries and dynamically
linked subprograms that the work is specifically designed to require,
such as by intimate data communication or control flow between those
subprograms and other parts of the work.
The Corresponding Source need not include anything that users
can regenerate automatically from other parts of the Corresponding
Source.
The Corresponding Source for a work in source code form is that
same work.
2. Basic Permissions.
All rights granted under this License are granted for the term of
copyright on the Program, and are irrevocable provided the stated
conditions are met. This License explicitly affirms your unlimited
permission to run the unmodified Program. The output from running a
covered work is covered by this License only if the output, given its
content, constitutes a covered work. This License acknowledges your
rights of fair use or other equivalent, as provided by copyright law.
You may make, run and propagate covered works that you do not
convey, without conditions so long as your license otherwise remains
in force. You may convey covered works to others for the sole purpose
of having them make modifications exclusively for you, or provide you
with facilities for running those works, provided that you comply with
the terms of this License in conveying all material for which you do
not control copyright. Those thus making or running the covered works
for you must do so exclusively on your behalf, under your direction
and control, on terms that prohibit them from making any copies of
your copyrighted material outside their relationship with you.
Conveying under any other circumstances is permitted solely under
the conditions stated below. Sublicensing is not allowed; section 10
makes it unnecessary.
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
No covered work shall be deemed part of an effective technological
measure under any applicable law fulfilling obligations under article
11 of the WIPO copyright treaty adopted on 20 December 1996, or
similar laws prohibiting or restricting circumvention of such
measures.
When you convey a covered work, you waive any legal power to forbid
circumvention of technological measures to the extent such circumvention
is effected by exercising rights under this License with respect to
the covered work, and you disclaim any intention to limit operation or
modification of the work as a means of enforcing, against the work's
users, your or third parties' legal rights to forbid circumvention of
technological measures.
4. Conveying Verbatim Copies.
You may convey verbatim copies of the Program's source code as you
receive it, in any medium, provided that you conspicuously and
appropriately publish on each copy an appropriate copyright notice;
keep intact all notices stating that this License and any
non-permissive terms added in accord with section 7 apply to the code;
keep intact all notices of the absence of any warranty; and give all
recipients a copy of this License along with the Program.
You may charge any price or no price for each copy that you convey,
and you may offer support or warranty protection for a fee.
5. Conveying Modified Source Versions.
You may convey a work based on the Program, or the modifications to
produce it from the Program, in the form of source code under the
terms of section 4, provided that you also meet all of these conditions:
a) The work must carry prominent notices stating that you modified
it, and giving a relevant date.
b) The work must carry prominent notices stating that it is
released under this License and any conditions added under section
7. This requirement modifies the requirement in section 4 to
"keep intact all notices".
c) You must license the entire work, as a whole, under this
License to anyone who comes into possession of a copy. This
License will therefore apply, along with any applicable section 7
additional terms, to the whole of the work, and all its parts,
regardless of how they are packaged. This License gives no
permission to license the work in any other way, but it does not
invalidate such permission if you have separately received it.
d) If the work has interactive user interfaces, each must display
Appropriate Legal Notices; however, if the Program has interactive
interfaces that do not display Appropriate Legal Notices, your
work need not make them do so.
A compilation of a covered work with other separate and independent
works, which are not by their nature extensions of the covered work,
and which are not combined with it such as to form a larger program,
in or on a volume of a storage or distribution medium, is called an
"aggregate" if the compilation and its resulting copyright are not
used to limit the access or legal rights of the compilation's users
beyond what the individual works permit. Inclusion of a covered work
in an aggregate does not cause this License to apply to the other
parts of the aggregate.
6. Conveying Non-Source Forms.
You may convey a covered work in object code form under the terms
of sections 4 and 5, provided that you also convey the
machine-readable Corresponding Source under the terms of this License,
in one of these ways:
a) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by the
Corresponding Source fixed on a durable physical medium
customarily used for software interchange.
b) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by a
written offer, valid for at least three years and valid for as
long as you offer spare parts or customer support for that product
model, to give anyone who possesses the object code either (1) a
copy of the Corresponding Source for all the software in the
product that is covered by this License, on a durable physical
medium customarily used for software interchange, for a price no
more than your reasonable cost of physically performing this
conveying of source, or (2) access to copy the
Corresponding Source from a network server at no charge.
c) Convey individual copies of the object code with a copy of the
written offer to provide the Corresponding Source. This
alternative is allowed only occasionally and noncommercially, and
only if you received the object code with such an offer, in accord
with subsection 6b.
d) Convey the object code by offering access from a designated
place (gratis or for a charge), and offer equivalent access to the
Corresponding Source in the same way through the same place at no
further charge. You need not require recipients to copy the
Corresponding Source along with the object code. If the place to
copy the object code is a network server, the Corresponding Source
may be on a different server (operated by you or a third party)
that supports equivalent copying facilities, provided you maintain
clear directions next to the object code saying where to find the
Corresponding Source. Regardless of what server hosts the
Corresponding Source, you remain obligated to ensure that it is
available for as long as needed to satisfy these requirements.
e) Convey the object code using peer-to-peer transmission, provided
you inform other peers where the object code and Corresponding
Source of the work are being offered to the general public at no
charge under subsection 6d.
A separable portion of the object code, whose source code is excluded
from the Corresponding Source as a System Library, need not be
included in conveying the object code work.
A "User Product" is either (1) a "consumer product", which means any
tangible personal property which is normally used for personal, family,
or household purposes, or (2) anything designed or sold for incorporation
into a dwelling. In determining whether a product is a consumer product,
doubtful cases shall be resolved in favor of coverage. For a particular
product received by a particular user, "normally used" refers to a
typical or common use of that class of product, regardless of the status
of the particular user or of the way in which the particular user
actually uses, or expects or is expected to use, the product. A product
is a consumer product regardless of whether the product has substantial
commercial, industrial or non-consumer uses, unless such uses represent
the only significant mode of use of the product.
"Installation Information" for a User Product means any methods,
procedures, authorization keys, or other information required to install
and execute modified versions of a covered work in that User Product from
a modified version of its Corresponding Source. The information must
suffice to ensure that the continued functioning of the modified object
code is in no case prevented or interfered with solely because
modification has been made.
If you convey an object code work under this section in, or with, or
specifically for use in, a User Product, and the conveying occurs as
part of a transaction in which the right of possession and use of the
User Product is transferred to the recipient in perpetuity or for a
fixed term (regardless of how the transaction is characterized), the
Corresponding Source conveyed under this section must be accompanied
by the Installation Information. But this requirement does not apply
if neither you nor any third party retains the ability to install
modified object code on the User Product (for example, the work has
been installed in ROM).
The requirement to provide Installation Information does not include a
requirement to continue to provide support service, warranty, or updates
for a work that has been modified or installed by the recipient, or for
the User Product in which it has been modified or installed. Access to a
network may be denied when the modification itself materially and
adversely affects the operation of the network or violates the rules and
protocols for communication across the network.
Corresponding Source conveyed, and Installation Information provided,
in accord with this section must be in a format that is publicly
documented (and with an implementation available to the public in
source code form), and must require no special password or key for
unpacking, reading or copying.
7. Additional Terms.
"Additional permissions" are terms that supplement the terms of this
License by making exceptions from one or more of its conditions.
Additional permissions that are applicable to the entire Program shall
be treated as though they were included in this License, to the extent
that they are valid under applicable law. If additional permissions
apply only to part of the Program, that part may be used separately
under those permissions, but the entire Program remains governed by
this License without regard to the additional permissions.
When you convey a copy of a covered work, you may at your option
remove any additional permissions from that copy, or from any part of
it. (Additional permissions may be written to require their own
removal in certain cases when you modify the work.) You may place
additional permissions on material, added by you to a covered work,
for which you have or can give appropriate copyright permission.
Notwithstanding any other provision of this License, for material you
add to a covered work, you may (if authorized by the copyright holders of
that material) supplement the terms of this License with terms:
a) Disclaiming warranty or limiting liability differently from the
terms of sections 15 and 16 of this License; or
b) Requiring preservation of specified reasonable legal notices or
author attributions in that material or in the Appropriate Legal
Notices displayed by works containing it; or
c) Prohibiting misrepresentation of the origin of that material, or
requiring that modified versions of such material be marked in
reasonable ways as different from the original version; or
d) Limiting the use for publicity purposes of names of licensors or
authors of the material; or
e) Declining to grant rights under trademark law for use of some
trade names, trademarks, or service marks; or
f) Requiring indemnification of licensors and authors of that
material by anyone who conveys the material (or modified versions of
it) with contractual assumptions of liability to the recipient, for
any liability that these contractual assumptions directly impose on
those licensors and authors.
All other non-permissive additional terms are considered "further
restrictions" within the meaning of section 10. If the Program as you
received it, or any part of it, contains a notice stating that it is
governed by this License along with a term that is a further
restriction, you may remove that term. If a license document contains
a further restriction but permits relicensing or conveying under this
License, you may add to a covered work material governed by the terms
of that license document, provided that the further restriction does
not survive such relicensing or conveying.
If you add terms to a covered work in accord with this section, you
must place, in the relevant source files, a statement of the
additional terms that apply to those files, or a notice indicating
where to find the applicable terms.
Additional terms, permissive or non-permissive, may be stated in the
form of a separately written license, or stated as exceptions;
the above requirements apply either way.
8. Termination.
You may not propagate or modify a covered work except as expressly
provided under this License. Any attempt otherwise to propagate or
modify it is void, and will automatically terminate your rights under
this License (including any patent licenses granted under the third
paragraph of section 11).
However, if you cease all violation of this License, then your
license from a particular copyright holder is reinstated (a)
provisionally, unless and until the copyright holder explicitly and
finally terminates your license, and (b) permanently, if the copyright
holder fails to notify you of the violation by some reasonable means
prior to 60 days after the cessation.
Moreover, your license from a particular copyright holder is
reinstated permanently if the copyright holder notifies you of the
violation by some reasonable means, this is the first time you have
received notice of violation of this License (for any work) from that
copyright holder, and you cure the violation prior to 30 days after
your receipt of the notice.
Termination of your rights under this section does not terminate the
licenses of parties who have received copies or rights from you under
this License. If your rights have been terminated and not permanently
reinstated, you do not qualify to receive new licenses for the same
material under section 10.
9. Acceptance Not Required for Having Copies.
You are not required to accept this License in order to receive or
run a copy of the Program. Ancillary propagation of a covered work
occurring solely as a consequence of using peer-to-peer transmission
to receive a copy likewise does not require acceptance. However,
nothing other than this License grants you permission to propagate or
modify any covered work. These actions infringe copyright if you do
not accept this License. Therefore, by modifying or propagating a
covered work, you indicate your acceptance of this License to do so.
10. Automatic Licensing of Downstream Recipients.
Each time you convey a covered work, the recipient automatically
receives a license from the original licensors, to run, modify and
propagate that work, subject to this License. You are not responsible
for enforcing compliance by third parties with this License.
An "entity transaction" is a transaction transferring control of an
organization, or substantially all assets of one, or subdividing an
organization, or merging organizations. If propagation of a covered
work results from an entity transaction, each party to that
transaction who receives a copy of the work also receives whatever
licenses to the work the party's predecessor in interest had or could
give under the previous paragraph, plus a right to possession of the
Corresponding Source of the work from the predecessor in interest, if
the predecessor has it or can get it with reasonable efforts.
You may not impose any further restrictions on the exercise of the
rights granted or affirmed under this License. For example, you may
not impose a license fee, royalty, or other charge for exercise of
rights granted under this License, and you may not initiate litigation
(including a cross-claim or counterclaim in a lawsuit) alleging that
any patent claim is infringed by making, using, selling, offering for
sale, or importing the Program or any portion of it.
11. Patents.
A "contributor" is a copyright holder who authorizes use under this
License of the Program or a work on which the Program is based. The
work thus licensed is called the contributor's "contributor version".
A contributor's "essential patent claims" are all patent claims
owned or controlled by the contributor, whether already acquired or
hereafter acquired, that would be infringed by some manner, permitted
by this License, of making, using, or selling its contributor version,
but do not include claims that would be infringed only as a
consequence of further modification of the contributor version. For
purposes of this definition, "control" includes the right to grant
patent sublicenses in a manner consistent with the requirements of
this License.
Each contributor grants you a non-exclusive, worldwide, royalty-free
patent license under the contributor's essential patent claims, to
make, use, sell, offer for sale, import and otherwise run, modify and
propagate the contents of its contributor version.
In the following three paragraphs, a "patent license" is any express
agreement or commitment, however denominated, not to enforce a patent
(such as an express permission to practice a patent or covenant not to
sue for patent infringement). To "grant" such a patent license to a
party means to make such an agreement or commitment not to enforce a
patent against the party.
If you convey a covered work, knowingly relying on a patent license,
and the Corresponding Source of the work is not available for anyone
to copy, free of charge and under the terms of this License, through a
publicly available network server or other readily accessible means,
then you must either (1) cause the Corresponding Source to be so
available, or (2) arrange to deprive yourself of the benefit of the
patent license for this particular work, or (3) arrange, in a manner
consistent with the requirements of this License, to extend the patent
license to downstream recipients. "Knowingly relying" means you have
actual knowledge that, but for the patent license, your conveying the
covered work in a country, or your recipient's use of the covered work
in a country, would infringe one or more identifiable patents in that
country that you have reason to believe are valid.
If, pursuant to or in connection with a single transaction or
arrangement, you convey, or propagate by procuring conveyance of, a
covered work, and grant a patent license to some of the parties
receiving the covered work authorizing them to use, propagate, modify
or convey a specific copy of the covered work, then the patent license
you grant is automatically extended to all recipients of the covered
work and works based on it.
A patent license is "discriminatory" if it does not include within
the scope of its coverage, prohibits the exercise of, or is
conditioned on the non-exercise of one or more of the rights that are
specifically granted under this License. You may not convey a covered
work if you are a party to an arrangement with a third party that is
in the business of distributing software, under which you make payment
to the third party based on the extent of your activity of conveying
the work, and under which the third party grants, to any of the
parties who would receive the covered work from you, a discriminatory
patent license (a) in connection with copies of the covered work
conveyed by you (or copies made from those copies), or (b) primarily
for and in connection with specific products or compilations that
contain the covered work, unless you entered into that arrangement,
or that patent license was granted, prior to 28 March 2007.
Nothing in this License shall be construed as excluding or limiting
any implied license or other defenses to infringement that may
otherwise be available to you under applicable patent law.
12. No Surrender of Others' Freedom.
If conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot convey a
covered work so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you may
not convey it at all. For example, if you agree to terms that obligate you
to collect a royalty for further conveying from those to whom you convey
the Program, the only way you could satisfy both those terms and this
License would be to refrain entirely from conveying the Program.
13. Use with the GNU Affero General Public License.
Notwithstanding any other provision of this License, you have
permission to link or combine any covered work with a work licensed
under version 3 of the GNU Affero General Public License into a single
combined work, and to convey the resulting work. The terms of this
License will continue to apply to the part which is the covered work,
but the special requirements of the GNU Affero General Public License,
section 13, concerning interaction through a network will apply to the
combination as such.
14. Revised Versions of this License.
The Free Software Foundation may publish revised and/or new versions of
the GNU General Public License from time to time. Such new versions will
be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.
Each version is given a distinguishing version number. If the
Program specifies that a certain numbered version of the GNU General
Public License "or any later version" applies to it, you have the
option of following the terms and conditions either of that numbered
version or of any later version published by the Free Software
Foundation. If the Program does not specify a version number of the
GNU General Public License, you may choose any version ever published
by the Free Software Foundation.
If the Program specifies that a proxy can decide which future
versions of the GNU General Public License can be used, that proxy's
public statement of acceptance of a version permanently authorizes you
to choose that version for the Program.
Later license versions may give you additional or different
permissions. However, no additional obligations are imposed on any
author or copyright holder as a result of your choosing to follow a
later version.
15. Disclaimer of Warranty.
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
16. Limitation of Liability.
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
SUCH DAMAGES.
17. Interpretation of Sections 15 and 16.
If the disclaimer of warranty and limitation of liability provided
above cannot be given local legal effect according to their terms,
reviewing courts shall apply local law that most closely approximates
an absolute waiver of all civil liability in connection with the
Program, unless a warranty or assumption of liability accompanies a
copy of the Program in return for a fee.
END OF TERMS AND CONDITIONS
How to Apply These Terms to Your New Programs
If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.
To do so, attach the following notices to the program. It is safest
to attach them to the start of each source file to most effectively
state the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.
<one line to give the program's name and a brief idea of what it does.>
Copyright (C) <year> <name of author>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Also add information on how to contact you by electronic and paper mail.
If the program does terminal interaction, make it output a short
notice like this when it starts in an interactive mode:
<program> Copyright (C) <year> <name of author>
This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
This is free software, and you are welcome to redistribute it
under certain conditions; type `show c' for details.
The hypothetical commands `show w' and `show c' should show the appropriate
parts of the General Public License. Of course, your program's commands
might be different; for a GUI interface, you would use an "about box".
You should also get your employer (if you work as a programmer) or school,
if any, to sign a "copyright disclaimer" for the program, if necessary.
For more information on this, and how to apply and follow the GNU GPL, see
<http://www.gnu.org/licenses/>.
The GNU General Public License does not permit incorporating your program
into proprietary programs. If your program is a subroutine library, you
may consider it more useful to permit linking proprietary applications with
the library. If this is what you want to do, use the GNU Lesser General
Public License instead of this License. But first, please read
<http://www.gnu.org/philosophy/why-not-lgpl.html>.

View File

@ -1,22 +0,0 @@
BG-01
CS-01
DA-01
DE-DE
EL-01
EN-GB
ES-ES
ET-01
FI-01
FR-FR
HU-01
IT-IT
LT-01
LV-01
MT-01
NL-NL
PL-01
PT-PT
RO-RO
SK-01
SL-01
SV-SE

View File

@ -1,166 +0,0 @@
#! /usr/bin/env python
# -*- coding: utf_8 -*-
"""This program is used to prepare TMX files from corpora composed of 2 files for each language pair,
where the position of a segment in the first language file is exactly the same as in the second
language file.
The program requires that Pythoncard and wxPython (as well as Python) be previously installed.
Copyright 2009, 2010 João Luís A. C. Rosas
Distributed under GNU GPL v3 licence (see http://www.gnu.org/licenses/)
E-mail: joao.luis.rosas@gmail.com """
__version__ = "$Revision: 1.033$"
__date__ = "$Date: 2010/02/25$"
__author__="$João Luís A. C. Rosas$"
from PythonCard import clipboard, dialog, graphic, model
from PythonCard.components import button, combobox,statictext,checkbox,staticbox
import wx
import os, re
import string
import sys
from time import strftime
import codecs
class Moses2TMX(model.Background):
    """Main PythonCard window of the Moses2TMX tool.

    Walks a user-selected directory for pairs of strictly line-aligned corpus
    files ("{basename}" in the source language and "{basename}.{xx}.moses" in
    the target language) and writes one UTF-16 TMX file per pair.
    """

    def on_initialize(self, event):
        # PythonCard start-up hook: load the language list and populate the
        # two language combo boxes.
        self.inputdir=''
        #Get directory where program file is and ...
        currdir=os.path.abspath(os.path.dirname(os.path.realpath(sys.argv[0])))
        #... load the file ("LanguageCodes.txt") with the list of languages that the program can process
        try:
            self.languages=open(currdir+os.sep+r'LanguageCodes.txt','r+').readlines()
        except:
            # If the languages file doesn't exist in the program directory, alert user that it is essential for the good working of the program and exit
            result = dialog.alertDialog(self, 'The file "LanguageCodes.txt" is missing. The program will now close.', 'Essential file missing')
            sys.exit()
        #remove end of line marker from each line in "LanguageCodes.txt"
        for lang in range(len(self.languages)):
            self.languages[lang]=self.languages[lang].rstrip()
        self.lang1code=''
        self.lang2code=''
        #Insert list of language names in appropriate program window's combo boxes
        self.components.cbStartingLanguage.items=self.languages
        self.components.cbDestinationLanguage.items=self.languages

    def CreateTMX(self, name):
        # Build "{name}.tmx" from "{name}" (source language) and
        # "{name}.{xx}.moses" (target language); the two inputs are expected
        # to be strictly line-aligned.
        print 'Started at '+strftime('%H-%M-%S')
        #get the startinglanguage name (e.g.: "EN-GB") from the program window
        self.lang1code=self.components.cbStartingLanguage.text
        #get the destinationlanguage name from the program window
        self.lang2code=self.components.cbDestinationLanguage.text
        print name+'.'+self.lang2code[:2].lower()
        e=codecs.open(name,'r',"utf-8","strict")
        f=codecs.open(name+'.'+self.lang2code[:2].lower()+'.moses','r',"utf-8","strict")
        a=codecs.open(name+'.tmp','w',"utf-8","strict")
        b=codecs.open(name+'.'+self.lang2code[:2].lower()+'.moses.tmp','w',"utf-8","strict")
        # Copy only non-blank lines into the .tmp files so both sides stay
        # aligned line-for-line.
        for line in e:
            if line.strip():
                a.write(line)
        for line in f:
            if line.strip():
                b.write(line)
        # NOTE(review): the write handles are reassigned without an explicit
        # close(); flushing relies on the old file objects being finalized
        # before the reads below — confirm on non-CPython runtimes.
        a=codecs.open(name+'.tmp','r',"utf-8","strict")
        b=codecs.open(name+'.'+self.lang2code[:2].lower()+'.moses.tmp','r',"utf-8","strict")
        g=codecs.open(name+'.tmx','w','utf-16','strict')
        # TMX 1.4 header; the source language comes from the combo box.
        g.write('<?xml version="1.0" ?>\n<!DOCTYPE tmx SYSTEM "tmx14.dtd">\n<tmx version="version 1.4">\n\n<header\ncreationtool="moses2tmx"\ncreationtoolversion="1.032"\nsegtype="sentence"\ndatatype="PlainText"\nadminlang="EN-US"\nsrclang="'+self.lang1code+'"\n>\n</header>\n\n<body>\n')
        parar=0  # NOTE(review): unused variable
        while True:
            # Read one segment from each side; stop at the first empty
            # line / end of file on either side.
            self.ling1segm=a.readline().strip()
            self.ling2segm=b.readline().strip()
            if not self.ling1segm:
                break
            elif not self.ling2segm:
                break
            else:
                try:
                    # One <tu> per aligned segment pair.
                    g.write('<tu creationid="MT!">\n<prop type="Txt::Translator">Moses</prop>\n<tuv xml:lang="'+self.lang1code+'">\n<seg>'+self.ling1segm+'</seg>\n</tuv>\n<tuv xml:lang="'+self.lang2code+ \
                        '">\n<seg>'+self.ling2segm+'</seg>\n</tuv>\n</tu>\n\n')
                except:
                    # Best-effort: silently skip a pair that cannot be written.
                    pass
        a.close()
        b.close()
        e.close()
        f.close()
        g.write('</body>\n</tmx>\n')
        g.close()
        #os.remove(name)
        #os.remove(name+'.'+self.lang2code[:2].lower()+'.moses')
        os.remove(name+'.tmp')
        os.remove(name+'.'+self.lang2code[:2].lower()+'.moses.tmp')

    def createTMXs(self):
        # Process every "*.moses" file found under self.inputdir.
        try:
            # Get a list of all TMX files that need to be processed
            fileslist=self.locate('*.moses',self.inputdir)
        except:
            # if any error up to now, add the name of the TMX file to the output file @errors
            # NOTE(review): self.errortypes is never initialised in this
            # class, so reaching this branch would raise AttributeError.
            # locate() is a generator, so this except is unlikely to trigger.
            self.errortypes=self.errortypes+' - Get All Segments: creation of output files error\n'
        if fileslist:
            # For each relevant TMX file ...
            for self.presentfile in fileslist:
                filename=self.presentfile[:-9]  # drop the trailing ".xx.moses" (9 chars)
                #print filename
                self.CreateTMX(filename)
        print 'Finished at '+strftime('%H-%M-%S')
        result = dialog.alertDialog(self, 'Processing done.', 'Processing Done')

    def on_btnCreateTMX_mouseClick(self, event):
        # "Create TMX Files" button handler.
        self.createTMXs()

    def on_menuFileCreateTMXFiles_select(self, event):
        # File > Create TMX Files menu handler.
        self.createTMXs()

    def on_btnSelectLang1File_mouseClick(self, event):
        # NOTE(review): GetInputFileName is not defined in this class; this
        # handler would raise AttributeError if ever triggered.
        self.input1=self.GetInputFileName()

    def on_btnSelectLang2File_mouseClick(self, event):
        # NOTE(review): see on_btnSelectLang1File_mouseClick.
        self.input2=self.GetInputFileName()

    def locate(self,pattern, basedir):
        """Locate all files matching supplied filename pattern in and below
        supplied root directory.
        @pattern: something like '*.tmx'
        @basedir:whole directory to be treated
        """
        import fnmatch
        for path, dirs, files in os.walk(os.path.abspath(basedir)):
            for filename in fnmatch.filter(files, pattern):
                yield os.path.join(path, filename)

    def SelectDirectory(self):
        """Select the directory where the files to be processed are
        @result: object returned by the dialog window with attributes accepted (true if user clicked OK button, false otherwise) and
        path (list of strings containing the full pathnames to all files selected by the user)
        @self.inputdir: directory where files to be processed are (and where output files will be written)
        @self.statusBar.text: text displayed in the program window status bar"""
        result= dialog.directoryDialog(self, 'Choose a directory', 'a')
        if result.accepted:
            self.inputdir=result.path
            self.statusBar.text=self.inputdir+' selected.'

    def on_menuFileSelectDirectory_select(self, event):
        # File > Select Directory menu handler.
        self.SelectDirectory()

    def on_btnSelectDirectory_mouseClick(self, event):
        # "Select Directory ..." button handler.
        self.SelectDirectory()

    def on_menuHelpShowHelp_select(self, event):
        # Help > Show Help: display the bundled README in a scrolled dialog.
        f = open('_READ_ME_FIRST.txt', "r")
        msg = f.read()
        result = dialog.scrolledMessageDialog(self, msg, '_READ_ME_FIRST.txt')

    def on_menuFileExit_select(self, event):
        # File > Exit menu handler.
        sys.exit()
if __name__ == '__main__':
    # Start the PythonCard application loop with Moses2TMX as the main window.
    app = model.Application(Moses2TMX)
    app.MainLoop()

View File

@ -1,95 +0,0 @@
# PythonCard resource definition for the Moses2TMX window: the menu bar
# (File / Help) and the components (directory button, source/target language
# combo boxes, and the "Create TMX Files" button).
{'application':{'type':'Application',
          'name':'Moses2TMX',
    'backgrounds': [
    {'type':'Background',
          'name':'bgMoses2TMX',
          'title':u'Moses2TMX-1.032',
          'size':(277, 307),
          'statusBar':1,
         'menubar': {'type':'MenuBar',
         'menus': [
             {'type':'Menu',
             'name':'menuFile',
             'label':u'&File',
             'items': [
                  {'type':'MenuItem',
                   'name':'menuFileSelectDirectory',
                   'label':u'Select &Directory ...\tAlt+D',
                  },
                  {'type':'MenuItem',
                   'name':'menuFileCreateTMXFiles',
                   'label':u'&Create TMX Files\tAlt+C',
                  },
                  {'type':'MenuItem',
                   'name':'Sep1',
                   'label':u'-',
                  },
                  {'type':'MenuItem',
                   'name':'menuFileExit',
                   'label':u'&Exit\tAlt+E',
                  },
              ]
             },
             {'type':'Menu',
             'name':'menuHelp',
             'label':u'&Help',
             'items': [
                  {'type':'MenuItem',
                   'name':'menuHelpShowHelp',
                   'label':u'&Show Help\tAlt+S',
                  },
             ]
             },
         ]
     },
         'components': [
{'type':'Button',
    'name':'btnSelectDirectory',
    'position':(15, 15),
    'size':(225, 25),
    'font':{'faceName': u'Arial', 'family': 'sansSerif', 'size': 10},
    'label':u'Select Directory ...',
    },
{'type':'StaticText',
    'name':'StaticText3',
    'position':(17, 106),
    'font':{'faceName': u'Arial', 'family': 'sansSerif', 'size': 10},
    'text':u'Target Language:',
    },
{'type':'ComboBox',
    'name':'cbStartingLanguage',
    'position':(18, 75),
    'size':(70, -1),
    'items':[],
    },
{'type':'ComboBox',
    'name':'cbDestinationLanguage',
    'position':(17, 123),
    'size':(70, -1),
    'items':[u'DE-PT', u'EN-PT', u'ES-PT', u'FR-PT'],
    },
{'type':'Button',
    'name':'btnCreateTMX',
    'position':(20, 160),
    'size':(225, 25),
    'font':{'faceName': u'Arial', 'family': 'sansSerif', 'size': 10},
    'label':u'Create TMX Files',
    },
{'type':'StaticText',
    'name':'StaticText1',
    'position':(18, 56),
    'font':{'faceName': u'Arial', 'family': 'sansSerif', 'size': 10},
    'text':u'Source Language:',
    },
] # end components
} # end background
] # end backgrounds
} }

View File

@ -1,127 +0,0 @@
Summary:
PURPOSE
REQUIREMENTS
INSTALLATION
HOW TO USE
LICENSE
********************************************************************************
PURPOSE:
********************************************************************************
This is the MS Windows and Linux version (tested with Ubuntu 10.10 and 11.04) of
Moses2TMX 1.033.
Moses2TMX started as a Windows program (tested on Windows7, Vista and XP) that
enables translators not necessarily with a deep knowledge of linguistic tools to
create TMX files from a Moses corpus or from any other corpus made up of 2
separate files, one for the source language and another for the target language,
whose lines are strictly aligned.
The program processes a whole directory containing source language and
corresponding target language documents and creates 1 TMX file (UTF-16 format;
Windows line endings) for each document pair that it processes.
The program accepts and preserves text in any language (including special
diacritical characters), but has only been tested with European Union official
languages.
The program is specifically intended to work with the output of a series of
Linux scripts together called Moses-for-Mere-Mortals.
********************************************************************************
REQUIREMENTS:
********************************************************************************
The program requires the following to be pre-installed in your computer:
1) Python 2.5 or higher (The program has been tested on Python 2.5 to 2.7)
Windows users download and install from http://www.python.org/download/
Ubuntu users can use the pre-installed Python distribution
2) wxPython 2.8 or higher
Windows users download and install the Unicode version from
http://www.wxpython.org/download.php
Ubuntu users install with: sudo apt-get install python-wxtools
3) Pythoncard 0.8.2 or higher
Windows users download and install
http://sourceforge.net/projects/pythoncard/files/PythonCard/0.8.2/PythonCard-0.8.2.win32.exe/download
Ubuntu users install with: sudo apt-get install pythoncard
********************************************************************************
INSTALLATION:
********************************************************************************
Windows users:
1) Download the Moses2TMX.exe file to a directory of your choice.
2) Double-click Moses2TMX.exe and follow the wizard's instructions.
NOTE: Windows Vista users: to run the installation programs, right-click on
the installation file in Windows Explorer and choose "Execute as administrator"
in the contextual menu.
Ubuntu users:
1) Download the Moses2TMX.tgz compressed file to a directory of your choice.
2) Expand the compressed file and run from the expanded directory.
***IMPORTANT***: Never erase the file "LanguageCodes.txt" in that directory. It
is necessary for telling the program the languages that it has to process. If
your TMX files use language codes that are different from those contained in
this file, please replace the codes contained in the file with the codes used in
your TMX files. You can always add or delete new codes to this file (when the
program is not running).
********************************************************************************
HOW TO USE:
********************************************************************************
1) Create a directory where you will copy the files that you want to process.
2) Copy the source and target language documents that you want to process to
that directory.
NOTE YOU HAVE TO RESPECT SOME NAMING CONVENTIONS IN ORDER TO BE ABLE TO USE
THIS PROGRAM:
a) the target documents have to follow this naming convention:
{basename}.{abbreviation of target language}.moses
where {abbreviation of target language} is a ***2 character*** string
containing the lowercased first 2 characters of any of the language
codes present in the LanguageCodes.txt (present in the base directory of
Moses2TMX)
Example: If {basename} = "200000" and the target language has a code
"EN-GB" in the LanguageCodes.txt, then the name of the target file
should be "200000.en.moses"
b) the source language document should have the name:
{basename}
Example: continuing the preceding example, the name of the corresponding
source document should be "200000".
3) Launch the program as indicated above in the "Launching the program" section.
4) Operate on the main window of the program in the direction from top to
bottom:
a) Click the "Select Directory..." button to indicate the directory
containing all the source and corresponding target documents that you want
to process;
b) Indicate the languages that your files refer to in the "Source Language"
and "Target Language" comboboxes;
c) Click the Create TMX Files button.
********************************************************************************
LICENSE:
********************************************************************************
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation (version 3 of the License).
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.

View File

@ -1,674 +0,0 @@
GNU GENERAL PUBLIC LICENSE
Version 3, 29 June 2007
Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
Preamble
The GNU General Public License is a free, copyleft license for
software and other kinds of works.
The licenses for most software and other practical works are designed
to take away your freedom to share and change the works. By contrast,
the GNU General Public License is intended to guarantee your freedom to
share and change all versions of a program--to make sure it remains free
software for all its users. We, the Free Software Foundation, use the
GNU General Public License for most of our software; it applies also to
any other work released this way by its authors. You can apply it to
your programs, too.
When we speak of free software, we are referring to freedom, not
price. Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
them if you wish), that you receive source code or can get it if you
want it, that you can change the software or use pieces of it in new
free programs, and that you know you can do these things.
To protect your rights, we need to prevent others from denying you
these rights or asking you to surrender the rights. Therefore, you have
certain responsibilities if you distribute copies of the software, or if
you modify it: responsibilities to respect the freedom of others.
For example, if you distribute copies of such a program, whether
gratis or for a fee, you must pass on to the recipients the same
freedoms that you received. You must make sure that they, too, receive
or can get the source code. And you must show them these terms so they
know their rights.
Developers that use the GNU GPL protect your rights with two steps:
(1) assert copyright on the software, and (2) offer you this License
giving you legal permission to copy, distribute and/or modify it.
For the developers' and authors' protection, the GPL clearly explains
that there is no warranty for this free software. For both users' and
authors' sake, the GPL requires that modified versions be marked as
changed, so that their problems will not be attributed erroneously to
authors of previous versions.
Some devices are designed to deny users access to install or run
modified versions of the software inside them, although the manufacturer
can do so. This is fundamentally incompatible with the aim of
protecting users' freedom to change the software. The systematic
pattern of such abuse occurs in the area of products for individuals to
use, which is precisely where it is most unacceptable. Therefore, we
have designed this version of the GPL to prohibit the practice for those
products. If such problems arise substantially in other domains, we
stand ready to extend this provision to those domains in future versions
of the GPL, as needed to protect the freedom of users.
Finally, every program is threatened constantly by software patents.
States should not allow patents to restrict development and use of
software on general-purpose computers, but in those that do, we wish to
avoid the special danger that patents applied to a free program could
make it effectively proprietary. To prevent this, the GPL assures that
patents cannot be used to render the program non-free.
The precise terms and conditions for copying, distribution and
modification follow.
TERMS AND CONDITIONS
0. Definitions.
"This License" refers to version 3 of the GNU General Public License.
"Copyright" also means copyright-like laws that apply to other kinds of
works, such as semiconductor masks.
"The Program" refers to any copyrightable work licensed under this
License. Each licensee is addressed as "you". "Licensees" and
"recipients" may be individuals or organizations.
To "modify" a work means to copy from or adapt all or part of the work
in a fashion requiring copyright permission, other than the making of an
exact copy. The resulting work is called a "modified version" of the
earlier work or a work "based on" the earlier work.
A "covered work" means either the unmodified Program or a work based
on the Program.
To "propagate" a work means to do anything with it that, without
permission, would make you directly or secondarily liable for
infringement under applicable copyright law, except executing it on a
computer or modifying a private copy. Propagation includes copying,
distribution (with or without modification), making available to the
public, and in some countries other activities as well.
To "convey" a work means any kind of propagation that enables other
parties to make or receive copies. Mere interaction with a user through
a computer network, with no transfer of a copy, is not conveying.
An interactive user interface displays "Appropriate Legal Notices"
to the extent that it includes a convenient and prominently visible
feature that (1) displays an appropriate copyright notice, and (2)
tells the user that there is no warranty for the work (except to the
extent that warranties are provided), that licensees may convey the
work under this License, and how to view a copy of this License. If
the interface presents a list of user commands or options, such as a
menu, a prominent item in the list meets this criterion.
1. Source Code.
The "source code" for a work means the preferred form of the work
for making modifications to it. "Object code" means any non-source
form of a work.
A "Standard Interface" means an interface that either is an official
standard defined by a recognized standards body, or, in the case of
interfaces specified for a particular programming language, one that
is widely used among developers working in that language.
The "System Libraries" of an executable work include anything, other
than the work as a whole, that (a) is included in the normal form of
packaging a Major Component, but which is not part of that Major
Component, and (b) serves only to enable use of the work with that
Major Component, or to implement a Standard Interface for which an
implementation is available to the public in source code form. A
"Major Component", in this context, means a major essential component
(kernel, window system, and so on) of the specific operating system
(if any) on which the executable work runs, or a compiler used to
produce the work, or an object code interpreter used to run it.
The "Corresponding Source" for a work in object code form means all
the source code needed to generate, install, and (for an executable
work) run the object code and to modify the work, including scripts to
control those activities. However, it does not include the work's
System Libraries, or general-purpose tools or generally available free
programs which are used unmodified in performing those activities but
which are not part of the work. For example, Corresponding Source
includes interface definition files associated with source files for
the work, and the source code for shared libraries and dynamically
linked subprograms that the work is specifically designed to require,
such as by intimate data communication or control flow between those
subprograms and other parts of the work.
The Corresponding Source need not include anything that users
can regenerate automatically from other parts of the Corresponding
Source.
The Corresponding Source for a work in source code form is that
same work.
2. Basic Permissions.
All rights granted under this License are granted for the term of
copyright on the Program, and are irrevocable provided the stated
conditions are met. This License explicitly affirms your unlimited
permission to run the unmodified Program. The output from running a
covered work is covered by this License only if the output, given its
content, constitutes a covered work. This License acknowledges your
rights of fair use or other equivalent, as provided by copyright law.
You may make, run and propagate covered works that you do not
convey, without conditions so long as your license otherwise remains
in force. You may convey covered works to others for the sole purpose
of having them make modifications exclusively for you, or provide you
with facilities for running those works, provided that you comply with
the terms of this License in conveying all material for which you do
not control copyright. Those thus making or running the covered works
for you must do so exclusively on your behalf, under your direction
and control, on terms that prohibit them from making any copies of
your copyrighted material outside their relationship with you.
Conveying under any other circumstances is permitted solely under
the conditions stated below. Sublicensing is not allowed; section 10
makes it unnecessary.
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
No covered work shall be deemed part of an effective technological
measure under any applicable law fulfilling obligations under article
11 of the WIPO copyright treaty adopted on 20 December 1996, or
similar laws prohibiting or restricting circumvention of such
measures.
When you convey a covered work, you waive any legal power to forbid
circumvention of technological measures to the extent such circumvention
is effected by exercising rights under this License with respect to
the covered work, and you disclaim any intention to limit operation or
modification of the work as a means of enforcing, against the work's
users, your or third parties' legal rights to forbid circumvention of
technological measures.
4. Conveying Verbatim Copies.
You may convey verbatim copies of the Program's source code as you
receive it, in any medium, provided that you conspicuously and
appropriately publish on each copy an appropriate copyright notice;
keep intact all notices stating that this License and any
non-permissive terms added in accord with section 7 apply to the code;
keep intact all notices of the absence of any warranty; and give all
recipients a copy of this License along with the Program.
You may charge any price or no price for each copy that you convey,
and you may offer support or warranty protection for a fee.
5. Conveying Modified Source Versions.
You may convey a work based on the Program, or the modifications to
produce it from the Program, in the form of source code under the
terms of section 4, provided that you also meet all of these conditions:
a) The work must carry prominent notices stating that you modified
it, and giving a relevant date.
b) The work must carry prominent notices stating that it is
released under this License and any conditions added under section
7. This requirement modifies the requirement in section 4 to
"keep intact all notices".
c) You must license the entire work, as a whole, under this
License to anyone who comes into possession of a copy. This
License will therefore apply, along with any applicable section 7
additional terms, to the whole of the work, and all its parts,
regardless of how they are packaged. This License gives no
permission to license the work in any other way, but it does not
invalidate such permission if you have separately received it.
d) If the work has interactive user interfaces, each must display
Appropriate Legal Notices; however, if the Program has interactive
interfaces that do not display Appropriate Legal Notices, your
work need not make them do so.
A compilation of a covered work with other separate and independent
works, which are not by their nature extensions of the covered work,
and which are not combined with it such as to form a larger program,
in or on a volume of a storage or distribution medium, is called an
"aggregate" if the compilation and its resulting copyright are not
used to limit the access or legal rights of the compilation's users
beyond what the individual works permit. Inclusion of a covered work
in an aggregate does not cause this License to apply to the other
parts of the aggregate.
6. Conveying Non-Source Forms.
You may convey a covered work in object code form under the terms
of sections 4 and 5, provided that you also convey the
machine-readable Corresponding Source under the terms of this License,
in one of these ways:
a) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by the
Corresponding Source fixed on a durable physical medium
customarily used for software interchange.
b) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by a
written offer, valid for at least three years and valid for as
long as you offer spare parts or customer support for that product
model, to give anyone who possesses the object code either (1) a
copy of the Corresponding Source for all the software in the
product that is covered by this License, on a durable physical
medium customarily used for software interchange, for a price no
more than your reasonable cost of physically performing this
conveying of source, or (2) access to copy the
Corresponding Source from a network server at no charge.
c) Convey individual copies of the object code with a copy of the
written offer to provide the Corresponding Source. This
alternative is allowed only occasionally and noncommercially, and
only if you received the object code with such an offer, in accord
with subsection 6b.
d) Convey the object code by offering access from a designated
place (gratis or for a charge), and offer equivalent access to the
Corresponding Source in the same way through the same place at no
further charge. You need not require recipients to copy the
Corresponding Source along with the object code. If the place to
copy the object code is a network server, the Corresponding Source
may be on a different server (operated by you or a third party)
that supports equivalent copying facilities, provided you maintain
clear directions next to the object code saying where to find the
Corresponding Source. Regardless of what server hosts the
Corresponding Source, you remain obligated to ensure that it is
available for as long as needed to satisfy these requirements.
e) Convey the object code using peer-to-peer transmission, provided
you inform other peers where the object code and Corresponding
Source of the work are being offered to the general public at no
charge under subsection 6d.
A separable portion of the object code, whose source code is excluded
from the Corresponding Source as a System Library, need not be
included in conveying the object code work.
A "User Product" is either (1) a "consumer product", which means any
tangible personal property which is normally used for personal, family,
or household purposes, or (2) anything designed or sold for incorporation
into a dwelling. In determining whether a product is a consumer product,
doubtful cases shall be resolved in favor of coverage. For a particular
product received by a particular user, "normally used" refers to a
typical or common use of that class of product, regardless of the status
of the particular user or of the way in which the particular user
actually uses, or expects or is expected to use, the product. A product
is a consumer product regardless of whether the product has substantial
commercial, industrial or non-consumer uses, unless such uses represent
the only significant mode of use of the product.
"Installation Information" for a User Product means any methods,
procedures, authorization keys, or other information required to install
and execute modified versions of a covered work in that User Product from
a modified version of its Corresponding Source. The information must
suffice to ensure that the continued functioning of the modified object
code is in no case prevented or interfered with solely because
modification has been made.
If you convey an object code work under this section in, or with, or
specifically for use in, a User Product, and the conveying occurs as
part of a transaction in which the right of possession and use of the
User Product is transferred to the recipient in perpetuity or for a
fixed term (regardless of how the transaction is characterized), the
Corresponding Source conveyed under this section must be accompanied
by the Installation Information. But this requirement does not apply
if neither you nor any third party retains the ability to install
modified object code on the User Product (for example, the work has
been installed in ROM).
The requirement to provide Installation Information does not include a
requirement to continue to provide support service, warranty, or updates
for a work that has been modified or installed by the recipient, or for
the User Product in which it has been modified or installed. Access to a
network may be denied when the modification itself materially and
adversely affects the operation of the network or violates the rules and
protocols for communication across the network.
Corresponding Source conveyed, and Installation Information provided,
in accord with this section must be in a format that is publicly
documented (and with an implementation available to the public in
source code form), and must require no special password or key for
unpacking, reading or copying.
7. Additional Terms.
"Additional permissions" are terms that supplement the terms of this
License by making exceptions from one or more of its conditions.
Additional permissions that are applicable to the entire Program shall
be treated as though they were included in this License, to the extent
that they are valid under applicable law. If additional permissions
apply only to part of the Program, that part may be used separately
under those permissions, but the entire Program remains governed by
this License without regard to the additional permissions.
When you convey a copy of a covered work, you may at your option
remove any additional permissions from that copy, or from any part of
it. (Additional permissions may be written to require their own
removal in certain cases when you modify the work.) You may place
additional permissions on material, added by you to a covered work,
for which you have or can give appropriate copyright permission.
Notwithstanding any other provision of this License, for material you
add to a covered work, you may (if authorized by the copyright holders of
that material) supplement the terms of this License with terms:
a) Disclaiming warranty or limiting liability differently from the
terms of sections 15 and 16 of this License; or
b) Requiring preservation of specified reasonable legal notices or
author attributions in that material or in the Appropriate Legal
Notices displayed by works containing it; or
c) Prohibiting misrepresentation of the origin of that material, or
requiring that modified versions of such material be marked in
reasonable ways as different from the original version; or
d) Limiting the use for publicity purposes of names of licensors or
authors of the material; or
e) Declining to grant rights under trademark law for use of some
trade names, trademarks, or service marks; or
f) Requiring indemnification of licensors and authors of that
material by anyone who conveys the material (or modified versions of
it) with contractual assumptions of liability to the recipient, for
any liability that these contractual assumptions directly impose on
those licensors and authors.
All other non-permissive additional terms are considered "further
restrictions" within the meaning of section 10. If the Program as you
received it, or any part of it, contains a notice stating that it is
governed by this License along with a term that is a further
restriction, you may remove that term. If a license document contains
a further restriction but permits relicensing or conveying under this
License, you may add to a covered work material governed by the terms
of that license document, provided that the further restriction does
not survive such relicensing or conveying.
If you add terms to a covered work in accord with this section, you
must place, in the relevant source files, a statement of the
additional terms that apply to those files, or a notice indicating
where to find the applicable terms.
Additional terms, permissive or non-permissive, may be stated in the
form of a separately written license, or stated as exceptions;
the above requirements apply either way.
8. Termination.
You may not propagate or modify a covered work except as expressly
provided under this License. Any attempt otherwise to propagate or
modify it is void, and will automatically terminate your rights under
this License (including any patent licenses granted under the third
paragraph of section 11).
However, if you cease all violation of this License, then your
license from a particular copyright holder is reinstated (a)
provisionally, unless and until the copyright holder explicitly and
finally terminates your license, and (b) permanently, if the copyright
holder fails to notify you of the violation by some reasonable means
prior to 60 days after the cessation.
Moreover, your license from a particular copyright holder is
reinstated permanently if the copyright holder notifies you of the
violation by some reasonable means, this is the first time you have
received notice of violation of this License (for any work) from that
copyright holder, and you cure the violation prior to 30 days after
your receipt of the notice.
Termination of your rights under this section does not terminate the
licenses of parties who have received copies or rights from you under
this License. If your rights have been terminated and not permanently
reinstated, you do not qualify to receive new licenses for the same
material under section 10.
9. Acceptance Not Required for Having Copies.
You are not required to accept this License in order to receive or
run a copy of the Program. Ancillary propagation of a covered work
occurring solely as a consequence of using peer-to-peer transmission
to receive a copy likewise does not require acceptance. However,
nothing other than this License grants you permission to propagate or
modify any covered work. These actions infringe copyright if you do
not accept this License. Therefore, by modifying or propagating a
covered work, you indicate your acceptance of this License to do so.
10. Automatic Licensing of Downstream Recipients.
Each time you convey a covered work, the recipient automatically
receives a license from the original licensors, to run, modify and
propagate that work, subject to this License. You are not responsible
for enforcing compliance by third parties with this License.
An "entity transaction" is a transaction transferring control of an
organization, or substantially all assets of one, or subdividing an
organization, or merging organizations. If propagation of a covered
work results from an entity transaction, each party to that
transaction who receives a copy of the work also receives whatever
licenses to the work the party's predecessor in interest had or could
give under the previous paragraph, plus a right to possession of the
Corresponding Source of the work from the predecessor in interest, if
the predecessor has it or can get it with reasonable efforts.
You may not impose any further restrictions on the exercise of the
rights granted or affirmed under this License. For example, you may
not impose a license fee, royalty, or other charge for exercise of
rights granted under this License, and you may not initiate litigation
(including a cross-claim or counterclaim in a lawsuit) alleging that
any patent claim is infringed by making, using, selling, offering for
sale, or importing the Program or any portion of it.
11. Patents.
A "contributor" is a copyright holder who authorizes use under this
License of the Program or a work on which the Program is based. The
work thus licensed is called the contributor's "contributor version".
A contributor's "essential patent claims" are all patent claims
owned or controlled by the contributor, whether already acquired or
hereafter acquired, that would be infringed by some manner, permitted
by this License, of making, using, or selling its contributor version,
but do not include claims that would be infringed only as a
consequence of further modification of the contributor version. For
purposes of this definition, "control" includes the right to grant
patent sublicenses in a manner consistent with the requirements of
this License.
Each contributor grants you a non-exclusive, worldwide, royalty-free
patent license under the contributor's essential patent claims, to
make, use, sell, offer for sale, import and otherwise run, modify and
propagate the contents of its contributor version.
In the following three paragraphs, a "patent license" is any express
agreement or commitment, however denominated, not to enforce a patent
(such as an express permission to practice a patent or covenant not to
sue for patent infringement). To "grant" such a patent license to a
party means to make such an agreement or commitment not to enforce a
patent against the party.
If you convey a covered work, knowingly relying on a patent license,
and the Corresponding Source of the work is not available for anyone
to copy, free of charge and under the terms of this License, through a
publicly available network server or other readily accessible means,
then you must either (1) cause the Corresponding Source to be so
available, or (2) arrange to deprive yourself of the benefit of the
patent license for this particular work, or (3) arrange, in a manner
consistent with the requirements of this License, to extend the patent
license to downstream recipients. "Knowingly relying" means you have
actual knowledge that, but for the patent license, your conveying the
covered work in a country, or your recipient's use of the covered work
in a country, would infringe one or more identifiable patents in that
country that you have reason to believe are valid.
If, pursuant to or in connection with a single transaction or
arrangement, you convey, or propagate by procuring conveyance of, a
covered work, and grant a patent license to some of the parties
receiving the covered work authorizing them to use, propagate, modify
or convey a specific copy of the covered work, then the patent license
you grant is automatically extended to all recipients of the covered
work and works based on it.
A patent license is "discriminatory" if it does not include within
the scope of its coverage, prohibits the exercise of, or is
conditioned on the non-exercise of one or more of the rights that are
specifically granted under this License. You may not convey a covered
work if you are a party to an arrangement with a third party that is
in the business of distributing software, under which you make payment
to the third party based on the extent of your activity of conveying
the work, and under which the third party grants, to any of the
parties who would receive the covered work from you, a discriminatory
patent license (a) in connection with copies of the covered work
conveyed by you (or copies made from those copies), or (b) primarily
for and in connection with specific products or compilations that
contain the covered work, unless you entered into that arrangement,
or that patent license was granted, prior to 28 March 2007.
Nothing in this License shall be construed as excluding or limiting
any implied license or other defenses to infringement that may
otherwise be available to you under applicable patent law.
12. No Surrender of Others' Freedom.
If conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot convey a
covered work so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you may
not convey it at all. For example, if you agree to terms that obligate you
to collect a royalty for further conveying from those to whom you convey
the Program, the only way you could satisfy both those terms and this
License would be to refrain entirely from conveying the Program.
13. Use with the GNU Affero General Public License.
Notwithstanding any other provision of this License, you have
permission to link or combine any covered work with a work licensed
under version 3 of the GNU Affero General Public License into a single
combined work, and to convey the resulting work. The terms of this
License will continue to apply to the part which is the covered work,
but the special requirements of the GNU Affero General Public License,
section 13, concerning interaction through a network will apply to the
combination as such.
14. Revised Versions of this License.
The Free Software Foundation may publish revised and/or new versions of
the GNU General Public License from time to time. Such new versions will
be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.
Each version is given a distinguishing version number. If the
Program specifies that a certain numbered version of the GNU General
Public License "or any later version" applies to it, you have the
option of following the terms and conditions either of that numbered
version or of any later version published by the Free Software
Foundation. If the Program does not specify a version number of the
GNU General Public License, you may choose any version ever published
by the Free Software Foundation.
If the Program specifies that a proxy can decide which future
versions of the GNU General Public License can be used, that proxy's
public statement of acceptance of a version permanently authorizes you
to choose that version for the Program.
Later license versions may give you additional or different
permissions. However, no additional obligations are imposed on any
author or copyright holder as a result of your choosing to follow a
later version.
15. Disclaimer of Warranty.
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
16. Limitation of Liability.
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
SUCH DAMAGES.
17. Interpretation of Sections 15 and 16.
If the disclaimer of warranty and limitation of liability provided
above cannot be given local legal effect according to their terms,
reviewing courts shall apply local law that most closely approximates
an absolute waiver of all civil liability in connection with the
Program, unless a warranty or assumption of liability accompanies a
copy of the Program in return for a fee.
END OF TERMS AND CONDITIONS
How to Apply These Terms to Your New Programs
If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.
To do so, attach the following notices to the program. It is safest
to attach them to the start of each source file to most effectively
state the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.
<one line to give the program's name and a brief idea of what it does.>
Copyright (C) <year> <name of author>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Also add information on how to contact you by electronic and paper mail.
If the program does terminal interaction, make it output a short
notice like this when it starts in an interactive mode:
<program> Copyright (C) <year> <name of author>
This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
This is free software, and you are welcome to redistribute it
under certain conditions; type `show c' for details.
The hypothetical commands `show w' and `show c' should show the appropriate
parts of the General Public License. Of course, your program's commands
might be different; for a GUI interface, you would use an "about box".
You should also get your employer (if you work as a programmer) or school,
if any, to sign a "copyright disclaimer" for the program, if necessary.
For more information on this, and how to apply and follow the GNU GPL, see
<http://www.gnu.org/licenses/>.
The GNU General Public License does not permit incorporating your program
into proprietary programs. If your program is a subroutine library, you
may consider it more useful to permit linking proprietary applications with
the library. If this is what you want to do, use the GNU Lesser General
Public License instead of this License. But first, please read
<http://www.gnu.org/philosophy/why-not-lgpl.html>.

View File

@ -0,0 +1,139 @@
`combine-ptables.pl`: fill-up and other techniques of translation models combination.
Author:
Arianna Bisazza bisazza[AT]fbk.eu
ABOUT
-----
This tool implements "fill-up" and other operations that are useful to combine translation and reordering tables.
In the "fill-up" approach, the weights of out-domain data sources are estimated directly by MERT along with the
other model weights.
This tool also supports linear interpolation, but weights must be provided by the user.
If you want to automatically estimate linear interpolation weights, use `contrib/tmcombine` instead.
REFERENCE
---------
When using this script, please cite:
Arianna Bisazza, Nick Ruiz, and Marcello Federico. 2011.
"Fill-up versus Interpolation Methods for Phrase-based SMT Adaptation."
In International Workshop on Spoken Language Translation (IWSLT), San Francisco, CA.
FILL-UP
-------
This combination technique is useful when the relevance of the models is known a priori,
e.g. when one is trained on in-domain data and the others on out-of-domain data.
This mode preserves all the entries and scores coming from the first model, and adds
entries from the other models only if new.
If more than two tables are provided, each entry is taken only from the first table
that contains it.
Moreover, a binary feature is added for each additional table to denote the provenance
of an entry. For in-domain entries, the binary features are all set to 1 (=exp(0)).
Entries coming from the 2nd table will have the 1st binary feature set to 2.718 (=exp(1)).
This technique was proposed in the following works:
Preslav Nakov. 2008.
"Improving English-Spanish Statistical Machine Translation: Experiments in Domain
Adaptation, Sentence Paraphrasing, Tokenization, and Recasing."
In Workshop on Statistical Machine Translation.
Arianna Bisazza, Nick Ruiz, and Marcello Federico. 2011.
"Fill-up versus Interpolation Methods for Phrase-based SMT Adaptation."
In International Workshop on Spoken Language Translation (IWSLT), San Francisco, CA.
The latter paper contains details about the present implementation as well as an empirical
evaluation of fill-up against other combination techniques.
Reordering model fill-up, cascaded fill-up and pruning criteria are also discussed in the
same paper.
Among the findings of this paper, pruning new (out-of-domain) phrases with more than 4
source words appeared to be beneficial on the Arabic-English TED task when combining the
in-domain models with MultiUn models.
This corresponds to the option:
`--newSourceMaxLength=4`
LINEAR INTERPOLATION
--------------------
This combination technique consists in linearly combining the feature values coming
from all tables. The combination weights should be provided by the user, otherwise
uniform weights are assumed.
When a phrase pair is absent from a table, a constant value (epsilon) is assumed for
the corresponding feature values. You may want to set your own epsilon.
See [Bisazza et al. 2011] for an empirical comparison of uniformly weighted linear
interpolation against fill-up and decoding-time log-linear interpolation. In that paper,
epsilon was always set to 1e-06.
UNION
-----
This combination technique creates the union of all phrase pairs and assigns to each
of them the concatenation of all tables scores.
INTERSECTION
------------
This combination technique creates the intersection of all phrase pairs: each phrase
pair that occurs in all phrase tables is output along with the feature vector taken
from the *first* table.
The intersection can be used to prune the reordering table in order to match the
entries of a corresponding pruned phrase table.
USAGE
-----
Get statistics about overlap of entries:
`combine-ptables.pl --mode=stats ptable1 ptable2 ... ptableN > ptables-overlap-stats`
Interpolate phrase tables...
- with uniform weights:
`combine-ptables.pl --mode=interp --phpenalty-at=4 ptable1 ptable2 ptable3 > interp-ptable.X`
- with custom weights:
`combine-ptables.pl --mode=interp --phpenalty-at=4 --weights=0.8,0.1,0.1 ptable1 ptable2 ptable3 > interp-ptable.Y`
- with custom epsilon:
`combine-ptables.pl --mode=interp --phpenalty-at=4 --epsilon=1e-05 ptable1 ptable2 ptable3 > interp-ptable.Z`
Fillup phrase tables...
- unpruned:
`combine-ptables.pl --mode=fillup ptable1 ptable2 ... ptableN > fillup-ptable`
- pruned (new phrases only with max. 4 source words):
`combine-ptables.pl --mode=fillup --newSourceMaxLength=4 ptable1 ptable2 ... ptableN > fillup-ptable`
Given a pruned phrase table, prune the corresponding reordering table:
`combine-ptables.pl --mode=intersect1 reotable1-unpruned ptable1-pruned > reotable1-pruned`
NOTES
-----
The script works only with textual (non-binarized) phrase or reordering tables
that were *previously sorted* with `LC_ALL=C sort`.
The resulting combined tables are also textual and need to be binarized as usual.
The script combine-ptables.pl can be used on lexicalized reordering tables as well.
Input tables can be gzipped.
When integrating filled up models into a Moses system, remember to:
- specify the correct number of features (typically 6) under [ttable-file] in the configuration file `moses.ini`
- add a weight under [weight-t] in `moses.ini`
- if you binarize the models, provide the correct number of features to the command:
`$moses/bin/processPhraseTable -ttable 0 0 - -nscores $nbFeatures`

View File

@ -0,0 +1,425 @@
#! /usr/bin/perl
#******************************************************************************
# Arianna Bisazza @ FBK-irst. March 2012
#******************************************************************************
# combine-ptables.pl : Combine Moses-style phrase tables, using different approaches
use strict;
use open ':utf8';
binmode STDIN, ':utf8';
binmode STDOUT, ':utf8';
use Getopt::Long "GetOptions";
sub main {
my $usage = "
USAGE
-----
combine-ptables.pl --mode=(interp|union|fillup|intersect1|stats) ptable1 ptable2 ... ptableN > combined-ptable
combine-ptables.pl --mode=intersect1 reotable-unpruned ptable-pruned > reotable-pruned
-----
#
# This scripts reads two or more *sorted* phrase tables and combines them in different modes.
#
# (Note: if present, word alignments are ignored).
#
# ----------------
# OPTIONS
# ----------------
#
# Required:
# --mode fillup: Each entry is taken only from the first table that contains it.
# A binary feature is added from each table except the first.
# interp: Linear interpolation.
# union: Union of entries, feature vectors are concatenated.
# intersect1: Intersection of entries, feature vectors taken from the first table.
# stats: Only compute some statistics about tables overlap. No table is produced.
#
# NOTE: if present, additional fields such as word alignment, phrase counts etc. are always
# taken from the first table.
#
# Generic options:
# --phpenalty=FLOAT Constant value for phrase penalty. Default is exp(1)=2.718
# --phpenalty-at=N The (N+1)th score of each table is considered as phrase penalty with a constant value.
# In 'interp' mode, the corresponding feature is not interpolated but simply set to the constant.
# In 'union' mode, the ph.penalty (constant) is output only once, after all the other scores.
# By default, no score is considered as phrase penalty.
#
#
# Options for 'fillup':
# --newSourceMaxLength=INT Don't include \"new\" source phrases if longer than INT words.
#
# Options for 'interp':
# --weights=W1,W2,...WN Weights for interpolation. By default, uniform weights are applied.
# --epsilon=X Score to assume when a phrase pair is not contained in a table (in 'interp' and 'union' modes).
# Default epsilon is 1e-06.
#
# Options for 'union':
#
#
";
my $combination_mode = '';
my $debug = '';
my $weights_str = '';
my $epsilon = 0.000001;
my $phPenalty = 2.718; # exp(1)
my $phPenalty_idx = -1;
my $delim= " ||| ";
my $delim_RE= ' \|\|\| ';
my $exp_one = 2.718;
my $exp_zero = 1;
my $newSourceMaxLength = -1;
my $help = '';
GetOptions ('debug' => \$debug,
'mode=s' => \$combination_mode,
'weights=s' => \$weights_str,
'epsilon=f' => \$epsilon,
'phpenalty=f' => \$phPenalty,
'phpenalty-at=i' => \$phPenalty_idx,
'newSourceMaxLength=i' => \$newSourceMaxLength,
'help' => \$help);
if($help) { die "$usage\n\n"; }
if($combination_mode!~/(interp|union|fillup|intersect1|stats)/) {die "$usage\nUnknown combination mode!\n"};
if(@ARGV < 2) {die "$usage\n\n Please provide at least 2 tables to combine \n\n";}
print STDERR "
WARNING: Your phrase tables must be sorted (with LC_ALL=C) !!
******************************
Combination mode is [$combination_mode]
******************************
";
my @tables = @ARGV;
my $nbtables = scalar(@tables);
###########################################
# The newSourceMaxLength option requires reading all the first PT before starting the combination
my %sourcePhrasesPT1;
if($combination_mode eq "fillup" && $newSourceMaxLength>-1) {
my $table1=$tables[0];
$table1 =~ s/(.*\.gz)\s*$/gzip -dc < $1|/;
open(TABLE1, "$table1") or die "Cannot open $table1: ($!)\n";
while(my $line=<TABLE1>) {
$line=~m/^(.*?)$delim_RE/;
$sourcePhrasesPT1{$1}++;
}
close(TABLE1);
}
my @table_files=();
foreach my $table (@tables) {
$table =~ s/(.*\.gz)\s*$/gzip -dc < $1|/;
#localize the file glob, so FILE is unique to the inner loop.
local *FILE;
open(FILE, "$table") or die "Cannot open $table: ($!)\n";
push(@table_files, *FILE);
}
# Read first line from all tables to find number of weights (and sanity checks)
my @read_ppairs=();
my $nbscores = &read_line_from_tables(\@table_files, \@read_ppairs);
print STDERR "Each phrase table contains $nbscores features.\n";
###########################################
if($phPenalty_idx!=-1) {
if($phPenalty_idx<0 || $phPenalty_idx>=$nbscores) {
die "Invalid value for option phpenalty-at! Should be in the range [0,($nbscores-1)]\n\n";
}
else { print STDERR "Phrase penalty at index $phPenalty_idx\n"; }
}
#if($weights_str ne "") { die "Weights option NOT supported yet. Can only use uniform (1/nbscores)\n\n"; }
#my $unifw = 1/$nbtables;
my @weights=(); # Array of arrays each containing the feature weights for a phrase table
if($combination_mode eq "interp") {
my @table_level_weights=();
if($weights_str eq "") {
@table_level_weights= ((1/$nbtables) x $nbtables); # assuming uniform weights
}
else {
@table_level_weights= split(/,/, $weights_str);
if(scalar(@table_level_weights) != $nbtables) {
die "$usage\n Invalid string for option --weights! Must be a comma-separated list of floats, one per ph.table.\n";
}
}
for(my $i=0; $i<$nbtables; $i++) {
my @weights_pt = (($table_level_weights[$i]) x $nbscores);
if($phPenalty_idx!=-1) {
$weights_pt[$phPenalty_idx]=0;
}
print STDERR "WEIGHTS-PT_$i: ", join(" -- ", @weights_pt), "\n";
$weights[$i] = \@weights_pt;
}
print STDERR "EPSILON: $epsilon \n";
}
###########################################
my @empty_ppair=("");
my @epsilons = (($epsilon) x $nbscores);
if($phPenalty_idx>-1) {
pop @epsilons;
}
my $nbPpairs_inAll=0;
my @nbPairs_found_only_in=((0) x $nbtables);
my $MINSCORE=1;
print STDERR "Working...\n\n";
while(1) {
my $min_ppair="";
my $reached_end_of_tables=1;
my @tablesContainingPpair=((0) x $nbtables);
for(my $i=0; $i<$nbtables; $i++) {
my $ppair=$read_ppairs[$i]->[0];
if($ppair ne "") {
$reached_end_of_tables=0;
if($min_ppair eq "" || $ppair lt $min_ppair) {
$min_ppair=$ppair;
@tablesContainingPpair=((0) x $nbtables);
$tablesContainingPpair[$i]=1;
}
elsif($ppair eq $min_ppair) {
$tablesContainingPpair[$i]=1;
}
}
}
last if($reached_end_of_tables);
## Actual combination is performed here:
&combine_ppair(\@read_ppairs, \@tablesContainingPpair);
&read_line_from_tables(\@table_files, \@read_ppairs, \@tablesContainingPpair);
}
print STDERR "...done!\n";
print STDERR "The minimum score in all tables is $MINSCORE\n";
if($combination_mode eq "stats") {
my $tot_ppairs=0;
print "
# entries
found in all tables: $nbPpairs_inAll\n";
for(my $i=0; $i<$nbtables; $i++) {
print "found only in PT_$i: $nbPairs_found_only_in[$i]\n";
}
}
####################################
# Combine the entry for the current (lexicographically minimal) phrase
# pair across all tables and print the resulting line to STDOUT,
# according to the closed-over $combination_mode.
# NOTE(review): the parenthesized names below are not a real Perl
# prototype (perl emits "Illegal character in prototype" warnings); the
# sub is always invoked with &, which bypasses prototypes anyway.
sub combine_ppair(PPAIRS_REFARRAY, TABLE_INDICES_REFARRAY) {
    my $ra_ppairs=shift; # per-table entries: [key(string), \@scores, additional_info(string, may be empty)]
    my $ra_toRead=shift; # flags: which tables contain the ph.pair currently processed
    my $ppair="";
    my @scores=();
    my $additional_info="";
    my $to_print=1;
    if($debug) {
        print STDERR "combine_ppair:\n";
        for(my $i=0; $i<$nbtables; $i++) {
            if($ra_toRead->[$i]) {
                print STDERR "ppair_$i= ", join (" // ", @{$ra_ppairs->[$i]}), "\n";
            }
        }
    }
    ### Statistics only: count how many tables share the entry, print nothing
    if($combination_mode eq "stats") {
        $to_print=0;
        my $found_in=-1;
        my $nb_found=0;
        for(my $i=0; $i<$nbtables; $i++) {
            if($ra_toRead->[$i]) {
                $found_in=$i;
                $nb_found++;
            }
        }
        if($nb_found==1) { $nbPairs_found_only_in[$found_in]++; }
        elsif($nb_found==$nbtables) { $nbPpairs_inAll++; }
    }
    ### Fill-up + additional binary feature
    elsif($combination_mode eq "fillup") {
        # one provenance feature per non-first table, default exp(0)=1
        my @bin_feats=(($exp_zero) x ($nbtables-1));
        for(my $i=0; $i<$nbtables; $i++) {
            if($ra_toRead->[$i]) {
                $ppair= shift(@{$ra_ppairs->[$i]});
                # pruning criteria are applied here: drop "new" (out-of-domain)
                # source phrases longer than $newSourceMaxLength words
                if($i>0 && $newSourceMaxLength>-1) {
                    $ppair=~m/^(.*?)$delim_RE/;
                    if(scalar(split(/ +/, $1)) > $newSourceMaxLength &&
                       !defined($sourcePhrasesPT1{$1}))
                    { $to_print=0; }
                }
                # @scores= @{$ra_ppairs->[$i]};
                @scores = @{shift(@{$ra_ppairs->[$i]})};
                # binary feature for ph.pair provenance fires here
                if($i>0) { $bin_feats[$i-1]=$exp_one; }
                $additional_info=shift(@{$ra_ppairs->[$i]});
                last;   # first table containing the pair wins
            }
        }
        push(@scores, @bin_feats);
    }
    ### Linear interpolation (epsilon assumed for tables missing the entry)
    elsif($combination_mode eq "interp") {
        my $firstPpair=-1;
        @scores=((0) x $nbscores);
        for(my $i=0; $i<$nbtables; $i++) {
            if($ra_toRead->[$i]) {
                if($firstPpair==-1) { $firstPpair=$i; }
                $ppair= shift(@{$ra_ppairs->[$i]});
                my @scoresPT = @{shift(@{$ra_ppairs->[$i]})};
                for(my $j=0; $j<$nbscores; $j++) {
                    # $scores[$j]+= $weights[$i]->[$j]* $ra_ppairs->[$i][$j];
                    $scores[$j]+= $weights[$i]->[$j]* $scoresPT[$j];
                }
            }
            else {
                for(my $j=0; $j<$nbscores; $j++) {
                    $scores[$j]+= $weights[$i]->[$j]* $epsilon;
                }
            }
            # phrase penalty is a constant, not interpolated (its weight is 0
            # above, so overwriting inside the loop is harmless)
            if($phPenalty_idx!=-1) {
                $scores[$phPenalty_idx]= $phPenalty;
            }
        }
        if($debug) { print STDERR "..taking info from ptable_$firstPpair\n"; }
        $additional_info= shift(@{$ra_ppairs->[$firstPpair]});
    }
    ### Union + feature concatenation
    elsif($combination_mode eq "union") {
        my $firstPpair=-1;
        for(my $i=0; $i<$nbtables; $i++) {
            if($ra_toRead->[$i]) {
                if($firstPpair==-1) { $firstPpair=$i; }
                $ppair= shift(@{$ra_ppairs->[$i]});
                my @scoresPT= @{shift(@{$ra_ppairs->[$i]})};
                # the constant phrase penalty is removed from each table...
                if($phPenalty_idx!=-1) {
                    # splice(@{$ra_ppairs->[$i]}, $phPenalty_idx, 1);
                    splice(@scoresPT, $phPenalty_idx, 1);
                }
                # push(@scores, @{$ra_ppairs->[$i]});
                push(@scores, @scoresPT);
            }
            else {
                push(@scores, @epsilons);
            }
        }
        # ...and appended exactly once at the end
        if($phPenalty_idx!=-1) {
            push(@scores, $phPenalty);
        }
        if($debug) { print STDERR "..taking info from ptable_$firstPpair\n"; }
        $additional_info= shift(@{$ra_ppairs->[$firstPpair]});
    }
    ### Intersect + features from first table
    elsif($combination_mode eq "intersect1") {
        $to_print=0;
        my $found_in_all=1;
        for(my $i=0; $i<$nbtables; $i++) {
            if(!$ra_toRead->[$i]) {
                $found_in_all=0;
                last;
            }
        }
        if($found_in_all) {
            $to_print=1;
            $ppair= shift(@{$ra_ppairs->[0]});
            # @scores= @{$ra_ppairs->[0]};
            @scores= @{shift(@{$ra_ppairs->[0]})};
            $additional_info= shift(@{$ra_ppairs->[0]});
        }
    }
    else {
        die "$usage\nUnknown combination mode!\n";
    }
    # $ppair already ends with the field delimiter, so no extra separator
    # is needed before the scores
    if($to_print) {
        if($additional_info eq "") {
            print $ppair, join(" ", @scores), "\n";
        }else {
            print $ppair, join(" ", @scores), $delim, $additional_info, "\n";
        }
    }
}
####################################
# Read one line from every filehandle in @$ra_files (or only from the
# tables flagged with 1 in @$ra_toRead, when that third argument is
# given).  Each line is parsed as a phrase-table entry and stored at the
# corresponding position of @$ra_ppairs as [key, \@scores, extra_info];
# an exhausted table gets [""].  Also tracks the global $MINSCORE.
# Returns the number of scores per entry (-1 if no line was read).
sub read_line_from_tables {
    my $ra_files=shift;
    my $ra_ppairs=shift;
    my $ra_toRead=shift;
    my @toRead=((1) x $nbtables); # by default read from all files
    # guard against the two-argument call (third arg undef)
    if(defined($ra_toRead) && $ra_toRead ne "") {
        @toRead=@$ra_toRead;
    }
    my $nbscores=-1;
    for(my $i=0; $i<$nbtables; $i++) {
        next if($toRead[$i]==0);
        my @ppair=();
        my $file=$ra_files->[$i];
        if(my $line = <$file>) {
            chomp $line;
            my @fields = split(/$delim_RE/, $line);
            if(scalar(@fields)<3) {
                die "Invalid phrase table entry:\n$line\n";
            }
            my @scores = split(/\s+/, $fields[2]);
            foreach my $score (@scores) {
                if($score<$MINSCORE) { $MINSCORE=$score; }
            }
            # Get nb of scores from the 1st table. Check that all tables provide the same nb of scores,
            # unless mode is 'intersect' (then it doesn't matter as scores are taken only from 1st table)
            if($nbscores==-1) {
                $nbscores=scalar(@scores);
            } elsif($nbscores!=scalar(@scores) && $combination_mode ne "intersect1") {
                die "Wrong number of scores in table-$i! Should be $nbscores\n";
            }
            # Get additional fields if any (word alignment, phrase counts etc.).
            # BUGFIX: declared per table; previously a table without extra
            # fields silently inherited the previous table's info.
            my $additional_info="";
            if(scalar(@fields)>3) {
                $additional_info=join($delim, splice(@fields,3));
                #print STDOUT "additional_info:__{$additional_info}__\n";
            }
            my $key = "$fields[0]$delim$fields[1]$delim"; ## IMPORTANT: the | delimiter at the end of the phrase pair is crucial to preserve sorting!!
            push(@ppair, $key, \@scores, $additional_info);
        }
        else {
            push(@ppair, "");
        }
        $ra_ppairs->[$i]=\@ppair;
    }
    return $nbscores;
}
#########
}
&main;

0
contrib/eppex/configure vendored Normal file → Executable file
View File

0
contrib/eppex/depcomp Normal file → Executable file
View File

0
contrib/eppex/install-sh Normal file → Executable file
View File

0
contrib/eppex/missing Normal file → Executable file
View File

View File

@ -0,0 +1,16 @@
all: suffix-test fuzzy-match fuzzy-match2

clean:
	rm -f *.o

# implicit rule: compile any .cpp next to its .o
.cpp.o:
	g++ -O6 -g -c $<

suffix-test: Vocabulary.o SuffixArray.o suffix-test.o
	g++ Vocabulary.o SuffixArray.o suffix-test.o -o suffix-test

# BUGFIX: the link line referenced fuzzy-match.o while the prerequisite
# was old/fuzzy-match.o; use the prerequisite's path in both places so
# make actually builds and links the same object.
fuzzy-match: Vocabulary.o SuffixArray.o old/fuzzy-match.o
	g++ Vocabulary.o SuffixArray.o old/fuzzy-match.o -o fuzzy-match

fuzzy-match2: Vocabulary.o SuffixArray.o fuzzy-match2.o Util.o
	g++ Vocabulary.o SuffixArray.o fuzzy-match2.o Util.o -o fuzzy-match2

View File

@ -0,0 +1,29 @@
//
// Match.h
// fuzzy-match
//
// Created by Hieu Hoang on 25/07/2012.
// Copyright 2012 __MyCompanyName__. All rights reserved.
//
#ifndef fuzzy_match_Match_h
#define fuzzy_match_Match_h
/* data structure for n-gram match between input and corpus */
/* One n-gram correspondence between the input sentence and a translation
 * memory sentence, together with the cost bookkeeping carried by the
 * fuzzy matcher (exact cost semantics are defined by the caller). */
class Match {
public:
  int input_start;   // first covered input position
  int input_end;     // last covered input position
  int tm_start;      // first covered TM position
  int tm_end;        // last covered TM position
  int min_cost;      // lower cost bound for this match
  int max_cost;      // upper cost bound for this match
  int internal_cost; // cost attributed to the match itself

  // Plain aggregate-style constructor: stores every field verbatim.
  Match( int is, int ie, int ts, int te, int min, int max, int i )
    : input_start(is)
    , input_end(ie)
    , tm_start(ts)
    , tm_end(te)
    , min_cost(min)
    , max_cost(max)
    , internal_cost(i)
  {}
};
#endif

View File

@ -0,0 +1,48 @@
//
// SentenceAlignment.h
// fuzzy-match
//
// Created by Hieu Hoang on 25/07/2012.
// Copyright 2012 __MyCompanyName__. All rights reserved.
//
#ifndef fuzzy_match_SentenceAlignment_h
#define fuzzy_match_SentenceAlignment_h
#include <sstream>
#include "Vocabulary.h"
extern Vocabulary vocabulary;
// One target-side record of the aligned corpus: an occurrence count,
// the target word ids, and source-target word alignment pairs.
struct SentenceAlignment
{
  int count;
  vector< WORD_ID > target;
  vector< pair<int,int> > alignment;

  SentenceAlignment()
  {}

  // Render the target ids as surface words via the global vocabulary,
  // each word followed by a single space (trailing space included).
  string getTargetString() const
  {
    stringstream out;
    for (size_t pos = 0; pos < target.size(); ++pos) {
      out << vocabulary.GetWord(target[pos]) << " ";
    }
    return out.str();
  }

  // Render the alignment as "src-tgt " tokens (trailing space included).
  string getAlignmentString() const
  {
    stringstream out;
    for (size_t pos = 0; pos < alignment.size(); ++pos) {
      out << alignment[pos].first << "-" << alignment[pos].second << " ";
    }
    return out.str();
  }
};
#endif

View File

@ -0,0 +1,244 @@
#include "SuffixArray.h"
#include <string>
#include <stdlib.h>
#include <cstring>
using namespace std;
// Build a suffix array over the tokenized corpus in fileName.  The file
// is read twice: a first pass sizes the buffers, a second pass fills
// them; every sentence is terminated by an "<s>" sentinel word.
// Finally all corpus positions are sorted lexicographically into m_index.
SuffixArray::SuffixArray( string fileName )
{
	m_vcb.StoreIfNew( "<uNk>" );
	m_endOfSentence = m_vcb.StoreIfNew( "<s>" );

	ifstream extractFile;
	char line[LINE_MAX_LENGTH];

	// count the number of words first (incl. one sentinel per sentence)
	extractFile.open(fileName.c_str());
	istream *fileP = &extractFile;
	m_size = 0;
	size_t sentenceCount = 0;
	while(!fileP->eof()) {
		SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
		if (fileP->eof()) break;
		vector< WORD_ID > words = m_vcb.Tokenize( line );
		m_size += words.size() + 1;
		sentenceCount++;
	}
	extractFile.close();
	cerr << m_size << " words (incl. sentence boundaries)" << endl;

	// allocate memory (all freed in the destructor; m_buffer below)
	m_array = (WORD_ID*) calloc( sizeof( WORD_ID ), m_size );
	m_index = (INDEX*) calloc( sizeof( INDEX ), m_size );
	m_wordInSentence = (char*) calloc( sizeof( char ), m_size );
	m_sentence = (size_t*) calloc( sizeof( size_t ), m_size );
	m_sentenceLength = (char*) calloc( sizeof( char ), sentenceCount );

	// second pass: fill the corpus array and the per-position metadata
	int wordIndex = 0;
	int sentenceId = 0;
	extractFile.open(fileName.c_str());
	fileP = &extractFile;
	while(!fileP->eof()) {
		SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
		if (fileP->eof()) break;
		vector< WORD_ID > words = m_vcb.Tokenize( line );
		vector< WORD_ID >::const_iterator i;

		for( i=words.begin(); i!=words.end(); i++)
		{
			m_index[ wordIndex ] = wordIndex;
			m_sentence[ wordIndex ] = sentenceId;
			m_wordInSentence[ wordIndex ] = i-words.begin();
			m_array[ wordIndex++ ] = *i;
		}
		// append the end-of-sentence sentinel
		m_index[ wordIndex ] = wordIndex;
		m_array[ wordIndex++ ] = m_endOfSentence;
		m_sentenceLength[ sentenceId++ ] = words.size();
	}
	extractFile.close();
	cerr << "done reading " << wordIndex << " words, " << sentenceId << " sentences." << endl;
	// List(0,9);

	// sort (scratch buffer only needed during construction)
	m_buffer = (INDEX*) calloc( sizeof( INDEX ), m_size );
	Sort( 0, m_size-1 );
	free( m_buffer );
	cerr << "done sorting" << endl;
}
// Sort m_index[start..end] by suffix order.  Despite the original
// "good ol' quick sort" label, this is a top-down merge sort: recurse
// on both halves, then merge them through the m_buffer scratch array.
void SuffixArray::Sort(INDEX start, INDEX end) {
	if (start == end) return;
	INDEX mid = (start+end+1)/2;
	Sort( start, mid-1 );
	Sort( mid, end );

	// merge the two sorted halves into m_buffer
	int i = start;
	int j = mid;
	int k = 0;
	int length = end-start+1;
	while( k<length )
	{
		if (i == mid )
		{
			m_buffer[ k++ ] = m_index[ j++ ];
		}
		else if (j > end )
		{
			m_buffer[ k++ ] = m_index[ i++ ];
		}
		else {
			if (CompareIndex( m_index[i], m_index[j] ) < 0)
			{
				m_buffer[ k++ ] = m_index[ i++ ];
			}
			else
			{
				m_buffer[ k++ ] = m_index[ j++ ];
			}
		}
	}
	// copy the merged range back into m_index
	memcpy( ((char*)m_index) + sizeof( INDEX ) * start,
		((char*)m_buffer), sizeof( INDEX ) * (end-start+1) );
}
// Release every buffer allocated in the constructor.  BUGFIX: the
// original only freed m_index and m_array, leaking m_wordInSentence,
// m_sentence and m_sentenceLength (m_buffer is already freed in the
// constructor right after sorting).
SuffixArray::~SuffixArray()
{
	free(m_index);
	free(m_array);
	free(m_wordInSentence);
	free(m_sentence);
	free(m_sentenceLength);
}
// Lexicographically compare the corpus suffixes starting at positions
// a and b; returns <0, 0-free negative/positive like strcmp.  A suffix
// that runs off the end of the corpus compares as smaller.
int SuffixArray::CompareIndex( INDEX a, INDEX b ) const
{
	// advance past the common prefix, comparing word by word
	for( INDEX k = 0; ; k++ )
	{
		if( a+k == m_size ) return -1;   // suffix a exhausted first
		if( b+k == m_size ) return 1;    // suffix b exhausted first
		if( m_array[ a+k ] != m_array[ b+k ] )
			return CompareWord( m_array[ a+k ], m_array[ b+k ] );
	}
}
// Order two vocabulary ids by the surface strings they denote
// (string::compare semantics: negative / zero / positive).
inline int SuffixArray::CompareWord( WORD_ID a, WORD_ID b ) const
{
	const WORD &surfaceA = m_vcb.GetWord(a);
	const WORD &surfaceB = m_vcb.GetWord(b);
	return surfaceA.compare( surfaceB );
}
// Total number of occurrences of phrase in the corpus.
int SuffixArray::Count( const vector< WORD > &phrase )
{
	INDEX dummy;
	return LimitedCount( phrase, m_size, dummy, dummy, 0, m_size-1 );
}
// True iff phrase occurs at least `min` times (early-exit count).
bool SuffixArray::MinCount( const vector< WORD > &phrase, INDEX min )
{
	INDEX dummy;
	return LimitedCount( phrase, min, dummy, dummy, 0, m_size-1 ) >= min;
}
// True iff phrase occurs at least once (pure existence check).
bool SuffixArray::Exists( const vector< WORD > &phrase )
{
	INDEX dummy;
	return LimitedCount( phrase, 1, dummy, dummy, 0, m_size-1 ) == 1;
}
// Count matches of phrase within the sorted-index range
// [search_start, search_end], returning the match range boundaries
// through firstMatch/lastMatch.
int SuffixArray::FindMatches( const vector< WORD > &phrase, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start, INDEX search_end )
{
	return LimitedCount( phrase, m_size, firstMatch, lastMatch, search_start, search_end );
}
// Count occurrences of phrase within the sorted index range
// [search_start, search_end], stopping early once `min` matches are
// guaranteed.  When min > 1 and a match exists, firstMatch/lastMatch
// delimit the full matching range.  NOTE: search_end == -1 wraps to
// the largest INDEX (unsigned) and serves as a "to the end" sentinel.
int SuffixArray::LimitedCount( const vector< WORD > &phrase, INDEX min, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start, INDEX search_end )
{
	// cerr << "FindFirst\n";
	INDEX start = search_start;
	INDEX end = (search_end == -1) ? (m_size-1) : search_end;
	INDEX mid = FindFirst( phrase, start, end );
	// cerr << "done\n";
	if (mid == m_size) return 0; // no matches
	if (min == 1) return 1; // only existence check

	int matchCount = 1;

	// expand downwards from the found match
	firstMatch = FindLast( phrase, mid, start, -1 );
	matchCount += mid - firstMatch;

	// expand upwards from the found match
	lastMatch = FindLast( phrase, mid, end, 1 );
	matchCount += lastMatch - mid;

	return matchCount;
}
// Starting from a known match at `start`, binary-search towards `end`
// (direction = +1 or -1) for the last sorted-index position that still
// matches the phrase.
SuffixArray::INDEX SuffixArray::FindLast( const vector< WORD > &phrase, INDEX start, INDEX end, int direction )
{
	end += direction;
	while(true)
	{
		INDEX mid = ( start + end + (direction>0 ? 0 : 1) )/2;

		int match = Match( phrase, mid );
		int matchNext = Match( phrase, mid+direction );
		//cerr << "\t" << start << ";" << mid << ";" << end << " -> " << match << "," << matchNext << endl;

		// boundary found: mid matches but its neighbor does not
		if (match == 0 && matchNext != 0) return mid;

		if (match == 0) // mid point is a match
			start = mid;
		else
			end = mid;
	}
}
// Binary search for any sorted-index position in [start,end] whose
// suffix matches the phrase; returns m_size when none exists.
// NOTE: start and end are passed by reference and narrowed as a side
// effect of the search.
SuffixArray::INDEX SuffixArray::FindFirst( const vector< WORD > &phrase, INDEX &start, INDEX &end )
{
	while(true)
	{
		INDEX mid = ( start + end + 1 )/2;
		//cerr << "FindFirst(" << start << ";" << mid << ";" << end << ")\n";
		int match = Match( phrase, mid );

		if (match == 0) return mid;

		// range exhausted without a match
		if (start >= end && match != 0 ) return m_size;

		if (match > 0)
			start = mid+1;
		else
			end = mid-1;
	}
}
// Compare phrase against the suffix at sorted position `index`:
// returns 0 on match, otherwise the sign of the first differing word
// comparison.  NOTE(review): a phrase reaching the end of the corpus
// matches on its in-range prefix only -- presumably harmless because
// every sentence ends with the "<s>" sentinel; confirm for the very
// last corpus position.
int SuffixArray::Match( const vector< WORD > &phrase, INDEX index )
{
	INDEX pos = m_index[ index ];
	for(INDEX i=0; i<phrase.size() && i+pos<m_size; i++)
	{
		int match = CompareWord( m_vcb.GetWordID( phrase[i] ), m_array[ pos+i ] );
		// cerr << "{" << index << "+" << i << "," << pos+i << ":" << match << "}" << endl;
		if (match != 0)
			return match;
	}
	return 0;
}
// Debug helper: print to stdout (up to) the first 5 words of each
// suffix in the sorted index range [start, end].
void SuffixArray::List(INDEX start, INDEX end)
{
	for(INDEX i=start; i<=end; i++)
	{
		INDEX pos = m_index[ i ];
		// cerr << i << ":" << pos << "\t";
		for(int j=0; j<5 && j+pos<m_size; j++)
		{
			cout << " " << m_vcb.GetWord( m_array[ pos+j ] );
		}
		// cerr << "\n";
	}
}

View File

@ -0,0 +1,45 @@
#include "Vocabulary.h"
#pragma once
#define LINE_MAX_LENGTH 10000
// Suffix array over a tokenized corpus for fast n-gram lookup; each
// sentence is terminated by an "<s>" sentinel word.
class SuffixArray
{
public:
	typedef unsigned int INDEX;

private:
	WORD_ID *m_array;        // corpus as word ids (one sentinel per sentence)
	INDEX *m_index;          // suffix positions in sorted order
	INDEX *m_buffer;         // merge-sort scratch (live only during construction)
	char *m_wordInSentence;  // offset of each corpus position within its sentence
	size_t *m_sentence;      // sentence id of each corpus position
	char *m_sentenceLength;  // length in words of each sentence
	WORD_ID m_endOfSentence; // id of the "<s>" sentinel
	Vocabulary m_vcb;
	INDEX m_size;            // number of corpus positions (incl. sentinels)

public:
	SuffixArray( string fileName );
	~SuffixArray();

	void Sort(INDEX start, INDEX end);
	int CompareIndex( INDEX a, INDEX b ) const;
	inline int CompareWord( WORD_ID a, WORD_ID b ) const;
	// Total occurrences of phrase in the corpus.
	int Count( const vector< WORD > &phrase );
	// True iff phrase occurs at least `min` times (early exit).
	bool MinCount( const vector< WORD > &phrase, INDEX min );
	// True iff phrase occurs at least once.
	bool Exists( const vector< WORD > &phrase );
	int FindMatches( const vector< WORD > &phrase, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = 0, INDEX search_end = -1 );
	// BUGFIX: defaults were swapped (search_start = -1, search_end = 0);
	// -1 is the "to the end" sentinel for search_end (see FindMatches and
	// the implementation), while search_start defaults to 0.
	int LimitedCount( const vector< WORD > &phrase, INDEX min, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = 0, INDEX search_end = -1 );
	INDEX FindFirst( const vector< WORD > &phrase, INDEX &start, INDEX &end );
	INDEX FindLast( const vector< WORD > &phrase, INDEX start, INDEX end, int direction );
	int Match( const vector< WORD > &phrase, INDEX index );
	void List( INDEX start, INDEX end );

	inline INDEX GetPosition( INDEX index ) { return m_index[ index ]; }
	inline size_t GetSentence( INDEX position ) { return m_sentence[position]; }
	inline char GetWordInSentence( INDEX position ) { return m_wordInSentence[position]; }
	inline char GetSentenceLength( size_t sentenceId ) { return m_sentenceLength[sentenceId]; }
	inline INDEX GetSize() { return m_size; }
};

View File

@ -0,0 +1,147 @@
//
// Util.cpp
// fuzzy-match
//
// Created by Hieu Hoang on 26/07/2012.
// Copyright 2012 __MyCompanyName__. All rights reserved.
//
#include <iostream>
#include <stdio.h>
#include "Util.h"
#include "SentenceAlignment.h"
#include "SuffixArray.h"
/** Load a tokenized corpus: one sentence per line, words split on
 *  whitespace and mapped to WORD_IDs via the global `vocabulary`
 *  (unseen words are added). Appends one vector per line to `corpus`.
 *  Exits the process if the file cannot be opened. */
void load_corpus( const char* fileName, vector< vector< WORD_ID > > &corpus )
{ // source
  ifstream fileStream;
  fileStream.open(fileName);
  if (!fileStream) {
    cerr << "file not found: " << fileName << endl;
    exit(1);
  }
  cerr << "loading " << fileName << endl;

  istream *fileStreamP = &fileStream;

  char line[LINE_MAX_LENGTH];
  while(true)
  {
    // SAFE_GETLINE aborts the process on over-long lines
    SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n');
    if (fileStreamP->eof()) break;
    corpus.push_back( vocabulary.Tokenize( line ) );
  }
}
/** Load the target side of the translation memory. Each input line has
 *  the format:
 *      count word word ... [ ||| count word word ... ]*
 *  i.e. one or more target sentences, each preceded by its count and
 *  separated by "|||". For every line, a vector of SentenceAlignment
 *  (one per target sentence, alignments left empty -- they are filled
 *  in later by load_alignment) is appended to `corpus`.
 *  Exits the process if the file cannot be opened. */
void load_target( const char* fileName, vector< vector< SentenceAlignment > > &corpus)
{
  ifstream fileStream;
  fileStream.open(fileName);
  if (!fileStream) {
    cerr << "file not found: " << fileName << endl;
    exit(1);
  }
  cerr << "loading " << fileName << endl;

  istream *fileStreamP = &fileStream;

  WORD_ID delimiter = vocabulary.StoreIfNew("|||");

  int lineNum = 0;
  char line[LINE_MAX_LENGTH];
  while(true)
  {
    SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n');
    if (fileStreamP->eof()) break;

    vector<WORD_ID> toks = vocabulary.Tokenize( line );

    // start a new group of target sentences for this line
    corpus.push_back(vector< SentenceAlignment >());
    vector< SentenceAlignment > &vec = corpus.back();

    vec.push_back(SentenceAlignment());
    SentenceAlignment *sentence = &vec.back();

    // first token of the line is the count of the first target sentence
    const WORD &countStr = vocabulary.GetWord(toks[0]);
    sentence->count = atoi(countStr.c_str());

    for (size_t i = 1; i < toks.size(); ++i) {
      WORD_ID wordId = toks[i];

      if (wordId == delimiter) {
        // target and alignments can have multiple sentences.
        vec.push_back(SentenceAlignment());
        sentence = &vec.back();

        // count: the token right after "|||" is the next sentence's count
        ++i;

        const WORD &countStr = vocabulary.GetWord(toks[i]);
        sentence->count = atoi(countStr.c_str());
      }
      else {
        // just a normal word, add
        sentence->target.push_back(wordId);
      }
    }

    ++lineNum;
  }
}
/** Load word alignments for the target sentences previously loaded by
 *  load_target into the same `corpus` structure (main calls load_target
 *  first, then this function on a parallel file). Line i contains
 *  "a-b" alignment points for corpus[i], with "|||" separating the
 *  blocks for the multiple target sentences of that line.
 *  NOTE(review): corpus[lineNum] and vec[targetInd] are indexed without
 *  bounds checks -- the alignment file must be exactly parallel to the
 *  target file. Exits the process if the file cannot be opened. */
void load_alignment( const char* fileName, vector< vector< SentenceAlignment > > &corpus )
{
  ifstream fileStream;
  fileStream.open(fileName);
  if (!fileStream) {
    cerr << "file not found: " << fileName << endl;
    exit(1);
  }
  cerr << "loading " << fileName << endl;

  istream *fileStreamP = &fileStream;

  string delimiter = "|||";

  int lineNum = 0;
  char line[LINE_MAX_LENGTH];
  while(true)
  {
    SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n');
    if (fileStreamP->eof()) break;

    vector< SentenceAlignment > &vec = corpus[lineNum];
    size_t targetInd = 0;
    SentenceAlignment *sentence = &vec[targetInd];

    vector<string> toks = Tokenize(line);

    for (size_t i = 0; i < toks.size(); ++i) {
      string &tok = toks[i];

      if (tok == delimiter) {
        // target and alignments can have multiple sentences.
        ++targetInd;
        sentence = &vec[targetInd];

        // skip the count token that follows the delimiter (parallels load_target)
        ++i;
      }
      else {
        // just a normal alignment, add
        vector<int> alignPoint = Tokenize<int>(tok, "-");
        assert(alignPoint.size() == 2);
        sentence->alignment.push_back(pair<int,int>(alignPoint[0], alignPoint[1]));
      }
    }

    ++lineNum;
  }
}

View File

@ -0,0 +1,87 @@
//
// Util.h
// fuzzy-match
//
// Created by Hieu Hoang on 25/07/2012.
// Copyright 2012 __MyCompanyName__. All rights reserved.
//
#ifndef fuzzy_match_Util_h
#define fuzzy_match_Util_h
#include <vector>
#include <sstream>
#include "Vocabulary.h"
class SentenceAlignment;
void load_corpus( const char* fileName, std::vector< std::vector< WORD_ID > > &corpus );
void load_target( const char* fileName, std::vector< std::vector< SentenceAlignment > > &corpus);
void load_alignment( const char* fileName, std::vector< std::vector< SentenceAlignment > > &corpus );
/**
 * Concatenate the elements of `items` into one string, inserting
 * `delimiter` between consecutive elements. Elements are rendered via
 * operator<<. An empty vector yields the empty string.
 */
template <typename T>
std::string Join(const std::string& delimiter, const std::vector<T>& items)
{
  if (items.empty()) {
    return "";
  }
  std::ostringstream joined;
  joined << items[0];
  for (std::size_t idx = 1; idx < items.size(); ++idx) {
    joined << delimiter << items[idx];
  }
  return joined.str();
}
//! Parse a value of type T (int, float, ...) from its string
//! representation, using stream extraction. Used for reading numbers
//! from text files.
template<typename T>
inline T Scan(const std::string &input)
{
  std::stringstream reader(input);
  T value;
  reader >> value;
  return value;
}

//! Element-wise conversion: parse every string in `input` as a T.
template<typename T>
inline std::vector<T> Scan(const std::vector< std::string > &input)
{
  std::vector<T> converted(input.size());
  for (std::size_t idx = 0; idx < input.size(); ++idx) {
    converted[idx] = Scan<T>( input[idx] );
  }
  return converted;
}
/**
 * Split `str` into tokens separated by any character in `delimiters`
 * (default: space and tab). Runs of delimiters produce no empty
 * tokens; an empty or all-delimiter string yields an empty vector.
 */
inline std::vector<std::string> Tokenize(const std::string& str,
                                         const std::string& delimiters = " \t")
{
  std::vector<std::string> pieces;

  // Locate the first token.
  std::string::size_type tokenStart = str.find_first_not_of(delimiters, 0);
  std::string::size_type tokenEnd = str.find_first_of(delimiters, tokenStart);

  while (tokenStart != std::string::npos || tokenEnd != std::string::npos) {
    // Store the current token, then skip the delimiter run that
    // follows it and locate the next token boundary.
    pieces.push_back(str.substr(tokenStart, tokenEnd - tokenStart));
    tokenStart = str.find_first_not_of(delimiters, tokenEnd);
    tokenEnd = str.find_first_of(delimiters, tokenStart);
  }

  return pieces;
}
/** Split `input` on `delimiters` and convert each token to type T
 *  (delegates to the string Tokenize overload, then Scan). */
template<typename T>
inline std::vector<T> Tokenize( const std::string &input
                               , const std::string& delimiters = " \t")
{
  std::vector<std::string> pieces = Tokenize(input, delimiters);
  return Scan<T>( pieces );
}
#endif

View File

@ -0,0 +1,45 @@
// $Id: Vocabulary.cpp 1565 2008-02-22 14:42:01Z bojar $
#include "Vocabulary.h"
// as in beamdecoder/tables.cpp
/** Split a NUL-terminated C string on spaces and tabs, mapping each
 *  token to its WORD_ID and adding unseen words to the vocabulary. */
vector<WORD_ID> Vocabulary::Tokenize( const char input[] ) {
  vector< WORD_ID > token;
  bool betweenWords = true;  // true while scanning whitespace
  int start=0;               // start offset of the current token
  int i=0;
  for(; input[i] != '\0'; i++) {
    bool isSpace = (input[i] == ' ' || input[i] == '\t');

    if (!isSpace && betweenWords) {
      // first character of a new token
      start = i;
      betweenWords = false;
    }
    else if (isSpace && !betweenWords) {
      // token just ended: store it
      token.push_back( StoreIfNew ( string( input+start, i-start ) ) );
      betweenWords = true;
    }
  }
  // flush the final token if the string did not end in whitespace
  if (!betweenWords)
    token.push_back( StoreIfNew ( string( input+start, i-start ) ) );
  return token;
}
/** Return the id of `word`, inserting it with a fresh id (the current
 *  vocabulary size, so ids are dense and 0-based) if unseen. */
WORD_ID Vocabulary::StoreIfNew( const WORD& word ) {
  map<WORD, WORD_ID>::iterator i = lookup.find( word );

  if( i != lookup.end() )
    return i->second;

  WORD_ID id = vocab.size();
  vocab.push_back( word );
  lookup[ word ] = id;
  return id;
}
/** Look up the id of `word` without inserting.
 *  NOTE(review): returns 0 for unknown words, but 0 is also the id of
 *  the first word ever stored -- callers cannot distinguish the two. */
WORD_ID Vocabulary::GetWordID( const WORD &word ) {
  map<WORD, WORD_ID>::iterator i = lookup.find( word );
  if( i == lookup.end() )
    return 0;
  WORD_ID w= (WORD_ID) i->second;
  return w;
}

View File

@ -0,0 +1,40 @@
// $Id: tables-core.h 1470 2007-10-02 21:43:54Z redpony $
#pragma once
#include <iostream>
#include <fstream>
#include <assert.h>
#include <stdlib.h>
#include <string>
#include <queue>
#include <map>
#include <cmath>
using namespace std;
// Maximum line length (in chars) expected by SAFE_GETLINE buffers.
#define MAX_LENGTH 10000

// Read one _DELIM-terminated line from stream _IS into buffer _LINE of
// size _SIZE; clear recoverable stream failures, and abort the process
// with a message if the line filled the entire buffer (truncation).
#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM) { \
    _IS.getline(_LINE, _SIZE, _DELIM); \
    if(_IS.fail() && !_IS.bad() && !_IS.eof()) _IS.clear(); \
    if (_IS.gcount() == _SIZE-1) { \
      cerr << "Line too long! Buffer overflow. Delete lines >=" \
           << _SIZE << " chars or raise MAX_LENGTH in phrase-extract/tables-core.cpp" \
           << endl; \
      exit(1); \
    } \
  }

typedef string WORD;           // surface form of a word
typedef unsigned int WORD_ID;  // dense integer id assigned by Vocabulary
/** Bidirectional word <-> integer id mapping; ids are dense and
 *  assigned in order of first occurrence (0-based). */
class Vocabulary {
public:
  map<WORD, WORD_ID> lookup;   // word -> id
  vector< WORD > vocab;        // id -> word
  WORD_ID StoreIfNew( const WORD& );  // id of word, inserting if new
  WORD_ID GetWordID( const WORD& );   // id of word, 0 if unknown
  vector<WORD_ID> Tokenize( const char[] );  // split on space/tab, storing new words
  // NOTE(review): casts away const to hand out a mutable reference from
  // a const method -- callers could corrupt the vocabulary through it.
  inline WORD &GetWord( WORD_ID id ) const { WORD &i = (WORD&) vocab[ id ]; return i; }
};

View File

@ -0,0 +1,460 @@
#include <stdio.h>
#include <stdlib.h>
#include <getopt.h>
#include <map>
#include <algorithm>
#include <iostream>
#include <fstream>
#include <cstring>
#include <time.h>
#include <fstream>
#include "SentenceAlignment.h"
#include "fuzzy-match2.h"
#include "SuffixArray.h"
/** This implementation is explained in
Koehn and Senellart: "Fast Approximate String Matching
with Suffix Arrays and A* Parsing" (AMTA 2010) ***/
using namespace std;
int main(int argc, char* argv[])
{
vector< vector< WORD_ID > > source, input;
vector< vector< SentenceAlignment > > targetAndAlignment;
while(1) {
static struct option long_options[] = {
{"basic", no_argument, &basic_flag, 1},
{"word", no_argument, &lsed_flag, 0},
{"unrefined", no_argument, &refined_flag, 0},
{"nolengthfilter", no_argument, &length_filter_flag, 0},
{"noparse", no_argument, &parse_flag, 0},
{"multiple", no_argument, &multiple_flag, 1},
{"minmatch", required_argument, 0, 'm'},
{0, 0, 0, 0}
};
int option_index = 0;
int c = getopt_long (argc, argv, "m:", long_options, &option_index);
if (c == -1) break;
switch (c) {
case 0:
// if (long_options[option_index].flag != 0)
// break;
// printf ("option %s", long_options[option_index].name);
// if (optarg)
// printf (" with arg %s", optarg);
// printf ("\n");
break;
case 'm':
min_match = atoi(optarg);
if (min_match < 1 || min_match > 100) {
cerr << "error: --minmatch must have value in range 1..100\n";
exit(1);
}
cerr << "setting min match to " << min_match << endl;
break;
default:
cerr << "usage: syntax: ./fuzzy-match input corpus [--basic] [--word] [--minmatch 1..100]\n";
exit(1);
}
}
if (lsed_flag) { cerr << "lsed\n"; }
if (basic_flag) { cerr << "basic\n"; }
if (refined_flag) { cerr << "refined\n"; }
if (length_filter_flag) { cerr << "length filter\n"; }
if (parse_flag) { cerr << "parse\n"; }
// exit(1);
if (optind+4 != argc) {
cerr << "syntax: ./fuzzy-match input source target alignment [--basic] [--word] [--minmatch 1..100]\n";
exit(1);
}
load_corpus(argv[optind], input);
load_corpus(argv[optind+1], source);
load_target(argv[optind+2], targetAndAlignment);
load_alignment(argv[optind+3], targetAndAlignment);
// ./fuzzy-match input corpus [-basic]
// load_corpus("../corpus/tm.truecased.4.en", source);
// load_corpus("../corpus/tm.truecased.4.it", target);
// load_corpus("../evaluation/test.input.tc.4", input);
// load_corpus("../../acquis-truecase/corpus/acquis.truecased.190.en", source);
// load_corpus("../../acquis-truecase/evaluation/ac-test.input.tc.190", input);
// load_corpus("../corpus/tm.truecased.16.en", source);
// load_corpus("../evaluation/test.input.tc.16", input);
if (basic_flag) {
cerr << "using basic method\n";
clock_t start_main_clock2 = clock();
basic_fuzzy_match( source, input );
cerr << "total: " << (1000 * (clock()-start_main_clock2) / CLOCKS_PER_SEC) << endl;
exit(1);
}
cerr << "number of input sentences " << input.size() << endl;
cerr << "creating suffix array...\n";
// SuffixArray suffixArray( "../corpus/tm.truecased.4.en" );
// SuffixArray suffixArray( "../../acquis-truecase/corpus/acquis.truecased.190.en" );
SuffixArray suffixArray( argv[optind+1] );
clock_t start_main_clock = clock();
// looping through all input sentences...
cerr << "looping...\n";
for(unsigned int sentenceInd = 0; sentenceInd < input.size(); sentenceInd++)
{
clock_t start_clock = clock();
// if (i % 10 == 0) cerr << ".";
// establish some basic statistics
// int input_length = compute_length( input[i] );
int input_length = input[sentenceInd].size();
int best_cost = input_length * (100-min_match) / 100 + 1;
int match_count = 0; // how many substring matches to be considered
//cerr << endl << "sentence " << i << ", length " << input_length << ", best_cost " << best_cost << endl;
// find match ranges in suffix array
vector< vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > > match_range;
for(size_t start=0;start<input[sentenceInd].size();start++)
{
SuffixArray::INDEX prior_first_match = 0;
SuffixArray::INDEX prior_last_match = suffixArray.GetSize()-1;
vector< string > substring;
bool stillMatched = true;
vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > matchedAtThisStart;
//cerr << "start: " << start;
for(int word=start; stillMatched && word<input[sentenceInd].size(); word++)
{
substring.push_back( vocabulary.GetWord( input[sentenceInd][word] ) );
// only look up, if needed (i.e. no unnecessary short gram lookups)
// if (! word-start+1 <= short_match_max_length( input_length ) )
// {
SuffixArray::INDEX first_match, last_match;
stillMatched = false;
if (suffixArray.FindMatches( substring, first_match, last_match, prior_first_match, prior_last_match ) )
{
stillMatched = true;
matchedAtThisStart.push_back( make_pair( first_match, last_match ) );
//cerr << " (" << first_match << "," << last_match << ")";
//cerr << " " << ( last_match - first_match + 1 );
prior_first_match = first_match;
prior_last_match = last_match;
}
//}
}
//cerr << endl;
match_range.push_back( matchedAtThisStart );
}
clock_t clock_range = clock();
map< int, vector< Match > > sentence_match;
map< int, int > sentence_match_word_count;
// go through all matches, longest first
for(int length = input[sentenceInd].size(); length >= 1; length--)
{
// do not create matches, if these are handled by the short match function
if (length <= short_match_max_length( input_length ) )
{
continue;
}
unsigned int count = 0;
for(int start = 0; start <= input[sentenceInd].size() - length; start++)
{
if (match_range[start].size() >= length)
{
pair< SuffixArray::INDEX, SuffixArray::INDEX > &range = match_range[start][length-1];
// cerr << " (" << range.first << "," << range.second << ")";
count += range.second - range.first + 1;
for(SuffixArray::INDEX i=range.first; i<=range.second; i++)
{
int position = suffixArray.GetPosition( i );
// sentence length mismatch
size_t sentence_id = suffixArray.GetSentence( position );
int sentence_length = suffixArray.GetSentenceLength( sentence_id );
int diff = abs( (int)sentence_length - (int)input_length );
// cerr << endl << i << "\tsentence " << sentence_id << ", length " << sentence_length;
//if (length <= 2 && input_length>=5 &&
// sentence_match.find( sentence_id ) == sentence_match.end())
// continue;
if (diff > best_cost)
continue;
// compute minimal cost
int start_pos = suffixArray.GetWordInSentence( position );
int end_pos = start_pos + length-1;
// cerr << endl << "\t" << start_pos << "-" << end_pos << " (" << sentence_length << ") vs. "
// << start << "-" << (start+length-1) << " (" << input_length << ")";
// different number of prior words -> cost is at least diff
int min_cost = abs( start - start_pos );
// same number of words, but not sent. start -> cost is at least 1
if (start == start_pos && start>0)
min_cost++;
// different number of remaining words -> cost is at least diff
min_cost += abs( ( sentence_length-1 - end_pos ) -
( input_length-1 - (start+length-1) ) );
// same number of words, but not sent. end -> cost is at least 1
if ( sentence_length-1 - end_pos ==
input_length-1 - (start+length-1)
&& end_pos != sentence_length-1 )
min_cost++;
// cerr << " -> min_cost " << min_cost;
if (min_cost > best_cost)
continue;
// valid match
match_count++;
// compute maximal cost
int max_cost = max( start, start_pos )
+ max( sentence_length-1 - end_pos,
input_length-1 - (start+length-1) );
// cerr << ", max_cost " << max_cost;
Match m = Match( start, start+length-1,
start_pos, start_pos+length-1,
min_cost, max_cost, 0);
sentence_match[ sentence_id ].push_back( m );
sentence_match_word_count[ sentence_id ] += length;
if (max_cost < best_cost)
{
best_cost = max_cost;
if (best_cost == 0) break;
}
//if (match_count >= MAX_MATCH_COUNT) break;
}
}
// cerr << endl;
if (best_cost == 0) break;
//if (match_count >= MAX_MATCH_COUNT) break;
}
// cerr << count << " matches at length " << length << " in " << sentence_match.size() << " tm." << endl;
if (best_cost == 0) break;
//if (match_count >= MAX_MATCH_COUNT) break;
}
cerr << match_count << " matches in " << sentence_match.size() << " sentences." << endl;
clock_t clock_matches = clock();
// consider each sentence for which we have matches
int old_best_cost = best_cost;
int tm_count_word_match = 0;
int tm_count_word_match2 = 0;
int pruned_match_count = 0;
if (short_match_max_length( input_length ))
{
init_short_matches( input[sentenceInd] );
}
vector< int > best_tm;
typedef map< int, vector< Match > >::iterator I;
clock_t clock_validation_sum = 0;
for(I tm=sentence_match.begin(); tm!=sentence_match.end(); tm++)
{
int tmID = tm->first;
int tm_length = suffixArray.GetSentenceLength(tmID);
vector< Match > &match = tm->second;
add_short_matches( match, source[tmID], input_length, best_cost );
//cerr << "match in sentence " << tmID << ": " << match.size() << " [" << tm_length << "]" << endl;
// quick look: how many words are matched
int words_matched = 0;
for(int m=0;m<match.size();m++) {
if (match[m].min_cost <= best_cost) // makes no difference
words_matched += match[m].input_end - match[m].input_start + 1;
}
if (max(input_length,tm_length) - words_matched > best_cost)
{
if (length_filter_flag) continue;
}
tm_count_word_match++;
// prune, check again how many words are matched
vector< Match > pruned = prune_matches( match, best_cost );
words_matched = 0;
for(int p=0;p<pruned.size();p++) {
words_matched += pruned[p].input_end - pruned[p].input_start + 1;
}
if (max(input_length,tm_length) - words_matched > best_cost)
{
if (length_filter_flag) continue;
}
tm_count_word_match2++;
pruned_match_count += pruned.size();
int prior_best_cost = best_cost;
int cost;
clock_t clock_validation_start = clock();
if (! parse_flag ||
pruned.size()>=10) // to prevent worst cases
{
string path;
cost = sed( input[sentenceInd], source[tmID], path, false );
if (cost < best_cost)
{
best_cost = cost;
}
}
else
{
cost = parse_matches( pruned, input_length, tm_length, best_cost );
if (prior_best_cost != best_cost)
{
best_tm.clear();
}
}
clock_validation_sum += clock() - clock_validation_start;
if (cost == best_cost)
{
best_tm.push_back( tmID );
}
}
cerr << "reduced best cost from " << old_best_cost << " to " << best_cost << endl;
cerr << "tm considered: " << sentence_match.size()
<< " word-matched: " << tm_count_word_match
<< " word-matched2: " << tm_count_word_match2
<< " best: " << best_tm.size() << endl;
cerr << "pruned matches: " << ((float)pruned_match_count/(float)tm_count_word_match2) << endl;
// create xml and extract files
string inputStr, sourceStr;
for (size_t pos = 0; pos < input_length; ++pos) {
inputStr += vocabulary.GetWord(input[sentenceInd][pos]) + " ";
}
// do not try to find the best ... report multiple matches
if (multiple_flag) {
int input_letter_length = compute_length( input[sentenceInd] );
for(int si=0; si<best_tm.size(); si++) {
int s = best_tm[si];
string path;
unsigned int letter_cost = sed( input[sentenceInd], source[s], path, true );
// do not report multiple identical sentences, but just their count
cout << sentenceInd << " "; // sentence number
cout << letter_cost << "/" << input_letter_length << " ";
cout << "(" << best_cost <<"/" << input_length <<") ";
cout << "||| " << s << " ||| " << path << endl;
vector<WORD_ID> &sourceSentence = source[s];
vector<SentenceAlignment> &targets = targetAndAlignment[s];
create_extract(sentenceInd, best_cost, sourceSentence, targets, inputStr, path);
}
} // if (multiple_flag)
else {
// find the best matches according to letter sed
string best_path = "";
int best_match = -1;
int best_letter_cost;
if (lsed_flag) {
best_letter_cost = compute_length( input[sentenceInd] ) * min_match / 100 + 1;
for(int si=0; si<best_tm.size(); si++)
{
int s = best_tm[si];
string path;
unsigned int letter_cost = sed( input[sentenceInd], source[s], path, true );
if (letter_cost < best_letter_cost)
{
best_letter_cost = letter_cost;
best_path = path;
best_match = s;
}
}
}
// if letter sed turned off, just compute path for first match
else {
if (best_tm.size() > 0) {
string path;
sed( input[sentenceInd], source[best_tm[0]], path, false );
best_path = path;
best_match = best_tm[0];
}
}
cerr << "elapsed: " << (1000 * (clock()-start_clock) / CLOCKS_PER_SEC)
<< " ( range: " << (1000 * (clock_range-start_clock) / CLOCKS_PER_SEC)
<< " match: " << (1000 * (clock_matches-clock_range) / CLOCKS_PER_SEC)
<< " tm: " << (1000 * (clock()-clock_matches) / CLOCKS_PER_SEC)
<< " (validation: " << (1000 * (clock_validation_sum) / CLOCKS_PER_SEC) << ")"
<< " )" << endl;
if (lsed_flag) {
cout << best_letter_cost << "/" << compute_length( input[sentenceInd] ) << " (";
}
cout << best_cost <<"/" << input_length;
if (lsed_flag) cout << ")";
cout << " ||| " << best_match << " ||| " << best_path << endl;
// creat xml & extracts
vector<WORD_ID> &sourceSentence = source[best_match];
vector<SentenceAlignment> &targets = targetAndAlignment[best_match];
create_extract(sentenceInd, best_cost, sourceSentence, targets, inputStr, best_path);
} // else if (multiple_flag)
}
cerr << "total: " << (1000 * (clock()-start_main_clock) / CLOCKS_PER_SEC) << endl;
}
/** Write one extraction record per target sentence of the matched TM
 *  entry to a temporary file (sentence number, cost, TM source, input,
 *  target, alignment, edit path, count -- one item per line), and build
 *  the create_xml.perl command line for it. */
void create_extract(int sentenceInd, int cost, const vector< WORD_ID > &sourceSentence, const vector<SentenceAlignment> &targets, const string &inputStr, const string &path)
{
  // render the TM source sentence as a space-separated string
  string sourceStr;
  for (size_t pos = 0; pos < sourceSentence.size(); ++pos) {
    WORD_ID wordId = sourceSentence[pos];
    sourceStr += vocabulary.GetWord(wordId) + " ";
  }

  // NOTE(review): tmpnam() is racy/insecure (prefer mkstemp) and the
  // temporary file is never removed.
  char *inputFileName = tmpnam(NULL);
  ofstream inputFile(inputFileName);

  for (size_t targetInd = 0; targetInd < targets.size(); ++targetInd) {
    const SentenceAlignment &sentenceAlignment = targets[targetInd];
    string targetStr = sentenceAlignment.getTargetString();
    string alignStr = sentenceAlignment.getAlignmentString();

    inputFile
      << sentenceInd << endl
      << cost << endl
      << sourceStr << endl
      << inputStr << endl
      << targetStr << endl
      << alignStr << endl
      << path << endl
      << sentenceAlignment.count << endl;
  }

  // NOTE(review): the command is only printed to stderr and never
  // executed (no system() call) -- confirm whether this is intentional.
  string cmd = string("perl create_xml.perl < ") + inputFileName;
  cerr << cmd << endl;

  inputFile.close();
}

View File

@ -0,0 +1,561 @@
//
// fuzzy-match2.h
// fuzzy-match
//
// Created by Hieu Hoang on 25/07/2012.
// Copyright 2012 __MyCompanyName__. All rights reserved.
//
#ifndef fuzzy_match_fuzzy_match2_h
#define fuzzy_match_fuzzy_match2_h
#include <string>
#include <sstream>
#include <vector>
#include "Vocabulary.h"
#include "SuffixArray.h"
#include "Util.h"
#include "Match.h"
#define MAX_MATCH_COUNT 10000000
// Global vocabulary shared by corpus loading and matching.
Vocabulary vocabulary;

// Behaviour flags, overridden by command-line options in main().
int basic_flag = false;         // --basic: brute-force matching, no suffix array
int lsed_flag = true;           // rank tied candidates by letter edit distance (--word disables)
int refined_flag = true;        // enable short-match refinement (--unrefined disables)
int length_filter_flag = true;  // skip TM sentences of very different length (--nolengthfilter disables)
int parse_flag = true;          // A* parsing of matches instead of full sed (--noparse disables)
int min_match = 70;             // --minmatch: minimum match percentage (1..100)
int multiple_flag = false;      // --multiple: report all tied best matches
int multiple_slack = 0;
int multiple_max = 100;

// word id -> positions of that word in the current input sentence
// (rebuilt per sentence by init_short_matches)
map< WORD_ID,vector< int > > single_word_index;
// global cache for word pairs: letter edit distance, filled by letter_sed()
map< pair< WORD_ID, WORD_ID >, unsigned int > lsed;

void create_extract(int sentenceInd, int cost, const vector< WORD_ID > &sourceSentence, const vector<SentenceAlignment> &targets, const string &inputStr, const string &path);
/* Letter string edit distance between the surface forms of two words,
   e.g. sub 'their' to 'there' costs 2. Results are memoized in the
   global `lsed` cache keyed by the (ordered) word-id pair. */
unsigned int letter_sed( WORD_ID aIdx, WORD_ID bIdx )
{
  // check if already computed -> lookup in cache
  pair< WORD_ID, WORD_ID > pIdx = make_pair( aIdx, bIdx );
  map< pair< WORD_ID, WORD_ID >, unsigned int >::const_iterator lookup = lsed.find( pIdx );
  if (lookup != lsed.end())
  {
    return (lookup->second);
  }

  // get surface strings for word indices
  const string &a = vocabulary.GetWord( aIdx );
  const string &b = vocabulary.GetWord( bIdx );

  // initialize cost matrix: first row/column = cost of deleting/
  // inserting that many characters.
  // NOTE(review): calloc arguments are in (size, count) order instead
  // of the conventional (count, size); the allocated byte count is the
  // same either way.
  unsigned int **cost  = (unsigned int**) calloc( sizeof( unsigned int* ), a.size()+1 );
  for( unsigned int i=0; i<=a.size(); i++ ) {
    cost[i] = (unsigned int*) calloc( sizeof(unsigned int), b.size()+1 );
    cost[i][0] = i;
  }
  for( unsigned int j=0; j<=b.size(); j++ ) {
    cost[0][j] = j;
  }

  // core string edit distance loop: classic Levenshtein recurrence
  for( unsigned int i=1; i<=a.size(); i++ ) {
    for( unsigned int j=1; j<=b.size(); j++ ) {

      unsigned int ins = cost[i-1][j] + 1;
      unsigned int del = cost[i][j-1] + 1;
      bool match = (a.substr(i-1,1).compare( b.substr(j-1,1) ) == 0);
      unsigned int diag = cost[i-1][j-1] + (match ? 0 : 1);

      unsigned int min = (ins < del) ? ins : del;
      min = (diag < min) ? diag : min;

      cost[i][j] = min;
    }
  }

  // clear out memory
  unsigned int final = cost[a.size()][b.size()];
  for( unsigned int i=0; i<=a.size(); i++ ) {
    free( cost[i] );
  }
  free( cost );

  // cache and return result
  lsed[ pIdx ] = final;
  return final;
}
/* String edit distance between token sequences a and b, also producing
   the edit path in best_path: 'M' match, 'S' substitution, 'I' word
   only in a, 'D' word only in b. With use_letter_sed, insertions and
   deletions cost the word's character length and substitutions cost
   letter_sed() of the word pair; otherwise every edit costs 1. */
unsigned int sed( const vector< WORD_ID > &a, const vector< WORD_ID > &b, string &best_path, bool use_letter_sed ) {

  // initialize cost and path matrices; first row/column encode the
  // cost of inserting/deleting a prefix outright
  unsigned int **cost  = (unsigned int**) calloc( sizeof( unsigned int* ), a.size()+1 );
  char **path = (char**) calloc( sizeof( char* ), a.size()+1 );

  for( unsigned int i=0; i<=a.size(); i++ ) {
    cost[i]  = (unsigned int*) calloc( sizeof(unsigned int), b.size()+1 );
    path[i] = (char*) calloc( sizeof(char), b.size()+1 );
    if (i>0)
    {
      cost[i][0] = cost[i-1][0];
      if (use_letter_sed)
      {
        // letter-weighted: cost is the word's length in characters
        cost[i][0] += vocabulary.GetWord( a[i-1] ).size();
      }
      else
      {
        cost[i][0]++;
      }
    }
    else
    {
      cost[i][0] = 0;
    }
    path[i][0] = 'I';
  }

  for( unsigned int j=0; j<=b.size(); j++ ) {
    if (j>0)
    {
      cost[0][j] = cost[0][j-1];
      if (use_letter_sed)
      {
        cost[0][j] += vocabulary.GetWord( b[j-1] ).size();
      }
      else
      {
        cost[0][j]++;
      }
    }
    else
    {
      cost[0][j] = 0;
    }
    path[0][j] = 'D';
  }

  // core string edit distance algorithm (Levenshtein recurrence,
  // recording the chosen action for path reconstruction)
  for( unsigned int i=1; i<=a.size(); i++ ) {
    for( unsigned int j=1; j<=b.size(); j++ ) {

      unsigned int ins = cost[i-1][j];
      unsigned int del = cost[i][j-1];
      unsigned int match;
      if (use_letter_sed)
      {
        ins += vocabulary.GetWord( a[i-1] ).size();
        del += vocabulary.GetWord( b[j-1] ).size();
        match = letter_sed( a[i-1], b[j-1] );
      }
      else
      {
        ins++;
        del++;
        match = ( a[i-1] == b[j-1] ) ? 0 : 1;
      }
      unsigned int diag = cost[i-1][j-1] + match;

      char action = (ins < del) ? 'I' : 'D';
      unsigned int min = (ins < del) ? ins : del;
      if (diag < min)
      {
        action = (match>0) ? 'S' : 'M';
        min = diag;
      }

      cost[i][j] = min;
      path[i][j] = action;
    }
  }

  // construct string for best path by backtracking from the corner
  unsigned int i = a.size();
  unsigned int j = b.size();
  best_path = "";
  while( i>0 || j>0 )
  {
    best_path = path[i][j] + best_path;
    if (path[i][j] == 'I')
    {
      i--;
    }
    else if (path[i][j] == 'D')
    {
      j--;
    }
    else
    {
      i--;
      j--;
    }
  }

  // clear out memory
  unsigned int final = cost[a.size()][b.size()];
  for( unsigned int i=0; i<=a.size(); i++ ) {
    free( cost[i] );
    free( path[i] );
  }
  free( cost );
  free( path );

  // return result
  return final;
}
/* utility function: total number of characters in the sentence's
   surface words (whitespace between words is not counted) */
unsigned int compute_length( const vector< WORD_ID > &sentence )
{
  unsigned int total = 0;
  for (unsigned int pos = 0; pos < sentence.size(); ++pos) {
    total += vocabulary.GetWord( sentence[pos] ).size();
  }
  return total;
}
/* brute force method: compare each input sentence to all corpus
   sentences with full string edit distance and print
   "cost ||| best-match-id ||| edit-path" per input sentence to stdout
   (best_match is -1 if nothing beat the --minmatch threshold).
   Parameters are taken by value (kept for interface compatibility).
   Returns 0 on completion.
   FIX: the function previously fell off the end of a non-void function
   without returning, which is undefined behaviour in C++. */
int basic_fuzzy_match( vector< vector< WORD_ID > > source,
                       vector< vector< WORD_ID > > input )
{
  // go through input set...
  for(unsigned int i=0;i<input.size();i++)
  {
    bool use_letter_sed = false;

    // compute sentence length and worst allowed cost
    unsigned int input_length;
    if (use_letter_sed)
    {
      input_length = compute_length( input[i] );
    }
    else
    {
      input_length = input[i].size();
    }
    unsigned int best_cost = input_length * (100-min_match) / 100 + 2;
    string best_path = "";
    int best_match = -1;

    // go through all corpus sentences
    for(unsigned int s=0;s<source.size();s++)
    {
      int source_length;
      if (use_letter_sed)
      {
        source_length = compute_length( source[s] );
      }
      else
      {
        source_length = source[s].size();
      }
      // length difference alone is a lower bound on the edit distance
      int diff = abs((int)source_length - (int)input_length);
      if (length_filter_flag && (diff >= best_cost))
      {
        continue;
      }

      // compute string edit distance
      string path;
      unsigned int cost = sed( input[i], source[s], path, use_letter_sed );

      // update if new best
      if (cost < best_cost)
      {
        best_cost = cost;
        best_path = path;
        best_match = s;
      }
    }
    cout << best_cost << " ||| " << best_match << " ||| " << best_path << endl;
  }
  return 0;  // previously missing: non-void function must return a value
}
/* definition of short matches: n-grams up to this length are not
   looked up in the suffix array (too many hits) but handled by the
   short-match functions instead. Returns 1 for sentences of at least
   5 words when refinement is enabled, 0 otherwise. */
inline int short_match_max_length( int input_length )
{
  if (refined_flag && input_length >= 5)
    return 1;
  return 0;
}
/* if we have non-short matches in a sentence, we need to
take a closer look at it.
this function creates a hash map for all input words and their positions
(to be used by the next function)
(done here, because this has be done only once for an input sentence) */
void init_short_matches( const vector< WORD_ID > &input )
{
int max_length = short_match_max_length( input.size() );
if (max_length == 0)
return;
single_word_index.clear();
// store input words and their positions in hash map
for(int i=0; i<input.size(); i++)
{
if (single_word_index.find( input[i] ) == single_word_index.end())
{
vector< int > position_vector;
single_word_index[ input[i] ] = position_vector;
}
single_word_index[ input[i] ].push_back( i );
}
}
/* Add all short (1-gram) matches between the input sentence (via the
   global single_word_index built by init_short_matches) and the TM
   sentence `tm` to `match`, keeping only those whose minimal cost
   bound does not exceed best_cost. No-op when short matches are
   disabled for this input length. */
void add_short_matches( vector< Match > &match, const vector< WORD_ID > &tm, int input_length, int best_cost )
{
  int max_length = short_match_max_length( input_length );
  if (max_length == 0)
    return;

  int tm_length = tm.size();
  map< WORD_ID,vector< int > >::iterator input_word_hit;
  for(int t_pos=0; t_pos<tm.size(); t_pos++)
  {
    input_word_hit = single_word_index.find( tm[t_pos] );
    if (input_word_hit != single_word_index.end())
    {
      // the TM word occurs in the input: one Match per input position
      vector< int > &position_vector = input_word_hit->second;
      for(int j=0; j<position_vector.size(); j++)
      {
        int &i_pos = position_vector[j];

        // cost bounds mirror the long-match computation in main():
        // before match
        int max_cost = max( i_pos , t_pos );
        int min_cost = abs( i_pos - t_pos );
        if ( i_pos>0 && i_pos == t_pos )
          min_cost++;  // same offset but not sentence start -> at least 1 edit

        // after match
        max_cost += max( (input_length-i_pos) , (tm_length-t_pos));
        min_cost += abs( (input_length-i_pos) - (tm_length-t_pos));
        if ( i_pos != input_length-1 && (input_length-i_pos) == (tm_length-t_pos))
          min_cost++;  // same tail length but not sentence end -> at least 1 edit

        if (min_cost <= best_cost)
        {
          Match new_match( i_pos,i_pos, t_pos,t_pos, min_cost,max_cost,0 );
          match.push_back( new_match );
        }
      }
    }
  }
}
/* Return the matches that are not subsumed by a larger match and whose
   minimal cost bound is within best_cost. A match i is subsumed by a
   longer-or-equal match j that shares either its (input,tm) start pair
   or its (input,tm) end pair. Result is in reverse input order
   (iteration runs back-to-front). NOTE(review): O(n^2) in the number
   of matches -- acceptable only because match lists are pruned early. */
vector< Match > prune_matches( const vector< Match > &match, int best_cost )
{
  //cerr << "\tpruning";
  vector< Match > pruned;
  for(int i=match.size()-1; i>=0; i--)
  {
    //cerr << " (" << match[i].input_start << "," << match[i].input_end
    //		 << " ; " << match[i].tm_start << "," << match[i].tm_end
    //		 << " * " << match[i].min_cost << ")";

    //if (match[i].min_cost > best_cost)
    //	continue;

    bool subsumed = false;
    for(int j=match.size()-1; j>=0; j--)
    {
      if (i!=j // do not compare match with itself
          && ( match[i].input_end - match[i].input_start <=
               match[j].input_end - match[j].input_start ) // i shorter than j
          && ((match[i].input_start == match[j].input_start &&
               match[i].tm_start == match[j].tm_start ) ||
              (match[i].input_end == match[j].input_end &&
               match[i].tm_end == match[j].tm_end) ) )
      {
        subsumed = true;
      }
    }
    if (! subsumed && match[i].min_cost <= best_cost)
    {
      //cerr << "*";
      pruned.push_back( match[i] );
    }
  }
  //cerr << endl;
  return pruned;
}
/* A* parsing method to compute string edit distance */
// Combines the elementary n-gram matches of one TM sentence bottom-up into
// larger, non-overlapping spans. Each span carries an admissible lower bound
// (min_cost) and an attainable upper bound (max_cost) on the full-sentence
// edit distance; combinations whose lower bound already exceeds the best
// known cost are discarded (the A* pruning). Returns the tightest upper
// bound found for this sentence, and tightens the caller's best_cost
// (passed by reference) whenever a better global bound is discovered.
int parse_matches( vector< Match > &match, int input_length, int tm_length, int &best_cost )
{
// cerr << "sentence has " << match.size() << " matches, best cost: " << best_cost << ", lengths input: " << input_length << " tm: " << tm_length << endl;
// trivial cases: a single match -> its upper bound; no match at all ->
// delete the whole input and insert the whole TM sentence
if (match.size() == 1)
return match[0].max_cost;
if (match.size() == 0)
return input_length+tm_length;
// start from the worst-case upper bound, then tighten with each match
int this_best_cost = input_length + tm_length;
for(int i=0;i<match.size();i++)
{
this_best_cost = min( this_best_cost, match[i].max_cost );
}
// cerr << "\tthis best cost: " << this_best_cost << endl;
// bottom up combination of spans
// multi_match[k] holds all spans built from k+1 elementary matches
vector< vector< Match > > multi_match;
multi_match.push_back( match );
int match_level = 1;
while(multi_match[ match_level-1 ].size()>0)
{
// init vector
vector< Match > empty;
multi_match.push_back( empty );
// a level-k span is any valid pairing of a level-f span with a
// level-(k-1-f) span; only f <= (k-1)/2 is needed by symmetry
for(int first_level = 0; first_level <= (match_level-1)/2; first_level++)
{
int second_level = match_level - first_level -1;
//cerr << "\tcombining level " << first_level << " and " << second_level << endl;
vector< Match > &first_match = multi_match[ first_level ];
vector< Match > &second_match = multi_match[ second_level ];
for(int i1 = 0; i1 < first_match.size(); i1++) {
for(int i2 = 0; i2 < second_match.size(); i2++) {
// do not combine the same pair twice
if (first_level == second_level && i2 <= i1)
{
continue;
}
// get sorted matches (first is before second)
Match *first, *second;
if (first_match[i1].input_start < second_match[i2].input_start )
{
first = &first_match[i1];
second = &second_match[i2];
}
else
{
second = &first_match[i1];
first = &second_match[i2];
}
//cerr << "\tcombining "
// << "(" << first->input_start << "," << first->input_end << "), "
// << first->tm_start << " [" << first->internal_cost << "]"
// << " with "
// << "(" << second->input_start << "," << second->input_end << "), "
// << second->tm_start<< " [" << second->internal_cost << "]"
// << endl;
// do not process overlapping matches
if (first->input_end >= second->input_start)
{
continue;
}
// no overlap / mismatch in tm
if (first->tm_end >= second->tm_start)
{
continue;
}
// compute cost
// min_cost: everything outside the matched spans aligns as well as
// possible; max_cost: it all has to be edited
int min_cost = 0;
int max_cost = 0;
// initial
min_cost += abs( first->input_start - first->tm_start );
max_cost += max( first->input_start, first->tm_start );
// same number of words, but not sent. start -> cost is at least 1
if (first->input_start == first->tm_start && first->input_start > 0)
{
min_cost++;
}
// in-between
int skipped_words = second->input_start - first->input_end -1;
int skipped_words_tm = second->tm_start - first->tm_end -1;
int internal_cost = max( skipped_words, skipped_words_tm );
internal_cost += first->internal_cost + second->internal_cost;
min_cost += internal_cost;
max_cost += internal_cost;
// final
min_cost += abs( (tm_length-1 - second->tm_end) -
(input_length-1 - second->input_end) );
max_cost += max( (tm_length-1 - second->tm_end),
(input_length-1 - second->input_end) );
// same number of words, but not sent. end -> cost is at least 1
if ( ( input_length-1 - second->input_end
== tm_length-1 - second->tm_end )
&& input_length-1 != second->input_end )
{
min_cost++;
}
// cerr << "\tcost: " << min_cost << "-" << max_cost << endl;
// if worst than best cost, forget it
if (min_cost > best_cost)
{
continue;
}
// add match
Match new_match( first->input_start,
second->input_end,
first->tm_start,
second->tm_end,
min_cost,
max_cost,
internal_cost);
multi_match[ match_level ].push_back( new_match );
// cerr << "\tstored\n";
// possibly updating this_best_cost
if (max_cost < this_best_cost)
{
// cerr << "\tupdating this best cost to " << max_cost << "\n";
this_best_cost = max_cost;
// possibly updating best_cost
if (max_cost < best_cost)
{
// cerr << "\tupdating best cost to " << max_cost << "\n";
best_cost = max_cost;
}
}
}
}
}
match_level++;
}
return this_best_cost;
}
#endif

View File

@ -0,0 +1,214 @@
#!/usr/bin/perl -w
use strict;

# Build an XML-annotated decoder input file: for every input sentence,
# read its best fuzzy match (score ||| tm-sentence-id ||| edit-path)
# and emit a "frame" that reuses the matched TM target translation.
# All corpus arrays below are index-aligned line-by-line.
my $DEBUG = 1;
my $match_file = "tm/BEST.acquis-xml-escaped.4.uniq";
my $source_file = "data/acquis.truecased.4.en.uniq";
my $target_file = "data/acquis.truecased.4.fr.uniq.most-frequent";
my $alignment_file = "data/acquis.truecased.4.align.uniq.most-frequent";
my $out_file = "data/ac-test.input.xml.4.uniq";
my $in_file = "evaluation/ac-test.input.tc.4";
# (alternative configurations, kept for reference)
#my $match_file = "tm/BEST.acquis-xml-escaped.4";
#my $source_file = "corpus/acquis.truecased.4.en";
#my $target_file = "corpus/acquis.truecased.4.fr";
#my $alignment_file = "model/aligned.4.grow-diag-final-and";
#my $out_file = "data/ac-test.input.xml.4";
#my $in_file = "evaluation/ac-test.input.tc.4";
#my $match_file = "tm/BEST.acquis.with";
#my $source_file = "../acquis-truecase/corpus/acquis.truecased.190.en";
#my $target_file = "../acquis-truecase/corpus/acquis.truecased.190.fr";
#my $alignment_file = "../acquis-truecase/model/aligned.190.grow-diag-final-and";
#my $out_file = "data/ac-test.input.xml";
#my $in_file = "evaluation/ac-test.input.tc.1";

# slurp all corpora (chomp only strips newlines, unlike the old chop)
my @INPUT = `cat $in_file`; chomp(@INPUT);
my @SOURCE = `cat $source_file`; chomp(@SOURCE);
my @TARGET = `cat $target_file`; chomp(@TARGET);
my @ALIGNMENT = `cat $alignment_file`; chomp(@ALIGNMENT);

# three-argument open with error checking (was unchecked two-arg open)
open(my $match_fh, '<', $match_file) or die "cannot open $match_file: $!";
open(my $frame_fh, '>', $out_file) or die "cannot open $out_file: $!";
# one match line per input sentence (was a hard-coded count of 4107)
for(my $i=0; $i<scalar(@INPUT); $i++) {
    # get match data
    my $match = <$match_fh>;
    last unless defined $match;   # stop gracefully if the match file is short
    chomp($match);
    my ($score,$sentence,$path) = split(/ \|\|\| /,$match);

    # construct frame from the matched TM sentence
    if ($sentence < 1e9 && $sentence >= 0) {
        my $frame = create_xml($SOURCE[$sentence],
                               $INPUT[$i],
                               $TARGET[$sentence],
                               $ALIGNMENT[$sentence],
                               $path);
        print $frame_fh $frame."\n";
    }
    # no frame -> output source unchanged
    else {
        print $frame_fh $INPUT[$i]."\n";
    }
}
close($frame_fh) or die "cannot close $out_file: $!";
close($match_fh);
# create_xml($source, $input, $target, $alignment, $path)
# Build one decoder frame from a fuzzy match:
#   $source    - TM source sentence that matched the input
#   $input     - new input sentence to translate
#   $target    - TM target sentence (translation to be reused)
#   $alignment - word alignment "s-t s-t ..." between $source and $target
#   $path      - edit path input->source, one char per step:
#                M(atch), S(ubstitute), I(nsert), D(elete)
# STEP 1 walks the edit path, invalidates target words aligned to
# mismatched source words, and records which input words must be inserted
# before which target position (%FRAME_INPUT, keyed by target index;
# key -1 means "before the sentence").
# STEP 2 emits runs of still-valid target words wrapped in
# <xml translation="..."> markup, interleaved with the input words that
# still need translation.
# NOTE(review): the print statements below run unconditionally;
# presumably they were meant to be guarded by $DEBUG -- confirm.
sub create_xml {
my ($source,$input,$target,$alignment,$path) = @_;
my @INPUT = split(/ /,$input);
my @SOURCE = split(/ /,$source);
my @TARGET = split(/ /,$target);
my %ALIGN = &create_alignment($alignment);
my %FRAME_INPUT;
# one flag per target word: 1 = reuse it, 0 = invalidated by a mismatch
my @TARGET_BITMAP;
foreach (@TARGET) { push @TARGET_BITMAP,1 }
### STEP 1: FIND MISMATCHES
# $s / $i track the current source / input position along the edit path
my ($s,$i) = (0,0);
my $currently_matching = 0;
my ($start_s,$start_i) = (0,0);
$path .= "X"; # indicate end
print "$input\n$source\n$target\n$path\n";
for(my $p=0;$p<length($path);$p++) {
my $action = substr($path,$p,1);
# beginning of a mismatch
if ($currently_matching && $action ne "M" && $action ne "X") {
$start_i = $i;
$start_s = $s;
$currently_matching = 0;
}
# end of a mismatch
elsif (!$currently_matching &&
($action eq "M" || $action eq "X")) {
# remove use of affected target words
for(my $ss = $start_s; $ss<$s; $ss++) {
foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) {
$TARGET_BITMAP[$tt] = 0;
}
# also remove enclosed unaligned words?
}
# are there input words that need to be inserted ?
print "($start_i<$i)?\n";
if ($start_i<$i) {
# take note of input words to be inserted
my $insertion = "";
for(my $ii = $start_i; $ii<$i; $ii++) {
$insertion .= $INPUT[$ii]." ";
}
# find position for inserted input words
# find first removed target word
# 1000 acts as a sentinel for "no aligned target word found"
my $start_t = 1000;
for(my $ss = $start_s; $ss<$s; $ss++) {
foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) {
$start_t = $tt if $tt < $start_t;
}
}
# end of sentence? add to end
if ($start_t == 1000 && $i > $#INPUT) {
$start_t = $#TARGET;
}
# backtrack to previous words if unaligned
if ($start_t == 1000) {
$start_t = -1;
for(my $ss = $s-1; $start_t==-1 && $ss>=0; $ss--) {
foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) {
$start_t = $tt if $tt > $start_t;
}
}
}
$FRAME_INPUT{$start_t} .= $insertion;
}
$currently_matching = 1;
}
print "$action $s $i ($start_s $start_i) $currently_matching";
if ($action ne "I") {
print " ->";
foreach my $tt (keys %{${$ALIGN{'s'}}[$s]}) {
print " ".$tt;
}
}
print "\n";
# advance positions: I consumes only input, D consumes only source
$s++ unless $action eq "I";
$i++ unless $action eq "D";
}
print $target."\n";
foreach (@TARGET_BITMAP) { print $_; } print "\n";
foreach (sort keys %FRAME_INPUT) {
print "$_: $FRAME_INPUT{$_}\n";
}
### STEP 2: BUILD FRAME
# modify frame
my $frame = "";
$frame = $FRAME_INPUT{-1} if defined $FRAME_INPUT{-1};
my $currently_included = 0;
my $start_t = -1;
push @TARGET_BITMAP,0; # indicate end
for(my $t=0;$t<=scalar(@TARGET);$t++) {
# beginning of tm target inclusion
if (!$currently_included && $TARGET_BITMAP[$t]) {
$start_t = $t;
$currently_included = 1;
}
# end of tm target inclusion (not included word or inserted input)
elsif ($currently_included &&
(!$TARGET_BITMAP[$t] || defined($FRAME_INPUT{$t}))) {
# add xml (unless change is at the beginning of the sentence
if ($start_t >= 0) {
my $target = "";
print "for(tt=$start_t;tt<$t+$TARGET_BITMAP[$t]);\n";
for(my $tt=$start_t;$tt<$t+$TARGET_BITMAP[$t];$tt++) {
$target .= $TARGET[$tt] . " ";
}
chop($target);
$frame .= "<xml translation=\"$target\"> x </xml> ";
}
$currently_included = 0;
}
# splice in input words queued for insertion at this target position
$frame .= $FRAME_INPUT{$t} if defined $FRAME_INPUT{$t};
print "$TARGET_BITMAP[$t] $t ($start_t) $currently_included\n";
}
print $frame."\n-------------------------------------\n";
return $frame;
}
# Parse a GIZA-style alignment line ("0-0 1-2 ...") into two lookup
# tables: under key 's', an array indexed by source position whose
# entries map aligned target positions to their counts; under key 't',
# the symmetric target-to-source table. Returned as a flat hash.
sub create_alignment {
    my ($line) = @_;
    my (@by_source, @by_target);
    for my $pair (split / /, $line) {
        my ($src_pos, $tgt_pos) = split /\-/, $pair;
        $by_source[$src_pos]{$tgt_pos}++;
        $by_target[$tgt_pos]{$src_pos}++;
    }
    return ( 's' => \@by_source, 't' => \@by_target );
}

View File

@ -0,0 +1,982 @@
#include <stdio.h>
#include <stdlib.h>
#include <getopt.h>
#include <vector>
#include <map>
#include <string>
#include <algorithm>
#include <iostream>
#include <fstream>
#include <cstring>
#include <time.h>
#include "Vocabulary.h"
#include "SuffixArray.h"
/** This implementation is explained in
Koehn and Senellart: "Fast Approximate String Matching
with Suffix Arrays and A* Parsing" (AMTA 2010) ***/
using namespace std;
Vocabulary vocabulary;
int basic_flag = false;
int lsed_flag = true;
int refined_flag = true;
int length_filter_flag = true;
int parse_flag = true;
int min_match = 70;
int multiple_flag = false;
int multiple_slack = 0;
int multiple_max = 100;
void load_corpus( char* fileName, vector< vector< WORD_ID > > &corpus )
{
ifstream fileStream;
fileStream.open(fileName);
if (!fileStream) {
cerr << "file not found: " << fileName << endl;
exit(1);
}
istream *fileStreamP = &fileStream;
char line[LINE_MAX_LENGTH];
while(true)
{
SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n');
if (fileStreamP->eof()) break;
corpus.push_back( vocabulary.Tokenize( line ) );
}
}
/* Letter string edit distance, e.g. sub 'their' to 'there' costs 2 */
// global cache for word pairs
map< pair< WORD_ID, WORD_ID >, unsigned int > lsed;

/* Character-level edit distance between the surface forms of two words
   given by vocabulary id. Results are memoized in the global cache
   'lsed', keyed by the (ordered) id pair.
   Improvements over the original: the DP matrix is a std::vector
   (the calloc/free pairs were leak-prone and not exception safe), and
   characters are compared directly instead of allocating one-character
   substrings in the inner loop. */
unsigned int letter_sed( WORD_ID aIdx, WORD_ID bIdx )
{
	// check if already computed -> lookup in cache
	pair< WORD_ID, WORD_ID > pIdx = make_pair( aIdx, bIdx );
	map< pair< WORD_ID, WORD_ID >, unsigned int >::const_iterator lookup = lsed.find( pIdx );
	if (lookup != lsed.end())
	{
		return lookup->second;
	}

	// get surface strings for word indices
	const string &a = vocabulary.GetWord( aIdx );
	const string &b = vocabulary.GetWord( bIdx );

	// initialize cost matrix: first row/column = delete/insert everything
	vector< vector< unsigned int > > cost( a.size()+1, vector< unsigned int >( b.size()+1, 0 ) );
	for( unsigned int i=0; i<=a.size(); i++ ) {
		cost[i][0] = i;
	}
	for( unsigned int j=0; j<=b.size(); j++ ) {
		cost[0][j] = j;
	}

	// core string edit distance loop
	for( unsigned int i=1; i<=a.size(); i++ ) {
		for( unsigned int j=1; j<=b.size(); j++ ) {
			unsigned int ins = cost[i-1][j] + 1;
			unsigned int del = cost[i][j-1] + 1;
			// direct character comparison (was substr(...).compare(...))
			bool match = ( a[i-1] == b[j-1] );
			unsigned int diag = cost[i-1][j-1] + (match ? 0 : 1);
			cost[i][j] = min( diag, min( ins, del ) );
		}
	}

	// cache and return result
	unsigned int final_cost = cost[a.size()][b.size()];
	lsed[ pIdx ] = final_cost;
	return final_cost;
}
/* string edit distance implementation */
unsigned int sed( const vector< WORD_ID > &a, const vector< WORD_ID > &b, string &best_path, bool use_letter_sed ) {
// initialize cost and path matrices
unsigned int **cost = (unsigned int**) calloc( sizeof( unsigned int* ), a.size()+1 );
char **path = (char**) calloc( sizeof( char* ), a.size()+1 );
for( unsigned int i=0; i<=a.size(); i++ ) {
cost[i] = (unsigned int*) calloc( sizeof(unsigned int), b.size()+1 );
path[i] = (char*) calloc( sizeof(char), b.size()+1 );
if (i>0)
{
cost[i][0] = cost[i-1][0];
if (use_letter_sed)
{
cost[i][0] += vocabulary.GetWord( a[i-1] ).size();
}
else
{
cost[i][0]++;
}
}
else
{
cost[i][0] = 0;
}
path[i][0] = 'I';
}
for( unsigned int j=0; j<=b.size(); j++ ) {
if (j>0)
{
cost[0][j] = cost[0][j-1];
if (use_letter_sed)
{
cost[0][j] += vocabulary.GetWord( b[j-1] ).size();
}
else
{
cost[0][j]++;
}
}
else
{
cost[0][j] = 0;
}
path[0][j] = 'D';
}
// core string edit distance algorithm
for( unsigned int i=1; i<=a.size(); i++ ) {
for( unsigned int j=1; j<=b.size(); j++ ) {
unsigned int ins = cost[i-1][j];
unsigned int del = cost[i][j-1];
unsigned int match;
if (use_letter_sed)
{
ins += vocabulary.GetWord( a[i-1] ).size();
del += vocabulary.GetWord( b[j-1] ).size();
match = letter_sed( a[i-1], b[j-1] );
}
else
{
ins++;
del++;
match = ( a[i-1] == b[j-1] ) ? 0 : 1;
}
unsigned int diag = cost[i-1][j-1] + match;
char action = (ins < del) ? 'I' : 'D';
unsigned int min = (ins < del) ? ins : del;
if (diag < min)
{
action = (match>0) ? 'S' : 'M';
min = diag;
}
cost[i][j] = min;
path[i][j] = action;
}
}
// construct string for best path
unsigned int i = a.size();
unsigned int j = b.size();
best_path = "";
while( i>0 || j>0 )
{
best_path = path[i][j] + best_path;
if (path[i][j] == 'I')
{
i--;
}
else if (path[i][j] == 'D')
{
j--;
}
else
{
i--;
j--;
}
}
// clear out memory
unsigned int final = cost[a.size()][b.size()];
for( unsigned int i=0; i<=a.size(); i++ ) {
free( cost[i] );
free( path[i] );
}
free( cost );
free( path );
// return result
return final;
}
/* utility function: length of a sentence in characters
   (spaces do not count) -- sums the surface length of each word */
unsigned int compute_length( const vector< WORD_ID > &sentence )
{
	unsigned int total = 0;
	for( unsigned int k=0; k<sentence.size(); k++ )
	{
		total += vocabulary.GetWord( sentence[k] ).size();
	}
	return total;
}
/* brute force method: compare input to all corpus sentences */
int basic_fuzzy_match( vector< vector< WORD_ID > > source,
vector< vector< WORD_ID > > input )
{
// go through input set...
for(unsigned int i=0;i<input.size();i++)
{
bool use_letter_sed = false;
// compute sentence length and worst allowed cost
unsigned int input_length;
if (use_letter_sed)
{
input_length = compute_length( input[i] );
}
else
{
input_length = input[i].size();
}
unsigned int best_cost = input_length * (100-min_match) / 100 + 2;
string best_path = "";
int best_match = -1;
// go through all corpus sentences
for(unsigned int s=0;s<source.size();s++)
{
int source_length;
if (use_letter_sed)
{
source_length = compute_length( source[s] );
}
else
{
source_length = source[s].size();
}
int diff = abs((int)source_length - (int)input_length);
if (length_filter_flag && (diff >= best_cost))
{
continue;
}
// compute string edit distance
string path;
unsigned int cost = sed( input[i], source[s], path, use_letter_sed );
// update if new best
if (cost < best_cost)
{
best_cost = cost;
best_path = path;
best_match = s;
}
}
cout << best_cost << " ||| " << best_match << " ||| " << best_path << endl;
}
}
#define MAX_MATCH_COUNT 10000000
/* data structure for an n-gram match between the input sentence and a
   corpus (translation memory) sentence: the span [input_start,input_end]
   of the input corresponds to the span [tm_start,tm_end] of the TM
   sentence, with bounds on the total sentence edit cost */
class Match {
public:
	int input_start;   // first covered input position (inclusive)
	int input_end;     // last covered input position (inclusive)
	int tm_start;      // first covered TM position (inclusive)
	int tm_end;        // last covered TM position (inclusive)
	int min_cost;      // admissible lower bound on total sentence cost
	int max_cost;      // attainable upper bound on total sentence cost
	int internal_cost; // cost already incurred inside the span
	Match( int inputStart, int inputEnd, int tmStart, int tmEnd,
	       int minCost, int maxCost, int internalCost )
		: input_start(inputStart)
		, input_end(inputEnd)
		, tm_start(tmStart)
		, tm_end(tmEnd)
		, min_cost(minCost)
		, max_cost(maxCost)
		, internal_cost(internalCost)
	{}
};
map< WORD_ID,vector< int > > single_word_index;
/* definition of short matches
   very short n-gram matches (1-grams) will not be looked up in
   the suffix array, since there are too many matches
   and for longer sentences, at least one 2-gram match must occur */
/* Maximum n-gram length treated as a "short" match for an input of the
   given word length: 1 for inputs of five or more words when the refined
   method is enabled, otherwise 0 (no short-match handling). */
inline int short_match_max_length( int input_length )
{
	if ( refined_flag && input_length >= 5 )
		return 1;
	return 0;
}
/* if we have non-short matches in a sentence, we need to
   take a closer look at it.
   Builds the global hash map from each input word id to the list of
   positions where it occurs in the input sentence (consumed later by
   add_short_matches). Rebuilt once per input sentence; a no-op when
   short-match handling is disabled for this sentence length. */
void init_short_matches( const vector< WORD_ID > &input )
{
	if (short_match_max_length( input.size() ) == 0)
		return;

	single_word_index.clear();

	// store input words and their positions in hash map
	// (operator[] default-constructs the position vector on first use)
	for(int pos=0; pos<(int)input.size(); pos++)
	{
		single_word_index[ input[pos] ].push_back( pos );
	}
}
/* add all short matches to list of matches for a sentence */
// For every position in the TM sentence whose word also occurs in the
// input (positions cached by init_short_matches), add a 1-gram Match
// with admissible cost bounds, unless its lower bound already exceeds
// best_cost. No-op when short-match handling is off for this length.
void add_short_matches( vector< Match > &match, const vector< WORD_ID > &tm, int input_length, int best_cost )
{
int max_length = short_match_max_length( input_length );
if (max_length == 0)
return;
int tm_length = tm.size();
map< WORD_ID,vector< int > >::iterator input_word_hit;
for(int t_pos=0; t_pos<tm.size(); t_pos++)
{
input_word_hit = single_word_index.find( tm[t_pos] );
if (input_word_hit != single_word_index.end())
{
vector< int > &position_vector = input_word_hit->second;
for(int j=0; j<position_vector.size(); j++)
{
int &i_pos = position_vector[j];
// before match
// words before the 1-gram: at best realigned (difference of the
// two prefix lengths), at worst all edited (the longer prefix)
int max_cost = max( i_pos , t_pos );
int min_cost = abs( i_pos - t_pos );
// equal-length prefixes that are not at sentence start must differ
// in at least one word -> lower bound bumps by 1
if ( i_pos>0 && i_pos == t_pos )
min_cost++;
// after match
// same reasoning for the suffixes following the 1-gram
max_cost += max( (input_length-i_pos) , (tm_length-t_pos));
min_cost += abs( (input_length-i_pos) - (tm_length-t_pos));
if ( i_pos != input_length-1 && (input_length-i_pos) == (tm_length-t_pos))
min_cost++;
// keep only 1-gram matches that could still beat the best cost
if (min_cost <= best_cost)
{
Match new_match( i_pos,i_pos, t_pos,t_pos, min_cost,max_cost,0 );
match.push_back( new_match );
}
}
}
}
}
/* remove matches that are subsumed by a larger match */
/* A match i is subsumed when some other match j spans at least as many
   input words and shares either its start corner or its end corner in
   both the input and the TM sentence; such a match can never lead to a
   better parse than j. Matches whose minimal cost already exceeds
   best_cost are dropped as well. Returns the surviving matches
   (in reverse order of the input, as before).
   Fix: break out of the inner scan as soon as one subsuming match is
   found -- the original always completed the full O(n) inner loop even
   after subsumption was established. Output is unchanged. */
vector< Match > prune_matches( const vector< Match > &match, int best_cost )
{
	vector< Match > pruned;
	for(int i=match.size()-1; i>=0; i--)
	{
		bool subsumed = false;
		for(int j=match.size()-1; j>=0; j--)
		{
			if (i!=j // do not compare match with itself
			    && ( match[i].input_end - match[i].input_start <=
			         match[j].input_end - match[j].input_start ) // i shorter than j
			    && ((match[i].input_start == match[j].input_start &&
			         match[i].tm_start == match[j].tm_start ) ||
			        (match[i].input_end == match[j].input_end &&
			         match[i].tm_end == match[j].tm_end) ) )
			{
				subsumed = true;
				break; // one subsuming match is enough
			}
		}
		if (! subsumed && match[i].min_cost <= best_cost)
		{
			pruned.push_back( match[i] );
		}
	}
	return pruned;
}
/* A* parsing method to compute string edit distance */
// Combines the elementary n-gram matches of one TM sentence bottom-up into
// larger, non-overlapping spans. Each span carries an admissible lower bound
// (min_cost) and an attainable upper bound (max_cost) on the full-sentence
// edit distance; combinations whose lower bound already exceeds the best
// known cost are discarded (the A* pruning). Returns the tightest upper
// bound found for this sentence, and tightens the caller's best_cost
// (passed by reference) whenever a better global bound is discovered.
int parse_matches( vector< Match > &match, int input_length, int tm_length, int &best_cost )
{
// cerr << "sentence has " << match.size() << " matches, best cost: " << best_cost << ", lengths input: " << input_length << " tm: " << tm_length << endl;
// trivial cases: a single match -> its upper bound; no match at all ->
// delete the whole input and insert the whole TM sentence
if (match.size() == 1)
return match[0].max_cost;
if (match.size() == 0)
return input_length+tm_length;
// start from the worst-case upper bound, then tighten with each match
int this_best_cost = input_length + tm_length;
for(int i=0;i<match.size();i++)
{
this_best_cost = min( this_best_cost, match[i].max_cost );
}
// cerr << "\tthis best cost: " << this_best_cost << endl;
// bottom up combination of spans
// multi_match[k] holds all spans built from k+1 elementary matches
vector< vector< Match > > multi_match;
multi_match.push_back( match );
int match_level = 1;
while(multi_match[ match_level-1 ].size()>0)
{
// init vector
vector< Match > empty;
multi_match.push_back( empty );
// a level-k span is any valid pairing of a level-f span with a
// level-(k-1-f) span; only f <= (k-1)/2 is needed by symmetry
for(int first_level = 0; first_level <= (match_level-1)/2; first_level++)
{
int second_level = match_level - first_level -1;
//cerr << "\tcombining level " << first_level << " and " << second_level << endl;
vector< Match > &first_match = multi_match[ first_level ];
vector< Match > &second_match = multi_match[ second_level ];
for(int i1 = 0; i1 < first_match.size(); i1++) {
for(int i2 = 0; i2 < second_match.size(); i2++) {
// do not combine the same pair twice
if (first_level == second_level && i2 <= i1)
{
continue;
}
// get sorted matches (first is before second)
Match *first, *second;
if (first_match[i1].input_start < second_match[i2].input_start )
{
first = &first_match[i1];
second = &second_match[i2];
}
else
{
second = &first_match[i1];
first = &second_match[i2];
}
//cerr << "\tcombining "
// << "(" << first->input_start << "," << first->input_end << "), "
// << first->tm_start << " [" << first->internal_cost << "]"
// << " with "
// << "(" << second->input_start << "," << second->input_end << "), "
// << second->tm_start<< " [" << second->internal_cost << "]"
// << endl;
// do not process overlapping matches
if (first->input_end >= second->input_start)
{
continue;
}
// no overlap / mismatch in tm
if (first->tm_end >= second->tm_start)
{
continue;
}
// compute cost
// min_cost: everything outside the matched spans aligns as well as
// possible; max_cost: it all has to be edited
int min_cost = 0;
int max_cost = 0;
// initial
min_cost += abs( first->input_start - first->tm_start );
max_cost += max( first->input_start, first->tm_start );
// same number of words, but not sent. start -> cost is at least 1
if (first->input_start == first->tm_start && first->input_start > 0)
{
min_cost++;
}
// in-between
int skipped_words = second->input_start - first->input_end -1;
int skipped_words_tm = second->tm_start - first->tm_end -1;
int internal_cost = max( skipped_words, skipped_words_tm );
internal_cost += first->internal_cost + second->internal_cost;
min_cost += internal_cost;
max_cost += internal_cost;
// final
min_cost += abs( (tm_length-1 - second->tm_end) -
(input_length-1 - second->input_end) );
max_cost += max( (tm_length-1 - second->tm_end),
(input_length-1 - second->input_end) );
// same number of words, but not sent. end -> cost is at least 1
if ( ( input_length-1 - second->input_end
== tm_length-1 - second->tm_end )
&& input_length-1 != second->input_end )
{
min_cost++;
}
// cerr << "\tcost: " << min_cost << "-" << max_cost << endl;
// if worst than best cost, forget it
if (min_cost > best_cost)
{
continue;
}
// add match
Match new_match( first->input_start,
second->input_end,
first->tm_start,
second->tm_end,
min_cost,
max_cost,
internal_cost);
multi_match[ match_level ].push_back( new_match );
// cerr << "\tstored\n";
// possibly updating this_best_cost
if (max_cost < this_best_cost)
{
// cerr << "\tupdating this best cost to " << max_cost << "\n";
this_best_cost = max_cost;
// possibly updating best_cost
if (max_cost < best_cost)
{
// cerr << "\tupdating best cost to " << max_cost << "\n";
best_cost = max_cost;
}
}
}
}
}
match_level++;
}
return this_best_cost;
}
// Driver for the fuzzy matcher. Pipeline per Koehn & Senellart (AMTA 2010):
// 1. parse command-line flags; 2. load input and corpus; 3. either run the
// brute-force matcher (--basic) or build a suffix array over the corpus and,
// for each input sentence: collect n-gram match ranges, turn them into
// per-TM-sentence Match lists with cost bounds, validate candidates via A*
// parsing (or full edit distance), and print the best match(es) to stdout.
int main(int argc, char* argv[])
{
vector< vector< WORD_ID > > source, input;
// parse command-line flags with getopt_long; most options just set the
// global flag variables declared at the top of the file
while(1) {
static struct option long_options[] = {
{"basic", no_argument, &basic_flag, 1},
{"word", no_argument, &lsed_flag, 0},
{"unrefined", no_argument, &refined_flag, 0},
{"nolengthfilter", no_argument, &length_filter_flag, 0},
{"noparse", no_argument, &parse_flag, 0},
{"multiple", no_argument, &multiple_flag, 1},
{"minmatch", required_argument, 0, 'm'},
{0, 0, 0, 0}
};
int option_index = 0;
int c = getopt_long (argc, argv, "m:", long_options, &option_index);
if (c == -1) break;
switch (c) {
case 0:
// if (long_options[option_index].flag != 0)
// break;
// printf ("option %s", long_options[option_index].name);
// if (optarg)
// printf (" with arg %s", optarg);
// printf ("\n");
break;
case 'm':
min_match = atoi(optarg);
if (min_match < 1 || min_match > 100) {
cerr << "error: --minmatch must have value in range 1..100\n";
exit(1);
}
cerr << "setting min match to " << min_match << endl;
break;
default:
cerr << "usage: syntax: ./fuzzy-match input corpus [--basic] [--word] [--minmatch 1..100]\n";
exit(1);
}
}
if (lsed_flag) { cerr << "lsed\n"; }
if (basic_flag) { cerr << "basic\n"; }
if (refined_flag) { cerr << "refined\n"; }
if (length_filter_flag) { cerr << "length filter\n"; }
if (parse_flag) { cerr << "parse\n"; }
// exit(1);
// two positional arguments required: input file, then corpus file
if (optind+2 != argc) {
cerr << "syntax: ./fuzzy-match input corpus [--basic] [--word] [--minmatch 1..100]\n";
exit(1);
}
cerr << "loading corpus...\n";
load_corpus(argv[optind], input);
load_corpus(argv[optind+1], source);
// ./fuzzy-match input corpus [-basic]
// load_corpus("../corpus/tm.truecased.4.en", source);
// load_corpus("../corpus/tm.truecased.4.it", target);
// load_corpus("../evaluation/test.input.tc.4", input);
// load_corpus("../../acquis-truecase/corpus/acquis.truecased.190.en", source);
// load_corpus("../../acquis-truecase/evaluation/ac-test.input.tc.190", input);
// load_corpus("../corpus/tm.truecased.16.en", source);
// load_corpus("../evaluation/test.input.tc.16", input);
// brute-force mode: compare every input against every corpus sentence
if (basic_flag) {
cerr << "using basic method\n";
clock_t start_main_clock2 = clock();
basic_fuzzy_match( source, input );
cerr << "total: " << (1000 * (clock()-start_main_clock2) / CLOCKS_PER_SEC) << endl;
exit(1);
}
cerr << "number of input sentences " << input.size() << endl;
cerr << "creating suffix array...\n";
// SuffixArray suffixArray( "../corpus/tm.truecased.4.en" );
// SuffixArray suffixArray( "../../acquis-truecase/corpus/acquis.truecased.190.en" );
SuffixArray suffixArray( argv[optind+1] );
clock_t start_main_clock = clock();
// looping through all input sentences...
cerr << "looping...\n";
for(unsigned int i=0;i<input.size();i++)
{
clock_t start_clock = clock();
// if (i % 10 == 0) cerr << ".";
int input_id = i; // clean up this mess!
// establish some basic statistics
// int input_length = compute_length( input[i] );
int input_length = input[i].size();
int best_cost = input_length * (100-min_match) / 100 + 1;
int match_count = 0; // how many substring matches to be considered
//cerr << endl << "sentence " << i << ", length " << input_length << ", best_cost " << best_cost << endl;
// find match ranges in suffix array
// match_range[start][len-1] is the suffix-array index range of corpus
// occurrences of the input n-gram starting at 'start' of length 'len'
vector< vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > > match_range;
for(size_t start=0;start<input[i].size();start++)
{
SuffixArray::INDEX prior_first_match = 0;
SuffixArray::INDEX prior_last_match = suffixArray.GetSize()-1;
vector< string > substring;
bool stillMatched = true;
vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > matchedAtThisStart;
//cerr << "start: " << start;
// extend the n-gram one word at a time, narrowing the prior range
for(int word=start; stillMatched && word<input[i].size(); word++)
{
substring.push_back( vocabulary.GetWord( input[i][word] ) );
// only look up, if needed (i.e. no unnecessary short gram lookups)
// if (! word-start+1 <= short_match_max_length( input_length ) )
// {
SuffixArray::INDEX first_match, last_match;
stillMatched = false;
if (suffixArray.FindMatches( substring, first_match, last_match, prior_first_match, prior_last_match ) )
{
stillMatched = true;
matchedAtThisStart.push_back( make_pair( first_match, last_match ) );
//cerr << " (" << first_match << "," << last_match << ")";
//cerr << " " << ( last_match - first_match + 1 );
prior_first_match = first_match;
prior_last_match = last_match;
}
//}
}
//cerr << endl;
match_range.push_back( matchedAtThisStart );
}
clock_t clock_range = clock();
map< int, vector< Match > > sentence_match;
map< int, int > sentence_match_word_count;
// go through all matches, longest first
for(int length = input[i].size(); length >= 1; length--)
{
// do not create matches, if these are handled by the short match function
if (length <= short_match_max_length( input_length ) )
{
continue;
}
unsigned int count = 0;
for(int start = 0; start <= input[i].size() - length; start++)
{
if (match_range[start].size() >= length)
{
pair< SuffixArray::INDEX, SuffixArray::INDEX > &range = match_range[start][length-1];
// cerr << " (" << range.first << "," << range.second << ")";
count += range.second - range.first + 1;
// NOTE(review): this loop variable shadows the outer sentence index i
for(SuffixArray::INDEX i=range.first; i<=range.second; i++)
{
int position = suffixArray.GetPosition( i );
// sentence length mismatch
size_t sentence_id = suffixArray.GetSentence( position );
int sentence_length = suffixArray.GetSentenceLength( sentence_id );
int diff = abs( (int)sentence_length - (int)input_length );
// cerr << endl << i << "\tsentence " << sentence_id << ", length " << sentence_length;
//if (length <= 2 && input_length>=5 &&
// sentence_match.find( sentence_id ) == sentence_match.end())
// continue;
if (diff > best_cost)
continue;
// compute minimal cost
int start_pos = suffixArray.GetWordInSentence( position );
int end_pos = start_pos + length-1;
// cerr << endl << "\t" << start_pos << "-" << end_pos << " (" << sentence_length << ") vs. "
// << start << "-" << (start+length-1) << " (" << input_length << ")";
// different number of prior words -> cost is at least diff
int min_cost = abs( start - start_pos );
// same number of words, but not sent. start -> cost is at least 1
if (start == start_pos && start>0)
min_cost++;
// different number of remaining words -> cost is at least diff
min_cost += abs( ( sentence_length-1 - end_pos ) -
( input_length-1 - (start+length-1) ) );
// same number of words, but not sent. end -> cost is at least 1
if ( sentence_length-1 - end_pos ==
input_length-1 - (start+length-1)
&& end_pos != sentence_length-1 )
min_cost++;
// cerr << " -> min_cost " << min_cost;
if (min_cost > best_cost)
continue;
// valid match
match_count++;
// compute maximal cost
int max_cost = max( start, start_pos )
+ max( sentence_length-1 - end_pos,
input_length-1 - (start+length-1) );
// cerr << ", max_cost " << max_cost;
Match m = Match( start, start+length-1,
start_pos, start_pos+length-1,
min_cost, max_cost, 0);
sentence_match[ sentence_id ].push_back( m );
sentence_match_word_count[ sentence_id ] += length;
// a perfect upper bound of 0 means an exact match; stop collecting
if (max_cost < best_cost)
{
best_cost = max_cost;
if (best_cost == 0) break;
}
//if (match_count >= MAX_MATCH_COUNT) break;
}
}
// cerr << endl;
if (best_cost == 0) break;
//if (match_count >= MAX_MATCH_COUNT) break;
}
// cerr << count << " matches at length " << length << " in " << sentence_match.size() << " tm." << endl;
if (best_cost == 0) break;
//if (match_count >= MAX_MATCH_COUNT) break;
}
cerr << match_count << " matches in " << sentence_match.size() << " sentences." << endl;
clock_t clock_matches = clock();
// consider each sentence for which we have matches
int old_best_cost = best_cost;
int tm_count_word_match = 0;
int tm_count_word_match2 = 0;
int pruned_match_count = 0;
if (short_match_max_length( input_length ))
{
init_short_matches( input[i] );
}
vector< int > best_tm;
typedef map< int, vector< Match > >::iterator I;
clock_t clock_validation_sum = 0;
// validation: for each candidate TM sentence, filter by matched-word
// count, prune subsumed matches, then score via A* parse or full SED
for(I tm=sentence_match.begin(); tm!=sentence_match.end(); tm++)
{
int tmID = tm->first;
int tm_length = suffixArray.GetSentenceLength(tmID);
vector< Match > &match = tm->second;
add_short_matches( match, source[tmID], input_length, best_cost );
//cerr << "match in sentence " << tmID << ": " << match.size() << " [" << tm_length << "]" << endl;
// quick look: how many words are matched
int words_matched = 0;
for(int m=0;m<match.size();m++) {
if (match[m].min_cost <= best_cost) // makes no difference
words_matched += match[m].input_end - match[m].input_start + 1;
}
if (max(input_length,tm_length) - words_matched > best_cost)
{
if (length_filter_flag) continue;
}
tm_count_word_match++;
// prune, check again how many words are matched
vector< Match > pruned = prune_matches( match, best_cost );
words_matched = 0;
for(int p=0;p<pruned.size();p++) {
words_matched += pruned[p].input_end - pruned[p].input_start + 1;
}
if (max(input_length,tm_length) - words_matched > best_cost)
{
if (length_filter_flag) continue;
}
tm_count_word_match2++;
pruned_match_count += pruned.size();
int prior_best_cost = best_cost;
int cost;
clock_t clock_validation_start = clock();
if (! parse_flag ||
pruned.size()>=10) // to prevent worst cases
{
string path;
cost = sed( input[input_id], source[tmID], path, false );
if (cost < best_cost)
{
best_cost = cost;
}
}
else
{
cost = parse_matches( pruned, input_length, tm_length, best_cost );
// a strictly better bound invalidates all previously kept candidates
if (prior_best_cost != best_cost)
{
best_tm.clear();
}
}
clock_validation_sum += clock() - clock_validation_start;
if (cost == best_cost)
{
best_tm.push_back( tmID );
}
}
cerr << "reduced best cost from " << old_best_cost << " to " << best_cost << endl;
cerr << "tm considered: " << sentence_match.size()
<< " word-matched: " << tm_count_word_match
<< " word-matched2: " << tm_count_word_match2
<< " best: " << best_tm.size() << endl;
cerr << "pruned matches: " << ((float)pruned_match_count/(float)tm_count_word_match2) << endl;
// do not try to find the best ... report multiple matches
if (multiple_flag) {
int input_letter_length = compute_length( input[input_id] );
for(int si=0; si<best_tm.size(); si++) {
int s = best_tm[si];
string path;
unsigned int letter_cost = sed( input[input_id], source[s], path, true );
// do not report multiple identical sentences, but just their count
cout << i << " "; // sentence number
cout << letter_cost << "/" << input_letter_length << " ";
cout << "(" << best_cost <<"/" << input_length <<") ";
cout << "||| " << s << " ||| " << path << endl;
}
continue;
}
// find the best matches according to letter sed
string best_path = "";
int best_match = -1;
int best_letter_cost;
if (lsed_flag) {
best_letter_cost = compute_length( input[input_id] ) * min_match / 100 + 1;
for(int si=0; si<best_tm.size(); si++)
{
int s = best_tm[si];
string path;
unsigned int letter_cost = sed( input[input_id], source[s], path, true );
if (letter_cost < best_letter_cost)
{
best_letter_cost = letter_cost;
best_path = path;
best_match = s;
}
}
}
// if letter sed turned off, just compute path for first match
else {
if (best_tm.size() > 0) {
string path;
sed( input[input_id], source[best_tm[0]], path, false );
best_path = path;
best_match = best_tm[0];
}
}
// timing breakdown for this sentence (milliseconds)
cerr << "elapsed: " << (1000 * (clock()-start_clock) / CLOCKS_PER_SEC)
<< " ( range: " << (1000 * (clock_range-start_clock) / CLOCKS_PER_SEC)
<< " match: " << (1000 * (clock_matches-clock_range) / CLOCKS_PER_SEC)
<< " tm: " << (1000 * (clock()-clock_matches) / CLOCKS_PER_SEC)
<< " (validation: " << (1000 * (clock_validation_sum) / CLOCKS_PER_SEC) << ")"
<< " )" << endl;
// final result line: [letter-cost/letter-length (]cost/length[)] ||| id ||| path
if (lsed_flag) {
cout << best_letter_cost << "/" << compute_length( input[input_id] ) << " (";
}
cout << best_cost <<"/" << input_length;
if (lsed_flag) cout << ")";
cout << " ||| " << best_match << " ||| " << best_path << endl;
}
cerr << "total: " << (1000 * (clock()-start_main_clock) / CLOCKS_PER_SEC) << endl;
}

View File

@ -0,0 +1,58 @@
#!/usr/bin/perl -w
use strict;

# Collapse a parallel corpus to unique source sentences.
# For each distinct source line this emits:
#   - the source line itself ($src_out),
#   - all of its distinct target translations with their counts,
#     separated by " ||| " ($tgt_out),
#   - the corresponding word alignments in the same order ($align_out),
#   - only the most frequent target / its alignment ($tgt_mf / $align_mf).

my $src_in   = "corpus/acquis.truecased.4.en";
my $tgt_in   = "corpus/acquis.truecased.4.fr";
my $align_in = "model/aligned.4.grow-diag-final-and";

my $src_out   = "data/acquis.truecased.4.en.uniq";
my $tgt_out   = "data/acquis.truecased.4.fr.uniq";
my $tgt_mf    = "data/acquis.truecased.4.fr.uniq.most-frequent";
my $align_out = "data/acquis.truecased.4.align.uniq";
my $align_mf  = "data/acquis.truecased.4.align.uniq.most-frequent";

# %TRANS{$src}{$tgt} = occurrence count; %ALIGN{$src}{$tgt} = alignment string
my (%TRANS,%ALIGN);

open(my $src_fh,   '<', $src_in)   or die "Cannot open $src_in: $!";
open(my $tgt_fh,   '<', $tgt_in)   or die "Cannot open $tgt_in: $!";
open(my $align_fh, '<', $align_in) or die "Cannot open $align_in: $!";
while(my $src = <$src_fh>) {
    my $tgt   = <$tgt_fh>;
    my $align = <$align_fh>;
    die "$tgt_in or $align_in has fewer lines than $src_in"
        unless defined $tgt && defined $align;
    # NOTE: $src deliberately keeps its newline -- it is used as a hash key
    # and later printed verbatim (newline included).
    # chomp (not chop) so a final line without a newline is not corrupted.
    chomp($tgt);
    chomp($align);
    $TRANS{$src}{$tgt}++;
    $ALIGN{$src}{$tgt} = $align;
}
close($src_fh);
close($tgt_fh);
close($align_fh);

open(my $src_out_fh,   '>', $src_out)   or die "Cannot open $src_out: $!";
open(my $tgt_out_fh,   '>', $tgt_out)   or die "Cannot open $tgt_out: $!";
open(my $tgt_mf_fh,    '>', $tgt_mf)    or die "Cannot open $tgt_mf: $!";
open(my $align_out_fh, '>', $align_out) or die "Cannot open $align_out: $!";
open(my $align_mf_fh,  '>', $align_mf)  or die "Cannot open $align_mf: $!";
foreach my $src (keys %TRANS) {
    print $src_out_fh $src;   # newline still attached
    my $first = 1;
    my ($max, $best) = (0, undef);   # track most frequent translation
    foreach my $tgt (keys %{$TRANS{$src}}) {
        print $tgt_out_fh " ||| " unless $first;
        print $tgt_out_fh $TRANS{$src}{$tgt}." ".$tgt;
        print $align_out_fh " ||| " unless $first;
        print $align_out_fh $ALIGN{$src}{$tgt};
        if ($TRANS{$src}{$tgt} > $max) {
            $max  = $TRANS{$src}{$tgt};
            $best = $tgt;
        }
        $first = 0;
    }
    print $tgt_out_fh "\n";
    print $align_out_fh "\n";
    print $tgt_mf_fh $best."\n";
    print $align_mf_fh $ALIGN{$src}{$best}."\n";
}
# close everything (the original leaked $tgt_mf, $align_out and $align_mf)
close($src_out_fh);
close($tgt_out_fh);
close($tgt_mf_fh);
close($align_out_fh);
close($align_mf_fh);

View File

@ -0,0 +1,308 @@
#!/usr/bin/perl -w
use strict;
use FindBin qw($RealBin);
use File::Basename;

# Driver: fuzzy-match an input set against a translation memory, then turn
# each match into an XML frame and hierarchical rules, and finally build a
# phrase table from the extracted rules via train-model.perl.
#
# usage: <script> <input> <tm-source> <tm-target> <tm-alignment> <lex> <phrase-table>

my $DEBUG = 1;
my $OUTPUT_RULES = 1;

#my $data_root = "/Users/hieuhoang/workspace/experiment/data/tm-mt-integration/";
my $in_file        = $ARGV[0]; #"$data_root/in/ac-test.input.tc.4";
my $source_file    = $ARGV[1]; #"$data_root/in/acquis.truecased.4.en.uniq";
my $target_file    = $ARGV[2]; #"$data_root/in/acquis.truecased.4.fr.uniq";
my $alignment_file = $ARGV[3]; #"$data_root/in/acquis.truecased.4.align.uniq";
my $lex_file       = $ARGV[4]; #$data_root/in/lex.4;
my $pt_file        = $ARGV[5]; #"$data_root/out/pt";
die "usage: $0 <input> <tm-source> <tm-target> <tm-alignment> <lex> <phrase-table>\n"
    unless defined $pt_file;

my $TMPDIR = dirname($pt_file) ."/tmp.$$";
my $cmd = "mkdir -p $TMPDIR";
`$cmd`;

# suffix array creation and extraction
my $match_file = "$TMPDIR/match";
$cmd = "$RealBin/fuzzy-match --multiple $in_file $source_file > $match_file";
print STDERR "$cmd \n";
`$cmd`;

# load corpora into memory, one sentence per line
# (chomp, not chop, so a final line without newline is not corrupted)
my @INPUT = `cat $in_file`; chomp(@INPUT);
my @ALL_SOURCE = `cat $source_file`; chomp(@ALL_SOURCE);
my @ALL_TARGET = `cat $target_file`; chomp(@ALL_TARGET);
my @ALL_ALIGNMENT = `cat $alignment_file`; chomp(@ALL_ALIGNMENT);

# make into xml and pt
my $out_file = "$TMPDIR/ac-test.input.xml.4.uniq.multi.tuning";
open(my $match_fh, '<', $match_file) or die "Cannot open $match_file: $!";
open(my $frame_fh, '>', $out_file)   or die "Cannot open $out_file: $!";
my ($rule_fh, $rule_inv_fh);
if ($OUTPUT_RULES) {
    open($rule_fh,     '>', "$out_file.extract")     or die "Cannot open $out_file.extract: $!";
    open($rule_inv_fh, '>', "$out_file.extract.inv") or die "Cannot open $out_file.extract.inv: $!";
}
open(my $info_fh, '>', "$out_file.info") or die "Cannot open $out_file.info: $!";

while( my $match = <$match_fh> ) {
    chomp($match);
    # each line: "<input-id> <match-score> ||| <tm-sentence-id> ||| <edit-path>"
    my ($score,$sentence,$path) = split(/ \|\|\| /,$match);
    $score =~ /^(\d+) (.+)/ || die "malformed match line: $match";
    my ($i,$match_score) = ($1,$2);
    print STDERR "i=$i match_score=$match_score\n";
    # construct frame (very large / negative ids act as "no match" sentinels)
    if ($sentence < 1e9 && $sentence >= 0) {
        my $SOURCE = $ALL_SOURCE[$sentence];
        my @ALIGNMENT = split(/ \|\|\| /,$ALL_ALIGNMENT[$sentence]);
        my @TARGET = split(/ \|\|\| /,$ALL_TARGET[$sentence]);
        # one frame / rule per alternative translation of the matched TM sentence;
        # each entry looks like "<count> <target sentence>"
        for(my $j=0;$j<scalar(@TARGET);$j++) {
            $TARGET[$j] =~ /^(\d+) (.+)$/ || die "malformed target entry: $TARGET[$j]";
            my ($target_count,$target) = ($1,$2);
            my ($frame,$rule_s,$rule_t,$rule_alignment,$rule_alignment_inv) =
                &create_xml($SOURCE,
                            $INPUT[$i],
                            $target,
                            $ALIGNMENT[$j],
                            $path);
            print $frame_fh $frame."\n";
            print $rule_fh "$rule_s [X] ||| $rule_t [X] ||| $rule_alignment ||| $target_count\n" if $OUTPUT_RULES;
            print $rule_inv_fh "$rule_t [X] ||| $rule_s [X] ||| $rule_alignment_inv ||| $target_count\n" if $OUTPUT_RULES;
            print $info_fh "$i ||| $match_score ||| $target_count\n";
        }
    }
}
close($frame_fh);
close($match_fh);
close($info_fh);   # the original never closed INFO
if ($OUTPUT_RULES) {
    close($rule_fh);
    close($rule_inv_fh);
    # sort/compress the extract files, then build the phrase table from them
    # (only meaningful when rules were actually written)
    `LC_ALL=C sort $out_file.extract | gzip -c > $out_file.extract.sorted.gz`;
    `LC_ALL=C sort $out_file.extract.inv | gzip -c > $out_file.extract.inv.sorted.gz`;
    $cmd = "$RealBin/../../scripts/training/train-model.perl -dont-zip -first-step 6 -last-step 6 -f en -e fr -hierarchical -extract-file $out_file.extract -lexical-file $lex_file -phrase-translation-table $pt_file";
    print STDERR "Executing: $cmd \n";
    `$cmd`;
}
# keep $TMPDIR around for inspection; uncomment to clean up
#$cmd = "rm -rf $TMPDIR";
#`$cmd`;
#######################################################
# Turn one fuzzy match into (a) an XML "frame" -- the input sentence with the
# reusable parts of the TM target embedded as <xml translation="..."> spans --
# and (b) one hierarchical rule for the phrase table.
#
# Parameters (all plain strings):
#   $source    - TM source sentence that fuzzily matched the input
#   $input     - input sentence being translated
#   $target    - one target-side translation of $source
#   $alignment - "s-t s-t ..." word alignment between $source and $target
#   $path      - one character per edit step from input to TM source:
#                M=match, S=substitution, I=insertion, D=deletion
#                (presumably produced by the fuzzy-match binary -- TODO confirm)
# Returns: ($frame, $rule_s, $rule_t, $rule_alignment, $rule_alignment_inv)
sub create_xml {
my ($source,$input,$target,$alignment,$path) = @_;
# debug dump of all arguments
print STDERR " HIEU \n $source \n $input \n $target \n $alignment \n $path \n";
my @INPUT = split(/ /,$input);
my @SOURCE = split(/ /,$source);
my @TARGET = split(/ /,$target);
my %ALIGN = &create_alignment($alignment);
# %FRAME_INPUT: target position -> input words to be inserted after it
# @NT: one record per mismatch region (becomes a [X][X] gap in the rule)
# @INPUT_BITMAP[$i] = 1 iff input word $i is matched by the TM source
# @TARGET_BITMAP[$t] = 1 iff target word $t can be reused from the TM target
my %FRAME_INPUT;
my (@NT,@INPUT_BITMAP,@TARGET_BITMAP,%ALIGNMENT_I_TO_S);
foreach (@TARGET) { push @TARGET_BITMAP,1 }
### STEP 1: FIND MISMATCHES
# walk the edit path one action at a time, maintaining the current
# position in the TM source ($s) and in the input ($i)
my ($s,$i) = (0,0);
my $currently_matching = 0;
my ($start_s,$start_i) = (0,0);
$path .= "X"; # indicate end
print STDERR "$input\n$source\n$target\n$path\n";
for(my $p=0;$p<length($path);$p++) {
my $action = substr($path,$p,1);
# beginning of a mismatch
if ($currently_matching && $action ne "M" && $action ne "X") {
$start_i = $i;
$start_s = $s;
$currently_matching = 0;
}
# end of a mismatch
elsif (!$currently_matching &&
($action eq "M" || $action eq "X")) {
# remove use of affected target words
for(my $ss = $start_s; $ss<$s; $ss++) {
foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) {
$TARGET_BITMAP[$tt] = 0;
}
# also remove enclosed unaligned words?
}
# are there input words that need to be inserted ?
print STDERR "($start_i<$i)?\n";
if ($start_i<$i) {
# take note of input words to be inserted
my $insertion = "";
for(my $ii = $start_i; $ii<$i; $ii++) {
$insertion .= $INPUT[$ii]." ";
}
# find position for inserted input words
# find first removed target word
# NOTE(review): 1000 is a "not found" sentinel; assumes sentences
# are shorter than 1000 words
my $start_t = 1000;
for(my $ss = $start_s; $ss<$s; $ss++) {
foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) {
$start_t = $tt if $tt < $start_t;
}
}
# end of sentence? add to end
if ($start_t == 1000 && $i > $#INPUT) {
$start_t = $#TARGET;
}
# backtrack to previous words if unaligned
if ($start_t == 1000) {
$start_t = -1;
for(my $ss = $s-1; $start_t==-1 && $ss>=0; $ss--) {
foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) {
$start_t = $tt if $tt > $start_t;
}
}
}
$FRAME_INPUT{$start_t} .= $insertion;
# record the mismatch as a non-terminal anchored at these positions
my %NT = ("start_t" => $start_t,
"start_i" => $start_i );
push @NT,\%NT;
}
$currently_matching = 1;
}
# debug: action, positions, and target words aligned to current source word
print STDERR "$action $s $i ($start_s $start_i) $currently_matching";
if ($action ne "I") {
print STDERR " ->";
foreach my $tt (keys %{${$ALIGN{'s'}}[$s]}) {
print STDERR " ".$tt;
}
}
print STDERR "\n";
# advance positions: I consumes no source word, D consumes no input word
$s++ unless $action eq "I";
$i++ unless $action eq "D";
$ALIGNMENT_I_TO_S{$i} = $s unless $action eq "D";
push @INPUT_BITMAP, 1 if $action eq "M";
push @INPUT_BITMAP, 0 if $action eq "I" || $action eq "S";
}
print STDERR $target."\n";
foreach (@TARGET_BITMAP) { print STDERR $_; } print STDERR "\n";
foreach (sort keys %FRAME_INPUT) {
print STDERR "$_: $FRAME_INPUT{$_}\n";
}
### STEP 2: BUILD RULE AND FRAME
# hierarchical rule
# source side: matched input words in order, one [X][X] gap per mismatch;
# %RULE_ALIGNMENT_S maps TM-source position -> position within the rule
my $rule_s = "";
my $rule_pos_s = 0;
my %RULE_ALIGNMENT_S;
for(my $i=0;$i<scalar(@INPUT_BITMAP);$i++) {
if ($INPUT_BITMAP[$i]) {
$rule_s .= $INPUT[$i]." ";
$RULE_ALIGNMENT_S{$ALIGNMENT_I_TO_S{$i}} = $rule_pos_s++;
}
foreach my $NT (@NT) {
if ($i == $$NT{"start_i"}) {
$rule_s .= "[X][X] ";
$$NT{"rule_pos_s"} = $rule_pos_s++;
}
}
}
# target side: reusable TM target words plus the [X][X] gaps
# ($t starts at -1 so a gap anchored before the first word is emitted)
my $rule_t = "";
my $rule_pos_t = 0;
my %RULE_ALIGNMENT_T;
for(my $t=-1;$t<scalar(@TARGET_BITMAP);$t++) {
if ($t>=0 && $TARGET_BITMAP[$t]) {
$rule_t .= $TARGET[$t]." ";
$RULE_ALIGNMENT_T{$t} = $rule_pos_t++;
}
foreach my $NT (@NT) {
if ($t == $$NT{"start_t"}) {
$rule_t .= "[X][X] ";
$$NT{"rule_pos_t"} = $rule_pos_t++;
}
}
}
# project the word alignment into rule-internal positions
my $rule_alignment = "";
foreach my $s (sort { $a <=> $b} keys %RULE_ALIGNMENT_S) {
foreach my $t (keys %{$ALIGN{"s"}[$s]}) {
next unless defined($RULE_ALIGNMENT_T{$t});
$rule_alignment .= $RULE_ALIGNMENT_S{$s}."-".$RULE_ALIGNMENT_T{$t}." ";
}
}
foreach my $NT (@NT) {
$rule_alignment .= $$NT{"rule_pos_s"}."-".$$NT{"rule_pos_t"}." ";
}
chop($rule_s);
chop($rule_t);
chop($rule_alignment);
# inverse alignment (target-source) for the .inv extract file
my $rule_alignment_inv = "";
foreach (split(/ /,$rule_alignment)) {
/^(\d+)\-(\d+)$/;
$rule_alignment_inv .= "$2-$1 ";
}
chop($rule_alignment_inv);
# frame
# assemble the frame: runs of reusable target words become
# <xml translation="..."> annotations interleaved with inserted input words
my $frame = "";
$frame = $FRAME_INPUT{-1} if defined $FRAME_INPUT{-1};
my $currently_included = 0;
my $start_t = -1;
push @TARGET_BITMAP,0; # indicate end
for(my $t=0;$t<=scalar(@TARGET);$t++) {
# beginning of tm target inclusion
if (!$currently_included && $TARGET_BITMAP[$t]) {
$start_t = $t;
$currently_included = 1;
}
# end of tm target inclusion (not included word or inserted input)
elsif ($currently_included &&
(!$TARGET_BITMAP[$t] || defined($FRAME_INPUT{$t}))) {
# add xml (unless change is at the beginning of the sentence
if ($start_t >= 0) {
my $target = "";
print STDERR "for(tt=$start_t;tt<$t+$TARGET_BITMAP[$t]);\n";
for(my $tt=$start_t;$tt<$t+$TARGET_BITMAP[$t];$tt++) {
$target .= $TARGET[$tt] . " ";
}
chop($target);
$frame .= "<xml translation=\"$target\"> x </xml> ";
}
$currently_included = 0;
}
$frame .= $FRAME_INPUT{$t} if defined $FRAME_INPUT{$t};
print STDERR "$TARGET_BITMAP[$t] $t ($start_t) $currently_included\n";
}
print STDERR $frame."\n-------------------------------------\n";
return ($frame,$rule_s,$rule_t,$rule_alignment,$rule_alignment_inv);
}
# Parse a "s-t s-t ..." word-alignment string into two lookup tables.
# Returns a hash with two entries:
#   's' => array ref indexed by source position; each element is a hash whose
#          keys are the target positions aligned to that source word
#   't' => the same thing in the target-to-source direction
sub create_alignment {
    my ($alignment_string) = @_;
    my (@by_source, @by_target);
    for my $link (split / /, $alignment_string) {
        my ($src_pos, $tgt_pos) = split /\-/, $link;
        $by_source[$src_pos]{$tgt_pos}++;
        $by_target[$tgt_pos]{$src_pos}++;
    }
    return ( 's' => \@by_source, 't' => \@by_target );
}

View File

@ -0,0 +1,300 @@
#!/usr/bin/perl -w
use strict;
use FindBin qw($RealBin);
use File::Basename;

# Driver: fuzzy-match an input set against a translation memory, then turn
# each match into an XML frame and hierarchical rules, and build a phrase
# table from the extracted rules via train-model.perl.
#
# Fixes in this revision:
#   - @INPUT/@ALL_SOURCE/@ALL_TARGET/@ALL_ALIGNMENT were used without ever
#     being declared or loaded, which made the script die under 'use strict'
#   - removed the leftover override of $TMPDIR with a developer-specific path
#   - dropped the stray -d (debugger) flag from the shebang line
#
# usage: <script> <input> <tm-source> <tm-target> <tm-alignment> <lex> <phrase-table>

my $DEBUG = 1;
my $OUTPUT_RULES = 1;

#my $data_root = "/Users/hieuhoang/workspace/experiment/data/tm-mt-integration/";
my $in_file        = $ARGV[0]; #"$data_root/in/ac-test.input.tc.4";
my $source_file    = $ARGV[1]; #"$data_root/in/acquis.truecased.4.en.uniq";
my $target_file    = $ARGV[2]; #"$data_root/in/acquis.truecased.4.fr.uniq";
my $alignment_file = $ARGV[3]; #"$data_root/in/acquis.truecased.4.align.uniq";
my $lex_file       = $ARGV[4]; #$data_root/in/lex.4;
my $pt_file        = $ARGV[5]; #"$data_root/out/pt";
die "usage: $0 <input> <tm-source> <tm-target> <tm-alignment> <lex> <phrase-table>\n"
    unless defined $pt_file;

my $TMPDIR = "/tmp/tmp.$$";
my $cmd = "mkdir -p $TMPDIR";
`$cmd`;

# suffix array creation and extraction
my $match_file = "$TMPDIR/match";
$cmd = "$RealBin/fuzzy-match --multiple $in_file $source_file > $match_file";
`$cmd`;

# load corpora into memory, one sentence per line
my @INPUT = `cat $in_file`; chomp(@INPUT);
my @ALL_SOURCE = `cat $source_file`; chomp(@ALL_SOURCE);
my @ALL_TARGET = `cat $target_file`; chomp(@ALL_TARGET);
my @ALL_ALIGNMENT = `cat $alignment_file`; chomp(@ALL_ALIGNMENT);

# make into xml and pt
my $out_file = "$TMPDIR/ac-test.input.xml.4.uniq.multi.tuning";
open(my $match_fh, '<', $match_file) or die "Cannot open $match_file: $!";
open(my $frame_fh, '>', $out_file)   or die "Cannot open $out_file: $!";
my ($rule_fh, $rule_inv_fh);
if ($OUTPUT_RULES) {
    open($rule_fh,     '>', "$out_file.extract")     or die "Cannot open $out_file.extract: $!";
    open($rule_inv_fh, '>', "$out_file.extract.inv") or die "Cannot open $out_file.extract.inv: $!";
}
open(my $info_fh, '>', "$out_file.info") or die "Cannot open $out_file.info: $!";

while( my $match = <$match_fh> ) {
    chomp($match);
    # each line: "<input-id> <match-score> ||| <tm-sentence-id> ||| <edit-path>"
    my ($score,$sentence,$path) = split(/ \|\|\| /,$match);
    $score =~ /^(\d+) (.+)/ || die "malformed match line: $match";
    my ($i,$match_score) = ($1,$2);
    # construct frame (very large / negative ids act as "no match" sentinels)
    if ($sentence < 1e9 && $sentence >= 0) {
        my $SOURCE = $ALL_SOURCE[$sentence];
        my @ALIGNMENT = split(/ \|\|\| /,$ALL_ALIGNMENT[$sentence]);
        my @TARGET = split(/ \|\|\| /,$ALL_TARGET[$sentence]);
        # one frame / rule per alternative translation of the matched sentence;
        # each entry looks like "<count> <target sentence>"
        for(my $j=0;$j<scalar(@TARGET);$j++) {
            $TARGET[$j] =~ /^(\d+) (.+)$/ || die "malformed target entry: $TARGET[$j]";
            my ($target_count,$target) = ($1,$2);
            my ($frame,$rule_s,$rule_t,$rule_alignment,$rule_alignment_inv) =
                &create_xml($SOURCE,
                            $INPUT[$i],
                            $target,
                            $ALIGNMENT[$j],
                            $path);
            print $frame_fh $frame."\n";
            print $rule_fh "$rule_s [X] ||| $rule_t [X] ||| $rule_alignment ||| $target_count\n" if $OUTPUT_RULES;
            print $rule_inv_fh "$rule_t [X] ||| $rule_s [X] ||| $rule_alignment_inv ||| $target_count\n" if $OUTPUT_RULES;
            print $info_fh "$i ||| $match_score ||| $target_count\n";
        }
    }
}
close($frame_fh);
close($match_fh);
close($info_fh);
if ($OUTPUT_RULES) {
    close($rule_fh);
    close($rule_inv_fh);
    # sort/compress the extract files, then build the phrase table from them
    `LC_ALL=C sort $out_file.extract | gzip -c > $out_file.extract.sorted.gz`;
    `LC_ALL=C sort $out_file.extract.inv | gzip -c > $out_file.extract.inv.sorted.gz`;
    $cmd = "$RealBin/../../scripts/training/train-model.perl -dont-zip -first-step 6 -last-step 6 -f en -e fr -hierarchical -extract-file $out_file.extract -lexical-file $lex_file -phrase-translation-table $pt_file";
    print STDERR "Executing: $cmd \n";
    `$cmd`;
}
# keep $TMPDIR around for inspection; uncomment to clean up
#$cmd = "rm -rf $TMPDIR";
#`$cmd`;
#######################################################
# Turn one fuzzy match into (a) an XML "frame" -- the input sentence with the
# reusable parts of the TM target embedded as <xml translation="..."> spans --
# and (b) one hierarchical rule for the phrase table.
#
# Parameters (all plain strings):
#   $source    - TM source sentence that fuzzily matched the input
#   $input     - input sentence being translated
#   $target    - one target-side translation of $source
#   $alignment - "s-t s-t ..." word alignment between $source and $target
#   $path      - one character per edit step from input to TM source:
#                M=match, S=substitution, I=insertion, D=deletion
#                (presumably produced by the fuzzy-match binary -- TODO confirm)
# Returns: ($frame, $rule_s, $rule_t, $rule_alignment, $rule_alignment_inv)
sub create_xml {
my ($source,$input,$target,$alignment,$path) = @_;
my @INPUT = split(/ /,$input);
my @SOURCE = split(/ /,$source);
my @TARGET = split(/ /,$target);
my %ALIGN = &create_alignment($alignment);
# %FRAME_INPUT: target position -> input words to be inserted after it
# @NT: one record per mismatch region (becomes a [X][X] gap in the rule)
# @INPUT_BITMAP[$i] = 1 iff input word $i is matched by the TM source
# @TARGET_BITMAP[$t] = 1 iff target word $t can be reused from the TM target
my %FRAME_INPUT;
my (@NT,@INPUT_BITMAP,@TARGET_BITMAP,%ALIGNMENT_I_TO_S);
foreach (@TARGET) { push @TARGET_BITMAP,1 }
### STEP 1: FIND MISMATCHES
# walk the edit path one action at a time, maintaining the current
# position in the TM source ($s) and in the input ($i)
my ($s,$i) = (0,0);
my $currently_matching = 0;
my ($start_s,$start_i) = (0,0);
$path .= "X"; # indicate end
print STDERR "$input\n$source\n$target\n$path\n";
for(my $p=0;$p<length($path);$p++) {
my $action = substr($path,$p,1);
# beginning of a mismatch
if ($currently_matching && $action ne "M" && $action ne "X") {
$start_i = $i;
$start_s = $s;
$currently_matching = 0;
}
# end of a mismatch
elsif (!$currently_matching &&
($action eq "M" || $action eq "X")) {
# remove use of affected target words
for(my $ss = $start_s; $ss<$s; $ss++) {
foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) {
$TARGET_BITMAP[$tt] = 0;
}
# also remove enclosed unaligned words?
}
# are there input words that need to be inserted ?
print STDERR "($start_i<$i)?\n";
if ($start_i<$i) {
# take note of input words to be inserted
my $insertion = "";
for(my $ii = $start_i; $ii<$i; $ii++) {
$insertion .= $INPUT[$ii]." ";
}
# find position for inserted input words
# find first removed target word
# NOTE(review): 1000 is a "not found" sentinel; assumes sentences
# are shorter than 1000 words
my $start_t = 1000;
for(my $ss = $start_s; $ss<$s; $ss++) {
foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) {
$start_t = $tt if $tt < $start_t;
}
}
# end of sentence? add to end
if ($start_t == 1000 && $i > $#INPUT) {
$start_t = $#TARGET;
}
# backtrack to previous words if unaligned
if ($start_t == 1000) {
$start_t = -1;
for(my $ss = $s-1; $start_t==-1 && $ss>=0; $ss--) {
foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) {
$start_t = $tt if $tt > $start_t;
}
}
}
$FRAME_INPUT{$start_t} .= $insertion;
# record the mismatch as a non-terminal anchored at these positions
my %NT = ("start_t" => $start_t,
"start_i" => $start_i );
push @NT,\%NT;
}
$currently_matching = 1;
}
# debug: action, positions, and target words aligned to current source word
print STDERR "$action $s $i ($start_s $start_i) $currently_matching";
if ($action ne "I") {
print STDERR " ->";
foreach my $tt (keys %{${$ALIGN{'s'}}[$s]}) {
print STDERR " ".$tt;
}
}
print STDERR "\n";
# advance positions: I consumes no source word, D consumes no input word
$s++ unless $action eq "I";
$i++ unless $action eq "D";
$ALIGNMENT_I_TO_S{$i} = $s unless $action eq "D";
push @INPUT_BITMAP, 1 if $action eq "M";
push @INPUT_BITMAP, 0 if $action eq "I" || $action eq "S";
}
print STDERR $target."\n";
foreach (@TARGET_BITMAP) { print STDERR $_; } print STDERR "\n";
foreach (sort keys %FRAME_INPUT) {
print STDERR "$_: $FRAME_INPUT{$_}\n";
}
### STEP 2: BUILD RULE AND FRAME
# hierarchical rule
# source side: matched input words in order, one [X][X] gap per mismatch;
# %RULE_ALIGNMENT_S maps TM-source position -> position within the rule
my $rule_s = "";
my $rule_pos_s = 0;
my %RULE_ALIGNMENT_S;
for(my $i=0;$i<scalar(@INPUT_BITMAP);$i++) {
if ($INPUT_BITMAP[$i]) {
$rule_s .= $INPUT[$i]." ";
$RULE_ALIGNMENT_S{$ALIGNMENT_I_TO_S{$i}} = $rule_pos_s++;
}
foreach my $NT (@NT) {
if ($i == $$NT{"start_i"}) {
$rule_s .= "[X][X] ";
$$NT{"rule_pos_s"} = $rule_pos_s++;
}
}
}
# target side: reusable TM target words plus the [X][X] gaps
# ($t starts at -1 so a gap anchored before the first word is emitted)
my $rule_t = "";
my $rule_pos_t = 0;
my %RULE_ALIGNMENT_T;
for(my $t=-1;$t<scalar(@TARGET_BITMAP);$t++) {
if ($t>=0 && $TARGET_BITMAP[$t]) {
$rule_t .= $TARGET[$t]." ";
$RULE_ALIGNMENT_T{$t} = $rule_pos_t++;
}
foreach my $NT (@NT) {
if ($t == $$NT{"start_t"}) {
$rule_t .= "[X][X] ";
$$NT{"rule_pos_t"} = $rule_pos_t++;
}
}
}
# project the word alignment into rule-internal positions
my $rule_alignment = "";
foreach my $s (sort { $a <=> $b} keys %RULE_ALIGNMENT_S) {
foreach my $t (keys %{$ALIGN{"s"}[$s]}) {
next unless defined($RULE_ALIGNMENT_T{$t});
$rule_alignment .= $RULE_ALIGNMENT_S{$s}."-".$RULE_ALIGNMENT_T{$t}." ";
}
}
foreach my $NT (@NT) {
$rule_alignment .= $$NT{"rule_pos_s"}."-".$$NT{"rule_pos_t"}." ";
}
chop($rule_s);
chop($rule_t);
chop($rule_alignment);
# inverse alignment (target-source) for the .inv extract file
my $rule_alignment_inv = "";
foreach (split(/ /,$rule_alignment)) {
/^(\d+)\-(\d+)$/;
$rule_alignment_inv .= "$2-$1 ";
}
chop($rule_alignment_inv);
# frame
# assemble the frame: runs of reusable target words become
# <xml translation="..."> annotations interleaved with inserted input words
my $frame = "";
$frame = $FRAME_INPUT{-1} if defined $FRAME_INPUT{-1};
my $currently_included = 0;
my $start_t = -1;
push @TARGET_BITMAP,0; # indicate end
for(my $t=0;$t<=scalar(@TARGET);$t++) {
# beginning of tm target inclusion
if (!$currently_included && $TARGET_BITMAP[$t]) {
$start_t = $t;
$currently_included = 1;
}
# end of tm target inclusion (not included word or inserted input)
elsif ($currently_included &&
(!$TARGET_BITMAP[$t] || defined($FRAME_INPUT{$t}))) {
# add xml (unless change is at the beginning of the sentence
if ($start_t >= 0) {
my $target = "";
print STDERR "for(tt=$start_t;tt<$t+$TARGET_BITMAP[$t]);\n";
for(my $tt=$start_t;$tt<$t+$TARGET_BITMAP[$t];$tt++) {
$target .= $TARGET[$tt] . " ";
}
chop($target);
$frame .= "<xml translation=\"$target\"> x </xml> ";
}
$currently_included = 0;
}
$frame .= $FRAME_INPUT{$t} if defined $FRAME_INPUT{$t};
print STDERR "$TARGET_BITMAP[$t] $t ($start_t) $currently_included\n";
}
print STDERR $frame."\n-------------------------------------\n";
return ($frame,$rule_s,$rule_t,$rule_alignment,$rule_alignment_inv);
}
# Parse a "s-t s-t ..." word-alignment string into two lookup tables.
# Returns a hash with two entries:
#   's' => array ref indexed by source position; each element is a hash whose
#          keys are the target positions aligned to that source word
#   't' => the same thing in the target-to-source direction
sub create_alignment {
    my ($alignment_string) = @_;
    my (@by_source, @by_target);
    for my $link (split / /, $alignment_string) {
        my ($src_pos, $tgt_pos) = split /\-/, $link;
        $by_source[$src_pos]{$tgt_pos}++;
        $by_target[$tgt_pos]{$src_pos}++;
    }
    return ( 's' => \@by_source, 't' => \@by_target );
}

View File

@ -0,0 +1,288 @@
#!/usr/bin/perl -w
use strict;

# Driver (hard-wired-paths variant): read precomputed fuzzy matches and turn
# each one into an XML frame and hierarchical rules; optionally build a
# phrase table from the extracted rules via train-model.perl.

my $DEBUG = 1;
my $OUTPUT_RULES = 1;

# configuration: all paths are hard-coded for one particular experiment setup
my $scripts_root_dir = "/Users/hieuhoang/workspace/github/hieuhoang/scripts";
my $data_root = "/Users/hieuhoang/workspace/experiment/data/tm-mt-integration/";
#my $match_file = "$data_root/in/BEST.acquis-xml-escaped.4.uniq.multi.tuning";
my $match_file = "$data_root/out/BEST";
my $source_file = "$data_root/in/acquis.truecased.4.en.uniq";
my $target_file = "$data_root/in/acquis.truecased.4.fr.uniq";
my $alignment_file = "$data_root/in/acquis.truecased.4.align.uniq";
my $out_file = "$data_root/out/ac-test.input.xml.4.uniq.multi.tuning";
my $in_file = "$data_root/in/ac-test.input.tc.4";
#my $match_file = "tm/BEST.acquis-xml-escaped.4.uniq.multi";
#my $source_file = "data/acquis.truecased.4.en.uniq";
#my $target_file = "data/acquis.truecased.4.fr.uniq";
#my $alignment_file = "data/acquis.truecased.4.align.uniq";
#my $out_file = "data/ac-test.input.xml.4.uniq.multi.xxx";
#my $in_file = "evaluation/ac-test.input.tc.4";

# load corpora into memory, one sentence per line
# (chomp, not chop, so a final line without newline is not corrupted)
my @INPUT = `cat $in_file`; chomp(@INPUT);
my @ALL_SOURCE = `cat $source_file`; chomp(@ALL_SOURCE);
my @ALL_TARGET = `cat $target_file`; chomp(@ALL_TARGET);
my @ALL_ALIGNMENT = `cat $alignment_file`; chomp(@ALL_ALIGNMENT);

open(my $match_fh, '<', $match_file) or die "Cannot open $match_file: $!";
open(my $frame_fh, '>', $out_file)   or die "Cannot open $out_file: $!";
my ($rule_fh, $rule_inv_fh);
if ($OUTPUT_RULES) {
    open($rule_fh,     '>', "$out_file.extract")     or die "Cannot open $out_file.extract: $!";
    open($rule_inv_fh, '>', "$out_file.extract.inv") or die "Cannot open $out_file.extract.inv: $!";
}
open(my $info_fh, '>', "$out_file.info") or die "Cannot open $out_file.info: $!";

while( my $match = <$match_fh> ) {
    chomp($match);
    # each line: "<input-id> <match-score> ||| <tm-sentence-id> ||| <edit-path>"
    my ($score,$sentence,$path) = split(/ \|\|\| /,$match);
    $score =~ /^(\d+) (.+)/ || die "malformed match line: $match";
    my ($i,$match_score) = ($1,$2);
    # construct frame (very large / negative ids act as "no match" sentinels)
    if ($sentence < 1e9 && $sentence >= 0) {
        my $SOURCE = $ALL_SOURCE[$sentence];
        my @ALIGNMENT = split(/ \|\|\| /,$ALL_ALIGNMENT[$sentence]);
        my @TARGET = split(/ \|\|\| /,$ALL_TARGET[$sentence]);
        # one frame / rule per alternative translation of the matched sentence;
        # each entry looks like "<count> <target sentence>"
        for(my $j=0;$j<scalar(@TARGET);$j++) {
            $TARGET[$j] =~ /^(\d+) (.+)$/ || die "malformed target entry: $TARGET[$j]";
            my ($target_count,$target) = ($1,$2);
            my ($frame,$rule_s,$rule_t,$rule_alignment,$rule_alignment_inv) =
                &create_xml($SOURCE,
                            $INPUT[$i],
                            $target,
                            $ALIGNMENT[$j],
                            $path);
            print $frame_fh $frame."\n";
            print $rule_fh "$rule_s [X] ||| $rule_t [X] ||| $rule_alignment ||| $target_count\n" if $OUTPUT_RULES;
            print $rule_inv_fh "$rule_t [X] ||| $rule_s [X] ||| $rule_alignment_inv ||| $target_count\n" if $OUTPUT_RULES;
            print $info_fh "$i ||| $match_score ||| $target_count\n";
        }
    }
}
close($frame_fh);
close($match_fh);
close($info_fh);   # the original never closed INFO
if ($OUTPUT_RULES) {
    close($rule_fh);
    close($rule_inv_fh);
    # sort/compress the extract files, then build the phrase table from them
    # (only meaningful when rules were actually written)
    `LC_ALL=C sort $out_file.extract | gzip -c > $out_file.extract.sorted.gz`;
    `LC_ALL=C sort $out_file.extract.inv | gzip -c > $out_file.extract.inv.sorted.gz`;
    `$scripts_root_dir/training/train-model.perl -dont-zip -first-step 6 -last-step 6 -f en -e fr -hierarchical -extract-file $out_file.extract -lexical-file $data_root/in/lex.4 -phrase-translation-table $out_file.phrase-table`;
}
# Turn one fuzzy match into (a) an XML "frame" -- the input sentence with the
# reusable parts of the TM target embedded as <xml translation="..."> spans --
# and (b) one hierarchical rule for the phrase table.
#
# Parameters (all plain strings):
#   $source    - TM source sentence that fuzzily matched the input
#   $input     - input sentence being translated
#   $target    - one target-side translation of $source
#   $alignment - "s-t s-t ..." word alignment between $source and $target
#   $path      - one character per edit step from input to TM source:
#                M=match, S=substitution, I=insertion, D=deletion
#                (presumably produced by the fuzzy-match binary -- TODO confirm)
# Returns: ($frame, $rule_s, $rule_t, $rule_alignment, $rule_alignment_inv)
# NOTE(review): this variant prints its debug trace to STDOUT, not STDERR.
sub create_xml {
my ($source,$input,$target,$alignment,$path) = @_;
my @INPUT = split(/ /,$input);
my @SOURCE = split(/ /,$source);
my @TARGET = split(/ /,$target);
my %ALIGN = &create_alignment($alignment);
# %FRAME_INPUT: target position -> input words to be inserted after it
# @NT: one record per mismatch region (becomes a [X][X] gap in the rule)
# @INPUT_BITMAP[$i] = 1 iff input word $i is matched by the TM source
# @TARGET_BITMAP[$t] = 1 iff target word $t can be reused from the TM target
my %FRAME_INPUT;
my (@NT,@INPUT_BITMAP,@TARGET_BITMAP,%ALIGNMENT_I_TO_S);
foreach (@TARGET) { push @TARGET_BITMAP,1 }
### STEP 1: FIND MISMATCHES
# walk the edit path one action at a time, maintaining the current
# position in the TM source ($s) and in the input ($i)
my ($s,$i) = (0,0);
my $currently_matching = 0;
my ($start_s,$start_i) = (0,0);
$path .= "X"; # indicate end
print "$input\n$source\n$target\n$path\n";
for(my $p=0;$p<length($path);$p++) {
my $action = substr($path,$p,1);
# beginning of a mismatch
if ($currently_matching && $action ne "M" && $action ne "X") {
$start_i = $i;
$start_s = $s;
$currently_matching = 0;
}
# end of a mismatch
elsif (!$currently_matching &&
($action eq "M" || $action eq "X")) {
# remove use of affected target words
for(my $ss = $start_s; $ss<$s; $ss++) {
foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) {
$TARGET_BITMAP[$tt] = 0;
}
# also remove enclosed unaligned words?
}
# are there input words that need to be inserted ?
print "($start_i<$i)?\n";
if ($start_i<$i) {
# take note of input words to be inserted
my $insertion = "";
for(my $ii = $start_i; $ii<$i; $ii++) {
$insertion .= $INPUT[$ii]." ";
}
# find position for inserted input words
# find first removed target word
# NOTE(review): 1000 is a "not found" sentinel; assumes sentences
# are shorter than 1000 words
my $start_t = 1000;
for(my $ss = $start_s; $ss<$s; $ss++) {
foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) {
$start_t = $tt if $tt < $start_t;
}
}
# end of sentence? add to end
if ($start_t == 1000 && $i > $#INPUT) {
$start_t = $#TARGET;
}
# backtrack to previous words if unaligned
if ($start_t == 1000) {
$start_t = -1;
for(my $ss = $s-1; $start_t==-1 && $ss>=0; $ss--) {
foreach my $tt (keys %{${$ALIGN{'s'}}[$ss]}) {
$start_t = $tt if $tt > $start_t;
}
}
}
$FRAME_INPUT{$start_t} .= $insertion;
# record the mismatch as a non-terminal anchored at these positions
my %NT = ("start_t" => $start_t,
"start_i" => $start_i );
push @NT,\%NT;
}
$currently_matching = 1;
}
# debug: action, positions, and target words aligned to current source word
print "$action $s $i ($start_s $start_i) $currently_matching";
if ($action ne "I") {
print " ->";
foreach my $tt (keys %{${$ALIGN{'s'}}[$s]}) {
print " ".$tt;
}
}
print "\n";
# advance positions: I consumes no source word, D consumes no input word
$s++ unless $action eq "I";
$i++ unless $action eq "D";
$ALIGNMENT_I_TO_S{$i} = $s unless $action eq "D";
push @INPUT_BITMAP, 1 if $action eq "M";
push @INPUT_BITMAP, 0 if $action eq "I" || $action eq "S";
}
print $target."\n";
foreach (@TARGET_BITMAP) { print $_; } print "\n";
foreach (sort keys %FRAME_INPUT) {
print "$_: $FRAME_INPUT{$_}\n";
}
### STEP 2: BUILD RULE AND FRAME
# hierarchical rule
# source side: matched input words in order, one [X][X] gap per mismatch;
# %RULE_ALIGNMENT_S maps TM-source position -> position within the rule
my $rule_s = "";
my $rule_pos_s = 0;
my %RULE_ALIGNMENT_S;
for(my $i=0;$i<scalar(@INPUT_BITMAP);$i++) {
if ($INPUT_BITMAP[$i]) {
$rule_s .= $INPUT[$i]." ";
$RULE_ALIGNMENT_S{$ALIGNMENT_I_TO_S{$i}} = $rule_pos_s++;
}
foreach my $NT (@NT) {
if ($i == $$NT{"start_i"}) {
$rule_s .= "[X][X] ";
$$NT{"rule_pos_s"} = $rule_pos_s++;
}
}
}
# target side: reusable TM target words plus the [X][X] gaps
# ($t starts at -1 so a gap anchored before the first word is emitted)
my $rule_t = "";
my $rule_pos_t = 0;
my %RULE_ALIGNMENT_T;
for(my $t=-1;$t<scalar(@TARGET_BITMAP);$t++) {
if ($t>=0 && $TARGET_BITMAP[$t]) {
$rule_t .= $TARGET[$t]." ";
$RULE_ALIGNMENT_T{$t} = $rule_pos_t++;
}
foreach my $NT (@NT) {
if ($t == $$NT{"start_t"}) {
$rule_t .= "[X][X] ";
$$NT{"rule_pos_t"} = $rule_pos_t++;
}
}
}
# project the word alignment into rule-internal positions
my $rule_alignment = "";
foreach my $s (sort { $a <=> $b} keys %RULE_ALIGNMENT_S) {
foreach my $t (keys %{$ALIGN{"s"}[$s]}) {
next unless defined($RULE_ALIGNMENT_T{$t});
$rule_alignment .= $RULE_ALIGNMENT_S{$s}."-".$RULE_ALIGNMENT_T{$t}." ";
}
}
foreach my $NT (@NT) {
$rule_alignment .= $$NT{"rule_pos_s"}."-".$$NT{"rule_pos_t"}." ";
}
chop($rule_s);
chop($rule_t);
chop($rule_alignment);
# inverse alignment (target-source) for the .inv extract file
my $rule_alignment_inv = "";
foreach (split(/ /,$rule_alignment)) {
/^(\d+)\-(\d+)$/;
$rule_alignment_inv .= "$2-$1 ";
}
chop($rule_alignment_inv);
# frame
# assemble the frame: runs of reusable target words become
# <xml translation="..."> annotations interleaved with inserted input words
my $frame = "";
$frame = $FRAME_INPUT{-1} if defined $FRAME_INPUT{-1};
my $currently_included = 0;
my $start_t = -1;
push @TARGET_BITMAP,0; # indicate end
for(my $t=0;$t<=scalar(@TARGET);$t++) {
# beginning of tm target inclusion
if (!$currently_included && $TARGET_BITMAP[$t]) {
$start_t = $t;
$currently_included = 1;
}
# end of tm target inclusion (not included word or inserted input)
elsif ($currently_included &&
(!$TARGET_BITMAP[$t] || defined($FRAME_INPUT{$t}))) {
# add xml (unless change is at the beginning of the sentence
if ($start_t >= 0) {
my $target = "";
print "for(tt=$start_t;tt<$t+$TARGET_BITMAP[$t]);\n";
for(my $tt=$start_t;$tt<$t+$TARGET_BITMAP[$t];$tt++) {
$target .= $TARGET[$tt] . " ";
}
chop($target);
$frame .= "<xml translation=\"$target\"> x </xml> ";
}
$currently_included = 0;
}
$frame .= $FRAME_INPUT{$t} if defined $FRAME_INPUT{$t};
print "$TARGET_BITMAP[$t] $t ($start_t) $currently_included\n";
}
print $frame."\n-------------------------------------\n";
return ($frame,$rule_s,$rule_t,$rule_alignment,$rule_alignment_inv);
}
# Parse a "s-t s-t ..." word-alignment string into two lookup tables.
# Returns a hash with two entries:
#   's' => array ref indexed by source position; each element is a hash whose
#          keys are the target positions aligned to that source word
#   't' => the same thing in the target-to-source direction
sub create_alignment {
    my ($alignment_string) = @_;
    my (@by_source, @by_target);
    for my $link (split / /, $alignment_string) {
        my ($src_pos, $tgt_pos) = split /\-/, $link;
        $by_source[$src_pos]{$tgt_pos}++;
        $by_target[$tgt_pos]{$src_pos}++;
    }
    return ( 's' => \@by_source, 't' => \@by_target );
}

View File

@ -0,0 +1,27 @@
#include "SuffixArray.h"
using namespace std;
int main(int argc, char* argv[])
{
SuffixArray suffixArray( "/home/pkoehn/syntax/grammars/wmt09-de-en/corpus.1k.de" );
//suffixArray.List(10,20);
vector< string > der;
der.push_back("der");
vector< string > inDer;
inDer.push_back("in");
inDer.push_back("der");
vector< string > zzz;
zzz.push_back("zzz");
vector< string > derDer;
derDer.push_back("der");
derDer.push_back("der");
cout << "count of 'der' " << suffixArray.Count( der ) << endl;
cout << "limited count of 'der' " << suffixArray.MinCount( der, 2 ) << endl;
cout << "count of 'in der' " << suffixArray.Count( inDer ) << endl;
cout << "count of 'der der' " << suffixArray.Count( derDer ) << endl;
cout << "limited count of 'der der' " << suffixArray.MinCount( derDer, 1 ) << endl;
// cout << "count of 'zzz' " << suffixArray.Count( zzz ) << endl;
// cout << "limited count of 'zzz' " << suffixArray.LimitedCount( zzz, 1 ) << endl;
}

129
contrib/iSenWeb/index.html Executable file
View File

@ -0,0 +1,129 @@
<!DOCTYPE html>
<HTML>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>Moses Translation System</title>
<script type="text/javascript" src="jquery-1.7.2.js"></script>
<link href="./themes/styles/common.css" rel="stylesheet" type="text/css" />
<link href="./themes/styles/search.css" rel="stylesheet" type="text/css"/>
<link href="./themes/styles/fanyi.css" rel="stylesheet" type="text/css" />
</head>
<script language="javascript">
$(document).ready(function()
{
var targetDiv = $("#outputText");
var input = $("#inputText");
$("#transForm").submit(function()
{
$.ajax(
{
type: "POST", url: 'trans_result.php',data: {input1: input.val()},
complete: function(data)
{
targetDiv.html('');
targetDiv.append(data.responseText);
}
});
return false;
});
});
</script>
<body>
<div class="topWrap">
<div class="top">
<div class="logo"><a href="/" title="English Chinese Translation Based on Moses">Home</a></div>
</div>
<!-- top end -->
</div>
<div class="ConBox">
<div class="hd">
<div id="inputMod" class="column fl">
<div class="wrapper">
<!--
<form action="trans_result.php" method="post" id="transForm" name="transForm">-->
<form action="" method="post" id="transForm" name="transForm">
<div class="row desc">
Source Text:
<input type="reset" name="clear" value="Clear"/>
</div>
<div class="row border content">
<textarea id="inputText" class="text" dir="ltr" tabindex="1" wrap="SOFT" name="inputText"></textarea>
</div>
<div class="row">
<select>
<option value ="en-cn">English >> Chinese </option>
</select>
<input type="submit" value="Translation"/>
</div>
</form>
</div>
<!-- end of wrapper -->
</div>
<!-- end of div inputMod -->
<div id="outputMod" class="column fr">
<div class="wrapper">
<div id="translated" style="display: block;">
<div class="row desc"><span id="outputLang">en->ch</span></div>
<div class="row">
<div id="outputText" class="row">
<div class="translated_result">
</div>
</div>
</div>
</div>
<!-- end of entryList -->
<!-- end translated -->
</div>
<!-- end of wrapper -->
<div class="row cf" id="addons">
<a id="feedback_link" target="_blank" href="#" class="fr">Feedback</a>
<span id="suggestYou">
选择<a data-pos="web.o.leftbottom" class="clog-js" data-clog="FUFEI_CLICK" href="http://nlp2ct.sftw.umac.mo/" target="_blank">人工翻译服务</a>,获得更专业的翻译结果。
</span>
</div>
</div>
<div id="errorHolder"><span class="error_text"></span></div>
</div>
<div style="clear:both"></div>
<script type="text/javascript">
var global = {};
global.sessionFrom = "http://dict.youdao.com/";
</script>
<script type="text/javascript" src="http://impservice.dictweb.youdao.com/imp/dict_req_web_1.0.js"></script>
<script data-main="fanyi" type="text/javascript" src="./themes/fanyi/v2.1.3.1/scripts/fanyi.js"></script>
<div id="transBtnTip">
<div id="transBtnTipInner">
点击翻译按钮继续,查看网页翻译结果。
<p class="ar">
<a href="#" id="transBtnTipOK">I have known</a>
</p>
<b id="transBtnTipArrow"></b>
</div>
</div>
<div class="Feedback"><a href="http://nlp2ct.sftw.umac.mo/" target="_blank">反馈信息给我们</a></div>
<div class="footer" style="clear:both">
<p><a href="http://nlp2ct.sftw.umac.mo/" target="_blank">Connect with us</a> <span>|</span>
<a href="http://nlp2ct.sftw.umac.mo/" target="_blank">Moses Translation System</a> <span>|</span>
Copyright © &nbsp;&nbsp;2012 NLP2CT. All rights reserved by the Moses Group
</p>
<p>More</p>
</div>
</div>
</body>
</HTML>

9405
contrib/iSenWeb/jquery-1.7.2.js vendored Executable file

File diff suppressed because it is too large Load Diff

59
contrib/iSenWeb/moses.pl Executable file
View File

@ -0,0 +1,59 @@
#!/usr/bin/perl -w
use warnings;
use strict;
$|++;

# file: daemon.pl
#
# Herve Saint-Amand
# Universitaet des Saarlandes
# Tue May 13 19:45:31 2008
#
# This script starts Moses to run in the background, so that it can be used by
# the CGI script. It spawns the Moses process, then binds itself to listen on
# some port, and when it gets a connection, reads it line by line, feeds those
# to Moses, and sends back the translation.
#
# You can either run one instance of this on your Web server, or, if you have
# the hardware setup for it, run several instances of this, then configure
# translate.cgi to connect to these.

#------------------------------------------------------------------------------
# includes

use IO::Socket::INET;
use IPC::Open2;

#------------------------------------------------------------------------------
# constants, global vars, config

my $MOSES     = '/home/tianliang/research/moses-smt/scripts/training/model/moses';
my $MOSES_INI = '/home/tianliang/research/moses-smt/scripts/training/model/moses.ini';

die "usage: daemon.pl <hostname> <port>" unless (@ARGV == 2);
my $LISTEN_HOST = shift;
my $LISTEN_PORT = shift;

#------------------------------------------------------------------------------
# main

# spawn moses (open2 itself dies if the binary cannot be executed)
my ($MOSES_IN, $MOSES_OUT);
my $pid = open2 ($MOSES_OUT, $MOSES_IN, $MOSES, '-f', $MOSES_INI);

# open server socket; ReuseAddr lets the daemon be restarted immediately
# instead of failing with "address already in use"
my $server_sock = IO::Socket::INET->new
    (LocalAddr => $LISTEN_HOST, LocalPort => $LISTEN_PORT,
     Listen => 1, ReuseAddr => 1)
    or die "Can't bind server socket on $LISTEN_HOST:$LISTEN_PORT: $!";

while (my $client_sock = $server_sock->accept) {
    # one input line in -> one translated line back out
    while (my $line = <$client_sock>) {
        print $MOSES_IN $line;
        $MOSES_IN->flush ();
        print $client_sock scalar <$MOSES_OUT>;
    }
    $client_sock->close ();
}

#------------------------------------------------------------------------------

Binary file not shown.

After

Width:  |  Height:  |  Size: 53 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 266 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 23 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 55 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 53 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 24 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.6 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 958 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 6.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 28 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 6.2 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 10 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 34 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.9 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 10 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 8.5 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 501 B

Some files were not shown because too many files have changed in this diff Show More