mosesdecoder/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp

//
//  FuzzyMatchWrapper.cpp
//  moses
//
//  Created by Hieu Hoang on 26/07/2012.
//  Copyright 2012 __MyCompanyName__. All rights reserved.
//

#include <iostream>
#include "FuzzyMatchWrapper.h"
#include "SentenceAlignment.h"
#include "Match.h"
#include "create_xml.h"
#include "moses/Util.h"
#include "moses/StaticData.h"
#include "util/file.hh"

using namespace std;

namespace tmmt
{

// Builds the fuzzy-match translation memory.
//
// sourcePath     source side of the TM; indexed by a suffix array
// targetPath     target side, read by load_target()
// alignmentPath  word alignments, read by load_alignment()
//
// The source corpus is not stored separately: ExtractTM() obtains it from
// the suffix array via GetCorpus().
// NOTE(review): suffixArray is allocated with `new` and no matching delete is
// visible in this file -- confirm that the destructor releases it.
FuzzyMatchWrapper::FuzzyMatchWrapper(const std::string &sourcePath, const std::string &targetPath, const std::string &alignmentPath)
  :basic_flag(false)
  ,lsed_flag(true)
  ,refined_flag(true)
  ,length_filter_flag(true)
  ,parse_flag(true)
  ,min_match(70)
  ,multiple_flag(true)
  ,multiple_slack(0)
  ,multiple_max(100)
{
  // source side: indexed for substring lookup
  std::cerr << "creating suffix array" << std::endl;
  suffixArray = new tmmt::SuffixArray( sourcePath );

  // target side: creates one entry per line in targetAndAlignment
  std::cerr << "loading target data" << std::endl;
  load_target(targetPath, targetAndAlignment);

  // alignments: attached to the entries created by load_target()
  std::cerr << "loading alignment" << std::endl;
  load_alignment(alignmentPath, targetAndAlignment);

  std::cerr << "loading completed" << std::endl;
}

// Runs `cmd` through the shell via system() and reports a non-zero status on
// stderr. Execution continues either way, so callers keep their previous
// best-effort behaviour; the message only makes a failing step visible.
static void RunCommandOrWarn(const std::string &cmd)
{
  const int status = system(cmd.c_str());
  if (status != 0) {
    std::cerr << "WARNING: command returned status " << status << ": " << cmd << std::endl;
  }
}

// Builds a phrase table for the fuzzy matches of one input sentence.
//
// translationId  passed through to ExtractTM()
// dirNameStr     working directory; ExtractTM() reads "<dir>/in" and writes
//                "<dir>/fuzzyMatchFile"
//
// Returns the path "<fuzzyMatchFile>.pt.gz".
//
// Steps: ExtractTM() -> create_xml() -> sort + gzip of ".extract" and
// ".extract.inv" -> train-model.perl step 6 (scoring).
// NOTE(review): the paths are pasted into shell commands unquoted, so
// dirNameStr must not contain whitespace or shell metacharacters.
string FuzzyMatchWrapper::Extract(long translationId, const string &dirNameStr)
{
  const Moses::StaticData &staticData = Moses::StaticData::Instance();

  WordIndex wordIndex;

  string fuzzyMatchFile = ExtractTM(wordIndex, translationId, dirNameStr);

  // create extract files
  // (presumably produces the .extract / .extract.inv files sorted below)
  create_xml(fuzzyMatchFile);

  // create phrase table with usual Moses scoring and consolidate programs
  string cmd;
  cmd = "LC_ALL=C sort " + fuzzyMatchFile + ".extract | gzip -c > "
        + fuzzyMatchFile + ".extract.sorted.gz";
  RunCommandOrWarn(cmd);
  cmd = "LC_ALL=C sort " + fuzzyMatchFile + ".extract.inv | gzip -c > "
        + fuzzyMatchFile + ".extract.inv.sorted.gz";
  RunCommandOrWarn(cmd);

  // locate the Moses bin directory; the IDE builds use hard-coded paths
#ifdef IS_XCODE
  cmd = "/Users/hieuhoang/unison/workspace/github/moses-smt/bin";
#elif IS_ECLIPSE
  cmd = "/home/hieu/workspace/github/moses-smt/bin";
#else
  cmd = staticData.GetBinDirectory();
#endif

  cmd += string("/../scripts/training/train-model.perl -dont-zip -first-step 6 -last-step 6 -f en -e fr -hierarchical ")
         + " -extract-file " + fuzzyMatchFile + ".extract -lexical-file - -score-options \"--NoLex\" "
         + " -phrase-translation-table " + fuzzyMatchFile + ".pt";
  RunCommandOrWarn(cmd);


  return fuzzyMatchFile + ".pt.gz";
}

// Finds the translation-memory sentences that best match one input sentence
// and writes the corresponding extract records.
//
// wordIndex      scratch index of input words -> positions; filled by
//                init_short_matches() and read by add_short_matches()
// translationId  forwarded to the short-match helpers
// dirNameStr     working directory: the input is read from "<dir>/in" and
//                the result is written to "<dir>/fuzzyMatchFile"
//
// Returns the path of the written file ("<dir>/fuzzyMatchFile").
//
// Outline:
//   1. for every start position of the input, look up ever longer substrings
//      in the suffix array and remember the matching ranges;
//   2. turn the ranges into Match objects per TM sentence (longest first),
//      tightening best_cost whenever a match guarantees a lower cost;
//   3. validate each candidate TM sentence, either with parse_matches() or
//      with a plain word-level sed(), collecting the sentences whose cost
//      equals best_cost in best_tm;
//   4. write extract records: for all of best_tm when multiple_flag is set,
//      otherwise for the single best match.
string FuzzyMatchWrapper::ExtractTM(WordIndex &wordIndex, long translationId, const string &dirNameStr)
{
  const std::vector< std::vector< WORD_ID > > &source = suffixArray->GetCorpus();

  string inputPath = dirNameStr + "/in";
  string fuzzyMatchFile = dirNameStr + "/fuzzyMatchFile";
  ofstream fuzzyMatchStream(fuzzyMatchFile.c_str());

  vector< vector< WORD_ID > > input;
  load_corpus(inputPath, input);

  // exactly one input sentence is expected; only index 0 is processed
  assert(input.size() == 1);
  size_t sentenceInd = 0;

  clock_t start_clock = clock();
  // if (i % 10 == 0) cerr << ".";

  // establish some basic statistics

  // int input_length = compute_length( input[i] );
  int input_length = input[sentenceInd].size();
  // initial cost threshold derived from min_match (a percentage)
  int best_cost = input_length * (100-min_match) / 100 + 1;

  int match_count = 0; // how many substring matches to be considered
  //cerr << endl << "sentence " << i << ", length " << input_length << ", best_cost " << best_cost << endl;

  // find match ranges in suffix array
  // match_range[start][k] is the suffix-array range of the (k+1)-word
  // substring of the input that begins at position `start`
  vector< vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > > match_range;
  for(size_t start=0; start<input[sentenceInd].size(); start++) {
    SuffixArray::INDEX prior_first_match = 0;
    SuffixArray::INDEX prior_last_match = suffixArray->GetSize()-1;
    vector< string > substring;
    bool stillMatched = true;
    vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > matchedAtThisStart;
    //cerr << "start: " << start;
    // extend the substring one word at a time until it no longer occurs
    for(int word=start; stillMatched && word<input[sentenceInd].size(); word++) {
      substring.push_back( GetVocabulary().GetWord( input[sentenceInd][word] ) );

      // only look up, if needed (i.e. no unnecessary short gram lookups)
      //        if (! word-start+1 <= short_match_max_length( input_length ) )
      //      {
      SuffixArray::INDEX first_match, last_match;
      stillMatched = false;
      // the previous range is passed in so the search can be narrowed
      if (suffixArray->FindMatches( substring, first_match, last_match, prior_first_match, prior_last_match ) ) {
        stillMatched = true;
        matchedAtThisStart.push_back( make_pair( first_match, last_match ) );
        //cerr << " (" << first_match << "," << last_match << ")";
        //cerr << " " << ( last_match - first_match + 1 );
        prior_first_match = first_match;
        prior_last_match = last_match;
      }
      //}
    }
    //cerr << endl;
    match_range.push_back( matchedAtThisStart );
  }

  clock_t clock_range = clock();

  // matches grouped by TM sentence id, and total matched words per sentence
  map< int, vector< Match > > sentence_match;
  map< int, int > sentence_match_word_count;

  // go through all matches, longest first
  for(int length = input[sentenceInd].size(); length >= 1; length--) {
    // do not create matches, if these are handled by the short match function
    if (length <= short_match_max_length( input_length ) ) {
      continue;
    }

    unsigned int count = 0;
    for(int start = 0; start <= input[sentenceInd].size() - length; start++) {
      // a substring of this length starting here was found in the corpus
      if (match_range[start].size() >= length) {
        pair< SuffixArray::INDEX, SuffixArray::INDEX > &range = match_range[start][length-1];
        // cerr << " (" << range.first << "," << range.second << ")";
        count += range.second - range.first + 1;

        for(SuffixArray::INDEX i=range.first; i<=range.second; i++) {
          int position = suffixArray->GetPosition( i );

          // sentence length mismatch
          size_t sentence_id = suffixArray->GetSentence( position );
          int sentence_length = suffixArray->GetSentenceLength( sentence_id );
          int diff = abs( (int)sentence_length - (int)input_length );
          // cerr << endl << i << "\tsentence " << sentence_id << ", length " << sentence_length;
          //if (length <= 2 && input_length>=5 &&
          //    sentence_match.find( sentence_id ) == sentence_match.end())
          //  continue;

          // the length difference alone already exceeds the threshold
          if (diff > best_cost)
            continue;

          // compute minimal cost
          int start_pos = suffixArray->GetWordInSentence( position );
          int end_pos = start_pos + length-1;
          // cerr << endl << "\t" << start_pos << "-" << end_pos << " (" << sentence_length << ") vs. "
          // << start << "-" << (start+length-1) << " (" << input_length << ")";
          // different number of prior words -> cost is at least diff
          int min_cost = abs( start - start_pos );

          // same number of words, but not sent. start -> cost is at least 1
          if (start == start_pos && start>0)
            min_cost++;

          // different number of remaining words -> cost is at least diff
          min_cost += abs( ( sentence_length-1 - end_pos ) -
                           ( input_length-1 - (start+length-1) ) );

          // same number of words, but not sent. end -> cost is at least 1
          if ( sentence_length-1 - end_pos ==
               input_length-1 - (start+length-1)
               && end_pos != sentence_length-1 )
            min_cost++;

          // cerr << " -> min_cost " << min_cost;
          if (min_cost > best_cost)
            continue;

          // valid match
          match_count++;

          // compute maximal cost
          // (everything before and after the match counted as edits)
          int max_cost = max( start, start_pos )
                         + max( sentence_length-1 - end_pos,
                                input_length-1 - (start+length-1) );
          // cerr << ", max_cost " << max_cost;

          Match m = Match( start, start+length-1,
                           start_pos, start_pos+length-1,
                           min_cost, max_cost, 0);
          sentence_match[ sentence_id ].push_back( m );
          sentence_match_word_count[ sentence_id ] += length;

          // this match guarantees a cost of at most max_cost: tighten the
          // threshold; a cost of 0 cannot be improved, so stop searching
          if (max_cost < best_cost) {
            best_cost = max_cost;
            if (best_cost == 0) break;
          }
          //if (match_count >= MAX_MATCH_COUNT) break;
        }
      }
      // cerr << endl;
      if (best_cost == 0) break;
      //if (match_count >= MAX_MATCH_COUNT) break;
    }
    // cerr << count << " matches at length " << length << " in " << sentence_match.size() << " tm." << endl;

    if (best_cost == 0) break;
    //if (match_count >= MAX_MATCH_COUNT) break;
  }
  cerr << match_count << " matches in " << sentence_match.size() << " sentences." << endl;

  clock_t clock_matches = clock();

  // consider each sentence for which we have matches
  int old_best_cost = best_cost;
  int tm_count_word_match = 0;
  int tm_count_word_match2 = 0;
  int pruned_match_count = 0;
  // build the input word index once; add_short_matches() uses it per sentence
  if (short_match_max_length( input_length )) {
    init_short_matches(wordIndex, translationId, input[sentenceInd] );
  }
  // ids of the TM sentences whose cost equals the current best_cost
  vector< int > best_tm;
  typedef map< int, vector< Match > >::iterator I;

  clock_t clock_validation_sum = 0;

  for(I tm=sentence_match.begin(); tm!=sentence_match.end(); tm++) {
    int tmID = tm->first;
    int tm_length = suffixArray->GetSentenceLength(tmID);
    vector< Match > &match = tm->second;
    add_short_matches(wordIndex, translationId, match, source[tmID], input_length, best_cost );

    //cerr << "match in sentence " << tmID << ": " << match.size() << " [" << tm_length << "]" << endl;

    // quick look: how many words are matched
    int words_matched = 0;
    for(int m=0; m<match.size(); m++) {

      if (match[m].min_cost <= best_cost) // makes no difference
        words_matched += match[m].input_end - match[m].input_start + 1;
    }
    // too few matched words to reach best_cost: skip (if the filter is on)
    if (max(input_length,tm_length) - words_matched > best_cost) {
      if (length_filter_flag) continue;
    }
    tm_count_word_match++;

    // prune, check again how many words are matched
    vector< Match > pruned = prune_matches( match, best_cost );
    words_matched = 0;
    for(int p=0; p<pruned.size(); p++) {
      words_matched += pruned[p].input_end - pruned[p].input_start + 1;
    }
    if (max(input_length,tm_length) - words_matched > best_cost) {
      if (length_filter_flag) continue;
    }
    tm_count_word_match2++;

    pruned_match_count += pruned.size();
    int prior_best_cost = best_cost;
    int cost;

    clock_t clock_validation_start = clock();
    // validation: plain word-level edit distance when parsing is disabled
    // or when there are many matches, otherwise combine the matches
    if (! parse_flag ||
        pruned.size()>=10) { // to prevent worst cases
      string path;
      cost = sed( input[sentenceInd], source[tmID], path, false );
      // NOTE(review): unlike the parse branch below, best_tm is not cleared
      // here when best_cost drops -- confirm whether that is intended
      if (cost <  best_cost) {
        best_cost = cost;
      }
    }

    else {
      // parse_matches() may lower best_cost through its reference parameter
      cost = parse_matches( pruned, input_length, tm_length, best_cost );
      if (prior_best_cost != best_cost) {
        best_tm.clear();
      }
    }
    clock_validation_sum += clock() - clock_validation_start;
    if (cost == best_cost) {
      best_tm.push_back( tmID );
    }
  }
  cerr << "reduced best cost from " << old_best_cost << " to " << best_cost << endl;
  cerr << "tm considered: " << sentence_match.size()
       << " word-matched: " << tm_count_word_match
       << " word-matched2: " << tm_count_word_match2
       << " best: " << best_tm.size() << endl;

  // average number of pruned matches per sentence that passed both filters
  // (a float division; the divisor can be 0 when no sentence passed)
  cerr << "pruned matches: " << ((float)pruned_match_count/(float)tm_count_word_match2) << endl;

  // create xml and extract files
  // inputStr: the input words, each followed by a space
  string inputStr, sourceStr;
  for (size_t pos = 0; pos < input_length; ++pos) {
    inputStr += GetVocabulary().GetWord(input[sentenceInd][pos]) + " ";
  }

  // do not try to find the best ... report multiple matches
  if (multiple_flag) {
    int input_letter_length = compute_length( input[sentenceInd] );
    for(int si=0; si<best_tm.size(); si++) {
      int s = best_tm[si];
      string path;
      // called for its edit path; the returned letter cost is not used
      unsigned int letter_cost = sed( input[sentenceInd], source[s], path, true );
      // do not report multiple identical sentences, but just their count
      //cout << sentenceInd << " "; // sentence number
      //cout << letter_cost << "/" << input_letter_length << " ";
      //cout << "(" << best_cost <<"/" << input_length <<") ";
      //cout << "||| " << s << " ||| " << path << endl;

      const vector<WORD_ID> &sourceSentence = source[s];
      vector<SentenceAlignment> &targets = targetAndAlignment[s];
      create_extract(sentenceInd, best_cost, sourceSentence, targets, inputStr, path, fuzzyMatchStream);

    }
  } // if (multiple_flag)
  else {

    // find the best matches according to letter sed
    string best_path = "";
    int best_match = -1;
    int best_letter_cost;
    if (lsed_flag) {
      // initial letter-cost threshold derived from min_match
      best_letter_cost = compute_length( input[sentenceInd] ) * min_match / 100 + 1;
      for(int si=0; si<best_tm.size(); si++) {
        int s = best_tm[si];
        string path;
        unsigned int letter_cost = sed( input[sentenceInd], source[s], path, true );
        if (letter_cost < best_letter_cost) {
          best_letter_cost = letter_cost;
          best_path = path;
          best_match = s;
        }
      }
    }
    // if letter sed turned off, just compute path for first match
    else {
      if (best_tm.size() > 0) {
        string path;
        sed( input[sentenceInd], source[best_tm[0]], path, false );
        best_path = path;
        best_match = best_tm[0];
      }
    }
    // timing report, scaled from clock ticks by 1000 / CLOCKS_PER_SEC
    // (only printed on this single-best path)
    cerr << "elapsed: " << (1000 * (clock()-start_clock) / CLOCKS_PER_SEC)
         << " ( range: " << (1000 * (clock_range-start_clock) / CLOCKS_PER_SEC)
         << " match: " << (1000 * (clock_matches-clock_range) / CLOCKS_PER_SEC)
         << " tm: " << (1000 * (clock()-clock_matches) / CLOCKS_PER_SEC)
         << " (validation: " << (1000 * (clock_validation_sum) / CLOCKS_PER_SEC) << ")"
         << " )" << endl;
    if (lsed_flag) {
      //cout << best_letter_cost << "/" << compute_length( input[sentenceInd] ) << " (";
    }
    //cout << best_cost <<"/" << input_length;
    if (lsed_flag) {
      //cout << ")";
    }
    //cout << " ||| " << best_match << " ||| " << best_path << endl;

    // nothing matched: fall back to the first TM sentence (with an empty path)
    if (best_match == -1) {
      UTIL_THROW_IF(source.size() == 0, util::Exception, "Empty source phrase");
      best_match = 0;
    }

    // create xml & extracts
    const vector<WORD_ID> &sourceSentence = source[best_match];
    vector<SentenceAlignment> &targets = targetAndAlignment[best_match];
    create_extract(sentenceInd, best_cost, sourceSentence, targets, inputStr, best_path, fuzzyMatchStream);

  } // else if (multiple_flag)

  fuzzyMatchStream.close();

  return fuzzyMatchFile;
}

void FuzzyMatchWrapper::load_corpus( const std::string &fileName, vector< vector< WORD_ID > > &corpus )
{
  // source
  ifstream fileStream;
  fileStream.open(fileName.c_str());
  if (!fileStream) {
    cerr << "file not found: " << fileName << endl;
    exit(1);
  }
  cerr << "loading " << fileName << endl;

  istream *fileStreamP = &fileStream;

  char line[LINE_MAX_LENGTH];
  while(true) {
    SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n');
    if (fileStreamP->eof()) break;
    corpus.push_back( GetVocabulary().Tokenize( line ) );
  }
}

void FuzzyMatchWrapper::load_target(const std::string &fileName, vector< vector< SentenceAlignment > > &corpus)
{
  ifstream fileStream;
  fileStream.open(fileName.c_str());
  if (!fileStream) {
    cerr << "file not found: " << fileName << endl;
    exit(1);
  }
  cerr << "loading " << fileName << endl;

  istream *fileStreamP = &fileStream;

  WORD_ID delimiter = GetVocabulary().StoreIfNew("|||");

  int lineNum = 0;
  char line[LINE_MAX_LENGTH];
  while(true) {
    SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n');
    if (fileStreamP->eof()) break;

    vector<WORD_ID> toks = GetVocabulary().Tokenize( line );

    corpus.push_back(vector< SentenceAlignment >());
    vector< SentenceAlignment > &vec = corpus.back();

    vec.push_back(SentenceAlignment());
    SentenceAlignment *sentence = &vec.back();

    const WORD &countStr = GetVocabulary().GetWord(toks[0]);
    sentence->count = atoi(countStr.c_str());

    for (size_t i = 1; i < toks.size(); ++i) {
      WORD_ID wordId = toks[i];

      if (wordId == delimiter) {
        // target and alignments can have multiple sentences.
        vec.push_back(SentenceAlignment());
        sentence = &vec.back();

        // count
        ++i;

        const WORD &countStr = GetVocabulary().GetWord(toks[i]);
        sentence->count = atoi(countStr.c_str());
      } else {
        // just a normal word, add
        sentence->target.push_back(wordId);
      }
    }

    ++lineNum;

  }

}


void FuzzyMatchWrapper::load_alignment(const std::string &fileName, vector< vector< SentenceAlignment > > &corpus )
{
  ifstream fileStream;
  fileStream.open(fileName.c_str());
  if (!fileStream) {
    cerr << "file not found: " << fileName << endl;
    exit(1);
  }
  cerr << "loading " << fileName << endl;

  istream *fileStreamP = &fileStream;

  string delimiter = "|||";

  int lineNum = 0;
  char line[LINE_MAX_LENGTH];
  while(true) {
    SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n');
    if (fileStreamP->eof()) break;

    vector< SentenceAlignment > &vec = corpus[lineNum];
    size_t targetInd = 0;
    SentenceAlignment *sentence = &vec[targetInd];

    vector<string> toks = Moses::Tokenize(line);

    for (size_t i = 0; i < toks.size(); ++i) {
      string &tok = toks[i];

      if (tok == delimiter) {
        // target and alignments can have multiple sentences.
        ++targetInd;
        sentence = &vec[targetInd];

        ++i;
      } else {
        // just a normal alignment, add
        vector<int> alignPoint = Moses::Tokenize<int>(tok, "-");
        assert(alignPoint.size() == 2);
        sentence->alignment.push_back(pair<int,int>(alignPoint[0], alignPoint[1]));
      }
    }

    ++lineNum;

  }
}

bool FuzzyMatchWrapper::GetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, unsigned int &value) const
{
#ifdef WITH_THREADS
  boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
#endif
  map< pair< WORD_ID, WORD_ID >, unsigned int >::const_iterator lookup = m_lsed.find( key );
  if (lookup != m_lsed.end()) {
    value = lookup->second;
    return true;
  }

  return false;
}

// Stores (or overwrites) the cached letter edit distance for a pair of word
// ids. With WITH_THREADS defined, the update runs under a unique lock on
// m_accessLock.
void FuzzyMatchWrapper::SetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, const unsigned int &value)
{
#ifdef WITH_THREADS
  boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
#endif
  // operator[] creates the entry if it does not exist yet
  unsigned int &slot = m_lsed[ key ];
  slot = value;
}

/* Letter string edit distance, e.g. sub 'their' to 'there' costs 2.

   Computes the edit distance between the surface strings of two vocabulary
   words, where inserting, deleting or substituting one character costs 1.
   Results are cached per (aIdx, bIdx) pair via GetLSEDCache/SetLSEDCache.

   The table is held in two std::vector rows instead of a calloc'ed matrix:
   allocation failure can no longer lead to a null dereference, nothing has
   to be freed by hand, and memory use is linear in the length of b. */

unsigned int FuzzyMatchWrapper::letter_sed( WORD_ID aIdx, WORD_ID bIdx )
{
  // check if already computed -> lookup in cache
  pair< WORD_ID, WORD_ID > pIdx = make_pair( aIdx, bIdx );
  unsigned int cached;
  if (GetLSEDCache(pIdx, cached)) {
    return cached;
  }

  // get surface strings for word indices
  const string &a = GetVocabulary().GetWord( aIdx );
  const string &b = GetVocabulary().GetWord( bIdx );
  const size_t lenA = a.size();
  const size_t lenB = b.size();

  // prev = row i-1 of the cost matrix, curr = row i
  vector< unsigned int > prev( lenB+1 );
  vector< unsigned int > curr( lenB+1 );

  // row 0: turning the empty prefix of a into the first j letters of b
  for( size_t j=0; j<=lenB; j++ ) {
    prev[j] = j;
  }

  // core string edit distance loop
  for( size_t i=1; i<=lenA; i++ ) {
    curr[0] = i;
    for( size_t j=1; j<=lenB; j++ ) {
      const unsigned int ins = prev[j] + 1;
      const unsigned int del = curr[j-1] + 1;
      const unsigned int diag = prev[j-1] + ((a[i-1] == b[j-1]) ? 0 : 1);

      unsigned int best = (ins < del) ? ins : del;
      if (diag < best) {
        best = diag;
      }
      curr[j] = best;
    }
    // the row just filled becomes the previous row of the next iteration
    prev.swap( curr );
  }

  // after the last swap (or with an empty a) the final row is in prev
  const unsigned int result = prev[lenB];

  // cache and return result
  SetLSEDCache(pIdx, result);
  return result;
}

/* string edit distance implementation */

unsigned int FuzzyMatchWrapper::sed( const vector< WORD_ID > &a, const vector< WORD_ID > &b, string &best_path, bool use_letter_sed )
{

  // initialize cost and path matrices
  unsigned int **cost  = (unsigned int**) calloc( sizeof( unsigned int* ), a.size()+1 );
  char **path = (char**) calloc( sizeof( char* ), a.size()+1 );

  for( unsigned int i=0; i<=a.size(); i++ ) {
    cost[i] = (unsigned int*) calloc( sizeof(unsigned int), b.size()+1 );
    path[i] = (char*) calloc( sizeof(char), b.size()+1 );
    if (i>0) {
      cost[i][0] = cost[i-1][0];
      if (use_letter_sed) {
        cost[i][0] += GetVocabulary().GetWord( a[i-1] ).size();
      } else {
        cost[i][0]++;
      }
    } else {
      cost[i][0] = 0;
    }
    path[i][0] = 'I';
  }

  for( unsigned int j=0; j<=b.size(); j++ ) {
    if (j>0) {
      cost[0][j] = cost[0][j-1];
      if (use_letter_sed) {
        cost[0][j] +=	GetVocabulary().GetWord( b[j-1] ).size();
      } else {
        cost[0][j]++;
      }
    } else {
      cost[0][j] = 0;
    }
    path[0][j] = 'D';
  }

  // core string edit distance algorithm
  for( unsigned int i=1; i<=a.size(); i++ ) {
    for( unsigned int j=1; j<=b.size(); j++ ) {
      unsigned int ins = cost[i-1][j];
      unsigned int del = cost[i][j-1];
      unsigned int match;
      if (use_letter_sed) {
        ins += GetVocabulary().GetWord( a[i-1] ).size();
        del += GetVocabulary().GetWord( b[j-1] ).size();
        match = letter_sed( a[i-1], b[j-1] );
      } else {
        ins++;
        del++;
        match = ( a[i-1] == b[j-1] ) ? 0 : 1;
      }
      unsigned int diag = cost[i-1][j-1] + match;

      char action = (ins < del) ? 'I' : 'D';
      unsigned int min = (ins < del) ? ins : del;
      if (diag < min) {
        action = (match>0) ? 'S' : 'M';
        min = diag;
      }

      cost[i][j] = min;
      path[i][j] = action;
    }
  }

  // construct string for best path
  unsigned int i = a.size();
  unsigned int j = b.size();
  best_path = "";
  while( i>0 || j>0 ) {
    best_path = path[i][j] + best_path;
    if (path[i][j] == 'I') {
      i--;
    } else if (path[i][j] == 'D') {
      j--;
    } else {
      i--;
      j--;
    }
  }


  // clear out memory
  unsigned int final = cost[a.size()][b.size()];

  for( unsigned int i=0; i<=a.size(); i++ ) {
    free( cost[i] );
    free( path[i] );
  }
  free( cost );
  free( path );

  // return result
  return final;
}

/* utlility function: compute length of sentence in characters
 (spaces do not count) */

unsigned int FuzzyMatchWrapper::compute_length( const vector< WORD_ID > &sentence )
{
  unsigned int length = 0;
  for( unsigned int i=0; i<sentence.size(); i++ ) {
    length += GetVocabulary().GetWord( sentence[i] ).size();
  }
  return length;
}

/* brute force method: compare input to all corpus sentences

   For every input sentence, computes the word-level edit distance to every
   corpus sentence (skipping those whose length difference already reaches
   the current best cost when length_filter_flag is set) and tracks the best
   match. The per-sentence result is currently not reported anywhere (the
   output statement is commented out).

   Always returns 0. The original fell off the end of this int function,
   which is undefined behaviour; the returned value carries no information. */

int FuzzyMatchWrapper::basic_fuzzy_match( vector< vector< WORD_ID > > source,
    vector< vector< WORD_ID > > input )
{
  // go through input set...
  for(unsigned int i=0; i<input.size(); i++) {
    bool use_letter_sed = false;

    // compute sentence length and worst allowed cost
    unsigned int input_length;
    if (use_letter_sed) {
      input_length = compute_length( input[i] );
    } else {
      input_length = input[i].size();
    }
    unsigned int best_cost = input_length * (100-min_match) / 100 + 2;
    string best_path = "";
    int best_match = -1;

    // go through all corpus sentences
    for(unsigned int s=0; s<source.size(); s++) {
      int source_length;
      if (use_letter_sed) {
        source_length = compute_length( source[s] );
      } else {
        source_length = source[s].size();
      }
      // abs() makes diff non-negative, so the unsigned comparison is exact
      int diff = abs((int)source_length - (int)input_length);
      if (length_filter_flag && ((unsigned int)diff >= best_cost)) {
        continue;
      }

      // compute string edit distance
      string path;
      unsigned int cost = sed( input[i], source[s], path, use_letter_sed );

      // update if new best
      if (cost < best_cost) {
        best_cost = cost;
        best_path = path;
        best_match = s;
      }
    }
    //cout << best_cost << " ||| " << best_match << " ||| " << best_path << endl;
  }

  return 0;
}

/* definition of short matches
 very short n-gram matches (1-grams) will not be looked up in
 the suffix array, since there are too many matches
 and for longer sentences, at least one 2-gram match must occur

 returns 1 when refined_flag is set and the input has at least 5 words,
 otherwise 0 (no match length is treated as "short") */

int FuzzyMatchWrapper::short_match_max_length( int input_length )
{
  const bool treat_unigrams_as_short = refined_flag && input_length >= 5;
  return treat_unigrams_as_short ? 1 : 0;
}


/* if we have non-short matches in a sentence, we need to
 take a closer look at it.
 this function rebuilds wordIndex as a map from each input word to the
 list of positions at which it occurs in the input
 (to be used by add_short_matches)
 (done here, because this has to be done only once for an input sentence)
 nothing is touched when short matches are disabled for this input length */

void FuzzyMatchWrapper::init_short_matches(WordIndex &wordIndex, long translationId, const vector< WORD_ID > &input )
{
  if (short_match_max_length( input.size() ) == 0) {
    return;
  }

  wordIndex.clear();

  // operator[] creates an empty position list on first sight of a word
  const int input_size = input.size();
  for(int pos=0; pos<input_size; pos++) {
    wordIndex[ input[pos] ].push_back( pos );
  }
}

/* add all short matches to list of matches for a sentence

 for every word of the TM sentence that also occurs in the input (according
 to wordIndex), a one-word Match is appended for each input position of
 that word, provided its minimal cost does not exceed best_cost.
 does nothing when short matches are disabled for this input length */

void FuzzyMatchWrapper::add_short_matches(WordIndex &wordIndex, long translationId, vector< Match > &match, const vector< WORD_ID > &tm, int input_length, int best_cost )
{
  if (short_match_max_length( input_length ) == 0) {
    return;
  }

  const int tm_length = tm.size();
  for(int t_pos=0; t_pos<tm_length; t_pos++) {
    const map< WORD_ID,vector< int > >::iterator hit = wordIndex.find( tm[t_pos] );
    if (hit == wordIndex.end()) {
      continue; // TM word does not occur in the input
    }

    const vector< int > &input_positions = hit->second;
    for(size_t k=0; k<input_positions.size(); k++) {
      const int i_pos = input_positions[k];
      const int input_rest = input_length - i_pos;
      const int tm_rest = tm_length - t_pos;

      // before match
      int max_cost = max( i_pos , t_pos );
      int min_cost = abs( i_pos - t_pos );
      if ( i_pos>0 && i_pos == t_pos ) {
        min_cost++;
      }

      // after match
      max_cost += max( input_rest , tm_rest );
      min_cost += abs( input_rest - tm_rest );
      if ( i_pos != input_length-1 && input_rest == tm_rest ) {
        min_cost++;
      }

      if (min_cost > best_cost) {
        continue; // cannot be part of a good enough alignment
      }
      match.push_back( Match( i_pos,i_pos, t_pos,t_pos, min_cost,max_cost,0 ) );
    }
  }
}

/* remove matches that are subsumed by a larger match

 a match is dropped when some other match is at least as long (on the input
 side) and shares either its start (input and tm) or its end (input and tm),
 or when its min_cost exceeds best_cost. survivors are returned in reverse
 order of the input vector.
 NOTE(review): because the length test is "<=", two distinct matches of equal
 length sharing a start or an end subsume each other -- confirm intended. */

vector< Match > FuzzyMatchWrapper::prune_matches( const vector< Match > &match, int best_cost )
{
  vector< Match > kept;
  const int total = match.size();

  for(int cand=total-1; cand>=0; cand--) {
    const Match &c = match[cand];

    // stop scanning as soon as one subsuming match has been found
    bool subsumed = false;
    for(int other=total-1; other>=0 && !subsumed; other--) {
      if (other == cand) {
        continue; // do not compare match with itself
      }
      const Match &o = match[other];

      const bool not_longer =
        (c.input_end - c.input_start) <= (o.input_end - o.input_start);
      const bool same_start =
        c.input_start == o.input_start && c.tm_start == o.tm_start;
      const bool same_end =
        c.input_end == o.input_end && c.tm_end == o.tm_end;

      subsumed = not_longer && (same_start || same_end);
    }

    if (subsumed || c.min_cost > best_cost) {
      continue;
    }
    kept.push_back( c );
  }
  return kept;
}

/* A* parsing method to compute string edit distance

 Combines the given matches bottom-up into larger, non-overlapping and
 monotone chains and returns the lowest max_cost found among the single
 matches and all stored combinations.

 match         matches between the input and one TM sentence
 input_length  number of words of the input sentence
 tm_length     number of words of the TM sentence
 best_cost     in/out: combinations whose min_cost exceeds it are discarded;
               lowered whenever a combination with a smaller max_cost is found

 Special cases: with no matches the result is input_length+tm_length, with
 exactly one match it is that match's max_cost (best_cost is not updated in
 either case). */

int FuzzyMatchWrapper::parse_matches( vector< Match > &match, int input_length, int tm_length, int &best_cost )
{
  // cerr << "sentence has " << match.size() << " matches, best cost: " << best_cost << ", lengths input: " << input_length << " tm: " << tm_length << endl;

  if (match.size() == 1)
    return match[0].max_cost;
  if (match.size() == 0)
    return input_length+tm_length;

  // start from the best single match
  int this_best_cost = input_length + tm_length;
  for(int i=0; i<match.size(); i++) {
    this_best_cost = min( this_best_cost, match[i].max_cost );
  }
  // cerr << "\tthis best cost: " << this_best_cost << endl;

  // bottom up combination of spans
  // multi_match[k] holds the combinations built from k+1 original matches
  vector< vector< Match > > multi_match;
  multi_match.push_back( match );

  int match_level = 1;
  // keep building levels until a level comes out empty
  while(multi_match[ match_level-1 ].size()>0) {
    // init vector
    vector< Match > empty;
    multi_match.push_back( empty );

    // pair every level with its complement so that the two levels together
    // contain match_level+1 original matches
    for(int first_level = 0; first_level <= (match_level-1)/2; first_level++) {
      int second_level = match_level - first_level -1;
      //cerr << "\tcombining level " << first_level << " and " << second_level << endl;

      vector< Match > &first_match  = multi_match[ first_level ];
      vector< Match > &second_match = multi_match[ second_level ];

      for(int i1 = 0; i1 < first_match.size(); i1++) {
        for(int i2 = 0; i2 < second_match.size(); i2++) {

          // do not combine the same pair twice
          if (first_level == second_level && i2 <= i1) {
            continue;
          }

          // get sorted matches (first is before second)
          Match *first, *second;
          if (first_match[i1].input_start < second_match[i2].input_start ) {
            first = &first_match[i1];
            second = &second_match[i2];
          } else {
            second = &first_match[i1];
            first = &second_match[i2];
          }

          //cerr << "\tcombining "
          //     << "(" << first->input_start << "," << first->input_end << "), "
          //     << first->tm_start << " [" << first->internal_cost << "]"
          //     << " with "
          //     << "(" << second->input_start << "," << second->input_end << "), "
          //     << second->tm_start<< " [" << second->internal_cost << "]"
          //     << endl;

          // do not process overlapping matches
          if (first->input_end >= second->input_start) {
            continue;
          }

          // no overlap / mismatch in tm
          if (first->tm_end >= second->tm_start) {
            continue;
          }

          // compute cost
          int min_cost = 0;
          int max_cost = 0;

          // initial
          min_cost += abs( first->input_start - first->tm_start );
          max_cost += max( first->input_start, first->tm_start );

          // same number of words, but not sent. start -> cost is at least 1
          if (first->input_start == first->tm_start && first->input_start > 0) {
            min_cost++;
          }

          // in-between
          // the gap between the two parts costs the larger of the two gap
          // sizes, on top of the costs already inside each part
          int skipped_words = second->input_start - first->input_end -1;
          int skipped_words_tm = second->tm_start - first->tm_end -1;
          int internal_cost = max( skipped_words, skipped_words_tm );
          internal_cost += first->internal_cost + second->internal_cost;
          min_cost += internal_cost;
          max_cost += internal_cost;

          // final
          min_cost += abs( (tm_length-1 - second->tm_end) -
                           (input_length-1 - second->input_end) );
          max_cost += max( (tm_length-1 - second->tm_end),
                           (input_length-1 - second->input_end) );

          // same number of words, but not sent. end -> cost is at least 1
          if ( ( input_length-1 - second->input_end
                 == tm_length-1 - second->tm_end )
               && input_length-1 != second->input_end ) {
            min_cost++;
          }

          // cerr << "\tcost: " << min_cost << "-" << max_cost << endl;

          // if worse than best cost, forget it
          if (min_cost > best_cost) {
            continue;
          }

          // add match
          Match new_match( first->input_start,
                           second->input_end,
                           first->tm_start,
                           second->tm_end,
                           min_cost,
                           max_cost,
                           internal_cost);
          multi_match[ match_level ].push_back( new_match );
          // cerr << "\tstored\n";

          // possibly updating this_best_cost
          if (max_cost < this_best_cost) {
            // cerr << "\tupdating this best cost to " << max_cost << "\n";
            this_best_cost = max_cost;

            // possibly updating best_cost
            if (max_cost < best_cost) {
              // cerr << "\tupdating best cost to " << max_cost << "\n";
              best_cost = max_cost;
            }
          }
        }
      }
    }
    match_level++;
  }
  return this_best_cost;
}


void FuzzyMatchWrapper::create_extract(int sentenceInd, int cost, const vector< WORD_ID > &sourceSentence, const vector<SentenceAlignment> &targets, const string &inputStr, const string  &path, ofstream &outputFile)
{
  string sourceStr;
  for (size_t pos = 0; pos < sourceSentence.size(); ++pos) {
    WORD_ID wordId = sourceSentence[pos];
    sourceStr += GetVocabulary().GetWord(wordId) + " ";
  }

  for (size_t targetInd = 0; targetInd < targets.size(); ++targetInd) {
    const SentenceAlignment &sentenceAlignment = targets[targetInd];
    string targetStr = sentenceAlignment.getTargetString(GetVocabulary());
    string alignStr = sentenceAlignment.getAlignmentString();

    outputFile
        << sentenceInd << endl
        << cost << endl
        << sourceStr << endl
        << inputStr << endl
        << targetStr << endl
        << alignStr << endl
        << path << endl
        << sentenceAlignment.count << endl;

  }
}

} // namespace