mosesdecoder/moses/src/RuleTable/PhraseDictionaryFuzzyMatch.cpp

// vim:tabstop=2

/***********************************************************************
 Moses - factored phrase-based language decoder
 Copyright (C) 2006 University of Edinburgh
 
 This library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Lesser General Public
 License as published by the Free Software Foundation; either
 version 2.1 of the License, or (at your option) any later version.
 
 This library is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 Lesser General Public License for more details.
 
 You should have received a copy of the GNU Lesser General Public
 License along with this library; if not, write to the Free Software
 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 ***********************************************************************/

#include <fstream>
#include <string>
#include <iterator>
#include <algorithm>
#include "RuleTable/Loader.h"
#include "RuleTable/LoaderFactory.h"
#include "PhraseDictionaryFuzzyMatch.h"
#include "FactorCollection.h"
#include "Word.h"
#include "Util.h"
#include "InputFileStream.h"
#include "StaticData.h"
#include "WordsRange.h"
#include "UserMessage.h"
#include "util/file.hh"
#include "CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.h"

using namespace std;

namespace Moses
{

  // Constructor.
  //
  // Both arguments are forwarded unchanged to the PhraseDictionary base class.
  // The CHECK below fails unless the decoder reports a thread count of exactly 1.
  PhraseDictionaryFuzzyMatch::PhraseDictionaryFuzzyMatch(size_t numScoreComponents,
                            PhraseDictionaryFeature* feature)
  : PhraseDictionary(numScoreComponents, feature)
  {
    CHECK(StaticData::Instance().ThreadCount() == 1);
  }

  bool PhraseDictionaryFuzzyMatch::Load(const std::vector<FactorType> &input
            , const std::vector<FactorType> &output
            , const std::string &initStr
            , const std::vector<float> &weight
            , size_t tableLimit,
            const LMList& languageModels,
            const WordPenaltyProducer* wpProducer)
  {
    m_languageModels = &(languageModels);
    m_wpProducer = wpProducer;
    m_tableLimit = tableLimit;
    m_input		= &input;
    m_output	= &output;
    
    m_weight = &weight;
   
    cerr << "initStr=" << initStr << endl;
    m_config = Tokenize(initStr, ";");
    assert(m_config.size() == 3);

    m_FuzzyMatchWrapper = new tmmt::FuzzyMatchWrapper(m_config[0], m_config[1], m_config[2]);
    
    return true;
  }
    
  // Allocate (with new) a ChartRuleLookupManagerMemoryPerSentence bound to the
  // given sentence, cell collection and this dictionary, and return it as a raw
  // pointer. Nothing in this function frees it; presumably the caller takes
  // ownership -- confirm against the call site.
  ChartRuleLookupManager *PhraseDictionaryFuzzyMatch::CreateRuleLookupManager(
                                                                        const InputType &sentence,
                                                                        const ChartCellCollection &cellCollection)
  {
    ChartRuleLookupManager *manager =
      new ChartRuleLookupManagerMemoryPerSentence(sentence, cellCollection, *this);
    return manager;
  }
    
  // Build the per-sentence rule trie for inputSentence.
  //
  //  1. Write the words of inputSentence, excluding the first and the last
  //     position, to a temporary file.
  //  2. Pass that file to m_FuzzyMatchWrapper->Extract(), which returns the
  //     name of a phrase-table file.
  //  3. Parse the phrase-table file line by line. Fields are separated by
  //     "|||": source, target, scores, alignment (a fifth field is accepted
  //     and ignored). Each rule is inserted into the trie stored at
  //     m_collection[translation id of inputSentence].
  //  4. Sort/prune the trie and delete both temporary files.
  //
  // A line with the wrong number of fields or scores registers a user message
  // and calls abort(). Parsing logic is adapted from class LoaderStandard.
  void PhraseDictionaryFuzzyMatch::InitializeForInput(InputType const& inputSentence)
  {
    util::TempMaker tempFile("moses");
    // Name() fills in `alive`; presumably this keeps the temp file's descriptor
    // open until the end of this function -- confirm in util/file.hh.
    util::scoped_fd alive;
    string inFileName(tempFile.Name(alive));
    
    ofstream inFile(inFileName.c_str());
    
    // Positions 0 and size-1 are deliberately skipped (presumably the sentence
    // boundary markers -- confirm). Written as "i + 1 < size" so that an empty
    // input cannot wrap the unsigned "size - 1" around to a huge value.
    const size_t inputSize = inputSentence.GetSize();
    for (size_t i = 1; i + 1 < inputSize; ++i)
    {
      inFile << inputSentence.GetWord(i);
    }
    inFile << endl;
    inFile.close();
        
    string ptFileName = m_FuzzyMatchWrapper->Extract(inFileName);

    // populate with rules for this sentence
    long translationId = inputSentence.GetTranslationId();

    PhraseDictionaryNodeSCFG &rootNode = m_collection[translationId];
        
    // data from file
    InputFileStream inStream(ptFileName);
    
    // copied from class LoaderStandard
    PrintUserTime("Start loading fuzzy-match phrase model");
    
    const StaticData &staticData = StaticData::Instance();
    const std::string& factorDelimiter = staticData.GetFactorDelimiter();
    
    string line;
    size_t count = 0; // 1-based number of the line currently being parsed
    
    while(getline(inStream, line)) {
      // Count every line read, including ones skipped below, so diagnostics
      // point at the real line in ptFileName.
      ++count;
      
      vector<string> tokens;
      vector<float> scoreVector;
      
      TokenizeMultiCharSeparator(tokens, line , "|||" );
      
      if (tokens.size() != 4 && tokens.size() != 5) {
        stringstream strme;
        strme << "Syntax error at " << ptFileName << ":" << count;
        UserMessage::Add(strme.str());
        abort();
      }
      
      const string &sourcePhraseString = tokens[0]
      , &targetPhraseString = tokens[1]
      , &scoreString        = tokens[2]
      , &alignString        = tokens[3];
      
      // NOTE(review): the test is on the SOURCE side although the message says
      // "empty target"; message text left unchanged in case logs are grepped.
      bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
      if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
        TRACE_ERR( ptFileName << ":" << count << ": pt entry contains empty target, skipping\n");
        continue;
      }
      
      Tokenize<float>(scoreVector, scoreString);
      const size_t numScoreComponents = GetFeature()->GetNumScoreComponents();
      if (scoreVector.size() != numScoreComponents) {
        stringstream strme;
        strme << "Size of scoreVector != number (" << scoreVector.size() << "!="
        << numScoreComponents << ") of score components on line " << count;
        UserMessage::Add(strme.str());
        abort();
      }
      
      // parse source & find pt node
      
      // constituent labels
      Word sourceLHS, targetLHS;
      
      // source
      Phrase sourcePhrase( 0);
      sourcePhrase.CreateFromStringNewFormat(Input, *m_input, sourcePhraseString, factorDelimiter, sourceLHS);
      
      // create target phrase obj; the pointer is handed to phraseColl.Add() below
      TargetPhrase *targetPhrase = new TargetPhrase(Output);
      targetPhrase->CreateFromStringNewFormat(Output, *m_output, targetPhraseString, factorDelimiter, targetLHS);
      
      // rest of target phrase
      targetPhrase->SetAlignmentInfo(alignString);
      targetPhrase->SetTargetLHS(targetLHS);
      
      // component score, for n-best output
      std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),TransformScore);
      std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore);
      
      targetPhrase->SetScoreChart(GetFeature(), scoreVector, *m_weight, *m_languageModels, m_wpProducer);
      
      TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(rootNode, sourcePhrase, *targetPhrase, sourceLHS);
      phraseColl.Add(targetPhrase);
    }
    
    // sort and prune each target phrase collection
    SortAndPrune(rootNode);
   
    // both temporary files are only needed for the duration of this call
    remove(ptFileName.c_str());
    remove(inFileName.c_str());
  }
  
  // Locate (creating it if necessary) the trie node for the given source side
  // beneath rootNode, and return that node's target phrase collection, which is
  // likewise created on demand. All arguments are passed straight through to
  // GetOrCreateNode().
  TargetPhraseCollection &PhraseDictionaryFuzzyMatch::GetOrCreateTargetPhraseCollection(PhraseDictionaryNodeSCFG &rootNode
                                                                                  , const Phrase &source
                                                                                  , const TargetPhrase &target
                                                                                  , const Word &sourceLHS)
  {
    return GetOrCreateNode(rootNode, source, target, sourceLHS).GetOrCreateTargetPhraseCollection();
  }

  // Walk the trie from rootNode along the words of `source`, creating child
  // nodes as needed, and return the node reached after the last word.
  //
  // Terminals are keyed by the source word alone. Non-terminals are keyed by
  // the (source word, aligned target word) pair; the aligned target position is
  // taken from the alignment info of `target`, consumed in iteration order, and
  // each entry consumed must refer to the current source position.
  //
  // sourceLHS is currently unused: the step that would add it as a final child
  // is disabled.
  PhraseDictionaryNodeSCFG &PhraseDictionaryFuzzyMatch::GetOrCreateNode(PhraseDictionaryNodeSCFG &rootNode
                                                                  , const Phrase &source
                                                                  , const TargetPhrase &target
                                                                  , const Word &sourceLHS)
  {
    // NOTE(review): unconditional debug dump of every rule to stderr.
    cerr << source << endl << target << endl;

    const AlignmentInfo &alignments = target.GetAlignmentInfo();
    AlignmentInfo::const_iterator nextAlign = alignments.begin();

    PhraseDictionaryNodeSCFG *node = &rootNode;
    const size_t numWords = source.GetSize();

    for (size_t pos = 0; pos < numWords; ++pos) {
      const Word &srcWord = source.GetWord(pos);

      if (!srcWord.IsNonTerminal()) {
        // terminal: indexed by the source word only
        node = node->GetOrCreateChild(srcWord);
      } else {
        // non-terminal: indexed by source label 1st, then the aligned target label
        CHECK(nextAlign != alignments.end());
        CHECK(nextAlign->first == pos);
        const size_t targetPos = nextAlign->second;
        ++nextAlign;

        node = node->GetOrCreateChild(srcWord, target.GetWord(targetPos));
      }

      CHECK(node != NULL);
    }

    return *node;
  }

  // Call rootNode.Sort() with the table limit, unless the limit is zero, in
  // which case the trie is left untouched.
  void PhraseDictionaryFuzzyMatch::SortAndPrune(PhraseDictionaryNodeSCFG &rootNode)
  {
    if (!GetTableLimit())
    {
      return;
    }

    rootNode.Sort(GetTableLimit());
  }
  
  // Drop the per-sentence trie stored under the translation id of `source`.
  // Erasing an id that has no entry is a no-op.
  void PhraseDictionaryFuzzyMatch::CleanUp(const InputType &source)
  {
    const long transId = source.GetTranslationId();
    m_collection.erase(transId);
  }

  // Read-only access to the trie root stored for the translation id of
  // `source`. The CHECK fails if no trie has been stored for that id.
  const PhraseDictionaryNodeSCFG &PhraseDictionaryFuzzyMatch::GetRootNode(const InputType &source) const 
  {
    typedef std::map<long, PhraseDictionaryNodeSCFG> Collection;

    const long transId = source.GetTranslationId();
    const Collection::const_iterator found = m_collection.find(transId);
    CHECK(found != m_collection.end());
    return found->second;
  }
  // Mutable access to the trie root stored for the translation id of `source`.
  // Delegates the lookup (and its CHECK that an entry exists) to the const
  // overload; casting the result back is safe because *this is non-const here.
  PhraseDictionaryNodeSCFG &PhraseDictionaryFuzzyMatch::GetRootNode(const InputType &source) 
  {
    const PhraseDictionaryFuzzyMatch &self = *this;
    return const_cast<PhraseDictionaryNodeSCFG &>(self.GetRootNode(source));
  }
  
  // Expands the TO_STRING_BODY macro for this class. The macro is defined
  // outside this file; presumably it supplies the class's string-conversion
  // member in terms of the operator<< defined below -- TODO confirm.
  TO_STRING_BODY(PhraseDictionaryFuzzyMatch);
  
  // friend
  //
  // Stream-output operator for PhraseDictionaryFuzzyMatch.
  //
  // Currently a placeholder: it writes nothing and returns `out` unchanged.
  // An earlier, disabled implementation printed the source terminals and
  // non-terminals of a single root node; it no longer applies because
  // m_collection holds one trie per translation id (see GetRootNode) rather
  // than a single node. A real dump would have to iterate over that map.
  ostream& operator<<(ostream& out, const PhraseDictionaryFuzzyMatch& /*phraseDict*/)
  {
    return out;
  }
  
}
Added phrase dictionary for phi's tm extraction. Copy of SCFG phrase dictionart so far 2012-07-18 17:52:48 +04:00			`// vim:tabstop=2`

			`/***********************************************************************`
			`Moses - factored phrase-based language decoder`
			`Copyright (C) 2006 University of Edinburgh`

			`This library is free software; you can redistribute it and/or`
			`modify it under the terms of the GNU Lesser General Public`
			`License as published by the Free Software Foundation; either`
			`version 2.1 of the License, or (at your option) any later version.`

			`This library is distributed in the hope that it will be useful,`
			`but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`Lesser General Public License for more details.`

			`You should have received a copy of the GNU Lesser General Public`
			`License along with this library; if not, write to the Free Software`
			`Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA`
			`***********************************************************************/`

			`#include <fstream>`
			`#include <string>`
			`#include <iterator>`
			`#include <algorithm>`
			`#include "RuleTable/Loader.h"`
			`#include "RuleTable/LoaderFactory.h"`
rename tm-mt extraction to fuzzy match 2012-08-14 02:53:14 +04:00			`#include "PhraseDictionaryFuzzyMatch.h"`
Added phrase dictionary for phi's tm extraction. Copy of SCFG phrase dictionart so far 2012-07-18 17:52:48 +04:00			`#include "FactorCollection.h"`
			`#include "Word.h"`
			`#include "Util.h"`
			`#include "InputFileStream.h"`
			`#include "StaticData.h"`
			`#include "WordsRange.h"`
			`#include "UserMessage.h"`
get rid of tmpnam 2012-08-07 00:41:24 +04:00			`#include "util/file.hh"`
Add PhraseDictionaryTMExtract and ChartRuleLookupManagerMemoryPerSentence to deal with per per sentence grammar. eg. Adam's suffix array and Phi Koehn's TM extraction 2012-07-19 20:56:46 +04:00			`#include "CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.h"`
Added phrase dictionary for phi's tm extraction. Copy of SCFG phrase dictionart so far 2012-07-18 17:52:48 +04:00
			`using namespace std;`

			`namespace Moses`
			`{`
wrap phi's suffix array implementation and extraction method in a wrapper class. Compiles 2012-07-27 03:10:49 +04:00
rename tm-mt extraction to fuzzy match 2012-08-14 02:53:14 +04:00			`PhraseDictionaryFuzzyMatch::PhraseDictionaryFuzzyMatch(size_t numScoreComponents,`
Added phrase dictionary for phi's tm extraction. Copy of SCFG phrase dictionart so far 2012-07-18 20:59:21 +04:00			`PhraseDictionaryFeature* feature)`
Add PhraseDictionaryTMExtract and ChartRuleLookupManagerMemoryPerSentence to deal with per per sentence grammar. eg. Adam's suffix array and Phi Koehn's TM extraction 2012-07-19 20:56:46 +04:00			`: PhraseDictionary(numScoreComponents, feature)`
Added phrase dictionary for phi's tm extraction. Copy of SCFG phrase dictionart so far 2012-07-18 20:59:21 +04:00			`{`
			`const StaticData &staticData = StaticData::Instance();`
read in pt file. Compiles but and runs. Need to check scores 2012-07-24 15:13:42 +04:00			`CHECK(staticData.ThreadCount() == 1);`
Added phrase dictionary for phi's tm extraction. Copy of SCFG phrase dictionart so far 2012-07-18 20:59:21 +04:00			`}`

rename tm-mt extraction to fuzzy match 2012-08-14 02:53:14 +04:00			`bool PhraseDictionaryFuzzyMatch::Load(const std::vector<FactorType> &input`
read in pt file. Compiles but and runs. Need to check scores 2012-07-23 17:31:05 +04:00			`, const std::vector<FactorType> &output`
read in pt file. Compiles but and runs. Need to check scores 2012-07-23 19:36:39 +04:00			`, const std::string &initStr`
read in pt file. Compiles but and runs. Need to check scores 2012-07-23 17:31:05 +04:00			`, const std::vector<float> &weight`
			`, size_t tableLimit,`
			`const LMList& languageModels,`
			`const WordPenaltyProducer* wpProducer)`
			`{`
			`m_languageModels = &(languageModels);`
			`m_wpProducer = wpProducer;`
			`m_tableLimit = tableLimit;`
			`m_input = &input;`
			`m_output = &output;`

			`m_weight = &weight;`
read in pt file. Compiles but and runs. Need to check scores 2012-07-23 19:36:39 +04:00
Per sentence grammar classes compiles but doesn't run 2012-07-19 22:36:46 +04:00			`cerr << "initStr=" << initStr << endl;`
			`m_config = Tokenize(initStr, ";");`
cleanup of variables. Need to delete temporary files 2012-07-31 05:21:48 +04:00			`assert(m_config.size() == 3);`
read in pt file. Compiles but and runs. Need to check scores 2012-07-23 19:36:39 +04:00
rename tm-mt extraction to fuzzy match 2012-08-14 02:53:14 +04:00			`m_FuzzyMatchWrapper = new tmmt::FuzzyMatchWrapper(m_config[0], m_config[1], m_config[2]);`
wrap phi's suffix array implementation and extraction method in a wrapper class. Compiles 2012-07-31 00:07:19 +04:00
read in pt file. Compiles but and runs. Need to check scores 2012-07-23 19:36:39 +04:00			`return true;`
Per sentence grammar classes compiles but doesn't run 2012-07-19 22:36:46 +04:00			`}`
Added phrase dictionary for phi's tm extraction. Copy of SCFG phrase dictionart so far 2012-07-18 17:52:48 +04:00
rename tm-mt extraction to fuzzy match 2012-08-14 02:53:14 +04:00			`ChartRuleLookupManager *PhraseDictionaryFuzzyMatch::CreateRuleLookupManager(`
Added phrase dictionary for phi's tm extraction. Copy of SCFG phrase dictionart so far 2012-07-18 17:52:48 +04:00			`const InputType &sentence,`
			`const ChartCellCollection &cellCollection)`
			`{`
Add PhraseDictionaryTMExtract and ChartRuleLookupManagerMemoryPerSentence to deal with per per sentence grammar. eg. Adam's suffix array and Phi Koehn's TM extraction 2012-07-19 20:56:46 +04:00			`return new ChartRuleLookupManagerMemoryPerSentence(sentence, cellCollection, *this);`
Added phrase dictionary for phi's tm extraction. Copy of SCFG phrase dictionart so far 2012-07-18 17:52:48 +04:00			`}`
read in pt file. Compiles but doesn't run 2012-07-23 17:07:36 +04:00
rename tm-mt extraction to fuzzy match 2012-08-14 02:53:14 +04:00			`void PhraseDictionaryFuzzyMatch::InitializeForInput(InputType const& inputSentence)`
Added phrase dictionary for phi's tm extraction. Copy of SCFG phrase dictionart so far 2012-07-18 20:59:21 +04:00			`{`
get rid of tmpnam 2012-08-07 00:41:24 +04:00			`util::TempMaker tempFile("moses");`
Fix temporary file handling 2012-08-08 18:14:47 +04:00			`util::scoped_fd alive;`
			`string inFileName(tempFile.Name(alive));`
Per sentence grammar classes compiles but doesn't run 2012-07-19 22:36:46 +04:00
cleanup of variables. Need to delete temporary files 2012-07-31 05:21:48 +04:00			`ofstream inFile(inFileName.c_str());`
Per sentence grammar classes compiles but doesn't run 2012-07-19 22:36:46 +04:00
read in pt file. Compiles but and runs. Need to check scores 2012-07-23 18:10:15 +04:00			`for (size_t i = 1; i < inputSentence.GetSize() - 1; ++i)`
Per sentence grammar classes compiles but doesn't run 2012-07-19 22:36:46 +04:00			`{`
read in pt file. Compiles but and runs. Need to check scores 2012-07-23 18:10:15 +04:00			`inFile << inputSentence.GetWord(i);`
Per sentence grammar classes compiles but doesn't run 2012-07-19 22:36:46 +04:00			`}`
			`inFile << endl;`
			`inFile.close();`
get rid of tmpnam 2012-08-07 00:41:24 +04:00
rename tm-mt extraction to fuzzy match 2012-08-14 02:53:14 +04:00			`string ptFileName = m_FuzzyMatchWrapper->Extract(inFileName);`
wrap phi's suffix array implementation and extraction method in a wrapper class. Compiles 2012-07-31 00:07:19 +04:00
read in pt file. Doing ff, doesn't yet compile 2012-07-23 15:26:15 +04:00			`// populate with rules for this sentence`
read in pt file. Compiles but doesn't run 2012-07-23 17:07:36 +04:00			`long translationId = inputSentence.GetTranslationId();`

			`PhraseDictionaryNodeSCFG &rootNode = m_collection[translationId];`
			`FormatType format = MosesFormat;`
wrap phi's suffix array implementation and extraction method in a wrapper class. Compiles. It works 2012-07-31 02:49:13 +04:00
read in pt file. Doing ff, doesn't yet compile 2012-07-23 15:26:15 +04:00			`// data from file`
cleanup of variables. Need to delete temporary files 2012-07-31 05:21:48 +04:00			`InputFileStream inStream(ptFileName);`
read in pt file. Doing ff, doesn't yet compile 2012-07-23 15:26:15 +04:00
			`// copied from class LoaderStandard`
better indication of what phrase table implementation is being loaded 2012-08-09 15:47:48 +04:00			`PrintUserTime("Start loading fuzzy-match phrase model");`
read in pt file. Doing ff, doesn't yet compile 2012-07-23 15:26:15 +04:00
			`const StaticData &staticData = StaticData::Instance();`
			`const std::string& factorDelimiter = staticData.GetFactorDelimiter();`


			`string lineOrig;`
			`size_t count = 0;`

			`while(getline(inStream, lineOrig)) {`
			`const string *line;`
			`if (format == HieroFormat) { // reformat line`
			`assert(false);`
			`//line = ReformatHieroRule(lineOrig);`
			`}`
			`else`
			`{ // do nothing to format of line`
			`line = &lineOrig;`
			`}`

			`vector<string> tokens;`
			`vector<float> scoreVector;`

			`TokenizeMultiCharSeparator(tokens, *line , "\|\|\|" );`

			`if (tokens.size() != 4 && tokens.size() != 5) {`
			`stringstream strme;`
cleanup of variables. Need to delete temporary files 2012-07-31 05:21:48 +04:00			`strme << "Syntax error at " << ptFileName << ":" << count;`
read in pt file. Doing ff, doesn't yet compile 2012-07-23 15:26:15 +04:00			`UserMessage::Add(strme.str());`
			`abort();`
			`}`

			`const string &sourcePhraseString = tokens[0]`
			`, &targetPhraseString = tokens[1]`
			`, &scoreString = tokens[2]`
			`, &alignString = tokens[3];`

			`bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);`
			`if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {`
cleanup of variables. Need to delete temporary files 2012-07-31 05:21:48 +04:00			`TRACE_ERR( ptFileName << ":" << count << ": pt entry contains empty target, skipping\n");`
read in pt file. Doing ff, doesn't yet compile 2012-07-23 15:26:15 +04:00			`continue;`
			`}`

			`Tokenize<float>(scoreVector, scoreString);`
			`const size_t numScoreComponents = GetFeature()->GetNumScoreComponents();`
			`if (scoreVector.size() != numScoreComponents) {`
			`stringstream strme;`
			`strme << "Size of scoreVector != number (" << scoreVector.size() << "!="`
			`<< numScoreComponents << ") of score components on line " << count;`
			`UserMessage::Add(strme.str());`
			`abort();`
			`}`
			`CHECK(scoreVector.size() == numScoreComponents);`

			`// parse source & find pt node`

			`// constituent labels`
			`Word sourceLHS, targetLHS;`

			`// source`
			`Phrase sourcePhrase( 0);`
			`sourcePhrase.CreateFromStringNewFormat(Input, *m_input, sourcePhraseString, factorDelimiter, sourceLHS);`

			`// create target phrase obj`
			`TargetPhrase *targetPhrase = new TargetPhrase(Output);`
			`targetPhrase->CreateFromStringNewFormat(Output, *m_output, targetPhraseString, factorDelimiter, targetLHS);`

			`// rest of target phrase`
			`targetPhrase->SetAlignmentInfo(alignString);`
			`targetPhrase->SetTargetLHS(targetLHS);`
			`//targetPhrase->SetDebugOutput(string("New Format pt ") + line);`

			`// component score, for n-best output`
			`std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),TransformScore);`
			`std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore);`

read in pt file. Compiles but doesn't run 2012-07-23 17:07:36 +04:00			`targetPhrase->SetScoreChart(GetFeature(), scoreVector, m_weight, m_languageModels, m_wpProducer);`
read in pt file. Doing ff, doesn't yet compile 2012-07-23 15:26:15 +04:00
read in pt file. Compiles but doesn't run 2012-07-23 17:07:36 +04:00			`TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(rootNode, sourcePhrase, *targetPhrase, sourceLHS);`
read in pt file. Doing ff, doesn't yet compile 2012-07-23 15:26:15 +04:00			`phraseColl.Add(targetPhrase);`

			`count++;`

			`if (format == HieroFormat) { // reformat line`
			`delete line;`
			`}`
			`else`
			`{ // do nothing`
			`}`

			`}`

			`// sort and prune each target phrase collection`
read in pt file. Compiles but doesn't run 2012-07-23 17:07:36 +04:00			`SortAndPrune(rootNode);`
read in pt file. Compiles but and runs. Need to check scores 2012-07-23 18:10:15 +04:00
cleanup of variables. Need to delete temporary files 2012-07-31 05:21:48 +04:00			`remove(ptFileName.c_str());`
			`remove(inFileName.c_str());`
read in pt file. Compiles but doesn't run 2012-07-23 17:07:36 +04:00			`}`

rename tm-mt extraction to fuzzy match 2012-08-14 02:53:14 +04:00			`TargetPhraseCollection &PhraseDictionaryFuzzyMatch::GetOrCreateTargetPhraseCollection(PhraseDictionaryNodeSCFG &rootNode`
read in pt file. Compiles but doesn't run 2012-07-23 17:07:36 +04:00			`, const Phrase &source`
			`, const TargetPhrase &target`
			`, const Word &sourceLHS)`
			`{`
			`PhraseDictionaryNodeSCFG &currNode = GetOrCreateNode(rootNode, source, target, sourceLHS);`
			`return currNode.GetOrCreateTargetPhraseCollection();`
			`}`

rename tm-mt extraction to fuzzy match 2012-08-14 02:53:14 +04:00			`PhraseDictionaryNodeSCFG &PhraseDictionaryFuzzyMatch::GetOrCreateNode(PhraseDictionaryNodeSCFG &rootNode`
read in pt file. Compiles but doesn't run 2012-07-23 17:07:36 +04:00			`, const Phrase &source`
			`, const TargetPhrase &target`
			`, const Word &sourceLHS)`
			`{`
read in pt file. Compiles but doesn't run 2012-07-23 17:18:49 +04:00			`cerr << source << endl << target << endl;`
read in pt file. Compiles but doesn't run 2012-07-23 17:07:36 +04:00			`const size_t size = source.GetSize();`

			`const AlignmentInfo &alignmentInfo = target.GetAlignmentInfo();`
			`AlignmentInfo::const_iterator iterAlign = alignmentInfo.begin();`

			`PhraseDictionaryNodeSCFG *currNode = &rootNode;`
			`for (size_t pos = 0 ; pos < size ; ++pos) {`
			`const Word& word = source.GetWord(pos);`

			`if (word.IsNonTerminal()) {`
			`// indexed by source label 1st`
			`const Word &sourceNonTerm = word;`

			`CHECK(iterAlign != target.GetAlignmentInfo().end());`
			`CHECK(iterAlign->first == pos);`
			`size_t targetNonTermInd = iterAlign->second;`
			`++iterAlign;`
			`const Word &targetNonTerm = target.GetWord(targetNonTermInd);`

			`currNode = currNode->GetOrCreateChild(sourceNonTerm, targetNonTerm);`
			`} else {`
			`currNode = currNode->GetOrCreateChild(word);`
			`}`

			`CHECK(currNode != NULL);`
			`}`
Added phrase dictionary for phi's tm extraction. Copy of SCFG phrase dictionart so far 2012-07-18 20:59:21 +04:00
read in pt file. Compiles but doesn't run 2012-07-23 17:07:36 +04:00			`// finally, the source LHS`
			`//currNode = currNode->GetOrCreateChild(sourceLHS);`
			`//CHECK(currNode != NULL);`


			`return *currNode;`
			`}`

rename tm-mt extraction to fuzzy match 2012-08-14 02:53:14 +04:00			`void PhraseDictionaryFuzzyMatch::SortAndPrune(PhraseDictionaryNodeSCFG &rootNode)`
read in pt file. Compiles but doesn't run 2012-07-23 17:07:36 +04:00			`{`
			`if (GetTableLimit())`
			`{`
			`rootNode.Sort(GetTableLimit());`
			`}`
Added phrase dictionary for phi's tm extraction. Copy of SCFG phrase dictionart so far 2012-07-18 20:59:21 +04:00			`}`

rename tm-mt extraction to fuzzy match 2012-08-14 02:53:14 +04:00			`void PhraseDictionaryFuzzyMatch::CleanUp(const InputType &source)`
Added phrase dictionary for phi's tm extraction. Copy of SCFG phrase dictionart so far 2012-07-18 20:59:21 +04:00			`{`
Per sentence grammar classes compiles but doesn't run 2012-07-19 22:36:46 +04:00			`m_collection.erase(source.GetTranslationId());`
Per sentence grammar classes compiles but doesn't run 2012-07-19 21:59:50 +04:00			`}`

rename tm-mt extraction to fuzzy match 2012-08-14 02:53:14 +04:00			`const PhraseDictionaryNodeSCFG &PhraseDictionaryFuzzyMatch::GetRootNode(const InputType &source) const`
Per sentence grammar classes compiles but doesn't run 2012-07-19 21:59:50 +04:00			`{`
			`long transId = source.GetTranslationId();`
			`std::map<long, PhraseDictionaryNodeSCFG>::const_iterator iter = m_collection.find(transId);`
			`CHECK(iter != m_collection.end());`
			`return iter->second;`
			`}`
rename tm-mt extraction to fuzzy match 2012-08-14 02:53:14 +04:00			`PhraseDictionaryNodeSCFG &PhraseDictionaryFuzzyMatch::GetRootNode(const InputType &source)`
Per sentence grammar classes compiles but doesn't run 2012-07-19 21:59:50 +04:00			`{`
			`long transId = source.GetTranslationId();`
			`std::map<long, PhraseDictionaryNodeSCFG>::iterator iter = m_collection.find(transId);`
			`CHECK(iter != m_collection.end());`
			`return iter->second;`
Added phrase dictionary for phi's tm extraction. Copy of SCFG phrase dictionart so far 2012-07-18 20:59:21 +04:00			`}`
wrap phi's suffix array implementation and extraction method in a wrapper class. Compiles 2012-07-27 03:10:49 +04:00
rename tm-mt extraction to fuzzy match 2012-08-14 02:53:14 +04:00			`TO_STRING_BODY(PhraseDictionaryFuzzyMatch);`
Added phrase dictionary for phi's tm extraction. Copy of SCFG phrase dictionart so far 2012-07-18 17:52:48 +04:00
			`// friend`
rename tm-mt extraction to fuzzy match 2012-08-14 02:53:14 +04:00			`ostream& operator<<(ostream& out, const PhraseDictionaryFuzzyMatch& phraseDict)`
Added phrase dictionary for phi's tm extraction. Copy of SCFG phrase dictionart so far 2012-07-18 17:52:48 +04:00			`{`
			`typedef PhraseDictionaryNodeSCFG::TerminalMap TermMap;`
			`typedef PhraseDictionaryNodeSCFG::NonTerminalMap NonTermMap;`

Per sentence grammar classes compiles but doesn't run 2012-07-19 21:59:50 +04:00			`/*`
Added phrase dictionary for phi's tm extraction. Copy of SCFG phrase dictionart so far 2012-07-18 17:52:48 +04:00			`const PhraseDictionaryNodeSCFG &coll = phraseDict.m_collection;`
			`for (NonTermMap::const_iterator p = coll.m_nonTermMap.begin(); p != coll.m_nonTermMap.end(); ++p) {`
			`const Word &sourceNonTerm = p->first.first;`
			`out << sourceNonTerm;`
			`}`
			`for (TermMap::const_iterator p = coll.m_sourceTermMap.begin(); p != coll.m_sourceTermMap.end(); ++p) {`
			`const Word &sourceTerm = p->first;`
			`out << sourceTerm;`
			`}`
Per sentence grammar classes compiles but doesn't run 2012-07-19 21:59:50 +04:00			`*/`

Added phrase dictionary for phi's tm extraction. Copy of SCFG phrase dictionart so far 2012-07-18 17:52:48 +04:00			`return out;`
			`}`

			`}`