mosesdecoder/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp

// vim:tabstop=2

/***********************************************************************
 Moses - factored phrase-based language decoder
 Copyright (C) 2006 University of Edinburgh
 
 This library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Lesser General Public
 License as published by the Free Software Foundation; either
 version 2.1 of the License, or (at your option) any later version.
 
 This library is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 Lesser General Public License for more details.
 
 You should have received a copy of the GNU Lesser General Public
 License along with this library; if not, write to the Free Software
 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 ***********************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include <sys/types.h>
#include <unistd.h>
#include <dirent.h>

#include <fstream>
#include <string>
#include <iterator>
#include <algorithm>
#include "Loader.h"
#include "LoaderFactory.h"
#include "PhraseDictionaryFuzzyMatch.h"
#include "moses/FactorCollection.h"
#include "moses/Word.h"
#include "moses/Util.h"
#include "moses/InputFileStream.h"
#include "moses/StaticData.h"
#include "moses/WordsRange.h"
#include "moses/UserMessage.h"
#include "util/file.hh"
#include "moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.h"

using namespace std;

namespace Moses
{

  // Constructs the dictionary shell: both arguments are handed straight to
  // the PhraseDictionary base class and nothing else is set up here.
  PhraseDictionaryFuzzyMatch::PhraseDictionaryFuzzyMatch(
      size_t numScoreComponents,
      PhraseDictionaryFeature* feature)
    : PhraseDictionary(numScoreComponents, feature)
  {
    // Intentionally empty. A former check that StaticData reported exactly
    // one thread (ThreadCount() == 1) used to live here and is disabled.
  }

  bool PhraseDictionaryFuzzyMatch::Load(const std::vector<FactorType> &input
            , const std::vector<FactorType> &output
            , const std::string &initStr
            , const std::vector<float> &weight
            , size_t tableLimit,
            const LMList& languageModels,
            const WordPenaltyProducer* wpProducer)
  {
    m_languageModels = &(languageModels);
    m_wpProducer = wpProducer;
    m_tableLimit = tableLimit;
    m_input		= &input;
    m_output	= &output;
    
    m_weight = new vector<float>(weight);
   
    cerr << "initStr=" << initStr << endl;
    m_config = Tokenize(initStr, ";");
    assert(m_config.size() == 3);

    m_FuzzyMatchWrapper = new tmmt::FuzzyMatchWrapper(m_config[0], m_config[1], m_config[2]);
    
    return true;
  }
    
  // Allocates a per-sentence rule lookup manager bound to this dictionary.
  // The manager is created with new and returned as a raw pointer; nothing
  // in this function releases it.
  ChartRuleLookupManager *PhraseDictionaryFuzzyMatch::CreateRuleLookupManager(
      const InputType &sentence,
      const ChartCellCollectionBase &cellCollection)
  {
    ChartRuleLookupManager *manager =
      new ChartRuleLookupManagerMemoryPerSentence(sentence, cellCollection, *this);
    return manager;
  }
    
  
  /**
   * Deletes the directory dirname together with everything below it.
   *
   * Entries reported as directories (d_type == DT_DIR) are descended into;
   * every other entry is deleted with remove(). Deletion continues after a
   * failure so that as much as possible is cleaned up.
   *
   * NOTE: the directory test relies on dirent::d_type. An entry whose type is
   * not reported as DT_DIR is only passed to remove(), which cannot delete a
   * non-empty directory; that case is reported through the return value.
   *
   * \param dirname path of the directory to delete
   * \return 1 if the whole tree was removed; 0 if the directory could not be
   *         opened, a path did not fit into PATH_MAX, or any entry (or the
   *         directory itself) could not be deleted
   */
  int removedirectoryrecursively(const char *dirname)
  {
    DIR *dir = opendir(dirname);
    if (dir == NULL) {
      perror("Error opendir()");
      return 0;
    }

    int ok = 1;
    char path[PATH_MAX];
    struct dirent *entry;

    while ((entry = readdir(dir)) != NULL) {
      if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0) {
        continue;
      }

      // Never act on a truncated path: it could name a different entry.
      const int len = snprintf(path, sizeof(path), "%s/%s", dirname, entry->d_name);
      if (len < 0 || static_cast<size_t>(len) >= sizeof(path)) {
        fprintf(stderr, "Path too long, not deleting: %s/%s\n", dirname, entry->d_name);
        ok = 0;
        continue;
      }

      if (entry->d_type == DT_DIR) {
        // The recursive call also removes the (then empty) sub-directory.
        if (!removedirectoryrecursively(path)) {
          ok = 0;
        }
      } else if (remove(path) != 0) {
        ok = 0;
      }
    }

    if (closedir(dir) != 0) {
      ok = 0;
    }

    // The directory should be empty now; rmdir() fails if anything is left.
    if (rmdir(dirname) != 0) {
      ok = 0;
    }

    return ok;
  }

  /**
   * Builds the per-sentence rule table for inputSentence.
   *
   *  1. Creates a private temporary directory (/tmp/moses.XXXXXX) and writes
   *     the sentence, without its first and last word (presumably the
   *     sentence-boundary markers -- confirm against the input type), to
   *     "<dir>/in".
   *  2. Asks the fuzzy-match wrapper to extract a phrase-table file for this
   *     translation id and directory.
   *  3. Parses that file, one rule per line in the form
   *       source ||| target ||| scores ||| alignment [||| ignored]
   *     into the rule trie kept in m_collection under the translation id.
   *  4. Sorts and prunes the trie.
   *
   * A line with the wrong number of fields or scores registers a user message
   * and aborts the process. A line with an empty source side is skipped
   * unless word deletion is enabled.
   */
  void PhraseDictionaryFuzzyMatch::InitializeForInput(InputType const& inputSentence)
  {
    char dirName[] = "/tmp/moses.XXXXXX";
    char *temp = mkdtemp(dirName);
    CHECK(temp);
    string dirNameStr(dirName);
    
    string inFileName(dirNameStr + "/in");
    
    ofstream inFile(inFileName.c_str());
    // Fail loudly instead of silently handing an empty input to the extractor.
    CHECK(inFile.is_open());
    
    // "i + 1 < size" rather than "i < size - 1": the latter underflows on an
    // empty input because the size is unsigned.
    const size_t inputSize = inputSentence.GetSize();
    for (size_t i = 1; i + 1 < inputSize; ++i)
    {
      inFile << inputSentence.GetWord(i);
    }
    inFile << endl;
    inFile.close();
        
    long translationId = inputSentence.GetTranslationId();
    string ptFileName = m_FuzzyMatchWrapper->Extract(translationId, dirNameStr);

    // populate with rules for this sentence
    // NOTE(review): m_collection is modified here without any locking visible
    // in this file -- confirm this is safe if sentences are decoded in
    // parallel threads.
    PhraseDictionaryNodeSCFG &rootNode = m_collection[translationId];
        
    // data from file
    InputFileStream inStream(ptFileName);
    
    // copied from class LoaderStandard
    PrintUserTime("Start loading fuzzy-match phrase model");
    
    const StaticData &staticData = StaticData::Instance();
    const std::string& factorDelimiter = staticData.GetFactorDelimiter();
    
    string line;
    size_t count = 0;  // 0-based index of the line being processed
    
    while(getline(inStream, line)) {
      vector<string> tokens;
      vector<float> scoreVector;
      
      TokenizeMultiCharSeparator(tokens, line , "|||" );
      
      if (tokens.size() != 4 && tokens.size() != 5) {
        stringstream strme;
        strme << "Syntax error at " << ptFileName << ":" << count;
        UserMessage::Add(strme.str());
        abort();
      }
      
      const string &sourcePhraseString = tokens[0];
      const string &targetPhraseString = tokens[1];
      const string &scoreString        = tokens[2];
      const string &alignString        = tokens[3];
      
      bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
      if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
        TRACE_ERR( ptFileName << ":" << count << ": pt entry contains empty target, skipping\n");
        // Keep the line counter in step with the file even for skipped lines,
        // otherwise later diagnostics point at the wrong line.
        ++count;
        continue;
      }
      
      Tokenize<float>(scoreVector, scoreString);
      const size_t numScoreComponents = GetFeature()->GetNumScoreComponents();
      if (scoreVector.size() != numScoreComponents) {
        stringstream strme;
        strme << "Size of scoreVector != number (" << scoreVector.size() << "!="
        << numScoreComponents << ") of score components on line " << count;
        UserMessage::Add(strme.str());
        abort();
      }
      
      // parse source & find pt node
      
      // constituent labels
      Word sourceLHS, targetLHS;
      
      // source
      Phrase sourcePhrase( 0);
      sourcePhrase.CreateFromStringNewFormat(Input, *m_input, sourcePhraseString, factorDelimiter, sourceLHS);
      
      // create target phrase obj
      TargetPhrase *targetPhrase = new TargetPhrase();
      targetPhrase->CreateFromStringNewFormat(Output, *m_output, targetPhraseString, factorDelimiter, targetLHS);
      
      // rest of target phrase
      targetPhrase->SetAlignmentInfo(alignString);
      targetPhrase->SetTargetLHS(targetLHS);
      
      // component score, for n-best output
      std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),TransformScore);
      std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore);
      
      targetPhrase->SetScoreChart(GetFeature(), scoreVector, *m_weight, *m_languageModels, m_wpProducer);
      
      // The raw pointer is handed to the collection via Add().
      TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(rootNode, sourcePhrase, *targetPhrase, sourceLHS);
      phraseColl.Add(targetPhrase);
      
      count++;
    }
    
    // sort and prune each target phrase collection
    SortAndPrune(rootNode);
   
    // NOTE(review): the temporary directory created above is left on disk
    // because the cleanup call below is disabled -- confirm whether that is
    // intentional (e.g. for debugging) or should be re-enabled.
    //removedirectoryrecursively(dirName);
  }
  
  // Finds (creating it if necessary) the trie node for the given rule via
  // GetOrCreateNode() and returns that node's target phrase collection.
  TargetPhraseCollection &PhraseDictionaryFuzzyMatch::GetOrCreateTargetPhraseCollection(
      PhraseDictionaryNodeSCFG &rootNode,
      const Phrase &source,
      const TargetPhrase &target,
      const Word &sourceLHS)
  {
    return GetOrCreateNode(rootNode, source, target, sourceLHS).GetOrCreateTargetPhraseCollection();
  }

  // Walks the rule trie from rootNode along the words of source, creating
  // missing children on the way, and returns the node reached after the last
  // source word.
  //
  // Terminals are keyed by the source word alone. Non-terminals are keyed by
  // the (source, target) non-terminal pair; the target word is looked up
  // through target's non-terminal alignment, whose entries are consumed in
  // order and must match the source positions of the non-terminals.
  // sourceLHS is not used: the step that keyed on it is disabled.
  PhraseDictionaryNodeSCFG &PhraseDictionaryFuzzyMatch::GetOrCreateNode(
      PhraseDictionaryNodeSCFG &rootNode,
      const Phrase &source,
      const TargetPhrase &target,
      const Word &sourceLHS)
  {
    cerr << source << endl << target << endl;

    const AlignmentInfo &nonTermAlign = target.GetAlignNonTerm();
    AlignmentInfo::const_iterator alignIt = nonTermAlign.begin();

    PhraseDictionaryNodeSCFG *node = &rootNode;
    const size_t numWords = source.GetSize();

    for (size_t i = 0; i < numWords; ++i) {
      const Word &srcWord = source.GetWord(i);

      if (!srcWord.IsNonTerminal()) {
        // terminal: keyed by the source word only
        node = node->GetOrCreateChild(srcWord);
        CHECK(node != NULL);
        continue;
      }

      // non-terminal: the next alignment entry must belong to this position
      CHECK(alignIt != nonTermAlign.end());
      CHECK(alignIt->first == i);
      const Word &tgtNonTerm = target.GetWord(alignIt->second);
      ++alignIt;

      node = node->GetOrCreateChild(srcWord, tgtNonTerm);
      CHECK(node != NULL);
    }

    return *node;
  }

  // Calls Sort() on rootNode with the table limit as its argument.
  // A table limit of zero leaves the trie untouched.
  void PhraseDictionaryFuzzyMatch::SortAndPrune(PhraseDictionaryNodeSCFG &rootNode)
  {
    if (!GetTableLimit()) {
      return;
    }
    rootNode.Sort(GetTableLimit());
  }
  
  // Drops the rule trie stored in m_collection for this sentence's
  // translation id (no effect if there is none).
  void PhraseDictionaryFuzzyMatch::CleanUp(const InputType &source)
  {
    const long transId = source.GetTranslationId();
    m_collection.erase(transId);
  }

  // Read-only access to the rule trie stored for this sentence's translation
  // id. The CHECK fails if no trie has been stored under that id.
  const PhraseDictionaryNodeSCFG &PhraseDictionaryFuzzyMatch::GetRootNode(const InputType &source) const
  {
    typedef std::map<long, PhraseDictionaryNodeSCFG> Collection;

    const Collection::const_iterator entry = m_collection.find(source.GetTranslationId());
    CHECK(entry != m_collection.end());
    return entry->second;
  }
  // Mutable access to the rule trie stored for this sentence's translation
  // id. The CHECK fails if no trie has been stored under that id.
  PhraseDictionaryNodeSCFG &PhraseDictionaryFuzzyMatch::GetRootNode(const InputType &source)
  {
    typedef std::map<long, PhraseDictionaryNodeSCFG> Collection;

    const Collection::iterator entry = m_collection.find(source.GetTranslationId());
    CHECK(entry != m_collection.end());
    return entry->second;
  }
  
  // Expands the project's TO_STRING_BODY macro for this class. The macro is
  // defined outside this file; presumably it supplies the class's string
  // conversion in terms of the operator<< below -- verify against the macro
  // definition.
  TO_STRING_BODY(PhraseDictionaryFuzzyMatch);
  
  // friend
  // Stream-output operator for the dictionary. It currently writes nothing:
  // the stream is returned unchanged and phraseDict is not inspected.
  ostream& operator<<(ostream& out, const PhraseDictionaryFuzzyMatch& phraseDict)
  {
    return out;
  }
  
}
Added phrase dictionary for phi's tm extraction. Copy of SCFG phrase dictionart so far 2012-07-18 17:52:48 +04:00			`// vim:tabstop=2`

			`/***********************************************************************`
			`Moses - factored phrase-based language decoder`
			`Copyright (C) 2006 University of Edinburgh`

			`This library is free software; you can redistribute it and/or`
			`modify it under the terms of the GNU Lesser General Public`
			`License as published by the Free Software Foundation; either`
			`version 2.1 of the License, or (at your option) any later version.`

			`This library is distributed in the hope that it will be useful,`
			`but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`Lesser General Public License for more details.`

			`You should have received a copy of the GNU Lesser General Public`
			`License along with this library; if not, write to the Free Software`
			`Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA`
			`***********************************************************************/`

put all temp files into temp directory and delete at the end 2012-10-08 23:52:27 +04:00			`#include <stdio.h>`
			`#include <stdlib.h>`
			`#include <string.h>`
			`#include <limits.h>`
			`#include <sys/types.h>`
			`#include <unistd.h>`
			`#include <dirent.h>`

Added phrase dictionary for phi's tm extraction. Copy of SCFG phrase dictionart so far 2012-07-18 17:52:48 +04:00			`#include <fstream>`
			`#include <string>`
			`#include <iterator>`
			`#include <algorithm>`
move moses/src/* to moses/ 2012-11-12 23:56:18 +04:00			`#include "Loader.h"`
			`#include "LoaderFactory.h"`
rename tm-mt extraction to fuzzy match 2012-08-14 02:53:14 +04:00			`#include "PhraseDictionaryFuzzyMatch.h"`
move moses/src/* to moses/ 2012-11-12 23:56:18 +04:00			`#include "moses/FactorCollection.h"`
			`#include "moses/Word.h"`
			`#include "moses/Util.h"`
			`#include "moses/InputFileStream.h"`
			`#include "moses/StaticData.h"`
			`#include "moses/WordsRange.h"`
			`#include "moses/UserMessage.h"`
get rid of tmpnam 2012-08-07 00:41:24 +04:00			`#include "util/file.hh"`
move CKY+Parser to TranslationModel/ 2012-11-27 21:23:31 +04:00			`#include "moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.h"`
Added phrase dictionary for phi's tm extraction. Copy of SCFG phrase dictionart so far 2012-07-18 17:52:48 +04:00
			`using namespace std;`

			`namespace Moses`
			`{`
wrap phi's suffix array implementation and extraction method in a wrapper class. Compiles 2012-07-27 03:10:49 +04:00
rename tm-mt extraction to fuzzy match 2012-08-14 02:53:14 +04:00			`PhraseDictionaryFuzzyMatch::PhraseDictionaryFuzzyMatch(size_t numScoreComponents,`
Added phrase dictionary for phi's tm extraction. Copy of SCFG phrase dictionart so far 2012-07-18 20:59:21 +04:00			`PhraseDictionaryFeature* feature)`
Add PhraseDictionaryTMExtract and ChartRuleLookupManagerMemoryPerSentence to deal with per per sentence grammar. eg. Adam's suffix array and Phi Koehn's TM extraction 2012-07-19 20:56:46 +04:00			`: PhraseDictionary(numScoreComponents, feature)`
Added phrase dictionary for phi's tm extraction. Copy of SCFG phrase dictionart so far 2012-07-18 20:59:21 +04:00			`{`
move phrase tables and generation tables to staticData 2012-12-21 18:54:43 +04:00			`//const StaticData &staticData = StaticData::Instance();`
enable threading 2012-10-09 16:48:48 +04:00			`//CHECK(staticData.ThreadCount() == 1);`
Added phrase dictionary for phi's tm extraction. Copy of SCFG phrase dictionart so far 2012-07-18 20:59:21 +04:00			`}`

rename tm-mt extraction to fuzzy match 2012-08-14 02:53:14 +04:00			`bool PhraseDictionaryFuzzyMatch::Load(const std::vector<FactorType> &input`
read in pt file. Compiles but and runs. Need to check scores 2012-07-23 17:31:05 +04:00			`, const std::vector<FactorType> &output`
read in pt file. Compiles but and runs. Need to check scores 2012-07-23 19:36:39 +04:00			`, const std::string &initStr`
read in pt file. Compiles but and runs. Need to check scores 2012-07-23 17:31:05 +04:00			`, const std::vector<float> &weight`
			`, size_t tableLimit,`
			`const LMList& languageModels,`
			`const WordPenaltyProducer* wpProducer)`
			`{`
			`m_languageModels = &(languageModels);`
			`m_wpProducer = wpProducer;`
			`m_tableLimit = tableLimit;`
			`m_input = &input;`
			`m_output = &output;`

new weight vector in fuzzy match rule table 2012-10-18 19:19:47 +04:00			`m_weight = new vector<float>(weight);`
read in pt file. Compiles but and runs. Need to check scores 2012-07-23 19:36:39 +04:00
Per sentence grammar classes compiles but doesn't run 2012-07-19 22:36:46 +04:00			`cerr << "initStr=" << initStr << endl;`
			`m_config = Tokenize(initStr, ";");`
cleanup of variables. Need to delete temporary files 2012-07-31 05:21:48 +04:00			`assert(m_config.size() == 3);`
read in pt file. Compiles but and runs. Need to check scores 2012-07-23 19:36:39 +04:00
rename tm-mt extraction to fuzzy match 2012-08-14 02:53:14 +04:00			`m_FuzzyMatchWrapper = new tmmt::FuzzyMatchWrapper(m_config[0], m_config[1], m_config[2]);`
wrap phi's suffix array implementation and extraction method in a wrapper class. Compiles 2012-07-31 00:07:19 +04:00
read in pt file. Compiles but and runs. Need to check scores 2012-07-23 19:36:39 +04:00			`return true;`
Per sentence grammar classes compiles but doesn't run 2012-07-19 22:36:46 +04:00			`}`
Added phrase dictionary for phi's tm extraction. Copy of SCFG phrase dictionart so far 2012-07-18 17:52:48 +04:00
rename tm-mt extraction to fuzzy match 2012-08-14 02:53:14 +04:00			`ChartRuleLookupManager *PhraseDictionaryFuzzyMatch::CreateRuleLookupManager(`
Added phrase dictionary for phi's tm extraction. Copy of SCFG phrase dictionart so far 2012-07-18 17:52:48 +04:00			`const InputType &sentence,`
Rule lookup only needs ChartCellCollectionBase 2012-10-11 17:27:30 +04:00			`const ChartCellCollectionBase &cellCollection)`
Added phrase dictionary for phi's tm extraction. Copy of SCFG phrase dictionart so far 2012-07-18 17:52:48 +04:00			`{`
Add PhraseDictionaryTMExtract and ChartRuleLookupManagerMemoryPerSentence to deal with per per sentence grammar. eg. Adam's suffix array and Phi Koehn's TM extraction 2012-07-19 20:56:46 +04:00			`return new ChartRuleLookupManagerMemoryPerSentence(sentence, cellCollection, *this);`
Added phrase dictionary for phi's tm extraction. Copy of SCFG phrase dictionart so far 2012-07-18 17:52:48 +04:00			`}`
read in pt file. Compiles but doesn't run 2012-07-23 17:07:36 +04:00
put all temp files into temp directory and delete at the end 2012-10-08 23:52:27 +04:00
			`int removedirectoryrecursively(const char *dirname)`
			`{`
			`DIR *dir;`
			`struct dirent *entry;`
			`char path[PATH_MAX];`

			`if (path == NULL) {`
			`fprintf(stderr, "Out of memory error\n");`
			`return 0;`
			`}`
			`dir = opendir(dirname);`
			`if (dir == NULL) {`
			`perror("Error opendir()");`
			`return 0;`
			`}`

			`while ((entry = readdir(dir)) != NULL) {`
			`if (strcmp(entry->d_name, ".") && strcmp(entry->d_name, "..")) {`
			`snprintf(path, (size_t) PATH_MAX, "%s/%s", dirname, entry->d_name);`
			`if (entry->d_type == DT_DIR) {`
			`removedirectoryrecursively(path);`
			`}`

			`remove(path);`
			`/*`
			`* Here, the actual deletion must be done. Beacuse this is`
			`* quite a dangerous thing to do, and this program is not very`
			`* well tested, we are just printing as if we are deleting.`
			`*/`
			`//printf("(not really) Deleting: %s\n", path);`
			`/*`
			`* When you are finished testing this and feel you are ready to do the real`
			`* deleting, use this: removeSTUB(path);`
			`* (see "man 3 remove")`
			`* Please note that I DONT TAKE RESPONSIBILITY for data you delete with this!`
			`*/`
			`}`

			`}`
			`closedir(dir);`

			`rmdir(dirname);`
			`/*`
			`* Now the directory is emtpy, finally delete the directory itself. (Just`
			`* printing here, see above)`
			`*/`
			`//printf("(not really) Deleting: %s\n", dirname);`

			`return 1;`
			`}`

rename tm-mt extraction to fuzzy match 2012-08-14 02:53:14 +04:00			`void PhraseDictionaryFuzzyMatch::InitializeForInput(InputType const& inputSentence)`
Added phrase dictionary for phi's tm extraction. Copy of SCFG phrase dictionart so far 2012-07-18 20:59:21 +04:00			`{`
mkdtemp template must be have 6 'X' in Linux 2012-10-09 15:06:14 +04:00			`char dirName[] = "/tmp/moses.XXXXXX";`
put all temp files into temp directory and delete at the end 2012-10-08 23:52:27 +04:00			`char *temp = mkdtemp(dirName);`
			`CHECK(temp);`
			`string dirNameStr(dirName);`

			`string inFileName(dirNameStr + "/in");`
Per sentence grammar classes compiles but doesn't run 2012-07-19 22:36:46 +04:00
cleanup of variables. Need to delete temporary files 2012-07-31 05:21:48 +04:00			`ofstream inFile(inFileName.c_str());`
Per sentence grammar classes compiles but doesn't run 2012-07-19 22:36:46 +04:00
read in pt file. Compiles but and runs. Need to check scores 2012-07-23 18:10:15 +04:00			`for (size_t i = 1; i < inputSentence.GetSize() - 1; ++i)`
Per sentence grammar classes compiles but doesn't run 2012-07-19 22:36:46 +04:00			`{`
read in pt file. Compiles but and runs. Need to check scores 2012-07-23 18:10:15 +04:00			`inFile << inputSentence.GetWord(i);`
Per sentence grammar classes compiles but doesn't run 2012-07-19 22:36:46 +04:00			`}`
			`inFile << endl;`
			`inFile.close();`
get rid of tmpnam 2012-08-07 00:41:24 +04:00
read in pt file. Compiles but doesn't run 2012-07-23 17:07:36 +04:00			`long translationId = inputSentence.GetTranslationId();`
multithreaded fuzzy match 2012-11-27 14:42:28 +04:00			`string ptFileName = m_FuzzyMatchWrapper->Extract(translationId, dirNameStr);`
read in pt file. Compiles but doesn't run 2012-07-23 17:07:36 +04:00
multithreaded fuzzy match 2012-11-27 14:42:28 +04:00			`// populate with rules for this sentence`
read in pt file. Compiles but doesn't run 2012-07-23 17:07:36 +04:00			`PhraseDictionaryNodeSCFG &rootNode = m_collection[translationId];`
			`FormatType format = MosesFormat;`
wrap phi's suffix array implementation and extraction method in a wrapper class. Compiles. It works 2012-07-31 02:49:13 +04:00
read in pt file. Doing ff, doesn't yet compile 2012-07-23 15:26:15 +04:00			`// data from file`
cleanup of variables. Need to delete temporary files 2012-07-31 05:21:48 +04:00			`InputFileStream inStream(ptFileName);`
read in pt file. Doing ff, doesn't yet compile 2012-07-23 15:26:15 +04:00
			`// copied from class LoaderStandard`
better indication of what phrase table implementation is being loaded 2012-08-09 15:47:48 +04:00			`PrintUserTime("Start loading fuzzy-match phrase model");`
read in pt file. Doing ff, doesn't yet compile 2012-07-23 15:26:15 +04:00
			`const StaticData &staticData = StaticData::Instance();`
			`const std::string& factorDelimiter = staticData.GetFactorDelimiter();`


			`string lineOrig;`
			`size_t count = 0;`

			`while(getline(inStream, lineOrig)) {`
			`const string *line;`
			`if (format == HieroFormat) { // reformat line`
			`assert(false);`
			`//line = ReformatHieroRule(lineOrig);`
			`}`
			`else`
			`{ // do nothing to format of line`
			`line = &lineOrig;`
			`}`

			`vector<string> tokens;`
			`vector<float> scoreVector;`

			`TokenizeMultiCharSeparator(tokens, *line , "\|\|\|" );`

			`if (tokens.size() != 4 && tokens.size() != 5) {`
			`stringstream strme;`
cleanup of variables. Need to delete temporary files 2012-07-31 05:21:48 +04:00			`strme << "Syntax error at " << ptFileName << ":" << count;`
read in pt file. Doing ff, doesn't yet compile 2012-07-23 15:26:15 +04:00			`UserMessage::Add(strme.str());`
			`abort();`
			`}`

			`const string &sourcePhraseString = tokens[0]`
			`, &targetPhraseString = tokens[1]`
			`, &scoreString = tokens[2]`
			`, &alignString = tokens[3];`

			`bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);`
			`if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {`
cleanup of variables. Need to delete temporary files 2012-07-31 05:21:48 +04:00			`TRACE_ERR( ptFileName << ":" << count << ": pt entry contains empty target, skipping\n");`
read in pt file. Doing ff, doesn't yet compile 2012-07-23 15:26:15 +04:00			`continue;`
			`}`

			`Tokenize<float>(scoreVector, scoreString);`
			`const size_t numScoreComponents = GetFeature()->GetNumScoreComponents();`
			`if (scoreVector.size() != numScoreComponents) {`
			`stringstream strme;`
			`strme << "Size of scoreVector != number (" << scoreVector.size() << "!="`
			`<< numScoreComponents << ") of score components on line " << count;`
			`UserMessage::Add(strme.str());`
			`abort();`
			`}`
			`CHECK(scoreVector.size() == numScoreComponents);`

			`// parse source & find pt node`

			`// constituent labels`
			`Word sourceLHS, targetLHS;`

			`// source`
			`Phrase sourcePhrase( 0);`
			`sourcePhrase.CreateFromStringNewFormat(Input, *m_input, sourcePhraseString, factorDelimiter, sourceLHS);`

			`// create target phrase obj`
TargetPhrase doesn't have a FactorDirection constructor. It was implicitly doing TargetPhrase(Phrase(size_t)) wastefully 2012-10-22 20:40:23 +04:00			`TargetPhrase *targetPhrase = new TargetPhrase();`
read in pt file. Doing ff, doesn't yet compile 2012-07-23 15:26:15 +04:00			`targetPhrase->CreateFromStringNewFormat(Output, *m_output, targetPhraseString, factorDelimiter, targetLHS);`

			`// rest of target phrase`
			`targetPhrase->SetAlignmentInfo(alignString);`
			`targetPhrase->SetTargetLHS(targetLHS);`
			`//targetPhrase->SetDebugOutput(string("New Format pt ") + line);`

			`// component score, for n-best output`
			`std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),TransformScore);`
			`std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore);`

read in pt file. Compiles but doesn't run 2012-07-23 17:07:36 +04:00			`targetPhrase->SetScoreChart(GetFeature(), scoreVector, m_weight, m_languageModels, m_wpProducer);`
read in pt file. Doing ff, doesn't yet compile 2012-07-23 15:26:15 +04:00
read in pt file. Compiles but doesn't run 2012-07-23 17:07:36 +04:00			`TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(rootNode, sourcePhrase, *targetPhrase, sourceLHS);`
read in pt file. Doing ff, doesn't yet compile 2012-07-23 15:26:15 +04:00			`phraseColl.Add(targetPhrase);`

			`count++;`

			`if (format == HieroFormat) { // reformat line`
			`delete line;`
			`}`
			`else`
			`{ // do nothing`
			`}`

			`}`

			`// sort and prune each target phrase collection`
read in pt file. Compiles but doesn't run 2012-07-23 17:07:36 +04:00			`SortAndPrune(rootNode);`
read in pt file. Compiles but and runs. Need to check scores 2012-07-23 18:10:15 +04:00
eclipse files 2012-10-23 20:07:28 +04:00			`//removedirectoryrecursively(dirName);`
read in pt file. Compiles but doesn't run 2012-07-23 17:07:36 +04:00			`}`

rename tm-mt extraction to fuzzy match 2012-08-14 02:53:14 +04:00			`TargetPhraseCollection &PhraseDictionaryFuzzyMatch::GetOrCreateTargetPhraseCollection(PhraseDictionaryNodeSCFG &rootNode`
read in pt file. Compiles but doesn't run 2012-07-23 17:07:36 +04:00			`, const Phrase &source`
			`, const TargetPhrase &target`
			`, const Word &sourceLHS)`
			`{`
			`PhraseDictionaryNodeSCFG &currNode = GetOrCreateNode(rootNode, source, target, sourceLHS);`
			`return currNode.GetOrCreateTargetPhraseCollection();`
			`}`

rename tm-mt extraction to fuzzy match 2012-08-14 02:53:14 +04:00			`PhraseDictionaryNodeSCFG &PhraseDictionaryFuzzyMatch::GetOrCreateNode(PhraseDictionaryNodeSCFG &rootNode`
read in pt file. Compiles but doesn't run 2012-07-23 17:07:36 +04:00			`, const Phrase &source`
			`, const TargetPhrase &target`
			`, const Word &sourceLHS)`
			`{`
read in pt file. Compiles but doesn't run 2012-07-23 17:18:49 +04:00			`cerr << source << endl << target << endl;`
read in pt file. Compiles but doesn't run 2012-07-23 17:07:36 +04:00			`const size_t size = source.GetSize();`

terminal and non-terminal word alignments stored in 2 separate objects 2012-10-19 18:10:10 +04:00			`const AlignmentInfo &alignmentInfo = target.GetAlignNonTerm();`
read in pt file. Compiles but doesn't run 2012-07-23 17:07:36 +04:00			`AlignmentInfo::const_iterator iterAlign = alignmentInfo.begin();`

			`PhraseDictionaryNodeSCFG *currNode = &rootNode;`
			`for (size_t pos = 0 ; pos < size ; ++pos) {`
			`const Word& word = source.GetWord(pos);`

			`if (word.IsNonTerminal()) {`
			`// indexed by source label 1st`
			`const Word &sourceNonTerm = word;`

terminal and non-terminal word alignments stored in 2 separate objects 2012-10-19 18:10:10 +04:00			`CHECK(iterAlign != alignmentInfo.end());`
read in pt file. Compiles but doesn't run 2012-07-23 17:07:36 +04:00			`CHECK(iterAlign->first == pos);`
			`size_t targetNonTermInd = iterAlign->second;`
			`++iterAlign;`
			`const Word &targetNonTerm = target.GetWord(targetNonTermInd);`

			`currNode = currNode->GetOrCreateChild(sourceNonTerm, targetNonTerm);`
			`} else {`
			`currNode = currNode->GetOrCreateChild(word);`
			`}`

			`CHECK(currNode != NULL);`
			`}`
Added phrase dictionary for phi's tm extraction. Copy of SCFG phrase dictionart so far 2012-07-18 20:59:21 +04:00
read in pt file. Compiles but doesn't run 2012-07-23 17:07:36 +04:00			`// finally, the source LHS`
			`//currNode = currNode->GetOrCreateChild(sourceLHS);`
			`//CHECK(currNode != NULL);`


			`return *currNode;`
			`}`

rename tm-mt extraction to fuzzy match 2012-08-14 02:53:14 +04:00			`void PhraseDictionaryFuzzyMatch::SortAndPrune(PhraseDictionaryNodeSCFG &rootNode)`
read in pt file. Compiles but doesn't run 2012-07-23 17:07:36 +04:00			`{`
			`if (GetTableLimit())`
			`{`
			`rootNode.Sort(GetTableLimit());`
			`}`
Added phrase dictionary for phi's tm extraction. Copy of SCFG phrase dictionart so far 2012-07-18 20:59:21 +04:00			`}`

rename tm-mt extraction to fuzzy match 2012-08-14 02:53:14 +04:00			`void PhraseDictionaryFuzzyMatch::CleanUp(const InputType &source)`
Added phrase dictionary for phi's tm extraction. Copy of SCFG phrase dictionart so far 2012-07-18 20:59:21 +04:00			`{`
Per sentence grammar classes compiles but doesn't run 2012-07-19 22:36:46 +04:00			`m_collection.erase(source.GetTranslationId());`
Per sentence grammar classes compiles but doesn't run 2012-07-19 21:59:50 +04:00			`}`

rename tm-mt extraction to fuzzy match 2012-08-14 02:53:14 +04:00			`const PhraseDictionaryNodeSCFG &PhraseDictionaryFuzzyMatch::GetRootNode(const InputType &source) const`
Per sentence grammar classes compiles but doesn't run 2012-07-19 21:59:50 +04:00			`{`
			`long transId = source.GetTranslationId();`
			`std::map<long, PhraseDictionaryNodeSCFG>::const_iterator iter = m_collection.find(transId);`
			`CHECK(iter != m_collection.end());`
			`return iter->second;`
			`}`
rename tm-mt extraction to fuzzy match 2012-08-14 02:53:14 +04:00			`PhraseDictionaryNodeSCFG &PhraseDictionaryFuzzyMatch::GetRootNode(const InputType &source)`
Per sentence grammar classes compiles but doesn't run 2012-07-19 21:59:50 +04:00			`{`
			`long transId = source.GetTranslationId();`
			`std::map<long, PhraseDictionaryNodeSCFG>::iterator iter = m_collection.find(transId);`
			`CHECK(iter != m_collection.end());`
			`return iter->second;`
Added phrase dictionary for phi's tm extraction. Copy of SCFG phrase dictionart so far 2012-07-18 20:59:21 +04:00			`}`
wrap phi's suffix array implementation and extraction method in a wrapper class. Compiles 2012-07-27 03:10:49 +04:00
rename tm-mt extraction to fuzzy match 2012-08-14 02:53:14 +04:00			`TO_STRING_BODY(PhraseDictionaryFuzzyMatch);`
Added phrase dictionary for phi's tm extraction. Copy of SCFG phrase dictionart so far 2012-07-18 17:52:48 +04:00
			`// friend`
rename tm-mt extraction to fuzzy match 2012-08-14 02:53:14 +04:00			`ostream& operator<<(ostream& out, const PhraseDictionaryFuzzyMatch& phraseDict)`
Added phrase dictionary for phi's tm extraction. Copy of SCFG phrase dictionart so far 2012-07-18 17:52:48 +04:00			`{`
			`typedef PhraseDictionaryNodeSCFG::TerminalMap TermMap;`
			`typedef PhraseDictionaryNodeSCFG::NonTerminalMap NonTermMap;`

Per sentence grammar classes compiles but doesn't run 2012-07-19 21:59:50 +04:00			`/*`
Added phrase dictionary for phi's tm extraction. Copy of SCFG phrase dictionart so far 2012-07-18 17:52:48 +04:00			`const PhraseDictionaryNodeSCFG &coll = phraseDict.m_collection;`
			`for (NonTermMap::const_iterator p = coll.m_nonTermMap.begin(); p != coll.m_nonTermMap.end(); ++p) {`
			`const Word &sourceNonTerm = p->first.first;`
			`out << sourceNonTerm;`
			`}`
			`for (TermMap::const_iterator p = coll.m_sourceTermMap.begin(); p != coll.m_sourceTermMap.end(); ++p) {`
			`const Word &sourceTerm = p->first;`
			`out << sourceTerm;`
			`}`
Per sentence grammar classes compiles but doesn't run 2012-07-19 21:59:50 +04:00			`*/`

Added phrase dictionary for phi's tm extraction. Copy of SCFG phrase dictionart so far 2012-07-18 17:52:48 +04:00			`return out;`
			`}`

			`}`