mosesdecoder/moses/GenerationDictionary.cpp


// $Id$
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <fstream>
#include <string>
#include "GenerationDictionary.h"
#include "FactorCollection.h"
#include "Word.h"
#include "Util.h"
#include "InputFileStream.h"
#include "StaticData.h"
#include "UserMessage.h"
using namespace std;
namespace Moses
{
GenerationDictionary::GenerationDictionary(size_t numFeatures,
                                           const std::vector<FactorType> &input,
                                           const std::vector<FactorType> &output)
  : Dictionary(numFeatures), DecodeFeature("Generation", numFeatures, input, output) {}

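// Load() parses a whitespace-tokenised generation table. Judging from the
// parsing code below, each line is expected to look roughly like
//   <input factors> <output factors> <score_1> ... <score_n>
// where the factor strings are '|'-delimited and ordered to match the input/
// output factor lists from the configuration, e.g. (illustrative values only):
//   haus haus|NN 0.8 0.6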
bool GenerationDictionary::Load(const std::string &filePath, FactorDirection direction)
{
  FactorCollection &factorCollection = FactorCollection::Instance();

  const size_t numFeatureValuesInConfig = this->GetNumScoreComponents();

  // data from file
  InputFileStream inFile(filePath);
  if (!inFile.good()) {
    UserMessage::Add(string("Couldn't read ") + filePath);
    return false;
  }

  m_filePath = filePath;
  string line;
  size_t lineNum = 0;
  while (getline(inFile, line)) {
    ++lineNum;
    vector<string> token = Tokenize(line);

    // add each line in generation file into class
    Word *inputWord = new Word(); // deleted in destructor
    Word outputWord;

    // create words with only the configured factors filled out
    // inputs
    vector<string> factorString = Tokenize(token[0], "|");
    for (size_t i = 0; i < GetInput().size(); i++) {
      FactorType factorType = GetInput()[i];
      const Factor *factor = factorCollection.AddFactor(direction, factorType, factorString[i]);
      inputWord->SetFactor(factorType, factor);
    }

    // outputs
    factorString = Tokenize(token[1], "|");
    for (size_t i = 0; i < GetOutput().size(); i++) {
      FactorType factorType = GetOutput()[i];
      const Factor *factor = factorCollection.AddFactor(direction, factorType, factorString[i]);
      outputWord.SetFactor(factorType, factor);
    }

    size_t numFeaturesInFile = token.size() - 2;
    if (numFeaturesInFile < numFeatureValuesInConfig) {
      stringstream strme;
      strme << filePath << ":" << lineNum << ": expected " << numFeatureValuesInConfig
            << " feature values, but found " << numFeaturesInFile << std::endl;
      UserMessage::Add(strme.str());
      return false;
    }
    std::vector<float> scores(numFeatureValuesInConfig, 0.0f);
    for (size_t i = 0; i < numFeatureValuesInConfig; i++)
      scores[i] = FloorScore(TransformScore(Scan<float>(token[2 + i])));

    Collection::iterator iterWord = m_collection.find(inputWord);
    if (iterWord == m_collection.end()) {
      m_collection[inputWord][outputWord].Assign(this, scores);
    } else {
      // source word already in there. delete input word to avoid mem leak
      (iterWord->second)[outputWord].Assign(this, scores);
      delete inputWord;
    }
  }
  inFile.Close();
  return true;
}

GenerationDictionary::~GenerationDictionary()
{
  Collection::const_iterator iter;
  for (iter = m_collection.begin(); iter != m_collection.end(); ++iter) {
    delete iter->first;
  }
}

const OutputWordCollection *GenerationDictionary::FindWord(const Word &word) const
{
  const OutputWordCollection *ret;

  Collection::const_iterator iter = m_collection.find(&word);
  if (iter == m_collection.end()) {
    // can't find source phrase
    ret = NULL;
  } else {
    ret = &iter->second;
  }

  return ret;
}

bool GenerationDictionary::ComputeValueInTranslationOption() const
{
  return true;
}
}