mosesdecoder/moses/LM/ParallelBackoff.cpp

// $Id$

/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
***********************************************************************/

#include "ParallelBackoff.h"

#include <vector>
#include <string>
#include <sstream>
#include <fstream>

#include "MultiFactor.h"
#include "moses/Word.h"
#include "moses/Factor.h"
#include "moses/FactorTypeSet.h"
#include "moses/FactorCollection.h"
#include "moses/Phrase.h"
#include "moses/TypeDef.h"
#include "moses/Util.h"

// By default, SRILM defines a function called zopen.
//
// However, on Mac OS X (and possibly other BSDs),
// <stdio.h> already defines a zopen function.
//
// To resolve this conflict, SRILM checks to see if HAVE_ZOPEN is defined.
// If it is, SRILM will rename its zopen function as my_zopen.
//
// So, before importing any SRILM headers,
// it is important to define HAVE_ZOPEN if we are on an Apple OS:
//
#ifdef __APPLE__
#define HAVE_ZOPEN
#endif

#include "FNgramSpecs.h"
#include "FNgramStats.h"
#include "FactoredVocab.h"
#include "FNgram.h"
#include "wmatrix.h"
#include "Vocab.h"
#include "File.h"

using namespace std;

namespace Moses
{

namespace
{
class LanguageModelParallelBackoff : public LanguageModelMultiFactor
{
private:
  std::vector<FactorType> m_factorTypesOrdered;

  FactoredVocab   *m_srilmVocab;
  FNgram          *m_srilmModel;
  VocabIndex  m_unknownId;
  VocabIndex  m_wtid;
  VocabIndex  m_wtbid;
  VocabIndex  m_wteid;
  FNgramSpecs<FNgramCount>* fnSpecs;
  //std::vector<VocabIndex> m_lmIdLookup;
  std::map<size_t, VocabIndex>* lmIdMap;
  std::fstream* debugStream;

  WidMatrix *widMatrix;

public:
  LanguageModelParallelBackoff(const std::string &line)
    :LanguageModelMultiFactor(line) {
  }

  ~LanguageModelParallelBackoff();

  bool Load(const std::string &filePath, const std::vector<FactorType> &factorTypes, size_t nGramOrder);

  VocabIndex GetLmID( const std::string &str ) const;

  VocabIndex GetLmID( const Factor *factor, FactorType ft ) const;

  void CreateFactors();

  LMResult GetValueForgotState(const std::vector<const Word*> &contextFactor, FFState &outState) const;
  const FFState *GetNullContextState() const;
  const FFState *GetBeginSentenceState() const;
  FFState *NewState(const FFState *from) const;
};

/**
 * Destructor.
 *
 * Currently releases nothing: the objects created with `new` in Load() and
 * CreateFactors() (the SRILM vocabulary, specs and model, the word-id matrix
 * and the factor-id map) are not deleted here.
 *
 * NOTE(review): freeing them here is only safe once the constructor
 * initializes the pointer members, since Load() may never have been called —
 * confirm before adding deletes.
 */
LanguageModelParallelBackoff::~LanguageModelParallelBackoff()
{
  /// intentionally empty (see note above)
}


/**
 * Load a factored language model through SRILM and prepare the factor map.
 *
 * Steps, in order: allocate the vocabulary, read the model specification from
 * filePath, read the counts, post-process them, read the LM itself, record the
 * configuration in member variables and finally call CreateFactors().
 * Progress is reported on stderr.
 *
 * @param filePath     path handed to SRILM's File (opened with mode "r") and
 *                     stored in m_filePath.
 * @param factorTypes  factor types used by this model; stored both as a
 *                     FactorMask (m_factorTypes) and in their given order
 *                     (m_factorTypesOrdered).
 * @param nGramOrder   stored in m_nGramOrder.
 * @return always true; when reading the counts or the LM fails the process is
 *         terminated with exit(1) instead of returning false.
 *
 * NOTE(review): m_factorTypes, m_filePath and m_nGramOrder are not declared in
 * this class — presumably inherited from the base class; confirm.
 */
bool LanguageModelParallelBackoff::Load(const std::string &filePath, const std::vector<FactorType> &factorTypes, size_t nGramOrder)
{

  cerr << "Loading Language Model Parallel Backoff!!!\n";
  // NOTE(review): this member matrix is allocated here but
  // GetValueForgotState() uses its own function-local static matrix instead.
  widMatrix = new ::WidMatrix();
  m_factorTypes	= FactorMask(factorTypes);
  m_srilmVocab = new ::FactoredVocab();
  //assert(m_srilmVocab != 0);

  // Build the model specification from the file, registering its tokens in
  // the freshly created vocabulary.
  fnSpecs = 0;
  File f(filePath.c_str(),"r");
  fnSpecs = new ::FNgramSpecs<FNgramCount>(f,*m_srilmVocab, 0/*debug*/);

  cerr << "Loaded fnSpecs!\n";

  // Vocabulary options are set after the specification has been read.
  m_srilmVocab->unkIsWord() = true;
  m_srilmVocab->nullIsWord() = true;
  m_srilmVocab->toLower() = false;

  // NOTE(review): factoredStats is never deleted and the pointer is lost when
  // this function returns.
  FNgramStats *factoredStats = new FNgramStats(*m_srilmVocab, *fnSpecs);

  factoredStats->debugme(2);

  cerr << "Factored stats\n";

  FNgram* fngramLM = new FNgram(*m_srilmVocab,*fnSpecs);

  cerr << "FNgram object created\n";

  fngramLM->skipOOVs = false;

  // Failure to read the counts is fatal for the whole process.
  if (!factoredStats->read()) {
    cerr << "error reading in counts in factor file\n";
    exit(1);
  }

  cerr << "Factored stats read!\n";

  // SRILM post-processing of the counts, performed before the LM is read.
  factoredStats->estimateDiscounts();
  factoredStats->computeCardinalityFunctions();
  factoredStats->sumCounts();

  cerr << "Another three operations made!\n";

  // Failure to read the LM is fatal for the whole process.
  if (!fngramLM->read()) {
    cerr << "format error in lm file\n";
    exit(1);
  }

  cerr << "fngramLM reads!\n";

  m_filePath = filePath;
  m_nGramOrder= nGramOrder;

  m_factorTypesOrdered= factorTypes;

  // Index that GetLmID() falls back to for tokens/factors it cannot resolve.
  m_unknownId = m_srilmVocab->unkIndex();

  cerr << "m_unknowdId = " << m_unknownId << endl;

  m_srilmModel = fngramLM;

  cerr << "Create factors...\n";

  // Needs m_srilmVocab, m_unknownId and m_factorTypesOrdered, all set above.
  CreateFactors();

  cerr << "Factors created! \n";
  //FactorCollection &factorCollection = FactorCollection::Instance();

  // Disabled legacy code: sentence markers are registered in CreateFactors().
  /*for (size_t index = 0 ; index < m_factorTypesOrdered.size() ; ++index)
  {
  	FactorType factorType = m_factorTypesOrdered[index];
  	m_sentenceStartArray[factorType] 	= factorCollection.AddFactor(Output, factorType, BOS_);


  	m_sentenceEndArray[factorType] 		= factorCollection.AddFactor(Output, factorType, EOS_);

    //factorIdStart = m_sentenceStartArray[factorType]->GetId();
    //factorIdEnd = m_sentenceEndArray[factorType]->GetId();

    for (size_t i = 0; i < 10; i++)
    {
      lmIdMap[factorIdStart * 10 + i] = GetLmID(BOS_);
  		lmIdMap[factorIdEnd * 10 + i] = GetLmID(EOS_);
    }

  	//(*lmIdMap)[factorIdStart * 10 + index] = GetLmID(BOS_);
  	//(*lmIdMap)[factorIdEnd * 10 + index] = GetLmID(EOS_);

  }*/
  return true;
}

/**
 * Look up a token string in the SRILM vocabulary.
 *
 * @param str  token exactly as stored in the SRILM vocabulary.
 * @return the value of m_srilmVocab->getIndex() for the token; m_unknownId is
 *         passed as that call's second argument.
 */
VocabIndex LanguageModelParallelBackoff::GetLmID( const std::string &str ) const
{
  const char *const token = str.c_str();
  const VocabIndex lmId = m_srilmVocab->getIndex( token, m_unknownId );
  return lmId;
}

/**
 * Look up the SRILM vocabulary index registered for a Moses factor.
 *
 * The map key is `factor id * 10 + ft`, the same encoding CreateFactors()
 * uses when it fills lmIdMap; keys are therefore only unambiguous while
 * ft < 10.
 *
 * @param factor  factor to resolve; must not be NULL.
 * @param ft      factor index that was combined with the factor id when the
 *                entry was stored.
 * @return the stored vocabulary index, or m_unknownId when the key is absent.
 */
VocabIndex LanguageModelParallelBackoff::GetLmID( const Factor *factor, size_t ft ) const
{
  const size_t factorId = factor->GetId();
  const size_t key = factorId * 10 + ft;

  // One lookup instead of two: this runs once per factor of every scored word.
  const std::map<size_t, VocabIndex>::iterator hit = lmIdMap->find( key );
  if ( hit == lmIdMap->end() ) {
    return m_unknownId;
  }
  return hit->second;
}

void LanguageModelParallelBackoff::CreateFactors()
{

  // add factors which have srilm id
  FactorCollection &factorCollection = FactorCollection::Instance();

  lmIdMap = new std::map<size_t, VocabIndex>();


  VocabString str;
  VocabIter iter(*m_srilmVocab);

  iter.init();

  size_t pomFactorTypeNum = 0;


  while ( (str = iter.next()) != NULL) {

    if ((str[0] < 'a' || str[0] > 'k') && str[0] != 'W') {
      continue;
    }
    VocabIndex lmId = GetLmID(str);
    pomFactorTypeNum = str[0] - 'a';

    size_t factorId = factorCollection.AddFactor(Output, m_factorTypesOrdered[pomFactorTypeNum], &(str[2]) )->GetId();
    (*lmIdMap)[factorId * 10 + pomFactorTypeNum] = lmId;
  }

  size_t factorIdStart;
  size_t factorIdEnd;

  // sentence markers
  for (size_t index = 0 ; index < m_factorTypesOrdered.size() ; ++index) {
    FactorType factorType = m_factorTypesOrdered[index];
    m_sentenceStartWord[index] 	= factorCollection.AddFactor(Output, factorType, BOS_);


    m_sentenceEndWord[index] 		= factorCollection.AddFactor(Output, factorType, EOS_);

    factorIdStart = m_sentenceStartWord[index]->GetId();
    factorIdEnd = m_sentenceEndWord[index]->GetId();

    /*for (size_t i = 0; i < 10; i++)
    {
      lmIdMap[factorIdStart * 10 + i] = GetLmID(BOS_);
    	lmIdMap[factorIdEnd * 10 + i] = GetLmID(EOS_);
    }*/

    (*lmIdMap)[factorIdStart * 10 + index] = GetLmID(BOS_);
    (*lmIdMap)[factorIdEnd * 10 + index] = GetLmID(EOS_);

    cerr << "BOS_:" << GetLmID(BOS_) << ", EOS_:" << GetLmID(EOS_) << endl;

  }

  m_wtid = GetLmID("W-<unk>");
  m_wtbid = GetLmID("W-<s>");
  m_wteid = GetLmID("W-</s>");

  cerr << "W-<unk> index: " << m_wtid << endl;
  cerr << "W-<s> index: " << m_wtbid << endl;
  cerr << "W-</s> index: " << m_wteid << endl;


}

/**
 * Score the last word of contextFactor given the words before it.
 *
 * Each word becomes one row of an SRILM WidMatrix: slot 0 holds the word-tag
 * id (m_wtbid for sentence start, m_wteid for sentence end, m_wtid otherwise)
 * and slot j + 1 holds the SRILM index of the j-th ordered factor (0 when the
 * word does not carry that factor).
 *
 * @param contextFactor  words of the n-gram, the word to be scored last.
 * @param outState       unused.
 * @return score = FloorScore(TransformLMScore(wordProb(...))); unknown is true
 *         when slot 0 of the last row equals m_unknownId.  For an empty
 *         contextFactor the result is score 0 / unknown false and SRILM is not
 *         called (previously `size() - 1` wrapped around and was passed to
 *         wordProb).
 *
 * The matrix is a function-local static, i.e. one buffer shared by every call
 * on every instance; concurrent calls would write into the same buffer.
 *
 * NOTE(review): slot 0 is always one of m_wtid / m_wtbid / m_wteid, so the
 * `unknown` flag depends on one of those being equal to m_unknownId rather
 * than on the factor slots — confirm this is intended.
 * NOTE(review): the number of rows/columns the WidMatrix can hold is not
 * checked against contextFactor.size() / the factor count — confirm limits.
 */
LMResult LanguageModelParallelBackoff::GetValueForgotState(const std::vector<const Word*> &contextFactor, FFState & /*outState */) const
{
  const size_t numWords = contextFactor.size();

  if (numWords == 0) {
    LMResult emptyResult;
    emptyResult.score = 0;
    emptyResult.unknown = false;
    return emptyResult;
  }

  static WidMatrix widMatrix;

  const size_t numFactors = m_factorTypesOrdered.size();

  // SRILM ids of the sentence markers for factor index 0; loop-invariant.
  const VocabIndex bosLmId = GetLmID(m_sentenceStartWord[0], 0);
  const VocabIndex eosLmId = GetLmID(m_sentenceEndWord[0], 0);

  for (size_t i = 0; i < numWords; ++i) {
    // Clear the word slot and all factor slots of this row before filling it.
    ::memset(widMatrix[i], 0, (numFactors + 1)*sizeof(VocabIndex));

    const Word &word = *contextFactor[i];

    for (size_t j = 0; j < numFactors; ++j) {
      const Factor *factor = word[ m_factorTypesOrdered[j] ];

      if (factor == NULL)
        widMatrix[i][j + 1] = 0;
      else
        widMatrix[i][j + 1] = GetLmID(factor, j);
    }

    // The first factor decides which word-tag id goes into slot 0.
    if (widMatrix[i][1] == bosLmId) {
      widMatrix[i][0] = m_wtbid;
    } else if (widMatrix[i][1] == eosLmId) {
      widMatrix[i][0] = m_wteid;
    } else {
      widMatrix[i][0] = m_wtid;
    }
  }

  LMResult ret;
  ret.score = m_srilmModel->wordProb( widMatrix, numWords - 1, numWords );
  ret.score = FloorScore(TransformLMScore(ret.score));
  ret.unknown = (widMatrix[numWords - 1][0] == m_unknownId);
  return ret;
}

// Always yields a NULL state; the `from` argument is ignored.
// History: an earlier version left its finalState uninitialized, which is
// technically undefined behaviour, so it is unclear what else could be
// returned here.
FFState *LanguageModelParallelBackoff::NewState(const FFState * /*from*/) const
{
  FFState *const noState = NULL;
  return noState;
}

// Always yields a NULL pointer as the state for an empty context.
const FFState *LanguageModelParallelBackoff::GetNullContextState() const
{
  const FFState *const noState = NULL;
  return noState;
}

// Always yields a NULL pointer as the state at the beginning of a sentence.
const FFState *LanguageModelParallelBackoff::GetBeginSentenceState() const
{
  const FFState *const noState = NULL;
  return noState;
}

}


}