mosesdecoder/moses/src/StaticData.h

530 lines
16 KiB
C
Raw Normal View History

// $Id$
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#ifndef moses_StaticData_h
#define moses_StaticData_h
#include <limits>
#include <list>
#include <vector>
#include <map>
#include <memory>
#include <utility>
#ifdef WITH_THREADS
#include <boost/thread/mutex.hpp>
#endif
#include "TypeDef.h"
#include "ScoreIndexManager.h"
#include "FactorCollection.h"
#include "Parameter.h"
#include "LanguageModel.h"
#include "LMList.h"
#include "SentenceStats.h"
#include "DecodeGraph.h"
#include "TranslationOptionList.h"
#if HAVE_CONFIG_H
#include "config.h"
#endif
//#include "UnknownWordHandler.h"
namespace Moses
{
class InputType;
class LexicalReordering;
class GlobalLexicalModel;
class PhraseDictionaryFeature;
class GenerationDictionary;
class DistortionScoreProducer;
class WordPenaltyProducer;
class DecodeStep;
class UnknownWordPenaltyProducer;
/** Contains global variables and constants.
 *
 * Exactly one static instance (s_instance) exists. It is read through
 * Instance(), which hands out a const reference, and it is filled in by
 * LoadDataStatic() / LoadData().
 */
class StaticData
{
private:
  static StaticData s_instance;

protected:
  std::map<long,Phrase> m_constraints;
  std::vector<PhraseDictionaryFeature*> m_phraseDictionary;
  std::vector<GenerationDictionary*> m_generationDictionary;
  Parameter *m_parameter;
  std::vector<FactorType> m_inputFactorOrder, m_outputFactorOrder;
  LMList m_languageModel;
  ScoreIndexManager m_scoreIndexManager;
  std::vector<float> m_allWeights;
  std::vector<LexicalReordering*> m_reorderModels;
  std::vector<GlobalLexicalModel*> m_globalLexicalModels;
  // Initial = 0 = can be used when creating poss trans
  // Other = 1 = used to calculate LM score once all steps have been processed
  float
  m_beamWidth,
  m_earlyDiscardingThreshold,
  m_translationOptionThreshold,
  m_weightDistortion,
  m_weightWordPenalty,
  m_wordDeletionWeight,
  m_weightUnknownWord;
  // PhraseTrans, Generation & LanguageModelScore has multiple weights.
  int m_maxDistortion;
  // do it differently from old pharaoh
  // -ve = no limit on distortion
  // 0 = no distortion (monotone in old pharaoh)
  bool m_reorderingConstraint; // use additional reordering constraints
  size_t
  m_maxHypoStackSize //hypothesis-stack size that triggers pruning
  , m_minHypoStackDiversity // minimum number of hypothesis in stack for each source word coverage
  , m_nBestSize
  , m_nBestFactor
  , m_maxNoTransOptPerCoverage
  , m_maxNoPartTransOpt
  , m_maxPhraseLength
  , m_numLinkParams;
  std::string m_constraintFileName;
  std::string m_nBestFilePath;
  bool m_fLMsLoaded, m_labeledNBestList, m_nBestIncludesAlignment;
  /***
   * false = treat unknown words as unknowns, and translate them as themselves;
   * true = drop (ignore) them
   */
  bool m_dropUnknown;
  bool m_wordDeletionEnabled;
  bool m_disableDiscarding;
  bool m_printAllDerivations;
  bool m_sourceStartPosMattersForRecombination;
  bool m_recoverPath;
  SearchAlgorithm m_searchAlgorithm;
  InputTypeEnum m_inputType;
  size_t m_numInputScores;
  mutable size_t m_verboseLevel; // mutable so SetVerboseLevel() can be const
  DistortionScoreProducer *m_distortionScoreProducer;
  WordPenaltyProducer *m_wpProducer;
  UnknownWordPenaltyProducer *m_unknownWordPenaltyProducer;
  bool m_reportSegmentation;
  bool m_reportAllFactors;
  bool m_reportAllFactorsNBest;
  bool m_isDetailedTranslationReportingEnabled;
  bool m_onlyDistinctNBest;
  bool m_computeLMBackoffStats;
  bool m_UseAlignmentInfo;
  bool m_PrintAlignmentInfo;
  bool m_PrintAlignmentInfoNbest;
  std::string m_factorDelimiter; //! by default, |, but it can be changed
  size_t m_maxFactorIdx[2]; //! number of factors on source and target side
  size_t m_maxNumFactors; //! max number of factors on both source and target sides
  XmlInputType m_xmlInputType; //! method for handling sentence XML input
  bool m_mbr; //! use MBR decoder
  bool m_useLatticeMBR; //! use lattice MBR decoder
  bool m_useConsensusDecoding; //! Use Consensus decoding (DeNero et al 2009)
  size_t m_mbrSize; //! number of translation candidates considered
  float m_mbrScale; //! scaling factor for computing marginal probability of candidate translation
  size_t m_lmbrPruning; //! average number of nodes per word wanted in pruned lattice
  std::vector<float> m_lmbrThetas; //! theta(s) for lattice mbr calculation
  bool m_useLatticeHypSetForLatticeMBR; //! to use nbest as hypothesis set during lattice MBR
  float m_lmbrPrecision; //! unigram precision theta - see Tromble et al 08 for more details
  float m_lmbrPRatio; //! decaying factor for ngram thetas - see Tromble et al 08 for more details
  float m_lmbrMapWeight; //! Weight given to the map solution. See Kumar et al 09 for details
  bool m_timeout; //! use timeout
  size_t m_timeout_threshold; //! seconds after which time out is activated
  bool m_useTransOptCache; //! flag indicating, if the persistent translation option cache should be used
  mutable std::map<std::pair<size_t, Phrase>, std::pair<TranslationOptionList*,clock_t> > m_transOptCache; //! persistent translation option cache
  size_t m_transOptCacheMaxSize; //! maximum size for persistent translation option cache
  //FIXME: Single lock for cache not most efficient. However using a
  //reader-writer for LRU cache is tricky - how to record last used time?
#ifdef WITH_THREADS
  mutable boost::mutex m_transOptCacheMutex;
#endif
  bool m_isAlwaysCreateDirectTranslationOption;
  bool m_outputWordGraph; //! whether to output word graph
  bool m_outputSearchGraph; //! whether to output search graph
  bool m_outputSearchGraphExtended; //! ... in extended format
#ifdef HAVE_PROTOBUF
  bool m_outputSearchGraphPB; //! whether to output search graph as a protobuf
#endif
  size_t m_cubePruningPopLimit;
  size_t m_cubePruningDiversity;

  //! constructor. only the 1 static variable can be created
  StaticData();

  //! helper fn to set bool param from ini file/command line
  void SetBooleanParameter(bool *parameter, std::string parameterName, bool defaultValue);

  /***
   * load all language models as specified in ini file
   */
  bool LoadLanguageModels();

  /***
   * load not only the main phrase table but also any auxiliary tables that depend on which features are being used
   * (eg word-deletion, word-insertion tables)
   */
  bool LoadPhraseTables();

  //! load all generation tables as specified in ini file
  bool LoadGenerationTables();

  //! load the lexical reordering model(s)
  bool LoadLexicalReorderingModel();

  //! load the global lexical model(s)
  bool LoadGlobalLexicalModel();

  void ReduceTransOptCache() const;

public:
  bool IsAlwaysCreateDirectTranslationOption() const
  {
    return m_isAlwaysCreateDirectTranslationOption;
  }

  //! destructor
  ~StaticData();

  //! return static instance for use like global variable
  static const StaticData& Instance()
  {
    return s_instance;
  }

  /** delete current static instance and replace with another.
   * Used by gui front end
   */
#ifdef WIN32
  static void Reset()
  {
    s_instance = StaticData();
  }
#endif

  /** load data into static instance. This function is required
   * as LoadData() is not const
   */
  static bool LoadDataStatic(Parameter *parameter)
  {
    return s_instance.LoadData(parameter);
  }

  /** Main function to load everything.
   * Also initialize the Parameter object
   */
  bool LoadData(Parameter *parameter);

  //! values stored for paramName in the Parameter object (m_parameter is dereferenced unchecked)
  const PARAM_VEC &GetParam(const std::string &paramName) const
  {
    return m_parameter->GetParam(paramName);
  }

  bool IsComputeLMBackoffStats() const
  {
    return m_computeLMBackoffStats;
  }

  const std::vector<FactorType> &GetInputFactorOrder() const
  {
    return m_inputFactorOrder;
  }

  const std::vector<FactorType> &GetOutputFactorOrder() const
  {
    return m_outputFactorOrder;
  }

  std::vector<DecodeGraph*> GetDecodeStepVL(const InputType& source) const;

  inline bool GetSourceStartPosMattersForRecombination() const
  {
    return m_sourceStartPosMattersForRecombination;
  }

  inline bool GetDropUnknown() const
  {
    return m_dropUnknown;
  }

  inline bool GetDisableDiscarding() const
  {
    return m_disableDiscarding;
  }

  inline size_t GetMaxNoTransOptPerCoverage() const
  {
    return m_maxNoTransOptPerCoverage;
  }

  inline size_t GetMaxNoPartTransOpt() const
  {
    return m_maxNoPartTransOpt;
  }

  /** look up the constraint phrase registered for a sentence.
   * \return pointer to the phrase stored in m_constraints under sentenceID,
   *         or NULL when no entry exists for that id
   */
  inline const Phrase* GetConstrainingPhrase(long sentenceID) const
  {
    std::map<long,Phrase>::const_iterator iter = m_constraints.find(sentenceID);
    if (iter == m_constraints.end()) {
      return NULL;
    }
    return &iter->second;
  }

  inline size_t GetMaxPhraseLength() const
  {
    return m_maxPhraseLength;
  }

  const std::vector<LexicalReordering*> &GetReorderModels() const
  {
    return m_reorderModels;
  }

  float GetWeightDistortion() const
  {
    return m_weightDistortion;
  }

  float GetWeightWordPenalty() const
  {
    return m_weightWordPenalty;
  }

  float GetWeightUnknownWord() const
  {
    return m_weightUnknownWord;
  }

  bool IsWordDeletionEnabled() const
  {
    return m_wordDeletionEnabled;
  }

  size_t GetMaxHypoStackSize() const
  {
    return m_maxHypoStackSize;
  }

  size_t GetMinHypoStackDiversity() const
  {
    return m_minHypoStackDiversity;
  }

  size_t GetCubePruningPopLimit() const
  {
    return m_cubePruningPopLimit;
  }

  size_t GetCubePruningDiversity() const
  {
    return m_cubePruningDiversity;
  }

  // Returns the bool flag m_recoverPath; the return type is bool to match
  // the flag and the other Is...() accessors (it was previously size_t).
  bool IsPathRecoveryEnabled() const
  {
    return m_recoverPath;
  }

  int GetMaxDistortion() const
  {
    return m_maxDistortion;
  }

  bool UseReorderingConstraint() const
  {
    return m_reorderingConstraint;
  }

  float GetBeamWidth() const
  {
    return m_beamWidth;
  }

  float GetEarlyDiscardingThreshold() const
  {
    return m_earlyDiscardingThreshold;
  }

  //! true unless the early discarding threshold equals negative infinity
  bool UseEarlyDiscarding() const
  {
    return m_earlyDiscardingThreshold != -std::numeric_limits<float>::infinity();
  }

  float GetTranslationOptionThreshold() const
  {
    return m_translationOptionThreshold;
  }

  //! returns the total number of score components across all types, all factors
  size_t GetTotalScoreComponents() const
  {
    return m_scoreIndexManager.GetTotalNumberOfScores();
  }

  const ScoreIndexManager& GetScoreIndexManager() const
  {
    return m_scoreIndexManager;
  }

  size_t GetLMSize() const
  {
    return m_languageModel.size();
  }

  const LMList &GetAllLM() const
  {
    return m_languageModel;
  }

  size_t GetPhraseDictionarySize() const
  {
    return m_phraseDictionary.size();
  }

  const std::vector<PhraseDictionaryFeature*> &GetPhraseDictionaries() const
  {
    return m_phraseDictionary;
  }

  const std::vector<GenerationDictionary*> &GetGenerationDictionaries() const
  {
    return m_generationDictionary;
  }

  size_t GetGenerationDictionarySize() const
  {
    return m_generationDictionary.size();
  }

  size_t GetVerboseLevel() const
  {
    return m_verboseLevel;
  }

  // const on purpose: m_verboseLevel is mutable, so the level can be changed
  // through the const reference returned by Instance().
  void SetVerboseLevel(int x) const
  {
    m_verboseLevel = x;
  }

  bool GetReportSegmentation() const
  {
    return m_reportSegmentation;
  }

  bool GetReportAllFactors() const
  {
    return m_reportAllFactors;
  }

  bool GetReportAllFactorsNBest() const
  {
    return m_reportAllFactorsNBest;
  }

  bool IsDetailedTranslationReportingEnabled() const
  {
    return m_isDetailedTranslationReportingEnabled;
  }

  bool IsLabeledNBestList() const
  {
    return m_labeledNBestList;
  }

  bool NBestIncludesAlignment() const
  {
    return m_nBestIncludesAlignment;
  }

  size_t GetNumLinkParams() const
  {
    return m_numLinkParams;
  }

  //! values of the "description" parameter
  const std::vector<std::string> &GetDescription() const
  {
    return m_parameter->GetParam("description");
  }

  // for mert
  size_t GetNBestSize() const
  {
    return m_nBestSize;
  }

  const std::string &GetNBestFilePath() const
  {
    return m_nBestFilePath;
  }

  //! true if an n-best file path is set or any of MBR, lattice MBR,
  //! search graph output or consensus decoding is switched on
  bool IsNBestEnabled() const
  {
    return (!m_nBestFilePath.empty()) || m_mbr || m_useLatticeMBR || m_outputSearchGraph || m_useConsensusDecoding
#ifdef HAVE_PROTOBUF
           || m_outputSearchGraphPB
#endif
           ;
  }

  size_t GetNBestFactor() const
  {
    return m_nBestFactor;
  }

  bool GetOutputWordGraph() const
  {
    return m_outputWordGraph;
  }

  //! Sets the global score vector weights for a given ScoreProducer.
  void SetWeightsForScoreProducer(const ScoreProducer* sp, const std::vector<float>& weights);

  InputTypeEnum GetInputType() const
  {
    return m_inputType;
  }

  SearchAlgorithm GetSearchAlgorithm() const
  {
    return m_searchAlgorithm;
  }

  size_t GetNumInputScores() const
  {
    return m_numInputScores;
  }

  void InitializeBeforeSentenceProcessing(InputType const&) const;
  void CleanUpAfterSentenceProcessing() const;

  const std::vector<float>& GetAllWeights() const
  {
    return m_allWeights;
  }

  const DistortionScoreProducer *GetDistortionScoreProducer() const
  {
    return m_distortionScoreProducer;
  }

  const WordPenaltyProducer *GetWordPenaltyProducer() const
  {
    return m_wpProducer;
  }

  const UnknownWordPenaltyProducer *GetUnknownWordPenaltyProducer() const
  {
    return m_unknownWordPenaltyProducer;
  }

  bool UseAlignmentInfo() const
  {
    return m_UseAlignmentInfo;
  }

  void UseAlignmentInfo(bool a)
  {
    m_UseAlignmentInfo = a;
  }

  bool PrintAlignmentInfo() const
  {
    return m_PrintAlignmentInfo;
  }

  bool PrintAlignmentInfoInNbest() const
  {
    return m_PrintAlignmentInfoNbest;
  }

  bool GetDistinctNBest() const
  {
    return m_onlyDistinctNBest;
  }

  const std::string& GetFactorDelimiter() const
  {
    return m_factorDelimiter;
  }

  //! largest factor index stored for the given direction, plus one
  size_t GetMaxNumFactors(FactorDirection direction) const
  {
    return m_maxFactorIdx[static_cast<size_t>(direction)] + 1;
  }

  size_t GetMaxNumFactors() const
  {
    return m_maxNumFactors;
  }

  bool UseMBR() const
  {
    return m_mbr;
  }

  bool UseLatticeMBR() const
  {
    return m_useLatticeMBR;
  }

  bool UseConsensusDecoding() const
  {
    return m_useConsensusDecoding;
  }

  void SetUseLatticeMBR(bool flag)
  {
    m_useLatticeMBR = flag;
  }

  size_t GetMBRSize() const
  {
    return m_mbrSize;
  }

  float GetMBRScale() const
  {
    return m_mbrScale;
  }

  void SetMBRScale(float scale)
  {
    m_mbrScale = scale;
  }

  size_t GetLatticeMBRPruningFactor() const
  {
    return m_lmbrPruning;
  }

  void SetLatticeMBRPruningFactor(size_t prune)
  {
    m_lmbrPruning = prune;
  }

  const std::vector<float>& GetLatticeMBRThetas() const
  {
    return m_lmbrThetas;
  }

  bool UseLatticeHypSetForLatticeMBR() const
  {
    return m_useLatticeHypSetForLatticeMBR;
  }

  float GetLatticeMBRPrecision() const
  {
    return m_lmbrPrecision;
  }

  void SetLatticeMBRPrecision(float p)
  {
    m_lmbrPrecision = p;
  }

  float GetLatticeMBRPRatio() const
  {
    return m_lmbrPRatio;
  }

  void SetLatticeMBRPRatio(float r)
  {
    m_lmbrPRatio = r;
  }

  float GetLatticeMBRMapWeight() const
  {
    return m_lmbrMapWeight;
  }

  bool UseTimeout() const
  {
    return m_timeout;
  }

  size_t GetTimeoutThreshold() const
  {
    return m_timeout_threshold;
  }

  bool GetOutputSearchGraph() const
  {
    return m_outputSearchGraph;
  }

  bool GetOutputSearchGraphExtended() const
  {
    return m_outputSearchGraphExtended;
  }

#ifdef HAVE_PROTOBUF
  bool GetOutputSearchGraphPB() const
  {
    return m_outputSearchGraphPB;
  }
#endif

  XmlInputType GetXmlInputType() const
  {
    return m_xmlInputType;
  }

  bool GetUseTransOptCache() const
  {
    return m_useTransOptCache;
  }

  void AddTransOptListToCache(const DecodeGraph &decodeGraph, const Phrase &sourcePhrase, const TranslationOptionList &transOptList) const;
  const TranslationOptionList* FindTransOptListInCache(const DecodeGraph &decodeGraph, const Phrase &sourcePhrase) const;

  bool PrintAllDerivations() const
  {
    return m_printAllDerivations;
  }
};
}
#endif