// vim:tabstop=2 /*********************************************************************** Moses - factored phrase-based language decoder Copyright (C) 2006 University of Edinburgh This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ***********************************************************************/ #include "moses/TranslationModel/PhraseDictionary.h" #include "moses/TranslationModel/PhraseDictionaryTreeAdaptor.h" #include "moses/TranslationModel/RuleTable/PhraseDictionarySCFG.h" #include "moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.h" #include "moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.h" #include "moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.h" #ifndef WIN32 #include "moses/TranslationModel/PhraseDictionaryDynSuffixArray.h" #include "moses/TranslationModel/CompactPT/PhraseDictionaryCompact.h" #endif #include "moses/TranslationModel/RuleTable/UTrie.h" #include "moses/StaticData.h" #include "moses/InputType.h" #include "moses/TranslationOption.h" #include "moses/UserMessage.h" using namespace std; namespace Moses { PhraseDictionary::PhraseDictionary(const std::string &description, const std::string &line) :DecodeFeature(description, line) ,m_sparsePhraseDictionaryFeature(NULL) { m_tableLimit= 20; // TODO default? for (size_t i = 0; i < m_args.size(); ++i) { const vector &args = m_args[i]; if (args[0] == "input-factor") { m_input =Tokenize(args[1]); m_inputFactors = FactorMask(m_input); } else if (args[0] == "output-factor") { m_output =Tokenize(args[1]); m_outputFactors = FactorMask(m_output); } else if (args[0] == "num-input-features") { m_numInputScores = Scan(args[1]); } else if (args[0] == "path") { m_filePath = args[1]; } else if (args[0] == "table-limit") { m_tableLimit = Scan(args[1]); } else if (args[0] == "target-path") { m_targetFile = args[1]; } else if (args[0] == "alignment-path") { m_alignmentsFile = args[1]; } else { throw "Unknown argument " + args[0]; } } // for (size_t i = 0; i < toks.size(); ++i) { } const TargetPhraseCollection *PhraseDictionary:: GetTargetPhraseCollection(InputType const& src,WordsRange const& range) const { return GetTargetPhraseCollection(src.GetSubString(range)); } /* PhraseDictionary* PhraseDictionaryFeature::LoadPhraseTable(const TranslationSystem* system) { const StaticData& staticData = StaticData::Instance(); std::vector weightT = staticData.GetWeights(this); if (m_implementation == Memory) { // memory phrase table VERBOSE(2,"using standard phrase tables" << std::endl); if (!FileExists(m_filePath) && FileExists(m_filePath + ".gz")) { m_filePath += ".gz"; VERBOSE(2,"Using gzipped file" << std::endl); } if (staticData.GetInputType() != SentenceInput) { UserMessage::Add("Must use binary phrase table for this input type"); CHECK(false); } PhraseDictionaryMemory* pdm = new PhraseDictionaryMemory(GetNumScoreComponents(),this); bool ret = pdm->Load(GetInput(), GetOutput() , m_filePath , weightT , m_tableLimit , staticData.GetLMList() , staticData.GetWeightWordPenalty()); CHECK(ret); return pdm; } else if (m_implementation == Binary) { PhraseDictionaryTreeAdaptor* pdta = new PhraseDictionaryTreeAdaptor(GetNumScoreComponents(), m_numInputScores,this); bool ret = pdta->Load( GetInput() , GetOutput() , m_filePath , weightT , m_tableLimit , staticData.GetLMList() , staticData.GetWeightWordPenalty()); CHECK(ret); return pdta; } else if (m_implementation == SCFG || m_implementation == Hiero) { // memory phrase table if (m_implementation == Hiero) { VERBOSE(2,"using Hiero format phrase tables" << std::endl); } else { VERBOSE(2,"using Moses-formatted SCFG phrase tables" << std::endl); } if (!FileExists(m_filePath) && FileExists(m_filePath + ".gz")) { m_filePath += ".gz"; VERBOSE(2,"Using gzipped file" << std::endl); } RuleTableTrie *dict; if (staticData.GetParsingAlgorithm() == ParseScope3) { dict = new RuleTableUTrie(GetNumScoreComponents(), this); } else { dict = new PhraseDictionarySCFG(GetNumScoreComponents(), this); } bool ret = dict->Load(GetInput() , GetOutput() , m_filePath , weightT , m_tableLimit , staticData.GetLMList() , staticData.GetWordPenaltyProducer()); CHECK(ret); return dict; } else if (m_implementation == ALSuffixArray) { // memory phrase table VERBOSE(2,"using Hiero format phrase tables" << std::endl); if (!FileExists(m_filePath) && FileExists(m_filePath + ".gz")) { m_filePath += ".gz"; VERBOSE(2,"Using gzipped file" << std::endl); } PhraseDictionaryALSuffixArray* pdm = new PhraseDictionaryALSuffixArray(GetNumScoreComponents(),this); bool ret = pdm->Load(GetInput() , GetOutput() , m_filePath , weightT , m_tableLimit , staticData.GetLMList() , staticData.GetWordPenaltyProducer()); CHECK(ret); return pdm; } else if (m_implementation == OnDisk) { PhraseDictionaryOnDisk* pdta = new PhraseDictionaryOnDisk(GetNumScoreComponents(), this); bool ret = pdta->Load(GetInput() , GetOutput() , m_filePath , weightT , m_tableLimit , staticData.GetLMList() , staticData.GetWordPenaltyProducer()); CHECK(ret); return pdta; } else if (m_implementation == SuffixArray) { PhraseDictionaryDynSuffixArray *pd = new PhraseDictionaryDynSuffixArray(GetNumScoreComponents(), this); if(!(pd->Load( GetInput() ,GetOutput() ,m_filePath ,m_targetFile ,m_alignmentsFile ,weightT, m_tableLimit ,staticData.GetLMList() ,staticData.GetWeightWordPenalty()))) { std::cerr << "FAILED TO LOAD\n" << endl; delete pd; pd = NULL; } std::cerr << "Suffix array phrase table loaded" << std::endl; return pd; } else if (m_implementation == FuzzyMatch) { PhraseDictionaryFuzzyMatch *dict = new PhraseDictionaryFuzzyMatch(GetNumScoreComponents(), this); bool ret = dict->Load(GetInput() , GetOutput() , m_filePath , weightT , m_tableLimit , staticData.GetLMList() , staticData.GetWordPenaltyProducer()); CHECK(ret); return dict; } else if (m_implementation == Compact) { VERBOSE(2,"Using compact phrase table" << std::endl); PhraseDictionaryCompact* pd = new PhraseDictionaryCompact(GetNumScoreComponents(), m_implementation, this); bool ret = pd->Load(GetInput(), GetOutput() , m_filePath , weightT , m_tableLimit , staticData.GetLMList() , staticData.GetWeightWordPenalty()); CHECK(ret); return pd; } else { std::cerr << "Unknown phrase table type " << m_implementation << endl; CHECK(false); } } void PhraseDictionaryFeature::InitDictionary(const TranslationSystem* system) { //Thread-safe phrase dictionaries get loaded now if (m_useThreadSafePhraseDictionary && !m_threadSafePhraseDictionary.get()) { IFVERBOSE(1) PrintUserTime("Start loading phrase table from " + m_filePath); m_threadSafePhraseDictionary.reset(LoadPhraseTable(system)); //IFVERBOSE(1) //PrintUserTime("Finished loading phrase tables"); } //Other types will be lazy loaded } */ }