2008-06-11 14:52:57 +04:00
|
|
|
// $Id$
|
|
|
|
// vim:tabstop=2
|
|
|
|
|
|
|
|
/***********************************************************************
|
|
|
|
Moses - factored phrase-based language decoder
|
|
|
|
Copyright (C) 2006 University of Edinburgh
|
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or
|
|
|
|
modify it under the terms of the GNU Lesser General Public
|
|
|
|
License as published by the Free Software Foundation; either
|
|
|
|
version 2.1 of the License, or (at your option) any later version.
|
|
|
|
|
|
|
|
This library is distributed in the hope that it will be useful,
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
Lesser General Public License for more details.
|
|
|
|
|
|
|
|
You should have received a copy of the GNU Lesser General Public
|
|
|
|
License along with this library; if not, write to the Free Software
|
|
|
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
***********************************************************************/
|
|
|
|
|
|
|
|
#include <string>
|
2011-11-18 16:07:41 +04:00
|
|
|
#include "util/check.hh"
|
2012-11-27 19:08:31 +04:00
|
|
|
#include "moses/TranslationModel/PhraseDictionaryMemory.h"
|
2008-06-11 14:52:57 +04:00
|
|
|
#include "DecodeStepTranslation.h"
|
|
|
|
#include "DecodeStepGeneration.h"
|
|
|
|
#include "GenerationDictionary.h"
|
|
|
|
#include "DummyScoreProducers.h"
|
|
|
|
#include "StaticData.h"
|
|
|
|
#include "Util.h"
|
|
|
|
#include "FactorCollection.h"
|
|
|
|
#include "Timer.h"
|
2011-10-13 18:27:01 +04:00
|
|
|
#include "LM/Factory.h"
|
2008-06-11 14:52:57 +04:00
|
|
|
#include "LexicalReordering.h"
|
2009-05-26 23:30:35 +04:00
|
|
|
#include "GlobalLexicalModel.h"
|
2012-01-31 14:31:39 +04:00
|
|
|
#include "GlobalLexicalModelUnlimited.h"
|
2008-06-11 14:52:57 +04:00
|
|
|
#include "SentenceStats.h"
|
2011-05-11 02:02:25 +04:00
|
|
|
#include "PhraseBoundaryFeature.h"
|
2012-11-27 19:08:31 +04:00
|
|
|
#include "moses/TranslationModel/PhraseDictionary.h"
|
2011-09-20 19:32:26 +04:00
|
|
|
#include "SparsePhraseDictionaryFeature.h"
|
2011-03-22 17:33:16 +03:00
|
|
|
#include "PhrasePairFeature.h"
|
2011-08-06 18:10:43 +04:00
|
|
|
#include "PhraseLengthFeature.h"
|
2011-08-13 04:25:23 +04:00
|
|
|
#include "TargetWordInsertionFeature.h"
|
2011-08-13 05:39:35 +04:00
|
|
|
#include "SourceWordDeletionFeature.h"
|
2011-08-13 06:40:54 +04:00
|
|
|
#include "WordTranslationFeature.h"
|
2008-06-11 14:52:57 +04:00
|
|
|
#include "UserMessage.h"
|
|
|
|
#include "TranslationOption.h"
|
2010-09-17 17:36:03 +04:00
|
|
|
#include "TargetBigramFeature.h"
|
2011-11-04 20:40:12 +04:00
|
|
|
#include "TargetNgramFeature.h"
|
2008-06-11 14:52:57 +04:00
|
|
|
#include "DecodeGraph.h"
|
2008-06-19 03:14:09 +04:00
|
|
|
#include "InputFileStream.h"
|
2010-09-16 19:45:56 +04:00
|
|
|
#include "BleuScoreFeature.h"
|
2010-09-17 18:25:08 +04:00
|
|
|
#include "ScoreComponentCollection.h"
|
2008-06-11 14:52:57 +04:00
|
|
|
|
2011-05-13 23:28:23 +04:00
|
|
|
#ifdef HAVE_SYNLM
|
|
|
|
#include "SyntacticLanguageModel.h"
|
|
|
|
#endif
|
|
|
|
|
2011-09-23 02:29:56 +04:00
|
|
|
#ifdef WITH_THREADS
|
|
|
|
#include <boost/thread.hpp>
|
|
|
|
#endif
|
|
|
|
|
2008-06-11 14:52:57 +04:00
|
|
|
using namespace std;
|
|
|
|
|
2008-10-09 03:51:26 +04:00
|
|
|
namespace Moses
|
|
|
|
{
|
2011-02-24 16:14:42 +03:00
|
|
|
// Return the largest value among x and all elements of y.
// Used to work out the highest factor index referenced by a decode step.
static size_t CalcMax(size_t x, const vector<size_t>& y)
{
  size_t best = x;
  for (size_t idx = 0; idx < y.size(); ++idx) {
    if (y[idx] > best) {
      best = y[idx];
    }
  }
  return best;
}
|
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
// Return the largest value among x and all elements of y and z.
// Overload used when both source and target factor lists must be scanned.
static size_t CalcMax(size_t x, const vector<size_t>& y, const vector<size_t>& z)
{
  size_t best = x;
  for (size_t idx = 0; idx < y.size(); ++idx) {
    if (y[idx] > best) {
      best = y[idx];
    }
  }
  for (size_t idx = 0; idx < z.size(); ++idx) {
    if (z[idx] > best) {
      best = z[idx];
    }
  }
  return best;
}
|
|
|
|
|
2013-01-01 21:27:26 +04:00
|
|
|
// Return the per-feature-name occurrence index for featureName, updating the
// bookkeeping map: the first sighting of a name yields 0, each subsequent
// sighting increments and returns the stored counter (1, 2, ...).
int GetFeatureIndex(std::map<string, int> &map, const string &featureName)
{
  std::map<string, int>::iterator found = map.find(featureName);
  if (found != map.end()) {
    // seen before: bump the stored counter and report the new value
    return ++(found->second);
  }
  // first occurrence: record and report index 0
  map[featureName] = 0;
  return 0;
}
|
|
|
|
|
2008-06-11 14:52:57 +04:00
|
|
|
// The global singleton instance; configuration is loaded into it via
// LoadDataStatic()/LoadData() and read by the rest of the decoder.
StaticData StaticData::s_instance;
|
|
|
|
|
|
|
|
// Constructor: sets conservative defaults for decoder-wide state.
// Real configuration happens later in LoadData(); only values that must be
// sane before any config file is read are initialized here.
StaticData::StaticData()
  :m_fLMsLoaded(false)
  ,m_sourceStartPosMattersForRecombination(false)
  ,m_inputType(SentenceInput)
  ,m_detailedTranslationReportingFilePath()
  ,m_onlyDistinctNBest(false)
  ,m_factorDelimiter("|") // default delimiter between factors
  ,m_lmEnableOOVFeature(false)
  ,m_isAlwaysCreateDirectTranslationOption(false)
  ,m_needAlignmentInfo(false)
{
  // highest factor index seen so far on each side; grown as models are loaded
  m_maxFactorIdx[0] = 0;  // source side
  m_maxFactorIdx[1] = 0;  // target side

  // default XML markup delimiters; LoadData() may override via -xml-brackets
  m_xmlBrackets.first="<";
  m_xmlBrackets.second=">";

  // memory pools
  Phrase::InitializeMemPool();
}
|
2011-02-24 16:14:42 +03:00
|
|
|
|
2012-07-31 00:07:19 +04:00
|
|
|
// Static entry point: record the executable path on the singleton, then load
// the full configuration from the given parameters. Returns false on failure.
bool StaticData::LoadDataStatic(Parameter *parameter, const std::string &execPath) {
  StaticData &instance = s_instance;
  instance.SetExecPath(execPath);
  return instance.LoadData(parameter);
}
|
|
|
|
|
2008-06-11 14:52:57 +04:00
|
|
|
// Parse all command-line/config parameters into decoder-wide settings, then
// load every model (phrase tables, LMs, reordering, sparse features, ...).
// Returns false (after emitting a UserMessage) on any configuration error;
// a few legacy checks still exit(1) directly.
bool StaticData::LoadData(Parameter *parameter)
{
  ResetUserTime();
  m_parameter = parameter;

  // verbose level
  m_verboseLevel = 1;
  if (m_parameter->GetParam("verbose").size() == 1) {
    m_verboseLevel = Scan<size_t>( m_parameter->GetParam("verbose")[0]);
  }

  m_parsingAlgorithm = (m_parameter->GetParam("parsing-algorithm").size() > 0) ?
                       (ParsingAlgorithm) Scan<size_t>(m_parameter->GetParam("parsing-algorithm")[0]) : ParseCYKPlus;

  // to cube or not to cube
  m_searchAlgorithm = (m_parameter->GetParam("search-algorithm").size() > 0) ?
                      (SearchAlgorithm) Scan<size_t>(m_parameter->GetParam("search-algorithm")[0]) : Normal;

  if (IsChart())
    LoadChartDecodingParameters();
  else
    LoadPhraseBasedParameters();

  // input type has to be specified BEFORE loading the phrase tables!
  if(m_parameter->GetParam("inputtype").size())
    m_inputType= (InputTypeEnum) Scan<int>(m_parameter->GetParam("inputtype")[0]);
  std::string s_it = "text input";
  if (m_inputType == 1) {
    s_it = "confusion net";
  }
  if (m_inputType == 2) {
    s_it = "word lattice";
  }
  VERBOSE(2,"input type is: "<<s_it<<"\n");

  if(m_parameter->GetParam("recover-input-path").size()) {
    m_recoverPath = Scan<bool>(m_parameter->GetParam("recover-input-path")[0]);
    if (m_recoverPath && m_inputType == SentenceInput) {
      TRACE_ERR("--recover-input-path should only be used with confusion net or word lattice input!\n");
      m_recoverPath = false;
    }
  }

  if(m_parameter->GetParam("sort-word-alignment").size()) {
    m_wordAlignmentSort = (WordAlignmentSort) Scan<size_t>(m_parameter->GetParam("sort-word-alignment")[0]);
  }

  // factor delimiter
  if (m_parameter->GetParam("factor-delimiter").size() > 0) {
    m_factorDelimiter = m_parameter->GetParam("factor-delimiter")[0];
  }

  SetBooleanParameter( &m_continuePartialTranslation, "continue-partial-translation", false );
  SetBooleanParameter( &m_outputHypoScore, "output-hypo-score", false );

  //word-to-word alignment
  SetBooleanParameter( &m_PrintAlignmentInfoNbest, "print-alignment-info-in-n-best", false );
  if (m_PrintAlignmentInfoNbest) {
    m_needAlignmentInfo = true;
  }

  if (m_parameter->GetParam("alignment-output-file").size() > 0) {
    m_alignmentOutputFile = Scan<std::string>(m_parameter->GetParam("alignment-output-file")[0]);
    m_needAlignmentInfo = true;
  }

  // n-best: expects "-n-best-list FILE SIZE [distinct]"
  if (m_parameter->GetParam("n-best-list").size() >= 2) {
    m_nBestFilePath = m_parameter->GetParam("n-best-list")[0];
    m_nBestSize = Scan<size_t>( m_parameter->GetParam("n-best-list")[1] );
    m_onlyDistinctNBest=(m_parameter->GetParam("n-best-list").size()>2 && m_parameter->GetParam("n-best-list")[2]=="distinct");
  } else if (m_parameter->GetParam("n-best-list").size() == 1) {
    UserMessage::Add(string("wrong format for switch -n-best-list file size"));
    return false;
  } else {
    m_nBestSize = 0;
  }
  if (m_parameter->GetParam("n-best-factor").size() > 0) {
    m_nBestFactor = Scan<size_t>( m_parameter->GetParam("n-best-factor")[0]);
  } else {
    m_nBestFactor = 20;
  }

  // explicit setting of distinct nbest
  SetBooleanParameter( &m_onlyDistinctNBest, "distinct-nbest", false);

  //lattice samples
  if (m_parameter->GetParam("lattice-samples").size() ==2 ) {
    m_latticeSamplesFilePath = m_parameter->GetParam("lattice-samples")[0];
    m_latticeSamplesSize = Scan<size_t>(m_parameter->GetParam("lattice-samples")[1]);
  } else if (m_parameter->GetParam("lattice-samples").size() != 0 ) {
    UserMessage::Add(string("wrong format for switch -lattice-samples file size"));
    return false;
  } else {
    m_latticeSamplesSize = 0;
  }

  // word graph
  if (m_parameter->GetParam("output-word-graph").size() == 2)
    m_outputWordGraph = true;
  else
    m_outputWordGraph = false;

  // search graph
  if (m_parameter->GetParam("output-search-graph").size() > 0) {
    if (m_parameter->GetParam("output-search-graph").size() != 1) {
      UserMessage::Add(string("ERROR: wrong format for switch -output-search-graph file"));
      return false;
    }
    m_outputSearchGraph = true;
  }
  // ... in extended format
  else if (m_parameter->GetParam("output-search-graph-extended").size() > 0) {
    if (m_parameter->GetParam("output-search-graph-extended").size() != 1) {
      UserMessage::Add(string("ERROR: wrong format for switch -output-search-graph-extended file"));
      return false;
    }
    m_outputSearchGraph = true;
    m_outputSearchGraphExtended = true;
  } else
    m_outputSearchGraph = false;
#ifdef HAVE_PROTOBUF
  if (m_parameter->GetParam("output-search-graph-pb").size() > 0) {
    if (m_parameter->GetParam("output-search-graph-pb").size() != 1) {
      UserMessage::Add(string("ERROR: wrong format for switch -output-search-graph-pb path"));
      return false;
    }
    m_outputSearchGraphPB = true;
  } else
    m_outputSearchGraphPB = false;
#endif
  SetBooleanParameter( &m_unprunedSearchGraph, "unpruned-search-graph", false );
  SetBooleanParameter( &m_includeLHSInSearchGraph, "include-lhs-in-search-graph", false );

  if (m_parameter->isParamSpecified("output-unknowns")) {

    if (m_parameter->GetParam("output-unknowns").size() == 1) {
      m_outputUnknownsFile =Scan<string>(m_parameter->GetParam("output-unknowns")[0]);
    } else {
      UserMessage::Add(string("need to specify exactly one file name for unknowns"));
      return false;
    }
  }

  // include feature names in the n-best list
  SetBooleanParameter( &m_labeledNBestList, "labeled-n-best-list", true );

  // include word alignment in the n-best list
  SetBooleanParameter( &m_nBestIncludesSegmentation, "include-segmentation-in-n-best", false );

  // printing source phrase spans
  SetBooleanParameter( &m_reportSegmentation, "report-segmentation", false );

  // print all factors of output translations
  SetBooleanParameter( &m_reportAllFactors, "report-all-factors", false );

  // print all factors of output translations
  SetBooleanParameter( &m_reportAllFactorsNBest, "report-all-factors-in-n-best", false );

  // caching of translation options (only safe for plain sentence input)
  if (m_inputType == SentenceInput) {
    SetBooleanParameter( &m_useTransOptCache, "use-persistent-cache", true );
    m_transOptCacheMaxSize = (m_parameter->GetParam("persistent-cache-size").size() > 0)
                             ? Scan<size_t>(m_parameter->GetParam("persistent-cache-size")[0]) : DEFAULT_MAX_TRANS_OPT_CACHE_SIZE;
  } else {
    m_useTransOptCache = false;
  }

  //input factors
  const vector<string> &inputFactorVector = m_parameter->GetParam("input-factors");
  for(size_t i=0; i<inputFactorVector.size(); i++) {
    m_inputFactorOrder.push_back(Scan<FactorType>(inputFactorVector[i]));
  }
  if(m_inputFactorOrder.empty()) {
    UserMessage::Add(string("no input factor specified in config file"));
    return false;
  }

  //output factors
  const vector<string> &outputFactorVector = m_parameter->GetParam("output-factors");
  for(size_t i=0; i<outputFactorVector.size(); i++) {
    m_outputFactorOrder.push_back(Scan<FactorType>(outputFactorVector[i]));
  }
  if(m_outputFactorOrder.empty()) {
    // default. output factor 0
    m_outputFactorOrder.push_back(0);
  }

  //source word deletion
  SetBooleanParameter( &m_wordDeletionEnabled, "phrase-drop-allowed", false );

  //Disable discarding
  SetBooleanParameter(&m_disableDiscarding, "disable-discarding", false);

  //Print All Derivations
  SetBooleanParameter( &m_printAllDerivations , "print-all-derivations", false );

  // additional output
  if (m_parameter->isParamSpecified("translation-details")) {
    const vector<string> &args = m_parameter->GetParam("translation-details");
    if (args.size() == 1) {
      m_detailedTranslationReportingFilePath = args[0];
    } else {
      UserMessage::Add(string("the translation-details option requires exactly one filename argument"));
      return false;
    }
  }

  // word penalties
  CHECK(m_parameter->GetWeights("WordPenalty", 0).size() == 1);
  float weightWordPenalty = m_parameter->GetWeights("WordPenalty", 0)[0];
  m_wpProducer = new WordPenaltyProducer();

  SetWeight(m_wpProducer, weightWordPenalty);

  // unknown-word penalty defaults to 1.0 when no weight is given
  const vector<float> &weightsUnknownWord = m_parameter->GetWeights("UnknownWordPenalty", 0);
  float weightUnknownWord = weightsUnknownWord.size() ? weightsUnknownWord[0] : 1.0;

  m_unknownWordPenaltyProducer = new UnknownWordPenaltyProducer();

  SetWeight(m_unknownWordPenaltyProducer, weightUnknownWord);

  // reordering constraints
  m_maxDistortion = (m_parameter->GetParam("distortion-limit").size() > 0) ?
                    Scan<int>(m_parameter->GetParam("distortion-limit")[0])
                    : -1;
  SetBooleanParameter( &m_reorderingConstraint, "monotone-at-punctuation", false );

  // settings for pruning
  m_maxHypoStackSize = (m_parameter->GetParam("stack").size() > 0)
                       ? Scan<size_t>(m_parameter->GetParam("stack")[0]) : DEFAULT_MAX_HYPOSTACK_SIZE;

  m_minHypoStackDiversity = 0;
  if (m_parameter->GetParam("stack-diversity").size() > 0) {
    if (m_maxDistortion > 15) {
      UserMessage::Add("stack diversity > 0 is not allowed for distortion limits larger than 15");
      return false;
    }
    if (m_inputType == WordLatticeInput) {
      UserMessage::Add("stack diversity > 0 is not allowed for lattice input");
      return false;
    }
    m_minHypoStackDiversity = Scan<size_t>(m_parameter->GetParam("stack-diversity")[0]);
  }

  m_beamWidth = (m_parameter->GetParam("beam-threshold").size() > 0) ?
                TransformScore(Scan<float>(m_parameter->GetParam("beam-threshold")[0]))
                : TransformScore(DEFAULT_BEAM_WIDTH);
  m_earlyDiscardingThreshold = (m_parameter->GetParam("early-discarding-threshold").size() > 0) ?
                               TransformScore(Scan<float>(m_parameter->GetParam("early-discarding-threshold")[0]))
                               : TransformScore(DEFAULT_EARLY_DISCARDING_THRESHOLD);
  m_translationOptionThreshold = (m_parameter->GetParam("translation-option-threshold").size() > 0) ?
                                 TransformScore(Scan<float>(m_parameter->GetParam("translation-option-threshold")[0]))
                                 : TransformScore(DEFAULT_TRANSLATION_OPTION_THRESHOLD);

  m_maxNoTransOptPerCoverage = (m_parameter->GetParam("max-trans-opt-per-coverage").size() > 0)
                               ? Scan<size_t>(m_parameter->GetParam("max-trans-opt-per-coverage")[0]) : DEFAULT_MAX_TRANS_OPT_SIZE;

  m_maxNoPartTransOpt = (m_parameter->GetParam("max-partial-trans-opt").size() > 0)
                        ? Scan<size_t>(m_parameter->GetParam("max-partial-trans-opt")[0]) : DEFAULT_MAX_PART_TRANS_OPT_SIZE;

  m_maxPhraseLength = (m_parameter->GetParam("max-phrase-length").size() > 0)
                      ? Scan<size_t>(m_parameter->GetParam("max-phrase-length")[0]) : DEFAULT_MAX_PHRASE_LENGTH;

  m_cubePruningPopLimit = (m_parameter->GetParam("cube-pruning-pop-limit").size() > 0)
                          ? Scan<size_t>(m_parameter->GetParam("cube-pruning-pop-limit")[0]) : DEFAULT_CUBE_PRUNING_POP_LIMIT;

  m_cubePruningDiversity = (m_parameter->GetParam("cube-pruning-diversity").size() > 0)
                           ? Scan<size_t>(m_parameter->GetParam("cube-pruning-diversity")[0]) : DEFAULT_CUBE_PRUNING_DIVERSITY;

  SetBooleanParameter(&m_cubePruningLazyScoring, "cube-pruning-lazy-scoring", false);

  // early distortion cost
  SetBooleanParameter( &m_useEarlyDistortionCost, "early-distortion-cost", false );

  // unknown word processing
  SetBooleanParameter( &m_dropUnknown, "drop-unknown", false );

  SetBooleanParameter( &m_lmEnableOOVFeature, "lmodel-oov-feature", false);

  // minimum Bayes risk decoding
  SetBooleanParameter( &m_mbr, "minimum-bayes-risk", false );
  m_mbrSize = (m_parameter->GetParam("mbr-size").size() > 0) ?
              Scan<size_t>(m_parameter->GetParam("mbr-size")[0]) : 200;
  m_mbrScale = (m_parameter->GetParam("mbr-scale").size() > 0) ?
               Scan<float>(m_parameter->GetParam("mbr-scale")[0]) : 1.0f;

  //lattice mbr
  SetBooleanParameter( &m_useLatticeMBR, "lminimum-bayes-risk", false );
  if (m_useLatticeMBR && m_mbr) {
    // fixed typo in user-facing message ("Errror" -> "Error")
    cerr << "Error: Cannot use both n-best mbr and lattice mbr together" << endl;
    exit(1);
  }

  //mira training
  SetBooleanParameter( &m_mira, "mira", false );

  // lattice MBR implies MBR
  if (m_useLatticeMBR) m_mbr = true;

  m_lmbrPruning = (m_parameter->GetParam("lmbr-pruning-factor").size() > 0) ?
                  Scan<size_t>(m_parameter->GetParam("lmbr-pruning-factor")[0]) : 30;
  m_lmbrThetas = Scan<float>(m_parameter->GetParam("lmbr-thetas"));
  SetBooleanParameter( &m_useLatticeHypSetForLatticeMBR, "lattice-hypo-set", false );
  m_lmbrPrecision = (m_parameter->GetParam("lmbr-p").size() > 0) ?
                    Scan<float>(m_parameter->GetParam("lmbr-p")[0]) : 0.8f;
  m_lmbrPRatio = (m_parameter->GetParam("lmbr-r").size() > 0) ?
                 Scan<float>(m_parameter->GetParam("lmbr-r")[0]) : 0.6f;
  m_lmbrMapWeight = (m_parameter->GetParam("lmbr-map-weight").size() >0) ?
                    Scan<float>(m_parameter->GetParam("lmbr-map-weight")[0]) : 0.0f;

  //consensus decoding
  SetBooleanParameter( &m_useConsensusDecoding, "consensus-decoding", false );
  if (m_useConsensusDecoding && m_mbr) {
    cerr<< "Error: Cannot use consensus decoding together with mbr" << endl;
    exit(1);
  }
  if (m_useConsensusDecoding) m_mbr=true;

  // Compact phrase table and reordering model
  SetBooleanParameter( &m_minphrMemory, "minphr-memory", false );
  SetBooleanParameter( &m_minlexrMemory, "minlexr-memory", false );

  // (size_t)-1 is the "no timeout" sentinel
  m_timeout_threshold = (m_parameter->GetParam("time-out").size() > 0) ?
                        Scan<size_t>(m_parameter->GetParam("time-out")[0]) : -1;
  m_timeout = (GetTimeoutThreshold() == (size_t)-1) ? false : true;

  m_lmcache_cleanup_threshold = (m_parameter->GetParam("clean-lm-cache").size() > 0) ?
                                Scan<size_t>(m_parameter->GetParam("clean-lm-cache")[0]) : 1;

  // thread count: "all" means one thread per hardware core
  m_threadCount = 1;
  const std::vector<std::string> &threadInfo = m_parameter->GetParam("threads");
  if (!threadInfo.empty()) {
    if (threadInfo[0] == "all") {
#ifdef WITH_THREADS
      m_threadCount = boost::thread::hardware_concurrency();
      if (!m_threadCount) {
        UserMessage::Add("-threads all specified but Boost doesn't know how many cores there are");
        return false;
      }
#else
      UserMessage::Add("-threads all specified but moses not built with thread support");
      return false;
#endif
    } else {
      m_threadCount = Scan<int>(threadInfo[0]);
      if (m_threadCount < 1) {
        UserMessage::Add("Specify at least one thread.");
        return false;
      }
#ifndef WITH_THREADS
      if (m_threadCount > 1) {
        UserMessage::Add(std::string("Error: Thread count of ") + threadInfo[0] + " but moses not built with thread support");
        return false;
      }
#endif
    }
  }

  m_startTranslationId = (m_parameter->GetParam("start-translation-id").size() > 0) ?
                         Scan<long>(m_parameter->GetParam("start-translation-id")[0]) : 0;

  // Read in constraint decoding file, if provided
  if(m_parameter->GetParam("constraint").size()) {
    if (m_parameter->GetParam("search-algorithm").size() > 0
        && Scan<size_t>(m_parameter->GetParam("search-algorithm")[0]) != 0) {
      cerr << "Can use -constraint only with stack-based search (-search-algorithm 0)" << endl;
      exit(1);
    }
    m_constraintFileName = m_parameter->GetParam("constraint")[0];

    InputFileStream constraintFile(m_constraintFileName);

    std::string line;

    long sentenceID = GetStartTranslationId() - 1;
    while (getline(constraintFile, line)) {
      vector<string> vecStr = Tokenize(line, "\t");

      // one field: constraint for the next sentence in sequence;
      // two fields: explicit sentence id + constraint
      if (vecStr.size() == 1) {
        sentenceID++;
        Phrase phrase(0);
        phrase.CreateFromString(GetOutputFactorOrder(), vecStr[0], GetFactorDelimiter());
        m_constraints.insert(make_pair(sentenceID,phrase));
      } else if (vecStr.size() == 2) {
        sentenceID = Scan<long>(vecStr[0]);
        Phrase phrase(0);
        phrase.CreateFromString(GetOutputFactorOrder(), vecStr[1], GetFactorDelimiter());
        m_constraints.insert(make_pair(sentenceID,phrase));
      } else {
        CHECK(false);
      }
    }
  }

  // use of xml in input
  if (m_parameter->GetParam("xml-input").size() == 0) m_xmlInputType = XmlPassThrough;
  else if (m_parameter->GetParam("xml-input")[0]=="exclusive") m_xmlInputType = XmlExclusive;
  else if (m_parameter->GetParam("xml-input")[0]=="inclusive") m_xmlInputType = XmlInclusive;
  else if (m_parameter->GetParam("xml-input")[0]=="ignore") m_xmlInputType = XmlIgnore;
  else if (m_parameter->GetParam("xml-input")[0]=="pass-through") m_xmlInputType = XmlPassThrough;
  else {
    UserMessage::Add("invalid xml-input value, must be pass-through, exclusive, inclusive, or ignore");
    return false;
  }

  // specify XML tags opening and closing brackets for XML option
  if (m_parameter->GetParam("xml-brackets").size() > 0) {
    std::vector<std::string> brackets = Tokenize(m_parameter->GetParam("xml-brackets")[0]);
    if(brackets.size()!=2) {
      cerr << "invalid xml-brackets value, must specify exactly 2 blank-delimited strings for XML tags opening and closing brackets" << endl;
      exit(1);
    }
    m_xmlBrackets.first= brackets[0];
    m_xmlBrackets.second=brackets[1];
    cerr << "XML tags opening and closing brackets for XML input are: " << m_xmlBrackets.first << " and " << m_xmlBrackets.second << endl;
  }

  // all features: each "feature" line starts with the feature name; repeated
  // names get increasing indices from GetFeatureIndex()
  map<string, int> featureIndexMap;

  const vector<string> &features = m_parameter->GetParam("feature");
  for (size_t i = 0; i < features.size(); ++i) {
    const string &line = features[i];
    vector<string> toks = Tokenize(line);

    const string &feature = toks[0];
    int featureIndex = GetFeatureIndex(featureIndexMap, feature);

    if (feature == "GlobalLexicalModel") {
      GlobalLexicalModel *model = new GlobalLexicalModel(line);
      const vector<float> &weights = m_parameter->GetWeights(feature, featureIndex);
      SetWeights(model, weights);
    }
    else if (feature == "glm") {
      GlobalLexicalModelUnlimited *model = NULL; //new GlobalLexicalModelUnlimited(line);
      const vector<float> &weights = m_parameter->GetWeights(feature, featureIndex);
      SetWeights(model, weights);
    }
    else if (feature == "swd") {
      // NOTE(review): model is constructed but weights are fetched and never
      // applied (SetWeights is commented out) -- confirm this is intentional
      // work-in-progress and not a dropped line.
      SourceWordDeletionFeature *model = new SourceWordDeletionFeature(line);
      const vector<float> &weights = m_parameter->GetWeights(feature, featureIndex);
      //SetWeights(model, weights);
    }

  }

#ifdef HAVE_SYNLM
  if (m_parameter->GetParam("slmodel-file").size() > 0) {
    if (!LoadSyntacticLanguageModel()) return false;
  }
#endif

  // load all models; each loader reports its own UserMessage on failure
  if (!LoadLexicalReorderingModel()) return false;
  if (!LoadLanguageModels()) return false;
  if (!LoadGenerationTables()) return false;
  if (!LoadPhraseTables()) return false;
  if (!LoadDecodeGraphs()) return false;
  if (!LoadReferences()) return false;
  if (!LoadDiscrimLMFeature()) return false;
  if (!LoadPhrasePairFeature()) return false;
  if (!LoadPhraseBoundaryFeature()) return false;
  if (!LoadPhraseLengthFeature()) return false;
  if (!LoadTargetWordInsertionFeature()) return false;
  if (!LoadWordTranslationFeature()) return false;

  // report individual sparse features in n-best list
  if (m_parameter->GetParam("report-sparse-features").size() > 0) {
    for(size_t i=0; i<m_parameter->GetParam("report-sparse-features").size(); i++) {
      const std::string &name = m_parameter->GetParam("report-sparse-features")[i];
      for (size_t j = 0; j < m_sparsePhraseDictionary.size(); ++j) {
        if (m_sparsePhraseDictionary[j] && name.compare(m_sparsePhraseDictionary[j]->GetScoreProducerDescription()) == 0) {
          m_sparsePhraseDictionary[j]->SetSparseFeatureReporting();
        }
      }
    } // for(size_t i=0; i<m_parameter->GetParam("report-sparse-features").
  }

  //Instigate dictionary loading
  ConfigDictionaries();

  // debug output of loaded dictionary pointers
  // (loop counters changed int -> size_t to match .size() and avoid
  // signed/unsigned comparison)
  for (size_t i = 0; i < m_phraseDictionary.size(); i++)
    cerr << m_phraseDictionary[i] << " ";
  cerr << endl;
  for (size_t i = 0; i < m_generationDictionary.size(); i++)
    cerr << m_generationDictionary[i] << " ";
  cerr << endl;

  //Add any other features here.

  //Load extra feature weights
  //NB: These are common to all translation systems (at the moment!)
  vector<string> extraWeightConfig = m_parameter->GetParam("weight-file");
  if (extraWeightConfig.size()) {
    if (extraWeightConfig.size() != 1) {
      UserMessage::Add("One argument should be supplied for weight-file");
      return false;
    }
    ScoreComponentCollection extraWeights;
    if (!extraWeights.Load(extraWeightConfig[0])) {
      UserMessage::Add("Unable to load weights from " + extraWeightConfig[0]);
      return false;
    }

    m_allWeights.PlusEquals(extraWeights);
  }

  return true;
}
|
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
void StaticData::SetBooleanParameter( bool *parameter, string parameterName, bool defaultValue )
|
2008-06-11 14:52:57 +04:00
|
|
|
{
|
|
|
|
// default value if nothing is specified
|
|
|
|
*parameter = defaultValue;
|
2011-02-24 16:14:42 +03:00
|
|
|
if (! m_parameter->isParamSpecified( parameterName ) ) {
|
2008-06-11 14:52:57 +04:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
// if parameter is just specified as, e.g. "-parameter" set it true
|
2011-02-24 16:14:42 +03:00
|
|
|
if (m_parameter->GetParam( parameterName ).size() == 0) {
|
2008-06-11 14:52:57 +04:00
|
|
|
*parameter = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
// if paramter is specified "-parameter true" or "-parameter false"
|
2011-02-24 16:14:42 +03:00
|
|
|
else if (m_parameter->GetParam( parameterName ).size() == 1) {
|
2008-06-11 14:52:57 +04:00
|
|
|
*parameter = Scan<bool>( m_parameter->GetParam( parameterName )[0]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-10-07 02:06:49 +04:00
|
|
|
// Assign a single feature weight to the given score producer.
// Resize() first so m_allWeights has a slot for any producer
// registered since the last assignment.
void StaticData::SetWeight(const ScoreProducer* sp, float weight)
{
  m_allWeights.Resize();
  m_allWeights.Assign(sp,weight);
}
|
|
|
|
|
|
|
|
// Assign a vector of feature weights to the given score producer
// (multi-valued features). Mirrors SetWeight(): Resize() makes room
// for newly registered producers before the assignment.
void StaticData::SetWeights(const ScoreProducer* sp, const std::vector<float>& weights)
{
  m_allWeights.Resize();
  m_allWeights.Assign(sp,weights);
}
|
|
|
|
|
2008-06-11 14:52:57 +04:00
|
|
|
// Destructor. Deliberately does NOT delete the registered feature
// functions — the loop below was disabled (see commented block);
// presumably ownership/teardown order was problematic at shutdown.
// TODO(review): confirm the features are reclaimed elsewhere or
// intentionally leaked at process exit.
StaticData::~StaticData()
{
  /*
  const std::vector<ScoreProducer*> &producers = FeatureFunction::GetFeatureFunctions();
  for(size_t i=0;i<producers.size();++i) {
    ScoreProducer *ff = producers[i];
    cerr << endl << "Destroying" << ff << endl;
    delete ff;
  }
  */

  // memory pools
  Phrase::FinalizeMemPool();
}
|
|
|
|
|
2011-05-13 23:28:23 +04:00
|
|
|
#ifdef HAVE_SYNLM
|
|
|
|
bool StaticData::LoadSyntacticLanguageModel() {
|
|
|
|
cerr << "Loading syntactic language models..." << std::endl;
|
|
|
|
|
|
|
|
const vector<float> weights = Scan<float>(m_parameter->GetParam("weight-slm"));
|
|
|
|
const vector<string> files = m_parameter->GetParam("slmodel-file");
|
|
|
|
|
|
|
|
const FactorType factorType = (m_parameter->GetParam("slmodel-factor").size() > 0) ?
|
|
|
|
TransformScore(Scan<int>(m_parameter->GetParam("slmodel-factor")[0]))
|
|
|
|
: 0;
|
|
|
|
|
|
|
|
const size_t beamWidth = (m_parameter->GetParam("slmodel-beam").size() > 0) ?
|
|
|
|
TransformScore(Scan<int>(m_parameter->GetParam("slmodel-beam")[0]))
|
|
|
|
: 500;
|
|
|
|
|
|
|
|
if (files.size() < 1) {
|
|
|
|
cerr << "No syntactic language model files specified!" << std::endl;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// check if feature is used
|
|
|
|
if (weights.size() >= 1) {
|
|
|
|
|
|
|
|
//cout.setf(ios::scientific,ios::floatfield);
|
|
|
|
//cerr.setf(ios::scientific,ios::floatfield);
|
|
|
|
|
|
|
|
// create the feature
|
|
|
|
m_syntacticLanguageModel = new SyntacticLanguageModel(files,weights,factorType,beamWidth);
|
|
|
|
|
|
|
|
/*
|
|
|
|
/////////////////////////////////////////
|
|
|
|
// BEGIN LANE's UNSTABLE EXPERIMENT :)
|
|
|
|
//
|
|
|
|
|
|
|
|
double ppl = m_syntacticLanguageModel->perplexity();
|
|
|
|
cerr << "Probability is " << ppl << endl;
|
|
|
|
|
|
|
|
|
|
|
|
//
|
|
|
|
// END LANE's UNSTABLE EXPERIMENT
|
|
|
|
/////////////////////////////////////////
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
|
|
if (m_syntacticLanguageModel==NULL) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2008-06-11 14:52:57 +04:00
|
|
|
// Parse each "distortion-file" entry and instantiate one
// LexicalReordering feature per entry. Entry format (4 fields):
//   <factor-map> <model-type> <num-weights> <file>
// Weights are taken from the "LexicalReordering" weight array, one
// slice per model. Returns false on any malformed spec or missing
// weights.
bool StaticData::LoadLexicalReorderingModel()
{
  VERBOSE(1, "Loading lexical distortion models...");
  const vector<string> fileStr = m_parameter->GetParam("distortion-file");

  VERBOSE(1, "have " << fileStr.size() << " models" << std::endl);

  //load all models
  for(size_t i = 0; i < fileStr.size(); ++i) {
    vector<string> spec = Tokenize<string>(fileStr[i], " ");
    const vector<float> &weights= m_parameter->GetWeights("LexicalReordering", i);

    if(spec.size() != 4) {
      UserMessage::Add("Invalid Lexical Reordering Model Specification: " + fileStr[i]);
      return false;
    }

    // spec[0] = factor map
    // spec[1] = name
    // spec[2] = num weights
    // spec[3] = fileName

    // decode factor map: "in-out" gives both sides, a single field is
    // treated as the output (target) side only.
    vector<FactorType> input, output;
    vector<string> inputfactors = Tokenize(spec[0],"-");
    if(inputfactors.size() == 2) {
      input = Tokenize<FactorType>(inputfactors[0],",");
      output = Tokenize<FactorType>(inputfactors[1],",");
    } else if(inputfactors.size() == 1) {
      //if there is only one side assume it is on e side... why?
      output = Tokenize<FactorType>(inputfactors[0],",");
    } else {
      //format error
      return false;
    }

    string modelType = spec[1];

    // decode num weights and fetch weights from array
    std::vector<float> mweights;
    size_t numWeights = atoi(spec[2].c_str());
    if(numWeights > weights.size()) {
      UserMessage::Add("Lexicalized distortion model: Not enough weights, add to [weight-d]");
      return false;
    }

    // take the first numWeights entries for this model
    for(size_t k = 0; k < numWeights; ++k) {
      mweights.push_back(weights[k]);
    }

    string filePath = spec[3];

    LexicalReordering *reorderModel = new LexicalReordering(input, output, LexicalReorderingConfiguration(modelType), filePath, mweights);

    m_reorderModels.push_back(reorderModel);
  }
  return true;
}
|
2011-02-24 16:14:42 +03:00
|
|
|
|
2008-06-11 14:52:57 +04:00
|
|
|
bool StaticData::LoadLanguageModels()
|
|
|
|
{
|
2011-02-24 16:14:42 +03:00
|
|
|
if (m_parameter->GetParam("lmodel-file").size() > 0) {
|
|
|
|
|
|
|
|
// dictionary upper-bounds fo all IRST LMs
|
|
|
|
vector<int> LMdub = Scan<int>(m_parameter->GetParam("lmodel-dub"));
|
|
|
|
if (m_parameter->GetParam("lmodel-dub").size() == 0) {
|
|
|
|
for(size_t i=0; i<m_parameter->GetParam("lmodel-file").size(); i++)
|
|
|
|
LMdub.push_back(0);
|
|
|
|
}
|
|
|
|
|
|
|
|
// initialize n-gram order for each factor. populated only by factored lm
|
|
|
|
const vector<string> &lmVector = m_parameter->GetParam("lmodel-file");
|
|
|
|
//prevent language models from being loaded twice
|
|
|
|
map<string,LanguageModel*> languageModelsLoaded;
|
|
|
|
|
|
|
|
for(size_t i=0; i<lmVector.size(); i++) {
|
2012-12-11 22:57:42 +04:00
|
|
|
// weights
|
|
|
|
const vector<float> &weights = m_parameter->GetWeights("LM", i);
|
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
LanguageModel* lm = NULL;
|
|
|
|
if (languageModelsLoaded.find(lmVector[i]) != languageModelsLoaded.end()) {
|
2011-10-28 18:54:23 +04:00
|
|
|
lm = languageModelsLoaded[lmVector[i]]->Duplicate();
|
2011-02-24 16:14:42 +03:00
|
|
|
} else {
|
|
|
|
vector<string> token = Tokenize(lmVector[i]);
|
|
|
|
if (token.size() != 4 && token.size() != 5 ) {
|
|
|
|
UserMessage::Add("Expected format 'LM-TYPE FACTOR-TYPE NGRAM-ORDER filePath [mapFilePath (only for IRSTLM)]'");
|
|
|
|
return false;
|
2010-08-10 17:12:00 +04:00
|
|
|
}
|
2011-02-24 16:14:42 +03:00
|
|
|
// type = implementation, SRI, IRST etc
|
|
|
|
LMImplementation lmImplementation = static_cast<LMImplementation>(Scan<int>(token[0]));
|
|
|
|
|
|
|
|
// factorType = 0 = Surface, 1 = POS, 2 = Stem, 3 = Morphology, etc
|
|
|
|
vector<FactorType> factorTypes = Tokenize<FactorType>(token[1], ",");
|
|
|
|
|
|
|
|
// nGramOrder = 2 = bigram, 3 = trigram, etc
|
|
|
|
size_t nGramOrder = Scan<int>(token[2]);
|
|
|
|
|
|
|
|
string &languageModelFile = token[3];
|
|
|
|
if (token.size() == 5) {
|
|
|
|
if (lmImplementation==IRST)
|
|
|
|
languageModelFile += " " + token[4];
|
|
|
|
else {
|
|
|
|
UserMessage::Add("Expected format 'LM-TYPE FACTOR-TYPE NGRAM-ORDER filePath [mapFilePath (only for IRSTLM)]'");
|
2010-08-10 17:12:00 +04:00
|
|
|
return false;
|
|
|
|
}
|
2011-02-24 16:14:42 +03:00
|
|
|
}
|
|
|
|
IFVERBOSE(1)
|
|
|
|
PrintUserTime(string("Start loading LanguageModel ") + languageModelFile);
|
|
|
|
|
|
|
|
lm = LanguageModelFactory::CreateLanguageModel(
|
|
|
|
lmImplementation
|
|
|
|
, factorTypes
|
|
|
|
, nGramOrder
|
|
|
|
, languageModelFile
|
|
|
|
, LMdub[i]);
|
|
|
|
if (lm == NULL) {
|
|
|
|
UserMessage::Add("no LM created. We probably don't have it compiled");
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
languageModelsLoaded[lmVector[i]] = lm;
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
m_languageModel.Add(lm);
|
2011-10-13 20:50:16 +04:00
|
|
|
if (m_lmEnableOOVFeature) {
|
2012-12-11 22:57:42 +04:00
|
|
|
CHECK(weights.size() == 2);
|
2011-10-13 20:50:16 +04:00
|
|
|
SetWeights(lm,weights);
|
2012-12-11 22:57:42 +04:00
|
|
|
}
|
|
|
|
else {
|
|
|
|
CHECK(weights.size() == 1);
|
|
|
|
SetWeight(lm,weights[0]);
|
2011-10-13 20:50:16 +04:00
|
|
|
}
|
2011-02-24 16:14:42 +03:00
|
|
|
}
|
|
|
|
}
|
2008-06-11 14:52:57 +04:00
|
|
|
// flag indicating that language models were loaded,
|
|
|
|
// since phrase table loading requires their presence
|
|
|
|
m_fLMsLoaded = true;
|
2011-02-24 16:14:42 +03:00
|
|
|
IFVERBOSE(1)
|
|
|
|
PrintUserTime("Finished loading LanguageModels");
|
2008-06-11 14:52:57 +04:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Load generation tables listed in "generation-file".
// Entry format: <input-factors> <output-factors> <num-features> <file>
// A ".gz" variant of the file is used if the plain path is missing.
// Weights come from the "Generation" weight array, one slice per table.
bool StaticData::LoadGenerationTables()
{
  if (m_parameter->GetParam("generation-file").size() > 0) {
    const vector<string> &generationVector = m_parameter->GetParam("generation-file");

    IFVERBOSE(1) {
      TRACE_ERR( "weight-generation: " << endl);
    }

    for(size_t currDict = 0 ; currDict < generationVector.size(); currDict++) {
      vector<string> token = Tokenize(generationVector[currDict]);
      vector<FactorType> input = Tokenize<FactorType>(token[0], ",")
                                 ,output = Tokenize<FactorType>(token[1], ",");
      // track the highest target-side factor index used anywhere
      m_maxFactorIdx[1] = CalcMax(m_maxFactorIdx[1], input, output);
      string filePath;
      size_t numFeatures;

      const vector<float> &weight = m_parameter->GetWeights("Generation", currDict);

      numFeatures = Scan<size_t>(token[2]);
      filePath = token[3];

      // fall back to the gzipped table if the plain file is absent
      if (!FileExists(filePath) && FileExists(filePath + ".gz")) {
        filePath += ".gz";
      }

      VERBOSE(1, filePath << endl);

      m_generationDictionary.push_back(new GenerationDictionary(numFeatures, input,output));
      CHECK(m_generationDictionary.back() && "could not create GenerationDictionary");
      if (!m_generationDictionary.back()->Load(filePath, Output)) {
        // NOTE(review): the failed dictionary is deleted but left in
        // m_generationDictionary (dangling back()) — callers abort on
        // false, so it is presumably never read; confirm.
        delete m_generationDictionary.back();
        return false;
      }
      SetWeights(m_generationDictionary.back(), weight);
    }
  }

  return true;
}
|
|
|
|
|
2010-08-10 17:12:00 +04:00
|
|
|
/* Doesn't load phrase tables any more. Just creates the features. */
|
2008-06-11 14:52:57 +04:00
|
|
|
bool StaticData::LoadPhraseTables()
|
|
|
|
{
|
2011-02-24 16:14:42 +03:00
|
|
|
VERBOSE(2,"Creating phrase table features" << endl);
|
|
|
|
|
|
|
|
// language models must be loaded prior to loading phrase tables
|
2011-11-18 16:07:41 +04:00
|
|
|
CHECK(m_fLMsLoaded);
|
2011-02-24 16:14:42 +03:00
|
|
|
// load phrase translation tables
|
|
|
|
if (m_parameter->GetParam("ttable-file").size() > 0) {
|
|
|
|
// weights
|
|
|
|
const vector<string> &translationVector = m_parameter->GetParam("ttable-file");
|
|
|
|
vector<size_t> maxTargetPhrase = Scan<size_t>(m_parameter->GetParam("ttable-limit"));
|
|
|
|
|
|
|
|
if(maxTargetPhrase.size() == 1 && translationVector.size() > 1) {
|
|
|
|
VERBOSE(1, "Using uniform ttable-limit of " << maxTargetPhrase[0] << " for all translation tables." << endl);
|
|
|
|
for(size_t i = 1; i < translationVector.size(); i++)
|
|
|
|
maxTargetPhrase.push_back(maxTargetPhrase[0]);
|
|
|
|
} else if(maxTargetPhrase.size() != 1 && maxTargetPhrase.size() < translationVector.size()) {
|
|
|
|
stringstream strme;
|
|
|
|
strme << "You specified " << translationVector.size() << " translation tables, but only " << maxTargetPhrase.size() << " ttable-limits.";
|
|
|
|
UserMessage::Add(strme.str());
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2012-12-11 22:57:42 +04:00
|
|
|
// MAIN LOOP
|
2011-02-24 16:14:42 +03:00
|
|
|
bool oldFileFormat = false;
|
|
|
|
for(size_t currDict = 0 ; currDict < translationVector.size(); currDict++) {
|
|
|
|
vector<string> token = Tokenize(translationVector[currDict]);
|
2012-12-11 22:57:42 +04:00
|
|
|
const vector<float> &weights = m_parameter->GetWeights("PhraseModel", currDict);
|
2011-02-24 16:14:42 +03:00
|
|
|
|
|
|
|
if(currDict == 0 && token.size() == 4) {
|
|
|
|
VERBOSE(1, "Warning: Phrase table specification in old 4-field format. Assuming binary phrase tables (type 1)!" << endl);
|
|
|
|
oldFileFormat = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
if((!oldFileFormat && token.size() < 5) || (oldFileFormat && token.size() != 4)) {
|
|
|
|
UserMessage::Add("invalid phrase table specification");
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
PhraseTableImplementation implementation = (PhraseTableImplementation) Scan<int>(token[0]);
|
|
|
|
if(oldFileFormat) {
|
|
|
|
token.push_back(token[3]);
|
|
|
|
token[3] = token[2];
|
|
|
|
token[2] = token[1];
|
|
|
|
token[1] = token[0];
|
|
|
|
token[0] = "1";
|
|
|
|
implementation = Binary;
|
|
|
|
} else
|
|
|
|
implementation = (PhraseTableImplementation) Scan<int>(token[0]);
|
|
|
|
|
2011-11-18 16:07:41 +04:00
|
|
|
CHECK(token.size() >= 5);
|
2011-02-24 16:14:42 +03:00
|
|
|
//characteristics of the phrase table
|
|
|
|
|
|
|
|
vector<FactorType> input = Tokenize<FactorType>(token[1], ",")
|
|
|
|
,output = Tokenize<FactorType>(token[2], ",");
|
|
|
|
m_maxFactorIdx[0] = CalcMax(m_maxFactorIdx[0], input);
|
|
|
|
m_maxFactorIdx[1] = CalcMax(m_maxFactorIdx[1], output);
|
2008-06-11 14:52:57 +04:00
|
|
|
m_maxNumFactors = std::max(m_maxFactorIdx[0], m_maxFactorIdx[1]) + 1;
|
2011-02-24 16:14:42 +03:00
|
|
|
size_t numScoreComponent = Scan<size_t>(token[3]);
|
|
|
|
string filePath= token[4];
|
|
|
|
|
2012-12-11 22:57:42 +04:00
|
|
|
CHECK(weights.size() >= numScoreComponent);
|
2011-02-24 16:14:42 +03:00
|
|
|
|
2012-12-06 00:21:33 +04:00
|
|
|
if(m_inputType == ConfusionNetworkInput || m_inputType == WordLatticeInput) {
|
|
|
|
if (currDict==0) { // only the 1st pt. THis is shit
|
|
|
|
// TODO. find what the assumptions made by confusion network about phrase table output which makes
|
|
|
|
// it only work with binary file. This is a hack
|
|
|
|
CHECK(implementation == Binary);
|
2011-02-24 16:14:42 +03:00
|
|
|
|
2012-12-06 00:21:33 +04:00
|
|
|
if (m_parameter->GetParam("input-scores").size()) {
|
|
|
|
m_numInputScores = Scan<size_t>(m_parameter->GetParam("input-scores")[0]);
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
m_numInputScores = 1;
|
|
|
|
}
|
|
|
|
numScoreComponent += m_numInputScores;
|
|
|
|
|
|
|
|
if (m_parameter->GetParam("input-scores").size() > 1) {
|
|
|
|
m_numRealWordsInInput = Scan<size_t>(m_parameter->GetParam("input-scores")[1]);
|
2011-10-30 09:51:08 +04:00
|
|
|
}
|
2012-12-06 00:21:33 +04:00
|
|
|
else {
|
|
|
|
m_numRealWordsInInput = 0;
|
|
|
|
}
|
|
|
|
numScoreComponent += m_numRealWordsInInput;
|
2011-02-24 16:14:42 +03:00
|
|
|
}
|
|
|
|
}
|
2012-12-06 00:21:33 +04:00
|
|
|
else { // not confusion network or lattice input
|
|
|
|
m_numInputScores = 0;
|
|
|
|
m_numRealWordsInInput = 0;
|
2011-02-24 16:14:42 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
string targetPath, alignmentsFile;
|
|
|
|
if (implementation == SuffixArray) {
|
|
|
|
targetPath = token[5];
|
|
|
|
alignmentsFile= token[6];
|
|
|
|
}
|
|
|
|
|
|
|
|
//This is needed for regression testing, but the phrase table
|
|
|
|
//might not really be loading here
|
|
|
|
IFVERBOSE(1)
|
|
|
|
PrintUserTime(string("Start loading PhraseTable ") + filePath);
|
|
|
|
VERBOSE(1,"filePath: " << filePath <<endl);
|
|
|
|
|
2011-09-20 19:32:26 +04:00
|
|
|
//optional create sparse phrase feature
|
|
|
|
SparsePhraseDictionaryFeature* spdf = NULL;
|
|
|
|
if (token.size() >= 6 && token[5] == "sparse") {
|
|
|
|
spdf = new SparsePhraseDictionaryFeature();
|
|
|
|
}
|
|
|
|
m_sparsePhraseDictionary.push_back(spdf);
|
|
|
|
|
|
|
|
|
2010-10-07 02:06:49 +04:00
|
|
|
PhraseDictionaryFeature* pdf = new PhraseDictionaryFeature(
|
2011-02-24 16:14:42 +03:00
|
|
|
implementation
|
2011-09-20 19:32:26 +04:00
|
|
|
, spdf
|
2011-02-24 16:14:42 +03:00
|
|
|
, numScoreComponent
|
2012-12-06 18:46:52 +04:00
|
|
|
, (currDict==0 ? m_numInputScores + m_numRealWordsInInput : 0)
|
2011-02-24 16:14:42 +03:00
|
|
|
, input
|
|
|
|
, output
|
|
|
|
, filePath
|
2012-12-11 22:57:42 +04:00
|
|
|
, weights
|
2012-06-01 04:49:42 +04:00
|
|
|
, currDict
|
2012-12-11 22:57:42 +04:00
|
|
|
, maxTargetPhrase[currDict]
|
2011-02-24 16:14:42 +03:00
|
|
|
, targetPath, alignmentsFile);
|
|
|
|
|
|
|
|
m_phraseDictionary.push_back(pdf);
|
|
|
|
|
2012-12-11 22:57:42 +04:00
|
|
|
SetWeights(m_phraseDictionary.back(),weights);
|
2011-02-24 16:14:42 +03:00
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
IFVERBOSE(1)
|
|
|
|
PrintUserTime("Finished loading phrase tables");
|
|
|
|
return true;
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
|
|
|
|
2010-04-08 21:16:10 +04:00
|
|
|
void StaticData::LoadNonTerminals()
|
|
|
|
{
|
2011-02-24 16:14:42 +03:00
|
|
|
string defaultNonTerminals;
|
|
|
|
|
|
|
|
if (m_parameter->GetParam("non-terminals").size() == 0) {
|
|
|
|
defaultNonTerminals = "X";
|
|
|
|
} else {
|
|
|
|
vector<std::string> tokens = Tokenize(m_parameter->GetParam("non-terminals")[0]);
|
|
|
|
defaultNonTerminals = tokens[0];
|
|
|
|
}
|
|
|
|
|
|
|
|
FactorCollection &factorCollection = FactorCollection::Instance();
|
|
|
|
|
|
|
|
m_inputDefaultNonTerminal.SetIsNonTerminal(true);
|
|
|
|
const Factor *sourceFactor = factorCollection.AddFactor(Input, 0, defaultNonTerminals);
|
|
|
|
m_inputDefaultNonTerminal.SetFactor(0, sourceFactor);
|
|
|
|
|
|
|
|
m_outputDefaultNonTerminal.SetIsNonTerminal(true);
|
|
|
|
const Factor *targetFactor = factorCollection.AddFactor(Output, 0, defaultNonTerminals);
|
|
|
|
m_outputDefaultNonTerminal.SetFactor(0, targetFactor);
|
|
|
|
|
|
|
|
// for unknwon words
|
|
|
|
if (m_parameter->GetParam("unknown-lhs").size() == 0) {
|
|
|
|
UnknownLHSEntry entry(defaultNonTerminals, 0.0f);
|
|
|
|
m_unknownLHS.push_back(entry);
|
|
|
|
} else {
|
|
|
|
const string &filePath = m_parameter->GetParam("unknown-lhs")[0];
|
|
|
|
|
|
|
|
InputFileStream inStream(filePath);
|
|
|
|
string line;
|
|
|
|
while(getline(inStream, line)) {
|
|
|
|
vector<string> tokens = Tokenize(line);
|
2011-11-18 16:07:41 +04:00
|
|
|
CHECK(tokens.size() == 2);
|
2011-02-24 16:14:42 +03:00
|
|
|
UnknownLHSEntry entry(tokens[0], Scan<float>(tokens[1]));
|
|
|
|
m_unknownLHS.push_back(entry);
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
2010-04-08 21:16:10 +04:00
|
|
|
}
|
2011-02-24 16:14:42 +03:00
|
|
|
|
2010-04-08 21:16:10 +04:00
|
|
|
void StaticData::LoadChartDecodingParameters()
|
|
|
|
{
|
2011-02-24 16:14:42 +03:00
|
|
|
LoadNonTerminals();
|
|
|
|
|
|
|
|
// source label overlap
|
|
|
|
if (m_parameter->GetParam("source-label-overlap").size() > 0) {
|
|
|
|
m_sourceLabelOverlap = (SourceLabelOverlap) Scan<int>(m_parameter->GetParam("source-label-overlap")[0]);
|
|
|
|
} else {
|
|
|
|
m_sourceLabelOverlap = SourceLabelOverlapAdd;
|
|
|
|
}
|
|
|
|
|
|
|
|
m_ruleLimit = (m_parameter->GetParam("rule-limit").size() > 0)
|
|
|
|
? Scan<size_t>(m_parameter->GetParam("rule-limit")[0]) : DEFAULT_MAX_TRANS_OPT_SIZE;
|
2010-04-08 21:16:10 +04:00
|
|
|
}
|
2011-02-24 16:14:42 +03:00
|
|
|
|
2010-04-08 21:16:10 +04:00
|
|
|
void StaticData::LoadPhraseBasedParameters()
|
|
|
|
{
|
2012-12-11 22:57:42 +04:00
|
|
|
const vector<float> &distortionWeights = m_parameter->GetWeights("Distortion", 0);
|
|
|
|
CHECK(distortionWeights.size() == 1);
|
|
|
|
|
|
|
|
float weightDistortion = distortionWeights[0];
|
2012-12-19 20:51:55 +04:00
|
|
|
m_distortionScoreProducer = new DistortionScoreProducer();
|
2012-12-30 23:42:53 +04:00
|
|
|
|
2012-12-19 20:51:55 +04:00
|
|
|
SetWeight(m_distortionScoreProducer, weightDistortion);
|
2012-12-05 21:12:01 +04:00
|
|
|
|
2010-04-08 21:16:10 +04:00
|
|
|
}
|
2010-08-10 17:12:00 +04:00
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
bool StaticData::LoadDecodeGraphs()
|
|
|
|
{
|
|
|
|
const vector<string> &mappingVector = m_parameter->GetParam("mapping");
|
|
|
|
const vector<size_t> &maxChartSpans = Scan<size_t>(m_parameter->GetParam("max-chart-span"));
|
|
|
|
|
|
|
|
DecodeStep *prev = 0;
|
|
|
|
size_t prevDecodeGraphInd = 0;
|
|
|
|
for(size_t i=0; i<mappingVector.size(); i++) {
|
|
|
|
vector<string> token = Tokenize(mappingVector[i]);
|
|
|
|
size_t decodeGraphInd;
|
|
|
|
DecodeType decodeType;
|
|
|
|
size_t index;
|
|
|
|
if (token.size() == 2) {
|
|
|
|
decodeGraphInd = 0;
|
|
|
|
decodeType = token[0] == "T" ? Translate : Generate;
|
|
|
|
index = Scan<size_t>(token[1]);
|
|
|
|
} else if (token.size() == 3) {
|
|
|
|
// For specifying multiple translation model
|
|
|
|
decodeGraphInd = Scan<size_t>(token[0]);
|
|
|
|
//the vectorList index can only increment by one
|
2011-11-18 16:07:41 +04:00
|
|
|
CHECK(decodeGraphInd == prevDecodeGraphInd || decodeGraphInd == prevDecodeGraphInd + 1);
|
2011-02-24 16:14:42 +03:00
|
|
|
if (decodeGraphInd > prevDecodeGraphInd) {
|
2008-06-11 14:52:57 +04:00
|
|
|
prev = NULL;
|
|
|
|
}
|
2011-02-24 16:14:42 +03:00
|
|
|
decodeType = token[1] == "T" ? Translate : Generate;
|
|
|
|
index = Scan<size_t>(token[2]);
|
|
|
|
} else {
|
|
|
|
UserMessage::Add("Malformed mapping!");
|
2011-11-18 16:07:41 +04:00
|
|
|
CHECK(false);
|
2011-02-24 16:14:42 +03:00
|
|
|
}
|
2011-08-18 01:13:21 +04:00
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
DecodeStep* decodeStep = NULL;
|
|
|
|
switch (decodeType) {
|
|
|
|
case Translate:
|
|
|
|
if(index>=m_phraseDictionary.size()) {
|
|
|
|
stringstream strme;
|
|
|
|
strme << "No phrase dictionary with index "
|
|
|
|
<< index << " available!";
|
|
|
|
UserMessage::Add(strme.str());
|
2011-11-18 16:07:41 +04:00
|
|
|
CHECK(false);
|
2011-02-24 16:14:42 +03:00
|
|
|
}
|
|
|
|
decodeStep = new DecodeStepTranslation(m_phraseDictionary[index], prev);
|
|
|
|
break;
|
|
|
|
case Generate:
|
|
|
|
if(index>=m_generationDictionary.size()) {
|
|
|
|
stringstream strme;
|
|
|
|
strme << "No generation dictionary with index "
|
|
|
|
<< index << " available!";
|
|
|
|
UserMessage::Add(strme.str());
|
2011-11-18 16:07:41 +04:00
|
|
|
CHECK(false);
|
2011-02-24 16:14:42 +03:00
|
|
|
}
|
|
|
|
decodeStep = new DecodeStepGeneration(m_generationDictionary[index], prev);
|
|
|
|
break;
|
|
|
|
case InsertNullFertilityWord:
|
2011-11-18 16:07:41 +04:00
|
|
|
CHECK(!"Please implement NullFertilityInsertion.");
|
2011-02-24 16:14:42 +03:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2011-11-18 16:07:41 +04:00
|
|
|
CHECK(decodeStep);
|
2011-02-24 16:14:42 +03:00
|
|
|
if (m_decodeGraphs.size() < decodeGraphInd + 1) {
|
|
|
|
DecodeGraph *decodeGraph;
|
2012-10-12 17:09:45 +04:00
|
|
|
if (IsChart()) {
|
2011-02-24 16:14:42 +03:00
|
|
|
size_t maxChartSpan = (decodeGraphInd < maxChartSpans.size()) ? maxChartSpans[decodeGraphInd] : DEFAULT_MAX_CHART_SPAN;
|
2012-06-01 04:49:42 +04:00
|
|
|
cerr << "max-chart-span: " << maxChartSpans[decodeGraphInd] << endl;
|
2011-02-24 16:14:42 +03:00
|
|
|
decodeGraph = new DecodeGraph(m_decodeGraphs.size(), maxChartSpan);
|
|
|
|
} else {
|
|
|
|
decodeGraph = new DecodeGraph(m_decodeGraphs.size());
|
|
|
|
}
|
|
|
|
|
|
|
|
m_decodeGraphs.push_back(decodeGraph); // TODO max chart span
|
|
|
|
}
|
|
|
|
|
|
|
|
m_decodeGraphs[decodeGraphInd]->Add(decodeStep);
|
|
|
|
prev = decodeStep;
|
|
|
|
prevDecodeGraphInd = decodeGraphInd;
|
|
|
|
}
|
|
|
|
|
|
|
|
// set maximum n-gram size for backoff approach to decoding paths
|
|
|
|
// default is always use subsequent paths (value = 0)
|
|
|
|
for(size_t i=0; i<m_decodeGraphs.size(); i++) {
|
|
|
|
m_decodeGraphBackoff.push_back( 0 );
|
|
|
|
}
|
|
|
|
// if specified, record maxmimum unseen n-gram size
|
|
|
|
const vector<string> &backoffVector = m_parameter->GetParam("decoding-graph-backoff");
|
|
|
|
for(size_t i=0; i<m_decodeGraphs.size() && i<backoffVector.size(); i++) {
|
|
|
|
m_decodeGraphBackoff[i] = Scan<size_t>(backoffVector[i]);
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
|
|
|
|
2011-08-19 20:09:36 +04:00
|
|
|
// Create the BLEU score feature and load its reference translations.
// "weight-bl" and "references" must be given together (or both
// omitted, in which case this is a no-op). All reference files must
// contain the same number of lines.
bool StaticData::LoadReferences()
{
  vector<string> bleuWeightStr = m_parameter->GetParam("weight-bl");
  vector<string> referenceFiles = m_parameter->GetParam("references");
  if ((!referenceFiles.size() && bleuWeightStr.size()) || (referenceFiles.size() && !bleuWeightStr.size())) {
    UserMessage::Add("You cannot use the bleu feature without references, and vice-versa");
    return false;
  }
  // neither given: feature disabled, nothing to do
  if (!referenceFiles.size()) {
    return true;
  }
  if (bleuWeightStr.size() > 1) {
    UserMessage::Add("Can only specify one weight for the bleu feature");
    return false;
  }

  float bleuWeight = Scan<float>(bleuWeightStr[0]);
  BleuScoreFeature *bleuScoreFeature = new BleuScoreFeature();
  SetWeight(bleuScoreFeature, bleuWeight);

  cerr << "Loading reference file " << referenceFiles[0] << endl;
  // references[i][j] = line j of reference file i
  vector<vector<string> > references(referenceFiles.size());
  for (size_t i =0; i < referenceFiles.size(); ++i) {
    ifstream in(referenceFiles[i].c_str());
    if (!in) {
      stringstream strme;
      strme << "Unable to load references from " << referenceFiles[i];
      UserMessage::Add(strme.str());
      return false;
    }
    string line;
    while (getline(in,line)) {
      /* if (GetSearchAlgorithm() == ChartDecoding) {
      stringstream tmp;
      tmp << "<s> " << line << " </s>";
      line = tmp.str();
      }*/
      references[i].push_back(line);
    }
    // every reference file must align line-for-line with the previous one
    if (i > 0) {
      if (references[i].size() != references[i-1].size()) {
        UserMessage::Add("Reference files are of different lengths");
        return false;
      }
    }
    in.close();
  }
  //Set the references in the bleu feature
  bleuScoreFeature->LoadReferences(references);

  return true;
}
|
|
|
|
|
2010-10-15 19:19:17 +04:00
|
|
|
// Create discriminative LM features from "dlm-model" entries.
// Entry format: <order> <factor> <include-lower-ngrams> <filename>
// order==2 without lower n-grams uses TargetBigramFeature; everything
// else uses TargetNgramFeature. An optional single "DiscriminativeLM"
// weight becomes the feature's sparse-producer weight.
bool StaticData::LoadDiscrimLMFeature()
{
  // only load if specified
  const vector<string> &wordFile = m_parameter->GetParam("dlm-model");
  if (wordFile.empty()) {
    return true;
  }
  cerr << "Loading " << wordFile.size() << " discriminative language model(s).." << endl;

  for (size_t i = 0; i < wordFile.size(); ++i) {
    vector<string> tokens = Tokenize(wordFile[i]);
    if (tokens.size() != 4) {
      UserMessage::Add("Format of discriminative language model parameter is <order> <factor> <include-lower-ngrams> <filename>");
      return false;
    }

    vector<float> &weights = m_parameter->GetWeights("DiscriminativeLM", i);
    CHECK(weights.size() == 0 || weights.size() == 1);

    size_t order = Scan<size_t>(tokens[0]);
    FactorType factorId = Scan<size_t>(tokens[1]);
    bool include_lower_ngrams = Scan<bool>(tokens[2]);
    string filename = tokens[3];

    if (order == 2 && !include_lower_ngrams) { // TODO: remove TargetBigramFeature ?
      // NOTE(review): the new feature is never stored in a member here —
      // presumably its constructor registers it globally; on Load failure
      // it is leaked. Confirm registration semantics.
      TargetBigramFeature *targetBigramFeature = new TargetBigramFeature(factorId);
      cerr << "loading vocab from " << filename << endl;
      if (!targetBigramFeature->Load(filename)) {
        UserMessage::Add("Unable to load word list from file " + filename);
        return false;
      }

      if (m_parameter->GetParam("report-sparse-features").size() > 0) {
        targetBigramFeature->SetSparseFeatureReporting();
      }
    }
    else {
      // lower-order exclusion is unsupported for chart decoding
      if (m_searchAlgorithm == ChartDecoding && !include_lower_ngrams) {
        UserMessage::Add("Excluding lower order DLM ngrams is currently not supported for chart decoding.");
        return false;
      }

      TargetNgramFeature *targetNgramFeature = new TargetNgramFeature(factorId, order, include_lower_ngrams);
      if (weights.size() == 1) {
        targetNgramFeature->SetSparseProducerWeight(weights[0]);
      }
      cerr << "loading vocab from " << filename << endl;
      if (!targetNgramFeature->Load(filename)) {
        UserMessage::Add("Unable to load word list from file " + filename);
        return false;
      }

      if (m_parameter->GetParam("report-sparse-features").size() > 0) {
        targetNgramFeature->SetSparseFeatureReporting();
      }

      // a weight != 1 marks the feature as a scaled sparse producer
      float sparseWeight = targetNgramFeature->GetSparseProducerWeight();
      if (sparseWeight != 1) {
        AddSparseProducer(targetNgramFeature);
        cerr << "dlm sparse producer weight: " << sparseWeight << endl;
      }

    }
  }

  return true;
}
|
2008-06-11 14:52:57 +04:00
|
|
|
|
2011-08-19 20:09:36 +04:00
|
|
|
bool StaticData::LoadPhraseBoundaryFeature()
|
2011-05-11 02:02:25 +04:00
|
|
|
{
|
2012-12-28 18:26:30 +04:00
|
|
|
const vector<float> &weight = m_parameter->GetWeights("PhraseBoundaryFeature", 0);
|
2012-03-15 04:32:27 +04:00
|
|
|
if (weight.size() > 1) {
|
2012-11-30 22:04:50 +04:00
|
|
|
std::cerr << "Only one sparse producer weight allowed for the phrase boundary feature" << std::endl;
|
2012-03-15 04:32:27 +04:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2011-08-19 20:09:36 +04:00
|
|
|
const vector<string> &phraseBoundarySourceFactors =
|
2011-05-11 02:02:25 +04:00
|
|
|
m_parameter->GetParam("phrase-boundary-source-feature");
|
2011-08-19 20:09:36 +04:00
|
|
|
const vector<string> &phraseBoundaryTargetFactors =
|
2011-05-11 02:02:25 +04:00
|
|
|
m_parameter->GetParam("phrase-boundary-target-feature");
|
|
|
|
if (phraseBoundarySourceFactors.size() == 0 && phraseBoundaryTargetFactors.size() == 0) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
if (phraseBoundarySourceFactors.size() > 1) {
|
|
|
|
UserMessage::Add("Need to specify comma separated list of source factors for phrase boundary");
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
if (phraseBoundaryTargetFactors.size() > 1) {
|
|
|
|
UserMessage::Add("Need to specify comma separated list of target factors for phrase boundary");
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
FactorList sourceFactors;
|
|
|
|
FactorList targetFactors;
|
|
|
|
if (phraseBoundarySourceFactors.size()) {
|
|
|
|
sourceFactors = Tokenize<FactorType>(phraseBoundarySourceFactors[0],",");
|
|
|
|
}
|
|
|
|
if (phraseBoundaryTargetFactors.size()) {
|
|
|
|
targetFactors = Tokenize<FactorType>(phraseBoundaryTargetFactors[0],",");
|
|
|
|
}
|
|
|
|
//cerr << "source "; for (size_t i = 0; i < sourceFactors.size(); ++i) cerr << sourceFactors[i] << " "; cerr << endl;
|
|
|
|
//cerr << "target "; for (size_t i = 0; i < targetFactors.size(); ++i) cerr << targetFactors[i] << " "; cerr << endl;
|
2012-12-28 18:26:30 +04:00
|
|
|
PhraseBoundaryFeature *phraseBoundaryFeature = new PhraseBoundaryFeature(sourceFactors,targetFactors);
|
|
|
|
if (weight.size() > 0) {
|
|
|
|
phraseBoundaryFeature->SetSparseProducerWeight(weight[0]);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (m_parameter->GetParam("report-sparse-features").size() > 0) {
|
|
|
|
phraseBoundaryFeature->SetSparseFeatureReporting();
|
|
|
|
}
|
|
|
|
|
|
|
|
float sparseWeight = phraseBoundaryFeature->GetSparseProducerWeight();
|
|
|
|
if (sparseWeight != 1) {
|
|
|
|
AddSparseProducer(phraseBoundaryFeature);
|
|
|
|
cerr << "pb sparse producer weight: " << sparseWeight << endl;
|
|
|
|
}
|
|
|
|
|
2011-05-11 02:02:25 +04:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2011-08-19 20:09:36 +04:00
|
|
|
bool StaticData::LoadPhrasePairFeature()
|
2011-03-22 17:33:16 +03:00
|
|
|
{
|
2012-07-26 20:32:50 +04:00
|
|
|
const vector<string> ¶meters = m_parameter->GetParam("phrase-pair-feature");
|
|
|
|
if (parameters.size() == 0) return true;
|
|
|
|
|
|
|
|
for (size_t i=0; i<parameters.size(); ++i) {
|
|
|
|
vector<string> tokens = Tokenize(parameters[i]);
|
|
|
|
if (! (tokens.size() >= 1 && tokens.size() <= 6)) {
|
|
|
|
UserMessage::Add("Format for phrase pair feature: --phrase-pair-feature <factor-src>-<factor-tgt> "
|
|
|
|
"[simple source-trigger] [ignore-punctuation] [domain-trigger] [filename-src]");
|
|
|
|
return false;
|
|
|
|
}
|
2012-12-27 22:42:07 +04:00
|
|
|
|
|
|
|
const vector<float> &weight = m_parameter->GetWeights("PhrasePairFeature", i);
|
2012-03-19 06:45:59 +04:00
|
|
|
|
2012-07-26 20:32:50 +04:00
|
|
|
vector <string> factors;
|
|
|
|
if (tokens.size() == 2)
|
|
|
|
factors = Tokenize(tokens[0]," ");
|
|
|
|
else
|
|
|
|
factors = Tokenize(tokens[0],"-");
|
|
|
|
|
|
|
|
size_t sourceFactorId = Scan<size_t>(factors[0]);
|
|
|
|
size_t targetFactorId = Scan<size_t>(factors[1]);
|
|
|
|
bool simple = true, sourceContext = false, ignorePunctuation = false, domainTrigger = false;
|
|
|
|
if (tokens.size() >= 3) {
|
|
|
|
simple = Scan<size_t>(tokens[1]);
|
|
|
|
sourceContext = Scan<size_t>(tokens[2]);
|
|
|
|
}
|
|
|
|
if (tokens.size() >= 4)
|
|
|
|
ignorePunctuation = Scan<size_t>(tokens[3]);
|
|
|
|
if (tokens.size() >= 5)
|
|
|
|
domainTrigger = Scan<size_t>(tokens[4]);
|
|
|
|
|
2012-12-27 22:42:07 +04:00
|
|
|
PhrasePairFeature *phrasePairFeature = new PhrasePairFeature(sourceFactorId, targetFactorId, simple, sourceContext,
|
|
|
|
ignorePunctuation, domainTrigger);
|
|
|
|
phrasePairFeature->SetSparseProducerWeight(weight[i]);
|
2012-07-26 20:32:50 +04:00
|
|
|
|
|
|
|
// load word list
|
|
|
|
if (tokens.size() == 6) {
|
|
|
|
string filenameSource = tokens[5];
|
|
|
|
if (domainTrigger) {
|
2012-12-27 22:42:07 +04:00
|
|
|
const vector<string> &texttype = m_parameter->GetParam("text-type");
|
|
|
|
if (texttype.size() != 1) {
|
|
|
|
UserMessage::Add("Need texttype to load dictionary for domain triggers.");
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
stringstream filename(filenameSource + "." + texttype[0]);
|
|
|
|
filenameSource = filename.str();
|
|
|
|
cerr << "loading word translation term list from " << filenameSource << endl;
|
2012-07-26 20:32:50 +04:00
|
|
|
}
|
|
|
|
else {
|
2012-12-27 22:42:07 +04:00
|
|
|
cerr << "loading word translation word list from " << filenameSource << endl;
|
2012-07-26 20:32:50 +04:00
|
|
|
}
|
2012-12-27 22:42:07 +04:00
|
|
|
if (!phrasePairFeature->Load(filenameSource)) {
|
|
|
|
UserMessage::Add("Unable to load word lists for word translation feature from files " + filenameSource);
|
|
|
|
return false;
|
|
|
|
} // if (!phrasePairFeature->Load(filenameSource)) {
|
|
|
|
} // if (tokens.size() == 6) {
|
|
|
|
|
|
|
|
// TODO not sure about this
|
|
|
|
if (weight[0] != 1) {
|
|
|
|
AddSparseProducer(phrasePairFeature);
|
|
|
|
cerr << "pp sparse producer weight: " << weight[0] << endl;
|
|
|
|
if (m_mira)
|
|
|
|
m_metaFeatureProducer = new MetaFeatureProducer("pp");
|
2012-07-26 20:32:50 +04:00
|
|
|
}
|
2012-12-27 22:42:07 +04:00
|
|
|
|
|
|
|
if (m_parameter->GetParam("report-sparse-features").size() > 0) {
|
|
|
|
phrasePairFeature->SetSparseFeatureReporting();
|
|
|
|
}
|
|
|
|
} // for (size_t i=0; i<parameters.size(); ++i)
|
|
|
|
|
2011-03-22 17:33:16 +03:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2011-08-06 18:10:43 +04:00
|
|
|
bool StaticData::LoadPhraseLengthFeature()
|
|
|
|
{
|
|
|
|
if (m_parameter->isParamSpecified("phrase-length-feature")) {
|
2012-12-27 21:19:06 +04:00
|
|
|
PhraseLengthFeature *phraseLengthFeature = new PhraseLengthFeature();
|
|
|
|
if (m_parameter->GetParam("report-sparse-features").size() > 0) {
|
|
|
|
phraseLengthFeature->SetSparseFeatureReporting();
|
|
|
|
}
|
2011-08-06 18:10:43 +04:00
|
|
|
}
|
2012-12-27 21:19:06 +04:00
|
|
|
|
2011-08-06 18:10:43 +04:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2011-08-13 04:25:23 +04:00
|
|
|
bool StaticData::LoadTargetWordInsertionFeature()
|
|
|
|
{
|
2011-08-19 20:09:36 +04:00
|
|
|
const vector<string> ¶meters = m_parameter->GetParam("target-word-insertion-feature");
|
|
|
|
if (parameters.empty())
|
|
|
|
return true;
|
2011-08-13 04:25:23 +04:00
|
|
|
|
2011-08-19 20:09:36 +04:00
|
|
|
if (parameters.size() != 1) {
|
|
|
|
UserMessage::Add("Can only have one target-word-insertion-feature");
|
|
|
|
return false;
|
|
|
|
}
|
2011-08-13 04:25:23 +04:00
|
|
|
|
2011-08-19 20:09:36 +04:00
|
|
|
vector<string> tokens = Tokenize(parameters[0]);
|
2011-08-13 04:25:23 +04:00
|
|
|
if (tokens.size() != 1 && tokens.size() != 2) {
|
2011-08-13 05:39:35 +04:00
|
|
|
UserMessage::Add("Format of target word insertion feature parameter is: --target-word-insertion-feature <factor> [filename]");
|
2011-08-13 04:25:23 +04:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2012-11-14 23:01:25 +04:00
|
|
|
m_needAlignmentInfo = true;
|
2011-08-13 04:25:23 +04:00
|
|
|
|
2011-08-13 05:39:35 +04:00
|
|
|
// set factor
|
|
|
|
FactorType factorId = Scan<size_t>(tokens[0]);
|
2012-12-27 21:19:06 +04:00
|
|
|
|
|
|
|
TargetWordInsertionFeature *targetWordInsertionFeature = new TargetWordInsertionFeature(factorId);
|
2011-08-13 05:39:35 +04:00
|
|
|
|
|
|
|
// load word list for restricted feature set
|
|
|
|
if (tokens.size() == 2) {
|
|
|
|
string filename = tokens[1];
|
|
|
|
cerr << "loading target word insertion word list from " << filename << endl;
|
2012-12-27 21:19:06 +04:00
|
|
|
if (!targetWordInsertionFeature->Load(filename)) {
|
2011-08-19 20:09:36 +04:00
|
|
|
UserMessage::Add("Unable to load word list for target word insertion feature from file " + filename);
|
|
|
|
return false;
|
2011-08-13 05:39:35 +04:00
|
|
|
}
|
2011-08-19 20:09:36 +04:00
|
|
|
}
|
2011-08-13 05:39:35 +04:00
|
|
|
|
2012-12-27 21:19:06 +04:00
|
|
|
if (m_parameter->GetParam("report-sparse-features").size() > 0) {
|
|
|
|
targetWordInsertionFeature->SetSparseFeatureReporting();
|
|
|
|
}
|
|
|
|
|
2011-08-13 05:39:35 +04:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2011-08-13 06:40:54 +04:00
|
|
|
bool StaticData::LoadWordTranslationFeature()
|
|
|
|
{
|
2012-11-14 23:01:25 +04:00
|
|
|
const vector<string> ¶meters = m_parameter->GetParam("word-translation-feature");
|
|
|
|
if (parameters.empty())
|
|
|
|
return true;
|
|
|
|
|
2012-12-11 22:57:42 +04:00
|
|
|
const vector<float> &weight = m_parameter->GetWeights("WordPenalty", 0);
|
|
|
|
CHECK(weight.size() == 1);
|
2012-03-15 04:32:27 +04:00
|
|
|
|
2012-11-14 23:01:25 +04:00
|
|
|
m_needAlignmentInfo = true;
|
2011-08-13 06:40:54 +04:00
|
|
|
|
2012-07-26 20:32:50 +04:00
|
|
|
for (size_t i=0; i<parameters.size(); ++i) {
|
|
|
|
vector<string> tokens = Tokenize(parameters[i]);
|
|
|
|
if (tokens.size() != 1 && !(tokens.size() >= 4 && tokens.size() <= 8)) {
|
|
|
|
UserMessage::Add("Format of word translation feature parameter is: --word-translation-feature <factor-src>-<factor-tgt> "
|
|
|
|
"[simple source-trigger target-trigger] [ignore-punctuation] [domain-trigger] [filename-src] [filename-tgt]");
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// set factor
|
|
|
|
vector <string> factors = Tokenize(tokens[0],"-");
|
|
|
|
FactorType factorIdSource = Scan<size_t>(factors[0]);
|
|
|
|
FactorType factorIdTarget = Scan<size_t>(factors[1]);
|
|
|
|
|
|
|
|
bool simple = true, sourceTrigger = false, targetTrigger = false, ignorePunctuation = false, domainTrigger = false;
|
|
|
|
if (tokens.size() >= 4) {
|
|
|
|
simple = Scan<size_t>(tokens[1]);
|
|
|
|
sourceTrigger = Scan<size_t>(tokens[2]);
|
|
|
|
targetTrigger = Scan<size_t>(tokens[3]);
|
|
|
|
}
|
|
|
|
if (tokens.size() >= 5) {
|
|
|
|
ignorePunctuation = Scan<size_t>(tokens[4]);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (tokens.size() >= 6) {
|
|
|
|
domainTrigger = Scan<size_t>(tokens[5]);
|
|
|
|
}
|
|
|
|
|
2012-12-27 23:30:57 +04:00
|
|
|
WordTranslationFeature *wordTranslationFeature = new WordTranslationFeature(factorIdSource, factorIdTarget, simple,
|
|
|
|
sourceTrigger, targetTrigger, ignorePunctuation, domainTrigger);
|
|
|
|
wordTranslationFeature->SetSparseProducerWeight(weight[i]);
|
2012-07-26 20:32:50 +04:00
|
|
|
|
|
|
|
// load word list for restricted feature set
|
|
|
|
if (tokens.size() == 7) {
|
|
|
|
string filenameSource = tokens[6];
|
|
|
|
if (domainTrigger) {
|
2012-12-27 23:30:57 +04:00
|
|
|
const vector<string> &texttype = m_parameter->GetParam("text-type");
|
|
|
|
if (texttype.size() != 1) {
|
|
|
|
UserMessage::Add("Need texttype to load dictionary for domain triggers.");
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
stringstream filename(filenameSource + "." + texttype[0]);
|
|
|
|
filenameSource = filename.str();
|
|
|
|
cerr << "loading word translation term list from " << filenameSource << endl;
|
2012-07-26 20:32:50 +04:00
|
|
|
}
|
|
|
|
else {
|
2012-12-27 23:30:57 +04:00
|
|
|
cerr << "loading word translation word lists from " << filenameSource << endl;
|
2012-07-26 20:32:50 +04:00
|
|
|
}
|
2012-12-27 23:30:57 +04:00
|
|
|
if (!wordTranslationFeature->Load(filenameSource, "")) {
|
|
|
|
UserMessage::Add("Unable to load word lists for word translation feature from files " + filenameSource);
|
|
|
|
return false;
|
2012-07-26 20:32:50 +04:00
|
|
|
}
|
2012-12-27 23:30:57 +04:00
|
|
|
} // if (tokens.size() == 7)
|
2012-07-26 20:32:50 +04:00
|
|
|
else if (tokens.size() == 8) {
|
|
|
|
string filenameSource = tokens[6];
|
|
|
|
string filenameTarget = tokens[7];
|
|
|
|
cerr << "loading word translation word lists from " << filenameSource << " and " << filenameTarget << endl;
|
2012-12-27 23:30:57 +04:00
|
|
|
if (!wordTranslationFeature->Load(filenameSource, filenameTarget)) {
|
|
|
|
UserMessage::Add("Unable to load word lists for word translation feature from files " + filenameSource + " and " + filenameTarget);
|
|
|
|
return false;
|
2012-07-26 20:32:50 +04:00
|
|
|
}
|
2012-12-27 23:30:57 +04:00
|
|
|
} //else if (tokens.size() == 8) {
|
|
|
|
|
|
|
|
// TODO not sure about this
|
|
|
|
if (weight[0] != 1) {
|
|
|
|
AddSparseProducer(wordTranslationFeature);
|
|
|
|
cerr << "wt sparse producer weight: " << weight[0] << endl;
|
|
|
|
if (m_mira)
|
|
|
|
m_metaFeatureProducer = new MetaFeatureProducer("wt");
|
2012-07-26 20:32:50 +04:00
|
|
|
}
|
2012-12-27 23:30:57 +04:00
|
|
|
|
|
|
|
if (m_parameter->GetParam("report-sparse-features").size() > 0) {
|
|
|
|
wordTranslationFeature->SetSparseFeatureReporting();
|
|
|
|
}
|
|
|
|
} // for (size_t i=0; i<parameters.size(); ++i)
|
2011-08-13 06:40:54 +04:00
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2008-10-14 23:25:18 +04:00
|
|
|
/**
 * Look up a cached translation option list for (decode-graph position,
 * source phrase). Returns NULL on a miss; on a hit the entry's last-used
 * timestamp is refreshed. Thread-safe when compiled WITH_THREADS.
 */
const TranslationOptionList* StaticData::FindTransOptListInCache(const DecodeGraph &decodeGraph, const Phrase &sourcePhrase) const
{
  const std::pair<size_t, Phrase> key(decodeGraph.GetPosition(), sourcePhrase);
#ifdef WITH_THREADS
  boost::mutex::scoped_lock lock(m_transOptCacheMutex);
#endif
  std::map<std::pair<size_t, Phrase>, std::pair<TranslationOptionList*,clock_t> >::iterator hit
  = m_transOptCache.find(key);
  if (hit == m_transOptCache.end()) {
    return NULL;
  }
  hit->second.second = clock(); // refresh last-used timestamp for LRU pruning
  return hit->second.first;
}
|
|
|
|
|
|
|
|
void StaticData::ReduceTransOptCache() const
|
|
|
|
{
|
2011-02-24 16:14:42 +03:00
|
|
|
if (m_transOptCache.size() <= m_transOptCacheMaxSize) return; // not full
|
|
|
|
clock_t t = clock();
|
|
|
|
|
|
|
|
// find cutoff for last used time
|
|
|
|
priority_queue< clock_t > lastUsedTimes;
|
|
|
|
std::map<std::pair<size_t, Phrase>, std::pair<TranslationOptionList*,clock_t> >::iterator iter;
|
|
|
|
iter = m_transOptCache.begin();
|
|
|
|
while( iter != m_transOptCache.end() ) {
|
|
|
|
lastUsedTimes.push( iter->second.second );
|
|
|
|
iter++;
|
|
|
|
}
|
|
|
|
for( size_t i=0; i < lastUsedTimes.size()-m_transOptCacheMaxSize/2; i++ )
|
|
|
|
lastUsedTimes.pop();
|
|
|
|
clock_t cutoffLastUsedTime = lastUsedTimes.top();
|
|
|
|
|
|
|
|
// remove all old entries
|
|
|
|
iter = m_transOptCache.begin();
|
|
|
|
while( iter != m_transOptCache.end() ) {
|
|
|
|
if (iter->second.second < cutoffLastUsedTime) {
|
|
|
|
std::map<std::pair<size_t, Phrase>, std::pair<TranslationOptionList*,clock_t> >::iterator iterRemove = iter++;
|
|
|
|
delete iterRemove->second.first;
|
|
|
|
m_transOptCache.erase(iterRemove);
|
|
|
|
} else iter++;
|
|
|
|
}
|
|
|
|
VERBOSE(2,"Reduced persistent translation option cache in " << ((clock()-t)/(float)CLOCKS_PER_SEC) << " seconds." << std::endl);
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
|
|
|
|
2008-10-14 23:25:18 +04:00
|
|
|
void StaticData::AddTransOptListToCache(const DecodeGraph &decodeGraph, const Phrase &sourcePhrase, const TranslationOptionList &transOptList) const
|
|
|
|
{
|
2011-02-24 16:14:42 +03:00
|
|
|
if (m_transOptCacheMaxSize == 0) return;
|
|
|
|
std::pair<size_t, Phrase> key(decodeGraph.GetPosition(), sourcePhrase);
|
|
|
|
TranslationOptionList* storedTransOptList = new TranslationOptionList(transOptList);
|
|
|
|
#ifdef WITH_THREADS
|
|
|
|
boost::mutex::scoped_lock lock(m_transOptCacheMutex);
|
2009-08-07 20:47:54 +04:00
|
|
|
#endif
|
2011-02-24 16:14:42 +03:00
|
|
|
m_transOptCache[key] = make_pair( storedTransOptList, clock() );
|
|
|
|
ReduceTransOptCache();
|
2008-10-14 23:25:18 +04:00
|
|
|
}
|
2011-05-31 13:42:27 +04:00
|
|
|
void StaticData::ClearTransOptionCache() const {
|
|
|
|
map<std::pair<size_t, Phrase>, std::pair< TranslationOptionList*, clock_t > >::iterator iterCache;
|
|
|
|
for (iterCache = m_transOptCache.begin() ; iterCache != m_transOptCache.end() ; ++iterCache) {
|
|
|
|
TranslationOptionList *transOptList = iterCache->second.first;
|
|
|
|
delete transOptList;
|
|
|
|
}
|
|
|
|
}
|
2008-10-14 23:25:18 +04:00
|
|
|
|
2010-09-14 13:42:37 +04:00
|
|
|
/**
 * Re-read decoder weights from the parameter object at runtime.
 * Currently disabled: the implementation predates the feature-function
 * refactoring and relied on hard-coded feature classes, so it aborts
 * unconditionally. The previous implementation is preserved below for
 * reference.
 */
void StaticData::ReLoadParameter()
{
  assert(false); // TODO completely redo. Too many hardcoded ff
  /*
  m_verboseLevel = 1;
  if (m_parameter->GetParam("verbose").size() == 1) {
    m_verboseLevel = Scan<size_t>( m_parameter->GetParam("verbose")[0]);
  }

  // check whether "weight-u" is already set
  if (m_parameter->isParamShortNameSpecified("u")) {
    if (m_parameter->GetParamShortName("u").size() < 1 ) {
      PARAM_VEC w(1,"1.0");
      m_parameter->OverwriteParamShortName("u", w);
    }
  }

  //loop over all ScoreProducer to update weights
  const TranslationSystem &transSystem = GetTranslationSystem(TranslationSystem::DEFAULT);

  std::vector<const ScoreProducer*>::const_iterator iterSP;
  for (iterSP = transSystem.GetFeatureFunctions().begin() ; iterSP != transSystem.GetFeatureFunctions().end() ; ++iterSP) {
    std::string paramShortName = (*iterSP)->GetScoreProducerWeightShortName();
    vector<float> Weights = Scan<float>(m_parameter->GetParamShortName(paramShortName));

    if (paramShortName == "d") { //basic distortion model takes the first weight
      if ((*iterSP)->GetScoreProducerDescription() == "Distortion") {
        Weights.resize(1); //take only the first element
      } else { //lexicalized reordering model takes the other
        Weights.erase(Weights.begin()); //remove the first element
      }
    } else if (paramShortName == "tm") {
      continue;
    }
    SetWeights(*iterSP, Weights);
  }

  const vector<float> WeightsTM = Scan<float>(m_parameter->GetParamShortName("tm"));
  const vector<float> WeightsLM = Scan<float>(m_parameter->GetParamShortName("lm"));

  size_t index_WeightTM = 0;
  for(size_t i=0; i<transSystem.GetPhraseDictionaries().size(); ++i) {
    PhraseDictionaryFeature &phraseDictionaryFeature = *m_phraseDictionary[i];

    vector<float> tmp_weights;
    for(size_t j=0; j<phraseDictionaryFeature.GetNumScoreComponents(); ++j)
      tmp_weights.push_back(WeightsTM[index_WeightTM++]);

    SetWeights(&phraseDictionaryFeature, tmp_weights);
  }
  */
}
|
2010-09-17 18:25:08 +04:00
|
|
|
|
2011-11-16 13:13:17 +04:00
|
|
|
/**
 * Re-set the weight of the BLEU score feature at runtime.
 * Currently disabled for the same reason as ReLoadParameter (pending the
 * feature-function refactoring); it aborts unconditionally. The previous
 * implementation is preserved below for reference.
 */
void StaticData::ReLoadBleuScoreFeatureParameter(float weight)
{
  assert(false);
  /*
  //loop over ScoreProducers to update weights of BleuScoreFeature
  const TranslationSystem &transSystem = GetTranslationSystem(TranslationSystem::DEFAULT);

  std::vector<const ScoreProducer*>::const_iterator iterSP;
  for (iterSP = transSystem.GetFeatureFunctions().begin() ; iterSP != transSystem.GetFeatureFunctions().end() ; ++iterSP) {
    std::string paramShortName = (*iterSP)->GetScoreProducerWeightShortName();
    if (paramShortName == "bl") {
      SetWeight(*iterSP, weight);
      break;
    }
  }
  */
}
|
|
|
|
|
2010-09-17 18:25:08 +04:00
|
|
|
// ScoreComponentCollection StaticData::GetAllWeightsScoreComponentCollection() const {}
|
|
|
|
// in ScoreComponentCollection.h
|
2011-08-19 20:09:36 +04:00
|
|
|
|
2012-07-31 00:07:19 +04:00
|
|
|
void StaticData::SetExecPath(const std::string &path)
|
|
|
|
{
|
2012-10-04 18:08:22 +04:00
|
|
|
/*
|
|
|
|
namespace fs = boost::filesystem;
|
|
|
|
|
|
|
|
fs::path full_path( fs::initial_path<fs::path>() );
|
|
|
|
|
|
|
|
full_path = fs::system_complete( fs::path( path ) );
|
|
|
|
|
|
|
|
//Without file name
|
|
|
|
m_binPath = full_path.parent_path().string();
|
|
|
|
*/
|
2012-07-31 00:07:19 +04:00
|
|
|
|
2012-10-04 18:08:22 +04:00
|
|
|
// NOT TESTED
|
|
|
|
size_t pos = path.rfind("/");
|
|
|
|
if (pos != string::npos)
|
|
|
|
{
|
|
|
|
m_binPath = path.substr(0, pos);
|
|
|
|
}
|
2012-07-31 00:07:19 +04:00
|
|
|
cerr << m_binPath << endl;
|
|
|
|
}
|
|
|
|
|
|
|
|
/** Accessor for the directory containing the moses executable (see SetExecPath). */
const string &StaticData::GetBinDirectory() const
{
  return m_binPath;
}
|
|
|
|
|
2012-12-19 19:38:57 +04:00
|
|
|
float StaticData::GetWeightWordPenalty() const {
|
|
|
|
float weightWP = GetWeight(m_wpProducer);
|
|
|
|
//VERBOSE(1, "Read weightWP from translation sytem: " << weightWP << std::endl);
|
|
|
|
return weightWP;
|
|
|
|
}
|
|
|
|
|
2012-12-19 20:22:10 +04:00
|
|
|
float StaticData::GetWeightUnknownWordPenalty() const {
|
|
|
|
return GetWeight(m_unknownWordPenaltyProducer);
|
|
|
|
}
|
|
|
|
|
2012-12-19 20:51:55 +04:00
|
|
|
/** Current weight of the basic distortion producer; the producer must exist. */
float StaticData::GetWeightDistortion() const {
  CHECK(m_distortionScoreProducer);
  // queried through the singleton, as in the original code
  const float weightD = StaticData::Instance().GetWeight(m_distortionScoreProducer);
  return weightD;
}
|
|
|
|
|
2012-12-21 19:28:34 +04:00
|
|
|
void StaticData::ConfigDictionaries() {
|
|
|
|
for (vector<DecodeGraph*>::const_iterator i = m_decodeGraphs.begin();
|
|
|
|
i != m_decodeGraphs.end(); ++i) {
|
|
|
|
for (DecodeGraph::const_iterator j = (*i)->begin(); j != (*i)->end(); ++j) {
|
|
|
|
const DecodeStep* step = *j;
|
|
|
|
PhraseDictionaryFeature* pdict = const_cast<PhraseDictionaryFeature*>(step->GetPhraseDictionaryFeature());
|
|
|
|
if (pdict) {
|
|
|
|
pdict->InitDictionary(NULL);
|
|
|
|
}
|
|
|
|
GenerationDictionary* gdict = const_cast<GenerationDictionary*>(step->GetGenerationDictionaryFeature());
|
|
|
|
if (gdict) {
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
2012-12-24 22:52:04 +04:00
|
|
|
void StaticData::InitializeForInput(const InputType& source) const {
|
2012-12-31 04:57:21 +04:00
|
|
|
const std::vector<ScoreProducer*> &producers = FeatureFunction::GetFeatureFunctions();
|
|
|
|
for(size_t i=0;i<producers.size();++i) {
|
|
|
|
ScoreProducer &ff = *producers[i];
|
2012-12-27 16:41:10 +04:00
|
|
|
ff.InitializeForInput(source);
|
2012-12-21 19:59:52 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void StaticData::CleanUpAfterSentenceProcessing(const InputType& source) const {
|
2012-12-31 04:57:21 +04:00
|
|
|
const std::vector<ScoreProducer*> &producers = FeatureFunction::GetFeatureFunctions();
|
|
|
|
for(size_t i=0;i<producers.size();++i) {
|
|
|
|
ScoreProducer &ff = *producers[i];
|
2012-12-27 17:10:44 +04:00
|
|
|
ff.CleanUpAfterSentenceProcessing(source);
|
2012-12-31 04:57:21 +04:00
|
|
|
cerr << endl << "Cleaning " << &ff << endl;
|
2012-12-21 19:59:52 +04:00
|
|
|
}
|
2012-12-31 04:57:21 +04:00
|
|
|
|
|
|
|
|
2012-12-21 19:59:52 +04:00
|
|
|
}
|
2012-12-25 01:51:11 +04:00
|
|
|
|
2012-12-19 20:51:55 +04:00
|
|
|
} // namespace
|
|
|
|
|
2008-10-09 03:51:26 +04:00
|
|
|
|