mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-09-20 07:42:21 +03:00
Merge branch 'master' of github.com:moses-smt/mosesdecoder
This commit is contained in:
commit
68779c66b9
@ -1,7 +1,5 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<?fileVersion 4.0.0?>
|
||||
|
||||
<cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
|
||||
<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
|
||||
<storageModule moduleId="org.eclipse.cdt.core.settings">
|
||||
<cconfiguration id="cdt.managedbuild.config.gnu.exe.debug.162355801">
|
||||
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.162355801" moduleId="org.eclipse.cdt.core.settings" name="Debug">
|
||||
@ -55,6 +53,7 @@
|
||||
<listOptionValue builtIn="false" value=""${workspace_loc:}/../../boost/lib""/>
|
||||
<listOptionValue builtIn="false" value=""${workspace_loc:}/../../boost/lib64""/>
|
||||
<listOptionValue builtIn="false" value=""${workspace_loc:}/../../irstlm/lib""/>
|
||||
<listOptionValue builtIn="false" value=""${workspace_loc:}/../../randlm/lib""/>
|
||||
<listOptionValue builtIn="false" value=""${workspace_loc:}/../../srilm/lib/macosx""/>
|
||||
<listOptionValue builtIn="false" value=""${workspace_loc:}/../../srilm/lib/i686-m64""/>
|
||||
<listOptionValue builtIn="false" value=""${workspace_loc:}/../../srilm/lib/i686""/>
|
||||
@ -77,6 +76,7 @@
|
||||
<listOptionValue builtIn="false" value="OnDiskPt"/>
|
||||
<listOptionValue builtIn="false" value="lm"/>
|
||||
<listOptionValue builtIn="false" value="util"/>
|
||||
<listOptionValue builtIn="false" value="RandLM"/>
|
||||
<listOptionValue builtIn="false" value="boost_iostreams-mt"/>
|
||||
<listOptionValue builtIn="false" value="boost_system-mt"/>
|
||||
<listOptionValue builtIn="false" value="boost_thread-mt"/>
|
||||
|
@ -1,7 +1,5 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<?fileVersion 4.0.0?>
|
||||
|
||||
<cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
|
||||
<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
|
||||
<storageModule moduleId="org.eclipse.cdt.core.settings">
|
||||
<cconfiguration id="cdt.managedbuild.config.gnu.exe.debug.461114338">
|
||||
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.461114338" moduleId="org.eclipse.cdt.core.settings" name="Debug">
|
||||
@ -49,6 +47,7 @@
|
||||
<tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.1546774818" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug">
|
||||
<option id="gnu.cpp.link.option.paths.523170942" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
|
||||
<listOptionValue builtIn="false" value=""${workspace_loc:}/../../irstlm/lib""/>
|
||||
<listOptionValue builtIn="false" value=""${workspace_loc:}/../../randlm/lib""/>
|
||||
<listOptionValue builtIn="false" value=""${workspace_loc:}/../../cmph/lib""/>
|
||||
<listOptionValue builtIn="false" value=""${workspace_loc:}/../../boost/lib64""/>
|
||||
<listOptionValue builtIn="false" value=""${workspace_loc:}/../../srilm/lib/macosx""/>
|
||||
@ -73,6 +72,7 @@
|
||||
<listOptionValue builtIn="false" value="OnDiskPt"/>
|
||||
<listOptionValue builtIn="false" value="lm"/>
|
||||
<listOptionValue builtIn="false" value="util"/>
|
||||
<listOptionValue builtIn="false" value="RandLM"/>
|
||||
<listOptionValue builtIn="false" value="boost_iostreams-mt"/>
|
||||
<listOptionValue builtIn="false" value="boost_system-mt"/>
|
||||
<listOptionValue builtIn="false" value="boost_thread-mt"/>
|
||||
|
@ -35,6 +35,7 @@
|
||||
<listOptionValue builtIn="false" value="/opt/local/include/"/>
|
||||
<listOptionValue builtIn="false" value="${workspace_loc}/../../irstlm/include"/>
|
||||
<listOptionValue builtIn="false" value="${workspace_loc}/../../srilm/include"/>
|
||||
<listOptionValue builtIn="false" value=""${workspace_loc}/../../randlm/include/RandLM""/>
|
||||
<listOptionValue builtIn="false" value="${workspace_loc}/../../"/>
|
||||
</option>
|
||||
<option id="gnu.cpp.compiler.option.preprocessor.def.752586397" name="Defined symbols (-D)" superClass="gnu.cpp.compiler.option.preprocessor.def" valueType="definedSymbols">
|
||||
@ -46,6 +47,7 @@
|
||||
<listOptionValue builtIn="false" value="KENLM_MAX_ORDER=7"/>
|
||||
<listOptionValue builtIn="false" value="TRACE_ENABLE"/>
|
||||
<listOptionValue builtIn="false" value="LM_IRST"/>
|
||||
<listOptionValue builtIn="false" value="LM_RAND"/>
|
||||
<listOptionValue builtIn="false" value="_FILE_OFFSET_BIT=64"/>
|
||||
<listOptionValue builtIn="false" value="_LARGE_FILES"/>
|
||||
</option>
|
||||
@ -68,8 +70,9 @@
|
||||
</tool>
|
||||
</toolChain>
|
||||
</folderInfo>
|
||||
<fileInfo id="cdt.managedbuild.config.gnu.exe.debug.656913512.511477442" name="Rand.h" rcbsApplicability="disable" resourcePath="LM/Rand.h" toolsToInvoke=""/>
|
||||
<sourceEntries>
|
||||
<entry excluding="FF/PhraseLengthFeatureTest.cpp|PhraseLengthFeatureTest.cpp|LM/BackwardTest.cpp|LM/BackwardLMState.h|LM/BackwardLMState.cpp|LM/Backward.h|LM/Backward.cpp|FeatureVectorTest.cpp|LM/ParallelBackoff.h|LM/ParallelBackoff.cpp|src/SyntacticLanguageModelState.h|src/SyntacticLanguageModelFiles.h|src/SyntacticLanguageModel.h|src/SyntacticLanguageModel.cpp|src/LM/SRI.h|src/LM/SRI.cpp|src/LM/Rand.h|src/LM/Rand.cpp|src/LM/LDHT.h|src/LM/LDHT.cpp|SyntacticLanguageModelState.h|SyntacticLanguageModelFiles.h|SyntacticLanguageModel.h|SyntacticLanguageModel.cpp|LM/Rand.h|LM/Rand.cpp|LM/LDHT.h|LM/LDHT.cpp" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/>
|
||||
<entry excluding="FF/PhraseLengthFeatureTest.cpp|PhraseLengthFeatureTest.cpp|LM/BackwardTest.cpp|LM/BackwardLMState.h|LM/BackwardLMState.cpp|LM/Backward.h|LM/Backward.cpp|FeatureVectorTest.cpp|LM/ParallelBackoff.h|LM/ParallelBackoff.cpp|src/SyntacticLanguageModelState.h|src/SyntacticLanguageModelFiles.h|src/SyntacticLanguageModel.h|src/SyntacticLanguageModel.cpp|src/LM/SRI.h|src/LM/SRI.cpp|src/LM/Rand.h|src/LM/Rand.cpp|src/LM/LDHT.h|src/LM/LDHT.cpp|SyntacticLanguageModelState.h|SyntacticLanguageModelFiles.h|SyntacticLanguageModel.h|SyntacticLanguageModel.cpp|LM/LDHT.h|LM/LDHT.cpp" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/>
|
||||
</sourceEntries>
|
||||
</configuration>
|
||||
</storageModule>
|
||||
|
@ -1061,6 +1061,16 @@
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/ChartBasedFeatureContext.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>FF/ControlRecombination.cpp</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/ControlRecombination.cpp</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>FF/ControlRecombination.h</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/ControlRecombination.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>FF/DistortionScoreProducer.cpp</name>
|
||||
<type>1</type>
|
||||
@ -1081,6 +1091,16 @@
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/FFState.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>FF/Factory.cpp</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/Factory.cpp</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>FF/Factory.h</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/Factory.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>FF/FeatureFunction.cpp</name>
|
||||
<type>1</type>
|
||||
|
69
moses/FF/ControlRecombination.cpp
Normal file
69
moses/FF/ControlRecombination.cpp
Normal file
@ -0,0 +1,69 @@
|
||||
#include "ControlRecombination.h"
|
||||
#include "moses/Hypothesis.h"
|
||||
#include "util/exception.hh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace Moses {
|
||||
|
||||
// Builds the feature from its configuration line. The feature is registered
// under the description "ControlRecombination" with zero score components and
// starts in Output mode.
ControlRecombination::ControlRecombination(const std::string &line)
  :StatefulFeatureFunction("ControlRecombination", 0, line)
  ,m_type(Output)
{
  // Other feature constructors in this code base (e.g.
  // UnknownWordPenaltyProducer) call ReadParameters() at the end of
  // construction; do the same here so that a "type=..." argument can reach
  // SetParameter().
  // NOTE(review): presumably ReadParameters() forwards each parsed key/value
  // pair to SetParameter() -- confirm against FeatureFunction.
  ReadParameters();
}
|
||||
|
||||
void ControlRecombination::SetParameter(const std::string& key, const std::string& value)
|
||||
{
|
||||
if (key == "type") {
|
||||
m_type = (Type) Scan<size_t>(value);
|
||||
} else {
|
||||
StatefulFeatureFunction::SetParameter(key, value);
|
||||
}
|
||||
}
|
||||
|
||||
// Returns a freshly allocated state that remembers the current hypothesis.
// The previous state and the score accumulator are not used.
FFState* ControlRecombination::Evaluate(
  const Hypothesis& cur_hypo,
  const FFState* /* prev_state */,
  ScoreComponentCollection* /* accumulator */) const
{
  return new ControlRecombinationState(&cur_hypo);
}
|
||||
|
||||
// Chart decoding is not supported by this feature: always throws
// util::Exception.
FFState* ControlRecombination::EvaluateChart(
  const ChartHypothesis& /* cur_hypo */,
  int /* featureID - used to index the state in the previous hypotheses */,
  ScoreComponentCollection* /* accumulator */) const
{
  UTIL_THROW(util::Exception, "Not implemented yet");
}
|
||||
|
||||
// Returns the state associated with the empty hypothesis: a newly allocated
// ControlRecombinationState that refers to no hypothesis (its pointer is NULL).
// The input sentence is not consulted.
const FFState* ControlRecombination::EmptyHypothesisState(const InputType & /* input */) const
{
  // The original allocated the state but fell off the end of the function
  // without returning it, which leaks the object and is undefined behaviour
  // for a non-void function.
  return new ControlRecombinationState();
}
|
||||
|
||||
// Default state: not attached to any hypothesis.
ControlRecombinationState::ControlRecombinationState()
  : m_hypo(NULL)
{}
|
||||
|
||||
// State attached to the given hypothesis; only the pointer is stored.
ControlRecombinationState::ControlRecombinationState(const Hypothesis *hypo)
  : m_hypo(hypo)
{}
|
||||
|
||||
int ControlRecombinationState::Compare(const FFState& other) const
|
||||
{
|
||||
const ControlRecombinationState &other2 = static_cast<const ControlRecombinationState&>(other);
|
||||
const Hypothesis *otherHypo = other2.m_hypo;
|
||||
|
||||
Phrase thisOutputPhrase, otherOutputPhrase;
|
||||
m_hypo->GetOutputPhrase(thisOutputPhrase);
|
||||
otherHypo->GetOutputPhrase(otherOutputPhrase);
|
||||
|
||||
int ret = thisOutputPhrase.Compare(otherOutputPhrase);
|
||||
return ret;
|
||||
}
|
||||
|
||||
}
|
58
moses/FF/ControlRecombination.h
Normal file
58
moses/FF/ControlRecombination.h
Normal file
@ -0,0 +1,58 @@
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include "StatefulFeatureFunction.h"
|
||||
#include "moses/FF/FFState.h"
|
||||
|
||||
namespace Moses {
|
||||
|
||||
class ControlRecombinationState;
|
||||
|
||||
// force hypotheses NOT to recombine. For forced decoding
|
||||
class ControlRecombination : public StatefulFeatureFunction
|
||||
{
|
||||
public:
|
||||
enum Type
|
||||
{
|
||||
None,
|
||||
Output,
|
||||
Segmentation
|
||||
};
|
||||
|
||||
ControlRecombination(const std::string &line);
|
||||
|
||||
bool IsUseable(const FactorMask &mask) const {
|
||||
return true;
|
||||
}
|
||||
|
||||
virtual FFState* Evaluate(
|
||||
const Hypothesis& cur_hypo,
|
||||
const FFState* prev_state,
|
||||
ScoreComponentCollection* accumulator) const;
|
||||
|
||||
virtual FFState* EvaluateChart(
|
||||
const ChartHypothesis& /* cur_hypo */,
|
||||
int /* featureID - used to index the state in the previous hypotheses */,
|
||||
ScoreComponentCollection* accumulator) const;
|
||||
|
||||
//! return the state associated with the empty hypothesis for a given sentence
|
||||
virtual const FFState* EmptyHypothesisState(const InputType &input) const;
|
||||
|
||||
void SetParameter(const std::string& key, const std::string& value);
|
||||
protected:
|
||||
Type m_type;
|
||||
};
|
||||
|
||||
class ControlRecombinationState : public FFState
|
||||
{
|
||||
protected:
|
||||
const Hypothesis *m_hypo;
|
||||
|
||||
public:
|
||||
ControlRecombinationState();
|
||||
ControlRecombinationState(const Hypothesis *hypo);
|
||||
int Compare(const FFState& other) const;
|
||||
|
||||
};
|
||||
|
||||
} // namespace
|
177
moses/FF/Factory.cpp
Normal file
177
moses/FF/Factory.cpp
Normal file
@ -0,0 +1,177 @@
|
||||
#include "moses/FF/Factory.h"
|
||||
#include "moses/StaticData.h"
|
||||
|
||||
#include "moses/TranslationModel/PhraseDictionaryTreeAdaptor.h"
|
||||
#include "moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.h"
|
||||
#include "moses/TranslationModel/PhraseDictionaryMemory.h"
|
||||
#include "moses/TranslationModel/CompactPT/PhraseDictionaryCompact.h"
|
||||
#include "moses/TranslationModel/PhraseDictionaryMultiModel.h"
|
||||
#include "moses/TranslationModel/PhraseDictionaryMultiModelCounts.h"
|
||||
#include "moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.h"
|
||||
#include "moses/TranslationModel/PhraseDictionaryDynSuffixArray.h"
|
||||
|
||||
#include "moses/LexicalReordering.h"
|
||||
|
||||
#include "moses/FF/BleuScoreFeature.h"
|
||||
#include "moses/FF/TargetWordInsertionFeature.h"
|
||||
#include "moses/FF/SourceWordDeletionFeature.h"
|
||||
#include "moses/FF/GlobalLexicalModel.h"
|
||||
#include "moses/FF/GlobalLexicalModelUnlimited.h"
|
||||
#include "moses/FF/UnknownWordPenaltyProducer.h"
|
||||
#include "moses/FF/WordTranslationFeature.h"
|
||||
#include "moses/FF/TargetBigramFeature.h"
|
||||
#include "moses/FF/TargetNgramFeature.h"
|
||||
#include "moses/FF/PhraseBoundaryFeature.h"
|
||||
#include "moses/FF/PhrasePairFeature.h"
|
||||
#include "moses/FF/PhraseLengthFeature.h"
|
||||
#include "moses/FF/DistortionScoreProducer.h"
|
||||
#include "moses/FF/WordPenaltyProducer.h"
|
||||
#include "moses/FF/InputFeature.h"
|
||||
#include "moses/FF/PhrasePenalty.h"
|
||||
#include "moses/FF/OSM-Feature/OpSequenceModel.h"
|
||||
#include "moses/FF/ControlRecombination.h"
|
||||
|
||||
#include "moses/LM/Ken.h"
|
||||
#ifdef LM_IRST
|
||||
#include "moses/LM/IRST.h"
|
||||
#endif
|
||||
|
||||
#ifdef LM_SRI
|
||||
#include "moses/LM/SRI.h"
|
||||
#endif
|
||||
|
||||
#ifdef LM_RAND
|
||||
#include "moses/LM/Rand.h"
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_SYNLM
|
||||
#include "moses/SyntacticLanguageModel.h"
|
||||
#endif
|
||||
|
||||
#include "util/exception.hh"
|
||||
|
||||
#include <vector>
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
// Abstract factory: each concrete subclass knows how to create one kind of
// feature function from its configuration line.
class FeatureFactory
{
public:
  virtual ~FeatureFactory() {}

  // Creates the feature described by `line`.
  virtual void Create(const std::string &line) = 0;

protected:
  // Only subclasses may construct a factory.
  FeatureFactory() {}

  // Shared post-construction step that installs the weights for `feature`
  // in StaticData.
  template <class F> static void DefaultSetup(F *feature);
};
|
||||
|
||||
template <class F> void FeatureFactory::DefaultSetup(F *feature)
|
||||
{
|
||||
StaticData &static_data = StaticData::InstanceNonConst();
|
||||
std::vector<float> &weights = static_data.GetParameter()->GetWeights(feature->GetScoreProducerDescription());
|
||||
|
||||
if (feature->IsTuneable() || weights.size()) {
|
||||
// if it's tuneable, ini file MUST have weights
|
||||
// even it it's not tuneable, people can still set the weights in the ini file
|
||||
static_data.SetWeights(feature, weights);
|
||||
} else {
|
||||
std::vector<float> defaultWeights = feature->DefaultWeights();
|
||||
static_data.SetWeights(feature, defaultWeights);
|
||||
}
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
template <class F> class DefaultFeatureFactory : public FeatureFactory
|
||||
{
|
||||
public:
|
||||
void Create(const std::string &line) {
|
||||
DefaultSetup(new F(line));
|
||||
}
|
||||
};
|
||||
|
||||
class KenFactory : public FeatureFactory
|
||||
{
|
||||
public:
|
||||
void Create(const std::string &line) {
|
||||
DefaultSetup(ConstructKenLM(line));
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
// Registers a factory for every feature name known to this build. Language
// model back-ends other than KenLM are registered only when the matching
// LM_* / HAVE_SYNLM macro is defined. Add() throws on a duplicate name.
FeatureRegistry::FeatureRegistry()
{
  // The helper macros deliberately carry no trailing semicolon (each use
  // below supplies its own) and are #undef'd at the end of the constructor so
  // they do not leak into the rest of the translation unit.

  // Feature with same name as class
#define MOSES_FNAME(name) Add(#name, new DefaultFeatureFactory< name >())
  // Feature with different name than class.
#define MOSES_FNAME2(name, type) Add(name, new DefaultFeatureFactory< type >())

  MOSES_FNAME(GlobalLexicalModel);
  //MOSES_FNAME(GlobalLexicalModelUnlimited); This was commented out in the original
  MOSES_FNAME(SourceWordDeletionFeature);
  MOSES_FNAME(TargetWordInsertionFeature);
  MOSES_FNAME(PhraseBoundaryFeature);
  MOSES_FNAME(PhraseLengthFeature);
  MOSES_FNAME(WordTranslationFeature);
  MOSES_FNAME(TargetBigramFeature);
  MOSES_FNAME(TargetNgramFeature);
  MOSES_FNAME(PhrasePairFeature);
  MOSES_FNAME(LexicalReordering);
  MOSES_FNAME2("Generation", GenerationDictionary);
  MOSES_FNAME(BleuScoreFeature);
  MOSES_FNAME2("Distortion", DistortionScoreProducer);
  MOSES_FNAME2("WordPenalty", WordPenaltyProducer);
  MOSES_FNAME(InputFeature);
  MOSES_FNAME2("PhraseDictionaryBinary", PhraseDictionaryTreeAdaptor);
  MOSES_FNAME(PhraseDictionaryOnDisk);
  MOSES_FNAME(PhraseDictionaryMemory);
  MOSES_FNAME(PhraseDictionaryCompact);
  MOSES_FNAME(PhraseDictionaryMultiModel);
  MOSES_FNAME(PhraseDictionaryMultiModelCounts);
  MOSES_FNAME(PhraseDictionaryALSuffixArray);
  MOSES_FNAME(PhraseDictionaryDynSuffixArray);
  MOSES_FNAME(OpSequenceModel);
  MOSES_FNAME(PhrasePenalty);
  MOSES_FNAME2("UnknownWordPenalty", UnknownWordPenaltyProducer);
  MOSES_FNAME(ControlRecombination);

#ifdef HAVE_SYNLM
  MOSES_FNAME(SyntacticLanguageModel);
#endif
#ifdef LM_IRST
  MOSES_FNAME2("IRSTLM", LanguageModelIRST);
#endif
#ifdef LM_SRI
  MOSES_FNAME2("SRILM", LanguageModelSRI);
#endif
#ifdef LM_RAND
  MOSES_FNAME2("RANDLM", LanguageModelRandLM);
#endif

  // KenLM needs its own factory because it is built via ConstructKenLM().
  Add("KENLM", new KenFactory());

#undef MOSES_FNAME2
#undef MOSES_FNAME
}
|
||||
|
||||
// Nothing to release by hand: the registry map holds its factories through
// boost::shared_ptr.
FeatureRegistry::~FeatureRegistry()
{
}
|
||||
|
||||
void FeatureRegistry::Add(const std::string &name, FeatureFactory *factory)
|
||||
{
|
||||
std::pair<std::string, boost::shared_ptr<FeatureFactory> > to_ins(name, boost::shared_ptr<FeatureFactory>(factory));
|
||||
UTIL_THROW_IF(!registry_.insert(to_ins).second, util::Exception, "Duplicate feature name " << name);
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
class UnknownFeatureException : public util::Exception {};
|
||||
}
|
||||
|
||||
void FeatureRegistry::Construct(const std::string &name, const std::string &line)
|
||||
{
|
||||
Map::iterator i = registry_.find(name);
|
||||
UTIL_THROW_IF(i == registry_.end(), UnknownFeatureException, "Feature name " << name << " is not registered.");
|
||||
i->second->Create(line);
|
||||
}
|
||||
|
||||
} // namespace Moses
|
30
moses/FF/Factory.h
Normal file
30
moses/FF/Factory.h
Normal file
@ -0,0 +1,30 @@
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
|
||||
#include <boost/shared_ptr.hpp>
|
||||
#include <boost/unordered_map.hpp>
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
class FeatureFactory;
|
||||
|
||||
class FeatureRegistry
|
||||
{
|
||||
public:
|
||||
FeatureRegistry();
|
||||
|
||||
~FeatureRegistry();
|
||||
|
||||
void Construct(const std::string &name, const std::string &line);
|
||||
|
||||
private:
|
||||
void Add(const std::string &name, FeatureFactory *factory);
|
||||
|
||||
typedef boost::unordered_map<std::string, boost::shared_ptr<FeatureFactory> > Map;
|
||||
|
||||
Map registry_;
|
||||
};
|
||||
|
||||
} // namespace Moses
|
@ -74,7 +74,7 @@ void FeatureFunction::ParseLine(const std::string& description, const std::strin
|
||||
set<string> keys;
|
||||
|
||||
for (size_t i = 1; i < toks.size(); ++i) {
|
||||
vector<string> args = Tokenize(toks[i], "=");
|
||||
vector<string> args = TokenizeFirstOnly(toks[i], "=");
|
||||
CHECK(args.size() == 2);
|
||||
|
||||
pair<set<string>::iterator,bool> ret = keys.insert(args[0]);
|
||||
@ -109,5 +109,10 @@ void FeatureFunction::ReadParameters()
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<float> FeatureFunction::DefaultWeights() const
|
||||
{
|
||||
UTIL_THROW(util::Exception, "No default weights");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
@ -78,6 +78,7 @@ public:
|
||||
virtual bool IsTuneable() const {
|
||||
return m_tuneable;
|
||||
}
|
||||
virtual std::vector<float> DefaultWeights() const;
|
||||
|
||||
//! Called before search and collecting of translation options
|
||||
virtual void InitializeForInput(InputType const& source) {
|
||||
|
@ -13,5 +13,11 @@ UnknownWordPenaltyProducer::UnknownWordPenaltyProducer(const std::string &line)
|
||||
ReadParameters();
|
||||
}
|
||||
|
||||
std::vector<float> UnknownWordPenaltyProducer::DefaultWeights() const
|
||||
{
|
||||
std::vector<float> ret(1, 1.0f);
|
||||
return ret;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
@ -20,6 +20,7 @@ public:
|
||||
bool IsUseable(const FactorMask &mask) const {
|
||||
return true;
|
||||
}
|
||||
std::vector<float> DefaultWeights() const;
|
||||
|
||||
};
|
||||
|
||||
|
@ -401,6 +401,14 @@ void Hypothesis::CleanupArcList()
|
||||
}
|
||||
}
|
||||
|
||||
// Appends to `out` the target phrases of this hypothesis and all of its
// predecessors, earliest predecessor first. `out` is not cleared beforehand.
void Hypothesis::GetOutputPhrase(Phrase &out) const
{
  // Recurse first so that earlier phrases are appended before this one.
  if (m_prevHypo) {
    m_prevHypo->GetOutputPhrase(out);
  }

  out.Append(GetCurrTargetPhrase());
}
|
||||
|
||||
TO_STRING_BODY(Hypothesis)
|
||||
|
||||
// friend
|
||||
|
@ -200,11 +200,12 @@ public:
|
||||
|
||||
int RecombineCompare(const Hypothesis &compare) const;
|
||||
|
||||
void GetOutputPhrase(Phrase &out) const;
|
||||
|
||||
void ToStream(std::ostream& out) const {
|
||||
if (m_prevHypo != NULL) {
|
||||
m_prevHypo->ToStream(out);
|
||||
}
|
||||
out << (Phrase) GetCurrTargetPhrase();
|
||||
Phrase ret;
|
||||
GetOutputPhrase(ret);
|
||||
out << ret;
|
||||
}
|
||||
|
||||
void ToStringStream(std::stringstream& out) const {
|
||||
|
@ -30,6 +30,25 @@ if $(have-clock[2]) = 0 {
|
||||
alias rt ;
|
||||
}
|
||||
|
||||
#This is a kludge to force rebuilding if different --with options are passed.
|
||||
#Could have used features like <srilm>on but getting these to apply only to
|
||||
#linking was ugly and it still didn't trigger an install (since the install
|
||||
#path doesn't encode features). It stores a file lm.log with the previous
|
||||
#options and forces a rebuild if the current options differ.
|
||||
local current = ;
|
||||
for local i in srilm irstlm randlm {
|
||||
local optval = [ option.get "with-$(i)" ] ;
|
||||
if $(optval) {
|
||||
current += "--with-$(i)=$(optval)" ;
|
||||
}
|
||||
}
|
||||
current = $(current:J=" ") ;
|
||||
current ?= "" ;
|
||||
path-constant LM-LOG : bin/lm.log ;
|
||||
update-if-changed $(LM-LOG) $(current) ;
|
||||
|
||||
obj FF_Factory.o : FF/Factory.cpp LM//macros headers ../lm//kenlm : <dependency>$(LM-LOG) ;
|
||||
|
||||
lib moses :
|
||||
[ glob
|
||||
*.cpp
|
||||
@ -45,8 +64,9 @@ lib moses :
|
||||
ThreadPool.cpp
|
||||
SyntacticLanguageModel.cpp
|
||||
*Test.cpp Mock*.cpp
|
||||
LM/Factory.cpp
|
||||
]
|
||||
headers LM//LM TranslationModel/CompactPT//CompactPT synlm ThreadPool rt
|
||||
headers FF_Factory.o LM//LM TranslationModel/CompactPT//CompactPT synlm ThreadPool rt
|
||||
..//search ../util/double-conversion//double-conversion ..//z ../OnDiskPt//OnDiskPt ;
|
||||
|
||||
alias headers-to-install : [ glob-tree *.h ] ;
|
||||
|
@ -7,26 +7,8 @@
|
||||
|
||||
import option path ;
|
||||
|
||||
#This is a kludge to force rebuilding if different --with options are passed.
|
||||
#Could have used features like <srilm>on but getting these to apply only to
|
||||
#linking was ugly and it still didn't trigger an install (since the install
|
||||
#path doesn't encode features). It stores a file lm.log with the previous
|
||||
#options and forces a rebuild if the current options differ.
|
||||
local current = ;
|
||||
for local i in srilm irstlm randlm {
|
||||
local optval = [ option.get "with-$(i)" ] ;
|
||||
if $(optval) {
|
||||
current += "--with-$(i)=$(optval)" ;
|
||||
}
|
||||
}
|
||||
current = $(current:J=" ") ;
|
||||
current ?= "" ;
|
||||
|
||||
path-constant LM-LOG : bin/lm.log ;
|
||||
update-if-changed $(LM-LOG) $(current) ;
|
||||
|
||||
|
||||
local dependencies = ;
|
||||
local lmmacros = ;
|
||||
|
||||
#IRSTLM
|
||||
local with-irstlm = [ option.get "with-irstlm" ] ;
|
||||
@ -35,6 +17,7 @@ if $(with-irstlm) {
|
||||
obj IRST.o : IRST.cpp ..//headers : <include>$(with-irstlm)/include <include>$(with-irstlm)/include/irstlm ;
|
||||
alias irst : IRST.o irstlm : : : <define>LM_IRST ;
|
||||
dependencies += irst ;
|
||||
lmmacros += LM_IRST ;
|
||||
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" ;
|
||||
echo "!!! You are linking the IRSTLM library; be sure the release is >= 5.70.02 !!!" ;
|
||||
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" ;
|
||||
@ -63,6 +46,7 @@ if $(with-srilm) {
|
||||
obj ParallelBackoff.o : ParallelBackoff.cpp ..//headers : <include>$(with-srilm)/include <include>$(with-srilm)/include/srilm <warnings>off ;
|
||||
alias sri : SRI.o ParallelBackoff.o sri-libs : : : <define>LM_SRI ;
|
||||
dependencies += sri ;
|
||||
lmmacros += LM_SRI ;
|
||||
}
|
||||
|
||||
#RandLM
|
||||
@ -72,6 +56,7 @@ if $(with-randlm) {
|
||||
obj Rand.o : Rand.cpp RandLM ..//headers : <include>$(with-randlm)/include <include>$(with-randlm)/include/RandLM ;
|
||||
alias rand : Rand.o RandLM : : : <define>LM_RAND ;
|
||||
dependencies += rand ;
|
||||
lmmacros += LM_RAND ;
|
||||
}
|
||||
|
||||
# LDHTLM
|
||||
@ -82,6 +67,7 @@ if $(with-ldhtlm) {
|
||||
obj LDHT.o : LDHT.cpp LDHT ..//headers : <include>$(with-ldhtlm)/include <include>$(with-ldhtlm)/include/LDHT ;
|
||||
alias ldht : LDHT.o LDHT ticpp : : : <define>LM_LDHT ;
|
||||
dependencies += ldht ;
|
||||
lmmacros += LM_LDHT ;
|
||||
}
|
||||
|
||||
#ORLM is always compiled but needs special headers
|
||||
@ -92,4 +78,4 @@ obj ORLM.o : ORLM.cpp ..//headers ../TranslationModel/DynSAInclude//dynsa : : :
|
||||
alias LM : Base.cpp Implementation.cpp Joint.cpp Ken.cpp MultiFactor.cpp Remote.cpp SingleFactor.cpp ORLM.o
|
||||
../../lm//kenlm ..//headers $(dependencies) ;
|
||||
|
||||
|
||||
alias macros : : : : <define>$(lmmacros) ;
|
||||
|
@ -66,7 +66,7 @@ struct KenLMState : public FFState {
|
||||
template <class Model> class LanguageModelKen : public LanguageModel
|
||||
{
|
||||
public:
|
||||
LanguageModelKen(const std::string &description, const std::string &line, const std::string &file, FactorType factorType, bool lazy);
|
||||
LanguageModelKen(const std::string &line, const std::string &file, FactorType factorType, bool lazy);
|
||||
|
||||
const FFState *EmptyHypothesisState(const InputType &/*input*/) const {
|
||||
KenLMState *ret = new KenLMState();
|
||||
@ -137,8 +137,8 @@ private:
|
||||
std::vector<lm::WordIndex> &m_mapping;
|
||||
};
|
||||
|
||||
template <class Model> LanguageModelKen<Model>::LanguageModelKen(const std::string &description, const std::string &line, const std::string &file, FactorType factorType, bool lazy)
|
||||
:LanguageModel(description, line)
|
||||
template <class Model> LanguageModelKen<Model>::LanguageModelKen(const std::string &line, const std::string &file, FactorType factorType, bool lazy)
|
||||
:LanguageModel("KENLM", line)
|
||||
,m_factorType(factorType)
|
||||
{
|
||||
lm::ngram::Config config;
|
||||
@ -351,7 +351,7 @@ bool LanguageModelKen<Model>::IsUseable(const FactorMask &mask) const
|
||||
|
||||
} // namespace
|
||||
|
||||
LanguageModel *ConstructKenLM(const std::string &description, const std::string &line)
|
||||
LanguageModel *ConstructKenLM(const std::string &line)
|
||||
{
|
||||
FactorType factorType;
|
||||
string filePath;
|
||||
@ -375,10 +375,10 @@ LanguageModel *ConstructKenLM(const std::string &description, const std::string
|
||||
}
|
||||
}
|
||||
|
||||
return ConstructKenLM(description, line, filePath, factorType, lazy);
|
||||
return ConstructKenLM(line, filePath, factorType, lazy);
|
||||
}
|
||||
|
||||
LanguageModel *ConstructKenLM(const std::string &description, const std::string &line, const std::string &file, FactorType factorType, bool lazy)
|
||||
LanguageModel *ConstructKenLM(const std::string &line, const std::string &file, FactorType factorType, bool lazy)
|
||||
{
|
||||
try {
|
||||
lm::ngram::ModelType model_type;
|
||||
@ -386,23 +386,23 @@ LanguageModel *ConstructKenLM(const std::string &description, const std::string
|
||||
|
||||
switch(model_type) {
|
||||
case lm::ngram::PROBING:
|
||||
return new LanguageModelKen<lm::ngram::ProbingModel>(description, line, file, factorType, lazy);
|
||||
return new LanguageModelKen<lm::ngram::ProbingModel>(line, file, factorType, lazy);
|
||||
case lm::ngram::REST_PROBING:
|
||||
return new LanguageModelKen<lm::ngram::RestProbingModel>(description, line, file, factorType, lazy);
|
||||
return new LanguageModelKen<lm::ngram::RestProbingModel>(line, file, factorType, lazy);
|
||||
case lm::ngram::TRIE:
|
||||
return new LanguageModelKen<lm::ngram::TrieModel>(description, line, file, factorType, lazy);
|
||||
return new LanguageModelKen<lm::ngram::TrieModel>(line, file, factorType, lazy);
|
||||
case lm::ngram::QUANT_TRIE:
|
||||
return new LanguageModelKen<lm::ngram::QuantTrieModel>(description, line, file, factorType, lazy);
|
||||
return new LanguageModelKen<lm::ngram::QuantTrieModel>(line, file, factorType, lazy);
|
||||
case lm::ngram::ARRAY_TRIE:
|
||||
return new LanguageModelKen<lm::ngram::ArrayTrieModel>(description, line, file, factorType, lazy);
|
||||
return new LanguageModelKen<lm::ngram::ArrayTrieModel>(line, file, factorType, lazy);
|
||||
case lm::ngram::QUANT_ARRAY_TRIE:
|
||||
return new LanguageModelKen<lm::ngram::QuantArrayTrieModel>(description, line, file, factorType, lazy);
|
||||
return new LanguageModelKen<lm::ngram::QuantArrayTrieModel>(line, file, factorType, lazy);
|
||||
default:
|
||||
std::cerr << "Unrecognized kenlm model type " << model_type << std::endl;
|
||||
abort();
|
||||
}
|
||||
} else {
|
||||
return new LanguageModelKen<lm::ngram::ProbingModel>(description, line, file, factorType, lazy);
|
||||
return new LanguageModelKen<lm::ngram::ProbingModel>(line, file, factorType, lazy);
|
||||
}
|
||||
} catch (std::exception &e) {
|
||||
std::cerr << e.what() << std::endl;
|
||||
|
@ -31,10 +31,10 @@ namespace Moses
|
||||
|
||||
class LanguageModel;
|
||||
|
||||
LanguageModel *ConstructKenLM(const std::string &description, const std::string &line);
|
||||
LanguageModel *ConstructKenLM(const std::string &line);
|
||||
|
||||
//! This will also load. Returns a templated KenLM class
|
||||
LanguageModel *ConstructKenLM(const std::string &description, const std::string &line, const std::string &file, FactorType factorType, bool lazy);
|
||||
LanguageModel *ConstructKenLM(const std::string &line, const std::string &file, FactorType factorType, bool lazy);
|
||||
|
||||
} // namespace Moses
|
||||
|
||||
|
@ -20,11 +20,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
#include <limits>
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "SingleFactor.h"
|
||||
#include "RandLM.h"
|
||||
#include "Rand.h"
|
||||
#include "moses/Factor.h"
|
||||
#include "moses/Util.h"
|
||||
@ -33,62 +29,34 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
#include "moses/InputFileStream.h"
|
||||
#include "moses/StaticData.h"
|
||||
#include "util/check.hh"
|
||||
#include "RandLM.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
namespace
|
||||
|
||||
LanguageModelRandLM::LanguageModelRandLM(const std::string &line)
|
||||
:LanguageModelSingleFactor("RandLM", line)
|
||||
, m_lm(0)
|
||||
{
|
||||
using namespace std;
|
||||
}
|
||||
|
||||
class LanguageModelRandLM : public LanguageModelSingleFactor
|
||||
{
|
||||
public:
|
||||
LanguageModelRandLM(const std::string &line)
|
||||
:LanguageModelSingleFactor("RandLM", line)
|
||||
, m_lm(0) {
|
||||
}
|
||||
bool Load(const std::string &filePath, FactorType factorType, size_t nGramOrder);
|
||||
virtual LMResult GetValue(const std::vector<const Word*> &contextFactor, State* finalState = NULL) const;
|
||||
~LanguageModelRandLM() {
|
||||
delete m_lm;
|
||||
}
|
||||
void InitializeForInput(InputType const& source) {
|
||||
m_lm->initThreadSpecificData(); // Creates thread specific data iff // compiled with multithreading.
|
||||
}
|
||||
void CleanUpAfterSentenceProcessing(const InputType& source) {
|
||||
m_lm->clearCaches(); // clear caches
|
||||
}
|
||||
protected:
|
||||
std::vector<randlm::WordID> m_randlm_ids_vec;
|
||||
randlm::RandLM* m_lm;
|
||||
randlm::WordID m_oov_id;
|
||||
void CreateFactors(FactorCollection &factorCollection);
|
||||
randlm::WordID GetLmID( const std::string &str ) const;
|
||||
randlm::WordID GetLmID( const Factor *factor ) const {
|
||||
size_t factorId = factor->GetId();
|
||||
return ( factorId >= m_randlm_ids_vec.size()) ? m_oov_id : m_randlm_ids_vec[factorId];
|
||||
};
|
||||
LanguageModelRandLM::~LanguageModelRandLM() {
|
||||
delete m_lm;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
|
||||
bool LanguageModelRandLM::Load(const std::string &filePath, FactorType factorType,
|
||||
size_t nGramOrder)
|
||||
void LanguageModelRandLM::Load()
|
||||
{
|
||||
cerr << "Loading LanguageModelRandLM..." << endl;
|
||||
FactorCollection &factorCollection = FactorCollection::Instance();
|
||||
m_filePath = filePath;
|
||||
m_factorType = factorType;
|
||||
m_nGramOrder = nGramOrder;
|
||||
int cache_MB = 50; // increase cache size
|
||||
m_lm = randlm::RandLM::initRandLM(filePath, nGramOrder, cache_MB);
|
||||
m_lm = randlm::RandLM::initRandLM(m_filePath, m_nGramOrder, cache_MB);
|
||||
CHECK(m_lm != NULL);
|
||||
// get special word ids
|
||||
m_oov_id = m_lm->getWordID(m_lm->getOOV());
|
||||
CreateFactors(factorCollection);
|
||||
m_lm->initThreadSpecificData();
|
||||
return true;
|
||||
}
|
||||
|
||||
void LanguageModelRandLM::CreateFactors(FactorCollection &factorCollection) // add factors which have randlm id
|
||||
@ -132,6 +100,11 @@ randlm::WordID LanguageModelRandLM::GetLmID( const std::string &str ) const
|
||||
return m_lm->getWordID(str);
|
||||
}
|
||||
|
||||
// Translate a Moses factor into the RandLM word id cached for it.
// Factor ids that fall outside the cached id table map to the OOV id.
randlm::WordID LanguageModelRandLM::GetLmID( const Factor *factor ) const
{
  const size_t idx = factor->GetId();
  if (idx < m_randlm_ids_vec.size()) {
    return m_randlm_ids_vec[idx];
  }
  return m_oov_id;
}
|
||||
|
||||
LMResult LanguageModelRandLM::GetValue(const vector<const Word*> &contextFactor,
|
||||
State* finalState) const
|
||||
{
|
||||
@ -154,6 +127,11 @@ LMResult LanguageModelRandLM::GetValue(const vector<const Word*> &contextFactor,
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Forwards to the wrapped RandLM instance; the `source` argument is not used here.
void LanguageModelRandLM::InitializeForInput(InputType const& source) {
|
||||
m_lm->initThreadSpecificData(); // Creates thread-specific data iff compiled with multithreading.
|
||||
}
|
||||
// Asks the wrapped RandLM instance to drop its caches; the `source` argument is not used here.
void LanguageModelRandLM::CleanUpAfterSentenceProcessing(const InputType& source) {
|
||||
m_lm->clearCaches(); // clear caches
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -1,3 +1,5 @@
|
||||
#ifndef moses_LM_Rand_h
|
||||
#define moses_LM_Rand_h
|
||||
/***********************************************************************
|
||||
Moses - factored phrase-based language decoder
|
||||
Copyright (C) 2006 University of Edinburgh
|
||||
@ -16,14 +18,43 @@ You should have received a copy of the GNU Lesser General Public
|
||||
License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
***********************************************************************/
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <stdint.h>
|
||||
#include "SingleFactor.h"
|
||||
#include "moses/TypeDef.h"
|
||||
#include "moses/Word.h"
|
||||
//#include "RandLM.h"
|
||||
|
||||
#ifndef moses_LM_Rand_h
|
||||
#define moses_LM_Rand_h
|
||||
namespace randlm
|
||||
{
|
||||
class RandLM;
|
||||
}
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
class LanguageModelPointerState;
|
||||
LanguageModelPointerState *NewRandLM();
|
||||
// Language model feature backed by a randlm::RandLM instance, exposed through
// the LanguageModelSingleFactor interface.
class LanguageModelRandLM : public LanguageModelSingleFactor
|
||||
{
|
||||
public:
|
||||
// `line` is handed on to the LanguageModelSingleFactor base; m_lm starts out null.
LanguageModelRandLM(const std::string &line);
|
||||
// Deletes the owned RandLM instance.
~LanguageModelRandLM();
|
||||
|
||||
// Creates the RandLM instance from m_filePath / m_nGramOrder, caches the OOV
// word id and builds the factor-id -> RandLM-id table.
void Load();
|
||||
virtual LMResult GetValue(const std::vector<const Word*> &contextFactor, State* finalState = NULL) const;
|
||||
// Forwards to RandLM::initThreadSpecificData().
void InitializeForInput(InputType const& source);
|
||||
// Forwards to RandLM::clearCaches().
void CleanUpAfterSentenceProcessing(const InputType& source);
|
||||
|
||||
protected:
|
||||
//std::vector<randlm::WordID> m_randlm_ids_vec;
|
||||
// RandLM word ids indexed by Moses factor id. Stored as uint32_t rather than
// randlm::WordID -- presumably so this header does not have to include
// RandLM.h (only a forward declaration of randlm::RandLM is used); confirm
// that randlm::WordID is a 32-bit unsigned type.
std::vector<uint32_t> m_randlm_ids_vec; // Ken made me do this
|
||||
|
||||
// Owned RandLM instance; null until Load() has run.
randlm::RandLM* m_lm;
|
||||
// RandLM id of the out-of-vocabulary token.
uint32_t m_oov_id;
|
||||
void CreateFactors(FactorCollection &factorCollection);
|
||||
// Looks the string up directly in the RandLM vocabulary.
uint32_t GetLmID( const std::string &str ) const;
|
||||
// Looks the factor up in m_randlm_ids_vec; returns m_oov_id when the factor id
// is outside the table.
uint32_t GetLmID( const Factor *factor ) const;
|
||||
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
|
@ -30,6 +30,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
#include "InputFileStream.h"
|
||||
#include "StaticData.h"
|
||||
#include "UserMessage.h"
|
||||
#include "util/exception.hh"
|
||||
|
||||
using namespace std;
|
||||
|
||||
@ -38,25 +39,18 @@ namespace Moses
|
||||
/** define allowed parameters */
|
||||
Parameter::Parameter()
|
||||
{
|
||||
AddParam("mapping", "description of decoding steps");
|
||||
AddParam("beam-threshold", "b", "threshold for threshold pruning");
|
||||
AddParam("config", "f", "location of the configuration file");
|
||||
AddParam("continue-partial-translation", "cpt", "start from nonempty hypothesis");
|
||||
AddParam("decoding-graph-backoff", "dpb", "only use subsequent decoding paths for unknown spans of given length");
|
||||
AddParam("dlm-model", "Order, factor and vocabulary file for discriminative LM. Use * for filename to indicate unlimited vocabulary.");
|
||||
AddParam("drop-unknown", "du", "drop unknown words instead of copying them");
|
||||
AddParam("disable-discarding", "dd", "disable hypothesis discarding");
|
||||
AddParam("factor-delimiter", "fd", "specify a different factor delimiter than the default");
|
||||
AddParam("generation-file", "location and properties of the generation table");
|
||||
AddParam("global-lexical-file", "gl", "discriminatively trained global lexical translation model file");
|
||||
AddParam("glm-feature", "discriminatively trained global lexical translation feature, sparse producer");
|
||||
AddParam("input-factors", "list of factors in the input");
|
||||
AddParam("input-file", "i", "location of the input file to be translated");
|
||||
AddParam("inputtype", "text (0), confusion network (1), word lattice (2) (default = 0)");
|
||||
AddParam("labeled-n-best-list", "print out labels for each weight type in n-best list. default is true");
|
||||
AddParam("lmodel-file", "location and properties of the language models");
|
||||
AddParam("lmodel-dub", "dictionary upper bounds of language models");
|
||||
AddParam("lmodel-oov-feature", "add language model oov feature, one per model");
|
||||
AddParam("mapping", "description of decoding steps");
|
||||
AddParam("max-partial-trans-opt", "maximum number of partial translation options per input span (during mapping steps)");
|
||||
AddParam("max-trans-opt-per-coverage", "maximum number of translation options per input span (after applying mapping steps)");
|
||||
AddParam("max-phrase-length", "maximum phrase length (default 20)");
|
||||
@ -68,16 +62,10 @@ Parameter::Parameter()
|
||||
AddParam("phrase-drop-allowed", "da", "if present, allow dropping of source words"); //da = drop any (word); see -du for comparison
|
||||
AddParam("report-all-factors", "report all factors in output, not just first");
|
||||
AddParam("report-all-factors-in-n-best", "Report all factors in n-best-lists. Default is false");
|
||||
#ifdef HAVE_SYNLM
|
||||
AddParam("slmodel-file", "location of the syntactic language model file(s)");
|
||||
AddParam("slmodel-factor", "factor to use with syntactic language model");
|
||||
AddParam("slmodel-beam", "beam width to use with syntactic language model's parser");
|
||||
#endif
|
||||
AddParam("stack", "s", "maximum stack size for histogram pruning");
|
||||
AddParam("stack-diversity", "sd", "minimum number of hypothesis of each coverage in stack (default 0)");
|
||||
AddParam("threads","th", "number of threads to use in decoding (defaults to single-threaded)");
|
||||
AddParam("translation-details", "T", "for each best hypothesis, report translation details to the given file");
|
||||
AddParam("ttable-file", "location and properties of the translation tables");
|
||||
AddParam("translation-option-threshold", "tot", "threshold for translation options relative to best for input phrase");
|
||||
AddParam("early-discarding-threshold", "edt", "threshold for constructing hypotheses based on estimate cost");
|
||||
AddParam("verbose", "v", "verbosity level of the logging");
|
||||
@ -103,6 +91,7 @@ Parameter::Parameter()
|
||||
AddParam("lmbr-r", "ngram precision decay value for lattice mbr");
|
||||
AddParam("lmbr-map-weight", "weight given to map solution when doing lattice MBR (default 0)");
|
||||
AddParam("lattice-hypo-set", "to use lattice as hypo set during lattice MBR");
|
||||
AddParam("lmodel-oov-feature", "add language model oov feature, one per model");
|
||||
AddParam("clean-lm-cache", "clean language model caches after N translations (default N=1)");
|
||||
AddParam("use-persistent-cache", "cache translation options across sentences (default true)");
|
||||
AddParam("persistent-cache-size", "maximum size of cache for translation options (default 10,000 input phrases)");
|
||||
@ -129,13 +118,6 @@ Parameter::Parameter()
|
||||
AddParam("source-label-overlap", "What happens if a span already has a label. 0=add more. 1=replace. 2=discard. Default is 0");
|
||||
AddParam("output-hypo-score", "Output the hypo score to stdout with the output string. For search error analysis. Default is false");
|
||||
AddParam("unknown-lhs", "file containing target lhs of unknown words. 1 per line: LHS prob");
|
||||
AddParam("phrase-pair-feature", "Source and target factors for phrase pair feature");
|
||||
AddParam("phrase-boundary-source-feature", "Source factors for phrase boundary feature");
|
||||
AddParam("phrase-boundary-target-feature", "Target factors for phrase boundary feature");
|
||||
AddParam("phrase-length-feature", "Count features for source length, target length, both of each phrase");
|
||||
AddParam("target-word-insertion-feature", "Count feature for each unaligned target word");
|
||||
AddParam("source-word-deletion-feature", "Count feature for each unaligned source word");
|
||||
AddParam("word-translation-feature", "Count feature for word translation according to word alignment");
|
||||
AddParam("cube-pruning-lazy-scoring", "cbls", "Don't fully score a hypothesis until it is popped");
|
||||
AddParam("parsing-algorithm", "Which parsing algorithm to use. 0=CYK+, 1=scope-3. (default = 0)");
|
||||
AddParam("search-algorithm", "Which search algorithm to use. 0=normal stack, 1=cube pruning, 2=cube growing, 4=stack with batched lm requests (default = 0)");
|
||||
@ -185,6 +167,27 @@ Parameter::Parameter()
|
||||
AddParam("text-type", "DEPRECATED. DO NOT USE. should be one of dev/devtest/test, used for domain adaptation features");
|
||||
AddParam("input-scores", "DEPRECATED. DO NOT USE. 2 numbers on 2 lines - [1] of scores on each edge of a confusion network or lattice input (default=1). [2] Number of 'real' word scores (0 or 1. default=0)");
|
||||
|
||||
AddParam("dlm-model", "DEPRECATED. DO NOT USE. Order, factor and vocabulary file for discriminative LM. Use * for filename to indicate unlimited vocabulary.");
|
||||
AddParam("generation-file", "DEPRECATED. DO NOT USE. location and properties of the generation table");
|
||||
AddParam("global-lexical-file", "gl", "DEPRECATED. DO NOT USE. discriminatively trained global lexical translation model file");
|
||||
AddParam("glm-feature", "DEPRECATED. DO NOT USE. discriminatively trained global lexical translation feature, sparse producer");
|
||||
AddParam("lmodel-file", "DEPRECATED. DO NOT USE. location and properties of the language models");
|
||||
AddParam("lmodel-dub", "DEPRECATED. DO NOT USE. dictionary upper bounds of language models");
|
||||
|
||||
#ifdef HAVE_SYNLM
|
||||
AddParam("slmodel-file", "DEPRECATED. DO NOT USE. location of the syntactic language model file(s)");
|
||||
AddParam("slmodel-factor", "DEPRECATED. DO NOT USE. factor to use with syntactic language model");
|
||||
AddParam("slmodel-beam", "DEPRECATED. DO NOT USE. beam width to use with syntactic language model's parser");
|
||||
#endif
|
||||
AddParam("ttable-file", "DEPRECATED. DO NOT USE. location and properties of the translation tables");
|
||||
AddParam("phrase-pair-feature", "DEPRECATED. DO NOT USE. Source and target factors for phrase pair feature");
|
||||
AddParam("phrase-boundary-source-feature", "DEPRECATED. DO NOT USE. Source factors for phrase boundary feature");
|
||||
AddParam("phrase-boundary-target-feature", "DEPRECATED. DO NOT USE. Target factors for phrase boundary feature");
|
||||
AddParam("phrase-length-feature", "DEPRECATED. DO NOT USE. Count features for source length, target length, both of each phrase");
|
||||
AddParam("target-word-insertion-feature", "DEPRECATED. DO NOT USE. Count feature for each unaligned target word");
|
||||
AddParam("source-word-deletion-feature", "DEPRECATED. DO NOT USE. Count feature for each unaligned source word");
|
||||
AddParam("word-translation-feature", "DEPRECATED. DO NOT USE. Count feature for word translation according to word alignment");
|
||||
|
||||
AddParam("weight-file", "wf", "feature weights file. Do *not* put weights for 'core' features in here - they go in moses.ini");
|
||||
|
||||
AddParam("weight", "weights for ALL models, 1 per line 'WeightName value'. Weight names can be repeated");
|
||||
@ -195,6 +198,8 @@ Parameter::Parameter()
|
||||
AddParam("print-id", "prefix translations with id. Default if false");
|
||||
|
||||
AddParam("alternate-weight-setting", "aws", "alternate set of weights to used per xml specification");
|
||||
|
||||
AddParam("placeholder-factor", "Which factor to use to store the original text for placeholders");
|
||||
}
|
||||
|
||||
Parameter::~Parameter()
|
||||
@ -305,9 +310,27 @@ bool Parameter::LoadParam(int argc, char* argv[])
|
||||
}
|
||||
}
|
||||
|
||||
// don't mix old and new format
|
||||
if ((isParamSpecified("feature") || isParamSpecified("weight"))
|
||||
&& (isParamSpecified("weight-slm") || isParamSpecified("weight-bl") || isParamSpecified("weight-d") ||
|
||||
isParamSpecified("weight-dlm") || isParamSpecified("weight-lrl") || isParamSpecified("weight-generation") ||
|
||||
isParamSpecified("weight-i") || isParamSpecified("weight-l") || isParamSpecified("weight-lex") ||
|
||||
isParamSpecified("weight-glm") || isParamSpecified("weight-wt") || isParamSpecified("weight-pp") ||
|
||||
isParamSpecified("weight-pb") || isParamSpecified("weight-t") || isParamSpecified("weight-w") ||
|
||||
isParamSpecified("weight-u") || isParamSpecified("weight-e") ||
|
||||
isParamSpecified("dlm-mode") || isParamSpecified("generation-file") || isParamSpecified("global-lexical-file") ||
|
||||
isParamSpecified("glm-feature") || isParamSpecified("lmodel-file") || isParamSpecified("lmodel-dub") ||
|
||||
isParamSpecified("slmodel-file") || isParamSpecified("slmodel-factor") ||
|
||||
isParamSpecified("slmodel-beam") || isParamSpecified("ttable-file") || isParamSpecified("phrase-pair-feature") ||
|
||||
isParamSpecified("phrase-boundary-source-feature") || isParamSpecified("phrase-boundary-target-feature") || isParamSpecified("phrase-length-feature") ||
|
||||
isParamSpecified("target-word-insertion-feature") || isParamSpecified("source-word-deletion-feature") || isParamSpecified("word-translation-feature")
|
||||
)
|
||||
) {
|
||||
UTIL_THROW(util::Exception, "Don't mix old and new ini file format");
|
||||
}
|
||||
|
||||
// convert old weights args to new format
|
||||
// WHAT IS GOING ON HERE??? - UG
|
||||
if (!isParamSpecified("feature")) // UG
|
||||
if (!isParamSpecified("feature"))
|
||||
ConvertWeightArgs();
|
||||
CreateWeightsMap();
|
||||
WeightOverwrite();
|
||||
|
@ -121,15 +121,22 @@ int Sentence::Read(std::istream& in,const std::vector<FactorType>& factorOrder)
|
||||
//const StaticData &staticData = StaticData::Instance();
|
||||
std::vector<XmlOption*> xmlOptionsList(0);
|
||||
std::vector< size_t > xmlWalls;
|
||||
std::vector< std::pair<size_t, std::string> > placeholders;
|
||||
|
||||
if (staticData.GetXmlInputType() != XmlPassThrough) {
|
||||
if (!ProcessAndStripXMLTags(line, xmlOptionsList, m_reorderingConstraint, xmlWalls, staticData.GetXmlBrackets().first, staticData.GetXmlBrackets().second)) {
|
||||
if (!ProcessAndStripXMLTags(line, xmlOptionsList, m_reorderingConstraint, xmlWalls, placeholders,
|
||||
staticData.GetXmlBrackets().first, staticData.GetXmlBrackets().second)) {
|
||||
const string msg("Unable to parse XML in line: " + line);
|
||||
TRACE_ERR(msg << endl);
|
||||
throw runtime_error(msg);
|
||||
}
|
||||
}
|
||||
|
||||
Phrase::CreateFromString(Input, factorOrder, line, factorDelimiter, NULL);
|
||||
|
||||
// placeholders
|
||||
ProcessPlaceholders(placeholders);
|
||||
|
||||
if (staticData.IsChart()) {
|
||||
InitStartEndWord();
|
||||
}
|
||||
@ -194,6 +201,22 @@ void Sentence::InitStartEndWord()
|
||||
AddWord(endWord);
|
||||
}
|
||||
|
||||
void Sentence::ProcessPlaceholders(const std::vector< std::pair<size_t, std::string> > &placeholders)
|
||||
{
|
||||
FactorType factorType = StaticData::Instance().GetPlaceholderFactor();
|
||||
if (factorType == NOT_FOUND) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < placeholders.size(); ++i) {
|
||||
size_t pos = placeholders[i].first;
|
||||
const string &str = placeholders[i].second;
|
||||
const Factor *factor = FactorCollection::Instance().AddFactor(str);
|
||||
Word &word = Phrase::GetWord(pos);
|
||||
word[factorType] = factor;
|
||||
}
|
||||
}
|
||||
|
||||
TranslationOptionCollection*
|
||||
Sentence::CreateTranslationOptionCollection() const
|
||||
{
|
||||
|
@ -57,6 +57,7 @@ private:
|
||||
NonTerminalSet m_defaultLabelSet;
|
||||
|
||||
void InitStartEndWord();
|
||||
void ProcessPlaceholders(const std::vector< std::pair<size_t, std::string> > &placeholders);
|
||||
|
||||
|
||||
public:
|
||||
|
@ -22,14 +22,11 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
|
||||
#include <string>
|
||||
#include "util/check.hh"
|
||||
#include "moses/TranslationModel/PhraseDictionaryTreeAdaptor.h"
|
||||
#include "moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.h"
|
||||
#include "moses/TranslationModel/PhraseDictionaryMemory.h"
|
||||
#include "moses/TranslationModel/CompactPT/PhraseDictionaryCompact.h"
|
||||
#include "moses/TranslationModel/PhraseDictionaryMultiModel.h"
|
||||
#include "moses/TranslationModel/PhraseDictionaryMultiModelCounts.h"
|
||||
#include "moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.h"
|
||||
#include "moses/TranslationModel/PhraseDictionaryDynSuffixArray.h"
|
||||
|
||||
#include "moses/FF/Factory.h"
|
||||
#include "moses/FF/WordPenaltyProducer.h"
|
||||
#include "moses/FF/UnknownWordPenaltyProducer.h"
|
||||
#include "moses/FF/InputFeature.h"
|
||||
|
||||
#include "DecodeStepTranslation.h"
|
||||
#include "DecodeStepGeneration.h"
|
||||
@ -46,37 +43,6 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
#include "InputFileStream.h"
|
||||
#include "ScoreComponentCollection.h"
|
||||
|
||||
#include "moses/FF/BleuScoreFeature.h"
|
||||
#include "moses/FF/TargetWordInsertionFeature.h"
|
||||
#include "moses/FF/SourceWordDeletionFeature.h"
|
||||
#include "moses/FF/GlobalLexicalModel.h"
|
||||
#include "moses/FF/GlobalLexicalModelUnlimited.h"
|
||||
#include "moses/FF/UnknownWordPenaltyProducer.h"
|
||||
#include "moses/FF/WordTranslationFeature.h"
|
||||
#include "moses/FF/TargetBigramFeature.h"
|
||||
#include "moses/FF/TargetNgramFeature.h"
|
||||
#include "moses/FF/PhraseBoundaryFeature.h"
|
||||
#include "moses/FF/PhrasePairFeature.h"
|
||||
#include "moses/FF/PhraseLengthFeature.h"
|
||||
#include "moses/FF/DistortionScoreProducer.h"
|
||||
#include "moses/FF/WordPenaltyProducer.h"
|
||||
#include "moses/FF/InputFeature.h"
|
||||
#include "moses/FF/PhrasePenalty.h"
|
||||
#include "moses/FF/OSM-Feature/OpSequenceModel.h"
|
||||
|
||||
#include "LM/Ken.h"
|
||||
#ifdef LM_IRST
|
||||
#include "LM/IRST.h"
|
||||
#endif
|
||||
|
||||
#ifdef LM_SRI
|
||||
#include "LM/SRI.h"
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_SYNLM
|
||||
#include "SyntacticLanguageModel.h"
|
||||
#endif
|
||||
|
||||
#ifdef WITH_THREADS
|
||||
#include <boost/thread.hpp>
|
||||
#endif
|
||||
@ -556,10 +522,18 @@ bool StaticData::LoadData(Parameter *parameter)
|
||||
cerr << "XML tags opening and closing brackets for XML input are: " << m_xmlBrackets.first << " and " << m_xmlBrackets.second << endl;
|
||||
}
|
||||
|
||||
if (m_parameter->GetParam("placeholder-factor").size() > 0) {
|
||||
m_placeHolderFactor = Scan<FactorType>(m_parameter->GetParam("placeholder-factor")[0]);
|
||||
} else {
|
||||
m_placeHolderFactor = NOT_FOUND;
|
||||
}
|
||||
|
||||
|
||||
// all features
|
||||
map<string, int> featureIndexMap;
|
||||
|
||||
const vector<string> &features = m_parameter->GetParam("feature");
|
||||
FeatureRegistry registry;
|
||||
for (size_t i = 0; i < features.size(); ++i) {
|
||||
const string &line = Trim(features[i]);
|
||||
cerr << "line=" << line << endl;
|
||||
@ -569,151 +543,8 @@ bool StaticData::LoadData(Parameter *parameter)
|
||||
vector<string> toks = Tokenize(line);
|
||||
|
||||
const string &feature = toks[0];
|
||||
//int featureIndex = GetFeatureIndex(featureIndexMap, feature);
|
||||
|
||||
if (feature == "GlobalLexicalModel") {
|
||||
GlobalLexicalModel *model = new GlobalLexicalModel(line);
|
||||
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
|
||||
SetWeights(model, weights);
|
||||
} else if (feature == "GlobalLexicalModelUnlimited") {
|
||||
GlobalLexicalModelUnlimited *model = NULL; //new GlobalLexicalModelUnlimited(line);
|
||||
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
|
||||
SetWeights(model, weights);
|
||||
} else if (feature == "SourceWordDeletionFeature") {
|
||||
SourceWordDeletionFeature *model = new SourceWordDeletionFeature(line);
|
||||
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
|
||||
//SetWeights(model, weights);
|
||||
} else if (feature == "TargetWordInsertionFeature") {
|
||||
TargetWordInsertionFeature *model = new TargetWordInsertionFeature(line);
|
||||
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
|
||||
//SetWeights(model, weights);
|
||||
} else if (feature == "PhraseBoundaryFeature") {
|
||||
PhraseBoundaryFeature *model = new PhraseBoundaryFeature(line);
|
||||
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
|
||||
//SetWeights(model, weights);
|
||||
} else if (feature == "PhraseLengthFeature") {
|
||||
PhraseLengthFeature *model = new PhraseLengthFeature(line);
|
||||
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
|
||||
//SetWeights(model, weights);
|
||||
} else if (feature == "WordTranslationFeature") {
|
||||
WordTranslationFeature *model = new WordTranslationFeature(line);
|
||||
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
|
||||
//SetWeights(model, weights);
|
||||
} else if (feature == "TargetBigramFeature") {
|
||||
TargetBigramFeature *model = new TargetBigramFeature(line);
|
||||
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
|
||||
//SetWeights(model, weights);
|
||||
} else if (feature == "TargetNgramFeature") {
|
||||
TargetNgramFeature *model = new TargetNgramFeature(line);
|
||||
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
|
||||
//SetWeights(model, weights);
|
||||
} else if (feature == "PhrasePairFeature") {
|
||||
PhrasePairFeature *model = new PhrasePairFeature(line);
|
||||
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
|
||||
//SetWeights(model, weights);
|
||||
} else if (feature == "LexicalReordering") {
|
||||
LexicalReordering *model = new LexicalReordering(line);
|
||||
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
|
||||
SetWeights(model, weights);
|
||||
} else if (feature == "KENLM") {
|
||||
LanguageModel *model = ConstructKenLM(feature, line);
|
||||
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
|
||||
SetWeights(model, weights);
|
||||
}
|
||||
#ifdef LM_IRST
|
||||
else if (feature == "IRSTLM") {
|
||||
LanguageModelIRST *model = new LanguageModelIRST(line);
|
||||
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
|
||||
SetWeights(model, weights);
|
||||
}
|
||||
#endif
|
||||
#ifdef LM_SRI
|
||||
else if (feature == "SRILM") {
|
||||
LanguageModelSRI *model = new LanguageModelSRI(line);
|
||||
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
|
||||
SetWeights(model, weights);
|
||||
}
|
||||
#endif
|
||||
else if (feature == "Generation") {
|
||||
GenerationDictionary *model = new GenerationDictionary(line);
|
||||
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
|
||||
SetWeights(model, weights);
|
||||
} else if (feature == "BleuScoreFeature") {
|
||||
BleuScoreFeature *model = new BleuScoreFeature(line);
|
||||
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
|
||||
SetWeights(model, weights);
|
||||
} else if (feature == "Distortion") {
|
||||
DistortionScoreProducer *model = new DistortionScoreProducer(line);
|
||||
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
|
||||
SetWeights(model, weights);
|
||||
} else if (feature == "WordPenalty") {
|
||||
WordPenaltyProducer *model = new WordPenaltyProducer(line);
|
||||
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
|
||||
SetWeights(model, weights);
|
||||
} else if (feature == "UnknownWordPenalty") {
|
||||
UnknownWordPenaltyProducer *model = new UnknownWordPenaltyProducer(line);
|
||||
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
|
||||
if (weights.size() == 0)
|
||||
weights.push_back(1.0f);
|
||||
SetWeights(model, weights);
|
||||
} else if (feature == "InputFeature") {
|
||||
InputFeature *model = new InputFeature(line);
|
||||
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
|
||||
SetWeights(model, weights);
|
||||
|
||||
} else if (feature == "PhraseDictionaryBinary") {
|
||||
PhraseDictionaryTreeAdaptor* model = new PhraseDictionaryTreeAdaptor(line);
|
||||
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
|
||||
SetWeights(model, weights);
|
||||
} else if (feature == "PhraseDictionaryOnDisk") {
|
||||
PhraseDictionaryOnDisk* model = new PhraseDictionaryOnDisk(line);
|
||||
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
|
||||
SetWeights(model, weights);
|
||||
} else if (feature == "PhraseDictionaryMemory") {
|
||||
PhraseDictionaryMemory* model = new PhraseDictionaryMemory(line);
|
||||
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
|
||||
SetWeights(model, weights);
|
||||
} else if (feature == "PhraseDictionaryCompact") {
|
||||
PhraseDictionaryCompact* model = new PhraseDictionaryCompact(line);
|
||||
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
|
||||
SetWeights(model, weights);
|
||||
} else if (feature == "PhraseDictionaryMultiModel") {
|
||||
PhraseDictionaryMultiModel* model = new PhraseDictionaryMultiModel(line);
|
||||
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
|
||||
SetWeights(model, weights);
|
||||
} else if (feature == "PhraseDictionaryMultiModelCounts") {
|
||||
PhraseDictionaryMultiModelCounts* model = new PhraseDictionaryMultiModelCounts(line);
|
||||
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
|
||||
SetWeights(model, weights);
|
||||
} else if (feature == "PhraseDictionaryALSuffixArray") {
|
||||
PhraseDictionaryALSuffixArray* model = new PhraseDictionaryALSuffixArray(line);
|
||||
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
|
||||
SetWeights(model, weights);
|
||||
} else if (feature == "PhraseDictionaryDynSuffixArray") {
|
||||
PhraseDictionaryDynSuffixArray* model = new PhraseDictionaryDynSuffixArray(line);
|
||||
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
|
||||
SetWeights(model, weights);
|
||||
} else if (feature == "OpSequenceModel") {
|
||||
OpSequenceModel* model = new OpSequenceModel(line);
|
||||
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
|
||||
SetWeights(model, weights);
|
||||
} else if (feature == "PhrasePenalty") {
|
||||
PhrasePenalty* model = new PhrasePenalty(line);
|
||||
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
|
||||
SetWeights(model, weights);
|
||||
}
|
||||
|
||||
#ifdef HAVE_SYNLM
|
||||
else if (feature == "SyntacticLanguageModel") {
|
||||
SyntacticLanguageModel *model = new SyntacticLanguageModel(line);
|
||||
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
|
||||
SetWeights(model, weights);
|
||||
}
|
||||
#endif
|
||||
else {
|
||||
UserMessage::Add("Unknown feature function:" + feature);
|
||||
return false;
|
||||
}
|
||||
registry.Construct(feature, line);
|
||||
}
|
||||
|
||||
OverrideFeatures();
|
||||
@ -1317,7 +1148,7 @@ void StaticData::OverrideFeatures()
|
||||
CHECK(keyVal.size() == 2);
|
||||
|
||||
VERBOSE(1, "Override " << ff.GetScoreProducerDescription() << " "
|
||||
<< keyVal[0] << "=" << keyVal[1] << endl);
|
||||
<< keyVal[0] << "=" << keyVal[1] << endl);
|
||||
|
||||
ff.SetParameter(keyVal[0], keyVal[1]);
|
||||
|
||||
|
@ -214,6 +214,8 @@ protected:
|
||||
std::map< std::string, std::set< std::string > > m_weightSettingIgnoreFF; // feature function
|
||||
std::map< std::string, std::set< size_t > > m_weightSettingIgnoreDP; // decoding path
|
||||
|
||||
FactorType m_placeHolderFactor;
|
||||
|
||||
StaticData();
|
||||
|
||||
void LoadChartDecodingParameters();
|
||||
@ -768,6 +770,9 @@ public:
|
||||
|
||||
void OverrideFeatures();
|
||||
|
||||
//! Returns the stored placeholder factor type (m_placeHolderFactor).
FactorType GetPlaceholderFactor() const {
|
||||
return m_placeHolderFactor;
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -85,7 +85,7 @@ public:
|
||||
void Sort(bool adhereTableLimit, size_t tableLimit);
|
||||
|
||||
void Clear() {
|
||||
RemoveAllInColl(m_collection);
|
||||
RemoveAllInColl(m_collection);
|
||||
}
|
||||
|
||||
};
|
||||
|
@ -154,12 +154,10 @@ void ChartRuleLookupManagerMemory::GetChartRuleCollection(
|
||||
const PhraseDictionaryNodeMemory &node = dottedRule.GetLastNode();
|
||||
|
||||
// look up target sides
|
||||
const TargetPhraseCollection *tpc = node.GetTargetPhraseCollection();
|
||||
const TargetPhraseCollection &tpc = node.GetTargetPhraseCollection();
|
||||
|
||||
// add the fully expanded rule (with lexical target side)
|
||||
if (tpc != NULL) {
|
||||
AddCompletedRule(dottedRule, *tpc, range, outColl);
|
||||
}
|
||||
AddCompletedRule(dottedRule, tpc, range, outColl);
|
||||
}
|
||||
|
||||
dottedRuleCol.Clear(relEndPos+1);
|
||||
|
@ -153,12 +153,10 @@ void ChartRuleLookupManagerMemoryPerSentence::GetChartRuleCollection(
|
||||
const PhraseDictionaryNodeMemory &node = dottedRule.GetLastNode();
|
||||
|
||||
// look up target sides
|
||||
const TargetPhraseCollection *tpc = node.GetTargetPhraseCollection();
|
||||
const TargetPhraseCollection &tpc = node.GetTargetPhraseCollection();
|
||||
|
||||
// add the fully expanded rule (with lexical target side)
|
||||
if (tpc != NULL) {
|
||||
AddCompletedRule(dottedRule, *tpc, range, outColl);
|
||||
}
|
||||
AddCompletedRule(dottedRule, tpc, range, outColl);
|
||||
}
|
||||
|
||||
dottedRuleCol.Clear(relEndPos+1);
|
||||
|
@ -51,7 +51,7 @@ TargetPhraseCollection &PhraseDictionaryMemory::GetOrCreateTargetPhraseCollectio
|
||||
, const Word *sourceLHS)
|
||||
{
|
||||
PhraseDictionaryNodeMemory &currNode = GetOrCreateNode(source, target, sourceLHS);
|
||||
return currNode.GetOrCreateTargetPhraseCollection();
|
||||
return currNode.GetTargetPhraseCollection();
|
||||
}
|
||||
|
||||
const TargetPhraseCollection *PhraseDictionaryMemory::GetTargetPhraseCollection(const Phrase& sourceOrig) const
|
||||
@ -70,7 +70,7 @@ const TargetPhraseCollection *PhraseDictionaryMemory::GetTargetPhraseCollection(
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return currNode->GetTargetPhraseCollection();
|
||||
return &currNode->GetTargetPhraseCollection();
|
||||
}
|
||||
|
||||
PhraseDictionaryNodeMemory &PhraseDictionaryMemory::GetOrCreateNode(const Phrase &source
|
||||
|
@ -223,7 +223,6 @@ void PhraseDictionaryMultiModelCounts::Load()
|
||||
|
||||
const TargetPhraseCollection *PhraseDictionaryMultiModelCounts::GetTargetPhraseCollection(const Phrase& src) const
|
||||
{
|
||||
|
||||
vector<vector<float> > multimodelweights;
|
||||
bool normalize;
|
||||
normalize = (m_mode == "interpolate") ? true : false;
|
||||
@ -346,11 +345,11 @@ float PhraseDictionaryMultiModelCounts::GetTargetCount(const Phrase &target, siz
|
||||
{
|
||||
|
||||
const PhraseDictionary &pd = *m_inverse_pd[modelIndex];
|
||||
TargetPhraseCollection *ret_raw = (TargetPhraseCollection*) pd.GetTargetPhraseCollection(target);
|
||||
const TargetPhraseCollection *ret_raw = pd.GetTargetPhraseCollection(target);
|
||||
|
||||
// in inverse mode, we want the first score of the first phrase pair (note: if we were to work with truly symmetric models, it would be the third score)
|
||||
if (ret_raw != NULL) {
|
||||
TargetPhrase * targetPhrase = *(ret_raw->begin());
|
||||
if (ret_raw && ret_raw->GetSize() > 0) {
|
||||
const TargetPhrase * targetPhrase = *(ret_raw->begin());
|
||||
return UntransformScore(targetPhrase->GetScoreBreakdown().GetScoresForProducer(&pd)[0]);
|
||||
}
|
||||
|
||||
|
@ -133,10 +133,10 @@ public:
|
||||
const PhraseDictionaryNodeMemory *GetChild(const Word &sourceTerm) const;
|
||||
const PhraseDictionaryNodeMemory *GetChild(const Word &sourceNonTerm, const Word &targetNonTerm) const;
|
||||
|
||||
const TargetPhraseCollection *GetTargetPhraseCollection() const {
|
||||
return &m_targetPhraseCollection;
|
||||
const TargetPhraseCollection &GetTargetPhraseCollection() const {
|
||||
return m_targetPhraseCollection;
|
||||
}
|
||||
TargetPhraseCollection &GetOrCreateTargetPhraseCollection() {
|
||||
TargetPhraseCollection &GetTargetPhraseCollection() {
|
||||
return m_targetPhraseCollection;
|
||||
}
|
||||
|
||||
|
@ -263,7 +263,7 @@ TargetPhraseCollection &PhraseDictionaryFuzzyMatch::GetOrCreateTargetPhraseColle
|
||||
, const Word *sourceLHS)
|
||||
{
|
||||
PhraseDictionaryNodeMemory &currNode = GetOrCreateNode(rootNode, source, target, sourceLHS);
|
||||
return currNode.GetOrCreateTargetPhraseCollection();
|
||||
return currNode.GetTargetPhraseCollection();
|
||||
}
|
||||
|
||||
PhraseDictionaryNodeMemory &PhraseDictionaryFuzzyMatch::GetOrCreateNode(PhraseDictionaryNodeMemory &rootNode
|
||||
|
20
moses/Util.h
20
moses/Util.h
@ -233,6 +233,26 @@ inline void TokenizeMultiCharSeparator(std::vector<std::string> &output
|
||||
output.push_back(Trim(str.substr(pos, nextPos - pos)));
|
||||
}
|
||||
|
||||
/** Split only at the first delimiter. Used by class FeatureFunction to parse key=value pairs.
|
||||
* The value may itself contain the = character.
|
||||
*/
|
||||
inline std::vector<std::string> TokenizeFirstOnly(const std::string& str,
|
||||
const std::string& delimiters = " \t")
|
||||
{
|
||||
std::vector<std::string> tokens;
|
||||
std::string::size_type pos = str.find_first_of(delimiters);
|
||||
|
||||
if (std::string::npos != pos) {
|
||||
// Found a token, add it to the vector.
|
||||
tokens.push_back(str.substr(0, pos));
|
||||
tokens.push_back(str.substr(pos + 1, str.size() - pos - 1));
|
||||
} else {
|
||||
tokens.push_back(str);
|
||||
}
|
||||
|
||||
return tokens;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Convert vector of type T to string
|
||||
|
@ -134,6 +134,11 @@ public:
|
||||
return Compare(*this, compare) != 0;
|
||||
}
|
||||
|
||||
int Compare(const Word &other) const {
|
||||
return Compare(*this, other);
|
||||
}
|
||||
|
||||
|
||||
/* static functions */
|
||||
|
||||
/** transitive comparison of 2 word objects. Used by operator<.
|
||||
|
@ -150,10 +150,13 @@ vector<string> TokenizeXml(const string& str, const std::string& lbrackStr, cons
|
||||
* \param rbrackStr xml tag's right bracket string, typically ">"
|
||||
*/
|
||||
bool ProcessAndStripXMLTags(string &line, vector<XmlOption*> &res, ReorderingConstraint &reorderingConstraint, vector< size_t > &walls,
|
||||
std::vector< std::pair<size_t, std::string> > &placeholders,
|
||||
const std::string& lbrackStr, const std::string& rbrackStr)
|
||||
{
|
||||
//parse XML markup in translation line
|
||||
|
||||
const StaticData &staticData = StaticData::Instance();
|
||||
|
||||
// no xml tag? we're done.
|
||||
//if (line.find_first_of('<') == string::npos) {
|
||||
if (line.find(lbrackStr) == string::npos) {
|
||||
@ -172,8 +175,8 @@ bool ProcessAndStripXMLTags(string &line, vector<XmlOption*> &res, ReorderingCon
|
||||
string cleanLine; // return string (text without xml)
|
||||
size_t wordPos = 0; // position in sentence (in terms of number of words)
|
||||
|
||||
const vector<FactorType> &outputFactorOrder = StaticData::Instance().GetOutputFactorOrder();
|
||||
const string &factorDelimiter = StaticData::Instance().GetFactorDelimiter();
|
||||
const vector<FactorType> &outputFactorOrder = staticData.GetOutputFactorOrder();
|
||||
const string &factorDelimiter = staticData.GetFactorDelimiter();
|
||||
|
||||
// loop through the tokens
|
||||
for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++) {
|
||||
@ -290,6 +293,16 @@ bool ProcessAndStripXMLTags(string &line, vector<XmlOption*> &res, ReorderingCon
|
||||
reorderingConstraint.SetZone( startPos, endPos-1 );
|
||||
}
|
||||
|
||||
// named-entity placeholder: the tag must span exactly one word
|
||||
else if (tagName == "ne") {
|
||||
if (startPos != (endPos - 1)) {
|
||||
TRACE_ERR("ERROR: Placeholder must only span 1 word: " << line << endl);
|
||||
return false;
|
||||
}
|
||||
string entity = ParseXmlTagAttribute(tagContent,"entity");
|
||||
placeholders.push_back(std::pair<size_t, std::string>(startPos, entity));
|
||||
}
|
||||
|
||||
// default: opening tag that specifies translation options
|
||||
else {
|
||||
if (startPos >= endPos) {
|
||||
@ -329,7 +342,7 @@ bool ProcessAndStripXMLTags(string &line, vector<XmlOption*> &res, ReorderingCon
|
||||
}
|
||||
|
||||
// store translation options into members
|
||||
if (StaticData::Instance().GetXmlInputType() != XmlIgnore) {
|
||||
if (staticData.GetXmlInputType() != XmlIgnore) {
|
||||
// only store options if we aren't ignoring them
|
||||
for (size_t i=0; i<altTexts.size(); ++i) {
|
||||
Phrase sourcePhrase; // TODO don't know what the source phrase is
|
||||
|
@ -31,6 +31,7 @@ bool isXmlTag(const std::string& tag, const std::string& lbrackStr="<", const st
|
||||
std::vector<std::string> TokenizeXml(const std::string& str, const std::string& lbrackStr="<", const std::string& rbrackStr=">");
|
||||
|
||||
bool ProcessAndStripXMLTags(std::string &line, std::vector<XmlOption*> &res, ReorderingConstraint &reorderingConstraint, std::vector< size_t > &walls,
|
||||
std::vector< std::pair<size_t, std::string> > &placeholders,
|
||||
const std::string& lbrackStr="<", const std::string& rbrackStr=">");
|
||||
|
||||
}
|
||||
|
@ -388,6 +388,12 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[],
|
||||
if (vm.count("UnpairedExtractFormat")) {
|
||||
options.unpairedExtractFormat = true;
|
||||
}
|
||||
|
||||
// Workaround for extract-parallel issue.
|
||||
if (options.sentenceOffset > 0) {
|
||||
options.glueGrammarFile.clear();
|
||||
options.unknownWordFile.clear();
|
||||
}
|
||||
}
|
||||
|
||||
void ExtractGHKM::Error(const std::string &msg) const
|
||||
|
@ -371,6 +371,23 @@ alignment-symmetrization-method = grow-diag-final-and
|
||||
#
|
||||
#biconcor = $moses-script-dir/ems/biconcor/biconcor
|
||||
|
||||
############################################################
|
||||
|
||||
### use of Operation Sequence Model
|
||||
### Durrani, Schmid and Fraser. (2011): "A Joint Sequence Translation Model with Integrated Reordering"
|
||||
|
||||
#operation-sequence-model = "yes"
|
||||
#operation-sequence-model-order = 5
|
||||
### compile Moses with --max-kenlm-order=9 if higher order is required
|
||||
|
||||
### if OSM training should be skipped,
|
||||
# point to OSM Model
|
||||
#
|
||||
# osm-model =
|
||||
|
||||
############################################################
|
||||
|
||||
|
||||
### lexicalized reordering: specify orientation type
|
||||
# (default: only distance-based reordering model)
|
||||
#
|
||||
|
@ -355,6 +355,23 @@ alignment-symmetrization-method = grow-diag-final-and
|
||||
#
|
||||
#biconcor = $moses-script-dir/ems/biconcor/biconcor
|
||||
|
||||
############################################################
|
||||
|
||||
### use of Operation Sequence Model
|
||||
### Durrani, Schmid and Fraser. (2011): "A Joint Sequence Translation Model with Integrated Reordering"
|
||||
|
||||
#operation-sequence-model = "yes"
|
||||
#operation-sequence-model-order = 5
|
||||
### compile Moses with --max-kenlm-order=9 if higher order is required
|
||||
|
||||
### if OSM training should be skipped,
|
||||
# point to OSM Model
|
||||
#
|
||||
# osm-model =
|
||||
|
||||
############################################################
|
||||
|
||||
|
||||
### lexicalized reordering: specify orientation type
|
||||
# (default: only distance-based reordering model)
|
||||
#
|
||||
|
@ -1,4 +1,5 @@
|
||||
#!/usr/bin/perl -w
|
||||
package ph_numbers;
|
||||
|
||||
# Script to recognize and replace numbers in Moses training corpora
|
||||
# and decoder input
|
||||
@ -7,45 +8,65 @@
|
||||
|
||||
use strict;
|
||||
|
||||
run() unless caller();
|
||||
use Getopt::Std;
|
||||
|
||||
my $debug = $ENV{DEBUG} || 0;
|
||||
|
||||
my %opts;
|
||||
if(!getopts('s:t:cm:hl',\%opts) || $opts{h}) {
|
||||
print "Usage: perl $0 [-s source_locale][-t target_locale][-c][-h][-l][-m symbol] < in > out\n";
|
||||
exit;
|
||||
sub run {
|
||||
my %opts;
|
||||
if(!getopts('s:t:cm:hl',\%opts) || $opts{h}) {
|
||||
print "Usage: perl $0 [-s source_locale][-t target_locale][-c][-h][-l][-m symbol] < in > out\n";
|
||||
exit;
|
||||
}
|
||||
my $sourceLocale = $opts{s} || "";
|
||||
my $targetLocale = $opts{t} || "";
|
||||
my $numberSymbol = $opts{m} || '@NUM@';
|
||||
while(<>) {
|
||||
chomp;
|
||||
print recognize($_,$opts{c},$opts{l},$numberSymbol,$_),"\n";
|
||||
}
|
||||
}
|
||||
my $sourceLocale = $opts{s} || "";
|
||||
my $targetLocale = $opts{t} || "";
|
||||
my $numberSymbol = $opts{m} || '@NUM@';
|
||||
|
||||
while(<>) {
|
||||
sub recognize {
|
||||
my $line = shift;
|
||||
my $corpusMode = shift;
|
||||
my $legacyMode = shift;
|
||||
my $numberSymbol = shift || '@NUM@';
|
||||
|
||||
# [-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?
|
||||
# while(/\G(.*?)\s*([+-]?\p{Digit}+[+-\p{Digit}\.,eE])/) {
|
||||
chomp;
|
||||
my $output = "";
|
||||
my $remainder = "";
|
||||
while(/\G(.*?)(\s*)([+-]?\p{Digit}*[\.,]?\p{Digit}+[\p{Digit}\.,+-eE]*)/g) {
|
||||
print STDERR "Between: x$1x\n" if $debug;
|
||||
print STDERR "Number: x$3x\n" if $debug;
|
||||
$output .= $1;
|
||||
if($opts{c}) {
|
||||
while($line =~ /\G(.*?)(\s*)([+-]?\p{Digit}*[\.,]?\p{Digit}+[\p{Digit}\.,+-eE]*)/g) {
|
||||
my $between = $1;
|
||||
my $number = $3;
|
||||
print STDERR "Between: x${between}x\n" if $debug;
|
||||
print STDERR "Number: x${number}x\n" if $debug;
|
||||
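# If there are more numbers separated by whitespace, add these. FIXME: the loop condition below assigns ('=') instead of binding ('=~'), so it matches against $_ and overwrites $line.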
|
||||
my $numberContinuation = "";
|
||||
while($line = /\G(\s+)([\p{Digit}\.,+-eE]*)/g) {
|
||||
$numberContinuation .= $1.$2;
|
||||
}
|
||||
$number .= $numberContinuation;
|
||||
$output .= $between;
|
||||
if($corpusMode) {
|
||||
$output .= $2.$numberSymbol;
|
||||
}
|
||||
else {
|
||||
if($opts{l}) {
|
||||
$output .= $2."<ne translation=\"$3\">$numberSymbol</ne>";
|
||||
if($legacyMode) {
|
||||
$output .= $2."<ne translation=\"$number\">$numberSymbol</ne>";
|
||||
}
|
||||
else {
|
||||
$output .= $2."<ne translation=\"$numberSymbol\" entity=\"$3\">$numberSymbol</ne>";
|
||||
$output .= $2."<ne translation=\"$numberSymbol\" entity=\"$number\">$numberSymbol</ne>";
|
||||
}
|
||||
}
|
||||
$remainder = $';
|
||||
}
|
||||
print STDERR "Remainder: x".$remainder."x\n" if $debug;
|
||||
print STDERR "\n" if $debug;
|
||||
$output .= $remainder if $remainder;
|
||||
$output .= "\n";
|
||||
print $output;
|
||||
print STDERR "Remainder: x".$remainder."x\n" if $debug;
|
||||
print STDERR "\n" if $debug;
|
||||
$output .= $remainder if $remainder;
|
||||
return $output;
|
||||
}
|
||||
|
||||
1;
|
||||
|
@ -33,6 +33,9 @@ while(<STDIN>) {
|
||||
s/\(/*LRB*/g;
|
||||
s/\)/*RRB*/g;
|
||||
|
||||
# handle @ (the parser does something weird with these)
|
||||
s/\@/\\\@/g;
|
||||
|
||||
print TMP $_;
|
||||
}
|
||||
close(TMP);
|
||||
@ -42,6 +45,7 @@ print STDERR $cmd."\n";
|
||||
|
||||
open(PARSE,"$cmd|");
|
||||
while(<PARSE>) {
|
||||
s/\\\@/\@/g;
|
||||
print $_;
|
||||
}
|
||||
close(PARSE);
|
||||
|
Loading…
Reference in New Issue
Block a user