mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-25 04:43:03 +03:00
Merge branch 'master' of github.com:moses-smt/mosesdecoder
This commit is contained in:
commit
27eee55a57
5
NOTICE
5
NOTICE
@ -1,5 +0,0 @@
|
||||
This code includes data from Daniel Naber's Language Tools (czech abbreviations).
|
||||
|
||||
This code includes data from czech wiktionary (also czech abbreviations).
|
||||
|
||||
|
@ -312,7 +312,7 @@ Moses::TargetPhrase *TargetPhrase::ConvertToMoses(const std::vector<Moses::Facto
|
||||
// property
|
||||
ret->SetProperties(m_property);
|
||||
|
||||
ret->Evaluate(mosesSP, phraseDict.GetFeaturesToApply());
|
||||
ret->EvaluateInIsolation(mosesSP, phraseDict.GetFeaturesToApply());
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
@ -36,11 +36,15 @@
|
||||
<tool id="cdt.managedbuild.tool.gnu.c.linker.exe.debug.1950007837" name="GCC C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.exe.debug"/>
|
||||
<tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.110628197" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug">
|
||||
<option id="gnu.cpp.link.option.libs.1393924562" name="Libraries (-l)" superClass="gnu.cpp.link.option.libs" valueType="libs">
|
||||
<listOptionValue builtIn="false" value="moses"/>
|
||||
<listOptionValue builtIn="false" value="util"/>
|
||||
<listOptionValue builtIn="false" value="boost_iostreams"/>
|
||||
<listOptionValue builtIn="false" value="z"/>
|
||||
</option>
|
||||
<option id="gnu.cpp.link.option.paths.1967422094" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
|
||||
<listOptionValue builtIn="false" value=""${workspace_loc:}/../../boost/lib64""/>
|
||||
<listOptionValue builtIn="false" value=""${workspace_loc:}/moses/Debug""/>
|
||||
<listOptionValue builtIn="false" value=""${workspace_loc:}/util/Debug""/>
|
||||
</option>
|
||||
<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.1093223502" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
|
||||
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
|
||||
@ -52,6 +56,15 @@
|
||||
</tool>
|
||||
</toolChain>
|
||||
</folderInfo>
|
||||
<fileInfo id="cdt.managedbuild.config.gnu.exe.debug.2091728208.911524129" name="PropertiesConsolidator.cpp" rcbsApplicability="disable" resourcePath="PropertiesConsolidator.cpp" toolsToInvoke="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1626949654.741737356">
|
||||
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1626949654.741737356" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1626949654">
|
||||
<option id="gnu.cpp.compiler.option.include.paths.858416673" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
|
||||
<listOptionValue builtIn="false" value=""${workspace_loc:}/../../boost/include""/>
|
||||
<listOptionValue builtIn="false" value=""${workspace_loc}/../..""/>
|
||||
</option>
|
||||
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.2042647079" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
|
||||
</tool>
|
||||
</fileInfo>
|
||||
</configuration>
|
||||
</storageModule>
|
||||
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
|
||||
|
@ -3,6 +3,8 @@
|
||||
<name>consolidate</name>
|
||||
<comment></comment>
|
||||
<projects>
|
||||
<project>moses</project>
|
||||
<project>util</project>
|
||||
</projects>
|
||||
<buildSpec>
|
||||
<buildCommand>
|
||||
@ -45,6 +47,16 @@
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/OutputFileStream.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>PropertiesConsolidator.cpp</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/PropertiesConsolidator.cpp</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>PropertiesConsolidator.h</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/PropertiesConsolidator.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>consolidate-main.cpp</name>
|
||||
<type>1</type>
|
||||
|
@ -1306,6 +1306,16 @@
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/SoftMatchingFeature.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>FF/SoftSourceSyntacticConstraintsFeature.cpp</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/SoftSourceSyntacticConstraintsFeature.cpp</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>FF/SoftSourceSyntacticConstraintsFeature.h</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/SoftSourceSyntacticConstraintsFeature.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>FF/SourceGHKMTreeInputMatchFeature.cpp</name>
|
||||
<type>1</type>
|
||||
@ -1686,6 +1696,16 @@
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/PP/NonTermContextProperty.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>PP/OrientationPhraseProperty.cpp</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/PP/OrientationPhraseProperty.cpp</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>PP/OrientationPhraseProperty.h</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/PP/OrientationPhraseProperty.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>PP/PhraseProperty.cpp</name>
|
||||
<type>1</type>
|
||||
|
@ -1,3 +1,4 @@
|
||||
The documentation for memory-mapped, dynamic suffix arrays has moved to
|
||||
http://www.statmt.org/moses/?n=Moses.AdvancedFeatures#ntoc40
|
||||
|
||||
Search for PhraseDictionaryBitextSampling.
|
@ -215,7 +215,7 @@ Hypothesis *BackwardsEdge::CreateHypothesis(const Hypothesis &hypothesis, const
|
||||
IFVERBOSE(2) {
|
||||
hypothesis.GetManager().GetSentenceStats().StopTimeBuildHyp();
|
||||
}
|
||||
newHypo->Evaluate(m_futurescore);
|
||||
newHypo->EvaluateWhenApplied(m_futurescore);
|
||||
|
||||
return newHypo;
|
||||
}
|
||||
|
@ -60,7 +60,7 @@ ChartCell::~ChartCell() {}
|
||||
|
||||
/** Add the given hypothesis to the cell.
|
||||
* Returns true if added, false if not. Maybe it already exists in the collection or score falls below threshold etc.
|
||||
* This function just calls the correspondind AddHypothesis() in ChartHypothesisCollection
|
||||
* This function just calls the corresponding AddHypothesis() in ChartHypothesisCollection
|
||||
* \param hypo Hypothesis to be added
|
||||
*/
|
||||
bool ChartCell::AddHypothesis(ChartHypothesis *hypo)
|
||||
|
@ -212,7 +212,7 @@ int ChartHypothesis::RecombineCompare(const ChartHypothesis &compare) const
|
||||
/** calculate total score
|
||||
* @todo this should be in ScoreBreakdown
|
||||
*/
|
||||
void ChartHypothesis::Evaluate()
|
||||
void ChartHypothesis::EvaluateWhenApplied()
|
||||
{
|
||||
const StaticData &staticData = StaticData::Instance();
|
||||
// total scores from prev hypos
|
||||
|
@ -144,7 +144,7 @@ public:
|
||||
|
||||
int RecombineCompare(const ChartHypothesis &compare) const;
|
||||
|
||||
void Evaluate();
|
||||
void EvaluateWhenApplied();
|
||||
|
||||
void AddArc(ChartHypothesis *loserHypo);
|
||||
void CleanupArcList();
|
||||
|
@ -56,7 +56,7 @@ ChartHypothesisCollection::~ChartHypothesisCollection()
|
||||
/** public function to add hypothesis to this collection.
|
||||
* Returns false if equiv hypo exists in collection, otherwise returns true.
|
||||
* Takes care of update arc list for n-best list creation.
|
||||
* Will delete hypo is it exist - once this function is call don't delete hypothesis.
|
||||
* Will delete hypo if it exists - once this function is called, don't delete the hypothesis.
|
||||
* \param hypo hypothesis to add
|
||||
* \param manager pointer back to manager
|
||||
*/
|
||||
|
@ -87,7 +87,7 @@ void ChartManager::ProcessSentence()
|
||||
m_translationOptionList.ApplyThreshold();
|
||||
|
||||
const InputPath &inputPath = m_parser.GetInputPath(range);
|
||||
m_translationOptionList.Evaluate(m_source, inputPath);
|
||||
m_translationOptionList.EvaluateWithSourceContext(m_source, inputPath);
|
||||
|
||||
// decode
|
||||
ChartCell &cell = m_hypoStackColl.Get(range);
|
||||
@ -143,7 +143,7 @@ void ChartManager::AddXmlChartOptions()
|
||||
|
||||
RuleCubeItem* item = new RuleCubeItem( *opt, m_hypoStackColl );
|
||||
ChartHypothesis* hypo = new ChartHypothesis(*opt, *item, *this);
|
||||
hypo->Evaluate();
|
||||
hypo->EvaluateWhenApplied();
|
||||
|
||||
|
||||
ChartCell &cell = m_hypoStackColl.Get(range);
|
||||
|
@ -68,6 +68,12 @@ void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range
|
||||
|
||||
m_unksrcs.push_back(unksrc);
|
||||
|
||||
// hack. Once the OOV FF is a phrase table, get rid of this
|
||||
PhraseDictionary *firstPt = NULL;
|
||||
if (PhraseDictionary::GetColl().size() == 0) {
|
||||
firstPt = PhraseDictionary::GetColl()[0];
|
||||
}
|
||||
|
||||
//TranslationOption *transOpt;
|
||||
if (! staticData.GetDropUnknown() || isDigit) {
|
||||
// loop
|
||||
@ -85,7 +91,7 @@ void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range
|
||||
UTIL_THROW_IF2(targetLHS->GetFactor(0) == NULL, "Null factor for target LHS");
|
||||
|
||||
// add to dictionary
|
||||
TargetPhrase *targetPhrase = new TargetPhrase(NULL);
|
||||
TargetPhrase *targetPhrase = new TargetPhrase(firstPt);
|
||||
Word &targetWord = targetPhrase->AddWord();
|
||||
targetWord.CreateUnknownWord(sourceWord);
|
||||
|
||||
@ -93,7 +99,7 @@ void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range
|
||||
float unknownScore = FloorScore(TransformScore(prob));
|
||||
|
||||
targetPhrase->GetScoreBreakdown().Assign(&unknownWordPenaltyProducer, unknownScore);
|
||||
targetPhrase->Evaluate(*unksrc);
|
||||
targetPhrase->EvaluateInIsolation(*unksrc);
|
||||
|
||||
targetPhrase->SetTargetLHS(targetLHS);
|
||||
targetPhrase->SetAlignmentInfo("0-0");
|
||||
@ -108,7 +114,7 @@ void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range
|
||||
// drop source word. create blank trans opt
|
||||
float unknownScore = FloorScore(-numeric_limits<float>::infinity());
|
||||
|
||||
TargetPhrase *targetPhrase = new TargetPhrase(NULL);
|
||||
TargetPhrase *targetPhrase = new TargetPhrase(firstPt);
|
||||
// loop
|
||||
const UnknownLHSList &lhsList = staticData.GetUnknownLHS();
|
||||
UnknownLHSList::const_iterator iterLHS;
|
||||
@ -121,7 +127,7 @@ void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range
|
||||
UTIL_THROW_IF2(targetLHS->GetFactor(0) == NULL, "Null factor for target LHS");
|
||||
|
||||
targetPhrase->GetScoreBreakdown().Assign(&unknownWordPenaltyProducer, unknownScore);
|
||||
targetPhrase->Evaluate(*unksrc);
|
||||
targetPhrase->EvaluateInIsolation(*unksrc);
|
||||
|
||||
targetPhrase->SetTargetLHS(targetLHS);
|
||||
|
||||
|
@ -25,7 +25,7 @@ public:
|
||||
|
||||
virtual void AddPhraseOOV(TargetPhrase &phrase, std::list<TargetPhraseCollection*> &waste_memory, const WordsRange &range) = 0;
|
||||
|
||||
virtual void Evaluate(const InputType &input, const InputPath &inputPath) = 0;
|
||||
virtual void EvaluateWithSourceContext(const InputType &input, const InputPath &inputPath) = 0;
|
||||
|
||||
virtual float GetBestScore(const ChartCellLabel *chartCell) const = 0;
|
||||
|
||||
|
@ -10,7 +10,7 @@ ChartTranslationOption::ChartTranslationOption(const TargetPhrase &targetPhrase)
|
||||
{
|
||||
}
|
||||
|
||||
void ChartTranslationOption::Evaluate(const InputType &input,
|
||||
void ChartTranslationOption::EvaluateWithSourceContext(const InputType &input,
|
||||
const InputPath &inputPath,
|
||||
const StackVec &stackVec)
|
||||
{
|
||||
|
@ -44,7 +44,7 @@ public:
|
||||
return m_scoreBreakdown;
|
||||
}
|
||||
|
||||
void Evaluate(const InputType &input,
|
||||
void EvaluateWithSourceContext(const InputType &input,
|
||||
const InputPath &inputPath,
|
||||
const StackVec &stackVec);
|
||||
};
|
||||
|
@ -168,13 +168,13 @@ float ChartTranslationOptionList::GetBestScore(const ChartCellLabel *chartCell)
|
||||
return bestHypo.GetTotalScore();
|
||||
}
|
||||
|
||||
void ChartTranslationOptionList::Evaluate(const InputType &input, const InputPath &inputPath)
|
||||
void ChartTranslationOptionList::EvaluateWithSourceContext(const InputType &input, const InputPath &inputPath)
|
||||
{
|
||||
// NEVER iterate over ALL of the collection. Just over the first m_size
|
||||
CollType::iterator iter;
|
||||
for (iter = m_collection.begin(); iter != m_collection.begin() + m_size; ++iter) {
|
||||
ChartTranslationOptions &transOpts = **iter;
|
||||
transOpts.Evaluate(input, inputPath);
|
||||
transOpts.EvaluateWithSourceContext(input, inputPath);
|
||||
}
|
||||
|
||||
// get rid of empty trans opts
|
||||
|
@ -65,7 +65,7 @@ public:
|
||||
|
||||
void Clear();
|
||||
void ApplyThreshold();
|
||||
void Evaluate(const InputType &input, const InputPath &inputPath);
|
||||
void EvaluateWithSourceContext(const InputType &input, const InputPath &inputPath);
|
||||
|
||||
private:
|
||||
typedef std::vector<ChartTranslationOptions*> CollType;
|
||||
|
@ -51,7 +51,7 @@ ChartTranslationOptions::~ChartTranslationOptions()
|
||||
|
||||
}
|
||||
|
||||
void ChartTranslationOptions::Evaluate(const InputType &input, const InputPath &inputPath)
|
||||
void ChartTranslationOptions::EvaluateWithSourceContext(const InputType &input, const InputPath &inputPath)
|
||||
{
|
||||
SetInputPath(&inputPath);
|
||||
if (StaticData::Instance().GetPlaceholderFactor() != NOT_FOUND) {
|
||||
@ -62,7 +62,7 @@ void ChartTranslationOptions::Evaluate(const InputType &input, const InputPath &
|
||||
for (iter = m_collection.begin(); iter != m_collection.end(); ++iter) {
|
||||
ChartTranslationOption &transOpt = **iter;
|
||||
transOpt.SetInputPath(&inputPath);
|
||||
transOpt.Evaluate(input, inputPath, m_stackVec);
|
||||
transOpt.EvaluateWithSourceContext(input, inputPath, m_stackVec);
|
||||
}
|
||||
|
||||
// get rid of -inf trans opts
|
||||
|
@ -85,7 +85,7 @@ public:
|
||||
return m_estimateOfBestScore;
|
||||
}
|
||||
|
||||
void Evaluate(const InputType &input, const InputPath &inputPath);
|
||||
void EvaluateWithSourceContext(const InputType &input, const InputPath &inputPath);
|
||||
|
||||
void SetInputPath(const InputPath *inputPath);
|
||||
|
||||
|
@ -148,7 +148,7 @@ void DecodeStepGeneration::Process(const TranslationOption &inputPartialTranslOp
|
||||
outPhrase.GetScoreBreakdown().PlusEquals(generationScore);
|
||||
|
||||
outPhrase.MergeFactors(genPhrase, m_newOutputFactors);
|
||||
outPhrase.Evaluate(inputPath.GetPhrase(), m_featuresToApply);
|
||||
outPhrase.EvaluateInIsolation(inputPath.GetPhrase(), m_featuresToApply);
|
||||
|
||||
const WordsRange &sourceWordsRange = inputPartialTranslOpt.GetSourceWordsRange();
|
||||
|
||||
|
@ -84,7 +84,7 @@ void DecodeStepTranslation::Process(const TranslationOption &inputPartialTranslO
|
||||
}
|
||||
|
||||
outPhrase.Merge(targetPhrase, m_newOutputFactors);
|
||||
outPhrase.Evaluate(inputPath.GetPhrase(), m_featuresToApply); // need to do this as all non-transcores would be screwed up
|
||||
outPhrase.EvaluateInIsolation(inputPath.GetPhrase(), m_featuresToApply); // need to do this as all non-transcores would be screwed up
|
||||
|
||||
TranslationOption *newTransOpt = new TranslationOption(sourceWordsRange, outPhrase);
|
||||
assert(newTransOpt != NULL);
|
||||
@ -258,7 +258,7 @@ void DecodeStepTranslation::ProcessLEGACY(const TranslationOption &inputPartialT
|
||||
}
|
||||
|
||||
outPhrase.Merge(targetPhrase, m_newOutputFactors);
|
||||
outPhrase.Evaluate(inputPath.GetPhrase(), m_featuresToApply); // need to do this as all non-transcores would be screwed up
|
||||
outPhrase.EvaluateInIsolation(inputPath.GetPhrase(), m_featuresToApply); // need to do this as all non-transcores would be screwed up
|
||||
|
||||
|
||||
TranslationOption *newTransOpt = new TranslationOption(sourceWordsRange, outPhrase);
|
||||
|
@ -35,6 +35,7 @@
|
||||
#include "moses/FF/ControlRecombination.h"
|
||||
#include "moses/FF/ExternalFeature.h"
|
||||
#include "moses/FF/ConstrainedDecoding.h"
|
||||
#include "moses/FF/SoftSourceSyntacticConstraintsFeature.h"
|
||||
#include "moses/FF/CoveredReferenceFeature.h"
|
||||
#include "moses/FF/TreeStructureFeature.h"
|
||||
#include "moses/FF/SoftMatchingFeature.h"
|
||||
@ -48,11 +49,11 @@
|
||||
#include "NieceTerminal.h"
|
||||
#include "SpanLength.h"
|
||||
#include "SyntaxRHS.h"
|
||||
#include "SkeletonChangeInput.h"
|
||||
|
||||
#include "moses/FF/SkeletonStatelessFF.h"
|
||||
#include "moses/FF/SkeletonStatefulFF.h"
|
||||
#include "moses/LM/SkeletonLM.h"
|
||||
#include "SkeletonChangeInput.h"
|
||||
#include "moses/TranslationModel/SkeletonPT.h"
|
||||
|
||||
#ifdef HAVE_CMPH
|
||||
@ -197,6 +198,7 @@ FeatureRegistry::FeatureRegistry()
|
||||
MOSES_FNAME(CoveredReferenceFeature);
|
||||
MOSES_FNAME(ExternalFeature);
|
||||
MOSES_FNAME(SourceGHKMTreeInputMatchFeature);
|
||||
MOSES_FNAME(SoftSourceSyntacticConstraintsFeature);
|
||||
MOSES_FNAME(TreeStructureFeature);
|
||||
MOSES_FNAME(SoftMatchingFeature);
|
||||
MOSES_FNAME(HyperParameterAsWeight);
|
||||
@ -209,11 +211,11 @@ FeatureRegistry::FeatureRegistry()
|
||||
MOSES_FNAME(SparseHieroReorderingFeature);
|
||||
MOSES_FNAME(SpanLength);
|
||||
MOSES_FNAME(SyntaxRHS);
|
||||
MOSES_FNAME(SkeletonChangeInput);
|
||||
|
||||
MOSES_FNAME(SkeletonStatelessFF);
|
||||
MOSES_FNAME(SkeletonStatefulFF);
|
||||
MOSES_FNAME(SkeletonLM);
|
||||
MOSES_FNAME(SkeletonChangeInput);
|
||||
MOSES_FNAME(SkeletonPT);
|
||||
|
||||
#ifdef HAVE_CMPH
|
||||
|
536
moses/FF/SoftSourceSyntacticConstraintsFeature.cpp
Normal file
536
moses/FF/SoftSourceSyntacticConstraintsFeature.cpp
Normal file
@ -0,0 +1,536 @@
|
||||
#include <vector>
|
||||
#include <limits>
|
||||
#include <assert.h>
|
||||
#include "SoftSourceSyntacticConstraintsFeature.h"
|
||||
#include "moses/StaticData.h"
|
||||
#include "moses/InputFileStream.h"
|
||||
#include "moses/ScoreComponentCollection.h"
|
||||
#include "moses/Hypothesis.h"
|
||||
#include "moses/ChartHypothesis.h"
|
||||
#include "moses/ChartManager.h"
|
||||
#include "moses/FactorCollection.h"
|
||||
#include "moses/TreeInput.h"
|
||||
#include "moses/PP/SourceLabelsPhraseProperty.h"
|
||||
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
/**
 * Builds the feature from its configuration line.
 *
 * Passes 3 as the number of score components to the stateless base class,
 * starts with feature variant 0 and then lets ReadParameters() process the
 * configuration line. Progress is traced at verbosity level 1.
 *
 * \param line feature configuration line
 */
SoftSourceSyntacticConstraintsFeature::SoftSourceSyntacticConstraintsFeature(const std::string &line)
  : StatelessFeatureFunction(3, line)
  , m_featureVariant(0)
{
  VERBOSE(1, "Initializing feature " << GetScoreProducerDescription() << " ...");
  ReadParameters();
  // Report completion together with the variant in effect after parameter parsing.
  VERBOSE(1, " Done." << " Feature variant: " << m_featureVariant << "." << std::endl);
}
|
||||
|
||||
void SoftSourceSyntacticConstraintsFeature::SetParameter(const std::string& key, const std::string& value)
|
||||
{
|
||||
if (key == "sourceLabelSetFile") {
|
||||
m_sourceLabelSetFile = value;
|
||||
} else if (key == "coreSourceLabelSetFile") {
|
||||
m_coreSourceLabelSetFile = value;
|
||||
} else if (key == "targetSourceLeftHandSideJointCountFile") {
|
||||
m_targetSourceLHSJointCountFile = value;
|
||||
} else if (key == "tuneable") {
|
||||
m_tuneable = Scan<bool>(value);
|
||||
} else if (key == "featureVariant") {
|
||||
m_featureVariant = Scan<size_t>(value); // 0: only dense features, 1: no mismatches (also set weights 1 0 0 and tuneable=false), 2: with sparse features, 3: with sparse features for core labels only
|
||||
} else {
|
||||
StatelessFeatureFunction::SetParameter(key, value);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void SoftSourceSyntacticConstraintsFeature::Load()
|
||||
{
|
||||
// don't change the loading order!
|
||||
LoadSourceLabelSet();
|
||||
if (m_featureVariant == 3) {
|
||||
LoadCoreSourceLabelSet();
|
||||
}
|
||||
if (!m_targetSourceLHSJointCountFile.empty()) {
|
||||
LoadTargetSourceLeftHandSideJointCountFile();
|
||||
}
|
||||
}
|
||||
|
||||
void SoftSourceSyntacticConstraintsFeature::LoadSourceLabelSet()
|
||||
{
|
||||
VERBOSE(2, GetScoreProducerDescription() << ": Loading source label set from file " << m_sourceLabelSetFile << std::endl);
|
||||
InputFileStream inFile(m_sourceLabelSetFile);
|
||||
|
||||
FactorCollection &factorCollection = FactorCollection::Instance();
|
||||
|
||||
// read source label set
|
||||
std::string line;
|
||||
m_sourceLabels.clear();
|
||||
m_sourceLabelsByIndex.clear();
|
||||
m_sourceLabelIndexesByFactor.clear();
|
||||
while (getline(inFile, line)) {
|
||||
std::istringstream tokenizer(line);
|
||||
std::string label;
|
||||
size_t index;
|
||||
try {
|
||||
tokenizer >> label >> index;
|
||||
} catch (const std::exception &e) {
|
||||
UTIL_THROW2(GetScoreProducerDescription()
|
||||
<< ": Error reading source label set file " << m_sourceLabelSetFile << " .");
|
||||
}
|
||||
std::pair< boost::unordered_map<std::string,size_t>::iterator, bool > inserted = m_sourceLabels.insert( std::pair<std::string,size_t>(label,index) );
|
||||
UTIL_THROW_IF2(!inserted.second, GetScoreProducerDescription()
|
||||
<< ": Source label set file " << m_sourceLabelSetFile << " should contain each syntactic label only once.");
|
||||
|
||||
if (index >= m_sourceLabelsByIndex.size()) {
|
||||
m_sourceLabelsByIndex.resize(index+1);
|
||||
}
|
||||
m_sourceLabelsByIndex[index] = label;
|
||||
const Factor* sourceLabelFactor = factorCollection.AddFactor(label,true);
|
||||
m_sourceLabelIndexesByFactor[sourceLabelFactor] = index;
|
||||
}
|
||||
|
||||
inFile.Close();
|
||||
|
||||
std::list<std::string> specialLabels;
|
||||
specialLabels.push_back("GlueTop");
|
||||
specialLabels.push_back("GlueX");
|
||||
// specialLabels.push_back("XRHS");
|
||||
// specialLabels.push_back("XLHS");
|
||||
for (std::list<std::string>::const_iterator iter=specialLabels.begin();
|
||||
iter!=specialLabels.end(); ++iter) {
|
||||
boost::unordered_map<std::string,size_t>::iterator found = m_sourceLabels.find(*iter);
|
||||
UTIL_THROW_IF2(found == m_sourceLabels.end(), GetScoreProducerDescription()
|
||||
<< ": Source label set file " << m_sourceLabelSetFile << " should contain an entry for the special label \"" << *iter << "\".");
|
||||
if (!(found->first).compare("GlueTop")) {
|
||||
m_GlueTopLabel = found->second;
|
||||
// } else if (!(found->first).compare("XRHS")) {
|
||||
// m_XRHSLabel = found->second;
|
||||
// } else if (!(found->first).compare("XLHS")) {
|
||||
// m_XLHSLabel = found->second;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void SoftSourceSyntacticConstraintsFeature::LoadCoreSourceLabelSet()
|
||||
{
|
||||
VERBOSE(2, GetScoreProducerDescription() << ": Loading core source label set from file " << m_coreSourceLabelSetFile << std::endl);
|
||||
InputFileStream inFile(m_coreSourceLabelSetFile);
|
||||
|
||||
// read core source label set
|
||||
std::string line;
|
||||
m_coreSourceLabels.clear();
|
||||
while (getline(inFile, line)) {
|
||||
istringstream tokenizer(line);
|
||||
std::string label;
|
||||
tokenizer >> label;
|
||||
boost::unordered_map<std::string,size_t>::iterator foundSourceLabelIndex = m_sourceLabels.find( label );
|
||||
if ( foundSourceLabelIndex != m_sourceLabels.end() ) {
|
||||
m_coreSourceLabels.insert(foundSourceLabelIndex->second);
|
||||
} else {
|
||||
VERBOSE(2, GetScoreProducerDescription()
|
||||
<< ": Ignoring unknown source label \"" << label << "\" "
|
||||
<< "from core source label set file " << m_coreSourceLabelSetFile << "."
|
||||
<< std::endl);
|
||||
}
|
||||
}
|
||||
|
||||
inFile.Close();
|
||||
}
|
||||
|
||||
void SoftSourceSyntacticConstraintsFeature::LoadTargetSourceLeftHandSideJointCountFile()
|
||||
{
|
||||
|
||||
VERBOSE(2, GetScoreProducerDescription() << ": Loading target/source label joint counts from file " << m_targetSourceLHSJointCountFile << std::endl);
|
||||
InputFileStream inFile(m_targetSourceLHSJointCountFile);
|
||||
|
||||
for (boost::unordered_map<const Factor*, std::vector< std::pair<float,float> >* >::iterator iter=m_labelPairProbabilities.begin();
|
||||
iter!=m_labelPairProbabilities.end(); ++iter) {
|
||||
delete iter->second;
|
||||
}
|
||||
m_labelPairProbabilities.clear();
|
||||
|
||||
// read joint counts
|
||||
std::string line;
|
||||
FactorCollection &factorCollection = FactorCollection::Instance();
|
||||
boost::unordered_map<const Factor*,float> targetLHSCounts;
|
||||
std::vector<float> sourceLHSCounts(m_sourceLabels.size(),0.0);
|
||||
|
||||
while (getline(inFile, line)) {
|
||||
istringstream tokenizer(line);
|
||||
std::string targetLabel;
|
||||
std::string sourceLabel;
|
||||
float count;
|
||||
tokenizer >> targetLabel;
|
||||
tokenizer >> sourceLabel;
|
||||
tokenizer >> count;
|
||||
|
||||
boost::unordered_map<std::string,size_t>::iterator foundSourceLabelIndex = m_sourceLabels.find( sourceLabel );
|
||||
UTIL_THROW_IF2(foundSourceLabelIndex == m_sourceLabels.end(), GetScoreProducerDescription()
|
||||
<< ": Target/source label joint count file " << m_targetSourceLHSJointCountFile
|
||||
<< " contains unknown source label \"" << sourceLabel << "\".");
|
||||
|
||||
const Factor* targetLabelFactor = factorCollection.AddFactor(targetLabel,true);
|
||||
|
||||
sourceLHSCounts[foundSourceLabelIndex->second] += count;
|
||||
std::pair< boost::unordered_map<const Factor*,float >::iterator, bool > insertedTargetLHSCount =
|
||||
targetLHSCounts.insert( std::pair<const Factor*,float>(targetLabelFactor,count) );
|
||||
if (!insertedTargetLHSCount.second) {
|
||||
(insertedTargetLHSCount.first)->second += count;
|
||||
boost::unordered_map<const Factor*, std::vector< std::pair<float,float> >* >::iterator jointCountIt =
|
||||
m_labelPairProbabilities.find( targetLabelFactor );
|
||||
assert(jointCountIt != m_labelPairProbabilities.end());
|
||||
(jointCountIt->second)->at(foundSourceLabelIndex->second).first += count;
|
||||
(jointCountIt->second)->at(foundSourceLabelIndex->second).second += count;
|
||||
} else {
|
||||
std::pair<float,float> init(0.0,0.0);
|
||||
std::vector< std::pair<float,float> >* sourceVector = new std::vector< std::pair<float,float> >(m_sourceLabels.size(),init);
|
||||
sourceVector->at(foundSourceLabelIndex->second) = std::pair<float,float>(count,count);
|
||||
std::pair< boost::unordered_map<const Factor*, std::vector< std::pair<float,float> >* >::iterator, bool > insertedJointCount =
|
||||
m_labelPairProbabilities.insert( std::pair<const Factor*, std::vector< std::pair<float,float> >* >(targetLabelFactor,sourceVector) );
|
||||
assert(insertedJointCount.second);
|
||||
}
|
||||
}
|
||||
|
||||
// normalization
|
||||
for (boost::unordered_map<const Factor*, std::vector< std::pair<float,float> >* >::iterator iter=m_labelPairProbabilities.begin();
|
||||
iter!=m_labelPairProbabilities.end(); ++iter) {
|
||||
float targetLHSCount = 0;
|
||||
boost::unordered_map<const Factor*,float >::const_iterator targetLHSCountIt = targetLHSCounts.find( iter->first );
|
||||
if ( targetLHSCountIt != targetLHSCounts.end() ) {
|
||||
targetLHSCount = targetLHSCountIt->second;
|
||||
}
|
||||
std::vector< std::pair<float,float> > &probabilities = *(iter->second);
|
||||
for (size_t index=0; index<probabilities.size(); ++index) {
|
||||
|
||||
if ( probabilities[index].first != 0 ) {
|
||||
assert(targetLHSCount != 0);
|
||||
probabilities[index].first /= targetLHSCount;
|
||||
}
|
||||
if ( probabilities[index].second != 0 ) {
|
||||
assert(sourceLHSCounts[index] != 0);
|
||||
probabilities[index].second /= sourceLHSCounts[index];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inFile.Close();
|
||||
}
|
||||
|
||||
|
||||
void SoftSourceSyntacticConstraintsFeature::EvaluateWhenApplied(
|
||||
const ChartHypothesis& hypo,
|
||||
ScoreComponentCollection* accumulator) const
|
||||
{
|
||||
// dense scores
|
||||
std::vector<float> newScores(m_numScoreComponents,0); // m_numScoreComponents == 3
|
||||
|
||||
const InputType& input = hypo.GetManager().GetSource();
|
||||
const TreeInput& treeInput = static_cast<const TreeInput&>(input);
|
||||
const StaticData& staticData = StaticData::Instance();
|
||||
const Word& outputDefaultNonTerminal = staticData.GetOutputDefaultNonTerminal();
|
||||
|
||||
size_t nNTs = 1;
|
||||
bool treeInputMismatchLHSBinary = true;
|
||||
size_t treeInputMismatchRHSCount = 0;
|
||||
bool hasCompleteTreeInputMatch = false;
|
||||
float t2sLabelsProb = 1;
|
||||
float s2tLabelsProb = 1;
|
||||
float ruleLabelledProbability = 1;
|
||||
|
||||
// read SourceLabels property
|
||||
const TargetPhrase &currTarPhr = hypo.GetCurrTargetPhrase();
|
||||
const Factor* targetLHS = currTarPhr.GetTargetLHS()[0];
|
||||
bool isGlueGrammarRule = false;
|
||||
bool isUnkRule = false;
|
||||
|
||||
if (const PhraseProperty *property = currTarPhr.GetProperty("SourceLabels")) {
|
||||
|
||||
const SourceLabelsPhraseProperty *sourceLabelsPhraseProperty = static_cast<const SourceLabelsPhraseProperty*>(property);
|
||||
|
||||
nNTs = sourceLabelsPhraseProperty->GetNumberOfNonTerminals();
|
||||
float totalCount = sourceLabelsPhraseProperty->GetTotalCount();
|
||||
|
||||
// prepare for input tree label matching
|
||||
std::vector< boost::unordered_set<size_t> > treeInputLabelsRHS(nNTs-1);
|
||||
boost::unordered_set<size_t> treeInputLabelsLHS;
|
||||
|
||||
// get index map for underlying hypotheses
|
||||
const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
|
||||
currTarPhr.GetAlignNonTerm().GetNonTermIndexMap();
|
||||
|
||||
std::vector<const Factor*> targetLabelsRHS;
|
||||
if (nNTs > 1) { // rule has right-hand side non-terminals, i.e. it's a hierarchical rule
|
||||
size_t nonTerminalNumber = 0;
|
||||
|
||||
for (size_t phrasePos=0; phrasePos<currTarPhr.GetSize(); ++phrasePos) {
|
||||
// consult rule for either word or non-terminal
|
||||
const Word &word = currTarPhr.GetWord(phrasePos);
|
||||
if ( word.IsNonTerminal() ) {
|
||||
// non-terminal: consult subderivation
|
||||
size_t nonTermIndex = nonTermIndexMap[phrasePos];
|
||||
const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndex);
|
||||
targetLabelsRHS.push_back( prevHypo->GetTargetLHS()[0] );
|
||||
|
||||
// retrieve information that is required for input tree label matching (RHS)
|
||||
const WordsRange& prevWordsRange = prevHypo->GetCurrSourceRange();
|
||||
size_t prevStartPos = prevWordsRange.GetStartPos();
|
||||
size_t prevEndPos = prevWordsRange.GetEndPos();
|
||||
const NonTerminalSet& prevTreeInputLabels = treeInput.GetLabelSet(prevStartPos,prevEndPos);
|
||||
|
||||
for (NonTerminalSet::const_iterator prevTreeInputLabelsIt = prevTreeInputLabels.begin();
|
||||
prevTreeInputLabelsIt != prevTreeInputLabels.end(); ++prevTreeInputLabelsIt) {
|
||||
if (*prevTreeInputLabelsIt != outputDefaultNonTerminal) {
|
||||
boost::unordered_map<const Factor*,size_t>::const_iterator foundPrevTreeInputLabel
|
||||
= m_sourceLabelIndexesByFactor.find((*prevTreeInputLabelsIt)[0]);
|
||||
if (foundPrevTreeInputLabel != m_sourceLabelIndexesByFactor.end()) {
|
||||
size_t prevTreeInputLabelIndex = foundPrevTreeInputLabel->second;
|
||||
treeInputLabelsRHS[nonTerminalNumber].insert(prevTreeInputLabelIndex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
++nonTerminalNumber;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// retrieve information that is required for input tree label matching (LHS)
|
||||
const WordsRange& wordsRange = hypo.GetCurrSourceRange();
|
||||
size_t startPos = wordsRange.GetStartPos();
|
||||
size_t endPos = wordsRange.GetEndPos();
|
||||
const NonTerminalSet& treeInputLabels = treeInput.GetLabelSet(startPos,endPos);
|
||||
|
||||
for (NonTerminalSet::const_iterator treeInputLabelsIt = treeInputLabels.begin();
|
||||
treeInputLabelsIt != treeInputLabels.end(); ++treeInputLabelsIt) {
|
||||
if (*treeInputLabelsIt != outputDefaultNonTerminal) {
|
||||
boost::unordered_map<const Factor*,size_t>::const_iterator foundTreeInputLabel
|
||||
= m_sourceLabelIndexesByFactor.find((*treeInputLabelsIt)[0]);
|
||||
if (foundTreeInputLabel != m_sourceLabelIndexesByFactor.end()) {
|
||||
size_t treeInputLabelIndex = foundTreeInputLabel->second;
|
||||
treeInputLabelsLHS.insert(treeInputLabelIndex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// inspect source-labelled rule items
|
||||
|
||||
std::vector< boost::unordered_set<size_t> > sparseScoredTreeInputLabelsRHS(nNTs-1);
|
||||
boost::unordered_set<size_t> sparseScoredTreeInputLabelsLHS;
|
||||
|
||||
std::vector<bool> sourceLabelSeenAsLHS(m_sourceLabels.size(),false);
|
||||
std::vector<bool> treeInputMatchRHSCountByNonTerminal(nNTs-1,false);
|
||||
|
||||
const std::list<SourceLabelsPhrasePropertyItem> &sourceLabelItems = sourceLabelsPhraseProperty->GetSourceLabelItems();
|
||||
|
||||
for (std::list<SourceLabelsPhrasePropertyItem>::const_iterator sourceLabelItem = sourceLabelItems.begin();
|
||||
sourceLabelItem != sourceLabelItems.end() && !hasCompleteTreeInputMatch; ++sourceLabelItem) {
|
||||
|
||||
const std::list<size_t> &sourceLabelsRHS = sourceLabelItem->GetSourceLabelsRHS();
|
||||
// float sourceLabelsRHSCount = sourceLabelItem->GetSourceLabelsRHSCount();
|
||||
const std::list< std::pair<size_t,float> > &sourceLabelsLHSList = sourceLabelItem->GetSourceLabelsLHSList();
|
||||
|
||||
assert(sourceLabelsRHS.size() == nNTs-1);
|
||||
|
||||
bool currentSourceLabelItemIsCompleteTreeInputMatch = true;
|
||||
|
||||
size_t nonTerminalNumber=0;
|
||||
for (std::list<size_t>::const_iterator sourceLabelsRHSIt = sourceLabelsRHS.begin();
|
||||
sourceLabelsRHSIt != sourceLabelsRHS.end(); ++sourceLabelsRHSIt, ++nonTerminalNumber) {
|
||||
|
||||
if (treeInputLabelsRHS[nonTerminalNumber].find(*sourceLabelsRHSIt) != treeInputLabelsRHS[nonTerminalNumber].end()) {
|
||||
|
||||
treeInputMatchRHSCountByNonTerminal[nonTerminalNumber] = true;
|
||||
|
||||
if ( m_featureVariant == 2 ||
|
||||
(m_featureVariant == 3 && m_coreSourceLabels.find(*sourceLabelsRHSIt) != m_coreSourceLabels.end()) ) {
|
||||
// score sparse features: RHS match
|
||||
if (sparseScoredTreeInputLabelsRHS[nonTerminalNumber].find(*sourceLabelsRHSIt) == sparseScoredTreeInputLabelsRHS[nonTerminalNumber].end()) {
|
||||
// (only if no match has been scored for this tree input label and rule non-terminal with a previous sourceLabelItem)
|
||||
float score_RHS_1 = (float)1/treeInputLabelsRHS[nonTerminalNumber].size();
|
||||
accumulator->PlusEquals(this,
|
||||
std::string("RHS_1_" + m_sourceLabelsByIndex[*sourceLabelsRHSIt]),
|
||||
score_RHS_1);
|
||||
sparseScoredTreeInputLabelsRHS[nonTerminalNumber].insert(*sourceLabelsRHSIt);
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
currentSourceLabelItemIsCompleteTreeInputMatch = false;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// LHS source non-terminal labels seen with this RHS
|
||||
bool currentSourceLabelItemHasLHSTreeInputMatch = false;
|
||||
//float ruleLabelledCount = 0;
|
||||
std::list< std::pair<size_t,float> >::const_iterator sourceLabelsLHSIt;
|
||||
|
||||
for (sourceLabelsLHSIt = sourceLabelsLHSList.begin(); sourceLabelsLHSIt != sourceLabelsLHSList.end(); ++sourceLabelsLHSIt) {
|
||||
|
||||
if ( sourceLabelsLHSIt->first == m_GlueTopLabel ) {
|
||||
isGlueGrammarRule = true;
|
||||
}
|
||||
|
||||
if (treeInputLabelsLHS.find(sourceLabelsLHSIt->first) != treeInputLabelsLHS.end()) {
|
||||
|
||||
currentSourceLabelItemHasLHSTreeInputMatch = true;
|
||||
|
||||
if ( m_featureVariant == 2 ||
|
||||
(m_featureVariant == 3 && m_coreSourceLabels.find(sourceLabelsLHSIt->first) != m_coreSourceLabels.end()) ) {
|
||||
// score sparse features: LHS match
|
||||
if (sparseScoredTreeInputLabelsLHS.find(sourceLabelsLHSIt->first) == sparseScoredTreeInputLabelsLHS.end()) {
|
||||
// (only if no match has been scored for this tree input label and rule non-terminal with a previous sourceLabelItem)
|
||||
float score_LHS_1 = (float)1/treeInputLabelsLHS.size();
|
||||
accumulator->PlusEquals(this,
|
||||
std::string("LHS_1_" + m_sourceLabelsByIndex[sourceLabelsLHSIt->first]),
|
||||
score_LHS_1);
|
||||
sparseScoredTreeInputLabelsLHS.insert(sourceLabelsLHSIt->first);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
if (currentSourceLabelItemHasLHSTreeInputMatch) {
|
||||
// input tree matching (LHS)
|
||||
treeInputMismatchLHSBinary = false;
|
||||
} else {
|
||||
currentSourceLabelItemIsCompleteTreeInputMatch = false;
|
||||
}
|
||||
|
||||
if (currentSourceLabelItemIsCompleteTreeInputMatch) {
|
||||
hasCompleteTreeInputMatch = true;
|
||||
|
||||
ruleLabelledProbability = sourceLabelsLHSIt->second / totalCount;
|
||||
std::pair<float,float> probPair = GetLabelPairProbabilities( targetLHS, sourceLabelsLHSIt->first);
|
||||
t2sLabelsProb = probPair.first;
|
||||
s2tLabelsProb = probPair.second;
|
||||
nonTerminalNumber=0;
|
||||
for (std::list<size_t>::const_iterator sourceLabelsRHSIt = sourceLabelsRHS.begin();
|
||||
sourceLabelsRHSIt != sourceLabelsRHS.end(); ++sourceLabelsRHSIt, ++nonTerminalNumber) {
|
||||
probPair = GetLabelPairProbabilities( targetLabelsRHS[nonTerminalNumber], *sourceLabelsRHSIt );
|
||||
t2sLabelsProb += probPair.first;
|
||||
s2tLabelsProb += probPair.second;
|
||||
}
|
||||
t2sLabelsProb /= nNTs;
|
||||
s2tLabelsProb /= nNTs;
|
||||
assert(t2sLabelsProb != 0);
|
||||
assert(s2tLabelsProb != 0);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// input tree matching (RHS)
|
||||
if ( !hasCompleteTreeInputMatch ) {
|
||||
treeInputMismatchRHSCount = nNTs-1;
|
||||
for (std::vector<bool>::const_iterator treeInputMatchRHSCountByNonTerminalIt = treeInputMatchRHSCountByNonTerminal.begin();
|
||||
treeInputMatchRHSCountByNonTerminalIt != treeInputMatchRHSCountByNonTerminal.end(); ++treeInputMatchRHSCountByNonTerminalIt) {
|
||||
if (*treeInputMatchRHSCountByNonTerminalIt) {
|
||||
--treeInputMismatchRHSCount;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// score sparse features: mismatches
|
||||
if ( m_featureVariant == 2 || m_featureVariant == 3 ) {
|
||||
|
||||
// RHS
|
||||
|
||||
for (size_t nonTerminalNumber = 0; nonTerminalNumber < nNTs-1; ++nonTerminalNumber) {
|
||||
// nNTs-1 because nNTs also counts the left-hand side non-terminal
|
||||
|
||||
float score_RHS_0 = (float)1/treeInputLabelsRHS[nonTerminalNumber].size();
|
||||
for (boost::unordered_set<size_t>::const_iterator treeInputLabelsRHSIt = treeInputLabelsRHS[nonTerminalNumber].begin();
|
||||
treeInputLabelsRHSIt != treeInputLabelsRHS[nonTerminalNumber].end(); ++treeInputLabelsRHSIt) {
|
||||
|
||||
if ( m_featureVariant == 2 ||
|
||||
(m_featureVariant == 3 && m_coreSourceLabels.find(*treeInputLabelsRHSIt) != m_coreSourceLabels.end()) ) {
|
||||
|
||||
if (sparseScoredTreeInputLabelsRHS[nonTerminalNumber].find(*treeInputLabelsRHSIt) == sparseScoredTreeInputLabelsRHS[nonTerminalNumber].end()) {
|
||||
// score sparse features: RHS mismatch
|
||||
accumulator->PlusEquals(this,
|
||||
std::string("RHS_0_" + m_sourceLabelsByIndex[*treeInputLabelsRHSIt]),
|
||||
score_RHS_0);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// LHS
|
||||
|
||||
float score_LHS_0 = (float)1/treeInputLabelsLHS.size();
|
||||
for (boost::unordered_set<size_t>::const_iterator treeInputLabelsLHSIt = treeInputLabelsLHS.begin();
|
||||
treeInputLabelsLHSIt != treeInputLabelsLHS.end(); ++treeInputLabelsLHSIt) {
|
||||
|
||||
if ( m_featureVariant == 2 ||
|
||||
(m_featureVariant == 3 && m_coreSourceLabels.find(*treeInputLabelsLHSIt) != m_coreSourceLabels.end()) ) {
|
||||
|
||||
if (sparseScoredTreeInputLabelsLHS.find(*treeInputLabelsLHSIt) == sparseScoredTreeInputLabelsLHS.end()) {
|
||||
// score sparse features: LHS mismatch
|
||||
accumulator->PlusEquals(this,
|
||||
std::string("LHS_0_" + m_sourceLabelsByIndex[*treeInputLabelsLHSIt]),
|
||||
score_LHS_0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
// abort with error message if the phrase does not translate an unknown word
|
||||
UTIL_THROW_IF2(!currTarPhr.GetWord(0).IsOOV(), GetScoreProducerDescription()
|
||||
<< ": Missing SourceLabels property. "
|
||||
<< "Please check phrase table and glue rules.");
|
||||
|
||||
// unknown word
|
||||
isUnkRule = true;
|
||||
|
||||
}
|
||||
|
||||
// add scores
|
||||
|
||||
// input tree matching
|
||||
switch (m_featureVariant) {
|
||||
|
||||
case 0:
|
||||
newScores[0] = hasCompleteTreeInputMatch;
|
||||
break;
|
||||
|
||||
case 1:
|
||||
newScores[0] = ( (hasCompleteTreeInputMatch || isGlueGrammarRule || isUnkRule) ? 0 : std::numeric_limits<float>::min() );
|
||||
break;
|
||||
|
||||
default:
|
||||
newScores[0] = hasCompleteTreeInputMatch;
|
||||
}
|
||||
newScores[1] = treeInputMismatchLHSBinary;
|
||||
newScores[2] = treeInputMismatchRHSCount;
|
||||
// newScores[3] = hasCompleteTreeInputMatch ? std::log(t2sLabelsProb) : 0;
|
||||
// newScores[4] = hasCompleteTreeInputMatch ? std::log(s2tLabelsProb) : 0;
|
||||
// newScores[3] = hasCompleteTreeInputMatch ? std::log(ruleLabelledProbability) : 0;
|
||||
|
||||
accumulator->PlusEquals(this, newScores);
|
||||
}
|
||||
|
||||
|
||||
std::pair<float,float> SoftSourceSyntacticConstraintsFeature::GetLabelPairProbabilities(
|
||||
const Factor* target,
|
||||
const size_t source) const
|
||||
{
|
||||
boost::unordered_map<const Factor*, std::vector< std::pair<float,float> >* >::const_iterator found =
|
||||
m_labelPairProbabilities.find(target);
|
||||
if ( found == m_labelPairProbabilities.end() ) {
|
||||
return std::pair<float,float>(0,0);
|
||||
}
|
||||
return found->second->at(source);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
87
moses/FF/SoftSourceSyntacticConstraintsFeature.h
Normal file
87
moses/FF/SoftSourceSyntacticConstraintsFeature.h
Normal file
@ -0,0 +1,87 @@
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <boost/unordered_map.hpp>
|
||||
#include <boost/unordered_set.hpp>
|
||||
#include "StatelessFeatureFunction.h"
|
||||
#include "FFState.h"
|
||||
#include "moses/Factor.h"
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
|
||||
class SoftSourceSyntacticConstraintsFeature : public StatelessFeatureFunction
|
||||
{
|
||||
public:
|
||||
SoftSourceSyntacticConstraintsFeature(const std::string &line);
|
||||
|
||||
~SoftSourceSyntacticConstraintsFeature() {
|
||||
for (boost::unordered_map<const Factor*, std::vector< std::pair<float,float> >* >::iterator iter=m_labelPairProbabilities.begin();
|
||||
iter!=m_labelPairProbabilities.end(); ++iter) {
|
||||
delete iter->second;
|
||||
}
|
||||
}
|
||||
|
||||
bool IsUseable(const FactorMask &mask) const {
|
||||
return true;
|
||||
}
|
||||
|
||||
void SetParameter(const std::string& key, const std::string& value);
|
||||
|
||||
void EvaluateInIsolation(const Phrase &source
|
||||
, const TargetPhrase &targetPhrase
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection &estimatedFutureScore) const
|
||||
{};
|
||||
|
||||
void EvaluateWithSourceContext(const InputType &input
|
||||
, const InputPath &inputPath
|
||||
, const TargetPhrase &targetPhrase
|
||||
, const StackVec *stackVec
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection *estimatedFutureScore = NULL) const
|
||||
{};
|
||||
|
||||
void EvaluateWhenApplied(
|
||||
const Hypothesis& cur_hypo,
|
||||
ScoreComponentCollection* accumulator) const
|
||||
{};
|
||||
|
||||
void EvaluateWhenApplied(
|
||||
const ChartHypothesis& cur_hypo,
|
||||
ScoreComponentCollection* accumulator) const;
|
||||
|
||||
private:
|
||||
std::string m_sourceLabelSetFile;
|
||||
std::string m_coreSourceLabelSetFile;
|
||||
std::string m_targetSourceLHSJointCountFile;
|
||||
std::string m_unknownLeftHandSideFile;
|
||||
size_t m_featureVariant;
|
||||
|
||||
boost::unordered_map<std::string,size_t> m_sourceLabels;
|
||||
std::vector<std::string> m_sourceLabelsByIndex;
|
||||
boost::unordered_set<size_t> m_coreSourceLabels;
|
||||
boost::unordered_map<const Factor*,size_t> m_sourceLabelIndexesByFactor;
|
||||
size_t m_GlueTopLabel;
|
||||
// mutable size_t m_XRHSLabel;
|
||||
// mutable size_t m_XLHSLabel;
|
||||
|
||||
boost::unordered_map<const Factor*, std::vector< std::pair<float,float> >* > m_labelPairProbabilities;
|
||||
boost::unordered_map<size_t,float> m_unknownLHSProbabilities;
|
||||
float m_smoothingWeight;
|
||||
float m_unseenLHSSmoothingFactorForUnknowns;
|
||||
|
||||
void Load();
|
||||
void LoadSourceLabelSet();
|
||||
void LoadCoreSourceLabelSet();
|
||||
void LoadTargetSourceLeftHandSideJointCountFile();
|
||||
|
||||
std::pair<float,float> GetLabelPairProbabilities(const Factor* target,
|
||||
const size_t source) const;
|
||||
|
||||
};
|
||||
|
||||
|
||||
}
|
||||
|
@ -67,6 +67,23 @@ const Factor *FactorCollection::AddFactor(const StringPiece &factorString, bool
|
||||
return &ret.first->in;
|
||||
}
|
||||
|
||||
/**
 * Look up a factor by its string without inserting it.
 *
 * @param factorString   string of the factor to search for
 * @param isNonTerminal  selects which id counter and which set are used for the lookup
 * @return pointer to the stored factor, or NULL if the searched set does not contain it
 */
const Factor *FactorCollection::GetFactor(const StringPiece &factorString, bool isNonTerminal)
{
  FactorFriend probe;
  probe.in.m_string = factorString;
  probe.in.m_id = (isNonTerminal) ? m_factorIdNonTerminal : m_factorId;

  // NOTE(review): isNonTerminal selects m_set and terminals select m_setNonTerminal.
  // This looks swapped by name; presumably it mirrors AddFactor -- confirm before changing.
  Set &candidates = (isNonTerminal) ? m_set : m_setNonTerminal;

#ifdef WITH_THREADS
  // shared (read) lock for the duration of the lookup
  boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
#endif // WITH_THREADS

  const Set::const_iterator hit = candidates.find(probe);
  if (hit == candidates.end()) {
    return NULL;
  }
  return &hit->in;
}
|
||||
|
||||
|
||||
FactorCollection::~FactorCollection() {}
|
||||
|
||||
TO_STRING_BODY(FactorCollection);
|
||||
|
@ -114,6 +114,8 @@ public:
|
||||
return m_factorIdNonTerminal;
|
||||
}
|
||||
|
||||
const Factor *GetFactor(const StringPiece &factorString, bool isNonTerminal = false);
|
||||
|
||||
// TODO: remove calls to this function, replacing them with the simpler AddFactor(factorString)
|
||||
const Factor *AddFactor(FactorDirection /*direction*/, FactorType /*factorType*/, const StringPiece &factorString, bool isNonTerminal = false) {
|
||||
return AddFactor(factorString, isNonTerminal);
|
||||
|
@ -205,7 +205,7 @@ int Hypothesis::RecombineCompare(const Hypothesis &compare) const
|
||||
return 0;
|
||||
}
|
||||
|
||||
void Hypothesis::EvaluateWith(const StatefulFeatureFunction &sfff,
|
||||
void Hypothesis::EvaluateWhenApplied(const StatefulFeatureFunction &sfff,
|
||||
int state_idx)
|
||||
{
|
||||
const StaticData &staticData = StaticData::Instance();
|
||||
@ -217,7 +217,7 @@ void Hypothesis::EvaluateWith(const StatefulFeatureFunction &sfff,
|
||||
}
|
||||
}
|
||||
|
||||
void Hypothesis::EvaluateWith(const StatelessFeatureFunction& slff)
|
||||
void Hypothesis::EvaluateWhenApplied(const StatelessFeatureFunction& slff)
|
||||
{
|
||||
const StaticData &staticData = StaticData::Instance();
|
||||
if (! staticData.IsFeatureFunctionIgnored( slff )) {
|
||||
@ -228,7 +228,7 @@ void Hypothesis::EvaluateWith(const StatelessFeatureFunction& slff)
|
||||
/***
|
||||
* calculate the logarithm of our total translation score (sum up components)
|
||||
*/
|
||||
void Hypothesis::Evaluate(const SquareMatrix &futureScore)
|
||||
void Hypothesis::EvaluateWhenApplied(const SquareMatrix &futureScore)
|
||||
{
|
||||
IFVERBOSE(2) {
|
||||
m_manager.GetSentenceStats().StartTimeOtherScore();
|
||||
@ -244,7 +244,7 @@ void Hypothesis::Evaluate(const SquareMatrix &futureScore)
|
||||
StatelessFeatureFunction::GetStatelessFeatureFunctions();
|
||||
for (unsigned i = 0; i < sfs.size(); ++i) {
|
||||
const StatelessFeatureFunction &ff = *sfs[i];
|
||||
EvaluateWith(ff);
|
||||
EvaluateWhenApplied(ff);
|
||||
}
|
||||
|
||||
const vector<const StatefulFeatureFunction*>& ffs =
|
||||
@ -332,7 +332,7 @@ void Hypothesis::CleanupArcList()
|
||||
*/
|
||||
const StaticData &staticData = StaticData::Instance();
|
||||
size_t nBestSize = staticData.GetNBestSize();
|
||||
bool distinctNBest = staticData.GetDistinctNBest() || staticData.UseMBR() || staticData.GetOutputSearchGraph() || staticData.GetOutputSearchGraphSLF() || staticData.GetOutputSearchGraphHypergraph() || staticData.UseLatticeMBR() ;
|
||||
bool distinctNBest = staticData.GetDistinctNBest() || staticData.GetLatticeSamplesSize() || staticData.UseMBR() || staticData.GetOutputSearchGraph() || staticData.GetOutputSearchGraphSLF() || staticData.GetOutputSearchGraphHypergraph() || staticData.UseLatticeMBR() ;
|
||||
|
||||
if (!distinctNBest && m_arcList->size() > nBestSize * 5) {
|
||||
// prune arc list only if there too many arcs
|
||||
|
@ -142,7 +142,7 @@ public:
|
||||
return m_currTargetWordsRange.GetNumWordsCovered();
|
||||
}
|
||||
|
||||
void Evaluate(const SquareMatrix &futureScore);
|
||||
void EvaluateWhenApplied(const SquareMatrix &futureScore);
|
||||
|
||||
int GetId()const {
|
||||
return m_id;
|
||||
@ -256,8 +256,8 @@ public:
|
||||
}
|
||||
|
||||
// Added by oliver.wilson@ed.ac.uk for async lm stuff.
|
||||
void EvaluateWith(const StatefulFeatureFunction &sfff, int state_idx);
|
||||
void EvaluateWith(const StatelessFeatureFunction &slff);
|
||||
void EvaluateWhenApplied(const StatefulFeatureFunction &sfff, int state_idx);
|
||||
void EvaluateWhenApplied(const StatelessFeatureFunction &slff);
|
||||
|
||||
//! target span that trans opt would populate if applied to this hypo. Used for alignment check
|
||||
size_t GetNextStartPos(const TranslationOption &transOpt) const;
|
||||
|
@ -102,7 +102,7 @@ public:
|
||||
return vertex.BestChild();
|
||||
}
|
||||
|
||||
void Evaluate(const InputType &input, const InputPath &inputPath) {
|
||||
void EvaluateWithSourceContext(const InputType &input, const InputPath &inputPath) {
|
||||
// TODO for input lattice
|
||||
}
|
||||
private:
|
||||
|
@ -12,7 +12,7 @@ if $(with-dlib) {
|
||||
|
||||
with-lbllm = [ option.get "with-lbllm" ] ;
|
||||
if $(with-lbllm) {
|
||||
lbllm2 = <cxxflags>-std=c++0x <define>LM_LBL <include>$(with-lbllm)/src <include>$(with-lbllm)/3rdparty/eigen-3 ;
|
||||
lbllm2 = <cxxflags>-std=c++0x <define>LM_LBL <include>$(with-lbllm)/src <include>$(with-lbllm)/third_party/eigen ;
|
||||
} else {
|
||||
lbllm2 = ;
|
||||
}
|
||||
|
@ -94,9 +94,10 @@ if $(with-nplm) {
|
||||
local with-lbllm = [ option.get "with-lbllm" ] ;
|
||||
if $(with-lbllm) {
|
||||
lib lbl : : <search>$(with-lbllm)/lib <search>$(with-lbllm)/lib64 ;
|
||||
obj LBLLM.o : oxlm/LBLLM.cpp lbl ..//headers : <include>$(with-lbllm)/src <include>$(with-lbllm)/3rdparty/eigen-3 ;
|
||||
obj Mapper.o : oxlm/Mapper.cpp lbl ..//headers : <include>$(with-lbllm)/src <include>$(with-lbllm)/3rdparty/eigen-3 ;
|
||||
alias lbllm : LBLLM.o Mapper.o lbl : : : <cxxflags>-std=c++0x <define>LM_LBL ;
|
||||
lib murmurhash : : <search>$(with-lbllm)/lib <search>$(with-lbllm)/lib64 ;
|
||||
obj LBLLM.o : oxlm/LBLLM.cpp lbl ..//headers : <include>$(with-lbllm)/src <include>$(with-lbllm)/third_party/eigen ;
|
||||
obj Mapper.o : oxlm/Mapper.cpp lbl ..//headers : <include>$(with-lbllm)/src <include>$(with-lbllm)/third_party/eigen ;
|
||||
alias lbllm : LBLLM.o Mapper.o lbl murmurhash /top//boost_filesystem : : : <cxxflags>-std=c++0x <define>LM_LBL ;
|
||||
dependencies += lbllm ;
|
||||
lmmacros += LM_LBL ;
|
||||
}
|
||||
|
@ -1,11 +1,171 @@
|
||||
|
||||
#include "LBLLM.h"
|
||||
|
||||
#include <boost/archive/binary_iarchive.hpp>
|
||||
#include <boost/archive/binary_oarchive.hpp>
|
||||
#include <boost/filesystem.hpp>
|
||||
#include <boost/functional/hash.hpp>
|
||||
|
||||
#include "moses/FactorCollection.h"
|
||||
#include "moses/InputType.h"
|
||||
|
||||
using namespace std;
|
||||
using namespace oxlm;
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
template<class Model>
|
||||
LBLLM<Model>::LBLLM(const string &line) : LanguageModelSingleFactor(line) {
|
||||
ReadParameters();
|
||||
|
||||
FactorCollection &factorCollection = FactorCollection::Instance();
|
||||
|
||||
// needed by parent language model classes. Why didn't they set these themselves?
|
||||
m_sentenceStart = factorCollection.AddFactor(Output, m_factorType, BOS_);
|
||||
m_sentenceStartWord[m_factorType] = m_sentenceStart;
|
||||
|
||||
m_sentenceEnd = factorCollection.AddFactor(Output, m_factorType, EOS_);
|
||||
m_sentenceEndWord[m_factorType] = m_sentenceEnd;
|
||||
|
||||
cacheHits = totalHits = 0;
|
||||
}
|
||||
|
||||
|
||||
template<class Model>
|
||||
LBLLM<Model>::~LBLLM() {
|
||||
if (persistentCache) {
|
||||
double cache_hit_ratio = 100.0 * cacheHits / totalHits;
|
||||
cerr << "Cache hit ratio: " << cache_hit_ratio << endl;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<class Model>
|
||||
void LBLLM<Model>::SetParameter(const string& key, const string& value) {
|
||||
if (key == "persistent-cache") {
|
||||
persistentCache = Scan<bool>(value);
|
||||
} else {
|
||||
LanguageModelSingleFactor::SetParameter(key, value);
|
||||
}
|
||||
}
|
||||
|
||||
template<class Model>
|
||||
void LBLLM<Model>::Load() {
|
||||
model.load(m_filePath);
|
||||
|
||||
Dict dict = model.getDict();
|
||||
mapper = boost::make_shared<OXLMMapper>(dict);
|
||||
|
||||
kSTART = dict.Convert("<s>");
|
||||
kSTOP = dict.Convert("</s>");
|
||||
kUNKNOWN = dict.Convert("<unk>");
|
||||
|
||||
size_t ngram_order = model.getConfig()->ngram_order;
|
||||
UTIL_THROW_IF2(
|
||||
m_nGramOrder != ngram_order,
|
||||
"Wrong order for LBLLM: LM has " << ngram_order << ", but Moses expects " << m_nGramOrder);
|
||||
}
|
||||
|
||||
template<class Model>
|
||||
LMResult LBLLM<Model>::GetValue(
|
||||
const vector<const Word*> &contextFactor, State* finalState) const {
|
||||
if (!cache.get()) {
|
||||
cache.reset(new QueryCache());
|
||||
}
|
||||
|
||||
vector<int> context;
|
||||
int word;
|
||||
mapper->convert(contextFactor, context, word);
|
||||
|
||||
size_t context_width = m_nGramOrder - 1;
|
||||
|
||||
if (!context.empty() && context.back() == kSTART) {
|
||||
context.resize(context_width, kSTART);
|
||||
} else {
|
||||
context.resize(context_width, kUNKNOWN);
|
||||
}
|
||||
|
||||
|
||||
double score;
|
||||
if (persistentCache) {
|
||||
++totalHits;
|
||||
NGram query(word, context);
|
||||
pair<double, bool> ret = cache->get(query);
|
||||
if (ret.second) {
|
||||
score = ret.first;
|
||||
++cacheHits;
|
||||
} else {
|
||||
score = model.predict(word, context);
|
||||
cache->put(query, score);
|
||||
}
|
||||
} else {
|
||||
score = model.predict(word, context);
|
||||
}
|
||||
|
||||
LMResult ret;
|
||||
ret.score = score;
|
||||
ret.unknown = (word == kUNKNOWN);
|
||||
|
||||
// calc state from hash of last n-1 words
|
||||
size_t seed = 0;
|
||||
boost::hash_combine(seed, word);
|
||||
for (size_t i = 0; i < context.size() && i < context_width - 1; ++i) {
|
||||
int id = context[i];
|
||||
boost::hash_combine(seed, id);
|
||||
}
|
||||
|
||||
(*finalState) = (State*) seed;
|
||||
return ret;
|
||||
}
|
||||
|
||||
template<class Model>
|
||||
void LBLLM<Model>::InitializeForInput(const InputType& source) {
|
||||
LanguageModelSingleFactor::InitializeForInput(source);
|
||||
|
||||
if (persistentCache) {
|
||||
if (!cache.get()) {
|
||||
cache.reset(new QueryCache());
|
||||
}
|
||||
|
||||
int sentence_id = source.GetTranslationId();
|
||||
string cacheFile = m_filePath + "." + to_string(sentence_id) + ".cache.bin";
|
||||
if (boost::filesystem::exists(cacheFile)) {
|
||||
ifstream f(cacheFile);
|
||||
boost::archive::binary_iarchive iar(f);
|
||||
cerr << "Loading n-gram probability cache from " << cacheFile << endl;
|
||||
iar >> *cache;
|
||||
cerr << "Done loading " << cache->size()
|
||||
<< " n-gram probabilities..." << endl;
|
||||
} else {
|
||||
cerr << "Cache file not found" << endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<class Model>
|
||||
void LBLLM<Model>::CleanUpAfterSentenceProcessing(const InputType& source) {
|
||||
model.clearCache();
|
||||
|
||||
if (persistentCache) {
|
||||
int sentence_id = source.GetTranslationId();
|
||||
string cacheFile = m_filePath + "." + to_string(sentence_id) + ".cache.bin";
|
||||
ofstream f(cacheFile);
|
||||
boost::archive::binary_oarchive oar(f);
|
||||
cerr << "Saving persistent cache to " << cacheFile << endl;
|
||||
oar << *cache;
|
||||
cerr << "Done saving " << cache->size()
|
||||
<< " n-gram probabilities..." << endl;
|
||||
|
||||
cache->clear();
|
||||
}
|
||||
|
||||
LanguageModelSingleFactor::CleanUpAfterSentenceProcessing(source);
|
||||
}
|
||||
|
||||
template class LBLLM<LM>;
|
||||
template class LBLLM<FactoredLM>;
|
||||
template class LBLLM<FactoredMaxentLM>;
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
@ -2,15 +2,12 @@
|
||||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
#include <boost/functional/hash.hpp>
|
||||
|
||||
#include "moses/LM/SingleFactor.h"
|
||||
#include "moses/FactorCollection.h"
|
||||
|
||||
// lbl stuff
|
||||
#include "corpus/corpus.h"
|
||||
#include "lbl/lbl_features.h"
|
||||
#include "lbl/model.h"
|
||||
#include "lbl/process_identifier.h"
|
||||
#include "lbl/query_cache.h"
|
||||
|
||||
#include "Mapper.h"
|
||||
@ -22,100 +19,34 @@ namespace Moses
|
||||
template<class Model>
|
||||
class LBLLM : public LanguageModelSingleFactor
|
||||
{
|
||||
protected:
|
||||
|
||||
public:
|
||||
LBLLM(const std::string &line)
|
||||
:LanguageModelSingleFactor(line)
|
||||
{
|
||||
ReadParameters();
|
||||
LBLLM(const std::string &line);
|
||||
|
||||
FactorCollection &factorCollection = FactorCollection::Instance();
|
||||
~LBLLM();
|
||||
|
||||
// needed by parent language model classes. Why didn't they set these themselves?
|
||||
m_sentenceStart = factorCollection.AddFactor(Output, m_factorType, BOS_);
|
||||
m_sentenceStartWord[m_factorType] = m_sentenceStart;
|
||||
void SetParameter(const std::string& key, const std::string& value);
|
||||
|
||||
m_sentenceEnd = factorCollection.AddFactor(Output, m_factorType, EOS_);
|
||||
m_sentenceEndWord[m_factorType] = m_sentenceEnd;
|
||||
}
|
||||
void Load();
|
||||
|
||||
~LBLLM()
|
||||
{}
|
||||
virtual LMResult GetValue(
|
||||
const std::vector<const Word*> &contextFactor,
|
||||
State* finalState = 0) const;
|
||||
|
||||
void Load()
|
||||
{
|
||||
model.load(m_filePath);
|
||||
virtual void InitializeForInput(const InputType& source);
|
||||
|
||||
config = model.getConfig();
|
||||
int context_width = config->ngram_order - 1;
|
||||
// For each state, we store at most context_width word ids to the left and
|
||||
// to the right and a kSTAR separator. The last bit represents the actual
|
||||
// size of the state.
|
||||
//int max_state_size = (2 * context_width + 1) * sizeof(int) + 1;
|
||||
//FeatureFunction::SetStateSize(max_state_size);
|
||||
|
||||
dict = model.getDict();
|
||||
mapper = boost::make_shared<OXLMMapper>(dict);
|
||||
//stateConverter = boost::make_shared<CdecStateConverter>(max_state_size - 1);
|
||||
//ruleConverter = boost::make_shared<CdecRuleConverter>(mapper, stateConverter);
|
||||
|
||||
kSTART = dict.Convert("<s>");
|
||||
kSTOP = dict.Convert("</s>");
|
||||
kUNKNOWN = dict.Convert("<unk>");
|
||||
}
|
||||
|
||||
|
||||
virtual LMResult GetValue(const std::vector<const Word*> &contextFactor, State* finalState = 0) const
|
||||
{
|
||||
std::vector<int> context;
|
||||
int word;
|
||||
mapper->convert(contextFactor, context, word);
|
||||
|
||||
size_t context_width = m_nGramOrder - 1;
|
||||
|
||||
if (!context.empty() && context.back() == kSTART) {
|
||||
context.resize(context_width, kSTART);
|
||||
} else {
|
||||
context.resize(context_width, kUNKNOWN);
|
||||
}
|
||||
|
||||
|
||||
double score;
|
||||
score = model.predict(word, context);
|
||||
|
||||
/*
|
||||
std::string str = DebugContextFactor(contextFactor);
|
||||
std::cerr << "contextFactor=" << str << " " << score << std::endl;
|
||||
*/
|
||||
|
||||
LMResult ret;
|
||||
ret.score = score;
|
||||
ret.unknown = (word == kUNKNOWN);
|
||||
|
||||
// calc state from hash of last n-1 words
|
||||
size_t seed = 0;
|
||||
boost::hash_combine(seed, word);
|
||||
for (size_t i = 0; i < context.size() && i < context_width - 1; ++i) {
|
||||
int id = context[i];
|
||||
boost::hash_combine(seed, id);
|
||||
}
|
||||
|
||||
(*finalState) = (State*) seed;
|
||||
return ret;
|
||||
}
|
||||
virtual void CleanUpAfterSentenceProcessing(const InputType& source);
|
||||
|
||||
protected:
|
||||
oxlm::Dict dict;
|
||||
boost::shared_ptr<oxlm::ModelData> config;
|
||||
Model model;
|
||||
boost::shared_ptr<OXLMMapper> mapper;
|
||||
|
||||
int kSTART;
|
||||
int kSTOP;
|
||||
int kUNKNOWN;
|
||||
|
||||
boost::shared_ptr<OXLMMapper> mapper;
|
||||
|
||||
bool persistentCache;
|
||||
mutable boost::thread_specific_ptr<oxlm::QueryCache> cache;
|
||||
mutable int cacheHits, totalHits;
|
||||
};
|
||||
|
||||
|
||||
|
@ -422,7 +422,7 @@ void PDTAimp::CreateTargetPhrase(TargetPhrase& targetPhrase,
|
||||
}
|
||||
|
||||
targetPhrase.GetScoreBreakdown().Assign(m_obj, transVector);
|
||||
targetPhrase.Evaluate(*srcPtr, m_obj->GetFeaturesToApply());
|
||||
targetPhrase.EvaluateInIsolation(*srcPtr, m_obj->GetFeaturesToApply());
|
||||
}
|
||||
|
||||
TargetPhraseCollectionWithSourcePhrase* PDTAimp::PruneTargetCandidates
|
||||
|
@ -9,6 +9,7 @@
|
||||
#include "moses/PP/TreeStructurePhraseProperty.h"
|
||||
#include "moses/PP/SpanLengthPhraseProperty.h"
|
||||
#include "moses/PP/NonTermContextProperty.h"
|
||||
#include "moses/PP/OrientationPhraseProperty.h"
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
@ -59,6 +60,7 @@ PhrasePropertyFactory::PhrasePropertyFactory()
|
||||
MOSES_PNAME2("Tree",TreeStructurePhraseProperty);
|
||||
MOSES_PNAME2("SpanLength", SpanLengthPhraseProperty);
|
||||
MOSES_PNAME2("NonTermContext", NonTermContextProperty);
|
||||
MOSES_PNAME2("Orientation", OrientationPhraseProperty);
|
||||
}
|
||||
|
||||
PhrasePropertyFactory::~PhrasePropertyFactory()
|
||||
|
26
moses/PP/OrientationPhraseProperty.cpp
Normal file
26
moses/PP/OrientationPhraseProperty.cpp
Normal file
@ -0,0 +1,26 @@
|
||||
#include "moses/PP/OrientationPhraseProperty.h"
|
||||
#include <iostream>
|
||||
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
void OrientationPhraseProperty::ProcessValue(const std::string &value)
|
||||
{
|
||||
// bidirectional MSLR phrase orientation with 2x4 orientation classes:
|
||||
// mono swap dright dleft
|
||||
|
||||
std::istringstream tokenizer(value);
|
||||
|
||||
try {
|
||||
if (! (tokenizer >> m_l2rMonoProbability >> m_l2rSwapProbability >> m_l2rDrightProbability >> m_l2rDleftProbability
|
||||
>> m_r2lMonoProbability >> m_r2lSwapProbability >> m_r2lDrightProbability >> m_r2lDleftProbability)) {
|
||||
UTIL_THROW2("OrientationPhraseProperty: Not able to read value. Flawed property?");
|
||||
}
|
||||
} catch (const std::exception &e) {
|
||||
UTIL_THROW2("OrientationPhraseProperty: Read error. Flawed property?");
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace Moses
|
||||
|
65
moses/PP/OrientationPhraseProperty.h
Normal file
65
moses/PP/OrientationPhraseProperty.h
Normal file
@ -0,0 +1,65 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "moses/PP/PhraseProperty.h"
|
||||
#include "util/exception.hh"
|
||||
#include <string>
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
class OrientationPhraseProperty : public PhraseProperty
|
||||
{
|
||||
public:
|
||||
OrientationPhraseProperty() {};
|
||||
|
||||
virtual void ProcessValue(const std::string &value);
|
||||
|
||||
|
||||
double GetLeftToRightProbabilityMono() const {
|
||||
return m_l2rMonoProbability;
|
||||
};
|
||||
|
||||
double GetLeftToRightProbabilitySwap() const {
|
||||
return m_l2rSwapProbability;
|
||||
};
|
||||
|
||||
double GetLeftToRightProbabilityDright() const {
|
||||
return m_l2rDrightProbability;
|
||||
};
|
||||
|
||||
double GetLeftToRightProbabilityDleft() const {
|
||||
return m_l2rDleftProbability;
|
||||
};
|
||||
|
||||
|
||||
double GetRightToLeftProbabilityMono() const {
|
||||
return m_r2lMonoProbability;
|
||||
};
|
||||
|
||||
double GetRightToLeftProbabilitySwap() const {
|
||||
return m_r2lSwapProbability;
|
||||
};
|
||||
|
||||
double GetRightToLeftProbabilityDright() const {
|
||||
return m_r2lDrightProbability;
|
||||
};
|
||||
|
||||
double GetRightToLeftProbabilityDleft() const {
|
||||
return m_r2lDleftProbability;
|
||||
};
|
||||
|
||||
|
||||
virtual const std::string *GetValueString() const {
|
||||
UTIL_THROW2("OrientationPhraseProperty: value string not available in this phrase property");
|
||||
return NULL;
|
||||
};
|
||||
|
||||
protected:
|
||||
|
||||
float m_l2rMonoProbability, m_l2rSwapProbability, m_l2rDrightProbability, m_l2rDleftProbability,
|
||||
m_r2lMonoProbability, m_r2lSwapProbability, m_r2lDrightProbability, m_r2lDleftProbability;
|
||||
};
|
||||
|
||||
} // namespace Moses
|
||||
|
@ -16,12 +16,12 @@ void SourceLabelsPhraseProperty::ProcessValue(const std::string &value)
|
||||
std::istringstream tokenizer(value);
|
||||
|
||||
if (! (tokenizer >> m_nNTs)) { // first token: number of non-terminals (incl. left-hand side)
|
||||
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read number of non-terminals. Flawed property?");
|
||||
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read number of non-terminals. Flawed property? " << value);
|
||||
}
|
||||
assert( m_nNTs > 0 );
|
||||
|
||||
if (! (tokenizer >> m_totalCount)) { // second token: overall rule count
|
||||
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read overall rule count. Flawed property?");
|
||||
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read overall rule count. Flawed property? " << value);
|
||||
}
|
||||
assert( m_totalCount > 0.0 );
|
||||
|
||||
@ -32,7 +32,7 @@ void SourceLabelsPhraseProperty::ProcessValue(const std::string &value)
|
||||
std::priority_queue<float> ruleLabelledCountsPQ;
|
||||
|
||||
while (tokenizer.peek() != EOF) {
|
||||
try {
|
||||
// try {
|
||||
|
||||
SourceLabelsPhrasePropertyItem item;
|
||||
size_t numberOfLHSsGivenRHS = std::numeric_limits<std::size_t>::max();
|
||||
@ -46,28 +46,28 @@ void SourceLabelsPhraseProperty::ProcessValue(const std::string &value)
|
||||
for (size_t i=0; i<m_nNTs-1; ++i) { // RHS source non-terminal labels
|
||||
size_t sourceLabelRHS;
|
||||
if (! (tokenizer >> sourceLabelRHS) ) { // RHS source non-terminal label
|
||||
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read right-hand side label index. Flawed property?");
|
||||
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read right-hand side label index. Flawed property? " << value);
|
||||
}
|
||||
item.m_sourceLabelsRHS.push_back(sourceLabelRHS);
|
||||
}
|
||||
|
||||
if (! (tokenizer >> item.m_sourceLabelsRHSCount)) {
|
||||
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read right-hand side count. Flawed property?");
|
||||
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read right-hand side count. Flawed property? " << value);
|
||||
}
|
||||
|
||||
if (! (tokenizer >> numberOfLHSsGivenRHS)) {
|
||||
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read number of left-hand sides. Flawed property?");
|
||||
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read number of left-hand sides. Flawed property? " << value);
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t i=0; i<numberOfLHSsGivenRHS && tokenizer.peek()!=EOF; ++i) { // LHS source non-terminal labels seen with this RHS
|
||||
size_t sourceLabelLHS;
|
||||
if (! (tokenizer >> sourceLabelLHS)) { // LHS source non-terminal label
|
||||
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read left-hand side label index. Flawed property?");
|
||||
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read left-hand side label index. Flawed property? " << value);
|
||||
}
|
||||
float ruleSourceLabelledCount;
|
||||
if (! (tokenizer >> ruleSourceLabelledCount)) {
|
||||
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read count. Flawed property?");
|
||||
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read count. Flawed property? " << value);
|
||||
}
|
||||
item.m_sourceLabelsLHSList.push_back( std::make_pair(sourceLabelLHS,ruleSourceLabelledCount) );
|
||||
ruleLabelledCountsPQ.push(ruleSourceLabelledCount);
|
||||
@ -75,9 +75,9 @@ void SourceLabelsPhraseProperty::ProcessValue(const std::string &value)
|
||||
|
||||
m_sourceLabelItems.push_back(item);
|
||||
|
||||
} catch (const std::exception &e) {
|
||||
UTIL_THROW2("SourceLabelsPhraseProperty: Read error. Flawed property?");
|
||||
}
|
||||
// } catch (const std::exception &e) {
|
||||
// UTIL_THROW2("SourceLabelsPhraseProperty: Read error. Flawed property?");
|
||||
// }
|
||||
}
|
||||
|
||||
// keep only top N label vectors
|
||||
|
@ -50,7 +50,7 @@ Parameter::Parameter()
|
||||
AddParam("factor-delimiter", "fd", "specify a different factor delimiter than the default");
|
||||
AddParam("input-factors", "list of factors in the input");
|
||||
AddParam("input-file", "i", "location of the input file to be translated");
|
||||
AddParam("inputtype", "text (0), confusion network (1), word lattice (2) (default = 0)");
|
||||
AddParam("inputtype", "text (0), confusion network (1), word lattice (2), tree (3) (default = 0)");
|
||||
AddParam("labeled-n-best-list", "print out labels for each weight type in n-best list. default is true");
|
||||
AddParam("mark-unknown", "mu", "mark unknown words in output");
|
||||
AddParam("max-partial-trans-opt", "maximum number of partial translation options per input span (during mapping steps)");
|
||||
|
@ -79,7 +79,7 @@ void RuleCubeItem::CreateHypothesis(const ChartTranslationOptions &transOpt,
|
||||
ChartManager &manager)
|
||||
{
|
||||
m_hypothesis = new ChartHypothesis(transOpt, *this, manager);
|
||||
m_hypothesis->Evaluate();
|
||||
m_hypothesis->EvaluateWhenApplied();
|
||||
m_score = m_hypothesis->GetTotalScore();
|
||||
}
|
||||
|
||||
|
@ -261,6 +261,11 @@ public:
|
||||
|
||||
void PlusEquals(const FeatureFunction* sp, const ScorePair &scorePair);
|
||||
|
||||
// Add score by index
|
||||
void PlusEquals(size_t index, float score) {
|
||||
m_scores[index] += score;
|
||||
}
|
||||
|
||||
//For features which have an unbounded number of components
|
||||
void SparsePlusEquals(const std::string& full_name, float score) {
|
||||
FName fname(full_name);
|
||||
@ -283,7 +288,7 @@ public:
|
||||
m_scores[indexes.first] = score;
|
||||
}
|
||||
|
||||
// Assign core weight by index
|
||||
// Assign score by index
|
||||
void Assign(size_t index, float score) {
|
||||
m_scores[index] = score;
|
||||
}
|
||||
@ -354,6 +359,11 @@ public:
|
||||
m_scores.capMin(minValue);
|
||||
}
|
||||
|
||||
std::pair<size_t,size_t> GetIndexesForProducer(const FeatureFunction* sp) const {
|
||||
IndexPair indexPair = GetIndexes(sp);
|
||||
return indexPair;
|
||||
}
|
||||
|
||||
//! if a FeatureFunction produces a single score (for example, a language model score)
|
||||
//! this will return it. If not, this method will throw
|
||||
float GetScoreForProducer(const FeatureFunction* sp) const {
|
||||
|
@ -288,7 +288,7 @@ void SearchNormal::ExpandHypothesis(const Hypothesis &hypothesis, const Translat
|
||||
stats.StopTimeBuildHyp();
|
||||
}
|
||||
if (newHypo==NULL) return;
|
||||
newHypo->Evaluate(m_transOptColl.GetFutureScore());
|
||||
newHypo->EvaluateWhenApplied(m_transOptColl.GetFutureScore());
|
||||
} else
|
||||
// early discarding: check if hypothesis is too bad to build
|
||||
{
|
||||
|
@ -159,13 +159,13 @@ void SearchNormalBatch::EvalAndMergePartialHypos()
|
||||
++sfff_iter) {
|
||||
const StatefulFeatureFunction &ff = *(sfff_iter->second);
|
||||
int state_idx = sfff_iter->first;
|
||||
hypo->EvaluateWith(ff, state_idx);
|
||||
hypo->EvaluateWhenApplied(ff, state_idx);
|
||||
}
|
||||
std::vector<const StatelessFeatureFunction*>::iterator slff_iter;
|
||||
for (slff_iter = m_stateless_ffs.begin();
|
||||
slff_iter != m_stateless_ffs.end();
|
||||
++slff_iter) {
|
||||
hypo->EvaluateWith(**slff_iter);
|
||||
hypo->EvaluateWhenApplied(**slff_iter);
|
||||
}
|
||||
}
|
||||
|
||||
@ -190,7 +190,7 @@ void SearchNormalBatch::EvalAndMergePartialHypos()
|
||||
dlm_iter != m_dlm_ffs.end();
|
||||
++dlm_iter) {
|
||||
LanguageModel &lm = *(dlm_iter->second);
|
||||
hypo->EvaluateWith(lm, (*dlm_iter).first);
|
||||
hypo->EvaluateWhenApplied(lm, (*dlm_iter).first);
|
||||
}
|
||||
|
||||
// Put completed hypothesis onto its stack.
|
||||
|
@ -125,6 +125,9 @@ bool StaticData::LoadData(Parameter *parameter)
|
||||
if (m_inputType == 2) {
|
||||
s_it = "word lattice";
|
||||
}
|
||||
if (m_inputType == 3) {
|
||||
s_it = "tree";
|
||||
}
|
||||
VERBOSE(2,"input type is: "<<s_it<<"\n");
|
||||
|
||||
if(m_parameter->GetParam("recover-input-path").size()) {
|
||||
|
@ -101,13 +101,13 @@ void TargetPhrase::WriteToRulePB(hgmert::Rule* pb) const
|
||||
}
|
||||
#endif
|
||||
|
||||
void TargetPhrase::Evaluate(const Phrase &source)
|
||||
void TargetPhrase::EvaluateInIsolation(const Phrase &source)
|
||||
{
|
||||
const std::vector<FeatureFunction*> &ffs = FeatureFunction::GetFeatureFunctions();
|
||||
Evaluate(source, ffs);
|
||||
EvaluateInIsolation(source, ffs);
|
||||
}
|
||||
|
||||
void TargetPhrase::Evaluate(const Phrase &source, const std::vector<FeatureFunction*> &ffs)
|
||||
void TargetPhrase::EvaluateInIsolation(const Phrase &source, const std::vector<FeatureFunction*> &ffs)
|
||||
{
|
||||
if (ffs.size()) {
|
||||
const StaticData &staticData = StaticData::Instance();
|
||||
@ -126,7 +126,7 @@ void TargetPhrase::Evaluate(const Phrase &source, const std::vector<FeatureFunct
|
||||
}
|
||||
}
|
||||
|
||||
void TargetPhrase::Evaluate(const InputType &input, const InputPath &inputPath)
|
||||
void TargetPhrase::EvaluateWithSourceContext(const InputType &input, const InputPath &inputPath)
|
||||
{
|
||||
const std::vector<FeatureFunction*> &ffs = FeatureFunction::GetFeatureFunctions();
|
||||
const StaticData &staticData = StaticData::Instance();
|
||||
|
@ -71,14 +71,14 @@ public:
|
||||
~TargetPhrase();
|
||||
|
||||
// 1st evaluate method. Called during loading of phrase table.
|
||||
void Evaluate(const Phrase &source, const std::vector<FeatureFunction*> &ffs);
|
||||
void EvaluateInIsolation(const Phrase &source, const std::vector<FeatureFunction*> &ffs);
|
||||
|
||||
// as above, score with ALL FFs
|
||||
// Used only for OOV processing. Doesn't have a phrase table connect with it
|
||||
void Evaluate(const Phrase &source);
|
||||
void EvaluateInIsolation(const Phrase &source);
|
||||
|
||||
// 'inputPath' is guaranteed to be the raw substring from the input. No factors were added or taken away
|
||||
void Evaluate(const InputType &input, const InputPath &inputPath);
|
||||
void EvaluateWithSourceContext(const InputType &input, const InputPath &inputPath);
|
||||
|
||||
void SetSparseScore(const FeatureFunction* translationScoreProducer, const StringPiece &sparseString);
|
||||
|
||||
|
@ -418,7 +418,7 @@ TargetPhraseVectorPtr PhraseDecoder::DecodeCollection(
|
||||
}
|
||||
|
||||
if(eval) {
|
||||
targetPhrase->Evaluate(sourcePhrase, m_phraseDictionary.GetFeaturesToApply());
|
||||
targetPhrase->EvaluateInIsolation(sourcePhrase, m_phraseDictionary.GetFeaturesToApply());
|
||||
}
|
||||
|
||||
if(m_coding == PREnc) {
|
||||
|
@ -60,7 +60,7 @@ GetTargetPhraseCollectionLEGACY(const Phrase& src) const
|
||||
BOOST_FOREACH(pstat_entry & e, pstats) {
|
||||
TargetPhrase* tp = m_biSA->GetMosesFactorIDs(e.first, src, this);
|
||||
tp->GetScoreBreakdown().Assign(this,e.second);
|
||||
tp->Evaluate(src);
|
||||
tp->EvaluateInIsolation(src);
|
||||
ret->Add(tp);
|
||||
}
|
||||
// return ret;
|
||||
|
@ -147,7 +147,7 @@ void PhraseDictionaryMultiModel::CollectSufficientStatistics(const Phrase& src,
|
||||
vector<FeatureFunction*> pd_feature;
|
||||
pd_feature.push_back(m_pd[i]);
|
||||
const vector<FeatureFunction*> pd_feature_const(pd_feature);
|
||||
statistics->targetPhrase->Evaluate(src, pd_feature_const);
|
||||
statistics->targetPhrase->EvaluateInIsolation(src, pd_feature_const);
|
||||
// zero out scores from original phrase table
|
||||
statistics->targetPhrase->GetScoreBreakdown().ZeroDenseFeatures(&pd);
|
||||
|
||||
@ -186,7 +186,7 @@ TargetPhraseCollection* PhraseDictionaryMultiModel::CreateTargetPhraseCollection
|
||||
vector<FeatureFunction*> pd_feature;
|
||||
pd_feature.push_back(const_cast<PhraseDictionaryMultiModel*>(this));
|
||||
const vector<FeatureFunction*> pd_feature_const(pd_feature);
|
||||
statistics->targetPhrase->Evaluate(src, pd_feature_const);
|
||||
statistics->targetPhrase->EvaluateInIsolation(src, pd_feature_const);
|
||||
|
||||
ret->Add(new TargetPhrase(*statistics->targetPhrase));
|
||||
}
|
||||
|
@ -189,7 +189,7 @@ void PhraseDictionaryMultiModelCounts::CollectSufficientStatistics(const Phrase&
|
||||
vector<FeatureFunction*> pd_feature;
|
||||
pd_feature.push_back(m_pd[i]);
|
||||
const vector<FeatureFunction*> pd_feature_const(pd_feature);
|
||||
statistics->targetPhrase->Evaluate(src, pd_feature_const);
|
||||
statistics->targetPhrase->EvaluateInIsolation(src, pd_feature_const);
|
||||
// zero out scores from original phrase table
|
||||
statistics->targetPhrase->GetScoreBreakdown().ZeroDenseFeatures(&pd);
|
||||
|
||||
@ -251,7 +251,7 @@ TargetPhraseCollection* PhraseDictionaryMultiModelCounts::CreateTargetPhraseColl
|
||||
vector<FeatureFunction*> pd_feature;
|
||||
pd_feature.push_back(const_cast<PhraseDictionaryMultiModelCounts*>(this));
|
||||
const vector<FeatureFunction*> pd_feature_const(pd_feature);
|
||||
statistics->targetPhrase->Evaluate(src, pd_feature_const);
|
||||
statistics->targetPhrase->EvaluateInIsolation(src, pd_feature_const);
|
||||
} catch (AlignmentException& e) {
|
||||
continue;
|
||||
}
|
||||
|
@ -132,7 +132,7 @@ std::vector<TargetPhrase*> PhraseDictionaryTransliteration::CreateTargetPhrases(
|
||||
tp->GetScoreBreakdown().PlusEquals(this, score);
|
||||
|
||||
// score of all other ff when this rule is being loaded
|
||||
tp->Evaluate(sourcePhrase, GetFeaturesToApply());
|
||||
tp->EvaluateInIsolation(sourcePhrase, GetFeaturesToApply());
|
||||
|
||||
ret.push_back(tp);
|
||||
}
|
||||
|
@ -181,7 +181,7 @@ TargetPhrase *ProbingPT::CreateTargetPhrase(const Phrase &sourcePhrase, const ta
|
||||
*/
|
||||
|
||||
// score of all other ff when this rule is being loaded
|
||||
tp->Evaluate(sourcePhrase, GetFeaturesToApply());
|
||||
tp->EvaluateInIsolation(sourcePhrase, GetFeaturesToApply());
|
||||
return tp;
|
||||
}
|
||||
|
||||
|
@ -226,7 +226,7 @@ bool RuleTableLoaderCompact::LoadRuleSection(
|
||||
targetPhrase->SetAlignNonTerm(alignNonTerm);
|
||||
targetPhrase->SetTargetLHS(targetLhs);
|
||||
|
||||
targetPhrase->Evaluate(sourcePhrase, ruleTable.GetFeaturesToApply());
|
||||
targetPhrase->EvaluateInIsolation(sourcePhrase, ruleTable.GetFeaturesToApply());
|
||||
|
||||
// Insert rule into table.
|
||||
TargetPhraseCollection &coll = GetOrCreateTargetPhraseCollection(
|
||||
|
@ -247,7 +247,7 @@ bool RuleTableLoaderStandard::Load(FormatType format
|
||||
}
|
||||
|
||||
targetPhrase->GetScoreBreakdown().Assign(&ruleTable, scoreVector);
|
||||
targetPhrase->Evaluate(sourcePhrase, ruleTable.GetFeaturesToApply());
|
||||
targetPhrase->EvaluateInIsolation(sourcePhrase, ruleTable.GetFeaturesToApply());
|
||||
|
||||
TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(ruleTable, sourcePhrase, *targetPhrase, sourceLHS);
|
||||
phraseColl.Add(targetPhrase);
|
||||
|
@ -284,7 +284,7 @@ void PhraseDictionaryFuzzyMatch::InitializeForInput(InputType const& inputSenten
|
||||
std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore);
|
||||
|
||||
targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
|
||||
targetPhrase->Evaluate(sourcePhrase, GetFeaturesToApply());
|
||||
targetPhrase->EvaluateInIsolation(sourcePhrase, GetFeaturesToApply());
|
||||
|
||||
TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(rootNode, sourcePhrase, *targetPhrase, sourceLHS);
|
||||
phraseColl.Add(targetPhrase);
|
||||
|
@ -62,7 +62,7 @@ TargetPhrase *SkeletonPT::CreateTargetPhrase(const Phrase &sourcePhrase) const
|
||||
tp->GetScoreBreakdown().PlusEquals(this, scores);
|
||||
|
||||
// score of all other ff when this rule is being loaded
|
||||
tp->Evaluate(sourcePhrase, GetFeaturesToApply());
|
||||
tp->EvaluateInIsolation(sourcePhrase, GetFeaturesToApply());
|
||||
|
||||
return tp;
|
||||
}
|
||||
|
@ -499,6 +499,16 @@ namespace Moses {
|
||||
aln[k] += s2 - s1;
|
||||
Token const* o = (j->fwd ? ag.bt.T2 : ag.bt.T1)->sntStart(sid);
|
||||
float sample_weight = 1./((s2-s1+1)*(e2-e1+1));
|
||||
|
||||
vector<uint64_t> seen;
|
||||
seen.reserve(100);
|
||||
// It is possible that the phrase extraction extracts the same
|
||||
// phrase twice, e.g., when word a co-occurs with sequence b b b
|
||||
// but is aligned only to the middle word. We can only count
|
||||
// each phrase pair once per source phrase occurrence, or else
|
||||
// run the risk of having more joint counts than marginal
|
||||
// counts.
|
||||
|
||||
for (size_t s = s1; s <= s2; ++s)
|
||||
{
|
||||
sptr<iter> b = (j->fwd ? ag.bt.I2 : ag.bt.I1)->find(o+s,e1-s);
|
||||
@ -507,7 +517,26 @@ namespace Moses {
|
||||
// assert(b);
|
||||
for (size_t i = e1; i <= e2; ++i)
|
||||
{
|
||||
if (! j->stats->add(b->getPid(),sample_weight,aln,
|
||||
uint64_t tpid = b->getPid();
|
||||
size_t s = 0;
|
||||
while (s < seen.size() && seen[s] != tpid) ++s;
|
||||
if (s < seen.size())
|
||||
{
|
||||
#if 0
|
||||
size_t sid, off, len;
|
||||
parse_pid(tpid,sid,off,len);
|
||||
cerr << "HA, gotcha! " << sid << ":" << off << " at " << HERE << endl;
|
||||
for (size_t z = 0; z < len; ++z)
|
||||
{
|
||||
id_type tid = ag.bt.T2->sntStart(sid)[off+z].id();
|
||||
cerr << (*ag.bt.V2)[tid] << " ";
|
||||
}
|
||||
cerr << endl;
|
||||
#endif
|
||||
continue;
|
||||
}
|
||||
seen.push_back(tpid);
|
||||
if (! j->stats->add(tpid,sample_weight,aln,
|
||||
b->approxOccurrenceCount(),
|
||||
po_fwd,po_bwd))
|
||||
{
|
||||
|
@ -476,7 +476,7 @@ namespace Moses
|
||||
tp->AddWord(w);
|
||||
}
|
||||
tp->GetScoreBreakdown().Assign(this, fvals);
|
||||
tp->Evaluate(src);
|
||||
tp->EvaluateInIsolation(src);
|
||||
return tp;
|
||||
}
|
||||
|
||||
|
@ -71,10 +71,10 @@ void TranslationOption::CacheLexReorderingScores(const LexicalReordering &produc
|
||||
m_lexReorderingScores[&producer] = score;
|
||||
}
|
||||
|
||||
void TranslationOption::Evaluate(const InputType &input)
|
||||
void TranslationOption::EvaluateWithSourceContext(const InputType &input)
|
||||
{
|
||||
const InputPath &inputPath = GetInputPath();
|
||||
m_targetPhrase.Evaluate(input, inputPath);
|
||||
m_targetPhrase.EvaluateWithSourceContext(input, inputPath);
|
||||
}
|
||||
|
||||
const InputPath &TranslationOption::GetInputPath() const
|
||||
|
@ -135,7 +135,7 @@ public:
|
||||
return m_targetPhrase.GetScoreBreakdown();
|
||||
}
|
||||
|
||||
void Evaluate(const InputType &input);
|
||||
void EvaluateWithSourceContext(const InputType &input);
|
||||
|
||||
/** returns cached scores */
|
||||
inline const Scores *GetLexReorderingScores(const LexicalReordering *scoreProducer) const {
|
||||
|
@ -212,6 +212,12 @@ void TranslationOptionCollection::ProcessOneUnknownWord(const InputPath &inputPa
|
||||
float unknownScore = FloorScore(TransformScore(0));
|
||||
const Word &sourceWord = inputPath.GetPhrase().GetWord(0);
|
||||
|
||||
// hack. Once the OOV FF is a phrase table, get rid of this
|
||||
PhraseDictionary *firstPt = NULL;
|
||||
if (PhraseDictionary::GetColl().size() == 0) {
|
||||
firstPt = PhraseDictionary::GetColl()[0];
|
||||
}
|
||||
|
||||
// unknown word, add as trans opt
|
||||
FactorCollection &factorCollection = FactorCollection::Instance();
|
||||
|
||||
@ -231,7 +237,7 @@ void TranslationOptionCollection::ProcessOneUnknownWord(const InputPath &inputPa
|
||||
// modify the starting bitmap
|
||||
}
|
||||
|
||||
TargetPhrase targetPhrase(NULL);
|
||||
TargetPhrase targetPhrase(firstPt);
|
||||
|
||||
if (!(staticData.GetDropUnknown() || isEpsilon) || isDigit) {
|
||||
// add to dictionary
|
||||
@ -266,7 +272,7 @@ void TranslationOptionCollection::ProcessOneUnknownWord(const InputPath &inputPa
|
||||
m_unksrcs.push_back(&sourcePhrase);
|
||||
WordsRange range(sourcePos, sourcePos + length - 1);
|
||||
|
||||
targetPhrase.Evaluate(sourcePhrase);
|
||||
targetPhrase.EvaluateInIsolation(sourcePhrase);
|
||||
|
||||
TranslationOption *transOpt = new TranslationOption(range, targetPhrase);
|
||||
transOpt->SetInputPath(inputPath);
|
||||
@ -410,7 +416,7 @@ void TranslationOptionCollection::CreateTranslationOptions()
|
||||
|
||||
ProcessUnknownWord();
|
||||
|
||||
EvaluateWithSource();
|
||||
EvaluateWithSourceContext();
|
||||
|
||||
// Prune
|
||||
Prune();
|
||||
@ -535,7 +541,7 @@ void TranslationOptionCollection::SetInputScore(const InputPath &inputPath, Part
|
||||
}
|
||||
}
|
||||
|
||||
void TranslationOptionCollection::EvaluateWithSource()
|
||||
void TranslationOptionCollection::EvaluateWithSourceContext()
|
||||
{
|
||||
const size_t size = m_source.GetSize();
|
||||
for (size_t startPos = 0 ; startPos < size ; ++startPos) {
|
||||
@ -549,7 +555,7 @@ void TranslationOptionCollection::EvaluateWithSource()
|
||||
TranslationOptionList::const_iterator iterTransOpt;
|
||||
for(iterTransOpt = transOptList.begin() ; iterTransOpt != transOptList.end() ; ++iterTransOpt) {
|
||||
TranslationOption &transOpt = **iterTransOpt;
|
||||
transOpt.Evaluate(m_source);
|
||||
transOpt.EvaluateWithSourceContext(m_source);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -96,7 +96,7 @@ protected:
|
||||
//! implemented by inherited class, called by this class
|
||||
virtual void ProcessUnknownWord(size_t sourcePos)=0;
|
||||
|
||||
void EvaluateWithSource();
|
||||
void EvaluateWithSourceContext();
|
||||
|
||||
void CacheLexReordering();
|
||||
|
||||
|
@ -147,7 +147,7 @@ void TranslationOptionCollectionLattice::CreateTranslationOptions()
|
||||
const TargetPhrase &tp = **iter;
|
||||
TranslationOption *transOpt = new TranslationOption(range, tp);
|
||||
transOpt->SetInputPath(path);
|
||||
transOpt->Evaluate(m_source);
|
||||
transOpt->EvaluateWithSourceContext(m_source);
|
||||
|
||||
Add(transOpt);
|
||||
}
|
||||
|
@ -5,6 +5,7 @@
|
||||
#include "Util.h"
|
||||
#include "XmlOption.h"
|
||||
#include "FactorCollection.h"
|
||||
#include "moses/TranslationModel/PhraseDictionary.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
@ -30,6 +31,12 @@ bool TreeInput::ProcessAndStripXMLTags(string &line, std::vector<XMLParseOutput>
|
||||
return true;
|
||||
}
|
||||
|
||||
// hack. What pt should XML trans opt be assigned to?
|
||||
PhraseDictionary *firstPt = NULL;
|
||||
if (PhraseDictionary::GetColl().size() == 0) {
|
||||
firstPt = PhraseDictionary::GetColl()[0];
|
||||
}
|
||||
|
||||
// break up input into a vector of xml tags and text
|
||||
// example: (this), (<b>), (is a), (</b>), (test .)
|
||||
vector<string> xmlTokens = TokenizeXml(line);
|
||||
@ -173,7 +180,7 @@ bool TreeInput::ProcessAndStripXMLTags(string &line, std::vector<XMLParseOutput>
|
||||
//TRACE_ERR("number of translations: " << altTexts.size() << endl);
|
||||
for (size_t i=0; i<altTexts.size(); ++i) {
|
||||
// set target phrase
|
||||
TargetPhrase targetPhrase(NULL);
|
||||
TargetPhrase targetPhrase(firstPt);
|
||||
// targetPhrase.CreateFromString(Output, outputFactorOrder,altTexts[i],factorDelimiter, NULL);
|
||||
targetPhrase.CreateFromString(Output, outputFactorOrder,altTexts[i], NULL);
|
||||
|
||||
@ -203,7 +210,7 @@ bool TreeInput::ProcessAndStripXMLTags(string &line, std::vector<XMLParseOutput>
|
||||
// convert from prob to log-prob
|
||||
float scoreValue = FloorScore(TransformScore(probValue));
|
||||
targetPhrase.SetXMLScore(scoreValue);
|
||||
targetPhrase.Evaluate(sourcePhrase);
|
||||
targetPhrase.EvaluateInIsolation(sourcePhrase);
|
||||
|
||||
// set span and create XmlOption
|
||||
WordsRange range(startPos+1,endPos);
|
||||
|
@ -30,6 +30,7 @@
|
||||
#include "TargetPhrase.h"
|
||||
#include "ReorderingConstraint.h"
|
||||
#include "FactorCollection.h"
|
||||
#include "moses/TranslationModel/PhraseDictionary.h"
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
@ -160,6 +161,12 @@ bool ProcessAndStripXMLTags(string &line, vector<XmlOption*> &res, ReorderingCon
|
||||
|
||||
const StaticData &staticData = StaticData::Instance();
|
||||
|
||||
// hack. What pt should XML trans opt be assigned to?
|
||||
PhraseDictionary *firstPt = NULL;
|
||||
if (PhraseDictionary::GetColl().size() == 0) {
|
||||
firstPt = PhraseDictionary::GetColl()[0];
|
||||
}
|
||||
|
||||
// no xml tag? we're done.
|
||||
//if (line.find_first_of('<') == string::npos) {
|
||||
if (line.find(lbrackStr) == string::npos) {
|
||||
@ -361,7 +368,7 @@ bool ProcessAndStripXMLTags(string &line, vector<XmlOption*> &res, ReorderingCon
|
||||
float scoreValue = FloorScore(TransformScore(probValue));
|
||||
|
||||
WordsRange range(startPos + offset,endPos-1 + offset); // span covered by phrase
|
||||
TargetPhrase targetPhrase(NULL);
|
||||
TargetPhrase targetPhrase(firstPt);
|
||||
// targetPhrase.CreateFromString(Output, outputFactorOrder,altTexts[i],factorDelimiter, NULL);
|
||||
targetPhrase.CreateFromString(Output, outputFactorOrder,altTexts[i], NULL);
|
||||
|
||||
@ -375,7 +382,7 @@ bool ProcessAndStripXMLTags(string &line, vector<XmlOption*> &res, ReorderingCon
|
||||
}
|
||||
|
||||
targetPhrase.SetXMLScore(scoreValue);
|
||||
targetPhrase.Evaluate(sourcePhrase);
|
||||
targetPhrase.EvaluateInIsolation(sourcePhrase);
|
||||
|
||||
XmlOption *option = new XmlOption(range,targetPhrase);
|
||||
assert(option);
|
||||
|
159
phrase-extract/PropertiesConsolidator.cpp
Normal file
159
phrase-extract/PropertiesConsolidator.cpp
Normal file
@ -0,0 +1,159 @@
|
||||
/***********************************************************************
|
||||
Moses - factored phrase-based language decoder
|
||||
Copyright (C) University of Edinburgh
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
This library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
***********************************************************************/
|
||||
|
||||
#include "PropertiesConsolidator.h"
|
||||
|
||||
#include <sstream>
|
||||
#include <limits>
|
||||
#include <vector>
|
||||
|
||||
#include "moses/Util.h"
|
||||
#include "phrase-extract/InputFileStream.h"
|
||||
#include "phrase-extract/OutputFileStream.h"
|
||||
|
||||
|
||||
namespace MosesTraining
|
||||
{
|
||||
|
||||
void PropertiesConsolidator::ActivateSourceLabelsProcessing(const std::string &sourceLabelSetFile)
|
||||
{
|
||||
Moses::InputFileStream inFile(sourceLabelSetFile);
|
||||
|
||||
// read source label set
|
||||
m_sourceLabels.clear();
|
||||
std::string line;
|
||||
while (getline(inFile, line)) {
|
||||
std::istringstream tokenizer(line);
|
||||
std::string label;
|
||||
size_t index;
|
||||
try {
|
||||
tokenizer >> label >> index;
|
||||
} catch (const std::exception &e) {
|
||||
UTIL_THROW2("Error reading source label set file " << sourceLabelSetFile << " .");
|
||||
}
|
||||
std::pair< std::map<std::string,size_t>::iterator, bool > inserted = m_sourceLabels.insert( std::pair<std::string,size_t>(label,index) );
|
||||
UTIL_THROW_IF2(!inserted.second,"Source label set file " << sourceLabelSetFile << " should contain each syntactic label only once.");
|
||||
}
|
||||
|
||||
inFile.Close();
|
||||
|
||||
m_sourceLabelsFlag = true;
|
||||
}
|
||||
|
||||
|
||||
std::string PropertiesConsolidator::ProcessPropertiesString(const std::string &propertiesString) const
|
||||
{
|
||||
if ( propertiesString.empty() ) {
|
||||
return propertiesString;
|
||||
}
|
||||
|
||||
std::ostringstream out;
|
||||
std::vector<std::string> toks;
|
||||
Moses::TokenizeMultiCharSeparator(toks, propertiesString, "{{");
|
||||
for (size_t i = 1; i < toks.size(); ++i) {
|
||||
std::string &tok = toks[i];
|
||||
if (tok.empty()) {
|
||||
continue;
|
||||
}
|
||||
size_t endPos = tok.rfind("}");
|
||||
tok = tok.substr(0, endPos - 1);
|
||||
std::vector<std::string> keyValue = Moses::TokenizeFirstOnly(tok, " ");
|
||||
assert(keyValue.size() == 2);
|
||||
|
||||
if ( !keyValue[0].compare("SourceLabels") ) {
|
||||
|
||||
if ( m_sourceLabelsFlag ) {
|
||||
|
||||
// SourceLabels additional property: replace strings with vocabulary indices
|
||||
out << " {{" << keyValue[0];
|
||||
|
||||
std::istringstream tokenizer(keyValue[1]);
|
||||
|
||||
size_t nNTs;
|
||||
double totalCount;
|
||||
|
||||
if (! (tokenizer >> nNTs)) { // first token: number of non-terminals (incl. left-hand side)
|
||||
UTIL_THROW2("Not able to read number of non-terminals from SourceLabels property. "
|
||||
<< "Flawed SourceLabels property?");
|
||||
}
|
||||
assert( nNTs > 0 );
|
||||
out << " " << nNTs;
|
||||
|
||||
if (! (tokenizer >> totalCount)) { // second token: overall rule count
|
||||
UTIL_THROW2("Not able to read overall rule count from SourceLabels property. "
|
||||
<< "Flawed SourceLabels property?");
|
||||
}
|
||||
assert( totalCount > 0.0 );
|
||||
out << " " << totalCount;
|
||||
|
||||
while (tokenizer.peek() != EOF) {
|
||||
try {
|
||||
|
||||
size_t numberOfLHSsGivenRHS = std::numeric_limits<std::size_t>::max();
|
||||
|
||||
std::string token;
|
||||
|
||||
if (nNTs > 1) { // rule has right-hand side non-terminals, i.e. it's a hierarchical rule
|
||||
for (size_t i=0; i<nNTs-1; ++i) { // RHS source non-terminal labels
|
||||
tokenizer >> token; // RHS source non-terminal label
|
||||
std::map<std::string,size_t>::const_iterator found = m_sourceLabels.find(token);
|
||||
UTIL_THROW_IF2(found == m_sourceLabels.end(), "Label \"" << token << "\" from the phrase table not found in given label set.");
|
||||
out << " " << found->second;
|
||||
}
|
||||
|
||||
tokenizer >> token; // sourceLabelsRHSCount
|
||||
out << " " << token;
|
||||
|
||||
tokenizer >> numberOfLHSsGivenRHS;
|
||||
out << " " << numberOfLHSsGivenRHS;
|
||||
}
|
||||
|
||||
for (size_t i=0; i<numberOfLHSsGivenRHS && tokenizer.peek()!=EOF; ++i) { // LHS source non-terminal labels seen with this RHS
|
||||
tokenizer >> token; // LHS source non-terminal label
|
||||
std::map<std::string,size_t>::const_iterator found = m_sourceLabels.find(token);
|
||||
UTIL_THROW_IF2(found == m_sourceLabels.end() ,"Label \"" << token << "\" from the phrase table not found in given label set.");
|
||||
out << " " << found->second;
|
||||
|
||||
tokenizer >> token; // ruleSourceLabelledCount
|
||||
out << " " << token;
|
||||
}
|
||||
|
||||
} catch (const std::exception &e) {
|
||||
UTIL_THROW2("Flawed item in SourceLabels property?");
|
||||
}
|
||||
}
|
||||
|
||||
out << "}}";
|
||||
|
||||
} else { // don't process source labels additional property
|
||||
out << " {{" << keyValue[0] << " " << keyValue[1] << "}}";
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
// output other additional property
|
||||
out << " {{" << keyValue[0] << " " << keyValue[1] << "}}";
|
||||
}
|
||||
}
|
||||
|
||||
return out.str();
|
||||
}
|
||||
|
||||
} // namespace MosesTraining
|
||||
|
48
phrase-extract/PropertiesConsolidator.h
Normal file
48
phrase-extract/PropertiesConsolidator.h
Normal file
@ -0,0 +1,48 @@
|
||||
/***********************************************************************
|
||||
Moses - factored phrase-based language decoder
|
||||
Copyright (C) University of Edinburgh
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
This library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
***********************************************************************/
|
||||
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <map>
|
||||
|
||||
|
||||
namespace MosesTraining
|
||||
{
|
||||
|
||||
// Rewrites the additional-properties field of a phrase table entry while
// direct and indirect rule tables are being consolidated.
// By default no property gets special treatment; source-label handling has
// to be switched on explicitly via ActivateSourceLabelsProcessing().
class PropertiesConsolidator
|
||||
{
|
||||
public:
|
||||
|
||||
// Starts with source-label processing disabled (m_sourceLabelsFlag == false).
PropertiesConsolidator() : m_sourceLabelsFlag(false) {};
|
||||
|
||||
// Enables processing of the SourceLabels property.
// sourceLabelSetFile: path to the source label set file.
// NOTE(review): definition not visible here; presumably it fills
// m_sourceLabels from that file and sets m_sourceLabelsFlag -- confirm.
void ActivateSourceLabelsProcessing(const std::string &sourceLabelSetFile);
|
||||
|
||||
// Returns a rewritten copy of propertiesString.
// For a SourceLabels property that is being processed, each label name is
// replaced by the value stored for it in m_sourceLabels; a label missing
// from that map raises an error (UTIL_THROW_IF2). All other properties are
// re-emitted as " {{key value}}".
std::string ProcessPropertiesString(const std::string &propertiesString) const;
|
||||
|
||||
private:
|
||||
|
||||
// True once source-label processing has been activated.
bool m_sourceLabelsFlag;
|
||||
// Source label name -> numeric value written to the output in its place.
std::map<std::string,size_t> m_sourceLabels;
|
||||
|
||||
};
|
||||
|
||||
} // namespace MosesTraining
|
||||
|
@ -28,6 +28,7 @@
|
||||
#include "tables-core.h"
|
||||
#include "InputFileStream.h"
|
||||
#include "OutputFileStream.h"
|
||||
#include "PropertiesConsolidator.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
@ -37,13 +38,14 @@ bool phraseCountFlag = false;
|
||||
bool lowCountFlag = false;
|
||||
bool goodTuringFlag = false;
|
||||
bool kneserNeyFlag = false;
|
||||
bool sourceLabelsFlag = false;
|
||||
bool logProbFlag = false;
|
||||
// Returns log(a) when the global logProbFlag is set (command-line option
// --LogProb); otherwise returns a unchanged.
inline float maybeLogProb( float a )
|
||||
{
|
||||
return logProbFlag ? log(a) : a;
|
||||
}
|
||||
|
||||
void processFiles( char*, char*, char*, char* );
|
||||
void processFiles( char*, char*, char*, char*, char* );
|
||||
void loadCountOfCounts( char* );
|
||||
void breakdownCoreAndSparse( string combined, string &core, string &sparse );
|
||||
bool getLine( istream &fileP, vector< string > &item );
|
||||
@ -57,13 +59,14 @@ int main(int argc, char* argv[])
|
||||
<< "consolidating direct and indirect rule tables\n";
|
||||
|
||||
if (argc < 4) {
|
||||
cerr << "syntax: consolidate phrase-table.direct phrase-table.indirect phrase-table.consolidated [--Hierarchical] [--OnlyDirect] [--PhraseCount] \n";
|
||||
cerr << "syntax: consolidate phrase-table.direct phrase-table.indirect phrase-table.consolidated [--Hierarchical] [--OnlyDirect] [--PhraseCount] [--GoodTuring counts-of-counts-file] [--KneserNey counts-of-counts-file] [--LowCountFeature] [--SourceLabels source-labels-file] \n";
|
||||
exit(1);
|
||||
}
|
||||
char* &fileNameDirect = argv[1];
|
||||
char* &fileNameIndirect = argv[2];
|
||||
char* &fileNameConsolidated = argv[3];
|
||||
char* fileNameCountOfCounts;
|
||||
char* fileNameSourceLabelSet;
|
||||
|
||||
for(int i=4; i<argc; i++) {
|
||||
if (strcmp(argv[i],"--Hierarchical") == 0) {
|
||||
@ -114,13 +117,21 @@ int main(int argc, char* argv[])
|
||||
} else if (strcmp(argv[i],"--LogProb") == 0) {
|
||||
logProbFlag = true;
|
||||
cerr << "using log-probabilities\n";
|
||||
} else if (strcmp(argv[i],"--SourceLabels") == 0) {
|
||||
sourceLabelsFlag = true;
|
||||
if (i+1==argc) {
|
||||
cerr << "ERROR: specify source label set file!\n";
|
||||
exit(1);
|
||||
}
|
||||
fileNameSourceLabelSet = argv[++i];
|
||||
cerr << "processing source labels property\n";
|
||||
} else {
|
||||
cerr << "ERROR: unknown option " << argv[i] << endl;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated, fileNameCountOfCounts );
|
||||
processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated, fileNameCountOfCounts, fileNameSourceLabelSet );
|
||||
}
|
||||
|
||||
vector< float > countOfCounts;
|
||||
@ -169,7 +180,7 @@ void loadCountOfCounts( char* fileNameCountOfCounts )
|
||||
if (kneserNey_D3 > 2.9) kneserNey_D3 = 2.9;
|
||||
}
|
||||
|
||||
void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated, char* fileNameCountOfCounts )
|
||||
void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated, char* fileNameCountOfCounts, char* fileNameSourceLabelSet )
|
||||
{
|
||||
if (goodTuringFlag || kneserNeyFlag)
|
||||
loadCountOfCounts( fileNameCountOfCounts );
|
||||
@ -198,6 +209,13 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// create properties consolidator
|
||||
// (in case any additional phrase property requires further processing)
|
||||
MosesTraining::PropertiesConsolidator propertiesConsolidator = MosesTraining::PropertiesConsolidator();
|
||||
if (sourceLabelsFlag) {
|
||||
propertiesConsolidator.ActivateSourceLabelsProcessing(fileNameSourceLabelSet);
|
||||
}
|
||||
|
||||
// loop through all extracted phrase translations
|
||||
int i=0;
|
||||
while(true) {
|
||||
@ -307,12 +325,13 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
|
||||
// counts, for debugging
|
||||
fileConsolidated << "||| " << countE << " " << countF << " " << countEF;
|
||||
|
||||
// count bin feature (as a sparse feature)
|
||||
// sparse features
|
||||
fileConsolidated << " |||";
|
||||
if (directSparseScores.compare("") != 0)
|
||||
fileConsolidated << " " << directSparseScores;
|
||||
if (indirectSparseScores.compare("") != 0)
|
||||
fileConsolidated << " " << indirectSparseScores;
|
||||
// count bin feature (as a sparse feature)
|
||||
if (sparseCountBinFeatureFlag) {
|
||||
bool foundBin = false;
|
||||
for(size_t i=0; i < countBin.size(); i++) {
|
||||
@ -332,9 +351,13 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
|
||||
}
|
||||
|
||||
// arbitrary key-value pairs
|
||||
fileConsolidated << " ||| ";
|
||||
fileConsolidated << " |||";
|
||||
if (itemDirect.size() >= 6) {
|
||||
fileConsolidated << itemDirect[5];
|
||||
//if (sourceLabelsFlag) {
|
||||
fileConsolidated << propertiesConsolidator.ProcessPropertiesString(itemDirect[5]);
|
||||
//} else {
|
||||
// fileConsolidated << itemDirect[5];
|
||||
//}
|
||||
}
|
||||
|
||||
fileConsolidated << endl;
|
||||
|
@ -248,7 +248,7 @@ int ExtractGHKM::Main(int argc, char *argv[])
|
||||
|
||||
const std::vector<const Subgraph *> &rules = (*p)->GetRules();
|
||||
|
||||
REO_POS l2rOrientation, r2lOrientation;
|
||||
REO_POS l2rOrientation=UNKNOWN, r2lOrientation=UNKNOWN;
|
||||
if (options.phraseOrientation && !rules.empty()) {
|
||||
int sourceSpanBegin = *((*p)->GetSpan().begin());
|
||||
int sourceSpanEnd = *((*p)->GetSpan().rbegin());
|
||||
@ -617,9 +617,8 @@ void ExtractGHKM::WriteGlueGrammar(
|
||||
}
|
||||
}
|
||||
|
||||
std::string sourceTopLabel = "TOPLABEL";
|
||||
std::string sourceSLabel = "S";
|
||||
std::string sourceSomeLabel = "SOMELABEL";
|
||||
size_t sourceLabelGlueTop = 0;
|
||||
size_t sourceLabelGlueX = 1;
|
||||
|
||||
// basic rules
|
||||
out << "<s> [X] ||| <s> [" << topLabel << "] ||| 1 ||| ||| ||| |||";
|
||||
@ -627,7 +626,7 @@ void ExtractGHKM::WriteGlueGrammar(
|
||||
out << " {{Tree [" << topLabel << " <s>]}}";
|
||||
}
|
||||
if (options.sourceLabels) {
|
||||
out << " {{SourceLabels 1 1 " << sourceTopLabel << " 1}}";
|
||||
out << " {{SourceLabels 1 1 " << sourceLabelGlueTop << " 1}}";
|
||||
}
|
||||
out << std::endl;
|
||||
|
||||
@ -636,7 +635,7 @@ void ExtractGHKM::WriteGlueGrammar(
|
||||
out << " {{Tree [" << topLabel << " [" << topLabel << "] </s>]}}";
|
||||
}
|
||||
if (options.sourceLabels) {
|
||||
out << " {{SourceLabels 2 1 " << sourceTopLabel << " 1 1 " << sourceTopLabel << " 1}}";
|
||||
out << " {{SourceLabels 2 1 " << sourceLabelGlueTop << " 1 1 " << sourceLabelGlueTop << " 1}}";
|
||||
}
|
||||
out << std::endl;
|
||||
|
||||
@ -648,7 +647,7 @@ void ExtractGHKM::WriteGlueGrammar(
|
||||
out << " {{Tree [" << topLabel << " <s> [" << i->first << "] </s>]}}";
|
||||
}
|
||||
if (options.sourceLabels) {
|
||||
out << " {{SourceLabels 2 1 " << sourceSLabel << " 1 1 " << sourceTopLabel << " 1}}";
|
||||
out << " {{SourceLabels 2 1 " << sourceLabelGlueX << " 1 1 " << sourceLabelGlueTop << " 1}}";
|
||||
}
|
||||
out << std::endl;
|
||||
}
|
||||
@ -661,7 +660,7 @@ void ExtractGHKM::WriteGlueGrammar(
|
||||
out << " {{Tree [" << topLabel << " ["<< topLabel << "] [" << *i << "]]}}";
|
||||
}
|
||||
if (options.sourceLabels) {
|
||||
out << " {{SourceLabels 3 2.718 " << sourceTopLabel << " " << sourceSomeLabel << " 2.718 1 " << sourceTopLabel << " 2.718}}"; // TODO: there should be better options than using "SOMELABEL"
|
||||
out << " {{SourceLabels 3 2.718 " << sourceLabelGlueTop << " " << sourceLabelGlueX << " 2.718 1 " << sourceLabelGlueTop << " 2.718}}"; // TODO: there should be better options than using "SOMELABEL"
|
||||
}
|
||||
out << std::endl;
|
||||
}
|
||||
@ -672,7 +671,7 @@ void ExtractGHKM::WriteGlueGrammar(
|
||||
out << " {{Tree [" << topLabel << " [" << topLabel << "] [X]]}}";
|
||||
}
|
||||
if (options.sourceLabels) {
|
||||
out << " {{SourceLabels 3 1 " << sourceTopLabel << " " << sourceSomeLabel << " 1 1 " << sourceTopLabel << " 1}}"; // TODO: there should be better options than using "SOMELABEL"
|
||||
out << " {{SourceLabels 3 1 " << sourceLabelGlueTop << " " << sourceLabelGlueX << " 1 1 " << sourceLabelGlueTop << " 1}}"; // TODO: there should be better options than using "SOMELABEL"
|
||||
}
|
||||
out << std::endl;
|
||||
}
|
||||
|
@ -187,7 +187,7 @@ const std::string PhraseOrientation::GetOrientationInfoString(int startF, int en
|
||||
|
||||
const std::string PhraseOrientation::GetOrientationInfoString(int startF, int startE, int endF, int endE, REO_DIR direction) const
|
||||
{
|
||||
REO_POS hierPrevOrient, hierNextOrient;
|
||||
REO_POS hierPrevOrient=UNKNOWN, hierNextOrient=UNKNOWN;
|
||||
|
||||
bool connectedLeftTopP = IsAligned( startF-1, startE-1 );
|
||||
bool connectedRightTopP = IsAligned( endF+1, startE-1 );
|
||||
|
@ -1860,7 +1860,7 @@ sub define_tuning_tune {
|
||||
$cmd .= " --lambdas \"$lambda\"" if $lambda;
|
||||
$cmd .= " --continue" if $tune_continue;
|
||||
$cmd .= " --skip-decoder" if $skip_decoder;
|
||||
$cmd .= " --inputtype $tune_inputtype" if $tune_inputtype;
|
||||
$cmd .= " --inputtype $tune_inputtype" if defined($tune_inputtype);
|
||||
|
||||
my $qsub_args = &get_qsub_args("TUNING");
|
||||
$cmd .= " --queue-flags=\"$qsub_args\"" if ($CLUSTER && $qsub_args);
|
||||
@ -2217,6 +2217,10 @@ sub define_training_extract_phrases {
|
||||
my $phrase_orientation_priors_file = &versionize(&long_file_name("phrase-orientation-priors","model",""));
|
||||
$cmd .= "-phrase-orientation-priors-file $phrase_orientation_priors_file ";
|
||||
}
|
||||
|
||||
if (&get("TRAINING:ghkm-source-labels")) {
|
||||
$cmd .= "-ghkm-source-labels ";
|
||||
}
|
||||
}
|
||||
|
||||
my $extract_settings = &get("TRAINING:extract-settings");
|
||||
@ -2254,6 +2258,11 @@ sub define_training_build_ttable {
|
||||
my $phrase_orientation_priors_file = &versionize(&long_file_name("phrase-orientation-priors","model",""));
|
||||
$cmd .= "-phrase-orientation-priors-file $phrase_orientation_priors_file ";
|
||||
}
|
||||
if (&get("TRAINING:ghkm-source-labels")) {
|
||||
$cmd .= "-ghkm-source-labels ";
|
||||
my $source_labels_file = &versionize(&long_file_name("source-labels","model",""));
|
||||
$cmd .= "-ghkm-source-labels-file $source_labels_file ";
|
||||
}
|
||||
}
|
||||
|
||||
&create_step($step_id,$cmd);
|
||||
@ -2438,6 +2447,12 @@ sub define_training_create_config {
|
||||
}
|
||||
}
|
||||
|
||||
if (&get("TRAINING:ghkm-source-labels")) {
|
||||
$cmd .= "-ghkm-source-labels ";
|
||||
my $source_labels_file = &versionize(&long_file_name("source-labels","model",""));
|
||||
$cmd .= "-ghkm-source-labels-file $source_labels_file ";
|
||||
}
|
||||
|
||||
# sparse lexical features provide additional content for config file
|
||||
$cmd .= "-additional-ini-file $sparse_lexical_features.ini " if $sparse_lexical_features;
|
||||
|
||||
@ -3412,7 +3427,7 @@ sub check_backoff_and_get_array {
|
||||
# the following two functions deal with getting information about
|
||||
# files that are passed between steps. These are either specified
|
||||
# in the meta file (default) or in the configuration file (here called
|
||||
# 'specified', in the step management refered to as 'given').
|
||||
# 'specified', in the step management referred to as 'given').
|
||||
|
||||
sub get_specified_or_default_file {
|
||||
my ($specified_module,$specified_set,$specified_parameter,
|
||||
|
@ -219,14 +219,14 @@ foreach (@children) {
|
||||
waitpid($_, 0);
|
||||
}
|
||||
|
||||
# glue rules
|
||||
# merge glue rules
|
||||
if (defined($glueFile)) {
|
||||
my $cmd = "cat $TMPDIR/glue.* | LC_ALL=C sort | uniq > $glueFile";
|
||||
print STDERR "Merging glue rules: $cmd \n";
|
||||
print STDERR `$cmd`;
|
||||
}
|
||||
|
||||
# phrase orientation priors (GHKM extraction)
|
||||
# merge phrase orientation priors (GHKM extraction)
|
||||
if ($phraseOrientation && defined($phraseOrientationPriorsFile)) {
|
||||
print STDERR "Merging phrase orientation priors\n";
|
||||
|
||||
|
@ -27,10 +27,22 @@ my $scoreCmd = $ARGV[2];
|
||||
my $extractFile = $ARGV[3]; # 1st arg of extract argument
|
||||
my $lexFile = $ARGV[4];
|
||||
my $ptHalf = $ARGV[5]; # output
|
||||
my $inverse = 0;
|
||||
my $sourceLabelsFile;
|
||||
|
||||
my $otherExtractArgs= "";
|
||||
for (my $i = 6; $i < $#ARGV; ++$i)
|
||||
{
|
||||
if ($ARGV[$i] eq '--SourceLabels') {
|
||||
$sourceLabelsFile = $ARGV[++$i];
|
||||
$otherExtractArgs .= "--SourceLabels --SourceLabelCountsLHS --SourceLabelSet ";
|
||||
next;
|
||||
}
|
||||
if ($ARGV[$i] eq '--Inverse') {
|
||||
$inverse = 1;
|
||||
$otherExtractArgs .= $ARGV[$i] ." ";
|
||||
next;
|
||||
}
|
||||
$otherExtractArgs .= $ARGV[$i] ." ";
|
||||
}
|
||||
#$scoreCmd $extractFile $lexFile $ptHalf $otherExtractArgs
|
||||
@ -258,6 +270,14 @@ if (-e $cocPath)
|
||||
close(FHCOC);
|
||||
}
|
||||
|
||||
# merge source label files
|
||||
if (!$inverse && defined($sourceLabelsFile))
|
||||
{
|
||||
my $cmd = "(echo \"GlueTop 0\"; echo \"GlueX 1\"; cat $TMPDIR/phrase-table.half.*.gz.syntaxLabels.src | LC_ALL=C sort | uniq | perl -pe \"s/\$/ \@{[\$.+1]}/\") > $sourceLabelsFile";
|
||||
print STDERR "Merging source label files: $cmd \n";
|
||||
`$cmd`;
|
||||
}
|
||||
|
||||
$cmd = "rm -rf $TMPDIR \n";
|
||||
print STDERR $cmd;
|
||||
systemCheck($cmd);
|
||||
|
@ -2,4 +2,7 @@ The language suffix can be found here:
|
||||
|
||||
http://www.loc.gov/standards/iso639-2/php/code_list.php
|
||||
|
||||
This code includes data from Daniel Naber's Language Tools (czech abbreviations).
|
||||
This code includes data from czech wiktionary (also czech abbreviations).
|
||||
|
||||
|
||||
|
5
scripts/tokenizer/basic-protected-patterns
Normal file
5
scripts/tokenizer/basic-protected-patterns
Normal file
@ -0,0 +1,5 @@
|
||||
<\/?\S+\/?>
|
||||
<\S+( [a-zA-Z0-9]+\=\"?[^\"]\")+ ?\/?>
|
||||
<\S+( [a-zA-Z0-9]+\=\'?[^\']\')+ ?\/?>
|
||||
(\w\-\_\.)+\@((\w\-\_)+\.)+[a-zA-Z]{2,}
|
||||
(http[s]?|ftp):\/\/[^:\/\s]+(\/\w+)*\/[\w\-\.]+
|
@ -232,15 +232,20 @@ sub tokenize
|
||||
# Find protected patterns
|
||||
my @protected = ();
|
||||
foreach my $protected_pattern (@protected_patterns) {
|
||||
foreach ($text =~ /($protected_pattern)/) {
|
||||
push @protected, $_;
|
||||
my $t = $text;
|
||||
while ($t =~ /($protected_pattern)(.*)$/) {
|
||||
push @protected, $1;
|
||||
$t = $2;
|
||||
}
|
||||
}
|
||||
|
||||
for (my $i = 0; $i < scalar(@protected); ++$i) {
|
||||
my $subst = sprintf("THISISPROTECTED%.3d", $i);
|
||||
$text =~ s,\Q$protected[$i],$subst,g;
|
||||
$text =~ s,\Q$protected[$i], $subst ,g;
|
||||
}
|
||||
$text =~ s/ +/ /g;
|
||||
$text =~ s/^ //g;
|
||||
$text =~ s/ $//g;
|
||||
|
||||
# separate out all "other" special characters
|
||||
$text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
|
||||
|
@ -127,8 +127,8 @@ my $___NOCASE = 0;
|
||||
# Use "--nonorm" to not normalize translations before computing scores
|
||||
my $___NONORM = 0;
|
||||
|
||||
# set 0 if input type is text, set 1 if input type is confusion network
|
||||
my $___INPUTTYPE = 0;
|
||||
# set 0 if input type is text, set 1 if input type is confusion network, set 3 if input type is parse tree
|
||||
my $___INPUTTYPE;
|
||||
|
||||
|
||||
my $mertdir = undef; # path to new mert directory
|
||||
@ -1228,14 +1228,18 @@ sub run_decoder {
|
||||
|
||||
if (defined $___JOBS && $___JOBS > 0) {
|
||||
die "Hypergraph mira not supported by moses-parallel" if $___HG_MIRA;
|
||||
$decoder_cmd = "$moses_parallel_cmd $pass_old_sge -config $___CONFIG -inputtype $___INPUTTYPE -qsub-prefix mert$run -queue-parameters \"$queue_flags\" -decoder-parameters \"$___DECODER_FLAGS $decoder_config\" $lsamp_cmd -n-best-list \"$filename $___N_BEST_LIST_SIZE\" -input-file $___DEV_F -jobs $___JOBS -decoder $___DECODER > run$run.out";
|
||||
$decoder_cmd = "$moses_parallel_cmd $pass_old_sge -config $___CONFIG";
|
||||
$decoder_cmd .= " -inputtype $___INPUTTYPE" if defined($___INPUTTYPE);
|
||||
$decoder_cmd .= " -qsub-prefix mert$run -queue-parameters \"$queue_flags\" -decoder-parameters \"$___DECODER_FLAGS $decoder_config\" $lsamp_cmd -n-best-list \"$filename $___N_BEST_LIST_SIZE distinct\" -input-file $___DEV_F -jobs $___JOBS -decoder $___DECODER > run$run.out";
|
||||
} else {
|
||||
my $nbest_list_cmd = "-n-best-list $filename $___N_BEST_LIST_SIZE";
|
||||
my $nbest_list_cmd = "-n-best-list $filename $___N_BEST_LIST_SIZE distinct";
|
||||
if ($___HG_MIRA) {
|
||||
safesystem("rm -rf $hypergraph_dir");
|
||||
$nbest_list_cmd = "-output-search-graph-hypergraph true gz";
|
||||
}
|
||||
$decoder_cmd = "$___DECODER $___DECODER_FLAGS -config $___CONFIG -inputtype $___INPUTTYPE $decoder_config $lsamp_cmd $nbest_list_cmd -input-file $___DEV_F > run$run.out";
|
||||
$decoder_cmd = "$___DECODER $___DECODER_FLAGS -config $___CONFIG";
|
||||
$decoder_cmd .= " -inputtype $___INPUTTYPE" if defined($___INPUTTYPE);
|
||||
$decoder_cmd .= " $decoder_config $lsamp_cmd $nbest_list_cmd -input-file $___DEV_F > run$run.out";
|
||||
}
|
||||
|
||||
print STDERR "Executing: $decoder_cmd \n";
|
||||
@ -1309,7 +1313,9 @@ sub get_featlist_from_moses {
|
||||
print STDERR "Using cached features list: $featlistfn\n";
|
||||
} else {
|
||||
print STDERR "Asking moses for feature names and values from $___CONFIG\n";
|
||||
my $cmd = "$___DECODER $___DECODER_FLAGS -config $configfn -inputtype $___INPUTTYPE -show-weights > $featlistfn";
|
||||
my $cmd = "$___DECODER $___DECODER_FLAGS -config $configfn";
|
||||
$cmd .= " -inputtype $___INPUTTYPE" if defined($___INPUTTYPE);
|
||||
$cmd .= " -show-weights > $featlistfn";
|
||||
print STDERR "Executing: $cmd\n";
|
||||
safesystem($cmd) or die "Failed to run moses with the config $configfn";
|
||||
}
|
||||
|
@ -32,7 +32,7 @@ my($_EXTERNAL_BINDIR, $_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_
|
||||
$_DECODING_STEPS, $_PARALLEL, $_FACTOR_DELIMITER, @_PHRASE_TABLE,
|
||||
@_REORDERING_TABLE, @_GENERATION_TABLE, @_GENERATION_TYPE, $_GENERATION_CORPUS,
|
||||
$_DONT_ZIP, $_MGIZA, $_MGIZA_CPUS, $_SNT2COOC, $_HMM_ALIGN, $_CONFIG, $_OSM, $_OSM_FACTORS, $_POST_DECODING_TRANSLIT, $_TRANSLITERATION_PHRASE_TABLE,
|
||||
$_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_GHKM_TREE_FRAGMENTS,$_GHKM_PHRASE_ORIENTATION,$_PHRASE_ORIENTATION_PRIORS_FILE,$_PCFG,@_EXTRACT_OPTIONS,@_SCORE_OPTIONS,
|
||||
$_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_GHKM_TREE_FRAGMENTS,$_GHKM_PHRASE_ORIENTATION,$_PHRASE_ORIENTATION_PRIORS_FILE,$_GHKM_SOURCE_LABELS,$_GHKM_SOURCE_LABELS_FILE,$_PCFG,@_EXTRACT_OPTIONS,@_SCORE_OPTIONS,
|
||||
$_ALT_DIRECT_RULE_SCORE_1, $_ALT_DIRECT_RULE_SCORE_2, $_UNKNOWN_WORD_SOFT_MATCHES_FILE,
|
||||
$_OMIT_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES,
|
||||
$_MEMSCORE, $_FINAL_ALIGNMENT_MODEL,
|
||||
@ -112,6 +112,8 @@ $_HELP = 1
|
||||
'ghkm-tree-fragments' => \$_GHKM_TREE_FRAGMENTS,
|
||||
'ghkm-phrase-orientation' => \$_GHKM_PHRASE_ORIENTATION,
|
||||
'phrase-orientation-priors-file=s' => \$_PHRASE_ORIENTATION_PRIORS_FILE, # currently relevant for GHKM extraction only; phrase orientation for PBT has different implementation
|
||||
'ghkm-source-labels' => \$_GHKM_SOURCE_LABELS,
|
||||
'ghkm-source-labels-file=s' => \$_GHKM_SOURCE_LABELS_FILE,
|
||||
'pcfg' => \$_PCFG,
|
||||
'alt-direct-rule-score-1' => \$_ALT_DIRECT_RULE_SCORE_1,
|
||||
'alt-direct-rule-score-2' => \$_ALT_DIRECT_RULE_SCORE_2,
|
||||
@ -1427,10 +1429,15 @@ sub extract_phrase {
|
||||
$cmd .= " --PCFG" if $_PCFG;
|
||||
$cmd .= " --UnpairedExtractFormat" if $_ALT_DIRECT_RULE_SCORE_1 || $_ALT_DIRECT_RULE_SCORE_2;
|
||||
$cmd .= " --ConditionOnTargetLHS" if $_ALT_DIRECT_RULE_SCORE_1;
|
||||
$cmd .= " --TreeFragments" if $_GHKM_TREE_FRAGMENTS;
|
||||
$cmd .= " --PhraseOrientation" if $_GHKM_PHRASE_ORIENTATION;
|
||||
$cmd .= " --PhraseOrientationPriors $_PHRASE_ORIENTATION_PRIORS_FILE" if defined($_PHRASE_ORIENTATION_PRIORS_FILE);
|
||||
if (!defined($_GHKM)) {
|
||||
if (defined($_GHKM))
|
||||
{
|
||||
$cmd .= " --TreeFragments" if $_GHKM_TREE_FRAGMENTS;
|
||||
$cmd .= " --PhraseOrientation" if $_GHKM_PHRASE_ORIENTATION;
|
||||
$cmd .= " --PhraseOrientationPriors $_PHRASE_ORIENTATION_PRIORS_FILE" if defined($_PHRASE_ORIENTATION_PRIORS_FILE);
|
||||
$cmd .= " --SourceLabels" if $_GHKM_SOURCE_LABELS;
|
||||
}
|
||||
else
|
||||
{
|
||||
$cmd .= " --SourceSyntax" if $_SOURCE_SYNTAX;
|
||||
$cmd .= " --TargetSyntax" if $_TARGET_SYNTAX;
|
||||
$cmd .= " --MaxSpan $max_length";
|
||||
@ -1609,6 +1616,7 @@ sub score_phrase_phrase_extract {
|
||||
$cmd .= " --TreeFragments" if $_GHKM_TREE_FRAGMENTS;
|
||||
$cmd .= " --PhraseOrientation" if $_GHKM_PHRASE_ORIENTATION;
|
||||
$cmd .= " --PhraseOrientationPriors $_PHRASE_ORIENTATION_PRIORS_FILE" if $_GHKM_PHRASE_ORIENTATION && defined($_PHRASE_ORIENTATION_PRIORS_FILE);
|
||||
$cmd .= " --SourceLabels $_GHKM_SOURCE_LABELS_FILE" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
|
||||
$cmd .= " $DOMAIN" if $DOMAIN;
|
||||
$cmd .= " $CORE_SCORE_OPTIONS" if defined($_SCORE_OPTIONS);
|
||||
$cmd .= " --FlexibilityScore=$FLEX_SCORER" if $_FLEXIBILITY_SCORE;
|
||||
@ -1659,6 +1667,7 @@ sub score_phrase_phrase_extract {
|
||||
$cmd .= " --SparseCountBinFeature $SPARSE_COUNT_BIN" if $SPARSE_COUNT_BIN;
|
||||
$cmd .= " --GoodTuring $ttable_file.half.f2e.gz.coc" if $GOOD_TURING;
|
||||
$cmd .= " --KneserNey $ttable_file.half.f2e.gz.coc" if $KNESER_NEY;
|
||||
$cmd .= " --SourceLabels $_GHKM_SOURCE_LABELS_FILE" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
|
||||
|
||||
$cmd .= " | gzip -c > $ttable_file.gz";
|
||||
|
||||
@ -2164,6 +2173,7 @@ sub create_ini {
|
||||
print INI "WordPenalty\n";
|
||||
print INI "PhrasePenalty\n";
|
||||
print INI "SoftMatchingFeature name=SM0 path=$_UNKNOWN_WORD_SOFT_MATCHES_FILE\n" if $_TARGET_SYNTAX && defined($_UNKNOWN_WORD_SOFT_MATCHES_FILE);
|
||||
print INI "SoftSourceSyntacticConstraintsFeature sourceLabelSetFile=$_GHKM_SOURCE_LABELS_FILE\n" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
|
||||
print INI $feature_spec;
|
||||
|
||||
print INI "\n# dense weights for feature functions\n";
|
||||
@ -2171,6 +2181,7 @@ sub create_ini {
|
||||
print INI "UnknownWordPenalty0= 1\n";
|
||||
print INI "WordPenalty0= -1\n";
|
||||
print INI "PhrasePenalty0= 0.2\n";
|
||||
print INI "SoftSourceSyntacticConstraintsFeature0= 0.3 -0.3 -0.3\n" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
|
||||
print INI $weight_spec;
|
||||
close(INI);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user