Merge branch 'master' of github.com:moses-smt/mosesdecoder

This commit is contained in:
Barry Haddow 2014-08-11 11:49:38 +01:00
commit 27eee55a57
79 changed files with 1449 additions and 217 deletions

5
NOTICE
View File

@ -1,5 +0,0 @@
This code includes data from Daniel Naber's Language Tools (czech abbreviations).
This code includes data from czech wiktionary (also czech abbreviations).

View File

@ -312,7 +312,7 @@ Moses::TargetPhrase *TargetPhrase::ConvertToMoses(const std::vector<Moses::Facto
// property
ret->SetProperties(m_property);
ret->Evaluate(mosesSP, phraseDict.GetFeaturesToApply());
ret->EvaluateInIsolation(mosesSP, phraseDict.GetFeaturesToApply());
return ret;
}

View File

@ -36,11 +36,15 @@
<tool id="cdt.managedbuild.tool.gnu.c.linker.exe.debug.1950007837" name="GCC C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.exe.debug"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.110628197" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug">
<option id="gnu.cpp.link.option.libs.1393924562" name="Libraries (-l)" superClass="gnu.cpp.link.option.libs" valueType="libs">
<listOptionValue builtIn="false" value="moses"/>
<listOptionValue builtIn="false" value="util"/>
<listOptionValue builtIn="false" value="boost_iostreams"/>
<listOptionValue builtIn="false" value="z"/>
</option>
<option id="gnu.cpp.link.option.paths.1967422094" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib64&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/moses/Debug&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/util/Debug&quot;"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.1093223502" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
@ -52,6 +56,15 @@
</tool>
</toolChain>
</folderInfo>
<fileInfo id="cdt.managedbuild.config.gnu.exe.debug.2091728208.911524129" name="PropertiesConsolidator.cpp" rcbsApplicability="disable" resourcePath="PropertiesConsolidator.cpp" toolsToInvoke="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1626949654.741737356">
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1626949654.741737356" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1626949654">
<option id="gnu.cpp.compiler.option.include.paths.858416673" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/include&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../..&quot;"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.2042647079" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
</fileInfo>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>

View File

@ -3,6 +3,8 @@
<name>consolidate</name>
<comment></comment>
<projects>
<project>moses</project>
<project>util</project>
</projects>
<buildSpec>
<buildCommand>
@ -45,6 +47,16 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/OutputFileStream.h</locationURI>
</link>
<link>
<name>PropertiesConsolidator.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/PropertiesConsolidator.cpp</locationURI>
</link>
<link>
<name>PropertiesConsolidator.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/PropertiesConsolidator.h</locationURI>
</link>
<link>
<name>consolidate-main.cpp</name>
<type>1</type>

View File

@ -1306,6 +1306,16 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/SoftMatchingFeature.h</locationURI>
</link>
<link>
<name>FF/SoftSourceSyntacticConstraintsFeature.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/SoftSourceSyntacticConstraintsFeature.cpp</locationURI>
</link>
<link>
<name>FF/SoftSourceSyntacticConstraintsFeature.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/SoftSourceSyntacticConstraintsFeature.h</locationURI>
</link>
<link>
<name>FF/SourceGHKMTreeInputMatchFeature.cpp</name>
<type>1</type>
@ -1686,6 +1696,16 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/PP/NonTermContextProperty.h</locationURI>
</link>
<link>
<name>PP/OrientationPhraseProperty.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/PP/OrientationPhraseProperty.cpp</locationURI>
</link>
<link>
<name>PP/OrientationPhraseProperty.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/PP/OrientationPhraseProperty.h</locationURI>
</link>
<link>
<name>PP/PhraseProperty.cpp</name>
<type>1</type>

View File

@ -1,3 +1,4 @@
The documentation for memory-mapped, dynamic suffix arrays has moved to
http://www.statmt.org/moses/?n=Moses.AdvancedFeatures#ntoc40
Search for PhraseDictionaryBitextSampling.

View File

@ -215,7 +215,7 @@ Hypothesis *BackwardsEdge::CreateHypothesis(const Hypothesis &hypothesis, const
IFVERBOSE(2) {
hypothesis.GetManager().GetSentenceStats().StopTimeBuildHyp();
}
newHypo->Evaluate(m_futurescore);
newHypo->EvaluateWhenApplied(m_futurescore);
return newHypo;
}

View File

@ -60,7 +60,7 @@ ChartCell::~ChartCell() {}
/** Add the given hypothesis to the cell.
* Returns true if added, false if not. Maybe it already exists in the collection or score falls below threshold etc.
* This function just calls the correspondind AddHypothesis() in ChartHypothesisCollection
* This function just calls the corresponding AddHypothesis() in ChartHypothesisCollection
* \param hypo Hypothesis to be added
*/
bool ChartCell::AddHypothesis(ChartHypothesis *hypo)

View File

@ -212,7 +212,7 @@ int ChartHypothesis::RecombineCompare(const ChartHypothesis &compare) const
/** calculate total score
* @todo this should be in ScoreBreakdown
*/
void ChartHypothesis::Evaluate()
void ChartHypothesis::EvaluateWhenApplied()
{
const StaticData &staticData = StaticData::Instance();
// total scores from prev hypos

View File

@ -144,7 +144,7 @@ public:
int RecombineCompare(const ChartHypothesis &compare) const;
void Evaluate();
void EvaluateWhenApplied();
void AddArc(ChartHypothesis *loserHypo);
void CleanupArcList();

View File

@ -56,7 +56,7 @@ ChartHypothesisCollection::~ChartHypothesisCollection()
/** public function to add hypothesis to this collection.
* Returns false if equiv hypo exists in collection, otherwise returns true.
* Takes care of update arc list for n-best list creation.
* Will delete hypo is it exist - once this function is call don't delete hypothesis.
* Will delete hypo if it exists - once this function is call don't delete hypothesis.
* \param hypo hypothesis to add
* \param manager pointer back to manager
*/

View File

@ -87,7 +87,7 @@ void ChartManager::ProcessSentence()
m_translationOptionList.ApplyThreshold();
const InputPath &inputPath = m_parser.GetInputPath(range);
m_translationOptionList.Evaluate(m_source, inputPath);
m_translationOptionList.EvaluateWithSourceContext(m_source, inputPath);
// decode
ChartCell &cell = m_hypoStackColl.Get(range);
@ -143,7 +143,7 @@ void ChartManager::AddXmlChartOptions()
RuleCubeItem* item = new RuleCubeItem( *opt, m_hypoStackColl );
ChartHypothesis* hypo = new ChartHypothesis(*opt, *item, *this);
hypo->Evaluate();
hypo->EvaluateWhenApplied();
ChartCell &cell = m_hypoStackColl.Get(range);

View File

@ -68,6 +68,12 @@ void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range
m_unksrcs.push_back(unksrc);
// hack. Once the OOV FF is a phrase table, get rid of this
PhraseDictionary *firstPt = NULL;
if (PhraseDictionary::GetColl().size() == 0) {
firstPt = PhraseDictionary::GetColl()[0];
}
//TranslationOption *transOpt;
if (! staticData.GetDropUnknown() || isDigit) {
// loop
@ -85,7 +91,7 @@ void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range
UTIL_THROW_IF2(targetLHS->GetFactor(0) == NULL, "Null factor for target LHS");
// add to dictionary
TargetPhrase *targetPhrase = new TargetPhrase(NULL);
TargetPhrase *targetPhrase = new TargetPhrase(firstPt);
Word &targetWord = targetPhrase->AddWord();
targetWord.CreateUnknownWord(sourceWord);
@ -93,7 +99,7 @@ void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range
float unknownScore = FloorScore(TransformScore(prob));
targetPhrase->GetScoreBreakdown().Assign(&unknownWordPenaltyProducer, unknownScore);
targetPhrase->Evaluate(*unksrc);
targetPhrase->EvaluateInIsolation(*unksrc);
targetPhrase->SetTargetLHS(targetLHS);
targetPhrase->SetAlignmentInfo("0-0");
@ -108,7 +114,7 @@ void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range
// drop source word. create blank trans opt
float unknownScore = FloorScore(-numeric_limits<float>::infinity());
TargetPhrase *targetPhrase = new TargetPhrase(NULL);
TargetPhrase *targetPhrase = new TargetPhrase(firstPt);
// loop
const UnknownLHSList &lhsList = staticData.GetUnknownLHS();
UnknownLHSList::const_iterator iterLHS;
@ -121,7 +127,7 @@ void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range
UTIL_THROW_IF2(targetLHS->GetFactor(0) == NULL, "Null factor for target LHS");
targetPhrase->GetScoreBreakdown().Assign(&unknownWordPenaltyProducer, unknownScore);
targetPhrase->Evaluate(*unksrc);
targetPhrase->EvaluateInIsolation(*unksrc);
targetPhrase->SetTargetLHS(targetLHS);

View File

@ -25,7 +25,7 @@ public:
virtual void AddPhraseOOV(TargetPhrase &phrase, std::list<TargetPhraseCollection*> &waste_memory, const WordsRange &range) = 0;
virtual void Evaluate(const InputType &input, const InputPath &inputPath) = 0;
virtual void EvaluateWithSourceContext(const InputType &input, const InputPath &inputPath) = 0;
virtual float GetBestScore(const ChartCellLabel *chartCell) const = 0;

View File

@ -10,7 +10,7 @@ ChartTranslationOption::ChartTranslationOption(const TargetPhrase &targetPhrase)
{
}
void ChartTranslationOption::Evaluate(const InputType &input,
void ChartTranslationOption::EvaluateWithSourceContext(const InputType &input,
const InputPath &inputPath,
const StackVec &stackVec)
{

View File

@ -44,7 +44,7 @@ public:
return m_scoreBreakdown;
}
void Evaluate(const InputType &input,
void EvaluateWithSourceContext(const InputType &input,
const InputPath &inputPath,
const StackVec &stackVec);
};

View File

@ -168,13 +168,13 @@ float ChartTranslationOptionList::GetBestScore(const ChartCellLabel *chartCell)
return bestHypo.GetTotalScore();
}
void ChartTranslationOptionList::Evaluate(const InputType &input, const InputPath &inputPath)
void ChartTranslationOptionList::EvaluateWithSourceContext(const InputType &input, const InputPath &inputPath)
{
// NEVER iterate over ALL of the collection. Just over the first m_size
CollType::iterator iter;
for (iter = m_collection.begin(); iter != m_collection.begin() + m_size; ++iter) {
ChartTranslationOptions &transOpts = **iter;
transOpts.Evaluate(input, inputPath);
transOpts.EvaluateWithSourceContext(input, inputPath);
}
// get rid of empty trans opts

View File

@ -65,7 +65,7 @@ public:
void Clear();
void ApplyThreshold();
void Evaluate(const InputType &input, const InputPath &inputPath);
void EvaluateWithSourceContext(const InputType &input, const InputPath &inputPath);
private:
typedef std::vector<ChartTranslationOptions*> CollType;

View File

@ -51,7 +51,7 @@ ChartTranslationOptions::~ChartTranslationOptions()
}
void ChartTranslationOptions::Evaluate(const InputType &input, const InputPath &inputPath)
void ChartTranslationOptions::EvaluateWithSourceContext(const InputType &input, const InputPath &inputPath)
{
SetInputPath(&inputPath);
if (StaticData::Instance().GetPlaceholderFactor() != NOT_FOUND) {
@ -62,7 +62,7 @@ void ChartTranslationOptions::Evaluate(const InputType &input, const InputPath &
for (iter = m_collection.begin(); iter != m_collection.end(); ++iter) {
ChartTranslationOption &transOpt = **iter;
transOpt.SetInputPath(&inputPath);
transOpt.Evaluate(input, inputPath, m_stackVec);
transOpt.EvaluateWithSourceContext(input, inputPath, m_stackVec);
}
// get rid of -inf trans opts

View File

@ -85,7 +85,7 @@ public:
return m_estimateOfBestScore;
}
void Evaluate(const InputType &input, const InputPath &inputPath);
void EvaluateWithSourceContext(const InputType &input, const InputPath &inputPath);
void SetInputPath(const InputPath *inputPath);

View File

@ -148,7 +148,7 @@ void DecodeStepGeneration::Process(const TranslationOption &inputPartialTranslOp
outPhrase.GetScoreBreakdown().PlusEquals(generationScore);
outPhrase.MergeFactors(genPhrase, m_newOutputFactors);
outPhrase.Evaluate(inputPath.GetPhrase(), m_featuresToApply);
outPhrase.EvaluateInIsolation(inputPath.GetPhrase(), m_featuresToApply);
const WordsRange &sourceWordsRange = inputPartialTranslOpt.GetSourceWordsRange();

View File

@ -84,7 +84,7 @@ void DecodeStepTranslation::Process(const TranslationOption &inputPartialTranslO
}
outPhrase.Merge(targetPhrase, m_newOutputFactors);
outPhrase.Evaluate(inputPath.GetPhrase(), m_featuresToApply); // need to do this as all non-transcores would be screwed up
outPhrase.EvaluateInIsolation(inputPath.GetPhrase(), m_featuresToApply); // need to do this as all non-transcores would be screwed up
TranslationOption *newTransOpt = new TranslationOption(sourceWordsRange, outPhrase);
assert(newTransOpt != NULL);
@ -258,7 +258,7 @@ void DecodeStepTranslation::ProcessLEGACY(const TranslationOption &inputPartialT
}
outPhrase.Merge(targetPhrase, m_newOutputFactors);
outPhrase.Evaluate(inputPath.GetPhrase(), m_featuresToApply); // need to do this as all non-transcores would be screwed up
outPhrase.EvaluateInIsolation(inputPath.GetPhrase(), m_featuresToApply); // need to do this as all non-transcores would be screwed up
TranslationOption *newTransOpt = new TranslationOption(sourceWordsRange, outPhrase);

View File

@ -35,6 +35,7 @@
#include "moses/FF/ControlRecombination.h"
#include "moses/FF/ExternalFeature.h"
#include "moses/FF/ConstrainedDecoding.h"
#include "moses/FF/SoftSourceSyntacticConstraintsFeature.h"
#include "moses/FF/CoveredReferenceFeature.h"
#include "moses/FF/TreeStructureFeature.h"
#include "moses/FF/SoftMatchingFeature.h"
@ -48,11 +49,11 @@
#include "NieceTerminal.h"
#include "SpanLength.h"
#include "SyntaxRHS.h"
#include "SkeletonChangeInput.h"
#include "moses/FF/SkeletonStatelessFF.h"
#include "moses/FF/SkeletonStatefulFF.h"
#include "moses/LM/SkeletonLM.h"
#include "SkeletonChangeInput.h"
#include "moses/TranslationModel/SkeletonPT.h"
#ifdef HAVE_CMPH
@ -197,6 +198,7 @@ FeatureRegistry::FeatureRegistry()
MOSES_FNAME(CoveredReferenceFeature);
MOSES_FNAME(ExternalFeature);
MOSES_FNAME(SourceGHKMTreeInputMatchFeature);
MOSES_FNAME(SoftSourceSyntacticConstraintsFeature);
MOSES_FNAME(TreeStructureFeature);
MOSES_FNAME(SoftMatchingFeature);
MOSES_FNAME(HyperParameterAsWeight);
@ -209,11 +211,11 @@ FeatureRegistry::FeatureRegistry()
MOSES_FNAME(SparseHieroReorderingFeature);
MOSES_FNAME(SpanLength);
MOSES_FNAME(SyntaxRHS);
MOSES_FNAME(SkeletonChangeInput);
MOSES_FNAME(SkeletonStatelessFF);
MOSES_FNAME(SkeletonStatefulFF);
MOSES_FNAME(SkeletonLM);
MOSES_FNAME(SkeletonChangeInput);
MOSES_FNAME(SkeletonPT);
#ifdef HAVE_CMPH

View File

@ -0,0 +1,536 @@
#include <vector>
#include <limits>
#include <assert.h>
#include "SoftSourceSyntacticConstraintsFeature.h"
#include "moses/StaticData.h"
#include "moses/InputFileStream.h"
#include "moses/ScoreComponentCollection.h"
#include "moses/Hypothesis.h"
#include "moses/ChartHypothesis.h"
#include "moses/ChartManager.h"
#include "moses/FactorCollection.h"
#include "moses/TreeInput.h"
#include "moses/PP/SourceLabelsPhraseProperty.h"
using namespace std;
namespace Moses
{
/** Construct the feature from its moses.ini configuration line.
 * Registers 3 dense score components with the base StatelessFeatureFunction.
 * m_featureVariant defaults to 0 (dense scores only) and may be overridden
 * by the "featureVariant" parameter via ReadParameters()/SetParameter().
 */
SoftSourceSyntacticConstraintsFeature::SoftSourceSyntacticConstraintsFeature(const std::string &line)
  : StatelessFeatureFunction(3, line), m_featureVariant(0)
{
  VERBOSE(1, "Initializing feature " << GetScoreProducerDescription() << " ...");
  ReadParameters(); // parses the key=value pairs on the config line
  VERBOSE(1, " Done.");
  VERBOSE(1, " Feature variant: " << m_featureVariant << "." << std::endl);
}
/** Apply a single key=value configuration parameter from the feature line.
 * Recognized keys set the corresponding file paths / options on this
 * feature; any unrecognized key is delegated to the base class.
 */
void SoftSourceSyntacticConstraintsFeature::SetParameter(const std::string& key, const std::string& value)
{
  if (key == "sourceLabelSetFile") {
    m_sourceLabelSetFile = value;
    return;
  }
  if (key == "coreSourceLabelSetFile") {
    m_coreSourceLabelSetFile = value;
    return;
  }
  if (key == "targetSourceLeftHandSideJointCountFile") {
    m_targetSourceLHSJointCountFile = value;
    return;
  }
  if (key == "tuneable") {
    m_tuneable = Scan<bool>(value);
    return;
  }
  if (key == "featureVariant") {
    // 0: only dense features, 1: no mismatches (also set weights 1 0 0 and
    // tuneable=false), 2: with sparse features, 3: with sparse features for
    // core labels only
    m_featureVariant = Scan<size_t>(value);
    return;
  }
  // unknown key: let the base class handle (or reject) it
  StatelessFeatureFunction::SetParameter(key, value);
}
/** Load all data files configured for this feature.
 * Loading order matters: LoadSourceLabelSet() populates m_sourceLabels,
 * which both LoadCoreSourceLabelSet() and
 * LoadTargetSourceLeftHandSideJointCountFile() look labels up in.
 */
void SoftSourceSyntacticConstraintsFeature::Load()
{
  // don't change the loading order!
  LoadSourceLabelSet();
  if (m_featureVariant == 3) {
    // variant 3 restricts sparse scoring to the core label subset
    LoadCoreSourceLabelSet();
  }
  if (!m_targetSourceLHSJointCountFile.empty()) {
    // optional: target/source LHS joint counts for label-pair probabilities
    LoadTargetSourceLeftHandSideJointCountFile();
  }
}
/** Read the source label set file (one "label index" pair per line) and
 * build three synchronized lookup structures:
 *   m_sourceLabels            : label string -> index
 *   m_sourceLabelsByIndex     : index -> label string
 *   m_sourceLabelIndexesByFactor : Factor* -> index (factors registered
 *                                  in the global FactorCollection)
 * Afterwards verifies that the special labels "GlueTop" and "GlueX" are
 * present and caches the GlueTop index in m_GlueTopLabel.
 * Throws (UTIL_THROW2 / UTIL_THROW_IF2) on duplicate or missing labels.
 */
void SoftSourceSyntacticConstraintsFeature::LoadSourceLabelSet()
{
  VERBOSE(2, GetScoreProducerDescription() << ": Loading source label set from file " << m_sourceLabelSetFile << std::endl);
  InputFileStream inFile(m_sourceLabelSetFile);

  FactorCollection &factorCollection = FactorCollection::Instance();

  // read source label set
  std::string line;
  m_sourceLabels.clear();
  m_sourceLabelsByIndex.clear();
  m_sourceLabelIndexesByFactor.clear();
  while (getline(inFile, line)) {
    std::istringstream tokenizer(line);
    std::string label;
    size_t index;
    // NOTE(review): stream extraction does not throw by default, so this
    // catch block is likely never entered on a malformed line — confirm
    // whether exceptions are enabled on the stream or tighten the check.
    try {
      tokenizer >> label >> index;
    } catch (const std::exception &e) {
      UTIL_THROW2(GetScoreProducerDescription()
                  << ": Error reading source label set file " << m_sourceLabelSetFile << " .");
    }
    // each label may appear only once in the file
    std::pair< boost::unordered_map<std::string,size_t>::iterator, bool > inserted = m_sourceLabels.insert( std::pair<std::string,size_t>(label,index) );
    UTIL_THROW_IF2(!inserted.second, GetScoreProducerDescription()
                   << ": Source label set file " << m_sourceLabelSetFile << " should contain each syntactic label only once.");
    // grow the reverse (index -> label) table on demand
    if (index >= m_sourceLabelsByIndex.size()) {
      m_sourceLabelsByIndex.resize(index+1);
    }
    m_sourceLabelsByIndex[index] = label;
    // register the label as a (non-terminal) factor and remember its index
    const Factor* sourceLabelFactor = factorCollection.AddFactor(label,true);
    m_sourceLabelIndexesByFactor[sourceLabelFactor] = index;
  }
  inFile.Close();

  // the label set must provide entries for the special glue-grammar labels
  std::list<std::string> specialLabels;
  specialLabels.push_back("GlueTop");
  specialLabels.push_back("GlueX");
//  specialLabels.push_back("XRHS");
//  specialLabels.push_back("XLHS");
  for (std::list<std::string>::const_iterator iter=specialLabels.begin();
       iter!=specialLabels.end(); ++iter) {
    boost::unordered_map<std::string,size_t>::iterator found = m_sourceLabels.find(*iter);
    UTIL_THROW_IF2(found == m_sourceLabels.end(), GetScoreProducerDescription()
                   << ": Source label set file " << m_sourceLabelSetFile << " should contain an entry for the special label \"" << *iter << "\".");
    // std::string::compare returns 0 on equality, hence the negation
    if (!(found->first).compare("GlueTop")) {
      m_GlueTopLabel = found->second;
//    } else if (!(found->first).compare("XRHS")) {
//      m_XRHSLabel = found->second;
//    } else if (!(found->first).compare("XLHS")) {
//      m_XLHSLabel = found->second;
    }
  }
}
void SoftSourceSyntacticConstraintsFeature::LoadCoreSourceLabelSet()
{
VERBOSE(2, GetScoreProducerDescription() << ": Loading core source label set from file " << m_coreSourceLabelSetFile << std::endl);
InputFileStream inFile(m_coreSourceLabelSetFile);
// read core source label set
std::string line;
m_coreSourceLabels.clear();
while (getline(inFile, line)) {
istringstream tokenizer(line);
std::string label;
tokenizer >> label;
boost::unordered_map<std::string,size_t>::iterator foundSourceLabelIndex = m_sourceLabels.find( label );
if ( foundSourceLabelIndex != m_sourceLabels.end() ) {
m_coreSourceLabels.insert(foundSourceLabelIndex->second);
} else {
VERBOSE(2, GetScoreProducerDescription()
<< ": Ignoring unknown source label \"" << label << "\" "
<< "from core source label set file " << m_coreSourceLabelSetFile << "."
<< std::endl);
}
}
inFile.Close();
}
/** Read target/source left-hand-side joint counts (lines of
 * "targetLabel sourceLabel count") and turn them into per-pair conditional
 * probabilities stored in m_labelPairProbabilities:
 *   Factor* (target LHS) -> vector over source label indexes of
 *   (first, second) where, after normalization below,
 *     first  = count / total count of the target LHS
 *     second = count / total count of the source LHS
 * (first is later read as the t2s probability, second as s2t — see
 * GetLabelPairProbabilities usage in EvaluateWhenApplied.)
 * Source labels must already be known from LoadSourceLabelSet(); an
 * unknown source label aborts with UTIL_THROW_IF2.
 */
void SoftSourceSyntacticConstraintsFeature::LoadTargetSourceLeftHandSideJointCountFile()
{
  VERBOSE(2, GetScoreProducerDescription() << ": Loading target/source label joint counts from file " << m_targetSourceLHSJointCountFile << std::endl);
  InputFileStream inFile(m_targetSourceLHSJointCountFile);

  // release any previously loaded table (the map owns heap-allocated vectors)
  for (boost::unordered_map<const Factor*, std::vector< std::pair<float,float> >* >::iterator iter=m_labelPairProbabilities.begin();
       iter!=m_labelPairProbabilities.end(); ++iter) {
    delete iter->second;
  }
  m_labelPairProbabilities.clear();

  // read joint counts
  std::string line;
  FactorCollection &factorCollection = FactorCollection::Instance();

  // accumulate marginal counts for normalization:
  //   targetLHSCounts  : per target-LHS factor
  //   sourceLHSCounts  : per source-label index
  boost::unordered_map<const Factor*,float> targetLHSCounts;
  std::vector<float> sourceLHSCounts(m_sourceLabels.size(),0.0);

  while (getline(inFile, line)) {
    istringstream tokenizer(line);
    std::string targetLabel;
    std::string sourceLabel;
    float count;
    tokenizer >> targetLabel;
    tokenizer >> sourceLabel;
    tokenizer >> count;

    boost::unordered_map<std::string,size_t>::iterator foundSourceLabelIndex = m_sourceLabels.find( sourceLabel );
    UTIL_THROW_IF2(foundSourceLabelIndex == m_sourceLabels.end(), GetScoreProducerDescription()
                   << ": Target/source label joint count file " << m_targetSourceLHSJointCountFile
                   << " contains unknown source label \"" << sourceLabel << "\".");

    const Factor* targetLabelFactor = factorCollection.AddFactor(targetLabel,true);

    sourceLHSCounts[foundSourceLabelIndex->second] += count;
    std::pair< boost::unordered_map<const Factor*,float >::iterator, bool > insertedTargetLHSCount =
      targetLHSCounts.insert( std::pair<const Factor*,float>(targetLabelFactor,count) );
    if (!insertedTargetLHSCount.second) {
      // target LHS seen before: bump its marginal and add to the existing
      // per-source-label joint-count vector (both components carry the raw
      // joint count until the normalization pass below)
      (insertedTargetLHSCount.first)->second += count;
      boost::unordered_map<const Factor*, std::vector< std::pair<float,float> >* >::iterator jointCountIt =
        m_labelPairProbabilities.find( targetLabelFactor );
      assert(jointCountIt != m_labelPairProbabilities.end());
      (jointCountIt->second)->at(foundSourceLabelIndex->second).first += count;
      (jointCountIt->second)->at(foundSourceLabelIndex->second).second += count;
    } else {
      // first time this target LHS appears: allocate its joint-count vector
      std::pair<float,float> init(0.0,0.0);
      std::vector< std::pair<float,float> >* sourceVector = new std::vector< std::pair<float,float> >(m_sourceLabels.size(),init);
      sourceVector->at(foundSourceLabelIndex->second) = std::pair<float,float>(count,count);
      std::pair< boost::unordered_map<const Factor*, std::vector< std::pair<float,float> >* >::iterator, bool > insertedJointCount =
        m_labelPairProbabilities.insert( std::pair<const Factor*, std::vector< std::pair<float,float> >* >(targetLabelFactor,sourceVector) );
      assert(insertedJointCount.second);
    }
  }

  // normalization: divide joint counts by the respective marginals,
  // turning .first / .second into the two conditional probabilities
  for (boost::unordered_map<const Factor*, std::vector< std::pair<float,float> >* >::iterator iter=m_labelPairProbabilities.begin();
       iter!=m_labelPairProbabilities.end(); ++iter) {
    float targetLHSCount = 0;
    boost::unordered_map<const Factor*,float >::const_iterator targetLHSCountIt = targetLHSCounts.find( iter->first );
    if ( targetLHSCountIt != targetLHSCounts.end() ) {
      targetLHSCount = targetLHSCountIt->second;
    }
    std::vector< std::pair<float,float> > &probabilities = *(iter->second);
    for (size_t index=0; index<probabilities.size(); ++index) {
      if ( probabilities[index].first != 0 ) {
        // a non-zero joint count implies a non-zero marginal
        assert(targetLHSCount != 0);
        probabilities[index].first /= targetLHSCount;
      }
      if ( probabilities[index].second != 0 ) {
        assert(sourceLHSCounts[index] != 0);
        probabilities[index].second /= sourceLHSCounts[index];
      }
    }
  }

  inFile.Close();
}
void SoftSourceSyntacticConstraintsFeature::EvaluateWhenApplied(
const ChartHypothesis& hypo,
ScoreComponentCollection* accumulator) const
{
// dense scores
std::vector<float> newScores(m_numScoreComponents,0); // m_numScoreComponents == 3
const InputType& input = hypo.GetManager().GetSource();
const TreeInput& treeInput = static_cast<const TreeInput&>(input);
const StaticData& staticData = StaticData::Instance();
const Word& outputDefaultNonTerminal = staticData.GetOutputDefaultNonTerminal();
size_t nNTs = 1;
bool treeInputMismatchLHSBinary = true;
size_t treeInputMismatchRHSCount = 0;
bool hasCompleteTreeInputMatch = false;
float t2sLabelsProb = 1;
float s2tLabelsProb = 1;
float ruleLabelledProbability = 1;
// read SourceLabels property
const TargetPhrase &currTarPhr = hypo.GetCurrTargetPhrase();
const Factor* targetLHS = currTarPhr.GetTargetLHS()[0];
bool isGlueGrammarRule = false;
bool isUnkRule = false;
if (const PhraseProperty *property = currTarPhr.GetProperty("SourceLabels")) {
const SourceLabelsPhraseProperty *sourceLabelsPhraseProperty = static_cast<const SourceLabelsPhraseProperty*>(property);
nNTs = sourceLabelsPhraseProperty->GetNumberOfNonTerminals();
float totalCount = sourceLabelsPhraseProperty->GetTotalCount();
// prepare for input tree label matching
std::vector< boost::unordered_set<size_t> > treeInputLabelsRHS(nNTs-1);
boost::unordered_set<size_t> treeInputLabelsLHS;
// get index map for underlying hypotheses
const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
currTarPhr.GetAlignNonTerm().GetNonTermIndexMap();
std::vector<const Factor*> targetLabelsRHS;
if (nNTs > 1) { // rule has right-hand side non-terminals, i.e. it's a hierarchical rule
size_t nonTerminalNumber = 0;
for (size_t phrasePos=0; phrasePos<currTarPhr.GetSize(); ++phrasePos) {
// consult rule for either word or non-terminal
const Word &word = currTarPhr.GetWord(phrasePos);
if ( word.IsNonTerminal() ) {
// non-terminal: consult subderivation
size_t nonTermIndex = nonTermIndexMap[phrasePos];
const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndex);
targetLabelsRHS.push_back( prevHypo->GetTargetLHS()[0] );
// retrieve information that is required for input tree label matching (RHS)
const WordsRange& prevWordsRange = prevHypo->GetCurrSourceRange();
size_t prevStartPos = prevWordsRange.GetStartPos();
size_t prevEndPos = prevWordsRange.GetEndPos();
const NonTerminalSet& prevTreeInputLabels = treeInput.GetLabelSet(prevStartPos,prevEndPos);
for (NonTerminalSet::const_iterator prevTreeInputLabelsIt = prevTreeInputLabels.begin();
prevTreeInputLabelsIt != prevTreeInputLabels.end(); ++prevTreeInputLabelsIt) {
if (*prevTreeInputLabelsIt != outputDefaultNonTerminal) {
boost::unordered_map<const Factor*,size_t>::const_iterator foundPrevTreeInputLabel
= m_sourceLabelIndexesByFactor.find((*prevTreeInputLabelsIt)[0]);
if (foundPrevTreeInputLabel != m_sourceLabelIndexesByFactor.end()) {
size_t prevTreeInputLabelIndex = foundPrevTreeInputLabel->second;
treeInputLabelsRHS[nonTerminalNumber].insert(prevTreeInputLabelIndex);
}
}
}
++nonTerminalNumber;
}
}
}
// retrieve information that is required for input tree label matching (LHS)
const WordsRange& wordsRange = hypo.GetCurrSourceRange();
size_t startPos = wordsRange.GetStartPos();
size_t endPos = wordsRange.GetEndPos();
const NonTerminalSet& treeInputLabels = treeInput.GetLabelSet(startPos,endPos);
for (NonTerminalSet::const_iterator treeInputLabelsIt = treeInputLabels.begin();
treeInputLabelsIt != treeInputLabels.end(); ++treeInputLabelsIt) {
if (*treeInputLabelsIt != outputDefaultNonTerminal) {
boost::unordered_map<const Factor*,size_t>::const_iterator foundTreeInputLabel
= m_sourceLabelIndexesByFactor.find((*treeInputLabelsIt)[0]);
if (foundTreeInputLabel != m_sourceLabelIndexesByFactor.end()) {
size_t treeInputLabelIndex = foundTreeInputLabel->second;
treeInputLabelsLHS.insert(treeInputLabelIndex);
}
}
}
// inspect source-labelled rule items
std::vector< boost::unordered_set<size_t> > sparseScoredTreeInputLabelsRHS(nNTs-1);
boost::unordered_set<size_t> sparseScoredTreeInputLabelsLHS;
std::vector<bool> sourceLabelSeenAsLHS(m_sourceLabels.size(),false);
std::vector<bool> treeInputMatchRHSCountByNonTerminal(nNTs-1,false);
const std::list<SourceLabelsPhrasePropertyItem> &sourceLabelItems = sourceLabelsPhraseProperty->GetSourceLabelItems();
for (std::list<SourceLabelsPhrasePropertyItem>::const_iterator sourceLabelItem = sourceLabelItems.begin();
sourceLabelItem != sourceLabelItems.end() && !hasCompleteTreeInputMatch; ++sourceLabelItem) {
const std::list<size_t> &sourceLabelsRHS = sourceLabelItem->GetSourceLabelsRHS();
// float sourceLabelsRHSCount = sourceLabelItem->GetSourceLabelsRHSCount();
const std::list< std::pair<size_t,float> > &sourceLabelsLHSList = sourceLabelItem->GetSourceLabelsLHSList();
assert(sourceLabelsRHS.size() == nNTs-1);
bool currentSourceLabelItemIsCompleteTreeInputMatch = true;
size_t nonTerminalNumber=0;
for (std::list<size_t>::const_iterator sourceLabelsRHSIt = sourceLabelsRHS.begin();
sourceLabelsRHSIt != sourceLabelsRHS.end(); ++sourceLabelsRHSIt, ++nonTerminalNumber) {
if (treeInputLabelsRHS[nonTerminalNumber].find(*sourceLabelsRHSIt) != treeInputLabelsRHS[nonTerminalNumber].end()) {
treeInputMatchRHSCountByNonTerminal[nonTerminalNumber] = true;
if ( m_featureVariant == 2 ||
(m_featureVariant == 3 && m_coreSourceLabels.find(*sourceLabelsRHSIt) != m_coreSourceLabels.end()) ) {
// score sparse features: RHS match
if (sparseScoredTreeInputLabelsRHS[nonTerminalNumber].find(*sourceLabelsRHSIt) == sparseScoredTreeInputLabelsRHS[nonTerminalNumber].end()) {
// (only if no match has been scored for this tree input label and rule non-terminal with a previous sourceLabelItem)
float score_RHS_1 = (float)1/treeInputLabelsRHS[nonTerminalNumber].size();
accumulator->PlusEquals(this,
std::string("RHS_1_" + m_sourceLabelsByIndex[*sourceLabelsRHSIt]),
score_RHS_1);
sparseScoredTreeInputLabelsRHS[nonTerminalNumber].insert(*sourceLabelsRHSIt);
}
}
} else {
currentSourceLabelItemIsCompleteTreeInputMatch = false;
}
}
// LHS source non-terminal labels seen with this RHS
bool currentSourceLabelItemHasLHSTreeInputMatch = false;
//float ruleLabelledCount = 0;
std::list< std::pair<size_t,float> >::const_iterator sourceLabelsLHSIt;
for (sourceLabelsLHSIt = sourceLabelsLHSList.begin(); sourceLabelsLHSIt != sourceLabelsLHSList.end(); ++sourceLabelsLHSIt) {
if ( sourceLabelsLHSIt->first == m_GlueTopLabel ) {
isGlueGrammarRule = true;
}
if (treeInputLabelsLHS.find(sourceLabelsLHSIt->first) != treeInputLabelsLHS.end()) {
currentSourceLabelItemHasLHSTreeInputMatch = true;
if ( m_featureVariant == 2 ||
(m_featureVariant == 3 && m_coreSourceLabels.find(sourceLabelsLHSIt->first) != m_coreSourceLabels.end()) ) {
// score sparse features: LHS match
if (sparseScoredTreeInputLabelsLHS.find(sourceLabelsLHSIt->first) == sparseScoredTreeInputLabelsLHS.end()) {
// (only if no match has been scored for this tree input label and rule non-terminal with a previous sourceLabelItem)
float score_LHS_1 = (float)1/treeInputLabelsLHS.size();
accumulator->PlusEquals(this,
std::string("LHS_1_" + m_sourceLabelsByIndex[sourceLabelsLHSIt->first]),
score_LHS_1);
sparseScoredTreeInputLabelsLHS.insert(sourceLabelsLHSIt->first);
}
}
break;
}
}
if (currentSourceLabelItemHasLHSTreeInputMatch) {
// input tree matching (LHS)
treeInputMismatchLHSBinary = false;
} else {
currentSourceLabelItemIsCompleteTreeInputMatch = false;
}
if (currentSourceLabelItemIsCompleteTreeInputMatch) {
hasCompleteTreeInputMatch = true;
ruleLabelledProbability = sourceLabelsLHSIt->second / totalCount;
std::pair<float,float> probPair = GetLabelPairProbabilities( targetLHS, sourceLabelsLHSIt->first);
t2sLabelsProb = probPair.first;
s2tLabelsProb = probPair.second;
nonTerminalNumber=0;
for (std::list<size_t>::const_iterator sourceLabelsRHSIt = sourceLabelsRHS.begin();
sourceLabelsRHSIt != sourceLabelsRHS.end(); ++sourceLabelsRHSIt, ++nonTerminalNumber) {
probPair = GetLabelPairProbabilities( targetLabelsRHS[nonTerminalNumber], *sourceLabelsRHSIt );
t2sLabelsProb += probPair.first;
s2tLabelsProb += probPair.second;
}
t2sLabelsProb /= nNTs;
s2tLabelsProb /= nNTs;
assert(t2sLabelsProb != 0);
assert(s2tLabelsProb != 0);
}
}
// input tree matching (RHS)
if ( !hasCompleteTreeInputMatch ) {
treeInputMismatchRHSCount = nNTs-1;
for (std::vector<bool>::const_iterator treeInputMatchRHSCountByNonTerminalIt = treeInputMatchRHSCountByNonTerminal.begin();
treeInputMatchRHSCountByNonTerminalIt != treeInputMatchRHSCountByNonTerminal.end(); ++treeInputMatchRHSCountByNonTerminalIt) {
if (*treeInputMatchRHSCountByNonTerminalIt) {
--treeInputMismatchRHSCount;
}
}
}
// score sparse features: mismatches
if ( m_featureVariant == 2 || m_featureVariant == 3 ) {
// RHS
for (size_t nonTerminalNumber = 0; nonTerminalNumber < nNTs-1; ++nonTerminalNumber) {
// nNTs-1 because nNTs also counts the left-hand side non-terminal
float score_RHS_0 = (float)1/treeInputLabelsRHS[nonTerminalNumber].size();
for (boost::unordered_set<size_t>::const_iterator treeInputLabelsRHSIt = treeInputLabelsRHS[nonTerminalNumber].begin();
treeInputLabelsRHSIt != treeInputLabelsRHS[nonTerminalNumber].end(); ++treeInputLabelsRHSIt) {
if ( m_featureVariant == 2 ||
(m_featureVariant == 3 && m_coreSourceLabels.find(*treeInputLabelsRHSIt) != m_coreSourceLabels.end()) ) {
if (sparseScoredTreeInputLabelsRHS[nonTerminalNumber].find(*treeInputLabelsRHSIt) == sparseScoredTreeInputLabelsRHS[nonTerminalNumber].end()) {
// score sparse features: RHS mismatch
accumulator->PlusEquals(this,
std::string("RHS_0_" + m_sourceLabelsByIndex[*treeInputLabelsRHSIt]),
score_RHS_0);
}
}
}
}
// LHS
float score_LHS_0 = (float)1/treeInputLabelsLHS.size();
for (boost::unordered_set<size_t>::const_iterator treeInputLabelsLHSIt = treeInputLabelsLHS.begin();
treeInputLabelsLHSIt != treeInputLabelsLHS.end(); ++treeInputLabelsLHSIt) {
if ( m_featureVariant == 2 ||
(m_featureVariant == 3 && m_coreSourceLabels.find(*treeInputLabelsLHSIt) != m_coreSourceLabels.end()) ) {
if (sparseScoredTreeInputLabelsLHS.find(*treeInputLabelsLHSIt) == sparseScoredTreeInputLabelsLHS.end()) {
// score sparse features: RHS mismatch
accumulator->PlusEquals(this,
std::string("LHS_0_" + m_sourceLabelsByIndex[*treeInputLabelsLHSIt]),
score_LHS_0);
}
}
}
}
} else {
// abort with error message if the phrase does not translate an unknown word
UTIL_THROW_IF2(!currTarPhr.GetWord(0).IsOOV(), GetScoreProducerDescription()
<< ": Missing SourceLabels property. "
<< "Please check phrase table and glue rules.");
// unknown word
isUnkRule = true;
}
// add scores
// input tree matching
switch (m_featureVariant) {
case 0:
newScores[0] = hasCompleteTreeInputMatch;
break;
case 1:
newScores[0] = ( (hasCompleteTreeInputMatch || isGlueGrammarRule || isUnkRule) ? 0 : std::numeric_limits<float>::min() );
break;
default:
newScores[0] = hasCompleteTreeInputMatch;
}
newScores[1] = treeInputMismatchLHSBinary;
newScores[2] = treeInputMismatchRHSCount;
// newScores[3] = hasCompleteTreeInputMatch ? std::log(t2sLabelsProb) : 0;
// newScores[4] = hasCompleteTreeInputMatch ? std::log(s2tLabelsProb) : 0;
// newScores[3] = hasCompleteTreeInputMatch ? std::log(ruleLabelledProbability) : 0;
accumulator->PlusEquals(this, newScores);
}
std::pair<float,float> SoftSourceSyntacticConstraintsFeature::GetLabelPairProbabilities(
const Factor* target,
const size_t source) const
{
boost::unordered_map<const Factor*, std::vector< std::pair<float,float> >* >::const_iterator found =
m_labelPairProbabilities.find(target);
if ( found == m_labelPairProbabilities.end() ) {
return std::pair<float,float>(0,0);
}
return found->second->at(source);
}
}

View File

@ -0,0 +1,87 @@
#pragma once
#include <string>
#include <boost/unordered_map.hpp>
#include <boost/unordered_set.hpp>
#include "StatelessFeatureFunction.h"
#include "FFState.h"
#include "moses/Factor.h"
namespace Moses
{
// Stateless feature function that softly rewards/penalizes rule applications
// according to source-side syntactic labels (read from the "SourceLabels"
// phrase property) matched against the input tree.
class SoftSourceSyntacticConstraintsFeature : public StatelessFeatureFunction
{
public:
  SoftSourceSyntacticConstraintsFeature(const std::string &line);

  // Frees the per-target-factor probability vectors; they are heap-allocated
  // when the joint count file is loaded.
  ~SoftSourceSyntacticConstraintsFeature() {
    for (boost::unordered_map<const Factor*, std::vector< std::pair<float,float> >* >::iterator iter=m_labelPairProbabilities.begin();
         iter!=m_labelPairProbabilities.end(); ++iter) {
      delete iter->second;
    }
  }

  bool IsUseable(const FactorMask &mask) const {
    return true;
  }

  void SetParameter(const std::string& key, const std::string& value);

  // No-op: all scoring happens when the rule is applied to a chart hypothesis.
  void EvaluateInIsolation(const Phrase &source
                           , const TargetPhrase &targetPhrase
                           , ScoreComponentCollection &scoreBreakdown
                           , ScoreComponentCollection &estimatedFutureScore) const
  {}

  // No-op: see EvaluateWhenApplied(const ChartHypothesis&, ...).
  void EvaluateWithSourceContext(const InputType &input
                                 , const InputPath &inputPath
                                 , const TargetPhrase &targetPhrase
                                 , const StackVec *stackVec
                                 , ScoreComponentCollection &scoreBreakdown
                                 , ScoreComponentCollection *estimatedFutureScore = NULL) const
  {}

  // No-op for phrase-based hypotheses; this feature is chart-decoder only.
  void EvaluateWhenApplied(
    const Hypothesis& cur_hypo,
    ScoreComponentCollection* accumulator) const
  {}

  void EvaluateWhenApplied(
    const ChartHypothesis& cur_hypo,
    ScoreComponentCollection* accumulator) const;

private:
  // Rule of three: this class owns raw heap vectors in
  // m_labelPairProbabilities; a default copy would double-delete them.
  // Declared private and left undefined to forbid copying (C++03 idiom).
  SoftSourceSyntacticConstraintsFeature(const SoftSourceSyntacticConstraintsFeature&);
  SoftSourceSyntacticConstraintsFeature& operator=(const SoftSourceSyntacticConstraintsFeature&);

  // File names, set via SetParameter and read in Load().
  std::string m_sourceLabelSetFile;
  std::string m_coreSourceLabelSetFile;
  std::string m_targetSourceLHSJointCountFile;
  std::string m_unknownLeftHandSideFile;
  // Scoring variant selector (0/1 dense-only; 2/3 add sparse label scores).
  size_t m_featureVariant;

  // Label string <-> index mappings.
  boost::unordered_map<std::string,size_t> m_sourceLabels;
  std::vector<std::string> m_sourceLabelsByIndex;
  boost::unordered_set<size_t> m_coreSourceLabels;
  boost::unordered_map<const Factor*,size_t> m_sourceLabelIndexesByFactor;
  // Index of the glue grammar's top label.
  size_t m_GlueTopLabel;
  // mutable size_t m_XRHSLabel;
  // mutable size_t m_XLHSLabel;

  // target factor -> heap-allocated vector of (t2s,s2t) probability pairs,
  // indexed by source label index. Owned by this object (freed in dtor).
  boost::unordered_map<const Factor*, std::vector< std::pair<float,float> >* > m_labelPairProbabilities;
  boost::unordered_map<size_t,float> m_unknownLHSProbabilities;
  float m_smoothingWeight;
  float m_unseenLHSSmoothingFactorForUnknowns;

  void Load();
  void LoadSourceLabelSet();
  void LoadCoreSourceLabelSet();
  void LoadTargetSourceLeftHandSideJointCountFile();

  std::pair<float,float> GetLabelPairProbabilities(const Factor* target,
      const size_t source) const;
};
}

View File

@ -67,6 +67,23 @@ const Factor *FactorCollection::AddFactor(const StringPiece &factorString, bool
return &ret.first->in;
}
// Read-only lookup: return the existing Factor for a surface string, or NULL
// if it has never been added. Unlike AddFactor, this never creates a factor.
const Factor *FactorCollection::GetFactor(const StringPiece &factorString, bool isNonTerminal)
{
  FactorFriend to_find;
  to_find.in.m_string = factorString;
  to_find.in.m_id = (isNonTerminal) ? m_factorIdNonTerminal : m_factorId;
  // BUG FIX: the two sets were swapped relative to the id selection above,
  // so non-terminal lookups searched the terminal set and vice versa.
  // Non-terminals live in m_setNonTerminal, terminals in m_set.
  Set & set = (isNonTerminal) ? m_setNonTerminal : m_set;
  { // read-lock scope
#ifdef WITH_THREADS
    boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
#endif // WITH_THREADS
    Set::const_iterator i = set.find(to_find);
    if (i != set.end()) return &i->in;
  }
  return NULL;
}
FactorCollection::~FactorCollection() {}
TO_STRING_BODY(FactorCollection);

View File

@ -114,6 +114,8 @@ public:
return m_factorIdNonTerminal;
}
const Factor *GetFactor(const StringPiece &factorString, bool isNonTerminal = false);
// TODO: remove calls to this function, replacing them with the simpler AddFactor(factorString)
const Factor *AddFactor(FactorDirection /*direction*/, FactorType /*factorType*/, const StringPiece &factorString, bool isNonTerminal = false) {
return AddFactor(factorString, isNonTerminal);

View File

@ -205,7 +205,7 @@ int Hypothesis::RecombineCompare(const Hypothesis &compare) const
return 0;
}
void Hypothesis::EvaluateWith(const StatefulFeatureFunction &sfff,
void Hypothesis::EvaluateWhenApplied(const StatefulFeatureFunction &sfff,
int state_idx)
{
const StaticData &staticData = StaticData::Instance();
@ -217,7 +217,7 @@ void Hypothesis::EvaluateWith(const StatefulFeatureFunction &sfff,
}
}
void Hypothesis::EvaluateWith(const StatelessFeatureFunction& slff)
void Hypothesis::EvaluateWhenApplied(const StatelessFeatureFunction& slff)
{
const StaticData &staticData = StaticData::Instance();
if (! staticData.IsFeatureFunctionIgnored( slff )) {
@ -228,7 +228,7 @@ void Hypothesis::EvaluateWith(const StatelessFeatureFunction& slff)
/***
* calculate the logarithm of our total translation score (sum up components)
*/
void Hypothesis::Evaluate(const SquareMatrix &futureScore)
void Hypothesis::EvaluateWhenApplied(const SquareMatrix &futureScore)
{
IFVERBOSE(2) {
m_manager.GetSentenceStats().StartTimeOtherScore();
@ -244,7 +244,7 @@ void Hypothesis::Evaluate(const SquareMatrix &futureScore)
StatelessFeatureFunction::GetStatelessFeatureFunctions();
for (unsigned i = 0; i < sfs.size(); ++i) {
const StatelessFeatureFunction &ff = *sfs[i];
EvaluateWith(ff);
EvaluateWhenApplied(ff);
}
const vector<const StatefulFeatureFunction*>& ffs =
@ -332,7 +332,7 @@ void Hypothesis::CleanupArcList()
*/
const StaticData &staticData = StaticData::Instance();
size_t nBestSize = staticData.GetNBestSize();
bool distinctNBest = staticData.GetDistinctNBest() || staticData.UseMBR() || staticData.GetOutputSearchGraph() || staticData.GetOutputSearchGraphSLF() || staticData.GetOutputSearchGraphHypergraph() || staticData.UseLatticeMBR() ;
bool distinctNBest = staticData.GetDistinctNBest() || staticData.GetLatticeSamplesSize() || staticData.UseMBR() || staticData.GetOutputSearchGraph() || staticData.GetOutputSearchGraphSLF() || staticData.GetOutputSearchGraphHypergraph() || staticData.UseLatticeMBR() ;
if (!distinctNBest && m_arcList->size() > nBestSize * 5) {
// prune arc list only if there too many arcs

View File

@ -142,7 +142,7 @@ public:
return m_currTargetWordsRange.GetNumWordsCovered();
}
void Evaluate(const SquareMatrix &futureScore);
void EvaluateWhenApplied(const SquareMatrix &futureScore);
int GetId()const {
return m_id;
@ -256,8 +256,8 @@ public:
}
// Added by oliver.wilson@ed.ac.uk for async lm stuff.
void EvaluateWith(const StatefulFeatureFunction &sfff, int state_idx);
void EvaluateWith(const StatelessFeatureFunction &slff);
void EvaluateWhenApplied(const StatefulFeatureFunction &sfff, int state_idx);
void EvaluateWhenApplied(const StatelessFeatureFunction &slff);
//! target span that trans opt would populate if applied to this hypo. Used for alignment check
size_t GetNextStartPos(const TranslationOption &transOpt) const;

View File

@ -102,7 +102,7 @@ public:
return vertex.BestChild();
}
void Evaluate(const InputType &input, const InputPath &inputPath) {
void EvaluateWithSourceContext(const InputType &input, const InputPath &inputPath) {
// TODO for input lattice
}
private:

View File

@ -12,7 +12,7 @@ if $(with-dlib) {
with-lbllm = [ option.get "with-lbllm" ] ;
if $(with-lbllm) {
lbllm2 = <cxxflags>-std=c++0x <define>LM_LBL <include>$(with-lbllm)/src <include>$(with-lbllm)/3rdparty/eigen-3 ;
lbllm2 = <cxxflags>-std=c++0x <define>LM_LBL <include>$(with-lbllm)/src <include>$(with-lbllm)/third_party/eigen ;
} else {
lbllm2 = ;
}

View File

@ -94,9 +94,10 @@ if $(with-nplm) {
local with-lbllm = [ option.get "with-lbllm" ] ;
if $(with-lbllm) {
lib lbl : : <search>$(with-lbllm)/lib <search>$(with-lbllm)/lib64 ;
obj LBLLM.o : oxlm/LBLLM.cpp lbl ..//headers : <include>$(with-lbllm)/src <include>$(with-lbllm)/3rdparty/eigen-3 ;
obj Mapper.o : oxlm/Mapper.cpp lbl ..//headers : <include>$(with-lbllm)/src <include>$(with-lbllm)/3rdparty/eigen-3 ;
alias lbllm : LBLLM.o Mapper.o lbl : : : <cxxflags>-std=c++0x <define>LM_LBL ;
lib murmurhash : : <search>$(with-lbllm)/lib <search>$(with-lbllm)/lib64 ;
obj LBLLM.o : oxlm/LBLLM.cpp lbl ..//headers : <include>$(with-lbllm)/src <include>$(with-lbllm)/third_party/eigen ;
obj Mapper.o : oxlm/Mapper.cpp lbl ..//headers : <include>$(with-lbllm)/src <include>$(with-lbllm)/third_party/eigen ;
alias lbllm : LBLLM.o Mapper.o lbl murmurhash /top//boost_filesystem : : : <cxxflags>-std=c++0x <define>LM_LBL ;
dependencies += lbllm ;
lmmacros += LM_LBL ;
}

View File

@ -1,11 +1,171 @@
#include "LBLLM.h"
#include <boost/archive/binary_iarchive.hpp>
#include <boost/archive/binary_oarchive.hpp>
#include <boost/filesystem.hpp>
#include <boost/functional/hash.hpp>
#include "moses/FactorCollection.h"
#include "moses/InputType.h"
using namespace std;
using namespace oxlm;
namespace Moses
{
template<class Model>
LBLLM<Model>::LBLLM(const string &line)
// BUG FIX: persistentCache was never initialized -- SetParameter only
// assigns it when the "persistent-cache" key is present, so the destructor
// and GetValue could read an indeterminate bool. Default it to false;
// ReadParameters() below may still switch it on.
  : LanguageModelSingleFactor(line), persistentCache(false) {
  ReadParameters();

  FactorCollection &factorCollection = FactorCollection::Instance();

  // The parent language model classes expect the subclass to register the
  // sentence-boundary factors/words.
  m_sentenceStart = factorCollection.AddFactor(Output, m_factorType, BOS_);
  m_sentenceStartWord[m_factorType] = m_sentenceStart;

  m_sentenceEnd = factorCollection.AddFactor(Output, m_factorType, EOS_);
  m_sentenceEndWord[m_factorType] = m_sentenceEnd;

  // Cache statistics reported by the destructor.
  cacheHits = totalHits = 0;
}
template<class Model>
LBLLM<Model>::~LBLLM() {
  if (persistentCache) {
    // Guard against division by zero: totalHits is 0 when the model was
    // loaded but no n-gram was ever queried (previously printed nan/inf).
    double cache_hit_ratio =
      (totalHits > 0) ? 100.0 * cacheHits / totalHits : 0.0;
    cerr << "Cache hit ratio: " << cache_hit_ratio << endl;
  }
}
// Handle LBLLM-specific configuration keys; everything else is delegated to
// the base language model class.
template<class Model>
void LBLLM<Model>::SetParameter(const string& key, const string& value) {
  if (key != "persistent-cache") {
    LanguageModelSingleFactor::SetParameter(key, value);
    return;
  }
  persistentCache = Scan<bool>(value);
}
// Load the oxlm model from m_filePath, build the Moses<->oxlm vocabulary
// mapper, and cache the special token ids. Aborts if the model's n-gram
// order disagrees with the order Moses was configured with.
template<class Model>
void LBLLM<Model>::Load() {
  model.load(m_filePath);

  // The mapper must be built from the freshly loaded model's dictionary.
  Dict dict = model.getDict();
  mapper = boost::make_shared<OXLMMapper>(dict);
  kSTART = dict.Convert("<s>");
  kSTOP = dict.Convert("</s>");
  kUNKNOWN = dict.Convert("<unk>");

  size_t ngram_order = model.getConfig()->ngram_order;
  UTIL_THROW_IF2(
    m_nGramOrder != ngram_order,
    "Wrong order for LBLLM: LM has " << ngram_order << ", but Moses expects " << m_nGramOrder);
}
// Score one n-gram: convert the Moses context words to oxlm ids, pad/trim
// the context to n-1 tokens, query the model (optionally via the per-thread
// persistent cache), and encode the recombination state as a hash of the
// last n-1 words.
template<class Model>
LMResult LBLLM<Model>::GetValue(
  const vector<const Word*> &contextFactor, State* finalState) const {
  // Lazily create the thread-local cache (GetValue may run on a worker
  // thread that never went through InitializeForInput).
  if (!cache.get()) {
    cache.reset(new QueryCache());
  }

  vector<int> context;
  int word;
  mapper->convert(contextFactor, context, word);

  size_t context_width = m_nGramOrder - 1;

  // Pad a short context up to full width: with <s> at the back, pad with
  // more <s>; otherwise pad with <unk>. (resize also trims if too long.)
  if (!context.empty() && context.back() == kSTART) {
    context.resize(context_width, kSTART);
  } else {
    context.resize(context_width, kUNKNOWN);
  }

  double score;
  if (persistentCache) {
    ++totalHits;
    NGram query(word, context);
    pair<double, bool> ret = cache->get(query);
    if (ret.second) {
      // Cache hit: ret.first already holds the model score.
      score = ret.first;
      ++cacheHits;
    } else {
      score = model.predict(word, context);
      cache->put(query, score);
    }
  } else {
    score = model.predict(word, context);
  }

  LMResult ret;
  ret.score = score;
  ret.unknown = (word == kUNKNOWN);

  // calc state from hash of last n-1 words
  // NOTE(review): the State is just a size_t hash cast to a pointer, so
  // distinct contexts can collide -- presumably acceptable for hypothesis
  // recombination here; confirm against how State is compared by callers.
  size_t seed = 0;
  boost::hash_combine(seed, word);
  for (size_t i = 0; i < context.size() && i < context_width - 1; ++i) {
    int id = context[i];
    boost::hash_combine(seed, id);
  }

  (*finalState) = (State*) seed;
  return ret;
}
// Per-sentence setup: when the persistent cache is enabled, load previously
// saved n-gram probabilities for this sentence id from
// "<model>.<sentence_id>.cache.bin" into the thread-local cache (if the
// file exists; otherwise start from whatever the cache already holds).
template<class Model>
void LBLLM<Model>::InitializeForInput(const InputType& source) {
  LanguageModelSingleFactor::InitializeForInput(source);

  if (persistentCache) {
    // Thread-local cache; create it on first use by this thread.
    if (!cache.get()) {
      cache.reset(new QueryCache());
    }

    int sentence_id = source.GetTranslationId();
    string cacheFile = m_filePath + "." + to_string(sentence_id) + ".cache.bin";
    if (boost::filesystem::exists(cacheFile)) {
      ifstream f(cacheFile);
      boost::archive::binary_iarchive iar(f);
      cerr << "Loading n-gram probability cache from " << cacheFile << endl;
      iar >> *cache;
      cerr << "Done loading " << cache->size()
           << " n-gram probabilities..." << endl;
    } else {
      cerr << "Cache file not found" << endl;
    }
  }
}
// Per-sentence teardown: clear the oxlm model's internal cache and, when the
// persistent cache is enabled, serialize the thread-local query cache to
// "<model>.<sentence_id>.cache.bin" and clear it for the next sentence.
// NOTE(review): *cache is dereferenced unchecked -- assumes this runs on the
// same thread that created the cache in InitializeForInput; confirm.
template<class Model>
void LBLLM<Model>::CleanUpAfterSentenceProcessing(const InputType& source) {
  model.clearCache();

  if (persistentCache) {
    int sentence_id = source.GetTranslationId();
    string cacheFile = m_filePath + "." + to_string(sentence_id) + ".cache.bin";
    ofstream f(cacheFile);
    boost::archive::binary_oarchive oar(f);
    cerr << "Saving persistent cache to " << cacheFile << endl;
    oar << *cache;
    cerr << "Done saving " << cache->size()
         << " n-gram probabilities..." << endl;

    cache->clear();
  }

  LanguageModelSingleFactor::CleanUpAfterSentenceProcessing(source);
}
template class LBLLM<LM>;
template class LBLLM<FactoredLM>;
template class LBLLM<FactoredMaxentLM>;
}

View File

@ -2,15 +2,12 @@
#pragma once
#include <vector>
#include <boost/functional/hash.hpp>
#include "moses/LM/SingleFactor.h"
#include "moses/FactorCollection.h"
// lbl stuff
#include "corpus/corpus.h"
#include "lbl/lbl_features.h"
#include "lbl/model.h"
#include "lbl/process_identifier.h"
#include "lbl/query_cache.h"
#include "Mapper.h"
@ -22,100 +19,34 @@ namespace Moses
template<class Model>
class LBLLM : public LanguageModelSingleFactor
{
protected:
public:
LBLLM(const std::string &line)
:LanguageModelSingleFactor(line)
{
ReadParameters();
LBLLM(const std::string &line);
FactorCollection &factorCollection = FactorCollection::Instance();
~LBLLM();
// needed by parent language model classes. Why didn't they set these themselves?
m_sentenceStart = factorCollection.AddFactor(Output, m_factorType, BOS_);
m_sentenceStartWord[m_factorType] = m_sentenceStart;
void SetParameter(const std::string& key, const std::string& value);
m_sentenceEnd = factorCollection.AddFactor(Output, m_factorType, EOS_);
m_sentenceEndWord[m_factorType] = m_sentenceEnd;
}
void Load();
~LBLLM()
{}
virtual LMResult GetValue(
const std::vector<const Word*> &contextFactor,
State* finalState = 0) const;
void Load()
{
model.load(m_filePath);
virtual void InitializeForInput(const InputType& source);
config = model.getConfig();
int context_width = config->ngram_order - 1;
// For each state, we store at most context_width word ids to the left and
// to the right and a kSTAR separator. The last bit represents the actual
// size of the state.
//int max_state_size = (2 * context_width + 1) * sizeof(int) + 1;
//FeatureFunction::SetStateSize(max_state_size);
dict = model.getDict();
mapper = boost::make_shared<OXLMMapper>(dict);
//stateConverter = boost::make_shared<CdecStateConverter>(max_state_size - 1);
//ruleConverter = boost::make_shared<CdecRuleConverter>(mapper, stateConverter);
kSTART = dict.Convert("<s>");
kSTOP = dict.Convert("</s>");
kUNKNOWN = dict.Convert("<unk>");
}
virtual LMResult GetValue(const std::vector<const Word*> &contextFactor, State* finalState = 0) const
{
std::vector<int> context;
int word;
mapper->convert(contextFactor, context, word);
size_t context_width = m_nGramOrder - 1;
if (!context.empty() && context.back() == kSTART) {
context.resize(context_width, kSTART);
} else {
context.resize(context_width, kUNKNOWN);
}
double score;
score = model.predict(word, context);
/*
std::string str = DebugContextFactor(contextFactor);
std::cerr << "contextFactor=" << str << " " << score << std::endl;
*/
LMResult ret;
ret.score = score;
ret.unknown = (word == kUNKNOWN);
// calc state from hash of last n-1 words
size_t seed = 0;
boost::hash_combine(seed, word);
for (size_t i = 0; i < context.size() && i < context_width - 1; ++i) {
int id = context[i];
boost::hash_combine(seed, id);
}
(*finalState) = (State*) seed;
return ret;
}
virtual void CleanUpAfterSentenceProcessing(const InputType& source);
protected:
oxlm::Dict dict;
boost::shared_ptr<oxlm::ModelData> config;
Model model;
boost::shared_ptr<OXLMMapper> mapper;
int kSTART;
int kSTOP;
int kUNKNOWN;
boost::shared_ptr<OXLMMapper> mapper;
bool persistentCache;
mutable boost::thread_specific_ptr<oxlm::QueryCache> cache;
mutable int cacheHits, totalHits;
};

View File

@ -422,7 +422,7 @@ void PDTAimp::CreateTargetPhrase(TargetPhrase& targetPhrase,
}
targetPhrase.GetScoreBreakdown().Assign(m_obj, transVector);
targetPhrase.Evaluate(*srcPtr, m_obj->GetFeaturesToApply());
targetPhrase.EvaluateInIsolation(*srcPtr, m_obj->GetFeaturesToApply());
}
TargetPhraseCollectionWithSourcePhrase* PDTAimp::PruneTargetCandidates

View File

@ -9,6 +9,7 @@
#include "moses/PP/TreeStructurePhraseProperty.h"
#include "moses/PP/SpanLengthPhraseProperty.h"
#include "moses/PP/NonTermContextProperty.h"
#include "moses/PP/OrientationPhraseProperty.h"
namespace Moses
{
@ -59,6 +60,7 @@ PhrasePropertyFactory::PhrasePropertyFactory()
MOSES_PNAME2("Tree",TreeStructurePhraseProperty);
MOSES_PNAME2("SpanLength", SpanLengthPhraseProperty);
MOSES_PNAME2("NonTermContext", NonTermContextProperty);
MOSES_PNAME2("Orientation", OrientationPhraseProperty);
}
PhrasePropertyFactory::~PhrasePropertyFactory()

View File

@ -0,0 +1,26 @@
#include "moses/PP/OrientationPhraseProperty.h"
#include <iostream>
namespace Moses
{
// Parse the property value: eight whitespace-separated probabilities for
// bidirectional MSLR phrase orientation with 2x4 orientation classes:
// mono swap dright dleft (left-to-right, then right-to-left).
// Throws via UTIL_THROW2 if fewer than eight values can be read.
void OrientationPhraseProperty::ProcessValue(const std::string &value)
{
  std::istringstream tokenizer(value);

  // BUG FIX: the previous try/catch around this block masked the specific
  // error message -- UTIL_THROW2 throws a util::Exception derived from
  // std::exception, so the catch replaced it with a generic "Read error".
  // Stream extraction itself reports failure via the stream state and does
  // not throw (exceptions() is never enabled), so no try/catch is needed.
  if (! (tokenizer >> m_l2rMonoProbability >> m_l2rSwapProbability >> m_l2rDrightProbability >> m_l2rDleftProbability
         >> m_r2lMonoProbability >> m_r2lSwapProbability >> m_r2lDrightProbability >> m_r2lDleftProbability)) {
    UTIL_THROW2("OrientationPhraseProperty: Not able to read value. Flawed property?");
  }
}
} // namespace Moses

View File

@ -0,0 +1,65 @@
#pragma once
#include "moses/PP/PhraseProperty.h"
#include "util/exception.hh"
#include <string>
namespace Moses
{
// Phrase property holding bidirectional MSLR lexicalized-reordering
// probabilities: 2 directions (left-to-right, right-to-left) x 4 orientation
// classes (mono, swap, dright, dleft), parsed from the phrase table by
// ProcessValue().
class OrientationPhraseProperty : public PhraseProperty
{
public:
  OrientationPhraseProperty() {};

  // Parses eight probabilities from 'value'; throws on malformed input.
  virtual void ProcessValue(const std::string &value);


  double GetLeftToRightProbabilityMono() const {
    return m_l2rMonoProbability;
  };

  double GetLeftToRightProbabilitySwap() const {
    return m_l2rSwapProbability;
  };

  double GetLeftToRightProbabilityDright() const {
    return m_l2rDrightProbability;
  };

  double GetLeftToRightProbabilityDleft() const {
    return m_l2rDleftProbability;
  };

  double GetRightToLeftProbabilityMono() const {
    return m_r2lMonoProbability;
  };

  double GetRightToLeftProbabilitySwap() const {
    return m_r2lSwapProbability;
  };

  double GetRightToLeftProbabilityDright() const {
    return m_r2lDrightProbability;
  };

  double GetRightToLeftProbabilityDleft() const {
    return m_r2lDleftProbability;
  };

  // This property keeps only the parsed probabilities, not the raw string;
  // asking for the value string is always an error.
  virtual const std::string *GetValueString() const {
    UTIL_THROW2("OrientationPhraseProperty: value string not available in this phrase property");
    return NULL;
  };

protected:
  // Stored as float (getters widen to double): l2r/r2l x mono/swap/dright/dleft.
  float m_l2rMonoProbability, m_l2rSwapProbability, m_l2rDrightProbability, m_l2rDleftProbability,
        m_r2lMonoProbability, m_r2lSwapProbability, m_r2lDrightProbability, m_r2lDleftProbability;
};
} // namespace Moses

View File

@ -16,12 +16,12 @@ void SourceLabelsPhraseProperty::ProcessValue(const std::string &value)
std::istringstream tokenizer(value);
if (! (tokenizer >> m_nNTs)) { // first token: number of non-terminals (incl. left-hand side)
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read number of non-terminals. Flawed property?");
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read number of non-terminals. Flawed property? " << value);
}
assert( m_nNTs > 0 );
if (! (tokenizer >> m_totalCount)) { // second token: overall rule count
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read overall rule count. Flawed property?");
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read overall rule count. Flawed property? " << value);
}
assert( m_totalCount > 0.0 );
@ -32,7 +32,7 @@ void SourceLabelsPhraseProperty::ProcessValue(const std::string &value)
std::priority_queue<float> ruleLabelledCountsPQ;
while (tokenizer.peek() != EOF) {
try {
// try {
SourceLabelsPhrasePropertyItem item;
size_t numberOfLHSsGivenRHS = std::numeric_limits<std::size_t>::max();
@ -46,28 +46,28 @@ void SourceLabelsPhraseProperty::ProcessValue(const std::string &value)
for (size_t i=0; i<m_nNTs-1; ++i) { // RHS source non-terminal labels
size_t sourceLabelRHS;
if (! (tokenizer >> sourceLabelRHS) ) { // RHS source non-terminal label
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read right-hand side label index. Flawed property?");
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read right-hand side label index. Flawed property? " << value);
}
item.m_sourceLabelsRHS.push_back(sourceLabelRHS);
}
if (! (tokenizer >> item.m_sourceLabelsRHSCount)) {
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read right-hand side count. Flawed property?");
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read right-hand side count. Flawed property? " << value);
}
if (! (tokenizer >> numberOfLHSsGivenRHS)) {
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read number of left-hand sides. Flawed property?");
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read number of left-hand sides. Flawed property? " << value);
}
}
for (size_t i=0; i<numberOfLHSsGivenRHS && tokenizer.peek()!=EOF; ++i) { // LHS source non-terminal labels seen with this RHS
size_t sourceLabelLHS;
if (! (tokenizer >> sourceLabelLHS)) { // LHS source non-terminal label
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read left-hand side label index. Flawed property?");
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read left-hand side label index. Flawed property? " << value);
}
float ruleSourceLabelledCount;
if (! (tokenizer >> ruleSourceLabelledCount)) {
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read count. Flawed property?");
UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read count. Flawed property? " << value);
}
item.m_sourceLabelsLHSList.push_back( std::make_pair(sourceLabelLHS,ruleSourceLabelledCount) );
ruleLabelledCountsPQ.push(ruleSourceLabelledCount);
@ -75,9 +75,9 @@ void SourceLabelsPhraseProperty::ProcessValue(const std::string &value)
m_sourceLabelItems.push_back(item);
} catch (const std::exception &e) {
UTIL_THROW2("SourceLabelsPhraseProperty: Read error. Flawed property?");
}
// } catch (const std::exception &e) {
// UTIL_THROW2("SourceLabelsPhraseProperty: Read error. Flawed property?");
// }
}
// keep only top N label vectors

View File

@ -50,7 +50,7 @@ Parameter::Parameter()
AddParam("factor-delimiter", "fd", "specify a different factor delimiter than the default");
AddParam("input-factors", "list of factors in the input");
AddParam("input-file", "i", "location of the input file to be translated");
AddParam("inputtype", "text (0), confusion network (1), word lattice (2) (default = 0)");
AddParam("inputtype", "text (0), confusion network (1), word lattice (2), tree (3) (default = 0)");
AddParam("labeled-n-best-list", "print out labels for each weight type in n-best list. default is true");
AddParam("mark-unknown", "mu", "mark unknown words in output");
AddParam("max-partial-trans-opt", "maximum number of partial translation options per input span (during mapping steps)");

View File

@ -79,7 +79,7 @@ void RuleCubeItem::CreateHypothesis(const ChartTranslationOptions &transOpt,
ChartManager &manager)
{
m_hypothesis = new ChartHypothesis(transOpt, *this, manager);
m_hypothesis->Evaluate();
m_hypothesis->EvaluateWhenApplied();
m_score = m_hypothesis->GetTotalScore();
}

View File

@ -261,6 +261,11 @@ public:
void PlusEquals(const FeatureFunction* sp, const ScorePair &scorePair);
// Add score by index
void PlusEquals(size_t index, float score) {
m_scores[index] += score;
}
//For features which have an unbounded number of components
void SparsePlusEquals(const std::string& full_name, float score) {
FName fname(full_name);
@ -283,7 +288,7 @@ public:
m_scores[indexes.first] = score;
}
// Assign core weight by index
// Assign score by index
void Assign(size_t index, float score) {
m_scores[index] = score;
}
@ -354,6 +359,11 @@ public:
m_scores.capMin(minValue);
}
std::pair<size_t,size_t> GetIndexesForProducer(const FeatureFunction* sp) const {
IndexPair indexPair = GetIndexes(sp);
return indexPair;
}
//! if a FeatureFunction produces a single score (for example, a language model score)
//! this will return it. If not, this method will throw
float GetScoreForProducer(const FeatureFunction* sp) const {

View File

@ -288,7 +288,7 @@ void SearchNormal::ExpandHypothesis(const Hypothesis &hypothesis, const Translat
stats.StopTimeBuildHyp();
}
if (newHypo==NULL) return;
newHypo->Evaluate(m_transOptColl.GetFutureScore());
newHypo->EvaluateWhenApplied(m_transOptColl.GetFutureScore());
} else
// early discarding: check if hypothesis is too bad to build
{

View File

@ -159,13 +159,13 @@ void SearchNormalBatch::EvalAndMergePartialHypos()
++sfff_iter) {
const StatefulFeatureFunction &ff = *(sfff_iter->second);
int state_idx = sfff_iter->first;
hypo->EvaluateWith(ff, state_idx);
hypo->EvaluateWhenApplied(ff, state_idx);
}
std::vector<const StatelessFeatureFunction*>::iterator slff_iter;
for (slff_iter = m_stateless_ffs.begin();
slff_iter != m_stateless_ffs.end();
++slff_iter) {
hypo->EvaluateWith(**slff_iter);
hypo->EvaluateWhenApplied(**slff_iter);
}
}
@ -190,7 +190,7 @@ void SearchNormalBatch::EvalAndMergePartialHypos()
dlm_iter != m_dlm_ffs.end();
++dlm_iter) {
LanguageModel &lm = *(dlm_iter->second);
hypo->EvaluateWith(lm, (*dlm_iter).first);
hypo->EvaluateWhenApplied(lm, (*dlm_iter).first);
}
// Put completed hypothesis onto its stack.

View File

@ -125,6 +125,9 @@ bool StaticData::LoadData(Parameter *parameter)
if (m_inputType == 2) {
s_it = "word lattice";
}
if (m_inputType == 3) {
s_it = "tree";
}
VERBOSE(2,"input type is: "<<s_it<<"\n");
if(m_parameter->GetParam("recover-input-path").size()) {

View File

@ -101,13 +101,13 @@ void TargetPhrase::WriteToRulePB(hgmert::Rule* pb) const
}
#endif
void TargetPhrase::Evaluate(const Phrase &source)
void TargetPhrase::EvaluateInIsolation(const Phrase &source)
{
const std::vector<FeatureFunction*> &ffs = FeatureFunction::GetFeatureFunctions();
Evaluate(source, ffs);
EvaluateInIsolation(source, ffs);
}
void TargetPhrase::Evaluate(const Phrase &source, const std::vector<FeatureFunction*> &ffs)
void TargetPhrase::EvaluateInIsolation(const Phrase &source, const std::vector<FeatureFunction*> &ffs)
{
if (ffs.size()) {
const StaticData &staticData = StaticData::Instance();
@ -126,7 +126,7 @@ void TargetPhrase::Evaluate(const Phrase &source, const std::vector<FeatureFunct
}
}
void TargetPhrase::Evaluate(const InputType &input, const InputPath &inputPath)
void TargetPhrase::EvaluateWithSourceContext(const InputType &input, const InputPath &inputPath)
{
const std::vector<FeatureFunction*> &ffs = FeatureFunction::GetFeatureFunctions();
const StaticData &staticData = StaticData::Instance();

View File

@ -71,14 +71,14 @@ public:
~TargetPhrase();
// 1st evaluate method. Called during loading of phrase table.
void Evaluate(const Phrase &source, const std::vector<FeatureFunction*> &ffs);
void EvaluateInIsolation(const Phrase &source, const std::vector<FeatureFunction*> &ffs);
// as above, score with ALL FFs
// Used only for OOV processing. Doesn't have a phrase table connect with it
void Evaluate(const Phrase &source);
void EvaluateInIsolation(const Phrase &source);
// 'inputPath' is guaranteed to be the raw substring from the input. No factors were added or taken away
void Evaluate(const InputType &input, const InputPath &inputPath);
void EvaluateWithSourceContext(const InputType &input, const InputPath &inputPath);
void SetSparseScore(const FeatureFunction* translationScoreProducer, const StringPiece &sparseString);

View File

@ -418,7 +418,7 @@ TargetPhraseVectorPtr PhraseDecoder::DecodeCollection(
}
if(eval) {
targetPhrase->Evaluate(sourcePhrase, m_phraseDictionary.GetFeaturesToApply());
targetPhrase->EvaluateInIsolation(sourcePhrase, m_phraseDictionary.GetFeaturesToApply());
}
if(m_coding == PREnc) {

View File

@ -60,7 +60,7 @@ GetTargetPhraseCollectionLEGACY(const Phrase& src) const
BOOST_FOREACH(pstat_entry & e, pstats) {
TargetPhrase* tp = m_biSA->GetMosesFactorIDs(e.first, src, this);
tp->GetScoreBreakdown().Assign(this,e.second);
tp->Evaluate(src);
tp->EvaluateInIsolation(src);
ret->Add(tp);
}
// return ret;

View File

@ -147,7 +147,7 @@ void PhraseDictionaryMultiModel::CollectSufficientStatistics(const Phrase& src,
vector<FeatureFunction*> pd_feature;
pd_feature.push_back(m_pd[i]);
const vector<FeatureFunction*> pd_feature_const(pd_feature);
statistics->targetPhrase->Evaluate(src, pd_feature_const);
statistics->targetPhrase->EvaluateInIsolation(src, pd_feature_const);
// zero out scores from original phrase table
statistics->targetPhrase->GetScoreBreakdown().ZeroDenseFeatures(&pd);
@ -186,7 +186,7 @@ TargetPhraseCollection* PhraseDictionaryMultiModel::CreateTargetPhraseCollection
vector<FeatureFunction*> pd_feature;
pd_feature.push_back(const_cast<PhraseDictionaryMultiModel*>(this));
const vector<FeatureFunction*> pd_feature_const(pd_feature);
statistics->targetPhrase->Evaluate(src, pd_feature_const);
statistics->targetPhrase->EvaluateInIsolation(src, pd_feature_const);
ret->Add(new TargetPhrase(*statistics->targetPhrase));
}

View File

@ -189,7 +189,7 @@ void PhraseDictionaryMultiModelCounts::CollectSufficientStatistics(const Phrase&
vector<FeatureFunction*> pd_feature;
pd_feature.push_back(m_pd[i]);
const vector<FeatureFunction*> pd_feature_const(pd_feature);
statistics->targetPhrase->Evaluate(src, pd_feature_const);
statistics->targetPhrase->EvaluateInIsolation(src, pd_feature_const);
// zero out scores from original phrase table
statistics->targetPhrase->GetScoreBreakdown().ZeroDenseFeatures(&pd);
@ -251,7 +251,7 @@ TargetPhraseCollection* PhraseDictionaryMultiModelCounts::CreateTargetPhraseColl
vector<FeatureFunction*> pd_feature;
pd_feature.push_back(const_cast<PhraseDictionaryMultiModelCounts*>(this));
const vector<FeatureFunction*> pd_feature_const(pd_feature);
statistics->targetPhrase->Evaluate(src, pd_feature_const);
statistics->targetPhrase->EvaluateInIsolation(src, pd_feature_const);
} catch (AlignmentException& e) {
continue;
}

View File

@ -132,7 +132,7 @@ std::vector<TargetPhrase*> PhraseDictionaryTransliteration::CreateTargetPhrases(
tp->GetScoreBreakdown().PlusEquals(this, score);
// score of all other ff when this rule is being loaded
tp->Evaluate(sourcePhrase, GetFeaturesToApply());
tp->EvaluateInIsolation(sourcePhrase, GetFeaturesToApply());
ret.push_back(tp);
}

View File

@ -181,7 +181,7 @@ TargetPhrase *ProbingPT::CreateTargetPhrase(const Phrase &sourcePhrase, const ta
*/
// score of all other ff when this rule is being loaded
tp->Evaluate(sourcePhrase, GetFeaturesToApply());
tp->EvaluateInIsolation(sourcePhrase, GetFeaturesToApply());
return tp;
}

View File

@ -226,7 +226,7 @@ bool RuleTableLoaderCompact::LoadRuleSection(
targetPhrase->SetAlignNonTerm(alignNonTerm);
targetPhrase->SetTargetLHS(targetLhs);
targetPhrase->Evaluate(sourcePhrase, ruleTable.GetFeaturesToApply());
targetPhrase->EvaluateInIsolation(sourcePhrase, ruleTable.GetFeaturesToApply());
// Insert rule into table.
TargetPhraseCollection &coll = GetOrCreateTargetPhraseCollection(

View File

@ -247,7 +247,7 @@ bool RuleTableLoaderStandard::Load(FormatType format
}
targetPhrase->GetScoreBreakdown().Assign(&ruleTable, scoreVector);
targetPhrase->Evaluate(sourcePhrase, ruleTable.GetFeaturesToApply());
targetPhrase->EvaluateInIsolation(sourcePhrase, ruleTable.GetFeaturesToApply());
TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(ruleTable, sourcePhrase, *targetPhrase, sourceLHS);
phraseColl.Add(targetPhrase);

View File

@ -284,7 +284,7 @@ void PhraseDictionaryFuzzyMatch::InitializeForInput(InputType const& inputSenten
std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore);
targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
targetPhrase->Evaluate(sourcePhrase, GetFeaturesToApply());
targetPhrase->EvaluateInIsolation(sourcePhrase, GetFeaturesToApply());
TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(rootNode, sourcePhrase, *targetPhrase, sourceLHS);
phraseColl.Add(targetPhrase);

View File

@ -62,7 +62,7 @@ TargetPhrase *SkeletonPT::CreateTargetPhrase(const Phrase &sourcePhrase) const
tp->GetScoreBreakdown().PlusEquals(this, scores);
// score of all other ff when this rule is being loaded
tp->Evaluate(sourcePhrase, GetFeaturesToApply());
tp->EvaluateInIsolation(sourcePhrase, GetFeaturesToApply());
return tp;
}

View File

@ -499,6 +499,16 @@ namespace Moses {
aln[k] += s2 - s1;
Token const* o = (j->fwd ? ag.bt.T2 : ag.bt.T1)->sntStart(sid);
float sample_weight = 1./((s2-s1+1)*(e2-e1+1));
vector<uint64_t> seen;
seen.reserve(100);
// It is possible that the phrase extraction extracts the same
// phrase twice, e.g., when word a co-occurs with sequence b b b
// but is aligned only to the middle word. We can only count
// each phrase pair once per source phrase occurrence, or else
// run the risk of having more joint counts than marginal
// counts.
for (size_t s = s1; s <= s2; ++s)
{
sptr<iter> b = (j->fwd ? ag.bt.I2 : ag.bt.I1)->find(o+s,e1-s);
@ -507,7 +517,26 @@ namespace Moses {
// assert(b);
for (size_t i = e1; i <= e2; ++i)
{
if (! j->stats->add(b->getPid(),sample_weight,aln,
uint64_t tpid = b->getPid();
size_t s = 0;
while (s < seen.size() && seen[s] != tpid) ++s;
if (s < seen.size())
{
#if 0
size_t sid, off, len;
parse_pid(tpid,sid,off,len);
cerr << "HA, gotcha! " << sid << ":" << off << " at " << HERE << endl;
for (size_t z = 0; z < len; ++z)
{
id_type tid = ag.bt.T2->sntStart(sid)[off+z].id();
cerr << (*ag.bt.V2)[tid] << " ";
}
cerr << endl;
#endif
continue;
}
seen.push_back(tpid);
if (! j->stats->add(tpid,sample_weight,aln,
b->approxOccurrenceCount(),
po_fwd,po_bwd))
{

View File

@ -476,7 +476,7 @@ namespace Moses
tp->AddWord(w);
}
tp->GetScoreBreakdown().Assign(this, fvals);
tp->Evaluate(src);
tp->EvaluateInIsolation(src);
return tp;
}

View File

@ -71,10 +71,10 @@ void TranslationOption::CacheLexReorderingScores(const LexicalReordering &produc
m_lexReorderingScores[&producer] = score;
}
void TranslationOption::Evaluate(const InputType &input)
void TranslationOption::EvaluateWithSourceContext(const InputType &input)
{
const InputPath &inputPath = GetInputPath();
m_targetPhrase.Evaluate(input, inputPath);
m_targetPhrase.EvaluateWithSourceContext(input, inputPath);
}
const InputPath &TranslationOption::GetInputPath() const

View File

@ -135,7 +135,7 @@ public:
return m_targetPhrase.GetScoreBreakdown();
}
void Evaluate(const InputType &input);
void EvaluateWithSourceContext(const InputType &input);
/** returns cached scores */
inline const Scores *GetLexReorderingScores(const LexicalReordering *scoreProducer) const {

View File

@ -212,6 +212,12 @@ void TranslationOptionCollection::ProcessOneUnknownWord(const InputPath &inputPa
float unknownScore = FloorScore(TransformScore(0));
const Word &sourceWord = inputPath.GetPhrase().GetWord(0);
// hack. Once the OOV FF is a phrase table, get rid of this
PhraseDictionary *firstPt = NULL;
if (PhraseDictionary::GetColl().size() == 0) {
firstPt = PhraseDictionary::GetColl()[0];
}
// unknown word, add as trans opt
FactorCollection &factorCollection = FactorCollection::Instance();
@ -231,7 +237,7 @@ void TranslationOptionCollection::ProcessOneUnknownWord(const InputPath &inputPa
// modify the starting bitmap
}
TargetPhrase targetPhrase(NULL);
TargetPhrase targetPhrase(firstPt);
if (!(staticData.GetDropUnknown() || isEpsilon) || isDigit) {
// add to dictionary
@ -266,7 +272,7 @@ void TranslationOptionCollection::ProcessOneUnknownWord(const InputPath &inputPa
m_unksrcs.push_back(&sourcePhrase);
WordsRange range(sourcePos, sourcePos + length - 1);
targetPhrase.Evaluate(sourcePhrase);
targetPhrase.EvaluateInIsolation(sourcePhrase);
TranslationOption *transOpt = new TranslationOption(range, targetPhrase);
transOpt->SetInputPath(inputPath);
@ -410,7 +416,7 @@ void TranslationOptionCollection::CreateTranslationOptions()
ProcessUnknownWord();
EvaluateWithSource();
EvaluateWithSourceContext();
// Prune
Prune();
@ -535,7 +541,7 @@ void TranslationOptionCollection::SetInputScore(const InputPath &inputPath, Part
}
}
void TranslationOptionCollection::EvaluateWithSource()
void TranslationOptionCollection::EvaluateWithSourceContext()
{
const size_t size = m_source.GetSize();
for (size_t startPos = 0 ; startPos < size ; ++startPos) {
@ -549,7 +555,7 @@ void TranslationOptionCollection::EvaluateWithSource()
TranslationOptionList::const_iterator iterTransOpt;
for(iterTransOpt = transOptList.begin() ; iterTransOpt != transOptList.end() ; ++iterTransOpt) {
TranslationOption &transOpt = **iterTransOpt;
transOpt.Evaluate(m_source);
transOpt.EvaluateWithSourceContext(m_source);
}
}
}

View File

@ -96,7 +96,7 @@ protected:
//! implemented by inherited class, called by this class
virtual void ProcessUnknownWord(size_t sourcePos)=0;
void EvaluateWithSource();
void EvaluateWithSourceContext();
void CacheLexReordering();

View File

@ -147,7 +147,7 @@ void TranslationOptionCollectionLattice::CreateTranslationOptions()
const TargetPhrase &tp = **iter;
TranslationOption *transOpt = new TranslationOption(range, tp);
transOpt->SetInputPath(path);
transOpt->Evaluate(m_source);
transOpt->EvaluateWithSourceContext(m_source);
Add(transOpt);
}

View File

@ -5,6 +5,7 @@
#include "Util.h"
#include "XmlOption.h"
#include "FactorCollection.h"
#include "moses/TranslationModel/PhraseDictionary.h"
using namespace std;
@ -30,6 +31,12 @@ bool TreeInput::ProcessAndStripXMLTags(string &line, std::vector<XMLParseOutput>
return true;
}
// hack. What pt should XML trans opt be assigned to?
PhraseDictionary *firstPt = NULL;
if (PhraseDictionary::GetColl().size() == 0) {
firstPt = PhraseDictionary::GetColl()[0];
}
// break up input into a vector of xml tags and text
// example: (this), (<b>), (is a), (</b>), (test .)
vector<string> xmlTokens = TokenizeXml(line);
@ -173,7 +180,7 @@ bool TreeInput::ProcessAndStripXMLTags(string &line, std::vector<XMLParseOutput>
//TRACE_ERR("number of translations: " << altTexts.size() << endl);
for (size_t i=0; i<altTexts.size(); ++i) {
// set target phrase
TargetPhrase targetPhrase(NULL);
TargetPhrase targetPhrase(firstPt);
// targetPhrase.CreateFromString(Output, outputFactorOrder,altTexts[i],factorDelimiter, NULL);
targetPhrase.CreateFromString(Output, outputFactorOrder,altTexts[i], NULL);
@ -203,7 +210,7 @@ bool TreeInput::ProcessAndStripXMLTags(string &line, std::vector<XMLParseOutput>
// convert from prob to log-prob
float scoreValue = FloorScore(TransformScore(probValue));
targetPhrase.SetXMLScore(scoreValue);
targetPhrase.Evaluate(sourcePhrase);
targetPhrase.EvaluateInIsolation(sourcePhrase);
// set span and create XmlOption
WordsRange range(startPos+1,endPos);

View File

@ -30,6 +30,7 @@
#include "TargetPhrase.h"
#include "ReorderingConstraint.h"
#include "FactorCollection.h"
#include "moses/TranslationModel/PhraseDictionary.h"
namespace Moses
{
@ -160,6 +161,12 @@ bool ProcessAndStripXMLTags(string &line, vector<XmlOption*> &res, ReorderingCon
const StaticData &staticData = StaticData::Instance();
// hack. What pt should XML trans opt be assigned to?
PhraseDictionary *firstPt = NULL;
if (PhraseDictionary::GetColl().size() == 0) {
firstPt = PhraseDictionary::GetColl()[0];
}
// no xml tag? we're done.
//if (line.find_first_of('<') == string::npos) {
if (line.find(lbrackStr) == string::npos) {
@ -361,7 +368,7 @@ bool ProcessAndStripXMLTags(string &line, vector<XmlOption*> &res, ReorderingCon
float scoreValue = FloorScore(TransformScore(probValue));
WordsRange range(startPos + offset,endPos-1 + offset); // span covered by phrase
TargetPhrase targetPhrase(NULL);
TargetPhrase targetPhrase(firstPt);
// targetPhrase.CreateFromString(Output, outputFactorOrder,altTexts[i],factorDelimiter, NULL);
targetPhrase.CreateFromString(Output, outputFactorOrder,altTexts[i], NULL);
@ -375,7 +382,7 @@ bool ProcessAndStripXMLTags(string &line, vector<XmlOption*> &res, ReorderingCon
}
targetPhrase.SetXMLScore(scoreValue);
targetPhrase.Evaluate(sourcePhrase);
targetPhrase.EvaluateInIsolation(sourcePhrase);
XmlOption *option = new XmlOption(range,targetPhrase);
assert(option);

View File

@ -0,0 +1,159 @@
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include "PropertiesConsolidator.h"
#include <sstream>
#include <limits>
#include <vector>
#include "moses/Util.h"
#include "phrase-extract/InputFileStream.h"
#include "phrase-extract/OutputFileStream.h"
namespace MosesTraining
{
// Load the source label vocabulary ("label index" per line) from
// sourceLabelSetFile into m_sourceLabels and enable source-label
// processing in ProcessPropertiesString().
// Throws (via UTIL_THROW2 / UTIL_THROW_IF2) on malformed lines or
// duplicate labels.
void PropertiesConsolidator::ActivateSourceLabelsProcessing(const std::string &sourceLabelSetFile)
{
  Moses::InputFileStream inFile(sourceLabelSetFile);

  // read source label set
  m_sourceLabels.clear();
  std::string line;
  while (getline(inFile, line)) {
    std::istringstream tokenizer(line);
    std::string label;
    size_t index;
    // NOTE: stream extraction does not throw unless exceptions() is
    // enabled, so a try/catch around operator>> would never fire.
    // Test the stream state explicitly instead (same pattern as the
    // "if (! (tokenizer >> ...))" checks in ProcessPropertiesString),
    // so malformed lines are rejected rather than silently accepted.
    if (! (tokenizer >> label >> index)) {
      UTIL_THROW2("Error reading source label set file " << sourceLabelSetFile << " .");
    }
    std::pair< std::map<std::string,size_t>::iterator, bool > inserted = m_sourceLabels.insert( std::pair<std::string,size_t>(label,index) );
    UTIL_THROW_IF2(!inserted.second,"Source label set file " << sourceLabelSetFile << " should contain each syntactic label only once.");
  }

  inFile.Close();

  m_sourceLabelsFlag = true;
}
// Rewrite a phrase-table properties string (" {{Key value}} {{Key value}} ...").
// Properties other than "SourceLabels" are passed through unchanged. When
// source-label processing has been activated (ActivateSourceLabelsProcessing),
// the string labels inside a "SourceLabels" property are replaced by their
// numeric vocabulary indices from m_sourceLabels; counts are copied verbatim.
// Returns the rebuilt properties string (empty input is returned as-is).
std::string PropertiesConsolidator::ProcessPropertiesString(const std::string &propertiesString) const
{
if ( propertiesString.empty() ) {
return propertiesString;
}
std::ostringstream out;
std::vector<std::string> toks;
// Split on the opening "{{"; toks[0] is whatever precedes the first
// property (skipped by starting the loop at i = 1).
Moses::TokenizeMultiCharSeparator(toks, propertiesString, "{{");
for (size_t i = 1; i < toks.size(); ++i) {
std::string &tok = toks[i];
if (tok.empty()) {
continue;
}
// Strip the trailing "}}": endPos is the index of the LAST '}', so
// substr(0, endPos - 1) drops both closing braces.
size_t endPos = tok.rfind("}");
tok = tok.substr(0, endPos - 1);
// First space separates the property key from its value payload.
std::vector<std::string> keyValue = Moses::TokenizeFirstOnly(tok, " ");
assert(keyValue.size() == 2);
if ( !keyValue[0].compare("SourceLabels") ) {
if ( m_sourceLabelsFlag ) {
// SourceLabels additional property: replace strings with vocabulary indices
out << " {{" << keyValue[0];
std::istringstream tokenizer(keyValue[1]);
size_t nNTs;
double totalCount;
if (! (tokenizer >> nNTs)) { // first token: number of non-terminals (incl. left-hand side)
UTIL_THROW2("Not able to read number of non-terminals from SourceLabels property. "
<< "Flawed SourceLabels property?");
}
assert( nNTs > 0 );
out << " " << nNTs;
if (! (tokenizer >> totalCount)) { // second token: overall rule count
UTIL_THROW2("Not able to read overall rule count from SourceLabels property. "
<< "Flawed SourceLabels property?");
}
assert( totalCount > 0.0 );
out << " " << totalCount;
while (tokenizer.peek() != EOF) {
try {
// Sentinel: for purely lexical rules (nNTs == 1) the inner loop
// below is bounded only by tokenizer.peek() != EOF.
size_t numberOfLHSsGivenRHS = std::numeric_limits<std::size_t>::max();
std::string token;
if (nNTs > 1) { // rule has right-hand side non-terminals, i.e. it's a hierarchical rule
for (size_t i=0; i<nNTs-1; ++i) { // RHS source non-terminal labels
tokenizer >> token; // RHS source non-terminal label
std::map<std::string,size_t>::const_iterator found = m_sourceLabels.find(token);
UTIL_THROW_IF2(found == m_sourceLabels.end(), "Label \"" << token << "\" from the phrase table not found in given label set.");
out << " " << found->second;
}
tokenizer >> token; // sourceLabelsRHSCount
out << " " << token;
tokenizer >> numberOfLHSsGivenRHS;
out << " " << numberOfLHSsGivenRHS;
}
for (size_t i=0; i<numberOfLHSsGivenRHS && tokenizer.peek()!=EOF; ++i) { // LHS source non-terminal labels seen with this RHS
tokenizer >> token; // LHS source non-terminal label
std::map<std::string,size_t>::const_iterator found = m_sourceLabels.find(token);
UTIL_THROW_IF2(found == m_sourceLabels.end() ,"Label \"" << token << "\" from the phrase table not found in given label set.");
out << " " << found->second;
tokenizer >> token; // ruleSourceLabelledCount
out << " " << token;
}
} catch (const std::exception &e) {
// NOTE(review): istringstream extraction does not throw unless
// exceptions() is enabled, so this handler is unlikely to fire;
// a failed ">>" above silently leaves the variable unchanged.
// Consider explicit stream-state checks — confirm before changing,
// as that would alter behavior on malformed input.
UTIL_THROW2("Flawed item in SourceLabels property?");
}
}
out << "}}";
} else { // don't process source labels additional property
out << " {{" << keyValue[0] << " " << keyValue[1] << "}}";
}
} else {
// output other additional property
out << " {{" << keyValue[0] << " " << keyValue[1] << "}}";
}
}
return out.str();
}
} // namespace MosesTraining

View File

@ -0,0 +1,48 @@
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) University of Edinburgh
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#pragma once
#include <string>
#include <map>
namespace MosesTraining
{
/** Post-processes the additional-properties field (" {{Key value}} ...")
 *  of consolidated phrase-table entries. By default properties pass
 *  through unchanged; after ActivateSourceLabelsProcessing() is called,
 *  labels in "SourceLabels" properties are mapped to numeric indices. */
class PropertiesConsolidator
{
public:
// Starts with source-label processing disabled.
PropertiesConsolidator() : m_sourceLabelsFlag(false) {};
// Load the "label index" vocabulary file and enable source-label mapping.
void ActivateSourceLabelsProcessing(const std::string &sourceLabelSetFile);
// Return a rebuilt properties string (see implementation for format).
std::string ProcessPropertiesString(const std::string &propertiesString) const;
private:
// True once ActivateSourceLabelsProcessing() has run successfully.
bool m_sourceLabelsFlag;
// Maps source non-terminal label strings to their vocabulary indices.
std::map<std::string,size_t> m_sourceLabels;
};
} // namespace MosesTraining

View File

@ -28,6 +28,7 @@
#include "tables-core.h"
#include "InputFileStream.h"
#include "OutputFileStream.h"
#include "PropertiesConsolidator.h"
using namespace std;
@ -37,13 +38,14 @@ bool phraseCountFlag = false;
bool lowCountFlag = false;
bool goodTuringFlag = false;
bool kneserNeyFlag = false;
bool sourceLabelsFlag = false;
bool logProbFlag = false;
inline float maybeLogProb( float a )
{
return logProbFlag ? log(a) : a;
}
void processFiles( char*, char*, char*, char* );
void processFiles( char*, char*, char*, char*, char* );
void loadCountOfCounts( char* );
void breakdownCoreAndSparse( string combined, string &core, string &sparse );
bool getLine( istream &fileP, vector< string > &item );
@ -57,13 +59,14 @@ int main(int argc, char* argv[])
<< "consolidating direct and indirect rule tables\n";
if (argc < 4) {
cerr << "syntax: consolidate phrase-table.direct phrase-table.indirect phrase-table.consolidated [--Hierarchical] [--OnlyDirect] [--PhraseCount] \n";
cerr << "syntax: consolidate phrase-table.direct phrase-table.indirect phrase-table.consolidated [--Hierarchical] [--OnlyDirect] [--PhraseCount] [--GoodTuring counts-of-counts-file] [--KneserNey counts-of-counts-file] [--LowCountFeature] [--SourceLabels source-labels-file] \n";
exit(1);
}
char* &fileNameDirect = argv[1];
char* &fileNameIndirect = argv[2];
char* &fileNameConsolidated = argv[3];
char* fileNameCountOfCounts;
char* fileNameSourceLabelSet;
for(int i=4; i<argc; i++) {
if (strcmp(argv[i],"--Hierarchical") == 0) {
@ -114,13 +117,21 @@ int main(int argc, char* argv[])
} else if (strcmp(argv[i],"--LogProb") == 0) {
logProbFlag = true;
cerr << "using log-probabilities\n";
} else if (strcmp(argv[i],"--SourceLabels") == 0) {
sourceLabelsFlag = true;
if (i+1==argc) {
cerr << "ERROR: specify source label set file!\n";
exit(1);
}
fileNameSourceLabelSet = argv[++i];
cerr << "processing source labels property\n";
} else {
cerr << "ERROR: unknown option " << argv[i] << endl;
exit(1);
}
}
processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated, fileNameCountOfCounts );
processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated, fileNameCountOfCounts, fileNameSourceLabelSet );
}
vector< float > countOfCounts;
@ -169,7 +180,7 @@ void loadCountOfCounts( char* fileNameCountOfCounts )
if (kneserNey_D3 > 2.9) kneserNey_D3 = 2.9;
}
void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated, char* fileNameCountOfCounts )
void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated, char* fileNameCountOfCounts, char* fileNameSourceLabelSet )
{
if (goodTuringFlag || kneserNeyFlag)
loadCountOfCounts( fileNameCountOfCounts );
@ -198,6 +209,13 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
exit(1);
}
// create properties consolidator
// (in case any additional phrase property requires further processing)
MosesTraining::PropertiesConsolidator propertiesConsolidator = MosesTraining::PropertiesConsolidator();
if (sourceLabelsFlag) {
propertiesConsolidator.ActivateSourceLabelsProcessing(fileNameSourceLabelSet);
}
// loop through all extracted phrase translations
int i=0;
while(true) {
@ -307,12 +325,13 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
// counts, for debugging
fileConsolidated << "||| " << countE << " " << countF << " " << countEF;
// count bin feature (as a sparse feature)
// sparse features
fileConsolidated << " |||";
if (directSparseScores.compare("") != 0)
fileConsolidated << " " << directSparseScores;
if (indirectSparseScores.compare("") != 0)
fileConsolidated << " " << indirectSparseScores;
// count bin feature (as a sparse feature)
if (sparseCountBinFeatureFlag) {
bool foundBin = false;
for(size_t i=0; i < countBin.size(); i++) {
@ -332,9 +351,13 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
}
// arbitrary key-value pairs
fileConsolidated << " ||| ";
fileConsolidated << " |||";
if (itemDirect.size() >= 6) {
fileConsolidated << itemDirect[5];
//if (sourceLabelsFlag) {
fileConsolidated << propertiesConsolidator.ProcessPropertiesString(itemDirect[5]);
//} else {
// fileConsolidated << itemDirect[5];
//}
}
fileConsolidated << endl;

View File

@ -248,7 +248,7 @@ int ExtractGHKM::Main(int argc, char *argv[])
const std::vector<const Subgraph *> &rules = (*p)->GetRules();
REO_POS l2rOrientation, r2lOrientation;
REO_POS l2rOrientation=UNKNOWN, r2lOrientation=UNKNOWN;
if (options.phraseOrientation && !rules.empty()) {
int sourceSpanBegin = *((*p)->GetSpan().begin());
int sourceSpanEnd = *((*p)->GetSpan().rbegin());
@ -617,9 +617,8 @@ void ExtractGHKM::WriteGlueGrammar(
}
}
std::string sourceTopLabel = "TOPLABEL";
std::string sourceSLabel = "S";
std::string sourceSomeLabel = "SOMELABEL";
size_t sourceLabelGlueTop = 0;
size_t sourceLabelGlueX = 1;
// basic rules
out << "<s> [X] ||| <s> [" << topLabel << "] ||| 1 ||| ||| ||| |||";
@ -627,7 +626,7 @@ void ExtractGHKM::WriteGlueGrammar(
out << " {{Tree [" << topLabel << " <s>]}}";
}
if (options.sourceLabels) {
out << " {{SourceLabels 1 1 " << sourceTopLabel << " 1}}";
out << " {{SourceLabels 1 1 " << sourceLabelGlueTop << " 1}}";
}
out << std::endl;
@ -636,7 +635,7 @@ void ExtractGHKM::WriteGlueGrammar(
out << " {{Tree [" << topLabel << " [" << topLabel << "] </s>]}}";
}
if (options.sourceLabels) {
out << " {{SourceLabels 2 1 " << sourceTopLabel << " 1 1 " << sourceTopLabel << " 1}}";
out << " {{SourceLabels 2 1 " << sourceLabelGlueTop << " 1 1 " << sourceLabelGlueTop << " 1}}";
}
out << std::endl;
@ -648,7 +647,7 @@ void ExtractGHKM::WriteGlueGrammar(
out << " {{Tree [" << topLabel << " <s> [" << i->first << "] </s>]}}";
}
if (options.sourceLabels) {
out << " {{SourceLabels 2 1 " << sourceSLabel << " 1 1 " << sourceTopLabel << " 1}}";
out << " {{SourceLabels 2 1 " << sourceLabelGlueX << " 1 1 " << sourceLabelGlueTop << " 1}}";
}
out << std::endl;
}
@ -661,7 +660,7 @@ void ExtractGHKM::WriteGlueGrammar(
out << " {{Tree [" << topLabel << " ["<< topLabel << "] [" << *i << "]]}}";
}
if (options.sourceLabels) {
out << " {{SourceLabels 3 2.718 " << sourceTopLabel << " " << sourceSomeLabel << " 2.718 1 " << sourceTopLabel << " 2.718}}"; // TODO: there should be better options than using "SOMELABEL"
out << " {{SourceLabels 3 2.718 " << sourceLabelGlueTop << " " << sourceLabelGlueX << " 2.718 1 " << sourceLabelGlueTop << " 2.718}}"; // TODO: there should be better options than using "SOMELABEL"
}
out << std::endl;
}
@ -672,7 +671,7 @@ void ExtractGHKM::WriteGlueGrammar(
out << " {{Tree [" << topLabel << " [" << topLabel << "] [X]]}}";
}
if (options.sourceLabels) {
out << " {{SourceLabels 3 1 " << sourceTopLabel << " " << sourceSomeLabel << " 1 1 " << sourceTopLabel << " 1}}"; // TODO: there should be better options than using "SOMELABEL"
out << " {{SourceLabels 3 1 " << sourceLabelGlueTop << " " << sourceLabelGlueX << " 1 1 " << sourceLabelGlueTop << " 1}}"; // TODO: there should be better options than using "SOMELABEL"
}
out << std::endl;
}

View File

@ -187,7 +187,7 @@ const std::string PhraseOrientation::GetOrientationInfoString(int startF, int en
const std::string PhraseOrientation::GetOrientationInfoString(int startF, int startE, int endF, int endE, REO_DIR direction) const
{
REO_POS hierPrevOrient, hierNextOrient;
REO_POS hierPrevOrient=UNKNOWN, hierNextOrient=UNKNOWN;
bool connectedLeftTopP = IsAligned( startF-1, startE-1 );
bool connectedRightTopP = IsAligned( endF+1, startE-1 );

View File

@ -1860,7 +1860,7 @@ sub define_tuning_tune {
$cmd .= " --lambdas \"$lambda\"" if $lambda;
$cmd .= " --continue" if $tune_continue;
$cmd .= " --skip-decoder" if $skip_decoder;
$cmd .= " --inputtype $tune_inputtype" if $tune_inputtype;
$cmd .= " --inputtype $tune_inputtype" if defined($tune_inputtype);
my $qsub_args = &get_qsub_args("TUNING");
$cmd .= " --queue-flags=\"$qsub_args\"" if ($CLUSTER && $qsub_args);
@ -2217,6 +2217,10 @@ sub define_training_extract_phrases {
my $phrase_orientation_priors_file = &versionize(&long_file_name("phrase-orientation-priors","model",""));
$cmd .= "-phrase-orientation-priors-file $phrase_orientation_priors_file ";
}
if (&get("TRAINING:ghkm-source-labels")) {
$cmd .= "-ghkm-source-labels ";
}
}
my $extract_settings = &get("TRAINING:extract-settings");
@ -2254,6 +2258,11 @@ sub define_training_build_ttable {
my $phrase_orientation_priors_file = &versionize(&long_file_name("phrase-orientation-priors","model",""));
$cmd .= "-phrase-orientation-priors-file $phrase_orientation_priors_file ";
}
if (&get("TRAINING:ghkm-source-labels")) {
$cmd .= "-ghkm-source-labels ";
my $source_labels_file = &versionize(&long_file_name("source-labels","model",""));
$cmd .= "-ghkm-source-labels-file $source_labels_file ";
}
}
&create_step($step_id,$cmd);
@ -2438,6 +2447,12 @@ sub define_training_create_config {
}
}
if (&get("TRAINING:ghkm-source-labels")) {
$cmd .= "-ghkm-source-labels ";
my $source_labels_file = &versionize(&long_file_name("source-labels","model",""));
$cmd .= "-ghkm-source-labels-file $source_labels_file ";
}
# sparse lexical features provide additional content for config file
$cmd .= "-additional-ini-file $sparse_lexical_features.ini " if $sparse_lexical_features;
@ -3412,7 +3427,7 @@ sub check_backoff_and_get_array {
# the following two functions deal with getting information about
# files that are passed between steps. this are either specified
# in the meta file (default) or in the configuration file (here called
# 'specified', in the step management refered to as 'given').
# 'specified', in the step management referred to as 'given').
sub get_specified_or_default_file {
my ($specified_module,$specified_set,$specified_parameter,

View File

@ -219,14 +219,14 @@ foreach (@children) {
waitpid($_, 0);
}
# glue rules
# merge glue rules
if (defined($glueFile)) {
my $cmd = "cat $TMPDIR/glue.* | LC_ALL=C sort | uniq > $glueFile";
print STDERR "Merging glue rules: $cmd \n";
print STDERR `$cmd`;
}
# phrase orientation priors (GHKM extraction)
# merge phrase orientation priors (GHKM extraction)
if ($phraseOrientation && defined($phraseOrientationPriorsFile)) {
print STDERR "Merging phrase orientation priors\n";

View File

@ -27,10 +27,22 @@ my $scoreCmd = $ARGV[2];
my $extractFile = $ARGV[3]; # 1st arg of extract argument
my $lexFile = $ARGV[4];
my $ptHalf = $ARGV[5]; # output
my $inverse = 0;
my $sourceLabelsFile;
my $otherExtractArgs= "";
for (my $i = 6; $i < $#ARGV; ++$i)
{
if ($ARGV[$i] eq '--SourceLabels') {
$sourceLabelsFile = $ARGV[++$i];
$otherExtractArgs .= "--SourceLabels --SourceLabelCountsLHS --SourceLabelSet ";
next;
}
if ($ARGV[$i] eq '--Inverse') {
$inverse = 1;
$otherExtractArgs .= $ARGV[$i] ." ";
next;
}
$otherExtractArgs .= $ARGV[$i] ." ";
}
#$scoreCmd $extractFile $lexFile $ptHalf $otherExtractArgs
@ -258,6 +270,14 @@ if (-e $cocPath)
close(FHCOC);
}
# merge source label files
if (!$inverse && defined($sourceLabelsFile))
{
my $cmd = "(echo \"GlueTop 0\"; echo \"GlueX 1\"; cat $TMPDIR/phrase-table.half.*.gz.syntaxLabels.src | LC_ALL=C sort | uniq | perl -pe \"s/\$/ \@{[\$.+1]}/\") > $sourceLabelsFile";
print STDERR "Merging source label files: $cmd \n";
`$cmd`;
}
$cmd = "rm -rf $TMPDIR \n";
print STDERR $cmd;
systemCheck($cmd);

View File

@ -2,4 +2,7 @@ The language suffix can be found here:
http://www.loc.gov/standards/iso639-2/php/code_list.php
This code includes data from Daniel Naber's Language Tools (czech abbreviations).
This code includes data from czech wiktionary (also czech abbreviations).

View File

@ -0,0 +1,5 @@
<\/?\S+\/?>
<\S+( [a-zA-Z0-9]+\=\"?[^\"]\")+ ?\/?>
<\S+( [a-zA-Z0-9]+\=\'?[^\']\')+ ?\/?>
(\w\-\_\.)+\@((\w\-\_)+\.)+[a-zA-Z]{2,}
(http[s]?|ftp):\/\/[^:\/\s]+(\/\w+)*\/[\w\-\.]+

View File

@ -232,15 +232,20 @@ sub tokenize
# Find protected patterns
my @protected = ();
foreach my $protected_pattern (@protected_patterns) {
foreach ($text =~ /($protected_pattern)/) {
push @protected, $_;
my $t = $text;
while ($t =~ /($protected_pattern)(.*)$/) {
push @protected, $1;
$t = $2;
}
}
for (my $i = 0; $i < scalar(@protected); ++$i) {
my $subst = sprintf("THISISPROTECTED%.3d", $i);
$text =~ s,\Q$protected[$i],$subst,g;
$text =~ s,\Q$protected[$i], $subst ,g;
}
$text =~ s/ +/ /g;
$text =~ s/^ //g;
$text =~ s/ $//g;
# seperate out all "other" special characters
$text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;

View File

@ -127,8 +127,8 @@ my $___NOCASE = 0;
# Use "--nonorm" to non normalize translation before computing scores
my $___NONORM = 0;
# set 0 if input type is text, set 1 if input type is confusion network
my $___INPUTTYPE = 0;
# set 0 if input type is text, set 1 if input type is confusion network, set 3 if input type is parse tree
my $___INPUTTYPE;
my $mertdir = undef; # path to new mert directory
@ -1228,14 +1228,18 @@ sub run_decoder {
if (defined $___JOBS && $___JOBS > 0) {
die "Hypergraph mira not supported by moses-parallel" if $___HG_MIRA;
$decoder_cmd = "$moses_parallel_cmd $pass_old_sge -config $___CONFIG -inputtype $___INPUTTYPE -qsub-prefix mert$run -queue-parameters \"$queue_flags\" -decoder-parameters \"$___DECODER_FLAGS $decoder_config\" $lsamp_cmd -n-best-list \"$filename $___N_BEST_LIST_SIZE\" -input-file $___DEV_F -jobs $___JOBS -decoder $___DECODER > run$run.out";
$decoder_cmd = "$moses_parallel_cmd $pass_old_sge -config $___CONFIG";
$decoder_cmd .= " -inputtype $___INPUTTYPE" if defined($___INPUTTYPE);
$decoder_cmd .= " -qsub-prefix mert$run -queue-parameters \"$queue_flags\" -decoder-parameters \"$___DECODER_FLAGS $decoder_config\" $lsamp_cmd -n-best-list \"$filename $___N_BEST_LIST_SIZE distinct\" -input-file $___DEV_F -jobs $___JOBS -decoder $___DECODER > run$run.out";
} else {
my $nbest_list_cmd = "-n-best-list $filename $___N_BEST_LIST_SIZE";
my $nbest_list_cmd = "-n-best-list $filename $___N_BEST_LIST_SIZE distinct";
if ($___HG_MIRA) {
safesystem("rm -rf $hypergraph_dir");
$nbest_list_cmd = "-output-search-graph-hypergraph true gz";
}
$decoder_cmd = "$___DECODER $___DECODER_FLAGS -config $___CONFIG -inputtype $___INPUTTYPE $decoder_config $lsamp_cmd $nbest_list_cmd -input-file $___DEV_F > run$run.out";
$decoder_cmd = "$___DECODER $___DECODER_FLAGS -config $___CONFIG";
$decoder_cmd .= " -inputtype $___INPUTTYPE" if defined($___INPUTTYPE);
$decoder_cmd .= " $decoder_config $lsamp_cmd $nbest_list_cmd -input-file $___DEV_F > run$run.out";
}
print STDERR "Executing: $decoder_cmd \n";
@@ -1309,7 +1313,9 @@ sub get_featlist_from_moses {
print STDERR "Using cached features list: $featlistfn\n";
} else {
print STDERR "Asking moses for feature names and values from $___CONFIG\n";
my $cmd = "$___DECODER $___DECODER_FLAGS -config $configfn -inputtype $___INPUTTYPE -show-weights > $featlistfn";
my $cmd = "$___DECODER $___DECODER_FLAGS -config $configfn";
$cmd .= " -inputtype $___INPUTTYPE" if defined($___INPUTTYPE);
$cmd .= " -show-weights > $featlistfn";
print STDERR "Executing: $cmd\n";
safesystem($cmd) or die "Failed to run moses with the config $configfn";
}

View File

@@ -32,7 +32,7 @@ my($_EXTERNAL_BINDIR, $_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_
$_DECODING_STEPS, $_PARALLEL, $_FACTOR_DELIMITER, @_PHRASE_TABLE,
@_REORDERING_TABLE, @_GENERATION_TABLE, @_GENERATION_TYPE, $_GENERATION_CORPUS,
$_DONT_ZIP, $_MGIZA, $_MGIZA_CPUS, $_SNT2COOC, $_HMM_ALIGN, $_CONFIG, $_OSM, $_OSM_FACTORS, $_POST_DECODING_TRANSLIT, $_TRANSLITERATION_PHRASE_TABLE,
$_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_GHKM_TREE_FRAGMENTS,$_GHKM_PHRASE_ORIENTATION,$_PHRASE_ORIENTATION_PRIORS_FILE,$_PCFG,@_EXTRACT_OPTIONS,@_SCORE_OPTIONS,
$_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_GHKM_TREE_FRAGMENTS,$_GHKM_PHRASE_ORIENTATION,$_PHRASE_ORIENTATION_PRIORS_FILE,$_GHKM_SOURCE_LABELS,$_GHKM_SOURCE_LABELS_FILE,$_PCFG,@_EXTRACT_OPTIONS,@_SCORE_OPTIONS,
$_ALT_DIRECT_RULE_SCORE_1, $_ALT_DIRECT_RULE_SCORE_2, $_UNKNOWN_WORD_SOFT_MATCHES_FILE,
$_OMIT_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES,
$_MEMSCORE, $_FINAL_ALIGNMENT_MODEL,
@@ -112,6 +112,8 @@ $_HELP = 1
'ghkm-tree-fragments' => \$_GHKM_TREE_FRAGMENTS,
'ghkm-phrase-orientation' => \$_GHKM_PHRASE_ORIENTATION,
'phrase-orientation-priors-file=s' => \$_PHRASE_ORIENTATION_PRIORS_FILE, # currently relevant for GHKM extraction only; phrase orientation for PBT has different implementation
'ghkm-source-labels' => \$_GHKM_SOURCE_LABELS,
'ghkm-source-labels-file=s' => \$_GHKM_SOURCE_LABELS_FILE,
'pcfg' => \$_PCFG,
'alt-direct-rule-score-1' => \$_ALT_DIRECT_RULE_SCORE_1,
'alt-direct-rule-score-2' => \$_ALT_DIRECT_RULE_SCORE_2,
@@ -1427,10 +1429,15 @@ sub extract_phrase {
$cmd .= " --PCFG" if $_PCFG;
$cmd .= " --UnpairedExtractFormat" if $_ALT_DIRECT_RULE_SCORE_1 || $_ALT_DIRECT_RULE_SCORE_2;
$cmd .= " --ConditionOnTargetLHS" if $_ALT_DIRECT_RULE_SCORE_1;
$cmd .= " --TreeFragments" if $_GHKM_TREE_FRAGMENTS;
$cmd .= " --PhraseOrientation" if $_GHKM_PHRASE_ORIENTATION;
$cmd .= " --PhraseOrientationPriors $_PHRASE_ORIENTATION_PRIORS_FILE" if defined($_PHRASE_ORIENTATION_PRIORS_FILE);
if (!defined($_GHKM)) {
if (defined($_GHKM))
{
$cmd .= " --TreeFragments" if $_GHKM_TREE_FRAGMENTS;
$cmd .= " --PhraseOrientation" if $_GHKM_PHRASE_ORIENTATION;
$cmd .= " --PhraseOrientationPriors $_PHRASE_ORIENTATION_PRIORS_FILE" if defined($_PHRASE_ORIENTATION_PRIORS_FILE);
$cmd .= " --SourceLabels" if $_GHKM_SOURCE_LABELS;
}
else
{
$cmd .= " --SourceSyntax" if $_SOURCE_SYNTAX;
$cmd .= " --TargetSyntax" if $_TARGET_SYNTAX;
$cmd .= " --MaxSpan $max_length";
@@ -1609,6 +1616,7 @@ sub score_phrase_phrase_extract {
$cmd .= " --TreeFragments" if $_GHKM_TREE_FRAGMENTS;
$cmd .= " --PhraseOrientation" if $_GHKM_PHRASE_ORIENTATION;
$cmd .= " --PhraseOrientationPriors $_PHRASE_ORIENTATION_PRIORS_FILE" if $_GHKM_PHRASE_ORIENTATION && defined($_PHRASE_ORIENTATION_PRIORS_FILE);
$cmd .= " --SourceLabels $_GHKM_SOURCE_LABELS_FILE" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
$cmd .= " $DOMAIN" if $DOMAIN;
$cmd .= " $CORE_SCORE_OPTIONS" if defined($_SCORE_OPTIONS);
$cmd .= " --FlexibilityScore=$FLEX_SCORER" if $_FLEXIBILITY_SCORE;
@@ -1659,6 +1667,7 @@ sub score_phrase_phrase_extract {
$cmd .= " --SparseCountBinFeature $SPARSE_COUNT_BIN" if $SPARSE_COUNT_BIN;
$cmd .= " --GoodTuring $ttable_file.half.f2e.gz.coc" if $GOOD_TURING;
$cmd .= " --KneserNey $ttable_file.half.f2e.gz.coc" if $KNESER_NEY;
$cmd .= " --SourceLabels $_GHKM_SOURCE_LABELS_FILE" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
$cmd .= " | gzip -c > $ttable_file.gz";
@@ -2164,6 +2173,7 @@ sub create_ini {
print INI "WordPenalty\n";
print INI "PhrasePenalty\n";
print INI "SoftMatchingFeature name=SM0 path=$_UNKNOWN_WORD_SOFT_MATCHES_FILE\n" if $_TARGET_SYNTAX && defined($_UNKNOWN_WORD_SOFT_MATCHES_FILE);
print INI "SoftSourceSyntacticConstraintsFeature sourceLabelSetFile=$_GHKM_SOURCE_LABELS_FILE\n" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
print INI $feature_spec;
print INI "\n# dense weights for feature functions\n";
@@ -2171,6 +2181,7 @@ sub create_ini {
print INI "UnknownWordPenalty0= 1\n";
print INI "WordPenalty0= -1\n";
print INI "PhrasePenalty0= 0.2\n";
print INI "SoftSourceSyntacticConstraintsFeature0= 0.3 -0.3 -0.3\n" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
print INI $weight_spec;
close(INI);
}