Merge branch 'master' of github.com:moses-smt/mosesdecoder

This commit is contained in:
Lane Schwartz 2016-03-23 10:26:07 -05:00
commit 167def1d52
78 changed files with 3386 additions and 1223 deletions

View File

@ -208,7 +208,7 @@ if [ option.get "with-icu" : : "yes" ]
# for probing pt
external-lib boost_serialization ;
requirements += <library>boost_serialization ;
requirements += <library>boost_serialization/<runtime-link>static ;
if [ option.get "with-vw" ] {
requirements += <define>HAVE_VW ;
@ -247,6 +247,7 @@ if [ option.get "with-mm-extras" : : "yes" ]
moses/TranslationModel/UG//bitext-find
moses/TranslationModel/UG//ptable-describe-features
moses/TranslationModel/UG//count-ptable-features
moses/TranslationModel/UG//ptable-sigtest-filter
moses/TranslationModel/UG//ptable-lookup
moses/TranslationModel/UG//ptable-lookup-corpus
moses/TranslationModel/UG//check-coverage

View File

@ -93,7 +93,7 @@ void SuffixArray::Create(const string& fileName )
CheckAllocation(m_sentenceLength != NULL, "m_sentenceLength");
if (m_useDocument) {
m_document = (INDEX*) calloc( sizeof( INDEX ), m_documentCount );
m_documentName = (INDEX*) calloc( sizeof( char ), m_documentCount );
m_documentName = (INDEX*) calloc( sizeof( INDEX ), m_documentCount );
m_documentNameBuffer = (char*) calloc( sizeof( char ), m_documentNameLength );
CheckAllocation(m_document != NULL, "m_document");
CheckAllocation(m_documentName != NULL, "m_documentName");

View File

@ -11,12 +11,12 @@
</externalSetting>
</externalSettings>
<extensions>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
@ -72,13 +72,13 @@
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.macosx.exe.release.701931933" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">

View File

@ -55,6 +55,41 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/SentenceAlignment.h</locationURI>
</link>
<link>
<name>SentenceAlignmentWithSyntax.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/SentenceAlignmentWithSyntax.cpp</locationURI>
</link>
<link>
<name>SentenceAlignmentWithSyntax.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/SentenceAlignmentWithSyntax.h</locationURI>
</link>
<link>
<name>SyntaxNodeCollection.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/SyntaxNodeCollection.cpp</locationURI>
</link>
<link>
<name>SyntaxNodeCollection.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/SyntaxNodeCollection.h</locationURI>
</link>
<link>
<name>XmlException.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/XmlException.h</locationURI>
</link>
<link>
<name>XmlTree.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/XmlTree.cpp</locationURI>
</link>
<link>
<name>XmlTree.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/XmlTree.h</locationURI>
</link>
<link>
<name>extract-main.cpp</name>
<type>1</type>

View File

@ -11,11 +11,11 @@
</externalSetting>
</externalSettings>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
@ -74,7 +74,7 @@
<fileInfo id="cdt.managedbuild.config.gnu.exe.debug.1846963597.871386239" name="LDHT.h" rcbsApplicability="disable" resourcePath="LM/LDHT.h" toolsToInvoke=""/>
<fileInfo id="cdt.managedbuild.config.gnu.exe.debug.1846963597.1761300858" name="ParallelBackoff.h" rcbsApplicability="disable" resourcePath="LM/ParallelBackoff.h" toolsToInvoke=""/>
<sourceEntries>
<entry excluding="TranslationModel/UG/ptable-lookup.cc|TranslationModel/UG/ptable-lookup-corpus.cc|TranslationModel/UG/mm/test-http-client.cc|TranslationModel/UG/ptable-describe-features.cc|TranslationModel/UG/count-ptable-features.cc|TranslationModel/UG/try-align2.cc|TranslationModel/UG/try-align.cc|TranslationModel/UG/spe-check-coverage3.cc|TranslationModel/UG/spe-check-coverage2.cc|TranslationModel/UG/spe-check-coverage.cc|TranslationModel/UG/sim-pe.cc|TranslationModel/UG/generic/stringdist|TranslationModel/UG/mm/test-dynamic-im-tsa.cc|TranslationModel/UG/mm/mtt.count.cc|LM/ParallelBackoff.h|LM/ParallelBackoff.cpp|LM/bilingual-lm|LM/MaxEntSRI.h|LM/MaxEntSRI.cpp|LM/BilingualLM.h|LM/BilingualLM.cpp|LM/Rand.h|LM/Rand.cpp|LM/LDHT.h|LM/LDHT.cpp|LM/ORLM.h|LM/ORLM.cpp|LM/NeuralLMWrapper.h|LM/NeuralLMWrapper.cpp|LM/SRI.h|LM/SRI.cpp|LM/IRST.h|LM/IRST.cpp|LM/DALMWrapper.h|LM/DALMWrapper.cpp|LM/oxlm|TranslationModel/UG/util" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/>
<entry excluding="TranslationModel/UG/ptable-lookup.cc|TranslationModel/UG/ptable-lookup-corpus.cc|TranslationModel/UG/mm/test-http-client.cc|TranslationModel/UG/ptable-describe-features.cc|TranslationModel/UG/count-ptable-features.cc|TranslationModel/UG/try-align2.cc|TranslationModel/UG/try-align.cc|TranslationModel/UG/spe-check-coverage3.cc|TranslationModel/UG/spe-check-coverage2.cc|TranslationModel/UG/spe-check-coverage.cc|TranslationModel/UG/sim-pe.cc|TranslationModel/UG/generic/stringdist|TranslationModel/UG/mm/test-dynamic-im-tsa.cc|TranslationModel/UG/mm/mtt.count.cc|LM/ParallelBackoff.h|LM/ParallelBackoff.cpp|LM/bilingual-lm|LM/MaxEntSRI.h|LM/MaxEntSRI.cpp|LM/BilingualLM.h|LM/BilingualLM.cpp|LM/Rand.h|LM/Rand.cpp|LM/LDHT.h|LM/LDHT.cpp|LM/ORLM.h|LM/ORLM.cpp|LM/NeuralLMWrapper.h|LM/NeuralLMWrapper.cpp|LM/SRI.h|LM/SRI.cpp|LM/IRST.h|LM/IRST.cpp|LM/DALMWrapper.h|LM/DALMWrapper.cpp|LM/oxlm|TranslationModel/UG/util" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/>
</sourceEntries>
</configuration>
</storageModule>
@ -84,12 +84,12 @@
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.release.1911984684" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">

View File

@ -1625,6 +1625,16 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/TargetBigramFeature.h</locationURI>
</link>
<link>
<name>FF/TargetConstituentAdjacencyFeature.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/TargetConstituentAdjacencyFeature.cpp</locationURI>
</link>
<link>
<name>FF/TargetConstituentAdjacencyFeature.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/TargetConstituentAdjacencyFeature.h</locationURI>
</link>
<link>
<name>FF/TargetNgramFeature.cpp</name>
<type>1</type>
@ -1635,6 +1645,16 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/TargetNgramFeature.h</locationURI>
</link>
<link>
<name>FF/TargetPreferencesFeature.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/TargetPreferencesFeature.cpp</locationURI>
</link>
<link>
<name>FF/TargetPreferencesFeature.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/TargetPreferencesFeature.h</locationURI>
</link>
<link>
<name>FF/TargetWordInsertionFeature.cpp</name>
<type>1</type>
@ -1995,6 +2015,36 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/PP/SpanLengthPhraseProperty.h</locationURI>
</link>
<link>
<name>PP/TargetConstituentBoundariesLeftPhraseProperty.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/PP/TargetConstituentBoundariesLeftPhraseProperty.cpp</locationURI>
</link>
<link>
<name>PP/TargetConstituentBoundariesLeftPhraseProperty.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/PP/TargetConstituentBoundariesLeftPhraseProperty.h</locationURI>
</link>
<link>
<name>PP/TargetConstituentBoundariesRightAdjacentPhraseProperty.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/PP/TargetConstituentBoundariesRightAdjacentPhraseProperty.cpp</locationURI>
</link>
<link>
<name>PP/TargetConstituentBoundariesRightAdjacentPhraseProperty.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/PP/TargetConstituentBoundariesRightAdjacentPhraseProperty.h</locationURI>
</link>
<link>
<name>PP/TargetPreferencesPhraseProperty.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/PP/TargetPreferencesPhraseProperty.cpp</locationURI>
</link>
<link>
<name>PP/TargetPreferencesPhraseProperty.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/PP/TargetPreferencesPhraseProperty.h</locationURI>
</link>
<link>
<name>PP/TreeStructurePhraseProperty.h</name>
<type>1</type>
@ -2495,6 +2545,56 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/SyntaxOptions.h</locationURI>
</link>
<link>
<name>FF/LexicalReordering/BidirectionalReorderingState.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/LexicalReordering/BidirectionalReorderingState.cpp</locationURI>
</link>
<link>
<name>FF/LexicalReordering/BidirectionalReorderingState.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/LexicalReordering/BidirectionalReorderingState.h</locationURI>
</link>
<link>
<name>FF/LexicalReordering/HReorderingBackwardState.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/LexicalReordering/HReorderingBackwardState.cpp</locationURI>
</link>
<link>
<name>FF/LexicalReordering/HReorderingBackwardState.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/LexicalReordering/HReorderingBackwardState.h</locationURI>
</link>
<link>
<name>FF/LexicalReordering/HReorderingForwardState.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/LexicalReordering/HReorderingForwardState.cpp</locationURI>
</link>
<link>
<name>FF/LexicalReordering/HReorderingForwardState.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/LexicalReordering/HReorderingForwardState.h</locationURI>
</link>
<link>
<name>FF/LexicalReordering/LRModel.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/LexicalReordering/LRModel.cpp</locationURI>
</link>
<link>
<name>FF/LexicalReordering/LRModel.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/LexicalReordering/LRModel.h</locationURI>
</link>
<link>
<name>FF/LexicalReordering/LRState.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/LexicalReordering/LRState.cpp</locationURI>
</link>
<link>
<name>FF/LexicalReordering/LRState.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/LexicalReordering/LRState.h</locationURI>
</link>
<link>
<name>FF/LexicalReordering/LexicalReordering.cpp</name>
<type>1</type>
@ -2505,16 +2605,6 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/LexicalReordering/LexicalReordering.h</locationURI>
</link>
<link>
<name>FF/LexicalReordering/LexicalReorderingState.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/LexicalReordering/LexicalReorderingState.cpp</locationURI>
</link>
<link>
<name>FF/LexicalReordering/LexicalReorderingState.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/LexicalReordering/LexicalReorderingState.h</locationURI>
</link>
<link>
<name>FF/LexicalReordering/LexicalReorderingTable.cpp</name>
<type>1</type>
@ -2525,6 +2615,16 @@
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/LexicalReordering/LexicalReorderingTable.h</locationURI>
</link>
<link>
<name>FF/LexicalReordering/PhraseBasedReorderingState.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/LexicalReordering/PhraseBasedReorderingState.cpp</locationURI>
</link>
<link>
<name>FF/LexicalReordering/PhraseBasedReorderingState.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/LexicalReordering/PhraseBasedReorderingState.h</locationURI>
</link>
<link>
<name>FF/LexicalReordering/ReorderingStack.cpp</name>
<type>1</type>

View File

@ -37,4 +37,4 @@ for local p in [ glob *_main.cc ] {
exes += $(name) ;
}
alias programs : $(exes) filter//filter builder//dump_counts : <threading>multi:<source>builder//lmplz ;
alias programs : $(exes) filter//filter filter//phrase_table_vocab builder//dump_counts : <threading>multi:<source>builder//lmplz ;

View File

@ -1,26 +1,31 @@
#include "DistortionScoreProducer.h"
#include "FFState.h"
#include "moses/InputPath.h"
#include "moses/Range.h"
#include "moses/StaticData.h"
#include "moses/Hypothesis.h"
#include "moses/Manager.h"
#include "moses/FactorCollection.h"
#include <cmath>
using namespace std;
namespace Moses
{
struct DistortionState_traditional : public FFState {
struct DistortionState : public FFState {
Range range;
int first_gap;
DistortionState_traditional(const Range& wr, int fg) : range(wr), first_gap(fg) {}
bool inSubordinateConjunction;
DistortionState(const Range& wr, int fg, bool subord=false) : range(wr), first_gap(fg), inSubordinateConjunction(subord) {}
size_t hash() const {
return range.GetEndPos();
}
virtual bool operator==(const FFState& other) const {
const DistortionState_traditional& o =
static_cast<const DistortionState_traditional&>(other);
return range.GetEndPos() == o.range.GetEndPos();
const DistortionState& o =
static_cast<const DistortionState&>(other);
return ( (range.GetEndPos() == o.range.GetEndPos()) && (inSubordinateConjunction == o.inSubordinateConjunction) );
}
};
@ -29,11 +34,36 @@ std::vector<const DistortionScoreProducer*> DistortionScoreProducer::s_staticCol
DistortionScoreProducer::DistortionScoreProducer(const std::string &line)
: StatefulFeatureFunction(1, line)
, m_useSparse(false)
, m_sparseDistance(false)
, m_sparseSubordinate(false)
{
s_staticColl.push_back(this);
ReadParameters();
}
void DistortionScoreProducer::SetParameter(const std::string& key, const std::string& value)
{
if (key == "sparse") {
m_useSparse = Scan<bool>(value);
} else if (key == "sparse-distance") {
m_sparseDistance = Scan<bool>(value);
} else if (key == "sparse-input-factor") {
m_sparseFactorTypeSource = Scan<FactorType>(value);
} else if (key == "sparse-output-factor") {
m_sparseFactorTypeTarget = Scan<FactorType>(value);
} else if (key == "sparse-subordinate") {
std::string subordinateConjunctionTag = Scan<std::string>(value);
FactorCollection &factorCollection = FactorCollection::Instance();
m_subordinateConjunctionTagFactor = factorCollection.AddFactor(subordinateConjunctionTag,false);
m_sparseSubordinate = true;
} else if (key == "sparse-subordinate-output-factor") {
m_sparseFactorTypeTargetSubordinate = Scan<FactorType>(value);
} else {
StatefulFeatureFunction::SetParameter(key, value);
}
}
const FFState* DistortionScoreProducer::EmptyHypothesisState(const InputType &input) const
{
// fake previous translated phrase start and end
@ -44,7 +74,7 @@ const FFState* DistortionScoreProducer::EmptyHypothesisState(const InputType &in
start = 0;
end = input.m_frontSpanCoveredLength -1;
}
return new DistortionState_traditional(
return new DistortionState(
Range(start, end),
NOT_FOUND);
}
@ -101,17 +131,184 @@ FFState* DistortionScoreProducer::EvaluateWhenApplied(
const FFState* prev_state,
ScoreComponentCollection* out) const
{
const DistortionState_traditional* prev = static_cast<const DistortionState_traditional*>(prev_state);
const DistortionState* prev = static_cast<const DistortionState*>(prev_state);
bool subordinateConjunction = prev->inSubordinateConjunction;
if (m_useSparse) {
int jumpFromPos = prev->range.GetEndPos()+1;
int jumpToPos = hypo.GetCurrSourceWordsRange().GetStartPos();
size_t distance = std::abs( jumpFromPos - jumpToPos );
const Sentence& sentence = static_cast<const Sentence&>(hypo.GetInput());
StringPiece jumpFromSourceFactorPrev;
StringPiece jumpFromSourceFactor;
StringPiece jumpToSourceFactor;
if (jumpFromPos < (int)sentence.GetSize()) {
jumpFromSourceFactor = sentence.GetWord(jumpFromPos).GetFactor(m_sparseFactorTypeSource)->GetString();
} else {
jumpFromSourceFactor = "</s>";
}
if (jumpFromPos > 0) {
jumpFromSourceFactorPrev = sentence.GetWord(jumpFromPos-1).GetFactor(m_sparseFactorTypeSource)->GetString();
} else {
jumpFromSourceFactorPrev = "<s>";
}
jumpToSourceFactor = sentence.GetWord(jumpToPos).GetFactor(m_sparseFactorTypeSource)->GetString();
const TargetPhrase& currTargetPhrase = hypo.GetCurrTargetPhrase();
StringPiece jumpToTargetFactor = currTargetPhrase.GetWord(0).GetFactor(m_sparseFactorTypeTarget)->GetString();
util::StringStream featureName;
// source factor (start position)
featureName = util::StringStream();
featureName << m_description << "_";
if ( jumpToPos > jumpFromPos ) {
featureName << "R";
} else if ( jumpToPos < jumpFromPos ) {
featureName << "L";
} else {
featureName << "M";
}
if (m_sparseDistance) {
featureName << distance;
}
featureName << "_SFS_" << jumpFromSourceFactor;
if (m_sparseSubordinate && subordinateConjunction) {
featureName << "_SUBORD";
}
out->SparsePlusEquals(featureName.str(), 1);
// source factor (start position minus 1)
featureName = util::StringStream();
featureName << m_description << "_";
if ( jumpToPos > jumpFromPos ) {
featureName << "R";
} else if ( jumpToPos < jumpFromPos ) {
featureName << "L";
} else {
featureName << "M";
}
if (m_sparseDistance) {
featureName << distance;
}
featureName << "_SFP_" << jumpFromSourceFactorPrev;
if (m_sparseSubordinate && subordinateConjunction) {
featureName << "_SUBORD";
}
out->SparsePlusEquals(featureName.str(), 1);
// source factor (end position)
featureName = util::StringStream();
featureName << m_description << "_";
if ( jumpToPos > jumpFromPos ) {
featureName << "R";
} else if ( jumpToPos < jumpFromPos ) {
featureName << "L";
} else {
featureName << "M";
}
if (m_sparseDistance) {
featureName << distance;
}
featureName << "_SFE_" << jumpToSourceFactor;
if (m_sparseSubordinate && subordinateConjunction) {
featureName << "_SUBORD";
}
out->SparsePlusEquals(featureName.str(), 1);
// target factor (end position)
featureName = util::StringStream();
featureName << m_description << "_";
if ( jumpToPos > jumpFromPos ) {
featureName << "R";
} else if ( jumpToPos < jumpFromPos ) {
featureName << "L";
} else {
featureName << "M";
}
if (m_sparseDistance) {
featureName << distance;
}
featureName << "_TFE_" << jumpToTargetFactor;
if (m_sparseSubordinate && subordinateConjunction) {
featureName << "_SUBORD";
}
out->SparsePlusEquals(featureName.str(), 1);
// relative source sentence position
featureName = util::StringStream();
featureName << m_description << "_";
if ( jumpToPos > jumpFromPos ) {
featureName << "R";
} else if ( jumpToPos < jumpFromPos ) {
featureName << "L";
} else {
featureName << "M";
}
if (m_sparseDistance) {
featureName << distance;
}
size_t relativeSourceSentencePosBin = std::floor( 5 * (float)jumpFromPos / (sentence.GetSize()+1) );
featureName << "_P_" << relativeSourceSentencePosBin;
if (m_sparseSubordinate && subordinateConjunction) {
featureName << "_SUBORD";
}
out->SparsePlusEquals(featureName.str(), 1);
// source sentence length bin
featureName = util::StringStream();
featureName << m_description << "_";
if ( jumpToPos > jumpFromPos ) {
featureName << "R";
} else if ( jumpToPos < jumpFromPos ) {
featureName << "L";
} else {
featureName << "M";
}
if (m_sparseDistance) {
featureName << distance;
}
size_t sourceSentenceLengthBin = 3;
if (sentence.GetSize() < 15) {
sourceSentenceLengthBin = 0;
} else if (sentence.GetSize() < 23) {
sourceSentenceLengthBin = 1;
} else if (sentence.GetSize() < 33) {
sourceSentenceLengthBin = 2;
}
featureName << "_SL_" << sourceSentenceLengthBin;
if (m_sparseSubordinate && subordinateConjunction) {
featureName << "_SUBORD";
}
out->SparsePlusEquals(featureName.str(), 1);
if (m_sparseSubordinate) {
for (size_t posT=0; posT<currTargetPhrase.GetSize(); ++posT) {
const Word &wordT = currTargetPhrase.GetWord(posT);
if (wordT[m_sparseFactorTypeTargetSubordinate] == m_subordinateConjunctionTagFactor) {
subordinateConjunction = true;
} else if (wordT[m_sparseFactorTypeTargetSubordinate]->GetString()[0] == 'V') {
subordinateConjunction = false;
}
};
}
}
const float distortionScore = CalculateDistortionScore(
hypo,
prev->range,
hypo.GetCurrSourceWordsRange(),
prev->first_gap);
out->PlusEquals(this, distortionScore);
DistortionState_traditional* res = new DistortionState_traditional(
DistortionState* state = new DistortionState(
hypo.GetCurrSourceWordsRange(),
hypo.GetWordsBitmap().GetFirstGapPos());
return res;
hypo.GetWordsBitmap().GetFirstGapPos(),
subordinateConjunction);
return state;
}

View File

@ -1,16 +1,11 @@
#pragma once
#include <stdexcept>
#include <string>
#include "StatefulFeatureFunction.h"
#include "moses/Range.h"
namespace Moses
{
class FFState;
class ScoreComponentCollection;
class Hypothesis;
class ChartHypothesis;
class Range;
/** Calculates Distortion scores
*/
@ -19,6 +14,14 @@ class DistortionScoreProducer : public StatefulFeatureFunction
protected:
static std::vector<const DistortionScoreProducer*> s_staticColl;
FactorType m_sparseFactorTypeSource;
FactorType m_sparseFactorTypeTarget;
bool m_useSparse;
bool m_sparseDistance;
bool m_sparseSubordinate;
FactorType m_sparseFactorTypeTargetSubordinate;
const Factor* m_subordinateConjunctionTagFactor;
public:
static const std::vector<const DistortionScoreProducer*>& GetDistortionFeatureFunctions() {
return s_staticColl;
@ -26,6 +29,8 @@ public:
DistortionScoreProducer(const std::string &line);
void SetParameter(const std::string& key, const std::string& value);
bool IsUseable(const FactorMask &mask) const {
return true;
}
@ -44,7 +49,7 @@ public:
const ChartHypothesis& /* cur_hypo */,
int /* featureID - used to index the state in the previous hypotheses */,
ScoreComponentCollection*) const {
throw std::logic_error("DistortionScoreProducer not supported in chart decoder, yet");
UTIL_THROW(util::Exception, "DIstortion not implemented in chart decoder");
}
};

View File

@ -42,6 +42,7 @@
#include "moses/FF/ControlRecombination.h"
#include "moses/FF/ConstrainedDecoding.h"
#include "moses/FF/SoftSourceSyntacticConstraintsFeature.h"
#include "moses/FF/TargetConstituentAdjacencyFeature.h"
#include "moses/FF/TargetPreferencesFeature.h"
#include "moses/FF/CoveredReferenceFeature.h"
#include "moses/FF/TreeStructureFeature.h"
@ -264,6 +265,7 @@ FeatureRegistry::FeatureRegistry()
MOSES_FNAME(CoveredReferenceFeature);
MOSES_FNAME(SourceGHKMTreeInputMatchFeature);
MOSES_FNAME(SoftSourceSyntacticConstraintsFeature);
MOSES_FNAME(TargetConstituentAdjacencyFeature);
MOSES_FNAME(TargetPreferencesFeature);
MOSES_FNAME(TreeStructureFeature);
MOSES_FNAME(SoftMatchingFeature);

View File

@ -0,0 +1,38 @@
#include "BidirectionalReorderingState.h"

namespace Moses
{

///////////////////////////
// BidirectionalReorderingState

/// Recombination hash: combines the hashes of the backward and forward
/// sub-states.
size_t BidirectionalReorderingState::hash() const
{
  size_t ret = m_backward->hash();
  boost::hash_combine(ret, m_forward->hash());
  return ret;
}

/// Two bidirectional states are equal iff both of their sub-states are equal.
bool BidirectionalReorderingState::operator==(const FFState& o) const
{
  // BUG FIX: the original returned 0 (i.e. false) for self-comparison;
  // equality must be reflexive — an object always equals itself.
  if (&o == this) return true;

  BidirectionalReorderingState const &other
  = static_cast<BidirectionalReorderingState const&>(o);

  bool ret = (*m_backward == *other.m_backward) && (*m_forward == *other.m_forward);
  return ret;
}

/// Expand both sub-states with the applied translation option and wrap the
/// results in a fresh bidirectional state (which takes ownership of them).
LRState*
BidirectionalReorderingState::
Expand(const TranslationOption& topt, const InputType& input,
       ScoreComponentCollection* scores) const
{
  LRState *newbwd = m_backward->Expand(topt, input, scores);
  LRState *newfwd = m_forward->Expand(topt, input, scores);
  return new BidirectionalReorderingState(m_configuration, newbwd, newfwd, m_offset);
}

}

View File

@ -0,0 +1,38 @@
#pragma once
#include "LRState.h"

namespace Moses
{

/// Pairs a backward-looking and a forward-looking lexicalized-reordering
/// state so that both directions are tracked within one feature state.
class BidirectionalReorderingState
  : public LRState
{
private:
  // Sub-states owned by this object; deleted in the destructor.
  // NOTE(review): copying this class would double-delete these pointers —
  // confirm the implicitly-declared copy operations are never used.
  const LRState *m_backward;
  const LRState *m_forward;

public:
  /// Takes ownership of @p bw and @p fw.
  BidirectionalReorderingState(const LRModel &config,
                               const LRState *bw,
                               const LRState *fw, size_t offset)
    : LRState(config,
              LRModel::Bidirectional,
              offset)
    , m_backward(bw)
    , m_forward(fw)
  { }

  ~BidirectionalReorderingState() {
    delete m_backward;
    delete m_forward;
  }

  /// Combines the hashes of both sub-states.
  virtual size_t hash() const;
  /// Equal iff both sub-states compare equal.
  virtual bool operator==(const FFState& other) const;

  /// Expands both sub-states; the returned state owns the results.
  LRState*
  Expand(const TranslationOption& topt, const InputType& input,
         ScoreComponentCollection* scores) const;
};

}

View File

@ -0,0 +1,50 @@
#include "HReorderingBackwardState.h"

namespace Moses
{

/////////////////////////////////////////
// HierarchicalReorderingBackwardState

// Initial (sentence-start) state.
HReorderingBackwardState::
HReorderingBackwardState(const LRModel &config, size_t offset)
  : LRState(config, LRModel::Backward, offset)
{ }

// Successor state: remembers the previous option and carries the stack.
HReorderingBackwardState::
HReorderingBackwardState(const HReorderingBackwardState *prev,
                         const TranslationOption &topt,
                         ReorderingStack reoStack)
  : LRState(prev, topt), m_reoStack(reoStack)
{ }

// The reordering stack fully determines recombination identity.
size_t HReorderingBackwardState::hash() const
{
  return m_reoStack.hash();
}

bool HReorderingBackwardState::operator==(const FFState& o) const
{
  const HReorderingBackwardState& rhs
  = static_cast<const HReorderingBackwardState&>(o);
  return m_reoStack == rhs.m_reoStack;
}

// Push the new source span onto the successor's stack, derive the
// orientation from the resulting jump distance, and score it.
LRState*
HReorderingBackwardState::
Expand(const TranslationOption& topt, const InputType& input,
       ScoreComponentCollection* scores) const
{
  HReorderingBackwardState* succ
  = new HReorderingBackwardState(this, topt, m_reoStack);
  Range span = topt.GetSourceWordsRange();
  int jump = succ->m_reoStack.ShiftReduce(span);
  ReorderingType orient = m_configuration.GetOrientation(jump);
  CopyScores(scores, topt, input, orient);
  return succ;
}

}

View File

@ -0,0 +1,33 @@
#pragma once
#include "LRState.h"
#include "ReorderingStack.h"

namespace Moses
{

//! State for a hierarchical reordering model (see Galley and Manning, A
//! Simple and Effective Hierarchical Phrase Reordering Model, EMNLP 2008)
//! backward state (conditioned on the previous phrase)
class HReorderingBackwardState : public LRState
{
private:
  // Stack of covered source spans; its hash/equality define recombination.
  ReorderingStack m_reoStack;
public:
  /// Initial (empty) state.
  HReorderingBackwardState(const LRModel &config, size_t offset);
  /// Successor state carrying a (copied) reordering stack.
  HReorderingBackwardState(const HReorderingBackwardState *prev,
                           const TranslationOption &topt,
                           ReorderingStack reoStack);
  virtual size_t hash() const;
  virtual bool operator==(const FFState& other) const;
  virtual LRState* Expand(const TranslationOption& hypo, const InputType& input,
                          ScoreComponentCollection* scores) const;

private:
  // NOTE(review): these helpers are declared here but not defined in the
  // accompanying .cpp — they look like dead declarations; confirm no friend
  // or other TU defines/uses them before removing.
  ReorderingType GetOrientationTypeMSD(int reoDistance) const;
  ReorderingType GetOrientationTypeMSLR(int reoDistance) const;
  ReorderingType GetOrientationTypeMonotonic(int reoDistance) const;
  ReorderingType GetOrientationTypeLeftRight(int reoDistance) const;
};

}

View File

@ -0,0 +1,78 @@
#include "HReorderingForwardState.h"

namespace Moses
{

///////////////////////////
//HReorderingForwardState

// Initial state: nothing translated yet; empty coverage bitmap of the
// given sentence size.
HReorderingForwardState::
HReorderingForwardState(const LRModel &config,
                        size_t size, size_t offset)
  : LRState(config, LRModel::Forward, offset)
  , m_first(true)
  , m_prevRange(NOT_FOUND,NOT_FOUND)
  , m_coverage(size)
{ }

// Successor state: remember the applied option's source span and extend
// the coverage bitmap with it.
HReorderingForwardState::
HReorderingForwardState(const HReorderingForwardState *prev,
                        const TranslationOption &topt)
  : LRState(prev, topt)
  , m_first(false)
  , m_prevRange(topt.GetSourceWordsRange())
  , m_coverage(prev->m_coverage, topt.GetSourceWordsRange())
{
}

// Recombination hash: only the previous source range is hashed.
// NOTE(review): operator== additionally compares previous-option scores,
// so states with equal hashes may still compare unequal.
size_t HReorderingForwardState::hash() const
{
  size_t ret;
  ret = hash_value(m_prevRange);
  return ret;
}

bool HReorderingForwardState::operator==(const FFState& o) const
{
  if (&o == this) return true;

  HReorderingForwardState const& other
  = static_cast<HReorderingForwardState const&>(o);

  // Three-way comparison: ranges first, then (on a tie) the scores of the
  // previous translation option; 0 means the states may recombine.
  int compareScores = ((m_prevRange == other.m_prevRange)
                       ? ComparePrevScores(other.m_prevOption)
                       : (m_prevRange < other.m_prevRange) ? -1 : 1);
  return compareScores == 0;
}

// For compatibility with the phrase-based reordering model, scoring is one
// step delayed.
// The forward model takes determines orientations heuristically as follows:
//  mono:   if the next phrase comes after the conditioning phrase and
//          - there is a gap to the right of the conditioning phrase, or
//          - the next phrase immediately follows it
//  swap:   if the next phrase goes before the conditioning phrase and
//          - there is a gap to the left of the conditioning phrase, or
//          - the next phrase immediately precedes it
//  dright: if the next phrase follows the conditioning phrase and other
//          stuff comes in between
//  dleft:  if the next phrase precedes the conditioning phrase and other
//          stuff comes in between
LRState*
HReorderingForwardState::
Expand(TranslationOption const& topt, InputType const& input,
       ScoreComponentCollection* scores) const
{
  const Range cur = topt.GetSourceWordsRange();
  // keep track of the current coverage ourselves so we don't need the hypothesis
  Bitmap cov(m_coverage, cur);
  if (!m_first) {
    // Score the *previous* phrase's orientation now that we know what
    // follows it (the one-step delay described above).
    LRModel::ReorderingType reoType;
    reoType = m_configuration.GetOrientation(m_prevRange,cur,cov);
    CopyScores(scores, topt, input, reoType);
  }
  return new HReorderingForwardState(this, topt);
}

}

View File

@ -0,0 +1,33 @@
#pragma once
#include "LRState.h"
#include "moses/Range.h"
#include "moses/Bitmap.h"

namespace Moses
{

//!forward state (conditioned on the next phrase)
class HReorderingForwardState : public LRState
{
private:
  bool m_first;       // true only for the initial (sentence-start) state
  Range m_prevRange;  // source span of the conditioning (previous) phrase
  Bitmap m_coverage;  // source coverage tracked independently of the hypothesis
public:
  /// Initial state for a source sentence of @p sentenceLength words.
  HReorderingForwardState(const LRModel &config, size_t sentenceLength,
                          size_t offset);
  /// Successor state conditioned on the newly applied option.
  HReorderingForwardState(const HReorderingForwardState *prev,
                          const TranslationOption &topt);
  virtual size_t hash() const;
  virtual bool operator==(const FFState& other) const;
  /// Scores the (one-step-delayed) forward orientation and returns the
  /// successor state.
  virtual LRState* Expand(const TranslationOption& hypo,
                          const InputType& input,
                          ScoreComponentCollection* scores) const;
};

}

View File

@ -0,0 +1,219 @@
#include "LRModel.h"
#include "moses/Range.h"
#include "moses/Bitmap.h"
#include "moses/InputType.h"
#include "HReorderingForwardState.h"
#include "HReorderingBackwardState.h"
#include "PhraseBasedReorderingState.h"
#include "BidirectionalReorderingState.h"
#include "SparseReordering.h"
namespace Moses
{
// True iff the step from the previous phrase to the current one counts as
// monotone: the current phrase starts immediately after the previous one,
// or the word just past the previous phrase is still uncovered (so the gap
// can still be filled later).
bool
IsMonotonicStep(Range const& prev, // words range of last source phrase
                Range const& cur,  // words range of current source phrase
                Bitmap const& cov) // coverage bitmap
{
  size_t const nxt = prev.GetEndPos() + 1;
  size_t const beg = cur.GetStartPos();
  if (beg == nxt) return true;
  return beg > nxt && !cov.GetValue(nxt);
}
// True iff the current phrase is a "swap" relative to the previous one:
// it ends immediately before the previous phrase starts, or the word just
// before the previous phrase is still uncovered.
bool
IsSwap(Range const& prev, Range const& cur, Bitmap const& cov)
{
  size_t const prevStart = prev.GetStartPos();
  size_t const curEnd    = cur.GetEndPos();
  if (curEnd + 1 == prevStart) return true;
  return curEnd < prevStart && !cov.GetValue(prevStart - 1);
}
// Number of distinct reordering orientations the model distinguishes:
// MSLR = 4 (M/S/DL/DR), MSD = 3 (M/S/D), Monotonic and LeftRight = 2.
size_t
LRModel::
GetNumberOfTypes() const
{
  switch (m_modelType) {
  case MSD:  return 3;
  case MSLR: return 4;
  default:   return 2;
  }
}
size_t
LRModel::
GetNumScoreComponents() const
{
size_t score_per_dir = m_collapseScores ? 1 : GetNumberOfTypes();
return ((m_direction == Bidirectional)
? 2 * score_per_dir + m_additionalScoreComponents
: score_per_dir + m_additionalScoreComponents);
}
// Instantiate the sparse reordering component iff any sparse-feature
// arguments were supplied on the feature line.
void
LRModel::
ConfigureSparse(const std::map<std::string,std::string>& sparseArgs,
                const LexicalReordering* producer)
{
  if (!sparseArgs.empty()) {
    m_sparse.reset(new SparseReordering(sparseArgs, producer));
  }
}
// Record how many extra (non-orientation) score components the owning
// feature contributes; these are included in GetNumScoreComponents().
void
LRModel::
SetAdditionalScoreComponents(size_t number)
{
  m_additionalScoreComponents = number;
}
/// Return the orientation assigned to the very first phrase of a
/// hypothesis (there is no previous phrase to compare against).
LRModel::ReorderingType
LRModel::
GetOrientation(Range const& cur) const
{
  UTIL_THROW_IF2(m_modelType == None, "Reordering Model Type is None");
  if (m_modelType == LeftRight) return R;  // first phrase is always "right"
  if (cur.GetStartPos() == 0) return M;    // starts at sentence start: monotone
  if (m_modelType == MSD) return D;        // discontinuous
  return (m_modelType == MSLR) ? DR : NM;  // discontinuous-right / non-monotone
}
// Orientation of the current phrase relative to the previous one, judged
// purely by adjacency of source ranges (classic phrase-based heuristic).
LRModel::ReorderingType
LRModel::
GetOrientation(Range const& prev, Range const& cur) const
{
  UTIL_THROW_IF2(m_modelType == None, "No reordering model type specified");
  if (m_modelType == LeftRight)
    return (prev.GetEndPos() <= cur.GetStartPos()) ? R : L;
  if (cur.GetStartPos() == prev.GetEndPos() + 1) return M;  // right-adjacent
  if (m_modelType == Monotonic) return NM;
  if (prev.GetStartPos() == cur.GetEndPos() + 1) return S;  // left-adjacent: swap
  if (m_modelType == MSD) return D;
  return (cur.GetStartPos() > prev.GetEndPos()) ? DR : DL;
}
// Orientation from a precomputed reordering distance (used by the
// hierarchical backward state): +1 = monotone, -1 = swap, else discontinuous.
LRModel::ReorderingType
LRModel::
GetOrientation(int const reoDistance) const
{
  // this one is for HierarchicalReorderingBackwardState
  if (m_modelType == LeftRight)
    return (reoDistance >= 1) ? R : L;
  if (reoDistance == 1) return M;
  if (m_modelType == Monotonic) return NM;
  if (reoDistance == -1) return S;
  if (m_modelType == MSD) return D;
  return (reoDistance > 1) ? DR : DL;
}
// Orientation under the hierarchical model: adjacency is judged with the
// coverage bitmap (IsMonotonicStep / IsSwap), so gaps that can still be
// filled later also count as monotone / swap.
LRModel::ReorderingType
LRModel::
GetOrientation(Range const& prev, Range const& cur,
               Bitmap const& cov) const
{
  if (m_modelType == LeftRight)
    return (cur.GetStartPos() > prev.GetEndPos()) ? R : L;
  if (IsMonotonicStep(prev, cur, cov)) return M;
  if (m_modelType == Monotonic) return NM;
  if (IsSwap(prev, cur, cov)) return S;
  if (m_modelType == MSD) return D;
  return (cur.GetStartPos() > prev.GetEndPos()) ? DR : DL;
}
// Parse a dash-separated model specification string, e.g.
// "wbe-msd-bidirectional-fe-allff", and configure the model accordingly.
// An unknown token or a missing orientation type is a fatal error.
LRModel::
LRModel(const std::string &modelType)
  : m_modelString(modelType)
  , m_scoreProducer(NULL)
  , m_modelType(None)
  , m_phraseBased(true)
  , m_collapseScores(false)
  , m_direction(Backward)
  , m_condition(FE) // BUGFIX: m_condition was never initialized, so reads via
                    // GetCondition() were undefined whenever the spec string
                    // contained neither "f" nor "fe". Default to "fe".
  , m_additionalScoreComponents(0)
{
  std::vector<std::string> config = Tokenize<std::string>(modelType, "-");
  for (size_t i=0; i<config.size(); ++i) {
    // granularity
    if (config[i] == "hier") {
      m_phraseBased = false;
    } else if (config[i] == "phrase") {
      m_phraseBased = true;
    } else if (config[i] == "wbe") {
      m_phraseBased = true;
    }
    // no word-based decoding available, fall-back to phrase-based
    // This is the old lexical reordering model combination of moses

    // orientation granularity
    else if (config[i] == "msd") {
      m_modelType = MSD;
    } else if (config[i] == "mslr") {
      m_modelType = MSLR;
    } else if (config[i] == "monotonicity") {
      m_modelType = Monotonic;
    } else if (config[i] == "leftright") {
      m_modelType = LeftRight;
    }
    // direction; unidirectional is deprecated, use backward instead
    else if (config[i] == "unidirectional") {
      m_direction = Backward;
    } else if (config[i] == "backward") {
      m_direction = Backward;
    } else if (config[i] == "forward") {
      m_direction = Forward;
    } else if (config[i] == "bidirectional") {
      m_direction = Bidirectional;
    }
    // conditioning side
    else if (config[i] == "f") {
      m_condition = F;
    } else if (config[i] == "fe") {
      m_condition = FE;
    }
    // score collapsing
    else if (config[i] == "collapseff") {
      m_collapseScores = true;
    } else if (config[i] == "allff") {
      m_collapseScores = false;
    } else {
      std::cerr
          << "Illegal part in the lexical reordering configuration string: "
          << config[i] << std::endl;
      exit(1);
    }
  }
  if (m_modelType == None) {
    std::cerr
        << "You need to specify the type of the reordering model "
        << "(msd, monotonicity,...)" << std::endl;
    exit(1);
  }
}
// Create the initial decoder state: a backward state, a forward state, or
// a bidirectional wrapper around both. 'offset' locates each component's
// scores inside the feature's dense score vector.
LRState *
LRModel::
CreateLRState(const InputType &input) const
{
  LRState *bwd = NULL;
  LRState *fwd = NULL;
  size_t offset = 0;

  bool const needBwd = (m_direction == Backward || m_direction == Bidirectional);
  bool const needFwd = (m_direction == Forward  || m_direction == Bidirectional);

  if (needBwd) {
    if (m_phraseBased)
      bwd = new PhraseBasedReorderingState(*this, Backward, offset);
    else
      bwd = new HReorderingBackwardState(*this, offset);
    offset += m_collapseScores ? 1 : GetNumberOfTypes();
    if (m_direction == Backward) return bwd;
  }
  if (needFwd) {
    if (m_phraseBased)
      fwd = new PhraseBasedReorderingState(*this, Forward, offset);
    else
      fwd = new HReorderingForwardState(*this, input.GetSize(), offset);
    offset += m_collapseScores ? 1 : GetNumberOfTypes();
    if (m_direction == Forward) return fwd;
  }
  return new BidirectionalReorderingState(*this, bwd, fwd, 0);
}
}

View File

@ -0,0 +1,133 @@
#pragma once
#include <string>
#include <map>
#include <boost/scoped_ptr.hpp>
namespace Moses
{
class Range;
class Bitmap;
class InputType;
class LRState;
class LexicalReordering;
class SparseReordering;
//! Factory class for lexical reordering states: parses the model
//! specification string and creates the matching chain of LRState objects.
class LRModel
{
public:
  friend class LexicalReordering;
  // Orientation granularity, scoring direction(s), and conditioning side.
  enum ModelType { Monotonic, MSD, MSLR, LeftRight, None };
  enum Direction { Forward, Backward, Bidirectional };
  enum Condition { F, E, FE };

  // constants for the different types of reordering
  // (correspond to indices in the respective table)
#if 0
  typedef int ReorderingType;
  static const ReorderingType M = 0;   // monotonic
  static const ReorderingType NM = 1;  // non-monotonic
  static const ReorderingType S = 1;   // swap
  static const ReorderingType D = 2;   // discontinuous
  static const ReorderingType DL = 2;  // discontinuous, left
  static const ReorderingType DR = 3;  // discontinuous, right
  static const ReorderingType R = 0;   // right
  static const ReorderingType L = 1;   // left
  static const ReorderingType MAX = 3; // largest possible
#else
  enum ReorderingType {
    M    = 0, // monotonic
    NM   = 1, // non-monotonic
    S    = 1, // swap
    D    = 2, // discontinuous
    DL   = 2, // discontinuous, left
    DR   = 3, // discontinuous, right
    R    = 0, // right
    L    = 1, // left
    MAX  = 3, // largest possible
    NONE = 4  // largest possible
  };
#endif
  // determine orientation, depending on model:
  ReorderingType // for first phrase in phrase-based
  GetOrientation(Range const& cur) const;

  ReorderingType // for non-first phrases in phrase-based
  GetOrientation(Range const& prev, Range const& cur) const;

  ReorderingType // for HReorderingForwardState
  GetOrientation(Range const& prev, Range const& cur,
                 Bitmap const& cov) const;

  ReorderingType // for HReorderingBackwarddState
  GetOrientation(int const reoDistance) const;

  // Parse a dash-separated spec string, e.g. "wbe-msd-bidirectional-fe".
  LRModel(const std::string &modelType);

  // Set up the sparse component iff sparse arguments were given.
  void
  ConfigureSparse(const std::map<std::string,std::string>& sparseArgs,
                  const LexicalReordering* producer);

  // Create the initial decoder state (backward, forward, or both).
  LRState*
  CreateLRState(const InputType &input) const;

  // Number of orientation classes (2, 3, or 4).
  size_t GetNumberOfTypes() const;
  // Total dense score components produced by this model.
  size_t GetNumScoreComponents() const;
  // Extra (non-orientation) components contributed by the owning feature.
  void SetAdditionalScoreComponents(size_t number);

  LexicalReordering*
  GetScoreProducer() const {
    return m_scoreProducer;
  }

  ModelType GetModelType() const {
    return m_modelType;
  }

  Direction GetDirection() const {
    return m_direction;
  }

  Condition GetCondition() const {
    return m_condition;
  }

  bool
  IsPhraseBased() const {
    return m_phraseBased;
  }

  bool
  CollapseScores() const {
    return m_collapseScores;
  }

  SparseReordering const*
  GetSparseReordering() const {
    return m_sparse.get();
  }

private:
  void
  SetScoreProducer(LexicalReordering* scoreProducer) {
    m_scoreProducer = scoreProducer;
  }

  std::string const&
  GetModelString() const {
    return m_modelString;
  }

  std::string m_modelString;          // original specification string
  LexicalReordering *m_scoreProducer; // owning feature function (not owned here)
  ModelType m_modelType;              // orientation granularity
  bool m_phraseBased;                 // phrase-based vs. hierarchical states
  bool m_collapseScores;              // single collapsed score per direction?
  Direction m_direction;              // scoring direction(s)
  Condition m_condition;              // conditioning side (f / fe)
  size_t m_additionalScoreComponents; // extra components beyond orientations
  boost::scoped_ptr<SparseReordering> m_sparse; // optional sparse component
};
}

View File

@ -0,0 +1,88 @@
// -*- c++ -*-
#include <vector>
#include <string>
#include "LRState.h"
#include "moses/FF/FFState.h"
#include "moses/Hypothesis.h"
#include "moses/Range.h"
#include "moses/TranslationOption.h"
#include "moses/Util.h"
#include "LexicalReordering.h"
namespace Moses
{
// Add the reordering score for the given orientation to 'accum'.  Backward
// scoring reads the scores cached on 'topt' itself; forward scoring reads
// the (delayed) scores of the remembered previous option.
void
LRState::
CopyScores(ScoreComponentCollection* accum,
           const TranslationOption &topt,
           const InputType& input,
           ReorderingType reoType) const
{
  // don't call this on a bidirectional object
  UTIL_THROW_IF2(m_direction != LRModel::Backward &&
                 m_direction != LRModel::Forward,
                 "Unknown direction: " << m_direction);
  TranslationOption const* relevantOpt = ((m_direction == LRModel::Backward)
                                          ? &topt : m_prevOption);
  LexicalReordering* producer = m_configuration.GetScoreProducer();
  Scores const* cached = relevantOpt->GetLexReorderingScores(producer);
  // The approach here is bizarre! Why create a whole vector and do
  // vector addition (acumm->PlusEquals) to update a single value? - UG

  // off_remote indexes the per-orientation score vector cached on the
  // option; off_local is the slot in this feature's dense vector (a single
  // shared slot when scores are collapsed).
  size_t off_remote = m_offset + reoType;
  size_t off_local = m_configuration.CollapseScores() ? m_offset : off_remote;
  UTIL_THROW_IF2(off_local >= producer->GetNumScoreComponents(),
                 "offset out of vector bounds!");
  // look up the applicable score in the vector of scores
  if(cached) {
    UTIL_THROW_IF2(off_remote >= cached->size(), "offset out of vector bounds!");
    Scores scores(producer->GetNumScoreComponents(),0);
    scores[off_local ] = (*cached)[off_remote];
    accum->PlusEquals(producer, scores);
  }
  // else: use default scores (if specified)
  else if (producer->GetHaveDefaultScores()) {
    Scores scores(producer->GetNumScoreComponents(),0);
    scores[off_local] = producer->GetDefaultScore(off_remote);
    accum->PlusEquals(m_configuration.GetScoreProducer(), scores);
  }
  // note: if no default score, no cost
  const SparseReordering* sparse = m_configuration.GetSparseReordering();
  if (sparse) sparse->CopyScores(*relevantOpt, m_prevOption, input, reoType,
                                 m_direction, accum);
}
int
LRState::
ComparePrevScores(const TranslationOption *other) const
{
LexicalReordering* producer = m_configuration.GetScoreProducer();
const Scores* myScores = m_prevOption->GetLexReorderingScores(producer);
const Scores* yrScores = other->GetLexReorderingScores(producer);
if(myScores == yrScores) return 0;
// The pointers are NULL if a phrase pair isn't found in the reordering table.
if(yrScores == NULL) return -1;
if(myScores == NULL) return 1;
size_t stop = m_offset + m_configuration.GetNumberOfTypes();
for(size_t i = m_offset; i < stop; i++) {
if((*myScores)[i] < (*yrScores)[i]) return -1;
if((*myScores)[i] > (*yrScores)[i]) return 1;
}
return 0;
}
}

View File

@ -0,0 +1,81 @@
// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*-
#pragma once
#include <vector>
#include <string>
#include "moses/Hypothesis.h"
#include "moses/ScoreComponentCollection.h"
#include "moses/Range.h"
#include "moses/Bitmap.h"
#include "moses/TranslationOption.h"
#include "moses/FF/FFState.h"
#include "LRModel.h"
namespace Moses
{
//! Abstract base class for lexical reordering model states carried on the
//! hypothesis (one per scoring direction; configured by an LRModel).
class LRState : public FFState
{
public:
  typedef LRModel::ReorderingType ReorderingType;

  // Produce the successor state after applying 'hypo', adding this step's
  // reordering scores to 'scores'.
  virtual
  LRState*
  Expand(const TranslationOption& hypo, const InputType& input,
         ScoreComponentCollection* scores) const = 0;

  static
  LRState*
  CreateLRState(const std::vector<std::string>& config,
                LRModel::Direction dir,
                const InputType &input);

protected:
  const LRModel& m_configuration; // shared model configuration
  // The following is the true direction of the object, which can be
  // Backward or Forward even if the Configuration has Bidirectional.
  LRModel::Direction m_direction;
  size_t m_offset; // index of this model's first score in the dense vector
  //forward scores are conditioned on prev option, so need to remember it
  const TranslationOption *m_prevOption;

  // Successor-state constructor: inherit configuration, remember topt.
  inline
  LRState(const LRState *prev,
          const TranslationOption &topt)
    : m_configuration(prev->m_configuration)
    , m_direction(prev->m_direction)
    , m_offset(prev->m_offset)
    , m_prevOption(&topt)
  { }

  // Initial-state constructor (no previous option yet).
  inline
  LRState(const LRModel &config,
          LRModel::Direction dir,
          size_t offset)
    : m_configuration(config)
    , m_direction(dir)
    , m_offset(offset)
    , m_prevOption(NULL)
  { }

  // copy the right scores in the right places, taking into account
  // forward/backward, offset, collapse
  void
  CopyScores(ScoreComponentCollection* scores,
             const TranslationOption& topt,
             const InputType& input, ReorderingType reoType) const;

  int
  ComparePrevScores(const TranslationOption *other) const;
};
}

View File

@ -5,7 +5,7 @@
#include "moses/FF/FFState.h"
#include "moses/TranslationOptionList.h"
#include "LexicalReordering.h"
#include "LexicalReorderingState.h"
#include "LRState.h"
#include "moses/StaticData.h"
#include "moses/Util.h"
#include "moses/InputPath.h"

View File

@ -14,7 +14,7 @@
#include "moses/FF/StatefulFeatureFunction.h"
#include "util/exception.hh"
#include "LexicalReorderingState.h"
#include "LRState.h"
#include "LexicalReorderingTable.h"
#include "SparseReordering.h"

View File

@ -1,506 +0,0 @@
// -*- c++ -*-
#include <vector>
#include <string>
#include "moses/FF/FFState.h"
#include "moses/Hypothesis.h"
#include "moses/Range.h"
#include "moses/TranslationOption.h"
#include "moses/Util.h"
#include "LexicalReordering.h"
#include "LexicalReorderingState.h"
#include "ReorderingStack.h"
namespace Moses
{
// True iff the step from the previous phrase to the current one counts as
// monotone: the current phrase starts immediately after the previous one,
// or the word just past the previous phrase is still uncovered (so the gap
// can still be filled later).
bool
IsMonotonicStep(Range const& prev, // words range of last source phrase
                Range const& cur,  // words range of current source phrase
                Bitmap const& cov) // coverage bitmap
{
  size_t const nxt = prev.GetEndPos() + 1;
  size_t const beg = cur.GetStartPos();
  if (beg == nxt) return true;
  return beg > nxt && !cov.GetValue(nxt);
}
// True iff the current phrase is a "swap" relative to the previous one:
// it ends immediately before the previous phrase starts, or the word just
// before the previous phrase is still uncovered.
bool
IsSwap(Range const& prev, Range const& cur, Bitmap const& cov)
{
  size_t const prevStart = prev.GetStartPos();
  size_t const curEnd    = cur.GetEndPos();
  if (curEnd + 1 == prevStart) return true;
  return curEnd < prevStart && !cov.GetValue(prevStart - 1);
}
// Number of distinct reordering orientations the model distinguishes:
// MSLR = 4 (M/S/DL/DR), MSD = 3 (M/S/D), Monotonic and LeftRight = 2.
size_t
LRModel::
GetNumberOfTypes() const
{
  switch (m_modelType) {
  case MSD:  return 3;
  case MSLR: return 4;
  default:   return 2;
  }
}
size_t
LRModel::
GetNumScoreComponents() const
{
size_t score_per_dir = m_collapseScores ? 1 : GetNumberOfTypes();
return ((m_direction == Bidirectional)
? 2 * score_per_dir + m_additionalScoreComponents
: score_per_dir + m_additionalScoreComponents);
}
// Instantiate the sparse reordering component iff any sparse-feature
// arguments were supplied on the feature line.
void
LRModel::
ConfigureSparse(const std::map<std::string,std::string>& sparseArgs,
                const LexicalReordering* producer)
{
  if (!sparseArgs.empty()) {
    m_sparse.reset(new SparseReordering(sparseArgs, producer));
  }
}
// Record how many extra (non-orientation) score components the owning
// feature contributes; these are included in GetNumScoreComponents().
void
LRModel::
SetAdditionalScoreComponents(size_t number)
{
  m_additionalScoreComponents = number;
}
/// Return the orientation assigned to the very first phrase of a
/// hypothesis (there is no previous phrase to compare against).
LRModel::ReorderingType
LRModel::
GetOrientation(Range const& cur) const
{
  UTIL_THROW_IF2(m_modelType == None, "Reordering Model Type is None");
  if (m_modelType == LeftRight) return R;  // first phrase is always "right"
  if (cur.GetStartPos() == 0) return M;    // starts at sentence start: monotone
  if (m_modelType == MSD) return D;        // discontinuous
  return (m_modelType == MSLR) ? DR : NM;  // discontinuous-right / non-monotone
}
// Orientation of the current phrase relative to the previous one, judged
// purely by adjacency of source ranges (classic phrase-based heuristic).
LRModel::ReorderingType
LRModel::
GetOrientation(Range const& prev, Range const& cur) const
{
  UTIL_THROW_IF2(m_modelType == None, "No reordering model type specified");
  if (m_modelType == LeftRight)
    return (prev.GetEndPos() <= cur.GetStartPos()) ? R : L;
  if (cur.GetStartPos() == prev.GetEndPos() + 1) return M;  // right-adjacent
  if (m_modelType == Monotonic) return NM;
  if (prev.GetStartPos() == cur.GetEndPos() + 1) return S;  // left-adjacent: swap
  if (m_modelType == MSD) return D;
  return (cur.GetStartPos() > prev.GetEndPos()) ? DR : DL;
}
// Orientation from a precomputed reordering distance (used by the
// hierarchical backward state): +1 = monotone, -1 = swap, else discontinuous.
LRModel::ReorderingType
LRModel::
GetOrientation(int const reoDistance) const
{
  // this one is for HierarchicalReorderingBackwardState
  if (m_modelType == LeftRight)
    return (reoDistance >= 1) ? R : L;
  if (reoDistance == 1) return M;
  if (m_modelType == Monotonic) return NM;
  if (reoDistance == -1) return S;
  if (m_modelType == MSD) return D;
  return (reoDistance > 1) ? DR : DL;
}
// Orientation under the hierarchical model: adjacency is judged with the
// coverage bitmap (IsMonotonicStep / IsSwap), so gaps that can still be
// filled later also count as monotone / swap.
LRModel::ReorderingType
LRModel::
GetOrientation(Range const& prev, Range const& cur,
               Bitmap const& cov) const
{
  if (m_modelType == LeftRight)
    return (cur.GetStartPos() > prev.GetEndPos()) ? R : L;
  if (IsMonotonicStep(prev, cur, cov)) return M;
  if (m_modelType == Monotonic) return NM;
  if (IsSwap(prev, cur, cov)) return S;
  if (m_modelType == MSD) return D;
  return (cur.GetStartPos() > prev.GetEndPos()) ? DR : DL;
}
// Parse a dash-separated model specification string, e.g.
// "wbe-msd-bidirectional-fe-allff", and configure the model accordingly.
// An unknown token or a missing orientation type is a fatal error.
LRModel::
LRModel(const std::string &modelType)
  : m_modelString(modelType)
  , m_scoreProducer(NULL)
  , m_modelType(None)
  , m_phraseBased(true)
  , m_collapseScores(false)
  , m_direction(Backward)
  , m_condition(FE) // BUGFIX: m_condition was never initialized, so reads via
                    // GetCondition() were undefined whenever the spec string
                    // contained neither "f" nor "fe". Default to "fe".
  , m_additionalScoreComponents(0)
{
  std::vector<std::string> config = Tokenize<std::string>(modelType, "-");
  for (size_t i=0; i<config.size(); ++i) {
    // granularity
    if (config[i] == "hier") {
      m_phraseBased = false;
    } else if (config[i] == "phrase") {
      m_phraseBased = true;
    } else if (config[i] == "wbe") {
      m_phraseBased = true;
    }
    // no word-based decoding available, fall-back to phrase-based
    // This is the old lexical reordering model combination of moses

    // orientation granularity
    else if (config[i] == "msd") {
      m_modelType = MSD;
    } else if (config[i] == "mslr") {
      m_modelType = MSLR;
    } else if (config[i] == "monotonicity") {
      m_modelType = Monotonic;
    } else if (config[i] == "leftright") {
      m_modelType = LeftRight;
    }
    // direction; unidirectional is deprecated, use backward instead
    else if (config[i] == "unidirectional") {
      m_direction = Backward;
    } else if (config[i] == "backward") {
      m_direction = Backward;
    } else if (config[i] == "forward") {
      m_direction = Forward;
    } else if (config[i] == "bidirectional") {
      m_direction = Bidirectional;
    }
    // conditioning side
    else if (config[i] == "f") {
      m_condition = F;
    } else if (config[i] == "fe") {
      m_condition = FE;
    }
    // score collapsing
    else if (config[i] == "collapseff") {
      m_collapseScores = true;
    } else if (config[i] == "allff") {
      m_collapseScores = false;
    } else {
      std::cerr
          << "Illegal part in the lexical reordering configuration string: "
          << config[i] << std::endl;
      exit(1);
    }
  }
  if (m_modelType == None) {
    std::cerr
        << "You need to specify the type of the reordering model "
        << "(msd, monotonicity,...)" << std::endl;
    exit(1);
  }
}
// Create the initial decoder state: a backward state, a forward state, or
// a bidirectional wrapper around both. 'offset' locates each component's
// scores inside the feature's dense score vector.
LRState *
LRModel::
CreateLRState(const InputType &input) const
{
  LRState *bwd = NULL;
  LRState *fwd = NULL;
  size_t offset = 0;

  bool const needBwd = (m_direction == Backward || m_direction == Bidirectional);
  bool const needFwd = (m_direction == Forward  || m_direction == Bidirectional);

  if (needBwd) {
    if (m_phraseBased)
      bwd = new PhraseBasedReorderingState(*this, Backward, offset);
    else
      bwd = new HReorderingBackwardState(*this, offset);
    offset += m_collapseScores ? 1 : GetNumberOfTypes();
    if (m_direction == Backward) return bwd;
  }
  if (needFwd) {
    if (m_phraseBased)
      fwd = new PhraseBasedReorderingState(*this, Forward, offset);
    else
      fwd = new HReorderingForwardState(*this, input.GetSize(), offset);
    offset += m_collapseScores ? 1 : GetNumberOfTypes();
    if (m_direction == Forward) return fwd;
  }
  return new BidirectionalReorderingState(*this, bwd, fwd, 0);
}
// Add the reordering score for the given orientation to 'accum'.  Backward
// scoring reads the scores cached on 'topt' itself; forward scoring reads
// the (delayed) scores of the remembered previous option.
void
LRState::
CopyScores(ScoreComponentCollection* accum,
           const TranslationOption &topt,
           const InputType& input,
           ReorderingType reoType) const
{
  // don't call this on a bidirectional object
  UTIL_THROW_IF2(m_direction != LRModel::Backward &&
                 m_direction != LRModel::Forward,
                 "Unknown direction: " << m_direction);
  TranslationOption const* relevantOpt = ((m_direction == LRModel::Backward)
                                          ? &topt : m_prevOption);
  LexicalReordering* producer = m_configuration.GetScoreProducer();
  Scores const* cached = relevantOpt->GetLexReorderingScores(producer);
  // The approach here is bizarre! Why create a whole vector and do
  // vector addition (acumm->PlusEquals) to update a single value? - UG

  // off_remote indexes the per-orientation score vector cached on the
  // option; off_local is the slot in this feature's dense vector (a single
  // shared slot when scores are collapsed).
  size_t off_remote = m_offset + reoType;
  size_t off_local = m_configuration.CollapseScores() ? m_offset : off_remote;
  UTIL_THROW_IF2(off_local >= producer->GetNumScoreComponents(),
                 "offset out of vector bounds!");
  // look up the applicable score in the vector of scores
  if(cached) {
    UTIL_THROW_IF2(off_remote >= cached->size(), "offset out of vector bounds!");
    Scores scores(producer->GetNumScoreComponents(),0);
    scores[off_local ] = (*cached)[off_remote];
    accum->PlusEquals(producer, scores);
  }
  // else: use default scores (if specified)
  else if (producer->GetHaveDefaultScores()) {
    Scores scores(producer->GetNumScoreComponents(),0);
    scores[off_local] = producer->GetDefaultScore(off_remote);
    accum->PlusEquals(m_configuration.GetScoreProducer(), scores);
  }
  // note: if no default score, no cost
  const SparseReordering* sparse = m_configuration.GetSparseReordering();
  if (sparse) sparse->CopyScores(*relevantOpt, m_prevOption, input, reoType,
                                 m_direction, accum);
}
int
LRState::
ComparePrevScores(const TranslationOption *other) const
{
LexicalReordering* producer = m_configuration.GetScoreProducer();
const Scores* myScores = m_prevOption->GetLexReorderingScores(producer);
const Scores* yrScores = other->GetLexReorderingScores(producer);
if(myScores == yrScores) return 0;
// The pointers are NULL if a phrase pair isn't found in the reordering table.
if(yrScores == NULL) return -1;
if(myScores == NULL) return 1;
size_t stop = m_offset + m_configuration.GetNumberOfTypes();
for(size_t i = m_offset; i < stop; i++) {
if((*myScores)[i] < (*yrScores)[i]) return -1;
if((*myScores)[i] > (*yrScores)[i]) return 1;
}
return 0;
}
// ===========================================================================
// PHRASE BASED REORDERING STATE
// ===========================================================================
// Whether the backward model also scores the very first phrase of a
// hypothesis; a global toggle.
bool PhraseBasedReorderingState::m_useFirstBackwardScore = true;

// Successor state: remember the source range of the option just applied.
PhraseBasedReorderingState::
PhraseBasedReorderingState(const PhraseBasedReorderingState *prev,
                           const TranslationOption &topt)
  : LRState(prev, topt)
  , m_prevRange(topt.GetSourceWordsRange())
  , m_first(false)
{ }

// Initial state: no previous range yet (NOT_FOUND sentinel).
PhraseBasedReorderingState::
PhraseBasedReorderingState(const LRModel &config,
                           LRModel::Direction dir, size_t offset)
  : LRState(config, dir, offset)
  , m_prevRange(NOT_FOUND,NOT_FOUND)
  , m_first(true)
{ }
// Recombination hash: previous source range combined with scoring direction.
size_t PhraseBasedReorderingState::hash() const
{
  size_t seed = hash_value(m_prevRange);
  boost::hash_combine(seed, m_direction);
  return seed;
}
// Equal iff the previous source ranges match; forward-conditioned states
// must additionally carry equal cached scores for the previous option.
bool PhraseBasedReorderingState::operator==(const FFState& o) const
{
  if (&o == this) return true;
  const PhraseBasedReorderingState &other
  = static_cast<const PhraseBasedReorderingState&>(o);
  if (!(m_prevRange == other.m_prevRange)) return false;
  if (m_direction != LRModel::Forward) return true;
  return ComparePrevScores(other.m_prevOption) == 0;
}
// Score the orientation of this expansion (including the first phrase when
// the backward model is configured to score it) and return the successor.
LRState*
PhraseBasedReorderingState::
Expand(const TranslationOption& topt, const InputType& input,
       ScoreComponentCollection* scores) const
{
  bool const scoreThisStep
  = !m_first || (m_direction != LRModel::Forward && m_useFirstBackwardScore);
  if (scoreThisStep) {
    LRModel const& lrmodel = m_configuration;
    Range const currRange = topt.GetSourceWordsRange();
    LRModel::ReorderingType const orient
    = m_first ? lrmodel.GetOrientation(currRange)
      : lrmodel.GetOrientation(m_prevRange, currRange);
    CopyScores(scores, topt, input, orient);
  }
  return new PhraseBasedReorderingState(this, topt);
}
///////////////////////////
//BidirectionalReorderingState
// Combine the hashes of the two component states.
size_t BidirectionalReorderingState::hash() const
{
  size_t seed = m_backward->hash();
  boost::hash_combine(seed, m_forward->hash());
  return seed;
}
// Two bidirectional states are equal iff both their backward and their
// forward component states compare equal.
bool BidirectionalReorderingState::operator==(const FFState& o) const
{
  // BUGFIX: this used to 'return 0' (i.e. false) for self-comparison,
  // reporting a state as unequal to itself and defeating recombination.
  if (&o == this) return true;
  BidirectionalReorderingState const &other
  = static_cast<BidirectionalReorderingState const&>(o);
  bool ret = (*m_backward == *other.m_backward) && (*m_forward == *other.m_forward);
  return ret;
}
// Expand both component states and wrap the results in a fresh
// bidirectional state (which takes ownership of them).
LRState*
BidirectionalReorderingState::
Expand(const TranslationOption& topt, const InputType& input,
       ScoreComponentCollection* scores) const
{
  LRState *bwd = m_backward->Expand(topt, input, scores);
  LRState *fwd = m_forward->Expand(topt, input, scores);
  return new BidirectionalReorderingState(m_configuration, bwd, fwd, m_offset);
}
///////////////////////////
//HierarchicalReorderingBackwardState
// Successor state: copy the predecessor's reordering stack (it is updated
// in Expand via ShiftReduce on the new state).
HReorderingBackwardState::
HReorderingBackwardState(const HReorderingBackwardState *prev,
                         const TranslationOption &topt,
                         ReorderingStack reoStack)
  : LRState(prev, topt), m_reoStack(reoStack)
{ }

// Initial state with a default (empty) reordering stack.
HReorderingBackwardState::
HReorderingBackwardState(const LRModel &config, size_t offset)
  : LRState(config, LRModel::Backward, offset)
{ }
// Recombination hash: delegates entirely to the reordering stack.
size_t HReorderingBackwardState::hash() const
{
  return m_reoStack.hash();
}
// Equal iff the reordering stacks are equal.
bool HReorderingBackwardState::operator==(const FFState& o) const
{
  HReorderingBackwardState const& other
  = static_cast<HReorderingBackwardState const&>(o);
  return m_reoStack == other.m_reoStack;
}
// Create the successor state, shift-reduce the new source range onto its
// stack, and score the orientation implied by the resulting distance.
LRState*
HReorderingBackwardState::
Expand(const TranslationOption& topt, const InputType& input,
       ScoreComponentCollection* scores) const
{
  HReorderingBackwardState* succ
  = new HReorderingBackwardState(this, topt, m_reoStack);
  int const reoDistance
  = succ->m_reoStack.ShiftReduce(topt.GetSourceWordsRange());
  ReorderingType const orient = m_configuration.GetOrientation(reoDistance);
  CopyScores(scores, topt, input, orient);
  return succ;
}
///////////////////////////
//HReorderingForwardState
// Initial state for a sentence with 'size' source words: empty coverage
// and no previous range yet (NOT_FOUND sentinel).
HReorderingForwardState::
HReorderingForwardState(const LRModel &config,
                        size_t size, size_t offset)
  : LRState(config, LRModel::Forward, offset)
  , m_first(true)
  , m_prevRange(NOT_FOUND,NOT_FOUND)
  , m_coverage(size)
{ }

// Successor state: remember topt's source range and extend the coverage.
HReorderingForwardState::
HReorderingForwardState(const HReorderingForwardState *prev,
                        const TranslationOption &topt)
  : LRState(prev, topt)
  , m_first(false)
  , m_prevRange(topt.GetSourceWordsRange())
  , m_coverage(prev->m_coverage, topt.GetSourceWordsRange())
{
}
// Recombination hash: the source range of the previous phrase.
size_t HReorderingForwardState::hash() const
{
  return hash_value(m_prevRange);
}
// Equal iff the previous ranges match and the cached reordering scores of
// the previous options compare equal (scoring is delayed by one step).
bool HReorderingForwardState::operator==(const FFState& o) const
{
  if (&o == this) return true;
  HReorderingForwardState const& other
  = static_cast<HReorderingForwardState const&>(o);
  if (!(m_prevRange == other.m_prevRange)) return false;
  return ComparePrevScores(other.m_prevOption) == 0;
}
// For compatibility with the phrase-based reordering model, scoring is one
// step delayed.
// The forward model determines orientations heuristically as follows:
// mono: if the next phrase comes after the conditioning phrase and
// - there is a gap to the right of the conditioning phrase, or
// - the next phrase immediately follows it
// swap: if the next phrase goes before the conditioning phrase and
// - there is a gap to the left of the conditioning phrase, or
// - the next phrase immediately precedes it
// dright: if the next phrase follows the conditioning phrase and other
// stuff comes in between
// dleft: if the next phrase precedes the conditioning phrase and other
// stuff comes in between
// Produce the successor state after applying topt, scoring the orientation
// of the *previous* phrase now that we know what follows it (one-step delay).
LRState*
HReorderingForwardState::
Expand(TranslationOption const& topt, InputType const& input,
       ScoreComponentCollection* scores) const
{
  // Source span consumed by the option we are expanding with.
  Range const currRange = topt.GetSourceWordsRange();
  // Maintain coverage locally so we never need to consult the hypothesis.
  Bitmap const updatedCov(m_coverage, currRange);
  if (!m_first) {
    LRModel::ReorderingType const orient
    = m_configuration.GetOrientation(m_prevRange, currRange, updatedCov);
    CopyScores(scores, topt, input, orient);
  }
  return new HReorderingForwardState(this, topt);
}
}

View File

@ -1,308 +0,0 @@
// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*-
#pragma once
#include <vector>
#include <string>
#include <boost/scoped_ptr.hpp>
#include "moses/Hypothesis.h"
#include "moses/ScoreComponentCollection.h"
#include "moses/Range.h"
#include "moses/Bitmap.h"
#include "moses/TranslationOption.h"
#include "moses/FF/FFState.h"
#include "ReorderingStack.h"
namespace Moses
{
class LRState;
class LexicalReordering;
class SparseReordering;
//! Factory class for lexical reordering states: parses the model
//! specification string and creates the matching chain of LRState objects.
class LRModel
{
public:
  friend class LexicalReordering;
  // Orientation granularity, scoring direction(s), and conditioning side.
  enum ModelType { Monotonic, MSD, MSLR, LeftRight, None };
  enum Direction { Forward, Backward, Bidirectional };
  enum Condition { F, E, FE };

  // constants for the different types of reordering
  // (correspond to indices in the respective table)
#if 0
  typedef int ReorderingType;
  static const ReorderingType M = 0;   // monotonic
  static const ReorderingType NM = 1;  // non-monotonic
  static const ReorderingType S = 1;   // swap
  static const ReorderingType D = 2;   // discontinuous
  static const ReorderingType DL = 2;  // discontinuous, left
  static const ReorderingType DR = 3;  // discontinuous, right
  static const ReorderingType R = 0;   // right
  static const ReorderingType L = 1;   // left
  static const ReorderingType MAX = 3; // largest possible
#else
  enum ReorderingType {
    M    = 0, // monotonic
    NM   = 1, // non-monotonic
    S    = 1, // swap
    D    = 2, // discontinuous
    DL   = 2, // discontinuous, left
    DR   = 3, // discontinuous, right
    R    = 0, // right
    L    = 1, // left
    MAX  = 3, // largest possible
    NONE = 4  // largest possible
  };
#endif
  // determine orientation, depending on model:
  ReorderingType // for first phrase in phrase-based
  GetOrientation(Range const& cur) const;

  ReorderingType // for non-first phrases in phrase-based
  GetOrientation(Range const& prev, Range const& cur) const;

  ReorderingType // for HReorderingForwardState
  GetOrientation(Range const& prev, Range const& cur,
                 Bitmap const& cov) const;

  ReorderingType // for HReorderingBackwarddState
  GetOrientation(int const reoDistance) const;

  // Parse a dash-separated spec string, e.g. "wbe-msd-bidirectional-fe".
  LRModel(const std::string &modelType);

  // Set up the sparse component iff sparse arguments were given.
  void
  ConfigureSparse(const std::map<std::string,std::string>& sparseArgs,
                  const LexicalReordering* producer);

  // Create the initial decoder state (backward, forward, or both).
  LRState*
  CreateLRState(const InputType &input) const;

  // Number of orientation classes (2, 3, or 4).
  size_t GetNumberOfTypes() const;
  // Total dense score components produced by this model.
  size_t GetNumScoreComponents() const;
  // Extra (non-orientation) components contributed by the owning feature.
  void SetAdditionalScoreComponents(size_t number);

  LexicalReordering*
  GetScoreProducer() const {
    return m_scoreProducer;
  }

  ModelType GetModelType() const {
    return m_modelType;
  }

  Direction GetDirection() const {
    return m_direction;
  }

  Condition GetCondition() const {
    return m_condition;
  }

  bool
  IsPhraseBased() const {
    return m_phraseBased;
  }

  bool
  CollapseScores() const {
    return m_collapseScores;
  }

  SparseReordering const*
  GetSparseReordering() const {
    return m_sparse.get();
  }

private:
  void
  SetScoreProducer(LexicalReordering* scoreProducer) {
    m_scoreProducer = scoreProducer;
  }

  std::string const&
  GetModelString() const {
    return m_modelString;
  }

  std::string m_modelString;          // original specification string
  LexicalReordering *m_scoreProducer; // owning feature function (not owned here)
  ModelType m_modelType;              // orientation granularity
  bool m_phraseBased;                 // phrase-based vs. hierarchical states
  bool m_collapseScores;              // single collapsed score per direction?
  Direction m_direction;              // scoring direction(s)
  Condition m_condition;              // conditioning side (f / fe)
  size_t m_additionalScoreComponents; // extra components beyond orientations
  boost::scoped_ptr<SparseReordering> m_sparse; // optional sparse component
};
//! Abstract base class for lexical reordering model states carried on the
//! hypothesis (one per scoring direction; configured by an LRModel).
class LRState : public FFState
{
public:
  typedef LRModel::ReorderingType ReorderingType;

  // Produce the successor state after applying 'hypo', adding this step's
  // reordering scores to 'scores'.
  virtual
  LRState*
  Expand(const TranslationOption& hypo, const InputType& input,
         ScoreComponentCollection* scores) const = 0;

  static
  LRState*
  CreateLRState(const std::vector<std::string>& config,
                LRModel::Direction dir,
                const InputType &input);

protected:
  const LRModel& m_configuration; // shared model configuration
  // The following is the true direction of the object, which can be
  // Backward or Forward even if the Configuration has Bidirectional.
  LRModel::Direction m_direction;
  size_t m_offset; // index of this model's first score in the dense vector
  //forward scores are conditioned on prev option, so need to remember it
  const TranslationOption *m_prevOption;

  // Successor-state constructor: inherit configuration, remember topt.
  inline
  LRState(const LRState *prev,
          const TranslationOption &topt)
    : m_configuration(prev->m_configuration)
    , m_direction(prev->m_direction)
    , m_offset(prev->m_offset)
    , m_prevOption(&topt)
  { }

  // Initial-state constructor (no previous option yet).
  inline
  LRState(const LRModel &config,
          LRModel::Direction dir,
          size_t offset)
    : m_configuration(config)
    , m_direction(dir)
    , m_offset(offset)
    , m_prevOption(NULL)
  { }

  // copy the right scores in the right places, taking into account
  // forward/backward, offset, collapse
  void
  CopyScores(ScoreComponentCollection* scores,
             const TranslationOption& topt,
             const InputType& input, ReorderingType reoType) const;

  int
  ComparePrevScores(const TranslationOption *other) const;
};
//! Pair of a backward- and a forward-conditioned reordering state scored
//! jointly; owns and deletes both component states.
class BidirectionalReorderingState
  : public LRState
{
private:
  const LRState *m_backward; // owned
  const LRState *m_forward;  // owned
public:
  BidirectionalReorderingState(const LRModel &config,
                               const LRState *bw,
                               const LRState *fw, size_t offset)
    : LRState(config,
              LRModel::Bidirectional,
              offset)
    , m_backward(bw)
    , m_forward(fw)
  { }

  ~BidirectionalReorderingState() {
    delete m_backward;
    delete m_forward;
  }

  virtual size_t hash() const;
  virtual bool operator==(const FFState& other) const;

  LRState*
  Expand(const TranslationOption& topt, const InputType& input,
         ScoreComponentCollection* scores) const;
};
//! State for the standard Moses implementation of lexical reordering models
//! (see Koehn et al, Edinburgh System Description for the 2005 NIST MT
//! Evaluation)
class PhraseBasedReorderingState
: public LRState
{
private:
// Source range covered by the previously translated option.
Range m_prevRange;
// True only for the initial state (no phrase translated yet).
bool m_first;
public:
// Global switch: when false, backward models do not score the very
// first phrase of a hypothesis.
static bool m_useFirstBackwardScore;
PhraseBasedReorderingState(const LRModel &config,
LRModel::Direction dir,
size_t offset);
PhraseBasedReorderingState(const PhraseBasedReorderingState *prev,
const TranslationOption &topt);
virtual size_t hash() const;
virtual bool operator==(const FFState& other) const;
virtual
LRState*
Expand(const TranslationOption& topt,const InputType& input,
ScoreComponentCollection* scores) const;
// Orientation classifiers for the supported model types, comparing
// 'currRange' against m_prevRange.
ReorderingType GetOrientationTypeMSD(Range currRange) const;
ReorderingType GetOrientationTypeMSLR(Range currRange) const;
ReorderingType GetOrientationTypeMonotonic(Range currRange) const;
ReorderingType GetOrientationTypeLeftRight(Range currRange) const;
};
//! State for a hierarchical reordering model (see Galley and Manning, A
//! Simple and Effective Hierarchical Phrase Reordering Model, EMNLP 2008)
//! backward state (conditioned on the previous phrase)
class HReorderingBackwardState : public LRState
{
private:
// Stack of source ranges used to compute hierarchical reordering
// distances (Galley & Manning 2008).
ReorderingStack m_reoStack;
public:
HReorderingBackwardState(const LRModel &config, size_t offset);
HReorderingBackwardState(const HReorderingBackwardState *prev,
const TranslationOption &topt,
ReorderingStack reoStack);
virtual size_t hash() const;
virtual bool operator==(const FFState& other) const;
virtual LRState* Expand(const TranslationOption& hypo, const InputType& input,
ScoreComponentCollection* scores) const;
private:
// Orientation classifiers keyed on the signed reordering distance.
ReorderingType GetOrientationTypeMSD(int reoDistance) const;
ReorderingType GetOrientationTypeMSLR(int reoDistance) const;
ReorderingType GetOrientationTypeMonotonic(int reoDistance) const;
ReorderingType GetOrientationTypeLeftRight(int reoDistance) const;
};
//!forward state (conditioned on the next phrase)
//!forward state (conditioned on the next phrase)
class HReorderingForwardState : public LRState
{
private:
// True only for the initial state (no phrase translated yet).
bool m_first;
// Source range covered by the previous option.
Range m_prevRange;
// Source-side coverage bitmap of the hypothesis so far.
Bitmap m_coverage;
public:
HReorderingForwardState(const LRModel &config, size_t sentenceLength,
size_t offset);
HReorderingForwardState(const HReorderingForwardState *prev,
const TranslationOption &topt);
virtual size_t hash() const;
virtual bool operator==(const FFState& other) const;
virtual LRState* Expand(const TranslationOption& hypo,
const InputType& input,
ScoreComponentCollection* scores) const;
};
}

View File

@ -0,0 +1,72 @@
#include "PhraseBasedReorderingState.h"
namespace Moses
{
// ===========================================================================
// PHRASE BASED REORDERING STATE
// ===========================================================================
bool PhraseBasedReorderingState::m_useFirstBackwardScore = true;
// Successor-state constructor: remember the source range covered by
// 'topt'; after the first expansion the state is no longer initial.
PhraseBasedReorderingState::
PhraseBasedReorderingState(const PhraseBasedReorderingState *prev,
const TranslationOption &topt)
: LRState(prev, topt)
, m_prevRange(topt.GetSourceWordsRange())
, m_first(false)
{ }
// Initial-state constructor: no previous phrase yet, marked by the
// sentinel range (NOT_FOUND, NOT_FOUND) and m_first == true.
PhraseBasedReorderingState::
PhraseBasedReorderingState(const LRModel &config,
LRModel::Direction dir, size_t offset)
: LRState(config, dir, offset)
, m_prevRange(NOT_FOUND,NOT_FOUND)
, m_first(true)
{ }
// Hash over the fields that operator== inspects: the previous source
// range and the model direction.
size_t PhraseBasedReorderingState::hash() const
{
  size_t seed = hash_value(m_prevRange);
  boost::hash_combine(seed, m_direction);
  return seed;
}
// Recombination check: states are equal when the previous source ranges
// match; forward states additionally require equal previous-option scores.
bool PhraseBasedReorderingState::operator==(const FFState& o) const
{
  if (this == &o) return true;
  const PhraseBasedReorderingState &rhs =
    static_cast<const PhraseBasedReorderingState&>(o);
  if (!(m_prevRange == rhs.m_prevRange)) return false;
  if (m_direction != LRModel::Forward) return true;
  // Forward scores are conditioned on the previous option, so two states
  // only recombine if those options score identically.
  return ComparePrevScores(rhs.m_prevOption) == 0;
}
// Score 'topt' relative to this state and return the successor state.
// Scoring is skipped only when this is the initial state AND either the
// model is forward or first-backward scoring is disabled.
LRState*
PhraseBasedReorderingState::
Expand(const TranslationOption& topt, const InputType& input,
ScoreComponentCollection* scores) const
{
// const LRModel::ModelType modelType = m_configuration.GetModelType();
if ((m_direction != LRModel::Forward && m_useFirstBackwardScore) || !m_first) {
LRState const& lrmodel = m_configuration;
Range const cur = topt.GetSourceWordsRange();
// For the very first phrase there is no previous range, so the
// one-argument orientation (relative to sentence start) is used.
LRModel::ReorderingType reoType = (m_first ? lrmodel.GetOrientation(cur)
: lrmodel.GetOrientation(m_prevRange,cur));
CopyScores(scores, topt, input, reoType);
}
return new PhraseBasedReorderingState(this, topt);
}
}

View File

@ -0,0 +1,38 @@
#pragma once
#include "LRState.h"
namespace Moses
{
//! State for the standard Moses implementation of lexical reordering models
//! (see Koehn et al, Edinburgh System Description for the 2005 NIST MT
//! Evaluation)
class PhraseBasedReorderingState
: public LRState
{
private:
// Source range covered by the previously translated option.
Range m_prevRange;
// True only for the initial state (no phrase translated yet).
bool m_first;
public:
// Global switch: when false, backward models do not score the very
// first phrase of a hypothesis.
static bool m_useFirstBackwardScore;
PhraseBasedReorderingState(const LRModel &config,
LRModel::Direction dir,
size_t offset);
PhraseBasedReorderingState(const PhraseBasedReorderingState *prev,
const TranslationOption &topt);
virtual size_t hash() const;
virtual bool operator==(const FFState& other) const;
virtual
LRState*
Expand(const TranslationOption& topt,const InputType& input,
ScoreComponentCollection* scores) const;
// Orientation classifiers for the supported model types, comparing
// 'currRange' against m_prevRange.
ReorderingType GetOrientationTypeMSD(Range currRange) const;
ReorderingType GetOrientationTypeMSLR(Range currRange) const;
ReorderingType GetOrientationTypeMonotonic(Range currRange) const;
ReorderingType GetOrientationTypeLeftRight(Range currRange) const;
};
}

View File

@ -19,7 +19,7 @@
#include "moses/FeatureVector.h"
#include "moses/ScoreComponentCollection.h"
#include "LexicalReorderingState.h"
#include "LRState.h"
/**
Configuration of sparse reordering:

View File

@ -140,6 +140,8 @@ float Model1LexicalTable::GetProbability(const Factor* wordS, const Factor* word
Model1Feature::Model1Feature(const std::string &line)
: StatelessFeatureFunction(1, line)
, m_skipTargetPunctuation(false)
, m_is_syntax(false)
{
VERBOSE(1, "Initializing feature " << GetScoreProducerDescription() << " ...");
ReadParameters();
@ -150,10 +152,12 @@ void Model1Feature::SetParameter(const std::string& key, const std::string& valu
{
if (key == "path") {
m_fileNameModel1 = value;
} else if (key == "sourceVocabulary") {
} else if (key == "source-vocabulary") {
m_fileNameVcbS = value;
} else if (key == "targetVocabulary") {
} else if (key == "target-vocabulary") {
m_fileNameVcbT = value;
} else if (key == "skip-target-punctuation") {
m_skipTargetPunctuation = Scan<bool>(value);
} else {
StatelessFeatureFunction::SetParameter(key, value);
}
@ -162,6 +166,8 @@ void Model1Feature::SetParameter(const std::string& key, const std::string& valu
void Model1Feature::Load(AllOptions::ptr const& opts)
{
m_options = opts;
m_is_syntax = is_syntax(opts->search.algo);
FEATUREVERBOSE(2, GetScoreProducerDescription() << ": Loading source vocabulary from file " << m_fileNameVcbS << " ...");
Model1Vocabulary vcbS;
vcbS.Load(m_fileNameVcbS);
@ -177,6 +183,16 @@ void Model1Feature::Load(AllOptions::ptr const& opts)
m_emptyWord = factorCollection.GetFactor(Model1Vocabulary::GIZANULL,false);
UTIL_THROW_IF2(m_emptyWord==NULL, GetScoreProducerDescription()
<< ": Factor for GIZA empty word does not exist.");
if (m_skipTargetPunctuation) {
const std::string punctuation = ",;.:!?";
for (size_t i=0; i<punctuation.size(); ++i) {
const std::string punct = punctuation.substr(i,1);
FactorCollection &factorCollection = FactorCollection::Instance();
const Factor* punctFactor = factorCollection.AddFactor(punct,false);
std::pair<std::set<const Factor*>::iterator,bool> inserted = m_punctuation.insert(punctFactor);
}
}
}
void Model1Feature::EvaluateWithSourceContext(const InputType &input
@ -192,6 +208,12 @@ void Model1Feature::EvaluateWithSourceContext(const InputType &input
for (size_t posT=0; posT<targetPhrase.GetSize(); ++posT) {
const Word &wordT = targetPhrase.GetWord(posT);
if (m_skipTargetPunctuation) {
std::set<const Factor*>::const_iterator foundPunctuation = m_punctuation.find(wordT[0]);
if (foundPunctuation != m_punctuation.end()) {
continue;
}
}
if ( !wordT.IsNonTerminal() ) {
float thisWordProb = m_model1.GetProbability(m_emptyWord,wordT[0]); // probability conditioned on empty word
@ -213,7 +235,7 @@ void Model1Feature::EvaluateWithSourceContext(const InputType &input
}
if (!foundInCache) {
for (size_t posS=1; posS<sentence.GetSize()-1; ++posS) { // ignore <s> and </s>
for (size_t posS=(m_is_syntax?1:0); posS<(m_is_syntax?sentence.GetSize()-1:sentence.GetSize()); ++posS) { // ignore <s> and </s>
const Word &wordS = sentence.GetWord(posS);
float modelProb = m_model1.GetProbability(wordS[0],wordT[0]);
FEATUREVERBOSE(4, "p( " << wordT << " | " << wordS << " ) = " << modelProb << std::endl);

View File

@ -2,6 +2,7 @@
#include <string>
#include <limits>
#include <set>
#include <boost/unordered_map.hpp>
#include "StatelessFeatureFunction.h"
#include "moses/Factor.h"
@ -98,6 +99,9 @@ private:
std::string m_fileNameModel1;
Model1LexicalTable m_model1;
const Factor* m_emptyWord;
bool m_skipTargetPunctuation;
std::set<const Factor*> m_punctuation;
bool m_is_syntax;
void Load(AllOptions::ptr const& opts);

View File

@ -0,0 +1,189 @@
#include "TargetConstituentAdjacencyFeature.h"
#include "moses/PP/TargetConstituentBoundariesRightAdjacentPhraseProperty.h"
#include "moses/PP/TargetConstituentBoundariesLeftPhraseProperty.h"
#include "moses/StaticData.h"
#include "moses/ScoreComponentCollection.h"
#include "moses/Hypothesis.h"
#include "moses/FactorCollection.h"
#include "moses/TreeInput.h"
#include <algorithm>
using namespace std;
namespace Moses
{
// Hash over the constituent keys (values are ignored, matching
// operator==). With recombination enabled all states hash alike so the
// decoder always recombines them.
size_t TargetConstituentAdjacencyFeatureState::hash() const
{
  if (m_recombine) {
    return 0;
  }
  size_t seed = 0;
  boost::hash_combine(seed, m_collection.size());
  std::map<const Factor*, float>::const_iterator it;
  for (it = m_collection.begin(); it != m_collection.end(); ++it) {
    boost::hash_combine(seed, it->first);
  }
  return seed;
}
// Two states are equal when their constituent key sets are identical;
// the stored float values are deliberately not compared. With
// recombination enabled every state compares equal.
bool TargetConstituentAdjacencyFeatureState::operator==(const FFState& other) const
{
  if (m_recombine) return true;
  if (this == &other) return true;

  const TargetConstituentAdjacencyFeatureState* rhs =
    dynamic_cast<const TargetConstituentAdjacencyFeatureState*>(&other);
  UTIL_THROW_IF2(rhs == NULL, "Wrong state type");

  const std::map<const Factor*, float>& mine = m_collection;
  const std::map<const Factor*, float>& theirs = rhs->m_collection;
  if (mine.size() != theirs.size()) return false;

  // Equal sizes, and std::map iterates in sorted key order, so a single
  // parallel sweep suffices.
  std::map<const Factor*, float>::const_iterator a = mine.begin();
  std::map<const Factor*, float>::const_iterator b = theirs.begin();
  for (; a != mine.end(); ++a, ++b) {
    if (a->first != b->first) return false;
  }
  return true;
}
// Construct from a moses.ini feature line. Registers two dense score
// components (see EvaluateWhenApplied: [0] adjacency score, [1] penalty
// count). Defaults: variant 0, recombination off.
TargetConstituentAdjacencyFeature::TargetConstituentAdjacencyFeature(const std::string &line)
: StatefulFeatureFunction(2, line)
, m_featureVariant(0)
, m_recombine(false)
{
VERBOSE(1, "Initializing feature " << GetScoreProducerDescription() << " ...");
ReadParameters();
VERBOSE(1, " Done." << std::endl);
VERBOSE(1, " Feature variant: " << m_featureVariant << "." << std::endl);
}
// Parse one key=value pair from the feature line; unknown keys are
// delegated to the base class.
void TargetConstituentAdjacencyFeature::SetParameter(const std::string& key, const std::string& value)
{
  if (key == "variant") {
    m_featureVariant = Scan<size_t>(value);
    return;
  }
  if (key == "recombine") {
    m_recombine = Scan<bool>(value);
    return;
  }
  StatefulFeatureFunction::SetParameter(key, value);
}
// Score the boundary between the previous hypothesis and the phrase just
// applied: intersect the previous state's right-adjacent constituent set
// with this phrase's left-boundary constituent set. Emits two dense
// scores: [0] a log-transformed match score, [1] a mismatch penalty count.
// Returns the new state carrying this phrase's right-adjacent set.
FFState* TargetConstituentAdjacencyFeature::EvaluateWhenApplied(
const Hypothesis& cur_hypo,
const FFState* prev_state,
ScoreComponentCollection* accumulator) const
{
// dense scores
std::vector<float> newScores(m_numScoreComponents,0); // m_numScoreComponents == 2
// state
const TargetConstituentAdjacencyFeatureState *prevState = static_cast<const TargetConstituentAdjacencyFeatureState*>(prev_state);
// read TargetConstituentAdjacency property
const TargetPhrase &currTarPhr = cur_hypo.GetCurrTargetPhrase();
FEATUREVERBOSE(2, "Phrase: " << currTarPhr << std::endl);
if (const PhraseProperty *property = currTarPhr.GetProperty("TargetConstituentBoundariesLeft")) {
const TargetConstituentBoundariesLeftPhraseProperty *targetConstituentBoundariesLeftPhraseProperty =
static_cast<const TargetConstituentBoundariesLeftPhraseProperty*>(property);
const TargetConstituentBoundariesLeftCollection& leftConstituentCollection =
targetConstituentBoundariesLeftPhraseProperty->GetCollection();
float prob = 0;
size_t numMatch = 0;
size_t numOverall = 0;
if ( !cur_hypo.GetPrevHypo()->GetPrevHypo() ) {
// previous hypothesis is initial, i.e. target sentence starts here
++numOverall;
FactorCollection &factorCollection = FactorCollection::Instance();
const Factor* bosFactor = factorCollection.AddFactor("BOS_",false);
TargetConstituentBoundariesLeftCollection::const_iterator found =
leftConstituentCollection.find(bosFactor);
if ( found != leftConstituentCollection.end() ) {
++numMatch;
prob += found->second;
}
} else {
// Sorted-merge intersection of the two maps; for matching
// constituents keep the maximum product of the two weights.
const std::map<const Factor*, float>& hypConstituentCollection = prevState->m_collection;
std::map<const Factor*, float>::const_iterator iter1 = hypConstituentCollection.begin();
std::map<const Factor*, float>::const_iterator iter2 = leftConstituentCollection.begin();
while ( iter1 != hypConstituentCollection.end() && iter2 != leftConstituentCollection.end() ) {
// NOTE(review): numOverall counts merge steps, not the union size
// of both sets — confirm this is the intended denominator.
++numOverall;
if ( iter1->first < iter2->first ) {
++iter1;
} else if ( iter2->first < iter1->first ) {
++iter2;
} else {
++numMatch;
float currProb = iter1->second * iter2->second;
if (currProb > prob)
prob = currProb;
++iter1;
++iter2;
}
}
}
if ( (numMatch == 0) || (prob == 0) ) {
// No adjacency evidence: count a penalty instead of a score.
++newScores[1];
} else {
// Variant 1 scores the best joint weight; the default scores the
// fraction of matched constituents.
if ( m_featureVariant == 1 ) {
newScores[0] += TransformScore(prob);
} else {
newScores[0] += TransformScore( (float)numMatch/numOverall );
}
}
} else {
// abort with error message if the phrase does not translate an unknown word
UTIL_THROW_IF2(!currTarPhr.GetWord(0).IsOOV(), GetScoreProducerDescription()
<< ": Missing TargetConstituentBoundariesLeft property.");
++newScores[1];
}
// Build the successor state from this phrase's right-adjacent
// constituent set (empty for OOV phrases).
TargetConstituentAdjacencyFeatureState *newState = new TargetConstituentAdjacencyFeatureState(m_recombine);
if (const PhraseProperty *property = currTarPhr.GetProperty("TargetConstituentBoundariesRightAdjacent")) {
const TargetConstituentBoundariesRightAdjacentPhraseProperty *targetConstituentBoundariesRightAdjacentPhraseProperty =
static_cast<const TargetConstituentBoundariesRightAdjacentPhraseProperty*>(property);
const TargetConstituentBoundariesLeftCollection& rightAdjacentConstituentCollection = targetConstituentBoundariesRightAdjacentPhraseProperty->GetCollection();
std::copy(rightAdjacentConstituentCollection.begin(), rightAdjacentConstituentCollection.end(),
std::inserter(newState->m_collection, newState->m_collection.begin()));
} else {
// abort with error message if the phrase does not translate an unknown word
UTIL_THROW_IF2(!currTarPhr.GetWord(0).IsOOV(), GetScoreProducerDescription()
<< ": Missing TargetConstituentBoundariesRightAdjacent property.");
}
// add scores
accumulator->PlusEquals(this, newScores);
return newState;
}
}

View File

@ -0,0 +1,101 @@
#pragma once
#include <string>
#include <vector>
#include <set>
#include <iostream>
#include "StatefulFeatureFunction.h"
#include "FFState.h"
#include "util/exception.hh"
#include <stdint.h>
namespace Moses
{
// Decoder state for TargetConstituentAdjacencyFeature: the set of target
// constituents adjacent to the right edge of the hypothesis so far, with
// a weight per constituent. When m_recombine is true, hash/operator==
// make all states equivalent so hypotheses always recombine.
class TargetConstituentAdjacencyFeatureState : public FFState
{
public:
friend class TargetConstituentAdjacencyFeature;
TargetConstituentAdjacencyFeatureState(bool recombine)
: m_recombine(recombine)
{};
size_t hash() const;
virtual bool operator==(const FFState& other) const;
private:
const bool m_recombine;
// Constituent label -> weight, filled from the phrase property by the
// owning feature.
std::map<const Factor*, float> m_collection;
};
// Stateful feature scoring whether adjacent target phrases join at
// constituent boundaries, using the TargetConstituentBoundariesLeft /
// ...RightAdjacent phrase properties. Phrase-based decoding only.
class TargetConstituentAdjacencyFeature : public StatefulFeatureFunction
{
public:
TargetConstituentAdjacencyFeature(const std::string &line);
~TargetConstituentAdjacencyFeature()
{};
// Usable for any factor configuration.
bool IsUseable(const FactorMask &mask) const {
return true;
};
virtual const FFState* EmptyHypothesisState(const InputType &input) const {
return new TargetConstituentAdjacencyFeatureState(m_recombine);
};
void SetParameter(const std::string& key, const std::string& value);
// No model files to load.
void Load(AllOptions::ptr const& opts)
{};
// Scoring happens only when a hypothesis is extended; the isolated /
// source-context evaluations are no-ops.
void EvaluateInIsolation(const Phrase &source
, const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const
{};
void EvaluateWithSourceContext(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, const StackVec *stackVec
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection *estimatedFutureScore = NULL) const
{};
void EvaluateTranslationOptionListWithSourceContext(const InputType &input
, const TranslationOptionList &translationOptionList) const
{};
FFState* EvaluateWhenApplied(
const Hypothesis& cur_hypo,
const FFState* prev_state,
ScoreComponentCollection* accumulator) const;
// Chart-based decoding is not supported; throws at runtime.
FFState* EvaluateWhenApplied(
const ChartHypothesis& cur_hypo,
int featureID, // used to index the state in the previous hypotheses
ScoreComponentCollection* accumulator) const {
UTIL_THROW2(GetScoreProducerDescription() << ": feature currently not implemented for chart-based decoding.");
return new TargetConstituentAdjacencyFeatureState(m_recombine);
};
private:
// Scoring variant (0: match fraction, 1: best joint weight).
size_t m_featureVariant;
// When true, states always recombine (feature becomes stateless-ish).
bool m_recombine;
};
}

View File

@ -72,7 +72,7 @@ private:
std::string MakeNGram(const TargetPhrase &phrase, size_t start, size_t end) const {
std::vector<std::string> words;
while (start != end) {
words.push_back(phrase.GetWord(start).GetString(StaticData::Instance().options().output.factor_order, false));
words.push_back(phrase.GetWord(start).GetString(StaticData::Instance().options()->output.factor_order, false));
start++;
}
return Join(" ", words);

View File

@ -323,7 +323,7 @@ public:
Phrase *target = new Phrase();
target->CreateFromString(
Output
, StaticData::Instance().options().output.factor_order
, StaticData::Instance().options()->output.factor_order
, tabbedSentence.GetColumns()[0]
, NULL);

View File

@ -111,8 +111,7 @@ void WordTranslationFeature::Load(AllOptions::ptr const& opts)
}
inFileSource.close();
} else if (!m_filePathSource.empty() || !m_filePathTarget.empty()) {
return;
} else {
// restricted source word vocabulary
ifstream inFileSource(m_filePathSource.c_str());
UTIL_THROW_IF2(!inFileSource, "could not open file " << m_filePathSource);

View File

@ -19,8 +19,7 @@ License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#ifndef moses_Factor_h
#define moses_Factor_h
#pragma once
#include <ostream>
#include <string>
@ -98,4 +97,4 @@ public:
size_t hash_value(const Factor &f);
}
#endif

View File

@ -175,7 +175,7 @@ void FVector::resize(size_t newsize)
void FVector::clear()
{
m_coreFeatures.resize(0);
m_coreFeatures.resize(m_coreFeatures.size(), 0);
m_features.clear();
}

View File

@ -40,7 +40,8 @@ namespace Moses
{
/** Constructs a new backward language model. */
template <class Model> BackwardLanguageModel<Model>::BackwardLanguageModel(const std::string &line, const std::string &file, FactorType factorType, bool lazy) : LanguageModelKen<Model>(line,file,factorType,lazy)
// TODO(lane): load_method instead of lazy bool
template <class Model> BackwardLanguageModel<Model>::BackwardLanguageModel(const std::string &line, const std::string &file, FactorType factorType, bool lazy) : LanguageModelKen<Model>(line,file,factorType, lazy ? util::LAZY : util::POPULATE_OR_READ)
{
//
// This space intentionally left blank

View File

@ -69,63 +69,6 @@ struct KenLMState : public FFState {
};
///*
// * An implementation of single factor LM using Ken's code.
// */
//template <class Model> class LanguageModelKen : public LanguageModel
//{
//public:
// LanguageModelKen(const std::string &line, const std::string &file, FactorType factorType, bool lazy);
//
// const FFState *EmptyHypothesisState(const InputType &/*input*/) const {
// KenLMState *ret = new KenLMState();
// ret->state = m_ngram->BeginSentenceState();
// return ret;
// }
//
// void CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const;
//
// FFState *Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const;
//
// FFState *EvaluateWhenApplied(const ChartHypothesis& cur_hypo, int featureID, ScoreComponentCollection *accumulator) const;
//
// void IncrementalCallback(Incremental::Manager &manager) const {
// manager.LMCallback(*m_ngram, m_lmIdLookup);
// }
//
// bool IsUseable(const FactorMask &mask) const;
//private:
// LanguageModelKen(const LanguageModelKen<Model> &copy_from);
//
// lm::WordIndex TranslateID(const Word &word) const {
// std::size_t factor = word.GetFactor(m_factorType)->GetId();
// return (factor >= m_lmIdLookup.size() ? 0 : m_lmIdLookup[factor]);
// }
//
// // Convert last words of hypothesis into vocab ids, returning an end pointer.
// lm::WordIndex *LastIDs(const Hypothesis &hypo, lm::WordIndex *indices) const {
// lm::WordIndex *index = indices;
// lm::WordIndex *end = indices + m_ngram->Order() - 1;
// int position = hypo.GetCurrTargetWordsRange().GetEndPos();
// for (; ; ++index, --position) {
// if (index == end) return index;
// if (position == -1) {
// *index = m_ngram->GetVocabulary().BeginSentence();
// return index + 1;
// }
// *index = TranslateID(hypo.GetWord(position));
// }
// }
//
// boost::shared_ptr<Model> m_ngram;
//
// std::vector<lm::WordIndex> m_lmIdLookup;
//
// FactorType m_factorType;
//
// const Factor *m_beginSentenceFactor;
//};
class MappingBuilder : public lm::EnumerateVocab
{
public:
@ -148,7 +91,7 @@ private:
} // namespace
template <class Model> void LanguageModelKen<Model>::LoadModel(const std::string &file, bool lazy)
template <class Model> void LanguageModelKen<Model>::LoadModel(const std::string &file, util::LoadMethod load_method)
{
m_lmIdLookup.clear();
@ -161,18 +104,18 @@ template <class Model> void LanguageModelKen<Model>::LoadModel(const std::string
FactorCollection &collection = FactorCollection::Instance();
MappingBuilder builder(collection, m_lmIdLookup);
config.enumerate_vocab = &builder;
config.load_method = lazy ? util::LAZY : util::POPULATE_OR_READ;
config.load_method = load_method;
m_ngram.reset(new Model(file.c_str(), config));
}
template <class Model> LanguageModelKen<Model>::LanguageModelKen(const std::string &line, const std::string &file, FactorType factorType, bool lazy)
template <class Model> LanguageModelKen<Model>::LanguageModelKen(const std::string &line, const std::string &file, FactorType factorType, util::LoadMethod load_method)
:LanguageModel(line)
,m_factorType(factorType)
,m_beginSentenceFactor(FactorCollection::Instance().AddFactor(BOS_))
{
ReadParameters();
LoadModel(file, lazy);
LoadModel(file, load_method);
}
template <class Model> LanguageModelKen<Model>::LanguageModelKen(const LanguageModelKen<Model> &copy_from)
@ -480,7 +423,7 @@ LanguageModel *ConstructKenLM(const std::string &lineOrig)
{
FactorType factorType = 0;
string filePath;
bool lazy = false;
util::LoadMethod load_method = util::POPULATE_OR_READ;
util::TokenIter<util::SingleCharacter, true> argument(lineOrig, ' ');
++argument; // KENLM
@ -501,38 +444,53 @@ LanguageModel *ConstructKenLM(const std::string &lineOrig)
} else if (name == "path") {
filePath.assign(value.data(), value.size());
} else if (name == "lazyken") {
lazy = boost::lexical_cast<bool>(value);
// deprecated: use load instead.
load_method = boost::lexical_cast<bool>(value) ? util::LAZY : util::POPULATE_OR_READ;
} else if (name == "load") {
if (value == "lazy") {
load_method = util::LAZY;
} else if (value == "populate_or_lazy") {
load_method = util::POPULATE_OR_LAZY;
} else if (value == "populate_or_read" || value == "populate") {
load_method = util::POPULATE_OR_READ;
} else if (value == "read") {
load_method = util::READ;
} else if (value == "parallel_read") {
load_method = util::PARALLEL_READ;
} else {
UTIL_THROW2("Unknown KenLM load method " << value);
}
} else {
// pass to base class to interpret
line << " " << name << "=" << value;
}
}
return ConstructKenLM(line.str(), filePath, factorType, lazy);
return ConstructKenLM(line.str(), filePath, factorType, load_method);
}
LanguageModel *ConstructKenLM(const std::string &line, const std::string &file, FactorType factorType, bool lazy)
LanguageModel *ConstructKenLM(const std::string &line, const std::string &file, FactorType factorType, util::LoadMethod load_method)
{
lm::ngram::ModelType model_type;
if (lm::ngram::RecognizeBinary(file.c_str(), model_type)) {
switch(model_type) {
case lm::ngram::PROBING:
return new LanguageModelKen<lm::ngram::ProbingModel>(line, file, factorType, lazy);
return new LanguageModelKen<lm::ngram::ProbingModel>(line, file, factorType, load_method);
case lm::ngram::REST_PROBING:
return new LanguageModelKen<lm::ngram::RestProbingModel>(line, file, factorType, lazy);
return new LanguageModelKen<lm::ngram::RestProbingModel>(line, file, factorType, load_method);
case lm::ngram::TRIE:
return new LanguageModelKen<lm::ngram::TrieModel>(line, file, factorType, lazy);
return new LanguageModelKen<lm::ngram::TrieModel>(line, file, factorType, load_method);
case lm::ngram::QUANT_TRIE:
return new LanguageModelKen<lm::ngram::QuantTrieModel>(line, file, factorType, lazy);
return new LanguageModelKen<lm::ngram::QuantTrieModel>(line, file, factorType, load_method);
case lm::ngram::ARRAY_TRIE:
return new LanguageModelKen<lm::ngram::ArrayTrieModel>(line, file, factorType, lazy);
return new LanguageModelKen<lm::ngram::ArrayTrieModel>(line, file, factorType, load_method);
case lm::ngram::QUANT_ARRAY_TRIE:
return new LanguageModelKen<lm::ngram::QuantArrayTrieModel>(line, file, factorType, lazy);
return new LanguageModelKen<lm::ngram::QuantArrayTrieModel>(line, file, factorType, load_method);
default:
UTIL_THROW2("Unrecognized kenlm model type " << model_type);
}
} else {
return new LanguageModelKen<lm::ngram::ProbingModel>(line, file, factorType, lazy);
return new LanguageModelKen<lm::ngram::ProbingModel>(line, file, factorType, load_method);
}
}

View File

@ -26,6 +26,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <boost/shared_ptr.hpp>
#include "lm/word_index.hh"
#include "util/mmap.hh"
#include "moses/LM/Base.h"
#include "moses/Hypothesis.h"
@ -41,7 +42,7 @@ class FFState;
LanguageModel *ConstructKenLM(const std::string &line);
//! This will also load. Returns a templated KenLM class
LanguageModel *ConstructKenLM(const std::string &line, const std::string &file, FactorType factorType, bool lazy);
LanguageModel *ConstructKenLM(const std::string &line, const std::string &file, FactorType factorType, util::LoadMethod load_method);
/*
* An implementation of single factor LM using Kenneth's code.
@ -49,7 +50,7 @@ LanguageModel *ConstructKenLM(const std::string &line, const std::string &file,
template <class Model> class LanguageModelKen : public LanguageModel
{
public:
LanguageModelKen(const std::string &line, const std::string &file, FactorType factorType, bool lazy);
LanguageModelKen(const std::string &line, const std::string &file, FactorType factorType, util::LoadMethod load_method);
virtual const FFState *EmptyHypothesisState(const InputType &/*input*/) const;
@ -73,7 +74,7 @@ protected:
FactorType m_factorType;
void LoadModel(const std::string &file, bool lazy);
void LoadModel(const std::string &file, util::LoadMethod load_method);
lm::WordIndex TranslateID(const Word &word) const {
std::size_t factor = word.GetFactor(m_factorType)->GetId();

View File

@ -73,7 +73,7 @@ template <class Model> FFState *ReloadingLanguageModel<Model>::EvaluateWhenAppli
std::auto_ptr<FFState> kenlmState(LanguageModelKen<Model>::EvaluateWhenApplied(hypo, ps, out));
const lm::ngram::State &out_state = static_cast<const ReloadingLMState&>(*kenlmState).state;
std::auto_ptr<ReloadingLMState> ret(new ReloadingLMState());
ret->state = out_state;

View File

@ -64,18 +64,18 @@ private:
template <class Model> class ReloadingLanguageModel : public LanguageModelKen<Model>
{
public:
ReloadingLanguageModel(const std::string &line, const std::string &file, FactorType factorType, bool lazy) : LanguageModelKen<Model>(line, file, factorType, lazy), m_file(file), m_lazy(lazy)
{
// TODO(Lane) copy less code, update to load_method
ReloadingLanguageModel(const std::string &line, const std::string &file, FactorType factorType, bool lazy) : LanguageModelKen<Model>(line, file, factorType, lazy ? util::LAZY : util::POPULATE_OR_READ), m_file(file), m_lazy(lazy) {
std::cerr << "ReloadingLM constructor: " << m_file << std::endl;
// std::cerr << std::string(line).replace(0,11,"KENLM") << std::endl;
}
virtual void InitializeForInput(ttasksptr const& ttask) {
virtual void InitializeForInput(ttasksptr const& ttask) {
std::cerr << "ReloadingLM InitializeForInput" << std::endl;
LanguageModelKen<Model>::LoadModel(m_file, m_lazy);
// TODO(lane): load_method
LanguageModelKen<Model>::LoadModel(m_file, m_lazy ? util::LAZY : util::POPULATE_OR_READ);
/*
lm::ngram::Config config;
if(this->m_verbosity >= 1) {
@ -87,15 +87,15 @@ public:
MappingBuilder builder(collection, m_lmIdLookup);
config.enumerate_vocab = &builder;
config.load_method = m_lazy ? util::LAZY : util::POPULATE_OR_READ;
m_ngram.reset(new Model(m_file.c_str(), config));
m_beginSentenceFactor = collection.AddFactor(BOS_);
*/
};
/*
ReloadingLanguageModel(const std::string &line) : LanguageModelKen<Model>(ConstructKenLM(std::string(line).replace(0,11,"KENLM"))) {
ReloadingLanguageModel(const std::string &line) : LanguageModelKen<Model>(ConstructKenLM(std::string(line).replace(0,11,"KENLM"))) {
std::cerr << "ReloadingLM constructor" << std::endl;
std::cerr << std::string(line).replace(0,11,"KENLM") << std::endl;
}
@ -138,12 +138,12 @@ public:
}
private:
private:
LanguageModel *m_lm;
*/
protected:
protected:
using LanguageModelKen<Model>::m_ngram;
using LanguageModelKen<Model>::m_lmIdLookup;

View File

@ -58,7 +58,14 @@ public:
// Write to 'fname', throwing std::ios_base::failure on any I/O error
// instead of failing silently.
void Write(const std::string& fname) const {
std::ofstream out(fname.c_str());
// Little-known fact: ofstream tracks failures but does not, by default,
// report them. You have to tell it to, or check for errors yourself.
// (ifstream::failbit == ofstream::failbit: both come from ios_base.)
out.exceptions(std::ifstream::failbit | std::ifstream::badbit);
Write(out);
// Make sure the file is flushed, so that any errors are reported. If we
// flush implicitly in the destructor, it won't be able to throw
// exceptions.
out.close();
}
void Write(std::ostream& out) const {
for(int i=data.size()-1; i>=0; --i)

View File

@ -11,6 +11,8 @@
#include "moses/PP/SpanLengthPhraseProperty.h"
#include "moses/PP/NonTermContextProperty.h"
#include "moses/PP/OrientationPhraseProperty.h"
#include "moses/PP/TargetConstituentBoundariesLeftPhraseProperty.h"
#include "moses/PP/TargetConstituentBoundariesRightAdjacentPhraseProperty.h"
namespace Moses
{
@ -58,6 +60,8 @@ PhrasePropertyFactory::PhrasePropertyFactory()
MOSES_PNAME2("Counts", CountsPhraseProperty);
MOSES_PNAME2("SourceLabels", SourceLabelsPhraseProperty);
MOSES_PNAME2("TargetConstituentBoundariesLeft", TargetConstituentBoundariesLeftPhraseProperty);
MOSES_PNAME2("TargetConstituentBoundariesRightAdjacent", TargetConstituentBoundariesRightAdjacentPhraseProperty);
MOSES_PNAME2("TargetPreferences", TargetPreferencesPhraseProperty);
MOSES_PNAME2("Tree",TreeStructurePhraseProperty);
MOSES_PNAME2("SpanLength", SpanLengthPhraseProperty);

View File

@ -5,9 +5,14 @@ namespace Moses
std::ostream& operator<<(std::ostream &out, const PhraseProperty &obj)
{
out << "Base phrase property";
obj.Print(out);
return out;
}
void PhraseProperty::Print(std::ostream &out) const
{
out << "Base phrase property";
}
}

View File

@ -28,6 +28,8 @@ public:
protected:
virtual void Print(std::ostream& out) const;
std::string *m_value;
};

View File

@ -0,0 +1,63 @@
#include "moses/PP/TargetConstituentBoundariesLeftPhraseProperty.h"
#include "moses/FactorCollection.h"
#include "moses/Util.h"
#include <iostream>
#include <queue>
#include <ostream>
namespace Moses
{
// Parse the property value string: whitespace-separated pairs of a
// '<'-joined list of constituent labels followed by a float count,
// e.g. "NP<S 2 VP 1". Labels duplicated within one list are counted
// once; counts for the same label accumulate across pairs into
// m_constituentsCollection.
void TargetConstituentBoundariesLeftPhraseProperty::ProcessValue(const std::string &value)
{
  FactorCollection &factorCollection = FactorCollection::Instance();

  std::vector<std::string> tokens;
  Tokenize(tokens, value, " ");
  std::vector<std::string>::const_iterator tokenIter = tokens.begin();
  while (tokenIter != tokens.end()) {
    try {
      std::vector<std::string> constituents;
      Tokenize(constituents, *tokenIter, "<");
      ++tokenIter;
      // A label token must be followed by its count. Dereferencing
      // tokens.end() would be undefined behavior (not a catchable
      // exception), so check explicitly before reading the count.
      if (tokenIter == tokens.end()) {
        UTIL_THROW2("TargetConstituentBoundariesLeftPhraseProperty: Missing count. Flawed property? " << value);
      }
      float count = std::atof( tokenIter->c_str() );
      ++tokenIter;

      // De-duplicate labels within this token before accumulating counts.
      std::set<const Factor* > dedup;
      for ( std::vector<std::string>::iterator constituentIter = constituents.begin();
            constituentIter != constituents.end(); ++constituentIter ) {
        const Factor* constituentFactor = factorCollection.AddFactor(*constituentIter,false);
        std::pair< std::set<const Factor* >::iterator, bool > dedupIns =
          dedup.insert(constituentFactor);
        if ( dedupIns.second ) {
          std::pair< TargetConstituentBoundariesLeftCollection::iterator, bool > inserted =
            m_constituentsCollection.insert(std::make_pair(constituentFactor,count));
          if ( !inserted.second ) {
            // Label seen in an earlier pair: accumulate its count.
            (inserted.first)->second += count;
          }
        }
      }
    } catch (const std::exception &e) {
      UTIL_THROW2("TargetConstituentBoundariesLeftPhraseProperty: Read error. Flawed property? " << value);
    }
  }
}
void TargetConstituentBoundariesLeftPhraseProperty::Print(std::ostream& out) const
{
for ( TargetConstituentBoundariesLeftCollection::const_iterator it = m_constituentsCollection.begin();
it != m_constituentsCollection.end(); ++it ) {
if ( it != m_constituentsCollection.begin() ) {
out << " ";
}
out << *(it->first) << " " << it->second;
}
}
} // namespace Moses

View File

@ -0,0 +1,40 @@
#pragma once
#include "moses/PP/PhraseProperty.h"
#include "moses/Factor.h"
#include "util/exception.hh"
#include <map>
#include <string>
namespace Moses
{
// Accumulated count per constituent-label Factor.
typedef std::map<const Factor*, float> TargetConstituentBoundariesLeftCollection;

// Phrase property holding a count per constituent label — presumably the
// constituents observed at the left boundary of the target phrase
// (TODO confirm against the extractor that emits this property).
class TargetConstituentBoundariesLeftPhraseProperty : public PhraseProperty
{
public:
  TargetConstituentBoundariesLeftPhraseProperty()
  {};

  // Parse the property value string and fill m_constituentsCollection.
  virtual void ProcessValue(const std::string &value);

  // Read-only access to the parsed label -> count map.
  const TargetConstituentBoundariesLeftCollection &GetCollection() const {
    return m_constituentsCollection;
  };

  // The raw value string is not retained; only the parsed collection is
  // available, so this always throws.
  virtual const std::string *GetValueString() const {
    UTIL_THROW2("TargetConstituentBoundariesLeftPhraseProperty: value string not available in this phrase property");
    return NULL;
  };

protected:
  virtual void Print(std::ostream& out) const;

  TargetConstituentBoundariesLeftCollection m_constituentsCollection;
};
} // namespace Moses

View File

@ -0,0 +1,63 @@
#include "moses/PP/TargetConstituentBoundariesRightAdjacentPhraseProperty.h"
#include "moses/FactorCollection.h"
#include "moses/Util.h"
#include <iostream>
#include <queue>
#include <ostream>
namespace Moses
{
// Parse the property value string: whitespace-separated pairs of a
// '<'-joined list of constituent labels followed by a float count,
// e.g. "NP<S 2 VP 1". Labels duplicated within one list are counted
// once; counts for the same label accumulate across pairs into
// m_constituentsCollection.
void TargetConstituentBoundariesRightAdjacentPhraseProperty::ProcessValue(const std::string &value)
{
  FactorCollection &factorCollection = FactorCollection::Instance();

  std::vector<std::string> tokens;
  Tokenize(tokens, value, " ");
  std::vector<std::string>::const_iterator tokenIter = tokens.begin();
  while (tokenIter != tokens.end()) {
    try {
      std::vector<std::string> constituents;
      Tokenize(constituents, *tokenIter, "<");
      ++tokenIter;
      // A label token must be followed by its count. Dereferencing
      // tokens.end() would be undefined behavior (not a catchable
      // exception), so check explicitly before reading the count.
      if (tokenIter == tokens.end()) {
        UTIL_THROW2("TargetConstituentBoundariesRightAdjacentPhraseProperty: Missing count. Flawed property? " << value);
      }
      float count = std::atof( tokenIter->c_str() );
      ++tokenIter;

      // De-duplicate labels within this token before accumulating counts.
      std::set<const Factor* > dedup;
      for ( std::vector<std::string>::iterator constituentIter = constituents.begin();
            constituentIter != constituents.end(); ++constituentIter ) {
        const Factor* constituentFactor = factorCollection.AddFactor(*constituentIter,false);
        std::pair< std::set<const Factor* >::iterator, bool > dedupIns =
          dedup.insert(constituentFactor);
        if ( dedupIns.second ) {
          std::pair< TargetConstituentBoundariesRightAdjacentCollection::iterator, bool > inserted =
            m_constituentsCollection.insert(std::make_pair(constituentFactor,count));
          if ( !inserted.second ) {
            // Label seen in an earlier pair: accumulate its count.
            (inserted.first)->second += count;
          }
        }
      }
    } catch (const std::exception &e) {
      UTIL_THROW2("TargetConstituentBoundariesRightAdjacentPhraseProperty: Read error. Flawed property? " << value);
    }
  }
}
void TargetConstituentBoundariesRightAdjacentPhraseProperty::Print(std::ostream& out) const
{
for ( TargetConstituentBoundariesRightAdjacentCollection::const_iterator it = m_constituentsCollection.begin();
it != m_constituentsCollection.end(); ++it ) {
if ( it != m_constituentsCollection.begin() ) {
out << " ";
}
out << *(it->first) << " " << it->second;
}
}
} // namespace Moses

View File

@ -0,0 +1,40 @@
#pragma once
#include "moses/PP/PhraseProperty.h"
#include "moses/Factor.h"
#include "util/exception.hh"
#include <map>
#include <string>
namespace Moses
{
// Accumulated count per constituent-label Factor.
typedef std::map<const Factor*, float> TargetConstituentBoundariesRightAdjacentCollection;

// Phrase property holding a count per constituent label — presumably the
// constituents adjacent to the right boundary of the target phrase
// (TODO confirm against the extractor that emits this property).
class TargetConstituentBoundariesRightAdjacentPhraseProperty : public PhraseProperty
{
public:
  TargetConstituentBoundariesRightAdjacentPhraseProperty()
  {};

  // Parse the property value string and fill m_constituentsCollection.
  virtual void ProcessValue(const std::string &value);

  // Read-only access to the parsed label -> count map.
  const TargetConstituentBoundariesRightAdjacentCollection &GetCollection() const {
    return m_constituentsCollection;
  };

  // The raw value string is not retained; only the parsed collection is
  // available, so this always throws.
  virtual const std::string *GetValueString() const {
    UTIL_THROW2("TargetConstituentBoundariesRightAdjacentPhraseProperty: value string not available in this phrase property");
    return NULL;
  };

protected:
  virtual void Print(std::ostream& out) const;

  TargetConstituentBoundariesRightAdjacentCollection m_constituentsCollection;
};
} // namespace Moses

View File

@ -1,3 +1,13 @@
exe ptable-sigtest-filter :
filter-pt.cc
$(TOP)/moses//moses
$(TOP)/moses/TranslationModel/UG/generic//generic
$(TOP)//boost_iostreams
$(TOP)//boost_program_options
$(TOP)/moses/TranslationModel/UG/mm//mm
$(TOP)/moses/TranslationModel/UG//mmsapt
$(TOP)/util//kenutil
;
exe try-align :
try-align.cc
$(TOP)/moses//moses

View File

@ -0,0 +1,669 @@
// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*-
// significance filtering for phrase tables as described in
// H. Johnson, et al. (2007) Improving Translation Quality
// by Discarding Most of the Phrasetable. EMNLP 2007.
// Implemented by Marcin Junczys-Dowmunt
// recommended use: -l a+e -n <ttable-limit>
#include <cstring>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <algorithm>
#include <fstream>
#include <sstream>
#include <vector>
#include <iostream>
#include <set>
#include <boost/thread/tss.hpp>
#include <boost/thread.hpp>
#include <boost/unordered_map.hpp>
#include <boost/program_options.hpp>
#include <boost/shared_ptr.hpp>
#include <boost/foreach.hpp>
#ifdef WIN32
#include "WIN32_functions.h"
#else
#include <unistd.h>
#endif
#include "mm/ug_bitext.h"
// constants
const size_t MINIMUM_SIZE_TO_KEEP = 10000; // increase this to improve memory usage,
// reduce for speed
const std::string SEPARATOR = " ||| ";
const double ALPHA_PLUS_EPS = -1000.0; // dummy value
const double ALPHA_MINUS_EPS = -2000.0; // dummy value
// configuration params
int pfe_filter_limit = 0; // 0 = don't filter anything based on P(f|e)
bool print_cooc_counts = false; // add cooc counts to phrase table?
bool print_neglog_significance = false; // add -log(p) to phrase table?
double sig_filter_limit = 0; // keep phrase pairs with -log(sig) > sig_filter_limit
// higher = filter-more
bool pef_filter_only = false; // only filter based on pef
bool hierarchical = false;
double p_111 = 0.0; // alpha
size_t pt_lines = 0;
size_t nremoved_sigfilter = 0;
size_t nremoved_pfefilter = 0;
typedef sapt::L2R_Token<sapt::SimpleWordId> Token;
typedef sapt::mmTtrack<Token> ttrack_t;
typedef sapt::mmTSA<Token> tsa_t;
typedef sapt::TokenIndex tind_t;
int num_lines;
boost::mutex in_mutex;
boost::mutex out_mutex;
boost::mutex err_mutex;
typedef size_t TextLenType;
typedef boost::shared_ptr<std::vector<TextLenType> > SentIdSet;
// Thread-safe LRU-ish cache mapping a phrase string to the (shared) set
// of sentence ids in which it occurs. Entries carry a clock() timestamp;
// prune() evicts the oldest entries once the global size cap is exceeded.
class Cache {
  typedef std::pair<SentIdSet, clock_t> ClockedSet;
  typedef boost::unordered_map<std::string, ClockedSet> ClockedMap;

public:
  // Return the cached set for `phrase` (refreshing its timestamp), or a
  // fresh empty set if the phrase is not cached.
  SentIdSet get(const std::string& phrase) {
    boost::shared_lock<boost::shared_mutex> lock(m_mutex);
    // Single lookup instead of the original count() + operator[] pair;
    // also avoids operator[]'s insert path on a non-const map.
    ClockedMap::iterator it = m_cont.find(phrase);
    if(it != m_cont.end()) {
      // NOTE(review): the timestamp refresh is a write performed under a
      // shared (read) lock — benign for the LRU heuristic, but technically
      // a data race; confirm whether an upgrade lock is warranted.
      it->second.second = clock();
      return it->second.first;
    }
    return SentIdSet( new SentIdSet::element_type() );
  }

  // Insert or overwrite the cached set for `phrase`.
  void put(const std::string& phrase, const SentIdSet set) {
    boost::unique_lock<boost::shared_mutex> lock(m_mutex);
    m_cont[phrase] = std::make_pair(set, clock());
  }

  // Global cap shared by all Cache instances; 0 disables pruning.
  static void set_max_cache(size_t max_cache) {
    s_max_cache = max_cache;
  }

  // Evict the least-recently-used entries until at most s_max_cache
  // entries remain (approximately: ties at the cutoff timestamp survive).
  void prune() {
    if(s_max_cache > 0) {
      boost::upgrade_lock<boost::shared_mutex> lock(m_mutex);
      if(m_cont.size() > s_max_cache) {
        // Find the timestamp cutoff below which entries are evicted.
        std::vector<clock_t> clocks;
        for(ClockedMap::iterator it = m_cont.begin(); it != m_cont.end(); ++it)
          clocks.push_back(it->second.second);

        std::sort(clocks.begin(), clocks.end());
        clock_t out = clocks[m_cont.size() - s_max_cache];

        boost::upgrade_to_unique_lock<boost::shared_mutex> uniq_lock(lock);
        // Advance via the iterator returned by erase(): the original code
        // incremented an iterator after erasing it, which is undefined
        // behavior for unordered_map.
        for(ClockedMap::iterator it = m_cont.begin(); it != m_cont.end(); ) {
          if(it->second.second < out)
            it = m_cont.erase(it);
          else
            ++it;
        }
      }
    }
  }

private:
  ClockedMap m_cont;
  boost::shared_mutex m_mutex;
  static size_t s_max_cache;
};
size_t Cache::s_max_cache = 0;
// One suffix-array-backed corpus side: vocabulary (V), token track (T),
// suffix array (I), plus a per-corpus cache of phrase-occurrence sets.
struct SA {
  tind_t V;
  boost::shared_ptr<ttrack_t> T;
  tsa_t I;
  Cache cache;
};

// Suffix arrays for the e (english/target) and f (french/source) sides;
// several corpora may be supplied on the command line (see main()).
std::vector<boost::shared_ptr<SA> > e_sas;
std::vector<boost::shared_ptr<SA> > f_sas;
#undef min
// Print a short description of the tool (and the paper it implements)
// to stderr.
void usage()
{
  static const char description[] =
    "\nFilter phrase table using significance testing as described\n"
    "in H. Johnson, et al. (2007) Improving Translation Quality\n"
    "by Discarding Most of the Phrasetable. EMNLP 2007.\n";
  std::cerr << description;
}
// One parsed phrase-table line plus the co-occurrence statistics the
// significance filter attaches to it.
struct PTEntry {
  PTEntry(const std::string& str, int index);
  std::string f_phrase;  // source (f) phrase, first ||| field
  std::string e_phrase;  // target (e) phrase, second ||| field
  std::string extra;     // any ||| fields after the scores
  std::string scores;    // raw scores field
  float pfe;             // score at position `index` (P(f|e)), used by the -n filter
  int cf;                // source phrase occurrence count
  int ce;                // target phrase occurrence count
  int cfe;               // joint occurrence count
  float nlog_pte;        // -log(significance) from Fisher's exact test
  // Record the co-occurrence counts and significance for this entry.
  void set_cooc_stats(int _cef, int _cf, int _ce, float nlp) {
    cfe = _cef;
    cf = _cf;
    ce = _ce;
    nlog_pte = nlp;
  }
};
// Parse one phrase-table line of the form
//   "f ||| e ||| scores [||| extra ...]".
// `index` selects which space-separated token of the scores field is
// read into pfe (the P(f|e) estimate used for -n pre-filtering).
PTEntry::PTEntry(const std::string& str, int index) :
  cf(0), ce(0), cfe(0), nlog_pte(0.0)
{
  size_t pos = 0;
  std::string::size_type nextPos = str.find(SEPARATOR, pos);
  // pos is 0 here, so substr(pos, nextPos) equals substr(pos, nextPos-pos).
  this->f_phrase = str.substr(pos,nextPos);

  pos = nextPos + SEPARATOR.size();
  nextPos = str.find(SEPARATOR, pos);
  this->e_phrase = str.substr(pos,nextPos-pos);
  pos = nextPos + SEPARATOR.size();

  nextPos = str.find(SEPARATOR, pos);
  if (nextPos < str.size()) {
    // Four or more fields: scores plus trailing extra (e.g. alignments).
    this->scores = str.substr(pos,nextPos-pos);
    pos = nextPos + SEPARATOR.size();
    this->extra = str.substr(pos);
  } else {
    this->scores = str.substr(pos,str.size()-pos);
  }

  // Skip `index` separators to reach the requested score token.
  int c = 0;
  std::string::iterator i=scores.begin();
  if (index > 0) {
    for (; i != scores.end(); ++i) {
      if ((*i) == ' ') {
        c++;
        if (c == index) break;
      }
    }
  }
  if (i != scores.end()) {
    ++i;
  }
  // Copy the token into a std::string rather than the original fixed
  // char f[24] buffer, whose unbounded copy loop could overflow on
  // unusually long score tokens.
  std::string token;
  while (i != scores.end() && *i != ' ') {
    token += *i++;
  }
  this->pfe = atof(token.c_str());
}
struct PfeComparer {
bool operator()(const PTEntry* a, const PTEntry* b) const {
return a->pfe > b->pfe;
}
};
// Predicate for std::remove_if: true (and deletes the entry) when its
// -log(significance) is below the threshold t.
struct NlogSigThresholder {
  NlogSigThresholder(float threshold) : t(threshold) {}
  float t;
  bool operator()(const PTEntry* a) const {
    if (a->nlog_pte < t) {
      // NOTE(review): deleting inside a remove_if predicate works here
      // because the stale tail pointers are only erased afterwards, never
      // dereferenced — fragile pattern; confirm before reusing elsewhere.
      delete a;
      return true;
    } else return false;
  }
};
// Serialize an entry back to the "f ||| e ||| scores [||| extra]" format,
// optionally appending co-occurrence counts and -log(significance)
// according to the global print flags.
std::ostream& operator << (std::ostream& os, const PTEntry& pp)
{
  os << pp.f_phrase << " ||| " << pp.e_phrase << " ||| " << pp.scores;
  if (!pp.extra.empty())
    os << " ||| " << pp.extra;
  if (print_cooc_counts)
    os << " ||| " << pp.cfe << " " << pp.cf << " " << pp.ce;
  if (print_neglog_significance)
    os << " ||| " << pp.nlog_pte;
  return os;
}
// Debug helper: dump a 2x2 contingency table (a b / c d), the table
// probability P, and the hypergeometric update factor xf to stderr.
void print(int a, int b, int c, int d, float p)
{
  double xf = (double)(b)*(double)(c)/(double)(a+1)/(double)(d+1);
  std::cerr << a << "\t" << b << "\t P=" << p << "\n";
  std::cerr << c << "\t" << d << "\t xf=" << xf << "\n\n";
}
// One-sided 2x2 Fisher's exact test for the co-occurrence table of a
// phrase pair; see B. Moore (2004), "On Log Likelihood and the
// Significance of Rare Events". Uses the file-global num_lines as the
// corpus size. Returns the tail probability (the significance level).
double fisher_exact(int cfe, int ce, int cf)
{
  assert(cfe <= ce);
  assert(cfe <= cf);

  // Contingency-table cells.
  int a = cfe;
  int b = (cf - cfe);
  int c = (ce - cfe);
  int d = (num_lines - ce - cf + cfe);
  int n = a + b + c + d;

  // Hypergeometric probability of the observed table, via log-gamma to
  // avoid overflow in the factorials.
  double cp = exp(lgamma(1+a+c) + lgamma(1+b+d) + lgamma(1+a+b) + lgamma(1+c+d)
                  - lgamma(1+n) - lgamma(1+a) - lgamma(1+b) - lgamma(1+c)
                  - lgamma(1+d));

  // Walk the tail of more extreme tables, updating cp incrementally.
  double total_p = 0.0;
  int tail = std::min(b,c);
  int step = 0;
  while (step <= tail) {
    total_p += cp;
    double coef = (double)(b)*(double)(c)/(double)(a+1)/(double)(d+1);
    cp *= coef;
    ++a;
    --c;
    ++d;
    --b;
    ++step;
  }
  return total_p;
}
// Intersect two sorted id collections (smart pointers to sorted,
// duplicate-free vectors), appending the common elements to *out in
// order. Both inputs must already be sorted (lookup_phrase guarantees
// this by sort+unique before caching).
template <class setType>
void ordered_set_intersect(setType& out, const setType lhs, const setType rhs)
{
  std::set_intersection(lhs->begin(), lhs->end(),
                        rhs->begin(), rhs->end(),
                        std::inserter(*out, out->begin()));
}
// Collect the ids of all sentences containing `phrase`, using the suffix
// array my_sa with vocabulary my_v. Results come from `cache` when
// available; large fresh result sets are stored back into the cache.
void lookup_phrase(SentIdSet& ids, const std::string& phrase,
                   tsa_t &my_sa, tind_t &my_v, Cache& cache)
{
  ids = cache.get(phrase);
  if(ids->empty()) {  // cache miss (the cache returns an empty set then)
    std::vector<sapt::id_type> snt;
    my_v.fillIdSeq(phrase, snt);

    tsa_t::tree_iterator m(&my_sa);
    size_t k = 0;
    // Descend the suffix tree one token at a time; extend() fails if the
    // phrase does not occur, leaving k short of snt.size().
    while (k < snt.size() && m.extend(snt[k])) ++k;
    if(k == snt.size()) {
      ids->reserve(m.approxOccurrenceCount()+10);
      // Enumerate all occurrences between the node's lower and upper bound.
      sapt::tsa::ArrayEntry I(m.lower_bound(-1));
      char const* stop = m.upper_bound(-1);
      do {
        m.root->readEntry(I.next,I);
        ids->push_back(I.sid);
      } while (I.next != stop);
      // A sentence may contain the phrase more than once; keep each
      // sentence id exactly once so set intersection gives document counts.
      std::sort(ids->begin(), ids->end());
      SentIdSet::element_type::iterator it =
        std::unique(ids->begin(), ids->end());
      ids->resize(it - ids->begin());

      // Only cache sets big enough to be worth the memory.
      if(ids->size() >= MINIMUM_SIZE_TO_KEEP)
        cache.put(phrase, ids);
    }
  }
}
// Find the sentences that contain all terminal sequences in `phrases`
// by looking each one up separately and intersecting the resulting
// sorted sentence-id sets. `rule` is passed in but not used here.
void lookup_multiple_phrases(SentIdSet& ids, std::vector<std::string> & phrases,
                             tsa_t & my_sa, tind_t &my_v,
                             const std::string & rule, Cache& cache)
{
  if (phrases.size() == 1) {
    // Single terminal sequence: plain phrase lookup.
    lookup_phrase(ids, phrases.front(), my_sa, my_v, cache);
  }
  else {
    SentIdSet main_set( new SentIdSet::element_type() );
    bool first = true;
    SentIdSet first_set( new SentIdSet::element_type() );
    lookup_phrase(first_set, phrases.front(), my_sa, my_v, cache);

    // Fold the remaining sequences in, intersecting as we go.
    for (std::vector<std::string>::iterator phrase=phrases.begin()+1;
         phrase != phrases.end(); ++phrase) {
      SentIdSet temp_set( new SentIdSet::element_type() );
      lookup_phrase(temp_set, *phrase, my_sa, my_v, cache);
      if (first) {
        ordered_set_intersect(main_set, first_set, temp_set);
        first = false;
      }
      else {
        SentIdSet new_set( new SentIdSet::element_type() );
        ordered_set_intersect(new_set, main_set, temp_set);
        main_set->swap(*new_set);
      }
    }
    ids->swap(*main_set);
  }
}
// Look up the sentence ids in which `rule` occurs. For hierarchical
// rules the non-terminals ("[X]"-style tokens) are stripped out and the
// remaining terminal sequences are looked up and intersected.
void find_occurrences(SentIdSet& ids, const std::string& rule,
                      tsa_t& my_sa, tind_t &my_v, Cache& cache)
{
  // we search for hierarchical rules by stripping away NT and looking for terminals sequences
  // if a rule contains multiple sequences of terminals, we intersect their occurrences.
  if (hierarchical) {
    // std::cerr << "splitting up phrase: " << phrase << "\n";
    int pos = 0;
    int NTStartPos, NTEndPos;
    std::vector<std::string> phrases;
    // Each "] " marks the end of a non-terminal token inside the rule.
    while (rule.find("] ", pos) < rule.size()) {
      NTStartPos = rule.find("[",pos) - 1; // -1 to cut space before NT
      NTEndPos = rule.find("] ",pos);
      if (NTStartPos < pos) { // no space: NT at start of rule (or two consecutive NTs)
        pos = NTEndPos + 2;
        continue;
      }
      phrases.push_back(rule.substr(pos,NTStartPos-pos));
      pos = NTEndPos + 2;
    }
    // NOTE(review): this assumes every hierarchical rule ends with its
    // "[LHS]" non-terminal, so find() cannot return npos here — confirm
    // behavior on malformed input (npos - 1 would wrap).
    NTStartPos = rule.find("[",pos) - 1; // LHS of rule
    if (NTStartPos > pos) {
      // Trailing terminal sequence before the LHS marker.
      phrases.push_back(rule.substr(pos,NTStartPos-pos));
    }
    lookup_multiple_phrases(ids, phrases, my_sa, my_v, rule, cache);
  }
  else {
    lookup_phrase(ids, rule, my_sa, my_v, cache);
  }
}
// input: unordered list of translation options for a single source phrase
// Step 1 (optional, -n): keep only the pfe_filter_limit options with the
// highest P(f|e). Step 2 (unless pef_filter_only): compute co-occurrence
// counts against the suffix arrays and delete options whose
// -log(significance) falls below sig_filter_limit.
void compute_cooc_stats_and_filter(std::vector<PTEntry*>& options)
{
  if (pfe_filter_limit > 0 && options.size() > pfe_filter_limit) {
    // int/size_t comparison is safe here because pfe_filter_limit > 0.
    nremoved_pfefilter += (options.size() - pfe_filter_limit);
    // Partition so the best pfe_filter_limit options (by P(f|e)) come
    // first; no full sort needed.
    std::nth_element(options.begin(), options.begin() + pfe_filter_limit,
                     options.end(), PfeComparer());
    for (std::vector<PTEntry*>::iterator i = options.begin() + pfe_filter_limit;
         i != options.end(); ++i)
      delete *i;
    options.erase(options.begin() + pfe_filter_limit,options.end());
  }
  if (pef_filter_only)
    return;

  if (options.empty())
    return;

  // Source-side occurrence sets, one per f-corpus; cf is the total count.
  // All options share the same source phrase, so this is done once.
  size_t cf = 0;
  std::vector<SentIdSet> fsets;
  BOOST_FOREACH(boost::shared_ptr<SA>& f_sa, f_sas) {
    fsets.push_back( boost::shared_ptr<SentIdSet::element_type>(new SentIdSet::element_type()) );
    find_occurrences(fsets.back(), options.front()->f_phrase, f_sa->I, f_sa->V, f_sa->cache);
    cf += fsets.back()->size();
  }
  for (std::vector<PTEntry*>::iterator i = options.begin();
       i != options.end(); ++i) {
    const std::string& e_phrase = (*i)->e_phrase;
    size_t ce = 0;
    // Target-side occurrence sets, parallel to fsets (one per corpus pair).
    std::vector<SentIdSet> esets;
    BOOST_FOREACH(boost::shared_ptr<SA>& e_sa, e_sas) {
      esets.push_back( boost::shared_ptr<SentIdSet::element_type>(new SentIdSet::element_type()) );
      find_occurrences(esets.back(), e_phrase, e_sa->I, e_sa->V, e_sa->cache);
      ce += esets.back()->size();
    }
    // Joint count: intersect source and target sets corpus by corpus.
    size_t cef = 0;
    for(size_t j = 0; j < fsets.size(); ++j) {
      SentIdSet efset( new SentIdSet::element_type() );
      ordered_set_intersect(efset, fsets[j], esets[j]);
      cef += efset->size();
    }
    double nlp = -log(fisher_exact(cef, cf, ce));
    (*i)->set_cooc_stats(cef, cf, ce, nlp);
  }
  // Drop (and delete) the options below the significance threshold.
  std::vector<PTEntry*>::iterator new_end =
    std::remove_if(options.begin(), options.end(),
                   NlogSigThresholder(sig_filter_limit));
  nremoved_sigfilter += (options.end() - new_end);
  options.erase(new_end,options.end());
}
// Worker-thread entry point: repeatedly grabs a batch of phrase-table
// lines from *in, groups consecutive lines by source phrase, runs the
// significance filter on each completed group, and writes the survivors
// to *out. Relies on the table being sorted by source phrase so that a
// group never spans two batches read by different threads only at group
// boundaries within a single thread's batch.
void filter_thread(std::istream* in, std::ostream* out, int pfe_index) {

  std::vector<std::string> lines;
  std::string prev = "";
  std::vector<PTEntry*> options;
  while(true) {
    {
      // Read up to 500k lines per batch under the input lock so threads
      // don't contend on every single line.
      boost::mutex::scoped_lock lock(in_mutex);
      if(in->eof())
        break;

      lines.clear();
      std::string line;
      while(getline(*in, line) && lines.size() < 500000)
        lines.push_back(line);
    }

    // Output is buffered per batch and flushed under out_mutex below.
    std::stringstream out_temp;
    for(std::vector<std::string>::iterator it = lines.begin(); it != lines.end(); it++) {
      // NOTE(review): pt_lines and the nremoved_* counters are plain
      // globals updated from multiple threads without synchronization,
      // so the progress numbers below are approximate — confirm before
      // relying on them for anything but logging.
      size_t tmp_lines = ++pt_lines;
      if(tmp_lines % 10000 == 0) {
        boost::mutex::scoped_lock lock(err_mutex);
        std::cerr << ".";

        if(tmp_lines % 500000 == 0)
          std::cerr << "[n:" << tmp_lines << "]\n";

        // Periodic interim statistics.
        if(tmp_lines % 10000000 == 0) {
          float pfefper = (100.0*(float)nremoved_pfefilter)/(float)pt_lines;
          float sigfper = (100.0*(float)nremoved_sigfilter)/(float)pt_lines;
          std::cerr << "------------------------------------------------------\n"
                    << " unfiltered phrases pairs: " << pt_lines << "\n"
                    << "\n"
                    << " P(f|e) filter [first]: " << nremoved_pfefilter << " (" << pfefper << "%)\n"
                    << " significance filter: " << nremoved_sigfilter << " (" << sigfper << "%)\n"
                    << " TOTAL FILTERED: " << (nremoved_pfefilter + nremoved_sigfilter) << " (" << (sigfper + pfefper) << "%)\n"
                    << "\n"
                    << " FILTERED phrase pairs: " << (pt_lines - nremoved_pfefilter - nremoved_sigfilter) << " (" << (100.0-sigfper - pfefper) << "%)\n"
                    << "------------------------------------------------------\n";
        }
      }

      // Periodically evict old entries from the per-corpus caches.
      if(pt_lines % 10000 == 0) {
        BOOST_FOREACH(boost::shared_ptr<SA> f_sa, f_sas)
          f_sa->cache.prune();
        BOOST_FOREACH(boost::shared_ptr<SA> e_sa, e_sas)
          e_sa->cache.prune();
      }

      if(it->length() > 0) {
        PTEntry* pp = new PTEntry(it->c_str(), pfe_index);
        if (prev != pp->f_phrase) {
          // New source phrase: filter and emit the finished group.
          prev = pp->f_phrase;

          if (!options.empty()) { // always true after first line
            compute_cooc_stats_and_filter(options);
          }

          for (std::vector<PTEntry*>::iterator i = options.begin();
               i != options.end(); ++i) {
            out_temp << **i << '\n';
            delete *i;
          }

          options.clear();
          options.push_back(pp);

        } else {
          options.push_back(pp);
        }
      }
    }
    boost::mutex::scoped_lock lock(out_mutex);
    *out << out_temp.str() << std::flush;
  }

  // Flush the final group after EOF.
  compute_cooc_stats_and_filter(options);

  boost::mutex::scoped_lock lock(out_mutex);
  for (std::vector<PTEntry*>::iterator i = options.begin();
       i != options.end(); ++i) {
    *out << **i << '\n';
    delete *i;
  }
  *out << std::flush;
}
namespace po = boost::program_options;
// Entry point: parse options, load the e- and f-side suffix arrays,
// derive the significance threshold (possibly relative to alpha =
// -log p(1,1,1)), then filter the phrase table streamed on stdin to
// stdout using `threads` worker threads.
int main(int argc, char * argv[])
{
  bool help;
  std::vector<std::string> efiles;
  std::vector<std::string> ffiles;
  int pfe_index = 2;
  int threads = 1;
  size_t max_cache = 0;
  std::string str_sig_filter_limit;

  po::options_description general("General options");
  general.add_options()
  ("english,e", po::value<std::vector<std::string> >(&efiles)->multitoken(),
   "english.suf-arr")
  ("french,f", po::value<std::vector<std::string> >(&ffiles)->multitoken(),
   "french.suf-arr")
  ("pfe-index,i", po::value(&pfe_index)->default_value(2),
   "Index of P(f|e) in phrase table")
  ("pfe-filter-limit,n", po::value(&pfe_filter_limit)->default_value(0),
   "0, 1...: 0=no filtering, >0 sort by P(e|f) and keep the top num elements")
  ("threads,t", po::value(&threads)->default_value(1),
   "number of threads to use")
  ("max-cache,m", po::value(&max_cache)->default_value(0),
   "limit cache to arg most recent phrases")
  ("print-cooc,c", po::value(&print_cooc_counts)->zero_tokens()->default_value(false),
   "add the coocurrence counts to the phrase table")
  ("print-significance,p", po::value(&print_neglog_significance)->zero_tokens()->default_value(false),
   "add -log(significance) to the phrase table")
  ("hierarchical,x", po::value(&hierarchical)->zero_tokens()->default_value(false),
   "filter hierarchical rule table")
  ("sig-filter-limit,l", po::value(&str_sig_filter_limit),
   ">0.0, a+e, or a-e: keep values that have a -log significance > this")
  ("help,h", po::value(&help)->zero_tokens()->default_value(false),
   "display this message")
  ;

  po::options_description cmdline_options("Allowed options");
  cmdline_options.add(general);
  po::variables_map vm;

  try {
    po::store(po::command_line_parser(argc,argv).
              options(cmdline_options).run(), vm);
    po::notify(vm);
  }
  catch (std::exception& e) {
    std::cout << "Error: " << e.what() << std::endl << std::endl;
    usage();
    std::cout << cmdline_options << std::endl;
    exit(0);
  }

  if(vm["help"].as<bool>()) {
    usage();
    std::cout << cmdline_options << std::endl;
    exit(0);
  }

  if(vm.count("pfe-filter-limit"))
    std::cerr << "P(f|e) filter limit: " << pfe_filter_limit << std::endl;
  if(vm.count("threads"))
    std::cerr << "Using threads: " << threads << std::endl;
  if(vm.count("max-cache"))
    std::cerr << "Using max phrases in caches: " << max_cache << std::endl;

  // -l accepts "a+e" / "a-e" (alpha +/- a small epsilon, resolved once
  // alpha is known below) or a plain non-negative number.
  if (strcmp(str_sig_filter_limit.c_str(),"a+e") == 0) {
    sig_filter_limit = ALPHA_PLUS_EPS;
  } else if (strcmp(str_sig_filter_limit.c_str(),"a-e") == 0) {
    sig_filter_limit = ALPHA_MINUS_EPS;
  } else {
    char *x;
    sig_filter_limit = strtod(str_sig_filter_limit.c_str(), &x);
    if (sig_filter_limit < 0.0) {
      std::cerr << "Filter limit (-l) must be either 'a+e', 'a-e' or a real number >= 0.0\n";
      usage();
    }
  }

  // A zero threshold means only the P(f|e) count filter is active.
  if (sig_filter_limit == 0.0) pef_filter_only = true;
  //-----------------------------------------------------------------------------
  // NOTE(review): optind comes from getopt but argument parsing uses
  // program_options, so optind stays at its initial value; usage() does
  // not exit, so this check only prints the blurb — confirm intent.
  if (optind != argc || ((efiles.empty() || ffiles.empty()) && !pef_filter_only)) {
    usage();
  }

  if (!pef_filter_only) {
    // Load one suffix array (.tdx vocabulary, .mct token track, .sfa
    // suffix array) per corpus on each side.
    size_t elines = 0;
    BOOST_FOREACH(std::string& efile, efiles) {
      e_sas.push_back(boost::shared_ptr<SA>(new SA()));
      e_sas.back()->V.open(efile + ".tdx");
      e_sas.back()->T.reset(new ttrack_t());
      e_sas.back()->T->open(efile + ".mct");
      e_sas.back()->I.open(efile + ".sfa", e_sas.back()->T);
      elines += e_sas.back()->T->size();
    }
    size_t flines = 0;
    BOOST_FOREACH(std::string& ffile, ffiles) {
      f_sas.push_back(boost::shared_ptr<SA>(new SA()));
      f_sas.back()->V.open(ffile + ".tdx");
      f_sas.back()->T.reset(new ttrack_t());
      f_sas.back()->T->open(ffile + ".mct");
      f_sas.back()->I.open(ffile + ".sfa", f_sas.back()->T);
      flines += f_sas.back()->T->size();
    }

    // The two sides must be sentence-parallel.
    if (elines != flines) {
      std::cerr << "Number of lines in e-corpus != number of lines in f-corpus!\n";
      usage();
      exit(1);
    } else {
      std::cerr << "Training corpus: " << elines << " lines\n";
      num_lines = elines;
    }

    // alpha = significance of a phrase pair seen exactly once each.
    p_111 = -log(fisher_exact(1,1,1));
    std::cerr << "\\alpha = " << p_111 << "\n";
    if (sig_filter_limit == ALPHA_MINUS_EPS) {
      sig_filter_limit = p_111 - 0.001;
    } else if (sig_filter_limit == ALPHA_PLUS_EPS) {
      sig_filter_limit = p_111 + 0.001;
    }
    std::cerr << "Sig filter threshold is = " << sig_filter_limit << "\n";
  } else {
    std::cerr << "Filtering using P(e|f) only. n=" << pfe_filter_limit << std::endl;
  }

  Cache::set_max_cache(max_cache);
  std::ios_base::sync_with_stdio(false);

  // All workers share stdin/stdout, synchronized inside filter_thread.
  boost::thread_group threadGroup;
  for(int i = 0; i < threads; i++)
    threadGroup.add_thread(new boost::thread(filter_thread, &std::cin, &std::cout, pfe_index));
  threadGroup.join_all();

  // Final statistics.
  float pfefper = (100.0*(float)nremoved_pfefilter)/(float)pt_lines;
  float sigfper = (100.0*(float)nremoved_sigfilter)/(float)pt_lines;
  std::cerr << "\n\n------------------------------------------------------\n"
            << " unfiltered phrases pairs: " << pt_lines << "\n"
            << "\n"
            << " P(f|e) filter [first]: " << nremoved_pfefilter << " (" << pfefper << "%)\n"
            << " significance filter: " << nremoved_sigfilter << " (" << sigfper << "%)\n"
            << " TOTAL FILTERED: " << (nremoved_pfefilter + nremoved_sigfilter) << " (" << (sigfper + pfefper) << "%)\n"
            << "\n"
            << " FILTERED phrase pairs: " << (pt_lines - nremoved_pfefilter - nremoved_sigfilter) << " (" << (100.0-sigfper - pfefper) << "%)\n"
            << "------------------------------------------------------\n";
}

View File

@ -5,7 +5,7 @@
#include <vector>
#ifndef NO_MOSES
#include "moses/FF/LexicalReordering/LexicalReorderingState.h"
#include "moses/FF/LexicalReordering/LRState.h"
#endif
namespace sapt {

View File

@ -4,7 +4,7 @@
#include "ug_typedefs.h"
#include "ug_bitext_pstats.h"
#ifndef NO_MOSES
#include "moses/FF/LexicalReordering/LexicalReorderingState.h"
#include "moses/FF/LexicalReordering/LRState.h"
#endif
#include "boost/format.hpp"
#include "tpt_tokenindex.h"

View File

@ -42,7 +42,7 @@ TrellisPath::TrellisPath(const Hypothesis *hypo)
void TrellisPath::InitTotalScore()
{
m_totalScore = m_path[0]->GetWinningHypo()->GetFutureScore();
m_totalScore = m_path[0]->GetWinningHypo()->GetFutureScore();
//calc score
size_t sizePath = m_path.size();
@ -50,7 +50,7 @@ void TrellisPath::InitTotalScore()
const Hypothesis *hypo = m_path[pos];
const Hypothesis *winningHypo = hypo->GetWinningHypo();
if (hypo != winningHypo) {
m_totalScore = m_totalScore - winningHypo->GetFutureScore() + hypo->GetFutureScore();
m_totalScore += hypo->GetFutureScore() - winningHypo->GetFutureScore();
}
}
}
@ -169,9 +169,6 @@ TrellisPath::
GetScoreBreakdown() const
{
if (!m_scoreBreakdown) {
float totalScore = m_path[0]->GetWinningHypo()->GetFutureScore();
// calculated for sanity check only
m_scoreBreakdown.reset(new ScoreComponentCollection());
m_scoreBreakdown->PlusEquals(m_path[0]->GetWinningHypo()->GetScoreBreakdown());
@ -184,13 +181,10 @@ GetScoreBreakdown() const
const Hypothesis *hypo = m_path[pos];
const Hypothesis *winningHypo = hypo->GetWinningHypo();
if (hypo != winningHypo) {
totalScore += hypo->GetFutureScore() - winningHypo->GetFutureScore();
m_scoreBreakdown->MinusEquals(winningHypo->GetScoreBreakdown());
m_scoreBreakdown->PlusEquals(hypo->GetScoreBreakdown());
}
}
assert(totalScore == m_totalScore);
}
return m_scoreBreakdown;

View File

@ -3,6 +3,7 @@
#include "moses/ContextScope.h"
#include <boost/foreach.hpp>
#include "moses/Util.h"
#include "moses/TreeInput.h"
#include "moses/Hypothesis.h"
namespace MosesServer
@ -24,6 +25,7 @@ using Moses::FValue;
using Moses::PhraseDictionaryMultiModel;
using Moses::FindPhraseDictionary;
using Moses::Sentence;
using Moses::TreeInput;
boost::shared_ptr<TranslationRequest>
TranslationRequest::
@ -317,7 +319,13 @@ parse_request(std::map<std::string, xmlrpc_c::value> const& params)
// for (size_t i = 1; i < tmp.size(); i += 2)
// m_bias[xmlrpc_c::value_int(tmp[i-1])] = xmlrpc_c::value_double(tmp[i]);
// }
m_source.reset(new Sentence(m_options,0,m_source_string));
if (is_syntax(m_options->search.algo)) {
m_source.reset(new TreeInput(m_options));
istringstream in(m_source_string + "\n");
m_source->Read(in);
} else {
m_source.reset(new Sentence(m_options,0,m_source_string));
}
} // end of Translationtask::parse_request()
@ -334,7 +342,7 @@ run_chart_decoder()
const Moses::ChartHypothesis *hypo = manager.GetBestHypothesis();
ostringstream out;
outputChartHypo(out,hypo);
if (hypo) outputChartHypo(out,hypo);
m_target_string = out.str();
m_retData["text"] = xmlrpc_c::value_string(m_target_string);

View File

@ -311,12 +311,14 @@ std::string ExtractionPhrasePair::CollectAllPropertyValues(const std::string &ke
std::ostringstream oss;
for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin();
iter!=allPropertyValues->end(); ++iter) {
if (iter!=allPropertyValues->begin()) {
if (!(iter->first).empty()) {
if (iter!=allPropertyValues->begin()) {
oss << " ";
}
oss << iter->first;
oss << " ";
oss << iter->second;
}
oss << iter->first;
oss << " ";
oss << iter->second;
}
std::string allPropertyValuesString(oss.str());

View File

@ -50,7 +50,10 @@ private:
bool onlyOutputSpanInfo;
bool gzOutput;
std::string instanceWeightsFile; //weights for each sentence
bool targetConstituentConstrainedFlag;
bool targetConstituentBoundariesFlag;
bool flexScoreFlag;
bool singleWordHeuristicFlag;
public:
std::vector<std::string> placeholders;
@ -72,7 +75,10 @@ public:
includeSentenceIdFlag(false),
onlyOutputSpanInfo(false),
gzOutput(false),
targetConstituentConstrainedFlag(false),
targetConstituentBoundariesFlag(false),
flexScoreFlag(false),
singleWordHeuristicFlag(false),
debug(false) {
}
@ -116,9 +122,18 @@ public:
void initInstanceWeightsFile(const char* initInstanceWeightsFile) {
instanceWeightsFile = std::string(initInstanceWeightsFile);
}
void initTargetConstituentConstrainedFlag(const bool initTargetConstituentConstrainedFlag) {
targetConstituentConstrainedFlag = initTargetConstituentConstrainedFlag;
}
void initTargetConstituentBoundariesFlag(const bool initTargetConstituentBoundariesFlag) {
targetConstituentBoundariesFlag = initTargetConstituentBoundariesFlag;
}
void initFlexScoreFlag(const bool initflexScoreFlag) {
flexScoreFlag=initflexScoreFlag;
}
void initSingleWordHeuristicFlag(const bool initSingleWordHeuristicFlag) {
singleWordHeuristicFlag = initSingleWordHeuristicFlag;
}
// functions for getting values
bool isAllModelsOutputFlag() const {
@ -160,9 +175,18 @@ public:
std::string getInstanceWeightsFile() const {
return instanceWeightsFile;
}
bool isTargetConstituentConstrainedFlag() const {
return targetConstituentConstrainedFlag;
}
bool isTargetConstituentBoundariesFlag() const {
return targetConstituentBoundariesFlag;
}
bool isFlexScoreFlag() const {
return flexScoreFlag;
}
bool isSingleWordHeuristicFlag() const {
return singleWordHeuristicFlag;
}
};
}

View File

@ -18,8 +18,6 @@
***********************************************************************/
#pragma once
#ifndef RULEEXTRACTIONOPTIONS_H_INCLUDED_
#define RULEEXTRACTIONOPTIONS_H_INCLUDED_
namespace MosesTraining
{
@ -95,4 +93,3 @@ public:
}
#endif

View File

@ -35,7 +35,7 @@ namespace MosesTraining
bool SentenceAlignmentWithSyntax::processTargetSentence(const char * targetString, int sentenceID, bool boundaryRules)
{
if (!m_options.targetSyntax) {
if (!m_targetSyntax) {
return SentenceAlignment::processTargetSentence(targetString, sentenceID, boundaryRules);
}
@ -56,7 +56,7 @@ bool SentenceAlignmentWithSyntax::processTargetSentence(const char * targetStrin
bool SentenceAlignmentWithSyntax::processSourceSentence(const char * sourceString, int sentenceID, bool boundaryRules)
{
if (!m_options.sourceSyntax) {
if (!m_sourceSyntax) {
return SentenceAlignment::processSourceSentence(sourceString, sentenceID, boundaryRules);
}

View File

@ -18,8 +18,6 @@
***********************************************************************/
#pragma once
#ifndef SENTENCEALIGNMENTWITHSYNTAX_H_INCLUDED_
#define SENTENCEALIGNMENTWITHSYNTAX_H_INCLUDED_
#include <map>
#include <set>
@ -42,18 +40,20 @@ public:
std::set<std::string> & m_sourceLabelCollection;
std::map<std::string, int> & m_targetTopLabelCollection;
std::map<std::string, int> & m_sourceTopLabelCollection;
const RuleExtractionOptions & m_options;
const bool m_targetSyntax, m_sourceSyntax;
SentenceAlignmentWithSyntax(std::set<std::string> & tgtLabelColl,
std::set<std::string> & srcLabelColl,
std::map<std::string,int> & tgtTopLabelColl,
std::map<std::string,int> & srcTopLabelColl,
const RuleExtractionOptions & options)
bool targetSyntax,
bool sourceSyntax)
: m_targetLabelCollection(tgtLabelColl)
, m_sourceLabelCollection(srcLabelColl)
, m_targetTopLabelCollection(tgtTopLabelColl)
, m_sourceTopLabelCollection(srcTopLabelColl)
, m_options(options) {
, m_targetSyntax(targetSyntax)
, m_sourceSyntax(sourceSyntax) {
}
virtual ~SentenceAlignmentWithSyntax() {}
@ -67,4 +67,3 @@ public:
}
#endif

View File

@ -47,6 +47,8 @@ SyntaxNode *SyntaxNodeCollection::AddNode(int startPos, int endPos,
SyntaxNode* newNode = new SyntaxNode(label, startPos, endPos);
m_nodes.push_back( newNode );
m_index[ startPos ][ endPos ].push_back( newNode );
m_endPositionsIndex[ endPos ].push_back( newNode );
m_startPositionsIndex[ startPos ].push_back( newNode ); // TODO: may not need this: access m_index by startPos and iterate over its InnerNodeIndex (= end positions)?
m_numWords = std::max(endPos+1, m_numWords);
return newNode;
}
@ -70,6 +72,36 @@ const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodes(
return endIndex->second;
}
bool SyntaxNodeCollection::HasNodeStartingAtPosition( int startPos ) const
{
return GetNodesByStartPosition(startPos).size() > 0;
}
const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodesByStartPosition(
int startPos ) const
{
InnerNodeIndex::const_iterator startIndex = m_startPositionsIndex.find( startPos );
if (startIndex == m_startPositionsIndex.end() )
return m_emptyNode;
return startIndex->second;
}
bool SyntaxNodeCollection::HasNodeEndingAtPosition( int endPos ) const
{
return GetNodesByEndPosition(endPos).size() > 0;
}
const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodesByEndPosition(
int endPos ) const
{
InnerNodeIndex::const_iterator endIndex = m_endPositionsIndex.find( endPos );
if (endIndex == m_endPositionsIndex.end() )
return m_emptyNode;
return endIndex->second;
}
std::auto_ptr<SyntaxTree> SyntaxNodeCollection::ExtractTree()
{
std::map<SyntaxNode *, SyntaxTree *> nodeToTree;

View File

@ -50,6 +50,11 @@ public:
//! Lookup the SyntaxNodes for a given span.
const std::vector< SyntaxNode* >& GetNodes( int startPos, int endPos ) const;
bool HasNodeStartingAtPosition( int startPos ) const;
const std::vector< SyntaxNode* >& GetNodesByStartPosition( int startPos ) const;
bool HasNodeEndingAtPosition( int endPos ) const;
const std::vector< SyntaxNode* >& GetNodesByEndPosition( int endPos ) const;
//! Get a vector of pointers to all SyntaxNodes (unordered).
const std::vector< SyntaxNode* >& GetAllNodes() {
return m_nodes;
@ -78,6 +83,9 @@ private:
NodeIndex m_index;
int m_numWords;
std::vector< SyntaxNode* > m_emptyNode;
InnerNodeIndex m_endPositionsIndex;
InnerNodeIndex m_startPositionsIndex;
};
} // namespace MosesTraining

View File

@ -1,11 +1,3 @@
/*
* extract.cpp
* Modified by: Rohit Gupta CDAC, Mumbai, India
* on July 15, 2012 to implement parallel processing
* Modified by: Nadi Tomeh - LIMSI/CNRS
* Machine Translation Marathon 2010, Dublin
*/
#include <cstdio>
#include <iostream>
#include <fstream>
@ -20,11 +12,13 @@
#include <vector>
#include <limits>
#include "SentenceAlignment.h"
#include "tables-core.h"
#include "InputFileStream.h"
#include "OutputFileStream.h"
#include "PhraseExtractionOptions.h"
#include "SentenceAlignmentWithSyntax.h"
#include "SyntaxNode.h"
#include "moses/Util.h"
using namespace std;
using namespace MosesTraining;
@ -46,14 +40,14 @@ typedef vector < HPhrase > HPhraseVector;
// The key of the map is the English index and the value is a set of the source ones
typedef map <int, set<int> > HSentenceVertices;
REO_POS getOrientWordModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
REO_POS getOrientWordModel(SentenceAlignmentWithSyntax &, REO_MODEL_TYPE, bool, bool,
int, int, int, int, int, int, int,
bool (*)(int, int), bool (*)(int, int));
REO_POS getOrientPhraseModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
REO_POS getOrientPhraseModel(SentenceAlignmentWithSyntax &, REO_MODEL_TYPE, bool, bool,
int, int, int, int, int, int, int,
bool (*)(int, int), bool (*)(int, int),
const HSentenceVertices &, const HSentenceVertices &);
REO_POS getOrientHierModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
REO_POS getOrientHierModel(SentenceAlignmentWithSyntax &, REO_MODEL_TYPE, bool, bool,
int, int, int, int, int, int, int,
bool (*)(int, int), bool (*)(int, int),
const HSentenceVertices &, const HSentenceVertices &,
@ -69,25 +63,16 @@ bool ge(int, int);
bool le(int, int);
bool lt(int, int);
bool isAligned (SentenceAlignment &, int, int);
bool isAligned (SentenceAlignmentWithSyntax &, int, int);
int sentenceOffset = 0;
std::vector<std::string> Tokenize(const std::string& str,
const std::string& delimiters = " \t");
bool flexScoreFlag = false;
}
namespace MosesTraining
{
class ExtractTask
{
public:
ExtractTask(
size_t id, SentenceAlignment &sentence,
size_t id, SentenceAlignmentWithSyntax &sentence,
PhraseExtractionOptions &initoptions,
Moses::OutputFileStream &extractFile,
Moses::OutputFileStream &extractFileInv,
@ -109,14 +94,26 @@ private:
vector< string > m_extractedPhrasesSid;
vector< string > m_extractedPhrasesContext;
vector< string > m_extractedPhrasesContextInv;
void extractBase(SentenceAlignment &);
void extract(SentenceAlignment &);
void addPhrase(SentenceAlignment &, int, int, int, int, string &);
void extractBase();
void extract();
void addPhrase(int, int, int, int, const std::string &);
void writePhrasesToFile();
bool checkPlaceholders (const SentenceAlignment &sentence, int startE, int endE, int startF, int endF);
bool isPlaceholder(const string &word);
bool checkPlaceholders(int startE, int endE, int startF, int endF) const;
bool isPlaceholder(const string &word) const;
bool checkTargetConstituentBoundaries(int startE, int endE, int startF, int endF,
ostringstream &outextractstrPhraseProperties) const;
void getOrientationInfo(int startE, int endE, int startF, int endF,
const HSentenceVertices& inTopLeft,
const HSentenceVertices& inTopRight,
const HSentenceVertices& inBottomLeft,
const HSentenceVertices& inBottomRight,
const HSentenceVertices& outTopLeft,
const HSentenceVertices& outTopRight,
const HSentenceVertices& outBottomLeft,
const HSentenceVertices& outBottomRight,
std::string &orientationInfo) const;
SentenceAlignment &m_sentence;
SentenceAlignmentWithSyntax &m_sentence;
const PhraseExtractionOptions &m_options;
Moses::OutputFileStream &m_extractFile;
Moses::OutputFileStream &m_extractFileInv;
@ -128,12 +125,13 @@ private:
int main(int argc, char* argv[])
{
cerr << "PhraseExtract v1.4, written by Philipp Koehn\n"
<< "phrase extraction from an aligned parallel corpus\n";
cerr << "PhraseExtract v1.5, written by Philipp Koehn et al." << std::endl
<< "phrase extraction from an aligned parallel corpus" << std::endl;
if (argc < 6) {
cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] ";
cerr<<"| --OnlyOutputSpanInfo | --NoTTable | --GZOutput | --IncludeSentenceId | --SentenceOffset n | --InstanceWeights filename ]\n";
cerr << "| --OnlyOutputSpanInfo | --NoTTable | --GZOutput | --IncludeSentenceId | --SentenceOffset n | --InstanceWeights filename ";
cerr << "| --TargetConstituentConstrained | --TargetConstituentBoundaries ]" << std::endl;
exit(1);
}
@ -153,8 +151,14 @@ int main(int argc, char* argv[])
options.initOnlyOutputSpanInfo(true);
} else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) {
options.initOrientationFlag(true);
} else if (strcmp(argv[i],"--TargetConstituentConstrained") == 0) {
options.initTargetConstituentConstrainedFlag(true);
} else if (strcmp(argv[i],"--TargetConstituentBoundaries") == 0) {
options.initTargetConstituentBoundariesFlag(true);
} else if (strcmp(argv[i],"--FlexibilityScore") == 0) {
options.initFlexScoreFlag(true);
} else if (strcmp(argv[i],"--SingleWordHeuristic") == 0) {
options.initSingleWordHeuristicFlag(true);
} else if (strcmp(argv[i],"--NoTTable") == 0) {
options.initTranslationFlag(false);
} else if (strcmp(argv[i], "--IncludeSentenceId") == 0) {
@ -231,9 +235,9 @@ int main(int argc, char* argv[])
} else if (strcmp(argv[i], "--Placeholders") == 0) {
++i;
string str = argv[i];
options.placeholders = Tokenize(str.c_str(), ",");
Moses::Tokenize(options.placeholders, str.c_str(), ",");
} else {
cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'" << std::endl;
exit(1);
}
}
@ -278,11 +282,16 @@ int main(int argc, char* argv[])
extractFileContextInv.Open(fileNameExtractContextInv.c_str());
}
// stats on labels for glue grammar and unknown word label probabilities
set< string > targetLabelCollection, sourceLabelCollection;
map< string, int > targetTopLabelCollection, sourceTopLabelCollection;
const bool targetSyntax = true;
int i = sentenceOffset;
string englishString, foreignString, alignmentString, weightString;
while(getline(*eFileP, englishString)) {
while (getline(*eFileP, englishString)) {
// Print progress dots to stderr.
i++;
if (i%10000 == 0) cerr << "." << flush;
@ -293,7 +302,10 @@ int main(int argc, char* argv[])
getline(*iwFileP, weightString);
}
SentenceAlignment sentence;
SentenceAlignmentWithSyntax sentence
(targetLabelCollection, sourceLabelCollection,
targetTopLabelCollection, sourceTopLabelCollection,
targetSyntax, false);
// cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl;
//az: output src, tgt, and alingment line
if (options.isOnlyOutputSpanInfo()) {
@ -347,7 +359,7 @@ namespace MosesTraining
{
void ExtractTask::Run()
{
extract(m_sentence);
extract();
writePhrasesToFile();
m_extractedPhrases.clear();
m_extractedPhrasesInv.clear();
@ -358,10 +370,10 @@ void ExtractTask::Run()
}
void ExtractTask::extract(SentenceAlignment &sentence)
void ExtractTask::extract()
{
int countE = sentence.target.size();
int countF = sentence.source.size();
int countE = m_sentence.target.size();
int countF = m_sentence.source.size();
HPhraseVector inboundPhrases;
@ -376,21 +388,20 @@ void ExtractTask::extract(SentenceAlignment &sentence)
HSentenceVertices outBottomRight;
bool relaxLimit = m_options.isHierModel();
bool buildExtraStructure = m_options.isPhraseModel() || m_options.isHierModel();
// check alignments for target phrase startE...endE
// loop over extracted phrases which are compatible with the word-alignments
for(int startE=0; startE<countE; startE++) {
for(int endE=startE;
(endE<countE && (relaxLimit || endE<startE+m_options.maxPhraseLength));
endE++) {
for (int startE=0; startE<countE; startE++) {
for (int endE=startE;
(endE<countE && (relaxLimit || endE<startE+m_options.maxPhraseLength));
endE++) {
int minF = std::numeric_limits<int>::max();
int maxF = -1;
vector< int > usedF = sentence.alignedCountS;
for(int ei=startE; ei<=endE; ei++) {
for(size_t i=0; i<sentence.alignedToT[ei].size(); i++) {
int fi = sentence.alignedToT[ei][i];
vector< int > usedF = m_sentence.alignedCountS;
for (int ei=startE; ei<=endE; ei++) {
for (size_t i=0; i<m_sentence.alignedToT[ei].size(); i++) {
int fi = m_sentence.alignedToT[ei][i];
if (fi<minF) {
minF = fi;
}
@ -406,111 +417,142 @@ void ExtractTask::extract(SentenceAlignment &sentence)
// check if source words are aligned to out of bound target words
bool out_of_bounds = false;
for(int fi=minF; fi<=maxF && !out_of_bounds; fi++)
for (int fi=minF; fi<=maxF && !out_of_bounds; fi++)
if (usedF[fi]>0) {
// cout << "ouf of bounds: " << fi << "\n";
// cout << "ouf of bounds: " << fi << std::endl;
out_of_bounds = true;
}
// cout << "doing if for ( " << minF << "-" << maxF << ", " << startE << "," << endE << ")\n";
// cout << "doing if for ( " << minF << "-" << maxF << ", " << startE << "," << endE << ")" << std::endl;
if (!out_of_bounds) {
// start point of source phrase may retreat over unaligned
for(int startF=minF;
(startF>=0 &&
(relaxLimit || startF>maxF-m_options.maxPhraseLength) && // within length limit
(startF==minF || sentence.alignedCountS[startF]==0)); // unaligned
startF--)
for (int startF=minF;
(startF>=0 &&
(relaxLimit || startF>maxF-m_options.maxPhraseLength) && // within length limit
(startF==minF || m_sentence.alignedCountS[startF]==0)); // unaligned
startF--) {
// end point of source phrase may advance over unaligned
for(int endF=maxF;
(endF<countF &&
(relaxLimit || endF<startF+m_options.maxPhraseLength) && // within length limit
(endF==maxF || sentence.alignedCountS[endF]==0)); // unaligned
endF++) { // at this point we have extracted a phrase
if(buildExtraStructure) { // phrase || hier
if(endE-startE < m_options.maxPhraseLength && endF-startF < m_options.maxPhraseLength) { // within limit
inboundPhrases.push_back(HPhrase(HPhraseVertex(startF,startE),
HPhraseVertex(endF,endE)));
insertPhraseVertices(inTopLeft, inTopRight, inBottomLeft, inBottomRight,
startF, startE, endF, endE);
} else
insertPhraseVertices(outTopLeft, outTopRight, outBottomLeft, outBottomRight,
startF, startE, endF, endE);
for (int endF=maxF;
(endF<countF &&
(relaxLimit || endF<startF+m_options.maxPhraseLength) && // within length limit
(endF==maxF || m_sentence.alignedCountS[endF]==0)); // unaligned
endF++) { // at this point we have extracted a phrase
if(endE-startE < m_options.maxPhraseLength && endF-startF < m_options.maxPhraseLength) { // within limit
inboundPhrases.push_back(HPhrase(HPhraseVertex(startF,startE),
HPhraseVertex(endF,endE)));
insertPhraseVertices(inTopLeft, inTopRight, inBottomLeft, inBottomRight,
startF, startE, endF, endE);
} else {
string orientationInfo = "";
if(m_options.isWordModel()) {
REO_POS wordPrevOrient, wordNextOrient;
bool connectedLeftTopP = isAligned( sentence, startF-1, startE-1 );
bool connectedRightTopP = isAligned( sentence, endF+1, startE-1 );
bool connectedLeftTopN = isAligned( sentence, endF+1, endE+1 );
bool connectedRightTopN = isAligned( sentence, startF-1, endE+1 );
wordPrevOrient = getOrientWordModel(sentence, m_options.isWordType(), connectedLeftTopP, connectedRightTopP, startF, endF, startE, endE, countF, 0, 1, &ge, &lt);
wordNextOrient = getOrientWordModel(sentence, m_options.isWordType(), connectedLeftTopN, connectedRightTopN, endF, startF, endE, startE, 0, countF, -1, &lt, &ge);
orientationInfo += getOrientString(wordPrevOrient, m_options.isWordType()) + " " + getOrientString(wordNextOrient, m_options.isWordType());
// if(m_options.isAllModelsOutputFlag())
// " | | ";
}
addPhrase(sentence, startE, endE, startF, endF, orientationInfo);
insertPhraseVertices(outTopLeft, outTopRight, outBottomLeft, outBottomRight,
startF, startE, endF, endE);
}
}
}
}
}
}
}
if(buildExtraStructure) { // phrase || hier
string orientationInfo = "";
REO_POS wordPrevOrient=UNKNOWN, wordNextOrient=UNKNOWN, phrasePrevOrient, phraseNextOrient, hierPrevOrient, hierNextOrient;
std::string orientationInfo = "";
for(size_t i = 0; i < inboundPhrases.size(); i++) {
int startF = inboundPhrases[i].first.first;
int startE = inboundPhrases[i].first.second;
int endF = inboundPhrases[i].second.first;
int endE = inboundPhrases[i].second.second;
for (size_t i = 0; i < inboundPhrases.size(); i++) {
bool connectedLeftTopP = isAligned( sentence, startF-1, startE-1 );
bool connectedRightTopP = isAligned( sentence, endF+1, startE-1 );
bool connectedLeftTopN = isAligned( sentence, endF+1, endE+1 );
bool connectedRightTopN = isAligned( sentence, startF-1, endE+1 );
int startF = inboundPhrases[i].first.first;
int startE = inboundPhrases[i].first.second;
int endF = inboundPhrases[i].second.first;
int endE = inboundPhrases[i].second.second;
if(m_options.isWordModel()) {
wordPrevOrient = getOrientWordModel(sentence, m_options.isWordType(),
connectedLeftTopP, connectedRightTopP,
startF, endF, startE, endE, countF, 0, 1,
&ge, &lt);
wordNextOrient = getOrientWordModel(sentence, m_options.isWordType(),
connectedLeftTopN, connectedRightTopN,
endF, startF, endE, startE, 0, countF, -1,
&lt, &ge);
getOrientationInfo(startE, endE, startF, endF,
inTopLeft, inTopRight, inBottomLeft, inBottomRight,
outTopLeft, outTopRight, outBottomLeft, outBottomRight,
orientationInfo);
addPhrase(startE, endE, startF, endF, orientationInfo);
}
if (m_options.isSingleWordHeuristicFlag()) {
// add single word phrases that are not consistent with the word alignment
m_sentence.invertAlignment();
for (int ei=0; ei<countE; ei++) {
for (size_t i=0; i<m_sentence.alignedToT[ei].size(); i++) {
int fi = m_sentence.alignedToT[ei][i];
if ((m_sentence.alignedToT[ei].size() > 1) || (m_sentence.alignedToS[fi].size() > 1)) {
if (m_options.isOrientationFlag()) {
getOrientationInfo(ei, ei, fi, fi,
inTopLeft, inTopRight, inBottomLeft, inBottomRight,
outTopLeft, outTopRight, outBottomLeft, outBottomRight,
orientationInfo);
}
addPhrase(ei, ei, fi, fi, orientationInfo);
}
}
if (m_options.isPhraseModel()) {
phrasePrevOrient = getOrientPhraseModel(sentence, m_options.isPhraseType(),
connectedLeftTopP, connectedRightTopP,
startF, endF, startE, endE, countF-1, 0, 1, &ge, &lt, inBottomRight, inBottomLeft);
phraseNextOrient = getOrientPhraseModel(sentence, m_options.isPhraseType(),
connectedLeftTopN, connectedRightTopN,
endF, startF, endE, startE, 0, countF-1, -1, &lt, &ge, inBottomLeft, inBottomRight);
} else {
phrasePrevOrient = phraseNextOrient = UNKNOWN;
}
if(m_options.isHierModel()) {
hierPrevOrient = getOrientHierModel(sentence, m_options.isHierType(),
connectedLeftTopP, connectedRightTopP,
startF, endF, startE, endE, countF-1, 0, 1, &ge, &lt, inBottomRight, inBottomLeft, outBottomRight, outBottomLeft, phrasePrevOrient);
hierNextOrient = getOrientHierModel(sentence, m_options.isHierType(),
connectedLeftTopN, connectedRightTopN,
endF, startF, endE, startE, 0, countF-1, -1, &lt, &ge, inBottomLeft, inBottomRight, outBottomLeft, outBottomRight, phraseNextOrient);
}
orientationInfo = ((m_options.isWordModel())? getOrientString(wordPrevOrient, m_options.isWordType()) + " " + getOrientString(wordNextOrient, m_options.isWordType()) : "") + " | " +
((m_options.isPhraseModel())? getOrientString(phrasePrevOrient, m_options.isPhraseType()) + " " + getOrientString(phraseNextOrient, m_options.isPhraseType()) : "") + " | " +
((m_options.isHierModel())? getOrientString(hierPrevOrient, m_options.isHierType()) + " " + getOrientString(hierNextOrient, m_options.isHierType()) : "");
addPhrase(sentence, startE, endE, startF, endF, orientationInfo);
}
}
}
REO_POS getOrientWordModel(SentenceAlignment & sentence, REO_MODEL_TYPE modelType,
void ExtractTask::getOrientationInfo(int startE, int endE, int startF, int endF,
const HSentenceVertices& inTopLeft,
const HSentenceVertices& inTopRight,
const HSentenceVertices& inBottomLeft,
const HSentenceVertices& inBottomRight,
const HSentenceVertices& outTopLeft,
const HSentenceVertices& outTopRight,
const HSentenceVertices& outBottomLeft,
const HSentenceVertices& outBottomRight,
std::string &orientationInfo) const
{
REO_POS wordPrevOrient=UNKNOWN, wordNextOrient=UNKNOWN;
REO_POS phrasePrevOrient=UNKNOWN, phraseNextOrient=UNKNOWN;
REO_POS hierPrevOrient=UNKNOWN, hierNextOrient=UNKNOWN;
bool connectedLeftTopP = isAligned( m_sentence, startF-1, startE-1 );
bool connectedRightTopP = isAligned( m_sentence, endF+1, startE-1 );
bool connectedLeftTopN = isAligned( m_sentence, endF+1, endE+1 );
bool connectedRightTopN = isAligned( m_sentence, startF-1, endE+1 );
const int countF = m_sentence.source.size();
if (m_options.isWordModel()) {
wordPrevOrient = getOrientWordModel(m_sentence, m_options.isWordType(),
connectedLeftTopP, connectedRightTopP,
startF, endF, startE, endE, countF, 0, 1,
&ge, &lt);
wordNextOrient = getOrientWordModel(m_sentence, m_options.isWordType(),
connectedLeftTopN, connectedRightTopN,
endF, startF, endE, startE, 0, countF, -1,
&lt, &ge);
}
if (m_options.isPhraseModel()) {
phrasePrevOrient = getOrientPhraseModel(m_sentence, m_options.isPhraseType(),
connectedLeftTopP, connectedRightTopP,
startF, endF, startE, endE, countF-1, 0, 1, &ge, &lt, inBottomRight, inBottomLeft);
phraseNextOrient = getOrientPhraseModel(m_sentence, m_options.isPhraseType(),
connectedLeftTopN, connectedRightTopN,
endF, startF, endE, startE, 0, countF-1, -1, &lt, &ge, inBottomLeft, inBottomRight);
}
if (m_options.isHierModel()) {
hierPrevOrient = getOrientHierModel(m_sentence, m_options.isHierType(),
connectedLeftTopP, connectedRightTopP,
startF, endF, startE, endE, countF-1, 0, 1, &ge, &lt, inBottomRight, inBottomLeft, outBottomRight, outBottomLeft, phrasePrevOrient);
hierNextOrient = getOrientHierModel(m_sentence, m_options.isHierType(),
connectedLeftTopN, connectedRightTopN,
endF, startF, endE, startE, 0, countF-1, -1, &lt, &ge, inBottomLeft, inBottomRight, outBottomLeft, outBottomRight, phraseNextOrient);
}
if (m_options.isWordModel()) {
orientationInfo = getOrientString(wordPrevOrient, m_options.isWordType()) + " " + getOrientString(wordNextOrient, m_options.isWordType());
} else {
orientationInfo = " | " +
((m_options.isPhraseModel())? getOrientString(phrasePrevOrient, m_options.isPhraseType()) + " " + getOrientString(phraseNextOrient, m_options.isPhraseType()) : "") + " | " +
((m_options.isHierModel())? getOrientString(hierPrevOrient, m_options.isHierType()) + " " + getOrientString(hierNextOrient, m_options.isHierType()) : "");
}
}
REO_POS getOrientWordModel(SentenceAlignmentWithSyntax & sentence, REO_MODEL_TYPE modelType,
bool connectedLeftTop, bool connectedRightTop,
int startF, int endF, int startE, int endE, int countF, int zero, int unit,
bool (*ge)(int, int), bool (*lt)(int, int) )
@ -536,7 +578,7 @@ REO_POS getOrientWordModel(SentenceAlignment & sentence, REO_MODEL_TYPE modelTyp
}
// to be called with countF-1 instead of countF
REO_POS getOrientPhraseModel (SentenceAlignment & sentence, REO_MODEL_TYPE modelType,
REO_POS getOrientPhraseModel (SentenceAlignmentWithSyntax & sentence, REO_MODEL_TYPE modelType,
bool connectedLeftTop, bool connectedRightTop,
int startF, int endF, int startE, int endE, int countF, int zero, int unit,
bool (*ge)(int, int), bool (*lt)(int, int),
@ -572,7 +614,7 @@ REO_POS getOrientPhraseModel (SentenceAlignment & sentence, REO_MODEL_TYPE model
}
// to be called with countF-1 instead of countF
REO_POS getOrientHierModel (SentenceAlignment & sentence, REO_MODEL_TYPE modelType,
REO_POS getOrientHierModel (SentenceAlignmentWithSyntax & sentence, REO_MODEL_TYPE modelType,
bool connectedLeftTop, bool connectedRightTop,
int startF, int endF, int startE, int endE, int countF, int zero, int unit,
bool (*ge)(int, int), bool (*lt)(int, int),
@ -624,7 +666,7 @@ REO_POS getOrientHierModel (SentenceAlignment & sentence, REO_MODEL_TYPE modelTy
return UNKNOWN;
}
bool isAligned ( SentenceAlignment &sentence, int fi, int ei )
bool isAligned ( SentenceAlignmentWithSyntax &sentence, int fi, int ei )
{
if (ei == -1 && fi == -1)
return true;
@ -660,7 +702,7 @@ void insertVertex( HSentenceVertices & corners, int x, int y )
set<int> tmp;
tmp.insert(x);
pair< HSentenceVertices::iterator, bool > ret = corners.insert( pair<int, set<int> > (y, tmp) );
if(ret.second == false) {
if (ret.second == false) {
ret.first->second.insert(x);
}
}
@ -711,41 +753,174 @@ string getOrientString(REO_POS orient, REO_MODEL_TYPE modelType)
return "";
}
void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, int endF , string &orientationInfo)
bool ExtractTask::checkTargetConstituentBoundaries(int startE, int endE, int startF, int endF,
ostringstream &outextractstrPhraseProperties) const
{
// source
// // cout << "adding ( " << startF << "-" << endF << ", " << startE << "-" << endE << ")\n";
if (m_options.isTargetConstituentBoundariesFlag()) {
outextractstrPhraseProperties << " {{TargetConstituentBoundariesLeft ";
}
bool validTargetConstituentBoundaries = false;
bool outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst = true;
if (m_options.isTargetConstituentBoundariesFlag()) {
if (startE==0) {
outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst = false;
outextractstrPhraseProperties << "BOS_";
}
}
if (!m_sentence.targetTree.HasNodeStartingAtPosition(startE)) {
validTargetConstituentBoundaries = false;
} else {
const std::vector< SyntaxNode* >& startingNodes = m_sentence.targetTree.GetNodesByStartPosition(startE);
for ( std::vector< SyntaxNode* >::const_reverse_iterator iter = startingNodes.rbegin(); iter != startingNodes.rend(); ++iter ) {
if ( (*iter)->end == endE ) {
validTargetConstituentBoundaries = true;
if (!m_options.isTargetConstituentBoundariesFlag()) {
break;
}
}
if (m_options.isTargetConstituentBoundariesFlag()) {
if (outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst) {
outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst = false;
} else {
outextractstrPhraseProperties << "<";
}
outextractstrPhraseProperties << (*iter)->label;
}
}
}
if (m_options.isTargetConstituentBoundariesFlag()) {
if (outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst) {
outextractstrPhraseProperties << "<";
}
outextractstrPhraseProperties << "}}";
}
if (m_options.isTargetConstituentConstrainedFlag() && !validTargetConstituentBoundaries) {
// skip over all boundary punctuation and check again
bool relaxedValidTargetConstituentBoundaries = false;
int relaxedStartE = startE;
int relaxedEndE = endE;
const std::string punctuation = ",;.:!?";
while ( (relaxedStartE < endE) &&
(m_sentence.target[relaxedStartE].size() == 1) &&
(punctuation.find(m_sentence.target[relaxedStartE].at(0)) != std::string::npos) ) {
++relaxedStartE;
}
while ( (relaxedEndE > relaxedStartE) &&
(m_sentence.target[relaxedEndE].size() == 1) &&
(punctuation.find(m_sentence.target[relaxedEndE].at(0)) != std::string::npos) ) {
--relaxedEndE;
}
if ( (relaxedStartE != startE) || (relaxedEndE !=endE) ) {
const std::vector< SyntaxNode* >& startingNodes = m_sentence.targetTree.GetNodesByStartPosition(relaxedStartE);
for ( std::vector< SyntaxNode* >::const_reverse_iterator iter = startingNodes.rbegin();
(iter != startingNodes.rend() && !relaxedValidTargetConstituentBoundaries);
++iter ) {
if ( (*iter)->end == relaxedEndE ) {
relaxedValidTargetConstituentBoundaries = true;
}
}
}
if (!relaxedValidTargetConstituentBoundaries) {
return false;
}
}
if (m_options.isTargetConstituentBoundariesFlag()) {
outextractstrPhraseProperties << " {{TargetConstituentBoundariesRightAdjacent ";
outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst = true;
if (endE==(int)m_sentence.target.size()-1) {
outextractstrPhraseProperties << "EOS_";
outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst = false;
} else {
const std::vector< SyntaxNode* >& adjacentNodes = m_sentence.targetTree.GetNodesByStartPosition(endE+1);
for ( std::vector< SyntaxNode* >::const_reverse_iterator iter = adjacentNodes.rbegin(); iter != adjacentNodes.rend(); ++iter ) {
if (outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst) {
outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst = false;
} else {
outextractstrPhraseProperties << "<";
}
outextractstrPhraseProperties << (*iter)->label;
}
}
if (outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst) {
outextractstrPhraseProperties << "<";
}
outextractstrPhraseProperties << "}}";
}
return true;
}
void ExtractTask::addPhrase( int startE, int endE, int startF, int endF,
const std::string &orientationInfo)
{
ostringstream outextractstrPhraseProperties;
if (m_options.isTargetConstituentBoundariesFlag() || m_options.isTargetConstituentConstrainedFlag()) {
bool isTargetConstituentCovered = checkTargetConstituentBoundaries(startE, endE, startF, endF, outextractstrPhraseProperties);
if (m_options.isTargetConstituentBoundariesFlag() && !isTargetConstituentCovered) {
return;
}
}
if (m_options.placeholders.size() && !checkPlaceholders(startE, endE, startF, endF)) {
return;
}
if (m_options.isOnlyOutputSpanInfo()) {
cout << startF << " " << endF << " " << startE << " " << endE << std::endl;
return;
}
ostringstream outextractstr;
ostringstream outextractstrInv;
ostringstream outextractstrOrientation;
if (m_options.isOnlyOutputSpanInfo()) {
cout << startF << " " << endF << " " << startE << " " << endE << endl;
return;
}
if (m_options.placeholders.size() && !checkPlaceholders(sentence, startE, endE, startF, endF)) {
return;
}
if (m_options.debug) {
outextractstr << "sentenceID=" << sentence.sentenceID << " ";
outextractstrInv << "sentenceID=" << sentence.sentenceID << " ";
outextractstrOrientation << "sentenceID=" << sentence.sentenceID << " ";
outextractstr << "sentenceID=" << m_sentence.sentenceID << " ";
outextractstrInv << "sentenceID=" << m_sentence.sentenceID << " ";
outextractstrOrientation << "sentenceID=" << m_sentence.sentenceID << " ";
}
// source
for(int fi=startF; fi<=endF; fi++) {
if (m_options.isTranslationFlag()) outextractstr << sentence.source[fi] << " ";
if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.source[fi] << " ";
if (m_options.isTranslationFlag()) outextractstr << m_sentence.source[fi] << " ";
if (m_options.isOrientationFlag()) outextractstrOrientation << m_sentence.source[fi] << " ";
}
if (m_options.isTranslationFlag()) outextractstr << "||| ";
if (m_options.isOrientationFlag()) outextractstrOrientation << "||| ";
// target
for(int ei=startE; ei<=endE; ei++) {
if (m_options.isTranslationFlag()) outextractstr << sentence.target[ei] << " ";
if (m_options.isTranslationFlag()) outextractstrInv << sentence.target[ei] << " ";
if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.target[ei] << " ";
if (m_options.isTranslationFlag()) {
outextractstr << m_sentence.target[ei] << " ";
outextractstrInv << m_sentence.target[ei] << " ";
}
if (m_options.isOrientationFlag()) {
outextractstrOrientation << m_sentence.target[ei] << " ";
}
}
if (m_options.isTranslationFlag()) outextractstr << "|||";
if (m_options.isTranslationFlag()) outextractstrInv << "||| ";
@ -755,17 +930,22 @@ void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE,
if (m_options.isTranslationFlag()) {
for(int fi=startF; fi<=endF; fi++)
outextractstrInv << sentence.source[fi] << " ";
outextractstrInv << m_sentence.source[fi] << " ";
outextractstrInv << "|||";
}
// alignment
if (m_options.isTranslationFlag()) {
for(int ei=startE; ei<=endE; ei++) {
for(unsigned int i=0; i<sentence.alignedToT[ei].size(); i++) {
int fi = sentence.alignedToT[ei][i];
outextractstr << " " << fi-startF << "-" << ei-startE;
outextractstrInv << " " << ei-startE << "-" << fi-startF;
if (m_options.isSingleWordHeuristicFlag() && (startE==endE) && (startF==endF)) {
outextractstr << " 0-0";
outextractstrInv << " 0-0";
} else {
for(int ei=startE; ei<=endE; ei++) {
for(unsigned int i=0; i<m_sentence.alignedToT[ei].size(); i++) {
int fi = m_sentence.alignedToT[ei][i];
outextractstr << " " << fi-startF << "-" << ei-startE;
outextractstrInv << " " << ei-startE << "-" << fi-startF;
}
}
}
}
@ -774,20 +954,20 @@ void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE,
outextractstrOrientation << orientationInfo;
if (m_options.isIncludeSentenceIdFlag()) {
outextractstr << " ||| " << sentence.sentenceID;
outextractstr << " ||| " << m_sentence.sentenceID;
}
if (m_options.getInstanceWeightsFile().length()) {
if (m_options.isTranslationFlag()) {
outextractstr << " ||| " << sentence.weightString;
outextractstrInv << " ||| " << sentence.weightString;
outextractstr << " ||| " << m_sentence.weightString;
outextractstrInv << " ||| " << m_sentence.weightString;
}
if (m_options.isOrientationFlag()) {
outextractstrOrientation << " ||| " << sentence.weightString;
outextractstrOrientation << " ||| " << m_sentence.weightString;
}
}
outextractstr << outextractstrPhraseProperties.str();
// generate two lines for every extracted phrase:
// once with left, once with right context
@ -797,20 +977,20 @@ void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE,
ostringstream outextractstrContextInv;
for(int fi=startF; fi<=endF; fi++) {
outextractstrContext << sentence.source[fi] << " ";
outextractstrContext << m_sentence.source[fi] << " ";
}
outextractstrContext << "||| ";
// target
for(int ei=startE; ei<=endE; ei++) {
outextractstrContext << sentence.target[ei] << " ";
outextractstrContextInv << sentence.target[ei] << " ";
outextractstrContext << m_sentence.target[ei] << " ";
outextractstrContextInv << m_sentence.target[ei] << " ";
}
outextractstrContext << "||| ";
outextractstrContextInv << "||| ";
for(int fi=startF; fi<=endF; fi++)
outextractstrContextInv << sentence.source[fi] << " ";
outextractstrContextInv << m_sentence.source[fi] << " ";
outextractstrContextInv << "|||";
@ -823,25 +1003,25 @@ void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE,
// write context to left
outextractstrContext << "< ";
if (startF == 0) outextractstrContext << "<s>";
else outextractstrContext << sentence.source[startF-1];
else outextractstrContext << m_sentence.source[startF-1];
outextractstrContextInv << " < ";
if (startE == 0) outextractstrContextInv << "<s>";
else outextractstrContextInv << sentence.target[startE-1];
else outextractstrContextInv << m_sentence.target[startE-1];
// write context to right
outextractstrContextRight << "> ";
if (endF+1 == sentence.source.size()) outextractstrContextRight << "<s>";
else outextractstrContextRight << sentence.source[endF+1];
if (endF+1 == (int)m_sentence.source.size()) outextractstrContextRight << "<s>";
else outextractstrContextRight << m_sentence.source[endF+1];
outextractstrContextRightInv << " > ";
if (endE+1 == sentence.target.size()) outextractstrContextRightInv << "<s>";
else outextractstrContextRightInv << sentence.target[endE+1];
if (endE+1 == (int)m_sentence.target.size()) outextractstrContextRightInv << "<s>";
else outextractstrContextRightInv << m_sentence.target[endE+1];
outextractstrContext << "\n";
outextractstrContextInv << "\n";
outextractstrContextRight << "\n";
outextractstrContextRightInv << "\n";
outextractstrContext << std::endl;
outextractstrContextInv << std::endl;
outextractstrContextRight << std::endl;
outextractstrContextRightInv << std::endl;
m_extractedPhrasesContext.push_back(outextractstrContext.str());
m_extractedPhrasesContextInv.push_back(outextractstrContextInv.str());
@ -849,9 +1029,9 @@ void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE,
m_extractedPhrasesContextInv.push_back(outextractstrContextRightInv.str());
}
if (m_options.isTranslationFlag()) outextractstr << "\n";
if (m_options.isTranslationFlag()) outextractstrInv << "\n";
if (m_options.isOrientationFlag()) outextractstrOrientation << "\n";
if (m_options.isTranslationFlag()) outextractstr << std::endl;
if (m_options.isTranslationFlag()) outextractstrInv << std::endl;
if (m_options.isOrientationFlag()) outextractstrOrientation << std::endl;
m_extractedPhrases.push_back(outextractstr.str());
@ -896,30 +1076,30 @@ void ExtractTask::writePhrasesToFile()
// if proper conditioning, we need the number of times a source phrase occured
void ExtractTask::extractBase( SentenceAlignment &sentence )
void ExtractTask::extractBase()
{
ostringstream outextractFile;
ostringstream outextractFileInv;
int countF = sentence.source.size();
int countF = m_sentence.source.size();
for(int startF=0; startF<countF; startF++) {
for(int endF=startF;
(endF<countF && endF<startF+m_options.maxPhraseLength);
endF++) {
for(int fi=startF; fi<=endF; fi++) {
outextractFile << sentence.source[fi] << " ";
outextractFile << m_sentence.source[fi] << " ";
}
outextractFile << "|||" << endl;
}
}
int countE = sentence.target.size();
int countE = m_sentence.target.size();
for(int startE=0; startE<countE; startE++) {
for(int endE=startE;
(endE<countE && endE<startE+m_options.maxPhraseLength);
endE++) {
for(int ei=startE; ei<=endE; ei++) {
outextractFileInv << sentence.target[ei] << " ";
outextractFileInv << m_sentence.target[ei] << " ";
}
outextractFileInv << "|||" << endl;
}
@ -930,17 +1110,17 @@ void ExtractTask::extractBase( SentenceAlignment &sentence )
}
bool ExtractTask::checkPlaceholders (const SentenceAlignment &sentence, int startE, int endE, int startF, int endF)
bool ExtractTask::checkPlaceholders(int startE, int endE, int startF, int endF) const
{
for (size_t pos = startF; pos <= endF; ++pos) {
const string &sourceWord = sentence.source[pos];
for (int pos = startF; pos <= endF; ++pos) {
const string &sourceWord = m_sentence.source[pos];
if (isPlaceholder(sourceWord)) {
if (sentence.alignedToS.at(pos).size() != 1) {
if (m_sentence.alignedToS.at(pos).size() != 1) {
return false;
} else {
// check it actually lines up to another placeholder
int targetPos = sentence.alignedToS.at(pos).at(0);
const string &otherWord = sentence.target[targetPos];
int targetPos = m_sentence.alignedToS.at(pos).at(0);
const string &otherWord = m_sentence.target[targetPos];
if (!isPlaceholder(otherWord)) {
return false;
}
@ -948,15 +1128,15 @@ bool ExtractTask::checkPlaceholders (const SentenceAlignment &sentence, int star
}
}
for (size_t pos = startE; pos <= endE; ++pos) {
const string &targetWord = sentence.target[pos];
for (int pos = startE; pos <= endE; ++pos) {
const string &targetWord = m_sentence.target[pos];
if (isPlaceholder(targetWord)) {
if (sentence.alignedToT.at(pos).size() != 1) {
if (m_sentence.alignedToT.at(pos).size() != 1) {
return false;
} else {
// check it actually lines up to another placeholder
int sourcePos = sentence.alignedToT.at(pos).at(0);
const string &otherWord = sentence.source[sourcePos];
int sourcePos = m_sentence.alignedToT.at(pos).at(0);
const string &otherWord = m_sentence.source[sourcePos];
if (!isPlaceholder(otherWord)) {
return false;
}
@ -966,7 +1146,7 @@ bool ExtractTask::checkPlaceholders (const SentenceAlignment &sentence, int star
return true;
}
bool ExtractTask::isPlaceholder(const string &word)
bool ExtractTask::isPlaceholder(const string &word) const
{
for (size_t i = 0; i < m_options.placeholders.size(); ++i) {
const string &placeholder = m_options.placeholders[i];
@ -976,28 +1156,5 @@ bool ExtractTask::isPlaceholder(const string &word)
}
return false;
}
/** Split a string into tokens wherever a character from `delimiters` occurs.
    Consecutive delimiter characters act as one separator, so the result
    contains no empty tokens.  Each delimiter is a single character; the
    usual delimiters are space and tab.
*/
std::vector<std::string> Tokenize(const std::string& str,
                                  const std::string& delimiters)
{
  std::vector<std::string> tokens;
  // Index of the first character of the current token (npos when done).
  std::string::size_type tokenStart = str.find_first_not_of(delimiters, 0);
  // Index of the delimiter terminating the current token (npos at the end).
  std::string::size_type tokenEnd = str.find_first_of(delimiters, tokenStart);
  while (tokenStart != std::string::npos || tokenEnd != std::string::npos) {
    // Emit the token spanning [tokenStart, tokenEnd).
    tokens.push_back(str.substr(tokenStart, tokenEnd - tokenStart));
    // Advance past the delimiter run to the start of the next token.
    tokenStart = str.find_first_not_of(delimiters, tokenEnd);
    tokenEnd = str.find_first_of(delimiters, tokenStart);
  }
  return tokens;
}
}

View File

@ -347,7 +347,8 @@ int main(int argc, char* argv[])
SentenceAlignmentWithSyntax sentence
(targetLabelCollection, sourceLabelCollection,
targetTopLabelCollection, sourceTopLabelCollection, options);
targetTopLabelCollection, sourceTopLabelCollection,
options.targetSyntax, options.sourceSyntax);
//az: output src, tgt, and alingment line
if (options.onlyOutputSpanInfo) {
cout << "LOG: SRC: " << sourceString << endl;

View File

@ -68,6 +68,7 @@ bool spanLength = false;
bool ruleLength = false;
bool nonTermContext = false;
bool nonTermContextTarget = false;
bool targetConstituentBoundariesFlag = false;
int countOfCounts[COC_MAX+1];
int totalDistinct = 0;
@ -286,6 +287,9 @@ int main(int argc, char* argv[])
} else if (strcmp(argv[i],"--NonTermContextTarget") == 0) {
nonTermContextTarget = true;
std::cerr << "non-term context (target)" << std::endl;
} else if (strcmp(argv[i],"--TargetConstituentBoundaries") == 0) {
targetConstituentBoundariesFlag = true;
std::cerr << "including target constituent boundaries information" << std::endl;
} else {
featureArgs.push_back(argv[i]);
++i;
@ -957,6 +961,18 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
}
}
// target constituent boundaries
if (targetConstituentBoundariesFlag && !inverseFlag) {
const std::string targetConstituentBoundariesLeftValues = phrasePair.CollectAllPropertyValues("TargetConstituentBoundariesLeft");
if (!targetConstituentBoundariesLeftValues.empty()) {
phraseTableFile << " {{TargetConstituentBoundariesLeft " << targetConstituentBoundariesLeftValues << "}}";
}
const std::string targetConstituentBoundariesRightAdjacentValues = phrasePair.CollectAllPropertyValues("TargetConstituentBoundariesRightAdjacent");
if (!targetConstituentBoundariesRightAdjacentValues.empty()) {
phraseTableFile << " {{TargetConstituentBoundariesRightAdjacent " << targetConstituentBoundariesRightAdjacentValues << "}}";
}
}
phraseTableFile << std::endl;
}

View File

@ -53,18 +53,18 @@ git submodule update regtest
# -- compile from scratch with server, run regtests
set -x
if [ "$full" == true ] ; then
./bjam -j$j --with-irstlm=$irstlm --with-boost=$boost --with-cmph=$cmph --no-xmlrpc-c --with-regtest=$regtest -a $skipcompact $@ $q || exit $?
./bjam -j$j --with-mm --with-mm-extras --with-irstlm=$irstlm --with-boost=$boost --with-cmph=$cmph --no-xmlrpc-c --with-regtest=$regtest -a $skipcompact $@ $q || exit $?
if ./regression-testing/run-single-test.perl --server --startuptest ; then
./bjam -j$j --with-irstlm=$irstlm --with-boost=$boost --with-cmph=$cmph $xmlrpc --with-regtest=$regtest -a $skipcompact $@ $q
./bjam -j$j --with-mm --with-mm-extras --with-irstlm=$irstlm --with-boost=$boost --with-cmph=$cmph $xmlrpc --with-regtest=$regtest -a $skipcompact $@ $q
fi
else
# when investigating failures, always run single-threaded
if [ "$q" == "-q" ] ; then j=1; fi
if ./regression-testing/run-single-test.perl --server --startuptest ; then
./bjam -j$j $q $a --with-irstlm=$irstlm --with-boost=$boost --with-cmph=$cmph $xmlrpc --with-regtest=$regtest $skipcompact $@
./bjam -j$j --with-mm $q $a --with-irstlm=$irstlm --with-boost=$boost --with-cmph=$cmph $xmlrpc --with-regtest=$regtest $skipcompact $@
else
./bjam -j$j $q $a --with-irstlm=$irstlm --with-boost=$boost --with-cmph=$cmph --no-xmlrpc-c --with-regtest=$regtest $skipcompact $@
./bjam -j$j --with-mm --with-mm-extras $q $a --with-irstlm=$irstlm --with-boost=$boost --with-cmph=$cmph --no-xmlrpc-c --with-regtest=$regtest $skipcompact $@
fi
fi

View File

@ -811,7 +811,8 @@ generation-prune
in: generation-table
out: generation-table-pruned
rerun-on-change: TRAINING:prune-generation
ignore-unless: AND TRAINING:prune-generation
pass-unless: TRAINING:prune-generation
ignore-unless: generation-factors
default-name: model/generation-table-pruned
final-model: yes
template: $TRAINING:prune-generation IN OUT

View File

@ -384,11 +384,11 @@ sub read_config {
$resolve = 0;
foreach my $parameter (keys %CONFIG) {
foreach (@{$CONFIG{$parameter}}) {
next unless /\$/;
next unless /\$[a-z\{]/i;
my $escaped = 0;
die ("BAD USE OF \$ IN VALUE used in parameter $parameter")
if ! ( /^(.*)\$([a-z\-\:\d]+)(.*)$/i ||
(/^(.*)\$\{([a-z\-\:\d]+)\}(.*)$/i && ($escaped = 1)));
if ! ( /^(.*)\$([a-z][a-z\-\:\d]*)(.*)$/i ||
(/^(.*)\$\{([a-z][a-z\-\:\d]*)\}(.*)$/i && ($escaped = 1)));
my ($pre,$substitution,$post) = ($1,$2,$3);
my $pattern = $substitution;
if ($substitution !~ /\:/) { # handle local variables
@ -1800,6 +1800,10 @@ sub define_lm_train_bilingual_lm {
my $epochs = &get_bilingual_lm_epochs($set);
$cmd .= " -e $epochs" if defined($epochs);
my $nnjm_settings = backoff_and_get("LM:$set:nnjm-settings");
$cmd .= " ";
$cmd .= $nnjm_settings;
my $nplm_settings = backoff_and_get("LM:$set:nplm-settings");
$cmd .= " --extra-settings \"$nplm_settings\"" if defined($nplm_settings);
@ -2403,6 +2407,12 @@ sub define_training_extract_phrases {
if (&get("TRAINING:ghkm-strip-bitpar-nonterminal-labels")) {
$cmd .= "-ghkm-strip-bitpar-nonterminal-labels ";
}
} else { # !hierarchical-rule-set
if (&get("TRAINING:target-constituent-boundaries")) {
$cmd .= "-target-constituent-boundaries ";
}
}
my $extract_settings = &get("TRAINING:extract-settings");
@ -2460,6 +2470,12 @@ sub define_training_build_ttable {
my $parts_of_speech_labels_file = &versionize(&long_file_name("parts-of-speech","model",""));
$cmd .= "-ghkm-parts-of-speech-file $parts_of_speech_labels_file ";
}
} else { # !hierarchical-rule-set
if (&get("TRAINING:target-constituent-boundaries")) {
$cmd .= "-target-constituent-boundaries ";
}
}
&create_step($step_id,$cmd);
@ -2674,6 +2690,10 @@ sub define_training_create_config {
$cmd .= "-ghkm-parts-of-speech-file $parts_of_speech_labels_file ";
}
if (&get("TRAINING:target-constituent-boundaries")) {
$cmd .= "-target-constituent-boundaries ";
}
# sparse lexical features provide additional content for config file
my @additional_ini_files;
push (@additional_ini_files, "$sparse_lexical_features.ini") if $sparse_lexical_features;
@ -3601,8 +3621,8 @@ sub define_template {
print "\tcmd is $cmd\n" if $VERBOSE;
# replace variables
while ($cmd =~ /^([\S\s]*)\$(\??)\{([^\s\/\"\']+)\}([\S\s]*)$/ ||
$cmd =~ /^([\S\s]*)\$(\??)([^\s\/\"\']+)([\S\s]*)$/) {
while ($cmd =~ /^([\S\s]*)\$(\??)\{([a-z][^\s\/\"\']*)\}([\S\s]*)$/i ||
$cmd =~ /^([\S\s]*)\$(\??)([a-z][^\s\/\"\']*)([\S\s]*)$/i) {
my ($pre,$optional,$variable,$post) = ($1,$2,$3,$4);
my $value;
if ($optional eq '?') {
@ -3616,7 +3636,8 @@ sub define_template {
}
# deal with pipelined commands
$cmd =~ s/\|(.*)(\<\s*\S+) /$2 \| $1 /g;
$cmd =~ s/\|(.*[^\\])(\<\s*\S+) /$2 \| $1 /g;
$cmd =~ s/\\\</\</g;
# deal with gzipped input
my $c = "";

View File

@ -782,7 +782,8 @@ sub hs_scan_line {
if ($line =~ /^Trans Opt/) {
# Old format
$line =~ /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>(.+) :([\(\),\d\- ]*): pC=[\d\.\-e]+, c=/ ||
$line =~ /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>\S+ \-\> (.+) :([\(\),\d\- ]*): c=/ || return 0;
$line =~ /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>\S+ \-\> (.+) :([\(\),\d\- ]*): c=/ ||
$line =~ /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>\S+ \-\> (.+) :([\(\),\d\- ]*): term=.*: nonterm=.*: c=/ || return 0;
my ($sentence,$start,$end,$spans,$rule_lhs,$rule_rhs,$alignment) = ($1,$2,$3,$4,$5,$6,$7);
${$ref_sentence} = $sentence;
@ -1202,7 +1203,8 @@ sub process_search_graph {
if (/^(\d+) (\d+)\-?\>?(\S*) (\S+) =\> (.+) :(.*): pC=([\de\-\.]+), c=([\de\-\.]+) \[(\d+)\.\.(\d+)\] (.*)\[total=([\d\-\.]+)\] \<\</) {
($sentence,$id,$recomb,$lhs,$output,$alignment,$rule_score,$heuristic_rule_score,$from,$to,$children,$hyp_score) = ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12);
}
elsif (/^(\d+) (\d+)\-?\>?(\S*) (\S+) =\> (.+) :(.*): c=([\de\-\.]+) \[(\d+)\.\.(\d+)\] (.*)\[total=([\d\-\.]+)\] core/) {
elsif (/^(\d+) (\d+)\-?\>?(\S*) (\S+) =\> (.+) :(.*): c=([\de\-\.]+) \[(\d+)\.\.(\d+)\] (.*)\[total=([\de\-\.]+)\] core/ ||
/^(\d+) (\d+)\-?\>?(\S*) (\S+) =\> (.+) :(.*): c=([\de\-\.]+) core=\(.*\) \[(\d+)\.\.(\d+)\] (.*)\[total=([\de\-\.]+)\] core/) {
($sentence,$id,$recomb,$lhs,$output,$alignment,$rule_score,$from,$to,$children,$hyp_score) = ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12);
$heuristic_rule_score = $rule_score; # hmmmm....
}

View File

@ -472,6 +472,7 @@ def compose_score_command(extract_file, context_file, half_file,
command += [
'&&',
find_first_executable(['bzcat']),
half_file,
'|',
quote(args.flexibility_score),
quote(context_file),

View File

@ -0,0 +1,79 @@
# N-best List Re-Scorer
Written by Michael Denkowski
These scripts simplify running N-best re-ranking experiments with Moses. You
can score N-best lists with external tools (such as models that would be very
costly to integrate with Moses just for feasibility experiments), then use the
extended feature set to select translations that may be of a higher quality than
those preferred by the Moses features alone. In some cases, training a
re-ranker even without any new features can yield improvement.
### Training
* Use Moses to generate large N-best lists for a dev set. Use a config file
(moses.ini) that has been optimized with MERT, MIRA, or similar:
```
cat dev-src.txt |moses -f moses.ini -n-best-list dev.best1000.out 1000 distinct
```
* (Optionally) add new feature scores to the N-best list using any external
tools. Make sure the features are added to the correct field using the correct
format. You don't need to update the final scores (right now your new features
have zero weight):
```
0 ||| some translation ||| Feature0= -1.75645 Feature1= -1.38629 -2.19722 -2.31428 -0.81093 AwesomeNewFeature= -1.38629 ||| -4.42063
```
* Run the optimizer (currently K-best MIRA) to learn new re-ranking weights for
all features in your N-best list. Supply the reference translation for the dev
set:
```
python train.py --nbest dev.best1000.with-new-features --ref dev-ref.txt --working-dir rescore-work
```
* You now have a new config file that contains N-best re-scoring weights:
```
rescore-work/rescore.ini
```
### Test
* Use the **original** config file to generate N-best lists for the test set:
```
cat test-src.txt |moses -f moses.ini -n-best-list test.best1000.out 1000 distinct
```
* Add any new features you added for training
* Re-score the N-best list (update total scores) using the **re-scoring**
weights file:
```
python rescore.py rescore-work/rescore.ini <test.best1000.with-new-features >test.best1000.rescored
```
* The N-best list is **not** re-sorted, so the entries will be out of order.
Use the top-best script to extract the highest scoring entry for each sentence:
```
python topbest.py <test.best1000.rescored >test.topbest
```
### Not implemented yet
The following could be relatively easily implemented by replicating the
behavior of mert-moses.pl:
* Sparse features (sparse weight file)
* Other optimizers (MERT, PRO, etc.)
* Other objective functions (TER, Meteor, etc.)
* Multiple reference translations

View File

@ -0,0 +1,56 @@
#!/usr/bin/env python
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

"""Re-score a Moses N-best list with re-ranking weights.

Reads weights from the [weight] section of a config file (argv[1]) and
rewrites the total-score field of every N-best entry on stdin as the dot
product of the entry's feature values with those weights.  Entries are
not re-sorted; use topbest.py to select the best entry per sentence.
"""

import sys

# Fields of the Moses N-best format: id ||| hypothesis ||| features ||| score
FEAT_FIELD = 2
SCORE_FIELD = 3


def main():
    # Exactly one argument (the weights file) is required.
    if len(sys.argv[1:]) != 1:
        sys.stderr.write('Usage: {} moses.ini <nbest.with-new-features >nbest.rescored\n'.format(sys.argv[0]))
        sys.stderr.write('Entries are _not_ re-sorted based on new score. Use topbest.py\n')
        sys.exit(2)
    weights = {}
    # moses.ini: scan for the [weight] section, then read one
    # "Name= w1 w2 ..." entry per line until the next section starts.
    # A context manager closes the file (previously it was leaked).
    with open(sys.argv[1]) as ini:
        while True:
            line = ini.readline()
            if not line:
                sys.stderr.write('Error: no [weight] section\n')
                sys.exit(1)
            if line.strip() == '[weight]':
                break
        while True:
            line = ini.readline()
            if not line or line.strip().startswith('['):
                break
            if line.strip() == '':
                continue
            fields = line.split()
            # The key keeps its trailing '=' so it matches the feature
            # tags that appear in the N-best feature field.
            weights[fields[0]] = [float(f) for f in fields[1:]]
    # N-best: recompute each entry's score as the weighted feature sum.
    for line in sys.stdin:
        fields = [f.strip() for f in line.split('|||')]
        feats = fields[FEAT_FIELD].split()
        key = ''
        i = 0
        score = 0
        for f in feats:
            if f.endswith('='):
                # Feature-name tag: subsequent values belong to this feature.
                key = f
                i = 0
            else:
                score += (float(f) * weights[key][i])
                i += 1
        fields[SCORE_FIELD] = str(score)
        sys.stdout.write('{}\n'.format(' ||| '.join(fields)))


if __name__ == '__main__':
    main()

View File

@ -0,0 +1,30 @@
#!/usr/bin/env python
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

"""Print the top-scoring hypothesis for each sentence in an N-best list.

Input (stdin) need not be sorted by score, but all entries for a given
sentence id must be contiguous, as produced by Moses and rescore.py.
"""

import sys

# Index of the total-score field: id ||| hypothesis ||| features ||| score
SCORE_FIELD = 3


def main():
    cur_id = ''    # sentence id currently being scanned ('' = none yet)
    best_hyp = ''  # best hypothesis seen so far for cur_id
    best = 0       # score of best_hyp
    for line in sys.stdin:
        fields = [f.strip() for f in line.split('|||')]
        sent_id = fields[0]
        if cur_id != sent_id:
            # New sentence: emit the winner of the previous one.
            if cur_id:
                sys.stdout.write('{}\n'.format(best_hyp))
        score = float(fields[SCORE_FIELD])
        # Take this entry if it beats the current best, or unconditionally
        # when it is the first entry of a new sentence.
        if score > best or cur_id != sent_id:
            cur_id = sent_id
            best_hyp = fields[1]
            best = score
    # Flush the last sentence; on empty input print nothing
    # (previously a spurious blank line was emitted).
    if cur_id:
        sys.stdout.write('{}\n'.format(best_hyp))


if __name__ == '__main__':
    main()

116
scripts/nbest-rescore/train.py Executable file
View File

@ -0,0 +1,116 @@
#!/usr/bin/env python
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

"""Learn N-best re-ranking weights with K-best MIRA.

Reads a dev-set N-best list (optionally augmented with extra features),
extracts score/feature data with the Moses `extractor` binary, runs
`kbmira` once from all-zero weights, and writes the normalized optimized
weights to <working-dir>/rescore.ini for use with rescore.py.
"""

import argparse
import os
import subprocess
import sys

# Feature field in N-best format (id ||| hypothesis ||| features ||| score)
FEAT_FIELD = 2

# Location of mert, kbmira, etc. in relation to this script
BIN_DIR = os.path.join(
    os.path.dirname(os.path.dirname(os.path.dirname(__file__))), 'bin')


def main():
    # Args
    parser = argparse.ArgumentParser(
        description='Learn N-best rescoring weights')
    parser.add_argument('--nbest', metavar='nbest',
                        help='Dev set N-best list augmented with new features',
                        required=True)
    parser.add_argument('--ref', metavar='ref',
                        help='Dev set reference translation', required=True)
    parser.add_argument('--working-dir', metavar='rescore-work',
                        help='Optimizer working directory', required=True)
    parser.add_argument('--bin-dir', metavar='DIR',
                        help='Moses bin dir, containing kbmira, evaluator, etc.',
                        default=BIN_DIR)
    # Since we're starting with uniform weights and only running kbmira once,
    # run a gratuitous number of iterations.  (mert-moses.pl default is 60
    # iterations for each Moses run)
    parser.add_argument('--iterations', metavar='N', type=int,
                        help='Number of K-best MIRA iterations to run (default: 300)',
                        default=300)
    args = parser.parse_args()

    # Find executables
    extractor = os.path.join(args.bin_dir, 'extractor')
    kbmira = os.path.join(args.bin_dir, 'kbmira')
    for exe in (extractor, kbmira):
        if not os.path.exists(exe):
            sys.stderr.write('Error: cannot find executable "{}" in "{}", please specify --bin-dir\n'.format(exe, args.bin_dir))
            sys.exit(1)

    # rescore-work dir
    if not os.path.exists(args.working_dir):
        os.mkdir(args.working_dir)

    # Feature names and numbers of weights from N-best list.
    # Assume all features are dense (present for each entry).
    init_weights = []
    # Context manager closes the N-best file (the handle was leaked before).
    with open(args.nbest) as nbest_file:
        fields = [f.strip() for f in nbest_file.readline().split('|||')]
    feats = fields[FEAT_FIELD].split()
    for i in range(len(feats)):
        if feats[i].endswith('='):
            # Count the weight values that follow this feature-name tag.
            n_weights = 0
            j = i + 1
            while j < len(feats):
                if feats[j].endswith('='):
                    break
                n_weights += 1
                j += 1
            # Start all weights at 0
            init_weights.append([feats[i], [0] * n_weights])

    # Extract score and feature data from N-best list.
    # NOTE(review): return codes of the external tools are not checked,
    # matching the original behavior — failures surface later as missing
    # or malformed data files.
    extractor_cmd = [extractor,
                     '--sctype', 'BLEU', '--scconfig', 'case:true',
                     '--scfile', os.path.join(args.working_dir, 'scores.dat'),
                     '--ffile', os.path.join(args.working_dir, 'features.dat'),
                     '-r', args.ref,
                     '-n', args.nbest]
    subprocess.call(extractor_cmd)

    # Write dense feature list
    with open(os.path.join(args.working_dir, 'init.dense'), 'w') as out:
        for (feat, weights) in init_weights:
            for w in weights:
                out.write('{} {}\n'.format(feat, w))

    # Run K-best MIRA optimizer
    kbmira_cmd = [kbmira,
                  '--dense-init', os.path.join(args.working_dir, 'init.dense'),
                  '--ffile', os.path.join(args.working_dir, 'features.dat'),
                  '--scfile', os.path.join(args.working_dir, 'scores.dat'),
                  '-o', os.path.join(args.working_dir, 'mert.out'),
                  '--iters', str(args.iterations)]
    subprocess.call(kbmira_cmd)

    # Read optimized weights, sum for normalization
    opt_weights = []
    total = 0
    with open(os.path.join(args.working_dir, 'mert.out')) as inp:
        # Same structure as original weight list
        for (feat, weights) in init_weights:
            opt_weights.append([feat, []])
            for _ in weights:
                w = float(inp.readline().split()[1])
                opt_weights[-1][1].append(w)
                # Sum for normalization
                total += abs(w)

    # Normalize weights.  Guard the degenerate all-zero case, which
    # previously raised ZeroDivisionError.
    if total == 0:
        total = 1.0
    for (_, weights) in opt_weights:
        for i in range(len(weights)):
            weights[i] /= total

    # Generate rescore.ini
    with open(os.path.join(args.working_dir, 'rescore.ini'), 'w') as out:
        out.write('# For use with Moses N-best rescorer "scripts/nbest-rescore/rescore.py"\n')
        out.write('\n')
        out.write('[weight]\n')
        for (feat, weights) in opt_weights:
            out.write('{} {}\n'.format(feat, ' '.join(str(w) for w in weights)))


if __name__ == '__main__':
    main()

View File

@ -348,6 +348,9 @@ sub tokenize
$text =~ s/^ //g;
$text =~ s/ $//g;
# .' at end of sentence is missed
$text =~ s/\.\' ?$/ . ' /;
# restore protected
for (my $i = 0; $i < scalar(@protected); ++$i) {
my $subst = sprintf("THISISPROTECTED%.3d", $i);

View File

@ -234,7 +234,7 @@ while(my $line = <INI>) {
$w = $args[1];
}
elsif ($args[0] eq "input-factor") {
$source_factor = chomp($args[1]);
$source_factor = $args[1];
}
elsif ($args[0] eq "output-factor") {
#$t = chomp($args[1]);

View File

@ -134,6 +134,7 @@ my($_EXTERNAL_BINDIR,
$_LMODEL_OOV_FEATURE,
$_NUM_LATTICE_FEATURES,
$IGNORE,
$_TARGET_CONSTITUENT_BOUNDARIES,
$_FLEXIBILITY_SCORE,
$_FEATURE_LINES,
$_WEIGHT_LINES,
@ -258,6 +259,7 @@ $_HELP = 1
'instance-weights-file=s' => \$_INSTANCE_WEIGHTS_FILE,
'lmodel-oov-feature' => \$_LMODEL_OOV_FEATURE,
'num-lattice-features=i' => \$_NUM_LATTICE_FEATURES,
'target-constituent-boundaries' => \$_TARGET_CONSTITUENT_BOUNDARIES,
'flexibility-score' => \$_FLEXIBILITY_SCORE,
'config-add-feature-lines=s' => \$_FEATURE_LINES,
'config-add-weight-lines=s' => \$_WEIGHT_LINES,
@ -321,7 +323,6 @@ my $_ADDITIONAL_INI; # allow multiple switches
foreach (@_ADDITIONAL_INI) { $_ADDITIONAL_INI .= $_." "; }
chop($_ADDITIONAL_INI) if $_ADDITIONAL_INI;
$_HIERARCHICAL = 1 if $_SOURCE_SYNTAX || $_TARGET_SYNTAX;
$_XML = 1 if $_SOURCE_SYNTAX || $_TARGET_SYNTAX;
my $___FACTOR_DELIMITER = $_FACTOR_DELIMITER;
$___FACTOR_DELIMITER = '|' unless ($_FACTOR_DELIMITER);
@ -1608,6 +1609,7 @@ sub extract_phrase {
$cmd .= " --GZOutput ";
$cmd .= " --InstanceWeights $_INSTANCE_WEIGHTS_FILE " if defined $_INSTANCE_WEIGHTS_FILE;
$cmd .= " --BaselineExtract $_BASELINE_EXTRACT" if defined($_BASELINE_EXTRACT) && $PHRASE_EXTRACT =~ /extract-parallel.perl/;
$cmd .= " --TargetConstituentBoundaries" if $_TARGET_CONSTITUENT_BOUNDARIES;
$cmd .= " --FlexibilityScore" if $_FLEXIBILITY_SCORE;
$cmd .= " --NoTTable" if $_MMSAPT;
@ -1765,9 +1767,10 @@ sub score_phrase_phrase_extract {
$cmd .= " --SourceLabels $_GHKM_SOURCE_LABELS_FILE" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
$cmd .= " --TargetSyntacticPreferences $_TARGET_SYNTACTIC_PREFERENCES_LABELS_FILE" if $_TARGET_SYNTACTIC_PREFERENCES && defined($_TARGET_SYNTACTIC_PREFERENCES_LABELS_FILE);
$cmd .= " --PartsOfSpeech $_GHKM_PARTS_OF_SPEECH_FILE" if $_GHKM_PARTS_OF_SPEECH && defined($_GHKM_PARTS_OF_SPEECH_FILE);
$cmd .= " --TargetConstituentBoundaries" if $_TARGET_CONSTITUENT_BOUNDARIES;
$cmd .= " --FlexibilityScore=$FLEX_SCORER" if $_FLEXIBILITY_SCORE;
$cmd .= " $DOMAIN" if $DOMAIN;
$cmd .= " $CORE_SCORE_OPTIONS" if defined($_SCORE_OPTIONS);
$cmd .= " --FlexibilityScore=$FLEX_SCORER" if $_FLEXIBILITY_SCORE;
# sorting
if ($direction eq "e2f" || $_ALT_DIRECT_RULE_SCORE_1 || $_ALT_DIRECT_RULE_SCORE_2) {
@ -1904,7 +1907,7 @@ sub get_reordering {
# * the value stored in $REORDERING_MODEL_TYPES{$mtype} is a concatenation of the "orient"
# attributes such as "msd"
# * the "filename" attribute is appended to the filename, but actually serves as the main configuration specification
# for reordering scoring. it holds a string such as "wbe-msd-didirectional-fe"
# for reordering scoring. it holds a string such as "wbe-msd-bidirectional-fe"
# which has the more general format type-orient-dir-lang
$cmd .= " --model \"$mtype $REORDERING_MODEL_TYPES{$mtype}";
foreach my $model (@REORDERING_MODELS) {
@ -2325,7 +2328,7 @@ sub create_ini {
# hierarchical model settings
print INI "\n";
if ($_HIERARCHICAL) {
print INI "[unknown-lhs]\n$_UNKNOWN_WORD_LABEL_FILE\n\n" if $_TARGET_SYNTAX && defined($_UNKNOWN_WORD_LABEL_FILE);
print INI "[unknown-lhs]\n$_UNKNOWN_WORD_LABEL_FILE\n\n" if $_TARGET_SYNTAX && !$_TARGET_SYNTACTIC_PREFERENCES && defined($_UNKNOWN_WORD_LABEL_FILE);
print INI "[cube-pruning-pop-limit]\n1000\n\n";
print INI "[non-terminals]\nX\n\n";
print INI "[search-algorithm]\n3\n\n";
@ -2382,6 +2385,12 @@ sub create_ini {
chomp($TOPLABEL);
print INI " glue-label=$TOPLABEL\n";
}
if ($_HIERARCHICAL && $_TARGET_SYNTAX && $_TARGET_SYNTACTIC_PREFERENCES && defined($_TARGET_SYNTACTIC_PREFERENCES_LABELS_FILE)) {
print INI "TargetPreferencesFeature label-set-file=$_TARGET_SYNTACTIC_PREFERENCES_LABELS_FILE";
print INI " unknown-word-labels-file=$_UNKNOWN_WORD_LABEL_FILE" if defined($_UNKNOWN_WORD_LABEL_FILE);
print INI "\n";
}
print INI "TargetConstituentAdjacencyFeature\n" if $_TARGET_CONSTITUENT_BOUNDARIES;
print INI $feature_spec;
print INI "\n# dense weights for feature functions\n";
@ -2393,6 +2402,8 @@ sub create_ini {
print INI "PhrasePenalty0= 0.2\n";
print INI "SoftSourceSyntacticConstraintsFeature0= -0.2 -0.2 -0.2 0.1 0.1 0.1\n" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
print INI "PhraseOrientationFeature0= 0.05 0.05 0.05 0.05 0.05 0.05\n" if $_PHRASE_ORIENTATION;
print INI "TargetPreferencesFeature0= 0.2 -0.2\n" if $_HIERARCHICAL && $_TARGET_SYNTAX && $_TARGET_SYNTACTIC_PREFERENCES && defined($_TARGET_SYNTACTIC_PREFERENCES_LABELS_FILE);
print INI "TargetConstituentAdjacencyFeature0= 0.05 -0.1\n" if $_TARGET_CONSTITUENT_BOUNDARIES;
print INI $weight_spec;
close(INI);
}

View File

@ -58,6 +58,7 @@ class FileStream : public FakeOStream<FileStream> {
}
FileStream &seekp(uint64_t to) {
flush();
util::SeekOrThrow(fd_, to);
return *this;
}