mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-10-26 19:37:58 +03:00
Merge branch 'master' of github.com:moses-smt/mosesdecoder
This commit is contained in:
commit
167def1d52
3
Jamroot
3
Jamroot
@ -208,7 +208,7 @@ if [ option.get "with-icu" : : "yes" ]
|
||||
|
||||
# for probing pt
|
||||
external-lib boost_serialization ;
|
||||
requirements += <library>boost_serialization ;
|
||||
requirements += <library>boost_serialization/<runtime-link>static ;
|
||||
|
||||
if [ option.get "with-vw" ] {
|
||||
requirements += <define>HAVE_VW ;
|
||||
@ -247,6 +247,7 @@ if [ option.get "with-mm-extras" : : "yes" ]
|
||||
moses/TranslationModel/UG//bitext-find
|
||||
moses/TranslationModel/UG//ptable-describe-features
|
||||
moses/TranslationModel/UG//count-ptable-features
|
||||
moses/TranslationModel/UG//ptable-sigtest-filter
|
||||
moses/TranslationModel/UG//ptable-lookup
|
||||
moses/TranslationModel/UG//ptable-lookup-corpus
|
||||
moses/TranslationModel/UG//check-coverage
|
||||
|
@ -93,7 +93,7 @@ void SuffixArray::Create(const string& fileName )
|
||||
CheckAllocation(m_sentenceLength != NULL, "m_sentenceLength");
|
||||
if (m_useDocument) {
|
||||
m_document = (INDEX*) calloc( sizeof( INDEX ), m_documentCount );
|
||||
m_documentName = (INDEX*) calloc( sizeof( char ), m_documentCount );
|
||||
m_documentName = (INDEX*) calloc( sizeof( INDEX ), m_documentCount );
|
||||
m_documentNameBuffer = (char*) calloc( sizeof( char ), m_documentNameLength );
|
||||
CheckAllocation(m_document != NULL, "m_document");
|
||||
CheckAllocation(m_documentName != NULL, "m_documentName");
|
||||
|
@ -11,12 +11,12 @@
|
||||
</externalSetting>
|
||||
</externalSettings>
|
||||
<extensions>
|
||||
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
</extensions>
|
||||
</storageModule>
|
||||
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
|
||||
@ -72,13 +72,13 @@
|
||||
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.macosx.exe.release.701931933" moduleId="org.eclipse.cdt.core.settings" name="Release">
|
||||
<externalSettings/>
|
||||
<extensions>
|
||||
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.MachO64" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
</extensions>
|
||||
</storageModule>
|
||||
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
|
||||
|
@ -55,6 +55,41 @@
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/SentenceAlignment.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>SentenceAlignmentWithSyntax.cpp</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/SentenceAlignmentWithSyntax.cpp</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>SentenceAlignmentWithSyntax.h</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/SentenceAlignmentWithSyntax.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>SyntaxNodeCollection.cpp</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/SyntaxNodeCollection.cpp</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>SyntaxNodeCollection.h</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/SyntaxNodeCollection.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>XmlException.h</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/XmlException.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>XmlTree.cpp</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/XmlTree.cpp</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>XmlTree.h</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/phrase-extract/XmlTree.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>extract-main.cpp</name>
|
||||
<type>1</type>
|
||||
|
@ -11,11 +11,11 @@
|
||||
</externalSetting>
|
||||
</externalSettings>
|
||||
<extensions>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
</extensions>
|
||||
</storageModule>
|
||||
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
|
||||
@ -74,7 +74,7 @@
|
||||
<fileInfo id="cdt.managedbuild.config.gnu.exe.debug.1846963597.871386239" name="LDHT.h" rcbsApplicability="disable" resourcePath="LM/LDHT.h" toolsToInvoke=""/>
|
||||
<fileInfo id="cdt.managedbuild.config.gnu.exe.debug.1846963597.1761300858" name="ParallelBackoff.h" rcbsApplicability="disable" resourcePath="LM/ParallelBackoff.h" toolsToInvoke=""/>
|
||||
<sourceEntries>
|
||||
<entry excluding="TranslationModel/UG/ptable-lookup.cc|TranslationModel/UG/ptable-lookup-corpus.cc|TranslationModel/UG/mm/test-http-client.cc|TranslationModel/UG/ptable-describe-features.cc|TranslationModel/UG/count-ptable-features.cc|TranslationModel/UG/try-align2.cc|TranslationModel/UG/try-align.cc|TranslationModel/UG/spe-check-coverage3.cc|TranslationModel/UG/spe-check-coverage2.cc|TranslationModel/UG/spe-check-coverage.cc|TranslationModel/UG/sim-pe.cc|TranslationModel/UG/generic/stringdist|TranslationModel/UG/mm/test-dynamic-im-tsa.cc|TranslationModel/UG/mm/mtt.count.cc|LM/ParallelBackoff.h|LM/ParallelBackoff.cpp|LM/bilingual-lm|LM/MaxEntSRI.h|LM/MaxEntSRI.cpp|LM/BilingualLM.h|LM/BilingualLM.cpp|LM/Rand.h|LM/Rand.cpp|LM/LDHT.h|LM/LDHT.cpp|LM/ORLM.h|LM/ORLM.cpp|LM/NeuralLMWrapper.h|LM/NeuralLMWrapper.cpp|LM/SRI.h|LM/SRI.cpp|LM/IRST.h|LM/IRST.cpp|LM/DALMWrapper.h|LM/DALMWrapper.cpp|LM/oxlm|TranslationModel/UG/util" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/>
|
||||
<entry excluding="TranslationModel/UG/ptable-lookup.cc|TranslationModel/UG/ptable-lookup-corpus.cc|TranslationModel/UG/mm/test-http-client.cc|TranslationModel/UG/ptable-describe-features.cc|TranslationModel/UG/count-ptable-features.cc|TranslationModel/UG/try-align2.cc|TranslationModel/UG/try-align.cc|TranslationModel/UG/spe-check-coverage3.cc|TranslationModel/UG/spe-check-coverage2.cc|TranslationModel/UG/spe-check-coverage.cc|TranslationModel/UG/sim-pe.cc|TranslationModel/UG/generic/stringdist|TranslationModel/UG/mm/test-dynamic-im-tsa.cc|TranslationModel/UG/mm/mtt.count.cc|LM/ParallelBackoff.h|LM/ParallelBackoff.cpp|LM/bilingual-lm|LM/MaxEntSRI.h|LM/MaxEntSRI.cpp|LM/BilingualLM.h|LM/BilingualLM.cpp|LM/Rand.h|LM/Rand.cpp|LM/LDHT.h|LM/LDHT.cpp|LM/ORLM.h|LM/ORLM.cpp|LM/NeuralLMWrapper.h|LM/NeuralLMWrapper.cpp|LM/SRI.h|LM/SRI.cpp|LM/IRST.h|LM/IRST.cpp|LM/DALMWrapper.h|LM/DALMWrapper.cpp|LM/oxlm|TranslationModel/UG/util" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/>
|
||||
</sourceEntries>
|
||||
</configuration>
|
||||
</storageModule>
|
||||
@ -84,12 +84,12 @@
|
||||
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.release.1911984684" moduleId="org.eclipse.cdt.core.settings" name="Release">
|
||||
<externalSettings/>
|
||||
<extensions>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
</extensions>
|
||||
</storageModule>
|
||||
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
|
||||
|
@ -1625,6 +1625,16 @@
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/TargetBigramFeature.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>FF/TargetConstituentAdjacencyFeature.cpp</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/TargetConstituentAdjacencyFeature.cpp</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>FF/TargetConstituentAdjacencyFeature.h</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/TargetConstituentAdjacencyFeature.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>FF/TargetNgramFeature.cpp</name>
|
||||
<type>1</type>
|
||||
@ -1635,6 +1645,16 @@
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/TargetNgramFeature.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>FF/TargetPreferencesFeature.cpp</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/TargetPreferencesFeature.cpp</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>FF/TargetPreferencesFeature.h</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/TargetPreferencesFeature.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>FF/TargetWordInsertionFeature.cpp</name>
|
||||
<type>1</type>
|
||||
@ -1995,6 +2015,36 @@
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/PP/SpanLengthPhraseProperty.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>PP/TargetConstituentBoundariesLeftPhraseProperty.cpp</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/PP/TargetConstituentBoundariesLeftPhraseProperty.cpp</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>PP/TargetConstituentBoundariesLeftPhraseProperty.h</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/PP/TargetConstituentBoundariesLeftPhraseProperty.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>PP/TargetConstituentBoundariesRightAdjacentPhraseProperty.cpp</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/PP/TargetConstituentBoundariesRightAdjacentPhraseProperty.cpp</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>PP/TargetConstituentBoundariesRightAdjacentPhraseProperty.h</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/PP/TargetConstituentBoundariesRightAdjacentPhraseProperty.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>PP/TargetPreferencesPhraseProperty.cpp</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/PP/TargetPreferencesPhraseProperty.cpp</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>PP/TargetPreferencesPhraseProperty.h</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/PP/TargetPreferencesPhraseProperty.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>PP/TreeStructurePhraseProperty.h</name>
|
||||
<type>1</type>
|
||||
@ -2495,6 +2545,56 @@
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/parameters/SyntaxOptions.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>FF/LexicalReordering/BidirectionalReorderingState.cpp</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/LexicalReordering/BidirectionalReorderingState.cpp</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>FF/LexicalReordering/BidirectionalReorderingState.h</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/LexicalReordering/BidirectionalReorderingState.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>FF/LexicalReordering/HReorderingBackwardState.cpp</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/LexicalReordering/HReorderingBackwardState.cpp</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>FF/LexicalReordering/HReorderingBackwardState.h</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/LexicalReordering/HReorderingBackwardState.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>FF/LexicalReordering/HReorderingForwardState.cpp</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/LexicalReordering/HReorderingForwardState.cpp</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>FF/LexicalReordering/HReorderingForwardState.h</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/LexicalReordering/HReorderingForwardState.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>FF/LexicalReordering/LRModel.cpp</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/LexicalReordering/LRModel.cpp</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>FF/LexicalReordering/LRModel.h</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/LexicalReordering/LRModel.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>FF/LexicalReordering/LRState.cpp</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/LexicalReordering/LRState.cpp</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>FF/LexicalReordering/LRState.h</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/LexicalReordering/LRState.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>FF/LexicalReordering/LexicalReordering.cpp</name>
|
||||
<type>1</type>
|
||||
@ -2505,16 +2605,6 @@
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/LexicalReordering/LexicalReordering.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>FF/LexicalReordering/LexicalReorderingState.cpp</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/LexicalReordering/LexicalReorderingState.cpp</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>FF/LexicalReordering/LexicalReorderingState.h</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/LexicalReordering/LexicalReorderingState.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>FF/LexicalReordering/LexicalReorderingTable.cpp</name>
|
||||
<type>1</type>
|
||||
@ -2525,6 +2615,16 @@
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/LexicalReordering/LexicalReorderingTable.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>FF/LexicalReordering/PhraseBasedReorderingState.cpp</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/LexicalReordering/PhraseBasedReorderingState.cpp</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>FF/LexicalReordering/PhraseBasedReorderingState.h</name>
|
||||
<type>1</type>
|
||||
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/LexicalReordering/PhraseBasedReorderingState.h</locationURI>
|
||||
</link>
|
||||
<link>
|
||||
<name>FF/LexicalReordering/ReorderingStack.cpp</name>
|
||||
<type>1</type>
|
||||
|
@ -37,4 +37,4 @@ for local p in [ glob *_main.cc ] {
|
||||
exes += $(name) ;
|
||||
}
|
||||
|
||||
alias programs : $(exes) filter//filter builder//dump_counts : <threading>multi:<source>builder//lmplz ;
|
||||
alias programs : $(exes) filter//filter filter//phrase_table_vocab builder//dump_counts : <threading>multi:<source>builder//lmplz ;
|
||||
|
@ -1,26 +1,31 @@
|
||||
|
||||
#include "DistortionScoreProducer.h"
|
||||
#include "FFState.h"
|
||||
#include "moses/InputPath.h"
|
||||
#include "moses/Range.h"
|
||||
#include "moses/StaticData.h"
|
||||
#include "moses/Hypothesis.h"
|
||||
#include "moses/Manager.h"
|
||||
#include "moses/FactorCollection.h"
|
||||
#include <cmath>
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
struct DistortionState_traditional : public FFState {
|
||||
struct DistortionState : public FFState {
|
||||
Range range;
|
||||
int first_gap;
|
||||
DistortionState_traditional(const Range& wr, int fg) : range(wr), first_gap(fg) {}
|
||||
bool inSubordinateConjunction;
|
||||
DistortionState(const Range& wr, int fg, bool subord=false) : range(wr), first_gap(fg), inSubordinateConjunction(subord) {}
|
||||
|
||||
size_t hash() const {
|
||||
return range.GetEndPos();
|
||||
}
|
||||
virtual bool operator==(const FFState& other) const {
|
||||
const DistortionState_traditional& o =
|
||||
static_cast<const DistortionState_traditional&>(other);
|
||||
return range.GetEndPos() == o.range.GetEndPos();
|
||||
const DistortionState& o =
|
||||
static_cast<const DistortionState&>(other);
|
||||
return ( (range.GetEndPos() == o.range.GetEndPos()) && (inSubordinateConjunction == o.inSubordinateConjunction) );
|
||||
}
|
||||
|
||||
};
|
||||
@ -29,11 +34,36 @@ std::vector<const DistortionScoreProducer*> DistortionScoreProducer::s_staticCol
|
||||
|
||||
DistortionScoreProducer::DistortionScoreProducer(const std::string &line)
|
||||
: StatefulFeatureFunction(1, line)
|
||||
, m_useSparse(false)
|
||||
, m_sparseDistance(false)
|
||||
, m_sparseSubordinate(false)
|
||||
{
|
||||
s_staticColl.push_back(this);
|
||||
ReadParameters();
|
||||
}
|
||||
|
||||
void DistortionScoreProducer::SetParameter(const std::string& key, const std::string& value)
|
||||
{
|
||||
if (key == "sparse") {
|
||||
m_useSparse = Scan<bool>(value);
|
||||
} else if (key == "sparse-distance") {
|
||||
m_sparseDistance = Scan<bool>(value);
|
||||
} else if (key == "sparse-input-factor") {
|
||||
m_sparseFactorTypeSource = Scan<FactorType>(value);
|
||||
} else if (key == "sparse-output-factor") {
|
||||
m_sparseFactorTypeTarget = Scan<FactorType>(value);
|
||||
} else if (key == "sparse-subordinate") {
|
||||
std::string subordinateConjunctionTag = Scan<std::string>(value);
|
||||
FactorCollection &factorCollection = FactorCollection::Instance();
|
||||
m_subordinateConjunctionTagFactor = factorCollection.AddFactor(subordinateConjunctionTag,false);
|
||||
m_sparseSubordinate = true;
|
||||
} else if (key == "sparse-subordinate-output-factor") {
|
||||
m_sparseFactorTypeTargetSubordinate = Scan<FactorType>(value);
|
||||
} else {
|
||||
StatefulFeatureFunction::SetParameter(key, value);
|
||||
}
|
||||
}
|
||||
|
||||
const FFState* DistortionScoreProducer::EmptyHypothesisState(const InputType &input) const
|
||||
{
|
||||
// fake previous translated phrase start and end
|
||||
@ -44,7 +74,7 @@ const FFState* DistortionScoreProducer::EmptyHypothesisState(const InputType &in
|
||||
start = 0;
|
||||
end = input.m_frontSpanCoveredLength -1;
|
||||
}
|
||||
return new DistortionState_traditional(
|
||||
return new DistortionState(
|
||||
Range(start, end),
|
||||
NOT_FOUND);
|
||||
}
|
||||
@ -101,17 +131,184 @@ FFState* DistortionScoreProducer::EvaluateWhenApplied(
|
||||
const FFState* prev_state,
|
||||
ScoreComponentCollection* out) const
|
||||
{
|
||||
const DistortionState_traditional* prev = static_cast<const DistortionState_traditional*>(prev_state);
|
||||
const DistortionState* prev = static_cast<const DistortionState*>(prev_state);
|
||||
bool subordinateConjunction = prev->inSubordinateConjunction;
|
||||
|
||||
if (m_useSparse) {
|
||||
int jumpFromPos = prev->range.GetEndPos()+1;
|
||||
int jumpToPos = hypo.GetCurrSourceWordsRange().GetStartPos();
|
||||
size_t distance = std::abs( jumpFromPos - jumpToPos );
|
||||
|
||||
const Sentence& sentence = static_cast<const Sentence&>(hypo.GetInput());
|
||||
|
||||
StringPiece jumpFromSourceFactorPrev;
|
||||
StringPiece jumpFromSourceFactor;
|
||||
StringPiece jumpToSourceFactor;
|
||||
if (jumpFromPos < (int)sentence.GetSize()) {
|
||||
jumpFromSourceFactor = sentence.GetWord(jumpFromPos).GetFactor(m_sparseFactorTypeSource)->GetString();
|
||||
} else {
|
||||
jumpFromSourceFactor = "</s>";
|
||||
}
|
||||
if (jumpFromPos > 0) {
|
||||
jumpFromSourceFactorPrev = sentence.GetWord(jumpFromPos-1).GetFactor(m_sparseFactorTypeSource)->GetString();
|
||||
} else {
|
||||
jumpFromSourceFactorPrev = "<s>";
|
||||
}
|
||||
jumpToSourceFactor = sentence.GetWord(jumpToPos).GetFactor(m_sparseFactorTypeSource)->GetString();
|
||||
|
||||
const TargetPhrase& currTargetPhrase = hypo.GetCurrTargetPhrase();
|
||||
StringPiece jumpToTargetFactor = currTargetPhrase.GetWord(0).GetFactor(m_sparseFactorTypeTarget)->GetString();
|
||||
|
||||
util::StringStream featureName;
|
||||
|
||||
// source factor (start position)
|
||||
featureName = util::StringStream();
|
||||
featureName << m_description << "_";
|
||||
if ( jumpToPos > jumpFromPos ) {
|
||||
featureName << "R";
|
||||
} else if ( jumpToPos < jumpFromPos ) {
|
||||
featureName << "L";
|
||||
} else {
|
||||
featureName << "M";
|
||||
}
|
||||
if (m_sparseDistance) {
|
||||
featureName << distance;
|
||||
}
|
||||
featureName << "_SFS_" << jumpFromSourceFactor;
|
||||
if (m_sparseSubordinate && subordinateConjunction) {
|
||||
featureName << "_SUBORD";
|
||||
}
|
||||
out->SparsePlusEquals(featureName.str(), 1);
|
||||
|
||||
// source factor (start position minus 1)
|
||||
featureName = util::StringStream();
|
||||
featureName << m_description << "_";
|
||||
if ( jumpToPos > jumpFromPos ) {
|
||||
featureName << "R";
|
||||
} else if ( jumpToPos < jumpFromPos ) {
|
||||
featureName << "L";
|
||||
} else {
|
||||
featureName << "M";
|
||||
}
|
||||
if (m_sparseDistance) {
|
||||
featureName << distance;
|
||||
}
|
||||
featureName << "_SFP_" << jumpFromSourceFactorPrev;
|
||||
if (m_sparseSubordinate && subordinateConjunction) {
|
||||
featureName << "_SUBORD";
|
||||
}
|
||||
out->SparsePlusEquals(featureName.str(), 1);
|
||||
|
||||
// source factor (end position)
|
||||
featureName = util::StringStream();
|
||||
featureName << m_description << "_";
|
||||
if ( jumpToPos > jumpFromPos ) {
|
||||
featureName << "R";
|
||||
} else if ( jumpToPos < jumpFromPos ) {
|
||||
featureName << "L";
|
||||
} else {
|
||||
featureName << "M";
|
||||
}
|
||||
if (m_sparseDistance) {
|
||||
featureName << distance;
|
||||
}
|
||||
featureName << "_SFE_" << jumpToSourceFactor;
|
||||
if (m_sparseSubordinate && subordinateConjunction) {
|
||||
featureName << "_SUBORD";
|
||||
}
|
||||
out->SparsePlusEquals(featureName.str(), 1);
|
||||
|
||||
// target factor (end position)
|
||||
featureName = util::StringStream();
|
||||
featureName << m_description << "_";
|
||||
if ( jumpToPos > jumpFromPos ) {
|
||||
featureName << "R";
|
||||
} else if ( jumpToPos < jumpFromPos ) {
|
||||
featureName << "L";
|
||||
} else {
|
||||
featureName << "M";
|
||||
}
|
||||
if (m_sparseDistance) {
|
||||
featureName << distance;
|
||||
}
|
||||
featureName << "_TFE_" << jumpToTargetFactor;
|
||||
if (m_sparseSubordinate && subordinateConjunction) {
|
||||
featureName << "_SUBORD";
|
||||
}
|
||||
out->SparsePlusEquals(featureName.str(), 1);
|
||||
|
||||
// relative source sentence position
|
||||
featureName = util::StringStream();
|
||||
featureName << m_description << "_";
|
||||
if ( jumpToPos > jumpFromPos ) {
|
||||
featureName << "R";
|
||||
} else if ( jumpToPos < jumpFromPos ) {
|
||||
featureName << "L";
|
||||
} else {
|
||||
featureName << "M";
|
||||
}
|
||||
if (m_sparseDistance) {
|
||||
featureName << distance;
|
||||
}
|
||||
size_t relativeSourceSentencePosBin = std::floor( 5 * (float)jumpFromPos / (sentence.GetSize()+1) );
|
||||
featureName << "_P_" << relativeSourceSentencePosBin;
|
||||
if (m_sparseSubordinate && subordinateConjunction) {
|
||||
featureName << "_SUBORD";
|
||||
}
|
||||
out->SparsePlusEquals(featureName.str(), 1);
|
||||
|
||||
// source sentence length bin
|
||||
featureName = util::StringStream();
|
||||
featureName << m_description << "_";
|
||||
if ( jumpToPos > jumpFromPos ) {
|
||||
featureName << "R";
|
||||
} else if ( jumpToPos < jumpFromPos ) {
|
||||
featureName << "L";
|
||||
} else {
|
||||
featureName << "M";
|
||||
}
|
||||
if (m_sparseDistance) {
|
||||
featureName << distance;
|
||||
}
|
||||
size_t sourceSentenceLengthBin = 3;
|
||||
if (sentence.GetSize() < 15) {
|
||||
sourceSentenceLengthBin = 0;
|
||||
} else if (sentence.GetSize() < 23) {
|
||||
sourceSentenceLengthBin = 1;
|
||||
} else if (sentence.GetSize() < 33) {
|
||||
sourceSentenceLengthBin = 2;
|
||||
}
|
||||
featureName << "_SL_" << sourceSentenceLengthBin;
|
||||
if (m_sparseSubordinate && subordinateConjunction) {
|
||||
featureName << "_SUBORD";
|
||||
}
|
||||
out->SparsePlusEquals(featureName.str(), 1);
|
||||
|
||||
if (m_sparseSubordinate) {
|
||||
for (size_t posT=0; posT<currTargetPhrase.GetSize(); ++posT) {
|
||||
const Word &wordT = currTargetPhrase.GetWord(posT);
|
||||
if (wordT[m_sparseFactorTypeTargetSubordinate] == m_subordinateConjunctionTagFactor) {
|
||||
subordinateConjunction = true;
|
||||
} else if (wordT[m_sparseFactorTypeTargetSubordinate]->GetString()[0] == 'V') {
|
||||
subordinateConjunction = false;
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
const float distortionScore = CalculateDistortionScore(
|
||||
hypo,
|
||||
prev->range,
|
||||
hypo.GetCurrSourceWordsRange(),
|
||||
prev->first_gap);
|
||||
out->PlusEquals(this, distortionScore);
|
||||
DistortionState_traditional* res = new DistortionState_traditional(
|
||||
|
||||
DistortionState* state = new DistortionState(
|
||||
hypo.GetCurrSourceWordsRange(),
|
||||
hypo.GetWordsBitmap().GetFirstGapPos());
|
||||
return res;
|
||||
hypo.GetWordsBitmap().GetFirstGapPos(),
|
||||
subordinateConjunction);
|
||||
|
||||
return state;
|
||||
}
|
||||
|
||||
|
||||
|
@ -1,16 +1,11 @@
|
||||
#pragma once
|
||||
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include "StatefulFeatureFunction.h"
|
||||
#include "moses/Range.h"
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
class FFState;
|
||||
class ScoreComponentCollection;
|
||||
class Hypothesis;
|
||||
class ChartHypothesis;
|
||||
class Range;
|
||||
|
||||
/** Calculates Distortion scores
|
||||
*/
|
||||
@ -19,6 +14,14 @@ class DistortionScoreProducer : public StatefulFeatureFunction
|
||||
protected:
|
||||
static std::vector<const DistortionScoreProducer*> s_staticColl;
|
||||
|
||||
FactorType m_sparseFactorTypeSource;
|
||||
FactorType m_sparseFactorTypeTarget;
|
||||
bool m_useSparse;
|
||||
bool m_sparseDistance;
|
||||
bool m_sparseSubordinate;
|
||||
FactorType m_sparseFactorTypeTargetSubordinate;
|
||||
const Factor* m_subordinateConjunctionTagFactor;
|
||||
|
||||
public:
|
||||
static const std::vector<const DistortionScoreProducer*>& GetDistortionFeatureFunctions() {
|
||||
return s_staticColl;
|
||||
@ -26,6 +29,8 @@ public:
|
||||
|
||||
DistortionScoreProducer(const std::string &line);
|
||||
|
||||
void SetParameter(const std::string& key, const std::string& value);
|
||||
|
||||
bool IsUseable(const FactorMask &mask) const {
|
||||
return true;
|
||||
}
|
||||
@ -44,7 +49,7 @@ public:
|
||||
const ChartHypothesis& /* cur_hypo */,
|
||||
int /* featureID - used to index the state in the previous hypotheses */,
|
||||
ScoreComponentCollection*) const {
|
||||
throw std::logic_error("DistortionScoreProducer not supported in chart decoder, yet");
|
||||
UTIL_THROW(util::Exception, "DIstortion not implemented in chart decoder");
|
||||
}
|
||||
|
||||
};
|
||||
|
@ -42,6 +42,7 @@
|
||||
#include "moses/FF/ControlRecombination.h"
|
||||
#include "moses/FF/ConstrainedDecoding.h"
|
||||
#include "moses/FF/SoftSourceSyntacticConstraintsFeature.h"
|
||||
#include "moses/FF/TargetConstituentAdjacencyFeature.h"
|
||||
#include "moses/FF/TargetPreferencesFeature.h"
|
||||
#include "moses/FF/CoveredReferenceFeature.h"
|
||||
#include "moses/FF/TreeStructureFeature.h"
|
||||
@ -264,6 +265,7 @@ FeatureRegistry::FeatureRegistry()
|
||||
MOSES_FNAME(CoveredReferenceFeature);
|
||||
MOSES_FNAME(SourceGHKMTreeInputMatchFeature);
|
||||
MOSES_FNAME(SoftSourceSyntacticConstraintsFeature);
|
||||
MOSES_FNAME(TargetConstituentAdjacencyFeature);
|
||||
MOSES_FNAME(TargetPreferencesFeature);
|
||||
MOSES_FNAME(TreeStructureFeature);
|
||||
MOSES_FNAME(SoftMatchingFeature);
|
||||
|
38
moses/FF/LexicalReordering/BidirectionalReorderingState.cpp
Normal file
38
moses/FF/LexicalReordering/BidirectionalReorderingState.cpp
Normal file
@ -0,0 +1,38 @@
|
||||
#include "BidirectionalReorderingState.h"
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
///////////////////////////
|
||||
//BidirectionalReorderingState
|
||||
|
||||
size_t BidirectionalReorderingState::hash() const
|
||||
{
|
||||
size_t ret = m_backward->hash();
|
||||
boost::hash_combine(ret, m_forward->hash());
|
||||
return ret;
|
||||
}
|
||||
|
||||
bool BidirectionalReorderingState::operator==(const FFState& o) const
|
||||
{
|
||||
if (&o == this) return 0;
|
||||
|
||||
BidirectionalReorderingState const &other
|
||||
= static_cast<BidirectionalReorderingState const&>(o);
|
||||
|
||||
bool ret = (*m_backward == *other.m_backward) && (*m_forward == *other.m_forward);
|
||||
return ret;
|
||||
}
|
||||
|
||||
LRState*
|
||||
BidirectionalReorderingState::
|
||||
Expand(const TranslationOption& topt, const InputType& input,
|
||||
ScoreComponentCollection* scores) const
|
||||
{
|
||||
LRState *newbwd = m_backward->Expand(topt,input, scores);
|
||||
LRState *newfwd = m_forward->Expand(topt, input, scores);
|
||||
return new BidirectionalReorderingState(m_configuration, newbwd, newfwd, m_offset);
|
||||
}
|
||||
|
||||
}
|
||||
|
38
moses/FF/LexicalReordering/BidirectionalReorderingState.h
Normal file
38
moses/FF/LexicalReordering/BidirectionalReorderingState.h
Normal file
@ -0,0 +1,38 @@
|
||||
#pragma once
|
||||
#include "LRState.h"
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
class BidirectionalReorderingState
|
||||
: public LRState
|
||||
{
|
||||
private:
|
||||
const LRState *m_backward;
|
||||
const LRState *m_forward;
|
||||
public:
|
||||
BidirectionalReorderingState(const LRModel &config,
|
||||
const LRState *bw,
|
||||
const LRState *fw, size_t offset)
|
||||
: LRState(config,
|
||||
LRModel::Bidirectional,
|
||||
offset)
|
||||
, m_backward(bw)
|
||||
, m_forward(fw)
|
||||
{ }
|
||||
|
||||
~BidirectionalReorderingState() {
|
||||
delete m_backward;
|
||||
delete m_forward;
|
||||
}
|
||||
|
||||
virtual size_t hash() const;
|
||||
virtual bool operator==(const FFState& other) const;
|
||||
|
||||
LRState*
|
||||
Expand(const TranslationOption& topt, const InputType& input,
|
||||
ScoreComponentCollection* scores) const;
|
||||
};
|
||||
|
||||
}
|
||||
|
50
moses/FF/LexicalReordering/HReorderingBackwardState.cpp
Normal file
50
moses/FF/LexicalReordering/HReorderingBackwardState.cpp
Normal file
@ -0,0 +1,50 @@
|
||||
#include "HReorderingBackwardState.h"
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
///////////////////////////
|
||||
//HierarchicalReorderingBackwardState
|
||||
|
||||
HReorderingBackwardState::
|
||||
HReorderingBackwardState(const HReorderingBackwardState *prev,
|
||||
const TranslationOption &topt,
|
||||
ReorderingStack reoStack)
|
||||
: LRState(prev, topt), m_reoStack(reoStack)
|
||||
{ }
|
||||
|
||||
HReorderingBackwardState::
|
||||
HReorderingBackwardState(const LRModel &config, size_t offset)
|
||||
: LRState(config, LRModel::Backward, offset)
|
||||
{ }
|
||||
|
||||
size_t HReorderingBackwardState::hash() const
|
||||
{
|
||||
size_t ret = m_reoStack.hash();
|
||||
return ret;
|
||||
}
|
||||
|
||||
bool HReorderingBackwardState::operator==(const FFState& o) const
|
||||
{
|
||||
const HReorderingBackwardState& other
|
||||
= static_cast<const HReorderingBackwardState&>(o);
|
||||
bool ret = m_reoStack == other.m_reoStack;
|
||||
return ret;
|
||||
}
|
||||
|
||||
LRState*
|
||||
HReorderingBackwardState::
|
||||
Expand(const TranslationOption& topt, const InputType& input,
|
||||
ScoreComponentCollection* scores) const
|
||||
{
|
||||
HReorderingBackwardState* nextState;
|
||||
nextState = new HReorderingBackwardState(this, topt, m_reoStack);
|
||||
Range swrange = topt.GetSourceWordsRange();
|
||||
int reoDistance = nextState->m_reoStack.ShiftReduce(swrange);
|
||||
ReorderingType reoType = m_configuration.GetOrientation(reoDistance);
|
||||
CopyScores(scores, topt, input, reoType);
|
||||
return nextState;
|
||||
}
|
||||
|
||||
}
|
||||
|
33
moses/FF/LexicalReordering/HReorderingBackwardState.h
Normal file
33
moses/FF/LexicalReordering/HReorderingBackwardState.h
Normal file
@ -0,0 +1,33 @@
|
||||
#pragma once
|
||||
#include "LRState.h"
|
||||
#include "ReorderingStack.h"
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
//! State for a hierarchical reordering model (see Galley and Manning, A
|
||||
//! Simple and Effective Hierarchical Phrase Reordering Model, EMNLP 2008)
|
||||
//! backward state (conditioned on the previous phrase)
|
||||
class HReorderingBackwardState : public LRState
|
||||
{
|
||||
private:
|
||||
ReorderingStack m_reoStack;
|
||||
public:
|
||||
HReorderingBackwardState(const LRModel &config, size_t offset);
|
||||
HReorderingBackwardState(const HReorderingBackwardState *prev,
|
||||
const TranslationOption &topt,
|
||||
ReorderingStack reoStack);
|
||||
virtual size_t hash() const;
|
||||
virtual bool operator==(const FFState& other) const;
|
||||
|
||||
virtual LRState* Expand(const TranslationOption& hypo, const InputType& input,
|
||||
ScoreComponentCollection* scores) const;
|
||||
|
||||
private:
|
||||
ReorderingType GetOrientationTypeMSD(int reoDistance) const;
|
||||
ReorderingType GetOrientationTypeMSLR(int reoDistance) const;
|
||||
ReorderingType GetOrientationTypeMonotonic(int reoDistance) const;
|
||||
ReorderingType GetOrientationTypeLeftRight(int reoDistance) const;
|
||||
};
|
||||
|
||||
}
|
78
moses/FF/LexicalReordering/HReorderingForwardState.cpp
Normal file
78
moses/FF/LexicalReordering/HReorderingForwardState.cpp
Normal file
@ -0,0 +1,78 @@
|
||||
#include "HReorderingForwardState.h"
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
///////////////////////////
|
||||
//HReorderingForwardState
|
||||
|
||||
HReorderingForwardState::
|
||||
HReorderingForwardState(const LRModel &config,
|
||||
size_t size, size_t offset)
|
||||
: LRState(config, LRModel::Forward, offset)
|
||||
, m_first(true)
|
||||
, m_prevRange(NOT_FOUND,NOT_FOUND)
|
||||
, m_coverage(size)
|
||||
{ }
|
||||
|
||||
HReorderingForwardState::
|
||||
HReorderingForwardState(const HReorderingForwardState *prev,
|
||||
const TranslationOption &topt)
|
||||
: LRState(prev, topt)
|
||||
, m_first(false)
|
||||
, m_prevRange(topt.GetSourceWordsRange())
|
||||
, m_coverage(prev->m_coverage, topt.GetSourceWordsRange())
|
||||
{
|
||||
}
|
||||
|
||||
size_t HReorderingForwardState::hash() const
|
||||
{
|
||||
size_t ret;
|
||||
ret = hash_value(m_prevRange);
|
||||
return ret;
|
||||
}
|
||||
|
||||
bool HReorderingForwardState::operator==(const FFState& o) const
|
||||
{
|
||||
if (&o == this) return true;
|
||||
|
||||
HReorderingForwardState const& other
|
||||
= static_cast<HReorderingForwardState const&>(o);
|
||||
|
||||
int compareScores = ((m_prevRange == other.m_prevRange)
|
||||
? ComparePrevScores(other.m_prevOption)
|
||||
: (m_prevRange < other.m_prevRange) ? -1 : 1);
|
||||
return compareScores == 0;
|
||||
}
|
||||
|
||||
// For compatibility with the phrase-based reordering model, scoring is one
|
||||
// step delayed.
|
||||
// The forward model takes determines orientations heuristically as follows:
|
||||
// mono: if the next phrase comes after the conditioning phrase and
|
||||
// - there is a gap to the right of the conditioning phrase, or
|
||||
// - the next phrase immediately follows it
|
||||
// swap: if the next phrase goes before the conditioning phrase and
|
||||
// - there is a gap to the left of the conditioning phrase, or
|
||||
// - the next phrase immediately precedes it
|
||||
// dright: if the next phrase follows the conditioning phrase and other
|
||||
// stuff comes in between
|
||||
// dleft: if the next phrase precedes the conditioning phrase and other
|
||||
// stuff comes in between
|
||||
|
||||
LRState*
|
||||
HReorderingForwardState::
|
||||
Expand(TranslationOption const& topt, InputType const& input,
|
||||
ScoreComponentCollection* scores) const
|
||||
{
|
||||
const Range cur = topt.GetSourceWordsRange();
|
||||
// keep track of the current coverage ourselves so we don't need the hypothesis
|
||||
Bitmap cov(m_coverage, cur);
|
||||
if (!m_first) {
|
||||
LRModel::ReorderingType reoType;
|
||||
reoType = m_configuration.GetOrientation(m_prevRange,cur,cov);
|
||||
CopyScores(scores, topt, input, reoType);
|
||||
}
|
||||
return new HReorderingForwardState(this, topt);
|
||||
}
|
||||
|
||||
}
|
33
moses/FF/LexicalReordering/HReorderingForwardState.h
Normal file
33
moses/FF/LexicalReordering/HReorderingForwardState.h
Normal file
@ -0,0 +1,33 @@
|
||||
#pragma once
|
||||
|
||||
#include "LRState.h"
|
||||
#include "moses/Range.h"
|
||||
#include "moses/Bitmap.h"
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
//!forward state (conditioned on the next phrase)
|
||||
class HReorderingForwardState : public LRState
|
||||
{
|
||||
private:
|
||||
bool m_first;
|
||||
Range m_prevRange;
|
||||
Bitmap m_coverage;
|
||||
|
||||
public:
|
||||
HReorderingForwardState(const LRModel &config, size_t sentenceLength,
|
||||
size_t offset);
|
||||
HReorderingForwardState(const HReorderingForwardState *prev,
|
||||
const TranslationOption &topt);
|
||||
|
||||
virtual size_t hash() const;
|
||||
virtual bool operator==(const FFState& other) const;
|
||||
|
||||
virtual LRState* Expand(const TranslationOption& hypo,
|
||||
const InputType& input,
|
||||
ScoreComponentCollection* scores) const;
|
||||
};
|
||||
|
||||
}
|
||||
|
219
moses/FF/LexicalReordering/LRModel.cpp
Normal file
219
moses/FF/LexicalReordering/LRModel.cpp
Normal file
@ -0,0 +1,219 @@
|
||||
#include "LRModel.h"
|
||||
#include "moses/Range.h"
|
||||
#include "moses/Bitmap.h"
|
||||
#include "moses/InputType.h"
|
||||
#include "HReorderingForwardState.h"
|
||||
#include "HReorderingBackwardState.h"
|
||||
#include "PhraseBasedReorderingState.h"
|
||||
#include "BidirectionalReorderingState.h"
|
||||
#include "SparseReordering.h"
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
bool
|
||||
IsMonotonicStep(Range const& prev, // words range of last source phrase
|
||||
Range const& cur, // words range of current source phrase
|
||||
Bitmap const& cov) // coverage bitmap
|
||||
{
|
||||
size_t e = prev.GetEndPos() + 1;
|
||||
size_t s = cur.GetStartPos();
|
||||
return (s == e || (s >= e && !cov.GetValue(e)));
|
||||
}
|
||||
|
||||
bool
|
||||
IsSwap(Range const& prev, Range const& cur, Bitmap const& cov)
|
||||
{
|
||||
size_t s = prev.GetStartPos();
|
||||
size_t e = cur.GetEndPos();
|
||||
return (e+1 == s || (e < s && !cov.GetValue(s-1)));
|
||||
}
|
||||
|
||||
size_t
|
||||
LRModel::
|
||||
GetNumberOfTypes() const
|
||||
{
|
||||
return ((m_modelType == MSD) ? 3 :
|
||||
(m_modelType == MSLR) ? 4 : 2);
|
||||
}
|
||||
|
||||
size_t
|
||||
LRModel::
|
||||
GetNumScoreComponents() const
|
||||
{
|
||||
size_t score_per_dir = m_collapseScores ? 1 : GetNumberOfTypes();
|
||||
return ((m_direction == Bidirectional)
|
||||
? 2 * score_per_dir + m_additionalScoreComponents
|
||||
: score_per_dir + m_additionalScoreComponents);
|
||||
}
|
||||
|
||||
void
|
||||
LRModel::
|
||||
ConfigureSparse(const std::map<std::string,std::string>& sparseArgs,
|
||||
const LexicalReordering* producer)
|
||||
{
|
||||
if (sparseArgs.size()) {
|
||||
m_sparse.reset(new SparseReordering(sparseArgs, producer));
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
LRModel::
|
||||
SetAdditionalScoreComponents(size_t number)
|
||||
{
|
||||
m_additionalScoreComponents = number;
|
||||
}
|
||||
|
||||
/// return orientation for the first phrase
|
||||
LRModel::ReorderingType
|
||||
LRModel::
|
||||
GetOrientation(Range const& cur) const
|
||||
{
|
||||
UTIL_THROW_IF2(m_modelType == None, "Reordering Model Type is None");
|
||||
return ((m_modelType == LeftRight) ? R :
|
||||
(cur.GetStartPos() == 0) ? M :
|
||||
(m_modelType == MSD) ? D :
|
||||
(m_modelType == MSLR) ? DR : NM);
|
||||
}
|
||||
|
||||
LRModel::ReorderingType
|
||||
LRModel::
|
||||
GetOrientation(Range const& prev, Range const& cur) const
|
||||
{
|
||||
UTIL_THROW_IF2(m_modelType == None, "No reordering model type specified");
|
||||
return ((m_modelType == LeftRight)
|
||||
? prev.GetEndPos() <= cur.GetStartPos() ? R : L
|
||||
: (cur.GetStartPos() == prev.GetEndPos() + 1) ? M
|
||||
: (m_modelType == Monotonic) ? NM
|
||||
: (prev.GetStartPos() == cur.GetEndPos() + 1) ? S
|
||||
: (m_modelType == MSD) ? D
|
||||
: (cur.GetStartPos() > prev.GetEndPos()) ? DR : DL);
|
||||
}
|
||||
|
||||
LRModel::ReorderingType
|
||||
LRModel::
|
||||
GetOrientation(int const reoDistance) const
|
||||
{
|
||||
// this one is for HierarchicalReorderingBackwardState
|
||||
return ((m_modelType == LeftRight)
|
||||
? (reoDistance >= 1) ? R : L
|
||||
: (reoDistance == 1) ? M
|
||||
: (m_modelType == Monotonic) ? NM
|
||||
: (reoDistance == -1) ? S
|
||||
: (m_modelType == MSD) ? D
|
||||
: (reoDistance > 1) ? DR : DL);
|
||||
}
|
||||
|
||||
LRModel::ReorderingType
|
||||
LRModel::
|
||||
GetOrientation(Range const& prev, Range const& cur,
|
||||
Bitmap const& cov) const
|
||||
{
|
||||
return ((m_modelType == LeftRight)
|
||||
? cur.GetStartPos() > prev.GetEndPos() ? R : L
|
||||
: IsMonotonicStep(prev,cur,cov) ? M
|
||||
: (m_modelType == Monotonic) ? NM
|
||||
: IsSwap(prev,cur,cov) ? S
|
||||
: (m_modelType == MSD) ? D
|
||||
: cur.GetStartPos() > prev.GetEndPos() ? DR : DL);
|
||||
}
|
||||
|
||||
LRModel::
|
||||
LRModel(const std::string &modelType)
|
||||
: m_modelString(modelType)
|
||||
, m_scoreProducer(NULL)
|
||||
, m_modelType(None)
|
||||
, m_phraseBased(true)
|
||||
, m_collapseScores(false)
|
||||
, m_direction(Backward)
|
||||
, m_additionalScoreComponents(0)
|
||||
{
|
||||
std::vector<std::string> config = Tokenize<std::string>(modelType, "-");
|
||||
|
||||
for (size_t i=0; i<config.size(); ++i) {
|
||||
if (config[i] == "hier") {
|
||||
m_phraseBased = false;
|
||||
} else if (config[i] == "phrase") {
|
||||
m_phraseBased = true;
|
||||
} else if (config[i] == "wbe") {
|
||||
m_phraseBased = true;
|
||||
}
|
||||
// no word-based decoding available, fall-back to phrase-based
|
||||
// This is the old lexical reordering model combination of moses
|
||||
|
||||
else if (config[i] == "msd") {
|
||||
m_modelType = MSD;
|
||||
} else if (config[i] == "mslr") {
|
||||
m_modelType = MSLR;
|
||||
} else if (config[i] == "monotonicity") {
|
||||
m_modelType = Monotonic;
|
||||
} else if (config[i] == "leftright") {
|
||||
m_modelType = LeftRight;
|
||||
}
|
||||
|
||||
// unidirectional is deprecated, use backward instead
|
||||
else if (config[i] == "unidirectional") {
|
||||
m_direction = Backward;
|
||||
} else if (config[i] == "backward") {
|
||||
m_direction = Backward;
|
||||
} else if (config[i] == "forward") {
|
||||
m_direction = Forward;
|
||||
} else if (config[i] == "bidirectional") {
|
||||
m_direction = Bidirectional;
|
||||
}
|
||||
|
||||
else if (config[i] == "f") {
|
||||
m_condition = F;
|
||||
} else if (config[i] == "fe") {
|
||||
m_condition = FE;
|
||||
}
|
||||
|
||||
else if (config[i] == "collapseff") {
|
||||
m_collapseScores = true;
|
||||
} else if (config[i] == "allff") {
|
||||
m_collapseScores = false;
|
||||
} else {
|
||||
std::cerr
|
||||
<< "Illegal part in the lexical reordering configuration string: "
|
||||
<< config[i] << std::endl;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
if (m_modelType == None) {
|
||||
std::cerr
|
||||
<< "You need to specify the type of the reordering model "
|
||||
<< "(msd, monotonicity,...)" << std::endl;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
LRState *
|
||||
LRModel::
|
||||
CreateLRState(const InputType &input) const
|
||||
{
|
||||
LRState *bwd = NULL, *fwd = NULL;
|
||||
size_t offset = 0;
|
||||
|
||||
switch(m_direction) {
|
||||
case Backward:
|
||||
case Bidirectional:
|
||||
if (m_phraseBased)
|
||||
bwd = new PhraseBasedReorderingState(*this, Backward, offset);
|
||||
else
|
||||
bwd = new HReorderingBackwardState(*this, offset);
|
||||
offset += m_collapseScores ? 1 : GetNumberOfTypes();
|
||||
if (m_direction == Backward) return bwd; // else fall through
|
||||
case Forward:
|
||||
if (m_phraseBased)
|
||||
fwd = new PhraseBasedReorderingState(*this, Forward, offset);
|
||||
else
|
||||
fwd = new HReorderingForwardState(*this, input.GetSize(), offset);
|
||||
offset += m_collapseScores ? 1 : GetNumberOfTypes();
|
||||
if (m_direction == Forward) return fwd;
|
||||
}
|
||||
return new BidirectionalReorderingState(*this, bwd, fwd, 0);
|
||||
}
|
||||
|
||||
}
|
||||
|
133
moses/FF/LexicalReordering/LRModel.h
Normal file
133
moses/FF/LexicalReordering/LRModel.h
Normal file
@ -0,0 +1,133 @@
|
||||
#pragma once
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <boost/scoped_ptr.hpp>
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
class Range;
|
||||
class Bitmap;
|
||||
class InputType;
|
||||
class LRState;
|
||||
class LexicalReordering;
|
||||
class SparseReordering;
|
||||
|
||||
//! Factory class for lexical reordering states
|
||||
class LRModel
|
||||
{
|
||||
public:
|
||||
friend class LexicalReordering;
|
||||
enum ModelType { Monotonic, MSD, MSLR, LeftRight, None };
|
||||
enum Direction { Forward, Backward, Bidirectional };
|
||||
enum Condition { F, E, FE };
|
||||
|
||||
// constants for the different types of reordering
|
||||
// (correspond to indices in the respective table)
|
||||
#if 0
|
||||
typedef int ReorderingType;
|
||||
static const ReorderingType M = 0; // monotonic
|
||||
static const ReorderingType NM = 1; // non-monotonic
|
||||
static const ReorderingType S = 1; // swap
|
||||
static const ReorderingType D = 2; // discontinuous
|
||||
static const ReorderingType DL = 2; // discontinuous, left
|
||||
static const ReorderingType DR = 3; // discontinuous, right
|
||||
static const ReorderingType R = 0; // right
|
||||
static const ReorderingType L = 1; // left
|
||||
static const ReorderingType MAX = 3; // largest possible
|
||||
#else
|
||||
enum ReorderingType {
|
||||
M = 0, // monotonic
|
||||
NM = 1, // non-monotonic
|
||||
S = 1, // swap
|
||||
D = 2, // discontinuous
|
||||
DL = 2, // discontinuous, left
|
||||
DR = 3, // discontinuous, right
|
||||
R = 0, // right
|
||||
L = 1, // left
|
||||
MAX = 3, // largest possible
|
||||
NONE = 4 // largest possible
|
||||
};
|
||||
#endif
|
||||
// determine orientation, depending on model:
|
||||
|
||||
|
||||
ReorderingType // for first phrase in phrase-based
|
||||
GetOrientation(Range const& cur) const;
|
||||
|
||||
ReorderingType // for non-first phrases in phrase-based
|
||||
GetOrientation(Range const& prev, Range const& cur) const;
|
||||
|
||||
ReorderingType // for HReorderingForwardState
|
||||
GetOrientation(Range const& prev, Range const& cur,
|
||||
Bitmap const& cov) const;
|
||||
|
||||
ReorderingType // for HReorderingBackwarddState
|
||||
GetOrientation(int const reoDistance) const;
|
||||
|
||||
LRModel(const std::string &modelType);
|
||||
|
||||
void
|
||||
ConfigureSparse(const std::map<std::string,std::string>& sparseArgs,
|
||||
const LexicalReordering* producer);
|
||||
|
||||
LRState*
|
||||
CreateLRState(const InputType &input) const;
|
||||
|
||||
size_t GetNumberOfTypes() const;
|
||||
size_t GetNumScoreComponents() const;
|
||||
void SetAdditionalScoreComponents(size_t number);
|
||||
|
||||
LexicalReordering*
|
||||
GetScoreProducer() const {
|
||||
return m_scoreProducer;
|
||||
}
|
||||
|
||||
ModelType GetModelType() const {
|
||||
return m_modelType;
|
||||
}
|
||||
Direction GetDirection() const {
|
||||
return m_direction;
|
||||
}
|
||||
Condition GetCondition() const {
|
||||
return m_condition;
|
||||
}
|
||||
|
||||
bool
|
||||
IsPhraseBased() const {
|
||||
return m_phraseBased;
|
||||
}
|
||||
|
||||
bool
|
||||
CollapseScores() const {
|
||||
return m_collapseScores;
|
||||
}
|
||||
|
||||
SparseReordering const*
|
||||
GetSparseReordering() const {
|
||||
return m_sparse.get();
|
||||
}
|
||||
|
||||
private:
|
||||
void
|
||||
SetScoreProducer(LexicalReordering* scoreProducer) {
|
||||
m_scoreProducer = scoreProducer;
|
||||
}
|
||||
|
||||
std::string const&
|
||||
GetModelString() const {
|
||||
return m_modelString;
|
||||
}
|
||||
|
||||
std::string m_modelString;
|
||||
LexicalReordering *m_scoreProducer;
|
||||
ModelType m_modelType;
|
||||
bool m_phraseBased;
|
||||
bool m_collapseScores;
|
||||
Direction m_direction;
|
||||
Condition m_condition;
|
||||
size_t m_additionalScoreComponents;
|
||||
boost::scoped_ptr<SparseReordering> m_sparse;
|
||||
};
|
||||
|
||||
}
|
||||
|
88
moses/FF/LexicalReordering/LRState.cpp
Normal file
88
moses/FF/LexicalReordering/LRState.cpp
Normal file
@ -0,0 +1,88 @@
|
||||
// -*- c++ -*-
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#include "LRState.h"
|
||||
#include "moses/FF/FFState.h"
|
||||
#include "moses/Hypothesis.h"
|
||||
#include "moses/Range.h"
|
||||
#include "moses/TranslationOption.h"
|
||||
#include "moses/Util.h"
|
||||
|
||||
#include "LexicalReordering.h"
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
void
|
||||
LRState::
|
||||
CopyScores(ScoreComponentCollection* accum,
|
||||
const TranslationOption &topt,
|
||||
const InputType& input,
|
||||
ReorderingType reoType) const
|
||||
{
|
||||
// don't call this on a bidirectional object
|
||||
UTIL_THROW_IF2(m_direction != LRModel::Backward &&
|
||||
m_direction != LRModel::Forward,
|
||||
"Unknown direction: " << m_direction);
|
||||
|
||||
TranslationOption const* relevantOpt = ((m_direction == LRModel::Backward)
|
||||
? &topt : m_prevOption);
|
||||
|
||||
LexicalReordering* producer = m_configuration.GetScoreProducer();
|
||||
Scores const* cached = relevantOpt->GetLexReorderingScores(producer);
|
||||
|
||||
// The approach here is bizarre! Why create a whole vector and do
|
||||
// vector addition (acumm->PlusEquals) to update a single value? - UG
|
||||
size_t off_remote = m_offset + reoType;
|
||||
size_t off_local = m_configuration.CollapseScores() ? m_offset : off_remote;
|
||||
|
||||
UTIL_THROW_IF2(off_local >= producer->GetNumScoreComponents(),
|
||||
"offset out of vector bounds!");
|
||||
|
||||
// look up applicable score from vectore of scores
|
||||
if(cached) {
|
||||
UTIL_THROW_IF2(off_remote >= cached->size(), "offset out of vector bounds!");
|
||||
Scores scores(producer->GetNumScoreComponents(),0);
|
||||
scores[off_local ] = (*cached)[off_remote];
|
||||
accum->PlusEquals(producer, scores);
|
||||
}
|
||||
|
||||
// else: use default scores (if specified)
|
||||
else if (producer->GetHaveDefaultScores()) {
|
||||
Scores scores(producer->GetNumScoreComponents(),0);
|
||||
scores[off_local] = producer->GetDefaultScore(off_remote);
|
||||
accum->PlusEquals(m_configuration.GetScoreProducer(), scores);
|
||||
}
|
||||
// note: if no default score, no cost
|
||||
|
||||
const SparseReordering* sparse = m_configuration.GetSparseReordering();
|
||||
if (sparse) sparse->CopyScores(*relevantOpt, m_prevOption, input, reoType,
|
||||
m_direction, accum);
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
LRState::
|
||||
ComparePrevScores(const TranslationOption *other) const
|
||||
{
|
||||
LexicalReordering* producer = m_configuration.GetScoreProducer();
|
||||
const Scores* myScores = m_prevOption->GetLexReorderingScores(producer);
|
||||
const Scores* yrScores = other->GetLexReorderingScores(producer);
|
||||
|
||||
if(myScores == yrScores) return 0;
|
||||
|
||||
// The pointers are NULL if a phrase pair isn't found in the reordering table.
|
||||
if(yrScores == NULL) return -1;
|
||||
if(myScores == NULL) return 1;
|
||||
|
||||
size_t stop = m_offset + m_configuration.GetNumberOfTypes();
|
||||
for(size_t i = m_offset; i < stop; i++) {
|
||||
if((*myScores)[i] < (*yrScores)[i]) return -1;
|
||||
if((*myScores)[i] > (*yrScores)[i]) return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
}
|
||||
|
81
moses/FF/LexicalReordering/LRState.h
Normal file
81
moses/FF/LexicalReordering/LRState.h
Normal file
@ -0,0 +1,81 @@
|
||||
// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*-
|
||||
#pragma once
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#include "moses/Hypothesis.h"
|
||||
#include "moses/ScoreComponentCollection.h"
|
||||
#include "moses/Range.h"
|
||||
#include "moses/Bitmap.h"
|
||||
#include "moses/TranslationOption.h"
|
||||
#include "moses/FF/FFState.h"
|
||||
#include "LRModel.h"
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
//! Abstract class for lexical reordering model states
|
||||
class LRState : public FFState
|
||||
{
|
||||
public:
|
||||
|
||||
typedef LRModel::ReorderingType ReorderingType;
|
||||
|
||||
virtual
|
||||
LRState*
|
||||
Expand(const TranslationOption& hypo, const InputType& input,
|
||||
ScoreComponentCollection* scores) const = 0;
|
||||
|
||||
static
|
||||
LRState*
|
||||
CreateLRState(const std::vector<std::string>& config,
|
||||
LRModel::Direction dir,
|
||||
const InputType &input);
|
||||
|
||||
protected:
|
||||
|
||||
const LRModel& m_configuration;
|
||||
|
||||
// The following is the true direction of the object, which can be
|
||||
// Backward or Forward even if the Configuration has Bidirectional.
|
||||
LRModel::Direction m_direction;
|
||||
size_t m_offset;
|
||||
//forward scores are conditioned on prev option, so need to remember it
|
||||
const TranslationOption *m_prevOption;
|
||||
|
||||
inline
|
||||
LRState(const LRState *prev,
|
||||
const TranslationOption &topt)
|
||||
: m_configuration(prev->m_configuration)
|
||||
, m_direction(prev->m_direction)
|
||||
, m_offset(prev->m_offset)
|
||||
, m_prevOption(&topt)
|
||||
{ }
|
||||
|
||||
inline
|
||||
LRState(const LRModel &config,
|
||||
LRModel::Direction dir,
|
||||
size_t offset)
|
||||
: m_configuration(config)
|
||||
, m_direction(dir)
|
||||
, m_offset(offset)
|
||||
, m_prevOption(NULL)
|
||||
{ }
|
||||
|
||||
// copy the right scores in the right places, taking into account
|
||||
// forward/backward, offset, collapse
|
||||
void
|
||||
CopyScores(ScoreComponentCollection* scores,
|
||||
const TranslationOption& topt,
|
||||
const InputType& input, ReorderingType reoType) const;
|
||||
|
||||
int
|
||||
ComparePrevScores(const TranslationOption *other) const;
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
@ -5,7 +5,7 @@
|
||||
#include "moses/FF/FFState.h"
|
||||
#include "moses/TranslationOptionList.h"
|
||||
#include "LexicalReordering.h"
|
||||
#include "LexicalReorderingState.h"
|
||||
#include "LRState.h"
|
||||
#include "moses/StaticData.h"
|
||||
#include "moses/Util.h"
|
||||
#include "moses/InputPath.h"
|
||||
|
@ -14,7 +14,7 @@
|
||||
#include "moses/FF/StatefulFeatureFunction.h"
|
||||
#include "util/exception.hh"
|
||||
|
||||
#include "LexicalReorderingState.h"
|
||||
#include "LRState.h"
|
||||
#include "LexicalReorderingTable.h"
|
||||
#include "SparseReordering.h"
|
||||
|
||||
|
@ -1,506 +0,0 @@
|
||||
// -*- c++ -*-
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#include "moses/FF/FFState.h"
|
||||
#include "moses/Hypothesis.h"
|
||||
#include "moses/Range.h"
|
||||
#include "moses/TranslationOption.h"
|
||||
#include "moses/Util.h"
|
||||
|
||||
#include "LexicalReordering.h"
|
||||
#include "LexicalReorderingState.h"
|
||||
#include "ReorderingStack.h"
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
bool
|
||||
IsMonotonicStep(Range const& prev, // words range of last source phrase
|
||||
Range const& cur, // words range of current source phrase
|
||||
Bitmap const& cov) // coverage bitmap
|
||||
{
|
||||
size_t e = prev.GetEndPos() + 1;
|
||||
size_t s = cur.GetStartPos();
|
||||
return (s == e || (s >= e && !cov.GetValue(e)));
|
||||
}
|
||||
|
||||
bool
|
||||
IsSwap(Range const& prev, Range const& cur, Bitmap const& cov)
|
||||
{
|
||||
size_t s = prev.GetStartPos();
|
||||
size_t e = cur.GetEndPos();
|
||||
return (e+1 == s || (e < s && !cov.GetValue(s-1)));
|
||||
}
|
||||
|
||||
size_t
|
||||
LRModel::
|
||||
GetNumberOfTypes() const
|
||||
{
|
||||
return ((m_modelType == MSD) ? 3 :
|
||||
(m_modelType == MSLR) ? 4 : 2);
|
||||
}
|
||||
|
||||
size_t
|
||||
LRModel::
|
||||
GetNumScoreComponents() const
|
||||
{
|
||||
size_t score_per_dir = m_collapseScores ? 1 : GetNumberOfTypes();
|
||||
return ((m_direction == Bidirectional)
|
||||
? 2 * score_per_dir + m_additionalScoreComponents
|
||||
: score_per_dir + m_additionalScoreComponents);
|
||||
}
|
||||
|
||||
void
|
||||
LRModel::
|
||||
ConfigureSparse(const std::map<std::string,std::string>& sparseArgs,
|
||||
const LexicalReordering* producer)
|
||||
{
|
||||
if (sparseArgs.size()) {
|
||||
m_sparse.reset(new SparseReordering(sparseArgs, producer));
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
LRModel::
|
||||
SetAdditionalScoreComponents(size_t number)
|
||||
{
|
||||
m_additionalScoreComponents = number;
|
||||
}
|
||||
|
||||
/// return orientation for the first phrase
|
||||
LRModel::ReorderingType
|
||||
LRModel::
|
||||
GetOrientation(Range const& cur) const
|
||||
{
|
||||
UTIL_THROW_IF2(m_modelType == None, "Reordering Model Type is None");
|
||||
return ((m_modelType == LeftRight) ? R :
|
||||
(cur.GetStartPos() == 0) ? M :
|
||||
(m_modelType == MSD) ? D :
|
||||
(m_modelType == MSLR) ? DR : NM);
|
||||
}
|
||||
|
||||
LRModel::ReorderingType
|
||||
LRModel::
|
||||
GetOrientation(Range const& prev, Range const& cur) const
|
||||
{
|
||||
UTIL_THROW_IF2(m_modelType == None, "No reordering model type specified");
|
||||
return ((m_modelType == LeftRight)
|
||||
? prev.GetEndPos() <= cur.GetStartPos() ? R : L
|
||||
: (cur.GetStartPos() == prev.GetEndPos() + 1) ? M
|
||||
: (m_modelType == Monotonic) ? NM
|
||||
: (prev.GetStartPos() == cur.GetEndPos() + 1) ? S
|
||||
: (m_modelType == MSD) ? D
|
||||
: (cur.GetStartPos() > prev.GetEndPos()) ? DR : DL);
|
||||
}
|
||||
|
||||
LRModel::ReorderingType
|
||||
LRModel::
|
||||
GetOrientation(int const reoDistance) const
|
||||
{
|
||||
// this one is for HierarchicalReorderingBackwardState
|
||||
return ((m_modelType == LeftRight)
|
||||
? (reoDistance >= 1) ? R : L
|
||||
: (reoDistance == 1) ? M
|
||||
: (m_modelType == Monotonic) ? NM
|
||||
: (reoDistance == -1) ? S
|
||||
: (m_modelType == MSD) ? D
|
||||
: (reoDistance > 1) ? DR : DL);
|
||||
}
|
||||
|
||||
LRModel::ReorderingType
|
||||
LRModel::
|
||||
GetOrientation(Range const& prev, Range const& cur,
|
||||
Bitmap const& cov) const
|
||||
{
|
||||
return ((m_modelType == LeftRight)
|
||||
? cur.GetStartPos() > prev.GetEndPos() ? R : L
|
||||
: IsMonotonicStep(prev,cur,cov) ? M
|
||||
: (m_modelType == Monotonic) ? NM
|
||||
: IsSwap(prev,cur,cov) ? S
|
||||
: (m_modelType == MSD) ? D
|
||||
: cur.GetStartPos() > prev.GetEndPos() ? DR : DL);
|
||||
}
|
||||
|
||||
LRModel::
|
||||
LRModel(const std::string &modelType)
|
||||
: m_modelString(modelType)
|
||||
, m_scoreProducer(NULL)
|
||||
, m_modelType(None)
|
||||
, m_phraseBased(true)
|
||||
, m_collapseScores(false)
|
||||
, m_direction(Backward)
|
||||
, m_additionalScoreComponents(0)
|
||||
{
|
||||
std::vector<std::string> config = Tokenize<std::string>(modelType, "-");
|
||||
|
||||
for (size_t i=0; i<config.size(); ++i) {
|
||||
if (config[i] == "hier") {
|
||||
m_phraseBased = false;
|
||||
} else if (config[i] == "phrase") {
|
||||
m_phraseBased = true;
|
||||
} else if (config[i] == "wbe") {
|
||||
m_phraseBased = true;
|
||||
}
|
||||
// no word-based decoding available, fall-back to phrase-based
|
||||
// This is the old lexical reordering model combination of moses
|
||||
|
||||
else if (config[i] == "msd") {
|
||||
m_modelType = MSD;
|
||||
} else if (config[i] == "mslr") {
|
||||
m_modelType = MSLR;
|
||||
} else if (config[i] == "monotonicity") {
|
||||
m_modelType = Monotonic;
|
||||
} else if (config[i] == "leftright") {
|
||||
m_modelType = LeftRight;
|
||||
}
|
||||
|
||||
// unidirectional is deprecated, use backward instead
|
||||
else if (config[i] == "unidirectional") {
|
||||
m_direction = Backward;
|
||||
} else if (config[i] == "backward") {
|
||||
m_direction = Backward;
|
||||
} else if (config[i] == "forward") {
|
||||
m_direction = Forward;
|
||||
} else if (config[i] == "bidirectional") {
|
||||
m_direction = Bidirectional;
|
||||
}
|
||||
|
||||
else if (config[i] == "f") {
|
||||
m_condition = F;
|
||||
} else if (config[i] == "fe") {
|
||||
m_condition = FE;
|
||||
}
|
||||
|
||||
else if (config[i] == "collapseff") {
|
||||
m_collapseScores = true;
|
||||
} else if (config[i] == "allff") {
|
||||
m_collapseScores = false;
|
||||
} else {
|
||||
std::cerr
|
||||
<< "Illegal part in the lexical reordering configuration string: "
|
||||
<< config[i] << std::endl;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
if (m_modelType == None) {
|
||||
std::cerr
|
||||
<< "You need to specify the type of the reordering model "
|
||||
<< "(msd, monotonicity,...)" << std::endl;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
LRState *
|
||||
LRModel::
|
||||
CreateLRState(const InputType &input) const
|
||||
{
|
||||
LRState *bwd = NULL, *fwd = NULL;
|
||||
size_t offset = 0;
|
||||
|
||||
switch(m_direction) {
|
||||
case Backward:
|
||||
case Bidirectional:
|
||||
if (m_phraseBased)
|
||||
bwd = new PhraseBasedReorderingState(*this, Backward, offset);
|
||||
else
|
||||
bwd = new HReorderingBackwardState(*this, offset);
|
||||
offset += m_collapseScores ? 1 : GetNumberOfTypes();
|
||||
if (m_direction == Backward) return bwd; // else fall through
|
||||
case Forward:
|
||||
if (m_phraseBased)
|
||||
fwd = new PhraseBasedReorderingState(*this, Forward, offset);
|
||||
else
|
||||
fwd = new HReorderingForwardState(*this, input.GetSize(), offset);
|
||||
offset += m_collapseScores ? 1 : GetNumberOfTypes();
|
||||
if (m_direction == Forward) return fwd;
|
||||
}
|
||||
return new BidirectionalReorderingState(*this, bwd, fwd, 0);
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
LRState::
|
||||
CopyScores(ScoreComponentCollection* accum,
|
||||
const TranslationOption &topt,
|
||||
const InputType& input,
|
||||
ReorderingType reoType) const
|
||||
{
|
||||
// don't call this on a bidirectional object
|
||||
UTIL_THROW_IF2(m_direction != LRModel::Backward &&
|
||||
m_direction != LRModel::Forward,
|
||||
"Unknown direction: " << m_direction);
|
||||
|
||||
TranslationOption const* relevantOpt = ((m_direction == LRModel::Backward)
|
||||
? &topt : m_prevOption);
|
||||
|
||||
LexicalReordering* producer = m_configuration.GetScoreProducer();
|
||||
Scores const* cached = relevantOpt->GetLexReorderingScores(producer);
|
||||
|
||||
// The approach here is bizarre! Why create a whole vector and do
|
||||
// vector addition (acumm->PlusEquals) to update a single value? - UG
|
||||
size_t off_remote = m_offset + reoType;
|
||||
size_t off_local = m_configuration.CollapseScores() ? m_offset : off_remote;
|
||||
|
||||
UTIL_THROW_IF2(off_local >= producer->GetNumScoreComponents(),
|
||||
"offset out of vector bounds!");
|
||||
|
||||
// look up applicable score from vectore of scores
|
||||
if(cached) {
|
||||
UTIL_THROW_IF2(off_remote >= cached->size(), "offset out of vector bounds!");
|
||||
Scores scores(producer->GetNumScoreComponents(),0);
|
||||
scores[off_local ] = (*cached)[off_remote];
|
||||
accum->PlusEquals(producer, scores);
|
||||
}
|
||||
|
||||
// else: use default scores (if specified)
|
||||
else if (producer->GetHaveDefaultScores()) {
|
||||
Scores scores(producer->GetNumScoreComponents(),0);
|
||||
scores[off_local] = producer->GetDefaultScore(off_remote);
|
||||
accum->PlusEquals(m_configuration.GetScoreProducer(), scores);
|
||||
}
|
||||
// note: if no default score, no cost
|
||||
|
||||
const SparseReordering* sparse = m_configuration.GetSparseReordering();
|
||||
if (sparse) sparse->CopyScores(*relevantOpt, m_prevOption, input, reoType,
|
||||
m_direction, accum);
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
LRState::
|
||||
ComparePrevScores(const TranslationOption *other) const
|
||||
{
|
||||
LexicalReordering* producer = m_configuration.GetScoreProducer();
|
||||
const Scores* myScores = m_prevOption->GetLexReorderingScores(producer);
|
||||
const Scores* yrScores = other->GetLexReorderingScores(producer);
|
||||
|
||||
if(myScores == yrScores) return 0;
|
||||
|
||||
// The pointers are NULL if a phrase pair isn't found in the reordering table.
|
||||
if(yrScores == NULL) return -1;
|
||||
if(myScores == NULL) return 1;
|
||||
|
||||
size_t stop = m_offset + m_configuration.GetNumberOfTypes();
|
||||
for(size_t i = m_offset; i < stop; i++) {
|
||||
if((*myScores)[i] < (*yrScores)[i]) return -1;
|
||||
if((*myScores)[i] > (*yrScores)[i]) return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// ===========================================================================
|
||||
// PHRASE BASED REORDERING STATE
|
||||
// ===========================================================================
|
||||
bool PhraseBasedReorderingState::m_useFirstBackwardScore = true;
|
||||
|
||||
PhraseBasedReorderingState::
|
||||
PhraseBasedReorderingState(const PhraseBasedReorderingState *prev,
|
||||
const TranslationOption &topt)
|
||||
: LRState(prev, topt)
|
||||
, m_prevRange(topt.GetSourceWordsRange())
|
||||
, m_first(false)
|
||||
{ }
|
||||
|
||||
|
||||
PhraseBasedReorderingState::
|
||||
PhraseBasedReorderingState(const LRModel &config,
|
||||
LRModel::Direction dir, size_t offset)
|
||||
: LRState(config, dir, offset)
|
||||
, m_prevRange(NOT_FOUND,NOT_FOUND)
|
||||
, m_first(true)
|
||||
{ }
|
||||
|
||||
|
||||
size_t PhraseBasedReorderingState::hash() const
|
||||
{
|
||||
size_t ret;
|
||||
ret = hash_value(m_prevRange);
|
||||
boost::hash_combine(ret, m_direction);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
bool PhraseBasedReorderingState::operator==(const FFState& o) const
|
||||
{
|
||||
if (&o == this) return true;
|
||||
|
||||
const PhraseBasedReorderingState &other = static_cast<const PhraseBasedReorderingState&>(o);
|
||||
if (m_prevRange == other.m_prevRange) {
|
||||
if (m_direction == LRModel::Forward) {
|
||||
int compareScore = ComparePrevScores(other.m_prevOption);
|
||||
return compareScore == 0;
|
||||
} else {
|
||||
return true;
|
||||
}
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
LRState*
|
||||
PhraseBasedReorderingState::
|
||||
Expand(const TranslationOption& topt, const InputType& input,
|
||||
ScoreComponentCollection* scores) const
|
||||
{
|
||||
// const LRModel::ModelType modelType = m_configuration.GetModelType();
|
||||
|
||||
if ((m_direction != LRModel::Forward && m_useFirstBackwardScore) || !m_first) {
|
||||
LRModel const& lrmodel = m_configuration;
|
||||
Range const cur = topt.GetSourceWordsRange();
|
||||
LRModel::ReorderingType reoType = (m_first ? lrmodel.GetOrientation(cur)
|
||||
: lrmodel.GetOrientation(m_prevRange,cur));
|
||||
CopyScores(scores, topt, input, reoType);
|
||||
}
|
||||
return new PhraseBasedReorderingState(this, topt);
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////
|
||||
//BidirectionalReorderingState
|
||||
|
||||
size_t BidirectionalReorderingState::hash() const
|
||||
{
|
||||
size_t ret = m_backward->hash();
|
||||
boost::hash_combine(ret, m_forward->hash());
|
||||
return ret;
|
||||
}
|
||||
|
||||
bool BidirectionalReorderingState::operator==(const FFState& o) const
|
||||
{
|
||||
if (&o == this) return 0;
|
||||
|
||||
BidirectionalReorderingState const &other
|
||||
= static_cast<BidirectionalReorderingState const&>(o);
|
||||
|
||||
bool ret = (*m_backward == *other.m_backward) && (*m_forward == *other.m_forward);
|
||||
return ret;
|
||||
}
|
||||
|
||||
LRState*
|
||||
BidirectionalReorderingState::
|
||||
Expand(const TranslationOption& topt, const InputType& input,
|
||||
ScoreComponentCollection* scores) const
|
||||
{
|
||||
LRState *newbwd = m_backward->Expand(topt,input, scores);
|
||||
LRState *newfwd = m_forward->Expand(topt, input, scores);
|
||||
return new BidirectionalReorderingState(m_configuration, newbwd, newfwd, m_offset);
|
||||
}
|
||||
|
||||
///////////////////////////
|
||||
//HierarchicalReorderingBackwardState
|
||||
|
||||
HReorderingBackwardState::
|
||||
HReorderingBackwardState(const HReorderingBackwardState *prev,
|
||||
const TranslationOption &topt,
|
||||
ReorderingStack reoStack)
|
||||
: LRState(prev, topt), m_reoStack(reoStack)
|
||||
{ }
|
||||
|
||||
HReorderingBackwardState::
|
||||
HReorderingBackwardState(const LRModel &config, size_t offset)
|
||||
: LRState(config, LRModel::Backward, offset)
|
||||
{ }
|
||||
|
||||
size_t HReorderingBackwardState::hash() const
|
||||
{
|
||||
size_t ret = m_reoStack.hash();
|
||||
return ret;
|
||||
}
|
||||
|
||||
bool HReorderingBackwardState::operator==(const FFState& o) const
|
||||
{
|
||||
const HReorderingBackwardState& other
|
||||
= static_cast<const HReorderingBackwardState&>(o);
|
||||
bool ret = m_reoStack == other.m_reoStack;
|
||||
return ret;
|
||||
}
|
||||
|
||||
LRState*
|
||||
HReorderingBackwardState::
|
||||
Expand(const TranslationOption& topt, const InputType& input,
|
||||
ScoreComponentCollection* scores) const
|
||||
{
|
||||
HReorderingBackwardState* nextState;
|
||||
nextState = new HReorderingBackwardState(this, topt, m_reoStack);
|
||||
Range swrange = topt.GetSourceWordsRange();
|
||||
int reoDistance = nextState->m_reoStack.ShiftReduce(swrange);
|
||||
ReorderingType reoType = m_configuration.GetOrientation(reoDistance);
|
||||
CopyScores(scores, topt, input, reoType);
|
||||
return nextState;
|
||||
}
|
||||
|
||||
///////////////////////////
|
||||
//HReorderingForwardState
|
||||
|
||||
HReorderingForwardState::
|
||||
HReorderingForwardState(const LRModel &config,
|
||||
size_t size, size_t offset)
|
||||
: LRState(config, LRModel::Forward, offset)
|
||||
, m_first(true)
|
||||
, m_prevRange(NOT_FOUND,NOT_FOUND)
|
||||
, m_coverage(size)
|
||||
{ }
|
||||
|
||||
HReorderingForwardState::
|
||||
HReorderingForwardState(const HReorderingForwardState *prev,
|
||||
const TranslationOption &topt)
|
||||
: LRState(prev, topt)
|
||||
, m_first(false)
|
||||
, m_prevRange(topt.GetSourceWordsRange())
|
||||
, m_coverage(prev->m_coverage, topt.GetSourceWordsRange())
|
||||
{
|
||||
}
|
||||
|
||||
size_t HReorderingForwardState::hash() const
|
||||
{
|
||||
size_t ret;
|
||||
ret = hash_value(m_prevRange);
|
||||
return ret;
|
||||
}
|
||||
|
||||
bool HReorderingForwardState::operator==(const FFState& o) const
|
||||
{
|
||||
if (&o == this) return true;
|
||||
|
||||
HReorderingForwardState const& other
|
||||
= static_cast<HReorderingForwardState const&>(o);
|
||||
|
||||
int compareScores = ((m_prevRange == other.m_prevRange)
|
||||
? ComparePrevScores(other.m_prevOption)
|
||||
: (m_prevRange < other.m_prevRange) ? -1 : 1);
|
||||
return compareScores == 0;
|
||||
}
|
||||
|
||||
// For compatibility with the phrase-based reordering model, scoring is one
|
||||
// step delayed.
|
||||
// The forward model takes determines orientations heuristically as follows:
|
||||
// mono: if the next phrase comes after the conditioning phrase and
|
||||
// - there is a gap to the right of the conditioning phrase, or
|
||||
// - the next phrase immediately follows it
|
||||
// swap: if the next phrase goes before the conditioning phrase and
|
||||
// - there is a gap to the left of the conditioning phrase, or
|
||||
// - the next phrase immediately precedes it
|
||||
// dright: if the next phrase follows the conditioning phrase and other
|
||||
// stuff comes in between
|
||||
// dleft: if the next phrase precedes the conditioning phrase and other
|
||||
// stuff comes in between
|
||||
|
||||
LRState*
|
||||
HReorderingForwardState::
|
||||
Expand(TranslationOption const& topt, InputType const& input,
|
||||
ScoreComponentCollection* scores) const
|
||||
{
|
||||
const Range cur = topt.GetSourceWordsRange();
|
||||
// keep track of the current coverage ourselves so we don't need the hypothesis
|
||||
Bitmap cov(m_coverage, cur);
|
||||
if (!m_first) {
|
||||
LRModel::ReorderingType reoType;
|
||||
reoType = m_configuration.GetOrientation(m_prevRange,cur,cov);
|
||||
CopyScores(scores, topt, input, reoType);
|
||||
}
|
||||
return new HReorderingForwardState(this, topt);
|
||||
}
|
||||
}
|
||||
|
@ -1,308 +0,0 @@
|
||||
// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*-
|
||||
#pragma once
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#include <boost/scoped_ptr.hpp>
|
||||
|
||||
#include "moses/Hypothesis.h"
|
||||
#include "moses/ScoreComponentCollection.h"
|
||||
#include "moses/Range.h"
|
||||
#include "moses/Bitmap.h"
|
||||
#include "moses/TranslationOption.h"
|
||||
#include "moses/FF/FFState.h"
|
||||
#include "ReorderingStack.h"
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
class LRState;
|
||||
class LexicalReordering;
|
||||
class SparseReordering;
|
||||
|
||||
//! Factory class for lexical reordering states
|
||||
class LRModel
{
public:
  friend class LexicalReordering;
  // Which reordering taxonomy the model distinguishes.
  enum ModelType { Monotonic, MSD, MSLR, LeftRight, None };
  // Which transitions are scored.
  enum Direction { Forward, Backward, Bidirectional };
  // Conditioning side(s): source, target, or both.
  enum Condition { F, E, FE };

  // constants for the different types of reordering
  // (correspond to indices in the respective table)
#if 0
  typedef int ReorderingType;
  static const ReorderingType M = 0; // monotonic
  static const ReorderingType NM = 1; // non-monotonic
  static const ReorderingType S = 1; // swap
  static const ReorderingType D = 2; // discontinuous
  static const ReorderingType DL = 2; // discontinuous, left
  static const ReorderingType DR = 3; // discontinuous, right
  static const ReorderingType R = 0; // right
  static const ReorderingType L = 1; // left
  static const ReorderingType MAX = 3; // largest possible
#else
  enum ReorderingType {
    M = 0, // monotonic
    NM = 1, // non-monotonic
    S = 1, // swap
    D = 2, // discontinuous
    DL = 2, // discontinuous, left
    DR = 3, // discontinuous, right
    R = 0, // right
    L = 1, // left
    MAX = 3, // largest possible orientation index
    NONE = 4 // no orientation (sentinel)
  };
#endif
  // determine orientation, depending on model:

  ReorderingType // for first phrase in phrase-based
  GetOrientation(Range const& cur) const;

  ReorderingType // for non-first phrases in phrase-based
  GetOrientation(Range const& prev, Range const& cur) const;

  ReorderingType // for HReorderingForwardState
  GetOrientation(Range const& prev, Range const& cur,
                 Bitmap const& cov) const;

  ReorderingType // for HReorderingBackwardState
  GetOrientation(int const reoDistance) const;

  // Parse the textual model specification (e.g. "msd-bidirectional-fe").
  LRModel(const std::string &modelType);

  // Configure optional sparse reordering features from key/value arguments.
  void
  ConfigureSparse(const std::map<std::string,std::string>& sparseArgs,
                  const LexicalReordering* producer);

  // Build the initial reordering state for a new input sentence.
  LRState*
  CreateLRState(const InputType &input) const;

  size_t GetNumberOfTypes() const;
  size_t GetNumScoreComponents() const;
  void SetAdditionalScoreComponents(size_t number);

  LexicalReordering*
  GetScoreProducer() const {
    return m_scoreProducer;
  }

  ModelType GetModelType() const {
    return m_modelType;
  }
  Direction GetDirection() const {
    return m_direction;
  }
  Condition GetCondition() const {
    return m_condition;
  }

  bool
  IsPhraseBased() const {
    return m_phraseBased;
  }

  bool
  CollapseScores() const {
    return m_collapseScores;
  }

  // May be NULL when no sparse features are configured.
  SparseReordering const*
  GetSparseReordering() const {
    return m_sparse.get();
  }

private:
  void
  SetScoreProducer(LexicalReordering* scoreProducer) {
    m_scoreProducer = scoreProducer;
  }

  std::string const&
  GetModelString() const {
    return m_modelString;
  }

  std::string m_modelString;           // original specification string
  LexicalReordering *m_scoreProducer;  // owning feature function (not owned here)
  ModelType m_modelType;
  bool m_phraseBased;                  // phrase-based vs. hierarchical states
  bool m_collapseScores;               // fold all orientations into one component
  Direction m_direction;
  Condition m_condition;
  size_t m_additionalScoreComponents;
  boost::scoped_ptr<SparseReordering> m_sparse;
};
|
||||
|
||||
//! Abstract class for lexical reordering model states
|
||||
class LRState : public FFState
{
public:

  typedef LRModel::ReorderingType ReorderingType;

  // Score the transition to 'hypo' (adding to 'scores') and return the
  // successor state. Ownership of the returned state passes to the caller.
  virtual
  LRState*
  Expand(const TranslationOption& hypo, const InputType& input,
         ScoreComponentCollection* scores) const = 0;

  static
  LRState*
  CreateLRState(const std::vector<std::string>& config,
                LRModel::Direction dir,
                const InputType &input);

protected:

  // Model configuration shared by all states of one feature instance.
  const LRModel& m_configuration;

  // The following is the true direction of the object, which can be
  // Backward or Forward even if the Configuration has Bidirectional.
  LRModel::Direction m_direction;
  // First feature-vector component used by this directional state.
  size_t m_offset;
  //forward scores are conditioned on prev option, so need to remember it
  const TranslationOption *m_prevOption;

  // Successor constructor: inherits configuration, direction and offset
  // from the predecessor state and remembers the applied option.
  inline
  LRState(const LRState *prev,
          const TranslationOption &topt)
    : m_configuration(prev->m_configuration)
    , m_direction(prev->m_direction)
    , m_offset(prev->m_offset)
    , m_prevOption(&topt)
  { }

  // Initial-state constructor: no previous option yet.
  inline
  LRState(const LRModel &config,
          LRModel::Direction dir,
          size_t offset)
    : m_configuration(config)
    , m_direction(dir)
    , m_offset(offset)
    , m_prevOption(NULL)
  { }

  // copy the right scores in the right places, taking into account
  // forward/backward, offset, collapse
  void
  CopyScores(ScoreComponentCollection* scores,
             const TranslationOption& topt,
             const InputType& input, ReorderingType reoType) const;

  // Three-way comparison of previous-option reordering scores; used to
  // decide whether two states may recombine.
  int
  ComparePrevScores(const TranslationOption *other) const;
};
|
||||
|
||||
//! @todo what is this?
|
||||
class BidirectionalReorderingState
  : public LRState
{
private:
  // Wrapped directional sub-states; owned by this object (deleted in dtor).
  const LRState *m_backward;
  const LRState *m_forward;
public:
  BidirectionalReorderingState(const LRModel &config,
                               const LRState *bw,
                               const LRState *fw, size_t offset)
    : LRState(config,
              LRModel::Bidirectional,
              offset)
    , m_backward(bw)
    , m_forward(fw)
  { }

  ~BidirectionalReorderingState() {
    delete m_backward;
    delete m_forward;
  }

  virtual size_t hash() const;
  virtual bool operator==(const FFState& other) const;

  // Expands both sub-states and wraps the results in a new instance.
  LRState*
  Expand(const TranslationOption& topt, const InputType& input,
         ScoreComponentCollection* scores) const;
};
|
||||
|
||||
//! State for the standard Moses implementation of lexical reordering models
|
||||
//! (see Koehn et al, Edinburgh System Description for the 2005 NIST MT
|
||||
//! Evaluation)
|
||||
class PhraseBasedReorderingState
  : public LRState
{
private:
  // Source range of the most recently translated phrase.
  Range m_prevRange;
  // True until the first phrase has been translated.
  bool m_first;
public:
  // When false, the backward model skips scoring the very first phrase.
  static bool m_useFirstBackwardScore;
  PhraseBasedReorderingState(const LRModel &config,
                             LRModel::Direction dir,
                             size_t offset);
  PhraseBasedReorderingState(const PhraseBasedReorderingState *prev,
                             const TranslationOption &topt);

  virtual size_t hash() const;
  virtual bool operator==(const FFState& other) const;

  virtual
  LRState*
  Expand(const TranslationOption& topt,const InputType& input,
         ScoreComponentCollection* scores) const;

  // Orientation of 'currRange' relative to m_prevRange under each taxonomy.
  ReorderingType GetOrientationTypeMSD(Range currRange) const;
  ReorderingType GetOrientationTypeMSLR(Range currRange) const;
  ReorderingType GetOrientationTypeMonotonic(Range currRange) const;
  ReorderingType GetOrientationTypeLeftRight(Range currRange) const;
};
|
||||
|
||||
//! State for a hierarchical reordering model (see Galley and Manning, A
|
||||
//! Simple and Effective Hierarchical Phrase Reordering Model, EMNLP 2008)
|
||||
//! backward state (conditioned on the previous phrase)
|
||||
class HReorderingBackwardState : public LRState
{
private:
  // Shift/reduce stack of translated source ranges; determines orientation.
  ReorderingStack m_reoStack;
public:
  HReorderingBackwardState(const LRModel &config, size_t offset);
  HReorderingBackwardState(const HReorderingBackwardState *prev,
                           const TranslationOption &topt,
                           ReorderingStack reoStack);
  virtual size_t hash() const;
  virtual bool operator==(const FFState& other) const;

  virtual LRState* Expand(const TranslationOption& hypo, const InputType& input,
                          ScoreComponentCollection* scores) const;

private:
  // Map a shift/reduce distance to an orientation under each taxonomy.
  ReorderingType GetOrientationTypeMSD(int reoDistance) const;
  ReorderingType GetOrientationTypeMSLR(int reoDistance) const;
  ReorderingType GetOrientationTypeMonotonic(int reoDistance) const;
  ReorderingType GetOrientationTypeLeftRight(int reoDistance) const;
};
|
||||
|
||||
|
||||
//!forward state (conditioned on the next phrase)
|
||||
class HReorderingForwardState : public LRState
{
private:
  // True until the first phrase has been translated.
  bool m_first;
  // Source range of the most recently translated phrase.
  Range m_prevRange;
  // Source positions covered so far (tracked without the hypothesis object).
  Bitmap m_coverage;

public:
  HReorderingForwardState(const LRModel &config, size_t sentenceLength,
                          size_t offset);
  HReorderingForwardState(const HReorderingForwardState *prev,
                          const TranslationOption &topt);

  virtual size_t hash() const;
  virtual bool operator==(const FFState& other) const;

  virtual LRState* Expand(const TranslationOption& hypo,
                          const InputType& input,
                          ScoreComponentCollection* scores) const;
};
|
||||
}
|
||||
|
72
moses/FF/LexicalReordering/PhraseBasedReorderingState.cpp
Normal file
72
moses/FF/LexicalReordering/PhraseBasedReorderingState.cpp
Normal file
@ -0,0 +1,72 @@
|
||||
#include "PhraseBasedReorderingState.h"
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
// ===========================================================================
|
||||
// PHRASE BASED REORDERING STATE
|
||||
// ===========================================================================
|
||||
bool PhraseBasedReorderingState::m_useFirstBackwardScore = true;
|
||||
|
||||
PhraseBasedReorderingState::
|
||||
PhraseBasedReorderingState(const PhraseBasedReorderingState *prev,
|
||||
const TranslationOption &topt)
|
||||
: LRState(prev, topt)
|
||||
, m_prevRange(topt.GetSourceWordsRange())
|
||||
, m_first(false)
|
||||
{ }
|
||||
|
||||
|
||||
PhraseBasedReorderingState::
|
||||
PhraseBasedReorderingState(const LRModel &config,
|
||||
LRModel::Direction dir, size_t offset)
|
||||
: LRState(config, dir, offset)
|
||||
, m_prevRange(NOT_FOUND,NOT_FOUND)
|
||||
, m_first(true)
|
||||
{ }
|
||||
|
||||
|
||||
size_t PhraseBasedReorderingState::hash() const
|
||||
{
|
||||
size_t ret;
|
||||
ret = hash_value(m_prevRange);
|
||||
boost::hash_combine(ret, m_direction);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
bool PhraseBasedReorderingState::operator==(const FFState& o) const
|
||||
{
|
||||
if (&o == this) return true;
|
||||
|
||||
const PhraseBasedReorderingState &other = static_cast<const PhraseBasedReorderingState&>(o);
|
||||
if (m_prevRange == other.m_prevRange) {
|
||||
if (m_direction == LRModel::Forward) {
|
||||
int compareScore = ComparePrevScores(other.m_prevOption);
|
||||
return compareScore == 0;
|
||||
} else {
|
||||
return true;
|
||||
}
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
LRState*
|
||||
PhraseBasedReorderingState::
|
||||
Expand(const TranslationOption& topt, const InputType& input,
|
||||
ScoreComponentCollection* scores) const
|
||||
{
|
||||
// const LRModel::ModelType modelType = m_configuration.GetModelType();
|
||||
|
||||
if ((m_direction != LRModel::Forward && m_useFirstBackwardScore) || !m_first) {
|
||||
LRModel const& lrmodel = m_configuration;
|
||||
Range const cur = topt.GetSourceWordsRange();
|
||||
LRModel::ReorderingType reoType = (m_first ? lrmodel.GetOrientation(cur)
|
||||
: lrmodel.GetOrientation(m_prevRange,cur));
|
||||
CopyScores(scores, topt, input, reoType);
|
||||
}
|
||||
return new PhraseBasedReorderingState(this, topt);
|
||||
}
|
||||
|
||||
}
|
||||
|
38
moses/FF/LexicalReordering/PhraseBasedReorderingState.h
Normal file
38
moses/FF/LexicalReordering/PhraseBasedReorderingState.h
Normal file
@ -0,0 +1,38 @@
|
||||
#pragma once
|
||||
#include "LRState.h"
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
//! State for the standard Moses implementation of lexical reordering models
|
||||
//! (see Koehn et al, Edinburgh System Description for the 2005 NIST MT
|
||||
//! Evaluation)
|
||||
class PhraseBasedReorderingState
  : public LRState
{
private:
  // Source range of the most recently translated phrase.
  Range m_prevRange;
  // True until the first phrase has been translated.
  bool m_first;
public:
  // When false, the backward model skips scoring the very first phrase.
  static bool m_useFirstBackwardScore;
  PhraseBasedReorderingState(const LRModel &config,
                             LRModel::Direction dir,
                             size_t offset);
  PhraseBasedReorderingState(const PhraseBasedReorderingState *prev,
                             const TranslationOption &topt);

  virtual size_t hash() const;
  virtual bool operator==(const FFState& other) const;

  virtual
  LRState*
  Expand(const TranslationOption& topt,const InputType& input,
         ScoreComponentCollection* scores) const;

  // Orientation of 'currRange' relative to m_prevRange under each taxonomy.
  ReorderingType GetOrientationTypeMSD(Range currRange) const;
  ReorderingType GetOrientationTypeMSLR(Range currRange) const;
  ReorderingType GetOrientationTypeMonotonic(Range currRange) const;
  ReorderingType GetOrientationTypeLeftRight(Range currRange) const;
};
|
||||
|
||||
}
|
||||
|
@ -19,7 +19,7 @@
|
||||
|
||||
#include "moses/FeatureVector.h"
|
||||
#include "moses/ScoreComponentCollection.h"
|
||||
#include "LexicalReorderingState.h"
|
||||
#include "LRState.h"
|
||||
|
||||
/**
|
||||
Configuration of sparse reordering:
|
||||
|
@ -140,6 +140,8 @@ float Model1LexicalTable::GetProbability(const Factor* wordS, const Factor* word
|
||||
|
||||
Model1Feature::Model1Feature(const std::string &line)
|
||||
: StatelessFeatureFunction(1, line)
|
||||
, m_skipTargetPunctuation(false)
|
||||
, m_is_syntax(false)
|
||||
{
|
||||
VERBOSE(1, "Initializing feature " << GetScoreProducerDescription() << " ...");
|
||||
ReadParameters();
|
||||
@ -150,10 +152,12 @@ void Model1Feature::SetParameter(const std::string& key, const std::string& valu
|
||||
{
|
||||
if (key == "path") {
|
||||
m_fileNameModel1 = value;
|
||||
} else if (key == "sourceVocabulary") {
|
||||
} else if (key == "source-vocabulary") {
|
||||
m_fileNameVcbS = value;
|
||||
} else if (key == "targetVocabulary") {
|
||||
} else if (key == "target-vocabulary") {
|
||||
m_fileNameVcbT = value;
|
||||
} else if (key == "skip-target-punctuation") {
|
||||
m_skipTargetPunctuation = Scan<bool>(value);
|
||||
} else {
|
||||
StatelessFeatureFunction::SetParameter(key, value);
|
||||
}
|
||||
@ -162,6 +166,8 @@ void Model1Feature::SetParameter(const std::string& key, const std::string& valu
|
||||
void Model1Feature::Load(AllOptions::ptr const& opts)
|
||||
{
|
||||
m_options = opts;
|
||||
m_is_syntax = is_syntax(opts->search.algo);
|
||||
|
||||
FEATUREVERBOSE(2, GetScoreProducerDescription() << ": Loading source vocabulary from file " << m_fileNameVcbS << " ...");
|
||||
Model1Vocabulary vcbS;
|
||||
vcbS.Load(m_fileNameVcbS);
|
||||
@ -177,6 +183,16 @@ void Model1Feature::Load(AllOptions::ptr const& opts)
|
||||
m_emptyWord = factorCollection.GetFactor(Model1Vocabulary::GIZANULL,false);
|
||||
UTIL_THROW_IF2(m_emptyWord==NULL, GetScoreProducerDescription()
|
||||
<< ": Factor for GIZA empty word does not exist.");
|
||||
|
||||
if (m_skipTargetPunctuation) {
|
||||
const std::string punctuation = ",;.:!?";
|
||||
for (size_t i=0; i<punctuation.size(); ++i) {
|
||||
const std::string punct = punctuation.substr(i,1);
|
||||
FactorCollection &factorCollection = FactorCollection::Instance();
|
||||
const Factor* punctFactor = factorCollection.AddFactor(punct,false);
|
||||
std::pair<std::set<const Factor*>::iterator,bool> inserted = m_punctuation.insert(punctFactor);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void Model1Feature::EvaluateWithSourceContext(const InputType &input
|
||||
@ -192,6 +208,12 @@ void Model1Feature::EvaluateWithSourceContext(const InputType &input
|
||||
|
||||
for (size_t posT=0; posT<targetPhrase.GetSize(); ++posT) {
|
||||
const Word &wordT = targetPhrase.GetWord(posT);
|
||||
if (m_skipTargetPunctuation) {
|
||||
std::set<const Factor*>::const_iterator foundPunctuation = m_punctuation.find(wordT[0]);
|
||||
if (foundPunctuation != m_punctuation.end()) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if ( !wordT.IsNonTerminal() ) {
|
||||
float thisWordProb = m_model1.GetProbability(m_emptyWord,wordT[0]); // probability conditioned on empty word
|
||||
|
||||
@ -213,7 +235,7 @@ void Model1Feature::EvaluateWithSourceContext(const InputType &input
|
||||
}
|
||||
|
||||
if (!foundInCache) {
|
||||
for (size_t posS=1; posS<sentence.GetSize()-1; ++posS) { // ignore <s> and </s>
|
||||
for (size_t posS=(m_is_syntax?1:0); posS<(m_is_syntax?sentence.GetSize()-1:sentence.GetSize()); ++posS) { // ignore <s> and </s>
|
||||
const Word &wordS = sentence.GetWord(posS);
|
||||
float modelProb = m_model1.GetProbability(wordS[0],wordT[0]);
|
||||
FEATUREVERBOSE(4, "p( " << wordT << " | " << wordS << " ) = " << modelProb << std::endl);
|
||||
|
@ -2,6 +2,7 @@
|
||||
|
||||
#include <string>
|
||||
#include <limits>
|
||||
#include <set>
|
||||
#include <boost/unordered_map.hpp>
|
||||
#include "StatelessFeatureFunction.h"
|
||||
#include "moses/Factor.h"
|
||||
@ -98,6 +99,9 @@ private:
|
||||
std::string m_fileNameModel1;
|
||||
Model1LexicalTable m_model1;
|
||||
const Factor* m_emptyWord;
|
||||
bool m_skipTargetPunctuation;
|
||||
std::set<const Factor*> m_punctuation;
|
||||
bool m_is_syntax;
|
||||
|
||||
void Load(AllOptions::ptr const& opts);
|
||||
|
||||
|
189
moses/FF/TargetConstituentAdjacencyFeature.cpp
Normal file
189
moses/FF/TargetConstituentAdjacencyFeature.cpp
Normal file
@ -0,0 +1,189 @@
|
||||
#include "TargetConstituentAdjacencyFeature.h"
|
||||
#include "moses/PP/TargetConstituentBoundariesRightAdjacentPhraseProperty.h"
|
||||
#include "moses/PP/TargetConstituentBoundariesLeftPhraseProperty.h"
|
||||
#include "moses/StaticData.h"
|
||||
#include "moses/ScoreComponentCollection.h"
|
||||
#include "moses/Hypothesis.h"
|
||||
#include "moses/FactorCollection.h"
|
||||
#include "moses/TreeInput.h"
|
||||
#include <algorithm>
|
||||
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
// Hash over the keys (and size) of the constituent collection. When the
// 'recombine' option is set, every state hashes to the same value so that
// equality alone decides recombination.
size_t TargetConstituentAdjacencyFeatureState::hash() const
{
  if (m_recombine) {
    return 0;
  }
  size_t seed = 0;
  boost::hash_combine(seed, m_collection.size());
  std::map<const Factor*, float>::const_iterator entry;
  for (entry = m_collection.begin(); entry != m_collection.end(); ++entry) {
    // Only the Factor keys participate; the float values are ignored,
    // mirroring operator==.
    boost::hash_combine(seed, entry->first);
  }
  return seed;
}
|
||||
|
||||
bool TargetConstituentAdjacencyFeatureState::operator==(const FFState& other) const
|
||||
{
|
||||
if (m_recombine) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (this == &other) {
|
||||
return true;
|
||||
}
|
||||
|
||||
const TargetConstituentAdjacencyFeatureState* otherState =
|
||||
dynamic_cast<const TargetConstituentAdjacencyFeatureState*>(&other);
|
||||
UTIL_THROW_IF2(otherState == NULL, "Wrong state type");
|
||||
|
||||
if (m_collection.size() != (otherState->m_collection).size()) {
|
||||
return false;
|
||||
}
|
||||
std::map<const Factor*, float>::const_iterator thisIt, otherIt;
|
||||
for (thisIt=m_collection.begin(), otherIt=(otherState->m_collection).begin();
|
||||
thisIt!=m_collection.end(); ++thisIt, ++otherIt) {
|
||||
if (thisIt->first != otherIt->first) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
};
|
||||
|
||||
|
||||
// Stateful feature with two dense score components. Options (variant,
// recombine) are parsed from the configuration line via ReadParameters().
TargetConstituentAdjacencyFeature::TargetConstituentAdjacencyFeature(const std::string &line)
  : StatefulFeatureFunction(2, line)
  , m_featureVariant(0)
  , m_recombine(false)
{
  VERBOSE(1, "Initializing feature " << GetScoreProducerDescription() << " ...");
  ReadParameters();
  VERBOSE(1, " Done." << std::endl);
  // Report which scoring variant was selected.
  VERBOSE(1, " Feature variant: " << m_featureVariant << "." << std::endl);
}
|
||||
|
||||
|
||||
void TargetConstituentAdjacencyFeature::SetParameter(const std::string& key, const std::string& value)
|
||||
{
|
||||
if (key == "variant") {
|
||||
m_featureVariant = Scan<size_t>(value);
|
||||
} else if (key == "recombine") {
|
||||
m_recombine = Scan<bool>(value);
|
||||
} else {
|
||||
StatefulFeatureFunction::SetParameter(key, value);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
FFState* TargetConstituentAdjacencyFeature::EvaluateWhenApplied(
|
||||
const Hypothesis& cur_hypo,
|
||||
const FFState* prev_state,
|
||||
ScoreComponentCollection* accumulator) const
|
||||
{
|
||||
// dense scores
|
||||
std::vector<float> newScores(m_numScoreComponents,0); // m_numScoreComponents == 2
|
||||
|
||||
// state
|
||||
const TargetConstituentAdjacencyFeatureState *prevState = static_cast<const TargetConstituentAdjacencyFeatureState*>(prev_state);
|
||||
|
||||
// read TargetConstituentAdjacency property
|
||||
const TargetPhrase &currTarPhr = cur_hypo.GetCurrTargetPhrase();
|
||||
FEATUREVERBOSE(2, "Phrase: " << currTarPhr << std::endl);
|
||||
|
||||
if (const PhraseProperty *property = currTarPhr.GetProperty("TargetConstituentBoundariesLeft")) {
|
||||
|
||||
const TargetConstituentBoundariesLeftPhraseProperty *targetConstituentBoundariesLeftPhraseProperty =
|
||||
static_cast<const TargetConstituentBoundariesLeftPhraseProperty*>(property);
|
||||
const TargetConstituentBoundariesLeftCollection& leftConstituentCollection =
|
||||
targetConstituentBoundariesLeftPhraseProperty->GetCollection();
|
||||
float prob = 0;
|
||||
size_t numMatch = 0;
|
||||
size_t numOverall = 0;
|
||||
|
||||
if ( !cur_hypo.GetPrevHypo()->GetPrevHypo() ) {
|
||||
// previous hypothesis is initial, i.e. target sentence starts here
|
||||
|
||||
++numOverall;
|
||||
FactorCollection &factorCollection = FactorCollection::Instance();
|
||||
const Factor* bosFactor = factorCollection.AddFactor("BOS_",false);
|
||||
TargetConstituentBoundariesLeftCollection::const_iterator found =
|
||||
leftConstituentCollection.find(bosFactor);
|
||||
if ( found != leftConstituentCollection.end() ) {
|
||||
++numMatch;
|
||||
prob += found->second;
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
const std::map<const Factor*, float>& hypConstituentCollection = prevState->m_collection;
|
||||
std::map<const Factor*, float>::const_iterator iter1 = hypConstituentCollection.begin();
|
||||
std::map<const Factor*, float>::const_iterator iter2 = leftConstituentCollection.begin();
|
||||
while ( iter1 != hypConstituentCollection.end() && iter2 != leftConstituentCollection.end() ) {
|
||||
++numOverall;
|
||||
if ( iter1->first < iter2->first ) {
|
||||
++iter1;
|
||||
} else if ( iter2->first < iter1->first ) {
|
||||
++iter2;
|
||||
} else {
|
||||
++numMatch;
|
||||
float currProb = iter1->second * iter2->second;
|
||||
if (currProb > prob)
|
||||
prob = currProb;
|
||||
++iter1;
|
||||
++iter2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ( (numMatch == 0) || (prob == 0) ) {
|
||||
++newScores[1];
|
||||
} else {
|
||||
if ( m_featureVariant == 1 ) {
|
||||
newScores[0] += TransformScore(prob);
|
||||
} else {
|
||||
newScores[0] += TransformScore( (float)numMatch/numOverall );
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
// abort with error message if the phrase does not translate an unknown word
|
||||
UTIL_THROW_IF2(!currTarPhr.GetWord(0).IsOOV(), GetScoreProducerDescription()
|
||||
<< ": Missing TargetConstituentBoundariesLeft property.");
|
||||
|
||||
++newScores[1];
|
||||
|
||||
}
|
||||
|
||||
TargetConstituentAdjacencyFeatureState *newState = new TargetConstituentAdjacencyFeatureState(m_recombine);
|
||||
|
||||
if (const PhraseProperty *property = currTarPhr.GetProperty("TargetConstituentBoundariesRightAdjacent")) {
|
||||
|
||||
const TargetConstituentBoundariesRightAdjacentPhraseProperty *targetConstituentBoundariesRightAdjacentPhraseProperty =
|
||||
static_cast<const TargetConstituentBoundariesRightAdjacentPhraseProperty*>(property);
|
||||
const TargetConstituentBoundariesLeftCollection& rightAdjacentConstituentCollection = targetConstituentBoundariesRightAdjacentPhraseProperty->GetCollection();
|
||||
|
||||
std::copy(rightAdjacentConstituentCollection.begin(), rightAdjacentConstituentCollection.end(),
|
||||
std::inserter(newState->m_collection, newState->m_collection.begin()));
|
||||
|
||||
} else {
|
||||
|
||||
// abort with error message if the phrase does not translate an unknown word
|
||||
UTIL_THROW_IF2(!currTarPhr.GetWord(0).IsOOV(), GetScoreProducerDescription()
|
||||
<< ": Missing TargetConstituentBoundariesRightAdjacent property.");
|
||||
|
||||
}
|
||||
|
||||
// add scores
|
||||
accumulator->PlusEquals(this, newScores);
|
||||
|
||||
return newState;
|
||||
}
|
||||
|
||||
}
|
||||
|
101
moses/FF/TargetConstituentAdjacencyFeature.h
Normal file
101
moses/FF/TargetConstituentAdjacencyFeature.h
Normal file
@ -0,0 +1,101 @@
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <set>
|
||||
#include <iostream>
|
||||
#include "StatefulFeatureFunction.h"
|
||||
#include "FFState.h"
|
||||
#include "util/exception.hh"
|
||||
#include <stdint.h>
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
class TargetConstituentAdjacencyFeatureState : public FFState
|
||||
{
|
||||
|
||||
public:
|
||||
|
||||
friend class TargetConstituentAdjacencyFeature;
|
||||
|
||||
TargetConstituentAdjacencyFeatureState(bool recombine)
|
||||
: m_recombine(recombine)
|
||||
{};
|
||||
|
||||
size_t hash() const;
|
||||
|
||||
virtual bool operator==(const FFState& other) const;
|
||||
|
||||
private:
|
||||
|
||||
const bool m_recombine;
|
||||
std::map<const Factor*, float> m_collection;
|
||||
|
||||
};
|
||||
|
||||
|
||||
class TargetConstituentAdjacencyFeature : public StatefulFeatureFunction
|
||||
{
|
||||
|
||||
public:
|
||||
|
||||
TargetConstituentAdjacencyFeature(const std::string &line);
|
||||
|
||||
~TargetConstituentAdjacencyFeature()
|
||||
{};
|
||||
|
||||
bool IsUseable(const FactorMask &mask) const {
|
||||
return true;
|
||||
};
|
||||
|
||||
virtual const FFState* EmptyHypothesisState(const InputType &input) const {
|
||||
return new TargetConstituentAdjacencyFeatureState(m_recombine);
|
||||
};
|
||||
|
||||
void SetParameter(const std::string& key, const std::string& value);
|
||||
|
||||
void Load(AllOptions::ptr const& opts)
|
||||
{};
|
||||
|
||||
void EvaluateInIsolation(const Phrase &source
|
||||
, const TargetPhrase &targetPhrase
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection &estimatedFutureScore) const
|
||||
{};
|
||||
|
||||
void EvaluateWithSourceContext(const InputType &input
|
||||
, const InputPath &inputPath
|
||||
, const TargetPhrase &targetPhrase
|
||||
, const StackVec *stackVec
|
||||
, ScoreComponentCollection &scoreBreakdown
|
||||
, ScoreComponentCollection *estimatedFutureScore = NULL) const
|
||||
{};
|
||||
|
||||
void EvaluateTranslationOptionListWithSourceContext(const InputType &input
|
||||
, const TranslationOptionList &translationOptionList) const
|
||||
{};
|
||||
|
||||
FFState* EvaluateWhenApplied(
|
||||
const Hypothesis& cur_hypo,
|
||||
const FFState* prev_state,
|
||||
ScoreComponentCollection* accumulator) const;
|
||||
|
||||
FFState* EvaluateWhenApplied(
|
||||
const ChartHypothesis& cur_hypo,
|
||||
int featureID, // used to index the state in the previous hypotheses
|
||||
ScoreComponentCollection* accumulator) const {
|
||||
UTIL_THROW2(GetScoreProducerDescription() << ": feature currently not implemented for chart-based decoding.");
|
||||
return new TargetConstituentAdjacencyFeatureState(m_recombine);
|
||||
};
|
||||
|
||||
|
||||
private:
|
||||
|
||||
size_t m_featureVariant;
|
||||
bool m_recombine;
|
||||
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -72,7 +72,7 @@ private:
|
||||
std::string MakeNGram(const TargetPhrase &phrase, size_t start, size_t end) const {
|
||||
std::vector<std::string> words;
|
||||
while (start != end) {
|
||||
words.push_back(phrase.GetWord(start).GetString(StaticData::Instance().options().output.factor_order, false));
|
||||
words.push_back(phrase.GetWord(start).GetString(StaticData::Instance().options()->output.factor_order, false));
|
||||
start++;
|
||||
}
|
||||
return Join(" ", words);
|
||||
|
@ -323,7 +323,7 @@ public:
|
||||
Phrase *target = new Phrase();
|
||||
target->CreateFromString(
|
||||
Output
|
||||
, StaticData::Instance().options().output.factor_order
|
||||
, StaticData::Instance().options()->output.factor_order
|
||||
, tabbedSentence.GetColumns()[0]
|
||||
, NULL);
|
||||
|
||||
|
@ -111,8 +111,7 @@ void WordTranslationFeature::Load(AllOptions::ptr const& opts)
|
||||
}
|
||||
|
||||
inFileSource.close();
|
||||
} else if (!m_filePathSource.empty() || !m_filePathTarget.empty()) {
|
||||
return;
|
||||
} else {
|
||||
// restricted source word vocabulary
|
||||
ifstream inFileSource(m_filePathSource.c_str());
|
||||
UTIL_THROW_IF2(!inFileSource, "could not open file " << m_filePathSource);
|
||||
|
@ -19,8 +19,7 @@ License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
***********************************************************************/
|
||||
|
||||
#ifndef moses_Factor_h
|
||||
#define moses_Factor_h
|
||||
#pragma once
|
||||
|
||||
#include <ostream>
|
||||
#include <string>
|
||||
@ -98,4 +97,4 @@ public:
|
||||
size_t hash_value(const Factor &f);
|
||||
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -175,7 +175,7 @@ void FVector::resize(size_t newsize)
|
||||
|
||||
void FVector::clear()
|
||||
{
|
||||
m_coreFeatures.resize(0);
|
||||
m_coreFeatures.resize(m_coreFeatures.size(), 0);
|
||||
m_features.clear();
|
||||
}
|
||||
|
||||
|
@ -40,7 +40,8 @@ namespace Moses
|
||||
{
|
||||
|
||||
/** Constructs a new backward language model. */
|
||||
template <class Model> BackwardLanguageModel<Model>::BackwardLanguageModel(const std::string &line, const std::string &file, FactorType factorType, bool lazy) : LanguageModelKen<Model>(line,file,factorType,lazy)
|
||||
// TODO(lane): load_method instead of lazy bool
|
||||
template <class Model> BackwardLanguageModel<Model>::BackwardLanguageModel(const std::string &line, const std::string &file, FactorType factorType, bool lazy) : LanguageModelKen<Model>(line,file,factorType, lazy ? util::LAZY : util::POPULATE_OR_READ)
|
||||
{
|
||||
//
|
||||
// This space intentionally left blank
|
||||
|
102
moses/LM/Ken.cpp
102
moses/LM/Ken.cpp
@ -69,63 +69,6 @@ struct KenLMState : public FFState {
|
||||
|
||||
};
|
||||
|
||||
///*
|
||||
// * An implementation of single factor LM using Ken's code.
|
||||
// */
|
||||
//template <class Model> class LanguageModelKen : public LanguageModel
|
||||
//{
|
||||
//public:
|
||||
// LanguageModelKen(const std::string &line, const std::string &file, FactorType factorType, bool lazy);
|
||||
//
|
||||
// const FFState *EmptyHypothesisState(const InputType &/*input*/) const {
|
||||
// KenLMState *ret = new KenLMState();
|
||||
// ret->state = m_ngram->BeginSentenceState();
|
||||
// return ret;
|
||||
// }
|
||||
//
|
||||
// void CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const;
|
||||
//
|
||||
// FFState *Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const;
|
||||
//
|
||||
// FFState *EvaluateWhenApplied(const ChartHypothesis& cur_hypo, int featureID, ScoreComponentCollection *accumulator) const;
|
||||
//
|
||||
// void IncrementalCallback(Incremental::Manager &manager) const {
|
||||
// manager.LMCallback(*m_ngram, m_lmIdLookup);
|
||||
// }
|
||||
//
|
||||
// bool IsUseable(const FactorMask &mask) const;
|
||||
//private:
|
||||
// LanguageModelKen(const LanguageModelKen<Model> ©_from);
|
||||
//
|
||||
// lm::WordIndex TranslateID(const Word &word) const {
|
||||
// std::size_t factor = word.GetFactor(m_factorType)->GetId();
|
||||
// return (factor >= m_lmIdLookup.size() ? 0 : m_lmIdLookup[factor]);
|
||||
// }
|
||||
//
|
||||
// // Convert last words of hypothesis into vocab ids, returning an end pointer.
|
||||
// lm::WordIndex *LastIDs(const Hypothesis &hypo, lm::WordIndex *indices) const {
|
||||
// lm::WordIndex *index = indices;
|
||||
// lm::WordIndex *end = indices + m_ngram->Order() - 1;
|
||||
// int position = hypo.GetCurrTargetWordsRange().GetEndPos();
|
||||
// for (; ; ++index, --position) {
|
||||
// if (index == end) return index;
|
||||
// if (position == -1) {
|
||||
// *index = m_ngram->GetVocabulary().BeginSentence();
|
||||
// return index + 1;
|
||||
// }
|
||||
// *index = TranslateID(hypo.GetWord(position));
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// boost::shared_ptr<Model> m_ngram;
|
||||
//
|
||||
// std::vector<lm::WordIndex> m_lmIdLookup;
|
||||
//
|
||||
// FactorType m_factorType;
|
||||
//
|
||||
// const Factor *m_beginSentenceFactor;
|
||||
//};
|
||||
|
||||
class MappingBuilder : public lm::EnumerateVocab
|
||||
{
|
||||
public:
|
||||
@ -148,7 +91,7 @@ private:
|
||||
|
||||
} // namespace
|
||||
|
||||
template <class Model> void LanguageModelKen<Model>::LoadModel(const std::string &file, bool lazy)
|
||||
template <class Model> void LanguageModelKen<Model>::LoadModel(const std::string &file, util::LoadMethod load_method)
|
||||
{
|
||||
m_lmIdLookup.clear();
|
||||
|
||||
@ -161,18 +104,18 @@ template <class Model> void LanguageModelKen<Model>::LoadModel(const std::string
|
||||
FactorCollection &collection = FactorCollection::Instance();
|
||||
MappingBuilder builder(collection, m_lmIdLookup);
|
||||
config.enumerate_vocab = &builder;
|
||||
config.load_method = lazy ? util::LAZY : util::POPULATE_OR_READ;
|
||||
config.load_method = load_method;
|
||||
|
||||
m_ngram.reset(new Model(file.c_str(), config));
|
||||
}
|
||||
|
||||
template <class Model> LanguageModelKen<Model>::LanguageModelKen(const std::string &line, const std::string &file, FactorType factorType, bool lazy)
|
||||
template <class Model> LanguageModelKen<Model>::LanguageModelKen(const std::string &line, const std::string &file, FactorType factorType, util::LoadMethod load_method)
|
||||
:LanguageModel(line)
|
||||
,m_factorType(factorType)
|
||||
,m_beginSentenceFactor(FactorCollection::Instance().AddFactor(BOS_))
|
||||
{
|
||||
ReadParameters();
|
||||
LoadModel(file, lazy);
|
||||
LoadModel(file, load_method);
|
||||
}
|
||||
|
||||
template <class Model> LanguageModelKen<Model>::LanguageModelKen(const LanguageModelKen<Model> ©_from)
|
||||
@ -480,7 +423,7 @@ LanguageModel *ConstructKenLM(const std::string &lineOrig)
|
||||
{
|
||||
FactorType factorType = 0;
|
||||
string filePath;
|
||||
bool lazy = false;
|
||||
util::LoadMethod load_method = util::POPULATE_OR_READ;
|
||||
|
||||
util::TokenIter<util::SingleCharacter, true> argument(lineOrig, ' ');
|
||||
++argument; // KENLM
|
||||
@ -501,38 +444,53 @@ LanguageModel *ConstructKenLM(const std::string &lineOrig)
|
||||
} else if (name == "path") {
|
||||
filePath.assign(value.data(), value.size());
|
||||
} else if (name == "lazyken") {
|
||||
lazy = boost::lexical_cast<bool>(value);
|
||||
// deprecated: use load instead.
|
||||
load_method = boost::lexical_cast<bool>(value) ? util::LAZY : util::POPULATE_OR_READ;
|
||||
} else if (name == "load") {
|
||||
if (value == "lazy") {
|
||||
load_method = util::LAZY;
|
||||
} else if (value == "populate_or_lazy") {
|
||||
load_method = util::POPULATE_OR_LAZY;
|
||||
} else if (value == "populate_or_read" || value == "populate") {
|
||||
load_method = util::POPULATE_OR_READ;
|
||||
} else if (value == "read") {
|
||||
load_method = util::READ;
|
||||
} else if (value == "parallel_read") {
|
||||
load_method = util::PARALLEL_READ;
|
||||
} else {
|
||||
UTIL_THROW2("Unknown KenLM load method " << value);
|
||||
}
|
||||
} else {
|
||||
// pass to base class to interpret
|
||||
line << " " << name << "=" << value;
|
||||
}
|
||||
}
|
||||
|
||||
return ConstructKenLM(line.str(), filePath, factorType, lazy);
|
||||
return ConstructKenLM(line.str(), filePath, factorType, load_method);
|
||||
}
|
||||
|
||||
LanguageModel *ConstructKenLM(const std::string &line, const std::string &file, FactorType factorType, bool lazy)
|
||||
LanguageModel *ConstructKenLM(const std::string &line, const std::string &file, FactorType factorType, util::LoadMethod load_method)
|
||||
{
|
||||
lm::ngram::ModelType model_type;
|
||||
if (lm::ngram::RecognizeBinary(file.c_str(), model_type)) {
|
||||
switch(model_type) {
|
||||
case lm::ngram::PROBING:
|
||||
return new LanguageModelKen<lm::ngram::ProbingModel>(line, file, factorType, lazy);
|
||||
return new LanguageModelKen<lm::ngram::ProbingModel>(line, file, factorType, load_method);
|
||||
case lm::ngram::REST_PROBING:
|
||||
return new LanguageModelKen<lm::ngram::RestProbingModel>(line, file, factorType, lazy);
|
||||
return new LanguageModelKen<lm::ngram::RestProbingModel>(line, file, factorType, load_method);
|
||||
case lm::ngram::TRIE:
|
||||
return new LanguageModelKen<lm::ngram::TrieModel>(line, file, factorType, lazy);
|
||||
return new LanguageModelKen<lm::ngram::TrieModel>(line, file, factorType, load_method);
|
||||
case lm::ngram::QUANT_TRIE:
|
||||
return new LanguageModelKen<lm::ngram::QuantTrieModel>(line, file, factorType, lazy);
|
||||
return new LanguageModelKen<lm::ngram::QuantTrieModel>(line, file, factorType, load_method);
|
||||
case lm::ngram::ARRAY_TRIE:
|
||||
return new LanguageModelKen<lm::ngram::ArrayTrieModel>(line, file, factorType, lazy);
|
||||
return new LanguageModelKen<lm::ngram::ArrayTrieModel>(line, file, factorType, load_method);
|
||||
case lm::ngram::QUANT_ARRAY_TRIE:
|
||||
return new LanguageModelKen<lm::ngram::QuantArrayTrieModel>(line, file, factorType, lazy);
|
||||
return new LanguageModelKen<lm::ngram::QuantArrayTrieModel>(line, file, factorType, load_method);
|
||||
default:
|
||||
UTIL_THROW2("Unrecognized kenlm model type " << model_type);
|
||||
}
|
||||
} else {
|
||||
return new LanguageModelKen<lm::ngram::ProbingModel>(line, file, factorType, lazy);
|
||||
return new LanguageModelKen<lm::ngram::ProbingModel>(line, file, factorType, load_method);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -26,6 +26,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
#include <boost/shared_ptr.hpp>
|
||||
|
||||
#include "lm/word_index.hh"
|
||||
#include "util/mmap.hh"
|
||||
|
||||
#include "moses/LM/Base.h"
|
||||
#include "moses/Hypothesis.h"
|
||||
@ -41,7 +42,7 @@ class FFState;
|
||||
LanguageModel *ConstructKenLM(const std::string &line);
|
||||
|
||||
//! This will also load. Returns a templated KenLM class
|
||||
LanguageModel *ConstructKenLM(const std::string &line, const std::string &file, FactorType factorType, bool lazy);
|
||||
LanguageModel *ConstructKenLM(const std::string &line, const std::string &file, FactorType factorType, util::LoadMethod load_method);
|
||||
|
||||
/*
|
||||
* An implementation of single factor LM using Kenneth's code.
|
||||
@ -49,7 +50,7 @@ LanguageModel *ConstructKenLM(const std::string &line, const std::string &file,
|
||||
template <class Model> class LanguageModelKen : public LanguageModel
|
||||
{
|
||||
public:
|
||||
LanguageModelKen(const std::string &line, const std::string &file, FactorType factorType, bool lazy);
|
||||
LanguageModelKen(const std::string &line, const std::string &file, FactorType factorType, util::LoadMethod load_method);
|
||||
|
||||
virtual const FFState *EmptyHypothesisState(const InputType &/*input*/) const;
|
||||
|
||||
@ -73,7 +74,7 @@ protected:
|
||||
|
||||
FactorType m_factorType;
|
||||
|
||||
void LoadModel(const std::string &file, bool lazy);
|
||||
void LoadModel(const std::string &file, util::LoadMethod load_method);
|
||||
|
||||
lm::WordIndex TranslateID(const Word &word) const {
|
||||
std::size_t factor = word.GetFactor(m_factorType)->GetId();
|
||||
|
@ -73,7 +73,7 @@ template <class Model> FFState *ReloadingLanguageModel<Model>::EvaluateWhenAppli
|
||||
|
||||
std::auto_ptr<FFState> kenlmState(LanguageModelKen<Model>::EvaluateWhenApplied(hypo, ps, out));
|
||||
const lm::ngram::State &out_state = static_cast<const ReloadingLMState&>(*kenlmState).state;
|
||||
|
||||
|
||||
|
||||
std::auto_ptr<ReloadingLMState> ret(new ReloadingLMState());
|
||||
ret->state = out_state;
|
||||
|
@ -64,18 +64,18 @@ private:
|
||||
template <class Model> class ReloadingLanguageModel : public LanguageModelKen<Model>
|
||||
{
|
||||
public:
|
||||
|
||||
ReloadingLanguageModel(const std::string &line, const std::string &file, FactorType factorType, bool lazy) : LanguageModelKen<Model>(line, file, factorType, lazy), m_file(file), m_lazy(lazy)
|
||||
{
|
||||
// TODO(Lane) copy less code, update to load_method
|
||||
ReloadingLanguageModel(const std::string &line, const std::string &file, FactorType factorType, bool lazy) : LanguageModelKen<Model>(line, file, factorType, lazy ? util::LAZY : util::POPULATE_OR_READ), m_file(file), m_lazy(lazy) {
|
||||
|
||||
std::cerr << "ReloadingLM constructor: " << m_file << std::endl;
|
||||
// std::cerr << std::string(line).replace(0,11,"KENLM") << std::endl;
|
||||
|
||||
|
||||
}
|
||||
|
||||
virtual void InitializeForInput(ttasksptr const& ttask) {
|
||||
virtual void InitializeForInput(ttasksptr const& ttask) {
|
||||
std::cerr << "ReloadingLM InitializeForInput" << std::endl;
|
||||
LanguageModelKen<Model>::LoadModel(m_file, m_lazy);
|
||||
// TODO(lane): load_method
|
||||
LanguageModelKen<Model>::LoadModel(m_file, m_lazy ? util::LAZY : util::POPULATE_OR_READ);
|
||||
/*
|
||||
lm::ngram::Config config;
|
||||
if(this->m_verbosity >= 1) {
|
||||
@ -87,15 +87,15 @@ public:
|
||||
MappingBuilder builder(collection, m_lmIdLookup);
|
||||
config.enumerate_vocab = &builder;
|
||||
config.load_method = m_lazy ? util::LAZY : util::POPULATE_OR_READ;
|
||||
|
||||
|
||||
m_ngram.reset(new Model(m_file.c_str(), config));
|
||||
|
||||
|
||||
m_beginSentenceFactor = collection.AddFactor(BOS_);
|
||||
*/
|
||||
};
|
||||
|
||||
/*
|
||||
ReloadingLanguageModel(const std::string &line) : LanguageModelKen<Model>(ConstructKenLM(std::string(line).replace(0,11,"KENLM"))) {
|
||||
ReloadingLanguageModel(const std::string &line) : LanguageModelKen<Model>(ConstructKenLM(std::string(line).replace(0,11,"KENLM"))) {
|
||||
std::cerr << "ReloadingLM constructor" << std::endl;
|
||||
std::cerr << std::string(line).replace(0,11,"KENLM") << std::endl;
|
||||
}
|
||||
@ -138,12 +138,12 @@ public:
|
||||
}
|
||||
|
||||
|
||||
private:
|
||||
private:
|
||||
|
||||
LanguageModel *m_lm;
|
||||
*/
|
||||
|
||||
protected:
|
||||
protected:
|
||||
|
||||
using LanguageModelKen<Model>::m_ngram;
|
||||
using LanguageModelKen<Model>::m_lmIdLookup;
|
||||
|
@ -58,7 +58,14 @@ public:
|
||||
|
||||
void Write(const std::string& fname) const {
|
||||
std::ofstream out(fname.c_str());
|
||||
// Little-known fact: ofstream tracks failures but does not, by default,
|
||||
// report them. You have to tell it to, or check for errors yourself.
|
||||
out.exceptions(std::ifstream::failbit | std::ifstream::badbit);
|
||||
Write(out);
|
||||
// Make sure the file is flushed, so that any errors are reported. If we
|
||||
// flush implicitly in the destructor, it won't be able to throw
|
||||
// exceptions.
|
||||
out.close();
|
||||
}
|
||||
void Write(std::ostream& out) const {
|
||||
for(int i=data.size()-1; i>=0; --i)
|
||||
|
@ -11,6 +11,8 @@
|
||||
#include "moses/PP/SpanLengthPhraseProperty.h"
|
||||
#include "moses/PP/NonTermContextProperty.h"
|
||||
#include "moses/PP/OrientationPhraseProperty.h"
|
||||
#include "moses/PP/TargetConstituentBoundariesLeftPhraseProperty.h"
|
||||
#include "moses/PP/TargetConstituentBoundariesRightAdjacentPhraseProperty.h"
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
@ -58,6 +60,8 @@ PhrasePropertyFactory::PhrasePropertyFactory()
|
||||
|
||||
MOSES_PNAME2("Counts", CountsPhraseProperty);
|
||||
MOSES_PNAME2("SourceLabels", SourceLabelsPhraseProperty);
|
||||
MOSES_PNAME2("TargetConstituentBoundariesLeft", TargetConstituentBoundariesLeftPhraseProperty);
|
||||
MOSES_PNAME2("TargetConstituentBoundariesRightAdjacent", TargetConstituentBoundariesRightAdjacentPhraseProperty);
|
||||
MOSES_PNAME2("TargetPreferences", TargetPreferencesPhraseProperty);
|
||||
MOSES_PNAME2("Tree",TreeStructurePhraseProperty);
|
||||
MOSES_PNAME2("SpanLength", SpanLengthPhraseProperty);
|
||||
|
@ -5,9 +5,14 @@ namespace Moses
|
||||
|
||||
std::ostream& operator<<(std::ostream &out, const PhraseProperty &obj)
|
||||
{
|
||||
out << "Base phrase property";
|
||||
obj.Print(out);
|
||||
return out;
|
||||
}
|
||||
|
||||
void PhraseProperty::Print(std::ostream &out) const
|
||||
{
|
||||
out << "Base phrase property";
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
@ -28,6 +28,8 @@ public:
|
||||
|
||||
protected:
|
||||
|
||||
virtual void Print(std::ostream& out) const;
|
||||
|
||||
std::string *m_value;
|
||||
|
||||
};
|
||||
|
63
moses/PP/TargetConstituentBoundariesLeftPhraseProperty.cpp
Normal file
63
moses/PP/TargetConstituentBoundariesLeftPhraseProperty.cpp
Normal file
@ -0,0 +1,63 @@
|
||||
#include "moses/PP/TargetConstituentBoundariesLeftPhraseProperty.h"
|
||||
#include "moses/FactorCollection.h"
|
||||
#include "moses/Util.h"
|
||||
#include <iostream>
|
||||
#include <queue>
|
||||
#include <ostream>
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
void TargetConstituentBoundariesLeftPhraseProperty::ProcessValue(const std::string &value)
|
||||
{
|
||||
FactorCollection &factorCollection = FactorCollection::Instance();
|
||||
std::vector<std::string> tokens;
|
||||
Tokenize(tokens, value, " ");
|
||||
std::vector<std::string>::const_iterator tokenIter = tokens.begin();
|
||||
while (tokenIter != tokens.end()) {
|
||||
try {
|
||||
|
||||
std::vector<std::string> constituents;
|
||||
Tokenize(constituents, *tokenIter, "<");
|
||||
++tokenIter;
|
||||
float count = std::atof( tokenIter->c_str() );
|
||||
++tokenIter;
|
||||
|
||||
std::set<const Factor* > dedup;
|
||||
|
||||
for ( std::vector<std::string>::iterator constituentIter = constituents.begin();
|
||||
constituentIter != constituents.end(); ++constituentIter ) {
|
||||
|
||||
const Factor* constituentFactor = factorCollection.AddFactor(*constituentIter,false);
|
||||
|
||||
std::pair< std::set<const Factor* >::iterator, bool > dedupIns =
|
||||
dedup.insert(constituentFactor);
|
||||
if ( dedupIns.second ) {
|
||||
|
||||
std::pair< TargetConstituentBoundariesLeftCollection::iterator, bool > inserted =
|
||||
m_constituentsCollection.insert(std::make_pair(constituentFactor,count));
|
||||
if ( !inserted.second ) {
|
||||
(inserted.first)->second += count;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} catch (const std::exception &e) {
|
||||
UTIL_THROW2("TargetConstituentBoundariesLeftPhraseProperty: Read error. Flawed property? " << value);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
void TargetConstituentBoundariesLeftPhraseProperty::Print(std::ostream& out) const
|
||||
{
|
||||
for ( TargetConstituentBoundariesLeftCollection::const_iterator it = m_constituentsCollection.begin();
|
||||
it != m_constituentsCollection.end(); ++it ) {
|
||||
if ( it != m_constituentsCollection.begin() ) {
|
||||
out << " ";
|
||||
}
|
||||
out << *(it->first) << " " << it->second;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace Moses
|
||||
|
40
moses/PP/TargetConstituentBoundariesLeftPhraseProperty.h
Normal file
40
moses/PP/TargetConstituentBoundariesLeftPhraseProperty.h
Normal file
@ -0,0 +1,40 @@
|
||||
#pragma once
|
||||
|
||||
#include "moses/PP/PhraseProperty.h"
|
||||
#include "moses/Factor.h"
|
||||
#include "util/exception.hh"
|
||||
#include <map>
|
||||
#include <string>
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
typedef std::map<const Factor*, float> TargetConstituentBoundariesLeftCollection;
|
||||
|
||||
|
||||
class TargetConstituentBoundariesLeftPhraseProperty : public PhraseProperty
|
||||
{
|
||||
public:
|
||||
TargetConstituentBoundariesLeftPhraseProperty()
|
||||
{};
|
||||
|
||||
virtual void ProcessValue(const std::string &value);
|
||||
|
||||
const TargetConstituentBoundariesLeftCollection &GetCollection() const {
|
||||
return m_constituentsCollection;
|
||||
};
|
||||
|
||||
virtual const std::string *GetValueString() const {
|
||||
UTIL_THROW2("TargetConstituentBoundariesLeftPhraseProperty: value string not available in this phrase property");
|
||||
return NULL;
|
||||
};
|
||||
|
||||
protected:
|
||||
|
||||
virtual void Print(std::ostream& out) const;
|
||||
|
||||
TargetConstituentBoundariesLeftCollection m_constituentsCollection;
|
||||
};
|
||||
|
||||
} // namespace Moses
|
||||
|
@ -0,0 +1,63 @@
|
||||
#include "moses/PP/TargetConstituentBoundariesRightAdjacentPhraseProperty.h"
|
||||
#include "moses/FactorCollection.h"
|
||||
#include "moses/Util.h"
|
||||
#include <iostream>
|
||||
#include <queue>
|
||||
#include <ostream>
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
void TargetConstituentBoundariesRightAdjacentPhraseProperty::ProcessValue(const std::string &value)
|
||||
{
|
||||
FactorCollection &factorCollection = FactorCollection::Instance();
|
||||
std::vector<std::string> tokens;
|
||||
Tokenize(tokens, value, " ");
|
||||
std::vector<std::string>::const_iterator tokenIter = tokens.begin();
|
||||
while (tokenIter != tokens.end()) {
|
||||
try {
|
||||
|
||||
std::vector<std::string> constituents;
|
||||
Tokenize(constituents, *tokenIter, "<");
|
||||
++tokenIter;
|
||||
float count = std::atof( tokenIter->c_str() );
|
||||
++tokenIter;
|
||||
|
||||
std::set<const Factor* > dedup;
|
||||
|
||||
for ( std::vector<std::string>::iterator constituentIter = constituents.begin();
|
||||
constituentIter != constituents.end(); ++constituentIter ) {
|
||||
|
||||
const Factor* constituentFactor = factorCollection.AddFactor(*constituentIter,false);
|
||||
|
||||
std::pair< std::set<const Factor* >::iterator, bool > dedupIns =
|
||||
dedup.insert(constituentFactor);
|
||||
if ( dedupIns.second ) {
|
||||
|
||||
std::pair< TargetConstituentBoundariesRightAdjacentCollection::iterator, bool > inserted =
|
||||
m_constituentsCollection.insert(std::make_pair(constituentFactor,count));
|
||||
if ( !inserted.second ) {
|
||||
(inserted.first)->second += count;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} catch (const std::exception &e) {
|
||||
UTIL_THROW2("TargetConstituentBoundariesRightAdjacentPhraseProperty: Read error. Flawed property? " << value);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
void TargetConstituentBoundariesRightAdjacentPhraseProperty::Print(std::ostream& out) const
|
||||
{
|
||||
for ( TargetConstituentBoundariesRightAdjacentCollection::const_iterator it = m_constituentsCollection.begin();
|
||||
it != m_constituentsCollection.end(); ++it ) {
|
||||
if ( it != m_constituentsCollection.begin() ) {
|
||||
out << " ";
|
||||
}
|
||||
out << *(it->first) << " " << it->second;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace Moses
|
||||
|
@ -0,0 +1,40 @@
|
||||
#pragma once
|
||||
|
||||
#include "moses/PP/PhraseProperty.h"
|
||||
#include "moses/Factor.h"
|
||||
#include "util/exception.hh"
|
||||
#include <map>
|
||||
#include <string>
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
|
||||
typedef std::map<const Factor*, float> TargetConstituentBoundariesRightAdjacentCollection;
|
||||
|
||||
|
||||
class TargetConstituentBoundariesRightAdjacentPhraseProperty : public PhraseProperty
|
||||
{
|
||||
public:
|
||||
TargetConstituentBoundariesRightAdjacentPhraseProperty()
|
||||
{};
|
||||
|
||||
virtual void ProcessValue(const std::string &value);
|
||||
|
||||
const TargetConstituentBoundariesRightAdjacentCollection &GetCollection() const {
|
||||
return m_constituentsCollection;
|
||||
};
|
||||
|
||||
virtual const std::string *GetValueString() const {
|
||||
UTIL_THROW2("TargetConstituentBoundariesRightAdjacentPhraseProperty: value string not available in this phrase property");
|
||||
return NULL;
|
||||
};
|
||||
|
||||
protected:
|
||||
|
||||
virtual void Print(std::ostream& out) const;
|
||||
|
||||
TargetConstituentBoundariesRightAdjacentCollection m_constituentsCollection;
|
||||
};
|
||||
|
||||
} // namespace Moses
|
||||
|
@ -1,3 +1,13 @@
|
||||
exe ptable-sigtest-filter :
|
||||
filter-pt.cc
|
||||
$(TOP)/moses//moses
|
||||
$(TOP)/moses/TranslationModel/UG/generic//generic
|
||||
$(TOP)//boost_iostreams
|
||||
$(TOP)//boost_program_options
|
||||
$(TOP)/moses/TranslationModel/UG/mm//mm
|
||||
$(TOP)/moses/TranslationModel/UG//mmsapt
|
||||
$(TOP)/util//kenutil
|
||||
;
|
||||
exe try-align :
|
||||
try-align.cc
|
||||
$(TOP)/moses//moses
|
||||
|
669
moses/TranslationModel/UG/filter-pt.cc
Normal file
669
moses/TranslationModel/UG/filter-pt.cc
Normal file
@ -0,0 +1,669 @@
|
||||
// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*-
|
||||
// significance filtering for phrase tables as described in
|
||||
// H. Johnson, et al. (2007) Improving Translation Quality
|
||||
// by Discarding Most of the Phrasetable. EMNLP 2007.
|
||||
// Implemented by Marcin Junczys-Dowmunt
|
||||
// recommended use: -l a+e -n <ttable-limit>
|
||||
#include <cstring>
|
||||
#include <cassert>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <algorithm>
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
#include <set>
|
||||
|
||||
#include <boost/thread/tss.hpp>
|
||||
#include <boost/thread.hpp>
|
||||
#include <boost/unordered_map.hpp>
|
||||
#include <boost/program_options.hpp>
|
||||
#include <boost/shared_ptr.hpp>
|
||||
#include <boost/foreach.hpp>
|
||||
|
||||
#ifdef WIN32
|
||||
#include "WIN32_functions.h"
|
||||
#else
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#include "mm/ug_bitext.h"
|
||||
|
||||
// constants
|
||||
const size_t MINIMUM_SIZE_TO_KEEP = 10000; // increase this to improve memory usage,
|
||||
// reduce for speed
|
||||
const std::string SEPARATOR = " ||| ";
|
||||
|
||||
const double ALPHA_PLUS_EPS = -1000.0; // dummy value
|
||||
const double ALPHA_MINUS_EPS = -2000.0; // dummy value
|
||||
|
||||
// configuration params
|
||||
int pfe_filter_limit = 0; // 0 = don't filter anything based on P(f|e)
|
||||
bool print_cooc_counts = false; // add cooc counts to phrase table?
|
||||
bool print_neglog_significance = false; // add -log(p) to phrase table?
|
||||
double sig_filter_limit = 0; // keep phrase pairs with -log(sig) > sig_filter_limit
|
||||
// higher = filter-more
|
||||
bool pef_filter_only = false; // only filter based on pef
|
||||
bool hierarchical = false;
|
||||
|
||||
double p_111 = 0.0; // alpha
|
||||
size_t pt_lines = 0;
|
||||
size_t nremoved_sigfilter = 0;
|
||||
size_t nremoved_pfefilter = 0;
|
||||
|
||||
typedef sapt::L2R_Token<sapt::SimpleWordId> Token;
|
||||
typedef sapt::mmTtrack<Token> ttrack_t;
|
||||
typedef sapt::mmTSA<Token> tsa_t;
|
||||
typedef sapt::TokenIndex tind_t;
|
||||
|
||||
int num_lines;
|
||||
|
||||
boost::mutex in_mutex;
|
||||
boost::mutex out_mutex;
|
||||
boost::mutex err_mutex;
|
||||
|
||||
typedef size_t TextLenType;
|
||||
|
||||
typedef boost::shared_ptr<std::vector<TextLenType> > SentIdSet;
|
||||
|
||||
class Cache {
|
||||
typedef std::pair<SentIdSet, clock_t> ClockedSet;
|
||||
typedef boost::unordered_map<std::string, ClockedSet> ClockedMap;
|
||||
|
||||
public:
|
||||
|
||||
SentIdSet get(const std::string& phrase) {
|
||||
boost::shared_lock<boost::shared_mutex> lock(m_mutex);
|
||||
if(m_cont.count(phrase)) {
|
||||
ClockedSet& set = m_cont[phrase];
|
||||
set.second = clock();
|
||||
return set.first;
|
||||
}
|
||||
return SentIdSet( new SentIdSet::element_type() );
|
||||
}
|
||||
|
||||
void put(const std::string& phrase, const SentIdSet set) {
|
||||
boost::unique_lock<boost::shared_mutex> lock(m_mutex);
|
||||
m_cont[phrase] = std::make_pair(set, clock());
|
||||
}
|
||||
|
||||
static void set_max_cache(size_t max_cache) {
|
||||
s_max_cache = max_cache;
|
||||
}
|
||||
|
||||
void prune() {
|
||||
if(s_max_cache > 0) {
|
||||
boost::upgrade_lock<boost::shared_mutex> lock(m_mutex);
|
||||
if(m_cont.size() > s_max_cache) {
|
||||
std::vector<clock_t> clocks;
|
||||
for(ClockedMap::iterator it = m_cont.begin(); it != m_cont.end(); it++)
|
||||
clocks.push_back(it->second.second);
|
||||
|
||||
std::sort(clocks.begin(), clocks.end());
|
||||
clock_t out = clocks[m_cont.size() - s_max_cache];
|
||||
|
||||
boost::upgrade_to_unique_lock<boost::shared_mutex> uniq_lock(lock);
|
||||
for(ClockedMap::iterator it = m_cont.begin(); it != m_cont.end(); it++)
|
||||
if(it->second.second < out)
|
||||
m_cont.erase(it);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
ClockedMap m_cont;
|
||||
boost::shared_mutex m_mutex;
|
||||
static size_t s_max_cache;
|
||||
};
|
||||
|
||||
size_t Cache::s_max_cache = 0;
|
||||
|
||||
struct SA {
|
||||
tind_t V;
|
||||
boost::shared_ptr<ttrack_t> T;
|
||||
tsa_t I;
|
||||
Cache cache;
|
||||
};
|
||||
|
||||
std::vector<boost::shared_ptr<SA> > e_sas;
|
||||
std::vector<boost::shared_ptr<SA> > f_sas;
|
||||
|
||||
#undef min
|
||||
|
||||
void usage()
|
||||
{
|
||||
std::cerr << "\nFilter phrase table using significance testing as described\n"
|
||||
<< "in H. Johnson, et al. (2007) Improving Translation Quality\n"
|
||||
<< "by Discarding Most of the Phrasetable. EMNLP 2007.\n";
|
||||
}
|
||||
|
||||
struct PTEntry {
|
||||
PTEntry(const std::string& str, int index);
|
||||
std::string f_phrase;
|
||||
std::string e_phrase;
|
||||
std::string extra;
|
||||
std::string scores;
|
||||
float pfe;
|
||||
int cf;
|
||||
int ce;
|
||||
int cfe;
|
||||
float nlog_pte;
|
||||
void set_cooc_stats(int _cef, int _cf, int _ce, float nlp) {
|
||||
cfe = _cef;
|
||||
cf = _cf;
|
||||
ce = _ce;
|
||||
nlog_pte = nlp;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
PTEntry::PTEntry(const std::string& str, int index) :
|
||||
cf(0), ce(0), cfe(0), nlog_pte(0.0)
|
||||
{
|
||||
size_t pos = 0;
|
||||
std::string::size_type nextPos = str.find(SEPARATOR, pos);
|
||||
this->f_phrase = str.substr(pos,nextPos);
|
||||
|
||||
pos = nextPos + SEPARATOR.size();
|
||||
nextPos = str.find(SEPARATOR, pos);
|
||||
this->e_phrase = str.substr(pos,nextPos-pos);
|
||||
|
||||
pos = nextPos + SEPARATOR.size();
|
||||
nextPos = str.find(SEPARATOR, pos);
|
||||
if (nextPos < str.size()) {
|
||||
this->scores = str.substr(pos,nextPos-pos);
|
||||
|
||||
pos = nextPos + SEPARATOR.size();
|
||||
this->extra = str.substr(pos);
|
||||
}
|
||||
else {
|
||||
this->scores = str.substr(pos,str.size()-pos);
|
||||
}
|
||||
|
||||
int c = 0;
|
||||
std::string::iterator i=scores.begin();
|
||||
if (index > 0) {
|
||||
for (; i != scores.end(); ++i) {
|
||||
if ((*i) == ' ') {
|
||||
c++;
|
||||
if (c == index) break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (i != scores.end()) {
|
||||
++i;
|
||||
}
|
||||
char f[24];
|
||||
char *fp=f;
|
||||
while (i != scores.end() && *i != ' ') {
|
||||
*fp++=*i++;
|
||||
}
|
||||
*fp++=0;
|
||||
|
||||
this->pfe = atof(f);
|
||||
}
|
||||
|
||||
struct PfeComparer {
|
||||
bool operator()(const PTEntry* a, const PTEntry* b) const {
|
||||
return a->pfe > b->pfe;
|
||||
}
|
||||
};
|
||||
|
||||
struct NlogSigThresholder {
|
||||
NlogSigThresholder(float threshold) : t(threshold) {}
|
||||
float t;
|
||||
bool operator()(const PTEntry* a) const {
|
||||
if (a->nlog_pte < t) {
|
||||
delete a;
|
||||
return true;
|
||||
} else return false;
|
||||
}
|
||||
};
|
||||
|
||||
std::ostream& operator << (std::ostream& os, const PTEntry& pp)
|
||||
{
|
||||
os << pp.f_phrase << " ||| " << pp.e_phrase;
|
||||
os << " ||| " << pp.scores;
|
||||
if (pp.extra.size()>0) os << " ||| " << pp.extra;
|
||||
if (print_cooc_counts) os << " ||| " << pp.cfe << " " << pp.cf << " " << pp.ce;
|
||||
if (print_neglog_significance) os << " ||| " << pp.nlog_pte;
|
||||
return os;
|
||||
}
|
||||
|
||||
void print(int a, int b, int c, int d, float p)
|
||||
{
|
||||
std::cerr << a << "\t" << b << "\t P=" << p << "\n"
|
||||
<< c << "\t" << d << "\t xf="
|
||||
<< (double)(b)*(double)(c)/(double)(a+1)/(double)(d+1) << "\n\n";
|
||||
}
|
||||
|
||||
// 2x2 (one-sided) Fisher's exact test
|
||||
// see B. Moore. (2004) On Log Likelihood and the Significance of Rare Events
|
||||
double fisher_exact(int cfe, int ce, int cf)
|
||||
{
|
||||
assert(cfe <= ce);
|
||||
assert(cfe <= cf);
|
||||
|
||||
int a = cfe;
|
||||
int b = (cf - cfe);
|
||||
int c = (ce - cfe);
|
||||
int d = (num_lines - ce - cf + cfe);
|
||||
int n = a + b + c + d;
|
||||
|
||||
double cp = exp(lgamma(1+a+c) + lgamma(1+b+d) + lgamma(1+a+b) + lgamma(1+c+d)
|
||||
- lgamma(1+n) - lgamma(1+a) - lgamma(1+b) - lgamma(1+c)
|
||||
- lgamma(1+d));
|
||||
double total_p = 0.0;
|
||||
int tc = std::min(b,c);
|
||||
for (int i=0; i<=tc; i++) {
|
||||
total_p += cp;
|
||||
double coef = (double)(b)*(double)(c)/(double)(a+1)/(double)(d+1);
|
||||
cp *= coef;
|
||||
++a;
|
||||
--c;
|
||||
++d;
|
||||
--b;
|
||||
}
|
||||
return total_p;
|
||||
}
|
||||
|
||||
template <class setType>
|
||||
void ordered_set_intersect(setType& out, const setType set_1, const setType set_2)
|
||||
{
|
||||
std::set_intersection(set_1->begin(), set_1->end(), set_2->begin(),
|
||||
set_2->end(), inserter(*out, out->begin()) );
|
||||
}
|
||||
|
||||
|
||||
void lookup_phrase(SentIdSet& ids, const std::string& phrase,
|
||||
tsa_t &my_sa, tind_t &my_v, Cache& cache)
|
||||
{
|
||||
ids = cache.get(phrase);
|
||||
if(ids->empty()) {
|
||||
|
||||
std::vector<sapt::id_type> snt;
|
||||
my_v.fillIdSeq(phrase, snt);
|
||||
|
||||
tsa_t::tree_iterator m(&my_sa);
|
||||
size_t k = 0;
|
||||
while (k < snt.size() && m.extend(snt[k])) ++k;
|
||||
if(k == snt.size()) {
|
||||
ids->reserve(m.approxOccurrenceCount()+10);
|
||||
sapt::tsa::ArrayEntry I(m.lower_bound(-1));
|
||||
char const* stop = m.upper_bound(-1);
|
||||
do {
|
||||
m.root->readEntry(I.next,I);
|
||||
ids->push_back(I.sid);
|
||||
} while (I.next != stop);
|
||||
|
||||
std::sort(ids->begin(), ids->end());
|
||||
SentIdSet::element_type::iterator it =
|
||||
std::unique(ids->begin(), ids->end());
|
||||
ids->resize(it - ids->begin());
|
||||
|
||||
if(ids->size() >= MINIMUM_SIZE_TO_KEEP)
|
||||
cache.put(phrase, ids);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void lookup_multiple_phrases(SentIdSet& ids, std::vector<std::string> & phrases,
|
||||
tsa_t & my_sa, tind_t &my_v,
|
||||
const std::string & rule, Cache& cache)
|
||||
{
|
||||
|
||||
if (phrases.size() == 1) {
|
||||
lookup_phrase(ids, phrases.front(), my_sa, my_v, cache);
|
||||
}
|
||||
else {
|
||||
SentIdSet main_set( new SentIdSet::element_type() );
|
||||
bool first = true;
|
||||
SentIdSet first_set( new SentIdSet::element_type() );
|
||||
lookup_phrase(first_set, phrases.front(), my_sa, my_v, cache);
|
||||
for (std::vector<std::string>::iterator phrase=phrases.begin()+1;
|
||||
phrase != phrases.end(); ++phrase) {
|
||||
SentIdSet temp_set( new SentIdSet::element_type() );
|
||||
lookup_phrase(temp_set, *phrase, my_sa, my_v, cache);
|
||||
if (first) {
|
||||
ordered_set_intersect(main_set, first_set, temp_set);
|
||||
first = false;
|
||||
}
|
||||
else {
|
||||
SentIdSet new_set( new SentIdSet::element_type() );
|
||||
ordered_set_intersect(new_set, main_set, temp_set);
|
||||
main_set->swap(*new_set);
|
||||
}
|
||||
}
|
||||
ids->swap(*main_set);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void find_occurrences(SentIdSet& ids, const std::string& rule,
|
||||
tsa_t& my_sa, tind_t &my_v, Cache& cache)
|
||||
{
|
||||
// we search for hierarchical rules by stripping away NT and looking for terminals sequences
|
||||
// if a rule contains multiple sequences of terminals, we intersect their occurrences.
|
||||
if (hierarchical) {
|
||||
// std::cerr << "splitting up phrase: " << phrase << "\n";
|
||||
int pos = 0;
|
||||
int NTStartPos, NTEndPos;
|
||||
std::vector<std::string> phrases;
|
||||
while (rule.find("] ", pos) < rule.size()) {
|
||||
NTStartPos = rule.find("[",pos) - 1; // -1 to cut space before NT
|
||||
NTEndPos = rule.find("] ",pos);
|
||||
if (NTStartPos < pos) { // no space: NT at start of rule (or two consecutive NTs)
|
||||
pos = NTEndPos + 2;
|
||||
continue;
|
||||
}
|
||||
phrases.push_back(rule.substr(pos,NTStartPos-pos));
|
||||
pos = NTEndPos + 2;
|
||||
}
|
||||
|
||||
NTStartPos = rule.find("[",pos) - 1; // LHS of rule
|
||||
if (NTStartPos > pos) {
|
||||
phrases.push_back(rule.substr(pos,NTStartPos-pos));
|
||||
}
|
||||
|
||||
lookup_multiple_phrases(ids, phrases, my_sa, my_v, rule, cache);
|
||||
}
|
||||
else {
|
||||
lookup_phrase(ids, rule, my_sa, my_v, cache);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// input: unordered list of translation options for a single source phrase
|
||||
void compute_cooc_stats_and_filter(std::vector<PTEntry*>& options)
|
||||
{
|
||||
if (pfe_filter_limit > 0 && options.size() > pfe_filter_limit) {
|
||||
nremoved_pfefilter += (options.size() - pfe_filter_limit);
|
||||
std::nth_element(options.begin(), options.begin() + pfe_filter_limit,
|
||||
options.end(), PfeComparer());
|
||||
for (std::vector<PTEntry*>::iterator i = options.begin() + pfe_filter_limit;
|
||||
i != options.end(); ++i)
|
||||
delete *i;
|
||||
options.erase(options.begin() + pfe_filter_limit,options.end());
|
||||
}
|
||||
|
||||
if (pef_filter_only)
|
||||
return;
|
||||
|
||||
if (options.empty())
|
||||
return;
|
||||
|
||||
size_t cf = 0;
|
||||
std::vector<SentIdSet> fsets;
|
||||
BOOST_FOREACH(boost::shared_ptr<SA>& f_sa, f_sas) {
|
||||
fsets.push_back( boost::shared_ptr<SentIdSet::element_type>(new SentIdSet::element_type()) );
|
||||
find_occurrences(fsets.back(), options.front()->f_phrase, f_sa->I, f_sa->V, f_sa->cache);
|
||||
cf += fsets.back()->size();
|
||||
}
|
||||
|
||||
for (std::vector<PTEntry*>::iterator i = options.begin();
|
||||
i != options.end(); ++i) {
|
||||
const std::string& e_phrase = (*i)->e_phrase;
|
||||
|
||||
size_t ce = 0;
|
||||
std::vector<SentIdSet> esets;
|
||||
BOOST_FOREACH(boost::shared_ptr<SA>& e_sa, e_sas) {
|
||||
esets.push_back( boost::shared_ptr<SentIdSet::element_type>(new SentIdSet::element_type()) );
|
||||
find_occurrences(esets.back(), e_phrase, e_sa->I, e_sa->V, e_sa->cache);
|
||||
ce += esets.back()->size();
|
||||
}
|
||||
|
||||
size_t cef = 0;
|
||||
for(size_t j = 0; j < fsets.size(); ++j) {
|
||||
SentIdSet efset( new SentIdSet::element_type() );
|
||||
ordered_set_intersect(efset, fsets[j], esets[j]);
|
||||
cef += efset->size();
|
||||
}
|
||||
|
||||
double nlp = -log(fisher_exact(cef, cf, ce));
|
||||
(*i)->set_cooc_stats(cef, cf, ce, nlp);
|
||||
}
|
||||
|
||||
std::vector<PTEntry*>::iterator new_end =
|
||||
std::remove_if(options.begin(), options.end(),
|
||||
NlogSigThresholder(sig_filter_limit));
|
||||
nremoved_sigfilter += (options.end() - new_end);
|
||||
options.erase(new_end,options.end());
|
||||
}
|
||||
|
||||
void filter_thread(std::istream* in, std::ostream* out, int pfe_index) {
|
||||
|
||||
std::vector<std::string> lines;
|
||||
std::string prev = "";
|
||||
std::vector<PTEntry*> options;
|
||||
while(true) {
|
||||
{
|
||||
boost::mutex::scoped_lock lock(in_mutex);
|
||||
if(in->eof())
|
||||
break;
|
||||
|
||||
lines.clear();
|
||||
std::string line;
|
||||
while(getline(*in, line) && lines.size() < 500000)
|
||||
lines.push_back(line);
|
||||
}
|
||||
|
||||
std::stringstream out_temp;
|
||||
for(std::vector<std::string>::iterator it = lines.begin(); it != lines.end(); it++) {
|
||||
size_t tmp_lines = ++pt_lines;
|
||||
if(tmp_lines % 10000 == 0) {
|
||||
boost::mutex::scoped_lock lock(err_mutex);
|
||||
std::cerr << ".";
|
||||
|
||||
if(tmp_lines % 500000 == 0)
|
||||
std::cerr << "[n:" << tmp_lines << "]\n";
|
||||
|
||||
if(tmp_lines % 10000000 == 0) {
|
||||
float pfefper = (100.0*(float)nremoved_pfefilter)/(float)pt_lines;
|
||||
float sigfper = (100.0*(float)nremoved_sigfilter)/(float)pt_lines;
|
||||
std::cerr << "------------------------------------------------------\n"
|
||||
<< " unfiltered phrases pairs: " << pt_lines << "\n"
|
||||
<< "\n"
|
||||
<< " P(f|e) filter [first]: " << nremoved_pfefilter << " (" << pfefper << "%)\n"
|
||||
<< " significance filter: " << nremoved_sigfilter << " (" << sigfper << "%)\n"
|
||||
<< " TOTAL FILTERED: " << (nremoved_pfefilter + nremoved_sigfilter) << " (" << (sigfper + pfefper) << "%)\n"
|
||||
<< "\n"
|
||||
<< " FILTERED phrase pairs: " << (pt_lines - nremoved_pfefilter - nremoved_sigfilter) << " (" << (100.0-sigfper - pfefper) << "%)\n"
|
||||
<< "------------------------------------------------------\n";
|
||||
}
|
||||
}
|
||||
|
||||
if(pt_lines % 10000 == 0) {
|
||||
BOOST_FOREACH(boost::shared_ptr<SA> f_sa, f_sas)
|
||||
f_sa->cache.prune();
|
||||
BOOST_FOREACH(boost::shared_ptr<SA> e_sa, e_sas)
|
||||
e_sa->cache.prune();
|
||||
}
|
||||
|
||||
if(it->length() > 0) {
|
||||
PTEntry* pp = new PTEntry(it->c_str(), pfe_index);
|
||||
if (prev != pp->f_phrase) {
|
||||
prev = pp->f_phrase;
|
||||
|
||||
if (!options.empty()) { // always true after first line
|
||||
compute_cooc_stats_and_filter(options);
|
||||
}
|
||||
|
||||
for (std::vector<PTEntry*>::iterator i = options.begin();
|
||||
i != options.end(); ++i) {
|
||||
out_temp << **i << '\n';
|
||||
delete *i;
|
||||
}
|
||||
|
||||
options.clear();
|
||||
options.push_back(pp);
|
||||
|
||||
} else {
|
||||
options.push_back(pp);
|
||||
}
|
||||
}
|
||||
}
|
||||
boost::mutex::scoped_lock lock(out_mutex);
|
||||
*out << out_temp.str() << std::flush;
|
||||
}
|
||||
compute_cooc_stats_and_filter(options);
|
||||
|
||||
boost::mutex::scoped_lock lock(out_mutex);
|
||||
for (std::vector<PTEntry*>::iterator i = options.begin();
|
||||
i != options.end(); ++i) {
|
||||
*out << **i << '\n';
|
||||
delete *i;
|
||||
}
|
||||
*out << std::flush;
|
||||
}
|
||||
|
||||
namespace po = boost::program_options;
|
||||
|
||||
int main(int argc, char * argv[])
|
||||
{
|
||||
bool help;
|
||||
std::vector<std::string> efiles;
|
||||
std::vector<std::string> ffiles;
|
||||
int pfe_index = 2;
|
||||
int threads = 1;
|
||||
size_t max_cache = 0;
|
||||
std::string str_sig_filter_limit;
|
||||
|
||||
po::options_description general("General options");
|
||||
general.add_options()
|
||||
("english,e", po::value<std::vector<std::string> >(&efiles)->multitoken(),
|
||||
"english.suf-arr")
|
||||
("french,f", po::value<std::vector<std::string> >(&ffiles)->multitoken(),
|
||||
"french.suf-arr")
|
||||
("pfe-index,i", po::value(&pfe_index)->default_value(2),
|
||||
"Index of P(f|e) in phrase table")
|
||||
("pfe-filter-limit,n", po::value(&pfe_filter_limit)->default_value(0),
|
||||
"0, 1...: 0=no filtering, >0 sort by P(e|f) and keep the top num elements")
|
||||
("threads,t", po::value(&threads)->default_value(1),
|
||||
"number of threads to use")
|
||||
("max-cache,m", po::value(&max_cache)->default_value(0),
|
||||
"limit cache to arg most recent phrases")
|
||||
("print-cooc,c", po::value(&print_cooc_counts)->zero_tokens()->default_value(false),
|
||||
"add the coocurrence counts to the phrase table")
|
||||
("print-significance,p", po::value(&print_neglog_significance)->zero_tokens()->default_value(false),
|
||||
"add -log(significance) to the phrase table")
|
||||
("hierarchical,x", po::value(&hierarchical)->zero_tokens()->default_value(false),
|
||||
"filter hierarchical rule table")
|
||||
("sig-filter-limit,l", po::value(&str_sig_filter_limit),
|
||||
">0.0, a+e, or a-e: keep values that have a -log significance > this")
|
||||
("help,h", po::value(&help)->zero_tokens()->default_value(false),
|
||||
"display this message")
|
||||
;
|
||||
|
||||
po::options_description cmdline_options("Allowed options");
|
||||
cmdline_options.add(general);
|
||||
po::variables_map vm;
|
||||
|
||||
try {
|
||||
po::store(po::command_line_parser(argc,argv).
|
||||
options(cmdline_options).run(), vm);
|
||||
po::notify(vm);
|
||||
}
|
||||
catch (std::exception& e) {
|
||||
std::cout << "Error: " << e.what() << std::endl << std::endl;
|
||||
|
||||
usage();
|
||||
std::cout << cmdline_options << std::endl;
|
||||
exit(0);
|
||||
}
|
||||
|
||||
if(vm["help"].as<bool>()) {
|
||||
usage();
|
||||
std::cout << cmdline_options << std::endl;
|
||||
exit(0);
|
||||
}
|
||||
|
||||
if(vm.count("pfe-filter-limit"))
|
||||
std::cerr << "P(f|e) filter limit: " << pfe_filter_limit << std::endl;
|
||||
if(vm.count("threads"))
|
||||
std::cerr << "Using threads: " << threads << std::endl;
|
||||
if(vm.count("max-cache"))
|
||||
std::cerr << "Using max phrases in caches: " << max_cache << std::endl;
|
||||
|
||||
if (strcmp(str_sig_filter_limit.c_str(),"a+e") == 0) {
|
||||
sig_filter_limit = ALPHA_PLUS_EPS;
|
||||
} else if (strcmp(str_sig_filter_limit.c_str(),"a-e") == 0) {
|
||||
sig_filter_limit = ALPHA_MINUS_EPS;
|
||||
} else {
|
||||
char *x;
|
||||
sig_filter_limit = strtod(str_sig_filter_limit.c_str(), &x);
|
||||
if (sig_filter_limit < 0.0) {
|
||||
std::cerr << "Filter limit (-l) must be either 'a+e', 'a-e' or a real number >= 0.0\n";
|
||||
usage();
|
||||
}
|
||||
}
|
||||
|
||||
if (sig_filter_limit == 0.0) pef_filter_only = true;
|
||||
//-----------------------------------------------------------------------------
|
||||
if (optind != argc || ((efiles.empty() || ffiles.empty()) && !pef_filter_only)) {
|
||||
usage();
|
||||
}
|
||||
|
||||
if (!pef_filter_only) {
|
||||
size_t elines = 0;
|
||||
BOOST_FOREACH(std::string& efile, efiles) {
|
||||
e_sas.push_back(boost::shared_ptr<SA>(new SA()));
|
||||
e_sas.back()->V.open(efile + ".tdx");
|
||||
e_sas.back()->T.reset(new ttrack_t());
|
||||
e_sas.back()->T->open(efile + ".mct");
|
||||
e_sas.back()->I.open(efile + ".sfa", e_sas.back()->T);
|
||||
elines += e_sas.back()->T->size();
|
||||
}
|
||||
|
||||
size_t flines = 0;
|
||||
BOOST_FOREACH(std::string& ffile, ffiles) {
|
||||
f_sas.push_back(boost::shared_ptr<SA>(new SA()));
|
||||
f_sas.back()->V.open(ffile + ".tdx");
|
||||
f_sas.back()->T.reset(new ttrack_t());
|
||||
f_sas.back()->T->open(ffile + ".mct");
|
||||
f_sas.back()->I.open(ffile + ".sfa", f_sas.back()->T);
|
||||
flines += f_sas.back()->T->size();
|
||||
}
|
||||
|
||||
if (elines != flines) {
|
||||
std::cerr << "Number of lines in e-corpus != number of lines in f-corpus!\n";
|
||||
usage();
|
||||
exit(1);
|
||||
} else {
|
||||
std::cerr << "Training corpus: " << elines << " lines\n";
|
||||
num_lines = elines;
|
||||
}
|
||||
p_111 = -log(fisher_exact(1,1,1));
|
||||
std::cerr << "\\alpha = " << p_111 << "\n";
|
||||
if (sig_filter_limit == ALPHA_MINUS_EPS) {
|
||||
sig_filter_limit = p_111 - 0.001;
|
||||
} else if (sig_filter_limit == ALPHA_PLUS_EPS) {
|
||||
sig_filter_limit = p_111 + 0.001;
|
||||
}
|
||||
std::cerr << "Sig filter threshold is = " << sig_filter_limit << "\n";
|
||||
} else {
|
||||
std::cerr << "Filtering using P(e|f) only. n=" << pfe_filter_limit << std::endl;
|
||||
}
|
||||
|
||||
Cache::set_max_cache(max_cache);
|
||||
std::ios_base::sync_with_stdio(false);
|
||||
|
||||
boost::thread_group threadGroup;
|
||||
for(int i = 0; i < threads; i++)
|
||||
threadGroup.add_thread(new boost::thread(filter_thread, &std::cin, &std::cout, pfe_index));
|
||||
threadGroup.join_all();
|
||||
|
||||
float pfefper = (100.0*(float)nremoved_pfefilter)/(float)pt_lines;
|
||||
float sigfper = (100.0*(float)nremoved_sigfilter)/(float)pt_lines;
|
||||
|
||||
std::cerr << "\n\n------------------------------------------------------\n"
|
||||
<< " unfiltered phrases pairs: " << pt_lines << "\n"
|
||||
<< "\n"
|
||||
<< " P(f|e) filter [first]: " << nremoved_pfefilter << " (" << pfefper << "%)\n"
|
||||
<< " significance filter: " << nremoved_sigfilter << " (" << sigfper << "%)\n"
|
||||
<< " TOTAL FILTERED: " << (nremoved_pfefilter + nremoved_sigfilter) << " (" << (sigfper + pfefper) << "%)\n"
|
||||
<< "\n"
|
||||
<< " FILTERED phrase pairs: " << (pt_lines - nremoved_pfefilter - nremoved_sigfilter) << " (" << (100.0-sigfper - pfefper) << "%)\n"
|
||||
<< "------------------------------------------------------\n";
|
||||
}
|
@ -5,7 +5,7 @@
|
||||
#include <vector>
|
||||
|
||||
#ifndef NO_MOSES
|
||||
#include "moses/FF/LexicalReordering/LexicalReorderingState.h"
|
||||
#include "moses/FF/LexicalReordering/LRState.h"
|
||||
#endif
|
||||
|
||||
namespace sapt {
|
||||
|
@ -4,7 +4,7 @@
|
||||
#include "ug_typedefs.h"
|
||||
#include "ug_bitext_pstats.h"
|
||||
#ifndef NO_MOSES
|
||||
#include "moses/FF/LexicalReordering/LexicalReorderingState.h"
|
||||
#include "moses/FF/LexicalReordering/LRState.h"
|
||||
#endif
|
||||
#include "boost/format.hpp"
|
||||
#include "tpt_tokenindex.h"
|
||||
|
@ -42,7 +42,7 @@ TrellisPath::TrellisPath(const Hypothesis *hypo)
|
||||
|
||||
void TrellisPath::InitTotalScore()
|
||||
{
|
||||
m_totalScore = m_path[0]->GetWinningHypo()->GetFutureScore();
|
||||
m_totalScore = m_path[0]->GetWinningHypo()->GetFutureScore();
|
||||
|
||||
//calc score
|
||||
size_t sizePath = m_path.size();
|
||||
@ -50,7 +50,7 @@ void TrellisPath::InitTotalScore()
|
||||
const Hypothesis *hypo = m_path[pos];
|
||||
const Hypothesis *winningHypo = hypo->GetWinningHypo();
|
||||
if (hypo != winningHypo) {
|
||||
m_totalScore = m_totalScore - winningHypo->GetFutureScore() + hypo->GetFutureScore();
|
||||
m_totalScore += hypo->GetFutureScore() - winningHypo->GetFutureScore();
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -169,9 +169,6 @@ TrellisPath::
|
||||
GetScoreBreakdown() const
|
||||
{
|
||||
if (!m_scoreBreakdown) {
|
||||
float totalScore = m_path[0]->GetWinningHypo()->GetFutureScore();
|
||||
// calculated for sanity check only
|
||||
|
||||
m_scoreBreakdown.reset(new ScoreComponentCollection());
|
||||
m_scoreBreakdown->PlusEquals(m_path[0]->GetWinningHypo()->GetScoreBreakdown());
|
||||
|
||||
@ -184,13 +181,10 @@ GetScoreBreakdown() const
|
||||
const Hypothesis *hypo = m_path[pos];
|
||||
const Hypothesis *winningHypo = hypo->GetWinningHypo();
|
||||
if (hypo != winningHypo) {
|
||||
totalScore += hypo->GetFutureScore() - winningHypo->GetFutureScore();
|
||||
m_scoreBreakdown->MinusEquals(winningHypo->GetScoreBreakdown());
|
||||
m_scoreBreakdown->PlusEquals(hypo->GetScoreBreakdown());
|
||||
}
|
||||
}
|
||||
|
||||
assert(totalScore == m_totalScore);
|
||||
}
|
||||
|
||||
return m_scoreBreakdown;
|
||||
|
@ -3,6 +3,7 @@
|
||||
#include "moses/ContextScope.h"
|
||||
#include <boost/foreach.hpp>
|
||||
#include "moses/Util.h"
|
||||
#include "moses/TreeInput.h"
|
||||
#include "moses/Hypothesis.h"
|
||||
|
||||
namespace MosesServer
|
||||
@ -24,6 +25,7 @@ using Moses::FValue;
|
||||
using Moses::PhraseDictionaryMultiModel;
|
||||
using Moses::FindPhraseDictionary;
|
||||
using Moses::Sentence;
|
||||
using Moses::TreeInput;
|
||||
|
||||
boost::shared_ptr<TranslationRequest>
|
||||
TranslationRequest::
|
||||
@ -317,7 +319,13 @@ parse_request(std::map<std::string, xmlrpc_c::value> const& params)
|
||||
// for (size_t i = 1; i < tmp.size(); i += 2)
|
||||
// m_bias[xmlrpc_c::value_int(tmp[i-1])] = xmlrpc_c::value_double(tmp[i]);
|
||||
// }
|
||||
m_source.reset(new Sentence(m_options,0,m_source_string));
|
||||
if (is_syntax(m_options->search.algo)) {
|
||||
m_source.reset(new TreeInput(m_options));
|
||||
istringstream in(m_source_string + "\n");
|
||||
m_source->Read(in);
|
||||
} else {
|
||||
m_source.reset(new Sentence(m_options,0,m_source_string));
|
||||
}
|
||||
} // end of Translationtask::parse_request()
|
||||
|
||||
|
||||
@ -334,7 +342,7 @@ run_chart_decoder()
|
||||
|
||||
const Moses::ChartHypothesis *hypo = manager.GetBestHypothesis();
|
||||
ostringstream out;
|
||||
outputChartHypo(out,hypo);
|
||||
if (hypo) outputChartHypo(out,hypo);
|
||||
|
||||
m_target_string = out.str();
|
||||
m_retData["text"] = xmlrpc_c::value_string(m_target_string);
|
||||
|
@ -311,12 +311,14 @@ std::string ExtractionPhrasePair::CollectAllPropertyValues(const std::string &ke
|
||||
std::ostringstream oss;
|
||||
for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin();
|
||||
iter!=allPropertyValues->end(); ++iter) {
|
||||
if (iter!=allPropertyValues->begin()) {
|
||||
if (!(iter->first).empty()) {
|
||||
if (iter!=allPropertyValues->begin()) {
|
||||
oss << " ";
|
||||
}
|
||||
oss << iter->first;
|
||||
oss << " ";
|
||||
oss << iter->second;
|
||||
}
|
||||
oss << iter->first;
|
||||
oss << " ";
|
||||
oss << iter->second;
|
||||
}
|
||||
|
||||
std::string allPropertyValuesString(oss.str());
|
||||
|
@ -50,7 +50,10 @@ private:
|
||||
bool onlyOutputSpanInfo;
|
||||
bool gzOutput;
|
||||
std::string instanceWeightsFile; //weights for each sentence
|
||||
bool targetConstituentConstrainedFlag;
|
||||
bool targetConstituentBoundariesFlag;
|
||||
bool flexScoreFlag;
|
||||
bool singleWordHeuristicFlag;
|
||||
|
||||
public:
|
||||
std::vector<std::string> placeholders;
|
||||
@ -72,7 +75,10 @@ public:
|
||||
includeSentenceIdFlag(false),
|
||||
onlyOutputSpanInfo(false),
|
||||
gzOutput(false),
|
||||
targetConstituentConstrainedFlag(false),
|
||||
targetConstituentBoundariesFlag(false),
|
||||
flexScoreFlag(false),
|
||||
singleWordHeuristicFlag(false),
|
||||
debug(false) {
|
||||
}
|
||||
|
||||
@ -116,9 +122,18 @@ public:
|
||||
void initInstanceWeightsFile(const char* initInstanceWeightsFile) {
|
||||
instanceWeightsFile = std::string(initInstanceWeightsFile);
|
||||
}
|
||||
void initTargetConstituentConstrainedFlag(const bool initTargetConstituentConstrainedFlag) {
|
||||
targetConstituentConstrainedFlag = initTargetConstituentConstrainedFlag;
|
||||
}
|
||||
void initTargetConstituentBoundariesFlag(const bool initTargetConstituentBoundariesFlag) {
|
||||
targetConstituentBoundariesFlag = initTargetConstituentBoundariesFlag;
|
||||
}
|
||||
void initFlexScoreFlag(const bool initflexScoreFlag) {
|
||||
flexScoreFlag=initflexScoreFlag;
|
||||
}
|
||||
void initSingleWordHeuristicFlag(const bool initSingleWordHeuristicFlag) {
|
||||
singleWordHeuristicFlag = initSingleWordHeuristicFlag;
|
||||
}
|
||||
|
||||
// functions for getting values
|
||||
bool isAllModelsOutputFlag() const {
|
||||
@ -160,9 +175,18 @@ public:
|
||||
std::string getInstanceWeightsFile() const {
|
||||
return instanceWeightsFile;
|
||||
}
|
||||
bool isTargetConstituentConstrainedFlag() const {
|
||||
return targetConstituentConstrainedFlag;
|
||||
}
|
||||
bool isTargetConstituentBoundariesFlag() const {
|
||||
return targetConstituentBoundariesFlag;
|
||||
}
|
||||
bool isFlexScoreFlag() const {
|
||||
return flexScoreFlag;
|
||||
}
|
||||
bool isSingleWordHeuristicFlag() const {
|
||||
return singleWordHeuristicFlag;
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -18,8 +18,6 @@
|
||||
***********************************************************************/
|
||||
|
||||
#pragma once
|
||||
#ifndef RULEEXTRACTIONOPTIONS_H_INCLUDED_
|
||||
#define RULEEXTRACTIONOPTIONS_H_INCLUDED_
|
||||
|
||||
namespace MosesTraining
|
||||
{
|
||||
@ -95,4 +93,3 @@ public:
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -35,7 +35,7 @@ namespace MosesTraining
|
||||
|
||||
bool SentenceAlignmentWithSyntax::processTargetSentence(const char * targetString, int sentenceID, bool boundaryRules)
|
||||
{
|
||||
if (!m_options.targetSyntax) {
|
||||
if (!m_targetSyntax) {
|
||||
return SentenceAlignment::processTargetSentence(targetString, sentenceID, boundaryRules);
|
||||
}
|
||||
|
||||
@ -56,7 +56,7 @@ bool SentenceAlignmentWithSyntax::processTargetSentence(const char * targetStrin
|
||||
|
||||
bool SentenceAlignmentWithSyntax::processSourceSentence(const char * sourceString, int sentenceID, bool boundaryRules)
|
||||
{
|
||||
if (!m_options.sourceSyntax) {
|
||||
if (!m_sourceSyntax) {
|
||||
return SentenceAlignment::processSourceSentence(sourceString, sentenceID, boundaryRules);
|
||||
}
|
||||
|
||||
|
@ -18,8 +18,6 @@
|
||||
***********************************************************************/
|
||||
|
||||
#pragma once
|
||||
#ifndef SENTENCEALIGNMENTWITHSYNTAX_H_INCLUDED_
|
||||
#define SENTENCEALIGNMENTWITHSYNTAX_H_INCLUDED_
|
||||
|
||||
#include <map>
|
||||
#include <set>
|
||||
@ -42,18 +40,20 @@ public:
|
||||
std::set<std::string> & m_sourceLabelCollection;
|
||||
std::map<std::string, int> & m_targetTopLabelCollection;
|
||||
std::map<std::string, int> & m_sourceTopLabelCollection;
|
||||
const RuleExtractionOptions & m_options;
|
||||
const bool m_targetSyntax, m_sourceSyntax;
|
||||
|
||||
SentenceAlignmentWithSyntax(std::set<std::string> & tgtLabelColl,
|
||||
std::set<std::string> & srcLabelColl,
|
||||
std::map<std::string,int> & tgtTopLabelColl,
|
||||
std::map<std::string,int> & srcTopLabelColl,
|
||||
const RuleExtractionOptions & options)
|
||||
bool targetSyntax,
|
||||
bool sourceSyntax)
|
||||
: m_targetLabelCollection(tgtLabelColl)
|
||||
, m_sourceLabelCollection(srcLabelColl)
|
||||
, m_targetTopLabelCollection(tgtTopLabelColl)
|
||||
, m_sourceTopLabelCollection(srcTopLabelColl)
|
||||
, m_options(options) {
|
||||
, m_targetSyntax(targetSyntax)
|
||||
, m_sourceSyntax(sourceSyntax) {
|
||||
}
|
||||
|
||||
virtual ~SentenceAlignmentWithSyntax() {}
|
||||
@ -67,4 +67,3 @@ public:
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -47,6 +47,8 @@ SyntaxNode *SyntaxNodeCollection::AddNode(int startPos, int endPos,
|
||||
SyntaxNode* newNode = new SyntaxNode(label, startPos, endPos);
|
||||
m_nodes.push_back( newNode );
|
||||
m_index[ startPos ][ endPos ].push_back( newNode );
|
||||
m_endPositionsIndex[ endPos ].push_back( newNode );
|
||||
m_startPositionsIndex[ startPos ].push_back( newNode ); // TODO: may not need this: access m_index by startPos and iterate over its InnerNodeIndex (= end positions)?
|
||||
m_numWords = std::max(endPos+1, m_numWords);
|
||||
return newNode;
|
||||
}
|
||||
@ -70,6 +72,36 @@ const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodes(
|
||||
return endIndex->second;
|
||||
}
|
||||
|
||||
bool SyntaxNodeCollection::HasNodeStartingAtPosition( int startPos ) const
|
||||
{
|
||||
return GetNodesByStartPosition(startPos).size() > 0;
|
||||
}
|
||||
|
||||
const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodesByStartPosition(
|
||||
int startPos ) const
|
||||
{
|
||||
InnerNodeIndex::const_iterator startIndex = m_startPositionsIndex.find( startPos );
|
||||
if (startIndex == m_startPositionsIndex.end() )
|
||||
return m_emptyNode;
|
||||
|
||||
return startIndex->second;
|
||||
}
|
||||
|
||||
bool SyntaxNodeCollection::HasNodeEndingAtPosition( int endPos ) const
|
||||
{
|
||||
return GetNodesByEndPosition(endPos).size() > 0;
|
||||
}
|
||||
|
||||
const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodesByEndPosition(
|
||||
int endPos ) const
|
||||
{
|
||||
InnerNodeIndex::const_iterator endIndex = m_endPositionsIndex.find( endPos );
|
||||
if (endIndex == m_endPositionsIndex.end() )
|
||||
return m_emptyNode;
|
||||
|
||||
return endIndex->second;
|
||||
}
|
||||
|
||||
std::auto_ptr<SyntaxTree> SyntaxNodeCollection::ExtractTree()
|
||||
{
|
||||
std::map<SyntaxNode *, SyntaxTree *> nodeToTree;
|
||||
|
@ -50,6 +50,11 @@ public:
|
||||
//! Lookup the SyntaxNodes for a given span.
|
||||
const std::vector< SyntaxNode* >& GetNodes( int startPos, int endPos ) const;
|
||||
|
||||
bool HasNodeStartingAtPosition( int startPos ) const;
|
||||
const std::vector< SyntaxNode* >& GetNodesByStartPosition( int startPos ) const;
|
||||
bool HasNodeEndingAtPosition( int endPos ) const;
|
||||
const std::vector< SyntaxNode* >& GetNodesByEndPosition( int endPos ) const;
|
||||
|
||||
//! Get a vector of pointers to all SyntaxNodes (unordered).
|
||||
const std::vector< SyntaxNode* >& GetAllNodes() {
|
||||
return m_nodes;
|
||||
@ -78,6 +83,9 @@ private:
|
||||
NodeIndex m_index;
|
||||
int m_numWords;
|
||||
std::vector< SyntaxNode* > m_emptyNode;
|
||||
|
||||
InnerNodeIndex m_endPositionsIndex;
|
||||
InnerNodeIndex m_startPositionsIndex;
|
||||
};
|
||||
|
||||
} // namespace MosesTraining
|
||||
|
@ -1,11 +1,3 @@
|
||||
/*
|
||||
* extract.cpp
|
||||
* Modified by: Rohit Gupta CDAC, Mumbai, India
|
||||
* on July 15, 2012 to implement parallel processing
|
||||
* Modified by: Nadi Tomeh - LIMSI/CNRS
|
||||
* Machine Translation Marathon 2010, Dublin
|
||||
*/
|
||||
|
||||
#include <cstdio>
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
@ -20,11 +12,13 @@
|
||||
#include <vector>
|
||||
#include <limits>
|
||||
|
||||
#include "SentenceAlignment.h"
|
||||
#include "tables-core.h"
|
||||
#include "InputFileStream.h"
|
||||
#include "OutputFileStream.h"
|
||||
#include "PhraseExtractionOptions.h"
|
||||
#include "SentenceAlignmentWithSyntax.h"
|
||||
#include "SyntaxNode.h"
|
||||
#include "moses/Util.h"
|
||||
|
||||
using namespace std;
|
||||
using namespace MosesTraining;
|
||||
@ -46,14 +40,14 @@ typedef vector < HPhrase > HPhraseVector;
|
||||
// The key of the map is the English index and the value is a set of the source ones
|
||||
typedef map <int, set<int> > HSentenceVertices;
|
||||
|
||||
REO_POS getOrientWordModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
|
||||
REO_POS getOrientWordModel(SentenceAlignmentWithSyntax &, REO_MODEL_TYPE, bool, bool,
|
||||
int, int, int, int, int, int, int,
|
||||
bool (*)(int, int), bool (*)(int, int));
|
||||
REO_POS getOrientPhraseModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
|
||||
REO_POS getOrientPhraseModel(SentenceAlignmentWithSyntax &, REO_MODEL_TYPE, bool, bool,
|
||||
int, int, int, int, int, int, int,
|
||||
bool (*)(int, int), bool (*)(int, int),
|
||||
const HSentenceVertices &, const HSentenceVertices &);
|
||||
REO_POS getOrientHierModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
|
||||
REO_POS getOrientHierModel(SentenceAlignmentWithSyntax &, REO_MODEL_TYPE, bool, bool,
|
||||
int, int, int, int, int, int, int,
|
||||
bool (*)(int, int), bool (*)(int, int),
|
||||
const HSentenceVertices &, const HSentenceVertices &,
|
||||
@ -69,25 +63,16 @@ bool ge(int, int);
|
||||
bool le(int, int);
|
||||
bool lt(int, int);
|
||||
|
||||
bool isAligned (SentenceAlignment &, int, int);
|
||||
bool isAligned (SentenceAlignmentWithSyntax &, int, int);
|
||||
|
||||
int sentenceOffset = 0;
|
||||
|
||||
std::vector<std::string> Tokenize(const std::string& str,
|
||||
const std::string& delimiters = " \t");
|
||||
|
||||
bool flexScoreFlag = false;
|
||||
|
||||
}
|
||||
|
||||
namespace MosesTraining
|
||||
{
|
||||
|
||||
class ExtractTask
|
||||
{
|
||||
public:
|
||||
ExtractTask(
|
||||
size_t id, SentenceAlignment &sentence,
|
||||
size_t id, SentenceAlignmentWithSyntax &sentence,
|
||||
PhraseExtractionOptions &initoptions,
|
||||
Moses::OutputFileStream &extractFile,
|
||||
Moses::OutputFileStream &extractFileInv,
|
||||
@ -109,14 +94,26 @@ private:
|
||||
vector< string > m_extractedPhrasesSid;
|
||||
vector< string > m_extractedPhrasesContext;
|
||||
vector< string > m_extractedPhrasesContextInv;
|
||||
void extractBase(SentenceAlignment &);
|
||||
void extract(SentenceAlignment &);
|
||||
void addPhrase(SentenceAlignment &, int, int, int, int, string &);
|
||||
void extractBase();
|
||||
void extract();
|
||||
void addPhrase(int, int, int, int, const std::string &);
|
||||
void writePhrasesToFile();
|
||||
bool checkPlaceholders (const SentenceAlignment &sentence, int startE, int endE, int startF, int endF);
|
||||
bool isPlaceholder(const string &word);
|
||||
bool checkPlaceholders(int startE, int endE, int startF, int endF) const;
|
||||
bool isPlaceholder(const string &word) const;
|
||||
bool checkTargetConstituentBoundaries(int startE, int endE, int startF, int endF,
|
||||
ostringstream &outextractstrPhraseProperties) const;
|
||||
void getOrientationInfo(int startE, int endE, int startF, int endF,
|
||||
const HSentenceVertices& inTopLeft,
|
||||
const HSentenceVertices& inTopRight,
|
||||
const HSentenceVertices& inBottomLeft,
|
||||
const HSentenceVertices& inBottomRight,
|
||||
const HSentenceVertices& outTopLeft,
|
||||
const HSentenceVertices& outTopRight,
|
||||
const HSentenceVertices& outBottomLeft,
|
||||
const HSentenceVertices& outBottomRight,
|
||||
std::string &orientationInfo) const;
|
||||
|
||||
SentenceAlignment &m_sentence;
|
||||
SentenceAlignmentWithSyntax &m_sentence;
|
||||
const PhraseExtractionOptions &m_options;
|
||||
Moses::OutputFileStream &m_extractFile;
|
||||
Moses::OutputFileStream &m_extractFileInv;
|
||||
@ -128,12 +125,13 @@ private:
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
cerr << "PhraseExtract v1.4, written by Philipp Koehn\n"
|
||||
<< "phrase extraction from an aligned parallel corpus\n";
|
||||
cerr << "PhraseExtract v1.5, written by Philipp Koehn et al." << std::endl
|
||||
<< "phrase extraction from an aligned parallel corpus" << std::endl;
|
||||
|
||||
if (argc < 6) {
|
||||
cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] ";
|
||||
cerr<<"| --OnlyOutputSpanInfo | --NoTTable | --GZOutput | --IncludeSentenceId | --SentenceOffset n | --InstanceWeights filename ]\n";
|
||||
cerr << "| --OnlyOutputSpanInfo | --NoTTable | --GZOutput | --IncludeSentenceId | --SentenceOffset n | --InstanceWeights filename ";
|
||||
cerr << "| --TargetConstituentConstrained | --TargetConstituentBoundaries ]" << std::endl;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
@ -153,8 +151,14 @@ int main(int argc, char* argv[])
|
||||
options.initOnlyOutputSpanInfo(true);
|
||||
} else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) {
|
||||
options.initOrientationFlag(true);
|
||||
} else if (strcmp(argv[i],"--TargetConstituentConstrained") == 0) {
|
||||
options.initTargetConstituentConstrainedFlag(true);
|
||||
} else if (strcmp(argv[i],"--TargetConstituentBoundaries") == 0) {
|
||||
options.initTargetConstituentBoundariesFlag(true);
|
||||
} else if (strcmp(argv[i],"--FlexibilityScore") == 0) {
|
||||
options.initFlexScoreFlag(true);
|
||||
} else if (strcmp(argv[i],"--SingleWordHeuristic") == 0) {
|
||||
options.initSingleWordHeuristicFlag(true);
|
||||
} else if (strcmp(argv[i],"--NoTTable") == 0) {
|
||||
options.initTranslationFlag(false);
|
||||
} else if (strcmp(argv[i], "--IncludeSentenceId") == 0) {
|
||||
@ -231,9 +235,9 @@ int main(int argc, char* argv[])
|
||||
} else if (strcmp(argv[i], "--Placeholders") == 0) {
|
||||
++i;
|
||||
string str = argv[i];
|
||||
options.placeholders = Tokenize(str.c_str(), ",");
|
||||
Moses::Tokenize(options.placeholders, str.c_str(), ",");
|
||||
} else {
|
||||
cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
|
||||
cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'" << std::endl;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
@ -278,11 +282,16 @@ int main(int argc, char* argv[])
|
||||
extractFileContextInv.Open(fileNameExtractContextInv.c_str());
|
||||
}
|
||||
|
||||
// stats on labels for glue grammar and unknown word label probabilities
|
||||
set< string > targetLabelCollection, sourceLabelCollection;
|
||||
map< string, int > targetTopLabelCollection, sourceTopLabelCollection;
|
||||
const bool targetSyntax = true;
|
||||
|
||||
int i = sentenceOffset;
|
||||
|
||||
string englishString, foreignString, alignmentString, weightString;
|
||||
|
||||
while(getline(*eFileP, englishString)) {
|
||||
while (getline(*eFileP, englishString)) {
|
||||
// Print progress dots to stderr.
|
||||
i++;
|
||||
if (i%10000 == 0) cerr << "." << flush;
|
||||
@ -293,7 +302,10 @@ int main(int argc, char* argv[])
|
||||
getline(*iwFileP, weightString);
|
||||
}
|
||||
|
||||
SentenceAlignment sentence;
|
||||
SentenceAlignmentWithSyntax sentence
|
||||
(targetLabelCollection, sourceLabelCollection,
|
||||
targetTopLabelCollection, sourceTopLabelCollection,
|
||||
targetSyntax, false);
|
||||
// cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl;
|
||||
//az: output src, tgt, and alingment line
|
||||
if (options.isOnlyOutputSpanInfo()) {
|
||||
@ -347,7 +359,7 @@ namespace MosesTraining
|
||||
{
|
||||
void ExtractTask::Run()
|
||||
{
|
||||
extract(m_sentence);
|
||||
extract();
|
||||
writePhrasesToFile();
|
||||
m_extractedPhrases.clear();
|
||||
m_extractedPhrasesInv.clear();
|
||||
@ -358,10 +370,10 @@ void ExtractTask::Run()
|
||||
|
||||
}
|
||||
|
||||
void ExtractTask::extract(SentenceAlignment &sentence)
|
||||
void ExtractTask::extract()
|
||||
{
|
||||
int countE = sentence.target.size();
|
||||
int countF = sentence.source.size();
|
||||
int countE = m_sentence.target.size();
|
||||
int countF = m_sentence.source.size();
|
||||
|
||||
HPhraseVector inboundPhrases;
|
||||
|
||||
@ -376,21 +388,20 @@ void ExtractTask::extract(SentenceAlignment &sentence)
|
||||
HSentenceVertices outBottomRight;
|
||||
|
||||
bool relaxLimit = m_options.isHierModel();
|
||||
bool buildExtraStructure = m_options.isPhraseModel() || m_options.isHierModel();
|
||||
|
||||
// check alignments for target phrase startE...endE
|
||||
// loop over extracted phrases which are compatible with the word-alignments
|
||||
for(int startE=0; startE<countE; startE++) {
|
||||
for(int endE=startE;
|
||||
(endE<countE && (relaxLimit || endE<startE+m_options.maxPhraseLength));
|
||||
endE++) {
|
||||
for (int startE=0; startE<countE; startE++) {
|
||||
for (int endE=startE;
|
||||
(endE<countE && (relaxLimit || endE<startE+m_options.maxPhraseLength));
|
||||
endE++) {
|
||||
|
||||
int minF = std::numeric_limits<int>::max();
|
||||
int maxF = -1;
|
||||
vector< int > usedF = sentence.alignedCountS;
|
||||
for(int ei=startE; ei<=endE; ei++) {
|
||||
for(size_t i=0; i<sentence.alignedToT[ei].size(); i++) {
|
||||
int fi = sentence.alignedToT[ei][i];
|
||||
vector< int > usedF = m_sentence.alignedCountS;
|
||||
for (int ei=startE; ei<=endE; ei++) {
|
||||
for (size_t i=0; i<m_sentence.alignedToT[ei].size(); i++) {
|
||||
int fi = m_sentence.alignedToT[ei][i];
|
||||
if (fi<minF) {
|
||||
minF = fi;
|
||||
}
|
||||
@ -406,111 +417,142 @@ void ExtractTask::extract(SentenceAlignment &sentence)
|
||||
|
||||
// check if source words are aligned to out of bound target words
|
||||
bool out_of_bounds = false;
|
||||
for(int fi=minF; fi<=maxF && !out_of_bounds; fi++)
|
||||
for (int fi=minF; fi<=maxF && !out_of_bounds; fi++)
|
||||
if (usedF[fi]>0) {
|
||||
// cout << "ouf of bounds: " << fi << "\n";
|
||||
// cout << "ouf of bounds: " << fi << std::endl;
|
||||
out_of_bounds = true;
|
||||
}
|
||||
|
||||
// cout << "doing if for ( " << minF << "-" << maxF << ", " << startE << "," << endE << ")\n";
|
||||
// cout << "doing if for ( " << minF << "-" << maxF << ", " << startE << "," << endE << ")" << std::endl;
|
||||
if (!out_of_bounds) {
|
||||
// start point of source phrase may retreat over unaligned
|
||||
for(int startF=minF;
|
||||
(startF>=0 &&
|
||||
(relaxLimit || startF>maxF-m_options.maxPhraseLength) && // within length limit
|
||||
(startF==minF || sentence.alignedCountS[startF]==0)); // unaligned
|
||||
startF--)
|
||||
for (int startF=minF;
|
||||
(startF>=0 &&
|
||||
(relaxLimit || startF>maxF-m_options.maxPhraseLength) && // within length limit
|
||||
(startF==minF || m_sentence.alignedCountS[startF]==0)); // unaligned
|
||||
startF--) {
|
||||
// end point of source phrase may advance over unaligned
|
||||
for(int endF=maxF;
|
||||
(endF<countF &&
|
||||
(relaxLimit || endF<startF+m_options.maxPhraseLength) && // within length limit
|
||||
(endF==maxF || sentence.alignedCountS[endF]==0)); // unaligned
|
||||
endF++) { // at this point we have extracted a phrase
|
||||
if(buildExtraStructure) { // phrase || hier
|
||||
if(endE-startE < m_options.maxPhraseLength && endF-startF < m_options.maxPhraseLength) { // within limit
|
||||
inboundPhrases.push_back(HPhrase(HPhraseVertex(startF,startE),
|
||||
HPhraseVertex(endF,endE)));
|
||||
insertPhraseVertices(inTopLeft, inTopRight, inBottomLeft, inBottomRight,
|
||||
startF, startE, endF, endE);
|
||||
} else
|
||||
insertPhraseVertices(outTopLeft, outTopRight, outBottomLeft, outBottomRight,
|
||||
startF, startE, endF, endE);
|
||||
for (int endF=maxF;
|
||||
(endF<countF &&
|
||||
(relaxLimit || endF<startF+m_options.maxPhraseLength) && // within length limit
|
||||
(endF==maxF || m_sentence.alignedCountS[endF]==0)); // unaligned
|
||||
endF++) { // at this point we have extracted a phrase
|
||||
|
||||
if(endE-startE < m_options.maxPhraseLength && endF-startF < m_options.maxPhraseLength) { // within limit
|
||||
inboundPhrases.push_back(HPhrase(HPhraseVertex(startF,startE),
|
||||
HPhraseVertex(endF,endE)));
|
||||
insertPhraseVertices(inTopLeft, inTopRight, inBottomLeft, inBottomRight,
|
||||
startF, startE, endF, endE);
|
||||
} else {
|
||||
string orientationInfo = "";
|
||||
if(m_options.isWordModel()) {
|
||||
REO_POS wordPrevOrient, wordNextOrient;
|
||||
bool connectedLeftTopP = isAligned( sentence, startF-1, startE-1 );
|
||||
bool connectedRightTopP = isAligned( sentence, endF+1, startE-1 );
|
||||
bool connectedLeftTopN = isAligned( sentence, endF+1, endE+1 );
|
||||
bool connectedRightTopN = isAligned( sentence, startF-1, endE+1 );
|
||||
wordPrevOrient = getOrientWordModel(sentence, m_options.isWordType(), connectedLeftTopP, connectedRightTopP, startF, endF, startE, endE, countF, 0, 1, &ge, <);
|
||||
wordNextOrient = getOrientWordModel(sentence, m_options.isWordType(), connectedLeftTopN, connectedRightTopN, endF, startF, endE, startE, 0, countF, -1, <, &ge);
|
||||
orientationInfo += getOrientString(wordPrevOrient, m_options.isWordType()) + " " + getOrientString(wordNextOrient, m_options.isWordType());
|
||||
// if(m_options.isAllModelsOutputFlag())
|
||||
// " | | ";
|
||||
}
|
||||
addPhrase(sentence, startE, endE, startF, endF, orientationInfo);
|
||||
insertPhraseVertices(outTopLeft, outTopRight, outBottomLeft, outBottomRight,
|
||||
startF, startE, endF, endE);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(buildExtraStructure) { // phrase || hier
|
||||
string orientationInfo = "";
|
||||
REO_POS wordPrevOrient=UNKNOWN, wordNextOrient=UNKNOWN, phrasePrevOrient, phraseNextOrient, hierPrevOrient, hierNextOrient;
|
||||
std::string orientationInfo = "";
|
||||
|
||||
for(size_t i = 0; i < inboundPhrases.size(); i++) {
|
||||
int startF = inboundPhrases[i].first.first;
|
||||
int startE = inboundPhrases[i].first.second;
|
||||
int endF = inboundPhrases[i].second.first;
|
||||
int endE = inboundPhrases[i].second.second;
|
||||
for (size_t i = 0; i < inboundPhrases.size(); i++) {
|
||||
|
||||
bool connectedLeftTopP = isAligned( sentence, startF-1, startE-1 );
|
||||
bool connectedRightTopP = isAligned( sentence, endF+1, startE-1 );
|
||||
bool connectedLeftTopN = isAligned( sentence, endF+1, endE+1 );
|
||||
bool connectedRightTopN = isAligned( sentence, startF-1, endE+1 );
|
||||
int startF = inboundPhrases[i].first.first;
|
||||
int startE = inboundPhrases[i].first.second;
|
||||
int endF = inboundPhrases[i].second.first;
|
||||
int endE = inboundPhrases[i].second.second;
|
||||
|
||||
if(m_options.isWordModel()) {
|
||||
wordPrevOrient = getOrientWordModel(sentence, m_options.isWordType(),
|
||||
connectedLeftTopP, connectedRightTopP,
|
||||
startF, endF, startE, endE, countF, 0, 1,
|
||||
&ge, <);
|
||||
wordNextOrient = getOrientWordModel(sentence, m_options.isWordType(),
|
||||
connectedLeftTopN, connectedRightTopN,
|
||||
endF, startF, endE, startE, 0, countF, -1,
|
||||
<, &ge);
|
||||
getOrientationInfo(startE, endE, startF, endF,
|
||||
inTopLeft, inTopRight, inBottomLeft, inBottomRight,
|
||||
outTopLeft, outTopRight, outBottomLeft, outBottomRight,
|
||||
orientationInfo);
|
||||
|
||||
addPhrase(startE, endE, startF, endF, orientationInfo);
|
||||
}
|
||||
|
||||
if (m_options.isSingleWordHeuristicFlag()) {
|
||||
// add single word phrases that are not consistent with the word alignment
|
||||
m_sentence.invertAlignment();
|
||||
for (int ei=0; ei<countE; ei++) {
|
||||
for (size_t i=0; i<m_sentence.alignedToT[ei].size(); i++) {
|
||||
int fi = m_sentence.alignedToT[ei][i];
|
||||
if ((m_sentence.alignedToT[ei].size() > 1) || (m_sentence.alignedToS[fi].size() > 1)) {
|
||||
|
||||
if (m_options.isOrientationFlag()) {
|
||||
getOrientationInfo(ei, ei, fi, fi,
|
||||
inTopLeft, inTopRight, inBottomLeft, inBottomRight,
|
||||
outTopLeft, outTopRight, outBottomLeft, outBottomRight,
|
||||
orientationInfo);
|
||||
}
|
||||
|
||||
addPhrase(ei, ei, fi, fi, orientationInfo);
|
||||
}
|
||||
}
|
||||
if (m_options.isPhraseModel()) {
|
||||
phrasePrevOrient = getOrientPhraseModel(sentence, m_options.isPhraseType(),
|
||||
connectedLeftTopP, connectedRightTopP,
|
||||
startF, endF, startE, endE, countF-1, 0, 1, &ge, <, inBottomRight, inBottomLeft);
|
||||
phraseNextOrient = getOrientPhraseModel(sentence, m_options.isPhraseType(),
|
||||
connectedLeftTopN, connectedRightTopN,
|
||||
endF, startF, endE, startE, 0, countF-1, -1, <, &ge, inBottomLeft, inBottomRight);
|
||||
} else {
|
||||
phrasePrevOrient = phraseNextOrient = UNKNOWN;
|
||||
}
|
||||
if(m_options.isHierModel()) {
|
||||
hierPrevOrient = getOrientHierModel(sentence, m_options.isHierType(),
|
||||
connectedLeftTopP, connectedRightTopP,
|
||||
startF, endF, startE, endE, countF-1, 0, 1, &ge, <, inBottomRight, inBottomLeft, outBottomRight, outBottomLeft, phrasePrevOrient);
|
||||
hierNextOrient = getOrientHierModel(sentence, m_options.isHierType(),
|
||||
connectedLeftTopN, connectedRightTopN,
|
||||
endF, startF, endE, startE, 0, countF-1, -1, <, &ge, inBottomLeft, inBottomRight, outBottomLeft, outBottomRight, phraseNextOrient);
|
||||
}
|
||||
|
||||
orientationInfo = ((m_options.isWordModel())? getOrientString(wordPrevOrient, m_options.isWordType()) + " " + getOrientString(wordNextOrient, m_options.isWordType()) : "") + " | " +
|
||||
((m_options.isPhraseModel())? getOrientString(phrasePrevOrient, m_options.isPhraseType()) + " " + getOrientString(phraseNextOrient, m_options.isPhraseType()) : "") + " | " +
|
||||
((m_options.isHierModel())? getOrientString(hierPrevOrient, m_options.isHierType()) + " " + getOrientString(hierNextOrient, m_options.isHierType()) : "");
|
||||
|
||||
addPhrase(sentence, startE, endE, startF, endF, orientationInfo);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
REO_POS getOrientWordModel(SentenceAlignment & sentence, REO_MODEL_TYPE modelType,
|
||||
void ExtractTask::getOrientationInfo(int startE, int endE, int startF, int endF,
|
||||
const HSentenceVertices& inTopLeft,
|
||||
const HSentenceVertices& inTopRight,
|
||||
const HSentenceVertices& inBottomLeft,
|
||||
const HSentenceVertices& inBottomRight,
|
||||
const HSentenceVertices& outTopLeft,
|
||||
const HSentenceVertices& outTopRight,
|
||||
const HSentenceVertices& outBottomLeft,
|
||||
const HSentenceVertices& outBottomRight,
|
||||
std::string &orientationInfo) const
|
||||
{
|
||||
REO_POS wordPrevOrient=UNKNOWN, wordNextOrient=UNKNOWN;
|
||||
REO_POS phrasePrevOrient=UNKNOWN, phraseNextOrient=UNKNOWN;
|
||||
REO_POS hierPrevOrient=UNKNOWN, hierNextOrient=UNKNOWN;
|
||||
|
||||
bool connectedLeftTopP = isAligned( m_sentence, startF-1, startE-1 );
|
||||
bool connectedRightTopP = isAligned( m_sentence, endF+1, startE-1 );
|
||||
bool connectedLeftTopN = isAligned( m_sentence, endF+1, endE+1 );
|
||||
bool connectedRightTopN = isAligned( m_sentence, startF-1, endE+1 );
|
||||
|
||||
const int countF = m_sentence.source.size();
|
||||
|
||||
if (m_options.isWordModel()) {
|
||||
wordPrevOrient = getOrientWordModel(m_sentence, m_options.isWordType(),
|
||||
connectedLeftTopP, connectedRightTopP,
|
||||
startF, endF, startE, endE, countF, 0, 1,
|
||||
&ge, <);
|
||||
wordNextOrient = getOrientWordModel(m_sentence, m_options.isWordType(),
|
||||
connectedLeftTopN, connectedRightTopN,
|
||||
endF, startF, endE, startE, 0, countF, -1,
|
||||
<, &ge);
|
||||
}
|
||||
if (m_options.isPhraseModel()) {
|
||||
phrasePrevOrient = getOrientPhraseModel(m_sentence, m_options.isPhraseType(),
|
||||
connectedLeftTopP, connectedRightTopP,
|
||||
startF, endF, startE, endE, countF-1, 0, 1, &ge, <, inBottomRight, inBottomLeft);
|
||||
phraseNextOrient = getOrientPhraseModel(m_sentence, m_options.isPhraseType(),
|
||||
connectedLeftTopN, connectedRightTopN,
|
||||
endF, startF, endE, startE, 0, countF-1, -1, <, &ge, inBottomLeft, inBottomRight);
|
||||
}
|
||||
if (m_options.isHierModel()) {
|
||||
hierPrevOrient = getOrientHierModel(m_sentence, m_options.isHierType(),
|
||||
connectedLeftTopP, connectedRightTopP,
|
||||
startF, endF, startE, endE, countF-1, 0, 1, &ge, <, inBottomRight, inBottomLeft, outBottomRight, outBottomLeft, phrasePrevOrient);
|
||||
hierNextOrient = getOrientHierModel(m_sentence, m_options.isHierType(),
|
||||
connectedLeftTopN, connectedRightTopN,
|
||||
endF, startF, endE, startE, 0, countF-1, -1, <, &ge, inBottomLeft, inBottomRight, outBottomLeft, outBottomRight, phraseNextOrient);
|
||||
}
|
||||
|
||||
if (m_options.isWordModel()) {
|
||||
orientationInfo = getOrientString(wordPrevOrient, m_options.isWordType()) + " " + getOrientString(wordNextOrient, m_options.isWordType());
|
||||
} else {
|
||||
orientationInfo = " | " +
|
||||
((m_options.isPhraseModel())? getOrientString(phrasePrevOrient, m_options.isPhraseType()) + " " + getOrientString(phraseNextOrient, m_options.isPhraseType()) : "") + " | " +
|
||||
((m_options.isHierModel())? getOrientString(hierPrevOrient, m_options.isHierType()) + " " + getOrientString(hierNextOrient, m_options.isHierType()) : "");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
REO_POS getOrientWordModel(SentenceAlignmentWithSyntax & sentence, REO_MODEL_TYPE modelType,
|
||||
bool connectedLeftTop, bool connectedRightTop,
|
||||
int startF, int endF, int startE, int endE, int countF, int zero, int unit,
|
||||
bool (*ge)(int, int), bool (*lt)(int, int) )
|
||||
@ -536,7 +578,7 @@ REO_POS getOrientWordModel(SentenceAlignment & sentence, REO_MODEL_TYPE modelTyp
|
||||
}
|
||||
|
||||
// to be called with countF-1 instead of countF
|
||||
REO_POS getOrientPhraseModel (SentenceAlignment & sentence, REO_MODEL_TYPE modelType,
|
||||
REO_POS getOrientPhraseModel (SentenceAlignmentWithSyntax & sentence, REO_MODEL_TYPE modelType,
|
||||
bool connectedLeftTop, bool connectedRightTop,
|
||||
int startF, int endF, int startE, int endE, int countF, int zero, int unit,
|
||||
bool (*ge)(int, int), bool (*lt)(int, int),
|
||||
@ -572,7 +614,7 @@ REO_POS getOrientPhraseModel (SentenceAlignment & sentence, REO_MODEL_TYPE model
|
||||
}
|
||||
|
||||
// to be called with countF-1 instead of countF
|
||||
REO_POS getOrientHierModel (SentenceAlignment & sentence, REO_MODEL_TYPE modelType,
|
||||
REO_POS getOrientHierModel (SentenceAlignmentWithSyntax & sentence, REO_MODEL_TYPE modelType,
|
||||
bool connectedLeftTop, bool connectedRightTop,
|
||||
int startF, int endF, int startE, int endE, int countF, int zero, int unit,
|
||||
bool (*ge)(int, int), bool (*lt)(int, int),
|
||||
@ -624,7 +666,7 @@ REO_POS getOrientHierModel (SentenceAlignment & sentence, REO_MODEL_TYPE modelTy
|
||||
return UNKNOWN;
|
||||
}
|
||||
|
||||
bool isAligned ( SentenceAlignment &sentence, int fi, int ei )
|
||||
bool isAligned ( SentenceAlignmentWithSyntax &sentence, int fi, int ei )
|
||||
{
|
||||
if (ei == -1 && fi == -1)
|
||||
return true;
|
||||
@ -660,7 +702,7 @@ void insertVertex( HSentenceVertices & corners, int x, int y )
|
||||
set<int> tmp;
|
||||
tmp.insert(x);
|
||||
pair< HSentenceVertices::iterator, bool > ret = corners.insert( pair<int, set<int> > (y, tmp) );
|
||||
if(ret.second == false) {
|
||||
if (ret.second == false) {
|
||||
ret.first->second.insert(x);
|
||||
}
|
||||
}
|
||||
@ -711,41 +753,174 @@ string getOrientString(REO_POS orient, REO_MODEL_TYPE modelType)
|
||||
return "";
|
||||
}
|
||||
|
||||
void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, int endF , string &orientationInfo)
|
||||
|
||||
bool ExtractTask::checkTargetConstituentBoundaries(int startE, int endE, int startF, int endF,
|
||||
ostringstream &outextractstrPhraseProperties) const
|
||||
{
|
||||
// source
|
||||
// // cout << "adding ( " << startF << "-" << endF << ", " << startE << "-" << endE << ")\n";
|
||||
if (m_options.isTargetConstituentBoundariesFlag()) {
|
||||
outextractstrPhraseProperties << " {{TargetConstituentBoundariesLeft ";
|
||||
}
|
||||
|
||||
bool validTargetConstituentBoundaries = false;
|
||||
bool outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst = true;
|
||||
|
||||
if (m_options.isTargetConstituentBoundariesFlag()) {
|
||||
if (startE==0) {
|
||||
outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst = false;
|
||||
outextractstrPhraseProperties << "BOS_";
|
||||
}
|
||||
}
|
||||
|
||||
if (!m_sentence.targetTree.HasNodeStartingAtPosition(startE)) {
|
||||
|
||||
validTargetConstituentBoundaries = false;
|
||||
|
||||
} else {
|
||||
|
||||
const std::vector< SyntaxNode* >& startingNodes = m_sentence.targetTree.GetNodesByStartPosition(startE);
|
||||
for ( std::vector< SyntaxNode* >::const_reverse_iterator iter = startingNodes.rbegin(); iter != startingNodes.rend(); ++iter ) {
|
||||
if ( (*iter)->end == endE ) {
|
||||
validTargetConstituentBoundaries = true;
|
||||
if (!m_options.isTargetConstituentBoundariesFlag()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (m_options.isTargetConstituentBoundariesFlag()) {
|
||||
if (outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst) {
|
||||
outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst = false;
|
||||
} else {
|
||||
outextractstrPhraseProperties << "<";
|
||||
}
|
||||
outextractstrPhraseProperties << (*iter)->label;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (m_options.isTargetConstituentBoundariesFlag()) {
|
||||
if (outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst) {
|
||||
outextractstrPhraseProperties << "<";
|
||||
}
|
||||
outextractstrPhraseProperties << "}}";
|
||||
}
|
||||
|
||||
|
||||
if (m_options.isTargetConstituentConstrainedFlag() && !validTargetConstituentBoundaries) {
|
||||
// skip over all boundary punctuation and check again
|
||||
bool relaxedValidTargetConstituentBoundaries = false;
|
||||
int relaxedStartE = startE;
|
||||
int relaxedEndE = endE;
|
||||
const std::string punctuation = ",;.:!?";
|
||||
while ( (relaxedStartE < endE) &&
|
||||
(m_sentence.target[relaxedStartE].size() == 1) &&
|
||||
(punctuation.find(m_sentence.target[relaxedStartE].at(0)) != std::string::npos) ) {
|
||||
++relaxedStartE;
|
||||
}
|
||||
while ( (relaxedEndE > relaxedStartE) &&
|
||||
(m_sentence.target[relaxedEndE].size() == 1) &&
|
||||
(punctuation.find(m_sentence.target[relaxedEndE].at(0)) != std::string::npos) ) {
|
||||
--relaxedEndE;
|
||||
}
|
||||
|
||||
if ( (relaxedStartE != startE) || (relaxedEndE !=endE) ) {
|
||||
const std::vector< SyntaxNode* >& startingNodes = m_sentence.targetTree.GetNodesByStartPosition(relaxedStartE);
|
||||
for ( std::vector< SyntaxNode* >::const_reverse_iterator iter = startingNodes.rbegin();
|
||||
(iter != startingNodes.rend() && !relaxedValidTargetConstituentBoundaries);
|
||||
++iter ) {
|
||||
if ( (*iter)->end == relaxedEndE ) {
|
||||
relaxedValidTargetConstituentBoundaries = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!relaxedValidTargetConstituentBoundaries) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (m_options.isTargetConstituentBoundariesFlag()) {
|
||||
|
||||
outextractstrPhraseProperties << " {{TargetConstituentBoundariesRightAdjacent ";
|
||||
outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst = true;
|
||||
|
||||
if (endE==(int)m_sentence.target.size()-1) {
|
||||
|
||||
outextractstrPhraseProperties << "EOS_";
|
||||
outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst = false;
|
||||
|
||||
} else {
|
||||
|
||||
const std::vector< SyntaxNode* >& adjacentNodes = m_sentence.targetTree.GetNodesByStartPosition(endE+1);
|
||||
for ( std::vector< SyntaxNode* >::const_reverse_iterator iter = adjacentNodes.rbegin(); iter != adjacentNodes.rend(); ++iter ) {
|
||||
if (outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst) {
|
||||
outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst = false;
|
||||
} else {
|
||||
outextractstrPhraseProperties << "<";
|
||||
}
|
||||
outextractstrPhraseProperties << (*iter)->label;
|
||||
}
|
||||
}
|
||||
|
||||
if (outextractstrPhrasePropertyTargetConstituentBoundariesIsFirst) {
|
||||
outextractstrPhraseProperties << "<";
|
||||
}
|
||||
outextractstrPhraseProperties << "}}";
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
void ExtractTask::addPhrase( int startE, int endE, int startF, int endF,
|
||||
const std::string &orientationInfo)
|
||||
{
|
||||
ostringstream outextractstrPhraseProperties;
|
||||
if (m_options.isTargetConstituentBoundariesFlag() || m_options.isTargetConstituentConstrainedFlag()) {
|
||||
bool isTargetConstituentCovered = checkTargetConstituentBoundaries(startE, endE, startF, endF, outextractstrPhraseProperties);
|
||||
if (m_options.isTargetConstituentBoundariesFlag() && !isTargetConstituentCovered) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (m_options.placeholders.size() && !checkPlaceholders(startE, endE, startF, endF)) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (m_options.isOnlyOutputSpanInfo()) {
|
||||
cout << startF << " " << endF << " " << startE << " " << endE << std::endl;
|
||||
return;
|
||||
}
|
||||
|
||||
ostringstream outextractstr;
|
||||
ostringstream outextractstrInv;
|
||||
ostringstream outextractstrOrientation;
|
||||
|
||||
if (m_options.isOnlyOutputSpanInfo()) {
|
||||
cout << startF << " " << endF << " " << startE << " " << endE << endl;
|
||||
return;
|
||||
}
|
||||
|
||||
if (m_options.placeholders.size() && !checkPlaceholders(sentence, startE, endE, startF, endF)) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (m_options.debug) {
|
||||
outextractstr << "sentenceID=" << sentence.sentenceID << " ";
|
||||
outextractstrInv << "sentenceID=" << sentence.sentenceID << " ";
|
||||
outextractstrOrientation << "sentenceID=" << sentence.sentenceID << " ";
|
||||
outextractstr << "sentenceID=" << m_sentence.sentenceID << " ";
|
||||
outextractstrInv << "sentenceID=" << m_sentence.sentenceID << " ";
|
||||
outextractstrOrientation << "sentenceID=" << m_sentence.sentenceID << " ";
|
||||
}
|
||||
|
||||
// source
|
||||
for(int fi=startF; fi<=endF; fi++) {
|
||||
if (m_options.isTranslationFlag()) outextractstr << sentence.source[fi] << " ";
|
||||
if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.source[fi] << " ";
|
||||
if (m_options.isTranslationFlag()) outextractstr << m_sentence.source[fi] << " ";
|
||||
if (m_options.isOrientationFlag()) outextractstrOrientation << m_sentence.source[fi] << " ";
|
||||
}
|
||||
if (m_options.isTranslationFlag()) outextractstr << "||| ";
|
||||
if (m_options.isOrientationFlag()) outextractstrOrientation << "||| ";
|
||||
|
||||
|
||||
// target
|
||||
for(int ei=startE; ei<=endE; ei++) {
|
||||
if (m_options.isTranslationFlag()) outextractstr << sentence.target[ei] << " ";
|
||||
if (m_options.isTranslationFlag()) outextractstrInv << sentence.target[ei] << " ";
|
||||
if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.target[ei] << " ";
|
||||
|
||||
if (m_options.isTranslationFlag()) {
|
||||
outextractstr << m_sentence.target[ei] << " ";
|
||||
outextractstrInv << m_sentence.target[ei] << " ";
|
||||
}
|
||||
|
||||
if (m_options.isOrientationFlag()) {
|
||||
outextractstrOrientation << m_sentence.target[ei] << " ";
|
||||
}
|
||||
}
|
||||
if (m_options.isTranslationFlag()) outextractstr << "|||";
|
||||
if (m_options.isTranslationFlag()) outextractstrInv << "||| ";
|
||||
@ -755,17 +930,22 @@ void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE,
|
||||
|
||||
if (m_options.isTranslationFlag()) {
|
||||
for(int fi=startF; fi<=endF; fi++)
|
||||
outextractstrInv << sentence.source[fi] << " ";
|
||||
outextractstrInv << m_sentence.source[fi] << " ";
|
||||
outextractstrInv << "|||";
|
||||
}
|
||||
|
||||
// alignment
|
||||
if (m_options.isTranslationFlag()) {
|
||||
for(int ei=startE; ei<=endE; ei++) {
|
||||
for(unsigned int i=0; i<sentence.alignedToT[ei].size(); i++) {
|
||||
int fi = sentence.alignedToT[ei][i];
|
||||
outextractstr << " " << fi-startF << "-" << ei-startE;
|
||||
outextractstrInv << " " << ei-startE << "-" << fi-startF;
|
||||
if (m_options.isSingleWordHeuristicFlag() && (startE==endE) && (startF==endF)) {
|
||||
outextractstr << " 0-0";
|
||||
outextractstrInv << " 0-0";
|
||||
} else {
|
||||
for(int ei=startE; ei<=endE; ei++) {
|
||||
for(unsigned int i=0; i<m_sentence.alignedToT[ei].size(); i++) {
|
||||
int fi = m_sentence.alignedToT[ei][i];
|
||||
outextractstr << " " << fi-startF << "-" << ei-startE;
|
||||
outextractstrInv << " " << ei-startE << "-" << fi-startF;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -774,20 +954,20 @@ void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE,
|
||||
outextractstrOrientation << orientationInfo;
|
||||
|
||||
if (m_options.isIncludeSentenceIdFlag()) {
|
||||
outextractstr << " ||| " << sentence.sentenceID;
|
||||
outextractstr << " ||| " << m_sentence.sentenceID;
|
||||
}
|
||||
|
||||
if (m_options.getInstanceWeightsFile().length()) {
|
||||
if (m_options.isTranslationFlag()) {
|
||||
outextractstr << " ||| " << sentence.weightString;
|
||||
outextractstrInv << " ||| " << sentence.weightString;
|
||||
outextractstr << " ||| " << m_sentence.weightString;
|
||||
outextractstrInv << " ||| " << m_sentence.weightString;
|
||||
}
|
||||
if (m_options.isOrientationFlag()) {
|
||||
outextractstrOrientation << " ||| " << sentence.weightString;
|
||||
outextractstrOrientation << " ||| " << m_sentence.weightString;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
outextractstr << outextractstrPhraseProperties.str();
|
||||
|
||||
// generate two lines for every extracted phrase:
|
||||
// once with left, once with right context
|
||||
@ -797,20 +977,20 @@ void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE,
|
||||
ostringstream outextractstrContextInv;
|
||||
|
||||
for(int fi=startF; fi<=endF; fi++) {
|
||||
outextractstrContext << sentence.source[fi] << " ";
|
||||
outextractstrContext << m_sentence.source[fi] << " ";
|
||||
}
|
||||
outextractstrContext << "||| ";
|
||||
|
||||
// target
|
||||
for(int ei=startE; ei<=endE; ei++) {
|
||||
outextractstrContext << sentence.target[ei] << " ";
|
||||
outextractstrContextInv << sentence.target[ei] << " ";
|
||||
outextractstrContext << m_sentence.target[ei] << " ";
|
||||
outextractstrContextInv << m_sentence.target[ei] << " ";
|
||||
}
|
||||
outextractstrContext << "||| ";
|
||||
outextractstrContextInv << "||| ";
|
||||
|
||||
for(int fi=startF; fi<=endF; fi++)
|
||||
outextractstrContextInv << sentence.source[fi] << " ";
|
||||
outextractstrContextInv << m_sentence.source[fi] << " ";
|
||||
|
||||
outextractstrContextInv << "|||";
|
||||
|
||||
@ -823,25 +1003,25 @@ void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE,
|
||||
// write context to left
|
||||
outextractstrContext << "< ";
|
||||
if (startF == 0) outextractstrContext << "<s>";
|
||||
else outextractstrContext << sentence.source[startF-1];
|
||||
else outextractstrContext << m_sentence.source[startF-1];
|
||||
|
||||
outextractstrContextInv << " < ";
|
||||
if (startE == 0) outextractstrContextInv << "<s>";
|
||||
else outextractstrContextInv << sentence.target[startE-1];
|
||||
else outextractstrContextInv << m_sentence.target[startE-1];
|
||||
|
||||
// write context to right
|
||||
outextractstrContextRight << "> ";
|
||||
if (endF+1 == sentence.source.size()) outextractstrContextRight << "<s>";
|
||||
else outextractstrContextRight << sentence.source[endF+1];
|
||||
if (endF+1 == (int)m_sentence.source.size()) outextractstrContextRight << "<s>";
|
||||
else outextractstrContextRight << m_sentence.source[endF+1];
|
||||
|
||||
outextractstrContextRightInv << " > ";
|
||||
if (endE+1 == sentence.target.size()) outextractstrContextRightInv << "<s>";
|
||||
else outextractstrContextRightInv << sentence.target[endE+1];
|
||||
if (endE+1 == (int)m_sentence.target.size()) outextractstrContextRightInv << "<s>";
|
||||
else outextractstrContextRightInv << m_sentence.target[endE+1];
|
||||
|
||||
outextractstrContext << "\n";
|
||||
outextractstrContextInv << "\n";
|
||||
outextractstrContextRight << "\n";
|
||||
outextractstrContextRightInv << "\n";
|
||||
outextractstrContext << std::endl;
|
||||
outextractstrContextInv << std::endl;
|
||||
outextractstrContextRight << std::endl;
|
||||
outextractstrContextRightInv << std::endl;
|
||||
|
||||
m_extractedPhrasesContext.push_back(outextractstrContext.str());
|
||||
m_extractedPhrasesContextInv.push_back(outextractstrContextInv.str());
|
||||
@ -849,9 +1029,9 @@ void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE,
|
||||
m_extractedPhrasesContextInv.push_back(outextractstrContextRightInv.str());
|
||||
}
|
||||
|
||||
if (m_options.isTranslationFlag()) outextractstr << "\n";
|
||||
if (m_options.isTranslationFlag()) outextractstrInv << "\n";
|
||||
if (m_options.isOrientationFlag()) outextractstrOrientation << "\n";
|
||||
if (m_options.isTranslationFlag()) outextractstr << std::endl;
|
||||
if (m_options.isTranslationFlag()) outextractstrInv << std::endl;
|
||||
if (m_options.isOrientationFlag()) outextractstrOrientation << std::endl;
|
||||
|
||||
|
||||
m_extractedPhrases.push_back(outextractstr.str());
|
||||
@ -896,30 +1076,30 @@ void ExtractTask::writePhrasesToFile()
|
||||
|
||||
// if proper conditioning, we need the number of times a source phrase occured
|
||||
|
||||
void ExtractTask::extractBase( SentenceAlignment &sentence )
|
||||
void ExtractTask::extractBase()
|
||||
{
|
||||
ostringstream outextractFile;
|
||||
ostringstream outextractFileInv;
|
||||
|
||||
int countF = sentence.source.size();
|
||||
int countF = m_sentence.source.size();
|
||||
for(int startF=0; startF<countF; startF++) {
|
||||
for(int endF=startF;
|
||||
(endF<countF && endF<startF+m_options.maxPhraseLength);
|
||||
endF++) {
|
||||
for(int fi=startF; fi<=endF; fi++) {
|
||||
outextractFile << sentence.source[fi] << " ";
|
||||
outextractFile << m_sentence.source[fi] << " ";
|
||||
}
|
||||
outextractFile << "|||" << endl;
|
||||
}
|
||||
}
|
||||
|
||||
int countE = sentence.target.size();
|
||||
int countE = m_sentence.target.size();
|
||||
for(int startE=0; startE<countE; startE++) {
|
||||
for(int endE=startE;
|
||||
(endE<countE && endE<startE+m_options.maxPhraseLength);
|
||||
endE++) {
|
||||
for(int ei=startE; ei<=endE; ei++) {
|
||||
outextractFileInv << sentence.target[ei] << " ";
|
||||
outextractFileInv << m_sentence.target[ei] << " ";
|
||||
}
|
||||
outextractFileInv << "|||" << endl;
|
||||
}
|
||||
@ -930,17 +1110,17 @@ void ExtractTask::extractBase( SentenceAlignment &sentence )
|
||||
}
|
||||
|
||||
|
||||
bool ExtractTask::checkPlaceholders (const SentenceAlignment &sentence, int startE, int endE, int startF, int endF)
|
||||
bool ExtractTask::checkPlaceholders(int startE, int endE, int startF, int endF) const
|
||||
{
|
||||
for (size_t pos = startF; pos <= endF; ++pos) {
|
||||
const string &sourceWord = sentence.source[pos];
|
||||
for (int pos = startF; pos <= endF; ++pos) {
|
||||
const string &sourceWord = m_sentence.source[pos];
|
||||
if (isPlaceholder(sourceWord)) {
|
||||
if (sentence.alignedToS.at(pos).size() != 1) {
|
||||
if (m_sentence.alignedToS.at(pos).size() != 1) {
|
||||
return false;
|
||||
} else {
|
||||
// check it actually lines up to another placeholder
|
||||
int targetPos = sentence.alignedToS.at(pos).at(0);
|
||||
const string &otherWord = sentence.target[targetPos];
|
||||
int targetPos = m_sentence.alignedToS.at(pos).at(0);
|
||||
const string &otherWord = m_sentence.target[targetPos];
|
||||
if (!isPlaceholder(otherWord)) {
|
||||
return false;
|
||||
}
|
||||
@ -948,15 +1128,15 @@ bool ExtractTask::checkPlaceholders (const SentenceAlignment &sentence, int star
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t pos = startE; pos <= endE; ++pos) {
|
||||
const string &targetWord = sentence.target[pos];
|
||||
for (int pos = startE; pos <= endE; ++pos) {
|
||||
const string &targetWord = m_sentence.target[pos];
|
||||
if (isPlaceholder(targetWord)) {
|
||||
if (sentence.alignedToT.at(pos).size() != 1) {
|
||||
if (m_sentence.alignedToT.at(pos).size() != 1) {
|
||||
return false;
|
||||
} else {
|
||||
// check it actually lines up to another placeholder
|
||||
int sourcePos = sentence.alignedToT.at(pos).at(0);
|
||||
const string &otherWord = sentence.source[sourcePos];
|
||||
int sourcePos = m_sentence.alignedToT.at(pos).at(0);
|
||||
const string &otherWord = m_sentence.source[sourcePos];
|
||||
if (!isPlaceholder(otherWord)) {
|
||||
return false;
|
||||
}
|
||||
@ -966,7 +1146,7 @@ bool ExtractTask::checkPlaceholders (const SentenceAlignment &sentence, int star
|
||||
return true;
|
||||
}
|
||||
|
||||
bool ExtractTask::isPlaceholder(const string &word)
|
||||
bool ExtractTask::isPlaceholder(const string &word) const
|
||||
{
|
||||
for (size_t i = 0; i < m_options.placeholders.size(); ++i) {
|
||||
const string &placeholder = m_options.placeholders[i];
|
||||
@ -976,28 +1156,5 @@ bool ExtractTask::isPlaceholder(const string &word)
|
||||
}
|
||||
return false;
|
||||
}
|
||||
/** tokenise input string to vector of string. each element has been separated by a character in the delimiters argument.
|
||||
The separator can only be 1 character long. The default delimiters are space or tab
|
||||
*/
|
||||
std::vector<std::string> Tokenize(const std::string& str,
|
||||
const std::string& delimiters)
|
||||
{
|
||||
std::vector<std::string> tokens;
|
||||
// Skip delimiters at beginning.
|
||||
std::string::size_type lastPos = str.find_first_not_of(delimiters, 0);
|
||||
// Find first "non-delimiter".
|
||||
std::string::size_type pos = str.find_first_of(delimiters, lastPos);
|
||||
|
||||
while (std::string::npos != pos || std::string::npos != lastPos) {
|
||||
// Found a token, add it to the vector.
|
||||
tokens.push_back(str.substr(lastPos, pos - lastPos));
|
||||
// Skip delimiters. Note the "not_of"
|
||||
lastPos = str.find_first_not_of(delimiters, pos);
|
||||
// Find next "non-delimiter"
|
||||
pos = str.find_first_of(delimiters, lastPos);
|
||||
}
|
||||
|
||||
return tokens;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -347,7 +347,8 @@ int main(int argc, char* argv[])
|
||||
|
||||
SentenceAlignmentWithSyntax sentence
|
||||
(targetLabelCollection, sourceLabelCollection,
|
||||
targetTopLabelCollection, sourceTopLabelCollection, options);
|
||||
targetTopLabelCollection, sourceTopLabelCollection,
|
||||
options.targetSyntax, options.sourceSyntax);
|
||||
//az: output src, tgt, and alingment line
|
||||
if (options.onlyOutputSpanInfo) {
|
||||
cout << "LOG: SRC: " << sourceString << endl;
|
||||
|
@ -68,6 +68,7 @@ bool spanLength = false;
|
||||
bool ruleLength = false;
|
||||
bool nonTermContext = false;
|
||||
bool nonTermContextTarget = false;
|
||||
bool targetConstituentBoundariesFlag = false;
|
||||
|
||||
int countOfCounts[COC_MAX+1];
|
||||
int totalDistinct = 0;
|
||||
@ -286,6 +287,9 @@ int main(int argc, char* argv[])
|
||||
} else if (strcmp(argv[i],"--NonTermContextTarget") == 0) {
|
||||
nonTermContextTarget = true;
|
||||
std::cerr << "non-term context (target)" << std::endl;
|
||||
} else if (strcmp(argv[i],"--TargetConstituentBoundaries") == 0) {
|
||||
targetConstituentBoundariesFlag = true;
|
||||
std::cerr << "including target constituent boundaries information" << std::endl;
|
||||
} else {
|
||||
featureArgs.push_back(argv[i]);
|
||||
++i;
|
||||
@ -957,6 +961,18 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
|
||||
}
|
||||
}
|
||||
|
||||
// target constituent boundaries
|
||||
if (targetConstituentBoundariesFlag && !inverseFlag) {
|
||||
const std::string targetConstituentBoundariesLeftValues = phrasePair.CollectAllPropertyValues("TargetConstituentBoundariesLeft");
|
||||
if (!targetConstituentBoundariesLeftValues.empty()) {
|
||||
phraseTableFile << " {{TargetConstituentBoundariesLeft " << targetConstituentBoundariesLeftValues << "}}";
|
||||
}
|
||||
const std::string targetConstituentBoundariesRightAdjacentValues = phrasePair.CollectAllPropertyValues("TargetConstituentBoundariesRightAdjacent");
|
||||
if (!targetConstituentBoundariesRightAdjacentValues.empty()) {
|
||||
phraseTableFile << " {{TargetConstituentBoundariesRightAdjacent " << targetConstituentBoundariesRightAdjacentValues << "}}";
|
||||
}
|
||||
}
|
||||
|
||||
phraseTableFile << std::endl;
|
||||
}
|
||||
|
||||
|
@ -53,18 +53,18 @@ git submodule update regtest
|
||||
# -- compile from scratch with server, run regtests
|
||||
set -x
|
||||
if [ "$full" == true ] ; then
|
||||
./bjam -j$j --with-irstlm=$irstlm --with-boost=$boost --with-cmph=$cmph --no-xmlrpc-c --with-regtest=$regtest -a $skipcompact $@ $q || exit $?
|
||||
./bjam -j$j --with-mm --with-mm-extras --with-irstlm=$irstlm --with-boost=$boost --with-cmph=$cmph --no-xmlrpc-c --with-regtest=$regtest -a $skipcompact $@ $q || exit $?
|
||||
if ./regression-testing/run-single-test.perl --server --startuptest ; then
|
||||
./bjam -j$j --with-irstlm=$irstlm --with-boost=$boost --with-cmph=$cmph $xmlrpc --with-regtest=$regtest -a $skipcompact $@ $q
|
||||
./bjam -j$j --with-mm --with-mm-extras --with-irstlm=$irstlm --with-boost=$boost --with-cmph=$cmph $xmlrpc --with-regtest=$regtest -a $skipcompact $@ $q
|
||||
fi
|
||||
else
|
||||
# when investigating failures, always run single-threaded
|
||||
if [ "$q" == "-q" ] ; then j=1; fi
|
||||
|
||||
if ./regression-testing/run-single-test.perl --server --startuptest ; then
|
||||
./bjam -j$j $q $a --with-irstlm=$irstlm --with-boost=$boost --with-cmph=$cmph $xmlrpc --with-regtest=$regtest $skipcompact $@
|
||||
./bjam -j$j --with-mm $q $a --with-irstlm=$irstlm --with-boost=$boost --with-cmph=$cmph $xmlrpc --with-regtest=$regtest $skipcompact $@
|
||||
else
|
||||
./bjam -j$j $q $a --with-irstlm=$irstlm --with-boost=$boost --with-cmph=$cmph --no-xmlrpc-c --with-regtest=$regtest $skipcompact $@
|
||||
./bjam -j$j --with-mm --with-mm-extras $q $a --with-irstlm=$irstlm --with-boost=$boost --with-cmph=$cmph --no-xmlrpc-c --with-regtest=$regtest $skipcompact $@
|
||||
fi
|
||||
fi
|
||||
|
||||
|
@ -811,7 +811,8 @@ generation-prune
|
||||
in: generation-table
|
||||
out: generation-table-pruned
|
||||
rerun-on-change: TRAINING:prune-generation
|
||||
ignore-unless: AND TRAINING:prune-generation
|
||||
pass-unless: TRAINING:prune-generation
|
||||
ignore-unless: generation-factors
|
||||
default-name: model/generation-table-pruned
|
||||
final-model: yes
|
||||
template: $TRAINING:prune-generation IN OUT
|
||||
|
@ -384,11 +384,11 @@ sub read_config {
|
||||
$resolve = 0;
|
||||
foreach my $parameter (keys %CONFIG) {
|
||||
foreach (@{$CONFIG{$parameter}}) {
|
||||
next unless /\$/;
|
||||
next unless /\$[a-z\{]/i;
|
||||
my $escaped = 0;
|
||||
die ("BAD USE OF \$ IN VALUE used in parameter $parameter")
|
||||
if ! ( /^(.*)\$([a-z\-\:\d]+)(.*)$/i ||
|
||||
(/^(.*)\$\{([a-z\-\:\d]+)\}(.*)$/i && ($escaped = 1)));
|
||||
if ! ( /^(.*)\$([a-z][a-z\-\:\d]*)(.*)$/i ||
|
||||
(/^(.*)\$\{([a-z][a-z\-\:\d]*)\}(.*)$/i && ($escaped = 1)));
|
||||
my ($pre,$substitution,$post) = ($1,$2,$3);
|
||||
my $pattern = $substitution;
|
||||
if ($substitution !~ /\:/) { # handle local variables
|
||||
@ -1800,6 +1800,10 @@ sub define_lm_train_bilingual_lm {
|
||||
my $epochs = &get_bilingual_lm_epochs($set);
|
||||
$cmd .= " -e $epochs" if defined($epochs);
|
||||
|
||||
my $nnjm_settings = backoff_and_get("LM:$set:nnjm-settings");
|
||||
$cmd .= " ";
|
||||
$cmd .= $nnjm_settings;
|
||||
|
||||
my $nplm_settings = backoff_and_get("LM:$set:nplm-settings");
|
||||
$cmd .= " --extra-settings \"$nplm_settings\"" if defined($nplm_settings);
|
||||
|
||||
@ -2403,6 +2407,12 @@ sub define_training_extract_phrases {
|
||||
if (&get("TRAINING:ghkm-strip-bitpar-nonterminal-labels")) {
|
||||
$cmd .= "-ghkm-strip-bitpar-nonterminal-labels ";
|
||||
}
|
||||
|
||||
} else { # !hierarchical-rule-set
|
||||
|
||||
if (&get("TRAINING:target-constituent-boundaries")) {
|
||||
$cmd .= "-target-constituent-boundaries ";
|
||||
}
|
||||
}
|
||||
|
||||
my $extract_settings = &get("TRAINING:extract-settings");
|
||||
@ -2460,6 +2470,12 @@ sub define_training_build_ttable {
|
||||
my $parts_of_speech_labels_file = &versionize(&long_file_name("parts-of-speech","model",""));
|
||||
$cmd .= "-ghkm-parts-of-speech-file $parts_of_speech_labels_file ";
|
||||
}
|
||||
|
||||
} else { # !hierarchical-rule-set
|
||||
|
||||
if (&get("TRAINING:target-constituent-boundaries")) {
|
||||
$cmd .= "-target-constituent-boundaries ";
|
||||
}
|
||||
}
|
||||
|
||||
&create_step($step_id,$cmd);
|
||||
@ -2674,6 +2690,10 @@ sub define_training_create_config {
|
||||
$cmd .= "-ghkm-parts-of-speech-file $parts_of_speech_labels_file ";
|
||||
}
|
||||
|
||||
if (&get("TRAINING:target-constituent-boundaries")) {
|
||||
$cmd .= "-target-constituent-boundaries ";
|
||||
}
|
||||
|
||||
# sparse lexical features provide additional content for config file
|
||||
my @additional_ini_files;
|
||||
push (@additional_ini_files, "$sparse_lexical_features.ini") if $sparse_lexical_features;
|
||||
@ -3601,8 +3621,8 @@ sub define_template {
|
||||
print "\tcmd is $cmd\n" if $VERBOSE;
|
||||
|
||||
# replace variables
|
||||
while ($cmd =~ /^([\S\s]*)\$(\??)\{([^\s\/\"\']+)\}([\S\s]*)$/ ||
|
||||
$cmd =~ /^([\S\s]*)\$(\??)([^\s\/\"\']+)([\S\s]*)$/) {
|
||||
while ($cmd =~ /^([\S\s]*)\$(\??)\{([a-z][^\s\/\"\']*)\}([\S\s]*)$/i ||
|
||||
$cmd =~ /^([\S\s]*)\$(\??)([a-z][^\s\/\"\']*)([\S\s]*)$/i) {
|
||||
my ($pre,$optional,$variable,$post) = ($1,$2,$3,$4);
|
||||
my $value;
|
||||
if ($optional eq '?') {
|
||||
@ -3616,7 +3636,8 @@ sub define_template {
|
||||
}
|
||||
|
||||
# deal with pipelined commands
|
||||
$cmd =~ s/\|(.*)(\<\s*\S+) /$2 \| $1 /g;
|
||||
$cmd =~ s/\|(.*[^\\])(\<\s*\S+) /$2 \| $1 /g;
|
||||
$cmd =~ s/\\\</\</g;
|
||||
|
||||
# deal with gzipped input
|
||||
my $c = "";
|
||||
|
@ -782,7 +782,8 @@ sub hs_scan_line {
|
||||
if ($line =~ /^Trans Opt/) {
|
||||
# Old format
|
||||
$line =~ /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>(.+) :([\(\),\d\- ]*): pC=[\d\.\-e]+, c=/ ||
|
||||
$line =~ /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>\S+ \-\> (.+) :([\(\),\d\- ]*): c=/ || return 0;
|
||||
$line =~ /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>\S+ \-\> (.+) :([\(\),\d\- ]*): c=/ ||
|
||||
$line =~ /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>\S+ \-\> (.+) :([\(\),\d\- ]*): term=.*: nonterm=.*: c=/ || return 0;
|
||||
my ($sentence,$start,$end,$spans,$rule_lhs,$rule_rhs,$alignment) = ($1,$2,$3,$4,$5,$6,$7);
|
||||
|
||||
${$ref_sentence} = $sentence;
|
||||
@ -1202,7 +1203,8 @@ sub process_search_graph {
|
||||
if (/^(\d+) (\d+)\-?\>?(\S*) (\S+) =\> (.+) :(.*): pC=([\de\-\.]+), c=([\de\-\.]+) \[(\d+)\.\.(\d+)\] (.*)\[total=([\d\-\.]+)\] \<\</) {
|
||||
($sentence,$id,$recomb,$lhs,$output,$alignment,$rule_score,$heuristic_rule_score,$from,$to,$children,$hyp_score) = ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12);
|
||||
}
|
||||
elsif (/^(\d+) (\d+)\-?\>?(\S*) (\S+) =\> (.+) :(.*): c=([\de\-\.]+) \[(\d+)\.\.(\d+)\] (.*)\[total=([\d\-\.]+)\] core/) {
|
||||
elsif (/^(\d+) (\d+)\-?\>?(\S*) (\S+) =\> (.+) :(.*): c=([\de\-\.]+) \[(\d+)\.\.(\d+)\] (.*)\[total=([\de\-\.]+)\] core/ ||
|
||||
/^(\d+) (\d+)\-?\>?(\S*) (\S+) =\> (.+) :(.*): c=([\de\-\.]+) core=\(.*\) \[(\d+)\.\.(\d+)\] (.*)\[total=([\de\-\.]+)\] core/) {
|
||||
($sentence,$id,$recomb,$lhs,$output,$alignment,$rule_score,$from,$to,$children,$hyp_score) = ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12);
|
||||
$heuristic_rule_score = $rule_score; # hmmmm....
|
||||
}
|
||||
|
@ -472,6 +472,7 @@ def compose_score_command(extract_file, context_file, half_file,
|
||||
command += [
|
||||
'&&',
|
||||
find_first_executable(['bzcat']),
|
||||
half_file,
|
||||
'|',
|
||||
quote(args.flexibility_score),
|
||||
quote(context_file),
|
||||
|
79
scripts/nbest-rescore/README.md
Normal file
79
scripts/nbest-rescore/README.md
Normal file
@ -0,0 +1,79 @@
|
||||
# N-best List Re-Scorer
|
||||
|
||||
Written by Michael Denkowski
|
||||
|
||||
These scripts simplify running N-best re-ranking experiments with Moses. You
|
||||
can score N-best lists with external tools (such as models that would be very
|
||||
costly to integrate with Moses just for feasibility experiments), then use the
|
||||
extended feature set to select translations that may be of a higher quality than
|
||||
those preferred by the Moses features alone. In some cases, training a
|
||||
re-ranker even without any new features can yield improvement.
|
||||
|
||||
### Training
|
||||
|
||||
* Use Moses to generate large N-best lists for a dev set. Use a config file
|
||||
(moses.ini) that has been optimized with MERT, MIRA, or similar:
|
||||
|
||||
```
|
||||
cat dev-src.txt |moses -f moses.ini -n-best-list dev.best1000.out 1000 distinct
|
||||
```
|
||||
|
||||
* (Optionally) add new feature scores to the N-best list using any external
|
||||
tools. Make sure the features are added to the correct field using the correct
|
||||
format. You don't need to update the final scores (right now your new features
|
||||
have zero weight):
|
||||
|
||||
```
|
||||
0 ||| some translation ||| Feature0= -1.75645 Feature1= -1.38629 -2.19722 -2.31428 -0.81093 AwesomeNewFeature= -1.38629 ||| -4.42063
|
||||
```
|
||||
|
||||
* Run the optimizer (currently K-best MIRA) to learn new re-ranking weights for
|
||||
all features in your N-best list. Supply the reference translation for the dev
|
||||
set:
|
||||
|
||||
```
|
||||
python train.py --nbest dev.best1000.with-new-features --ref dev-ref.txt --working-dir rescore-work
|
||||
```
|
||||
|
||||
* You now have a new config file that contains N-best re-scoring weights:
|
||||
|
||||
```
|
||||
rescore-work/rescore.ini
|
||||
```
|
||||
|
||||
### Test
|
||||
|
||||
* Use the **original** config file to generate N-best lists for the test set:
|
||||
|
||||
```
|
||||
cat test-src.txt |moses -f moses.ini -n-best-list test.best1000.out 100 distinct
|
||||
```
|
||||
|
||||
* Add any new features you added for training
|
||||
|
||||
* Re-score the N-best list (update total scores) using the **re-scoring**
|
||||
weights file:
|
||||
|
||||
```
|
||||
python rescore.py rescore-work/rescore.ini <test.best1000.with-new-features >test.best1000.rescored
|
||||
```
|
||||
|
||||
* The N-best list is **not** re-sorted, so the entries will be out of order.
|
||||
Use the top-best script to extract the highest scoring entry for each sentence:
|
||||
|
||||
```
|
||||
python topbest.py <test.best1000.rescored >test.topbest
|
||||
```
|
||||
|
||||
### Not implemented yet
|
||||
|
||||
The following could be relatively easily implemented by replicating the
|
||||
behavior of mert-moses.pl:
|
||||
|
||||
* Sparse features (sparse weight file)
|
||||
|
||||
* Other optimizers (MERT, PRO, etc.)
|
||||
|
||||
* Other objective functions (TER, Meteor, etc.)
|
||||
|
||||
* Multiple reference translations
|
56
scripts/nbest-rescore/rescore.py
Executable file
56
scripts/nbest-rescore/rescore.py
Executable file
@ -0,0 +1,56 @@
|
||||
#!/usr/bin/env python
|
||||
#
|
||||
# This file is part of moses. Its use is licensed under the GNU Lesser General
|
||||
# Public License version 2.1 or, at your option, any later version.
|
||||
|
||||
import sys
|
||||
|
||||
FEAT_FIELD = 2
|
||||
SCORE_FIELD = 3
|
||||
|
||||
def main():
|
||||
|
||||
if len(sys.argv[1:]) != 1:
|
||||
sys.stderr.write('Usage: {} moses.ini <nbest.with-new-features >nbest.rescored\n'.format(sys.argv[0]))
|
||||
sys.stderr.write('Entries are _not_ re-sorted based on new score. Use topbest.py\n')
|
||||
sys.exit(2)
|
||||
|
||||
weights = {}
|
||||
|
||||
# moses.ini
|
||||
ini = open(sys.argv[1])
|
||||
while True:
|
||||
line = ini.readline()
|
||||
if not line:
|
||||
sys.stderr.write('Error: no [weight] section\n')
|
||||
sys.exit(1)
|
||||
if line.strip() == '[weight]':
|
||||
break
|
||||
while True:
|
||||
line = ini.readline()
|
||||
if not line or line.strip().startswith('['):
|
||||
break
|
||||
if line.strip() == '':
|
||||
continue
|
||||
fields = line.split()
|
||||
weights[fields[0]] = [float(f) for f in fields[1:]]
|
||||
|
||||
# N-best
|
||||
for line in sys.stdin:
|
||||
fields = [f.strip() for f in line.split('|||')]
|
||||
feats = fields[FEAT_FIELD].split()
|
||||
key = ''
|
||||
i = 0
|
||||
score = 0
|
||||
for f in feats:
|
||||
if f.endswith('='):
|
||||
key = f
|
||||
i = 0
|
||||
else:
|
||||
score += (float(f) * weights[key][i])
|
||||
i += 1
|
||||
fields[SCORE_FIELD] = str(score)
|
||||
sys.stdout.write('{}\n'.format(' ||| '.join(fields)))
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
30
scripts/nbest-rescore/topbest.py
Executable file
30
scripts/nbest-rescore/topbest.py
Executable file
@ -0,0 +1,30 @@
|
||||
#!/usr/bin/env python
|
||||
#
|
||||
# This file is part of moses. Its use is licensed under the GNU Lesser General
|
||||
# Public License version 2.1 or, at your option, any later version.
|
||||
|
||||
import sys
|
||||
|
||||
SCORE_FIELD = 3
|
||||
|
||||
def main():
|
||||
|
||||
i = ''
|
||||
hyp = ''
|
||||
top = 0
|
||||
|
||||
for line in sys.stdin:
|
||||
fields = [f.strip() for f in line.split('|||')]
|
||||
id = fields[0]
|
||||
if i != id:
|
||||
if i:
|
||||
sys.stdout.write('{}\n'.format(hyp))
|
||||
score = float(fields[SCORE_FIELD])
|
||||
if score > top or i != id:
|
||||
i = id
|
||||
hyp = fields[1]
|
||||
top = score
|
||||
sys.stdout.write('{}\n'.format(hyp))
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
116
scripts/nbest-rescore/train.py
Executable file
116
scripts/nbest-rescore/train.py
Executable file
@ -0,0 +1,116 @@
|
||||
#!/usr/bin/env python
|
||||
#
|
||||
# This file is part of moses. Its use is licensed under the GNU Lesser General
|
||||
# Public License version 2.1 or, at your option, any later version.
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
# Feature field in N-best format
|
||||
FEAT_FIELD = 2
|
||||
|
||||
# Location of mert, kbmira, etc. in relation to this script
|
||||
BIN_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), 'bin')
|
||||
|
||||
def main():
|
||||
|
||||
# Args
|
||||
parser = argparse.ArgumentParser(description='Learn N-best rescoring weights')
|
||||
parser.add_argument('--nbest', metavar='nbest', \
|
||||
help='Dev set N-best list augmented with new features', required=True)
|
||||
parser.add_argument('--ref', metavar='ref', \
|
||||
help='Dev set reference translation', required=True)
|
||||
parser.add_argument('--working-dir', metavar='rescore-work', \
|
||||
help='Optimizer working directory', required=True)
|
||||
parser.add_argument('--bin-dir', metavar='DIR', \
|
||||
help='Moses bin dir, containing kbmira, evaluator, etc.', default=BIN_DIR)
|
||||
# Since we're starting with uniform weights and only running kbmira once,
|
||||
# run a gratuitous number of iterations. (mert-moses.pl default is 60
|
||||
# iterations for each Moses run)
|
||||
parser.add_argument('--iterations', metavar='N', type=int, \
|
||||
help='Number of K-best MIRA iterations to run (default: 300)', default=300)
|
||||
args = parser.parse_args()
|
||||
|
||||
# Find executables
|
||||
extractor = os.path.join(args.bin_dir, 'extractor')
|
||||
kbmira = os.path.join(args.bin_dir, 'kbmira')
|
||||
for exe in (extractor, kbmira):
|
||||
if not os.path.exists(exe):
|
||||
sys.stderr.write('Error: cannot find executable "{}" in "{}", please specify --bin-dir\n'.format(exe, args.bin_dir))
|
||||
sys.exit(1)
|
||||
|
||||
# rescore-work dir
|
||||
if not os.path.exists(args.working_dir):
|
||||
os.mkdir(args.working_dir)
|
||||
|
||||
# Feature names and numbers of weights from N-best list
|
||||
# Assume all features are dense (present for each entry)
|
||||
init_weights = []
|
||||
fields = [f.strip() for f in open(args.nbest).readline().split('|||')]
|
||||
feats = fields[FEAT_FIELD].split()
|
||||
for i in range(len(feats)):
|
||||
if feats[i].endswith('='):
|
||||
n_weights = 0
|
||||
j = i + 1
|
||||
while j < len(feats):
|
||||
if feats[j].endswith('='):
|
||||
break
|
||||
n_weights += 1
|
||||
j += 1
|
||||
# Start all weights at 0
|
||||
init_weights.append([feats[i], [0] * n_weights])
|
||||
|
||||
# Extract score and feature data from N-best list
|
||||
extractor_cmd = [extractor, \
|
||||
'--sctype', 'BLEU', '--scconfig', 'case:true', \
|
||||
'--scfile', os.path.join(args.working_dir, 'scores.dat'), \
|
||||
'--ffile', os.path.join(args.working_dir, 'features.dat'), \
|
||||
'-r', args.ref, \
|
||||
'-n', args.nbest]
|
||||
subprocess.call(extractor_cmd)
|
||||
|
||||
# Write dense feature list
|
||||
with open(os.path.join(args.working_dir, 'init.dense'), 'w') as out:
|
||||
for (feat, weights) in init_weights:
|
||||
for w in weights:
|
||||
out.write('{} {}\n'.format(feat, w))
|
||||
|
||||
# Run K-best MIRA optimizer
|
||||
kbmira_cmd = [kbmira, \
|
||||
'--dense-init', os.path.join(args.working_dir, 'init.dense'), \
|
||||
'--ffile', os.path.join(args.working_dir, 'features.dat'), \
|
||||
'--scfile', os.path.join(args.working_dir, 'scores.dat'), \
|
||||
'-o', os.path.join(args.working_dir, 'mert.out'), \
|
||||
'--iters', str(args.iterations)]
|
||||
subprocess.call(kbmira_cmd)
|
||||
|
||||
# Read optimized weights, sum for normalization
|
||||
opt_weights = []
|
||||
total = 0
|
||||
with open(os.path.join(args.working_dir, 'mert.out')) as inp:
|
||||
# Same structure as original weight list
|
||||
for (feat, weights) in init_weights:
|
||||
opt_weights.append([feat, []])
|
||||
for _ in weights:
|
||||
w = float(inp.readline().split()[1])
|
||||
opt_weights[-1][1].append(w)
|
||||
# Sum for normalization
|
||||
total += abs(w)
|
||||
|
||||
# Normalize weights
|
||||
for (_, weights) in opt_weights:
|
||||
for i in range(len(weights)):
|
||||
weights[i] /= total
|
||||
|
||||
# Generate rescore.ini
|
||||
with open(os.path.join(args.working_dir, 'rescore.ini'), 'w') as out:
|
||||
out.write('# For use with Moses N-best rescorer "scripts/nbest-rescore/rescore.py"\n')
|
||||
out.write('\n')
|
||||
out.write('[weight]\n')
|
||||
for (feat, weights) in opt_weights:
|
||||
out.write('{} {}\n'.format(feat, ' '.join(str(w) for w in weights)))
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
@ -348,6 +348,9 @@ sub tokenize
|
||||
$text =~ s/^ //g;
|
||||
$text =~ s/ $//g;
|
||||
|
||||
# .' at end of sentence is missed
|
||||
$text =~ s/\.\' ?$/ . ' /;
|
||||
|
||||
# restore protected
|
||||
for (my $i = 0; $i < scalar(@protected); ++$i) {
|
||||
my $subst = sprintf("THISISPROTECTED%.3d", $i);
|
||||
|
@ -234,7 +234,7 @@ while(my $line = <INI>) {
|
||||
$w = $args[1];
|
||||
}
|
||||
elsif ($args[0] eq "input-factor") {
|
||||
$source_factor = chomp($args[1]);
|
||||
$source_factor = $args[1];
|
||||
}
|
||||
elsif ($args[0] eq "output-factor") {
|
||||
#$t = chomp($args[1]);
|
||||
|
@ -134,6 +134,7 @@ my($_EXTERNAL_BINDIR,
|
||||
$_LMODEL_OOV_FEATURE,
|
||||
$_NUM_LATTICE_FEATURES,
|
||||
$IGNORE,
|
||||
$_TARGET_CONSTITUENT_BOUNDARIES,
|
||||
$_FLEXIBILITY_SCORE,
|
||||
$_FEATURE_LINES,
|
||||
$_WEIGHT_LINES,
|
||||
@ -258,6 +259,7 @@ $_HELP = 1
|
||||
'instance-weights-file=s' => \$_INSTANCE_WEIGHTS_FILE,
|
||||
'lmodel-oov-feature' => \$_LMODEL_OOV_FEATURE,
|
||||
'num-lattice-features=i' => \$_NUM_LATTICE_FEATURES,
|
||||
'target-constituent-boundaries' => \$_TARGET_CONSTITUENT_BOUNDARIES,
|
||||
'flexibility-score' => \$_FLEXIBILITY_SCORE,
|
||||
'config-add-feature-lines=s' => \$_FEATURE_LINES,
|
||||
'config-add-weight-lines=s' => \$_WEIGHT_LINES,
|
||||
@ -321,7 +323,6 @@ my $_ADDITIONAL_INI; # allow multiple switches
|
||||
foreach (@_ADDITIONAL_INI) { $_ADDITIONAL_INI .= $_." "; }
|
||||
chop($_ADDITIONAL_INI) if $_ADDITIONAL_INI;
|
||||
|
||||
$_HIERARCHICAL = 1 if $_SOURCE_SYNTAX || $_TARGET_SYNTAX;
|
||||
$_XML = 1 if $_SOURCE_SYNTAX || $_TARGET_SYNTAX;
|
||||
my $___FACTOR_DELIMITER = $_FACTOR_DELIMITER;
|
||||
$___FACTOR_DELIMITER = '|' unless ($_FACTOR_DELIMITER);
|
||||
@ -1608,6 +1609,7 @@ sub extract_phrase {
|
||||
$cmd .= " --GZOutput ";
|
||||
$cmd .= " --InstanceWeights $_INSTANCE_WEIGHTS_FILE " if defined $_INSTANCE_WEIGHTS_FILE;
|
||||
$cmd .= " --BaselineExtract $_BASELINE_EXTRACT" if defined($_BASELINE_EXTRACT) && $PHRASE_EXTRACT =~ /extract-parallel.perl/;
|
||||
$cmd .= " --TargetConstituentBoundaries" if $_TARGET_CONSTITUENT_BOUNDARIES;
|
||||
$cmd .= " --FlexibilityScore" if $_FLEXIBILITY_SCORE;
|
||||
$cmd .= " --NoTTable" if $_MMSAPT;
|
||||
|
||||
@ -1765,9 +1767,10 @@ sub score_phrase_phrase_extract {
|
||||
$cmd .= " --SourceLabels $_GHKM_SOURCE_LABELS_FILE" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
|
||||
$cmd .= " --TargetSyntacticPreferences $_TARGET_SYNTACTIC_PREFERENCES_LABELS_FILE" if $_TARGET_SYNTACTIC_PREFERENCES && defined($_TARGET_SYNTACTIC_PREFERENCES_LABELS_FILE);
|
||||
$cmd .= " --PartsOfSpeech $_GHKM_PARTS_OF_SPEECH_FILE" if $_GHKM_PARTS_OF_SPEECH && defined($_GHKM_PARTS_OF_SPEECH_FILE);
|
||||
$cmd .= " --TargetConstituentBoundaries" if $_TARGET_CONSTITUENT_BOUNDARIES;
|
||||
$cmd .= " --FlexibilityScore=$FLEX_SCORER" if $_FLEXIBILITY_SCORE;
|
||||
$cmd .= " $DOMAIN" if $DOMAIN;
|
||||
$cmd .= " $CORE_SCORE_OPTIONS" if defined($_SCORE_OPTIONS);
|
||||
$cmd .= " --FlexibilityScore=$FLEX_SCORER" if $_FLEXIBILITY_SCORE;
|
||||
|
||||
# sorting
|
||||
if ($direction eq "e2f" || $_ALT_DIRECT_RULE_SCORE_1 || $_ALT_DIRECT_RULE_SCORE_2) {
|
||||
@ -1904,7 +1907,7 @@ sub get_reordering {
|
||||
# * the value stored in $REORDERING_MODEL_TYPES{$mtype} is a concatenation of the "orient"
|
||||
# attributes such as "msd"
|
||||
# * the "filename" attribute is appended to the filename, but actually serves as the main configuration specification
|
||||
# for reordering scoring. it holds a string such as "wbe-msd-didirectional-fe"
|
||||
# for reordering scoring. it holds a string such as "wbe-msd-bidirectional-fe"
|
||||
# which has the more general format type-orient-dir-lang
|
||||
$cmd .= " --model \"$mtype $REORDERING_MODEL_TYPES{$mtype}";
|
||||
foreach my $model (@REORDERING_MODELS) {
|
||||
@ -2325,7 +2328,7 @@ sub create_ini {
|
||||
# hierarchical model settings
|
||||
print INI "\n";
|
||||
if ($_HIERARCHICAL) {
|
||||
print INI "[unknown-lhs]\n$_UNKNOWN_WORD_LABEL_FILE\n\n" if $_TARGET_SYNTAX && defined($_UNKNOWN_WORD_LABEL_FILE);
|
||||
print INI "[unknown-lhs]\n$_UNKNOWN_WORD_LABEL_FILE\n\n" if $_TARGET_SYNTAX && !$_TARGET_SYNTACTIC_PREFERENCES && defined($_UNKNOWN_WORD_LABEL_FILE);
|
||||
print INI "[cube-pruning-pop-limit]\n1000\n\n";
|
||||
print INI "[non-terminals]\nX\n\n";
|
||||
print INI "[search-algorithm]\n3\n\n";
|
||||
@ -2382,6 +2385,12 @@ sub create_ini {
|
||||
chomp($TOPLABEL);
|
||||
print INI " glue-label=$TOPLABEL\n";
|
||||
}
|
||||
if ($_HIERARCHICAL && $_TARGET_SYNTAX && $_TARGET_SYNTACTIC_PREFERENCES && defined($_TARGET_SYNTACTIC_PREFERENCES_LABELS_FILE)) {
|
||||
print INI "TargetPreferencesFeature label-set-file=$_TARGET_SYNTACTIC_PREFERENCES_LABELS_FILE";
|
||||
print INI " unknown-word-labels-file=$_UNKNOWN_WORD_LABEL_FILE" if defined($_UNKNOWN_WORD_LABEL_FILE);
|
||||
print INI "\n";
|
||||
}
|
||||
print INI "TargetConstituentAdjacencyFeature\n" if $_TARGET_CONSTITUENT_BOUNDARIES;
|
||||
print INI $feature_spec;
|
||||
|
||||
print INI "\n# dense weights for feature functions\n";
|
||||
@ -2393,6 +2402,8 @@ sub create_ini {
|
||||
print INI "PhrasePenalty0= 0.2\n";
|
||||
print INI "SoftSourceSyntacticConstraintsFeature0= -0.2 -0.2 -0.2 0.1 0.1 0.1\n" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
|
||||
print INI "PhraseOrientationFeature0= 0.05 0.05 0.05 0.05 0.05 0.05\n" if $_PHRASE_ORIENTATION;
|
||||
print INI "TargetPreferencesFeature0= 0.2 -0.2\n" if $_HIERARCHICAL && $_TARGET_SYNTAX && $_TARGET_SYNTACTIC_PREFERENCES && defined($_TARGET_SYNTACTIC_PREFERENCES_LABELS_FILE);
|
||||
print INI "TargetConstituentAdjacencyFeature0= 0.05 -0.1\n" if $_TARGET_CONSTITUENT_BOUNDARIES;
|
||||
print INI $weight_spec;
|
||||
close(INI);
|
||||
}
|
||||
|
@ -58,6 +58,7 @@ class FileStream : public FakeOStream<FileStream> {
|
||||
}
|
||||
|
||||
FileStream &seekp(uint64_t to) {
|
||||
flush();
|
||||
util::SeekOrThrow(fd_, to);
|
||||
return *this;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user