From f379e5cb8a1d7e04072b4931b4f852b9a8ceb979 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Tue, 10 Sep 2013 15:36:21 +0200 Subject: [PATCH 1/9] lattice decoding with sparse features --- moses/InputPath.cpp | 2 ++ moses/ScoreComponentCollection.cpp | 31 +++++++++++++++++++++++++++++- moses/ScoreComponentCollection.h | 10 +++------- 3 files changed, 35 insertions(+), 8 deletions(-) diff --git a/moses/InputPath.cpp b/moses/InputPath.cpp index 6340e11d7..eb27e41f6 100644 --- a/moses/InputPath.cpp +++ b/moses/InputPath.cpp @@ -18,6 +18,8 @@ InputPath::InputPath(const Phrase &phrase, const NonTerminalSet &sourceNonTerms, ,m_range(range) ,m_inputScore(inputScore) { + //cerr << "phrase=" << phrase << " m_inputScore=" << *m_inputScore << endl; + FactorType placeholderFactor = StaticData::Instance().GetPlaceholderFactor().first; if (placeholderFactor != NOT_FOUND) { for (size_t pos = 0; pos < m_phrase.GetSize(); ++pos) { diff --git a/moses/ScoreComponentCollection.cpp b/moses/ScoreComponentCollection.cpp index 7cd9be701..0ce50d992 100644 --- a/moses/ScoreComponentCollection.cpp +++ b/moses/ScoreComponentCollection.cpp @@ -1,6 +1,6 @@ // $Id$ #include - +#include "util/exception.hh" #include "ScoreComponentCollection.h" #include "StaticData.h" @@ -30,6 +30,20 @@ void ScorePair::PlusEquals(const StringPiece &key, float value) } } +std::ostream& operator<<(std::ostream& os, const ScorePair& rhs) +{ + for (size_t i = 0; i < rhs.denseScores.size(); ++i) { + os << rhs.denseScores[i] << ","; + } + + std::map::const_iterator iter; + for (iter = rhs.sparseScores.begin(); iter != rhs.sparseScores.end(); ++iter) { + os << iter->first << "=" << iter->second << ","; + } + + return os; +} + ScoreComponentCollection::ScoreIndexMap ScoreComponentCollection::s_scoreIndexes; size_t ScoreComponentCollection::s_denseVectorSize = 0; @@ -206,6 +220,21 @@ void ScoreComponentCollection::Assign(const FeatureFunction* sp, const string li } } +void ScoreComponentCollection::Assign(const 
FeatureFunction* sp, const std::vector& scores) { + IndexPair indexes = GetIndexes(sp); + size_t numScores = indexes.second - indexes.first; + + if (scores.size() != numScores) { + UTIL_THROW(util::Exception, "Feature function " << sp->GetScoreProducerDescription() << " specified " + << numScores << " dense scores or weights. Actually has " << scores.size()); + } + + for (size_t i = 0; i < scores.size(); ++i) { + m_scores[i + indexes.first] = scores[i]; + } +} + + void ScoreComponentCollection::InvertDenseFeatures(const FeatureFunction* sp) { diff --git a/moses/ScoreComponentCollection.h b/moses/ScoreComponentCollection.h index 5eb353981..6501242ec 100644 --- a/moses/ScoreComponentCollection.h +++ b/moses/ScoreComponentCollection.h @@ -46,6 +46,8 @@ namespace Moses */ struct ScorePair { + friend std::ostream& operator<<(std::ostream& os, const ScorePair& rhs); + std::vector denseScores; std::map sparseScores; @@ -262,13 +264,7 @@ public: m_scores[fname] += score; } - void Assign(const FeatureFunction* sp, const std::vector& scores) { - IndexPair indexes = GetIndexes(sp); - CHECK(scores.size() == indexes.second - indexes.first); - for (size_t i = 0; i < scores.size(); ++i) { - m_scores[i + indexes.first] = scores[i]; - } - } + void Assign(const FeatureFunction* sp, const std::vector& scores); //! Special version Assign(ScoreProducer, vector) //! 
to add the score from a single ScoreProducer that produces From 03997dfc3aa6a023c75f5de4d9bc6dd68fb66595 Mon Sep 17 00:00:00 2001 From: Barry Haddow Date: Wed, 11 Sep 2013 10:41:17 +0100 Subject: [PATCH 2/9] Change number of weights in example --- scripts/ems/example/data/weight.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ems/example/data/weight.ini b/scripts/ems/example/data/weight.ini index e0ab92249..36b2039cb 100644 --- a/scripts/ems/example/data/weight.ini +++ b/scripts/ems/example/data/weight.ini @@ -6,7 +6,7 @@ Distortion0= 0.3 UnknownWordPenalty0= 1 WordPenalty0= -1 -TranslationModel0= 0.2 0.2 0.2 0.2 0.2 +TranslationModel0= 0.2 0.2 0.2 0.2 LexicalReordering0= 0.3 0.3 0.3 0.3 0.3 0.3 LM0= 0.5 From ef43d6e0388f2ff05e63d3eb591855fadac7a287 Mon Sep 17 00:00:00 2001 From: Barry Haddow Date: Wed, 11 Sep 2013 10:59:48 +0100 Subject: [PATCH 3/9] Need phrase penalty weight --- scripts/ems/example/data/weight.ini | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/ems/example/data/weight.ini b/scripts/ems/example/data/weight.ini index 36b2039cb..4e941b662 100644 --- a/scripts/ems/example/data/weight.ini +++ b/scripts/ems/example/data/weight.ini @@ -7,6 +7,7 @@ Distortion0= 0.3 UnknownWordPenalty0= 1 WordPenalty0= -1 TranslationModel0= 0.2 0.2 0.2 0.2 +PhrasePenalty0= 0.2 LexicalReordering0= 0.3 0.3 0.3 0.3 0.3 0.3 LM0= 0.5 From c36d8d048fbe39e64f1771dc249c6c062890803a Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 12 Sep 2013 09:48:17 +0200 Subject: [PATCH 4/9] ExternalFeature /cdec & phil blumson --- contrib/other-builds/moses/.project | 10 ++++ moses/FF/ExternalFeature.cpp | 73 +++++++++++++++++++++++ moses/FF/ExternalFeature.h | 92 +++++++++++++++++++++++++++++ moses/FF/Factory.cpp | 3 + 4 files changed, 178 insertions(+) create mode 100644 moses/FF/ExternalFeature.cpp create mode 100644 moses/FF/ExternalFeature.h diff --git a/contrib/other-builds/moses/.project b/contrib/other-builds/moses/.project index 
3edf604e3..a71267f8f 100644 --- a/contrib/other-builds/moses/.project +++ b/contrib/other-builds/moses/.project @@ -1066,6 +1066,16 @@ 1 PARENT-3-PROJECT_LOC/moses/FF/DistortionScoreProducer.h + + FF/ExternalFeature.cpp + 1 + PARENT-3-PROJECT_LOC/moses/FF/ExternalFeature.cpp + + + FF/ExternalFeature.h + 1 + PARENT-3-PROJECT_LOC/moses/FF/ExternalFeature.h + FF/FFState.cpp 1 diff --git a/moses/FF/ExternalFeature.cpp b/moses/FF/ExternalFeature.cpp new file mode 100644 index 000000000..9e966bffe --- /dev/null +++ b/moses/FF/ExternalFeature.cpp @@ -0,0 +1,73 @@ +#include "ExternalFeature.h" +#include + +using namespace std; + +namespace Moses +{ +ExternalFeatureState::ExternalFeatureState(int stateSize, void *data) +{ + m_stateSize = stateSize; + m_data = malloc(stateSize); + memcpy(m_data, data, stateSize); +} + +void ExternalFeature::Load() +{ + string nparam = "testing"; + + if (m_path.size() < 1) { + cerr << "External requires a path to a dynamic library!\n"; + abort(); + } + lib_handle = dlopen(m_path.c_str(), RTLD_LAZY); + if (!lib_handle) { + cerr << "dlopen reports: " << dlerror() << endl; + cerr << "Did you provide a full path to the dynamic library?\n"; + abort(); + } + CdecFF* (*fn)(const string&) = + (CdecFF* (*)(const string&))(dlsym(lib_handle, "create_ff")); + if (!fn) { + cerr << "dlsym reports: " << dlerror() << endl; + abort(); + } + ff_ext = (*fn)(nparam); + m_stateSize = ff_ext->StateSize(); + +} + +ExternalFeature::~ExternalFeature() { + delete ff_ext; + dlclose(lib_handle); +} + +void ExternalFeature::SetParameter(const std::string& key, const std::string& value) +{ + if (key == "path") { + m_path = value; + } + else { + StatefulFeatureFunction::SetParameter(key, value); + } +} + +FFState* ExternalFeature::Evaluate( + const Hypothesis& cur_hypo, + const FFState* prev_state, + ScoreComponentCollection* accumulator) const +{ + return new ExternalFeatureState(m_stateSize); +} + +FFState* ExternalFeature::EvaluateChart( + const ChartHypothesis& /* 
cur_hypo */, + int /* featureID - used to index the state in the previous hypotheses */, + ScoreComponentCollection* accumulator) const +{ + return new ExternalFeatureState(m_stateSize); +} + + +} + diff --git a/moses/FF/ExternalFeature.h b/moses/FF/ExternalFeature.h new file mode 100644 index 000000000..7082caeaf --- /dev/null +++ b/moses/FF/ExternalFeature.h @@ -0,0 +1,92 @@ +#pragma once + +#include +#include "StatefulFeatureFunction.h" +#include "FFState.h" + +namespace Moses +{ +class CdecFF; + +class ExternalFeatureState : public FFState +{ +protected: + int m_stateSize; + void *m_data; +public: + ExternalFeatureState(int stateSize) + :m_stateSize(stateSize) + ,m_data(NULL) + {} + ExternalFeatureState(int stateSize, void *data); + + ~ExternalFeatureState() + { + free(m_data); + } + + int Compare(const FFState& other) const + { + const ExternalFeatureState &otherFF = static_cast(other); + int ret = memcmp(m_data, otherFF.m_data, m_stateSize); + return ret; + } +}; + +// copied from cdec +class ExternalFeature : public StatefulFeatureFunction +{ +public: + ExternalFeature(const std::string &line) + :StatefulFeatureFunction("ExternalFeature", line) + { + ReadParameters(); + } + ~ExternalFeature(); + + void Load(); + + bool IsUseable(const FactorMask &mask) const + { return true; } + + void SetParameter(const std::string& key, const std::string& value); + + void Evaluate(const Phrase &source + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedFutureScore) const + {} + void Evaluate(const InputType &input + , const InputPath &inputPath + , ScoreComponentCollection &scoreBreakdown) const + {} + FFState* Evaluate( + const Hypothesis& cur_hypo, + const FFState* prev_state, + ScoreComponentCollection* accumulator) const; + + FFState* EvaluateChart( + const ChartHypothesis& /* cur_hypo */, + int /* featureID - used to index the state in the previous hypotheses */, + ScoreComponentCollection* 
accumulator) const; + + virtual const FFState* EmptyHypothesisState(const InputType &input) const + { + return new ExternalFeatureState(m_stateSize); + } + +protected: + std::string m_path; + void* lib_handle; + CdecFF *ff_ext; + int m_stateSize; +}; + +class CdecFF +{ +public: + virtual int StateSize() const = 0; +}; + +} + diff --git a/moses/FF/Factory.cpp b/moses/FF/Factory.cpp index 01b12d920..a400b2602 100644 --- a/moses/FF/Factory.cpp +++ b/moses/FF/Factory.cpp @@ -30,6 +30,8 @@ #include "moses/FF/PhrasePenalty.h" #include "moses/FF/OSM-Feature/OpSequenceModel.h" #include "moses/FF/ControlRecombination.h" +#include "moses/FF/ExternalFeature.h" + #include "moses/FF/SkeletonStatelessFF.h" #include "moses/FF/SkeletonStatefulFF.h" @@ -142,6 +144,7 @@ FeatureRegistry::FeatureRegistry() MOSES_FNAME(ControlRecombination); MOSES_FNAME(SkeletonStatelessFF); MOSES_FNAME(SkeletonStatefulFF); + MOSES_FNAME(ExternalFeature); #ifdef HAVE_SYNLM MOSES_FNAME(SyntacticLanguageModel); From b534e8c4b7bbc4318732aeea568ce3d3c8119feb Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 12 Sep 2013 09:17:18 +0100 Subject: [PATCH 5/9] f*cking link error --- Jamroot | 4 ++++ moses/FF/ExternalFeature.h | 1 + 2 files changed, 5 insertions(+) diff --git a/Jamroot b/Jamroot index 00fbd7b00..32e93e872 100644 --- a/Jamroot +++ b/Jamroot @@ -75,6 +75,10 @@ include $(TOP)/jam-files/sanity.jam ; boost 103600 ; external-lib z ; +external-lib dl ; + +requirements += dl ; + if ! 
[ option.get "without-tcmalloc" : : "yes" ] && [ test_library "tcmalloc_minimal" ] { if [ option.get "full-tcmalloc" : : "yes" ] { diff --git a/moses/FF/ExternalFeature.h b/moses/FF/ExternalFeature.h index 7082caeaf..20bff4154 100644 --- a/moses/FF/ExternalFeature.h +++ b/moses/FF/ExternalFeature.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include "StatefulFeatureFunction.h" #include "FFState.h" From 5e506ed91409ff94b5436b5f06946f47d719d1ee Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Thu, 12 Sep 2013 09:27:01 +0100 Subject: [PATCH 6/9] Make dl have the same linkage as glibc --- Jamroot | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jamroot b/Jamroot index 32e93e872..e4a8a9ed3 100644 --- a/Jamroot +++ b/Jamroot @@ -75,8 +75,8 @@ include $(TOP)/jam-files/sanity.jam ; boost 103600 ; external-lib z ; -external-lib dl ; +lib dl : : static:static shared:shared ; requirements += dl ; From 9950279bc72c2acdf227ca30ef4bba2d900c241b Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 12 Sep 2013 19:50:09 +0200 Subject: [PATCH 7/9] add pointer to input path to chart translation option --- moses/ChartTranslationOption.h | 6 ++++++ moses/ChartTranslationOptions.cpp | 1 + 2 files changed, 7 insertions(+) diff --git a/moses/ChartTranslationOption.h b/moses/ChartTranslationOption.h index 8984adf66..271ff7256 100644 --- a/moses/ChartTranslationOption.h +++ b/moses/ChartTranslationOption.h @@ -13,6 +13,7 @@ class ChartTranslationOption protected: const TargetPhrase &m_targetPhrase; ScoreComponentCollection m_scoreBreakdown; + const InputPath *m_inputPath; public: ChartTranslationOption(const TargetPhrase &targetPhrase); @@ -21,6 +22,11 @@ public: return m_targetPhrase; } + void SetInputPath(const InputPath *inputPath) + { m_inputPath = inputPath; } + const InputPath *GetInputPath() const + { return m_inputPath; } + const ScoreComponentCollection &GetScores() const { return m_scoreBreakdown; } diff --git a/moses/ChartTranslationOptions.cpp 
b/moses/ChartTranslationOptions.cpp index 4befcacaa..08247f523 100644 --- a/moses/ChartTranslationOptions.cpp +++ b/moses/ChartTranslationOptions.cpp @@ -69,6 +69,7 @@ void ChartTranslationOptions::Evaluate(const InputType &input, const InputPath & CollType::iterator iter; for (iter = m_collection.begin(); iter != m_collection.end(); ++iter) { ChartTranslationOption &transOpt = **iter; + transOpt.SetInputPath(&inputPath); transOpt.Evaluate(input, inputPath); } From 3b03d803d98194772e4b0ac6df7041c8c98c0119 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 13 Sep 2013 09:27:48 +0200 Subject: [PATCH 8/9] make sure Evaluate(inputPath) is called for chart translation options --- moses/ChartManager.cpp | 3 +++ moses/ChartParser.cpp | 5 +++++ moses/ChartParser.h | 1 + 3 files changed, 9 insertions(+) diff --git a/moses/ChartManager.cpp b/moses/ChartManager.cpp index 8a25be1f2..aa47dc161 100644 --- a/moses/ChartManager.cpp +++ b/moses/ChartManager.cpp @@ -93,6 +93,9 @@ void ChartManager::ProcessSentence() m_parser.Create(range, m_translationOptionList); m_translationOptionList.ApplyThreshold(); + const InputPath &inputPath = m_parser.GetInputPath(range); + m_translationOptionList.Evaluate(m_source, inputPath); + // decode ChartCell &cell = m_hypoStackColl.Get(range); diff --git a/moses/ChartParser.cpp b/moses/ChartParser.cpp index 7170c4d92..bc5b86a3e 100644 --- a/moses/ChartParser.cpp +++ b/moses/ChartParser.cpp @@ -219,6 +219,11 @@ void ChartParser::CreateInputPaths(const InputType &input) } } +const InputPath &ChartParser::GetInputPath(WordsRange &range) const +{ + return GetInputPath(range.GetStartPos(), range.GetEndPos()); +} + const InputPath &ChartParser::GetInputPath(size_t startPos, size_t endPos) const { size_t offset = endPos - startPos; diff --git a/moses/ChartParser.h b/moses/ChartParser.h index d80296771..55a2c85fd 100644 --- a/moses/ChartParser.h +++ b/moses/ChartParser.h @@ -66,6 +66,7 @@ public: long GetTranslationId() const; size_t GetSize() const; 
const InputPath &GetInputPath(size_t startPos, size_t endPos) const; + const InputPath &GetInputPath(WordsRange &range) const; private: ChartParserUnknown m_unknown; From cdd9df19d26723645454f8ddef467489d1ed341b Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Fri, 13 Sep 2013 22:16:42 +0100 Subject: [PATCH 9/9] Remove --OutputNTLengths from extract-rules, etc. The option isn't used in master and the output is incompatible with the current rule table format. If anyone wants this in master it should probably be fixed in the span-length branch then merged. --- phrase-extract/ExtractedRule.cpp | 44 ----------- phrase-extract/ExtractedRule.h | 11 --- phrase-extract/RuleExtractionOptions.h | 2 - phrase-extract/consolidate-main.cpp | 9 +-- phrase-extract/extract-rules-main.cpp | 9 --- phrase-extract/score-main.cpp | 104 +------------------------ 6 files changed, 2 insertions(+), 177 deletions(-) delete mode 100644 phrase-extract/ExtractedRule.cpp diff --git a/phrase-extract/ExtractedRule.cpp b/phrase-extract/ExtractedRule.cpp deleted file mode 100644 index 50d9085e6..000000000 --- a/phrase-extract/ExtractedRule.cpp +++ /dev/null @@ -1,44 +0,0 @@ -// -// ExtractedRule.cpp -// extract -// -// Created by Hieu Hoang on 13/09/2011. -// Copyright 2011 __MyCompanyName__. All rights reserved. 
-// - -#include "ExtractedRule.h" - -using namespace std; - -namespace MosesTraining -{ - -void ExtractedRule::OutputNTLengths(std::ostream &out) const -{ - ostringstream outString; - OutputNTLengths(outString); - out << outString; -} - -void ExtractedRule::OutputNTLengths(std::ostringstream &outString) const -{ - std::map >::const_iterator iter; - for (iter = m_ntLengths.begin(); iter != m_ntLengths.end(); ++iter) { - size_t sourcePos = iter->first; - const std::pair &spanLengths = iter->second; - outString << sourcePos << "=" << spanLengths.first << "," < > m_ntLengths; - ExtractedRule(int sT, int eT, int sS, int eS) : source() , target() @@ -76,13 +72,6 @@ public: , count(0) , pcfgScore(0.0) { } - - void SetSpanLength(size_t sourcePos, size_t sourceLength, size_t targetLength) { - m_ntLengths[sourcePos] = std::pair(sourceLength, targetLength); - } - - void OutputNTLengths(std::ostream &out) const; - void OutputNTLengths(std::ostringstream &out) const; }; } diff --git a/phrase-extract/RuleExtractionOptions.h b/phrase-extract/RuleExtractionOptions.h index a9b0ce9e6..b38258470 100644 --- a/phrase-extract/RuleExtractionOptions.h +++ b/phrase-extract/RuleExtractionOptions.h @@ -49,7 +49,6 @@ public: bool duplicateRules; bool fractionalCounting; bool pcfgScore; - bool outputNTLengths; bool gzOutput; bool unpairedExtractFormat; bool conditionOnTargetLhs; @@ -83,7 +82,6 @@ public: , duplicateRules(true) , fractionalCounting(true) , pcfgScore(false) - , outputNTLengths(false) , gzOutput(false) , unpairedExtractFormat(false) , conditionOnTargetLhs(false) diff --git a/phrase-extract/consolidate-main.cpp b/phrase-extract/consolidate-main.cpp index 221b7048c..b0e2c8594 100644 --- a/phrase-extract/consolidate-main.cpp +++ b/phrase-extract/consolidate-main.cpp @@ -41,7 +41,6 @@ bool lowCountFlag = false; bool goodTuringFlag = false; bool kneserNeyFlag = false; bool logProbFlag = false; -bool outputNTLengths = false; inline float maybeLogProb( float a ) { return logProbFlag ? 
log(a) : a; @@ -62,7 +61,7 @@ int main(int argc, char* argv[]) << "consolidating direct and indirect rule tables\n"; if (argc < 4) { - cerr << "syntax: consolidate phrase-table.direct phrase-table.indirect phrase-table.consolidated [--Hierarchical] [--OnlyDirect] [--OutputNTLengths] \n"; + cerr << "syntax: consolidate phrase-table.direct phrase-table.indirect phrase-table.consolidated [--Hierarchical] [--OnlyDirect] \n"; exit(1); } char* &fileNameDirect = argv[1]; @@ -119,8 +118,6 @@ int main(int argc, char* argv[]) } else if (strcmp(argv[i],"--LogProb") == 0) { logProbFlag = true; cerr << "using log-probabilities\n"; - } else if (strcmp(argv[i],"--OutputNTLengths") == 0) { - outputNTLengths = true; } else { cerr << "ERROR: unknown option " << argv[i] << endl; exit(1); @@ -315,10 +312,6 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC // counts, for debugging fileConsolidated << "||| " << countE << " " << countF << " " << countEF; - if (outputNTLengths) { - fileConsolidated << " ||| " << itemDirect[5]; - } - // count bin feature (as a sparse feature) if (sparseCountBinFeatureFlag || directSparseScores.compare("") != 0 || diff --git a/phrase-extract/extract-rules-main.cpp b/phrase-extract/extract-rules-main.cpp index 97a593085..c625c3582 100644 --- a/phrase-extract/extract-rules-main.cpp +++ b/phrase-extract/extract-rules-main.cpp @@ -129,7 +129,6 @@ int main(int argc, char* argv[]) << " --GlueGrammar FILE" << " | --UnknownWordLabel FILE" << " | --OnlyDirect" - << " | --OutputNTLengths" << " | --MaxSpan[" << options.maxSpan << "]" << " | --MinHoleTarget[" << options.minHoleTarget << "]" << " | --MinHoleSource[" << options.minHoleSource << "]" @@ -262,8 +261,6 @@ int main(int argc, char* argv[]) options.fractionalCounting = false; } else if (strcmp(argv[i],"--PCFG") == 0) { options.pcfgScore = true; - } else if (strcmp(argv[i],"--OutputNTLengths") == 0) { - options.outputNTLengths = true; } else if 
(strcmp(argv[i],"--UnpairedExtractFormat") == 0) { options.unpairedExtractFormat = true; } else if (strcmp(argv[i],"--ConditionOnTargetLHS") == 0) { @@ -663,9 +660,6 @@ void ExtractTask::saveHieroAlignment( int startT, int endT, int startS, int endS rule.alignment += sourceSymbolIndex + "-" + targetSymbolIndex + " "; if (!m_options.onlyDirectFlag) rule.alignmentInv += targetSymbolIndex + "-" + sourceSymbolIndex + " "; - - rule.SetSpanLength(hole.GetPos(0), hole.GetSize(0), hole.GetSize(1) ) ; - } rule.alignment.erase(rule.alignment.size()-1); @@ -1077,9 +1071,6 @@ void ExtractTask::writeRulesToFile() << rule->target << " ||| " << rule->alignment << " ||| " << rule->count << " ||| "; - if (m_options.outputNTLengths) { - rule->OutputNTLengths(out); - } if (m_options.pcfgScore) { out << " ||| " << rule->pcfgScore; } diff --git a/phrase-extract/score-main.cpp b/phrase-extract/score-main.cpp index 3042cbe3e..b7ccea0fc 100644 --- a/phrase-extract/score-main.cpp +++ b/phrase-extract/score-main.cpp @@ -59,7 +59,6 @@ int negLogProb = 1; bool lexFlag = true; bool unalignedFlag = false; bool unalignedFWFlag = false; -bool outputNTLengths = false; bool singletonFeature = false; bool crossedNonTerm = false; int countOfCounts[COC_MAX+1]; @@ -82,9 +81,6 @@ double computeUnalignedPenalty( const PHRASE &, const PHRASE &, const PhraseAlig set functionWordList; void loadFunctionWords( const string &fileNameFunctionWords ); double computeUnalignedFWPenalty( const PHRASE &, const PHRASE &, const PhraseAlignment & ); -void calcNTLengthProb(const vector< PhraseAlignment* > &phrasePairs - , map > &sourceProb - , map > &targetProb); void printSourcePhrase(const PHRASE &, const PHRASE &, const PhraseAlignment &, ostream &); void printTargetPhrase(const PHRASE &, const PHRASE &, const PhraseAlignment &, ostream &); @@ -95,7 +91,7 @@ int main(int argc, char* argv[]) ScoreFeatureManager featureManager; if (argc < 4) { - cerr << "syntax: score extract lex phrase-table [--Inverse] 
[--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--NoWordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] [--PCFG] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--Singleton] [--CrossedNonTerm] \n"; + cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--NoWordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--PCFG] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--Singleton] [--CrossedNonTerm] \n"; cerr << featureManager.usage() << endl; exit(1); } @@ -158,8 +154,6 @@ int main(int argc, char* argv[]) minCountHierarchical = atof(argv[++i]); cerr << "dropping all phrase pairs occurring less than " << minCountHierarchical << " times\n"; minCountHierarchical -= 0.00001; // account for rounding - } else if (strcmp(argv[i],"--OutputNTLengths") == 0) { - outputNTLengths = true; } else if (strcmp(argv[i],"--Singleton") == 0) { singletonFeature = true; cerr << "binary singleton feature\n"; @@ -375,87 +369,6 @@ const PhraseAlignment &findBestAlignment(const PhraseAlignmentCollection &phrase return *bestAlignment; } - -void calcNTLengthProb(const map > &lengths - , size_t total - , map > &probs) -{ - map >::const_iterator iterOuter; - for (iterOuter = lengths.begin(); iterOuter != lengths.end(); ++iterOuter) { - size_t sourcePos = iterOuter->first; - const map &inner = iterOuter->second; - - map::const_iterator iterInner; - for (iterInner = inner.begin(); iterInner != inner.end(); ++iterInner) { - size_t length = iterInner->first; - size_t count = iterInner->second; - float prob = (float) count / (float) total; - probs[sourcePos][length] = prob; - } - } -} - -void calcNTLengthProb(const vector< PhraseAlignment* > &phrasePairs - , map > &sourceProb - , map > &targetProb) -{ - map > 
sourceLengths, targetLengths; - // 1st = position in source phrase, 2nd = length, 3rd = count - map totals; - // 1st = position in source phrase, 2nd = total counts - // each source pos should have same count? - - vector< PhraseAlignment* >::const_iterator iterOuter; - for (iterOuter = phrasePairs.begin(); iterOuter != phrasePairs.end(); ++iterOuter) { - const PhraseAlignment &phrasePair = **iterOuter; - const std::map > &ntLengths = phrasePair.GetNTLengths(); - - std::map >::const_iterator iterInner; - for (iterInner = ntLengths.begin(); iterInner != ntLengths.end(); ++iterInner) { - size_t sourcePos = iterInner->first; - size_t sourceLength = iterInner->second.first; - size_t targetLength = iterInner->second.second; - - sourceLengths[sourcePos][sourceLength]++; - targetLengths[sourcePos][targetLength]++; - - totals[sourcePos]++; - } - } - - if (totals.size() == 0) { - // no non-term. Don't bother - return; - } - - size_t total = totals.begin()->second; - if (totals.size() > 1) { - assert(total == (++totals.begin())->second ); - } - - calcNTLengthProb(sourceLengths, total, sourceProb); - calcNTLengthProb(targetLengths, total, targetProb); - -} - -void outputNTLengthProbs(ostream &phraseTableFile, const map > &probs, const string &prefix) -{ - map >::const_iterator iterOuter; - for (iterOuter = probs.begin(); iterOuter != probs.end(); ++iterOuter) { - size_t sourcePos = iterOuter->first; - const map &inner = iterOuter->second; - - map::const_iterator iterInner; - for (iterInner = inner.begin(); iterInner != inner.end(); ++iterInner) { - size_t length = iterInner->first; - float prob = iterInner->second; - - phraseTableFile << sourcePos << "|" << prefix << "|" << length << "=" << prob << " "; - } - } - -} - bool calcCrossedNonTerm(size_t sourcePos, size_t targetPos, const std::vector< std::set > &alignedToS) { for (size_t currSource = 0; currSource < alignedToS.size(); ++currSource) { @@ -664,21 +577,6 @@ void outputPhrasePair(const PhraseAlignmentCollection 
&phrasePair, float totalCo if (kneserNeyFlag) phraseTableFile << " " << distinctCount; - // nt lengths - if (outputNTLengths) { - phraseTableFile << " ||| "; - - if (!inverseFlag) { - map > sourceProb, targetProb; - // 1st sourcePos, 2nd = length, 3rd = prob - - calcNTLengthProb(phrasePair, sourceProb, targetProb); - - outputNTLengthProbs(phraseTableFile, sourceProb, "S"); - outputNTLengthProbs(phraseTableFile, targetProb, "T"); - } - } - phraseTableFile << endl; }