Utility to look up phrase table entries for a corpus

(specifically for dynamic phrase tables like Mmsapt and PhraseDictionaryGroup)
This commit is contained in:
Michael Denkowski 2015-12-18 14:59:07 -05:00
parent 0a39efb6c8
commit 5de46d6ae9
6 changed files with 121 additions and 3 deletions

View File

@ -248,6 +248,7 @@ if [ option.get "with-mm-extras" : : "yes" ]
moses/TranslationModel/UG//ptable-describe-features
moses/TranslationModel/UG//count-ptable-features
moses/TranslationModel/UG//ptable-lookup
moses/TranslationModel/UG//ptable-lookup-corpus
moses/TranslationModel/UG//check-coverage
moses/TranslationModel/UG/mm//mtt-demo1
moses/TranslationModel/UG/mm//mtt-dump

View File

@ -74,7 +74,7 @@
<fileInfo id="cdt.managedbuild.config.gnu.exe.debug.1846963597.871386239" name="LDHT.h" rcbsApplicability="disable" resourcePath="LM/LDHT.h" toolsToInvoke=""/>
<fileInfo id="cdt.managedbuild.config.gnu.exe.debug.1846963597.1761300858" name="ParallelBackoff.h" rcbsApplicability="disable" resourcePath="LM/ParallelBackoff.h" toolsToInvoke=""/>
<sourceEntries>
<entry excluding="TranslationModel/UG/ptable-lookup.cc|TranslationModel/UG/mm/test-http-client.cc|TranslationModel/UG/ptable-describe-features.cc|TranslationModel/UG/count-ptable-features.cc|TranslationModel/UG/try-align2.cc|TranslationModel/UG/try-align.cc|TranslationModel/UG/spe-check-coverage3.cc|TranslationModel/UG/spe-check-coverage2.cc|TranslationModel/UG/spe-check-coverage.cc|TranslationModel/UG/sim-pe.cc|TranslationModel/UG/generic/stringdist|TranslationModel/UG/mm/test-dynamic-im-tsa.cc|TranslationModel/UG/mm/mtt.count.cc|LM/ParallelBackoff.h|LM/ParallelBackoff.cpp|LM/bilingual-lm|LM/MaxEntSRI.h|LM/MaxEntSRI.cpp|LM/BilingualLM.h|LM/BilingualLM.cpp|LM/Rand.h|LM/Rand.cpp|LM/LDHT.h|LM/LDHT.cpp|LM/ORLM.h|LM/ORLM.cpp|LM/NeuralLMWrapper.h|LM/NeuralLMWrapper.cpp|LM/SRI.h|LM/SRI.cpp|LM/IRST.h|LM/IRST.cpp|LM/DALMWrapper.h|LM/DALMWrapper.cpp|LM/oxlm|TranslationModel/UG/util" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/>
<entry excluding="TranslationModel/UG/ptable-lookup.cc|TranslationModel/UG/ptable-lookup-corpus.cc|TranslationModel/UG/mm/test-http-client.cc|TranslationModel/UG/ptable-describe-features.cc|TranslationModel/UG/count-ptable-features.cc|TranslationModel/UG/try-align2.cc|TranslationModel/UG/try-align.cc|TranslationModel/UG/spe-check-coverage3.cc|TranslationModel/UG/spe-check-coverage2.cc|TranslationModel/UG/spe-check-coverage.cc|TranslationModel/UG/sim-pe.cc|TranslationModel/UG/generic/stringdist|TranslationModel/UG/mm/test-dynamic-im-tsa.cc|TranslationModel/UG/mm/mtt.count.cc|LM/ParallelBackoff.h|LM/ParallelBackoff.cpp|LM/bilingual-lm|LM/MaxEntSRI.h|LM/MaxEntSRI.cpp|LM/BilingualLM.h|LM/BilingualLM.cpp|LM/Rand.h|LM/Rand.cpp|LM/LDHT.h|LM/LDHT.cpp|LM/ORLM.h|LM/ORLM.cpp|LM/NeuralLMWrapper.h|LM/NeuralLMWrapper.cpp|LM/SRI.h|LM/SRI.cpp|LM/IRST.h|LM/IRST.cpp|LM/DALMWrapper.h|LM/DALMWrapper.cpp|LM/oxlm|TranslationModel/UG/util" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/>
</sourceEntries>
</configuration>
</storageModule>

View File

@ -3504,7 +3504,12 @@
<name>TranslationModel/UG/ptable-lookup.cc</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/ptable-lookup.cc</locationURI>
</link>
</link>
<link>
<name>TranslationModel/UG/ptable-lookup-corpus.cc</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/ptable-lookup-corpus.cc</locationURI>
</link>
<link>
<name>TranslationModel/UG/sapt_phrase_scorers.h</name>
<type>1</type>
@ -5584,7 +5589,17 @@
<name>TranslationModel/UG/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ptable-lookup.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ptable-lookup.o</locationURI>
</link>
</link>
<link>
<name>TranslationModel/UG/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ptable-lookup-corpus</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ptable-lookup-corpus</locationURI>
</link>
<link>
<name>TranslationModel/UG/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ptable-lookup-corpus.o</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/ptable-lookup-corpus.o</locationURI>
</link>
<link>
<name>TranslationModel/UG/bin/gcc-4.8/release/debug-symbols-on/link-static/threading-multi/spe-check-coverage</name>
<type>1</type>

View File

@ -119,6 +119,7 @@
<File Name="../../../moses/TranslationModel/UG/mmsapt_align.cc"/>
<File Name="../../../moses/TranslationModel/UG/ptable-describe-features.cc"/>
<File Name="../../../moses/TranslationModel/UG/ptable-lookup.cc"/>
<File Name="../../../moses/TranslationModel/UG/ptable-lookup-corpus.cc"/>
<File Name="../../../moses/TranslationModel/UG/sapt_phrase_key.h"/>
<File Name="../../../moses/TranslationModel/UG/sapt_phrase_scorers.h"/>
<File Name="../../../moses/TranslationModel/UG/sapt_pscore_base.h"/>

View File

@ -59,6 +59,18 @@ $(TOP)/moses/TranslationModel/UG//mmsapt
$(TOP)/util//kenutil
;
# Executable target for the corpus phrase-table lookup utility.
# Built from ptable-lookup-corpus.cc and linked against the Moses core
# library, the UG generic/mm/mmsapt libraries, the listed Boost libraries
# (iostreams, filesystem, program_options) and kenutil.
exe ptable-lookup-corpus :
ptable-lookup-corpus.cc
$(TOP)/moses//moses
$(TOP)/moses/TranslationModel/UG/generic//generic
$(TOP)//boost_iostreams
$(TOP)//boost_filesystem
$(TOP)//boost_program_options
$(TOP)/moses/TranslationModel/UG/mm//mm
$(TOP)/moses/TranslationModel/UG//mmsapt
$(TOP)/util//kenutil
;
exe check-coverage :
check-coverage.cc
$(TOP)/moses//moses

View File

@ -0,0 +1,89 @@
#include "moses/TranslationTask.h"
#include <boost/foreach.hpp>
#include <boost/format.hpp>
#include <boost/tokenizer.hpp>
#include <boost/shared_ptr.hpp>
#include <iostream>
using namespace Moses;
using namespace std;
using namespace boost;
int main(int argc, char const* argv[])
{
// Load standard Moses config
Parameter params;
if (!params.LoadParam(argc, argv) || !StaticData::LoadDataStatic(&params, argv[0])) {
exit(2);
}
StaticData const& global = StaticData::Instance();
global.SetVerboseLevel(0);
vector<FactorType> ifo = global.options()->input.factor_order;
// Get last PhraseDictionary in config (either single model or model combination entry)
PhraseDictionary* pt = PhraseDictionary::GetColl()[PhraseDictionary::GetColl().size() - 1];
// Only lookup each phrase once
unordered_set<string> seen;
string line;
while (true) {
// Input line
if (getline(cin, line, '\n').eof()) {
break;
}
vector<string> words = Tokenize(line);
// For each start word
for (size_t i = 0; i < words.size(); ++i) {
// Try phrases of increasing length
for (size_t j = i + 1; j <= words.size(); ++j) {
string phrase_str = Join(" ", words.begin() + i, words.begin() + j);
// Unique check
if (seen.find(phrase_str) != seen.end()) {
continue;
}
seen.insert(phrase_str);
// New phrase
boost::shared_ptr<Sentence> phrase(new Sentence(global.options()));
phrase->init(phrase_str);
Phrase const& src = *phrase;
// Setup task for phrase
boost::shared_ptr<TranslationTask> ttask;
ttask = TranslationTask::create(phrase);
// Support model combinations (PhraseDictionaryGroup)
BOOST_FOREACH(PhraseDictionary* p, PhraseDictionary::GetColl()) {
p->InitializeForInput(ttask);
}
// Query PhraseDictionary
TargetPhraseCollection::shared_ptr tgts = pt->GetTargetPhraseCollectionLEGACY(ttask, src);
// No results breaks loop over increasing phrase lengths
if (!tgts) {
break;
}
for (size_t k = 0; k < tgts->GetSize(); ++k) {
TargetPhrase const& tgt = static_cast<TargetPhrase const&>(*(*tgts)[k]);
ScoreComponentCollection const& scc = tgt.GetScoreBreakdown();
size_t start = pt->GetIndex();
size_t stop = start + pt->GetNumScoreComponents();
FVector const& scores = scc.GetScoresVector();
cout << src << "||| " << static_cast<Phrase const>(tgt) << "|||";
for (size_t k = start; k < stop; ++k) {
float f = scores[k];
cout << " " << f;
}
cout << " ||| " << tgt.GetAlignTerm() << endl;
}
}
}
}
}