phrase table pruning

This commit is contained in:
Barry Haddow 2014-09-08 15:08:18 +01:00
parent 260954bfd5
commit 18437a0351
2 changed files with 78 additions and 13 deletions

View File

@ -46,4 +46,4 @@ $(TOP)//boost_iostreams
$(TOP)//boost_program_options
;
alias programs : 1-1-Extraction TMining generateSequences processPhraseTable processLexicalTable queryPhraseTable queryLexicalTable programsMin programsProbing merge-sortedprune PhraseTable ;
alias programs : 1-1-Extraction TMining generateSequences processPhraseTable processLexicalTable queryPhraseTable queryLexicalTable programsMin programsProbing merge-sorted prunePhraseTable ;

View File

@ -34,7 +34,9 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <boost/program_options.hpp>
#include <boost/scoped_ptr.hpp>
#include "moses/InputPath.h"
#include "moses/Parameter.h"
#include "moses/TranslationModel/PhraseDictionary.h"
#include "moses/StaticData.h"
#include "util/file_piece.hh"
@ -55,7 +57,45 @@ static void usage(const po::options_description& desc, char** argv) {
}
//Find top n translations of source, and send them to output
static void outputTopN(const StringPiece& sourcePhrase, const StaticData& staticData, ostream& out) {
static void outputTopN(const StringPiece& sourcePhraseString, PhraseDictionary* phraseTable, const std::vector<FactorType> &input, ostream& out) {
//get list of target phrases
Phrase sourcePhrase;
sourcePhrase.CreateFromString(Input,input,sourcePhraseString,NULL);
InputPath inputPath(sourcePhrase, NonTerminalSet(), WordsRange(0,sourcePhrase.GetSize()-1),NULL,NULL);
InputPathList inputPaths;
inputPaths.push_back(&inputPath);
phraseTable->GetTargetPhraseCollectionBatch(inputPaths);
//EvaluateInIsolation ??
const TargetPhraseCollection* targetPhrases = inputPath.GetTargetPhrases(*phraseTable);
//sort by total score and prune
// - Already done?
//print phrases
const std::vector<FactorType>& output = StaticData::Instance().GetOutputFactorOrder();
if (targetPhrases) {
for (TargetPhraseCollection::const_iterator i = targetPhrases->begin(); i != targetPhrases->end(); ++i) {
const TargetPhrase* targetPhrase = *i;
out << sourcePhrase.GetStringRep(input);
out << " ||| ";
out << targetPhrase->GetStringRep(output);
out << " ||| ";
const ScoreComponentCollection scores = targetPhrase->GetScoreBreakdown();
vector<float> phraseScores = scores.GetScoresForProducer(phraseTable);
for (size_t j = 0; j < phraseScores.size(); ++j) {
out << exp(phraseScores[j]) << " ";
}
out << "||| ";
const AlignmentInfo& align = targetPhrase->GetAlignTerm();
for (AlignmentInfo::const_iterator j = align.begin(); j != align.end(); ++j) {
out << j->first << "-" << j->second << " ";
}
out << endl;
}
}
}
@ -63,7 +103,6 @@ int main(int argc, char** argv)
{
bool help;
string input_file;
string output_file;
string config_file;
@ -71,15 +110,15 @@ int main(int argc, char** argv)
desc.add_options()
("help,h", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
("input-file,i", po::value<string>(&input_file), "Input file")
("output-file,o", po::value<string>(&output_file), "Output file")
("config-file,f", po::value<string>(&config_file), "Config file")
;
po::options_description cmdline_options;
cmdline_options.add(desc);
po::variables_map vm;
po::store(po::command_line_parser(argc,argv).
options(cmdline_options).run(), vm);
po::parsed_options parsed = po::command_line_parser(argc,argv).
options(cmdline_options).allow_unregistered().run();
po::store(parsed, vm);
po::notify(vm);
if (help) {
usage(desc, argv);
@ -90,11 +129,6 @@ int main(int argc, char** argv)
usage(desc, argv);
exit(1);
}
if (output_file.empty()) {
cerr << "ERROR: Please specify an output file" << endl << endl;
usage(desc, argv);
exit(1);
}
if (config_file.empty()) {
cerr << "ERROR: Please specify a config file" << endl << endl;
usage(desc, argv);
@ -105,6 +139,19 @@ int main(int argc, char** argv)
mosesargs.push_back(argv[0]);
mosesargs.push_back("-f");
mosesargs.push_back(config_file);
for (size_t i = 0; i < parsed.options.size(); ++i) {
if (parsed.options[i].position_key == -1 && !parsed.options[i].unregistered) continue;
const string& key = parsed.options[i].string_key;
if (!key.empty()) {
mosesargs.push_back(key);
}
for (size_t j = 0; j < parsed.options[i].value.size(); ++j) {
const string& value = parsed.options[i].value[j];
if (!value.empty()) {
mosesargs.push_back(value);
}
}
}
boost::scoped_ptr<Parameter> params(new Parameter());
char** mosesargv = new char*[mosesargs.size()];
@ -122,11 +169,27 @@ int main(int argc, char** argv)
exit(1);
}
const StaticData &staticData = StaticData::Instance();
const std::vector<FactorType> & input = staticData.GetInputFactorOrder();
//Find the phrase table to evaluate with
PhraseDictionary* phraseTable = NULL;
const vector<FeatureFunction*>& ffs = FeatureFunction::GetFeatureFunctions();
for (size_t i = 0; i < ffs.size(); ++i) {
PhraseDictionary* maybePhraseTable = dynamic_cast< PhraseDictionary*>(ffs[i]);
if (maybePhraseTable) {
UTIL_THROW_IF(phraseTable,util::Exception,"Can only score translations with one phrase table");
phraseTable = maybePhraseTable;
}
}
UTIL_THROW_IF(!phraseTable,util::Exception,"Unable to find scoring phrase table");
Sentence sentence;
phraseTable->InitializeForInput(sentence);
//
//Load and prune the phrase table. This is taken (with mods) from moses/TranslationModel/RuleTable/LoaderStandard.cpp
//
const StaticData &staticData = StaticData::Instance();
string lineOrig;
@ -152,9 +215,11 @@ int main(int argc, char** argv)
util::TokenIter<util::MultiCharacter> pipes(line, "|||");
StringPiece sourcePhraseString(*pipes);
if (sourcePhraseString != previous) {
outputTopN(previous, staticData, cout);
outputTopN(previous, phraseTable, input, cout);
previous = sourcePhraseString;
}
}
outputTopN(previous, phraseTable, input, cout);