Tree fragments in GHKM glue rules;

output of LHS tag in tree fragments for UNKs;
GHKMParse info is now denoted as Tree info
This commit is contained in:
Matthias Huck 2013-09-13 17:10:21 +02:00
parent bff123635e
commit c39bed60c0
13 changed files with 67 additions and 68 deletions

View File

@ -71,7 +71,7 @@ IOWrapper::IOWrapper(const std::vector<FactorType> &inputFactorOrder
,m_alignmentInfoStream(NULL)
,m_inputFilePath(inputFilePath)
,m_detailOutputCollector(NULL)
,m_detailGhkmOutputCollector(NULL)
,m_detailTreeFragmentsOutputCollector(NULL)
,m_nBestOutputCollector(NULL)
,m_searchGraphOutputCollector(NULL)
,m_singleBestOutputCollector(NULL)
@ -117,10 +117,10 @@ IOWrapper::IOWrapper(const std::vector<FactorType> &inputFactorOrder
m_detailOutputCollector = new Moses::OutputCollector(m_detailedTranslationReportingStream);
}
if (staticData.IsDetailedGhkmTranslationReportingEnabled()) {
const std::string &path = staticData.GetDetailedGhkmTranslationReportingFilePath();
m_detailedGhkmTranslationReportingStream = new std::ofstream(path.c_str());
m_detailGhkmOutputCollector = new Moses::OutputCollector(m_detailedGhkmTranslationReportingStream);
if (staticData.IsDetailedTreeFragmentsTranslationReportingEnabled()) {
const std::string &path = staticData.GetDetailedTreeFragmentsTranslationReportingFilePath();
m_detailedTreeFragmentsTranslationReportingStream = new std::ofstream(path.c_str());
m_detailTreeFragmentsOutputCollector = new Moses::OutputCollector(m_detailedTreeFragmentsTranslationReportingStream);
}
if (!staticData.GetAlignmentOutputFile().empty()) {
@ -137,7 +137,7 @@ IOWrapper::~IOWrapper()
}
delete m_outputSearchGraphStream;
delete m_detailedTranslationReportingStream;
delete m_detailedGhkmTranslationReportingStream;
delete m_detailedTreeFragmentsTranslationReportingStream;
delete m_alignmentInfoStream;
delete m_detailOutputCollector;
delete m_nBestOutputCollector;
@ -321,11 +321,11 @@ void IOWrapper::OutputTranslationOptions(std::ostream &out, ApplicationContext &
}
}
void IOWrapper::OutputGhkmTranslationOptions(std::ostream &out, ApplicationContext &applicationContext, const ChartHypothesis *hypo, const Sentence &sentence, long translationId)
void IOWrapper::OutputTreeFragmentsTranslationOptions(std::ostream &out, ApplicationContext &applicationContext, const ChartHypothesis *hypo, const Sentence &sentence, long translationId)
{
// recursive
if (hypo != NULL) {
const std::string key = "GHKMParse";
const std::string key = "Tree";
std::string value;
bool hasprop;
const TargetPhrase &currTarPhr = hypo->GetCurrTargetPhrase();
@ -340,11 +340,11 @@ void IOWrapper::OutputGhkmTranslationOptions(std::ostream &out, ApplicationConte
<< "-> " << hypo->GetCurrTargetPhrase()
<< " " << hypo->GetTotalScore() << hypo->GetScoreBreakdown();
out << std::endl;
out << " ||| ";
if (hasprop)
out << " " << value;
else
out << " " << "noGHKMParseInfo";
out << " " << "noTreeInfo";
out << std::endl;
}
@ -353,7 +353,7 @@ void IOWrapper::OutputGhkmTranslationOptions(std::ostream &out, ApplicationConte
std::vector<const ChartHypothesis*>::const_iterator iter;
for (iter = prevHypos.begin(); iter != prevHypos.end(); ++iter) {
const ChartHypothesis *prevHypo = *iter;
OutputGhkmTranslationOptions(out, applicationContext, prevHypo, sentence, translationId);
OutputTreeFragmentsTranslationOptions(out, applicationContext, prevHypo, sentence, translationId);
}
}
@ -373,7 +373,7 @@ void IOWrapper::OutputDetailedTranslationReport(
m_detailOutputCollector->Write(translationId, out.str());
}
void IOWrapper::OutputDetailedGhkmTranslationReport(
void IOWrapper::OutputDetailedTreeFragmentsTranslationReport(
const ChartHypothesis *hypo,
const Sentence &sentence,
long translationId)
@ -384,9 +384,9 @@ void IOWrapper::OutputDetailedGhkmTranslationReport(
std::ostringstream out;
ApplicationContext applicationContext;
OutputGhkmTranslationOptions(out, applicationContext, hypo, sentence, translationId);
CHECK(m_detailGhkmOutputCollector);
m_detailGhkmOutputCollector->Write(translationId, out.str());
OutputTreeFragmentsTranslationOptions(out, applicationContext, hypo, sentence, translationId);
CHECK(m_detailTreeFragmentsOutputCollector);
m_detailTreeFragmentsOutputCollector->Write(translationId, out.str());
}
void IOWrapper::OutputBestHypo(const ChartHypothesis *hypo, long translationId)

View File

@ -70,12 +70,12 @@ protected:
const Moses::FactorMask &m_inputFactorUsed;
std::ostream *m_outputSearchGraphStream;
std::ostream *m_detailedTranslationReportingStream;
std::ostream *m_detailedGhkmTranslationReportingStream;
std::ostream *m_detailedTreeFragmentsTranslationReportingStream;
std::ostream *m_alignmentInfoStream;
std::string m_inputFilePath;
std::istream *m_inputStream;
Moses::OutputCollector *m_detailOutputCollector;
Moses::OutputCollector *m_detailGhkmOutputCollector;
Moses::OutputCollector *m_detailTreeFragmentsOutputCollector;
Moses::OutputCollector *m_nBestOutputCollector;
Moses::OutputCollector *m_searchGraphOutputCollector;
Moses::OutputCollector *m_singleBestOutputCollector;
@ -86,7 +86,7 @@ protected:
size_t OutputAlignment(Alignments &retAlign, const Moses::ChartHypothesis *hypo, size_t startTarget);
void OutputAlignment(std::vector< std::set<size_t> > &retAlignmentsS2T, const Moses::AlignmentInfo &ai);
void OutputTranslationOptions(std::ostream &out, ApplicationContext &applicationContext, const Moses::ChartHypothesis *hypo, const Moses::Sentence &sentence, long translationId);
void OutputGhkmTranslationOptions(std::ostream &out, ApplicationContext &applicationContext, const Moses::ChartHypothesis *hypo, const Moses::Sentence &sentence, long translationId);
void OutputTreeFragmentsTranslationOptions(std::ostream &out, ApplicationContext &applicationContext, const Moses::ChartHypothesis *hypo, const Moses::Sentence &sentence, long translationId);
void ReconstructApplicationContext(const Moses::ChartHypothesis &hypo,
const Moses::Sentence &sentence,
ApplicationContext &context);
@ -117,7 +117,7 @@ public:
void OutputNBestList(const Moses::ChartTrellisPathList &nBestList, long translationId);
void OutputNBestList(const std::vector<search::Applied> &nbest, long translationId);
void OutputDetailedTranslationReport(const Moses::ChartHypothesis *hypo, const Moses::Sentence &sentence, long translationId);
void OutputDetailedGhkmTranslationReport(const Moses::ChartHypothesis *hypo, const Moses::Sentence &sentence, long translationId);
void OutputDetailedTreeFragmentsTranslationReport(const Moses::ChartHypothesis *hypo, const Moses::Sentence &sentence, long translationId);
void Backtrack(const Moses::ChartHypothesis *hypo);
void ResetTranslationId();

View File

@ -127,9 +127,9 @@ public:
const Sentence &sentence = dynamic_cast<const Sentence &>(*m_source);
m_ioWrapper.OutputDetailedTranslationReport(bestHypo, sentence, translationId);
}
if (staticData.IsDetailedGhkmTranslationReportingEnabled()) {
if (staticData.IsDetailedTreeFragmentsTranslationReportingEnabled()) {
const Sentence &sentence = dynamic_cast<const Sentence &>(*m_source);
m_ioWrapper.OutputDetailedGhkmTranslationReport(bestHypo, sentence, translationId);
m_ioWrapper.OutputDetailedTreeFragmentsTranslationReport(bestHypo, sentence, translationId);
}
// n-best

View File

@ -96,8 +96,8 @@ void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range
targetPhrase->SetTargetLHS(targetLHS);
targetPhrase->SetAlignmentInfo("0-0");
if (staticData.IsDetailedGhkmTranslationReportingEnabled()) {
targetPhrase->SetProperty("GHKMParse","( UNK "+sourceWord[0]->GetString().as_string()+" )");
if (staticData.IsDetailedTreeFragmentsTranslationReportingEnabled()) {
targetPhrase->SetProperty("Tree","( " + (*targetLHS)[0]->GetString().as_string() + " "+sourceWord[0]->GetString().as_string()+" )");
}
// chart rule

View File

@ -67,7 +67,7 @@ Parameter::Parameter()
AddParam("stack-diversity", "sd", "minimum number of hypothesis of each coverage in stack (default 0)");
AddParam("threads","th", "number of threads to use in decoding (defaults to single-threaded)");
AddParam("translation-details", "T", "for each best hypothesis, report translation details to the given file");
AddParam("ghkm-translation-details", "Tghkm", "for each hypothesis, report removed internal nodes to given file");
AddParam("tree-translation-details", "Ttree", "for each hypothesis, report translation details with tree fragment info to given file");
AddParam("translation-option-threshold", "tot", "threshold for translation options relative to best for input phrase");
AddParam("early-discarding-threshold", "edt", "threshold for constructing hypotheses based on estimate cost");
AddParam("verbose", "v", "verbosity level of the logging");

View File

@ -60,7 +60,7 @@ StaticData::StaticData()
,m_unknownWordPenaltyProducer(NULL)
,m_inputFeature(NULL)
,m_detailedTranslationReportingFilePath()
,m_detailedGhkmTranslationReportingFilePath()
,m_detailedTreeFragmentsTranslationReportingFilePath()
,m_onlyDistinctNBest(false)
,m_needAlignmentInfo(false)
,m_factorDelimiter("|") // default delimiter between factors
@ -308,12 +308,12 @@ bool StaticData::LoadData(Parameter *parameter)
return false;
}
}
if (m_parameter->isParamSpecified("ghkm-translation-details")) {
const vector<string> &args = m_parameter->GetParam("ghkm-translation-details");
if (m_parameter->isParamSpecified("tree-translation-details")) {
const vector<string> &args = m_parameter->GetParam("tree-translation-details");
if (args.size() == 1) {
m_detailedGhkmTranslationReportingFilePath = args[0];
m_detailedTreeFragmentsTranslationReportingFilePath = args[0];
} else {
UserMessage::Add(string("the ghkm-translation-details option requires exactly one filename argument"));
UserMessage::Add(string("the tree-translation-details option requires exactly one filename argument"));
return false;
}
}

View File

@ -137,7 +137,7 @@ protected:
bool m_reportAllFactors;
bool m_reportAllFactorsNBest;
std::string m_detailedTranslationReportingFilePath;
std::string m_detailedGhkmTranslationReportingFilePath;
std::string m_detailedTreeFragmentsTranslationReportingFilePath;
bool m_onlyDistinctNBest;
bool m_PrintAlignmentInfo;
bool m_needAlignmentInfo;
@ -368,11 +368,11 @@ public:
const std::string &GetDetailedTranslationReportingFilePath() const {
return m_detailedTranslationReportingFilePath;
}
bool IsDetailedGhkmTranslationReportingEnabled() const {
return !m_detailedGhkmTranslationReportingFilePath.empty();
bool IsDetailedTreeFragmentsTranslationReportingEnabled() const {
return !m_detailedTreeFragmentsTranslationReportingFilePath.empty();
}
const std::string &GetDetailedGhkmTranslationReportingFilePath() const {
return m_detailedGhkmTranslationReportingFilePath;
const std::string &GetDetailedTreeFragmentsTranslationReportingFilePath() const {
return m_detailedTreeFragmentsTranslationReportingFilePath;
}
bool IsLabeledNBestList() const {
return m_labeledNBestList;

View File

@ -26,7 +26,7 @@ void InternalStructFeature::add(const ScoreFeatureContext& context,
std::vector<float>& denseValues,
std::map<std::string,float>& sparseValues) const{
for(size_t i=0; i<context.phrasePair.size(); i++) {
add(&context.phrasePair[i]->ghkmParse, denseValues, sparseValues);
add(&context.phrasePair[i]->treeFragment, denseValues, sparseValues);
}
}

View File

@ -82,7 +82,7 @@ void PhraseAlignment::create( char line[], int lineID, bool includeSentenceIdFla
{
assert(phraseS.empty());
assert(phraseT.empty());
ghkmParse.clear();
treeFragment.clear();
vector< string > token = tokenize( line );
int item = 1;
@ -109,11 +109,11 @@ void PhraseAlignment::create( char line[], int lineID, bool includeSentenceIdFla
alignedToT[t].insert( s );
alignedToS[s].insert( t );
}
} else if ( (item >= 4) && (token[j] == "GHKMParse") ) { // check for information with a key field
} else if ( (item >= 4) && (token[j] == "Tree") ) { // check for information with a key field
++j;
while ( (j < token.size() ) && (token[j] != "|||") ) {
ghkmParse.append(" ");
ghkmParse.append(token[j]);
treeFragment.append(" ");
treeFragment.append(token[j]);
++j;
}
--j;

View File

@ -32,7 +32,7 @@ public:
float count;
int sentenceId;
std::string domain;
std::string ghkmParse;
std::string treeFragment;
std::vector< std::set<size_t> > alignedToT;
std::vector< std::set<size_t> > alignedToS;

View File

@ -446,22 +446,22 @@ void ExtractGHKM::WriteGlueGrammar(
}
// basic rules
out << "<s> [X] ||| <s> [" << topLabel << "] ||| 1 ||| " << std::endl;
out << "[X][" << topLabel << "] </s> [X] ||| [X][" << topLabel << "] </s> [" << topLabel << "] ||| 1 ||| 0-0 " << std::endl;
out << "<s> [X] ||| <s> [" << topLabel << "] ||| 1 ||| ||| ||| ||| {{Tree ( " << topLabel << " ( SSTART <s> ) )}}" << std::endl;
// Sentence-end rule. The target side contains the non-terminal [X][topLabel]
// followed by </s>, so the tree fragment needs a ( topLabel ) leaf for that
// non-terminal in addition to the ( SEND </s> ) pre-terminal -- the same
// one-leaf-per-non-terminal shape the top rules and glue rules below emit.
out << "[X][" << topLabel << "] </s> [X] ||| [X][" << topLabel << "] </s> [" << topLabel << "] ||| 1 ||| 0-0 ||| ||| ||| {{Tree ( " << topLabel << " ( " << topLabel << " ) ( SEND </s> ) )}}" << std::endl;
// top rules
for (std::map<std::string, int>::const_iterator i = topLabelSet.begin();
i != topLabelSet.end(); ++i) {
out << "<s> [X][" << i->first << "] </s> [X] ||| <s> [X][" << i->first << "] </s> [" << topLabel << "] ||| 1 ||| 1-1" << std::endl;
out << "<s> [X][" << i->first << "] </s> [X] ||| <s> [X][" << i->first << "] </s> [" << topLabel << "] ||| 1 ||| 1-1 ||| ||| ||| {{Tree ( " << topLabel << " ( SSTART <s> ) ( " << i->first << " ) ( SEND </s> ) )}}" << std::endl;
}
// glue rules
for(std::set<std::string>::const_iterator i = labelSet.begin();
i != labelSet.end(); i++ ) {
out << "[X][" << topLabel << "] [X][" << *i << "] [X] ||| [X][" << topLabel << "] [X][" << *i << "] [" << topLabel << "] ||| 2.718 ||| 0-0 1-1" << std::endl;
out << "[X][" << topLabel << "] [X][" << *i << "] [X] ||| [X][" << topLabel << "] [X][" << *i << "] [" << topLabel << "] ||| 2.718 ||| 0-0 1-1 ||| ||| ||| {{Tree ( " << topLabel << " ( "<< topLabel << " ) ( " << *i << " ) )}}" << std::endl;
}
// glue rule for unknown word...
out << "[X][" << topLabel << "] [X][X] [X] ||| [X][" << topLabel << "] [X][X] [" << topLabel << "] ||| 2.718 ||| 0-0 1-1 " << std::endl;
// The unknown-word glue rule has two non-terminals on its target side,
// [X][topLabel] and [X][X] (aligned 0-0 1-1), so the tree fragment must carry
// a leaf for each of them: ( topLabel ) and ( X ). This mirrors the regular
// glue rules, which emit ( topLabel ( topLabel ) ( label ) ).
out << "[X][" << topLabel << "] [X][X] [X] ||| [X][" << topLabel << "] [X][X] [" << topLabel << "] ||| 2.718 ||| 0-0 1-1 ||| ||| ||| {{Tree ( " << topLabel << " ( " << topLabel << " ) ( X ) )}}" << std::endl;
}
void ExtractGHKM::CollectWordLabelCounts(

View File

@ -168,7 +168,7 @@ void ScfgRuleWriter::WriteSymbol(const Symbol &symbol, std::ostream &out)
void ScfgRuleWriter::Write(const ScfgRule &rule, const Subgraph &g)
{
Write(rule,false);
m_fwd << " GHKMParse ";
m_fwd << " Tree ";
g.PrintTree(m_fwd);
m_fwd << std::endl;
m_inv << std::endl;

View File

@ -49,7 +49,7 @@ LexicalTable lexTable;
bool inverseFlag = false;
bool hierarchicalFlag = false;
bool pcfgFlag = false;
bool ghkmParseFlag = false;
bool treeFragmentsFlag = false;
bool unpairedExtractFormatFlag = false;
bool conditionOnTargetLhsFlag = false;
bool wordAlignmentFlag = true;
@ -78,7 +78,7 @@ vector<string> tokenize( const char [] );
void writeCountOfCounts( const string &fileNameCountOfCounts );
void processPhrasePairs( vector< PhraseAlignment > & , ostream &phraseTableFile, bool isSingleton, const ScoreFeatureManager& featureManager, const MaybeLog& maybeLog);
const PhraseAlignment &findBestAlignment(const PhraseAlignmentCollection &phrasePair );
const std::string &findBestGHKMParse(const PhraseAlignmentCollection &phrasePair );
const std::string &findBestTreeFragment(const PhraseAlignmentCollection &phrasePair );
void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float, int, ostream &phraseTableFile, bool isSingleton, const ScoreFeatureManager& featureManager, const MaybeLog& maybeLog );
double computeLexicalTranslation( const PHRASE &, const PHRASE &, const PhraseAlignment & );
double computeUnalignedPenalty( const PHRASE &, const PHRASE &, const PhraseAlignment & );
@ -98,7 +98,7 @@ int main(int argc, char* argv[])
ScoreFeatureManager featureManager;
if (argc < 4) {
cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--NoWordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] [--PCFG] [--GHKMParse] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--Singleton] [--CrossedNonTerm] \n";
cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--NoWordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] [--PCFG] [--TreeFragments] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--Singleton] [--CrossedNonTerm] \n";
cerr << featureManager.usage() << endl;
exit(1);
}
@ -119,9 +119,9 @@ int main(int argc, char* argv[])
} else if (strcmp(argv[i],"--PCFG") == 0) {
pcfgFlag = true;
cerr << "including PCFG scores\n";
} else if (strcmp(argv[i],"--GHKMParse") == 0) {
ghkmParseFlag = true;
cerr << "including GHKM parse\n";
} else if (strcmp(argv[i],"--TreeFragments") == 0) {
treeFragmentsFlag = true;
cerr << "including tree fragments from syntactic parse\n";
} else if (strcmp(argv[i],"--UnpairedExtractFormat") == 0) {
unpairedExtractFormatFlag = true;
cerr << "processing unpaired extract format\n";
@ -381,27 +381,27 @@ const PhraseAlignment &findBestAlignment(const PhraseAlignmentCollection &phrase
return *bestAlignment;
}
const std::string &findBestGHKMParse(const PhraseAlignmentCollection &phrasePair )
const std::string &findBestTreeFragment(const PhraseAlignmentCollection &phrasePair )
{
float bestGHKMParseCount = -1;
PhraseAlignment *bestGHKMParse = NULL;
float bestTreeFragmentCount = -1;
PhraseAlignment *bestTreeFragment = NULL;
for(size_t i=0; i<phrasePair.size(); i++) {
size_t ghkmParseInd;
size_t treeFragmentInd;
if (inverseFlag) {
// count backwards, so that the tree fragment chosen for ties will be the same for both normal & inverse scores
ghkmParseInd = phrasePair.size() - i - 1;
treeFragmentInd = phrasePair.size() - i - 1;
} else {
ghkmParseInd = i;
treeFragmentInd = i;
}
if (phrasePair[ghkmParseInd]->count > bestGHKMParseCount) {
bestGHKMParseCount = phrasePair[ghkmParseInd]->count;
bestGHKMParse = phrasePair[ghkmParseInd];
if (phrasePair[treeFragmentInd]->count > bestTreeFragmentCount) {
bestTreeFragmentCount = phrasePair[treeFragmentInd]->count;
bestTreeFragment = phrasePair[treeFragmentInd];
}
}
return bestGHKMParse->ghkmParse;
return bestTreeFragment->treeFragment;
}
@ -708,12 +708,11 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
}
}
// GHKM parse
if (ghkmParseFlag && !inverseFlag) {
const std::string &bestGHKMParse = findBestGHKMParse( phrasePair );
if ( !bestGHKMParse.empty() )
phraseTableFile << " ||| {{GHKMParse" << bestGHKMParse << "}}";
// tree fragments
if (treeFragmentsFlag && !inverseFlag) {
const std::string &bestTreeFragment = findBestTreeFragment( phrasePair );
if ( !bestTreeFragment.empty() )
phraseTableFile << " ||| {{Tree " << bestTreeFragment << "}}";
}