From 7cc75a0fa15b85861e2e3e8c7db88ca26c81b73f Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Tue, 30 Dec 2014 18:57:23 +0000 Subject: [PATCH] score-stsg: add --TreeScore option --- phrase-extract/score-stsg/Options.h | 4 ++-- phrase-extract/score-stsg/RuleGroup.cpp | 4 +++- phrase-extract/score-stsg/RuleGroup.h | 7 ++++--- phrase-extract/score-stsg/RuleTableWriter.cpp | 8 +++++--- phrase-extract/score-stsg/RuleTableWriter.h | 2 +- phrase-extract/score-stsg/ScoreStsg.cpp | 20 ++++++++++++------- 6 files changed, 28 insertions(+), 17 deletions(-) diff --git a/phrase-extract/score-stsg/Options.h b/phrase-extract/score-stsg/Options.h index 17b959c84..25e63a5c0 100644 --- a/phrase-extract/score-stsg/Options.h +++ b/phrase-extract/score-stsg/Options.h @@ -20,7 +20,7 @@ public: , negLogProb(false) , noLex(false) , noWordAlignment(false) - , pcfg(false) {} + , treeScore(false) {} // Positional options std::string extractFile; @@ -36,7 +36,7 @@ public: bool negLogProb; bool noLex; bool noWordAlignment; - bool pcfg; + bool treeScore; }; } // namespace ScoreStsg diff --git a/phrase-extract/score-stsg/RuleGroup.cpp b/phrase-extract/score-stsg/RuleGroup.cpp index bbbe3b2b6..a4e6ff3a2 100644 --- a/phrase-extract/score-stsg/RuleGroup.cpp +++ b/phrase-extract/score-stsg/RuleGroup.cpp @@ -15,7 +15,8 @@ void RuleGroup::SetNewSource(const StringPiece &source) } void RuleGroup::AddRule(const StringPiece &target, const StringPiece &ntAlign, - const StringPiece &fullAlign, int count) + const StringPiece &fullAlign, int count, + double treeScore) { if (m_distinctRules.empty() || ntAlign != m_distinctRules.back().ntAlign || @@ -27,6 +28,7 @@ void RuleGroup::AddRule(const StringPiece &target, const StringPiece &ntAlign, fullAlign.CopyToString(&r.alignments.back().first); r.alignments.back().second = count; r.count = count; + r.treeScore = treeScore; m_distinctRules.push_back(r); } else { DistinctRule &r = m_distinctRules.back(); diff --git a/phrase-extract/score-stsg/RuleGroup.h b/phrase-extract/score-stsg/RuleGroup.h index de0c25f17..8d9933263 100644 --- a/phrase-extract/score-stsg/RuleGroup.h +++ b/phrase-extract/score-stsg/RuleGroup.h @@ -25,13 +25,14 @@ class RuleGroup { public: // Stores the target-side and NT-alignment of a distinct rule. Also records - // the rule's count and the observed symbol alignments (plus their - // frequencies). + // the rule's count, the observed symbol alignments (plus their frequencies), + // and the tree score. struct DistinctRule { std::string target; std::string ntAlign; std::vector > alignments; int count; + double treeScore; }; typedef std::vector::const_iterator ConstIterator; @@ -58,7 +59,7 @@ public: // values will be checked against those of the previous rule only (in other // words, the input is assumed to be ordered). void AddRule(const StringPiece &target, const StringPiece &ntAlign, - const StringPiece &fullAlign, int count); + const StringPiece &fullAlign, int count, double treeScore); private: std::string m_source; diff --git a/phrase-extract/score-stsg/RuleTableWriter.cpp b/phrase-extract/score-stsg/RuleTableWriter.cpp index 0a1d5aa08..d7bbe9d7f 100644 --- a/phrase-extract/score-stsg/RuleTableWriter.cpp +++ b/phrase-extract/score-stsg/RuleTableWriter.cpp @@ -28,8 +28,8 @@ namespace ScoreStsg void RuleTableWriter::WriteLine(const TokenizedRuleHalf &source, const TokenizedRuleHalf &target, const std::string &bestAlignment, - double lexScore, int count, int totalCount, - int distinctCount) + double lexScore, double treeScore, int count, + int totalCount, int distinctCount) { if (m_options.inverse) { WriteRuleHalf(target); @@ -47,7 +47,9 @@ void RuleTableWriter::WriteLine(const TokenizedRuleHalf &source, m_out << MaybeLog(lexScore); } - // TODO PCFG + if (m_options.treeScore && !m_options.inverse) { + m_out << " " << MaybeLog(treeScore); + } m_out << " ||| " << totalCount << " " << count; if (m_options.kneserNey) { diff --git a/phrase-extract/score-stsg/RuleTableWriter.h b/phrase-extract/score-stsg/RuleTableWriter.h index db8924de3..340a4bf19 100644 --- a/phrase-extract/score-stsg/RuleTableWriter.h +++ b/phrase-extract/score-stsg/RuleTableWriter.h @@ -23,7 +23,7 @@ public: , m_out(out) {} void WriteLine(const TokenizedRuleHalf &, const TokenizedRuleHalf &, - const std::string &, double, int, int, int); + const std::string &, double, double, int, int, int); private: double MaybeLog(double a) const { diff --git a/phrase-extract/score-stsg/ScoreStsg.cpp b/phrase-extract/score-stsg/ScoreStsg.cpp index 04e3b5a44..642c5dc05 100644 --- a/phrase-extract/score-stsg/ScoreStsg.cpp +++ b/phrase-extract/score-stsg/ScoreStsg.cpp @@ -83,6 +83,12 @@ int ScoreStsg::Main(int argc, char *argv[]) StringPiece fullAlign = *it++; it->CopyToString(&tmp); int count = std::atoi(tmp.c_str()); + double treeScore = 0.0f; + if (m_options.treeScore && !m_options.inverse) { + ++it; + it->CopyToString(&tmp); + treeScore = std::atof(tmp.c_str()); + } // If this is the first line or if source has changed since the last // line then process the current rule group and start a new one. @@ -95,7 +101,7 @@ int ScoreStsg::Main(int argc, char *argv[]) } // Add the rule to the current rule group. - ruleGroup.AddRule(target, ntAlign, fullAlign, count); + ruleGroup.AddRule(target, ntAlign, fullAlign, count, treeScore); } // Process the final rule group. @@ -223,11 +229,9 @@ void ScoreStsg::ProcessRuleGroup(const RuleGroup &group, double lexProb = ComputeLexProb(m_sourceHalf.frontierSymbols, m_targetHalf.frontierSymbols, m_tgtToSrc); - // TODO PCFG score - // Write a line to the rule table. writer.WriteLine(m_sourceHalf, m_targetHalf, bestAlignment, lexProb, - p->count, totalCount, distinctCount); + rule.treeScore, p->count, totalCount, distinctCount); } } @@ -348,7 +352,9 @@ void ScoreStsg::ProcessOptions(int argc, char *argv[], Options &options) const ("NoWordAlignment", "do not output word alignments") ("PCFG", - "include pre-computed PCFG score from extract") + "synonym for TreeScore (included for compatibility with score)") + ("TreeScore", + "include pre-computed tree score from extract") ("UnpairedExtractFormat", "ignored (included for compatibility with score)") ; @@ -429,8 +435,8 @@ void ScoreStsg::ProcessOptions(int argc, char *argv[], Options &options) const if (vm.count("NoWordAlignment")) { options.noWordAlignment = true; } - if (vm.count("PCFG")) { - options.pcfg = true; + if (vm.count("TreeScore") || vm.count("PCFG")) { + options.treeScore = true; } }