score-stsg: add --TreeScore option

This commit is contained in:
Phil Williams 2014-12-30 18:57:23 +00:00
parent b7650b2d8b
commit 7cc75a0fa1
6 changed files with 28 additions and 17 deletions

View File

@ -20,7 +20,7 @@ public:
, negLogProb(false)
, noLex(false)
, noWordAlignment(false)
, pcfg(false) {}
, treeScore(false) {}
// Positional options
std::string extractFile;
@ -36,7 +36,7 @@ public:
bool negLogProb;
bool noLex;
bool noWordAlignment;
bool pcfg;
bool treeScore;
};
} // namespace ScoreStsg

View File

@ -15,7 +15,8 @@ void RuleGroup::SetNewSource(const StringPiece &source)
}
void RuleGroup::AddRule(const StringPiece &target, const StringPiece &ntAlign,
const StringPiece &fullAlign, int count)
const StringPiece &fullAlign, int count,
double treeScore)
{
if (m_distinctRules.empty() ||
ntAlign != m_distinctRules.back().ntAlign ||
@ -27,6 +28,7 @@ void RuleGroup::AddRule(const StringPiece &target, const StringPiece &ntAlign,
fullAlign.CopyToString(&r.alignments.back().first);
r.alignments.back().second = count;
r.count = count;
r.treeScore = treeScore;
m_distinctRules.push_back(r);
} else {
DistinctRule &r = m_distinctRules.back();

View File

@ -25,13 +25,14 @@ class RuleGroup
{
public:
// Stores the target-side and NT-alignment of a distinct rule. Also records
// the rule's count and the observed symbol alignments (plus their
// frequencies).
// the rule's count, the observed symbol alignments (plus their frequencies),
// and the tree score.
// Per-rule record appended to the group's list of distinct rules by AddRule.
struct DistinctRule {
// Target side of the rule.
std::string target;
// Non-terminal alignment of the rule.
std::string ntAlign;
// Observed symbol alignments: first is the alignment string, second is its
// frequency.
std::vector<std::pair<std::string, int> > alignments;
// The rule's count.
int count;
// Tree score; set from AddRule's treeScore argument when the rule is created.
double treeScore;
};
typedef std::vector<DistinctRule>::const_iterator ConstIterator;
@ -58,7 +59,7 @@ public:
// values will be checked against those of the previous rule only (in other
// words, the input is assumed to be ordered).
void AddRule(const StringPiece &target, const StringPiece &ntAlign,
const StringPiece &fullAlign, int count);
const StringPiece &fullAlign, int count, double treeScore);
private:
std::string m_source;

View File

@ -28,8 +28,8 @@ namespace ScoreStsg
void RuleTableWriter::WriteLine(const TokenizedRuleHalf &source,
const TokenizedRuleHalf &target,
const std::string &bestAlignment,
double lexScore, int count, int totalCount,
int distinctCount)
double lexScore, double treeScore, int count,
int totalCount, int distinctCount)
{
if (m_options.inverse) {
WriteRuleHalf(target);
@ -47,7 +47,9 @@ void RuleTableWriter::WriteLine(const TokenizedRuleHalf &source,
m_out << MaybeLog(lexScore);
}
// TODO PCFG
if (m_options.treeScore && !m_options.inverse) {
m_out << " " << MaybeLog(treeScore);
}
m_out << " ||| " << totalCount << " " << count;
if (m_options.kneserNey) {

View File

@ -23,7 +23,7 @@ public:
, m_out(out) {}
void WriteLine(const TokenizedRuleHalf &, const TokenizedRuleHalf &,
const std::string &, double, int, int, int);
const std::string &, double, double, int, int, int);
private:
double MaybeLog(double a) const {

View File

@ -83,6 +83,12 @@ int ScoreStsg::Main(int argc, char *argv[])
StringPiece fullAlign = *it++;
it->CopyToString(&tmp);
int count = std::atoi(tmp.c_str());
double treeScore = 0.0f;
if (m_options.treeScore && !m_options.inverse) {
++it;
it->CopyToString(&tmp);
treeScore = std::atof(tmp.c_str());
}
// If this is the first line or if source has changed since the last
// line then process the current rule group and start a new one.
@ -95,7 +101,7 @@ int ScoreStsg::Main(int argc, char *argv[])
}
// Add the rule to the current rule group.
ruleGroup.AddRule(target, ntAlign, fullAlign, count);
ruleGroup.AddRule(target, ntAlign, fullAlign, count, treeScore);
}
// Process the final rule group.
@ -223,11 +229,9 @@ void ScoreStsg::ProcessRuleGroup(const RuleGroup &group,
double lexProb = ComputeLexProb(m_sourceHalf.frontierSymbols,
m_targetHalf.frontierSymbols, m_tgtToSrc);
// TODO PCFG score
// Write a line to the rule table.
writer.WriteLine(m_sourceHalf, m_targetHalf, bestAlignment, lexProb,
p->count, totalCount, distinctCount);
rule.treeScore, p->count, totalCount, distinctCount);
}
}
@ -348,7 +352,9 @@ void ScoreStsg::ProcessOptions(int argc, char *argv[], Options &options) const
("NoWordAlignment",
"do not output word alignments")
("PCFG",
"include pre-computed PCFG score from extract")
"synonym for TreeScore (included for compatibility with score)")
("TreeScore",
"include pre-computed tree score from extract")
("UnpairedExtractFormat",
"ignored (included for compatibility with score)")
;
@ -429,8 +435,8 @@ void ScoreStsg::ProcessOptions(int argc, char *argv[], Options &options) const
if (vm.count("NoWordAlignment")) {
options.noWordAlignment = true;
}
if (vm.count("PCFG")) {
options.pcfg = true;
if (vm.count("TreeScore") || vm.count("PCFG")) {
options.treeScore = true;
}
}