mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-10-26 19:37:58 +03:00
extract-ghkm: tweak label collection for unknown words
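Produce a better label set when unary rule elimination is enabled: for a word at the end of a chain of unary rewrites (e.g. PN-SB -> NE -> word), record the constituent label at the top of the chain instead of the part-of-speech label.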
This commit is contained in:
parent
fb8d20a22f
commit
0ca5b8932a
@ -144,7 +144,7 @@ int ExtractGHKM::Main(int argc, char *argv[])
|
|||||||
|
|
||||||
// Record word counts.
|
// Record word counts.
|
||||||
if (!options.unknownWordFile.empty()) {
|
if (!options.unknownWordFile.empty()) {
|
||||||
CollectWordLabelCounts(*t, wordCount, wordLabel);
|
CollectWordLabelCounts(*t, options, wordCount, wordLabel);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Form an alignment graph from the target tree, source words, and
|
// Form an alignment graph from the target tree, source words, and
|
||||||
@ -458,6 +458,7 @@ void ExtractGHKM::WriteGlueGrammar(
|
|||||||
|
|
||||||
void ExtractGHKM::CollectWordLabelCounts(
|
void ExtractGHKM::CollectWordLabelCounts(
|
||||||
ParseTree &root,
|
ParseTree &root,
|
||||||
|
const Options &options,
|
||||||
std::map<std::string, int> &wordCount,
|
std::map<std::string, int> &wordCount,
|
||||||
std::map<std::string, std::string> &wordLabel)
|
std::map<std::string, std::string> &wordLabel)
|
||||||
{
|
{
|
||||||
@ -467,7 +468,18 @@ void ExtractGHKM::CollectWordLabelCounts(
|
|||||||
p != leaves.end(); ++p) {
|
p != leaves.end(); ++p) {
|
||||||
const ParseTree &leaf = **p;
|
const ParseTree &leaf = **p;
|
||||||
const std::string &word = leaf.GetLabel();
|
const std::string &word = leaf.GetLabel();
|
||||||
const std::string &label = leaf.GetParent()->GetLabel();
|
const ParseTree *ancestor = leaf.GetParent();
|
||||||
|
// If unary rule elimination is enabled and this word is at the end of a
|
||||||
|
// chain of unary rewrites, e.g.
|
||||||
|
// PN-SB -> NE -> word
|
||||||
|
// then record the constituent label at the top of the chain instead of
|
||||||
|
// the part-of-speech label.
|
||||||
|
while (!options.allowUnary &&
|
||||||
|
ancestor->GetParent() &&
|
||||||
|
ancestor->GetParent()->GetChildren().size() == 1) {
|
||||||
|
ancestor = ancestor->GetParent();
|
||||||
|
}
|
||||||
|
const std::string &label = ancestor->GetLabel();
|
||||||
++wordCount[word];
|
++wordCount[word];
|
||||||
wordLabel[word] = label;
|
wordLabel[word] = label;
|
||||||
}
|
}
|
||||||
|
@ -49,6 +49,7 @@ class ExtractGHKM
|
|||||||
void OpenOutputFileOrDie(const std::string &, OutputFileStream &);
|
void OpenOutputFileOrDie(const std::string &, OutputFileStream &);
|
||||||
void RecordTreeLabels(const ParseTree &, std::set<std::string> &);
|
void RecordTreeLabels(const ParseTree &, std::set<std::string> &);
|
||||||
void CollectWordLabelCounts(ParseTree &,
|
void CollectWordLabelCounts(ParseTree &,
|
||||||
|
const Options &,
|
||||||
std::map<std::string, int> &,
|
std::map<std::string, int> &,
|
||||||
std::map<std::string, std::string> &);
|
std::map<std::string, std::string> &);
|
||||||
void WriteUnknownWordLabel(const std::map<std::string, int> &,
|
void WriteUnknownWordLabel(const std::map<std::string, int> &,
|
||||||
|
Loading…
Reference in New Issue
Block a user