GHKM: write target parts-of-speech as a factor

This commit is contained in:
Matthias Huck 2015-03-09 21:54:03 +00:00
parent 524ed4406e
commit 25f5470216
11 changed files with 98 additions and 29 deletions

View File

@ -466,7 +466,9 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[],
("Minimal",
"extract minimal rules only")
("PartsOfSpeech",
"output parts-of-speech information (preterminals from the parse tree)")
"output parts-of-speech as property (preterminals from the parse tree)")
("PartsOfSpeechFactor",
"output parts-of-speech as factor (preterminals from the parse tree)")
("PCFG",
"include score based on PCFG scores in target corpus")
("PhraseOrientation",
@ -582,6 +584,9 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[],
if (vm.count("PartsOfSpeech")) {
options.partsOfSpeech = true;
}
if (vm.count("PartsOfSpeechFactor")) {
options.partsOfSpeechFactor = true;
}
if (vm.count("PCFG")) {
options.pcfg = true;
}
@ -672,15 +677,27 @@ void ExtractGHKM::WriteGlueGrammar(
const size_t sourceLabelGlueX = 1;
const size_t sourceLabelSentenceStart = 2;
const size_t sourceLabelSentenceEnd = 3;
const size_t partOfSpeechSentenceStart = 0;
const size_t partOfSpeechSentenceEnd = 1;
std::string sentenceStartSource = "<s>";
std::string sentenceEndSource = "</s>";
std::string sentenceStartTarget = "<s>";
std::string sentenceEndTarget = "</s>";
// const size_t partOfSpeechSentenceStart = 0;
// const size_t partOfSpeechSentenceEnd = 1;
#ifndef BOS_
#define BOS_ "<s>" //Beginning of sentence symbol
#endif
#ifndef EOS_
#define EOS_ "</s>" //End of sentence symbol
#endif
std::string sentenceStartSource = BOS_;
std::string sentenceEndSource = EOS_;
std::string sentenceStartTarget = BOS_;
std::string sentenceEndTarget = EOS_;
if (options.partsOfSpeech) {
sentenceStartTarget = sentenceStartTarget + "|" + sentenceStartTarget;
sentenceEndTarget = sentenceEndTarget + "|" + sentenceEndTarget;
sentenceStartTarget = sentenceStartTarget + "|" + BOS_;
sentenceEndTarget = sentenceEndTarget + "|" + EOS_;
}
if (options.partsOfSpeechFactor) {
sentenceStartTarget = sentenceStartTarget + "|" + BOS_;
sentenceEndTarget = sentenceEndTarget + "|" + EOS_;
}
// basic rules

View File

@ -18,8 +18,6 @@
***********************************************************************/
#pragma once
#ifndef EXTRACT_GHKM_EXTRACT_GHKM_H_
#define EXTRACT_GHKM_EXTRACT_GHKM_H_
#include <map>
#include <ostream>
@ -81,4 +79,3 @@ private:
} // namespace GHKM
} // namespace Moses
#endif

View File

@ -18,8 +18,6 @@
***********************************************************************/
#pragma once
#ifndef EXTRACT_GHKM_OPTIONS_H_
#define EXTRACT_GHKM_OPTIONS_H_
#include <string>
@ -41,6 +39,7 @@ public:
, maxScope(3)
, minimal(false)
, partsOfSpeech(false)
, partsOfSpeechFactor(false)
, pcfg(false)
, phraseOrientation(false)
, sentenceOffset(0)
@ -70,6 +69,7 @@ public:
int maxScope;
bool minimal;
bool partsOfSpeech;
bool partsOfSpeechFactor;
bool pcfg;
bool phraseOrientation;
int sentenceOffset;
@ -89,4 +89,3 @@ public:
} // namespace GHKM
} // namespace Moses
#endif

View File

@ -32,7 +32,8 @@ namespace GHKM
ScfgRule::ScfgRule(const Subgraph &fragment,
const MosesTraining::SyntaxTree *sourceSyntaxTree)
: m_sourceLHS("X", NonTerminal)
: m_graphFragment(fragment)
, m_sourceLHS("X", NonTerminal)
, m_targetLHS(fragment.GetRoot()->GetLabel(), NonTerminal)
, m_pcfgScore(fragment.GetPcfgScore())
, m_hasSourceLabels(sourceSyntaxTree)

View File

@ -18,8 +18,6 @@
***********************************************************************/
#pragma once
#ifndef EXTRACT_GHKM_SCFG_RULE_H_
#define EXTRACT_GHKM_SCFG_RULE_H_
#include "Alignment.h"
#include "Rule.h"
@ -45,6 +43,9 @@ public:
ScfgRule(const Subgraph &fragment,
const MosesTraining::SyntaxTree *sourceSyntaxTree = 0);
// Returns the graph fragment this rule was constructed from.
// NOTE: the rule stores the fragment as a reference member (it is not
// copied), so the returned reference is only valid while the Subgraph
// that was passed to the constructor is still alive.
const Subgraph &GetGraphFragment() const {
return m_graphFragment;
}
// Returns the left-hand-side symbol of the rule's source side.
const Symbol &GetSourceLHS() const {
return m_sourceLHS;
}
@ -81,6 +82,7 @@ private:
const Node *node,
const std::string &nonMatchingLabel);
const Subgraph& m_graphFragment;
Symbol m_sourceLHS;
Symbol m_targetLHS;
std::vector<Symbol> m_sourceRHS;
@ -94,4 +96,3 @@ private:
} // namespace GHKM
} // namespace Moses
#endif

View File

@ -121,6 +121,13 @@ void ScfgRuleWriter::WriteStandardFormat(const ScfgRule &rule,
}
}
// If parts-of-speech as a factor requested: retrieve preterminals from graph fragment
std::vector<std::string> partsOfSpeech;
if (m_options.partsOfSpeechFactor) {
const Subgraph &graphFragment = rule.GetGraphFragment();
graphFragment.GetPartsOfSpeech(partsOfSpeech);
}
// Write the source side of the rule to sourceSS.
int i = 0;
for (std::vector<Symbol>::const_iterator p(sourceRHS.begin());
@ -140,6 +147,7 @@ void ScfgRuleWriter::WriteStandardFormat(const ScfgRule &rule,
// Write the target side of the rule to targetSS.
i = 0;
int targetTerminalIndex = 0;
for (std::vector<Symbol>::const_iterator p(targetRHS.begin());
p != targetRHS.end(); ++p, ++i) {
if (p->GetType() == NonTerminal) {
@ -147,6 +155,12 @@ void ScfgRuleWriter::WriteStandardFormat(const ScfgRule &rule,
WriteSymbol(sourceRHS[sourceIndex], targetSS);
}
WriteSymbol(*p, targetSS);
// If parts-of-speech as a factor requested: write part-of-speech
if (m_options.partsOfSpeechFactor && (p->GetType() != NonTerminal)) {
assert(targetTerminalIndex<partsOfSpeech.size());
targetSS << "|" << partsOfSpeech[targetTerminalIndex];
++targetTerminalIndex;
}
targetSS << " ";
}
WriteSymbol(rule.GetTargetLHS(), targetSS);
@ -159,10 +173,16 @@ void ScfgRuleWriter::WriteUnpairedFormat(const ScfgRule &rule,
const std::vector<Symbol> &sourceRHS = rule.GetSourceRHS();
const std::vector<Symbol> &targetRHS = rule.GetTargetRHS();
// If parts-of-speech as a factor requested: retrieve preterminals from graph fragment
std::vector<std::string> partsOfSpeech;
if (m_options.partsOfSpeechFactor) {
const Subgraph &graphFragment = rule.GetGraphFragment();
graphFragment.GetPartsOfSpeech(partsOfSpeech);
}
// Write the source side of the rule to sourceSS.
int i = 0;
for (std::vector<Symbol>::const_iterator p(sourceRHS.begin());
p != sourceRHS.end(); ++p, ++i) {
p != sourceRHS.end(); ++p) {
WriteSymbol(*p, sourceSS);
sourceSS << " ";
}
@ -173,10 +193,16 @@ void ScfgRuleWriter::WriteUnpairedFormat(const ScfgRule &rule,
}
// Write the target side of the rule to targetSS.
i = 0;
int targetTerminalIndex = 0;
for (std::vector<Symbol>::const_iterator p(targetRHS.begin());
p != targetRHS.end(); ++p, ++i) {
p != targetRHS.end(); ++p) {
WriteSymbol(*p, targetSS);
// If parts-of-speech as a factor requested: write part-of-speech
if (m_options.partsOfSpeechFactor && (p->GetType() != NonTerminal)) {
assert(targetTerminalIndex<partsOfSpeech.size());
targetSS << "|" << partsOfSpeech[targetTerminalIndex];
++targetTerminalIndex;
}
targetSS << " ";
}
WriteSymbol(rule.GetTargetLHS(), targetSS);

View File

@ -18,8 +18,6 @@
***********************************************************************/
#pragma once
#ifndef EXTRACT_GHKM_RULE_WRITER_H_
#define EXTRACT_GHKM_RULE_WRITER_H_
#include "Subgraph.h"
@ -61,4 +59,3 @@ private:
} // namespace GHKM
} // namespace Moses
#endif

View File

@ -168,5 +168,30 @@ void Subgraph::RecursivelyPrintPartsOfSpeech(const Node *n, std::ostream &out) c
}
}
// Fills `out` with the part-of-speech labels found in this subgraph.
//
// Any previous contents of `out` are discarded. The labels are gathered by a
// recursive walk that starts at the subgraph's root node, and they appear in
// `out` in the order in which that walk encounters them.
void Subgraph::GetPartsOfSpeech(std::vector<std::string> &out) const
{
  out.clear();
  RecursivelyGetPartsOfSpeech(m_root, out);
}
void Subgraph::RecursivelyGetPartsOfSpeech(const Node *n, std::vector<std::string> &out) const
{
NodeType nodeType = n->GetType();
if (nodeType == TREE) {
if (m_leaves.find(n) == m_leaves.end()) {
const std::vector<Node *> &children = n->GetChildren();
for (std::vector<Node *>::const_iterator p(children.begin());
p != children.end(); ++p) {
Node *child = *p;
if (child->GetType() == TARGET) {
out.push_back(n->GetLabel());
} else {
RecursivelyGetPartsOfSpeech(child,out);
}
}
}
}
}
} // namespace GHKM
} // namespace Moses

View File

@ -18,8 +18,6 @@
***********************************************************************/
#pragma once
#ifndef EXTRACT_GHKM_SUBGRAPH_H_
#define EXTRACT_GHKM_SUBGRAPH_H_
#include "Node.h"
@ -118,6 +116,7 @@ public:
void GetTargetLeaves(std::vector<const Node *> &) const;
void PrintTree(std::ostream &out) const;
void PrintPartsOfSpeech(std::ostream &out) const;
void GetPartsOfSpeech(std::vector<std::string> &out) const;
private:
void GetTargetLeaves(const Node *, std::vector<const Node *> &) const;
@ -127,6 +126,7 @@ private:
int CountNodes(const Node *) const;
void RecursivelyPrintTree(const Node *n, std::ostream &out) const;
void RecursivelyPrintPartsOfSpeech(const Node *n, std::ostream &out) const;
void RecursivelyGetPartsOfSpeech(const Node *n, std::vector<std::string> &out) const;
const Node *m_root;
std::set<const Node *> m_leaves;
@ -139,4 +139,3 @@ private:
} // namespace GHKM
} // namespace Moses
#endif

View File

@ -2249,6 +2249,10 @@ sub define_training_extract_phrases {
my $parts_of_speech_labels_file = &versionize(&long_file_name("parts-of-speech","model",""));
$cmd .= "-ghkm-parts-of-speech-file $parts_of_speech_labels_file ";
}
if (&get("TRAINING:ghkm-parts-of-speech-factor")) {
$cmd .= "-ghkm-parts-of-speech-factor ";
}
}
my $extract_settings = &get("TRAINING:extract-settings");

View File

@ -96,6 +96,7 @@ my($_EXTERNAL_BINDIR,
$_GHKM_SOURCE_LABELS_FILE,
$_GHKM_PARTS_OF_SPEECH,
$_GHKM_PARTS_OF_SPEECH_FILE,
$_GHKM_PARTS_OF_SPEECH_FACTOR,
$_PCFG,
@_EXTRACT_OPTIONS,
@_SCORE_OPTIONS,
@ -204,6 +205,7 @@ $_HELP = 1
'ghkm-source-labels-file=s' => \$_GHKM_SOURCE_LABELS_FILE,
'ghkm-parts-of-speech' => \$_GHKM_PARTS_OF_SPEECH,
'ghkm-parts-of-speech-file=s' => \$_GHKM_PARTS_OF_SPEECH_FILE,
'ghkm-parts-of-speech-factor' => \$_GHKM_PARTS_OF_SPEECH_FACTOR,
'pcfg' => \$_PCFG,
'alt-direct-rule-score-1' => \$_ALT_DIRECT_RULE_SCORE_1,
'alt-direct-rule-score-2' => \$_ALT_DIRECT_RULE_SCORE_2,
@ -1543,6 +1545,7 @@ sub extract_phrase {
$cmd .= " --PhraseOrientationPriors $_PHRASE_ORIENTATION_PRIORS_FILE" if defined($_PHRASE_ORIENTATION_PRIORS_FILE);
$cmd .= " --SourceLabels" if $_GHKM_SOURCE_LABELS;
$cmd .= " --PartsOfSpeech" if $_GHKM_PARTS_OF_SPEECH;
$cmd .= " --PartsOfSpeechFactor" if $_GHKM_PARTS_OF_SPEECH_FACTOR;
}
else
{