mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-26 13:23:25 +03:00
GHKM: write target parts-of-speech as a factor
This commit is contained in:
parent
524ed4406e
commit
25f5470216
@ -466,7 +466,9 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[],
|
||||
("Minimal",
|
||||
"extract minimal rules only")
|
||||
("PartsOfSpeech",
|
||||
"output parts-of-speech information (preterminals from the parse tree)")
|
||||
"output parts-of-speech as property (preterminals from the parse tree)")
|
||||
("PartsOfSpeechFactor",
|
||||
"output parts-of-speech as factor (preterminals from the parse tree)")
|
||||
("PCFG",
|
||||
"include score based on PCFG scores in target corpus")
|
||||
("PhraseOrientation",
|
||||
@ -582,6 +584,9 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[],
|
||||
if (vm.count("PartsOfSpeech")) {
|
||||
options.partsOfSpeech = true;
|
||||
}
|
||||
if (vm.count("PartsOfSpeechFactor")) {
|
||||
options.partsOfSpeechFactor = true;
|
||||
}
|
||||
if (vm.count("PCFG")) {
|
||||
options.pcfg = true;
|
||||
}
|
||||
@ -672,15 +677,27 @@ void ExtractGHKM::WriteGlueGrammar(
|
||||
const size_t sourceLabelGlueX = 1;
|
||||
const size_t sourceLabelSentenceStart = 2;
|
||||
const size_t sourceLabelSentenceEnd = 3;
|
||||
const size_t partOfSpeechSentenceStart = 0;
|
||||
const size_t partOfSpeechSentenceEnd = 1;
|
||||
std::string sentenceStartSource = "<s>";
|
||||
std::string sentenceEndSource = "</s>";
|
||||
std::string sentenceStartTarget = "<s>";
|
||||
std::string sentenceEndTarget = "</s>";
|
||||
// const size_t partOfSpeechSentenceStart = 0;
|
||||
// const size_t partOfSpeechSentenceEnd = 1;
|
||||
|
||||
#ifndef BOS_
|
||||
#define BOS_ "<s>" //Beginning of sentence symbol
|
||||
#endif
|
||||
#ifndef EOS_
|
||||
#define EOS_ "</s>" //End of sentence symbol
|
||||
#endif
|
||||
|
||||
std::string sentenceStartSource = BOS_;
|
||||
std::string sentenceEndSource = EOS_;
|
||||
std::string sentenceStartTarget = BOS_;
|
||||
std::string sentenceEndTarget = EOS_;
|
||||
if (options.partsOfSpeech) {
|
||||
sentenceStartTarget = sentenceStartTarget + "|" + sentenceStartTarget;
|
||||
sentenceEndTarget = sentenceEndTarget + "|" + sentenceEndTarget;
|
||||
sentenceStartTarget = sentenceStartTarget + "|" + BOS_;
|
||||
sentenceEndTarget = sentenceEndTarget + "|" + EOS_;
|
||||
}
|
||||
if (options.partsOfSpeechFactor) {
|
||||
sentenceStartTarget = sentenceStartTarget + "|" + BOS_;
|
||||
sentenceEndTarget = sentenceEndTarget + "|" + EOS_;
|
||||
}
|
||||
|
||||
// basic rules
|
||||
|
@ -18,8 +18,6 @@
|
||||
***********************************************************************/
|
||||
|
||||
#pragma once
|
||||
#ifndef EXTRACT_GHKM_EXTRACT_GHKM_H_
|
||||
#define EXTRACT_GHKM_EXTRACT_GHKM_H_
|
||||
|
||||
#include <map>
|
||||
#include <ostream>
|
||||
@ -81,4 +79,3 @@ private:
|
||||
} // namespace GHKM
|
||||
} // namespace Moses
|
||||
|
||||
#endif
|
||||
|
@ -18,8 +18,6 @@
|
||||
***********************************************************************/
|
||||
|
||||
#pragma once
|
||||
#ifndef EXTRACT_GHKM_OPTIONS_H_
|
||||
#define EXTRACT_GHKM_OPTIONS_H_
|
||||
|
||||
#include <string>
|
||||
|
||||
@ -41,6 +39,7 @@ public:
|
||||
, maxScope(3)
|
||||
, minimal(false)
|
||||
, partsOfSpeech(false)
|
||||
, partsOfSpeechFactor(false)
|
||||
, pcfg(false)
|
||||
, phraseOrientation(false)
|
||||
, sentenceOffset(0)
|
||||
@ -70,6 +69,7 @@ public:
|
||||
int maxScope;
|
||||
bool minimal;
|
||||
bool partsOfSpeech;
|
||||
bool partsOfSpeechFactor;
|
||||
bool pcfg;
|
||||
bool phraseOrientation;
|
||||
int sentenceOffset;
|
||||
@ -89,4 +89,3 @@ public:
|
||||
} // namespace GHKM
|
||||
} // namespace Moses
|
||||
|
||||
#endif
|
||||
|
@ -32,7 +32,8 @@ namespace GHKM
|
||||
|
||||
ScfgRule::ScfgRule(const Subgraph &fragment,
|
||||
const MosesTraining::SyntaxTree *sourceSyntaxTree)
|
||||
: m_sourceLHS("X", NonTerminal)
|
||||
: m_graphFragment(fragment)
|
||||
, m_sourceLHS("X", NonTerminal)
|
||||
, m_targetLHS(fragment.GetRoot()->GetLabel(), NonTerminal)
|
||||
, m_pcfgScore(fragment.GetPcfgScore())
|
||||
, m_hasSourceLabels(sourceSyntaxTree)
|
||||
|
@ -18,8 +18,6 @@
|
||||
***********************************************************************/
|
||||
|
||||
#pragma once
|
||||
#ifndef EXTRACT_GHKM_SCFG_RULE_H_
|
||||
#define EXTRACT_GHKM_SCFG_RULE_H_
|
||||
|
||||
#include "Alignment.h"
|
||||
#include "Rule.h"
|
||||
@ -45,6 +43,9 @@ public:
|
||||
ScfgRule(const Subgraph &fragment,
|
||||
const MosesTraining::SyntaxTree *sourceSyntaxTree = 0);
|
||||
|
||||
// Returns the subgraph this rule was constructed from.
// NOTE: m_graphFragment is a reference member bound to the constructor's
// `fragment` argument (no copy is made), so the returned reference is only
// valid while that Subgraph object is still alive.
const Subgraph &GetGraphFragment() const {
|
||||
return m_graphFragment;
|
||||
}
|
||||
// Read-only access to the rule's source-side left-hand-side symbol
// (the m_sourceLHS member).
const Symbol &GetSourceLHS() const {
|
||||
return m_sourceLHS;
|
||||
}
|
||||
@ -81,6 +82,7 @@ private:
|
||||
const Node *node,
|
||||
const std::string &nonMatchingLabel);
|
||||
|
||||
const Subgraph& m_graphFragment;
|
||||
Symbol m_sourceLHS;
|
||||
Symbol m_targetLHS;
|
||||
std::vector<Symbol> m_sourceRHS;
|
||||
@ -94,4 +96,3 @@ private:
|
||||
} // namespace GHKM
|
||||
} // namespace Moses
|
||||
|
||||
#endif
|
||||
|
@ -121,6 +121,13 @@ void ScfgRuleWriter::WriteStandardFormat(const ScfgRule &rule,
|
||||
}
|
||||
}
|
||||
|
||||
// If parts-of-speech as a factor requested: retrieve preterminals from graph fragment
|
||||
std::vector<std::string> partsOfSpeech;
|
||||
if (m_options.partsOfSpeechFactor) {
|
||||
const Subgraph &graphFragment = rule.GetGraphFragment();
|
||||
graphFragment.GetPartsOfSpeech(partsOfSpeech);
|
||||
}
|
||||
|
||||
// Write the source side of the rule to sourceSS.
|
||||
int i = 0;
|
||||
for (std::vector<Symbol>::const_iterator p(sourceRHS.begin());
|
||||
@ -140,6 +147,7 @@ void ScfgRuleWriter::WriteStandardFormat(const ScfgRule &rule,
|
||||
|
||||
// Write the target side of the rule to targetSS.
|
||||
i = 0;
|
||||
int targetTerminalIndex = 0;
|
||||
for (std::vector<Symbol>::const_iterator p(targetRHS.begin());
|
||||
p != targetRHS.end(); ++p, ++i) {
|
||||
if (p->GetType() == NonTerminal) {
|
||||
@ -147,6 +155,12 @@ void ScfgRuleWriter::WriteStandardFormat(const ScfgRule &rule,
|
||||
WriteSymbol(sourceRHS[sourceIndex], targetSS);
|
||||
}
|
||||
WriteSymbol(*p, targetSS);
|
||||
// If parts-of-speech as a factor requested: write part-of-speech
|
||||
if (m_options.partsOfSpeechFactor && (p->GetType() != NonTerminal)) {
|
||||
assert(targetTerminalIndex<partsOfSpeech.size());
|
||||
targetSS << "|" << partsOfSpeech[targetTerminalIndex];
|
||||
++targetTerminalIndex;
|
||||
}
|
||||
targetSS << " ";
|
||||
}
|
||||
WriteSymbol(rule.GetTargetLHS(), targetSS);
|
||||
@ -159,10 +173,16 @@ void ScfgRuleWriter::WriteUnpairedFormat(const ScfgRule &rule,
|
||||
const std::vector<Symbol> &sourceRHS = rule.GetSourceRHS();
|
||||
const std::vector<Symbol> &targetRHS = rule.GetTargetRHS();
|
||||
|
||||
// If parts-of-speech as a factor requested: retrieve preterminals from graph fragment
|
||||
std::vector<std::string> partsOfSpeech;
|
||||
if (m_options.partsOfSpeechFactor) {
|
||||
const Subgraph &graphFragment = rule.GetGraphFragment();
|
||||
graphFragment.GetPartsOfSpeech(partsOfSpeech);
|
||||
}
|
||||
|
||||
// Write the source side of the rule to sourceSS.
|
||||
int i = 0;
|
||||
for (std::vector<Symbol>::const_iterator p(sourceRHS.begin());
|
||||
p != sourceRHS.end(); ++p, ++i) {
|
||||
p != sourceRHS.end(); ++p) {
|
||||
WriteSymbol(*p, sourceSS);
|
||||
sourceSS << " ";
|
||||
}
|
||||
@ -173,10 +193,16 @@ void ScfgRuleWriter::WriteUnpairedFormat(const ScfgRule &rule,
|
||||
}
|
||||
|
||||
// Write the target side of the rule to targetSS.
|
||||
i = 0;
|
||||
int targetTerminalIndex = 0;
|
||||
for (std::vector<Symbol>::const_iterator p(targetRHS.begin());
|
||||
p != targetRHS.end(); ++p, ++i) {
|
||||
p != targetRHS.end(); ++p) {
|
||||
WriteSymbol(*p, targetSS);
|
||||
// If parts-of-speech as a factor requested: write part-of-speech
|
||||
if (m_options.partsOfSpeechFactor && (p->GetType() != NonTerminal)) {
|
||||
assert(targetTerminalIndex<partsOfSpeech.size());
|
||||
targetSS << "|" << partsOfSpeech[targetTerminalIndex];
|
||||
++targetTerminalIndex;
|
||||
}
|
||||
targetSS << " ";
|
||||
}
|
||||
WriteSymbol(rule.GetTargetLHS(), targetSS);
|
||||
|
@ -18,8 +18,6 @@
|
||||
***********************************************************************/
|
||||
|
||||
#pragma once
|
||||
#ifndef EXTRACT_GHKM_RULE_WRITER_H_
|
||||
#define EXTRACT_GHKM_RULE_WRITER_H_
|
||||
|
||||
#include "Subgraph.h"
|
||||
|
||||
@ -61,4 +59,3 @@ private:
|
||||
} // namespace GHKM
|
||||
} // namespace Moses
|
||||
|
||||
#endif
|
||||
|
@ -168,5 +168,30 @@ void Subgraph::RecursivelyPrintPartsOfSpeech(const Node *n, std::ostream &out) c
|
||||
}
|
||||
}
|
||||
|
||||
void Subgraph::GetPartsOfSpeech(std::vector<std::string> &out) const
|
||||
{
|
||||
out.clear();
|
||||
RecursivelyGetPartsOfSpeech(m_root,out);
|
||||
}
|
||||
|
||||
void Subgraph::RecursivelyGetPartsOfSpeech(const Node *n, std::vector<std::string> &out) const
|
||||
{
|
||||
NodeType nodeType = n->GetType();
|
||||
if (nodeType == TREE) {
|
||||
if (m_leaves.find(n) == m_leaves.end()) {
|
||||
const std::vector<Node *> &children = n->GetChildren();
|
||||
for (std::vector<Node *>::const_iterator p(children.begin());
|
||||
p != children.end(); ++p) {
|
||||
Node *child = *p;
|
||||
if (child->GetType() == TARGET) {
|
||||
out.push_back(n->GetLabel());
|
||||
} else {
|
||||
RecursivelyGetPartsOfSpeech(child,out);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace GHKM
|
||||
} // namespace Moses
|
||||
|
@ -18,8 +18,6 @@
|
||||
***********************************************************************/
|
||||
|
||||
#pragma once
|
||||
#ifndef EXTRACT_GHKM_SUBGRAPH_H_
|
||||
#define EXTRACT_GHKM_SUBGRAPH_H_
|
||||
|
||||
#include "Node.h"
|
||||
|
||||
@ -118,6 +116,7 @@ public:
|
||||
void GetTargetLeaves(std::vector<const Node *> &) const;
|
||||
void PrintTree(std::ostream &out) const;
|
||||
void PrintPartsOfSpeech(std::ostream &out) const;
|
||||
void GetPartsOfSpeech(std::vector<std::string> &out) const;
|
||||
|
||||
private:
|
||||
void GetTargetLeaves(const Node *, std::vector<const Node *> &) const;
|
||||
@ -127,6 +126,7 @@ private:
|
||||
int CountNodes(const Node *) const;
|
||||
void RecursivelyPrintTree(const Node *n, std::ostream &out) const;
|
||||
void RecursivelyPrintPartsOfSpeech(const Node *n, std::ostream &out) const;
|
||||
void RecursivelyGetPartsOfSpeech(const Node *n, std::vector<std::string> &out) const;
|
||||
|
||||
const Node *m_root;
|
||||
std::set<const Node *> m_leaves;
|
||||
@ -139,4 +139,3 @@ private:
|
||||
} // namespace GHKM
|
||||
} // namespace Moses
|
||||
|
||||
#endif
|
||||
|
@ -2249,6 +2249,10 @@ sub define_training_extract_phrases {
|
||||
my $parts_of_speech_labels_file = &versionize(&long_file_name("parts-of-speech","model",""));
|
||||
$cmd .= "-ghkm-parts-of-speech-file $parts_of_speech_labels_file ";
|
||||
}
|
||||
|
||||
if (&get("TRAINING:ghkm-parts-of-speech-factor")) {
|
||||
$cmd .= "-ghkm-parts-of-speech-factor ";
|
||||
}
|
||||
}
|
||||
|
||||
my $extract_settings = &get("TRAINING:extract-settings");
|
||||
|
@ -96,6 +96,7 @@ my($_EXTERNAL_BINDIR,
|
||||
$_GHKM_SOURCE_LABELS_FILE,
|
||||
$_GHKM_PARTS_OF_SPEECH,
|
||||
$_GHKM_PARTS_OF_SPEECH_FILE,
|
||||
$_GHKM_PARTS_OF_SPEECH_FACTOR,
|
||||
$_PCFG,
|
||||
@_EXTRACT_OPTIONS,
|
||||
@_SCORE_OPTIONS,
|
||||
@ -204,6 +205,7 @@ $_HELP = 1
|
||||
'ghkm-source-labels-file=s' => \$_GHKM_SOURCE_LABELS_FILE,
|
||||
'ghkm-parts-of-speech' => \$_GHKM_PARTS_OF_SPEECH,
|
||||
'ghkm-parts-of-speech-file=s' => \$_GHKM_PARTS_OF_SPEECH_FILE,
|
||||
'ghkm-parts-of-speech-factor' => \$_GHKM_PARTS_OF_SPEECH_FACTOR,
|
||||
'pcfg' => \$_PCFG,
|
||||
'alt-direct-rule-score-1' => \$_ALT_DIRECT_RULE_SCORE_1,
|
||||
'alt-direct-rule-score-2' => \$_ALT_DIRECT_RULE_SCORE_2,
|
||||
@ -1543,6 +1545,7 @@ sub extract_phrase {
|
||||
$cmd .= " --PhraseOrientationPriors $_PHRASE_ORIENTATION_PRIORS_FILE" if defined($_PHRASE_ORIENTATION_PRIORS_FILE);
|
||||
$cmd .= " --SourceLabels" if $_GHKM_SOURCE_LABELS;
|
||||
$cmd .= " --PartsOfSpeech" if $_GHKM_PARTS_OF_SPEECH;
|
||||
$cmd .= " --PartsOfSpeechFactor" if $_GHKM_PARTS_OF_SPEECH_FACTOR;
|
||||
}
|
||||
else
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user