GHKM: extract POS phrase property (from preterminals in the syntactic parse tree)

This commit is contained in:
Matthias Huck 2015-03-04 21:40:56 +00:00
parent 6d9b6764a6
commit 06e87d851e
12 changed files with 155 additions and 36 deletions

View File

@ -240,10 +240,11 @@ void ExtractionPhrasePair::AddProperties( const std::string &propertiesString, f
tok = tok.substr(0, endPos - 1);
vector<std::string> keyValue = Moses::TokenizeFirstOnly(tok, " ");
assert(keyValue.size() == 2);
if (keyValue.size() == 2) {
AddProperty(keyValue[0], keyValue[1], count);
}
}
}
const ALIGNMENT *ExtractionPhrasePair::FindBestAlignmentTargetToSource() const
@ -555,6 +556,27 @@ void ExtractionPhrasePair::CollectAllPhraseOrientations(const std::string &key,
}
void ExtractionPhrasePair::UpdateVocabularyFromValueTokens(const std::string& propertyKey,
std::set<std::string>& vocabulary) const
{
const PROPERTY_VALUES *allPropertyValues = GetProperty( propertyKey );
if ( allPropertyValues == NULL ) {
return;
}
for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin();
iter!=allPropertyValues->end(); ++iter) {
std::vector<std::string> tokens = Moses::Tokenize(iter->first);
for (std::vector<std::string>::const_iterator tokenIt=tokens.begin();
tokenIt!=tokens.end(); ++tokenIt) {
vocabulary.insert(*tokenIt);
}
}
}
}

View File

@ -139,6 +139,9 @@ public:
double smoothingFactor,
std::ostream &out) const;
void UpdateVocabularyFromValueTokens(const std::string& propertyKey,
std::set<std::string>& vocabulary) const;
void AddProperties(const std::string &str, float count);
void AddProperty(const std::string &key, const std::string &value, float count) {

View File

@ -293,10 +293,16 @@ int ExtractGHKM::Main(int argc, char *argv[])
}
// TODO Can scope pruning be done earlier?
if (r->Scope() <= options.maxScope) {
if (!options.treeFragments) {
scfgWriter.Write(*r,lineNum,false);
} else {
scfgWriter.Write(*r,**q,lineNum,false);
if (options.treeFragments) {
fwdExtractStream << " {{Tree ";
(*q)->PrintTree(fwdExtractStream);
fwdExtractStream << "}}";
}
if (options.partsOfSpeech) {
fwdExtractStream << " {{POS";
(*q)->PrintPartsOfSpeech(fwdExtractStream);
fwdExtractStream << "}}";
}
if (options.phraseOrientation) {
fwdExtractStream << " {{Orientation ";
@ -459,6 +465,8 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[],
"set maximum allowed scope")
("Minimal",
"extract minimal rules only")
("PartsOfSpeech",
"output parts-of-speech information (preterminals from the parse tree)")
("PCFG",
"include score based on PCFG scores in target corpus")
("PhraseOrientation",
@ -571,6 +579,9 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[],
if (vm.count("Minimal")) {
options.minimal = true;
}
if (vm.count("PartsOfSpeech")) {
options.partsOfSpeech = true;
}
if (vm.count("PCFG")) {
options.pcfg = true;
}
@ -667,6 +678,9 @@ void ExtractGHKM::WriteGlueGrammar(
if (options.treeFragments) {
out << " {{Tree [" << topLabel << " [SSTART <s>]]}}";
}
if (options.partsOfSpeech) {
out << " {{POS SSTART}}";
}
if (options.sourceLabels) {
out << " {{SourceLabels 2 1 " << sourceLabelSentenceStart << " 1 1 " << sourceLabelGlueTop << " 1}}";
}
@ -679,6 +693,9 @@ void ExtractGHKM::WriteGlueGrammar(
if (options.treeFragments) {
out << " {{Tree [" << topLabel << " [" << topLabel << "] [SEND </s>]]}}";
}
if (options.partsOfSpeech) {
out << " {{POS SEND}}";
}
if (options.sourceLabels) {
out << " {{SourceLabels 4 1 " << sourceLabelSentenceStart << " " << sourceLabelGlueTop << " " << sourceLabelSentenceEnd << " 1 1 " << sourceLabelGlueTop << " 1}}";
}
@ -694,6 +711,9 @@ void ExtractGHKM::WriteGlueGrammar(
if (options.treeFragments) {
out << " {{Tree [" << topLabel << " [SSTART <s>] [" << i->first << "] [SEND </s>]]}}";
}
if (options.partsOfSpeech) {
out << " {{POS SSTART SEND}}";
}
if (options.sourceLabels) {
out << " {{SourceLabels 4 1 " << sourceLabelSentenceStart << " " << sourceLabelGlueX << " " << sourceLabelSentenceEnd << " 1 1 " << sourceLabelGlueTop << " 1}}";
}

View File

@ -40,6 +40,7 @@ public:
, maxRuleSize(3)
, maxScope(3)
, minimal(false)
, partsOfSpeech(false)
, pcfg(false)
, phraseOrientation(false)
, sentenceOffset(0)
@ -68,6 +69,7 @@ public:
int maxRuleSize;
int maxScope;
bool minimal;
bool partsOfSpeech;
bool pcfg;
bool phraseOrientation;
int sentenceOffset;

View File

@ -191,18 +191,5 @@ void ScfgRuleWriter::WriteSymbol(const Symbol &symbol, std::ostream &out)
}
}
// Writes `rule` through the (rule, lineNum, printEndl) overload with the line
// break suppressed, then appends the tree of subgraph `g` to m_fwd as a
// " {{Tree ...}}" property. If `printEndl` is true, both m_fwd and m_inv are
// terminated with std::endl afterwards.
void ScfgRuleWriter::Write(const ScfgRule &rule, const Subgraph &g, size_t lineNum, bool printEndl)
{
  // The line break must come after the property, so it is never emitted here.
  Write(rule, lineNum, false);

  m_fwd << " {{Tree ";
  g.PrintTree(m_fwd);
  m_fwd << "}}";

  if (!printEndl) {
    return;
  }
  m_fwd << std::endl;
  m_inv << std::endl;
}
} // namespace GHKM
} // namespace Moses

View File

@ -44,8 +44,6 @@ public:
void Write(const ScfgRule &rule, size_t lineNum, bool printEndl=true);
void Write(const ScfgRule &rule, const Subgraph &g, size_t lineNum, bool printEndl=true);
private:
// Disallow copying
ScfgRuleWriter(const ScfgRuleWriter &);

View File

@ -144,5 +144,29 @@ void Subgraph::RecursivelyPrintTree(const Node *n, std::ostream &out) const
}
}
void Subgraph::PrintPartsOfSpeech(std::ostream &out) const
{
RecursivelyPrintPartsOfSpeech(m_root,out);
}
void Subgraph::RecursivelyPrintPartsOfSpeech(const Node *n, std::ostream &out) const
{
NodeType nodeType = n->GetType();
if (nodeType == TREE) {
if (m_leaves.find(n) == m_leaves.end()) {
const std::vector<Node *> &children = n->GetChildren();
for (std::vector<Node *>::const_iterator p(children.begin());
p != children.end(); ++p) {
Node *child = *p;
if (child->GetType() == TARGET) {
out << " " << n->GetLabel();
} else {
RecursivelyPrintPartsOfSpeech(child,out);
}
}
}
}
}
} // namespace GHKM
} // namespace Moses

View File

@ -116,8 +116,8 @@ public:
}
void GetTargetLeaves(std::vector<const Node *> &) const;
void PrintTree(std::ostream &out) const;
void PrintPartsOfSpeech(std::ostream &out) const;
private:
void GetTargetLeaves(const Node *, std::vector<const Node *> &) const;
@ -126,6 +126,7 @@ private:
float CalcPcfgScore() const;
int CountNodes(const Node *) const;
void RecursivelyPrintTree(const Node *n, std::ostream &out) const;
void RecursivelyPrintPartsOfSpeech(const Node *n, std::ostream &out) const;
const Node *m_root;
std::set<const Node *> m_leaves;

View File

@ -50,8 +50,8 @@ bool hierarchicalFlag = false;
bool pcfgFlag = false;
bool phraseOrientationFlag = false;
bool treeFragmentsFlag = false;
bool partsOfSpeechFlag = false;
bool sourceSyntaxLabelsFlag = false;
bool sourceSyntaxLabelSetFlag = false;
bool sourceSyntaxLabelCountsLHSFlag = false;
bool targetPreferenceLabelsFlag = false;
bool unpairedExtractFormatFlag = false;
@ -80,6 +80,8 @@ std::set<std::string> sourceLabelSet;
std::map<std::string,size_t> sourceLabels;
std::vector<std::string> sourceLabelsByIndex;
std::set<std::string> partsOfSpeechSet;
boost::unordered_map<std::string,float> targetPreferenceLHSCounts;
boost::unordered_map<std::string, boost::unordered_map<std::string,float>* > ruleTargetLHSAndTargetPreferenceLHSJointCounts;
std::set<std::string> targetPreferenceLabelSet;
@ -129,7 +131,7 @@ int main(int argc, char* argv[])
ScoreFeatureManager featureManager;
if (argc < 4) {
std::cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--NoWordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--PCFG] [--TreeFragments] [--SourceLabels] [--SourceLabelSet] [--SourceLabelCountsLHS] [--TargetPreferenceLabels] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--CrossedNonTerm]" << std::endl;
std::cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--NoWordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--PCFG] [--TreeFragments] [--SourceLabels] [--SourceLabelCountsLHS] [--TargetPreferenceLabels] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--CrossedNonTerm]" << std::endl;
std::cerr << featureManager.usage() << std::endl;
exit(1);
}
@ -137,6 +139,7 @@ int main(int argc, char* argv[])
std::string fileNameLex = argv[2];
std::string fileNamePhraseTable = argv[3];
std::string fileNameSourceLabelSet;
std::string fileNamePartsOfSpeechSet;
std::string fileNameCountOfCounts;
std::string fileNameFunctionWords;
std::string fileNameLeftHandSideSourceLabelCounts;
@ -163,11 +166,14 @@ int main(int argc, char* argv[])
} else if (strcmp(argv[i],"--TreeFragments") == 0) {
treeFragmentsFlag = true;
std::cerr << "including tree fragment information from syntactic parse" << std::endl;
} else if (strcmp(argv[i],"--PartsOfSpeech") == 0) {
partsOfSpeechFlag = true;
std::cerr << "including parts-of-speech information from syntactic parse" << std::endl;
fileNamePartsOfSpeechSet = std::string(fileNamePhraseTable) + ".partsOfSpeech";
std::cerr << "writing parts-of-speech set to file " << fileNamePartsOfSpeechSet << std::endl;
} else if (strcmp(argv[i],"--SourceLabels") == 0) {
sourceSyntaxLabelsFlag = true;
std::cerr << "including source label information" << std::endl;
} else if (strcmp(argv[i],"--SourceLabelSet") == 0) {
sourceSyntaxLabelSetFlag = true;
fileNameSourceLabelSet = std::string(fileNamePhraseTable) + ".syntaxLabels.src";
std::cerr << "writing source syntax label set to file " << fileNameSourceLabelSet << std::endl;
} else if (strcmp(argv[i],"--SourceLabelCountsLHS") == 0) {
@ -452,7 +458,7 @@ int main(int argc, char* argv[])
}
// source syntax labels
if (sourceSyntaxLabelsFlag && sourceSyntaxLabelSetFlag && !inverseFlag) {
if (sourceSyntaxLabelsFlag && !inverseFlag) {
writeLabelSet( sourceLabelSet, fileNameSourceLabelSet );
}
if (sourceSyntaxLabelsFlag && sourceSyntaxLabelCountsLHSFlag && !inverseFlag) {
@ -462,6 +468,11 @@ int main(int argc, char* argv[])
fileNameLeftHandSideTargetSourceLabelCounts );
}
// parts-of-speech
if (partsOfSpeechFlag && !inverseFlag) {
writeLabelSet( partsOfSpeechSet, fileNamePartsOfSpeechSet );
}
// target preference labels
if (targetPreferenceLabelsFlag && !inverseFlag) {
writeLabelSet( targetPreferenceLabelSet, fileNameTargetPreferenceLabelSet );
@ -615,8 +626,8 @@ void writeLabelSet( const std::set<std::string> &labelSet, const std::string &fi
Moses::OutputFileStream out;
bool success = out.Open(fileName.c_str());
if (!success) {
std::cerr << "ERROR: could not open label set file "
<< fileName << std::endl;
std::cerr << "ERROR: could not open file "
<< fileName << " for writing" << std::endl;
return;
}
@ -811,6 +822,15 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
}
}
// parts-of-speech
if (partsOfSpeechFlag && !inverseFlag) {
phrasePair.UpdateVocabularyFromValueTokens("POS", partsOfSpeechSet);
const std::string *bestPartOfSpeech = phrasePair.FindBestPropertyValue("POS");
if (bestPartOfSpeech) {
phraseTableFile << " {{POS " << *bestPartOfSpeech << "}}";
}
}
// syntax labels
if ((sourceSyntaxLabelsFlag || targetPreferenceLabelsFlag) && !inverseFlag) {
unsigned nNTs = 1;

View File

@ -2240,6 +2240,14 @@ sub define_training_extract_phrases {
if (&get("TRAINING:ghkm-source-labels")) {
$cmd .= "-ghkm-source-labels ";
my $source_labels_file = &versionize(&long_file_name("source-labels","model",""));
$cmd .= "-ghkm-source-labels-file $source_labels_file ";
}
if (&get("TRAINING:ghkm-parts-of-speech")) {
$cmd .= "-ghkm-parts-of-speech ";
my $parts_of_speech_labels_file = &versionize(&long_file_name("parts-of-speech","model",""));
$cmd .= "-ghkm-parts-of-speech-file $parts_of_speech_labels_file ";
}
}
@ -2270,19 +2278,28 @@ sub define_training_build_ttable {
$cmd .= &define_domain_feature_score_option($domains) if &get("TRAINING:domain-features");
if (&get("TRAINING:hierarchical-rule-set")) {
if (&get("TRAINING:ghkm-tree-fragments")) {
$cmd .= "-ghkm-tree-fragments ";
}
if (&get("TRAINING:ghkm-phrase-orientation")) {
$cmd .= "-ghkm-phrase-orientation ";
my $phrase_orientation_priors_file = &versionize(&long_file_name("phrase-orientation-priors","model",""));
$cmd .= "-phrase-orientation-priors-file $phrase_orientation_priors_file ";
}
if (&get("TRAINING:ghkm-source-labels")) {
$cmd .= "-ghkm-source-labels ";
my $source_labels_file = &versionize(&long_file_name("source-labels","model",""));
$cmd .= "-ghkm-source-labels-file $source_labels_file ";
}
if (&get("TRAINING:ghkm-parts-of-speech")) {
$cmd .= "-ghkm-parts-of-speech ";
my $parts_of_speech_labels_file = &versionize(&long_file_name("parts-of-speech","model",""));
$cmd .= "-ghkm-parts-of-speech-file $parts_of_speech_labels_file ";
}
}
&create_step($step_id,$cmd);
@ -2476,6 +2493,12 @@ sub define_training_create_config {
$cmd .= "-ghkm-source-labels-file $source_labels_file ";
}
if (&get("TRAINING:ghkm-parts-of-speech")) {
$cmd .= "-ghkm-parts-of-speech ";
my $parts_of_speech_labels_file = &versionize(&long_file_name("parts-of-speech","model",""));
$cmd .= "-ghkm-parts-of-speech-file $parts_of_speech_labels_file ";
}
# sparse lexical features provide additional content for config file
$cmd .= "-additional-ini-file $sparse_lexical_features.ini " if $sparse_lexical_features;

View File

@ -38,13 +38,19 @@ my $lexFile = $ARGV[4];
my $ptHalf = $ARGV[5]; # output
my $inverse = 0;
my $sourceLabelsFile;
my $partsOfSpeechFile;
my $otherExtractArgs= "";
for (my $i = 6; $i < $#ARGV; ++$i)
{
if ($ARGV[$i] eq '--SourceLabels') {
$sourceLabelsFile = $ARGV[++$i];
$otherExtractArgs .= "--SourceLabels --SourceLabelCountsLHS --SourceLabelSet ";
$otherExtractArgs .= "--SourceLabels --SourceLabelCountsLHS ";
next;
}
if ($ARGV[$i] eq '--PartsOfSpeech') {
$partsOfSpeechFile = $ARGV[++$i];
$otherExtractArgs .= "--PartsOfSpeech ";
next;
}
if ($ARGV[$i] eq '--Inverse') {
@ -287,6 +293,15 @@ if (!$inverse && defined($sourceLabelsFile))
`$cmd`;
}
# merge parts-of-speech files
if (!$inverse && defined($partsOfSpeechFile))
{
my $cmd = "(echo \"SSTART 0\"; echo \"SEND 1\"; cat $TMPDIR/phrase-table.half.*.gz.partsOfSpeech | LC_ALL=C sort | uniq | perl -pe \"s/\$/ \@{[\$.+1]}/\") > $partsOfSpeechFile";
print STDERR "Merging parts-of-speech files: $cmd \n";
`$cmd`;
}
$cmd = "rm -rf $TMPDIR \n";
print STDERR $cmd;
systemCheck($cmd);

View File

@ -32,7 +32,9 @@ my($_EXTERNAL_BINDIR, $_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_
$_DECODING_STEPS, $_PARALLEL, $_FACTOR_DELIMITER, @_PHRASE_TABLE,
@_REORDERING_TABLE, @_GENERATION_TABLE, @_GENERATION_TYPE, $_GENERATION_CORPUS,
$_DONT_ZIP, $_MGIZA, $_MGIZA_CPUS, $_SNT2COOC, $_HMM_ALIGN, $_CONFIG, $_OSM, $_OSM_FACTORS, $_POST_DECODING_TRANSLIT, $_TRANSLITERATION_PHRASE_TABLE,
$_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_GHKM_TREE_FRAGMENTS,$_GHKM_PHRASE_ORIENTATION,$_PHRASE_ORIENTATION_PRIORS_FILE,$_GHKM_SOURCE_LABELS,$_GHKM_SOURCE_LABELS_FILE,$_PCFG,@_EXTRACT_OPTIONS,@_SCORE_OPTIONS,$_S2T,
$_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,
$_GHKM_TREE_FRAGMENTS,$_GHKM_PHRASE_ORIENTATION,$_PHRASE_ORIENTATION_PRIORS_FILE,$_GHKM_SOURCE_LABELS,$_GHKM_SOURCE_LABELS_FILE,$_GHKM_PARTS_OF_SPEECH,$_GHKM_PARTS_OF_SPEECH_FILE,
$_PCFG,@_EXTRACT_OPTIONS,@_SCORE_OPTIONS,$_S2T,
$_ALT_DIRECT_RULE_SCORE_1, $_ALT_DIRECT_RULE_SCORE_2, $_UNKNOWN_WORD_SOFT_MATCHES_FILE,
$_OMIT_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES,
$_MEMSCORE, $_FINAL_ALIGNMENT_MODEL,
@ -116,6 +118,8 @@ $_HELP = 1
'phrase-orientation-priors-file=s' => \$_PHRASE_ORIENTATION_PRIORS_FILE, # currently relevant for GHKM extraction only; phrase orientation for PBT has different implementation
'ghkm-source-labels' => \$_GHKM_SOURCE_LABELS,
'ghkm-source-labels-file=s' => \$_GHKM_SOURCE_LABELS_FILE,
'ghkm-parts-of-speech' => \$_GHKM_PARTS_OF_SPEECH,
'ghkm-parts-of-speech-file=s' => \$_GHKM_PARTS_OF_SPEECH_FILE,
'pcfg' => \$_PCFG,
'alt-direct-rule-score-1' => \$_ALT_DIRECT_RULE_SCORE_1,
'alt-direct-rule-score-2' => \$_ALT_DIRECT_RULE_SCORE_2,
@ -1454,6 +1458,7 @@ sub extract_phrase {
$cmd .= " --PhraseOrientation" if $_GHKM_PHRASE_ORIENTATION;
$cmd .= " --PhraseOrientationPriors $_PHRASE_ORIENTATION_PRIORS_FILE" if defined($_PHRASE_ORIENTATION_PRIORS_FILE);
$cmd .= " --SourceLabels" if $_GHKM_SOURCE_LABELS;
$cmd .= " --PartsOfSpeech" if $_GHKM_PARTS_OF_SPEECH;
}
else
{
@ -1583,7 +1588,6 @@ sub score_phrase_phrase_extract {
my $MIN_COUNT_HIERARCHICAL = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /MinCountHierarchical ([\d\.]+)/) ? $1 : undef;
my $SOURCE_LABELS = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /SourceLabels/);
my $SOURCE_LABEL_COUNTS_LHS = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /SourceLabelCountsLHS/);
my $SOURCE_LABEL_SET = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /SourceLabelSet/);
my $SPAN_LENGTH = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /SpanLength/);
my $CORE_SCORE_OPTIONS = "";
$CORE_SCORE_OPTIONS .= " --LogProb" if $LOG_PROB;
@ -1593,7 +1597,6 @@ sub score_phrase_phrase_extract {
$CORE_SCORE_OPTIONS .= " --CrossedNonTerm" if $CROSSEDNONTERM;
$CORE_SCORE_OPTIONS .= " --SourceLabels" if $SOURCE_LABELS;
$CORE_SCORE_OPTIONS .= " --SourceLabelCountsLHS " if $SOURCE_LABEL_COUNTS_LHS;
$CORE_SCORE_OPTIONS .= " --SourceLabelSet " if $SOURCE_LABEL_SET;
my $substep = 1;
my $isParent = 1;
@ -1637,6 +1640,7 @@ sub score_phrase_phrase_extract {
$cmd .= " --PhraseOrientation" if $_GHKM_PHRASE_ORIENTATION;
$cmd .= " --PhraseOrientationPriors $_PHRASE_ORIENTATION_PRIORS_FILE" if $_GHKM_PHRASE_ORIENTATION && defined($_PHRASE_ORIENTATION_PRIORS_FILE);
$cmd .= " --SourceLabels $_GHKM_SOURCE_LABELS_FILE" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
$cmd .= " --PartsOfSpeech $_GHKM_PARTS_OF_SPEECH_FILE" if $_GHKM_PARTS_OF_SPEECH && defined($_GHKM_PARTS_OF_SPEECH_FILE);
$cmd .= " $DOMAIN" if $DOMAIN;
$cmd .= " $CORE_SCORE_OPTIONS" if defined($_SCORE_OPTIONS);
$cmd .= " --FlexibilityScore=$FLEX_SCORER" if $_FLEXIBILITY_SCORE;