mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-26 05:14:36 +03:00
GHKM: extract POS phrase property (from preterminals in the syntactic parse tree)
This commit is contained in:
parent
6d9b6764a6
commit
06e87d851e
@ -240,8 +240,9 @@ void ExtractionPhrasePair::AddProperties( const std::string &propertiesString, f
|
|||||||
tok = tok.substr(0, endPos - 1);
|
tok = tok.substr(0, endPos - 1);
|
||||||
|
|
||||||
vector<std::string> keyValue = Moses::TokenizeFirstOnly(tok, " ");
|
vector<std::string> keyValue = Moses::TokenizeFirstOnly(tok, " ");
|
||||||
assert(keyValue.size() == 2);
|
if (keyValue.size() == 2) {
|
||||||
AddProperty(keyValue[0], keyValue[1], count);
|
AddProperty(keyValue[0], keyValue[1], count);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -555,6 +556,27 @@ void ExtractionPhrasePair::CollectAllPhraseOrientations(const std::string &key,
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void ExtractionPhrasePair::UpdateVocabularyFromValueTokens(const std::string& propertyKey,
|
||||||
|
std::set<std::string>& vocabulary) const
|
||||||
|
{
|
||||||
|
const PROPERTY_VALUES *allPropertyValues = GetProperty( propertyKey );
|
||||||
|
|
||||||
|
if ( allPropertyValues == NULL ) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin();
|
||||||
|
iter!=allPropertyValues->end(); ++iter) {
|
||||||
|
|
||||||
|
std::vector<std::string> tokens = Moses::Tokenize(iter->first);
|
||||||
|
for (std::vector<std::string>::const_iterator tokenIt=tokens.begin();
|
||||||
|
tokenIt!=tokens.end(); ++tokenIt) {
|
||||||
|
vocabulary.insert(*tokenIt);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -139,11 +139,14 @@ public:
|
|||||||
double smoothingFactor,
|
double smoothingFactor,
|
||||||
std::ostream &out) const;
|
std::ostream &out) const;
|
||||||
|
|
||||||
void AddProperties( const std::string &str, float count );
|
void UpdateVocabularyFromValueTokens(const std::string& propertyKey,
|
||||||
|
std::set<std::string>& vocabulary) const;
|
||||||
|
|
||||||
void AddProperty( const std::string &key, const std::string &value, float count ) {
|
void AddProperties(const std::string &str, float count);
|
||||||
|
|
||||||
|
void AddProperty(const std::string &key, const std::string &value, float count) {
|
||||||
std::map<std::string,
|
std::map<std::string,
|
||||||
std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > >::iterator iter = m_properties.find(key);
|
std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > >::iterator iter = m_properties.find(key);
|
||||||
if ( iter == m_properties.end() ) {
|
if ( iter == m_properties.end() ) {
|
||||||
// key not found: insert property key and value
|
// key not found: insert property key and value
|
||||||
PROPERTY_VALUES *propertyValues = new PROPERTY_VALUES();
|
PROPERTY_VALUES *propertyValues = new PROPERTY_VALUES();
|
||||||
|
@ -293,10 +293,16 @@ int ExtractGHKM::Main(int argc, char *argv[])
|
|||||||
}
|
}
|
||||||
// TODO Can scope pruning be done earlier?
|
// TODO Can scope pruning be done earlier?
|
||||||
if (r->Scope() <= options.maxScope) {
|
if (r->Scope() <= options.maxScope) {
|
||||||
if (!options.treeFragments) {
|
scfgWriter.Write(*r,lineNum,false);
|
||||||
scfgWriter.Write(*r,lineNum,false);
|
if (options.treeFragments) {
|
||||||
} else {
|
fwdExtractStream << " {{Tree ";
|
||||||
scfgWriter.Write(*r,**q,lineNum,false);
|
(*q)->PrintTree(fwdExtractStream);
|
||||||
|
fwdExtractStream << "}}";
|
||||||
|
}
|
||||||
|
if (options.partsOfSpeech) {
|
||||||
|
fwdExtractStream << " {{POS";
|
||||||
|
(*q)->PrintPartsOfSpeech(fwdExtractStream);
|
||||||
|
fwdExtractStream << "}}";
|
||||||
}
|
}
|
||||||
if (options.phraseOrientation) {
|
if (options.phraseOrientation) {
|
||||||
fwdExtractStream << " {{Orientation ";
|
fwdExtractStream << " {{Orientation ";
|
||||||
@ -459,6 +465,8 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[],
|
|||||||
"set maximum allowed scope")
|
"set maximum allowed scope")
|
||||||
("Minimal",
|
("Minimal",
|
||||||
"extract minimal rules only")
|
"extract minimal rules only")
|
||||||
|
("PartsOfSpeech",
|
||||||
|
"output parts-of-speech information (preterminals from the parse tree)")
|
||||||
("PCFG",
|
("PCFG",
|
||||||
"include score based on PCFG scores in target corpus")
|
"include score based on PCFG scores in target corpus")
|
||||||
("PhraseOrientation",
|
("PhraseOrientation",
|
||||||
@ -571,6 +579,9 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[],
|
|||||||
if (vm.count("Minimal")) {
|
if (vm.count("Minimal")) {
|
||||||
options.minimal = true;
|
options.minimal = true;
|
||||||
}
|
}
|
||||||
|
if (vm.count("PartsOfSpeech")) {
|
||||||
|
options.partsOfSpeech = true;
|
||||||
|
}
|
||||||
if (vm.count("PCFG")) {
|
if (vm.count("PCFG")) {
|
||||||
options.pcfg = true;
|
options.pcfg = true;
|
||||||
}
|
}
|
||||||
@ -667,6 +678,9 @@ void ExtractGHKM::WriteGlueGrammar(
|
|||||||
if (options.treeFragments) {
|
if (options.treeFragments) {
|
||||||
out << " {{Tree [" << topLabel << " [SSTART <s>]]}}";
|
out << " {{Tree [" << topLabel << " [SSTART <s>]]}}";
|
||||||
}
|
}
|
||||||
|
if (options.partsOfSpeech) {
|
||||||
|
out << " {{POS SSTART}}";
|
||||||
|
}
|
||||||
if (options.sourceLabels) {
|
if (options.sourceLabels) {
|
||||||
out << " {{SourceLabels 2 1 " << sourceLabelSentenceStart << " 1 1 " << sourceLabelGlueTop << " 1}}";
|
out << " {{SourceLabels 2 1 " << sourceLabelSentenceStart << " 1 1 " << sourceLabelGlueTop << " 1}}";
|
||||||
}
|
}
|
||||||
@ -679,6 +693,9 @@ void ExtractGHKM::WriteGlueGrammar(
|
|||||||
if (options.treeFragments) {
|
if (options.treeFragments) {
|
||||||
out << " {{Tree [" << topLabel << " [" << topLabel << "] [SEND </s>]]}}";
|
out << " {{Tree [" << topLabel << " [" << topLabel << "] [SEND </s>]]}}";
|
||||||
}
|
}
|
||||||
|
if (options.partsOfSpeech) {
|
||||||
|
out << " {{POS SEND}}";
|
||||||
|
}
|
||||||
if (options.sourceLabels) {
|
if (options.sourceLabels) {
|
||||||
out << " {{SourceLabels 4 1 " << sourceLabelSentenceStart << " " << sourceLabelGlueTop << " " << sourceLabelSentenceEnd << " 1 1 " << sourceLabelGlueTop << " 1}}";
|
out << " {{SourceLabels 4 1 " << sourceLabelSentenceStart << " " << sourceLabelGlueTop << " " << sourceLabelSentenceEnd << " 1 1 " << sourceLabelGlueTop << " 1}}";
|
||||||
}
|
}
|
||||||
@ -694,6 +711,9 @@ void ExtractGHKM::WriteGlueGrammar(
|
|||||||
if (options.treeFragments) {
|
if (options.treeFragments) {
|
||||||
out << " {{Tree [" << topLabel << " [SSTART <s>] [" << i->first << "] [SEND </s>]]}}";
|
out << " {{Tree [" << topLabel << " [SSTART <s>] [" << i->first << "] [SEND </s>]]}}";
|
||||||
}
|
}
|
||||||
|
if (options.partsOfSpeech) {
|
||||||
|
out << " {{POS SSTART SEND}}";
|
||||||
|
}
|
||||||
if (options.sourceLabels) {
|
if (options.sourceLabels) {
|
||||||
out << " {{SourceLabels 4 1 " << sourceLabelSentenceStart << " " << sourceLabelGlueX << " " << sourceLabelSentenceEnd << " 1 1 " << sourceLabelGlueTop << " 1}}";
|
out << " {{SourceLabels 4 1 " << sourceLabelSentenceStart << " " << sourceLabelGlueX << " " << sourceLabelSentenceEnd << " 1 1 " << sourceLabelGlueTop << " 1}}";
|
||||||
}
|
}
|
||||||
|
@ -40,6 +40,7 @@ public:
|
|||||||
, maxRuleSize(3)
|
, maxRuleSize(3)
|
||||||
, maxScope(3)
|
, maxScope(3)
|
||||||
, minimal(false)
|
, minimal(false)
|
||||||
|
, partsOfSpeech(false)
|
||||||
, pcfg(false)
|
, pcfg(false)
|
||||||
, phraseOrientation(false)
|
, phraseOrientation(false)
|
||||||
, sentenceOffset(0)
|
, sentenceOffset(0)
|
||||||
@ -68,6 +69,7 @@ public:
|
|||||||
int maxRuleSize;
|
int maxRuleSize;
|
||||||
int maxScope;
|
int maxScope;
|
||||||
bool minimal;
|
bool minimal;
|
||||||
|
bool partsOfSpeech;
|
||||||
bool pcfg;
|
bool pcfg;
|
||||||
bool phraseOrientation;
|
bool phraseOrientation;
|
||||||
int sentenceOffset;
|
int sentenceOffset;
|
||||||
|
@ -191,18 +191,5 @@ void ScfgRuleWriter::WriteSymbol(const Symbol &symbol, std::ostream &out)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void ScfgRuleWriter::Write(const ScfgRule &rule, const Subgraph &g, size_t lineNum, bool printEndl)
|
|
||||||
{
|
|
||||||
Write(rule,lineNum,false);
|
|
||||||
m_fwd << " {{Tree ";
|
|
||||||
g.PrintTree(m_fwd);
|
|
||||||
m_fwd << "}}";
|
|
||||||
|
|
||||||
if (printEndl) {
|
|
||||||
m_fwd << std::endl;
|
|
||||||
m_inv << std::endl;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace GHKM
|
} // namespace GHKM
|
||||||
} // namespace Moses
|
} // namespace Moses
|
||||||
|
@ -44,8 +44,6 @@ public:
|
|||||||
|
|
||||||
void Write(const ScfgRule &rule, size_t lineNum, bool printEndl=true);
|
void Write(const ScfgRule &rule, size_t lineNum, bool printEndl=true);
|
||||||
|
|
||||||
void Write(const ScfgRule &rule, const Subgraph &g, size_t lineNum, bool printEndl=true);
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// Disallow copying
|
// Disallow copying
|
||||||
ScfgRuleWriter(const ScfgRuleWriter &);
|
ScfgRuleWriter(const ScfgRuleWriter &);
|
||||||
|
@ -144,5 +144,29 @@ void Subgraph::RecursivelyPrintTree(const Node *n, std::ostream &out) const
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void Subgraph::PrintPartsOfSpeech(std::ostream &out) const
|
||||||
|
{
|
||||||
|
RecursivelyPrintPartsOfSpeech(m_root,out);
|
||||||
|
}
|
||||||
|
|
||||||
|
void Subgraph::RecursivelyPrintPartsOfSpeech(const Node *n, std::ostream &out) const
|
||||||
|
{
|
||||||
|
NodeType nodeType = n->GetType();
|
||||||
|
if (nodeType == TREE) {
|
||||||
|
if (m_leaves.find(n) == m_leaves.end()) {
|
||||||
|
const std::vector<Node *> &children = n->GetChildren();
|
||||||
|
for (std::vector<Node *>::const_iterator p(children.begin());
|
||||||
|
p != children.end(); ++p) {
|
||||||
|
Node *child = *p;
|
||||||
|
if (child->GetType() == TARGET) {
|
||||||
|
out << " " << n->GetLabel();
|
||||||
|
} else {
|
||||||
|
RecursivelyPrintPartsOfSpeech(child,out);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace Moses
|
} // namespace Moses
|
||||||
} // namespace GHKM
|
} // namespace GHKM
|
||||||
|
@ -116,8 +116,8 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
void GetTargetLeaves(std::vector<const Node *> &) const;
|
void GetTargetLeaves(std::vector<const Node *> &) const;
|
||||||
|
|
||||||
void PrintTree(std::ostream &out) const;
|
void PrintTree(std::ostream &out) const;
|
||||||
|
void PrintPartsOfSpeech(std::ostream &out) const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void GetTargetLeaves(const Node *, std::vector<const Node *> &) const;
|
void GetTargetLeaves(const Node *, std::vector<const Node *> &) const;
|
||||||
@ -126,6 +126,7 @@ private:
|
|||||||
float CalcPcfgScore() const;
|
float CalcPcfgScore() const;
|
||||||
int CountNodes(const Node *) const;
|
int CountNodes(const Node *) const;
|
||||||
void RecursivelyPrintTree(const Node *n, std::ostream &out) const;
|
void RecursivelyPrintTree(const Node *n, std::ostream &out) const;
|
||||||
|
void RecursivelyPrintPartsOfSpeech(const Node *n, std::ostream &out) const;
|
||||||
|
|
||||||
const Node *m_root;
|
const Node *m_root;
|
||||||
std::set<const Node *> m_leaves;
|
std::set<const Node *> m_leaves;
|
||||||
|
@ -50,8 +50,8 @@ bool hierarchicalFlag = false;
|
|||||||
bool pcfgFlag = false;
|
bool pcfgFlag = false;
|
||||||
bool phraseOrientationFlag = false;
|
bool phraseOrientationFlag = false;
|
||||||
bool treeFragmentsFlag = false;
|
bool treeFragmentsFlag = false;
|
||||||
|
bool partsOfSpeechFlag = false;
|
||||||
bool sourceSyntaxLabelsFlag = false;
|
bool sourceSyntaxLabelsFlag = false;
|
||||||
bool sourceSyntaxLabelSetFlag = false;
|
|
||||||
bool sourceSyntaxLabelCountsLHSFlag = false;
|
bool sourceSyntaxLabelCountsLHSFlag = false;
|
||||||
bool targetPreferenceLabelsFlag = false;
|
bool targetPreferenceLabelsFlag = false;
|
||||||
bool unpairedExtractFormatFlag = false;
|
bool unpairedExtractFormatFlag = false;
|
||||||
@ -80,6 +80,8 @@ std::set<std::string> sourceLabelSet;
|
|||||||
std::map<std::string,size_t> sourceLabels;
|
std::map<std::string,size_t> sourceLabels;
|
||||||
std::vector<std::string> sourceLabelsByIndex;
|
std::vector<std::string> sourceLabelsByIndex;
|
||||||
|
|
||||||
|
std::set<std::string> partsOfSpeechSet;
|
||||||
|
|
||||||
boost::unordered_map<std::string,float> targetPreferenceLHSCounts;
|
boost::unordered_map<std::string,float> targetPreferenceLHSCounts;
|
||||||
boost::unordered_map<std::string, boost::unordered_map<std::string,float>* > ruleTargetLHSAndTargetPreferenceLHSJointCounts;
|
boost::unordered_map<std::string, boost::unordered_map<std::string,float>* > ruleTargetLHSAndTargetPreferenceLHSJointCounts;
|
||||||
std::set<std::string> targetPreferenceLabelSet;
|
std::set<std::string> targetPreferenceLabelSet;
|
||||||
@ -129,7 +131,7 @@ int main(int argc, char* argv[])
|
|||||||
|
|
||||||
ScoreFeatureManager featureManager;
|
ScoreFeatureManager featureManager;
|
||||||
if (argc < 4) {
|
if (argc < 4) {
|
||||||
std::cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--NoWordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--PCFG] [--TreeFragments] [--SourceLabels] [--SourceLabelSet] [--SourceLabelCountsLHS] [--TargetPreferenceLabels] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--CrossedNonTerm]" << std::endl;
|
std::cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--NoWordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--PCFG] [--TreeFragments] [--SourceLabels] [--SourceLabelCountsLHS] [--TargetPreferenceLabels] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--CrossedNonTerm]" << std::endl;
|
||||||
std::cerr << featureManager.usage() << std::endl;
|
std::cerr << featureManager.usage() << std::endl;
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
@ -137,6 +139,7 @@ int main(int argc, char* argv[])
|
|||||||
std::string fileNameLex = argv[2];
|
std::string fileNameLex = argv[2];
|
||||||
std::string fileNamePhraseTable = argv[3];
|
std::string fileNamePhraseTable = argv[3];
|
||||||
std::string fileNameSourceLabelSet;
|
std::string fileNameSourceLabelSet;
|
||||||
|
std::string fileNamePartsOfSpeechSet;
|
||||||
std::string fileNameCountOfCounts;
|
std::string fileNameCountOfCounts;
|
||||||
std::string fileNameFunctionWords;
|
std::string fileNameFunctionWords;
|
||||||
std::string fileNameLeftHandSideSourceLabelCounts;
|
std::string fileNameLeftHandSideSourceLabelCounts;
|
||||||
@ -163,11 +166,14 @@ int main(int argc, char* argv[])
|
|||||||
} else if (strcmp(argv[i],"--TreeFragments") == 0) {
|
} else if (strcmp(argv[i],"--TreeFragments") == 0) {
|
||||||
treeFragmentsFlag = true;
|
treeFragmentsFlag = true;
|
||||||
std::cerr << "including tree fragment information from syntactic parse" << std::endl;
|
std::cerr << "including tree fragment information from syntactic parse" << std::endl;
|
||||||
|
} else if (strcmp(argv[i],"--PartsOfSpeech") == 0) {
|
||||||
|
partsOfSpeechFlag = true;
|
||||||
|
std::cerr << "including parts-of-speech information from syntactic parse" << std::endl;
|
||||||
|
fileNamePartsOfSpeechSet = std::string(fileNamePhraseTable) + ".partsOfSpeech";
|
||||||
|
std::cerr << "writing parts-of-speech set to file " << fileNamePartsOfSpeechSet << std::endl;
|
||||||
} else if (strcmp(argv[i],"--SourceLabels") == 0) {
|
} else if (strcmp(argv[i],"--SourceLabels") == 0) {
|
||||||
sourceSyntaxLabelsFlag = true;
|
sourceSyntaxLabelsFlag = true;
|
||||||
std::cerr << "including source label information" << std::endl;
|
std::cerr << "including source label information" << std::endl;
|
||||||
} else if (strcmp(argv[i],"--SourceLabelSet") == 0) {
|
|
||||||
sourceSyntaxLabelSetFlag = true;
|
|
||||||
fileNameSourceLabelSet = std::string(fileNamePhraseTable) + ".syntaxLabels.src";
|
fileNameSourceLabelSet = std::string(fileNamePhraseTable) + ".syntaxLabels.src";
|
||||||
std::cerr << "writing source syntax label set to file " << fileNameSourceLabelSet << std::endl;
|
std::cerr << "writing source syntax label set to file " << fileNameSourceLabelSet << std::endl;
|
||||||
} else if (strcmp(argv[i],"--SourceLabelCountsLHS") == 0) {
|
} else if (strcmp(argv[i],"--SourceLabelCountsLHS") == 0) {
|
||||||
@ -452,7 +458,7 @@ int main(int argc, char* argv[])
|
|||||||
}
|
}
|
||||||
|
|
||||||
// source syntax labels
|
// source syntax labels
|
||||||
if (sourceSyntaxLabelsFlag && sourceSyntaxLabelSetFlag && !inverseFlag) {
|
if (sourceSyntaxLabelsFlag && !inverseFlag) {
|
||||||
writeLabelSet( sourceLabelSet, fileNameSourceLabelSet );
|
writeLabelSet( sourceLabelSet, fileNameSourceLabelSet );
|
||||||
}
|
}
|
||||||
if (sourceSyntaxLabelsFlag && sourceSyntaxLabelCountsLHSFlag && !inverseFlag) {
|
if (sourceSyntaxLabelsFlag && sourceSyntaxLabelCountsLHSFlag && !inverseFlag) {
|
||||||
@ -462,6 +468,11 @@ int main(int argc, char* argv[])
|
|||||||
fileNameLeftHandSideTargetSourceLabelCounts );
|
fileNameLeftHandSideTargetSourceLabelCounts );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// parts-of-speech
|
||||||
|
if (partsOfSpeechFlag && !inverseFlag) {
|
||||||
|
writeLabelSet( partsOfSpeechSet, fileNamePartsOfSpeechSet );
|
||||||
|
}
|
||||||
|
|
||||||
// target preference labels
|
// target preference labels
|
||||||
if (targetPreferenceLabelsFlag && !inverseFlag) {
|
if (targetPreferenceLabelsFlag && !inverseFlag) {
|
||||||
writeLabelSet( targetPreferenceLabelSet, fileNameTargetPreferenceLabelSet );
|
writeLabelSet( targetPreferenceLabelSet, fileNameTargetPreferenceLabelSet );
|
||||||
@ -615,8 +626,8 @@ void writeLabelSet( const std::set<std::string> &labelSet, const std::string &fi
|
|||||||
Moses::OutputFileStream out;
|
Moses::OutputFileStream out;
|
||||||
bool success = out.Open(fileName.c_str());
|
bool success = out.Open(fileName.c_str());
|
||||||
if (!success) {
|
if (!success) {
|
||||||
std::cerr << "ERROR: could not open label set file "
|
std::cerr << "ERROR: could not open file "
|
||||||
<< fileName << std::endl;
|
<< fileName << " for writing" << std::endl;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -811,6 +822,15 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// parts-of-speech
|
||||||
|
if (partsOfSpeechFlag && !inverseFlag) {
|
||||||
|
phrasePair.UpdateVocabularyFromValueTokens("POS", partsOfSpeechSet);
|
||||||
|
const std::string *bestPartOfSpeech = phrasePair.FindBestPropertyValue("POS");
|
||||||
|
if (bestPartOfSpeech) {
|
||||||
|
phraseTableFile << " {{POS " << *bestPartOfSpeech << "}}";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// syntax labels
|
// syntax labels
|
||||||
if ((sourceSyntaxLabelsFlag || targetPreferenceLabelsFlag) && !inverseFlag) {
|
if ((sourceSyntaxLabelsFlag || targetPreferenceLabelsFlag) && !inverseFlag) {
|
||||||
unsigned nNTs = 1;
|
unsigned nNTs = 1;
|
||||||
|
@ -2240,6 +2240,14 @@ sub define_training_extract_phrases {
|
|||||||
|
|
||||||
if (&get("TRAINING:ghkm-source-labels")) {
|
if (&get("TRAINING:ghkm-source-labels")) {
|
||||||
$cmd .= "-ghkm-source-labels ";
|
$cmd .= "-ghkm-source-labels ";
|
||||||
|
my $source_labels_file = &versionize(&long_file_name("source-labels","model",""));
|
||||||
|
$cmd .= "-ghkm-source-labels-file $source_labels_file ";
|
||||||
|
}
|
||||||
|
|
||||||
|
if (&get("TRAINING:ghkm-parts-of-speech")) {
|
||||||
|
$cmd .= "-ghkm-parts-of-speech ";
|
||||||
|
my $parts_of_speech_labels_file = &versionize(&long_file_name("parts-of-speech","model",""));
|
||||||
|
$cmd .= "-ghkm-parts-of-speech-file $parts_of_speech_labels_file ";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2270,19 +2278,28 @@ sub define_training_build_ttable {
|
|||||||
$cmd .= &define_domain_feature_score_option($domains) if &get("TRAINING:domain-features");
|
$cmd .= &define_domain_feature_score_option($domains) if &get("TRAINING:domain-features");
|
||||||
|
|
||||||
if (&get("TRAINING:hierarchical-rule-set")) {
|
if (&get("TRAINING:hierarchical-rule-set")) {
|
||||||
|
|
||||||
if (&get("TRAINING:ghkm-tree-fragments")) {
|
if (&get("TRAINING:ghkm-tree-fragments")) {
|
||||||
$cmd .= "-ghkm-tree-fragments ";
|
$cmd .= "-ghkm-tree-fragments ";
|
||||||
}
|
}
|
||||||
|
|
||||||
if (&get("TRAINING:ghkm-phrase-orientation")) {
|
if (&get("TRAINING:ghkm-phrase-orientation")) {
|
||||||
$cmd .= "-ghkm-phrase-orientation ";
|
$cmd .= "-ghkm-phrase-orientation ";
|
||||||
my $phrase_orientation_priors_file = &versionize(&long_file_name("phrase-orientation-priors","model",""));
|
my $phrase_orientation_priors_file = &versionize(&long_file_name("phrase-orientation-priors","model",""));
|
||||||
$cmd .= "-phrase-orientation-priors-file $phrase_orientation_priors_file ";
|
$cmd .= "-phrase-orientation-priors-file $phrase_orientation_priors_file ";
|
||||||
}
|
}
|
||||||
|
|
||||||
if (&get("TRAINING:ghkm-source-labels")) {
|
if (&get("TRAINING:ghkm-source-labels")) {
|
||||||
$cmd .= "-ghkm-source-labels ";
|
$cmd .= "-ghkm-source-labels ";
|
||||||
my $source_labels_file = &versionize(&long_file_name("source-labels","model",""));
|
my $source_labels_file = &versionize(&long_file_name("source-labels","model",""));
|
||||||
$cmd .= "-ghkm-source-labels-file $source_labels_file ";
|
$cmd .= "-ghkm-source-labels-file $source_labels_file ";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (&get("TRAINING:ghkm-parts-of-speech")) {
|
||||||
|
$cmd .= "-ghkm-parts-of-speech ";
|
||||||
|
my $parts_of_speech_labels_file = &versionize(&long_file_name("parts-of-speech","model",""));
|
||||||
|
$cmd .= "-ghkm-parts-of-speech-file $parts_of_speech_labels_file ";
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
&create_step($step_id,$cmd);
|
&create_step($step_id,$cmd);
|
||||||
@ -2476,6 +2493,12 @@ sub define_training_create_config {
|
|||||||
$cmd .= "-ghkm-source-labels-file $source_labels_file ";
|
$cmd .= "-ghkm-source-labels-file $source_labels_file ";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (&get("TRAINING:ghkm-parts-of-speech")) {
|
||||||
|
$cmd .= "-ghkm-parts-of-speech ";
|
||||||
|
my $parts_of_speech_labels_file = &versionize(&long_file_name("parts-of-speech","model",""));
|
||||||
|
$cmd .= "-ghkm-parts-of-speech-file $parts_of_speech_labels_file ";
|
||||||
|
}
|
||||||
|
|
||||||
# sparse lexical features provide additional content for config file
|
# sparse lexical features provide additional content for config file
|
||||||
$cmd .= "-additional-ini-file $sparse_lexical_features.ini " if $sparse_lexical_features;
|
$cmd .= "-additional-ini-file $sparse_lexical_features.ini " if $sparse_lexical_features;
|
||||||
|
|
||||||
|
@ -38,13 +38,19 @@ my $lexFile = $ARGV[4];
|
|||||||
my $ptHalf = $ARGV[5]; # output
|
my $ptHalf = $ARGV[5]; # output
|
||||||
my $inverse = 0;
|
my $inverse = 0;
|
||||||
my $sourceLabelsFile;
|
my $sourceLabelsFile;
|
||||||
|
my $partsOfSpeechFile;
|
||||||
|
|
||||||
my $otherExtractArgs= "";
|
my $otherExtractArgs= "";
|
||||||
for (my $i = 6; $i < $#ARGV; ++$i)
|
for (my $i = 6; $i < $#ARGV; ++$i)
|
||||||
{
|
{
|
||||||
if ($ARGV[$i] eq '--SourceLabels') {
|
if ($ARGV[$i] eq '--SourceLabels') {
|
||||||
$sourceLabelsFile = $ARGV[++$i];
|
$sourceLabelsFile = $ARGV[++$i];
|
||||||
$otherExtractArgs .= "--SourceLabels --SourceLabelCountsLHS --SourceLabelSet ";
|
$otherExtractArgs .= "--SourceLabels --SourceLabelCountsLHS ";
|
||||||
|
next;
|
||||||
|
}
|
||||||
|
if ($ARGV[$i] eq '--PartsOfSpeech') {
|
||||||
|
$partsOfSpeechFile = $ARGV[++$i];
|
||||||
|
$otherExtractArgs .= "--PartsOfSpeech ";
|
||||||
next;
|
next;
|
||||||
}
|
}
|
||||||
if ($ARGV[$i] eq '--Inverse') {
|
if ($ARGV[$i] eq '--Inverse') {
|
||||||
@ -287,6 +293,15 @@ if (!$inverse && defined($sourceLabelsFile))
|
|||||||
`$cmd`;
|
`$cmd`;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# merge parts-of-speech files
|
||||||
|
if (!$inverse && defined($partsOfSpeechFile))
|
||||||
|
{
|
||||||
|
my $cmd = "(echo \"SSTART 0\"; echo \"SEND 1\"; cat $TMPDIR/phrase-table.half.*.gz.partsOfSpeech | LC_ALL=C sort | uniq | perl -pe \"s/\$/ \@{[\$.+1]}/\") > $partsOfSpeechFile";
|
||||||
|
print STDERR "Merging parts-of-speech files: $cmd \n";
|
||||||
|
`$cmd`;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
$cmd = "rm -rf $TMPDIR \n";
|
$cmd = "rm -rf $TMPDIR \n";
|
||||||
print STDERR $cmd;
|
print STDERR $cmd;
|
||||||
systemCheck($cmd);
|
systemCheck($cmd);
|
||||||
|
@ -32,7 +32,9 @@ my($_EXTERNAL_BINDIR, $_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_
|
|||||||
$_DECODING_STEPS, $_PARALLEL, $_FACTOR_DELIMITER, @_PHRASE_TABLE,
|
$_DECODING_STEPS, $_PARALLEL, $_FACTOR_DELIMITER, @_PHRASE_TABLE,
|
||||||
@_REORDERING_TABLE, @_GENERATION_TABLE, @_GENERATION_TYPE, $_GENERATION_CORPUS,
|
@_REORDERING_TABLE, @_GENERATION_TABLE, @_GENERATION_TYPE, $_GENERATION_CORPUS,
|
||||||
$_DONT_ZIP, $_MGIZA, $_MGIZA_CPUS, $_SNT2COOC, $_HMM_ALIGN, $_CONFIG, $_OSM, $_OSM_FACTORS, $_POST_DECODING_TRANSLIT, $_TRANSLITERATION_PHRASE_TABLE,
|
$_DONT_ZIP, $_MGIZA, $_MGIZA_CPUS, $_SNT2COOC, $_HMM_ALIGN, $_CONFIG, $_OSM, $_OSM_FACTORS, $_POST_DECODING_TRANSLIT, $_TRANSLITERATION_PHRASE_TABLE,
|
||||||
$_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_GHKM_TREE_FRAGMENTS,$_GHKM_PHRASE_ORIENTATION,$_PHRASE_ORIENTATION_PRIORS_FILE,$_GHKM_SOURCE_LABELS,$_GHKM_SOURCE_LABELS_FILE,$_PCFG,@_EXTRACT_OPTIONS,@_SCORE_OPTIONS,$_S2T,
|
$_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,
|
||||||
|
$_GHKM_TREE_FRAGMENTS,$_GHKM_PHRASE_ORIENTATION,$_PHRASE_ORIENTATION_PRIORS_FILE,$_GHKM_SOURCE_LABELS,$_GHKM_SOURCE_LABELS_FILE,$_GHKM_PARTS_OF_SPEECH,$_GHKM_PARTS_OF_SPEECH_FILE,
|
||||||
|
$_PCFG,@_EXTRACT_OPTIONS,@_SCORE_OPTIONS,$_S2T,
|
||||||
$_ALT_DIRECT_RULE_SCORE_1, $_ALT_DIRECT_RULE_SCORE_2, $_UNKNOWN_WORD_SOFT_MATCHES_FILE,
|
$_ALT_DIRECT_RULE_SCORE_1, $_ALT_DIRECT_RULE_SCORE_2, $_UNKNOWN_WORD_SOFT_MATCHES_FILE,
|
||||||
$_OMIT_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES,
|
$_OMIT_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES,
|
||||||
$_MEMSCORE, $_FINAL_ALIGNMENT_MODEL,
|
$_MEMSCORE, $_FINAL_ALIGNMENT_MODEL,
|
||||||
@ -116,6 +118,8 @@ $_HELP = 1
|
|||||||
'phrase-orientation-priors-file=s' => \$_PHRASE_ORIENTATION_PRIORS_FILE, # currently relevant for GHKM extraction only; phrase orientation for PBT has different implementation
|
'phrase-orientation-priors-file=s' => \$_PHRASE_ORIENTATION_PRIORS_FILE, # currently relevant for GHKM extraction only; phrase orientation for PBT has different implementation
|
||||||
'ghkm-source-labels' => \$_GHKM_SOURCE_LABELS,
|
'ghkm-source-labels' => \$_GHKM_SOURCE_LABELS,
|
||||||
'ghkm-source-labels-file=s' => \$_GHKM_SOURCE_LABELS_FILE,
|
'ghkm-source-labels-file=s' => \$_GHKM_SOURCE_LABELS_FILE,
|
||||||
|
'ghkm-parts-of-speech' => \$_GHKM_PARTS_OF_SPEECH,
|
||||||
|
'ghkm-parts-of-speech-file=s' => \$_GHKM_PARTS_OF_SPEECH_FILE,
|
||||||
'pcfg' => \$_PCFG,
|
'pcfg' => \$_PCFG,
|
||||||
'alt-direct-rule-score-1' => \$_ALT_DIRECT_RULE_SCORE_1,
|
'alt-direct-rule-score-1' => \$_ALT_DIRECT_RULE_SCORE_1,
|
||||||
'alt-direct-rule-score-2' => \$_ALT_DIRECT_RULE_SCORE_2,
|
'alt-direct-rule-score-2' => \$_ALT_DIRECT_RULE_SCORE_2,
|
||||||
@ -1454,6 +1458,7 @@ sub extract_phrase {
|
|||||||
$cmd .= " --PhraseOrientation" if $_GHKM_PHRASE_ORIENTATION;
|
$cmd .= " --PhraseOrientation" if $_GHKM_PHRASE_ORIENTATION;
|
||||||
$cmd .= " --PhraseOrientationPriors $_PHRASE_ORIENTATION_PRIORS_FILE" if defined($_PHRASE_ORIENTATION_PRIORS_FILE);
|
$cmd .= " --PhraseOrientationPriors $_PHRASE_ORIENTATION_PRIORS_FILE" if defined($_PHRASE_ORIENTATION_PRIORS_FILE);
|
||||||
$cmd .= " --SourceLabels" if $_GHKM_SOURCE_LABELS;
|
$cmd .= " --SourceLabels" if $_GHKM_SOURCE_LABELS;
|
||||||
|
$cmd .= " --PartsOfSpeech" if $_GHKM_PARTS_OF_SPEECH;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@ -1583,7 +1588,6 @@ sub score_phrase_phrase_extract {
|
|||||||
my $MIN_COUNT_HIERARCHICAL = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /MinCountHierarchical ([\d\.]+)/) ? $1 : undef;
|
my $MIN_COUNT_HIERARCHICAL = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /MinCountHierarchical ([\d\.]+)/) ? $1 : undef;
|
||||||
my $SOURCE_LABELS = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /SourceLabels/);
|
my $SOURCE_LABELS = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /SourceLabels/);
|
||||||
my $SOURCE_LABEL_COUNTS_LHS = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /SourceLabelCountsLHS/);
|
my $SOURCE_LABEL_COUNTS_LHS = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /SourceLabelCountsLHS/);
|
||||||
my $SOURCE_LABEL_SET = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /SourceLabelSet/);
|
|
||||||
my $SPAN_LENGTH = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /SpanLength/);
|
my $SPAN_LENGTH = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /SpanLength/);
|
||||||
my $CORE_SCORE_OPTIONS = "";
|
my $CORE_SCORE_OPTIONS = "";
|
||||||
$CORE_SCORE_OPTIONS .= " --LogProb" if $LOG_PROB;
|
$CORE_SCORE_OPTIONS .= " --LogProb" if $LOG_PROB;
|
||||||
@ -1593,7 +1597,6 @@ sub score_phrase_phrase_extract {
|
|||||||
$CORE_SCORE_OPTIONS .= " --CrossedNonTerm" if $CROSSEDNONTERM;
|
$CORE_SCORE_OPTIONS .= " --CrossedNonTerm" if $CROSSEDNONTERM;
|
||||||
$CORE_SCORE_OPTIONS .= " --SourceLabels" if $SOURCE_LABELS;
|
$CORE_SCORE_OPTIONS .= " --SourceLabels" if $SOURCE_LABELS;
|
||||||
$CORE_SCORE_OPTIONS .= " --SourceLabelCountsLHS " if $SOURCE_LABEL_COUNTS_LHS;
|
$CORE_SCORE_OPTIONS .= " --SourceLabelCountsLHS " if $SOURCE_LABEL_COUNTS_LHS;
|
||||||
$CORE_SCORE_OPTIONS .= " --SourceLabelSet " if $SOURCE_LABEL_SET;
|
|
||||||
|
|
||||||
my $substep = 1;
|
my $substep = 1;
|
||||||
my $isParent = 1;
|
my $isParent = 1;
|
||||||
@ -1637,6 +1640,7 @@ sub score_phrase_phrase_extract {
|
|||||||
$cmd .= " --PhraseOrientation" if $_GHKM_PHRASE_ORIENTATION;
|
$cmd .= " --PhraseOrientation" if $_GHKM_PHRASE_ORIENTATION;
|
||||||
$cmd .= " --PhraseOrientationPriors $_PHRASE_ORIENTATION_PRIORS_FILE" if $_GHKM_PHRASE_ORIENTATION && defined($_PHRASE_ORIENTATION_PRIORS_FILE);
|
$cmd .= " --PhraseOrientationPriors $_PHRASE_ORIENTATION_PRIORS_FILE" if $_GHKM_PHRASE_ORIENTATION && defined($_PHRASE_ORIENTATION_PRIORS_FILE);
|
||||||
$cmd .= " --SourceLabels $_GHKM_SOURCE_LABELS_FILE" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
|
$cmd .= " --SourceLabels $_GHKM_SOURCE_LABELS_FILE" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
|
||||||
|
$cmd .= " --PartsOfSpeech $_GHKM_PARTS_OF_SPEECH_FILE" if $_GHKM_PARTS_OF_SPEECH && defined($_GHKM_PARTS_OF_SPEECH_FILE);
|
||||||
$cmd .= " $DOMAIN" if $DOMAIN;
|
$cmd .= " $DOMAIN" if $DOMAIN;
|
||||||
$cmd .= " $CORE_SCORE_OPTIONS" if defined($_SCORE_OPTIONS);
|
$cmd .= " $CORE_SCORE_OPTIONS" if defined($_SCORE_OPTIONS);
|
||||||
$cmd .= " --FlexibilityScore=$FLEX_SCORER" if $_FLEXIBILITY_SCORE;
|
$cmd .= " --FlexibilityScore=$FLEX_SCORER" if $_FLEXIBILITY_SCORE;
|
||||||
|
Loading…
Reference in New Issue
Block a user