preparing extraction of Hiero soft syntactic preferences (target syntax)

This commit is contained in:
Matthias Huck 2016-01-09 23:02:31 +00:00
parent 8750c71ef4
commit 1d3feba8d0
10 changed files with 291 additions and 56 deletions

View File

@ -45,6 +45,7 @@ public:
std::string targetContextRight;
std::string sourceHoleString;
std::string targetHoleString;
std::string targetSyntacticPreference;
int startT;
int endT;
int startS;
@ -65,6 +66,7 @@ public:
, targetContextRight()
, sourceHoleString()
, targetHoleString()
, targetSyntacticPreference()
, startT(sT)
, endT(eT)
, startS(sS)

View File

@ -83,6 +83,32 @@ void PropertiesConsolidator::ActivatePartsOfSpeechProcessing(const std::string &
}
/**
 * Loads the target syntactic preferences label set and enables rewriting
 * of the TargetPreferences property.
 *
 * Every line of the file must consist of a label followed by its numerical
 * index, separated by whitespace. Any previously loaded label set is
 * discarded before reading.
 *
 * @param targetSyntacticPreferencesLabelSetFile path of the label set file
 * @throws via UTIL_THROW2 if a line cannot be parsed as "label index";
 *         via UTIL_THROW_IF2 if a label occurs more than once
 */
void PropertiesConsolidator::ActivateTargetSyntacticPreferencesProcessing(const std::string &targetSyntacticPreferencesLabelSetFile)
{
  Moses::InputFileStream inFile(targetSyntacticPreferencesLabelSetFile);

  // read target syntactic preferences label set
  m_targetSyntacticPreferencesLabels.clear();
  std::string line;
  while (getline(inFile, line)) {
    std::istringstream tokenizer(line);
    std::string label;
    size_t index = 0;
    // A freshly constructed istringstream signals extraction failure through
    // its state rather than by throwing, so the state has to be tested
    // explicitly; otherwise a malformed or blank line would be inserted into
    // the map with a meaningless index.
    if (! (tokenizer >> label >> index)) {
      UTIL_THROW2("Error reading target syntactic preferences label set file " << targetSyntacticPreferencesLabelSetFile << " .");
    }
    std::pair< std::map<std::string,size_t>::iterator, bool > inserted = m_targetSyntacticPreferencesLabels.insert( std::pair<std::string,size_t>(label,index) );
    UTIL_THROW_IF2(!inserted.second,"Target syntactic preferences label set file " << targetSyntacticPreferencesLabelSetFile << " should contain each syntactic label only once.");
  }
  inFile.Close();

  // only flag the feature as active once the complete label set was read
  m_targetSyntacticPreferencesFlag = true;
}
void PropertiesConsolidator::ProcessPropertiesString(const std::string &propertiesString, Moses::OutputFileStream& out) const
{
if ( propertiesString.empty() ) {
@ -129,6 +155,19 @@ void PropertiesConsolidator::ProcessPropertiesString(const std::string &properti
}
*/
} else if ( !keyValue[0].compare("TargetPreferences") ) {
if ( m_targetSyntacticPreferencesFlag ) {
// TargetPreferences property: replace strings with vocabulary indices
out << " {{" << keyValue[0];
ProcessTargetSyntacticPreferencesPropertyValue(keyValue[1], out);
out << "}}";
} else { // don't process TargetPreferences property
out << " {{" << keyValue[0] << " " << keyValue[1] << "}}";
}
} else {
// output other property
@ -246,5 +285,66 @@ bool PropertiesConsolidator::GetPOSPropertyValueFromPropertiesString(const std::
}
/**
 * Rewrites the value of a TargetPreferences property, replacing every
 * label string by its index from the loaded label set, and writes the
 * result to out (each token preceded by a single space).
 *
 * Layout of the value as consumed here:
 *   nNTs totalCount item*
 * where, if nNTs > 1, each item is
 *   (nNTs-1 RHS labels) rhsCount numberOfLHSs (lhsLabel count)*
 * and, if nNTs <= 1, the remainder is a plain sequence of
 *   (lhsLabel count) pairs.
 * Counts are copied through verbatim as tokens.
 *
 * @param value the property value (text following the property key)
 * @param out   stream the converted value is appended to
 * @throws via UTIL_THROW2 / UTIL_THROW_IF2 if the value is truncated or
 *         malformed, or if a label is not part of the loaded label set
 */
void PropertiesConsolidator::ProcessTargetSyntacticPreferencesPropertyValue(const std::string &value, Moses::OutputFileStream& out) const
{
  // TargetPreferences property: replace strings with vocabulary indices
  std::istringstream tokenizer(value);

  size_t nNTs;
  double totalCount;

  if (! (tokenizer >> nNTs)) { // first token: number of non-terminals (incl. left-hand side)
    UTIL_THROW2("Not able to read number of non-terminals from TargetPreferences property. "
                << "Flawed TargetPreferences property?");
  }
  assert( nNTs > 0 );
  out << " " << nNTs;

  if (! (tokenizer >> totalCount)) { // second token: overall rule count
    UTIL_THROW2("Not able to read overall rule count from TargetPreferences property. "
                << "Flawed TargetPreferences property?");
  }
  assert( totalCount > 0.0 );
  out << " " << totalCount;

  // Stream extraction does not throw on failure here, so every read below is
  // checked explicitly. Without the checks a truncated item would silently
  // emit the previously read token in place of the missing one.
  while (tokenizer.peek() != EOF) {
    // without RHS non-terminals there is no explicit LHS counter in the
    // property; the LHS loop below then simply runs until the input ends
    size_t numberOfLHSsGivenRHS = std::numeric_limits<std::size_t>::max();
    std::string token;

    if (nNTs > 1) { // rule has right-hand side non-terminals, i.e. it's a hierarchical rule
      for (size_t i=0; i<nNTs-1; ++i) { // RHS target preference non-terminal labels
        if (! (tokenizer >> token)) { // RHS target preference non-terminal label
          UTIL_THROW2("Flawed item in TargetPreferences property?");
        }
        std::map<std::string,size_t>::const_iterator found = m_targetSyntacticPreferencesLabels.find(token);
        UTIL_THROW_IF2(found == m_targetSyntacticPreferencesLabels.end(), "Label \"" << token << "\" from the phrase table not found in given label set.");
        out << " " << found->second;
      }

      if (! (tokenizer >> token)) { // targetPreferenceRHSCount
        UTIL_THROW2("Flawed item in TargetPreferences property?");
      }
      out << " " << token;

      if (! (tokenizer >> numberOfLHSsGivenRHS)) {
        UTIL_THROW2("Flawed item in TargetPreferences property?");
      }
      out << " " << numberOfLHSsGivenRHS;
    }

    for (size_t i=0; i<numberOfLHSsGivenRHS && tokenizer.peek()!=EOF; ++i) { // LHS target preference non-terminal labels seen with this RHS
      if (! (tokenizer >> token)) { // LHS target preference non-terminal label
        UTIL_THROW2("Flawed item in TargetPreferences property?");
      }
      std::map<std::string,size_t>::const_iterator found = m_targetSyntacticPreferencesLabels.find(token);
      UTIL_THROW_IF2(found == m_targetSyntacticPreferencesLabels.end() ,"Label \"" << token << "\" from the phrase table not found in given label set.");
      out << " " << found->second;

      if (! (tokenizer >> token)) { // ruleTargetPreferenceLabelledCount
        UTIL_THROW2("Flawed item in TargetPreferences property?");
      }
      out << " " << token;
    }
  }
}
} // namespace MosesTraining

View File

@ -34,10 +34,15 @@ class PropertiesConsolidator
{
public:
PropertiesConsolidator() : m_sourceLabelsFlag(false) {};
PropertiesConsolidator()
: m_sourceLabelsFlag(false)
, m_partsOfSpeechFlag(false)
, m_targetSyntacticPreferencesFlag(false)
{};
void ActivateSourceLabelsProcessing(const std::string &sourceLabelSetFile);
void ActivatePartsOfSpeechProcessing(const std::string &partsOfSpeechFile);
void ActivateTargetSyntacticPreferencesProcessing(const std::string &targetSyntacticPreferencesLabelSetFile);
bool GetPOSPropertyValueFromPropertiesString(const std::string &propertiesString, std::vector<std::string>& out) const;
@ -47,11 +52,14 @@ protected:
void ProcessSourceLabelsPropertyValue(const std::string &value, Moses::OutputFileStream& out) const;
void ProcessPOSPropertyValue(const std::string &value, Moses::OutputFileStream& out) const;
void ProcessTargetSyntacticPreferencesPropertyValue(const std::string &value, Moses::OutputFileStream& out) const;
bool m_sourceLabelsFlag;
std::map<std::string,size_t> m_sourceLabels;
bool m_partsOfSpeechFlag;
std::map<std::string,size_t> m_partsOfSpeechVocabulary;
bool m_targetSyntacticPreferencesFlag;
std::map<std::string,size_t> m_targetSyntacticPreferencesLabels;
};

View File

@ -46,6 +46,7 @@ public:
bool requireAlignedWord;
bool sourceSyntax;
bool targetSyntax;
bool targetSyntacticPreferences;
bool duplicateRules;
bool fractionalCounting;
bool pcfgScore;
@ -80,6 +81,7 @@ public:
, requireAlignedWord(true)
, sourceSyntax(false)
, targetSyntax(false)
, targetSyntacticPreferences(false)
, duplicateRules(true)
, fractionalCounting(true)
, pcfgScore(false)

View File

@ -38,6 +38,7 @@ bool onlyDirectFlag = false;
bool partsOfSpeechFlag = false;
bool phraseCountFlag = false;
bool sourceLabelsFlag = false;
bool targetSyntacticPreferencesFlag = false;
bool sparseCountBinFeatureFlag = false;
std::vector< int > countBin;
@ -49,7 +50,7 @@ std::vector< float > goodTuringDiscount;
float kneserNey_D1, kneserNey_D2, kneserNey_D3, totalCount = -1;
void processFiles( const std::string&, const std::string&, const std::string&, const std::string&, const std::string&, const std::string& );
void processFiles( const std::string&, const std::string&, const std::string&, const std::string&, const std::string&, const std::string&, const std::string& );
void loadCountOfCounts( const std::string& );
void breakdownCoreAndSparse( const std::string &combined, std::string &core, std::string &sparse );
bool getLine( Moses::InputFileStream &file, std::vector< std::string > &item );
@ -93,6 +94,7 @@ int main(int argc, char* argv[])
std::string fileNameCountOfCounts;
std::string fileNameSourceLabelSet;
std::string fileNamePartsOfSpeechVocabulary;
std::string fileNameTargetSyntacticPreferencesLabelSet;
for(int i=4; i<argc; i++) {
if (strcmp(argv[i],"--Hierarchical") == 0) {
@ -150,6 +152,11 @@ int main(int argc, char* argv[])
UTIL_THROW_IF2(i+1==argc, "specify parts-of-speech file!");
fileNamePartsOfSpeechVocabulary = argv[++i];
std::cerr << "processing parts-of-speech property" << std::endl;
} else if (strcmp(argv[i],"--TargetSyntacticPreferences") == 0) {
targetSyntacticPreferencesFlag = true;
UTIL_THROW_IF2(i+1==argc, "specify target syntactic preferences label set file!");
fileNameTargetSyntacticPreferencesLabelSet = argv[++i];
std::cerr << "processing target syntactic preferences property" << std::endl;
} else if (strcmp(argv[i],"--MinScore") == 0) {
std::string setting = argv[++i];
bool done = false;
@ -182,7 +189,7 @@ int main(int argc, char* argv[])
}
}
processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated, fileNameCountOfCounts, fileNameSourceLabelSet, fileNamePartsOfSpeechVocabulary );
processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated, fileNameCountOfCounts, fileNameSourceLabelSet, fileNamePartsOfSpeechVocabulary, fileNameTargetSyntacticPreferencesLabelSet );
}
@ -231,7 +238,8 @@ void processFiles( const std::string& fileNameDirect,
const std::string& fileNameConsolidated,
const std::string& fileNameCountOfCounts,
const std::string& fileNameSourceLabelSet,
const std::string& fileNamePartsOfSpeechVocabulary )
const std::string& fileNamePartsOfSpeechVocabulary,
const std::string& fileNameTargetSyntacticPreferencesLabelSet )
{
if (goodTuringFlag || kneserNeyFlag)
loadCountOfCounts( fileNameCountOfCounts );
@ -256,6 +264,9 @@ void processFiles( const std::string& fileNameDirect,
if (partsOfSpeechFlag) {
propertiesConsolidator.ActivatePartsOfSpeechProcessing(fileNamePartsOfSpeechVocabulary);
}
if (targetSyntacticPreferencesFlag) {
propertiesConsolidator.ActivateTargetSyntacticPreferencesProcessing(fileNameTargetSyntacticPreferencesLabelSet);
}
// loop through all extracted phrase translations
int i=0;

View File

@ -79,14 +79,15 @@ private:
, RuleExist &ruleExist, HoleCollection &holeColl, int numHoles, int initStartF, int wordCountT, int wordCountS);
void saveHieroPhrase( int startT, int endT, int startS, int endS
, HoleCollection &holeColl, LabelIndex &labelIndex, int countS);
string saveTargetHieroPhrase( int startT, int endT, int startS, int endS
, WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore, int countS);
string saveTargetHieroPhrase( int startT, int endT, int startS, int endS
, WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore, int countS);
string saveSourceHieroPhrase( int startT, int endT, int startS, int endS
, HoleCollection &holeColl, const LabelIndex &labelIndex);
void preprocessSourceHieroPhrase( int startT, int endT, int startS, int endS
, WordIndex &indexS, HoleCollection &holeColl, const LabelIndex &labelIndex);
void saveHieroAlignment( int startT, int endT, int startS, int endS
, const WordIndex &indexS, const WordIndex &indexT, HoleCollection &holeColl, ExtractedRule &rule);
void saveTargetSyntacticPreference( const HoleCollection &holeColl, const LabelIndex &labelIndex, ExtractedRule &rule);
void saveAllHieroPhrases( int startT, int endT, int startS, int endS, HoleCollection &holeColl, int countS);
inline string IntToString( int i ) {
@ -225,6 +226,8 @@ int main(int argc, char* argv[])
// allow consecutive non-terminals (X Y | X Y)
else if (strcmp(argv[i],"--TargetSyntax") == 0) {
options.targetSyntax = true;
} else if (strcmp(argv[i],"--TargetSyntacticPreferences") == 0) {
options.targetSyntacticPreferences = true;
} else if (strcmp(argv[i],"--SourceSyntax") == 0) {
options.sourceSyntax = true;
} else if (strcmp(argv[i],"--AllowOnlyUnalignedWords") == 0) {
@ -422,7 +425,8 @@ void ExtractTask::extractRules()
int endT = startT + lengthT - 1;
// if there is target side syntax, there has to be a node
if (m_options.targetSyntax && !m_sentence.targetTree.HasNode(startT,endT))
if (m_options.targetSyntax && !m_options.targetSyntacticPreferences && !m_sentence.targetTree.HasNode(startT,endT))
// if (m_options.targetSyntax && !m_sentence.targetTree.HasNode(startT,endT))
continue;
// find aligned source words
@ -566,7 +570,7 @@ string ExtractTask::saveTargetHieroPhrase( int startT, int endT, int startS, int
int labelI = labelIndex[ 2+holeCount ];
string targetLabel;
if (m_options.targetSyntax) {
if (m_options.targetSyntax && !m_options.targetSyntacticPreferences) {
targetLabel = m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->label;
} else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
targetLabel = "S";
@ -628,7 +632,7 @@ string ExtractTask::saveSourceHieroPhrase( int startT, int endT, int startS, int
if (m_options.unpairedExtractFormat) {
out += "[" + sourceLabel + "] ";
} else {
out += "[" + sourceLabel + "][" + targetLabel + "] ";
out += "[" + sourceLabel + "][" + (m_options.targetSyntacticPreferences ? "X" : targetLabel) + "] ";
}
currPos = hole.GetEnd(0);
@ -682,6 +686,33 @@ void ExtractTask::saveHieroAlignment( int startT, int endT, int startS, int endS
}
}
/**
 * Stores in rule.targetSyntacticPreference one label per hole of holeColl,
 * in the order of holeColl.GetHoles(), separated by single spaces.
 *
 * For each hole the span hole.GetStart(1)..hole.GetEnd(1) is looked up in
 * the target tree of the sentence: if a node covers the span, the label of
 * the node selected by labelIndex is used, otherwise "X".
 * If holeColl contains no holes, the result is the empty string.
 *
 * @param holeColl   holes of the rule
 * @param labelIndex label selection; entry 2+k is used for the k-th hole
 *                   (presumably the first two entries belong to the
 *                   left-hand side labels -- verify against the caller)
 * @param rule       rule whose targetSyntacticPreference is overwritten
 */
void ExtractTask::saveTargetSyntacticPreference( const HoleCollection &holeColl, const LabelIndex &labelIndex, ExtractedRule &rule)
{
  rule.targetSyntacticPreference = "";
  int holeCount = 0;
  for (HoleList::const_iterator iterHoleList = holeColl.GetHoles().begin();
       iterHoleList != holeColl.GetHoles().end();
       ++iterHoleList) {
    const Hole &hole = *iterHoleList;

    int labelI = labelIndex[ 2+holeCount ];
    int startT = hole.GetStart(1);
    int endT = hole.GetEnd(1);

    // Put the separator in front of every label but the first. This avoids
    // having to strip a trailing blank afterwards, which would throw
    // std::out_of_range on an empty string if there were no holes at all.
    if (holeCount > 0) {
      rule.targetSyntacticPreference += " ";
    }
    if (m_sentence.targetTree.HasNode(startT,endT)) {
      rule.targetSyntacticPreference += m_sentence.targetTree.GetNodes(startT,endT)[labelI]->label;
    } else {
      // no syntactic node over this span: fall back to the generic label
      rule.targetSyntacticPreference += "X";
    }
    ++holeCount;
  }
}
void ExtractTask::saveHieroPhrase( int startT, int endT, int startS, int endS
, HoleCollection &holeColl, LabelIndex &labelIndex, int countS)
{
@ -691,7 +722,8 @@ void ExtractTask::saveHieroPhrase( int startT, int endT, int startS, int endS
// phrase labels
string targetLabel;
if (m_options.targetSyntax) {
// if (m_options.targetSyntax && m_sentence.targetTree.HasNode(startT,endT)) {
if (m_options.targetSyntax && !m_options.targetSyntacticPreferences) {
targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0] ]->label;
} else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
targetLabel = "S";
@ -776,6 +808,17 @@ void ExtractTask::saveHieroPhrase( int startT, int endT, int startS, int endS
// std::cerr << "phraseOrientationR2L " << m_phraseOrientation.GetOrientationInfo(startS,endS,PhraseOrientation::REO_DIR_R2L) << std::endl;
}
// target syntactic preferences
if (m_options.targetSyntacticPreferences) {
saveTargetSyntacticPreference(holeColl, labelIndex, rule);
if (m_sentence.targetTree.HasNode(startT,endT)) {
rule.targetSyntacticPreference += " ";
rule.targetSyntacticPreference += m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0] ]->label;
} else {
rule.targetSyntacticPreference += " X";
}
}
addRuleToCollection( rule );
}
@ -785,6 +828,9 @@ void ExtractTask::saveAllHieroPhrases( int startT, int endT, int startS, int end
// number of target head labels
int numLabels = m_options.targetSyntax ? m_sentence.targetTree.GetNodes(startT,endT).size() : 1;
if (m_options.targetSyntacticPreferences && !numLabels) {
numLabels++;
}
labelCount.push_back(numLabels);
labelIndex.push_back(0);
@ -796,7 +842,10 @@ void ExtractTask::saveAllHieroPhrases( int startT, int endT, int startS, int end
// number of target hole labels
for( HoleList::const_iterator hole = holeColl.GetHoles().begin();
hole != holeColl.GetHoles().end(); hole++ ) {
int numLabels = m_options.targetSyntax ? m_sentence.targetTree.GetNodes(hole->GetStart(1),hole->GetEnd(1)).size() : 1 ;
int numLabels = m_options.targetSyntax ? m_sentence.targetTree.GetNodes(hole->GetStart(1),hole->GetEnd(1)).size() : 1 ;
if (m_options.targetSyntacticPreferences && !numLabels) {
numLabels++;
}
labelCount.push_back(numLabels);
labelIndex.push_back(0);
}
@ -973,12 +1022,19 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, int count
// phrase labels
string targetLabel,sourceLabel;
if (m_options.targetSyntax && m_options.conditionOnTargetLhs) {
sourceLabel = targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->label;
if (m_sentence.targetTree.HasNode(startT,endT) && !m_options.targetSyntacticPreferences) {
sourceLabel = targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->label;
} else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
sourceLabel = "S";
} else {
sourceLabel = "X";
}
} else {
sourceLabel = m_options.sourceSyntax ?
m_sentence.sourceTree.GetNodes(startS,endS)[0]->label : "X";
if (m_options.targetSyntax) {
if (m_options.targetSyntax && !m_options.targetSyntacticPreferences) {
// if (m_options.targetSyntax && !m_options.targetSyntacticPreferences && !m_sentence.targetTree.HasNode(startT,endT))
targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->label;
} else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
targetLabel = "S";
@ -1037,6 +1093,15 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, int count
// std::cerr << "phraseOrientationR2L " << m_phraseOrientation.GetOrientationInfo(startS,endS,PhraseOrientation::REO_DIR_R2L) << std::endl;
}
// target syntactic preferences
if (m_options.targetSyntacticPreferences) {
if (m_sentence.targetTree.HasNode(startT,endT)) {
rule.targetSyntacticPreference += m_sentence.targetTree.GetNodes(startT,endT)[0]->label;
} else {
rule.targetSyntacticPreference += "X";
}
}
addRuleToCollection( rule );
}
@ -1114,6 +1179,11 @@ void ExtractTask::writeRulesToFile()
m_phraseOrientation.IncrementPriorCount(PhraseOrientation::REO_DIR_R2L,rule->r2lOrientation,1);
out << "}}";
}
if (m_options.targetSyntacticPreferences) {
out << " {{TargetPreferences ";
out << rule->targetSyntacticPreference;
out << "}}";
}
out << "\n";
if (!m_options.onlyDirectFlag) {
@ -1167,12 +1237,12 @@ void writeGlueGrammar( const string & fileName, RuleExtractionOptions &options,
if (options.phraseOrientation) {
glueRulesPhraseProperty.append(" ||| ||| {{Orientation 1 1 0.5 0.5 1 1 0.5 0.5}}");
}
if (!options.targetSyntax) {
if (!options.targetSyntax || options.targetSyntacticPreferences) {
grammarFile << "<s> [X] ||| <s> [S] ||| 1 ||| 0-0 ||| 0" << glueRulesPhraseProperty << endl
<< "[X][S] </s> [X] ||| [X][S] </s> [S] ||| 1 ||| 0-0 1-1 ||| 0" << glueRulesPhraseProperty << endl
<< "[X][S] [X][X] [X] ||| [X][S] [X][X] [S] ||| 2.718 ||| 0-0 1-1 ||| 0" << glueRulesPhraseProperty << endl;
} else {
// chose a top label that is not already a label
// choose a top label that is not already a label
string topLabel = "QQQQQQ";
for( unsigned int i=1; i<=topLabel.length(); i++) {
if(targetLabelCollection.find( topLabel.substr(0,i) ) == targetLabelCollection.end() ) {
@ -1202,7 +1272,7 @@ void writeGlueGrammar( const string & fileName, RuleExtractionOptions &options,
// collect counts for labels for each word
// ( labels of singleton words are used to estimate
// distribution oflabels for unknown words )
// distribution of labels for unknown words )
map<string,int> wordCount;
map<string,string> wordLabel;

View File

@ -51,7 +51,7 @@ bool treeFragmentsFlag = false;
bool partsOfSpeechFlag = false;
bool sourceSyntaxLabelsFlag = false;
bool sourceSyntaxLabelCountsLHSFlag = false;
bool targetPreferenceLabelsFlag = false;
bool targetSyntacticPreferencesFlag = false;
bool unpairedExtractFormatFlag = false;
bool conditionOnTargetLhsFlag = false;
bool wordAlignmentFlag = true;
@ -83,11 +83,11 @@ std::vector<std::string> sourceLabelsByIndex;
std::set<std::string> partsOfSpeechSet;
boost::unordered_map<std::string,float> targetPreferenceLHSCounts;
boost::unordered_map<std::string, boost::unordered_map<std::string,float>* > ruleTargetLHSAndTargetPreferenceLHSJointCounts;
std::set<std::string> targetPreferenceLabelSet;
std::map<std::string,size_t> targetPreferenceLabels;
std::vector<std::string> targetPreferenceLabelsByIndex;
boost::unordered_map<std::string,float> targetSyntacticPreferencesLHSCounts;
boost::unordered_map<std::string, boost::unordered_map<std::string,float>* > ruleTargetLHSAndTargetSyntacticPreferencesLHSJointCounts;
std::set<std::string> targetSyntacticPreferencesLabelSet;
std::map<std::string,size_t> targetSyntacticPreferencesLabels;
std::vector<std::string> targetSyntacticPreferencesLabelsByIndex;
std::vector<float> orientationClassPriorsL2R(4,0); // mono swap dleft dright
std::vector<float> orientationClassPriorsR2L(4,0); // mono swap dleft dright
@ -150,7 +150,7 @@ int main(int argc, char* argv[])
"[--TreeFragments] "
"[--SourceLabels] "
"[--SourceLabelCountsLHS] "
"[--TargetPreferenceLabels] "
"[--TargetSyntacticPreferences] "
"[--UnpairedExtractFormat] "
"[--ConditionOnTargetLHS] "
"[--CrossedNonTerm]"
@ -167,9 +167,9 @@ int main(int argc, char* argv[])
std::string fileNameFunctionWords;
std::string fileNameLeftHandSideSourceLabelCounts;
std::string fileNameLeftHandSideTargetSourceLabelCounts;
std::string fileNameTargetPreferenceLabelSet;
std::string fileNameLeftHandSideTargetPreferenceLabelCounts;
std::string fileNameLeftHandSideRuleTargetTargetPreferenceLabelCounts;
std::string fileNameTargetSyntacticPreferencesLabelSet;
std::string fileNameLeftHandSideTargetSyntacticPreferencesLabelCounts;
std::string fileNameLeftHandSideRuleTargetTargetSyntacticPreferencesLabelCounts;
std::string fileNamePhraseOrientationPriors;
// All unknown args are passed to feature manager.
std::vector<std::string> featureArgs;
@ -205,14 +205,18 @@ int main(int argc, char* argv[])
fileNameLeftHandSideSourceLabelCounts = std::string(fileNamePhraseTable) + ".src.lhs";
fileNameLeftHandSideTargetSourceLabelCounts = std::string(fileNamePhraseTable) + ".tgt-src.lhs";
std::cerr << "counting left-hand side source labels and writing them to files " << fileNameLeftHandSideSourceLabelCounts << " and " << fileNameLeftHandSideTargetSourceLabelCounts << std::endl;
} else if (strcmp(argv[i],"--TargetPreferenceLabels") == 0) {
targetPreferenceLabelsFlag = true;
std::cerr << "including target preference label information" << std::endl;
fileNameTargetPreferenceLabelSet = std::string(fileNamePhraseTable) + ".syntaxLabels.tgtpref";
std::cerr << "writing target preference label set to file " << fileNameTargetPreferenceLabelSet << std::endl;
fileNameLeftHandSideTargetPreferenceLabelCounts = std::string(fileNamePhraseTable) + ".tgtpref.lhs";
fileNameLeftHandSideRuleTargetTargetPreferenceLabelCounts = std::string(fileNamePhraseTable) + ".tgt-tgtpref.lhs";
std::cerr << "counting left-hand side target preference labels and writing them to files " << fileNameLeftHandSideTargetPreferenceLabelCounts << " and " << fileNameLeftHandSideRuleTargetTargetPreferenceLabelCounts << std::endl;
} else if (strcmp(argv[i],"--TargetSyntacticPreferences") == 0) {
targetSyntacticPreferencesFlag = true;
std::cerr << "including target syntactic preferences information" << std::endl;
fileNameTargetSyntacticPreferencesLabelSet = std::string(fileNamePhraseTable) + ".syntaxLabels.tgtpref";
std::cerr << "writing target syntactic preferences label set to file " << fileNameTargetSyntacticPreferencesLabelSet << std::endl;
fileNameLeftHandSideTargetSyntacticPreferencesLabelCounts = std::string(fileNamePhraseTable) + ".tgtpref.lhs";
fileNameLeftHandSideRuleTargetTargetSyntacticPreferencesLabelCounts = std::string(fileNamePhraseTable) + ".tgt-tgtpref.lhs";
std::cerr << "counting left-hand side target syntactic preferences labels and writing them to files "
<< fileNameLeftHandSideTargetSyntacticPreferencesLabelCounts
<< " and "
<< fileNameLeftHandSideRuleTargetTargetSyntacticPreferencesLabelCounts
<< std::endl;
} else if (strcmp(argv[i],"--UnpairedExtractFormat") == 0) {
unpairedExtractFormatFlag = true;
std::cerr << "processing unpaired extract format" << std::endl;
@ -508,13 +512,13 @@ int main(int argc, char* argv[])
writeLabelSet( partsOfSpeechSet, fileNamePartsOfSpeechSet );
}
// target preference labels
if (targetPreferenceLabelsFlag && !inverseFlag) {
writeLabelSet( targetPreferenceLabelSet, fileNameTargetPreferenceLabelSet );
writeLeftHandSideLabelCounts( targetPreferenceLHSCounts,
ruleTargetLHSAndTargetPreferenceLHSJointCounts,
fileNameLeftHandSideTargetPreferenceLabelCounts,
fileNameLeftHandSideRuleTargetTargetPreferenceLabelCounts );
// target syntactic preferences labels
if (targetSyntacticPreferencesFlag && !inverseFlag) {
writeLabelSet( targetSyntacticPreferencesLabelSet, fileNameTargetSyntacticPreferencesLabelSet );
writeLeftHandSideLabelCounts( targetSyntacticPreferencesLHSCounts,
ruleTargetLHSAndTargetSyntacticPreferencesLHSJointCounts,
fileNameLeftHandSideTargetSyntacticPreferencesLabelCounts,
fileNameLeftHandSideRuleTargetTargetSyntacticPreferencesLabelCounts );
}
}
@ -874,7 +878,7 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
}
// syntax labels
if ((sourceSyntaxLabelsFlag || targetPreferenceLabelsFlag) && !inverseFlag) {
if ((sourceSyntaxLabelsFlag || targetSyntacticPreferencesFlag) && !inverseFlag) {
unsigned nNTs = 1;
for(size_t j=0; j<phraseSource->size()-1; ++j) {
if (isNonTerminal(vcbS.getWord( phraseSource->at(j) )))
@ -897,20 +901,20 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
<< "}}";
}
}
// target preference labels
if (targetPreferenceLabelsFlag) {
std::string targetPreferenceLabelCounts;
targetPreferenceLabelCounts = phrasePair.CollectAllLabelsSeparateLHSAndRHS("TargetPreferences",
targetPreferenceLabelSet,
targetPreferenceLHSCounts,
ruleTargetLHSAndTargetPreferenceLHSJointCounts,
vcbT);
if ( !targetPreferenceLabelCounts.empty() ) {
// target syntactic preferences labels
if (targetSyntacticPreferencesFlag) {
std::string targetSyntacticPreferencesLabelCounts;
targetSyntacticPreferencesLabelCounts = phrasePair.CollectAllLabelsSeparateLHSAndRHS("TargetPreferences",
targetSyntacticPreferencesLabelSet,
targetSyntacticPreferencesLHSCounts,
ruleTargetLHSAndTargetSyntacticPreferencesLHSJointCounts,
vcbT);
if (!targetSyntacticPreferencesLabelCounts.empty()) {
phraseTableFile << " {{TargetPreferences "
<< nNTs // for convenience: number of non-terminal symbols in this rule (incl. left hand side NT)
<< " "
<< count // rule count
<< targetPreferenceLabelCounts
<< targetSyntacticPreferencesLabelCounts
<< "}}";
}
}

View File

@ -2374,6 +2374,12 @@ sub define_training_extract_phrases {
$cmd .= "-ghkm ";
}
if (&get("TRAINING:target-syntactic-preferences")) {
$cmd .= "-target-syntactic-preferences ";
my $target_syntactic_preferences_labels_file = &versionize(&long_file_name("target-syntactic-preferences-labels","model",""));
$cmd .= "-target-syntactic-preferences-labels-file $target_syntactic_preferences_labels_file ";
}
if (&get("TRAINING:ghkm-tree-fragments")) {
$cmd .= "-ghkm-tree-fragments ";
}
@ -2433,6 +2439,12 @@ sub define_training_build_ttable {
$cmd .= "-phrase-orientation-priors-file $phrase_orientation_priors_file ";
}
if (&get("TRAINING:target-syntactic-preferences")) {
$cmd .= "-target-syntactic-preferences ";
my $target_syntactic_preferences_labels_file = &versionize(&long_file_name("target-syntactic-preferences-labels","model",""));
$cmd .= "-target-syntactic-preferences-labels-file $target_syntactic_preferences_labels_file ";
}
if (&get("TRAINING:ghkm-tree-fragments")) {
$cmd .= "-ghkm-tree-fragments ";
}
@ -2644,6 +2656,12 @@ sub define_training_create_config {
$cmd .= "-phrase-orientation ";
}
if (&get("TRAINING:target-syntactic-preferences")) {
$cmd .= "-target-syntactic-preferences ";
my $target_syntactic_preferences_labels_file = &versionize(&long_file_name("target-syntactic-preferences-labels","model",""));
$cmd .= "-target-syntactic-preferences-labels-file $target_syntactic_preferences_labels_file ";
}
if (&get("TRAINING:ghkm-source-labels")) {
$cmd .= "-ghkm-source-labels ";
my $source_labels_file = &versionize(&long_file_name("source-labels","model",""));

View File

@ -43,6 +43,7 @@ my $ptHalf = $ARGV[5]; # output
my $inverse = 0;
my $sourceLabelsFile;
my $partsOfSpeechFile;
my $targetSyntacticPreferencesLabelsFile;
my $otherExtractArgs= "";
for (my $i = 6; $i < $#ARGV; ++$i)
@ -57,6 +58,11 @@ for (my $i = 6; $i < $#ARGV; ++$i)
$otherExtractArgs .= "--PartsOfSpeech ";
next;
}
if ($ARGV[$i] eq '--TargetSyntacticPreferences') {
$targetSyntacticPreferencesLabelsFile = $ARGV[++$i];
$otherExtractArgs .= "--TargetSyntacticPreferences ";
next;
}
if ($ARGV[$i] eq '--Inverse') {
$inverse = 1;
$otherExtractArgs .= $ARGV[$i] ." ";
@ -289,11 +295,11 @@ if (-e $cocPath)
close(FHCOC);
}
# merge source label files
# merge source labels files
if (!$inverse && defined($sourceLabelsFile))
{
my $cmd = "(echo \"GlueTop 0\"; echo \"GlueX 1\"; echo \"SSTART 2\"; echo \"SEND 3\"; cat $TMPDIR/phrase-table.half.*.gz.syntaxLabels.src | LC_ALL=C sort | uniq | perl -pe \"s/\$/ \@{[\$.+3]}/\") > $sourceLabelsFile";
print STDERR "Merging source label files: $cmd \n";
print STDERR "Merging source labels files: $cmd \n";
`$cmd`;
}
@ -305,6 +311,13 @@ if (!$inverse && defined($partsOfSpeechFile))
`$cmd`;
}
# merge target syntactic preferences labels files
if (!$inverse && defined($targetSyntacticPreferencesLabelsFile))
{
my $cmd = "(echo \"GlueTop 0\"; echo \"GlueX 1\"; echo \"SSTART 2\"; echo \"SEND 3\"; cat $TMPDIR/phrase-table.half.*.gz.syntaxLabels.tgtpref | LC_ALL=C sort | uniq | perl -pe \"s/\$/ \@{[\$.+3]}/\") > $targetSyntacticPreferencesLabelsFile";
print STDERR "Merging target syntactic preferences labels files: $cmd \n";
`$cmd`;
}
$cmd = "rm -rf $TMPDIR \n";
print STDERR $cmd;

View File

@ -89,6 +89,8 @@ my($_EXTERNAL_BINDIR,
$_XML,
$_SOURCE_SYNTAX,
$_TARGET_SYNTAX,
$_TARGET_SYNTACTIC_PREFERENCES,
$_TARGET_SYNTACTIC_PREFERENCES_LABELS_FILE,
$_GLUE_GRAMMAR,
$_GLUE_GRAMMAR_FILE,
$_DONT_TUNE_GLUE_GRAMMAR,
@ -227,6 +229,8 @@ $_HELP = 1
'score-options=s' => \@_SCORE_OPTIONS,
'source-syntax' => \$_SOURCE_SYNTAX,
'target-syntax' => \$_TARGET_SYNTAX,
'target-syntactic-preferences' => \$_TARGET_SYNTACTIC_PREFERENCES,
'target-syntactic-preferences-labels-file=s' => \$_TARGET_SYNTACTIC_PREFERENCES_LABELS_FILE,
'use-syntax-input-weight-feature' => \$_USE_SYNTAX_INPUT_WEIGHT_FEATURE,
'xml' => \$_XML,
'no-word-alignment' => \$_OMIT_WORD_ALIGNMENT,
@ -1575,6 +1579,7 @@ sub extract_phrase {
{
$cmd .= " --SourceSyntax" if $_SOURCE_SYNTAX;
$cmd .= " --TargetSyntax" if $_TARGET_SYNTAX;
$cmd .= " --TargetSyntacticPreferences" if $_TARGET_SYNTACTIC_PREFERENCES;
$cmd .= " --MaxSpan $max_length";
}
$cmd .= " ".$_EXTRACT_OPTIONS if defined($_EXTRACT_OPTIONS);
@ -1712,8 +1717,8 @@ sub score_phrase_phrase_extract {
$CORE_SCORE_OPTIONS .= " --NoLex" if $NO_LEX;
$CORE_SCORE_OPTIONS .= " --Singleton" if $SINGLETON;
$CORE_SCORE_OPTIONS .= " --CrossedNonTerm" if $CROSSEDNONTERM;
$CORE_SCORE_OPTIONS .= " --SourceLabels" if $SOURCE_LABELS;
$CORE_SCORE_OPTIONS .= " --SourceLabelCountsLHS " if $SOURCE_LABEL_COUNTS_LHS;
$CORE_SCORE_OPTIONS .= " --SourceLabels" if $SOURCE_LABELS;
$CORE_SCORE_OPTIONS .= " --SourceLabelCountsLHS " if $SOURCE_LABEL_COUNTS_LHS;
my $substep = 1;
my $isParent = 1;
@ -1758,6 +1763,7 @@ sub score_phrase_phrase_extract {
$cmd .= " --PhraseOrientation" if $_PHRASE_ORIENTATION;
$cmd .= " --PhraseOrientationPriors $_PHRASE_ORIENTATION_PRIORS_FILE" if $_PHRASE_ORIENTATION && defined($_PHRASE_ORIENTATION_PRIORS_FILE);
$cmd .= " --SourceLabels $_GHKM_SOURCE_LABELS_FILE" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
$cmd .= " --TargetSyntacticPreferences $_TARGET_SYNTACTIC_PREFERENCES_LABELS_FILE" if $_TARGET_SYNTACTIC_PREFERENCES && defined($_TARGET_SYNTACTIC_PREFERENCES_LABELS_FILE);
$cmd .= " --PartsOfSpeech $_GHKM_PARTS_OF_SPEECH_FILE" if $_GHKM_PARTS_OF_SPEECH && defined($_GHKM_PARTS_OF_SPEECH_FILE);
$cmd .= " $DOMAIN" if $DOMAIN;
$cmd .= " $CORE_SCORE_OPTIONS" if defined($_SCORE_OPTIONS);
@ -1811,6 +1817,7 @@ sub score_phrase_phrase_extract {
$cmd .= " --GoodTuring $ttable_file.half.f2e.gz.coc" if $GOOD_TURING;
$cmd .= " --KneserNey $ttable_file.half.f2e.gz.coc" if $KNESER_NEY;
$cmd .= " --SourceLabels $_GHKM_SOURCE_LABELS_FILE" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
$cmd .= " --TargetSyntacticPreferences $_TARGET_SYNTACTIC_PREFERENCES_LABELS_FILE" if $_TARGET_SYNTACTIC_PREFERENCES && defined($_TARGET_SYNTACTIC_PREFERENCES_LABELS_FILE);
$cmd .= " --PartsOfSpeech $_GHKM_PARTS_OF_SPEECH_FILE" if $_GHKM_PARTS_OF_SPEECH && defined($_GHKM_PARTS_OF_SPEECH_FILE);
$cmd .= " | $GZIP_EXEC -c > $ttable_file.gz";