mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-10-05 15:58:03 +03:00
preparing extraction of Hiero soft syntactic preferences (target syntax)
This commit is contained in:
parent
8750c71ef4
commit
1d3feba8d0
@ -45,6 +45,7 @@ public:
|
||||
std::string targetContextRight;
|
||||
std::string sourceHoleString;
|
||||
std::string targetHoleString;
|
||||
std::string targetSyntacticPreference;
|
||||
int startT;
|
||||
int endT;
|
||||
int startS;
|
||||
@ -65,6 +66,7 @@ public:
|
||||
, targetContextRight()
|
||||
, sourceHoleString()
|
||||
, targetHoleString()
|
||||
, targetSyntacticPreference()
|
||||
, startT(sT)
|
||||
, endT(eT)
|
||||
, startS(sS)
|
||||
|
@ -83,6 +83,32 @@ void PropertiesConsolidator::ActivatePartsOfSpeechProcessing(const std::string &
|
||||
}
|
||||
|
||||
|
||||
void PropertiesConsolidator::ActivateTargetSyntacticPreferencesProcessing(const std::string &targetSyntacticPreferencesLabelSetFile)
|
||||
{
|
||||
Moses::InputFileStream inFile(targetSyntacticPreferencesLabelSetFile);
|
||||
|
||||
// read target syntactic preferences label set
|
||||
m_targetSyntacticPreferencesLabels.clear();
|
||||
std::string line;
|
||||
while (getline(inFile, line)) {
|
||||
std::istringstream tokenizer(line);
|
||||
std::string label;
|
||||
size_t index;
|
||||
try {
|
||||
tokenizer >> label >> index;
|
||||
} catch (const std::exception &e) {
|
||||
UTIL_THROW2("Error reading target syntactic preferences label set file " << targetSyntacticPreferencesLabelSetFile << " .");
|
||||
}
|
||||
std::pair< std::map<std::string,size_t>::iterator, bool > inserted = m_targetSyntacticPreferencesLabels.insert( std::pair<std::string,size_t>(label,index) );
|
||||
UTIL_THROW_IF2(!inserted.second,"Target syntactic preferences label set file " << targetSyntacticPreferencesLabelSetFile << " should contain each syntactic label only once.");
|
||||
}
|
||||
|
||||
inFile.Close();
|
||||
|
||||
m_targetSyntacticPreferencesFlag = true;
|
||||
}
|
||||
|
||||
|
||||
void PropertiesConsolidator::ProcessPropertiesString(const std::string &propertiesString, Moses::OutputFileStream& out) const
|
||||
{
|
||||
if ( propertiesString.empty() ) {
|
||||
@ -129,6 +155,19 @@ void PropertiesConsolidator::ProcessPropertiesString(const std::string &properti
|
||||
}
|
||||
*/
|
||||
|
||||
} else if ( !keyValue[0].compare("TargetPreferences") ) {
|
||||
|
||||
if ( m_targetSyntacticPreferencesFlag ) {
|
||||
|
||||
// TargetPreferences property: replace strings with vocabulary indices
|
||||
out << " {{" << keyValue[0];
|
||||
ProcessTargetSyntacticPreferencesPropertyValue(keyValue[1], out);
|
||||
out << "}}";
|
||||
|
||||
} else { // don't process TargetPreferences property
|
||||
out << " {{" << keyValue[0] << " " << keyValue[1] << "}}";
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
// output other property
|
||||
@ -246,5 +285,66 @@ bool PropertiesConsolidator::GetPOSPropertyValueFromPropertiesString(const std::
|
||||
}
|
||||
|
||||
|
||||
void PropertiesConsolidator::ProcessTargetSyntacticPreferencesPropertyValue(const std::string &value, Moses::OutputFileStream& out) const
|
||||
{
|
||||
// TargetPreferences property: replace strings with vocabulary indices
|
||||
std::istringstream tokenizer(value);
|
||||
|
||||
size_t nNTs;
|
||||
double totalCount;
|
||||
|
||||
if (! (tokenizer >> nNTs)) { // first token: number of non-terminals (incl. left-hand side)
|
||||
UTIL_THROW2("Not able to read number of non-terminals from TargetPreferences property. "
|
||||
<< "Flawed TargetPreferences property?");
|
||||
}
|
||||
assert( nNTs > 0 );
|
||||
out << " " << nNTs;
|
||||
|
||||
if (! (tokenizer >> totalCount)) { // second token: overall rule count
|
||||
UTIL_THROW2("Not able to read overall rule count from TargetPreferences property. "
|
||||
<< "Flawed TargetPreferences property?");
|
||||
}
|
||||
assert( totalCount > 0.0 );
|
||||
out << " " << totalCount;
|
||||
|
||||
while (tokenizer.peek() != EOF) {
|
||||
try {
|
||||
|
||||
size_t numberOfLHSsGivenRHS = std::numeric_limits<std::size_t>::max();
|
||||
|
||||
std::string token;
|
||||
|
||||
if (nNTs > 1) { // rule has right-hand side non-terminals, i.e. it's a hierarchical rule
|
||||
for (size_t i=0; i<nNTs-1; ++i) { // RHS target preference non-terminal labels
|
||||
tokenizer >> token; // RHS target preference non-terminal label
|
||||
std::map<std::string,size_t>::const_iterator found = m_targetSyntacticPreferencesLabels.find(token);
|
||||
UTIL_THROW_IF2(found == m_targetSyntacticPreferencesLabels.end(), "Label \"" << token << "\" from the phrase table not found in given label set.");
|
||||
out << " " << found->second;
|
||||
}
|
||||
|
||||
tokenizer >> token; // targetPreferenceRHSCount
|
||||
out << " " << token;
|
||||
|
||||
tokenizer >> numberOfLHSsGivenRHS;
|
||||
out << " " << numberOfLHSsGivenRHS;
|
||||
}
|
||||
|
||||
for (size_t i=0; i<numberOfLHSsGivenRHS && tokenizer.peek()!=EOF; ++i) { // LHS target preference non-terminal labels seen with this RHS
|
||||
tokenizer >> token; // LHS target preference non-terminal label
|
||||
std::map<std::string,size_t>::const_iterator found = m_targetSyntacticPreferencesLabels.find(token);
|
||||
UTIL_THROW_IF2(found == m_targetSyntacticPreferencesLabels.end() ,"Label \"" << token << "\" from the phrase table not found in given label set.");
|
||||
out << " " << found->second;
|
||||
|
||||
tokenizer >> token; // ruleTargetPreferenceLabelledCount
|
||||
out << " " << token;
|
||||
}
|
||||
|
||||
} catch (const std::exception &e) {
|
||||
UTIL_THROW2("Flawed item in TargetPreferences property?");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} // namespace MosesTraining
|
||||
|
||||
|
@ -34,10 +34,15 @@ class PropertiesConsolidator
|
||||
{
|
||||
public:
|
||||
|
||||
PropertiesConsolidator() : m_sourceLabelsFlag(false) {};
|
||||
PropertiesConsolidator()
|
||||
: m_sourceLabelsFlag(false)
|
||||
, m_partsOfSpeechFlag(false)
|
||||
, m_targetSyntacticPreferencesFlag(false)
|
||||
{};
|
||||
|
||||
void ActivateSourceLabelsProcessing(const std::string &sourceLabelSetFile);
|
||||
void ActivatePartsOfSpeechProcessing(const std::string &partsOfSpeechFile);
|
||||
void ActivateTargetSyntacticPreferencesProcessing(const std::string &targetSyntacticPreferencesLabelSetFile);
|
||||
|
||||
bool GetPOSPropertyValueFromPropertiesString(const std::string &propertiesString, std::vector<std::string>& out) const;
|
||||
|
||||
@ -47,11 +52,14 @@ protected:
|
||||
|
||||
void ProcessSourceLabelsPropertyValue(const std::string &value, Moses::OutputFileStream& out) const;
|
||||
void ProcessPOSPropertyValue(const std::string &value, Moses::OutputFileStream& out) const;
|
||||
void ProcessTargetSyntacticPreferencesPropertyValue(const std::string &value, Moses::OutputFileStream& out) const;
|
||||
|
||||
bool m_sourceLabelsFlag;
|
||||
std::map<std::string,size_t> m_sourceLabels;
|
||||
bool m_partsOfSpeechFlag;
|
||||
std::map<std::string,size_t> m_partsOfSpeechVocabulary;
|
||||
bool m_targetSyntacticPreferencesFlag;
|
||||
std::map<std::string,size_t> m_targetSyntacticPreferencesLabels;
|
||||
|
||||
};
|
||||
|
||||
|
@ -46,6 +46,7 @@ public:
|
||||
bool requireAlignedWord;
|
||||
bool sourceSyntax;
|
||||
bool targetSyntax;
|
||||
bool targetSyntacticPreferences;
|
||||
bool duplicateRules;
|
||||
bool fractionalCounting;
|
||||
bool pcfgScore;
|
||||
@ -80,6 +81,7 @@ public:
|
||||
, requireAlignedWord(true)
|
||||
, sourceSyntax(false)
|
||||
, targetSyntax(false)
|
||||
, targetSyntacticPreferences(false)
|
||||
, duplicateRules(true)
|
||||
, fractionalCounting(true)
|
||||
, pcfgScore(false)
|
||||
|
@ -38,6 +38,7 @@ bool onlyDirectFlag = false;
|
||||
bool partsOfSpeechFlag = false;
|
||||
bool phraseCountFlag = false;
|
||||
bool sourceLabelsFlag = false;
|
||||
bool targetSyntacticPreferencesFlag = false;
|
||||
bool sparseCountBinFeatureFlag = false;
|
||||
|
||||
std::vector< int > countBin;
|
||||
@ -49,7 +50,7 @@ std::vector< float > goodTuringDiscount;
|
||||
float kneserNey_D1, kneserNey_D2, kneserNey_D3, totalCount = -1;
|
||||
|
||||
|
||||
void processFiles( const std::string&, const std::string&, const std::string&, const std::string&, const std::string&, const std::string& );
|
||||
void processFiles( const std::string&, const std::string&, const std::string&, const std::string&, const std::string&, const std::string&, const std::string& );
|
||||
void loadCountOfCounts( const std::string& );
|
||||
void breakdownCoreAndSparse( const std::string &combined, std::string &core, std::string &sparse );
|
||||
bool getLine( Moses::InputFileStream &file, std::vector< std::string > &item );
|
||||
@ -93,6 +94,7 @@ int main(int argc, char* argv[])
|
||||
std::string fileNameCountOfCounts;
|
||||
std::string fileNameSourceLabelSet;
|
||||
std::string fileNamePartsOfSpeechVocabulary;
|
||||
std::string fileNameTargetSyntacticPreferencesLabelSet;
|
||||
|
||||
for(int i=4; i<argc; i++) {
|
||||
if (strcmp(argv[i],"--Hierarchical") == 0) {
|
||||
@ -150,6 +152,11 @@ int main(int argc, char* argv[])
|
||||
UTIL_THROW_IF2(i+1==argc, "specify parts-of-speech file!");
|
||||
fileNamePartsOfSpeechVocabulary = argv[++i];
|
||||
std::cerr << "processing parts-of-speech property" << std::endl;
|
||||
} else if (strcmp(argv[i],"--TargetSyntacticPreferences") == 0) {
|
||||
targetSyntacticPreferencesFlag = true;
|
||||
UTIL_THROW_IF2(i+1==argc, "specify target syntactic preferences label set file!");
|
||||
fileNameTargetSyntacticPreferencesLabelSet = argv[++i];
|
||||
std::cerr << "processing target syntactic preferences property" << std::endl;
|
||||
} else if (strcmp(argv[i],"--MinScore") == 0) {
|
||||
std::string setting = argv[++i];
|
||||
bool done = false;
|
||||
@ -182,7 +189,7 @@ int main(int argc, char* argv[])
|
||||
}
|
||||
}
|
||||
|
||||
processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated, fileNameCountOfCounts, fileNameSourceLabelSet, fileNamePartsOfSpeechVocabulary );
|
||||
processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated, fileNameCountOfCounts, fileNameSourceLabelSet, fileNamePartsOfSpeechVocabulary, fileNameTargetSyntacticPreferencesLabelSet );
|
||||
}
|
||||
|
||||
|
||||
@ -231,7 +238,8 @@ void processFiles( const std::string& fileNameDirect,
|
||||
const std::string& fileNameConsolidated,
|
||||
const std::string& fileNameCountOfCounts,
|
||||
const std::string& fileNameSourceLabelSet,
|
||||
const std::string& fileNamePartsOfSpeechVocabulary )
|
||||
const std::string& fileNamePartsOfSpeechVocabulary,
|
||||
const std::string& fileNameTargetSyntacticPreferencesLabelSet )
|
||||
{
|
||||
if (goodTuringFlag || kneserNeyFlag)
|
||||
loadCountOfCounts( fileNameCountOfCounts );
|
||||
@ -256,6 +264,9 @@ void processFiles( const std::string& fileNameDirect,
|
||||
if (partsOfSpeechFlag) {
|
||||
propertiesConsolidator.ActivatePartsOfSpeechProcessing(fileNamePartsOfSpeechVocabulary);
|
||||
}
|
||||
if (targetSyntacticPreferencesFlag) {
|
||||
propertiesConsolidator.ActivateTargetSyntacticPreferencesProcessing(fileNameTargetSyntacticPreferencesLabelSet);
|
||||
}
|
||||
|
||||
// loop through all extracted phrase translations
|
||||
int i=0;
|
||||
|
@ -79,14 +79,15 @@ private:
|
||||
, RuleExist &ruleExist, HoleCollection &holeColl, int numHoles, int initStartF, int wordCountT, int wordCountS);
|
||||
void saveHieroPhrase( int startT, int endT, int startS, int endS
|
||||
, HoleCollection &holeColl, LabelIndex &labelIndex, int countS);
|
||||
string saveTargetHieroPhrase( int startT, int endT, int startS, int endS
|
||||
, WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore, int countS);
|
||||
string saveTargetHieroPhrase( int startT, int endT, int startS, int endS
|
||||
, WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore, int countS);
|
||||
string saveSourceHieroPhrase( int startT, int endT, int startS, int endS
|
||||
, HoleCollection &holeColl, const LabelIndex &labelIndex);
|
||||
void preprocessSourceHieroPhrase( int startT, int endT, int startS, int endS
|
||||
, WordIndex &indexS, HoleCollection &holeColl, const LabelIndex &labelIndex);
|
||||
void saveHieroAlignment( int startT, int endT, int startS, int endS
|
||||
, const WordIndex &indexS, const WordIndex &indexT, HoleCollection &holeColl, ExtractedRule &rule);
|
||||
void saveTargetSyntacticPreference( const HoleCollection &holeColl, const LabelIndex &labelIndex, ExtractedRule &rule);
|
||||
void saveAllHieroPhrases( int startT, int endT, int startS, int endS, HoleCollection &holeColl, int countS);
|
||||
|
||||
inline string IntToString( int i ) {
|
||||
@ -225,6 +226,8 @@ int main(int argc, char* argv[])
|
||||
// allow consecutive non-terminals (X Y | X Y)
|
||||
else if (strcmp(argv[i],"--TargetSyntax") == 0) {
|
||||
options.targetSyntax = true;
|
||||
} else if (strcmp(argv[i],"--TargetSyntacticPreferences") == 0) {
|
||||
options.targetSyntacticPreferences = true;
|
||||
} else if (strcmp(argv[i],"--SourceSyntax") == 0) {
|
||||
options.sourceSyntax = true;
|
||||
} else if (strcmp(argv[i],"--AllowOnlyUnalignedWords") == 0) {
|
||||
@ -422,7 +425,8 @@ void ExtractTask::extractRules()
|
||||
int endT = startT + lengthT - 1;
|
||||
|
||||
// if there is target side syntax, there has to be a node
|
||||
if (m_options.targetSyntax && !m_sentence.targetTree.HasNode(startT,endT))
|
||||
if (m_options.targetSyntax && !m_options.targetSyntacticPreferences && !m_sentence.targetTree.HasNode(startT,endT))
|
||||
// if (m_options.targetSyntax && !m_sentence.targetTree.HasNode(startT,endT))
|
||||
continue;
|
||||
|
||||
// find aligned source words
|
||||
@ -566,7 +570,7 @@ string ExtractTask::saveTargetHieroPhrase( int startT, int endT, int startS, int
|
||||
|
||||
int labelI = labelIndex[ 2+holeCount ];
|
||||
string targetLabel;
|
||||
if (m_options.targetSyntax) {
|
||||
if (m_options.targetSyntax && !m_options.targetSyntacticPreferences) {
|
||||
targetLabel = m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->label;
|
||||
} else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
|
||||
targetLabel = "S";
|
||||
@ -628,7 +632,7 @@ string ExtractTask::saveSourceHieroPhrase( int startT, int endT, int startS, int
|
||||
if (m_options.unpairedExtractFormat) {
|
||||
out += "[" + sourceLabel + "] ";
|
||||
} else {
|
||||
out += "[" + sourceLabel + "][" + targetLabel + "] ";
|
||||
out += "[" + sourceLabel + "][" + (m_options.targetSyntacticPreferences ? "X" : targetLabel) + "] ";
|
||||
}
|
||||
|
||||
currPos = hole.GetEnd(0);
|
||||
@ -682,6 +686,33 @@ void ExtractTask::saveHieroAlignment( int startT, int endT, int startS, int endS
|
||||
}
|
||||
}
|
||||
|
||||
void ExtractTask::saveTargetSyntacticPreference( const HoleCollection &holeColl, const LabelIndex &labelIndex, ExtractedRule &rule)
|
||||
{
|
||||
rule.targetSyntacticPreference = "";
|
||||
int holeCount = 0;
|
||||
for (HoleList::const_iterator iterHoleList = holeColl.GetHoles().begin();
|
||||
iterHoleList != holeColl.GetHoles().end();
|
||||
++iterHoleList) {
|
||||
|
||||
const Hole &hole = *iterHoleList;
|
||||
|
||||
int labelI = labelIndex[ 2+holeCount ];
|
||||
string targetLabel = "X";
|
||||
int startT = hole.GetStart(1);
|
||||
int endT = hole.GetEnd(1);
|
||||
if (m_sentence.targetTree.HasNode(startT,endT)) {
|
||||
rule.targetSyntacticPreference += m_sentence.targetTree.GetNodes(startT,endT)[labelI]->label;
|
||||
rule.targetSyntacticPreference += " ";
|
||||
} else {
|
||||
rule.targetSyntacticPreference += "X ";
|
||||
}
|
||||
++holeCount;
|
||||
}
|
||||
|
||||
rule.targetSyntacticPreference.erase(rule.targetSyntacticPreference.size()-1);
|
||||
}
|
||||
|
||||
|
||||
void ExtractTask::saveHieroPhrase( int startT, int endT, int startS, int endS
|
||||
, HoleCollection &holeColl, LabelIndex &labelIndex, int countS)
|
||||
{
|
||||
@ -691,7 +722,8 @@ void ExtractTask::saveHieroPhrase( int startT, int endT, int startS, int endS
|
||||
|
||||
// phrase labels
|
||||
string targetLabel;
|
||||
if (m_options.targetSyntax) {
|
||||
// if (m_options.targetSyntax && m_sentence.targetTree.HasNode(startT,endT)) {
|
||||
if (m_options.targetSyntax && !m_options.targetSyntacticPreferences) {
|
||||
targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0] ]->label;
|
||||
} else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
|
||||
targetLabel = "S";
|
||||
@ -776,6 +808,17 @@ void ExtractTask::saveHieroPhrase( int startT, int endT, int startS, int endS
|
||||
// std::cerr << "phraseOrientationR2L " << m_phraseOrientation.GetOrientationInfo(startS,endS,PhraseOrientation::REO_DIR_R2L) << std::endl;
|
||||
}
|
||||
|
||||
// target syntactic preferences
|
||||
if (m_options.targetSyntacticPreferences) {
|
||||
saveTargetSyntacticPreference(holeColl, labelIndex, rule);
|
||||
if (m_sentence.targetTree.HasNode(startT,endT)) {
|
||||
rule.targetSyntacticPreference += " ";
|
||||
rule.targetSyntacticPreference += m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0] ]->label;
|
||||
} else {
|
||||
rule.targetSyntacticPreference += " X";
|
||||
}
|
||||
}
|
||||
|
||||
addRuleToCollection( rule );
|
||||
}
|
||||
|
||||
@ -785,6 +828,9 @@ void ExtractTask::saveAllHieroPhrases( int startT, int endT, int startS, int end
|
||||
|
||||
// number of target head labels
|
||||
int numLabels = m_options.targetSyntax ? m_sentence.targetTree.GetNodes(startT,endT).size() : 1;
|
||||
if (m_options.targetSyntacticPreferences && !numLabels) {
|
||||
numLabels++;
|
||||
}
|
||||
labelCount.push_back(numLabels);
|
||||
labelIndex.push_back(0);
|
||||
|
||||
@ -796,7 +842,10 @@ void ExtractTask::saveAllHieroPhrases( int startT, int endT, int startS, int end
|
||||
// number of target hole labels
|
||||
for( HoleList::const_iterator hole = holeColl.GetHoles().begin();
|
||||
hole != holeColl.GetHoles().end(); hole++ ) {
|
||||
int numLabels = m_options.targetSyntax ? m_sentence.targetTree.GetNodes(hole->GetStart(1),hole->GetEnd(1)).size() : 1 ;
|
||||
int numLabels = m_options.targetSyntax ? m_sentence.targetTree.GetNodes(hole->GetStart(1),hole->GetEnd(1)).size() : 1 ;
|
||||
if (m_options.targetSyntacticPreferences && !numLabels) {
|
||||
numLabels++;
|
||||
}
|
||||
labelCount.push_back(numLabels);
|
||||
labelIndex.push_back(0);
|
||||
}
|
||||
@ -973,12 +1022,19 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, int count
|
||||
// phrase labels
|
||||
string targetLabel,sourceLabel;
|
||||
if (m_options.targetSyntax && m_options.conditionOnTargetLhs) {
|
||||
sourceLabel = targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->label;
|
||||
if (m_sentence.targetTree.HasNode(startT,endT) && !m_options.targetSyntacticPreferences) {
|
||||
sourceLabel = targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->label;
|
||||
} else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
|
||||
sourceLabel = "S";
|
||||
} else {
|
||||
sourceLabel = "X";
|
||||
}
|
||||
} else {
|
||||
sourceLabel = m_options.sourceSyntax ?
|
||||
m_sentence.sourceTree.GetNodes(startS,endS)[0]->label : "X";
|
||||
|
||||
if (m_options.targetSyntax) {
|
||||
if (m_options.targetSyntax && !m_options.targetSyntacticPreferences) {
|
||||
// if (m_options.targetSyntax && !m_options.targetSyntacticPreferences && !m_sentence.targetTree.HasNode(startT,endT))
|
||||
targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->label;
|
||||
} else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
|
||||
targetLabel = "S";
|
||||
@ -1037,6 +1093,15 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, int count
|
||||
// std::cerr << "phraseOrientationR2L " << m_phraseOrientation.GetOrientationInfo(startS,endS,PhraseOrientation::REO_DIR_R2L) << std::endl;
|
||||
}
|
||||
|
||||
// target syntactic preferences
|
||||
if (m_options.targetSyntacticPreferences) {
|
||||
if (m_sentence.targetTree.HasNode(startT,endT)) {
|
||||
rule.targetSyntacticPreference += m_sentence.targetTree.GetNodes(startT,endT)[0]->label;
|
||||
} else {
|
||||
rule.targetSyntacticPreference += "X";
|
||||
}
|
||||
}
|
||||
|
||||
addRuleToCollection( rule );
|
||||
}
|
||||
|
||||
@ -1114,6 +1179,11 @@ void ExtractTask::writeRulesToFile()
|
||||
m_phraseOrientation.IncrementPriorCount(PhraseOrientation::REO_DIR_R2L,rule->r2lOrientation,1);
|
||||
out << "}}";
|
||||
}
|
||||
if (m_options.targetSyntacticPreferences) {
|
||||
out << " {{TargetPreferences ";
|
||||
out << rule->targetSyntacticPreference;
|
||||
out << "}}";
|
||||
}
|
||||
out << "\n";
|
||||
|
||||
if (!m_options.onlyDirectFlag) {
|
||||
@ -1167,12 +1237,12 @@ void writeGlueGrammar( const string & fileName, RuleExtractionOptions &options,
|
||||
if (options.phraseOrientation) {
|
||||
glueRulesPhraseProperty.append(" ||| ||| {{Orientation 1 1 0.5 0.5 1 1 0.5 0.5}}");
|
||||
}
|
||||
if (!options.targetSyntax) {
|
||||
if (!options.targetSyntax || options.targetSyntacticPreferences) {
|
||||
grammarFile << "<s> [X] ||| <s> [S] ||| 1 ||| 0-0 ||| 0" << glueRulesPhraseProperty << endl
|
||||
<< "[X][S] </s> [X] ||| [X][S] </s> [S] ||| 1 ||| 0-0 1-1 ||| 0" << glueRulesPhraseProperty << endl
|
||||
<< "[X][S] [X][X] [X] ||| [X][S] [X][X] [S] ||| 2.718 ||| 0-0 1-1 ||| 0" << glueRulesPhraseProperty << endl;
|
||||
} else {
|
||||
// chose a top label that is not already a label
|
||||
// choose a top label that is not already a label
|
||||
string topLabel = "QQQQQQ";
|
||||
for( unsigned int i=1; i<=topLabel.length(); i++) {
|
||||
if(targetLabelCollection.find( topLabel.substr(0,i) ) == targetLabelCollection.end() ) {
|
||||
@ -1202,7 +1272,7 @@ void writeGlueGrammar( const string & fileName, RuleExtractionOptions &options,
|
||||
|
||||
// collect counts for labels for each word
|
||||
// ( labels of singleton words are used to estimate
|
||||
// distribution oflabels for unknown words )
|
||||
// distribution of labels for unknown words )
|
||||
|
||||
map<string,int> wordCount;
|
||||
map<string,string> wordLabel;
|
||||
|
@ -51,7 +51,7 @@ bool treeFragmentsFlag = false;
|
||||
bool partsOfSpeechFlag = false;
|
||||
bool sourceSyntaxLabelsFlag = false;
|
||||
bool sourceSyntaxLabelCountsLHSFlag = false;
|
||||
bool targetPreferenceLabelsFlag = false;
|
||||
bool targetSyntacticPreferencesFlag = false;
|
||||
bool unpairedExtractFormatFlag = false;
|
||||
bool conditionOnTargetLhsFlag = false;
|
||||
bool wordAlignmentFlag = true;
|
||||
@ -83,11 +83,11 @@ std::vector<std::string> sourceLabelsByIndex;
|
||||
|
||||
std::set<std::string> partsOfSpeechSet;
|
||||
|
||||
boost::unordered_map<std::string,float> targetPreferenceLHSCounts;
|
||||
boost::unordered_map<std::string, boost::unordered_map<std::string,float>* > ruleTargetLHSAndTargetPreferenceLHSJointCounts;
|
||||
std::set<std::string> targetPreferenceLabelSet;
|
||||
std::map<std::string,size_t> targetPreferenceLabels;
|
||||
std::vector<std::string> targetPreferenceLabelsByIndex;
|
||||
boost::unordered_map<std::string,float> targetSyntacticPreferencesLHSCounts;
|
||||
boost::unordered_map<std::string, boost::unordered_map<std::string,float>* > ruleTargetLHSAndTargetSyntacticPreferencesLHSJointCounts;
|
||||
std::set<std::string> targetSyntacticPreferencesLabelSet;
|
||||
std::map<std::string,size_t> targetSyntacticPreferencesLabels;
|
||||
std::vector<std::string> targetSyntacticPreferencesLabelsByIndex;
|
||||
|
||||
std::vector<float> orientationClassPriorsL2R(4,0); // mono swap dleft dright
|
||||
std::vector<float> orientationClassPriorsR2L(4,0); // mono swap dleft dright
|
||||
@ -150,7 +150,7 @@ int main(int argc, char* argv[])
|
||||
"[--TreeFragments] "
|
||||
"[--SourceLabels] "
|
||||
"[--SourceLabelCountsLHS] "
|
||||
"[--TargetPreferenceLabels] "
|
||||
"[--TargetSyntacticPreferences] "
|
||||
"[--UnpairedExtractFormat] "
|
||||
"[--ConditionOnTargetLHS] "
|
||||
"[--CrossedNonTerm]"
|
||||
@ -167,9 +167,9 @@ int main(int argc, char* argv[])
|
||||
std::string fileNameFunctionWords;
|
||||
std::string fileNameLeftHandSideSourceLabelCounts;
|
||||
std::string fileNameLeftHandSideTargetSourceLabelCounts;
|
||||
std::string fileNameTargetPreferenceLabelSet;
|
||||
std::string fileNameLeftHandSideTargetPreferenceLabelCounts;
|
||||
std::string fileNameLeftHandSideRuleTargetTargetPreferenceLabelCounts;
|
||||
std::string fileNameTargetSyntacticPreferencesLabelSet;
|
||||
std::string fileNameLeftHandSideTargetSyntacticPreferencesLabelCounts;
|
||||
std::string fileNameLeftHandSideRuleTargetTargetSyntacticPreferencesLabelCounts;
|
||||
std::string fileNamePhraseOrientationPriors;
|
||||
// All unknown args are passed to feature manager.
|
||||
std::vector<std::string> featureArgs;
|
||||
@ -205,14 +205,18 @@ int main(int argc, char* argv[])
|
||||
fileNameLeftHandSideSourceLabelCounts = std::string(fileNamePhraseTable) + ".src.lhs";
|
||||
fileNameLeftHandSideTargetSourceLabelCounts = std::string(fileNamePhraseTable) + ".tgt-src.lhs";
|
||||
std::cerr << "counting left-hand side source labels and writing them to files " << fileNameLeftHandSideSourceLabelCounts << " and " << fileNameLeftHandSideTargetSourceLabelCounts << std::endl;
|
||||
} else if (strcmp(argv[i],"--TargetPreferenceLabels") == 0) {
|
||||
targetPreferenceLabelsFlag = true;
|
||||
std::cerr << "including target preference label information" << std::endl;
|
||||
fileNameTargetPreferenceLabelSet = std::string(fileNamePhraseTable) + ".syntaxLabels.tgtpref";
|
||||
std::cerr << "writing target preference label set to file " << fileNameTargetPreferenceLabelSet << std::endl;
|
||||
fileNameLeftHandSideTargetPreferenceLabelCounts = std::string(fileNamePhraseTable) + ".tgtpref.lhs";
|
||||
fileNameLeftHandSideRuleTargetTargetPreferenceLabelCounts = std::string(fileNamePhraseTable) + ".tgt-tgtpref.lhs";
|
||||
std::cerr << "counting left-hand side target preference labels and writing them to files " << fileNameLeftHandSideTargetPreferenceLabelCounts << " and " << fileNameLeftHandSideRuleTargetTargetPreferenceLabelCounts << std::endl;
|
||||
} else if (strcmp(argv[i],"--TargetSyntacticPreferences") == 0) {
|
||||
targetSyntacticPreferencesFlag = true;
|
||||
std::cerr << "including target syntactic preferences information" << std::endl;
|
||||
fileNameTargetSyntacticPreferencesLabelSet = std::string(fileNamePhraseTable) + ".syntaxLabels.tgtpref";
|
||||
std::cerr << "writing target syntactic preferences label set to file " << fileNameTargetSyntacticPreferencesLabelSet << std::endl;
|
||||
fileNameLeftHandSideTargetSyntacticPreferencesLabelCounts = std::string(fileNamePhraseTable) + ".tgtpref.lhs";
|
||||
fileNameLeftHandSideRuleTargetTargetSyntacticPreferencesLabelCounts = std::string(fileNamePhraseTable) + ".tgt-tgtpref.lhs";
|
||||
std::cerr << "counting left-hand side target syntactic preferences labels and writing them to files "
|
||||
<< fileNameLeftHandSideTargetSyntacticPreferencesLabelCounts
|
||||
<< " and "
|
||||
<< fileNameLeftHandSideRuleTargetTargetSyntacticPreferencesLabelCounts
|
||||
<< std::endl;
|
||||
} else if (strcmp(argv[i],"--UnpairedExtractFormat") == 0) {
|
||||
unpairedExtractFormatFlag = true;
|
||||
std::cerr << "processing unpaired extract format" << std::endl;
|
||||
@ -508,13 +512,13 @@ int main(int argc, char* argv[])
|
||||
writeLabelSet( partsOfSpeechSet, fileNamePartsOfSpeechSet );
|
||||
}
|
||||
|
||||
// target preference labels
|
||||
if (targetPreferenceLabelsFlag && !inverseFlag) {
|
||||
writeLabelSet( targetPreferenceLabelSet, fileNameTargetPreferenceLabelSet );
|
||||
writeLeftHandSideLabelCounts( targetPreferenceLHSCounts,
|
||||
ruleTargetLHSAndTargetPreferenceLHSJointCounts,
|
||||
fileNameLeftHandSideTargetPreferenceLabelCounts,
|
||||
fileNameLeftHandSideRuleTargetTargetPreferenceLabelCounts );
|
||||
// target syntactic preferences labels
|
||||
if (targetSyntacticPreferencesFlag && !inverseFlag) {
|
||||
writeLabelSet( targetSyntacticPreferencesLabelSet, fileNameTargetSyntacticPreferencesLabelSet );
|
||||
writeLeftHandSideLabelCounts( targetSyntacticPreferencesLHSCounts,
|
||||
ruleTargetLHSAndTargetSyntacticPreferencesLHSJointCounts,
|
||||
fileNameLeftHandSideTargetSyntacticPreferencesLabelCounts,
|
||||
fileNameLeftHandSideRuleTargetTargetSyntacticPreferencesLabelCounts );
|
||||
}
|
||||
}
|
||||
|
||||
@ -874,7 +878,7 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
|
||||
}
|
||||
|
||||
// syntax labels
|
||||
if ((sourceSyntaxLabelsFlag || targetPreferenceLabelsFlag) && !inverseFlag) {
|
||||
if ((sourceSyntaxLabelsFlag || targetSyntacticPreferencesFlag) && !inverseFlag) {
|
||||
unsigned nNTs = 1;
|
||||
for(size_t j=0; j<phraseSource->size()-1; ++j) {
|
||||
if (isNonTerminal(vcbS.getWord( phraseSource->at(j) )))
|
||||
@ -897,20 +901,20 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
|
||||
<< "}}";
|
||||
}
|
||||
}
|
||||
// target preference labels
|
||||
if (targetPreferenceLabelsFlag) {
|
||||
std::string targetPreferenceLabelCounts;
|
||||
targetPreferenceLabelCounts = phrasePair.CollectAllLabelsSeparateLHSAndRHS("TargetPreferences",
|
||||
targetPreferenceLabelSet,
|
||||
targetPreferenceLHSCounts,
|
||||
ruleTargetLHSAndTargetPreferenceLHSJointCounts,
|
||||
vcbT);
|
||||
if ( !targetPreferenceLabelCounts.empty() ) {
|
||||
// target syntactic preferences labels
|
||||
if (targetSyntacticPreferencesFlag) {
|
||||
std::string targetSyntacticPreferencesLabelCounts;
|
||||
targetSyntacticPreferencesLabelCounts = phrasePair.CollectAllLabelsSeparateLHSAndRHS("TargetPreferences",
|
||||
targetSyntacticPreferencesLabelSet,
|
||||
targetSyntacticPreferencesLHSCounts,
|
||||
ruleTargetLHSAndTargetSyntacticPreferencesLHSJointCounts,
|
||||
vcbT);
|
||||
if (!targetSyntacticPreferencesLabelCounts.empty()) {
|
||||
phraseTableFile << " {{TargetPreferences "
|
||||
<< nNTs // for convenience: number of non-terminal symbols in this rule (incl. left hand side NT)
|
||||
<< " "
|
||||
<< count // rule count
|
||||
<< targetPreferenceLabelCounts
|
||||
<< targetSyntacticPreferencesLabelCounts
|
||||
<< "}}";
|
||||
}
|
||||
}
|
||||
|
@ -2374,6 +2374,12 @@ sub define_training_extract_phrases {
|
||||
$cmd .= "-ghkm ";
|
||||
}
|
||||
|
||||
if (&get("TRAINING:target-syntactic-preferences")) {
|
||||
$cmd .= "-target-syntactic-preferences ";
|
||||
my $target_syntactic_preferences_labels_file = &versionize(&long_file_name("target-syntactic-preferences-labels","model",""));
|
||||
$cmd .= "-target-syntactic-preferences-labels-file $target_syntactic_preferences_labels_file ";
|
||||
}
|
||||
|
||||
if (&get("TRAINING:ghkm-tree-fragments")) {
|
||||
$cmd .= "-ghkm-tree-fragments ";
|
||||
}
|
||||
@ -2433,6 +2439,12 @@ sub define_training_build_ttable {
|
||||
$cmd .= "-phrase-orientation-priors-file $phrase_orientation_priors_file ";
|
||||
}
|
||||
|
||||
if (&get("TRAINING:target-syntactic-preferences")) {
|
||||
$cmd .= "-target-syntactic-preferences ";
|
||||
my $target_syntactic_preferences_labels_file = &versionize(&long_file_name("target-syntactic-preferences-labels","model",""));
|
||||
$cmd .= "-target-syntactic-preferences-labels-file $target_syntactic_preferences_labels_file ";
|
||||
}
|
||||
|
||||
if (&get("TRAINING:ghkm-tree-fragments")) {
|
||||
$cmd .= "-ghkm-tree-fragments ";
|
||||
}
|
||||
@ -2644,6 +2656,12 @@ sub define_training_create_config {
|
||||
$cmd .= "-phrase-orientation ";
|
||||
}
|
||||
|
||||
if (&get("TRAINING:target-syntactic-preferences")) {
|
||||
$cmd .= "-target-syntactic-preferences ";
|
||||
my $target_syntactic_preferences_labels_file = &versionize(&long_file_name("target-syntactic-preferences-labels","model",""));
|
||||
$cmd .= "-target-syntactic-preferences-labels-file $target_syntactic_preferences_labels_file ";
|
||||
}
|
||||
|
||||
if (&get("TRAINING:ghkm-source-labels")) {
|
||||
$cmd .= "-ghkm-source-labels ";
|
||||
my $source_labels_file = &versionize(&long_file_name("source-labels","model",""));
|
||||
|
@ -43,6 +43,7 @@ my $ptHalf = $ARGV[5]; # output
|
||||
my $inverse = 0;
|
||||
my $sourceLabelsFile;
|
||||
my $partsOfSpeechFile;
|
||||
my $targetSyntacticPreferencesLabelsFile;
|
||||
|
||||
my $otherExtractArgs= "";
|
||||
for (my $i = 6; $i < $#ARGV; ++$i)
|
||||
@ -57,6 +58,11 @@ for (my $i = 6; $i < $#ARGV; ++$i)
|
||||
$otherExtractArgs .= "--PartsOfSpeech ";
|
||||
next;
|
||||
}
|
||||
if ($ARGV[$i] eq '--TargetSyntacticPreferences') {
|
||||
$targetSyntacticPreferencesLabelsFile = $ARGV[++$i];
|
||||
$otherExtractArgs .= "--TargetSyntacticPreferences ";
|
||||
next;
|
||||
}
|
||||
if ($ARGV[$i] eq '--Inverse') {
|
||||
$inverse = 1;
|
||||
$otherExtractArgs .= $ARGV[$i] ." ";
|
||||
@ -289,11 +295,11 @@ if (-e $cocPath)
|
||||
close(FHCOC);
|
||||
}
|
||||
|
||||
# merge source label files
|
||||
# merge source labels files
|
||||
if (!$inverse && defined($sourceLabelsFile))
|
||||
{
|
||||
my $cmd = "(echo \"GlueTop 0\"; echo \"GlueX 1\"; echo \"SSTART 2\"; echo \"SEND 3\"; cat $TMPDIR/phrase-table.half.*.gz.syntaxLabels.src | LC_ALL=C sort | uniq | perl -pe \"s/\$/ \@{[\$.+3]}/\") > $sourceLabelsFile";
|
||||
print STDERR "Merging source label files: $cmd \n";
|
||||
print STDERR "Merging source labels files: $cmd \n";
|
||||
`$cmd`;
|
||||
}
|
||||
|
||||
@ -305,6 +311,13 @@ if (!$inverse && defined($partsOfSpeechFile))
|
||||
`$cmd`;
|
||||
}
|
||||
|
||||
# merge target syntactic preferences labels files
|
||||
if (!$inverse && defined($targetSyntacticPreferencesLabelsFile))
|
||||
{
|
||||
my $cmd = "(echo \"GlueTop 0\"; echo \"GlueX 1\"; echo \"SSTART 2\"; echo \"SEND 3\"; cat $TMPDIR/phrase-table.half.*.gz.syntaxLabels.tgtpref | LC_ALL=C sort | uniq | perl -pe \"s/\$/ \@{[\$.+3]}/\") > $targetSyntacticPreferencesLabelsFile";
|
||||
print STDERR "Merging target syntactic preferences labels files: $cmd \n";
|
||||
`$cmd`;
|
||||
}
|
||||
|
||||
$cmd = "rm -rf $TMPDIR \n";
|
||||
print STDERR $cmd;
|
||||
|
@ -89,6 +89,8 @@ my($_EXTERNAL_BINDIR,
|
||||
$_XML,
|
||||
$_SOURCE_SYNTAX,
|
||||
$_TARGET_SYNTAX,
|
||||
$_TARGET_SYNTACTIC_PREFERENCES,
|
||||
$_TARGET_SYNTACTIC_PREFERENCES_LABELS_FILE,
|
||||
$_GLUE_GRAMMAR,
|
||||
$_GLUE_GRAMMAR_FILE,
|
||||
$_DONT_TUNE_GLUE_GRAMMAR,
|
||||
@ -227,6 +229,8 @@ $_HELP = 1
|
||||
'score-options=s' => \@_SCORE_OPTIONS,
|
||||
'source-syntax' => \$_SOURCE_SYNTAX,
|
||||
'target-syntax' => \$_TARGET_SYNTAX,
|
||||
'target-syntactic-preferences' => \$_TARGET_SYNTACTIC_PREFERENCES,
|
||||
'target-syntactic-preferences-labels-file=s' => \$_TARGET_SYNTACTIC_PREFERENCES_LABELS_FILE,
|
||||
'use-syntax-input-weight-feature' => \$_USE_SYNTAX_INPUT_WEIGHT_FEATURE,
|
||||
'xml' => \$_XML,
|
||||
'no-word-alignment' => \$_OMIT_WORD_ALIGNMENT,
|
||||
@ -1575,6 +1579,7 @@ sub extract_phrase {
|
||||
{
|
||||
$cmd .= " --SourceSyntax" if $_SOURCE_SYNTAX;
|
||||
$cmd .= " --TargetSyntax" if $_TARGET_SYNTAX;
|
||||
$cmd .= " --TargetSyntacticPreferences" if $_TARGET_SYNTACTIC_PREFERENCES;
|
||||
$cmd .= " --MaxSpan $max_length";
|
||||
}
|
||||
$cmd .= " ".$_EXTRACT_OPTIONS if defined($_EXTRACT_OPTIONS);
|
||||
@ -1712,8 +1717,8 @@ sub score_phrase_phrase_extract {
|
||||
$CORE_SCORE_OPTIONS .= " --NoLex" if $NO_LEX;
|
||||
$CORE_SCORE_OPTIONS .= " --Singleton" if $SINGLETON;
|
||||
$CORE_SCORE_OPTIONS .= " --CrossedNonTerm" if $CROSSEDNONTERM;
|
||||
$CORE_SCORE_OPTIONS .= " --SourceLabels" if $SOURCE_LABELS;
|
||||
$CORE_SCORE_OPTIONS .= " --SourceLabelCountsLHS " if $SOURCE_LABEL_COUNTS_LHS;
|
||||
$CORE_SCORE_OPTIONS .= " --SourceLabels" if $SOURCE_LABELS;
|
||||
$CORE_SCORE_OPTIONS .= " --SourceLabelCountsLHS " if $SOURCE_LABEL_COUNTS_LHS;
|
||||
|
||||
my $substep = 1;
|
||||
my $isParent = 1;
|
||||
@ -1758,6 +1763,7 @@ sub score_phrase_phrase_extract {
|
||||
$cmd .= " --PhraseOrientation" if $_PHRASE_ORIENTATION;
|
||||
$cmd .= " --PhraseOrientationPriors $_PHRASE_ORIENTATION_PRIORS_FILE" if $_PHRASE_ORIENTATION && defined($_PHRASE_ORIENTATION_PRIORS_FILE);
|
||||
$cmd .= " --SourceLabels $_GHKM_SOURCE_LABELS_FILE" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
|
||||
$cmd .= " --TargetSyntacticPreferences $_TARGET_SYNTACTIC_PREFERENCES_LABELS_FILE" if $_TARGET_SYNTACTIC_PREFERENCES && defined($_TARGET_SYNTACTIC_PREFERENCES_LABELS_FILE);
|
||||
$cmd .= " --PartsOfSpeech $_GHKM_PARTS_OF_SPEECH_FILE" if $_GHKM_PARTS_OF_SPEECH && defined($_GHKM_PARTS_OF_SPEECH_FILE);
|
||||
$cmd .= " $DOMAIN" if $DOMAIN;
|
||||
$cmd .= " $CORE_SCORE_OPTIONS" if defined($_SCORE_OPTIONS);
|
||||
@ -1811,6 +1817,7 @@ sub score_phrase_phrase_extract {
|
||||
$cmd .= " --GoodTuring $ttable_file.half.f2e.gz.coc" if $GOOD_TURING;
|
||||
$cmd .= " --KneserNey $ttable_file.half.f2e.gz.coc" if $KNESER_NEY;
|
||||
$cmd .= " --SourceLabels $_GHKM_SOURCE_LABELS_FILE" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
|
||||
$cmd .= " --TargetSyntacticPreferences $_TARGET_SYNTACTIC_PREFERENCES_LABELS_FILE" if $_TARGET_SYNTACTIC_PREFERENCES && defined($_TARGET_SYNTACTIC_PREFERENCES_LABELS_FILE);
|
||||
$cmd .= " --PartsOfSpeech $_GHKM_PARTS_OF_SPEECH_FILE" if $_GHKM_PARTS_OF_SPEECH && defined($_GHKM_PARTS_OF_SPEECH_FILE);
|
||||
|
||||
$cmd .= " | $GZIP_EXEC -c > $ttable_file.gz";
|
||||
|
Loading…
Reference in New Issue
Block a user