POS property: map tags to indices in consolidate

This commit is contained in:
Matthias Huck 2015-03-04 22:48:34 +00:00
parent 06e87d851e
commit 638e9c3f60
5 changed files with 72 additions and 8 deletions

View File

@ -57,6 +57,32 @@ void PropertiesConsolidator::ActivateSourceLabelsProcessing(const std::string &s
}
// Loads the part-of-speech vocabulary from partsOfSpeechFile and enables
// POS processing (sets m_partsOfSpeechFlag).
//
// Each non-blank line of the file is expected to hold a POS tag followed by
// its numeric index, separated by whitespace. Any previously loaded
// vocabulary is discarded first.
//
// Errors are raised via UTIL_THROW2 / UTIL_THROW_IF2 when
//  - a line has a tag but no parsable index, or
//  - the same tag occurs on more than one line.
void PropertiesConsolidator::ActivatePartsOfSpeechProcessing(const std::string &partsOfSpeechFile)
{
  Moses::InputFileStream inFile(partsOfSpeechFile);

  // read parts-of-speech vocabulary
  m_partsOfSpeechVocabulary.clear();
  std::string line;
  while (getline(inFile, line)) {
    std::istringstream tokenizer(line);

    // Stream extraction does not throw by default; it only sets failbit.
    // The stream state therefore has to be checked explicitly, otherwise a
    // malformed line would silently insert a tag with an indeterminate index.
    std::string label;
    tokenizer >> label;
    if (tokenizer.fail()) {
      // No token at all: empty or whitespace-only line (e.g. a trailing
      // newline at the end of the file). Nothing to record.
      continue;
    }

    size_t index = 0;
    tokenizer >> index;
    if (tokenizer.fail()) {
      UTIL_THROW2("Error reading part-of-speech vocabulary file " << partsOfSpeechFile << " .");
    }

    std::pair< std::map<std::string,size_t>::iterator, bool > inserted =
      m_partsOfSpeechVocabulary.insert( std::pair<std::string,size_t>(label,index) );
    UTIL_THROW_IF2(!inserted.second,"Part-of-speech vocabulary file " << partsOfSpeechFile << " should contain each POS tag only once.");
  }

  inFile.Close();

  m_partsOfSpeechFlag = true;
}
std::string PropertiesConsolidator::ProcessPropertiesString(const std::string &propertiesString) const
{
if ( propertiesString.empty() ) {
@ -76,11 +102,12 @@ std::string PropertiesConsolidator::ProcessPropertiesString(const std::string &p
std::vector<std::string> keyValue = Moses::TokenizeFirstOnly(tok, " ");
assert(keyValue.size() == 2);
// TODO: individual methods for different properties
if ( !keyValue[0].compare("SourceLabels") ) {
if ( m_sourceLabelsFlag ) {
// SourceLabels additional property: replace strings with vocabulary indices
// SourceLabels property: replace strings with vocabulary indices
out << " {{" << keyValue[0];
std::istringstream tokenizer(keyValue[1]);
@ -141,13 +168,33 @@ std::string PropertiesConsolidator::ProcessPropertiesString(const std::string &p
out << "}}";
} else { // don't process source labels additional property
} else { // don't process source labels property
out << " {{" << keyValue[0] << " " << keyValue[1] << "}}";
}
} else if ( !keyValue[0].compare("POS") ) {
if ( m_partsOfSpeechFlag ) {
// POS property: replace strings with vocabulary indices
out << " {{" << keyValue[0];
std::istringstream tokenizer(keyValue[1]);
while (tokenizer.peek() != EOF) {
std::string token;
tokenizer >> token;
std::map<std::string,size_t>::const_iterator found = m_partsOfSpeechVocabulary.find(token);
UTIL_THROW_IF2(found == m_partsOfSpeechVocabulary.end() ,"Part-of-speech \"" << token << "\" from the phrase table not found in given part-of-speech vocabulary.");
out << " " << found->second;
}
out << "}}";
} else { // don't process parts-of-speech property
out << " {{" << keyValue[0] << " " << keyValue[1] << "}}";
}
} else {
// output other additional property
// output other property
out << " {{" << keyValue[0] << " " << keyValue[1] << "}}";
}
}

View File

@ -34,6 +34,7 @@ public:
// Both processing flags must start out as false: ProcessPropertiesString
// reads m_partsOfSpeechFlag even when ActivatePartsOfSpeechProcessing was
// never called, so leaving it uninitialized would be an indeterminate read.
PropertiesConsolidator() : m_sourceLabelsFlag(false), m_partsOfSpeechFlag(false) {}
void ActivateSourceLabelsProcessing(const std::string &sourceLabelSetFile);
void ActivatePartsOfSpeechProcessing(const std::string &partsOfSpeechFile);
std::string ProcessPropertiesString(const std::string &propertiesString) const;
@ -41,6 +42,8 @@ private:
bool m_sourceLabelsFlag;
std::map<std::string,size_t> m_sourceLabels;
bool m_partsOfSpeechFlag;
std::map<std::string,size_t> m_partsOfSpeechVocabulary;
};

View File

@ -39,6 +39,7 @@ bool lowCountFlag = false;
bool goodTuringFlag = false;
bool kneserNeyFlag = false;
bool sourceLabelsFlag = false;
bool partsOfSpeechFlag = false;
bool logProbFlag = false;
float minScore0 = 0;
float minScore2 = 0;
@ -48,7 +49,7 @@ inline float maybeLogProb( float a )
return logProbFlag ? log(a) : a;
}
void processFiles( char*, char*, char*, char*, char* );
void processFiles( char*, char*, char*, char*, char*, char* );
void loadCountOfCounts( char* );
void breakdownCoreAndSparse( string combined, string &core, string &sparse );
bool getLine( istream &fileP, vector< string > &item );
@ -62,7 +63,7 @@ int main(int argc, char* argv[])
<< "consolidating direct and indirect rule tables\n";
if (argc < 4) {
cerr << "syntax: consolidate phrase-table.direct phrase-table.indirect phrase-table.consolidated [--Hierarchical] [--OnlyDirect] [--PhraseCount] [--GoodTuring counts-of-counts-file] [--KneserNey counts-of-counts-file] [--LowCountFeature] [--SourceLabels source-labels-file] [--MinScore id:threshold[,id:threshold]*]\n";
cerr << "syntax: consolidate phrase-table.direct phrase-table.indirect phrase-table.consolidated [--Hierarchical] [--OnlyDirect] [--PhraseCount] [--GoodTuring counts-of-counts-file] [--KneserNey counts-of-counts-file] [--LowCountFeature] [--SourceLabels source-labels-file] [--PartsOfSpeech parts-of-speech-file] [--MinScore id:threshold[,id:threshold]*]\n";
exit(1);
}
char* &fileNameDirect = argv[1];
@ -70,6 +71,7 @@ int main(int argc, char* argv[])
char* &fileNameConsolidated = argv[3];
char* fileNameCountOfCounts = 0;
char* fileNameSourceLabelSet = 0;
char* fileNamePartsOfSpeechVocabulary = 0;
for(int i=4; i<argc; i++) {
if (strcmp(argv[i],"--Hierarchical") == 0) {
@ -128,6 +130,14 @@ int main(int argc, char* argv[])
}
fileNameSourceLabelSet = argv[++i];
cerr << "processing source labels property\n";
} else if (strcmp(argv[i],"--PartsOfSpeech") == 0) {
partsOfSpeechFlag = true;
if (i+1==argc) {
cerr << "ERROR: specify parts-of-speech file!\n";
exit(1);
}
fileNamePartsOfSpeechVocabulary = argv[++i];
cerr << "processing parts-of-speech property\n";
} else if (strcmp(argv[i],"--MinScore") == 0) {
string setting = argv[++i];
bool done = false;
@ -164,7 +174,7 @@ int main(int argc, char* argv[])
}
}
processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated, fileNameCountOfCounts, fileNameSourceLabelSet );
processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated, fileNameCountOfCounts, fileNameSourceLabelSet, fileNamePartsOfSpeechVocabulary );
}
vector< float > countOfCounts;
@ -213,7 +223,7 @@ void loadCountOfCounts( char* fileNameCountOfCounts )
if (kneserNey_D3 > 2.9) kneserNey_D3 = 2.9;
}
void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated, char* fileNameCountOfCounts, char* fileNameSourceLabelSet )
void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated, char* fileNameCountOfCounts, char* fileNameSourceLabelSet, char* fileNamePartsOfSpeechVocabulary )
{
if (goodTuringFlag || kneserNeyFlag)
loadCountOfCounts( fileNameCountOfCounts );
@ -248,6 +258,9 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
if (sourceLabelsFlag) {
propertiesConsolidator.ActivateSourceLabelsProcessing(fileNameSourceLabelSet);
}
if (partsOfSpeechFlag) {
propertiesConsolidator.ActivatePartsOfSpeechProcessing(fileNamePartsOfSpeechVocabulary);
}
// loop through all extracted phrase translations
int i=0;

View File

@ -131,7 +131,7 @@ int main(int argc, char* argv[])
ScoreFeatureManager featureManager;
if (argc < 4) {
std::cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--NoWordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--PCFG] [--TreeFragments] [--SourceLabels] [--SourceLabelCountsLHS] [--TargetPreferenceLabels] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--CrossedNonTerm]" << std::endl;
std::cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--NoWordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--PartsOfSpeech] [--PCFG] [--TreeFragments] [--SourceLabels] [--SourceLabelCountsLHS] [--TargetPreferenceLabels] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--CrossedNonTerm]" << std::endl;
std::cerr << featureManager.usage() << std::endl;
exit(1);
}

View File

@ -1693,6 +1693,7 @@ sub score_phrase_phrase_extract {
$cmd .= " --GoodTuring $ttable_file.half.f2e.gz.coc" if $GOOD_TURING;
$cmd .= " --KneserNey $ttable_file.half.f2e.gz.coc" if $KNESER_NEY;
$cmd .= " --SourceLabels $_GHKM_SOURCE_LABELS_FILE" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
$cmd .= " --PartsOfSpeech $_GHKM_PARTS_OF_SPEECH_FILE" if $_GHKM_PARTS_OF_SPEECH && defined($_GHKM_PARTS_OF_SPEECH_FILE);
$cmd .= " | $GZIP_EXEC -c > $ttable_file.gz";