mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-26 21:42:19 +03:00
POS property: map tags to indices in consolidate
This commit is contained in:
parent
06e87d851e
commit
638e9c3f60
@ -57,6 +57,32 @@ void PropertiesConsolidator::ActivateSourceLabelsProcessing(const std::string &s
|
||||
}
|
||||
|
||||
|
||||
void PropertiesConsolidator::ActivatePartsOfSpeechProcessing(const std::string &partsOfSpeechFile)
|
||||
{
|
||||
Moses::InputFileStream inFile(partsOfSpeechFile);
|
||||
|
||||
// read parts-of-speech vocabulary
|
||||
m_partsOfSpeechVocabulary.clear();
|
||||
std::string line;
|
||||
while (getline(inFile, line)) {
|
||||
std::istringstream tokenizer(line);
|
||||
std::string label;
|
||||
size_t index;
|
||||
try {
|
||||
tokenizer >> label >> index;
|
||||
} catch (const std::exception &e) {
|
||||
UTIL_THROW2("Error reading part-of-speech vocabulary file " << partsOfSpeechFile << " .");
|
||||
}
|
||||
std::pair< std::map<std::string,size_t>::iterator, bool > inserted = m_partsOfSpeechVocabulary.insert( std::pair<std::string,size_t>(label,index) );
|
||||
UTIL_THROW_IF2(!inserted.second,"Part-of-speech vocabulary file " << partsOfSpeechFile << " should contain each POS tag only once.");
|
||||
}
|
||||
|
||||
inFile.Close();
|
||||
|
||||
m_partsOfSpeechFlag = true;
|
||||
}
|
||||
|
||||
|
||||
std::string PropertiesConsolidator::ProcessPropertiesString(const std::string &propertiesString) const
|
||||
{
|
||||
if ( propertiesString.empty() ) {
|
||||
@ -76,11 +102,12 @@ std::string PropertiesConsolidator::ProcessPropertiesString(const std::string &p
|
||||
std::vector<std::string> keyValue = Moses::TokenizeFirstOnly(tok, " ");
|
||||
assert(keyValue.size() == 2);
|
||||
|
||||
// TODO: individual methods for different properties
|
||||
if ( !keyValue[0].compare("SourceLabels") ) {
|
||||
|
||||
if ( m_sourceLabelsFlag ) {
|
||||
|
||||
// SourceLabels additional property: replace strings with vocabulary indices
|
||||
// SourceLabels property: replace strings with vocabulary indices
|
||||
out << " {{" << keyValue[0];
|
||||
|
||||
std::istringstream tokenizer(keyValue[1]);
|
||||
@ -141,13 +168,33 @@ std::string PropertiesConsolidator::ProcessPropertiesString(const std::string &p
|
||||
|
||||
out << "}}";
|
||||
|
||||
} else { // don't process source labels additional property
|
||||
} else { // don't process source labels property
|
||||
out << " {{" << keyValue[0] << " " << keyValue[1] << "}}";
|
||||
}
|
||||
|
||||
} else if ( !keyValue[0].compare("POS") ) {
|
||||
|
||||
if ( m_partsOfSpeechFlag ) {
|
||||
|
||||
// POS property: replace strings with vocabulary indices
|
||||
out << " {{" << keyValue[0];
|
||||
std::istringstream tokenizer(keyValue[1]);
|
||||
while (tokenizer.peek() != EOF) {
|
||||
std::string token;
|
||||
tokenizer >> token;
|
||||
std::map<std::string,size_t>::const_iterator found = m_partsOfSpeechVocabulary.find(token);
|
||||
UTIL_THROW_IF2(found == m_partsOfSpeechVocabulary.end() ,"Part-of-speech \"" << token << "\" from the phrase table not found in given part-of-speech vocabulary.");
|
||||
out << " " << found->second;
|
||||
}
|
||||
out << "}}";
|
||||
|
||||
} else { // don't process parts-of-speech property
|
||||
out << " {{" << keyValue[0] << " " << keyValue[1] << "}}";
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
// output other additional property
|
||||
// output other property
|
||||
out << " {{" << keyValue[0] << " " << keyValue[1] << "}}";
|
||||
}
|
||||
}
|
||||
|
@ -34,6 +34,7 @@ public:
|
||||
// Initialize every flag that ProcessPropertiesString() reads. m_partsOfSpeechFlag
// is only ever set to true by ActivatePartsOfSpeechProcessing(), so it must start
// out false; leaving it uninitialized makes the POS branch read an indeterminate value.
PropertiesConsolidator() : m_sourceLabelsFlag(false), m_partsOfSpeechFlag(false) {};
|
||||
|
||||
void ActivateSourceLabelsProcessing(const std::string &sourceLabelSetFile);
|
||||
void ActivatePartsOfSpeechProcessing(const std::string &partsOfSpeechFile);
|
||||
|
||||
std::string ProcessPropertiesString(const std::string &propertiesString) const;
|
||||
|
||||
@ -41,6 +42,8 @@ private:
|
||||
|
||||
bool m_sourceLabelsFlag;
|
||||
std::map<std::string,size_t> m_sourceLabels;
|
||||
bool m_partsOfSpeechFlag;
|
||||
std::map<std::string,size_t> m_partsOfSpeechVocabulary;
|
||||
|
||||
};
|
||||
|
||||
|
@ -39,6 +39,7 @@ bool lowCountFlag = false;
|
||||
bool goodTuringFlag = false;
|
||||
bool kneserNeyFlag = false;
|
||||
bool sourceLabelsFlag = false;
|
||||
bool partsOfSpeechFlag = false;
|
||||
bool logProbFlag = false;
|
||||
float minScore0 = 0;
|
||||
float minScore2 = 0;
|
||||
@ -48,7 +49,7 @@ inline float maybeLogProb( float a )
|
||||
return logProbFlag ? log(a) : a;
|
||||
}
|
||||
|
||||
void processFiles( char*, char*, char*, char*, char* );
|
||||
void processFiles( char*, char*, char*, char*, char*, char* );
|
||||
void loadCountOfCounts( char* );
|
||||
void breakdownCoreAndSparse( string combined, string &core, string &sparse );
|
||||
bool getLine( istream &fileP, vector< string > &item );
|
||||
@ -62,7 +63,7 @@ int main(int argc, char* argv[])
|
||||
<< "consolidating direct and indirect rule tables\n";
|
||||
|
||||
if (argc < 4) {
|
||||
cerr << "syntax: consolidate phrase-table.direct phrase-table.indirect phrase-table.consolidated [--Hierarchical] [--OnlyDirect] [--PhraseCount] [--GoodTuring counts-of-counts-file] [--KneserNey counts-of-counts-file] [--LowCountFeature] [--SourceLabels source-labels-file] [--MinScore id:threshold[,id:threshold]*]\n";
|
||||
cerr << "syntax: consolidate phrase-table.direct phrase-table.indirect phrase-table.consolidated [--Hierarchical] [--OnlyDirect] [--PhraseCount] [--GoodTuring counts-of-counts-file] [--KneserNey counts-of-counts-file] [--LowCountFeature] [--SourceLabels source-labels-file] [--PartsOfSpeech parts-of-speech-file] [--MinScore id:threshold[,id:threshold]*]\n";
|
||||
exit(1);
|
||||
}
|
||||
char* &fileNameDirect = argv[1];
|
||||
@ -70,6 +71,7 @@ int main(int argc, char* argv[])
|
||||
char* &fileNameConsolidated = argv[3];
|
||||
char* fileNameCountOfCounts = 0;
|
||||
char* fileNameSourceLabelSet = 0;
|
||||
char* fileNamePartsOfSpeechVocabulary = 0;
|
||||
|
||||
for(int i=4; i<argc; i++) {
|
||||
if (strcmp(argv[i],"--Hierarchical") == 0) {
|
||||
@ -128,6 +130,14 @@ int main(int argc, char* argv[])
|
||||
}
|
||||
fileNameSourceLabelSet = argv[++i];
|
||||
cerr << "processing source labels property\n";
|
||||
} else if (strcmp(argv[i],"--PartsOfSpeech") == 0) {
|
||||
partsOfSpeechFlag = true;
|
||||
if (i+1==argc) {
|
||||
cerr << "ERROR: specify parts-of-speech file!\n";
|
||||
exit(1);
|
||||
}
|
||||
fileNamePartsOfSpeechVocabulary = argv[++i];
|
||||
cerr << "processing parts-of-speech property\n";
|
||||
} else if (strcmp(argv[i],"--MinScore") == 0) {
|
||||
string setting = argv[++i];
|
||||
bool done = false;
|
||||
@ -164,7 +174,7 @@ int main(int argc, char* argv[])
|
||||
}
|
||||
}
|
||||
|
||||
processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated, fileNameCountOfCounts, fileNameSourceLabelSet );
|
||||
processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated, fileNameCountOfCounts, fileNameSourceLabelSet, fileNamePartsOfSpeechVocabulary );
|
||||
}
|
||||
|
||||
vector< float > countOfCounts;
|
||||
@ -213,7 +223,7 @@ void loadCountOfCounts( char* fileNameCountOfCounts )
|
||||
if (kneserNey_D3 > 2.9) kneserNey_D3 = 2.9;
|
||||
}
|
||||
|
||||
void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated, char* fileNameCountOfCounts, char* fileNameSourceLabelSet )
|
||||
void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated, char* fileNameCountOfCounts, char* fileNameSourceLabelSet, char* fileNamePartsOfSpeechVocabulary )
|
||||
{
|
||||
if (goodTuringFlag || kneserNeyFlag)
|
||||
loadCountOfCounts( fileNameCountOfCounts );
|
||||
@ -248,6 +258,9 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
|
||||
if (sourceLabelsFlag) {
|
||||
propertiesConsolidator.ActivateSourceLabelsProcessing(fileNameSourceLabelSet);
|
||||
}
|
||||
if (partsOfSpeechFlag) {
|
||||
propertiesConsolidator.ActivatePartsOfSpeechProcessing(fileNamePartsOfSpeechVocabulary);
|
||||
}
|
||||
|
||||
// loop through all extracted phrase translations
|
||||
int i=0;
|
||||
|
@ -131,7 +131,7 @@ int main(int argc, char* argv[])
|
||||
|
||||
ScoreFeatureManager featureManager;
|
||||
if (argc < 4) {
|
||||
std::cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--NoWordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--PCFG] [--TreeFragments] [--SourceLabels] [--SourceLabelCountsLHS] [--TargetPreferenceLabels] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--CrossedNonTerm]" << std::endl;
|
||||
std::cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--NoWordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--PartsOfSpeech] [--PCFG] [--TreeFragments] [--SourceLabels] [--SourceLabelCountsLHS] [--TargetPreferenceLabels] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--CrossedNonTerm]" << std::endl;
|
||||
std::cerr << featureManager.usage() << std::endl;
|
||||
exit(1);
|
||||
}
|
||||
|
@ -1693,6 +1693,7 @@ sub score_phrase_phrase_extract {
|
||||
$cmd .= " --GoodTuring $ttable_file.half.f2e.gz.coc" if $GOOD_TURING;
|
||||
$cmd .= " --KneserNey $ttable_file.half.f2e.gz.coc" if $KNESER_NEY;
|
||||
$cmd .= " --SourceLabels $_GHKM_SOURCE_LABELS_FILE" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
|
||||
$cmd .= " --PartsOfSpeech $_GHKM_PARTS_OF_SPEECH_FILE" if $_GHKM_PARTS_OF_SPEECH && defined($_GHKM_PARTS_OF_SPEECH_FILE);
|
||||
|
||||
$cmd .= " | $GZIP_EXEC -c > $ttable_file.gz";
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user