POS property: map tags to indices in consolidate

2024-12-26 21:42:19 +03:00 · 2015-03-04 22:48:34 +00:00 · 2015-03-04 22:48:34 +00:00 · 638e9c3f60
commit 638e9c3f60
parent 06e87d851e
5 changed files with 72 additions and 8 deletions
--- a/phrase-extract/PropertiesConsolidator.cpp
+++ b/phrase-extract/PropertiesConsolidator.cpp
@ -57,6 +57,32 @@ void PropertiesConsolidator::ActivateSourceLabelsProcessing(const std::string &s
 }


+// Reads whitespace-separated "<POS tag> <index>" lines from partsOfSpeechFile into m_partsOfSpeechVocabulary
+// and switches POS processing on; raises an error (UTIL_THROW_IF2) on a malformed line or a repeated tag.
+void PropertiesConsolidator::ActivatePartsOfSpeechProcessing(const std::string &partsOfSpeechFile)
+{
+  Moses::InputFileStream inFile(partsOfSpeechFile);
+
+  // read parts-of-speech vocabulary
+  m_partsOfSpeechVocabulary.clear();
+  std::string line;
+  while (getline(inFile, line)) {
+    std::istringstream tokenizer(line);
+    std::string label;
+    size_t index = 0;
+    // operator>> reports failure through the stream state rather than by throwing, so test the stream itself
+    UTIL_THROW_IF2(!(tokenizer >> label >> index),
+                   "Error reading part-of-speech vocabulary file " << partsOfSpeechFile << " .");
+    std::pair< std::map<std::string,size_t>::iterator, bool > inserted = m_partsOfSpeechVocabulary.insert( std::pair<std::string,size_t>(label,index) );
+    UTIL_THROW_IF2(!inserted.second,"Part-of-speech vocabulary file " << partsOfSpeechFile << " should contain each POS tag only once.");
+  }
+
+  inFile.Close();
+
+  m_partsOfSpeechFlag = true;
+}
+
+
 std::string PropertiesConsolidator::ProcessPropertiesString(const std::string &propertiesString) const
 {
  if ( propertiesString.empty() ) {
@ -76,11 +102,12 @@ std::string PropertiesConsolidator::ProcessPropertiesString(const std::string &p
    std::vector<std::string> keyValue = Moses::TokenizeFirstOnly(tok, " ");
    assert(keyValue.size() == 2);

+    // TODO: individual methods for different properties
    if ( !keyValue[0].compare("SourceLabels") ) {

      if ( m_sourceLabelsFlag ) {

-        // SourceLabels additional property: replace strings with vocabulary indices
+        // SourceLabels property: replace strings with vocabulary indices
        out << " {{" << keyValue[0];

        std::istringstream tokenizer(keyValue[1]);
@ -141,13 +168,33 @@ std::string PropertiesConsolidator::ProcessPropertiesString(const std::string &p

        out << "}}";

-      } else { // don't process source labels additional property
+      } else { // don't process source labels property
+        out << " {{" << keyValue[0] << " " << keyValue[1] << "}}";
+      }
+
+    } else if ( !keyValue[0].compare("POS") ) {
+
+      if ( m_partsOfSpeechFlag ) {
+
+        // POS property: replace strings with vocabulary indices
+        out << " {{" << keyValue[0];
+        std::istringstream tokenizer(keyValue[1]);
+        while (tokenizer.peek() != EOF) {
+          std::string token;
+          tokenizer >> token;
+          std::map<std::string,size_t>::const_iterator found = m_partsOfSpeechVocabulary.find(token);
+          UTIL_THROW_IF2(found == m_partsOfSpeechVocabulary.end() ,"Part-of-speech \"" << token << "\" from the phrase table not found in given part-of-speech vocabulary.");
+          out << " " << found->second;
+        }
+        out << "}}";
+
+      } else { // don't process parts-of-speech property
        out << " {{" << keyValue[0] << " " << keyValue[1] << "}}";
      }

    } else {

-      // output other additional property
+      // output other property
      out << " {{" << keyValue[0] << " " << keyValue[1] << "}}";
    }
  }
--- a/phrase-extract/PropertiesConsolidator.h
+++ b/phrase-extract/PropertiesConsolidator.h
@ -34,6 +34,7 @@ public:
  PropertiesConsolidator() : m_sourceLabelsFlag(false) {};

  void ActivateSourceLabelsProcessing(const std::string &sourceLabelSetFile);
+  void ActivatePartsOfSpeechProcessing(const std::string &partsOfSpeechFile);

  std::string ProcessPropertiesString(const std::string &propertiesString) const;

@ -41,6 +42,8 @@ private:

  bool m_sourceLabelsFlag;
  std::map<std::string,size_t> m_sourceLabels;
+  bool m_partsOfSpeechFlag;
+  std::map<std::string,size_t> m_partsOfSpeechVocabulary;

 };

--- a/phrase-extract/consolidate-main.cpp
+++ b/phrase-extract/consolidate-main.cpp
@ -39,6 +39,7 @@ bool lowCountFlag = false;
 bool goodTuringFlag = false;
 bool kneserNeyFlag = false;
 bool sourceLabelsFlag = false;
+bool partsOfSpeechFlag = false;
 bool logProbFlag = false;
 float minScore0 = 0;
 float minScore2 = 0;
@ -48,7 +49,7 @@ inline float maybeLogProb( float a )
  return logProbFlag ? log(a) : a;
 }

-void processFiles( char*, char*, char*, char*, char* );
+void processFiles( char*, char*, char*, char*, char*, char* );
 void loadCountOfCounts( char* );
 void breakdownCoreAndSparse( string combined, string &core, string &sparse );
 bool getLine( istream &fileP, vector< string > &item );
@ -62,7 +63,7 @@ int main(int argc, char* argv[])
       << "consolidating direct and indirect rule tables\n";

  if (argc < 4) {
-    cerr << "syntax: consolidate phrase-table.direct phrase-table.indirect phrase-table.consolidated [--Hierarchical] [--OnlyDirect] [--PhraseCount] [--GoodTuring counts-of-counts-file] [--KneserNey counts-of-counts-file] [--LowCountFeature] [--SourceLabels source-labels-file] [--MinScore  id:threshold[,id:threshold]*]\n";
+    cerr << "syntax: consolidate phrase-table.direct phrase-table.indirect phrase-table.consolidated [--Hierarchical] [--OnlyDirect] [--PhraseCount] [--GoodTuring counts-of-counts-file] [--KneserNey counts-of-counts-file] [--LowCountFeature] [--SourceLabels source-labels-file]  [--PartsOfSpeech parts-of-speech-file] [--MinScore id:threshold[,id:threshold]*]\n";
    exit(1);
  }
  char* &fileNameDirect = argv[1];
@ -70,6 +71,7 @@ int main(int argc, char* argv[])
  char* &fileNameConsolidated = argv[3];
  char* fileNameCountOfCounts = 0;
  char* fileNameSourceLabelSet = 0;
+  char* fileNamePartsOfSpeechVocabulary = 0;

  for(int i=4; i<argc; i++) {
    if (strcmp(argv[i],"--Hierarchical") == 0) {
@ -128,6 +130,14 @@ int main(int argc, char* argv[])
      }
      fileNameSourceLabelSet = argv[++i];
      cerr << "processing source labels property\n";
+    } else if (strcmp(argv[i],"--PartsOfSpeech") == 0) {
+      partsOfSpeechFlag = true;
+      if (i+1==argc) {
+        cerr << "ERROR: specify parts-of-speech file!\n";
+        exit(1);
+      }
+      fileNamePartsOfSpeechVocabulary = argv[++i];
+      cerr << "processing parts-of-speech property\n";
    } else if (strcmp(argv[i],"--MinScore") == 0) {
      string setting = argv[++i];
      bool done = false;
@ -164,7 +174,7 @@ int main(int argc, char* argv[])
    }
  }

-  processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated, fileNameCountOfCounts, fileNameSourceLabelSet );
+  processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated, fileNameCountOfCounts, fileNameSourceLabelSet, fileNamePartsOfSpeechVocabulary );
 }

 vector< float > countOfCounts;
@ -213,7 +223,7 @@ void loadCountOfCounts( char* fileNameCountOfCounts )
  if (kneserNey_D3 > 2.9) kneserNey_D3 = 2.9;
 }

-void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated, char* fileNameCountOfCounts, char* fileNameSourceLabelSet )
+void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated, char* fileNameCountOfCounts, char* fileNameSourceLabelSet, char* fileNamePartsOfSpeechVocabulary )
 {
  if (goodTuringFlag || kneserNeyFlag)
    loadCountOfCounts( fileNameCountOfCounts );
@ -248,6 +258,9 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
  if (sourceLabelsFlag) {
    propertiesConsolidator.ActivateSourceLabelsProcessing(fileNameSourceLabelSet);
  }
+  if (partsOfSpeechFlag) {
+    propertiesConsolidator.ActivatePartsOfSpeechProcessing(fileNamePartsOfSpeechVocabulary);
+  }

  // loop through all extracted phrase translations
  int i=0;
--- a/phrase-extract/score-main.cpp
+++ b/phrase-extract/score-main.cpp
@ -131,7 +131,7 @@ int main(int argc, char* argv[])

  ScoreFeatureManager featureManager;
  if (argc < 4) {
-    std::cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--NoWordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--PCFG] [--TreeFragments] [--SourceLabels] [--SourceLabelCountsLHS] [--TargetPreferenceLabels] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--CrossedNonTerm]" << std::endl;
+    std::cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--NoWordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--PartsOfSpeech] [--PCFG] [--TreeFragments] [--SourceLabels] [--SourceLabelCountsLHS] [--TargetPreferenceLabels] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--CrossedNonTerm]" << std::endl;
    std::cerr << featureManager.usage() << std::endl;
    exit(1);
  }
--- a/scripts/training/train-model.perl
+++ b/scripts/training/train-model.perl
@ -1693,6 +1693,7 @@ sub score_phrase_phrase_extract {
    $cmd .= " --GoodTuring $ttable_file.half.f2e.gz.coc" if $GOOD_TURING;
    $cmd .= " --KneserNey $ttable_file.half.f2e.gz.coc" if $KNESER_NEY;
    $cmd .= " --SourceLabels $_GHKM_SOURCE_LABELS_FILE" if $_GHKM_SOURCE_LABELS && defined($_GHKM_SOURCE_LABELS_FILE);
+    $cmd .= " --PartsOfSpeech $_GHKM_PARTS_OF_SPEECH_FILE" if $_GHKM_PARTS_OF_SPEECH && defined($_GHKM_PARTS_OF_SPEECH_FILE);
    
    $cmd .= " | $GZIP_EXEC -c > $ttable_file.gz";