Merge branch 'GHKMStruct' of github.com:moses-smt/mosesdecoder into GHKMStruct

2024-12-27 22:14:57 +03:00 · 2013-09-14 10:46:17 +02:00 · 2013-09-14 10:46:17 +02:00 · df86f0e78b
commit df86f0e78b
parent 5f37a545b1 296eb6804a
6 changed files with 2 additions and 177 deletions
--- a/phrase-extract/ExtractedRule.cpp
+++ b/phrase-extract/ExtractedRule.cpp
@ -1,44 +0,0 @@
-//
-//  ExtractedRule.cpp
-//  extract
-//
-//  Created by Hieu Hoang on 13/09/2011.
-//  Copyright 2011 __MyCompanyName__. All rights reserved.
-//
-
-#include "ExtractedRule.h"
-
-using namespace std;
-
-namespace MosesTraining
-{
-
-void ExtractedRule::OutputNTLengths(std::ostream &out) const
-{
-  ostringstream outString;
-  OutputNTLengths(outString);
-  out << outString;
-}
-
-void ExtractedRule::OutputNTLengths(std::ostringstream &outString) const
-{
-  std::map<size_t, std::pair<size_t, size_t> >::const_iterator iter;
-  for (iter = m_ntLengths.begin(); iter != m_ntLengths.end(); ++iter) {
-    size_t sourcePos = iter->first;
-    const std::pair<size_t, size_t> &spanLengths = iter->second;
-    outString << sourcePos << "=" << spanLengths.first << "," <<spanLengths.second << " ";
-  }
-}
-
-std::ostream& operator<<(std::ostream &out, const ExtractedRule &obj)
-{
-  out << obj.source << " ||| " << obj.target << " ||| "
-      << obj.alignment << " ||| "
-      << obj.alignmentInv << " ||| ";
-
-  obj.OutputNTLengths(out);
-
-  return out;
-}
-
-} // namespace
--- a/phrase-extract/ExtractedRule.h
+++ b/phrase-extract/ExtractedRule.h
@ -32,8 +32,6 @@ namespace MosesTraining
 // sentence-level collection of rules
 class ExtractedRule
 {
-  friend std::ostream& operator<<(std::ostream &, const ExtractedRule &);
-
 public:
  std::string source;
  std::string target;
@ -54,8 +52,6 @@ public:
  float count;
  double pcfgScore;

-  std::map<size_t, std::pair<size_t, size_t> > m_ntLengths;
-
  ExtractedRule(int sT, int eT, int sS, int eS)
    : source()
    , target()
@ -76,13 +72,6 @@ public:
    , count(0)
    , pcfgScore(0.0) {
  }
-
-  void SetSpanLength(size_t sourcePos, size_t sourceLength, size_t targetLength) {
-    m_ntLengths[sourcePos] = std::pair<size_t, size_t>(sourceLength, targetLength);
-  }
-
-  void OutputNTLengths(std::ostream &out) const;
-  void OutputNTLengths(std::ostringstream &out) const;
 };

 }
--- a/phrase-extract/RuleExtractionOptions.h
+++ b/phrase-extract/RuleExtractionOptions.h
@ -49,7 +49,6 @@ public:
  bool duplicateRules;
  bool fractionalCounting;
  bool pcfgScore;
-  bool outputNTLengths;
  bool gzOutput;
  bool unpairedExtractFormat;
  bool conditionOnTargetLhs;
@ -83,7 +82,6 @@ public:
    , duplicateRules(true)
    , fractionalCounting(true)
    , pcfgScore(false)
-    , outputNTLengths(false)
    , gzOutput(false)
    , unpairedExtractFormat(false)
    , conditionOnTargetLhs(false)
--- a/phrase-extract/consolidate-main.cpp
+++ b/phrase-extract/consolidate-main.cpp
@ -41,7 +41,6 @@ bool lowCountFlag = false;
 bool goodTuringFlag = false;
 bool kneserNeyFlag = false;
 bool logProbFlag = false;
-bool outputNTLengths = false;
 inline float maybeLogProb( float a )
 {
  return logProbFlag ? log(a) : a;
@ -62,7 +61,7 @@ int main(int argc, char* argv[])
       << "consolidating direct and indirect rule tables\n";

  if (argc < 4) {
-    cerr << "syntax: consolidate phrase-table.direct phrase-table.indirect phrase-table.consolidated [--Hierarchical] [--OnlyDirect] [--OutputNTLengths] \n";
+    cerr << "syntax: consolidate phrase-table.direct phrase-table.indirect phrase-table.consolidated [--Hierarchical] [--OnlyDirect] \n";
    exit(1);
  }
  char* &fileNameDirect = argv[1];
@ -119,8 +118,6 @@ int main(int argc, char* argv[])
    } else if (strcmp(argv[i],"--LogProb") == 0) {
      logProbFlag = true;
      cerr << "using log-probabilities\n";
-    } else if (strcmp(argv[i],"--OutputNTLengths") == 0) {
-      outputNTLengths = true;
    } else {
      cerr << "ERROR: unknown option " << argv[i] << endl;
      exit(1);
@ -315,10 +312,6 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
    // counts, for debugging
    fileConsolidated << "||| " << countE << " " << countF << " " << countEF;

-    if (outputNTLengths) {
-      fileConsolidated << " ||| " << itemDirect[5];
-    }
-
    // count bin feature (as a sparse feature)
    if (sparseCountBinFeatureFlag ||
        directSparseScores.compare("") != 0 ||
--- a/phrase-extract/extract-rules-main.cpp
+++ b/phrase-extract/extract-rules-main.cpp
@ -129,7 +129,6 @@ int main(int argc, char* argv[])
         << " --GlueGrammar FILE"
         << " | --UnknownWordLabel FILE"
         << " | --OnlyDirect"
-         << " | --OutputNTLengths"
         << " | --MaxSpan[" << options.maxSpan << "]"
         << " | --MinHoleTarget[" << options.minHoleTarget << "]"
         << " | --MinHoleSource[" << options.minHoleSource << "]"
@ -262,8 +261,6 @@ int main(int argc, char* argv[])
      options.fractionalCounting = false;
    } else if (strcmp(argv[i],"--PCFG") == 0) {
      options.pcfgScore = true;
-    } else if (strcmp(argv[i],"--OutputNTLengths") == 0) {
-      options.outputNTLengths = true;
    } else if (strcmp(argv[i],"--UnpairedExtractFormat") == 0) {
      options.unpairedExtractFormat = true;
    } else if (strcmp(argv[i],"--ConditionOnTargetLHS") == 0) {
@ -663,9 +660,6 @@ void ExtractTask::saveHieroAlignment( int startT, int endT, int startS, int endS
    rule.alignment      += sourceSymbolIndex + "-" + targetSymbolIndex + " ";
    if (!m_options.onlyDirectFlag)
      rule.alignmentInv += targetSymbolIndex + "-" + sourceSymbolIndex + " ";
-
-    rule.SetSpanLength(hole.GetPos(0), hole.GetSize(0), hole.GetSize(1) ) ;
-
  }

  rule.alignment.erase(rule.alignment.size()-1);
@ -1077,9 +1071,6 @@ void ExtractTask::writeRulesToFile()
        << rule->target << " ||| "
        << rule->alignment << " ||| "
        << rule->count << " ||| ";
-    if (m_options.outputNTLengths) {
-      rule->OutputNTLengths(out);
-    }
    if (m_options.pcfgScore) {
      out << " ||| " << rule->pcfgScore;
    }
--- a/phrase-extract/score-main.cpp
+++ b/phrase-extract/score-main.cpp
@ -61,7 +61,6 @@ int negLogProb = 1;
 bool lexFlag = true;
 bool unalignedFlag = false;
 bool unalignedFWFlag = false;
-bool outputNTLengths = false;
 bool singletonFeature = false;
 bool crossedNonTerm = false;
 int countOfCounts[COC_MAX+1];
@ -85,9 +84,6 @@ double computeUnalignedPenalty( const PHRASE &, const PHRASE &, const PhraseAlig
 set<string> functionWordList;
 void loadFunctionWords( const string &fileNameFunctionWords );
 double computeUnalignedFWPenalty( const PHRASE &, const PHRASE &, const PhraseAlignment & );
-void calcNTLengthProb(const vector< PhraseAlignment* > &phrasePairs
-                      , map<size_t, map<size_t, float> > &sourceProb
-                      , map<size_t, map<size_t, float> > &targetProb);
 void printSourcePhrase(const PHRASE &, const PHRASE &, const PhraseAlignment &, ostream &);
 void printTargetPhrase(const PHRASE &, const PHRASE &, const PhraseAlignment &, ostream &);

@ -98,7 +94,7 @@ int main(int argc, char* argv[])

  ScoreFeatureManager featureManager;
  if (argc < 4) {
-    cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--NoWordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] [--PCFG] [--TreeFragments] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--Singleton] [--CrossedNonTerm] \n";
+    cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--NoWordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--PCFG] [--TreeFragments] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--Singleton] [--CrossedNonTerm] \n";
    cerr << featureManager.usage() << endl;
    exit(1);
  }
@ -164,8 +160,6 @@ int main(int argc, char* argv[])
      minCountHierarchical = atof(argv[++i]);
      cerr << "dropping all phrase pairs occurring less than " << minCountHierarchical << " times\n";
      minCountHierarchical -= 0.00001; // account for rounding
-    } else if (strcmp(argv[i],"--OutputNTLengths") == 0) {
-      outputNTLengths = true;
    } else if (strcmp(argv[i],"--Singleton") == 0) {
      singletonFeature = true;
      cerr << "binary singleton feature\n";
@ -405,86 +399,6 @@ const std::string &findBestTreeFragment(const PhraseAlignmentCollection &phraseP
 }


-void calcNTLengthProb(const map<size_t, map<size_t, size_t> > &lengths
-                      , size_t total
-                      , map<size_t, map<size_t, float> > &probs)
-{
-  map<size_t, map<size_t, size_t> >::const_iterator iterOuter;
-  for (iterOuter = lengths.begin(); iterOuter != lengths.end(); ++iterOuter) {
-    size_t sourcePos = iterOuter->first;
-    const map<size_t, size_t> &inner = iterOuter->second;
-
-    map<size_t, size_t>::const_iterator iterInner;
-    for (iterInner = inner.begin(); iterInner != inner.end(); ++iterInner) {
-      size_t length = iterInner->first;
-      size_t count = iterInner->second;
-      float prob = (float) count / (float) total;
-      probs[sourcePos][length] = prob;
-    }
-  }
-}
-
-void calcNTLengthProb(const vector< PhraseAlignment* > &phrasePairs
-                      , map<size_t, map<size_t, float> > &sourceProb
-                      , map<size_t, map<size_t, float> > &targetProb)
-{
-  map<size_t, map<size_t, size_t> > sourceLengths, targetLengths;
-  // 1st = position in source phrase, 2nd = length, 3rd = count
-  map<size_t, size_t> totals;
-  // 1st = position in source phrase, 2nd = total counts
-  // each source pos should have same count?
-
-  vector< PhraseAlignment* >::const_iterator iterOuter;
-  for (iterOuter = phrasePairs.begin(); iterOuter != phrasePairs.end(); ++iterOuter) {
-    const PhraseAlignment &phrasePair = **iterOuter;
-    const std::map<size_t, std::pair<size_t, size_t> > &ntLengths = phrasePair.GetNTLengths();
-
-    std::map<size_t, std::pair<size_t, size_t> >::const_iterator iterInner;
-    for (iterInner = ntLengths.begin(); iterInner != ntLengths.end(); ++iterInner) {
-      size_t sourcePos = iterInner->first;
-      size_t sourceLength = iterInner->second.first;
-      size_t targetLength = iterInner->second.second;
-
-      sourceLengths[sourcePos][sourceLength]++;
-      targetLengths[sourcePos][targetLength]++;
-
-      totals[sourcePos]++;
-    }
-  }
-
-  if (totals.size() == 0) {
-    // no non-term. Don't bother
-    return;
-  }
-
-  size_t total = totals.begin()->second;
-  if (totals.size() > 1) {
-    assert(total == (++totals.begin())->second );
-  }
-
-  calcNTLengthProb(sourceLengths, total, sourceProb);
-  calcNTLengthProb(targetLengths, total, targetProb);
-
-}
-
-void outputNTLengthProbs(ostream &phraseTableFile, const map<size_t, map<size_t, float> > &probs, const string &prefix)
-{
-  map<size_t, map<size_t, float> >::const_iterator iterOuter;
-  for (iterOuter = probs.begin(); iterOuter != probs.end(); ++iterOuter) {
-    size_t sourcePos = iterOuter->first;
-    const map<size_t, float> &inner = iterOuter->second;
-
-    map<size_t, float>::const_iterator iterInner;
-    for (iterInner = inner.begin(); iterInner != inner.end(); ++iterInner) {
-      size_t length = iterInner->first;
-      float prob = iterInner->second;
-
-      phraseTableFile << sourcePos << "|" << prefix << "|" << length << "=" << prob << " ";
-    }
-  }
-
-}
-
 bool calcCrossedNonTerm(size_t sourcePos, size_t targetPos, const std::vector< std::set<size_t> > &alignedToS)
 {
  for (size_t currSource = 0; currSource < alignedToS.size(); ++currSource) {
@ -694,21 +608,6 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
  if (kneserNeyFlag)
    phraseTableFile << " " << distinctCount;

-  // nt lengths
-  if (outputNTLengths) {
-    phraseTableFile << " ||| ";
-
-    if (!inverseFlag) {
-      map<size_t, map<size_t, float> > sourceProb, targetProb;
-      // 1st sourcePos, 2nd = length, 3rd = prob
-
-      calcNTLengthProb(phrasePair, sourceProb, targetProb);
-
-      outputNTLengthProbs(phraseTableFile, sourceProb, "S");
-      outputNTLengthProbs(phraseTableFile, targetProb, "T");
-    }
-  }
-
  //MARIA
  //sparse features
  phraseTableFile << " ||| ";
@ -727,7 +626,6 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
  }


-
  phraseTableFile << endl;
 }