Binary hiero reordering feature. Implementation of one described in NIST 2012. The value is 1 if a non-terminal is reordered with respect to other words or non-terminals, and 0 otherwise.

2024-10-27 11:59:52 +03:00 · 2012-08-25 00:47:57 +01:00 · 2012-08-25 00:47:57 +01:00 · 33c03edfbb
commit 33c03edfbb
parent 1931bfe959
1 changed files with 106 additions and 31 deletions
--- a/phrase-extract/score.cpp
+++ b/phrase-extract/score.cpp
@ -58,6 +58,7 @@ bool unalignedFlag = false;
 bool unalignedFWFlag = false;
 bool outputNTLengths = false;
 bool singletonFeature = false;
+bool crossedNonTerm = false;
 int countOfCounts[COC_MAX+1];
 int totalDistinct = 0;
 float minCountHierarchical = 0;
@ -71,13 +72,13 @@ vector<string> tokenize( const char [] );

 void writeCountOfCounts( const string &fileNameCountOfCounts );
 void processPhrasePairs( vector< PhraseAlignment > & , ostream &phraseTableFile, bool isSingleton);
-PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection &phrasePair );
+const PhraseAlignment &findBestAlignment(const PhraseAlignmentCollection &phrasePair );
 void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float, int, ostream &phraseTableFile, bool isSingleton );
-double computeLexicalTranslation( const PHRASE &, const PHRASE &, PhraseAlignment * );
-double computeUnalignedPenalty( const PHRASE &, const PHRASE &, PhraseAlignment * );
+double computeLexicalTranslation( const PHRASE &, const PHRASE &, const PhraseAlignment & );
+double computeUnalignedPenalty( const PHRASE &, const PHRASE &, const PhraseAlignment & );
 set<string> functionWordList;
 void loadFunctionWords( const string &fileNameFunctionWords );
-double computeUnalignedFWPenalty( const PHRASE &, const PHRASE &, PhraseAlignment * );
+double computeUnalignedFWPenalty( const PHRASE &, const PHRASE &, const PhraseAlignment & );
 void calcNTLengthProb(const vector< PhraseAlignment* > &phrasePairs
                      , map<size_t, map<size_t, float> > &sourceProb
                      , map<size_t, map<size_t, float> > &targetProb);
@ -90,7 +91,7 @@ int main(int argc, char* argv[])
       << "scoring methods for extracted rules\n";

  if (argc < 4) {
-    cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--WordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] [--PCFG] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--Singleton] \n";
+    cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--WordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] [--PCFG] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--Singleton] [--CrossedNonTerm] \n";
    exit(1);
  }
  string fileNameExtract = argv[1];
@ -156,6 +157,9 @@ int main(int argc, char* argv[])
    } else if (strcmp(argv[i],"--Singleton") == 0) {
      singletonFeature = true;
      cerr << "binary singleton feature\n";
+    } else if (strcmp(argv[i],"--CrossedNonTerm") == 0) {
+      crossedNonTerm = true;
+      cerr << "crossed non-term reordering feature\n";
    } else {
      cerr << "ERROR: unknown option " << argv[i] << endl;
      exit(1);
@ -243,12 +247,12 @@ int main(int argc, char* argv[])
      processPhrasePairs( phrasePairsWithSameF, *phraseTableFile, isSingleton );
      
      phrasePairsWithSameF.clear();
-      isSingleton = true;
+      isSingleton = false;
      lastPhrasePair = NULL;
    }
    else
    {
-      isSingleton = false;
+      isSingleton = true;
    }

    // add phrase pairs to list, it's now the last one
@ -336,7 +340,7 @@ void processPhrasePairs( vector< PhraseAlignment > &phrasePair, ostream &phraseT
  
 }

-PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection &phrasePair )
+const PhraseAlignment &findBestAlignment(const PhraseAlignmentCollection &phrasePair )
 {
  float bestAlignmentCount = -1;
  PhraseAlignment* bestAlignment;
@ -357,7 +361,7 @@ PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection &phrasePair )
    }
  }    

-  return bestAlignment;
+  return *bestAlignment;
 }


@ -448,11 +452,73 @@ void outputNTLengthProbs(ostream &phraseTableFile, const map<size_t, map<size_t,

 }

+bool calcCrossedNonTerm(int sourcePos, int targetPos, const std::vector< std::set<size_t> > &alignedToS)
+{
+  for (int currSource = 0; currSource < alignedToS.size(); ++currSource)
+  {
+    if (currSource == sourcePos)
+    { // skip
+    }
+    else 
+    {
+      const std::set<size_t> &targetSet = alignedToS[currSource];
+      std::set<size_t>::const_iterator iter;
+      for (iter = targetSet.begin(); iter != targetSet.end(); ++iter)
+      {
+        size_t currTarget = *iter;
+        
+        if ((currSource < sourcePos && currTarget > targetPos)
+            || (currSource > sourcePos && currTarget < targetPos)
+          )
+        {
+          return true;
+        }
+      }
+      
+    }
+  }
+  
+  return false;
+}
+
+int calcCrossedNonTerm(const PHRASE &phraseS, const PhraseAlignment &bestAlignment)
+{
+  const std::vector< std::set<size_t> > &alignedToS = bestAlignment.alignedToS;
+  
+  for (int sourcePos = 0; sourcePos < alignedToS.size(); ++sourcePos)
+  {
+    const std::set<size_t> &targetSet = alignedToS[sourcePos];
+    cerr << "size=" << targetSet.size() << " ";
+    std::set<size_t>::const_iterator iter;
+    for (iter = targetSet.begin(); iter != targetSet.end(); ++iter)
+    {
+      size_t targetPos = *iter;
+      cerr << sourcePos << "-" << targetPos << " ";
+    }
+    cerr << endl;
+    
+    WORD_ID wordId = phraseS[sourcePos];
+    const WORD &word = vcbS.getWord(wordId);
+    bool isNonTerm = isNonTerminal(word);
+    
+    if (isNonTerm)
+    {
+      assert(targetSet.size() == 1);
+      int targetPos = *targetSet.begin();
+      bool ret = calcCrossedNonTerm(sourcePos, targetPos, alignedToS);
+      if (ret)
+        return 1;
+    }
+  }
+  
+  return 0;
+}
+
 void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCount, int distinctCount, ostream &phraseTableFile, bool isSingleton )
 {
  if (phrasePair.size() == 0) return;

-  PhraseAlignment *bestAlignment = findBestAlignment( phrasePair );
+  const PhraseAlignment &bestAlignment = findBestAlignment( phrasePair );
    
  // compute count
  float count = 0;
@ -492,17 +558,17 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo

  // source phrase (unless inverse)
  if (! inverseFlag) {
-    printSourcePhrase(phraseS, phraseT, *bestAlignment, phraseTableFile);
+    printSourcePhrase(phraseS, phraseT, bestAlignment, phraseTableFile);
    phraseTableFile << " ||| ";
  }

  // target phrase
-  printTargetPhrase(phraseS, phraseT, *bestAlignment, phraseTableFile);
+  printTargetPhrase(phraseS, phraseT, bestAlignment, phraseTableFile);
  phraseTableFile << " ||| ";

  // source phrase (if inverse)
  if (inverseFlag) {
-    printSourcePhrase(phraseS, phraseT, *bestAlignment, phraseTableFile);
+    printSourcePhrase(phraseS, phraseT, bestAlignment, phraseTableFile);
    phraseTableFile << " ||| ";
  }

@ -525,7 +591,11 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
  }

  if (singletonFeature) {
-    phraseTableFile << " " << (isSingleton?1:0);
+    phraseTableFile << " " << (isSingleton ? 1 : 0);
+  }
+  
+  if (crossedNonTerm && !inverseFlag) {
+    phraseTableFile << " " << calcCrossedNonTerm(phraseS, bestAlignment);
  }
  
  // target-side PCFG score
@ -539,26 +609,31 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
  if (! inverseFlag) {
    if (hierarchicalFlag) {
      // always output alignment if hiero style, but only for non-terms
-      assert(phraseT.size() == bestAlignment->alignedToT.size() + 1);
+      assert(phraseT.size() == bestAlignment.alignedToT.size() + 1);
      for(size_t j = 0; j < phraseT.size() - 1; j++) {
        if (isNonTerminal(vcbT.getWord( phraseT[j] ))) {
-          if (bestAlignment->alignedToT[ j ].size() != 1) {
+          if (bestAlignment.alignedToT[ j ].size() != 1) {
            cerr << "Error: unequal numbers of non-terminals. Make sure the text does not contain words in square brackets (like [xxx])." << endl;
            phraseTableFile.flush();
-            assert(bestAlignment->alignedToT[ j ].size() == 1);
+            assert(bestAlignment.alignedToT[ j ].size() == 1);
          }
-          int sourcePos = *(bestAlignment->alignedToT[ j ].begin());
+          int sourcePos = *(bestAlignment.alignedToT[ j ].begin());
          phraseTableFile << sourcePos << "-" << j << " ";
        }
        else if (wordAlignmentFlag) {
-          int sourcePos = *(bestAlignment->alignedToT[ j ].begin());
-          phraseTableFile << sourcePos << "-" << j << " ";
+          const std::set<size_t> &sourceSet = bestAlignment.alignedToT[ j ];
+          std::set<size_t>::const_iterator iter;
+          for (iter = sourceSet.begin(); iter != sourceSet.end(); ++iter)
+          {
+            int sourcePos = *iter;
+            phraseTableFile << sourcePos << "-" << j << " ";            
+          }
        }
      }
    } else if (wordAlignmentFlag) {
      // alignment info in pb model
-      for(size_t j=0; j<bestAlignment->alignedToT.size(); j++) {
-        const set< size_t > &aligned = bestAlignment->alignedToT[j];
+      for(size_t j=0; j<bestAlignment.alignedToT.size(); j++) {
+        const set< size_t > &aligned = bestAlignment.alignedToT[j];
        for (set< size_t >::const_iterator p(aligned.begin()); p != aligned.end(); ++p) {
          phraseTableFile << *p << "-" << j << " ";
        }
@ -592,13 +667,13 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
  phraseTableFile << endl;
 }

-double computeUnalignedPenalty( const PHRASE &phraseS, const PHRASE &phraseT, PhraseAlignment *alignment )
+double computeUnalignedPenalty( const PHRASE &phraseS, const PHRASE &phraseT, const PhraseAlignment &alignment )
 {
  // unaligned word counter
  double unaligned = 1.0;
  // only checking target words - source words are caught when computing inverse
-  for(size_t ti=0; ti<alignment->alignedToT.size(); ti++) {
-    const set< size_t > & srcIndices = alignment->alignedToT[ ti ];
+  for(size_t ti=0; ti<alignment.alignedToT.size(); ti++) {
+    const set< size_t > & srcIndices = alignment.alignedToT[ ti ];
    if (srcIndices.empty()) {
      unaligned *= 2.718;
    }
@ -606,13 +681,13 @@ double computeUnalignedPenalty( const PHRASE &phraseS, const PHRASE &phraseT, Ph
  return unaligned;
 }

-double computeUnalignedFWPenalty( const PHRASE &phraseS, const PHRASE &phraseT, PhraseAlignment *alignment )
+double computeUnalignedFWPenalty( const PHRASE &phraseS, const PHRASE &phraseT, const PhraseAlignment &alignment )
 {
  // unaligned word counter
  double unaligned = 1.0;
  // only checking target words - source words are caught when computing inverse
-  for(size_t ti=0; ti<alignment->alignedToT.size(); ti++) {
-    const set< size_t > & srcIndices = alignment->alignedToT[ ti ];
+  for(size_t ti=0; ti<alignment.alignedToT.size(); ti++) {
+    const set< size_t > & srcIndices = alignment.alignedToT[ ti ];
    if (srcIndices.empty() && functionWordList.find( vcbT.getWord( phraseT[ ti ] ) ) != functionWordList.end()) {
      unaligned *= 2.718;
    }
@ -645,14 +720,14 @@ void loadFunctionWords( const string &fileName )
  inFile.close();
 }

-double computeLexicalTranslation( const PHRASE &phraseS, const PHRASE &phraseT, PhraseAlignment *alignment )
+double computeLexicalTranslation( const PHRASE &phraseS, const PHRASE &phraseT, const PhraseAlignment &alignment )
 {
  // lexical translation probability
  double lexScore = 1.0;
  int null = vcbS.getWordID("NULL");
  // all target words have to be explained
-  for(size_t ti=0; ti<alignment->alignedToT.size(); ti++) {
-    const set< size_t > & srcIndices = alignment->alignedToT[ ti ];
+  for(size_t ti=0; ti<alignment.alignedToT.size(); ti++) {
+    const set< size_t > & srcIndices = alignment.alignedToT[ ti ];
    if (srcIndices.empty()) {
      // explain unaligned word by NULL
      lexScore *= lexTable.permissiveLookup( null, phraseT[ ti ] );