make filter-pt compile under windows

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1454 1f5c12ca-751b-0410-a591-d2e778427230
2024-12-28 14:32:38 +03:00 · 2007-08-16 18:13:04 +00:00 · 2007-08-16 18:13:04 +00:00 · 72d53aee46
commit 72d53aee46
parent af9488c7ba
6 changed files with 864 additions and 357 deletions
--- a/sigtest-filter/README.txt
+++ b/sigtest-filter/README.txt
--- a/sigtest-filter/XGetopt.cpp
+++ b/sigtest-filter/XGetopt.cpp
@ -0,0 +1,220 @@
+// XGetopt.cpp  Version 1.2
+//
+// Author:  Hans Dietrich
+//          hdietrich2@hotmail.com
+//
+// Description:
+//     XGetopt.cpp implements getopt(), a function to parse command lines.
+//
+// History
+//     Version 1.2 - 2003 May 17
+//     - Added Unicode support
+//
+//     Version 1.1 - 2002 March 10
+//     - Added example to XGetopt.cpp module header 
+//
+// This software is released into the public domain.
+// You are free to use it in any way you like.
+//
+// This software is provided "as is" with no expressed
+// or implied warranty.  I accept no liability for any
+// damage or loss of business that this software may cause.
+//
+///////////////////////////////////////////////////////////////////////////////
+
+
+///////////////////////////////////////////////////////////////////////////////
+// if you are using precompiled headers then include this line:
+///////////////////////////////////////////////////////////////////////////////
+
+
+///////////////////////////////////////////////////////////////////////////////
+// if you are not using precompiled headers then include these lines:
+//#include <windows.h>
+//#include <stdio.h>
+//#include <tchar.h>
+///////////////////////////////////////////////////////////////////////////////
+
+
+#include <stdio.h>
+#include <string.h>
+#include "XGetopt.h"
+
+
+///////////////////////////////////////////////////////////////////////////////
+//
+//  X G e t o p t . c p p
+//
+//
+//  NAME
+//       getopt -- parse command line options
+//
+//  SYNOPSIS
+//       int getopt(int argc, char *argv[], char *optstring)
+//
+//       extern char *optarg;
+//       extern int optind;
+//
+//  DESCRIPTION
+//       The getopt() function parses the command line arguments. Its
+//       arguments argc and argv are the argument count and array as
+//       passed into the application on program invocation.  In the case
+//       of Visual C++ programs, argc and argv are available via the
+//       variables __argc and __argv (double underscores), respectively.
+//       getopt returns the next option letter in argv that matches a
+//       letter in optstring.  (Note:  Unicode programs should use
+//       __targv instead of __argv.  Also, all character and string
+//       literals should be enclosed in _T( ).  This copy, however, is
+//       declared with plain char strings only.)
+//
+//       optstring is a string of recognized option letters;  if a letter
+//       is followed by a colon, the option is expected to have an argument
+//       that may or may not be separated from it by white space.  optarg
+//       is set to point to the start of the option argument on return from
+//       getopt.
+//
+//       Option letters may be combined, e.g., "-ab" is equivalent to
+//       "-a -b".  Option letters are case sensitive.
+//
+//       getopt places in the external variable optind the argv index
+//       of the next argument to be processed.  optind is initialized
+//       to 0 before the first call to getopt.
+//
+//       When all options have been processed (i.e., up to the first
+//       non-option argument), getopt returns EOF, optarg will point
+//       to the argument, and optind will be set to the argv index of
+//       the argument.  If there are no non-option arguments, optarg
+//       will be set to NULL.
+//
+//       The special option "--" may be used to delimit the end of the
+//       options;  EOF will be returned, and "--" (and everything after it)
+//       will be skipped.
+//
+//  RETURN VALUE
+//       For option letters contained in the string optstring, getopt
+//       will return the option letter.  getopt returns a question mark (?)
+//       when it encounters an option letter not included in optstring.
+//       EOF is returned when processing is finished.
+//
+//  BUGS
+//       1)  Long options are not supported.
+//       2)  The GNU double-colon extension is not supported.
+//       3)  The environment variable POSIXLY_CORRECT is not supported.
+//       4)  The + syntax is not supported.
+//       5)  The automatic permutation of arguments is not supported.
+//       6)  This implementation of getopt() returns EOF when option
+//           processing is finished (errors are reported as '?'), instead
+//           of -1 as the latest standard requires.
+//
+//  EXAMPLE
+//       BOOL CMyApp::ProcessCommandLine(int argc, char *argv[])
+//       {
+//           int c;
+//
+//           while ((c = getopt(argc, argv, ("aBn:"))) != EOF)
+//           {
+//               switch (c)
+//               {
+//                   case ('a'):
+//                       TRACE(("option a\n"));
+//                       //
+//                       // set some flag here
+//                       //
+//                       break;
+//
+//                   case ('B'):
+//                       TRACE( ("option B\n"));
+//                       //
+//                       // set some other flag here
+//                       //
+//                       break;
+//
+//                   case ('n'):
+//                       TRACE(("option n: value=%d\n"), atoi(optarg));
+//                       //
+//                       // do something with value here
+//                       //
+//                       break;
+//
+//                   case ('?'):
+//                       TRACE(("ERROR: illegal option %s\n"), argv[optind-1]);
+//                       return FALSE;
+//                       break;
+//
+//                   default:
+//                       TRACE(("WARNING: no handler for option %c\n"), c);
+//                       return FALSE;
+//                       break;
+//               }
+//           }
+//           //
+//           // check for non-option args here
+//           //
+//           return TRUE;
+//       }
+//
+///////////////////////////////////////////////////////////////////////////////
+
+char	*optarg;		// option argument (or first non-option argument) set by getopt()
+int		optind = 0; 	// argv index of the next element getopt() will examine
+
+// Scans argv for the next option letter listed in optstring.
+//   Returns the letter on a match (optarg points at its argument when the
+//   letter is followed by ':' in optstring), '?' for a letter that is not in
+//   optstring or whose required argument is missing, and EOF when the next
+//   argv element is not an option (optarg then points at that element, or is
+//   NULL when argv is exhausted).
+int getopt(int argc, char *argv[], char *optstring)
+{
+	// Position inside the option cluster being scanned, e.g. the "b" of "-ab".
+	static char *cursor = NULL;
+
+	// optind == 0 requests a fresh scan, so drop any half-read cluster.
+	if (optind == 0)
+		cursor = NULL;
+
+	optarg = NULL;
+
+	const bool cluster_done = (cursor == NULL) || (*cursor == '\0');
+	if (cluster_done)
+	{
+		// argv[0] is the program name; scanning starts at argv[1].
+		if (optind == 0)
+			optind = 1;
+
+		const bool have_element = optind < argc;
+		if (!have_element || argv[optind][0] != '-' || argv[optind][1] == '\0')
+		{
+			// Not an option (or nothing left): hand the element back via optarg.
+			optarg = have_element ? argv[optind] : NULL;
+			return EOF;
+		}
+
+		if (strcmp(argv[optind], "--") == 0)
+		{
+			// Explicit end-of-options marker: step over it.
+			++optind;
+			optarg = (optind < argc) ? argv[optind] : NULL;
+			return EOF;
+		}
+
+		cursor = argv[optind] + 1;	// first letter after the '-'
+		++optind;
+	}
+
+	const char letter = *cursor++;
+	const char *match = strchr(optstring, letter);
+
+	// ':' is optstring syntax, never a valid option letter.
+	if (match == NULL || letter == ':')
+		return '?';
+
+	const bool wants_argument = (match[1] == ':');
+	if (!wants_argument)
+		return letter;
+
+	if (*cursor != '\0')
+	{
+		// Argument is glued to the option: "-n5".
+		optarg = cursor;
+		cursor = NULL;
+		return letter;
+	}
+
+	// Argument must be the next argv element: "-n 5".
+	if (optind >= argc)
+		return '?';
+
+	optarg = argv[optind];
+	++optind;
+	return letter;
+}
--- a/sigtest-filter/XGetopt.h
+++ b/sigtest-filter/XGetopt.h
@ -0,0 +1,23 @@
+// XGetopt.h  Version 1.2
+//
+// Author:  Hans Dietrich
+//          hdietrich2@hotmail.com
+//
+// This software is released into the public domain.
+// You are free to use it in any way you like.
+//
+// This software is provided "as is" with no expressed
+// or implied warranty.  I accept no liability for any
+// damage or loss of business that this software may cause.
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#ifndef XGETOPT_H
+#define XGETOPT_H
+
+extern int optind, opterr;
+extern char *optarg;
+
+int getopt(int argc, char *argv[], char *optstring);
+
+#endif //XGETOPT_H
--- a/sigtest-filter/filter-pt.cpp
+++ b/sigtest-filter/filter-pt.cpp
@ -1,357 +1,364 @@
-#include <cassert>
-#include <cstdio>
-#include <cstdlib>
-#include <unistd.h>
-
-#include "_SuffixArraySearchApplicationBase.h"
-
-#include <vector>
-#include <iostream>
-#include <set>
-
-typedef std::set<TextLenType> SentIdSet;
-typedef std::map<std::string, SentIdSet> PhraseSetMap;
-
-// constants
-const size_t MINIMUM_SIZE_TO_KEEP = 10000;     // reduce this to improve memory usage,
-                                               // increase for speed
-const std::string SEPARATOR       = " ||| ";
-
-const double ALPHA_PLUS_EPS  = -1000.0;        // dummy value
-const double ALPHA_MINUS_EPS = -2000.0;        // dummy value
-
-// configuration params
-int pfe_filter_limit = 0;               // 0 = don't filter anything based on P(f|e)
-bool print_cooc_counts = false;         // add cooc counts to phrase table?
-bool print_neglog_significance = false; // add -log(p) to phrase table?
-double sig_filter_limit = 0;            // keep phrase pairs with -log(sig) > sig_filter_limit
-                                        //    higher = filter-more
-
-// globals
-PhraseSetMap esets;
-double p_111 = 0.0;                     // alpha
-size_t nremoved_sigfilter = 0;
-size_t nremoved_pfefilter = 0;
-
-C_SuffixArraySearchApplicationBase e_sa;
-C_SuffixArraySearchApplicationBase f_sa;
-int num_lines;
-
-void usage()
-{
-    std::cerr << "\nFilter phrase table using significance testing as described\n"
-              << "in H. Johnson, et al. (2007) Improving Translation Quality\n"
-	      << "by Discarding Most of the Phrasetable. EMNLP 2007.\n"
-        << "\nUsage:\n"
-        << "\n  filter-pt -e english.suf-arr -f french.suf-arr\n"
-	<< "      [-c] [-p] [-l threshold] [-n num] < PHRASE-TABLE > FILTERED-PHRASE-TABLE\n\n"
-	<< "   [-l threshold] >0.0, a+e, or a-e: keep values that have a -log significance > this\n"
-	<< "   [-n num      ] 0, 1...: 0=no filtering, >0 sort by P(e|f) and keep the top num elements\n"
-	<< "   [-c          ] add the cooccurence counts to the phrase table\n"
-	<< "   [-p          ] add -log(significance) to the phrasetable\n\n";
-    exit(1);
-}
-
-struct PTEntry {
-    PTEntry(const std::string& str, int index);
-    std::string f_phrase;
-    std::string e_phrase;
-    std::string extra;
-    std::string scores;
-    float pfe;
-    int cf;
-    int ce;
-    int cfe;
-    float nlog_pte;
-    void set_cooc_stats(int _cef, int _cf, int _ce, float nlp) {
-      cfe = _cef;
-      cf = _cf;
-      ce = _ce;
-      nlog_pte = nlp;
-    }
-    
-};
-
-PTEntry::PTEntry(const std::string& str, int index) :
- cf(0), ce(0), cfe(0), nlog_pte(0.0)
-{
-    size_t pos = 0;
-    std::string::size_type nextPos = str.find(SEPARATOR, pos);
-    this->f_phrase = str.substr(pos,nextPos); pos = nextPos + SEPARATOR.size();
-    nextPos = str.find(SEPARATOR, pos);
-    this->e_phrase = str.substr(pos,nextPos-pos); pos = nextPos + SEPARATOR.size();
-    nextPos = str.rfind(SEPARATOR);
-    this->extra = str.substr(pos, nextPos-pos);
-    this->scores = str.substr(nextPos + SEPARATOR.size(),std::string::npos);
-    int c = 0;
-    std::string::iterator i=scores.begin();
-    if (index > 0) {
-        for (; i != scores.end(); ++i) {
-            if ((*i) == ' ') {
-                c++;
-                if (c == index) break;
-            }
-        }
-    }
-    ++i;
-    char f[24];
-    char *fp=f;
-    while (i != scores.end() && *i != ' ') {
-      *fp++=*i++;
-    }
-    *fp++=0;
-    char *x;
-    this->pfe = strtof(f, &x);
-    // std::cerr << "L: " << f_phrase << " ::: " << e_phrase << " ::: " << scores << " ::: " << pfe << std::endl;
-    // std::cerr << "X: " << extra << "\n"; 
-}
-
-struct PfeComparer {
-  bool operator()(const PTEntry* a, const PTEntry* b) const { return a->pfe > b->pfe; }
-};
-
-struct NlogSigThresholder {
-  NlogSigThresholder(float threshold) : t(threshold) {}
-  float t;
-  bool operator()(const PTEntry* a) const { if (a->nlog_pte < t) { delete a; return true; } else return false; }
-};
-
-std::ostream& operator << (std::ostream& os, const PTEntry& pp)
-{
-  os << pp.f_phrase << " ||| " << pp.e_phrase;
-  if (pp.extra.size()>0) os << " ||| " << pp.extra;
-  os << " ||| " << pp.scores;
-  if (print_cooc_counts) os << " ||| " << pp.cfe << " " << pp.cf << " " << pp.ce;
-  if (print_neglog_significance) os << " ||| " << pp.nlog_pte;
-  return os;
-}
-
-// for an overview, see 
-//    W. Press, S. Teukolsky and W. Vetterling. (1992) Numerical Recipes in C. Chapter 6.1.
-double log_gamma(int x)
-{
-  // size_t xx=(size_t)x; xx--; size_t sum=1; while (xx) { sum *= xx--; } return log((double)(sum));
-  if (x <= 2) { return 0.0; }
-  static double coefs[6] = {76.18009172947146, -86.50532032941677, 24.01409824083091, -1.231739572450155, 0.1208650973866179e-2, -0.5395239384953e-5};
-  double tmp=(double)x+5.5;
-  tmp -= (((double)x)+0.5)*log(tmp);
-  double y=(double)x;
-  double sum = 1.000000000190015;
-  for (size_t j=0;j<6;++j) { sum += coefs[j]/++y; }
-  return -tmp+log(2.5066282746310005*sum/(double)x); 
-}
-
-void print(int a, int b, int c, int d, float p) {
-  std::cerr << a << "\t" << b << "\t P=" << p << "\n"
-            << c << "\t" << d << "\t xf=" << (double)(b)*(double)(c)/(double)(a+1)/(double)(d+1) << "\n\n";
-}
-
-// 2x2 (one-sided) Fisher's exact test
-// see B. Moore. (2004) On Log Likelihood and the Significance of Rare Events
-double fisher_exact(int cfe, int ce, int cf)
-{
-    assert(cfe <= ce);
-    assert(cfe <= cf);
-
-    int a = cfe;
-    int b = (cf - cfe);
-    int c = (ce - cfe);
-    int d = (num_lines - ce - cf + cfe);
-    int n = a + b + c + d;
-
-    double cp = exp(log_gamma(1+a+c) + log_gamma(1+b+d) + log_gamma(1+a+b) + log_gamma(1+c+d) - log_gamma(1+n) - log_gamma(1+a) - log_gamma(1+b) - log_gamma(1+c) - log_gamma(1+d));
-    double total_p = 0.0;
-    int tc = std::min(b,c);
-    for (int i=0; i<=tc; i++) {
-      total_p += cp;
-//      double lg = log_gamma(1+a+c) + log_gamma(1+b+d) + log_gamma(1+a+b) + log_gamma(1+c+d) - log_gamma(1+n) - log_gamma(1+a) - log_gamma(1+b) - log_gamma(1+c) - log_gamma(1+d); double cp = exp(lg);
-//      print(a,b,c,d,cp);
-      double coef = (double)(b)*(double)(c)/(double)(a+1)/(double)(d+1);
-      cp *= coef;
-      ++a; --c; ++d; --b;
-    }
-  return total_p;
-}
-
-// input: unordered list of translation options for a single source phrase
-void compute_cooc_stats_and_filter(std::vector<PTEntry*>& options)
-{
-  if (pfe_filter_limit>0 && options.size() > pfe_filter_limit) {
-    nremoved_pfefilter += (options.size() - pfe_filter_limit);
-    std::nth_element(options.begin(), options.begin()+pfe_filter_limit, options.end(), PfeComparer());
-    for (std::vector<PTEntry*>::iterator i=options.begin()+pfe_filter_limit; i != options.end(); ++i)
-      delete *i;
-    options.erase(options.begin()+pfe_filter_limit,options.end());
-  }
-  
-  SentIdSet fset;
-  vector<S_SimplePhraseLocationElement> locations;
-  //std::cerr << "Looking up f-phrase: " << options.front()->f_phrase << "\n";
-
-  locations = f_sa.locateExactPhraseInCorpus(options.front()->f_phrase.c_str());
-  if(locations.size()==0){
-	  cerr<<"No occurrences found!!\n";
-  }
-  for (vector<S_SimplePhraseLocationElement>::iterator i=locations.begin();
-		  i != locations.end();
-		  ++i)
-  {
-	  fset.insert(i->sentIdInCorpus);
-  }
-  size_t cf = fset.size();
-  for (std::vector<PTEntry*>::iterator i=options.begin(); i != options.end(); ++i) {
-    const std::string& e_phrase = (*i)->e_phrase;
-    size_t cef=0;
-    SentIdSet& eset = esets[(*i)->e_phrase];
-    if (eset.empty()) {
-      //std::cerr << "Looking up e-phrase: " << e_phrase << "\n";  
-      vector<S_SimplePhraseLocationElement> locations = e_sa.locateExactPhraseInCorpus(e_phrase.c_str());
-      for (vector<S_SimplePhraseLocationElement>::iterator i=locations.begin(); i!= locations.end(); ++i) {
-          TextLenType curSentId = i->sentIdInCorpus;
-  	  eset.insert(curSentId);
-      }
-    }
-    size_t ce=eset.size();
-    if (ce < cf) {
-      for (SentIdSet::iterator i=eset.begin(); i != eset.end(); ++i) {
-        if (fset.find(*i) != fset.end()) cef++;
-      }
-    } else {
-      for (SentIdSet::iterator i=fset.begin(); i != fset.end(); ++i) {
-        if (eset.find(*i) != eset.end()) cef++;
-      }
-    }
-    double nlp = -log(fisher_exact(cef, cf, ce));
-    (*i)->set_cooc_stats(cef, cf, ce, nlp);
-    if (ce < MINIMUM_SIZE_TO_KEEP) {
-      esets.erase(e_phrase);
-    }
-  }
-  std::vector<PTEntry*>::iterator new_end = 
-    std::remove_if(options.begin(), options.end(), NlogSigThresholder(sig_filter_limit));
-  nremoved_sigfilter += (options.end() - new_end);
-  options.erase(new_end,options.end());
-}
-
-int main(int argc, char * argv[]){
-    int c;
-    const char* efile=0;
-    const char* ffile=0;
-    int pfe_index = 2;
-    while ((c = getopt(argc, argv, "cpf:e:i:n:l:")) != -1) {
-        switch (c) {
-            case 'e':
-                efile = optarg;
-                break;
-            case 'f':
-                ffile = optarg;
-                break;
-            case 'i':  // index of pfe in phrase table
-                pfe_index = atoi(optarg);
-                break;
-            case 'n':  // keep only the top n entries in phrase table sorted by p(f|e) (0=all)
-                pfe_filter_limit = atoi(optarg);
-		std::cerr << "P(f|e) filter limit: " << pfe_filter_limit << std::endl;
-                break;
-	    case 'c':
-	    	print_cooc_counts = true;
-	    	break;
-	    case 'p':
-	    	print_neglog_significance = true;
-	    	break;
-	    case 'l':
-	        std::cerr << "-l = " << optarg << "\n";
-	        if (strcmp(optarg,"a+e") == 0) {
-		  sig_filter_limit = ALPHA_PLUS_EPS;
-		} else if (strcmp(optarg,"a-e") == 0) {
-		  sig_filter_limit = ALPHA_MINUS_EPS;
-		} else {
-		  char *x;
-		  sig_filter_limit = strtod(optarg, &x);
-		  if (sig_filter_limit < 0.0) {
-		    std::cerr << "Filter limit (-l) must be either 'a+e', 'a-e' or a real number >= 0.0\n";
-		    usage();
-		  }
-		}
-	    	break;
-            default:
-                usage();
-        }
-    }
-    //-----------------------------------------------------------------------------	
-    if (optind != argc || !efile || !ffile) {
-      usage();
-    }
-
-    //load the indexed corpus with vocabulary(noVoc=false) and with offset(noOffset=false)
-    e_sa.loadData_forSearch(efile, false, false);
-    f_sa.loadData_forSearch(ffile, false, false);
-    size_t elines = e_sa.returnTotalSentNumber();
-    size_t flines = f_sa.returnTotalSentNumber();
-    if (elines != flines) {
-      std::cerr << "Number of lines in e-corpus != number of lines in f-corpus!\n";
-      usage();
-    } else {
-      std::cerr << "Training corpus: " << elines << " lines\n";
-      num_lines = elines;
-    }
-    p_111 = -log(fisher_exact(1,1,1));
-    std::cerr << "\\alpha = " << p_111 << "\n";
-    if (sig_filter_limit == ALPHA_MINUS_EPS) { sig_filter_limit = p_111 - 0.001; }
-      else if (sig_filter_limit == ALPHA_PLUS_EPS) { sig_filter_limit = p_111 + 0.001; }
-    std::cerr << "Sig filter threshold is = " << sig_filter_limit << "\n";
-
-    char tmpString[10000];
-    std::string prev = "";
-    std::vector<PTEntry*> options;
-    size_t pt_lines = 0;
-    while(!cin.eof()){
-	cin.getline(tmpString,10000,'\n');
-	if(++pt_lines%10000==0)
-	{
-	    std::cerr << ".";
-	    if(pt_lines%500000==0) std::cerr << "[n:"<<pt_lines<<"]\n";
-	}
-
-	if(strlen(tmpString)>0){
-            PTEntry* pp = new PTEntry(tmpString, pfe_index);
-            if (prev != pp->f_phrase) {
-                prev = pp->f_phrase;
-
-		if (!options.empty()) {  // always true after first line
-		  compute_cooc_stats_and_filter(options);
-		}
-		for (std::vector<PTEntry*>::iterator i=options.begin(); i != options.end(); ++i) {
-			std::cout << **i << std::endl;
-			delete *i;
-		}
-		options.clear();
-		options.push_back(pp);
-		
-            } else {
-	        options.push_back(pp);
-	    }
-            //			  for(int i=0;i<locations.size(); i++){
-            //				  cout<<"SentId="<<locations[i].sentIdInCorpus<<" Pos="<<(int)locations[i].posInSentInCorpus<<endl;
-            //			  }
-        }
-    }
-    compute_cooc_stats_and_filter(options);
-    for (std::vector<PTEntry*>::iterator i=options.begin(); i != options.end(); ++i) {
-        std::cout << **i << std::endl;
-        delete *i;
-    }
-    float pfefper = (100.0*(float)nremoved_pfefilter)/(float)pt_lines;
-    float sigfper = (100.0*(float)nremoved_sigfilter)/(float)pt_lines;
-    std::cerr << "\n\n------------------------------------------------------\n"
-              << "  unfiltered phrases pairs: " << pt_lines << "\n"
-              << "\n"
-	      << "     P(f|e) filter [first]: " << nremoved_pfefilter << "   (" << pfefper << "%)\n"
-	      << "       significance filter: " << nremoved_sigfilter << "   (" << sigfper << "%)\n"
-	      << "            TOTAL FILTERED: " << (nremoved_pfefilter + nremoved_sigfilter) << "   (" << (sigfper + pfefper) << "%)\n"
-              << "\n"
-              << "     FILTERED phrase pairs: " << (pt_lines - nremoved_pfefilter - nremoved_sigfilter) << "   (" << (100.0-sigfper - pfefper) << "%)\n"
-	      << "------------------------------------------------------\n";
-	      
-    return 0;
-}
+
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+//#include <unistd.h>
+#include <algorithm>
+
+#include "_SuffixArraySearchApplicationBase.h"
+
+#include <vector>
+#include <iostream>
+#include <set>
+#include <string>
+
+#include "XGetopt.h"
+
+typedef std::set<TextLenType> SentIdSet;
+typedef std::map<std::string, SentIdSet> PhraseSetMap;
+
+#undef min
+
+// constants
+const size_t MINIMUM_SIZE_TO_KEEP = 10000;     // reduce this to improve memory usage,
+                                               // increase for speed
+const std::string SEPARATOR       = " ||| ";
+
+const double ALPHA_PLUS_EPS  = -1000.0;        // dummy value
+const double ALPHA_MINUS_EPS = -2000.0;        // dummy value
+
+// configuration params
+int pfe_filter_limit = 0;               // 0 = don't filter anything based on P(f|e)
+bool print_cooc_counts = false;         // add cooc counts to phrase table?
+bool print_neglog_significance = false; // add -log(p) to phrase table?
+double sig_filter_limit = 0;            // keep phrase pairs with -log(sig) > sig_filter_limit
+                                        //    higher = filter-more
+
+// globals
+PhraseSetMap esets;
+double p_111 = 0.0;                     // alpha
+size_t nremoved_sigfilter = 0;
+size_t nremoved_pfefilter = 0;
+
+C_SuffixArraySearchApplicationBase e_sa;
+C_SuffixArraySearchApplicationBase f_sa;
+int num_lines;
+
+// Writes the command-line help to stderr and terminates with exit status 1.
+void usage()
+{
+    // Adjacent string literals are concatenated by the compiler, so the whole
+    // help text goes out in one stream insertion.
+    static const char* const HELP_TEXT =
+        "\nFilter phrase table using significance testing as described\n"
+        "in H. Johnson, et al. (2007) Improving Translation Quality\n"
+        "by Discarding Most of the Phrasetable. EMNLP 2007.\n"
+        "\nUsage:\n"
+        "\n  filter-pt -e english.suf-arr -f french.suf-arr\n"
+        "      [-c] [-p] [-l threshold] [-n num] < PHRASE-TABLE > FILTERED-PHRASE-TABLE\n\n"
+        "   [-l threshold] >0.0, a+e, or a-e: keep values that have a -log significance > this\n"
+        "   [-n num      ] 0, 1...: 0=no filtering, >0 sort by P(e|f) and keep the top num elements\n"
+        "   [-c          ] add the cooccurence counts to the phrase table\n"
+        "   [-p          ] add -log(significance) to the phrasetable\n\n";
+
+    std::cerr << HELP_TEXT;
+    exit(1);
+}
+
+// One phrase-table line of the form
+//     f_phrase ||| e_phrase [||| extra ...] ||| scores
+// together with the co-occurrence statistics attached later through
+// set_cooc_stats().
+struct PTEntry {
+    // Parses the line `str`. `index` is the 0-based position, within the
+    // space-separated `scores` field, of the value stored in `pfe`.
+    PTEntry(const std::string& str, int index);
+    std::string f_phrase;   // text before the first separator
+    std::string e_phrase;   // text between the first and second separator
+    std::string extra;      // everything between e_phrase and scores (may be empty)
+    std::string scores;     // text after the last separator, kept verbatim
+    float pfe;              // the score selected by `index` (0 if it cannot be parsed)
+    int cf;
+    int ce;
+    int cfe;
+    float nlog_pte;
+    // Stores the co-occurrence counts and the significance value (nlp) for this entry.
+    void set_cooc_stats(int _cef, int _cf, int _ce, float nlp) {
+      cfe = _cef;
+      cf = _cf;
+      ce = _ce;
+      nlog_pte = nlp;
+    }
+    
+};
+
+// NOTE(review): the field splitting assumes the line contains at least two
+// separators; malformed lines are not rejected here.
+PTEntry::PTEntry(const std::string& str, int index) :
+ pfe(0.0f), cf(0), ce(0), cfe(0), nlog_pte(0.0)
+{
+    size_t pos = 0;
+    std::string::size_type nextPos = str.find(SEPARATOR, pos);
+    this->f_phrase = str.substr(pos,nextPos-pos); pos = nextPos + SEPARATOR.size();
+    nextPos = str.find(SEPARATOR, pos);
+    this->e_phrase = str.substr(pos,nextPos-pos); pos = nextPos + SEPARATOR.size();
+    nextPos = str.rfind(SEPARATOR);
+    // On a three-field line the last separator is the one already consumed
+    // (nextPos < pos).  The old code let nextPos-pos wrap around, so `extra`
+    // became a copy of the scores and the scores were printed twice.
+    if (nextPos != std::string::npos && nextPos >= pos) {
+        this->extra = str.substr(pos, nextPos-pos);
+    } else {
+        this->extra.clear();
+    }
+    this->scores = str.substr(nextPos + SEPARATOR.size(),std::string::npos);
+
+    // The selected score starts just past the index-th space (every space
+    // counts, exactly as before).  For index <= 0 it starts at the beginning;
+    // the old unconditional ++i dropped the first character in that case.
+    std::string::size_type start = 0;
+    for (int seen = 0; seen < index; ++seen) {
+        const std::string::size_type space = this->scores.find(' ', start);
+        if (space == std::string::npos) {
+            // Fewer scores than requested: select an empty field instead of
+            // stepping past the end of the string.
+            start = this->scores.size();
+            break;
+        }
+        start = space + 1;
+    }
+    const std::string::size_type stop = this->scores.find(' ', start);
+    const std::string field = (stop == std::string::npos)
+        ? this->scores.substr(start)
+        : this->scores.substr(start, stop - start);
+
+    // Parsing from a std::string removes the fixed 24-byte buffer that a long
+    // score token could overflow.
+    this->pfe = (float)atof(field.c_str());
+
+    // std::cerr << "L: " << f_phrase << " ::: " << e_phrase << " ::: " << scores << " ::: " << pfe << std::endl;
+    // std::cerr << "X: " << extra << "\n"; 
+}
+
+// Ordering predicate that places entries with a larger pfe first.
+struct PfeComparer {
+  bool operator()(const PTEntry* lhs, const PTEntry* rhs) const {
+    return rhs->pfe < lhs->pfe;
+  }
+};
+
+// Predicate for std::remove_if: selects entries whose nlog_pte is below the
+// threshold and frees them on the spot, so the caller only has to erase the
+// leftover pointers.
+struct NlogSigThresholder {
+  float t;  // entries with nlog_pte < t are rejected
+  NlogSigThresholder(float threshold) : t(threshold) {}
+  bool operator()(const PTEntry* a) const {
+    const bool below_threshold = a->nlog_pte < t;
+    if (below_threshold) {
+      delete a;
+    }
+    return below_threshold;
+  }
+};
+
+// Writes the entry back in phrase-table form.  The co-occurrence counts and
+// the significance value are appended only when the matching global print
+// flags are set.
+std::ostream& operator << (std::ostream& os, const PTEntry& pp)
+{
+  const char* const field_sep = " ||| ";
+
+  os << pp.f_phrase << field_sep << pp.e_phrase;
+  if (!pp.extra.empty()) {
+    os << field_sep << pp.extra;
+  }
+  os << field_sep << pp.scores;
+  if (print_cooc_counts) {
+    os << field_sep << pp.cfe << " " << pp.cf << " " << pp.ce;
+  }
+  if (print_neglog_significance) {
+    os << field_sep << pp.nlog_pte;
+  }
+  return os;
+}
+
+// for an overview, see 
+//    W. Press, S. Teukolsky and W. Vetterling. (1992) Numerical Recipes in C. Chapter 6.1.
+// Natural log of the gamma function for integer x, using the series from
+// Numerical Recipes in C, chapter 6.1.  Arguments <= 2 return 0.
+double log_gamma(int x)
+{
+  if (x <= 2) { return 0.0; }
+
+  static const double kSeriesCoefs[6] = {
+    76.18009172947146, -86.50532032941677, 24.01409824083091,
+    -1.231739572450155, 0.1208650973866179e-2, -0.5395239384953e-5};
+
+  const double xd = (double)x;
+
+  double shifted = xd + 5.5;
+  shifted -= (xd + 0.5)*log(shifted);
+
+  // Each coefficient is divided by the next integer above x.
+  double series = 1.000000000190015;
+  double denom = xd;
+  for (const double* coef = kSeriesCoefs; coef != kSeriesCoefs + 6; ++coef) {
+    denom += 1.0;
+    series += *coef/denom;
+  }
+
+  return -shifted+log(2.5066282746310005*series/xd);
+}
+
+// Debug helper: dumps the 2x2 table a b / c d to stderr together with p and
+// the factor b*c/((a+1)*(d+1)).
+void print(int a, int b, int c, int d, float p) {
+  const double xf = (double)(b)*(double)(c)/(double)(a+1)/(double)(d+1);
+  std::cerr << a << "\t" << b << "\t P=" << p << "\n";
+  std::cerr << c << "\t" << d << "\t xf=" << xf << "\n\n";
+}
+
+// 2x2 (one-sided) Fisher's exact test
+// see B. Moore. (2004) On Log Likelihood and the Significance of Rare Events
+// 2x2 (one-sided) Fisher's exact test
+// see B. Moore. (2004) On Log Likelihood and the Significance of Rare Events
+//
+// Builds the contingency table from the joint count cfe, the marginal counts
+// ce and cf and the global num_lines, then sums the probability of the
+// observed table and of every table with a larger joint count.
+double fisher_exact(int cfe, int ce, int cf)
+{
+    assert(cfe <= ce);
+    assert(cfe <= cf);
+
+    // Cells of the 2x2 table.
+    int both    = cfe;
+    int f_only  = (cf - cfe);
+    int e_only  = (ce - cfe);
+    int neither = (num_lines - ce - cf + cfe);
+    const int total = both + f_only + e_only + neither;
+
+    // Probability of the observed table, computed in log space.
+    double term = exp(log_gamma(1+both+e_only) + log_gamma(1+f_only+neither)
+                    + log_gamma(1+both+f_only) + log_gamma(1+e_only+neither)
+                    - log_gamma(1+total)
+                    - log_gamma(1+both) - log_gamma(1+f_only)
+                    - log_gamma(1+e_only) - log_gamma(1+neither));
+
+    double p_value = 0.0;
+    int remaining = std::min(f_only, e_only) + 1;
+    while (remaining-- > 0) {
+      p_value += term;
+      // Move one count from each off-diagonal cell onto the diagonal; the
+      // probability of the next table follows from this ratio.
+      term *= (double)(f_only)*(double)(e_only)/(double)(both+1)/(double)(neither+1);
+      ++both; --e_only; ++neither; --f_only;
+    }
+  return p_value;
+}
+
+// input: unordered list of translation options for a single source phrase
+// input: unordered list of translation options for a single source phrase
+//
+// Steps:
+//   1. if pfe_filter_limit > 0, keep only the pfe_filter_limit entries with
+//      the largest pfe and delete the rest;
+//   2. look up the sentences containing the source phrase (taken from the
+//      front entry) and, per entry, the sentences containing its target
+//      phrase, and count the sentences they share;
+//   3. store the counts and -log(fisher_exact(...)) on each entry;
+//   4. delete and remove the entries whose value is below sig_filter_limit.
+// Deleted entries are freed here; the survivors stay owned by the caller.
+void compute_cooc_stats_and_filter(std::vector<PTEntry*>& options)
+{
+  // Nothing to score.  This also keeps options.front() below well-defined,
+  // e.g. when the phrase table on stdin is empty.
+  if (options.empty()) return;
+
+  if (pfe_filter_limit>0 && options.size() > static_cast<size_t>(pfe_filter_limit)) {
+    nremoved_pfefilter += (options.size() - pfe_filter_limit);
+    std::nth_element(options.begin(), options.begin()+pfe_filter_limit, options.end(), PfeComparer());
+    for (std::vector<PTEntry*>::iterator drop=options.begin()+pfe_filter_limit; drop != options.end(); ++drop)
+      delete *drop;
+    options.erase(options.begin()+pfe_filter_limit,options.end());
+  }
+  
+  // Sentence ids in which the source phrase occurs.
+  SentIdSet fset;
+  vector<S_SimplePhraseLocationElement> locations;
+  //std::cerr << "Looking up f-phrase: " << options.front()->f_phrase << "\n";
+
+  locations = f_sa.locateExactPhraseInCorpus(options.front()->f_phrase.c_str());
+  if(locations.size()==0){
+	  cerr<<"No occurrences found!!\n";
+  }
+  for (vector<S_SimplePhraseLocationElement>::iterator loc=locations.begin();
+		  loc != locations.end();
+		  ++loc)
+  {
+	  fset.insert(loc->sentIdInCorpus);
+  }
+  size_t cf = fset.size();
+  for (std::vector<PTEntry*>::iterator opt=options.begin(); opt != options.end(); ++opt) {
+    const std::string& e_phrase = (*opt)->e_phrase;
+    size_t cef=0;
+    // esets caches the sentence-id set per target phrase; an empty set means
+    // "not looked up yet" (or no occurrences), so it is recomputed.
+    SentIdSet& eset = esets[(*opt)->e_phrase];
+    if (eset.empty()) {
+      //std::cerr << "Looking up e-phrase: " << e_phrase << "\n";  
+      vector<S_SimplePhraseLocationElement> e_locations = e_sa.locateExactPhraseInCorpus(e_phrase.c_str());
+      for (vector<S_SimplePhraseLocationElement>::iterator loc=e_locations.begin(); loc!= e_locations.end(); ++loc) {
+          TextLenType curSentId = loc->sentIdInCorpus;
+  	  eset.insert(curSentId);
+      }
+    }
+    size_t ce=eset.size();
+    // Count shared sentence ids by walking the smaller set and probing the larger.
+    if (ce < cf) {
+      for (SentIdSet::iterator id=eset.begin(); id != eset.end(); ++id) {
+        if (fset.find(*id) != fset.end()) cef++;
+      }
+    } else {
+      for (SentIdSet::iterator id=fset.begin(); id != fset.end(); ++id) {
+        if (eset.find(*id) != eset.end()) cef++;
+      }
+    }
+    // fisher_exact declares its parameters as (cfe, ce, cf); passing cf and ce
+    // in the other order is harmless because the test treats them symmetrically.
+    double nlp = -log(fisher_exact(cef, cf, ce));
+    (*opt)->set_cooc_stats(cef, cf, ce, nlp);
+    // Only large sets are worth caching; `eset` must not be used after this.
+    if (ce < MINIMUM_SIZE_TO_KEEP) {
+      esets.erase(e_phrase);
+    }
+  }
+  // The predicate deletes each rejected entry; erase() then drops the
+  // leftover pointers at the tail.
+  std::vector<PTEntry*>::iterator new_end = 
+    std::remove_if(options.begin(), options.end(), NlogSigThresholder(sig_filter_limit));
+  nremoved_sigfilter += (options.end() - new_end);
+  options.erase(new_end,options.end());
+}
+
+// Prints every entry left in `options` to stdout, frees it, and empties the
+// vector.
+static void emit_and_release(std::vector<PTEntry*>& options)
+{
+    for (std::vector<PTEntry*>::iterator i=options.begin(); i != options.end(); ++i) {
+        // '\n' instead of std::endl: same bytes, but no flush per phrase pair.
+        std::cout << **i << '\n';
+        delete *i;
+    }
+    options.clear();
+}
+
+// Entry point: parses the command line, loads both suffix arrays, then reads
+// the phrase table from stdin.  Consecutive lines sharing a source phrase are
+// collected, filtered with compute_cooc_stats_and_filter() and written to
+// stdout.  A summary of what was removed goes to stderr.
+int main(int argc, char * argv[]){
+    int c;
+    const char* efile=0;
+    const char* ffile=0;
+    int pfe_index = 2;
+    // Writable copy of the option spec: XGetopt declares the parameter as a
+    // non-const char*, which a string literal must not be passed to.
+    char optspec[] = "cpf:e:i:n:l:";
+    while ((c = getopt(argc, argv, optspec)) != -1) {
+        switch (c) {
+            case 'e':
+                efile = optarg;
+                break;
+            case 'f':
+                ffile = optarg;
+                break;
+            case 'i':  // index of pfe in phrase table
+                pfe_index = atoi(optarg);
+                break;
+            case 'n':  // keep only the top n entries in phrase table sorted by p(f|e) (0=all)
+                pfe_filter_limit = atoi(optarg);
+                std::cerr << "P(f|e) filter limit: " << pfe_filter_limit << std::endl;
+                break;
+            case 'c':
+                print_cooc_counts = true;
+                break;
+            case 'p':
+                print_neglog_significance = true;
+                break;
+            case 'l':
+                std::cerr << "-l = " << optarg << "\n";
+                if (strcmp(optarg,"a+e") == 0) {
+                    sig_filter_limit = ALPHA_PLUS_EPS;
+                } else if (strcmp(optarg,"a-e") == 0) {
+                    sig_filter_limit = ALPHA_MINUS_EPS;
+                } else {
+                    char *end = 0;
+                    sig_filter_limit = strtod(optarg, &end);
+                    // Reject text that is not a complete number; it used to
+                    // be accepted silently as 0 (or as its numeric prefix).
+                    if (end == optarg || *end != '\0' || sig_filter_limit < 0.0) {
+                        std::cerr << "Filter limit (-l) must be either 'a+e', 'a-e' or a real number >= 0.0\n";
+                        usage();
+                    }
+                }
+                break;
+            default:
+                usage();
+        }
+    }
+    //-----------------------------------------------------------------------------	
+    if (optind != argc || !efile || !ffile) {
+      usage();
+    }
+
+    //load the indexed corpus with vocabulary(noVoc=false) and with offset(noOffset=false)
+    e_sa.loadData_forSearch(efile, false, false);
+    f_sa.loadData_forSearch(ffile, false, false);
+    size_t elines = e_sa.returnTotalSentNumber();
+    size_t flines = f_sa.returnTotalSentNumber();
+    if (elines != flines) {
+      std::cerr << "Number of lines in e-corpus != number of lines in f-corpus!\n";
+      usage();
+    } else {
+      std::cerr << "Training corpus: " << elines << " lines\n";
+      num_lines = elines;
+    }
+    // alpha: the value for a pair whose two phrases each occur once, together.
+    p_111 = -log(fisher_exact(1,1,1));
+    std::cerr << "\\alpha = " << p_111 << "\n";
+    // Resolve the "a-e" / "a+e" placeholders now that alpha is known.
+    if (sig_filter_limit == ALPHA_MINUS_EPS) { sig_filter_limit = p_111 - 0.001; }
+      else if (sig_filter_limit == ALPHA_PLUS_EPS) { sig_filter_limit = p_111 + 0.001; }
+    std::cerr << "Sig filter threshold is = " << sig_filter_limit << "\n";
+
+    std::string line;
+    std::string prev = "";
+    std::vector<PTEntry*> options;   // entries sharing the current source phrase
+    size_t pt_lines = 0;             // number of phrase pairs read
+    // std::getline on a std::string has no length limit and the loop ends on
+    // any stream failure.  The old fixed 10000-byte buffer with a
+    // !cin.eof() test never terminated once a longer line set failbit, and
+    // counted the final empty read as a phrase pair.
+    while (std::getline(std::cin, line)) {
+        if (line.empty()) continue;
+
+        if(++pt_lines%10000==0)
+        {
+            std::cerr << ".";
+            if(pt_lines%500000==0) std::cerr << "[n:"<<pt_lines<<"]\n";
+        }
+
+        PTEntry* pp = new PTEntry(line, pfe_index);
+        if (prev != pp->f_phrase) {
+            // A new source phrase starts: finish the previous group first.
+            prev = pp->f_phrase;
+            if (!options.empty()) {  // always true after first line
+                compute_cooc_stats_and_filter(options);
+                emit_and_release(options);
+            }
+        }
+        options.push_back(pp);
+    }
+    // Last group; skipped when stdin held no phrase pairs at all.
+    if (!options.empty()) {
+        compute_cooc_stats_and_filter(options);
+        emit_and_release(options);
+    }
+
+    // Avoid dividing by zero when nothing was read.
+    float pfefper = 0.0f;
+    float sigfper = 0.0f;
+    if (pt_lines > 0) {
+        pfefper = (100.0*(float)nremoved_pfefilter)/(float)pt_lines;
+        sigfper = (100.0*(float)nremoved_sigfilter)/(float)pt_lines;
+    }
+    std::cerr << "\n\n------------------------------------------------------\n"
+              << "  unfiltered phrases pairs: " << pt_lines << "\n"
+              << "\n"
+              << "     P(f|e) filter [first]: " << nremoved_pfefilter << "   (" << pfefper << "%)\n"
+              << "       significance filter: " << nremoved_sigfilter << "   (" << sigfper << "%)\n"
+              << "            TOTAL FILTERED: " << (nremoved_pfefilter + nremoved_sigfilter) << "   (" << (sigfper + pfefper) << "%)\n"
+              << "\n"
+              << "     FILTERED phrase pairs: " << (pt_lines - nremoved_pfefilter - nremoved_sigfilter) << "   (" << (100.0-sigfper - pfefper) << "%)\n"
+              << "------------------------------------------------------\n";
+
+    return 0;
+}
--- a/sigtest-filter/sigtest-filter.sln
+++ b/sigtest-filter/sigtest-filter.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 9.00
+# Visual Studio 2005
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "sigtest-filter", "sigtest-filter.vcproj", "{FA2910DF-FD9D-4E6D-A393-9F9F9E309E78}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Win32 = Debug|Win32
+		Release|Win32 = Release|Win32
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{FA2910DF-FD9D-4E6D-A393-9F9F9E309E78}.Debug|Win32.ActiveCfg = Debug|Win32
+		{FA2910DF-FD9D-4E6D-A393-9F9F9E309E78}.Debug|Win32.Build.0 = Debug|Win32
+		{FA2910DF-FD9D-4E6D-A393-9F9F9E309E78}.Release|Win32.ActiveCfg = Release|Win32
+		{FA2910DF-FD9D-4E6D-A393-9F9F9E309E78}.Release|Win32.Build.0 = Release|Win32
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/sigtest-filter/sigtest-filter.vcproj
+++ b/sigtest-filter/sigtest-filter.vcproj
@ -0,0 +1,237 @@
+<?xml version="1.0" encoding="Windows-1252"?>
+<VisualStudioProject
+	ProjectType="Visual C++"
+	Version="8.00"
+	Name="sigtest-filter"
+	ProjectGUID="{FA2910DF-FD9D-4E6D-A393-9F9F9E309E78}"
+	RootNamespace="sigtestfilter"
+	Keyword="Win32Proj"
+	> <!-- Visual C++ project (format Version 8.00) for the sigtest-filter tool; Win32 is the only platform declared. The ProjectGUID matches the one referenced by sigtest-filter.sln. -->
+	<Platforms>
+		<Platform
+			Name="Win32"
+		/>
+	</Platforms>
+	<ToolFiles>
+	</ToolFiles>
+	<Configurations>
+		<Configuration
+			Name="Debug|Win32"
+			OutputDirectory="$(SolutionDir)$(ConfigurationName)"
+			IntermediateDirectory="$(ConfigurationName)"
+			ConfigurationType="1"
+			UseOfMFC="2"
+			CharacterSet="1"
+			> <!-- Debug build settings. Output goes to a per-configuration directory under the solution directory. NOTE(review): UseOfMFC="2" and CharacterSet="1" presumably select shared MFC and Unicode; confirm the tool needs either. -->
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="0"
+				AdditionalIncludeDirectories="..\..\..\SALM\Src\SuffixArrayApplications\SuffixArraySearch;..\..\..\SALM\Src\SuffixArrayApplications;..\..\..\SALM\Src\Shared"
+				PreprocessorDefinitions="WIN32;_DEBUG;_CONSOLE"
+				MinimalRebuild="true"
+				BasicRuntimeChecks="3"
+				RuntimeLibrary="3"
+				UsePrecompiledHeader="0"
+				WarningLevel="3"
+				Detect64BitPortabilityProblems="true"
+				DebugInformationFormat="4"
+			/> <!-- Compiler: SALM headers are looked up in a SALM tree three directories above this project (..\..\..\SALM\Src), so that tree must exist at that relative location. Defines _DEBUG. -->
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				LinkIncremental="2"
+				GenerateDebugInformation="true"
+				SubSystem="1"
+				TargetMachine="1"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCManifestTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCAppVerifierTool"
+			/>
+			<Tool
+				Name="VCWebDeploymentTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+		<Configuration
+			Name="Release|Win32"
+			OutputDirectory="$(SolutionDir)$(ConfigurationName)"
+			IntermediateDirectory="$(ConfigurationName)"
+			ConfigurationType="1"
+			UseOfMFC="2"
+			CharacterSet="1"
+			WholeProgramOptimization="1"
+			> <!-- Release build settings. Same directory layout, UseOfMFC and CharacterSet values as Debug, and additionally sets WholeProgramOptimization="1". -->
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				AdditionalIncludeDirectories="..\..\..\SALM\Src\SuffixArrayApplications\SuffixArraySearch;..\..\..\SALM\Src\SuffixArrayApplications;..\..\..\SALM\Src\Shared"
+				PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE"
+				RuntimeLibrary="2"
+				UsePrecompiledHeader="0"
+				WarningLevel="3"
+				Detect64BitPortabilityProblems="true"
+				DebugInformationFormat="3"
+			/> <!-- Compiler: same SALM include directories as the Debug configuration; defines NDEBUG instead of _DEBUG. No Optimization attribute is given here, so the tool default applies (TODO confirm which level that is). -->
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				LinkIncremental="1"
+				GenerateDebugInformation="true"
+				SubSystem="1"
+				OptimizeReferences="2"
+				EnableCOMDATFolding="2"
+				TargetMachine="1"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCManifestTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCAppVerifierTool"
+			/>
+			<Tool
+				Name="VCWebDeploymentTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+	</Configurations>
+	<References>
+	</References>
+	<Files>
+		<Filter
+			Name="Source Files"
+			Filter="cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"
+			UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
+			> <!-- Sources built into the tool: four SALM files referenced in place from the external SALM tree, plus the local filter-pt.cpp and XGetopt.cpp. -->
+			<File
+				RelativePath="..\..\..\SALM\Src\Shared\_IDVocabulary.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\SALM\Src\Shared\_String.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\SALM\Src\SuffixArrayApplications\_SuffixArrayApplicationBase.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\SALM\Src\SuffixArrayApplications\SuffixArraySearch\_SuffixArraySearchApplicationBase.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\filter-pt.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\XGetopt.cpp"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="Header Files"
+			Filter="h;hpp;hxx;hm;inl;inc;xsd"
+			UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}"
+			> <!-- Headers listed in the project: three from the external SALM tree plus the local XGetopt.h. NOTE(review): _String.cpp is compiled above but no _String.h is listed here; presumably it is still found through the include directories (confirm). -->
+			<File
+				RelativePath="..\..\..\SALM\Src\Shared\_IDVocabulary.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\SALM\Src\SuffixArrayApplications\_SuffixArrayApplicationBase.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\SALM\Src\SuffixArrayApplications\SuffixArraySearch\_SuffixArraySearchApplicationBase.h"
+				>
+			</File>
+			<File
+				RelativePath=".\XGetopt.h"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="Resource Files"
+			Filter="rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav"
+			UniqueIdentifier="{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}"
+			> <!-- Empty filter: the project contains no resource files. -->
+		</Filter>
+	</Files>
+	<Globals>
+	</Globals>
+</VisualStudioProject>